diff --git a/NAMESPACE b/NAMESPACE index a277c4a6..ebf58489 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -62,7 +62,6 @@ export(github_api_page_first) export(github_api_page_last) export(github_api_page_next) export(github_api_page_prev) -export(github_api_project_commit_refresh) export(github_api_project_commits) export(github_api_project_contributors) export(github_api_project_issue) @@ -71,7 +70,6 @@ export(github_api_project_issue_or_pr_comment_refresh) export(github_api_project_issue_or_pr_comments) export(github_api_project_issue_refresh) export(github_api_project_pull_request) -export(github_api_project_pull_request_refresh) export(github_api_rate_limit) export(github_parse_project_commits) export(github_parse_project_issue) @@ -123,7 +121,6 @@ export(parse_gitlog_entity) export(parse_gof_patterns) export(parse_java_code_refactoring_json) export(parse_jira) -export(parse_jira_latest_date) export(parse_jira_replies) export(parse_jira_rss_xml) export(parse_line_metrics) diff --git a/NEWS.md b/NEWS.md index 6b15348c..e74bf146 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,7 @@ __kaiaulu 0.0.0.9700 (in development)__ ========================= ### NEW FEATURES - + * `github_api_project_issue_refresh` and `github_api_project_issue_or_pr_comment_refresh` were added to download issue data or comments respectively that have not already been downloaded. * Kaiaulu architecture has been refactored. Instead of using a parser, download, network module structure, Kaiaulu now uses a combination of data type and tool structure. In that manner, various parser functions of download,R, parser.R, and network.R now are separated in git.R, jira.R, git.R, etc. When only small functionality of a tool is required, functions are grouped based on the data type they are associated to, for example, src.R. Kaiaulu API documentation has been updated accordingly. Functions signature and behavior remain the same: The only modification was the new placement of functions into files. 
For further rationale and changes, see the issue for more details. [#241](https://github.com/sailuh/kaiaulu/issues/241) * Temporal bipartite projections are now weighted. The temporal projection can be parameterized by `weight_scheme_cum_temporal()` `weight_scheme_pairwise_cum_temporal()` when all time lag edges are used, or the existing weight schemes can also be used when using a single lag. The all lag weight schemes reproduce the same behavior as Codeface's paper. See the issue for details. [#229](https://github.com/sailuh/kaiaulu/issues/229) * The `make_jira_issue()` and `make_jira_issue_tracker()` have been added, alongside examples and unit tests for `parse_jira()`. [#228](https://github.com/sailuh/kaiaulu/issues/228) diff --git a/R/github.R b/R/github.R index 67a1a302..951d8f8e 100644 --- a/R/github.R +++ b/R/github.R @@ -541,19 +541,25 @@ github_api_iterate_pages <- function(token,gh_response,save_folder_path,prefix=N #' Download Project Issues after a date #' -#' Download Issues from "GET /repos/{owner}/{repo}/issues" endpoint. +#' Returns issue data that has not already been downloaded +#' Gets the name of the file with the most recent data along the designated save path. +#' Extracts the greatest 'created_at' date from that file +#' Calls search/issues endpoint to download issues created after that date #' #' @param owner GitHub's repository owner (e.g. sailuh) #' @param repo GitHub's repository name (e.g. kaiaulu) #' @param created Github's created at date #' @param token Your GitHub API token -#' @param created Only issues created after this date are fetched #' @export #' @references For details, see \url{https://docs.github.com/en/rest/reference/issues#list-repository-issues}. 
-github_api_project_issue_refresh <- function(owner,repo,token,created){ - # Construct the search query - # Ensure 'created' is in the format "YYYY-MM-DD" - # For more precise filtering, including time, use "YYYY-MM-DDTHH:MM:SSZ" +github_api_project_issue_refresh <- function(owner,repo,token){ + # Get the name of the file with the most recent date + latest_date_issue <- paste0(save_path_issue, parse_jira_latest_date(save_path_issue)) + message(latest_date_issue) + # get the created_at value + created <- format_created_at_from_file(latest_date_issue) + message("Date of latest issue downloaded: ", issue_most_recent_created_date) + # API Call query <- sprintf("repo:%s/%s is:issue created:>%s", owner, repo, created) # Use the Search API endpoint to search for issues @@ -569,32 +575,25 @@ github_api_project_issue_refresh <- function(owner,repo,token,created){ #' Download Project issues or pr comments after certain date #' -#' Download Issues from "GET /repos/{owner}/{repo}/issues" endpoint. +#' Returns issue and pull request comments that have not already been downloaded +#' Gets the name of the file with the most recent date along the designated save path. +#' Extracts the greatest 'created_at' date from that file +#' Calls issues/comments endpoint to download comments created after that date #' #' @param owner GitHub's repository owner (e.g. sailuh) #' @param repo GitHub's repository name (e.g. kaiaulu) #' @param created Github's created at date #' @param token Your GitHub API token -#' @param created Only issues created after this date are fetched #' @export #' @references For details, see \url{https://docs.github.com/en/rest/reference/issues#list-repository-issues}. 
-github_api_project_issue_or_pr_comment_refresh <- function(owner,repo,token,created){ - # Construct the search query - # Ensure 'created' is in the format "YYYY-MM-DD" - # For more precise filtering, including time, use "YYYY-MM-DDTHH:MM:SSZ" - # query <- sprintf("repo:%s/%s is:issue created:>%s", owner, repo, created) - # - # # Use the Search API endpoint to search for issues - # issues <- gh::gh("/search/issues", - # q = query, - # .token = token, - # .limit = 100) # Adjust .limit as needed, though GitHub API has its own paging mechanisms - # - # items_only <- issues$items - # #issues_json <- jsonlite::toJSON(items_only, auto_unbox = TRUE, pretty = TRUE) - # return(items_only) - - +github_api_project_issue_or_pr_comment_refresh <- function(owner,repo,token){ + # Get the name of the file with the most recent date + latest_date_issue_or_pr_comment <- paste0(save_path_issue_or_pr_comments, parse_jira_latest_date(save_path_issue_or_pr_comments)) + message(latest_date_issue_or_pr_comment) + # get the created_at value + created <- format_created_at_from_file(latest_date_issue_or_pr_comment) + message("Date of most recent comment: ", issue_or_pr_comment_most_recent_created_date) + # Github API Call gh::gh("GET /repos/{owner}/{repo}/issues/comments", owner=owner, repo=repo, @@ -604,99 +603,6 @@ github_api_project_issue_or_pr_comment_refresh <- function(owner,repo,token,crea .token=token) } -#' Download Project Pull Requests after a date -#' -#' Download Issues from "GET /repos/{owner}/{repo}/search/issues" endpoint. -#' -#' @param owner GitHub's repository owner (e.g. sailuh) -#' @param repo GitHub's repository name (e.g. kaiaulu) -#' @param created Github's created at date -#' @param token Your GitHub API token -#' @param created Only issues created after this date are fetched -#' @export -#' @references For details, see \url{https://docs.github.com/en/rest/reference/issues#list-repository-issues}. 
-github_api_project_pull_request_refresh <- function(owner,repo,token,created){ - - query <- sprintf('repo:%s/%s+type:issue+in:comments+created:>%s', owner, repo, since) - - # Perform the API call to search issues/comments - comments <- gh::gh("/search/issues", - q=query, - page=1, - per_page=100, - .token=token) - - - #items_only <- pull_requests$items - return(items_only) -} - -#' Download Project Pull Requests after a date -#' -#' Download Issues from "GET /repos/{owner}/{repo}/search/issues" endpoint. -#' -#' @param owner GitHub's repository owner (e.g. sailuh) -#' @param repo GitHub's repository name (e.g. kaiaulu) -#' @param created Github's created at date -#' @param token Your GitHub API token -#' @param created Only issues created after this date are fetched -#' @export -#' @references For details, see \url{https://docs.github.com/en/rest/reference/issues#list-repository-issues}. -github_api_project_commit_refresh <- function(owner,repo,token,created){ - # Construct the search query for pull requests - # Ensure 'created' is in the format "YYYY-MM-DD" - # For more precise filtering, including time, use "YYYY-MM-DDTHH:MM:SSZ" - query <- sprintf("repo:%s/%s is:pr created:>%s", owner, repo, created) - - # Use the Search API endpoint to search for pull requests - pull_requests <- gh::gh("/search/issues", - q = query, - .token = token, - .limit = 100) # Adjust .limit as needed, though GitHub API has its own paging mechanisms - items_only <- pull_requests$items - return(items_only) - # Call the GitHub API to get issue comments created after a certain date - #comments <- - gh::gh("GET /repos/{owner}/{repo}/issues/comments", - owner=owner, - repo=repo, - since=created, # Pass the `since` parameter in the API request - page=1, - per_page=100, - .token=token) - - #return(comments) -} - -#' parse latest date -#' -#' Takes a filepath and returns a filename of the .json file that contains the -#' most recent 'created_at' value -#' -#' @param json_path the path with 
folders to read -#' @export -parse_jira_latest_date <- function(json_path){ - file_list <- list.files(json_path) - time_list <- list() - - # Checking if the save folder is empty - if (identical(file_list, character(0))){ - stop(stringi::stri_c("cannot open the connection")) - } - - for (j in file_list){ - j <- sub(".*_(\\w+)\\.[^.]+$", "\\1", j) - j <- as.numeric(j) - time_list <- append(time_list, j) - } - - overall_latest_date <- as.character(max(unlist(time_list))) - - latest_issue_file <- grep(overall_latest_date, file_list, value = TRUE) - - return(latest_issue_file) -} - #' get the created_at field from a filename #' #' Function to read a JSON file along a path and return the 'created_at' diff --git a/man/github_api_project_commit_refresh.Rd b/man/github_api_project_commit_refresh.Rd deleted file mode 100644 index 4da9dbd7..00000000 --- a/man/github_api_project_commit_refresh.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/github.R -\name{github_api_project_commit_refresh} -\alias{github_api_project_commit_refresh} -\title{Download Project Pull Requests after a date} -\usage{ -github_api_project_commit_refresh(owner, repo, token, created) -} -\arguments{ -\item{owner}{GitHub's repository owner (e.g. sailuh)} - -\item{repo}{GitHub's repository name (e.g. kaiaulu)} - -\item{token}{Your GitHub API token} - -\item{created}{Only issues created after this date are fetched} -} -\description{ -Download Issues from "GET /repos/{owner}/{repo}/search/issues" endpoint. -} -\references{ -For details, see \url{https://docs.github.com/en/rest/reference/issues#list-repository-issues}. 
-} diff --git a/man/github_api_project_issue_or_pr_comment_refresh.Rd b/man/github_api_project_issue_or_pr_comment_refresh.Rd index 2b3d3495..4a838664 100644 --- a/man/github_api_project_issue_or_pr_comment_refresh.Rd +++ b/man/github_api_project_issue_or_pr_comment_refresh.Rd @@ -4,7 +4,7 @@ \alias{github_api_project_issue_or_pr_comment_refresh} \title{Download Project issues or pr comments after certain date} \usage{ -github_api_project_issue_or_pr_comment_refresh(owner, repo, token, created) +github_api_project_issue_or_pr_comment_refresh(owner, repo, token) } \arguments{ \item{owner}{GitHub's repository owner (e.g. sailuh)} @@ -13,10 +13,13 @@ github_api_project_issue_or_pr_comment_refresh(owner, repo, token, created) \item{token}{Your GitHub API token} -\item{created}{Only issues created after this date are fetched} +\item{created}{Github's created at date} } \description{ -Download Issues from "GET /repos/{owner}/{repo}/issues" endpoint. +Returns issue and pull request comments that have not already been downloaded +Gets the name of the file with the most recent date along the designated save path. +Extracts the greatest 'created_at' date from that file +Calls issues/comments endpoint to download comments created after that date } \references{ For details, see \url{https://docs.github.com/en/rest/reference/issues#list-repository-issues}. diff --git a/man/github_api_project_issue_refresh.Rd b/man/github_api_project_issue_refresh.Rd index d9984cb0..a03647d4 100644 --- a/man/github_api_project_issue_refresh.Rd +++ b/man/github_api_project_issue_refresh.Rd @@ -4,7 +4,7 @@ \alias{github_api_project_issue_refresh} \title{Download Project Issues after a date} \usage{ -github_api_project_issue_refresh(owner, repo, token, created) +github_api_project_issue_refresh(owner, repo, token) } \arguments{ \item{owner}{GitHub's repository owner (e.g. 
sailuh)} @@ -13,10 +13,13 @@ github_api_project_issue_refresh(owner, repo, token, created) \item{token}{Your GitHub API token} -\item{created}{Only issues created after this date are fetched} +\item{created}{Github's created at date} } \description{ -Download Issues from "GET /repos/{owner}/{repo}/issues" endpoint. +Returns issue data that has not already been downloaded +Gets the name of the file with the most recent data along the designated save path. +Extracts the greatest 'created_at' date from that file +Calls search/issues endpoint to download issues created after that date } \references{ For details, see \url{https://docs.github.com/en/rest/reference/issues#list-repository-issues}. diff --git a/man/github_api_project_pull_request_refresh.Rd b/man/github_api_project_pull_request_refresh.Rd deleted file mode 100644 index 1f5a3fa0..00000000 --- a/man/github_api_project_pull_request_refresh.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/github.R -\name{github_api_project_pull_request_refresh} -\alias{github_api_project_pull_request_refresh} -\title{Download Project Pull Requests after a date} -\usage{ -github_api_project_pull_request_refresh(owner, repo, token, created) -} -\arguments{ -\item{owner}{GitHub's repository owner (e.g. sailuh)} - -\item{repo}{GitHub's repository name (e.g. kaiaulu)} - -\item{token}{Your GitHub API token} - -\item{created}{Only issues created after this date are fetched} -} -\description{ -Download Issues from "GET /repos/{owner}/{repo}/search/issues" endpoint. -} -\references{ -For details, see \url{https://docs.github.com/en/rest/reference/issues#list-repository-issues}. 
-} diff --git a/man/parse_jira_latest_date.Rd b/man/parse_jira_latest_date.Rd deleted file mode 100644 index 0727f93c..00000000 --- a/man/parse_jira_latest_date.Rd +++ /dev/null @@ -1,15 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/github.R -\name{parse_jira_latest_date} -\alias{parse_jira_latest_date} -\title{parse latest date} -\usage{ -parse_jira_latest_date(json_path) -} -\arguments{ -\item{json_path}{the path with folders to read} -} -\description{ -Takes a filepath and returns a filename of the .json file that contains the -most recent 'created_at' value -} diff --git a/vignettes/download_github_comments.Rmd b/vignettes/download_github_comments.Rmd index 869dded6..2f9fc9fa 100644 --- a/vignettes/download_github_comments.Rmd +++ b/vignettes/download_github_comments.Rmd @@ -81,26 +81,12 @@ github_api_iterate_pages(token,gh_response, prefix="issue") ``` -#REFRESH Issues -```{r} -# get the filename of the latest date -latest_date_issue <- paste0(save_path_issue, parse_jira_latest_date(save_path_issue)) -message(latest_date_issue) - -``` +# REFRESH Issues -```{r} -# Example usage -issue_most_recent_created_date <- format_created_at_from_file(latest_date_issue) -print(issue_most_recent_created_date) -``` +The above function call downloads all the available issues less than the specified limit (default to API limit). The following chunk allows the downloading of issues that have been created only after the most recently created issue already downloaded. This allows the user to 'refresh' their data or continue downloading if a rate limit was previously reached. 
```{r Collect all issues, eval = FALSE} #gh call but with date -#created <- paste0('>',most_recent_created_date) -gh_response <- github_api_project_issue_refresh(owner,repo,token,issue_most_recent_created_date) -#message(gh_response) -``` -```{r} +gh_response <- github_api_project_issue_refresh(owner,repo,token) #dir.create(save_path_issue) github_api_iterate_pages(token,gh_response, save_path_issue, @@ -122,26 +108,6 @@ github_api_iterate_pages(token,gh_response, save_path_pull_request, prefix="pull_request") ``` - -#REFRESH PULL REQUESTS -```{r} -# get the filename -latest_date_pr <- paste0(save_path_pull_request, parse_jira_latest_date(save_path_pull_request)) -message(latest_date) -# get the created_at -pr_most_recent_created_date <- format_created_at_from_file(latest_date_pr) -print(pr_most_recent_created_date) -``` - -```{r Collect all issues, eval = FALSE} -#gh call but with date -#created <- paste0('>',most_recent_created_date) -gh_response_pr <- github_api_project_pull_request_refresh(owner,repo,token,pr_most_recent_created_date) -dir.create(save_path_pull_request) -github_api_iterate_pages(token,gh_response_pr, - save_path_pull_request, - prefix="pr") -``` ## Issues and Pull Requests Comments Finally we obtain the comments of both issue and pull requests (which does not contain the data obtained in the prior two endpoints). 
@@ -159,20 +125,12 @@ github_api_iterate_pages(token,gh_response, prefix="issue_or_pr_comment") ``` #REFRESH ISSUE OR PR COMMENT -```{r} -# get the filename -latest_date_issue_or_pr_comment <- paste0(save_path_issue_or_pr_comments, parse_jira_latest_date(save_path_issue_or_pr_comments)) -message(latest_date_issue_or_pr_comment) -# get the created_at -issue_or_pr_comment_most_recent_created_date <- format_created_at_from_file(latest_date_issue_or_pr_comment) - -print(issue_or_pr_comment_most_recent_created_date) -``` +Similar to the refresh of the issues, this chunk allows for the downloading of comments that have been created since the most recently created date among data already downloaded. This allows us to 'refresh' the comments, downloading comments made since that date or continue downloading if a rate limit was reached. ```{r Collect all issues, eval = FALSE} #gh call but with date # get the data -gh_response_issue_or_pr_comment <- github_api_project_issue_or_pr_comment_refresh(owner,repo,token,issue_or_pr_comment_most_recent_created_date) +gh_response_issue_or_pr_comment <- github_api_project_issue_or_pr_comment_refresh(owner,repo,token) # create direcetory and iterate over data dir.create(save_path_issue_or_pr_comments) @@ -199,27 +157,6 @@ github_api_iterate_pages(token,gh_response, prefix="commit") ``` -#REFRESH Commit data -```{r} -# get the filename -latest_date_commit <- paste0(save_path_commit, parse_jira_latest_date(save_path_commit)) -message(latest_date_commit) -# get the created_at -commit_most_recent_created_date <- format_created_at_from_file(latest_date_commit) -print(commit_most_recent_created_date) -``` - -```{r Collect all issues, eval = FALSE} -#gh call but with date -#created <- paste0('>',most_recent_created_date) -gh_response_issue_or_pr_comment <- github_api_project_commit_refresh(owner,repo,token,issue_or_pr_comment_most_recent_created_date) - -dir.create(save_path_commit) - 
-github_api_iterate_pages(token,gh_response_issue_or_pr_comment, - save_path_commit, - prefix="commit") -``` # Parsing Raw Data to Csv