From 6cad299412523271fd402974b52287e182011cba Mon Sep 17 00:00:00 2001 From: LiNk-NY Date: Thu, 2 Jan 2025 16:19:28 -0500 Subject: [PATCH 1/3] wip: analyze commit stats --- DESCRIPTION | 8 +- NAMESPACE | 8 + R/commit-stats.R | 305 ++++++++++++++++++++++++++++++++++ R/download-stats.R | 49 +++--- R/support-site-stats.R | 40 +++-- man/get_support_site_stats.Rd | 2 +- 6 files changed, 367 insertions(+), 45 deletions(-) create mode 100644 R/commit-stats.R diff --git a/DESCRIPTION b/DESCRIPTION index f106df1..641431e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,10 +17,14 @@ Imports: tibble, utils, dplyr, ggplot2, + gh, httr, - lubridate, rorcid + lubridate, + purrr, + rorcid, + tibble Suggests: knitr, testthat LazyData: true VignetteBuilder: knitr biocViews: Infrastructure -RoxygenNote: 7.2.1 +RoxygenNote: 7.3.2 diff --git a/NAMESPACE b/NAMESPACE index 6203799..c7fe67b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,8 @@ # Generated by roxygen2: do not edit by hand S3method(print,bioc_support_stats) +S3method(print,r_account_analysis) +export(analyze_r_account_activity) export(get_support_site_stats) export(orcid_table) export(summarize_software_downloads) @@ -10,4 +12,10 @@ import(httr) import(lubridate) import(rorcid) import(tibble) +importFrom(gh,gh) +importFrom(purrr,map) +importFrom(purrr,map_chr) +importFrom(purrr,map_dbl) +importFrom(purrr,map_df) +importFrom(tibble,tibble) importFrom(utils,read.table) diff --git a/R/commit-stats.R b/R/commit-stats.R new file mode 100644 index 0000000..94b713f --- /dev/null +++ b/R/commit-stats.R @@ -0,0 +1,305 @@ +#' @import lubridate +#' @importFrom tibble tibble +#' @importFrom gh gh +#' @importFrom purrr map_df map_chr map_dbl map +#' +#' @examples +#' results <- analyze_r_account_activity( +#' "LiNk-NY", "waldronlab", +#' start_date = "2023-08-31", +#' end_date = "2024-09-01" +#' ) +#' +#' @export +analyze_r_account_activity <- function( + username, + org, + start_date, + end_date, + github_token = gh::gh_token() +) { + # Input validation + if (is.null(github_token)) { + stop("No GitHub token found. Please set GITHUB_PAT environment variable or authenticate with GitHub CLI") + } + + # Step 1: Find all repositories for the account + message("Finding repositories for ", username, "...") + repos <- list() + page <- 1 + has_more <- TRUE + + while (has_more) { + if (!missing(org)) { + endpoint <- "GET /orgs/{org}/repos" + new_repos <- gh::gh( + endpoint, + org = org, + page = page, + .token = github_token + ) + } else { + endpoint <- "GET /users/{username}/repos" + new_repos <- gh::gh( + endpoint, + username = username, + org = username, + page = page, + .token = github_token + ) + } + + + if (length(new_repos)) { + repos <- c(repos, new_repos) + page <- page + 1 + } else { + has_more <- FALSE + } + } + + # Step 2: Filter for R repositories + message("Identifying R repositories...") + r_repos <- purrr::map_df(repos, function(repo, org) { + # Check for R language + if (!missing(org)) + languages <- gh::gh( + "GET /repos/{owner}/{repo}/languages", + owner = org, + repo = repo$name, + .token = github_token + ) + else + languages <- gh::gh( + "GET /repos/{owner}/{repo}/languages", + owner = username, + repo = repo$name, + .token = github_token + ) + + if ("R" %in% names(languages)) { + tibble::tibble( + full_name = repo$full_name, + name = repo$name, + description = ifelse(is.null(repo$description), NA, repo$description), + stars = repo$stargazers_count, + forks = repo$forks_count, + last_updated = repo$updated_at, + is_fork = repo$fork, + default_branch = repo$default_branch, + r_percentage = round(languages$R / sum(unlist(languages)) * 100, 1) + ) + } + }, org = org) + + if (!nrow(r_repos)) { + stop("No R repositories found for this account") + } + + # Step 3: Fetch commits for each R repository + message("Fetching commits for ", nrow(r_repos), " R repositories...") + all_commits <- list() + + for (i in seq_len(nrow(r_repos))) { + repo <- r_repos$full_name[i] + message("Processing ", repo, " (", i, "/", nrow(r_repos), ")") + + commits <- tryCatch({ + if (!missing(org)) + gh::gh( + "GET /repos/{owner}/{repo}/commits", + author = username, + owner = org, + repo = r_repos$name[i], + since = as.POSIXct(start_date) |> format("%Y-%m-%dT%H:%M:%SZ"), + until = as.POSIXct(end_date) |> format("%Y-%m-%dT%H:%M:%SZ"), + .token = github_token + ) + else + gh::gh( + "GET /repos/{owner}/{repo}/commits", + author = username, + owner = username, + repo = r_repos$name[i], + since = as.POSIXct(start_date) |> format("%Y-%m-%dT%H:%M:%SZ"), + until = as.POSIXct(end_date) |> format("%Y-%m-%dT%H:%M:%SZ"), + .token = github_token + ) + }, error = function(e) { + warning(paste("Error fetching commits for", repo, ":", e$message)) + return(list()) + }) + + # Process commits + repo_commits <- purrr::map(commits, function(commit, org) { + list( + repository = repo, + sha = commit$sha, + author = commit$commit$author$name, + date = commit$commit$author$date, + message = commit$commit$message, + changes = tryCatch({ + if (!missing(org)) + commit_detail <- gh::gh( + "GET /repos/{owner}/{repo}/commits/{sha}", + owner = org, + repo = r_repos$name[i], + sha = commit$sha, + since = as.POSIXct(start_date) |> format("%Y-%m-%dT%H:%M:%SZ"), + until = as.POSIXct(end_date) |> format("%Y-%m-%dT%H:%M:%SZ"), + .token = github_token + ) + else + commit_detail <- gh::gh( + "GET /repos/{owner}/{repo}/commits/{sha}", + owner = username, + repo = r_repos$name[i], + sha = commit$sha, + since = as.POSIXct(start_date) |> format("%Y-%m-%dT%H:%M:%SZ"), + until = as.POSIXct(end_date) |> format("%Y-%m-%dT%H:%M:%SZ"), + .token = github_token + ) + list( + additions = commit_detail$stats$additions, + deletions = commit_detail$stats$deletions, + files_changed = length(commit_detail$files) + ) + }, error = function(e) { + list(additions = NA, deletions = NA, files_changed = NA) + }) + ) + }, org = org) + + all_commits <- c(all_commits, repo_commits) + } + + # Step 4: Generate statistics + commit_stats <- tibble::tibble( + repository = map_chr(all_commits, "repository"), + author = map_chr(all_commits, "author"), + date = map_chr(all_commits, "date"), + additions = map_dbl(all_commits, function(x) x$changes$additions), + deletions = map_dbl(all_commits, function(x) x$changes$deletions), + files_changed = map_dbl(all_commits, function(x) x$changes$files_changed) + ) + + repo_summary <- commit_stats |> + group_by(repository) |> + summarise( + total_commits = n(), + unique_authors = n_distinct(author), + total_additions = sum(additions, na.rm = TRUE), + total_deletions = sum(deletions, na.rm = TRUE), + total_files_changed = sum(files_changed, na.rm = TRUE) + ) + + # Step 5: Generate analysis prompt + analysis_prompt <- sprintf( + "You are a senior R developer and data scientist. Please analyze the + following GitHub development activity for %s between %s and %s.\n\n", + username, start_date, end_date + ) + + # Add repository overview + analysis_prompt <- paste0( + analysis_prompt, + "Repository Overview:\n", + paste(map_chr(seq_len(nrow(r_repos)), function(i) { + sprintf("- %s: %s (Stars: %d, Forks: %d, R code: %.1f%%)", + r_repos$full_name[i], + if(is.na(r_repos$description[i])) "No description" else r_repos$description[i], + r_repos$stars[i], + r_repos$forks[i], + r_repos$r_percentage[i]) + }), collapse = "\n"), + "\n\n" + ) + + # Add commit details + analysis_prompt <- paste0( + analysis_prompt, + "Commit Activity:\n\n", + paste(map_chr(all_commits, function(x) { + sprintf("Repository: %s\nDate: %s\nAuthor: %s\nChanges: +%s/-%s (%s files)\nMessage: %s\n", + x$repository, + x$date, + x$author, + ifelse(is.na(x$changes$additions), "?", x$changes$additions), + ifelse(is.na(x$changes$deletions), "?", x$changes$deletions), + ifelse(is.na(x$changes$files_changed), "?", x$changes$files_changed), + x$message) + }), collapse = "\n"), + "\n" + ) + + # Add overall statistics + analysis_prompt <- paste0( + analysis_prompt, + "Overall Statistics:\n", + sprintf("- Total R Repositories: %d\n", nrow(r_repos)), + sprintf("- Total Commits: %d\n", nrow(commit_stats)), + sprintf("- Unique Contributors: %d\n", n_distinct(commit_stats$author)), + sprintf("- Total Lines Added: %d\n", sum(commit_stats$additions, na.rm = TRUE)), + sprintf("- Total Lines Deleted: %d\n", sum(commit_stats$deletions, na.rm = TRUE)), + sprintf("- Total Files Changed: %d\n\n", sum(commit_stats$files_changed, na.rm = TRUE)) + ) + + # Add analysis request + analysis_prompt <- paste0( + analysis_prompt, + "Please provide a comprehensive analysis of this R development activity, including:\n", + "1. Overall development patterns and trends\n", + "2. Key areas of focus across repositories\n", + "3. Notable features or changes\n", + "4. Development activity distribution\n", + "5. Collaboration patterns\n", + "6. Recommendations for future development\n\n", + "Please structure your response in clear sections and provide specific examples from the commit data to support your analysis." + ) + + # Return results + list( + account_info = list( + name = username, + date_range = list(start = start_date, end = end_date) + ), + repositories = r_repos, + repository_stats = repo_summary, + commit_details = commit_stats, + analysis_prompt = analysis_prompt, + overall_stats = list( + total_repositories = nrow(r_repos), + total_commits = nrow(commit_stats), + unique_authors = n_distinct(commit_stats$author), + total_additions = sum(commit_stats$additions, na.rm = TRUE), + total_deletions = sum(commit_stats$deletions, na.rm = TRUE), + total_files_changed = sum(commit_stats$files_changed, na.rm = TRUE) + ) + ) +} + +# Print method for nice output +#' @export +print.r_account_analysis <- function(x) { + cat("\nR Development Activity Analysis\n") + cat("============================\n") + cat(sprintf("Username/Org: %s (%s)\n", x$account_info$name)) + cat(sprintf("Period: %s to %s\n\n", x$account_info$date_range$start, x$account_info$date_range$end)) + + cat("Overall Statistics:\n") + cat(sprintf("- R Repositories: %d\n", x$overall_stats$total_repositories)) + cat(sprintf("- Total Commits: %d\n", x$overall_stats$total_commits)) + cat(sprintf("- Unique Contributors: %d\n", x$overall_stats$unique_authors)) + cat(sprintf("- Lines Added: %d\n", x$overall_stats$total_additions)) + cat(sprintf("- Lines Deleted: %d\n", x$overall_stats$total_deletions)) + cat(sprintf("- Files Changed: %d\n\n", x$overall_stats$total_files_changed)) + + cat("Repository Summary:\n") + print(x$repository_stats) + + cat("\nAnalysis Prompt:\n") + cat("Copy and paste the following prompt into your preferred AI tool:\n") + cat("----------------------------------------------------------------\n") + cat(x$analysis_prompt) + cat("\n----------------------------------------------------------------\n") +} diff --git a/R/download-stats.R b/R/download-stats.R index 557d2b4..c9955e0 100644 --- a/R/download-stats.R +++ b/R/download-stats.R @@ -8,7 +8,7 @@ #' @note Modified from BiocReports script to give raw data on download activities per month. #' There may be effects in 2022 of modified distribution channels. #' @return a list with components ipplot, dlplot with ggplot elements, and a tibble with all stats. -#' @examples +#' @examples #' pls = summarize_software_downloads() #' opar = par(no.readonly=TRUE) #' par(ask=TRUE) @@ -16,29 +16,30 @@ #' pls$dlplot #' par(opar) #' @export -summarize_software_downloads = function( years = 2018:2022, final = "2022-08-16" ) { +summarize_software_downloads <- + function(years = 2018:2022, final = "2022-08-16") +{ + urls <- sprintf( + "http://bioconductor.org/packages/stats/bioc/bioc_%d_stats.tab", + years + ) - urls <- sprintf( - "http://bioconductor.org/packages/stats/bioc/bioc_%d_stats.tab", - years - ) + tbl <- Map(read.table, urls, MoreArgs=list(header=TRUE)) + tbl0 <- as_tibble(do.call(rbind, unname(tbl))) + tbl0 = tbl0 |> filter(Month != "all") + dd = paste(tbl0$Year, tbl0$Month, "15", sep="-") + dld = as_date(dd) + tbl0$Date = dld + tbl0 = tbl0 |> dplyr::filter(Date < as_date(final)) - tbl <- Map(read.table, urls, MoreArgs=list(header=TRUE)) - tbl0 <- as_tibble(do.call(rbind, unname(tbl))) - tbl0 = tbl0 |> filter(Month != "all") - dd = paste(tbl0$Year, tbl0$Month, "15", sep="-") - dld = as_date(dd) - tbl0$Date = dld - tbl0 = tbl0 |> dplyr::filter(Date < as_date(final)) - - ipplot = ggplot(tbl0, aes(x=Date, y=Nb_of_distinct_IPs)) + - geom_point(size=5) + - ylab("# Unique IP addresses") + - ggtitle("Unique IPs requesting downloads") - dlplot = ggplot(tbl0, aes(x=Date, y=Nb_of_downloads)) + - geom_point(size=5) + - ylab("# downloads") + - ggtitle("Number of downloads requested") - list(ipplot=ipplot, dlplot=dlplot, table=tbl0) -} + ipplot = ggplot(tbl0, aes(x=Date, y=Nb_of_distinct_IPs)) + + geom_point(size=5) + + ylab("# Unique IP addresses") + + ggtitle("Unique IPs requesting downloads") + dlplot = ggplot(tbl0, aes(x=Date, y=Nb_of_downloads)) + + geom_point(size=5) + + ylab("# downloads") + + ggtitle("Number of downloads requested") + list(ipplot=ipplot, dlplot=dlplot, table=tbl0) +} diff --git a/R/support-site-stats.R b/R/support-site-stats.R index 9020e8b..ad0c106 100644 --- a/R/support-site-stats.R +++ b/R/support-site-stats.R @@ -1,3 +1,5 @@ +.BASE_SUPPORT_SITE_URL <- + "https://support.bioconductor.org/api/stats/date/" #' gather support site statistics in an interval #' @import httr #' @param base character(1) URL @@ -7,29 +9,31 @@ #' @examples #' get_support_site_stats() #' @export -get_support_site_stats = function(base = "https://support.bioconductor.org/api/stats/date/", - from = "2021/01/01/", to = "2021/12/31/") { +get_support_site_stats <- + function( + base = .BASE_SUPPORT_SITE_URL, from = "2021/01/01/", to = "2021/12/31/" + ) +{ + stat0 <- GET(paste0(base, from)) |> content() + stat1 <- GET(paste0(base, to)) |> content() - stat0 <- GET(paste0(base, from)) |> content() - stat1 <- GET(paste0(base, to)) |> content() - - stats = list() - stats$userdiff = stat1$users - stat0$users - stats$toplevdiff = stat1$toplevel - stat0$toplevel - stats$questdiff = stat1$questions - stat0$questions - stats$respdiff = (stat1$answers + stat1$comments) - (stat0$answers + stat0$comments) - stats$from = substr(from,1,10) - stats$to = substr(to,1,10) - class(stats) = c("bioc_support_stats", "list") - stats + stats = list() + stats$userdiff = stat1$users - stat0$users + stats$toplevdiff = stat1$toplevel - stat0$toplevel + stats$questdiff = stat1$questions - stat0$questions + stats$respdiff = (stat1$answers + stat1$comments) - (stat0$answers + stat0$comments) + stats$from = substr(from,1,10) + stats$to = substr(to,1,10) + class(stats) = c("bioc_support_stats", "list") + stats } #' show a report on support site usage #' @export print.bioc_support_stats = function(x, ...) { - cat("Bioconductor support site usage increments.\n") - cat(sprintf(" From %s to %s : \n", x$from, x$to)) - cat(sprintf(" Users added: %d; Top-level posts added: %d\n", x$userdiff, x$toplevdiff)) - cat(sprintf(" Questions added: %d; answers/comments added: %d\n", x$questdiff, x$respdiff)) + cat("Bioconductor support site usage increments.\n") + cat(sprintf(" From %s to %s : \n", x$from, x$to)) + cat(sprintf(" Users added: %d; Top-level posts added: %d\n", x$userdiff, x$toplevdiff)) + cat(sprintf(" Questions added: %d; answers/comments added: %d\n", x$questdiff, x$respdiff)) } diff --git a/man/get_support_site_stats.Rd b/man/get_support_site_stats.Rd index 5da5085..eb31152 100644 --- a/man/get_support_site_stats.Rd +++ b/man/get_support_site_stats.Rd @@ -5,7 +5,7 @@ \title{gather support site statistics in an interval} \usage{ get_support_site_stats( - base = "https://support.bioconductor.org/api/stats/date/", + base = .BASE_SUPPORT_SITE_URL, from = "2021/01/01/", to = "2021/12/31/" ) From 13d7263c33b697d30c5e43bd604be797fc0e6eea Mon Sep 17 00:00:00 2001 From: LiNk-NY Date: Mon, 6 Jan 2025 16:01:40 -0500 Subject: [PATCH 2/3] document commit_stats --- NAMESPACE | 7 +- R/commit-stats.R | 397 +++++++++++++++++++++----------------------- man/commit_stats.Rd | 94 +++++++++++ 3 files changed, 290 insertions(+), 208 deletions(-) create mode 100644 man/commit_stats.Rd diff --git a/NAMESPACE b/NAMESPACE index c7fe67b..4c87595 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,9 +2,13 @@ S3method(print,bioc_support_stats) S3method(print,r_account_analysis) -export(analyze_r_account_activity) +export(filter_r_repos) +export(get_repositories) export(get_support_site_stats) export(orcid_table) +export(repository_commits) +export(repository_summary) +export(summarize_account_activity) export(summarize_software_downloads) import(dplyr) import(ggplot2) @@ -13,6 +17,7 @@ import(lubridate) import(rorcid) import(tibble) importFrom(gh,gh) +importFrom(gh,gh_token) importFrom(purrr,map) importFrom(purrr,map_chr) importFrom(purrr,map_dbl) diff --git a/R/commit-stats.R b/R/commit-stats.R index 94b713f..681fb7d 100644 --- a/R/commit-stats.R +++ b/R/commit-stats.R @@ -1,29 +1,36 @@ -#' @import lubridate -#' @importFrom tibble tibble -#' @importFrom gh gh -#' @importFrom purrr map_df map_chr map_dbl map +#' @name commit_stats #' -#' @examples -#' results <- analyze_r_account_activity( -#' "LiNk-NY", "waldronlab", -#' start_date = "2023-08-31", -#' end_date = "2024-09-01" -#' ) +#' @title Get and summarize repository statistics from GitHub +#' +#' @description These functions allow the user to query the GitHub API and +#' produce a comprehensive summary of commit activity for each R package +#' repository. The main workhorse function is `summarize_account_activity` +#' which calls all other functions internally to produce a summary of GitHub R +#' repository activity. +#' +#' @param username `character(1)` The GitHub username +#' +#' @param org `character(1)` optional. The organization account for which to +#' search repositories for. #' +#' @param github_token `gh_pat` The personal access token obtained from GitHub. +#' By default, `gh::gh_token()` is used. +#' +#' @returns `get_repositories`: A list of repositories for the corresponding +#' account or organization +#' +#' @examples +#' if (interactive()) { +#' gitcreds::gitcreds_set() +#' summarize_account_activity( +#' username = "LiNk-NY", +#' org = "waldronlab", +#' start_date = "2023-08-31", +#' end_date = "2024-09-01" +#' ) +#' } #' @export -analyze_r_account_activity <- function( - username, - org, - start_date, - end_date, - github_token = gh::gh_token() -) { - # Input validation - if (is.null(github_token)) { - stop("No GitHub token found. Please set GITHUB_PAT environment variable or authenticate with GitHub CLI") - } - - # Step 1: Find all repositories for the account +get_repositories <- function(username, org, github_token = gh::gh_token()) { message("Finding repositories for ", username, "...") repos <- list() page <- 1 @@ -32,24 +39,17 @@ analyze_r_account_activity <- function( while (has_more) { if (!missing(org)) { endpoint <- "GET /orgs/{org}/repos" - new_repos <- gh::gh( - endpoint, - org = org, - page = page, - .token = github_token - ) + username <- org } else { endpoint <- "GET /users/{username}/repos" - new_repos <- gh::gh( - endpoint, - username = username, - org = username, - page = page, - .token = github_token - ) } - - + new_repos <- gh::gh( + endpoint, + username = username, + org = username, + page = page, + .token = github_token + ) if (length(new_repos)) { repos <- c(repos, new_repos) page <- page + 1 @@ -57,81 +57,85 @@ analyze_r_account_activity <- function( has_more <- FALSE } } + repos +} - # Step 2: Filter for R repositories +#' @rdname commit_stats +#' +#' @param repo_list `list` A list as obtained from `get_repositories` +#' +#' @returns `filter_r_repos`: A list of filtered repositories containing R code +#' +#' @export +filter_r_repos <- + function(repo_list, username, org, github_token = gh::gh_token()) +{ message("Identifying R repositories...") - r_repos <- purrr::map_df(repos, function(repo, org) { - # Check for R language + purrr::map_df(repo_list, function(repo, username, org, github_token) { if (!missing(org)) - languages <- gh::gh( - "GET /repos/{owner}/{repo}/languages", - owner = org, - repo = repo$name, - .token = github_token - ) - else - languages <- gh::gh( - "GET /repos/{owner}/{repo}/languages", - owner = username, - repo = repo$name, - .token = github_token - ) - + username <- org + languages <- gh::gh( + "GET /repos/{owner}/{repo}/languages", + owner = username, + repo = repo$name, + .token = github_token + ) if ("R" %in% names(languages)) { tibble::tibble( full_name = repo$full_name, name = repo$name, - description = ifelse(is.null(repo$description), NA, repo$description), + description = + ifelse(is.null(repo$description), NA, repo$description), stars = repo$stargazers_count, forks = repo$forks_count, last_updated = repo$updated_at, is_fork = repo$fork, default_branch = repo$default_branch, - r_percentage = round(languages$R / sum(unlist(languages)) * 100, 1) + r_percentage = + round(languages$R / sum(unlist(languages)) * 100, 1) ) } - }, org = org) - - if (!nrow(r_repos)) { - stop("No R repositories found for this account") - } + }, username = username, org = org, github_token = github_token) +} - # Step 3: Fetch commits for each R repository - message("Fetching commits for ", nrow(r_repos), " R repositories...") +#' @rdname commit_stats +#' +#' @param repos_df `tibble` A tibble of filtered R repositories as obtained from +#' `filter_r_repos` +#' +#' @param start_date,end_date `character(1)` The start and end dates delimiting +#' commit searches in the `YYYY-MM-DD` format +#' +#' @returns `repository_commits`: A `list` of commits for each row in the +#' `repos_df` input +#' +#' @export +repository_commits <- function( + repos_df, username, org, github_token = gh::gh_token(), + start_date, end_date +) { + message("Fetching commits for ", nrow(repos_df), " R repositories...") all_commits <- list() - - for (i in seq_len(nrow(r_repos))) { - repo <- r_repos$full_name[i] - message("Processing ", repo, " (", i, "/", nrow(r_repos), ")") - + if (missing(org)) + org <- username + for (i in seq_len(nrow(repos_df))) { + repo <- repos_df$full_name[i] + message("Processing ", repo, " (", i, "/", nrow(repos_df), ")") commits <- tryCatch({ - if (!missing(org)) - gh::gh( - "GET /repos/{owner}/{repo}/commits", - author = username, - owner = org, - repo = r_repos$name[i], - since = as.POSIXct(start_date) |> format("%Y-%m-%dT%H:%M:%SZ"), - until = as.POSIXct(end_date) |> format("%Y-%m-%dT%H:%M:%SZ"), - .token = github_token - ) - else - gh::gh( - "GET /repos/{owner}/{repo}/commits", - author = username, - owner = username, - repo = r_repos$name[i], - since = as.POSIXct(start_date) |> format("%Y-%m-%dT%H:%M:%SZ"), - until = as.POSIXct(end_date) |> format("%Y-%m-%dT%H:%M:%SZ"), - .token = github_token - ) + gh::gh( + "GET /repos/{owner}/{repo}/commits", + author = username, + owner = org, + repo = repos_df$name[i], + since = start_date, + until = end_date, + .token = github_token + ) }, error = function(e) { - warning(paste("Error fetching commits for", repo, ":", e$message)) + warning("Error fetching commits for ", repo, ": ", e$message) return(list()) }) - - # Process commits - repo_commits <- purrr::map(commits, function(commit, org) { + repo_commits <- purrr::map(commits, function(commit) { list( repository = repo, sha = commit$sha, @@ -139,26 +143,15 @@ analyze_r_account_activity <- function( date = commit$commit$author$date, message = commit$commit$message, changes = tryCatch({ - if (!missing(org)) - commit_detail <- gh::gh( - "GET /repos/{owner}/{repo}/commits/{sha}", - owner = org, - repo = r_repos$name[i], - sha = commit$sha, - since = as.POSIXct(start_date) |> format("%Y-%m-%dT%H:%M:%SZ"), - until = as.POSIXct(end_date) |> format("%Y-%m-%dT%H:%M:%SZ"), - .token = github_token - ) - else - commit_detail <- gh::gh( - "GET /repos/{owner}/{repo}/commits/{sha}", - owner = username, - repo = r_repos$name[i], - sha = commit$sha, - since = as.POSIXct(start_date) |> format("%Y-%m-%dT%H:%M:%SZ"), - until = as.POSIXct(end_date) |> format("%Y-%m-%dT%H:%M:%SZ"), - .token = github_token - ) + commit_detail <- gh::gh( + "GET /repos/{owner}/{repo}/commits/{sha}", + owner = org, + repo = repos_df$name[i], + sha = commit$sha, + since = start_date, + until = end_date, + .token = github_token + ) list( additions = commit_detail$stats$additions, deletions = commit_detail$stats$deletions, @@ -168,21 +161,35 @@ analyze_r_account_activity <- function( list(additions = NA, deletions = NA, files_changed = NA) }) ) - }, org = org) - + }) all_commits <- c(all_commits, repo_commits) } + all_commits +} - # Step 4: Generate statistics +#' @rdname commit_stats +#' +#' @param commits_list `list` The output of `repository_commits` that contains +#' commit details for each repository +#' +#' @returns `repository_summary`: A `list` of `tibbles` that summarize activity +#' in the associated `repositories` for the `username` / `org` account +#' +#' @export +repository_summary <- function( + commits_list, repositories, username, org, start_date, end_date +) { + if (missing(org)) + org <- username commit_stats <- tibble::tibble( - repository = map_chr(all_commits, "repository"), - author = map_chr(all_commits, "author"), - date = map_chr(all_commits, "date"), - additions = map_dbl(all_commits, function(x) x$changes$additions), - deletions = map_dbl(all_commits, function(x) x$changes$deletions), - files_changed = map_dbl(all_commits, function(x) x$changes$files_changed) + repository = map_chr(commits_list, "repository"), + author = map_chr(commits_list, "author"), + date = map_chr(commits_list, "date"), + additions = map_dbl(commits_list, function(x) x$changes$additions), + deletions = map_dbl(commits_list, function(x) x$changes$deletions), + files_changed = + map_dbl(commits_list, function(x) x$changes$files_changed) ) - repo_summary <- commit_stats |> group_by(repository) |> summarise( @@ -192,83 +199,15 @@ analyze_r_account_activity <- function( total_deletions = sum(deletions, na.rm = TRUE), total_files_changed = sum(files_changed, na.rm = TRUE) ) - - # Step 5: Generate analysis prompt - analysis_prompt <- sprintf( - "You are a senior R developer and data scientist. Please analyze the - following GitHub development activity for %s between %s and %s.\n\n", - username, start_date, end_date - ) - - # Add repository overview - analysis_prompt <- paste0( - analysis_prompt, - "Repository Overview:\n", - paste(map_chr(seq_len(nrow(r_repos)), function(i) { - sprintf("- %s: %s (Stars: %d, Forks: %d, R code: %.1f%%)", - r_repos$full_name[i], - if(is.na(r_repos$description[i])) "No description" else r_repos$description[i], - r_repos$stars[i], - r_repos$forks[i], - r_repos$r_percentage[i]) - }), collapse = "\n"), - "\n\n" - ) - - # Add commit details - analysis_prompt <- paste0( - analysis_prompt, - "Commit Activity:\n\n", - paste(map_chr(all_commits, function(x) { - sprintf("Repository: %s\nDate: %s\nAuthor: %s\nChanges: +%s/-%s (%s files)\nMessage: %s\n", - x$repository, - x$date, - x$author, - ifelse(is.na(x$changes$additions), "?", x$changes$additions), - ifelse(is.na(x$changes$deletions), "?", x$changes$deletions), - ifelse(is.na(x$changes$files_changed), "?", x$changes$files_changed), - x$message) - }), collapse = "\n"), - "\n" - ) - - # Add overall statistics - analysis_prompt <- paste0( - analysis_prompt, - "Overall Statistics:\n", - sprintf("- Total R Repositories: %d\n", nrow(r_repos)), - sprintf("- Total Commits: %d\n", nrow(commit_stats)), - sprintf("- Unique Contributors: %d\n", n_distinct(commit_stats$author)), - sprintf("- Total Lines Added: %d\n", sum(commit_stats$additions, na.rm = TRUE)), - sprintf("- Total Lines Deleted: %d\n", sum(commit_stats$deletions, na.rm = TRUE)), - sprintf("- Total Files Changed: %d\n\n", sum(commit_stats$files_changed, na.rm = TRUE)) - ) - - # Add analysis request - analysis_prompt <- paste0( - analysis_prompt, - "Please provide a comprehensive analysis of this R development activity, including:\n", - "1. Overall development patterns and trends\n", - "2. Key areas of focus across repositories\n", - "3. Notable features or changes\n", - "4. Development activity distribution\n", - "5. Collaboration patterns\n", - "6. Recommendations for future development\n\n", - "Please structure your response in clear sections and provide specific examples from the commit data to support your analysis." - ) - - # Return results - list( - account_info = list( - name = username, - date_range = list(start = start_date, end = end_date) + summary <- list( + account_info = tibble::tibble( + name = username, org = org, start = start_date, end = end_date ), - repositories = r_repos, + repositories = repositories, repository_stats = repo_summary, commit_details = commit_stats, - analysis_prompt = analysis_prompt, - overall_stats = list( - total_repositories = nrow(r_repos), + overall_stats = tibble::tibble( + total_repositories = nrow(repositories), total_commits = nrow(commit_stats), unique_authors = n_distinct(commit_stats$author), total_additions = sum(commit_stats$additions, na.rm = TRUE), @@ -276,16 +215,66 @@ analyze_r_account_activity <- function( total_files_changed = sum(commit_stats$files_changed, na.rm = TRUE) ) ) + class(summary) <- c("commit_summary", class(summary)) + summary +} + +#' @rdname commit_stats +#' +#' @importFrom tibble tibble +#' @importFrom gh gh gh_token +#' @importFrom purrr map_df map_chr map_dbl map +#' +#' @returns `summarize_account_activity`: A `list` of `tibbles` that summarize +#' activity in the associated `repositories` for the `username` / `org` +#' account +#' +#' @export +summarize_account_activity <- function( + username, + org, + start_date, + end_date, + github_token = gh::gh_token() +) { + start_date <- as.POSIXct(start_date) |> format("%Y-%m-%dT%H:%M:%SZ") + end_date <- as.POSIXct(end_date) |> format("%Y-%m-%dT%H:%M:%SZ") + # Step 1: Find all repositories for the account + repos <- get_repositories( + username = username, org = org, github_token = github_token + ) + # Step 2: Filter for R repositories + r_repos <- filter_r_repos( + repos, username = username, org = org, github_token = github_token + ) + if (!nrow(r_repos)) + stop("No R package repositories found in 'username' / 'org' account") + # Step 3: Fetch commits for each R repository + commits_list <- repository_commits( + repos_df = r_repos, username = username, org = org, + github_token = github_token, + start_date = start_date, end_date = end_date + ) + # Step 4: Summarize statistics + repository_summary( + commits_list = commits_list, + repositories = r_repos, username = username, org = org, + start_date = start_date, end_date = end_date + ) } # Print method for nice output + #' @export -print.r_account_analysis <- function(x) { +print.commit_summary <- function(x) { cat("\nR Development Activity Analysis\n") cat("============================\n") - cat(sprintf("Username/Org: %s (%s)\n", x$account_info$name)) - cat(sprintf("Period: %s to %s\n\n", x$account_info$date_range$start, x$account_info$date_range$end)) - + cat("Username:",x$account_info$name, "\n") + if (length(x$account_info$org)) + cat("Org:", x$account_info$org, "\n") + cat( + "Period:", x$account_info$start, "to", x$account_info$end, "\n\n" + ) cat("Overall Statistics:\n") cat(sprintf("- R Repositories: %d\n", x$overall_stats$total_repositories)) cat(sprintf("- Total Commits: %d\n", x$overall_stats$total_commits)) @@ -296,10 +285,4 @@ print.r_account_analysis <- function(x) { cat("Repository Summary:\n") print(x$repository_stats) - - cat("\nAnalysis Prompt:\n") - cat("Copy and paste the following prompt into your preferred AI tool:\n") - cat("----------------------------------------------------------------\n") - cat(x$analysis_prompt) - cat("\n----------------------------------------------------------------\n") } diff --git a/man/commit_stats.Rd b/man/commit_stats.Rd new file mode 100644 index 0000000..70765cc --- /dev/null +++ b/man/commit_stats.Rd @@ -0,0 +1,94 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/commit-stats.R +\name{commit_stats} +\alias{commit_stats} +\alias{get_repositories} +\alias{filter_r_repos} +\alias{repository_commits} +\alias{repository_summary} +\alias{summarize_account_activity} +\title{Get and summarize repository statistics from GitHub} +\usage{ +get_repositories(username, org, github_token = gh::gh_token()) + +filter_r_repos(repo_list, username, org, github_token = gh::gh_token()) + +repository_commits( + repos_df, + username, + org, + github_token = gh::gh_token(), + start_date, + end_date +) + +repository_summary( + commits_list, + repositories, + username, + org, + start_date, + end_date +) + +summarize_account_activity( + username, + org, + start_date, + end_date, + github_token = gh::gh_token() +) +} +\arguments{ +\item{username}{`character(1)` The GitHub username} + +\item{org}{`character(1)` optional. The organization account for which to +search repositories for.} + +\item{github_token}{`gh_pat` The personal access token obtained from GitHub. +By default, `gh::gh_token()` is used.} + +\item{repo_list}{`list` A list as obtained from `get_repositories`} + +\item{repos_df}{`tibble` A tibble of filtered R repositories as obtained from +`filter_r_repos`} + +\item{start_date, end_date}{`character(1)` The start and end dates delimiting +commit searches in the `YYYY-MM-DD` format} + +\item{commits_list}{`list` The output of `repository_commits` that contains +commit details for each repository} +} +\value{ +`get_repositories`: A list of repositories for the corresponding + account or organization + +`filter_r_repos`: A list of filtered repositories containing R code + +`repository_commits`: A `list` of commits for each row in the + `repos_df` input + +`repository_summary`: A `list` of `tibbles` that summarize activity + in the associated `repositories` for the `username` / `org` account + +`summarize_account_activity`: A `list` of `tibbles` that summarize + activity in the associated `repositories` for the `username` / `org` + account +} +\description{ +These functions allow the user to query the GitHub API and + produce a comprehensive summary of commit activity for each R package + repository. The main workhorse function is `summarize_account_activity` + which calls all other functions internally to produce a summary of GitHub R + repository activity. +} +\examples{ +if (interactive()) { + summarize_account_activity( + username = "LiNk-NY", + org = "waldronlab", + start_date = "2023-08-31", + end_date = "2024-09-01" + ) +} +} From 3953586ca4cf7662ab713f4bfd3d89f9ae93bb4a Mon Sep 17 00:00:00 2001 From: LiNk-NY Date: Mon, 6 Jan 2025 16:04:59 -0500 Subject: [PATCH 3/3] update print.commit_summary in docs --- NAMESPACE | 2 +- R/commit-stats.R | 2 ++ man/commit_stats.Rd | 4 ++++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 4c87595..f34c73b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,7 @@ # Generated by roxygen2: do not edit by hand S3method(print,bioc_support_stats) -S3method(print,r_account_analysis) +S3method(print,commit_summary) export(filter_r_repos) export(get_repositories) export(get_support_site_stats) diff --git a/R/commit-stats.R b/R/commit-stats.R index 681fb7d..7a70742 100644 --- a/R/commit-stats.R +++ b/R/commit-stats.R @@ -265,6 +265,8 @@ summarize_account_activity <- function( # Print method for nice output +#' @rdname commit_stats +#' #' @export print.commit_summary <- function(x) { cat("\nR Development Activity Analysis\n") diff --git a/man/commit_stats.Rd b/man/commit_stats.Rd index 70765cc..bff5d2d 100644 --- a/man/commit_stats.Rd +++ b/man/commit_stats.Rd @@ -7,6 +7,7 @@ \alias{repository_commits} \alias{repository_summary} \alias{summarize_account_activity} +\alias{print.commit_summary} \title{Get and summarize repository statistics from GitHub} \usage{ get_repositories(username, org, github_token = gh::gh_token()) @@ -38,6 +39,8 @@ summarize_account_activity( end_date, github_token = gh::gh_token() ) + +\method{print}{commit_summary}(x) } \arguments{ \item{username}{`character(1)` The GitHub username} @@ -84,6 +87,7 @@ These functions allow the user to query the GitHub API and } \examples{ if (interactive()) { + gitcreds::gitcreds_set() summarize_account_activity( username = "LiNk-NY", org = "waldronlab",