Skip to content

Commit

Permalink
Merge pull request #7 from LiNk-NY/commit-stats
Browse files Browse the repository at this point in the history
Add summary statistics for commits in repositories
  • Loading branch information
vjcitn authored Jan 6, 2025
2 parents 841d097 + 3953586 commit 1b5c0cc
Show file tree
Hide file tree
Showing 7 changed files with 455 additions and 45 deletions.
8 changes: 6 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,14 @@ Imports: tibble,
utils,
dplyr,
ggplot2,
gh,
httr,
lubridate, rorcid
lubridate,
purrr,
rorcid,
tibble
Suggests: knitr, testthat
LazyData: true
VignetteBuilder: knitr
biocViews: Infrastructure
RoxygenNote: 7.2.1
RoxygenNote: 7.3.2
13 changes: 13 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,13 +1,26 @@
# Generated by roxygen2: do not edit by hand

S3method(print,bioc_support_stats)
S3method(print,commit_summary)
export(filter_r_repos)
export(get_repositories)
export(get_support_site_stats)
export(orcid_table)
export(repository_commits)
export(repository_summary)
export(summarize_account_activity)
export(summarize_software_downloads)
import(dplyr)
import(ggplot2)
import(httr)
import(lubridate)
import(rorcid)
import(tibble)
importFrom(gh,gh)
importFrom(gh,gh_token)
importFrom(purrr,map)
importFrom(purrr,map_chr)
importFrom(purrr,map_dbl)
importFrom(purrr,map_df)
importFrom(tibble,tibble)
importFrom(utils,read.table)
290 changes: 290 additions & 0 deletions R/commit-stats.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,290 @@
#' @name commit_stats
#'
#' @title Get and summarize repository statistics from GitHub
#'
#' @description These functions allow the user to query the GitHub API and
#' produce a comprehensive summary of commit activity for each R package
#' repository. The main workhorse function is `summarize_account_activity`
#' which calls all other functions internally to produce a summary of GitHub R
#' repository activity.
#'
#' @param username `character(1)` The GitHub username
#'
#' @param org `character(1)` optional. The organization account for which to
#' search repositories for.
#'
#' @param github_token `gh_pat` The personal access token obtained from GitHub.
#' By default, `gh::gh_token()` is used.
#'
#' @returns `get_repositories`: A list of repositories for the corresponding
#' account or organization
#'
#' @examples
#' if (interactive()) {
#' gitcreds::gitcreds_set()
#' summarize_account_activity(
#' username = "LiNk-NY",
#' org = "waldronlab",
#' start_date = "2023-08-31",
#' end_date = "2024-09-01"
#' )
#' }
#' @export
get_repositories <- function(username, org, github_token = gh::gh_token()) {
message("Finding repositories for ", username, "...")
repos <- list()
page <- 1
has_more <- TRUE

while (has_more) {
if (!missing(org)) {
endpoint <- "GET /orgs/{org}/repos"
username <- org
} else {
endpoint <- "GET /users/{username}/repos"
}
new_repos <- gh::gh(
endpoint,
username = username,
org = username,
page = page,
.token = github_token
)
if (length(new_repos)) {
repos <- c(repos, new_repos)
page <- page + 1
} else {
has_more <- FALSE
}
}
repos
}

#' @rdname commit_stats
#'
#' @param repo_list `list` A list as obtained from `get_repositories`
#'
#' @returns `filter_r_repos`: A list of filtered repositories containing R code
#'
#' @export
filter_r_repos <-
function(repo_list, username, org, github_token = gh::gh_token())
{
message("Identifying R repositories...")
purrr::map_df(repo_list, function(repo, username, org, github_token) {
if (!missing(org))
username <- org
languages <- gh::gh(
"GET /repos/{owner}/{repo}/languages",
owner = username,
repo = repo$name,
.token = github_token
)
if ("R" %in% names(languages)) {
tibble::tibble(
full_name = repo$full_name,
name = repo$name,
description =
ifelse(is.null(repo$description), NA, repo$description),
stars = repo$stargazers_count,
forks = repo$forks_count,
last_updated = repo$updated_at,
is_fork = repo$fork,
default_branch = repo$default_branch,
r_percentage =
round(languages$R / sum(unlist(languages)) * 100, 1)
)
}
}, username = username, org = org, github_token = github_token)
}

#' @rdname commit_stats
#'
#' @param repos_df `tibble` A tibble of filtered R repositories as obtained from
#' `filter_r_repos`
#'
#' @param start_date,end_date `character(1)` The start and end dates delimiting
#' commit searches in the `YYYY-MM-DD` format
#'
#' @returns `repository_commits`: A `list` of commits for each row in the
#' `repos_df` input
#'
#' @export
repository_commits <- function(
repos_df, username, org, github_token = gh::gh_token(),
start_date, end_date
) {
message("Fetching commits for ", nrow(repos_df), " R repositories...")
all_commits <- list()
if (missing(org))
org <- username
for (i in seq_len(nrow(repos_df))) {
repo <- repos_df$full_name[i]
message("Processing ", repo, " (", i, "/", nrow(repos_df), ")")
commits <- tryCatch({
gh::gh(
"GET /repos/{owner}/{repo}/commits",
author = username,
owner = org,
repo = repos_df$name[i],
since = start_date,
until = end_date,
.token = github_token
)
}, error = function(e) {
warning("Error fetching commits for ", repo, ": ", e$message)
return(list())
})
repo_commits <- purrr::map(commits, function(commit) {
list(
repository = repo,
sha = commit$sha,
author = commit$commit$author$name,
date = commit$commit$author$date,
message = commit$commit$message,
changes = tryCatch({
commit_detail <- gh::gh(
"GET /repos/{owner}/{repo}/commits/{sha}",
owner = org,
repo = repos_df$name[i],
sha = commit$sha,
since = start_date,
until = end_date,
.token = github_token
)
list(
additions = commit_detail$stats$additions,
deletions = commit_detail$stats$deletions,
files_changed = length(commit_detail$files)
)
}, error = function(e) {
list(additions = NA, deletions = NA, files_changed = NA)
})
)
})
all_commits <- c(all_commits, repo_commits)
}
all_commits
}

#' @rdname commit_stats
#'
#' @param commits_list `list` The output of `repository_commits` that contains
#' commit details for each repository
#'
#' @returns `repository_summary`: A `list` of `tibbles` that summarize activity
#' in the associated `repositories` for the `username` / `org` account
#'
#' @export
repository_summary <- function(
commits_list, repositories, username, org, start_date, end_date
) {
if (missing(org))
org <- username
commit_stats <- tibble::tibble(
repository = map_chr(commits_list, "repository"),
author = map_chr(commits_list, "author"),
date = map_chr(commits_list, "date"),
additions = map_dbl(commits_list, function(x) x$changes$additions),
deletions = map_dbl(commits_list, function(x) x$changes$deletions),
files_changed =
map_dbl(commits_list, function(x) x$changes$files_changed)
)
repo_summary <- commit_stats |>
group_by(repository) |>
summarise(
total_commits = n(),
unique_authors = n_distinct(author),
total_additions = sum(additions, na.rm = TRUE),
total_deletions = sum(deletions, na.rm = TRUE),
total_files_changed = sum(files_changed, na.rm = TRUE)
)
summary <- list(
account_info = tibble::tibble(
name = username, org = org, start = start_date, end = end_date
),
repositories = repositories,
repository_stats = repo_summary,
commit_details = commit_stats,
overall_stats = tibble::tibble(
total_repositories = nrow(repositories),
total_commits = nrow(commit_stats),
unique_authors = n_distinct(commit_stats$author),
total_additions = sum(commit_stats$additions, na.rm = TRUE),
total_deletions = sum(commit_stats$deletions, na.rm = TRUE),
total_files_changed = sum(commit_stats$files_changed, na.rm = TRUE)
)
)
class(summary) <- c("commit_summary", class(summary))
summary
}

#' @rdname commit_stats
#'
#' @importFrom tibble tibble
#' @importFrom gh gh gh_token
#' @importFrom purrr map_df map_chr map_dbl map
#'
#' @returns `summarize_account_activity`: A `list` of `tibbles` that summarize
#' activity in the associated `repositories` for the `username` / `org`
#' account
#'
#' @export
summarize_account_activity <- function(
username,
org,
start_date,
end_date,
github_token = gh::gh_token()
) {
start_date <- as.POSIXct(start_date) |> format("%Y-%m-%dT%H:%M:%SZ")
end_date <- as.POSIXct(end_date) |> format("%Y-%m-%dT%H:%M:%SZ")
# Step 1: Find all repositories for the account
repos <- get_repositories(
username = username, org = org, github_token = github_token
)
# Step 2: Filter for R repositories
r_repos <- filter_r_repos(
repos, username = username, org = org, github_token = github_token
)
if (!nrow(r_repos))
stop("No R package repositories found in 'username' / 'org' account")
# Step 3: Fetch commits for each R repository
commits_list <- repository_commits(
repos_df = r_repos, username = username, org = org,
github_token = github_token,
start_date = start_date, end_date = end_date
)
# Step 4: Summarize statistics
repository_summary(
commits_list = commits_list,
repositories = r_repos, username = username, org = org,
start_date = start_date, end_date = end_date
)
}

# Print method for nice output

#' @rdname commit_stats
#'
#' @export
print.commit_summary <- function(x) {
cat("\nR Development Activity Analysis\n")
cat("============================\n")
cat("Username:",x$account_info$name, "\n")
if (length(x$account_info$org))
cat("Org:", x$account_info$org, "\n")
cat(
"Period:", x$account_info$start, "to", x$account_info$end, "\n\n"
)
cat("Overall Statistics:\n")
cat(sprintf("- R Repositories: %d\n", x$overall_stats$total_repositories))
cat(sprintf("- Total Commits: %d\n", x$overall_stats$total_commits))
cat(sprintf("- Unique Contributors: %d\n", x$overall_stats$unique_authors))
cat(sprintf("- Lines Added: %d\n", x$overall_stats$total_additions))
cat(sprintf("- Lines Deleted: %d\n", x$overall_stats$total_deletions))
cat(sprintf("- Files Changed: %d\n\n", x$overall_stats$total_files_changed))

cat("Repository Summary:\n")
print(x$repository_stats)
}
49 changes: 25 additions & 24 deletions R/download-stats.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,37 +8,38 @@
#' @note Modified from BiocReports script to give raw data on download activities per month.
#' There may be effects in 2022 of modified distribution channels.
#' @return a list with components ipplot, dlplot with ggplot elements, and a tibble with all stats.
#' @examples
#' @examples
#' pls = summarize_software_downloads()
#' opar = par(no.readonly=TRUE)
#' par(ask=TRUE)
#' pls$ipplot
#' pls$dlplot
#' par(opar)
#' @export
summarize_software_downloads = function( years = 2018:2022, final = "2022-08-16" ) {
summarize_software_downloads <-
function(years = 2018:2022, final = "2022-08-16")
{
urls <- sprintf(
"http://bioconductor.org/packages/stats/bioc/bioc_%d_stats.tab",
years
)

urls <- sprintf(
"http://bioconductor.org/packages/stats/bioc/bioc_%d_stats.tab",
years
)
tbl <- Map(read.table, urls, MoreArgs=list(header=TRUE))
tbl0 <- as_tibble(do.call(rbind, unname(tbl)))
tbl0 = tbl0 |> filter(Month != "all")
dd = paste(tbl0$Year, tbl0$Month, "15", sep="-")
dld = as_date(dd)
tbl0$Date = dld
tbl0 = tbl0 |> dplyr::filter(Date < as_date(final))

tbl <- Map(read.table, urls, MoreArgs=list(header=TRUE))
tbl0 <- as_tibble(do.call(rbind, unname(tbl)))
tbl0 = tbl0 |> filter(Month != "all")
dd = paste(tbl0$Year, tbl0$Month, "15", sep="-")
dld = as_date(dd)
tbl0$Date = dld
tbl0 = tbl0 |> dplyr::filter(Date < as_date(final))

ipplot = ggplot(tbl0, aes(x=Date, y=Nb_of_distinct_IPs)) +
geom_point(size=5) +
ylab("# Unique IP addresses") +
ggtitle("Unique IPs requesting downloads")
dlplot = ggplot(tbl0, aes(x=Date, y=Nb_of_downloads)) +
geom_point(size=5) +
ylab("# downloads") +
ggtitle("Number of downloads requested")
list(ipplot=ipplot, dlplot=dlplot, table=tbl0)
}
ipplot = ggplot(tbl0, aes(x=Date, y=Nb_of_distinct_IPs)) +
geom_point(size=5) +
ylab("# Unique IP addresses") +
ggtitle("Unique IPs requesting downloads")
dlplot = ggplot(tbl0, aes(x=Date, y=Nb_of_downloads)) +
geom_point(size=5) +
ylab("# downloads") +
ggtitle("Number of downloads requested")
list(ipplot=ipplot, dlplot=dlplot, table=tbl0)
}

Loading

0 comments on commit 1b5c0cc

Please sign in to comment.