From 0cc41231724e9e096672c753deeb52e040bb8627 Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Sun, 22 Sep 2024 12:02:41 -1000 Subject: [PATCH] i #284 Added refresh_mod_mbox function for refreshing Mod Mbox archives - Created `refresh_mod_mbox` function to automatically refresh mailing list archives downloaded using Mod Mbox. - The function checks for the latest downloaded file, deletes it, and redownloads the archive from that month to the current date. - Added documentation for `refresh_mod_mbox` to the notebook. Signed-off-by: Dao McGill --- NAMESPACE | 2 +- R/mail.R | 111 +++++++++++++++-------------- conf/helix.yml | 4 +- man/download_mod_mbox.Rd | 6 +- man/download_mod_mbox_per_month.Rd | 37 ---------- man/refresh_mod_mbox.Rd | 35 +++++++++ vignettes/download_mail.Rmd | 22 +++++- 7 files changed, 116 insertions(+), 101 deletions(-) delete mode 100644 man/download_mod_mbox_per_month.Rd create mode 100644 man/refresh_mod_mbox.Rd diff --git a/NAMESPACE b/NAMESPACE index f6e15a60..75f0d9fd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,7 +15,6 @@ export(download_jira_issues) export(download_jira_issues_by_date) export(download_jira_issues_by_issue_key) export(download_mod_mbox) -export(download_mod_mbox_per_month) export(download_pipermail) export(dv8_clsxb_to_clsxj) export(dv8_clsxj_to_clsxb) @@ -139,6 +138,7 @@ export(query_src_text_namespace) export(read_temporary_file) export(recolor_network_by_community) export(refresh_jira_issues) +export(refresh_mod_mbox) export(refresh_pipermail) export(smell_missing_links) export(smell_organizational_silo) diff --git a/R/mail.R b/R/mail.R index 49a041cb..c58152cc 100644 --- a/R/mail.R +++ b/R/mail.R @@ -313,11 +313,11 @@ process_gz_to_mbox_in_folder <- function(folder_path, verbose = TRUE) { #' (e.g., "https://lists.apache.org/list.html?announce@apache.org"). #' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM'). 
#' @param end_year_month The year and month of the last file to be downloaded (format: 'YYYYMM'). -#' @param save_file_path The folder path where all the downloaded mbox files will be stored. +#' @param save_folder_path The folder path where all the downloaded mbox files will be stored. #' @param verbose if TRUE, prints detailed messages during the download process. -#' @return Returns `save_file_path`, the folder path where the mbox files are stored. +#' @return Returns `save_folder_path`, the folder path where the mbox files are stored. #' @export -download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, save_file_path, verbose = FALSE) { +download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = FALSE) { ########## Extract Mailing List Name ########## # Extract the mailing list name from the given URL. This is because the actual list name is @@ -357,7 +357,7 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa # Create the file name where the mbox will be saved locally, in the format ''kaiaulu_'YYYYMM.mbox'. file_name <- stringi::stri_c("kaiaulu_", year, month_str, ".mbox") - file_path <- file.path(save_file_path, file_name) + file_path <- file.path(save_folder_path, file_name) if (verbose) { cat("Constructed URL:", download_url, "\n") @@ -386,70 +386,71 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa ########## Return Save Path ########## # Return the folder path where all mbox files were saved. - return(save_file_path) + return(save_folder_path) } -#' Compose mod_mbox archives (.mbox) into a single mbox file for use with \code{\link{parse_mbox}} -#' @param archive_url A url pointing to the mod_mbox mailing list directory (e.g. "http://mail-archives.apache.org/mod_mbox/apr-dev") without trailing slashes -#' @param mailing_list Name of the project mailing list (e.g. 
apr-dev) in the mod_mbox directory -#' @param archive_type Name of the archive that the project mailing list is archived in (e.g. apache) -#' @param from_year First year in the range to be downloaded -#' @param to_year Last year in the range to be downloaded -#' @param save_folder_path the full *folder* path where the monthly downloaded mbox will be stored. -#' @param verbose Prints progress during execution -#' @return Returns the path of the downloaded mbox file. -#' @export -download_mod_mbox_per_month <- function(archive_url, mailing_list, archive_type, from_year, to_year, save_folder_path,verbose=FALSE) { - - #Initialize variables - counter <- 0 - destination <- list() +############## Mod Mbox Refresher ############## - #Open file handle to output file - output <- path.expand(save_folder_path) +#' Refresh mbox files downloaded via mod_mbox +#' +#' @description This function refreshes the mailing list files by checking the contents of a specified folder. +#' If the folder is empty, it calls \code{download_mod_mbox} to download all mod_mbox files from start_year_month to the current month. +#' If the folder contains already-downloaded mbox files, it identifies the most recent month, deletes that file, and redownloads it +#' along with all future months up to the current real-life month. +#' +#' The naming convention of files is kaiaulu_YYYYMM.mbox, and the function uses this pattern to identify the most recent month. +#' After deleting the most recent file, the function ensures that the month is redownloaded, along with all subsequent months up to the current month. +#' Redownloading the most recent file ensures any files added in that month after the latest refresh are included. +#' +#' @param mailing_list The URL of the mailing list being downloaded (e.g., 'https://lists.apache.org/list.html?announce@apache.org') +#' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM'). 
+#' @param save_folder_path The folder path in which all the downloaded mod_mbox files will be stored. +#' @param verbose if TRUE, prints diagnostic messages. +#' @return Returns `save_folder_path`, the folder path where the mbox files are stored, as returned by \code{download_mod_mbox}. +#' @export +refresh_mod_mbox <- function(mailing_list, start_year_month, save_folder_path, verbose = TRUE) { - current_date <- Sys.Date() - current_year <- as.numeric(substr(current_date, 1, 4)) - current_month <- as.numeric(substr(current_date, 6, 7)) + ########## Check if Folder is Empty ########## + # Check the contents of the folder to see if any .mbox files are already present. + # The function looks for files that match the naming pattern 'kaiaulu_YYYYMM.mbox' + files_in_folder <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$") - #Loop through time and compose the mbox file - for (year in (from_year:to_year)) { + if (length(files_in_folder) == 0) { + # If the folder is empty, download all mod_mbox files starting from start_year_month + # The end date is set to the current month based on the system date + end_year_month <- format(Sys.Date(), "%Y%m") + if (verbose) cat("Folder is empty.
Downloading from", start_year_month, "to", end_year_month, "\n") - for (month in 1:12) { - # Check to stop function when month iterates path current real life month - if (year == current_year && month > current_month) { - return(output) - } - counter <- counter + 1 + # Call the download_mod_mbox function to download files from start_year_month to end_year_month + download_mod_mbox(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = verbose) + } + ########## Identify the Most Recent Month ########## + else { + # If the folder is not empty, identify the most recent month based on the filenames + # The filenames follow the pattern 'kaiaulu_YYYYMM.mbox', so we extract the YYYYMM part of the filenames + year_months <- gsub("kaiaulu_(\\d{6})\\.mbox$", "\\1", files_in_folder) - #Generate file destinations for the monthly files in /tmp/ - destination[[counter]] <- sprintf("%d%02d.mbox", year, month) - mbox_file_name <- stringi::stri_c(mailing_list, archive_type, destination[[counter]], sep = "_") + # Find the most recent month by taking the maximum of the extracted YYYYMM values + recent_month <- max(year_months) - if(verbose){ - print(stringi::stri_c("Downloading:",mbox_file_name,sep = " ")) - } + # Delete the most recent file before redownloading it + recent_file <- file.path(save_folder_path, stringi::stri_c("kaiaulu_", recent_month, ".mbox")) + if (file.exists(recent_file)) { + file.remove(recent_file) + if (verbose) cat("Deleted the most recent file:", recent_file, "\n") + } - #Try file download and save result - full_month_url <- stringi::stri_c(archive_url, destination[[counter]], sep = "/") - full_tmp_save_path <- file.path(output,mbox_file_name) - x <- httr::GET(full_month_url, - httr::write_disk(full_tmp_save_path,overwrite=TRUE)) - - # Remove file if error - # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 - if (httr::http_error(x) && file.exists(full_tmp_save_path)) { - warning(stringi::stri_c("Unable to 
download: ",destination[[counter]])) - file.remove(full_tmp_save_path) - } + ########## Redownload from the Most Recent Month ########## + # Set the end_year_month to the current month (based on the system date) + end_year_month <- format(Sys.Date(), "%Y%m") - } + # Redownload files from the most recent month (that was just deleted) to the current month + if (verbose) cat("Redownloading from", recent_month, "to", end_year_month, "\n") + # Call the download_mod_mbox function to redownload the deleted month and all subsequent months up to the current month + download_mod_mbox(mailing_list, recent_month, end_year_month, save_folder_path, verbose = verbose) } - - #return output location - return(output) } diff --git a/conf/helix.yml b/conf/helix.yml index c5f62d27..18b1bc6d 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -53,12 +53,12 @@ mailing_list: mailing_list: https://lists.apache.org/list.html?announce@apache.org start_year_month: 202310 end_year_month: 202405 - save_file_path: "../save_mbox_mail" + save_folder_path: "../save_mbox_mail" mail_key_2: mailing_list: https://lists.apache.org/list.html?dev@felix.apache.org start_year_month: 202201 end_year_month: 202401 - save_file_path: "../save_mbox_mail" + save_folder_path: "../save_mbox_mail" pipermail: project_key_1: # archive_url: https://mta.openssl.org/mailman/listinfo/ diff --git a/man/download_mod_mbox.Rd b/man/download_mod_mbox.Rd index 26a765e3..c628be38 100644 --- a/man/download_mod_mbox.Rd +++ b/man/download_mod_mbox.Rd @@ -8,7 +8,7 @@ download_mod_mbox( mailing_list, start_year_month, end_year_month, - save_file_path, + save_folder_path, verbose = FALSE ) } @@ -20,12 +20,12 @@ download_mod_mbox( \item{end_year_month}{The year and month of the last file to be downloaded (format: 'YYYYMM').} -\item{save_file_path}{The folder path where all the downloaded mbox files will be stored.} +\item{save_folder_path}{The folder path where all the downloaded mbox files will be stored.} \item{verbose}{if TRUE, 
prints detailed messages during the download process.} } \value{ -Returns `save_file_path`, the folder path where the mbox files are stored. +Returns `save_folder_path`, the folder path where the mbox files are stored. } \description{ This function downloads mod_mbox archives from a specified Apache Pony Mail mailing list as .mbox files. diff --git a/man/download_mod_mbox_per_month.Rd b/man/download_mod_mbox_per_month.Rd deleted file mode 100644 index 2debab7b..00000000 --- a/man/download_mod_mbox_per_month.Rd +++ /dev/null @@ -1,37 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/mail.R -\name{download_mod_mbox_per_month} -\alias{download_mod_mbox_per_month} -\title{Compose mod_mbox archives (.mbox) into a single mbox file for use with \code{\link{parse_mbox}}} -\usage{ -download_mod_mbox_per_month( - archive_url, - mailing_list, - archive_type, - from_year, - to_year, - save_folder_path, - verbose = FALSE -) -} -\arguments{ -\item{archive_url}{A url pointing to the mod_mbox mailing list directory (e.g. "http://mail-archives.apache.org/mod_mbox/apr-dev") without trailing slashes} - -\item{mailing_list}{Name of the project mailing list (e.g. apr-dev) in the mod_mbox directory} - -\item{archive_type}{Name of the archive that the project mailing list is archived in (e.g. apache)} - -\item{from_year}{First year in the range to be downloaded} - -\item{to_year}{Last year in the range to be downloaded} - -\item{save_folder_path}{the full *folder* path where the monthly downloaded mbox will be stored.} - -\item{verbose}{Prints progress during execution} -} -\value{ -Returns the path of the downloaded mbox file. 
-} -\description{ -Compose mod_mbox archives (.mbox) into a single mbox file for use with \code{\link{parse_mbox}} -} diff --git a/man/refresh_mod_mbox.Rd b/man/refresh_mod_mbox.Rd new file mode 100644 index 00000000..43f6349a --- /dev/null +++ b/man/refresh_mod_mbox.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mail.R +\name{refresh_mod_mbox} +\alias{refresh_mod_mbox} +\title{Refresh mbox files downloaded via mod_mbox} +\usage{ +refresh_mod_mbox( + mailing_list, + start_year_month, + save_folder_path, + verbose = TRUE +) +} +\arguments{ +\item{mailing_list}{The URL of the mailing list being downloaded (e.g., 'https://lists.apache.org/list.html?announce@apache.org')} + +\item{start_year_month}{The year and month of the first file to be downloaded (format: 'YYYYMM').} + +\item{save_folder_path}{The folder path in which all the downloaded mod_mbox files will be stored.} + +\item{verbose}{if TRUE, prints diagnostic messages.} +} +\value{ +Returns `save_folder_path`, the folder path where the mbox files are stored, as returned by \code{download_mod_mbox}. +} +\description{ +This function refreshes the mailing list files by checking the contents of a specified folder. +If the folder is empty, it calls \code{download_mod_mbox} to download all mod_mbox files from start_year_month to the current month. +If the folder contains already-downloaded mbox files, it identifies the most recent month, deletes that file, and redownloads it +along with all future months up to the current real-life month. + +The naming convention of files is kaiaulu_YYYYMM.mbox, and the function uses this pattern to identify the most recent month. +After deleting the most recent file, the function ensures that the month is redownloaded, along with all subsequent months up to the current month. +Redownloading the most recent file ensures any files added in that month after the latest refresh are included.
+} diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index a5f7f53a..1e635350 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -108,14 +108,14 @@ Similar to Pipermail, we load the configuration for Mod Mbox from the YAML file, mod_mbox_list <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["mailing_list"]] mod_start_year_month <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["start_year_month"]] mod_end_year_month <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["end_year_month"]] -mod_save_file_path <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["save_file_path"]] +mod_save_folder_path <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["save_folder_path"]] ``` ### Explanation of Configuration Parameters - mailing_list: The URL of the Mod Mbox mailing list (e.g., https://lists.apache.org/list.html?announce@apache.org). - start_year_month: The first month to download (format: YYYYMM). - end_year_month: The last month to download (format: YYYYMM). -- save_file_path: The directory where the downloaded .mbox files will be saved. +- save_folder_path: The directory where the downloaded .mbox files will be saved. ##Mod Mbox Downloader The download_mod_mbox() function downloads Mod Mbox archives by constructing URLs based on the mailing list and date range, saving them as .mbox files named kaiaulu_YYYYMM.mbox. @@ -125,11 +125,27 @@ download_mod_mbox( mailing_list = mod_mbox_list, start_year_month = mod_start_year_month, end_year_month = mod_end_year_month, - save_file_path = mod_save_file_path, + save_folder_path = mod_save_folder_path, verbose = TRUE ``` After running the function, it constructs URLs like: https://lists.apache.org/api/mbox.lua?list=announce@apache.org&date=2024-01 and saves the files in the specified folder. +## Mod Mbox Refresher +To refresh these archives to ensure that you have the latest messages, you can use the refresh_mod_mbox function. 
This function works similarly to the Pipermail refresher. +How refresh_mod_mbox Works +1. Checks if the folder is empty and, if so, downloads the archives starting from start_year_month to the current month by calling download_mod_mbox(). +2. If the folder contains files, it identifies the most recent one using the YYYYMM found in the filename. This file is deleted, and then redownloaded along with all subsequent months up to the current month. + +```{r} +refresh_mod_mbox( + mailing_list = mod_mbox_list, + start_year_month = mod_start_year_month, + save_folder_path = mod_save_folder_path, + verbose = TRUE +) +``` + +This ensures your archive is up-to-date, accounting for new data that may have been added to the mailing list since the last download.