From 7bf8ba6510c4050a44d625965181fb8246b53ad4 Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Thu, 3 Oct 2024 10:15:04 -1000 Subject: [PATCH] i #284 Refactored parse_mbox_latest_date and Fixed Roxygen Errors - parse_mbox_latest_date() now uses new naming convention for files - Added to download_mail.Rmd - Fixed documentation for download_pipermail() Signed-off-by: Dao McGill --- DESCRIPTION | 2 +- R/mail.R | 45 ++++++++++++++++++------------- man/commit_message_id_coverage.Rd | 2 +- man/download_mod_mbox.Rd | 2 +- man/download_pipermail.Rd | 6 ++--- man/parse_mbox.Rd | 4 +-- man/parse_mbox_latest_date.Rd | 17 +++++------- vignettes/download_mail.Rmd | 32 +++++++++++++++++++++- 8 files changed, 73 insertions(+), 37 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ae28c702..5a793074 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -50,4 +50,4 @@ Imports: VignetteBuilder: knitr URL: https://github.com/sailuh/kaiaulu BugReports: https://github.com/sailuh/kaiaulu/issues -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 diff --git a/R/mail.R b/R/mail.R index 710ccaa2..c2e355c6 100644 --- a/R/mail.R +++ b/R/mail.R @@ -18,9 +18,9 @@ #' The downloaded .mbox files are saved in the specified folder following the naming convention kaiaulu_YYYYMM.mbox. #' The function only downloads files that fall between the specified start_year_month and end_year_month. #' -#' @param mailing_list The name of the mailing list being downloaded (e.g. "https://mta.openssl.org/pipermail/openssl-announce/") -#' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM') -#' @param end_year_month The year and month of the last file to be downloaded (format: 'YYYYMM', or use 'format(Sys.Date(), "%Y%m")' for the current month) +#' @param mailing_list The name of the mailing list being downloaded e.g. 
"https://mta.openssl.org/pipermail/openssl-announce/" +#' @param start_year_month The year and month of the first file to be downloaded format: 'YYYYMM' +#' @param end_year_month The year and month of the last file to be downloaded format: 'YYYYMM', or use Sys.Date #' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored #' @param verbose if TRUE, prints diagnostic messages during the download process #' @return Returns `downloaded_files`, a vector of the downloaded files in the current working directory @@ -501,25 +501,34 @@ parse_mbox <- function(perceval_path, mbox_path){ #' Parse mbox latest date #' -#' Returns the name of the latest mod_mbox file downloaded in the specified folder +#' @description This function returns the name of the latest mod_mbox file downloaded in the specified folder +#' based on the naming convention `kaiaulu_YYYYMM.mbox`. For example: `kaiaulu_202401.mbox`. #' -#' The folder assumes the following convention: "(mailing_list)_(archive_type)_yearmonth.mbox" -#' For example: "geronimo-dev_apache_202401.mbox". 
This nomenclature is defined by \code{\link{download_mod_mbox_per_month}} -#' -#' @param mbox path to mbox archive file (ends in .mbox) -#' @return Returns the name of the latest mod_mbox file +#' @param save_folder_path path to the folder containing the mbox files +#' @return `latest_mbox_file` the name of the latest mod_mbox file #' @export #' @family parsers -parse_mbox_latest_date <- function(mbox) { - file_list <- list.files(mbox) - date_list <- list() - for(i in file_list){ - i <- sub(".mbox", "", i) - i <- sub("[^_]*_[^_]*_", "", i) - date_list <- append(date_list, i) +parse_mbox_latest_date <- function(save_folder_path) { + # List all .mbox files in the folder with the expected naming pattern + file_list <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$") + + if (length(file_list) == 0) { + warning("No .mbox files found in the folder.") + return(invisible(NULL)) } - latest_date <- as.character(max(unlist(date_list))) - latest_mbox_file <- grep(latest_date, file_list, value = TRUE) + + # Extract the dates from the filenames + date_list <- sub("kaiaulu_(\\d{6})\\.mbox$", "\\1", file_list) + + # Convert dates to numeric for comparison + date_numeric <- as.numeric(date_list) + + # Find the latest date + latest_date <- max(date_numeric, na.rm = TRUE) + + # Find the file corresponding to the latest date + latest_mbox_file <- file_list[date_numeric == latest_date] + return(latest_mbox_file) } diff --git a/man/commit_message_id_coverage.Rd b/man/commit_message_id_coverage.Rd index 68fad761..e7f0c6ef 100644 --- a/man/commit_message_id_coverage.Rd +++ b/man/commit_message_id_coverage.Rd @@ -22,9 +22,9 @@ Calculates the number of commits from the git log which contains the message id. 
\code{\link{parse_gitlog}} to obtain additions and deletions from gitlog Other {metrics}: +\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, -\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/download_mod_mbox.Rd b/man/download_mod_mbox.Rd index c628be38..c02cf5d8 100644 --- a/man/download_mod_mbox.Rd +++ b/man/download_mod_mbox.Rd @@ -9,7 +9,7 @@ download_mod_mbox( start_year_month, end_year_month, save_folder_path, - verbose = FALSE + verbose = TRUE ) } \arguments{ diff --git a/man/download_pipermail.Rd b/man/download_pipermail.Rd index 0aa1bc50..a4e2fdd8 100644 --- a/man/download_pipermail.Rd +++ b/man/download_pipermail.Rd @@ -13,11 +13,11 @@ download_pipermail( ) } \arguments{ -\item{mailing_list}{The name of the mailing list being downloaded (e.g. "https://mta.openssl.org/pipermail/openssl-announce/")} +\item{mailing_list}{The name of the mailing list being downloaded e.g. 
"https://mta.openssl.org/pipermail/openssl-announce/"} -\item{start_year_month}{The year and month of the first file to be downloaded (format: 'YYYYMM')} +\item{start_year_month}{The year and month of the first file to be downloaded format: 'YYYYMM'} -\item{end_year_month}{The year and month of the last file to be downloaded (format: 'YYYYMM', or use 'format(Sys.Date(), "%Y%m")' for the current month)} +\item{end_year_month}{The year and month of the last file to be downloaded format: 'YYYYMM', or use Sys.Date} \item{save_folder_path}{The folder path in which all the downloaded pipermail files will be stored} diff --git a/man/parse_mbox.Rd b/man/parse_mbox.Rd index fd578695..d4852995 100644 --- a/man/parse_mbox.Rd +++ b/man/parse_mbox.Rd @@ -23,15 +23,15 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } diff --git a/man/parse_mbox_latest_date.Rd b/man/parse_mbox_latest_date.Rd index 149caec7..eedf9633 100644 --- a/man/parse_mbox_latest_date.Rd +++ b/man/parse_mbox_latest_date.Rd @@ -4,35 +4,32 @@ \alias{parse_mbox_latest_date} \title{Parse mbox latest date} \usage{ -parse_mbox_latest_date(mbox) +parse_mbox_latest_date(save_folder_path) } \arguments{ -\item{mbox}{path to mbox archive file (ends in .mbox)} +\item{save_folder_path}{path to the folder containing the mbox files} } \value{ -Returns the name of the latest mod_mbox file 
+`latest_mbox_file` the name of the latest mod_mbox file } \description{ -Returns the name of the latest mod_mbox file downloaded in the specified folder -} -\details{ -The folder assumes the following convention: "(mailing_list)_(archive_type)_yearmonth.mbox" -For example: "geronimo-dev_apache_202401.mbox". This nomenclature is defined by \code{\link{download_mod_mbox_per_month}} +This function returns the name of the latest mod_mbox file downloaded in the specified folder +based on the naming convention `kaiaulu_YYYYMM.mbox`. For example: `kaiaulu_202401.mbox`. } \seealso{ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index 3ea7a547..a6c17804 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -57,7 +57,7 @@ mailing_list: mailing_list: https://mta.openssl.org/pipermail/openssl-users/ start_year_month: 202310 end_year_month: 202405 - save_folder_path: "../../extdata/save_folder_mail" + save_folder_path: "../extdata/save_folder_mail" ``` @@ -240,3 +240,33 @@ This will store the parsed data into the parsed_mail variable. To view the table ```{r} View(parsed_mail) ``` + +## Retrieve the Latest Mbox File +We can use the parse_mbox_latest_date() function to identify the most recent .mbox file in the specified folder. 
This can be useful when you want to automate the parsing of the latest data without manually specifying the file name. + +First, make sure that the save_folder_path is correctly set to the directory where your .mbox files are stored. +```{r} +# Get the latest mbox file +latest_mbox_file <- parse_mbox_latest_date(save_folder_path = save_folder_path) +print(latest_mbox_file) +``` +This will output the name of the latest .mbox file based on the YYYYMM pattern in the filename. +We can use this to update mbox_path to point to the latest file, and call the parse_mbox() function to parse the latest data. +```{r} +# Update mbox_path to use the latest file +mbox_path <- file.path(save_folder_path, latest_mbox_file) +print(mbox_path) +``` +To parse this file: +```{r} +# Parse the latest mbox file +parsed_mail <- parse_mbox( + perceval_path = parse_perceval_path, + mbox_path = mbox_path +) +``` +Now, parsed_mail contains the parsed data from the latest .mbox file. +```{r} +# View the parsed data +View(parsed_mail) +```