From 72238a759267f92bdf5c9d9fe0ead278c0130935 Mon Sep 17 00:00:00 2001 From: Ian Jaymes Iwata <97856957+ian-lastname@users.noreply.github.com> Date: Wed, 24 Apr 2024 00:35:31 -1000 Subject: [PATCH] Edited download_pipermail to save pipermail files as mbox files, created refresh_pipermail, updated news Found out that the pipermail downloader function already downloads the files by month and year, so all I really needed to do was change it so that it downloads the files as mbox files (change the extension from .txt to .mbox). Created the refresher for pipermail. I had no need to create a parse latest pipermail since they were mbox files anyway. --- NAMESPACE | 1 + NEWS.md | 3 + R/mail.R | 236 ++++++++++++++++++++++++++++++-- conf/openssl.yml | 14 +- man/download_pipermail.Rd | 14 +- man/refresh_pipermail.Rd | 35 +++++ vignettes/download_mod_mbox.Rmd | 28 ++++ 7 files changed, 306 insertions(+), 25 deletions(-) create mode 100644 man/refresh_pipermail.Rd diff --git a/NAMESPACE b/NAMESPACE index e4525716..59ac138a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -140,6 +140,7 @@ export(read_temporary_file) export(recolor_network_by_community) export(refresh_jira_issues) export(refresh_mbox) +export(refresh_pipermail) export(smell_missing_links) export(smell_organizational_silo) export(smell_radio_silence) diff --git a/NEWS.md b/NEWS.md index cf2de75b..57acc182 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,8 @@ __kaiaulu 0.0.0.9700 (in development)__ ### NEW FEATURES + * `refresh_mbox()` and `refresh_pipermail()` have been added. Both functions download the mbox files that are not already downloaded, up to the current year and month. [#284](https://github.com/sailuh/kaiaulu/issues/284) + * `parse_mbox_latest_date()` has been added. This function returns the file name of the downloaded mbox file containing the latest date for use by `download_mbox_per_month()` and `download_pipermail()` to implement a refresh capability. 
[#284](https://github.com/sailuh/kaiaulu/issues/284) * `refresh_jira_issues()` had been added. It is a wrapper function for the previous downloader and downloads only issues greater than the greatest key already downloaded. * `download_jira_issues()`, `download_jira_issues_by_issue_key()`, and `download_jira_issues_by_date()` has been added. This allows for downloading of Jira issues without the use of JirAgileR [#275](https://github.com/sailuh/kaiaulu/issues/275) and specification of issue Id and created ranges. It also interacts with `parse_jira_latest_date` to implement a refresh capability. * `make_jira_issue()` and `make_jira_issue_tracker()` no longer create fake issues following JirAgileR format, but instead the raw data obtained from JIRA API. This is compatible with the new parser function for JIRA. [#277](https://github.com/sailuh/kaiaulu/issues/277) @@ -28,6 +30,7 @@ __kaiaulu 0.0.0.9700 (in development)__ ### MINOR IMPROVEMENTS + * `download_pipermail()` now downloads all the txt and txt.gz files in the accessed pipermail archive as mbox files. [#284](https://github.com/sailuh/kaiaulu/issues/284) * The line metrics notebook now provides further guidance on adjusting the snapshot and filtering. * The R File and R Function parser can now properly parse R folders which contain folders within (not following R package structure). Both `.r` and `.R` files are also now captured (previously only one of the two were specified, but R accepts both). 
[#235](https://github.com/sailuh/kaiaulu/issues/235) * Refactor GoF Notebook in Graph GoF and Text GoF Notebooks [#224](https://github.com/sailuh/kaiaulu/issues/224) diff --git a/R/mail.R b/R/mail.R index daa76f21..7548234b 100644 --- a/R/mail.R +++ b/R/mail.R @@ -6,14 +6,17 @@ ############## Downloader ############## -#' Download all pipermail files in an archive -#' @param url An url pointing to a pipermail archive +#' Download all pipermail files in an archive as mbox files +#' @param archive_url An url pointing to a pipermail archive +#' @param mailing_list The name of the mailing list being downloaded +#' @param archive_type The name of the type of archive that the mailing list is stored in +#' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored #' @return Returns `destination`, a vector of the downloaded files in the current working directory #' @export -download_pipermail <- function(url) { +download_pipermail <- function(archive_url, mailing_list, archive_type, save_folder_path) { #Get page - pagedata <- httr::GET(url) + pagedata <- httr::GET(archive_url) #Parse html file into object tbls_xml <- XML::htmlParse(pagedata) @@ -26,32 +29,40 @@ download_pipermail <- function(url) { #Create Vector files <- vector() + file_names <- vector() #Compose download urls for both gunzipped and plain text files for (i in hrefs ){ if (endsWith(i, ".txt.gz")){ - i <- paste0(url, i) + f_month <- match(sub("[^_]*-","", sub(".txt.gz","",i)), month.name) + f_year <- sub("-[^_]*", "", i) + file_names <- c(file_names, sprintf("%s%02d.mbox", f_year, f_month)) + i <- stringi::stri_c(archive_url, i, sep = "/") files <- c(files, i) } else if (endsWith(i, ".txt")) { - i <- paste0(url, i) + f_month <- match(sub("[^_]*-","", sub(".txt","",i)), month.name) + f_year <- sub("-[^_]*", "", i) + file_names <- c(file_names, sprintf("%s%02d.mbox", f_year, f_month)) + i <- stringi::stri_c(archive_url, i, sep = "/") files <- c(files, i) } } - - 
destination <- vector() + amount <- length(files) # File downloading loop - for (i in files){ + for (i in 1:amount){ #split filename from url and create download destination out of it - splits <- stringi::stri_split_fixed(i, "/") - destination[[i]] <- paste0(splits[[1]][[length(splits[[1]])]]) + #splits <- stringi::stri_split_fixed(i, "/") + #destination[[i]] <- paste0(splits[[1]][[length(splits[[1]])]]) #download file and place it at the destination - httr::GET(i, httr::write_disk(destination[[i]], overwrite=TRUE)) + save_file_name <- stringi::stri_c(mailing_list, archive_type, file_names[[i]], sep = "_") + save_file_path <- stringi::stri_c(save_folder_path, save_file_name, sep = "/") + httr::GET(files[[i]], httr::write_disk(save_file_path, overwrite=TRUE)) } #Return filenames - return(destination) + return(save_folder_path) } @@ -326,6 +337,205 @@ refresh_mbox <- function(archive_url, mailing_list, archive_type, from_year, sav # End of if-else } +#' Refresh mbox files downloaded via pipermail +#' +#' Uses the adopted file name convention by \code{\link{download_pipermail}} to identify +#' the latest downloaded mbox year i and month j. It deletes the mbox file of year i and month j, +#' then redownloads it along with the remaining months past j up to 12. Then, for each +#' year from i+1 up to the current real-life year, it requests every monthly pipermail file +#' (both .txt and .txt.gz) so that all newer mbox files are downloaded. +#' +#' If the directory is empty, then it downloads all pipermail files (as mbox files) via \code{\link{download_pipermail}} +#' +#' @param archive_url A url pointing to the pipermail archive of the mailing list (e.g. "https://mta.openssl.org/pipermail/openssl-dev") without trailing slashes +#' @param mailing_list Name of the project mailing list (e.g. openssl-dev) in the pipermail archive +#' @param archive_type Name of the archive that the project mailing list is archived in (e.g. 
apache) +#' @param save_folder_path the full *folder* path where the monthly downloaded mbox will be stored. +#' @param verbose prints progress during execution +#' @export +refresh_pipermail <- function(archive_url, mailing_list, archive_type, save_folder_path,verbose=FALSE) { + # Get a list of mbox files currently downloaded in save path folder + existing_mbox_files <- list.files(save_folder_path) + + # Get the current year + current_date <- Sys.Date() + current_year <- as.numeric(substr(current_date, 1, 4)) + + # If there are no mbox files downloaded, then download mbox files as normal using download_pipermail + if (length(existing_mbox_files) == 0) { + if (verbose) { + message("The folder is empty. Downloading all pipermail files. \n") + } + download_pipermail(archive_url = archive_url, + mailing_list = mailing_list, + archive_type = archive_type, + save_folder_path = save_folder_path) + } else { + latest_file_name <- parse_mbox_latest_date(save_folder_path) + extracted_year_month <- sub("[^_]*_[^_]*_", "", sub(".mbox", "", latest_file_name)) + output <- path.expand(save_folder_path) + + latest_downloaded_year <- as.numeric(substr(extracted_year_month, 1, 4)) + latest_downloaded_month <- as.numeric(substr(extracted_year_month, 5, 6)) + this_file <- paste(save_folder_path, latest_file_name, sep = "/") + file.remove(this_file) + + # Download txt files starting from deleted file month to end of that year, save as mbox + download_txt_files_latest_downloaded_year <- function(archive_url, mailing_list, archive_type, latest_downloaded_year, latest_downloaded_month, save_folder_path) { + counter <- 0 + destination <- list() + mbox_correct_name_format <- list() + + for (month in (latest_downloaded_month:12)) { + counter <- counter + 1 + + #Generate file destinations for the monthly files in /tmp/ + destination[[counter]] <- sprintf("%d-%s.txt", latest_downloaded_year, month.name[month]) + mbox_correct_name_format[[counter]] <- sprintf("%d%02d.mbox", 
latest_downloaded_year, month) + mbox_file_name <- stringi::stri_c(mailing_list, archive_type, mbox_correct_name_format[[counter]], sep = "_") + + #Try file download and save result + full_month_url <- stringi::stri_c(archive_url, destination[[counter]], sep = "/") + full_tmp_save_path <- file.path(output,mbox_file_name) + x <- httr::GET(full_month_url, + httr::write_disk(full_tmp_save_path,overwrite=TRUE)) + + # Remove file if error + # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 + if (httr::http_error(x) && file.exists(full_tmp_save_path)) { + file.remove(full_tmp_save_path) + } + + } + } + + # Download txt.gz files starting from deleted file month to the end of that year, save as mbox + download_txt_gz_files_latest_downloaded_year <- function(archive_url, mailing_list, archive_type, latest_downloaded_year, latest_downloaded_month, save_folder_path) { + + counter <- 0 + destination <- list() + mbox_correct_name_format <- list() + + for (month in (latest_downloaded_month:12)) { + counter <- counter + 1 + + #Generate file destinations for the monthly files in /tmp/ + destination[[counter]] <- sprintf("%d-%s.txt.gz", latest_downloaded_year, month.name[month]) + mbox_correct_name_format[[counter]] <- sprintf("%d%02d.mbox", latest_downloaded_year, month) + mbox_file_name <- stringi::stri_c(mailing_list, archive_type, mbox_correct_name_format[[counter]], sep = "_") + + #Try file download and save result + full_month_url <- stringi::stri_c(archive_url, destination[[counter]], sep = "/") + full_tmp_save_path <- file.path(output,mbox_file_name) + x <- httr::GET(full_month_url, + httr::write_disk(full_tmp_save_path,overwrite=TRUE)) + + # Remove file if error + # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 + if (httr::http_error(x) && file.exists(full_tmp_save_path)) { + file.remove(full_tmp_save_path) + } + + } + } + + # Download txt files from the year after the latest downloaded year to the current real life 
year + download_txt_files_current_year <- function(archive_url, mailing_list, archive_type, latest_downloaded_year, current_year, save_folder_path) { + + counter <- 0 + destination <- list() + mbox_correct_name_format <- list() + + for (year in (latest_downloaded_year+1):current_year) { + for (month in (1:12)) { + counter <- counter + 1 + + #Generate file destinations for the monthly files in /tmp/ + destination[[counter]] <- sprintf("%d-%s.txt", year, month.name[month]) + mbox_correct_name_format[[counter]] <- sprintf("%d%02d.mbox", year, month) + mbox_file_name <- stringi::stri_c(mailing_list, archive_type, mbox_correct_name_format[[counter]], sep = "_") + + #Try file download and save result + full_month_url <- stringi::stri_c(archive_url, destination[[counter]], sep = "/") + full_tmp_save_path <- file.path(output,mbox_file_name) + x <- httr::GET(full_month_url, + httr::write_disk(full_tmp_save_path,overwrite=TRUE)) + + # Remove file if error + # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 + if (httr::http_error(x) && file.exists(full_tmp_save_path)) { + file.remove(full_tmp_save_path) + } + + } + } + + } + + # Download txt.gz files from the year after the latest downloaded year to the current real life year + download_txt_gz_files_current_year <- function(archive_url, mailing_list, archive_type, latest_downloaded_year, current_year, save_folder_path) { + + counter <- 0 + destination <- list() + mbox_correct_name_format <- list() + + for (year in (latest_downloaded_year+1):current_year) { + for (month in (1:12)) { + counter <- counter + 1 + + #Generate file destinations for the monthly files in /tmp/ + destination[[counter]] <- sprintf("%d-%s.txt.gz", year, month.name[month]) + mbox_correct_name_format[[counter]] <- sprintf("%d%02d.mbox", year, month) + mbox_file_name <- stringi::stri_c(mailing_list, archive_type, mbox_correct_name_format[[counter]], sep = "_") + + #Try file download and save result + full_month_url <- 
stringi::stri_c(archive_url, destination[[counter]], sep = "/") + full_tmp_save_path <- file.path(output,mbox_file_name) + x <- httr::GET(full_month_url, + httr::write_disk(full_tmp_save_path,overwrite=TRUE)) + + # Remove file if error + # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 + if (httr::http_error(x) && file.exists(full_tmp_save_path)) { + file.remove(full_tmp_save_path) + } + + } + } + + } + + download_txt_files_latest_downloaded_year(archive_url=archive_url, + mailing_list=mailing_list, + archive_type=archive_type, + latest_downloaded_year=latest_downloaded_year, + latest_downloaded_month=latest_downloaded_month, + save_folder_path=save_folder_path) + + download_txt_gz_files_latest_downloaded_year(archive_url=archive_url, + mailing_list=mailing_list, + archive_type=archive_type, + latest_downloaded_year=latest_downloaded_year, + latest_downloaded_month=latest_downloaded_month, + save_folder_path=save_folder_path) + + download_txt_files_current_year(archive_url=archive_url, + mailing_list=mailing_list, + archive_type=archive_type, + latest_downloaded_year=latest_downloaded_year, + current_year=current_year, + save_folder_path=save_folder_path) + + download_txt_gz_files_current_year(archive_url=archive_url, + mailing_list=mailing_list, + archive_type=archive_type, + latest_downloaded_year=latest_downloaded_year, + current_year=current_year, + save_folder_path=save_folder_path) + } + # End of if-else +} + ############## Parsers ############## #' Parse mbox from Perceval diff --git a/conf/openssl.yml b/conf/openssl.yml index aa7b2254..41ec5af2 100644 --- a/conf/openssl.yml +++ b/conf/openssl.yml @@ -45,14 +45,12 @@ version_control: - master mailing_list: - # Where is the mbox located locally? - #mbox: ../../rawdata/mbox/openssl_dev_mbox # 2004-2008 fields are complete - mbox: ../../rawdata/mbox/openssl-dev.mbx # 2002-2019 gmail field is redacted due to google groups - # What is the domain of the chosen mailing list archive? 
- #domain: http://mail-archives.apache.org/mod_mbox - # Which lists of the domain will be used? - #list_key: - # - apr-dev + pipermail: + mail_key_1: + archive_url: https://mta.openssl.org/pipermail/openssl-dev + pipermail: ../../rawdata/openssl/pipermail/openssl-dev/ + mailing_list: openssl-dev + archive_type: mta #issue_tracker: #jira: diff --git a/man/download_pipermail.Rd b/man/download_pipermail.Rd index 9f4db683..218527c6 100644 --- a/man/download_pipermail.Rd +++ b/man/download_pipermail.Rd @@ -2,16 +2,22 @@ % Please edit documentation in R/mail.R \name{download_pipermail} \alias{download_pipermail} -\title{Download all pipermail files in an archive} +\title{Download all pipermail files in an archive as mbox files} \usage{ -download_pipermail(url) +download_pipermail(archive_url, mailing_list, archive_type, save_folder_path) } \arguments{ -\item{url}{An url pointing to a pipermail archive} +\item{archive_url}{An url pointing to a pipermail archive} + +\item{mailing_list}{The name of the mailing list being downloaded} + +\item{archive_type}{The name of the type of archive that the mailing list is stored in} + +\item{save_folder_path}{The folder path in which all the downloaded pipermail files will be stored} } \value{ Returns `destination`, a vector of the downloaded files in the current working directory } \description{ -Download all pipermail files in an archive +Download all pipermail files in an archive as mbox files } diff --git a/man/refresh_pipermail.Rd b/man/refresh_pipermail.Rd new file mode 100644 index 00000000..427c66d2 --- /dev/null +++ b/man/refresh_pipermail.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mail.R +\name{refresh_pipermail} +\alias{refresh_pipermail} +\title{Refresh mbox files downloaded via pipermail} +\usage{ +refresh_pipermail( + archive_url, + mailing_list, + archive_type, + save_folder_path, + verbose = FALSE +) +} +\arguments{ +\item{archive_url}{A url pointing to the 
pipermail archive of the mailing list (e.g. "https://mta.openssl.org/pipermail/openssl-dev") without trailing slashes} + +\item{mailing_list}{Name of the project mailing list (e.g. openssl-dev) in the pipermail archive} + +\item{archive_type}{Name of the archive that the project mailing list is archived in (e.g. apache)} + +\item{save_folder_path}{the full *folder* path where the monthly downloaded mbox will be stored.} + +\item{verbose}{prints progress during execution} +} +\description{ +Uses the adopted file name convention by \code{\link{download_pipermail}} to identify +the latest downloaded mbox year i and month j. It deletes the mbox file of year i and month j, +then redownloads it along with the remaining months past j up to 12. Then, for each +year from i+1 up to the current real-life year, it requests every monthly pipermail file +(both .txt and .txt.gz) so that all newer mbox files are downloaded. +} +\details{ +If the directory is empty, then it downloads all pipermail files (as mbox files) via \code{\link{download_pipermail}} +} diff --git a/vignettes/download_mod_mbox.Rmd b/vignettes/download_mod_mbox.Rmd index 6733fe63..ee6ab6a1 100644 --- a/vignettes/download_mod_mbox.Rmd +++ b/vignettes/download_mod_mbox.Rmd @@ -40,6 +40,14 @@ mailing_list <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["mailing_l archive_type <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["archive_type"]] start_year <- 2017 end_year <- 2018 + +conf2 <- yaml::read_yaml("../conf/openssl.yml") +save_path_pipermail <- conf2[["mailing_list"]][["pipermail"]][["mail_key_1"]][["pipermail"]] +pipermail_url <- conf2[["mailing_list"]][["pipermail"]][["mail_key_1"]][["archive_url"]] +mailing_list2 <- conf2[["mailing_list"]][["pipermail"]][["mail_key_1"]][["mailing_list"]] +archive_type2 <- conf2[["mailing_list"]][["pipermail"]][["mail_key_1"]][["archive_type"]] + +perceval_path <- yaml::read_yaml("../tools.yml")[["perceval"]] ``` ```{r eval = FALSE} @@ -61,3 +69,23 @@ 
refresh_mbox(archive_url = mod_mbox_url, save_folder_path = save_path_mbox, verbose = TRUE) ``` + +```{r eval = FALSE} +download_pipermail(archive_url = pipermail_url, + mailing_list = mailing_list2, + archive_type = archive_type2, + save_folder_path = save_path_pipermail) +``` + +```{r eval = FALSE} +mbox_latest <- parse_mbox_latest_date(save_path_pipermail) +refresh_pipermail(archive_url = pipermail_url, + mailing_list=mailing_list2, + archive_type=archive_type2, + save_folder_path=save_path_pipermail, + verbose=TRUE) +``` + +```{r eval = FALSE} +parse_mbox(perceval_path, save_path_pipermail) +```