From 7f83f51695253a8249e0e1257734a0e27d5d5e25 Mon Sep 17 00:00:00 2001 From: joelnitta Date: Thu, 12 Dec 2024 07:35:04 +0900 Subject: [PATCH] Update parallelization episode --- episodes/branch.Rmd | 34 +++++- episodes/files/plans/plan_10.R | 35 +++--- episodes/files/plans/plan_6c.R | 34 ++++++ episodes/files/plans/plan_7.R | 31 ++++-- episodes/files/plans/plan_8.R | 35 +++--- episodes/files/plans/plan_9.R | 35 +++--- episodes/files/tar_functions/model_augment.R | 2 +- .../files/tar_functions/model_augment_slow.R | 17 +++ .../files/tar_functions/model_glance_slow.R | 17 +++ episodes/parallel.Rmd | 67 +++++++----- renv/activate.R | 103 ++++++++++++++++-- 11 files changed, 301 insertions(+), 109 deletions(-) create mode 100644 episodes/files/plans/plan_6c.R create mode 100644 episodes/files/tar_functions/model_augment_slow.R create mode 100644 episodes/files/tar_functions/model_glance_slow.R diff --git a/episodes/branch.Rmd b/episodes/branch.Rmd index b055392c..2c646125 100644 --- a/episodes/branch.Rmd +++ b/episodes/branch.Rmd @@ -1,6 +1,6 @@ --- title: 'Branching' -teaching: 10 +teaching: 30 exercises: 2 --- @@ -152,7 +152,7 @@ Before moving on, let's define another **custom function** function: `model_glan You will need to write custom functions frequently when using `targets`, so it's good to get used to it! As the name `model_glance()` suggests (it is good to write functions with names that indicate their purpose), this will build a model then immediately run `glance()` on it. -The reason for doing so is that we get a **dataframe as a result**, which as previously mentioned is very helpful for branching, as we will see in the next section. +The reason for doing so is that we get a **dataframe as a result**, which is very helpful for branching, as we will see in the next section. 
Save this in `R/functions.R`: ```{r} @@ -310,7 +310,7 @@ Add the step to the workflow: ```{r} #| label = "example-model-augment-show", -#| code = readLines("files/plans/plan_8.R")[2:35], +#| code = readLines("files/plans/plan_7.R")[2:36], #| eval = FALSE ``` @@ -318,6 +318,34 @@ Add the step to the workflow: ::::::::::::::::::::::::::::::::::::: +### Further simplify the workflow + +You may have noticed that we can further simplify the workflow: there is no need to have separate `penguins_data` and `penguins_data_grouped` dataframes. +In general it is best to keep the number of named objects as small as possible to make it easier to reason about your code. +Let's combine the cleaning and grouping step into a single command: + +```{r} +#| label = "example-model-show-8", +#| eval = FALSE, +#| code = readLines("files/plans/plan_8.R")[2:34] +``` + +And run it once more: + +```{r} +#| label: example-model-hide-8 +#| echo: false +pushd(plan_6_dir) +# simulate already running the plan once +write_example_plan("plan_7.R") +tar_make(reporter = "silent") +# run version of plan that combines the cleaning and +# grouping steps into a single target +write_example_plan("plan_8.R") +tar_make() +popd() +``` + ::::::::::::::::::::::::::::::::::::: {.callout} ## Best practices for branching diff --git a/episodes/files/plans/plan_10.R b/episodes/files/plans/plan_10.R index be92fd01..59bcfed9 100644 --- a/episodes/files/plans/plan_10.R +++ b/episodes/files/plans/plan_10.R @@ -16,27 +16,26 @@ tar_plan( path_to_file("penguins_raw.csv"), read_csv(!!.x, show_col_types = FALSE) ), - # Clean data - penguins_data = clean_penguin_data(penguins_data_raw), - # Build models - models = list( - combined_model = lm( - bill_depth_mm ~ bill_length_mm, data = penguins_data), - species_model = lm( - bill_depth_mm ~ bill_length_mm + species, data = penguins_data), - interaction_model = lm( - bill_depth_mm ~ bill_length_mm * species, data = penguins_data) + # Clean and group data + tar_group_by( + 
penguins_data, + clean_penguin_data(penguins_data_raw), + species ), - # Get model summaries + # Get summary of combined model with all species together + combined_summary = model_glance(penguins_data), + # Get summary of one model per species tar_target( - model_summaries, - glance_with_mod_name_slow(models), - pattern = map(models) + species_summary, + model_glance_slow(penguins_data), + pattern = map(penguins_data) ), - # Get model predictions + # Get predictions of combined model with all species together + combined_predictions = model_augment_slow(penguins_data), + # Get predictions of one model per species tar_target( - model_predictions, - augment_with_mod_name_slow(models), - pattern = map(models) + species_predictions, + model_augment_slow(penguins_data), + pattern = map(penguins_data) ) ) diff --git a/episodes/files/plans/plan_6c.R b/episodes/files/plans/plan_6c.R new file mode 100644 index 00000000..8b72fa69 --- /dev/null +++ b/episodes/files/plans/plan_6c.R @@ -0,0 +1,34 @@ +options(tidyverse.quiet = TRUE) +source("R/functions.R") +source("R/packages.R") + +tar_plan( + # Load raw data + tar_file_read( + penguins_data_raw, + path_to_file("penguins_raw.csv"), + read_csv(!!.x, show_col_types = FALSE) + ), + # Clean and group data + tar_group_by( + penguins_data, + clean_penguin_data(penguins_data_raw), + species + ), + # Get summary of combined model with all species together + combined_summary = model_glance(penguins_data), + # Get summary of one model per species + tar_target( + species_summary, + model_glance(penguins_data), + pattern = map(penguins_data) + ), + # Get predictions of combined model with all species together + combined_predictions = model_augment(penguins_data), + # Get predictions of one model per species + tar_target( + species_predictions, + model_augment(penguins_data), + pattern = map(penguins_data) + ) +) diff --git a/episodes/files/plans/plan_7.R b/episodes/files/plans/plan_7.R index 346cca74..da5f7bc5 100 --- 
a/episodes/files/plans/plan_7.R +++ b/episodes/files/plans/plan_7.R @@ -11,19 +11,26 @@ tar_plan( ), # Clean data penguins_data = clean_penguin_data(penguins_data_raw), - # Build models - models = list( - combined_model = lm( - bill_depth_mm ~ bill_length_mm, data = penguins_data), - species_model = lm( - bill_depth_mm ~ bill_length_mm + species, data = penguins_data), - interaction_model = lm( - bill_depth_mm ~ bill_length_mm * species, data = penguins_data) + # Group data + tar_group_by( + penguins_data_grouped, + penguins_data, + species ), - # Get model summaries + # Get summary of combined model with all species together + combined_summary = model_glance(penguins_data), + # Get summary of one model per species tar_target( - model_summaries, - glance_with_mod_name(models), - pattern = map(models) + species_summary, + model_glance(penguins_data_grouped), + pattern = map(penguins_data_grouped) + ), + # Get predictions of combined model with all species together + combined_predictions = model_augment(penguins_data_grouped), + # Get predictions of one model per species + tar_target( + species_predictions, + model_augment(penguins_data_grouped), + pattern = map(penguins_data_grouped) ) ) diff --git a/episodes/files/plans/plan_8.R b/episodes/files/plans/plan_8.R index 8a6779ef..8b72fa69 100644 --- a/episodes/files/plans/plan_8.R +++ b/episodes/files/plans/plan_8.R @@ -9,27 +9,26 @@ tar_plan( path_to_file("penguins_raw.csv"), read_csv(!!.x, show_col_types = FALSE) ), - # Clean data - penguins_data = clean_penguin_data(penguins_data_raw), - # Build models - models = list( - combined_model = lm( - bill_depth_mm ~ bill_length_mm, data = penguins_data), - species_model = lm( - bill_depth_mm ~ bill_length_mm + species, data = penguins_data), - interaction_model = lm( - bill_depth_mm ~ bill_length_mm * species, data = penguins_data) + # Clean and group data + tar_group_by( + penguins_data, + clean_penguin_data(penguins_data_raw), + species ), - # Get model summaries + # Get 
summary of combined model with all species together + combined_summary = model_glance(penguins_data), + # Get summary of one model per species tar_target( - model_summaries, - glance_with_mod_name(models), - pattern = map(models) + species_summary, + model_glance(penguins_data), + pattern = map(penguins_data) ), - # Get model predictions + # Get predictions of combined model with all species together + combined_predictions = model_augment(penguins_data), + # Get predictions of one model per species tar_target( - model_predictions, - augment_with_mod_name(models), - pattern = map(models) + species_predictions, + model_augment(penguins_data), + pattern = map(penguins_data) ) ) diff --git a/episodes/files/plans/plan_9.R b/episodes/files/plans/plan_9.R index 164359b1..99958265 100644 --- a/episodes/files/plans/plan_9.R +++ b/episodes/files/plans/plan_9.R @@ -16,27 +16,26 @@ tar_plan( path_to_file("penguins_raw.csv"), read_csv(!!.x, show_col_types = FALSE) ), - # Clean data - penguins_data = clean_penguin_data(penguins_data_raw), - # Build models - models = list( - combined_model = lm( - bill_depth_mm ~ bill_length_mm, data = penguins_data), - species_model = lm( - bill_depth_mm ~ bill_length_mm + species, data = penguins_data), - interaction_model = lm( - bill_depth_mm ~ bill_length_mm * species, data = penguins_data) + # Clean and group data + tar_group_by( + penguins_data, + clean_penguin_data(penguins_data_raw), + species ), - # Get model summaries + # Get summary of combined model with all species together + combined_summary = model_glance(penguins_data), + # Get summary of one model per species tar_target( - model_summaries, - glance_with_mod_name(models), - pattern = map(models) + species_summary, + model_glance(penguins_data), + pattern = map(penguins_data) ), - # Get model predictions + # Get predictions of combined model with all species together + combined_predictions = model_augment(penguins_data), + # Get predictions of one model per species tar_target( - 
model_predictions, - augment_with_mod_name(models), - pattern = map(models) + species_predictions, + model_augment(penguins_data), + pattern = map(penguins_data) ) ) diff --git a/episodes/files/tar_functions/model_augment.R b/episodes/files/tar_functions/model_augment.R index 68d65591..68875d00 100644 --- a/episodes/files/tar_functions/model_augment.R +++ b/episodes/files/tar_functions/model_augment.R @@ -1,4 +1,4 @@ -model_glance <- function(penguins_data) { +model_augment <- function(penguins_data) { # Make model model <- lm( bill_depth_mm ~ bill_length_mm, diff --git a/episodes/files/tar_functions/model_augment_slow.R b/episodes/files/tar_functions/model_augment_slow.R new file mode 100644 index 00000000..8dd99fe6 --- /dev/null +++ b/episodes/files/tar_functions/model_augment_slow.R @@ -0,0 +1,17 @@ +model_augment_slow <- function(penguins_data) { + Sys.sleep(4) + # Make model + model <- lm( + bill_depth_mm ~ bill_length_mm, + data = penguins_data) + # Get species name + species_name <- unique(penguins_data$species) + # If this is the combined dataset with multiple + # species, change name to 'combined' + if (length(species_name) > 1) { + species_name <- "combined" + } + # Get model predictions and add species name + augment(model) |> + mutate(species = species_name, .before = 1) +} diff --git a/episodes/files/tar_functions/model_glance_slow.R b/episodes/files/tar_functions/model_glance_slow.R new file mode 100644 index 00000000..ba37fe66 --- /dev/null +++ b/episodes/files/tar_functions/model_glance_slow.R @@ -0,0 +1,17 @@ +model_glance_slow <- function(penguins_data) { + Sys.sleep(4) + # Make model + model <- lm( + bill_depth_mm ~ bill_length_mm, + data = penguins_data) + # Get species name + species_name <- unique(penguins_data$species) + # If this is the combined dataset with multiple + # species, change name to 'combined' + if (length(species_name) > 1) { + species_name <- "combined" + } + # Get model summary and add species name + glance(model) |> + 
mutate(species = species_name, .before = 1) +} diff --git a/episodes/parallel.Rmd b/episodes/parallel.Rmd index 1bdcb79b..4b458fc4 100644 --- a/episodes/parallel.Rmd +++ b/episodes/parallel.Rmd @@ -1,6 +1,6 @@ --- title: 'Parallel Processing' -teaching: 10 +teaching: 15 exercises: 2 --- @@ -30,6 +30,11 @@ Episode summary: Show how to use parallel processing library(targets) library(tarchetypes) library(broom) + +if (interactive()) { + setwd("episodes") +} + source("files/lesson_functions.R") # Increase width for printing tibbles @@ -76,7 +81,7 @@ It should now look like this: There is still one more thing we need to modify only for the purposes of this demo: if we ran the analysis in parallel now, you wouldn't notice any difference in compute time because the functions are so fast. -So let's make "slow" versions of `glance_with_mod_name()` and `augment_with_mod_name()` using the `Sys.sleep()` function, which just tells the computer to wait some number of seconds. +So let's make "slow" versions of `model_glance()` and `model_augment()` using the `Sys.sleep()` function, which just tells the computer to wait some number of seconds. This will simulate a long-running computation and enable us to see the difference between running sequentially and in parallel. Add these functions to `functions.R` (you can copy-paste the original ones, then modify them): @@ -85,8 +90,8 @@ Add these functions to `functions.R` (you can copy-paste the original ones, then #| label: slow-funcs #| eval: false #| file: -#| - files/tar_functions/glance_with_mod_name_slow.R -#| - files/tar_functions/augment_with_mod_name_slow.R +#| - files/tar_functions/model_glance_slow.R +#| - files/tar_functions/model_augment_slow.R ``` Then, change the plan to use the "slow" version of the functions: @@ -109,34 +114,36 @@ Finally, run the pipeline with `tar_make()` as normal. 
# with sandpaper::build_lesson(), even though it only uses 2 when run # interactively # -# plan_10_dir <- make_tempdir() -# pushd(plan_10_dir) -# write_example_plan("plan_9.R") -# tar_make(reporter = "silent") -# write_example_plan("plan_10.R") -# tar_make() -# popd() +plan_10_dir <- make_tempdir() +pushd(plan_10_dir) +write_example_plan("plan_9.R") +tar_make(reporter = "silent") +write_example_plan("plan_10.R") +tar_make() +popd() # Solution for now is to hard-code output -cat("✔ skip target penguins_data_raw_file -✔ skip target penguins_data_raw -✔ skip target penguins_data -✔ skip target models -• start branch model_predictions_5ad4cec5 -• start branch model_predictions_c73912d5 -• start branch model_predictions_91696941 -• start branch model_summaries_5ad4cec5 -• start branch model_summaries_c73912d5 -• start branch model_summaries_91696941 -• built branch model_predictions_5ad4cec5 [4.884 seconds] -• built branch model_predictions_c73912d5 [4.896 seconds] -• built branch model_predictions_91696941 [4.006 seconds] -• built pattern model_predictions -• built branch model_summaries_5ad4cec5 [4.011 seconds] -• built branch model_summaries_c73912d5 [4.011 seconds] -• built branch model_summaries_91696941 [4.011 seconds] -• built pattern model_summaries -• end pipeline [15.153 seconds]") +cat("✔ skipped target penguins_data_raw_file +✔ skipped target penguins_data_raw +✔ skipped target penguins_data +✔ skipped target combined_summary +▶ dispatched branch species_summary_1598bb4431372f32 +▶ dispatched branch species_summary_6b9109ba2e9d27fd +● completed branch species_summary_1598bb4431372f32 [4.815 seconds, 367 bytes] +▶ dispatched branch species_summary_625f9fbc7f62298a +● completed branch species_summary_6b9109ba2e9d27fd [4.813 seconds, 370 bytes] +▶ dispatched target combined_predictions +● completed branch species_summary_625f9fbc7f62298a [4.01 seconds, 367 bytes] +● completed pattern species_summary +▶ dispatched branch species_predictions_1598bb4431372f32 +● 
completed target combined_predictions [4.012 seconds, 370 bytes] +▶ dispatched branch species_predictions_6b9109ba2e9d27fd +● completed branch species_predictions_1598bb4431372f32 [4.014 seconds, 11.585 kilobytes] +▶ dispatched branch species_predictions_625f9fbc7f62298a +● completed branch species_predictions_6b9109ba2e9d27fd [4.01 seconds, 6.25 kilobytes] +● completed branch species_predictions_625f9fbc7f62298a [4.007 seconds, 9.628 kilobytes] +● completed pattern species_predictions +▶ ended pipeline [19.363 seconds]") ``` Notice that although the time required to build each individual target is about 4 seconds, the total time to run the entire workflow is less than the sum of the individual target times! That is proof that processes are running in parallel **and saving you time**. diff --git a/renv/activate.R b/renv/activate.R index d13f9932..8638f7fe 100644 --- a/renv/activate.R +++ b/renv/activate.R @@ -98,6 +98,66 @@ local({ unloadNamespace("renv") # load bootstrap tools + ansify <- function(text) { + if (renv_ansify_enabled()) + renv_ansify_enhanced(text) + else + renv_ansify_default(text) + } + + renv_ansify_enabled <- function() { + + override <- Sys.getenv("RENV_ANSIFY_ENABLED", unset = NA) + if (!is.na(override)) + return(as.logical(override)) + + pane <- Sys.getenv("RSTUDIO_CHILD_PROCESS_PANE", unset = NA) + if (identical(pane, "build")) + return(FALSE) + + testthat <- Sys.getenv("TESTTHAT", unset = "false") + if (tolower(testthat) %in% "true") + return(FALSE) + + iderun <- Sys.getenv("R_CLI_HAS_HYPERLINK_IDE_RUN", unset = "false") + if (tolower(iderun) %in% "false") + return(FALSE) + + TRUE + + } + + renv_ansify_default <- function(text) { + text + } + + renv_ansify_enhanced <- function(text) { + + # R help links + pattern <- "`\\?(renv::(?:[^`])+)`" + replacement <- "`\033]8;;ide:help:\\1\a?\\1\033]8;;\a`" + text <- gsub(pattern, replacement, text, perl = TRUE) + + # runnable code + pattern <- "`(renv::(?:[^`])+)`" + replacement <- 
"`\033]8;;ide:run:\\1\a\\1\033]8;;\a`" + text <- gsub(pattern, replacement, text, perl = TRUE) + + # return ansified text + text + + } + + renv_ansify_init <- function() { + + envir <- renv_envir_self() + if (renv_ansify_enabled()) + assign("ansify", renv_ansify_enhanced, envir = envir) + else + assign("ansify", renv_ansify_default, envir = envir) + + } + `%||%` <- function(x, y) { if (is.null(x)) y else x } @@ -142,7 +202,10 @@ local({ # compute common indent indent <- regexpr("[^[:space:]]", lines) common <- min(setdiff(indent, -1L)) - leave - paste(substring(lines, common), collapse = "\n") + text <- paste(substring(lines, common), collapse = "\n") + + # substitute in ANSI links for executable renv code + ansify(text) } @@ -305,8 +368,11 @@ local({ quiet = TRUE ) - if ("headers" %in% names(formals(utils::download.file))) - args$headers <- renv_bootstrap_download_custom_headers(url) + if ("headers" %in% names(formals(utils::download.file))) { + headers <- renv_bootstrap_download_custom_headers(url) + if (length(headers) && is.character(headers)) + args$headers <- headers + } do.call(utils::download.file, args) @@ -385,10 +451,21 @@ local({ for (type in types) { for (repos in renv_bootstrap_repos()) { + # build arguments for utils::available.packages() call + args <- list(type = type, repos = repos) + + # add custom headers if available -- note that + # utils::available.packages() will pass this to download.file() + if ("headers" %in% names(formals(utils::download.file))) { + headers <- renv_bootstrap_download_custom_headers(repos) + if (length(headers) && is.character(headers)) + args$headers <- headers + } + # retrieve package database db <- tryCatch( as.data.frame( - utils::available.packages(type = type, repos = repos), + do.call(utils::available.packages, args), stringsAsFactors = FALSE ), error = identity @@ -470,6 +547,14 @@ local({ } + renv_bootstrap_github_token <- function() { + for (envvar in c("GITHUB_TOKEN", "GITHUB_PAT", "GH_TOKEN")) { + envval <- 
Sys.getenv(envvar, unset = NA) + if (!is.na(envval)) + return(envval) + } + } + renv_bootstrap_download_github <- function(version) { enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE") @@ -477,16 +562,16 @@ local({ return(FALSE) # prepare download options - pat <- Sys.getenv("GITHUB_PAT") - if (nzchar(Sys.which("curl")) && nzchar(pat)) { + token <- renv_bootstrap_github_token() + if (nzchar(Sys.which("curl")) && nzchar(token)) { fmt <- "--location --fail --header \"Authorization: token %s\"" - extra <- sprintf(fmt, pat) + extra <- sprintf(fmt, token) saved <- options("download.file.method", "download.file.extra") options(download.file.method = "curl", download.file.extra = extra) on.exit(do.call(base::options, saved), add = TRUE) - } else if (nzchar(Sys.which("wget")) && nzchar(pat)) { + } else if (nzchar(Sys.which("wget")) && nzchar(token)) { fmt <- "--header=\"Authorization: token %s\"" - extra <- sprintf(fmt, pat) + extra <- sprintf(fmt, token) saved <- options("download.file.method", "download.file.extra") options(download.file.method = "wget", download.file.extra = extra) on.exit(do.call(base::options, saved), add = TRUE)