diff --git a/DESCRIPTION b/DESCRIPTION
index c0f0feb..6715468 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -43,7 +43,7 @@ Suggests:
     RcppSimdJson,
     readr,
     vroom
-RoxygenNote: 7.1.2
+RoxygenNote: 7.2.1
 Roxygen: list(markdown = TRUE, load = "source")
 Collate:
     'benchmark.R'
diff --git a/R/bm-read-file.R b/R/bm-read-file.R
index 39f58eb..f1aaf3f 100644
--- a/R/bm-read-file.R
+++ b/R/bm-read-file.R
@@ -8,7 +8,7 @@
 #'
 #' @export
 read_file <- Benchmark("read_file",
-  setup = function(source = names(known_sources),
+  setup = function(source = c("fanniemae_2016Q4", "nyctaxi_2010-01"),
                    # TODO: break out feather_v1 and feather_v2, feather_v2 only in >= 0.17
                    format = c("parquet", "feather"),
                    compression = c("uncompressed", "snappy", "lz4"),
diff --git a/R/bm-write-file.R b/R/bm-write-file.R
index b2d45bc..cb761a1 100644
--- a/R/bm-write-file.R
+++ b/R/bm-write-file.R
@@ -8,7 +8,7 @@
 #'
 #' @export
 write_file <- Benchmark("write_file",
-  setup = function(source = names(known_sources),
+  setup = function(source = c("fanniemae_2016Q4", "nyctaxi_2010-01"),
                    format = c("parquet", "feather"),
                    compression = c("uncompressed", "snappy", "lz4"),
                    input = c("arrow_table", "data_frame")) {
diff --git a/R/result.R b/R/result.R
index 9ea4c43..a4cdfc9 100644
--- a/R/result.R
+++ b/R/result.R
@@ -97,6 +97,9 @@ Serializable <- R6Point1Class(
 
   active = list(
     list = function() {
+      modifyList(self$list_serializable, private$not_to_serialize)
+    },
+    list_serializable = function() {
       lapply(private$to_serialize, function(element) {
         # recurse
         if (inherits(element, "Serializable")) {
@@ -119,12 +122,20 @@
 
   private = list(
     to_serialize = list(),
+    not_to_serialize = list(),
 
     get_or_set_serializable = function(variable, value) {
       if (!missing(value)) {
         private$to_serialize[[variable]] <- value
       }
       private$to_serialize[[variable]]
+    },
+
+    get_or_set_not_to_serialize = function(variable, value) {
+      if (!missing(value)) {
+        private$not_to_serialize[[variable]] <- value
+      }
+      private$not_to_serialize[[variable]]
     }
   ),
@@ -164,15 +175,16 @@ BenchmarkResult <- R6Point1Class(
 
   public = list(
     initialize = function(name,
-                          result,
-                          params,
+                          result = NULL,
+                          params = NULL,
                           tags = NULL,
                           info = NULL,
                           context = NULL,
                           github = NULL,
                           options = NULL,
                           output = NULL,
-                          rscript = NULL) {
+                          rscript = NULL,
+                          error = NULL) {
       self$name <- name
       self$result <- result
       self$params <- params
@@ -183,6 +195,7 @@ BenchmarkResult <- R6Point1Class(
       self$options <- options
       self$output <- output
       self$rscript <- rscript
+      self$error <- error
     },
 
     to_dataframe = function(row.names = NULL, optional = FALSE, packages = "arrow", ...) {
@@ -213,20 +226,41 @@ BenchmarkResult <- R6Point1Class(
       }
 
       out
+    },
+
+    to_publishable_json = function() {
+      res_list <- self$list_serializable
+
+      if (!is.null(res_list$result)) {
+        res_list[["stats"]] <- list(
+          data = list(res_list$result$real),
+          units = "s",
+          iterations = length(res_list$result$real),
+          times = list(),
+          times_unit = "s"
+        )
+        res_list$result <- NULL
+      }
+
+      res_list$tags$name <- res_list$name
+      res_list$name <- NULL
+
+      jsonlite::toJSON(res_list, auto_unbox = TRUE)
     }
   ),
 
   active = list(
     name = function(name) private$get_or_set_serializable(variable = "name", value = name),
     result = function(result) private$get_or_set_serializable(variable = "result", value = result),
-    params = function(params) private$get_or_set_serializable(variable = "params", value = params),
+    params = function(params) private$get_or_set_not_to_serialize(variable = "params", value = params),
     tags = function(tags) private$get_or_set_serializable(variable = "tags", value = tags),
     info = function(info) private$get_or_set_serializable(variable = "info", value = info),
     context = function(context) private$get_or_set_serializable(variable = "context", value = context),
     github = function(github) private$get_or_set_serializable(variable = "github", value = github),
-    options = function(options) private$get_or_set_serializable(variable = "options", value = options),
-    output = function(output) private$get_or_set_serializable(variable = "output", value = output),
-    rscript = function(rscript) private$get_or_set_serializable(variable = "rscript", value = rscript),
+    options = function(options) private$get_or_set_not_to_serialize(variable = "options", value = options),
+    output = function(output) private$get_or_set_not_to_serialize(variable = "output", value = output),
+    rscript = function(rscript) private$get_or_set_not_to_serialize(variable = "rscript", value = rscript),
+    error = function(error) private$get_or_set_serializable(variable = "error", value = error),
 
     params_summary = function() {
       d <- self$params
@@ -280,7 +314,7 @@ BenchmarkFailure <- R6Point1Class(
 # A class for holding a set of benchmark results
 #
 # This class is primarily a list of `BenchmarkResult` instances, one for each
-# combination of arguments for the benchmark's parameters. The list is acessible
+# combination of arguments for the benchmark's parameters. The list is accessible
 # via the `$results` active binding.
 #
 # An instance can be passed to `as.data.frame()` and `get_params_summary()`, the
@@ -299,7 +333,7 @@ BenchmarkResults <- R6Point1Class(
     },
 
     to_dataframe = function(row.names = NULL, optional = FALSE, ...) {
       x <- self$results
-      valid <- purrr::map_lgl(x, ~inherits(.x, "BenchmarkResult")) # failures will be BenchmarkFailure
+      valid <- purrr::map_lgl(x, ~!is.null(.x$result))
       dplyr::bind_rows(lapply(x[valid], function(res) res$to_dataframe(...)))
     }
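
(Reviewer note, not part of the patch: a minimal sketch of how the new serialization split behaves. It assumes `run_one()` accepts benchmark parameters the same way the CLI below passes them; the parameter values are illustrative.)

library(arrowbench)

# Illustrative parameters; any combination printed by `arrowbench list` works.
res <- run_one(read_file, source = "fanniemae_2016Q4",
               format = "parquet", compression = "uncompressed")

# $list_serializable holds only fields registered via get_or_set_serializable();
# $list overlays the local-only fields (params, options, output, rscript).
setdiff(names(res$list), names(res$list_serializable))

# to_publishable_json() reshapes $result into a `stats` block (seconds per
# iteration), moves `name` under `tags`, and returns a JSON string.
cat(res$to_publishable_json())
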
diff --git a/inst/arrowbench b/inst/arrowbench
new file mode 100755
index 0000000..df9f18a
--- /dev/null
+++ b/inst/arrowbench
@@ -0,0 +1,61 @@
+#!/usr/bin/env Rscript
+library(arrowbench)
+
+
+args <- commandArgs(trailingOnly = TRUE)
+
+benchmark_list <- list(
+  read_file,
+  write_file
+)
+names(benchmark_list) <- vapply(benchmark_list, function(x) x$name, character(1))
+
+benchmark_command_json <- benchmark_list |>
+  purrr::imap(~cbind(data.frame(bm = .y), arrowbench:::default_params(.x))) |>
+  lapply(function(x) split(x, seq(nrow(x)))) |>
+  lapply(unname) |>
+  purrr::flatten() |>
+  lapply(as.list) |>
+  jsonlite::toJSON(auto_unbox = TRUE)
+
+
+switch (args[[1]],
+  "help" = if (length(args) == 1) {
+    cat(
+      "List and run arrowbench benchmarks",
+      "",
+      "Commands:",
+      "  help [run|list]",
+      "  list",
+      "  run BENCHMARK [OPTIONS]",
+      sep = "\n"
+    )
+  } else if (length(args) >= 2 && args[[2]] == "list") {
+    cat(
+      "List available benchmarks in a JSON list.",
+      "",
+      "Usage:",
+      "  arrowbench list",
+      sep = "\n"
+    )
+  } else if (length(args) >= 2 && args[[2]] == "run") {
+    cat(
+      "Run a benchmark.",
+      "",
+      "Usage:",
+      "  arrowbench run BENCHMARK [OPTIONS]",
+      "",
+      "Example:",
+      "  arrowbench run read_file n_iter=2",
+      sep = "\n"
+    )
+  } else {
+    cat("Help topic not found", sep = "\n")
+  },
+  "list" = cat(benchmark_command_json),
+  "run" = {
+    arg_list <- jsonlite::fromJSON(args[[2]])
+    arg_list$bm <- parse(text = arg_list$bm)[[1]]
+    cat(suppressWarnings(do.call(run_one, arg_list)$to_publishable_json()))
+  }
+)
\ No newline at end of file
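
(Reviewer note, not part of the patch: the adapter below drives this script over a subprocess; the same list/run round trip can be sketched from R. It assumes the script is reachable as `arrowbench` on the PATH; the exact fields in each command come from `default_params()`.)

# One element of `arrowbench list` output, e.g. {"bm":"read_file","format":"parquet",...}
cmds <- jsonlite::fromJSON(
  paste(system2("arrowbench", "list", stdout = TRUE), collapse = ""),
  simplifyVector = FALSE
)

# `arrowbench run` takes that element back as a JSON string and prints a
# publishable result, which the adapter parses into a BenchmarkResult.
cmd_json <- jsonlite::toJSON(cmds[[1]], auto_unbox = TRUE)
out <- system2("arrowbench", c("run", shQuote(as.character(cmd_json))), stdout = TRUE)
result <- jsonlite::fromJSON(paste(out, collapse = ""))
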
+ """ + benchmarks = self.list_benchmarks() + # subset for demo purposes: + benchmarks = benchmarks[:10] + for benchmark in benchmarks: + command = f"{self.arrowbench} run '{json.dumps(benchmark)}'" + log.info(f"Running `{command}`") + res = subprocess.run( + command, + shell=True, + capture_output=True, + ) + dict_result = json.loads(res.stdout.decode()) + result = BenchmarkResult(**dict_result) + yield result + + +if __name__ == "__main__": + adapter = ArrowbenchAdapter( + arrowbench_executable=Path(__file__).resolve().parent / "arrowbench", + result_fields_override={"run_reason": "test"}, + ) + for result in adapter.run(): + print(result) diff --git a/man/Benchmark.Rd b/man/Benchmark.Rd index e738e82..f7ebefa 100644 --- a/man/Benchmark.Rd +++ b/man/Benchmark.Rd @@ -63,14 +63,16 @@ Define a Benchmark } \section{Evaluation}{ -A \code{Benchmark} is evaluated something like:\preformatted{env <- bm$setup(param1 = "value", param2 = "value") +A \code{Benchmark} is evaluated something like: + +\if{html}{\out{