Skip to content

Commit

Permalink
The biggest leap forward
Browse files Browse the repository at this point in the history
  • Loading branch information
jonkeane committed Dec 6, 2022
1 parent 85c3e7d commit 155fba2
Show file tree
Hide file tree
Showing 74 changed files with 1,684 additions and 1,892 deletions.
5 changes: 2 additions & 3 deletions .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ jobs:
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
RSPM: ${{ matrix.config.rspm }}
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
DATALOGISTIK_BRANCH: "main"
DATALOGISTIK_NO_PERMISSIONS_CHANGE: True

steps:
- uses: actions/checkout@v3
Expand Down Expand Up @@ -78,9 +80,6 @@ jobs:
_R_CHECK_CRAN_INCOMING_: false
_R_CHECK_FORCE_SUGGESTS_: ${{ matrix.config.force_suggests }}
run: |
if ('${{ matrix.config.r }}' == 'release' && grepl('ubuntu', '${{ matrix.config.os }}')) {
Sys.setenv("ARROWBENCH_TEST_CUSTOM_DUCKDB" = TRUE)
}
options(crayon.enabled = TRUE)
rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check")
shell: Rscript {0}
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/test-coverage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ jobs:
runs-on: ubuntu-latest
env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
DATALOGISTIK_NO_PERMISSIONS_CHANGE: True
DATALOGISTIK_BRANCH: "main"

steps:
- uses: actions/checkout@v3

Expand Down
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ Imports:
dplyr,
duckdb (>= 0.6.0),
distro,
glue,
glue,
jsonlite,
processx,
progress,
purrr,
R6,
Expand Down Expand Up @@ -66,7 +67,7 @@ Collate:
'bm-write-csv.R'
'bm-write-file.R'
'custom-duckdb.R'
'ensure-format.R'
'datalogistik.R'
'ensure-lib.R'
'known-sources.R'
'ensure-source.R'
Expand Down
13 changes: 4 additions & 9 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -7,29 +7,26 @@ S3method(as.list,Serializable)
export("%||%")
export(BenchEnvironment)
export(Benchmark)
export(all_sources)
export(array_altrep_materialization)
export(array_to_vector)
export(confirm_mem_alloc)
export(dataset_taxi_2013)
export(dataset_taxi_parquet)
export(df_to_table)
export(ensure_dataset)
export(ensure_format)
export(ensure_source)
export(ensure_tpch)
export(file_with_ext)
export(generate_tpch)
export(get_csv_reader)
export(get_csv_writer)
export(get_dataset_attr)
export(get_input_func)
export(get_json_reader)
export(get_params_summary)
export(get_query_func)
export(get_read_function)
export(get_source_attr)
export(get_sql_query_func)
export(get_write_function)
export(install_datalogistik)
export(install_pipx)
export(known_compressions)
export(known_formats)
export(known_sources)
Expand All @@ -51,12 +48,12 @@ export(tpch_answer)
export(tpch_tables)
export(write_csv)
export(write_file)
importFrom(R.utils,gunzip)
importFrom(R.utils,gzip)
importFrom(distro,distro)
importFrom(glue,glue)
importFrom(jsonlite,fromJSON)
importFrom(jsonlite,toJSON)
importFrom(processx,run)
importFrom(progress,progress_bar)
importFrom(purrr,flatten)
importFrom(purrr,map)
Expand All @@ -66,7 +63,6 @@ importFrom(purrr,pmap)
importFrom(purrr,transpose)
importFrom(remotes,install_github)
importFrom(rlang,is_missing)
importFrom(rlang,set_names)
importFrom(sessioninfo,package_info)
importFrom(stats,setNames)
importFrom(utils,head)
Expand All @@ -79,4 +75,3 @@ importFrom(utils,write.csv)
importFrom(waldo,compare)
importFrom(withr,with_envvar)
importFrom(withr,with_makevars)
importFrom(withr,with_options)
5 changes: 2 additions & 3 deletions R/bm-array-altrep-materialization.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,18 @@
array_altrep_materialization <- Benchmark(
"array_altrep_materialization",

setup = function(source = names(known_sources),
setup = function(source = known_sources,
exclude_nulls = FALSE,
altrep = TRUE,
subset_indices = list(1:10)) {
stopifnot(
is.logical(exclude_nulls),
is.logical(altrep)
)
source <- match.arg(source, names(all_sources))
subset_indices <- subset_indices[[1]]

options(arrow.use_altrep = altrep)
path <- ensure_format(source, format = "parquet", compression = "snappy")
path <- ensure_source(source, format = "parquet", compression = "snappy")$path

# exclude non-altrep types
pq <- arrow::ParquetFileReader$create(path)
Expand Down
8 changes: 4 additions & 4 deletions R/bm-array-to-vector.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ array_to_vector <- Benchmark("array_to_vector",
is.logical(exclude_nulls),
is.logical(alt_rep)
)
source <- match.arg(source, names(all_sources))
source <- ensure_source(source)
result_dim <- get_source_attr(source, "dim")
table <- read_source(source, as_data_frame = FALSE)
source <- ensure_source(source, format = "parquet")
result_dim <- source$dim

table <- read_source(source$path, as_data_frame = FALSE)

if (exclude_nulls) {
cols_without_nulls <- unlist(lapply(colnames(table), function(x) table[[x]]$null_count == 0))
Expand Down
2 changes: 1 addition & 1 deletion R/bm-dataset-taxi-2013.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ dataset_taxi_2013 <- Benchmark(
query = names(dataset_taxi_2013$cases)) {
name <- match.arg(dataset, c("taxi_2013", "taxi_2013_sample"))
library("dplyr", warn.conflicts = FALSE)
dataset <- ensure_dataset(name)
dataset <- ensure_source(name)
query <- dataset_taxi_2013$cases[[match.arg(query)]]

BenchEnvironment(
Expand Down
2 changes: 1 addition & 1 deletion R/bm-dataset-taxi-parquet.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
dataset_taxi_parquet <- Benchmark("partitioned-dataset-filter",
setup = function(query = names(dataset_taxi_parquet$cases)) {
library("dplyr", warn.conflicts = FALSE)
dataset <- ensure_dataset("taxi_parquet")
dataset <- ensure_source("taxi_parquet")
query <- dataset_taxi_parquet$cases[[match.arg(query)]]

BenchEnvironment(
Expand Down
2 changes: 1 addition & 1 deletion R/bm-df-to-table.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ df_to_table <- Benchmark("dataframe-to-table",
)
) {
source <- ensure_source(source)
result_dim <- get_source_attr(source, "dim")
result_dim <- source$dim
# Make sure that we're not (accidentally) creating altrep vectors which will
# make the benchmark measure both arrow->R and then also R->arrow when we
# really want to just measure R->arrow.
Expand Down
10 changes: 5 additions & 5 deletions R/bm-read-csv.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,25 +10,25 @@
#' @importFrom R.utils gzip
read_csv <- Benchmark(
"read_csv",
setup = function(source = names(known_sources),
setup = function(source = known_sources,
reader = "arrow",
compression = c("uncompressed", "gzip"),
output_format = c("arrow_table", "data_frame")) {
reader <- match.arg(reader, c("arrow", "data.table", "vroom", "readr"))
compression <- match.arg(compression)
output_format <- match.arg(output_format)
# ensure the file exists
input_file <- ensure_format(source, "csv", compression)
source <- ensure_source(source, "csv", compression)

# Map string param name to function
delim <- get_source_attr(source, "delim") %||% ","
delim <- source$delim %||% ","
read_func <- get_csv_reader(reader, delim)
result_dim <- get_source_attr(source, "dim")
result_dim <- source$dim

BenchEnvironment(
# Map string param name to function
read_func = get_csv_reader(reader, delim),
input_file = input_file,
input_file = source$path,
result_dim = result_dim,
as_data_frame = output_format == "data_frame",
delim = delim
Expand Down
19 changes: 13 additions & 6 deletions R/bm-read-file.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@ read_file <- Benchmark("file-read",
output_type <- match.arg(output_type)

# ensure that we have the right kind of file available
input_file <- ensure_format(source, file_type, compression)
# retrieve the dimensions for run-checking after the benchmark
result_dim <- get_source_attr(source, "dim")
file_up <- ensure_source(source, file_type, compression)
input_file <- file_up$path
# retrieve the dimensions for run-checking after the benchmark
result_dim <- file_up$dim

# put the necessary variables into a BenchmarkEnvironment to be used when the
# benchmark is running.
Expand Down Expand Up @@ -58,8 +59,10 @@ read_file <- Benchmark("file-read",
packages_used = function(params) {
pkg_map <- c(
"feather" = "arrow",
"arrow" = "arrow",
"parquet" = "arrow",
"fst" = "fst"
"fst" = "fst",
"ndjson" = "arrow"
)
pkg_map[params$file_type]
}
Expand All @@ -74,15 +77,19 @@ read_file <- Benchmark("file-read",
get_read_function <- function(file_type) {
pkg_map <- c(
"feather" = "arrow",
"arrow" = "arrow",
"parquet" = "arrow",
"fst" = "fst"
"fst" = "fst",
"ndjson" = "arrow"
)
library(pkg_map[[file_type]], character.only = TRUE, warn.conflicts = FALSE)

if (file_type == "feather") {
if (file_type %in% c("feather", "arrow")) {
return(function(...) arrow::read_feather(...))
} else if (file_type == "parquet") {
return(function(...) arrow::read_parquet(...))
} else if (file_type == "ndjson") {
return(function(..., as_data_frame) arrow::read_json_arrow(...))
} else if (file_type == "fst") {
return(function(..., as_data_frame) fst::read_fst(...))
} else {
Expand Down
8 changes: 4 additions & 4 deletions R/bm-read-json.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#' @importFrom R.utils gzip
read_json <- Benchmark(
"read_json",
setup = function(source = names(known_sources),
setup = function(source = known_sources,
reader = c("arrow", "jsonlite", "ndjson", "RcppSimdJson"),
compression = c("uncompressed", "gzip"),
output_format = c("arrow_table", "data_frame"),
Expand All @@ -30,13 +30,13 @@ read_json <- Benchmark(
base = function(...) do.call(rbind.data.frame, ...)
)

input_file <- ensure_format(source, "json", compression)
source <- ensure_source(source, "ndjson", compression)

BenchEnvironment(
# Map string param name to function
read_func = get_json_reader(reader),
input_file = input_file,
result_dim = get_source_attr(source, "dim"),
input_file = source$path,
result_dim = source$dim,
as_data_frame = output_format == "data_frame",
rbinder = rbinder,
rbind_func = rbind_func
Expand Down
7 changes: 4 additions & 3 deletions R/bm-remote-dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
remote_dataset <- Benchmark("remote_dataset",
setup = function(source = c("taxi_file_list_parquet", "taxi_file_list_feather")) {
library("dplyr")
dataset <- ensure_dataset(source, download = FALSE)
result_dim <- get_dataset_attr(source, "dim")
# TODO: need to add back in `download = FALSE` when datalogistik supports it
dataset <- ensure_source(source)
result_dim <- dataset$dim

BenchEnvironment(
dataset = dataset,
dataset = open_dataset(dataset$path),
expected_dim = result_dim
)
},
Expand Down
8 changes: 5 additions & 3 deletions R/bm-row-group-size.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,17 @@ row_group_size <- Benchmark(
queries = c("filters", "everything"),
chunk_size = NULL) {
# ensure that we have the right kind of file available
input_file <- ensure_format(
input_source <- ensure_source(
name = source, format = "parquet", compression = "snappy", chunk_size = chunk_size
)
input_file <- input_source$path
dims <- input_source$dim

library("dplyr", warn.conflicts = FALSE)

# put the necessary variables into a BenchmarkEnvironment to be used when the
# benchmark is running.
BenchEnvironment(source = source, input_file = input_file, queries = queries)
BenchEnvironment(source = source, input_file = input_file, queries = queries, dims = dims)
},

# delete the results before each iteration
Expand Down Expand Up @@ -62,7 +64,7 @@ row_group_size <- Benchmark(

if ("everything" %in% queries) {
result[["everything"]] <- ds %>% collect()
result_dim[["everything"]] <- all_sources[[source]]$dim
result_dim[["everything"]] <- dims
}
},
# after each iteration, check the dimensions and delete the results
Expand Down
6 changes: 3 additions & 3 deletions R/bm-table-to-df.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
#'
#' @export
table_to_df <- Benchmark("table_to_df",
setup = function(source = names(known_sources)) {
setup = function(source = known_sources) {
source <- ensure_source(source)
result_dim <- get_source_attr(source, "dim")
table <- read_source(source, as_data_frame = FALSE)
result_dim <- source$dim
table <- read_source(source$path, as_data_frame = FALSE)

transfer_func <- function(table) as.data.frame(table)

Expand Down
Loading

0 comments on commit 155fba2

Please sign in to comment.