From 08aefc395d5763e405c736c35231706f6f9634a2 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Mon, 22 Apr 2024 20:52:57 +0800 Subject: [PATCH 001/106] MINOR: [C++][CMake] Remove unused CMAKE_SKIP_INSTALL_ALL_DEPENDENCY (#41332) ### Rationale for this change CMAKE_SKIP_INSTALL_ALL_DEPENDENCY was removed in https://github.com/apache/arrow/pull/75 but it seems that there is still one line remaining. ### What changes are included in this PR? Remove unused CMAKE_SKIP_INSTALL_ALL_DEPENDENCY. ### Are these changes tested? Pass all CIs. ### Are there any user-facing changes? No. Authored-by: Gang Wu Signed-off-by: Gang Wu --- cpp/CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1499bb7767d4a..cc4df6d2fa2eb 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -256,9 +256,6 @@ if(ARROW_USE_CCACHE endif() if(ARROW_OPTIONAL_INSTALL) - # Don't make the "install" target depend on the "all" target - set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY true) - set(INSTALL_IS_OPTIONAL OPTIONAL) endif() From 79799e59b1a0c7bde8b85ea955593b2d63d2a46c Mon Sep 17 00:00:00 2001 From: Hyunseok Seo Date: Mon, 22 Apr 2024 22:07:03 +0900 Subject: [PATCH 002/106] GH-39664: [C++][Acero] Ensure Acero benchmarks present a metric for identifying throughput (#40884) ### Rationale for this change Acero benchmarks sometimes output metrics such as `items/s`, `bytes/s`, `batches/s`, and `rows/s`. However, there is inconsistency in how these metrics are presented across different benchmarks. We are undertaking an effort to standardize the output of these metrics to ensure uniformity and clarity in performance measurement across all Acero benchmarks. ### What changes are included in this PR? `rows/s` has a similar meaning to `items/s`. - `bytes/s` and `items/s`: aggregate - `bytes/s` and `rows/s`: asof_join - `batches/s` and `rows/s`: project, filter, expression ### Are these changes tested? Yes. ### Are there any user-facing changes? No. 
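The reporting convention this patch standardizes on can be sketched outside of google-benchmark as well. The following Go benchmark is an illustration only (it is not part of this patch; the kernel and batch size are made up, and `b.Elapsed` requires Go 1.20+): it attaches both a bytes/s and a rows/s throughput metric to a measurement, mirroring the `state.SetBytesProcessed`/`state.SetItemsProcessed` calls added in this patch.

```go
package acero_test

import "testing"

// Illustrative stand-in for a kernel benchmark: report explicit throughput
// rates so results are comparable across benchmarks.
func BenchmarkSumInt64(b *testing.B) {
	const rows = 1 << 20 // hypothetical batch size
	data := make([]int64, rows)
	for i := range data {
		data[i] = int64(i)
	}

	b.ResetTimer()
	var sink int64
	for i := 0; i < b.N; i++ {
		var sum int64
		for _, v := range data {
			sum += v
		}
		sink += sum
	}
	b.StopTimer()
	_ = sink

	// bytes/s: the testing package turns SetBytes into an automatic MB/s column.
	b.SetBytes(rows * 8)
	// rows/s: an explicit rate counter, analogous to the rows_per_second
	// counter used in asof_join_benchmark.cc.
	if secs := b.Elapsed().Seconds(); secs > 0 {
		b.ReportMetric(float64(int64(b.N)*rows)/secs, "rows/s")
	}
}
```

Reporting rates rather than raw iteration time is what makes results comparable when batch sizes differ across benchmarks, which is the point of the standardization described above.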
* GitHub Issue: #39664 Authored-by: Hyunseok Seo Signed-off-by: Weston Pace --- cpp/src/arrow/acero/aggregate_benchmark.cc | 19 +++++++++++++++++++ cpp/src/arrow/acero/asof_join_benchmark.cc | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/acero/aggregate_benchmark.cc b/cpp/src/arrow/acero/aggregate_benchmark.cc index 4db7e443227d9..854862e3e48ca 100644 --- a/cpp/src/arrow/acero/aggregate_benchmark.cc +++ b/cpp/src/arrow/acero/aggregate_benchmark.cc @@ -29,6 +29,7 @@ #include "arrow/util/benchmark_util.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_reader.h" +#include "arrow/util/byte_size.h" #include "arrow/util/string.h" namespace arrow { @@ -50,6 +51,7 @@ namespace acero { #include using arrow::internal::ToChars; +using arrow::util::TotalBufferSize; #ifdef ARROW_WITH_BENCHMARKS_REFERENCE @@ -371,9 +373,11 @@ static void BenchmarkGroupBy(benchmark::State& state, std::vector agg for (std::size_t arg_idx = 0; arg_idx < arguments.size(); arg_idx++) { aggregates[arg_idx].target = {FieldRef(static_cast(arg_idx))}; } + int64_t total_bytes = TotalBufferSize(*batch); for (auto _ : state) { ABORT_NOT_OK(BatchGroupBy(batch, aggregates, key_refs)); } + state.SetBytesProcessed(total_bytes * state.iterations()); } #define GROUP_BY_BENCHMARK(Name, Impl) \ @@ -578,6 +582,8 @@ static void SumKernel(benchmark::State& state) { for (auto _ : state) { ABORT_NOT_OK(Sum(array).status()); } + + state.SetItemsProcessed(state.iterations() * array_size); } static void SumKernelArgs(benchmark::internal::Benchmark* bench) { @@ -611,6 +617,8 @@ void ModeKernel(benchmark::State& state, int min, int max) { for (auto _ : state) { ABORT_NOT_OK(Mode(array).status()); } + + state.SetItemsProcessed(state.iterations() * array_size); } template @@ -625,13 +633,18 @@ void ModeKernelNarrow(benchmark::State& state) { template <> void ModeKernelNarrow(benchmark::State& state) { + using CType = typename TypeTraits::CType; + RegressionArgs args(state); + const int64_t array_size = args.size / sizeof(CType); auto rand = random::RandomArrayGenerator(1924); auto array = rand.Boolean(args.size * 8, 0.5, args.null_proportion); for (auto _ : state) { ABORT_NOT_OK(Mode(array).status()); } + + state.SetItemsProcessed(state.iterations() * array_size); } template @@ -668,6 +681,8 @@ static void MinMaxKernelBench(benchmark::State& state) { for (auto _ : state) { ABORT_NOT_OK(MinMax(array).status()); } + + state.SetItemsProcessed(state.iterations() * array_size); } static void MinMaxKernelBenchArgs(benchmark::internal::Benchmark* bench) { @@ -698,6 +713,8 @@ static void CountKernelBenchInt64(benchmark::State& state) { for (auto _ : state) { ABORT_NOT_OK(Count(array->Slice(1, array_size)).status()); } + + state.SetItemsProcessed(state.iterations() * array_size); } BENCHMARK(CountKernelBenchInt64)->Args({1 * 1024 * 1024, 2}); // 1M with 50% null. 
@@ -718,6 +735,8 @@ void VarianceKernelBench(benchmark::State& state) { for (auto _ : state) { ABORT_NOT_OK(Variance(array, options).status()); } + + state.SetItemsProcessed(state.iterations() * array_size); } static void VarianceKernelBenchArgs(benchmark::internal::Benchmark* bench) { diff --git a/cpp/src/arrow/acero/asof_join_benchmark.cc b/cpp/src/arrow/acero/asof_join_benchmark.cc index 02116b09fc1fd..ed2ac2258eb6c 100644 --- a/cpp/src/arrow/acero/asof_join_benchmark.cc +++ b/cpp/src/arrow/acero/asof_join_benchmark.cc @@ -91,7 +91,7 @@ static void TableJoinOverhead(benchmark::State& state, ASSERT_OK(DeclarationToStatus(std::move(join_node), /*use_threads=*/false)); } - state.counters["input_rows_per_second"] = benchmark::Counter( + state.counters["rows_per_second"] = benchmark::Counter( static_cast(state.iterations() * (left_table_stats.rows + right_hand_rows)), benchmark::Counter::kIsRate); From 5865e96db2fdbfd17d51768b8c7dbbe3d2aca71f Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Mon, 22 Apr 2024 16:20:06 -0400 Subject: [PATCH 003/106] GH-41323: [R] Redo how summarize() evaluates expressions (#41223) ### Rationale for this change Previously, the NSE expression handling in `summarize()` worked differently from filter/mutate/etc. Among the implications, it would not have been possible to define bindings for other aggregation functions that can be translated into some combination of supported aggregations, such as `weighted.mean()`. ### What changes are included in this PR? * Expressions in `summarize()` can now be evaluated with "regular" `arrow_eval()`. Aggregation bindings stick the contents of the aggregation data they previously returned into an `..aggregations` list that lives in an enclosing environment, and then return a FieldRef pointing to that. This makes the code in e.g. `summarize_eval()` a little harder to follow, since it's grabbing and pointing to objects out of its immediate scope, but I've tried to comment thoroughly and am happy to add more. * `arrow_eval()` inspects the expression it receives for any functions that are not in the NSE mask and not in some other package's namespace (i.e. hopefully just user functions) and inserts them into the NSE mask, setting the enclosing environment for that copy of the function to be the mask, so that if the function calls other functions that we do have bindings for, the bindings get called. This is the approach I suggested back in https://github.com/apache/arrow/issues/29667#issuecomment-1378049226, and it is what fixes #29667 and #40938. ### Are these changes tested? Existing tests, which are pretty comprehensive, pass. But it would be good to try to be more evil in manual testing with the user-defined R function support. ### Are there any user-facing changes? Yes. * GitHub Issue: #41323 --- r/NEWS.md | 3 + r/R/arrowExports.R | 4 + r/R/dplyr-across.R | 1 - r/R/dplyr-eval.R | 76 +++- r/R/dplyr-summarize.R | 345 ++++++++---------- r/R/expression.R | 3 + r/src/arrowExports.cpp | 9 + r/src/expression.cpp | 17 + r/tests/testthat/test-dplyr-across.R | 20 +- r/tests/testthat/test-dplyr-filter.R | 1 - .../testthat/test-dplyr-funcs-conditional.R | 15 + r/tests/testthat/test-dplyr-summarize.R | 137 +++++-- 12 files changed, 398 insertions(+), 233 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index a6e5f6e11fdf9..71ac0e8111537 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -19,6 +19,9 @@ # arrow 16.0.0.9000 +* R functions that users write that use functions that Arrow supports in dataset queries now can be used in queries too. 
Previously, only functions that used arithmetic operators worked. For example, `time_hours <- function(mins) mins / 60` worked, but `time_hours_rounded <- function(mins) round(mins / 60)` did not; now both work. These are not true user-defined functions (UDFs); for those, see `register_scalar_function()`. (#41223) +* `summarize()` supports more complex expressions, and correctly handles cases where column names are reused in expressions. + # arrow 16.0.0 # arrow 15.0.2 diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index f4ff3ef894532..967caba727898 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1132,6 +1132,10 @@ compute___expr__get_field_ref_name <- function(x) { .Call(`_arrow_compute___expr__get_field_ref_name`, x) } +compute___expr__field_names_in_expression <- function(x) { + .Call(`_arrow_compute___expr__field_names_in_expression`, x) +} + compute___expr__field_ref <- function(name) { .Call(`_arrow_compute___expr__field_ref`, name) } diff --git a/r/R/dplyr-across.R b/r/R/dplyr-across.R index da61353b22cdb..0d85764f7fb35 100644 --- a/r/R/dplyr-across.R +++ b/r/R/dplyr-across.R @@ -145,7 +145,6 @@ across_setup <- function(cols, fns, names, .caller_env, mask, inline = FALSE, ex fns <- call_args(fns) } - # ARROW-14071 if (all(map_lgl(fns, is_call, name = "function"))) { abort("Anonymous functions are not yet supported in Arrow") } diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R index 08a4b62aa8f8f..3aaa29696b8c8 100644 --- a/r/R/dplyr-eval.R +++ b/r/R/dplyr-eval.R @@ -15,10 +15,14 @@ # specific language governing permissions and limitations # under the License. +# filter(), mutate(), etc. work by evaluating the quoted `exprs` to generate Expressions arrow_eval <- function(expr, mask) { - # filter(), mutate(), etc. work by evaluating the quoted `exprs` to generate Expressions - # with references to Arrays (if .data is Table/RecordBatch) or Fields (if - # .data is a Dataset). + # Look for R functions referenced in expr that are not in the mask and add + # them. If they call other functions in the mask, this will let them find them + # and just work. (If they call things not supported in Arrow, it won't work, + # but it wouldn't have worked anyway!) + # Note this is *not* true UDFs. + add_user_functions_to_mask(expr, mask) # This yields an Expression as long as the `exprs` are implemented in Arrow. # Otherwise, it returns a try-error @@ -48,6 +52,47 @@ arrow_eval <- function(expr, mask) { }) } +add_user_functions_to_mask <- function(expr, mask) { + # Look for the user's R functions referenced in expr that are not in the mask, + # see if we can add them to the mask and set their parent env to the mask + # so that they can reference other functions in the mask. + if (is_quosure(expr)) { + # case_when calls arrow_eval() on regular formulas not quosures, which don't + # have their own environment. But, we've already walked those expressions + # when calling arrow_eval() on the case_when expression itself, so we don't + # need to worry about adding them again. 
+ function_env <- parent.env(parent.env(mask)) + quo_expr <- quo_get_expr(expr) + funs_in_expr <- all_funs(quo_expr) + quo_env <- quo_get_env(expr) + # Enumerate the things we have bindings for, and add anything else that we + # explicitly want to block from trying to add to the function environment + known_funcs <- c(ls(function_env, all.names = TRUE), "~", "[", ":") + unknown <- setdiff(funs_in_expr, known_funcs) + for (func_name in unknown) { + if (exists(func_name, quo_env)) { + user_fun <- get(func_name, quo_env) + if (!is.null(environment(user_fun)) && !rlang::is_namespace(environment(user_fun))) { + # Primitives don't have an environment, and we can't trust that + # functions from packages will work in arrow. (If they could be + # expressed in arrow, they would be in the mask already.) + if (getOption("arrow.debug", FALSE)) { + print(paste("Adding", func_name, "to the function environment")) + } + function_env[[func_name]] <- user_fun + # Also set the enclosing environment to be the function environment. + # This allows the function to reference other functions in the env. + # This may have other undesired side effects(?) + environment(function_env[[func_name]]) <- function_env + } + } + } + } + # Don't need to return anything because we assigned into environments, + # which pass by reference + invisible() +} + handle_arrow_not_supported <- function(err, lab) { # Look for informative message from the Arrow function version (see above) if (inherits(err, "arrow-try-error")) { @@ -79,24 +124,27 @@ arrow_not_supported <- function(msg) { arrow_mask <- function(.data, aggregation = FALSE) { f_env <- new_environment(.cache$functions) - # Add functions that need to error hard and clear. - # Some R functions will still try to evaluate on an Expression - # and return NA with a warning - fail <- function(...) stop("Not implemented") - for (f in c("mean", "sd")) { - f_env[[f]] <- fail - } - if (aggregation) { - # This should probably be done with an environment inside an environment - # but a first attempt at that had scoping problems (ARROW-13499) + # Add the aggregation functions to the environment, and set the enclosing + # environment to the parent frame so that, when called from summarize_eval(), + # they can reference and assign into `..aggregations` defined there. + pf <- parent.frame() for (f in names(agg_funcs)) { f_env[[f]] <- agg_funcs[[f]] + environment(f_env[[f]]) <- pf + } + } else { + # Add functions that need to error hard and clear. + # Some R functions will still try to evaluate on an Expression + # and return NA with a warning :exploding_head: + fail <- function(...) 
stop("Not implemented") + for (f in c("mean", "sd")) { + f_env[[f]] <- fail } } - schema <- .data$.data$schema # Assign the schema to the expressions + schema <- .data$.data$schema walk(.data$selected_columns, ~ (.$schema <- schema)) # Add the column references and make the mask diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index e2c70f9e89e42..1b625e34ad606 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -36,49 +36,49 @@ ensure_one_arg <- function(args, fun) { register_bindings_aggregate <- function() { register_binding_agg("base::sum", function(..., na.rm = FALSE) { - list( + set_agg( fun = "sum", data = ensure_one_arg(list2(...), "sum"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) register_binding_agg("base::prod", function(..., na.rm = FALSE) { - list( + set_agg( fun = "product", data = ensure_one_arg(list2(...), "prod"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) register_binding_agg("base::any", function(..., na.rm = FALSE) { - list( + set_agg( fun = "any", data = ensure_one_arg(list2(...), "any"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) register_binding_agg("base::all", function(..., na.rm = FALSE) { - list( + set_agg( fun = "all", data = ensure_one_arg(list2(...), "all"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) register_binding_agg("base::mean", function(x, na.rm = FALSE) { - list( + set_agg( fun = "mean", data = list(x), options = list(skip_nulls = na.rm, min_count = 0L) ) }) register_binding_agg("stats::sd", function(x, na.rm = FALSE, ddof = 1) { - list( + set_agg( fun = "stddev", data = list(x), options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) ) }) register_binding_agg("stats::var", function(x, na.rm = FALSE, ddof = 1) { - list( + set_agg( fun = "variance", data = list(x), options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) @@ -98,7 +98,7 @@ register_bindings_aggregate <- function() { .frequency_id = "arrow.quantile.approximate", class = "arrow.quantile.approximate" ) - list( + set_agg( fun = "tdigest", data = list(x), options = list(skip_nulls = na.rm, q = probs) @@ -120,7 +120,7 @@ register_bindings_aggregate <- function() { .frequency_id = "arrow.median.approximate", class = "arrow.median.approximate" ) - list( + set_agg( fun = "approximate_median", data = list(x), options = list(skip_nulls = na.rm) @@ -129,28 +129,28 @@ register_bindings_aggregate <- function() { notes = "approximate median (t-digest) is computed" ) register_binding_agg("dplyr::n_distinct", function(..., na.rm = FALSE) { - list( + set_agg( fun = "count_distinct", data = ensure_one_arg(list2(...), "n_distinct"), options = list(na.rm = na.rm) ) }) register_binding_agg("dplyr::n", function() { - list( + set_agg( fun = "count_all", data = list(), options = list() ) }) register_binding_agg("base::min", function(..., na.rm = FALSE) { - list( + set_agg( fun = "min", data = ensure_one_arg(list2(...), "min"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) register_binding_agg("base::max", function(..., na.rm = FALSE) { - list( + set_agg( fun = "max", data = ensure_one_arg(list2(...), "max"), options = list(skip_nulls = na.rm, min_count = 0L) @@ -158,6 +158,42 @@ register_bindings_aggregate <- function() { }) } +set_agg <- function(...) { + agg_data <- list2(...) 
+ # Find the environment where ..aggregations is stored + target <- find_aggregations_env() + aggs <- get("..aggregations", target) + lapply(agg_data[["data"]], function(expr) { + # If any of the fields referenced in the expression are in ..aggregations, + # then we can't aggregate over them. + # This is mainly for combinations of dataset columns and aggregations, + # like sum(x - mean(x)), i.e. window functions. + # This will reject (sum(sum(x)) as well, but that's not a useful operation. + if (any(expr$field_names_in_expression() %in% names(aggs))) { + # TODO: support in ARROW-13926 + arrow_not_supported("aggregate within aggregate expression") + } + }) + + # Record the (fun, data, options) in ..aggregations + # and return a FieldRef pointing to it + tmpname <- paste0("..temp", length(aggs)) + aggs[[tmpname]] <- agg_data + assign("..aggregations", aggs, envir = target) + Expression$field_ref(tmpname) +} + +find_aggregations_env <- function() { + # Find the environment where ..aggregations is stored, + # it's in parent.env of something in the call stack + for (f in sys.frames()) { + if (exists("..aggregations", envir = f)) { + return(f) + } + } + stop("Could not find ..aggregations") +} + # we register 2 versions of the "::" binding - one for use with agg_funcs # (registered below) and another one for use with nse_funcs # (registered in dplyr-funcs.R) @@ -216,53 +252,85 @@ summarise.Dataset <- summarise.ArrowTabular <- summarise.RecordBatchReader <- su # This is the Arrow summarize implementation do_arrow_summarize <- function(.data, ..., .groups = NULL) { exprs <- ensure_named_exprs(quos(...)) - - # Create a stateful environment for recording our evaluated expressions - # It's more complex than other places because a single summarize() expr - # may result in multiple query nodes (Aggregate, Project), - # and we have to walk through the expressions to disentangle them. - ctx <- env( - mask = arrow_mask(.data, aggregation = TRUE), - aggregations = empty_named_list(), - post_mutate = empty_named_list() + # Do any pre-processing to the expressions we need + exprs <- map( + exprs, + adjust_summarize_expression, + hash = length(.data$group_by_vars) > 0 ) + + # nolint start + # summarize() is complicated because you can do a mixture of scalar operations + # and aggregations, but that's not how Acero works. For example, for us to do + # summarize(mean = sum(x) / n()) + # we basically have to translate it into + # summarize(..temp0 = sum(x), ..temp1 = n()) %>% + # mutate(mean = ..temp0 / ..temp1) %>% + # select(-starts_with("..temp")) + # That is, "first aggregate, then transform the result further." + # + # When we do filter() and mutate(), we just turn the user's code into a single + # Arrow Expression per column. But when we do summarize(), we have to pull out + # the aggregations, collect them in one list (that will become an Aggregate + # ExecNode), and in the expressions, replace them with FieldRefs so that + # further operations can happen (in what will become a ProjectNode that works + # on the result of the Aggregate). + # To do this, we create a list in this function scope, and in arrow_mask(), + # and we make sure this environment here is the parent env of the binding + # functions, so that when they receive an expression, they can pull out + # aggregations and insert them into the list, which they can find because it + # is in the parent env. 
+ # nolint end + + ..aggregations <- empty_named_list() + + # We'll collect any transformations after the aggregation here + ..post_mutate <- empty_named_list() + mask <- arrow_mask(.data, aggregation = TRUE) + for (i in seq_along(exprs)) { # Iterate over the indices and not the names because names may be repeated # (which overwrites the previous name) - summarize_eval( - names(exprs)[i], - exprs[[i]], - ctx, - length(.data$group_by_vars) > 0 - ) + name <- names(exprs)[i] + ..post_mutate[[name]] <- summarize_eval(name, exprs[[i]], mask) } # Apply the results to the .data object. # First, the aggregations - .data$aggregations <- ctx$aggregations + .data$aggregations <- ..aggregations # Then collapse the query so that the resulting query object can have # additional operations applied to it out <- collapse.arrow_dplyr_query(.data) - # The expressions may have been translated into - # "first, aggregate, then transform the result further" - # nolint start - # For example, - # summarize(mean = sum(x) / n()) - # is effectively implemented as - # summarize(..temp0 = sum(x), ..temp1 = n()) %>% - # mutate(mean = ..temp0 / ..temp1) %>% - # select(-starts_with("..temp")) - # If this is the case, there will be expressions in post_mutate - # nolint end - if (length(ctx$post_mutate)) { - # Append post_mutate, and make sure order is correct - # according to input exprs (also dropping ..temp columns) - out$selected_columns <- c( - out$selected_columns, - ctx$post_mutate - )[c(.data$group_by_vars, names(exprs))] + + # Now, add the projections in ..post_mutate (if any) + for (post in names(..post_mutate)) { + # One last check: it's possible that an expression like y - mean(y) would + # successfully evaluate, but it's not supported. It gets transformed to: + # nolint start + # summarize(..temp0 = mean(y)) %>% + # mutate(y - ..temp0) + # nolint end + # but y is not in the schema of the data after summarize(). To catch this + # in the expression evaluation step, we'd have to remove all data + # variables from the mask, which would be a bit tortured (even for me). + # So we'll check here. + # We can tell the expression is invalid if it references fields not in + # the schema of the data after summarize(). Evaluating its type will + # throw an error if it's invalid. + tryCatch(..post_mutate[[post]]$type(out$.data$schema), error = function(e) { + msg <- paste( + "Expression", as_label(exprs[[post]]), + "is not a valid aggregation expression or is" + ) + arrow_not_supported(msg) + }) + # If it's valid, add it to the .data object + out$selected_columns[[post]] <- ..post_mutate[[post]] } + # Make sure column order is correct (and also drop ..temp columns) + col_order <- c(.data$group_by_vars, unique(names(exprs))) + out$selected_columns <- out$selected_columns[col_order] + + # If the object has .drop = FALSE and any group vars are dictionaries, # we can't (currently) preserve the empty rows that dplyr does, # so give a warning about that.
@@ -390,157 +458,60 @@ format_aggregation <- function(x) { paste0(x$fun, "(", paste(map(x$data, ~ .$ToString()), collapse = ","), ")") } -# This function handles each summarize expression and turns it into the -# appropriate combination of (1) aggregations (possibly temporary) and -# (2) post-aggregation transformations (mutate) -# The function returns nothing: it assigns into the `ctx` environment -summarize_eval <- function(name, quosure, ctx, hash) { - expr <- quo_get_expr(quosure) - ctx$quo_env <- quo_get_env(quosure) - - funs_in_expr <- all_funs(expr) - - if (length(funs_in_expr) == 0) { - # This branch only gets called at the top level, where expr is something - # that is not a function call (could be a quosure, a symbol, or atomic - # value). This needs to evaluate to a scalar or something that can be - # converted to one. - value <- arrow_eval_or_stop(quosure, ctx$mask) - - if (!inherits(value, "Expression")) { - value <- Expression$scalar(value) - } - - # We can't support a bare field reference because this is not - # an aggregate expression - if (!identical(value$field_name, "")) { - abort( - paste( - "Expression", format_expr(quosure), - "is not an aggregate expression or is not supported in Arrow" - ) - ) - } +# This function evaluates an expression and returns the post-summarize +# projection that results, or NULL if there is none because the top-level +# expression was an aggregation. Any aggregations are pulled out and collected +# in the ..aggregations list outside this function. +summarize_eval <- function(name, quosure, mask) { + # Add previous aggregations to the mask, so they can be referenced + for (n in names(get("..aggregations", parent.frame()))) { + mask[[n]] <- mask$.data[[n]] <- Expression$field_ref(n) + } + # Evaluate: + value <- arrow_eval_or_stop(quosure, mask) + # Handle the result. There are a few different cases. + if (!inherits(value, "Expression")) { + # Must have just been a scalar? (If it's not a scalar, this will error) # Scalars need to be added to post_mutate because they don't need # to be sent to the query engine as an aggregation - ctx$post_mutate[[name]] <- value - return() + value <- Expression$scalar(value) } + # Handle case where outer expr is ..temp field ref. This came from an + # aggregation at the top level. So the resulting name should be `name`. + # not `..tempN`. Rename the corresponding aggregation. + post_aggs <- get("..aggregations", parent.frame()) + result_field_name <- value$field_name + if (result_field_name %in% names(post_aggs)) { + # Do this by assigning over `name` in case something else was in `name` + post_aggs[[name]] <- post_aggs[[result_field_name]] + post_aggs[[result_field_name]] <- NULL + # Assign back into the parent environment + assign("..aggregations", post_aggs, parent.frame()) + # Return NULL because there is no post-mutate projection, it's just + # the aggregation + return(NULL) + } else { + # This is an expression that is not a ..temp fieldref, so it is some + # function of aggregations. Return it so it can be added to post_mutate. + return(value) + } +} + +adjust_summarize_expression <- function(quosure, hash) { # For the quantile() binding in the hash aggregation case, we need to mutate # the list output from the Arrow hash_tdigest kernel to flatten it into a # column of type float64. We do that by modifying the unevaluated expression # to replace quantile(...) 
with arrow_list_element(quantile(...), 0L) - if (hash && any(c("quantile", "stats::quantile") %in% funs_in_expr)) { + expr <- quo_get_expr(quosure) + if (hash && any(c("quantile", "stats::quantile") %in% all_funs(expr))) { expr <- wrap_hash_quantile(expr) - funs_in_expr <- all_funs(expr) - } - - # Start inspecting the expr to see what aggregations it involves - agg_funs <- names(agg_funcs) - outer_agg <- funs_in_expr[1] %in% agg_funs - inner_agg <- funs_in_expr[-1] %in% agg_funs - - # First, pull out any aggregations wrapped in other function calls - if (any(inner_agg)) { - expr <- extract_aggregations(expr, ctx) - } - - # By this point, there are no more aggregation functions in expr - # except for possibly the outer function call: - # they've all been pulled out to ctx$aggregations, and in their place in expr - # there are variable names, which would correspond to field refs in the - # query object after aggregation and collapse() or non-field variable - # references. So if we want to know if there are any aggregations inside expr, - # we have to look for them by their new var names in ctx$aggregations. - inner_agg_exprs <- all_vars(expr) %in% names(ctx$aggregations) - inner_is_fieldref <- all_vars(expr) %in% names(ctx$mask$.data) - - if (outer_agg) { - # This is something like agg(fun(x, y) - # It just works by normal arrow_eval, unless there's a mix of aggs and - # columns in the original data like agg(fun(x, agg(x))) - # (but that will have been caught in extract_aggregations()) - ctx$aggregations[[name]] <- arrow_eval_or_stop( - as_quosure(expr, ctx$quo_env), - ctx$mask - ) - return() - } else if (all(inner_agg_exprs | !inner_is_fieldref)) { - # Something like: fun(agg(x), agg(y)) - # So based on the aggregations that have been extracted, mutate after - agg_field_refs <- make_field_refs(names(ctx$aggregations)) - agg_field_types <- aggregate_types(ctx$aggregations, hash) - - mutate_mask <- arrow_mask( - list( - selected_columns = agg_field_refs, - .data = list( - schema = schema(agg_field_types) - ) - ) - ) - - value <- arrow_eval_or_stop( - as_quosure(expr, ctx$quo_env), - mutate_mask - ) - - if (!inherits(value, "Expression")) { - value <- Expression$scalar(value) - } - - ctx$post_mutate[[name]] <- value - return() - } - - # Backstop for any other odd cases, like fun(x, y) (i.e. no aggregation), - # or aggregation functions that aren't supported in Arrow (not in agg_funcs) - abort( - paste( - "Expression", format_expr(quosure), - "is not an aggregate expression or is not supported in Arrow" - ) - ) -} - -# This function recurses through expr, pulls out any aggregation expressions, -# and inserts a variable name (field ref) in place of the aggregation -extract_aggregations <- function(expr, ctx) { - # Keep the input in case we need to raise an error message with it - original_expr <- expr - funs <- all_funs(expr) - if (length(funs) == 0) { - return(expr) - } else if (length(funs) > 1) { - # Recurse more - expr[-1] <- lapply(expr[-1], extract_aggregations, ctx) - } - if (funs[1] %in% names(agg_funcs)) { - inner_agg_exprs <- all_vars(expr) %in% names(ctx$aggregations) - if (any(inner_agg_exprs)) { - # We can't aggregate over a combination of dataset columns and other - # aggregations (e.g. 
sum(x - mean(x))) - # TODO: support in ARROW-13926 - abort( - paste( - "Aggregate within aggregate expression", - format_expr(original_expr), - "not supported in Arrow" - ) - ) - } - - # We have an aggregation expression with no other aggregations inside it, - # so arrow_eval the expression on the data and give it a ..temp name prefix, - # then insert that name (symbol) back into the expression so that we can - # mutate() on the result of the aggregation and reference this field. - tmpname <- paste0("..temp", length(ctx$aggregations)) - ctx$aggregations[[tmpname]] <- arrow_eval_or_stop(as_quosure(expr, ctx$quo_env), ctx$mask) - expr <- as.symbol(tmpname) + quo_env <- quo_get_env(quosure) + quosure <- as_quosure(expr, quo_env) } - expr + # We could add any other adjustments here, but currently quantile is the only one + quosure } # This function recurses through expr and wraps each call to quantile() with a diff --git a/r/R/expression.R b/r/R/expression.R index 8f84b4b31ec33..0e823e3f697ac 100644 --- a/r/R/expression.R +++ b/r/R/expression.R @@ -60,6 +60,9 @@ Expression <- R6Class("Expression", is_field_ref = function() { compute___expr__is_field_ref(self) }, + field_names_in_expression = function() { + compute___expr__field_names_in_expression(self) + }, cast = function(to_type, safe = TRUE, ...) { opts <- cast_options(safe, ...) opts$to_type <- as_type(to_type) diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 75e0f27b4002e..5031c79f200b3 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -2980,6 +2980,14 @@ BEGIN_CPP11 END_CPP11 } // expression.cpp +std::vector compute___expr__field_names_in_expression(const std::shared_ptr& x); +extern "C" SEXP _arrow_compute___expr__field_names_in_expression(SEXP x_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type x(x_sexp); + return cpp11::as_sexp(compute___expr__field_names_in_expression(x)); +END_CPP11 +} +// expression.cpp std::shared_ptr compute___expr__field_ref(std::string name); extern "C" SEXP _arrow_compute___expr__field_ref(SEXP name_sexp){ BEGIN_CPP11 @@ -5951,6 +5959,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_compute___expr__call", (DL_FUNC) &_arrow_compute___expr__call, 3}, { "_arrow_compute___expr__is_field_ref", (DL_FUNC) &_arrow_compute___expr__is_field_ref, 1}, { "_arrow_compute___expr__get_field_ref_name", (DL_FUNC) &_arrow_compute___expr__get_field_ref_name, 1}, + { "_arrow_compute___expr__field_names_in_expression", (DL_FUNC) &_arrow_compute___expr__field_names_in_expression, 1}, { "_arrow_compute___expr__field_ref", (DL_FUNC) &_arrow_compute___expr__field_ref, 1}, { "_arrow_compute___expr__nested_field_ref", (DL_FUNC) &_arrow_compute___expr__nested_field_ref, 2}, { "_arrow_compute___expr__scalar", (DL_FUNC) &_arrow_compute___expr__scalar, 1}, diff --git a/r/src/expression.cpp b/r/src/expression.cpp index e9d08e13affe0..cd9737f71fbc0 100644 --- a/r/src/expression.cpp +++ b/r/src/expression.cpp @@ -64,6 +64,23 @@ std::string compute___expr__get_field_ref_name( return ""; } +// [[arrow::export]] +std::vector compute___expr__field_names_in_expression( + const std::shared_ptr& x) { + std::vector names; + for (const auto& ref : compute::FieldsInExpression(*x)) { + if (ref.IsNested()) { + // Slight hack: this isn't the field's "name", but it's good enough + // for my current purposes. A nested field ref doesn't have a name property. 
+ // Alternatively, we could skip nested refs like in get_field_ref_name + names.push_back(ref.ToString()); + } else { + names.push_back(*ref.name()); + } + } + return names; +} + // [[arrow::export]] std::shared_ptr compute___expr__field_ref(std::string name) { return std::make_shared(compute::field_ref(std::move(name))); diff --git a/r/tests/testthat/test-dplyr-across.R b/r/tests/testthat/test-dplyr-across.R index eebb8a23eadf7..32476bab06fce 100644 --- a/r/tests/testthat/test-dplyr-across.R +++ b/r/tests/testthat/test-dplyr-across.R @@ -18,7 +18,6 @@ library(dplyr, warn.conflicts = FALSE) test_that("expand_across correctly expands quosures", { - # single unnamed function expect_across_equal( quos(across(c(dbl, dbl2), round)), @@ -236,7 +235,6 @@ test_that("expand_across correctly expands quosures", { }) test_that("purrr-style lambda functions are supported", { - # using `.x` inside lambda functions expect_across_equal( quos(across(c(dbl, dbl2), ~ round(.x, digits = 0))), @@ -279,7 +277,17 @@ test_that("purrr-style lambda functions are supported", { ) }) -test_that("ARROW-14071 - function(x)-style lambda functions are not supported", { +test_that("ARROW-14071 - R functions from a user's environment", { + makeWhole <- function(x) round(x, digits = 0) + compare_dplyr_binding( + .input %>% + mutate(across(c(int, dbl), makeWhole)) %>% + collect(), + example_data + ) +}) + +test_that("function(x)-style lambda functions are not supported", { expect_error( expand_across(as_adq(example_data), quos(across(.cols = c(dbl, dbl2), list(function(x) { head(x, 1) @@ -301,17 +309,15 @@ test_that("ARROW-14071 - function(x)-style lambda functions are not supported", }) test_that("if_all() and if_any() are supported", { - expect_across_equal( - quos(if_any(everything(), ~is.na(.x))), + quos(if_any(everything(), ~ is.na(.x))), quos(is.na(int) | is.na(dbl) | is.na(dbl2) | is.na(lgl) | is.na(false) | is.na(chr) | is.na(fct)), example_data ) expect_across_equal( - quos(if_all(everything(), ~is.na(.x))), + quos(if_all(everything(), ~ is.na(.x))), quos(is.na(int) & is.na(dbl) & is.na(dbl2) & is.na(lgl) & is.na(false) & is.na(chr) & is.na(fct)), example_data ) - }) diff --git a/r/tests/testthat/test-dplyr-filter.R b/r/tests/testthat/test-dplyr-filter.R index 724b93c96609f..bf23685362a82 100644 --- a/r/tests/testthat/test-dplyr-filter.R +++ b/r/tests/testthat/test-dplyr-filter.R @@ -277,7 +277,6 @@ test_that("filter environment scope", { tbl ) isShortString <- function(x) nchar(x) < 10 - skip("TODO: ARROW-14071") compare_dplyr_binding( .input %>% select(-fct) %>% diff --git a/r/tests/testthat/test-dplyr-funcs-conditional.R b/r/tests/testthat/test-dplyr-funcs-conditional.R index 52edc25a67f5a..3ea1853fec455 100644 --- a/r/tests/testthat/test-dplyr-funcs-conditional.R +++ b/r/tests/testthat/test-dplyr-funcs-conditional.R @@ -192,6 +192,21 @@ test_that("case_when()", { collect(), tbl ) + + # Test finding R functions from the user's environment + isIn <- function(x, y) x %in% y + withr::with_options(list(arrow.debug = TRUE), { + expect_output( + compare_dplyr_binding( + .input %>% + transmute(cw = case_when(isIn(chr, letters[1:3]) ~ 1L) + 41L) %>% + collect(), + tbl + ), + "Adding isIn to the function environment" + ) + }) + compare_dplyr_binding( .input %>% filter(case_when( diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index b2b2a9e54695d..87bb5e5fac959 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -792,7 +792,6 
@@ test_that("Expressions on aggregations", { any = any(lgl), all = all(lgl) ) %>% - ungroup() %>% # TODO: loosen the restriction on mutate after group_by mutate(some = any & !all) %>% select(some_grouping, some) %>% collect(), @@ -836,8 +835,8 @@ test_that("Expressions on aggregations", { expect_warning( record_batch(tbl) %>% summarise(any(any(lgl))), paste( - "Aggregate within aggregate expression", - "any\\(any\\(lgl\\)\\) not supported in Arrow" + "In any\\(any\\(lgl\\)\\), aggregate within aggregate expression", + "not supported in Arrow" ) ) @@ -845,19 +844,117 @@ test_that("Expressions on aggregations", { expect_warning( record_batch(tbl) %>% summarise(any(any(!lgl))), paste( - "Aggregate within aggregate expression", - "any\\(any\\(!lgl\\)\\) not supported in Arrow" + "In any\\(any\\(!lgl\\)\\), aggregate within aggregate expression", + "not supported in Arrow" ) ) expect_warning( record_batch(tbl) %>% summarise(!any(any(lgl))), paste( - "Aggregate within aggregate expression", - "any\\(any\\(lgl\\)\\) not supported in Arrow" + "In \\!any\\(any\\(lgl\\)\\), aggregate within aggregate expression", + "not supported in Arrow" ) ) }) +test_that("Re-using/overwriting column names", { + compare_dplyr_binding( + .input %>% + summarize( + # These are both aggregations + y = sum(int, na.rm = TRUE), + y = sum(dbl, na.rm = TRUE) + ) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + summarize( + # This is just aggregation + y = sum(int, na.rm = TRUE), + # This is aggregations and a projection after + y = mean(int, na.rm = TRUE) * n() + ) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + summarize( + # Same thing, but in the other order + y = mean(int, na.rm = TRUE) * n(), + y = sum(dbl, na.rm = TRUE), + ) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + summarize( + int = sum(int, na.rm = TRUE), + # This needs to pick up *that* int, not the column in the data + y = int / n() + ) %>% + collect(), + tbl + ) + + compare_dplyr_binding( + .input %>% + summarize( + # No one should do this! But it's valid right? 
+ int = sum(int, na.rm = TRUE), + int = int / n() + ) %>% + collect(), + tbl + ) +}) + +test_that("Weighted mean", { + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize( + weighted_mean = sum(int * dbl) / sum(dbl) + ) %>% + collect(), + tbl + ) + + division <- function(x, y) x / y + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize( + weighted_mean = division(sum(int * dbl), sum(dbl)) + ) %>% + collect(), + tbl + ) + + # We can also define functions that call supported aggregation functions + # and it just works + wtd_mean <- function(x, w) sum(x * w) / sum(w) + withr::local_options(list(arrow.debug = TRUE)) + expect_output( + compare_dplyr_binding( + .input %>% + group_by(some_grouping) %>% + summarize( + weighted_mean = wtd_mean(int, dbl) + ) %>% + collect(), + tbl + ), + "Adding wtd_mean to the function environment" + ) +}) + test_that("Summarize with 0 arguments", { compare_dplyr_binding( .input %>% @@ -868,8 +965,7 @@ test_that("Summarize with 0 arguments", { ) }) -test_that("Not (yet) supported: implicit join", { - withr::local_options(list(arrow.debug = TRUE)) +test_that("Not (yet) supported: window functions", { compare_dplyr_binding( .input %>% group_by(some_grouping) %>% @@ -879,8 +975,8 @@ test_that("Not (yet) supported: implicit join", { collect(), tbl, warning = paste( - "Aggregate within aggregate expression sum\\(\\(dbl - mean\\(dbl\\)\\)\\^2\\)", - "not supported in Arrow; pulling data into R" + "In sum\\(\\(dbl - mean\\(dbl\\)\\)\\^2\\), aggregate within", + "aggregate expression not supported in Arrow; pulling data into R" ) ) compare_dplyr_binding( @@ -892,7 +988,7 @@ test_that("Not (yet) supported: implicit join", { collect(), tbl, warning = paste( - "Aggregate within aggregate expression sum\\(dbl - mean\\(dbl\\)\\)", + "In sum\\(dbl - mean\\(dbl\\)\\), aggregate within aggregate expression", "not supported in Arrow; pulling data into R" ) ) @@ -905,8 +1001,8 @@ test_that("Not (yet) supported: implicit join", { collect(), tbl, warning = paste( - "Aggregate within aggregate expression sum\\(\\(dbl - mean\\(dbl\\)\\)\\^2\\)", - "not supported in Arrow; pulling data into R" + "In sqrt\\(sum\\(\\(dbl - mean\\(dbl\\)\\)\\^2\\)/\\(n\\(\\) - 1L\\)\\), aggregate within", + "aggregate expression not supported in Arrow; pulling data into R" ) ) @@ -917,7 +1013,7 @@ test_that("Not (yet) supported: implicit join", { collect(), data.frame(x = 1, y = 2), warning = paste( - "Expression y - mean\\(y\\) is not an aggregate expression", + "Expression y - mean\\(y\\) is not a valid aggregation expression", "or is not supported in Arrow; pulling data into R" ) ) @@ -929,7 +1025,7 @@ test_that("Not (yet) supported: implicit join", { collect(), data.frame(x = 1, y = 2), warning = paste( - "Expression y is not an aggregate expression", + "Expression y is not a valid aggregation expression", "or is not supported in Arrow; pulling data into R" ) ) @@ -942,7 +1038,7 @@ test_that("Not (yet) supported: implicit join", { collect(), data.frame(x = 1, y = 2, z = 3), warning = paste( - "Expression x - y is not an aggregate expression", + "Expression x - y is not a valid aggregation expression", "or is not supported in Arrow; pulling data into R" ) ) @@ -1083,11 +1179,6 @@ test_that("summarise() can handle scalars and literal values", { tibble(y = 1L) ) - expect_identical( - record_batch(tbl) %>% summarise(y = Expression$scalar(1L)) %>% collect(), - tibble(y = 1L) - ) - expect_identical( record_batch(tbl) %>% summarise(y = Scalar$create(1L)) %>% 
collect(), tibble(y = 1L) @@ -1189,7 +1280,7 @@ test_that("Can use across() within summarise()", { group_by(x) %>% summarise(across(everything())) %>% collect(), - regexp = "Expression y is not an aggregate expression or is not supported in Arrow; pulling data into R" + regexp = "Expression y is not a valid aggregation expression or is not supported in Arrow; pulling data into R" ) }) From 7e2245cf060acbf63395a1417ce6b278b3f2d659 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 22 Apr 2024 15:41:32 -0700 Subject: [PATCH 004/106] MINOR: [C#] Bump ZstdSharp.Port and System.Runtime.CompilerServices.Unsafe in /csharp (#41218) Bumps [ZstdSharp.Port](https://github.com/oleg-st/ZstdSharp) and [System.Runtime.CompilerServices.Unsafe](https://github.com/dotnet/runtime). These dependencies needed to be updated together. Updates `ZstdSharp.Port` from 0.7.3 to 0.7.6
Release notes

Sourced from ZstdSharp.Port's releases.

0.7.6

  • Improved decompression speed in .NET 8 (~5%)
  • Return available data from stream, before waiting for more in DecompressionStream
  • Write all data in CompressionStream.Flush

0.7.5

  • Added multi-threading support

0.7.4

  • Added .NET 8 target, added fix for IL2CPP

Commits
  • 331e03a 0.7.6
  • 402fb3a while (true)
  • 679a7b0 .NET 8: InlineArray for fixed buffers
  • d1e2f53 Improved decompression performance for .NET 8+
  • 5957fa8 Updated packages
  • d7e31bb Prefetch inlined
  • f41b5e7 Merge pull request #33 from LinusKardellInfobric/read-return-available-data
  • 76c21e3 Use correct decompression in-buffer size
  • 192b93a Return available data from stream, before waiting for more
  • 17a9ebc Write all data in CompressionStream.Flush
  • Additional commits viewable in compare view

Updates `System.Runtime.CompilerServices.Unsafe` from 4.7.1 to 6.0.0
Release notes

Sourced from System.Runtime.CompilerServices.Unsafe's releases: release entries for .NET 6.0 (Preview 1 through the final release) and for the .NET 5.0.11–5.0.17 servicing releases. .NET 5 is now out of support. We recommend using .NET 6.

... (truncated)

Commits

Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow.Compression/Apache.Arrow.Compression.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj index 6988567193db4..c9758504265e8 100644 --- a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj +++ b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj @@ -13,7 +13,7 @@ - + From 67003f427e021f6a87462c1abbce311af384ffc5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 22 Apr 2024 19:18:34 -0700 Subject: [PATCH 005/106] MINOR: [C#] Bump K4os.Compression.LZ4.Streams and System.Runtime.CompilerServices.Unsafe in /csharp (#41216) Bumps [K4os.Compression.LZ4.Streams](https://github.com/MiloszKrajewski/K4os.Compression.LZ4) and [System.Runtime.CompilerServices.Unsafe](https://github.com/dotnet/runtime). These dependencies needed to be updated together. Updates `K4os.Compression.LZ4.Streams` from 1.3.6 to 1.3.8
Release notes

Sourced from K4os.Compression.LZ4.Streams's releases and changelog.

v1.3.8 (2023/03/17)

  • FIXED #96: fixed issue returning wrong compressed length when compressing into Span

v1.3.7-beta (2023/10/03)

  • ADDED #61: Partial decompression support (see #61, #90) (thanks: Sewer56)
  • CHANGED: build process from FAKE (sorry!) to Nuke
Commits

Updates `System.Runtime.CompilerServices.Unsafe` from 4.7.1 to 6.0.0
Release notes

Sourced from System.Runtime.CompilerServices.Unsafe's releases: release entries for .NET 6.0 (Preview 1 through the final release) and for the .NET 5.0.11–5.0.17 servicing releases. .NET 5 is now out of support. We recommend using .NET 6.

... (truncated)

Commits

Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow.Compression/Apache.Arrow.Compression.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj index c9758504265e8..0ce8c89bb1d1b 100644 --- a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj +++ b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj @@ -12,7 +12,7 @@ - + From 80e1d7a85b64a42381d2e90a5e11bbd99ddbed93 Mon Sep 17 00:00:00 2001 From: Hyunseok Seo Date: Tue, 23 Apr 2024 13:50:45 +0900 Subject: [PATCH 006/106] GH-41314: [CI][Python] Add a job on ARM64 macOS (#41313) ### Rationale for this change We need the GitHub-hosted ARM64 macOS to do more specific testing for the Python test. ### What changes are included in this PR? Add a job on ARM64 macOS (version: 14) for python ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #41314 Authored-by: Hyunseok Seo Signed-off-by: Sutou Kouhei --- .github/workflows/python.yml | 14 +++++++++++--- ci/scripts/python_build.sh | 2 ++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 2db9b17e895b0..15056961f8cf4 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -128,12 +128,20 @@ jobs: run: archery docker push ${{ matrix.image }} macos: - name: AMD64 macOS 12 Python 3 - runs-on: macos-latest + name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} Python 3 + runs-on: macos-${{ matrix.macos-version }} if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + include: + - architecture: AMD64 + macos-version: "12" + - architecture: ARM64 + macos-version: "14" env: - ARROW_HOME: /usr/local + ARROW_HOME: /tmp/local ARROW_AZURE: ON ARROW_DATASET: ON ARROW_FLIGHT: ON diff --git a/ci/scripts/python_build.sh b/ci/scripts/python_build.sh index 7a24c1172f7b1..99153cdf75539 100755 --- a/ci/scripts/python_build.sh +++ b/ci/scripts/python_build.sh @@ -74,6 +74,8 @@ export PYARROW_WITH_SUBSTRAIT=${ARROW_SUBSTRAIT:-OFF} export PYARROW_PARALLEL=${n_jobs} +: ${CMAKE_PREFIX_PATH:=${ARROW_HOME}} +export CMAKE_PREFIX_PATH export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} pushd ${source_dir} From 2cf844d4386e1ccad8592b421c18dfe9a82d2e42 Mon Sep 17 00:00:00 2001 From: Adam Curtis Date: Tue, 23 Apr 2024 01:24:47 -0400 Subject: [PATCH 007/106] GH-37720: [Go][FlightSQL] Add prepared statement handle to DoPut result (#40311) ### Rationale for this change See discussion on https://github.com/apache/arrow/issues/37720 and mailing list: https://lists.apache.org/thread/3kb82ypx99q96g84qv555l6x8r0bppyq ### What changes are included in this PR? Changes the Go FlightSQL client and server implementations to support returning an updated prepared statement handle to the client as part of the `DoPut(PreparedStatement)` RPC call. ### Are these changes tested? ### Are there any user-facing changes? See parent issue and docs PR #40243 for details of user facing changes. 
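To make the new client behavior concrete, here is a minimal sketch of using a prepared statement with this change. This example is not from the PR itself: the server address, query, parameter schema, and the `go/v16` module path are placeholders/assumptions, and error handling is reduced to panics.

```go
package main

import (
	"context"
	"fmt"

	"github.com/apache/arrow/go/v16/arrow" // module version path is an assumption
	"github.com/apache/arrow/go/v16/arrow/array"
	"github.com/apache/arrow/go/v16/arrow/flight/flightsql"
	"github.com/apache/arrow/go/v16/arrow/memory"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
)

func main() {
	ctx := context.Background()
	// Placeholder address for any running Flight SQL server.
	client, err := flightsql.NewClient("localhost:31337", nil, nil,
		grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		panic(err)
	}
	defer client.Close()

	prepared, err := client.Prepare(ctx, "SELECT * FROM t WHERE id = ?")
	if err != nil {
		panic(err)
	}
	defer prepared.Close(ctx)
	fmt.Printf("handle before Execute: %q\n", prepared.Handle())

	// Bind a one-row parameter batch; it is the parameter-binding DoPut
	// round trip in which the server may now return an updated handle.
	schema := arrow.NewSchema(
		[]arrow.Field{{Name: "id", Type: arrow.PrimitiveTypes.Int64, Nullable: true}}, nil)
	bldr := array.NewRecordBuilder(memory.DefaultAllocator, schema)
	defer bldr.Release()
	bldr.Field(0).(*array.Int64Builder).Append(42)
	rec := bldr.NewRecord()
	defer rec.Release()
	prepared.SetParameters(rec)

	// If the server's DoPutPreparedStatementResult carries a new handle,
	// Execute swaps it in transparently; a legacy server that sends no
	// result leaves the handle unchanged.
	if _, err := prepared.Execute(ctx); err != nil {
		panic(err)
	}
	fmt.Printf("handle after Execute: %q\n", prepared.Handle())
}
```

Note that `Close` is issued against whatever handle the statement currently holds, which is why picking up the server-assigned replacement matters.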
**This PR includes breaking changes to public APIs.** * GitHub Issue: #37720 Lead-authored-by: Adam Curtis Co-authored-by: David Li Signed-off-by: David Li --- go/arrow/flight/flightsql/client.go | 36 +- go/arrow/flight/flightsql/client_test.go | 35 +- .../flight/flightsql/driver/driver_test.go | 10 +- .../flight/flightsql/example/sqlite_server.go | 8 +- go/arrow/flight/flightsql/server.go | 17 +- go/arrow/flight/gen/flight/FlightSql.pb.go | 1128 +++++++++-------- .../internal/flight_integration/scenario.go | 6 +- .../arrow/flight/sql/FlightSqlClient.java | 4 +- 8 files changed, 683 insertions(+), 561 deletions(-) diff --git a/go/arrow/flight/flightsql/client.go b/go/arrow/flight/flightsql/client.go index 73293cd97a6ec..e594191c35fdf 100644 --- a/go/arrow/flight/flightsql/client.go +++ b/go/arrow/flight/flightsql/client.go @@ -1124,7 +1124,6 @@ func (p *PreparedStatement) Execute(ctx context.Context, opts ...grpc.CallOption if err != nil { return nil, err } - wr, err := p.writeBindParameters(pstream, desc) if err != nil { return nil, err @@ -1133,9 +1132,7 @@ func (p *PreparedStatement) Execute(ctx context.Context, opts ...grpc.CallOption return nil, err } pstream.CloseSend() - - // wait for the server to ack the result - if _, err = pstream.Recv(); err != nil && err != io.EOF { + if err = p.captureDoPutPreparedStatementHandle(pstream); err != nil { return nil, err } } @@ -1173,9 +1170,7 @@ func (p *PreparedStatement) ExecutePut(ctx context.Context, opts ...grpc.CallOpt return err } pstream.CloseSend() - - // wait for the server to ack the result - if _, err = pstream.Recv(); err != nil && err != io.EOF { + if err = p.captureDoPutPreparedStatementHandle(pstream); err != nil { return err } } @@ -1219,9 +1214,7 @@ func (p *PreparedStatement) ExecutePoll(ctx context.Context, retryDescriptor *fl return nil, err } pstream.CloseSend() - - // wait for the server to ack the result - if _, err = pstream.Recv(); err != nil && err != io.EOF { + if err = p.captureDoPutPreparedStatementHandle(pstream); err != nil { return nil, err } } @@ -1313,6 +1306,29 @@ func (p *PreparedStatement) writeBindParameters(pstream pb.FlightService_DoPutCl } } +func (p *PreparedStatement) captureDoPutPreparedStatementHandle(pstream pb.FlightService_DoPutClient) error { + var ( + result *pb.PutResult + preparedStatementResult pb.DoPutPreparedStatementResult + err error + ) + if result, err = pstream.Recv(); err != nil && err != io.EOF { + return err + } + // skip if server does not provide a response (legacy server) + if result == nil { + return nil + } + if err = proto.Unmarshal(result.GetAppMetadata(), &preparedStatementResult); err != nil { + return err + } + handle := preparedStatementResult.GetPreparedStatementHandle() + if handle != nil { + p.handle = handle + } + return nil +} + // DatasetSchema may be nil if the server did not return it when creating the // Prepared Statement. 
func (p *PreparedStatement) DatasetSchema() *arrow.Schema { return p.datasetSchema } diff --git a/go/arrow/flight/flightsql/client_test.go b/go/arrow/flight/flightsql/client_test.go index e84b74c028484..727fe02aa7063 100644 --- a/go/arrow/flight/flightsql/client_test.go +++ b/go/arrow/flight/flightsql/client_test.go @@ -408,24 +408,26 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecute() { func (s *FlightSqlClientSuite) TestPreparedStatementExecuteParamBinding() { const query = "query" + const handle = "handle" + const updatedHandle = "updated handle" // create and close actions cmd := &pb.ActionCreatePreparedStatementRequest{Query: query} action := getAction(cmd) action.Type = flightsql.CreatePreparedStatementActionType - closeAct := getAction(&pb.ActionClosePreparedStatementRequest{PreparedStatementHandle: []byte(query)}) + closeAct := getAction(&pb.ActionClosePreparedStatementRequest{PreparedStatementHandle: []byte(updatedHandle)}) closeAct.Type = flightsql.ClosePreparedStatementActionType // results from createprepared statement - result := &pb.ActionCreatePreparedStatementResult{ - PreparedStatementHandle: []byte(query), + actionResult := &pb.ActionCreatePreparedStatementResult{ + PreparedStatementHandle: []byte(handle), } schema := arrow.NewSchema([]arrow.Field{{Name: "id", Type: arrow.PrimitiveTypes.Int64, Nullable: true}}, nil) - result.ParameterSchema = flight.SerializeSchema(schema, memory.DefaultAllocator) + actionResult.ParameterSchema = flight.SerializeSchema(schema, memory.DefaultAllocator) // mocked client stream var out anypb.Any - out.MarshalFrom(result) + out.MarshalFrom(actionResult) data, _ := proto.Marshal(&out) createRsp := &mockDoActionClient{} @@ -443,7 +445,12 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecuteParamBinding() { s.mockClient.On("DoAction", flightsql.CreatePreparedStatementActionType, action.Body, s.callOpts).Return(createRsp, nil) s.mockClient.On("DoAction", flightsql.ClosePreparedStatementActionType, closeAct.Body, s.callOpts).Return(closeRsp, nil) - expectedDesc := getDesc(&pb.CommandPreparedStatementQuery{PreparedStatementHandle: []byte(query)}) + expectedDesc := getDesc(&pb.CommandPreparedStatementQuery{PreparedStatementHandle: []byte(handle)}) + + // mocked DoPut result + doPutPreparedStatementResult := &pb.DoPutPreparedStatementResult{PreparedStatementHandle: []byte(updatedHandle)} + resdata, _ := proto.Marshal(doPutPreparedStatementResult) + putResult := &pb.PutResult{ AppMetadata: resdata } // mocked client stream for DoPut mockedPut := &mockDoPutClient{} @@ -452,9 +459,9 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecuteParamBinding() { return proto.Equal(expectedDesc, fd.FlightDescriptor) })).Return(nil).Twice() // first sends schema message, second sends data mockedPut.On("CloseSend").Return(nil) - mockedPut.On("Recv").Return((*pb.PutResult)(nil), nil) + mockedPut.On("Recv").Return(putResult, nil) - infoCmd := &pb.CommandPreparedStatementQuery{PreparedStatementHandle: []byte(query)} + infoCmd := &pb.CommandPreparedStatementQuery{PreparedStatementHandle: []byte(handle)} desc := getDesc(infoCmd) s.mockClient.On("GetFlightInfo", desc.Type, desc.Cmd, s.callOpts).Return(&emptyFlightInfo, nil) @@ -462,19 +469,20 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecuteParamBinding() { s.NoError(err) defer prepared.Close(context.TODO(), s.callOpts...) 
- s.Equal(string(prepared.Handle()), "query") + s.Equal(string(prepared.Handle()), handle) paramSchema := prepared.ParameterSchema() rec, _, err := array.RecordFromJSON(memory.DefaultAllocator, paramSchema, strings.NewReader(`[{"id": 1}]`)) s.NoError(err) defer rec.Release() - s.Equal(string(prepared.Handle()), "query") + s.Equal(string(prepared.Handle()), handle) prepared.SetParameters(rec) info, err := prepared.Execute(context.TODO(), s.callOpts...) s.NoError(err) s.Equal(&emptyFlightInfo, info) + s.Equal(string(prepared.Handle()), updatedHandle) } func (s *FlightSqlClientSuite) TestPreparedStatementExecuteReaderBinding() { @@ -516,6 +524,11 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecuteReaderBinding() { expectedDesc := getDesc(&pb.CommandPreparedStatementQuery{PreparedStatementHandle: []byte(query)}) + // mocked DoPut result + doPutPreparedStatementResult := &pb.DoPutPreparedStatementResult{PreparedStatementHandle: []byte(query)} + resdata, _ := proto.Marshal(doPutPreparedStatementResult) + putResult := &pb.PutResult{ AppMetadata: resdata } + // mocked client stream for DoPut mockedPut := &mockDoPutClient{} s.mockClient.On("DoPut", s.callOpts).Return(mockedPut, nil) @@ -528,7 +541,7 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecuteReaderBinding() { return fd.FlightDescriptor == nil })).Return(nil).Times(3) mockedPut.On("CloseSend").Return(nil) - mockedPut.On("Recv").Return((*pb.PutResult)(nil), nil) + mockedPut.On("Recv").Return(putResult, nil) infoCmd := &pb.CommandPreparedStatementQuery{PreparedStatementHandle: []byte(query)} desc := getDesc(infoCmd) diff --git a/go/arrow/flight/flightsql/driver/driver_test.go b/go/arrow/flight/flightsql/driver/driver_test.go index 0c42cf59e7388..d4361eb2dd722 100644 --- a/go/arrow/flight/flightsql/driver/driver_test.go +++ b/go/arrow/flight/flightsql/driver/driver_test.go @@ -1768,16 +1768,16 @@ func (s *MockServer) CreatePreparedStatement(ctx context.Context, req flightsql. 
}, nil } -func (s *MockServer) DoPutPreparedStatementQuery(ctx context.Context, qry flightsql.PreparedStatementQuery, r flight.MessageReader, w flight.MetadataWriter) error { +func (s *MockServer) DoPutPreparedStatementQuery(ctx context.Context, qry flightsql.PreparedStatementQuery, r flight.MessageReader, w flight.MetadataWriter) ([]byte, error) { if s.ExpectedPreparedStatementSchema != nil { if !s.ExpectedPreparedStatementSchema.Equal(r.Schema()) { - return errors.New("parameter schema: unexpected") + return nil, errors.New("parameter schema: unexpected") } - return nil + return qry.GetPreparedStatementHandle(), nil } if s.PreparedStatementParameterSchema != nil && !s.PreparedStatementParameterSchema.Equal(r.Schema()) { - return fmt.Errorf("parameter schema: %w", arrow.ErrInvalid) + return nil, fmt.Errorf("parameter schema: %w", arrow.ErrInvalid) } // GH-35328: it's rare, but this function can complete execution and return @@ -1791,7 +1791,7 @@ func (s *MockServer) DoPutPreparedStatementQuery(ctx context.Context, qry flight for r.Next() { } - return nil + return qry.GetPreparedStatementHandle(), nil } func (s *MockServer) DoGetStatement(ctx context.Context, ticket flightsql.StatementQueryTicket) (*arrow.Schema, <-chan flight.StreamChunk, error) { diff --git a/go/arrow/flight/flightsql/example/sqlite_server.go b/go/arrow/flight/flightsql/example/sqlite_server.go index fcabebf9806b6..b5a64d57dceb6 100644 --- a/go/arrow/flight/flightsql/example/sqlite_server.go +++ b/go/arrow/flight/flightsql/example/sqlite_server.go @@ -618,21 +618,21 @@ func getParamsForStatement(rdr flight.MessageReader) (params [][]interface{}, er return params, rdr.Err() } -func (s *SQLiteFlightSQLServer) DoPutPreparedStatementQuery(_ context.Context, cmd flightsql.PreparedStatementQuery, rdr flight.MessageReader, _ flight.MetadataWriter) error { +func (s *SQLiteFlightSQLServer) DoPutPreparedStatementQuery(_ context.Context, cmd flightsql.PreparedStatementQuery, rdr flight.MessageReader, _ flight.MetadataWriter) ([]byte, error) { val, ok := s.prepared.Load(string(cmd.GetPreparedStatementHandle())) if !ok { - return status.Error(codes.InvalidArgument, "prepared statement not found") + return nil, status.Error(codes.InvalidArgument, "prepared statement not found") } stmt := val.(Statement) args, err := getParamsForStatement(rdr) if err != nil { - return status.Errorf(codes.Internal, "error gathering parameters for prepared statement query: %s", err.Error()) + return nil, status.Errorf(codes.Internal, "error gathering parameters for prepared statement query: %s", err.Error()) } stmt.params = args s.prepared.Store(string(cmd.GetPreparedStatementHandle()), stmt) - return nil + return cmd.GetPreparedStatementHandle(), nil } func (s *SQLiteFlightSQLServer) DoPutPreparedStatementUpdate(ctx context.Context, cmd flightsql.PreparedStatementUpdate, rdr flight.MessageReader) (int64, error) { diff --git a/go/arrow/flight/flightsql/server.go b/go/arrow/flight/flightsql/server.go index 4742a2dceeb01..25bdc5f4d5bb3 100644 --- a/go/arrow/flight/flightsql/server.go +++ b/go/arrow/flight/flightsql/server.go @@ -510,8 +510,8 @@ func (BaseServer) DoPutCommandSubstraitPlan(context.Context, StatementSubstraitP return 0, status.Error(codes.Unimplemented, "DoPutCommandSubstraitPlan not implemented") } -func (BaseServer) DoPutPreparedStatementQuery(context.Context, PreparedStatementQuery, flight.MessageReader, flight.MetadataWriter) error { - return status.Error(codes.Unimplemented, "DoPutPreparedStatementQuery not implemented") +func 
(BaseServer) DoPutPreparedStatementQuery(context.Context, PreparedStatementQuery, flight.MessageReader, flight.MetadataWriter) ([]byte, error) { + return nil, status.Error(codes.Unimplemented, "DoPutPreparedStatementQuery not implemented") } func (BaseServer) DoPutPreparedStatementUpdate(context.Context, PreparedStatementUpdate, flight.MessageReader) (int64, error) { @@ -677,7 +677,7 @@ type Server interface { // Currently anything written to the writer will be ignored. It is in the // interface for potential future enhancements to avoid having to change // the interface in the future. - DoPutPreparedStatementQuery(context.Context, PreparedStatementQuery, flight.MessageReader, flight.MetadataWriter) error + DoPutPreparedStatementQuery(context.Context, PreparedStatementQuery, flight.MessageReader, flight.MetadataWriter) ([]byte, error) // DoPutPreparedStatementUpdate executes an update SQL Prepared statement // for the specified statement handle. The reader allows providing a sequence // of uploaded record batches to bind the parameters to. Returns the number @@ -990,7 +990,16 @@ func (f *flightSqlServer) DoPut(stream flight.FlightService_DoPutServer) error { } return stream.Send(out) case *pb.CommandPreparedStatementQuery: - return f.srv.DoPutPreparedStatementQuery(stream.Context(), cmd, rdr, &putMetadataWriter{stream}) + handle, err := f.srv.DoPutPreparedStatementQuery(stream.Context(), cmd, rdr, &putMetadataWriter{stream}) + if err != nil { + return err + } + result := pb.DoPutPreparedStatementResult{PreparedStatementHandle: handle} + out := &flight.PutResult{} + if out.AppMetadata, err = proto.Marshal(&result); err != nil { + return status.Errorf(codes.Internal, "failed to marshal PutResult: %s", err.Error()) + } + return stream.Send(out) case *pb.CommandPreparedStatementUpdate: recordCount, err := f.srv.DoPutPreparedStatementUpdate(stream.Context(), cmd, rdr) if err != nil { diff --git a/go/arrow/flight/gen/flight/FlightSql.pb.go b/go/arrow/flight/gen/flight/FlightSql.pb.go index 3ff7c5fc147a6..196c1d6b33643 100644 --- a/go/arrow/flight/gen/flight/FlightSql.pb.go +++ b/go/arrow/flight/gen/flight/FlightSql.pb.go @@ -2355,7 +2355,7 @@ func (x ActionCancelQueryResult_CancelResult) Number() protoreflect.EnumNumber { // Deprecated: Use ActionCancelQueryResult_CancelResult.Descriptor instead. func (ActionCancelQueryResult_CancelResult) EnumDescriptor() ([]byte, []int) { - return file_FlightSql_proto_rawDescGZIP(), []int{30, 0} + return file_FlightSql_proto_rawDescGZIP(), []int{31, 0} } // Represents a metadata request. Used in the command member of FlightDescriptor @@ -4469,6 +4469,66 @@ func (x *DoPutUpdateResult) GetRecordCount() int64 { return 0 } +// An *optional* response returned when `DoPut` is called with `CommandPreparedStatementQuery`. +// +// *Note on legacy behavior*: previous versions of the protocol did not return any result for +// this command, and that behavior should still be supported by clients. In that case, the client +// can continue as though the fields in this message were not provided or set to sensible default values. +type DoPutPreparedStatementResult struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + // Represents a (potentially updated) opaque handle for the prepared statement on the server. 
+	// Because the handle could potentially be updated, any previous handles for this prepared
+	// statement should be considered invalid, and all subsequent requests for this prepared
+	// statement must use this new handle.
+	// The updated handle allows implementing query parameters with stateless services.
+	//
+	// When an updated handle is not provided by the server, clients should continue
+	// using the previous handle provided by `ActionCreatePreparedStatementResult`.
+	PreparedStatementHandle []byte `protobuf:"bytes,1,opt,name=prepared_statement_handle,json=preparedStatementHandle,proto3,oneof" json:"prepared_statement_handle,omitempty"`
+}
+
+func (x *DoPutPreparedStatementResult) Reset() {
+	*x = DoPutPreparedStatementResult{}
+	if protoimpl.UnsafeEnabled {
+		mi := &file_FlightSql_proto_msgTypes[29]
+		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
+		ms.StoreMessageInfo(mi)
+	}
+}
+
+func (x *DoPutPreparedStatementResult) String() string {
+	return protoimpl.X.MessageStringOf(x)
+}
+
+func (*DoPutPreparedStatementResult) ProtoMessage() {}
+
+func (x *DoPutPreparedStatementResult) ProtoReflect() protoreflect.Message {
+	mi := &file_FlightSql_proto_msgTypes[29]
+	if protoimpl.UnsafeEnabled && x != nil {
+		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
+		if ms.LoadMessageInfo() == nil {
+			ms.StoreMessageInfo(mi)
+		}
+		return ms
+	}
+	return mi.MessageOf(x)
+}
+
+// Deprecated: Use DoPutPreparedStatementResult.ProtoReflect.Descriptor instead.
+func (*DoPutPreparedStatementResult) Descriptor() ([]byte, []int) {
+	return file_FlightSql_proto_rawDescGZIP(), []int{29}
+}
+
+func (x *DoPutPreparedStatementResult) GetPreparedStatementHandle() []byte {
+	if x != nil {
+		return x.PreparedStatementHandle
+	}
+	return nil
+}
+
 // Request message for the "CancelQuery" action.
 //
 // Explicitly cancel a running query.
@@ -4501,7 +4561,7 @@ type ActionCancelQueryRequest struct {
 func (x *ActionCancelQueryRequest) Reset() {
 	*x = ActionCancelQueryRequest{}
 	if protoimpl.UnsafeEnabled {
-		mi := &file_FlightSql_proto_msgTypes[29]
+		mi := &file_FlightSql_proto_msgTypes[30]
 		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
 		ms.StoreMessageInfo(mi)
 	}
 }
@@ -4514,7 +4574,7 @@ func (x *ActionCancelQueryRequest) String() string {
 
 func (*ActionCancelQueryRequest) ProtoMessage() {}
 
 func (x *ActionCancelQueryRequest) ProtoReflect() protoreflect.Message {
-	mi := &file_FlightSql_proto_msgTypes[29]
+	mi := &file_FlightSql_proto_msgTypes[30]
 	if protoimpl.UnsafeEnabled && x != nil {
 		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
 		if ms.LoadMessageInfo() == nil {
@@ -4527,7 +4587,7 @@ func (x *ActionCancelQueryRequest) ProtoReflect() protoreflect.Message {
 
 // Deprecated: Use ActionCancelQueryRequest.ProtoReflect.Descriptor instead.
func (*ActionCancelQueryRequest) Descriptor() ([]byte, []int) { - return file_FlightSql_proto_rawDescGZIP(), []int{29} + return file_FlightSql_proto_rawDescGZIP(), []int{30} } func (x *ActionCancelQueryRequest) GetInfo() []byte { @@ -4556,7 +4616,7 @@ type ActionCancelQueryResult struct { func (x *ActionCancelQueryResult) Reset() { *x = ActionCancelQueryResult{} if protoimpl.UnsafeEnabled { - mi := &file_FlightSql_proto_msgTypes[30] + mi := &file_FlightSql_proto_msgTypes[31] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -4569,7 +4629,7 @@ func (x *ActionCancelQueryResult) String() string { func (*ActionCancelQueryResult) ProtoMessage() {} func (x *ActionCancelQueryResult) ProtoReflect() protoreflect.Message { - mi := &file_FlightSql_proto_msgTypes[30] + mi := &file_FlightSql_proto_msgTypes[31] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -4582,7 +4642,7 @@ func (x *ActionCancelQueryResult) ProtoReflect() protoreflect.Message { // Deprecated: Use ActionCancelQueryResult.ProtoReflect.Descriptor instead. func (*ActionCancelQueryResult) Descriptor() ([]byte, []int) { - return file_FlightSql_proto_rawDescGZIP(), []int{30} + return file_FlightSql_proto_rawDescGZIP(), []int{31} } func (x *ActionCancelQueryResult) GetResult() ActionCancelQueryResult_CancelResult { @@ -4605,7 +4665,7 @@ type CommandStatementIngest_TableDefinitionOptions struct { func (x *CommandStatementIngest_TableDefinitionOptions) Reset() { *x = CommandStatementIngest_TableDefinitionOptions{} if protoimpl.UnsafeEnabled { - mi := &file_FlightSql_proto_msgTypes[31] + mi := &file_FlightSql_proto_msgTypes[32] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -4618,7 +4678,7 @@ func (x *CommandStatementIngest_TableDefinitionOptions) String() string { func (*CommandStatementIngest_TableDefinitionOptions) ProtoMessage() {} func (x *CommandStatementIngest_TableDefinitionOptions) ProtoReflect() protoreflect.Message { - mi := &file_FlightSql_proto_msgTypes[31] + mi := &file_FlightSql_proto_msgTypes[32] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -4966,515 +5026,523 @@ var file_FlightSql_proto_rawDesc = []byte{ 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x21, 0x0a, 0x0c, 0x72, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x5f, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x03, 0x52, 0x0b, 0x72, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x43, 0x6f, 0x75, 0x6e, 0x74, 0x3a, - 0x03, 0xc0, 0x3e, 0x01, 0x22, 0x35, 0x0a, 0x18, 0x41, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x43, 0x61, - 0x6e, 0x63, 0x65, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, - 0x12, 0x12, 0x0a, 0x04, 0x69, 0x6e, 0x66, 0x6f, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x04, - 0x69, 0x6e, 0x66, 0x6f, 0x3a, 0x05, 0xc0, 0x3e, 0x01, 0x18, 0x01, 0x22, 0x87, 0x02, 0x0a, 0x17, - 0x41, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x43, 0x61, 0x6e, 0x63, 0x65, 0x6c, 0x51, 0x75, 0x65, 0x72, - 0x79, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x57, 0x0a, 0x06, 0x72, 0x65, 0x73, 0x75, 0x6c, - 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x3f, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, - 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x63, 0x6f, 0x6c, 0x2e, - 0x73, 0x71, 0x6c, 0x2e, 0x41, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x43, 0x61, 0x6e, 0x63, 0x65, 0x6c, - 0x51, 0x75, 0x65, 0x72, 0x79, 0x52, 
0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x43, 0x61, 0x6e, 0x63, - 0x65, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x52, 0x06, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, - 0x22, 0x8b, 0x01, 0x0a, 0x0c, 0x43, 0x61, 0x6e, 0x63, 0x65, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, - 0x74, 0x12, 0x1d, 0x0a, 0x19, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x53, 0x55, - 0x4c, 0x54, 0x5f, 0x55, 0x4e, 0x53, 0x50, 0x45, 0x43, 0x49, 0x46, 0x49, 0x45, 0x44, 0x10, 0x00, - 0x12, 0x1b, 0x0a, 0x17, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, - 0x54, 0x5f, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x4c, 0x45, 0x44, 0x10, 0x01, 0x12, 0x1c, 0x0a, - 0x18, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x43, - 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x4c, 0x49, 0x4e, 0x47, 0x10, 0x02, 0x12, 0x21, 0x0a, 0x1d, 0x43, - 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x4e, 0x4f, 0x54, - 0x5f, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x4c, 0x41, 0x42, 0x4c, 0x45, 0x10, 0x03, 0x3a, 0x05, - 0xc0, 0x3e, 0x01, 0x18, 0x01, 0x2a, 0x92, 0x19, 0x0a, 0x07, 0x53, 0x71, 0x6c, 0x49, 0x6e, 0x66, - 0x6f, 0x12, 0x1a, 0x0a, 0x16, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, 0x51, 0x4c, 0x5f, - 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, 0x5f, 0x4e, 0x41, 0x4d, 0x45, 0x10, 0x00, 0x12, 0x1d, 0x0a, - 0x19, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x52, 0x56, - 0x45, 0x52, 0x5f, 0x56, 0x45, 0x52, 0x53, 0x49, 0x4f, 0x4e, 0x10, 0x01, 0x12, 0x23, 0x0a, 0x1f, - 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x52, 0x56, 0x45, - 0x52, 0x5f, 0x41, 0x52, 0x52, 0x4f, 0x57, 0x5f, 0x56, 0x45, 0x52, 0x53, 0x49, 0x4f, 0x4e, 0x10, - 0x02, 0x12, 0x1f, 0x0a, 0x1b, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, 0x51, 0x4c, 0x5f, - 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, 0x5f, 0x52, 0x45, 0x41, 0x44, 0x5f, 0x4f, 0x4e, 0x4c, 0x59, - 0x10, 0x03, 0x12, 0x19, 0x0a, 0x15, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, 0x51, 0x4c, - 0x5f, 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, 0x5f, 0x53, 0x51, 0x4c, 0x10, 0x04, 0x12, 0x1f, 0x0a, - 0x1b, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x52, 0x56, - 0x45, 0x52, 0x5f, 0x53, 0x55, 0x42, 0x53, 0x54, 0x52, 0x41, 0x49, 0x54, 0x10, 0x05, 0x12, 0x2b, - 0x0a, 0x27, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x52, - 0x56, 0x45, 0x52, 0x5f, 0x53, 0x55, 0x42, 0x53, 0x54, 0x52, 0x41, 0x49, 0x54, 0x5f, 0x4d, 0x49, - 0x4e, 0x5f, 0x56, 0x45, 0x52, 0x53, 0x49, 0x4f, 0x4e, 0x10, 0x06, 0x12, 0x2b, 0x0a, 0x27, 0x46, - 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, - 0x5f, 0x53, 0x55, 0x42, 0x53, 0x54, 0x52, 0x41, 0x49, 0x54, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x56, - 0x45, 0x52, 0x53, 0x49, 0x4f, 0x4e, 0x10, 0x07, 0x12, 0x21, 0x0a, 0x1d, 0x46, 0x4c, 0x49, 0x47, - 0x48, 0x54, 0x5f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, 0x5f, 0x54, 0x52, - 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x10, 0x08, 0x12, 0x1c, 0x0a, 0x18, 0x46, - 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, - 0x5f, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x10, 0x09, 0x12, 0x24, 0x0a, 0x20, 0x46, 0x4c, 0x49, - 0x47, 0x48, 0x54, 0x5f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, 0x5f, 0x42, - 0x55, 0x4c, 0x4b, 0x5f, 0x49, 0x4e, 0x47, 0x45, 0x53, 0x54, 0x49, 0x4f, 0x4e, 0x10, 0x0a, 0x12, - 0x33, 0x0a, 0x2f, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 
0x53, 0x51, 0x4c, 0x5f, 0x53, 0x45, - 0x52, 0x56, 0x45, 0x52, 0x5f, 0x49, 0x4e, 0x47, 0x45, 0x53, 0x54, 0x5f, 0x54, 0x52, 0x41, 0x4e, - 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, - 0x45, 0x44, 0x10, 0x0b, 0x12, 0x27, 0x0a, 0x23, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, - 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x4d, - 0x45, 0x4e, 0x54, 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x4f, 0x55, 0x54, 0x10, 0x64, 0x12, 0x29, 0x0a, - 0x25, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x52, 0x56, - 0x45, 0x52, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x54, - 0x49, 0x4d, 0x45, 0x4f, 0x55, 0x54, 0x10, 0x65, 0x12, 0x14, 0x0a, 0x0f, 0x53, 0x51, 0x4c, 0x5f, - 0x44, 0x44, 0x4c, 0x5f, 0x43, 0x41, 0x54, 0x41, 0x4c, 0x4f, 0x47, 0x10, 0xf4, 0x03, 0x12, 0x13, - 0x0a, 0x0e, 0x53, 0x51, 0x4c, 0x5f, 0x44, 0x44, 0x4c, 0x5f, 0x53, 0x43, 0x48, 0x45, 0x4d, 0x41, - 0x10, 0xf5, 0x03, 0x12, 0x12, 0x0a, 0x0d, 0x53, 0x51, 0x4c, 0x5f, 0x44, 0x44, 0x4c, 0x5f, 0x54, - 0x41, 0x42, 0x4c, 0x45, 0x10, 0xf6, 0x03, 0x12, 0x18, 0x0a, 0x13, 0x53, 0x51, 0x4c, 0x5f, 0x49, - 0x44, 0x45, 0x4e, 0x54, 0x49, 0x46, 0x49, 0x45, 0x52, 0x5f, 0x43, 0x41, 0x53, 0x45, 0x10, 0xf7, - 0x03, 0x12, 0x1e, 0x0a, 0x19, 0x53, 0x51, 0x4c, 0x5f, 0x49, 0x44, 0x45, 0x4e, 0x54, 0x49, 0x46, - 0x49, 0x45, 0x52, 0x5f, 0x51, 0x55, 0x4f, 0x54, 0x45, 0x5f, 0x43, 0x48, 0x41, 0x52, 0x10, 0xf8, - 0x03, 0x12, 0x1f, 0x0a, 0x1a, 0x53, 0x51, 0x4c, 0x5f, 0x51, 0x55, 0x4f, 0x54, 0x45, 0x44, 0x5f, - 0x49, 0x44, 0x45, 0x4e, 0x54, 0x49, 0x46, 0x49, 0x45, 0x52, 0x5f, 0x43, 0x41, 0x53, 0x45, 0x10, - 0xf9, 0x03, 0x12, 0x22, 0x0a, 0x1d, 0x53, 0x51, 0x4c, 0x5f, 0x41, 0x4c, 0x4c, 0x5f, 0x54, 0x41, - 0x42, 0x4c, 0x45, 0x53, 0x5f, 0x41, 0x52, 0x45, 0x5f, 0x53, 0x45, 0x4c, 0x45, 0x43, 0x54, 0x41, - 0x42, 0x4c, 0x45, 0x10, 0xfa, 0x03, 0x12, 0x16, 0x0a, 0x11, 0x53, 0x51, 0x4c, 0x5f, 0x4e, 0x55, - 0x4c, 0x4c, 0x5f, 0x4f, 0x52, 0x44, 0x45, 0x52, 0x49, 0x4e, 0x47, 0x10, 0xfb, 0x03, 0x12, 0x11, - 0x0a, 0x0c, 0x53, 0x51, 0x4c, 0x5f, 0x4b, 0x45, 0x59, 0x57, 0x4f, 0x52, 0x44, 0x53, 0x10, 0xfc, - 0x03, 0x12, 0x1a, 0x0a, 0x15, 0x53, 0x51, 0x4c, 0x5f, 0x4e, 0x55, 0x4d, 0x45, 0x52, 0x49, 0x43, - 0x5f, 0x46, 0x55, 0x4e, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x10, 0xfd, 0x03, 0x12, 0x19, 0x0a, - 0x14, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x54, 0x52, 0x49, 0x4e, 0x47, 0x5f, 0x46, 0x55, 0x4e, 0x43, - 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x10, 0xfe, 0x03, 0x12, 0x19, 0x0a, 0x14, 0x53, 0x51, 0x4c, 0x5f, - 0x53, 0x59, 0x53, 0x54, 0x45, 0x4d, 0x5f, 0x46, 0x55, 0x4e, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x53, - 0x10, 0xff, 0x03, 0x12, 0x1b, 0x0a, 0x16, 0x53, 0x51, 0x4c, 0x5f, 0x44, 0x41, 0x54, 0x45, 0x54, - 0x49, 0x4d, 0x45, 0x5f, 0x46, 0x55, 0x4e, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x10, 0x80, 0x04, - 0x12, 0x1d, 0x0a, 0x18, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x41, 0x52, 0x43, 0x48, 0x5f, 0x53, - 0x54, 0x52, 0x49, 0x4e, 0x47, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x10, 0x81, 0x04, 0x12, - 0x1e, 0x0a, 0x19, 0x53, 0x51, 0x4c, 0x5f, 0x45, 0x58, 0x54, 0x52, 0x41, 0x5f, 0x4e, 0x41, 0x4d, - 0x45, 0x5f, 0x43, 0x48, 0x41, 0x52, 0x41, 0x43, 0x54, 0x45, 0x52, 0x53, 0x10, 0x82, 0x04, 0x12, - 0x21, 0x0a, 0x1c, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x53, 0x5f, - 0x43, 0x4f, 0x4c, 0x55, 0x4d, 0x4e, 0x5f, 0x41, 0x4c, 0x49, 0x41, 0x53, 0x49, 0x4e, 0x47, 0x10, - 0x83, 0x04, 0x12, 0x1f, 0x0a, 0x1a, 0x53, 0x51, 0x4c, 0x5f, 0x4e, 0x55, 0x4c, 0x4c, 
0x5f, 0x50, - 0x4c, 0x55, 0x53, 0x5f, 0x4e, 0x55, 0x4c, 0x4c, 0x5f, 0x49, 0x53, 0x5f, 0x4e, 0x55, 0x4c, 0x4c, - 0x10, 0x84, 0x04, 0x12, 0x19, 0x0a, 0x14, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, - 0x52, 0x54, 0x53, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x10, 0x85, 0x04, 0x12, 0x29, - 0x0a, 0x24, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x53, 0x5f, 0x54, - 0x41, 0x42, 0x4c, 0x45, 0x5f, 0x43, 0x4f, 0x52, 0x52, 0x45, 0x4c, 0x41, 0x54, 0x49, 0x4f, 0x4e, - 0x5f, 0x4e, 0x41, 0x4d, 0x45, 0x53, 0x10, 0x86, 0x04, 0x12, 0x33, 0x0a, 0x2e, 0x53, 0x51, 0x4c, - 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x53, 0x5f, 0x44, 0x49, 0x46, 0x46, 0x45, 0x52, - 0x45, 0x4e, 0x54, 0x5f, 0x54, 0x41, 0x42, 0x4c, 0x45, 0x5f, 0x43, 0x4f, 0x52, 0x52, 0x45, 0x4c, - 0x41, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x4e, 0x41, 0x4d, 0x45, 0x53, 0x10, 0x87, 0x04, 0x12, 0x29, - 0x0a, 0x24, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x53, 0x5f, 0x45, - 0x58, 0x50, 0x52, 0x45, 0x53, 0x53, 0x49, 0x4f, 0x4e, 0x53, 0x5f, 0x49, 0x4e, 0x5f, 0x4f, 0x52, - 0x44, 0x45, 0x52, 0x5f, 0x42, 0x59, 0x10, 0x88, 0x04, 0x12, 0x24, 0x0a, 0x1f, 0x53, 0x51, 0x4c, - 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x53, 0x5f, 0x4f, 0x52, 0x44, 0x45, 0x52, 0x5f, - 0x42, 0x59, 0x5f, 0x55, 0x4e, 0x52, 0x45, 0x4c, 0x41, 0x54, 0x45, 0x44, 0x10, 0x89, 0x04, 0x12, - 0x1b, 0x0a, 0x16, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, - 0x5f, 0x47, 0x52, 0x4f, 0x55, 0x50, 0x5f, 0x42, 0x59, 0x10, 0x8a, 0x04, 0x12, 0x24, 0x0a, 0x1f, - 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x53, 0x5f, 0x4c, 0x49, 0x4b, - 0x45, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x5f, 0x43, 0x4c, 0x41, 0x55, 0x53, 0x45, 0x10, - 0x8b, 0x04, 0x12, 0x26, 0x0a, 0x21, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, - 0x54, 0x53, 0x5f, 0x4e, 0x4f, 0x4e, 0x5f, 0x4e, 0x55, 0x4c, 0x4c, 0x41, 0x42, 0x4c, 0x45, 0x5f, - 0x43, 0x4f, 0x4c, 0x55, 0x4d, 0x4e, 0x53, 0x10, 0x8c, 0x04, 0x12, 0x1a, 0x0a, 0x15, 0x53, 0x51, - 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x47, 0x52, 0x41, 0x4d, - 0x4d, 0x41, 0x52, 0x10, 0x8d, 0x04, 0x12, 0x1f, 0x0a, 0x1a, 0x53, 0x51, 0x4c, 0x5f, 0x41, 0x4e, - 0x53, 0x49, 0x39, 0x32, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x4c, - 0x45, 0x56, 0x45, 0x4c, 0x10, 0x8e, 0x04, 0x12, 0x30, 0x0a, 0x2b, 0x53, 0x51, 0x4c, 0x5f, 0x53, - 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x53, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x47, 0x52, 0x49, 0x54, - 0x59, 0x5f, 0x45, 0x4e, 0x48, 0x41, 0x4e, 0x43, 0x45, 0x4d, 0x45, 0x4e, 0x54, 0x5f, 0x46, 0x41, - 0x43, 0x49, 0x4c, 0x49, 0x54, 0x59, 0x10, 0x8f, 0x04, 0x12, 0x22, 0x0a, 0x1d, 0x53, 0x51, 0x4c, - 0x5f, 0x4f, 0x55, 0x54, 0x45, 0x52, 0x5f, 0x4a, 0x4f, 0x49, 0x4e, 0x53, 0x5f, 0x53, 0x55, 0x50, - 0x50, 0x4f, 0x52, 0x54, 0x5f, 0x4c, 0x45, 0x56, 0x45, 0x4c, 0x10, 0x90, 0x04, 0x12, 0x14, 0x0a, - 0x0f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x43, 0x48, 0x45, 0x4d, 0x41, 0x5f, 0x54, 0x45, 0x52, 0x4d, - 0x10, 0x91, 0x04, 0x12, 0x17, 0x0a, 0x12, 0x53, 0x51, 0x4c, 0x5f, 0x50, 0x52, 0x4f, 0x43, 0x45, - 0x44, 0x55, 0x52, 0x45, 0x5f, 0x54, 0x45, 0x52, 0x4d, 0x10, 0x92, 0x04, 0x12, 0x15, 0x0a, 0x10, - 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x41, 0x54, 0x41, 0x4c, 0x4f, 0x47, 0x5f, 0x54, 0x45, 0x52, 0x4d, - 0x10, 0x93, 0x04, 0x12, 0x19, 0x0a, 0x14, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x41, 0x54, 0x41, 0x4c, - 0x4f, 0x47, 0x5f, 0x41, 0x54, 0x5f, 0x53, 0x54, 0x41, 0x52, 0x54, 0x10, 0x94, 0x04, 0x12, 0x22, - 0x0a, 0x1d, 
0x53, 0x51, 0x4c, 0x5f, 0x53, 0x43, 0x48, 0x45, 0x4d, 0x41, 0x53, 0x5f, 0x53, 0x55, - 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x10, - 0x95, 0x04, 0x12, 0x23, 0x0a, 0x1e, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x41, 0x54, 0x41, 0x4c, 0x4f, - 0x47, 0x53, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x41, 0x43, 0x54, - 0x49, 0x4f, 0x4e, 0x53, 0x10, 0x96, 0x04, 0x12, 0x26, 0x0a, 0x21, 0x53, 0x51, 0x4c, 0x5f, 0x53, - 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x50, 0x4f, 0x53, 0x49, 0x54, 0x49, 0x4f, - 0x4e, 0x45, 0x44, 0x5f, 0x43, 0x4f, 0x4d, 0x4d, 0x41, 0x4e, 0x44, 0x53, 0x10, 0x97, 0x04, 0x12, - 0x24, 0x0a, 0x1f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x4c, 0x45, 0x43, 0x54, 0x5f, 0x46, 0x4f, - 0x52, 0x5f, 0x55, 0x50, 0x44, 0x41, 0x54, 0x45, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, - 0x45, 0x44, 0x10, 0x98, 0x04, 0x12, 0x24, 0x0a, 0x1f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x54, 0x4f, - 0x52, 0x45, 0x44, 0x5f, 0x50, 0x52, 0x4f, 0x43, 0x45, 0x44, 0x55, 0x52, 0x45, 0x53, 0x5f, 0x53, - 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x10, 0x99, 0x04, 0x12, 0x1d, 0x0a, 0x18, 0x53, - 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x53, 0x55, 0x42, - 0x51, 0x55, 0x45, 0x52, 0x49, 0x45, 0x53, 0x10, 0x9a, 0x04, 0x12, 0x28, 0x0a, 0x23, 0x53, 0x51, - 0x4c, 0x5f, 0x43, 0x4f, 0x52, 0x52, 0x45, 0x4c, 0x41, 0x54, 0x45, 0x44, 0x5f, 0x53, 0x55, 0x42, - 0x51, 0x55, 0x45, 0x52, 0x49, 0x45, 0x53, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, - 0x44, 0x10, 0x9b, 0x04, 0x12, 0x19, 0x0a, 0x14, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, - 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x55, 0x4e, 0x49, 0x4f, 0x4e, 0x53, 0x10, 0x9c, 0x04, 0x12, - 0x22, 0x0a, 0x1d, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x42, 0x49, 0x4e, 0x41, 0x52, - 0x59, 0x5f, 0x4c, 0x49, 0x54, 0x45, 0x52, 0x41, 0x4c, 0x5f, 0x4c, 0x45, 0x4e, 0x47, 0x54, 0x48, - 0x10, 0x9d, 0x04, 0x12, 0x20, 0x0a, 0x1b, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x43, - 0x48, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54, 0x45, 0x52, 0x41, 0x4c, 0x5f, 0x4c, 0x45, 0x4e, 0x47, - 0x54, 0x48, 0x10, 0x9e, 0x04, 0x12, 0x1f, 0x0a, 0x1a, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, - 0x5f, 0x43, 0x4f, 0x4c, 0x55, 0x4d, 0x4e, 0x5f, 0x4e, 0x41, 0x4d, 0x45, 0x5f, 0x4c, 0x45, 0x4e, - 0x47, 0x54, 0x48, 0x10, 0x9f, 0x04, 0x12, 0x20, 0x0a, 0x1b, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, - 0x58, 0x5f, 0x43, 0x4f, 0x4c, 0x55, 0x4d, 0x4e, 0x53, 0x5f, 0x49, 0x4e, 0x5f, 0x47, 0x52, 0x4f, - 0x55, 0x50, 0x5f, 0x42, 0x59, 0x10, 0xa0, 0x04, 0x12, 0x1d, 0x0a, 0x18, 0x53, 0x51, 0x4c, 0x5f, - 0x4d, 0x41, 0x58, 0x5f, 0x43, 0x4f, 0x4c, 0x55, 0x4d, 0x4e, 0x53, 0x5f, 0x49, 0x4e, 0x5f, 0x49, - 0x4e, 0x44, 0x45, 0x58, 0x10, 0xa1, 0x04, 0x12, 0x20, 0x0a, 0x1b, 0x53, 0x51, 0x4c, 0x5f, 0x4d, - 0x41, 0x58, 0x5f, 0x43, 0x4f, 0x4c, 0x55, 0x4d, 0x4e, 0x53, 0x5f, 0x49, 0x4e, 0x5f, 0x4f, 0x52, - 0x44, 0x45, 0x52, 0x5f, 0x42, 0x59, 0x10, 0xa2, 0x04, 0x12, 0x1e, 0x0a, 0x19, 0x53, 0x51, 0x4c, - 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x43, 0x4f, 0x4c, 0x55, 0x4d, 0x4e, 0x53, 0x5f, 0x49, 0x4e, 0x5f, - 0x53, 0x45, 0x4c, 0x45, 0x43, 0x54, 0x10, 0xa3, 0x04, 0x12, 0x1d, 0x0a, 0x18, 0x53, 0x51, 0x4c, - 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x43, 0x4f, 0x4c, 0x55, 0x4d, 0x4e, 0x53, 0x5f, 0x49, 0x4e, 0x5f, - 0x54, 0x41, 0x42, 0x4c, 0x45, 0x10, 0xa4, 0x04, 0x12, 0x18, 0x0a, 0x13, 0x53, 0x51, 0x4c, 0x5f, - 0x4d, 0x41, 0x58, 0x5f, 0x43, 0x4f, 0x4e, 0x4e, 0x45, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x10, - 0xa5, 0x04, 0x12, 0x1f, 0x0a, 0x1a, 
0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x43, 0x55, - 0x52, 0x53, 0x4f, 0x52, 0x5f, 0x4e, 0x41, 0x4d, 0x45, 0x5f, 0x4c, 0x45, 0x4e, 0x47, 0x54, 0x48, - 0x10, 0xa6, 0x04, 0x12, 0x19, 0x0a, 0x14, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x49, - 0x4e, 0x44, 0x45, 0x58, 0x5f, 0x4c, 0x45, 0x4e, 0x47, 0x54, 0x48, 0x10, 0xa7, 0x04, 0x12, 0x1e, - 0x0a, 0x19, 0x53, 0x51, 0x4c, 0x5f, 0x44, 0x42, 0x5f, 0x53, 0x43, 0x48, 0x45, 0x4d, 0x41, 0x5f, - 0x4e, 0x41, 0x4d, 0x45, 0x5f, 0x4c, 0x45, 0x4e, 0x47, 0x54, 0x48, 0x10, 0xa8, 0x04, 0x12, 0x22, - 0x0a, 0x1d, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x50, 0x52, 0x4f, 0x43, 0x45, 0x44, - 0x55, 0x52, 0x45, 0x5f, 0x4e, 0x41, 0x4d, 0x45, 0x5f, 0x4c, 0x45, 0x4e, 0x47, 0x54, 0x48, 0x10, - 0xa9, 0x04, 0x12, 0x20, 0x0a, 0x1b, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x43, 0x41, - 0x54, 0x41, 0x4c, 0x4f, 0x47, 0x5f, 0x4e, 0x41, 0x4d, 0x45, 0x5f, 0x4c, 0x45, 0x4e, 0x47, 0x54, - 0x48, 0x10, 0xaa, 0x04, 0x12, 0x15, 0x0a, 0x10, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, - 0x52, 0x4f, 0x57, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x10, 0xab, 0x04, 0x12, 0x24, 0x0a, 0x1f, 0x53, - 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x52, 0x4f, 0x57, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x5f, - 0x49, 0x4e, 0x43, 0x4c, 0x55, 0x44, 0x45, 0x53, 0x5f, 0x42, 0x4c, 0x4f, 0x42, 0x53, 0x10, 0xac, - 0x04, 0x12, 0x1d, 0x0a, 0x18, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x53, 0x54, 0x41, - 0x54, 0x45, 0x4d, 0x45, 0x4e, 0x54, 0x5f, 0x4c, 0x45, 0x4e, 0x47, 0x54, 0x48, 0x10, 0xad, 0x04, - 0x12, 0x17, 0x0a, 0x12, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x53, 0x54, 0x41, 0x54, - 0x45, 0x4d, 0x45, 0x4e, 0x54, 0x53, 0x10, 0xae, 0x04, 0x12, 0x1e, 0x0a, 0x19, 0x53, 0x51, 0x4c, - 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x54, 0x41, 0x42, 0x4c, 0x45, 0x5f, 0x4e, 0x41, 0x4d, 0x45, 0x5f, - 0x4c, 0x45, 0x4e, 0x47, 0x54, 0x48, 0x10, 0xaf, 0x04, 0x12, 0x1d, 0x0a, 0x18, 0x53, 0x51, 0x4c, - 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x54, 0x41, 0x42, 0x4c, 0x45, 0x53, 0x5f, 0x49, 0x4e, 0x5f, 0x53, - 0x45, 0x4c, 0x45, 0x43, 0x54, 0x10, 0xb0, 0x04, 0x12, 0x1c, 0x0a, 0x17, 0x53, 0x51, 0x4c, 0x5f, - 0x4d, 0x41, 0x58, 0x5f, 0x55, 0x53, 0x45, 0x52, 0x4e, 0x41, 0x4d, 0x45, 0x5f, 0x4c, 0x45, 0x4e, - 0x47, 0x54, 0x48, 0x10, 0xb1, 0x04, 0x12, 0x26, 0x0a, 0x21, 0x53, 0x51, 0x4c, 0x5f, 0x44, 0x45, - 0x46, 0x41, 0x55, 0x4c, 0x54, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, - 0x4e, 0x5f, 0x49, 0x53, 0x4f, 0x4c, 0x41, 0x54, 0x49, 0x4f, 0x4e, 0x10, 0xb2, 0x04, 0x12, 0x1f, - 0x0a, 0x1a, 0x53, 0x51, 0x4c, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, - 0x4e, 0x53, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x10, 0xb3, 0x04, 0x12, - 0x30, 0x0a, 0x2b, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, - 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x5f, 0x49, 0x53, - 0x4f, 0x4c, 0x41, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x4c, 0x45, 0x56, 0x45, 0x4c, 0x53, 0x10, 0xb4, - 0x04, 0x12, 0x32, 0x0a, 0x2d, 0x53, 0x51, 0x4c, 0x5f, 0x44, 0x41, 0x54, 0x41, 0x5f, 0x44, 0x45, - 0x46, 0x49, 0x4e, 0x49, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x43, 0x41, 0x55, 0x53, 0x45, 0x53, 0x5f, - 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x43, 0x4f, 0x4d, 0x4d, - 0x49, 0x54, 0x10, 0xb5, 0x04, 0x12, 0x31, 0x0a, 0x2c, 0x53, 0x51, 0x4c, 0x5f, 0x44, 0x41, 0x54, - 0x41, 0x5f, 0x44, 0x45, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x5f, 0x49, 0x4e, - 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 
0x4f, 0x4e, 0x53, 0x5f, 0x49, 0x47, - 0x4e, 0x4f, 0x52, 0x45, 0x44, 0x10, 0xb6, 0x04, 0x12, 0x23, 0x0a, 0x1e, 0x53, 0x51, 0x4c, 0x5f, - 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, - 0x5f, 0x53, 0x45, 0x54, 0x5f, 0x54, 0x59, 0x50, 0x45, 0x53, 0x10, 0xb7, 0x04, 0x12, 0x3b, 0x0a, - 0x36, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x43, + 0x03, 0xc0, 0x3e, 0x01, 0x22, 0x82, 0x01, 0x0a, 0x1c, 0x44, 0x6f, 0x50, 0x75, 0x74, 0x50, 0x72, + 0x65, 0x70, 0x61, 0x72, 0x65, 0x64, 0x53, 0x74, 0x61, 0x74, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x52, + 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x3f, 0x0a, 0x19, 0x70, 0x72, 0x65, 0x70, 0x61, 0x72, 0x65, + 0x64, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x5f, 0x68, 0x61, 0x6e, 0x64, + 0x6c, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0c, 0x48, 0x00, 0x52, 0x17, 0x70, 0x72, 0x65, 0x70, + 0x61, 0x72, 0x65, 0x64, 0x53, 0x74, 0x61, 0x74, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x48, 0x61, 0x6e, + 0x64, 0x6c, 0x65, 0x88, 0x01, 0x01, 0x3a, 0x03, 0xc0, 0x3e, 0x01, 0x42, 0x1c, 0x0a, 0x1a, 0x5f, + 0x70, 0x72, 0x65, 0x70, 0x61, 0x72, 0x65, 0x64, 0x5f, 0x73, 0x74, 0x61, 0x74, 0x65, 0x6d, 0x65, + 0x6e, 0x74, 0x5f, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x22, 0x35, 0x0a, 0x18, 0x41, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x43, 0x61, 0x6e, 0x63, 0x65, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x52, 0x65, + 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x12, 0x0a, 0x04, 0x69, 0x6e, 0x66, 0x6f, 0x18, 0x01, 0x20, + 0x01, 0x28, 0x0c, 0x52, 0x04, 0x69, 0x6e, 0x66, 0x6f, 0x3a, 0x05, 0xc0, 0x3e, 0x01, 0x18, 0x01, + 0x22, 0x87, 0x02, 0x0a, 0x17, 0x41, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x43, 0x61, 0x6e, 0x63, 0x65, + 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x57, 0x0a, 0x06, + 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x3f, 0x2e, 0x61, + 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, + 0x6f, 0x63, 0x6f, 0x6c, 0x2e, 0x73, 0x71, 0x6c, 0x2e, 0x41, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x43, + 0x61, 0x6e, 0x63, 0x65, 0x6c, 0x51, 0x75, 0x65, 0x72, 0x79, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, + 0x2e, 0x43, 0x61, 0x6e, 0x63, 0x65, 0x6c, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x52, 0x06, 0x72, + 0x65, 0x73, 0x75, 0x6c, 0x74, 0x22, 0x8b, 0x01, 0x0a, 0x0c, 0x43, 0x61, 0x6e, 0x63, 0x65, 0x6c, + 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x1d, 0x0a, 0x19, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, + 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x55, 0x4e, 0x53, 0x50, 0x45, 0x43, 0x49, 0x46, + 0x49, 0x45, 0x44, 0x10, 0x00, 0x12, 0x1b, 0x0a, 0x17, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x5f, + 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x4c, 0x45, 0x44, + 0x10, 0x01, 0x12, 0x1c, 0x0a, 0x18, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x53, + 0x55, 0x4c, 0x54, 0x5f, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x4c, 0x49, 0x4e, 0x47, 0x10, 0x02, + 0x12, 0x21, 0x0a, 0x1d, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, + 0x54, 0x5f, 0x4e, 0x4f, 0x54, 0x5f, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x4c, 0x41, 0x42, 0x4c, + 0x45, 0x10, 0x03, 0x3a, 0x05, 0xc0, 0x3e, 0x01, 0x18, 0x01, 0x2a, 0x92, 0x19, 0x0a, 0x07, 0x53, + 0x71, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x12, 0x1a, 0x0a, 0x16, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, + 0x5f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, 0x5f, 0x4e, 0x41, 0x4d, 0x45, + 0x10, 0x00, 0x12, 0x1d, 0x0a, 0x19, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, 
0x51, 0x4c, + 0x5f, 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, 0x5f, 0x56, 0x45, 0x52, 0x53, 0x49, 0x4f, 0x4e, 0x10, + 0x01, 0x12, 0x23, 0x0a, 0x1f, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, 0x51, 0x4c, 0x5f, + 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, 0x5f, 0x41, 0x52, 0x52, 0x4f, 0x57, 0x5f, 0x56, 0x45, 0x52, + 0x53, 0x49, 0x4f, 0x4e, 0x10, 0x02, 0x12, 0x1f, 0x0a, 0x1b, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, + 0x5f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, 0x5f, 0x52, 0x45, 0x41, 0x44, + 0x5f, 0x4f, 0x4e, 0x4c, 0x59, 0x10, 0x03, 0x12, 0x19, 0x0a, 0x15, 0x46, 0x4c, 0x49, 0x47, 0x48, + 0x54, 0x5f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, 0x5f, 0x53, 0x51, 0x4c, + 0x10, 0x04, 0x12, 0x1f, 0x0a, 0x1b, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, 0x51, 0x4c, + 0x5f, 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, 0x5f, 0x53, 0x55, 0x42, 0x53, 0x54, 0x52, 0x41, 0x49, + 0x54, 0x10, 0x05, 0x12, 0x2b, 0x0a, 0x27, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, 0x51, + 0x4c, 0x5f, 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, 0x5f, 0x53, 0x55, 0x42, 0x53, 0x54, 0x52, 0x41, + 0x49, 0x54, 0x5f, 0x4d, 0x49, 0x4e, 0x5f, 0x56, 0x45, 0x52, 0x53, 0x49, 0x4f, 0x4e, 0x10, 0x06, + 0x12, 0x2b, 0x0a, 0x27, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, 0x51, 0x4c, 0x5f, 0x53, + 0x45, 0x52, 0x56, 0x45, 0x52, 0x5f, 0x53, 0x55, 0x42, 0x53, 0x54, 0x52, 0x41, 0x49, 0x54, 0x5f, + 0x4d, 0x41, 0x58, 0x5f, 0x56, 0x45, 0x52, 0x53, 0x49, 0x4f, 0x4e, 0x10, 0x07, 0x12, 0x21, 0x0a, + 0x1d, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x52, 0x56, + 0x45, 0x52, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x10, 0x08, + 0x12, 0x1c, 0x0a, 0x18, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, 0x51, 0x4c, 0x5f, 0x53, + 0x45, 0x52, 0x56, 0x45, 0x52, 0x5f, 0x43, 0x41, 0x4e, 0x43, 0x45, 0x4c, 0x10, 0x09, 0x12, 0x24, + 0x0a, 0x20, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x52, + 0x56, 0x45, 0x52, 0x5f, 0x42, 0x55, 0x4c, 0x4b, 0x5f, 0x49, 0x4e, 0x47, 0x45, 0x53, 0x54, 0x49, + 0x4f, 0x4e, 0x10, 0x0a, 0x12, 0x33, 0x0a, 0x2f, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, + 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, 0x5f, 0x49, 0x4e, 0x47, 0x45, 0x53, 0x54, + 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x5f, 0x53, 0x55, + 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x10, 0x0b, 0x12, 0x27, 0x0a, 0x23, 0x46, 0x4c, 0x49, + 0x47, 0x48, 0x54, 0x5f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, 0x5f, 0x53, + 0x54, 0x41, 0x54, 0x45, 0x4d, 0x45, 0x4e, 0x54, 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x4f, 0x55, 0x54, + 0x10, 0x64, 0x12, 0x29, 0x0a, 0x25, 0x46, 0x4c, 0x49, 0x47, 0x48, 0x54, 0x5f, 0x53, 0x51, 0x4c, + 0x5f, 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, + 0x49, 0x4f, 0x4e, 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x4f, 0x55, 0x54, 0x10, 0x65, 0x12, 0x14, 0x0a, + 0x0f, 0x53, 0x51, 0x4c, 0x5f, 0x44, 0x44, 0x4c, 0x5f, 0x43, 0x41, 0x54, 0x41, 0x4c, 0x4f, 0x47, + 0x10, 0xf4, 0x03, 0x12, 0x13, 0x0a, 0x0e, 0x53, 0x51, 0x4c, 0x5f, 0x44, 0x44, 0x4c, 0x5f, 0x53, + 0x43, 0x48, 0x45, 0x4d, 0x41, 0x10, 0xf5, 0x03, 0x12, 0x12, 0x0a, 0x0d, 0x53, 0x51, 0x4c, 0x5f, + 0x44, 0x44, 0x4c, 0x5f, 0x54, 0x41, 0x42, 0x4c, 0x45, 0x10, 0xf6, 0x03, 0x12, 0x18, 0x0a, 0x13, + 0x53, 0x51, 0x4c, 0x5f, 0x49, 0x44, 0x45, 0x4e, 0x54, 0x49, 0x46, 0x49, 0x45, 0x52, 0x5f, 0x43, + 0x41, 0x53, 0x45, 0x10, 0xf7, 0x03, 0x12, 0x1e, 0x0a, 0x19, 0x53, 0x51, 0x4c, 0x5f, 0x49, 0x44, + 0x45, 0x4e, 
0x54, 0x49, 0x46, 0x49, 0x45, 0x52, 0x5f, 0x51, 0x55, 0x4f, 0x54, 0x45, 0x5f, 0x43, + 0x48, 0x41, 0x52, 0x10, 0xf8, 0x03, 0x12, 0x1f, 0x0a, 0x1a, 0x53, 0x51, 0x4c, 0x5f, 0x51, 0x55, + 0x4f, 0x54, 0x45, 0x44, 0x5f, 0x49, 0x44, 0x45, 0x4e, 0x54, 0x49, 0x46, 0x49, 0x45, 0x52, 0x5f, + 0x43, 0x41, 0x53, 0x45, 0x10, 0xf9, 0x03, 0x12, 0x22, 0x0a, 0x1d, 0x53, 0x51, 0x4c, 0x5f, 0x41, + 0x4c, 0x4c, 0x5f, 0x54, 0x41, 0x42, 0x4c, 0x45, 0x53, 0x5f, 0x41, 0x52, 0x45, 0x5f, 0x53, 0x45, + 0x4c, 0x45, 0x43, 0x54, 0x41, 0x42, 0x4c, 0x45, 0x10, 0xfa, 0x03, 0x12, 0x16, 0x0a, 0x11, 0x53, + 0x51, 0x4c, 0x5f, 0x4e, 0x55, 0x4c, 0x4c, 0x5f, 0x4f, 0x52, 0x44, 0x45, 0x52, 0x49, 0x4e, 0x47, + 0x10, 0xfb, 0x03, 0x12, 0x11, 0x0a, 0x0c, 0x53, 0x51, 0x4c, 0x5f, 0x4b, 0x45, 0x59, 0x57, 0x4f, + 0x52, 0x44, 0x53, 0x10, 0xfc, 0x03, 0x12, 0x1a, 0x0a, 0x15, 0x53, 0x51, 0x4c, 0x5f, 0x4e, 0x55, + 0x4d, 0x45, 0x52, 0x49, 0x43, 0x5f, 0x46, 0x55, 0x4e, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x10, + 0xfd, 0x03, 0x12, 0x19, 0x0a, 0x14, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x54, 0x52, 0x49, 0x4e, 0x47, + 0x5f, 0x46, 0x55, 0x4e, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x10, 0xfe, 0x03, 0x12, 0x19, 0x0a, + 0x14, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x59, 0x53, 0x54, 0x45, 0x4d, 0x5f, 0x46, 0x55, 0x4e, 0x43, + 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x10, 0xff, 0x03, 0x12, 0x1b, 0x0a, 0x16, 0x53, 0x51, 0x4c, 0x5f, + 0x44, 0x41, 0x54, 0x45, 0x54, 0x49, 0x4d, 0x45, 0x5f, 0x46, 0x55, 0x4e, 0x43, 0x54, 0x49, 0x4f, + 0x4e, 0x53, 0x10, 0x80, 0x04, 0x12, 0x1d, 0x0a, 0x18, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x41, + 0x52, 0x43, 0x48, 0x5f, 0x53, 0x54, 0x52, 0x49, 0x4e, 0x47, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, + 0x45, 0x10, 0x81, 0x04, 0x12, 0x1e, 0x0a, 0x19, 0x53, 0x51, 0x4c, 0x5f, 0x45, 0x58, 0x54, 0x52, + 0x41, 0x5f, 0x4e, 0x41, 0x4d, 0x45, 0x5f, 0x43, 0x48, 0x41, 0x52, 0x41, 0x43, 0x54, 0x45, 0x52, + 0x53, 0x10, 0x82, 0x04, 0x12, 0x21, 0x0a, 0x1c, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, + 0x4f, 0x52, 0x54, 0x53, 0x5f, 0x43, 0x4f, 0x4c, 0x55, 0x4d, 0x4e, 0x5f, 0x41, 0x4c, 0x49, 0x41, + 0x53, 0x49, 0x4e, 0x47, 0x10, 0x83, 0x04, 0x12, 0x1f, 0x0a, 0x1a, 0x53, 0x51, 0x4c, 0x5f, 0x4e, + 0x55, 0x4c, 0x4c, 0x5f, 0x50, 0x4c, 0x55, 0x53, 0x5f, 0x4e, 0x55, 0x4c, 0x4c, 0x5f, 0x49, 0x53, + 0x5f, 0x4e, 0x55, 0x4c, 0x4c, 0x10, 0x84, 0x04, 0x12, 0x19, 0x0a, 0x14, 0x53, 0x51, 0x4c, 0x5f, + 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x53, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, + 0x10, 0x85, 0x04, 0x12, 0x29, 0x0a, 0x24, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, + 0x52, 0x54, 0x53, 0x5f, 0x54, 0x41, 0x42, 0x4c, 0x45, 0x5f, 0x43, 0x4f, 0x52, 0x52, 0x45, 0x4c, + 0x41, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x4e, 0x41, 0x4d, 0x45, 0x53, 0x10, 0x86, 0x04, 0x12, 0x33, + 0x0a, 0x2e, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x53, 0x5f, 0x44, + 0x49, 0x46, 0x46, 0x45, 0x52, 0x45, 0x4e, 0x54, 0x5f, 0x54, 0x41, 0x42, 0x4c, 0x45, 0x5f, 0x43, + 0x4f, 0x52, 0x52, 0x45, 0x4c, 0x41, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x4e, 0x41, 0x4d, 0x45, 0x53, + 0x10, 0x87, 0x04, 0x12, 0x29, 0x0a, 0x24, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, + 0x52, 0x54, 0x53, 0x5f, 0x45, 0x58, 0x50, 0x52, 0x45, 0x53, 0x53, 0x49, 0x4f, 0x4e, 0x53, 0x5f, + 0x49, 0x4e, 0x5f, 0x4f, 0x52, 0x44, 0x45, 0x52, 0x5f, 0x42, 0x59, 0x10, 0x88, 0x04, 0x12, 0x24, + 0x0a, 0x1f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x53, 0x5f, 0x4f, + 0x52, 0x44, 0x45, 0x52, 0x5f, 0x42, 0x59, 0x5f, 0x55, 0x4e, 0x52, 0x45, 0x4c, 0x41, 0x54, 0x45, + 0x44, 0x10, 0x89, 0x04, 0x12, 0x1b, 
0x0a, 0x16, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, + 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x47, 0x52, 0x4f, 0x55, 0x50, 0x5f, 0x42, 0x59, 0x10, 0x8a, + 0x04, 0x12, 0x24, 0x0a, 0x1f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, + 0x53, 0x5f, 0x4c, 0x49, 0x4b, 0x45, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x5f, 0x43, 0x4c, + 0x41, 0x55, 0x53, 0x45, 0x10, 0x8b, 0x04, 0x12, 0x26, 0x0a, 0x21, 0x53, 0x51, 0x4c, 0x5f, 0x53, + 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x53, 0x5f, 0x4e, 0x4f, 0x4e, 0x5f, 0x4e, 0x55, 0x4c, 0x4c, + 0x41, 0x42, 0x4c, 0x45, 0x5f, 0x43, 0x4f, 0x4c, 0x55, 0x4d, 0x4e, 0x53, 0x10, 0x8c, 0x04, 0x12, + 0x1a, 0x0a, 0x15, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, + 0x5f, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x10, 0x8d, 0x04, 0x12, 0x1f, 0x0a, 0x1a, 0x53, + 0x51, 0x4c, 0x5f, 0x41, 0x4e, 0x53, 0x49, 0x39, 0x32, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, + 0x54, 0x45, 0x44, 0x5f, 0x4c, 0x45, 0x56, 0x45, 0x4c, 0x10, 0x8e, 0x04, 0x12, 0x30, 0x0a, 0x2b, + 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x53, 0x5f, 0x49, 0x4e, 0x54, + 0x45, 0x47, 0x52, 0x49, 0x54, 0x59, 0x5f, 0x45, 0x4e, 0x48, 0x41, 0x4e, 0x43, 0x45, 0x4d, 0x45, + 0x4e, 0x54, 0x5f, 0x46, 0x41, 0x43, 0x49, 0x4c, 0x49, 0x54, 0x59, 0x10, 0x8f, 0x04, 0x12, 0x22, + 0x0a, 0x1d, 0x53, 0x51, 0x4c, 0x5f, 0x4f, 0x55, 0x54, 0x45, 0x52, 0x5f, 0x4a, 0x4f, 0x49, 0x4e, + 0x53, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x5f, 0x4c, 0x45, 0x56, 0x45, 0x4c, 0x10, + 0x90, 0x04, 0x12, 0x14, 0x0a, 0x0f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x43, 0x48, 0x45, 0x4d, 0x41, + 0x5f, 0x54, 0x45, 0x52, 0x4d, 0x10, 0x91, 0x04, 0x12, 0x17, 0x0a, 0x12, 0x53, 0x51, 0x4c, 0x5f, + 0x50, 0x52, 0x4f, 0x43, 0x45, 0x44, 0x55, 0x52, 0x45, 0x5f, 0x54, 0x45, 0x52, 0x4d, 0x10, 0x92, + 0x04, 0x12, 0x15, 0x0a, 0x10, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x41, 0x54, 0x41, 0x4c, 0x4f, 0x47, + 0x5f, 0x54, 0x45, 0x52, 0x4d, 0x10, 0x93, 0x04, 0x12, 0x19, 0x0a, 0x14, 0x53, 0x51, 0x4c, 0x5f, + 0x43, 0x41, 0x54, 0x41, 0x4c, 0x4f, 0x47, 0x5f, 0x41, 0x54, 0x5f, 0x53, 0x54, 0x41, 0x52, 0x54, + 0x10, 0x94, 0x04, 0x12, 0x22, 0x0a, 0x1d, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x43, 0x48, 0x45, 0x4d, + 0x41, 0x53, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x41, 0x43, 0x54, + 0x49, 0x4f, 0x4e, 0x53, 0x10, 0x95, 0x04, 0x12, 0x23, 0x0a, 0x1e, 0x53, 0x51, 0x4c, 0x5f, 0x43, + 0x41, 0x54, 0x41, 0x4c, 0x4f, 0x47, 0x53, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, + 0x44, 0x5f, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x10, 0x96, 0x04, 0x12, 0x26, 0x0a, 0x21, + 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x50, 0x4f, + 0x53, 0x49, 0x54, 0x49, 0x4f, 0x4e, 0x45, 0x44, 0x5f, 0x43, 0x4f, 0x4d, 0x4d, 0x41, 0x4e, 0x44, + 0x53, 0x10, 0x97, 0x04, 0x12, 0x24, 0x0a, 0x1f, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x45, 0x4c, 0x45, + 0x43, 0x54, 0x5f, 0x46, 0x4f, 0x52, 0x5f, 0x55, 0x50, 0x44, 0x41, 0x54, 0x45, 0x5f, 0x53, 0x55, + 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x10, 0x98, 0x04, 0x12, 0x24, 0x0a, 0x1f, 0x53, 0x51, + 0x4c, 0x5f, 0x53, 0x54, 0x4f, 0x52, 0x45, 0x44, 0x5f, 0x50, 0x52, 0x4f, 0x43, 0x45, 0x44, 0x55, + 0x52, 0x45, 0x53, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x10, 0x99, 0x04, + 0x12, 0x1d, 0x0a, 0x18, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, + 0x44, 0x5f, 0x53, 0x55, 0x42, 0x51, 0x55, 0x45, 0x52, 0x49, 0x45, 0x53, 0x10, 0x9a, 0x04, 0x12, + 0x28, 0x0a, 0x23, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x52, 
0x52, 0x45, 0x4c, 0x41, 0x54, 0x45, + 0x44, 0x5f, 0x53, 0x55, 0x42, 0x51, 0x55, 0x45, 0x52, 0x49, 0x45, 0x53, 0x5f, 0x53, 0x55, 0x50, + 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x10, 0x9b, 0x04, 0x12, 0x19, 0x0a, 0x14, 0x53, 0x51, 0x4c, + 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x55, 0x4e, 0x49, 0x4f, 0x4e, + 0x53, 0x10, 0x9c, 0x04, 0x12, 0x22, 0x0a, 0x1d, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, + 0x42, 0x49, 0x4e, 0x41, 0x52, 0x59, 0x5f, 0x4c, 0x49, 0x54, 0x45, 0x52, 0x41, 0x4c, 0x5f, 0x4c, + 0x45, 0x4e, 0x47, 0x54, 0x48, 0x10, 0x9d, 0x04, 0x12, 0x20, 0x0a, 0x1b, 0x53, 0x51, 0x4c, 0x5f, + 0x4d, 0x41, 0x58, 0x5f, 0x43, 0x48, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54, 0x45, 0x52, 0x41, 0x4c, + 0x5f, 0x4c, 0x45, 0x4e, 0x47, 0x54, 0x48, 0x10, 0x9e, 0x04, 0x12, 0x1f, 0x0a, 0x1a, 0x53, 0x51, + 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x43, 0x4f, 0x4c, 0x55, 0x4d, 0x4e, 0x5f, 0x4e, 0x41, 0x4d, + 0x45, 0x5f, 0x4c, 0x45, 0x4e, 0x47, 0x54, 0x48, 0x10, 0x9f, 0x04, 0x12, 0x20, 0x0a, 0x1b, 0x53, + 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x43, 0x4f, 0x4c, 0x55, 0x4d, 0x4e, 0x53, 0x5f, 0x49, + 0x4e, 0x5f, 0x47, 0x52, 0x4f, 0x55, 0x50, 0x5f, 0x42, 0x59, 0x10, 0xa0, 0x04, 0x12, 0x1d, 0x0a, + 0x18, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x43, 0x4f, 0x4c, 0x55, 0x4d, 0x4e, 0x53, + 0x5f, 0x49, 0x4e, 0x5f, 0x49, 0x4e, 0x44, 0x45, 0x58, 0x10, 0xa1, 0x04, 0x12, 0x20, 0x0a, 0x1b, + 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x43, 0x4f, 0x4c, 0x55, 0x4d, 0x4e, 0x53, 0x5f, + 0x49, 0x4e, 0x5f, 0x4f, 0x52, 0x44, 0x45, 0x52, 0x5f, 0x42, 0x59, 0x10, 0xa2, 0x04, 0x12, 0x1e, + 0x0a, 0x19, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x43, 0x4f, 0x4c, 0x55, 0x4d, 0x4e, + 0x53, 0x5f, 0x49, 0x4e, 0x5f, 0x53, 0x45, 0x4c, 0x45, 0x43, 0x54, 0x10, 0xa3, 0x04, 0x12, 0x1d, + 0x0a, 0x18, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x43, 0x4f, 0x4c, 0x55, 0x4d, 0x4e, + 0x53, 0x5f, 0x49, 0x4e, 0x5f, 0x54, 0x41, 0x42, 0x4c, 0x45, 0x10, 0xa4, 0x04, 0x12, 0x18, 0x0a, + 0x13, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x43, 0x4f, 0x4e, 0x4e, 0x45, 0x43, 0x54, + 0x49, 0x4f, 0x4e, 0x53, 0x10, 0xa5, 0x04, 0x12, 0x1f, 0x0a, 0x1a, 0x53, 0x51, 0x4c, 0x5f, 0x4d, + 0x41, 0x58, 0x5f, 0x43, 0x55, 0x52, 0x53, 0x4f, 0x52, 0x5f, 0x4e, 0x41, 0x4d, 0x45, 0x5f, 0x4c, + 0x45, 0x4e, 0x47, 0x54, 0x48, 0x10, 0xa6, 0x04, 0x12, 0x19, 0x0a, 0x14, 0x53, 0x51, 0x4c, 0x5f, + 0x4d, 0x41, 0x58, 0x5f, 0x49, 0x4e, 0x44, 0x45, 0x58, 0x5f, 0x4c, 0x45, 0x4e, 0x47, 0x54, 0x48, + 0x10, 0xa7, 0x04, 0x12, 0x1e, 0x0a, 0x19, 0x53, 0x51, 0x4c, 0x5f, 0x44, 0x42, 0x5f, 0x53, 0x43, + 0x48, 0x45, 0x4d, 0x41, 0x5f, 0x4e, 0x41, 0x4d, 0x45, 0x5f, 0x4c, 0x45, 0x4e, 0x47, 0x54, 0x48, + 0x10, 0xa8, 0x04, 0x12, 0x22, 0x0a, 0x1d, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x50, + 0x52, 0x4f, 0x43, 0x45, 0x44, 0x55, 0x52, 0x45, 0x5f, 0x4e, 0x41, 0x4d, 0x45, 0x5f, 0x4c, 0x45, + 0x4e, 0x47, 0x54, 0x48, 0x10, 0xa9, 0x04, 0x12, 0x20, 0x0a, 0x1b, 0x53, 0x51, 0x4c, 0x5f, 0x4d, + 0x41, 0x58, 0x5f, 0x43, 0x41, 0x54, 0x41, 0x4c, 0x4f, 0x47, 0x5f, 0x4e, 0x41, 0x4d, 0x45, 0x5f, + 0x4c, 0x45, 0x4e, 0x47, 0x54, 0x48, 0x10, 0xaa, 0x04, 0x12, 0x15, 0x0a, 0x10, 0x53, 0x51, 0x4c, + 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x52, 0x4f, 0x57, 0x5f, 0x53, 0x49, 0x5a, 0x45, 0x10, 0xab, 0x04, + 0x12, 0x24, 0x0a, 0x1f, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x52, 0x4f, 0x57, 0x5f, + 0x53, 0x49, 0x5a, 0x45, 0x5f, 0x49, 0x4e, 0x43, 0x4c, 0x55, 0x44, 0x45, 0x53, 0x5f, 0x42, 0x4c, + 0x4f, 0x42, 0x53, 0x10, 0xac, 0x04, 0x12, 0x1d, 0x0a, 0x18, 0x53, 0x51, 0x4c, 0x5f, 
0x4d, 0x41, + 0x58, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x4d, 0x45, 0x4e, 0x54, 0x5f, 0x4c, 0x45, 0x4e, 0x47, + 0x54, 0x48, 0x10, 0xad, 0x04, 0x12, 0x17, 0x0a, 0x12, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, + 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x4d, 0x45, 0x4e, 0x54, 0x53, 0x10, 0xae, 0x04, 0x12, 0x1e, + 0x0a, 0x19, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x54, 0x41, 0x42, 0x4c, 0x45, 0x5f, + 0x4e, 0x41, 0x4d, 0x45, 0x5f, 0x4c, 0x45, 0x4e, 0x47, 0x54, 0x48, 0x10, 0xaf, 0x04, 0x12, 0x1d, + 0x0a, 0x18, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x54, 0x41, 0x42, 0x4c, 0x45, 0x53, + 0x5f, 0x49, 0x4e, 0x5f, 0x53, 0x45, 0x4c, 0x45, 0x43, 0x54, 0x10, 0xb0, 0x04, 0x12, 0x1c, 0x0a, + 0x17, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x41, 0x58, 0x5f, 0x55, 0x53, 0x45, 0x52, 0x4e, 0x41, 0x4d, + 0x45, 0x5f, 0x4c, 0x45, 0x4e, 0x47, 0x54, 0x48, 0x10, 0xb1, 0x04, 0x12, 0x26, 0x0a, 0x21, 0x53, + 0x51, 0x4c, 0x5f, 0x44, 0x45, 0x46, 0x41, 0x55, 0x4c, 0x54, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, + 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x49, 0x53, 0x4f, 0x4c, 0x41, 0x54, 0x49, 0x4f, 0x4e, + 0x10, 0xb2, 0x04, 0x12, 0x1f, 0x0a, 0x1a, 0x53, 0x51, 0x4c, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, + 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, + 0x44, 0x10, 0xb3, 0x04, 0x12, 0x30, 0x0a, 0x2b, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, + 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, + 0x4e, 0x53, 0x5f, 0x49, 0x53, 0x4f, 0x4c, 0x41, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x4c, 0x45, 0x56, + 0x45, 0x4c, 0x53, 0x10, 0xb4, 0x04, 0x12, 0x32, 0x0a, 0x2d, 0x53, 0x51, 0x4c, 0x5f, 0x44, 0x41, + 0x54, 0x41, 0x5f, 0x44, 0x45, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x43, 0x41, + 0x55, 0x53, 0x45, 0x53, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, + 0x5f, 0x43, 0x4f, 0x4d, 0x4d, 0x49, 0x54, 0x10, 0xb5, 0x04, 0x12, 0x31, 0x0a, 0x2c, 0x53, 0x51, + 0x4c, 0x5f, 0x44, 0x41, 0x54, 0x41, 0x5f, 0x44, 0x45, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x49, 0x4f, + 0x4e, 0x53, 0x5f, 0x49, 0x4e, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, + 0x4e, 0x53, 0x5f, 0x49, 0x47, 0x4e, 0x4f, 0x52, 0x45, 0x44, 0x10, 0xb6, 0x04, 0x12, 0x23, 0x0a, + 0x1e, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x52, + 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x53, 0x45, 0x54, 0x5f, 0x54, 0x59, 0x50, 0x45, 0x53, 0x10, + 0xb7, 0x04, 0x12, 0x3b, 0x0a, 0x36, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, + 0x54, 0x45, 0x44, 0x5f, 0x43, 0x4f, 0x4e, 0x43, 0x55, 0x52, 0x52, 0x45, 0x4e, 0x43, 0x49, 0x45, + 0x53, 0x5f, 0x46, 0x4f, 0x52, 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x53, 0x45, 0x54, + 0x5f, 0x55, 0x4e, 0x53, 0x50, 0x45, 0x43, 0x49, 0x46, 0x49, 0x45, 0x44, 0x10, 0xb8, 0x04, 0x12, + 0x3c, 0x0a, 0x37, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, + 0x5f, 0x43, 0x4f, 0x4e, 0x43, 0x55, 0x52, 0x52, 0x45, 0x4e, 0x43, 0x49, 0x45, 0x53, 0x5f, 0x46, + 0x4f, 0x52, 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x53, 0x45, 0x54, 0x5f, 0x46, 0x4f, + 0x52, 0x57, 0x41, 0x52, 0x44, 0x5f, 0x4f, 0x4e, 0x4c, 0x59, 0x10, 0xb9, 0x04, 0x12, 0x40, 0x0a, + 0x3b, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x43, 0x4f, 0x4e, 0x43, 0x55, 0x52, 0x52, 0x45, 0x4e, 0x43, 0x49, 0x45, 0x53, 0x5f, 0x46, 0x4f, 0x52, - 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x53, 0x45, 0x54, 0x5f, 0x55, 0x4e, 0x53, 0x50, - 0x45, 0x43, 
0x49, 0x46, 0x49, 0x45, 0x44, 0x10, 0xb8, 0x04, 0x12, 0x3c, 0x0a, 0x37, 0x53, 0x51, - 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x43, 0x4f, 0x4e, 0x43, - 0x55, 0x52, 0x52, 0x45, 0x4e, 0x43, 0x49, 0x45, 0x53, 0x5f, 0x46, 0x4f, 0x52, 0x5f, 0x52, 0x45, - 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x53, 0x45, 0x54, 0x5f, 0x46, 0x4f, 0x52, 0x57, 0x41, 0x52, 0x44, - 0x5f, 0x4f, 0x4e, 0x4c, 0x59, 0x10, 0xb9, 0x04, 0x12, 0x40, 0x0a, 0x3b, 0x53, 0x51, 0x4c, 0x5f, - 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x43, 0x4f, 0x4e, 0x43, 0x55, 0x52, - 0x52, 0x45, 0x4e, 0x43, 0x49, 0x45, 0x53, 0x5f, 0x46, 0x4f, 0x52, 0x5f, 0x52, 0x45, 0x53, 0x55, - 0x4c, 0x54, 0x5f, 0x53, 0x45, 0x54, 0x5f, 0x53, 0x43, 0x52, 0x4f, 0x4c, 0x4c, 0x5f, 0x53, 0x45, - 0x4e, 0x53, 0x49, 0x54, 0x49, 0x56, 0x45, 0x10, 0xba, 0x04, 0x12, 0x42, 0x0a, 0x3d, 0x53, 0x51, - 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x43, 0x4f, 0x4e, 0x43, - 0x55, 0x52, 0x52, 0x45, 0x4e, 0x43, 0x49, 0x45, 0x53, 0x5f, 0x46, 0x4f, 0x52, 0x5f, 0x52, 0x45, - 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x53, 0x45, 0x54, 0x5f, 0x53, 0x43, 0x52, 0x4f, 0x4c, 0x4c, 0x5f, - 0x49, 0x4e, 0x53, 0x45, 0x4e, 0x53, 0x49, 0x54, 0x49, 0x56, 0x45, 0x10, 0xbb, 0x04, 0x12, 0x20, - 0x0a, 0x1b, 0x53, 0x51, 0x4c, 0x5f, 0x42, 0x41, 0x54, 0x43, 0x48, 0x5f, 0x55, 0x50, 0x44, 0x41, - 0x54, 0x45, 0x53, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x10, 0xbc, 0x04, - 0x12, 0x1d, 0x0a, 0x18, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x41, 0x56, 0x45, 0x50, 0x4f, 0x49, 0x4e, - 0x54, 0x53, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x10, 0xbd, 0x04, 0x12, - 0x23, 0x0a, 0x1e, 0x53, 0x51, 0x4c, 0x5f, 0x4e, 0x41, 0x4d, 0x45, 0x44, 0x5f, 0x50, 0x41, 0x52, - 0x41, 0x4d, 0x45, 0x54, 0x45, 0x52, 0x53, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, - 0x44, 0x10, 0xbe, 0x04, 0x12, 0x1d, 0x0a, 0x18, 0x53, 0x51, 0x4c, 0x5f, 0x4c, 0x4f, 0x43, 0x41, - 0x54, 0x4f, 0x52, 0x53, 0x5f, 0x55, 0x50, 0x44, 0x41, 0x54, 0x45, 0x5f, 0x43, 0x4f, 0x50, 0x59, - 0x10, 0xbf, 0x04, 0x12, 0x35, 0x0a, 0x30, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x54, 0x4f, 0x52, 0x45, - 0x44, 0x5f, 0x46, 0x55, 0x4e, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x5f, 0x55, 0x53, 0x49, 0x4e, - 0x47, 0x5f, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x53, 0x59, 0x4e, 0x54, 0x41, 0x58, 0x5f, 0x53, 0x55, - 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x10, 0xc0, 0x04, 0x2a, 0x91, 0x01, 0x0a, 0x17, 0x53, - 0x71, 0x6c, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x54, 0x72, 0x61, 0x6e, 0x73, - 0x61, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x12, 0x22, 0x0a, 0x1e, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, - 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, - 0x49, 0x4f, 0x4e, 0x5f, 0x4e, 0x4f, 0x4e, 0x45, 0x10, 0x00, 0x12, 0x29, 0x0a, 0x25, 0x53, 0x51, + 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x53, 0x45, 0x54, 0x5f, 0x53, 0x43, 0x52, 0x4f, + 0x4c, 0x4c, 0x5f, 0x53, 0x45, 0x4e, 0x53, 0x49, 0x54, 0x49, 0x56, 0x45, 0x10, 0xba, 0x04, 0x12, + 0x42, 0x0a, 0x3d, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, + 0x5f, 0x43, 0x4f, 0x4e, 0x43, 0x55, 0x52, 0x52, 0x45, 0x4e, 0x43, 0x49, 0x45, 0x53, 0x5f, 0x46, + 0x4f, 0x52, 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x53, 0x45, 0x54, 0x5f, 0x53, 0x43, + 0x52, 0x4f, 0x4c, 0x4c, 0x5f, 0x49, 0x4e, 0x53, 0x45, 0x4e, 0x53, 0x49, 0x54, 0x49, 0x56, 0x45, + 0x10, 0xbb, 0x04, 0x12, 0x20, 0x0a, 0x1b, 0x53, 0x51, 0x4c, 0x5f, 0x42, 0x41, 0x54, 0x43, 0x48, + 0x5f, 0x55, 0x50, 0x44, 0x41, 0x54, 
0x45, 0x53, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, + 0x45, 0x44, 0x10, 0xbc, 0x04, 0x12, 0x1d, 0x0a, 0x18, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x41, 0x56, + 0x45, 0x50, 0x4f, 0x49, 0x4e, 0x54, 0x53, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, + 0x44, 0x10, 0xbd, 0x04, 0x12, 0x23, 0x0a, 0x1e, 0x53, 0x51, 0x4c, 0x5f, 0x4e, 0x41, 0x4d, 0x45, + 0x44, 0x5f, 0x50, 0x41, 0x52, 0x41, 0x4d, 0x45, 0x54, 0x45, 0x52, 0x53, 0x5f, 0x53, 0x55, 0x50, + 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x10, 0xbe, 0x04, 0x12, 0x1d, 0x0a, 0x18, 0x53, 0x51, 0x4c, + 0x5f, 0x4c, 0x4f, 0x43, 0x41, 0x54, 0x4f, 0x52, 0x53, 0x5f, 0x55, 0x50, 0x44, 0x41, 0x54, 0x45, + 0x5f, 0x43, 0x4f, 0x50, 0x59, 0x10, 0xbf, 0x04, 0x12, 0x35, 0x0a, 0x30, 0x53, 0x51, 0x4c, 0x5f, + 0x53, 0x54, 0x4f, 0x52, 0x45, 0x44, 0x5f, 0x46, 0x55, 0x4e, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x53, + 0x5f, 0x55, 0x53, 0x49, 0x4e, 0x47, 0x5f, 0x43, 0x41, 0x4c, 0x4c, 0x5f, 0x53, 0x59, 0x4e, 0x54, + 0x41, 0x58, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x10, 0xc0, 0x04, 0x2a, + 0x91, 0x01, 0x0a, 0x17, 0x53, 0x71, 0x6c, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, + 0x54, 0x72, 0x61, 0x6e, 0x73, 0x61, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x12, 0x22, 0x0a, 0x1e, 0x53, + 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x54, 0x52, 0x41, + 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x4e, 0x4f, 0x4e, 0x45, 0x10, 0x00, 0x12, + 0x29, 0x0a, 0x25, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, + 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x54, 0x52, 0x41, + 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x10, 0x01, 0x12, 0x27, 0x0a, 0x23, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x54, 0x52, 0x41, 0x4e, - 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, - 0x49, 0x4f, 0x4e, 0x10, 0x01, 0x12, 0x27, 0x0a, 0x23, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x50, - 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, - 0x4f, 0x4e, 0x5f, 0x53, 0x41, 0x56, 0x45, 0x50, 0x4f, 0x49, 0x4e, 0x54, 0x10, 0x02, 0x2a, 0xb2, - 0x01, 0x0a, 0x1b, 0x53, 0x71, 0x6c, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x43, - 0x61, 0x73, 0x65, 0x53, 0x65, 0x6e, 0x73, 0x69, 0x74, 0x69, 0x76, 0x69, 0x74, 0x79, 0x12, 0x20, - 0x0a, 0x1c, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x41, 0x53, 0x45, 0x5f, 0x53, 0x45, 0x4e, 0x53, 0x49, - 0x54, 0x49, 0x56, 0x49, 0x54, 0x59, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x00, - 0x12, 0x29, 0x0a, 0x25, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x41, 0x53, 0x45, 0x5f, 0x53, 0x45, 0x4e, - 0x53, 0x49, 0x54, 0x49, 0x56, 0x49, 0x54, 0x59, 0x5f, 0x43, 0x41, 0x53, 0x45, 0x5f, 0x49, 0x4e, - 0x53, 0x45, 0x4e, 0x53, 0x49, 0x54, 0x49, 0x56, 0x45, 0x10, 0x01, 0x12, 0x22, 0x0a, 0x1e, 0x53, - 0x51, 0x4c, 0x5f, 0x43, 0x41, 0x53, 0x45, 0x5f, 0x53, 0x45, 0x4e, 0x53, 0x49, 0x54, 0x49, 0x56, - 0x49, 0x54, 0x59, 0x5f, 0x55, 0x50, 0x50, 0x45, 0x52, 0x43, 0x41, 0x53, 0x45, 0x10, 0x02, 0x12, - 0x22, 0x0a, 0x1e, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x41, 0x53, 0x45, 0x5f, 0x53, 0x45, 0x4e, 0x53, - 0x49, 0x54, 0x49, 0x56, 0x49, 0x54, 0x59, 0x5f, 0x4c, 0x4f, 0x57, 0x45, 0x52, 0x43, 0x41, 0x53, - 0x45, 0x10, 0x03, 0x2a, 0x82, 0x01, 0x0a, 0x0f, 0x53, 0x71, 0x6c, 0x4e, 0x75, 0x6c, 0x6c, 0x4f, - 0x72, 0x64, 0x65, 0x72, 0x69, 0x6e, 0x67, 0x12, 0x19, 0x0a, 0x15, 0x53, 0x51, 0x4c, 0x5f, 0x4e, - 0x55, 0x4c, 0x4c, 0x53, 0x5f, 0x53, 0x4f, 0x52, 0x54, 0x45, 
0x44, 0x5f, 0x48, 0x49, 0x47, 0x48, - 0x10, 0x00, 0x12, 0x18, 0x0a, 0x14, 0x53, 0x51, 0x4c, 0x5f, 0x4e, 0x55, 0x4c, 0x4c, 0x53, 0x5f, - 0x53, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x4c, 0x4f, 0x57, 0x10, 0x01, 0x12, 0x1d, 0x0a, 0x19, + 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x53, 0x41, 0x56, 0x45, 0x50, 0x4f, 0x49, 0x4e, + 0x54, 0x10, 0x02, 0x2a, 0xb2, 0x01, 0x0a, 0x1b, 0x53, 0x71, 0x6c, 0x53, 0x75, 0x70, 0x70, 0x6f, + 0x72, 0x74, 0x65, 0x64, 0x43, 0x61, 0x73, 0x65, 0x53, 0x65, 0x6e, 0x73, 0x69, 0x74, 0x69, 0x76, + 0x69, 0x74, 0x79, 0x12, 0x20, 0x0a, 0x1c, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x41, 0x53, 0x45, 0x5f, + 0x53, 0x45, 0x4e, 0x53, 0x49, 0x54, 0x49, 0x56, 0x49, 0x54, 0x59, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, + 0x4f, 0x57, 0x4e, 0x10, 0x00, 0x12, 0x29, 0x0a, 0x25, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x41, 0x53, + 0x45, 0x5f, 0x53, 0x45, 0x4e, 0x53, 0x49, 0x54, 0x49, 0x56, 0x49, 0x54, 0x59, 0x5f, 0x43, 0x41, + 0x53, 0x45, 0x5f, 0x49, 0x4e, 0x53, 0x45, 0x4e, 0x53, 0x49, 0x54, 0x49, 0x56, 0x45, 0x10, 0x01, + 0x12, 0x22, 0x0a, 0x1e, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x41, 0x53, 0x45, 0x5f, 0x53, 0x45, 0x4e, + 0x53, 0x49, 0x54, 0x49, 0x56, 0x49, 0x54, 0x59, 0x5f, 0x55, 0x50, 0x50, 0x45, 0x52, 0x43, 0x41, + 0x53, 0x45, 0x10, 0x02, 0x12, 0x22, 0x0a, 0x1e, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x41, 0x53, 0x45, + 0x5f, 0x53, 0x45, 0x4e, 0x53, 0x49, 0x54, 0x49, 0x56, 0x49, 0x54, 0x59, 0x5f, 0x4c, 0x4f, 0x57, + 0x45, 0x52, 0x43, 0x41, 0x53, 0x45, 0x10, 0x03, 0x2a, 0x82, 0x01, 0x0a, 0x0f, 0x53, 0x71, 0x6c, + 0x4e, 0x75, 0x6c, 0x6c, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x69, 0x6e, 0x67, 0x12, 0x19, 0x0a, 0x15, 0x53, 0x51, 0x4c, 0x5f, 0x4e, 0x55, 0x4c, 0x4c, 0x53, 0x5f, 0x53, 0x4f, 0x52, 0x54, 0x45, 0x44, - 0x5f, 0x41, 0x54, 0x5f, 0x53, 0x54, 0x41, 0x52, 0x54, 0x10, 0x02, 0x12, 0x1b, 0x0a, 0x17, 0x53, - 0x51, 0x4c, 0x5f, 0x4e, 0x55, 0x4c, 0x4c, 0x53, 0x5f, 0x53, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, - 0x41, 0x54, 0x5f, 0x45, 0x4e, 0x44, 0x10, 0x03, 0x2a, 0x5e, 0x0a, 0x13, 0x53, 0x75, 0x70, 0x70, - 0x6f, 0x72, 0x74, 0x65, 0x64, 0x53, 0x71, 0x6c, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x12, - 0x17, 0x0a, 0x13, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x49, 0x4e, 0x49, 0x4d, 0x55, 0x4d, 0x5f, 0x47, - 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x10, 0x00, 0x12, 0x14, 0x0a, 0x10, 0x53, 0x51, 0x4c, 0x5f, - 0x43, 0x4f, 0x52, 0x45, 0x5f, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x10, 0x01, 0x12, 0x18, - 0x0a, 0x14, 0x53, 0x51, 0x4c, 0x5f, 0x45, 0x58, 0x54, 0x45, 0x4e, 0x44, 0x45, 0x44, 0x5f, 0x47, - 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x10, 0x02, 0x2a, 0x68, 0x0a, 0x1e, 0x53, 0x75, 0x70, 0x70, - 0x6f, 0x72, 0x74, 0x65, 0x64, 0x41, 0x6e, 0x73, 0x69, 0x39, 0x32, 0x53, 0x71, 0x6c, 0x47, 0x72, - 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x4c, 0x65, 0x76, 0x65, 0x6c, 0x12, 0x14, 0x0a, 0x10, 0x41, 0x4e, - 0x53, 0x49, 0x39, 0x32, 0x5f, 0x45, 0x4e, 0x54, 0x52, 0x59, 0x5f, 0x53, 0x51, 0x4c, 0x10, 0x00, - 0x12, 0x1b, 0x0a, 0x17, 0x41, 0x4e, 0x53, 0x49, 0x39, 0x32, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, - 0x4d, 0x45, 0x44, 0x49, 0x41, 0x54, 0x45, 0x5f, 0x53, 0x51, 0x4c, 0x10, 0x01, 0x12, 0x13, 0x0a, - 0x0f, 0x41, 0x4e, 0x53, 0x49, 0x39, 0x32, 0x5f, 0x46, 0x55, 0x4c, 0x4c, 0x5f, 0x53, 0x51, 0x4c, - 0x10, 0x02, 0x2a, 0x6d, 0x0a, 0x19, 0x53, 0x71, 0x6c, 0x4f, 0x75, 0x74, 0x65, 0x72, 0x4a, 0x6f, - 0x69, 0x6e, 0x73, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x4c, 0x65, 0x76, 0x65, 0x6c, 0x12, - 0x19, 0x0a, 0x15, 0x53, 0x51, 0x4c, 0x5f, 0x4a, 0x4f, 0x49, 0x4e, 0x53, 0x5f, 0x55, 0x4e, 0x53, - 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x10, 0x00, 0x12, 0x1b, 0x0a, 0x17, 
0x53, 0x51, - 0x4c, 0x5f, 0x4c, 0x49, 0x4d, 0x49, 0x54, 0x45, 0x44, 0x5f, 0x4f, 0x55, 0x54, 0x45, 0x52, 0x5f, - 0x4a, 0x4f, 0x49, 0x4e, 0x53, 0x10, 0x01, 0x12, 0x18, 0x0a, 0x14, 0x53, 0x51, 0x4c, 0x5f, 0x46, - 0x55, 0x4c, 0x4c, 0x5f, 0x4f, 0x55, 0x54, 0x45, 0x52, 0x5f, 0x4a, 0x4f, 0x49, 0x4e, 0x53, 0x10, - 0x02, 0x2a, 0x51, 0x0a, 0x13, 0x53, 0x71, 0x6c, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x65, - 0x64, 0x47, 0x72, 0x6f, 0x75, 0x70, 0x42, 0x79, 0x12, 0x1a, 0x0a, 0x16, 0x53, 0x51, 0x4c, 0x5f, - 0x47, 0x52, 0x4f, 0x55, 0x50, 0x5f, 0x42, 0x59, 0x5f, 0x55, 0x4e, 0x52, 0x45, 0x4c, 0x41, 0x54, - 0x45, 0x44, 0x10, 0x00, 0x12, 0x1e, 0x0a, 0x1a, 0x53, 0x51, 0x4c, 0x5f, 0x47, 0x52, 0x4f, 0x55, - 0x50, 0x5f, 0x42, 0x59, 0x5f, 0x42, 0x45, 0x59, 0x4f, 0x4e, 0x44, 0x5f, 0x53, 0x45, 0x4c, 0x45, - 0x43, 0x54, 0x10, 0x01, 0x2a, 0x90, 0x01, 0x0a, 0x1a, 0x53, 0x71, 0x6c, 0x53, 0x75, 0x70, 0x70, - 0x6f, 0x72, 0x74, 0x65, 0x64, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x41, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x73, 0x12, 0x22, 0x0a, 0x1e, 0x53, 0x51, 0x4c, 0x5f, 0x45, 0x4c, 0x45, 0x4d, 0x45, - 0x4e, 0x54, 0x5f, 0x49, 0x4e, 0x5f, 0x50, 0x52, 0x4f, 0x43, 0x45, 0x44, 0x55, 0x52, 0x45, 0x5f, - 0x43, 0x41, 0x4c, 0x4c, 0x53, 0x10, 0x00, 0x12, 0x24, 0x0a, 0x20, 0x53, 0x51, 0x4c, 0x5f, 0x45, - 0x4c, 0x45, 0x4d, 0x45, 0x4e, 0x54, 0x5f, 0x49, 0x4e, 0x5f, 0x49, 0x4e, 0x44, 0x45, 0x58, 0x5f, - 0x44, 0x45, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x10, 0x01, 0x12, 0x28, 0x0a, - 0x24, 0x53, 0x51, 0x4c, 0x5f, 0x45, 0x4c, 0x45, 0x4d, 0x45, 0x4e, 0x54, 0x5f, 0x49, 0x4e, 0x5f, - 0x50, 0x52, 0x49, 0x56, 0x49, 0x4c, 0x45, 0x47, 0x45, 0x5f, 0x44, 0x45, 0x46, 0x49, 0x4e, 0x49, - 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x10, 0x02, 0x2a, 0x56, 0x0a, 0x1e, 0x53, 0x71, 0x6c, 0x53, 0x75, - 0x70, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x50, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x65, - 0x64, 0x43, 0x6f, 0x6d, 0x6d, 0x61, 0x6e, 0x64, 0x73, 0x12, 0x19, 0x0a, 0x15, 0x53, 0x51, 0x4c, - 0x5f, 0x50, 0x4f, 0x53, 0x49, 0x54, 0x49, 0x4f, 0x4e, 0x45, 0x44, 0x5f, 0x44, 0x45, 0x4c, 0x45, - 0x54, 0x45, 0x10, 0x00, 0x12, 0x19, 0x0a, 0x15, 0x53, 0x51, 0x4c, 0x5f, 0x50, 0x4f, 0x53, 0x49, - 0x54, 0x49, 0x4f, 0x4e, 0x45, 0x44, 0x5f, 0x55, 0x50, 0x44, 0x41, 0x54, 0x45, 0x10, 0x01, 0x2a, - 0x97, 0x01, 0x0a, 0x16, 0x53, 0x71, 0x6c, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, - 0x53, 0x75, 0x62, 0x71, 0x75, 0x65, 0x72, 0x69, 0x65, 0x73, 0x12, 0x21, 0x0a, 0x1d, 0x53, 0x51, - 0x4c, 0x5f, 0x53, 0x55, 0x42, 0x51, 0x55, 0x45, 0x52, 0x49, 0x45, 0x53, 0x5f, 0x49, 0x4e, 0x5f, - 0x43, 0x4f, 0x4d, 0x50, 0x41, 0x52, 0x49, 0x53, 0x4f, 0x4e, 0x53, 0x10, 0x00, 0x12, 0x1c, 0x0a, - 0x18, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x42, 0x51, 0x55, 0x45, 0x52, 0x49, 0x45, 0x53, 0x5f, - 0x49, 0x4e, 0x5f, 0x45, 0x58, 0x49, 0x53, 0x54, 0x53, 0x10, 0x01, 0x12, 0x19, 0x0a, 0x15, 0x53, + 0x5f, 0x48, 0x49, 0x47, 0x48, 0x10, 0x00, 0x12, 0x18, 0x0a, 0x14, 0x53, 0x51, 0x4c, 0x5f, 0x4e, + 0x55, 0x4c, 0x4c, 0x53, 0x5f, 0x53, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x4c, 0x4f, 0x57, 0x10, + 0x01, 0x12, 0x1d, 0x0a, 0x19, 0x53, 0x51, 0x4c, 0x5f, 0x4e, 0x55, 0x4c, 0x4c, 0x53, 0x5f, 0x53, + 0x4f, 0x52, 0x54, 0x45, 0x44, 0x5f, 0x41, 0x54, 0x5f, 0x53, 0x54, 0x41, 0x52, 0x54, 0x10, 0x02, + 0x12, 0x1b, 0x0a, 0x17, 0x53, 0x51, 0x4c, 0x5f, 0x4e, 0x55, 0x4c, 0x4c, 0x53, 0x5f, 0x53, 0x4f, + 0x52, 0x54, 0x45, 0x44, 0x5f, 0x41, 0x54, 0x5f, 0x45, 0x4e, 0x44, 0x10, 0x03, 0x2a, 0x5e, 0x0a, + 0x13, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x53, 0x71, 0x6c, 0x47, 0x72, 0x61, + 0x6d, 0x6d, 
0x61, 0x72, 0x12, 0x17, 0x0a, 0x13, 0x53, 0x51, 0x4c, 0x5f, 0x4d, 0x49, 0x4e, 0x49, + 0x4d, 0x55, 0x4d, 0x5f, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x10, 0x00, 0x12, 0x14, 0x0a, + 0x10, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x52, 0x45, 0x5f, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, + 0x52, 0x10, 0x01, 0x12, 0x18, 0x0a, 0x14, 0x53, 0x51, 0x4c, 0x5f, 0x45, 0x58, 0x54, 0x45, 0x4e, + 0x44, 0x45, 0x44, 0x5f, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x10, 0x02, 0x2a, 0x68, 0x0a, + 0x1e, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x41, 0x6e, 0x73, 0x69, 0x39, 0x32, + 0x53, 0x71, 0x6c, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x4c, 0x65, 0x76, 0x65, 0x6c, 0x12, + 0x14, 0x0a, 0x10, 0x41, 0x4e, 0x53, 0x49, 0x39, 0x32, 0x5f, 0x45, 0x4e, 0x54, 0x52, 0x59, 0x5f, + 0x53, 0x51, 0x4c, 0x10, 0x00, 0x12, 0x1b, 0x0a, 0x17, 0x41, 0x4e, 0x53, 0x49, 0x39, 0x32, 0x5f, + 0x49, 0x4e, 0x54, 0x45, 0x52, 0x4d, 0x45, 0x44, 0x49, 0x41, 0x54, 0x45, 0x5f, 0x53, 0x51, 0x4c, + 0x10, 0x01, 0x12, 0x13, 0x0a, 0x0f, 0x41, 0x4e, 0x53, 0x49, 0x39, 0x32, 0x5f, 0x46, 0x55, 0x4c, + 0x4c, 0x5f, 0x53, 0x51, 0x4c, 0x10, 0x02, 0x2a, 0x6d, 0x0a, 0x19, 0x53, 0x71, 0x6c, 0x4f, 0x75, + 0x74, 0x65, 0x72, 0x4a, 0x6f, 0x69, 0x6e, 0x73, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x4c, + 0x65, 0x76, 0x65, 0x6c, 0x12, 0x19, 0x0a, 0x15, 0x53, 0x51, 0x4c, 0x5f, 0x4a, 0x4f, 0x49, 0x4e, + 0x53, 0x5f, 0x55, 0x4e, 0x53, 0x55, 0x50, 0x50, 0x4f, 0x52, 0x54, 0x45, 0x44, 0x10, 0x00, 0x12, + 0x1b, 0x0a, 0x17, 0x53, 0x51, 0x4c, 0x5f, 0x4c, 0x49, 0x4d, 0x49, 0x54, 0x45, 0x44, 0x5f, 0x4f, + 0x55, 0x54, 0x45, 0x52, 0x5f, 0x4a, 0x4f, 0x49, 0x4e, 0x53, 0x10, 0x01, 0x12, 0x18, 0x0a, 0x14, + 0x53, 0x51, 0x4c, 0x5f, 0x46, 0x55, 0x4c, 0x4c, 0x5f, 0x4f, 0x55, 0x54, 0x45, 0x52, 0x5f, 0x4a, + 0x4f, 0x49, 0x4e, 0x53, 0x10, 0x02, 0x2a, 0x51, 0x0a, 0x13, 0x53, 0x71, 0x6c, 0x53, 0x75, 0x70, + 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x47, 0x72, 0x6f, 0x75, 0x70, 0x42, 0x79, 0x12, 0x1a, 0x0a, + 0x16, 0x53, 0x51, 0x4c, 0x5f, 0x47, 0x52, 0x4f, 0x55, 0x50, 0x5f, 0x42, 0x59, 0x5f, 0x55, 0x4e, + 0x52, 0x45, 0x4c, 0x41, 0x54, 0x45, 0x44, 0x10, 0x00, 0x12, 0x1e, 0x0a, 0x1a, 0x53, 0x51, 0x4c, + 0x5f, 0x47, 0x52, 0x4f, 0x55, 0x50, 0x5f, 0x42, 0x59, 0x5f, 0x42, 0x45, 0x59, 0x4f, 0x4e, 0x44, + 0x5f, 0x53, 0x45, 0x4c, 0x45, 0x43, 0x54, 0x10, 0x01, 0x2a, 0x90, 0x01, 0x0a, 0x1a, 0x53, 0x71, + 0x6c, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, + 0x74, 0x41, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x22, 0x0a, 0x1e, 0x53, 0x51, 0x4c, 0x5f, + 0x45, 0x4c, 0x45, 0x4d, 0x45, 0x4e, 0x54, 0x5f, 0x49, 0x4e, 0x5f, 0x50, 0x52, 0x4f, 0x43, 0x45, + 0x44, 0x55, 0x52, 0x45, 0x5f, 0x43, 0x41, 0x4c, 0x4c, 0x53, 0x10, 0x00, 0x12, 0x24, 0x0a, 0x20, + 0x53, 0x51, 0x4c, 0x5f, 0x45, 0x4c, 0x45, 0x4d, 0x45, 0x4e, 0x54, 0x5f, 0x49, 0x4e, 0x5f, 0x49, + 0x4e, 0x44, 0x45, 0x58, 0x5f, 0x44, 0x45, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x49, 0x4f, 0x4e, 0x53, + 0x10, 0x01, 0x12, 0x28, 0x0a, 0x24, 0x53, 0x51, 0x4c, 0x5f, 0x45, 0x4c, 0x45, 0x4d, 0x45, 0x4e, + 0x54, 0x5f, 0x49, 0x4e, 0x5f, 0x50, 0x52, 0x49, 0x56, 0x49, 0x4c, 0x45, 0x47, 0x45, 0x5f, 0x44, + 0x45, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x10, 0x02, 0x2a, 0x56, 0x0a, 0x1e, + 0x53, 0x71, 0x6c, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x50, 0x6f, 0x73, 0x69, + 0x74, 0x69, 0x6f, 0x6e, 0x65, 0x64, 0x43, 0x6f, 0x6d, 0x6d, 0x61, 0x6e, 0x64, 0x73, 0x12, 0x19, + 0x0a, 0x15, 0x53, 0x51, 0x4c, 0x5f, 0x50, 0x4f, 0x53, 0x49, 0x54, 0x49, 0x4f, 0x4e, 0x45, 0x44, + 0x5f, 0x44, 0x45, 0x4c, 0x45, 0x54, 
0x45, 0x10, 0x00, 0x12, 0x19, 0x0a, 0x15, 0x53, 0x51, 0x4c, + 0x5f, 0x50, 0x4f, 0x53, 0x49, 0x54, 0x49, 0x4f, 0x4e, 0x45, 0x44, 0x5f, 0x55, 0x50, 0x44, 0x41, + 0x54, 0x45, 0x10, 0x01, 0x2a, 0x97, 0x01, 0x0a, 0x16, 0x53, 0x71, 0x6c, 0x53, 0x75, 0x70, 0x70, + 0x6f, 0x72, 0x74, 0x65, 0x64, 0x53, 0x75, 0x62, 0x71, 0x75, 0x65, 0x72, 0x69, 0x65, 0x73, 0x12, + 0x21, 0x0a, 0x1d, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x42, 0x51, 0x55, 0x45, 0x52, 0x49, 0x45, + 0x53, 0x5f, 0x49, 0x4e, 0x5f, 0x43, 0x4f, 0x4d, 0x50, 0x41, 0x52, 0x49, 0x53, 0x4f, 0x4e, 0x53, + 0x10, 0x00, 0x12, 0x1c, 0x0a, 0x18, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x42, 0x51, 0x55, 0x45, + 0x52, 0x49, 0x45, 0x53, 0x5f, 0x49, 0x4e, 0x5f, 0x45, 0x58, 0x49, 0x53, 0x54, 0x53, 0x10, 0x01, + 0x12, 0x19, 0x0a, 0x15, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x42, 0x51, 0x55, 0x45, 0x52, 0x49, + 0x45, 0x53, 0x5f, 0x49, 0x4e, 0x5f, 0x49, 0x4e, 0x53, 0x10, 0x02, 0x12, 0x21, 0x0a, 0x1d, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, 0x42, 0x51, 0x55, 0x45, 0x52, 0x49, 0x45, 0x53, 0x5f, 0x49, 0x4e, - 0x5f, 0x49, 0x4e, 0x53, 0x10, 0x02, 0x12, 0x21, 0x0a, 0x1d, 0x53, 0x51, 0x4c, 0x5f, 0x53, 0x55, - 0x42, 0x51, 0x55, 0x45, 0x52, 0x49, 0x45, 0x53, 0x5f, 0x49, 0x4e, 0x5f, 0x51, 0x55, 0x41, 0x4e, - 0x54, 0x49, 0x46, 0x49, 0x45, 0x44, 0x53, 0x10, 0x03, 0x2a, 0x36, 0x0a, 0x12, 0x53, 0x71, 0x6c, - 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x55, 0x6e, 0x69, 0x6f, 0x6e, 0x73, 0x12, - 0x0d, 0x0a, 0x09, 0x53, 0x51, 0x4c, 0x5f, 0x55, 0x4e, 0x49, 0x4f, 0x4e, 0x10, 0x00, 0x12, 0x11, - 0x0a, 0x0d, 0x53, 0x51, 0x4c, 0x5f, 0x55, 0x4e, 0x49, 0x4f, 0x4e, 0x5f, 0x41, 0x4c, 0x4c, 0x10, - 0x01, 0x2a, 0xc9, 0x01, 0x0a, 0x1c, 0x53, 0x71, 0x6c, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x61, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x73, 0x6f, 0x6c, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x4c, 0x65, 0x76, - 0x65, 0x6c, 0x12, 0x18, 0x0a, 0x14, 0x53, 0x51, 0x4c, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, - 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x4e, 0x4f, 0x4e, 0x45, 0x10, 0x00, 0x12, 0x24, 0x0a, 0x20, - 0x53, 0x51, 0x4c, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x5f, - 0x52, 0x45, 0x41, 0x44, 0x5f, 0x55, 0x4e, 0x43, 0x4f, 0x4d, 0x4d, 0x49, 0x54, 0x54, 0x45, 0x44, - 0x10, 0x01, 0x12, 0x22, 0x0a, 0x1e, 0x53, 0x51, 0x4c, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, - 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x52, 0x45, 0x41, 0x44, 0x5f, 0x43, 0x4f, 0x4d, 0x4d, 0x49, - 0x54, 0x54, 0x45, 0x44, 0x10, 0x02, 0x12, 0x23, 0x0a, 0x1f, 0x53, 0x51, 0x4c, 0x5f, 0x54, 0x52, - 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x52, 0x45, 0x50, 0x45, 0x41, 0x54, - 0x41, 0x42, 0x4c, 0x45, 0x5f, 0x52, 0x45, 0x41, 0x44, 0x10, 0x03, 0x12, 0x20, 0x0a, 0x1c, 0x53, - 0x51, 0x4c, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x53, - 0x45, 0x52, 0x49, 0x41, 0x4c, 0x49, 0x5a, 0x41, 0x42, 0x4c, 0x45, 0x10, 0x04, 0x2a, 0x89, 0x01, - 0x0a, 0x18, 0x53, 0x71, 0x6c, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x54, 0x72, - 0x61, 0x6e, 0x73, 0x61, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x1f, 0x0a, 0x1b, 0x53, 0x51, - 0x4c, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x55, 0x4e, - 0x53, 0x50, 0x45, 0x43, 0x49, 0x46, 0x49, 0x45, 0x44, 0x10, 0x00, 0x12, 0x24, 0x0a, 0x20, 0x53, - 0x51, 0x4c, 0x5f, 0x44, 0x41, 0x54, 0x41, 0x5f, 0x44, 0x45, 0x46, 0x49, 0x4e, 0x49, 0x54, 0x49, - 0x4f, 0x4e, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x10, - 0x01, 0x12, 0x26, 0x0a, 0x22, 0x53, 0x51, 0x4c, 0x5f, 0x44, 
0x41, 0x54, 0x41, 0x5f, 0x4d, 0x41, - 0x4e, 0x49, 0x50, 0x55, 0x4c, 0x41, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, - 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x10, 0x02, 0x2a, 0xbc, 0x01, 0x0a, 0x19, 0x53, 0x71, - 0x6c, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, - 0x53, 0x65, 0x74, 0x54, 0x79, 0x70, 0x65, 0x12, 0x23, 0x0a, 0x1f, 0x53, 0x51, 0x4c, 0x5f, 0x52, - 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x53, 0x45, 0x54, 0x5f, 0x54, 0x59, 0x50, 0x45, 0x5f, 0x55, - 0x4e, 0x53, 0x50, 0x45, 0x43, 0x49, 0x46, 0x49, 0x45, 0x44, 0x10, 0x00, 0x12, 0x24, 0x0a, 0x20, + 0x5f, 0x51, 0x55, 0x41, 0x4e, 0x54, 0x49, 0x46, 0x49, 0x45, 0x44, 0x53, 0x10, 0x03, 0x2a, 0x36, + 0x0a, 0x12, 0x53, 0x71, 0x6c, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x55, 0x6e, + 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x0d, 0x0a, 0x09, 0x53, 0x51, 0x4c, 0x5f, 0x55, 0x4e, 0x49, 0x4f, + 0x4e, 0x10, 0x00, 0x12, 0x11, 0x0a, 0x0d, 0x53, 0x51, 0x4c, 0x5f, 0x55, 0x4e, 0x49, 0x4f, 0x4e, + 0x5f, 0x41, 0x4c, 0x4c, 0x10, 0x01, 0x2a, 0xc9, 0x01, 0x0a, 0x1c, 0x53, 0x71, 0x6c, 0x54, 0x72, + 0x61, 0x6e, 0x73, 0x61, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x73, 0x6f, 0x6c, 0x61, 0x74, 0x69, + 0x6f, 0x6e, 0x4c, 0x65, 0x76, 0x65, 0x6c, 0x12, 0x18, 0x0a, 0x14, 0x53, 0x51, 0x4c, 0x5f, 0x54, + 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x4e, 0x4f, 0x4e, 0x45, 0x10, + 0x00, 0x12, 0x24, 0x0a, 0x20, 0x53, 0x51, 0x4c, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, + 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x52, 0x45, 0x41, 0x44, 0x5f, 0x55, 0x4e, 0x43, 0x4f, 0x4d, 0x4d, + 0x49, 0x54, 0x54, 0x45, 0x44, 0x10, 0x01, 0x12, 0x22, 0x0a, 0x1e, 0x53, 0x51, 0x4c, 0x5f, 0x54, + 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x52, 0x45, 0x41, 0x44, 0x5f, + 0x43, 0x4f, 0x4d, 0x4d, 0x49, 0x54, 0x54, 0x45, 0x44, 0x10, 0x02, 0x12, 0x23, 0x0a, 0x1f, 0x53, + 0x51, 0x4c, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x52, + 0x45, 0x50, 0x45, 0x41, 0x54, 0x41, 0x42, 0x4c, 0x45, 0x5f, 0x52, 0x45, 0x41, 0x44, 0x10, 0x03, + 0x12, 0x20, 0x0a, 0x1c, 0x53, 0x51, 0x4c, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, + 0x49, 0x4f, 0x4e, 0x5f, 0x53, 0x45, 0x52, 0x49, 0x41, 0x4c, 0x49, 0x5a, 0x41, 0x42, 0x4c, 0x45, + 0x10, 0x04, 0x2a, 0x89, 0x01, 0x0a, 0x18, 0x53, 0x71, 0x6c, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, + 0x74, 0x65, 0x64, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x61, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x12, + 0x1f, 0x0a, 0x1b, 0x53, 0x51, 0x4c, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, + 0x4f, 0x4e, 0x5f, 0x55, 0x4e, 0x53, 0x50, 0x45, 0x43, 0x49, 0x46, 0x49, 0x45, 0x44, 0x10, 0x00, + 0x12, 0x24, 0x0a, 0x20, 0x53, 0x51, 0x4c, 0x5f, 0x44, 0x41, 0x54, 0x41, 0x5f, 0x44, 0x45, 0x46, + 0x49, 0x4e, 0x49, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, + 0x49, 0x4f, 0x4e, 0x53, 0x10, 0x01, 0x12, 0x26, 0x0a, 0x22, 0x53, 0x51, 0x4c, 0x5f, 0x44, 0x41, + 0x54, 0x41, 0x5f, 0x4d, 0x41, 0x4e, 0x49, 0x50, 0x55, 0x4c, 0x41, 0x54, 0x49, 0x4f, 0x4e, 0x5f, + 0x54, 0x52, 0x41, 0x4e, 0x53, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x53, 0x10, 0x02, 0x2a, 0xbc, + 0x01, 0x0a, 0x19, 0x53, 0x71, 0x6c, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x52, + 0x65, 0x73, 0x75, 0x6c, 0x74, 0x53, 0x65, 0x74, 0x54, 0x79, 0x70, 0x65, 0x12, 0x23, 0x0a, 0x1f, 0x53, 0x51, 0x4c, 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x53, 0x45, 0x54, 0x5f, 0x54, - 0x59, 0x50, 0x45, 0x5f, 0x46, 0x4f, 0x52, 0x57, 0x41, 0x52, 0x44, 0x5f, 0x4f, 0x4e, 
0x4c, 0x59, - 0x10, 0x01, 0x12, 0x2a, 0x0a, 0x26, 0x53, 0x51, 0x4c, 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, - 0x5f, 0x53, 0x45, 0x54, 0x5f, 0x54, 0x59, 0x50, 0x45, 0x5f, 0x53, 0x43, 0x52, 0x4f, 0x4c, 0x4c, - 0x5f, 0x49, 0x4e, 0x53, 0x45, 0x4e, 0x53, 0x49, 0x54, 0x49, 0x56, 0x45, 0x10, 0x02, 0x12, 0x28, + 0x59, 0x50, 0x45, 0x5f, 0x55, 0x4e, 0x53, 0x50, 0x45, 0x43, 0x49, 0x46, 0x49, 0x45, 0x44, 0x10, + 0x00, 0x12, 0x24, 0x0a, 0x20, 0x53, 0x51, 0x4c, 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, + 0x53, 0x45, 0x54, 0x5f, 0x54, 0x59, 0x50, 0x45, 0x5f, 0x46, 0x4f, 0x52, 0x57, 0x41, 0x52, 0x44, + 0x5f, 0x4f, 0x4e, 0x4c, 0x59, 0x10, 0x01, 0x12, 0x2a, 0x0a, 0x26, 0x53, 0x51, 0x4c, 0x5f, 0x52, + 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x53, 0x45, 0x54, 0x5f, 0x54, 0x59, 0x50, 0x45, 0x5f, 0x53, + 0x43, 0x52, 0x4f, 0x4c, 0x4c, 0x5f, 0x49, 0x4e, 0x53, 0x45, 0x4e, 0x53, 0x49, 0x54, 0x49, 0x56, + 0x45, 0x10, 0x02, 0x12, 0x28, 0x0a, 0x24, 0x53, 0x51, 0x4c, 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, + 0x54, 0x5f, 0x53, 0x45, 0x54, 0x5f, 0x54, 0x59, 0x50, 0x45, 0x5f, 0x53, 0x43, 0x52, 0x4f, 0x4c, + 0x4c, 0x5f, 0x53, 0x45, 0x4e, 0x53, 0x49, 0x54, 0x49, 0x56, 0x45, 0x10, 0x03, 0x2a, 0xa2, 0x01, + 0x0a, 0x20, 0x53, 0x71, 0x6c, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x52, 0x65, + 0x73, 0x75, 0x6c, 0x74, 0x53, 0x65, 0x74, 0x43, 0x6f, 0x6e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, + 0x63, 0x79, 0x12, 0x2a, 0x0a, 0x26, 0x53, 0x51, 0x4c, 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, + 0x5f, 0x53, 0x45, 0x54, 0x5f, 0x43, 0x4f, 0x4e, 0x43, 0x55, 0x52, 0x52, 0x45, 0x4e, 0x43, 0x59, + 0x5f, 0x55, 0x4e, 0x53, 0x50, 0x45, 0x43, 0x49, 0x46, 0x49, 0x45, 0x44, 0x10, 0x00, 0x12, 0x28, 0x0a, 0x24, 0x53, 0x51, 0x4c, 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x53, 0x45, 0x54, - 0x5f, 0x54, 0x59, 0x50, 0x45, 0x5f, 0x53, 0x43, 0x52, 0x4f, 0x4c, 0x4c, 0x5f, 0x53, 0x45, 0x4e, - 0x53, 0x49, 0x54, 0x49, 0x56, 0x45, 0x10, 0x03, 0x2a, 0xa2, 0x01, 0x0a, 0x20, 0x53, 0x71, 0x6c, - 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x53, - 0x65, 0x74, 0x43, 0x6f, 0x6e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x63, 0x79, 0x12, 0x2a, 0x0a, - 0x26, 0x53, 0x51, 0x4c, 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x53, 0x45, 0x54, 0x5f, - 0x43, 0x4f, 0x4e, 0x43, 0x55, 0x52, 0x52, 0x45, 0x4e, 0x43, 0x59, 0x5f, 0x55, 0x4e, 0x53, 0x50, - 0x45, 0x43, 0x49, 0x46, 0x49, 0x45, 0x44, 0x10, 0x00, 0x12, 0x28, 0x0a, 0x24, 0x53, 0x51, 0x4c, - 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x53, 0x45, 0x54, 0x5f, 0x43, 0x4f, 0x4e, 0x43, - 0x55, 0x52, 0x52, 0x45, 0x4e, 0x43, 0x59, 0x5f, 0x52, 0x45, 0x41, 0x44, 0x5f, 0x4f, 0x4e, 0x4c, - 0x59, 0x10, 0x01, 0x12, 0x28, 0x0a, 0x24, 0x53, 0x51, 0x4c, 0x5f, 0x52, 0x45, 0x53, 0x55, 0x4c, - 0x54, 0x5f, 0x53, 0x45, 0x54, 0x5f, 0x43, 0x4f, 0x4e, 0x43, 0x55, 0x52, 0x52, 0x45, 0x4e, 0x43, - 0x59, 0x5f, 0x55, 0x50, 0x44, 0x41, 0x54, 0x41, 0x42, 0x4c, 0x45, 0x10, 0x02, 0x2a, 0x99, 0x04, - 0x0a, 0x12, 0x53, 0x71, 0x6c, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x73, 0x43, 0x6f, 0x6e, - 0x76, 0x65, 0x72, 0x74, 0x12, 0x16, 0x0a, 0x12, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, - 0x45, 0x52, 0x54, 0x5f, 0x42, 0x49, 0x47, 0x49, 0x4e, 0x54, 0x10, 0x00, 0x12, 0x16, 0x0a, 0x12, - 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x42, 0x49, 0x4e, 0x41, - 0x52, 0x59, 0x10, 0x01, 0x12, 0x13, 0x0a, 0x0f, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, - 0x45, 0x52, 0x54, 0x5f, 0x42, 0x49, 0x54, 0x10, 0x02, 0x12, 0x14, 0x0a, 0x10, 0x53, 0x51, 0x4c, - 0x5f, 0x43, 
0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x43, 0x48, 0x41, 0x52, 0x10, 0x03, 0x12, - 0x14, 0x0a, 0x10, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x44, - 0x41, 0x54, 0x45, 0x10, 0x04, 0x12, 0x17, 0x0a, 0x13, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, - 0x56, 0x45, 0x52, 0x54, 0x5f, 0x44, 0x45, 0x43, 0x49, 0x4d, 0x41, 0x4c, 0x10, 0x05, 0x12, 0x15, - 0x0a, 0x11, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x46, 0x4c, - 0x4f, 0x41, 0x54, 0x10, 0x06, 0x12, 0x17, 0x0a, 0x13, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, - 0x56, 0x45, 0x52, 0x54, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x47, 0x45, 0x52, 0x10, 0x07, 0x12, 0x21, - 0x0a, 0x1d, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x49, 0x4e, - 0x54, 0x45, 0x52, 0x56, 0x41, 0x4c, 0x5f, 0x44, 0x41, 0x59, 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x10, - 0x08, 0x12, 0x23, 0x0a, 0x1f, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, - 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, 0x56, 0x41, 0x4c, 0x5f, 0x59, 0x45, 0x41, 0x52, 0x5f, 0x4d, - 0x4f, 0x4e, 0x54, 0x48, 0x10, 0x09, 0x12, 0x1d, 0x0a, 0x19, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, - 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x4c, 0x4f, 0x4e, 0x47, 0x56, 0x41, 0x52, 0x42, 0x49, 0x4e, - 0x41, 0x52, 0x59, 0x10, 0x0a, 0x12, 0x1b, 0x0a, 0x17, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, - 0x56, 0x45, 0x52, 0x54, 0x5f, 0x4c, 0x4f, 0x4e, 0x47, 0x56, 0x41, 0x52, 0x43, 0x48, 0x41, 0x52, - 0x10, 0x0b, 0x12, 0x17, 0x0a, 0x13, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, - 0x54, 0x5f, 0x4e, 0x55, 0x4d, 0x45, 0x52, 0x49, 0x43, 0x10, 0x0c, 0x12, 0x14, 0x0a, 0x10, 0x53, - 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x52, 0x45, 0x41, 0x4c, 0x10, - 0x0d, 0x12, 0x18, 0x0a, 0x14, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, - 0x5f, 0x53, 0x4d, 0x41, 0x4c, 0x4c, 0x49, 0x4e, 0x54, 0x10, 0x0e, 0x12, 0x14, 0x0a, 0x10, 0x53, - 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x10, - 0x0f, 0x12, 0x19, 0x0a, 0x15, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, - 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x53, 0x54, 0x41, 0x4d, 0x50, 0x10, 0x10, 0x12, 0x17, 0x0a, 0x13, - 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x54, 0x49, 0x4e, 0x59, - 0x49, 0x4e, 0x54, 0x10, 0x11, 0x12, 0x19, 0x0a, 0x15, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, - 0x56, 0x45, 0x52, 0x54, 0x5f, 0x56, 0x41, 0x52, 0x42, 0x49, 0x4e, 0x41, 0x52, 0x59, 0x10, 0x12, - 0x12, 0x17, 0x0a, 0x13, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, - 0x56, 0x41, 0x52, 0x43, 0x48, 0x41, 0x52, 0x10, 0x13, 0x2a, 0x8f, 0x04, 0x0a, 0x0c, 0x58, 0x64, - 0x62, 0x63, 0x44, 0x61, 0x74, 0x61, 0x54, 0x79, 0x70, 0x65, 0x12, 0x15, 0x0a, 0x11, 0x58, 0x44, - 0x42, 0x43, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x5f, 0x54, 0x59, 0x50, 0x45, 0x10, - 0x00, 0x12, 0x0d, 0x0a, 0x09, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x43, 0x48, 0x41, 0x52, 0x10, 0x01, - 0x12, 0x10, 0x0a, 0x0c, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x4e, 0x55, 0x4d, 0x45, 0x52, 0x49, 0x43, - 0x10, 0x02, 0x12, 0x10, 0x0a, 0x0c, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x44, 0x45, 0x43, 0x49, 0x4d, - 0x41, 0x4c, 0x10, 0x03, 0x12, 0x10, 0x0a, 0x0c, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x49, 0x4e, 0x54, - 0x45, 0x47, 0x45, 0x52, 0x10, 0x04, 0x12, 0x11, 0x0a, 0x0d, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, - 0x4d, 0x41, 0x4c, 0x4c, 0x49, 0x4e, 0x54, 0x10, 0x05, 0x12, 0x0e, 0x0a, 0x0a, 0x58, 0x44, 0x42, - 0x43, 0x5f, 0x46, 0x4c, 0x4f, 0x41, 
0x54, 0x10, 0x06, 0x12, 0x0d, 0x0a, 0x09, 0x58, 0x44, 0x42, - 0x43, 0x5f, 0x52, 0x45, 0x41, 0x4c, 0x10, 0x07, 0x12, 0x0f, 0x0a, 0x0b, 0x58, 0x44, 0x42, 0x43, - 0x5f, 0x44, 0x4f, 0x55, 0x42, 0x4c, 0x45, 0x10, 0x08, 0x12, 0x11, 0x0a, 0x0d, 0x58, 0x44, 0x42, - 0x43, 0x5f, 0x44, 0x41, 0x54, 0x45, 0x54, 0x49, 0x4d, 0x45, 0x10, 0x09, 0x12, 0x11, 0x0a, 0x0d, - 0x58, 0x44, 0x42, 0x43, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, 0x56, 0x41, 0x4c, 0x10, 0x0a, 0x12, - 0x10, 0x0a, 0x0c, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x56, 0x41, 0x52, 0x43, 0x48, 0x41, 0x52, 0x10, - 0x0c, 0x12, 0x0d, 0x0a, 0x09, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x44, 0x41, 0x54, 0x45, 0x10, 0x5b, - 0x12, 0x0d, 0x0a, 0x09, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x10, 0x5c, 0x12, - 0x12, 0x0a, 0x0e, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x53, 0x54, 0x41, 0x4d, - 0x50, 0x10, 0x5d, 0x12, 0x1d, 0x0a, 0x10, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x4c, 0x4f, 0x4e, 0x47, - 0x56, 0x41, 0x52, 0x43, 0x48, 0x41, 0x52, 0x10, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0x01, 0x12, 0x18, 0x0a, 0x0b, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x42, 0x49, 0x4e, 0x41, 0x52, - 0x59, 0x10, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01, 0x12, 0x1b, 0x0a, 0x0e, - 0x58, 0x44, 0x42, 0x43, 0x5f, 0x56, 0x41, 0x52, 0x42, 0x49, 0x4e, 0x41, 0x52, 0x59, 0x10, 0xfd, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01, 0x12, 0x1f, 0x0a, 0x12, 0x58, 0x44, 0x42, - 0x43, 0x5f, 0x4c, 0x4f, 0x4e, 0x47, 0x56, 0x41, 0x52, 0x42, 0x49, 0x4e, 0x41, 0x52, 0x59, 0x10, - 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01, 0x12, 0x18, 0x0a, 0x0b, 0x58, 0x44, - 0x42, 0x43, 0x5f, 0x42, 0x49, 0x47, 0x49, 0x4e, 0x54, 0x10, 0xfb, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0x01, 0x12, 0x19, 0x0a, 0x0c, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x54, 0x49, 0x4e, - 0x59, 0x49, 0x4e, 0x54, 0x10, 0xfa, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01, 0x12, - 0x15, 0x0a, 0x08, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x42, 0x49, 0x54, 0x10, 0xf9, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0x01, 0x12, 0x17, 0x0a, 0x0a, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x57, - 0x43, 0x48, 0x41, 0x52, 0x10, 0xf8, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01, 0x12, - 0x1a, 0x0a, 0x0d, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x57, 0x56, 0x41, 0x52, 0x43, 0x48, 0x41, 0x52, - 0x10, 0xf7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01, 0x2a, 0xa3, 0x08, 0x0a, 0x13, - 0x58, 0x64, 0x62, 0x63, 0x44, 0x61, 0x74, 0x65, 0x74, 0x69, 0x6d, 0x65, 0x53, 0x75, 0x62, 0x63, - 0x6f, 0x64, 0x65, 0x12, 0x18, 0x0a, 0x14, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, - 0x4f, 0x44, 0x45, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x00, 0x12, 0x15, 0x0a, - 0x11, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x59, 0x45, - 0x41, 0x52, 0x10, 0x01, 0x12, 0x15, 0x0a, 0x11, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, - 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x44, 0x41, 0x54, 0x45, 0x10, 0x01, 0x12, 0x15, 0x0a, 0x11, 0x58, - 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x54, 0x49, 0x4d, 0x45, - 0x10, 0x02, 0x12, 0x16, 0x0a, 0x12, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, - 0x44, 0x45, 0x5f, 0x4d, 0x4f, 0x4e, 0x54, 0x48, 0x10, 0x02, 0x12, 0x1a, 0x0a, 0x16, 0x58, 0x44, - 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x53, - 0x54, 0x41, 0x4d, 0x50, 0x10, 0x03, 0x12, 0x14, 0x0a, 0x10, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, - 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x44, 0x41, 0x59, 
0x10, 0x03, 0x12, 0x23, 0x0a, 0x1f, - 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x54, 0x49, 0x4d, - 0x45, 0x5f, 0x57, 0x49, 0x54, 0x48, 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x5a, 0x4f, 0x4e, 0x45, 0x10, - 0x04, 0x12, 0x15, 0x0a, 0x11, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, - 0x45, 0x5f, 0x48, 0x4f, 0x55, 0x52, 0x10, 0x04, 0x12, 0x28, 0x0a, 0x24, 0x58, 0x44, 0x42, 0x43, - 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x53, 0x54, 0x41, - 0x4d, 0x50, 0x5f, 0x57, 0x49, 0x54, 0x48, 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x5a, 0x4f, 0x4e, 0x45, - 0x10, 0x05, 0x12, 0x17, 0x0a, 0x13, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, - 0x44, 0x45, 0x5f, 0x4d, 0x49, 0x4e, 0x55, 0x54, 0x45, 0x10, 0x05, 0x12, 0x17, 0x0a, 0x13, 0x58, - 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x53, 0x45, 0x43, 0x4f, - 0x4e, 0x44, 0x10, 0x06, 0x12, 0x1e, 0x0a, 0x1a, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, - 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x59, 0x45, 0x41, 0x52, 0x5f, 0x54, 0x4f, 0x5f, 0x4d, 0x4f, 0x4e, - 0x54, 0x48, 0x10, 0x07, 0x12, 0x1c, 0x0a, 0x18, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, - 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x44, 0x41, 0x59, 0x5f, 0x54, 0x4f, 0x5f, 0x48, 0x4f, 0x55, 0x52, - 0x10, 0x08, 0x12, 0x1e, 0x0a, 0x1a, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, - 0x44, 0x45, 0x5f, 0x44, 0x41, 0x59, 0x5f, 0x54, 0x4f, 0x5f, 0x4d, 0x49, 0x4e, 0x55, 0x54, 0x45, - 0x10, 0x09, 0x12, 0x1e, 0x0a, 0x1a, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, - 0x44, 0x45, 0x5f, 0x44, 0x41, 0x59, 0x5f, 0x54, 0x4f, 0x5f, 0x53, 0x45, 0x43, 0x4f, 0x4e, 0x44, - 0x10, 0x0a, 0x12, 0x1f, 0x0a, 0x1b, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, - 0x44, 0x45, 0x5f, 0x48, 0x4f, 0x55, 0x52, 0x5f, 0x54, 0x4f, 0x5f, 0x4d, 0x49, 0x4e, 0x55, 0x54, - 0x45, 0x10, 0x0b, 0x12, 0x1f, 0x0a, 0x1b, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, - 0x4f, 0x44, 0x45, 0x5f, 0x48, 0x4f, 0x55, 0x52, 0x5f, 0x54, 0x4f, 0x5f, 0x53, 0x45, 0x43, 0x4f, - 0x4e, 0x44, 0x10, 0x0c, 0x12, 0x21, 0x0a, 0x1d, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, - 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x4d, 0x49, 0x4e, 0x55, 0x54, 0x45, 0x5f, 0x54, 0x4f, 0x5f, 0x53, - 0x45, 0x43, 0x4f, 0x4e, 0x44, 0x10, 0x0d, 0x12, 0x1e, 0x0a, 0x1a, 0x58, 0x44, 0x42, 0x43, 0x5f, - 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, 0x56, 0x41, 0x4c, - 0x5f, 0x59, 0x45, 0x41, 0x52, 0x10, 0x65, 0x12, 0x1f, 0x0a, 0x1b, 0x58, 0x44, 0x42, 0x43, 0x5f, - 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, 0x56, 0x41, 0x4c, - 0x5f, 0x4d, 0x4f, 0x4e, 0x54, 0x48, 0x10, 0x66, 0x12, 0x1d, 0x0a, 0x19, 0x58, 0x44, 0x42, 0x43, - 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, 0x56, 0x41, - 0x4c, 0x5f, 0x44, 0x41, 0x59, 0x10, 0x67, 0x12, 0x1e, 0x0a, 0x1a, 0x58, 0x44, 0x42, 0x43, 0x5f, - 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, 0x56, 0x41, 0x4c, - 0x5f, 0x48, 0x4f, 0x55, 0x52, 0x10, 0x68, 0x12, 0x20, 0x0a, 0x1c, 0x58, 0x44, 0x42, 0x43, 0x5f, - 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, 0x56, 0x41, 0x4c, - 0x5f, 0x4d, 0x49, 0x4e, 0x55, 0x54, 0x45, 0x10, 0x69, 0x12, 0x20, 0x0a, 0x1c, 0x58, 0x44, 0x42, + 0x5f, 0x43, 0x4f, 0x4e, 0x43, 0x55, 0x52, 0x52, 0x45, 0x4e, 0x43, 0x59, 0x5f, 0x52, 0x45, 0x41, + 0x44, 0x5f, 0x4f, 0x4e, 0x4c, 0x59, 0x10, 0x01, 0x12, 0x28, 0x0a, 0x24, 0x53, 0x51, 
0x4c, 0x5f, + 0x52, 0x45, 0x53, 0x55, 0x4c, 0x54, 0x5f, 0x53, 0x45, 0x54, 0x5f, 0x43, 0x4f, 0x4e, 0x43, 0x55, + 0x52, 0x52, 0x45, 0x4e, 0x43, 0x59, 0x5f, 0x55, 0x50, 0x44, 0x41, 0x54, 0x41, 0x42, 0x4c, 0x45, + 0x10, 0x02, 0x2a, 0x99, 0x04, 0x0a, 0x12, 0x53, 0x71, 0x6c, 0x53, 0x75, 0x70, 0x70, 0x6f, 0x72, + 0x74, 0x73, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x12, 0x16, 0x0a, 0x12, 0x53, 0x51, 0x4c, + 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x42, 0x49, 0x47, 0x49, 0x4e, 0x54, 0x10, + 0x00, 0x12, 0x16, 0x0a, 0x12, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, + 0x5f, 0x42, 0x49, 0x4e, 0x41, 0x52, 0x59, 0x10, 0x01, 0x12, 0x13, 0x0a, 0x0f, 0x53, 0x51, 0x4c, + 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x42, 0x49, 0x54, 0x10, 0x02, 0x12, 0x14, + 0x0a, 0x10, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x43, 0x48, + 0x41, 0x52, 0x10, 0x03, 0x12, 0x14, 0x0a, 0x10, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, + 0x45, 0x52, 0x54, 0x5f, 0x44, 0x41, 0x54, 0x45, 0x10, 0x04, 0x12, 0x17, 0x0a, 0x13, 0x53, 0x51, + 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x44, 0x45, 0x43, 0x49, 0x4d, 0x41, + 0x4c, 0x10, 0x05, 0x12, 0x15, 0x0a, 0x11, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, + 0x52, 0x54, 0x5f, 0x46, 0x4c, 0x4f, 0x41, 0x54, 0x10, 0x06, 0x12, 0x17, 0x0a, 0x13, 0x53, 0x51, + 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x47, 0x45, + 0x52, 0x10, 0x07, 0x12, 0x21, 0x0a, 0x1d, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, + 0x52, 0x54, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, 0x56, 0x41, 0x4c, 0x5f, 0x44, 0x41, 0x59, 0x5f, + 0x54, 0x49, 0x4d, 0x45, 0x10, 0x08, 0x12, 0x23, 0x0a, 0x1f, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, + 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, 0x56, 0x41, 0x4c, 0x5f, 0x59, + 0x45, 0x41, 0x52, 0x5f, 0x4d, 0x4f, 0x4e, 0x54, 0x48, 0x10, 0x09, 0x12, 0x1d, 0x0a, 0x19, 0x53, + 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x4c, 0x4f, 0x4e, 0x47, 0x56, + 0x41, 0x52, 0x42, 0x49, 0x4e, 0x41, 0x52, 0x59, 0x10, 0x0a, 0x12, 0x1b, 0x0a, 0x17, 0x53, 0x51, + 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x4c, 0x4f, 0x4e, 0x47, 0x56, 0x41, + 0x52, 0x43, 0x48, 0x41, 0x52, 0x10, 0x0b, 0x12, 0x17, 0x0a, 0x13, 0x53, 0x51, 0x4c, 0x5f, 0x43, + 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x4e, 0x55, 0x4d, 0x45, 0x52, 0x49, 0x43, 0x10, 0x0c, + 0x12, 0x14, 0x0a, 0x10, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, + 0x52, 0x45, 0x41, 0x4c, 0x10, 0x0d, 0x12, 0x18, 0x0a, 0x14, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, + 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x53, 0x4d, 0x41, 0x4c, 0x4c, 0x49, 0x4e, 0x54, 0x10, 0x0e, + 0x12, 0x14, 0x0a, 0x10, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, + 0x54, 0x49, 0x4d, 0x45, 0x10, 0x0f, 0x12, 0x19, 0x0a, 0x15, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, + 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x53, 0x54, 0x41, 0x4d, 0x50, 0x10, + 0x10, 0x12, 0x17, 0x0a, 0x13, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, + 0x5f, 0x54, 0x49, 0x4e, 0x59, 0x49, 0x4e, 0x54, 0x10, 0x11, 0x12, 0x19, 0x0a, 0x15, 0x53, 0x51, + 0x4c, 0x5f, 0x43, 0x4f, 0x4e, 0x56, 0x45, 0x52, 0x54, 0x5f, 0x56, 0x41, 0x52, 0x42, 0x49, 0x4e, + 0x41, 0x52, 0x59, 0x10, 0x12, 0x12, 0x17, 0x0a, 0x13, 0x53, 0x51, 0x4c, 0x5f, 0x43, 0x4f, 0x4e, + 0x56, 0x45, 0x52, 0x54, 0x5f, 0x56, 0x41, 0x52, 0x43, 0x48, 0x41, 0x52, 0x10, 0x13, 0x2a, 0x8f, + 0x04, 0x0a, 
0x0c, 0x58, 0x64, 0x62, 0x63, 0x44, 0x61, 0x74, 0x61, 0x54, 0x79, 0x70, 0x65, 0x12, + 0x15, 0x0a, 0x11, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x5f, + 0x54, 0x59, 0x50, 0x45, 0x10, 0x00, 0x12, 0x0d, 0x0a, 0x09, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x43, + 0x48, 0x41, 0x52, 0x10, 0x01, 0x12, 0x10, 0x0a, 0x0c, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x4e, 0x55, + 0x4d, 0x45, 0x52, 0x49, 0x43, 0x10, 0x02, 0x12, 0x10, 0x0a, 0x0c, 0x58, 0x44, 0x42, 0x43, 0x5f, + 0x44, 0x45, 0x43, 0x49, 0x4d, 0x41, 0x4c, 0x10, 0x03, 0x12, 0x10, 0x0a, 0x0c, 0x58, 0x44, 0x42, + 0x43, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x47, 0x45, 0x52, 0x10, 0x04, 0x12, 0x11, 0x0a, 0x0d, 0x58, + 0x44, 0x42, 0x43, 0x5f, 0x53, 0x4d, 0x41, 0x4c, 0x4c, 0x49, 0x4e, 0x54, 0x10, 0x05, 0x12, 0x0e, + 0x0a, 0x0a, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x46, 0x4c, 0x4f, 0x41, 0x54, 0x10, 0x06, 0x12, 0x0d, + 0x0a, 0x09, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x52, 0x45, 0x41, 0x4c, 0x10, 0x07, 0x12, 0x0f, 0x0a, + 0x0b, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x44, 0x4f, 0x55, 0x42, 0x4c, 0x45, 0x10, 0x08, 0x12, 0x11, + 0x0a, 0x0d, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x44, 0x41, 0x54, 0x45, 0x54, 0x49, 0x4d, 0x45, 0x10, + 0x09, 0x12, 0x11, 0x0a, 0x0d, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, 0x56, + 0x41, 0x4c, 0x10, 0x0a, 0x12, 0x10, 0x0a, 0x0c, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x56, 0x41, 0x52, + 0x43, 0x48, 0x41, 0x52, 0x10, 0x0c, 0x12, 0x0d, 0x0a, 0x09, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x44, + 0x41, 0x54, 0x45, 0x10, 0x5b, 0x12, 0x0d, 0x0a, 0x09, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x54, 0x49, + 0x4d, 0x45, 0x10, 0x5c, 0x12, 0x12, 0x0a, 0x0e, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x54, 0x49, 0x4d, + 0x45, 0x53, 0x54, 0x41, 0x4d, 0x50, 0x10, 0x5d, 0x12, 0x1d, 0x0a, 0x10, 0x58, 0x44, 0x42, 0x43, + 0x5f, 0x4c, 0x4f, 0x4e, 0x47, 0x56, 0x41, 0x52, 0x43, 0x48, 0x41, 0x52, 0x10, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01, 0x12, 0x18, 0x0a, 0x0b, 0x58, 0x44, 0x42, 0x43, 0x5f, + 0x42, 0x49, 0x4e, 0x41, 0x52, 0x59, 0x10, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x01, 0x12, 0x1b, 0x0a, 0x0e, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x56, 0x41, 0x52, 0x42, 0x49, 0x4e, + 0x41, 0x52, 0x59, 0x10, 0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01, 0x12, 0x1f, + 0x0a, 0x12, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x4c, 0x4f, 0x4e, 0x47, 0x56, 0x41, 0x52, 0x42, 0x49, + 0x4e, 0x41, 0x52, 0x59, 0x10, 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01, 0x12, + 0x18, 0x0a, 0x0b, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x42, 0x49, 0x47, 0x49, 0x4e, 0x54, 0x10, 0xfb, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01, 0x12, 0x19, 0x0a, 0x0c, 0x58, 0x44, 0x42, + 0x43, 0x5f, 0x54, 0x49, 0x4e, 0x59, 0x49, 0x4e, 0x54, 0x10, 0xfa, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0x01, 0x12, 0x15, 0x0a, 0x08, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x42, 0x49, 0x54, + 0x10, 0xf9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01, 0x12, 0x17, 0x0a, 0x0a, 0x58, + 0x44, 0x42, 0x43, 0x5f, 0x57, 0x43, 0x48, 0x41, 0x52, 0x10, 0xf8, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0x01, 0x12, 0x1a, 0x0a, 0x0d, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x57, 0x56, 0x41, + 0x52, 0x43, 0x48, 0x41, 0x52, 0x10, 0xf7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01, + 0x2a, 0xa3, 0x08, 0x0a, 0x13, 0x58, 0x64, 0x62, 0x63, 0x44, 0x61, 0x74, 0x65, 0x74, 0x69, 0x6d, + 0x65, 0x53, 0x75, 0x62, 0x63, 0x6f, 0x64, 0x65, 0x12, 0x18, 0x0a, 0x14, 0x58, 0x44, 0x42, 0x43, + 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, + 0x10, 0x00, 0x12, 0x15, 0x0a, 0x11, 
0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, + 0x44, 0x45, 0x5f, 0x59, 0x45, 0x41, 0x52, 0x10, 0x01, 0x12, 0x15, 0x0a, 0x11, 0x58, 0x44, 0x42, + 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x44, 0x41, 0x54, 0x45, 0x10, 0x01, + 0x12, 0x15, 0x0a, 0x11, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, + 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x10, 0x02, 0x12, 0x16, 0x0a, 0x12, 0x58, 0x44, 0x42, 0x43, 0x5f, + 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x4d, 0x4f, 0x4e, 0x54, 0x48, 0x10, 0x02, 0x12, + 0x1a, 0x0a, 0x16, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, + 0x54, 0x49, 0x4d, 0x45, 0x53, 0x54, 0x41, 0x4d, 0x50, 0x10, 0x03, 0x12, 0x14, 0x0a, 0x10, 0x58, + 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x44, 0x41, 0x59, 0x10, + 0x03, 0x12, 0x23, 0x0a, 0x1f, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, + 0x45, 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x5f, 0x57, 0x49, 0x54, 0x48, 0x5f, 0x54, 0x49, 0x4d, 0x45, + 0x5a, 0x4f, 0x4e, 0x45, 0x10, 0x04, 0x12, 0x15, 0x0a, 0x11, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, + 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x48, 0x4f, 0x55, 0x52, 0x10, 0x04, 0x12, 0x28, 0x0a, + 0x24, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x54, 0x49, + 0x4d, 0x45, 0x53, 0x54, 0x41, 0x4d, 0x50, 0x5f, 0x57, 0x49, 0x54, 0x48, 0x5f, 0x54, 0x49, 0x4d, + 0x45, 0x5a, 0x4f, 0x4e, 0x45, 0x10, 0x05, 0x12, 0x17, 0x0a, 0x13, 0x58, 0x44, 0x42, 0x43, 0x5f, + 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x4d, 0x49, 0x4e, 0x55, 0x54, 0x45, 0x10, 0x05, + 0x12, 0x17, 0x0a, 0x13, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, + 0x5f, 0x53, 0x45, 0x43, 0x4f, 0x4e, 0x44, 0x10, 0x06, 0x12, 0x1e, 0x0a, 0x1a, 0x58, 0x44, 0x42, + 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x59, 0x45, 0x41, 0x52, 0x5f, 0x54, + 0x4f, 0x5f, 0x4d, 0x4f, 0x4e, 0x54, 0x48, 0x10, 0x07, 0x12, 0x1c, 0x0a, 0x18, 0x58, 0x44, 0x42, + 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x44, 0x41, 0x59, 0x5f, 0x54, 0x4f, + 0x5f, 0x48, 0x4f, 0x55, 0x52, 0x10, 0x08, 0x12, 0x1e, 0x0a, 0x1a, 0x58, 0x44, 0x42, 0x43, 0x5f, + 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x44, 0x41, 0x59, 0x5f, 0x54, 0x4f, 0x5f, 0x4d, + 0x49, 0x4e, 0x55, 0x54, 0x45, 0x10, 0x09, 0x12, 0x1e, 0x0a, 0x1a, 0x58, 0x44, 0x42, 0x43, 0x5f, + 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x44, 0x41, 0x59, 0x5f, 0x54, 0x4f, 0x5f, 0x53, + 0x45, 0x43, 0x4f, 0x4e, 0x44, 0x10, 0x0a, 0x12, 0x1f, 0x0a, 0x1b, 0x58, 0x44, 0x42, 0x43, 0x5f, + 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x48, 0x4f, 0x55, 0x52, 0x5f, 0x54, 0x4f, 0x5f, + 0x4d, 0x49, 0x4e, 0x55, 0x54, 0x45, 0x10, 0x0b, 0x12, 0x1f, 0x0a, 0x1b, 0x58, 0x44, 0x42, 0x43, + 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x48, 0x4f, 0x55, 0x52, 0x5f, 0x54, 0x4f, + 0x5f, 0x53, 0x45, 0x43, 0x4f, 0x4e, 0x44, 0x10, 0x0c, 0x12, 0x21, 0x0a, 0x1d, 0x58, 0x44, 0x42, + 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x4d, 0x49, 0x4e, 0x55, 0x54, 0x45, + 0x5f, 0x54, 0x4f, 0x5f, 0x53, 0x45, 0x43, 0x4f, 0x4e, 0x44, 0x10, 0x0d, 0x12, 0x1e, 0x0a, 0x1a, + 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, 0x4e, 0x54, + 0x45, 0x52, 0x56, 0x41, 0x4c, 0x5f, 0x59, 0x45, 0x41, 0x52, 0x10, 0x65, 0x12, 0x1f, 0x0a, 0x1b, + 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, 0x4e, 0x54, + 0x45, 0x52, 0x56, 0x41, 0x4c, 0x5f, 0x4d, 0x4f, 0x4e, 0x54, 
0x48, 0x10, 0x66, 0x12, 0x1d, 0x0a, + 0x19, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, 0x4e, + 0x54, 0x45, 0x52, 0x56, 0x41, 0x4c, 0x5f, 0x44, 0x41, 0x59, 0x10, 0x67, 0x12, 0x1e, 0x0a, 0x1a, + 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, 0x4e, 0x54, + 0x45, 0x52, 0x56, 0x41, 0x4c, 0x5f, 0x48, 0x4f, 0x55, 0x52, 0x10, 0x68, 0x12, 0x20, 0x0a, 0x1c, + 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, 0x4e, 0x54, + 0x45, 0x52, 0x56, 0x41, 0x4c, 0x5f, 0x4d, 0x49, 0x4e, 0x55, 0x54, 0x45, 0x10, 0x69, 0x12, 0x20, + 0x0a, 0x1c, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, + 0x4e, 0x54, 0x45, 0x52, 0x56, 0x41, 0x4c, 0x5f, 0x53, 0x45, 0x43, 0x4f, 0x4e, 0x44, 0x10, 0x6a, + 0x12, 0x27, 0x0a, 0x23, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, + 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, 0x56, 0x41, 0x4c, 0x5f, 0x59, 0x45, 0x41, 0x52, 0x5f, 0x54, + 0x4f, 0x5f, 0x4d, 0x4f, 0x4e, 0x54, 0x48, 0x10, 0x6b, 0x12, 0x25, 0x0a, 0x21, 0x58, 0x44, 0x42, + 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, 0x56, + 0x41, 0x4c, 0x5f, 0x44, 0x41, 0x59, 0x5f, 0x54, 0x4f, 0x5f, 0x48, 0x4f, 0x55, 0x52, 0x10, 0x6c, + 0x12, 0x27, 0x0a, 0x23, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, + 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, 0x56, 0x41, 0x4c, 0x5f, 0x44, 0x41, 0x59, 0x5f, 0x54, 0x4f, + 0x5f, 0x4d, 0x49, 0x4e, 0x55, 0x54, 0x45, 0x10, 0x6d, 0x12, 0x27, 0x0a, 0x23, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, 0x56, - 0x41, 0x4c, 0x5f, 0x53, 0x45, 0x43, 0x4f, 0x4e, 0x44, 0x10, 0x6a, 0x12, 0x27, 0x0a, 0x23, 0x58, - 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, 0x4e, 0x54, 0x45, - 0x52, 0x56, 0x41, 0x4c, 0x5f, 0x59, 0x45, 0x41, 0x52, 0x5f, 0x54, 0x4f, 0x5f, 0x4d, 0x4f, 0x4e, - 0x54, 0x48, 0x10, 0x6b, 0x12, 0x25, 0x0a, 0x21, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, - 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, 0x56, 0x41, 0x4c, 0x5f, 0x44, 0x41, - 0x59, 0x5f, 0x54, 0x4f, 0x5f, 0x48, 0x4f, 0x55, 0x52, 0x10, 0x6c, 0x12, 0x27, 0x0a, 0x23, 0x58, - 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, 0x4e, 0x54, 0x45, - 0x52, 0x56, 0x41, 0x4c, 0x5f, 0x44, 0x41, 0x59, 0x5f, 0x54, 0x4f, 0x5f, 0x4d, 0x49, 0x4e, 0x55, - 0x54, 0x45, 0x10, 0x6d, 0x12, 0x27, 0x0a, 0x23, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, - 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, 0x56, 0x41, 0x4c, 0x5f, 0x44, 0x41, - 0x59, 0x5f, 0x54, 0x4f, 0x5f, 0x53, 0x45, 0x43, 0x4f, 0x4e, 0x44, 0x10, 0x6e, 0x12, 0x28, 0x0a, - 0x24, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, 0x4e, - 0x54, 0x45, 0x52, 0x56, 0x41, 0x4c, 0x5f, 0x48, 0x4f, 0x55, 0x52, 0x5f, 0x54, 0x4f, 0x5f, 0x4d, - 0x49, 0x4e, 0x55, 0x54, 0x45, 0x10, 0x6f, 0x12, 0x28, 0x0a, 0x24, 0x58, 0x44, 0x42, 0x43, 0x5f, - 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, 0x56, 0x41, 0x4c, - 0x5f, 0x48, 0x4f, 0x55, 0x52, 0x5f, 0x54, 0x4f, 0x5f, 0x53, 0x45, 0x43, 0x4f, 0x4e, 0x44, 0x10, - 0x70, 0x12, 0x2a, 0x0a, 0x26, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, - 0x45, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, 0x56, 0x41, 0x4c, 0x5f, 0x4d, 0x49, 0x4e, 0x55, 0x54, - 0x45, 0x5f, 0x54, 0x4f, 0x5f, 0x53, 0x45, 0x43, 0x4f, 0x4e, 0x44, 0x10, 0x71, 0x1a, 
0x02, 0x10, - 0x01, 0x2a, 0x57, 0x0a, 0x08, 0x4e, 0x75, 0x6c, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x12, 0x18, 0x0a, - 0x14, 0x4e, 0x55, 0x4c, 0x4c, 0x41, 0x42, 0x49, 0x4c, 0x49, 0x54, 0x59, 0x5f, 0x4e, 0x4f, 0x5f, - 0x4e, 0x55, 0x4c, 0x4c, 0x53, 0x10, 0x00, 0x12, 0x18, 0x0a, 0x14, 0x4e, 0x55, 0x4c, 0x4c, 0x41, - 0x42, 0x49, 0x4c, 0x49, 0x54, 0x59, 0x5f, 0x4e, 0x55, 0x4c, 0x4c, 0x41, 0x42, 0x4c, 0x45, 0x10, - 0x01, 0x12, 0x17, 0x0a, 0x13, 0x4e, 0x55, 0x4c, 0x4c, 0x41, 0x42, 0x49, 0x4c, 0x49, 0x54, 0x59, - 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x02, 0x2a, 0x61, 0x0a, 0x0a, 0x53, 0x65, - 0x61, 0x72, 0x63, 0x68, 0x61, 0x62, 0x6c, 0x65, 0x12, 0x13, 0x0a, 0x0f, 0x53, 0x45, 0x41, 0x52, - 0x43, 0x48, 0x41, 0x42, 0x4c, 0x45, 0x5f, 0x4e, 0x4f, 0x4e, 0x45, 0x10, 0x00, 0x12, 0x13, 0x0a, - 0x0f, 0x53, 0x45, 0x41, 0x52, 0x43, 0x48, 0x41, 0x42, 0x4c, 0x45, 0x5f, 0x43, 0x48, 0x41, 0x52, - 0x10, 0x01, 0x12, 0x14, 0x0a, 0x10, 0x53, 0x45, 0x41, 0x52, 0x43, 0x48, 0x41, 0x42, 0x4c, 0x45, - 0x5f, 0x42, 0x41, 0x53, 0x49, 0x43, 0x10, 0x02, 0x12, 0x13, 0x0a, 0x0f, 0x53, 0x45, 0x41, 0x52, - 0x43, 0x48, 0x41, 0x42, 0x4c, 0x45, 0x5f, 0x46, 0x55, 0x4c, 0x4c, 0x10, 0x03, 0x2a, 0x5c, 0x0a, - 0x11, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x44, 0x65, 0x6c, 0x65, 0x74, 0x65, 0x52, 0x75, 0x6c, - 0x65, 0x73, 0x12, 0x0b, 0x0a, 0x07, 0x43, 0x41, 0x53, 0x43, 0x41, 0x44, 0x45, 0x10, 0x00, 0x12, - 0x0c, 0x0a, 0x08, 0x52, 0x45, 0x53, 0x54, 0x52, 0x49, 0x43, 0x54, 0x10, 0x01, 0x12, 0x0c, 0x0a, - 0x08, 0x53, 0x45, 0x54, 0x5f, 0x4e, 0x55, 0x4c, 0x4c, 0x10, 0x02, 0x12, 0x0d, 0x0a, 0x09, 0x4e, - 0x4f, 0x5f, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x10, 0x03, 0x12, 0x0f, 0x0a, 0x0b, 0x53, 0x45, - 0x54, 0x5f, 0x44, 0x45, 0x46, 0x41, 0x55, 0x4c, 0x54, 0x10, 0x04, 0x3a, 0x44, 0x0a, 0x0c, 0x65, - 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x12, 0x1f, 0x2e, 0x67, 0x6f, - 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2e, 0x4d, 0x65, - 0x73, 0x73, 0x61, 0x67, 0x65, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0xe8, 0x07, 0x20, - 0x01, 0x28, 0x08, 0x52, 0x0c, 0x65, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x61, - 0x6c, 0x42, 0x56, 0x0a, 0x20, 0x6f, 0x72, 0x67, 0x2e, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2e, - 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2e, 0x73, 0x71, 0x6c, - 0x2e, 0x69, 0x6d, 0x70, 0x6c, 0x5a, 0x32, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, - 0x6d, 0x2f, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2f, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2f, 0x67, - 0x6f, 0x2f, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2f, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x2f, 0x67, - 0x65, 0x6e, 0x2f, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, - 0x33, + 0x41, 0x4c, 0x5f, 0x44, 0x41, 0x59, 0x5f, 0x54, 0x4f, 0x5f, 0x53, 0x45, 0x43, 0x4f, 0x4e, 0x44, + 0x10, 0x6e, 0x12, 0x28, 0x0a, 0x24, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, + 0x44, 0x45, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, 0x56, 0x41, 0x4c, 0x5f, 0x48, 0x4f, 0x55, 0x52, + 0x5f, 0x54, 0x4f, 0x5f, 0x4d, 0x49, 0x4e, 0x55, 0x54, 0x45, 0x10, 0x6f, 0x12, 0x28, 0x0a, 0x24, + 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, 0x4e, 0x54, + 0x45, 0x52, 0x56, 0x41, 0x4c, 0x5f, 0x48, 0x4f, 0x55, 0x52, 0x5f, 0x54, 0x4f, 0x5f, 0x53, 0x45, + 0x43, 0x4f, 0x4e, 0x44, 0x10, 0x70, 0x12, 0x2a, 0x0a, 0x26, 0x58, 0x44, 0x42, 0x43, 0x5f, 0x53, + 0x55, 0x42, 0x43, 0x4f, 0x44, 0x45, 0x5f, 0x49, 0x4e, 0x54, 0x45, 0x52, 0x56, 0x41, 0x4c, 0x5f, + 
0x4d, 0x49, 0x4e, 0x55, 0x54, 0x45, 0x5f, 0x54, 0x4f, 0x5f, 0x53, 0x45, 0x43, 0x4f, 0x4e, 0x44, + 0x10, 0x71, 0x1a, 0x02, 0x10, 0x01, 0x2a, 0x57, 0x0a, 0x08, 0x4e, 0x75, 0x6c, 0x6c, 0x61, 0x62, + 0x6c, 0x65, 0x12, 0x18, 0x0a, 0x14, 0x4e, 0x55, 0x4c, 0x4c, 0x41, 0x42, 0x49, 0x4c, 0x49, 0x54, + 0x59, 0x5f, 0x4e, 0x4f, 0x5f, 0x4e, 0x55, 0x4c, 0x4c, 0x53, 0x10, 0x00, 0x12, 0x18, 0x0a, 0x14, + 0x4e, 0x55, 0x4c, 0x4c, 0x41, 0x42, 0x49, 0x4c, 0x49, 0x54, 0x59, 0x5f, 0x4e, 0x55, 0x4c, 0x4c, + 0x41, 0x42, 0x4c, 0x45, 0x10, 0x01, 0x12, 0x17, 0x0a, 0x13, 0x4e, 0x55, 0x4c, 0x4c, 0x41, 0x42, + 0x49, 0x4c, 0x49, 0x54, 0x59, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x02, 0x2a, + 0x61, 0x0a, 0x0a, 0x53, 0x65, 0x61, 0x72, 0x63, 0x68, 0x61, 0x62, 0x6c, 0x65, 0x12, 0x13, 0x0a, + 0x0f, 0x53, 0x45, 0x41, 0x52, 0x43, 0x48, 0x41, 0x42, 0x4c, 0x45, 0x5f, 0x4e, 0x4f, 0x4e, 0x45, + 0x10, 0x00, 0x12, 0x13, 0x0a, 0x0f, 0x53, 0x45, 0x41, 0x52, 0x43, 0x48, 0x41, 0x42, 0x4c, 0x45, + 0x5f, 0x43, 0x48, 0x41, 0x52, 0x10, 0x01, 0x12, 0x14, 0x0a, 0x10, 0x53, 0x45, 0x41, 0x52, 0x43, + 0x48, 0x41, 0x42, 0x4c, 0x45, 0x5f, 0x42, 0x41, 0x53, 0x49, 0x43, 0x10, 0x02, 0x12, 0x13, 0x0a, + 0x0f, 0x53, 0x45, 0x41, 0x52, 0x43, 0x48, 0x41, 0x42, 0x4c, 0x45, 0x5f, 0x46, 0x55, 0x4c, 0x4c, + 0x10, 0x03, 0x2a, 0x5c, 0x0a, 0x11, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x44, 0x65, 0x6c, 0x65, + 0x74, 0x65, 0x52, 0x75, 0x6c, 0x65, 0x73, 0x12, 0x0b, 0x0a, 0x07, 0x43, 0x41, 0x53, 0x43, 0x41, + 0x44, 0x45, 0x10, 0x00, 0x12, 0x0c, 0x0a, 0x08, 0x52, 0x45, 0x53, 0x54, 0x52, 0x49, 0x43, 0x54, + 0x10, 0x01, 0x12, 0x0c, 0x0a, 0x08, 0x53, 0x45, 0x54, 0x5f, 0x4e, 0x55, 0x4c, 0x4c, 0x10, 0x02, + 0x12, 0x0d, 0x0a, 0x09, 0x4e, 0x4f, 0x5f, 0x41, 0x43, 0x54, 0x49, 0x4f, 0x4e, 0x10, 0x03, 0x12, + 0x0f, 0x0a, 0x0b, 0x53, 0x45, 0x54, 0x5f, 0x44, 0x45, 0x46, 0x41, 0x55, 0x4c, 0x54, 0x10, 0x04, + 0x3a, 0x44, 0x0a, 0x0c, 0x65, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, + 0x12, 0x1f, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, + 0x75, 0x66, 0x2e, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, + 0x73, 0x18, 0xe8, 0x07, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0c, 0x65, 0x78, 0x70, 0x65, 0x72, 0x69, + 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x42, 0x56, 0x0a, 0x20, 0x6f, 0x72, 0x67, 0x2e, 0x61, 0x70, + 0x61, 0x63, 0x68, 0x65, 0x2e, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2e, 0x66, 0x6c, 0x69, 0x67, 0x68, + 0x74, 0x2e, 0x73, 0x71, 0x6c, 0x2e, 0x69, 0x6d, 0x70, 0x6c, 0x5a, 0x32, 0x67, 0x69, 0x74, 0x68, + 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2f, 0x61, 0x72, + 0x72, 0x6f, 0x77, 0x2f, 0x67, 0x6f, 0x2f, 0x61, 0x72, 0x72, 0x6f, 0x77, 0x2f, 0x66, 0x6c, 0x69, + 0x67, 0x68, 0x74, 0x2f, 0x67, 0x65, 0x6e, 0x2f, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x62, 0x06, + 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( @@ -5490,7 +5558,7 @@ func file_FlightSql_proto_rawDescGZIP() []byte { } var file_FlightSql_proto_enumTypes = make([]protoimpl.EnumInfo, 27) -var file_FlightSql_proto_msgTypes = make([]protoimpl.MessageInfo, 33) +var file_FlightSql_proto_msgTypes = make([]protoimpl.MessageInfo, 34) var file_FlightSql_proto_goTypes = []interface{}{ (SqlInfo)(0), // 0: arrow.flight.protocol.sql.SqlInfo (SqlSupportedTransaction)(0), // 1: arrow.flight.protocol.sql.SqlSupportedTransaction @@ -5548,23 +5616,24 @@ var file_FlightSql_proto_goTypes = []interface{}{ (*CommandPreparedStatementUpdate)(nil), // 53: arrow.flight.protocol.sql.CommandPreparedStatementUpdate 
(*CommandStatementIngest)(nil), // 54: arrow.flight.protocol.sql.CommandStatementIngest (*DoPutUpdateResult)(nil), // 55: arrow.flight.protocol.sql.DoPutUpdateResult - (*ActionCancelQueryRequest)(nil), // 56: arrow.flight.protocol.sql.ActionCancelQueryRequest - (*ActionCancelQueryResult)(nil), // 57: arrow.flight.protocol.sql.ActionCancelQueryResult - (*CommandStatementIngest_TableDefinitionOptions)(nil), // 58: arrow.flight.protocol.sql.CommandStatementIngest.TableDefinitionOptions - nil, // 59: arrow.flight.protocol.sql.CommandStatementIngest.OptionsEntry - (*descriptorpb.MessageOptions)(nil), // 60: google.protobuf.MessageOptions + (*DoPutPreparedStatementResult)(nil), // 56: arrow.flight.protocol.sql.DoPutPreparedStatementResult + (*ActionCancelQueryRequest)(nil), // 57: arrow.flight.protocol.sql.ActionCancelQueryRequest + (*ActionCancelQueryResult)(nil), // 58: arrow.flight.protocol.sql.ActionCancelQueryResult + (*CommandStatementIngest_TableDefinitionOptions)(nil), // 59: arrow.flight.protocol.sql.CommandStatementIngest.TableDefinitionOptions + nil, // 60: arrow.flight.protocol.sql.CommandStatementIngest.OptionsEntry + (*descriptorpb.MessageOptions)(nil), // 61: google.protobuf.MessageOptions } var file_FlightSql_proto_depIdxs = []int32{ 38, // 0: arrow.flight.protocol.sql.ActionCreatePreparedSubstraitPlanRequest.plan:type_name -> arrow.flight.protocol.sql.SubstraitPlan 22, // 1: arrow.flight.protocol.sql.ActionEndTransactionRequest.action:type_name -> arrow.flight.protocol.sql.ActionEndTransactionRequest.EndTransaction 23, // 2: arrow.flight.protocol.sql.ActionEndSavepointRequest.action:type_name -> arrow.flight.protocol.sql.ActionEndSavepointRequest.EndSavepoint 38, // 3: arrow.flight.protocol.sql.CommandStatementSubstraitPlan.plan:type_name -> arrow.flight.protocol.sql.SubstraitPlan - 58, // 4: arrow.flight.protocol.sql.CommandStatementIngest.table_definition_options:type_name -> arrow.flight.protocol.sql.CommandStatementIngest.TableDefinitionOptions - 59, // 5: arrow.flight.protocol.sql.CommandStatementIngest.options:type_name -> arrow.flight.protocol.sql.CommandStatementIngest.OptionsEntry + 59, // 4: arrow.flight.protocol.sql.CommandStatementIngest.table_definition_options:type_name -> arrow.flight.protocol.sql.CommandStatementIngest.TableDefinitionOptions + 60, // 5: arrow.flight.protocol.sql.CommandStatementIngest.options:type_name -> arrow.flight.protocol.sql.CommandStatementIngest.OptionsEntry 26, // 6: arrow.flight.protocol.sql.ActionCancelQueryResult.result:type_name -> arrow.flight.protocol.sql.ActionCancelQueryResult.CancelResult 24, // 7: arrow.flight.protocol.sql.CommandStatementIngest.TableDefinitionOptions.if_not_exist:type_name -> arrow.flight.protocol.sql.CommandStatementIngest.TableDefinitionOptions.TableNotExistOption 25, // 8: arrow.flight.protocol.sql.CommandStatementIngest.TableDefinitionOptions.if_exists:type_name -> arrow.flight.protocol.sql.CommandStatementIngest.TableDefinitionOptions.TableExistsOption - 60, // 9: arrow.flight.protocol.sql.experimental:extendee -> google.protobuf.MessageOptions + 61, // 9: arrow.flight.protocol.sql.experimental:extendee -> google.protobuf.MessageOptions 10, // [10:10] is the sub-list for method output_type 10, // [10:10] is the sub-list for method input_type 10, // [10:10] is the sub-list for extension type_name @@ -5927,7 +5996,7 @@ func file_FlightSql_proto_init() { } } file_FlightSql_proto_msgTypes[29].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*ActionCancelQueryRequest); i { + switch v := 
v.(*DoPutPreparedStatementResult); i { case 0: return &v.state case 1: @@ -5939,7 +6008,7 @@ func file_FlightSql_proto_init() { } } file_FlightSql_proto_msgTypes[30].Exporter = func(v interface{}, i int) interface{} { - switch v := v.(*ActionCancelQueryResult); i { + switch v := v.(*ActionCancelQueryRequest); i { case 0: return &v.state case 1: @@ -5951,6 +6020,18 @@ func file_FlightSql_proto_init() { } } file_FlightSql_proto_msgTypes[31].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*ActionCancelQueryResult); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_FlightSql_proto_msgTypes[32].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*CommandStatementIngest_TableDefinitionOptions); i { case 0: return &v.state @@ -5976,13 +6057,14 @@ func file_FlightSql_proto_init() { file_FlightSql_proto_msgTypes[22].OneofWrappers = []interface{}{} file_FlightSql_proto_msgTypes[25].OneofWrappers = []interface{}{} file_FlightSql_proto_msgTypes[27].OneofWrappers = []interface{}{} + file_FlightSql_proto_msgTypes[29].OneofWrappers = []interface{}{} type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_FlightSql_proto_rawDesc, NumEnums: 27, - NumMessages: 33, + NumMessages: 34, NumExtensions: 1, NumServices: 0, }, diff --git a/go/arrow/internal/flight_integration/scenario.go b/go/arrow/internal/flight_integration/scenario.go index 820ae806a761d..cde0fff522ec5 100644 --- a/go/arrow/internal/flight_integration/scenario.go +++ b/go/arrow/internal/flight_integration/scenario.go @@ -2134,16 +2134,16 @@ func (m *flightSqlScenarioTester) ClosePreparedStatement(_ context.Context, requ return nil } -func (m *flightSqlScenarioTester) DoPutPreparedStatementQuery(_ context.Context, cmd flightsql.PreparedStatementQuery, rdr flight.MessageReader, _ flight.MetadataWriter) error { +func (m *flightSqlScenarioTester) DoPutPreparedStatementQuery(_ context.Context, cmd flightsql.PreparedStatementQuery, rdr flight.MessageReader, _ flight.MetadataWriter) ([]byte, error){ switch string(cmd.GetPreparedStatementHandle()) { case "SELECT PREPARED STATEMENT HANDLE", "SELECT PREPARED STATEMENT WITH TXN HANDLE", "PLAN HANDLE", "PLAN WITH TXN HANDLE": actualSchema := rdr.Schema() - return assertEq(true, actualSchema.Equal(getQuerySchema())) + return cmd.GetPreparedStatementHandle(), assertEq(true, actualSchema.Equal(getQuerySchema())) } - return fmt.Errorf("%w: handle for DoPutPreparedStatementQuery '%s'", + return cmd.GetPreparedStatementHandle(), fmt.Errorf("%w: handle for DoPutPreparedStatementQuery '%s'", arrow.ErrInvalid, string(cmd.GetPreparedStatementHandle())) } diff --git a/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlClient.java b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlClient.java index e6eb28fe317e1..6fe31fae9216b 100644 --- a/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlClient.java +++ b/java/flight/flight-sql/src/main/java/org/apache/arrow/flight/sql/FlightSqlClient.java @@ -1055,7 +1055,9 @@ public FlightInfo execute(final CallOption... 
options) {
                 .toByteArray());
 
     if (parameterBindingRoot != null && parameterBindingRoot.getRowCount() > 0) {
-      putParameters(descriptor, options);
+      try (final SyncPutListener listener = putParameters(descriptor, options)) {
+        listener.getResult();
+      }
     }
 
     return client.getInfo(descriptor, options);

From 68eeef52aa78743a69844914ec984bed053bcfff Mon Sep 17 00:00:00 2001
From: mwish 
Date: Tue, 23 Apr 2024 13:29:48 +0800
Subject: [PATCH 008/106] MINOR: [C++][Parquet] Change static_pointer_cast to
 static_cast for DataPage (#41304)

### Rationale for this change

`cpp/src/parquet/column_reader.cc` uses `static_pointer_cast` to cast `current_page_` to a specific page type. This may introduce unnecessary reference counting, since `static_pointer_cast` constructs a new `shared_ptr`.

### What changes are included in this PR?

Change `static_pointer_cast` to `static_cast`.

### Are these changes tested?

No need.

### Are there any user-facing changes?

No.

Authored-by: mwish 
Signed-off-by: Gang Wu 
---
 cpp/src/parquet/column_reader.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc
index af489c70a5233..2566a989c1d21 100644
--- a/cpp/src/parquet/column_reader.cc
+++ b/cpp/src/parquet/column_reader.cc
@@ -727,13 +727,13 @@ class ColumnReaderImplBase {
       ConfigureDictionary(static_cast(current_page_.get()));
       continue;
     } else if (current_page_->type() == PageType::DATA_PAGE) {
-      const auto page = std::static_pointer_cast(current_page_);
+      const auto* page = static_cast(current_page_.get());
       const int64_t levels_byte_size = InitializeLevelDecoders(
           *page, page->repetition_level_encoding(), page->definition_level_encoding());
       InitializeDataDecoder(*page, levels_byte_size);
       return true;
     } else if (current_page_->type() == PageType::DATA_PAGE_V2) {
-      const auto page = std::static_pointer_cast(current_page_);
+      const auto* page = static_cast(current_page_.get());
       int64_t levels_byte_size = InitializeLevelDecodersV2(*page);
       InitializeDataDecoder(*page, levels_byte_size);
       return true;

From d2f140dd7540420aad7685e035f887b0cc9baf91 Mon Sep 17 00:00:00 2001
From: Rossi Sun 
Date: Wed, 24 Apr 2024 00:06:30 +0800
Subject: [PATCH 009/106] GH-40069: [C++] Make scalar scratch space immutable
 after initialization (#40237)

### Rationale for this change

As #40069 shows, TSAN reports a data race caused by concurrently filling the scratch space of the same scalar instance. Concurrent use of the same scalar can arise, e.g., when executing an Acero plan in parallel that contains a literal (a "constant" simply represented by an underlying scalar), and this is totally legitimate. The problem lies in the fact that the scratch space of the scalar is filled "lazily", at the moment the scalar is involved in a computation and transformed into an array span, by *every* thread. After piloting several approaches (relaxed atomic - an earlier version of this PR, locking - #40260), @pitrou and @bkietz suggested an immutable-after-initialization approach, which the latest version of this PR implements.

### What changes are included in this PR?

There are generally two parts in this PR:
1. Mandate the initialization of the scratch space in the constructor of each concrete subclass of `Scalar`.
2. In order to keep the content of the scratch space consistent with the underlying `value` of the scalar, make the `value` constant. (A sketch of the resulting pattern is shown below.)
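A minimal editorial sketch of that pattern (the names `ScratchSpaceHolder`, `ValueHolder`, and `ToyBinaryScalar` are invented for illustration and are not this PR's actual declarations, though the real `internal::ArraySpanFillFromScalarScratchSpace` helper in the `scalar.h` hunks further down has the same CRTP-plus-friend shape):

```cpp
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>

// The scratch space is filled exactly once, during construction. A fully
// constructed scalar is therefore never written to again, so concurrent
// readers (e.g. parallel array-span views) are safe.
template <typename Impl>
struct ScratchSpaceHolder {
  alignas(int64_t) uint8_t scratch_space_[sizeof(int64_t) * 2];

 private:
  // Base classes are initialized in declaration order, so ValueHolder (and
  // thus `value`) is already constructed when this constructor runs.
  ScratchSpaceHolder() { static_cast<Impl*>(this)->FillScratchSpace(); }
  friend Impl;
};

struct ValueHolder {
  explicit ValueHolder(std::shared_ptr<std::string> v) : value(std::move(v)) {}
  // const: the scratch space mirrors this value, so it must never change.
  const std::shared_ptr<std::string> value;
};

struct ToyBinaryScalar : public ValueHolder,
                         private ScratchSpaceHolder<ToyBinaryScalar> {
  using ValueHolder::ValueHolder;

 private:
  // Pre-computes the {0, length} offsets that a length-1 array view of this
  // scalar would need, into the immutable-after-construction scratch space.
  void FillScratchSpace() {
    const int32_t offsets[2] = {0,
                                value ? static_cast<int32_t>(value->size()) : 0};
    std::memcpy(scratch_space_, offsets, sizeof(offsets));
  }
  friend ScratchSpaceHolder<ToyBinaryScalar>;
};

int main() {
  ToyBinaryScalar s(std::make_shared<std::string>("hello"));
  // Both s.value and s's scratch space are now fixed for s's lifetime.
  return s.value->size() == 5 ? 0 : 1;
}
```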
This effectively makes legacy code that directly assigns to the `value` member invalid; such code is refactored accordingly:
2.1 `BoxScalar` in https://github.com/apache/arrow/pull/40237/files#diff-08d11e02c001c82b1aa89565e16760a8bcca4a608c22619fb45da42fd0ebebac
2.2 `Scalar::CastTo` in https://github.com/apache/arrow/pull/40237/files#diff-b4b83682450006616fa7e4f6f2ea3031cf1a22d734f4bee42a99af313e808f9e
2.3 `ScalarMinMax` in https://github.com/apache/arrow/pull/40237/files#diff-368ab7e748bd4432c92d9fdc26b51e131742b968e3eb32a6fcea4b9f02fa36aa

Besides, when refactoring 2.2, I found that the current `Scalar::CastTo` is not fully covered by the existing tests, so I also added the missing ones.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

**This PR includes breaking changes to public APIs.**
The `value` member of `BaseBinaryScalar` and subclasses/`BaseListScalar` and subclasses/`SparseUnionScalar`/`DenseUnionScalar`/`RunEndEncodedScalar` is made constant, so code that directly assigns to this member will no longer compile. Also, the `Scalar::mutable_data()` member function is removed because it is at odds with the immutable nature of `Scalar`.

However, the impact of these changes seems limited. I don't think much user code depends on these two old pieces of code. Also, after a quick search, I didn't find any documentation that needs to be updated for this change. There may be none, but if there is, please point me to it so I can update it. Thanks.

* GitHub Issue: #40069

Lead-authored-by: Ruoxi Sun 
Co-authored-by: Rossi Sun 
Signed-off-by: Benjamin Kietzman 
---
 c_glib/arrow-glib/scalar.cpp                  |   6 +-
 cpp/src/arrow/array/array_test.cc             |  36 ++
 cpp/src/arrow/array/data.cc                   |  90 ++--
 .../arrow/compute/kernels/codegen_internal.h  |  37 --
 .../arrow/compute/kernels/scalar_compare.cc   |  17 +-
 cpp/src/arrow/scalar.cc                       | 405 +++++++++++-------
 cpp/src/arrow/scalar.h                        | 213 +++++++--
 cpp/src/arrow/scalar_test.cc                  | 314 +++++++++++---
 8 files changed, 764 insertions(+), 354 deletions(-)

diff --git a/c_glib/arrow-glib/scalar.cpp b/c_glib/arrow-glib/scalar.cpp
index def6b15148355..f965b4970304b 100644
--- a/c_glib/arrow-glib/scalar.cpp
+++ b/c_glib/arrow-glib/scalar.cpp
@@ -1063,7 +1063,8 @@ garrow_base_binary_scalar_get_value(GArrowBaseBinaryScalar *scalar)
   if (!priv->value) {
     const auto arrow_scalar = std::static_pointer_cast(
       garrow_scalar_get_raw(GARROW_SCALAR(scalar)));
-    priv->value = garrow_buffer_new_raw(&(arrow_scalar->value));
+    priv->value = garrow_buffer_new_raw(
+      const_cast *>(&(arrow_scalar->value)));
   }
   return priv->value;
 }
@@ -1983,7 +1984,8 @@ garrow_base_list_scalar_get_value(GArrowBaseListScalar *scalar)
   if (!priv->value) {
     const auto arrow_scalar = std::static_pointer_cast(
       garrow_scalar_get_raw(GARROW_SCALAR(scalar)));
-    priv->value = garrow_array_new_raw(&(arrow_scalar->value));
+    priv->value = garrow_array_new_raw(
+      const_cast *>(&(arrow_scalar->value)));
   }
   return priv->value;
 }
diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc
index 60efdb47683f4..b0d7fe740a0a1 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -823,6 +824,41 @@ TEST_F(TestArray, TestFillFromScalar) {
   }
 }
 
+// GH-40069: Data-race when concurrent calling ArraySpan::FillFromScalar of the same
+// scalar instance.
+TEST_F(TestArray, TestConcurrentFillFromScalar) { + for (auto type : TestArrayUtilitiesAgainstTheseTypes()) { + ARROW_SCOPED_TRACE("type = ", type->ToString()); + for (auto seed : {0u, 0xdeadbeef, 42u}) { + ARROW_SCOPED_TRACE("seed = ", seed); + + Field field("", type, /*nullable=*/true, + key_value_metadata({{"extension_allow_random_storage", "true"}})); + auto array = random::GenerateArray(field, 1, seed); + + ASSERT_OK_AND_ASSIGN(auto scalar, array->GetScalar(0)); + + // Lambda to create fill an ArraySpan with the scalar and use the ArraySpan a bit. + auto array_span_from_scalar = [&]() { + ArraySpan span(*scalar); + auto roundtripped_array = span.ToArray(); + ASSERT_OK(roundtripped_array->ValidateFull()); + + AssertArraysEqual(*array, *roundtripped_array); + ASSERT_OK_AND_ASSIGN(auto roundtripped_scalar, roundtripped_array->GetScalar(0)); + AssertScalarsEqual(*scalar, *roundtripped_scalar); + }; + + // Two concurrent calls to the lambda are just enough for TSAN to detect a race + // condition. + auto fut1 = std::async(std::launch::async, array_span_from_scalar); + auto fut2 = std::async(std::launch::async, array_span_from_scalar); + fut1.get(); + fut2.get(); + } + } +} + TEST_F(TestArray, ExtensionSpanRoundTrip) { // Other types are checked in MakeEmptyArray but MakeEmptyArray doesn't // work for extension types so we check that here diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 80c411dfa6a6d..ff3112ec1fcc2 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -283,25 +283,15 @@ void ArraySpan::SetMembers(const ArrayData& data) { namespace { -template -BufferSpan OffsetsForScalar(uint8_t* scratch_space, offset_type value_size) { - auto* offsets = reinterpret_cast(scratch_space); - offsets[0] = 0; - offsets[1] = static_cast(value_size); - static_assert(2 * sizeof(offset_type) <= 16); - return {scratch_space, sizeof(offset_type) * 2}; +BufferSpan OffsetsForScalar(uint8_t* scratch_space, int64_t offset_width) { + return {scratch_space, offset_width * 2}; } -template std::pair OffsetsAndSizesForScalar(uint8_t* scratch_space, - offset_type value_size) { + int64_t offset_width) { auto* offsets = scratch_space; - auto* sizes = scratch_space + sizeof(offset_type); - reinterpret_cast(offsets)[0] = 0; - reinterpret_cast(sizes)[0] = value_size; - static_assert(2 * sizeof(offset_type) <= 16); - return {BufferSpan{offsets, sizeof(offset_type)}, - BufferSpan{sizes, sizeof(offset_type)}}; + auto* sizes = scratch_space + offset_width; + return {BufferSpan{offsets, offset_width}, BufferSpan{sizes, offset_width}}; } int GetNumBuffers(const DataType& type) { @@ -415,26 +405,23 @@ void ArraySpan::FillFromScalar(const Scalar& value) { data_size = scalar.value->size(); } if (is_binary_like(type_id)) { - this->buffers[1] = - OffsetsForScalar(scalar.scratch_space_, static_cast(data_size)); + const auto& binary_scalar = checked_cast(value); + this->buffers[1] = OffsetsForScalar(binary_scalar.scratch_space_, sizeof(int32_t)); } else { // is_large_binary_like - this->buffers[1] = OffsetsForScalar(scalar.scratch_space_, data_size); + const auto& large_binary_scalar = checked_cast(value); + this->buffers[1] = + OffsetsForScalar(large_binary_scalar.scratch_space_, sizeof(int64_t)); } this->buffers[2].data = const_cast(data_buffer); this->buffers[2].size = data_size; } else if (type_id == Type::BINARY_VIEW || type_id == Type::STRING_VIEW) { - const auto& scalar = checked_cast(value); + const auto& scalar = checked_cast(value); this->buffers[1].size = 
BinaryViewType::kSize; this->buffers[1].data = scalar.scratch_space_; - static_assert(sizeof(BinaryViewType::c_type) <= sizeof(scalar.scratch_space_)); - auto* view = new (&scalar.scratch_space_) BinaryViewType::c_type; if (scalar.is_valid) { - *view = util::ToBinaryView(std::string_view{*scalar.value}, 0, 0); this->buffers[2] = internal::PackVariadicBuffers({&scalar.value, 1}); - } else { - *view = {}; } } else if (type_id == Type::FIXED_SIZE_BINARY) { const auto& scalar = checked_cast(value); @@ -443,12 +430,10 @@ void ArraySpan::FillFromScalar(const Scalar& value) { } else if (is_var_length_list_like(type_id) || type_id == Type::FIXED_SIZE_LIST) { const auto& scalar = checked_cast(value); - int64_t value_length = 0; this->child_data.resize(1); if (scalar.value != nullptr) { // When the scalar is null, scalar.value can also be null this->child_data[0].SetMembers(*scalar.value->data()); - value_length = scalar.value->length(); } else { // Even when the value is null, we still must populate the // child_data to yield a valid array. Tedious @@ -456,17 +441,25 @@ void ArraySpan::FillFromScalar(const Scalar& value) { &this->child_data[0]); } - if (type_id == Type::LIST || type_id == Type::MAP) { - this->buffers[1] = - OffsetsForScalar(scalar.scratch_space_, static_cast(value_length)); + if (type_id == Type::LIST) { + const auto& list_scalar = checked_cast(value); + this->buffers[1] = OffsetsForScalar(list_scalar.scratch_space_, sizeof(int32_t)); + } else if (type_id == Type::MAP) { + const auto& map_scalar = checked_cast(value); + this->buffers[1] = OffsetsForScalar(map_scalar.scratch_space_, sizeof(int32_t)); } else if (type_id == Type::LARGE_LIST) { - this->buffers[1] = OffsetsForScalar(scalar.scratch_space_, value_length); + const auto& large_list_scalar = checked_cast(value); + this->buffers[1] = + OffsetsForScalar(large_list_scalar.scratch_space_, sizeof(int64_t)); } else if (type_id == Type::LIST_VIEW) { - std::tie(this->buffers[1], this->buffers[2]) = OffsetsAndSizesForScalar( - scalar.scratch_space_, static_cast(value_length)); - } else if (type_id == Type::LARGE_LIST_VIEW) { + const auto& list_view_scalar = checked_cast(value); std::tie(this->buffers[1], this->buffers[2]) = - OffsetsAndSizesForScalar(scalar.scratch_space_, value_length); + OffsetsAndSizesForScalar(list_view_scalar.scratch_space_, sizeof(int32_t)); + } else if (type_id == Type::LARGE_LIST_VIEW) { + const auto& large_list_view_scalar = + checked_cast(value); + std::tie(this->buffers[1], this->buffers[2]) = OffsetsAndSizesForScalar( + large_list_view_scalar.scratch_space_, sizeof(int64_t)); } else { DCHECK_EQ(type_id, Type::FIXED_SIZE_LIST); // FIXED_SIZE_LIST: does not have a second buffer @@ -480,27 +473,19 @@ void ArraySpan::FillFromScalar(const Scalar& value) { this->child_data[i].FillFromScalar(*scalar.value[i]); } } else if (is_union(type_id)) { - // Dense union needs scratch space to store both offsets and a type code - struct UnionScratchSpace { - alignas(int64_t) int8_t type_code; - alignas(int64_t) uint8_t offsets[sizeof(int32_t) * 2]; - }; - static_assert(sizeof(UnionScratchSpace) <= sizeof(UnionScalar::scratch_space_)); - auto* union_scratch_space = reinterpret_cast( - &checked_cast(value).scratch_space_); - // First buffer is kept null since unions have no validity vector this->buffers[0] = {}; - union_scratch_space->type_code = checked_cast(value).type_code; - this->buffers[1].data = reinterpret_cast(&union_scratch_space->type_code); - this->buffers[1].size = 1; - 
this->child_data.resize(this->type->num_fields()); if (type_id == Type::DENSE_UNION) { const auto& scalar = checked_cast(value); - this->buffers[2] = - OffsetsForScalar(union_scratch_space->offsets, static_cast(1)); + auto* union_scratch_space = + reinterpret_cast(&scalar.scratch_space_); + + this->buffers[1].data = reinterpret_cast(&union_scratch_space->type_code); + this->buffers[1].size = 1; + + this->buffers[2] = OffsetsForScalar(union_scratch_space->offsets, sizeof(int32_t)); // We can't "see" the other arrays in the union, but we put the "active" // union array in the right place and fill zero-length arrays for the // others @@ -517,6 +502,12 @@ void ArraySpan::FillFromScalar(const Scalar& value) { } } else { const auto& scalar = checked_cast(value); + auto* union_scratch_space = + reinterpret_cast(&scalar.scratch_space_); + + this->buffers[1].data = reinterpret_cast(&union_scratch_space->type_code); + this->buffers[1].size = 1; + // Sparse union scalars have a full complement of child values even // though only one of them is relevant, so we just fill them in here for (int i = 0; i < static_cast(this->child_data.size()); ++i) { @@ -541,7 +532,6 @@ void ArraySpan::FillFromScalar(const Scalar& value) { e.null_count = 0; e.buffers[1].data = scalar.scratch_space_; e.buffers[1].size = sizeof(run_end); - reinterpret_cast(scalar.scratch_space_)[0] = run_end; }; switch (scalar.run_end_type()->id()) { diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 72b29057b82e0..097ee1de45b6a 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -369,43 +369,6 @@ struct UnboxScalar { } }; -template -struct BoxScalar; - -template -struct BoxScalar> { - using T = typename GetOutputType::T; - static void Box(T val, Scalar* out) { - // Enables BoxScalar to work on a (for example) Time64Scalar - T* mutable_data = reinterpret_cast( - checked_cast<::arrow::internal::PrimitiveScalarBase*>(out)->mutable_data()); - *mutable_data = val; - } -}; - -template -struct BoxScalar> { - using T = typename GetOutputType::T; - using ScalarType = typename TypeTraits::ScalarType; - static void Box(T val, Scalar* out) { - checked_cast(out)->value = std::make_shared(val); - } -}; - -template <> -struct BoxScalar { - using T = Decimal128; - using ScalarType = Decimal128Scalar; - static void Box(T val, Scalar* out) { checked_cast(out)->value = val; } -}; - -template <> -struct BoxScalar { - using T = Decimal256; - using ScalarType = Decimal256Scalar; - static void Box(T val, Scalar* out) { checked_cast(out)->value = val; } -}; - // A VisitArraySpanInline variant that calls its visitor function with logical // values, such as Decimal128 rather than std::string_view. 
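Editorial aside on the `codegen_internal.h` hunk above: with `value` members now const, the removed `BoxScalar` helpers (which wrote a computed result into an already-constructed output scalar) have no possible implementation; the result scalar is instead constructed directly around its final value, as the refactored `ScalarMinMax` in the next hunk does via `MakeScalar`. A hedged before/after sketch with toy stand-ins (not Arrow's real types):

```cpp
#include <cstdint>
#include <memory>

// Toy stand-ins for Arrow's Scalar hierarchy, for illustration only.
struct Scalar {
  virtual ~Scalar() = default;
};
struct Int64Scalar : Scalar {
  explicit Int64Scalar(int64_t v) : value(v) {}
  const int64_t value;  // immutable after construction
};

// Before: write the result into a pre-made output scalar.
//   void Box(int64_t v, Int64Scalar* out) { out->value = v; }
// This no longer compiles once `value` is const.

// After: build the result scalar directly with its final value.
std::shared_ptr<Scalar> MakeInt64Scalar(int64_t v) {
  return std::make_shared<Int64Scalar>(v);
}

int main() {
  auto s = MakeInt64Scalar(42);
  return s ? 0 : 1;
}
```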
diff --git a/cpp/src/arrow/compute/kernels/scalar_compare.cc b/cpp/src/arrow/compute/kernels/scalar_compare.cc index daf8ed76d628d..9b2fd987d81d8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_compare.cc +++ b/cpp/src/arrow/compute/kernels/scalar_compare.cc @@ -491,8 +491,9 @@ template struct ScalarMinMax { using OutValue = typename GetOutputType::T; - static void ExecScalar(const ExecSpan& batch, - const ElementWiseAggregateOptions& options, Scalar* out) { + static Result> ExecScalar( + const ExecSpan& batch, const ElementWiseAggregateOptions& options, + std::shared_ptr type) { // All arguments are scalar OutValue value{}; bool valid = false; @@ -502,8 +503,8 @@ struct ScalarMinMax { const Scalar& scalar = *arg.scalar; if (!scalar.is_valid) { if (options.skip_nulls) continue; - out->is_valid = false; - return; + valid = false; + break; } if (!valid) { value = UnboxScalar::Unbox(scalar); @@ -513,9 +514,10 @@ struct ScalarMinMax { value, UnboxScalar::Unbox(scalar)); } } - out->is_valid = valid; if (valid) { - BoxScalar::Box(value, out); + return MakeScalar(std::move(type), std::move(value)); + } else { + return MakeNullScalar(std::move(type)); } } @@ -537,8 +539,7 @@ struct ScalarMinMax { bool initialize_output = true; if (scalar_count > 0) { ARROW_ASSIGN_OR_RAISE(std::shared_ptr temp_scalar, - MakeScalar(out->type()->GetSharedPtr(), 0)); - ExecScalar(batch, options, temp_scalar.get()); + ExecScalar(batch, options, out->type()->GetSharedPtr())); if (temp_scalar->is_valid) { const auto value = UnboxScalar::Unbox(*temp_scalar); initialize_output = false; diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 6996b46c8b61a..8e8d3903663e4 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -542,6 +542,12 @@ struct ScalarValidateImpl { } }; +template +void FillScalarScratchSpace(void* scratch_space, T const (&arr)[N]) { + static_assert(sizeof(arr) <= internal::kScalarScratchSpaceSize); + std::memcpy(scratch_space, arr, sizeof(arr)); +} + } // namespace size_t Scalar::hash() const { return ScalarHashImpl(*this).hash_; } @@ -557,6 +563,28 @@ Status Scalar::ValidateFull() const { BaseBinaryScalar::BaseBinaryScalar(std::string s, std::shared_ptr type) : BaseBinaryScalar(Buffer::FromString(std::move(s)), std::move(type)) {} +void BinaryScalar::FillScratchSpace() { + FillScalarScratchSpace( + scratch_space_, + {int32_t(0), value ? static_cast(value->size()) : int32_t(0)}); +} + +void BinaryViewScalar::FillScratchSpace() { + static_assert(sizeof(BinaryViewType::c_type) <= internal::kScalarScratchSpaceSize); + auto* view = new (&scratch_space_) BinaryViewType::c_type; + if (value) { + *view = util::ToBinaryView(std::string_view{*value}, 0, 0); + } else { + *view = {}; + } +} + +void LargeBinaryScalar::FillScratchSpace() { + FillScalarScratchSpace( + scratch_space_, + {int64_t(0), value ? 
static_cast(value->size()) : int64_t(0)}); +} + FixedSizeBinaryScalar::FixedSizeBinaryScalar(std::shared_ptr value, std::shared_ptr type, bool is_valid) @@ -578,21 +606,45 @@ FixedSizeBinaryScalar::FixedSizeBinaryScalar(std::string s, bool is_valid) BaseListScalar::BaseListScalar(std::shared_ptr value, std::shared_ptr type, bool is_valid) : Scalar{std::move(type), is_valid}, value(std::move(value)) { - ARROW_CHECK(this->type->field(0)->type()->Equals(this->value->type())); + if (this->value) { + ARROW_CHECK(this->type->field(0)->type()->Equals(this->value->type())); + } } ListScalar::ListScalar(std::shared_ptr value, bool is_valid) : BaseListScalar(value, list(value->type()), is_valid) {} +void ListScalar::FillScratchSpace() { + FillScalarScratchSpace( + scratch_space_, + {int32_t(0), value ? static_cast(value->length()) : int32_t(0)}); +} + LargeListScalar::LargeListScalar(std::shared_ptr value, bool is_valid) : BaseListScalar(value, large_list(value->type()), is_valid) {} +void LargeListScalar::FillScratchSpace() { + FillScalarScratchSpace(scratch_space_, + {int64_t(0), value ? value->length() : int64_t(0)}); +} + ListViewScalar::ListViewScalar(std::shared_ptr value, bool is_valid) : BaseListScalar(value, list_view(value->type()), is_valid) {} +void ListViewScalar::FillScratchSpace() { + FillScalarScratchSpace( + scratch_space_, + {int32_t(0), value ? static_cast(value->length()) : int32_t(0)}); +} + LargeListViewScalar::LargeListViewScalar(std::shared_ptr value, bool is_valid) : BaseListScalar(value, large_list_view(value->type()), is_valid) {} +void LargeListViewScalar::FillScratchSpace() { + FillScalarScratchSpace(scratch_space_, + {int64_t(0), value ? value->length() : int64_t(0)}); +} + inline std::shared_ptr MakeMapType(const std::shared_ptr& pair_type) { ARROW_CHECK_EQ(pair_type->id(), Type::STRUCT); ARROW_CHECK_EQ(pair_type->num_fields(), 2); @@ -602,11 +654,19 @@ inline std::shared_ptr MakeMapType(const std::shared_ptr& pa MapScalar::MapScalar(std::shared_ptr value, bool is_valid) : BaseListScalar(value, MakeMapType(value->type()), is_valid) {} +void MapScalar::FillScratchSpace() { + FillScalarScratchSpace( + scratch_space_, + {int32_t(0), value ? 
static_cast(value->length()) : int32_t(0)}); +} + FixedSizeListScalar::FixedSizeListScalar(std::shared_ptr value, std::shared_ptr type, bool is_valid) - : BaseListScalar(value, std::move(type), is_valid) { - ARROW_CHECK_EQ(this->value->length(), - checked_cast(*this->type).list_size()); + : BaseListScalar(std::move(value), std::move(type), is_valid) { + if (this->value) { + ARROW_CHECK_EQ(this->value->length(), + checked_cast(*this->type).list_size()); + } } FixedSizeListScalar::FixedSizeListScalar(std::shared_ptr value, bool is_valid) @@ -656,6 +716,21 @@ RunEndEncodedScalar::RunEndEncodedScalar(const std::shared_ptr& type) RunEndEncodedScalar::~RunEndEncodedScalar() = default; +void RunEndEncodedScalar::FillScratchSpace() { + auto run_end = run_end_type()->id(); + switch (run_end) { + case Type::INT16: + FillScalarScratchSpace(scratch_space_, {int16_t(1)}); + break; + case Type::INT32: + FillScalarScratchSpace(scratch_space_, {int32_t(1)}); + break; + default: + DCHECK_EQ(run_end, Type::INT64); + FillScalarScratchSpace(scratch_space_, {int64_t(1)}); + } +} + DictionaryScalar::DictionaryScalar(std::shared_ptr type) : internal::PrimitiveScalarBase(std::move(type)), value{MakeNullScalar(checked_cast(*this->type).index_type()), @@ -732,11 +807,14 @@ SparseUnionScalar::SparseUnionScalar(ValueType value, int8_t type_code, std::shared_ptr type) : UnionScalar(std::move(type), type_code, /*is_valid=*/true), value(std::move(value)) { - this->child_id = - checked_cast(*this->type).child_ids()[type_code]; + const auto child_ids = checked_cast(*this->type).child_ids(); + if (type_code >= 0 && static_cast(type_code) < child_ids.size() && + child_ids[type_code] != UnionType::kInvalidChildId) { + this->child_id = child_ids[type_code]; - // Fix nullness based on whether the selected child is null - this->is_valid = this->value[this->child_id]->is_valid; + // Fix nullness based on whether the selected child is null + this->is_valid = this->value[this->child_id]->is_valid; + } } std::shared_ptr SparseUnionScalar::FromValue(std::shared_ptr value, @@ -755,6 +833,17 @@ std::shared_ptr SparseUnionScalar::FromValue(std::shared_ptr val return std::make_shared(field_values, type_code, std::move(type)); } +void SparseUnionScalar::FillScratchSpace() { + auto* union_scratch_space = reinterpret_cast(&scratch_space_); + union_scratch_space->type_code = type_code; +} + +void DenseUnionScalar::FillScratchSpace() { + auto* union_scratch_space = reinterpret_cast(&scratch_space_); + union_scratch_space->type_code = type_code; + FillScalarScratchSpace(union_scratch_space->offsets, {int32_t(0), int32_t(1)}); +} + namespace { template @@ -969,58 +1058,72 @@ std::shared_ptr FormatToBuffer(Formatter&& formatter, const ScalarType& } // error fallback -Status CastImpl(const Scalar& from, Scalar* to) { +template +Result> CastImpl(const Scalar& from, + std::shared_ptr to_type) { return Status::NotImplemented("casting scalars of type ", *from.type, " to type ", - *to->type); + *to_type); } // numeric to numeric -template -Status CastImpl(const NumericScalar& from, NumericScalar* to) { - to->value = static_cast(from.value); - return Status::OK(); +template +enable_if_number>> CastImpl( + const NumericScalar& from, std::shared_ptr to_type) { + using ToScalar = typename TypeTraits::ScalarType; + return std::make_shared(static_cast(from.value), + std::move(to_type)); } // numeric to boolean -template -Status CastImpl(const NumericScalar& from, BooleanScalar* to) { - constexpr auto zero = static_cast(0); - to->value = from.value != zero; 
- return Status::OK(); +template +enable_if_boolean>> CastImpl( + const NumericScalar& from, std::shared_ptr to_type) { + constexpr auto zero = static_cast(0); + return std::make_shared(from.value != zero, std::move(to_type)); } // boolean to numeric -template -Status CastImpl(const BooleanScalar& from, NumericScalar* to) { - to->value = static_cast(from.value); - return Status::OK(); +template +enable_if_number>> CastImpl( + const BooleanScalar& from, std::shared_ptr to_type) { + using ToScalar = typename TypeTraits::ScalarType; + return std::make_shared(static_cast(from.value), + std::move(to_type)); } // numeric to temporal -template +template typename std::enable_if::value && !std::is_same::value && !std::is_same::value, - Status>::type -CastImpl(const NumericScalar& from, TemporalScalar* to) { - to->value = static_cast(from.value); - return Status::OK(); + Result>>::type +CastImpl(const NumericScalar& from, std::shared_ptr to_type) { + using ToScalar = typename TypeTraits::ScalarType; + return std::make_shared(static_cast(from.value), + std::move(to_type)); } // temporal to numeric -template -typename std::enable_if::value && +template +typename std::enable_if::value && + std::is_base_of::value && !std::is_same::value && !std::is_same::value, - Status>::type -CastImpl(const TemporalScalar& from, NumericScalar* to) { - to->value = static_cast(from.value); - return Status::OK(); + Result>>::type +CastImpl(const TemporalScalar& from, std::shared_ptr to_type) { + using ToScalar = typename TypeTraits::ScalarType; + return std::make_shared(static_cast(from.value), + std::move(to_type)); } // timestamp to timestamp -Status CastImpl(const TimestampScalar& from, TimestampScalar* to) { - return util::ConvertTimestampValue(from.type, to->type, from.value).Value(&to->value); +template +enable_if_timestamp>> CastImpl( + const TimestampScalar& from, std::shared_ptr to_type) { + using ToScalar = typename TypeTraits::ScalarType; + ARROW_ASSIGN_OR_RAISE(auto value, + util::ConvertTimestampValue(from.type, to_type, from.value)); + return std::make_shared(value, std::move(to_type)); } template @@ -1029,101 +1132,117 @@ std::shared_ptr AsTimestampType(const std::shared_ptr& type) } // duration to duration -Status CastImpl(const DurationScalar& from, DurationScalar* to) { - return util::ConvertTimestampValue(AsTimestampType(from.type), - AsTimestampType(to->type), from.value) - .Value(&to->value); +template +enable_if_duration>> CastImpl( + const DurationScalar& from, std::shared_ptr to_type) { + using ToScalar = typename TypeTraits::ScalarType; + ARROW_ASSIGN_OR_RAISE( + auto value, + util::ConvertTimestampValue(AsTimestampType(from.type), + AsTimestampType(to_type), from.value)); + return std::make_shared(value, std::move(to_type)); } // time to time -template -enable_if_time CastImpl(const TimeScalar& from, ToScalar* to) { - return util::ConvertTimestampValue(AsTimestampType(from.type), - AsTimestampType(to->type), from.value) - .Value(&to->value); +template +enable_if_time>> CastImpl( + const TimeScalar& from, std::shared_ptr to_type) { + using ToScalar = typename TypeTraits::ScalarType; + ARROW_ASSIGN_OR_RAISE( + auto value, util::ConvertTimestampValue(AsTimestampType(from.type), + AsTimestampType(to_type), from.value)); + return std::make_shared(value, std::move(to_type)); } constexpr int64_t kMillisecondsInDay = 86400000; // date to date -Status CastImpl(const Date32Scalar& from, Date64Scalar* to) { - to->value = from.value * kMillisecondsInDay; - return Status::OK(); +template +enable_if_t::value, 
Result>> +CastImpl(const Date32Scalar& from, std::shared_ptr to_type) { + return std::make_shared(from.value * kMillisecondsInDay, + std::move(to_type)); } -Status CastImpl(const Date64Scalar& from, Date32Scalar* to) { - to->value = static_cast(from.value / kMillisecondsInDay); - return Status::OK(); +template +enable_if_t::value, Result>> +CastImpl(const Date64Scalar& from, std::shared_ptr to_type) { + return std::make_shared( + static_cast(from.value / kMillisecondsInDay), std::move(to_type)); } // timestamp to date -Status CastImpl(const TimestampScalar& from, Date64Scalar* to) { +template +enable_if_t::value, Result>> +CastImpl(const TimestampScalar& from, std::shared_ptr to_type) { ARROW_ASSIGN_OR_RAISE( auto millis, util::ConvertTimestampValue(from.type, timestamp(TimeUnit::MILLI), from.value)); - to->value = millis - millis % kMillisecondsInDay; - return Status::OK(); + return std::make_shared(millis - millis % kMillisecondsInDay, + std::move(to_type)); } -Status CastImpl(const TimestampScalar& from, Date32Scalar* to) { +template +enable_if_t::value, Result>> +CastImpl(const TimestampScalar& from, std::shared_ptr to_type) { ARROW_ASSIGN_OR_RAISE( auto millis, util::ConvertTimestampValue(from.type, timestamp(TimeUnit::MILLI), from.value)); - to->value = static_cast(millis / kMillisecondsInDay); - return Status::OK(); + return std::make_shared(static_cast(millis / kMillisecondsInDay), + std::move(to_type)); } // date to timestamp -template -Status CastImpl(const DateScalar& from, TimestampScalar* to) { +template +enable_if_timestamp>> CastImpl( + const DateScalar& from, std::shared_ptr to_type) { + using ToScalar = typename TypeTraits::ScalarType; int64_t millis = from.value; - if (std::is_same::value) { + if (std::is_same::value) { millis *= kMillisecondsInDay; } - return util::ConvertTimestampValue(timestamp(TimeUnit::MILLI), to->type, millis) - .Value(&to->value); + ARROW_ASSIGN_OR_RAISE(auto value, util::ConvertTimestampValue( + timestamp(TimeUnit::MILLI), to_type, millis)); + return std::make_shared(value, std::move(to_type)); } // string to any -template -Status CastImpl(const StringScalar& from, ScalarType* to) { - ARROW_ASSIGN_OR_RAISE(auto out, Scalar::Parse(to->type, std::string_view(*from.value))); - to->value = std::move(checked_cast(*out).value); - return Status::OK(); +template +Result> CastImpl(const StringScalar& from, + std::shared_ptr to_type) { + using ToScalar = typename TypeTraits::ScalarType; + ARROW_ASSIGN_OR_RAISE(auto out, + Scalar::Parse(std::move(to_type), std::string_view(*from.value))); + DCHECK(checked_pointer_cast(out) != nullptr); + return std::move(out); } // binary/large binary/large string to string -template -enable_if_t && - !std::is_same::value, - Status> -CastImpl(const ScalarType& from, StringScalar* to) { - to->value = from.value; - return Status::OK(); +template +enable_if_t::value && + std::is_base_of_v && + !std::is_same::value, + Result>> +CastImpl(const From& from, std::shared_ptr to_type) { + return std::make_shared(from.value, std::move(to_type)); } // formattable to string -template , // note: Value unused but necessary to trigger SFINAE if Formatter is // undefined typename Value = typename Formatter::value_type> -Status CastImpl(const ScalarType& from, StringScalar* to) { - to->value = FormatToBuffer(Formatter{from.type.get()}, from); - return Status::OK(); -} - -Status CastImpl(const Decimal128Scalar& from, StringScalar* to) { - auto from_type = checked_cast(from.type.get()); - to->value = 
Buffer::FromString(from.value.ToString(from_type->scale())); - return Status::OK(); -} - -Status CastImpl(const Decimal256Scalar& from, StringScalar* to) { - auto from_type = checked_cast(from.type.get()); - to->value = Buffer::FromString(from.value.ToString(from_type->scale())); - return Status::OK(); +typename std::enable_if_t::value, + Result>> +CastImpl(const From& from, std::shared_ptr to_type) { + return std::make_shared(FormatToBuffer(Formatter{from.type.get()}, from), + std::move(to_type)); } -Status CastImpl(const StructScalar& from, StringScalar* to) { +// struct to string +template +typename std::enable_if_t::value, + Result>> +CastImpl(const StructScalar& from, std::shared_ptr to_type) { std::stringstream ss; ss << '{'; for (int i = 0; static_cast(i) < from.value.size(); i++) { @@ -1132,24 +1251,23 @@ Status CastImpl(const StructScalar& from, StringScalar* to) { << " = " << from.value[i]->ToString(); } ss << '}'; - to->value = Buffer::FromString(ss.str()); - return Status::OK(); + return std::make_shared(Buffer::FromString(ss.str()), std::move(to_type)); } // casts between variable-length and fixed-length list types -template -enable_if_list_type CastImpl( - const BaseListScalar& from, ToScalar* to) { - if constexpr (sizeof(typename ToScalar::TypeClass::offset_type) < sizeof(int64_t)) { - if (from.value->length() > - std::numeric_limits::max()) { +template +std::enable_if_t::value && is_list_type::value, + Result>> +CastImpl(const From& from, std::shared_ptr to_type) { + if constexpr (sizeof(typename To::offset_type) < sizeof(int64_t)) { + if (from.value->length() > std::numeric_limits::max()) { return Status::Invalid(from.type->ToString(), " too large to cast to ", - to->type->ToString()); + to_type->ToString()); } } - if constexpr (is_fixed_size_list_type::value) { - const auto& fixed_size_list_type = checked_cast(*to->type); + if constexpr (is_fixed_size_list_type::value) { + const auto& fixed_size_list_type = checked_cast(*to_type); if (from.value->length() != fixed_size_list_type.list_size()) { return Status::Invalid("Cannot cast ", from.type->ToString(), " of length ", from.value->length(), " to fixed size list of length ", @@ -1157,13 +1275,15 @@ enable_if_list_type CastImpl( } } - DCHECK_EQ(from.is_valid, to->is_valid); - to->value = from.value; - return Status::OK(); + using ToScalar = typename TypeTraits::ScalarType; + return std::make_shared(from.value, std::move(to_type), from.is_valid); } // list based types (list, large list and map (fixed sized list too)) to string -Status CastImpl(const BaseListScalar& from, StringScalar* to) { +template +typename std::enable_if_t::value, + Result>> +CastImpl(const BaseListScalar& from, std::shared_ptr to_type) { std::stringstream ss; ss << from.type->ToString() << "["; for (int64_t i = 0; i < from.value->length(); i++) { @@ -1172,11 +1292,14 @@ Status CastImpl(const BaseListScalar& from, StringScalar* to) { ss << value->ToString(); } ss << ']'; - to->value = Buffer::FromString(ss.str()); - return Status::OK(); + return std::make_shared(Buffer::FromString(ss.str()), std::move(to_type)); } -Status CastImpl(const UnionScalar& from, StringScalar* to) { +// union types to string +template +typename std::enable_if_t::value, + Result>> +CastImpl(const UnionScalar& from, std::shared_ptr to_type) { const auto& union_ty = checked_cast(*from.type); std::stringstream ss; const Scalar* selected_value; @@ -1188,8 +1311,7 @@ Status CastImpl(const UnionScalar& from, StringScalar* to) { } ss << "union{" << 
union_ty.field(union_ty.child_ids()[from.type_code])->ToString() << " = " << selected_value->ToString() << '}'; - to->value = Buffer::FromString(ss.str()); - return Status::OK(); + return std::make_shared(Buffer::FromString(ss.str()), std::move(to_type)); } struct CastImplVisitor { @@ -1199,59 +1321,49 @@ struct CastImplVisitor { const Scalar& from_; const std::shared_ptr& to_type_; - Scalar* out_; + std::shared_ptr out_ = nullptr; }; template struct FromTypeVisitor : CastImplVisitor { using ToScalar = typename TypeTraits::ScalarType; - FromTypeVisitor(const Scalar& from, const std::shared_ptr& to_type, - Scalar* out) - : CastImplVisitor{from, to_type, out} {} + FromTypeVisitor(const Scalar& from, const std::shared_ptr& to_type) + : CastImplVisitor{from, to_type} {} template Status Visit(const FromType&) { - return CastImpl(checked_cast::ScalarType&>(from_), - checked_cast(out_)); + ARROW_ASSIGN_OR_RAISE( + out_, CastImpl( + checked_cast::ScalarType&>(from_), + std::move(to_type_))); + return Status::OK(); } // identity cast only for parameter free types template typename std::enable_if_t::is_parameter_free, Status> Visit( const ToType&) { - checked_cast(out_)->value = checked_cast(from_).value; + ARROW_ASSIGN_OR_RAISE(out_, MakeScalar(std::move(to_type_), + checked_cast(from_).value)); return Status::OK(); } - Status CastFromListLike(const BaseListType& base_list_type) { - return CastImpl(checked_cast(from_), - checked_cast(out_)); - } - - Status Visit(const ListType& list_type) { return CastFromListLike(list_type); } - - Status Visit(const LargeListType& large_list_type) { - return CastFromListLike(large_list_type); - } - - Status Visit(const FixedSizeListType& fixed_size_list_type) { - return CastFromListLike(fixed_size_list_type); - } - Status Visit(const NullType&) { return NotImplemented(); } Status Visit(const DictionaryType&) { return NotImplemented(); } Status Visit(const ExtensionType&) { return NotImplemented(); } }; struct ToTypeVisitor : CastImplVisitor { - ToTypeVisitor(const Scalar& from, const std::shared_ptr& to_type, Scalar* out) - : CastImplVisitor{from, to_type, out} {} + ToTypeVisitor(const Scalar& from, const std::shared_ptr& to_type) + : CastImplVisitor{from, to_type} {} template Status Visit(const ToType&) { - FromTypeVisitor unpack_from_type{from_, to_type_, out_}; - return VisitTypeInline(*from_.type, &unpack_from_type); + FromTypeVisitor unpack_from_type{from_, to_type_}; + ARROW_RETURN_NOT_OK(VisitTypeInline(*from_.type, &unpack_from_type)); + out_ = std::move(unpack_from_type.out_); + return Status::OK(); } Status Visit(const NullType&) { @@ -1262,25 +1374,28 @@ struct ToTypeVisitor : CastImplVisitor { } Status Visit(const DictionaryType& dict_type) { - auto& out = checked_cast(out_)->value; ARROW_ASSIGN_OR_RAISE(auto cast_value, from_.CastTo(dict_type.value_type())); - ARROW_ASSIGN_OR_RAISE(out.dictionary, MakeArrayFromScalar(*cast_value, 1)); - return Int32Scalar(0).CastTo(dict_type.index_type()).Value(&out.index); + ARROW_ASSIGN_OR_RAISE(auto dictionary, MakeArrayFromScalar(*cast_value, 1)); + ARROW_ASSIGN_OR_RAISE(auto index, Int32Scalar(0).CastTo(dict_type.index_type())); + out_ = DictionaryScalar::Make(std::move(index), std::move(dictionary)); + return Status::OK(); } Status Visit(const ExtensionType&) { return NotImplemented(); } + + Result> Finish() && { + ARROW_RETURN_NOT_OK(VisitTypeInline(*to_type_, this)); + return std::move(out_); + } }; } // namespace Result> Scalar::CastTo(std::shared_ptr to) const { - std::shared_ptr out = MakeNullScalar(to); if 
(is_valid) { - out->is_valid = true; - ToTypeVisitor unpack_to_type{*this, to, out.get()}; - RETURN_NOT_OK(VisitTypeInline(*to, &unpack_to_type)); + return ToTypeVisitor{*this, std::move(to)}.Finish(); } - return out; + return MakeNullScalar(std::move(to)); } void PrintTo(const Scalar& scalar, std::ostream* os) { *os << scalar.ToString(); } diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 65c5ee4df0a04..a7ee6a417d9a1 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -131,11 +131,19 @@ struct ARROW_EXPORT NullScalar : public Scalar { namespace internal { +constexpr auto kScalarScratchSpaceSize = sizeof(int64_t) * 2; + +template struct ARROW_EXPORT ArraySpanFillFromScalarScratchSpace { // 16 bytes of scratch space to enable ArraySpan to be a view onto any // Scalar- including binary scalars where we need to create a buffer // that looks like two 32-bit or 64-bit offsets. - alignas(int64_t) mutable uint8_t scratch_space_[sizeof(int64_t) * 2]; + alignas(int64_t) mutable uint8_t scratch_space_[kScalarScratchSpaceSize]; + + private: + ArraySpanFillFromScalarScratchSpace() { static_cast(this)->FillScratchSpace(); } + + friend Impl; }; struct ARROW_EXPORT PrimitiveScalarBase : public Scalar { @@ -145,8 +153,6 @@ struct ARROW_EXPORT PrimitiveScalarBase : public Scalar { using Scalar::Scalar; /// \brief Get a const pointer to the value of this scalar. May be null. virtual const void* data() const = 0; - /// \brief Get a mutable pointer to the value of this scalar. May be null. - virtual void* mutable_data() = 0; /// \brief Get an immutable view of the value of this scalar as bytes. virtual std::string_view view() const = 0; }; @@ -167,7 +173,6 @@ struct ARROW_EXPORT PrimitiveScalar : public PrimitiveScalarBase { ValueType value{}; const void* data() const override { return &value; } - void* mutable_data() override { return &value; } std::string_view view() const override { return std::string_view(reinterpret_cast(&value), sizeof(ValueType)); }; @@ -245,34 +250,38 @@ struct ARROW_EXPORT DoubleScalar : public NumericScalar { using NumericScalar::NumericScalar; }; -struct ARROW_EXPORT BaseBinaryScalar - : public internal::PrimitiveScalarBase, - private internal::ArraySpanFillFromScalarScratchSpace { - using internal::PrimitiveScalarBase::PrimitiveScalarBase; +struct ARROW_EXPORT BaseBinaryScalar : public internal::PrimitiveScalarBase { using ValueType = std::shared_ptr; - std::shared_ptr value; + // The value is not supposed to be modified after construction, because subclasses have + // a scratch space whose content need to be kept consistent with the value. It is also + // the user of this class's responsibility to ensure that the buffer is not written to + // accidentally. + const std::shared_ptr value = NULLPTR; const void* data() const override { return value ? reinterpret_cast(value->data()) : NULLPTR; } - void* mutable_data() override { - return value ? reinterpret_cast(value->mutable_data()) : NULLPTR; - } std::string_view view() const override { return value ? 
std::string_view(*value) : std::string_view(); } + explicit BaseBinaryScalar(std::shared_ptr type) + : internal::PrimitiveScalarBase(std::move(type)) {} + BaseBinaryScalar(std::shared_ptr value, std::shared_ptr type) : internal::PrimitiveScalarBase{std::move(type), true}, value(std::move(value)) {} - friend ArraySpan; BaseBinaryScalar(std::string s, std::shared_ptr type); }; -struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar { +struct ARROW_EXPORT BinaryScalar + : public BaseBinaryScalar, + private internal::ArraySpanFillFromScalarScratchSpace { using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = BinaryType; + using ArraySpanFillFromScalarScratchSpace = + internal::ArraySpanFillFromScalarScratchSpace; explicit BinaryScalar(std::shared_ptr value) : BinaryScalar(std::move(value), binary()) {} @@ -280,6 +289,12 @@ struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar { explicit BinaryScalar(std::string s) : BaseBinaryScalar(std::move(s), binary()) {} BinaryScalar() : BinaryScalar(binary()) {} + + private: + void FillScratchSpace(); + + friend ArraySpan; + friend ArraySpanFillFromScalarScratchSpace; }; struct ARROW_EXPORT StringScalar : public BinaryScalar { @@ -294,9 +309,13 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar { StringScalar() : StringScalar(utf8()) {} }; -struct ARROW_EXPORT BinaryViewScalar : public BaseBinaryScalar { +struct ARROW_EXPORT BinaryViewScalar + : public BaseBinaryScalar, + private internal::ArraySpanFillFromScalarScratchSpace { using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = BinaryViewType; + using ArraySpanFillFromScalarScratchSpace = + internal::ArraySpanFillFromScalarScratchSpace; explicit BinaryViewScalar(std::shared_ptr value) : BinaryViewScalar(std::move(value), binary_view()) {} @@ -307,6 +326,12 @@ struct ARROW_EXPORT BinaryViewScalar : public BaseBinaryScalar { BinaryViewScalar() : BinaryViewScalar(binary_view()) {} std::string_view view() const override { return std::string_view(*this->value); } + + private: + void FillScratchSpace(); + + friend ArraySpan; + friend ArraySpanFillFromScalarScratchSpace; }; struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar { @@ -322,9 +347,13 @@ struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar { StringViewScalar() : StringViewScalar(utf8_view()) {} }; -struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar { +struct ARROW_EXPORT LargeBinaryScalar + : public BaseBinaryScalar, + private internal::ArraySpanFillFromScalarScratchSpace { using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = LargeBinaryType; + using ArraySpanFillFromScalarScratchSpace = + internal::ArraySpanFillFromScalarScratchSpace; LargeBinaryScalar(std::shared_ptr value, std::shared_ptr type) : BaseBinaryScalar(std::move(value), std::move(type)) {} @@ -336,6 +365,12 @@ struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar { : BaseBinaryScalar(std::move(s), large_binary()) {} LargeBinaryScalar() : LargeBinaryScalar(large_binary()) {} + + private: + void FillScratchSpace(); + + friend ArraySpan; + friend ArraySpanFillFromScalarScratchSpace; }; struct ARROW_EXPORT LargeStringScalar : public LargeBinaryScalar { @@ -482,10 +517,6 @@ struct ARROW_EXPORT DecimalScalar : public internal::PrimitiveScalarBase { return reinterpret_cast(value.native_endian_bytes()); } - void* mutable_data() override { - return reinterpret_cast(value.mutable_native_endian_bytes()); - } - std::string_view view() const override { return std::string_view(reinterpret_cast(value.native_endian_bytes()), 
ValueType::kByteWidth); @@ -502,54 +533,102 @@ struct ARROW_EXPORT Decimal256Scalar : public DecimalScalar; BaseListScalar(std::shared_ptr value, std::shared_ptr type, bool is_valid = true); - std::shared_ptr value; - - private: - friend struct ArraySpan; + // The value is not supposed to be modified after construction, because subclasses have + // a scratch space whose content need to be kept consistent with the value. It is also + // the user of this class's responsibility to ensure that the array is not modified + // accidentally. + const std::shared_ptr value; }; -struct ARROW_EXPORT ListScalar : public BaseListScalar { +struct ARROW_EXPORT ListScalar + : public BaseListScalar, + private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = ListType; using BaseListScalar::BaseListScalar; + using ArraySpanFillFromScalarScratchSpace = + internal::ArraySpanFillFromScalarScratchSpace; explicit ListScalar(std::shared_ptr value, bool is_valid = true); + + private: + void FillScratchSpace(); + + friend ArraySpan; + friend ArraySpanFillFromScalarScratchSpace; }; -struct ARROW_EXPORT LargeListScalar : public BaseListScalar { +struct ARROW_EXPORT LargeListScalar + : public BaseListScalar, + private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = LargeListType; using BaseListScalar::BaseListScalar; + using ArraySpanFillFromScalarScratchSpace = + internal::ArraySpanFillFromScalarScratchSpace; explicit LargeListScalar(std::shared_ptr value, bool is_valid = true); + + private: + void FillScratchSpace(); + + friend ArraySpan; + friend ArraySpanFillFromScalarScratchSpace; }; -struct ARROW_EXPORT ListViewScalar : public BaseListScalar { +struct ARROW_EXPORT ListViewScalar + : public BaseListScalar, + private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = ListViewType; using BaseListScalar::BaseListScalar; + using ArraySpanFillFromScalarScratchSpace = + internal::ArraySpanFillFromScalarScratchSpace; explicit ListViewScalar(std::shared_ptr value, bool is_valid = true); + + private: + void FillScratchSpace(); + + friend ArraySpan; + friend ArraySpanFillFromScalarScratchSpace; }; -struct ARROW_EXPORT LargeListViewScalar : public BaseListScalar { +struct ARROW_EXPORT LargeListViewScalar + : public BaseListScalar, + private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = LargeListViewType; using BaseListScalar::BaseListScalar; + using ArraySpanFillFromScalarScratchSpace = + internal::ArraySpanFillFromScalarScratchSpace; explicit LargeListViewScalar(std::shared_ptr value, bool is_valid = true); + + private: + void FillScratchSpace(); + + friend ArraySpan; + friend ArraySpanFillFromScalarScratchSpace; }; -struct ARROW_EXPORT MapScalar : public BaseListScalar { +struct ARROW_EXPORT MapScalar + : public BaseListScalar, + private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = MapType; using BaseListScalar::BaseListScalar; + using ArraySpanFillFromScalarScratchSpace = + internal::ArraySpanFillFromScalarScratchSpace; explicit MapScalar(std::shared_ptr value, bool is_valid = true); + + private: + void FillScratchSpace(); + + friend ArraySpan; + friend ArraySpanFillFromScalarScratchSpace; }; struct ARROW_EXPORT FixedSizeListScalar : public BaseListScalar { @@ -576,9 +655,10 @@ struct ARROW_EXPORT StructScalar : public Scalar { std::vector field_names); }; -struct ARROW_EXPORT UnionScalar : public Scalar, - private internal::ArraySpanFillFromScalarScratchSpace { - int8_t type_code; +struct ARROW_EXPORT UnionScalar : public Scalar { + 
// The type code is not supposed to be modified after construction, because the scratch + // space's content need to be kept consistent with it. + const int8_t type_code; virtual const std::shared_ptr& child_value() const = 0; @@ -586,17 +666,31 @@ struct ARROW_EXPORT UnionScalar : public Scalar, UnionScalar(std::shared_ptr type, int8_t type_code, bool is_valid) : Scalar(std::move(type), is_valid), type_code(type_code) {} - friend struct ArraySpan; + struct UnionScratchSpace { + alignas(int64_t) int8_t type_code; + alignas(int64_t) uint8_t offsets[sizeof(int32_t) * 2]; + }; + static_assert(sizeof(UnionScratchSpace) <= internal::kScalarScratchSpaceSize); + + friend ArraySpan; }; -struct ARROW_EXPORT SparseUnionScalar : public UnionScalar { +struct ARROW_EXPORT SparseUnionScalar + : public UnionScalar, + private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = SparseUnionType; + using ArraySpanFillFromScalarScratchSpace = + internal::ArraySpanFillFromScalarScratchSpace; // Even though only one of the union values is relevant for this scalar, we // nonetheless construct a vector of scalars, one per union value, to have // enough data to reconstruct a valid ArraySpan of length 1 from this scalar using ValueType = std::vector>; - ValueType value; + // The value is not supposed to be modified after construction, because the scratch + // space's content need to be kept consistent with the value. It is also the user of + // this class's responsibility to ensure that the scalars of the vector is not modified + // to accidentally. + const ValueType value; // The value index corresponding to the active type code int child_id; @@ -611,30 +705,56 @@ struct ARROW_EXPORT SparseUnionScalar : public UnionScalar { /// to construct a vector of scalars static std::shared_ptr FromValue(std::shared_ptr value, int field_index, std::shared_ptr type); + + private: + void FillScratchSpace(); + + friend ArraySpan; + friend ArraySpanFillFromScalarScratchSpace; }; -struct ARROW_EXPORT DenseUnionScalar : public UnionScalar { +struct ARROW_EXPORT DenseUnionScalar + : public UnionScalar, + private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = DenseUnionType; + using ArraySpanFillFromScalarScratchSpace = + internal::ArraySpanFillFromScalarScratchSpace; // For DenseUnionScalar, we can make a valid ArraySpan of length 1 from this // scalar using ValueType = std::shared_ptr; - ValueType value; + // The value is not supposed to be modified after construction, because the scratch + // space's content need to be kept consistent with the value. It is also the user of + // this class's responsibility to ensure that the elements of the vector is not modified + // accidentally. 
+ const ValueType value; const std::shared_ptr& child_value() const override { return this->value; } DenseUnionScalar(ValueType value, int8_t type_code, std::shared_ptr type) : UnionScalar(std::move(type), type_code, value->is_valid), value(std::move(value)) {} + + private: + void FillScratchSpace(); + + friend ArraySpan; + friend ArraySpanFillFromScalarScratchSpace; }; struct ARROW_EXPORT RunEndEncodedScalar : public Scalar, - private internal::ArraySpanFillFromScalarScratchSpace { + private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = RunEndEncodedType; using ValueType = std::shared_ptr; + using ArraySpanFillFromScalarScratchSpace = + internal::ArraySpanFillFromScalarScratchSpace; - ValueType value; + // The value is not supposed to be modified after construction, because the scratch + // space's contents need to be kept consistent with the value. It is also the user of + // this class's responsibility to ensure that the wrapped scalar is not modified + // accidentally. + const ValueType value; RunEndEncodedScalar(std::shared_ptr value, std::shared_ptr type); @@ -652,7 +772,10 @@ struct ARROW_EXPORT RunEndEncodedScalar private: const TypeClass& ree_type() const { return internal::checked_cast(*type); } + void FillScratchSpace(); + friend ArraySpan; + friend ArraySpanFillFromScalarScratchSpace; }; /// \brief A Scalar value for DictionaryType @@ -680,10 +803,6 @@ struct ARROW_EXPORT DictionaryScalar : public internal::PrimitiveScalarBase { const void* data() const override { return internal::checked_cast(*value.index).data(); } - void* mutable_data() override { - return internal::checked_cast(*value.index) - .mutable_data(); - } std::string_view view() const override { return internal::checked_cast(*value.index) .view(); diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 09dfde3227109..104a5697b5727 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -95,6 +95,68 @@ TEST(TestNullScalar, ValidateErrors) { AssertValidationFails(scalar); } +TEST(TestNullScalar, Cast) { + NullScalar scalar; + for (auto to_type : { + int8(), + float64(), + date32(), + time32(TimeUnit::SECOND), + timestamp(TimeUnit::SECOND), + duration(TimeUnit::SECOND), + utf8(), + large_binary(), + list(int32()), + struct_({field("f", int32())}), + map(utf8(), int32()), + decimal(12, 2), + list_view(int32()), + large_list(int32()), + dense_union({field("string", utf8()), field("number", uint64())}), + sparse_union({field("string", utf8()), field("number", uint64())}), + }) { + // Cast() function doesn't support casting null scalar, use Scalar::CastTo() instead. + ASSERT_OK_AND_ASSIGN(auto casted, scalar.CastTo(to_type)); + ASSERT_EQ(casted->type->id(), to_type->id()); + ASSERT_FALSE(casted->is_valid); + } +} + +TEST(TestBooleanScalar, Cast) { + for (auto b : {true, false}) { + BooleanScalar scalar(b); + ARROW_SCOPED_TRACE("boolean value: ", scalar.ToString()); + + // Boolean type (identity cast). + { + ASSERT_OK_AND_ASSIGN(auto casted, Cast(scalar, boolean())); + ASSERT_TRUE(casted.scalar()->Equals(scalar)) << casted.scalar()->ToString(); + } + + // Numeric types. + for (auto to_type : { + int8(), + uint16(), + int32(), + uint64(), + float32(), + float64(), + }) { + ARROW_SCOPED_TRACE("to type: ", to_type->ToString()); + ASSERT_OK_AND_ASSIGN(auto casted, Cast(scalar, to_type)); + ASSERT_EQ(casted.scalar()->type->id(), to_type->id()); + ASSERT_EQ(casted.scalar()->ToString(), std::to_string(b)); + } + + // String type.
+ { + ASSERT_OK_AND_ASSIGN(auto casted, Cast(scalar, utf8())); + ASSERT_EQ(casted.scalar()->type->id(), utf8()->id()); + ASSERT_EQ(casted.scalar()->ToString(), scalar.ToString()); + } + } +} + template class TestNumericScalar : public ::testing::Test { public: @@ -464,12 +526,23 @@ class TestDecimalScalar : public ::testing::Test { ::testing::HasSubstr("does not fit in precision of"), invalid.ValidateFull()); } + + void TestCast() { + const auto ty = std::make_shared(3, 2); + const auto pi = ScalarType(ValueType(314), ty); + + ASSERT_OK_AND_ASSIGN(auto casted, Cast(pi, utf8())); + ASSERT_TRUE(casted.scalar()->Equals(StringScalar("3.14"))) + << casted.scalar()->ToString(); + } }; TYPED_TEST_SUITE(TestDecimalScalar, DecimalArrowTypes); TYPED_TEST(TestDecimalScalar, Basics) { this->TestBasics(); } +TYPED_TEST(TestDecimalScalar, Cast) { this->TestCast(); } + TEST(TestBinaryScalar, Basics) { std::string data = "test data"; auto buf = std::make_shared(data); @@ -551,6 +624,14 @@ TEST(TestBinaryScalar, ValidateErrors) { AssertValidationFails(*null_scalar); } +TEST(TestBinaryScalar, Cast) { + BinaryScalar scalar(Buffer::FromString("test data")); + ASSERT_OK_AND_ASSIGN(auto casted, Cast(scalar, utf8())); + ASSERT_EQ(casted.scalar()->type->id(), utf8()->id()); + AssertBufferEqual(*checked_cast(*casted.scalar()).value, + *scalar.value); +} + template class TestStringScalar : public ::testing::Test { public: @@ -580,19 +661,25 @@ class TestStringScalar : public ::testing::Test { } void TestValidateErrors() { - // Inconsistent is_valid / value - ScalarType scalar(Buffer::FromString("xxx")); - scalar.is_valid = false; - AssertValidationFails(scalar); + { + // Inconsistent is_valid / value + ScalarType scalar(Buffer::FromString("xxx")); + scalar.is_valid = false; + AssertValidationFails(scalar); + } - auto null_scalar = MakeNullScalar(type_); - null_scalar->is_valid = true; - AssertValidationFails(*null_scalar); + { + auto null_scalar = MakeNullScalar(type_); + null_scalar->is_valid = true; + AssertValidationFails(*null_scalar); + } - // Invalid UTF8 - scalar = ScalarType(Buffer::FromString("\xff")); - ASSERT_OK(scalar.Validate()); - ASSERT_RAISES(Invalid, scalar.ValidateFull()); + { + // Invalid UTF8 + ScalarType scalar(Buffer::FromString("\xff")); + ASSERT_OK(scalar.Validate()); + ASSERT_RAISES(Invalid, scalar.ValidateFull()); + } } protected: @@ -676,8 +763,16 @@ TEST(TestFixedSizeBinaryScalar, ValidateErrors) { FixedSizeBinaryScalar scalar(buf, type); ASSERT_OK(scalar.ValidateFull()); - scalar.value = SliceBuffer(buf, 1); - AssertValidationFails(scalar); + ASSERT_RAISES(Invalid, MakeScalar(type, SliceBuffer(buf, 1))); +} + +TEST(TestFixedSizeBinaryScalar, Cast) { + std::string data = "test data"; + FixedSizeBinaryScalar scalar(data); + ASSERT_OK_AND_ASSIGN(auto casted, Cast(scalar, utf8())); + ASSERT_EQ(casted.scalar()->type->id(), utf8()->id()); + AssertBufferEqual(*checked_cast(*casted.scalar()).value, + *scalar.value); } TEST(TestDateScalars, Basics) { @@ -1136,24 +1231,25 @@ class TestListLikeScalar : public ::testing::Test { } void TestValidateErrors() { - ScalarType scalar(value_); - scalar.is_valid = false; - ASSERT_OK(scalar.ValidateFull()); - - // Value must be defined - scalar = ScalarType(value_); - scalar.value = nullptr; - AssertValidationFails(scalar); + { + ScalarType scalar(value_); + scalar.is_valid = false; + ASSERT_OK(scalar.ValidateFull()); + } - // Inconsistent child type - scalar = ScalarType(value_); - scalar.value = ArrayFromJSON(int32(), "[1, 2, null]"); - 
AssertValidationFails(scalar); + { + // Value must be defined + ScalarType scalar(nullptr, type_); + scalar.is_valid = true; + AssertValidationFails(scalar); + } - // Invalid UTF8 in child data - scalar = ScalarType(ArrayFromJSON(utf8(), "[null, null, \"\xff\"]")); - ASSERT_OK(scalar.Validate()); - ASSERT_RAISES(Invalid, scalar.ValidateFull()); + { + // Invalid UTF8 in child data + ScalarType scalar(ArrayFromJSON(utf8(), "[null, null, \"\xff\"]")); + ASSERT_OK(scalar.Validate()); + ASSERT_RAISES(Invalid, scalar.ValidateFull()); + } } void TestHashing() { @@ -1195,6 +1291,12 @@ class TestListLikeScalar : public ::testing::Test { auto invalid_cast_type = fixed_size_list(value_->type(), 5); CheckListCastError(scalar, invalid_cast_type); + + // Cast() function doesn't support casting list-like to string, use Scalar::CastTo() + // instead. + ASSERT_OK_AND_ASSIGN(auto casted_str, scalar.CastTo(utf8())); + ASSERT_EQ(casted_str->type->id(), utf8()->id()); + ASSERT_EQ(casted_str->ToString(), scalar.ToString()); } protected: @@ -1224,6 +1326,24 @@ TEST(TestFixedSizeListScalar, ValidateErrors) { AssertValidationFails(scalar); } +TEST(TestFixedSizeListScalar, Cast) { + const auto ty = fixed_size_list(int16(), 3); + FixedSizeListScalar scalar(ArrayFromJSON(int16(), "[1, 2, 5]"), ty); + + CheckListCast(scalar, list(int16())); + CheckListCast(scalar, large_list(int16())); + CheckListCast(scalar, fixed_size_list(int16(), 3)); + + auto invalid_cast_type = fixed_size_list(int16(), 4); + CheckListCastError(scalar, invalid_cast_type); + + // Cast() function doesn't support casting list-like to string, use Scalar::CastTo() + // instead. + ASSERT_OK_AND_ASSIGN(auto casted_str, scalar.CastTo(utf8())); + ASSERT_EQ(casted_str->type->id(), utf8()->id()); + ASSERT_EQ(casted_str->ToString(), scalar.ToString()); +} + TEST(TestMapScalar, Basics) { auto value = ArrayFromJSON(struct_({field("key", utf8(), false), field("value", int8())}), @@ -1253,6 +1373,12 @@ TEST(TestMapScalar, Cast) { auto invalid_cast_type = fixed_size_list(key_value_type, 5); CheckListCastError(scalar, invalid_cast_type); + + // Cast() function doesn't support casting map to string, use Scalar::CastTo() instead. + ASSERT_OK_AND_ASSIGN(auto casted_str, scalar.CastTo(utf8())); + ASSERT_TRUE(casted_str->Equals(StringScalar( + R"(map[{key:string = a, value:int8 = 1}, {key:string = b, value:int8 = 2}])"))) + << casted_str->ToString(); } TEST(TestStructScalar, FieldAccess) { @@ -1345,6 +1471,16 @@ TEST(TestStructScalar, ValidateErrors) { ASSERT_RAISES(Invalid, scalar.ValidateFull()); } +TEST(TestStructScalar, Cast) { + auto ty = struct_({field("i", int32()), field("s", utf8())}); + StructScalar scalar({MakeScalar(42), MakeScalar("xxx")}, ty); + + // Cast() function doesn't support casting struct to string, use Scalar::CastTo() instead.
+ ASSERT_OK_AND_ASSIGN(auto casted_str, scalar.CastTo(utf8())); + ASSERT_TRUE(casted_str->Equals(StringScalar(R"({i:int32 = 42, s:string = xxx})"))) + << casted_str->ToString(); +} + TEST(TestDictionaryScalar, Basics) { for (auto index_ty : all_dictionary_index_types()) { auto ty = dictionary(index_ty, utf8()); @@ -1534,17 +1670,41 @@ void CheckGetNullUnionScalar(const Array& arr, int64_t index) { ASSERT_FALSE(checked_cast(*scalar).child_value()->is_valid); } +std::shared_ptr MakeUnionScalar(const SparseUnionType& type, int8_t type_code, + std::shared_ptr field_value, + int field_index) { + ScalarVector field_values; + for (int i = 0; i < type.num_fields(); ++i) { + if (i == field_index) { + field_values.emplace_back(std::move(field_value)); + } else { + field_values.emplace_back(MakeNullScalar(type.field(i)->type())); + } + } + return std::make_shared(std::move(field_values), type_code, + type.GetSharedPtr()); +} + std::shared_ptr MakeUnionScalar(const SparseUnionType& type, std::shared_ptr field_value, int field_index) { - return SparseUnionScalar::FromValue(field_value, field_index, type.GetSharedPtr()); + return SparseUnionScalar::FromValue(std::move(field_value), field_index, + type.GetSharedPtr()); +} + +std::shared_ptr MakeUnionScalar(const DenseUnionType& type, int8_t type_code, + std::shared_ptr field_value, + int field_index) { + return std::make_shared(std::move(field_value), type_code, + type.GetSharedPtr()); } std::shared_ptr MakeUnionScalar(const DenseUnionType& type, std::shared_ptr field_value, int field_index) { int8_t type_code = type.type_codes()[field_index]; - return std::make_shared(field_value, type_code, type.GetSharedPtr()); + return std::make_shared(std::move(field_value), type_code, + type.GetSharedPtr()); } std::shared_ptr MakeSpecificNullScalar(const DenseUnionType& type, @@ -1592,7 +1752,13 @@ class TestUnionScalar : public ::testing::Test { std::shared_ptr ScalarFromValue(int field_index, std::shared_ptr field_value) { - return MakeUnionScalar(*union_type_, field_value, field_index); + return MakeUnionScalar(*union_type_, std::move(field_value), field_index); + } + + std::shared_ptr ScalarFromTypeCodeAndValue(int8_t type_code, + std::shared_ptr field_value, + int field_index) { + return MakeUnionScalar(*union_type_, type_code, std::move(field_value), field_index); } std::shared_ptr SpecificNull(int field_index) { @@ -1610,40 +1776,48 @@ class TestUnionScalar : public ::testing::Test { } void TestValidateErrors() { - // Type code doesn't exist - auto scalar = ScalarFromValue(0, alpha_); - UnionScalar* union_scalar = static_cast(scalar.get()); - - // Invalid type code - union_scalar->type_code = 0; - AssertValidationFails(*union_scalar); + { + // Invalid type code + auto scalar = ScalarFromTypeCodeAndValue(0, alpha_, 0); + AssertValidationFails(*scalar); + } - union_scalar->is_valid = false; - AssertValidationFails(*union_scalar); + { + auto scalar = ScalarFromTypeCodeAndValue(0, alpha_, 0); + scalar->is_valid = false; + AssertValidationFails(*scalar); + } - union_scalar->type_code = -42; - union_scalar->is_valid = true; - AssertValidationFails(*union_scalar); + { + auto scalar = ScalarFromTypeCodeAndValue(-42, alpha_, 0); + AssertValidationFails(*scalar); + } - union_scalar->is_valid = false; - AssertValidationFails(*union_scalar); + { + auto scalar = ScalarFromTypeCodeAndValue(-42, alpha_, 0); + scalar->is_valid = false; + AssertValidationFails(*scalar); + } // Type code doesn't correspond to child type if (type_->id() == ::arrow::Type::DENSE_UNION) { - 
union_scalar->type_code = 42; - union_scalar->is_valid = true; - AssertValidationFails(*union_scalar); - - scalar = ScalarFromValue(2, two_); - union_scalar = static_cast(scalar.get()); - union_scalar->type_code = 3; - AssertValidationFails(*union_scalar); + { + auto scalar = ScalarFromTypeCodeAndValue(42, alpha_, 0); + AssertValidationFails(*scalar); + } + + { + auto scalar = ScalarFromTypeCodeAndValue(3, two_, 2); + AssertValidationFails(*scalar); + } } - // underlying value has invalid UTF8 - scalar = ScalarFromValue(0, std::make_shared("\xff")); - ASSERT_OK(scalar->Validate()); - ASSERT_RAISES(Invalid, scalar->ValidateFull()); + { + // underlying value has invalid UTF8 + auto scalar = ScalarFromValue(0, std::make_shared("\xff")); + ASSERT_OK(scalar->Validate()); + ASSERT_RAISES(Invalid, scalar->ValidateFull()); + } } void TestEquals() { @@ -1680,6 +1854,14 @@ class TestUnionScalar : public ::testing::Test { } } + void TestCast() { + // Cast() function doesn't support casting union to string, use Scalar::CastTo() + // instead. + ASSERT_OK_AND_ASSIGN(auto casted, union_alpha_->CastTo(utf8())); + ASSERT_TRUE(casted->Equals(StringScalar(R"(union{string: string = alpha})"))) + << casted->ToString(); + } + protected: std::shared_ptr type_; const UnionType* union_type_; @@ -1698,6 +1880,8 @@ TYPED_TEST(TestUnionScalar, Equals) { this->TestEquals(); } TYPED_TEST(TestUnionScalar, MakeNullScalar) { this->TestMakeNullScalar(); } +TYPED_TEST(TestUnionScalar, Cast) { this->TestCast(); } + class TestSparseUnionScalar : public TestUnionScalar {}; TEST_F(TestSparseUnionScalar, GetScalar) { @@ -1974,14 +2158,14 @@ TEST_F(TestExtensionScalar, ValidateErrors) { scalar.is_valid = false; ASSERT_OK(scalar.ValidateFull()); - // Invalid storage scalar (wrong length) - std::shared_ptr invalid_storage = MakeNullScalar(storage_type_); - invalid_storage->is_valid = true; - static_cast(invalid_storage.get())->value = - std::make_shared("123"); - AssertValidationFails(*invalid_storage); + // Invalid storage scalar (invalid UTF8) + ASSERT_OK_AND_ASSIGN(std::shared_ptr invalid_storage, + MakeScalar(utf8(), std::make_shared("\xff"))); + ASSERT_OK(invalid_storage->Validate()); + ASSERT_RAISES(Invalid, invalid_storage->ValidateFull()); scalar = ExtensionScalar(invalid_storage, type_); - AssertValidationFails(scalar); + ASSERT_OK(scalar.Validate()); + ASSERT_RAISES(Invalid, scalar.ValidateFull()); } } // namespace arrow From fb7e4682ca5c19ec49997842650f80a453e0c1d1 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 23 Apr 2024 14:01:51 -0300 Subject: [PATCH 010/106] GH-41353: [C++] Define bit_width and byte_width of ExtensionType in terms of the storage type (#41354) ### Rationale for this change Users and other classes within Arrow itself (e.g. array builders) expect extension types to behave like their underlying storage type. As it is now, `ExtensionType::bit_width()` is the default `DataType::bit_width()` implementation, which returns `-1`. It should return the storage type's bit-width. ### What changes are included in this PR? Definition of `ExtensionType::bit_width/byte_width` functions. ### Are these changes tested? Tests added and confirmed to fail prior to these changes. ### Are there any user-facing changes? `ExtensionType` now defines `bit_width` and `byte_width` according to its storage type.
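As an illustrative aside (not part of this patch): with the overrides in place, a custom extension type inherits the widths of its storage type without any extra code. The `Uuid16Type` class and standalone `main` harness below are hypothetical stand-ins for the `uuid()` type used by the tests, sketched against the standard `ExtensionType` interface:

```cpp
#include <cassert>
#include <memory>
#include <string>

#include "arrow/extension_type.h"
#include "arrow/type.h"

// Hypothetical 16-byte extension type backed by fixed_size_binary(16),
// mirroring the uuid() type exercised in extension_type_test.cc.
class Uuid16Type : public arrow::ExtensionType {
 public:
  Uuid16Type() : arrow::ExtensionType(arrow::fixed_size_binary(16)) {}

  std::string extension_name() const override { return "uuid16"; }

  bool ExtensionEquals(const arrow::ExtensionType& other) const override {
    return other.extension_name() == extension_name();
  }

  std::shared_ptr<arrow::Array> MakeArray(
      std::shared_ptr<arrow::ArrayData> data) const override {
    return std::make_shared<arrow::ExtensionArray>(std::move(data));
  }

  arrow::Result<std::shared_ptr<arrow::DataType>> Deserialize(
      std::shared_ptr<arrow::DataType> storage_type,
      const std::string& serialized) const override {
    // Validation of storage_type/serialized elided for brevity.
    return std::make_shared<Uuid16Type>();
  }

  std::string Serialize() const override { return ""; }
};

int main() {
  Uuid16Type type;
  // Before this patch both calls fell through to the DataType defaults and
  // returned -1; they now forward to the fixed_size_binary(16) storage type.
  assert(type.bit_width() == 128);
  assert(type.byte_width() == 16);
  return 0;
}
```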
* GitHub Issue: #41353 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/extension_type.h | 3 +++ cpp/src/arrow/extension_type_test.cc | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/cpp/src/arrow/extension_type.h b/cpp/src/arrow/extension_type.h index 0fd7216f1820b..b3f085198be69 100644 --- a/cpp/src/arrow/extension_type.h +++ b/cpp/src/arrow/extension_type.h @@ -54,6 +54,9 @@ class ARROW_EXPORT ExtensionType : public DataType { std::string name() const override { return "extension"; } + int32_t byte_width() const override { return storage_type_->byte_width(); } + int bit_width() const override { return storage_type_->bit_width(); } + /// \brief Unique name of extension type used to identify type for /// serialization /// \return the string name of the extension diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index 31222d74806f4..f104c984a64b4 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -196,6 +196,8 @@ TEST_F(TestExtensionType, ExtensionTypeTest) { auto type = uuid(); ASSERT_EQ(type->id(), Type::EXTENSION); + ASSERT_EQ(type->bit_width(), 128); + ASSERT_EQ(type->byte_width(), 16); const auto& ext_type = static_cast(*type); std::string serialized = ext_type.Serialize(); @@ -204,6 +206,9 @@ TEST_F(TestExtensionType, ExtensionTypeTest) { ext_type.Deserialize(fixed_size_binary(16), serialized)); ASSERT_TRUE(deserialized->Equals(*type)); ASSERT_FALSE(deserialized->Equals(*fixed_size_binary(16))); + ASSERT_EQ(deserialized->id(), Type::EXTENSION); + ASSERT_EQ(deserialized->bit_width(), 128); + ASSERT_EQ(deserialized->byte_width(), 16); } auto RoundtripBatch = [](const std::shared_ptr& batch, From f1bc82f2b39a317970427052c360383f983ec3f8 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 23 Apr 2024 13:31:26 -0400 Subject: [PATCH 011/106] MINOR: [R] refactor: move aggregation function bindings to their own file (#41355) For consistency with other bindings, and to allow `dplyr-summarize.R` to start with the summarize method, as do the other dplyr verb files. --- r/DESCRIPTION | 1 + r/R/dplyr-funcs-agg.R | 198 ++++++++++++++++++++++++++++++++++++++++++ r/R/dplyr-funcs.R | 16 +++- r/R/dplyr-summarize.R | 195 ----------------------------------------- 4 files changed, 213 insertions(+), 197 deletions(-) create mode 100644 r/R/dplyr-funcs-agg.R diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 2efaed4d6ca0a..eeff8168b361c 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -107,6 +107,7 @@ Collate: 'dplyr-distinct.R' 'dplyr-eval.R' 'dplyr-filter.R' + 'dplyr-funcs-agg.R' 'dplyr-funcs-augmented.R' 'dplyr-funcs-conditional.R' 'dplyr-funcs-datetime.R' diff --git a/r/R/dplyr-funcs-agg.R b/r/R/dplyr-funcs-agg.R new file mode 100644 index 0000000000000..ab1df1d2f15a5 --- /dev/null +++ b/r/R/dplyr-funcs-agg.R @@ -0,0 +1,198 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Aggregation functions +# +# These all insert into an ..aggregations list (in a parent frame) a list containing: +# @param fun string function name +# @param data list of 0 or more Expressions +# @param options list of function options, as passed to call_function +# The functions return a FieldRef pointing to the result of the aggregation. +# +# For group-by aggregation, `hash_` gets prepended to the function name when +# the query is executed. +# So to see a list of available hash aggregation functions, +# you can use list_compute_functions("^hash_") + +register_bindings_aggregate <- function() { + register_binding_agg("base::sum", function(..., na.rm = FALSE) { + set_agg( + fun = "sum", + data = ensure_one_arg(list2(...), "sum"), + options = list(skip_nulls = na.rm, min_count = 0L) + ) + }) + register_binding_agg("base::prod", function(..., na.rm = FALSE) { + set_agg( + fun = "product", + data = ensure_one_arg(list2(...), "prod"), + options = list(skip_nulls = na.rm, min_count = 0L) + ) + }) + register_binding_agg("base::any", function(..., na.rm = FALSE) { + set_agg( + fun = "any", + data = ensure_one_arg(list2(...), "any"), + options = list(skip_nulls = na.rm, min_count = 0L) + ) + }) + register_binding_agg("base::all", function(..., na.rm = FALSE) { + set_agg( + fun = "all", + data = ensure_one_arg(list2(...), "all"), + options = list(skip_nulls = na.rm, min_count = 0L) + ) + }) + register_binding_agg("base::mean", function(x, na.rm = FALSE) { + set_agg( + fun = "mean", + data = list(x), + options = list(skip_nulls = na.rm, min_count = 0L) + ) + }) + register_binding_agg("stats::sd", function(x, na.rm = FALSE, ddof = 1) { + set_agg( + fun = "stddev", + data = list(x), + options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) + ) + }) + register_binding_agg("stats::var", function(x, na.rm = FALSE, ddof = 1) { + set_agg( + fun = "variance", + data = list(x), + options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) + ) + }) + register_binding_agg( + "stats::quantile", + function(x, probs, na.rm = FALSE) { + if (length(probs) != 1) { + arrow_not_supported("quantile() with length(probs) != 1") + } + # TODO: Bind to the Arrow function that returns an exact quantile and remove + # this warning (ARROW-14021) + warn( + "quantile() currently returns an approximate quantile in Arrow", + .frequency = "once", + .frequency_id = "arrow.quantile.approximate", + class = "arrow.quantile.approximate" + ) + set_agg( + fun = "tdigest", + data = list(x), + options = list(skip_nulls = na.rm, q = probs) + ) + }, + notes = c( + "`probs` must be length 1;", + "approximate quantile (t-digest) is computed" + ) + ) + register_binding_agg( + "stats::median", + function(x, na.rm = FALSE) { + # TODO: Bind to the Arrow function that returns an exact median and remove + # this warning (ARROW-14021) + warn( + "median() currently returns an approximate median in Arrow", + .frequency = "once", + .frequency_id = "arrow.median.approximate", + class = "arrow.median.approximate" + ) + set_agg( + fun = "approximate_median", + data = list(x), + options = list(skip_nulls = na.rm) + ) + }, + notes = 
"approximate median (t-digest) is computed" + ) + register_binding_agg("dplyr::n_distinct", function(..., na.rm = FALSE) { + set_agg( + fun = "count_distinct", + data = ensure_one_arg(list2(...), "n_distinct"), + options = list(na.rm = na.rm) + ) + }) + register_binding_agg("dplyr::n", function() { + set_agg( + fun = "count_all", + data = list(), + options = list() + ) + }) + register_binding_agg("base::min", function(..., na.rm = FALSE) { + set_agg( + fun = "min", + data = ensure_one_arg(list2(...), "min"), + options = list(skip_nulls = na.rm, min_count = 0L) + ) + }) + register_binding_agg("base::max", function(..., na.rm = FALSE) { + set_agg( + fun = "max", + data = ensure_one_arg(list2(...), "max"), + options = list(skip_nulls = na.rm, min_count = 0L) + ) + }) +} + +set_agg <- function(...) { + agg_data <- list2(...) + # Find the environment where ..aggregations is stored + target <- find_aggregations_env() + aggs <- get("..aggregations", target) + lapply(agg_data[["data"]], function(expr) { + # If any of the fields referenced in the expression are in ..aggregations, + # then we can't aggregate over them. + # This is mainly for combinations of dataset columns and aggregations, + # like sum(x - mean(x)), i.e. window functions. + # This will reject (sum(sum(x)) as well, but that's not a useful operation. + if (any(expr$field_names_in_expression() %in% names(aggs))) { + # TODO: support in ARROW-13926 + arrow_not_supported("aggregate within aggregate expression") + } + }) + + # Record the (fun, data, options) in ..aggregations + # and return a FieldRef pointing to it + tmpname <- paste0("..temp", length(aggs)) + aggs[[tmpname]] <- agg_data + assign("..aggregations", aggs, envir = target) + Expression$field_ref(tmpname) +} + +find_aggregations_env <- function() { + # Find the environment where ..aggregations is stored, + # it's in parent.env of something in the call stack + for (f in sys.frames()) { + if (exists("..aggregations", envir = f)) { + return(f) + } + } + stop("Could not find ..aggregations") +} + +ensure_one_arg <- function(args, fun) { + if (length(args) == 0) { + arrow_not_supported(paste0(fun, "() with 0 arguments")) + } else if (length(args) > 1) { + arrow_not_supported(paste0("Multiple arguments to ", fun, "()")) + } + args +} diff --git a/r/R/dplyr-funcs.R b/r/R/dplyr-funcs.R index 956e31fe2bfdd..abf2362d0107f 100644 --- a/r/R/dplyr-funcs.R +++ b/r/R/dplyr-funcs.R @@ -175,8 +175,7 @@ agg_funcs <- new.env(parent = emptyenv()) .cache <- new.env(parent = emptyenv()) # we register 2 versions of the "::" binding - one for use with nse_funcs -# (registered below) and another one for use with agg_funcs (registered in -# dplyr-summarize.R) +# and another one for use with agg_funcs (registered in dplyr-funcs-agg.R) nse_funcs[["::"]] <- function(lhs, rhs) { lhs_name <- as.character(substitute(lhs)) rhs_name <- as.character(substitute(rhs)) @@ -187,3 +186,16 @@ nse_funcs[["::"]] <- function(lhs, rhs) { # regular pkg::fun function nse_funcs[[fun_name]] %||% asNamespace(lhs_name)[[rhs_name]] } + +agg_funcs[["::"]] <- function(lhs, rhs) { + lhs_name <- as.character(substitute(lhs)) + rhs_name <- as.character(substitute(rhs)) + + fun_name <- paste0(lhs_name, "::", rhs_name) + + # if we do not have a binding for pkg::fun, then fall back on to the + # nse_funcs (useful when we have a regular function inside an aggregating one) + # and then, if searching nse_funcs fails too, fall back to the + # regular `pkg::fun()` function + agg_funcs[[fun_name]] %||% nse_funcs[[fun_name]] %||% 
asNamespace(lhs_name)[[rhs_name]] +} diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 1b625e34ad606..5bb81dc2b34fc 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -15,201 +15,6 @@ # specific language governing permissions and limitations # under the License. -# Aggregation functions -# These all return a list of: -# @param fun string function name -# @param data list of 0 or more Expressions -# @param options list of function options, as passed to call_function -# For group-by aggregation, `hash_` gets prepended to the function name. -# So to see a list of available hash aggregation functions, -# you can use list_compute_functions("^hash_") - - -ensure_one_arg <- function(args, fun) { - if (length(args) == 0) { - arrow_not_supported(paste0(fun, "() with 0 arguments")) - } else if (length(args) > 1) { - arrow_not_supported(paste0("Multiple arguments to ", fun, "()")) - } - args -} - -register_bindings_aggregate <- function() { - register_binding_agg("base::sum", function(..., na.rm = FALSE) { - set_agg( - fun = "sum", - data = ensure_one_arg(list2(...), "sum"), - options = list(skip_nulls = na.rm, min_count = 0L) - ) - }) - register_binding_agg("base::prod", function(..., na.rm = FALSE) { - set_agg( - fun = "product", - data = ensure_one_arg(list2(...), "prod"), - options = list(skip_nulls = na.rm, min_count = 0L) - ) - }) - register_binding_agg("base::any", function(..., na.rm = FALSE) { - set_agg( - fun = "any", - data = ensure_one_arg(list2(...), "any"), - options = list(skip_nulls = na.rm, min_count = 0L) - ) - }) - register_binding_agg("base::all", function(..., na.rm = FALSE) { - set_agg( - fun = "all", - data = ensure_one_arg(list2(...), "all"), - options = list(skip_nulls = na.rm, min_count = 0L) - ) - }) - register_binding_agg("base::mean", function(x, na.rm = FALSE) { - set_agg( - fun = "mean", - data = list(x), - options = list(skip_nulls = na.rm, min_count = 0L) - ) - }) - register_binding_agg("stats::sd", function(x, na.rm = FALSE, ddof = 1) { - set_agg( - fun = "stddev", - data = list(x), - options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) - ) - }) - register_binding_agg("stats::var", function(x, na.rm = FALSE, ddof = 1) { - set_agg( - fun = "variance", - data = list(x), - options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) - ) - }) - register_binding_agg( - "stats::quantile", - function(x, probs, na.rm = FALSE) { - if (length(probs) != 1) { - arrow_not_supported("quantile() with length(probs) != 1") - } - # TODO: Bind to the Arrow function that returns an exact quantile and remove - # this warning (ARROW-14021) - warn( - "quantile() currently returns an approximate quantile in Arrow", - .frequency = "once", - .frequency_id = "arrow.quantile.approximate", - class = "arrow.quantile.approximate" - ) - set_agg( - fun = "tdigest", - data = list(x), - options = list(skip_nulls = na.rm, q = probs) - ) - }, - notes = c( - "`probs` must be length 1;", - "approximate quantile (t-digest) is computed" - ) - ) - register_binding_agg( - "stats::median", - function(x, na.rm = FALSE) { - # TODO: Bind to the Arrow function that returns an exact median and remove - # this warning (ARROW-14021) - warn( - "median() currently returns an approximate median in Arrow", - .frequency = "once", - .frequency_id = "arrow.median.approximate", - class = "arrow.median.approximate" - ) - set_agg( - fun = "approximate_median", - data = list(x), - options = list(skip_nulls = na.rm) - ) - }, - notes = "approximate median (t-digest) is computed" - ) - 
register_binding_agg("dplyr::n_distinct", function(..., na.rm = FALSE) { - set_agg( - fun = "count_distinct", - data = ensure_one_arg(list2(...), "n_distinct"), - options = list(na.rm = na.rm) - ) - }) - register_binding_agg("dplyr::n", function() { - set_agg( - fun = "count_all", - data = list(), - options = list() - ) - }) - register_binding_agg("base::min", function(..., na.rm = FALSE) { - set_agg( - fun = "min", - data = ensure_one_arg(list2(...), "min"), - options = list(skip_nulls = na.rm, min_count = 0L) - ) - }) - register_binding_agg("base::max", function(..., na.rm = FALSE) { - set_agg( - fun = "max", - data = ensure_one_arg(list2(...), "max"), - options = list(skip_nulls = na.rm, min_count = 0L) - ) - }) -} - -set_agg <- function(...) { - agg_data <- list2(...) - # Find the environment where ..aggregations is stored - target <- find_aggregations_env() - aggs <- get("..aggregations", target) - lapply(agg_data[["data"]], function(expr) { - # If any of the fields referenced in the expression are in ..aggregations, - # then we can't aggregate over them. - # This is mainly for combinations of dataset columns and aggregations, - # like sum(x - mean(x)), i.e. window functions. - # This will reject (sum(sum(x)) as well, but that's not a useful operation. - if (any(expr$field_names_in_expression() %in% names(aggs))) { - # TODO: support in ARROW-13926 - arrow_not_supported("aggregate within aggregate expression") - } - }) - - # Record the (fun, data, options) in ..aggregations - # and return a FieldRef pointing to it - tmpname <- paste0("..temp", length(aggs)) - aggs[[tmpname]] <- agg_data - assign("..aggregations", aggs, envir = target) - Expression$field_ref(tmpname) -} - -find_aggregations_env <- function() { - # Find the environment where ..aggregations is stored, - # it's in parent.env of something in the call stack - for (f in sys.frames()) { - if (exists("..aggregations", envir = f)) { - return(f) - } - } - stop("Could not find ..aggregations") -} - -# we register 2 versions of the "::" binding - one for use with agg_funcs -# (registered below) and another one for use with nse_funcs -# (registered in dplyr-funcs.R) -agg_funcs[["::"]] <- function(lhs, rhs) { - lhs_name <- as.character(substitute(lhs)) - rhs_name <- as.character(substitute(rhs)) - - fun_name <- paste0(lhs_name, "::", rhs_name) - - # if we do not have a binding for pkg::fun, then fall back on to the - # nse_funcs (useful when we have a regular function inside an aggregating one) - # and then, if searching nse_funcs fails too, fall back to the - # regular `pkg::fun()` function - agg_funcs[[fun_name]] %||% nse_funcs[[fun_name]] %||% asNamespace(lhs_name)[[rhs_name]] -} - # The following S3 methods are registered on load if dplyr is present summarise.arrow_dplyr_query <- function(.data, ..., .by = NULL, .groups = NULL) { From a78760f995b11d3f14c035696fc567e019321243 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 23 Apr 2024 15:03:49 -0300 Subject: [PATCH 012/106] GH-41114: [C++] Add is_validity_defined_by_bitmap() predicate (#41115) ### Rationale for this change To make it easier to find bugs that are very likely to be silent in the codebas because users rarely use unions and REE types. ### What changes are included in this PR? Adding the type predicate and two usages in `builder_nested.h`. ### Are these changes tested? By the compilation process, since they are both `static_asserts`. 
* GitHub Issue: #41114 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/array/array_test.cc | 7 ++++--- cpp/src/arrow/array/builder_nested.h | 18 +++++++----------- cpp/src/arrow/array/concatenate.cc | 2 +- cpp/src/arrow/array/data.cc | 6 +++--- cpp/src/arrow/array/data.h | 1 + cpp/src/arrow/array/util.cc | 2 +- cpp/src/arrow/array/validate.cc | 2 +- cpp/src/arrow/c/bridge.cc | 2 +- cpp/src/arrow/c/bridge_test.cc | 2 +- cpp/src/arrow/compute/exec.cc | 2 +- cpp/src/arrow/integration/json_internal.cc | 2 +- cpp/src/arrow/ipc/metadata_internal.cc | 5 +++-- cpp/src/arrow/type.h | 5 ++++- 13 files changed, 29 insertions(+), 27 deletions(-) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index b0d7fe740a0a1..af64908b59582 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -604,11 +604,11 @@ void AssertAppendScalar(MemoryPool* pool, const std::shared_ptr& scalar) ASSERT_EQ(out->length(), 9); auto out_type_id = out->type()->id(); - const bool has_validity = internal::HasValidityBitmap(out_type_id); + const bool can_check_nulls = internal::may_have_validity_bitmap(out_type_id); // For a dictionary builder, the output dictionary won't necessarily be the same const bool can_check_values = !is_dictionary(out_type_id); - if (has_validity) { + if (can_check_nulls) { ASSERT_EQ(out->null_count(), 4); } else { ASSERT_EQ(out->null_count(), 0); @@ -891,7 +891,8 @@ TEST_F(TestArray, TestAppendArraySlice) { span.SetMembers(*nulls->data()); ASSERT_OK(builder->AppendArraySlice(span, 0, 4)); ASSERT_EQ(12, builder->length()); - const bool has_validity_bitmap = internal::HasValidityBitmap(scalar->type->id()); + const bool has_validity_bitmap = + internal::may_have_validity_bitmap(scalar->type->id()); if (has_validity_bitmap) { ASSERT_EQ(4, builder->null_count()); } diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 2c8c41c365f6a..9f7b0fcdbce07 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -181,13 +181,11 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { if constexpr (is_list_view(TYPE::type_id)) { sizes = array.GetValues(2); } - const bool all_valid = !array.MayHaveLogicalNulls(); - const uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR; + static_assert(internal::may_have_validity_bitmap(TYPE::type_id)); + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; ARROW_RETURN_NOT_OK(Reserve(length)); for (int64_t row = offset; row < offset + length; row++) { - const bool is_valid = - all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) || - array.IsValid(row); + const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row); int64_t size = 0; if (is_valid) { if constexpr (is_list_view(TYPE::type_id)) { @@ -569,13 +567,11 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { - const int32_t* offsets = array.GetValues(1); - const bool all_valid = !array.MayHaveLogicalNulls(); - const uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR; + const auto* offsets = array.GetValues(1); + static_assert(internal::may_have_validity_bitmap(MapType::type_id)); + const uint8_t* validity = array.MayHaveNulls() ? 
array.buffers[0].data : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { - const bool is_valid = - all_valid || (validity && bit_util::GetBit(validity, array.offset + row)) || - array.IsValid(row); + const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row); if (is_valid) { ARROW_RETURN_NOT_OK(Append()); const int64_t slot_length = offsets[row + 1] - offsets[row]; diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index ff9ed66d1149f..44d58cc0bdebc 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -317,7 +317,7 @@ class ConcatenateImpl { } Status Concatenate(std::shared_ptr* out) && { - if (out_->null_count != 0 && internal::HasValidityBitmap(out_->type->id())) { + if (out_->null_count != 0 && internal::may_have_validity_bitmap(out_->type->id())) { RETURN_NOT_OK(ConcatenateBitmaps(Bitmaps(0), pool_, &out_->buffers[0])); } RETURN_NOT_OK(VisitTypeInline(*out_->type, this)); diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index ff3112ec1fcc2..ac828a9c35c67 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -53,7 +53,7 @@ static inline void AdjustNonNullable(Type::type type_id, int64_t length, if (type_id == Type::NA) { *null_count = length; (*buffers)[0] = nullptr; - } else if (internal::HasValidityBitmap(type_id)) { + } else if (internal::may_have_validity_bitmap(type_id)) { if (*null_count == 0) { // In case there are no nulls, don't keep an allocated null bitmap around (*buffers)[0] = nullptr; @@ -335,7 +335,7 @@ void FillZeroLengthArray(const DataType* type, ArraySpan* span) { span->buffers[i].size = 0; } - if (!HasValidityBitmap(type->id())) { + if (!may_have_validity_bitmap(type->id())) { span->buffers[0] = {}; } @@ -370,7 +370,7 @@ void ArraySpan::FillFromScalar(const Scalar& value) { if (type_id == Type::NA) { this->null_count = 1; - } else if (!internal::HasValidityBitmap(type_id)) { + } else if (!internal::may_have_validity_bitmap(type_id)) { this->null_count = 0; } else { // Populate null count and validity bitmap diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index d8a6663cec580..beec29789ad1e 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -46,6 +46,7 @@ ARROW_EXPORT bool IsNullRunEndEncoded(const ArrayData& data, int64_t i); ARROW_EXPORT bool UnionMayHaveLogicalNulls(const ArrayData& data); ARROW_EXPORT bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data); ARROW_EXPORT bool DictionaryMayHaveLogicalNulls(const ArrayData& data); + } // namespace internal // When slicing, we do not know the null count of the sliced range without diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 86e2ffcae4de7..bdba92c9a11fb 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -95,7 +95,7 @@ class ArrayDataEndianSwapper { Status SwapType(const DataType& type) { RETURN_NOT_OK(VisitTypeInline(type, this)); RETURN_NOT_OK(SwapChildren(type.fields())); - if (internal::HasValidityBitmap(type.id())) { + if (internal::may_have_validity_bitmap(type.id())) { // Copy null bitmap out_->buffers[0] = data_->buffers[0]; } diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 8dd3eb3f90c15..0d940d3bc869e 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -550,7 +550,7 @@ struct ValidateArrayImpl { if (full_validation) { if (data.null_count != kUnknownNullCount) { int64_t 
actual_null_count; - if (HasValidityBitmap(data.type->id()) && data.buffers[0]) { + if (may_have_validity_bitmap(data.type->id()) && data.buffers[0]) { // Do not call GetNullCount() as it would also set the `null_count` member actual_null_count = data.length - CountSetBits(data.buffers[0]->data(), data.offset, data.length); diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index d004de7a2ea9f..8a530b3798d41 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -576,7 +576,7 @@ struct ArrayExporter { // Store buffer pointers size_t n_buffers = data->buffers.size(); auto buffers_begin = data->buffers.begin(); - if (n_buffers > 0 && !internal::HasValidityBitmap(data->type->id())) { + if (n_buffers > 0 && !internal::may_have_validity_bitmap(data->type->id())) { --n_buffers; ++buffers_begin; } diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index dba6e4736b673..d64fe67accde0 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -565,7 +565,7 @@ struct ArrayExportChecker { auto expected_n_buffers = static_cast(expected_data.buffers.size()); auto expected_buffers = expected_data.buffers.data(); - if (!internal::HasValidityBitmap(expected_data.type->id())) { + if (!internal::may_have_validity_bitmap(expected_data.type->id())) { --expected_n_buffers; ++expected_buffers; } diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index 28dcf493fa294..f2e4578383122 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -480,7 +480,7 @@ struct NullGeneralization { if (dtype_id == Type::NA) { return ALL_NULL; } - if (!arrow::internal::HasValidityBitmap(dtype_id)) { + if (!arrow::internal::may_have_validity_bitmap(dtype_id)) { return ALL_VALID; } if (value.is_scalar()) { diff --git a/cpp/src/arrow/integration/json_internal.cc b/cpp/src/arrow/integration/json_internal.cc index 64eb342d5bd47..4b75e84bfccb6 100644 --- a/cpp/src/arrow/integration/json_internal.cc +++ b/cpp/src/arrow/integration/json_internal.cc @@ -1849,7 +1849,7 @@ class ArrayReader { Result> Parse() { ARROW_ASSIGN_OR_RAISE(length_, GetMemberInt(obj_, "count")); - if (::arrow::internal::HasValidityBitmap(type_->id())) { + if (::arrow::internal::may_have_validity_bitmap(type_->id())) { // Null and union types don't have a validity bitmap RETURN_NOT_OK(ParseValidityBitmap()); } diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 4154b594d9507..e20b352d18d95 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -109,8 +109,9 @@ flatbuf::MetadataVersion MetadataVersionToFlatbuffer(MetadataVersion version) { bool HasValidityBitmap(Type::type type_id, MetadataVersion version) { // In V4, null types have no validity bitmap // In V5 and later, null and union types have no validity bitmap - return (version < MetadataVersion::V5) ? (type_id != Type::NA) - : ::arrow::internal::HasValidityBitmap(type_id); + return (version < MetadataVersion::V5) + ? 
(type_id != Type::NA) + : ::arrow::internal::may_have_validity_bitmap(type_id); } namespace { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 5629cade42335..58c9df04ec5c3 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -2488,7 +2488,7 @@ Result> UnifySchemas( namespace internal { -constexpr bool HasValidityBitmap(Type::type id) { +constexpr bool may_have_validity_bitmap(Type::type id) { switch (id) { case Type::NA: case Type::DENSE_UNION: @@ -2500,6 +2500,9 @@ constexpr bool HasValidityBitmap(Type::type id) { } } +ARROW_DEPRECATED("Deprecated in 17.0.0. Use may_have_validity_bitmap() instead.") +constexpr bool HasValidityBitmap(Type::type id) { return may_have_validity_bitmap(id); } + ARROW_EXPORT std::string ToString(Type::type id); From fd75cbdb38836b66dbae81094cf06821e5d25cb1 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 24 Apr 2024 10:27:17 -0400 Subject: [PATCH 013/106] GH-40342: [C++] move LocalFileSystem to the registry (#40356) ### Rationale for this change Moving LocalFileSystem into the registry is a good first usage and will help us hammer out which aspects of built in file systems should remain public. ### What changes are included in this PR? A factory for LocalFileSystem is added to the registry. `FileSystem::MakeUri` ( https://github.com/apache/arrow/issues/18316 ) is added to enable roundtripping filesystems through URIs. `file://` URIs now support a use_mmap query parameter, and `local://` URIs are also supported as an alias.
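As a hedged usage sketch (the path below is illustrative, not part of this patch), the new behavior lets a filesystem round-trip through a URI:

```cpp
#include <iostream>
#include <string>

#include "arrow/filesystem/filesystem.h"

int main() {
  // "local" is now accepted as an alias for "file", and use_mmap is parsed
  // from the URI's query string.
  std::string path;
  auto fs = arrow::fs::FileSystemFromUri("local:///tmp/data?use_mmap", &path)
                .ValueOrDie();

  // MakeUri (added in this PR) produces a URI from which FileSystemFromUri
  // reconstructs an equivalent filesystem; this should print
  // "file:///tmp/data?use_mmap".
  std::cout << fs->MakeUri(path).ValueOrDie() << std::endl;
  return 0;
}
```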
#### Reducing the set of bound classes
Some unnecessary bindings to the LocalFileSystem concrete class are removed. This demonstrates that with a registered factory pattern, it is no longer necessary to keep a class hierarchy public for binding. Eventually (if desired), we can move concrete subclasses of FileSystem entirely out of public headers. ### Are these changes tested? Yes, all existing tests for file:// URIs continue to pass ### Are there any user-facing changes? For consistency, local:// URIs will now be considered equivalent to corresponding file:// URIs * GitHub Issue: #40342 Authored-by: Benjamin Kietzman Signed-off-by: Benjamin Kietzman --- .../arrow/filesystem_definition_example.cc | 4 +- cpp/src/arrow/filesystem/CMakeLists.txt | 4 +- cpp/src/arrow/filesystem/filesystem.cc | 35 ++++++++---------- cpp/src/arrow/filesystem/filesystem.h | 30 ++++++++++++++- cpp/src/arrow/filesystem/localfs.cc | 37 ++++++++++++++++++- cpp/src/arrow/filesystem/localfs.h | 1 + cpp/src/arrow/filesystem/localfs_test.cc | 29 +++++++++------ cpp/src/arrow/testing/examplefs.cc | 4 +- docs/source/cpp/io.rst | 7 ++-- python/pyarrow/_fs.pxd | 3 -- python/pyarrow/_fs.pyx | 34 +++++++---------- python/pyarrow/includes/libarrow_fs.pxd | 16 ++------ r/R/arrowExports.R | 4 -- r/R/filesystem.R | 3 +- r/src/arrowExports.cpp | 8 ---- r/src/filesystem.cpp | 14 ++----- 16 files changed, 130 insertions(+), 103 deletions(-) diff --git a/cpp/examples/arrow/filesystem_definition_example.cc b/cpp/examples/arrow/filesystem_definition_example.cc index efe1bd10470c0..65301bb843ba1 100644 --- a/cpp/examples/arrow/filesystem_definition_example.cc +++ b/cpp/examples/arrow/filesystem_definition_example.cc @@ -138,7 +138,7 @@ class ExampleFileSystem : public fs::FileSystem { } }; -fs::FileSystemRegistrar kExampleFileSystemModule{ +auto kExampleFileSystemModule = ARROW_REGISTER_FILESYSTEM( "example", [](const arrow::util::Uri& uri, const io::IOContext& io_context, std::string* out_path) -> Result> { @@ -148,4 +148,4 @@ fs::FileSystemRegistrar kExampleFileSystemModule{ } return fs; }, -}; + {}); diff --git a/cpp/src/arrow/filesystem/CMakeLists.txt b/cpp/src/arrow/filesystem/CMakeLists.txt index deac04af72f5a..0a31a64b7a3a4 100644 --- a/cpp/src/arrow/filesystem/CMakeLists.txt +++ b/cpp/src/arrow/filesystem/CMakeLists.txt @@ -28,7 +28,9 @@ add_arrow_test(filesystem-test EXTRA_LABELS filesystem DEFINITIONS - ARROW_FILESYSTEM_EXAMPLE_LIBPATH="$") + ARROW_FILESYSTEM_EXAMPLE_LIBPATH="$" + EXTRA_DEPENDENCIES + arrow_filesystem_example) if(ARROW_BUILD_BENCHMARKS) add_arrow_benchmark(localfs_benchmark diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc index c96a5fd2cf280..b79af08385c0c 100644 --- a/cpp/src/arrow/filesystem/filesystem.cc +++ b/cpp/src/arrow/filesystem/filesystem.cc @@ -56,16 +56,13 @@ #include "arrow/util/visibility.h" #include "arrow/util/windows_fixup.h" -namespace arrow { - -using internal::checked_pointer_cast; -using internal::TaskHints; -using io::internal::SubmitIO; -using util::Uri; - -namespace fs { +namespace arrow::fs { +using arrow::internal::checked_pointer_cast; using arrow::internal::GetEnvVar; +using arrow::internal::TaskHints; +using arrow::io::internal::SubmitIO; +using arrow::util::Uri; using internal::ConcatAbstractPath; using internal::EnsureTrailingSlash; using internal::GetAbstractPathParent; @@ -273,6 +270,11 @@ Result FileSystem::PathFromUri(const std::string& uri_string) const return Status::NotImplemented("PathFromUri is not yet supported on this filesystem"); } +Result 
FileSystem::MakeUri(std::string path) const { + return Status::NotImplemented("MakeUri is not yet supported for ", type_name(), + " filesystems"); +} + ////////////////////////////////////////////////////////////////////////// // SubTreeFileSystem implementation @@ -726,6 +728,10 @@ class FileSystemFactoryRegistry { main_registry->scheme_to_factory_.emplace(std::move(scheme), registered); if (success) continue; + if (it->second.ok()) { + if (registered->factory == it->second->factory) continue; + } + duplicated_schemes.emplace_back(it->first); } scheme_to_factory_.clear(); @@ -852,18 +858,10 @@ Result> FileSystemFromUriReal(const Uri& uri, auto* factory, FileSystemFactoryRegistry::GetInstance()->FactoryForScheme(scheme)); if (factory != nullptr) { - return (*factory)(uri, io_context, out_path); + return factory->function(uri, io_context, out_path); } } - if (scheme == "file") { - std::string path; - ARROW_ASSIGN_OR_RAISE(auto options, LocalFileSystemOptions::FromUri(uri, &path)); - if (out_path != nullptr) { - *out_path = path; - } - return std::make_shared(options, io_context); - } if (scheme == "abfs" || scheme == "abfss") { #ifdef ARROW_AZURE ARROW_ASSIGN_OR_RAISE(auto options, AzureOptions::FromUri(uri, out_path)); @@ -969,5 +967,4 @@ Status Initialize(const FileSystemGlobalOptions& options) { return Status::OK(); } -} // namespace fs -} // namespace arrow +} // namespace arrow::fs diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h index 272e42256a388..d4f62f86a7482 100644 --- a/cpp/src/arrow/filesystem/filesystem.h +++ b/cpp/src/arrow/filesystem/filesystem.h @@ -197,6 +197,11 @@ class ARROW_EXPORT FileSystem /// \return The path inside the filesystem that is indicated by the URI. virtual Result PathFromUri(const std::string& uri_string) const; + /// \brief Make a URI from which FileSystemFromUri produces an equivalent filesystem + /// \param path The path component to use in the resulting URI + /// \return A URI string, or an error if an equivalent URI cannot be produced + virtual Result MakeUri(std::string path) const; + virtual bool Equals(const FileSystem& other) const = 0; virtual bool Equals(const std::shared_ptr& other) const { @@ -352,8 +357,23 @@ class ARROW_EXPORT FileSystem bool default_async_is_sync_ = true; }; -using FileSystemFactory = std::function>( - const Uri& uri, const io::IOContext& io_context, std::string* out_path)>; +struct FileSystemFactory { + std::function>( + const Uri& uri, const io::IOContext& io_context, std::string* out_path)> + function; + std::string_view file; + int line; + + bool operator==(const FileSystemFactory& other) const { + // In the case where libarrow is linked statically both to the executable and to a + // dynamically loaded filesystem implementation library, the library contains a + // duplicate definition of the registry and duplicate definitions of any + // FileSystemRegistrars which are statically linked to libarrow. When retrieving + // factories from the filesystem implementation library, we use the file and line + // of the registrar's definition to determine equivalence of the duplicate factories. + return file == other.file && line == other.line; + } +}; /// \brief A FileSystem implementation that delegates to another /// implementation after prepending a fixed base path. 
@@ -645,6 +665,12 @@ struct ARROW_EXPORT FileSystemRegistrar { std::function finalizer = {}); }; +#define ARROW_REGISTER_FILESYSTEM(scheme, factory_function, finalizer) \ + ::arrow::fs::FileSystemRegistrar { \ + scheme, ::arrow::fs::FileSystemFactory{factory_function, __FILE__, __LINE__}, \ + finalizer \ + } + /// @} namespace internal { diff --git a/cpp/src/arrow/filesystem/localfs.cc b/cpp/src/arrow/filesystem/localfs.cc index fbb33fd00868b..25ac04b758f9b 100644 --- a/cpp/src/arrow/filesystem/localfs.cc +++ b/cpp/src/arrow/filesystem/localfs.cc @@ -39,6 +39,7 @@ #include "arrow/io/type_fwd.h" #include "arrow/util/async_generator.h" #include "arrow/util/io_util.h" +#include "arrow/util/string.h" #include "arrow/util/uri.h" #include "arrow/util/windows_fixup.h" @@ -246,8 +247,20 @@ Result LocalFileSystemOptions::FromUri( std::string(internal::RemoveTrailingSlash(uri.path(), /*preserve_root=*/true)); } - // TODO handle use_mmap option - return LocalFileSystemOptions(); + LocalFileSystemOptions options; + ARROW_ASSIGN_OR_RAISE(auto params, uri.query_items()); + for (const auto& [key, value] : params) { + if (key == "use_mmap") { + if (value.empty()) { + options.use_mmap = true; + continue; + } else { + ARROW_ASSIGN_OR_RAISE(options.use_mmap, ::arrow::internal::ParseBoolean(value)); + } + break; + } + } + return options; } LocalFileSystem::LocalFileSystem(const io::IOContext& io_context) @@ -273,6 +286,11 @@ Result LocalFileSystem::PathFromUri(const std::string& uri_string) authority_handling); } +Result LocalFileSystem::MakeUri(std::string path) const { + ARROW_ASSIGN_OR_RAISE(path, DoNormalizePath(std::move(path))); + return "file://" + path + (options_.use_mmap ? "?use_mmap" : ""); +} + bool LocalFileSystem::Equals(const FileSystem& other) const { if (other.type_name() != type_name()) { return false; @@ -686,4 +704,19 @@ Result> LocalFileSystem::OpenAppendStream( return OpenOutputStreamGeneric(path, truncate, append); } +static Result> LocalFileSystemFactory( + const arrow::util::Uri& uri, const io::IOContext& io_context, std::string* out_path) { + std::string path; + ARROW_ASSIGN_OR_RAISE(auto options, LocalFileSystemOptions::FromUri(uri, &path)); + if (out_path != nullptr) { + *out_path = std::move(path); + } + return std::make_shared(options, io_context); +} + +FileSystemRegistrar kLocalFileSystemModule[]{ + ARROW_REGISTER_FILESYSTEM("file", LocalFileSystemFactory, {}), + ARROW_REGISTER_FILESYSTEM("local", LocalFileSystemFactory, {}), +}; + } // namespace arrow::fs diff --git a/cpp/src/arrow/filesystem/localfs.h b/cpp/src/arrow/filesystem/localfs.h index 45a3da317f663..d72e8f7d74d51 100644 --- a/cpp/src/arrow/filesystem/localfs.h +++ b/cpp/src/arrow/filesystem/localfs.h @@ -83,6 +83,7 @@ class ARROW_EXPORT LocalFileSystem : public FileSystem { Result NormalizePath(std::string path) override; Result PathFromUri(const std::string& uri_string) const override; + Result MakeUri(std::string path) const override; bool Equals(const FileSystem& other) const override; diff --git a/cpp/src/arrow/filesystem/localfs_test.cc b/cpp/src/arrow/filesystem/localfs_test.cc index b76c7ebad45db..1a20e44bc36e2 100644 --- a/cpp/src/arrow/filesystem/localfs_test.cc +++ b/cpp/src/arrow/filesystem/localfs_test.cc @@ -15,8 +15,6 @@ // specific language governing permissions and limitations // under the License. 
-#include -#include #include #include #include @@ -113,10 +111,8 @@ Result> SlowFileSystemFactory(const Uri& uri, } return std::make_shared(base_fs, average_latency, seed); } -FileSystemRegistrar kSlowFileSystemModule{ - "slowfile", - SlowFileSystemFactory, -}; +auto kSlowFileSystemModule = + ARROW_REGISTER_FILESYSTEM("slowfile", SlowFileSystemFactory, {}); TEST(FileSystemFromUri, LinkedRegisteredFactory) { // Since the registrar's definition is in this translation unit (which is linked to the @@ -158,23 +154,24 @@ TEST(FileSystemFromUri, RuntimeRegisteredFactory) { EXPECT_THAT(FileSystemFromUri("slowfile2:///hey/yo", &path), Raises(StatusCode::Invalid)); - EXPECT_THAT(RegisterFileSystemFactory("slowfile2", SlowFileSystemFactory), Ok()); + EXPECT_THAT(RegisterFileSystemFactory("slowfile2", {SlowFileSystemFactory, "", 0}), + Ok()); ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFromUri("slowfile2:///hey/yo", &path)); EXPECT_EQ(path, "/hey/yo"); EXPECT_EQ(fs->type_name(), "slow"); EXPECT_THAT( - RegisterFileSystemFactory("slowfile2", SlowFileSystemFactory), + RegisterFileSystemFactory("slowfile2", {SlowFileSystemFactory, "", 0}), Raises(StatusCode::KeyError, testing::HasSubstr("Attempted to register factory for scheme 'slowfile2' " "but that scheme is already registered"))); } FileSystemRegistrar kSegfaultFileSystemModule[]{ - {"segfault", nullptr}, - {"segfault", nullptr}, - {"segfault", nullptr}, + ARROW_REGISTER_FILESYSTEM("segfault", nullptr, {}), + ARROW_REGISTER_FILESYSTEM("segfault", nullptr, {}), + ARROW_REGISTER_FILESYSTEM("segfault", nullptr, {}), }; TEST(FileSystemFromUri, LinkedRegisteredFactoryNameCollision) { // Since multiple registrars are defined in this translation unit which all @@ -312,6 +309,7 @@ class TestLocalFS : public LocalFSTestMixin { std::string path; ASSERT_OK_AND_ASSIGN(fs_, fs_from_uri(uri, &path)); ASSERT_EQ(fs_->type_name(), "local"); + local_fs_ = ::arrow::internal::checked_pointer_cast(fs_); ASSERT_EQ(path, expected_path); ASSERT_OK_AND_ASSIGN(path, fs_->PathFromUri(uri)); ASSERT_EQ(path, expected_path); @@ -423,8 +421,17 @@ TYPED_TEST(TestLocalFS, FileSystemFromUriFile) { // Variations this->TestLocalUri("file:/foo/bar", "/foo/bar"); + ASSERT_FALSE(this->local_fs_->options().use_mmap); this->TestLocalUri("file:///foo/bar", "/foo/bar"); this->TestLocalUri("file:///some%20path/%25percent", "/some path/%percent"); + + this->TestLocalUri("file:///_?use_mmap", "/_"); + if (this->path_formatter_.supports_uri()) { + ASSERT_TRUE(this->local_fs_->options().use_mmap); + ASSERT_OK_AND_ASSIGN(auto uri, this->fs_->MakeUri("/_")); + EXPECT_EQ(uri, "file:///_?use_mmap"); + } + #ifdef _WIN32 this->TestLocalUri("file:/C:/foo/bar", "C:/foo/bar"); this->TestLocalUri("file:///C:/foo/bar", "C:/foo/bar"); diff --git a/cpp/src/arrow/testing/examplefs.cc b/cpp/src/arrow/testing/examplefs.cc index d3e7e3b03f6d7..5c9d5f9d90712 100644 --- a/cpp/src/arrow/testing/examplefs.cc +++ b/cpp/src/arrow/testing/examplefs.cc @@ -24,7 +24,7 @@ namespace arrow::fs { -FileSystemRegistrar kExampleFileSystemModule{ +auto kExampleFileSystemModule = ARROW_REGISTER_FILESYSTEM( "example", [](const Uri& uri, const io::IOContext& io_context, std::string* out_path) -> Result> { @@ -33,6 +33,6 @@ FileSystemRegistrar kExampleFileSystemModule{ auto local_uri = "file" + uri.ToString().substr(kScheme.size()); return FileSystemFromUri(local_uri, io_context, out_path); }, -}; + {}); } // namespace arrow::fs diff --git a/docs/source/cpp/io.rst b/docs/source/cpp/io.rst index 2a05473852c24..2312132b1a354 100644 --- 
a/docs/source/cpp/io.rst +++ b/docs/source/cpp/io.rst @@ -116,15 +116,15 @@ scope, which will register a factory whenever the instance is loaded: .. code-block:: cpp - arrow::fs::FileSystemRegistrar kExampleFileSystemModule{ + auto kExampleFileSystemModule = ARROW_REGISTER_FILESYSTEM( "example", [](const Uri& uri, const io::IOContext& io_context, std::string* out_path) -> Result> { EnsureExampleFileSystemInitialized(); return std::make_shared(); }, - &EnsureExampleFileSystemFinalized, - }; + &EnsureExampleFileSystemFinalized + ); If a filesystem implementation requires initialization before any instances may be constructed, this should be included in the corresponding factory or @@ -144,4 +144,3 @@ should have exactly one of its sources ``#include "arrow/filesystem/filesystem_library.h"`` in order to ensure the presence of the symbol on which :func:`~arrow::fs::LoadFileSystemFactories` depends. - diff --git a/python/pyarrow/_fs.pxd b/python/pyarrow/_fs.pxd index 4504b78b837ea..0df75530bbd6e 100644 --- a/python/pyarrow/_fs.pxd +++ b/python/pyarrow/_fs.pxd @@ -67,9 +67,6 @@ cdef class FileSystem(_Weakrefable): cdef class LocalFileSystem(FileSystem): - cdef: - CLocalFileSystem* localfs - cdef init(self, const shared_ptr[CFileSystem]& wrapped) diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 86cf39e993c1b..0e635b2c8a28a 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -18,7 +18,6 @@ # cython: language_level = 3 from cpython.datetime cimport datetime, PyDateTime_DateTime -from cython cimport binding from pyarrow.includes.common cimport * from pyarrow.includes.libarrow_python cimport PyDateTime_to_TimePoint @@ -421,6 +420,11 @@ cdef class FileSystem(_Weakrefable): "the subclasses instead: LocalFileSystem or " "SubTreeFileSystem") + @staticmethod + def _from_uri(uri): + fs, _path = FileSystem.from_uri(uri) + return fs + @staticmethod def from_uri(uri): """ @@ -1097,30 +1101,18 @@ cdef class LocalFileSystem(FileSystem): def __init__(self, *, use_mmap=False): cdef: - CLocalFileSystemOptions opts - shared_ptr[CLocalFileSystem] fs - - opts = CLocalFileSystemOptions.Defaults() - opts.use_mmap = use_mmap + shared_ptr[CFileSystem] fs + c_string c_uri - fs = make_shared[CLocalFileSystem](opts) + # from_uri needs a non-empty path, so just use a placeholder of /_ + c_uri = tobytes(f"file:///_?use_mmap={int(use_mmap)}") + with nogil: + fs = GetResultValue(CFileSystemFromUri(c_uri)) self.init( fs) - cdef init(self, const shared_ptr[CFileSystem]& c_fs): - FileSystem.init(self, c_fs) - self.localfs = c_fs.get() - - @staticmethod - @binding(True) # Required for cython < 3 - def _reconstruct(kwargs): - # __reduce__ doesn't allow passing named arguments directly to the - # reconstructor, hence this wrapper. 
- return LocalFileSystem(**kwargs) - def __reduce__(self): - cdef CLocalFileSystemOptions opts = self.localfs.options() - return LocalFileSystem._reconstruct, (dict( - use_mmap=opts.use_mmap),) + uri = frombytes(GetResultValue(self.fs.MakeUri(b"/_"))) + return FileSystem._from_uri, (uri,) cdef class SubTreeFileSystem(FileSystem): diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index 328b426a498db..f1f2985f65394 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -61,6 +61,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: shared_ptr[CFileSystem] shared_from_this() c_string type_name() const CResult[c_string] NormalizePath(c_string path) + CResult[c_string] MakeUri(c_string path) CResult[CFileInfo] GetFileInfo(const c_string& path) CResult[vector[CFileInfo]] GetFileInfo( const vector[c_string]& paths) @@ -84,6 +85,8 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: c_bool Equals(const CFileSystem& other) c_bool Equals(shared_ptr[CFileSystem] other) + CResult[shared_ptr[CFileSystem]] CFileSystemFromUri \ + "arrow::fs::FileSystemFromUri"(const c_string& uri) CResult[shared_ptr[CFileSystem]] CFileSystemFromUri \ "arrow::fs::FileSystemFromUri"(const c_string& uri, c_string* out_path) CResult[shared_ptr[CFileSystem]] CFileSystemFromUriOrPath \ @@ -98,19 +101,6 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: CStatus CFileSystemsInitialize "arrow::fs::Initialize" \ (const CFileSystemGlobalOptions& options) - cdef cppclass CLocalFileSystemOptions "arrow::fs::LocalFileSystemOptions": - c_bool use_mmap - - @staticmethod - CLocalFileSystemOptions Defaults() - - c_bool Equals(const CLocalFileSystemOptions& other) - - cdef cppclass CLocalFileSystem "arrow::fs::LocalFileSystem"(CFileSystem): - CLocalFileSystem() - CLocalFileSystem(CLocalFileSystemOptions) - CLocalFileSystemOptions options() - cdef cppclass CSubTreeFileSystem \ "arrow::fs::SubTreeFileSystem"(CFileSystem): CSubTreeFileSystem(const c_string& base_path, diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 967caba727898..752d3a266b26a 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1352,10 +1352,6 @@ fs___FileSystem__type_name <- function(file_system) { .Call(`_arrow_fs___FileSystem__type_name`, file_system) } -fs___LocalFileSystem__create <- function() { - .Call(`_arrow_fs___LocalFileSystem__create`) -} - fs___SubTreeFileSystem__create <- function(base_path, base_fs) { .Call(`_arrow_fs___SubTreeFileSystem__create`, base_path, base_fs) } diff --git a/r/R/filesystem.R b/r/R/filesystem.R index 0e4484d1b583d..0176cdf846da7 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -390,7 +390,8 @@ are_urls <- function(x) if (!is.character(x)) FALSE else grepl("://", x) #' @export LocalFileSystem <- R6Class("LocalFileSystem", inherit = FileSystem) LocalFileSystem$create <- function() { - fs___LocalFileSystem__create() + # from_uri needs a non-empty path, so just use a placeholder of /_ + FileSystem$from_uri("file:///_")$fs } #' @usage NULL diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 5031c79f200b3..a4c4b614d6d75 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -3472,13 +3472,6 @@ BEGIN_CPP11 END_CPP11 } // filesystem.cpp -std::shared_ptr fs___LocalFileSystem__create(); -extern "C" SEXP _arrow_fs___LocalFileSystem__create(){ -BEGIN_CPP11 - return cpp11::as_sexp(fs___LocalFileSystem__create()); -END_CPP11 -} -// 
filesystem.cpp std::shared_ptr fs___SubTreeFileSystem__create(const std::string& base_path, const std::shared_ptr& base_fs); extern "C" SEXP _arrow_fs___SubTreeFileSystem__create(SEXP base_path_sexp, SEXP base_fs_sexp){ BEGIN_CPP11 @@ -6014,7 +6007,6 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_fs___FileSystem__OpenOutputStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenOutputStream, 2}, { "_arrow_fs___FileSystem__OpenAppendStream", (DL_FUNC) &_arrow_fs___FileSystem__OpenAppendStream, 2}, { "_arrow_fs___FileSystem__type_name", (DL_FUNC) &_arrow_fs___FileSystem__type_name, 1}, - { "_arrow_fs___LocalFileSystem__create", (DL_FUNC) &_arrow_fs___LocalFileSystem__create, 0}, { "_arrow_fs___SubTreeFileSystem__create", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__create, 2}, { "_arrow_fs___SubTreeFileSystem__base_fs", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__base_fs, 1}, { "_arrow_fs___SubTreeFileSystem__base_path", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__base_path, 1}, diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index 23bcb81e8faae..2274a3d7ff7a2 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -238,13 +238,6 @@ std::string fs___FileSystem__type_name( return file_system->type_name(); } -// [[arrow::export]] -std::shared_ptr fs___LocalFileSystem__create() { - // Affects OpenInputFile/OpenInputStream - auto io_context = MainRThread::GetInstance().CancellableIOContext(); - return std::make_shared(io_context); -} - // [[arrow::export]] std::shared_ptr fs___SubTreeFileSystem__create( const std::string& base_path, const std::shared_ptr& base_fs) { @@ -268,9 +261,10 @@ cpp11::writable::list fs___FileSystemFromUri(const std::string& path) { using cpp11::literals::operator"" _nm; std::string out_path; - return cpp11::writable::list( - {"fs"_nm = cpp11::to_r6(ValueOrStop(fs::FileSystemFromUri(path, &out_path))), - "path"_nm = out_path}); + auto io_context = MainRThread::GetInstance().CancellableIOContext(); + return cpp11::writable::list({"fs"_nm = cpp11::to_r6(ValueOrStop( + fs::FileSystemFromUri(path, io_context, &out_path))), + "path"_nm = out_path}); } // [[arrow::export]] From 192de026ad5c021846d89449757b0ac679436d13 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Wed, 24 Apr 2024 10:37:13 -0800 Subject: [PATCH 014/106] MINOR: [R] Update language in NEWS.md related to GH-41223 (#41368) ### Rationale for this change This clarifies the language added in https://github.com/apache/arrow/pull/41223, as discussed in a post-merge review in https://github.com/apache/arrow/pull/41223#discussion_r1577243498. ### What changes are included in this PR? Just a tweak to R's NEWS.md file. ### Are these changes tested? No. ### Are there any user-facing changes? No. Authored-by: Bryce Mecum Signed-off-by: Bryce Mecum --- r/NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/NEWS.md b/r/NEWS.md index 71ac0e8111537..4ed9f28a28436 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -19,7 +19,7 @@ # arrow 16.0.0.9000 -* R functions that users write that use functions that Arrow supports in dataset queries now can be used in queries too. Previously, only functions that used arithmetic operators worked. For example, `time_hours <- function(mins) mins / 60` worked, but `time_hours_rounded <- function(mins) round(mins / 60)` did not; now both work. These are not true user-defined functions (UDFs); for those, see `register_scalar_function()`. (#41223) +* R functions that users write that use functions that Arrow supports in dataset queries now can be used in queries too. 
Previously, only functions that used arithmetic operators worked. For example, `time_hours <- function(mins) mins / 60` worked, but `time_hours_rounded <- function(mins) round(mins / 60)` did not; now both work. These are automatic translations rather than true user-defined functions (UDFs); for UDFs, see `register_scalar_function()`. (#41223) * `summarize()` supports more complex expressions, and correctly handles cases where column names are reused in expressions. # arrow 16.0.0 From d7a5777bb3626d4f448ae4c525a03c8ec6bda5dc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 25 Apr 2024 08:06:11 +0900 Subject: [PATCH 015/106] MINOR: [Java] Bump org.apache.maven.plugins:maven-plugin-plugin from 3.11.0 to 3.12.0 in /java (#41211) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [org.apache.maven.plugins:maven-plugin-plugin](https://github.com/apache/maven-plugin-tools) from 3.11.0 to 3.12.0.
Release notes (sourced from org.apache.maven.plugins:maven-plugin-plugin's releases):

3.12.0: Release Notes - Maven Plugin Tools - Version 3.12.0

Commits:
- 0b69acc [maven-release-plugin] prepare release maven-plugin-tools-3.12.0
- e5e3dd2 code simplifications
- e97ba77 [MPLUGIN-510] group history per common requirements
- 6f9c3d9 use @Component instead of @Parameter when possible
- d8fecbc Bump org.codehaus.plexus:plexus-archiver from 4.9.1 to 4.9.2
- a9dd57d rename mavenVersion to maven3Version
- 1aad214 [MPLUGIN-514] switch from png+imagemap to svg
- ddbaa5b Bump apache/maven-gh-actions-shared from 3 to 4
- cd74761 [MPLUGIN-511] add versions history requirements detection
- d9f8d89 [MPLUGIN-511] prepare method to list releases history
- Additional commits viewable in the compare view
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/maven/module-info-compiler-maven-plugin/pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/maven/module-info-compiler-maven-plugin/pom.xml b/java/maven/module-info-compiler-maven-plugin/pom.xml index 48c759b484268..6589020d6ecb5 100644 --- a/java/maven/module-info-compiler-maven-plugin/pom.xml +++ b/java/maven/module-info-compiler-maven-plugin/pom.xml @@ -79,7 +79,7 @@ maven-plugin-plugin - 3.11.0 + 3.12.0 maven-jar-plugin @@ -118,7 +118,7 @@ org.apache.maven.plugins maven-plugin-plugin - 3.11.0 + 3.12.0 true From 32885c91a1c94f1c8530815f4d4b8fc068859de8 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 25 Apr 2024 11:10:25 +0900 Subject: [PATCH 016/106] GH-41340: [C++][CMake][Windows] Remove needless .dll suffix from link libraries (#41341) ### Rationale for this change We should use `.lib` (import library) not `.dll` for linking. So `.dll` is wrong. (But it seems working... Why...?) ### What changes are included in this PR? Remove `.dll` from link libraries because CMake generates suitable link options from library name automatically: https://cmake.org/cmake/help/latest/command/target_link_libraries.html#command:target_link_libraries > A plain library name: The generated link line will ask the linker to search for the library (e.g. `foo` becomes `-lfoo` or `foo.lib`). ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * GitHub Issue: #41340 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/CMakeLists.txt | 2 +- cpp/src/arrow/CMakeLists.txt | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cc4df6d2fa2eb..df83f56dd2f70 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -708,7 +708,7 @@ list(APPEND ARROW_TEST_LINK_LIBS ${ARROW_GTEST_GMOCK} ${ARROW_GTEST_GTEST_MAIN}) if(ARROW_BUILD_BENCHMARKS) set(ARROW_BENCHMARK_LINK_LIBS benchmark::benchmark_main ${ARROW_TEST_LINK_LIBS}) if(WIN32) - list(APPEND ARROW_BENCHMARK_LINK_LIBS Shlwapi.dll) + list(APPEND ARROW_BENCHMARK_LINK_LIBS shlwapi) endif() endif() diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 026bb5c77e066..2ef82dd614f84 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -163,7 +163,7 @@ if(CMAKE_THREAD_LIBS_INIT) endif() if(WIN32) - list(APPEND ARROW_SYSTEM_LINK_LIBS "ws2_32.dll") + list(APPEND ARROW_SYSTEM_LINK_LIBS "ws2_32") endif() if(NOT WIN32 AND NOT APPLE) @@ -628,9 +628,9 @@ else() list(APPEND ARROW_TESTING_STATIC_INSTALL_INTERFACE_LIBS ArrowTesting::gtest) endif() if(WIN32) - list(APPEND ARROW_TESTING_SHARED_LINK_LIBS "ws2_32.dll") - list(APPEND ARROW_TESTING_STATIC_LINK_LIBS "ws2_32.dll") - list(APPEND ARROW_TESTING_STATIC_INSTALL_INTERFACE_LIBS "ws2_32.dll") + list(APPEND ARROW_TESTING_SHARED_LINK_LIBS "ws2_32") + list(APPEND ARROW_TESTING_STATIC_LINK_LIBS "ws2_32") + list(APPEND ARROW_TESTING_STATIC_INSTALL_INTERFACE_LIBS "ws2_32") endif() set(ARROW_TESTING_SRCS From 610ab9cc976b1735065176402eb50d0c4aa0bbe9 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 26 Apr 2024 01:24:58 +0900 Subject: [PATCH 017/106] GH-41371: [CI][Release] Use the latest Ruby on macOS (#41379) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Ruby 2.7 doesn't exist on `macos-latest` (`macos-14`). ### What changes are included in this PR? 
Use `ruby` as the Ruby version to use the latest Ruby. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #41371 Authored-by: Sutou Kouhei Signed-off-by: Raúl Cumplido --- .github/workflows/dev.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 8af5832f15948..7c438be6024d6 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -109,9 +109,9 @@ jobs: with: python-version: '3.12' - name: Install Ruby - uses: ruby/setup-ruby@250fcd6a742febb1123a77a841497ccaa8b9e939 # v1.152.0 + uses: ruby/setup-ruby@v1 with: - ruby-version: '2.7' + ruby-version: ruby - name: Install .NET uses: actions/setup-dotnet@4d6c8fcf3c8f7a60068d26b594648e99df24cee3 # v4.0.0 with: From 774f10df7a7e1587f1996e1d8dc70d5cf7b8075f Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 26 Apr 2024 01:44:25 +0800 Subject: [PATCH 018/106] MINOR: [C++][Parquet] fix dict_length for ReadDictionary when not having dict (#41344) ### Rationale for this change `dictionary_length = 0` is used when page doesn't have dictionary, however, this should be a nop. ### What changes are included in this PR? Change `dictionary_length = 0` to `*dictionary_length = 0`. ### Are these changes tested? No? ### Are there any user-facing changes? no Authored-by: mwish Signed-off-by: mwish --- cpp/src/parquet/column_reader.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 2566a989c1d21..eae7ac4252735 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1340,7 +1340,7 @@ class TypedRecordReader : public TypedColumnReaderImpl, const void* ReadDictionary(int32_t* dictionary_length) override { if (this->current_decoder_ == nullptr && !this->HasNextInternal()) { - dictionary_length = 0; + *dictionary_length = 0; return nullptr; } // Verify the current data page is dictionary encoded. The current_encoding_ should From 6578c3b360cca0739029c4b8311cfb1a4317c87e Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Thu, 25 Apr 2024 15:19:02 -0400 Subject: [PATCH 019/106] GH-41370: [CI][MATLAB] MATLAB macOS CI workflow fails because of `macos-latest` change to `macos-14` (#41384) ### Rationale for this change * The MATLAB macOS CI workflow is failing because of the recent change to `macos-latest` from `macos-12` to `macos-14`. * In https://github.blog/changelog/2024-01-30-github-actions-macos-14-sonoma-is-now-available/ it is mentioned that the transition to `macos-14` was planned for April-June 2024. * As noted in the [Platform Road Map](https://www.mathworks.com/support/requirements/platform-road-map.html), MATLAB R2023a is not officially supported with `macos-14`. * Until we can move to building against MATLAB R2024a (latest available version of MATLAB), it makes sense to pin back the macOS version to version 14. ### What changes are included in this PR? 1. Pin back MATLAB CI workflow to use `macos-12`. 2. Pin back MATLAB crossbow packaging workflow to use `macos-12`. ### Are these changes tested? I will run a crossbow job and ensure that the MATLAB CI workflow passes successfully before merging this PR. ### Are there any user-facing changes? No. ### Future Directions 1. #41385 ### Notes 1. Thanks @ sgilmore10 for your help with this pull request! 
* GitHub Issue: #41370 Authored-by: Kevin Gurney Signed-off-by: Kevin Gurney --- .github/workflows/matlab.yml | 2 +- dev/tasks/matlab/github.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml index dfc734e043371..aa3692e587961 100644 --- a/.github/workflows/matlab.yml +++ b/.github/workflows/matlab.yml @@ -98,7 +98,7 @@ jobs: select-by-folder: matlab/test macos: name: AMD64 macOS 12 MATLAB - runs-on: macos-latest + runs-on: macos-12 if: ${{ !contains(github.event.pull_request.title, 'WIP') }} steps: - name: Check out repository diff --git a/dev/tasks/matlab/github.yml b/dev/tasks/matlab/github.yml index 1cd3949efbcf8..13fa36b501125 100644 --- a/dev/tasks/matlab/github.yml +++ b/dev/tasks/matlab/github.yml @@ -60,7 +60,7 @@ jobs: macos: name: AMD64 macOS 12 MATLAB - runs-on: macos-latest + runs-on: macos-12 steps: {{ macros.github_checkout_arrow()|indent }} - name: Install ninja-build From 299eb26e8c22b4aad4876c9e3b52f9adde699a5c Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 26 Apr 2024 09:37:53 +0900 Subject: [PATCH 020/106] GH-41369: [CI][GLib] Don't use /usr/local on macOS (#41387) ### Rationale for this change We don't have write permission for `/usr/local` on macos-14. ### What changes are included in this PR? Use `/tmp/local` instead. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #41369 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .github/workflows/ruby.yml | 5 ++--- ci/scripts/c_glib_test.sh | 1 + ci/scripts/ruby_test.sh | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index ea3e61d55787d..11e3c93ed0806 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -117,7 +117,7 @@ jobs: run: archery docker push ubuntu-ruby macos: - name: AMD64 macOS 12 GLib & Ruby + name: AMD64 macOS 14 GLib & Ruby runs-on: macos-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 60 @@ -132,7 +132,7 @@ jobs: ARROW_GCS: ON ARROW_GLIB_GTK_DOC: true ARROW_GLIB_WERROR: true - ARROW_HOME: /usr/local + ARROW_HOME: /tmp/local ARROW_JEMALLOC: OFF ARROW_ORC: OFF ARROW_PARQUET: ON @@ -141,7 +141,6 @@ jobs: ARROW_WITH_SNAPPY: ON ARROW_WITH_ZLIB: ON ARROW_WITH_ZSTD: ON - XML_CATALOG_FILES: /usr/local/etc/xml/catalog steps: - name: Checkout Arrow uses: actions/checkout@v4 diff --git a/ci/scripts/c_glib_test.sh b/ci/scripts/c_glib_test.sh index f8083c7759d8a..02753872dcb2d 100755 --- a/ci/scripts/c_glib_test.sh +++ b/ci/scripts/c_glib_test.sh @@ -24,6 +24,7 @@ build_dir=${2}/c_glib : ${ARROW_GLIB_VAPI:=true} +export DYLD_LIBRARY_PATH=${ARROW_HOME}/lib:${DYLD_LIBRARY_PATH} export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} export PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig export GI_TYPELIB_PATH=${ARROW_HOME}/lib/girepository-1.0 diff --git a/ci/scripts/ruby_test.sh b/ci/scripts/ruby_test.sh index 56c33a4d6378a..507fa7858e870 100755 --- a/ci/scripts/ruby_test.sh +++ b/ci/scripts/ruby_test.sh @@ -22,6 +22,7 @@ set -ex source_dir=${1}/ruby build_dir=${2}/ruby +export DYLD_LIBRARY_PATH=${ARROW_HOME}/lib:${DYLD_LIBRARY_PATH} export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} export PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig export GI_TYPELIB_PATH=${ARROW_HOME}/lib/girepository-1.0 From e559a28cc19f87c2b1933bd2f7c9800ba290d5ab Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 26 Apr 2024 17:12:05 +0900 Subject: [PATCH 021/106] 
GH-41327: [Ruby] Show type name in Arrow::Table#to_s (#41328) ### Rationale for this change It's useful to detect type difference. ### What changes are included in this PR? Add `:show_column_type` option to `Arrow::Table#to_s` and enables it by default. This is a backward incompatible change but this'll help users. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. **This PR includes breaking changes to public APIs.** * GitHub Issue: #41327 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- ruby/red-arrow/lib/arrow/field-containable.rb | 2 +- ruby/red-arrow/lib/arrow/table-formatter.rb | 40 +- .../lib/arrow/table-list-formatter.rb | 6 +- .../lib/arrow/table-table-formatter.rb | 7 + ruby/red-arrow/test/test-csv-loader.rb | 72 +-- ruby/red-arrow/test/test-group.rb | 13 + ruby/red-arrow/test/test-schema.rb | 2 +- ruby/red-arrow/test/test-slicer.rb | 345 ++++++----- ruby/red-arrow/test/test-struct-data-type.rb | 2 +- ruby/red-arrow/test/test-table.rb | 535 ++++++++++-------- 10 files changed, 595 insertions(+), 429 deletions(-) diff --git a/ruby/red-arrow/lib/arrow/field-containable.rb b/ruby/red-arrow/lib/arrow/field-containable.rb index e4dbf4ec26cae..103e901f5d2de 100644 --- a/ruby/red-arrow/lib/arrow/field-containable.rb +++ b/ruby/red-arrow/lib/arrow/field-containable.rb @@ -29,7 +29,7 @@ def find_field(name_or_index) return nil if index < 0 or index >= n_fields get_field(index) else - message = "field name or index must be String, Symbol or Integer" + message = +"field name or index must be String, Symbol or Integer" message << ": <#{name_or_index.inspect}>" raise ArgumentError, message end diff --git a/ruby/red-arrow/lib/arrow/table-formatter.rb b/ruby/red-arrow/lib/arrow/table-formatter.rb index d039679f9a03a..b93faf09cbd02 100644 --- a/ruby/red-arrow/lib/arrow/table-formatter.rb +++ b/ruby/red-arrow/lib/arrow/table-formatter.rb @@ -24,7 +24,8 @@ class ColumnFormatter attr_reader :head_values attr_reader :tail_values attr_reader :sample_values - def initialize(column, head_values, tail_values) + def initialize(table_formatter, column, head_values, tail_values) + @table_formatter = table_formatter @column = column @head_values = head_values @tail_values = tail_values @@ -36,6 +37,15 @@ def data_type @data_type ||= @column.data_type end + def formatted_data_type_name + @formatted_data_type_name ||= "(#{data_type.name})" + end + + def aligned_data_type_name + @aligned_data_type_name ||= + "%*s" % [aligned_name.size, formatted_data_type_name] + end + def name @name ||= @column.name end @@ -63,7 +73,7 @@ def format_value(value, width=0) formatted_value = format_value(value[field_name], field_value_width) "#{formatted_name}: #{formatted_value}" end - formatted = "{" + formatted = +"{" formatted << formatted_values.join(", ") formatted << "}" "%-*s" % [width, formatted] @@ -90,9 +100,16 @@ def compute_field_value_width(field, sample_values) end def format_aligned_name(name, data_type, sample_values) + if @table_formatter.show_column_type? 
+ min_width = formatted_data_type_name.size + else + min_width = 0 + end case data_type when TimestampDataType - "%*s" % [::Time.now.iso8601.size, name] + width = ::Time.now.iso8601.size + width = min_width if width < min_width + "%*s" % [width, name] when IntegerDataType have_null = false have_negative = false @@ -118,9 +135,12 @@ def format_aligned_name(name, data_type, sample_values) end width += 1 if have_negative # Need "-" width = [width, FORMATTED_NULL.size].max if have_null + width = min_width if width < min_width "%*s" % [width, name] when FloatDataType, DoubleDataType - "%*s" % [FLOAT_N_DIGITS, name] + width = FLOAT_N_DIGITS + width = min_width if width < min_width + "%*s" % [width, name] when StructDataType field_widths = data_type.fields.collect do |field| field_value_width = compute_field_value_width(field, sample_values) @@ -130,9 +150,11 @@ def format_aligned_name(name, data_type, sample_values) if field_widths.size > 0 width += (", ".size * (field_widths.size - 1)) end + width = min_width if width < min_width "%*s" % [width, name] else - name + width = min_width + "%*s" % [width, name] end end end @@ -143,7 +165,7 @@ def initialize(table, options={}) end def format - text = "" + text = +"" n_rows = @table.n_rows border = @options[:border] || 10 @@ -159,7 +181,7 @@ def format else tail_values = [] end - ColumnFormatter.new(column, head_values, tail_values) + ColumnFormatter.new(self, column, head_values, tail_values) end format_header(text, column_formatters) @@ -186,5 +208,9 @@ def format text end + + def show_column_type? + @options.fetch(:show_column_type, true) + end end end diff --git a/ruby/red-arrow/lib/arrow/table-list-formatter.rb b/ruby/red-arrow/lib/arrow/table-list-formatter.rb index 4fe2934160a69..3e4d410ffbee8 100644 --- a/ruby/red-arrow/lib/arrow/table-list-formatter.rb +++ b/ruby/red-arrow/lib/arrow/table-list-formatter.rb @@ -27,9 +27,9 @@ def format_rows(text, column_formatters, rows, n_digits, start_offset) text << ("=" * 20 + " #{start_offset + nth_row} " + "=" * 20 + "\n") row.each_with_index do |column_value, nth_column| column_formatter = column_formatters[nth_column] - formatted_name = column_formatter.name - formatted_value = column_formatter.format_value(column_value) - text << "#{formatted_name}: #{formatted_value}\n" + text << column_formatter.name + text << "(#{column_formatter.data_type.name})" if show_column_type? + text << ": #{column_formatter.format_value(column_value)}\n" end end end diff --git a/ruby/red-arrow/lib/arrow/table-table-formatter.rb b/ruby/red-arrow/lib/arrow/table-table-formatter.rb index 36121e1b6f0e4..acf4aca8bb6d1 100644 --- a/ruby/red-arrow/lib/arrow/table-table-formatter.rb +++ b/ruby/red-arrow/lib/arrow/table-table-formatter.rb @@ -26,6 +26,13 @@ def format_header(text, column_formatters) text << "\t" text << column_formatter.aligned_name end + if show_column_type? 
+ text << "\n" + column_formatters.each do |column_formatter| + text << "\t" + text << column_formatter.aligned_data_type_name + end + end text << "\n" end diff --git a/ruby/red-arrow/test/test-csv-loader.rb b/ruby/red-arrow/test/test-csv-loader.rb index 0b21f6f9b71f9..1e0445db06ef9 100644 --- a/ruby/red-arrow/test/test-csv-loader.rb +++ b/ruby/red-arrow/test/test-csv-loader.rb @@ -27,80 +27,88 @@ def load_csv(input) test("String: data: with header") do data = fixture_path("with-header-float.csv").read assert_equal(<<-TABLE, load_csv(data).to_s) - name score -0 alice 10.100000 -1 bob 29.200000 -2 chris -1.300000 + name score + (utf8) (double) +0 alice 10.100000 +1 bob 29.200000 +2 chris -1.300000 TABLE end test("String: data: without header") do data = fixture_path("without-header-float.csv").read assert_equal(<<-TABLE, load_csv(data).to_s) - 0 1 -0 alice 10.100000 -1 bob 29.200000 -2 chris -1.300000 + 0 1 + (utf8) (double) +0 alice 10.100000 +1 bob 29.200000 +2 chris -1.300000 TABLE end test("String: path: with header") do path = fixture_path("with-header-float.csv").to_s assert_equal(<<-TABLE, load_csv(path).to_s) - name score -0 alice 10.100000 -1 bob 29.200000 -2 chris -1.300000 + name score + (utf8) (double) +0 alice 10.100000 +1 bob 29.200000 +2 chris -1.300000 TABLE end test("String: path: without header") do path = fixture_path("without-header-float.csv").to_s assert_equal(<<-TABLE, load_csv(path).to_s) - 0 1 -0 alice 10.100000 -1 bob 29.200000 -2 chris -1.300000 + 0 1 + (utf8) (double) +0 alice 10.100000 +1 bob 29.200000 +2 chris -1.300000 TABLE end test("Pathname: with header") do path = fixture_path("with-header-float.csv") assert_equal(<<-TABLE, load_csv(path).to_s) - name score -0 alice 10.100000 -1 bob 29.200000 -2 chris -1.300000 + name score + (utf8) (double) +0 alice 10.100000 +1 bob 29.200000 +2 chris -1.300000 TABLE end test("Pathname: without header") do path = fixture_path("without-header-float.csv") assert_equal(<<-TABLE, load_csv(path).to_s) - 0 1 -0 alice 10.100000 -1 bob 29.200000 -2 chris -1.300000 + 0 1 + (utf8) (double) +0 alice 10.100000 +1 bob 29.200000 +2 chris -1.300000 TABLE end test("null: with double quote") do path = fixture_path("null-with-double-quote.csv").to_s assert_equal(<<-TABLE, load_csv(path).to_s) - name score -0 alice 10 -1 bob (null) -2 chris -1 + name score + (utf8) (int8) +0 alice 10 +1 bob (null) +2 chris -1 TABLE end test("null: without double quote") do path = fixture_path("null-without-double-quote.csv").to_s assert_equal(<<-TABLE, load_csv(path).to_s) - name score -0 alice 10 -1 bob (null) -2 chris -1 + name score + (utf8) (int8) +0 alice 10 +1 bob (null) +2 chris -1 TABLE end diff --git a/ruby/red-arrow/test/test-group.rb b/ruby/red-arrow/test/test-group.rb index 68e927df69bc8..f4831289eda48 100644 --- a/ruby/red-arrow/test/test-group.rb +++ b/ruby/red-arrow/test/test-group.rb @@ -43,6 +43,7 @@ def setup table = Arrow::Table.new(raw_table) assert_equal(<<-TABLE, table.group(:time).count.to_s) time count(int) + (timestamp) (int64) 0 #{time_values[0].iso8601} 1 1 #{time_values[1].iso8601} 1 TABLE @@ -53,6 +54,7 @@ def setup test("single") do assert_equal(<<-TABLE, @table.group(:group_key1).count.to_s) group_key1 count(group_key2) count(int) count(uint) count(float) count(string) + (uint8) (int64) (int64) (int64) (int64) (int64) 0 1 2 2 1 1 2 1 2 1 0 1 1 1 2 3 3 3 3 3 2 @@ -62,6 +64,7 @@ def setup test("multiple") do assert_equal(<<-TABLE, @table.group(:group_key1, :group_key2).count.to_s) group_key1 group_key2 count(int) count(uint) 
count(float) count(string) + (uint8) (uint8) (int64) (int64) (int64) (int64) 0 1 1 2 1 1 2 1 2 1 0 1 1 1 2 3 1 1 1 1 0 @@ -73,6 +76,7 @@ def setup group = @table.group(:group_key1, :group_key2) assert_equal(<<-TABLE, group.count(:int, :uint).to_s) group_key1 group_key2 count(int) count(uint) + (uint8) (uint8) (int64) (int64) 0 1 1 2 1 1 2 1 0 1 2 3 1 1 1 @@ -85,6 +89,7 @@ def setup test("single") do assert_equal(<<-TABLE, @table.group(:group_key1).sum.to_s) group_key1 sum(group_key2) sum(int) sum(uint) sum(float) + (uint8) (uint64) (int64) (uint64) (double) 0 1 2 -3 1 2.200000 1 2 1 (null) 3 3.300000 2 3 5 -15 15 16.500000 @@ -94,6 +99,7 @@ def setup test("multiple") do assert_equal(<<-TABLE, @table.group(:group_key1, :group_key2).sum.to_s) group_key1 group_key2 sum(int) sum(uint) sum(float) + (uint8) (uint8) (int64) (uint64) (double) 0 1 1 -3 1 2.200000 1 2 1 (null) 3 3.300000 2 3 1 -4 4 4.400000 @@ -106,6 +112,7 @@ def setup test("single") do assert_equal(<<-TABLE, @table.group(:group_key1).mean.to_s) group_key1 mean(group_key2) mean(int) mean(uint) mean(float) + (uint8) (double) (double) (double) (double) 0 1 1.000000 -1.500000 1.000000 2.200000 1 2 1.000000 (null) 3.000000 3.300000 2 3 1.666667 -5.000000 5.000000 5.500000 @@ -115,6 +122,7 @@ def setup test("multiple") do assert_equal(<<-TABLE, @table.group(:group_key1, :group_key2).mean.to_s) group_key1 group_key2 mean(int) mean(uint) mean(float) + (uint8) (uint8) (double) (double) (double) 0 1 1 -1.500000 1.000000 2.200000 1 2 1 (null) 3.000000 3.300000 2 3 1 -4.000000 4.000000 4.400000 @@ -127,6 +135,7 @@ def setup test("single") do assert_equal(<<-TABLE, @table.group(:group_key1).min.to_s) group_key1 min(group_key2) min(int) min(uint) min(float) + (uint8) (uint8) (int32) (uint32) (float) 0 1 1 -2 1 2.200000 1 2 1 (null) 3 3.300000 2 3 1 -6 4 4.400000 @@ -136,6 +145,7 @@ def setup test("multiple") do assert_equal(<<-TABLE, @table.group(:group_key1, :group_key2).min.to_s) group_key1 group_key2 min(int) min(uint) min(float) + (uint8) (uint8) (int32) (uint32) (float) 0 1 1 -2 1 2.200000 1 2 1 (null) 3 3.300000 2 3 1 -4 4 4.400000 @@ -148,6 +158,7 @@ def setup test("single") do assert_equal(<<-TABLE, @table.group(:group_key1).max.to_s) group_key1 max(group_key2) max(int) max(uint) max(float) + (uint8) (uint8) (int32) (uint32) (float) 0 1 1 -1 1 2.200000 1 2 1 (null) 3 3.300000 2 3 2 -4 6 6.600000 @@ -157,6 +168,7 @@ def setup test("multiple") do assert_equal(<<-TABLE, @table.group(:group_key1, :group_key2).max.to_s) group_key1 group_key2 max(int) max(uint) max(float) + (uint8) (uint8) (int32) (uint32) (float) 0 1 1 -1 1 2.200000 1 2 1 (null) 3 3.300000 2 3 1 -4 4 4.400000 @@ -170,6 +182,7 @@ def setup group = @table.group(:group_key1, :group_key2) assert_equal(<<-TABLE, group.aggregate("count(int)", "sum(uint)").to_s) group_key1 group_key2 count(int) sum(uint) + (uint8) (uint8) (int64) (uint64) 0 1 1 2 1 1 2 1 0 3 2 3 1 1 4 diff --git a/ruby/red-arrow/test/test-schema.rb b/ruby/red-arrow/test/test-schema.rb index 20d73b2726d6b..c4164d83903f2 100644 --- a/ruby/red-arrow/test/test-schema.rb +++ b/ruby/red-arrow/test/test-schema.rb @@ -95,7 +95,7 @@ def setup test("[invalid]") do invalid = [] - message = "field name or index must be String, Symbol or Integer" + message = +"field name or index must be String, Symbol or Integer" message << ": <#{invalid.inspect}>" assert_raise(ArgumentError.new(message)) do @schema[invalid] diff --git a/ruby/red-arrow/test/test-slicer.rb b/ruby/red-arrow/test/test-slicer.rb index d33748a387c8f..89cf34b0d13f7 
100644 --- a/ruby/red-arrow/test/test-slicer.rb +++ b/ruby/red-arrow/test/test-slicer.rb @@ -45,11 +45,12 @@ def setup slicer.visible end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 1 true -1 8 true -2 16 true -3 256 true + count visible + (uint32) (bool) +0 1 true +1 8 true +2 16 true +3 256 true TABLE end @@ -58,15 +59,16 @@ def setup slicer.count end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 1 true -1 2 false -2 4 (null) -3 8 true -4 16 true -5 32 false -6 64 (null) -7 256 true + count visible + (uint32) (bool) +0 1 true +1 2 false +2 4 (null) +3 8 true +4 16 true +5 32 false +6 64 (null) +7 256 true TABLE end end @@ -77,9 +79,10 @@ def setup !slicer.visible end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 2 false -1 32 false + count visible + (uint32) (bool) +0 2 false +1 32 false TABLE end @@ -88,8 +91,9 @@ def setup !slicer.count end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 0 (null) + count visible + (uint32) (bool) +0 0 (null) TABLE end end @@ -99,11 +103,12 @@ def setup slicer.visible.null? end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 0 (null) -1 4 (null) -2 64 (null) -3 (null) (null) + count visible + (uint32) (bool) +0 0 (null) +1 4 (null) +2 64 (null) +3 (null) (null) TABLE end @@ -112,13 +117,14 @@ def setup slicer.visible.valid? end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 1 true -1 2 false -2 8 true -3 16 true -4 32 false -5 256 true + count visible + (uint32) (bool) +0 1 true +1 2 false +2 8 true +3 16 true +4 32 false +5 256 true TABLE end @@ -128,11 +134,12 @@ def setup slicer.visible == nil end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 0 (null) -1 4 (null) -2 64 (null) -3 (null) (null) + count visible + (uint32) (bool) +0 0 (null) +1 4 (null) +2 64 (null) +3 (null) (null) TABLE end @@ -141,11 +148,12 @@ def setup slicer.visible == true end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 1 true -1 8 true -2 16 true -3 256 true + count visible + (uint32) (bool) +0 1 true +1 8 true +2 16 true +3 256 true TABLE end end @@ -156,13 +164,14 @@ def setup !(slicer.visible == nil) end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 1 true -1 2 false -2 8 true -3 16 true -4 32 false -5 256 true + count visible + (uint32) (bool) +0 1 true +1 2 false +2 8 true +3 16 true +4 32 false +5 256 true TABLE end @@ -171,9 +180,10 @@ def setup !(slicer.visible == true) end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 2 false -1 32 false + count visible + (uint32) (bool) +0 2 false +1 32 false TABLE end end @@ -184,13 +194,14 @@ def setup slicer.visible != nil end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 1 true -1 2 false -2 8 true -3 16 true -4 32 false -5 256 true + count visible + (uint32) (bool) +0 1 true +1 2 false +2 8 true +3 16 true +4 32 false +5 256 true TABLE end @@ -199,9 +210,10 @@ def setup slicer.visible != true end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 2 false -1 32 false + count visible + (uint32) (bool) +0 2 false +1 32 false TABLE end end @@ -211,12 +223,13 @@ def setup slicer.count < 16 end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 0 (null) -1 1 true -2 2 false -3 4 (null) -4 8 true + count visible + (uint32) (bool) +0 0 (null) +1 1 true +2 2 false +3 4 (null) +4 8 true TABLE end @@ -225,11 +238,12 @@ def setup !(slicer.count < 16) end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 16 true -1 32 false -2 64 (null) -3 256 true + 
count visible + (uint32) (bool) +0 16 true +1 32 false +2 64 (null) +3 256 true TABLE end @@ -238,13 +252,14 @@ def setup slicer.count <= 16 end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 0 (null) -1 1 true -2 2 false -3 4 (null) -4 8 true -5 16 true + count visible + (uint32) (bool) +0 0 (null) +1 1 true +2 2 false +3 4 (null) +4 8 true +5 16 true TABLE end @@ -253,10 +268,11 @@ def setup !(slicer.count <= 16) end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 32 false -1 64 (null) -2 256 true + count visible + (uint32) (bool) +0 32 false +1 64 (null) +2 256 true TABLE end @@ -265,10 +281,11 @@ def setup slicer.count > 16 end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 32 false -1 64 (null) -2 256 true + count visible + (uint32) (bool) +0 32 false +1 64 (null) +2 256 true TABLE end @@ -277,13 +294,14 @@ def setup !(slicer.count > 16) end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 0 (null) -1 1 true -2 2 false -3 4 (null) -4 8 true -5 16 true + count visible + (uint32) (bool) +0 0 (null) +1 1 true +2 2 false +3 4 (null) +4 8 true +5 16 true TABLE end @@ -292,11 +310,12 @@ def setup slicer.count >= 16 end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 16 true -1 32 false -2 64 (null) -3 256 true + count visible + (uint32) (bool) +0 16 true +1 32 false +2 64 (null) +3 256 true TABLE end @@ -305,12 +324,13 @@ def setup !(slicer.count >= 16) end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 0 (null) -1 1 true -2 2 false -3 4 (null) -4 8 true + count visible + (uint32) (bool) +0 0 (null) +1 1 true +2 2 false +3 4 (null) +4 8 true TABLE end @@ -319,11 +339,12 @@ def setup slicer.count.in?([1, 4, 16, 64]) end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 1 true -1 4 (null) -2 16 true -3 64 (null) + count visible + (uint32) (bool) +0 1 true +1 4 (null) +2 16 true +3 64 (null) TABLE end @@ -332,13 +353,14 @@ def setup !slicer.count.in?([1, 4, 16, 64]) end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 0 (null) -1 2 false -2 8 true -3 32 false -4 (null) (null) -5 256 true + count visible + (uint32) (bool) +0 0 (null) +1 2 false +2 8 true +3 32 false +4 (null) (null) +5 256 true TABLE end @@ -347,9 +369,10 @@ def setup slicer.visible & (slicer.count >= 16) end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 16 true -1 256 true + count visible + (uint32) (bool) +0 16 true +1 256 true TABLE end @@ -358,12 +381,13 @@ def setup slicer.visible | (slicer.count >= 16) end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 1 true -1 8 true -2 16 true -3 32 false -4 256 true + count visible + (uint32) (bool) +0 1 true +1 8 true +2 16 true +3 32 false +4 256 true TABLE end @@ -372,10 +396,11 @@ def setup slicer.visible ^ (slicer.count >= 16) end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 1 true -1 8 true -2 32 false + count visible + (uint32) (bool) +0 1 true +1 8 true +2 32 false TABLE end @@ -386,15 +411,16 @@ def setup end end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 0 (null) -1 1 true -2 4 (null) -3 8 true -4 16 true -5 64 (null) -6 (null) (null) -7 256 true + count visible + (uint32) (bool) +0 0 (null) +1 1 true +2 4 (null) +3 8 true +4 16 true +5 64 (null) +6 (null) (null) +7 256 true TABLE end @@ -405,9 +431,10 @@ def setup end end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 2 false -1 32 false + count visible + (uint32) (bool) +0 2 false +1 32 false TABLE end @@ -418,9 +445,10 @@ def setup end end 
assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 2 false -1 32 false + count visible + (uint32) (bool) +0 2 false +1 32 false TABLE end @@ -431,15 +459,16 @@ def setup end end assert_equal(<<-TABLE, sliced_table.to_s) - count visible -0 0 (null) -1 1 true -2 4 (null) -3 8 true -4 16 true -5 64 (null) -6 (null) (null) -7 256 true + count visible + (uint32) (bool) +0 0 (null) +1 1 true +2 4 (null) +3 8 true +4 16 true +5 64 (null) +6 (null) (null) +7 256 true TABLE end @@ -456,6 +485,7 @@ def setup end assert_equal(<<~TABLE, sliced_table.to_s) string + (utf8) 0 Arrow 1 window TABLE @@ -467,6 +497,7 @@ def setup end assert_equal(<<~TABLE, sliced_table.to_s) string + (utf8) 0 array 1 Arrow TABLE @@ -478,6 +509,7 @@ def setup end assert_equal(<<~TABLE, sliced_table.to_s) string + (utf8) 0 array 1 carrot TABLE @@ -489,6 +521,7 @@ def setup end assert_equal(<<~TABLE, sliced_table.to_s) string + (utf8) 0 array 1 Arrow 2 carrot @@ -501,6 +534,7 @@ def setup end assert_equal(<<~TABLE, sliced_table.to_s) string + (utf8) 0 Arrow 1 window TABLE @@ -512,6 +546,7 @@ def setup end assert_equal(<<~TABLE, sliced_table.to_s) string + (utf8) 0 Arrow 1 window TABLE @@ -523,6 +558,7 @@ def setup end assert_equal(<<~TABLE, sliced_table.to_s) string + (utf8) 0 array 1 Arrow 2 carrot @@ -545,6 +581,7 @@ def setup end assert_equal(<<~TABLE, sliced_table.to_s) string + (utf8) 0 carrot TABLE end diff --git a/ruby/red-arrow/test/test-struct-data-type.rb b/ruby/red-arrow/test/test-struct-data-type.rb index d106e38b1d841..9bf9a17dd645a 100644 --- a/ruby/red-arrow/test/test-struct-data-type.rb +++ b/ruby/red-arrow/test/test-struct-data-type.rb @@ -101,7 +101,7 @@ def setup test("[invalid]") do invalid = [] - message = "field name or index must be String, Symbol or Integer" + message = +"field name or index must be String, Symbol or Integer" message << ": <#{invalid.inspect}>" assert_raise(ArgumentError.new(message)) do @data_type[invalid] diff --git a/ruby/red-arrow/test/test-table.rb b/ruby/red-arrow/test/test-table.rb index 883cf70c269bb..a69e926156809 100644 --- a/ruby/red-arrow/test/test-table.rb +++ b/ruby/red-arrow/test/test-table.rb @@ -87,24 +87,26 @@ def array_like.to_ary target_rows_raw = [nil, true, true, false, true, false, true, true] target_rows = Arrow::BooleanArray.new(target_rows_raw) assert_equal(<<-TABLE, @table.slice(target_rows).to_s) - count visible -0 2 false -1 4 (null) -2 16 true -3 64 (null) -4 128 (null) + count visible + (uint8) (bool) +0 2 false +1 4 (null) +2 16 true +3 64 (null) +4 128 (null) TABLE end test("Array: boolean") do target_rows_raw = [nil, true, true, false, true, false, true, true] assert_equal(<<-TABLE, @table.slice(target_rows_raw).to_s) - count visible -0 2 false -1 4 (null) -2 16 true -3 64 (null) -4 128 (null) + count visible + (uint8) (bool) +0 2 false +1 4 (null) +2 16 true +3 64 (null) +4 128 (null) TABLE end @@ -131,83 +133,93 @@ def array_like.to_ary test("Range: positive: include end") do assert_equal(<<-TABLE, @table.slice(2..4).to_s) - count visible -0 4 (null) -1 8 true -2 16 true + count visible + (uint8) (bool) +0 4 (null) +1 8 true +2 16 true TABLE end test("Range: positive: exclude end") do assert_equal(<<-TABLE, @table.slice(2...4).to_s) - count visible -0 4 (null) -1 8 true + count visible + (uint8) (bool) +0 4 (null) +1 8 true TABLE end test("Range: negative: include end") do assert_equal(<<-TABLE, @table.slice(-4..-2).to_s) - count visible -0 16 true -1 32 false -2 64 (null) + count visible + (uint8) (bool) +0 16 true +1 32 false +2 64 (null) 
TABLE end test("Range: negative: exclude end") do assert_equal(<<-TABLE, @table.slice(-4...-2).to_s) - count visible -0 16 true -1 32 false + count visible + (uint8) (bool) +0 16 true +1 32 false TABLE end test("[from, to]: positive") do assert_equal(<<-TABLE, @table.slice(0, 2).to_s) - count visible -0 1 true -1 2 false + count visible + (uint8) (bool) +0 1 true +1 2 false TABLE end test("[from, to]: negative") do assert_equal(<<-TABLE, @table.slice(-4, 2).to_s) - count visible -0 16 true -1 32 false + count visible + (uint8) (bool) +0 16 true +1 32 false TABLE end test("{key: Number}") do assert_equal(<<-TABLE, @table.slice(count: 16).to_s) - count visible -0 16 true + count visible + (uint8) (bool) +0 16 true TABLE end test("{key: String}") do table = Arrow::Table.new(name: Arrow::StringArray.new(["a", "b", "c"])) assert_equal(<<-TABLE, table.slice(name: 'b').to_s) - name -0 b + name + (utf8) +0 b TABLE end test("{key: true}") do assert_equal(<<-TABLE, @table.slice(visible: true).to_s) - count visible -0 1 true -1 8 true -2 16 true + count visible + (uint8) (bool) +0 1 true +1 8 true +2 16 true TABLE end test("{key: false}") do assert_equal(<<-TABLE, @table.slice(visible: false).to_s) - count visible -0 2 false -1 32 false + count visible + (uint8) (bool) +0 2 false +1 32 false TABLE end @@ -218,11 +230,12 @@ def array_like.to_ary omit("beginless range isn't supported") end assert_equal(<<-TABLE, @table.slice(count: range).to_s) - count visible -0 1 true -1 2 false -2 4 (null) -3 8 true + count visible + (uint8) (bool) +0 1 true +1 2 false +2 4 (null) +3 8 true TABLE end @@ -233,10 +246,11 @@ def array_like.to_ary omit("beginless range isn't supported") end assert_equal(<<-TABLE, @table.slice(count: range).to_s) - count visible -0 1 true -1 2 false -2 4 (null) + count visible + (uint8) (bool) +0 1 true +1 2 false +2 4 (null) TABLE end @@ -247,39 +261,43 @@ def array_like.to_ary omit("endless range isn't supported") end assert_equal(<<-TABLE, @table.slice(count: range).to_s) - count visible -0 16 true -1 32 false -2 64 (null) -3 128 (null) + count visible + (uint8) (bool) +0 16 true +1 32 false +2 64 (null) +3 128 (null) TABLE end test("{key: Range}: include end") do assert_equal(<<-TABLE, @table.slice(count: 1..16).to_s) - count visible -0 1 true -1 2 false -2 4 (null) -3 8 true -4 16 true + count visible + (uint8) (bool) +0 1 true +1 2 false +2 4 (null) +3 8 true +4 16 true TABLE end test("{key: Range}: exclude end") do assert_equal(<<-TABLE, @table.slice(count: 1...16).to_s) - count visible -0 1 true -1 2 false -2 4 (null) -3 8 true + count visible + (uint8) (bool) +0 1 true +1 2 false +2 4 (null) +3 8 true TABLE end test("{key1: Range, key2: true}") do assert_equal(<<-TABLE, @table.slice(count: 0..8, visible: false).to_s) - count visible -0 2 false + count visible + (uint8) (bool) +0 2 false TABLE end @@ -372,44 +390,47 @@ def setup test("add") do name_array = Arrow::StringArray.new(["a", "b", "c", "d", "e", "f", "g", "h"]) assert_equal(<<-TABLE, @table.merge(:name => name_array).to_s) - count visible name -0 1 true a -1 2 false b -2 4 (null) c -3 8 true d -4 16 true e -5 32 false f -6 64 (null) g -7 128 (null) h + count visible name + (uint8) (bool) (utf8) +0 1 true a +1 2 false b +2 4 (null) c +3 8 true d +4 16 true e +5 32 false f +6 64 (null) g +7 128 (null) h TABLE end test("remove") do assert_equal(<<-TABLE, @table.merge(:visible => nil).to_s) - count -0 1 -1 2 -2 4 -3 8 -4 16 -5 32 -6 64 -7 128 + count + (uint8) +0 1 +1 2 +2 4 +3 8 +4 16 +5 32 +6 64 +7 128 TABLE end 
test("replace") do visible_array = Arrow::Int32Array.new([1] * @visible_array.length) assert_equal(<<-TABLE, @table.merge(:visible => visible_array).to_s) - count visible -0 1 1 -1 2 1 -2 4 1 -3 8 1 -4 16 1 -5 32 1 -6 64 1 -7 128 1 + count visible + (uint8) (int32) +0 1 1 +1 2 1 +2 4 1 +3 8 1 +4 16 1 +5 32 1 +6 64 1 +7 128 1 TABLE end end @@ -419,15 +440,16 @@ def setup name_array = Arrow::StringArray.new(["a", "b", "c", "d", "e", "f", "g", "h"]) table = Arrow::Table.new("name" => name_array) assert_equal(<<-TABLE, @table.merge(table).to_s) - count visible name -0 1 true a -1 2 false b -2 4 (null) c -3 8 true d -4 16 true e -5 32 false f -6 64 (null) g -7 128 (null) h + count visible name + (uint8) (bool) (utf8) +0 1 true a +1 2 false b +2 4 (null) c +3 8 true d +4 16 true e +5 32 false f +6 64 (null) g +7 128 (null) h TABLE end @@ -435,15 +457,16 @@ def setup visible_array = Arrow::Int32Array.new([1] * @visible_array.length) table = Arrow::Table.new("visible" => visible_array) assert_equal(<<-TABLE, @table.merge(table).to_s) - count visible -0 1 1 -1 2 1 -2 4 1 -3 8 1 -4 16 1 -5 32 1 -6 64 1 -7 128 1 + count visible + (uint8) (int32) +0 1 1 +1 2 1 +2 4 1 +3 8 1 +4 16 1 +5 32 1 +6 64 1 +7 128 1 TABLE end end @@ -457,29 +480,31 @@ def setup sub_test_case("#remove_column") do test("String") do assert_equal(<<-TABLE, @table.remove_column("visible").to_s) - count -0 1 -1 2 -2 4 -3 8 -4 16 -5 32 -6 64 -7 128 + count + (uint8) +0 1 +1 2 +2 4 +3 8 +4 16 +5 32 +6 64 +7 128 TABLE end test("Symbol") do assert_equal(<<-TABLE, @table.remove_column(:visible).to_s) - count -0 1 -1 2 -2 4 -3 8 -4 16 -5 32 -6 64 -7 128 + count + (uint8) +0 1 +1 2 +2 4 +3 8 +4 16 +5 32 +6 64 +7 128 TABLE end @@ -491,29 +516,31 @@ def setup test("Integer") do assert_equal(<<-TABLE, @table.remove_column(1).to_s) - count -0 1 -1 2 -2 4 -3 8 -4 16 -5 32 -6 64 -7 128 + count + (uint8) +0 1 +1 2 +2 4 +3 8 +4 16 +5 32 +6 64 +7 128 TABLE end test("negative integer") do assert_equal(<<-TABLE, @table.remove_column(-1).to_s) - count -0 1 -1 2 -2 4 -3 8 -4 16 -5 32 -6 64 -7 128 + count + (uint8) +0 1 +1 2 +2 4 +3 8 +4 16 +5 32 +6 64 +7 128 TABLE end @@ -544,29 +571,33 @@ def setup test("names") do assert_equal(<<-TABLE, @table.select_columns(:c, :a).to_s) - c a -0 1 1 + c a + (uint8) (uint8) +0 1 1 TABLE end test("range") do assert_equal(<<-TABLE, @table.select_columns(2...4).to_s) - c d -0 1 1 + c d + (uint8) (uint8) +0 1 1 TABLE end test("indexes") do assert_equal(<<-TABLE, @table.select_columns(0, -1, 2).to_s) - a e c -0 1 1 1 + a e c + (uint8) (uint8) (uint8) +0 1 1 1 TABLE end test("mixed") do assert_equal(<<-TABLE, @table.select_columns(:a, -1, 2..3).to_s) - a e c d -0 1 1 1 1 + a e c d + (uint8) (uint8) (uint8) (uint8) +0 1 1 1 1 TABLE end @@ -575,8 +606,9 @@ def setup column.name == "a" or i.odd? end assert_equal(<<-TABLE, selected_table.to_s) - a b d -0 1 1 1 + a b d + (uint8) (uint8) (uint8) +0 1 1 1 TABLE end @@ -585,15 +617,17 @@ def setup column.name == "a" end assert_equal(<<-TABLE, selected_table.to_s) - a -0 1 + a + (uint8) +0 1 TABLE end test("empty result") do selected_table = @table.filter([false] * @table.size).select_columns(:a) assert_equal(<<-TABLE, selected_table.to_s) - a + a + (uint8) TABLE end end @@ -682,7 +716,7 @@ def test_json output = create_output(".json") # TODO: Implement this. 
# @table.save(output, format: :json) - columns = "" + columns = +"" @table.each_record.each do |record| column = { "count" => record.count, @@ -789,10 +823,11 @@ def create_output(extension) path = fixture_path("with-header.csv") table = Arrow::Table.load(path, skip_lines: /^\#/) assert_equal(<<-TABLE, table.to_s) - name score -0 alice 10 -1 bob 29 -2 chris -1 + name score + (utf8) (int8) +0 alice 10 +1 bob 29 +2 chris -1 TABLE end @@ -808,10 +843,11 @@ def create_output(extension) CSV end assert_equal(<<-TABLE, Arrow::Table.load(file.path).to_s) - name score -0 alice 10 -1 bob 29 -2 chris -1 + name score + (utf8) (int64) +0 alice 10 +1 bob 29 +2 chris -1 TABLE end @@ -826,10 +862,11 @@ def create_output(extension) file.close table = Arrow::Table.load(file.path) assert_equal(<<-TABLE, table.to_s) - name score -0 alice 10 -1 bob 29 -2 chris -1 + name score + (utf8) (int64) +0 alice 10 +1 bob 29 +2 chris -1 TABLE end end @@ -881,7 +918,7 @@ def test_http(data) output.data.to_s, content_type) do |port| input = URI("http://127.0.0.1:#{port}#{path}") - loaded_table = Arrow::Table.load(input) + loaded_table = Arrow::Table.load(input, schema: @table.schema) assert_equal(@table.to_s, loaded_table.to_s) end end @@ -962,15 +999,16 @@ def test_join packed_table = @table.pack column_n_chunks = packed_table.columns.collect {|c| c.data.n_chunks} assert_equal([[1, 1], <<-TABLE], [column_n_chunks, packed_table.to_s]) - count visible -0 1 true -1 2 false -2 4 (null) -3 8 true -4 16 true -5 32 false -6 64 (null) -7 128 (null) + count visible + (uint8) (bool) +0 1 true +1 2 false +2 4 (null) +3 8 true +4 16 true +5 32 false +6 64 (null) +7 128 (null) TABLE end @@ -1009,19 +1047,20 @@ def setup test(":list") do assert_equal(<<-TABLE, @table.to_s(format: :list)) ==================== 0 ==================== -count: 1 -visible: true +count(uint8): 1 +visible(bool): true ==================== 1 ==================== -count: 2 -visible: false +count(uint8): 2 +visible(bool): false TABLE end test(":table") do assert_equal(<<-TABLE, @table.to_s(format: :table)) - count visible -0 1 true -1 2 false + count visible + (uint8) (bool) +0 1 true +1 2 false TABLE end @@ -1033,6 +1072,35 @@ def setup end end + sub_test_case(":show_column_type") do + def setup + columns = { + "count" => Arrow::UInt8Array.new([1, 2]), + "visible" => Arrow::BooleanArray.new([true, false]), + } + @table = Arrow::Table.new(columns) + end + + test(":list") do + assert_equal(<<-TABLE, @table.to_s(format: :list, show_column_type: false)) +==================== 0 ==================== +count: 1 +visible: true +==================== 1 ==================== +count: 2 +visible: false + TABLE + end + + test(":table") do + assert_equal(<<-TABLE, @table.to_s(format: :table, show_column_type: false)) + count visible +0 1 true +1 2 false + TABLE + end + end + sub_test_case("#==") do test("Arrow::Table") do assert do @@ -1058,13 +1126,14 @@ def setup test("Array: boolean") do filter = [nil, true, true, false, true, false, true, true] assert_equal(<<-TABLE, @table.filter(filter, @options).to_s) - count visible -0 (null) (null) -1 2 false -2 4 (null) -3 16 true -4 64 (null) -5 128 (null) + count visible + (uint8) (bool) +0 (null) (null) +1 2 false +2 4 (null) +3 16 true +4 64 (null) +5 128 (null) TABLE end @@ -1072,13 +1141,14 @@ def setup array = [nil, true, true, false, true, false, true, true] filter = Arrow::BooleanArray.new(array) assert_equal(<<-TABLE, @table.filter(filter, @options).to_s) - count visible -0 (null) (null) -1 2 false -2 4 (null) -3 16 true -4 64 
(null) -5 128 (null) + count visible + (uint8) (bool) +0 (null) (null) +1 2 false +2 4 (null) +3 16 true +4 64 (null) +5 128 (null) TABLE end @@ -1090,13 +1160,14 @@ def setup ] filter = Arrow::ChunkedArray.new(filter_chunks) assert_equal(<<-TABLE, @table.filter(filter, @options).to_s) - count visible -0 (null) (null) -1 2 false -2 4 (null) -3 16 true -4 64 (null) -5 128 (null) + count visible + (uint8) (bool) +0 (null) (null) +1 2 false +2 4 (null) +3 16 true +4 64 (null) +5 128 (null) TABLE end end @@ -1105,20 +1176,22 @@ def setup test("Arrow: boolean") do indices = [1, 0, 2] assert_equal(<<-TABLE, @table.take(indices).to_s) - count visible -0 2 false -1 1 true -2 4 (null) + count visible + (uint8) (bool) +0 2 false +1 1 true +2 4 (null) TABLE end test("Arrow::Array") do indices = Arrow::Int16Array.new([1, 0, 2]) assert_equal(<<-TABLE, @table.take(indices).to_s) - count visible -0 2 false -1 1 true -2 4 (null) + count visible + (uint8) (bool) +0 2 false +1 1 true +2 4 (null) TABLE end @@ -1129,10 +1202,11 @@ def setup ] indices = Arrow::ChunkedArray.new(chunks) assert_equal(<<-TABLE, @table.take(indices).to_s) - count visible -0 2 false -1 1 true -2 4 (null) + count visible + (uint8) (bool) +0 2 false +1 1 true +2 4 (null) TABLE end end @@ -1144,9 +1218,10 @@ def setup table2 = Arrow::Table.new(b: [false]) concatenated = table1.concatenate([table2], unify_schemas: true) assert_equal(<<-TABLE, concatenated.to_s) - a b -0 true false -1 (null) false + a b + (bool) (bool) +0 true false +1 (null) false TABLE end end From 64be7a2d073759b5eea90e3a6167ca2c1d56ce79 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 26 Apr 2024 17:15:13 +0900 Subject: [PATCH 022/106] GH-41333: [C++][CMake] Prefer protobuf-config.cmake to FindProtobuf.cmake (#41360) ### Rationale for this change `protobuf::libprotobuf` provided by `FindProtobuf.cmake` (provided by CMake) may not provide needed dependencies such as Abseil. ### What changes are included in this PR? Try `protobuf-config.cmake` provided by Protobuf before `FindProtobuf.cmake`. `protobuf::libprotobuf` provided by `protobuf-config.cmake` must have needed dependencies. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. 
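
For reference, the CONFIG-first/MODULE-fallback pattern this change adopts can be sketched as follows (a minimal standalone illustration, not the exact `FindProtobufAlt.cmake` contents; the `arrow_example` target name is made up for the example):

```cmake
# Prefer the config package shipped by Protobuf itself (protobuf-config.cmake),
# whose protobuf::libprotobuf target carries its own dependencies such as Abseil.
find_package(protobuf CONFIG QUIET)
if(NOT protobuf_FOUND)
  # Fall back to CMake's bundled FindProtobuf.cmake (module mode) only when
  # the config package is unavailable.
  find_package(Protobuf MODULE REQUIRED)
endif()
# Either path defines the imported target protobuf::libprotobuf.
target_link_libraries(arrow_example PRIVATE protobuf::libprotobuf)
```
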
* GitHub Issue: #41333

Authored-by: Sutou Kouhei
Signed-off-by: Sutou Kouhei
---
 cpp/cmake_modules/FindProtobufAlt.cmake | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/cpp/cmake_modules/FindProtobufAlt.cmake b/cpp/cmake_modules/FindProtobufAlt.cmake
index 15fe1b4f27ef7..f343b42f2b762 100644
--- a/cpp/cmake_modules/FindProtobufAlt.cmake
+++ b/cpp/cmake_modules/FindProtobufAlt.cmake
@@ -28,12 +28,22 @@ endif()
 if(ProtobufAlt_FIND_QUIETLY)
   list(APPEND find_package_args QUIET)
 endif()
-find_package(Protobuf ${find_package_args})
-set(ProtobufAlt_FOUND ${Protobuf_FOUND})
+find_package(protobuf CONFIG ${find_package_args})
+set(ProtobufAlt_FOUND ${protobuf_FOUND})
 if(ProtobufAlt_FOUND)
-  set(ProtobufAlt_VERSION ${Protobuf_VERSION})
-  set(ProtobufAlt_VERSION_MAJOR ${Protobuf_VERSION_MAJOR})
-  set(ProtobufAlt_VERSION_MINOR ${Protobuf_VERSION_MINOR})
-  set(ProtobufAlt_VERSION_PATCH ${Protobuf_VERSION_PATCH})
-  set(ProtobufAlt_VERSION_TWEEK ${Protobuf_VERSION_TWEEK})
+  set(ProtobufAlt_VERSION ${protobuf_VERSION})
+  set(ProtobufAlt_VERSION_MAJOR ${protobuf_VERSION_MAJOR})
+  set(ProtobufAlt_VERSION_MINOR ${protobuf_VERSION_MINOR})
+  set(ProtobufAlt_VERSION_PATCH ${protobuf_VERSION_PATCH})
+  set(ProtobufAlt_VERSION_TWEEK ${protobuf_VERSION_TWEEK})
+else()
+  find_package(Protobuf ${find_package_args})
+  set(ProtobufAlt_FOUND ${Protobuf_FOUND})
+  if(ProtobufAlt_FOUND)
+    set(ProtobufAlt_VERSION ${Protobuf_VERSION})
+    set(ProtobufAlt_VERSION_MAJOR ${Protobuf_VERSION_MAJOR})
+    set(ProtobufAlt_VERSION_MINOR ${Protobuf_VERSION_MINOR})
+    set(ProtobufAlt_VERSION_PATCH ${Protobuf_VERSION_PATCH})
+    set(ProtobufAlt_VERSION_TWEEK ${Protobuf_VERSION_TWEEK})
+  endif()
 endif()

From 6cbdec55b6bbaee3f00a42541bf89998452a3718 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?=
Date: Fri, 26 Apr 2024 12:26:22 +0200
Subject: [PATCH 023/106] GH-41282: [Dev] Always prompt next major version on merge script if it exists (#41305)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Rationale for this change

When we created the `16.1.0` milestone, the merge script started prompting it instead of `17.0.0`; we want to default to the next major release.

### What changes are included in this PR?

Update archery logic to default to major versions only.

### Are these changes tested?

I've tested locally and now it defaults to `17.0.0`:
```
Enter fix version [17.0.0]:
```

### Are there any user-facing changes?
No * GitHub Issue: #41282 Authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- dev/merge_arrow_pr.py | 14 +++------ dev/test_merge_arrow_pr.py | 62 ++++++++++++++++++++++++++++---------- 2 files changed, 51 insertions(+), 25 deletions(-) diff --git a/dev/merge_arrow_pr.py b/dev/merge_arrow_pr.py index 25d3372d8b4d3..344d943fd87e1 100755 --- a/dev/merge_arrow_pr.py +++ b/dev/merge_arrow_pr.py @@ -306,15 +306,11 @@ def version_tuple(x): # Only suggest versions starting with a number, like 0.x but not JS-0.x mainline_versions = all_versions - mainline_non_patch_versions = [] - for v in mainline_versions: - (major, minor, patch) = v.split(".") - if patch == "0": - mainline_non_patch_versions.append(v) - - if len(mainline_versions) > len(mainline_non_patch_versions): - # If there is a non-patch release, suggest that instead - mainline_versions = mainline_non_patch_versions + major_versions = [v for v in mainline_versions if v.endswith('.0.0')] + + if len(mainline_versions) > len(major_versions): + # If there is a future major release, suggest that + mainline_versions = major_versions mainline_versions = [v for v in mainline_versions if f"maint-{v}" not in maintenance_branches] diff --git a/dev/test_merge_arrow_pr.py b/dev/test_merge_arrow_pr.py index 305b08f2830bb..0067c10414c65 100755 --- a/dev/test_merge_arrow_pr.py +++ b/dev/test_merge_arrow_pr.py @@ -26,14 +26,17 @@ FakeIssue = namedtuple('issue', ['fields']) FakeFields = namedtuple('fields', ['status', 'summary', 'assignee', - 'components', 'fixVersions']) + 'components', 'fixVersions', 'milestone']) FakeAssignee = namedtuple('assignee', ['displayName']) FakeStatus = namedtuple('status', ['name']) FakeComponent = namedtuple('component', ['name']) FakeVersion = namedtuple('version', ['name', 'raw']) +FakeMilestone = namedtuple('milestone', ['state']) RAW_VERSION_JSON = [ {'name': 'JS-0.4.0', 'released': False}, + {'name': '1.0.0', 'released': False}, + {'name': '2.0.0', 'released': False}, {'name': '0.9.0', 'released': False}, {'name': '0.10.0', 'released': False}, {'name': '0.8.0', 'released': True}, @@ -50,7 +53,7 @@ status = FakeStatus('In Progress') fields = FakeFields(status, 'issue summary', FakeAssignee('groundhog'), [FakeComponent('C++'), FakeComponent('Format')], - []) + [], FakeMilestone('closed')._asdict()) FAKE_ISSUE_1 = FakeIssue(fields) @@ -92,6 +95,31 @@ def project_versions(self, project): return self._project_versions +class FakeGitHub: + + def __init__(self, issue=None, project_versions=None): + self._issue = issue + self._project_versions = project_versions + + @property + def issue(self): + return self._issue.fields._asdict() + + @property + def current_versions(self): + all_versions = self._project_versions or SOURCE_VERSIONS + return [ + v for v in all_versions if not v.raw.get("released") + ] + ['0.11.0'] + + @property + def current_fix_versions(self): + return 'JS-0.4.0' + + def project_versions(self, project): + return self._project_versions + + class FakeCLI: def __init__(self, responses=()): @@ -115,11 +143,11 @@ def test_jira_fix_versions(): fix_version = merge_arrow_pr.get_candidate_fix_version( issue.current_versions ) - assert fix_version == '0.9.0' + assert fix_version == '1.0.0' def test_jira_fix_versions_filters_maintenance(): - maintenance_branches = ["maint-0.9.0"] + maintenance_branches = ["maint-1.0.0"] jira = FakeJIRA(project_versions=SOURCE_VERSIONS, transitions=TRANSITIONS) @@ -128,13 +156,14 @@ def test_jira_fix_versions_filters_maintenance(): issue.current_versions, 
maintenance_branches=maintenance_branches ) - assert fix_version == '0.10.0' + assert fix_version == '2.0.0' -def test_jira_no_suggest_patch_release(): +def test_jira_only_suggest_major_release(): versions_json = [ {'name': '0.9.1', 'released': False}, {'name': '0.10.0', 'released': False}, + {'name': '1.0.0', 'released': False}, ] versions = [FakeVersion(raw['name'], raw) for raw in versions_json] @@ -144,7 +173,7 @@ def test_jira_no_suggest_patch_release(): fix_version = merge_arrow_pr.get_candidate_fix_version( issue.current_versions ) - assert fix_version == '0.10.0' + assert fix_version == '1.0.0' def test_jira_parquet_no_suggest_non_cpp(): @@ -153,8 +182,10 @@ def test_jira_parquet_no_suggest_non_cpp(): {'name': 'cpp-1.5.0', 'released': True}, {'name': 'cpp-1.6.0', 'released': False}, {'name': 'cpp-1.7.0', 'released': False}, + {'name': 'cpp-2.0.0', 'released': False}, {'name': '1.11.0', 'released': False}, - {'name': '1.12.0', 'released': False} + {'name': '1.12.0', 'released': False}, + {'name': '2.0.0', 'released': False} ] versions = [FakeVersion(raw['name'], raw) @@ -166,7 +197,7 @@ def test_jira_parquet_no_suggest_non_cpp(): fix_version = merge_arrow_pr.get_candidate_fix_version( issue.current_versions ) - assert fix_version == 'cpp-1.6.0' + assert fix_version == 'cpp-2.0.0' def test_jira_invalid_issue(): @@ -219,13 +250,12 @@ def test_jira_resolve_non_mainline(): def test_jira_resolve_released_fix_version(): # ARROW-5083 - jira = FakeJIRA(issue=FAKE_ISSUE_1, - project_versions=SOURCE_VERSIONS, - transitions=TRANSITIONS) + jira = FakeGitHub(issue=FAKE_ISSUE_1, + project_versions=SOURCE_VERSIONS) - cmd = FakeCLI(responses=['0.7.0']) + cmd = FakeCLI(responses=['1.0.0']) fix_versions_json = merge_arrow_pr.prompt_for_fix_version(cmd, jira) - assert fix_versions_json == "0.7.0" + assert fix_versions_json == "1.0.0" def test_multiple_authors_bad_input(): @@ -256,7 +286,7 @@ def test_multiple_authors_bad_input(): def test_jira_already_resolved(): status = FakeStatus('Resolved') fields = FakeFields(status, 'issue summary', FakeAssignee('groundhog'), - [FakeComponent('Java')], []) + [FakeComponent('Java')], [], None) issue = FakeIssue(fields) jira = FakeJIRA(issue=issue, @@ -287,7 +317,7 @@ def test_no_unset_point_release_fix_version(): fields = FakeFields(status, 'summary', FakeAssignee('someone'), [FakeComponent('Java')], [FakeVersion(v, versions_json[v]) - for v in ['0.17.0', '0.15.1', '0.14.2']]) + for v in ['0.17.0', '0.15.1', '0.14.2']], None) issue = FakeIssue(fields) jira = FakeJIRA( From 2710626b234d5e387a3c63988ca5899c70547dcf Mon Sep 17 00:00:00 2001 From: Curt Hagenlocher Date: Fri, 26 Apr 2024 06:26:42 -0700 Subject: [PATCH 024/106] GH-41375: [C#] Move to .NET 8.0 (#41376) ### What changes are included in this PR? Changes to workflow infrastructure and projects to install and target net8.0 instead of net7.0. ### Are these changes tested? Yes ### Are there any user-facing changes? Users will need to install .NET 8 to run tests and examples. No impact on product code. 
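
To make the retargeting concrete, this is roughly what the change looks like in an affected project file (a minimal sketch; `Example.csproj` is hypothetical, and the multi-targeting line mirrors what the test projects in this patch use):

```xml
<!-- Hypothetical minimal project file: net7.0 is replaced by net8.0.
     Test projects that also exercise .NET Framework keep multi-targeting,
     e.g. <TargetFrameworks>net8.0;net472;net462</TargetFrameworks>. -->
<Project Sdk="Microsoft.NET.Sdk">
  <PropertyGroup>
    <TargetFramework>net8.0</TargetFramework>
  </PropertyGroup>
</Project>
```
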
Closes #41375 * GitHub Issue: #41375 Lead-authored-by: Curt Hagenlocher Co-authored-by: Sutou Kouhei Signed-off-by: Curt Hagenlocher --- .env | 2 +- .github/workflows/csharp.yml | 6 +++--- .github/workflows/dev.yml | 2 +- ci/docker/conda-integration.dockerfile | 2 +- ci/docker/ubuntu-22.04-csharp.dockerfile | 2 +- .../FlightAspServerExample/FlightAspServerExample.csproj | 2 +- .../examples/FlightClientExample/FlightClientExample.csproj | 2 +- .../FluentBuilderExample/FluentBuilderExample.csproj | 2 +- .../Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj | 2 +- .../Apache.Arrow.Compression.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Sql.Tests.csproj | 2 +- .../Apache.Arrow.Flight.TestWeb.csproj | 2 +- .../Apache.Arrow.Flight.Tests.csproj | 2 +- .../Apache.Arrow.IntegrationTest.csproj | 2 +- csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj | 6 ++++-- dev/archery/archery/integration/tester_csharp.py | 6 +++--- dev/release/verify-release-candidate.sh | 4 ++-- dev/tasks/verify-rc/github.macos.yml | 4 ++-- docs/source/developers/release_verification.rst | 2 +- 19 files changed, 28 insertions(+), 26 deletions(-) diff --git a/.env b/.env index f379ca14cd205..d9f875a4d454e 100644 --- a/.env +++ b/.env @@ -56,7 +56,7 @@ UBUNTU=20.04 CLANG_TOOLS=14 CUDA=11.2.2 DASK=latest -DOTNET=7.0 +DOTNET=8.0 GCC_VERSION="" GO=1.21.8 STATICCHECK=v0.4.7 diff --git a/.github/workflows/csharp.yml b/.github/workflows/csharp.yml index 17ef2de81088f..12f946fe66fc9 100644 --- a/.github/workflows/csharp.yml +++ b/.github/workflows/csharp.yml @@ -46,7 +46,7 @@ jobs: strategy: fail-fast: false matrix: - dotnet: ['7.0.x'] + dotnet: ['8.0.x'] steps: - name: Install C# uses: actions/setup-dotnet@v4 @@ -74,7 +74,7 @@ jobs: strategy: fail-fast: false matrix: - dotnet: ['7.0.x'] + dotnet: ['8.0.x'] steps: - name: Install C# uses: actions/setup-dotnet@v4 @@ -101,7 +101,7 @@ jobs: strategy: fail-fast: false matrix: - dotnet: ['7.0.x'] + dotnet: ['8.0.x'] steps: - name: Install C# uses: actions/setup-dotnet@v4 diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 7c438be6024d6..6111d1d2e5fe3 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -115,7 +115,7 @@ jobs: - name: Install .NET uses: actions/setup-dotnet@4d6c8fcf3c8f7a60068d26b594648e99df24cee3 # v4.0.0 with: - dotnet-version: '7.0.x' + dotnet-version: '8.0.x' - name: Install Dependencies shell: bash run: | diff --git a/ci/docker/conda-integration.dockerfile b/ci/docker/conda-integration.dockerfile index a747ccbc7262f..30b9cd5199fab 100644 --- a/ci/docker/conda-integration.dockerfile +++ b/ci/docker/conda-integration.dockerfile @@ -56,7 +56,7 @@ RUN wget -nv -O - https://dl.google.com/go/go${go}.linux-${arch}.tar.gz | tar -x ENV DOTNET_ROOT=/opt/dotnet \ PATH=/opt/dotnet:$PATH -RUN curl -sSL https://dot.net/v1/dotnet-install.sh | bash /dev/stdin -Channel 7.0 -InstallDir /opt/dotnet +RUN curl -sSL https://dot.net/v1/dotnet-install.sh | bash /dev/stdin -Channel 8.0 -InstallDir /opt/dotnet ENV ARROW_ACERO=OFF \ ARROW_AZURE=OFF \ diff --git a/ci/docker/ubuntu-22.04-csharp.dockerfile b/ci/docker/ubuntu-22.04-csharp.dockerfile index aebbd8fab74e9..4d77ba060b877 100644 --- a/ci/docker/ubuntu-22.04-csharp.dockerfile +++ b/ci/docker/ubuntu-22.04-csharp.dockerfile @@ -16,7 +16,7 @@ # under the License. 
ARG arch=amd64 -ARG dotnet=7.0 +ARG dotnet=8.0 ARG platform=jammy FROM mcr.microsoft.com/dotnet/sdk:${dotnet}-${platform}-${arch} diff --git a/csharp/examples/FlightAspServerExample/FlightAspServerExample.csproj b/csharp/examples/FlightAspServerExample/FlightAspServerExample.csproj index 98e8bb324c727..79312520ba8c0 100644 --- a/csharp/examples/FlightAspServerExample/FlightAspServerExample.csproj +++ b/csharp/examples/FlightAspServerExample/FlightAspServerExample.csproj @@ -20,7 +20,7 @@ - net7.0 + net8.0 enable enable 10 diff --git a/csharp/examples/FlightClientExample/FlightClientExample.csproj b/csharp/examples/FlightClientExample/FlightClientExample.csproj index 228c5cf4ea44c..ce0ec83359769 100644 --- a/csharp/examples/FlightClientExample/FlightClientExample.csproj +++ b/csharp/examples/FlightClientExample/FlightClientExample.csproj @@ -21,7 +21,7 @@ Exe - net7.0 + net8.0 diff --git a/csharp/examples/FluentBuilderExample/FluentBuilderExample.csproj b/csharp/examples/FluentBuilderExample/FluentBuilderExample.csproj index a7bd5392f007d..b1d76d4db00e0 100644 --- a/csharp/examples/FluentBuilderExample/FluentBuilderExample.csproj +++ b/csharp/examples/FluentBuilderExample/FluentBuilderExample.csproj @@ -2,7 +2,7 @@ Exe - net7.0 + net8.0 diff --git a/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj b/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj index d44b7488e3b17..f735f01b022d0 100644 --- a/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj +++ b/csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj @@ -2,7 +2,7 @@ Exe - net7.0 + net8.0 diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index 5cc0d303e881e..b386ccf79c12c 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -1,7 +1,7 @@ - net7.0 + net8.0 false diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index 5b7c10f35bed0..ae6f9f1e69667 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -1,7 +1,7 @@ - net7.0 + net8.0 false diff --git a/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj b/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj index d7a2042a4581a..bd6425e7ed99b 100644 --- a/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj +++ b/csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj @@ -1,7 +1,7 @@ - net7.0 + net8.0 diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index 050d0f452cc4e..ed158ca8656d3 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -1,7 +1,7 @@ - net7.0 + net8.0 false diff --git a/csharp/test/Apache.Arrow.IntegrationTest/Apache.Arrow.IntegrationTest.csproj b/csharp/test/Apache.Arrow.IntegrationTest/Apache.Arrow.IntegrationTest.csproj index e77f329bf2a15..7f226fd08818f 100644 --- 
a/csharp/test/Apache.Arrow.IntegrationTest/Apache.Arrow.IntegrationTest.csproj +++ b/csharp/test/Apache.Arrow.IntegrationTest/Apache.Arrow.IntegrationTest.csproj @@ -4,7 +4,7 @@ Exe true - net7.0 + net8.0 diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index 92f6e2d662f38..06fb44e0a0e88 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -4,13 +4,15 @@ true true + + true - net7.0;net472;net462 + net8.0;net472;net462 - net7.0 + net8.0 diff --git a/dev/archery/archery/integration/tester_csharp.py b/dev/archery/archery/integration/tester_csharp.py index 9aab5b0b28ef9..02ced0701deaf 100644 --- a/dev/archery/archery/integration/tester_csharp.py +++ b/dev/archery/archery/integration/tester_csharp.py @@ -28,7 +28,7 @@ _EXE_PATH = os.path.join(_ARTIFACTS_PATH, "Apache.Arrow.IntegrationTest", - "Debug/net7.0/Apache.Arrow.IntegrationTest", + "Debug/net8.0/Apache.Arrow.IntegrationTest", ) _clr_loaded = False @@ -44,10 +44,10 @@ def _load_clr(): import clr clr.AddReference( f"{_ARTIFACTS_PATH}/Apache.Arrow.IntegrationTest/" - f"Debug/net7.0/Apache.Arrow.IntegrationTest.dll") + f"Debug/net8.0/Apache.Arrow.IntegrationTest.dll") clr.AddReference( f"{_ARTIFACTS_PATH}/Apache.Arrow.Tests/" - f"Debug/net7.0/Apache.Arrow.Tests.dll") + f"Debug/net8.0/Apache.Arrow.Tests.dll") from Apache.Arrow.IntegrationTest import CDataInterface CDataInterface.Initialize() diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 77ea01e3eec04..95be4800f7ffd 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -347,7 +347,7 @@ install_csharp() { show_info "Ensuring that C# is installed..." - if dotnet --version | grep 7\.0 > /dev/null 2>&1; then + if dotnet --version | grep 8\.0 > /dev/null 2>&1; then local csharp_bin=$(dirname $(which dotnet)) show_info "Found C# at $(which csharp) (.NET $(dotnet --version))" else @@ -355,7 +355,7 @@ install_csharp() { show_info "dotnet found but it is the wrong version and will be ignored." 
fi local csharp_bin=${ARROW_TMPDIR}/csharp/bin - local dotnet_version=7.0.102 + local dotnet_version=8.0.204 local dotnet_platform= case "$(uname)" in Linux) diff --git a/dev/tasks/verify-rc/github.macos.yml b/dev/tasks/verify-rc/github.macos.yml index 8963954dba49d..4bc3fff71b64a 100644 --- a/dev/tasks/verify-rc/github.macos.yml +++ b/dev/tasks/verify-rc/github.macos.yml @@ -51,9 +51,9 @@ jobs: distribution: 'temurin' java-version: '11' - - uses: actions/setup-dotnet@v2 + - uses: actions/setup-dotnet@v4 with: - dotnet-version: '7.0.x' + dotnet-version: '8.0.x' - uses: actions/setup-node@v4 with: diff --git a/docs/source/developers/release_verification.rst b/docs/source/developers/release_verification.rst index ec474a5729b64..8c301b44a3c42 100644 --- a/docs/source/developers/release_verification.rst +++ b/docs/source/developers/release_verification.rst @@ -152,7 +152,7 @@ As an example: * NVIDIA CUDA Build cuda_11.5.r11.5/compiler.30672275_0 * openjdk version "17.0.9" 2023-10-17 * ruby 3.0.2p107 (2021-07-07 revision 0db68f0233) [x86_64-linux-gnu] - * dotnet 7.0.115 + * dotnet 8.0.204 * Ubuntu 22.04 LTS If there were some issues during verification please report them on the From 0f56339ee803858b597418aefdabb993def19f48 Mon Sep 17 00:00:00 2001 From: Ian Cook Date: Fri, 26 Apr 2024 11:33:34 -0400 Subject: [PATCH 025/106] GH-41386: [Docs] Document Arrow Go Avro read support (#41393) Updates the Implementation Status docs page to reflect that the Go implementation can read Avro files. For the Rust implementation, I inferred from [this PR](https://github.com/apache/arrow-rs/issues/4886) and [this comment](https://github.com/apache/arrow-rs/issues/5562#issuecomment-2024885348) that we should hold off on indicating that the Rust implementation can read Avro files. * GitHub Issue: #41386 --- docs/source/status.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/status.rst b/docs/source/status.rst index f4672d6b4bc55..266381175608a 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -353,7 +353,7 @@ Third-Party Data Formats | Format | C++ | Java | Go | JS | C# | Rust | Julia | Swift | | | | | | | | | | | +=============================+=========+=========+=======+====+=======+=======+=======+=======+ -| Avro | | R | | | | | | | +| Avro | | R | R | | | | | | +-----------------------------+---------+---------+-------+----+-------+-------+-------+-------+ | CSV | R/W | R (2) | R/W | | | R/W | R/W | | +-----------------------------+---------+---------+-------+----+-------+-------+-------+-------+ From 15986ae5ffef2f274c04cf0d5eec2155fe6523a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Fri, 26 Apr 2024 19:13:39 +0200 Subject: [PATCH 026/106] GH-41390: [CI] Use setup-python GitHub action on csharp macOS job (#41392) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change The new macos-latest GH runner has changed and jobs are currently failing. ### What changes are included in this PR? Install python with setup-python. ### Are these changes tested? Will be on CI ### Are there any user-facing changes? 
No * GitHub Issue: #41390 Authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- .github/workflows/csharp.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/csharp.yml b/.github/workflows/csharp.yml index 12f946fe66fc9..7ae3606a44812 100644 --- a/.github/workflows/csharp.yml +++ b/.github/workflows/csharp.yml @@ -94,7 +94,7 @@ jobs: run: ci/scripts/csharp_test.sh $(pwd) macos: - name: AMD64 macOS 11 C# ${{ matrix.dotnet }} + name: ARM64 macOS 14 C# ${{ matrix.dotnet }} runs-on: macos-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 15 @@ -107,6 +107,10 @@ jobs: uses: actions/setup-dotnet@v4 with: dotnet-version: ${{ matrix.dotnet }} + - name: Setup Python + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + with: + python-version: 3.12 - name: Checkout Arrow uses: actions/checkout@v4 with: From ea314a3f8d9d4446836aa999b66659c07421f7a4 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Fri, 26 Apr 2024 18:32:32 -0400 Subject: [PATCH 027/106] GH-41358: [R] Support join "na_matches" argument (#41372) ### Rationale for this change Noticed in #41350, I made #41358 to implement this in C++, but it turns out the option was there, just buried a bit. ### What changes are included in this PR? `na_matches` is mapped through to the `key_cmp` field in `HashJoinNodeOptions`. Acero supports having a different value for this for each of the join keys, but dplyr does not, so I kept it constant for all key columns to match the dplyr behavior. ### Are these changes tested? Yes ### Are there any user-facing changes? Yes * GitHub Issue: #41358 --- r/NEWS.md | 1 + r/R/arrow-package.R | 12 +++++------ r/R/arrowExports.R | 4 ++-- r/R/dplyr-funcs-doc.R | 12 +++++------ r/R/dplyr-join.R | 8 +++++--- r/R/query-engine.R | 8 +++++--- r/man/acero.Rd | 12 +++++------ r/src/arrowExports.cpp | 11 +++++----- r/src/compute-exec.cpp | 18 ++++++++++++----- r/tests/testthat/test-dplyr-join.R | 32 ++++++++++++++++++++++++++++++ 10 files changed, 82 insertions(+), 36 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index 4ed9f28a28436..05f934dac68f3 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -21,6 +21,7 @@ * R functions that users write that use functions that Arrow supports in dataset queries now can be used in queries too. Previously, only functions that used arithmetic operators worked. For example, `time_hours <- function(mins) mins / 60` worked, but `time_hours_rounded <- function(mins) round(mins / 60)` did not; now both work. These are automatic translations rather than true user-defined functions (UDFs); for UDFs, see `register_scalar_function()`. (#41223) * `summarize()` supports more complex expressions, and correctly handles cases where column names are reused in expressions. +* The `na_matches` argument to the `dplyr::*_join()` functions is now supported. This argument controls whether `NA` values are considered equal when joining. 
(#41358) # arrow 16.0.0 diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index f6977e626276b..7087a40c4903a 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -66,12 +66,12 @@ supported_dplyr_methods <- list( compute = NULL, collapse = NULL, distinct = "`.keep_all = TRUE` not supported", - left_join = "the `copy` and `na_matches` arguments are ignored", - right_join = "the `copy` and `na_matches` arguments are ignored", - inner_join = "the `copy` and `na_matches` arguments are ignored", - full_join = "the `copy` and `na_matches` arguments are ignored", - semi_join = "the `copy` and `na_matches` arguments are ignored", - anti_join = "the `copy` and `na_matches` arguments are ignored", + left_join = "the `copy` argument is ignored", + right_join = "the `copy` argument is ignored", + inner_join = "the `copy` argument is ignored", + full_join = "the `copy` argument is ignored", + semi_join = "the `copy` argument is ignored", + anti_join = "the `copy` argument is ignored", count = NULL, tally = NULL, rename_with = NULL, diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 752d3a266b26a..62e2182ffcd52 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -484,8 +484,8 @@ ExecNode_Aggregate <- function(input, options, key_names) { .Call(`_arrow_ExecNode_Aggregate`, input, options, key_names) } -ExecNode_Join <- function(input, join_type, right_data, left_keys, right_keys, left_output, right_output, output_suffix_for_left, output_suffix_for_right) { - .Call(`_arrow_ExecNode_Join`, input, join_type, right_data, left_keys, right_keys, left_output, right_output, output_suffix_for_left, output_suffix_for_right) +ExecNode_Join <- function(input, join_type, right_data, left_keys, right_keys, left_output, right_output, output_suffix_for_left, output_suffix_for_right, na_matches) { + .Call(`_arrow_ExecNode_Join`, input, join_type, right_data, left_keys, right_keys, left_output, right_output, output_suffix_for_left, output_suffix_for_right, na_matches) } ExecNode_Union <- function(input, right_data) { diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R index 2042f800142b7..fda77bca83fc2 100644 --- a/r/R/dplyr-funcs-doc.R +++ b/r/R/dplyr-funcs-doc.R @@ -36,7 +36,7 @@ #' which returns an `arrow` [Table], or `collect()`, which pulls the resulting #' Table into an R `tibble`. 
#' -#' * [`anti_join()`][dplyr::anti_join()]: the `copy` and `na_matches` arguments are ignored +#' * [`anti_join()`][dplyr::anti_join()]: the `copy` argument is ignored #' * [`arrange()`][dplyr::arrange()] #' * [`collapse()`][dplyr::collapse()] #' * [`collect()`][dplyr::collect()] @@ -45,22 +45,22 @@ #' * [`distinct()`][dplyr::distinct()]: `.keep_all = TRUE` not supported #' * [`explain()`][dplyr::explain()] #' * [`filter()`][dplyr::filter()] -#' * [`full_join()`][dplyr::full_join()]: the `copy` and `na_matches` arguments are ignored +#' * [`full_join()`][dplyr::full_join()]: the `copy` argument is ignored #' * [`glimpse()`][dplyr::glimpse()] #' * [`group_by()`][dplyr::group_by()] #' * [`group_by_drop_default()`][dplyr::group_by_drop_default()] #' * [`group_vars()`][dplyr::group_vars()] #' * [`groups()`][dplyr::groups()] -#' * [`inner_join()`][dplyr::inner_join()]: the `copy` and `na_matches` arguments are ignored -#' * [`left_join()`][dplyr::left_join()]: the `copy` and `na_matches` arguments are ignored +#' * [`inner_join()`][dplyr::inner_join()]: the `copy` argument is ignored +#' * [`left_join()`][dplyr::left_join()]: the `copy` argument is ignored #' * [`mutate()`][dplyr::mutate()]: window functions (e.g. things that require aggregation within groups) not currently supported #' * [`pull()`][dplyr::pull()]: the `name` argument is not supported; returns an R vector by default but this behavior is deprecated and will return an Arrow [ChunkedArray] in a future release. Provide `as_vector = TRUE/FALSE` to control this behavior, or set `options(arrow.pull_as_vector)` globally. #' * [`relocate()`][dplyr::relocate()] #' * [`rename()`][dplyr::rename()] #' * [`rename_with()`][dplyr::rename_with()] -#' * [`right_join()`][dplyr::right_join()]: the `copy` and `na_matches` arguments are ignored +#' * [`right_join()`][dplyr::right_join()]: the `copy` argument is ignored #' * [`select()`][dplyr::select()] -#' * [`semi_join()`][dplyr::semi_join()]: the `copy` and `na_matches` arguments are ignored +#' * [`semi_join()`][dplyr::semi_join()]: the `copy` argument is ignored #' * [`show_query()`][dplyr::show_query()] #' * [`slice_head()`][dplyr::slice_head()]: slicing within groups not supported; Arrow datasets do not have row order, so head is non-deterministic; `prop` only supported on queries where `nrow()` is knowable without evaluating #' * [`slice_max()`][dplyr::slice_max()]: slicing within groups not supported; `with_ties = TRUE` (dplyr default) is not supported; `prop` only supported on queries where `nrow()` is knowable without evaluating diff --git a/r/R/dplyr-join.R b/r/R/dplyr-join.R index 39237f574bd28..e76e041a54277 100644 --- a/r/R/dplyr-join.R +++ b/r/R/dplyr-join.R @@ -25,14 +25,15 @@ do_join <- function(x, suffix = c(".x", ".y"), ..., keep = FALSE, - na_matches, + na_matches = c("na", "never"), join_type) { # TODO: handle `copy` arg: ignore? - # TODO: handle `na_matches` arg x <- as_adq(x) y <- as_adq(y) by <- handle_join_by(by, x, y) + na_matches <- match.arg(na_matches) + # For outer joins, we need to output the join keys on both sides so we # can coalesce them afterwards. 
left_output <- if (!keep && join_type == "RIGHT_OUTER") { @@ -54,7 +55,8 @@ do_join <- function(x, left_output = left_output, right_output = right_output, suffix = suffix, - keep = keep + keep = keep, + na_matches = na_matches == "na" ) collapse.arrow_dplyr_query(x) } diff --git a/r/R/query-engine.R b/r/R/query-engine.R index 0f8a84f9b867e..fb48d790fd36e 100644 --- a/r/R/query-engine.R +++ b/r/R/query-engine.R @@ -148,7 +148,8 @@ ExecPlan <- R6Class("ExecPlan", left_output = .data$join$left_output, right_output = .data$join$right_output, left_suffix = .data$join$suffix[[1]], - right_suffix = .data$join$suffix[[2]] + right_suffix = .data$join$suffix[[2]], + na_matches = .data$join$na_matches ) } @@ -307,7 +308,7 @@ ExecNode <- R6Class("ExecNode", out$extras$source_schema$metadata[["r"]]$attributes <- NULL out }, - Join = function(type, right_node, by, left_output, right_output, left_suffix, right_suffix) { + Join = function(type, right_node, by, left_output, right_output, left_suffix, right_suffix, na_matches = TRUE) { self$preserve_extras( ExecNode_Join( self, @@ -318,7 +319,8 @@ ExecNode <- R6Class("ExecNode", left_output = left_output, right_output = right_output, output_suffix_for_left = left_suffix, - output_suffix_for_right = right_suffix + output_suffix_for_right = right_suffix, + na_matches = na_matches ) ) }, diff --git a/r/man/acero.Rd b/r/man/acero.Rd index 365795d9fc65c..ca51ef56334eb 100644 --- a/r/man/acero.Rd +++ b/r/man/acero.Rd @@ -23,7 +23,7 @@ the query on the data. To run the query, call either \code{compute()}, which returns an \code{arrow} \link{Table}, or \code{collect()}, which pulls the resulting Table into an R \code{tibble}. \itemize{ -\item \code{\link[dplyr:filter-joins]{anti_join()}}: the \code{copy} and \code{na_matches} arguments are ignored +\item \code{\link[dplyr:filter-joins]{anti_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:arrange]{arrange()}} \item \code{\link[dplyr:compute]{collapse()}} \item \code{\link[dplyr:compute]{collect()}} @@ -32,22 +32,22 @@ Table into an R \code{tibble}. \item \code{\link[dplyr:distinct]{distinct()}}: \code{.keep_all = TRUE} not supported \item \code{\link[dplyr:explain]{explain()}} \item \code{\link[dplyr:filter]{filter()}} -\item \code{\link[dplyr:mutate-joins]{full_join()}}: the \code{copy} and \code{na_matches} arguments are ignored +\item \code{\link[dplyr:mutate-joins]{full_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:glimpse]{glimpse()}} \item \code{\link[dplyr:group_by]{group_by()}} \item \code{\link[dplyr:group_by_drop_default]{group_by_drop_default()}} \item \code{\link[dplyr:group_data]{group_vars()}} \item \code{\link[dplyr:group_data]{groups()}} -\item \code{\link[dplyr:mutate-joins]{inner_join()}}: the \code{copy} and \code{na_matches} arguments are ignored -\item \code{\link[dplyr:mutate-joins]{left_join()}}: the \code{copy} and \code{na_matches} arguments are ignored +\item \code{\link[dplyr:mutate-joins]{inner_join()}}: the \code{copy} argument is ignored +\item \code{\link[dplyr:mutate-joins]{left_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:mutate]{mutate()}}: window functions (e.g. things that require aggregation within groups) not currently supported \item \code{\link[dplyr:pull]{pull()}}: the \code{name} argument is not supported; returns an R vector by default but this behavior is deprecated and will return an Arrow \link{ChunkedArray} in a future release. 
Provide \code{as_vector = TRUE/FALSE} to control this behavior, or set \code{options(arrow.pull_as_vector)} globally. \item \code{\link[dplyr:relocate]{relocate()}} \item \code{\link[dplyr:rename]{rename()}} \item \code{\link[dplyr:rename]{rename_with()}} -\item \code{\link[dplyr:mutate-joins]{right_join()}}: the \code{copy} and \code{na_matches} arguments are ignored +\item \code{\link[dplyr:mutate-joins]{right_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:select]{select()}} -\item \code{\link[dplyr:filter-joins]{semi_join()}}: the \code{copy} and \code{na_matches} arguments are ignored +\item \code{\link[dplyr:filter-joins]{semi_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:explain]{show_query()}} \item \code{\link[dplyr:slice]{slice_head()}}: slicing within groups not supported; Arrow datasets do not have row order, so head is non-deterministic; \code{prop} only supported on queries where \code{nrow()} is knowable without evaluating \item \code{\link[dplyr:slice]{slice_max()}}: slicing within groups not supported; \code{with_ties = TRUE} (dplyr default) is not supported; \code{prop} only supported on queries where \code{nrow()} is knowable without evaluating diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index a4c4b614d6d75..d5aec50219e0b 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -1163,8 +1163,8 @@ extern "C" SEXP _arrow_ExecNode_Aggregate(SEXP input_sexp, SEXP options_sexp, SE // compute-exec.cpp #if defined(ARROW_R_WITH_ACERO) -std::shared_ptr ExecNode_Join(const std::shared_ptr& input, acero::JoinType join_type, const std::shared_ptr& right_data, std::vector left_keys, std::vector right_keys, std::vector left_output, std::vector right_output, std::string output_suffix_for_left, std::string output_suffix_for_right); -extern "C" SEXP _arrow_ExecNode_Join(SEXP input_sexp, SEXP join_type_sexp, SEXP right_data_sexp, SEXP left_keys_sexp, SEXP right_keys_sexp, SEXP left_output_sexp, SEXP right_output_sexp, SEXP output_suffix_for_left_sexp, SEXP output_suffix_for_right_sexp){ +std::shared_ptr ExecNode_Join(const std::shared_ptr& input, acero::JoinType join_type, const std::shared_ptr& right_data, std::vector left_keys, std::vector right_keys, std::vector left_output, std::vector right_output, std::string output_suffix_for_left, std::string output_suffix_for_right, bool na_matches); +extern "C" SEXP _arrow_ExecNode_Join(SEXP input_sexp, SEXP join_type_sexp, SEXP right_data_sexp, SEXP left_keys_sexp, SEXP right_keys_sexp, SEXP left_output_sexp, SEXP right_output_sexp, SEXP output_suffix_for_left_sexp, SEXP output_suffix_for_right_sexp, SEXP na_matches_sexp){ BEGIN_CPP11 arrow::r::Input&>::type input(input_sexp); arrow::r::Input::type join_type(join_type_sexp); @@ -1175,11 +1175,12 @@ BEGIN_CPP11 arrow::r::Input>::type right_output(right_output_sexp); arrow::r::Input::type output_suffix_for_left(output_suffix_for_left_sexp); arrow::r::Input::type output_suffix_for_right(output_suffix_for_right_sexp); - return cpp11::as_sexp(ExecNode_Join(input, join_type, right_data, left_keys, right_keys, left_output, right_output, output_suffix_for_left, output_suffix_for_right)); + arrow::r::Input::type na_matches(na_matches_sexp); + return cpp11::as_sexp(ExecNode_Join(input, join_type, right_data, left_keys, right_keys, left_output, right_output, output_suffix_for_left, output_suffix_for_right, na_matches)); END_CPP11 } #else -extern "C" SEXP _arrow_ExecNode_Join(SEXP input_sexp, SEXP join_type_sexp, SEXP 
right_data_sexp, SEXP left_keys_sexp, SEXP right_keys_sexp, SEXP left_output_sexp, SEXP right_output_sexp, SEXP output_suffix_for_left_sexp, SEXP output_suffix_for_right_sexp){ +extern "C" SEXP _arrow_ExecNode_Join(SEXP input_sexp, SEXP join_type_sexp, SEXP right_data_sexp, SEXP left_keys_sexp, SEXP right_keys_sexp, SEXP left_output_sexp, SEXP right_output_sexp, SEXP output_suffix_for_left_sexp, SEXP output_suffix_for_right_sexp, SEXP na_matches_sexp){ Rf_error("Cannot call ExecNode_Join(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); } #endif @@ -5790,7 +5791,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ExecNode_Filter", (DL_FUNC) &_arrow_ExecNode_Filter, 2}, { "_arrow_ExecNode_Project", (DL_FUNC) &_arrow_ExecNode_Project, 3}, { "_arrow_ExecNode_Aggregate", (DL_FUNC) &_arrow_ExecNode_Aggregate, 3}, - { "_arrow_ExecNode_Join", (DL_FUNC) &_arrow_ExecNode_Join, 9}, + { "_arrow_ExecNode_Join", (DL_FUNC) &_arrow_ExecNode_Join, 10}, { "_arrow_ExecNode_Union", (DL_FUNC) &_arrow_ExecNode_Union, 2}, { "_arrow_ExecNode_Fetch", (DL_FUNC) &_arrow_ExecNode_Fetch, 3}, { "_arrow_ExecNode_OrderBy", (DL_FUNC) &_arrow_ExecNode_OrderBy, 2}, diff --git a/r/src/compute-exec.cpp b/r/src/compute-exec.cpp index e0b3c62c47d7f..d0c50315c299f 100644 --- a/r/src/compute-exec.cpp +++ b/r/src/compute-exec.cpp @@ -411,10 +411,17 @@ std::shared_ptr ExecNode_Join( const std::shared_ptr& right_data, std::vector left_keys, std::vector right_keys, std::vector left_output, std::vector right_output, - std::string output_suffix_for_left, std::string output_suffix_for_right) { + std::string output_suffix_for_left, std::string output_suffix_for_right, + bool na_matches) { std::vector left_refs, right_refs, left_out_refs, right_out_refs; + std::vector key_cmps; for (auto&& name : left_keys) { left_refs.emplace_back(std::move(name)); + // Populate key_cmps in this loop, one for each key + // Note that Acero supports having different values for each key, but dplyr + // only supports one value for all keys, so we're only going to support that + // for now. + key_cmps.emplace_back(na_matches ? 
acero::JoinKeyCmp::IS : acero::JoinKeyCmp::EQ); } for (auto&& name : right_keys) { right_refs.emplace_back(std::move(name)); @@ -434,10 +441,11 @@ std::shared_ptr ExecNode_Join( return MakeExecNodeOrStop( "hashjoin", input->plan(), {input.get(), right_data.get()}, - acero::HashJoinNodeOptions{ - join_type, std::move(left_refs), std::move(right_refs), - std::move(left_out_refs), std::move(right_out_refs), compute::literal(true), - std::move(output_suffix_for_left), std::move(output_suffix_for_right)}); + acero::HashJoinNodeOptions{join_type, std::move(left_refs), std::move(right_refs), + std::move(left_out_refs), std::move(right_out_refs), + std::move(key_cmps), compute::literal(true), + std::move(output_suffix_for_left), + std::move(output_suffix_for_right)}); } // [[acero::export]] diff --git a/r/tests/testthat/test-dplyr-join.R b/r/tests/testthat/test-dplyr-join.R index e3e1e98cfca15..9a1c8b7b80fea 100644 --- a/r/tests/testthat/test-dplyr-join.R +++ b/r/tests/testthat/test-dplyr-join.R @@ -441,3 +441,35 @@ test_that("full joins handle keep", { small_dataset_df ) }) + +left <- tibble::tibble( + x = c(1, NA, 3), +) +right <- tibble::tibble( + x = c(1, NA, 3), + y = c("a", "b", "c") +) +na_matches_na <- right +na_matches_never <- tibble::tibble( + x = c(1, NA, 3), + y = c("a", NA, "c") +) +test_that("na_matches argument to join: na (default)", { + expect_equal( + arrow_table(left) %>% + left_join(right, by = "x", na_matches = "na") %>% + arrange(x) %>% + collect(), + na_matches_na %>% arrange(x) + ) +}) + +test_that("na_matches argument to join: never", { + expect_equal( + arrow_table(left) %>% + left_join(right, by = "x", na_matches = "never") %>% + arrange(x) %>% + collect(), + na_matches_never %>% arrange(x) + ) +}) From 858054bec01a9b9d820107ca0ec45865385a5e89 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Sat, 27 Apr 2024 11:37:29 -0500 Subject: [PATCH 028/106] MINOR: [Docs] Correct a small typo in archery docs (#41412) ### Rationale for this change The name is archery not archer ### What changes are included in this PR? `s/archer /archery/` ### Are these changes tested? No, docs only ### Are there any user-facing changes? Yes, to the docs Authored-by: Jonathan Keane Signed-off-by: AlenkaF --- dev/archery/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/archery/README.md b/dev/archery/README.md index 9991e7402d832..0b9d5c743d122 100644 --- a/dev/archery/README.md +++ b/dev/archery/README.md @@ -23,7 +23,7 @@ Archery is documented on the Arrow website: * [Daily development using Archery](https://arrow.apache.org/docs/developers/continuous_integration/archery.html) * [Using Archery and Crossbow](https://arrow.apache.org/docs/developers/continuous_integration/crossbow.html) -* [Using Archer and Docker](https://arrow.apache.org/docs/developers/continuous_integration/docker.html) +* [Using Archery and Docker](https://arrow.apache.org/docs/developers/continuous_integration/docker.html) # Installing Archery From 5ee70ee1bc9c8c9f6ede63ea5e6c52e04446fe08 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sat, 27 Apr 2024 15:42:30 -0400 Subject: [PATCH 029/106] GH-41367: [C++] Replace [[maybe_unused]] with Arrow macro (#41359) ### Rationale for this change This is a follow up to https://github.com/apache/arrow/pull/41111 which was created as an issue in #41367 ### What changes are included in this PR? Replace [[maybe_unused]] with Arrow macro ### Are these changes tested? Builds cleanly ### Are there any user-facing changes? 
No * GitHub Issue: #41367 Authored-by: Will Ayd Signed-off-by: Sutou Kouhei --- cpp/apidoc/Doxyfile | 11 ++++++----- cpp/cmake_modules/SetupCxxFlags.cmake | 1 + cpp/src/arrow/array/builder_base.h | 6 +++--- cpp/src/arrow/array/builder_nested.h | 2 +- cpp/src/arrow/array/builder_primitive.h | 4 ++-- cpp/src/arrow/device.h | 6 +++--- cpp/src/arrow/type.h | 6 +++--- cpp/src/arrow/util/macros.h | 4 ++++ 8 files changed, 23 insertions(+), 17 deletions(-) diff --git a/cpp/apidoc/Doxyfile b/cpp/apidoc/Doxyfile index e19c933cd454f..5be93032c00d9 100644 --- a/cpp/apidoc/Doxyfile +++ b/cpp/apidoc/Doxyfile @@ -2168,16 +2168,17 @@ INCLUDE_FILE_PATTERNS = PREDEFINED = __attribute__(x)= \ __declspec(x)= \ - PARQUET_EXPORT= \ - GANDIVA_EXPORT= \ - ARROW_EXPORT= \ ARROW_ACERO_EXPORT= \ + ARROW_ARG_UNUSED(x)=x \ + ARROW_DEPRECATED(x)= \ ARROW_DS_EXPORT= \ ARROW_ENGINE_EXPORT= \ + ARROW_EXPORT= \ + ARROW_EXTERN_TEMPLATE= \ ARROW_FLIGHT_EXPORT= \ ARROW_FLIGHT_SQL_EXPORT= \ - ARROW_EXTERN_TEMPLATE= \ - ARROW_DEPRECATED(x)= + GANDIVA_EXPORT= \ + PARQUET_EXPORT= # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index d56609c123968..ea357b47794ce 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -314,6 +314,7 @@ if("${BUILD_WARNING_LEVEL}" STREQUAL "CHECKIN") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wall") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wextra") set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wdocumentation") + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -DARROW_WARN_DOCUMENTATION") if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") # size_t is 32 bit in Emscripten wasm32 - ignore conversion errors set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-shorten-64-to-32") diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index e6c0b2d2387f2..3a85318735f80 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -175,9 +175,9 @@ class ARROW_EXPORT ArrayBuilder { /// \brief Append a range of values from an array. /// /// The given array must be the same type as the builder. - virtual Status AppendArraySlice([[maybe_unused]] const ArraySpan& array, - [[maybe_unused]] int64_t offset, - [[maybe_unused]] int64_t length) { + virtual Status AppendArraySlice(const ArraySpan& ARROW_ARG_UNUSED(array), + int64_t ARROW_ARG_UNUSED(offset), + int64_t ARROW_ARG_UNUSED(length)) { return Status::NotImplemented("AppendArraySlice for builder for ", *type()); } diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 9f7b0fcdbce07..6089cf04d421f 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -248,7 +248,7 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { /// \brief Append dimensions for a single list slot. /// /// ListViewBuilder overrides this to also append the size. 
- virtual void UnsafeAppendDimensions(int64_t offset, [[maybe_unused]] int64_t size) { + virtual void UnsafeAppendDimensions(int64_t offset, int64_t ARROW_ARG_UNUSED(size)) { offsets_builder_.UnsafeAppend(static_cast(offset)); } diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index db8d2cbaabb61..de7af1b46bdee 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -32,10 +32,10 @@ namespace arrow { class ARROW_EXPORT NullBuilder : public ArrayBuilder { public: explicit NullBuilder(MemoryPool* pool = default_memory_pool(), - [[maybe_unused]] int64_t alignment = kDefaultBufferAlignment) + int64_t ARROW_ARG_UNUSED(alignment) = kDefaultBufferAlignment) : ArrayBuilder(pool) {} - explicit NullBuilder([[maybe_unused]] const std::shared_ptr& type, + explicit NullBuilder(const std::shared_ptr& ARROW_ARG_UNUSED(type), MemoryPool* pool = default_memory_pool(), int64_t alignment = kDefaultBufferAlignment) : NullBuilder(pool, alignment) {} diff --git a/cpp/src/arrow/device.h b/cpp/src/arrow/device.h index 3003bad7c459c..a591167ef9a45 100644 --- a/cpp/src/arrow/device.h +++ b/cpp/src/arrow/device.h @@ -140,7 +140,7 @@ class ARROW_EXPORT Device : public std::enable_shared_from_this, /// derived from Device::Stream to allow for stream ordered events /// and memory allocations. virtual Result> MakeStream( - [[maybe_unused]] unsigned int flags) { + unsigned int ARROW_ARG_UNUSED(flags)) { return NULLPTR; } @@ -151,8 +151,8 @@ class ARROW_EXPORT Device : public std::enable_shared_from_this, /// a no-op function can be passed to indicate ownership is maintained /// externally virtual Result> WrapStream( - [[maybe_unused]] void* device_stream, - [[maybe_unused]] Stream::release_fn_t release_fn) { + void* ARROW_ARG_UNUSED(device_stream), + Stream::release_fn_t ARROW_ARG_UNUSED(release_fn)) { return NULLPTR; } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 58c9df04ec5c3..bb05e6efdb987 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1723,7 +1723,7 @@ class ARROW_EXPORT MonthIntervalType : public IntervalType { MonthIntervalType() : IntervalType(type_id) {} - std::string ToString([[maybe_unused]] bool show_metadata = false) const override { + std::string ToString(bool ARROW_ARG_UNUSED(show_metadata) = false) const override { return name(); } std::string name() const override { return "month_interval"; } @@ -1761,7 +1761,7 @@ class ARROW_EXPORT DayTimeIntervalType : public IntervalType { int bit_width() const override { return static_cast(sizeof(c_type) * CHAR_BIT); } - std::string ToString([[maybe_unused]] bool show_metadata = false) const override { + std::string ToString(bool ARROW_ARG_UNUSED(show_metadata) = false) const override { return name(); } std::string name() const override { return "day_time_interval"; } @@ -1803,7 +1803,7 @@ class ARROW_EXPORT MonthDayNanoIntervalType : public IntervalType { int bit_width() const override { return static_cast(sizeof(c_type) * CHAR_BIT); } - std::string ToString([[maybe_unused]] bool show_metadata = false) const override { + std::string ToString(bool ARROW_ARG_UNUSED(show_metadata) = false) const override { return name(); } std::string name() const override { return "month_day_nano_interval"; } diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h index d80828869b33c..d0c05a7908256 100644 --- a/cpp/src/arrow/util/macros.h +++ b/cpp/src/arrow/util/macros.h @@ -67,7 +67,11 @@ // [5] J. Doerfert et al. 2019. 
"Performance Exploration Through Optimistic Static // Program Annotations". https://github.com/jdoerfert/PETOSPA/blob/master/ISC19.pdf #define ARROW_UNUSED(x) (void)(x) +#ifdef ARROW_WARN_DOCUMENTATION +#define ARROW_ARG_UNUSED(x) x +#else #define ARROW_ARG_UNUSED(x) +#endif #if defined(__GNUC__) // GCC and compatible compilers (clang, Intel ICC) #define ARROW_NORETURN __attribute__((noreturn)) #define ARROW_NOINLINE __attribute__((noinline)) From 4b2cf22f5aea16f4ad0db447624b6e713b43d62c Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 28 Apr 2024 05:31:55 +0900 Subject: [PATCH 030/106] GH-41405: [Release][Docs][GLib] Use Sphinx based GLib front page (#41406) ### Rationale for this change We should use the GLib front page generated by Sphinx. ### What changes are included in this PR? Stop reverting the GLib front page change in release script. ### Are these changes tested? No. ### Are there any user-facing changes? Yes. * GitHub Issue: #41405 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- dev/release/post-08-docs.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/dev/release/post-08-docs.sh b/dev/release/post-08-docs.sh index 1e457c95c033c..c59f9b96857a6 100755 --- a/dev/release/post-08-docs.sh +++ b/dev/release/post-08-docs.sh @@ -79,7 +79,6 @@ curl \ https://apache.jfrog.io/artifactory/arrow/docs/${version}/docs.tar.gz tar xvf docs.tar.gz rm -f docs.tar.gz -git checkout docs/c_glib/index.html if [ "$is_major_release" = "yes" ] ; then previous_series=${previous_version%.*} mv docs_temp docs/${previous_series} From 9090e679da91e0544171c2da9f2b9ce8ba23d389 Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Sun, 28 Apr 2024 17:00:52 -0700 Subject: [PATCH 031/106] GH-41307: [Java] Use org.apache:apache parent pom version 31 (#41309) ### Rationale for this change Use/update Maven modules to `org.apache:parent:31` and clean up Maven modules to remove unnecessary configuration or outdated workarounds ### What changes are included in this PR? * Add `org.apache:parent:31` to `org.apache.arrow:arrow-bom` and `org.apache.arrow.maven.plugins:arrow-maven-plugins` to make them conformant with ASF standards * Update `org.apache.arrow:arrow-java-root` parent to `org.apache:parent:31` * Use `version.*` and other properties to override plugin versions defined by `org.apache:parent` * Move standalone plugin versions under pluginManagement at the top level * Cleanup redundant plugin version or configuration declaration * Update `maven-dependency-plugin` to 3.6.1 and add the required overrides when necessary * Update `maven-shade-plugin` to 3.5.1 (via `org.apache:parent`) * Remove enforcer check for java and maven version (handled by `org.apache:parent`) * Remove unnecessary `mvnrepository` link comments * Remove `m2e.version` property check in profiles (only needed for errorprone plugin configuration which is incompatible with M2E) * Cleanup `argLine` overrides for surefire/failsafe plugins * Remove unnecessary `../pom.xml` `` directives * Remove source/target/encoding configuration properties for `maven-compiler-plugin`, `maven-javadoc-plugin` and `maven-resources-plugin` as it is handled by `org.apache:parent` and plugin themselves * Remove unnecessary copy of codegen templates in `arrow-vector` module * Remove unnecessary junit jupiter engine dependencies for surefire/failsafe plugins. ### Are these changes tested? No net new code. tested via CI jobs ### Are there any user-facing changes? 
None * GitHub Issue: #41307 Authored-by: Laurent Goujon Signed-off-by: David Li --- java/adapter/avro/pom.xml | 9 - java/adapter/jdbc/pom.xml | 7 - java/adapter/orc/pom.xml | 17 ++ java/bom/pom.xml | 21 +- java/c/pom.xml | 1 - java/dataset/pom.xml | 12 +- java/flight/flight-core/pom.xml | 27 +-- java/flight/flight-integration-tests/pom.xml | 2 - java/flight/flight-sql-jdbc-core/pom.xml | 10 - java/flight/flight-sql-jdbc-driver/pom.xml | 1 - java/flight/flight-sql/pom.xml | 5 - java/format/pom.xml | 2 - java/gandiva/pom.xml | 19 +- .../module-info-compiler-maven-plugin/pom.xml | 28 +-- java/maven/pom.xml | 75 +++---- java/memory/memory-core/pom.xml | 22 +- java/performance/pom.xml | 49 ----- java/pom.xml | 207 +++++++----------- java/tools/pom.xml | 22 +- java/vector/pom.xml | 91 +------- 20 files changed, 177 insertions(+), 450 deletions(-) diff --git a/java/adapter/avro/pom.xml b/java/adapter/avro/pom.xml index 9ddc150253874..645e8c4ff2e60 100644 --- a/java/adapter/avro/pom.xml +++ b/java/adapter/avro/pom.xml @@ -25,36 +25,27 @@ http://maven.apache.org - - org.apache.arrow arrow-memory-core - - org.apache.arrow arrow-memory-netty runtime - - org.apache.arrow arrow-vector - org.immutables value - org.apache.avro avro ${dep.avro.version} - diff --git a/java/adapter/jdbc/pom.xml b/java/adapter/jdbc/pom.xml index 5f72729bb76e7..33360c64b13b6 100644 --- a/java/adapter/jdbc/pom.xml +++ b/java/adapter/jdbc/pom.xml @@ -26,20 +26,17 @@ - org.apache.arrow arrow-memory-core - org.apache.arrow arrow-memory-netty runtime - org.apache.arrow arrow-vector @@ -51,7 +48,6 @@ value - com.h2database h2 @@ -94,9 +90,6 @@ jdk11+ [11,] - - !m2e.version - diff --git a/java/adapter/orc/pom.xml b/java/adapter/orc/pom.xml index f6aadca6de4d3..ec6f73a3e9e40 100644 --- a/java/adapter/orc/pom.xml +++ b/java/adapter/orc/pom.xml @@ -134,5 +134,22 @@ + + + org.apache.maven.plugins + maven-dependency-plugin + + + analyze + + + + org.apache.arrow:arrow-format + + + + + + diff --git a/java/bom/pom.xml b/java/bom/pom.xml index 12b9950ad80fc..0af50c638055e 100644 --- a/java/bom/pom.xml +++ b/java/bom/pom.xml @@ -15,7 +15,7 @@ org.apache apache - 18 + 31 org.apache.arrow @@ -27,6 +27,19 @@ + + 1.8 + 1.8 + 3.11.0 + 3.2.5 + 0.16.1 + 3.7.1 + 3.12.1 + 3.6.1 + 3.2.0 + 3.2.2 + 3.6.3 + 3.5.0 @@ -138,11 +151,9 @@ ${project.version} - - @@ -156,12 +167,10 @@ org.apache.maven.plugins maven-project-info-reports-plugin - 3.5.0 org.apache.maven.plugins maven-site-plugin - 3.12.1 com.diffplug.spotless @@ -188,12 +197,10 @@ org.apache.maven.plugins maven-project-info-reports-plugin - 3.5.0 org.apache.maven.plugins maven-site-plugin - 3.12.1 diff --git a/java/c/pom.xml b/java/c/pom.xml index 1095e99bbdd3f..43a62a8303bfe 100644 --- a/java/c/pom.xml +++ b/java/c/pom.xml @@ -83,5 +83,4 @@ - diff --git a/java/dataset/pom.xml b/java/dataset/pom.xml index 62ea79f55ccd4..2121119af398e 100644 --- a/java/dataset/pom.xml +++ b/java/dataset/pom.xml @@ -23,7 +23,6 @@ Java implementation of Arrow Dataset API/Framework ../../../cpp/release-build/ - 2.5.0 1.13.1 1.11.3 @@ -195,21 +194,14 @@ jdk11+ [11,] - - !m2e.version - org.apache.maven.plugins maven-surefire-plugin - - false - - ${project.basedir}/../../testing/data - - --add-reads=org.apache.arrow.dataset=com.fasterxml.jackson.databind --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED + + --add-reads=org.apache.arrow.dataset=com.fasterxml.jackson.databind --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED diff --git a/java/flight/flight-core/pom.xml 
b/java/flight/flight-core/pom.xml index 897af0b9e1129..163b4c24031b1 100644 --- a/java/flight/flight-core/pom.xml +++ b/java/flight/flight-core/pom.xml @@ -15,7 +15,6 @@ org.apache.arrow arrow-flight 17.0.0-SNAPSHOT - ../pom.xml flight-core @@ -151,13 +150,6 @@ org.apache.maven.plugins maven-shade-plugin - - 3.2.4 shade-main @@ -244,7 +236,6 @@ org.apache.maven.plugins maven-dependency-plugin - 3.3.0 analyze @@ -264,7 +255,6 @@ org.codehaus.mojo build-helper-maven-plugin - 1.9.1 add-generated-sources-to-classpath @@ -282,7 +272,6 @@ maven-assembly-plugin - 3.7.1 jar-with-dependencies @@ -299,13 +288,6 @@ - - - kr.motd.maven - os-maven-plugin - 1.7.1 - - @@ -313,18 +295,14 @@ jdk11+ [11,] - - !m2e.version - org.apache.maven.plugins maven-surefire-plugin - - --add-opens=org.apache.arrow.flight.core/org.apache.arrow.flight.perf.impl=protobuf.java --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED - false + + --add-opens=org.apache.arrow.flight.core/org.apache.arrow.flight.perf.impl=protobuf.java --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED ${project.basedir}/../../../testing/data @@ -334,5 +312,4 @@ - diff --git a/java/flight/flight-integration-tests/pom.xml b/java/flight/flight-integration-tests/pom.xml index 74016d81e91e5..cd2c28ba8959f 100644 --- a/java/flight/flight-integration-tests/pom.xml +++ b/java/flight/flight-integration-tests/pom.xml @@ -15,7 +15,6 @@ org.apache.arrow arrow-flight 17.0.0-SNAPSHOT - ../pom.xml flight-integration-tests @@ -63,7 +62,6 @@ maven-assembly-plugin - 3.7.1 jar-with-dependencies diff --git a/java/flight/flight-sql-jdbc-core/pom.xml b/java/flight/flight-sql-jdbc-core/pom.xml index 36da335b37b9a..2e0de90fcf8bc 100644 --- a/java/flight/flight-sql-jdbc-core/pom.xml +++ b/java/flight/flight-sql-jdbc-core/pom.xml @@ -15,7 +15,6 @@ org.apache.arrow arrow-flight 17.0.0-SNAPSHOT - ../pom.xml flight-sql-jdbc-core @@ -47,20 +46,17 @@ - org.apache.arrow arrow-memory-core - org.apache.arrow arrow-memory-netty runtime - org.apache.arrow arrow-vector @@ -136,11 +132,6 @@ - - - src/main/resources - - maven-surefire-plugin @@ -154,7 +145,6 @@ org.codehaus.mojo properties-maven-plugin - 1.2.1 write-project-properties-to-file diff --git a/java/flight/flight-sql-jdbc-driver/pom.xml b/java/flight/flight-sql-jdbc-driver/pom.xml index b3afbe1defdba..4456270e7b347 100644 --- a/java/flight/flight-sql-jdbc-driver/pom.xml +++ b/java/flight/flight-sql-jdbc-driver/pom.xml @@ -15,7 +15,6 @@ org.apache.arrow arrow-flight 17.0.0-SNAPSHOT - ../pom.xml flight-sql-jdbc-driver diff --git a/java/flight/flight-sql/pom.xml b/java/flight/flight-sql/pom.xml index 7ed217db68b07..cf466ab1720cf 100644 --- a/java/flight/flight-sql/pom.xml +++ b/java/flight/flight-sql/pom.xml @@ -15,7 +15,6 @@ org.apache.arrow arrow-flight 17.0.0-SNAPSHOT - ../pom.xml flight-sql @@ -119,9 +118,6 @@ jdk11+ [11,] - - !m2e.version - @@ -136,5 +132,4 @@ - diff --git a/java/format/pom.xml b/java/format/pom.xml index e9eded79de660..4483047e20960 100644 --- a/java/format/pom.xml +++ b/java/format/pom.xml @@ -31,7 +31,6 @@ - @@ -42,6 +41,5 @@ - diff --git a/java/gandiva/pom.xml b/java/gandiva/pom.xml index 00acb89f1d7cf..c5703c62dfe23 100644 --- a/java/gandiva/pom.xml +++ b/java/gandiva/pom.xml @@ -22,13 +22,12 @@ jar Arrow Gandiva Java wrappers around the native Gandiva SQL expression compiler. 
+ - 1.8 - 1.8 - 3.25.1 true ../../../cpp/release-build + org.apache.arrow @@ -51,7 +50,6 @@ com.google.protobuf protobuf-java - ${protobuf.version} com.google.guava @@ -62,6 +60,7 @@ slf4j-api + @@ -88,14 +87,6 @@ - - - - kr.motd.maven - os-maven-plugin - 1.7.1 - - @@ -105,7 +96,6 @@ org.apache.maven.plugins maven-source-plugin - 2.2.1 attach-sources @@ -118,7 +108,6 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.6.3 attach-javadocs @@ -131,7 +120,6 @@ org.apache.maven.plugins maven-gpg-plugin - 3.2.2 sign-artifacts @@ -146,5 +134,4 @@ - diff --git a/java/maven/module-info-compiler-maven-plugin/pom.xml b/java/maven/module-info-compiler-maven-plugin/pom.xml index 6589020d6ecb5..5909b6b3484fc 100644 --- a/java/maven/module-info-compiler-maven-plugin/pom.xml +++ b/java/maven/module-info-compiler-maven-plugin/pom.xml @@ -64,39 +64,14 @@ org.apache.maven.plugin-tools maven-plugin-annotations - 3.11.0 + ${maven.plugin.tools.version} provided - - - maven-clean-plugin - 3.3.2 - - - maven-plugin-plugin - 3.12.0 - - - maven-jar-plugin - 3.3.0 - - - maven-install-plugin - 3.1.1 - - - maven-deploy-plugin - 3.1.1 - - - maven-invoker-plugin - 3.1.0 - com.gradle gradle-enterprise-maven-extension @@ -118,7 +93,6 @@ org.apache.maven.plugins maven-plugin-plugin - 3.12.0 true diff --git a/java/maven/pom.xml b/java/maven/pom.xml index f290ded2e2913..4314192eda73b 100644 --- a/java/maven/pom.xml +++ b/java/maven/pom.xml @@ -15,6 +15,13 @@ Note: Do not inherit from the Arrow parent POM as plugins can be referenced during the parent POM, introducing circular dependencies. --> + + org.apache + apache + 31 + + + org.apache.arrow.maven.plugins arrow-maven-plugins 17.0.0-SNAPSHOT @@ -27,25 +34,38 @@ true + + 1.8 + 1.8 + 3.12.0 + 3.2.5 + 0.16.1 + 3.7.1 + 3.12.1 + 3.6.1 + 3.2.0 + 3.2.2 + 3.6.3 + 3.5.0 - org.apache.maven.plugins - maven-project-info-reports-plugin - 3.5.0 + com.diffplug.spotless + spotless-maven-plugin + 2.30.0 - org.apache.maven.plugins - maven-site-plugin - 3.12.1 + pl.project13.maven + git-commit-id-plugin + 4.0.5 - com.diffplug.spotless - spotless-maven-plugin - 2.30.0 + org.cyclonedx + cyclonedx-maven-plugin + 2.7.11 @@ -119,11 +139,6 @@ **/logback.xml - true - - true - true - org.apache.arrow ${username} @@ -143,43 +158,17 @@ - - org.apache.maven.plugins - maven-resources-plugin - - UTF-8 - - org.apache.maven.plugins maven-compiler-plugin - UTF-8 - 1.8 - 1.8 2048m - false true maven-enforcer-plugin - - validate_java_and_maven_version - - enforce - - verify - false - - - - [3.3.0,4) - - - - avoid_bad_dependencies @@ -205,8 +194,6 @@ pl.project13.maven git-commit-id-plugin - 4.0.5 - dd.MM.yyyy '@' HH:mm:ss z false @@ -248,7 +235,6 @@ org.apache.maven.plugins maven-checkstyle-plugin - 3.1.0 ../dev/checkstyle/checkstyle.xml ../dev/checkstyle/checkstyle.license @@ -288,7 +274,6 @@ org.cyclonedx cyclonedx-maven-plugin - 2.7.11 @@ -353,12 +338,10 @@ org.apache.maven.plugins maven-project-info-reports-plugin - 3.5.0 org.apache.maven.plugins maven-site-plugin - 3.12.1 diff --git a/java/memory/memory-core/pom.xml b/java/memory/memory-core/pom.xml index ca5bc603bd4dc..8e39ae43d116f 100644 --- a/java/memory/memory-core/pom.xml +++ b/java/memory/memory-core/pom.xml @@ -61,9 +61,6 @@ jdk11+ [11,] - - !m2e.version - @@ -92,7 +89,6 @@ org.apache.maven.plugins maven-surefire-plugin - opens-tests @@ -101,12 +97,9 @@ test - - -Dfoo=bar - - - **/TestArrowBuf.java - + + + **/TestOpens.java @@ -129,9 +122,6 @@ org.apache.maven.plugins maven-compiler-plugin - 8 - 8 - UTF-8 -Xmaxerrs @@ -150,12 +140,6 @@ 
${checker.framework.version} - - - org.immutables.value.internal.$processor$.$Processor - - org.checkerframework.checker.nullness.NullnessChecker - diff --git a/java/performance/pom.xml b/java/performance/pom.xml index 1e99a29265724..c819e6393d78f 100644 --- a/java/performance/pom.xml +++ b/java/performance/pom.xml @@ -22,9 +22,7 @@ JMH Performance benchmarks for other Arrow libraries. - UTF-8 1.37 - 1.8 benchmarks true .* @@ -96,52 +94,7 @@ - - - - maven-clean-plugin - 3.3.2 - - - maven-deploy-plugin - 3.1.1 - - - maven-install-plugin - 3.1.1 - - - maven-jar-plugin - 3.3.0 - - - maven-javadoc-plugin - 3.6.3 - - - maven-resources-plugin - 3.3.1 - - - maven-source-plugin - 2.2.1 - - - maven-surefire-plugin - 3.2.5 - - - - - org.apache.maven.plugins - maven-compiler-plugin - - ${javac.target} - ${javac.target} - ${javac.target} - - org.apache.maven.plugins maven-shade-plugin @@ -175,7 +128,6 @@ org.codehaus.mojo exec-maven-plugin - 3.2.0 ${skip.perf.benchmarks} test @@ -212,5 +164,4 @@ - diff --git a/java/pom.xml b/java/pom.xml index 16564ae828b0f..39fd1e00b64e9 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -15,7 +15,7 @@ org.apache apache - 18 + 31 org.apache.arrow @@ -85,7 +85,7 @@ 33.0.0-jre 4.1.108.Final 1.63.0 - 3.23.1 + 3.25.1 2.17.0 3.4.0 23.5.26 @@ -95,10 +95,28 @@ true 9+181-r4173-1 2.24.0 - 3.12.1 5.11.0 5.2.0 3.42.0 + none + -Xdoclint:none + + 1.8 + 1.8 + 3.11.0 + 3.2.5 + 0.16.1 + 3.7.1 + 3.12.1 + 3.6.1 + 3.2.0 + + 3.2.2 + 3.6.3 + 3.5.0 @@ -268,40 +286,16 @@ 8.3.0 test - - - - - - org.apache.maven.plugins - maven-dependency-plugin - - 3.1.2 - - - org.apache.rat - apache-rat-plugin - 0.16.1 - - - org.apache.maven.plugins - maven-resources-plugin - 3.3.1 - org.apache.maven.plugins maven-compiler-plugin - ${maven-compiler-plugin.version} + true **/module-info.java **/module-info.java false @@ -314,18 +308,8 @@ - - maven-enforcer-plugin - 3.4.1 - - - org.apache.maven.plugins - maven-shade-plugin - 3.5.1 - maven-surefire-plugin - 3.2.5 true true @@ -340,22 +324,9 @@ 1048576 - - - org.junit.jupiter - junit-jupiter-engine - ${dep.junit.jupiter.version} - - - org.apache.maven.surefire - surefire-junit-platform - 3.2.5 - - maven-failsafe-plugin - 3.2.5 ${project.build.directory} @@ -444,6 +415,22 @@ + + + org.apache.drill.tools + drill-fmpp-maven-plugin + [1.0,) + + generate + + + + + false + true + + + @@ -451,9 +438,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.6.3 - 8 **/module-info.java @@ -464,16 +449,6 @@ module-info-compiler-maven-plugin ${project.version} - - org.apache.maven.plugins - maven-project-info-reports-plugin - 3.5.0 - - - org.apache.maven.plugins - maven-site-plugin - 3.12.1 - com.gradle gradle-enterprise-maven-extension @@ -521,6 +496,36 @@ spotless-maven-plugin 2.30.0 + + org.codehaus.mojo + build-helper-maven-plugin + 1.9.1 + + + org.codehaus.mojo + properties-maven-plugin + 1.2.1 + + + org.codehaus.mojo + exec-maven-plugin + 3.2.0 + + + pl.project13.maven + git-commit-id-plugin + 4.0.5 + + + org.cyclonedx + cyclonedx-maven-plugin + 2.7.11 + + + org.apache.drill.tools + drill-fmpp-maven-plugin + 1.21.1 + @@ -594,11 +599,6 @@ **/logback.xml - true - - true - true - org.apache.arrow ${username} @@ -618,42 +618,17 @@ - - org.apache.maven.plugins - maven-resources-plugin - - UTF-8 - - org.apache.maven.plugins maven-compiler-plugin - 1.8 - 1.8 2048m - false true maven-enforcer-plugin - - validate_java_and_maven_version - - enforce - - verify - false - - - - [3.3.0,4) - - - - avoid_bad_dependencies @@ -679,8 +654,6 @@ pl.project13.maven git-commit-id-plugin - 
4.0.5 - dd.MM.yyyy '@' HH:mm:ss z false @@ -722,7 +695,6 @@ org.apache.maven.plugins maven-checkstyle-plugin - 3.1.0 **/module-info.java dev/checkstyle/checkstyle.xml @@ -786,7 +758,6 @@ org.cyclonedx cyclonedx-maven-plugin - 2.7.11 @@ -817,12 +788,10 @@ org.apache.maven.plugins maven-project-info-reports-plugin - 3.5.0 org.apache.maven.plugins maven-site-plugin - 3.12.1 com.diffplug.spotless @@ -857,7 +826,6 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.6.3 **/module-info.java @@ -885,28 +853,15 @@ org.apache.maven.plugins maven-project-info-reports-plugin - 3.5.0 org.apache.maven.plugins maven-site-plugin - 3.12.1 - - java-nodoclint - - [1.8,) - - - none - -Xdoclint:none - - - arrow-c-data @@ -954,7 +909,6 @@ org.apache.maven.plugins maven-compiler-plugin - true -XDcompilePolicy=simple -Xplugin:ErrorProne @@ -987,9 +941,6 @@ org.apache.maven.plugins maven-compiler-plugin - 8 - 8 - UTF-8 -XDcompilePolicy=simple -Xplugin:ErrorProne -XepExcludedPaths:.*/(target/generated-sources)/.* @@ -1013,6 +964,16 @@ + + + + + jdk11+ + + [11,] + + + org.apache.maven.plugins maven-surefire-plugin @@ -1020,6 +981,13 @@ --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED + + org.apache.maven.plugins + maven-failsafe-plugin + + --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED + + @@ -1060,7 +1028,6 @@ org.jacoco jacoco-maven-plugin - 0.8.11 @@ -1106,7 +1073,6 @@ org.codehaus.mojo exec-maven-plugin - 3.2.0 cdata-cmake @@ -1163,7 +1129,6 @@ org.codehaus.mojo exec-maven-plugin - 3.2.0 jni-cpp-cmake @@ -1270,7 +1235,6 @@ org.codehaus.mojo exec-maven-plugin - 3.2.0 jni-cpp-cmake @@ -1360,5 +1324,4 @@ - diff --git a/java/tools/pom.xml b/java/tools/pom.xml index b1507cd301f31..58b790c9f027f 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -54,6 +54,11 @@ 1.3.14 test + com.fasterxml.jackson.core jackson-core @@ -85,7 +90,6 @@ maven-assembly-plugin - 3.7.1 jar-with-dependencies @@ -101,7 +105,21 @@ + + org.apache.maven.plugins + maven-dependency-plugin + + + analyze + verify + + + com.fasterxml.jackson.core:* + + + + + - diff --git a/java/vector/pom.xml b/java/vector/pom.xml index 07af93a499907..ca932ae6f26f9 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -76,64 +76,7 @@ - - - - true - - - false - - apache - apache - https://repo.maven.apache.org/maven2/ - - - - - - - codegen - - ${basedir}/src/main/codegen - - - - - - - org.eclipse.m2e - lifecycle-mapping - 1.0.0 - - - - - - org.apache.drill.tools - drill-fmpp-maven-plugin - [1.0,) - - generate - - - - - false - true - - - - - - - - - - org.apache.maven.plugins @@ -163,33 +106,10 @@ - - maven-resources-plugin - - - - copy-fmpp-resources - - copy-resources - - initialize - - ${project.build.directory}/codegen - - - src/main/codegen - false - - - - - - org.apache.drill.tools drill-fmpp-maven-plugin - 1.21.1 generate-fmpp @@ -200,7 +120,7 @@ src/main/codegen/config.fmpp ${project.build.directory}/generated-sources - ${project.build.directory}/codegen/templates + src/main/codegen/templates @@ -208,13 +128,6 @@ org.apache.maven.plugins maven-shade-plugin - - 3.2.4 @@ -243,7 +156,6 @@ - @@ -276,5 +188,4 @@ - From a8c4f8620117f31425de45a67a275fbf8044ba1f Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Mon, 29 Apr 2024 05:55:17 +0530 Subject: [PATCH 032/106] GH-40339: [Java] StringView Initial Implementation (#40340) ### Rationale for this change StringView implementation in Java. This PR only includes the core implementation of StringView ### What changes are included in this PR? 
- [X] Adding ViewVarBinaryVector - [X] Adding ViewVarCharVector - [X] Adding corresponding test cases in the given scope - [X] Including required implementation extensions with not supported warnings - [X] Interface for Holders ### Non Goals of this PR - [ ] https://github.com/apache/arrow/issues/40937 - [ ] https://github.com/apache/arrow/issues/40936 - [ ] https://github.com/apache/arrow/issues/40932 - [ ] https://github.com/apache/arrow/issues/40943 - [ ] https://github.com/apache/arrow/issues/40944 - [ ] https://github.com/apache/arrow/issues/40942 - [ ] https://github.com/apache/arrow/issues/40945 - [ ] https://github.com/apache/arrow/issues/40941 - [ ] https://github.com/apache/arrow/issues/40946 ### Are these changes tested? Yes. Existing test cases on `VarCharVector` and `VarBinaryVector` are verified with view implementations and additional test cases have also been added to check view functionality. And explitly tests have been added to evaluate the view functionality with `ViewVarCharVector` ### Are there any user-facing changes? Yes, this introduces a new API and some public methods have been included in an interface so that it can be extended to write custom functionality like done for views. * GitHub Issue: #40339 Lead-authored-by: Vibhatha Abeykoon Co-authored-by: vibhatha Co-authored-by: Vibhatha Lakmal Abeykoon Signed-off-by: David Li --- .../binder/ColumnBinderArrowTypeVisitor.java | 10 + .../arrow/c/BufferImportTypeVisitor.java | 10 + .../BinaryViewAvaticaParameterConverter.java | 42 + .../Utf8ViewAvaticaParameterConverter.java | 43 + .../jdbc/utils/AvaticaParameterBinder.java | 10 + .../arrow/driver/jdbc/utils/ConvertUtils.java | 12 + .../src/main/java/module-info.java | 1 + .../apache/arrow/memory/ReusableBuffer.java | 2 + .../src/main/codegen/data/ArrowTypes.tdd | 10 + .../main/codegen/data/ValueVectorTypes.tdd | 4 +- .../codegen/templates/HolderReaderImpl.java | 4 +- .../main/codegen/templates/UnionReader.java | 2 +- .../main/codegen/templates/ValueHolders.java | 10 +- .../apache/arrow/vector/AddOrGetResult.java | 2 +- .../vector/BaseLargeVariableWidthVector.java | 11 +- .../arrow/vector/BaseVariableWidthVector.java | 11 +- .../vector/BaseVariableWidthViewVector.java | 1451 ++++++++++++++++ .../arrow/vector/LargeVarBinaryVector.java | 1 + .../arrow/vector/LargeVarCharVector.java | 4 +- .../org/apache/arrow/vector/TypeLayout.java | 25 + .../apache/arrow/vector/VarBinaryVector.java | 1 + .../apache/arrow/vector/VarCharVector.java | 1 + .../vector/VariableWidthFieldVector.java | 143 ++ .../arrow/vector/ViewVarBinaryVector.java | 229 +++ .../arrow/vector/ViewVarCharVector.java | 291 ++++ .../vector/compare/RangeEqualsVisitor.java | 6 + .../vector/compare/TypeEqualsVisitor.java | 6 + .../arrow/vector/compare/VectorVisitor.java | 3 + .../org/apache/arrow/vector/types/Types.java | 44 + .../vector/util/DataSizeRoundingUtil.java | 4 + .../arrow/vector/util/ReusableByteArray.java | 7 + .../org/apache/arrow/vector/util/Text.java | 4 +- .../arrow/vector/util/VectorAppender.java | 6 + .../validate/ValidateVectorBufferVisitor.java | 6 + .../validate/ValidateVectorDataVisitor.java | 6 + .../validate/ValidateVectorTypeVisitor.java | 6 + .../validate/ValidateVectorVisitor.java | 6 + .../org/apache/arrow/vector/TestUtils.java | 10 + .../arrow/vector/TestVarCharViewVector.java | 1462 +++++++++++++++++ .../testing/ValueVectorDataPopulator.java | 12 + 40 files changed, 3898 insertions(+), 20 deletions(-) create mode 100644 
java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/BinaryViewAvaticaParameterConverter.java create mode 100644 java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/Utf8ViewAvaticaParameterConverter.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/VariableWidthFieldVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/ViewVarBinaryVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/ViewVarCharVector.java create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/TestVarCharViewVector.java diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/binder/ColumnBinderArrowTypeVisitor.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/binder/ColumnBinderArrowTypeVisitor.java index dc708724043d0..7d50676688e0f 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/binder/ColumnBinderArrowTypeVisitor.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/binder/ColumnBinderArrowTypeVisitor.java @@ -148,6 +148,11 @@ public ColumnBinder visit(ArrowType.Utf8 type) { new VarCharBinder<>(varChar, jdbcType); } + @Override + public ColumnBinder visit(ArrowType.Utf8View type) { + throw new UnsupportedOperationException("Column binder implemented for type " + type + " is not supported"); + } + @Override public ColumnBinder visit(ArrowType.LargeUtf8 type) { LargeVarCharVector varChar = (LargeVarCharVector) vector; @@ -162,6 +167,11 @@ public ColumnBinder visit(ArrowType.Binary type) { new VarBinaryBinder<>(varBinary, jdbcType); } + @Override + public ColumnBinder visit(ArrowType.BinaryView type) { + throw new UnsupportedOperationException("Column binder implemented for type " + type + " is not supported"); + } + @Override public ColumnBinder visit(ArrowType.LargeBinary type) { LargeVarBinaryVector varBinary = (LargeVarBinaryVector) vector; diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java index cd2a464f4fa17..bc6139cc84c54 100644 --- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java +++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java @@ -209,6 +209,11 @@ public List visit(ArrowType.Utf8 type) { } } + @Override + public List visit(ArrowType.Utf8View type) { + throw new UnsupportedOperationException("Importing buffers for view type: " + type + " not supported"); + } + @Override public List visit(ArrowType.LargeUtf8 type) { try (ArrowBuf offsets = importOffsets(type, LargeVarCharVector.OFFSET_WIDTH)) { @@ -237,6 +242,11 @@ public List visit(ArrowType.Binary type) { } } + @Override + public List visit(ArrowType.BinaryView type) { + throw new UnsupportedOperationException("Importing buffers for view type: " + type + " not supported"); + } + @Override public List visit(ArrowType.LargeBinary type) { try (ArrowBuf offsets = importOffsets(type, LargeVarBinaryVector.OFFSET_WIDTH)) { diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/BinaryViewAvaticaParameterConverter.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/BinaryViewAvaticaParameterConverter.java new file mode 100644 index 0000000000000..dfd4727014292 --- /dev/null +++ 
b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/BinaryViewAvaticaParameterConverter.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.driver.jdbc.converter.impl; + +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.calcite.avatica.AvaticaParameter; +import org.apache.calcite.avatica.remote.TypedValue; + +/** AvaticaParameterConverter for BinaryView Arrow types. */ +public class BinaryViewAvaticaParameterConverter extends BaseAvaticaParameterConverter { + + public BinaryViewAvaticaParameterConverter(ArrowType.BinaryView type) { + + } + + @Override + public boolean bindParameter(FieldVector vector, TypedValue typedValue, int index) { + throw new UnsupportedOperationException("Not implemented"); + } + + @Override + public AvaticaParameter createParameter(Field field) { + return createParameter(field, false); + } +} diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/Utf8ViewAvaticaParameterConverter.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/Utf8ViewAvaticaParameterConverter.java new file mode 100644 index 0000000000000..2c826aefb9c1c --- /dev/null +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/converter/impl/Utf8ViewAvaticaParameterConverter.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.driver.jdbc.converter.impl; + +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.calcite.avatica.AvaticaParameter; +import org.apache.calcite.avatica.remote.TypedValue; + +/** + * AvaticaParameterConverter for Utf8View Arrow types. 
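+ * Only {@code createParameter} is usable for now; {@code bindParameter} throws
+ * {@code UnsupportedOperationException} until Utf8View binding is implemented.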
+ */ +public class Utf8ViewAvaticaParameterConverter extends BaseAvaticaParameterConverter { + + public Utf8ViewAvaticaParameterConverter(ArrowType.Utf8View type) { + } + + @Override + public boolean bindParameter(FieldVector vector, TypedValue typedValue, int index) { + throw new UnsupportedOperationException("Utf8View not supported"); + } + + @Override + public AvaticaParameter createParameter(Field field) { + return createParameter(field, false); + } +} diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java index b2bd8e745ecca..fd9127c226910 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java @@ -190,6 +190,11 @@ public Boolean visit(ArrowType.Utf8 type) { return new Utf8AvaticaParameterConverter(type).bindParameter(vector, typedValue, index); } + @Override + public Boolean visit(ArrowType.Utf8View type) { + throw new UnsupportedOperationException("Utf8View is unsupported"); + } + @Override public Boolean visit(ArrowType.LargeUtf8 type) { return new LargeUtf8AvaticaParameterConverter(type).bindParameter(vector, typedValue, index); @@ -200,6 +205,11 @@ public Boolean visit(ArrowType.Binary type) { return new BinaryAvaticaParameterConverter(type).bindParameter(vector, typedValue, index); } + @Override + public Boolean visit(ArrowType.BinaryView type) { + throw new UnsupportedOperationException("BinaryView is unsupported"); + } + @Override public Boolean visit(ArrowType.LargeBinary type) { return new LargeBinaryAvaticaParameterConverter(type).bindParameter(vector, typedValue, index); diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/ConvertUtils.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/ConvertUtils.java index 843fe0cb89d9f..93b5faaef32c7 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/ConvertUtils.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/ConvertUtils.java @@ -23,6 +23,7 @@ import java.util.stream.Stream; import org.apache.arrow.driver.jdbc.converter.impl.BinaryAvaticaParameterConverter; +import org.apache.arrow.driver.jdbc.converter.impl.BinaryViewAvaticaParameterConverter; import org.apache.arrow.driver.jdbc.converter.impl.BoolAvaticaParameterConverter; import org.apache.arrow.driver.jdbc.converter.impl.DateAvaticaParameterConverter; import org.apache.arrow.driver.jdbc.converter.impl.DecimalAvaticaParameterConverter; @@ -43,6 +44,7 @@ import org.apache.arrow.driver.jdbc.converter.impl.TimestampAvaticaParameterConverter; import org.apache.arrow.driver.jdbc.converter.impl.UnionAvaticaParameterConverter; import org.apache.arrow.driver.jdbc.converter.impl.Utf8AvaticaParameterConverter; +import org.apache.arrow.driver.jdbc.converter.impl.Utf8ViewAvaticaParameterConverter; import org.apache.arrow.flight.sql.FlightSqlColumnMetadata; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; @@ -208,6 +210,11 @@ public AvaticaParameter visit(ArrowType.Utf8 type) { return new Utf8AvaticaParameterConverter(type).createParameter(field); } + @Override + public AvaticaParameter visit(ArrowType.Utf8View type) { 
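+ // Creating parameter metadata works; only binding a Utf8View value throws.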
+ return new Utf8ViewAvaticaParameterConverter(type).createParameter(field); + } + @Override public AvaticaParameter visit(ArrowType.LargeUtf8 type) { return new LargeUtf8AvaticaParameterConverter(type).createParameter(field); @@ -218,6 +225,11 @@ public AvaticaParameter visit(ArrowType.Binary type) { return new BinaryAvaticaParameterConverter(type).createParameter(field); } + @Override + public AvaticaParameter visit(ArrowType.BinaryView type) { + return new BinaryViewAvaticaParameterConverter(type).createParameter(field); + } + @Override public AvaticaParameter visit(ArrowType.LargeBinary type) { return new LargeBinaryAvaticaParameterConverter(type).createParameter(field); diff --git a/java/memory/memory-core/src/main/java/module-info.java b/java/memory/memory-core/src/main/java/module-info.java index 34ba34e80bc69..5024b7f45769c 100644 --- a/java/memory/memory-core/src/main/java/module-info.java +++ b/java/memory/memory-core/src/main/java/module-info.java @@ -25,4 +25,5 @@ requires jsr305; requires org.immutables.value; requires org.slf4j; + requires org.checkerframework.checker.qual; } diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReusableBuffer.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReusableBuffer.java index 3530b819aadff..9e37c286ad831 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReusableBuffer.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReusableBuffer.java @@ -44,4 +44,6 @@ public interface ReusableBuffer { * @param len the number of bytes of the new data */ void set(ArrowBuf srcBytes, long start, long len); + + void set(byte[] srcBytes, long start, long len); } diff --git a/java/vector/src/main/codegen/data/ArrowTypes.tdd b/java/vector/src/main/codegen/data/ArrowTypes.tdd index 3cf9a968791a4..9fe40f2319bfd 100644 --- a/java/vector/src/main/codegen/data/ArrowTypes.tdd +++ b/java/vector/src/main/codegen/data/ArrowTypes.tdd @@ -65,6 +65,11 @@ fields: [], complex: false }, + { + name: "Utf8View", + fields: [], + complex: false + }, { name: "LargeUtf8", fields: [], @@ -75,6 +80,11 @@ fields: [], complex: false }, + { + name: "BinaryView", + fields: [], + complex: false + }, { name: "LargeBinary", fields: [], diff --git a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd index 6c2a967712454..ad1f1b93bb3aa 100644 --- a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd +++ b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd @@ -189,7 +189,9 @@ fields: [{name: "start", type: "int"}, {name: "end", type: "int"}, {name: "buffer", type: "ArrowBuf"}], minor: [ { class: "VarBinary" , friendlyType: "byte[]" }, - { class: "VarChar" , friendlyType: "Text" } + { class: "VarChar" , friendlyType: "Text" }, + { class: "ViewVarBinary" , friendlyType: "byte[]" }, + { class: "ViewVarChar" , friendlyType: "Text" } ] }, { diff --git a/java/vector/src/main/codegen/templates/HolderReaderImpl.java b/java/vector/src/main/codegen/templates/HolderReaderImpl.java index 8394aaad41756..1151ea5d39dda 100644 --- a/java/vector/src/main/codegen/templates/HolderReaderImpl.java +++ b/java/vector/src/main/codegen/templates/HolderReaderImpl.java @@ -109,9 +109,9 @@ public void read(Nullable${name}Holder h) { byte[] value = new byte [length]; holder.buffer.getBytes(holder.start, value, 0, length); - <#if minor.class == "VarBinary" || minor.class == "LargeVarBinary"> + <#if minor.class == "VarBinary" || minor.class == "LargeVarBinary" || 
minor.class == "ViewVarBinary"> return value; - <#elseif minor.class == "VarChar" || minor.class == "LargeVarChar"> + <#elseif minor.class == "VarChar" || minor.class == "LargeVarChar" || minor.class == "ViewVarChar"> Text text = new Text(); text.set(value); return text; diff --git a/java/vector/src/main/codegen/templates/UnionReader.java b/java/vector/src/main/codegen/templates/UnionReader.java index 822d4822987fb..956bc91e9185c 100644 --- a/java/vector/src/main/codegen/templates/UnionReader.java +++ b/java/vector/src/main/codegen/templates/UnionReader.java @@ -39,7 +39,7 @@ @SuppressWarnings("unused") public class UnionReader extends AbstractFieldReader { - private static final int NUM_SUPPORTED_TYPES = 46; + private static final int NUM_SUPPORTED_TYPES = 48; private BaseReader[] readers = new BaseReader[NUM_SUPPORTED_TYPES]; public UnionVector data; diff --git a/java/vector/src/main/codegen/templates/ValueHolders.java b/java/vector/src/main/codegen/templates/ValueHolders.java index 973efd870a662..2a2bbe81b2e74 100644 --- a/java/vector/src/main/codegen/templates/ValueHolders.java +++ b/java/vector/src/main/codegen/templates/ValueHolders.java @@ -27,7 +27,6 @@ package org.apache.arrow.vector.holders; <#include "/@includes/vv_imports.ftl" /> - /** * Source code generated using FreeMarker template ${.template_name} */ @@ -40,11 +39,12 @@ public final class ${className} implements ValueHolder{ /** The last index (exclusive) into the Vector. **/ public int end; - + /** The Vector holding the actual values. **/ public ${minor.class}Vector vector; - + <#else> + public static final int WIDTH = ${type.width}; <#if mode.name == "Optional">public int isSet; @@ -70,10 +70,6 @@ public String toString(){ throw new UnsupportedOperationException(); } - - - - } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java b/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java index b41dbb245e8a2..5f9decbae4eac 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java @@ -20,7 +20,7 @@ import org.apache.arrow.util.Preconditions; /** - * Tuple class containing a vector and whether is was created. + * Tuple class containing a vector and whether it was created. * * @param The type of vector the result is for. */ diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java index 2ef6e4bd8b374..6365493051b92 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java @@ -42,8 +42,7 @@ /** * BaseLargeVariableWidthVector is a base class providing functionality for large strings/large bytes types. 
*/ -public abstract class BaseLargeVariableWidthVector extends BaseValueVector - implements VariableWidthVector, FieldVector, VectorDefinitionSetter { +public abstract class BaseLargeVariableWidthVector extends BaseValueVector implements VariableWidthFieldVector { private static final int DEFAULT_RECORD_BYTE_COUNT = 12; private static final int INITIAL_BYTE_COUNT = INITIAL_VALUE_ALLOCATION * DEFAULT_RECORD_BYTE_COUNT; private int lastValueCapacity; @@ -942,6 +941,7 @@ public void setValueCount(int valueCount) { * * @param index target index */ + @Override public void fillEmpties(int index) { handleSafe(index, emptyByteArray.length); fillHoles(index); @@ -955,6 +955,7 @@ public void fillEmpties(int index) { * * @param value desired index of last non-null element. */ + @Override public void setLastSet(int value) { lastSet = value; } @@ -964,6 +965,7 @@ public void setLastSet(int value) { * * @return index of the last non-null element */ + @Override public int getLastSet() { return lastSet; } @@ -1003,6 +1005,7 @@ public void setValueLengthSafe(int index, int length) { * @param index position of element to get * @return greater than 0 length for non-null element, 0 otherwise */ + @Override public int getValueLength(int index) { assert index >= 0; if (isSet(index) == 0) { @@ -1021,6 +1024,7 @@ public int getValueLength(int index) { * @param index position of the element to set * @param value array of bytes to write */ + @Override public void set(int index, byte[] value) { assert index >= 0; fillHoles(index); @@ -1037,6 +1041,7 @@ public void set(int index, byte[] value) { * @param index position of the element to set * @param value array of bytes to write */ + @Override public void setSafe(int index, byte[] value) { assert index >= 0; handleSafe(index, value.length); @@ -1055,6 +1060,7 @@ public void setSafe(int index, byte[] value) { * @param start start index in array of bytes * @param length length of data in array of bytes */ + @Override public void set(int index, byte[] value, int start, int length) { assert index >= 0; fillHoles(index); @@ -1091,6 +1097,7 @@ public void setSafe(int index, byte[] value, int start, int length) { * @param start start index in ByteBuffer * @param length length of data in ByteBuffer */ + @Override public void set(int index, ByteBuffer value, int start, int length) { assert index >= 0; fillHoles(index); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index d533629cdd44e..0412b9600b773 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -42,8 +42,7 @@ /** * BaseVariableWidthVector is a base class providing functionality for strings/bytes types. 
*/ -public abstract class BaseVariableWidthVector extends BaseValueVector - implements VariableWidthVector, FieldVector, VectorDefinitionSetter { +public abstract class BaseVariableWidthVector extends BaseValueVector implements VariableWidthFieldVector { private static final int DEFAULT_RECORD_BYTE_COUNT = 8; private static final int INITIAL_BYTE_COUNT = INITIAL_VALUE_ALLOCATION * DEFAULT_RECORD_BYTE_COUNT; private static final int MAX_BUFFER_SIZE = (int) Math.min(MAX_ALLOCATION_SIZE, Integer.MAX_VALUE); @@ -993,6 +992,7 @@ public void setValueCount(int valueCount) { * * @param index target index */ + @Override public void fillEmpties(int index) { handleSafe(index, emptyByteArray.length); fillHoles(index); @@ -1006,6 +1006,7 @@ public void fillEmpties(int index) { * * @param value desired index of last non-null element. */ + @Override public void setLastSet(int value) { lastSet = value; } @@ -1015,6 +1016,7 @@ public void setLastSet(int value) { * * @return index of the last non-null element */ + @Override public int getLastSet() { return lastSet; } @@ -1050,6 +1052,7 @@ public void setIndexDefined(int index) { * @param index position of the element to set * @param length length of the element */ + @Override public void setValueLengthSafe(int index, int length) { assert index >= 0; handleSafe(index, length); @@ -1065,6 +1068,7 @@ public void setValueLengthSafe(int index, int length) { * @param index position of element to get * @return greater than 0 length for non-null element, 0 otherwise */ + @Override public int getValueLength(int index) { assert index >= 0; if (isSet(index) == 0) { @@ -1083,6 +1087,7 @@ public int getValueLength(int index) { * @param index position of the element to set * @param value array of bytes to write */ + @Override public void set(int index, byte[] value) { assert index >= 0; fillHoles(index); @@ -1099,6 +1104,7 @@ public void set(int index, byte[] value) { * @param index position of the element to set * @param value array of bytes to write */ + @Override public void setSafe(int index, byte[] value) { assert index >= 0; handleSafe(index, value.length); @@ -1153,6 +1159,7 @@ public void setSafe(int index, byte[] value, int start, int length) { * @param start start index in ByteBuffer * @param length length of data in ByteBuffer */ + @Override public void set(int index, ByteBuffer value, int start, int length) { assert index >= 0; fillHoles(index); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java new file mode 100644 index 0000000000000..2f80775a48f58 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java @@ -0,0 +1,1451 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.memory.util.LargeMemoryUtil.capAtMaxInt; +import static org.apache.arrow.vector.util.DataSizeRoundingUtil.roundUpToMultipleOf16; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.memory.ReusableBuffer; +import org.apache.arrow.memory.util.ArrowBufPointer; +import org.apache.arrow.memory.util.ByteFunctionHelpers; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.TransferPair; + +/** + * BaseVariableWidthViewVector is a base class providing functionality for strings/bytes types in view format. + * + */ +public abstract class BaseVariableWidthViewVector extends BaseValueVector implements VariableWidthFieldVector { + // A single element of a view comprises 16 bytes + protected static final int ELEMENT_SIZE = 16; + public static final int INITIAL_VIEW_VALUE_ALLOCATION = 4096; + private static final int INITIAL_BYTE_COUNT = INITIAL_VIEW_VALUE_ALLOCATION * ELEMENT_SIZE; + private static final int MAX_BUFFER_SIZE = (int) Math.min(MAX_ALLOCATION_SIZE, Integer.MAX_VALUE); + private int lastValueCapacity; + private long lastValueAllocationSizeInBytes; + + /* + * Variable Width View Vector comprises the following format + * + * Short strings, length <= 12 + * | Bytes 0-3 | Bytes 4-15 | + * |------------|---------------------------------------| + * | length | data (padded with 0) | + * |------------|---------------------------------------| + * + * Long strings, length > 12 + * | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | + * |------------|------------|------------|-------------| + * | length | prefix | buf.index | offset | + * |------------|------------|------------|-------------| + * + * */ + // 12 byte unsigned int to track inline views + protected static final int INLINE_SIZE = 12; + // The first 4 bytes of view are allocated for length + protected static final int LENGTH_WIDTH = 4; + // The second 4 bytes of view are allocated for prefix width + protected static final int PREFIX_WIDTH = 4; + // The third 4 bytes of view are allocated for buffer index + protected static final int BUF_INDEX_WIDTH = 4; + protected static final byte[] EMPTY_BYTE_ARRAY = new byte[]{}; + protected ArrowBuf validityBuffer; + // The view buffer is used to store the variable width view elements + protected ArrowBuf viewBuffer; + // The external buffer which stores the long strings + protected List dataBuffers; + protected int initialDataBufferSize; + protected int valueCount; + protected int lastSet; + protected final Field field; + + + /** + * Constructs a new instance. 
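+ * Note that no element storage is allocated here; the validity and view buffers
+ * start empty and memory is acquired later, e.g. via {@link #allocateNew()}.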
+ * + * @param field The field materialized by this vector + * @param allocator The allocator to use for creating/resizing buffers + */ + public BaseVariableWidthViewVector(Field field, final BufferAllocator allocator) { + super(allocator); + this.field = field; + lastValueAllocationSizeInBytes = INITIAL_BYTE_COUNT; + lastValueCapacity = INITIAL_VIEW_VALUE_ALLOCATION; + valueCount = 0; + lastSet = -1; + validityBuffer = allocator.getEmpty(); + viewBuffer = allocator.getEmpty(); + dataBuffers = new ArrayList<>(); + } + + @Override + public String getName() { + return field.getName(); + } + + /* TODO: + * see if getNullCount() can be made faster -- O(1) + */ + + /* TODO: + * Once the entire hierarchy has been refactored, move common functions + * like getNullCount(), splitAndTransferValidityBuffer to top level + * base class BaseValueVector. + * + * Along with this, some class members (validityBuffer) can also be + * abstracted out to top level base class. + * + * Right now BaseValueVector is the top level base class for other + * vector types in ValueVector hierarchy (non-nullable) and those + * vectors have not yet been refactored/removed so moving things to + * the top class as of now is not a good idea. + */ + + /* TODO: + * Implement TransferPair functionality + * https://github.com/apache/arrow/issues/40932 + * + */ + + /** + * Get buffer that manages the validity (NULL or NON-NULL nature) of + * elements in the vector. Consider it as a buffer for internal bit vector + * data structure. + * + * @return buffer + */ + @Override + public ArrowBuf getValidityBuffer() { + return validityBuffer; + } + + /** + * Get the buffer that stores the data for elements in the vector. + * + * @return buffer + */ + @Override + public ArrowBuf getDataBuffer() { + return viewBuffer; + } + + /** + * BaseVariableWidthViewVector doesn't support offset buffer. + * + * @return throws UnsupportedOperationException + */ + @Override + public ArrowBuf getOffsetBuffer() { + throw new UnsupportedOperationException("Offset buffer is not supported in BaseVariableWidthViewVector"); + } + + /** + * BaseVariableWidthViewVector doesn't support offset buffer. + * + * @return throws UnsupportedOperationException + */ + @Override + public long getOffsetBufferAddress() { + throw new UnsupportedOperationException("Offset buffer is not supported in BaseVariableWidthViewVector"); + } + + /** + * Get the memory address of buffer that manages the validity + * (NULL or NON-NULL nature) of elements in the vector. + * + * @return starting address of the buffer + */ + @Override + public long getValidityBufferAddress() { + return validityBuffer.memoryAddress(); + } + + /** + * Get the memory address of buffer that stores the data for elements + * in the vector. + * + * @return starting address of the buffer + */ + @Override + public long getDataBufferAddress() { + return viewBuffer.memoryAddress(); + } + + /** + * Sets the desired value capacity for the vector. This function doesn't + * allocate any memory for the vector. + * + * @param valueCount desired number of elements in the vector + */ + @Override + public void setInitialCapacity(int valueCount) { + final long size = (long) valueCount * ELEMENT_SIZE; + checkDataBufferSize(size); + lastValueAllocationSizeInBytes = (int) size; + lastValueCapacity = valueCount; + } + + /** + * Sets the desired value capacity for the vector. This function doesn't + * allocate any memory for the vector. 
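+ * For example, {@code setInitialCapacity(4096, 8.0)} records a hint of 4096 * 16 bytes
+ * for the view buffer and about 4096 * 8 bytes for the variable-length data buffers.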
+ * + * @param valueCount desired number of elements in the vector + * @param density average number of bytes per variable width view element + */ + @Override + public void setInitialCapacity(int valueCount, double density) { + final long size = (long) valueCount * ELEMENT_SIZE; + initialDataBufferSize = (int) (valueCount * density); + checkDataBufferSize(size); + lastValueAllocationSizeInBytes = (int) size; + lastValueCapacity = valueCount; + } + + /** + * Get the density of this ListVector. + * @return density + */ + public double getDensity() { + if (valueCount == 0) { + return 0.0D; + } + final double totalListSize = getTotalValueLengthUpToIndex(valueCount); + return totalListSize / valueCount; + } + + /** + * Get the current capacity which does not exceed either validity buffer or value buffer. + * Note: Here the `getValueCapacity` has a relationship with the value buffer. + * + * @return number of elements that vector can hold. + */ + @Override + public int getValueCapacity() { + final int validityCapacity = getValidityBufferValueCapacity(); + final int valueBufferCapacity = Math.max(capAtMaxInt(viewBuffer.capacity() / ELEMENT_SIZE), 0); + return Math.min(valueBufferCapacity, validityCapacity); + } + + private int getValidityBufferValueCapacity() { + return capAtMaxInt(validityBuffer.capacity() * 8); + } + + /** + * zero out the vector and the data in associated buffers. + */ + public void zeroVector() { + initValidityBuffer(); + viewBuffer.setZero(0, viewBuffer.capacity()); + clearDataBuffers(); + } + + /* zero out the validity buffer */ + private void initValidityBuffer() { + validityBuffer.setZero(0, validityBuffer.capacity()); + } + + /** + * Reset the vector to initial state. + * Note that this method doesn't release any memory. + */ + @Override + public void reset() { + zeroVector(); + lastSet = -1; + valueCount = 0; + } + + /** + * Close the vector and release the associated buffers. + */ + @Override + public void close() { + clear(); + } + + /** + * Same as {@link #close()}. + */ + @Override + public void clear() { + validityBuffer = releaseBuffer(validityBuffer); + viewBuffer = releaseBuffer(viewBuffer); + clearDataBuffers(); + lastSet = -1; + valueCount = 0; + } + + /** + * Release the data buffers and clear the list. + */ + public void clearDataBuffers() { + for (ArrowBuf buffer : dataBuffers) { + releaseBuffer(buffer); + } + dataBuffers.clear(); + } + + /** + * Get the inner vectors. + * + * @deprecated This API will be removed as the current implementations no longer support inner vectors. + * + * @return the inner vectors for this field as defined by the TypeLayout + */ + @Deprecated + @Override + public List getFieldInnerVectors() { + throw new UnsupportedOperationException("There are no inner vectors. Use getFieldBuffers"); + } + + /** + * Initialize the children in schema for this Field. This operation is a + * NO-OP for scalar types since they don't have any children. + * @param children the schema + * @throws IllegalArgumentException if children is a non-empty list for scalar types. + */ + @Override + public void initializeChildrenFromFields(List children) { + if (!children.isEmpty()) { + throw new IllegalArgumentException("primitive type vector cannot have children"); + } + } + + /** + * Get the inner child vectors. 
+ * @return list of child vectors for complex types, empty list for scalar vector types + */ + @Override + public List getChildrenFromFields() { + return Collections.emptyList(); + } + + + /** + * Load the buffers of this vector with provided source buffers. + * The caller manages the source buffers and populates them before invoking + * this method. + * @param fieldNode the fieldNode indicating the value count + * @param ownBuffers the buffers for this Field (own buffers only, children not included) + */ + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + // TODO: https://github.com/apache/arrow/issues/40931 + throw new UnsupportedOperationException("loadFieldBuffers is not supported for BaseVariableWidthViewVector"); + } + + /** + * Get the buffers belonging to this vector. + * @return the inner buffers. + */ + @Override + public List getFieldBuffers() { + List result = new ArrayList<>(2 + dataBuffers.size()); + setReaderAndWriterIndex(); + result.add(validityBuffer); + result.add(viewBuffer); + // append data buffers + result.addAll(dataBuffers); + + return result; + } + + /** + * Set the reader and writer indexes for the inner buffers. + */ + private void setReaderAndWriterIndex() { + validityBuffer.readerIndex(0); + viewBuffer.readerIndex(0); + if (valueCount == 0) { + validityBuffer.writerIndex(0); + viewBuffer.writerIndex(0); + } else { + validityBuffer.writerIndex(getValidityBufferSizeFromCount(valueCount)); + viewBuffer.writerIndex(valueCount * ELEMENT_SIZE); + } + } + + /** + * Same as {@link #allocateNewSafe()}. + */ + @Override + public void allocateNew() { + allocateNew(lastValueAllocationSizeInBytes, lastValueCapacity); + } + + /** + * Allocate memory for the vector. We internally use a default value count + * of 4096 to allocate memory for at least these many elements in the + * vector. See {@link #allocateNew(long, int)} for allocating memory for specific + * number of elements in the vector. + * + * @return false if memory allocation fails, true otherwise. + */ + @Override + public boolean allocateNewSafe() { + try { + allocateNew(lastValueAllocationSizeInBytes, lastValueCapacity); + return true; + } catch (Exception e) { + return false; + } + } + + /** + * Allocate memory for the vector to support storing at least the provided number of + * elements in the vector. This method must be called prior to using the ValueVector. + * + * @param totalBytes desired total memory capacity + * @param valueCount the desired number of elements in the vector + * @throws OutOfMemoryException if memory allocation fails + */ + @Override + public void allocateNew(long totalBytes, int valueCount) { + assert totalBytes >= 0; + + checkDataBufferSize(totalBytes); + + /* we are doing a new allocation -- release the current buffers */ + clear(); + + try { + allocateBytes(totalBytes, valueCount); + } catch (Exception e) { + clear(); + throw e; + } + } + + @Override + public void allocateNew(int valueCount) { + allocateNew(lastValueAllocationSizeInBytes, valueCount); + } + + /* Check if the data buffer size is within bounds. */ + private void checkDataBufferSize(long size) { + if (size > MAX_BUFFER_SIZE || size < 0) { + throw new OversizedAllocationException("Memory required for vector " + + "is (" + size + "), which is overflow or more than max allowed (" + MAX_BUFFER_SIZE + "). 
" + + "You could consider using LargeVarCharVector/LargeVarBinaryVector for large strings/large bytes types"); + } + } + + /* allocate the inner buffers */ + private void allocateBytes(final long valueBufferSize, final int valueCount) { + /* allocate data buffer */ + viewBuffer = allocator.buffer(valueBufferSize); + viewBuffer.readerIndex(0); + + validityBuffer = allocator.buffer((valueCount + 7) / 8); + initValidityBuffer(); + + lastValueCapacity = getValueCapacity(); + lastValueAllocationSizeInBytes = capAtMaxInt(viewBuffer.capacity()); + } + + /** + * Resize the vector to increase the capacity. The internal behavior is to + * double the current value capacity. + */ + @Override + public void reAlloc() { + reallocViewBuffer(); + reallocViewDataBuffer(); + reallocValidityBuffer(); + } + + /** + * Reallocate the view buffer. View Buffer stores the views for + * VIEWVARCHAR or VIEWVARBINARY elements in the vector. The behavior is to double + * the size of buffer. + * @throws OversizedAllocationException if the desired new size is more than + * max allowed + * @throws OutOfMemoryException if the internal memory allocation fails + */ + public void reallocViewBuffer() { + long currentViewBufferCapacity = viewBuffer.capacity(); + + long newAllocationSize = currentViewBufferCapacity * 2; + if (newAllocationSize == 0) { + if (lastValueAllocationSizeInBytes > 0) { + newAllocationSize = lastValueAllocationSizeInBytes; + } else { + newAllocationSize = INITIAL_BYTE_COUNT * 2L; + } + } + + reallocViewBuffer(newAllocationSize); + } + + /** + * Reallocate the data buffer associated with view buffer. + */ + public void reallocViewDataBuffer() { + long currentDataBufferCapacity = 0; + if (!dataBuffers.isEmpty()) { + currentDataBufferCapacity = dataBuffers.get(dataBuffers.size() - 1).capacity(); + } + + long newAllocationSize = currentDataBufferCapacity * 2; + if (newAllocationSize == 0) { + if (lastValueAllocationSizeInBytes > 0) { + newAllocationSize = lastValueAllocationSizeInBytes; + } else { + newAllocationSize = INITIAL_BYTE_COUNT * 2L; + } + } + + reallocViewDataBuffer(newAllocationSize); + } + + /** + * Reallocate the view buffer to given size. View Buffer stores the views for + * VIEWVARCHAR or VIEWVARBINARY elements in the vector. The actual allocated size may be larger + * than the request one because it will round up the provided value to the nearest + * power of two. + * + * @param desiredAllocSize the desired new allocation size + * @throws OversizedAllocationException if the desired new size is more than + * max allowed + * @throws OutOfMemoryException if the internal memory allocation fails + */ + public void reallocViewBuffer(long desiredAllocSize) { + if (desiredAllocSize == 0) { + return; + } + long newAllocationSize = CommonUtil.nextPowerOfTwo(desiredAllocSize); + assert newAllocationSize >= 1; + + checkDataBufferSize(newAllocationSize); + // for each set operation, we have to allocate 16 bytes + // here we are adjusting the desired allocation-based allocation size + // to align with the 16bytes requirement. + newAllocationSize = roundUpToMultipleOf16(newAllocationSize); + + final ArrowBuf newBuf = allocator.buffer(newAllocationSize); + newBuf.setBytes(0, viewBuffer, 0, viewBuffer.capacity()); + + viewBuffer.getReferenceManager().release(); + viewBuffer = newBuf; + lastValueAllocationSizeInBytes = viewBuffer.capacity(); + } + + /** + * Reallocate the data buffer for views. 
+ * + * @param desiredAllocSize allocation size in bytes + */ + public void reallocViewDataBuffer(long desiredAllocSize) { + if (desiredAllocSize == 0) { + return; + } + + if (dataBuffers.isEmpty()) { + return; + } + + ArrowBuf currentBuf = dataBuffers.get(dataBuffers.size() - 1); + if (currentBuf.capacity() - currentBuf.writerIndex() >= desiredAllocSize) { + return; + } + + final long newAllocationSize = CommonUtil.nextPowerOfTwo(desiredAllocSize); + assert newAllocationSize >= 1; + + checkDataBufferSize(newAllocationSize); + + final ArrowBuf newBuf = allocator.buffer(newAllocationSize); + dataBuffers.add(newBuf); + } + + /** + * Reallocate Validity buffer. + */ + public void reallocValidityBuffer() { + int targetValidityCount = capAtMaxInt((validityBuffer.capacity() * 8) * 2); + if (targetValidityCount == 0) { + if (lastValueCapacity > 0) { + targetValidityCount = lastValueCapacity; + } else { + targetValidityCount = 2 * INITIAL_VALUE_ALLOCATION; + } + } + + long validityBufferSize = computeValidityBufferSize(targetValidityCount); + + final ArrowBuf newValidityBuffer = allocator.buffer(validityBufferSize); + newValidityBuffer.setBytes(0, validityBuffer, 0, validityBuffer.capacity()); + newValidityBuffer.setZero(validityBuffer.capacity(), newValidityBuffer.capacity() - validityBuffer.capacity()); + validityBuffer.getReferenceManager().release(); + validityBuffer = newValidityBuffer; + + lastValueCapacity = getValueCapacity(); + } + + private long computeValidityBufferSize(int valueCount) { + return (valueCount + 7) / 8; + } + + /** + * Get the size (number of bytes) of underlying view buffer. + * @return number of bytes in the view buffer + */ + @Override + public int getByteCapacity() { + return capAtMaxInt(viewBuffer.capacity()); + } + + @Override + public int sizeOfValueBuffer() { + throw new UnsupportedOperationException("sizeOfValueBuffer is not supported for BaseVariableWidthViewVector"); + } + + /** + * Get the size (number of bytes) of underlying elements in the view buffer. + * @return number of bytes used by data in the view buffer + */ + public int sizeOfViewBufferElements() { + if (valueCount == 0) { + return 0; + } + int totalSize = 0; + for (int i = 0; i < valueCount; i++) { + totalSize += getValueLength(i); + } + return totalSize; + } + + /** + * Get the size (number of bytes) of underlying buffers used by this + * vector. + * @return size of underlying buffers. + */ + @Override + public int getBufferSize() { + return getBufferSizeFor(this.valueCount); + } + + /** + * Get the potential buffer size for a particular number of records. + * @param valueCount desired number of elements in the vector + * @return estimated size of underlying buffers if the vector holds + * a given number of elements + */ + @Override + public int getBufferSizeFor(final int valueCount) { + if (valueCount == 0) { + return 0; + } + + final int validityBufferSize = getValidityBufferSizeFromCount(valueCount); + final int viewBufferSize = valueCount * ELEMENT_SIZE; + final int dataBufferSize = getDataBufferSize(); + return validityBufferSize + viewBufferSize + dataBufferSize; + } + + private int getDataBufferSize() { + int dataBufferSize = 0; + for (ArrowBuf buf : dataBuffers) { + dataBufferSize += (int) buf.writerIndex(); + } + return dataBufferSize; + } + + /** + * Get information about how this field is materialized. + * @return the field corresponding to this vector + */ + @Override + public Field getField() { + return field; + } + + /** + * Return the underlying buffers associated with this vector. 
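
As a worked reading of getBufferSizeFor above: with valueCount = 100 and, say, 2,000 bytes already written to the data buffers, the estimate is ceil(100 / 8) = 13 validity bytes, plus 100 * 16 = 1,600 view bytes (assuming the 16-byte ELEMENT_SIZE used throughout this class), plus the 2,000 data-buffer bytes, for 3,613 bytes in total. The numbers are illustrative only.
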
Note that this doesn't + * impact the reference counts for these buffers, so they should only be used for in-context + * access. Also note that these buffers change regularly; thus, + * external classes shouldn't hold a reference to them (unless they change it). + *

+ * Note: This method only returns the validityBuffer and the viewBuffer. + * It doesn't return the data buffers. + *

+ * TODO: Implement a strategy to retrieve the data buffers. + * + * @param clear Whether to clear the vector before returning; the buffers will still be refcounted, + * but the returned array will be the only reference to them + * @return The underlying {@link ArrowBuf buffers} that are used by this + * vector instance. + */ + @Override + public ArrowBuf[] getBuffers(boolean clear) { + final ArrowBuf[] buffers; + setReaderAndWriterIndex(); + if (getBufferSize() == 0) { + buffers = new ArrowBuf[0]; + } else { + buffers = new ArrowBuf[2]; + buffers[0] = validityBuffer; + buffers[1] = viewBuffer; + } + if (clear) { + for (final ArrowBuf buffer : buffers) { + buffer.getReferenceManager().retain(); + } + clear(); + } + return buffers; + } + + /** + * Validate the scalar values held by this vector. + */ + public void validateScalars() { + // No validation by default. + } + + /** + * Construct a transfer pair of this vector and another vector of the same type. + * @param field The field materialized by this vector. + * @param allocator allocator for the target vector + * @param callBack not used + * @return TransferPair + */ + @Override + public TransferPair getTransferPair(Field field, BufferAllocator allocator, CallBack callBack) { + return getTransferPair(field, allocator); + } + + /** + * Construct a transfer pair of this vector and another vector of the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @param callBack not used + * @return TransferPair + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + return getTransferPair(ref, allocator); + } + + /** + * Construct a transfer pair of this vector and another vector of the same type. + * @param allocator allocator for the target vector + * @return TransferPair + */ + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return getTransferPair(getName(), allocator); + } + + /** + * Construct a transfer pair of this vector and another vector of the same type. + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return TransferPair + */ + @Override + public abstract TransferPair getTransferPair(String ref, BufferAllocator allocator); + + /** + * Construct a transfer pair of this vector and another vector of the same type. + * @param field The field materialized by this vector. + * @param allocator allocator for the target vector + * @return TransferPair + */ + @Override + public abstract TransferPair getTransferPair(Field field, BufferAllocator allocator); + + /** + * Transfer this vector's data to another vector. + * The memory associated with this vector is transferred to the allocator of the target vector + * for accounting and management purposes. + * @param target destination vector for transfer + */ + public void transferTo(BaseVariableWidthViewVector target) { + throw new UnsupportedOperationException("transferTo function not supported!"); + } + + /** + * Slice this vector at the desired index and length and transfer the + * corresponding data to the target vector. + * @param startIndex start position of the split in source vector. + * @param length length of the split.
+ * @param target destination vector + */ + public void splitAndTransferTo(int startIndex, int length, + BaseVariableWidthViewVector target) { + throw new UnsupportedOperationException("splitAndTransferTo function not supported!"); + } + + /*----------------------------------------------------------------* + | | + | common getters and setters | + | | + *----------------------------------------------------------------*/ + + + /** + * Get the number of elements that are null in the vector. + * + * @return the number of null elements. + */ + @Override + public int getNullCount() { + return BitVectorHelper.getNullCount(validityBuffer, valueCount); + } + + /** + * Check if the given index is within the current value capacity + * of the vector. + * + * @param index position to check + * @return true if the index is within the current value capacity + */ + public boolean isSafe(int index) { + return index < getValueCapacity(); + } + + /** + * Check if an element at given index is null. + * + * @param index position of an element + * @return true if an element at given index is null + */ + @Override + public boolean isNull(int index) { + return (isSet(index) == 0); + } + + /** + * Same as {@link #isNull(int)}. + * + * @param index position of an element + * @return 1 if element at given index is not null, 0 otherwise + */ + public int isSet(int index) { + final int byteIndex = index >> 3; + final byte b = validityBuffer.getByte(byteIndex); + final int bitIndex = index & 7; + return (b >> bitIndex) & 0x01; + } + + /** + * Get the value count of vector. This will always be zero unless + * setValueCount(int) has been called prior to calling this. + * + * @return valueCount for the vector + */ + @Override + public int getValueCount() { + return valueCount; + } + + /** + * Sets the value count for the vector. + * + * @param valueCount value count + */ + @Override + public void setValueCount(int valueCount) { + assert valueCount >= 0; + this.valueCount = valueCount; + while (valueCount > getValueCapacity()) { + reallocViewBuffer(); + reallocValidityBuffer(); + } + lastSet = valueCount - 1; + setReaderAndWriterIndex(); + } + + /** + * Create holes in the vector upto the given index (exclusive). + * Holes will be created from the current last-set position in + * the vector. + * + * @param index target index + */ + @Override + public void fillEmpties(int index) { + handleSafe(index, EMPTY_BYTE_ARRAY.length); + lastSet = index - 1; + } + + /** + * Set the index of the last non-null element in the vector. + * It is important to call this method with appropriate value + * before calling {@link #setValueCount(int)}. + * + * @param value desired index of last non-null element. + */ + @Override + public void setLastSet(int value) { + lastSet = value; + } + + /** + * Get the index of the last non-null element in the vector. + * + * @return index of the last non-null element + */ + @Override + public int getLastSet() { + return lastSet; + } + + /** + * Mark the particular position in the vector as non-null. + * + * @param index position of the element. + */ + @Override + public void setIndexDefined(int index) { + // We need to check and reallocate the validity buffer + while (index >= getValueCapacity()) { + reallocValidityBuffer(); + } + BitVectorHelper.setBit(validityBuffer, index); + } + + /** + * Sets the value length for an element. 
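
The bit arithmetic in isSet above is compact enough to misread; this standalone sketch restates the same logic on a plain byte array (illustrative only, not part of the patch):

```java
final class ValidityBits {
  // Mirrors the isSet logic: one validity bit per element,
  // least-significant bit first within each byte.
  static int isSet(byte[] validity, int index) {
    final int byteIndex = index >> 3; // index / 8
    final int bitIndex = index & 7;   // index % 8
    return (validity[byteIndex] >> bitIndex) & 0x01;
  }
  // Example: index 21 lives at byte 2, bit 5 (21 = 2 * 8 + 5).
}
```
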
+ * + * @param index position of the element to set + * @param length length of the element + */ + @Override + public void setValueLengthSafe(int index, int length) { + assert index >= 0; + handleSafe(index, length); + lastSet = index; + } + + /** + * Get the length of the variable width element at specified index. + * + * @param index position of an element to get + * @return length of the element at the given index, 0 for a null element + */ + @Override + public int getValueLength(int index) { + assert index >= 0; + if (index < 0 || index >= viewBuffer.capacity() / ELEMENT_SIZE) { + throw new IndexOutOfBoundsException("Index out of bounds: " + index); + } + if (isSet(index) == 0) { + return 0; + } + return viewBuffer.getInt(((long) index * ELEMENT_SIZE)); + } + + /** + * Set the variable length element at the specified index to the supplied + * byte array. This is the same as using {@link #set(int, byte[], int, int)} + * with start as 0 and length as value.length. + * + * @param index position of the element to set + * @param value array of bytes to write + */ + public void set(int index, byte[] value) { + assert index >= 0; + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, value, 0, value.length); + lastSet = index; + } + + /** + * Same as {@link #set(int, byte[])} except that it handles the + * case where index and length of a new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param value array of bytes to write + */ + @Override + public void setSafe(int index, byte[] value) { + assert index >= 0; + // check if the current index can be populated + handleSafe(index, value.length); + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, value, 0, value.length); + lastSet = index; + } + + /** + * Set the variable length element at the specified index to the supplied + * byte array. + * + * @param index position of the element to set + * @param value array of bytes to write + * @param start start index in an array of bytes + * @param length length of data in an array of bytes + */ + public void set(int index, byte[] value, int start, int length) { + assert index >= 0; + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, value, start, length); + lastSet = index; + } + + /** + * Same as {@link #set(int, byte[], int, int)} except that it handles the + * case where index and length of a new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param value array of bytes to write + * @param start start index in an array of bytes + * @param length length of data in an array of bytes + */ + public void setSafe(int index, byte[] value, int start, int length) { + assert index >= 0; + handleSafe(index, length); + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, value, start, length); + lastSet = index; + } + + /** + * Set the variable length element at the specified index to the + * content in the supplied ByteBuffer.
+ * + * @param index position of the element to set + * @param value ByteBuffer with data + * @param start start index in ByteBuffer + * @param length length of data in ByteBuffer + */ + public void set(int index, ByteBuffer value, int start, int length) { + assert index >= 0; + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, value.array(), start, length); + lastSet = index; + } + + /** + * Same as {@link #set(int, ByteBuffer, int, int)} except that it handles the + * case where index and length of a new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param value ByteBuffer with data + * @param start start index in ByteBuffer + * @param length length of data in ByteBuffer + */ + public void setSafe(int index, ByteBuffer value, int start, int length) { + assert index >= 0; + handleSafe(index, length); + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, value.array(), start, length); + lastSet = index; + } + + /** + * Set the element at the given index to null. + * + * @param index position of an element + */ + @Override + public void setNull(int index) { + // We need to check and reallocate the validity buffer + while (index >= getValueCapacity()) { + reallocValidityBuffer(); + } + BitVectorHelper.unsetBit(validityBuffer, index); + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. + * @param index position of the new value + * @param isSet Zero for NULL value, 1 otherwise + * @param start start position of data in buffer + * @param end end position of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void set(int index, int isSet, int start, int end, ArrowBuf buffer) { + assert index >= 0; + final int dataLength = end - start; + BitVectorHelper.setValidityBit(validityBuffer, index, isSet); + setBytes(index, buffer, start, dataLength); + lastSet = index; + } + + /** + * Same as {@link #set(int, int, int, int, ArrowBuf)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * @param index position of the new value + * @param isSet Zero for NULL value, 1 otherwise + * @param start start position of data in buffer + * @param end end position of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void setSafe(int index, int isSet, int start, int end, ArrowBuf buffer) { + assert index >= 0; + final int dataLength = end - start; + handleSafe(index, dataLength); + BitVectorHelper.setValidityBit(validityBuffer, index, isSet); + setBytes(index, buffer, start, dataLength); + lastSet = index; + } + + /** + * Store the given value at a particular position in the vector. isSet indicates + * whether the value is NULL or not. 
+ * @param index position of the new value + * @param start start position of data in buffer + * @param length length of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void set(int index, int start, int length, ArrowBuf buffer) { + assert index >= 0; + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, buffer, start, length); + lastSet = index; + } + + /** + * Same as {@link #set(int, int, int, int, ArrowBuf)} except that it handles the case + * when index is greater than or equal to current value capacity of the + * vector. + * @param index position of the new value + * @param start start position of data in buffer + * @param length length of data in buffer + * @param buffer data buffer containing the variable width element to be stored + * in the vector + */ + public void setSafe(int index, int start, int length, ArrowBuf buffer) { + assert index >= 0; + handleSafe(index, length); + BitVectorHelper.setBit(validityBuffer, index); + setBytes(index, buffer, start, length); + lastSet = index; + } + + + /*----------------------------------------------------------------* + | | + | helper methods for setters | + | | + *----------------------------------------------------------------*/ + + + protected ArrowBuf allocateOrGetLastDataBuffer(int length) { + long dataBufferSize; + if (initialDataBufferSize > 0) { + dataBufferSize = Math.max(initialDataBufferSize, length); + } else { + dataBufferSize = Math.max(lastValueAllocationSizeInBytes, length); + } + + if (dataBuffers.isEmpty() || dataBuffers.get(dataBuffers.size() - 1).capacity() - + dataBuffers.get(dataBuffers.size() - 1).writerIndex() < length) { + ArrowBuf newBuf = allocator.buffer(dataBufferSize); + dataBuffers.add(newBuf); + } + + return dataBuffers.get(dataBuffers.size() - 1); + } + + /** + * This method is used to create a view buffer for a variable width vector. + * It handles both inline and data buffers. + *

+ * If the length of the value is less than or equal to {@link #INLINE_SIZE}, the value is stored in the valueBuffer + * directly as an inline buffer. + * The valueBuffer stores the length of the value followed by the value itself. + * If the length of the value is greater than {@link #INLINE_SIZE}, a new buffer is allocated and added to dataBuffers + * to hold the value. + * The viewBuffer in this case stores the length of the value, a prefix of the value, the index of the + * new buffer in dataBuffers, and the offset of the value in the new buffer. + * + * @param index The index at which the new value will be inserted. + * @param value The byte array that contains the data to be inserted. + * @param start The start index in the byte array from where the data for the new value begins. + * @param length The length of the data in the byte array that belongs to the new value. + */ + protected final void setBytes(int index, byte[] value, int start, int length) { + int writePosition = index * ELEMENT_SIZE; + + // to clear the memory segment of view being written to + // this is helpful in case of overwriting the value + viewBuffer.setZero(writePosition, ELEMENT_SIZE); + + if (value.length <= INLINE_SIZE) { + // allocate inline buffer + // set length + viewBuffer.setInt(writePosition, length); + writePosition += LENGTH_WIDTH; + // set data + viewBuffer.setBytes(writePosition, value, start, length); + } else { + // allocate data buffer + ArrowBuf currentBuf = allocateOrGetLastDataBuffer(length); + + // set length + viewBuffer.setInt(writePosition, length); + writePosition += LENGTH_WIDTH; + // set prefix + viewBuffer.setBytes(writePosition, value, start, PREFIX_WIDTH); + writePosition += PREFIX_WIDTH; + // set buf id + viewBuffer.setInt(writePosition, dataBuffers.size() - 1); + writePosition += BUF_INDEX_WIDTH; + // set offset + viewBuffer.setInt(writePosition, (int) currentBuf.writerIndex()); + + currentBuf.setBytes(currentBuf.writerIndex(), value, start, length); + currentBuf.writerIndex(currentBuf.writerIndex() + length); + } + } + + /** + * This method is used to create a view buffer for a variable width vector. + * Similar to {@link #setBytes(int index, byte[] value, int start, int length)} + * + * @param index The index at which the new value will be inserted. + * @param valueBuf The byte array that contains the data to be inserted. + * @param start The start index in the byte array from where the data for the new value begins. + * @param length The length of the data in the byte array that belongs to the new value. 
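
For reference, the 16-byte view that setBytes writes can be reproduced with plain ByteBuffer arithmetic. This sketch assumes the widths used by this class (4-byte length, 4-byte prefix, 4-byte buffer index, 4-byte offset, 12-byte inline threshold) and Arrow's little-endian buffers; it is an illustration, not the vector's API:

```java
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;

final class ViewLayoutSketch {
  static ByteBuffer encodeLongValue(byte[] value, int bufIndex, int offset) {
    // Layout for values longer than the inline threshold (12 bytes):
    ByteBuffer view = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
    view.putInt(value.length);     // bytes 0-3:   length
    view.put(value, 0, 4);         // bytes 4-7:   4-byte prefix of the value
    view.putInt(bufIndex);         // bytes 8-11:  index into dataBuffers
    view.putInt(offset);           // bytes 12-15: offset within that data buffer
    return view;
    // For values of 12 bytes or fewer, bytes 4-15 hold the value itself.
  }

  public static void main(String[] args) {
    byte[] value = "certainly more than twelve bytes".getBytes(StandardCharsets.UTF_8);
    ByteBuffer view = encodeLongValue(value, 0, 0);
  }
}
```
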
+ */ + protected final void setBytes(int index, ArrowBuf valueBuf, int start, int length) { + int writePosition = index * ELEMENT_SIZE; + + // to clear the memory segment of view being written to + // this is helpful in case of overwriting the value + viewBuffer.setZero(writePosition, ELEMENT_SIZE); + + if (length <= INLINE_SIZE) { + // allocate inline buffer + // set length + viewBuffer.setInt(writePosition, length); + writePosition += LENGTH_WIDTH; + // set data + viewBuffer.setBytes(writePosition, valueBuf, start, length); + } else { + // allocate data buffer + ArrowBuf currentBuf = allocateOrGetLastDataBuffer(length); + + // set length + viewBuffer.setInt(writePosition, length); + writePosition += LENGTH_WIDTH; + // set prefix + viewBuffer.setBytes(writePosition, valueBuf, start, PREFIX_WIDTH); + writePosition += PREFIX_WIDTH; + // set buf id + viewBuffer.setInt(writePosition, dataBuffers.size() - 1); + writePosition += BUF_INDEX_WIDTH; + // set offset + viewBuffer.setInt(writePosition, (int) currentBuf.writerIndex()); + + currentBuf.setBytes(currentBuf.writerIndex(), valueBuf, start, length); + currentBuf.writerIndex(currentBuf.writerIndex() + length); + } + } + + /** + * Get the total length of the elements up to the given index. + * @param index The index of the element in the vector. + * @return The total length up to the element at the given index. + */ + public final int getTotalValueLengthUpToIndex(int index) { + int totalLength = 0; + for (int i = 0; i < index - 1; i++) { + totalLength += getValueLength(i); + } + return totalLength; + } + + protected final void handleSafe(int index, int dataLength) { + final long lastSetCapacity = lastSet < 0 ? 0 : (long) index * ELEMENT_SIZE; + final long targetCapacity = roundUpToMultipleOf16(lastSetCapacity + dataLength); + // for views, we need each buffer with 16 byte alignment, so we need to check the last written index + // in the viewBuffer and allocate a new buffer which has 16 byte alignment for adding new values. + long writePosition = (long) index * ELEMENT_SIZE; + if (viewBuffer.capacity() <= writePosition || viewBuffer.capacity() < targetCapacity) { + /* + * Everytime we want to increase the capacity of the viewBuffer, we need to make sure that the new capacity + * meets 16 byte alignment. + * If the targetCapacity is larger than the writePosition, we may not necessarily + * want to allocate the targetCapacity to viewBuffer since when it is >={@link #INLINE_SIZE} either way + * we are writing to the dataBuffer. + */ + reallocViewBuffer(Math.max(writePosition, targetCapacity)); + } + + while (index >= getValueCapacity()) { + reallocValidityBuffer(); + } + } + + /** + * Copy a cell value from a particular index in source vector to a particular position in this + * vector. + * TODO: Improve functionality to support copying views. + * Enhance CopyFrom + * + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + @Override + public void copyFrom(int fromIndex, int thisIndex, ValueVector from) { + throw new UnsupportedOperationException("copyFrom is not supported for VariableWidthVector"); + } + + /** + * Same as {@link #copyFrom(int, int, ValueVector)} except that it handles the case when the + * capacity of the vector needs to be expanded before copy. + * TODO: Improve functionality to support copying views. 
+ * Enhance CopyFrom + * @param fromIndex position to copy from in source vector + * @param thisIndex position to copy to in this vector + * @param from source vector + */ + @Override + public void copyFromSafe(int fromIndex, int thisIndex, ValueVector from) { + throw new UnsupportedOperationException("copyFromSafe is not supported for VariableWidthVector"); + } + + @Override + public ArrowBufPointer getDataPointer(int index) { + return getDataPointer(index, new ArrowBufPointer()); + } + + @Override + public ArrowBufPointer getDataPointer(int index, ArrowBufPointer reuse) { + if (isNull(index)) { + reuse.set(null, 0, 0); + } else { + int length = getValueLength(index); + if (length < INLINE_SIZE) { + int start = index * ELEMENT_SIZE + LENGTH_WIDTH; + reuse.set(viewBuffer, start, length); + } else { + final int bufIndex = + viewBuffer.getInt(((long) index * ELEMENT_SIZE) + LENGTH_WIDTH + PREFIX_WIDTH); + ArrowBuf dataBuf = dataBuffers.get(bufIndex); + reuse.set(dataBuf, 0, length); + } + + } + return reuse; + } + + @Override + public int hashCode(int index) { + return hashCode(index, null); + } + + @Override + public int hashCode(int index, ArrowBufHasher hasher) { + if (isNull(index)) { + return ArrowBufPointer.NULL_HASH_CODE; + } + final int length = getValueLength(index); + if (length < INLINE_SIZE) { + int start = index * ELEMENT_SIZE + LENGTH_WIDTH; + return ByteFunctionHelpers.hash(hasher, this.getDataBuffer(), start, start + length); + } else { + final int bufIndex = + viewBuffer.getInt(((long) index * ELEMENT_SIZE) + LENGTH_WIDTH + PREFIX_WIDTH); + final int dataOffset = + viewBuffer.getInt( + ((long) index * ELEMENT_SIZE) + LENGTH_WIDTH + PREFIX_WIDTH + BUF_INDEX_WIDTH); + ArrowBuf dataBuf = dataBuffers.get(bufIndex); + return ByteFunctionHelpers.hash(hasher, dataBuf, dataOffset, dataOffset + length); + } + } + + /** + * Retrieves the data of a variable-width element at a given index in the vector. + * + *

+ * If the length of the data is greater than {@link #INLINE_SIZE}, the data is stored in the data buffers. + * The method retrieves the buffer index and data offset from the viewBuffer, and then retrieves the data from the + * corresponding buffer in the dataBuffers list. + *

+ * If the length of the data is less than or equal to {@link #INLINE_SIZE}, the data is stored directly in the + * viewBuffer. + * The method retrieves the data directly from the viewBuffer. + * + * @param index position of the element in the vector + * @return byte array containing the data of the element + */ + protected byte[] getData(int index) { + final int dataLength = getValueLength(index); + byte[] result = new byte[dataLength]; + if (dataLength > INLINE_SIZE) { + // data is in the data buffer + // get buffer index + final int bufferIndex = + viewBuffer.getInt(((long) index * ELEMENT_SIZE) + LENGTH_WIDTH + PREFIX_WIDTH); + // get data offset + final int dataOffset = + viewBuffer.getInt( + ((long) index * ELEMENT_SIZE) + LENGTH_WIDTH + PREFIX_WIDTH + BUF_INDEX_WIDTH); + dataBuffers.get(bufferIndex).getBytes(dataOffset, result, 0, dataLength); + } else { + // data is in the view buffer + viewBuffer.getBytes( + (long) index * ELEMENT_SIZE + BUF_INDEX_WIDTH, result, 0, dataLength); + } + return result; + } + + protected void getData(int index, ReusableBuffer buffer) { + final int dataLength = getValueLength(index); + if (dataLength > INLINE_SIZE) { + // data is in the data buffer + // get buffer index + final int bufferIndex = + viewBuffer.getInt(((long) index * ELEMENT_SIZE) + LENGTH_WIDTH + PREFIX_WIDTH); + // get data offset + final int dataOffset = + viewBuffer.getInt( + ((long) index * ELEMENT_SIZE) + LENGTH_WIDTH + PREFIX_WIDTH + BUF_INDEX_WIDTH); + ArrowBuf dataBuf = dataBuffers.get(bufferIndex); + buffer.set(dataBuf, dataOffset, dataLength); + } else { + // data is in the value buffer + buffer.set(viewBuffer, ((long) index * ELEMENT_SIZE) + BUF_INDEX_WIDTH, dataLength); + } + } + + @Override + public OUT accept(VectorVisitor visitor, IN value) { + return visitor.visit(this, value); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java index 8560ba3a68b04..25c83260ef3ed 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java @@ -119,6 +119,7 @@ public byte[] get(int index) { * @param index position of element. * @param buffer the buffer to write into. */ + @Override public void read(int index, ReusableBuffer buffer) { final long startOffset = getStartOffset(index); final long dataLength = getEndOffset(index) - startOffset; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java index df424c87488a0..bc3a1e09aaa79 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java @@ -103,6 +103,7 @@ public Types.MinorType getMinorType() { * @param index position of element to get * @return array of bytes for non-null element, null otherwise */ + @Override public byte[] get(int index) { assert index >= 0; if (isSet(index) == 0) { @@ -140,6 +141,7 @@ public Text getObject(int index) { * @param index position of element. * @param buffer the buffer to write into. 
*/ + @Override public void read(int index, ReusableBuffer buffer) { final long startOffset = getStartOffset(index); final long dataLength = getEndOffset(index) - startOffset; @@ -298,7 +300,7 @@ public void validateScalars() { *----------------------------------------------------------------*/ /** - * Construct a TransferPair comprising of this and a target vector of + * Construct a TransferPair comprising this and a target vector of * the same type. * * @param ref name of the target vector diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java index ae465418cf2fd..18032528c86d8 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java @@ -46,6 +46,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; import org.apache.arrow.vector.types.pojo.ArrowType.Union; import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; +import org.apache.arrow.vector.types.pojo.ArrowType.Utf8View; /** * The buffer layout of vectors for a given type. @@ -173,11 +174,23 @@ public TypeLayout visit(Binary type) { return newVariableWidthTypeLayout(); } + @Override + public TypeLayout visit(ArrowType.BinaryView type) { + // TODO: https://github.com/apache/arrow/issues/40934 + throw new UnsupportedOperationException("BinaryView not supported"); + } + @Override public TypeLayout visit(Utf8 type) { return newVariableWidthTypeLayout(); } + @Override + public TypeLayout visit(Utf8View type) { + // TODO: https://github.com/apache/arrow/issues/40934 + throw new UnsupportedOperationException("Utf8View not supported"); + } + @Override public TypeLayout visit(LargeUtf8 type) { return newLargeVariableWidthTypeLayout(); @@ -347,11 +360,23 @@ public Integer visit(Binary type) { return VARIABLE_WIDTH_BUFFER_COUNT; } + @Override + public Integer visit(ArrowType.BinaryView type) { + // TODO: https://github.com/apache/arrow/issues/40935 + return VARIABLE_WIDTH_BUFFER_COUNT; + } + @Override public Integer visit(Utf8 type) { return VARIABLE_WIDTH_BUFFER_COUNT; } + @Override + public Integer visit(Utf8View type) { + // TODO: https://github.com/apache/arrow/issues/40935 + return VARIABLE_WIDTH_BUFFER_COUNT; + } + @Override public Integer visit(LargeUtf8 type) { return VARIABLE_WIDTH_BUFFER_COUNT; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java index ab67ebad965aa..82d4feda9a991 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java @@ -120,6 +120,7 @@ public byte[] get(int index) { * @param index position of element. * @param buffer the buffer to write into. */ + @Override public void read(int index, ReusableBuffer buffer) { final int startOffset = getStartOffset(index); final int dataLength = getEndOffset(index) - startOffset; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java index c6d5a7090bc6f..fde9459e60084 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java @@ -137,6 +137,7 @@ public Text getObject(int index) { * @param index position of element. * @param buffer the buffer to write into. 
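
The read(int, ReusableBuffer) overrides being added across these vectors all serve the same purpose: letting callers scan without allocating a fresh byte[] or Text per element. A usage sketch (assuming Text, which implements ReusableBuffer, and any of the vectors in this patch):

```java
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.util.Text;

final class ReusableReadSketch {
  static void scan(VarCharVector vector) {
    final Text reusable = new Text();
    for (int i = 0; i < vector.getValueCount(); i++) {
      if (!vector.isNull(i)) {
        vector.read(i, reusable); // caller checks nullity first, per the javadoc
        // ... consume 'reusable' before the next iteration overwrites it
      }
    }
  }
}
```
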
*/ + @Override public void read(int index, ReusableBuffer buffer) { final int startOffset = getStartOffset(index); final int dataLength = getEndOffset(index) - startOffset; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthFieldVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthFieldVector.java new file mode 100644 index 0000000000000..58b6940a81a14 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthFieldVector.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import java.nio.ByteBuffer; + +import org.apache.arrow.memory.ReusableBuffer; + +/** +* A base interface for common functionalities in variable width vectors. +*/ +public interface VariableWidthFieldVector extends VariableWidthVector, FieldVector, VectorDefinitionSetter { + + /** + * Set the variable length element at the specified index to the supplied byte array. + * + * @param index position of the element to set + * @param value array of bytes with data + */ + void set(int index, byte[] value); + + /** + * Set the variable length element at the specified index to the supplied byte array. + * + * @param index position of the element to set + * @param value array of bytes with data + * @param start start position in the array + * @param length length of the data to write + */ + void set(int index, byte[] value, int start, int length); + + /** + * Set the variable length element at the specified index to the supplied ByteBuffer. + * + * @param index position of the element to set + * @param value ByteBuffer with data + * @param start start position in the ByteBuffer + * @param length length of the data to write + */ + void set(int index, ByteBuffer value, int start, int length); + + /** + * Set the variable length element at the specified index to the supplied byte array, and it + * handles the case where index and length of a new element are beyond the existing capacity of the + * vector. + * + * @param index position of the element to set + * @param value array of bytes to write + */ + void setSafe(int index, byte[] value); + + /** + * Set the variable length element at the specified index to the supplied byte array, and it + * handles the case where index and length of a new element are beyond the existing capacity. 
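
Because VarCharVector, LargeVarCharVector, and (once its TODOs are resolved) ViewVarCharVector will all implement this interface, writers can be coded once against it. A sketch under that assumption:

```java
import java.nio.charset.StandardCharsets;

import org.apache.arrow.vector.VariableWidthFieldVector;

final class GenericWriterSketch {
  static void writeAll(VariableWidthFieldVector vector, String[] values) {
    for (int i = 0; i < values.length; i++) {
      // setSafe grows the vector as needed, per the contract above
      vector.setSafe(i, values[i].getBytes(StandardCharsets.UTF_8));
    }
    vector.setValueCount(values.length);
  }
}
```
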
+ * + * @param index position of the element to set + * @param value array of bytes with data + * @param start start position in the array + * @param length length of the data to write + */ + void setSafe(int index, byte[] value, int start, int length); + + /** + * Set the variable length element at the specified index to the supplied ByteBuffer, and it + * handles the case where index and length of a new element are beyond the existing capacity. + * + * @param index position of the element to set + * @param value ByteBuffer with data + * @param start start position in the ByteBuffer + * @param length length of the data to write + */ + void setSafe(int index, ByteBuffer value, int start, int length); + + /** + * Get the variable length element at the specified index. + * + * @param index position of the element to get + * @return byte array with the data + */ + byte[] get(int index); + + /** + * Get the variable length element at the specified index using a ReusableBuffer. + * + * @param index position of the element to get + * @param buffer ReusableBuffer to write the data to + */ + void read(int index, ReusableBuffer buffer); + + /** + * Get the index of the last non-null element in the vector. + * + * @return index of the last non-null element + */ + int getLastSet(); + + /** + * Set the index of the last non-null element in the vector. + * + * @param value desired index of last non-null element + */ + void setLastSet(int value); + + /** + * Get the variable length element at specified index as Text. + * + * @param index position of an element to get + * @return greater than length 0 for a non-null element, 0 otherwise + */ + int getValueLength(int index); + + /** + * Create holes in the vector upto the given index (exclusive). + * Holes will be created from the current last-set position in + * the vector. + * + * @param index target index + */ + void fillEmpties(int index); + + /** + * Sets the value length for an element. + * + * @param index position of the element to set + * @param length length of the element + */ + void setValueLengthSafe(int index, int length); +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ViewVarBinaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ViewVarBinaryVector.java new file mode 100644 index 0000000000000..393df96b2969e --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/ViewVarBinaryVector.java @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.ReusableBuffer; +import org.apache.arrow.vector.complex.impl.ViewVarBinaryReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.NullableViewVarBinaryHolder; +import org.apache.arrow.vector.holders.ViewVarBinaryHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; + +/** + * ViewVarBinaryVector implements a variable width view vector of binary values which could be NULL. A + * validity buffer (bit vector) is maintained to track which elements in the vector are null. + */ +public final class ViewVarBinaryVector extends BaseVariableWidthViewVector { + + /** + * Instantiate a ViewVarBinaryVector. This doesn't allocate any memory for the data in vector. + * + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public ViewVarBinaryVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.VIEWVARBINARY.getType()), allocator); + } + + /** + * Instantiate a ViewVarBinaryVector. This doesn't allocate any memory for the data in vector. + * + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public ViewVarBinaryVector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a ViewVarBinaryVector. This doesn't allocate any memory for the data in vector. + * + * @param field field materialized by this vector + * @param allocator allocator for memory management. + */ + public ViewVarBinaryVector(Field field, BufferAllocator allocator) { + super(field, allocator); + } + + @Override + protected FieldReader getReaderImpl() { + return new ViewVarBinaryReaderImpl(ViewVarBinaryVector.this); + } + + /** + * Get a minor type for this vector. The vector holds values belonging to a particular type. + * + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.VIEWVARBINARY; + } + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + /** + * Get the variable length element at specified index as a byte array. + * + * @param index position of an element to get + * @return array of bytes for a non-null element, null otherwise + */ + public byte[] get(int index) { + assert index >= 0; + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + return null; + } + return getData(index); + } + + /** + * Read the value at the given position to the given output buffer. The caller is responsible for + * checking for nullity first. + * + * @param index position of an element. + * @param buffer the buffer to write into. + */ + @Override + public void read(int index, ReusableBuffer buffer) { + getData(index, buffer); + } + + /** + * Get the variable length element at a specified index as a byte array. 
+ * + * @param index position of an element to get + * @return byte array for a non-null element, null otherwise + */ + @Override + public byte[] getObject(int index) { + return get(index); + } + + /** + * Get the variable length element at specified index and sets the state in provided holder. + * + * @param index position of an element to get + * @param holder data holder to be populated by this function + */ + public void get(int index, NullableViewVarBinaryHolder holder) { + // TODO: https://github.com/apache/arrow/issues/40936 + throw new UnsupportedOperationException("Unsupported operation"); + } + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + /** + * Set the variable length element at the specified index to the data buffer supplied in the + * holder. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, ViewVarBinaryHolder holder) { + // TODO: https://github.com/apache/arrow/issues/40936 + throw new UnsupportedOperationException("Unsupported operation"); + } + + /** + * Same as {@link #set(int, ViewVarBinaryHolder)} except that it handles the case where index and + * length of a new element are beyond the existing capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void setSafe(int index, ViewVarBinaryHolder holder) { + // TODO: https://github.com/apache/arrow/issues/40936 + throw new UnsupportedOperationException("Unsupported operation"); + } + + /** + * Set the variable length element at the specified index to the data buffer supplied in the + * holder. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, NullableViewVarBinaryHolder holder) { + // TODO: https://github.com/apache/arrow/issues/40936 + throw new UnsupportedOperationException("Unsupported operation"); + } + + /** + * Same as {@link #set(int, NullableViewVarBinaryHolder)} except that it handles the case where index + * and length of a new element are beyond the existing capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void setSafe(int index, NullableViewVarBinaryHolder holder) { + // TODO: https://github.com/apache/arrow/issues/40936 + throw new UnsupportedOperationException("Unsupported operation"); + } + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + /** + * Construct a TransferPair comprising this and a target vector of the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + // TODO: https://github.com/apache/arrow/issues/40932 + throw new UnsupportedOperationException("Unsupported operation"); + } + + @Override + public TransferPair getTransferPair(Field field, BufferAllocator allocator) { + // TODO: https://github.com/apache/arrow/issues/40932 + throw new UnsupportedOperationException("Unsupported operation"); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. 
+ * + * @param to target vector + * @return {@link TransferPair} + */ + @Override + public TransferPair makeTransferPair(ValueVector to) { + // TODO: https://github.com/apache/arrow/issues/40932 + throw new UnsupportedOperationException("Unsupported operation"); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ViewVarCharVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ViewVarCharVector.java new file mode 100644 index 0000000000000..010df02e0bce0 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/ViewVarCharVector.java @@ -0,0 +1,291 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.ReusableBuffer; +import org.apache.arrow.vector.complex.impl.ViewVarCharReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.NullableViewVarCharHolder; +import org.apache.arrow.vector.holders.ViewVarCharHolder; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.Text; +import org.apache.arrow.vector.util.TransferPair; +import org.apache.arrow.vector.validate.ValidateUtil; + +/** + * ViewVarCharVector implements a view of a variable width vector of VARCHAR + * values which could be NULL. A validity buffer (bit vector) is maintained + * to track which elements in the vector are null. A viewBuffer keeps track + * of all values in the vector, and an external data buffer is kept to keep longer + * strings (>12). + */ +public final class ViewVarCharVector extends BaseVariableWidthViewVector { + + /** + * Instantiate a ViewVarCharVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param allocator allocator for memory management. + */ + public ViewVarCharVector(String name, BufferAllocator allocator) { + this(name, FieldType.nullable(MinorType.VARCHAR.getType()), allocator); + } + + /** + * Instantiate a ViewVarCharVector. This doesn't allocate any memory for + * the data in vector. + * @param name name of the vector + * @param fieldType type of Field materialized by this vector + * @param allocator allocator for memory management. + */ + public ViewVarCharVector(String name, FieldType fieldType, BufferAllocator allocator) { + this(new Field(name, fieldType, null), allocator); + } + + /** + * Instantiate a ViewVarCharVector. This doesn't allocate any memory for + * the data in vector. 
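
The "(>12)" in the class comment above is the inline threshold. In behavioral terms (an illustrative sketch, given a freshly allocated vector):

```java
import java.nio.charset.StandardCharsets;

import org.apache.arrow.vector.ViewVarCharVector;

final class InliningSketch {
  static void demo(ViewVarCharVector vector) {
    // 4 bytes <= 12: stored entirely inside the 16-byte view; no data buffer used.
    vector.setSafe(0, "tiny".getBytes(StandardCharsets.UTF_8));
    // 34 bytes > 12: bytes land in the last (or a new) entry of dataBuffers; the
    // view keeps only the length, a 4-byte prefix, a buffer index, and an offset.
    vector.setSafe(1, "decidedly longer than twelve bytes".getBytes(StandardCharsets.UTF_8));
    vector.setValueCount(2);
  }
}
```
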
+ * + * @param field field materialized by this vector + * @param allocator allocator for memory management. + */ + public ViewVarCharVector(Field field, BufferAllocator allocator) { + super(field, allocator); + } + + @Override + protected FieldReader getReaderImpl() { + return new ViewVarCharReaderImpl(ViewVarCharVector.this); + } + + /** + * Get a minor type for this vector. The vector holds values belonging + * to a particular type. + * @return {@link org.apache.arrow.vector.types.Types.MinorType} + */ + @Override + public MinorType getMinorType() { + return MinorType.VIEWVARCHAR; + } + + /*----------------------------------------------------------------* + | | + | vector value retrieval methods | + | | + *----------------------------------------------------------------*/ + + /** + * Get the variable length element at specified index as a byte array. + * + * @param index position of an element to get + * @return array of bytes for a non-null element, null otherwise + */ + public byte[] get(int index) { + assert index >= 0; + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + return null; + } + return getData(index); + } + + /** + * Get the variable length element at specified index as Text. + * + * @param index position of an element to get + * @return Text object for a non-null element, null otherwise + */ + @Override + public Text getObject(int index) { + assert index >= 0; + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { + return null; + } + + final Text result = new Text(); + read(index, result); + return result; + } + + /** + * Read the value at the given position to the given output buffer. + * The caller is responsible for checking for nullity first. + * + * @param index position of an element. + * @param buffer the buffer to write into. + */ + @Override + public void read(int index, ReusableBuffer buffer) { + getData(index, buffer); + } + + /** + * Get the variable length element at specified index and sets the state + * in provided holder. + * + * @param index position of an element to get + * @param holder data holder to be populated by this function + */ + public void get(int index, NullableViewVarCharHolder holder) { + // TODO: https://github.com/apache/arrow/issues/40937 + throw new UnsupportedOperationException("NullableViewVarCharHolder get operation not supported"); + } + + + /*----------------------------------------------------------------* + | | + | vector value setter methods | + | | + *----------------------------------------------------------------*/ + + + /** + * Set the variable length element at the specified index to the data + * buffer supplied in the holder. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, ViewVarCharHolder holder) { + // TODO: https://github.com/apache/arrow/issues/40937 + throw new UnsupportedOperationException("ViewVarCharHolder set operation not supported"); + } + + /** + * Same as {@link #set(int, ViewVarCharHolder)} except that it handles the + * case where index and length of a new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. 
+ */ + public void setSafe(int index, ViewVarCharHolder holder) { + // TODO: https://github.com/apache/arrow/issues/40937 + throw new UnsupportedOperationException("ViewVarCharHolder setSafe operation not supported"); + } + + /** + * Set the variable length element at the specified index to the data + * buffer supplied in the holder. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void set(int index, NullableViewVarCharHolder holder) { + // TODO: https://github.com/apache/arrow/issues/40937 + throw new UnsupportedOperationException("NullableViewVarCharHolder set operation not supported"); + } + + /** + * Same as {@link #set(int, NullableViewVarCharHolder)} except that it handles the + * case where index and length of a new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set + * @param holder holder that carries data buffer. + */ + public void setSafe(int index, NullableViewVarCharHolder holder) { + // TODO: https://github.com/apache/arrow/issues/40937 + throw new UnsupportedOperationException("NullableViewVarCharHolder setSafe operation not supported"); + } + + /** + * Set the variable length element at the specified index to the + * content in supplied Text. + * + * @param index position of the element to set + * @param text Text object with data + */ + public void set(int index, Text text) { + set(index, text.getBytes(), 0, (int) text.getLength()); + } + + /** + * Same as {@link #set(int, NullableViewVarCharHolder)} except that it handles the + * case where index and length of a new element are beyond the existing + * capacity of the vector. + * + * @param index position of the element to set. + * @param text Text object with data + */ + public void setSafe(int index, Text text) { + setSafe(index, text.getBytes(), 0, (int) text.getLength()); + } + + @Override + public void validateScalars() { + for (int i = 0; i < getValueCount(); ++i) { + byte[] value = get(i); + if (value != null) { + ValidateUtil.validateOrThrow(Text.validateUTF8NoThrow(value), + "Non-UTF-8 data in VarCharVector at position " + i + "."); + } + } + } + + /*----------------------------------------------------------------* + | | + | vector transfer | + | | + *----------------------------------------------------------------*/ + + /** + * Construct a TransferPair comprising this and a target vector of the same type. + * + * @param ref name of the target vector + * @param allocator allocator for the target vector + * @return {@link TransferPair} (UnsupportedOperationException) + */ + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + // TODO: https://github.com/apache/arrow/issues/40932 + throw new UnsupportedOperationException( + "ViewVarCharVector does not support getTransferPair(String, BufferAllocator)"); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. + * + * @param field The field materialized by this vector. + * @param allocator allocator for the target vector + * @return {@link TransferPair} (UnsupportedOperationException) + */ + @Override + public TransferPair getTransferPair(Field field, BufferAllocator allocator) { + // TODO: https://github.com/apache/arrow/issues/40932 + throw new UnsupportedOperationException( + "ViewVarCharVector does not support getTransferPair(Field, BufferAllocator)"); + } + + /** + * Construct a TransferPair with a desired target vector of the same type. 
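
To show where validateScalars and the Text setters above meet, a small round-trip sketch (illustrative only; assumes an allocated vector):

```java
import org.apache.arrow.vector.ViewVarCharVector;
import org.apache.arrow.vector.util.Text;

final class Utf8ValidationSketch {
  static void roundTrip(ViewVarCharVector vector) {
    vector.setSafe(0, new Text("héllo")); // Text handles the UTF-8 encoding
    vector.setValueCount(1);
    vector.validateScalars();             // throws if any value is not valid UTF-8
    Text decoded = vector.getObject(0);   // reads back through read(int, ReusableBuffer)
  }
}
```
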
+ * + * @param target the target for the transfer + * @return {@link TransferPair} (UnsupportedOperationException) + */ + @Override + public TransferPair makeTransferPair(ValueVector target) { + // TODO: https://github.com/apache/arrow/issues/40932 + throw new UnsupportedOperationException( + "ViewVarCharVector does not support makeTransferPair(ValueVector)"); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java index 5323ddda838c8..56220d270fa9b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java @@ -27,6 +27,7 @@ import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthViewVector; import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.NullVector; @@ -162,6 +163,11 @@ public Boolean visit(BaseLargeVariableWidthVector left, Range range) { return compareBaseLargeVariableWidthVectors(range); } + @Override + public Boolean visit(BaseVariableWidthViewVector left, Range range) { + throw new UnsupportedOperationException("View vectors are not supported."); + } + @Override public Boolean visit(ListVector left, Range range) { if (!validate(left)) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java index 443ee1f96e273..9bbe5c1b8997c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java @@ -23,6 +23,7 @@ import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthViewVector; import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.ValueVector; @@ -85,6 +86,11 @@ public Boolean visit(BaseLargeVariableWidthVector left, Void value) { return compareField(left.getField(), right.getField()); } + @Override + public Boolean visit(BaseVariableWidthViewVector left, Void value) { + throw new UnsupportedOperationException("View vectors are not supported."); + } + @Override public Boolean visit(ListVector left, Void value) { return compareField(left.getField(), right.getField()); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java index aee090706b3c8..de88f25e6753d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/VectorVisitor.java @@ -20,6 +20,7 @@ import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthViewVector; import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.complex.DenseUnionVector; @@ -42,6 
+43,8 @@ public interface VectorVisitor<OUT, IN> {
 
   OUT visit(BaseLargeVariableWidthVector left, IN value);
 
+  OUT visit(BaseVariableWidthViewVector left, IN value);
+
   OUT visit(ListVector left, IN value);
 
   OUT visit(FixedSizeListVector left, IN value);
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java
index 0b0e0d66a98f0..89d8441d42aa9 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java
@@ -65,6 +65,8 @@
 import org.apache.arrow.vector.ValueVector;
 import org.apache.arrow.vector.VarBinaryVector;
 import org.apache.arrow.vector.VarCharVector;
+import org.apache.arrow.vector.ViewVarBinaryVector;
+import org.apache.arrow.vector.ViewVarCharVector;
 import org.apache.arrow.vector.complex.DenseUnionVector;
 import org.apache.arrow.vector.complex.FixedSizeListVector;
 import org.apache.arrow.vector.complex.LargeListVector;
@@ -114,10 +116,13 @@
 import org.apache.arrow.vector.complex.impl.UnionWriter;
 import org.apache.arrow.vector.complex.impl.VarBinaryWriterImpl;
 import org.apache.arrow.vector.complex.impl.VarCharWriterImpl;
+import org.apache.arrow.vector.complex.impl.ViewVarBinaryWriterImpl;
+import org.apache.arrow.vector.complex.impl.ViewVarCharWriterImpl;
 import org.apache.arrow.vector.complex.writer.FieldWriter;
 import org.apache.arrow.vector.types.pojo.ArrowType;
 import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeVisitor;
 import org.apache.arrow.vector.types.pojo.ArrowType.Binary;
+import org.apache.arrow.vector.types.pojo.ArrowType.BinaryView;
 import org.apache.arrow.vector.types.pojo.ArrowType.Bool;
 import org.apache.arrow.vector.types.pojo.ArrowType.Date;
 import org.apache.arrow.vector.types.pojo.ArrowType.Decimal;
@@ -138,6 +143,7 @@
 import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp;
 import org.apache.arrow.vector.types.pojo.ArrowType.Union;
 import org.apache.arrow.vector.types.pojo.ArrowType.Utf8;
+import org.apache.arrow.vector.types.pojo.ArrowType.Utf8View;
 import org.apache.arrow.vector.types.pojo.Field;
 import org.apache.arrow.vector.types.pojo.FieldType;
 import org.apache.arrow.vector.util.CallBack;
@@ -504,6 +510,20 @@ public FieldWriter getNewFieldWriter(ValueVector vector) {
       return new VarCharWriterImpl((VarCharVector) vector);
     }
   },
+  VIEWVARCHAR(Utf8View.INSTANCE) {
+    @Override
+    public FieldVector getNewVector(
+        Field field,
+        BufferAllocator allocator,
+        CallBack schemaChangeCallback) {
+      return new ViewVarCharVector(field, allocator);
+    }
+
+    @Override
+    public FieldWriter getNewFieldWriter(ValueVector vector) {
+      return new ViewVarCharWriterImpl((ViewVarCharVector) vector);
+    }
+  },
   LARGEVARCHAR(LargeUtf8.INSTANCE) {
     @Override
     public FieldVector getNewVector(
@@ -546,6 +566,20 @@ public FieldWriter getNewFieldWriter(ValueVector vector) {
       return new VarBinaryWriterImpl((VarBinaryVector) vector);
     }
   },
+  VIEWVARBINARY(BinaryView.INSTANCE) {
+    @Override
+    public FieldVector getNewVector(
+        Field field,
+        BufferAllocator allocator,
+        CallBack schemaChangeCallback) {
+      return new ViewVarBinaryVector(field, allocator);
+    }
+
+    @Override
+    public FieldWriter getNewFieldWriter(ValueVector vector) {
+      return new ViewVarBinaryWriterImpl((ViewVarBinaryVector) vector);
+    }
+  },
   DECIMAL(null) {
     @Override
     public FieldVector getNewVector(
@@ -923,6 +957,11 @@ public MinorType visit(Utf8 type) {
       return MinorType.VARCHAR;
     }
 
+    @Override
+    public MinorType visit(Utf8View type) {
+      return
MinorType.VIEWVARCHAR; + } + @Override public Types.MinorType visit(LargeUtf8 type) { return MinorType.LARGEVARCHAR; @@ -933,6 +972,11 @@ public MinorType visit(Binary type) { return MinorType.VARBINARY; } + @Override + public MinorType visit(BinaryView type) { + return MinorType.VIEWVARBINARY; + } + @Override public MinorType visit(LargeBinary type) { return MinorType.LARGEVARBINARY; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DataSizeRoundingUtil.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DataSizeRoundingUtil.java index 3af2c98374070..5f59933975133 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DataSizeRoundingUtil.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DataSizeRoundingUtil.java @@ -93,6 +93,10 @@ public static long divideBy8Ceil(long input) { return (input + 7) >>> (long) DIVIDE_BY_8_SHIFT_BITS; } + public static long roundUpToMultipleOf16(long num) { + return (num + 15) & 0xFFFFFFFFFFFFFFF0L; + } + private DataSizeRoundingUtil() { } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/ReusableByteArray.java b/java/vector/src/main/java/org/apache/arrow/vector/util/ReusableByteArray.java index d938cd833a41a..10a195e1e7b0a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/ReusableByteArray.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/ReusableByteArray.java @@ -64,6 +64,13 @@ public void set(ArrowBuf srcBytes, long start, long len) { length = (int) len; } + @Override + public void set(byte[] srcBytes, long start, long len) { + setCapacity((int) len, false); + System.arraycopy(srcBytes, (int) start, bytes, 0, (int) len); + length = (int) len; + } + @Override public boolean equals(Object o) { if (o == this) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java index 95e35ce6938c3..ea631c59ce2f2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java @@ -235,9 +235,7 @@ public void set(Text other) { * @param len the number of bytes of the new string */ public void set(byte[] utf8, int start, int len) { - setCapacity(len, false); - System.arraycopy(utf8, start, bytes, 0, len); - this.length = len; + super.set(utf8, start, len); } /** diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java index 068717c7acbc7..def8ef96877ed 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java @@ -26,6 +26,7 @@ import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthViewVector; import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.BitVectorHelper; import org.apache.arrow.vector.ExtensionTypeVector; @@ -205,6 +206,11 @@ public ValueVector visit(BaseLargeVariableWidthVector deltaVector, Void value) { return targetVector; } + @Override + public ValueVector visit(BaseVariableWidthViewVector left, Void value) { + throw new UnsupportedOperationException("View vectors are not supported."); + } + @Override public ValueVector visit(ListVector deltaVector, Void value) { 
Preconditions.checkArgument(typeVisitor.equals(deltaVector), diff --git a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java index d4abaa1945b94..0a67db0455b41 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java @@ -23,6 +23,7 @@ import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthViewVector; import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.FieldVector; @@ -133,6 +134,11 @@ public Void visit(BaseLargeVariableWidthVector vector, Void value) { return null; } + @Override + public Void visit(BaseVariableWidthViewVector vector, Void value) { + throw new UnsupportedOperationException("View vectors are not supported."); + } + @Override public Void visit(ListVector vector, Void value) { int valueCount = vector.getValueCount(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java index 6d33be7a0dbac..ddcb658c1a95d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java @@ -23,6 +23,7 @@ import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthViewVector; import org.apache.arrow.vector.ExtensionTypeVector; import org.apache.arrow.vector.NullVector; import org.apache.arrow.vector.ValueVector; @@ -103,6 +104,11 @@ public Void visit(BaseLargeVariableWidthVector vector, Void value) { return null; } + @Override + public Void visit(BaseVariableWidthViewVector vector, Void value) { + throw new UnsupportedOperationException("View vectors are not supported."); + } + @Override public Void visit(ListVector vector, Void value) { validateOffsetBuffer(vector, vector.getValueCount()); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java index 3d1c5a4f27f7c..bbdabdb1226ad 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java @@ -22,6 +22,7 @@ import org.apache.arrow.vector.BaseFixedWidthVector; import org.apache.arrow.vector.BaseLargeVariableWidthVector; import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthViewVector; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.DateDayVector; @@ -308,6 +309,11 @@ public Void visit(BaseLargeVariableWidthVector vector, Void value) { return null; } + @Override + public Void visit(BaseVariableWidthViewVector vector, Void value) { + throw new UnsupportedOperationException("View vectors are not supported."); + } + @Override 
 public Void visit(ListVector vector, Void value) {
     validateVectorCommon(vector, ArrowType.List.class);
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java
index 7e99b1f90fb61..786a1142a2b0b 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java
@@ -23,6 +23,7 @@
 import org.apache.arrow.vector.BaseFixedWidthVector;
 import org.apache.arrow.vector.BaseLargeVariableWidthVector;
 import org.apache.arrow.vector.BaseVariableWidthVector;
+import org.apache.arrow.vector.BaseVariableWidthViewVector;
 import org.apache.arrow.vector.ExtensionTypeVector;
 import org.apache.arrow.vector.FieldVector;
 import org.apache.arrow.vector.NullVector;
@@ -98,6 +99,11 @@ public Void visit(BaseLargeVariableWidthVector left, Void value) {
     return null;
   }
 
+  @Override
+  public Void visit(BaseVariableWidthViewVector left, Void value) {
+    throw new UnsupportedOperationException("View vectors are not supported.");
+  }
+
   @Override
   public Void visit(ListVector vector, Void value) {
 
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestUtils.java b/java/vector/src/test/java/org/apache/arrow/vector/TestUtils.java
index 7e64dd3864636..be83e573c7c46 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestUtils.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestUtils.java
@@ -29,11 +29,21 @@ public static VarCharVector newVarCharVector(String name, BufferAllocator alloca
         FieldType.nullable(new ArrowType.Utf8()).createNewSingleVector(name, allocator, null);
   }
 
+  public static ViewVarCharVector newViewVarCharVector(String name, BufferAllocator allocator) {
+    return (ViewVarCharVector)
+        FieldType.nullable(new ArrowType.Utf8View()).createNewSingleVector(name, allocator, null);
+  }
+
   public static VarBinaryVector newVarBinaryVector(String name, BufferAllocator allocator) {
     return (VarBinaryVector)
         FieldType.nullable(new ArrowType.Binary()).createNewSingleVector(name, allocator, null);
   }
 
+  public static ViewVarBinaryVector newViewVarBinaryVector(String name, BufferAllocator allocator) {
+    return (ViewVarBinaryVector)
+        FieldType.nullable(new ArrowType.BinaryView()).createNewSingleVector(name, allocator, null);
+  }
+
   public static <T> T newVector(Class<T> c, String name, ArrowType type, BufferAllocator allocator) {
     return c.cast(FieldType.nullable(type).createNewSingleVector(name, allocator, null));
   }
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVarCharViewVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVarCharViewVector.java
new file mode 100644
index 0000000000000..efb5afac91b13
--- /dev/null
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVarCharViewVector.java
@@ -0,0 +1,1462 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.apache.arrow.vector.TestUtils.newVector; +import static org.apache.arrow.vector.TestUtils.newViewVarBinaryVector; +import static org.apache.arrow.vector.TestUtils.newViewVarCharVector; +import static org.apache.arrow.vector.testing.ValueVectorDataPopulator.setVector; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertSame; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; +import java.util.Random; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.memory.util.ArrowBufPointer; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.vector.testing.ValueVectorDataPopulator; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.util.ReusableByteArray; +import org.apache.arrow.vector.util.Text; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + + +public class TestVarCharViewVector { + + // short string (length <= 12) + private static final byte[] STR0 = "0123456".getBytes(StandardCharsets.UTF_8); + // short string (length <= 12) + private static final byte[] STR1 = "012345678912".getBytes(StandardCharsets.UTF_8); + // long string (length > 12) + private static final byte[] STR2 = "0123456789123".getBytes(StandardCharsets.UTF_8); + // long string (length > 12) + private static final byte[] STR3 = "01234567891234567".getBytes(StandardCharsets.UTF_8); + // short string (length <= 12) + private static final byte[] STR4 = "01234567".getBytes(StandardCharsets.UTF_8); + // short string (length <= 12) + private static final byte[] STR5 = "A1234A".getBytes(StandardCharsets.UTF_8); + // short string (length <= 12) + private static final byte[] STR6 = "B1234567B".getBytes(StandardCharsets.UTF_8); + // long string (length > 12) + private static final byte[] STR7 = "K01234567891234567K".getBytes(StandardCharsets.UTF_8); + // long string (length > 12) + private static final byte[] STR8 = "M012345678912345678M".getBytes(StandardCharsets.UTF_8); + private static final String EMPTY_SCHEMA_PATH = ""; + + private BufferAllocator allocator; + + @BeforeEach + public void prepare() { + allocator = new RootAllocator(Integer.MAX_VALUE); + } + + @AfterEach + public void shutdown() { + allocator.close(); + } + + public static void setBytes(int index, byte[] bytes, ViewVarCharVector vector) { + BitVectorHelper.setBit(vector.validityBuffer, index); + vector.setBytes(index, bytes, 0, bytes.length); + } + + @Test + public void testInlineAllocation() { + try 
(final ViewVarCharVector viewVarCharVector = new ViewVarCharVector("myvector", allocator)) { + viewVarCharVector.allocateNew(48, 3); + final int valueCount = 3; + viewVarCharVector.set(0, STR0); + viewVarCharVector.set(1, STR1); + viewVarCharVector.set(2, STR4); + viewVarCharVector.setValueCount(valueCount); + + byte[] view1 = viewVarCharVector.get(0); + byte[] view2 = viewVarCharVector.get(1); + byte[] view3 = viewVarCharVector.get(2); + + assertNotNull(view1); + assertNotNull(view2); + assertNotNull(view3); + + String str1 = new String(STR0, StandardCharsets.UTF_8); + String str2 = new String(STR1, StandardCharsets.UTF_8); + String str3 = new String(STR4, StandardCharsets.UTF_8); + + assertEquals(new String(view1, StandardCharsets.UTF_8), str1); + assertEquals(new String(view2, StandardCharsets.UTF_8), str2); + assertEquals(new String(view3, StandardCharsets.UTF_8), str3); + + assertTrue(viewVarCharVector.dataBuffers.isEmpty()); + + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(0)).getBuffer(), + StandardCharsets.UTF_8), str1); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(1)).getBuffer(), + StandardCharsets.UTF_8), str2); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(2)).getBuffer(), + StandardCharsets.UTF_8), str3); + } + } + + @Test + public void testDataBufferBasedAllocationInSameBuffer() { + try (final ViewVarCharVector viewVarCharVector = new ViewVarCharVector("myvector", allocator)) { + viewVarCharVector.allocateNew(48, 4); + final int valueCount = 4; + String str4 = generateRandomString(34); + viewVarCharVector.set(0, STR1); + viewVarCharVector.set(1, STR2); + viewVarCharVector.set(2, STR3); + viewVarCharVector.set(3, str4.getBytes(StandardCharsets.UTF_8)); + viewVarCharVector.setValueCount(valueCount); + + byte[] view1 = viewVarCharVector.get(0); + byte[] view2 = viewVarCharVector.get(1); + byte[] view3 = viewVarCharVector.get(2); + byte[] view4 = viewVarCharVector.get(3); + + assertNotNull(view1); + assertNotNull(view2); + assertNotNull(view3); + assertNotNull(view4); + + String str1 = new String(STR1, StandardCharsets.UTF_8); + String str2 = new String(STR2, StandardCharsets.UTF_8); + String str3 = new String(STR3, StandardCharsets.UTF_8); + + assertEquals(new String(view1, StandardCharsets.UTF_8), str1); + assertEquals(new String(view2, StandardCharsets.UTF_8), str2); + assertEquals(new String(view3, StandardCharsets.UTF_8), str3); + assertEquals(new String(view4, StandardCharsets.UTF_8), str4); + + assertEquals(1, viewVarCharVector.dataBuffers.size()); + + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(0)).getBuffer(), + StandardCharsets.UTF_8), str1); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(1)).getBuffer(), + StandardCharsets.UTF_8), str2); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(2)).getBuffer(), + StandardCharsets.UTF_8), str3); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(3)).getBuffer(), + StandardCharsets.UTF_8), str4); + } + } + + @Test + public void testDataBufferBasedAllocationInOtherBuffer() { + try (final ViewVarCharVector viewVarCharVector = new ViewVarCharVector("myvector", allocator)) { + viewVarCharVector.allocateNew(48, 4); + final int valueCount = 4; + String str4 = generateRandomString(35); + viewVarCharVector.set(0, STR1); + viewVarCharVector.set(1, STR2); + viewVarCharVector.set(2, STR3); + viewVarCharVector.set(3, 
str4.getBytes(StandardCharsets.UTF_8)); + viewVarCharVector.setValueCount(valueCount); + + byte[] view1 = viewVarCharVector.get(0); + byte[] view2 = viewVarCharVector.get(1); + byte[] view3 = viewVarCharVector.get(2); + byte[] view4 = viewVarCharVector.get(3); + + assertNotNull(view1); + assertNotNull(view2); + assertNotNull(view3); + assertNotNull(view4); + + String str1 = new String(STR1, StandardCharsets.UTF_8); + String str2 = new String(STR2, StandardCharsets.UTF_8); + String str3 = new String(STR3, StandardCharsets.UTF_8); + + assertEquals(new String(view1, StandardCharsets.UTF_8), str1); + assertEquals(new String(view2, StandardCharsets.UTF_8), str2); + assertEquals(new String(view3, StandardCharsets.UTF_8), str3); + assertEquals(new String(view4, StandardCharsets.UTF_8), str4); + + assertEquals(2, viewVarCharVector.dataBuffers.size()); + + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(0)).getBuffer(), + StandardCharsets.UTF_8), str1); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(1)).getBuffer(), + StandardCharsets.UTF_8), str2); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(2)).getBuffer(), + StandardCharsets.UTF_8), str3); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(3)).getBuffer(), + StandardCharsets.UTF_8), str4); + } + } + + @Test + public void testMixedAllocation() { + try (final ViewVarCharVector viewVarCharVector = new ViewVarCharVector("myvector", allocator)) { + viewVarCharVector.allocateNew(128, 6); + final int valueCount = 6; + String str4 = generateRandomString(35); + String str6 = generateRandomString(40); + viewVarCharVector.set(0, STR1); + viewVarCharVector.set(1, STR2); + viewVarCharVector.set(2, STR3); + viewVarCharVector.set(3, str4.getBytes(StandardCharsets.UTF_8)); + viewVarCharVector.set(4, STR1); + viewVarCharVector.set(5, str6.getBytes(StandardCharsets.UTF_8)); + viewVarCharVector.setValueCount(valueCount); + + byte[] view1 = viewVarCharVector.get(0); + byte[] view2 = viewVarCharVector.get(1); + byte[] view3 = viewVarCharVector.get(2); + byte[] view4 = viewVarCharVector.get(3); + byte[] view5 = viewVarCharVector.get(4); + byte[] view6 = viewVarCharVector.get(5); + + assertNotNull(view1); + assertNotNull(view2); + assertNotNull(view3); + assertNotNull(view4); + assertNotNull(view5); + assertNotNull(view6); + + String str1 = new String(STR1, StandardCharsets.UTF_8); + String str2 = new String(STR2, StandardCharsets.UTF_8); + String str3 = new String(STR3, StandardCharsets.UTF_8); + + assertEquals(new String(view1, StandardCharsets.UTF_8), str1); + assertEquals(new String(view2, StandardCharsets.UTF_8), str2); + assertEquals(new String(view3, StandardCharsets.UTF_8), str3); + assertEquals(new String(view4, StandardCharsets.UTF_8), str4); + assertEquals(new String(view5, StandardCharsets.UTF_8), str1); + assertEquals(new String(view6, StandardCharsets.UTF_8), str6); + + assertEquals(1, viewVarCharVector.dataBuffers.size()); + + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(0)).getBuffer(), + StandardCharsets.UTF_8), str1); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(1)).getBuffer(), + StandardCharsets.UTF_8), str2); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(2)).getBuffer(), + StandardCharsets.UTF_8), str3); + assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(3)).getBuffer(), + StandardCharsets.UTF_8), str4); + 
assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(4)).getBuffer(),
+          StandardCharsets.UTF_8), str1);
+      assertEquals(new String(Objects.requireNonNull(viewVarCharVector.getObject(5)).getBuffer(),
+          StandardCharsets.UTF_8), str6);
+    }
+  }
+
+  @Test
+  public void testAllocationIndexOutOfBounds() {
+    assertThrows(IndexOutOfBoundsException.class, () -> {
+      try (final ViewVarCharVector viewVarCharVector = new ViewVarCharVector("myvector", allocator)) {
+        viewVarCharVector.allocateNew(32, 3);
+        final int valueCount = 3;
+        viewVarCharVector.set(0, STR1);
+        viewVarCharVector.set(1, STR2);
+        viewVarCharVector.set(2, STR2);
+        viewVarCharVector.setValueCount(valueCount);
+      }
+    });
+  }
+
+  @Test
+  public void testSizeOfViewBufferElements() {
+    try (final ViewVarCharVector vector = new ViewVarCharVector(EMPTY_SCHEMA_PATH, allocator)) {
+      int valueCount = 100;
+      int currentSize = 0;
+      vector.setInitialCapacity(valueCount);
+      vector.allocateNew();
+      vector.setValueCount(valueCount);
+      for (int i = 0; i < valueCount; i++) {
+        currentSize += i;
+        vector.setSafe(i, new byte[i]);
+      }
+      assertEquals(currentSize, vector.sizeOfViewBufferElements());
+    }
+  }
+
+  @Test
+  public void testNullableVarType1() {
+
+    // Create a new value vector for 1024 variable-length strings.
+    try (final ViewVarCharVector vector = newViewVarCharVector(EMPTY_SCHEMA_PATH, allocator)) {
+      vector.allocateNew(1024 * 10, 1024);
+
+      vector.set(0, STR1);
+      vector.set(1, STR2);
+      vector.set(2, STR3);
+      vector.setSafe(3, STR3, 1, STR3.length - 1);
+      vector.setSafe(4, STR3, 2, STR3.length - 2);
+      ByteBuffer str3ByteBuffer = ByteBuffer.wrap(STR3);
+      vector.setSafe(5, str3ByteBuffer, 1, STR3.length - 1);
+      vector.setSafe(6, str3ByteBuffer, 2, STR3.length - 2);
+
+      // Set with convenience function
+      Text txt = new Text("foo");
+      vector.setSafe(7, txt.getBytes(), 0, (int) txt.getLength());
+
+      // Check the sample strings.
+      assertArrayEquals(STR1, vector.get(0));
+      assertArrayEquals(STR2, vector.get(1));
+      assertArrayEquals(STR3, vector.get(2));
+      assertArrayEquals(Arrays.copyOfRange(STR3, 1, STR3.length), vector.get(3));
+      assertArrayEquals(Arrays.copyOfRange(STR3, 2, STR3.length), vector.get(4));
+      assertArrayEquals(Arrays.copyOfRange(STR3, 1, STR3.length), vector.get(5));
+      assertArrayEquals(Arrays.copyOfRange(STR3, 2, STR3.length), vector.get(6));
+
+      // Check returning a Text object
+      assertEquals(txt, vector.getObject(7));
+
+      // Ensure an unset value reads back as null.
+      assertNull(vector.get(8));
+    }
+  }
+
+  @Test
+  public void testGetTextRepeatedly() {
+    try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+      ValueVectorDataPopulator.setVector(vector, STR1, STR2);
+      vector.setValueCount(2);
+
+      /* check the vector output */
+      Text text = new Text();
+      vector.read(0, text);
+      assertArrayEquals(STR1, text.getBytes());
+      vector.read(1, text);
+      assertArrayEquals(STR2, text.getBytes());
+    }
+  }
+
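An aside on the sizing arithmetic the surrounding tests rely on: each value costs one fixed-size view slot in the view buffer, and only values longer than 12 bytes spill into the data buffers. That is why testAllocationIndexOutOfBounds above fails on allocateNew(32, 3): three values need three 16-byte views, or 48 bytes. A minimal sketch of that arithmetic, assuming the 16-byte element size and 12-byte inline limit that validateViewBuffer decodes later in this file:

    public class ViewSizingSketch {
      // Assumed constants mirroring BaseVariableWidthViewVector.ELEMENT_SIZE and the
      // inline threshold used throughout these tests; they are not read from Arrow here.
      static final int ELEMENT_SIZE = 16;
      static final int INLINE_SIZE = 12;

      // View-buffer bytes needed for a given number of values.
      static long viewBufferBytes(int valueCount) {
        return (long) valueCount * ELEMENT_SIZE;
      }

      public static void main(String[] args) {
        System.out.println(viewBufferBytes(3));  // 48: allocateNew(48, 3) fits, allocateNew(32, 3) does not
        System.out.println(7 <= INLINE_SIZE);    // true: a 7-byte value like STR0 stays inline
        System.out.println(13 <= INLINE_SIZE);   // false: a 13-byte value like STR2 goes to a data buffer
      }
    }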
+  @Test
+  public void testNullableVarType2() {
+    try (final ViewVarBinaryVector vector = newViewVarBinaryVector(EMPTY_SCHEMA_PATH, allocator)) {
+      vector.allocateNew(1024 * 10, 1024);
+      vector.set(0, STR1);
+      vector.set(1, STR2);
+      vector.set(2, STR3);
+      vector.setSafe(3, STR3, 1, STR3.length - 1);
+      vector.setSafe(4, STR3, 2, STR3.length - 2);
+      ByteBuffer str3ByteBuffer = ByteBuffer.wrap(STR3);
+      vector.setSafe(5, str3ByteBuffer, 1, STR3.length - 1);
+      vector.setSafe(6, str3ByteBuffer, 2, STR3.length - 2);
+
+      // Check the sample strings.
+      assertArrayEquals(STR1, vector.get(0));
+      assertArrayEquals(STR2, vector.get(1));
+      assertArrayEquals(STR3, vector.get(2));
+      assertArrayEquals(Arrays.copyOfRange(STR3, 1, STR3.length), vector.get(3));
+      assertArrayEquals(Arrays.copyOfRange(STR3, 2, STR3.length), vector.get(4));
+      assertArrayEquals(Arrays.copyOfRange(STR3, 1, STR3.length), vector.get(5));
+      assertArrayEquals(Arrays.copyOfRange(STR3, 2, STR3.length), vector.get(6));
+
+      // Ensure an unset value reads back as null.
+      assertNull(vector.get(7));
+    }
+  }
+
+  @Test
+  public void testGetBytesRepeatedly() {
+    try (ViewVarBinaryVector vector = new ViewVarBinaryVector("", allocator)) {
+      vector.allocateNew(5, 1);
+
+      final String str = "hello world!!!";
+      final String str2 = "foo";
+      vector.setSafe(0, str.getBytes(StandardCharsets.UTF_8));
+      vector.setSafe(1, str2.getBytes(StandardCharsets.UTF_8));
+
+      // verify results
+      ReusableByteArray reusableByteArray = new ReusableByteArray();
+      vector.read(0, reusableByteArray);
+      assertArrayEquals(
+          str.getBytes(StandardCharsets.UTF_8),
+          Arrays.copyOfRange(
+              reusableByteArray.getBuffer(), 0, (int) reusableByteArray.getLength()));
+      byte[] oldBuffer = reusableByteArray.getBuffer();
+
+      vector.read(1, reusableByteArray);
+      assertArrayEquals(
+          str2.getBytes(StandardCharsets.UTF_8),
+          Arrays.copyOfRange(
+              reusableByteArray.getBuffer(), 0, (int) reusableByteArray.getLength()));
+
+      // There should not have been any reallocation since the newer value is smaller in length.
+      assertSame(oldBuffer, reusableByteArray.getBuffer());
+    }
+  }
+
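Before the reallocation test that follows, it helps to state the growth model it asserts: writing past the current value capacity with setSafe triggers a reallocation, and the test only checks the lower bound initialCapacity * 2 <= getValueCapacity(). The sketch below assumes a simple doubling policy for illustration; the actual growth strategy is not pinned down by the test:

    public class CapacityGrowthSketch {
      // Assumed model: capacity doubles until the write index fits. The test below
      // asserts only a lower bound, so this is an illustration, not Arrow's policy.
      static int capacityAfterSetSafe(int currentCapacity, int writeIndex) {
        int capacity = currentCapacity;
        while (writeIndex >= capacity) {
          capacity *= 2;
        }
        return capacity;
      }

      public static void main(String[] args) {
        int initialCapacity = 4096; // at least the 4095 requested below
        System.out.println(capacityAfterSetSafe(initialCapacity, initialCapacity + 200)); // 8192
      }
    }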
+  @Test
+  public void testReAllocVariableWidthViewVector() {
+    try (final ViewVarCharVector vector = newVector(ViewVarCharVector.class, EMPTY_SCHEMA_PATH,
+        Types.MinorType.VIEWVARCHAR, allocator)) {
+      final int capacityLimit = 4095;
+      final int overLimitIndex = 200;
+      vector.setInitialCapacity(capacityLimit);
+      vector.allocateNew();
+
+      int initialCapacity = vector.getValueCapacity();
+      assertTrue(initialCapacity >= capacityLimit);
+
+      /* Put values in indexes that fall within the initial allocation */
+      vector.setSafe(0, STR1, 0, STR1.length);
+      vector.setSafe(initialCapacity - 1, STR2, 0, STR2.length);
+
+      /* the set calls above should NOT have triggered a realloc */
+      assertEquals(initialCapacity, vector.getValueCapacity());
+
+      /* Now try to put values in space that falls beyond the initial allocation */
+      vector.setSafe(initialCapacity + overLimitIndex, STR3, 0, STR3.length);
+
+      /* Check valueCapacity is more than initial allocation */
+      assertTrue(initialCapacity * 2 <= vector.getValueCapacity());
+
+      assertArrayEquals(STR1, vector.get(0));
+      assertArrayEquals(STR2, vector.get(initialCapacity - 1));
+      assertArrayEquals(STR3, vector.get(initialCapacity + overLimitIndex));
+
+      // Set the valueCount to be more than valueCapacity of current allocation. This is possible for ValueVectors
+      // as we don't call setSafe for null values, but we do call setValueCount when the current batch is processed.
+      vector.setValueCount(vector.getValueCapacity() + overLimitIndex);
+    }
+  }
+
+  @Test
+  public void testSetSafeWithArrowBufNoExcessAllocs() {
+    final int numValues = BaseVariableWidthViewVector.INITIAL_VALUE_ALLOCATION * 2;
+    final byte[] valueBytes = "hello world!!!".getBytes(StandardCharsets.UTF_8);
+    final int valueBytesLength = valueBytes.length;
+    final int isSet = 1;
+    try (final ViewVarCharVector fromVector =
+        newVector(
+            ViewVarCharVector.class,
+            EMPTY_SCHEMA_PATH,
+            Types.MinorType.VIEWVARCHAR,
+            allocator);
+        final ViewVarCharVector toVector =
+            newVector(
+                ViewVarCharVector.class,
+                EMPTY_SCHEMA_PATH,
+                Types.MinorType.VIEWVARCHAR,
+                allocator)) {
+      /*
+       * Populate the `fromVector` with `numValues` byte-arrays, each of size `valueBytesLength`.
+       */
+      fromVector.setInitialCapacity(numValues);
+      fromVector.allocateNew();
+      for (int i = 0; i < numValues; ++i) {
+        fromVector.setSafe(i, valueBytes, 0 /*start*/, valueBytesLength);
+      }
+      fromVector.setValueCount(numValues);
+      ArrowBuf fromDataBuffer = fromVector.getDataBuffer();
+      assertTrue(numValues * valueBytesLength <= fromDataBuffer.capacity());
+
+      /*
+       * Copy the entries one-by-one from 'fromVector' to 'toVector', but use the setSafe with
+       * ArrowBuf API (instead of setSafe with byte-array).
+       */
+      toVector.setInitialCapacity(numValues);
+      toVector.allocateNew();
+      for (int i = 0; i < numValues; i++) {
+        int start = fromVector.getTotalValueLengthUpToIndex(i);
+        // getTotalValueLengthUpToIndex is consistent
+        // across variable width implementations
+        int end = fromVector.getTotalValueLengthUpToIndex(i + 1);
+        toVector.setSafe(i, isSet, start, end, fromDataBuffer);
+      }
+
+      /*
+       * Since the 'fromVector' and 'toVector' have the same initial capacity, and were populated
+       * with the same varchar elements, the allocations and hence, the final capacity should be
+       * the same.
+       */
+      assertEquals(fromDataBuffer.capacity(), toVector.getDataBuffer().capacity());
+    }
+  }
+
+  @Test
+  public void testSetLastSetUsage() {
+    try (final ViewVarCharVector vector = new ViewVarCharVector("myvector", allocator)) {
+      vector.allocateNew(1024 * 10, 1024);
+
+      setBytes(0, STR1, vector);
+      setBytes(1, STR2, vector);
+      setBytes(2, STR3, vector);
+      setBytes(3, STR4, vector);
+
+      /* Check current lastSet */
+      assertEquals(-1, vector.getLastSet());
+
+      /* Check the vector output */
+      assertArrayEquals(STR1, vector.get(0));
+      assertArrayEquals(STR2, vector.get(1));
+      assertArrayEquals(STR3, vector.get(2));
+      assertArrayEquals(STR4, vector.get(3));
+
+      /*
+       * If we don't do setLastSet(3) before setValueCount(), then the latter will corrupt
+       * the value vector by filling in all positions [0, valueCount - 1] with empty byte arrays.
+       * Run the test with the next line commented out, and we should see incorrect vector output.
+ */ + vector.setLastSet(3); + vector.setValueCount(20); + + /* Check current lastSet */ + assertEquals(19, vector.getLastSet()); + + /* Check the vector output again */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + + assertEquals(0, vector.getValueLength(4)); + assertEquals(0, vector.getValueLength(5)); + assertEquals(0, vector.getValueLength(6)); + assertEquals(0, vector.getValueLength(7)); + assertEquals(0, vector.getValueLength(8)); + assertEquals(0, vector.getValueLength(9)); + assertEquals(0, vector.getValueLength(10)); + assertEquals(0, vector.getValueLength(11)); + assertEquals(0, vector.getValueLength(12)); + assertEquals(0, vector.getValueLength(13)); + assertEquals(0, vector.getValueLength(14)); + assertEquals(0, vector.getValueLength(15)); + assertEquals(0, vector.getValueLength(16)); + assertEquals(0, vector.getValueLength(17)); + assertEquals(0, vector.getValueLength(18)); + assertEquals(0, vector.getValueLength(19)); + } + } + + @Test + public void testFillEmptiesUsage() { + try (final ViewVarCharVector vector = new ViewVarCharVector("myvector", allocator)) { + vector.allocateNew(1024 * 10, 1024); + + setBytes(0, STR1, vector); + setBytes(1, STR2, vector); + setBytes(2, STR3, vector); + setBytes(3, STR4, vector); + + /* Check current lastSet */ + assertEquals(-1, vector.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + + vector.setLastSet(3); + /* fill empty byte arrays from index [4, 9] */ + vector.fillEmpties(10); + + /* Check current lastSet */ + assertEquals(9, vector.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertEquals(0, vector.getValueLength(4)); + assertEquals(0, vector.getValueLength(5)); + assertEquals(0, vector.getValueLength(6)); + assertEquals(0, vector.getValueLength(7)); + assertEquals(0, vector.getValueLength(8)); + assertEquals(0, vector.getValueLength(9)); + + setBytes(10, STR1, vector); + setBytes(11, STR2, vector); + + vector.setLastSet(11); + /* fill empty byte arrays from index [12, 14] */ + vector.setValueCount(15); + + /* Check current lastSet */ + assertEquals(14, vector.getLastSet()); + + /* Check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR3, vector.get(2)); + assertArrayEquals(STR4, vector.get(3)); + assertEquals(0, vector.getValueLength(4)); + assertEquals(0, vector.getValueLength(5)); + assertEquals(0, vector.getValueLength(6)); + assertEquals(0, vector.getValueLength(7)); + assertEquals(0, vector.getValueLength(8)); + assertEquals(0, vector.getValueLength(9)); + assertArrayEquals(STR1, vector.get(10)); + assertArrayEquals(STR2, vector.get(11)); + assertEquals(0, vector.getValueLength(12)); + assertEquals(0, vector.getValueLength(13)); + assertEquals(0, vector.getValueLength(14)); + } + } + + @Test + public void testGetBufferAddress1() { + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + + setVector(vector, STR1, STR2, STR3, STR4); + vector.setValueCount(15); + + /* check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + 
assertArrayEquals(STR2, vector.get(1));
+      assertArrayEquals(STR3, vector.get(2));
+      assertArrayEquals(STR4, vector.get(3));
+
+      List<ArrowBuf> buffers = vector.getFieldBuffers();
+      long bitAddress = vector.getValidityBufferAddress();
+      long dataAddress = vector.getDataBufferAddress();
+
+      assertEquals(3, buffers.size());
+      assertEquals(bitAddress, buffers.get(0).memoryAddress());
+      assertEquals(dataAddress, buffers.get(1).memoryAddress());
+    }
+  }
+
+  @Test
+  public void testSetInitialCapacityInViews() {
+    try (final ViewVarCharVector vector = new ViewVarCharVector(EMPTY_SCHEMA_PATH, allocator)) {
+
+      /* use the default 16 data bytes on average per element */
+      final int viewSize = BaseVariableWidthViewVector.ELEMENT_SIZE;
+      int defaultCapacity = BaseVariableWidthViewVector.INITIAL_VIEW_VALUE_ALLOCATION / viewSize;
+      vector.setInitialCapacity(defaultCapacity);
+      vector.allocateNew();
+      assertEquals(defaultCapacity, vector.getValueCapacity());
+      assertEquals(CommonUtil.nextPowerOfTwo(defaultCapacity * viewSize), vector.getDataBuffer().capacity());
+
+      double density = 4.0;
+      final int valueCount = 5;
+      vector.setInitialCapacity(valueCount, density);
+      vector.allocateNew();
+      assertEquals(8, vector.getValueCapacity());
+      assertEquals(128, vector.getDataBuffer().capacity());
+      int initialDataBufferSize = (int) (valueCount * density);
+      // making sure a data buffer is allocated
+      vector.set(4, "01234567890123456".getBytes(StandardCharsets.UTF_8));
+      assertEquals(1, vector.dataBuffers.size());
+      ArrowBuf dataBuf = vector.dataBuffers.get(0);
+      try (ArrowBuf tempBuf = vector.allocator.buffer(initialDataBufferSize)) {
+        // replicating a new buffer allocation process when a new buffer is added to the
+        // data buffer when inserting an element with length > 12
+        assertEquals(tempBuf.capacity(), dataBuf.capacity());
+      }
+    }
+  }
+
+  @Test
+  public void testGetPointerVariableWidthViews() {
+    final String[] sampleData = new String[]{
+        "abc", "1234567890123", "def", null, "hello world java", "aaaaa", "world", "2019", null, "0717"};
+
+    try (ViewVarCharVector vec1 = new ViewVarCharVector("vec1", allocator);
+        ViewVarCharVector vec2 = new ViewVarCharVector("vec2", allocator)) {
+
+      vec1.allocateNew((long) sampleData.length * 16, sampleData.length);
+      vec2.allocateNew((long) sampleData.length * 16, sampleData.length);
+
+      for (int i = 0; i < sampleData.length; i++) {
+        String str = sampleData[i];
+        if (str != null) {
+          vec1.set(i, sampleData[i].getBytes(StandardCharsets.UTF_8));
+          vec2.set(i, sampleData[i].getBytes(StandardCharsets.UTF_8));
+        } else {
+          vec1.setNull(i);
+
+          vec2.setNull(i);
+        }
+      }
+
+      ArrowBufPointer ptr1 = new ArrowBufPointer();
+      ArrowBufPointer ptr2 = new ArrowBufPointer();
+
+      for (int i = 0; i < sampleData.length; i++) {
+        vec1.getDataPointer(i, ptr1);
+        vec2.getDataPointer(i, ptr2);
+
+        assertTrue(ptr1.equals(ptr2));
+        assertTrue(ptr2.equals(ptr1));
+      }
+    }
+  }
+
+  @Test
+  public void testGetNullFromVariableWidthViewVector() {
+    try (final ViewVarCharVector varCharViewVector = new ViewVarCharVector("viewvarcharvec", allocator);
+        final ViewVarBinaryVector varBinaryViewVector = new ViewVarBinaryVector("viewvarbinary", allocator)) {
+      varCharViewVector.allocateNew(16, 1);
+      varBinaryViewVector.allocateNew(16, 1);
+
+      varCharViewVector.setNull(0);
+      varBinaryViewVector.setNull(0);
+
+      assertNull(varCharViewVector.get(0));
+      assertNull(varBinaryViewVector.get(0));
+    }
+  }
+
+  @Test
+  public void testVariableWidthViewVectorNullHashCode() {
+    try (ViewVarCharVector viewVarChar = new ViewVarCharVector("view var char vector", allocator)) {
ViewVarCharVector("view var char vector", allocator)) { + viewVarChar.allocateNew(100, 1); + viewVarChar.setValueCount(1); + + viewVarChar.set(0, "abc".getBytes(StandardCharsets.UTF_8)); + viewVarChar.setNull(0); + + assertEquals(0, viewVarChar.hashCode(0)); + } + } + + @Test + public void testUnloadVariableWidthViewVector() { + try (final ViewVarCharVector viewVarCharVector = new ViewVarCharVector("view var char", allocator)) { + viewVarCharVector.allocateNew(16, 2); + viewVarCharVector.setValueCount(2); + viewVarCharVector.set(0, "abcd".getBytes(StandardCharsets.UTF_8)); + + List bufs = viewVarCharVector.getFieldBuffers(); + assertEquals(2, bufs.size()); + + ArrowBuf viewBuf = bufs.get(1); + + assertEquals(32, viewBuf.writerIndex()); + final String longString = "012345678901234"; + viewVarCharVector.set(1, longString.getBytes(StandardCharsets.UTF_8)); + + bufs = viewVarCharVector.getFieldBuffers(); + assertEquals(3, bufs.size()); + + ArrowBuf referenceBuf = bufs.get(2); + assertEquals(longString.length(), referenceBuf.writerIndex()); + } + } + + @Test + public void testUnSupportedOffSet() { + // offset is not a feature required in ViewVarCharVector + assertThrows(UnsupportedOperationException.class, () -> { + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + + setVector(vector, STR1, STR2); + vector.setValueCount(2); + + /* check the vector output */ + assertArrayEquals(STR1, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + + vector.getOffsetBuffer(); + } + }); + } + + private void validateViewBuffer(int index, ViewVarCharVector vector, byte[] expectedData, + int expectedBufId, int expectedOffSet) { + final ArrowBuf viewBuffer = vector.viewBuffer; + int writePosition = index * BaseVariableWidthViewVector.ELEMENT_SIZE; + final int prefixBufWidth = BaseVariableWidthViewVector.PREFIX_WIDTH; + final int lengthBufWidth = BaseVariableWidthViewVector.LENGTH_WIDTH; + int length = viewBuffer.getInt(writePosition); + + // validate length of the view + assertEquals(expectedData.length, length); + + byte[] prefixBytes = new byte[prefixBufWidth]; + viewBuffer.getBytes(writePosition + lengthBufWidth, prefixBytes); + + // validate the prefix + byte[] expectedPrefixBytes = new byte[prefixBufWidth]; + System.arraycopy(expectedData, 0, expectedPrefixBytes, 0, prefixBufWidth); + assertArrayEquals(expectedPrefixBytes, prefixBytes); + + if (length > 12) { + /// validate bufId + int bufId = viewBuffer.getInt(writePosition + lengthBufWidth + prefixBufWidth); + assertEquals(expectedBufId, bufId); + // validate offset + int offset = viewBuffer.getInt(writePosition + + lengthBufWidth + + prefixBufWidth + + BaseVariableWidthViewVector.BUF_INDEX_WIDTH); + assertEquals(expectedOffSet, offset); + } + // validate retrieved data + assertArrayEquals(expectedData, vector.get(index)); + } + + @Test + public void testOverwriteShortFromLongString() { + /*NA: not applicable */ + // Overwriting at the beginning of the buffer. 
+  @Test
+  public void testOverwriteShortFromLongString() {
+    /* NA: not applicable */
+    // Overwriting at the beginning of the buffer.
+    try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+      vector.allocateNew(16, 1);
+      // set short string
+      vector.set(0, STR0);
+      vector.setValueCount(1);
+      assertEquals(0, vector.dataBuffers.size());
+      assertArrayEquals(STR0, vector.get(0));
+
+      validateViewBuffer(0, vector, STR0, /*NA*/-1, /*NA*/-1);
+
+      // set long string
+      vector.set(0, STR3);
+      vector.setValueCount(1);
+      assertEquals(1, vector.dataBuffers.size());
+      assertArrayEquals(STR3, vector.get(0));
+
+      validateViewBuffer(0, vector, STR3, 0, 0);
+    }
+
+    // Overwriting in the middle of the buffer when existing buffers are all shorts.
+    try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+      vector.allocateNew(48, 3);
+      // set short string 1
+      vector.set(0, STR0);
+      // set short string 2
+      vector.set(1, STR5);
+      // set short string 3
+      vector.set(2, STR6);
+      vector.setValueCount(3);
+
+      // overwrite index 1 with a long string
+      vector.set(1, STR7);
+      vector.setValueCount(3);
+
+      validateViewBuffer(0, vector, STR0, /*NA*/-1, /*NA*/-1);
+      validateViewBuffer(1, vector, STR7, 0, 0);
+      validateViewBuffer(2, vector, STR6, /*NA*/-1, /*NA*/-1);
+    }
+
+    // Overwriting in the middle of the buffer with a mix of short and long strings.
+    try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+      vector.allocateNew(80, 5);
+      // set short string 1
+      vector.set(0, STR0);
+      // set long string 1
+      vector.set(1, STR3);
+      // set short string 2
+      vector.set(2, STR5);
+      // set short string 3
+      vector.set(3, STR6);
+      // set long string 2
+      vector.set(4, STR7);
+      vector.setValueCount(5);
+
+      // overwrite index 2 with a long string
+      vector.set(2, STR8);
+      vector.setValueCount(5);
+
+      validateViewBuffer(0, vector, STR0, /*NA*/-1, /*NA*/-1);
+      validateViewBuffer(1, vector, STR3, 0, 0);
+      // Since we did overwrite index 2 with STR8, and as we are using an append-only approach,
+      // it will be appended to the data buffer.
+      // Thus, it will be stored in the dataBuffer in order, i.e. [STR3, STR7, STR8].
+      validateViewBuffer(2, vector, STR8, 0, STR3.length + STR7.length);
+      validateViewBuffer(3, vector, STR6, /*NA*/-1, /*NA*/-1);
+      validateViewBuffer(4, vector, STR7, 0, STR3.length);
+    }
+
+    // Overwriting in the middle of the buffer with a mix of short and long strings.
+    // Here the short string is overwritten with a long string, and its length is larger than
+    // the remaining capacity of the existing data buffer.
+    // This would allocate a new buffer in the data buffers.
+    try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+      vector.allocateNew(80, 5);
+      // set short string 1
+      vector.set(0, STR0);
+      // set long string 1
+      vector.set(1, STR3);
+      // set short string 2
+      vector.set(2, STR5);
+      // set short string 3
+      vector.set(3, STR6);
+      // set long string 2
+      vector.set(4, STR7);
+
+      vector.setValueCount(5);
+
+      // overwrite index 2 with a long string
+      String longString = generateRandomString(128);
+      byte[] longStringBytes = longString.getBytes(StandardCharsets.UTF_8);
+      // since the append-only approach is used and the remaining capacity
+      // is not enough to store the new string, a new buffer will be allocated.
+      final ArrowBuf currentDataBuf = vector.dataBuffers.get(0);
+      final long remainingCapacity = currentDataBuf.capacity() - currentDataBuf.writerIndex();
+      assertTrue(remainingCapacity < longStringBytes.length);
+      vector.set(2, longStringBytes);
+      vector.setValueCount(5);
+
+      validateViewBuffer(0, vector, STR0, /*NA*/-1, /*NA*/-1);
+      validateViewBuffer(1, vector, STR3, 0, 0);
+      // overwritten long string will be stored in the new data buffer.
+      validateViewBuffer(2, vector, longStringBytes, 1, 0);
+      validateViewBuffer(3, vector, STR6, /*NA*/-1, /*NA*/-1);
+      validateViewBuffer(4, vector, STR7, 0, STR3.length);
+    }
+  }
+
+  @Test
+  public void testOverwriteLongFromShortString() {
+    // Overwriting at the beginning of the buffer.
+    try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+      vector.allocateNew(16, 1);
+      // set long string
+      vector.set(0, STR3);
+      vector.setValueCount(1);
+      // overwrite it with a short string
+      vector.set(0, STR0);
+      vector.setValueCount(1);
+
+      validateViewBuffer(0, vector, STR0, /*NA*/-1, /*NA*/-1);
+    }
+
+    // Overwriting in the middle of the buffer when existing buffers are all longs.
+    try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+      vector.allocateNew(48, 3);
+      // set long string 1
+      vector.set(0, STR3);
+      // set long string 2
+      vector.set(1, STR8);
+      // set long string 3
+      vector.set(2, STR7);
+      vector.setValueCount(3);
+
+      // overwrite index 1 with a short string
+      vector.set(1, STR6);
+      vector.setValueCount(3);
+
+      validateViewBuffer(0, vector, STR3, 0, 0);
+      validateViewBuffer(1, vector, STR6, /*NA*/-1, /*NA*/-1);
+      // since the append-only approach is used,
+      // STR8 will still be in the first data buffer in dataBuffers.
+      validateViewBuffer(2, vector, STR7, 0, STR3.length + STR8.length);
+    }
+
+    // Overwriting in the middle of the buffer with a mix of short and long strings.
+    try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+      vector.allocateNew(80, 5);
+      // set long string 1
+      vector.set(0, STR3);
+      // set short string 1
+      vector.set(1, STR5);
+      // set long string 2
+      vector.set(2, STR7);
+      // set long string 3
+      vector.set(3, STR8);
+      // set short string 2
+      vector.set(4, STR6);
+      vector.setValueCount(5);
+
+      // overwrite index 2 with a short string
+      vector.set(2, STR0);
+      vector.setValueCount(5);
+
+      validateViewBuffer(0, vector, STR3, 0, 0);
+      validateViewBuffer(1, vector, STR5, /*NA*/-1, /*NA*/-1);
+      validateViewBuffer(2, vector, STR0, /*NA*/-1, /*NA*/-1);
+      // since the append-only approach is used,
+      // STR7 will still be in the first data buffer in dataBuffers.
+      validateViewBuffer(3, vector, STR8, 0, STR3.length + STR7.length);
+      validateViewBuffer(4, vector, STR6, /*NA*/-1, /*NA*/-1);
+    }
+  }
+
+  @Test
+  public void testOverwriteLongFromAShorterLongString() {
+    // Overwriting at the beginning of the buffer.
+    try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+      vector.allocateNew(16, 1);
+      // set long string
+      vector.set(0, STR7);
+      vector.setValueCount(1);
+      // overwrite with a shorter long string; since the append-only approach is used and the remaining capacity
+      // is not enough to store the new string, a new buffer will be allocated.
+      final ArrowBuf currentDataBuf = vector.dataBuffers.get(0);
+      final long remainingCapacity = currentDataBuf.capacity() - currentDataBuf.writerIndex();
+      assertTrue(remainingCapacity < STR3.length);
+      // set shorter long string
+      vector.set(0, STR3);
+      vector.setValueCount(1);
+
+      validateViewBuffer(0, vector, STR3, 1, 0);
+    }
+
+    // Overwriting in the middle of the buffer when existing buffers are all longs.
+    try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+      // extra memory is allocated
+      vector.allocateNew(128, 3);
+      // set long string 1
+      vector.set(0, STR3);
+      // set long string 2
+      vector.set(1, STR8);
+      // set long string 3
+      vector.set(2, STR7);
+      vector.setValueCount(3);
+
+      // overwrite index 1 with a shorter long string.
+      // Since the append-only approach is used and the remaining capacity
+      // is enough, the new value is stored in the same data buffer.
+      final ArrowBuf currentDataBuf = vector.dataBuffers.get(0);
+      final long remainingCapacity = currentDataBuf.capacity() - currentDataBuf.writerIndex();
+      assertTrue(remainingCapacity > STR2.length);
+      vector.set(1, STR2);
+      vector.setValueCount(3);
+
+      validateViewBuffer(0, vector, STR3, 0, 0);
+      // since the append-only approach is used,
+      // STR8 will still be in the first data buffer in dataBuffers.
+      validateViewBuffer(1, vector, STR2, 0, STR3.length + STR8.length + STR7.length);
+      validateViewBuffer(2, vector, STR7, 0, STR3.length + STR8.length);
+    }
+
+    // Overwriting in the middle of the buffer with a mix of short and long strings.
+    try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+      vector.allocateNew(128, 5);
+      // set long string 1
+      vector.set(0, STR3);
+      // set short string 1
+      vector.set(1, STR5);
+      // set long string 2
+      vector.set(2, STR7);
+      // set long string 3
+      vector.set(3, STR8);
+      // set short string 2
+      vector.set(4, STR6);
+      vector.setValueCount(5);
+
+      // overwrite index 2 with a shorter long string.
+      // Since the append-only approach is used and the remaining capacity
+      // is enough, the new value is stored in the same data buffer.
+      final ArrowBuf currentDataBuf = vector.dataBuffers.get(0);
+      final long remainingCapacity = currentDataBuf.capacity() - currentDataBuf.writerIndex();
+      assertTrue(remainingCapacity > STR2.length);
+      vector.set(2, STR2);
+      vector.setValueCount(5);
+
+      validateViewBuffer(0, vector, STR3, 0, 0);
+      validateViewBuffer(1, vector, STR5, /*NA*/-1, /*NA*/-1);
+      // since the append-only approach is used,
+      // STR7 will still be in the first data buffer in dataBuffers.
+      validateViewBuffer(2, vector, STR2, 0, STR3.length +
+          STR7.length + STR8.length);
+      validateViewBuffer(3, vector, STR8, 0, STR3.length + STR7.length);
+      validateViewBuffer(4, vector, STR6, /*NA*/-1, /*NA*/-1);
+    }
+  }
+
+  @Test
+  public void testOverwriteLongFromALongerLongString() {
+    // Overwriting at the beginning of the buffer.
+    try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+      vector.allocateNew(16, 1);
+      // set long string
+      vector.set(0, STR3);
+      vector.setValueCount(1);
+      // overwrite with a longer long string; since the append-only approach is used and the remaining capacity
+      // is not enough to store the new string, a new buffer will be allocated.
+ final ArrowBuf currentDataBuf = vector.dataBuffers.get(0);
+ final long remainingCapacity = currentDataBuf.capacity() - currentDataBuf.writerIndex();
+ assertTrue(remainingCapacity < STR7.length);
+ // set longer long string
+ vector.set(0, STR7);
+ vector.setValueCount(1);
+
+ validateViewBuffer(0, vector, STR7, 1, 0);
+ }
+
+ // Overwriting in the middle of the buffer when existing buffers are all longs.
+ try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+ // extra memory is allocated
+ vector.allocateNew(48, 3);
+ // set long string 1
+ vector.set(0, STR3);
+ // set long string 2
+ vector.set(1, STR8);
+ // set long string 3
+ vector.set(2, STR7);
+ vector.setValueCount(3);
+
+ // overwrite index 1 with a longer long string.
+ // The remaining capacity is not enough to store it in the same data buffer,
+ // so a new buffer is added to dataBuffers.
+ final ArrowBuf currentDataBuf = vector.dataBuffers.get(0);
+ final long remainingCapacity = currentDataBuf.capacity() - currentDataBuf.writerIndex();
+ String longerString = generateRandomString(35);
+ byte[] longerStringBytes = longerString.getBytes(StandardCharsets.UTF_8);
+ assertTrue(remainingCapacity < longerStringBytes.length);
+
+ vector.set(1, longerStringBytes);
+ vector.setValueCount(3);
+
+ validateViewBuffer(0, vector, STR3, 0, 0);
+ validateViewBuffer(1, vector, longerStringBytes, 1, 0);
+ // since the append-only approach is used,
+ // STR8 will still be in the first data buffer in dataBuffers.
+ validateViewBuffer(2, vector, STR7, 0, STR3.length + STR8.length);
+ }
+
+ // Overwriting in the middle of the buffer with a mix of short and long strings.
+ try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) {
+ vector.allocateNew(128, 5);
+ // set long string 1
+ vector.set(0, STR3);
+ // set short string 1
+ vector.set(1, STR5);
+ // set long string 2
+ vector.set(2, STR7);
+ // set long string 3
+ vector.set(3, STR2);
+ // set short string 2
+ vector.set(4, STR6);
+ vector.setValueCount(5);
+
+ // overwrite index 2 with a longer long string;
+ // the remaining capacity is enough to store it in the same data buffer.
+ final ArrowBuf currentDataBuf = vector.dataBuffers.get(0);
+ final long remainingCapacity = currentDataBuf.capacity() - currentDataBuf.writerIndex();
+ String longerString = generateRandomString(24);
+ byte[] longerStringBytes = longerString.getBytes(StandardCharsets.UTF_8);
+ assertTrue(remainingCapacity > longerStringBytes.length);
+
+ vector.set(2, longerStringBytes);
+ vector.setValueCount(5);
+
+ validateViewBuffer(0, vector, STR3, 0, 0);
+ validateViewBuffer(1, vector, STR5, /*NA*/-1, /*NA*/-1);
+ // since the append-only approach is used,
+ // STR7 will still be in the first data buffer in dataBuffers.
+ validateViewBuffer(2, vector, longerStringBytes, 0, STR3.length + STR7.length + STR2.length);
+ validateViewBuffer(3, vector, STR2, 0, STR3.length + STR7.length);
+ validateViewBuffer(4, vector, STR6, /*NA*/-1, /*NA*/-1);
+ }
+ }
+
+ @Test
+ public void testSafeOverwriteShortFromLongString() {
+ /*NA: not applicable */
+ // Overwriting at the beginning of the buffer. 
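+ // (Unlike set(), setSafe() ensures capacity first, growing the validity and
+ // view buffers as needed, so these tests assert on the stored values rather
+ // than on exact buffer indices and offsets.)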
+ try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 1); + // set short string + vector.setSafe(0, STR0); + vector.setValueCount(1); + assertEquals(0, vector.dataBuffers.size()); + assertArrayEquals(STR0, vector.get(0)); + + // set long string + vector.setSafe(0, STR3); + vector.setValueCount(1); + assertEquals(1, vector.dataBuffers.size()); + assertArrayEquals(STR3, vector.get(0)); + + } + + // Overwriting in the middle of the buffer when existing buffers are all shorts. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 3); + // set short string 1 + vector.setSafe(0, STR0); + // set short string 2 + vector.setSafe(1, STR5); + // set short string 3 + vector.setSafe(2, STR6); + vector.setValueCount(3); + + // overwrite index 1 with a long string + vector.setSafe(1, STR7); + vector.setValueCount(3); + + assertArrayEquals(STR0, vector.get(0)); + assertArrayEquals(STR7, vector.get(1)); + assertArrayEquals(STR6, vector.get(2)); + } + + // Overwriting in the middle of the buffer with a mix of short and long strings. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 5); + // set short string 1 + vector.setSafe(0, STR0); + // set long string 1 + vector.setSafe(1, STR3); + // set short string 2 + vector.setSafe(2, STR5); + // set short string 3 + vector.setSafe(3, STR6); + // set long string 2 + vector.setSafe(4, STR7); + vector.setValueCount(5); + + // overwrite index 2 with a long string + vector.setSafe(2, STR8); + vector.setValueCount(5); + + assertArrayEquals(STR0, vector.get(0)); + assertArrayEquals(STR3, vector.get(1)); + assertArrayEquals(STR8, vector.get(2)); + assertArrayEquals(STR6, vector.get(3)); + assertArrayEquals(STR7, vector.get(4)); + } + + // Overwriting in the middle of the buffer with a mix of short and long strings. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 5); + // set short string 1 + vector.setSafe(0, STR0); + // set long string 1 + vector.setSafe(1, STR3); + // set short string 2 + vector.setSafe(2, STR5); + // set short string 3 + vector.setSafe(3, STR6); + // set long string 2 + vector.setSafe(4, STR7); + + vector.setValueCount(5); + + // overwrite index 2 with a long string + String longString = generateRandomString(128); + byte[] longStringBytes = longString.getBytes(StandardCharsets.UTF_8); + + vector.setSafe(2, longStringBytes); + vector.setValueCount(5); + + assertArrayEquals(STR0, vector.get(0)); + assertArrayEquals(STR3, vector.get(1)); + assertArrayEquals(longStringBytes, vector.get(2)); + assertArrayEquals(STR6, vector.get(3)); + assertArrayEquals(STR7, vector.get(4)); + } + } + + @Test + public void testSafeOverwriteLongFromShortString() { + // Overwriting at the beginning of the buffer. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 1); + // set short string + vector.setSafe(0, STR3); + vector.setValueCount(1); + // set long string + vector.setSafe(0, STR0); + vector.setValueCount(1); + + assertArrayEquals(STR0, vector.get(0)); + } + + // Overwriting in the middle of the buffer when existing buffers are all longs. 
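+ // (allocateNew(16, 3) under-allocates for three long strings on purpose;
+ // setSafe() is expected to grow the data buffers as the values are appended.)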
+ try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 3); + // set long string 1 + vector.setSafe(0, STR3); + // set long string 2 + vector.setSafe(1, STR8); + // set long string 3 + vector.setSafe(2, STR7); + vector.setValueCount(3); + + // overwrite index 1 with a short string + vector.setSafe(1, STR6); + vector.setValueCount(3); + + assertArrayEquals(STR3, vector.get(0)); + assertArrayEquals(STR6, vector.get(1)); + assertArrayEquals(STR7, vector.get(2)); + } + + // Overwriting in the middle of the buffer with a mix of short and long strings. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 5); + // set long string 1 + vector.setSafe(0, STR3); + // set short string 1 + vector.setSafe(1, STR5); + // set long string 2 + vector.setSafe(2, STR7); + // set long string 3 + vector.setSafe(3, STR8); + // set short string 2 + vector.setSafe(4, STR6); + vector.setValueCount(5); + + // overwrite index 2 with a short string + vector.setSafe(2, STR0); + vector.setValueCount(5); + + assertArrayEquals(STR3, vector.get(0)); + assertArrayEquals(STR5, vector.get(1)); + assertArrayEquals(STR0, vector.get(2)); + assertArrayEquals(STR8, vector.get(3)); + assertArrayEquals(STR6, vector.get(4)); + } + } + + @Test + public void testSafeOverwriteLongFromAShorterLongString() { + // Overwriting at the beginning of the buffer. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 1); + // set long string + vector.setSafe(0, STR7); + vector.setValueCount(1); + // set shorter long string + vector.setSafe(0, STR3); + vector.setValueCount(1); + + assertArrayEquals(STR3, vector.get(0)); + } + + // Overwriting in the middle of the buffer when existing buffers are all longs. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + // extra memory is allocated + vector.allocateNew(16, 3); + // set long string 1 + vector.setSafe(0, STR3); + // set long string 2 + vector.setSafe(1, STR8); + // set long string 3 + vector.setSafe(2, STR7); + vector.setValueCount(3); + + // overwrite index 1 with a shorter long string + vector.setSafe(1, STR2); + vector.setValueCount(3); + + assertArrayEquals(STR3, vector.get(0)); + assertArrayEquals(STR2, vector.get(1)); + assertArrayEquals(STR7, vector.get(2)); + } + + // Overwriting in the middle of the buffer with a mix of short and long strings. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 5); + // set long string 1 + vector.setSafe(0, STR3); + // set short string 1 + vector.setSafe(1, STR5); + // set long string 2 + vector.setSafe(2, STR7); + // set long string 3 + vector.setSafe(3, STR8); + // set short string 2 + vector.setSafe(4, STR6); + vector.setValueCount(5); + + // overwrite index 2 with a shorter long string + vector.setSafe(2, STR2); + vector.setValueCount(5); + + assertArrayEquals(STR3, vector.get(0)); + assertArrayEquals(STR5, vector.get(1)); + assertArrayEquals(STR2, vector.get(2)); + assertArrayEquals(STR8, vector.get(3)); + assertArrayEquals(STR6, vector.get(4)); + } + } + + @Test + public void testSafeOverwriteLongFromALongerLongString() { + // Overwriting at the beginning of the buffer. 
+ try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 1); + // set long string + vector.setSafe(0, STR3); + vector.setValueCount(1); + // set longer long string + vector.setSafe(0, STR7); + vector.setValueCount(1); + + assertArrayEquals(STR7, vector.get(0)); + } + + // Overwriting in the middle of the buffer when existing buffers are all longs. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + // extra memory is allocated + vector.allocateNew(16, 3); + // set long string 1 + vector.setSafe(0, STR3); + // set long string 2 + vector.setSafe(1, STR8); + // set long string 3 + vector.setSafe(2, STR7); + vector.setValueCount(3); + + String longerString = generateRandomString(35); + byte[] longerStringBytes = longerString.getBytes(StandardCharsets.UTF_8); + + vector.setSafe(1, longerStringBytes); + vector.setValueCount(3); + + assertArrayEquals(STR3, vector.get(0)); + assertArrayEquals(longerStringBytes, vector.get(1)); + assertArrayEquals(STR7, vector.get(2)); + } + + // Overwriting in the middle of the buffer with a mix of short and long strings. + try (final ViewVarCharVector vector = new ViewVarCharVector("myviewvector", allocator)) { + vector.allocateNew(16, 5); + // set long string 1 + vector.setSafe(0, STR3); + // set short string 1 + vector.setSafe(1, STR5); + // set long string 2 + vector.setSafe(2, STR7); + // set long string 3 + vector.setSafe(3, STR2); + // set short string 2 + vector.setSafe(4, STR6); + vector.setValueCount(5); + + String longerString = generateRandomString(24); + byte[] longerStringBytes = longerString.getBytes(StandardCharsets.UTF_8); + + vector.setSafe(2, longerStringBytes); + vector.setValueCount(5); + + assertArrayEquals(STR3, vector.get(0)); + assertArrayEquals(STR5, vector.get(1)); + assertArrayEquals(longerStringBytes, vector.get(2)); + assertArrayEquals(STR2, vector.get(3)); + assertArrayEquals(STR6, vector.get(4)); + + } + } + + private String generateRandomString(int length) { + Random random = new Random(); + StringBuilder sb = new StringBuilder(length); + for (int i = 0; i < length; i++) { + sb.append(random.nextInt(10)); // 0-9 + } + return sb.toString(); + } +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java index 9bfcb3c635d86..45e6e630792a9 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java @@ -61,6 +61,7 @@ import org.apache.arrow.vector.UInt8Vector; import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VariableWidthFieldVector; import org.apache.arrow.vector.complex.BaseRepeatedValueVector; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.LargeListVector; @@ -586,6 +587,17 @@ public static void setVector(VarCharVector vector, byte[]... values) { vector.setValueCount(length); } + public static void setVector(VariableWidthFieldVector vector, byte[]... values) { + final int length = values.length; + vector.allocateNewSafe(); + for (int i = 0; i < length; i++) { + if (values[i] != null) { + vector.set(i, values[i]); + } + } + vector.setValueCount(length); + } + /** * Populate values for LargeVarCharVector. 
*/ From ff679790e7a93969a13aa6842e2f70e7d6a208e5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:05:04 +0900 Subject: [PATCH 033/106] MINOR: [Java] Bump org.cyclonedx:cyclonedx-maven-plugin from 2.7.11 to 2.8.0 in /java (#41210) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [org.cyclonedx:cyclonedx-maven-plugin](https://github.com/CycloneDX/cyclonedx-maven-plugin) from 2.7.11 to 2.8.0.

Release notes

Sourced from org.cyclonedx:cyclonedx-maven-plugin's releases.

2.8.0: new features and improvements, bug fixes, and dependency updates.
Commits
  • 90e3817 [maven-release-plugin] prepare release cyclonedx-maven-plugin-2.8.0
  • eed838e convert external reference type by value instead of default CONSTANT_NAME
  • 3fd83bf Bump org.apache.maven.plugins:maven-compiler-plugin
  • 343c62d check if configured schemaVersion is supported
  • d001542 distribution-intake external reference is more accurate
  • fa5541d Bump actions/checkout from 4.1.1 to 4.1.2
  • a43cd05 Bump org.apache.commons:commons-compress
  • 31ff1f4 Bump org.junit:junit-bom from 5.10.1 to 5.10.2
  • ce8a6e7 Bump release-drafter/release-drafter from 5 to 6
  • 16dcb5b Bump commons-codec:commons-codec from 1.16.0 to 1.16.1
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.cyclonedx:cyclonedx-maven-plugin&package-manager=maven&previous-version=2.7.11&new-version=2.8.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:
- `@ dependabot rebase` will rebase this PR
- `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@ dependabot merge` will merge this PR after your CI passes on it
- `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@ dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@ dependabot reopen` will reopen this PR if it is closed
- `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
- `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: David Li --- java/maven/pom.xml | 2 +- java/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/maven/pom.xml b/java/maven/pom.xml index 4314192eda73b..558532012a1ae 100644 --- a/java/maven/pom.xml +++ b/java/maven/pom.xml @@ -65,7 +65,7 @@ org.cyclonedx cyclonedx-maven-plugin - 2.7.11 + 2.8.0 diff --git a/java/pom.xml b/java/pom.xml index 39fd1e00b64e9..f3639858d7818 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -519,7 +519,7 @@ org.cyclonedx cyclonedx-maven-plugin - 2.7.11 + 2.8.0 org.apache.drill.tools From 6eb0b37386ecbfc4108e914d6dadb8b049a6f549 Mon Sep 17 00:00:00 2001 From: Jonathan Keane Date: Mon, 29 Apr 2024 08:39:07 -0500 Subject: [PATCH 034/106] GH-41402: [CI][R] Update our backwards compatibility CI and any other R 4.4 cleanups (#41403) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Keep up with the state of the world and ensure we are maintaining backwards compatibility. Resolves #41402 ### What changes are included in this PR? * Bump to 4.4 as the release * Remove old 3.6 jobs now that we no longer support that; clean up code where we hardcode things for 3.6 and below * Move many of our CI jobs to [rhub's new containers](https://github.com/r-hub/containers). We were accidentally running stale R devel (from December 2023) because the other rhub images stopped being updated. (One exception to be done as a follow-on: #41416) * Resolve a number of extended test failures With this PR R extended tests should be all green with the exceptions of: * Two sanitizer jobs (test-fedora-r-clang-sanitizer, test-ubuntu-r-sanitizer) — which are being investigated / fixed in #41421 * Valgrind — I'm running one last run with a new suppression file. * Binary jobs — these work but fail at upload, see https://github.com/apache/arrow/pull/41403#discussion_r1582245207 * Windows R Release — failing on main, #41398 ### Are these changes tested? By definition. ### Are there any user-facing changes? No. 
* GitHub Issue: #41402 Lead-authored-by: Jonathan Keane Co-authored-by: Jacob Wujciak-Jens Signed-off-by: Jonathan Keane --- .env | 6 +- .github/workflows/r.yml | 4 +- ci/docker/linux-apt-docs.dockerfile | 2 +- ci/docker/linux-apt-lint.dockerfile | 2 +- ci/docker/linux-apt-r.dockerfile | 2 +- ci/etc/valgrind-cran.supp | 20 ++++++- ci/scripts/r_sanitize.sh | 4 +- ci/scripts/r_test.sh | 7 ++- ci/scripts/r_valgrind.sh | 2 +- ...github.linux.arrow.version.back.compat.yml | 2 + dev/tasks/r/github.linux.offline.build.yml | 2 +- dev/tasks/r/github.linux.versions.yml | 2 +- dev/tasks/r/github.packages.yml | 10 ++-- dev/tasks/tasks.yml | 12 ++-- docker-compose.yml | 5 +- r/DESCRIPTION | 2 +- r/R/dplyr-funcs-type.R | 2 +- r/R/util.R | 14 ----- r/tests/testthat/test-Array.R | 5 -- r/tests/testthat/test-RecordBatch.R | 16 ++---- r/tests/testthat/test-Table.R | 4 -- r/tests/testthat/test-altrep.R | 7 ++- r/tests/testthat/test-chunked-array.R | 5 -- r/tests/testthat/test-dplyr-collapse.R | 10 ---- r/tests/testthat/test-dplyr-funcs-datetime.R | 32 ++++++----- r/tests/testthat/test-dplyr-funcs-type.R | 3 +- r/tests/testthat/test-dplyr-glimpse.R | 5 -- r/tests/testthat/test-scalar.R | 4 -- r/tools/test-nixlibs.R | 7 ++- r/vignettes/developers/docker.Rmd | 50 ++++++++--------- r/vignettes/install.Rmd | 55 +++++++++---------- 31 files changed, 139 insertions(+), 164 deletions(-) diff --git a/.env b/.env index d9f875a4d454e..ab2e4b4fbe7fb 100644 --- a/.env +++ b/.env @@ -71,12 +71,12 @@ NUMBA=latest NUMPY=latest PANDAS=latest PYTHON=3.8 -R=4.2 +R=4.4 SPARK=master TURBODBC=latest -# These correspond to images on Docker Hub that contain R, e.g. rhub/ubuntu-gcc-release:latest -R_IMAGE=ubuntu-gcc-release +# These correspond to images on Docker Hub that contain R, e.g. rhub/ubuntu-release:latest +R_IMAGE=ubuntu-release R_ORG=rhub R_TAG=latest diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 05c85fa6dc2c2..8228aaad7ce37 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -121,7 +121,7 @@ jobs: strategy: fail-fast: false matrix: - r: ["4.3"] + r: ["4.4"] ubuntu: [20.04] force-tests: ["true"] env: @@ -192,7 +192,7 @@ jobs: fail-fast: false matrix: config: - - { org: "rhub", image: "debian-gcc-devel", tag: "latest", devtoolset: "" } + - { org: "rhub", image: "ubuntu-gcc12", tag: "latest", devtoolset: "" } env: R_ORG: ${{ matrix.config.org }} R_IMAGE: ${{ matrix.config.image }} diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile index 76b5ae6f14363..ec424b4e6eaa0 100644 --- a/ci/docker/linux-apt-docs.dockerfile +++ b/ci/docker/linux-apt-docs.dockerfile @@ -18,7 +18,7 @@ ARG base FROM ${base} -ARG r=4.2 +ARG r=4.4 ARG jdk=8 # See R install instructions at https://cloud.r-project.org/bin/linux/ubuntu/ diff --git a/ci/docker/linux-apt-lint.dockerfile b/ci/docker/linux-apt-lint.dockerfile index 2b94a48871847..9ec80440a3c21 100644 --- a/ci/docker/linux-apt-lint.dockerfile +++ b/ci/docker/linux-apt-lint.dockerfile @@ -40,7 +40,7 @@ RUN apt-get update && \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* -ARG r=4.2 +ARG r=4.4 RUN wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | \ tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc && \ # NOTE: Only R >= 4.0 is available in this repo diff --git a/ci/docker/linux-apt-r.dockerfile b/ci/docker/linux-apt-r.dockerfile index d93732abb0032..a68354e3abf8d 100644 --- a/ci/docker/linux-apt-r.dockerfile +++ b/ci/docker/linux-apt-r.dockerfile @@ -35,7 +35,7 @@ ENV LANG=C.UTF-8 # Build R # 
[1] https://www.digitalocean.com/community/tutorials/how-to-install-r-on-ubuntu-18-04 # [2] https://linuxize.com/post/how-to-install-r-on-ubuntu-18-04/#installing-r-packages-from-cran -ARG r=3.6 +ARG r=4.4 RUN apt-get update -y && \ apt-get install -y \ dirmngr \ diff --git a/ci/etc/valgrind-cran.supp b/ci/etc/valgrind-cran.supp index 4d29220260823..e93c2a3465f79 100644 --- a/ci/etc/valgrind-cran.supp +++ b/ci/etc/valgrind-cran.supp @@ -16,7 +16,7 @@ # under the License. { - # `testthat::skip()`s cause a valgrind error that does not show up on CRAN. + # `testthat::skip()`s cause a valgrind error that does not show up on CRAN. Memcheck:Cond fun:gregexpr_Regexc @@ -32,3 +32,21 @@ fun:getvar fun:bcEval } +{ + # This also doesn't seem to cause issues on CRAN, so suppress it. + + Memcheck:Leak + match-leak-kinds: possible + fun:malloc + fun:libdeflate_alloc_compressor + fun:do_memCompress + fun:bcEval_loop + fun:bcEval + fun:Rf_eval + fun:R_execClosure + fun:applyClosure_core + fun:Rf_applyClosure + fun:Rf_eval + fun:do_set + fun:Rf_eval +} diff --git a/ci/scripts/r_sanitize.sh b/ci/scripts/r_sanitize.sh index 600ee0fa2cbe5..f7ed07f0c864b 100755 --- a/ci/scripts/r_sanitize.sh +++ b/ci/scripts/r_sanitize.sh @@ -49,7 +49,7 @@ export UBSAN_OPTIONS="print_stacktrace=1,suppressions=/arrow/r/tools/ubsan.supp" # run tests pushd tests -${R_BIN} < testthat.R > testthat.out 2>&1 || { cat testthat.out; exit 1; } +${R_BIN} --no-save < testthat.R > testthat.out 2>&1 || { cat testthat.out; exit 1; } cat testthat.out if grep -q "runtime error" testthat.out; then @@ -58,7 +58,7 @@ fi # run examples popd -${R_BIN} -e 'library(arrow); testthat::test_examples(".")' >> examples.out 2>&1 || { cat examples.out; exit 1; } +${R_BIN} --no-save -e 'library(arrow); testthat::test_examples(".")' >> examples.out 2>&1 || { cat examples.out; exit 1; } cat examples.out if grep -q "runtime error" examples.out; then diff --git a/ci/scripts/r_test.sh b/ci/scripts/r_test.sh index 72078ab3c06c2..95a49ee83a79b 100755 --- a/ci/scripts/r_test.sh +++ b/ci/scripts/r_test.sh @@ -46,7 +46,12 @@ if [ "$ARROW_USE_PKG_CONFIG" != "false" ]; then export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} export R_LD_LIBRARY_PATH=${LD_LIBRARY_PATH} fi -export _R_CHECK_COMPILATION_FLAGS_KNOWN_=${ARROW_R_CXXFLAGS} + +export _R_CHECK_COMPILATION_FLAGS_KNOWN_="${_R_CHECK_COMPILATION_FLAGS_KNOWN_} ${ARROW_R_CXXFLAGS}" +# These should generally be picked up, but are slightly wrong in rhub's containers it appears +# https://github.com/r-hub/containers/pull/63 +export _R_CHECK_COMPILATION_FLAGS_KNOWN_="${_R_CHECK_COMPILATION_FLAGS_KNOWN_} -Wno-parentheses -Werror=format-security -Wp,-D_FORTIFY_SOURCE=3" + if [ "$ARROW_R_DEV" = "TRUE" ]; then # These are sometimes used in the Arrow C++ build and are not a problem export _R_CHECK_COMPILATION_FLAGS_KNOWN_="${_R_CHECK_COMPILATION_FLAGS_KNOWN_} -Wno-attributes -msse4.2 -Wno-noexcept-type -Wno-subobject-linkage" diff --git a/ci/scripts/r_valgrind.sh b/ci/scripts/r_valgrind.sh index a14cb803ca898..0e40d792111c4 100755 --- a/ci/scripts/r_valgrind.sh +++ b/ci/scripts/r_valgrind.sh @@ -33,7 +33,7 @@ ${R_BIN} CMD INSTALL ${INSTALL_ARGS} arrow*.tar.gz pushd tests # to generate suppression files run: -# ${R_BIN} --vanilla -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --gen-suppressions=all --log-file=memcheck.log" -f testthat.supp +# ${R_BIN} --vanilla -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --gen-suppressions=all --log-file=memcheck.log" -f testthat.R ${R_BIN} 
--vanilla -d "valgrind --tool=memcheck --leak-check=full --track-origins=yes --suppressions=/${1}/ci/etc/valgrind-cran.supp" -f testthat.R |& tee testthat.out # valgrind --error-exitcode=1 should return an erroring exit code that we can catch, diff --git a/dev/tasks/r/github.linux.arrow.version.back.compat.yml b/dev/tasks/r/github.linux.arrow.version.back.compat.yml index 804f0d2127059..086705dbb9cf4 100644 --- a/dev/tasks/r/github.linux.arrow.version.back.compat.yml +++ b/dev/tasks/r/github.linux.arrow.version.back.compat.yml @@ -73,6 +73,8 @@ jobs: config: # We use the R version that was released at the time of the arrow release in order # to make sure we can download binaries from RSPM. + - { old_arrow_version: '14.0.2.1', r: '4.3' } + - { old_arrow_version: '13.0.0.1', r: '4.3' } - { old_arrow_version: '12.0.1.1', r: '4.3' } - { old_arrow_version: '11.0.0.3', r: '4.2' } - { old_arrow_version: '10.0.1', r: '4.2' } diff --git a/dev/tasks/r/github.linux.offline.build.yml b/dev/tasks/r/github.linux.offline.build.yml index 7a747ac480084..9ac0ebc40835e 100644 --- a/dev/tasks/r/github.linux.offline.build.yml +++ b/dev/tasks/r/github.linux.offline.build.yml @@ -84,7 +84,7 @@ jobs: {{ macros.github_set_sccache_envvars()|indent(8)}} run: | cd arrow/r - R CMD INSTALL --install-tests --no-test-load --no-docs --no-help --no-byte-compile arrow_with_deps.tar.gz + R CMD INSTALL --install-tests --no-test-load --no-byte-compile arrow_with_deps.tar.gz - name: Run the tests run: R -e 'if(tools::testInstalledPackage("arrow") != 0L) stop("There was a test failure.")' - name: Dump test logs diff --git a/dev/tasks/r/github.linux.versions.yml b/dev/tasks/r/github.linux.versions.yml index 48093e9fd5b32..753efe61d048e 100644 --- a/dev/tasks/r/github.linux.versions.yml +++ b/dev/tasks/r/github.linux.versions.yml @@ -30,9 +30,9 @@ jobs: r_version: # We test devel, release, and oldrel in regular CI. # This is for older versions - - "3.6" - "4.0" - "4.1" + - "4.2" env: R_ORG: "rstudio" R_IMAGE: "r-base" diff --git a/dev/tasks/r/github.packages.yml b/dev/tasks/r/github.packages.yml index 2979f57bb6146..9ca7e59a957de 100644 --- a/dev/tasks/r/github.packages.yml +++ b/dev/tasks/r/github.packages.yml @@ -227,7 +227,7 @@ jobs: working-directory: 'arrow' extra-packages: cpp11 - name: Set CRAN like openssl - if: contains(matrix.platform.runs_on, 'arm64') + if: contains(matrix.platform.name, 'arm64') run: | # The arm64 runners contain openssl 1.1.1t in this path that is always included first so we need to override the # default setting of the brew --prefix as root dir to avoid version conflicts. @@ -300,16 +300,14 @@ jobs: # an OS that is not in the allowlist, so we have to opt-in to use the # binary. Other env vars used in r_docker_configure.sh can be added # here (like devtoolset) and wired up in the later steps. 
- - {image: "rhub/debian-clang-devel", libarrow_binary: "TRUE"} + - {image: "rhub/ubuntu-clang", libarrow_binary: "TRUE"} # fedora-clang-devel cannot use binaries bc of libc++ (uncomment to see the error) # - {image: "rhub/fedora-clang-devel", libarrow_binary: "TRUE"} - - {image: "rhub/ubuntu-gcc-release"} # currently ubuntu-20.04 (focal) - - {image: "rocker/r-ubuntu:22.04"} # openssl3 - - {image: "rocker/r-ver"} # whatever is latest ubuntu LTS + - {image: "rhub/ubuntu-release"} # currently ubuntu-22.04 - {image: "rocker/r-ver:4.0.0"} # ubuntu-20.04 - - {image: "rocker/r-ver:3.6.3", libarrow_binary: "TRUE"} # debian:buster (10) - {image: "rstudio/r-base:4.1-focal"} # ubuntu-20.04 - {image: "rstudio/r-base:4.2-centos7", devtoolset: "8"} + - {image: "rstudio/r-base:4.3-noble"} steps: # Get the arrow checkout just for the docker config scripts # Don't need submodules for this (hence false arg to macro): they fail on diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 3e7f7ea0c43a8..52a235c688eda 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -888,12 +888,12 @@ tasks: - r-lib__libarrow__bin__darwin-arm64-openssl-3.0__arrow-{no_rc_r_version}\.zip - r-lib__libarrow__bin__darwin-x86_64-openssl-1.1__arrow-{no_rc_r_version}\.zip - r-lib__libarrow__bin__darwin-x86_64-openssl-3.0__arrow-{no_rc_r_version}\.zip + - r-pkg__bin__windows__contrib__4.4__arrow_{no_rc_r_version}\.zip - r-pkg__bin__windows__contrib__4.3__arrow_{no_rc_r_version}\.zip - - r-pkg__bin__windows__contrib__4.2__arrow_{no_rc_r_version}\.zip + - r-pkg__bin__macosx__big-sur-x86_64__contrib__4.4__arrow_{no_rc_r_version}\.tgz - r-pkg__bin__macosx__big-sur-x86_64__contrib__4.3__arrow_{no_rc_r_version}\.tgz - - r-pkg__bin__macosx__contrib__4.2__arrow_{no_rc_r_version}\.tgz + - r-pkg__bin__macosx__big-sur-arm64__contrib__4.4__arrow_{no_rc_r_version}\.tgz - r-pkg__bin__macosx__big-sur-arm64__contrib__4.3__arrow_{no_rc_r_version}\.tgz - - r-pkg__bin__macosx__big-sur-arm64__contrib__4.2__arrow_{no_rc_r_version}\.tgz - r-pkg__src__contrib__arrow_{no_rc_r_version}\.tar\.gz @@ -1356,7 +1356,7 @@ tasks: r_tag: latest r_custom_ccache: true -{% for r_org, r_image, r_tag in [("rhub", "ubuntu-gcc-release", "latest"), +{% for r_org, r_image, r_tag in [("rhub", "ubuntu-release", "latest"), ("rocker", "r-ver", "latest"), ("rstudio", "r-base", "4.2-focal"), ("rstudio", "r-base", "4.1-opensuse153")] %} @@ -1377,9 +1377,9 @@ tasks: template: r/azure.linux.yml params: r_org: rhub - r_image: debian-gcc-devel-lto + r_image: gcc13 r_tag: latest - flags: '-e NOT_CRAN=false -e INSTALL_ARGS=--use-LTO' + flags: '-e INSTALL_ARGS=--use-LTO' # This one has -flto=auto test-r-ubuntu-22.04: diff --git a/docker-compose.yml b/docker-compose.yml index 60edf1420bc0f..d771fc2d22a35 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1459,8 +1459,8 @@ services: # (including building the C++ library) on any Docker image that contains R # # Usage: - # R_ORG=rhub R_IMAGE=ubuntu-gcc-release R_TAG=latest docker-compose build r - # R_ORG=rhub R_IMAGE=ubuntu-gcc-release R_TAG=latest docker-compose run r + # R_ORG=rhub R_IMAGE=ubuntu-release R_TAG=latest docker-compose build r + # R_ORG=rhub R_IMAGE=ubuntu-release R_TAG=latest docker-compose run r image: ${REPO}:r-${R_ORG}-${R_IMAGE}-${R_TAG} build: context: . 
@@ -1523,6 +1523,7 @@ services: cache_from: - ${REPO}:r-rhub-fedora-clang-devel-latest args: + # TODO: change this to rhub/clang-asan base: rhub/fedora-clang-devel-san r_dev: ${ARROW_R_DEV} devtoolset_version: ${DEVTOOLSET_VERSION} diff --git a/r/DESCRIPTION b/r/DESCRIPTION index eeff8168b361c..38cbaa94a3c25 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -21,7 +21,7 @@ Description: 'Apache' 'Arrow' is a cross-language language-independent columnar memory format for flat and hierarchical data, organized for efficient analytic operations on modern hardware. This package provides an interface to the 'Arrow C++' library. -Depends: R (>= 3.4) +Depends: R (>= 4.0) License: Apache License (>= 2.0) URL: https://github.com/apache/arrow/, https://arrow.apache.org/docs/r/ BugReports: https://github.com/apache/arrow/issues diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index f244682737cb4..efb3c6b756a16 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -140,7 +140,7 @@ register_bindings_type_cast <- function() { fix.empty.names = TRUE, stringsAsFactors = FALSE) { # we need a specific value of stringsAsFactors because the default was - # TRUE in R <= 3.6 + # TRUE in R <= 3.6 and folks might still be cargoculting to stay in the past. if (!identical(stringsAsFactors, FALSE)) { arrow_not_supported("stringsAsFactors = TRUE") } diff --git a/r/R/util.R b/r/R/util.R index a7cb5b3792d29..14e4544ab1e54 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -15,20 +15,6 @@ # specific language governing permissions and limitations # under the License. -# for compatibility with R versions earlier than 4.0.0 -if (!exists("deparse1")) { - deparse1 <- function(expr, collapse = " ", width.cutoff = 500L, ...) { - paste(deparse(expr, width.cutoff, ...), collapse = collapse) - } -} - -# for compatibility with R versions earlier than 3.6.0 -if (!exists("str2lang")) { - str2lang <- function(s) { - parse(text = s, keep.source = FALSE)[[1]] - } -} - oxford_paste <- function(x, conjunction = "and", quote = TRUE, diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index bb005605de318..98068bdea20d5 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -818,11 +818,6 @@ test_that("Handling string data with embedded nuls", { ) array_with_nul <- arrow_array(raws)$cast(utf8()) - # The behavior of the warnings/errors is slightly different with and without - # altrep. Without it (i.e. 
3.5.0 and below, the error would trigger immediately - # on `as.vector()` where as with it, the error only happens on materialization) - skip_on_r_older_than("3.6") - # no error on conversion, because altrep laziness v <- expect_error(as.vector(array_with_nul), NA) diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index f29b75dbf4095..5987f5a4b7c17 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -595,14 +595,10 @@ test_that("RecordBatch supports cbind", { ) # Rejects Table and ChunkedArray arguments - if (getRversion() >= "4.0.0") { - # R 3.6 cbind dispatch rules cause cbind to fall back to default impl if - # there are multiple arguments with distinct cbind implementations - expect_error( - cbind(record_batch(a = 1:2), arrow_table(b = 3:4)), - regexp = "Cannot cbind a RecordBatch with Tables or ChunkedArrays" - ) - } + expect_error( + cbind(record_batch(a = 1:2), arrow_table(b = 3:4)), + regexp = "Cannot cbind a RecordBatch with Tables or ChunkedArrays" + ) expect_error( cbind(record_batch(a = 1:2), b = chunked_array(1, 2)), regexp = "Cannot cbind a RecordBatch with Tables or ChunkedArrays" @@ -622,10 +618,6 @@ test_that("Handling string data with embedded nuls", { batch_with_nul <- record_batch(a = 1:5, b = raws) batch_with_nul$b <- batch_with_nul$b$cast(utf8()) - # The behavior of the warnings/errors is slightly different with and without - # altrep. Without it (i.e. 3.5.0 and below, the error would trigger immediately - # on `as.vector()` where as with it, the error only happens on materialization) - skip_on_r_older_than("3.6") df <- as.data.frame(batch_with_nul) expect_error( diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index ce3254a158eee..f6cec3b2b7683 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -581,10 +581,6 @@ test_that("Table supports cbind", { }) test_that("cbind.Table handles record batches and tables", { - # R 3.6 cbind dispatch rules cause cbind to fall back to default impl if - # there are multiple arguments with distinct cbind implementations - skip_if(getRversion() < "4.0.0", "R 3.6 cbind dispatch rules prevent this behavior") - expect_equal( cbind(arrow_table(a = 1L:2L), record_batch(b = 4:5)), arrow_table(a = 1L:2L, b = 4:5) diff --git a/r/tests/testthat/test-altrep.R b/r/tests/testthat/test-altrep.R index 7a66d0e778282..50bd40988e550 100644 --- a/r/tests/testthat/test-altrep.R +++ b/r/tests/testthat/test-altrep.R @@ -15,8 +15,6 @@ # specific language governing permissions and limitations # under the License. 
-skip_on_r_older_than("3.6") - test_that("altrep test functions do not include base altrep", { expect_false(is_arrow_altrep(1:10)) expect_identical(test_arrow_altrep_is_materialized(1:10), NA) @@ -373,6 +371,11 @@ test_that("altrep min/max/sum identical to R versions for double", { expect_altrep_roundtrip(x, max) expect_altrep_roundtrip(x, sum) + # On valgrind the NA_real_ is sometimes transformed to NaN + # https://stat.ethz.ch/pipermail/r-devel/2021-April/080683.html + # so we skip these there to avoid complicated NA == NaN logic, + # and they are tested on a number of other platforms / conditions + skip_on_linux_devel() x <- c(1, 2, NA_real_) expect_altrep_roundtrip(x, min, na.rm = TRUE) expect_altrep_roundtrip(x, max, na.rm = TRUE) diff --git a/r/tests/testthat/test-chunked-array.R b/r/tests/testthat/test-chunked-array.R index 223f5022d3b94..bb01df427f713 100644 --- a/r/tests/testthat/test-chunked-array.R +++ b/r/tests/testthat/test-chunked-array.R @@ -475,11 +475,6 @@ test_that("Handling string data with embedded nuls", { ) chunked_array_with_nul <- ChunkedArray$create(raws)$cast(utf8()) - # The behavior of the warnings/errors is slightly different with and without - # altrep. Without it (i.e. 3.5.0 and below, the error would trigger immediately - # on `as.vector()` where as with it, the error only happens on materialization) - skip_on_r_older_than("3.6") - v <- expect_error(as.vector(chunked_array_with_nul), NA) expect_error( diff --git a/r/tests/testthat/test-dplyr-collapse.R b/r/tests/testthat/test-dplyr-collapse.R index a8aa5556f1e0d..f50fa8945db11 100644 --- a/r/tests/testthat/test-dplyr-collapse.R +++ b/r/tests/testthat/test-dplyr-collapse.R @@ -185,16 +185,6 @@ See $.data for the source Arrow object", fixed = TRUE ) - skip_if(getRversion() < "3.6.0", "TODO investigate why these aren't equal") - # On older R versions: - # ── Failure (test-dplyr-collapse.R:172:3): Properties of collapsed query ──────── - # head(q, 1) %>% collect() not equal to tibble::tibble(lgl = FALSE, total = 8L, extra = 40). - # Component "total": Mean relative difference: 0.3846154 - # Component "extra": Mean relative difference: 0.3846154 - # ── Failure (test-dplyr-collapse.R:176:3): Properties of collapsed query ──────── - # tail(q, 1) %>% collect() not equal to tibble::tibble(lgl = NA, total = 25L, extra = 125). 
- # Component "total": Mean relative difference: 0.9230769 - # Component "extra": Mean relative difference: 0.9230769 expect_equal( q %>% arrange(lgl) %>% diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 4d3226798d3ff..6f520f6e3223b 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -180,7 +180,7 @@ test_that("strptime", { ) # these functions' internals use some string processing which requires the - # RE2 library (not available on Windows with R 3.6) + # RE2 library skip_if_not_available("re2") compare_dplyr_binding( @@ -198,7 +198,7 @@ test_that("strptime works for individual formats", { skip_on_cran() # these functions' internals use some string processing which requires the - # RE2 library (not available on Windows with R 3.6) + # RE2 library skip_if_not_available("re2") expect_equal( @@ -269,7 +269,7 @@ test_that("timestamp round trip correctly via strftime and strptime", { skip_on_cran() # these functions' internals use some string processing which requires the - # RE2 library (not available on Windows with R 3.6) + # RE2 library skip_if_not_available("re2") tz <- "Pacific/Marquesas" @@ -291,7 +291,9 @@ test_that("timestamp round trip correctly via strftime and strptime", { # Some formats are not supported on Windows if (!tolower(Sys.info()[["sysname"]]) == "windows") { - formats <- c(formats, "%a", "%A", "%b", "%B", "%OS", "%I%p", "%r", "%T%z") + # "%r" could also be here, though it is only valid in some locales (those + # that use 12 hour formats, so skip for now) + formats <- c(formats, "%a", "%A", "%b", "%B", "%OS", "%I%p", "%T%z") } for (fmt in formats) { @@ -2080,7 +2082,7 @@ test_that("as_datetime() works with other functions", { test_that("parse_date_time() works with year, month, and date components", { # these functions' internals use some string processing which requires the - # RE2 library (not available on Windows with R 3.6) + # RE2 library skip_if_not_available("re2") compare_dplyr_binding( .input %>% @@ -2139,7 +2141,7 @@ test_that("parse_date_time() works with year, month, and date components", { test_that("parse_date_time() works with a mix of formats and orders", { # these functions' internals use some string processing which requires the - # RE2 library (not available on Windows with R 3.6) + # RE2 library skip_if_not_available("re2") test_df <- tibble( string_combi = c("2021-09-1", "2/09//2021", "09.3.2021") @@ -2169,7 +2171,7 @@ test_that("year, month, day date/time parsers", { ) # these functions' internals use some string processing which requires the - # RE2 library (not available on Windows with R 3.6) + # RE2 library skip_if_not_available("re2") compare_dplyr_binding( .input %>% @@ -2221,7 +2223,7 @@ test_that("ym, my & yq parsers", { ) # these functions' internals use some string processing which requires the - # RE2 library (not available on Windows with R 3.6) + # RE2 library skip_if_not_available("re2") compare_dplyr_binding( .input %>% @@ -2270,7 +2272,7 @@ test_that("ym, my & yq parsers", { test_that("parse_date_time's other formats", { # these functions' internals use some string processing which requires the - # RE2 library (not available on Windows with R 3.6) + # RE2 library skip_if_not_available("re2") compare_dplyr_binding( @@ -2401,7 +2403,7 @@ test_that("lubridate's fast_strptime", { ) # these functions' internals use some string processing which requires the - # RE2 library (not available on Windows with R 3.6) + # RE2 
library skip_if_not_available("re2") compare_dplyr_binding( @@ -2508,7 +2510,7 @@ test_that("parse_date_time with hours, minutes and seconds components", { # the unseparated strings are versions of "1987-08-22 20:13:59" (with %y) # these functions' internals use some string processing which requires the - # RE2 library (not available on Windows with R 3.6) + # RE2 library skip_if_not_available("re2") compare_dplyr_binding( @@ -2638,7 +2640,7 @@ test_that("parse_date_time with month names and HMS", { skip_on_os("windows") # these functions' internals use some string processing which requires the - # RE2 library (not available on Windows with R 3.6 & the minimal nightly builds) + # RE2 library (not available in the minimal nightly builds) skip_if_not_available("re2") test_dates_times2 <- tibble( @@ -2737,7 +2739,7 @@ test_that("parse_date_time with `quiet = FALSE` not supported", { # https://issues.apache.org/jira/browse/ARROW-17146 # these functions' internals use some string processing which requires the - # RE2 library (not available on Windows with R 3.6 & the minimal nightly builds) + # RE2 library (not available in the minimal nightly builds) skip_if_not_available("re2") expect_warning( @@ -2766,7 +2768,7 @@ test_that("parse_date_time with `quiet = FALSE` not supported", { test_that("parse_date_time with truncated formats", { # these functions' internals use some string processing which requires the - # RE2 library (not available on Windows with R 3.6) + # RE2 library skip_if_not_available("re2") test_truncation_df <- tibble( @@ -2853,7 +2855,7 @@ test_that("parse_date_time with `exact = TRUE`, and with regular R objects", { ) # these functions' internals use some string processing which requires the - # RE2 library (not available on Windows with R 3.6) + # RE2 library skip_if_not_available("re2") compare_dplyr_binding( .input %>% diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 2624e16156bce..ecb6b3b7b45b6 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -754,11 +754,10 @@ test_that("structs/nested data frames/tibbles can be created", { ) # check that data.frame is mapped too - # stringsAsFactors default is TRUE in R 3.6, which is still tested on CI compare_dplyr_binding( .input %>% transmute( - df_col = data.frame(regular_col1, regular_col2, stringsAsFactors = FALSE) + df_col = data.frame(regular_col1, regular_col2) ) %>% collect() %>% mutate(df_col = as.data.frame(df_col)), diff --git a/r/tests/testthat/test-dplyr-glimpse.R b/r/tests/testthat/test-dplyr-glimpse.R index c93273bdeef34..d39fef9e82cca 100644 --- a/r/tests/testthat/test-dplyr-glimpse.R +++ b/r/tests/testthat/test-dplyr-glimpse.R @@ -15,11 +15,6 @@ # specific language governing permissions and limitations # under the License. -# The glimpse output for tests with `example_data` is different on R < 3.6 -# because the `lgl` column is generated with `sample()` and the RNG -# algorithm is different in older R versions. 
-skip_on_r_older_than("3.6") - library(dplyr, warn.conflicts = FALSE) test_that("glimpse() Table/ChunkedArray", { diff --git a/r/tests/testthat/test-scalar.R b/r/tests/testthat/test-scalar.R index 06f956504350e..8335dc95cd85c 100644 --- a/r/tests/testthat/test-scalar.R +++ b/r/tests/testthat/test-scalar.R @@ -94,10 +94,6 @@ test_that("Handling string data with embedded nuls", { ) scalar_with_nul <- scalar(raws, binary())$cast(utf8()) - # The behavior of the warnings/errors is slightly different with and without - # altrep. Without it (i.e. 3.5.0 and below, the error would trigger immediately - # on `as.vector()` where as with it, the error only happens on materialization) - skip_on_r_older_than("3.6") v <- expect_error(as.vector(scalar_with_nul), NA) expect_error( v[1], diff --git a/r/tools/test-nixlibs.R b/r/tools/test-nixlibs.R index ed5192d806990..6996f234ced45 100644 --- a/r/tools/test-nixlibs.R +++ b/r/tools/test-nixlibs.R @@ -23,8 +23,9 @@ TESTING <- TRUE # The functions use `on_macos` from the env they were sourced in, so we need tool # explicitly set it in that environment. +# We capture.output for a cleaner testthat output. nixlibs_env <- environment() -source("nixlibs.R", local = nixlibs_env) +capture.output(source("nixlibs.R", local = nixlibs_env)) test_that("identify_binary() based on LIBARROW_BINARY", { expect_null(identify_binary("FALSE")) @@ -157,6 +158,10 @@ test_that("check_allowlist", { }) test_that("find_latest_nightly()", { + skip_if( + getRversion() > "4.4.0", + "long last version components (>8) fail to max on r-devel" + ) tf <- tempfile() tf_uri <- paste0("file://", tf) on.exit(unlink(tf)) diff --git a/r/vignettes/developers/docker.Rmd b/r/vignettes/developers/docker.Rmd index de2795cfa6bb5..13f60904c9484 100644 --- a/r/vignettes/developers/docker.Rmd +++ b/r/vignettes/developers/docker.Rmd @@ -5,23 +5,23 @@ description: > output: rmarkdown::html_vignette --- -Arrow is compatible with a huge number of combinations of OSs, OS versions, -compilers, R versions, and other variables. Sometimes these combinations of -variables means that behaviours are found in some environments which cannot be -replicated in others. In addition, there are different ways of building Arrow, -for example, using environment variables to specify the building of optional +Arrow is compatible with a huge number of combinations of OSs, OS versions, +compilers, R versions, and other variables. Sometimes these combinations of +variables means that behaviours are found in some environments which cannot be +replicated in others. In addition, there are different ways of building Arrow, +for example, using environment variables to specify the building of optional components. -What all this means is that you may need to use a different setup to the one in -which you are working, when diagnosing a bug or testing out a new feature which -you have reason to believe may be affected by these variables. One way to do +What all this means is that you may need to use a different setup to the one in +which you are working, when diagnosing a bug or testing out a new feature which +you have reason to believe may be affected by these variables. One way to do this is so spin up a Docker image containing the desired setup. This article provides a basic guide to using Docker in your R development. ## How do I run a Docker container? 
-There are a number of images which have been created for the convenience of +There are a number of images which have been created for the convenience of Arrow devs and you can find them on [the DockerHub repo](https://hub.docker.com/r/apache/arrow-dev/tags). The code below shows an example command you could use to run a Docker container. @@ -29,7 +29,7 @@ The code below shows an example command you could use to run a Docker container. This should be run in the root directory of a checkout of the arrow repo. ```shell -docker run -it -e ARROW_DEPENDENCY_SOURCE=AUTO -v $(pwd):/arrow apache/arrow-dev:r-rhub-ubuntu-gcc-release-latest +docker run -it -e ARROW_DEPENDENCY_SOURCE=AUTO -v $(pwd):/arrow apache/arrow-dev:r-rhub-ubuntu-release-latest ``` Components: @@ -39,13 +39,13 @@ Components: * `-e ARROW_DEPENDENCY_SOURCE=AUTO` - set the environment variable `ARROW_DEPENDENCY_SOURCE` to the value `AUTO` * `-v $(pwd):/arrow` - mount the current directory at `/arrow` in the container * `apache/arrow-dev` - the DockerHub repo to get this container from -* `r-rhub-ubuntu-gcc-release-latest` - the image tag +* `r-rhub-ubuntu-release-latest` - the image tag -Once you run this command, if you don't have a copy of that particular image +Once you run this command, if you don't have a copy of that particular image saved locally, it will first be downloaded before a container is spun up. -In the example above, mounting the directory in which the Arrow repo was stored -on the local machine, meant that that code could be built and tested on the +In the example above, mounting the directory in which the Arrow repo was stored +on the local machine, meant that that code could be built and tested on the container. ## How do I exit this image? @@ -73,29 +73,29 @@ sudo docker ps -a ## Running existing workflows from docker-compose.yml There are a number of workflows outlined in the file `docker-compose.yml` in the -arrow repo root directory. For example, you can use the workflow called `r` to -test building and installing the R package. This is advantageous as you can use -existing utility scripts and install it onto a container which already has R on +arrow repo root directory. For example, you can use the workflow called `r` to +test building and installing the R package. This is advantageous as you can use +existing utility scripts and install it onto a container which already has R on it. -These workflows are also parameterized, which means you can specify different +These workflows are also parameterized, which means you can specify different options (or just use the defaults, which can be found in `.env`) ### Example - The manual way -If you wanted to run [RHub's latest `ubuntu-gcc-release` image](https://hub.docker.com/r/rhub/ubuntu-gcc-release), you could +If you wanted to run [RHub's latest `ubuntu-release` image](https://hub.docker.com/r/rhub/ubuntu-release), you could run: -```shell -R_ORG=rhub R_IMAGE=ubuntu-gcc-release R_TAG=latest docker-compose build r -R_ORG=rhub R_IMAGE=ubuntu-gcc-release R_TAG=latest docker-compose run r +```shell +R_ORG=rhub R_IMAGE=ubuntu-release R_TAG=latest docker-compose build r +R_ORG=rhub R_IMAGE=ubuntu-release R_TAG=latest docker-compose run r ``` ### Example - Using Archery Alternatively, you may prefer to use the [Archery tool to run docker images](https://arrow.apache.org/docs/developers/docker.html). 
-This has the advantage of making it simpler to build some of the existing Arrow -CI jobs which have hierarchical dependencies, and so for example, you could +This has the advantage of making it simpler to build some of the existing Arrow +CI jobs which have hierarchical dependencies, and so for example, you could build the R package on a container which already has the C++ code pre-built. This is the same tool which our CI uses - via a tool called [Crossbow](https://arrow.apache.org/docs/developers/crossbow.html). @@ -103,5 +103,5 @@ This is the same tool which our CI uses - via a tool called [Crossbow](https://a If you want to run the `r` workflow discussed above, you could run: ```shell -R_ORG=rhub R_IMAGE=ubuntu-gcc-release R_TAG=latest archery docker run r +R_ORG=rhub R_IMAGE=ubuntu-release R_TAG=latest archery docker run r ``` diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index df43a9de36fc2..cc90c5ff08c60 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -69,7 +69,7 @@ The prebuilt binaries come with S3 and GCS support enabled, so you will need to ## Install release version (easy way) -On macOS and Windows, when you run `install.packages("arrow")` and install arrow from CRAN, you get an R binary package that contains a precompiled version of libarrow. Installing binaries is much easier that installing from source, but CRAN does not host binaries for Linux. This means that the default behaviour when you run `install.packages()` on Linux is to retrieve the source version of the R package and compile both the R package _and_ libarrow from source. We'll talk about this scenario in the next section (the "less easy" way), but first we'll suggest two faster alternatives that are usually much easier. +On macOS and Windows, when you run `install.packages("arrow")` and install arrow from CRAN, you get an R binary package that contains a precompiled version of libarrow. Installing binaries is much easier that installing from source, but CRAN does not host binaries for Linux. This means that the default behaviour when you run `install.packages()` on Linux is to retrieve the source version of the R package and compile both the R package _and_ libarrow from source. We'll talk about this scenario in the next section (the "less easy" way), but first we'll suggest two faster alternatives that are usually much easier. ### Binary R package with libarrow binary via RSPM/conda @@ -154,15 +154,15 @@ knitr::include_graphics("./r_source_libarrow_source.png") ``` The "less easy" way to install arrow is to install both the R package and the underlying Arrow C++ library (libarrow) from source. This method is somewhat more -difficult because compiling and installing R packages with C++ dependencies -generally requires installing system packages, which you may not have privileges -to do, and/or building the C++ dependencies separately, which introduces all sorts +difficult because compiling and installing R packages with C++ dependencies +generally requires installing system packages, which you may not have privileges +to do, and/or building the C++ dependencies separately, which introduces all sorts of additional ways for things to go wrong. -Installing from the full source build of arrow, compiling both C++ and R -bindings, will handle most of the dependency management for you, but it is -much slower than using binaries. 
However, if using binaries isn't an option -for you,or you wish to customize your Linux installation, the instructions in +Installing from the full source build of arrow, compiling both C++ and R +bindings, will handle most of the dependency management for you, but it is +much slower than using binaries. However, if using binaries isn't an option +for you,or you wish to customize your Linux installation, the instructions in this section explain how to do that. ### Basic configuration @@ -369,10 +369,10 @@ satisfy C++ dependencies. ## Offline installation -The `install-arrow.R` file mentioned in the previous section includes a -function called `create_package_with_all_dependencies()`. Normally, when -installing on a computer with internet access, the build process will -download third-party dependencies as needed. This function provides a +The `install-arrow.R` file mentioned in the previous section includes a +function called `create_package_with_all_dependencies()`. Normally, when +installing on a computer with internet access, the build process will +download third-party dependencies as needed. This function provides a way to download them in advance, which can be useful when installing Arrow on a computer without internet access. The process is as follows: @@ -380,11 +380,11 @@ on a computer without internet access. The process is as follows: **Step 1.** Using a computer with internet access, download dependencies: * Install the arrow package **or** source the script directly using the following command: - + ```r source("https://raw.githubusercontent.com/apache/arrow/main/r/R/install-arrow.R") ``` - + * Use the `create_package_with_all_dependencies()` function to create the installation bundle: ```r @@ -399,27 +399,27 @@ on a computer without internet access. The process is as follows: ```r install.packages( - "my_arrow_pkg.tar.gz", + "my_arrow_pkg.tar.gz", dependencies = c("Depends", "Imports", "LinkingTo") ) ``` This installation will build from source, so `cmake` must be available - + * Run `arrow_info()` to check installed capabilities Notes: -- arrow _can_ be installed on a computer without internet access -without using this function, but many useful features will be disabled, +- arrow _can_ be installed on a computer without internet access +without using this function, but many useful features will be disabled, as they depend on third-party components. More precisely, `arrow::arrow_info()$capabilities()` will be `FALSE` for every capability. - If you are using binary packages you shouldn't need to this function. You can download the appropriate binary from your package repository, transfer -that to the offline computer, and install that. +that to the offline computer, and install that. - If you're using RStudio Package Manager on Linux (RSPM), and you want to make a source bundle with this function, make sure to set the first repository @@ -523,11 +523,11 @@ so that we can improve the script. ## Contributing -We are constantly working to make the installation process as painless as +We are constantly working to make the installation process as painless as possible. If you find ways to improve the process, please [report an issue](https://github.com/apache/arrow/issues) so that we can document it. 
Similarly, if you find that your Linux distribution -or version is not supported, we would welcome the contribution of Docker -images (hosted on Docker Hub) that we can use in our continuous integration +or version is not supported, we would welcome the contribution of Docker +images (hosted on Docker Hub) that we can use in our continuous integration and hopefully improve our coverage. If you do contribute a Docker image, it should be as minimal as possible, containing only R and the dependencies it requires. For reference, see the images that @@ -537,19 +537,16 @@ You can test the arrow R package installation using the `docker-compose` setup included in the `apache/arrow` git repository. For example, ``` -R_ORG=rhub R_IMAGE=ubuntu-gcc-release R_TAG=latest docker-compose build r -R_ORG=rhub R_IMAGE=ubuntu-gcc-release R_TAG=latest docker-compose run r +R_ORG=rhub R_IMAGE=ubuntu-release R_TAG=latest docker-compose build r +R_ORG=rhub R_IMAGE=ubuntu-release R_TAG=latest docker-compose run r ``` installs the arrow R package, including libarrow, on the -[rhub/ubuntu-gcc-release](https://hub.docker.com/r/rhub/ubuntu-gcc-release) +[rhub/ubuntu-release](https://hub.docker.com/r/rhub/ubuntu-release) image. ## Further reading - To learn about installing development versions, see the article on [installing nightly builds](./install_nightly.html). -- If you're contributing to the Arrow project, see the [Arrow R developers guide](./developing.html) for resources to help you on set up your development environment. +- If you're contributing to the Arrow project, see the [Arrow R developers guide](./developing.html) for resources to help you on set up your development environment. - Arrow developers may also wish to read a more detailed discussion of the code run during the installation process, described in the [install details article](./developers/install_details.html). - - - From c87073737b6ffef9715549a199499b92630e8e5f Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Mon, 29 Apr 2024 11:32:01 -0400 Subject: [PATCH 035/106] MINOR: [R] refactor arrow_mask to include aggregations list (#41414) ### Rationale for this change Keeping the `..aggregations` list in parent.frame felt a little wrong. As we're starting to use this in more places (like mutate in #41350, and potentially more places), I wanted to try to improve this. I tried a bunch of things before to put it somewhere better (like in the mask) but failed. Finally I found one that worked. ### What changes are included in this PR? Just a refactor ### Are these changes tested? Existing tests pass. ### Are there any user-facing changes? Nope. --- r/R/dplyr-eval.R | 8 +++----- r/R/dplyr-funcs-agg.R | 23 ++++++++++++----------- r/R/dplyr-summarize.R | 41 ++++++++++++++++++----------------------- 3 files changed, 33 insertions(+), 39 deletions(-) diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R index 3aaa29696b8c8..ff1619ce944d0 100644 --- a/r/R/dplyr-eval.R +++ b/r/R/dplyr-eval.R @@ -125,13 +125,9 @@ arrow_mask <- function(.data, aggregation = FALSE) { f_env <- new_environment(.cache$functions) if (aggregation) { - # Add the aggregation functions to the environment, and set the enclosing - # environment to the parent frame so that, when called from summarize_eval(), - # they can reference and assign into `..aggregations` defined there. - pf <- parent.frame() + # Add the aggregation functions to the environment. 
for (f in names(agg_funcs)) { f_env[[f]] <- agg_funcs[[f]] - environment(f_env[[f]]) <- pf } } else { # Add functions that need to error hard and clear. @@ -156,6 +152,8 @@ arrow_mask <- function(.data, aggregation = FALSE) { # TODO: figure out what rlang::as_data_pronoun does/why we should use it # (because if we do we get `Error: Can't modify the data pronoun` in mutate()) out$.data <- .data$selected_columns + # Add the aggregations list to collect any that get pulled out when evaluating + out$.aggregations <- empty_named_list() out } diff --git a/r/R/dplyr-funcs-agg.R b/r/R/dplyr-funcs-agg.R index ab1df1d2f15a5..d84f8f28f0dff 100644 --- a/r/R/dplyr-funcs-agg.R +++ b/r/R/dplyr-funcs-agg.R @@ -17,7 +17,7 @@ # Aggregation functions # -# These all insert into an ..aggregations list (in a parent frame) a list containing: +# These all insert into an .aggregations list in the mask, a list containing: # @param fun string function name # @param data list of 0 or more Expressions # @param options list of function options, as passed to call_function @@ -154,11 +154,11 @@ register_bindings_aggregate <- function() { set_agg <- function(...) { agg_data <- list2(...) - # Find the environment where ..aggregations is stored + # Find the environment where .aggregations is stored target <- find_aggregations_env() - aggs <- get("..aggregations", target) + aggs <- get(".aggregations", target) lapply(agg_data[["data"]], function(expr) { - # If any of the fields referenced in the expression are in ..aggregations, + # If any of the fields referenced in the expression are in .aggregations, # then we can't aggregate over them. # This is mainly for combinations of dataset columns and aggregations, # like sum(x - mean(x)), i.e. window functions. @@ -169,23 +169,24 @@ set_agg <- function(...) { } }) - # Record the (fun, data, options) in ..aggregations + # Record the (fun, data, options) in .aggregations # and return a FieldRef pointing to it tmpname <- paste0("..temp", length(aggs)) aggs[[tmpname]] <- agg_data - assign("..aggregations", aggs, envir = target) + assign(".aggregations", aggs, envir = target) Expression$field_ref(tmpname) } find_aggregations_env <- function() { - # Find the environment where ..aggregations is stored, + # Find the environment where .aggregations is stored, # it's in parent.env of something in the call stack - for (f in sys.frames()) { - if (exists("..aggregations", envir = f)) { - return(f) + n <- 1 + while (TRUE) { + if (exists(".aggregations", envir = caller_env(n))) { + return(caller_env(n)) } + n <- n + 1 } - stop("Could not find ..aggregations") } ensure_one_arg <- function(args, fun) { diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 5bb81dc2b34fc..56de14db6dd44 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -80,34 +80,32 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { # ExecNode), and in the expressions, replace them with FieldRefs so that # further operations can happen (in what will become a ProjectNode that works # on the result of the Aggregate). - # To do this, we create a list in this function scope, and in arrow_mask(), - # and we make sure this environment here is the parent env of the binding - # functions, so that when they receive an expression, they can pull out - # aggregations and insert them into the list, which they can find because it - # is in the parent env. 
+ # To do this, arrow_mask() includes a list called .aggregations, + # and the aggregation functions will pull out those terms and insert into + # that list. # nolint end - ..aggregations <- empty_named_list() - - # We'll collect any transformations after the aggregation here - ..post_mutate <- empty_named_list() mask <- arrow_mask(.data, aggregation = TRUE) + # We'll collect any transformations after the aggregation here. + # summarize_eval() returns NULL when the outer expression is an aggregation, + # i.e. there is no projection to do after + post_mutate <- empty_named_list() for (i in seq_along(exprs)) { # Iterate over the indices and not the names because names may be repeated # (which overwrites the previous name) name <- names(exprs)[i] - ..post_mutate[[name]] <- summarize_eval(name, exprs[[i]], mask) + post_mutate[[name]] <- summarize_eval(name, exprs[[i]], mask) } # Apply the results to the .data object. # First, the aggregations - .data$aggregations <- ..aggregations + .data$aggregations <- mask$.aggregations # Then collapse the query so that the resulting query object can have # additional operations applied to it out <- collapse.arrow_dplyr_query(.data) - # Now, add the projections in ..post_mutate (if any) - for (post in names(..post_mutate)) { + # Now, add the projections in post_mutate (if any) + for (post in names(post_mutate)) { # One last check: it's possible that an expression like y - mean(y) would # successfully evaluate, but it's not supported. It gets transformed to: # nolint start @@ -121,7 +119,7 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { # We can tell the expression is invalid if it references fields not in # the schema of the data after summarize(). Evaulating its type will # throw an error if it's invalid. - tryCatch(..post_mutate[[post]]$type(out$.data$schema), error = function(e) { + tryCatch(post_mutate[[post]]$type(out$.data$schema), error = function(e) { msg <- paste( "Expression", as_label(exprs[[post]]), "is not a valid aggregation expression or is" @@ -129,7 +127,7 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { arrow_not_supported(msg) }) # If it's valid, add it to the .data object - out$selected_columns[[post]] <- ..post_mutate[[post]] + out$selected_columns[[post]] <- post_mutate[[post]] } # Make sure column order is correct (and also drop ..temp columns) @@ -266,10 +264,10 @@ format_aggregation <- function(x) { # This function evaluates an expression and returns the post-summarize # projection that results, or NULL if there is none because the top-level # expression was an aggregation. Any aggregations are pulled out and collected -# in the ..aggregations list outside this function. +# in the .aggregations list outside this function. summarize_eval <- function(name, quosure, mask) { # Add previous aggregations to the mask, so they can be referenced - for (n in names(get("..aggregations", parent.frame()))) { + for (n in names(mask$.aggregations)) { mask[[n]] <- mask$.data[[n]] <- Expression$field_ref(n) } # Evaluate: @@ -286,14 +284,11 @@ summarize_eval <- function(name, quosure, mask) { # Handle case where outer expr is ..temp field ref. This came from an # aggregation at the top level. So the resulting name should be `name`. # not `..tempN`. Rename the corresponding aggregation. 
- post_aggs <- get("..aggregations", parent.frame()) result_field_name <- value$field_name - if (result_field_name %in% names(post_aggs)) { + if (result_field_name %in% names(mask$.aggregations)) { # Do this by assigning over `name` in case something else was in `name` - post_aggs[[name]] <- post_aggs[[result_field_name]] - post_aggs[[result_field_name]] <- NULL - # Assign back into the parent environment - assign("..aggregations", post_aggs, parent.frame()) + mask$.aggregations[[name]] <- mask$.aggregations[[result_field_name]] + mask$.aggregations[[result_field_name]] <- NULL # Return NULL because there is no post-mutate projection, it's just # the aggregation return(NULL) From e3db586eb343e80dce58d8cbf6eef91aba14dfff Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Apr 2024 11:33:30 -0400 Subject: [PATCH 036/106] MINOR: [Go] Bump github.com/apache/thrift from 0.19.0 to 0.20.0 in /go (#40777) Bumps [github.com/apache/thrift](https://github.com/apache/thrift) from 0.19.0 to 0.20.0.
Release notes

Sourced from github.com/apache/thrift's releases.

Version 0.20.0

Please head over to the official release download source: http://thrift.apache.org/download

The assets listed below are added by GitHub based on the release tag, and they will therefore not match the checksums published on the Thrift project website.

Changelog

Sourced from github.com/apache/thrift's changelog.

0.20.0

Known Open Issues (Blocker or Critical)

  • THRIFT-3877 - C++ library don't work with HTTP (csharp server, cpp client; need cross test enhancement)
  • THRIFT-5468 - Swift service generator doesn't support oneway
  • THRIFT-5654 - LNK4042 and LNK2019 in go_validator_generator.cc

(Changelog section headings only; details truncated: Build Process, C++, Compiler (General), Delphi, Documentation, Erlang, Go, Haxe, Java, netstd, …)

Commits
  • 0d0ac75 Preparing 0.20.0
  • 417eafd Add license header to pypi workflow file
  • 9a253e7 THRIFT-5688: Add PyPI publishing github actions
  • dd1217f THRIFT-5745: Implement slog.LogValuer on go TStructs
  • ab6f3ef THRIFT-5744: Switch to slog for go library
  • 318731b Updated manual version info and CHANGES
  • 0c637d7 Bump com.diffplug.spotless from 6.23.3 to 6.25.0 in /lib/java
  • 9d96f20 Fix ambigous typescript definitions
  • 7d4c7fa THRIFT-5750 deprecate "ansistr_binary_" option
  • 7507fa0 THRIFT-5753 PHP 8.1 deprecated warning about return type in jsonSerialize fun...
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=github.com/apache/thrift&package-manager=go_modules&previous-version=0.19.0&new-version=0.20.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Matt Topol --- go/go.mod | 2 +- go/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index 972940ee3c299..79c3cc3981231 100644 --- a/go/go.mod +++ b/go/go.mod @@ -21,7 +21,7 @@ go 1.21 require ( github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c github.com/andybalholm/brotli v1.1.0 - github.com/apache/thrift v0.19.0 + github.com/apache/thrift v0.20.0 github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815 github.com/goccy/go-json v0.10.2 github.com/golang/snappy v0.0.4 diff --git a/go/go.sum b/go/go.sum index 0a45cb751f77e..e8c2fde15181a 100644 --- a/go/go.sum +++ b/go/go.sum @@ -8,8 +8,8 @@ github.com/alecthomas/repr v0.2.0 h1:HAzS41CIzNW5syS8Mf9UwXhNH1J9aix/BvDRf1Ml2Yk github.com/alecthomas/repr v0.2.0/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW57eE/O/4= github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= -github.com/apache/thrift v0.19.0 h1:sOqkWPzMj7w6XaYbJQG7m4sGqVolaW/0D28Ln7yPzMk= -github.com/apache/thrift v0.19.0/go.mod h1:SUALL216IiaOw2Oy+5Vs9lboJ/t9g40C+G07Dc0QC1I= +github.com/apache/thrift v0.20.0 h1:631+KvYbsBZxmuJjYwhezVsrfc/TbqtZV4QcxOX1fOI= +github.com/apache/thrift v0.20.0/go.mod h1:hOk1BQqcp2OLzGsyVXdfMk7YFlMxK3aoEVhjD06QhB8= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= From 00df70c6dca6b7cf9a274e131ea88ed588133aec Mon Sep 17 00:00:00 2001 From: Jacob Wujciak-Jens Date: Mon, 29 Apr 2024 19:25:35 +0100 Subject: [PATCH 037/106] GH-41398: [R][CI] Windows job failing after R 4.4 release (#41409) ### Rationale for this change We can't throw warnings on cran. ### What changes are included in this PR? Update function to match changes in libarrow added in GH-39864 ### Are these changes tested? CI ### Are there any user-facing changes? 
No * GitHub Issue: #41398 Authored-by: Jacob Wujciak-Jens Signed-off-by: Jacob Wujciak-Jens --- r/src/extension-impl.cpp | 8 +++++++- r/src/extension.h | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/r/src/extension-impl.cpp b/r/src/extension-impl.cpp index a13b252b2832f..14c771cc98e4f 100644 --- a/r/src/extension-impl.cpp +++ b/r/src/extension-impl.cpp @@ -87,7 +87,9 @@ arrow::Result> RExtensionType::Deserialize( return std::shared_ptr(cloned.release()); } -std::string RExtensionType::ToString() const { +std::string RExtensionType::ToString() const { return ToString(false); } + +std::string RExtensionType::ToString(bool show_metadata) const { arrow::Result result = SafeCallIntoR([&]() { cpp11::environment instance = r6_instance(); cpp11::function instance_ToString(instance["ToString"]); @@ -98,7 +100,11 @@ std::string RExtensionType::ToString() const { // In the event of an error (e.g., we are not on the main thread // and we are not inside RunWithCapturedR()), just call the default method if (!result.ok()) { +#if ARROW_VERSION_MAJOR >= 16 + return ExtensionType::ToString(show_metadata); +#else return ExtensionType::ToString(); +#endif } else { return result.ValueUnsafe(); } diff --git a/r/src/extension.h b/r/src/extension.h index fbd3ad484691a..6e6c6f7c29761 100644 --- a/r/src/extension.h +++ b/r/src/extension.h @@ -52,6 +52,8 @@ class RExtensionType : public arrow::ExtensionType { std::string Serialize() const { return extension_metadata_; } + std::string ToString(bool show_metadata = false) const; + // wrapper for libarrow < 16 std::string ToString() const; cpp11::sexp Convert(const std::shared_ptr& array) const; From 2ef4059566eb3dfc5cceb85d8ea8fa83e33234bb Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Mon, 29 Apr 2024 17:19:41 -0400 Subject: [PATCH 038/106] GH-29537: [R] Support mutate/summarize with implicit join (#41350) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Since it doesn't look like Acero will be getting window functions any time soon, implement support in `mutate()` for transformations that involve aggregations, like `x - mean(x)`, via left_join. ### What changes are included in this PR? Following #41223, I realized I could reuse that evaluation path in `mutate()`. Evaluating expressions accumulates `..aggregations` and `mutate_stuff`; in summarize() we apply aggregations and then mutate on the result. If expressions in the `mutate_stuff` reference columns in the original data and not just the result of aggregations, we reject it. Here, if there are aggregations, we apply them on a copy of the query up to that point, and join the result back onto the query, then apply the mutations on that. It's not a problem for those mutate expressions to reference both columns in the original data and the results of the aggregations because both are present. There are ~three~ two caveats: * Join has non-deterministic order, so while `mutate()` doesn't generally affect row order, if this code path is activated, row order may not be stable. With datasets, it's not guaranteed anyway. * ~Acero's join seems to have a limitation currently where missing values are not joined to each other. If your join key has NA in it, and you do a left_join, your new columns will all be NA, even if there is a corresponding value in the right dataset. 
I made https://github.com/apache/arrow/issues/41358 to address that, and in the meantime, I've added a workaround (https://github.com/apache/arrow/pull/41350/commits/b9de50452e926fe5f39aeb3887a04e203302b960) that's not awesome but has the right behavior.~ Fixed and rebased. * I believe it is possible in dplyr to get this behavior in other verbs: filter, arrange, even summarize. I've only done this for mutate. Are we ok with that? ### Are these changes tested? Yes ### Are there any user-facing changes? This works now: ``` r library(arrow) library(dplyr) mtcars |> arrow_table() |> select(cyl, mpg, hp) |> group_by(cyl) |> mutate(stdize_mpg = (mpg - mean(mpg)) / sd(mpg)) |> collect() #> # A tibble: 32 × 4 #> # Groups: cyl [3] #> cyl mpg hp stdize_mpg #> #> 1 6 21 110 0.865 #> 2 6 21 110 0.865 #> 3 4 22.8 93 -0.857 #> 4 6 21.4 110 1.14 #> 5 8 18.7 175 1.41 #> 6 6 18.1 105 -1.13 #> 7 8 14.3 245 -0.312 #> 8 4 24.4 62 -0.502 #> 9 4 22.8 95 -0.857 #> 10 6 19.2 123 -0.373 #> # ℹ 22 more rows ``` Created on 2024-04-23 with [reprex v2.1.0](https://reprex.tidyverse.org) * GitHub Issue: #29537 --- r/R/arrow-package.R | 5 +-- r/R/dplyr-funcs-agg.R | 1 - r/R/dplyr-funcs-doc.R | 2 +- r/R/dplyr-mutate.R | 39 ++++++++++++------ r/man/acero.Rd | 2 +- r/tests/testthat/test-dataset-dplyr.R | 11 ------ r/tests/testthat/test-dplyr-mutate.R | 57 ++++++++++++--------------- r/vignettes/data_wrangling.Rmd | 28 +------------ 8 files changed, 58 insertions(+), 87 deletions(-) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 7087a40c4903a..44dfbbcd5c7e7 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -48,10 +48,7 @@ supported_dplyr_methods <- list( group_vars = NULL, group_by_drop_default = NULL, ungroup = NULL, - mutate = c( - "window functions (e.g. things that require aggregation within groups)", - "not currently supported" - ), + mutate = NULL, transmute = NULL, arrange = NULL, rename = NULL, diff --git a/r/R/dplyr-funcs-agg.R b/r/R/dplyr-funcs-agg.R index d84f8f28f0dff..9411ce5ce6faf 100644 --- a/r/R/dplyr-funcs-agg.R +++ b/r/R/dplyr-funcs-agg.R @@ -164,7 +164,6 @@ set_agg <- function(...) { # like sum(x - mean(x)), i.e. window functions. # This will reject (sum(sum(x)) as well, but that's not a useful operation. if (any(expr$field_names_in_expression() %in% names(aggs))) { - # TODO: support in ARROW-13926 arrow_not_supported("aggregate within aggregate expression") } }) diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R index fda77bca83fc2..7f0627c33d010 100644 --- a/r/R/dplyr-funcs-doc.R +++ b/r/R/dplyr-funcs-doc.R @@ -53,7 +53,7 @@ #' * [`groups()`][dplyr::groups()] #' * [`inner_join()`][dplyr::inner_join()]: the `copy` argument is ignored #' * [`left_join()`][dplyr::left_join()]: the `copy` argument is ignored -#' * [`mutate()`][dplyr::mutate()]: window functions (e.g. things that require aggregation within groups) not currently supported +#' * [`mutate()`][dplyr::mutate()] #' * [`pull()`][dplyr::pull()]: the `name` argument is not supported; returns an R vector by default but this behavior is deprecated and will return an Arrow [ChunkedArray] in a future release. Provide `as_vector = TRUE/FALSE` to control this behavior, or set `options(arrow.pull_as_vector)` globally. 
#' * [`relocate()`][dplyr::relocate()] #' * [`rename()`][dplyr::rename()] diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 287532dee08a9..880f7799e6316 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -45,17 +45,11 @@ mutate.arrow_dplyr_query <- function(.data, return(out) } - # Restrict the cases we support for now - has_aggregations <- any(unlist(lapply(exprs, all_funs)) %in% names(agg_funcs)) - if (has_aggregations) { - # ARROW-13926 - # mutate() on a grouped dataset does calculations within groups - # This doesn't matter on scalar ops (arithmetic etc.) but it does - # for things with aggregations (e.g. subtracting the mean) - return(abandon_ship(call, .data, "window functions not currently supported in Arrow")) - } - - mask <- arrow_mask(out) + # Create a mask with aggregation functions in it + # If there are any aggregations, we will need to compute them and + # and join the results back in, for "window functions" like x - mean(x) + mask <- arrow_mask(out, aggregation = TRUE) + # Evaluate the mutate expressions results <- list() for (i in seq_along(exprs)) { # Iterate over the indices and not the names because names may be repeated @@ -81,6 +75,24 @@ mutate.arrow_dplyr_query <- function(.data, mask[[new_var]] <- mask$.data[[new_var]] <- results[[new_var]] } + if (length(mask$.aggregations)) { + # Make a copy of .data, do the aggregations on it, and then left_join on + # the group_by variables. + agg_query <- as_adq(.data) + # These may be computed by .by, make sure they're set + agg_query$group_by_vars <- grv + agg_query$aggregations <- mask$.aggregations + agg_query <- collapse.arrow_dplyr_query(agg_query) + if (length(grv)) { + out <- left_join(out, agg_query, by = grv) + } else { + # If there are no group_by vars, add a scalar column to both and join on that + agg_query$selected_columns[["..tempjoin"]] <- Expression$scalar(1L) + out$selected_columns[["..tempjoin"]] <- Expression$scalar(1L) + out <- left_join(out, agg_query, by = "..tempjoin") + } + } + old_vars <- names(out$selected_columns) # Note that this is names(exprs) not names(results): # if results$new_var is NULL, that means we are supposed to remove it @@ -91,6 +103,11 @@ mutate.arrow_dplyr_query <- function(.data, out$selected_columns[[new_var]] <- results[[new_var]] } + # Prune any ..temp columns from the result, which would have come from + # .aggregations + temps <- grepl("^\\.\\.temp", names(out$selected_columns)) + out$selected_columns <- out$selected_columns[!temps] + # Deduplicate new_vars and remove NULL columns from new_vars new_vars <- intersect(union(new_vars, grv), names(out$selected_columns)) diff --git a/r/man/acero.Rd b/r/man/acero.Rd index ca51ef56334eb..9ef9cd7dda6fb 100644 --- a/r/man/acero.Rd +++ b/r/man/acero.Rd @@ -40,7 +40,7 @@ Table into an R \code{tibble}. \item \code{\link[dplyr:group_data]{groups()}} \item \code{\link[dplyr:mutate-joins]{inner_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:mutate-joins]{left_join()}}: the \code{copy} argument is ignored -\item \code{\link[dplyr:mutate]{mutate()}}: window functions (e.g. things that require aggregation within groups) not currently supported +\item \code{\link[dplyr:mutate]{mutate()}} \item \code{\link[dplyr:pull]{pull()}}: the \code{name} argument is not supported; returns an R vector by default but this behavior is deprecated and will return an Arrow \link{ChunkedArray} in a future release. 
Provide \code{as_vector = TRUE/FALSE} to control this behavior, or set \code{options(arrow.pull_as_vector)} globally. \item \code{\link[dplyr:relocate]{relocate()}} \item \code{\link[dplyr:rename]{rename()}} diff --git a/r/tests/testthat/test-dataset-dplyr.R b/r/tests/testthat/test-dataset-dplyr.R index b8d93841921d7..1e36ea8bd4966 100644 --- a/r/tests/testthat/test-dataset-dplyr.R +++ b/r/tests/testthat/test-dataset-dplyr.R @@ -163,17 +163,6 @@ See $.data for the source Arrow object", ) }) -test_that("mutate() features not yet implemented", { - ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) - expect_error( - ds %>% - group_by(int) %>% - mutate(avg = mean(int)), - "window functions not currently supported in Arrow\nCall collect() first to pull data into R.", - fixed = TRUE - ) -}) - test_that("filter scalar validation doesn't crash (ARROW-7772)", { ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) expect_error( diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 0889fffedd508..71c1e52d33c1d 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -378,18 +378,16 @@ test_that("dplyr::mutate's examples", { # The mutate operation may yield different results on grouped # tibbles because the expressions are computed within groups. # The following normalises `mass` by the global average: - # TODO(ARROW-13926): support window functions compare_dplyr_binding( .input %>% select(name, mass, species) %>% mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) %>% collect(), - starwars, - warning = "window function" + starwars ) }) -test_that("Can mutate after group_by as long as there are no aggregations", { +test_that("Can mutate after group_by, including with some aggregations", { compare_dplyr_binding( .input %>% select(int, chr) %>% @@ -417,31 +415,31 @@ test_that("Can mutate after group_by as long as there are no aggregations", { collect(), tbl ) - expect_warning( - tbl %>% - Table$create() %>% + compare_dplyr_binding( + .input %>% select(int, chr) %>% group_by(chr) %>% mutate(avg_int = mean(int)) %>% + # Because this silently does a join, the rows can get unsorted + arrange(chr) %>% collect(), - "window functions not currently supported in Arrow; pulling data into R", - fixed = TRUE + tbl ) - expect_warning( - tbl %>% - Table$create() %>% + compare_dplyr_binding( + .input %>% select(mean = int, chr) %>% # rename `int` to `mean` and use `mean(mean)` in `mutate()` to test that # `all_funs()` detects `mean()` despite the collision with a column name group_by(chr) %>% mutate(avg_int = mean(mean)) %>% + # Because this silently does a join, the rows can get unsorted + arrange(chr) %>% collect(), - "window functions not currently supported in Arrow; pulling data into R", - fixed = TRUE + tbl ) }) -test_that("Can mutate with .by argument as long as there are no aggregations", { +test_that("Can mutate with .by argument, even with some aggregations", { compare_dplyr_binding( .input %>% select(int, chr) %>% @@ -479,25 +477,25 @@ test_that("Can mutate with .by argument as long as there are no aggregations", { collect(), tbl ) - expect_warning( - tbl %>% - Table$create() %>% + compare_dplyr_binding( + .input %>% select(int, chr) %>% mutate(avg_int = mean(int), .by = chr) %>% + # Because this silently does a join, the rows can get unsorted + arrange(chr) %>% collect(), - "window functions not currently supported in Arrow; pulling data into R", - fixed = TRUE + tbl ) - expect_warning( - 
tbl %>% - Table$create() %>% + compare_dplyr_binding( + .input %>% select(mean = int, chr) %>% # rename `int` to `mean` and use `mean(mean)` in `mutate()` to test that # `all_funs()` detects `mean()` despite the collision with a column name mutate(avg_int = mean(mean), .by = chr) %>% + # Because this silently does a join, the rows can get unsorted + arrange(chr) %>% collect(), - "window functions not currently supported in Arrow; pulling data into R", - fixed = TRUE + tbl ) }) @@ -682,7 +680,6 @@ test_that("mutate() and transmute() with namespaced functions", { }) test_that("Can use across() within mutate()", { - # expressions work in the right order compare_dplyr_binding( .input %>% @@ -717,17 +714,15 @@ test_that("Can use across() within mutate()", { example_data ) - # gives the right error with window functions - expect_warning( - arrow_table(example_data) %>% + compare_dplyr_binding( + .input %>% mutate( x = int + 2, across(c("int", "dbl"), list(mean = mean, sd = sd, round)), exp(dbl2) ) %>% collect(), - "window functions not currently supported in Arrow; pulling data into R", - fixed = TRUE + example_data ) }) diff --git a/r/vignettes/data_wrangling.Rmd b/r/vignettes/data_wrangling.Rmd index 305a91c156eb1..1d074ef0cfedb 100644 --- a/r/vignettes/data_wrangling.Rmd +++ b/r/vignettes/data_wrangling.Rmd @@ -165,33 +165,7 @@ sw2 %>% transmute(name, height, mass, res = residuals(lm(mass ~ height))) ``` -Because window functions are not supported, computing an aggregation like `mean()` on a grouped table or within a rowwise operation like `filter()` is not supported: - -```{r} -sw %>% - select(1:4) %>% - filter(!is.na(hair_color)) %>% - group_by(hair_color) %>% - filter(height < mean(height, na.rm = TRUE)) -``` - -This operation is sometimes referred to as a windowed aggregate and can be accomplished in Arrow by computing the aggregation separately, for example within a join operation: - -```{r} -sw %>% - select(1:4) %>% - filter(!is.na(hair_color)) %>% - left_join( - sw %>% - group_by(hair_color) %>% - summarize(mean_height = mean(height, na.rm = TRUE)) - ) %>% - filter(height < mean_height) %>% - select(!mean_height) %>% - collect() -``` - -Alternatively, [DuckDB](https:\www.duckdb.org) supports Arrow natively, so you can pass the `Table` object to DuckDB without paying a performance penalty using the helper function `to_duckdb()` and pass the object back to Arrow with `to_arrow()`: +For some operations, you can use [DuckDB](https://www.duckdb.org). It supports Arrow natively, so you can pass the `Dataset` or query object to DuckDB without paying a performance penalty using the helper function `to_duckdb()` and pass the object back to Arrow with `to_arrow()`: ```{r} sw %>% From d60ff53394788aef9a6070dfdf46a2bcade128ad Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 30 Apr 2024 08:46:26 +0900 Subject: [PATCH 039/106] GH-41427: [Go] Fix stateless prepared statements (#41428) ### Rationale for this change Stateless prepared statements didn't actually work ### What changes are included in this PR? Update the handle after binding parameters ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * GitHub Issue: #41427 Authored-by: David Li Signed-off-by: David Li --- go/arrow/flight/flightsql/client.go | 93 ++++++++++-------------- go/arrow/flight/flightsql/client_test.go | 10 +-- 2 files changed, 45 insertions(+), 58 deletions(-) diff --git a/go/arrow/flight/flightsql/client.go b/go/arrow/flight/flightsql/client.go index e594191c35fdf..c6794820dc172 100644 --- a/go/arrow/flight/flightsql/client.go +++ b/go/arrow/flight/flightsql/client.go @@ -1119,24 +1119,10 @@ func (p *PreparedStatement) Execute(ctx context.Context, opts ...grpc.CallOption return nil, err } - if p.hasBindParameters() { - pstream, err := p.client.Client.DoPut(ctx, opts...) - if err != nil { - return nil, err - } - wr, err := p.writeBindParameters(pstream, desc) - if err != nil { - return nil, err - } - if err = wr.Close(); err != nil { - return nil, err - } - pstream.CloseSend() - if err = p.captureDoPutPreparedStatementHandle(pstream); err != nil { - return nil, err - } + desc, err = p.bindParameters(ctx, desc, opts...) + if err != nil { + return nil, err } - return p.client.getFlightInfo(ctx, desc, opts...) } @@ -1156,23 +1142,9 @@ func (p *PreparedStatement) ExecutePut(ctx context.Context, opts ...grpc.CallOpt return err } - if p.hasBindParameters() { - pstream, err := p.client.Client.DoPut(ctx, opts...) - if err != nil { - return err - } - - wr, err := p.writeBindParameters(pstream, desc) - if err != nil { - return err - } - if err = wr.Close(); err != nil { - return err - } - pstream.CloseSend() - if err = p.captureDoPutPreparedStatementHandle(pstream); err != nil { - return err - } + _, err = p.bindParameters(ctx, desc, opts...) + if err != nil { + return err } return nil @@ -1200,23 +1172,9 @@ func (p *PreparedStatement) ExecutePoll(ctx context.Context, retryDescriptor *fl } if retryDescriptor == nil { - if p.hasBindParameters() { - pstream, err := p.client.Client.DoPut(ctx, opts...) - if err != nil { - return nil, err - } - - wr, err := p.writeBindParameters(pstream, desc) - if err != nil { - return nil, err - } - if err = wr.Close(); err != nil { - return nil, err - } - pstream.CloseSend() - if err = p.captureDoPutPreparedStatementHandle(pstream); err != nil { - return nil, err - } + desc, err = p.bindParameters(ctx, desc, opts...) + if err != nil { + return nil, err } } return p.client.Client.PollFlightInfo(ctx, desc, opts...) @@ -1248,7 +1206,7 @@ func (p *PreparedStatement) ExecuteUpdate(ctx context.Context, opts ...grpc.Call return } if p.hasBindParameters() { - wr, err = p.writeBindParameters(pstream, desc) + wr, err = p.writeBindParametersToStream(pstream, desc) if err != nil { return } @@ -1283,7 +1241,36 @@ func (p *PreparedStatement) hasBindParameters() bool { return (p.paramBinding != nil && p.paramBinding.NumRows() > 0) || (p.streamBinding != nil) } -func (p *PreparedStatement) writeBindParameters(pstream pb.FlightService_DoPutClient, desc *pb.FlightDescriptor) (*flight.Writer, error) { +func (p *PreparedStatement) bindParameters(ctx context.Context, desc *pb.FlightDescriptor, opts ...grpc.CallOption) (*flight.FlightDescriptor, error) { + if p.hasBindParameters() { + pstream, err := p.client.Client.DoPut(ctx, opts...) 
+ if err != nil { + return nil, err + } + wr, err := p.writeBindParametersToStream(pstream, desc) + if err != nil { + return nil, err + } + if err = wr.Close(); err != nil { + return nil, err + } + pstream.CloseSend() + if err = p.captureDoPutPreparedStatementHandle(pstream); err != nil { + return nil, err + } + + cmd := pb.CommandPreparedStatementQuery{PreparedStatementHandle: p.handle} + desc, err = descForCommand(&cmd) + if err != nil { + return nil, err + } + return desc, nil + } + return desc, nil +} + +// XXX: this does not capture the updated handle. Prefer bindParameters. +func (p *PreparedStatement) writeBindParametersToStream(pstream pb.FlightService_DoPutClient, desc *pb.FlightDescriptor) (*flight.Writer, error) { if p.paramBinding != nil { wr := flight.NewRecordWriter(pstream, ipc.WithSchema(p.paramBinding.Schema())) wr.SetFlightDescriptor(desc) diff --git a/go/arrow/flight/flightsql/client_test.go b/go/arrow/flight/flightsql/client_test.go index 727fe02aa7063..33da79167c4ae 100644 --- a/go/arrow/flight/flightsql/client_test.go +++ b/go/arrow/flight/flightsql/client_test.go @@ -448,9 +448,9 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecuteParamBinding() { expectedDesc := getDesc(&pb.CommandPreparedStatementQuery{PreparedStatementHandle: []byte(handle)}) // mocked DoPut result - doPutPreparedStatementResult := &pb.DoPutPreparedStatementResult{PreparedStatementHandle: []byte(updatedHandle)} + doPutPreparedStatementResult := &pb.DoPutPreparedStatementResult{PreparedStatementHandle: []byte(updatedHandle)} resdata, _ := proto.Marshal(doPutPreparedStatementResult) - putResult := &pb.PutResult{ AppMetadata: resdata } + putResult := &pb.PutResult{AppMetadata: resdata} // mocked client stream for DoPut mockedPut := &mockDoPutClient{} @@ -461,7 +461,7 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecuteParamBinding() { mockedPut.On("CloseSend").Return(nil) mockedPut.On("Recv").Return(putResult, nil) - infoCmd := &pb.CommandPreparedStatementQuery{PreparedStatementHandle: []byte(handle)} + infoCmd := &pb.CommandPreparedStatementQuery{PreparedStatementHandle: []byte(updatedHandle)} desc := getDesc(infoCmd) s.mockClient.On("GetFlightInfo", desc.Type, desc.Cmd, s.callOpts).Return(&emptyFlightInfo, nil) @@ -525,9 +525,9 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecuteReaderBinding() { expectedDesc := getDesc(&pb.CommandPreparedStatementQuery{PreparedStatementHandle: []byte(query)}) // mocked DoPut result - doPutPreparedStatementResult := &pb.DoPutPreparedStatementResult{PreparedStatementHandle: []byte(query)} + doPutPreparedStatementResult := &pb.DoPutPreparedStatementResult{PreparedStatementHandle: []byte(query)} resdata, _ := proto.Marshal(doPutPreparedStatementResult) - putResult := &pb.PutResult{ AppMetadata: resdata } + putResult := &pb.PutResult{AppMetadata: resdata} // mocked client stream for DoPut mockedPut := &mockDoPutClient{} From 6dc662324c2c46a7b5066b91cd0ace93a275ecf7 Mon Sep 17 00:00:00 2001 From: DenisTarasyuk <131180287+DenisTarasyuk@users.noreply.github.com> Date: Tue, 30 Apr 2024 03:59:51 +0300 Subject: [PATCH 040/106] GH-41433: [C++][Gandiva] Fix ascii_utf8 function to return same result on x86 and Arm (#41434) ### Rationale for this change Fixing ascii_utf8 function that has different return result on x86 and Arm due to default char type sign difference on those platforms. Added tests to cover existing x86 behavior for ascii symbols with code >127. ### What changes are included in this PR? 1. 
Added type cast to signed char to save existing x86 behavior on Arm platform. 2. Added tests cases for negative results. ### Are these changes tested? UT included. ### Are there any user-facing changes? None * GitHub Issue: #41433 Authored-by: DenisTarasyuk Signed-off-by: Sutou Kouhei --- cpp/src/gandiva/precompiled/string_ops.cc | 2 +- cpp/src/gandiva/precompiled/string_ops_test.cc | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 5aa0eb38eafd7..3849cf7bdf9a5 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1377,7 +1377,7 @@ gdv_int32 ascii_utf8(const char* data, gdv_int32 data_len) { if (data_len == 0) { return 0; } - return static_cast(data[0]); + return static_cast(static_cast(data[0])); } // Returns the ASCII character having the binary equivalent to A. diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index 89213592e7ea2..aaa25db0a9f8d 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -51,6 +51,8 @@ TEST(TestStringOps, TestAscii) { EXPECT_EQ(ascii_utf8("", 0), 0); EXPECT_EQ(ascii_utf8("123", 3), 49); EXPECT_EQ(ascii_utf8("999", 3), 57); + EXPECT_EQ(ascii_utf8("\x80", 1), -128); + EXPECT_EQ(ascii_utf8("\xFF", 1), -1); } TEST(TestStringOps, TestChrBigInt) { From 747c8a28306f1e14439cf374b04cb8ed68e08cd2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Apr 2024 19:29:36 -0700 Subject: [PATCH 041/106] MINOR: [C#] Bump xunit.runner.visualstudio from 2.5.8 to 2.8.0 in /csharp (#41441) Bumps [xunit.runner.visualstudio](https://github.com/xunit/visualstudio.xunit) from 2.5.8 to 2.8.0.
Commits
  • 6438bb8 v2.8.0
  • 2afd4cd Pick up latest dependencies
  • b8be108 Add multiplier format support to RunSettings
  • 3c2e493 Update to 2.7.2-pre.17 and support Xunit.ParallelAlgorithm in RunSetttings
  • 144931e Missing height on version
  • 4315921 Fix concurrency bug in AssemblyHelper (#407)
  • 8617393 Bump up to 2.5.9-pre
  • See full diff in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=xunit.runner.visualstudio&package-manager=nuget&previous-version=2.5.8&new-version=2.8.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow.Compression.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Sql.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj | 2 +- csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index b386ccf79c12c..df53da2098509 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -9,7 +9,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index ae6f9f1e69667..65b4ac027e29f 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -8,7 +8,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index ed158ca8656d3..cde2004e8e48d 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -8,7 +8,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index 06fb44e0a0e88..491a0c087b1cd 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -18,7 +18,7 @@ - + all runtime; build; native; contentfiles; analyzers From 131dbd60b52d595583aae3c883fbddce26199d68 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Apr 2024 19:37:41 -0700 Subject: [PATCH 042/106] MINOR: [C#] Bump ZstdSharp.Port and System.Runtime.CompilerServices.Unsafe in /csharp (#41440) Bumps [ZstdSharp.Port](https://github.com/oleg-st/ZstdSharp) and [System.Runtime.CompilerServices.Unsafe](https://github.com/dotnet/runtime). These dependencies needed to be updated together. Updates `ZstdSharp.Port` from 0.7.6 to 0.8.0
Release notes

Sourced from ZstdSharp.Port's releases.

0.8.0

Ported zstd v1.5.6
Workaround for .NET Native

Commits

Updates `System.Runtime.CompilerServices.Unsafe` from 4.7.1 to 6.0.0
Release notes

Sourced from System.Runtime.CompilerServices.Unsafe's releases.

(Release links truncated: .NET 6.0 GA, RC 1–2, and Preview 1–7; .NET 5.0.11–5.0.17. Note from the release notes: ".NET 5 is now out of support. We recommend using .NET 6.")

Commits

Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow.Compression/Apache.Arrow.Compression.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj index 0ce8c89bb1d1b..c34d880f90060 100644 --- a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj +++ b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj @@ -13,7 +13,7 @@ - + From de37ee88690fc2ca8e48341d59e7dba327d8fe2c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Apr 2024 20:12:54 -0700 Subject: [PATCH 043/106] MINOR: [C#] Bump xunit from 2.7.1 to 2.8.0 in /csharp (#41439) Bumps [xunit](https://github.com/xunit/xunit) from 2.7.1 to 2.8.0.
Commits
  • be260b3 v2.8.0
  • a8ceb66 #783: Add -useansicolor flag to console runner (v2)
  • 7b0ff93 Don't show /aggressive with unlimited threads
  • 46cdf06 Support parallel algorithm in MSBuild runner
  • b4aa876 Support multipler syntax in MSBuild runner
  • 6790b48 Add aggressive display to TestFrameworkEnvironment reported by XunitTestAssem...
  • 3dd7e91 Update mocks to make CollectionBehaviorAttribute property values optional
  • 4c82dea Asking for default threads should set 0, not null
  • d73cdef Should not try to use a semaphore when we've been asked for unlimited threads
  • 3722e54 Enable multiplier style max threads support
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=xunit&package-manager=nuget&previous-version=2.7.1&new-version=2.8.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../Apache.Arrow.Compression.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Sql.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj | 2 +- csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index df53da2098509..2b1720561004e 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -8,7 +8,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index 65b4ac027e29f..c8fb40f2d6702 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -7,7 +7,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index cde2004e8e48d..ba60451f25f68 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -7,7 +7,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index 491a0c087b1cd..90b498d4e9b03 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -17,7 +17,7 @@ - + all runtime; build; native; contentfiles; analyzers From e4f31462dbd668c3bcb6ce96442f3c1632c4d8c8 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Tue, 30 Apr 2024 06:38:40 +0200 Subject: [PATCH 044/106] GH-41317: [C++] Fix crash on invalid Parquet file (#41366) ### Rationale for this change Fixes the crash detailed in #41317 in TableBatchReader::ReadNext() on a corrupted Parquet file ### What changes are included in this PR? Add a validation that all read columns have the same size ### Are these changes tested? I've tested on the reproducer I provided in #41317 that it now triggers a clean error: ``` Traceback (most recent call last): File "test.py", line 3, in [_ for _ in parquet_file.iter_batches()] File "test.py", line 3, in [_ for _ in parquet_file.iter_batches()] File "pyarrow/_parquet.pyx", line 1587, in iter_batches File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status pyarrow.lib.ArrowInvalid: columns do not have the same size ``` I'm not sure if/how unit tests for corrupted datasets should be added ### Are there any user-facing changes? 
No **This PR contains a "Critical Fix".** * GitHub Issue: #41317 Authored-by: Even Rouault Signed-off-by: mwish --- cpp/src/arrow/table.cc | 2 ++ cpp/src/arrow/table.h | 2 ++ cpp/src/parquet/arrow/reader.cc | 10 ++++++++++ 3 files changed, 14 insertions(+) diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 967e78f6b4db1..5dc5e4c1a9a8c 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -619,6 +619,7 @@ TableBatchReader::TableBatchReader(const Table& table) for (int i = 0; i < table.num_columns(); ++i) { column_data_[i] = table.column(i).get(); } + DCHECK(table_.Validate().ok()); } TableBatchReader::TableBatchReader(std::shared_ptr table) @@ -632,6 +633,7 @@ TableBatchReader::TableBatchReader(std::shared_ptr
table) for (int i = 0; i < owned_table_->num_columns(); ++i) { column_data_[i] = owned_table_->column(i).get(); } + DCHECK(table_.Validate().ok()); } std::shared_ptr TableBatchReader::schema() const { return table_.schema(); } diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index a7508430c132b..79675fa92b1f3 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -241,6 +241,8 @@ class ARROW_EXPORT Table { /// /// The conversion is zero-copy: each record batch is a view over a slice /// of the table's columns. +/// +/// The table is expected to be valid prior to using it with the batch reader. class ARROW_EXPORT TableBatchReader : public RecordBatchReader { public: /// \brief Construct a TableBatchReader for the given table diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index d6ad7c25bc7c1..285e2a597389d 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -1043,6 +1043,16 @@ Status FileReaderImpl::GetRecordBatchReader(const std::vector& row_groups, } } + // Check all columns has same row-size + if (!columns.empty()) { + int64_t row_size = columns[0]->length(); + for (size_t i = 1; i < columns.size(); ++i) { + if (columns[i]->length() != row_size) { + return ::arrow::Status::Invalid("columns do not have the same size"); + } + } + } + auto table = ::arrow::Table::Make(batch_schema, std::move(columns)); auto table_reader = std::make_shared<::arrow::TableBatchReader>(*table); From 97e169a115bcf4e18fffd6c788f6fde648969664 Mon Sep 17 00:00:00 2001 From: Calvin Kirs Date: Tue, 30 Apr 2024 17:00:02 +0800 Subject: [PATCH 045/106] MINOR: [Java] Upgrade bcpkix-jdkon15 dependency to bcpkix-jdkon18 (#41458) ### Rationale for this change Since bcpkix-jdk15on is no longer being maintained and bcpkix-jdkon18 is fully compatible with it, we can replace bcpkix-jdk15on with bcpkix-jdkon18. This will ensure continued support and security for our applications. FYI: https://www.bouncycastle.org/latest_releases.html ### What changes are included in this PR? - Upgrade bcpkix-jdkon15 dependency to bcpkix-jdkon18 ### Are these changes tested? - yes We used the JcaPEMWriter class to convert certificates stored within a KeyStore object into PEM format and subsequently write them to a designated JcaPEMWriter object. Existing test suites provide comprehensive coverage for this functionality. 
Authored-by: Calvin Kirs
Signed-off-by: David Li
---
 java/flight/flight-sql-jdbc-core/pom.xml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/java/flight/flight-sql-jdbc-core/pom.xml b/java/flight/flight-sql-jdbc-core/pom.xml
index 2e0de90fcf8bc..ef3f2469b73dd 100644
--- a/java/flight/flight-sql-jdbc-core/pom.xml
+++ b/java/flight/flight-sql-jdbc-core/pom.xml
@@ -126,8 +126,8 @@
       org.bouncycastle
-      bcpkix-jdk15on
-      1.70
+      bcpkix-jdk18on
+      1.78.1

From b609de374c7c00e1537eb8092e1ff2db718d2b61 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Tue, 30 Apr 2024 13:42:31 +0200
Subject: [PATCH 046/106] GH-40342: [Python] Fix pickling of LocalFileSystem
 for cython 2 (#41459)

Small follow-up fix for the failure introduced by
https://github.com/apache/arrow/pull/40356

* GitHub Issue: #40342

Authored-by: Joris Van den Bossche
Signed-off-by: Joris Van den Bossche
---
 python/pyarrow/_fs.pyx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx
index 0e635b2c8a28a..dbfb6ed114553 100644
--- a/python/pyarrow/_fs.pyx
+++ b/python/pyarrow/_fs.pyx
@@ -18,6 +18,7 @@
 # cython: language_level = 3

 from cpython.datetime cimport datetime, PyDateTime_DateTime
+from cython cimport binding

 from pyarrow.includes.common cimport *
 from pyarrow.includes.libarrow_python cimport PyDateTime_to_TimePoint
@@ -421,6 +422,7 @@ cdef class FileSystem(_Weakrefable):
                 "SubTreeFileSystem")

     @staticmethod
+    @binding(True)  # Required for cython < 3
     def _from_uri(uri):
         fs, _path = FileSystem.from_uri(uri)
         return fs

From e22197f39e41446789dcc52e931995fe20a784a4 Mon Sep 17 00:00:00 2001
From: Sarah Gilmore <74676073+sgilmore10@users.noreply.github.com>
Date: Tue, 30 Apr 2024 09:41:44 -0400
Subject: [PATCH 047/106] GH-41400: [MATLAB] Bump `libmexclass` version to
 commit `ca3cea6` (#41436)

### Rationale for this change

@kevingurney and I recently resolved multiple issues related to `mathworks/libmexclass` not supporting ARM-based macOS builds (i.e. builds on `macos-14`):

- mathworks/libmexclass#76
- mathworks/libmexclass#77

We should bump the version of mathworks/libmexclass used by the MATLAB interface to the latest available commit ([ca3cea6](https://github.com/mathworks/libmexclass/commit/ca3cea6bf1ba5e9d86210bd207d643493e8d45f6) as of now) in order to enable building the MATLAB interface to Arrow on `macos-14` (which is ARM-based).

### What changes are included in this PR?

- Bumped the version of `mathworks/libmexclass` used by the MATLAB interface to [ca3cea6](https://github.com/mathworks/libmexclass/commit/ca3cea6bf1ba5e9d86210bd207d643493e8d45f6)

### Are these changes tested?

- Yes. The existing test points verify that upgrading `mathworks/libmexclass` does not break the MATLAB interface.

### Are there any user-facing changes?

- No.

### Future Directions

- #41435
- #41385

* GitHub Issue: #41400

Authored-by: Sarah Gilmore
Signed-off-by: Sarah Gilmore
---
 matlab/tools/cmake/BuildMatlabArrowInterface.cmake | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
index cb746e08b1f8e..e1641842ca8b9 100644
--- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
+++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
@@ -24,8 +24,7 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_NAME libmexclass)
 # libmexclass is accessible for CI without permission issues.
 set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_GIT_REPOSITORY "https://github.com/mathworks/libmexclass.git")
 # Use a specific Git commit hash to avoid libmexclass version changing unexpectedly.
-set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_GIT_TAG "d04f88d")
-
+set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_GIT_TAG "ca3cea6")
 set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_SOURCE_SUBDIR "libmexclass/cpp")

 # ------------------------------------------

From 0ef7351986ee8b967e210d0f9c7a9c8e4d4038fd Mon Sep 17 00:00:00 2001
From: Rossi Sun
Date: Wed, 1 May 2024 02:01:39 +0800
Subject: [PATCH 048/106] GH-41407: [C++] Use static method to fill scalar
 scratch space to prevent UB (#41421)

### Rationale for this change

In #40237, I introduced scalar scratch space filling in the concrete scalar sub-class constructors, which involves a static down-cast of `this` to the sub-class pointer. Though this is common in CRTP, here it happens in the base class constructor, and it is reported in #41407 as UB by UBSAN's "vptr" sanitizer. I'm not enough of a language lawyer to tell whether this is a true or false positive, so I proposed two approaches:

1. The easy way: add a suppression in [1], as we already do for `shared_ptr`. But this won't be feasible if the report is a true positive (a language lawyer's confirmation is needed).
2. The hard way: avoid this potential UB entirely, at the cost of more boilerplate code.

This PR is the hard way.

[1] https://github.com/apache/arrow/blob/main/r/tools/ubsan.supp

### What changes are included in this PR?

Make `FillScratchSpace` static.

### Are these changes tested?

The existing UTs should cover it well.

### Are there any user-facing changes?

None.

* GitHub Issue: #41407

Lead-authored-by: Ruoxi Sun
Co-authored-by: Rossi Sun
Co-authored-by: Benjamin Kietzman
Signed-off-by: Benjamin Kietzman
---
 cpp/src/arrow/scalar.cc |  73 +++++++++++++++-----------
 cpp/src/arrow/scalar.h  | 112 +++++++++++++++++++++++++++++++---------
 2 files changed, 130 insertions(+), 55 deletions(-)

diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc
index 8e8d3903663e4..7d8084e17c279 100644
--- a/cpp/src/arrow/scalar.cc
+++ b/cpp/src/arrow/scalar.cc
@@ -563,15 +563,17 @@ Status Scalar::ValidateFull() const {

 BaseBinaryScalar::BaseBinaryScalar(std::string s, std::shared_ptr type)
     : BaseBinaryScalar(Buffer::FromString(std::move(s)), std::move(type)) {}

-void BinaryScalar::FillScratchSpace() {
+void BinaryScalar::FillScratchSpace(uint8_t* scratch_space,
+                                    const std::shared_ptr& value) {
   FillScalarScratchSpace(
-      scratch_space_,
+      scratch_space,
       {int32_t(0), value ? static_cast(value->size()) : int32_t(0)});
 }

-void BinaryViewScalar::FillScratchSpace() {
+void BinaryViewScalar::FillScratchSpace(uint8_t* scratch_space,
+                                        const std::shared_ptr& value) {
   static_assert(sizeof(BinaryViewType::c_type) <= internal::kScalarScratchSpaceSize);
-  auto* view = new (&scratch_space_) BinaryViewType::c_type;
+  auto* view = new (scratch_space) BinaryViewType::c_type;
   if (value) {
     *view = util::ToBinaryView(std::string_view{*value}, 0, 0);
   } else {
@@ -579,9 +581,10 @@ void BinaryViewScalar::FillScratchSpace() {
   }
 }

-void LargeBinaryScalar::FillScratchSpace() {
+void LargeBinaryScalar::FillScratchSpace(uint8_t* scratch_space,
+                                         const std::shared_ptr& value) {
   FillScalarScratchSpace(
-      scratch_space_,
+      scratch_space,
       {int64_t(0), value ?
static_cast(value->size()) : int64_t(0)}); } @@ -612,36 +615,40 @@ BaseListScalar::BaseListScalar(std::shared_ptr value, } ListScalar::ListScalar(std::shared_ptr value, bool is_valid) - : BaseListScalar(value, list(value->type()), is_valid) {} + : ListScalar(value, list(value->type()), is_valid) {} -void ListScalar::FillScratchSpace() { +void ListScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { FillScalarScratchSpace( - scratch_space_, + scratch_space, {int32_t(0), value ? static_cast(value->length()) : int32_t(0)}); } LargeListScalar::LargeListScalar(std::shared_ptr value, bool is_valid) - : BaseListScalar(value, large_list(value->type()), is_valid) {} + : LargeListScalar(value, large_list(value->type()), is_valid) {} -void LargeListScalar::FillScratchSpace() { - FillScalarScratchSpace(scratch_space_, +void LargeListScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { + FillScalarScratchSpace(scratch_space, {int64_t(0), value ? value->length() : int64_t(0)}); } ListViewScalar::ListViewScalar(std::shared_ptr value, bool is_valid) - : BaseListScalar(value, list_view(value->type()), is_valid) {} + : ListViewScalar(value, list_view(value->type()), is_valid) {} -void ListViewScalar::FillScratchSpace() { +void ListViewScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { FillScalarScratchSpace( - scratch_space_, + scratch_space, {int32_t(0), value ? static_cast(value->length()) : int32_t(0)}); } LargeListViewScalar::LargeListViewScalar(std::shared_ptr value, bool is_valid) - : BaseListScalar(value, large_list_view(value->type()), is_valid) {} + : LargeListViewScalar(value, large_list_view(value->type()), is_valid) {} -void LargeListViewScalar::FillScratchSpace() { - FillScalarScratchSpace(scratch_space_, +void LargeListViewScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { + FillScalarScratchSpace(scratch_space, {int64_t(0), value ? value->length() : int64_t(0)}); } @@ -652,11 +659,12 @@ inline std::shared_ptr MakeMapType(const std::shared_ptr& pa } MapScalar::MapScalar(std::shared_ptr value, bool is_valid) - : BaseListScalar(value, MakeMapType(value->type()), is_valid) {} + : MapScalar(value, MakeMapType(value->type()), is_valid) {} -void MapScalar::FillScratchSpace() { +void MapScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { FillScalarScratchSpace( - scratch_space_, + scratch_space, {int32_t(0), value ? 
static_cast(value->length()) : int32_t(0)}); } @@ -705,7 +713,9 @@ Result> StructScalar::field(FieldRef ref) const { RunEndEncodedScalar::RunEndEncodedScalar(std::shared_ptr value, std::shared_ptr type) - : Scalar{std::move(type), value->is_valid}, value{std::move(value)} { + : Scalar{std::move(type), value->is_valid}, + ArraySpanFillFromScalarScratchSpace(*this->type), + value{std::move(value)} { ARROW_CHECK_EQ(this->type->id(), Type::RUN_END_ENCODED); } @@ -716,18 +726,18 @@ RunEndEncodedScalar::RunEndEncodedScalar(const std::shared_ptr& type) RunEndEncodedScalar::~RunEndEncodedScalar() = default; -void RunEndEncodedScalar::FillScratchSpace() { - auto run_end = run_end_type()->id(); +void RunEndEncodedScalar::FillScratchSpace(uint8_t* scratch_space, const DataType& type) { + Type::type run_end = checked_cast(type).run_end_type()->id(); switch (run_end) { case Type::INT16: - FillScalarScratchSpace(scratch_space_, {int16_t(1)}); + FillScalarScratchSpace(scratch_space, {int16_t(1)}); break; case Type::INT32: - FillScalarScratchSpace(scratch_space_, {int32_t(1)}); + FillScalarScratchSpace(scratch_space, {int32_t(1)}); break; default: DCHECK_EQ(run_end, Type::INT64); - FillScalarScratchSpace(scratch_space_, {int64_t(1)}); + FillScalarScratchSpace(scratch_space, {int64_t(1)}); } } @@ -806,6 +816,7 @@ Result TimestampScalar::FromISO8601(std::string_view iso8601, SparseUnionScalar::SparseUnionScalar(ValueType value, int8_t type_code, std::shared_ptr type) : UnionScalar(std::move(type), type_code, /*is_valid=*/true), + ArraySpanFillFromScalarScratchSpace(type_code), value(std::move(value)) { const auto child_ids = checked_cast(*this->type).child_ids(); if (type_code >= 0 && static_cast(type_code) < child_ids.size() && @@ -833,13 +844,13 @@ std::shared_ptr SparseUnionScalar::FromValue(std::shared_ptr val return std::make_shared(field_values, type_code, std::move(type)); } -void SparseUnionScalar::FillScratchSpace() { - auto* union_scratch_space = reinterpret_cast(&scratch_space_); +void SparseUnionScalar::FillScratchSpace(uint8_t* scratch_space, int8_t type_code) { + auto* union_scratch_space = reinterpret_cast(scratch_space); union_scratch_space->type_code = type_code; } -void DenseUnionScalar::FillScratchSpace() { - auto* union_scratch_space = reinterpret_cast(&scratch_space_); +void DenseUnionScalar::FillScratchSpace(uint8_t* scratch_space, int8_t type_code) { + auto* union_scratch_space = reinterpret_cast(scratch_space); union_scratch_space->type_code = type_code; FillScalarScratchSpace(union_scratch_space->offsets, {int32_t(0), int32_t(1)}); } diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index a7ee6a417d9a1..982a4c5113c92 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -141,7 +141,12 @@ struct ARROW_EXPORT ArraySpanFillFromScalarScratchSpace { alignas(int64_t) mutable uint8_t scratch_space_[kScalarScratchSpaceSize]; private: - ArraySpanFillFromScalarScratchSpace() { static_cast(this)->FillScratchSpace(); } + template + explicit ArraySpanFillFromScalarScratchSpace(Args&&... 
args) { + Impl::FillScratchSpace(scratch_space_, std::forward(args)...); + } + + ArraySpanFillFromScalarScratchSpace() = delete; friend Impl; }; @@ -278,20 +283,32 @@ struct ARROW_EXPORT BaseBinaryScalar : public internal::PrimitiveScalarBase { struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar, private internal::ArraySpanFillFromScalarScratchSpace { - using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = BinaryType; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + explicit BinaryScalar(std::shared_ptr type) + : BaseBinaryScalar(std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + + BinaryScalar(std::shared_ptr value, std::shared_ptr type) + : BaseBinaryScalar(std::move(value), std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + + BinaryScalar(std::string s, std::shared_ptr type) + : BaseBinaryScalar(std::move(s), std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit BinaryScalar(std::shared_ptr value) : BinaryScalar(std::move(value), binary()) {} - explicit BinaryScalar(std::string s) : BaseBinaryScalar(std::move(s), binary()) {} + explicit BinaryScalar(std::string s) : BinaryScalar(std::move(s), binary()) {} BinaryScalar() : BinaryScalar(binary()) {} private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -312,23 +329,35 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar { struct ARROW_EXPORT BinaryViewScalar : public BaseBinaryScalar, private internal::ArraySpanFillFromScalarScratchSpace { - using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = BinaryViewType; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + explicit BinaryViewScalar(std::shared_ptr type) + : BaseBinaryScalar(std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + + BinaryViewScalar(std::shared_ptr value, std::shared_ptr type) + : BaseBinaryScalar(std::move(value), std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + + BinaryViewScalar(std::string s, std::shared_ptr type) + : BaseBinaryScalar(std::move(s), std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit BinaryViewScalar(std::shared_ptr value) : BinaryViewScalar(std::move(value), binary_view()) {} explicit BinaryViewScalar(std::string s) - : BaseBinaryScalar(std::move(s), binary_view()) {} + : BinaryViewScalar(std::move(s), binary_view()) {} BinaryViewScalar() : BinaryViewScalar(binary_view()) {} std::string_view view() const override { return std::string_view(*this->value); } private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -350,24 +379,33 @@ struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar { struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar, private internal::ArraySpanFillFromScalarScratchSpace { - using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = LargeBinaryType; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + explicit LargeBinaryScalar(std::shared_ptr type) + : BaseBinaryScalar(std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + LargeBinaryScalar(std::shared_ptr value, std::shared_ptr type) - : BaseBinaryScalar(std::move(value), 
std::move(type)) {} + : BaseBinaryScalar(std::move(value), std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + + LargeBinaryScalar(std::string s, std::shared_ptr type) + : BaseBinaryScalar(std::move(s), std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} explicit LargeBinaryScalar(std::shared_ptr value) : LargeBinaryScalar(std::move(value), large_binary()) {} explicit LargeBinaryScalar(std::string s) - : BaseBinaryScalar(std::move(s), large_binary()) {} + : LargeBinaryScalar(std::move(s), large_binary()) {} LargeBinaryScalar() : LargeBinaryScalar(large_binary()) {} private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -550,14 +588,19 @@ struct ARROW_EXPORT ListScalar : public BaseListScalar, private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = ListType; - using BaseListScalar::BaseListScalar; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + ListScalar(std::shared_ptr value, std::shared_ptr type, + bool is_valid = true) + : BaseListScalar(std::move(value), std::move(type), is_valid), + ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit ListScalar(std::shared_ptr value, bool is_valid = true); private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -567,14 +610,19 @@ struct ARROW_EXPORT LargeListScalar : public BaseListScalar, private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = LargeListType; - using BaseListScalar::BaseListScalar; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + LargeListScalar(std::shared_ptr value, std::shared_ptr type, + bool is_valid = true) + : BaseListScalar(std::move(value), std::move(type), is_valid), + ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit LargeListScalar(std::shared_ptr value, bool is_valid = true); private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -584,14 +632,19 @@ struct ARROW_EXPORT ListViewScalar : public BaseListScalar, private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = ListViewType; - using BaseListScalar::BaseListScalar; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + ListViewScalar(std::shared_ptr value, std::shared_ptr type, + bool is_valid = true) + : BaseListScalar(std::move(value), std::move(type), is_valid), + ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit ListViewScalar(std::shared_ptr value, bool is_valid = true); private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -601,14 +654,19 @@ struct ARROW_EXPORT LargeListViewScalar : public BaseListScalar, private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = LargeListViewType; - using BaseListScalar::BaseListScalar; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + LargeListViewScalar(std::shared_ptr value, std::shared_ptr type, + bool is_valid = true) + : BaseListScalar(std::move(value), std::move(type), is_valid), + 
ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit LargeListViewScalar(std::shared_ptr value, bool is_valid = true); private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -618,14 +676,19 @@ struct ARROW_EXPORT MapScalar : public BaseListScalar, private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = MapType; - using BaseListScalar::BaseListScalar; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + MapScalar(std::shared_ptr value, std::shared_ptr type, + bool is_valid = true) + : BaseListScalar(std::move(value), std::move(type), is_valid), + ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit MapScalar(std::shared_ptr value, bool is_valid = true); private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -707,7 +770,7 @@ struct ARROW_EXPORT SparseUnionScalar std::shared_ptr type); private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, int8_t type_code); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -733,10 +796,11 @@ struct ARROW_EXPORT DenseUnionScalar DenseUnionScalar(ValueType value, int8_t type_code, std::shared_ptr type) : UnionScalar(std::move(type), type_code, value->is_valid), + ArraySpanFillFromScalarScratchSpace(type_code), value(std::move(value)) {} private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, int8_t type_code); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -772,7 +836,7 @@ struct ARROW_EXPORT RunEndEncodedScalar private: const TypeClass& ree_type() const { return internal::checked_cast(*type); } - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, const DataType& type); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; From 5e986be59f08135d2fdaeb819c87120b0bf7436a Mon Sep 17 00:00:00 2001 From: ZhangHuiGui <106943008+ZhangHuiGui@users.noreply.github.com> Date: Wed, 1 May 2024 06:20:04 +0800 Subject: [PATCH 049/106] GH-41183: [C++][Python] Expose recursive flatten for lists on list_flatten kernel function and pyarrow bindings (#41295) ### Rationale for this change Expose recursive flatten for logical lists on list_flatten kernel function and pyarrow bindings. ### What changes are included in this PR? 1. Expose recursive flatten for logical lists on `list_flatten` kernel function 2. Support [Large]ListView for some kernel functions: `list_flatten`,`list_value_length`, `list_element` 3. Support recursive flatten for pyarrow bindinds and simplify [Large]ListView's pyarrow bindings 4. Refactor vector_nested_test.cc for better support [Large]ListView types. ### Are these changes tested? Yes ### Are there any user-facing changes? Yes. 1. Some kernel functions like: list_flatten, list_value_length, list_element would support [Large]ListView types 2. `list_flatten` and related pyarrow bindings could support flatten recursively with an ListFlattenOptions. 
* GitHub Issue: #41183 Lead-authored-by: ZhangHuiGui Co-authored-by: ZhangHuiGui <2689496754@qq.com> Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/compute/api_vector.cc | 7 + cpp/src/arrow/compute/api_vector.h | 12 + .../arrow/compute/kernels/codegen_internal.cc | 21 +- .../arrow/compute/kernels/codegen_internal.h | 3 +- .../arrow/compute/kernels/scalar_nested.cc | 49 +++- .../compute/kernels/scalar_nested_test.cc | 17 +- .../arrow/compute/kernels/vector_nested.cc | 54 +++-- .../compute/kernels/vector_nested_test.cc | 129 +++++++++-- python/pyarrow/_compute.pyx | 20 ++ python/pyarrow/array.pxi | 215 +++++++----------- python/pyarrow/compute.py | 1 + python/pyarrow/includes/libarrow.pxd | 5 + python/pyarrow/lib.pxd | 4 +- python/pyarrow/tests/test_array.py | 8 +- python/pyarrow/tests/test_compute.py | 1 + 15 files changed, 364 insertions(+), 182 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index d47ee42ebf239..f0d5c0fcc3d72 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -153,6 +153,8 @@ static auto kRankOptionsType = GetFunctionOptionsType( DataMember("tiebreaker", &RankOptions::tiebreaker)); static auto kPairwiseOptionsType = GetFunctionOptionsType( DataMember("periods", &PairwiseOptions::periods)); +static auto kListFlattenOptionsType = GetFunctionOptionsType( + DataMember("recursive", &ListFlattenOptions::recursive)); } // namespace } // namespace internal @@ -224,6 +226,10 @@ PairwiseOptions::PairwiseOptions(int64_t periods) : FunctionOptions(internal::kPairwiseOptionsType), periods(periods) {} constexpr char PairwiseOptions::kTypeName[]; +ListFlattenOptions::ListFlattenOptions(bool recursive) + : FunctionOptions(internal::kListFlattenOptionsType), recursive(recursive) {} +constexpr char ListFlattenOptions::kTypeName[]; + namespace internal { void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kFilterOptionsType)); @@ -237,6 +243,7 @@ void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kCumulativeOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kRankOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kPairwiseOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kListFlattenOptionsType)); } } // namespace internal diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 919572f16ee69..e5bcc37329661 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -245,6 +245,18 @@ class ARROW_EXPORT PairwiseOptions : public FunctionOptions { int64_t periods = 1; }; +/// \brief Options for list_flatten function +class ARROW_EXPORT ListFlattenOptions : public FunctionOptions { + public: + explicit ListFlattenOptions(bool recursive = false); + static constexpr char const kTypeName[] = "ListFlattenOptions"; + static ListFlattenOptions Defaults() { return ListFlattenOptions(); } + + /// \brief If true, the list is flattened recursively until a non-list + /// array is formed. 
+ bool recursive = false; +}; + /// @} /// \brief Filter with a boolean selection filter diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc index 00a833742f957..0fd9cae7a8d71 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc @@ -23,6 +23,7 @@ #include #include +#include "arrow/compute/api_vector.h" #include "arrow/type_fwd.h" namespace arrow { @@ -56,9 +57,23 @@ Result LastType(KernelContext*, const std::vector& types return types.back(); } -Result ListValuesType(KernelContext*, const std::vector& args) { - const auto& list_type = checked_cast(*args[0].type); - return list_type.value_type().get(); +Result ListValuesType(KernelContext* ctx, + const std::vector& args) { + auto list_type = checked_cast(args[0].type); + auto value_type = list_type->value_type().get(); + + auto recursive = + ctx->state() ? OptionsWrapper::Get(ctx).recursive : false; + if (!recursive) { + return value_type; + } + + for (auto value_kind = value_type->id(); + is_list(value_kind) || is_list_view(value_kind); value_kind = value_type->id()) { + list_type = checked_cast(list_type->value_type().get()); + value_type = list_type->value_type().get(); + } + return value_type; } void EnsureDictionaryDecoded(std::vector* types) { diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 097ee1de45b6a..9e46a21887f8c 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -423,7 +423,8 @@ static void VisitTwoArrayValuesInline(const ArraySpan& arr0, const ArraySpan& ar Result FirstType(KernelContext*, const std::vector& types); Result LastType(KernelContext*, const std::vector& types); -Result ListValuesType(KernelContext*, const std::vector& types); +Result ListValuesType(KernelContext* ctx, + const std::vector& types); // ---------------------------------------------------------------------- // Helpers for iterating over common DataType instances for adding kernels to diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 733ab9c0dc287..b99f065a0b158 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -23,6 +23,7 @@ #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" +#include "arrow/type_fwd.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_generate.h" @@ -41,10 +42,17 @@ Status ListValueLength(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou const ArraySpan& arr = batch[0].array; ArraySpan* out_arr = out->array_span_mutable(); auto out_values = out_arr->GetValues(1); - const offset_type* offsets = arr.GetValues(1); - // Offsets are always well-defined and monotonic, even for null values - for (int64_t i = 0; i < arr.length; ++i) { - *out_values++ = offsets[i + 1] - offsets[i]; + if (is_list_view(*arr.type)) { + const auto* sizes = arr.GetValues(2); + if (arr.length > 0) { + memcpy(out_values, sizes, arr.length * sizeof(offset_type)); + } + } else { + const offset_type* offsets = arr.GetValues(1); + // Offsets are always well-defined and monotonic, even for null values + for (int64_t i = 0; i < arr.length; ++i) { + *out_values++ = offsets[i + 1] - offsets[i]; + } } return Status::OK(); } @@ -59,6 +67,30 @@ Status 
FixedSizeListValueLength(KernelContext* ctx, const ExecSpan& batch, return Status::OK(); } +template +void AddListValueLengthKernel(ScalarFunction* func, + const std::shared_ptr& out_type) { + auto in_type = {InputType(InListType::type_id)}; + ScalarKernel kernel(in_type, out_type, ListValueLength); + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +template <> +void AddListValueLengthKernel( + ScalarFunction* func, const std::shared_ptr& out_type) { + auto in_type = {InputType(Type::FIXED_SIZE_LIST)}; + ScalarKernel kernel(in_type, out_type, FixedSizeListValueLength); + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +void AddListValueLengthKernels(ScalarFunction* func) { + AddListValueLengthKernel(func, int32()); + AddListValueLengthKernel(func, int64()); + AddListValueLengthKernel(func, int32()); + AddListValueLengthKernel(func, int64()); + AddListValueLengthKernel(func, int32()); +} + const FunctionDoc list_value_length_doc{ "Compute list lengths", ("`lists` must have a list-like type.\n" @@ -399,6 +431,8 @@ void AddListElementKernels(ScalarFunction* func) { void AddListElementKernels(ScalarFunction* func) { AddListElementKernels(func); AddListElementKernels(func); + AddListElementKernels(func); + AddListElementKernels(func); AddListElementKernels(func); } @@ -824,12 +858,7 @@ const FunctionDoc map_lookup_doc{ void RegisterScalarNested(FunctionRegistry* registry) { auto list_value_length = std::make_shared( "list_value_length", Arity::Unary(), list_value_length_doc); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::LIST)}, int32(), - ListValueLength)); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::FIXED_SIZE_LIST)}, int32(), - FixedSizeListValueLength)); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::LARGE_LIST)}, int64(), - ListValueLength)); + AddListValueLengthKernels(list_value_length.get()); DCHECK_OK(registry->AddFunction(std::move(list_value_length))); auto list_element = diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index a72ec99620b82..32bea8246954d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -30,11 +30,21 @@ namespace arrow { namespace compute { static std::shared_ptr GetOffsetType(const DataType& type) { - return type.id() == Type::LIST ? 
int32() : int64(); + switch (type.id()) { + case Type::LIST: + case Type::LIST_VIEW: + return int32(); + case Type::LARGE_LIST: + case Type::LARGE_LIST_VIEW: + return int64(); + default: + Unreachable("Unexpected type"); + } } TEST(TestScalarNested, ListValueLength) { - for (auto ty : {list(int32()), large_list(int32())}) { + for (auto ty : {list(int32()), large_list(int32()), list_view(int32()), + large_list_view(int32())}) { CheckScalarUnary("list_value_length", ty, "[[0, null, 1], null, [2, 3], []]", GetOffsetType(*ty), "[3, null, 2, 0]"); } @@ -47,7 +57,8 @@ TEST(TestScalarNested, ListValueLength) { TEST(TestScalarNested, ListElementNonFixedListWithNulls) { auto sample = "[[7, 5, 81], [6, null, 4, 7, 8], [3, 12, 2, 0], [1, 9], null]"; for (auto ty : NumericTypes()) { - for (auto list_type : {list(ty), large_list(ty)}) { + for (auto list_type : + {list(ty), large_list(ty), list_view(ty), large_list_view(ty)}) { auto input = ArrayFromJSON(list_type, sample); auto null_input = ArrayFromJSON(list_type, "[null]"); for (auto index_type : IntTypes()) { diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc index 08930e589f7b4..8c77c261c6a98 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -18,6 +18,7 @@ // Vector kernels involving nested types #include "arrow/array/array_base.h" +#include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" #include "arrow/visit_type_inline.h" @@ -29,8 +30,13 @@ namespace { template Status ListFlatten(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + auto recursive = OptionsWrapper::Get(ctx).recursive; typename TypeTraits::ArrayType list_array(batch[0].array.ToArrayData()); - ARROW_ASSIGN_OR_RAISE(auto result, list_array.Flatten(ctx->memory_pool())); + + auto pool = ctx->memory_pool(); + ARROW_ASSIGN_OR_RAISE(auto result, (recursive ? list_array.FlattenRecursively(pool) + : list_array.Flatten(pool))); + out->value = std::move(result->data()); return Status::OK(); } @@ -107,10 +113,15 @@ struct ListParentIndicesArray { const FunctionDoc list_flatten_doc( "Flatten list values", - ("`lists` must have a list-like type.\n" - "Return an array with the top list level flattened.\n" - "Top-level null values in `lists` do not emit anything in the input."), - {"lists"}); + ("`lists` must have a list-like type (lists, list-views, and\n" + "fixed-size lists).\n" + "Return an array with the top list level flattened unless\n" + "`recursive` is set to true in ListFlattenOptions. 
When that\n" + "is that case, flattening happens recursively until a non-list\n" + "array is formed.\n" + "\n" + "Null list values do not emit anything to the output."), + {"lists"}, "ListFlattenOptions"); const FunctionDoc list_parent_indices_doc( "Compute parent indices of nested list values", @@ -153,17 +164,34 @@ class ListParentIndicesFunction : public MetaFunction { } }; +const ListFlattenOptions* GetDefaultListFlattenOptions() { + static const auto kDefaultListFlattenOptions = ListFlattenOptions::Defaults(); + return &kDefaultListFlattenOptions; +} + +template +void AddBaseListFlattenKernels(VectorFunction* func) { + auto in_type = {InputType(InListType::type_id)}; + auto out_type = OutputType(ListValuesType); + VectorKernel kernel(in_type, out_type, ListFlatten, + OptionsWrapper::Init); + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +void AddBaseListFlattenKernels(VectorFunction* func) { + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); +} + } // namespace void RegisterVectorNested(FunctionRegistry* registry) { - auto flatten = - std::make_shared("list_flatten", Arity::Unary(), list_flatten_doc); - DCHECK_OK(flatten->AddKernel({Type::LIST}, OutputType(ListValuesType), - ListFlatten)); - DCHECK_OK(flatten->AddKernel({Type::FIXED_SIZE_LIST}, OutputType(ListValuesType), - ListFlatten)); - DCHECK_OK(flatten->AddKernel({Type::LARGE_LIST}, OutputType(ListValuesType), - ListFlatten)); + auto flatten = std::make_shared( + "list_flatten", Arity::Unary(), list_flatten_doc, GetDefaultListFlattenOptions()); + AddBaseListFlattenKernels(flatten.get()); DCHECK_OK(registry->AddFunction(std::move(flatten))); DCHECK_OK(registry->AddFunction(std::make_shared())); diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc b/cpp/src/arrow/compute/kernels/vector_nested_test.cc index eef1b6835ffb5..56604ebd16cc0 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc @@ -19,6 +19,7 @@ #include "arrow/chunked_array.h" #include "arrow/compute/api.h" +#include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/test_util.h" #include "arrow/result.h" #include "arrow/testing/gtest_util.h" @@ -29,38 +30,113 @@ namespace compute { using arrow::internal::checked_cast; -TEST(TestVectorNested, ListFlatten) { - for (auto ty : {list(int16()), large_list(int16())}) { - auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], []]"); - auto expected = ArrayFromJSON(int16(), "[0, null, 1, 2, 3]"); +using ListAndListViewTypes = + ::testing::Types; + +// ---------------------------------------------------------------------- +// [Large]List and [Large]ListView tests +template +class TestVectorNestedSpecialized : public ::testing::Test { + public: + using TypeClass = T; + + void SetUp() override { + value_type_ = int16(); + type_ = std::make_shared(value_type_); + } + + public: + void TestListFlatten() { + auto input = ArrayFromJSON(type_, "[[0, null, 1], null, [2, 3], []]"); + auto expected = ArrayFromJSON(value_type_, "[0, null, 1, 2, 3]"); CheckVectorUnary("list_flatten", input, expected); // Construct a list with a non-empty null slot auto tweaked = TweakValidityBit(input, 0, false); - expected = ArrayFromJSON(int16(), "[2, 3]"); + expected = ArrayFromJSON(value_type_, "[2, 3]"); CheckVectorUnary("list_flatten", tweaked, expected); } -} -TEST(TestVectorNested, ListFlattenNulls) { - const auto ty = 
list(int32()); - auto input = ArrayFromJSON(ty, "[null, null]"); - auto expected = ArrayFromJSON(int32(), "[]"); - CheckVectorUnary("list_flatten", input, expected); -} + void TestListFlattenNulls() { + value_type_ = int32(); + type_ = std::make_shared(value_type_); + auto input = ArrayFromJSON(type_, "[null, null]"); + auto expected = ArrayFromJSON(value_type_, "[]"); + CheckVectorUnary("list_flatten", input, expected); + } -TEST(TestVectorNested, ListFlattenChunkedArray) { - for (auto ty : {list(int16()), large_list(int16())}) { - ARROW_SCOPED_TRACE(ty->ToString()); - auto input = ChunkedArrayFromJSON(ty, {"[[0, null, 1], null]", "[[2, 3], []]"}); - auto expected = ChunkedArrayFromJSON(int16(), {"[0, null, 1]", "[2, 3]"}); + void TestListFlattenChunkedArray() { + ARROW_SCOPED_TRACE(type_->ToString()); + auto input = ChunkedArrayFromJSON(type_, {"[[0, null, 1], null]", "[[2, 3], []]"}); + auto expected = ChunkedArrayFromJSON(value_type_, {"[0, null, 1]", "[2, 3]"}); CheckVectorUnary("list_flatten", input, expected); ARROW_SCOPED_TRACE("empty"); - input = ChunkedArrayFromJSON(ty, {}); - expected = ChunkedArrayFromJSON(int16(), {}); + input = ChunkedArrayFromJSON(type_, {}); + expected = ChunkedArrayFromJSON(value_type_, {}); CheckVectorUnary("list_flatten", input, expected); } + + void TestListFlattenRecursively() { + auto inner_type = std::make_shared(value_type_); + type_ = std::make_shared(inner_type); + + ListFlattenOptions opts; + opts.recursive = true; + + // List types with two nesting levels: list> + auto input = ArrayFromJSON(type_, R"([ + [[0, 1, 2], null, [3, null]], + [null], + [[2, 9], [4], [], [6, 5]] + ])"); + auto expected = ArrayFromJSON(value_type_, "[0, 1, 2, 3, null, 2, 9, 4, 6, 5]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + + // Empty nested list should flatten until non-list type is reached + input = ArrayFromJSON(type_, R"([null])"); + expected = ArrayFromJSON(value_type_, "[]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + + // List types with three nesting levels: list>> + type_ = std::make_shared(std::make_shared(fixed_size_list(value_type_, 2))); + input = ArrayFromJSON(type_, R"([ + [ + [[null, 0]], + [[3, 7], null] + ], + [ + [[4, null], [5, 8]], + [[8, null]], + null + ], + [ + null + ] + ])"); + expected = ArrayFromJSON(value_type_, "[null, 0, 3, 7, 4, null, 5, 8, 8, null]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + } + + protected: + std::shared_ptr type_; + std::shared_ptr value_type_; +}; + +TYPED_TEST_SUITE(TestVectorNestedSpecialized, ListAndListViewTypes); + +TYPED_TEST(TestVectorNestedSpecialized, ListFlatten) { this->TestListFlatten(); } + +TYPED_TEST(TestVectorNestedSpecialized, ListFlattenNulls) { + this->TestListFlattenNulls(); +} + +TYPED_TEST(TestVectorNestedSpecialized, ListFlattenChunkedArray) { + this->TestListFlattenChunkedArray(); +} + +TYPED_TEST(TestVectorNestedSpecialized, ListFlattenRecursively) { + this->TestListFlattenRecursively(); } TEST(TestVectorNested, ListFlattenFixedSizeList) { @@ -92,6 +168,21 @@ TEST(TestVectorNested, ListFlattenFixedSizeListNulls) { CheckVectorUnary("list_flatten", input, expected); } +TEST(TestVectorNested, ListFlattenFixedSizeListRecursively) { + ListFlattenOptions opts; + opts.recursive = true; + + auto inner_type = fixed_size_list(int32(), 2); + auto type = fixed_size_list(inner_type, 2); + auto input = ArrayFromJSON(type, R"([ + [[0, 1], [null, 3]], + [[7, null], [2, 5]], + [null, null] + ])"); + auto expected = ArrayFromJSON(int32(), "[0, 1, 
null, 3, 7, null, 2, 5]"); + CheckVectorUnary("list_flatten", input, expected, &opts); +} + TEST(TestVectorNested, ListParentIndices) { for (auto ty : {list(int16()), large_list(int16())}) { auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]"); diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index a267d53599436..44a3d5e740701 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2035,6 +2035,26 @@ class PairwiseOptions(_PairwiseOptions): self._set_options(period) +cdef class _ListFlattenOptions(FunctionOptions): + def _set_options(self, recursive): + self.wrapped.reset(new CListFlattenOptions(recursive)) + + +class ListFlattenOptions(_ListFlattenOptions): + """ + Options for `list_flatten` function + + Parameters + ---------- + recursive : bool, default False + When True, the list array is flattened recursively until an array + of non-list values is formed. + """ + + def __init__(self, recursive=False): + self._set_options(recursive) + + cdef class _ArraySortOptions(FunctionOptions): def _set_options(self, order, null_placement): self.wrapped.reset(new CArraySortOptions( diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 60fc09ea861b6..6a11b19ffcdf5 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2141,22 +2141,99 @@ cdef class Decimal256Array(FixedSizeBinaryArray): cdef class BaseListArray(Array): - def flatten(self): + def flatten(self, recursive=False): """ - Unnest this ListArray/LargeListArray by one level. - - The returned Array is logically a concatenation of all the sub-lists - in this Array. + Unnest this [Large]ListArray/[Large]ListViewArray/FixedSizeListArray + according to 'recursive'. Note that this method is different from ``self.values`` in that it takes care of the slicing offset as well as null elements backed by non-empty sub-lists. + Parameters + ---------- + recursive : bool, default False, optional + When True, flatten this logical list-array recursively until an + array of non-list values is formed. + + When False, flatten only the top level. + Returns ------- result : Array + + Examples + -------- + + Basic logical list-array's flatten + >>> import pyarrow as pa + >>> values = [1, 2, 3, 4] + >>> offsets = [2, 1, 0] + >>> sizes = [2, 2, 2] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 3, + 4 + ], + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] + >>> array.flatten() + + [ + 3, + 4, + 2, + 3, + 1, + 2 + ] + + When recursive=True, nested list arrays are flattened recursively + until an array of non-list values is formed. + + >>> array = pa.array([ + ... None, + ... [ + ... [1, None, 2], + ... None, + ... [3, 4] + ... ], + ... [], + ... [ + ... [], + ... [5, 6], + ... None + ... ], + ... [ + ... [7, 8] + ... ] + ... ], type=pa.list_(pa.list_(pa.int64()))) + >>> array.flatten(True) + + [ + 1, + null, + 2, + 3, + 4, + 5, + 6, + 7, + 8 + ] """ - return _pc().list_flatten(self) + options = _pc().ListFlattenOptions(recursive) + return _pc().list_flatten(self, options=options) def value_parent_indices(self): """ @@ -2527,7 +2604,7 @@ cdef class LargeListArray(BaseListArray): return pyarrow_wrap_array(( self.ap).offsets()) -cdef class ListViewArray(Array): +cdef class ListViewArray(BaseListArray): """ Concrete class for Arrow arrays of a list view data type. 
""" @@ -2747,69 +2824,8 @@ cdef class ListViewArray(Array): """ return pyarrow_wrap_array(( self.ap).sizes()) - def flatten(self, memory_pool=None): - """ - Unnest this ListViewArray by one level. - - The returned Array is logically a concatenation of all the sub-lists - in this Array. - - Note that this method is different from ``self.values`` in that - it takes care of the slicing offset as well as null elements backed - by non-empty sub-lists. - - Parameters - ---------- - memory_pool : MemoryPool, optional - - Returns - ------- - result : Array - - Examples - -------- - >>> import pyarrow as pa - >>> values = [1, 2, 3, 4] - >>> offsets = [2, 1, 0] - >>> sizes = [2, 2, 2] - >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) - >>> array - - [ - [ - 3, - 4 - ], - [ - 2, - 3 - ], - [ - 1, - 2 - ] - ] - >>> array.flatten() - - [ - 3, - 4, - 2, - 3, - 1, - 2 - ] - """ - cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) - with nogil: - out = GetResultValue(( self.ap).Flatten(cpool)) - cdef Array result = pyarrow_wrap_array(out) - result.validate() - return result - - -cdef class LargeListViewArray(Array): +cdef class LargeListViewArray(BaseListArray): """ Concrete class for Arrow arrays of a large list view data type. @@ -3037,67 +3053,6 @@ cdef class LargeListViewArray(Array): """ return pyarrow_wrap_array(( self.ap).sizes()) - def flatten(self, memory_pool=None): - """ - Unnest this LargeListViewArray by one level. - - The returned Array is logically a concatenation of all the sub-lists - in this Array. - - Note that this method is different from ``self.values`` in that - it takes care of the slicing offset as well as null elements backed - by non-empty sub-lists. - - Parameters - ---------- - memory_pool : MemoryPool, optional - - Returns - ------- - result : Array - - Examples - -------- - - >>> import pyarrow as pa - >>> values = [1, 2, 3, 4] - >>> offsets = [2, 1, 0] - >>> sizes = [2, 2, 2] - >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) - >>> array - - [ - [ - 3, - 4 - ], - [ - 2, - 3 - ], - [ - 1, - 2 - ] - ] - >>> array.flatten() - - [ - 3, - 4, - 2, - 3, - 1, - 2 - ] - """ - cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) - with nogil: - out = GetResultValue(( self.ap).Flatten(cpool)) - cdef Array result = pyarrow_wrap_array(out) - result.validate() - return result - cdef class MapArray(ListArray): """ diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 205ab393b8b09..83612f66d21e2 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -44,6 +44,7 @@ IndexOptions, JoinOptions, ListSliceOptions, + ListFlattenOptions, MakeStructOptions, MapLookupOptions, MatchSubstringOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 6dae45ab80b1c..f461513e8b3cf 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2589,6 +2589,11 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: CPairwiseOptions(int64_t period) int64_t period + cdef cppclass CListFlattenOptions\ + "arrow::compute::ListFlattenOptions"(CFunctionOptions): + CListFlattenOptions(c_bool recursive) + c_bool recursive + cdef cppclass CArraySortOptions \ "arrow::compute::ArraySortOptions"(CFunctionOptions): CArraySortOptions(CSortOrder, CNullPlacement) diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index b1187a77c2a6e..bfd266a807c40 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ 
-437,11 +437,11 @@ cdef class LargeListArray(BaseListArray): pass -cdef class ListViewArray(Array): +cdef class ListViewArray(BaseListArray): pass -cdef class LargeListViewArray(Array): +cdef class LargeListViewArray(BaseListArray): pass diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 156d58326b961..6a190957879d3 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2757,6 +2757,7 @@ def test_list_array_flatten(offset_type, list_type_factory): assert arr1.values.equals(arr0) assert arr2.flatten().flatten().equals(arr0) assert arr2.values.values.equals(arr0) + assert arr2.flatten(True).equals(arr0) @pytest.mark.parametrize('list_type', [ @@ -2778,7 +2779,9 @@ def test_list_value_parent_indices(list_type): @pytest.mark.parametrize(('offset_type', 'list_type'), [(pa.int32(), pa.list_(pa.int32())), (pa.int32(), pa.list_(pa.int32(), list_size=2)), - (pa.int64(), pa.large_list(pa.int32()))]) + (pa.int64(), pa.large_list(pa.int32())), + (pa.int32(), pa.list_view(pa.int32())), + (pa.int64(), pa.large_list_view(pa.int32()))]) def test_list_value_lengths(offset_type, list_type): # FixedSizeListArray needs fixed list sizes @@ -2876,6 +2879,8 @@ def test_fixed_size_list_array_flatten(): assert arr0.type.equals(typ0) assert arr1.flatten().equals(arr0) assert arr2.flatten().flatten().equals(arr0) + assert arr2.flatten().equals(arr1) + assert arr2.flatten(True).equals(arr0) def test_fixed_size_list_array_flatten_with_slice(): @@ -3844,6 +3849,7 @@ def test_list_view_flatten(list_array_type, list_type_factory, offset_type): assert arr2.values.equals(arr1) assert arr2.flatten().flatten().equals(arr0) assert arr2.values.values.equals(arr0) + assert arr2.flatten(True).equals(arr0) # test out of order offsets values = [1, 2, 3, 4] diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 98cbd920b509b..17cc546f834ca 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -152,6 +152,7 @@ def test_option_class_equality(): pc.IndexOptions(pa.scalar(1)), pc.JoinOptions(), pc.ListSliceOptions(0, -1, 1, True), + pc.ListFlattenOptions(recursive=False), pc.MakeStructOptions(["field", "names"], field_nullability=[True, True], field_metadata=[pa.KeyValueMetadata({"a": "1"}), From 0d7fac0d49eae7f139735c3e7c9256fc304a698a Mon Sep 17 00:00:00 2001 From: ZhangHuiGui <106943008+ZhangHuiGui@users.noreply.github.com> Date: Wed, 1 May 2024 06:26:05 +0800 Subject: [PATCH 050/106] GH-41418: [C++] Add [Large]ListView and Map nested types for scalar_if_else's kernel functions (#41419) ### Rationale for this change Add [Large]ListView and Map nested types for scalar_if_else's kernel functions ### What changes are included in this PR? 1. Add the list-view related types to `case_when`, `coalesce`'s kernel function and move the nested-types's added logic to a unified function for better management. 2. Add the `MapType` and related test for `if_else` ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * GitHub Issue: #41418 Authored-by: ZhangHuiGui <2689496754@qq.com> Signed-off-by: Felipe Oliveira Carvalho --- .../arrow/compute/kernels/scalar_if_else.cc | 107 ++++++++++++++---- .../kernels/scalar_if_else_benchmark.cc | 50 +++++--- .../compute/kernels/scalar_if_else_test.cc | 19 +++- 3 files changed, 138 insertions(+), 38 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index ee181c053c053..13874d9d65e70 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1309,9 +1309,10 @@ void AddFixedWidthIfElseKernel(const std::shared_ptr& scalar_fun } void AddNestedIfElseKernels(const std::shared_ptr& scalar_function) { - for (const auto type_id : {Type::LIST, Type::LARGE_LIST, Type::LIST_VIEW, - Type::LARGE_LIST_VIEW, Type::FIXED_SIZE_LIST, Type::STRUCT, - Type::DENSE_UNION, Type::SPARSE_UNION, Type::DICTIONARY}) { + for (const auto type_id : + {Type::LIST, Type::LARGE_LIST, Type::LIST_VIEW, Type::LARGE_LIST_VIEW, + Type::FIXED_SIZE_LIST, Type::MAP, Type::STRUCT, Type::DENSE_UNION, + Type::SPARSE_UNION, Type::DICTIONARY}) { ScalarKernel kernel({boolean(), InputType(type_id), InputType(type_id)}, LastType, NestedIfElseExec::Exec); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; @@ -1847,6 +1848,48 @@ struct CaseWhenFunctor> { } }; +// TODO(GH-41453): a more efficient implementation for list-views is possible +template +struct CaseWhenFunctor> { + using offset_type = typename Type::offset_type; + using BuilderType = typename TypeTraits::BuilderType; + static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + /// TODO(wesm): should this be a DCHECK? Or checked elsewhere + if (batch[0].null_count() > 0) { + return Status::Invalid("cond struct must not have outer nulls"); + } + if (batch[0].is_scalar()) { + return ExecVarWidthScalarCaseWhen(ctx, batch, out); + } + return ExecArray(ctx, batch, out); + } + + static Status ExecArray(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + return ExecVarWidthArrayCaseWhen( + ctx, batch, out, + // ReserveData + [&](ArrayBuilder* raw_builder) { + auto builder = checked_cast(raw_builder); + auto child_builder = builder->value_builder(); + + int64_t reservation = 0; + for (int arg = 1; arg < batch.num_values(); arg++) { + const ExecValue& source = batch[arg]; + if (!source.is_array()) { + const auto& scalar = checked_cast(*source.scalar); + if (!scalar.value) continue; + reservation = + std::max(reservation, batch.length * scalar.value->length()); + } else { + const ArraySpan& array = source.array; + reservation = std::max(reservation, array.child_data[0].length); + } + } + return child_builder->Reserve(reservation); + }); + } +}; + // No-op reserve function, pulled out to avoid apparent miscompilation on MinGW Status ReserveNoData(ArrayBuilder*) { return Status::OK(); } @@ -2712,6 +2755,25 @@ void AddBinaryCaseWhenKernels(const std::shared_ptr& scalar_fu } } +template +void AddNestedCaseWhenKernel(const std::shared_ptr& scalar_function) { + AddCaseWhenKernel(scalar_function, ArrowNestedType::type_id, + CaseWhenFunctor::Exec); +} + +void AddNestedCaseWhenKernels(const std::shared_ptr& scalar_function) { + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + 
AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); +} + void AddCoalesceKernel(const std::shared_ptr& scalar_function, detail::GetTypeId get_id, ArrayKernelExec exec) { ScalarKernel kernel(KernelSignature::Make({InputType(get_id.id)}, FirstType, @@ -2731,6 +2793,25 @@ void AddPrimitiveCoalesceKernels(const std::shared_ptr& scalar_f } } +template +void AddNestedCoalesceKernel(const std::shared_ptr& scalar_function) { + AddCoalesceKernel(scalar_function, ArrowNestedType::type_id, + CoalesceFunctor::Exec); +} + +void AddNestedCoalesceKernels(const std::shared_ptr& scalar_function) { + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); +} + void AddChooseKernel(const std::shared_ptr& scalar_function, detail::GetTypeId get_id, ArrayKernelExec exec) { ScalarKernel kernel(KernelSignature::Make({Type::INT64, InputType(get_id.id)}, LastType, @@ -2822,15 +2903,7 @@ void RegisterScalarIfElse(FunctionRegistry* registry) { AddCaseWhenKernel(func, Type::DECIMAL128, CaseWhenFunctor::Exec); AddCaseWhenKernel(func, Type::DECIMAL256, CaseWhenFunctor::Exec); AddBinaryCaseWhenKernels(func, BaseBinaryTypes()); - AddCaseWhenKernel(func, Type::FIXED_SIZE_LIST, - CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::LIST, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::LARGE_LIST, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::MAP, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::STRUCT, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::DENSE_UNION, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::SPARSE_UNION, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::DICTIONARY, CaseWhenFunctor::Exec); + AddNestedCaseWhenKernels(func); DCHECK_OK(registry->AddFunction(std::move(func))); } { @@ -2848,15 +2921,7 @@ void RegisterScalarIfElse(FunctionRegistry* registry) { for (const auto& ty : BaseBinaryTypes()) { AddCoalesceKernel(func, ty, GenerateTypeAgnosticVarBinaryBase(ty)); } - AddCoalesceKernel(func, Type::FIXED_SIZE_LIST, - CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::LIST, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::LARGE_LIST, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::MAP, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::STRUCT, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::DENSE_UNION, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::SPARSE_UNION, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::DICTIONARY, CoalesceFunctor::Exec); + AddNestedCoalesceKernels(func); DCHECK_OK(registry->AddFunction(std::move(func))); } { diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc index 58bc560f52842..5988908853d50 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc @@ -284,8 +284,11 @@ static void CaseWhenBench(benchmark::State& state) { state.SetItemsProcessed(state.iterations() * (len - offset)); } -static void CaseWhenBenchList(benchmark::State& 
diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc
index 58bc560f52842..5988908853d50 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc
@@ -284,8 +284,11 @@ static void CaseWhenBench(benchmark::State& state) {
   state.SetItemsProcessed(state.iterations() * (len - offset));
 }
 
-static void CaseWhenBenchList(benchmark::State& state) {
-  auto type = list(int64());
+template <typename Type>
+static void CaseWhenBenchList(benchmark::State& state,
+                              const std::shared_ptr<DataType>& type) {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  auto fld = field("", type);
   int64_t len = state.range(0);
 
@@ -295,17 +298,17 @@ static void CaseWhenBenchList(benchmark::State& state) {
   auto cond_field =
       field("cond", boolean(), key_value_metadata({{"null_probability", "0.01"}}));
-  auto cond = rand.ArrayOf(*field("", struct_({cond_field, cond_field, cond_field}),
-                                  key_value_metadata({{"null_probability", "0.0"}})),
-                           len);
-  auto val1 = rand.ArrayOf(*fld, len);
-  auto val2 = rand.ArrayOf(*fld, len);
-  auto val3 = rand.ArrayOf(*fld, len);
-  auto val4 = rand.ArrayOf(*fld, len);
+  auto cond = std::static_pointer_cast<StructArray>(
+                  rand.ArrayOf(*field("", struct_({cond_field, cond_field, cond_field}),
+                                      key_value_metadata({{"null_probability", "0.0"}})),
+                               len))
+                  ->Slice(offset);
+  auto val1 = std::static_pointer_cast<ArrayType>(rand.ArrayOf(*fld, len))->Slice(offset);
+  auto val2 = std::static_pointer_cast<ArrayType>(rand.ArrayOf(*fld, len))->Slice(offset);
+  auto val3 = std::static_pointer_cast<ArrayType>(rand.ArrayOf(*fld, len))->Slice(offset);
+  auto val4 = std::static_pointer_cast<ArrayType>(rand.ArrayOf(*fld, len))->Slice(offset);
 
   for (auto _ : state) {
-    ABORT_NOT_OK(
-        CaseWhen(cond->Slice(offset), {val1->Slice(offset), val2->Slice(offset),
-                                       val3->Slice(offset), val4->Slice(offset)}));
+    ABORT_NOT_OK(CaseWhen(cond, {val1, val2, val3, val4}));
   }
 
   // Set bytes processed to ~length of output
@@ -372,6 +375,21 @@ static void CaseWhenBenchStringContiguous(benchmark::State& state) {
   return CaseWhenBenchContiguous<StringType>(state);
 }
 
+template <typename ListType, typename ValueType>
+static void CaseWhenBenchVarLengthListLike(benchmark::State& state) {
+  auto value_type = TypeTraits<ValueType>::type_singleton();
+  auto list_type = std::make_shared<ListType>(value_type);
+  return CaseWhenBenchList<ListType>(state, list_type);
+}
+
+static void CaseWhenBenchListInt64(benchmark::State& state) {
+  return CaseWhenBenchVarLengthListLike<ListType, Int64Type>(state);
+}
+
+static void CaseWhenBenchListViewInt64(benchmark::State& state) {
+  CaseWhenBenchVarLengthListLike<ListViewType, Int64Type>(state);
+}
+
 struct CoalesceParams {
   int64_t length;
   int64_t num_arguments;
@@ -533,9 +551,11 @@ BENCHMARK(CaseWhenBench64)->Args({kNumItems, 99});
 BENCHMARK(CaseWhenBench64Contiguous)->Args({kNumItems, 0});
 BENCHMARK(CaseWhenBench64Contiguous)->Args({kNumItems, 99});
 
-// CaseWhen: Lists
-BENCHMARK(CaseWhenBenchList)->Args({kFewItems, 0});
-BENCHMARK(CaseWhenBenchList)->Args({kFewItems, 99});
+// CaseWhen: List-like types
+BENCHMARK(CaseWhenBenchListInt64)->Args({kFewItems, 0});
+BENCHMARK(CaseWhenBenchListInt64)->Args({kFewItems, 99});
+BENCHMARK(CaseWhenBenchListViewInt64)->Args({kFewItems, 0});
+BENCHMARK(CaseWhenBenchListViewInt64)->Args({kFewItems, 99});
 
 // CaseWhen: Strings
 BENCHMARK(CaseWhenBenchString)->Args({kFewItems, 0});
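The benchmark changes above follow a common Google Benchmark pattern: one templated body plus thin named wrappers, so each instantiation is registered under its own benchmark name. A generic, self-contained sketch of the same pattern (toy summation workload, hypothetical names):

```cpp
#include <benchmark/benchmark.h>

#include <cstdint>
#include <vector>

// Templated body: the workload is written once for any arithmetic type T.
template <typename T>
static void SumBench(benchmark::State& state) {
  std::vector<T> data(state.range(0), T{1});
  for (auto _ : state) {
    T sum{};
    for (T v : data) sum += v;
    benchmark::DoNotOptimize(sum);  // keep the loop from being optimized away
  }
  state.SetItemsProcessed(state.iterations() * static_cast<int64_t>(data.size()));
}

// Thin wrappers so each instantiation shows up under its own name,
// mirroring CaseWhenBenchListInt64 / CaseWhenBenchListViewInt64 above.
static void SumBenchInt64(benchmark::State& state) { SumBench<int64_t>(state); }
static void SumBenchDouble(benchmark::State& state) { SumBench<double>(state); }

BENCHMARK(SumBenchInt64)->Arg(1 << 20);
BENCHMARK(SumBenchDouble)->Arg(1 << 20);
BENCHMARK_MAIN();
```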
null])"), + ArrayFromJSON(type, R"([null, [[2, "foo"], [4, null]], [[7, "abc"]], null])")); + + CheckWithDifferentShapes( + ArrayFromJSON(boolean(), "[null, null, null, null]"), + ArrayFromJSON(type, R"([null, [[1, "c"]], [[4, null]], [[6, "ok"]]])"), + ArrayFromJSON(type, R"([[[-1, null]], [[3, "c"]], null, [[6, "ok"]]])"), + ArrayFromJSON(type, R"([null, null, null, null])")); +} + template class TestIfElseUnion : public ::testing::Test {}; @@ -1920,7 +1935,7 @@ TYPED_TEST(TestCaseWhenBinary, Random) { template class TestCaseWhenList : public ::testing::Test {}; -TYPED_TEST_SUITE(TestCaseWhenList, ListArrowTypes); +TYPED_TEST_SUITE(TestCaseWhenList, ListAndListViewArrowTypes); TYPED_TEST(TestCaseWhenList, ListOfString) { auto type = std::make_shared(utf8()); @@ -2555,7 +2570,7 @@ class TestCoalesceList : public ::testing::Test {}; TYPED_TEST_SUITE(TestCoalesceNumeric, IfElseNumericBasedTypes); TYPED_TEST_SUITE(TestCoalesceBinary, BaseBinaryArrowTypes); -TYPED_TEST_SUITE(TestCoalesceList, ListArrowTypes); +TYPED_TEST_SUITE(TestCoalesceList, ListAndListViewArrowTypes); TYPED_TEST(TestCoalesceNumeric, Basics) { auto type = default_type_instance(); From 6b278be178975fe7174b961a3bf33502acb79295 Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Tue, 30 Apr 2024 16:50:14 -0700 Subject: [PATCH 051/106] GH-41471: [Java] Fix performance uber-jar (#41473) ### Rationale for this change Performance `benchmarks.jar` uber-jar is mostly empty and is missing critical metadata information which would allow someone to run performance benchmarks using a simple commandline like: ``` $ java -cp performance/target/benchmarks.jar ArrowBufBenchmarks ``` ### What changes are included in this PR? Move benchmark classes from `src/test/java` to `src/main/java` and change the dependencies' scope as well so that `maven-shade-plugin` can actually pick up the classes to package. Also add missing jmh annotation generator to `maven-compiler-plugin` so that JMH metadata can be generated ### Are these changes tested? Local testing only. ### Are there any user-facing changes? I didn't find any user-facing documentation regarding JMH benchmarks. 
From 6b278be178975fe7174b961a3bf33502acb79295 Mon Sep 17 00:00:00 2001
From: Laurent Goujon
Date: Tue, 30 Apr 2024 16:50:14 -0700
Subject: [PATCH 051/106] GH-41471: [Java] Fix performance uber-jar (#41473)

### Rationale for this change

Performance `benchmarks.jar` uber-jar is mostly empty and is missing critical metadata which would allow someone to run performance benchmarks using a simple command line like:
```
$ java -cp performance/target/benchmarks.jar ArrowBufBenchmarks
```

### What changes are included in this PR?

Move benchmark classes from `src/test/java` to `src/main/java` and change the dependencies' scope as well so that `maven-shade-plugin` can actually pick up the classes to package. Also add the missing JMH annotation processor to `maven-compiler-plugin` so that JMH metadata can be generated.

### Are these changes tested?

Local testing only.

### Are there any user-facing changes?

I didn't find any user-facing documentation regarding JMH benchmarks. If there is any, it may be helpful to include a simplified command line.
* GitHub Issue: #41471

Authored-by: Laurent Goujon
Signed-off-by: David Li
---
 java/performance/pom.xml                      | 30 +++++++++----------
 .../arrow/adapter/AvroAdapterBenchmarks.java  |  0
 .../adapter/jdbc/JdbcAdapterBenchmarks.java   |  2 ++
 .../search/ParallelSearcherBenchmarks.java    |  2 ++
 .../arrow/memory/AllocatorBenchmarks.java     |  0
 .../arrow/memory/ArrowBufBenchmarks.java      |  0
 .../util/ArrowBufPointerBenchmarks.java       |  0
 .../util/ByteFunctionHelpersBenchmarks.java   |  3 +-
 .../vector/BaseValueVectorBenchmarks.java     |  0
 .../vector/BitVectorHelperBenchmarks.java     |  2 ++
 .../arrow/vector/DecimalVectorBenchmarks.java |  0
 .../apache/arrow/vector/Float8Benchmarks.java |  2 ++
 .../arrow/vector/FloatingPointBenchmarks.java |  2 ++
 .../apache/arrow/vector/IntBenchmarks.java    |  2 ++
 .../arrow/vector/VarCharBenchmarks.java       |  2 ++
 .../vector/VariableWidthVectorBenchmarks.java |  2 ++
 .../arrow/vector/VectorLoaderBenchmark.java   |  2 ++
 .../arrow/vector/VectorUnloaderBenchmark.java |  2 ++
 .../DictionaryEncoderBenchmarks.java          |  0
 .../vector/ipc/WriteChannelBenchmark.java     |  2 ++
 .../message/ArrowRecordBatchBenchmarks.java   |  2 ++
 .../vector/util/TransferPairBenchmarks.java   |  2 ++
 22 files changed, 43 insertions(+), 16 deletions(-)
 rename java/performance/src/{test => main}/java/org/apache/arrow/adapter/AvroAdapterBenchmarks.java (100%)
 rename java/performance/src/{test => main}/java/org/apache/arrow/adapter/jdbc/JdbcAdapterBenchmarks.java (99%)
 rename java/performance/src/{test => main}/java/org/apache/arrow/algorithm/search/ParallelSearcherBenchmarks.java (97%)
 rename java/performance/src/{test => main}/java/org/apache/arrow/memory/AllocatorBenchmarks.java (100%)
 rename java/performance/src/{test => main}/java/org/apache/arrow/memory/ArrowBufBenchmarks.java (100%)
 rename java/performance/src/{test => main}/java/org/apache/arrow/memory/util/ArrowBufPointerBenchmarks.java (100%)
 rename java/performance/src/{test => main}/java/org/apache/arrow/memory/util/ByteFunctionHelpersBenchmarks.java (98%)
 rename java/performance/src/{test => main}/java/org/apache/arrow/vector/BaseValueVectorBenchmarks.java (100%)
 rename java/performance/src/{test => main}/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java (98%)
 rename java/performance/src/{test => main}/java/org/apache/arrow/vector/DecimalVectorBenchmarks.java (100%)
 rename java/performance/src/{test => main}/java/org/apache/arrow/vector/Float8Benchmarks.java (97%)
 rename java/performance/src/{test => main}/java/org/apache/arrow/vector/FloatingPointBenchmarks.java (98%)
 rename java/performance/src/{test => main}/java/org/apache/arrow/vector/IntBenchmarks.java (97%)
 rename java/performance/src/{test => main}/java/org/apache/arrow/vector/VarCharBenchmarks.java (97%)
 rename java/performance/src/{test => main}/java/org/apache/arrow/vector/VariableWidthVectorBenchmarks.java (97%)
 rename java/performance/src/{test => main}/java/org/apache/arrow/vector/VectorLoaderBenchmark.java (97%)
 rename java/performance/src/{test => main}/java/org/apache/arrow/vector/VectorUnloaderBenchmark.java (97%)
 rename java/performance/src/{test => main}/java/org/apache/arrow/vector/dictionary/DictionaryEncoderBenchmarks.java (100%)
 rename java/performance/src/{test => main}/java/org/apache/arrow/vector/ipc/WriteChannelBenchmark.java (97%)
 rename java/performance/src/{test => main}/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java (97%)
 rename java/performance/src/{test => main}/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java (97%)

diff --git a/java/performance/pom.xml b/java/performance/pom.xml
index c819e6393d78f..e9023ece080a3 100644
--- a/java/performance/pom.xml
+++ b/java/performance/pom.xml
@@ -40,61 +40,61 @@
     <dependency>
       <groupId>org.openjdk.jmh</groupId>
       <artifactId>jmh-core</artifactId>
       <version>${jmh.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.openjdk.jmh</groupId>
-      <artifactId>jmh-generator-annprocess</artifactId>
-      <version>${jmh.version}</version>
-      <scope>provided</scope>
     </dependency>
     <dependency>
      <groupId>org.apache.arrow</groupId>
      <artifactId>arrow-vector</artifactId>
      <classifier>${arrow.vector.classifier}</classifier>
-      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.arrow</groupId>
      <artifactId>arrow-memory-core</artifactId>
-      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.arrow</groupId>
      <artifactId>arrow-memory-netty</artifactId>
-      <scope>test</scope>
+      <scope>runtime</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.avro</groupId>
      <artifactId>avro</artifactId>
      <version>${dep.avro.version}</version>
-      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.arrow</groupId>
      <artifactId>arrow-avro</artifactId>
-      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>com.h2database</groupId>
      <artifactId>h2</artifactId>
      <version>2.2.224</version>
-      <scope>test</scope>
+      <scope>runtime</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.arrow</groupId>
      <artifactId>arrow-jdbc</artifactId>
-      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.arrow</groupId>
      <artifactId>arrow-algorithm</artifactId>
-      <scope>test</scope>
    </dependency>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <configuration>
+          <annotationProcessorPaths>
+            <path>
+              <groupId>org.openjdk.jmh</groupId>
+              <artifactId>jmh-generator-annprocess</artifactId>
+              <version>${jmh.version}</version>
+            </path>
+          </annotationProcessorPaths>
+        </configuration>
+      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
diff --git a/java/performance/src/test/java/org/apache/arrow/adapter/AvroAdapterBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/adapter/AvroAdapterBenchmarks.java
similarity index 100%
rename from java/performance/src/test/java/org/apache/arrow/adapter/AvroAdapterBenchmarks.java
rename to java/performance/src/main/java/org/apache/arrow/adapter/AvroAdapterBenchmarks.java
diff --git a/java/performance/src/test/java/org/apache/arrow/adapter/jdbc/JdbcAdapterBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/adapter/jdbc/JdbcAdapterBenchmarks.java
similarity index 99%
rename from java/performance/src/test/java/org/apache/arrow/adapter/jdbc/JdbcAdapterBenchmarks.java
rename to java/performance/src/main/java/org/apache/arrow/adapter/jdbc/JdbcAdapterBenchmarks.java
index fd3940b4c872c..f6dab83b7cd0c 100644
--- a/java/performance/src/test/java/org/apache/arrow/adapter/jdbc/JdbcAdapterBenchmarks.java
+++ b/java/performance/src/main/java/org/apache/arrow/adapter/jdbc/JdbcAdapterBenchmarks.java
@@ -54,6 +54,7 @@
  * Benchmarks for Jdbc adapter.
  */
 public class JdbcAdapterBenchmarks {
+  // checkstyle:off: MissingJavadocMethod
 
   private static final int VALUE_COUNT = 3000;
 
@@ -355,5 +356,6 @@ public static void main(String[] args) throws RunnerException {
 
     new Runner(opt).run();
   }
+  // checkstyle:on: MissingJavadocMethod
 }
 
diff --git a/java/performance/src/test/java/org/apache/arrow/algorithm/search/ParallelSearcherBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/algorithm/search/ParallelSearcherBenchmarks.java
similarity index 97%
rename from java/performance/src/test/java/org/apache/arrow/algorithm/search/ParallelSearcherBenchmarks.java
rename to java/performance/src/main/java/org/apache/arrow/algorithm/search/ParallelSearcherBenchmarks.java
index 1c3af77e73a05..c9fc5cc4bef9c 100644
--- a/java/performance/src/test/java/org/apache/arrow/algorithm/search/ParallelSearcherBenchmarks.java
+++ b/java/performance/src/main/java/org/apache/arrow/algorithm/search/ParallelSearcherBenchmarks.java
@@ -43,6 +43,7 @@
  * Benchmarks for {@link ParallelSearcher}.
*/ public class ParallelSearcherBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_LENGTH = 1024 * 1024; @@ -112,4 +113,5 @@ public static void main(String[] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/memory/AllocatorBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/memory/AllocatorBenchmarks.java similarity index 100% rename from java/performance/src/test/java/org/apache/arrow/memory/AllocatorBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/memory/AllocatorBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/memory/ArrowBufBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/memory/ArrowBufBenchmarks.java similarity index 100% rename from java/performance/src/test/java/org/apache/arrow/memory/ArrowBufBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/memory/ArrowBufBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/memory/util/ArrowBufPointerBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/memory/util/ArrowBufPointerBenchmarks.java similarity index 100% rename from java/performance/src/test/java/org/apache/arrow/memory/util/ArrowBufPointerBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/memory/util/ArrowBufPointerBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/memory/util/ByteFunctionHelpersBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpersBenchmarks.java similarity index 98% rename from java/performance/src/test/java/org/apache/arrow/memory/util/ByteFunctionHelpersBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpersBenchmarks.java index 4d0dfcb5da80d..f1dc2d79eff83 100644 --- a/java/performance/src/test/java/org/apache/arrow/memory/util/ByteFunctionHelpersBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpersBenchmarks.java @@ -48,7 +48,7 @@ public class ByteFunctionHelpersBenchmarks { */ @State(Scope.Benchmark) public static class ArrowEqualState { - + // checkstyle:off: MissingJavadocMethod private static final int BUFFER_CAPACITY = 7; private BufferAllocator allocator; @@ -135,4 +135,5 @@ public static void main(String[] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/BaseValueVectorBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/BaseValueVectorBenchmarks.java similarity index 100% rename from java/performance/src/test/java/org/apache/arrow/vector/BaseValueVectorBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/BaseValueVectorBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java similarity index 98% rename from java/performance/src/test/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java index 5f6e5ca28fbab..e29b889c6e7a8 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java +++ 
b/java/performance/src/main/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java @@ -41,6 +41,7 @@ * Benchmarks for {@link BitVectorHelper}. */ public class BitVectorHelperBenchmarks { + // checkstyle:off: MissingJavadocMethod /** * State object for general benchmarks. @@ -226,4 +227,5 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/DecimalVectorBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/DecimalVectorBenchmarks.java similarity index 100% rename from java/performance/src/test/java/org/apache/arrow/vector/DecimalVectorBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/DecimalVectorBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/vector/Float8Benchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/Float8Benchmarks.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/Float8Benchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/Float8Benchmarks.java index 874e0d9f82ee7..36a633e5e1b6e 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/Float8Benchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/Float8Benchmarks.java @@ -40,6 +40,7 @@ */ @State(Scope.Benchmark) public class Float8Benchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_LENGTH = 1024; @@ -119,4 +120,5 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/FloatingPointBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/FloatingPointBenchmarks.java similarity index 98% rename from java/performance/src/test/java/org/apache/arrow/vector/FloatingPointBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/FloatingPointBenchmarks.java index 079672e9f2a98..2938591737f06 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/FloatingPointBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/FloatingPointBenchmarks.java @@ -41,6 +41,7 @@ */ @State(Scope.Benchmark) public class FloatingPointBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_LENGTH = 1024; @@ -130,5 +131,6 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/IntBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/IntBenchmarks.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/IntBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/IntBenchmarks.java index 036768d445e55..99674058970a6 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/IntBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/IntBenchmarks.java @@ -41,6 +41,7 @@ */ @State(Scope.Benchmark) public class IntBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_LENGTH = 1024; @@ -107,4 +108,5 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git 
a/java/performance/src/test/java/org/apache/arrow/vector/VarCharBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/VarCharBenchmarks.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/VarCharBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/VarCharBenchmarks.java index 1ab4b7bc20dad..a7ce4e04fee87 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/VarCharBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/VarCharBenchmarks.java @@ -39,6 +39,7 @@ */ @State(Scope.Benchmark) public class VarCharBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_LENGTH = 1024; @@ -99,4 +100,5 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/VariableWidthVectorBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/VariableWidthVectorBenchmarks.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/VariableWidthVectorBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/VariableWidthVectorBenchmarks.java index 7eee981f13327..62c54606e6da6 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/VariableWidthVectorBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/VariableWidthVectorBenchmarks.java @@ -41,6 +41,7 @@ */ @State(Scope.Benchmark) public class VariableWidthVectorBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_CAPACITY = 16 * 1024; @@ -127,4 +128,5 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/VectorLoaderBenchmark.java b/java/performance/src/main/java/org/apache/arrow/vector/VectorLoaderBenchmark.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/VectorLoaderBenchmark.java rename to java/performance/src/main/java/org/apache/arrow/vector/VectorLoaderBenchmark.java index 416d126419e56..e8e8c0cfbc1f3 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/VectorLoaderBenchmark.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/VectorLoaderBenchmark.java @@ -40,6 +40,7 @@ * Benchmarks for {@link VectorLoader}. 
*/ public class VectorLoaderBenchmark { + // checkstyle:off: MissingJavadocMethod private static final int ALLOCATOR_CAPACITY = 1024 * 1024; @@ -114,4 +115,5 @@ public static void main(String[] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/VectorUnloaderBenchmark.java b/java/performance/src/main/java/org/apache/arrow/vector/VectorUnloaderBenchmark.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/VectorUnloaderBenchmark.java rename to java/performance/src/main/java/org/apache/arrow/vector/VectorUnloaderBenchmark.java index d125172450004..b464f888fa85f 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/VectorUnloaderBenchmark.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/VectorUnloaderBenchmark.java @@ -41,6 +41,7 @@ */ @State(Scope.Benchmark) public class VectorUnloaderBenchmark { + // checkstyle:off: MissingJavadocMethod private static final int ALLOCATOR_CAPACITY = 1024 * 1024; @@ -106,4 +107,5 @@ public static void main(String[] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncoderBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoderBenchmarks.java similarity index 100% rename from java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncoderBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoderBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/vector/ipc/WriteChannelBenchmark.java b/java/performance/src/main/java/org/apache/arrow/vector/ipc/WriteChannelBenchmark.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/ipc/WriteChannelBenchmark.java rename to java/performance/src/main/java/org/apache/arrow/vector/ipc/WriteChannelBenchmark.java index 7a2537cbb8820..18efff11db9ff 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/ipc/WriteChannelBenchmark.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/ipc/WriteChannelBenchmark.java @@ -41,6 +41,7 @@ * Benchmarks for {@link WriteChannel}. */ public class WriteChannelBenchmark { + // checkstyle:off: MissingJavadocMethod /** * State object for align benchmark. 
@@ -84,4 +85,5 @@ public static void main(String[] args) throws RunnerException {
 
     new Runner(opt).run();
   }
+  // checkstyle:on: MissingJavadocMethod
 }
diff --git a/java/performance/src/test/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java
similarity index 97%
rename from java/performance/src/test/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java
rename to java/performance/src/main/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java
index c0882821e9cc4..b608bb4c1c590 100644
--- a/java/performance/src/test/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java
+++ b/java/performance/src/main/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java
@@ -42,6 +42,7 @@
  */
 @State(Scope.Benchmark)
 public class ArrowRecordBatchBenchmarks {
+  // checkstyle:off: MissingJavadocMethod
 
   private static final int VECTOR_CAPACITY = 16 * 1024;
 
@@ -95,4 +96,5 @@ public static void main(String [] args) throws RunnerException {
 
     new Runner(opt).run();
   }
+  // checkstyle:on: MissingJavadocMethod
 }
diff --git a/java/performance/src/test/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java
similarity index 97%
rename from java/performance/src/test/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java
rename to java/performance/src/main/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java
index 5142f4bdb8d0d..486862859f122 100644
--- a/java/performance/src/test/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java
+++ b/java/performance/src/main/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java
@@ -42,6 +42,7 @@
  */
 @State(Scope.Benchmark)
 public class TransferPairBenchmarks {
+  // checkstyle:off: MissingJavadocMethod
 
   private static final int VECTOR_LENGTH = 1024;
 
@@ -120,4 +121,5 @@ public static void main(String [] args) throws RunnerException {
 
     new Runner(opt).run();
   }
+  // checkstyle:on: MissingJavadocMethod
 }

From 0f7e9af43796a81d126c59ee1342c6dbf8efaf08 Mon Sep 17 00:00:00 2001
From: Bryce Mecum
Date: Tue, 30 Apr 2024 17:27:26 -0800
Subject: [PATCH 052/106] GH-39990: [Docs][CI] Add sphinx-lint for docs linting
 (#40022)

### What changes are included in this PR?

This adds developer tooling to the repo for linting the docs by adding the sphinx-lint tool to archery and our pre-commit hooks. In both locations, only two rules are enabled at the moment (discussed in https://github.com/apache/arrow/pull/40006): `trailing-whitespace` and `missing-final-newline`.

This PR also fixes the individual issues covered by the new tooling.

### Are these changes tested?

Yes, though manually. I tested that this works by running `archery lint --docs` and `pre-commit` with and without changes that should get caught by the rules. It works as expected.

### Are there any user-facing changes?

Yes,

1. Developers that use pre-commit hooks will see a change in behavior when they modify docs
2. Developers using archery will see a new --docs option in `archery lint`
3.
Developers working on the docs may see CI failures related to the new checks * Closes: #39990 * GitHub Issue: #39990 Authored-by: Bryce Mecum Signed-off-by: Bryce Mecum --- .pre-commit-config.yaml | 6 ++ ci/conda_env_sphinx.txt | 1 + dev/archery/archery/cli.py | 6 +- dev/archery/archery/utils/lint.py | 52 ++++++++++++- dev/archery/setup.py | 2 +- docs/requirements.txt | 1 + docs/source/cpp/acero/developer_guide.rst | 6 +- docs/source/cpp/acero/overview.rst | 8 +- docs/source/cpp/acero/substrait.rst | 46 ++++++------ docs/source/cpp/acero/user_guide.rst | 74 +++++++++---------- docs/source/cpp/api/scalar.rst | 2 +- docs/source/cpp/build_system.rst | 2 +- docs/source/cpp/compute.rst | 56 +++++++------- docs/source/cpp/dataset.rst | 22 +++--- docs/source/cpp/datatypes.rst | 14 ++-- .../examples/compute_and_write_example.rst | 6 +- docs/source/cpp/flight.rst | 4 +- docs/source/cpp/gandiva.rst | 26 +++---- .../cpp/gandiva/expr_projector_filter.rst | 26 +++---- docs/source/cpp/gandiva/external_func.rst | 14 ++-- docs/source/cpp/getting_started.rst | 12 ++- docs/source/cpp/memory.rst | 30 ++++---- docs/source/cpp/parquet.rst | 34 ++++----- docs/source/cpp/tables.rst | 6 +- docs/source/cpp/threading.rst | 2 +- .../source/cpp/tutorials/compute_tutorial.rst | 12 +-- .../cpp/tutorials/datasets_tutorial.rst | 30 ++++---- docs/source/cpp/tutorials/io_tutorial.rst | 20 ++--- .../continuous_integration/index.rst | 2 +- docs/source/developers/cpp/building.rst | 10 +-- docs/source/developers/cpp/windows.rst | 4 +- .../guide/architectural_overview.rst | 4 +- .../source/developers/guide/communication.rst | 4 +- .../source/developers/guide/documentation.rst | 3 +- docs/source/developers/guide/index.rst | 10 +-- docs/source/developers/guide/resources.rst | 2 +- .../guide/step_by_step/finding_issues.rst | 2 +- .../developers/guide/step_by_step/set_up.rst | 2 +- .../developers/guide/step_by_step/styling.rst | 2 +- .../developers/guide/tutorials/index.rst | 2 +- .../guide/tutorials/python_tutorial.rst | 36 ++++----- docs/source/developers/java/building.rst | 2 +- docs/source/developers/overview.rst | 3 +- docs/source/developers/release.rst | 18 ++--- .../developers/release_verification.rst | 2 +- docs/source/developers/reviewing.rst | 6 +- .../CDataInterface/PyCapsuleInterface.rst | 26 +++---- docs/source/format/Glossary.rst | 2 +- docs/source/format/Integration.rst | 8 +- docs/source/java/algorithm.rst | 28 +++---- docs/source/java/flight.rst | 2 +- docs/source/java/flight_sql_jdbc_driver.rst | 4 +- docs/source/java/memory.rst | 40 +++++----- docs/source/java/quickstartguide.rst | 2 +- docs/source/java/vector.rst | 4 +- docs/source/python/api/compute.rst | 8 +- docs/source/python/api/substrait.rst | 2 +- docs/source/python/compute.rst | 16 ++-- docs/source/python/dataset.rst | 54 +++++++------- docs/source/python/dlpack.rst | 2 +- docs/source/python/filesystems.rst | 4 +- docs/source/python/getstarted.rst | 16 ++-- docs/source/python/getting_involved.rst | 4 +- docs/source/python/integration/python_r.rst | 24 +++--- docs/source/python/ipc.rst | 10 +-- docs/source/python/json.rst | 2 +- docs/source/python/orc.rst | 2 +- docs/source/python/parquet.rst | 2 +- docs/source/python/timestamps.rst | 26 +++---- 69 files changed, 488 insertions(+), 434 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2e598e0a95064..bf5ca08d53c32 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -132,3 +132,9 @@ repos: ?^cpp/cmake_modules/UseCython\.cmake$| 
?^cpp/src/arrow/util/config\.h\.cmake$| ) + - repo: https://github.com/sphinx-contrib/sphinx-lint + rev: v0.9.1 + hooks: + - id: sphinx-lint + files: ^docs/ + args: ['--disable', 'all', '--enable', 'trailing-whitespace,missing-final-newline', 'docs'] diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index 0a356d5722c42..83afa69a653a9 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -26,6 +26,7 @@ pydata-sphinx-theme=0.14 sphinx-autobuild sphinx-design sphinx-copybutton +sphinx-lint sphinxcontrib-jquery sphinx==6.2 # Requirement for doctest-cython diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 5fa41e28a3208..8a26d9266f22d 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -261,6 +261,7 @@ def build(ctx, src, build_dir, force, targets, **kwargs): "Check all sources files for license texts via Apache RAT."), LintCheck('r', "Lint R files."), LintCheck('docker', "Lint Dockerfiles with hadolint."), + LintCheck('docs', "Lint docs with sphinx-lint."), ] @@ -285,9 +286,10 @@ def decorate_lint_command(cmd): help="Run IWYU on all C++ files if enabled") @click.option("-a", "--all", is_flag=True, default=False, help="Enable all checks.") +@click.argument("path", required=False) @decorate_lint_command @click.pass_context -def lint(ctx, src, fix, iwyu_all, **checks): +def lint(ctx, src, fix, iwyu_all, path, **checks): if checks.pop('all'): # "--all" is given => enable all non-selected checks for k, v in checks.items(): @@ -297,7 +299,7 @@ def lint(ctx, src, fix, iwyu_all, **checks): raise click.UsageError( "Need to enable at least one lint check (try --help)") try: - linter(src, fix, iwyu_all=iwyu_all, **checks) + linter(src, fix, iwyu_all=iwyu_all, path=path, **checks) except LintValidationException: sys.exit(1) diff --git a/dev/archery/archery/utils/lint.py b/dev/archery/archery/utils/lint.py index 15f22ca2e6e5c..108c9ded361e7 100644 --- a/dev/archery/archery/utils/lint.py +++ b/dev/archery/archery/utils/lint.py @@ -436,10 +436,55 @@ def docker_linter(src): cwd=src.path)) -def linter(src, fix=False, *, clang_format=False, cpplint=False, +class SphinxLint(Command): + def __init__(self, src, path=None, sphinx_lint_bin=None, disable=None, enable=None): + self.src = src + self.path = path + self.bin = default_bin(sphinx_lint_bin, "sphinx-lint") + self.disable = disable or "all" + self.enable = enable + + def lint(self, *args, check=False): + docs_path = os.path.join(self.src.path, "docs") + + args = [] + + if self.disable: + args.extend(["--disable", self.disable]) + + if self.enable: + args.extend(["--enable", self.enable]) + + if self.path is not None: + args.extend([self.path]) + else: + args.extend([docs_path]) + + return self.run(*args, check=check) + + +def docs_linter(src, path=None): + """Run sphinx-lint on docs.""" + logger.info("Running docs linter (sphinx-lint)") + + sphinx_lint = SphinxLint( + src, + path=path, + disable="all", + enable="trailing-whitespace,missing-final-newline" + ) + + if not sphinx_lint.available: + logger.error("sphinx-lint linter requested but sphinx-lint binary not found") + return + + yield LintResult.from_cmd(sphinx_lint.lint()) + + +def linter(src, fix=False, path=None, *, clang_format=False, cpplint=False, clang_tidy=False, iwyu=False, iwyu_all=False, python=False, numpydoc=False, cmake_format=False, rat=False, - r=False, docker=False): + r=False, docker=False, docs=False): """Run all linters.""" with tmpdir(prefix="arrow-lint-") as root: build_dir = os.path.join(root, "cpp-build") 
@@ -481,6 +526,9 @@ def linter(src, fix=False, *, clang_format=False, cpplint=False, if docker: results.extend(docker_linter(src)) + if docs: + results.extend(docs_linter(src, path)) + # Raise error if one linter failed, ensuring calling code can exit with # non-zero. for result in results: diff --git a/dev/archery/setup.py b/dev/archery/setup.py index 23a1600910d04..cd3e2e9ca0834 100755 --- a/dev/archery/setup.py +++ b/dev/archery/setup.py @@ -41,7 +41,7 @@ 'integration': ['cffi'], 'integration-java': ['jpype1'], 'lint': ['numpydoc==1.1.0', 'autopep8', 'flake8==6.1.0', 'cython-lint', - 'cmake_format==0.6.13'], + 'cmake_format==0.6.13', 'sphinx-lint==0.9.1'], 'numpydoc': ['numpydoc==1.1.0'], 'release': ['pygithub', jinja_req, 'jira', 'semver', 'gitpython'], } diff --git a/docs/requirements.txt b/docs/requirements.txt index 252344a74a58f..8891680814dff 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -10,5 +10,6 @@ pydata-sphinx-theme~=0.14 sphinx-autobuild sphinx-design sphinx-copybutton +sphinx-lint sphinx==6.2 pandas diff --git a/docs/source/cpp/acero/developer_guide.rst b/docs/source/cpp/acero/developer_guide.rst index 331cd833b58af..80ca68556fc40 100644 --- a/docs/source/cpp/acero/developer_guide.rst +++ b/docs/source/cpp/acero/developer_guide.rst @@ -187,7 +187,7 @@ Examples task (described below) as completed which allows the plan to finish. * The ``fetch`` node, in ``InputReceived``, may decide that it has all the data it needs. It can then call ``StopProducing`` on its input. - + Initialization / Construction / Destruction ------------------------------------------- @@ -271,7 +271,7 @@ distributed systems. Once that has been done then it should be possible to do a meaning exchanging between multiple exec plan instances on a single system) if desired. .. figure:: dist_plan.svg - + A distributed plan can provide parallelism even if the plans themselves run serially Pipeline Parallelism @@ -472,7 +472,7 @@ Benchmarking The most complete macro benchmarking for Acero is provided by https://github.com/voltrondata-labs/arrowbench These include a set of TPC-H benchmarks, executed from the R-dplyr integration, which are run on every Arrow commit and -reported to Conbench at https://conbench.ursa.dev/ +reported to Conbench at https://conbench.ursa.dev/ In addition to these TPC-H benchmarks there are a number of micro-benchmarks for various nodes (hash-join, asof-join, etc.) Finally, the compute functions themselves should mostly have micro-benchmarks. For more on micro benchmarks you diff --git a/docs/source/cpp/acero/overview.rst b/docs/source/cpp/acero/overview.rst index c569f82b099b6..8be4cbc1b1772 100644 --- a/docs/source/cpp/acero/overview.rst +++ b/docs/source/cpp/acero/overview.rst @@ -206,7 +206,7 @@ is very similar to a RecordBatch. It can have zero or more columns and all of t must have the same length. There are a few key differences from ExecBatch: .. figure:: rb_vs_eb.svg - + Both the record batch and the exec batch have strong ownership of the arrays & buffers * An `ExecBatch` does not have a schema. This is because an `ExecBatch` is assumed to be @@ -217,7 +217,7 @@ must have the same length. There are a few key differences from ExecBatch: also has a length property which describes how many rows are in a batch. So another way to view a `Scalar` is a constant array with `length` elements. * An `ExecBatch` contains additional information used by the exec plan. For example, an - `index` can be used to describe a batch's position in an ordered stream. 
We expect + `index` can be used to describe a batch's position in an ordered stream. We expect that `ExecBatch` will also evolve to contain additional fields such as a selection vector. .. figure:: scalar_vs_array.svg @@ -266,5 +266,5 @@ various query representations (e.g. Substrait). The Declaration objects are the with the DeclarationToXyz methods, are the current public API for Acero. .. figure:: decl_vs_ep.svg - - A declaration is a blueprint that is used to instantiate exec plan instances \ No newline at end of file + + A declaration is a blueprint that is used to instantiate exec plan instances diff --git a/docs/source/cpp/acero/substrait.rst b/docs/source/cpp/acero/substrait.rst index 797b2407f93cd..a5532733627c1 100644 --- a/docs/source/cpp/acero/substrait.rst +++ b/docs/source/cpp/acero/substrait.rst @@ -111,7 +111,7 @@ Aggregate Relations * Each measure's arguments must be direct references. * A measure may not have a filter * A measure may not have sorts -* A measure's invocation must be AGGREGATION_INVOCATION_ALL or +* A measure's invocation must be AGGREGATION_INVOCATION_ALL or AGGREGATION_INVOCATION_UNSPECIFIED * A measure's phase must be AGGREGATION_PHASE_INITIAL_TO_RESULT @@ -146,73 +146,73 @@ Types - Caveat * - boolean - boolean - - + - * - i8 - int8 - - + - * - i16 - int16 - - + - * - i32 - int32 - - + - * - i64 - int64 - - + - * - fp32 - float32 - - + - * - fp64 - float64 - - + - * - string - string - - + - * - binary - binary - - + - * - timestamp - timestamp - - + - * - timestamp_tz - timestamp - - + - * - date - date32 - - + - * - time - time64 - - + - * - interval_year - - + - - Not currently supported * - interval_day - - + - - Not currently supported * - uuid - - + - - Not currently supported * - FIXEDCHAR - - + - - Not currently supported * - VARCHAR - - + - - Not currently supported * - FIXEDBINARY - fixed_size_binary - - + - * - DECIMAL - decimal128 - - + - * - STRUCT - struct - Arrow struct fields will have no name (empty string) * - NSTRUCT - - + - - Not currently supported * - LIST - list - - + - * - MAP - map - K must not be nullable diff --git a/docs/source/cpp/acero/user_guide.rst b/docs/source/cpp/acero/user_guide.rst index eca1a0104708b..adcc17216e5ae 100644 --- a/docs/source/cpp/acero/user_guide.rst +++ b/docs/source/cpp/acero/user_guide.rst @@ -32,14 +32,14 @@ Using Acero The basic workflow for Acero is this: #. First, create a graph of :class:`Declaration` objects describing the plan - + #. Call one of the DeclarationToXyz methods to execute the Declaration. a. A new ExecPlan is created from the graph of Declarations. Each Declaration will correspond to one ExecNode in the plan. In addition, a sink node will be added, depending on which DeclarationToXyz method was used. - b. The ExecPlan is executed. Typically this happens as part of the DeclarationToXyz call but in + b. The ExecPlan is executed. Typically this happens as part of the DeclarationToXyz call but in DeclarationToReader the reader is returned before the plan is finished executing. c. Once the plan is finished it is destroyed @@ -315,7 +315,7 @@ of a specific execution node. ``source`` ---------- -A ``source`` operation can be considered as an entry point to create a streaming execution plan. +A ``source`` operation can be considered as an entry point to create a streaming execution plan. :class:`SourceNodeOptions` are used to create the ``source`` operation. The ``source`` operation is the most generic and flexible type of source currently available but it can be quite tricky to configure. 
First you should review the other source node types to ensure there @@ -326,7 +326,7 @@ function should take no arguments and should return an ``arrow::Future>``. This function might be reading a file, iterating through an in memory structure, or receiving data from a network connection. The arrow library refers to these functions as ``arrow::AsyncGenerator`` -and there are a number of utilities for working with these functions. For this example we use +and there are a number of utilities for working with these functions. For this example we use a vector of record batches that we've already stored in memory. In addition, the schema of the data must be known up front. Acero must know the schema of the data at each stage of the execution graph before any processing has begun. This means we must supply the @@ -368,10 +368,10 @@ Example of using ``source`` (usage of sink is explained in detail in :ref:`sink< In the previous example, :ref:`source node `, a source node was used to input the data. But when developing an application, if the data is already in memory as a table, it is much easier, and more performant to use :class:`TableSourceNodeOptions`. -Here the input data can be passed as a ``std::shared_ptr`` along with a ``max_batch_size``. +Here the input data can be passed as a ``std::shared_ptr`` along with a ``max_batch_size``. The ``max_batch_size`` is to break up large record batches so that they can be processed in parallel. It is important to note that the table batches will not get merged to form larger batches when the source -table has a smaller batch size. +table has a smaller batch size. Example of using ``table_source`` @@ -387,7 +387,7 @@ Example of using ``table_source`` ``filter`` ---------- -``filter`` operation, as the name suggests, provides an option to define data filtering +``filter`` operation, as the name suggests, provides an option to define data filtering criteria. It selects rows where the given expression evaluates to true. Filters can be written using :class:`arrow::compute::Expression`, and the expression should have a return type of boolean. For example, if we wish to keep rows where the value @@ -415,7 +415,7 @@ functions, i.e. elementwise functions that return one value for each input row independent of the value of all other rows). This is exposed via :class:`ProjectNodeOptions` which requires, an :class:`arrow::compute::Expression` and name for each of the output columns (if names are not -provided, the string representations of exprs will be used). +provided, the string representations of exprs will be used). Project example: @@ -456,7 +456,7 @@ can be selected from :ref:`this list of aggregation functions The aggregation can provide results as a group or scalar. For instances, an operation like `hash_count` provides the counts per each unique record -as a grouped result while an operation like `sum` provides a single record. +as a grouped result while an operation like `sum` provides a single record. Scalar Aggregation example: @@ -481,14 +481,14 @@ Group Aggregation example: ``sink`` -------- -``sink`` operation provides output and is the final node of a streaming -execution definition. :class:`SinkNodeOptions` interface is used to pass +``sink`` operation provides output and is the final node of a streaming +execution definition. :class:`SinkNodeOptions` interface is used to pass the required options. Similar to the source operator the sink operator exposes the output with a function that returns a record batch future each time it is called. 
It is expected the caller will repeatedly call this function until the generator function is exhausted (returns ``std::optional::nullopt``). If this function is not called often enough then record batches will accumulate in memory. An execution plan should only have one -"terminal" node (one sink node). An :class:`ExecPlan` can terminate early due to cancellation or +"terminal" node (one sink node). An :class:`ExecPlan` can terminate early due to cancellation or an error, before the output is fully consumed. However, the plan can be safely destroyed independently of the sink, which will hold the unconsumed batches by `exec_plan->finished()`. @@ -526,12 +526,12 @@ Example:: arrow::Future<> finish = arrow::Future<>::Make(); struct CustomSinkNodeConsumer : public cp::SinkNodeConsumer { - CustomSinkNodeConsumer(std::atomic *batches_seen, arrow::Future<>finish): + CustomSinkNodeConsumer(std::atomic *batches_seen, arrow::Future<>finish): batches_seen(batches_seen), finish(std::move(finish)) {} // Consumption logic can be written here arrow::Status Consume(cp::ExecBatch batch) override { // data can be consumed in the expected way - // transfer to another system or just do some work + // transfer to another system or just do some work // and write to disk (*batches_seen)++; return arrow::Status::OK(); @@ -541,9 +541,9 @@ Example:: std::atomic *batches_seen; arrow::Future<> finish; - + }; - + std::shared_ptr consumer = std::make_shared(&batches_seen, finish); @@ -567,14 +567,14 @@ Consuming-Sink example: ``order_by_sink`` ----------------- -``order_by_sink`` operation is an extension to the ``sink`` operation. -This operation provides the ability to guarantee the ordering of the -stream by providing the :class:`OrderBySinkNodeOptions`. -Here the :class:`arrow::compute::SortOptions` are provided to define which columns +``order_by_sink`` operation is an extension to the ``sink`` operation. +This operation provides the ability to guarantee the ordering of the +stream by providing the :class:`OrderBySinkNodeOptions`. +Here the :class:`arrow::compute::SortOptions` are provided to define which columns are used for sorting and whether to sort by ascending or descending values. .. note:: This node is a "pipeline breaker" and will fully materialize the dataset in memory. - In the future, spillover mechanisms will be added which should alleviate this + In the future, spillover mechanisms will be added which should alleviate this constraint. @@ -593,14 +593,14 @@ Order-By-Sink example: ``select_k_sink`` ----------------- -``select_k_sink`` option enables selecting the top/bottom K elements, -similar to a SQL ``ORDER BY ... LIMIT K`` clause. -:class:`SelectKOptions` which is a defined by -using :struct:`OrderBySinkNode` definition. This option returns a sink node that receives +``select_k_sink`` option enables selecting the top/bottom K elements, +similar to a SQL ``ORDER BY ... LIMIT K`` clause. +:class:`SelectKOptions` which is a defined by +using :struct:`OrderBySinkNode` definition. This option returns a sink node that receives inputs and then compute top_k/bottom_k. .. note:: This node is a "pipeline breaker" and will fully materialize the input in memory. - In the future, spillover mechanisms will be added which should alleviate this + In the future, spillover mechanisms will be added which should alleviate this constraint. SelectK example: @@ -617,7 +617,7 @@ SelectK example: .. 
_stream_execution_table_sink_docs: -The ``table_sink`` node provides the ability to receive the output as an in-memory table. +The ``table_sink`` node provides the ability to receive the output as an in-memory table. This is simpler to use than the other sink nodes provided by the streaming execution engine but it only makes sense when the output fits comfortably in memory. The node is created using :class:`TableSinkNodeOptions`. @@ -637,7 +637,7 @@ Example of using ``table_sink`` --------- ``scan`` is an operation used to load and process datasets. It should be preferred over the -more generic ``source`` node when your input is a dataset. The behavior is defined using +more generic ``source`` node when your input is a dataset. The behavior is defined using :class:`arrow::dataset::ScanNodeOptions`. More information on datasets and the various scan options can be found in :doc:`../dataset`. @@ -683,10 +683,10 @@ Write example: ``union`` ------------- -``union`` merges multiple data streams with the same schema into one, similar to +``union`` merges multiple data streams with the same schema into one, similar to a SQL ``UNION ALL`` clause. -The following example demonstrates how this can be achieved using +The following example demonstrates how this can be achieved using two data sources. Union example: @@ -704,15 +704,15 @@ Union example: ------------- ``hash_join`` operation provides the relational algebra operation, join using hash-based -algorithm. :class:`HashJoinNodeOptions` contains the options required in -defining a join. The hash_join supports +algorithm. :class:`HashJoinNodeOptions` contains the options required in +defining a join. The hash_join supports `left/right/full semi/anti/outerjoins -`_. +`_. Also the join-key (i.e. the column(s) to join on), and suffixes (i.e a suffix term like "_x" -which can be appended as a suffix for column names duplicated in both left and right -relations.) can be set via the join options. +which can be appended as a suffix for column names duplicated in both left and right +relations.) can be set via the join options. `Read more on hash-joins -`_. +`_. Hash-Join example: @@ -726,7 +726,7 @@ Hash-Join example: Summary ======= -There are examples of these nodes which can be found in +There are examples of these nodes which can be found in ``cpp/examples/arrow/execution_plan_documentation_examples.cc`` in the Arrow source. Complete Example: diff --git a/docs/source/cpp/api/scalar.rst b/docs/source/cpp/api/scalar.rst index 04e78450d7744..be9f9686bf110 100644 --- a/docs/source/cpp/api/scalar.rst +++ b/docs/source/cpp/api/scalar.rst @@ -44,4 +44,4 @@ Utilities .. doxygenclass:: arrow::ScalarVisitor :project: arrow_cpp :members: - :undoc-members: \ No newline at end of file + :undoc-members: diff --git a/docs/source/cpp/build_system.rst b/docs/source/cpp/build_system.rst index 60df117eb510e..0c94d7e5ce5dc 100644 --- a/docs/source/cpp/build_system.rst +++ b/docs/source/cpp/build_system.rst @@ -47,7 +47,7 @@ file into an executable linked with the Arrow C++ shared library: .. 
code-block:: cmake cmake_minimum_required(VERSION 3.16) - + project(MyExample) find_package(Arrow REQUIRED) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index e7310d2c0c711..546b6e5716df7 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -49,8 +49,8 @@ Computation inputs are represented as a general :class:`Datum` class, which is a tagged union of several shapes of data such as :class:`Scalar`, :class:`Array` and :class:`ChunkedArray`. Many compute functions support both array (chunked or not) and scalar inputs, however some will mandate -particular input types. For example, while ``array_sort_indices`` requires its -first and only input to be an array, the generalized ``sort_indices`` +particular input types. For example, while ``array_sort_indices`` requires its +first and only input to be an array, the generalized ``sort_indices`` function accepts an array, chunked array, record batch or table. .. _invoking-compute-functions: @@ -572,28 +572,28 @@ representation based on the rounding criterion. | trunc | Unary | Numeric | Float32/Float64/Decimal | | | +-------------------+------------+-------------+-------------------------+----------------------------------+--------+ -* \(1) By default rounding functions change a value to the nearest - integer using HALF_TO_EVEN to resolve ties. Options are available to control - the rounding criterion. All ``round`` functions have the +* \(1) By default rounding functions change a value to the nearest + integer using HALF_TO_EVEN to resolve ties. Options are available to control + the rounding criterion. All ``round`` functions have the ``round_mode`` option to set the rounding mode. * \(2) Round to a number of digits where the ``ndigits`` option of :struct:`RoundOptions` specifies the rounding precision in terms of number of digits. A negative value corresponds to digits in the non-fractional part. For example, -2 corresponds to rounding to the nearest multiple of 100 (zeroing the ones and tens digits). Default value of ``ndigits`` is 0 - which rounds to the nearest integer. For integer inputs a non-negative + which rounds to the nearest integer. For integer inputs a non-negative ``ndigits`` value is ignored and the input is returned unchanged. For integer - inputs, if ``-ndigits`` is larger than the maximum number of digits the + inputs, if ``-ndigits`` is larger than the maximum number of digits the input type can hold, an error is returned. * \(3) Round to a multiple where the ``multiple`` option of :struct:`RoundToMultipleOptions` specifies the rounding scale. The rounding - multiple has to be a positive value and can be casted to input type. - For example, 100 corresponds to rounding to the nearest multiple of 100 - (zeroing the ones and tens digits). Default value of ``multiple`` is 1 which + multiple has to be a positive value and can be casted to input type. + For example, 100 corresponds to rounding to the nearest multiple of 100 + (zeroing the ones and tens digits). Default value of ``multiple`` is 1 which rounds to the nearest integer. * \(4) Round the first input to multiple of the second input. The rounding - multiple has to be a positive value and can be casted to the first input type. - For example, 100 corresponds to rounding to the nearest multiple of 100 + multiple has to be a positive value and can be casted to the first input type. + For example, 100 corresponds to rounding to the nearest multiple of 100 (zeroing the ones and tens digits). 
For ``round`` functions, the following rounding modes are available. @@ -634,8 +634,8 @@ The example values are given for default values of ``ndigits`` and ``multiple``. | | | -3.5 -> -3, -4.5 -> -5 | +-----------------------+--------------------------------------------------------------+---------------------------+ -The following table gives examples of how ``ndigits`` (for the ``round`` -and ``round_binary`` functions) and ``multiple`` (for ``round_to_multiple``) +The following table gives examples of how ``ndigits`` (for the ``round`` +and ``round_binary`` functions) and ``multiple`` (for ``round_to_multiple``) influence the operation performed, respectively. +--------------------+-------------------+---------------------------+ @@ -1621,12 +1621,12 @@ Array-wise ("vector") functions Cumulative Functions ~~~~~~~~~~~~~~~~~~~~ -Cumulative functions are vector functions that perform a running accumulation on -their input using a given binary associative operation with an identity element -(a monoid) and output an array containing the corresponding intermediate running -values. The input is expected to be of numeric type. By default these functions -do not detect overflow. They are also available in an overflow-checking variant, -suffixed ``_checked``, which returns an ``Invalid`` :class:`Status` when +Cumulative functions are vector functions that perform a running accumulation on +their input using a given binary associative operation with an identity element +(a monoid) and output an array containing the corresponding intermediate running +values. The input is expected to be of numeric type. By default these functions +do not detect overflow. They are also available in an overflow-checking variant, +suffixed ``_checked``, which returns an ``Invalid`` :class:`Status` when overflow is detected. +-------------------------+-------+-------------+-------------+--------------------------------+-----------+ @@ -1649,8 +1649,8 @@ overflow is detected. * \(1) CumulativeOptions has two optional parameters. The first parameter :member:`CumulativeOptions::start` is a starting value for the running - accumulation. It has a default value of 0 for `sum`, 1 for `prod`, min of - input type for `max`, and max of input type for `min`. Specified values of + accumulation. It has a default value of 0 for `sum`, 1 for `prod`, min of + input type for `max`, and max of input type for `min`. Specified values of ``start`` must be castable to the input type. The second parameter :member:`CumulativeOptions::skip_nulls` is a boolean. When set to false (the default), the first encountered null is propagated. When set to @@ -1861,9 +1861,9 @@ replaced, based on the remaining inputs. Pairwise functions ~~~~~~~~~~~~~~~~~~~~ -Pairwise functions are unary vector functions that perform a binary operation on +Pairwise functions are unary vector functions that perform a binary operation on a pair of elements in the input array, typically on adjacent elements. The n-th -output is computed by applying the binary operation to the n-th and (n-p)-th inputs, +output is computed by applying the binary operation to the n-th and (n-p)-th inputs, where p is the period. The default period is 1, in which case the binary operation is applied to adjacent pairs of inputs. The period can also be negative, in which case the n-th output is computed by applying the binary @@ -1877,9 +1877,9 @@ operation to the n-th and (n+abs(p))-th inputs. 
| pairwise_diff_checked | Unary | Numeric/Temporal | Numeric/Temporal | :struct:`PairwiseOptions` | \(1)(3) | +------------------------+-------+----------------------+----------------------+--------------------------------+----------+ -* \(1) Computes the first order difference of an array, It internally calls - the scalar function ``Subtract`` (or the checked variant) to compute - differences, so its behavior and supported types are the same as - ``Subtract``. The period can be specified in :struct:`PairwiseOptions`. +* \(1) Computes the first-order difference of an array. It internally calls + the scalar function ``Subtract`` (or the checked variant) to compute + differences, so its behavior and supported types are the same as + ``Subtract``. The period can be specified in :struct:`PairwiseOptions`. * \(2) Wraps around the result when overflow is detected. * \(3) Returns an ``Invalid`` :class:`Status` when overflow is detected. diff --git a/docs/source/cpp/dataset.rst b/docs/source/cpp/dataset.rst index 1f5d0476c2889..a64b73b61c05d 100644 --- a/docs/source/cpp/dataset.rst +++ b/docs/source/cpp/dataset.rst @@ -378,28 +378,28 @@ Partitioning performance considerations Partitioning datasets has two aspects that affect performance: it increases the number of files and it creates a directory structure around the files. Both of these have benefits -as well as costs. Depending on the configuration and the size of your dataset, the costs -can outweigh the benefits. +as well as costs. Depending on the configuration and the size of your dataset, the costs +can outweigh the benefits. -Because partitions split up the dataset into multiple files, partitioned datasets can be -read and written with parallelism. However, each additional file adds a little overhead in -processing for filesystem interaction. It also increases the overall dataset size since +Because partitions split up the dataset into multiple files, partitioned datasets can be +read and written with parallelism. However, each additional file adds a little overhead in +processing for filesystem interaction. It also increases the overall dataset size since each file has some shared metadata. For example, each parquet file contains the schema and -group-level statistics. The number of partitions is a floor for the number of files. If -you partition a dataset by date with a year of data, you will have at least 365 files. If -you further partition by another dimension with 1,000 unique values, you will have up to +group-level statistics. The number of partitions is a floor for the number of files. If +you partition a dataset by date with a year of data, you will have at least 365 files. If +you further partition by another dimension with 1,000 unique values, you will have up to 365,000 files. Partitioning this finely often leads to small files that mostly consist of metadata. -Partitioned datasets create nested folder structures, and those allow us to prune which +Partitioned datasets create nested folder structures, and those allow us to prune which files are loaded in a scan. However, this adds overhead to discovering files in the dataset, as we'll need to recursively "list directory" to find the data files. Partitions that are too fine can cause problems here: partitioning a dataset by date for a year's worth -of data will require 365 list calls to find all the files; adding another column with +of data will require 365 list calls to find all the files; adding another column with cardinality 1,000 will make that 365,365 calls.
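To make the file-count trade-off concrete, here is a rough sketch of writing a dataset partitioned on the columns of a partitioning schema. The ``/data/sales`` path, the ``part-{i}.parquet`` template and the hive-style layout are hypothetical choices for illustration, assuming the standard ``arrow::dataset`` write API:

.. code-block:: cpp

   #include <arrow/api.h>
   #include <arrow/dataset/api.h>
   #include <arrow/filesystem/api.h>

   namespace ds = arrow::dataset;

   // Write the scanner's rows under /data/sales, creating one directory level
   // per partition column in `partition_schema` (hive-style key=value paths).
   arrow::Status WritePartitioned(std::shared_ptr<ds::Scanner> scanner,
                                  std::shared_ptr<arrow::fs::FileSystem> filesystem,
                                  std::shared_ptr<arrow::Schema> partition_schema) {
     ds::FileSystemDatasetWriteOptions options;
     options.filesystem = std::move(filesystem);
     options.base_dir = "/data/sales";
     options.basename_template = "part-{i}.parquet";
     options.file_write_options =
         std::make_shared<ds::ParquetFileFormat>()->DefaultWriteOptions();
     options.partitioning =
         std::make_shared<ds::HivePartitioning>(std::move(partition_schema));
     return ds::FileSystemDataset::Write(options, std::move(scanner));
   }

A coarser partition schema means fewer directories and larger files; a finer one means more "list directory" calls on read, as described above.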
The optimal partitioning layout will depend on your data, access patterns, and which -systems will be reading the data. Most systems, including Arrow, should work across a +systems will be reading the data. Most systems, including Arrow, should work across a range of file sizes and partitioning layouts, but there are extremes you should avoid. These guidelines can help avoid some known worst cases: diff --git a/docs/source/cpp/datatypes.rst b/docs/source/cpp/datatypes.rst index 4e1fe76b4d6f2..7eb70936f4e1d 100644 --- a/docs/source/cpp/datatypes.rst +++ b/docs/source/cpp/datatypes.rst @@ -72,8 +72,8 @@ To instantiate data types, it is recommended to call the provided Type Traits ----------- -Writing code that can handle concrete :class:`arrow::DataType` subclasses would -be verbose, if it weren't for type traits. Arrow's type traits map the Arrow +Writing code that can handle concrete :class:`arrow::DataType` subclasses would +be verbose if it weren't for type traits. Arrow's type traits map the Arrow data types to the specialized array, scalar, builder, and other associated types. For example, the Boolean type has traits: @@ -96,7 +96,7 @@ For example, the Boolean type has traits: See the :ref:`type-traits` for an explanation of each of these fields. Using type traits, one can write template functions that can handle a variety -of Arrow types. For example, to write a function that creates an array of +of Arrow types. For example, to write a function that creates an array of Fibonacci values for any Arrow numeric type: .. code-block:: cpp @@ -128,7 +128,7 @@ For some common cases, there are type associations on the classes themselves. Us Similar to the type traits provided in `std::type_traits `_, -Arrow provides type predicates such as ``is_number_type`` as well as +Arrow provides type predicates such as ``is_number_type`` as well as corresponding templates that wrap ``std::enable_if_t`` such as ``enable_if_number``. These can constrain template functions to only compile for relevant types, which is useful if other overloads need to be implemented.
For example, to write a sum @@ -176,20 +176,20 @@ here is how one might sum across columns of arbitrary numeric types: class TableSummation { double partial = 0.0; public: - + arrow::Result<double> Compute(std::shared_ptr<arrow::RecordBatch> batch) { for (std::shared_ptr<arrow::Array> array : batch->columns()) { ARROW_RETURN_NOT_OK(arrow::VisitArrayInline(*array, this)); } return partial; } - + // Default implementation arrow::Status Visit(const arrow::Array& array) { return arrow::Status::NotImplemented("Cannot compute sum for array of type ", array.type()->ToString()); } - + template <typename ArrayType, typename T = typename ArrayType::TypeClass> arrow::enable_if_number<T, arrow::Status> Visit(const ArrayType& array) { for (std::optional<typename T::c_type> value : array) { diff --git a/docs/source/cpp/examples/compute_and_write_example.rst b/docs/source/cpp/examples/compute_and_write_example.rst index e66d3ced55d0c..a4b619f7ffff3 100644 --- a/docs/source/cpp/examples/compute_and_write_example.rst +++ b/docs/source/cpp/examples/compute_and_write_example.rst @@ -21,8 +21,8 @@ Compute and Write CSV Example ============================= -The file ``cpp/examples/arrow/compute_and_write_csv_example.cc`` located inside -the source tree contains an example of creating a table of two numerical columns -and then comparing the magnitudes of the entries in the columns and writing out to +The file ``cpp/examples/arrow/compute_and_write_csv_example.cc`` located inside +the source tree contains an example of creating a table of two numerical columns +and then comparing the magnitudes of the entries in the columns and writing out to a CSV file with the column entries and their comparisons. The code in the example is documented. diff --git a/docs/source/cpp/flight.rst b/docs/source/cpp/flight.rst index e07a84e91ee4f..a1e9420bfd34e 100644 --- a/docs/source/cpp/flight.rst +++ b/docs/source/cpp/flight.rst @@ -350,10 +350,10 @@ Closing unresponsive connections calls Cancel() on a timer, with the main thread resetting the timer every time an operation completes successfully. For a fully worked-out example, see the Cookbook. - + .. note:: There is a long-standing ticket for a per-write/per-read timeout instead of a per-call timeout (ARROW-6062_), but this is not (easily) - possible to implement with the blocking gRPC API. + possible to implement with the blocking gRPC API. .. _best gRPC practices: https://grpc.io/docs/guides/performance/#general .. _gRPC keys: https://grpc.github.io/grpc/cpp/group__grpc__arg__keys.html diff --git a/docs/source/cpp/gandiva.rst b/docs/source/cpp/gandiva.rst index 07b07bee7ac4e..f60d1fc8ac8d9 100644 --- a/docs/source/cpp/gandiva.rst +++ b/docs/source/cpp/gandiva.rst @@ -29,8 +29,8 @@ Gandiva only handles projections and filters; for other transformations, see :ref:`Compute Functions `. Gandiva was designed to take advantage of the Arrow memory format and modern -hardware. From the Arrow memory model, since Arrow arrays have separate buffers for values and -validity bitmaps, values and their null status can often be processed +hardware. From the Arrow memory model, since Arrow arrays have separate buffers for values and +validity bitmaps, values and their null status can often be processed independently, allowing for better instruction pipelining. On modern hardware, compiling expressions using LLVM allows the execution to be optimized to the local runtime environment and hardware, including available SIMD @@ -42,25 +42,25 @@ pre-compiled into LLVM IR (intermediate representation).
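Returning to the Flight timeout note above: a per-call deadline is set through :class:`FlightCallOptions`. A minimal sketch, assuming the standard ``arrow::flight`` client API; the five-second value and the ``ListFlights`` call are illustrative:

.. code-block:: cpp

   #include <arrow/flight/api.h>

   namespace flight = arrow::flight;

   arrow::Status ListWithDeadline(flight::FlightClient* client) {
     flight::FlightCallOptions call_options;
     // Fail the call if it has not completed within 5 seconds.
     call_options.timeout = flight::TimeoutDuration(5.0);
     ARROW_ASSIGN_OR_RAISE(std::unique_ptr<flight::FlightListing> listing,
                           client->ListFlights(call_options, {}));
     return arrow::Status::OK();
   }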
Expression, Projector and Filter ================================ -To effectively utilize Gandiva, you will construct expression trees with ``TreeExprBuilder``, -including the creation of function nodes, if-else logic, and boolean expressions. +To effectively utilize Gandiva, you will construct expression trees with ``TreeExprBuilder``, +including the creation of function nodes, if-else logic, and boolean expressions. Subsequently, leverage ``Projector`` or ``Filter`` execution kernels to efficiently evaluate these expressions. -See :doc:`./gandiva/expr_projector_filter` for more details. +See :doc:`./gandiva/expr_projector_filter` for more details. External Functions Development ============================== -Gandiva offers the capability of integrating external functions, encompassing -both C functions and IR functions. This feature broadens the spectrum of -functions that can be applied within Gandiva expressions. For developers -looking to customize and enhance their computational solutions, -Gandiva provides the opportunity to develop and register their own external -functions, thus allowing for a more tailored and flexible use of the Gandiva +Gandiva offers the capability of integrating external functions, encompassing +both C functions and IR functions. This feature broadens the spectrum of +functions that can be applied within Gandiva expressions. For developers +looking to customize and enhance their computational solutions, +Gandiva provides the opportunity to develop and register their own external +functions, thus allowing for a more tailored and flexible use of the Gandiva environment. -See :doc:`./gandiva/external_func` for more details. +See :doc:`./gandiva/external_func` for more details. .. toctree:: :maxdepth: 2 gandiva/expr_projector_filter - gandiva/external_func \ No newline at end of file + gandiva/external_func diff --git a/docs/source/cpp/gandiva/expr_projector_filter.rst b/docs/source/cpp/gandiva/expr_projector_filter.rst index c960d1d869fe5..9d58b185032e3 100644 --- a/docs/source/cpp/gandiva/expr_projector_filter.rst +++ b/docs/source/cpp/gandiva/expr_projector_filter.rst @@ -30,7 +30,7 @@ literal values, created by :func:`TreeExprBuilder::MakeLiteral`. Nodes can be combined into more complex expression trees using: * :func:`TreeExprBuilder::MakeFunction` to create a function - node. (You can call :func:`GetRegisteredFunctionSignatures` to + node. (You can call :func:`GetRegisteredFunctionSignatures` to get a list of valid function signatures.) * :func:`TreeExprBuilder::MakeIf` to create if-else logic. * :func:`TreeExprBuilder::MakeAnd` and :func:`TreeExprBuilder::MakeOr` @@ -39,7 +39,7 @@ can be combined into more complex expression trees using: functions to create set membership tests. Each of these functions creates new composite nodes, which contain the leaf nodes -(literals and field references) or other composite nodes as children. By +(literals and field references) or other composite nodes as children. By composing these, you can create arbitrarily complex expression trees. Once an expression tree is built, it is wrapped in either :class:`Expression` @@ -84,7 +84,7 @@ reused to process distinct record batches in parallel. Evaluating projections ---------------------- -Execution is performed with :func:`Projector::Evaluate`. This outputs +Execution is performed with :func:`Projector::Evaluate`. This outputs a vector of arrays, which can be passed along with the output schema to :func:`arrow::RecordBatch::Make()`.
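Condensing the steps above, a sketch of building and evaluating a projection; the field names, the ``add`` function and the ``int32`` output type are illustrative, assuming the public ``gandiva::TreeExprBuilder`` and ``gandiva::Projector`` APIs:

.. code-block:: cpp

   #include <arrow/api.h>
   #include <gandiva/projector.h>
   #include <gandiva/tree_expr_builder.h>

   arrow::Status ProjectSum(std::shared_ptr<arrow::Schema> schema,
                            std::shared_ptr<arrow::RecordBatch> batch) {
     // Leaf nodes: references to the first two fields of the input schema.
     auto field_a = gandiva::TreeExprBuilder::MakeField(schema->field(0));
     auto field_b = gandiva::TreeExprBuilder::MakeField(schema->field(1));
     // Composite node: add the two fields, producing an int32.
     auto sum = gandiva::TreeExprBuilder::MakeFunction(
         "add", {field_a, field_b}, arrow::int32());
     auto expr = gandiva::TreeExprBuilder::MakeExpression(
         sum, arrow::field("sum", arrow::int32()));

     // Compile once; the projector can then be reused across record batches.
     std::shared_ptr<gandiva::Projector> projector;
     ARROW_RETURN_NOT_OK(gandiva::Projector::Make(schema, {expr}, &projector));

     arrow::ArrayVector outputs;
     ARROW_RETURN_NOT_OK(
         projector->Evaluate(*batch, arrow::default_memory_pool(), &outputs));
     return arrow::Status::OK();
   }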
@@ -99,14 +99,14 @@ Evaluating filters :func:`Filter::Evaluate` produces :class:`SelectionVector`, a vector of row indices that matched the filter condition. The selection vector -is a wrapper around an arrow integer array, parameterized by bitwidth. When -creating the selection vector (you must initialize it *before* passing to -``Evaluate()``), you must choose the bitwidth, which determines the max index +is a wrapper around an Arrow integer array, parameterized by bitwidth. When +creating the selection vector (you must initialize it *before* passing to +``Evaluate()``), you must choose the bitwidth, which determines the max index value it can hold, and the max number of slots, which determines how many indices -it may contain. In general, the max number of slots should be set to your batch -size and the bitwidth the smallest integer size that can represent all integers -less than the batch size. For example, if your batch size is 100k, set the -maximum number of slots to 100k and the bitwidth to 32 (since 2^16 = 64k which +it may contain. In general, the max number of slots should be set to your batch +size and the bitwidth the smallest integer size that can represent all integers +less than the batch size. For example, if your batch size is 100k, set the +maximum number of slots to 100k and the bitwidth to 32 (since 2^16 = 64k which would be too small). Once ``Evaluate()`` has been run and the :class:`SelectionVector` is @@ -123,10 +123,10 @@ output record batch. Evaluating projections and filters ---------------------------------- -Finally, you can also project while apply a selection vector, with +Finally, you can also project while applying a selection vector, with :func:`Projector::Evaluate()`. To do so, first make sure to initialize the :class:`Projector` with :func:`SelectionVector::GetMode()` so that the projector -compiles with the correct bitwidth. Then you can pass the +compiles with the correct bitwidth. Then you can pass the :class:`SelectionVector` into the :func:`Projector::Evaluate()` method. @@ -134,4 +134,4 @@ compiles with the correct bitwidth. Then you can pass the :language: cpp :start-after: (Doc section: Evaluate filter and projection) :end-before: (Doc section: Evaluate filter and projection) - :dedent: 2 \ No newline at end of file + :dedent: 2 diff --git a/docs/source/cpp/gandiva/external_func.rst b/docs/source/cpp/gandiva/external_func.rst index cdd8fc82e59db..f8bdde83d96e6 100644 --- a/docs/source/cpp/gandiva/external_func.rst +++ b/docs/source/cpp/gandiva/external_func.rst @@ -79,7 +79,7 @@ The ``NativeFunction`` class is used to define the metadata for an external func * ``ResultNullableType::kResultNullIfNull``: result validity is an intersection of the validity of the children. * ``ResultNullableType::kResultNullNever``: result is always valid. * ``ResultNullableType::kResultNullInternal``: result validity depends on some internal logic. -* ``pc_name``: The name of the corresponding precompiled function. +* ``pc_name``: The name of the corresponding precompiled function. * Typically, this name follows the convention ``{base_name}`` + ``_{param1_type}`` + ``{param2_type}`` + ... + ``{paramN_type}``. For example, if the base name is ``add`` and the function takes two ``int32`` parameters and returns an ``int32``, the precompiled function name would be ``add_int32_int32``, but this convention is not mandatory as long as you can guarantee its uniqueness. * ``flags``: Optional flags for additional function attributes (default is 0).
Please check out ``NativeFunction::kNeedsContext``, ``NativeFunction::kNeedsFunctionHolder``, and ``NativeFunction::kCanReturnErrors`` for more details. @@ -153,10 +153,10 @@ Not all Arrow data types are supported in Gandiva. The following table lists the | utf8 (as return type) | int64_t context, | | | const char*, | | | uint32_t* | -| | [see next section]| +| | [see next section]| +-------------------------------------+-------------------+ -Handling arrow::StringType (utf8 type) and arrow::BinaryType +Handling arrow::StringType (utf8 type) and arrow::BinaryType ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Both ``arrow::StringType`` and ``arrow::BinaryType`` are variable-length types, and they are handled similarly in external functions. Since ``arrow::StringType`` (utf8 type) is more commonly used, we will use it below as the example to explain how to handle variable-length types in external functions. @@ -179,7 +179,7 @@ When ``arrow::StringType`` (``utf8`` type) is used as the return type in a funct 2. **Function Parameters:** * **Context Parameter**: The C function should begin with an additional parameter, ``int64_t context``. This parameter is crucial for context management within the function. * **String Length Output Parameter**: The function should also include a ``uint32_t*`` parameter at the end. This output parameter will store the length of the returned string data. -3. **Return Value**: The function should return a ``const char*`` pointer, pointing to the string data. +3. **Return Value**: The function should return a ``const char*`` pointer, pointing to the string data. 4. **Function Implementation:** * **Memory Allocation and Error Messaging:** Within the function's implementation, use ``gdv_fn_context_arena_malloc`` and ``gdv_fn_context_set_error_msg`` for memory allocation and error messaging, respectively. Both functions take ``int64_t context`` as their first parameter, facilitating efficient context utilization. @@ -200,10 +200,10 @@ You can use ``gandiva::FunctionRegistry``'s APIs to register external C function NativeFunction func, void* c_function_ptr, std::optional<FunctionHolderMaker> function_holder_maker = std::nullopt); -The above API allows you to register an external C function. +The above API allows you to register an external C function. -* The ``NativeFunction`` object describes the metadata of the external C function. -* The ``c_function_ptr`` is the function pointer to the external C function's implementation. +* The ``NativeFunction`` object describes the metadata of the external C function. +* The ``c_function_ptr`` is the function pointer to the external C function's implementation. * The optional ``function_holder_maker`` is used to create a function holder for the external C function if the external C function requires a function holder. Check out the ``gandiva::FunctionHolder`` class and its several sub-classes for more details. External IR functions diff --git a/docs/source/cpp/getting_started.rst b/docs/source/cpp/getting_started.rst index 89bd4559ef1e6..2cab5d1581c1c 100644 --- a/docs/source/cpp/getting_started.rst +++ b/docs/source/cpp/getting_started.rst @@ -24,17 +24,17 @@ Getting Started The following articles demonstrate installation, use, and a basic understanding of Arrow. These articles will get you set up quickly using Arrow and give you a taste of what the library is capable of.
-Specifically, it contains: an installation and linking guide; documentation of conventions used -in the codebase and suggested for users; and tutorials, including: +Specifically, it contains: an installation and linking guide; documentation of conventions used +in the codebase and suggested for users; and tutorials, including: -* Building Arrow arrays and tabular structures +* Building Arrow arrays and tabular structures * Reading and writing Parquet, Arrow, and CSV files * Executing compute kernels on arrays * Reading and writing multi-file partitioned datasets Start here to gain a basic understanding of Arrow, and move on to the :doc:`/cpp/user_guide` to -explore more specific topics and underlying concepts, or the :doc:`/cpp/api` to explore Arrow's -API. +explore more specific topics and underlying concepts, or the :doc:`/cpp/api` to explore Arrow's +API. .. toctree:: @@ -44,5 +44,3 @@ API. tutorials/io_tutorial.rst tutorials/compute_tutorial.rst tutorials/datasets_tutorial.rst - - diff --git a/docs/source/cpp/memory.rst b/docs/source/cpp/memory.rst index ad8276e3728a2..33907b5580f61 100644 --- a/docs/source/cpp/memory.rst +++ b/docs/source/cpp/memory.rst @@ -205,7 +205,7 @@ simply do:: Memory Profiling ================ -On Linux, detailed profiles of memory allocations can be generated using +On Linux, detailed profiles of memory allocations can be generated using ``perf record``, without any need to modify the binaries. These profiles can show the traceback in addition to allocation size. This does require debug symbols, from either a debug build or a release with debug symbols build. @@ -234,14 +234,14 @@ recorded allocations, so we can correlate them with the call to free/de-allocate .. tab-set:: .. tab-item:: jemalloc - + .. code-block:: shell - perf probe -x libarrow.so je_arrow_mallocx '$params' - perf probe -x libarrow.so je_arrow_mallocx%return '$retval' - perf probe -x libarrow.so je_arrow_rallocx '$params' - perf probe -x libarrow.so je_arrow_rallocx%return '$retval' - perf probe -x libarrow.so je_arrow_dallocx '$params' + perf probe -x libarrow.so je_arrow_mallocx '$params' + perf probe -x libarrow.so je_arrow_mallocx%return '$retval' + perf probe -x libarrow.so je_arrow_rallocx '$params' + perf probe -x libarrow.so je_arrow_rallocx%return '$retval' + perf probe -x libarrow.so je_arrow_dallocx '$params' PROBE_ARGS="-e probe_libarrow:je_arrow_mallocx \ -e probe_libarrow:je_arrow_mallocx__return \ -e probe_libarrow:je_arrow_rallocx \ @@ -249,13 +249,13 @@ recorded allocations, so we can correlate them with the call to free/de-allocate -e probe_libarrow:je_arrow_dallocx" .. tab-item:: mimalloc - + .. code-block:: shell - perf probe -x libarrow.so mi_malloc_aligned '$params' - perf probe -x libarrow.so mi_malloc_aligned%return '$retval' - perf probe -x libarrow.so mi_realloc_aligned '$params' - perf probe -x libarrow.so mi_realloc_aligned%return '$retval' + perf probe -x libarrow.so mi_malloc_aligned '$params' + perf probe -x libarrow.so mi_malloc_aligned%return '$retval' + perf probe -x libarrow.so mi_realloc_aligned '$params' + perf probe -x libarrow.so mi_realloc_aligned%return '$retval' perf probe -x libarrow.so mi_free '$params' PROBE_ARGS="-e probe_libarrow:mi_malloc_aligned \ -e probe_libarrow:mi_malloc_aligned__return \ @@ -277,9 +277,9 @@ If you want to profile a running process, you can run ``perf record -p `` and it will record until you interrupt with CTRL+C. Alternatively, you can do ``perf record -P sleep 10`` to record for 10 seconds. 
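Complementary to the ``perf``-based profiling described above, allocator statistics can also be checked in-process through :class:`arrow::MemoryPool`; a small sketch using only the public pool accessors:

.. code-block:: cpp

   #include <iostream>
   #include <arrow/api.h>

   void ReportPoolStats() {
     arrow::MemoryPool* pool = arrow::default_memory_pool();
     // Bytes currently allocated, the high-water mark, and the backend in use
     // (e.g. jemalloc, mimalloc, or the system allocator).
     std::cout << "allocated:  " << pool->bytes_allocated() << "\n"
               << "max memory: " << pool->max_memory() << "\n"
               << "backend:    " << pool->backend_name() << std::endl;
   }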
-The resulting data can be processed with standard tools to work with perf or +The resulting data can be processed with standard tools that work with perf, or ``perf script`` can be used to pipe a text format of the data to custom scripts. -The following script parses ``perf script`` output and prints the output in +The following script parses ``perf script`` output and prints the output in newline-delimited JSON for easier processing. .. code-block:: python @@ -354,7 +354,7 @@ Here's an example invocation of that script, with a preview of output data: From there one can answer a number of questions. For example, the following -script will find which allocations were never freed, and print the associated +script will find which allocations were never freed, and print the associated tracebacks along with the count of dangling allocations: .. code-block:: python diff --git a/docs/source/cpp/parquet.rst b/docs/source/cpp/parquet.rst index 3e06352f5dde3..96897d139b351 100644 --- a/docs/source/cpp/parquet.rst +++ b/docs/source/cpp/parquet.rst @@ -51,8 +51,8 @@ FileReader ---------- To read Parquet data into Arrow structures, use :class:`arrow::FileReader`. -To construct, it requires a :class:`::arrow::io::RandomAccessFile` instance -representing the input file. To read the whole file at once, +To construct, it requires a :class:`::arrow::io::RandomAccessFile` instance +representing the input file. To read the whole file at once, use :func:`arrow::FileReader::ReadTable`: .. literalinclude:: ../../../cpp/examples/arrow/parquet_read_write.cc @@ -67,7 +67,7 @@ Finer-grained options are available through the and :class:`ArrowReaderProperties` classes. For reading as a stream of batches, use the :func:`arrow::FileReader::GetRecordBatchReader` -method to retrieve a :class:`arrow::RecordBatchReader`. It will use the batch +method to retrieve a :class:`arrow::RecordBatchReader`. It will use the batch size set in :class:`ArrowReaderProperties`. .. literalinclude:: ../../../cpp/examples/arrow/parquet_read_write.cc @@ -106,8 +106,8 @@ If memory efficiency is more important than performance, then: #. Turn on ``enable_buffered_stream`` in :class:`parquet::ReaderProperties`. In addition, if you know certain columns contain many repeated values, you can -read them as :term:`dictionary encoded` columns. This is -enabled with the ``set_read_dictionary`` setting on :class:`ArrowReaderProperties`. +read them as :term:`dictionary encoded` columns. This is +enabled with the ``set_read_dictionary`` setting on :class:`ArrowReaderProperties`. If the files were written with Arrow C++ and the ``store_schema`` was activated, then the original Arrow schema will be automatically read and will override this setting. @@ -174,7 +174,7 @@ The :func:`arrow::WriteTable` function writes an entire .. note:: - Column compression is off by default in C++. See :ref:`below ` + Column compression is off by default in C++. See :ref:`below ` for how to choose a compression codec in the writer properties. To write out data batch-by-batch, use :class:`arrow::FileWriter`. @@ -191,9 +191,9 @@ StreamWriter The :class:`StreamWriter` allows for Parquet files to be written using standard C++ output operators, similar to reading with the :class:`StreamReader` -class.
This type-safe approach also ensures that rows are written without +omitting fields and allows for new row groups to be created automatically +(after a certain volume of data) or explicitly by using the :type:`EndRowGroup` stream modifier. Exceptions are used to signal errors. A :class:`ParquetException` is @@ -266,20 +266,20 @@ group that takes precedence over the ``chunk_size`` passed in the write methods. You can set the version of Parquet to write with ``version``, which determines which logical types are available. In addition, you can set the data page version with ``data_page_version``. It's V1 by default; setting to V2 will allow more -optimal compression (skipping compressing pages where there isn't a space +optimal compression (skipping compressing pages where there isn't a space benefit), but not all readers support this data page version. -Compression is off by default, but to get the most out of Parquet, you should -also choose a compression codec. You can choose one for the whole file or +Compression is off by default, but to get the most out of Parquet, you should +also choose a compression codec. You can choose one for the whole file or choose one for individual columns. If you choose a mix, the file-level option -will apply to columns that don't have a specific compression codec. See +will apply to columns that don't have a specific compression codec. See :class:`::arrow::Compression` for options. -Column data encodings can likewise be applied at the file-level or at the -column level. By default, the writer will attempt to dictionary encode all +Column data encodings can likewise be applied at the file level or at the +column level. By default, the writer will attempt to dictionary encode all supported columns, unless the dictionary grows too large. This behavior can be changed at the file level or at the column level with ``disable_dictionary()``. -When not using dictionary encoding, it will fallback to the encoding set for +When not using dictionary encoding, it will fall back to the encoding set for the column or the overall file; by default ``Encoding::PLAIN``, but this can be changed with ``encoding()``. @@ -559,7 +559,7 @@ Encryption Parquet C++ implements all features specified in the `encryption specification `__, -except for encryption of column index and bloom filter modules. +except for encryption of column index and bloom filter modules. More specifically, Parquet C++ supports: diff --git a/docs/source/cpp/tables.rst b/docs/source/cpp/tables.rst index b28a9fc1e13a5..d98a2acde6620 100644 --- a/docs/source/cpp/tables.rst +++ b/docs/source/cpp/tables.rst @@ -81,13 +81,13 @@ and computation functions, possibly incremental. :alt: A graphical representation of an Arrow Table and a Record Batch, with structure as described in text above. -Record batches can be sent between implementations, such as via +Record batches can be sent between implementations, such as via :ref:`IPC ` or -via the :doc:`C Data Interface <../format/CDataInterface>`. Tables and +via the :doc:`C Data Interface <../format/CDataInterface>`. Tables and chunked arrays, on the other hand, are concepts in the C++ implementation, not in the Arrow format itself, so they aren't directly portable. -However, a table can be converted to and built from a sequence of record +However, a table can be converted to and built from a sequence of record batches easily without needing to copy the underlying array buffers.
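A brief sketch of that round trip, building a :class:`Table` from batches and streaming it back out (error handling condensed; assumes only the core ``arrow::Table`` and ``arrow::TableBatchReader`` APIs):

.. code-block:: cpp

   #include <arrow/api.h>

   arrow::Status RoundTrip(
       const std::vector<std::shared_ptr<arrow::RecordBatch>>& batches) {
     // Combine the batches into a table; the array buffers are shared, not copied.
     ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Table> table,
                           arrow::Table::FromRecordBatches(batches));

     // Stream the table back out as a sequence of record batches.
     arrow::TableBatchReader reader(*table);
     std::shared_ptr<arrow::RecordBatch> batch;
     while (true) {
       ARROW_RETURN_NOT_OK(reader.ReadNext(&batch));
       if (batch == nullptr) break;  // end of stream
       // ... process `batch` ...
     }
     return arrow::Status::OK();
   }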
A table can be streamed as an arbitrary number of record batches using a :class:`arrow::TableBatchReader`. Conversely, a logical sequence of diff --git a/docs/source/cpp/threading.rst b/docs/source/cpp/threading.rst index 24ad25b5a028a..4a1a65ffe012d 100644 --- a/docs/source/cpp/threading.rst +++ b/docs/source/cpp/threading.rst @@ -99,4 +99,4 @@ Arrow C++ uses :class:`arrow::Future` to communicate results between threads. T an :class:`arrow::Future` will be created when an operation needs to perform some kind of long running task that will block for some period of time. :class:`arrow::Future` objects are mainly meant for internal use and any method that returns an -:class:`arrow::Future` will usually have a synchronous variant as well. \ No newline at end of file +:class:`arrow::Future` will usually have a synchronous variant as well. diff --git a/docs/source/cpp/tutorials/compute_tutorial.rst b/docs/source/cpp/tutorials/compute_tutorial.rst index bcb87e6a8f992..a650865d75ce4 100644 --- a/docs/source/cpp/tutorials/compute_tutorial.rst +++ b/docs/source/cpp/tutorials/compute_tutorial.rst @@ -34,7 +34,7 @@ functionality to: 3. Search for a value in a column -Pre-requisites +Pre-requisites --------------- Before continuing, make sure you have: @@ -49,16 +49,16 @@ Setup Before running some computations, we need to fill in a couple gaps: 1. We need to include necessary headers. - + 2. ``A main()`` is needed to glue things together. 3. We need data to play with. - + Includes ^^^^^^^^ -Before writing C++ code, we need some includes. We'll get ``iostream`` for output, then import Arrow's -compute functionality: +Before writing C++ code, we need some includes. We'll get ``iostream`` for output, then import Arrow's +compute functionality: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/compute_example.cc :language: cpp @@ -340,4 +340,4 @@ Refer to the below for a copy of the complete code: :start-after: (Doc section: Compute Example) :end-before: (Doc section: Compute Example) :linenos: - :lineno-match: \ No newline at end of file + :lineno-match: diff --git a/docs/source/cpp/tutorials/datasets_tutorial.rst b/docs/source/cpp/tutorials/datasets_tutorial.rst index 285fc24d8d599..f60e1e52170ae 100644 --- a/docs/source/cpp/tutorials/datasets_tutorial.rst +++ b/docs/source/cpp/tutorials/datasets_tutorial.rst @@ -33,7 +33,7 @@ file on disk. In this article, you will: 2. write out a partitioned dataset from a Table. -Pre-requisites +Pre-requisites --------------- Before continuing, make sure you have: @@ -50,7 +50,7 @@ Setup Before running some computations, we need to fill in a couple gaps: 1. We need to include necessary headers. - + 2. A ``main()`` is needed to glue things together. 3. We need data on disk to play with. @@ -58,8 +58,8 @@ Before running some computations, we need to fill in a couple gaps: Includes ^^^^^^^^ -Before writing C++ code, we need some includes. We'll get ``iostream`` for output, then import Arrow's -compute functionality for each file type we'll work with in this article: +Before writing C++ code, we need some includes. We'll get ``iostream`` for output, then import Arrow's +compute functionality for each file type we'll work with in this article: .. 
literalinclude:: ../../../../cpp/examples/tutorial_examples/dataset_example.cc :language: cpp @@ -206,7 +206,7 @@ Build Dataset using Factory ^^^^^^^^^^^^^^^^^^^^^^^^^^^ With a :class:`dataset::FileSystemDatasetFactory` set up, we can actually build our -:class:`dataset::Dataset` with :func:`dataset::FileSystemDatasetFactory::Finish`, just +:class:`dataset::Dataset` with :func:`dataset::FileSystemDatasetFactory::Finish`, just like with an :class:`ArrayBuilder` back in the basic tutorial: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/dataset_example.cc @@ -228,14 +228,14 @@ dataset, and print those out, along with some small info: Move Dataset into Table ^^^^^^^^^^^^^^^^^^^^^^^ -One way we can do something with :class:`Datasets ` is getting -them into a :class:`Table`, where we can do anything we’ve learned we can do to -:class:`Tables
` to that :class:`Table`. +One way we can do something with :class:`Datasets ` is getting +them into a :class:`Table`, where we can do anything we’ve learned we can do to +:class:`Tables
` to that :class:`Table`. .. seealso:: :doc:`/cpp/streaming_execution` for execution that avoids manifesting the entire dataset in memory. -In order to move a :class:`Dataset’s ` contents into a :class:`Table`, -we need a :class:`dataset::Scanner`, which scans the data and outputs it to the :class:`Table`. +In order to move a :class:`Dataset’s ` contents into a :class:`Table`, +we need a :class:`dataset::Scanner`, which scans the data and outputs it to the :class:`Table`. First, we get a :class:`dataset::ScannerBuilder` from the :class:`dataset::Dataset`: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/dataset_example.cc @@ -305,7 +305,7 @@ Create Scanner for Moving Table Data The process for writing a :class:`dataset::Dataset`, once a source of data is available, is similar to the reverse of reading it. Before, we used a :class:`dataset::Scanner` in order to scan into a :class:`Table` – now, we need one to read out of our -:class:`TableBatchReader`. To get that :class:`dataset::Scanner`, we’ll make a :class:`dataset::ScannerBuilder` +:class:`TableBatchReader`. To get that :class:`dataset::Scanner`, we’ll make a :class:`dataset::ScannerBuilder` based on our :class:`TableBatchReader`, then use that Builder to build a :class:`dataset::Scanner`: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/dataset_example.cc @@ -343,7 +343,7 @@ Arrow, so we’ll write back out to that: :start-after: (Doc section: Write Format) :end-before: (Doc section: Write Format) -Configure FileSystemDatasetWriteOptions +Configure FileSystemDatasetWriteOptions ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In order to write to disk, we need some configuration. We’ll do so via @@ -435,11 +435,11 @@ tutorials. With that, you’ve read and written partitioned datasets! This method, with some configuration, will work for any supported dataset format. For an example of such a dataset, the NYC Taxi dataset is a well-known -one, which you can find `here `_. +one, which you can find `here `_. Now you can get larger-than-memory data mapped for use! Which means that now we have to be able to process this data without -pulling it all into memory at once. For this, try Acero. +pulling it all into memory at once. For this, try Acero. .. seealso:: :doc:`/cpp/streaming_execution` for more information on Acero. @@ -450,4 +450,4 @@ Refer to the below for a copy of the complete code: :start-after: (Doc section: Dataset Example) :end-before: (Doc section: Dataset Example) :linenos: - :lineno-match: \ No newline at end of file + :lineno-match: diff --git a/docs/source/cpp/tutorials/io_tutorial.rst b/docs/source/cpp/tutorials/io_tutorial.rst index f981c94b83e32..309f10a350aa3 100644 --- a/docs/source/cpp/tutorials/io_tutorial.rst +++ b/docs/source/cpp/tutorials/io_tutorial.rst @@ -33,7 +33,7 @@ the start to end of an application. In this article, you will: 3. Read a Parquet file into a :class:`Table` and write it back out afterwards -Pre-requisites +Pre-requisites --------------- Before continuing, make sure you have: @@ -50,7 +50,7 @@ Setup Before writing out some file I/O, we need to fill in a couple gaps: 1. We need to include necessary headers. - + 2. A ``main()`` is needed to glue things together. 3. We need files to play with. @@ -58,8 +58,8 @@ Before writing out some file I/O, we need to fill in a couple gaps: Includes ^^^^^^^^ -Before writing C++ code, we need some includes. 
We'll get ``iostream`` for output, then import Arrow's -I/O functionality for each file type we'll work with in this article: +Before writing C++ code, we need some includes. We'll get ``iostream`` for output, then import Arrow's +I/O functionality for each file type we'll work with in this article: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/file_access_example.cc :language: cpp @@ -156,8 +156,8 @@ Opening an Arrow file Reader ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ An :class:`io::ReadableFile` is too generic to offer all functionality to read an Arrow file. -We need to use it to get an :class:`ipc::RecordBatchFileReader` object. This object implements -all the logic needed to read an Arrow file with correct formatting. We get one through +We need to use it to get an :class:`ipc::RecordBatchFileReader` object. This object implements +all the logic needed to read an Arrow file with correct formatting. We get one through :func:`ipc::RecordBatchFileReader::Open`: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/file_access_example.cc @@ -294,8 +294,8 @@ Write a CSV File from Table CSV writing to :class:`Table` looks exactly like IPC writing to :class:`RecordBatch`, except with our :class:`Table`, and using :func:`ipc::RecordBatchWriter::WriteTable` instead of -:func:`ipc::RecordBatchWriter::WriteRecordBatch`. Note that the same writer class is used -- -we're writing with :func:`ipc::RecordBatchWriter::WriteTable` because we have a :class:`Table`. We’ll target +:func:`ipc::RecordBatchWriter::WriteRecordBatch`. Note that the same writer class is used -- +we're writing with :func:`ipc::RecordBatchWriter::WriteTable` because we have a :class:`Table`. We’ll target a file, use our :class:`Table’s
` :class:`Schema`, and then write the :class:`Table`: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/file_access_example.cc @@ -358,7 +358,7 @@ even though we used :func:`io::ReadableFile::Open`. Note that we pass our Reading a Parquet File to Table ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -With a prepared :class:`parquet::arrow::FileReader` in hand, we can read to a +With a prepared :class:`parquet::arrow::FileReader` in hand, we can read to a :class:`Table`, except we must pass the :class:`Table` by reference instead of outputting to it: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/file_access_example.cc @@ -401,4 +401,4 @@ Refer to the below for a copy of the complete code: :start-after: (Doc section: File I/O) :end-before: (Doc section: File I/O) :linenos: - :lineno-match: \ No newline at end of file + :lineno-match: diff --git a/docs/source/developers/continuous_integration/index.rst b/docs/source/developers/continuous_integration/index.rst index f988b5ab69d50..cfca14e10e48c 100644 --- a/docs/source/developers/continuous_integration/index.rst +++ b/docs/source/developers/continuous_integration/index.rst @@ -27,4 +27,4 @@ Continuous Integration overview docker archery - crossbow \ No newline at end of file + crossbow diff --git a/docs/source/developers/cpp/building.rst b/docs/source/developers/cpp/building.rst index 5fab745679e93..040a046c5153d 100644 --- a/docs/source/developers/cpp/building.rst +++ b/docs/source/developers/cpp/building.rst @@ -67,7 +67,7 @@ On Alpine Linux: gcc \ ninja \ make - + On Fedora Linux: .. code-block:: shell @@ -99,7 +99,7 @@ On macOS, you can use `Homebrew `_: With `vcpkg `_: .. code-block:: shell - + git clone https://github.com/apache/arrow.git cd arrow vcpkg install \ @@ -362,7 +362,7 @@ boolean flags to ``cmake``. * ``-DARROW_GCS=ON``: Build Arrow with GCS support (requires the GCloud SDK for C++) * ``-DARROW_HDFS=ON``: Arrow integration with libhdfs for accessing the Hadoop Filesystem -* ``-DARROW_JEMALLOC=ON``: Build the Arrow jemalloc-based allocator, on by default +* ``-DARROW_JEMALLOC=ON``: Build the Arrow jemalloc-based allocator, on by default * ``-DARROW_JSON=ON``: JSON reader module * ``-DARROW_MIMALLOC=ON``: Build the Arrow mimalloc-based allocator * ``-DARROW_ORC=ON``: Arrow integration with Apache ORC @@ -375,7 +375,7 @@ boolean flags to ``cmake``. instead. 
* ``-DARROW_S3=ON``: Support for Amazon S3-compatible filesystems * ``-DARROW_SUBSTRAIT=ON``: Build with support for Substrait -* ``-DARROW_WITH_RE2=ON``: Build with support for regular expressions using the re2 +* ``-DARROW_WITH_RE2=ON``: Build with support for regular expressions using the re2 library, on by default and used when ``ARROW_COMPUTE`` or ``ARROW_GANDIVA`` is ``ON`` * ``-DARROW_WITH_UTF8PROC=ON``: Build with support for Unicode properties using the utf8proc library, on by default and used when ``ARROW_COMPUTE`` or ``ARROW_GANDIVA`` @@ -472,7 +472,7 @@ The build system supports a number of third-party dependencies * ``c-ares``: a dependency of gRPC * ``gflags``: for command line utilities (formerly Googleflags) * ``GLOG``: for logging - * ``google_cloud_cpp_storage``: for Google Cloud Storage support, requires + * ``google_cloud_cpp_storage``: for Google Cloud Storage support, requires system cURL and can use the ``BUNDLED`` method described below * ``gRPC``: for remote procedure calls * ``GTest``: Googletest, for testing diff --git a/docs/source/developers/cpp/windows.rst b/docs/source/developers/cpp/windows.rst index 251a45325fe0b..60ac949e81663 100644 --- a/docs/source/developers/cpp/windows.rst +++ b/docs/source/developers/cpp/windows.rst @@ -379,9 +379,9 @@ Downloading the Timezone Database ================================= To run some of the compute unit tests on Windows, the IANA timezone database -and the Windows timezone mapping need to be downloaded first. See +and the Windows timezone mapping need to be downloaded first. See :ref:`download-timezone-database` for download instructions. To set a non-default -path for the timezone database while running the unit tests, set the +path for the timezone database while running the unit tests, set the ``ARROW_TIMEZONE_DATABASE`` environment variable. Replicating Appveyor Builds diff --git a/docs/source/developers/guide/architectural_overview.rst b/docs/source/developers/guide/architectural_overview.rst index 58e05c85f457e..085a814453c84 100644 --- a/docs/source/developers/guide/architectural_overview.rst +++ b/docs/source/developers/guide/architectural_overview.rst @@ -29,8 +29,8 @@ Architectural Overview ********************** -A general overview of Apache Arrow project can be found on the -`front page `_ and in the +A general overview of Apache Arrow project can be found on the +`front page `_ and in the `Apache Arrow Overview `_. You can also have a look at the `Frequently Asked Questions `_. diff --git a/docs/source/developers/guide/communication.rst b/docs/source/developers/guide/communication.rst index a8659f83ac04d..749c94f9419b2 100644 --- a/docs/source/developers/guide/communication.rst +++ b/docs/source/developers/guide/communication.rst @@ -27,7 +27,7 @@ .. _communication: ************* -Communication +Communication ************* **About the contributors** @@ -50,7 +50,7 @@ tags ([C++], [R], [Ruby] etc.) so it gets noticed by the right people. Where to get help 👋 ==================== -For any question you may have or problems you are facing you can write to +For any question you may have or problems you are facing you can write to user or development :ref:`mailing_list` or you can create an issue on :ref:`github`. Also use GitHub to search through the issues, report bugs and create feature requests or proposals. 
diff --git a/docs/source/developers/guide/documentation.rst b/docs/source/developers/guide/documentation.rst index 3bb3bebef5098..8f9d7311e765f 100644 --- a/docs/source/developers/guide/documentation.rst +++ b/docs/source/developers/guide/documentation.rst @@ -49,7 +49,7 @@ documentation itself, you can search for an issue in GitHub. Documentation improvements are also a great way to gain some experience with our submission and review process without -requiring a lot of local development environment setup. +requiring a lot of local development environment setup. .. note:: Many documentation-only changes can be made directly in the @@ -114,4 +114,3 @@ library. Source folder includes: **Cookbooks** have their own repository ``_ and can be separately cloned and built. - diff --git a/docs/source/developers/guide/index.rst b/docs/source/developers/guide/index.rst index 353c8332ff0b5..0ed27a0ddc54e 100644 --- a/docs/source/developers/guide/index.rst +++ b/docs/source/developers/guide/index.rst @@ -83,17 +83,17 @@ of adding a basic feature. the installation of third-party packages, depending on which build options and components you enable. The C++ build guide has suggestions for commonly encountered issues - you can find it - :ref:`here `. + :ref:`here `. Anytime you are stuck, feel free to reach out via appropriate :ref:`communication` channel. - See a short description about the building process of + See a short description about the building process of :ref:`PyArrow or the R package` or go straight to detailed instructions on how to build one of Arrow libraries in the `documentation `_ . - + #. **Run the tests** - + We should run the tests to check if everything is working correctly. For example, you can run the tests from a terminal for Python @@ -155,7 +155,7 @@ There are lots of ways to contribute to the project besides writing code! * Improving the **documentation** is a great way to start contributing! For more information visit :ref:`documentation` section of the guide. -* **Apache Arrow Cookbooks** are a collection of recipes for solving various problems +* **Apache Arrow Cookbooks** are a collection of recipes for solving various problems and completing different tasks using Apache Arrow. They are also a great way to start contributing. For more information visit `How to contribute to Apache Arrow Cookbook `_ diff --git a/docs/source/developers/guide/resources.rst b/docs/source/developers/guide/resources.rst index f350f469af403..b5905af65499b 100644 --- a/docs/source/developers/guide/resources.rst +++ b/docs/source/developers/guide/resources.rst @@ -78,7 +78,7 @@ Reproducible examples: - `Tidyverse: Make a reprex `_ - `Craft Minimal Bug Reports by Matthew Rocklin `_ -Recommended references +Recommended references ---------------------- - Slatkin, Brett, *Effective Python: 90 Specific Ways to Write Better Python*, Addison-Wesley Professional, 2019 diff --git a/docs/source/developers/guide/step_by_step/finding_issues.rst b/docs/source/developers/guide/step_by_step/finding_issues.rst index 390c56a81c73f..a76b15e917e9a 100644 --- a/docs/source/developers/guide/step_by_step/finding_issues.rst +++ b/docs/source/developers/guide/step_by_step/finding_issues.rst @@ -65,7 +65,7 @@ person who triaged the ticket expected it to be. Don't hesitate to write that in the comments. .. note:: - + When you find a GitHub issue you would like to work on, please mention your interest in the comment section of that issue; that way we will know you are working on it. 
diff --git a/docs/source/developers/guide/step_by_step/set_up.rst b/docs/source/developers/guide/step_by_step/set_up.rst index 60b472637badb..9a2177568d6f5 100644 --- a/docs/source/developers/guide/step_by_step/set_up.rst +++ b/docs/source/developers/guide/step_by_step/set_up.rst @@ -60,7 +60,7 @@ a username and password each time you execute a git command. RStudio project and will create a ``.Rproj`` file in the root directory. For this reason it is *highly recommended* to clone the repository using the command line or a Git client. - + Get the source code =================== diff --git a/docs/source/developers/guide/step_by_step/styling.rst b/docs/source/developers/guide/step_by_step/styling.rst index bb428b0b6ab40..c155acb389512 100644 --- a/docs/source/developers/guide/step_by_step/styling.rst +++ b/docs/source/developers/guide/step_by_step/styling.rst @@ -59,4 +59,4 @@ check your code and will stop the commit process, described in the following section, if there are any errors. - `Pre-commit installation instructions `_ -- `Pre-commit hooks `_ \ No newline at end of file +- `Pre-commit hooks `_ diff --git a/docs/source/developers/guide/tutorials/index.rst b/docs/source/developers/guide/tutorials/index.rst index dcefab23230f9..5f44231afc9c2 100644 --- a/docs/source/developers/guide/tutorials/index.rst +++ b/docs/source/developers/guide/tutorials/index.rst @@ -25,4 +25,4 @@ Tutorials :maxdepth: 1 python_tutorial - r_tutorial \ No newline at end of file + r_tutorial diff --git a/docs/source/developers/guide/tutorials/python_tutorial.rst b/docs/source/developers/guide/tutorials/python_tutorial.rst index 7f004160b0e75..c12c4489aee95 100644 --- a/docs/source/developers/guide/tutorials/python_tutorial.rst +++ b/docs/source/developers/guide/tutorials/python_tutorial.rst @@ -137,7 +137,7 @@ function is defined in the ``compute.py`` file. After examining the ``compute.py`` file we can see that together with ``_compute.pyx`` the functions from C++ get wrapped into Python. -We will define the new feature at the end of the ``compute.py`` file. +We will define the new feature at the end of the ``compute.py`` file. Lets run some code in the Python console from ``arrow/python`` directory in order to learn more about ``pc.min_max``. @@ -147,10 +147,10 @@ directory in order to learn more about ``pc.min_max``. $ cd python $ python - Python 3.9.7 (default, Oct 22 2021, 13:24:00) + Python 3.9.7 (default, Oct 22 2021, 13:24:00) [Clang 13.0.0 (clang-1300.0.29.3)] on darwin Type "help", "copyright", "credits" or "license" for more information. - + We have entered into the Python console from the shell and we can do some research: @@ -278,7 +278,7 @@ options for the ``pc.min_max`` function we can finish the work. return pa.scalar([('min-', min_t), ('max+', max_t)], type=ty) .. TODO seealso - .. For more information about the Arrow codebase visit + .. For more information about the Arrow codebase visit .. :ref:``. (link to working on the Arrow codebase section) Adding a test @@ -303,24 +303,24 @@ a specific unit test, pass in the test name to the ``-k`` parameter. .. 
code:: console $ cd python - $ python -m pytest pyarrow/tests/test_compute.py -k test_tutorial_min_max + $ python -m pytest pyarrow/tests/test_compute.py -k test_tutorial_min_max ======================== test session starts ========================== platform darwin -- Python 3.9.7, pytest-6.2.5, py-1.10.0, pluggy-1.0.0 rootdir: /Users/alenkafrim/repos/arrow/python, configfile: setup.cfg plugins: hypothesis-6.24.1, lazy-fixture-0.6.3 - collected 204 items / 203 deselected / 1 selected + collected 204 items / 203 deselected / 1 selected pyarrow/tests/test_compute.py . [100%] ======================== 1 passed, 203 deselected in 0.16s ============ - - $ python -m pytest pyarrow/tests/test_compute.py + + $ python -m pytest pyarrow/tests/test_compute.py ======================== test session starts =========================== platform darwin -- Python 3.9.7, pytest-6.2.5, py-1.10.0, pluggy-1.0.0 rootdir: /Users/alenkafrim/repos/arrow/python, configfile: setup.cfg plugins: hypothesis-6.24.1, lazy-fixture-0.6.3 - collected 204 items + collected 204 items pyarrow/tests/test_compute.py ................................... [ 46%] ................................................. [100%] @@ -339,7 +339,7 @@ utility called `Archery ` to check if code is in line with PEP 8 style guide. .. code:: console - + $ archery lint --python --fix INFO:archery:Running Python formatter (autopep8) INFO:archery:Running Python linter (flake8) @@ -430,7 +430,7 @@ to the branch history): $ git commit -am "Adding a new compute feature for tutorial purposes" [ARROW-14977 170ef85be] Adding a new compute feature for tutorial purposes 2 files changed, 51 insertions(+) - + We can use ``git log`` to check the history of commits: @@ -448,12 +448,12 @@ We can use ``git log`` to check the history of commits: Date: Sun Dec 5 15:19:46 2021 +0900 ARROW-14981: [CI][Docs] Upload built documents - + We can use this in release process instead of building on release manager's local environment. - + Closes #11856 from kou/ci-docs-upload - + Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei ... @@ -478,10 +478,10 @@ called ``origin``. Writing objects: 100% (7/7), 1.19 KiB | 1.19 MiB/s, done. Total 7 (delta 6), reused 0 (delta 0), pack-reused 0 remote: Resolving deltas: 100% (6/6), completed with 6 local objects. - remote: + remote: remote: Create a pull request for 'ARROW-14977' on GitHub by visiting: remote: https://github.com/AlenkaF/arrow/pull/new/ARROW-14977 - remote: + remote: To https://github.com/AlenkaF/arrow.git * [new branch] ARROW-14977 -> ARROW-14977 @@ -490,7 +490,7 @@ to create a Pull Request. On the GitHub Arrow page (main or forked) we will see a yellow notice bar with a note that we made recent pushes to the branch ARROW-14977. That’s great, now we can make the Pull Request -by clicking on **Compare & pull request**. +by clicking on **Compare & pull request**. .. figure:: ../../images/python_tutorial_github_pr_notice.jpeg :scale: 50 % @@ -527,5 +527,5 @@ the code, comment, resolve conversations and so on. The Pull Request we made can be viewed `here `_. .. seealso:: - + For more information about Pull Request workflow see :ref:`pr_lifecycle`. 
diff --git a/docs/source/developers/java/building.rst b/docs/source/developers/java/building.rst index c059ff676efb2..82053e901186c 100644 --- a/docs/source/developers/java/building.rst +++ b/docs/source/developers/java/building.rst @@ -350,7 +350,7 @@ Arrow repository, and update the following settings: * To enable debugging JNI-based modules like ``dataset``, activate specific profiles in the Maven tab under "Profiles". Ensure the profiles ``arrow-c-data``, ``arrow-jni``, ``generate-libs-cdata-all-os``, - ``generate-libs-jni-macos-linux``, and ``jdk11+`` are enabled, so that the + ``generate-libs-jni-macos-linux``, and ``jdk11+`` are enabled, so that the IDE can build them and enable debugging. You may not need to update all of these settings if you build/test with the diff --git a/docs/source/developers/overview.rst b/docs/source/developers/overview.rst index c7bc4273313bc..5a18b1e4eb8db 100644 --- a/docs/source/developers/overview.rst +++ b/docs/source/developers/overview.rst @@ -75,7 +75,7 @@ checklist for using ``git``: locally, for example if additional commits have been made by a colleague. By using ``--force-with-lease`` instead of ``--force``, you ensure those commits are not overwritten and can fetch those changes if desired. - + .. dropdown:: Setting rebase to be default :animate: fade-in-slide-down :class-container: sd-shadow-none @@ -202,4 +202,3 @@ Implementations that do not intend to implement cross endian support: For other libraries, a discussion to gather consensus on the mailing-list should be had before submitting PRs. - diff --git a/docs/source/developers/release.rst b/docs/source/developers/release.rst index e7431ce0fb7b9..0b3a83dc5aabe 100644 --- a/docs/source/developers/release.rst +++ b/docs/source/developers/release.rst @@ -80,10 +80,10 @@ Ensure local tags are removed, gpg-agent is set and JIRA tickets are correctly a # Delete the local tag for RC1 or later git tag -d apache-arrow- - + # Setup gpg agent for signing artifacts source dev/release/setup-gpg-agent.sh - + # Curate the release # The end of the generated report shows the JIRA tickets with wrong version number assigned. archery release curate @@ -180,7 +180,7 @@ Create the Release Candidate branch from the updated maintenance branch # Start from the updated maintenance branch. 
git checkout maint-X.Y.Z
-
+
# The following script will create a branch for the Release Candidate,
# place the necessary commits updating the version number and then create a git tag
# on OSX use gnu-sed with homebrew: brew install gnu-sed (and export to $PATH)
@@ -188,7 +188,7 @@ Create the Release Candidate branch from the updated maintenance branch
# starts at 0 and increments every time the Release Candidate is burned
# so for the first RC this would be: dev/release/01-prepare.sh 4.0.0 5.0.0 0
dev/release/01-prepare.sh
-
+
# Push the release tag (for RC1 or later the --force flag is required)
git push -u apache apache-arrow-
# Push the release candidate branch in order to trigger verification jobs later
@@ -201,23 +201,23 @@ Build source and binaries and submit them
# Build the source release tarball and create Pull Request with verification tasks
dev/release/02-source.sh
-
+
# Submit binary tasks using crossbow, the command will output the crossbow build id
dev/release/03-binary-submit.sh
-
+
# Wait for the crossbow jobs to finish
archery crossbow status
-
+
# Download the produced binaries
# This will download packages to a directory called packages/release--rc
dev/release/04-binary-download.sh
-
+
# Sign and upload the binaries
#
# On macOS the only way I could get this to work was running "echo "UPDATESTARTUPTTY" | gpg-connect-agent" before running this command
# otherwise I got errors referencing "ioctl" errors.
dev/release/05-binary-upload.sh
-
+
# Sign and upload the Java artifacts
#
# Note that you need to press the "Close" button manually via the Web interface
diff --git a/docs/source/developers/release_verification.rst b/docs/source/developers/release_verification.rst
index 8c301b44a3c42..afd220db6010d 100644
--- a/docs/source/developers/release_verification.rst
+++ b/docs/source/developers/release_verification.rst
@@ -55,7 +55,7 @@ and test the result on their own platform in order to cast a +1 vote.
# this will create and automatically clean up a temporary directory for the verification environment and will run the source verification
TEST_DEFAULT=0 TEST_SOURCE=1 verify-release-candidate.sh $VERSION $RC_NUM
-
+
# to verify only certain implementations use the TEST_DEFAULT=0 and TEST_* variables
# here are a couple of examples, but see the source code for the available options
TEST_DEFAULT=0 TEST_CPP=1 verify-release-candidate.sh $VERSION $RC_NUM # only C++ tests
diff --git a/docs/source/developers/reviewing.rst b/docs/source/developers/reviewing.rst
index b6e0c1f4023bd..1550d6aa7ce61 100644
--- a/docs/source/developers/reviewing.rst
+++ b/docs/source/developers/reviewing.rst
@@ -260,14 +260,14 @@ Social aspects
Labelling
=========
-While reviewing PRs, we should try to identify whether the corresponding issue
+While reviewing PRs, we should try to identify whether the corresponding issue
needs to be marked with one or both of the following issue labels:
* **Critical Fix**: The change fixes either: (a) a security vulnerability;
  (b) a bug that causes incorrect or invalid data to be produced; or (c) a bug
  that causes a crash (while the API contract is upheld). This is intended to
  mark fixes to issues that may affect users without their
- knowledge. For this reason, fixing bugs that cause errors doesn't count, since
+ knowledge. For this reason, fixing bugs that cause errors doesn't count, since
  those bugs are usually obvious. Bugs that cause crashes are considered
  critical because they are a possible vector of Denial-of-Service attacks.
* **Breaking Change**: The change breaks backwards compatibility in a public API. @@ -275,7 +275,7 @@ needs to be marked with one or both of the following issue labels: compatibility, except for the few places where we do guarantee ABI compatibility (such as C Data Interface). Experimental APIs are *not* exempt from this; they are just more likely to be associated with this tag. - + Breaking changes and critical fixes are separate: breaking changes alter the API contract, while critical fixes make the implementation align with the existing API contract. For example, fixing a bug that caused a Parquet reader diff --git a/docs/source/format/CDataInterface/PyCapsuleInterface.rst b/docs/source/format/CDataInterface/PyCapsuleInterface.rst index 03095aa2e9356..67f77f53f012b 100644 --- a/docs/source/format/CDataInterface/PyCapsuleInterface.rst +++ b/docs/source/format/CDataInterface/PyCapsuleInterface.rst @@ -64,7 +64,7 @@ structures should be wrapped in capsules. Capsules avoid invalid access by attaching a name to the pointer and avoid memory leaks by attaching a destructor. Thus, they are much safer than passing pointers as integers. -`PyCapsule`_ allows for a ``name`` to be associated with the capsule, allowing +`PyCapsule`_ allows for a ``name`` to be associated with the capsule, allowing consumers to verify that the capsule contains the expected kind of data. To make sure Arrow structures are recognized, the following names must be used: @@ -133,8 +133,8 @@ Arrays and record batches (contiguous tables) can implement the method Export the object as a pair of ArrowSchema and ArrowArray structures. - :param requested_schema: A PyCapsule containing a C ArrowSchema representation - of a requested schema. Conversion to this schema is best-effort. See + :param requested_schema: A PyCapsule containing a C ArrowSchema representation + of a requested schema. Conversion to this schema is best-effort. See `Schema Requests`_. :type requested_schema: PyCapsule or None @@ -152,8 +152,8 @@ Tables / DataFrames and streams can implement the method ``__arrow_c_stream__``. Export the object as an ArrowArrayStream. - :param requested_schema: A PyCapsule containing a C ArrowSchema representation - of a requested schema. Conversion to this schema is best-effort. See + :param requested_schema: A PyCapsule containing a C ArrowSchema representation + of a requested schema. Conversion to this schema is best-effort. See `Schema Requests`_. :type requested_schema: PyCapsule or None @@ -192,7 +192,7 @@ schema transformations. Protocol Typehints ------------------ -The following typehints can be copied into your library to annotate that a +The following typehints can be copied into your library to annotate that a function accepts an object implementing one of these protocols. .. code-block:: python @@ -248,7 +248,7 @@ Below is the code to create a PyCapsule for an ``ArrowSchema``. The code for } free(schema); } - + PyObject* ExportArrowSchemaPyCapsule() { struct ArrowSchema* schema = (struct ArrowSchema*)malloc(sizeof(struct ArrowSchema)); @@ -270,9 +270,9 @@ Below is the code to create a PyCapsule for an ``ArrowSchema``. The code for ) if schema.release != NULL: schema.release(schema) - + free(schema) - + cdef object export_arrow_schema_py_capsule(): cdef ArrowSchema* schema = malloc(sizeof(ArrowSchema)) # It's recommended to immediately wrap the struct in a capsule, so @@ -305,7 +305,7 @@ code for ``ArrowArray`` and ``ArrowArrayStream`` is similar. .. 
code-block:: c #include - + // If the capsule is not an ArrowSchema, will return NULL and set an exception. struct ArrowSchema* GetArrowSchemaPyCapsule(PyObject* capsule) { return PyCapsule_GetPointer(capsule, "arrow_schema"); @@ -316,7 +316,7 @@ code for ``ArrowArray`` and ``ArrowArrayStream`` is similar. .. code-block:: cython cimport cpython - + cdef ArrowSchema* get_arrow_schema_py_capsule(object capsule) except NULL: return cpython.PyCapsule_GetPointer(capsule, 'arrow_schema') @@ -429,7 +429,7 @@ implementing the DataFrame Interchange Protocol. Comparison to ``__arrow_array__`` protocol ------------------------------------------ -The :ref:`arrow_array_protocol` protocol is a dunder method that +The :ref:`arrow_array_protocol` protocol is a dunder method that defines how PyArrow should import an object as an Arrow array. Unlike this protocol, it is specific to PyArrow and isn't used by other libraries. It is -also limited to arrays and does not support schemas, tabular structures, or streams. \ No newline at end of file +also limited to arrays and does not support schemas, tabular structures, or streams. diff --git a/docs/source/format/Glossary.rst b/docs/source/format/Glossary.rst index 3f2f118a95d6d..11c19c5fa70e9 100644 --- a/docs/source/format/Glossary.rst +++ b/docs/source/format/Glossary.rst @@ -211,7 +211,7 @@ Glossary its bindings, and Go). .. image:: ../cpp/tables-versus-record-batches.svg - :alt: A graphical representation of an Arrow Table and a + :alt: A graphical representation of an Arrow Table and a Record Batch, with structure as described in text above. .. seealso:: :term:`chunked array`, :term:`record batch` diff --git a/docs/source/format/Integration.rst b/docs/source/format/Integration.rst index 1a9b1b97f07ee..c800255687796 100644 --- a/docs/source/format/Integration.rst +++ b/docs/source/format/Integration.rst @@ -501,14 +501,14 @@ integration testing actually tests. There are two types of integration test cases: the ones populated on the fly by the data generator in the Archery utility, and *gold* files that exist -in the `arrow-testing ` +in the `arrow-testing ` repository. Data Generator Tests ~~~~~~~~~~~~~~~~~~~~ This is the high-level description of the cases which are generated and -tested using the ``archery integration`` command (see ``get_generated_json_files`` +tested using the ``archery integration`` command (see ``get_generated_json_files`` in ``datagen.py``): * Primitive Types @@ -549,7 +549,7 @@ Gold File Integration Tests Pre-generated json and arrow IPC files (both file and stream format) exist in the `arrow-testing `__ repository in the ``data/arrow-ipc-stream/integration`` directory. These serve as -*gold* files that are assumed to be correct for use in testing. They are +*gold* files that are assumed to be correct for use in testing. They are referenced by ``runner.py`` in the code for the :ref:`Archery ` utility. Below are the test cases which are covered by them: @@ -563,7 +563,7 @@ utility. Below are the test cases which are covered by them: + intervals + maps + nested types (list, struct) - + primitives + + primitives + primitive with no batches + primitive with zero length batches diff --git a/docs/source/java/algorithm.rst b/docs/source/java/algorithm.rst index 316fd38fa0990..06ed32bd48cf7 100644 --- a/docs/source/java/algorithm.rst +++ b/docs/source/java/algorithm.rst @@ -20,12 +20,12 @@ Java Algorithms Arrow's Java library provides algorithms for some commonly-used functionalities. 
The algorithms are provided in the ``org.apache.arrow.algorithm``
-package of the ``algorithm`` module.
+package of the ``algorithm`` module.
Comparing Vector Elements
-------------------------
-Comparing vector elements is the basis for many algorithms. Vector
+Comparing vector elements is the basis for many algorithms. Vector
elements can be compared in one of two ways:
1. **Equality comparison**: there are two possible results for this type of comparison: ``equal`` and ``unequal``.
@@ -36,30 +36,30 @@ interface.
and ``greater than``. This comparison is supported by the abstract class
``org.apache.arrow.algorithm.sort.VectorValueComparator``.
We provide default implementations to compare vector elements. However, users can also define ways
-for customized comparisons.
+for customized comparisons.
Vector Element Search
---------------------
-A search algorithm tries to find a particular value in a vector. When successful, a vector index is
+A search algorithm tries to find a particular value in a vector. When successful, a vector index is
returned; otherwise, a ``-1`` is returned. The following search algorithms are provided:
-1. **Linear search**: this algorithm simply traverses the vector from the beginning, until a match is
+1. **Linear search**: this algorithm simply traverses the vector from the beginning, until a match is
found, or the end of the vector is reached. So it takes ``O(n)`` time, where ``n`` is the number of elements in the vector. This algorithm is implemented in ``org.apache.arrow.algorithm.search.VectorSearcher#linearSearch``.
-2. **Binary search**: this represents a more efficient search algorithm, as it runs in ``O(log(n))`` time.
+2. **Binary search**: this represents a more efficient search algorithm, as it runs in ``O(log(n))`` time.
However, it is only applicable to sorted vectors. To get a sorted vector, one can use one of our sorting algorithms, which will be discussed in the next section. This algorithm is implemented in ``org.apache.arrow.algorithm.search.VectorSearcher#binarySearch``.
3. **Parallel search**: when the vector is large, it takes a long time to traverse the elements to search
-for a value. To make this process faster, one can split the vector into multiple partitions, and perform the
+for a value. To make this process faster, one can split the vector into multiple partitions, and perform the
search for each partition in parallel. This is supported by ``org.apache.arrow.algorithm.search.ParallelSearcher``.
-4. **Range search**: for many scenarios, there can be multiple matching values in the vector.
+4. **Range search**: for many scenarios, there can be multiple matching values in the vector.
If the vector is sorted, the matching values reside in a contiguous region in the vector. The
-range search algorithm tries to find the upper/lower bound of the region in ``O(log(n))`` time.
+range search algorithm tries to find the upper/lower bound of the region in ``O(log(n))`` time.
An implementation is provided in ``org.apache.arrow.algorithm.search.VectorRangeSearcher``.
Vector Sorting
@@ -72,19 +72,19 @@ classified into the following categories:
1. **In-place sorter**: an in-place sorter performs the sorting by manipulating the original vector,
without creating any new vector. So it just returns the original vector after the sorting operations.
Currently, we have ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter`` for in-place
+sorting in ``O(nlog(n))`` time. As the name suggests, it only supports fixed width vectors.
2. **Out-of-place sorter**: an out-of-place sorter does not mutate the original vector. Instead,
it copies vector elements to a new vector in sorted order, and returns the new vector.
-We have ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter.FixedWidthOutOfPlaceVectorSorter``
+We have ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter.FixedWidthOutOfPlaceVectorSorter``
and ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter.VariableWidthOutOfPlaceVectorSorter``
-for fixed width and variable width vectors, respectively. Both algorithms run in ``O(nlog(n))`` time.
+for fixed width and variable width vectors, respectively. Both algorithms run in ``O(nlog(n))`` time.
3. **Index sorter**: this sorter does not actually sort the vector. Instead, it returns an integer vector, whose values correspond to the indices of vector elements in sorted order. With the index vector, one can easily construct a sorted vector. In addition, some other tasks can be easily achieved, like finding the ``k``th
-smallest value in the vector. Index sorting is supported by ``org.apache.arrow.algorithm.sort.IndexSorter``,
-which runs in ``O(nlog(n))`` time. It is applicable to vectors of any type.
+smallest value in the vector. Index sorting is supported by ``org.apache.arrow.algorithm.sort.IndexSorter``,
+which runs in ``O(nlog(n))`` time. It is applicable to vectors of any type.
Other Algorithms
----------------
diff --git a/docs/source/java/flight.rst b/docs/source/java/flight.rst
index e009998be4f4e..6d26583aeefa6 100644
--- a/docs/source/java/flight.rst
+++ b/docs/source/java/flight.rst
@@ -184,7 +184,7 @@ Handshake-based authentication can be enabled by implementing
``ServerAuthHandler``. Authentication consists of two parts: on initial client connection, the server and client authentication implementations can perform any negotiation needed. The client authentication
-handler then provides a token that will be attached to future calls.
+handler then provides a token that will be attached to future calls.
The client sends data to be validated through ``ClientAuthHandler.authenticate``.
The server validates data received through ``ServerAuthHandler.authenticate``.
diff --git a/docs/source/java/flight_sql_jdbc_driver.rst b/docs/source/java/flight_sql_jdbc_driver.rst
index 0ace2185983a9..cc8822247b007 100644
--- a/docs/source/java/flight_sql_jdbc_driver.rst
+++ b/docs/source/java/flight_sql_jdbc_driver.rst
@@ -169,8 +169,8 @@ when using the JDBC Driver Manager to connect.
When using the Properties object, values should *not* be URI-encoded.
Parameters specified by the URI supersede parameters supplied by the
-Properties object. When calling the `user/password overload of
-DriverManager#getConnection()
+Properties object. When calling the `user/password overload of
+DriverManager#getConnection()
`_, the username and password supplied on the URI supersede the username and password arguments to the function call.
diff --git a/docs/source/java/memory.rst b/docs/source/java/memory.rst
index 036befa148692..8014a27444ac9 100644
--- a/docs/source/java/memory.rst
+++ b/docs/source/java/memory.rst
@@ -20,7 +20,7 @@ Memory Management
=================
The memory modules contain all the functionality that Arrow uses to allocate and deallocate memory. This document is divided into two parts:
-The first part, *Memory Basics*, provides a high-level introduction.
The following section, *Arrow Memory In-Depth*, fills in the details.
+The first part, *Memory Basics*, provides a high-level introduction. The following section, *Arrow Memory In-Depth*, fills in the details.
.. contents::
@@ -39,7 +39,7 @@ Getting Started
Arrow's memory management is built around the needs of the columnar format and using off-heap memory.
Arrow Java has its own independent implementation. It does not wrap the C++ implementation, although the framework is flexible enough
-to be used with memory allocated in C++ that is used by Java code.
+to be used with memory allocated in C++ that is used by Java code.
Arrow provides multiple modules: the core interfaces, and implementations of the interfaces.
Users need the core interfaces, and exactly one of the implementations.
@@ -67,9 +67,9 @@ Why Arrow Uses Direct Memory
BufferAllocator
---------------
-The `BufferAllocator`_ is primarily an arena or nursery used for accounting of buffers (ArrowBuf instances).
-As the name suggests, it can allocate new buffers associated with itself, but it can also
-handle the accounting for buffers allocated elsewhere. For example, it handles the Java-side accounting for
+The `BufferAllocator`_ is primarily an arena or nursery used for accounting of buffers (ArrowBuf instances).
+As the name suggests, it can allocate new buffers associated with itself, but it can also
+handle the accounting for buffers allocated elsewhere. For example, it handles the Java-side accounting for
memory allocated in C++ and shared with Java using the C-Data Interface. In the code below it performs an allocation:
.. code-block:: Java
@@ -100,21 +100,21 @@ memory from a child allocator, those allocations are also reflected in all paren
effectively sets the program-wide memory limit, and serves as the master bookkeeper for all memory allocations.
Child allocators are not strictly required, but can help better organize code. For instance, a lower memory limit can
-be set for a particular section of code. The child allocator can be closed when that section completes,
-at which point it checks that that section didn't leak any memory.
+be set for a particular section of code. The child allocator can be closed when that section completes,
+at which point it checks that that section didn't leak any memory.
Child allocators can also be named, which makes it easier to tell where an ArrowBuf came from during debugging.
Reference counting
------------------
-Because direct memory is expensive to allocate and deallocate, allocators may share direct buffers. To manage shared buffers
-deterministically, we use manual reference counting instead of the garbage collector.
+Because direct memory is expensive to allocate and deallocate, allocators may share direct buffers. To manage shared buffers
+deterministically, we use manual reference counting instead of the garbage collector.
This simply means that each buffer has a counter keeping track of the number of references to the buffer, and the user is responsible for properly incrementing/decrementing the counter as the buffer is used.
In Arrow, each ArrowBuf has an associated `ReferenceManager`_ that tracks the reference count. You can retrieve
-it with ArrowBuf.getReferenceManager().
The reference count is updated using `ReferenceManager.release`_ to decrement the count, +and `ReferenceManager.retain`_ to increment it. Of course, this is tedious and error-prone, so instead of directly working with buffers, we typically use higher-level APIs like ValueVector. Such classes generally implement Closeable/AutoCloseable and will automatically @@ -289,7 +289,7 @@ Finally, enabling the ``TRACE`` logging level will automatically provide this st | at (#8:1) Sometimes, explicitly passing allocators around is difficult. For example, it -can be hard to pass around extra state, like an allocator, through layers of +can be hard to pass around extra state, like an allocator, through layers of existing application or framework code. A global or singleton allocator instance can be useful here, though it should not be your first choice. @@ -370,7 +370,7 @@ Arrow’s memory model is based on the following basic concepts: leaks. - The same physical memory can be shared by multiple allocators and the allocator must provide an accounting paradigm for this purpose. - + Reserving Memory ---------------- @@ -384,17 +384,17 @@ Arrow provides two different ways to reserve memory: - ``AllocationReservation`` via BufferAllocator.newReservation(): Allows a short-term preallocation strategy so that a particular subsystem can ensure future memory is available to support a - particular request. - + particular request. + Reference Counting Details -------------------------- -Typically, the ReferenceManager implementation used is an instance of `BufferLedger`_. -A BufferLedger is a ReferenceManager that also maintains the relationship between an ``AllocationManager``, +Typically, the ReferenceManager implementation used is an instance of `BufferLedger`_. +A BufferLedger is a ReferenceManager that also maintains the relationship between an ``AllocationManager``, a ``BufferAllocator`` and one or more individual ``ArrowBuf``\ s -All ArrowBufs (direct or sliced) related to a single BufferLedger/BufferAllocator combination -share the same reference count and either all will be valid or all will be invalid. +All ArrowBufs (direct or sliced) related to a single BufferLedger/BufferAllocator combination +share the same reference count and either all will be valid or all will be invalid. For simplicity of accounting, we treat that memory as being used by one of the BufferAllocators associated with the memory. When that allocator releases its claim on that memory, the memory ownership is then moved to @@ -411,7 +411,7 @@ There are several Allocator types in Arrow Java: - ``ChildAllocator`` - A child allocator that derives from the root allocator Many BufferAllocators can reference the same piece of physical memory at the same -time. It is the AllocationManager’s responsibility to ensure that in this situation, +time. It is the AllocationManager’s responsibility to ensure that in this situation, all memory is accurately accounted for from the Root’s perspective and also to ensure that the memory is correctly released once all BufferAllocators have stopped using that memory. diff --git a/docs/source/java/quickstartguide.rst b/docs/source/java/quickstartguide.rst index e358681c57830..a71ddc5b5e55f 100644 --- a/docs/source/java/quickstartguide.rst +++ b/docs/source/java/quickstartguide.rst @@ -313,4 +313,4 @@ Example: Read the dataset from the previous example from an Arrow IPC file (rand More examples available at `Arrow Java Cookbook`_. -.. 
_`Arrow Java Cookbook`: https://arrow.apache.org/cookbook/java \ No newline at end of file +.. _`Arrow Java Cookbook`: https://arrow.apache.org/cookbook/java diff --git a/docs/source/java/vector.rst b/docs/source/java/vector.rst index abbbd1a236d6d..1c3e123cf50fb 100644 --- a/docs/source/java/vector.rst +++ b/docs/source/java/vector.rst @@ -226,7 +226,7 @@ A :class:`ListVector` is a vector that holds a list of values for each index. Wo For example, the code below shows how to build a :class:`ListVector` of int's using the writer :class:`UnionListWriter`. We build a vector from 0 to 9 and each index contains a list with values [[0, 0, 0, 0, 0], [0, 1, 2, 3, 4], [0, 2, 4, 6, 8], …, [0, 9, 18, 27, 36]]. List values can be added in any order so writing a list such as [3, 1, 2] would be just as valid. .. code-block:: Java - + try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); ListVector listVector = ListVector.empty("vector", allocator)) { UnionListWriter writer = listVector.getWriter(); @@ -240,7 +240,7 @@ For example, the code below shows how to build a :class:`ListVector` of int's us writer.endList(); } listVector.setValueCount(10); - } + } :class:`ListVector` values can be accessed either through the get API or through the reader class :class:`UnionListReader`. To read all the values, first enumerate through the indexes, and then enumerate through the inner list values. diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 928c607d139ce..ae48578a1bd61 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -52,10 +52,10 @@ Aggregations Cumulative Functions -------------------- -Cumulative functions are vector functions that perform a running accumulation on -their input using a given binary associative operation with an identity element -(a monoid) and output an array containing the corresponding intermediate running -values. The input is expected to be of numeric type. By default these functions +Cumulative functions are vector functions that perform a running accumulation on +their input using a given binary associative operation with an identity element +(a monoid) and output an array containing the corresponding intermediate running +values. The input is expected to be of numeric type. By default these functions do not detect overflow. They are also available in an overflow-checking variant, suffixed ``_checked``, which throws an ``ArrowInvalid`` exception when overflow is detected. diff --git a/docs/source/python/api/substrait.rst b/docs/source/python/api/substrait.rst index 66e88fcd279ae..1556be9dbd011 100644 --- a/docs/source/python/api/substrait.rst +++ b/docs/source/python/api/substrait.rst @@ -50,4 +50,4 @@ Utility .. autosummary:: :toctree: ../generated/ - get_supported_functions \ No newline at end of file + get_supported_functions diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index c02059a4f8faa..ce3dfabb0e689 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -23,7 +23,7 @@ Compute Functions ================= Arrow supports logical compute operations over inputs of possibly -varying types. +varying types. The standard compute operations are provided by the :mod:`pyarrow.compute` module and can be used directly:: @@ -91,7 +91,7 @@ Grouped Aggregations ==================== PyArrow supports grouped aggregations over :class:`pyarrow.Table` through the -:meth:`pyarrow.Table.group_by` method. +:meth:`pyarrow.Table.group_by` method. 
The method will return a grouping declaration
to which the hash aggregation functions can be applied::
@@ -300,7 +300,7 @@ Filtering by Expressions
:class:`.Table` and :class:`.Dataset` can both be filtered using a boolean :class:`.Expression`.
-The expression can be built starting from a
+The expression can be built starting from a
:func:`pyarrow.compute.field`. Comparisons and transformations can then be applied to one or more fields to build the filter expression you care about.
@@ -325,7 +325,7 @@ in column ``"nums"`` by the ``bit_wise_and`` operation equals ``0``. Only the numbers
where the last bit was ``0`` will return a ``0`` as the result of ``num & 1``, and as all numbers where the last bit is ``0`` are multiples of ``2``, we will be filtering for the even numbers only.
-
+
Once we have our filter, we can provide it to the :meth:`.Table.filter` method to filter our table only for the matching rows:
@@ -392,7 +392,7 @@ User-Defined Functions
PyArrow allows defining and registering custom compute functions. These functions can then be called from Python as well as C++ (and potentially any other implementation wrapping Arrow C++, such as the R ``arrow`` package)
-using their registered function name.
+using their registered function name.
UDF support is limited to scalar functions. A scalar function is a function which executes elementwise operations on arrays or scalars. In general, the output of a
@@ -441,7 +441,7 @@ output type need to be defined. Using :func:`pyarrow.compute.register_scalar_fun
function_docs,
input_types,
output_type)
-
+
The implementation of a user-defined function always takes a first *context* parameter (named ``ctx`` in the example above) which is an instance of
@@ -497,9 +497,9 @@ the GCD of one column with the scalar value 30. We will be re-using the
category: [["A","B","C","D"]]
Note that ``ds.field('')._call(...)`` returns a :func:`pyarrow.compute.Expression`.
-The arguments passed to this function call are expressions, not scalar values
+The arguments passed to this function call are expressions, not scalar values
(notice the difference between :func:`pyarrow.scalar` and :func:`pyarrow.compute.scalar`,
-the latter produces an expression).
+the latter produces an expression).
This expression is evaluated when the projection operator executes it.
Projection Expressions
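To make the UDF discussion above concrete, here is a compact sketch of the registration
flow; the function name ``example_add_one`` and the integer types are assumptions made
for illustration and are not part of the documents being patched:

.. code-block:: python

   import pyarrow as pa
   import pyarrow.compute as pc

   def add_one(ctx, x):
       # "ctx" is the required first context parameter; it carries the memory
       # pool that the kernel should allocate from.
       return pc.add(x, 1, memory_pool=ctx.memory_pool)

   pc.register_scalar_function(
       add_one,
       "example_add_one",
       {"summary": "Add one", "description": "Adds 1 to every element."},
       {"x": pa.int64()},
       pa.int64(),
   )

   # Once registered, the UDF is callable by name:
   print(pc.call_function("example_add_one", [pa.array([1, 2, 3])]))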
diff --git a/docs/source/python/dataset.rst b/docs/source/python/dataset.rst
index daab36f9a7be9..00469fd57becf 100644
--- a/docs/source/python/dataset.rst
+++ b/docs/source/python/dataset.rst
@@ -575,28 +575,28 @@ Partitioning performance considerations
Partitioning datasets has two aspects that affect performance: it increases the number of files and it creates a directory structure around the files. Both of these have benefits
-as well as costs. Depending on the configuration and the size of your dataset, the costs
-can outweigh the benefits.
+as well as costs. Depending on the configuration and the size of your dataset, the costs
+can outweigh the benefits.
-Because partitions split up the dataset into multiple files, partitioned datasets can be
-read and written with parallelism. However, each additional file adds a little overhead in
-processing for filesystem interaction.
It also increases the overall dataset size since each file has some shared metadata. For example, each parquet file contains the schema and
-group-level statistics. The number of partitions is a floor for the number of files. If
-you partition a dataset by date with a year of data, you will have at least 365 files. If
-you further partition by another dimension with 1,000 unique values, you will have up to
+group-level statistics. The number of partitions is a floor for the number of files. If
+you partition a dataset by date with a year of data, you will have at least 365 files. If
+you further partition by another dimension with 1,000 unique values, you will have up to
365,000 files. Partitioning this finely often leads to small files that mostly consist of metadata.
-Partitioned datasets create nested folder structures, and those allow us to prune which
+Partitioned datasets create nested folder structures, and those allow us to prune which
files are loaded in a scan. However, this adds overhead to discovering files in the dataset, as we'll need to recursively "list directory" to find the data files. Partitions that are too fine can cause problems here: Partitioning a dataset by date for a year's worth
-of data will require 365 list calls to find all the files; adding another column with
+of data will require 365 list calls to find all the files; adding another column with
cardinality 1,000 will make that 365,365 calls.
The optimal partitioning layout will depend on your data, access patterns, and which
-systems will be reading the data. Most systems, including Arrow, should work across a
+systems will be reading the data. Most systems, including Arrow, should work across a
range of file sizes and partitioning layouts, but there are extremes you should avoid. These guidelines can help avoid some known worst cases:
@@ -611,35 +611,35 @@ of file size. Arrow's file writer provides sensible defaults for group sizing in
Configuring files open during a write
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-When writing data to the disk, there are a few parameters that can be
+When writing data to the disk, there are a few parameters that can be
important to optimize the writes, such as the number of rows per file and the maximum number of open files allowed during the write.
Set the maximum number of files opened with the ``max_open_files`` parameter of :meth:`write_dataset`.
-If ``max_open_files`` is set greater than 0 then this will limit the maximum
+If ``max_open_files`` is set greater than 0 then this will limit the maximum
number of files that can be left open. This only applies to writing partitioned datasets, where rows are dispatched to the appropriate file depending on their partition values. If an attempt is made to open too many files then the least recently used file will be closed. If this setting is set too low, you may end up fragmenting your data into many small files.
If your process is concurrently using other file handlers, either with a
-dataset scanner or otherwise, you may hit a system file handler limit. For
+dataset scanner or otherwise, you may hit a system file handler limit. For
example, if you are scanning a dataset with 300 files and writing out to 900 files, the total of 1200 files may be over a system limit. (On Linux, this might be a "Too Many Open Files" error.) You can either reduce this ``max_open_files`` setting or increase the file handler limit on your system.
The default value is 900, which allows some number of files
-to be open by the scanner before hitting the default Linux limit of 1024.
+to be open by the scanner before hitting the default Linux limit of 1024.
-Another important configuration used in :meth:`write_dataset` is ``max_rows_per_file``.
+Another important configuration used in :meth:`write_dataset` is ``max_rows_per_file``.
Set the maximum number of rows written in each file with the ``max_rows_per_file`` parameter of :meth:`write_dataset`.
-If ``max_rows_per_file`` is set greater than 0 then this will limit how many
+If ``max_rows_per_file`` is set greater than 0 then this will limit how many
rows are placed in any single file. Otherwise there will be no limit and one file will be created in each output directory unless files need to be closed to respect ``max_open_files``. This setting is the primary way to control file size.
@@ -653,22 +653,22 @@ Configuring rows per group during a write
The volume of data written to the disk per group can be configured. This configuration includes a lower and an upper bound.
-The minimum number of rows required to form a row group is
+The minimum number of rows required to form a row group is
defined with the ``min_rows_per_group`` parameter of :meth:`write_dataset`.
.. note::
- If ``min_rows_per_group`` is set greater than 0 then this will cause the
- dataset writer to batch incoming data and only write the row groups to the
- disk when sufficient rows have accumulated. The final row group size may be
- less than this value if other options such as ``max_open_files`` or
+ If ``min_rows_per_group`` is set greater than 0 then this will cause the
+ dataset writer to batch incoming data and only write the row groups to the
+ disk when sufficient rows have accumulated. The final row group size may be
+ less than this value if other options such as ``max_open_files`` or
``max_rows_per_file`` force smaller row group sizes.
The maximum number of rows allowed per group is defined with the ``max_rows_per_group`` parameter of :meth:`write_dataset`.
-If ``max_rows_per_group`` is set greater than 0 then the dataset writer may split
-up large incoming batches into multiple row groups. If this value is set then
-``min_rows_per_group`` should also be set or else you may end up with very small
+If ``max_rows_per_group`` is set greater than 0 then the dataset writer may split
+up large incoming batches into multiple row groups. If this value is set then
+``min_rows_per_group`` should also be set or else you may end up with very small
row groups (e.g. if the incoming row group size is just barely larger than this value). Row groups are built into the Parquet and IPC/Feather formats but don't affect JSON or CSV.
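Putting these write options together, a hedged sketch (the output path, column names, and
limits below are hypothetical and should be tuned to your data and system):

.. code-block:: python

   import pyarrow as pa
   import pyarrow.dataset as ds

   table = pa.table({"year": [2021, 2021, 2022], "n": [1, 2, 3]})

   ds.write_dataset(
       table,
       "example_output",
       format="parquet",
       partitioning=ds.partitioning(pa.schema([("year", pa.int64())])),
       max_open_files=512,           # least-recently-used writers close beyond this
       max_rows_per_file=1_000_000,  # the primary control over file size
       min_rows_per_group=10_000,    # batch rows before writing a row group
       max_rows_per_group=100_000,   # split overly large incoming batches
   )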
@@ -719,7 +719,7 @@ Customizing & inspecting written files
By default the dataset API will create files named "part-i.format" where "i" is an integer generated during the write and "format" is the file format specified in the write_dataset call. For simple datasets it may be possible to know which files will be created but for
-larger or partitioned datasets it is not so easy. The ``file_visitor`` keyword can be used
+larger or partitioned datasets it is not so easy. The ``file_visitor`` keyword can be used
to supply a visitor that will be called as each file is created: ..
ipython:: python diff --git a/docs/source/python/dlpack.rst b/docs/source/python/dlpack.rst index f612ebabde5c9..024c2800e1107 100644 --- a/docs/source/python/dlpack.rst +++ b/docs/source/python/dlpack.rst @@ -90,4 +90,4 @@ Convert a PyArrow CPU array to PyTorch tensor: >>> import torch >>> torch.from_dlpack(array) - tensor([2, 0, 2, 4]) + tensor([2, 0, 2, 4]) diff --git a/docs/source/python/filesystems.rst b/docs/source/python/filesystems.rst index 5309250351d8e..22f983a60c349 100644 --- a/docs/source/python/filesystems.rst +++ b/docs/source/python/filesystems.rst @@ -233,7 +233,7 @@ generate a credentials file in the default location:: To connect to a public bucket without using any credentials, you must pass ``anonymous=True`` to :class:`GcsFileSystem`. Otherwise, the filesystem -will report ``Couldn't resolve host name`` since there are different host +will report ``Couldn't resolve host name`` since there are different host names for authenticated and public access. Example showing how you can read contents from a GCS bucket:: @@ -314,7 +314,7 @@ For example:: # using this to read a partitioned dataset import pyarrow.dataset as ds ds.dataset("data/", filesystem=fs) - + Similarly for Azure Blob Storage:: import adlfs diff --git a/docs/source/python/getstarted.rst b/docs/source/python/getstarted.rst index d38fcadab288f..42e415c40b835 100644 --- a/docs/source/python/getstarted.rst +++ b/docs/source/python/getstarted.rst @@ -37,7 +37,7 @@ in tabular data. Arrow also provides support for various formats to get those tabular data in and out of disk and networks. Most commonly used formats are -Parquet (:ref:`parquet`) and the IPC format (:ref:`ipc`). +Parquet (:ref:`parquet`) and the IPC format (:ref:`ipc`). Creating Arrays and Tables -------------------------- @@ -63,7 +63,7 @@ in tabular data when attached to a column name birthdays_table = pa.table([days, months, years], names=["days", "months", "years"]) - + birthdays_table See :ref:`data` for more details. @@ -75,7 +75,7 @@ Once you have tabular data, Arrow provides out of the box the features to save and restore that data for common formats like Parquet: -.. ipython:: python +.. ipython:: python import pyarrow.parquet as pq @@ -92,14 +92,14 @@ data will be as quick as possible reloaded_birthdays Saving and loading back data in arrow is usually done through -:ref:`Parquet `, :ref:`IPC format ` (:ref:`feather`), +:ref:`Parquet `, :ref:`IPC format ` (:ref:`feather`), :ref:`CSV ` or :ref:`Line-Delimited JSON ` formats. Performing Computations ----------------------- Arrow ships with a bunch of compute functions that can be applied -to its arrays and tables, so through the compute functions +to its arrays and tables, so through the compute functions it's possible to apply transformations to the data .. ipython:: python @@ -122,7 +122,7 @@ smaller chunks import pyarrow.dataset as ds - ds.write_dataset(birthdays_table, "savedir", format="parquet", + ds.write_dataset(birthdays_table, "savedir", format="parquet", partitioning=ds.partitioning( pa.schema([birthdays_table.schema.field("years")]) )) @@ -151,8 +151,8 @@ how to project them, etc., refer to :ref:`dataset` documentation. 
Continuing from here
--------------------
-For digging further into Arrow, you might want to read the
-:doc:`PyArrow Documentation <./index>` itself or the
+For digging further into Arrow, you might want to read the
+:doc:`PyArrow Documentation <./index>` itself or the
`Arrow Python Cookbook `_
diff --git a/docs/source/python/getting_involved.rst b/docs/source/python/getting_involved.rst
index 7b3bcf2ac527a..9fda3c7c78488 100644
--- a/docs/source/python/getting_involved.rst
+++ b/docs/source/python/getting_involved.rst
@@ -54,7 +54,7 @@ used as foundations to build easier to use entities.
exposed to the user are declared. In some cases, those files might directly import the entities from the inner implementation if they want to expose it as is without modification.
-* The ``lib.pyx`` file is where the majority of the core C++ libarrow
+* The ``lib.pyx`` file is where the majority of the core C++ libarrow
capabilities are exposed to Python. Most of the implementation of this module relies on included ``*.pxi`` files where the specific pieces are built. While being exposed to Python as ``pyarrow.lib`` its content
@@ -73,4 +73,4 @@ used as foundations to build easier to use entities.
PyArrow is also based on PyArrow C++, dedicated pieces of code that live in the ``python/pyarrow/src/arrow/python`` directory and provide the low level code for capabilities like converting to and from numpy or pandas and the classes
- that allow using Python objects and callbacks in C++. \ No newline at end of file
+ that allow using Python objects and callbacks in C++.
diff --git a/docs/source/python/integration/python_r.rst b/docs/source/python/integration/python_r.rst
index 20627c3782d3c..ec5dfc366fdf9 100644
--- a/docs/source/python/integration/python_r.rst
+++ b/docs/source/python/integration/python_r.rst
@@ -29,7 +29,7 @@ marshaling and unmarshaling data.
The article takes for granted that you have a ``Python`` environment with ``pyarrow`` correctly installed and an ``R`` environment with
- the ``arrow`` library correctly installed.
+ the ``arrow`` library correctly installed.
See `Python Install Instructions `_ and `R Install instructions `_ for further details.
@@ -52,7 +52,7 @@ We could save such a function in an ``addthree.R`` file so that we can make it available for reuse.
Once the ``addthree.R`` file is created we can invoke any of its functions
-from Python using the
+from Python using the
`rpy2 `_ library, which enables an R runtime within the Python interpreter.
@@ -91,12 +91,12 @@ to access the ``R`` function and print the expected result:
.. code-block:: bash
- $ python addthree.py
+ $ python addthree.py
6
If instead of passing around basic data types we want to pass around Arrow Arrays, we can do so relying on the
-`rpy2-arrow `_
+`rpy2-arrow `_
module which implements ``rpy2`` support for Arrow types.
``rpy2-arrow`` can be installed through ``pip``:
@@ -189,7 +189,7 @@ Invoking the ``addthree.R`` script will print the outcome of adding
.. code-block:: bash
- $ R --silent -f addthree.R
+ $ R --silent -f addthree.R
Array
[
@@ -219,7 +219,7 @@ necessary to import an Arrow Array in R from the C Data interface.
That work will be done by the ``addthree_cdata`` function which invokes the ``addthree`` function once the Array is imported.
-Our ``addthree.R`` will thus have both the ``addthree_cdata`` and the
+Our ``addthree.R`` will thus have both the ``addthree_cdata`` and the
``addthree`` functions: ..
code-block:: R @@ -261,7 +261,7 @@ Our ``addthree.py`` will thus become: # Import the pyarrow module that provides access to the C Data interface from pyarrow.cffi import ffi as arrow_c - # Allocate structures where we will export the Array data + # Allocate structures where we will export the Array data # and the Array schema. They will be released when we exit the with block. with arrow_c.new("struct ArrowArray*") as c_array, \ arrow_c.new("struct ArrowSchema*") as c_schema: @@ -274,7 +274,7 @@ Our ``addthree.py`` will thus become: array.type._export_to_c(c_schema_ptr) # Invoke the R addthree_cdata function passing the references - # to the array and schema C Data structures. + # to the array and schema C Data structures. # Those references are passed as strings as R doesn't have # native support for 64bit integers, so the integers are # converted to their string representation for R to convert it back. @@ -289,19 +289,19 @@ Our ``addthree.py`` will thus become: # Once the returned array is exported to a C Data infrastructure # we can import it back into pyarrow using Array._import_from_c py_array = pyarrow.Array._import_from_c(c_array_ptr, c_schema_ptr) - + print("RESULT", py_array) Running the newly changed ``addthree.py`` will now print the Array resulting -from adding ``3`` to all the elements of the original +from adding ``3`` to all the elements of the original ``pyarrow.array((1, 2, 3))`` array: .. code-block:: bash - $ python addthree.py + $ python addthree.py R[write to console]: Attaching package: ‘arrow’ RESULT [ 4, 5, 6 - ] \ No newline at end of file + ] diff --git a/docs/source/python/ipc.rst b/docs/source/python/ipc.rst index 27cd14a68853d..f55e8f8bc5dc3 100644 --- a/docs/source/python/ipc.rst +++ b/docs/source/python/ipc.rst @@ -76,12 +76,12 @@ this one can be created with :func:`~pyarrow.ipc.new_stream`: .. ipython:: python sink = pa.BufferOutputStream() - + with pa.ipc.new_stream(sink, batch.schema) as writer: for i in range(5): writer.write_batch(batch) -Here we used an in-memory Arrow buffer stream (``sink``), +Here we used an in-memory Arrow buffer stream (``sink``), but this could have been a socket or some other IO sink. When creating the ``StreamWriter``, we pass the schema, since the schema @@ -102,7 +102,7 @@ convenience function ``pyarrow.ipc.open_stream``: with pa.ipc.open_stream(buf) as reader: schema = reader.schema batches = [b for b in reader] - + schema len(batches) @@ -126,7 +126,7 @@ The :class:`~pyarrow.RecordBatchFileWriter` has the same API as .. ipython:: python sink = pa.BufferOutputStream() - + with pa.ipc.new_file(sink, batch.schema) as writer: for i in range(10): writer.write_batch(batch) @@ -164,7 +164,7 @@ DataFrame output: with pa.ipc.open_file(buf) as reader: df = reader.read_pandas() - + df[:5] Efficiently Writing and Reading Arrow Data diff --git a/docs/source/python/json.rst b/docs/source/python/json.rst index 99ecbc19a1230..eff6135d895a7 100644 --- a/docs/source/python/json.rst +++ b/docs/source/python/json.rst @@ -21,7 +21,7 @@ Reading JSON files ================== -Arrow supports reading columnar data from line-delimited JSON files. +Arrow supports reading columnar data from line-delimited JSON files. In this context, a JSON file consists of multiple JSON objects, one per line, representing individual data rows. 
For example, this file represents two rows of data with four columns "a", "b", "c", "d": diff --git a/docs/source/python/orc.rst b/docs/source/python/orc.rst index bfa68fc34d895..76c293d742010 100644 --- a/docs/source/python/orc.rst +++ b/docs/source/python/orc.rst @@ -112,7 +112,7 @@ control various settings when writing an ORC file. * ``file_version``, the ORC format version to use. ``'0.11'`` ensures compatibility with older readers, while ``'0.12'`` is the newer one. -* ``stripe_size``, to control the approximate size of data within a column +* ``stripe_size``, to control the approximate size of data within a column stripe. This currently defaults to 64MB. See the :func:`~pyarrow.orc.write_table()` docstring for more details. diff --git a/docs/source/python/parquet.rst b/docs/source/python/parquet.rst index d4717897660b6..029ed4f1a3e15 100644 --- a/docs/source/python/parquet.rst +++ b/docs/source/python/parquet.rst @@ -32,7 +32,7 @@ performance data IO. Apache Arrow is an ideal in-memory transport layer for data that is being read or written with Parquet files. We have been concurrently developing the `C++ -implementation of +implementation of Apache Parquet `_, which includes a native, multithreaded C++ adapter to and from in-memory Arrow data. PyArrow includes Python bindings to this code, which thus enables reading diff --git a/docs/source/python/timestamps.rst b/docs/source/python/timestamps.rst index 64a2a354dddef..cecbd5b595bc7 100644 --- a/docs/source/python/timestamps.rst +++ b/docs/source/python/timestamps.rst @@ -51,8 +51,8 @@ This implies a few things when round-tripping timestamps: #. Timezone information is lost (all timestamps that result from converting from spark to arrow/pandas are "time zone naive"). #. Timestamps are truncated to microseconds. -#. The session time zone might have unintuitive impacts on - translation of timestamp values. +#. The session time zone might have unintuitive impacts on + translation of timestamp values. Spark to Pandas (through Apache Arrow) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -62,8 +62,8 @@ The following cases assume the Spark configuration :: - >>> pdf = pd.DataFrame({'naive': [datetime(2019, 1, 1, 0)], - ... 'aware': [Timestamp(year=2019, month=1, day=1, + >>> pdf = pd.DataFrame({'naive': [datetime(2019, 1, 1, 0)], + ... 'aware': [Timestamp(year=2019, month=1, day=1, ... nanosecond=500, tz=timezone(timedelta(hours=-8)))]}) >>> pdf naive aware @@ -77,7 +77,7 @@ The following cases assume the Spark configuration +-------------------+-------------------+ |2019-01-01 00:00:00|2019-01-01 08:00:00| +-------------------+-------------------+ - + Note that conversion of the aware timestamp is shifted to reflect the time assuming UTC (it represents the same instant in time). For naive timestamps, Spark treats them as being in the system local @@ -129,7 +129,7 @@ session time zone is still PST: |2019-01-01 00:00:00|2019-01-01 00:00:00| +-------------------+-------------------+ - + >>> pst_df.toPandas() naive aware 0 2019-01-01 2019-01-01 @@ -141,7 +141,7 @@ session time zone is still PST: aware 1 non-null datetime64[ns] dtypes: datetime64[ns](2) memory usage: 96.0 bytes - + Notice that, in addition to being a "time zone naive" timestamp, the 'aware' value will now differ when converting to an epoch offset. 
Spark does the conversion by first converting to the session time zone (or system local time zone if
@@ -158,9 +158,9 @@ time:
>>> (pst_df.toPandas()['aware'][0].timestamp()-pdf['aware'][0].timestamp())/3600
-8.0
-The same type of conversion happens with the data frame converted while
-the session time zone was UTC. In this case both naive and aware
-represent different instants in time (the naive instant is due to
+The same type of conversion happens with the data frame converted while
+the session time zone was UTC. In this case both naive and aware
+represent different instants in time (the naive instant is due to
the change in session time zone between creating data frames):
::
@@ -179,9 +179,9 @@ the change in session time zone between creating data frames):
Note that the surprising shift for aware doesn't happen when the session time zone is UTC (but the timestamps
still become "time zone naive"):
-
+
::
-
+
>>> spark.conf.set("spark.sql.session.timeZone", "UTC")
>>> pst_df.show()
+-------------------+-------------------+
@@ -189,7 +189,7 @@ still become "time zone naive"):
+-------------------+-------------------+
|2019-01-01 08:00:00|2019-01-01 08:00:00|
+-------------------+-------------------+
-
+
>>> pst_df.toPandas()['aware'][0]
Timestamp('2019-01-01 08:00:00')
>>> pdf['aware'][0]

From 250291500b6a7d5d934901acef708cef2eb1dc08 Mon Sep 17 00:00:00 2001
From: Rossi Sun
Date: Wed, 1 May 2024 14:39:35 +0800
Subject: [PATCH 053/106] GH-41463: [C++] Skip TestConcurrentFillFromScalar for
 platforms without threading support (#41461)

### Rationale for this change

See #41463 and https://github.com/apache/arrow/pull/40237#issuecomment-2084577090

### What changes are included in this PR?

Skip the test on platforms that have no threading support.

### Are these changes tested?

The change is itself a test.

### Are there any user-facing changes?

None.

* GitHub Issue: #41463

Authored-by: Ruoxi Sun
Signed-off-by: Joris Van den Bossche
---
 cpp/src/arrow/array/array_test.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc
index af64908b59582..7e25ad61fa2ea 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -827,6 +827,9 @@ TEST_F(TestArray, TestFillFromScalar) {
 // GH-40069: Data-race when concurrently calling ArraySpan::FillFromScalar of the same
 // scalar instance.
 TEST_F(TestArray, TestConcurrentFillFromScalar) {
+#ifndef ARROW_ENABLE_THREADING
+  GTEST_SKIP() << "Test requires threading support";
+#endif
   for (auto type : TestArrayUtilitiesAgainstTheseTypes()) {
     ARROW_SCOPED_TRACE("type = ", type->ToString());
     for (auto seed : {0u, 0xdeadbeef, 42u}) {

From 22f88fa4a8f5ac7250f1845aace5a78d20006ef2 Mon Sep 17 00:00:00 2001
From: Bryce Mecum
Date: Wed, 1 May 2024 00:12:37 -0800
Subject: [PATCH 054/106] GH-41410: [C++][FS][Azure][Docs] Add AzureFileSystem
 to Filesystems API reference (#41411)

### Rationale for this change

See https://github.com/apache/arrow/issues/41410.

### What changes are included in this PR?

Just changes to filesystem.rst.

### Are these changes tested?

Yes, locally.

### Are there any user-facing changes?

These are those changes.
* GitHub Issue: #41410 Authored-by: Bryce Mecum Signed-off-by: Sutou Kouhei --- cpp/src/arrow/filesystem/azurefs.h | 17 ++++++++--------- docs/source/cpp/api/filesystem.rst | 9 +++++++++ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 667b4e372ae59..b71a5ae73b2e9 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -141,18 +141,14 @@ struct ARROW_EXPORT AzureOptions { /// /// 1. abfs[s]://[:\@]\.blob.core.windows.net /// [/\[/\]] - /// 2. abfs[s]://\[:\]@\.dfs.core.windows.net - /// [/path] + /// 2. abfs[s]://\[:\]\@\.dfs.core.windows.net[/path] /// 3. abfs[s]://[\]@]\[\<:port\>] /// [/\[/path]] /// 4. abfs[s]://[\]@]\[/path] /// - /// 1. and 2. are compatible with the Azure Data Lake Storage Gen2 URIs: - /// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri - /// - /// 3. is for Azure Blob Storage compatible service including Azurite. - /// - /// 4. is a shorter version of 1. and 2. + /// (1) and (2) are compatible with the Azure Data Lake Storage Gen2 URIs + /// [1], (3) is for Azure Blob Storage compatible service including Azurite, + /// and (4) is a shorter version of (1) and (2). /// /// Note that there is no difference between abfs and abfss. HTTPS is /// used with abfs by default. You can force to use HTTP by specifying @@ -178,6 +174,9 @@ struct ARROW_EXPORT AzureOptions { /// AzureOptions::ConfigureClientSecretCredential() is called. /// * client_secret: You must specify "tenant_id" and "client_id" /// too. AzureOptions::ConfigureClientSecretCredential() is called. + /// + /// [1]: + /// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri static Result FromUri(const Uri& uri, std::string* out_path); static Result FromUri(const std::string& uri, std::string* out_path); @@ -226,7 +225,7 @@ struct ARROW_EXPORT AzureOptions { /// overwriting. /// - When you use the ListBlobs operation without specifying a delimiter, the results /// include both directories and blobs. If you choose to use a delimiter, use only a -/// forward slash (/) -- the only supported delimiter. +/// forward slash (/) \--- the only supported delimiter. /// - If you use the DeleteBlob API to delete a directory, that directory is deleted only /// if it's empty. This means that you can't use the Blob API delete directories /// recursively. diff --git a/docs/source/cpp/api/filesystem.rst b/docs/source/cpp/api/filesystem.rst index 02b12668327f2..599e9fedb60f9 100644 --- a/docs/source/cpp/api/filesystem.rst +++ b/docs/source/cpp/api/filesystem.rst @@ -97,3 +97,12 @@ Google Cloud Storage filesystem .. doxygenclass:: arrow::fs::GcsFileSystem :members: + +Azure filesystem +---------------- + +.. doxygenstruct:: arrow::fs::AzureOptions + :members: + +.. doxygenclass:: arrow::fs::AzureFileSystem + :members: From 281122c018df86601ca675f3941751ddc3a89b3d Mon Sep 17 00:00:00 2001 From: Hyunseok Seo Date: Thu, 2 May 2024 00:18:12 +0800 Subject: [PATCH 055/106] GH-41306: [C++] Check to avoid copying when NullBitmapBuffer is Null (#41452) ### Rationale for this change This PR addresses a bug with the `FixedSizeBinary` type where it does not cast to a `Binary` type after being sliced. When slicing occurs, the offset is modified. If the resulting sliced data structure does not contain any `null` values, the Null Bitmap Buffer may be set to `null`. 
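A minimal PyArrow-level sketch of the affected pattern (an illustrative repro mirroring the new C++ test, not part of this PR; the fix itself is in the C++ cast kernel):

```python
import pyarrow as pa

# A FixedSizeBinary array with no nulls: after slicing, the offset is
# non-zero and the null bitmap buffer may be absent (null).
arr = pa.array([b"foo", b"bar", b"baz", b"quu"], type=pa.binary(3))
sliced = arr.slice(1, 3)

# Before this fix, casting such a slice to a variable-size binary type
# could dereference the missing null bitmap and crash.
print(sliced.cast(pa.binary()))
```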
Currently, when a `Cast` operation is attempted on such a data structure, the code erroneously tries to access the Null Bitmap Buffer even when it is `null`. This leads to an `EXC_BAD_ACCESS` error. This PR implements a fix to prevent this erroneous behavior by adding checks before accessing the Null Bitmap Buffer. ### What changes are included in this PR? - Add a null check for the Null Bitmap Buffer when casting from `FixedSizeBinary` to `Binary` to prevent access violations if the buffer is null. ### Are these changes tested? Yes ### Are there any user-facing changes? Yes (Pyarrow side) * GitHub Issue: #41306 Authored-by: Hyunseok Seo Signed-off-by: Weston Pace --- .../compute/kernels/scalar_cast_string.cc | 13 ++++++--- .../arrow/compute/kernels/scalar_cast_test.cc | 27 +++++++++++++++++++ 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index 3a8352a9b870f..dc3fe29a3dfae 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -340,10 +340,15 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou if (input.offset == output->offset) { output->buffers[0] = input.GetBuffer(0); } else { - ARROW_ASSIGN_OR_RAISE( - output->buffers[0], - arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, - input.offset, input.length)); + // When the offsets are different (e.g., due to slice operation), we need to check if + // the null bitmap buffer is not null before copying it. The null bitmap buffer can be + // null if the input array value does not contain any null value. + if (input.buffers[0].data != NULLPTR) { + ARROW_ASSIGN_OR_RAISE( + output->buffers[0], + arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, + input.offset, input.length)); + } } // This buffer is preallocated diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index af62b4da2caa5..a6d7f6097b59b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2196,6 +2196,33 @@ TEST(Cast, BinaryOrStringToFixedSizeBinary) { } } +TEST(Cast, FixedSizeBinaryToBinaryOrString) { + for (auto out_type : {utf8(), large_utf8(), binary(), large_binary()}) { + auto valid_input = ArrayFromJSON(fixed_size_binary(3), R"(["foo", null, "bar", + "baz", "quu"])"); + + CheckCast(valid_input, ArrayFromJSON(out_type, R"(["foo", null, "bar", "baz", + "quu"])")); + + auto empty_input = ArrayFromJSON(fixed_size_binary(3), "[]"); + CheckCast(empty_input, ArrayFromJSON(out_type, "[]")); + } +} + +TEST(Cast, FixedSizeBinaryToBinaryOrStringWithSlice) { + for (auto out_type : {utf8(), large_utf8(), binary(), large_binary()}) { + auto valid_input = ArrayFromJSON(fixed_size_binary(3), R"(["foo", null, "bar", + "baz", "quu"])"); + auto sliced = valid_input->Slice(1, 3); + CheckCast(sliced, ArrayFromJSON(out_type, R"([null, "bar", "baz"])")); + + auto valid_input_without_null = ArrayFromJSON(fixed_size_binary(3), R"(["foo", "bar", + "baz", "quu"])"); + auto sliced_without_null = valid_input_without_null->Slice(1, 3); + CheckCast(sliced_without_null, ArrayFromJSON(out_type, R"(["bar", "baz", "quu"])")); + } +} + TEST(Cast, IntToString) { for (auto string_type : {utf8(), large_utf8()}) { CheckCast(ArrayFromJSON(int8(), "[0, 1, 127, -128, null]"), From cc78c7a9bf17ceba7d538b30ddda008daeb1db85 Mon Sep 
17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 2 May 2024 05:54:20 +0900 Subject: [PATCH 056/106] MINOR: [JS] Bump memfs from 4.8.2 to 4.9.2 in /js (#41482) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [memfs](https://github.com/streamich/memfs) from 4.8.2 to 4.9.2.
Release notes

Sourced from memfs's releases.

v4.9.2

4.9.2 (2024-04-30)

Bug Fixes

v4.9.1

4.9.1 (2024-04-27)

Bug Fixes

  • 🐛 use latest json-pack implementation (de54ab5)
Changelog

Sourced from memfs's changelog.

4.9.2 (2024-04-30)

Bug Fixes

4.9.1 (2024-04-27)

Bug Fixes

  • 🐛 use latest json-pack implementation (de54ab5)

4.9.0 (2024-04-27)

Features

  • 🎸 define .scan() CRUD method (921e05d)
  • 🎸 implement .scan() in Node.js CRUD (3d973b7)
  • 🎸 implement .scan() method for FSA CRUD (a148fb8)
Commits
  • 0cc081d chore(release): 4.9.2 [skip ci]
  • a474a47 Merge pull request #1031 from streamich/bump-utils
  • 32cc4da fix: 🐛 bump json-pack
  • eea3b42 fix: 🐛 bump @jsonjoy.com/util package
  • 7a38617 chore(deps): update peaceiris/actions-gh-pages action to v4 (#1027)
  • b198f40 ci: 🎡 add mirror to Gitlab workflow
  • 4619f16 chore(release): 4.9.1 [skip ci]
  • e5461ae Merge pull request #1028 from streamich/dependencies
  • 0dfd7bb docs: ✏️ describe memfs() helper in docs
  • 0509f15 chore: 🤖 remove /src/json-joy/ folder
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=memfs&package-manager=npm_and_yarn&previous-version=4.8.2&new-version=4.9.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.

---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:

- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- js/package.json | 2 +- js/yarn.lock | 53 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/js/package.json b/js/package.json index fee6b342dbd13..7def7986490d6 100644 --- a/js/package.json +++ b/js/package.json @@ -99,7 +99,7 @@ "ix": "5.0.0", "jest": "29.7.0", "jest-silent-reporter": "0.5.0", - "memfs": "4.8.2", + "memfs": "4.9.2", "mkdirp": "3.0.1", "multistream": "4.1.0", "regenerator-runtime": "0.14.1", diff --git a/js/yarn.lock b/js/yarn.lock index b74e4543d9d4e..9daed1af9dd69 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -936,6 +936,26 @@ "@jridgewell/resolve-uri" "^3.1.0" "@jridgewell/sourcemap-codec" "^1.4.14" +"@jsonjoy.com/base64@^1.1.1": + version "1.1.1" + resolved "https://registry.yarnpkg.com/@jsonjoy.com/base64/-/base64-1.1.1.tgz#a717fd8840f7bad49c7fe66cc65db8bcfc4c4dc5" + integrity sha512-LnFjVChaGY8cZVMwAIMjvA1XwQjZ/zIXHyh28IyJkyNkzof4Dkm1+KN9UIm3lHhREH4vs7XwZ0NpkZKnwOtEfg== + +"@jsonjoy.com/json-pack@^1.0.3": + version "1.0.3" + resolved "https://registry.yarnpkg.com/@jsonjoy.com/json-pack/-/json-pack-1.0.3.tgz#a68cbe3ccfd85d26cd763e4175fe90c9ee383d33" + integrity sha512-Q0SPAdmK6s5Fe3e1kcNvwNyk6e2+CxM8XZdGbf4abZG7nUO05KSie3/iX29loTBuY+75uVP6RixDSPVpotfzmQ== + dependencies: + "@jsonjoy.com/base64" "^1.1.1" + "@jsonjoy.com/util" "^1.1.2" + hyperdyperid "^1.2.0" + thingies "^1.20.0" + +"@jsonjoy.com/util@^1.1.2": + version "1.1.2" + resolved "https://registry.yarnpkg.com/@jsonjoy.com/util/-/util-1.1.2.tgz#5072c27ecdb16d1ed7a2d125a1d0ed8aba01d652" + integrity sha512-HOGa9wtE6LEz2I5mMQ2pMSjth85PmD71kPbsecs02nEUq3/Kw0wRK3gmZn5BCEB8mFLXByqPxjHgApoMwIPMKQ== + "@nodelib/fs.scandir@2.1.5": version "2.1.5" resolved "https://registry.yarnpkg.com/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz#7619c2eb21b25483f6d167548b4cfd5a7488c3d5" @@ -4018,6 +4038,11 @@ human-signals@^2.1.0: resolved "https://registry.yarnpkg.com/human-signals/-/human-signals-2.1.0.tgz#dc91fcba42e4d06e4abaed33b3e7a3c02f514ea0" integrity sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw== +hyperdyperid@^1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/hyperdyperid/-/hyperdyperid-1.2.0.tgz#59668d323ada92228d2a869d3e474d5a33b69e6b" + integrity sha512-Y93lCzHYgGWdrJ66yIktxiaGULYc6oGiABxhcO5AufBeOyoIdZF7bIfLaOrbM0iGIOXQQgxxRrFEnb+Y6w1n4A== + ignore@^5.2.0, ignore@^5.2.4, ignore@^5.3.1: version "5.3.1" resolved "https://registry.yarnpkg.com/ignore/-/ignore-5.3.1.tgz#5073e554cd42c5b33b394375f538b8593e34d4ef" @@ -5165,11 +5190,14 @@ matchdep@^2.0.0: resolve "^1.4.0" stack-trace "0.0.10" -memfs@4.8.2: - version "4.8.2" - resolved "https://registry.yarnpkg.com/memfs/-/memfs-4.8.2.tgz#9bb7c3e43647348451082557f05fb170b7442949" - integrity sha512-j4WKth315edViMBGkHW6NTF0QBjsTrcRDmYNcGsPq+ozMEyCCCIlX2d2mJ5wuh6iHvJ3FevUrr48v58YRqVdYg== +memfs@4.9.2: + version "4.9.2" + resolved "https://registry.yarnpkg.com/memfs/-/memfs-4.9.2.tgz#42e7b48207268dad8c9c48ea5d4952c5d3840433" + integrity sha512-f16coDZlTG1jskq3mxarwB+fGRrd0uXWt+o1WIhRfOwbXQZqUDsTVxQBFK9JjRQHblg8eAG2JSbprDXKjc7ijQ== dependencies: + "@jsonjoy.com/json-pack" "^1.0.3" + "@jsonjoy.com/util" "^1.1.2" + sonic-forest "^1.0.0" tslib "^2.0.0" memoizee@0.4.X: @@ -6386,6 +6414,13 @@ snapdragon@^0.8.1: source-map-resolve "^0.5.0" use "^3.1.0" +sonic-forest@^1.0.0: + version "1.0.2" + resolved 
"https://registry.yarnpkg.com/sonic-forest/-/sonic-forest-1.0.2.tgz#d80aa621d1cffe75a606ca44789ccff30f5b9ce6" + integrity sha512-2rICdwIJi5kVlehMUVtJeHn3ohh5YZV4pDv0P0c1M11cRz/gXNViItpM94HQwfvnXuzybpqK0LZJgTa3lEwtAw== + dependencies: + tree-dump "^1.0.0" + source-map-resolve@^0.5.0: version "0.5.3" resolved "https://registry.yarnpkg.com/source-map-resolve/-/source-map-resolve-0.5.3.tgz#190866bece7553e1f8f267a2ee82c606b5509a1a" @@ -6755,6 +6790,11 @@ textextensions@^3.2.0: resolved "https://registry.yarnpkg.com/textextensions/-/textextensions-3.3.0.tgz#03530d5287b86773c08b77458589148870cc71d3" integrity sha512-mk82dS8eRABNbeVJrEiN5/UMSCliINAuz8mkUwH4SwslkNP//gbEzlWNS5au0z5Dpx40SQxzqZevZkn+WYJ9Dw== +thingies@^1.20.0: + version "1.21.0" + resolved "https://registry.yarnpkg.com/thingies/-/thingies-1.21.0.tgz#e80fbe58fd6fdaaab8fad9b67bd0a5c943c445c1" + integrity sha512-hsqsJsFMsV+aD4s3CWKk85ep/3I9XzYV/IXaSouJMYIoDlgyi11cBhsqYe9/geRfB0YIikBQg6raRaM+nIMP9g== + through2-filter@^3.0.0: version "3.0.0" resolved "https://registry.yarnpkg.com/through2-filter/-/through2-filter-3.0.0.tgz#700e786df2367c2c88cd8aa5be4cf9c1e7831254" @@ -6866,6 +6906,11 @@ totalist@^3.0.0: resolved "https://registry.yarnpkg.com/totalist/-/totalist-3.0.1.tgz#ba3a3d600c915b1a97872348f79c127475f6acf8" integrity sha512-sf4i37nQ2LBx4m3wB74y+ubopq6W/dIzXg0FDGjsYnZHVa1Da8FH853wlL2gtUhg+xJXjfk3kUZS3BRoQeoQBQ== +tree-dump@^1.0.0: + version "1.0.1" + resolved "https://registry.yarnpkg.com/tree-dump/-/tree-dump-1.0.1.tgz#b448758da7495580e6b7830d6b7834fca4c45b96" + integrity sha512-WCkcRBVPSlHHq1dc/px9iOfqklvzCbdRwvlNfxGZsrHqf6aZttfPrd7DJTt6oR10dwUfpFFQeVTkPbBIZxX/YA== + trim-newlines@^4.0.2: version "4.1.1" resolved "https://registry.yarnpkg.com/trim-newlines/-/trim-newlines-4.1.1.tgz#28c88deb50ed10c7ba6dc2474421904a00139125" From 9ce7ab10fbb3937cdcb4800a791c06591523240b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 2 May 2024 05:55:57 +0900 Subject: [PATCH 057/106] MINOR: [JS] Bump rollup from 4.14.3 to 4.17.2 in /js (#41484) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [rollup](https://github.com/rollup/rollup) from 4.14.3 to 4.17.2.
Release notes

Sourced from rollup's releases.

v4.17.2

4.17.2

2024-04-30

Bug Fixes

  • Fix tree-shaking problems when using spread arguments (#5503)

Pull Requests

v4.17.1

4.17.1

2024-04-29

Bug Fixes

  • Prevent infinite recursions for certain constructor invocations (#5500)

Pull Requests

v4.17.0

4.17.0

2024-04-27

Features

  • Track function call arguments to optimize functions only called once or with the same literal values (re-release from 4.16.0) (#5483)

Bug Fixes

  • Reduce browser WASM size to a fraction by changing optimization settings (#5494)

Pull Requests

v4.16.4

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=rollup&package-manager=npm_and_yarn&previous-version=4.14.3&new-version=4.17.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- js/package.json | 2 +- js/yarn.lock | 198 ++++++++++++++++++++++++------------------------ 2 files changed, 100 insertions(+), 100 deletions(-) diff --git a/js/package.json b/js/package.json index 7def7986490d6..8cab229521f79 100644 --- a/js/package.json +++ b/js/package.json @@ -103,7 +103,7 @@ "mkdirp": "3.0.1", "multistream": "4.1.0", "regenerator-runtime": "0.14.1", - "rollup": "4.14.3", + "rollup": "4.17.2", "rxjs": "7.8.1", "ts-jest": "29.1.2", "ts-node": "10.9.2", diff --git a/js/yarn.lock b/js/yarn.lock index 9daed1af9dd69..fe483ab8aca36 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -1020,85 +1020,85 @@ estree-walker "^2.0.2" picomatch "^2.3.1" -"@rollup/rollup-android-arm-eabi@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.14.3.tgz#bddf05c3387d02fac04b6b86b3a779337edfed75" - integrity sha512-X9alQ3XM6I9IlSlmC8ddAvMSyG1WuHk5oUnXGw+yUBs3BFoTizmG1La/Gr8fVJvDWAq+zlYTZ9DBgrlKRVY06g== - -"@rollup/rollup-android-arm64@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.14.3.tgz#b26bd09de58704c0a45e3375b76796f6eda825e4" - integrity sha512-eQK5JIi+POhFpzk+LnjKIy4Ks+pwJ+NXmPxOCSvOKSNRPONzKuUvWE+P9JxGZVxrtzm6BAYMaL50FFuPe0oWMQ== - -"@rollup/rollup-darwin-arm64@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.14.3.tgz#c5f3fd1aa285b6d33dda6e3f3ca395f8c37fd5ca" - integrity sha512-Od4vE6f6CTT53yM1jgcLqNfItTsLt5zE46fdPaEmeFHvPs5SjZYlLpHrSiHEKR1+HdRfxuzXHjDOIxQyC3ptBA== - -"@rollup/rollup-darwin-x64@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.14.3.tgz#8e4673734d7dc9d68f6d48e81246055cda0e840f" - integrity sha512-0IMAO21axJeNIrvS9lSe/PGthc8ZUS+zC53O0VhF5gMxfmcKAP4ESkKOCwEi6u2asUrt4mQv2rjY8QseIEb1aw== - -"@rollup/rollup-linux-arm-gnueabihf@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.14.3.tgz#53ed38eb13b58ababdb55a7f66f0538a7f85dcba" - integrity sha512-ge2DC7tHRHa3caVEoSbPRJpq7azhG+xYsd6u2MEnJ6XzPSzQsTKyXvh6iWjXRf7Rt9ykIUWHtl0Uz3T6yXPpKw== - -"@rollup/rollup-linux-arm-musleabihf@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.14.3.tgz#0706ee38330e267a5c9326956820f009cfb21fcd" - integrity sha512-ljcuiDI4V3ySuc7eSk4lQ9wU8J8r8KrOUvB2U+TtK0TiW6OFDmJ+DdIjjwZHIw9CNxzbmXY39wwpzYuFDwNXuw== - -"@rollup/rollup-linux-arm64-gnu@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.14.3.tgz#426fce7b8b242ac5abd48a10a5020f5a468c6cb4" - integrity sha512-Eci2us9VTHm1eSyn5/eEpaC7eP/mp5n46gTRB3Aar3BgSvDQGJZuicyq6TsH4HngNBgVqC5sDYxOzTExSU+NjA== - -"@rollup/rollup-linux-arm64-musl@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.14.3.tgz#65bf944530d759b50d7ffd00dfbdf4125a43406f" - integrity sha512-UrBoMLCq4E92/LCqlh+blpqMz5h1tJttPIniwUgOFJyjWI1qrtrDhhpHPuFxULlUmjFHfloWdixtDhSxJt5iKw== - -"@rollup/rollup-linux-powerpc64le-gnu@4.14.3": - version "4.14.3" - resolved 
"https://registry.yarnpkg.com/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.14.3.tgz#494ba3b31095e9a45df9c3f646d21400fb631a95" - integrity sha512-5aRjvsS8q1nWN8AoRfrq5+9IflC3P1leMoy4r2WjXyFqf3qcqsxRCfxtZIV58tCxd+Yv7WELPcO9mY9aeQyAmw== - -"@rollup/rollup-linux-riscv64-gnu@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.14.3.tgz#8b88ed0a40724cce04aa15374ebe5ba4092d679f" - integrity sha512-sk/Qh1j2/RJSX7FhEpJn8n0ndxy/uf0kI/9Zc4b1ELhqULVdTfN6HL31CDaTChiBAOgLcsJ1sgVZjWv8XNEsAQ== - -"@rollup/rollup-linux-s390x-gnu@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.14.3.tgz#09c9e5ec57a0f6ec3551272c860bb9a04b96d70f" - integrity sha512-jOO/PEaDitOmY9TgkxF/TQIjXySQe5KVYB57H/8LRP/ux0ZoO8cSHCX17asMSv3ruwslXW/TLBcxyaUzGRHcqg== - -"@rollup/rollup-linux-x64-gnu@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.14.3.tgz#197f27fd481ad9c861021d5cbbf21793922a631c" - integrity sha512-8ybV4Xjy59xLMyWo3GCfEGqtKV5M5gCSrZlxkPGvEPCGDLNla7v48S662HSGwRd6/2cSneMQWiv+QzcttLrrOA== - -"@rollup/rollup-linux-x64-musl@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.14.3.tgz#5cc0522f4942f2df625e9bfb6fb02c6580ffbce6" - integrity sha512-s+xf1I46trOY10OqAtZ5Rm6lzHre/UiLA1J2uOhCFXWkbZrJRkYBPO6FhvGfHmdtQ3Bx793MNa7LvoWFAm93bg== - -"@rollup/rollup-win32-arm64-msvc@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.14.3.tgz#a648122389d23a7543b261fba082e65fefefe4f6" - integrity sha512-+4h2WrGOYsOumDQ5S2sYNyhVfrue+9tc9XcLWLh+Kw3UOxAvrfOrSMFon60KspcDdytkNDh7K2Vs6eMaYImAZg== - -"@rollup/rollup-win32-ia32-msvc@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.14.3.tgz#34727b5c7953c35fc6e1ae4f770ad3a2025f8e03" - integrity sha512-T1l7y/bCeL/kUwh9OD4PQT4aM7Bq43vX05htPJJ46RTI4r5KNt6qJRzAfNfM+OYMNEVBWQzR2Gyk+FXLZfogGw== - -"@rollup/rollup-win32-x64-msvc@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.14.3.tgz#5b2fb4d8cd44c05deef8a7b0e6deb9ccb8939d18" - integrity sha512-/BypzV0H1y1HzgYpxqRaXGBRqfodgoBBCcsrujT6QRcakDQdfU+Lq9PENPh5jB4I44YWq+0C2eHsHya+nZY1sA== +"@rollup/rollup-android-arm-eabi@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.17.2.tgz#1a32112822660ee104c5dd3a7c595e26100d4c2d" + integrity sha512-NM0jFxY8bB8QLkoKxIQeObCaDlJKewVlIEkuyYKm5An1tdVZ966w2+MPQ2l8LBZLjR+SgyV+nRkTIunzOYBMLQ== + +"@rollup/rollup-android-arm64@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.17.2.tgz#5aeef206d65ff4db423f3a93f71af91b28662c5b" + integrity sha512-yeX/Usk7daNIVwkq2uGoq2BYJKZY1JfyLTaHO/jaiSwi/lsf8fTFoQW/n6IdAsx5tx+iotu2zCJwz8MxI6D/Bw== + +"@rollup/rollup-darwin-arm64@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.17.2.tgz#6b66aaf003c70454c292cd5f0236ebdc6ffbdf1a" + integrity sha512-kcMLpE6uCwls023+kknm71ug7MZOrtXo+y5p/tsg6jltpDtgQY1Eq5sGfHcQfb+lfuKwhBmEURDga9N0ol4YPw== + +"@rollup/rollup-darwin-x64@4.17.2": + version "4.17.2" + resolved 
"https://registry.yarnpkg.com/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.17.2.tgz#f64fc51ed12b19f883131ccbcea59fc68cbd6c0b" + integrity sha512-AtKwD0VEx0zWkL0ZjixEkp5tbNLzX+FCqGG1SvOu993HnSz4qDI6S4kGzubrEJAljpVkhRSlg5bzpV//E6ysTQ== + +"@rollup/rollup-linux-arm-gnueabihf@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.17.2.tgz#1a7641111be67c10111f7122d1e375d1226cbf14" + integrity sha512-3reX2fUHqN7sffBNqmEyMQVj/CKhIHZd4y631duy0hZqI8Qoqf6lTtmAKvJFYa6bhU95B1D0WgzHkmTg33In0A== + +"@rollup/rollup-linux-arm-musleabihf@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.17.2.tgz#c93fd632923e0fee25aacd2ae414288d0b7455bb" + integrity sha512-uSqpsp91mheRgw96xtyAGP9FW5ChctTFEoXP0r5FAzj/3ZRv3Uxjtc7taRQSaQM/q85KEKjKsZuiZM3GyUivRg== + +"@rollup/rollup-linux-arm64-gnu@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.17.2.tgz#fa531425dd21d058a630947527b4612d9d0b4a4a" + integrity sha512-EMMPHkiCRtE8Wdk3Qhtciq6BndLtstqZIroHiiGzB3C5LDJmIZcSzVtLRbwuXuUft1Cnv+9fxuDtDxz3k3EW2A== + +"@rollup/rollup-linux-arm64-musl@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.17.2.tgz#8acc16f095ceea5854caf7b07e73f7d1802ac5af" + integrity sha512-NMPylUUZ1i0z/xJUIx6VUhISZDRT+uTWpBcjdv0/zkp7b/bQDF+NfnfdzuTiB1G6HTodgoFa93hp0O1xl+/UbA== + +"@rollup/rollup-linux-powerpc64le-gnu@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.17.2.tgz#94e69a8499b5cf368911b83a44bb230782aeb571" + integrity sha512-T19My13y8uYXPw/L/k0JYaX1fJKFT/PWdXiHr8mTbXWxjVF1t+8Xl31DgBBvEKclw+1b00Chg0hxE2O7bTG7GQ== + +"@rollup/rollup-linux-riscv64-gnu@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.17.2.tgz#7ef1c781c7e59e85a6ce261cc95d7f1e0b56db0f" + integrity sha512-BOaNfthf3X3fOWAB+IJ9kxTgPmMqPPH5f5k2DcCsRrBIbWnaJCgX2ll77dV1TdSy9SaXTR5iDXRL8n7AnoP5cg== + +"@rollup/rollup-linux-s390x-gnu@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.17.2.tgz#f15775841c3232fca9b78cd25a7a0512c694b354" + integrity sha512-W0UP/x7bnn3xN2eYMql2T/+wpASLE5SjObXILTMPUBDB/Fg/FxC+gX4nvCfPBCbNhz51C+HcqQp2qQ4u25ok6g== + +"@rollup/rollup-linux-x64-gnu@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.17.2.tgz#b521d271798d037ad70c9f85dd97d25f8a52e811" + integrity sha512-Hy7pLwByUOuyaFC6mAr7m+oMC+V7qyifzs/nW2OJfC8H4hbCzOX07Ov0VFk/zP3kBsELWNFi7rJtgbKYsav9QQ== + +"@rollup/rollup-linux-x64-musl@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.17.2.tgz#9254019cc4baac35800991315d133cc9fd1bf385" + integrity sha512-h1+yTWeYbRdAyJ/jMiVw0l6fOOm/0D1vNLui9iPuqgRGnXA0u21gAqOyB5iHjlM9MMfNOm9RHCQ7zLIzT0x11Q== + +"@rollup/rollup-win32-arm64-msvc@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.17.2.tgz#27f65a89f6f52ee9426ec11e3571038e4671790f" + integrity sha512-tmdtXMfKAjy5+IQsVtDiCfqbynAQE/TQRpWdVataHmhMb9DCoJxp9vLcCBjEQWMiUYxO1QprH/HbY9ragCEFLA== + +"@rollup/rollup-win32-ia32-msvc@4.17.2": + version 
"4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.17.2.tgz#a2fbf8246ed0bb014f078ca34ae6b377a90cb411" + integrity sha512-7II/QCSTAHuE5vdZaQEwJq2ZACkBpQDOmQsE6D6XUbnBHW8IAhm4eTufL6msLJorzrHDFv3CF8oCA/hSIRuZeQ== + +"@rollup/rollup-win32-x64-msvc@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.17.2.tgz#5a2d08b81e8064b34242d5cc9973ef8dd1e60503" + integrity sha512-TGGO7v7qOq4CYmSBVEYpI1Y5xDuCEnbVC5Vth8mOsW0gDSzxNrVERPc790IGHsrT2dQSimgMr9Ub3Y1Jci5/8w== "@rollup/stream@3.0.1": version "3.0.1" @@ -6191,29 +6191,29 @@ rimraf@^3.0.2: dependencies: glob "^7.1.3" -rollup@4.14.3: - version "4.14.3" - resolved "https://registry.yarnpkg.com/rollup/-/rollup-4.14.3.tgz#bcbb7784b35826d3164346fa6d5aac95190d8ba9" - integrity sha512-ag5tTQKYsj1bhrFC9+OEWqb5O6VYgtQDO9hPDBMmIbePwhfSr+ExlcU741t8Dhw5DkPCQf6noz0jb36D6W9/hw== +rollup@4.17.2: + version "4.17.2" + resolved "https://registry.yarnpkg.com/rollup/-/rollup-4.17.2.tgz#26d1785d0144122277fdb20ab3a24729ae68301f" + integrity sha512-/9ClTJPByC0U4zNLowV1tMBe8yMEAxewtR3cUNX5BoEpGH3dQEWpJLr6CLp0fPdYRF/fzVOgvDb1zXuakwF5kQ== dependencies: "@types/estree" "1.0.5" optionalDependencies: - "@rollup/rollup-android-arm-eabi" "4.14.3" - "@rollup/rollup-android-arm64" "4.14.3" - "@rollup/rollup-darwin-arm64" "4.14.3" - "@rollup/rollup-darwin-x64" "4.14.3" - "@rollup/rollup-linux-arm-gnueabihf" "4.14.3" - "@rollup/rollup-linux-arm-musleabihf" "4.14.3" - "@rollup/rollup-linux-arm64-gnu" "4.14.3" - "@rollup/rollup-linux-arm64-musl" "4.14.3" - "@rollup/rollup-linux-powerpc64le-gnu" "4.14.3" - "@rollup/rollup-linux-riscv64-gnu" "4.14.3" - "@rollup/rollup-linux-s390x-gnu" "4.14.3" - "@rollup/rollup-linux-x64-gnu" "4.14.3" - "@rollup/rollup-linux-x64-musl" "4.14.3" - "@rollup/rollup-win32-arm64-msvc" "4.14.3" - "@rollup/rollup-win32-ia32-msvc" "4.14.3" - "@rollup/rollup-win32-x64-msvc" "4.14.3" + "@rollup/rollup-android-arm-eabi" "4.17.2" + "@rollup/rollup-android-arm64" "4.17.2" + "@rollup/rollup-darwin-arm64" "4.17.2" + "@rollup/rollup-darwin-x64" "4.17.2" + "@rollup/rollup-linux-arm-gnueabihf" "4.17.2" + "@rollup/rollup-linux-arm-musleabihf" "4.17.2" + "@rollup/rollup-linux-arm64-gnu" "4.17.2" + "@rollup/rollup-linux-arm64-musl" "4.17.2" + "@rollup/rollup-linux-powerpc64le-gnu" "4.17.2" + "@rollup/rollup-linux-riscv64-gnu" "4.17.2" + "@rollup/rollup-linux-s390x-gnu" "4.17.2" + "@rollup/rollup-linux-x64-gnu" "4.17.2" + "@rollup/rollup-linux-x64-musl" "4.17.2" + "@rollup/rollup-win32-arm64-msvc" "4.17.2" + "@rollup/rollup-win32-ia32-msvc" "4.17.2" + "@rollup/rollup-win32-x64-msvc" "4.17.2" fsevents "~2.3.2" run-parallel@^1.1.9: From 14c54bbfb7d9305e79a2c2d016c34a655773e5cb Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 1 May 2024 19:56:20 -0400 Subject: [PATCH 058/106] GH-41470: [C++] Reuse deduplication logic for direct registration (#41466) ### Rationale for this change As observed in https://github.com/apache/arrow/pull/41309 a crossbow job on mac is failing due to duplicate registration of a factory for the file:// scheme ### What changes are included in this PR? Deduplication of registered filesystem factories is applied to direct registration as well as when merging registries. ### Are these changes tested? No, we just need to verify that the problematic crossbow job is repaired. ### Are there any user-facing changes? 
No * GitHub Issue: #41470 Lead-authored-by: Benjamin Kietzman Co-authored-by: David Li Signed-off-by: David Li --- cpp/src/arrow/filesystem/filesystem.cc | 4 ++-- cpp/src/arrow/filesystem/localfs_test.cc | 7 ++++--- dev/tasks/java-jars/github.yml | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc index b79af08385c0c..284be685fa800 100644 --- a/cpp/src/arrow/filesystem/filesystem.cc +++ b/cpp/src/arrow/filesystem/filesystem.cc @@ -761,8 +761,8 @@ class FileSystemFactoryRegistry { RETURN_NOT_OK(CheckValid()); auto [it, success] = scheme_to_factory_.emplace( - std::move(scheme), Registered{std::move(factory), std::move(finalizer)}); - if (success) { + std::move(scheme), Registered{factory, std::move(finalizer)}); + if (success || (it->second.ok() && it->second->factory == factory)) { return Status::OK(); } diff --git a/cpp/src/arrow/filesystem/localfs_test.cc b/cpp/src/arrow/filesystem/localfs_test.cc index 1a20e44bc36e2..d68c992dff863 100644 --- a/cpp/src/arrow/filesystem/localfs_test.cc +++ b/cpp/src/arrow/filesystem/localfs_test.cc @@ -154,15 +154,16 @@ TEST(FileSystemFromUri, RuntimeRegisteredFactory) { EXPECT_THAT(FileSystemFromUri("slowfile2:///hey/yo", &path), Raises(StatusCode::Invalid)); - EXPECT_THAT(RegisterFileSystemFactory("slowfile2", {SlowFileSystemFactory, "", 0}), - Ok()); + EXPECT_THAT( + RegisterFileSystemFactory("slowfile2", {SlowFileSystemFactory, __FILE__, __LINE__}), + Ok()); ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFromUri("slowfile2:///hey/yo", &path)); EXPECT_EQ(path, "/hey/yo"); EXPECT_EQ(fs->type_name(), "slow"); EXPECT_THAT( - RegisterFileSystemFactory("slowfile2", {SlowFileSystemFactory, "", 0}), + RegisterFileSystemFactory("slowfile2", {SlowFileSystemFactory, __FILE__, __LINE__}), Raises(StatusCode::KeyError, testing::HasSubstr("Attempted to register factory for scheme 'slowfile2' " "but that scheme is already registered"))); diff --git a/dev/tasks/java-jars/github.yml b/dev/tasks/java-jars/github.yml index 0437ee7864979..eb9478ebaa6ef 100644 --- a/dev/tasks/java-jars/github.yml +++ b/dev/tasks/java-jars/github.yml @@ -80,7 +80,7 @@ jobs: fail-fast: false matrix: platform: - - { runs_on: ["macos-latest"], arch: "x86_64"} + - { runs_on: ["macos-13"], arch: "x86_64"} - { runs_on: ["macos-14"], arch: "aarch_64" } env: MACOSX_DEPLOYMENT_TARGET: "10.15" From 3c67091f93223f2d12f5a73d3e5bc51e7b389a00 Mon Sep 17 00:00:00 2001 From: Thomas A Caswell Date: Thu, 2 May 2024 08:18:21 -0400 Subject: [PATCH 059/106] GH-41491: [Python] remove special methods related to buffers in python <2.6 (#41492) ### Rationale for this change These methods are not actually used and will be removed from Cython in an upcoming release. Closes #41491 ### What changes are included in this PR? ### Are these changes tested? Trust CI ### Are there any user-facing changes? No, this code should never be actually used. 
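For context, buffer consumers go through the PEP 3118 buffer protocol (`__getbuffer__`), which is untouched by this removal; a minimal sketch of that path (assuming only that `pyarrow` is installed):

```python
import pyarrow as pa

buf = pa.py_buffer(b"hello")   # pyarrow.Buffer wrapping the bytes
view = memoryview(buf)         # served by __getbuffer__ (PEP 3118),
                               # not by the removed Python 2 slots
assert view.tobytes() == b"hello"
```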
* GitHub Issue: #41491 Authored-by: Thomas A Caswell Signed-off-by: Joris Van den Bossche --- python/pyarrow/io.pxi | 47 ++++++++++++------------------------------- 1 file changed, 13 insertions(+), 34 deletions(-) diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 7890bf4b2dd76..9e8026deb435c 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1446,27 +1446,6 @@ cdef class Buffer(_Weakrefable): buffer.strides = self.strides buffer.suboffsets = NULL - def __getsegcount__(self, Py_ssize_t *len_out): - if len_out != NULL: - len_out[0] = self.size - return 1 - - def __getreadbuffer__(self, Py_ssize_t idx, void **p): - if idx != 0: - raise SystemError("accessing nonexistent buffer segment") - if p != NULL: - p[0] = self.buffer.get().data() - return self.size - - def __getwritebuffer__(self, Py_ssize_t idx, void **p): - if not self.buffer.get().is_mutable(): - raise SystemError("trying to write an immutable buffer") - if idx != 0: - raise SystemError("accessing nonexistent buffer segment") - if p != NULL: - p[0] = self.buffer.get().data() - return self.size - cdef class ResizableBuffer(Buffer): """ @@ -2142,21 +2121,21 @@ cdef class CacheOptions(_Weakrefable): Parameters ---------- hole_size_limit : int, default 8KiB - The maximum distance in bytes between two consecutive ranges; beyond + The maximum distance in bytes between two consecutive ranges; beyond this value, ranges are not combined. range_size_limit : int, default 32MiB - The maximum size in bytes of a combined range; if combining two - consecutive ranges would produce a range of a size greater than this, + The maximum size in bytes of a combined range; if combining two + consecutive ranges would produce a range of a size greater than this, they are not combined lazy : bool, default True lazy = false: request all byte ranges when PreBuffer or WillNeed is called. - lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader - needs them. - lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the + lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader + needs them. + lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the range that is currently being read. prefetch_limit : int, default 0 - The maximum number of ranges to be prefetched. This is only used for - lazy cache to asynchronously read some ranges after reading the target + The maximum number of ranges to be prefetched. This is only used for + lazy cache to asynchronously read some ranges after reading the target range. """ @@ -2227,19 +2206,19 @@ cdef class CacheOptions(_Weakrefable): """ Create suiteable CacheOptions based on provided network metrics. - Typically this will be used with object storage solutions like Amazon S3, + Typically this will be used with object storage solutions like Amazon S3, Google Cloud Storage and Azure Blob Storage. Parameters ---------- time_to_first_byte_millis : int - Seek-time or Time-To-First-Byte (TTFB) in milliseconds, also called call - setup latency of a new read request. The value is a positive integer. + Seek-time or Time-To-First-Byte (TTFB) in milliseconds, also called call + setup latency of a new read request. The value is a positive integer. transfer_bandwidth_mib_per_sec : int - Data transfer Bandwidth (BW) in MiB/sec (per connection). The value is a positive + Data transfer Bandwidth (BW) in MiB/sec (per connection). The value is a positive integer. 
ideal_bandwidth_utilization_frac : int, default 0.9 - Transfer bandwidth utilization fraction (per connection) to maximize the net + Transfer bandwidth utilization fraction (per connection) to maximize the net data load. The value is a positive float less than 1. max_ideal_request_size_mib : int, default 64 The maximum single data request size (in MiB) to maximize the net data load. From 49bf3d9bf2ca266fcf63dca1e57bdb83c9559b72 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 3 May 2024 06:10:20 +0900 Subject: [PATCH 060/106] GH-41467: [CI][Release] Don't push conda-verify-rc image (#41468) ### Rationale for this change Because it uses ubuntu:20.04 image directly. We don't build our image for it. ### What changes are included in this PR? Don't push an image for `conda-verify-rc`. ### Are these changes tested? No. ### Are there any user-facing changes? No. * GitHub Issue: #41467 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- dev/tasks/verify-rc/github.linux.amd64.docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/tasks/verify-rc/github.linux.amd64.docker.yml b/dev/tasks/verify-rc/github.linux.amd64.docker.yml index 65b30b5c8d4df..7a28ba705dd50 100644 --- a/dev/tasks/verify-rc/github.linux.amd64.docker.yml +++ b/dev/tasks/verify-rc/github.linux.amd64.docker.yml @@ -43,7 +43,7 @@ jobs: -e TEST_{{ target|upper }}=1 \ {{ distro }}-verify-rc - {% if arrow.is_default_branch() %} + {% if arrow.is_default_branch() and distro != "conda" %} {{ macros.github_login_dockerhub()|indent }} - name: Push Docker Image shell: bash From 71e38fc8a9fdf102a5136793b738d7650ca053fa Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 06:50:42 +0900 Subject: [PATCH 061/106] MINOR: [JS] Bump @typescript-eslint/eslint-plugin from 7.7.0 to 7.8.0 in /js (#41485) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [@ typescript-eslint/eslint-plugin](https://github.com/typescript-eslint/typescript-eslint/tree/HEAD/packages/eslint-plugin) from 7.7.0 to 7.8.0.
Release notes

Sourced from @typescript-eslint/eslint-plugin's releases.

v7.8.0

7.8.0 (2024-04-29)

🚀 Features

  • rule-tester: assert suggestion messages are unique (#8995)
  • typescript-estree: add maximumDefaultProjectFileMatchCount and wide allowDefaultProjectForFiles glob restrictions (#8925)

🩹 Fixes

  • eslint-plugin: [no-unsafe-argument] handle tagged templates (#8746)
  • eslint-plugin: [prefer-optional-chain] suggests optional chaining during strict null equality check (#8717)
  • eslint-plugin: [consistent-type-assertions] handle tagged templates (#8993)
  • eslint-plugin: [no-unsafe-return] handle union types (#9001)
  • eslint-plugin: [no-unused-vars] clear error report range (#8640)
  • utils: export ESLint backwards-compat functions (#8976)

❤️ Thank You

You can read about our versioning strategy and releases on our website.

v7.7.1

7.7.1 (2024-04-22)

🩹 Fixes

  • eslint-plugin: [no-unsafe-assignment] handle shorthand property assignment (#8800)
  • eslint-plugin: [explicit-function-return-type] fix checking wrong ancestor's return type (#8809)
  • eslint-plugin: [prefer-optional-chain] only look at left operand for requireNullish (#8559)
  • eslint-plugin: [no-for-in-array] refine report location (#8874)
  • eslint-plugin: [no-unnecessary-type-assertion] allow non-null assertion for void type (#8912)

❤️ Thank You

You can read about our versioning strategy and releases on our website.

Changelog

Sourced from @typescript-eslint/eslint-plugin's changelog.

7.8.0 (2024-04-29)

🩹 Fixes

  • eslint-plugin: [no-unsafe-argument] handle tagged templates

  • eslint-plugin: [prefer-optional-chain] suggests optional chaining during strict null equality check

  • eslint-plugin: [consistent-type-assertions] handle tagged templates

  • eslint-plugin: [no-unsafe-return] handle union types

  • eslint-plugin: [no-unused-vars] clear error report range

❤️ Thank You

  • auvred
  • Josh Goldberg ✨
  • jsfm01
  • Kim Sang Du
  • YeonJuan

You can read about our versioning strategy and releases on our website.

7.7.1 (2024-04-22)

🩹 Fixes

  • eslint-plugin: [no-unsafe-assignment] handle shorthand property assignment

  • eslint-plugin: [explicit-function-return-type] fix checking wrong ancestor's return type

  • eslint-plugin: [prefer-optional-chain] only look at left operand for requireNullish

  • eslint-plugin: [no-for-in-array] refine report location

  • eslint-plugin: [no-unnecessary-type-assertion] allow non-null assertion for void type

❤️ Thank You

  • Abraham Guo
  • Kirk Waiblinger
  • YeonJuan

You can read about our versioning strategy and releases on our website.

Commits
  • ee677f6 chore(release): publish 7.8.0
  • 8127873 fix(eslint-plugin): [no-unused-vars] clear error report range (#8640)
  • 216d1b0 fix(eslint-plugin): [no-unsafe-return] handle union types (#9001)
  • 51d2193 fix(eslint-plugin): [consistent-type-assertions] handle tagged templates (#8993)
  • 4bed24d fix(eslint-plugin): [prefer-optional-chain] suggests optional chaining during...
  • b0f7aa4 fix(eslint-plugin): [no-unsafe-argument] handle tagged templates (#8746)
  • 219b841 chore: resolve lint issues on main branch (#8966)
  • 3e19436 chore(release): publish 7.7.1
  • b2552ca fix(eslint-plugin): [no-unnecessary-type-assertion] allow non-null assertion ...
  • fdeba42 fix(eslint-plugin): [no-for-in-array] refine report location (#8874)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=@typescript-eslint/eslint-plugin&package-manager=npm_and_yarn&previous-version=7.7.0&new-version=7.8.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- js/package.json | 2 +- js/yarn.lock | 77 +++++++++++++++++++++++++++++++++++-------------- 2 files changed, 57 insertions(+), 22 deletions(-) diff --git a/js/package.json b/js/package.json index 8cab229521f79..e9590a188820f 100644 --- a/js/package.json +++ b/js/package.json @@ -72,7 +72,7 @@ "@types/glob": "8.1.0", "@types/jest": "29.5.12", "@types/multistream": "4.1.3", - "@typescript-eslint/eslint-plugin": "7.7.0", + "@typescript-eslint/eslint-plugin": "7.8.0", "@typescript-eslint/parser": "7.7.0", "async-done": "2.0.0", "benny": "3.7.1", diff --git a/js/yarn.lock b/js/yarn.lock index fe483ab8aca36..ab092675b4806 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -1436,16 +1436,16 @@ dependencies: "@types/yargs-parser" "*" -"@typescript-eslint/eslint-plugin@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/eslint-plugin/-/eslint-plugin-7.7.0.tgz#bf34a02f221811505b8bf2f31060c8560c1bb0a3" - integrity sha512-GJWR0YnfrKnsRoluVO3PRb9r5aMZriiMMM/RHj5nnTrBy1/wIgk76XCtCKcnXGjpZQJQRFtGV9/0JJ6n30uwpQ== +"@typescript-eslint/eslint-plugin@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/eslint-plugin/-/eslint-plugin-7.8.0.tgz#c78e309fe967cb4de05b85cdc876fb95f8e01b6f" + integrity sha512-gFTT+ezJmkwutUPmB0skOj3GZJtlEGnlssems4AjkVweUPGj7jRwwqg0Hhg7++kPGJqKtTYx+R05Ftww372aIg== dependencies: "@eslint-community/regexpp" "^4.10.0" - "@typescript-eslint/scope-manager" "7.7.0" - "@typescript-eslint/type-utils" "7.7.0" - "@typescript-eslint/utils" "7.7.0" - "@typescript-eslint/visitor-keys" "7.7.0" + "@typescript-eslint/scope-manager" "7.8.0" + "@typescript-eslint/type-utils" "7.8.0" + "@typescript-eslint/utils" "7.8.0" + "@typescript-eslint/visitor-keys" "7.8.0" debug "^4.3.4" graphemer "^1.4.0" ignore "^5.3.1" @@ -1480,13 +1480,21 @@ "@typescript-eslint/types" "7.7.0" "@typescript-eslint/visitor-keys" "7.7.0" -"@typescript-eslint/type-utils@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/type-utils/-/type-utils-7.7.0.tgz#36792ff4209a781b058de61631a48df17bdefbc5" - integrity sha512-bOp3ejoRYrhAlnT/bozNQi3nio9tIgv3U5C0mVDdZC7cpcQEDZXvq8inrHYghLVwuNABRqrMW5tzAv88Vy77Sg== +"@typescript-eslint/scope-manager@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/scope-manager/-/scope-manager-7.8.0.tgz#bb19096d11ec6b87fb6640d921df19b813e02047" + integrity sha512-viEmZ1LmwsGcnr85gIq+FCYI7nO90DVbE37/ll51hjv9aG+YZMb4WDE2fyWpUR4O/UrhGRpYXK/XajcGTk2B8g== dependencies: - "@typescript-eslint/typescript-estree" "7.7.0" - "@typescript-eslint/utils" "7.7.0" + "@typescript-eslint/types" "7.8.0" + "@typescript-eslint/visitor-keys" "7.8.0" + +"@typescript-eslint/type-utils@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/type-utils/-/type-utils-7.8.0.tgz#9de166f182a6e4d1c5da76e94880e91831e3e26f" + integrity sha512-H70R3AefQDQpz9mGv13Uhi121FNMh+WEaRqcXTX09YEDky21km4dV1ZXJIp8QjXc4ZaVkXVdohvWDzbnbHDS+A== + dependencies: + "@typescript-eslint/typescript-estree" "7.8.0" + "@typescript-eslint/utils" "7.8.0" debug "^4.3.4" ts-api-utils "^1.3.0" @@ -1500,6 +1508,11 @@ resolved "https://registry.yarnpkg.com/@typescript-eslint/types/-/types-7.7.0.tgz#23af4d24bf9ce15d8d301236e3e3014143604f27" integrity sha512-G01YPZ1Bd2hn+KPpIbrAhEWOn5lQBrjxkzHkWvP6NucMXFtfXoevK82hzQdpfuQYuhkvFDeQYbzXCjR1z9Z03w== +"@typescript-eslint/types@7.8.0": + version "7.8.0" + 
resolved "https://registry.yarnpkg.com/@typescript-eslint/types/-/types-7.8.0.tgz#1fd2577b3ad883b769546e2d1ef379f929a7091d" + integrity sha512-wf0peJ+ZGlcH+2ZS23aJbOv+ztjeeP8uQ9GgwMJGVLx/Nj9CJt17GWgWWoSmoRVKAX2X+7fzEnAjxdvK2gqCLw== + "@typescript-eslint/typescript-estree@5.62.0": version "5.62.0" resolved "https://registry.yarnpkg.com/@typescript-eslint/typescript-estree/-/typescript-estree-5.62.0.tgz#7d17794b77fabcac615d6a48fb143330d962eb9b" @@ -1527,17 +1540,31 @@ semver "^7.6.0" ts-api-utils "^1.3.0" -"@typescript-eslint/utils@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/utils/-/utils-7.7.0.tgz#3d2b6606a60ac34f3c625facfb3b3ab7e126f58d" - integrity sha512-LKGAXMPQs8U/zMRFXDZOzmMKgFv3COlxUQ+2NMPhbqgVm6R1w+nU1i4836Pmxu9jZAuIeyySNrN/6Rc657ggig== +"@typescript-eslint/typescript-estree@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/typescript-estree/-/typescript-estree-7.8.0.tgz#b028a9226860b66e623c1ee55cc2464b95d2987c" + integrity sha512-5pfUCOwK5yjPaJQNy44prjCwtr981dO8Qo9J9PwYXZ0MosgAbfEMB008dJ5sNo3+/BN6ytBPuSvXUg9SAqB0dg== + dependencies: + "@typescript-eslint/types" "7.8.0" + "@typescript-eslint/visitor-keys" "7.8.0" + debug "^4.3.4" + globby "^11.1.0" + is-glob "^4.0.3" + minimatch "^9.0.4" + semver "^7.6.0" + ts-api-utils "^1.3.0" + +"@typescript-eslint/utils@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/utils/-/utils-7.8.0.tgz#57a79f9c0c0740ead2f622e444cfaeeb9fd047cd" + integrity sha512-L0yFqOCflVqXxiZyXrDr80lnahQfSOfc9ELAAZ75sqicqp2i36kEZZGuUymHNFoYOqxRT05up760b4iGsl02nQ== dependencies: "@eslint-community/eslint-utils" "^4.4.0" "@types/json-schema" "^7.0.15" "@types/semver" "^7.5.8" - "@typescript-eslint/scope-manager" "7.7.0" - "@typescript-eslint/types" "7.7.0" - "@typescript-eslint/typescript-estree" "7.7.0" + "@typescript-eslint/scope-manager" "7.8.0" + "@typescript-eslint/types" "7.8.0" + "@typescript-eslint/typescript-estree" "7.8.0" semver "^7.6.0" "@typescript-eslint/utils@^5.10.0": @@ -1570,6 +1597,14 @@ "@typescript-eslint/types" "7.7.0" eslint-visitor-keys "^3.4.3" +"@typescript-eslint/visitor-keys@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/visitor-keys/-/visitor-keys-7.8.0.tgz#7285aab991da8bee411a42edbd5db760d22fdd91" + integrity sha512-q4/gibTNBQNA0lGyYQCmWRS5D15n8rXh4QjK3KV+MBPlTYHpfBUT3D3PaPR/HeNiI9W6R7FvlkcGhNyAoP+caA== + dependencies: + "@typescript-eslint/types" "7.8.0" + eslint-visitor-keys "^3.4.3" + "@ungap/structured-clone@^1.2.0": version "1.2.0" resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406" From 9749d7d653e1b106d0662624b22b2982b3ad0516 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 2 May 2024 22:02:00 -0300 Subject: [PATCH 062/106] GH-39798: [C++] Optimize Take for fixed-size types including nested fixed-size lists (#41297) ### Rationale for this change Introduce utilities for dealing with fixed-width types (including fixed-size lists of fixed-width types) generically. And use it for initial optimizations of `Take` and `Filter`. ### What changes are included in this PR? 
- [x] Introduce utilities for dealing with fixed-width types generically - [x] Use faster `Take` kernel on small power-of-2 byte widths of fixed-width types - [x] from `FSLTakeExec` (including FSLs of FSBs) - [x] from `FSBTakeExec` (done before this PR) - [x] ~Take on any fixed-width type~ (as a separate issue #41301) - [x] Use faster `Filter` kernel on both primitive and fixed-width types of any length - [x] from `FSLFilterExec` (including FSLs of FSBs) - [x] from `FSBFilterExec` (done before this PR) ### Are these changes tested? By existing and new tests. ### Are there any user-facing changes? Some functions added to the `arrow::util` namespace and documented inline. * GitHub Issue: #39798 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/CMakeLists.txt | 1 + .../vector_selection_filter_internal.cc | 30 +- .../kernels/vector_selection_internal.cc | 56 +++- .../kernels/vector_selection_internal.h | 7 +- .../kernels/vector_selection_take_internal.cc | 39 ++- .../compute/kernels/vector_selection_test.cc | 156 +++++++-- cpp/src/arrow/util/CMakeLists.txt | 1 + cpp/src/arrow/util/fixed_width_internal.cc | 226 +++++++++++++ cpp/src/arrow/util/fixed_width_internal.h | 307 ++++++++++++++++++ cpp/src/arrow/util/fixed_width_test.cc | 217 +++++++++++++ cpp/src/arrow/util/fixed_width_test_util.h | 203 ++++++++++++ 11 files changed, 1171 insertions(+), 72 deletions(-) create mode 100644 cpp/src/arrow/util/fixed_width_internal.cc create mode 100644 cpp/src/arrow/util/fixed_width_internal.h create mode 100644 cpp/src/arrow/util/fixed_width_test.cc create mode 100644 cpp/src/arrow/util/fixed_width_test_util.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 2ef82dd614f84..5d61112518f5e 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -501,6 +501,7 @@ set(ARROW_UTIL_SRCS util/decimal.cc util/delimiting.cc util/dict_util.cc + util/fixed_width_internal.cc util/float16.cc util/formatting.cc util/future.cc diff --git a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc index 8825d697fdf77..d5e5e5ad289ac 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc @@ -40,6 +40,7 @@ #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/fixed_width_internal.h" namespace arrow { @@ -158,9 +159,11 @@ class PrimitiveFilterImpl { PrimitiveFilterImpl(const ArraySpan& values, const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection, ArrayData* out_arr) - : byte_width_(values.type->byte_width()), + : byte_width_(util::FixedWidthInBytes(*values.type)), values_is_valid_(values.buffers[0].data), - values_data_(values.buffers[1].data), + // No offset applied for boolean because it's a bitmap + values_data_(kIsBoolean ? 
values.buffers[1].data + : util::OffsetPointerOfFixedWidthValues(values)), values_null_count_(values.null_count), values_offset_(values.offset), values_length_(values.length), @@ -169,17 +172,13 @@ class PrimitiveFilterImpl { if constexpr (kByteWidth >= 0 && !kIsBoolean) { DCHECK_EQ(kByteWidth, byte_width_); } - if constexpr (!kIsBoolean) { - // No offset applied for boolean because it's a bitmap - values_data_ += values.offset * byte_width(); - } + DCHECK_EQ(out_arr->offset, 0); if (out_arr->buffers[0] != nullptr) { // May be unallocated if neither filter nor values contain nulls out_is_valid_ = out_arr->buffers[0]->mutable_data(); } - out_data_ = out_arr->buffers[1]->mutable_data(); - DCHECK_EQ(out_arr->offset, 0); + out_data_ = util::MutableFixedWidthValuesPointer(out_arr); out_length_ = out_arr->length; out_position_ = 0; } @@ -416,7 +415,7 @@ class PrimitiveFilterImpl { out_position_ += length; } - constexpr int32_t byte_width() const { + constexpr int64_t byte_width() const { if constexpr (kByteWidth >= 0) { return kByteWidth; } else { @@ -425,7 +424,7 @@ class PrimitiveFilterImpl { } private: - int32_t byte_width_; + int64_t byte_width_; const uint8_t* values_is_valid_; const uint8_t* values_data_; int64_t values_null_count_; @@ -439,6 +438,8 @@ class PrimitiveFilterImpl { int64_t out_position_; }; +} // namespace + Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const ArraySpan& values = batch[0].array; const ArraySpan& filter = batch[1].array; @@ -468,9 +469,10 @@ Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult // validity bitmap. const bool allocate_validity = values.null_count != 0 || !filter_null_count_is_zero; - const int bit_width = values.type->bit_width(); - RETURN_NOT_OK(PreallocatePrimitiveArrayData(ctx, output_length, bit_width, - allocate_validity, out_arr)); + DCHECK(util::IsFixedWidthLike(values, /*force_null_count=*/false)); + const int64_t bit_width = util::FixedWidthInBits(*values.type); + RETURN_NOT_OK(util::internal::PreallocateFixedWidthArrayData( + ctx, output_length, /*source=*/values, allocate_validity, out_arr)); switch (bit_width) { case 1: @@ -505,6 +507,8 @@ Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult return Status::OK(); } +namespace { + // ---------------------------------------------------------------------- // Optimized filter for base binary types (32-bit and 64-bit) diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc index a0fe2808e3e4e..93cd5060348db 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc @@ -37,6 +37,7 @@ #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" +#include "arrow/util/fixed_width_internal.h" #include "arrow/util/int_util.h" #include "arrow/util/logging.h" #include "arrow/util/ree_util.h" @@ -65,24 +66,6 @@ void RegisterSelectionFunction(const std::string& name, FunctionDoc doc, DCHECK_OK(registry->AddFunction(std::move(func))); } -Status PreallocatePrimitiveArrayData(KernelContext* ctx, int64_t length, int bit_width, - bool allocate_validity, ArrayData* out) { - // Preallocate memory - out->length = length; - out->buffers.resize(2); - - if (allocate_validity) { - ARROW_ASSIGN_OR_RAISE(out->buffers[0], ctx->AllocateBitmap(length)); - } - if (bit_width == 1) { - 
ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->AllocateBitmap(length)); - } else { - ARROW_ASSIGN_OR_RAISE(out->buffers[1], - ctx->Allocate(bit_util::BytesForBits(length * bit_width))); - } - return Status::OK(); -} - namespace { /// \brief Iterate over a REE filter, emitting ranges of a plain values array that @@ -909,6 +892,20 @@ Status LargeListFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult } Status FSLFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + const ArraySpan& values = batch[0].array; + + // If a FixedSizeList wraps a fixed-width type we can, in some cases, use + // PrimitiveFilterExec for a fixed-size list array. + if (util::IsFixedWidthLike(values, + /*force_null_count=*/true, + /*exclude_dictionary=*/true)) { + const auto byte_width = util::FixedWidthInBytes(*values.type); + // 0 is a valid byte width for FixedSizeList, but PrimitiveFilterExec + // might not handle it correctly. + if (byte_width > 0) { + return PrimitiveFilterExec(ctx, batch, out); + } + } return FilterExec(ctx, batch, out); } @@ -968,6 +965,29 @@ Status LargeListTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* } Status FSLTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + const ArraySpan& values = batch[0].array; + + // If a FixedSizeList wraps a fixed-width type we can, in some cases, use + // PrimitiveTakeExec for a fixed-size list array. + if (util::IsFixedWidthLike(values, + /*force_null_count=*/true, + /*exclude_dictionary=*/true)) { + const auto byte_width = util::FixedWidthInBytes(*values.type); + // Additionally, PrimitiveTakeExec is only implemented for specific byte widths. + // TODO(GH-41301): Extend PrimitiveTakeExec for any fixed-width type. + switch (byte_width) { + case 1: + case 2: + case 4: + case 8: + case 16: + case 32: + return PrimitiveTakeExec(ctx, batch, out); + default: + break; // fallback to TakeExec + } + } + return TakeExec(ctx, batch, out); } diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.h b/cpp/src/arrow/compute/kernels/vector_selection_internal.h index 95f3e51cd67e3..a169f4b38a2b8 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_internal.h +++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.h @@ -45,12 +45,6 @@ void RegisterSelectionFunction(const std::string& name, FunctionDoc doc, const FunctionOptions* default_options, FunctionRegistry* registry); -/// \brief Allocate an ArrayData for a primitive array with a given length and bit width -/// -/// \param[in] bit_width 1 or a multiple of 8 -Status PreallocatePrimitiveArrayData(KernelContext* ctx, int64_t length, int bit_width, - bool allocate_validity, ArrayData* out); - /// \brief Callback type for VisitPlainxREEFilterOutputSegments. /// /// position is the logical position in the values array relative to its offset. 
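To make the new dispatch rule concrete: a minimal sketch of how the gate used by `FSLTakeExec`/`FSLFilterExec` above classifies a fixed-size-list array. It is illustrative only (not part of the diff) and assumes the test-only helper `arrow::ArrayFromJSON` from `arrow/testing/gtest_util.h`:

```cpp
#include <cassert>
#include <cstdint>

#include "arrow/testing/gtest_util.h"  // ArrayFromJSON (test-only helper)
#include "arrow/util/fixed_width_internal.h"

void SketchFixedSizeListDispatch() {
  // fixed_size_list<int32, 2> flattens to 2 * 4 = 8-byte value blocks.
  auto values = arrow::ArrayFromJSON(arrow::fixed_size_list(arrow::int32(), 2),
                                     "[[1, 2], [3, 4]]");
  arrow::ArraySpan span{*values->data()};
  if (arrow::util::IsFixedWidthLike(span, /*force_null_count=*/true,
                                    /*exclude_dictionary=*/true)) {
    const int64_t byte_width = arrow::util::FixedWidthInBytes(*span.type);
    // 8 is one of the byte widths (1/2/4/8/16/32) PrimitiveTakeExec accepts,
    // so Take on this array can reuse the primitive kernel instead of the
    // generic TakeExec fallback.
    assert(byte_width == 8);
  }
}
```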
@@ -70,6 +64,7 @@ void VisitPlainxREEFilterOutputSegments( FilterOptions::NullSelectionBehavior null_selection, const EmitREEFilterSegment& emit_segment); +Status PrimitiveFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status ListFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status LargeListFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status FSLFilterExec(KernelContext*, const ExecSpan&, ExecResult*); diff --git a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc index 5cd3710828485..48a2de9936cd4 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc @@ -37,6 +37,7 @@ #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" +#include "arrow/util/fixed_width_internal.h" #include "arrow/util/int_util.h" #include "arrow/util/ree_util.h" @@ -323,7 +324,7 @@ namespace { using TakeState = OptionsWrapper; // ---------------------------------------------------------------------- -// Implement optimized take for primitive types from boolean to 1/2/4/8-byte +// Implement optimized take for primitive types from boolean to 1/2/4/8/16/32-byte // C-type based types. Use common implementation for every byte width and only // generate code for unsigned integer indices, since after boundschecking to // check for negative numbers in the indices we can safely reinterpret_cast @@ -333,16 +334,20 @@ using TakeState = OptionsWrapper; /// use the logical Arrow type but rather the physical C type. This way we /// only generate one take function for each byte width. /// -/// This function assumes that the indices have been boundschecked. +/// Also note that this function can also handle fixed-size-list arrays if +/// they fit the criteria described in fixed_width_internal.h, so use the +/// function defined in that file to access values and destination pointers +/// and DO NOT ASSUME `values.type()` is a primitive type. +/// +/// \pre the indices have been boundschecked template struct PrimitiveTakeImpl { static constexpr int kValueWidth = ValueWidthConstant::value; static void Exec(const ArraySpan& values, const ArraySpan& indices, ArrayData* out_arr) { - DCHECK_EQ(values.type->byte_width(), kValueWidth); - const auto* values_data = - values.GetValues(1, 0) + kValueWidth * values.offset; + DCHECK_EQ(util::FixedWidthInBytes(*values.type), kValueWidth); + const auto* values_data = util::OffsetPointerOfFixedWidthValues(values); const uint8_t* values_is_valid = values.buffers[0].data; auto values_offset = values.offset; @@ -350,16 +355,15 @@ struct PrimitiveTakeImpl { const uint8_t* indices_is_valid = indices.buffers[0].data; auto indices_offset = indices.offset; - auto out = out_arr->GetMutableValues(1, 0) + kValueWidth * out_arr->offset; + DCHECK_EQ(out_arr->offset, 0); + auto* out = util::MutableFixedWidthValuesPointer(out_arr); auto out_is_valid = out_arr->buffers[0]->mutable_data(); - auto out_offset = out_arr->offset; - DCHECK_EQ(out_offset, 0); // If either the values or indices have nulls, we preemptively zero out the // out validity bitmap so that we don't have to use ClearBit in each // iteration for nulls. 
if (values.null_count != 0 || indices.null_count != 0) { - bit_util::SetBitsTo(out_is_valid, out_offset, indices.length, false); + bit_util::SetBitsTo(out_is_valid, 0, indices.length, false); } auto WriteValue = [&](int64_t position) { @@ -386,7 +390,7 @@ struct PrimitiveTakeImpl { valid_count += block.popcount; if (block.popcount == block.length) { // Fastest path: neither values nor index nulls - bit_util::SetBitsTo(out_is_valid, out_offset + position, block.length, true); + bit_util::SetBitsTo(out_is_valid, position, block.length, true); for (int64_t i = 0; i < block.length; ++i) { WriteValue(position); ++position; @@ -396,7 +400,7 @@ struct PrimitiveTakeImpl { for (int64_t i = 0; i < block.length; ++i) { if (bit_util::GetBit(indices_is_valid, indices_offset + position)) { // index is not null - bit_util::SetBit(out_is_valid, out_offset + position); + bit_util::SetBit(out_is_valid, position); WriteValue(position); } else { WriteZero(position); @@ -416,7 +420,7 @@ struct PrimitiveTakeImpl { values_offset + indices_data[position])) { // value is not null WriteValue(position); - bit_util::SetBit(out_is_valid, out_offset + position); + bit_util::SetBit(out_is_valid, position); ++valid_count; } else { WriteZero(position); @@ -433,7 +437,7 @@ struct PrimitiveTakeImpl { values_offset + indices_data[position])) { // index is not null && value is not null WriteValue(position); - bit_util::SetBit(out_is_valid, out_offset + position); + bit_util::SetBit(out_is_valid, position); ++valid_count; } else { WriteZero(position); @@ -584,14 +588,17 @@ Status PrimitiveTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ArrayData* out_arr = out->array_data().get(); - const int bit_width = values.type->bit_width(); + DCHECK(util::IsFixedWidthLike(values, /*force_null_count=*/false, + /*exclude_dictionary=*/true)); + const int64_t bit_width = util::FixedWidthInBits(*values.type); // TODO: When neither values nor indices contain nulls, we can skip // allocating the validity bitmap altogether and save time and space. A // streamlined PrimitiveTakeImpl would need to be written that skips all // interactions with the output validity bitmap, though. 
- RETURN_NOT_OK(PreallocatePrimitiveArrayData(ctx, indices.length, bit_width, - /*allocate_validity=*/true, out_arr)); + RETURN_NOT_OK(util::internal::PreallocateFixedWidthArrayData( + ctx, indices.length, /*source=*/values, + /*allocate_validity=*/true, out_arr)); switch (bit_width) { case 1: TakeIndexDispatch(values, indices, out_arr); diff --git a/cpp/src/arrow/compute/kernels/vector_selection_test.cc b/cpp/src/arrow/compute/kernels/vector_selection_test.cc index ec94b328ea361..4c7d85b103f36 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_test.cc @@ -23,6 +23,7 @@ #include #include +#include "arrow/array/builder_nested.h" #include "arrow/array/concatenate.h" #include "arrow/chunked_array.h" #include "arrow/compute/api.h" @@ -32,6 +33,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/testing/util.h" +#include "arrow/util/fixed_width_test_util.h" #include "arrow/util/logging.h" namespace arrow { @@ -726,7 +728,37 @@ TEST_F(TestFilterKernelWithLargeList, FilterListInt32) { "[[1,2], null, null]"); } -class TestFilterKernelWithFixedSizeList : public TestFilterKernel {}; +class TestFilterKernelWithFixedSizeList : public TestFilterKernel { + protected: + std::vector> five_length_filters_ = { + ArrayFromJSON(boolean(), "[false, false, false, false, false]"), + ArrayFromJSON(boolean(), "[true, true, true, true, true]"), + ArrayFromJSON(boolean(), "[false, true, true, false, true]"), + ArrayFromJSON(boolean(), "[null, true, null, false, true]"), + }; + + void AssertFilterOnNestedLists(const std::shared_ptr& inner_type, + const std::vector& list_sizes) { + using NLG = ::arrow::util::internal::NestedListGenerator; + constexpr int64_t kLength = 5; + // Create two equivalent lists: one as a FixedSizeList and another as a List. + ASSERT_OK_AND_ASSIGN(auto fsl_list, + NLG::NestedFSLArray(inner_type, list_sizes, kLength)); + ASSERT_OK_AND_ASSIGN(auto list, + NLG::NestedListArray(inner_type, list_sizes, kLength)); + + ARROW_SCOPED_TRACE("CheckTakeOnNestedLists of type `", *fsl_list->type(), "`"); + + for (auto& filter : five_length_filters_) { + // Use the Filter on ListType as the reference implementation. 
+ ASSERT_OK_AND_ASSIGN(auto expected_list, + Filter(*list, *filter, /*options=*/emit_null_)); + ASSERT_OK_AND_ASSIGN(auto expected_fsl, Cast(expected_list, fsl_list->type())); + auto expected_fsl_array = expected_fsl.make_array(); + this->AssertFilter(fsl_list, filter, expected_fsl_array); + } + } +}; TEST_F(TestFilterKernelWithFixedSizeList, FilterFixedSizeListInt32) { std::string list_json = "[null, [1, null, 3], [4, 5, 6], [7, 8, null]]"; @@ -740,6 +772,33 @@ TEST_F(TestFilterKernelWithFixedSizeList, FilterFixedSizeListInt32) { "[[1, null, 3], [7, 8, null]]"); } +TEST_F(TestFilterKernelWithFixedSizeList, FilterFixedSizeListVarWidth) { + std::string list_json = + R"([["zero", "one", ""], ["two", "", "three"], ["four", "five", "six"], ["seven", "eight", ""]])"; + this->AssertFilter(fixed_size_list(utf8(), 3), list_json, "[0, 0, 0, 0]", "[]"); + this->AssertFilter(fixed_size_list(utf8(), 3), list_json, "[0, 1, 1, null]", + R"([["two", "", "three"], ["four", "five", "six"], null])"); + this->AssertFilter(fixed_size_list(utf8(), 3), list_json, "[0, 0, 1, null]", + R"([["four", "five", "six"], null])"); + this->AssertFilter(fixed_size_list(utf8(), 3), list_json, "[1, 1, 1, 1]", list_json); + this->AssertFilter(fixed_size_list(utf8(), 3), list_json, "[0, 1, 0, 1]", + R"([["two", "", "three"], ["seven", "eight", ""]])"); +} + +TEST_F(TestFilterKernelWithFixedSizeList, FilterFixedSizeListModuloNesting) { + using NLG = ::arrow::util::internal::NestedListGenerator; + const std::vector> value_types = { + int16(), + int32(), + int64(), + }; + NLG::VisitAllNestedListConfigurations( + value_types, [this](const std::shared_ptr& inner_type, + const std::vector& list_sizes) { + this->AssertFilterOnNestedLists(inner_type, list_sizes); + }); +} + class TestFilterKernelWithMap : public TestFilterKernel {}; TEST_F(TestFilterKernelWithMap, FilterMapStringToInt32) { @@ -1034,29 +1093,34 @@ Status TakeJSON(const std::shared_ptr& type, const std::string& values .Value(out); } +void DoCheckTake(const std::shared_ptr& values, + const std::shared_ptr& indices, + const std::shared_ptr& expected) { + AssertTakeArrays(values, indices, expected); + + // Check sliced values + ASSERT_OK_AND_ASSIGN(auto values_filler, MakeArrayOfNull(values->type(), 2)); + ASSERT_OK_AND_ASSIGN(auto values_sliced, + Concatenate({values_filler, values, values_filler})); + values_sliced = values_sliced->Slice(2, values->length()); + AssertTakeArrays(values_sliced, indices, expected); + + // Check sliced indices + ASSERT_OK_AND_ASSIGN(auto zero, MakeScalar(indices->type(), int8_t{0})); + ASSERT_OK_AND_ASSIGN(auto indices_filler, MakeArrayFromScalar(*zero, 3)); + ASSERT_OK_AND_ASSIGN(auto indices_sliced, + Concatenate({indices_filler, indices, indices_filler})); + indices_sliced = indices_sliced->Slice(3, indices->length()); + AssertTakeArrays(values, indices_sliced, expected); +} + void CheckTake(const std::shared_ptr& type, const std::string& values_json, const std::string& indices_json, const std::string& expected_json) { auto values = ArrayFromJSON(type, values_json); auto expected = ArrayFromJSON(type, expected_json); - for (auto index_type : {int8(), uint32()}) { auto indices = ArrayFromJSON(index_type, indices_json); - AssertTakeArrays(values, indices, expected); - - // Check sliced values - ASSERT_OK_AND_ASSIGN(auto values_filler, MakeArrayOfNull(type, 2)); - ASSERT_OK_AND_ASSIGN(auto values_sliced, - Concatenate({values_filler, values, values_filler})); - values_sliced = values_sliced->Slice(2, values->length()); - 
AssertTakeArrays(values_sliced, indices, expected); - - // Check sliced indices - ASSERT_OK_AND_ASSIGN(auto zero, MakeScalar(index_type, int8_t{0})); - ASSERT_OK_AND_ASSIGN(auto indices_filler, MakeArrayFromScalar(*zero, 3)); - ASSERT_OK_AND_ASSIGN(auto indices_sliced, - Concatenate({indices_filler, indices, indices_filler})); - indices_sliced = indices_sliced->Slice(3, indices->length()); - AssertTakeArrays(values, indices_sliced, expected); + DoCheckTake(values, indices, expected); } } @@ -1427,7 +1491,25 @@ TEST_F(TestTakeKernelWithLargeList, TakeLargeListInt32) { CheckTake(large_list(int32()), list_json, "[null, 1, 2, 0]", "[null, [1,2], null, []]"); } -class TestTakeKernelWithFixedSizeList : public TestTakeKernelTyped {}; +class TestTakeKernelWithFixedSizeList : public TestTakeKernelTyped { + protected: + void CheckTakeOnNestedLists(const std::shared_ptr& inner_type, + const std::vector& list_sizes, int64_t length) { + using NLG = ::arrow::util::internal::NestedListGenerator; + // Create two equivalent lists: one as a FixedSizeList and another as a List. + ASSERT_OK_AND_ASSIGN(auto fsl_list, + NLG::NestedFSLArray(inner_type, list_sizes, length)); + ASSERT_OK_AND_ASSIGN(auto list, NLG::NestedListArray(inner_type, list_sizes, length)); + + ARROW_SCOPED_TRACE("CheckTakeOnNestedLists of type `", *fsl_list->type(), "`"); + + auto indices = ArrayFromJSON(int64(), "[1, 2, 4]"); + // Use the Take on ListType as the reference implementation. + ASSERT_OK_AND_ASSIGN(auto expected_list, Take(*list, *indices)); + ASSERT_OK_AND_ASSIGN(auto expected_fsl, Cast(*expected_list, fsl_list->type())); + DoCheckTake(fsl_list, indices, expected_fsl); + } +}; TEST_F(TestTakeKernelWithFixedSizeList, TakeFixedSizeListInt32) { std::string list_json = "[null, [1, null, 3], [4, 5, 6], [7, 8, null]]"; @@ -1449,6 +1531,42 @@ TEST_F(TestTakeKernelWithFixedSizeList, TakeFixedSizeListInt32) { "[0, 1, 0]"); } +TEST_F(TestTakeKernelWithFixedSizeList, TakeFixedSizeListVarWidth) { + std::string list_json = + R"([["zero", "one", ""], ["two", "", "three"], ["four", "five", "six"], ["seven", "eight", ""]])"; + CheckTake(fixed_size_list(utf8(), 3), list_json, "[]", "[]"); + CheckTake(fixed_size_list(utf8(), 3), list_json, "[3, 2, 1]", + R"([["seven", "eight", ""], ["four", "five", "six"], ["two", "", "three"]])"); + CheckTake(fixed_size_list(utf8(), 3), list_json, "[null, 2, 0]", + R"([null, ["four", "five", "six"], ["zero", "one", ""]])"); + CheckTake(fixed_size_list(utf8(), 3), list_json, R"([null, null])", "[null, null]"); + CheckTake( + fixed_size_list(utf8(), 3), list_json, "[3, 0, 0,3]", + R"([["seven", "eight", ""], ["zero", "one", ""], ["zero", "one", ""], ["seven", "eight", ""]])"); + CheckTake(fixed_size_list(utf8(), 3), list_json, "[0, 1, 2, 3]", list_json); + CheckTake(fixed_size_list(utf8(), 3), list_json, "[2, 2, 2, 2, 2, 2, 1]", + R"([ + ["four", "five", "six"], ["four", "five", "six"], + ["four", "five", "six"], ["four", "five", "six"], + ["four", "five", "six"], ["four", "five", "six"], + ["two", "", "three"] + ])"); +} + +TEST_F(TestTakeKernelWithFixedSizeList, TakeFixedSizeListModuloNesting) { + using NLG = ::arrow::util::internal::NestedListGenerator; + const std::vector> value_types = { + int16(), + int32(), + int64(), + }; + NLG::VisitAllNestedListConfigurations( + value_types, [this](const std::shared_ptr& inner_type, + const std::vector& list_sizes) { + this->CheckTakeOnNestedLists(inner_type, list_sizes, /*length=*/5); + }); +} + class TestTakeKernelWithMap : public TestTakeKernelTyped {}; 
TEST_F(TestTakeKernelWithMap, TakeMapStringToInt32) { diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index e26efba28594b..087e4e3879e56 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -56,6 +56,7 @@ add_arrow_test(utility-test compression_test.cc decimal_test.cc float16_test.cc + fixed_width_test.cc formatting_util_test.cc key_value_metadata_test.cc hashing_test.cc diff --git a/cpp/src/arrow/util/fixed_width_internal.cc b/cpp/src/arrow/util/fixed_width_internal.cc new file mode 100644 index 0000000000000..164af3cff66b3 --- /dev/null +++ b/cpp/src/arrow/util/fixed_width_internal.cc @@ -0,0 +1,226 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/array/data.h" +#include "arrow/compute/kernel.h" +#include "arrow/result.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/fixed_width_internal.h" +#include "arrow/util/logging.h" +#include "arrow/util/small_vector.h" + +namespace arrow::util { + +using ::arrow::internal::checked_cast; + +bool IsFixedWidthLike(const ArraySpan& source, bool force_null_count, + bool exclude_dictionary) { + return IsFixedWidthLike(source, force_null_count, + [exclude_dictionary](const DataType& type) { + return !exclude_dictionary || type.id() != Type::DICTIONARY; + }); +} + +static int64_t FixedWidthInBytesFallback(const FixedSizeListType& fixed_size_list_type) { + auto* fsl = &fixed_size_list_type; + int64_t list_size = fsl->list_size(); + for (auto type = fsl->value_type().get();;) { + if (type->id() == Type::FIXED_SIZE_LIST) { + fsl = checked_cast(type); + list_size *= fsl->list_size(); + type = fsl->value_type().get(); + continue; + } + if (type->id() != Type::BOOL && is_fixed_width(type->id())) { + const int64_t flat_byte_width = list_size * type->byte_width(); + DCHECK_GE(flat_byte_width, 0); + return flat_byte_width; + } + break; + } + return -1; +} + +int64_t FixedWidthInBytes(const DataType& type) { + auto type_id = type.id(); + if (is_fixed_width(type_id)) { + const int32_t num_bits = type.bit_width(); + return (type_id == Type::BOOL) ? 
-1 : num_bits / 8; + } + if (type_id == Type::FIXED_SIZE_LIST) { + auto& fsl = ::arrow::internal::checked_cast(type); + return FixedWidthInBytesFallback(fsl); + } + return -1; +} + +int64_t FixedWidthInBits(const DataType& type) { + auto type_id = type.id(); + if (is_fixed_width(type_id)) { + return type.bit_width(); + } + const int64_t byte_width = FixedWidthInBytes(type); + if (ARROW_PREDICT_FALSE(byte_width < 0)) { + return -1; + } + return byte_width * 8; +} + +namespace internal { + +Status PreallocateFixedWidthArrayData(::arrow::compute::KernelContext* ctx, + int64_t length, const ArraySpan& source, + bool allocate_validity, ArrayData* out) { + DCHECK(!source.MayHaveNulls() || allocate_validity) + << "allocate_validity cannot be false if source may have nulls"; + DCHECK_EQ(source.type->id(), out->type->id()); + auto* type = source.type; + out->length = length; + if (type->id() == Type::FIXED_SIZE_LIST) { + out->buffers.resize(1); + out->child_data = {std::make_shared()}; + } else { + out->buffers.resize(2); + } + if (allocate_validity) { + ARROW_ASSIGN_OR_RAISE(out->buffers[0], ctx->AllocateBitmap(length)); + } + + if (type->id() == Type::BOOL) { + ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->AllocateBitmap(length)); + return Status::OK(); + } + if (is_fixed_width(type->id())) { + if (type->id() == Type::DICTIONARY) { + return Status::NotImplemented( + "PreallocateFixedWidthArrayData: DICTIONARY type allocation: ", *type); + } + ARROW_ASSIGN_OR_RAISE(out->buffers[1], + ctx->Allocate(length * source.type->byte_width())); + return Status::OK(); + } + if (type->id() == Type::FIXED_SIZE_LIST) { + auto& fsl_type = checked_cast(*type); + auto& value_type = fsl_type.value_type(); + if (ARROW_PREDICT_FALSE(value_type->id() == Type::BOOL)) { + return Status::Invalid("PreallocateFixedWidthArrayData: Invalid type: ", fsl_type); + } + if (ARROW_PREDICT_FALSE(value_type->id() == Type::DICTIONARY)) { + return Status::NotImplemented( + "PreallocateFixedWidthArrayData: DICTIONARY type allocation: ", *type); + } + if (source.child_data[0].MayHaveNulls()) { + return Status::Invalid( + "PreallocateFixedWidthArrayData: " + "FixedSizeList may have null values in child array: ", + fsl_type); + } + auto* child_values = out->child_data[0].get(); + child_values->type = value_type; + return PreallocateFixedWidthArrayData(ctx, length * fsl_type.list_size(), + /*source=*/source.child_data[0], + /*allocate_validity=*/false, + /*out=*/child_values); + } + return Status::Invalid("PreallocateFixedWidthArrayData: Invalid type: ", *type); +} + +} // namespace internal + +/// \pre same as OffsetPointerOfFixedWidthValues +/// \pre source.type->id() != Type::BOOL +static const uint8_t* OffsetPointerOfFixedWidthValuesFallback(const ArraySpan& source) { + using OffsetAndListSize = std::pair; + auto get_offset = [](auto pair) { return pair.first; }; + auto get_list_size = [](auto pair) { return pair.second; }; + ::arrow::internal::SmallVector stack; + + DCHECK_NE(source.type->id(), Type::BOOL); + + int64_t list_size = 1; + auto* array = &source; + while (array->type->id() == Type::FIXED_SIZE_LIST) { + list_size *= checked_cast(array->type)->list_size(); + stack.emplace_back(array->offset, list_size); + array = &array->child_data[0]; + } + // Now that innermost values were reached, pop the stack and calculate the offset + // in bytes of the innermost values buffer by considering the offset at each + // level of nesting. 
+ DCHECK(array->type->id() != Type::BOOL && is_fixed_width(*array->type)); + DCHECK(array == &source || !array->MayHaveNulls()) + << "OffsetPointerOfFixedWidthValues: array is expected to be flat or have no " + "nulls in the arrays nested by FIXED_SIZE_LIST."; + int64_t value_width = array->type->byte_width(); + int64_t offset_in_bytes = array->offset * value_width; + for (auto it = stack.rbegin(); it != stack.rend(); ++it) { + value_width *= get_list_size(*it); + offset_in_bytes += get_offset(*it) * value_width; + } + return value_width < 0 ? nullptr : array->GetValues(1, offset_in_bytes); +} + +const uint8_t* OffsetPointerOfFixedWidthValues(const ArraySpan& source) { + auto type_id = source.type->id(); + if (is_fixed_width(type_id)) { + if (ARROW_PREDICT_FALSE(type_id == Type::BOOL)) { + // BOOL arrays are bit-packed, thus a byte-aligned pointer cannot be produced in the + // general case. Returning something for BOOL arrays that happen to byte-align + // because offset=0 would create too much confusion. + return nullptr; + } + return source.GetValues(1, 0) + source.offset * source.type->byte_width(); + } + return OffsetPointerOfFixedWidthValuesFallback(source); +} + +/// \brief Get the mutable pointer to the fixed-width values of an array +/// allocated by PreallocateFixedWidthArrayData. +/// +/// \pre mutable_array->offset and the offset of child array (if it's a +/// FixedSizeList) MUST be 0 (recursively). +/// \pre IsFixedWidthLike(ArraySpan(mutable_array)) or the more restrictive +/// is_fixed_width(*mutable_array->type) MUST be true +/// \return The mutable pointer to the fixed-width byte blocks of the array. If +/// pre-conditions are not satisfied, the return values is undefined. +uint8_t* MutableFixedWidthValuesPointer(ArrayData* mutable_array) { + auto type_id = mutable_array->type->id(); + if (type_id == Type::FIXED_SIZE_LIST) { + auto* array = mutable_array; + do { + DCHECK_EQ(array->offset, 0); + DCHECK_EQ(array->child_data.size(), 1) << array->type->ToString(true) << " part of " + << mutable_array->type->ToString(true); + array = array->child_data[0].get(); + } while (array->type->id() == Type::FIXED_SIZE_LIST); + DCHECK_EQ(array->offset, 0); + DCHECK(array->type->id() != Type::BOOL && is_fixed_width(*array->type)); + return array->GetMutableValues(1, 0); + } + DCHECK_EQ(mutable_array->offset, 0); + // BOOL is allowed here only because the offset is expected to be 0, + // so the byte-aligned pointer also points to the first *bit* of the buffer. + DCHECK(is_fixed_width(type_id)); + return mutable_array->GetMutableValues(1, 0); +} + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/fixed_width_internal.h b/cpp/src/arrow/util/fixed_width_internal.h new file mode 100644 index 0000000000000..f6959485fbd01 --- /dev/null +++ b/cpp/src/arrow/util/fixed_width_internal.h @@ -0,0 +1,307 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/array/data.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "arrow/type_traits.h" + +namespace arrow::compute { +// XXX: remove dependency on compute::KernelContext +class KernelContext; +} // namespace arrow::compute + +namespace arrow::util { + +/// \brief Checks if the given array has a fixed-width type or if it's an array of +/// fixed-size list that can be flattened to an array of fixed-width values. +/// +/// Fixed-width types are the ones defined by the is_fixed_width() predicate in +/// type_traits.h. They are all the types that pass any of the following +/// predicates: +/// +/// - is_primitive() +/// - is_fixed_size_binary() +/// - is_dictionary() +/// +/// At least 3 types in this set require special care: +/// - `Type::BOOL` is fixed-width, but it's a 1-bit type and pointers to the first bit +/// in boolean buffers are not always aligned to byte boundaries. +/// - `Type::DICTIONARY` is fixed-width because the indices are fixed-width, but the +/// dictionary values are not necessarily fixed-width and have to be managed +/// by separate operations. +/// - `Type::FIXED_SIZE_BINARY`: unlike other fixed-width types, fixed-size binary +/// values are defined by a size attribute that is not known at compile time. +/// The other types have power-of-2 byte widths, while fixed-size binary can +/// have any byte width including 0. +/// +/// Additionally, we say that a type is "fixed-width like" if it's fixed-width as +/// defined above, or if it's a fixed-size list (or nested fixed-size lists) and +/// the innermost type is fixed-width and the following restrictions also apply: +/// - The value type of the innermost fixed-size list is not BOOL (it has to be excluded +/// because a 1-bit type doesn't byte-align) +/// - Only the top-level array may have nulls; all the inner arrays have to be completely +/// free of nulls so we don't need to manage internal validity bitmaps. +/// +/// Take the following `fixed_size_list<fixed_size_list<int32, 2>, 3>` array as an +/// example: +/// +/// [ +/// [[1, 2], [3, 4], [ 5, 6]], +/// null, +/// [[7, 8], [9, 10], [11, 12]] +/// ] +/// +/// in memory, it would look like: +/// +/// { +/// type: fixed_size_list<fixed_size_list<int32, 2>, 3>, +/// length: 3, +/// null_count: 1, +/// offset: 0, +/// buffers: [ +/// 0: [0b00000101] +/// ], +/// child_data: [ +/// 0: { +/// type: fixed_size_list<int32, 2>, +/// length: 9, +/// null_count: 0, +/// offset: 0, +/// buffers: [0: NULL], +/// child_data: [ +/// 0: { +/// type: int32, +/// length: 18, +/// null_count: 0, +/// offset: 0, +/// buffers: [ +/// 0: NULL, +/// 1: [ 1, 2, 3, 4, 5, 6, +/// 0, 0, 0, 0, 0, 0, +/// 7, 8, 9, 10, 11, 12 ] +/// ], +/// child_data: [] +/// } +/// ] +/// } +/// ] +/// } +/// +/// This layout fits the fixed-width like definition because the innermost type +/// is byte-aligned fixed-width (int32 = 4 bytes) and the internal arrays don't +/// have nulls. The validity bitmap is only needed at the top-level array. +/// +/// Writing to this array can be done in the same way writing to a flat fixed-width +/// array is done, by:
/// 1. Updating the validity bitmap at the top-level array if nulls are present. +/// 2. Updating a continuous fixed-width block of memory through a single pointer. +/// +/// The length of this block of memory is the product of the list sizes in the +/// `FixedSizeList` types and the byte width of the innermost fixed-width type: +/// +/// 3 * 2 * 4 = 24 bytes +/// +/// Writing the `[[1, 2], [3, 4], [5, 6]]` value at a given index can be done by +/// simply setting the validity bit to 1 and writing the 24-byte sequence of +/// integers `[1, 2, 3, 4, 5, 6]` to the memory block at `byte_ptr + index * 24`. +/// +/// The length of the top-level array fully defines the lengths that all the nested +/// arrays must have, which makes defining all the lengths as easy as defining the +/// length of the top-level array. +/// +/// length = 3 +/// child_data[0].length == 3 * 3 == 9 +/// child_data[0].child_data[0].length == 3 * 3 * 2 == 18 +/// +/// child_data[0].child_data[0].buffers[1].size() >= +/// (3 * (3 * 2 * sizeof(int32)) == 3 * 24 == 72) +/// +/// Dealing with offsets is a bit involved. Let's say the array described above has +/// the offsets 2, 5, and 7: +/// +/// { +/// type: fixed_size_list<fixed_size_list<int32, 2>, 3>, +/// offset: 2, +/// ... +/// child_data: [ +/// 0: { +/// type: fixed_size_list<int32, 2>, +/// offset: 5, +/// ... +/// child_data: [ +/// 0: { +/// type: int32, +/// offset: 7, +/// buffers: [ +/// 0: NULL, +/// 1: [ 1, 1, 1, 1, 1, 1, 1, // 7 values skipped +/// 0,1, 0,1, 0,1, 0,1, 0,1, // 5 [x,x] values skipped +/// +/// 0,0,0,0,0,1, // +/// 0,0,0,0,0,1, // 2 [[x,x], [x,x], [x,x]] values skipped +/// +/// 1, 2, 3, 4, 5, 6, // +/// 0, 0, 0, 0, 0, 0 // the actual values +/// 7, 8, 9, 10, 11, 12 // +/// ] +/// ], +/// } +/// ] +/// } +/// ] +/// } +/// +/// The offset of the innermost values buffer, in bytes, is calculated as: +/// +/// ((2 * 3 * 2) + (5 * 2) + 7) * sizeof(int32) = 29 * 4 bytes = 116 bytes +/// +/// In general, the formula to calculate the offset of the innermost values buffer is: +/// +/// ((off_0 * fsl_size_0 * fsl_size_1 * ...) + (off_1 * fsl_size_1 * ...) + ... + innermost_off) +/// * sizeof(innermost_type) +/// +/// where each offset is multiplied by the product of all list sizes nested below its +/// level. +/// +/// `OffsetPointerOfFixedWidthValues()` can calculate this byte offset and return the +/// pointer to the first relevant byte of the innermost values buffer. +/// +/// \param source The array to check +/// \param force_null_count If true, GetNullCount() is used instead of null_count +/// \param exclude_dictionary If true, DICTIONARY is excluded from the +/// is_fixed_width() types. Default: false. +ARROW_EXPORT bool IsFixedWidthLike(const ArraySpan& source, bool force_null_count = false, + bool exclude_dictionary = false); + +/// \brief Checks if the given array has a fixed-width type or if it's an array of +/// fixed-size list that can be flattened to an array of fixed-width values. +/// +/// This function is a more general version of +/// `IsFixedWidthLike(const ArraySpan&, bool)` that allows the caller to further +/// restrict the inner value types that should be considered fixed-width. +/// +/// \param source The array to check +/// \param force_null_count If true, GetNullCount() is used instead of null_count +/// \param extra_predicate A DataType predicate that can be used to further +/// restrict the types that are considered fixed-width +template <class ExtraPred> +inline bool IsFixedWidthLike(const ArraySpan& source, bool force_null_count, + ExtraPred extra_predicate) { + const auto* type = source.type; + // BOOL is considered fixed-width if not nested under FIXED_SIZE_LIST.
+ if (is_fixed_width(type->id()) && extra_predicate(*type)) { + return true; + } + if (type->id() == Type::FIXED_SIZE_LIST) { + // All the inner arrays must not contain any nulls. + const auto* values = &source.child_data[0]; + while ((force_null_count ? values->GetNullCount() : values->null_count) == 0) { + type = values->type; + if (type->id() == Type::FIXED_SIZE_LIST) { + values = &values->child_data[0]; + continue; + } + // BOOL has to be excluded because it's not byte-aligned. + return type->id() != Type::BOOL && is_fixed_width(type->id()) && + extra_predicate(*type); + } + } + return false; +} + +/// \brief Get the fixed-width in bytes of a type if it is a fixed-width like +/// type, but not BOOL. +/// +/// If the array is a FixedSizeList (of any level of nesting), the byte width of +/// the values is the product of all fixed-list sizes and the byte width of the +/// innermost fixed-width value type. +/// +/// IsFixedWidthLike(array) performs more checks than this function and should +/// be used to guarantee that, if type is not BOOL, this function will not return -1. +/// +/// NOTE: this function translates `DataType::bit_width()` to bytes differently from +/// `DataType::byte_width()`. `DataType::byte_width()` will return 0 for +/// BOOL, while this function will return `-1`. This is done because 0 is +/// a valid return value for FIXED_SIZE_LIST with size 0 or `FIXED_SIZE_BINARY` with +/// size 0. +/// +/// \pre The instance of the array where this type is from must pass +/// `IsFixedWidthLike(array)` and should not be BOOL. +/// \return The fixed-byte width of the values or -1 if the type is BOOL or not +/// fixed-width like. 0 is a valid return value as fixed-size-lists +/// and fixed-size-binary with size 0 are allowed. +ARROW_EXPORT int64_t FixedWidthInBytes(const DataType& type); + +/// \brief Get the fixed-width in bits of a type if it is a fixed-width like +/// type. +/// +/// \return The bit-width of the values or -1 +/// \see FixedWidthInBytes +ARROW_EXPORT int64_t FixedWidthInBits(const DataType& type); + +namespace internal { + +/// \brief Allocate an ArrayData for a type that is fixed-width like. +/// +/// This function performs the same checks performed by +/// `IsFixedWidthLike(source, false)`. If `source.type` is not a simple +/// fixed-width type, caller should make sure it passes the +/// `IsFixedWidthLike(source)` checks. That guarantees that it's possible to +/// allocate an array that can serve as a destination for a kernel that writes values +/// through a single pointer to fixed-width byte blocks. +/// +/// \param[in] length The length of the array to allocate (unrelated to the length of +/// the source array) +/// \param[in] source The source array that carries the type information and the +/// validity bitmaps that are relevant for the type validation +/// when the source is a FixedSizeList. +/// \see IsFixedWidthLike +ARROW_EXPORT Status PreallocateFixedWidthArrayData(::arrow::compute::KernelContext* ctx, + int64_t length, + const ArraySpan& source, + bool allocate_validity, + ArrayData* out); + +} // namespace internal + +/// \brief Get the pointer to the fixed-width values of a fixed-width like array. +/// +/// This function might return NULLPTR if the type of the array is BOOL or +/// if the pre-conditions listed are not satisfied. The converse is not true +/// (i.e. not getting NULLPTR doesn't guarantee that source is a fixed-width +/// like array). 
+/// +/// \pre `IsFixedWidthLike(source)` or the more restrictive +/// is_fixed_width(*source.type) SHOULD be true +/// \return The pointer to the fixed-width values of an array or NULLPTR +/// if pre-conditions are not satisfied. +ARROW_EXPORT const uint8_t* OffsetPointerOfFixedWidthValues(const ArraySpan& source); + +/// \brief Get the mutable pointer to the fixed-width values of an array +/// allocated by PreallocateFixedWidthArrayData. +/// +/// \pre mutable_array->offset and the offset of child array (if it's a +/// FixedSizeList) MUST be 0 (recursively). +/// \pre IsFixedWidthLike(ArraySpan(mutable_array)) or the more restrictive +/// is_fixed_width(*mutable_array->type) MUST be true +/// \return The mutable pointer to the fixed-width byte blocks of the array. If +/// pre-conditions are not satisfied, the return value is undefined. +ARROW_EXPORT uint8_t* MutableFixedWidthValuesPointer(ArrayData* mutable_array); + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/fixed_width_test.cc b/cpp/src/arrow/util/fixed_width_test.cc new file mode 100644 index 0000000000000..2f05221ed6535 --- /dev/null +++ b/cpp/src/arrow/util/fixed_width_test.cc @@ -0,0 +1,217 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
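Before the tests, a usage note on the `extra_predicate` overload declared in the header above: it can narrow the fast path further. A minimal sketch, not part of the patch; `AtLeastFourBytes` and `TakesWidePath` are hypothetical names:

```cpp
#include "arrow/util/fixed_width_internal.h"

// Hypothetical extra predicate: accept only value types 4+ bytes wide.
// DataType::byte_width() returns 0 for BOOL, so BOOL is rejected here too.
inline bool AtLeastFourBytes(const arrow::DataType& type) {
  return type.byte_width() >= 4;
}

inline bool TakesWidePath(const arrow::ArraySpan& span) {
  // The predicate is applied to the type that defines the value blocks: the
  // array's own type, or the innermost value type under FIXED_SIZE_LIST.
  // A fixed_size_list<int16, 4> array would be rejected here, for example.
  return arrow::util::IsFixedWidthLike(span, /*force_null_count=*/true,
                                       AtLeastFourBytes);
}
```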
+ +// #include +// #include + +#include <gtest/gtest.h> + +#include "arrow/array/array_base.h" +#include "arrow/array/data.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/fixed_width_internal.h" + +namespace arrow::util { + +namespace { +bool NotBool(const DataType& type) { return type.id() != Type::BOOL; } +bool NotInt32(const DataType& type) { return type.id() != Type::INT32; } +} // namespace + +class TestFixedWidth : public ::testing::Test { + protected: + std::shared_ptr<Array> bool_array_array_; + std::shared_ptr<Array> int_array_array_; + std::shared_ptr<Array> fsl_bool_array_; + std::shared_ptr<Array> fsl_int_array_; + std::shared_ptr<Array> fsl_int_nulls_array_; + std::shared_ptr<Array> fsl_int_inner_nulls_array_; + std::shared_ptr<Array> dict_string_array_; + + std::shared_ptr<DataType> fsl(int32_t list_size, + const std::shared_ptr<DataType>& value_type) { + return fixed_size_list(value_type, list_size); + } + + public: + void SetUp() override { + bool_array_array_ = ArrayFromJSON(boolean(), "[true, false, null]"); + int_array_array_ = ArrayFromJSON(int32(), "[1, 0, null]"); + fsl_bool_array_ = ArrayFromJSON(fsl(2, boolean()), "[[true, false]]"); + fsl_int_array_ = ArrayFromJSON(fsl(2, int32()), "[[1, 0], [2, 3]]"); + fsl_int_nulls_array_ = ArrayFromJSON(fsl(2, int32()), "[[1, 0], null, [1, 2]]"); + fsl_int_inner_nulls_array_ = + ArrayFromJSON(fsl(2, int32()), "[[1, 0], [2, 3], [null, 2]]"); + dict_string_array_ = + ArrayFromJSON(dictionary(int32(), utf8()), R"(["Alice", "Bob", "Alice"])"); + } +}; + +TEST_F(TestFixedWidth, IsFixedWidth) { + auto arr = ArraySpan{*bool_array_array_->data()}; + // force_null_count doesn't matter because nulls at the top-level + // of the array are allowed by IsFixedWidthLike. + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/true)); + + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false, NotInt32)); + ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/false, NotBool)); + + arr = ArraySpan{*int_array_array_->data()}; + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/true)); + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false, NotBool)); +} + +TEST_F(TestFixedWidth, IsFixedWidthLike) { + auto arr = ArraySpan{*fsl_bool_array_->data()}; + // bools wrapped by fixed-size-list are not fixed-width because the + // innermost data buffer is a bitmap and won't byte-align. + ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/true)); + + arr = ArraySpan{*fsl_int_array_->data()}; + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + arr.null_count = kUnknownNullCount; + // force_null_count=true isn't necessary because nulls at the top-level + // of the array are allowed by IsFixedWidthLike. + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + + arr.child_data[0].null_count = kUnknownNullCount; + // inner nulls are not allowed by IsFixedWidthLike... + ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + // ...but forcing null counting on every internal array increases + // the chances of IsFixedWidthLike returning true. + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/true)); + // Excluding INT32 from the internal array checks.
+ ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/true, NotInt32)); + + arr = ArraySpan{*fsl_int_nulls_array_->data()}; + // Nulls at the top-level of the array are allowed by IsFixedWidthLike. + // + // TODO(GH-10157): ArrayFromJSON uses FixedSizeListBuilder which currently + // produces nulls on the child data if one of the list-typed elements is null. + // ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + + arr = ArraySpan{*fsl_int_inner_nulls_array_->data()}; + // Inner nulls are not allowed by IsFixedWidthLike. + ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/true)); + + arr = ArraySpan{*dict_string_array_->data()}; + // Dictionaries are considered fixed-width by is_fixed_width(), but excluded + // by IsFixedWidthLike if exclude_dictionary=true. + ASSERT_TRUE(IsFixedWidthLike(arr)); + ASSERT_TRUE( + IsFixedWidthLike(arr, /*force_null_count=*/false, /*exclude_dictionary=*/false)); + ASSERT_FALSE( + IsFixedWidthLike(arr, /*force_null_count=*/false, /*exclude_dictionary=*/true)); +} + +TEST_F(TestFixedWidth, MeasureWidthInBytes) { + auto b = boolean(); + auto i8 = int8(); + auto i32 = int32(); + auto fsb = fixed_size_binary(3); + auto dict = dictionary(int32(), utf8()); + auto varlen = utf8(); + ASSERT_EQ(FixedWidthInBytes(*b), -1); + ASSERT_EQ(FixedWidthInBytes(*i8), 1); + ASSERT_EQ(FixedWidthInBytes(*i32), 4); + ASSERT_EQ(FixedWidthInBytes(*fsb), 3); + ASSERT_EQ(FixedWidthInBytes(*dict), 4); + + ASSERT_EQ(FixedWidthInBytes(*varlen), -1); + ASSERT_EQ(FixedWidthInBytes(*varlen), -1); + + ASSERT_EQ(FixedWidthInBytes(*fsl(0, b)), -1); + ASSERT_EQ(FixedWidthInBytes(*fsl(3, b)), -1); + ASSERT_EQ(FixedWidthInBytes(*fsl(5, b)), -1); + + ASSERT_EQ(FixedWidthInBytes(*fsl(0, i8)), 0); + ASSERT_EQ(FixedWidthInBytes(*fsl(3, i8)), 3); + ASSERT_EQ(FixedWidthInBytes(*fsl(5, i8)), 5); + ASSERT_EQ(FixedWidthInBytes(*fsl(0, i32)), 0); + ASSERT_EQ(FixedWidthInBytes(*fsl(3, i32)), 3 * 4); + ASSERT_EQ(FixedWidthInBytes(*fsl(5, i32)), 5 * 4); + ASSERT_EQ(FixedWidthInBytes(*fsl(5, fsb)), 5 * 3); + ASSERT_EQ(FixedWidthInBytes(*fsl(5, dict)), 5 * 4); + + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(0, i8))), 0); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(3, i8))), 2 * 3); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(5, i8))), 2 * 5); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(0, i32))), 0); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(3, i32))), 2 * 3 * 4); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(5, i32))), 2 * 5 * 4); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(0, fsb))), 0); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(3, fsb))), 2 * 3 * 3); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(5, fsb))), 2 * 5 * 3); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(0, dict))), 0); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(3, dict))), 2 * 3 * 4); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(5, dict))), 2 * 5 * 4); + + ASSERT_EQ(FixedWidthInBytes(*fsl(0, varlen)), -1); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, varlen)), -1); +} + +TEST_F(TestFixedWidth, MeasureWidthInBits) { + auto b = boolean(); + auto i8 = int8(); + auto i32 = int32(); + auto fsb = fixed_size_binary(3); + auto dict = dictionary(int32(), utf8()); + auto varlen = utf8(); + ASSERT_EQ(FixedWidthInBits(*b), 1); + ASSERT_EQ(FixedWidthInBits(*i8), 8); + ASSERT_EQ(FixedWidthInBits(*i32), 4 * 8); + ASSERT_EQ(FixedWidthInBits(*fsb), 3 * 8); + ASSERT_EQ(FixedWidthInBits(*dict), 4 * 8); + + ASSERT_EQ(FixedWidthInBits(*varlen), -1); + ASSERT_EQ(FixedWidthInBits(*varlen), -1); + + ASSERT_EQ(FixedWidthInBits(*fsl(0, b)), -1); + ASSERT_EQ(FixedWidthInBits(*fsl(3, 
b)), -1); + ASSERT_EQ(FixedWidthInBits(*fsl(5, b)), -1); + + ASSERT_EQ(FixedWidthInBits(*fsl(0, i8)), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(3, i8)), 3 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(5, i8)), 5 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(0, i32)), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(3, i32)), 4 * 3 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(5, i32)), 4 * 5 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(5, fsb)), 5 * 3 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(5, dict)), 5 * 4 * 8); + + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(0, i8))), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(3, i8))), 2 * 3 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(5, i8))), 2 * 5 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(0, i32))), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(3, i32))), 2 * 3 * 4 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(5, i32))), 2 * 5 * 4 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(0, fsb))), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(3, fsb))), 2 * 3 * 3 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(5, fsb))), 2 * 5 * 3 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(0, dict))), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(3, dict))), 2 * 3 * 4 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(5, dict))), 2 * 5 * 4 * 8); + + ASSERT_EQ(FixedWidthInBits(*fsl(0, varlen)), -1); + ASSERT_EQ(FixedWidthInBits(*fsl(2, varlen)), -1); +} + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/fixed_width_test_util.h b/cpp/src/arrow/util/fixed_width_test_util.h new file mode 100644 index 0000000000000..ca141b7ca2c4d --- /dev/null +++ b/cpp/src/arrow/util/fixed_width_test_util.h @@ -0,0 +1,203 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/array/builder_primitive.h" +#include "arrow/builder.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" + +namespace arrow::util::internal { + +class NestedListGenerator { + public: + /// \brief Create a nested FixedSizeListType. + /// + /// \return `fixed_size_list(fixed_size_list(..., sizes[1]), sizes[0])` + static std::shared_ptr<DataType> NestedFSLType( + const std::shared_ptr<DataType>& inner_type, const std::vector<int>& sizes) { + auto type = inner_type; + for (auto it = sizes.rbegin(); it != sizes.rend(); it++) { + type = fixed_size_list(std::move(type), *it); + } + return type; + } + + /// \brief Create a nested ListType.
+ /// + /// \return `list(list(...))` + static std::shared_ptr<DataType> NestedListType( + const std::shared_ptr<DataType>& inner_type, size_t depth) { + auto list_type = list(inner_type); + for (size_t i = 1; i < depth; i++) { + list_type = list(std::move(list_type)); + } + return list_type; + } + + private: + template <typename ArrowType> + static Status AppendNumeric(ArrayBuilder* builder, int64_t* next_value) { + using NumericBuilder = ::arrow::NumericBuilder<ArrowType>; + using value_type = typename NumericBuilder::value_type; + auto* numeric_builder = ::arrow::internal::checked_cast<NumericBuilder*>(builder); + auto cast_next_value = + static_cast<value_type>(*next_value % std::numeric_limits<value_type>::max()); + RETURN_NOT_OK(numeric_builder->Append(cast_next_value)); + *next_value += 1; + return Status::OK(); + } + + // Append([...[[*next_inner_value++, *next_inner_value++, ...]]...]) + static Status AppendNestedList(ArrayBuilder* nested_builder, const int* list_sizes, + int64_t* next_inner_value) { + using ::arrow::internal::checked_cast; + ArrayBuilder* builder = nested_builder; + auto type = builder->type(); + if (type->id() == Type::FIXED_SIZE_LIST || type->id() == Type::LIST) { + const int list_size = *list_sizes; + if (type->id() == Type::FIXED_SIZE_LIST) { + auto* fsl_builder = checked_cast<FixedSizeListBuilder*>(builder); + assert(list_size == checked_cast<const FixedSizeListType&>(*type).list_size()); + RETURN_NOT_OK(fsl_builder->Append()); + builder = fsl_builder->value_builder(); + } else { // type->id() == Type::LIST + auto* list_builder = checked_cast<ListBuilder*>(builder); + RETURN_NOT_OK(list_builder->Append(/*is_valid=*/true, list_size)); + builder = list_builder->value_builder(); + } + list_sizes++; + for (int i = 0; i < list_size; i++) { + RETURN_NOT_OK(AppendNestedList(builder, list_sizes, next_inner_value)); + } + } else { + switch (type->id()) { + case Type::INT8: + RETURN_NOT_OK(AppendNumeric<Int8Type>(builder, next_inner_value)); + break; + case Type::INT16: + RETURN_NOT_OK(AppendNumeric<Int16Type>(builder, next_inner_value)); + break; + case Type::INT32: + RETURN_NOT_OK(AppendNumeric<Int32Type>(builder, next_inner_value)); + break; + case Type::INT64: + RETURN_NOT_OK(AppendNumeric<Int64Type>(builder, next_inner_value)); + break; + default: + return Status::NotImplemented("Unsupported type: ", *type); + } + } + return Status::OK(); + } + + static Result<std::shared_ptr<Array>> NestedListArray( + ArrayBuilder* nested_builder, const std::vector<int>& list_sizes, int64_t length) { + int64_t next_inner_value = 0; + for (int64_t i = 0; i < length; i++) { + RETURN_NOT_OK( + AppendNestedList(nested_builder, list_sizes.data(), &next_inner_value)); + } + return nested_builder->Finish(); + } + + public: + static Result<std::shared_ptr<Array>> NestedFSLArray( + const std::shared_ptr<DataType>& inner_type, const std::vector<int>& list_sizes, + int64_t length) { + auto nested_type = NestedFSLType(inner_type, list_sizes); + ARROW_ASSIGN_OR_RAISE(auto builder, MakeBuilder(nested_type)); + return NestedListArray(builder.get(), list_sizes, length); + } + + static Result<std::shared_ptr<Array>> NestedListArray( + const std::shared_ptr<DataType>& inner_type, const std::vector<int>& list_sizes, + int64_t length) { + auto nested_type = NestedListType(inner_type, list_sizes.size()); + ARROW_ASSIGN_OR_RAISE(auto builder, MakeBuilder(nested_type)); + return NestedListArray(builder.get(), list_sizes, length); + } + + /// \brief Generate all possible nested list configurations of depth 1 to max_depth. + /// + /// Each configuration consists of a single inner value type and a list of sizes. + /// Both can be used with NestedFSLArray and NestedListArray to generate test data.
+ /// + /// The product of the list sizes and the size of the inner value type is always a power + /// of 2 no greater than max_power_of_2_size. For max_depth=3 and + /// max_power_of_2_size=32, this generates 108 configurations. + /// + /// \tparam Visit a function type with signature + /// void(const std::shared_ptr<DataType>& inner_type, + /// const std::vector<int>& list_sizes) + template <class Visit> + static void VisitAllNestedListConfigurations( + const std::vector<std::shared_ptr<DataType>>& inner_value_types, Visit&& visit, + int max_depth = 3, int max_power_of_2_size = 32) { + for (int depth = 1; depth <= max_depth; depth++) { + for (auto& type : inner_value_types) { + assert(is_fixed_width(*type)); + int value_width = type->byte_width(); + + std::vector<int> list_sizes; // stack of list sizes + auto pop = [&]() { // pop the list_sizes stack + assert(!list_sizes.empty()); + value_width /= list_sizes.back(); + list_sizes.pop_back(); + }; + auto next = [&]() { // double the top of the stack + assert(!list_sizes.empty()); + value_width *= 2; + list_sizes.back() *= 2; + return value_width; + }; + auto push_1s = [&]() { // fill the stack with 1s + while (list_sizes.size() < static_cast<size_t>(depth)) { + list_sizes.push_back(1); + } + }; + + // Loop invariants: + // value_width == product(list_sizes) * type->byte_width() + // value_width is a power-of-2 (1, 2, 4, 8, 16, max_power_of_2_size=32) + push_1s(); + do { + // for (auto x : list_sizes) printf("%d * ", x); + // printf("(%s) %d = %2d\n", type->name().c_str(), type->byte_width(), + // value_width); + visit(type, list_sizes); + // Advance to the next test case + while (!list_sizes.empty()) { + if (next() <= max_power_of_2_size) { + push_1s(); + break; + } + pop(); + } + } while (!list_sizes.empty()); + } + } + } +}; + +} // namespace arrow::util::internal From 0d8b3791cda042224427ccbb4fc2fc3ec0f27b61 Mon Sep 17 00:00:00 2001 From: Jacek Stania <38670505+janosik47@users.noreply.github.com> Date: Fri, 3 May 2024 07:24:06 +0100 Subject: [PATCH 063/106] GH-35888: [Java] Add FlightStatusCode.RESOURCE_EXHAUSTED (#41508) ### Rationale for this change Related to https://github.com/apache/arrow/issues/35888 Currently the gRPC Status.RESOURCE_EXHAUSTED exception/code is translated by the Java FlightServer into FlightStatusCode.INVALID_ARGUMENT and thrown to the client as a gRPC INVALID_ARGUMENT exception. That may mislead the other party, since INVALID_ARGUMENT indicates an input-parameter problem when in reality the backend server's intention was rather 'back off and try later'. ### What changes are included in this PR? Add the FlightStatusCode.RESOURCE_EXHAUSTED code and make sure it is translated from/to the gRPC Status.RESOURCE_EXHAUSTED. ### Are these changes tested? Unit tests included to validate the RESOURCE_EXHAUSTED translation between Flight and gRPC codes. ### Are there any user-facing changes? No. Users may start seeing the RESOURCE_EXHAUSTED code instead of INVALID_ARGUMENT. In both cases this is an exception seen on the client side, so I am considering this a _non-breaking change to any public API_. It may, however, influence client-side flows that react conditionally on the exception status code.
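For illustration, a rough client-side sketch of the handling this change enables. It is not part of the patch; `getWithBackoff` and `retryLater` are hypothetical names:

```java
// Sketch: back off on RESOURCE_EXHAUSTED instead of treating it as a caller
// bug the way INVALID_ARGUMENT would suggest. Assumes the
// org.apache.arrow.flight.{FlightClient, FlightStream, FlightRuntimeException,
// FlightStatusCode, Ticket} imports.
void getWithBackoff(FlightClient client, Ticket ticket) {
  try {
    FlightStream stream = client.getStream(ticket);
    while (stream.next()) {
      // consume stream.getRoot() ...
    }
  } catch (FlightRuntimeException e) {
    if (e.status().code() == FlightStatusCode.RESOURCE_EXHAUSTED) {
      retryLater(); // hypothetical helper: schedule a retry with backoff
    } else {
      throw e;
    }
  }
}
```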
* GitHub Issue: #35888 Authored-by: Jacek Stania Signed-off-by: David Li --- .../org/apache/arrow/flight/CallStatus.java | 1 + .../apache/arrow/flight/FlightStatusCode.java | 5 +++++ .../apache/arrow/flight/grpc/StatusUtils.java | 4 +++- .../arrow/flight/grpc/TestStatusUtils.java | 22 +++++++++++++++++++ 4 files changed, 31 insertions(+), 1 deletion(-) diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/CallStatus.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/CallStatus.java index 991d0ed6a043b..8fc2002207e24 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/CallStatus.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/CallStatus.java @@ -49,6 +49,7 @@ public class CallStatus { public static final CallStatus UNAUTHORIZED = FlightStatusCode.UNAUTHORIZED.toStatus(); public static final CallStatus UNIMPLEMENTED = FlightStatusCode.UNIMPLEMENTED.toStatus(); public static final CallStatus UNAVAILABLE = FlightStatusCode.UNAVAILABLE.toStatus(); + public static final CallStatus RESOURCE_EXHAUSTED = FlightStatusCode.RESOURCE_EXHAUSTED.toStatus(); /** * Create a new status. diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStatusCode.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStatusCode.java index 3d96877ba02de..09a2c7afda106 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStatusCode.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStatusCode.java @@ -71,6 +71,11 @@ public enum FlightStatusCode { * should send this code only if it has not done any work. */ UNAVAILABLE, + /** + * Some resource has been exhausted, perhaps a per-user quota, or perhaps the entire file system is out of space. 
+ * (see: https://grpc.github.io/grpc/core/md_doc_statuscodes.html) + */ + RESOURCE_EXHAUSTED ; /** diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/grpc/StatusUtils.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/grpc/StatusUtils.java index 7f0dcf2da3f0d..a2d9a85aaa442 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/grpc/StatusUtils.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/grpc/StatusUtils.java @@ -74,6 +74,8 @@ public static Status.Code toGrpcStatusCode(FlightStatusCode code) { return Code.UNIMPLEMENTED; case UNAVAILABLE: return Code.UNAVAILABLE; + case RESOURCE_EXHAUSTED: + return Code.RESOURCE_EXHAUSTED; default: return Code.UNKNOWN; } @@ -101,7 +103,7 @@ public static FlightStatusCode fromGrpcStatusCode(Status.Code code) { case PERMISSION_DENIED: return FlightStatusCode.UNAUTHORIZED; case RESOURCE_EXHAUSTED: - return FlightStatusCode.INVALID_ARGUMENT; + return FlightStatusCode.RESOURCE_EXHAUSTED; case FAILED_PRECONDITION: return FlightStatusCode.INVALID_ARGUMENT; case ABORTED: diff --git a/java/flight/flight-core/src/test/java/org/apache/arrow/flight/grpc/TestStatusUtils.java b/java/flight/flight-core/src/test/java/org/apache/arrow/flight/grpc/TestStatusUtils.java index 9912a26ea340a..730ea30a2f598 100644 --- a/java/flight/flight-core/src/test/java/org/apache/arrow/flight/grpc/TestStatusUtils.java +++ b/java/flight/flight-core/src/test/java/org/apache/arrow/flight/grpc/TestStatusUtils.java @@ -48,4 +48,26 @@ public void testParseTrailers() { Assertions.assertTrue(callStatus.metadata().containsKey("content-type")); Assertions.assertEquals("text/html", callStatus.metadata().get("content-type")); } + + @Test + public void testGrpcResourceExhaustedTranslatedToFlightStatus() { + Status status = Status.RESOURCE_EXHAUSTED; + + CallStatus callStatus = StatusUtils.fromGrpcStatus(status); + Assertions.assertEquals(FlightStatusCode.RESOURCE_EXHAUSTED, callStatus.code()); + + FlightStatusCode flightStatusCode = StatusUtils.fromGrpcStatusCode(status.getCode()); + Assertions.assertEquals(FlightStatusCode.RESOURCE_EXHAUSTED, flightStatusCode); + } + + @Test + public void testFlightResourceExhaustedTranslatedToGrpcStatus() { + CallStatus callStatus = CallStatus.RESOURCE_EXHAUSTED; + + Status.Code grpcStatusCode = StatusUtils.toGrpcStatusCode(callStatus.code()); + Assertions.assertEquals(Status.RESOURCE_EXHAUSTED.getCode(), grpcStatusCode); + + Status grpcStatus = StatusUtils.toGrpcStatus(callStatus); + Assertions.assertEquals(Status.RESOURCE_EXHAUSTED.getCode(), grpcStatus.getCode()); + } } From 2eb47efc04707145edbf6a2ad8a750138a585682 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JB=20Onofr=C3=A9?= Date: Fri, 3 May 2024 08:44:41 +0200 Subject: [PATCH 064/106] MINOR: Increase the open-pull-requests-limit for dependabot (#41499) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change By default, dependabot opens a maximum of five pull requests for version updates. Once there are five open pull requests from dependabot, dependabot will not open any new requests until some of those open requests are merged or closed. With this change, dependabot can open up to 10 pull requests for each of the configured package ecosystems. ### What changes are included in this PR? Update configuration for dependabot. ### Are these changes tested? Tested on other ASF projects 😄 ### Are there any user-facing changes?
No Authored-by: JB Onofré Signed-off-by: David Li --- .github/dependabot.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index e96cb8d2eb1e3..7d9ff2f42e887 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -23,30 +23,35 @@ updates: interval: "weekly" commit-message: prefix: "MINOR: [CI] " + open-pull-requests-limit: 10 - package-ecosystem: "gomod" directory: "/go/" schedule: interval: "weekly" commit-message: prefix: "MINOR: [Go] " + open-pull-requests-limit: 10 - package-ecosystem: "maven" directory: "/java/" schedule: interval: "weekly" commit-message: prefix: "MINOR: [Java] " + open-pull-requests-limit: 10 - package-ecosystem: "npm" directory: "/js/" schedule: interval: "monthly" commit-message: prefix: "MINOR: [JS] " + open-pull-requests-limit: 10 - package-ecosystem: "nuget" directory: "/csharp/" schedule: interval: "weekly" commit-message: prefix: "MINOR: [C#] " + open-pull-requests-limit: 10 ignore: - dependency-name: "Microsoft.Extensions.*" update-types: From c0aade5f624e2ec64d5d8743df9b95c7eee50117 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Fri, 3 May 2024 12:16:02 +0200 Subject: [PATCH 065/106] GH-41462: [CI] Temporary pin azurite to v3.29.0 (#41501) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change install_azurite.sh was failing to install the latest version of Azurite, and the Azure tests were failing. ### What changes are included in this PR? Temporarily pin azurite to v3.29.0 to unblock the 16.1.0 release. A follow-up issue is tracked here: https://github.com/apache/arrow/issues/41505 ### Are these changes tested? Yes, via archery. ### Are there any user-facing changes? No * GitHub Issue: #41462 Authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- ci/scripts/install_azurite.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ci/scripts/install_azurite.sh b/ci/scripts/install_azurite.sh index 2e7008360fdc3..dda5e99405b7f 100755 --- a/ci/scripts/install_azurite.sh +++ b/ci/scripts/install_azurite.sh @@ -19,17 +19,18 @@ set -e +# Pin azurite to 3.29.0 due to https://github.com/apache/arrow/issues/41505 case "$(uname)" in Darwin) - npm install -g azurite + npm install -g azurite@v3.29.0 which azurite ;; MINGW*) choco install nodejs.install - npm install -g azurite + npm install -g azurite@v3.29.0 ;; Linux) - npm install -g azurite + npm install -g azurite@v3.29.0 which azurite ;; esac From c8cf61c569886ff18c88e29c447a98ab6dedbd92 Mon Sep 17 00:00:00 2001 From: Sarah Gilmore <74676073+sgilmore10@users.noreply.github.com> Date: Fri, 3 May 2024 09:42:28 -0400 Subject: [PATCH 066/106] GH-41385: [CI][MATLAB][Packaging] Add support for MATLAB `R2024a` in CI and crossbow packaging workflows (#41504) ### Rationale for this change MATLAB `R2024a` is the latest available version of MATLAB as of April 2024. We are currently building against MATLAB `R2023a` in CI and for the crossbow packaging workflow. We should update the version of MATLAB we support to the latest available version. We previously created an issue to use `R2023b` (https://github.com/apache/arrow/issues/37809). However, `R2024a` has become publicly available since then. ### What changes are included in this PR? 1. Changed the `release` argument supplied to `matlab-actions/setup-matlab@ v2` to `R2024a` from `R2023a` in `.github/workflows/matlab.yml` and `dev/tasks/matlab/github.yml`. 2.
Updated the script used to package the MLTBX file (`arrow/matlab/tools/packageMatlabInterface.m`) to specify the version of MATLAB currently running as the only compatible version of MATLAB for the interface. 3. Updated display tests to react to a change in how class names are formatted when an object is displayed in the Command Window. ### Are these changes tested? Yes. Existing tests used. ### Are there any user-facing changes? There are no changes to the MATLAB Arrow Interface. However, the MATLAB release used to build, test, and package the interface has been upgraded. ### Future Work 1. We may want to consider building and packaging against multiple versions of MATLAB in parallel, rather than just the latest. This would require some more thought on how many releases back to support over time, however. 2. #41435 * GitHub Issue: #41385 Lead-authored-by: Sarah Gilmore Co-authored-by: Sarah Gilmore <74676073+sgilmore10@users.noreply.github.com> Co-authored-by: Sutou Kouhei Signed-off-by: Sarah Gilmore --- .github/workflows/matlab.yml | 6 +++--- dev/tasks/matlab/github.yml | 8 ++++---- .../+internal/+test/+display/makeLinkString.m | 4 ++-- .../+tabular/+internal/+display/getSchemaString.m | 2 +- matlab/tools/packageMatlabInterface.m | 15 ++++++++++++--- 5 files changed, 22 insertions(+), 13 deletions(-) diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml index aa3692e587961..8a0de8a365661 100644 --- a/.github/workflows/matlab.yml +++ b/.github/workflows/matlab.yml @@ -70,7 +70,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2023a + release: R2024a - name: Install ccache run: sudo apt-get install ccache - name: Setup ccache @@ -110,7 +110,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2023a + release: R2024a - name: Install ccache run: brew install ccache - name: Setup ccache @@ -148,7 +148,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2023a + release: R2024a - name: Download Timezone Database shell: bash run: ci/scripts/download_tz_database.sh diff --git a/dev/tasks/matlab/github.yml b/dev/tasks/matlab/github.yml index 13fa36b501125..7840fd176705c 100644 --- a/dev/tasks/matlab/github.yml +++ b/dev/tasks/matlab/github.yml @@ -31,7 +31,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v1 with: - release: R2023a + release: R2024a - name: Build MATLAB Interface env: {{ macros.github_set_sccache_envvars()|indent(8) }} @@ -68,7 +68,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v1 with: - release: R2023a + release: R2024a - name: Build MATLAB Interface env: {{ macros.github_set_sccache_envvars()|indent(8) }} @@ -103,7 +103,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v1 with: - release: R2023a + release: R2024a - name: Install sccache shell: bash run: arrow/ci/scripts/install_sccache.sh pc-windows-msvc $(pwd)/sccache @@ -149,7 +149,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v1 with: - release: R2023a + release: R2024a - name: Run commands env: MATLABPATH: arrow/matlab/tools diff --git a/matlab/src/matlab/+arrow/+internal/+test/+display/makeLinkString.m b/matlab/src/matlab/+arrow/+internal/+test/+display/makeLinkString.m index 79065ba1c8cfd..e99dd7d78488d 100644 --- a/matlab/src/matlab/+arrow/+internal/+test/+display/makeLinkString.m +++ b/matlab/src/matlab/+arrow/+internal/+test/+display/makeLinkString.m @@ -26,11 +26,11 @@ end if opts.BoldFont - link =
compose("%s", ... opts.FullClassName, opts.ClassName); else - link = compose("%s", ... + link = compose("%s", ... opts.FullClassName, opts.ClassName); end end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+tabular/+internal/+display/getSchemaString.m b/matlab/src/matlab/+arrow/+tabular/+internal/+display/getSchemaString.m index 7da945ca993ef..724b4873c92e1 100644 --- a/matlab/src/matlab/+arrow/+tabular/+internal/+display/getSchemaString.m +++ b/matlab/src/matlab/+arrow/+tabular/+internal/+display/getSchemaString.m @@ -43,7 +43,7 @@ classNameAndIDs = strings([1 numel(typeIDs) * 2]); classNameAndIDs(1:2:end-1) = classNames; classNameAndIDs(2:2:end) = typeIDs; - typeIDs = compose("%s", classNameAndIDs); + typeIDs = compose("%s", classNameAndIDs); end text = names + ": " + typeIDs; diff --git a/matlab/tools/packageMatlabInterface.m b/matlab/tools/packageMatlabInterface.m index 55b4d4241a569..3d970002614ab 100644 --- a/matlab/tools/packageMatlabInterface.m +++ b/matlab/tools/packageMatlabInterface.m @@ -55,9 +55,18 @@ opts.SupportedPlatforms.Glnxa64 = true; opts.SupportedPlatforms.MatlabOnline = true; -% Interface is only qualified against R2023a at the moment -opts.MinimumMatlabRelease = "R2023a"; -opts.MaximumMatlabRelease = "R2023a"; +% MEX files use run-time libraries shipped with MATLAB (e.g. libmx, libmex, +% etc.). MEX files linked against earlier versions of MATLAB run-time libraries +% will most likely work on newer versions of MATLAB. However, this may not +% always be the case. +% +% For now, set the earliest and latest compatible releases of MATLAB to +% the release of MATLAB used to build and package the MATLAB Arrow Interface. +% +% See: https://www.mathworks.com/help/matlab/matlab_external/version-compatibility.html +currentRelease = matlabRelease.Release; +opts.MinimumMatlabRelease = currentRelease; +opts.MaximumMatlabRelease = currentRelease; opts.OutputFile = fullfile(outputFolder, compose("matlab-arrow-%s.mltbx", toolboxVersionRaw)); disp("Output File: " + opts.OutputFile); From 32916f8355858ccea91df402f67696953f9dd298 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 23:06:36 +0900 Subject: [PATCH 067/106] MINOR: [JS] Bump @ swc/core from 1.4.14 to 1.4.17 in /js (#41519) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [@ swc/core](https://github.com/swc-project/swc) from 1.4.14 to 1.4.17.
Changelog

Sourced from @​swc/core's changelog.

[1.4.17] - 2024-04-23

Bug Fixes

  • (es) Ignore sourceMappingURL in string literals (#8879) (d7188cd)

  • (es/codegen) Use Str.raw for es5 (#8873) (c7a06b1)

  • (es/compat) Fix async generator (#8881) (063eabd)

  • (es/resolver) Prioritze jsc.paths by length in tsc resolver (#8875) (e22c368)

  • (html/codegen) Expand elements before which body isn’t elided (#8877) (5419a94)

[1.4.16] - 2024-04-18

Bug Fixes

  • (es/helpers) Fix resolving of usingCtx helper (#8874) (6e9d1a4)

[1.4.15] - 2024-04-17

Bug Fixes

  • (es/codegen) Fix ascii_only for identifiers (#8866) (2075a23)

  • (es/minifier) Remove raw of strings after modification (#8865) (740c0bb)

  • (es/parser) Fix span of BindingIdent (#8859) (fbd32fb)

  • (es/proposal) Update explicit resource management to match spec (#8860) (6d24076)

Features

  • (es/transforms) Allocate stacks dynamically (#8867) (a1c5415)

... (truncated)

Commits
  • 3311da7 chore: Publish 1.4.17 with swc_core v0.90.37
  • b1c22d5 chore: Improve publish script
  • fedf06f chore: Publish 1.4.17-nightly-20240423.3
  • 2c1e959 chore: Publish 1.4.17-nightly-20240423.2
  • 7b08d38 chore: Update bindings
  • 2fbb864 chore: Publish 1.4.17-nightly-20240423.1
  • 6d3c41d chore: Bump crates
  • c7a06b1 fix(es/codegen): Use Str.raw for es5 (#8873)
  • f5e50c2 chore: Bump crates
  • 063eabd fix(es/compat): Fix async generator (#8881)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=@ swc/core&package-manager=npm_and_yarn&previous-version=1.4.14&new-version=1.4.17)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- js/package.json | 2 +- js/yarn.lock | 128 ++++++++++++++++++++++++------------------------ 2 files changed, 65 insertions(+), 65 deletions(-) diff --git a/js/package.json b/js/package.json index e9590a188820f..08f2fe83d13e8 100644 --- a/js/package.json +++ b/js/package.json @@ -67,7 +67,7 @@ "@rollup/plugin-alias": "5.1.0", "@rollup/plugin-node-resolve": "15.2.3", "@rollup/stream": "3.0.1", - "@swc/core": "1.4.14", + "@swc/core": "1.4.17", "@types/benchmark": "2.1.5", "@types/glob": "8.1.0", "@types/jest": "29.5.12", diff --git a/js/yarn.lock b/js/yarn.lock index ab092675b4806..47674bd8b2168 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -1124,74 +1124,74 @@ dependencies: "@sinonjs/commons" "^3.0.0" -"@swc/core-darwin-arm64@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-darwin-arm64/-/core-darwin-arm64-1.4.14.tgz#de570252c3f155f55536f0d6bb8bafaec2e99616" - integrity sha512-8iPfLhYNspBl836YYsfv6ErXwDUqJ7IMieddV3Ey/t/97JAEAdNDUdtTKDtbyP0j/Ebyqyn+fKcqwSq7rAof0g== - -"@swc/core-darwin-x64@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-darwin-x64/-/core-darwin-x64-1.4.14.tgz#4eefbe129e416f4c400656742ab7f30e01aff02e" - integrity sha512-9CqSj8uRZ92cnlgAlVaWMaJJBdxtNvCzJxaGj5KuIseeG6Q0l1g+qk8JcU7h9dAsH9saHTNwNFBVGKQo0W0ujg== - -"@swc/core-linux-arm-gnueabihf@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-linux-arm-gnueabihf/-/core-linux-arm-gnueabihf-1.4.14.tgz#bea4b94c32bb25de2816126dac299655529ba7f3" - integrity sha512-mfd5JArPITTzMjcezH4DwMw+BdjBV1y25Khp8itEIpdih9ei+fvxOOrDYTN08b466NuE2dF2XuhKtRLA7fXArQ== - -"@swc/core-linux-arm64-gnu@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-linux-arm64-gnu/-/core-linux-arm64-gnu-1.4.14.tgz#52063214f4a14d6a0c3c6059ed9e7ba1062f6b46" - integrity sha512-3Lqlhlmy8MVRS9xTShMaPAp0oyUt0KFhDs4ixJsjdxKecE0NJSV/MInuDmrkij1C8/RQ2wySRlV9np5jK86oWw== - -"@swc/core-linux-arm64-musl@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-linux-arm64-musl/-/core-linux-arm64-musl-1.4.14.tgz#7e7deea7b1b3d0c9944cc8e9ba948fcc785158ea" - integrity sha512-n0YoCa64TUcJrbcXIHIHDWQjdUPdaXeMHNEu7yyBtOpm01oMGTKP3frsUXIABLBmAVWtKvqit4/W1KVKn5gJzg== - -"@swc/core-linux-x64-gnu@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-linux-x64-gnu/-/core-linux-x64-gnu-1.4.14.tgz#301133ea3ee347568886f2489837e991e96d44db" - integrity sha512-CGmlwLWbfG1dB4jZBJnp2IWlK5xBMNLjN7AR5kKA3sEpionoccEnChOEvfux1UdVJQjLRKuHNV9yGyqGBTpxfQ== - -"@swc/core-linux-x64-musl@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-linux-x64-musl/-/core-linux-x64-musl-1.4.14.tgz#86b8e987a814209cd0dd0f21cbc1134305dfffd5" - integrity sha512-xq4npk8YKYmNwmr8fbvF2KP3kUVdZYfXZMQnW425gP3/sn+yFQO8Nd0bGH40vOVQn41kEesSe0Z5O/JDor2TgQ== - -"@swc/core-win32-arm64-msvc@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-win32-arm64-msvc/-/core-win32-arm64-msvc-1.4.14.tgz#eb56b8977e3542665929c3963bd7dc18fe5b2556" - integrity sha512-imq0X+gU9uUe6FqzOQot5gpKoaC00aCUiN58NOzwp0QXEupn8CDuZpdBN93HiZswfLruu5jA1tsc15x6v9p0Yg== - -"@swc/core-win32-ia32-msvc@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-win32-ia32-msvc/-/core-win32-ia32-msvc-1.4.14.tgz#72e119038b9d8743b13bb933b8e192acd9f501f9" - integrity 
sha512-cH6QpXMw5D3t+lpx6SkErHrxN0yFzmQ0lgNAJxoDRiaAdDbqA6Col8UqUJwUS++Ul6aCWgNhCdiEYehPaoyDPA== - -"@swc/core-win32-x64-msvc@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-win32-x64-msvc/-/core-win32-x64-msvc-1.4.14.tgz#f5a3b1a241708b0628a07458e5bedbf67a1b9595" - integrity sha512-FmZ4Tby4wW65K/36BKzmuu7mlq7cW5XOxzvufaSNVvQ5PN4OodAlqPjToe029oma4Av+ykJiif64scMttyNAzg== - -"@swc/core@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core/-/core-1.4.14.tgz#8bad316c0119f626bb1b181ba7a988ef9d14e9cc" - integrity sha512-tHXg6OxboUsqa/L7DpsCcFnxhLkqN/ht5pCwav1HnvfthbiNIJypr86rNx4cUnQDJepETviSqBTIjxa7pSpGDQ== +"@swc/core-darwin-arm64@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-darwin-arm64/-/core-darwin-arm64-1.4.17.tgz#e62fa7f247bdd1c0c50a3f99722da4dd098c7c67" + integrity sha512-HVl+W4LezoqHBAYg2JCqR+s9ife9yPfgWSj37iIawLWzOmuuJ7jVdIB7Ee2B75bEisSEKyxRlTl6Y1Oq3owBgw== + +"@swc/core-darwin-x64@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-darwin-x64/-/core-darwin-x64-1.4.17.tgz#1145cbb7575e317204ed3a7d0274bd26fe9ffab6" + integrity sha512-WYRO9Fdzq4S/he8zjW5I95G1zcvyd9yyD3Tgi4/ic84P5XDlSMpBDpBLbr/dCPjmSg7aUXxNQqKqGkl6dQxYlA== + +"@swc/core-linux-arm-gnueabihf@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-linux-arm-gnueabihf/-/core-linux-arm-gnueabihf-1.4.17.tgz#7145b3ada5cf9b748eaacbc9a7c7037ba0fb26bb" + integrity sha512-cgbvpWOvtMH0XFjvwppUCR+Y+nf6QPaGu6AQ5hqCP+5Lv2zO5PG0RfasC4zBIjF53xgwEaaWmGP5/361P30X8Q== + +"@swc/core-linux-arm64-gnu@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-linux-arm64-gnu/-/core-linux-arm64-gnu-1.4.17.tgz#5c0833ef132af17bd3cbdf2253f35b57c0cf62bb" + integrity sha512-l7zHgaIY24cF9dyQ/FOWbmZDsEj2a9gRFbmgx2u19e3FzOPuOnaopFj0fRYXXKCmtdx+anD750iBIYnTR+pq/Q== + +"@swc/core-linux-arm64-musl@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-linux-arm64-musl/-/core-linux-arm64-musl-1.4.17.tgz#5bfe81eb23c905f04b669a7d2b060a147a263483" + integrity sha512-qhH4gr9gAlVk8MBtzXbzTP3BJyqbAfUOATGkyUtohh85fPXQYuzVlbExix3FZXTwFHNidGHY8C+ocscI7uDaYw== + +"@swc/core-linux-x64-gnu@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-linux-x64-gnu/-/core-linux-x64-gnu-1.4.17.tgz#a0c19bc9635e86ebd1c7f8e9e026503d1a1bf83d" + integrity sha512-vRDFATL1oN5oZMImkwbgSHEkp8xG1ofEASBypze01W1Tqto8t+yo6gsp69wzCZBlxldsvPpvFZW55Jq0Rn+UnA== + +"@swc/core-linux-x64-musl@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-linux-x64-musl/-/core-linux-x64-musl-1.4.17.tgz#2179b9536235a3b02a46997ddb1c178dfadf1667" + integrity sha512-zQNPXAXn3nmPqv54JVEN8k2JMEcMTQ6veVuU0p5O+A7KscJq+AGle/7ZQXzpXSfUCXlLMX4wvd+rwfGhh3J4cw== + +"@swc/core-win32-arm64-msvc@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-win32-arm64-msvc/-/core-win32-arm64-msvc-1.4.17.tgz#3004a431c836c6b16b4660ea2425dde467a8ee36" + integrity sha512-z86n7EhOwyzxwm+DLE5NoLkxCTme2lq7QZlDjbQyfCxOt6isWz8rkW5QowTX8w9Rdmk34ncrjSLvnHOeLY17+w== + +"@swc/core-win32-ia32-msvc@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-win32-ia32-msvc/-/core-win32-ia32-msvc-1.4.17.tgz#59155485d5307fb2a267e5acb215e0f440b6f48f" + integrity sha512-JBwuSTJIgiJJX6wtr4wmXbfvOswHFj223AumUrK544QV69k60FJ9q2adPW9Csk+a8wm1hLxq4HKa2K334UHJ/g== + +"@swc/core-win32-x64-msvc@1.4.17": + version "1.4.17" + resolved 
"https://registry.yarnpkg.com/@swc/core-win32-x64-msvc/-/core-win32-x64-msvc-1.4.17.tgz#b98f25fc277fb0e319f25f9fd00a82023662716b" + integrity sha512-jFkOnGQamtVDBm3MF5Kq1lgW8vx4Rm1UvJWRUfg+0gx7Uc3Jp3QMFeMNw/rDNQYRDYPG3yunCC+2463ycd5+dg== + +"@swc/core@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core/-/core-1.4.17.tgz#3ea4180fa5c54282b284006a6de1263ef1cf887f" + integrity sha512-tq+mdWvodMBNBBZbwFIMTVGYHe9N7zvEaycVVjfvAx20k1XozHbHhRv+9pEVFJjwRxLdXmtvFZd3QZHRAOpoNQ== dependencies: "@swc/counter" "^0.1.2" "@swc/types" "^0.1.5" optionalDependencies: - "@swc/core-darwin-arm64" "1.4.14" - "@swc/core-darwin-x64" "1.4.14" - "@swc/core-linux-arm-gnueabihf" "1.4.14" - "@swc/core-linux-arm64-gnu" "1.4.14" - "@swc/core-linux-arm64-musl" "1.4.14" - "@swc/core-linux-x64-gnu" "1.4.14" - "@swc/core-linux-x64-musl" "1.4.14" - "@swc/core-win32-arm64-msvc" "1.4.14" - "@swc/core-win32-ia32-msvc" "1.4.14" - "@swc/core-win32-x64-msvc" "1.4.14" + "@swc/core-darwin-arm64" "1.4.17" + "@swc/core-darwin-x64" "1.4.17" + "@swc/core-linux-arm-gnueabihf" "1.4.17" + "@swc/core-linux-arm64-gnu" "1.4.17" + "@swc/core-linux-arm64-musl" "1.4.17" + "@swc/core-linux-x64-gnu" "1.4.17" + "@swc/core-linux-x64-musl" "1.4.17" + "@swc/core-win32-arm64-msvc" "1.4.17" + "@swc/core-win32-ia32-msvc" "1.4.17" + "@swc/core-win32-x64-msvc" "1.4.17" "@swc/counter@^0.1.2", "@swc/counter@^0.1.3": version "0.1.3" From b102aebb0a9eb05e94544fd1f15f85f660fc89be Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 23:17:41 +0900 Subject: [PATCH 068/106] MINOR: [JS] Bump @typescript-eslint/parser from 7.7.0 to 7.8.0 in /js (#41522) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [@ typescript-eslint/parser](https://github.com/typescript-eslint/typescript-eslint/tree/HEAD/packages/parser) from 7.7.0 to 7.8.0.
Release notes

Sourced from @​typescript-eslint/parser's releases.

v7.8.0

7.8.0 (2024-04-29)

🚀 Features

  • rule-tester: assert suggestion messages are unique (#8995)
  • typescript-estree: add maximumDefaultProjectFileMatchCount and wide allowDefaultProjectForFiles glob restrictions (#8925)

🩹 Fixes

  • eslint-plugin: [no-unsafe-argument] handle tagged templates (#8746)
  • eslint-plugin: [prefer-optional-chain] suggests optional chaining during strict null equality check (#8717)
  • eslint-plugin: [consistent-type-assertions] handle tagged templates (#8993)
  • eslint-plugin: [no-unsafe-return] handle union types (#9001)
  • eslint-plugin: [no-unused-vars] clear error report range (#8640)
  • utils: export ESLint backwards-compat functions (#8976)

❤️ Thank You

You can read about our versioning strategy and releases on our website.

v7.7.1

7.7.1 (2024-04-22)

🩹 Fixes

  • eslint-plugin: [no-unsafe-assignment] handle shorthand property assignment (#8800)
  • eslint-plugin: [explicit-function-return-type] fix checking wrong ancestor's return type (#8809)
  • eslint-plugin: [prefer-optional-chain] only look at left operand for requireNullish (#8559)
  • eslint-plugin: [no-for-in-array] refine report location (#8874)
  • eslint-plugin: [no-unnecessary-type-assertion] allow non-null assertion for void type (#8912)

❤️ Thank You

You can read about our versioning strategy and releases on our website.

Changelog

Sourced from @​typescript-eslint/parser's changelog.

7.8.0 (2024-04-29)

This was a version bump only for parser to align it with other projects, there were no code changes.

You can read about our versioning strategy and releases on our website.

7.7.1 (2024-04-22)

This was a version bump only for parser to align it with other projects, there were no code changes.

You can read about our versioning strategy and releases on our website.

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=@ typescript-eslint/parser&package-manager=npm_and_yarn&previous-version=7.7.0&new-version=7.8.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- js/package.json | 2 +- js/yarn.lock | 53 +++++++++---------------------------------------- 2 files changed, 10 insertions(+), 45 deletions(-) diff --git a/js/package.json b/js/package.json index 08f2fe83d13e8..f6d3c70fca6aa 100644 --- a/js/package.json +++ b/js/package.json @@ -73,7 +73,7 @@ "@types/jest": "29.5.12", "@types/multistream": "4.1.3", "@typescript-eslint/eslint-plugin": "7.8.0", - "@typescript-eslint/parser": "7.7.0", + "@typescript-eslint/parser": "7.8.0", "async-done": "2.0.0", "benny": "3.7.1", "cross-env": "7.0.3", diff --git a/js/yarn.lock b/js/yarn.lock index 47674bd8b2168..9885be2ba0643 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -1453,15 +1453,15 @@ semver "^7.6.0" ts-api-utils "^1.3.0" -"@typescript-eslint/parser@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/parser/-/parser-7.7.0.tgz#6b1b3ce76c5de002c43af8ae933613b0f2b4bcc6" - integrity sha512-fNcDm3wSwVM8QYL4HKVBggdIPAy9Q41vcvC/GtDobw3c4ndVT3K6cqudUmjHPw8EAp4ufax0o58/xvWaP2FmTg== - dependencies: - "@typescript-eslint/scope-manager" "7.7.0" - "@typescript-eslint/types" "7.7.0" - "@typescript-eslint/typescript-estree" "7.7.0" - "@typescript-eslint/visitor-keys" "7.7.0" +"@typescript-eslint/parser@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/parser/-/parser-7.8.0.tgz#1e1db30c8ab832caffee5f37e677dbcb9357ddc8" + integrity sha512-KgKQly1pv0l4ltcftP59uQZCi4HUYswCLbTqVZEJu7uLX8CTLyswqMLqLN+2QFz4jCptqWVV4SB7vdxcH2+0kQ== + dependencies: + "@typescript-eslint/scope-manager" "7.8.0" + "@typescript-eslint/types" "7.8.0" + "@typescript-eslint/typescript-estree" "7.8.0" + "@typescript-eslint/visitor-keys" "7.8.0" debug "^4.3.4" "@typescript-eslint/scope-manager@5.62.0": @@ -1472,14 +1472,6 @@ "@typescript-eslint/types" "5.62.0" "@typescript-eslint/visitor-keys" "5.62.0" -"@typescript-eslint/scope-manager@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/scope-manager/-/scope-manager-7.7.0.tgz#3f0db079b275bb8b0cb5be7613fb3130cfb5de77" - integrity sha512-/8INDn0YLInbe9Wt7dK4cXLDYp0fNHP5xKLHvZl3mOT5X17rK/YShXaiNmorl+/U4VKCVIjJnx4Ri5b0y+HClw== - dependencies: - "@typescript-eslint/types" "7.7.0" - "@typescript-eslint/visitor-keys" "7.7.0" - "@typescript-eslint/scope-manager@7.8.0": version "7.8.0" resolved "https://registry.yarnpkg.com/@typescript-eslint/scope-manager/-/scope-manager-7.8.0.tgz#bb19096d11ec6b87fb6640d921df19b813e02047" @@ -1503,11 +1495,6 @@ resolved "https://registry.yarnpkg.com/@typescript-eslint/types/-/types-5.62.0.tgz#258607e60effa309f067608931c3df6fed41fd2f" integrity sha512-87NVngcbVXUahrRTqIK27gD2t5Cu1yuCXxbLcFtCzZGlfyVWWh8mLHkoxzjsB6DDNnvdL+fW8MiwPEJyGJQDgQ== -"@typescript-eslint/types@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/types/-/types-7.7.0.tgz#23af4d24bf9ce15d8d301236e3e3014143604f27" - integrity sha512-G01YPZ1Bd2hn+KPpIbrAhEWOn5lQBrjxkzHkWvP6NucMXFtfXoevK82hzQdpfuQYuhkvFDeQYbzXCjR1z9Z03w== - "@typescript-eslint/types@7.8.0": version "7.8.0" resolved "https://registry.yarnpkg.com/@typescript-eslint/types/-/types-7.8.0.tgz#1fd2577b3ad883b769546e2d1ef379f929a7091d" @@ -1526,20 +1513,6 @@ semver "^7.3.7" tsutils "^3.21.0" -"@typescript-eslint/typescript-estree@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/typescript-estree/-/typescript-estree-7.7.0.tgz#b5dd6383b4c6a852d7b256a37af971e8982be97f" - 
integrity sha512-8p71HQPE6CbxIBy2kWHqM1KGrC07pk6RJn40n0DSc6bMOBBREZxSDJ+BmRzc8B5OdaMh1ty3mkuWRg4sCFiDQQ== - dependencies: - "@typescript-eslint/types" "7.7.0" - "@typescript-eslint/visitor-keys" "7.7.0" - debug "^4.3.4" - globby "^11.1.0" - is-glob "^4.0.3" - minimatch "^9.0.4" - semver "^7.6.0" - ts-api-utils "^1.3.0" - "@typescript-eslint/typescript-estree@7.8.0": version "7.8.0" resolved "https://registry.yarnpkg.com/@typescript-eslint/typescript-estree/-/typescript-estree-7.8.0.tgz#b028a9226860b66e623c1ee55cc2464b95d2987c" @@ -1589,14 +1562,6 @@ "@typescript-eslint/types" "5.62.0" eslint-visitor-keys "^3.3.0" -"@typescript-eslint/visitor-keys@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/visitor-keys/-/visitor-keys-7.7.0.tgz#950148cf1ac11562a2d903fdf7acf76714a2dc9e" - integrity sha512-h0WHOj8MhdhY8YWkzIF30R379y0NqyOHExI9N9KCzvmu05EgG4FumeYa3ccfKUSphyWkWQE1ybVrgz/Pbam6YA== - dependencies: - "@typescript-eslint/types" "7.7.0" - eslint-visitor-keys "^3.4.3" - "@typescript-eslint/visitor-keys@7.8.0": version "7.8.0" resolved "https://registry.yarnpkg.com/@typescript-eslint/visitor-keys/-/visitor-keys-7.8.0.tgz#7285aab991da8bee411a42edbd5db760d22fdd91" From 5959024e9c1094c096dee569c0a5016496b24b58 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 23:18:13 +0900 Subject: [PATCH 069/106] MINOR: [JS] Bump eslint-plugin-jest from 27.9.0 to 28.4.0 in /js (#41524) Bumps [eslint-plugin-jest](https://github.com/jest-community/eslint-plugin-jest) from 27.9.0 to 28.4.0.
Release notes

Sourced from eslint-plugin-jest's releases.

v28.4.0

28.4.0 (2024-05-03)

Features

  • valid-expect: supporting automatically fixing missing await in some cases (#1574) (a407098)

v28.3.0

28.3.0 (2024-04-27)

Features

  • prefer importing jest globals for specific types (#1568) (c464ae3)

v28.2.0

28.2.0 (2024-04-06)

Features

  • support providing aliases for @ jest/globals package (#1543) (744d4f6)

v28.1.1

28.1.1 (2024-04-06)

Bug Fixes

  • max-expects: properly reset counter when exiting a test case (#1550) (b4b7cbc)

v28.1.0

28.1.0 (2024-04-06)

Features

v28.0.0

28.0.0 (2024-04-06)

Bug Fixes

  • allow ESLint 9 as peer dependency (#1547) (3c5e167)
  • drop support for Node 19 (#1548) (c87e388)
  • no-large-snapshots: avoid instanceof RegExp check for ESLint v9 compatibility (#1542) (af4a9c9)

... (truncated)

Changelog

Sourced from eslint-plugin-jest's changelog.

28.4.0 (2024-05-03)

Features

  • valid-expect: supporting automatically fixing missing await in some cases (#1574) (a407098)

28.3.0 (2024-04-27)

Features

  • prefer importing jest globals for specific types (#1568) (c464ae3)

28.2.0 (2024-04-06)

Features

  • support providing aliases for @ jest/globals package (#1543) (744d4f6)

28.1.1 (2024-04-06)

Bug Fixes

  • max-expects: properly reset counter when exiting a test case (#1550) (b4b7cbc)

28.1.0 (2024-04-06)

Features

28.0.0 (2024-04-06)

Bug Fixes

  • allow ESLint 9 as peer dependency (#1547) (3c5e167)
  • drop support for Node 19 (#1548) (c87e388)
  • no-large-snapshots: avoid instanceof RegExp check for ESLint v9 compatibility (#1542) (af4a9c9)

Features

... (truncated)

Commits
  • 27f7e74 chore(release): 28.4.0 [skip ci]
  • a407098 feat(valid-expect): supporting automatically fixing missing await in some c...
  • f47cc3c refactor: remove unneeded as consts (#1578)
  • 6c1f921 refactor(prefer-lowercase-title): remove unneeded cast (#1577)
  • aac5f03 refactor(prefer-importing-jest-globals): use AST_NODE_TYPES constant instea...
  • df3202f chore(deps): update yarn to v3.8.2 (#1575)
  • 8001fe7 chore(deps): lock file maintenance
  • bd6b918 chore(release): 28.3.0 [skip ci]
  • c464ae3 feat: prefer importing jest globals for specific types (#1568)
  • 2f21f33 refactor(expect-expect): remove unneeded array (#1571)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=eslint-plugin-jest&package-manager=npm_and_yarn&previous-version=27.9.0&new-version=28.4.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- js/package.json | 2 +- js/yarn.lock | 115 +++++++++++++++++++++++------------------------- 2 files changed, 56 insertions(+), 61 deletions(-) diff --git a/js/package.json b/js/package.json index f6d3c70fca6aa..7ed0daddfada0 100644 --- a/js/package.json +++ b/js/package.json @@ -82,7 +82,7 @@ "esbuild": "0.20.2", "esbuild-plugin-alias": "0.2.1", "eslint": "8.57.0", - "eslint-plugin-jest": "27.9.0", + "eslint-plugin-jest": "28.4.0", "eslint-plugin-unicorn": "52.0.0", "esm": "https://github.com/jsg2021/esm/releases/download/v3.x.x-pr883/esm-3.x.x-pr883.tgz", "gulp": "4.0.2", diff --git a/js/yarn.lock b/js/yarn.lock index 9885be2ba0643..dbf79115d6412 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -1348,7 +1348,7 @@ expect "^29.0.0" pretty-format "^29.0.0" -"@types/json-schema@*", "@types/json-schema@^7.0.15", "@types/json-schema@^7.0.8", "@types/json-schema@^7.0.9": +"@types/json-schema@*", "@types/json-schema@^7.0.12", "@types/json-schema@^7.0.15", "@types/json-schema@^7.0.8": version "7.0.15" resolved "https://registry.yarnpkg.com/@types/json-schema/-/json-schema-7.0.15.tgz#596a1747233694d50f6ad8a7869fcb6f56cf5841" integrity sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA== @@ -1399,7 +1399,7 @@ resolved "https://registry.yarnpkg.com/@types/resolve/-/resolve-1.20.2.tgz#97d26e00cd4a0423b4af620abecf3e6f442b7975" integrity sha512-60BCwRFOZCQhDncwQdxxeOEEkbc5dIMccYLwbxsS4TUNeVECQ/pBJ0j09mrHOl/JJvpRPGwO9SvE4nR2Nb/a4Q== -"@types/semver@^7.3.12", "@types/semver@^7.5.8": +"@types/semver@^7.5.0", "@types/semver@^7.5.8": version "7.5.8" resolved "https://registry.yarnpkg.com/@types/semver/-/semver-7.5.8.tgz#8268a8c57a3e4abd25c165ecd36237db7948a55e" integrity sha512-I8EUhyrgfLrcTkzV3TSsGyl1tSuPrEDzr0yd5m90UgNxQkyDXULk3b6MlQqTCpZpNtWe1K0hzclnZkTcLBe2UQ== @@ -1464,13 +1464,13 @@ "@typescript-eslint/visitor-keys" "7.8.0" debug "^4.3.4" -"@typescript-eslint/scope-manager@5.62.0": - version "5.62.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/scope-manager/-/scope-manager-5.62.0.tgz#d9457ccc6a0b8d6b37d0eb252a23022478c5460c" - integrity sha512-VXuvVvZeQCQb5Zgf4HAxc04q5j+WrNAtNh9OwCsCgpKqESMTu3tF/jhZ3xG6T4NZwWl65Bg8KuS2uEvhSfLl0w== +"@typescript-eslint/scope-manager@6.21.0": + version "6.21.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/scope-manager/-/scope-manager-6.21.0.tgz#ea8a9bfc8f1504a6ac5d59a6df308d3a0630a2b1" + integrity sha512-OwLUIWZJry80O99zvqXVEioyniJMa+d2GrqpUTqi5/v5D5rOrppJVBPa0yKCblcigC0/aYAzxxqQ1B+DS2RYsg== dependencies: - "@typescript-eslint/types" "5.62.0" - "@typescript-eslint/visitor-keys" "5.62.0" + "@typescript-eslint/types" "6.21.0" + "@typescript-eslint/visitor-keys" "6.21.0" "@typescript-eslint/scope-manager@7.8.0": version "7.8.0" @@ -1490,28 +1490,29 @@ debug "^4.3.4" ts-api-utils "^1.3.0" -"@typescript-eslint/types@5.62.0": - version "5.62.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/types/-/types-5.62.0.tgz#258607e60effa309f067608931c3df6fed41fd2f" - integrity sha512-87NVngcbVXUahrRTqIK27gD2t5Cu1yuCXxbLcFtCzZGlfyVWWh8mLHkoxzjsB6DDNnvdL+fW8MiwPEJyGJQDgQ== +"@typescript-eslint/types@6.21.0": + version "6.21.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/types/-/types-6.21.0.tgz#205724c5123a8fef7ecd195075fa6e85bac3436d" + integrity sha512-1kFmZ1rOm5epu9NZEZm1kckCDGj5UJEf7P1kliH4LKu/RkwpsfqqGmY2OOcUs18lSlQBKLDYBOGxRVtrMN5lpg== "@typescript-eslint/types@7.8.0": 
version "7.8.0" resolved "https://registry.yarnpkg.com/@typescript-eslint/types/-/types-7.8.0.tgz#1fd2577b3ad883b769546e2d1ef379f929a7091d" integrity sha512-wf0peJ+ZGlcH+2ZS23aJbOv+ztjeeP8uQ9GgwMJGVLx/Nj9CJt17GWgWWoSmoRVKAX2X+7fzEnAjxdvK2gqCLw== -"@typescript-eslint/typescript-estree@5.62.0": - version "5.62.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/typescript-estree/-/typescript-estree-5.62.0.tgz#7d17794b77fabcac615d6a48fb143330d962eb9b" - integrity sha512-CmcQ6uY7b9y694lKdRB8FEel7JbU/40iSAPomu++SjLMntB+2Leay2LO6i8VnJk58MtE9/nQSFIH6jpyRWyYzA== +"@typescript-eslint/typescript-estree@6.21.0": + version "6.21.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/typescript-estree/-/typescript-estree-6.21.0.tgz#c47ae7901db3b8bddc3ecd73daff2d0895688c46" + integrity sha512-6npJTkZcO+y2/kr+z0hc4HwNfrrP4kNYh57ek7yCNlrBjWQ1Y0OS7jiZTkgumrvkX5HkEKXFZkkdFNkaW2wmUQ== dependencies: - "@typescript-eslint/types" "5.62.0" - "@typescript-eslint/visitor-keys" "5.62.0" + "@typescript-eslint/types" "6.21.0" + "@typescript-eslint/visitor-keys" "6.21.0" debug "^4.3.4" globby "^11.1.0" is-glob "^4.0.3" - semver "^7.3.7" - tsutils "^3.21.0" + minimatch "9.0.3" + semver "^7.5.4" + ts-api-utils "^1.0.1" "@typescript-eslint/typescript-estree@7.8.0": version "7.8.0" @@ -1540,27 +1541,26 @@ "@typescript-eslint/typescript-estree" "7.8.0" semver "^7.6.0" -"@typescript-eslint/utils@^5.10.0": - version "5.62.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/utils/-/utils-5.62.0.tgz#141e809c71636e4a75daa39faed2fb5f4b10df86" - integrity sha512-n8oxjeb5aIbPFEtmQxQYOLI0i9n5ySBEY/ZEHHZqKQSFnxio1rv6dthascc9dLuwrL0RC5mPCxB7vnAVGAYWAQ== +"@typescript-eslint/utils@^6.0.0": + version "6.21.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/utils/-/utils-6.21.0.tgz#4714e7a6b39e773c1c8e97ec587f520840cd8134" + integrity sha512-NfWVaC8HP9T8cbKQxHcsJBY5YE1O33+jpMwN45qzWWaPDZgLIbo12toGMWnmhvCpd3sIxkpDw3Wv1B3dYrbDQQ== dependencies: - "@eslint-community/eslint-utils" "^4.2.0" - "@types/json-schema" "^7.0.9" - "@types/semver" "^7.3.12" - "@typescript-eslint/scope-manager" "5.62.0" - "@typescript-eslint/types" "5.62.0" - "@typescript-eslint/typescript-estree" "5.62.0" - eslint-scope "^5.1.1" - semver "^7.3.7" - -"@typescript-eslint/visitor-keys@5.62.0": - version "5.62.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/visitor-keys/-/visitor-keys-5.62.0.tgz#2174011917ce582875954ffe2f6912d5931e353e" - integrity sha512-07ny+LHRzQXepkGg6w0mFY41fVUNBrL2Roj/++7V1txKugfjm/Ci/qSND03r2RhlJhJYMcTn9AhhSSqQp0Ysyw== - dependencies: - "@typescript-eslint/types" "5.62.0" - eslint-visitor-keys "^3.3.0" + "@eslint-community/eslint-utils" "^4.4.0" + "@types/json-schema" "^7.0.12" + "@types/semver" "^7.5.0" + "@typescript-eslint/scope-manager" "6.21.0" + "@typescript-eslint/types" "6.21.0" + "@typescript-eslint/typescript-estree" "6.21.0" + semver "^7.5.4" + +"@typescript-eslint/visitor-keys@6.21.0": + version "6.21.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/visitor-keys/-/visitor-keys-6.21.0.tgz#87a99d077aa507e20e238b11d56cc26ade45fe47" + integrity sha512-JJtkDduxLi9bivAB+cYOVMtbkqdPOhZ+ZI5LC47MIRrDV4Yn2o+ZnW10Nkmr28xRpSpdJ6Sm42Hjf2+REYXm0A== + dependencies: + "@typescript-eslint/types" "6.21.0" + eslint-visitor-keys "^3.4.1" "@typescript-eslint/visitor-keys@7.8.0": version "7.8.0" @@ -3050,12 +3050,12 @@ escape-string-regexp@^4.0.0: resolved "https://registry.yarnpkg.com/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz#14ba83a5d373e3d311e5afca29cf5bfad965bf34" 
integrity sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA== -eslint-plugin-jest@27.9.0: - version "27.9.0" - resolved "https://registry.yarnpkg.com/eslint-plugin-jest/-/eslint-plugin-jest-27.9.0.tgz#7c98a33605e1d8b8442ace092b60e9919730000b" - integrity sha512-QIT7FH7fNmd9n4se7FFKHbsLKGQiw885Ds6Y/sxKgCZ6natwCsXdgPOADnYVxN2QrRweF0FZWbJ6S7Rsn7llug== +eslint-plugin-jest@28.4.0: + version "28.4.0" + resolved "https://registry.yarnpkg.com/eslint-plugin-jest/-/eslint-plugin-jest-28.4.0.tgz#213be88f799a35ca9d63ce1a30081bb32b8da765" + integrity sha512-ORVHiFPC8RQxHLyQJ37MxNilK9k+cPzjHz65T8gAbpYZunGutXvKqwfM3WXBCvFDF1QBeYJJu9LB/i5cuXBs+g== dependencies: - "@typescript-eslint/utils" "^5.10.0" + "@typescript-eslint/utils" "^6.0.0" eslint-plugin-unicorn@52.0.0: version "52.0.0" @@ -3079,7 +3079,7 @@ eslint-plugin-unicorn@52.0.0: semver "^7.5.4" strip-indent "^3.0.0" -eslint-scope@5.1.1, eslint-scope@^5.1.1: +eslint-scope@5.1.1: version "5.1.1" resolved "https://registry.yarnpkg.com/eslint-scope/-/eslint-scope-5.1.1.tgz#e786e59a66cb92b3f6c1fb0d508aab174848f48c" integrity sha512-2NxwbF/hZ0KpepYN0cNbo+FN6XoK7GaHlQhgx/hIZl6Va0bF45RQOOwhLIy8lQDbuCiadSLCBnH2CFYquit5bw== @@ -5299,6 +5299,13 @@ min-indent@^1.0.0, min-indent@^1.0.1: resolved "https://registry.yarnpkg.com/min-indent/-/min-indent-1.0.1.tgz#a63f681673b30571fbe8bc25686ae746eefa9869" integrity sha512-I9jwMn07Sy/IwOj3zVkVik2JTvgpaykDZEigL6Rx6N9LbMywwUSMtxET+7lVoDLLd3O3IXwJwvuuns8UB/HeAg== +minimatch@9.0.3: + version "9.0.3" + resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-9.0.3.tgz#a6e00c3de44c3a542bfaae70abfc22420a6da825" + integrity sha512-RHiac9mvaRw0x3AYRgDC1CxAP7HTcNrrECeA8YYJeWnpo+2Q5CegtZjaotWTWxDG3UeGA1coE05iH1mPjT/2mg== + dependencies: + brace-expansion "^2.0.1" + minimatch@^3.0.4, minimatch@^3.0.5, minimatch@^3.1.1, minimatch@^3.1.2: version "3.1.2" resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.1.2.tgz#19cd194bfd3e428f049a70817c038d89ab4be35b" @@ -6278,7 +6285,7 @@ semver@^6.3.0, semver@^6.3.1: resolved "https://registry.yarnpkg.com/semver/-/semver-6.3.1.tgz#556d2ef8689146e46dcea4bfdd095f3434dffcb4" integrity sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA== -semver@^7.3.4, semver@^7.3.7, semver@^7.5.3, semver@^7.5.4, semver@^7.6.0: +semver@^7.3.4, semver@^7.5.3, semver@^7.5.4, semver@^7.6.0: version "7.6.0" resolved "https://registry.yarnpkg.com/semver/-/semver-7.6.0.tgz#1a46a4db4bffcccd97b743b5005c8325f23d4e2d" integrity sha512-EnwXhrlwXMk9gKu5/flx5sv/an57AkRplG3hTK68W7FRDN+k+OWBj65M7719OkA82XLBxrcX0KSHj+X5COhOVg== @@ -6916,7 +6923,7 @@ trim-newlines@^4.0.2: resolved "https://registry.yarnpkg.com/trim-newlines/-/trim-newlines-4.1.1.tgz#28c88deb50ed10c7ba6dc2474421904a00139125" integrity sha512-jRKj0n0jXWo6kh62nA5TEh3+4igKDXLvzBJcPpiizP7oOolUrYIxmVBG9TOtHYFHoddUk6YvAkGeGoSVTXfQXQ== -ts-api-utils@^1.3.0: +ts-api-utils@^1.0.1, ts-api-utils@^1.3.0: version "1.3.0" resolved "https://registry.yarnpkg.com/ts-api-utils/-/ts-api-utils-1.3.0.tgz#4b490e27129f1e8e686b45cc4ab63714dc60eea1" integrity sha512-UQMIo7pb8WRomKR1/+MFVLTroIvDVtMX3K6OUir8ynLyzB8Jeriont2bTAtmNPa1ekAgN7YPDyf6V+ygrdU+eQ== @@ -6954,23 +6961,11 @@ ts-node@10.9.2: v8-compile-cache-lib "^3.0.1" yn "3.1.1" -tslib@^1.8.1: - version "1.14.1" - resolved "https://registry.yarnpkg.com/tslib/-/tslib-1.14.1.tgz#cf2d38bdc34a134bcaf1091c41f6619e2f672d00" - integrity sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg== - 
tslib@^2.0.0, tslib@^2.1.0, tslib@^2.3.0, tslib@^2.4.0, tslib@^2.6.2: version "2.6.2" resolved "https://registry.yarnpkg.com/tslib/-/tslib-2.6.2.tgz#703ac29425e7b37cd6fd456e92404d46d1f3e4ae" integrity sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q== -tsutils@^3.21.0: - version "3.21.0" - resolved "https://registry.yarnpkg.com/tsutils/-/tsutils-3.21.0.tgz#b48717d394cea6c1e096983eed58e9d61715b623" - integrity sha512-mHKK3iUXL+3UF6xL5k0PEhKRUBKPBCv/+RkEOpjRWxxx27KKRBmmA60A9pgOUvMi8GKhRMPEmjBRPzs2W7O1OA== - dependencies: - tslib "^1.8.1" - type-check@^0.4.0, type-check@~0.4.0: version "0.4.0" resolved "https://registry.yarnpkg.com/type-check/-/type-check-0.4.0.tgz#07b8203bfa7056c0657050e3ccd2c37730bab8f1" From c8bf9753d7423c3020ac66b97caf2099cda35110 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 23:18:46 +0900 Subject: [PATCH 070/106] MINOR: [JS] Bump @types/node from 20.12.7 to 20.12.8 in /js (#41526) Bumps [@ types/node](https://github.com/DefinitelyTyped/DefinitelyTyped/tree/HEAD/types/node) from 20.12.7 to 20.12.8.
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=@ types/node&package-manager=npm_and_yarn&previous-version=20.12.7&new-version=20.12.8)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- js/yarn.lock | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/js/yarn.lock b/js/yarn.lock index dbf79115d6412..abde21f603382 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -1370,10 +1370,10 @@ dependencies: "@types/node" "*" -"@types/node@*": - version "20.12.3" - resolved "https://registry.yarnpkg.com/@types/node/-/node-20.12.3.tgz#d6658c2c7776c1cad93534bb45428195ed840c65" - integrity sha512-sD+ia2ubTeWrOu+YMF+MTAB7E+O7qsMqAbMfW7DG3K1URwhZ5hN1pLlRVGbf4wDFzSfikL05M17EyorS86jShw== +"@types/node@*", "@types/node@^20.12.7": + version "20.12.8" + resolved "https://registry.yarnpkg.com/@types/node/-/node-20.12.8.tgz#35897bf2bfe3469847ab04634636de09552e8256" + integrity sha512-NU0rJLJnshZWdE/097cdCBbyW1h4hEg0xpovcoAQYHl8dnEyp/NAOiE45pvc+Bd1Dt+2r94v2eGFpQJ4R7g+2w== dependencies: undici-types "~5.26.4" @@ -1382,13 +1382,6 @@ resolved "https://registry.yarnpkg.com/@types/node/-/node-13.13.52.tgz#03c13be70b9031baaed79481c0c0cfb0045e53f7" integrity sha512-s3nugnZumCC//n4moGGe6tkNMyYEdaDBitVjwPxXmR5lnMG5dHePinH2EdxkG3Rh1ghFHHixAG4NJhpJW1rthQ== -"@types/node@^20.12.7": - version "20.12.7" - resolved "https://registry.yarnpkg.com/@types/node/-/node-20.12.7.tgz#04080362fa3dd6c5822061aa3124f5c152cff384" - integrity sha512-wq0cICSkRLVaf3UGLMGItu/PtdY7oaXaI/RVU+xliKVOtRna3PRY57ZDfztpDL0n11vfymMUnXv8QwYCO7L1wg== - dependencies: - undici-types "~5.26.4" - "@types/normalize-package-data@^2.4.0": version "2.4.4" resolved "https://registry.yarnpkg.com/@types/normalize-package-data/-/normalize-package-data-2.4.4.tgz#56e2cc26c397c038fab0e3a917a12d5c5909e901" From cc9e65fb80db2e2d706b1776c52a88bb1c983533 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 07:58:31 -0700 Subject: [PATCH 071/106] MINOR: [C#] Bump Grpc.Tools from 2.62.0 to 2.63.0 in /csharp (#41523) Bumps [Grpc.Tools](https://github.com/grpc/grpc) from 2.62.0 to 2.63.0.
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=Grpc.Tools&package-manager=nuget&previous-version=2.62.0&new-version=2.63.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Curt Hagenlocher --- .../src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj | 2 +- csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj b/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj index 9a3cf190cc376..7314b8207fef6 100644 --- a/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj +++ b/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj @@ -5,7 +5,7 @@ - + diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index 04b8a7dc734f0..780da3ad39081 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -7,7 +7,7 @@ - + From 56437409d1f99852a6b9486b1620c3ed12ff3e5c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 12:36:02 -0400 Subject: [PATCH 072/106] MINOR: [Go] Bump github.com/hamba/avro/v2 from 2.20.1 to 2.21.1 in /go (#41512) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [github.com/hamba/avro/v2](https://github.com/hamba/avro) from 2.20.1 to 2.21.1.
Release notes

Sourced from github.com/hamba/avro/v2's releases.

v2.21.1

Full Changelog: https://github.com/hamba/avro/compare/v2.21.0...v2.21.1

v2.21.0

Full Changelog: https://github.com/hamba/avro/compare/v2.20.1...v2.21.0

Commits
  • 5dde47b fix: support 32bit builds (#390)
  • ad836ba chore: bump golangci/golangci-lint-action from 4 to 5 in the all group (#388)
  • e42dea1 fix: Union Decoder uses readInt (#387)
  • 2461d45 fix: reader int/long setting head > tail (#385)
  • 84f9b10 fix: readByte returns errors on unexpected EOF (#383)
  • 141e857 fix: reader returns errors on unexpected EOF (#382)
  • f138d7f fix: handle short read errors on arrays and maps (#379)
  • b43fe48 feat: add max slice alloc size config (#376)
  • 0b21284 Check for max allocation (#374)
  • 7a2eb5f feat: support slices for nullable unions (#372)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=github.com/hamba/avro/v2&package-manager=go_modules&previous-version=2.20.1&new-version=2.21.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Matt Topol --- go/go.mod | 2 +- go/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index 79c3cc3981231..547127a470a61 100644 --- a/go/go.mod +++ b/go/go.mod @@ -47,7 +47,7 @@ require ( require ( github.com/google/uuid v1.6.0 - github.com/hamba/avro/v2 v2.20.1 + github.com/hamba/avro/v2 v2.21.1 github.com/substrait-io/substrait-go v0.4.2 github.com/tidwall/sjson v1.2.5 ) diff --git a/go/go.sum b/go/go.sum index e8c2fde15181a..29490b9da2ca3 100644 --- a/go/go.sum +++ b/go/go.sum @@ -43,8 +43,8 @@ github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26 h1:Xim43kblpZXfIBQsbu github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26/go.mod h1:dDKJzRmX4S37WGHujM7tX//fmj1uioxKzKxz3lo4HJo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/hamba/avro/v2 v2.20.1 h1:3WByQiVn7wT7d27WQq6pvBRC00FVOrniP6u67FLA/2E= -github.com/hamba/avro/v2 v2.20.1/go.mod h1:xHiKXbISpb3Ovc809XdzWow+XGTn+Oyf/F9aZbTLAig= +github.com/hamba/avro/v2 v2.21.1 h1:400/jTdLWQ3ib58y83VXlTJKijRouYQszY1SO0cMGt4= +github.com/hamba/avro/v2 v2.21.1/go.mod h1:ouJ4PkiAEP49u0lAtQyd5Gv04MehKj+7lXwD3zpLpY0= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= From 2b0647230536ffb2fd2d59af11acdb4674ed44c3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 3 May 2024 12:36:25 -0400 Subject: [PATCH 073/106] MINOR: [Go] Bump google.golang.org/protobuf from 1.33.0 to 1.34.0 in /go (#41513) Bumps google.golang.org/protobuf from 1.33.0 to 1.34.0. [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=google.golang.org/protobuf&package-manager=go_modules&previous-version=1.33.0&new-version=1.34.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Matt Topol --- go/go.mod | 2 +- go/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index 547127a470a61..35fd9b9915c0b 100644 --- a/go/go.mod +++ b/go/go.mod @@ -41,7 +41,7 @@ require ( golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 gonum.org/v1/gonum v0.15.0 google.golang.org/grpc v1.63.2 - google.golang.org/protobuf v1.33.0 + google.golang.org/protobuf v1.34.0 modernc.org/sqlite v1.29.6 ) diff --git a/go/go.sum b/go/go.sum index 29490b9da2ca3..bf33fed6c4c97 100644 --- a/go/go.sum +++ b/go/go.sum @@ -138,8 +138,8 @@ google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de h1: google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de/go.mod h1:H4O17MA/PE9BsGx3w+a+W2VOLLD1Qf7oJneAoU6WktY= google.golang.org/grpc v1.63.2 h1:MUeiw1B2maTVZthpU5xvASfTh3LDbxHd6IJ6QQVU+xM= google.golang.org/grpc v1.63.2/go.mod h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDomNkRA= -google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= -google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= +google.golang.org/protobuf v1.34.0 h1:Qo/qEd2RZPCf2nKuorzksSknv0d3ERwp1vFG38gSmH4= +google.golang.org/protobuf v1.34.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= From 493d456d8c6eff21659dc87a2dee32abd0be5ffd Mon Sep 17 00:00:00 2001 From: Sarah Gilmore <74676073+sgilmore10@users.noreply.github.com> Date: Fri, 3 May 2024 14:43:51 -0400 Subject: [PATCH 074/106] GH-41507: [MATLAB][CI] Pass `strict: true` to `matlab-actions/run-tests@v2` (#41530) ### Rationale for this change The MATLAB CI jobs should fail if any one of the unit tests issues a `warning`. Currently, the MATLAB CI jobs only fail if there is a verification failure. Passing the argument `strict: true` to `matlab-actions/run-tests@ v2` will ensure MATLAB jobs will fail if a test warning is issued. See the [`matlab-actions/run-tests@ v2` documentation](https://github.com/matlab-actions/run-tests/?tab=readme-ov-file#run-matlab-tests) for more details. ### What changes are included in this PR? 1. Pass `strict: true` argument to `matlab-actions/setup-matlab@ v2` ### Are these changes tested? N/A (relying on existing tests). ### Are there any user-facing changes? No. 
* GitHub Issue: #41507 Authored-by: Sarah Gilmore Signed-off-by: Sarah Gilmore --- .github/workflows/matlab.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml index 8a0de8a365661..2ae33d1e8d6c6 100644 --- a/.github/workflows/matlab.yml +++ b/.github/workflows/matlab.yml @@ -96,6 +96,7 @@ jobs: uses: matlab-actions/run-tests@v2 with: select-by-folder: matlab/test + strict: true macos: name: AMD64 macOS 12 MATLAB runs-on: macos-12 @@ -135,7 +136,8 @@ jobs: MATLABPATH: matlab/install/arrow_matlab uses: matlab-actions/run-tests@v2 with: - select-by-folder: matlab/test + select-by-folder: matlab/test + strict: true windows: name: AMD64 Windows 2022 MATLAB runs-on: windows-2022 @@ -181,4 +183,5 @@ jobs: MATLABPATH: matlab/install/arrow_matlab uses: matlab-actions/run-tests@v2 with: - select-by-folder: matlab/test + select-by-folder: matlab/test + strict: true From e7f5f810ac1235ee835a6ec62fb25d2f05e5d542 Mon Sep 17 00:00:00 2001 From: Sarah Gilmore <74676073+sgilmore10@users.noreply.github.com> Date: Fri, 3 May 2024 14:46:01 -0400 Subject: [PATCH 075/106] GH-41531: [MATLAB][Packaging] Bump `matlab-actions/setup-matlab` and `matlab-actions/run-command` from `v1` to `v2` in the `crossbow` job (#41532) ### Rationale for this change I noticed `dev/tasks/matlab/github.yml` is still using `matlab-actions/setup-matlab@ v1`, which is no longer supported. See this [log](https://github.com/ursacomputing/crossbow/actions/runs/8928507510/job/24524230971#step:4:11) file. We should use `matlab-actions/setup-matlab@ v2` instead. We should also use `matlab-actions/run-command@ v2` instead of `matlab-actions/run-command@ v1`. ### What changes are included in this PR? 1. Replaced `matlab-actions/setup-matlab@ v1` with `matlab-actions/setup-matlab@ v2` in `dev/tasks/matlab/github.yml` 2. Replaced `matlab-actions/run-command@ v1` with `matlab-actions/run-command@ v2` in `dev/tasks/matlab/github.yml` ### Are these changes tested? N/A (I will trigger a crossbow job to verify the packaging workflow works as expected still). ### Are there any user-facing changes? No. 
* GitHub Issue: #41531 Authored-by: Sarah Gilmore Signed-off-by: Sarah Gilmore --- dev/tasks/matlab/github.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dev/tasks/matlab/github.yml b/dev/tasks/matlab/github.yml index 7840fd176705c..963c85f6e11bf 100644 --- a/dev/tasks/matlab/github.yml +++ b/dev/tasks/matlab/github.yml @@ -29,7 +29,7 @@ jobs: - name: Install ninja-build run: sudo apt-get update && sudo apt-get install ninja-build - name: Install MATLAB - uses: matlab-actions/setup-matlab@v1 + uses: matlab-actions/setup-matlab@v2 with: release: R2024a - name: Build MATLAB Interface @@ -66,7 +66,7 @@ jobs: - name: Install ninja-build run: brew install ninja - name: Install MATLAB - uses: matlab-actions/setup-matlab@v1 + uses: matlab-actions/setup-matlab@v2 with: release: R2024a - name: Build MATLAB Interface @@ -101,7 +101,7 @@ jobs: steps: {{ macros.github_checkout_arrow()|indent }} - name: Install MATLAB - uses: matlab-actions/setup-matlab@v1 + uses: matlab-actions/setup-matlab@v2 with: release: R2024a - name: Install sccache @@ -147,7 +147,7 @@ jobs: cp arrow/LICENSE.txt arrow/matlab/install/arrow_matlab/LICENSE.txt cp arrow/NOTICE.txt arrow/matlab/install/arrow_matlab/NOTICE.txt - name: Install MATLAB - uses: matlab-actions/setup-matlab@v1 + uses: matlab-actions/setup-matlab@v2 with: release: R2024a - name: Run commands @@ -156,7 +156,7 @@ jobs: ARROW_MATLAB_TOOLBOX_FOLDER: arrow/matlab/install/arrow_matlab ARROW_MATLAB_TOOLBOX_OUTPUT_FOLDER: artifacts/matlab-dist ARROW_MATLAB_TOOLBOX_VERSION: {{ arrow.no_rc_version }} - uses: matlab-actions/run-command@v1 + uses: matlab-actions/run-command@v2 with: command: packageMatlabInterface {{ macros.github_upload_releases(["artifacts/matlab-dist/*.mltbx"])|indent }} From 7cd9c6fbd313c9afa0673d85415a84fd199951c9 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Fri, 3 May 2024 19:17:47 -0400 Subject: [PATCH 076/106] GH-41534: [Go] Fix mem leak importing 0 length C Array (#41535) ### What changes are included in this PR? If the `imp.alloc.bufCount` is 0, indicating we did not import any buffers from the provided C ArrowArray object, then we are free to not only call the release callback (which we already do) but also we need to free the temp ArrowArray we allocated to move the source to. This was uncovered by https://github.com/apache/arrow-adbc/pull/1808 * GitHub Issue: #41534 Authored-by: Matt Topol Signed-off-by: David Li --- go/arrow/cdata/cdata.go | 1 + 1 file changed, 1 insertion(+) diff --git a/go/arrow/cdata/cdata.go b/go/arrow/cdata/cdata.go index b86898277bf47..00d1f351eaf11 100644 --- a/go/arrow/cdata/cdata.go +++ b/go/arrow/cdata/cdata.go @@ -448,6 +448,7 @@ func (imp *cimporter) doImportArr(src *CArrowArray) error { defer func() { if imp.alloc.bufCount == 0 { C.ArrowArrayRelease(imp.arr) + C.free(unsafe.Pointer(imp.arr)) } }() From 2ba129f0a28e0ec93c7f383a64c54bef24e26f59 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 6 May 2024 06:56:08 +0900 Subject: [PATCH 077/106] MINOR: [JS] Bump @swc/helpers from 0.5.10 to 0.5.11 in /js (#41486) Bumps [@ swc/helpers](https://github.com/swc-project/swc) from 0.5.10 to 0.5.11.
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=@ swc/helpers&package-manager=npm_and_yarn&previous-version=0.5.10&new-version=0.5.11)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- js/yarn.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/js/yarn.lock b/js/yarn.lock index abde21f603382..eb7ed33520f0a 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -1199,9 +1199,9 @@ integrity sha512-e2BR4lsJkkRlKZ/qCHPw9ZaSxc0MVUd7gtbtaB7aMvHeJVYe8sOB8DBZkP2DtISHGSku9sCK6T6cnY0CtXrOCQ== "@swc/helpers@^0.5.10": - version "0.5.10" - resolved "https://registry.yarnpkg.com/@swc/helpers/-/helpers-0.5.10.tgz#5720082d007197cd85743dd599198097126a3f6e" - integrity sha512-CU+RF9FySljn7HVSkkjiB84hWkvTaI3rtLvF433+jRSBL2hMu3zX5bGhHS8C80SM++h4xy8hBSnUHFQHmRXSBw== + version "0.5.11" + resolved "https://registry.yarnpkg.com/@swc/helpers/-/helpers-0.5.11.tgz#5bab8c660a6e23c13b2d23fcd1ee44a2db1b0cb7" + integrity sha512-YNlnKRWF2sVojTpIyzwou9XoTNbzbzONwRhOoniEioF1AtaitTvVZblaQRrAzChWQ1bLYyYSWzM18y4WwgzJ+A== dependencies: tslib "^2.4.0" From 4cf44b4bc3ab053b03c937d3327d43c105790462 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JB=20Onofr=C3=A9?= Date: Mon, 6 May 2024 05:39:34 +0200 Subject: [PATCH 078/106] MINOR: add jbonofre in collaborators list (#41528) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Add `jbonofre` to the ASF collaborators list. ### What changes are included in this PR? Update `.asf.yaml`. ### Are these changes tested? NA ### Are there any user-facing changes? No Authored-by: JB Onofré Signed-off-by: Jacob Wujciak-Jens --- .asf.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.asf.yaml b/.asf.yaml index 40b961dc6e885..760a830ef98c7 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -23,6 +23,7 @@ github: - benibus - danepitkin - davisusanibar + - jbonofre - js8544 - vibhatha From d10ebf055a393c94a693097db1dca08ff86745bd Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Mon, 6 May 2024 09:28:22 -0400 Subject: [PATCH 079/106] MINOR: [R] fix no visible global function definition: left_join (#41542) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Followup to #41350; fixes a check NOTE that it caused. ### What changes are included in this PR? `dplyr::` in two places. ### Are these changes tested? Check will be clean. ### Are there any user-facing changes?
🙅 --- r/R/dplyr-mutate.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 880f7799e6316..72882b6afd964 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -84,12 +84,12 @@ mutate.arrow_dplyr_query <- function(.data, agg_query$aggregations <- mask$.aggregations agg_query <- collapse.arrow_dplyr_query(agg_query) if (length(grv)) { - out <- left_join(out, agg_query, by = grv) + out <- dplyr::left_join(out, agg_query, by = grv) } else { # If there are no group_by vars, add a scalar column to both and join on that agg_query$selected_columns[["..tempjoin"]] <- Expression$scalar(1L) out$selected_columns[["..tempjoin"]] <- Expression$scalar(1L) - out <- left_join(out, agg_query, by = "..tempjoin") + out <- dplyr::left_join(out, agg_query, by = "..tempjoin") } } From 3a54e68c69b028afe6d5fbb58eb0c4520dca1308 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 7 May 2024 04:40:51 +0800 Subject: [PATCH 080/106] MINOR: [Dev] Add zanmato1984 and ZhangHuiGui in collaborators list (#41544) ### Rationale for this change Recently zanmato1984 and ZhangHuiGui have been active on the arrow-compute and arrow-acero modules, which lack maintainers. Their contributions can be seen below: * https://github.com/apache/arrow/commits?author=zanmato1984 * https://github.com/apache/arrow/commits?author=ZhangHuiGui I'd like to promote them to collaborators. ### What changes are included in this PR? Changes `.asf.yaml` ### Are these changes tested? No ### Are there any user-facing changes? No Authored-by: mwish Signed-off-by: Sutou Kouhei --- .asf.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.asf.yaml b/.asf.yaml index 760a830ef98c7..1eb019fea9af1 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -26,6 +26,8 @@ github: - jbonofre - js8544 - vibhatha + - zanmato1984 + - ZhangHuiGui notifications: commits: commits@arrow.apache.org From 52321377cc9fbcb8678577f10232aea984a235f5 Mon Sep 17 00:00:00 2001 From: ZhangHuiGui <106943008+ZhangHuiGui@users.noreply.github.com> Date: Tue, 7 May 2024 05:13:44 -0400 Subject: [PATCH 081/106] GH-40997: [C++] Get null_bit_id according to are_cols_in_encoding_order in NullUpdateColumnToRow_avx2 (#40998) ### Rationale for this change Recently, we found that the compare internal's AVX2 function NullUpdateColumnToRowImp_avx2 lost the are_cols_in_encoding_order check when getting null_bit_id. This could make the grouper's compare results wrong (are_cols_in_encoding_order = true in the grouper). ### What changes are included in this PR? Get `null_bit_id` according to `are_cols_in_encoding_order` in NullUpdateColumnToRow_avx2. ### Are there any user-facing changes?
No Co-authored-by laotan332 Co-authored-by ZhangHuiGui <2689496754@ qq.com> * GitHub Issue: #40997 Lead-authored-by: ZhangHuiGui Co-authored-by: ZhangHuiGui <2689496754@qq.com> Signed-off-by: Antoine Pitrou --- cpp/src/arrow/compute/CMakeLists.txt | 3 +- cpp/src/arrow/compute/row/compare_internal.cc | 41 +++++------ cpp/src/arrow/compute/row/compare_internal.h | 25 ++++--- .../compute/row/compare_internal_avx2.cc | 20 +++--- cpp/src/arrow/compute/row/grouper_test.cc | 68 +++++++++++++++++++ cpp/src/arrow/compute/row/row_internal.cc | 3 +- 6 files changed, 116 insertions(+), 44 deletions(-) create mode 100644 cpp/src/arrow/compute/row/grouper_test.cc diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index badcf4f2f26ac..fb778be113029 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -90,7 +90,8 @@ add_arrow_test(internals_test light_array_test.cc registry_test.cc key_hash_test.cc - row/compare_test.cc) + row/compare_test.cc + row/grouper_test.cc) add_arrow_compute_test(expression_test SOURCES expression_test.cc) diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc index 078a8287c71c0..98aea9011266c 100644 --- a/cpp/src/arrow/compute/row/compare_internal.cc +++ b/cpp/src/arrow/compute/row/compare_internal.cc @@ -36,22 +36,22 @@ void KeyCompare::NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_com const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows, - uint8_t* match_bytevector, - bool are_cols_in_encoding_order) { + bool are_cols_in_encoding_order, + uint8_t* match_bytevector) { if (!rows.has_any_nulls(ctx) && !col.data(0)) { return; } uint32_t num_processed = 0; #if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { - num_processed = NullUpdateColumnToRow_avx2(use_selection, id_col, num_rows_to_compare, - sel_left_maybe_null, left_to_right_map, - ctx, col, rows, match_bytevector); + num_processed = NullUpdateColumnToRow_avx2( + use_selection, id_col, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, match_bytevector); } #endif - uint32_t null_bit_id = - are_cols_in_encoding_order ? id_col : rows.metadata().pos_after_encoding(id_col); + const uint32_t null_bit_id = + ColIdInEncodingOrder(rows, id_col, are_cols_in_encoding_order); if (!col.data(0)) { // Remove rows from the result for which the column value is a null @@ -363,10 +363,9 @@ void KeyCompare::CompareColumnsToRows( continue; } - uint32_t offset_within_row = rows.metadata().encoded_field_offset( - are_cols_in_encoding_order - ? static_cast(icol) - : rows.metadata().pos_after_encoding(static_cast(icol))); + uint32_t offset_within_row = + rows.metadata().encoded_field_offset(ColIdInEncodingOrder( + rows, static_cast(icol), are_cols_in_encoding_order)); if (col.metadata().is_fixed_length) { if (sel_left_maybe_null) { CompareBinaryColumnToRow( @@ -375,9 +374,8 @@ void KeyCompare::CompareColumnsToRows( is_first_column ? match_bytevector_A : match_bytevector_B); NullUpdateColumnToRow( static_cast(icol), num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, - is_first_column ? match_bytevector_A : match_bytevector_B, - are_cols_in_encoding_order); + left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, + is_first_column ? 
match_bytevector_A : match_bytevector_B); } else { // Version without using selection vector CompareBinaryColumnToRow( @@ -386,9 +384,8 @@ void KeyCompare::CompareColumnsToRows( is_first_column ? match_bytevector_A : match_bytevector_B); NullUpdateColumnToRow( static_cast(icol), num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, - is_first_column ? match_bytevector_A : match_bytevector_B, - are_cols_in_encoding_order); + left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, + is_first_column ? match_bytevector_A : match_bytevector_B); } if (!is_first_column) { AndByteVectors(ctx, num_rows_to_compare, match_bytevector_A, match_bytevector_B); @@ -414,9 +411,8 @@ void KeyCompare::CompareColumnsToRows( } NullUpdateColumnToRow( static_cast(icol), num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, - is_first_column ? match_bytevector_A : match_bytevector_B, - are_cols_in_encoding_order); + left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, + is_first_column ? match_bytevector_A : match_bytevector_B); } else { if (ivarbinary == 0) { CompareVarBinaryColumnToRow( @@ -429,9 +425,8 @@ void KeyCompare::CompareColumnsToRows( } NullUpdateColumnToRow( static_cast(icol), num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, - is_first_column ? match_bytevector_A : match_bytevector_B, - are_cols_in_encoding_order); + left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, + is_first_column ? match_bytevector_A : match_bytevector_B); } if (!is_first_column) { AndByteVectors(ctx, num_rows_to_compare, match_bytevector_A, match_bytevector_B); diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h index b039ca97ff978..16002ee5184e9 100644 --- a/cpp/src/arrow/compute/row/compare_internal.h +++ b/cpp/src/arrow/compute/row/compare_internal.h @@ -43,13 +43,19 @@ class ARROW_EXPORT KeyCompare { uint8_t* out_match_bitvector_maybe_null = NULLPTR); private: + static uint32_t ColIdInEncodingOrder(const RowTableImpl& rows, uint32_t id_col, + bool are_cols_in_encoding_order) { + return are_cols_in_encoding_order ? 
id_col + : rows.metadata().pos_after_encoding(id_col); + } + template static void NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows, - uint8_t* match_bytevector, - bool are_cols_in_encoding_order); + bool are_cols_in_encoding_order, + uint8_t* match_bytevector); template static void CompareBinaryColumnToRowHelper( @@ -92,7 +98,8 @@ class ARROW_EXPORT KeyCompare { static uint32_t NullUpdateColumnToRowImp_avx2( uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, - const RowTableImpl& rows, uint8_t* match_bytevector); + const RowTableImpl& rows, bool are_cols_in_encoding_order, + uint8_t* match_bytevector); template static uint32_t CompareBinaryColumnToRowHelper_avx2( @@ -118,13 +125,11 @@ class ARROW_EXPORT KeyCompare { static uint32_t AndByteVectors_avx2(uint32_t num_elements, uint8_t* bytevector_A, const uint8_t* bytevector_B); - static uint32_t NullUpdateColumnToRow_avx2(bool use_selection, uint32_t id_col, - uint32_t num_rows_to_compare, - const uint16_t* sel_left_maybe_null, - const uint32_t* left_to_right_map, - LightContext* ctx, const KeyColumnArray& col, - const RowTableImpl& rows, - uint8_t* match_bytevector); + static uint32_t NullUpdateColumnToRow_avx2( + bool use_selection, uint32_t id_col, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows, + bool are_cols_in_encoding_order, uint8_t* match_bytevector); static uint32_t CompareBinaryColumnToRow_avx2( bool use_selection, uint32_t offset_within_row, uint32_t num_rows_to_compare, diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index ff407c51b83cb..18f656a2e458d 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -39,12 +39,14 @@ template uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, - const RowTableImpl& rows, uint8_t* match_bytevector) { + const RowTableImpl& rows, bool are_cols_in_encoding_order, + uint8_t* match_bytevector) { if (!rows.has_any_nulls(ctx) && !col.data(0)) { return num_rows_to_compare; } - uint32_t null_bit_id = rows.metadata().pos_after_encoding(id_col); + const uint32_t null_bit_id = + ColIdInEncodingOrder(rows, id_col, are_cols_in_encoding_order); if (!col.data(0)) { // Remove rows from the result for which the column value is a null @@ -569,7 +571,7 @@ uint32_t KeyCompare::NullUpdateColumnToRow_avx2( bool use_selection, uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows, - uint8_t* match_bytevector) { + bool are_cols_in_encoding_order, uint8_t* match_bytevector) { int64_t num_rows_safe = TailSkipForSIMD::FixBitAccess(sizeof(uint32_t), col.length(), col.bit_offset(0)); if (sel_left_maybe_null) { @@ -580,13 +582,13 @@ uint32_t KeyCompare::NullUpdateColumnToRow_avx2( } if (use_selection) { - return NullUpdateColumnToRowImp_avx2(id_col, num_rows_to_compare, - sel_left_maybe_null, 
left_to_right_map, - ctx, col, rows, match_bytevector); + return NullUpdateColumnToRowImp_avx2( + id_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, + rows, are_cols_in_encoding_order, match_bytevector); } else { - return NullUpdateColumnToRowImp_avx2(id_col, num_rows_to_compare, - sel_left_maybe_null, left_to_right_map, - ctx, col, rows, match_bytevector); + return NullUpdateColumnToRowImp_avx2( + id_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, + rows, are_cols_in_encoding_order, match_bytevector); } } diff --git a/cpp/src/arrow/compute/row/grouper_test.cc b/cpp/src/arrow/compute/row/grouper_test.cc new file mode 100644 index 0000000000000..1e853be5e4af7 --- /dev/null +++ b/cpp/src/arrow/compute/row/grouper_test.cc @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/compute/exec.h" +#include "arrow/compute/row/grouper.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" + +namespace arrow { +namespace compute { + +// Specialized case for GH-40997 +TEST(Grouper, ResortedColumnsWithLargeNullRows) { + const uint64_t num_rows = 1024; + + // construct random array with plenty of null values + const int32_t kSeed = 42; + const int32_t min = 0; + const int32_t max = 100; + const double null_probability = 0.3; + const double true_probability = 0.5; + auto rng = random::RandomArrayGenerator(kSeed); + auto b_arr = rng.Boolean(num_rows, true_probability, null_probability); + auto i32_arr = rng.Int32(num_rows, min, max, null_probability); + auto i64_arr = rng.Int64(num_rows, min, max * 10, null_probability); + + // construct batches with columns which will be resorted in the grouper make + std::vector exec_batches = {ExecBatch({i64_arr, i32_arr, b_arr}, num_rows), + ExecBatch({i32_arr, i64_arr, b_arr}, num_rows), + ExecBatch({i64_arr, b_arr, i32_arr}, num_rows), + ExecBatch({i32_arr, b_arr, i64_arr}, num_rows), + ExecBatch({b_arr, i32_arr, i64_arr}, num_rows), + ExecBatch({b_arr, i64_arr, i32_arr}, num_rows)}; + + const int num_batches = static_cast(exec_batches.size()); + std::vector group_num_vec; + group_num_vec.reserve(num_batches); + + for (const auto& exec_batch : exec_batches) { + ExecSpan span(exec_batch); + ASSERT_OK_AND_ASSIGN(auto grouper, Grouper::Make(span.GetTypes())); + ASSERT_OK_AND_ASSIGN(Datum group_ids, grouper->Consume(span)); + group_num_vec.emplace_back(grouper->num_groups()); + } + + for (int i = 1; i < num_batches; i++) { + ASSERT_EQ(group_num_vec[i - 1], group_num_vec[i]); + } +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/row/row_internal.cc b/cpp/src/arrow/compute/row/row_internal.cc index f6a62c09fcf24..469205e9b008d 100644 --- 
a/cpp/src/arrow/compute/row/row_internal.cc +++ b/cpp/src/arrow/compute/row/row_internal.cc @@ -66,7 +66,8 @@ void RowTableMetadata::FromColumnMetadataVector( // // Columns are sorted based on the size in bytes of their fixed-length part. // For the varying-length column, the fixed-length part is the 32-bit field storing - // cumulative length of varying-length fields. + // cumulative length of varying-length fields. This is to make the memory access of + // each individual column within the encoded row alignment-friendly. // // The rules are: // From b719408f4abd9921f35935fe0de771f3b856efd1 Mon Sep 17 00:00:00 2001 From: hemidark <67875833+hemidark@users.noreply.github.com> Date: Tue, 7 May 2024 02:44:48 -0700 Subject: [PATCH 082/106] GH-40560: [Python] RunEndEncodedArray.from_arrays: bugfix for Array arguments (#40560) (#41093) ### Rationale for this change The documentation suggests that `RunEndEncodedArray.from_arrays` takes two `Array` parameters, as would be expected of a `from_arrays` method. However, if given an `Array` instance for the `run_ends` parameter, it errors because `Array.__getitem__` returns a pyarrow scalar instead of a native Python integer. ### What changes are included in this PR? * Handle `Array` parameters for `run_ends` by unconditionally coercing the logical length to a pyarrow scalar, then to a Python native value. ### Are these changes tested? Yes. Augmented the existing unit tests to test with `Array` as well as Python lists, and check that the data types of the `Array` instances correctly carry over to the data type of the `RunEndEncodedArray`. ### Are there any user-facing changes? Not apart from the bugfix; this was the minimum necessary change to make `Array` parameters work. `RunEndEncodedArray.from_arrays` continues to support e.g. python lists as before.
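To make the fixed behavior concrete, here is a minimal sketch; the values mirror the unit test added in this patch, and the assertions are illustrative rather than part of the change:

```python
import pyarrow as pa

# Typed run-ends used to fail because run_ends[-1] returned a pyarrow
# scalar rather than a Python int; the fix coerces the logical length
# via scalar(...).as_py() before passing it along.
run_ends = pa.array([3, 5, 10, 19], type=pa.int32())
values = pa.array([1, 2, 1, 3], type=pa.int64())

ree = pa.RunEndEncodedArray.from_arrays(run_ends, values)
assert ree.type == pa.run_end_encoded(pa.int32(), pa.int64())
assert len(ree) == 19  # logical length comes from the last run end
```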
* GitHub Issue: #40560 Authored-by: Hemidark Signed-off-by: Joris Van den Bossche --- python/pyarrow/array.pxi | 2 +- python/pyarrow/tests/test_array.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 6a11b19ffcdf5..946c82b258241 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3984,7 +3984,7 @@ cdef class RunEndEncodedArray(Array): ------- RunEndEncodedArray """ - logical_length = run_ends[-1] if len(run_ends) > 0 else 0 + logical_length = scalar(run_ends[-1]).as_py() if len(run_ends) > 0 else 0 return RunEndEncodedArray._from_arrays(type, True, logical_length, run_ends, values, 0) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 6a190957879d3..3754daeb9b4bd 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3578,12 +3578,23 @@ def check_run_end_encoded_from_arrays_with_type(ree_type=None): check_run_end_encoded(ree_array, run_ends, values, 19, 4, 0) +def check_run_end_encoded_from_typed_arrays(ree_type): + run_ends = [3, 5, 10, 19] + values = [1, 2, 1, 3] + typed_run_ends = pa.array(run_ends, ree_type.run_end_type) + typed_values = pa.array(values, ree_type.value_type) + ree_array = pa.RunEndEncodedArray.from_arrays(typed_run_ends, typed_values) + assert ree_array.type == ree_type + check_run_end_encoded(ree_array, run_ends, values, 19, 4, 0) + + def test_run_end_encoded_from_arrays(): check_run_end_encoded_from_arrays_with_type() for run_end_type in [pa.int16(), pa.int32(), pa.int64()]: for value_type in [pa.uint32(), pa.int32(), pa.uint64(), pa.int64()]: ree_type = pa.run_end_encoded(run_end_type, value_type) check_run_end_encoded_from_arrays_with_type(ree_type) + check_run_end_encoded_from_typed_arrays(ree_type) def test_run_end_encoded_from_buffers(): From c79b6a593e21c10dc65e06a2717809ab83fd31db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Tue, 7 May 2024 14:54:10 +0200 Subject: [PATCH 083/106] GH-41566: [CI][Packaging] macOS wheel for Catalina fails to build on macOS arm64 (#41567) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Wheels for macOS catalina are failing ### What changes are included in this PR? Use macos-13 instead of (latest) ARM ### Are these changes tested? Yes, via archery ### Are there any user-facing changes? No * GitHub Issue: #41566 Authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- dev/tasks/tasks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 52a235c688eda..126b0fcb6f76a 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -409,7 +409,7 @@ tasks: arrow_jemalloc: "ON" python_version: "{{ python_version }}" macos_deployment_target: "{{ macos_version }}" - runs_on: "macos-latest" + runs_on: "macos-13" vcpkg_arch: "amd64" artifacts: - pyarrow-{no_rc_version}-{{ python_tag }}-{{ abi_tag }}-{{ platform_tag }}.whl From 03f8ae754ede16f118ccdba0abb593b1461024aa Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 7 May 2024 09:42:55 -0400 Subject: [PATCH 084/106] GH-41540: [R] Simplify arrow_eval() logic and bindings environments (#41537) ### Rationale for this change NSE is hard enough. I wanted to see if I could remove some layers of complexity. ### What changes are included in this PR? * There no longer are separate collections of `agg_funcs` and `nse_funcs`. 
Now that the aggregation functions return Expressions (https://github.com/apache/arrow/pull/41223), there's no reason to treat them separately. All bindings return Expressions now. * Both are removed and functions are just stored in `.cache$functions`. There was a note wondering why both `nse_funcs` and that needed to exist. They don't. * `arrow_mask()` no longer has an `aggregations` argument: agg functions are always present. * Because agg functions are always present, `filter` and `arrange` now have to check for whether the expressions passed to them contain aggregations--this is supported in regular dplyr but we have deferred supporting it here for now (see https://github.com/apache/arrow/pull/41350). If we decide we want to support it later, these checks are the entry points where we'd drop in the `left_join()` as in `mutate()`. * The logic of evaluating expresssions in `filter()` has been simplified. * Assorted other cleanups: `register_binding()` has two fewer arguments, for example, and the duplicate functions for referencing agg_funcs are gone. There is one more refactor I intend to pursue, and that's to rework abandon_ship and how arrow_eval does error handling, but I ~may~ will defer that to a followup. ### Are these changes tested? Yes, though I'll add some more for filter/aggregate in the followup since I'm reworking things there. ### Are there any user-facing changes? There are a couple of edge cases where the error message will change subtly. For example, if you supplied a comma-separated list of filter expressions, and more than one of them did not evaluate, previously you would be informed of all of the failures; now, we error on the first one. I don't think this is concerning. * GitHub Issue: #41540 --- r/R/dplyr-arrange.R | 8 ++ r/R/dplyr-eval.R | 17 +-- r/R/dplyr-filter.R | 54 +++------ r/R/dplyr-funcs-agg.R | 26 ++--- r/R/dplyr-funcs.R | 119 ++++---------------- r/R/dplyr-mutate.R | 2 +- r/R/dplyr-summarize.R | 2 +- r/R/udf.R | 7 +- r/man/register_binding.Rd | 45 +------- r/tests/testthat/test-dataset-dplyr.R | 2 +- r/tests/testthat/test-dplyr-filter.R | 9 +- r/tests/testthat/test-dplyr-funcs.R | 30 ++--- r/tests/testthat/test-dplyr-summarize.R | 28 ++--- r/tests/testthat/test-udf.R | 14 +-- r/vignettes/developers/writing_bindings.Rmd | 7 +- 15 files changed, 109 insertions(+), 261 deletions(-) diff --git a/r/R/dplyr-arrange.R b/r/R/dplyr-arrange.R index f91cd14211e0f..c8594c77df000 100644 --- a/r/R/dplyr-arrange.R +++ b/r/R/dplyr-arrange.R @@ -47,6 +47,14 @@ arrange.arrow_dplyr_query <- function(.data, ..., .by_group = FALSE) { msg <- paste("Expression", names(sorts)[i], "not supported in Arrow") return(abandon_ship(call, .data, msg)) } + if (length(mask$.aggregations)) { + # dplyr lets you arrange on e.g. x < mean(x), but we haven't implemented it. + # But we could, the same way it works in mutate() via join, if someone asks. + # Until then, just error. 
+ # TODO: add a test for this + msg <- paste("Expression", format_expr(expr), "not supported in arrange() in Arrow") + return(abandon_ship(call, .data, msg)) + } descs[i] <- x[["desc"]] } .data$arrange_vars <- c(sorts, .data$arrange_vars) diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R index ff1619ce944d0..211c26cecce8c 100644 --- a/r/R/dplyr-eval.R +++ b/r/R/dplyr-eval.R @@ -121,24 +121,9 @@ arrow_not_supported <- function(msg) { } # Create a data mask for evaluating a dplyr expression -arrow_mask <- function(.data, aggregation = FALSE) { +arrow_mask <- function(.data) { f_env <- new_environment(.cache$functions) - if (aggregation) { - # Add the aggregation functions to the environment. - for (f in names(agg_funcs)) { - f_env[[f]] <- agg_funcs[[f]] - } - } else { - # Add functions that need to error hard and clear. - # Some R functions will still try to evaluate on an Expression - # and return NA with a warning :exploding_head: - fail <- function(...) stop("Not implemented") - for (f in c("mean", "sd")) { - f_env[[f]] <- fail - } - } - # Assign the schema to the expressions schema <- .data$.data$schema walk(.data$selected_columns, ~ (.$schema <- schema)) diff --git a/r/R/dplyr-filter.R b/r/R/dplyr-filter.R index d85fa16af2e71..69decbd76655f 100644 --- a/r/R/dplyr-filter.R +++ b/r/R/dplyr-filter.R @@ -35,48 +35,24 @@ filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) } # tidy-eval the filter expressions inside an Arrow data_mask - filters <- lapply(expanded_filters, arrow_eval, arrow_mask(out)) - bad_filters <- map_lgl(filters, ~ inherits(., "try-error")) - if (any(bad_filters)) { - # This is similar to abandon_ship() except that the filter eval is - # vectorized, and we apply filters that _did_ work before abandoning ship - # with the rest - expr_labs <- map_chr(expanded_filters[bad_filters], format_expr) - if (query_on_dataset(out)) { - # Abort. We don't want to auto-collect if this is a Dataset because that - # could blow up, too big. - stop( - "Filter expression not supported for Arrow Datasets: ", - oxford_paste(expr_labs, quote = FALSE), - "\nCall collect() first to pull data into R.", - call. = FALSE - ) - } else { - arrow_errors <- map2_chr( - filters[bad_filters], expr_labs, - handle_arrow_not_supported - ) - if (length(arrow_errors) == 1) { - msg <- paste0(arrow_errors, "; ") - } else { - msg <- paste0("* ", arrow_errors, "\n", collapse = "") - } - warning( - msg, "pulling data into R", - immediate. = TRUE, - call. = FALSE - ) - # Set any valid filters first, then collect and then apply the invalid ones in R - out <- dplyr::collect(set_filters(out, filters[!bad_filters])) - if (by$from_by) { - out <- dplyr::ungroup(out) - } - return(dplyr::filter(out, !!!expanded_filters[bad_filters], .by = {{ .by }})) + mask <- arrow_mask(out) + for (expr in expanded_filters) { + filt <- arrow_eval(expr, mask) + if (inherits(filt, "try-error")) { + msg <- handle_arrow_not_supported(filt, format_expr(expr)) + return(abandon_ship(match.call(), .data, msg)) + } + if (length(mask$.aggregations)) { + # dplyr lets you filter on e.g. x < mean(x), but we haven't implemented it. + # But we could, the same way it works in mutate() via join, if someone asks. + # Until then, just error. 
+ # TODO: add a test for this + msg <- paste("Expression", format_expr(expr), "not supported in filter() in Arrow") + return(abandon_ship(match.call(), .data, msg)) } + out <- set_filters(out, filt) } - out <- set_filters(out, filters) - if (by$from_by) { out$group_by_vars <- character() } diff --git a/r/R/dplyr-funcs-agg.R b/r/R/dplyr-funcs-agg.R index 9411ce5ce6faf..c0c4eb3089425 100644 --- a/r/R/dplyr-funcs-agg.R +++ b/r/R/dplyr-funcs-agg.R @@ -29,56 +29,56 @@ # you can use list_compute_functions("^hash_") register_bindings_aggregate <- function() { - register_binding_agg("base::sum", function(..., na.rm = FALSE) { + register_binding("base::sum", function(..., na.rm = FALSE) { set_agg( fun = "sum", data = ensure_one_arg(list2(...), "sum"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("base::prod", function(..., na.rm = FALSE) { + register_binding("base::prod", function(..., na.rm = FALSE) { set_agg( fun = "product", data = ensure_one_arg(list2(...), "prod"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("base::any", function(..., na.rm = FALSE) { + register_binding("base::any", function(..., na.rm = FALSE) { set_agg( fun = "any", data = ensure_one_arg(list2(...), "any"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("base::all", function(..., na.rm = FALSE) { + register_binding("base::all", function(..., na.rm = FALSE) { set_agg( fun = "all", data = ensure_one_arg(list2(...), "all"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("base::mean", function(x, na.rm = FALSE) { + register_binding("base::mean", function(x, na.rm = FALSE) { set_agg( fun = "mean", data = list(x), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("stats::sd", function(x, na.rm = FALSE, ddof = 1) { + register_binding("stats::sd", function(x, na.rm = FALSE, ddof = 1) { set_agg( fun = "stddev", data = list(x), options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) ) }) - register_binding_agg("stats::var", function(x, na.rm = FALSE, ddof = 1) { + register_binding("stats::var", function(x, na.rm = FALSE, ddof = 1) { set_agg( fun = "variance", data = list(x), options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) ) }) - register_binding_agg( + register_binding( "stats::quantile", function(x, probs, na.rm = FALSE) { if (length(probs) != 1) { @@ -103,7 +103,7 @@ register_bindings_aggregate <- function() { "approximate quantile (t-digest) is computed" ) ) - register_binding_agg( + register_binding( "stats::median", function(x, na.rm = FALSE) { # TODO: Bind to the Arrow function that returns an exact median and remove @@ -122,28 +122,28 @@ register_bindings_aggregate <- function() { }, notes = "approximate median (t-digest) is computed" ) - register_binding_agg("dplyr::n_distinct", function(..., na.rm = FALSE) { + register_binding("dplyr::n_distinct", function(..., na.rm = FALSE) { set_agg( fun = "count_distinct", data = ensure_one_arg(list2(...), "n_distinct"), options = list(na.rm = na.rm) ) }) - register_binding_agg("dplyr::n", function() { + register_binding("dplyr::n", function() { set_agg( fun = "count_all", data = list(), options = list() ) }) - register_binding_agg("base::min", function(..., na.rm = FALSE) { + register_binding("base::min", function(..., na.rm = FALSE) { set_agg( fun = "min", data = ensure_one_arg(list2(...), "min"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("base::max", function(..., 
na.rm = FALSE) { + register_binding("base::max", function(..., na.rm = FALSE) { set_agg( fun = "max", data = ensure_one_arg(list2(...), "max"), diff --git a/r/R/dplyr-funcs.R b/r/R/dplyr-funcs.R index abf2362d0107f..c0eb47e428b7f 100644 --- a/r/R/dplyr-funcs.R +++ b/r/R/dplyr-funcs.R @@ -22,8 +22,8 @@ NULL #' Register compute bindings #' -#' The `register_binding()` and `register_binding_agg()` functions -#' are used to populate a list of functions that operate on (and return) +#' `register_binding()` is used to populate a list of functions that operate on +#' (and return) #' Expressions. These are the basis for the `.data` mask inside dplyr methods. #' #' @section Writing bindings: @@ -40,26 +40,10 @@ NULL #' * Inside your function, you can call any other binding with `call_binding()`. #' #' @param fun_name A string containing a function name in the form `"function"` or -#' `"package::function"`. The package name is currently not used but -#' may be used in the future to allow these types of function calls. -#' @param fun A function or `NULL` to un-register a previous function. +#' `"package::function"`. +#' @param fun A function, or `NULL` to un-register a previous function. #' This function must accept `Expression` objects as arguments and return #' `Expression` objects instead of regular R objects. -#' @param agg_fun An aggregate function or `NULL` to un-register a previous -#' aggregate function. This function must accept `Expression` objects as -#' arguments and return a `list()` with components: -#' - `fun`: string function name -#' - `data`: list of 0 or more `Expression`s -#' - `options`: list of function options, as passed to call_function -#' @param update_cache Update .cache$functions at the time of registration. -#' the default is FALSE because the majority of usage is to register -#' bindings at package load, after which we create the cache once. The -#' reason why .cache$functions is needed in addition to nse_funcs for -#' non-aggregate functions could be revisited...it is currently used -#' as the data mask in mutate, filter, and aggregate (but not -#' summarise) because the data mask has to be a list. -#' @param registry An environment in which the functions should be -#' assigned. #' @param notes string for the docs: note any limitations or differences in #' behavior between the Arrow version and the R function. 
#' @return The previously registered binding or `NULL` if no previously @@ -67,12 +51,10 @@ NULL #' @keywords internal register_binding <- function(fun_name, fun, - registry = nse_funcs, - update_cache = FALSE, notes = character(0)) { unqualified_name <- sub("^.*?:{+}", "", fun_name) - previous_fun <- registry[[unqualified_name]] + previous_fun <- .cache$functions[[unqualified_name]] # if the unqualified name exists in the registry, warn if (!is.null(previous_fun) && !identical(fun, previous_fun)) { @@ -87,58 +69,25 @@ register_binding <- function(fun_name, # register both as `pkg::fun` and as `fun` if `qualified_name` is prefixed # unqualified_name and fun_name will be the same if not prefixed - registry[[unqualified_name]] <- fun - registry[[fun_name]] <- fun - + .cache$functions[[unqualified_name]] <- fun + .cache$functions[[fun_name]] <- fun .cache$docs[[fun_name]] <- notes - - if (update_cache) { - fun_cache <- .cache$functions - fun_cache[[unqualified_name]] <- fun - fun_cache[[fun_name]] <- fun - .cache$functions <- fun_cache - } - invisible(previous_fun) } -unregister_binding <- function(fun_name, registry = nse_funcs, - update_cache = FALSE) { +unregister_binding <- function(fun_name) { unqualified_name <- sub("^.*?:{+}", "", fun_name) - previous_fun <- registry[[unqualified_name]] + previous_fun <- .cache$functions[[unqualified_name]] - rm( - list = unique(c(fun_name, unqualified_name)), - envir = registry, - inherits = FALSE - ) - - if (update_cache) { - fun_cache <- .cache$functions - fun_cache[[unqualified_name]] <- NULL - fun_cache[[fun_name]] <- NULL - .cache$functions <- fun_cache - } + .cache$functions[[unqualified_name]] <- NULL + .cache$functions[[fun_name]] <- NULL invisible(previous_fun) } -#' @rdname register_binding -#' @keywords internal -register_binding_agg <- function(fun_name, - agg_fun, - registry = agg_funcs, - notes = character(0)) { - register_binding(fun_name, agg_fun, registry = registry, notes = notes) -} - # Supports functions and tests that call previously-defined bindings call_binding <- function(fun_name, ...) { - nse_funcs[[fun_name]](...) -} - -call_binding_agg <- function(fun_name, ...) { - agg_funcs[[fun_name]](...) + .cache$functions[[fun_name]](...) } create_binding_cache <- function() { @@ -147,7 +96,7 @@ create_binding_cache <- function() { # Register all available Arrow Compute functions, namespaced as arrow_fun. all_arrow_funs <- list_compute_functions() - arrow_funcs <- set_names( + .cache$functions <- set_names( lapply(all_arrow_funs, function(fun) { force(fun) function(...) Expression$create(fun, ...) 
@@ -155,7 +104,7 @@ create_binding_cache <- function() { paste0("arrow_", all_arrow_funs) ) - # Register bindings into nse_funcs and agg_funcs + # Register bindings into the cache register_bindings_array_function_map() register_bindings_aggregate() register_bindings_conditional() @@ -165,37 +114,17 @@ create_binding_cache <- function() { register_bindings_type() register_bindings_augmented() - # We only create the cache for nse_funcs and not agg_funcs - .cache$functions <- c(as.list(nse_funcs), arrow_funcs) -} - -# environments in the arrow namespace used in the above functions -nse_funcs <- new.env(parent = emptyenv()) -agg_funcs <- new.env(parent = emptyenv()) -.cache <- new.env(parent = emptyenv()) - -# we register 2 versions of the "::" binding - one for use with nse_funcs -# and another one for use with agg_funcs (registered in dplyr-funcs-agg.R) -nse_funcs[["::"]] <- function(lhs, rhs) { - lhs_name <- as.character(substitute(lhs)) - rhs_name <- as.character(substitute(rhs)) + .cache$functions[["::"]] <- function(lhs, rhs) { + lhs_name <- as.character(substitute(lhs)) + rhs_name <- as.character(substitute(rhs)) - fun_name <- paste0(lhs_name, "::", rhs_name) + fun_name <- paste0(lhs_name, "::", rhs_name) - # if we do not have a binding for pkg::fun, then fall back on to the - # regular pkg::fun function - nse_funcs[[fun_name]] %||% asNamespace(lhs_name)[[rhs_name]] + # if we do not have a binding for pkg::fun, then fall back on to the + # regular pkg::fun function + .cache$functions[[fun_name]] %||% asNamespace(lhs_name)[[rhs_name]] + } } -agg_funcs[["::"]] <- function(lhs, rhs) { - lhs_name <- as.character(substitute(lhs)) - rhs_name <- as.character(substitute(rhs)) - - fun_name <- paste0(lhs_name, "::", rhs_name) - - # if we do not have a binding for pkg::fun, then fall back on to the - # nse_funcs (useful when we have a regular function inside an aggregating one) - # and then, if searching nse_funcs fails too, fall back to the - # regular `pkg::fun()` function - agg_funcs[[fun_name]] %||% nse_funcs[[fun_name]] %||% asNamespace(lhs_name)[[rhs_name]] -} +# environment in the arrow namespace used in the above functions +.cache <- new.env(parent = emptyenv()) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 72882b6afd964..f0a8c005676df 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -48,7 +48,7 @@ mutate.arrow_dplyr_query <- function(.data, # Create a mask with aggregation functions in it # If there are any aggregations, we will need to compute them and # and join the results back in, for "window functions" like x - mean(x) - mask <- arrow_mask(out, aggregation = TRUE) + mask <- arrow_mask(out) # Evaluate the mutate expressions results <- list() for (i in seq_along(exprs)) { diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 56de14db6dd44..58ca849152a75 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -84,7 +84,7 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { # and the aggregation functions will pull out those terms and insert into # that list. # nolint end - mask <- arrow_mask(.data, aggregation = TRUE) + mask <- arrow_mask(.data) # We'll collect any transformations after the aggregation here. 
# summarize_eval() returns NULL when the outer expression is an aggregation, diff --git a/r/R/udf.R b/r/R/udf.R index 922095cceba6a..0415fbac3c9fc 100644 --- a/r/R/udf.R +++ b/r/R/udf.R @@ -95,12 +95,7 @@ register_scalar_function <- function(name, fun, in_type, out_type, body(binding_fun) <- expr_substitute(body(binding_fun), sym("name"), name) environment(binding_fun) <- asNamespace("arrow") - register_binding( - name, - binding_fun, - update_cache = TRUE - ) - + register_binding(name, binding_fun) invisible(NULL) } diff --git a/r/man/register_binding.Rd b/r/man/register_binding.Rd index d10cd733bbe9d..b84cde3b8993a 100644 --- a/r/man/register_binding.Rd +++ b/r/man/register_binding.Rd @@ -2,63 +2,28 @@ % Please edit documentation in R/dplyr-funcs.R \name{register_binding} \alias{register_binding} -\alias{register_binding_agg} \title{Register compute bindings} \usage{ -register_binding( - fun_name, - fun, - registry = nse_funcs, - update_cache = FALSE, - notes = character(0) -) - -register_binding_agg( - fun_name, - agg_fun, - registry = agg_funcs, - notes = character(0) -) +register_binding(fun_name, fun, notes = character(0)) } \arguments{ \item{fun_name}{A string containing a function name in the form \code{"function"} or -\code{"package::function"}. The package name is currently not used but -may be used in the future to allow these types of function calls.} +\code{"package::function"}.} -\item{fun}{A function or \code{NULL} to un-register a previous function. +\item{fun}{A function, or \code{NULL} to un-register a previous function. This function must accept \code{Expression} objects as arguments and return \code{Expression} objects instead of regular R objects.} -\item{registry}{An environment in which the functions should be -assigned.} - -\item{update_cache}{Update .cache$functions at the time of registration. -the default is FALSE because the majority of usage is to register -bindings at package load, after which we create the cache once. The -reason why .cache$functions is needed in addition to nse_funcs for -non-aggregate functions could be revisited...it is currently used -as the data mask in mutate, filter, and aggregate (but not -summarise) because the data mask has to be a list.} - \item{notes}{string for the docs: note any limitations or differences in behavior between the Arrow version and the R function.} - -\item{agg_fun}{An aggregate function or \code{NULL} to un-register a previous -aggregate function. This function must accept \code{Expression} objects as -arguments and return a \code{list()} with components: -\itemize{ -\item \code{fun}: string function name -\item \code{data}: list of 0 or more \code{Expression}s -\item \code{options}: list of function options, as passed to call_function -}} } \value{ The previously registered binding or \code{NULL} if no previously registered function existed. } \description{ -The \code{register_binding()} and \code{register_binding_agg()} functions -are used to populate a list of functions that operate on (and return) +\code{register_binding()} is used to populate a list of functions that operate on +(and return) Expressions. These are the basis for the \code{.data} mask inside dplyr methods. 
} \section{Writing bindings}{ diff --git a/r/tests/testthat/test-dataset-dplyr.R b/r/tests/testthat/test-dataset-dplyr.R index 1e36ea8bd4966..493eac328e5cd 100644 --- a/r/tests/testthat/test-dataset-dplyr.R +++ b/r/tests/testthat/test-dataset-dplyr.R @@ -325,7 +325,7 @@ test_that("dplyr method not implemented messages", { # This one is more nuanced expect_error( ds %>% filter(int > 6, dbl > max(dbl)), - "Filter expression not supported for Arrow Datasets: dbl > max(dbl)\nCall collect() first to pull data into R.", + "Expression dbl > max(dbl) not supported in filter() in Arrow\nCall collect() first to pull data into R.", fixed = TRUE ) }) diff --git a/r/tests/testthat/test-dplyr-filter.R b/r/tests/testthat/test-dplyr-filter.R index bf23685362a82..535bcb70c4cab 100644 --- a/r/tests/testthat/test-dplyr-filter.R +++ b/r/tests/testthat/test-dplyr-filter.R @@ -324,13 +324,14 @@ test_that("Filtering with unsupported functions", { filter( nchar(chr, type = "bytes", allowNA = TRUE) == 1, # bad, Arrow msg int > 2, # good - pnorm(dbl) > .99 # bad, opaque + pnorm(dbl) > .99 # bad, opaque, but we'll error on the first one before we get here ) %>% collect(), tbl, - warning = '\\* In nchar\\(chr, type = "bytes", allowNA = TRUE\\) == 1, allowNA = TRUE not supported in Arrow -\\* Expression pnorm\\(dbl\\) > 0.99 not supported in Arrow -pulling data into R' + warning = paste( + 'In nchar\\(chr, type = "bytes", allowNA = TRUE\\) == 1,', + "allowNA = TRUE not supported in Arrow; pulling data into R" + ) ) }) diff --git a/r/tests/testthat/test-dplyr-funcs.R b/r/tests/testthat/test-dplyr-funcs.R index 039604a85ee0c..48c5d730f8493 100644 --- a/r/tests/testthat/test-dplyr-funcs.R +++ b/r/tests/testthat/test-dplyr-funcs.R @@ -19,35 +19,25 @@ skip_on_cran() test_that("register_binding()/unregister_binding() works", { - fake_registry <- new.env(parent = emptyenv()) fun1 <- function() NULL fun2 <- function() "Hello" - expect_null(register_binding("some.pkg::some_fun", fun1, fake_registry)) - expect_identical(fake_registry$some_fun, fun1) - expect_identical(fake_registry$`some.pkg::some_fun`, fun1) + expect_null(register_binding("some.pkg::some_fun", fun1)) + expect_identical(.cache$functions$some_fun, fun1) + expect_identical(.cache$functions$`some.pkg::some_fun`, fun1) - expect_identical(unregister_binding("some.pkg::some_fun", fake_registry), fun1) - expect_false("some.pkg::some_fun" %in% names(fake_registry)) - expect_false("some_fun" %in% names(fake_registry)) + expect_identical(unregister_binding("some.pkg::some_fun"), fun1) + expect_false("some.pkg::some_fun" %in% names(.cache$functions)) + expect_false("some_fun" %in% names(.cache$functions)) - expect_null(register_binding("somePkg::some_fun", fun1, fake_registry)) - expect_identical(fake_registry$some_fun, fun1) + expect_null(register_binding("somePkg::some_fun", fun1)) + expect_identical(.cache$functions$some_fun, fun1) expect_warning( - register_binding("some.pkg2::some_fun", fun2, fake_registry), + register_binding("some.pkg2::some_fun", fun2), "A \"some_fun\" binding already exists in the registry and will be overwritten." 
) # No warning when an identical function is re-registered - expect_silent(register_binding("some.pkg2::some_fun", fun2, fake_registry)) -}) - -test_that("register_binding_agg() works", { - fake_registry <- new.env(parent = emptyenv()) - fun1 <- function() NULL - - expect_null(register_binding_agg("somePkg::some_fun", fun1, fake_registry)) - expect_identical(fake_registry$some_fun, fun1) - expect_identical(fake_registry$`somePkg::some_fun`, fun1) + expect_silent(register_binding("some.pkg2::some_fun", fun2)) }) diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index 87bb5e5fac959..a61ef95bee73d 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -337,20 +337,20 @@ test_that("Functions that take ... but we only accept a single arg", { ) # Now that we've demonstrated that the whole machinery works, let's test - # the agg_funcs directly - expect_error(call_binding_agg("n_distinct"), "n_distinct() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("sum"), "sum() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("prod"), "prod() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("any"), "any() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("all"), "all() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("min"), "min() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("max"), "max() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("n_distinct", 1, 2), "Multiple arguments to n_distinct()") - expect_error(call_binding_agg("sum", 1, 2), "Multiple arguments to sum") - expect_error(call_binding_agg("any", 1, 2), "Multiple arguments to any()") - expect_error(call_binding_agg("all", 1, 2), "Multiple arguments to all()") - expect_error(call_binding_agg("min", 1, 2), "Multiple arguments to min()") - expect_error(call_binding_agg("max", 1, 2), "Multiple arguments to max()") + # the agg funcs directly + expect_error(call_binding("n_distinct"), "n_distinct() with 0 arguments", fixed = TRUE) + expect_error(call_binding("sum"), "sum() with 0 arguments", fixed = TRUE) + expect_error(call_binding("prod"), "prod() with 0 arguments", fixed = TRUE) + expect_error(call_binding("any"), "any() with 0 arguments", fixed = TRUE) + expect_error(call_binding("all"), "all() with 0 arguments", fixed = TRUE) + expect_error(call_binding("min"), "min() with 0 arguments", fixed = TRUE) + expect_error(call_binding("max"), "max() with 0 arguments", fixed = TRUE) + expect_error(call_binding("n_distinct", 1, 2), "Multiple arguments to n_distinct()") + expect_error(call_binding("sum", 1, 2), "Multiple arguments to sum") + expect_error(call_binding("any", 1, 2), "Multiple arguments to any()") + expect_error(call_binding("all", 1, 2), "Multiple arguments to all()") + expect_error(call_binding("min", 1, 2), "Multiple arguments to min()") + expect_error(call_binding("max", 1, 2), "Multiple arguments to max()") }) test_that("median()", { diff --git a/r/tests/testthat/test-udf.R b/r/tests/testthat/test-udf.R index 0eb75b1dde6e5..8604dc610a435 100644 --- a/r/tests/testthat/test-udf.R +++ b/r/tests/testthat/test-udf.R @@ -90,7 +90,7 @@ test_that("register_scalar_function() adds a compute function to the registry", int32(), float64(), auto_convert = TRUE ) - on.exit(unregister_binding("times_32", update_cache = TRUE)) + on.exit(unregister_binding("times_32")) expect_true("times_32" %in% names(asNamespace("arrow")$.cache$functions)) 
expect_true("times_32" %in% list_compute_functions()) @@ -124,7 +124,7 @@ test_that("arrow_scalar_function() with bad return type errors", { int32(), float64() ) - on.exit(unregister_binding("times_32_bad_return_type_array", update_cache = TRUE)) + on.exit(unregister_binding("times_32_bad_return_type_array")) expect_error( call_function("times_32_bad_return_type_array", Array$create(1L)), @@ -137,7 +137,7 @@ test_that("arrow_scalar_function() with bad return type errors", { int32(), float64() ) - on.exit(unregister_binding("times_32_bad_return_type_scalar", update_cache = TRUE)) + on.exit(unregister_binding("times_32_bad_return_type_scalar")) expect_error( call_function("times_32_bad_return_type_scalar", Array$create(1L)), @@ -155,7 +155,7 @@ test_that("register_scalar_function() can register multiple kernels", { out_type = function(in_types) in_types[[1]], auto_convert = TRUE ) - on.exit(unregister_binding("times_32", update_cache = TRUE)) + on.exit(unregister_binding("times_32")) expect_equal( call_function("times_32", Scalar$create(1L, int32())), @@ -238,7 +238,7 @@ test_that("user-defined functions work during multi-threaded execution", { float64(), auto_convert = TRUE ) - on.exit(unregister_binding("times_32", update_cache = TRUE)) + on.exit(unregister_binding("times_32")) # check a regular collect() result <- open_dataset(tf_dataset) %>% @@ -271,7 +271,7 @@ test_that("nested exec plans can contain user-defined functions", { float64(), auto_convert = TRUE ) - on.exit(unregister_binding("times_32", update_cache = TRUE)) + on.exit(unregister_binding("times_32")) stream_plan_with_udf <- function() { record_batch(a = 1:1000) %>% @@ -310,7 +310,7 @@ test_that("head() on exec plan containing user-defined functions", { float64(), auto_convert = TRUE ) - on.exit(unregister_binding("times_32", update_cache = TRUE)) + on.exit(unregister_binding("times_32")) result <- record_batch(a = 1:1000) %>% dplyr::mutate(b = times_32(a)) %>% diff --git a/r/vignettes/developers/writing_bindings.Rmd b/r/vignettes/developers/writing_bindings.Rmd index 443211b3c2b5e..e1ed92105dbc3 100644 --- a/r/vignettes/developers/writing_bindings.Rmd +++ b/r/vignettes/developers/writing_bindings.Rmd @@ -145,11 +145,10 @@ test_that("startsWith behaves identically in dplyr and Arrow", { df <- tibble(x = c("Foo", "bar", "baz", "qux")) compare_dplyr_binding( .input %>% - filter(startsWith(x, "b")) %>% - collect(), + filter(startsWith(x, "b")) %>% + collect(), df ) - }) ``` @@ -197,7 +196,7 @@ As `startsWith()` requires options, direct mapping is not appropriate. If the function cannot be mapped directly, some extra work may be needed to ensure that calling the arrow version of the function results in the same result as calling the R version of the function. In this case, the function will need -adding to the `nse_funcs` function registry. Here is how this might look for +adding to the `.cache$functions` function registry. Here is how this might look for `startsWith()`: ```{r, eval = FALSE} From 9cf0ee722ba048f3f305b38e536fa726eff9c813 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 7 May 2024 23:56:02 +0800 Subject: [PATCH 085/106] GH-41562: [C++][Parquet] Decoding: Fix num_value handling in ByteStreamSplitDecoder (#41565) ### Rationale for this change This problem is raised from https://github.com/apache/arrow/pull/40094 . Original bug fixed here: https://github.com/apache/arrow/pull/34140 , but this is corrupt in https://github.com/apache/arrow/pull/40094 . ### What changes are included in this PR? 
Refined the checks that `SetData()` performs on `num_values` against the data size.

### Are these changes tested?

* [x] Will add

### Are there any user-facing changes?

Bugfix

* GitHub Issue: #41562

Authored-by: mwish
Signed-off-by: Antoine Pitrou
---
 cpp/src/parquet/encoding.cc      | 22 +++++++++++++++++-----
 cpp/src/parquet/encoding.h       |  5 +++++
 cpp/src/parquet/encoding_test.cc |  4 ++--
 3 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 3da5c64ace5dd..05221568c8fa0 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -3694,12 +3694,24 @@ class ByteStreamSplitDecoderBase : public DecoderImpl,
   ByteStreamSplitDecoderBase(const ColumnDescriptor* descr, int byte_width)
       : DecoderImpl(descr, Encoding::BYTE_STREAM_SPLIT), byte_width_(byte_width) {}
 
-  void SetData(int num_values, const uint8_t* data, int len) override {
-    if (static_cast<int64_t>(num_values) * byte_width_ != len) {
-      throw ParquetException("Data size (" + std::to_string(len) +
-                             ") does not match number of values in BYTE_STREAM_SPLIT (" +
-                             std::to_string(num_values) + ")");
+  void SetData(int num_values, const uint8_t* data, int len) final {
+    // Check that the data size is consistent with the number of values.
+    // The spec requires the data size to be a multiple of the byte width,
+    // see: https://github.com/apache/parquet-format/pull/192 .
+    // GH-41562: the passed-in `num_values` may include nulls, so we need to check and
+    // adjust the number of values.
+    if (static_cast<int64_t>(num_values) * byte_width_ < len) {
+      throw ParquetException(
+          "Data size (" + std::to_string(len) +
+          ") is too large for the number of values in BYTE_STREAM_SPLIT (" +
+          std::to_string(num_values) + ")");
+    }
+    if (len % byte_width_ != 0) {
+      throw ParquetException("ByteStreamSplit data size " + std::to_string(len) +
+                             " not aligned with type " + TypeToString(DType::type_num) +
+                             " and byte_width: " + std::to_string(byte_width_));
     }
+    num_values = len / byte_width_;
     DecoderImpl::SetData(num_values, data, len);
     stride_ = num_values_;
   }
diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h
index 602009189595e..493c4044ddc1c 100644
--- a/cpp/src/parquet/encoding.h
+++ b/cpp/src/parquet/encoding.h
@@ -255,6 +255,11 @@ class Decoder {
   // Sets the data for a new page. This will be called multiple times on the same
   // decoder and should reset all internal state.
+  //
+  // `num_values` comes from the data page header, and may be greater than the number of
+  // physical values in the data buffer if there are some omitted (null) values.
+  // `len`, on the other hand, is the size in bytes of the data buffer and
+  // directly relates to the number of physical values.
   virtual void SetData(int num_values, const uint8_t* data, int len) = 0;
 
   // Returns the number of values left (for the last call to SetData()). This is
   // the number of values left in this page.
diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc
index b91fcb0839cba..3c20b917f6994 100644
--- a/cpp/src/parquet/encoding_test.cc
+++ b/cpp/src/parquet/encoding_test.cc
@@ -1383,7 +1383,7 @@ class TestByteStreamSplitEncoding : public TestEncodingBase {
     encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset);
     encode_buffer_ = encoder->FlushValues();
     ASSERT_EQ(encode_buffer_->size(), physical_byte_width() * (num_values_ - null_count));
-    decoder->SetData(num_values_ - null_count, encode_buffer_->data(),
+    decoder->SetData(num_values_, encode_buffer_->data(),
                      static_cast<int>(encode_buffer_->size()));
     auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, null_count,
                                                 valid_bits, valid_bits_offset);
@@ -1717,7 +1717,7 @@ class TestDeltaBitPackEncoding : public TestEncodingBase {
     for (size_t i = 0; i < kNumRoundTrips; ++i) {
       encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset);
       encode_buffer_ = encoder->FlushValues();
-      decoder->SetData(num_values_ - null_count, encode_buffer_->data(),
+      decoder->SetData(num_values_, encode_buffer_->data(),
                        static_cast<int>(encode_buffer_->size()));
       auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, null_count,
                                                   valid_bits, valid_bits_offset);

From 51689a040cbe3dee8702cd899a33fa62e0616bf1 Mon Sep 17 00:00:00 2001
From: mwish
Date: Wed, 8 May 2024 00:14:22 +0800
Subject: [PATCH 086/106] GH-41545: [C++][Parquet] Fix
 DeltaLengthByteArrayEncoder::EstimatedDataEncodedSize (#41546)

### Rationale for this change

`DeltaLengthByteArrayEncoder::EstimatedDataEncodedSize` would return a wrong
estimate when `Put(const Array&)` was called.

### What changes are included in this PR?

Remove `encoded_size_` and use `sink_.length()` in its place.

### Are these changes tested?

Yes

### Are there any user-facing changes?
No * GitHub Issue: #41545 Authored-by: mwish Signed-off-by: Antoine Pitrou --- cpp/src/parquet/encoding.cc | 18 ++++++++++-------- cpp/src/parquet/encoding_test.cc | 9 +++++++++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 05221568c8fa0..004cb746b3a89 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -2740,13 +2740,12 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl, : EncoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY, pool = ::arrow::default_memory_pool()), sink_(pool), - length_encoder_(nullptr, pool), - encoded_size_{0} {} + length_encoder_(nullptr, pool) {} std::shared_ptr FlushValues() override; int64_t EstimatedDataEncodedSize() override { - return encoded_size_ + length_encoder_.EstimatedDataEncodedSize(); + return sink_.length() + length_encoder_.EstimatedDataEncodedSize(); } using TypedEncoder::Put; @@ -2768,6 +2767,11 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl, return Status::Invalid( "Parquet cannot store strings with size 2GB or more, got: ", view.size()); } + if (ARROW_PREDICT_FALSE( + view.size() + sink_.length() > + static_cast(std::numeric_limits::max()))) { + return Status::Invalid("excess expansion in DELTA_LENGTH_BYTE_ARRAY"); + } length_encoder_.Put({static_cast(view.length())}, 1); PARQUET_THROW_NOT_OK(sink_.Append(view.data(), view.length())); return Status::OK(); @@ -2777,7 +2781,6 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl, ::arrow::BufferBuilder sink_; DeltaBitPackEncoder length_encoder_; - uint32_t encoded_size_; }; template @@ -2803,15 +2806,15 @@ void DeltaLengthByteArrayEncoder::Put(const T* src, int num_values) { const int batch_size = std::min(kBatchSize, num_values - idx); for (int j = 0; j < batch_size; ++j) { const int32_t len = src[idx + j].len; - if (AddWithOverflow(total_increment_size, len, &total_increment_size)) { + if (ARROW_PREDICT_FALSE( + AddWithOverflow(total_increment_size, len, &total_increment_size))) { throw ParquetException("excess expansion in DELTA_LENGTH_BYTE_ARRAY"); } lengths[j] = len; } length_encoder_.Put(lengths.data(), batch_size); } - - if (AddWithOverflow(encoded_size_, total_increment_size, &encoded_size_)) { + if (sink_.length() + total_increment_size > std::numeric_limits::max()) { throw ParquetException("excess expansion in DELTA_LENGTH_BYTE_ARRAY"); } PARQUET_THROW_NOT_OK(sink_.Reserve(total_increment_size)); @@ -2850,7 +2853,6 @@ std::shared_ptr DeltaLengthByteArrayEncoder::FlushValues() { std::shared_ptr buffer; PARQUET_THROW_NOT_OK(sink_.Finish(&buffer, true)); - encoded_size_ = 0; return buffer; } diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 3c20b917f6994..78bf26587e3fb 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -577,6 +577,11 @@ TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) { auto decoder = MakeTypedDecoder(Encoding::PLAIN); ASSERT_NO_THROW(encoder->Put(*values)); + // For Plain encoding, the estimated size should be at least the total byte size + auto& string_array = dynamic_cast(*values); + EXPECT_GE(encoder->EstimatedDataEncodedSize(), string_array.total_values_length()) + << "Estimated size should be at least the total byte size"; + auto buf = encoder->FlushValues(); int num_values = static_cast(values->length() - values->null_count()); @@ -2160,6 +2165,10 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, ArrowBinaryDirectPut) { auto CheckSeed = [&](std::shared_ptr<::arrow::Array> 
values) {
      ASSERT_NO_THROW(encoder->Put(*values));
+     auto* binary_array = checked_cast<const ::arrow::BinaryArray*>(values.get());
+     // For DeltaLength encoding, the estimated size should be at least the total byte size
+     EXPECT_GE(encoder->EstimatedDataEncodedSize(), binary_array->total_values_length())
+         << "Estimated size should be at least the total byte size";
      auto buf = encoder->FlushValues();
 
      int num_values = static_cast<int>(values->length() - values->null_count());

From 53859262ea988f31ce33a469305251064b5a53b8 Mon Sep 17 00:00:00 2001
From: Gang Wu
Date: Wed, 8 May 2024 09:52:57 +0800
Subject: [PATCH 087/106] GH-41431: [C++][Parquet][Dataset] Fix repeated scan
 on encrypted dataset (#41550)

### Rationale for this change

When a parquet dataset is reused to create multiple scanners, `FileMetaData`
objects are cached to avoid parsing them again. However, this caused issues
with encrypted files since the internal file decryptors were no longer created
by the cached `FileMetaData` objects.

### What changes are included in this PR?

Expose file_decryptor from FileMetaData and set it properly.

### Are these changes tested?

Yes, the test was modified to reproduce the issue and to verify the fix.

### Are there any user-facing changes?

No.

* GitHub Issue: #41431

Authored-by: Gang Wu
Signed-off-by: Gang Wu
---
 .../dataset/file_parquet_encryption_test.cc | 25 +++---
 cpp/src/parquet/file_reader.cc              | 83 ++++++++++---------
 cpp/src/parquet/metadata.cc                 |  8 ++
 cpp/src/parquet/metadata.h                  |  2 +
 4 files changed, 70 insertions(+), 48 deletions(-)

diff --git a/cpp/src/arrow/dataset/file_parquet_encryption_test.cc b/cpp/src/arrow/dataset/file_parquet_encryption_test.cc
index 307017fd67e06..0287d593d12d3 100644
--- a/cpp/src/arrow/dataset/file_parquet_encryption_test.cc
+++ b/cpp/src/arrow/dataset/file_parquet_encryption_test.cc
@@ -148,17 +148,22 @@ class DatasetEncryptionTestBase : public ::testing::Test {
                          FileSystemDatasetFactory::Make(file_system_, selector, file_format,
                                                         factory_options));
 
-    // Read dataset into table
+    // Create the dataset
     ASSERT_OK_AND_ASSIGN(auto dataset, dataset_factory->Finish());
-    ASSERT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan());
-    ASSERT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish());
-    ASSERT_OK_AND_ASSIGN(auto read_table, scanner->ToTable());
-
-    // Verify the data was read correctly
-    ASSERT_OK_AND_ASSIGN(auto combined_table, read_table->CombineChunks());
-    // Validate the table
-    ASSERT_OK(combined_table->ValidateFull());
-    AssertTablesEqual(*combined_table, *table_);
+
+    // Reuse the dataset above to scan it twice to make sure decryption works correctly.
+ for (size_t i = 0; i < 2; ++i) { + // Read dataset into table + ASSERT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan()); + ASSERT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish()); + ASSERT_OK_AND_ASSIGN(auto read_table, scanner->ToTable()); + + // Verify the data was read correctly + ASSERT_OK_AND_ASSIGN(auto combined_table, read_table->CombineChunks()); + // Validate the table + ASSERT_OK(combined_table->ValidateFull()); + AssertTablesEqual(*combined_table, *table_); + } } protected: diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index b3dd1d6054ac8..8fcb0870ce4b6 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -215,16 +215,14 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source, int64_t source_size, FileMetaData* file_metadata, int row_group_number, ReaderProperties props, - std::shared_ptr prebuffered_column_chunks_bitmap, - std::shared_ptr file_decryptor = nullptr) + std::shared_ptr prebuffered_column_chunks_bitmap) : source_(std::move(source)), cached_source_(std::move(cached_source)), source_size_(source_size), file_metadata_(file_metadata), properties_(std::move(props)), row_group_ordinal_(row_group_number), - prebuffered_column_chunks_bitmap_(std::move(prebuffered_column_chunks_bitmap)), - file_decryptor_(std::move(file_decryptor)) { + prebuffered_column_chunks_bitmap_(std::move(prebuffered_column_chunks_bitmap)) { row_group_metadata_ = file_metadata->RowGroup(row_group_number); } @@ -263,10 +261,10 @@ class SerializedRowGroup : public RowGroupReader::Contents { } // The column is encrypted - std::shared_ptr meta_decryptor = - GetColumnMetaDecryptor(crypto_metadata.get(), file_decryptor_.get()); - std::shared_ptr data_decryptor = - GetColumnDataDecryptor(crypto_metadata.get(), file_decryptor_.get()); + std::shared_ptr meta_decryptor = GetColumnMetaDecryptor( + crypto_metadata.get(), file_metadata_->file_decryptor().get()); + std::shared_ptr data_decryptor = GetColumnDataDecryptor( + crypto_metadata.get(), file_metadata_->file_decryptor().get()); ARROW_DCHECK_NE(meta_decryptor, nullptr); ARROW_DCHECK_NE(data_decryptor, nullptr); @@ -291,7 +289,6 @@ class SerializedRowGroup : public RowGroupReader::Contents { ReaderProperties properties_; int row_group_ordinal_; const std::shared_ptr prebuffered_column_chunks_bitmap_; - std::shared_ptr file_decryptor_; }; // ---------------------------------------------------------------------- @@ -316,7 +313,9 @@ class SerializedFile : public ParquetFileReader::Contents { } void Close() override { - if (file_decryptor_) file_decryptor_->WipeOutDecryptionKeys(); + if (file_metadata_ && file_metadata_->file_decryptor()) { + file_metadata_->file_decryptor()->WipeOutDecryptionKeys(); + } } std::shared_ptr GetRowGroup(int i) override { @@ -330,7 +329,7 @@ class SerializedFile : public ParquetFileReader::Contents { std::unique_ptr contents = std::make_unique( source_, cached_source_, source_size_, file_metadata_.get(), i, properties_, - std::move(prebuffered_column_chunks_bitmap), file_decryptor_); + std::move(prebuffered_column_chunks_bitmap)); return std::make_shared(std::move(contents)); } @@ -346,8 +345,9 @@ class SerializedFile : public ParquetFileReader::Contents { "forget to call ParquetFileReader::Open() first?"); } if (!page_index_reader_) { - page_index_reader_ = PageIndexReader::Make(source_.get(), file_metadata_, - properties_, file_decryptor_.get()); + page_index_reader_ = + 
PageIndexReader::Make(source_.get(), file_metadata_, properties_, + file_metadata_->file_decryptor().get()); } return page_index_reader_; } @@ -362,8 +362,8 @@ class SerializedFile : public ParquetFileReader::Contents { "forget to call ParquetFileReader::Open() first?"); } if (!bloom_filter_reader_) { - bloom_filter_reader_ = - BloomFilterReader::Make(source_, file_metadata_, properties_, file_decryptor_); + bloom_filter_reader_ = BloomFilterReader::Make(source_, file_metadata_, properties_, + file_metadata_->file_decryptor()); if (bloom_filter_reader_ == nullptr) { throw ParquetException("Cannot create BloomFilterReader"); } @@ -441,10 +441,12 @@ class SerializedFile : public ParquetFileReader::Contents { // Parse the footer depending on encryption type const bool is_encrypted_footer = memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == 0; + std::shared_ptr file_decryptor; if (is_encrypted_footer) { // Encrypted file with Encrypted footer. const std::pair read_size = - ParseMetaDataOfEncryptedFileWithEncryptedFooter(metadata_buffer, metadata_len); + ParseMetaDataOfEncryptedFileWithEncryptedFooter(metadata_buffer, metadata_len, + &file_decryptor); // Read the actual footer metadata_start = read_size.first; metadata_len = read_size.second; @@ -453,8 +455,8 @@ class SerializedFile : public ParquetFileReader::Contents { // Fall through } - const uint32_t read_metadata_len = - ParseUnencryptedFileMetadata(metadata_buffer, metadata_len); + const uint32_t read_metadata_len = ParseUnencryptedFileMetadata( + metadata_buffer, metadata_len, std::move(file_decryptor)); auto file_decryption_properties = properties_.file_decryption_properties().get(); if (is_encrypted_footer) { // Nothing else to do here. @@ -550,34 +552,37 @@ class SerializedFile : public ParquetFileReader::Contents { // Parse the footer depending on encryption type const bool is_encrypted_footer = memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == 0; + std::shared_ptr file_decryptor; if (is_encrypted_footer) { // Encrypted file with Encrypted footer. 
std::pair read_size; BEGIN_PARQUET_CATCH_EXCEPTIONS - read_size = - ParseMetaDataOfEncryptedFileWithEncryptedFooter(metadata_buffer, metadata_len); + read_size = ParseMetaDataOfEncryptedFileWithEncryptedFooter( + metadata_buffer, metadata_len, &file_decryptor); END_PARQUET_CATCH_EXCEPTIONS // Read the actual footer int64_t metadata_start = read_size.first; metadata_len = read_size.second; return source_->ReadAsync(metadata_start, metadata_len) - .Then([this, metadata_len, is_encrypted_footer]( + .Then([this, metadata_len, is_encrypted_footer, file_decryptor]( const std::shared_ptr<::arrow::Buffer>& metadata_buffer) { // Continue and read the file footer - return ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer); + return ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer, + file_decryptor); }); } return ParseMetaDataFinal(std::move(metadata_buffer), metadata_len, - is_encrypted_footer); + is_encrypted_footer, std::move(file_decryptor)); } // Continuation - ::arrow::Status ParseMetaDataFinal(std::shared_ptr<::arrow::Buffer> metadata_buffer, - uint32_t metadata_len, - const bool is_encrypted_footer) { + ::arrow::Status ParseMetaDataFinal( + std::shared_ptr<::arrow::Buffer> metadata_buffer, uint32_t metadata_len, + const bool is_encrypted_footer, + std::shared_ptr file_decryptor) { BEGIN_PARQUET_CATCH_EXCEPTIONS - const uint32_t read_metadata_len = - ParseUnencryptedFileMetadata(metadata_buffer, metadata_len); + const uint32_t read_metadata_len = ParseUnencryptedFileMetadata( + metadata_buffer, metadata_len, std::move(file_decryptor)); auto file_decryption_properties = properties_.file_decryption_properties().get(); if (is_encrypted_footer) { // Nothing else to do here. @@ -608,11 +613,11 @@ class SerializedFile : public ParquetFileReader::Contents { // Maps row group ordinal and prebuffer status of its column chunks in the form of a // bitmap buffer. 
std::unordered_map> prebuffered_column_chunks_; - std::shared_ptr file_decryptor_; // \return The true length of the metadata in bytes - uint32_t ParseUnencryptedFileMetadata(const std::shared_ptr& footer_buffer, - const uint32_t metadata_len); + uint32_t ParseUnencryptedFileMetadata( + const std::shared_ptr& footer_buffer, const uint32_t metadata_len, + std::shared_ptr file_decryptor); std::string HandleAadPrefix(FileDecryptionProperties* file_decryption_properties, EncryptionAlgorithm& algo); @@ -624,11 +629,13 @@ class SerializedFile : public ParquetFileReader::Contents { // \return The position and size of the actual footer std::pair ParseMetaDataOfEncryptedFileWithEncryptedFooter( - const std::shared_ptr& crypto_metadata_buffer, uint32_t footer_len); + const std::shared_ptr& crypto_metadata_buffer, uint32_t footer_len, + std::shared_ptr* file_decryptor); }; uint32_t SerializedFile::ParseUnencryptedFileMetadata( - const std::shared_ptr& metadata_buffer, const uint32_t metadata_len) { + const std::shared_ptr& metadata_buffer, const uint32_t metadata_len, + std::shared_ptr file_decryptor) { if (metadata_buffer->size() != metadata_len) { throw ParquetException("Failed reading metadata buffer (requested " + std::to_string(metadata_len) + " bytes but got " + @@ -637,7 +644,7 @@ uint32_t SerializedFile::ParseUnencryptedFileMetadata( uint32_t read_metadata_len = metadata_len; // The encrypted read path falls through to here, so pass in the decryptor file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len, - properties_, file_decryptor_); + properties_, std::move(file_decryptor)); return read_metadata_len; } @@ -645,7 +652,7 @@ std::pair SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter( const std::shared_ptr<::arrow::Buffer>& crypto_metadata_buffer, // both metadata & crypto metadata length - const uint32_t footer_len) { + const uint32_t footer_len, std::shared_ptr* file_decryptor) { // encryption with encrypted footer // Check if the footer_buffer contains the entire metadata if (crypto_metadata_buffer->size() != footer_len) { @@ -664,7 +671,7 @@ SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter( // Handle AAD prefix EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm(); std::string file_aad = HandleAadPrefix(file_decryption_properties, algo); - file_decryptor_ = std::make_shared( + *file_decryptor = std::make_shared( file_decryption_properties, file_aad, algo.algorithm, file_crypto_metadata->key_metadata(), properties_.memory_pool()); @@ -683,12 +690,12 @@ void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter( EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); // Handle AAD prefix std::string file_aad = HandleAadPrefix(file_decryption_properties, algo); - file_decryptor_ = std::make_shared( + auto file_decryptor = std::make_shared( file_decryption_properties, file_aad, algo.algorithm, file_metadata_->footer_signing_key_metadata(), properties_.memory_pool()); // set the InternalFileDecryptor in the metadata as well, as it's used // for signature verification and for ColumnChunkMetaData creation. 
-  file_metadata_->set_file_decryptor(file_decryptor_);
+  file_metadata_->set_file_decryptor(std::move(file_decryptor));
 
   if (file_decryption_properties->check_plaintext_footer_integrity()) {
     if (metadata_len - read_metadata_len !=
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index 3f101b5ae3ac6..b24883cdc160b 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -826,6 +826,10 @@ class FileMetaData::FileMetaDataImpl {
     file_decryptor_ = std::move(file_decryptor);
   }
 
+  const std::shared_ptr<InternalFileDecryptor>& file_decryptor() const {
+    return file_decryptor_;
+  }
+
  private:
   friend FileMetaDataBuilder;
   uint32_t metadata_len_ = 0;
@@ -947,6 +951,10 @@ void FileMetaData::set_file_decryptor(
   impl_->set_file_decryptor(std::move(file_decryptor));
 }
 
+const std::shared_ptr<InternalFileDecryptor>& FileMetaData::file_decryptor() const {
+  return impl_->file_decryptor();
+}
+
 ParquetVersion::type FileMetaData::version() const {
   switch (impl_->version()) {
     case 1:
diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h
index 640b898024346..9fc30df58e0d3 100644
--- a/cpp/src/parquet/metadata.h
+++ b/cpp/src/parquet/metadata.h
@@ -399,12 +399,14 @@ class PARQUET_EXPORT FileMetaData {
  private:
   friend FileMetaDataBuilder;
   friend class SerializedFile;
+  friend class SerializedRowGroup;
 
   explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len,
                         const ReaderProperties& properties,
                         std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
 
   void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor);
+  const std::shared_ptr<InternalFileDecryptor>& file_decryptor() const;
 
   // PIMPL Idiom
   FileMetaData();

From d83af8f749ee560c0b04d986ba2912e696e1cd68 Mon Sep 17 00:00:00 2001
From: Alenka Frim
Date: Wed, 8 May 2024 12:57:10 +0200
Subject: [PATCH 088/106] GH-38770: [C++][Python] RecordBatch.filter()
 segfaults if passed a ChunkedArray (#40971)

### Rationale for this change

Filtering a record batch with a boolean mask in the form of a `ChunkedArray`
results in a segmentation fault.

### What changes are included in this PR?

When a chunked array is passed as a mask to filter a record batch, the code
path for `pa.Table.filter()` is taken, resulting in a filtered table.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

No.
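For illustration, a minimal sketch of the fixed behavior (the values mirror the
new regression test in `test_compute.py` below):

```python
import pyarrow as pa

batch = pa.record_batch([pa.array(["a", "b", "c", "d", "e"])], names=["a'"])
# A boolean mask that arrives as a ChunkedArray rather than a flat Array,
# e.g. the result of comparing columns of a chunked table.
mask = pa.chunked_array([[True, False], [None], [False, True]])

# This used to crash; the chunks are now concatenated internally before the
# mask is converted into take indices. Nulls are dropped by default.
result = batch.filter(mask)
assert result.equals(pa.record_batch([pa.array(["a", "e"])], names=["a'"]))
```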
* GitHub Issue: #38770

Authored-by: AlenkaF
Signed-off-by: AlenkaF
---
 .../vector_selection_filter_internal.cc | 26 ++++++++++++++-----
 python/pyarrow/tests/test_compute.py    |  5 ++++
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc
index d5e5e5ad289ac..8d43c65668d4b 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc
@@ -22,6 +22,7 @@
 #include
 #include
 
+#include "arrow/array/concatenate.h"
 #include "arrow/array/data.h"
 #include "arrow/buffer_builder.h"
 #include "arrow/chunked_array.h"
@@ -928,12 +929,26 @@ Result<std::shared_ptr<RecordBatch>> FilterRecordBatch(const RecordBatch& batch,
     return Status::Invalid("Filter inputs must all be the same length");
   }
 
-  // Convert filter to selection vector/indices and use Take
+  // Fetch filter
   const auto& filter_opts = *static_cast<const FilterOptions*>(options);
-  ARROW_ASSIGN_OR_RAISE(
-      std::shared_ptr<ArrayData> indices,
-      GetTakeIndices(*filter.array(), filter_opts.null_selection_behavior,
-                     ctx->memory_pool()));
+  ArrayData filter_array;
+  switch (filter.kind()) {
+    case Datum::ARRAY:
+      filter_array = *filter.array();
+      break;
+    case Datum::CHUNKED_ARRAY: {
+      ARROW_ASSIGN_OR_RAISE(auto combined, Concatenate(filter.chunked_array()->chunks()));
+      filter_array = *combined->data();
+      break;
+    }
+    default:
+      return Status::TypeError("Filter should be array-like");
+  }
+
+  // Convert filter to selection vector/indices and use Take
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayData> indices,
+                        GetTakeIndices(filter_array, filter_opts.null_selection_behavior,
+                                       ctx->memory_pool()));
   std::vector<std::shared_ptr<ArrayData>> columns(batch.num_columns());
   for (int i = 0; i < batch.num_columns(); ++i) {
     ARROW_ASSIGN_OR_RAISE(Datum out, Take(batch.column(i)->data(), Datum(indices),
@@ -1042,7 +1057,6 @@ class FilterMetaFunction : public MetaFunction {
     }
 
     if (args[0].kind() == Datum::RECORD_BATCH) {
-      auto values_batch = args[0].record_batch();
       ARROW_ASSIGN_OR_RAISE(
           std::shared_ptr<RecordBatch> out_batch,
           FilterRecordBatch(*args[0].record_batch(), args[1], options, ctx));
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 17cc546f834ca..d7dee1ad05e93 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -1345,6 +1345,11 @@ def test_filter_record_batch():
     expected = pa.record_batch([pa.array(["a", "e"])], names=["a'"])
     assert result.equals(expected)
 
+    # GH-38770: mask is chunked array
+    chunked_mask = pa.chunked_array([[True, False], [None], [False, True]])
+    result = batch.filter(chunked_mask)
+    assert result.equals(expected)
+
     result = batch.filter(mask, null_selection_behavior="emit_null")
     expected = pa.record_batch([pa.array(["a", None, "e"])], names=["a'"])
     assert result.equals(expected)

From e21952f969cd9d0906a86898f561088606447359 Mon Sep 17 00:00:00 2001
From: Alenka Frim
Date: Wed, 8 May 2024 13:47:21 +0200
Subject: [PATCH 089/106] GH-40750: [C++][Python] Map child Array constructed
 from keys and items shouldn't have offset (#40871)

### Rationale for this change

When a `MapArray` is constructed from `keys` and `items` arrays, the offset of
the list offsets is passed down to the struct child array, which is not correct.

### What changes are included in this PR?

This PR fixes this issue.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Shouldn't be.
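A short sketch of the behavior being fixed (the offsets, keys, and items below
follow the new C++ test and are illustrative only):

```python
import pyarrow as pa

keys = pa.array(["a", "b", "c", "d", "e", "f"], pa.string())
items = pa.array([1, 2, 3, 4, 5, 6], pa.int32())
offsets = pa.array([0, 2, 2, 6], pa.int32())

# Constructing a MapArray from a sliced offsets array used to propagate the
# slice offset into the struct child; both constructions must now compare equal.
from_sliced = pa.MapArray.from_arrays(offsets.slice(1), keys, items)
from_plain = pa.MapArray.from_arrays([2, 2, 6], keys, items)
assert from_sliced.equals(from_plain)
```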
* GitHub Issue: #40750 Authored-by: AlenkaF Signed-off-by: AlenkaF --- cpp/src/arrow/array/array_list_test.cc | 16 +++++++++++++++- cpp/src/arrow/array/array_nested.cc | 2 +- python/pyarrow/tests/test_array.py | 24 ++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index 18afcc90d71f8..e79ce6fe172b2 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -1287,7 +1287,7 @@ TEST_F(TestMapArray, ValidateErrorNullKey) { } TEST_F(TestMapArray, FromArrays) { - std::shared_ptr offsets1, offsets2, offsets3, offsets4, keys, items; + std::shared_ptr offsets1, offsets2, offsets3, offsets4, offsets5, keys, items; std::vector offsets_is_valid3 = {true, false, true, true}; std::vector offsets_is_valid4 = {true, true, false, true}; @@ -1342,6 +1342,20 @@ TEST_F(TestMapArray, FromArrays) { // Zero-length offsets ASSERT_RAISES(Invalid, MapArray::FromArrays(offsets1->Slice(0, 0), keys, items, pool_)); + // Offseted offsets + ASSERT_OK_AND_ASSIGN(auto map5, + MapArray::FromArrays(offsets1->Slice(1), keys, items, pool_)); + ASSERT_OK(map5->Validate()); + + AssertArraysEqual(*expected1.Slice(1), *map5); + + std::vector offset5_values = {2, 2, 6}; + ArrayFromVector(offset5_values, &offsets5); + ASSERT_OK_AND_ASSIGN(auto map6, MapArray::FromArrays(offsets5, keys, items, pool_)); + ASSERT_OK(map6->Validate()); + + AssertArraysEqual(*map5, *map6); + // Offsets not the right type ASSERT_RAISES(TypeError, MapArray::FromArrays(keys, offsets1, items, pool_)); diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 24e0dfb7081ac..1be771d8228d9 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -790,7 +790,7 @@ MapArray::MapArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& items, int64_t null_count, int64_t offset) { auto pair_data = ArrayData::Make(type->fields()[0]->type(), keys->data()->length, - {nullptr}, {keys->data(), items->data()}, 0, offset); + {nullptr}, {keys->data(), items->data()}, 0); auto map_data = ArrayData::Make(type, length, std::move(buffers), {pair_data}, null_count, offset); SetData(map_data); diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 3754daeb9b4bd..dbe29c5730758 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -1099,6 +1099,30 @@ def test_map_from_arrays(): with pytest.raises(ValueError): pa.MapArray.from_arrays(offsets, keys_with_null, items) + # Check if offset in offsets > 0 + offsets = pa.array(offsets, pa.int32()) + result = pa.MapArray.from_arrays(offsets.slice(1), keys, items) + expected = pa.MapArray.from_arrays([1, 3, 5], keys, items) + + assert result.equals(expected) + assert result.offset == 1 + assert expected.offset == 0 + + offsets = pa.array([0, 0, 0, 0, 0, 0], pa.int32()) + result = pa.MapArray.from_arrays( + offsets.slice(1), + pa.array([], pa.string()), + pa.array([], pa.string()), + ) + expected = pa.MapArray.from_arrays( + [0, 0, 0, 0, 0], + pa.array([], pa.string()), + pa.array([], pa.string()), + ) + assert result.equals(expected) + assert result.offset == 1 + assert expected.offset == 0 + def test_fixed_size_list_from_arrays(): values = pa.array(range(12), pa.int64()) From f462ec7e6b85aef3d84b777bc577441f4e10b214 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: 
Wed, 8 May 2024 10:54:12 -0400 Subject: [PATCH 090/106] MINOR: [Go] Bump golang.org/x/sys from 0.19.0 to 0.20.0 in /go (#41554) Bumps [golang.org/x/sys](https://github.com/golang/sys) from 0.19.0 to 0.20.0.
Commits
  • 7d69d98 unix: extend support for z/OS
  • 7758090 cpu: add support for sve2 detection
  • 9a28524 windows: drop go version tags for unsupported versions
  • 27dc90b unix: update to Linux kernel 6.4
  • See full diff in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=golang.org/x/sys&package-manager=go_modules&previous-version=0.19.0&new-version=0.20.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Matt Topol --- go/go.mod | 2 +- go/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index 35fd9b9915c0b..188e5c6180ff1 100644 --- a/go/go.mod +++ b/go/go.mod @@ -36,7 +36,7 @@ require ( github.com/zeebo/xxh3 v1.0.2 golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 golang.org/x/sync v0.7.0 - golang.org/x/sys v0.19.0 + golang.org/x/sys v0.20.0 golang.org/x/tools v0.20.0 golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 gonum.org/v1/gonum v0.15.0 diff --git a/go/go.sum b/go/go.sum index bf33fed6c4c97..998b3cd8bbcc5 100644 --- a/go/go.sum +++ b/go/go.sum @@ -124,8 +124,8 @@ golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= -golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/tools v0.20.0 h1:hz/CVckiOxybQvFw6h7b/q80NTr9IUQb4s1IIzW7KNY= From f6720276543844ad53dece91a9350b0a821e52d3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 8 May 2024 10:54:32 -0400 Subject: [PATCH 091/106] MINOR: [Go] Bump google.golang.org/protobuf from 1.34.0 to 1.34.1 in /go (#41553) Bumps google.golang.org/protobuf from 1.34.0 to 1.34.1. [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=google.golang.org/protobuf&package-manager=go_modules&previous-version=1.34.0&new-version=1.34.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Signed-off-by: Matt Topol
---
 go/go.mod | 2 +-
 go/go.sum | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/go/go.mod b/go/go.mod
index 188e5c6180ff1..7c14ddcf9e216 100644
--- a/go/go.mod
+++ b/go/go.mod
@@ -41,7 +41,7 @@ require (
 	golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028
 	gonum.org/v1/gonum v0.15.0
 	google.golang.org/grpc v1.63.2
-	google.golang.org/protobuf v1.34.0
+	google.golang.org/protobuf v1.34.1
 	modernc.org/sqlite v1.29.6
 )

diff --git a/go/go.sum b/go/go.sum
index 998b3cd8bbcc5..70e3a533d03f3 100644
--- a/go/go.sum
+++ b/go/go.sum
@@ -138,8 +138,8 @@ google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de h1:
 google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de/go.mod h1:H4O17MA/PE9BsGx3w+a+W2VOLLD1Qf7oJneAoU6WktY=
 google.golang.org/grpc v1.63.2 h1:MUeiw1B2maTVZthpU5xvASfTh3LDbxHd6IJ6QQVU+xM=
 google.golang.org/grpc v1.63.2/go.mod h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDomNkRA=
-google.golang.org/protobuf v1.34.0 h1:Qo/qEd2RZPCf2nKuorzksSknv0d3ERwp1vFG38gSmH4=
-google.golang.org/protobuf v1.34.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos=
+google.golang.org/protobuf v1.34.1 h1:9ddQBjfCyZPOHPUiPxpYESBLc+T8P3E+Vo4IbKZgFWg=
+google.golang.org/protobuf v1.34.1/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=

From 304650145689291eb87db5dd58f7b9776bdfaacf Mon Sep 17 00:00:00 2001
From: Sarah Gilmore <74676073+sgilmore10@users.noreply.github.com>
Date: Wed, 8 May 2024 11:42:41 -0400
Subject: [PATCH 092/106] GH-41435: [CI][MATLAB] Add job to build and test
 MATLAB Interface on `macos-14` (#41592)

### Rationale for this change

Currently, the MATLAB interface is built and tested on `macos-12` - not `macos-14` - because the version of `mathworks/libmexclass` that the MATLAB interface depends on did not support `macos-14`. However, now that https://github.com/apache/arrow/issues/41400 is closed, the version of `mathworks/libmexclass` the MATLAB interface depends on works on `macos-14`, so we will be able to build and test the MATLAB interface on `macos-14`.

**Note**: When adding support for ARM-based macOS builds, we discovered an issue with the way in which we package the MLTBX files for the MATLAB Interface to Arrow. Currently, we bundle all shared libraries for all platforms (.dll, .dylib, and .so) into one large "monolithic" MLTBX file. Unfortunately, putting all platform-specific files into one MLTBX file poses an issue when we support multiple ISAs (e.g. x86 and ARM) because builds for the same operating system with different ISAs will have the same shared library file names. In other words, we will have a library named libarrowproxy.dylib for both ARM and x86 macOS builds. Therefore, we are going to hold off on adding ARM-based macOS builds to the crossbow packaging workflow for now, until we have a chance to properly explore alternative packaging approaches. For example, we may want to consider having platform-specific MLTBX files. However, we still think it is worthwhile to add CI support for `macos-14` in the meantime.

### What changes are included in this PR?

1.
Added workflow to build and test the MATLAB interface on `macos-14` as well as `macos-12`. ### Are these changes tested? N/A. ### Are there any user-facing changes? No. ### Future Directions 1. Add crossbow packaging workflow on `macos-14` once we determine how to package the interface for both ARM-based and Intel-based mac ISAs. * GitHub Issue: #41435 Authored-by: Sarah Gilmore Signed-off-by: Sarah Gilmore --- .github/workflows/matlab.yml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml index 2ae33d1e8d6c6..ca8280927f4a5 100644 --- a/.github/workflows/matlab.yml +++ b/.github/workflows/matlab.yml @@ -98,9 +98,16 @@ jobs: select-by-folder: matlab/test strict: true macos: - name: AMD64 macOS 12 MATLAB - runs-on: macos-12 + name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} MATLAB + runs-on: macos-${{ matrix.macos-version }} if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + strategy: + matrix: + include: + - architecture: AMD64 + macos-version: "12" + - architecture: ARM64 + macos-version: "14" steps: - name: Check out repository uses: actions/checkout@v4 From 5252c6ce13694fa31dbcb2623d1629cd8fe53a47 Mon Sep 17 00:00:00 2001 From: Alex Shcherbakov Date: Wed, 8 May 2024 22:46:45 +0300 Subject: [PATCH 093/106] GH-41594: [Go] Support reading `date64` type & properly validate list-like types (#41595) This PR includes 2 fixes: 1. support reading `date64` columns (as write is supported) 2. properly validate list-like data types (list of unsupported is unsupported) ### Rationale for this change See #41594 ### What changes are included in this PR? 1. Added `date64` reading & conversion funcs similar to `date32` 2. Refactored date type validation ### Are these changes tested? a55cd5324d2c47932410b0c7a9c46075386645d2 ### Are there any user-facing changes? No. 
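As a rough sketch of what now works (the `v17` module path is an assumption;
use the module version that matches your checkout):

```go
package main

import (
	"fmt"
	"strings"

	"github.com/apache/arrow/go/v17/arrow"     // assumed module version
	"github.com/apache/arrow/go/v17/arrow/csv" // assumed module version
)

func main() {
	// date64 values are parsed with the "2006-01-02" layout, and list-like
	// types are now validated recursively: a list of a supported element
	// type is accepted, while e.g. a list of maps would be rejected.
	schema := arrow.NewSchema([]arrow.Field{
		{Name: "date64", Type: arrow.PrimitiveTypes.Date64},
		{Name: "list", Type: arrow.ListOf(arrow.PrimitiveTypes.Int64)},
	}, nil)

	r := csv.NewReader(strings.NewReader("2022-05-09,\"{1,2}\"\n"), schema)
	defer r.Release()
	for r.Next() {
		rec := r.Record()
		for i, col := range rec.Columns() {
			fmt.Printf("%s: %v\n", rec.ColumnName(i), col)
		}
	}
	if err := r.Err(); err != nil {
		panic(err)
	}
}
```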
* GitHub Issue: #41594 Authored-by: candiduslynx Signed-off-by: Matt Topol --- go/arrow/csv/common.go | 40 ++++++++++------- go/arrow/csv/reader.go | 74 +++++++++++++------------------- go/arrow/csv/reader_test.go | 8 ++++ go/arrow/csv/testdata/header.csv | 8 ++-- go/arrow/csv/testdata/types.csv | 8 ++-- go/arrow/csv/transformer.go | 69 +++++++---------------------- 6 files changed, 86 insertions(+), 121 deletions(-) diff --git a/go/arrow/csv/common.go b/go/arrow/csv/common.go index 4455c8b782167..06fed69a77fe5 100644 --- a/go/arrow/csv/common.go +++ b/go/arrow/csv/common.go @@ -239,21 +239,31 @@ func WithStringsReplacer(replacer *strings.Replacer) Option { func validate(schema *arrow.Schema) { for i, f := range schema.Fields() { - switch ft := f.Type.(type) { - case *arrow.BooleanType: - case *arrow.Int8Type, *arrow.Int16Type, *arrow.Int32Type, *arrow.Int64Type: - case *arrow.Uint8Type, *arrow.Uint16Type, *arrow.Uint32Type, *arrow.Uint64Type: - case *arrow.Float16Type, *arrow.Float32Type, *arrow.Float64Type: - case *arrow.StringType, *arrow.LargeStringType: - case *arrow.TimestampType: - case *arrow.Date32Type, *arrow.Date64Type: - case *arrow.Decimal128Type, *arrow.Decimal256Type: - case *arrow.ListType, *arrow.LargeListType, *arrow.FixedSizeListType: - case *arrow.BinaryType, *arrow.LargeBinaryType, *arrow.FixedSizeBinaryType: - case arrow.ExtensionType: - case *arrow.NullType: - default: - panic(fmt.Errorf("arrow/csv: field %d (%s) has invalid data type %T", i, f.Name, ft)) + if !typeSupported(f.Type) { + panic(fmt.Errorf("arrow/csv: field %d (%s) has invalid data type %T", i, f.Name, f.Type)) } } } + +func typeSupported(dt arrow.DataType) bool { + switch dt := dt.(type) { + case *arrow.BooleanType: + case *arrow.Int8Type, *arrow.Int16Type, *arrow.Int32Type, *arrow.Int64Type: + case *arrow.Uint8Type, *arrow.Uint16Type, *arrow.Uint32Type, *arrow.Uint64Type: + case *arrow.Float16Type, *arrow.Float32Type, *arrow.Float64Type: + case *arrow.StringType, *arrow.LargeStringType: + case *arrow.TimestampType: + case *arrow.Date32Type, *arrow.Date64Type: + case *arrow.Decimal128Type, *arrow.Decimal256Type: + case *arrow.MapType: + return false + case arrow.ListLikeType: + return typeSupported(dt.Elem()) + case *arrow.BinaryType, *arrow.LargeBinaryType, *arrow.FixedSizeBinaryType: + case arrow.ExtensionType: + case *arrow.NullType: + default: + return false + } + return true +} diff --git a/go/arrow/csv/reader.go b/go/arrow/csv/reader.go index 18f1083e6a9dc..46591a9a5adee 100644 --- a/go/arrow/csv/reader.go +++ b/go/arrow/csv/reader.go @@ -474,6 +474,10 @@ func (r *Reader) initFieldConverter(bldr array.Builder) func(string) { return func(str string) { r.parseDate32(bldr, str) } + case *arrow.Date64Type: + return func(str string) { + r.parseDate64(bldr, str) + } case *arrow.Time32Type: return func(str string) { r.parseTime32(bldr, str, dt.Unit) @@ -486,17 +490,13 @@ func (r *Reader) initFieldConverter(bldr array.Builder) func(string) { return func(str string) { r.parseDecimal256(bldr, str, dt.Precision, dt.Scale) } - case *arrow.ListType: - return func(s string) { - r.parseList(bldr, s) - } - case *arrow.LargeListType: + case *arrow.FixedSizeListType: return func(s string) { - r.parseLargeList(bldr, s) + r.parseFixedSizeList(bldr.(*array.FixedSizeListBuilder), s, int(dt.Len())) } - case *arrow.FixedSizeListType: + case arrow.ListLikeType: return func(s string) { - r.parseFixedSizeList(bldr, s, int(dt.Len())) + r.parseListLike(bldr.(array.ListLikeBuilder), s) } case *arrow.BinaryType: return func(s 
string) { @@ -740,81 +740,67 @@ func (r *Reader) parseDate32(field array.Builder, str string) { field.(*array.Date32Builder).Append(arrow.Date32FromTime(tm)) } -func (r *Reader) parseTime32(field array.Builder, str string, unit arrow.TimeUnit) { +func (r *Reader) parseDate64(field array.Builder, str string) { if r.isNull(str) { field.AppendNull() return } - val, err := arrow.Time32FromString(str, unit) + tm, err := time.Parse("2006-01-02", str) if err != nil && r.err == nil { r.err = err field.AppendNull() return } - field.(*array.Time32Builder).Append(val) + field.(*array.Date64Builder).Append(arrow.Date64FromTime(tm)) } -func (r *Reader) parseDecimal128(field array.Builder, str string, prec, scale int32) { +func (r *Reader) parseTime32(field array.Builder, str string, unit arrow.TimeUnit) { if r.isNull(str) { field.AppendNull() return } - val, err := decimal128.FromString(str, prec, scale) + val, err := arrow.Time32FromString(str, unit) if err != nil && r.err == nil { r.err = err field.AppendNull() return } - field.(*array.Decimal128Builder).Append(val) + field.(*array.Time32Builder).Append(val) } -func (r *Reader) parseDecimal256(field array.Builder, str string, prec, scale int32) { +func (r *Reader) parseDecimal128(field array.Builder, str string, prec, scale int32) { if r.isNull(str) { field.AppendNull() return } - val, err := decimal256.FromString(str, prec, scale) + val, err := decimal128.FromString(str, prec, scale) if err != nil && r.err == nil { r.err = err field.AppendNull() return } - field.(*array.Decimal256Builder).Append(val) + field.(*array.Decimal128Builder).Append(val) } -func (r *Reader) parseList(field array.Builder, str string) { +func (r *Reader) parseDecimal256(field array.Builder, str string, prec, scale int32) { if r.isNull(str) { field.AppendNull() return } - if !(strings.HasPrefix(str, "{") && strings.HasSuffix(str, "}")) { - r.err = errors.New("invalid list format. 
should start with '{' and end with '}'") - return - } - str = strings.Trim(str, "{}") - listBldr := field.(*array.ListBuilder) - listBldr.Append(true) - if len(str) == 0 { - // we don't want to create the csv reader if we already know the - // string is empty - return - } - valueBldr := listBldr.ValueBuilder() - reader := csv.NewReader(strings.NewReader(str)) - items, err := reader.Read() - if err != nil { + + val, err := decimal256.FromString(str, prec, scale) + if err != nil && r.err == nil { r.err = err + field.AppendNull() return } - for _, str := range items { - r.initFieldConverter(valueBldr)(str) - } + field.(*array.Decimal256Builder).Append(val) } -func (r *Reader) parseLargeList(field array.Builder, str string) { +func (r *Reader) parseListLike(field array.ListLikeBuilder, str string) { if r.isNull(str) { field.AppendNull() return @@ -824,14 +810,13 @@ func (r *Reader) parseLargeList(field array.Builder, str string) { return } str = strings.Trim(str, "{}") - largeListBldr := field.(*array.LargeListBuilder) - largeListBldr.Append(true) + field.Append(true) if len(str) == 0 { // we don't want to create the csv reader if we already know the // string is empty return } - valueBldr := largeListBldr.ValueBuilder() + valueBldr := field.ValueBuilder() reader := csv.NewReader(strings.NewReader(str)) items, err := reader.Read() if err != nil { @@ -843,7 +828,7 @@ func (r *Reader) parseLargeList(field array.Builder, str string) { } } -func (r *Reader) parseFixedSizeList(field array.Builder, str string, n int) { +func (r *Reader) parseFixedSizeList(field *array.FixedSizeListBuilder, str string, n int) { if r.isNull(str) { field.AppendNull() return @@ -853,14 +838,13 @@ func (r *Reader) parseFixedSizeList(field array.Builder, str string, n int) { return } str = strings.Trim(str, "{}") - fixedSizeListBldr := field.(*array.FixedSizeListBuilder) - fixedSizeListBldr.Append(true) + field.Append(true) if len(str) == 0 { // we don't want to create the csv reader if we already know the // string is empty return } - valueBldr := fixedSizeListBldr.ValueBuilder() + valueBldr := field.ValueBuilder() reader := csv.NewReader(strings.NewReader(str)) items, err := reader.Read() if err != nil { diff --git a/go/arrow/csv/reader_test.go b/go/arrow/csv/reader_test.go index b6654dd1984ea..65453db015a7e 100644 --- a/go/arrow/csv/reader_test.go +++ b/go/arrow/csv/reader_test.go @@ -357,6 +357,8 @@ func testCSVReader(t *testing.T, filepath string, withHeader bool, stringsCanBeN {Name: "large_binary", Type: arrow.BinaryTypes.LargeBinary}, {Name: "fixed_size_binary", Type: &arrow.FixedSizeBinaryType{ByteWidth: 3}}, {Name: "uuid", Type: types.NewUUIDType()}, + {Name: "date32", Type: arrow.PrimitiveTypes.Date32}, + {Name: "date64", Type: arrow.PrimitiveTypes.Date64}, }, nil, ) @@ -420,6 +422,8 @@ rec[0]["binary"]: ["\x00\x01\x02"] rec[0]["large_binary"]: ["\x00\x01\x02"] rec[0]["fixed_size_binary"]: ["\x00\x01\x02"] rec[0]["uuid"]: ["00000000-0000-0000-0000-000000000001"] +rec[0]["date32"]: [19121] +rec[0]["date64"]: [1652054400000] rec[1]["bool"]: [false] rec[1]["i8"]: [-2] rec[1]["i16"]: [-2] @@ -442,6 +446,8 @@ rec[1]["binary"]: [(null)] rec[1]["large_binary"]: [(null)] rec[1]["fixed_size_binary"]: [(null)] rec[1]["uuid"]: ["00000000-0000-0000-0000-000000000002"] +rec[1]["date32"]: [19121] +rec[1]["date64"]: [1652054400000] rec[2]["bool"]: [(null)] rec[2]["i8"]: [(null)] rec[2]["i16"]: [(null)] @@ -464,6 +470,8 @@ rec[2]["binary"]: [(null)] rec[2]["large_binary"]: [(null)] rec[2]["fixed_size_binary"]: [(null)] 
rec[2]["uuid"]: [(null)] +rec[2]["date32"]: [(null)] +rec[2]["date64"]: [(null)] `, str1Value, str1Value, str2Value, str2Value) got, want := out.String(), want require.Equal(t, want, got) diff --git a/go/arrow/csv/testdata/header.csv b/go/arrow/csv/testdata/header.csv index 50be4f5e4daca..68ae18a499dee 100644 --- a/go/arrow/csv/testdata/header.csv +++ b/go/arrow/csv/testdata/header.csv @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # -bool;i8;i16;i32;i64;u8;u16;u32;u64;f16;f32;f64;str;large_str;ts;list(i64);large_list(i64);fixed_size_list(i64);binary;large_binary;fixed_size_binary;uuid -true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;1.1;str-1;str-1;2022-05-09T00:01:01;{1,2,3};{1,2,3};{1,2,3};AAEC;AAEC;AAEC;00000000-0000-0000-0000-000000000001 -false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;2.2;;;2022-05-09T23:59:59;{};{};{4,5,6};;;;00000000-0000-0000-0000-000000000002 -null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null \ No newline at end of file +bool;i8;i16;i32;i64;u8;u16;u32;u64;f16;f32;f64;str;large_str;ts;list(i64);large_list(i64);fixed_size_list(i64);binary;large_binary;fixed_size_binary;uuid;date32;date64 +true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;1.1;str-1;str-1;2022-05-09T00:01:01;{1,2,3};{1,2,3};{1,2,3};AAEC;AAEC;AAEC;00000000-0000-0000-0000-000000000001;2022-05-09;2022-05-09 +false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;2.2;;;2022-05-09T23:59:59;{};{};{4,5,6};;;;00000000-0000-0000-0000-000000000002;2022-05-09;2022-05-09 +null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null \ No newline at end of file diff --git a/go/arrow/csv/testdata/types.csv b/go/arrow/csv/testdata/types.csv index d32941f4b214d..91c0cf3b252b3 100644 --- a/go/arrow/csv/testdata/types.csv +++ b/go/arrow/csv/testdata/types.csv @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
# -## supported types: bool;int8;int16;int32;int64;uint8;uint16;uint32;uint64;float16;float32;float64;string;large_string;timestamp;list(i64);large_list(i64);fixed_size_list(i64);binary;large_binary;fixed_size_binary;uuid -true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;1.1;str-1;str-1;2022-05-09T00:01:01;{1,2,3};{1,2,3};{1,2,3};AAEC;AAEC;AAEC;00000000-0000-0000-0000-000000000001 -false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;2.2;;;2022-05-09T23:59:59;{};{};{4,5,6};;;;00000000-0000-0000-0000-000000000002 -null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null \ No newline at end of file +## supported types: bool;int8;int16;int32;int64;uint8;uint16;uint32;uint64;float16;float32;float64;string;large_string;timestamp;list(i64);large_list(i64);fixed_size_list(i64);binary;large_binary;fixed_size_binary;uuid;date32;date64 +true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;1.1;str-1;str-1;2022-05-09T00:01:01;{1,2,3};{1,2,3};{1,2,3};AAEC;AAEC;AAEC;00000000-0000-0000-0000-000000000001;2022-05-09;2022-05-09 +false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;2.2;;;2022-05-09T23:59:59;{};{};{4,5,6};;;;00000000-0000-0000-0000-000000000002;2022-05-09;2022-05-09 +null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null \ No newline at end of file diff --git a/go/arrow/csv/transformer.go b/go/arrow/csv/transformer.go index 90c26ac981078..237437c0441e1 100644 --- a/go/arrow/csv/transformer.go +++ b/go/arrow/csv/transformer.go @@ -29,7 +29,7 @@ import ( "github.com/apache/arrow/go/v17/arrow/array" ) -func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array, stringsReplacer func(string)string) []string { +func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array, stringsReplacer func(string) string) []string { res := make([]string, col.Len()) switch typ.(type) { case *arrow.BooleanType: @@ -215,62 +215,25 @@ func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array, st res[i] = w.nullValue } } - case *arrow.ListType: - arr := col.(*array.List) - listVals, offsets := arr.ListValues(), arr.Offsets() - for i := 0; i < arr.Len(); i++ { - if arr.IsValid(i) { - list := array.NewSlice(listVals, int64(offsets[i]), int64(offsets[i+1])) - var b bytes.Buffer - b.Write([]byte{'{'}) - writer := csv.NewWriter(&b) - writer.Write(w.transformColToStringArr(list.DataType(), list, stringsReplacer)) - writer.Flush() - b.Truncate(b.Len() - 1) - b.Write([]byte{'}'}) - res[i] = b.String() - list.Release() - } else { - res[i] = w.nullValue - } - } - case *arrow.LargeListType: - arr := col.(*array.LargeList) - listVals, offsets := arr.ListValues(), arr.Offsets() - for i := 0; i < arr.Len(); i++ { - if arr.IsValid(i) { - list := array.NewSlice(listVals, int64(offsets[i]), int64(offsets[i+1])) - var b bytes.Buffer - b.Write([]byte{'{'}) - writer := csv.NewWriter(&b) - writer.Write(w.transformColToStringArr(list.DataType(), list, stringsReplacer)) - writer.Flush() - b.Truncate(b.Len() - 1) - b.Write([]byte{'}'}) - res[i] = b.String() - list.Release() - } else { - res[i] = w.nullValue - } - } - case *arrow.FixedSizeListType: - arr := col.(*array.FixedSizeList) + case arrow.ListLikeType: + arr := col.(array.ListLike) listVals := arr.ListValues() for i := 0; i < arr.Len(); i++ { - if arr.IsValid(i) { - list := array.NewSlice(listVals, int64((arr.Len()-1)*i), int64((arr.Len()-1)*(i+1))) - var b bytes.Buffer - b.Write([]byte{'{'}) - writer := csv.NewWriter(&b) - writer.Write(w.transformColToStringArr(list.DataType(), list, 
stringsReplacer)) - writer.Flush() - b.Truncate(b.Len() - 1) - b.Write([]byte{'}'}) - res[i] = b.String() - list.Release() - } else { + if arr.IsNull(i) { res[i] = w.nullValue + continue } + start, end := arr.ValueOffsets(i) + list := array.NewSlice(listVals, start, end) + var b bytes.Buffer + b.Write([]byte{'{'}) + writer := csv.NewWriter(&b) + writer.Write(w.transformColToStringArr(list.DataType(), list, stringsReplacer)) + writer.Flush() + b.Truncate(b.Len() - 1) + b.Write([]byte{'}'}) + res[i] = b.String() + list.Release() } case *arrow.BinaryType: arr := col.(*array.Binary) From 318d22adda3b66bd4a10fddc7789c8a13e4aa540 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Wed, 8 May 2024 22:05:23 +0200 Subject: [PATCH 094/106] MINOR: [Dev] Remove Dane from collaborators list (#41589) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Dane is a new committer. ### What changes are included in this PR? Remove Dane from the collaborators list. ### Are these changes tested? Not required. ### Are there any user-facing changes? No. Authored-by: Raúl Cumplido Signed-off-by: Dane Pitkin --- .asf.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.asf.yaml b/.asf.yaml index 1eb019fea9af1..a1c6434587703 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -21,7 +21,6 @@ github: collaborators: - anjakefala - benibus - - danepitkin - davisusanibar - jbonofre - js8544 From 46e78160933d039991cedfabb9216dc4c861fb4b Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 9 May 2024 06:29:46 +0900 Subject: [PATCH 095/106] GH-41430: [Docs] Use sphinxcontrib-mermaid instead of generating images from .mmd (#41455) ### Rationale for this change This makes the documentation diagrams easier to maintain. ### What changes are included in this PR? * Install sphinxcontrib-mermaid * Install Chromium to generate SVG from .mmd * Use Debian instead of Ubuntu for building docs because Ubuntu provides Chromium only via snap * Use a normal (non-root) user to build the documents because Mermaid requires an additional `--no-sandbox` argument when run as root ### Are these changes tested? Yes. ### Are there any user-facing changes? No.
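For reference, the Sphinx side of this change is small. A minimal sketch of the configuration (assuming `sphinxcontrib-mermaid` and `mermaid-cli` are installed; it matches the `conf.py` hunk further below) looks like:

```python
# docs/source/conf.py (excerpt)
extensions = [
    # ... existing Sphinx extensions ...
    'sphinxcontrib.mermaid',  # render .mmd sources during the Sphinx build
]

# Emit static SVG via mermaid-cli (mmdc/Chromium) instead of client-side JavaScript.
mermaid_output_format = 'svg'
```

Diagrams are then embedded directly from their sources with directives such as `.. mermaid:: ./Flight/DoGet.mmd`, so the pre-rendered `.mmd.svg` files can be deleted.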
* GitHub Issue: #41430 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .github/workflows/docs.yml | 13 ++-- .github/workflows/docs_light.yml | 2 +- ci/conda_env_sphinx.txt | 1 + ci/docker/linux-apt-docs.dockerfile | 60 ++++++++++++------- ci/scripts/cpp_build.sh | 13 ++-- ci/scripts/integration_arrow.sh | 2 + ci/scripts/java_build.sh | 13 +++- ci/scripts/java_cdata_integration.sh | 4 +- ci/scripts/js_build.sh | 19 ++++-- ci/scripts/js_test.sh | 3 +- ci/scripts/python_build.sh | 33 ++++++++-- ci/scripts/r_build.sh | 20 ++++++- dev/archery/archery/docker/core.py | 4 ++ .../archery/integration/tester_java.py | 16 +++-- dev/archery/archery/integration/tester_js.py | 8 ++- dev/tasks/tasks.yml | 12 ++-- docker-compose.yml | 39 +++++++----- docs/requirements.txt | 3 +- docs/source/conf.py | 8 ++- docs/source/format/Flight.rst | 20 +++---- docs/source/format/Flight/DoExchange.mmd | 3 - docs/source/format/Flight/DoExchange.mmd.svg | 1 - docs/source/format/Flight/DoGet.mmd | 3 - docs/source/format/Flight/DoGet.mmd.svg | 1 - docs/source/format/Flight/DoPut.mmd | 3 - docs/source/format/Flight/DoPut.mmd.svg | 1 - docs/source/format/Flight/PollFlightInfo.mmd | 3 - .../format/Flight/PollFlightInfo.mmd.svg | 1 - docs/source/format/FlightSql.rst | 20 +++---- .../format/FlightSql/CommandGetTables.mmd | 3 - .../format/FlightSql/CommandGetTables.mmd.svg | 1 - .../CommandPreparedStatementQuery.mmd | 3 - .../CommandPreparedStatementQuery.mmd.svg | 1 - .../FlightSql/CommandStatementIngest.mmd | 3 - .../FlightSql/CommandStatementIngest.mmd.svg | 1 - .../FlightSql/CommandStatementQuery.mmd | 3 - .../FlightSql/CommandStatementQuery.mmd.svg | 1 - 37 files changed, 210 insertions(+), 135 deletions(-) delete mode 100644 docs/source/format/Flight/DoExchange.mmd.svg delete mode 100644 docs/source/format/Flight/DoGet.mmd.svg delete mode 100644 docs/source/format/Flight/DoPut.mmd.svg delete mode 100644 docs/source/format/Flight/PollFlightInfo.mmd.svg delete mode 100644 docs/source/format/FlightSql/CommandGetTables.mmd.svg delete mode 100644 docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd.svg delete mode 100644 docs/source/format/FlightSql/CommandStatementIngest.mmd.svg delete mode 100644 docs/source/format/FlightSql/CommandStatementQuery.mmd.svg diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index fe49e275d908d..36a0dc014db8d 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -32,12 +32,12 @@ env: jobs: complete: - name: AMD64 Ubuntu 22.04 Complete Documentation + name: AMD64 Debian 12 Complete Documentation runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 150 env: - UBUNTU: "22.04" + JDK: 17 steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 @@ -50,8 +50,8 @@ jobs: uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0 with: path: .docker - key: ubuntu-docs-${{ hashFiles('cpp/**') }} - restore-keys: ubuntu-docs- + key: debian-docs-${{ hashFiles('cpp/**') }} + restore-keys: debian-docs- - name: Setup Python uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: @@ -62,7 +62,8 @@ jobs: env: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - run: archery docker run ubuntu-docs + JDK: 17 + run: archery docker run debian-docs - name: Docker Push if: >- success() && @@ -73,4 +74,4 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} 
ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} continue-on-error: true - run: archery docker push ubuntu-docs + run: archery docker push debian-docs diff --git a/.github/workflows/docs_light.yml b/.github/workflows/docs_light.yml index 376c87651d2d0..947e2ac21b83c 100644 --- a/.github/workflows/docs_light.yml +++ b/.github/workflows/docs_light.yml @@ -31,7 +31,7 @@ on: permissions: contents: read - + env: ARCHERY_DEBUG: 1 ARCHERY_USE_DOCKER_CLI: 1 diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index 83afa69a653a9..4665a32e24bbe 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -28,6 +28,7 @@ sphinx-design sphinx-copybutton sphinx-lint sphinxcontrib-jquery +sphinxcontrib-mermaid sphinx==6.2 # Requirement for doctest-cython # Needs upper pin of 0.3.0, see: diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile index ec424b4e6eaa0..1c916840e071b 100644 --- a/ci/docker/linux-apt-docs.dockerfile +++ b/ci/docker/linux-apt-docs.dockerfile @@ -21,18 +21,34 @@ FROM ${base} ARG r=4.4 ARG jdk=8 -# See R install instructions at https://cloud.r-project.org/bin/linux/ubuntu/ +ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium + +# See R install instructions at https://cloud.r-project.org/bin/linux/ RUN apt-get update -y && \ apt-get install -y \ - dirmngr \ apt-transport-https \ - software-properties-common && \ - wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | \ - tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc && \ - add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu '$(lsb_release -cs)'-cran40/' && \ + dirmngr \ + gpg \ + lsb-release && \ + gpg --keyserver keyserver.ubuntu.com \ + --recv-key 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 && \ + gpg --export 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 | \ + gpg --no-default-keyring \ + --keyring /usr/share/keyrings/cran.gpg \ + --import - && \ + echo "deb [signed-by=/usr/share/keyrings/cran.gpg] https://cloud.r-project.org/bin/linux/$(lsb_release -is | tr 'A-Z' 'a-z') $(lsb_release -cs)-cran40/" | \ + tee /etc/apt/sources.list.d/cran.list && \ + if [ -f /etc/apt/sources.list.d/debian.sources ]; then \ + sed -i \ + -e 's/main$/main contrib non-free non-free-firmware/g' \ + /etc/apt/sources.list.d/debian.sources; \ + fi && \ + apt-get update -y && \ apt-get install -y --no-install-recommends \ autoconf-archive \ automake \ + chromium \ + chromium-sandbox \ curl \ doxygen \ gi-docgen \ @@ -48,6 +64,8 @@ RUN apt-get update -y && \ libxml2-dev \ meson \ ninja-build \ + nodejs \ + npm \ nvidia-cuda-toolkit \ openjdk-${jdk}-jdk-headless \ pandoc \ @@ -55,9 +73,12 @@ RUN apt-get update -y && \ r-base=${r}* \ rsync \ ruby-dev \ + sudo \ wget && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* + rm -rf /var/lib/apt/lists/* && \ + PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \ + npm install -g yarn @mermaid-js/mermaid-cli ENV JAVA_HOME=/usr/lib/jvm/java-${jdk}-openjdk-amd64 @@ -68,20 +89,6 @@ RUN /arrow/ci/scripts/util_download_apache.sh \ ENV PATH=/opt/apache-maven-${maven}/bin:$PATH RUN mvn -version -ARG node=16 -RUN apt-get purge -y npm && \ - apt-get autoremove -y --purge && \ - wget -q -O - https://deb.nodesource.com/setup_${node}.x | bash - && \ - apt-get install -y nodejs && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - npm install -g yarn - -COPY docs/requirements.txt /arrow/docs/ -RUN python3 -m venv ${ARROW_PYTHON_VENV} && \ - . 
${ARROW_PYTHON_VENV}/bin/activate && \ - pip install -r arrow/docs/requirements.txt - COPY c_glib/Gemfile /arrow/c_glib/ RUN gem install --no-document bundler && \ bundle install --gemfile /arrow/c_glib/Gemfile @@ -98,6 +105,17 @@ COPY r/DESCRIPTION /arrow/r/ RUN /arrow/ci/scripts/r_deps.sh /arrow && \ R -e "install.packages('pkgdown')" +RUN useradd --user-group --create-home --groups audio,video arrow +RUN echo "arrow ALL=(ALL:ALL) NOPASSWD:ALL" | \ + EDITOR=tee visudo -f /etc/sudoers.d/arrow +USER arrow + +COPY docs/requirements.txt /arrow/docs/ +RUN sudo chown -R arrow: ${ARROW_PYTHON_VENV} && \ + python3 -m venv ${ARROW_PYTHON_VENV} && \ + . ${ARROW_PYTHON_VENV}/bin/activate && \ + pip install -r arrow/docs/requirements.txt + ENV ARROW_ACERO=ON \ ARROW_AZURE=OFF \ ARROW_BUILD_STATIC=OFF \ diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index e28ceae8801f0..ceeab2455bef6 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -229,12 +229,17 @@ find . -name "*.o" -delete popd if [ -x "$(command -v ldconfig)" ]; then - ldconfig ${ARROW_HOME}/${CMAKE_INSTALL_LIBDIR:-lib} + if [ -x "$(command -v sudo)" ]; then + SUDO=sudo + else + SUDO= + fi + ${SUDO} ldconfig ${ARROW_HOME}/${CMAKE_INSTALL_LIBDIR:-lib} fi if [ "${ARROW_USE_CCACHE}" == "ON" ]; then - echo -e "===\n=== ccache statistics after build\n===" - ccache -sv 2>/dev/null || ccache -s + echo -e "===\n=== ccache statistics after build\n===" + ccache -sv 2>/dev/null || ccache -s fi if command -v sccache &> /dev/null; then @@ -244,6 +249,6 @@ fi if [ "${BUILD_DOCS_CPP}" == "ON" ]; then pushd ${source_dir}/apidoc - doxygen + OUTPUT_DIRECTORY=${build_dir}/apidoc doxygen popd fi diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index a5a012ad2c5c4..2eb58e8dc75ec 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -40,6 +40,8 @@ if [ "${ARROW_INTEGRATION_JAVA}" == "ON" ]; then pip install jpype1 fi +export ARROW_BUILD_ROOT=${build_dir} + # Get more detailed context on crashes export PYTHONFAULTHANDLER=1 diff --git a/ci/scripts/java_build.sh b/ci/scripts/java_build.sh index 2103f0329baec..0fa1edab429c0 100755 --- a/ci/scripts/java_build.sh +++ b/ci/scripts/java_build.sh @@ -75,7 +75,16 @@ fi # Use `2 * ncores` threads mvn="${mvn} -T 2C" -pushd ${source_dir} +# https://github.com/apache/arrow/issues/41429 +# TODO: We want to out-of-source build. This is a workaround. We copy +# all needed files to the build directory from the source directory +# and build in the build directory. 
+mkdir -p ${build_dir} +rm -rf ${build_dir}/format +cp -aL ${arrow_dir}/format ${build_dir}/ +rm -rf ${build_dir}/java +cp -aL ${source_dir} ${build_dir}/ +pushd ${build_dir}/java if [ "${ARROW_JAVA_SHADE_FLATBUFFERS}" == "ON" ]; then mvn="${mvn} -Pshade-flatbuffers" @@ -95,7 +104,7 @@ if [ "${BUILD_DOCS_JAVA}" == "ON" ]; then # HTTP pooling is turned of to avoid download issues https://issues.apache.org/jira/browse/ARROW-11633 mkdir -p ${build_dir}/docs/java/reference ${mvn} -Dcheckstyle.skip=true -Dhttp.keepAlive=false -Dmaven.wagon.http.pool=false clean install site - rsync -a ${arrow_dir}/java/target/site/apidocs/ ${build_dir}/docs/java/reference + rsync -a target/site/apidocs/ ${build_dir}/docs/java/reference fi popd diff --git a/ci/scripts/java_cdata_integration.sh b/ci/scripts/java_cdata_integration.sh index 86ea7cf155350..0ee5d3026aa09 100755 --- a/ci/scripts/java_cdata_integration.sh +++ b/ci/scripts/java_cdata_integration.sh @@ -20,9 +20,9 @@ set -ex arrow_dir=${1} -export ARROW_SOURCE_DIR=${arrow_dir} +build_dir=${2} -pushd ${arrow_dir}/java/c/src/test/python +pushd ${build_dir}/java/c/src/test/python python integration_tests.py diff --git a/ci/scripts/js_build.sh b/ci/scripts/js_build.sh index d61f74f0b7ca1..196539ee0f101 100755 --- a/ci/scripts/js_build.sh +++ b/ci/scripts/js_build.sh @@ -25,7 +25,16 @@ build_dir=${2} : ${BUILD_DOCS_JS:=OFF} -pushd ${source_dir} +# https://github.com/apache/arrow/issues/41429 +# TODO: We want to out-of-source build. This is a workaround. We copy +# all needed files to the build directory from the source directory +# and build in the build directory. +rm -rf ${build_dir}/js +mkdir -p ${build_dir} +cp -aL ${arrow_dir}/LICENSE.txt ${build_dir}/ +cp -aL ${arrow_dir}/NOTICE.txt ${build_dir}/ +cp -aL ${source_dir} ${build_dir}/js +pushd ${build_dir}/js yarn --immutable yarn lint:ci @@ -34,18 +43,18 @@ yarn build if [ "${BUILD_DOCS_JS}" == "ON" ]; then # If apache or upstream are defined use those as remote. # Otherwise use origin which could be a fork on PRs. - if [ "$(git config --get remote.apache.url)" == "git@github.com:apache/arrow.git" ]; then + if [ "$(git -C ${arrow_dir} config --get remote.apache.url)" == "git@github.com:apache/arrow.git" ]; then yarn doc --gitRemote apache - elif [[ "$(git config --get remote.upstream.url)" =~ "https://github.com/apache/arrow" ]]; then + elif [[ "$(git -C ${arrow_dir}config --get remote.upstream.url)" =~ "https://github.com/apache/arrow" ]]; then yarn doc --gitRemote upstream - elif [[ "$(basename -s .git $(git config --get remote.origin.url))" == "arrow" ]]; then + elif [[ "$(basename -s .git $(git -C ${arrow_dir} config --get remote.origin.url))" == "arrow" ]]; then yarn doc else echo "Failed to build docs because the remote is not set correctly. Please set the origin or upstream remote to https://github.com/apache/arrow.git or the apache remote to git@github.com:apache/arrow.git." 
exit 0 fi mkdir -p ${build_dir}/docs/js - rsync -a ${arrow_dir}/js/doc/ ${build_dir}/docs/js + rsync -a doc/ ${build_dir}/docs/js fi popd diff --git a/ci/scripts/js_test.sh b/ci/scripts/js_test.sh index 40de974ede161..863b1c3d34613 100755 --- a/ci/scripts/js_test.sh +++ b/ci/scripts/js_test.sh @@ -20,8 +20,9 @@ set -ex source_dir=${1}/js +build_dir=${2}/js -pushd ${source_dir} +pushd ${build_dir} yarn lint yarn test diff --git a/ci/scripts/python_build.sh b/ci/scripts/python_build.sh index 99153cdf75539..9455baf353633 100755 --- a/ci/scripts/python_build.sh +++ b/ci/scripts/python_build.sh @@ -78,17 +78,42 @@ export PYARROW_PARALLEL=${n_jobs} export CMAKE_PREFIX_PATH export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} -pushd ${source_dir} +# https://github.com/apache/arrow/issues/41429 +# TODO: We want to out-of-source build. This is a workaround. We copy +# all needed files to the build directory from the source directory +# and build in the build directory. +rm -rf ${python_build_dir} +cp -aL ${source_dir} ${python_build_dir} +pushd ${python_build_dir} # - Cannot call setup.py as it may install in the wrong directory # on Debian/Ubuntu (ARROW-15243). # - Cannot use build isolation as we want to use specific dependency versions # (e.g. Numpy, Pandas) on some CI jobs. ${PYTHON:-python} -m pip install --no-deps --no-build-isolation -vv . -# Remove build artifacts from source directory -find build/ -user root -delete popd if [ "${BUILD_DOCS_PYTHON}" == "ON" ]; then + # https://github.com/apache/arrow/issues/41429 + # TODO: We want to out-of-source build. This is a workaround. + # + # Copy docs/source because the "autosummary_generate = True" + # configuration generates files to docs/source/python/generated/. + rm -rf ${python_build_dir}/docs/source + mkdir -p ${python_build_dir}/docs + cp -a ${arrow_dir}/docs/source ${python_build_dir}/docs/ + rm -rf ${python_build_dir}/format + cp -a ${arrow_dir}/format ${python_build_dir}/ + rm -rf ${python_build_dir}/cpp/examples + mkdir -p ${python_build_dir}/cpp + cp -a ${arrow_dir}/cpp/examples ${python_build_dir}/cpp/ + rm -rf ${python_build_dir}/ci + cp -a ${arrow_dir}/ci/ ${python_build_dir}/ ncpus=$(python -c "import os; print(os.cpu_count())") - sphinx-build -b html -j ${ncpus} ${arrow_dir}/docs/source ${build_dir}/docs + export ARROW_CPP_DOXYGEN_XML=${build_dir}/cpp/apidoc/xml + pushd ${build_dir} + sphinx-build \ + -b html \ + ${python_build_dir}/docs/source \ + ${build_dir}/docs + popd fi diff --git a/ci/scripts/r_build.sh b/ci/scripts/r_build.sh index 38b54e4434036..f4dc5a5781c6e 100755 --- a/ci/scripts/r_build.sh +++ b/ci/scripts/r_build.sh @@ -24,15 +24,29 @@ build_dir=${2} : ${BUILD_DOCS_R:=OFF} -pushd ${source_dir} +# https://github.com/apache/arrow/issues/41429 +# TODO: We want to out-of-source build. This is a workaround. We copy +# all needed files to the build directory from the source directory +# and build in the build directory. +rm -rf ${build_dir}/r +cp -aL ${source_dir} ${build_dir}/r +pushd ${build_dir}/r # build first so that any stray compiled files in r/src are ignored ${R_BIN} CMD build . 
-${R_BIN} CMD INSTALL ${INSTALL_ARGS} arrow*.tar.gz +if [ -x "$(command -v sudo)" ]; then + SUDO=sudo +else + SUDO= +fi +${SUDO} \ + env \ + PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig:${PKG_CONFIG_PATH} \ + ${R_BIN} CMD INSTALL ${INSTALL_ARGS} arrow*.tar.gz if [ "${BUILD_DOCS_R}" == "ON" ]; then ${R_BIN} -e "pkgdown::build_site(install = FALSE)" - rsync -a ${source_dir}/docs/ ${build_dir}/docs/r + rsync -a docs/ ${build_dir}/docs/r fi popd diff --git a/dev/archery/archery/docker/core.py b/dev/archery/archery/docker/core.py index 7376bb0a3b72d..cb831060022a4 100644 --- a/dev/archery/archery/docker/core.py +++ b/dev/archery/archery/docker/core.py @@ -371,6 +371,10 @@ def run(self, service_name, command=None, *, env=None, volumes=None, v = "{}:{}".format(v['source'], v['target']) args.extend(['-v', v]) + # append capabilities from the compose conf + for c in service.get('cap_add', []): + args.extend([f'--cap-add={c}']) + # infer whether an interactive shell is desired or not if command in ['cmd.exe', 'bash', 'sh', 'powershell']: args.append('-it') diff --git a/dev/archery/archery/integration/tester_java.py b/dev/archery/archery/integration/tester_java.py index 8e7a0bb99f9de..ccc807410a848 100644 --- a/dev/archery/archery/integration/tester_java.py +++ b/dev/archery/archery/integration/tester_java.py @@ -18,17 +18,23 @@ import contextlib import functools import os +from pathlib import Path import subprocess from . import cdata from .tester import Tester, CDataExporter, CDataImporter from .util import run_cmd, log -from ..utils.source import ARROW_ROOT_DEFAULT + + +ARROW_BUILD_ROOT = os.environ.get( + 'ARROW_BUILD_ROOT', + Path(__file__).resolve().parents[5] +) def load_version_from_pom(): import xml.etree.ElementTree as ET - tree = ET.parse(os.path.join(ARROW_ROOT_DEFAULT, 'java', 'pom.xml')) + tree = ET.parse(os.path.join(ARROW_BUILD_ROOT, 'java', 'pom.xml')) tag_pattern = '{http://maven.apache.org/POM/4.0.0}version' version_tag = list(tree.getroot().findall(tag_pattern))[0] return version_tag.text @@ -48,7 +54,7 @@ def load_version_from_pom(): _ARROW_TOOLS_JAR = os.environ.get( "ARROW_JAVA_INTEGRATION_JAR", os.path.join( - ARROW_ROOT_DEFAULT, + ARROW_BUILD_ROOT, "java/tools/target", f"arrow-tools-{_arrow_version}-jar-with-dependencies.jar" ) @@ -56,7 +62,7 @@ def load_version_from_pom(): _ARROW_C_DATA_JAR = os.environ.get( "ARROW_C_DATA_JAVA_INTEGRATION_JAR", os.path.join( - ARROW_ROOT_DEFAULT, + ARROW_BUILD_ROOT, "java/c/target", f"arrow-c-data-{_arrow_version}.jar" ) @@ -64,7 +70,7 @@ def load_version_from_pom(): _ARROW_FLIGHT_JAR = os.environ.get( "ARROW_FLIGHT_JAVA_INTEGRATION_JAR", os.path.join( - ARROW_ROOT_DEFAULT, + ARROW_BUILD_ROOT, "java/flight/flight-integration-tests/target", f"flight-integration-tests-{_arrow_version}-jar-with-dependencies.jar" ) diff --git a/dev/archery/archery/integration/tester_js.py b/dev/archery/archery/integration/tester_js.py index c7f363ba54687..3d1a229931cde 100644 --- a/dev/archery/archery/integration/tester_js.py +++ b/dev/archery/archery/integration/tester_js.py @@ -16,13 +16,17 @@ # under the License. 
import os +from pathlib import Path from .tester import Tester from .util import run_cmd, log -from ..utils.source import ARROW_ROOT_DEFAULT -ARROW_JS_ROOT = os.path.join(ARROW_ROOT_DEFAULT, 'js') +ARROW_BUILD_ROOT = os.environ.get( + 'ARROW_BUILD_ROOT', + Path(__file__).resolve().parents[5] +) +ARROW_JS_ROOT = os.path.join(ARROW_BUILD_ROOT, 'js') _EXE_PATH = os.path.join(ARROW_JS_ROOT, 'bin') _VALIDATE = os.path.join(_EXE_PATH, 'integration.ts') _JSON_TO_ARROW = os.path.join(_EXE_PATH, 'json-to-arrow.ts') diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 126b0fcb6f76a..146fa52fa958b 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -65,7 +65,7 @@ groups: - r-binary-packages - ubuntu-* - wheel-* - - test-ubuntu-*-docs + - test-debian-*-docs {############################# Testing tasks #################################} @@ -1458,15 +1458,15 @@ tasks: {% endfor %} # be sure to update binary-task.rb when upgrading ubuntu - test-ubuntu-22.04-docs: + test-debian-12-docs: ci: github template: docs/github.linux.yml params: env: - UBUNTU: 22.04 + JDK: 17 pr_number: Unset flags: "-v $PWD/build/:/build/" - image: ubuntu-docs + image: debian-docs publish: false artifacts: - docs.tar.gz @@ -1594,8 +1594,8 @@ tasks: template: docs/github.linux.yml params: env: - UBUNTU: 22.04 + JDK: 17 pr_number: Unset flags: "-v $PWD/build/:/build/" - image: ubuntu-docs + image: debian-docs publish: true diff --git a/docker-compose.yml b/docker-compose.yml index d771fc2d22a35..9bedb59a77be8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -131,7 +131,8 @@ x-hierarchy: - debian-cpp: - debian-c-glib: - debian-ruby - - debian-python + - debian-python: + - debian-docs - debian-go: - debian-go-cgo - debian-go-cgo-python @@ -145,8 +146,7 @@ x-hierarchy: - ubuntu-c-glib: - ubuntu-ruby - ubuntu-lint - - ubuntu-python: - - ubuntu-docs + - ubuntu-python - ubuntu-python-sdist-test - ubuntu-r - ubuntu-r-only-r @@ -1228,6 +1228,8 @@ services: # We should extend the list of enabled rules after adding this build to # the CI pipeline. image: ${REPO}:${ARCH}-conda-python-${PYTHON}-pandas-${PANDAS} + cap_add: + - SYS_ADMIN environment: <<: [*common, *ccache] ARROW_SUBSTRAIT: "ON" @@ -1378,7 +1380,7 @@ services: /arrow/ci/scripts/python_build.sh /arrow /build && /arrow/ci/scripts/java_jni_build.sh /arrow $${ARROW_HOME} /build /tmp/dist/java/ && /arrow/ci/scripts/java_build.sh /arrow /build /tmp/dist/java && - /arrow/ci/scripts/java_cdata_integration.sh /arrow /tmp/dist/java" ] + /arrow/ci/scripts/java_cdata_integration.sh /arrow /build" ] conda-python-cython2: # Usage: @@ -1680,7 +1682,7 @@ services: command: &js-command > /bin/bash -c " /arrow/ci/scripts/js_build.sh /arrow /build && - /arrow/ci/scripts/js_test.sh /arrow" + /arrow/ci/scripts/js_test.sh /arrow /build" #################################### C# ##################################### @@ -1759,29 +1761,34 @@ services: ################################ Docs ####################################### - ubuntu-docs: + debian-docs: # Usage: - # docker-compose build ubuntu-cpp - # docker-compose build ubuntu-python - # docker-compose build ubuntu-docs - # docker-compose run --rm ubuntu-docs - image: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-docs + # docker-compose build debian-cpp + # docker-compose build debian-python + # docker-compose build debian-docs + # docker-compose run --rm debian-docs + image: ${REPO}:${ARCH}-debian-${DEBIAN}-docs build: context: . 
dockerfile: ci/docker/linux-apt-docs.dockerfile cache_from: - - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-docs + - ${REPO}:${ARCH}-debian-${DEBIAN}-docs args: r: ${R} jdk: ${JDK} maven: ${MAVEN} node: ${NODE} - base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-python-3 + base: ${REPO}:${ARCH}-debian-${DEBIAN}-python-3 + # This is for Chromium used by Mermaid. Chromium uses namespace + # isolation for security by default. + cap_add: + - SYS_ADMIN environment: <<: [*common, *ccache] ARROW_CUDA: "ON" ARROW_CXX_FLAGS_DEBUG: "-g1" ARROW_C_FLAGS_DEBUG: "-g1" + ARROW_HOME: "/tmp/local" ARROW_JAVA_SKIP_GIT_PLUGIN: ARROW_SUBSTRAIT: "ON" BUILD_DOCS_C_GLIB: "ON" @@ -1790,9 +1797,11 @@ services: BUILD_DOCS_JS: "ON" BUILD_DOCS_PYTHON: "ON" BUILD_DOCS_R: "ON" - volumes: *ubuntu-volumes - command: &docs-command > + volumes: *debian-volumes + command: > /bin/bash -c " + sudo mkdir -p /build /ccache && + sudo chown -R `id --user --name`: /build /ccache && /arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && /arrow/ci/scripts/c_glib_build.sh /arrow /build && diff --git a/docs/requirements.txt b/docs/requirements.txt index 8891680814dff..afb252e17457b 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -8,8 +8,9 @@ myst-parser[linkify] numpydoc pydata-sphinx-theme~=0.14 sphinx-autobuild -sphinx-design sphinx-copybutton +sphinx-design sphinx-lint +sphinxcontrib-mermaid sphinx==6.2 pandas diff --git a/docs/source/conf.py b/docs/source/conf.py index 05340dc923c89..b487200555a09 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -125,6 +125,7 @@ 'sphinx.ext.intersphinx', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', + 'sphinxcontrib.mermaid', ] # Show members for classes in .. autosummary @@ -137,7 +138,9 @@ } # Breathe configuration -breathe_projects = {"arrow_cpp": "../../cpp/apidoc/xml"} +breathe_projects = { + "arrow_cpp": os.environ.get("ARROW_CPP_DOXYGEN_XML", "../../cpp/apidoc/xml"), +} breathe_default_project = "arrow_cpp" # Overridden conditionally below @@ -584,6 +587,9 @@ # # texinfo_no_detailmenu = False +# -- Options for mermaid output ------------------------------------------- + +mermaid_output_format = 'svg' def setup(app): # Use a config value to indicate whether CUDA API docs can be generated. diff --git a/docs/source/format/Flight.rst b/docs/source/format/Flight.rst index 7ee84952b4350..c65a1f70bde7f 100644 --- a/docs/source/format/Flight.rst +++ b/docs/source/format/Flight.rst @@ -68,9 +68,8 @@ Downloading Data A client that wishes to download the data would: -.. figure:: ./Flight/DoGet.mmd.svg - - Retrieving data via ``DoGet``. +.. mermaid:: ./Flight/DoGet.mmd + :caption: Retrieving data via ``DoGet``. #. Construct or acquire a ``FlightDescriptor`` for the data set they are interested in. @@ -168,9 +167,8 @@ data. However, ``GetFlightInfo`` doesn't return until the query completes, so the client is blocked. In this situation, the client can use ``PollFlightInfo`` instead of ``GetFlightInfo``: -.. figure:: ./Flight/PollFlightInfo.mmd.svg - - Polling a long-running query by ``PollFlightInfo``. +.. mermaid:: ./Flight/PollFlightInfo.mmd + :caption: Polling a long-running query by ``PollFlightInfo``. #. Construct or acquire a ``FlightDescriptor``, as before. #. Call ``PollFlightInfo(FlightDescriptor)`` to get a ``PollInfo`` @@ -229,9 +227,8 @@ Uploading Data To upload data, a client would: -.. figure:: ./Flight/DoPut.mmd.svg - - Uploading data via ``DoPut``. +.. mermaid:: ./Flight/DoPut.mmd + :caption: Uploading data via ``DoPut``. #. 
Construct or acquire a ``FlightDescriptor``, as before. #. Call ``DoPut(FlightData)`` and upload a stream of Arrow record @@ -257,9 +254,8 @@ require being stateful if implemented using ``DoGet`` and ``DoPut``. Instead, ``DoExchange`` allows this to be implemented as a single call. A client would: -.. figure:: ./Flight/DoExchange.mmd.svg - - Complex data flow with ``DoExchange``. +.. mermaid:: ./Flight/DoExchange.mmd + :caption: Complex data flow with ``DoExchange``. #. Construct or acquire a ``FlightDescriptor``, as before. #. Call ``DoExchange(FlightData)``. diff --git a/docs/source/format/Flight/DoExchange.mmd b/docs/source/format/Flight/DoExchange.mmd index 14f1789aeaaa7..f7586bf35eb4f 100644 --- a/docs/source/format/Flight/DoExchange.mmd +++ b/docs/source/format/Flight/DoExchange.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandGetTables.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/Flight/DoExchange.mmd.svg b/docs/source/format/Flight/DoExchange.mmd.svg deleted file mode 100644 index 204d63d77218d..0000000000000 --- a/docs/source/format/Flight/DoExchange.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientServerThe first FlightData includes a FlightDescriptorDoExchange(FlightData)1stream of FlightData2stream of FlightData3par[[Client sends data]][[Server sends data]]ClientServer \ No newline at end of file diff --git a/docs/source/format/Flight/DoGet.mmd b/docs/source/format/Flight/DoGet.mmd index c2e3cd034448c..cac59afb8219f 100644 --- a/docs/source/format/Flight/DoGet.mmd +++ b/docs/source/format/Flight/DoGet.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandGetTables.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/Flight/DoGet.mmd.svg b/docs/source/format/Flight/DoGet.mmd.svg deleted file mode 100644 index 48a50d77ed33f..0000000000000 --- a/docs/source/format/Flight/DoGet.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientMetadata ServerData ServerGetFlightInfo(FlightDescriptor)1FlightInfo{endpoints: [FlightEndpoint{ticket: Ticket}, …]}2This may be parallelizedDoGet(Ticket)3stream of FlightData4loop[for each endpoint in FlightInfo.endpoints]ClientMetadata ServerData Server \ No newline at end of file diff --git a/docs/source/format/Flight/DoPut.mmd b/docs/source/format/Flight/DoPut.mmd index 5845edef1f466..876505da2d300 100644 --- a/docs/source/format/Flight/DoPut.mmd +++ b/docs/source/format/Flight/DoPut.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. 
-%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandGetTables.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/Flight/DoPut.mmd.svg b/docs/source/format/Flight/DoPut.mmd.svg deleted file mode 100644 index 9e490e152bdb3..0000000000000 --- a/docs/source/format/Flight/DoPut.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientServerThe first FlightData includes a FlightDescriptorDoPut(FlightData)1stream of FlightData2PutResult{app_metadata}3ClientServer \ No newline at end of file diff --git a/docs/source/format/Flight/PollFlightInfo.mmd b/docs/source/format/Flight/PollFlightInfo.mmd index d062a3a216958..f91c077b655c0 100644 --- a/docs/source/format/Flight/PollFlightInfo.mmd +++ b/docs/source/format/Flight/PollFlightInfo.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd):/data minlag/mermaid-cli -i /data/PollFlightInfo.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/Flight/PollFlightInfo.mmd.svg b/docs/source/format/Flight/PollFlightInfo.mmd.svg deleted file mode 100644 index 1890361f88ce4..0000000000000 --- a/docs/source/format/Flight/PollFlightInfo.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientMetadata ServerData ServerThis may be parallelizedSome endpoints may be processed while pollingloop[for each endpoint in FlightInfo.endpoints]PollFlightInfo(FlightDescriptor)1PollInfo{descriptor: FlightDescriptor', ...}2PollFlightInfo(FlightDescriptor')3PollInfo{descriptor: FlightDescriptor'', ...}4PollFlightInfo(FlightDescriptor'')5PollInfo{descriptor: null, info: FlightInfo{endpoints: [FlightEndpoint{ticket: Ticket}, …]}6DoGet(Ticket)7stream of FlightData8ClientMetadata ServerData Server \ No newline at end of file diff --git a/docs/source/format/FlightSql.rst b/docs/source/format/FlightSql.rst index 1a43e4bdff306..181efce286e70 100644 --- a/docs/source/format/FlightSql.rst +++ b/docs/source/format/FlightSql.rst @@ -242,21 +242,17 @@ Close and invalidate the current session context. Sequence Diagrams ================= -.. figure:: ./FlightSql/CommandGetTables.mmd.svg +.. mermaid:: ./FlightSql/CommandGetTables.mmd + :caption: Listing available tables. - Listing available tables. +.. mermaid:: ./FlightSql/CommandStatementQuery.mmd + :caption: Executing an ad-hoc query. -.. figure:: ./FlightSql/CommandStatementQuery.mmd.svg +.. mermaid:: ./FlightSql/CommandPreparedStatementQuery.mmd + :caption: Creating a prepared statement, then executing it. - Executing an ad-hoc query. - -.. figure:: ./FlightSql/CommandPreparedStatementQuery.mmd.svg - - Creating a prepared statement, then executing it. - -.. figure:: ./FlightSql/CommandStatementIngest.mmd.svg - - Executing a bulk ingestion. +.. mermaid:: ./FlightSql/CommandStatementIngest.mmd + :caption: Executing a bulk ingestion. External Resources ================== diff --git a/docs/source/format/FlightSql/CommandGetTables.mmd b/docs/source/format/FlightSql/CommandGetTables.mmd index f151411647f23..e6b18ed7dc08b 100644 --- a/docs/source/format/FlightSql/CommandGetTables.mmd +++ b/docs/source/format/FlightSql/CommandGetTables.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. 
-%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandGetTables.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/FlightSql/CommandGetTables.mmd.svg b/docs/source/format/FlightSql/CommandGetTables.mmd.svg deleted file mode 100644 index 4e71c01982289..0000000000000 --- a/docs/source/format/FlightSql/CommandGetTables.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientServerGetFlightInfo(CommandGetTables)1FlightInfo{…Ticket…}2DoGet(Ticket)3stream of FlightData4ClientServer \ No newline at end of file diff --git a/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd b/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd index cbd1eb6014bca..ce18b91eaa33e 100644 --- a/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd +++ b/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandPreparedStatementQuery.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd.svg b/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd.svg deleted file mode 100644 index cbf6a78e9a5ce..0000000000000 --- a/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ServerClientServerClientoptional response with updated handleloop[for each endpoint in FlightInfo.endpoints]loop[for each invocation of the prepared statement]DoAction(ActionCreatePreparedStatementRequest)1ActionCreatePreparedStatementResult{handle}2DoPut(CommandPreparedStatementQuery)3stream of FlightData4DoPutPreparedStatementResult{handle}5GetFlightInfo(CommandPreparedStatementQuery)6FlightInfo{endpoints: [FlightEndpoint{…}, …]}7DoGet(endpoint.ticket)8stream of FlightData9DoAction(ActionClosePreparedStatementRequest)10ActionClosePreparedStatementRequest{}11 \ No newline at end of file diff --git a/docs/source/format/FlightSql/CommandStatementIngest.mmd b/docs/source/format/FlightSql/CommandStatementIngest.mmd index 781289d77b41a..0578f465d4dda 100644 --- a/docs/source/format/FlightSql/CommandStatementIngest.mmd +++ b/docs/source/format/FlightSql/CommandStatementIngest.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandGetTables.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/FlightSql/CommandStatementIngest.mmd.svg b/docs/source/format/FlightSql/CommandStatementIngest.mmd.svg deleted file mode 100644 index e2aa72459afa5..0000000000000 --- a/docs/source/format/FlightSql/CommandStatementIngest.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ServerClientServerClientDoPut(CommandStatementIngest)1stream of FlightData2PutResult{DoPutUpdateResult{RecordCount: int64}}3 \ No newline at end of file diff --git a/docs/source/format/FlightSql/CommandStatementQuery.mmd b/docs/source/format/FlightSql/CommandStatementQuery.mmd index 7b67fecfb75c6..f26aa2f951fcf 100644 --- a/docs/source/format/FlightSql/CommandStatementQuery.mmd +++ b/docs/source/format/FlightSql/CommandStatementQuery.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. 
-%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandStatementQuery.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/FlightSql/CommandStatementQuery.mmd.svg b/docs/source/format/FlightSql/CommandStatementQuery.mmd.svg deleted file mode 100644 index f5e8c79f137ff..0000000000000 --- a/docs/source/format/FlightSql/CommandStatementQuery.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientServerGetFlightInfo(CommandStatementQuery)1FlightInfo{endpoints: [FlightEndpoint{…}, …]}2DoGet(endpoint.ticket)3stream of FlightData4loop[for each endpoint in FlightInfo.endpoints]ClientServer \ No newline at end of file From 071ffaf2633eb58540a872514507ab362cc26fb4 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 9 May 2024 00:22:02 +0200 Subject: [PATCH 096/106] GH-41256: [Format][Docs] Add a canonical extension type specification for JSON (#41257) ### Rationale for this change As per #41256 this proposes a specification of a canonical extension type for JSON serialized data. ### What changes are included in this PR? This adds to documentation of canonical extension types. ### Are these changes tested? No as only docs are changed. ### Are there any user-facing changes? No. * GitHub Issue: #41256 Lead-authored-by: Rok Mihevc Co-authored-by: Will Jones Co-authored-by: Antoine Pitrou Signed-off-by: Rok Mihevc --- docs/source/format/CanonicalExtensions.rst | 25 +++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 1f055b7f8edb5..47c161c14cafc 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -51,7 +51,7 @@ types: 3) Its serialization *must* be described in the proposal and should not require unduly implementation work or unusual software dependencies - (for example, a trivial custom text format or JSON would be acceptable). + (for example, a trivial custom text format or a JSON-based format would be acceptable). 4) Its expected semantics *should* be described as well and any potential ambiguities or pain points addressed or at least mentioned. @@ -251,6 +251,27 @@ Variable shape tensor Values inside each **data** tensor element are stored in row-major/C-contiguous order according to the corresponding **shape**. +.. _json_extension: + +JSON +==== + +* Extension name: ``arrow.json``. + +* The storage type of this extension is ``String`` or + or ``LargeString`` or ``StringView``. + Only UTF-8 encoded JSON as specified in `rfc8259`_ is supported. + +* Extension type parameters: + + This type does not have any parameters. + +* Description of the serialization: + + Metadata is either an empty string or a JSON string with an empty object. + In the future, additional fields may be added, but they are not required + to interpret the array. + ========================= Community Extension Types ========================= @@ -268,3 +289,5 @@ GeoArrow Arrow extension types for representing vector geometries. It is well known within the Arrow geospatial subcommunity. The GeoArrow specification is not yet finalized. + +.. 
_rfc8259: https://datatracker.ietf.org/doc/html/rfc8259 From c5be02703312f01186ceea2d910a93e5421e3c83 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 9 May 2024 00:46:29 +0200 Subject: [PATCH 097/106] GH-41298: [Format][Docs] Add a canonical extension type specification for UUID (#41299) ### Rationale for this change Several users have expressed a need for a UUID type. This provides a canonical UUID extension type specification. ### What changes are included in this PR? This adds to the documentation of canonical extension types. ### Are these changes tested? No, as only docs are changed. ### Are there any user-facing changes? No. * GitHub Issue: #41298 Lead-authored-by: Rok Mihevc Co-authored-by: Antoine Pitrou Signed-off-by: Rok Mihevc --- docs/source/format/CanonicalExtensions.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 47c161c14cafc..c60f095dd354d 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -272,6 +272,17 @@ JSON In the future, additional fields may be added, but they are not required to interpret the array. +UUID +==== + +* Extension name: ``arrow.uuid``. + +* The storage type of the extension is ``FixedSizeBinary`` with a length of 16 bytes. + +.. note:: + A specific UUID version is not required or guaranteed. This extension represents + UUIDs as FixedSizeBinary(16) with big-endian notation and does not interpret the bytes in any way. + ========================= Community Extension Types ========================= From 7bfe02db04e34fc1ab6df6f647a76899e0c654db Mon Sep 17 00:00:00 2001 From: David Schlosnagle Date: Wed, 8 May 2024 19:46:15 -0400 Subject: [PATCH 098/106] GH-41573: [Java] VectorSchemaRoot uses inefficient stream to copy fieldVectors (#41574) ### Rationale for this change While reviewing allocation profiling of an Arrow-intensive application, I noticed significant allocations due to `ArrayList#grow()` originating from `org.apache.arrow.vector.VectorSchemaRoot#getFieldVectors()`. That method uses an inefficient `fieldVectors.stream().collect(Collectors.toList())` to create a list copy, leading to reallocations as the target list is collected. This could be replaced with a more efficient `new ArrayList<>(fieldVectors)` to make a pre-sized list copy, or, even better, an unmodifiable view via `Collections.unmodifiableList(fieldVectors)`. ### What changes are included in this PR? * Use `Collections.unmodifiableList(List)` to return an unmodifiable list view of `fieldVectors` from `getFieldVectors()` * Pre-size the `fieldVectors` `ArrayList` in static factory `VectorSchemaRoot#create(Schema, BufferAllocator)` * `VectorSchemaRoot#setRowCount(int)` iterates over instance `fieldVectors` instead of a copied list (similar to existing `allocateNew()`, `clear()`, `contentToTSVString()`). ### Are these changes tested? These changes are covered by existing unit and integration tests. ### Are there any user-facing changes?
No. * GitHub Issue: #41573 Authored-by: David Schlosnagle Signed-off-by: David Li --- .../java/org/apache/arrow/vector/VectorSchemaRoot.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java index 8768a90c80b83..9a92ce5060b1b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java @@ -19,6 +19,7 @@ import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -121,7 +122,7 @@ public VectorSchemaRoot(Schema schema, List<FieldVector> fieldVectors, int rowCount * Creates a new set of empty vectors corresponding to the given schema. */ public static VectorSchemaRoot create(Schema schema, BufferAllocator allocator) { - List<FieldVector> fieldVectors = new ArrayList<>(); + List<FieldVector> fieldVectors = new ArrayList<>(schema.getFields().size()); for (Field field : schema.getFields()) { FieldVector vector = field.createVector(allocator); fieldVectors.add(vector); @@ -160,7 +161,7 @@ public void clear() { } public List<FieldVector> getFieldVectors() { - return fieldVectors.stream().collect(Collectors.toList()); + return Collections.unmodifiableList(fieldVectors); } /** @@ -236,7 +237,7 @@ public int getRowCount() { */ public void setRowCount(int rowCount) { this.rowCount = rowCount; - for (FieldVector v : getFieldVectors()) { + for (FieldVector v : fieldVectors) { v.setValueCount(rowCount); } } From f6127a6d18af12ce18a0b8b1eac02346721cc399 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 9 May 2024 04:58:59 +0200 Subject: [PATCH 099/106] GH-41356: [Release][Docs] Update post release documentation task to remove the warnings banner for stable version (#41377) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change With every release, the dev documentation is moved to `docs/` and becomes the stable version of the documentation, but the version warning banner is still present. ### What changes are included in this PR? This PR removes the banner before the dev docs are copied to the `docs/` folder. ### Are these changes tested? Not yet. ### Are there any user-facing changes? No.
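The script change in the diff below performs the rewrite with `find` and `sed`. As an illustration only (a sketch, not part of the release tooling), the equivalent logic in Python would be:

```python
from pathlib import Path

OLD = "DOCUMENTATION_OPTIONS.show_version_warning_banner = true"
NEW = "DOCUMENTATION_OPTIONS.show_version_warning_banner = false"

# Walk the freshly extracted release docs and flip the banner flag in place.
for path in Path("docs_new/docs").rglob("*"):
    if not path.is_file():
        continue
    try:
        text = path.read_text(encoding="utf-8")
    except (UnicodeDecodeError, ValueError):
        continue  # skip binary assets such as images and fonts
    if OLD in text:
        path.write_text(text.replace(OLD, NEW), encoding="utf-8")
```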
* GitHub Issue: #41356

Lead-authored-by: AlenkaF
Co-authored-by: Alenka Frim
Co-authored-by: Raúl Cumplido
Signed-off-by: Sutou Kouhei
---
 dev/release/post-08-docs.sh | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/dev/release/post-08-docs.sh b/dev/release/post-08-docs.sh
index c59f9b96857a6..58a462551f199 100755
--- a/dev/release/post-08-docs.sh
+++ b/dev/release/post-08-docs.sh
@@ -72,13 +72,28 @@ fi
 # delete current stable docs and restore all previous versioned docs
 rm -rf docs/*
 git checkout "${versioned_paths[@]}"
+# Download and untar released docs in a temp folder
+rm -rf docs_new
+mkdir docs_new
+pushd docs_new
 curl \
   --fail \
   --location \
   --remote-name \
   https://apache.jfrog.io/artifactory/arrow/docs/${version}/docs.tar.gz
 tar xvf docs.tar.gz
-rm -f docs.tar.gz
+# Update DOCUMENTATION_OPTIONS.show_version_warning_banner
+find docs \
+  -type f \
+  -exec \
+  sed -i.bak \
+  -e "s/DOCUMENTATION_OPTIONS.show_version_warning_banner = true/DOCUMENTATION_OPTIONS.show_version_warning_banner = false/g" \
+  {} \;
+find ./ -name '*.bak' -delete
+popd
+mv docs_new/docs/* docs/
+rm -rf docs_new
+
 if [ "$is_major_release" = "yes" ] ; then
   previous_series=${previous_version%.*}
   mv docs_temp docs/${previous_series}

From bd444106af494b3d4c6cce0af88f6ce2a6a327eb Mon Sep 17 00:00:00 2001
From: Tom McTiernan
Date: Thu, 9 May 2024 20:15:43 +0100
Subject: [PATCH 100/106] GH-39645: [Python] Fix read_table for encrypted
 parquet (#39438)

### Rationale for this change

Currently, if you try to read an encrypted parquet with read_table, passing decryption_properties, the reading/decryption of the file fails in the happy path (pyarrow.dataset available for import), as the decryption properties are missing.

### What changes are included in this PR?

Pass through the argument that was intended to have been passed.

### Are these changes tested?

We have tested this locally on an encrypted parquet dataset - please advise on any further testing you would like beyond that and the standard CI.

### Are there any user-facing changes?

Not in any cases where their code was previously working. The intended behaviour for encrypted dataset decryption should start working.
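A minimal usage sketch of the fixed path (illustrative: `crypto_factory` and `kms_connection_config` stand in for an encryption setup like the one in the tests below, so this is not runnable on its own):

```python
import pyarrow.parquet as pq
import pyarrow.parquet.encryption as pe

# Assumed to exist: a CryptoFactory and KmsConnectionConfig created as in
# the test suite below (e.g. backed by an in-memory KMS client).
decryption_config = pe.DecryptionConfiguration()
file_decryption_properties = crypto_factory.file_decryption_properties(
    kms_connection_config, decryption_config)

# With this fix the properties reach the dataset scanner, so this works for
# both a single encrypted file and an encrypted dataset directory.
table = pq.read_table("encrypted.parquet",
                      decryption_properties=file_decryption_properties)
```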
* Closes: #39645 Lead-authored-by: Tom McTiernan Co-authored-by: Don Co-authored-by: Rok Mihevc Signed-off-by: Rok Mihevc --- python/pyarrow/_dataset_parquet.pxd | 1 + python/pyarrow/_dataset_parquet.pyx | 30 ++- .../pyarrow/_dataset_parquet_encryption.pyx | 8 + python/pyarrow/parquet/core.py | 5 +- .../pyarrow/tests/parquet/test_encryption.py | 180 +++++++++--------- .../pyarrow/tests/test_dataset_encryption.py | 12 ++ 6 files changed, 142 insertions(+), 94 deletions(-) diff --git a/python/pyarrow/_dataset_parquet.pxd b/python/pyarrow/_dataset_parquet.pxd index d5bc172d324d5..0a3a2ff526ea4 100644 --- a/python/pyarrow/_dataset_parquet.pxd +++ b/python/pyarrow/_dataset_parquet.pxd @@ -29,6 +29,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): cdef: CParquetFragmentScanOptions* parquet_options object _parquet_decryption_config + object _decryption_properties cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp) cdef CReaderProperties* reader_properties(self) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index a55e889ba8246..4942336a12666 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -56,7 +56,7 @@ from pyarrow._parquet cimport ( try: from pyarrow._dataset_parquet_encryption import ( - set_encryption_config, set_decryption_config + set_encryption_config, set_decryption_config, set_decryption_properties ) parquet_encryption_enabled = True except ImportError: @@ -127,8 +127,7 @@ cdef class ParquetFileFormat(FileFormat): 'instance of ParquetReadOptions') if default_fragment_scan_options is None: - default_fragment_scan_options = ParquetFragmentScanOptions( - **scan_args) + default_fragment_scan_options = ParquetFragmentScanOptions(**scan_args) elif isinstance(default_fragment_scan_options, dict): default_fragment_scan_options = ParquetFragmentScanOptions( **default_fragment_scan_options) @@ -715,6 +714,9 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): decryption_config : pyarrow.dataset.ParquetDecryptionConfig, default None If not None, use the provided ParquetDecryptionConfig to decrypt the Parquet file. + decryption_properties : pyarrow.parquet.FileDecryptionProperties, default None + If not None, use the provided FileDecryptionProperties to decrypt encrypted + Parquet file. page_checksum_verification : bool, default False If True, verify the page checksum for each page read from the file. 
""" @@ -729,6 +731,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): thrift_string_size_limit=None, thrift_container_size_limit=None, decryption_config=None, + decryption_properties=None, bint page_checksum_verification=False): self.init(shared_ptr[CFragmentScanOptions]( new CParquetFragmentScanOptions())) @@ -743,6 +746,8 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): self.thrift_container_size_limit = thrift_container_size_limit if decryption_config is not None: self.parquet_decryption_config = decryption_config + if decryption_properties is not None: + self.decryption_properties = decryption_properties self.page_checksum_verification = page_checksum_verification cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp): @@ -812,6 +817,25 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): raise ValueError("size must be larger than zero") self.reader_properties().set_thrift_container_size_limit(size) + @property + def decryption_properties(self): + if not parquet_encryption_enabled: + raise NotImplementedError( + "Unable to access encryption features. " + "Encryption is not enabled in your installation of pyarrow." + ) + return self._decryption_properties + + @decryption_properties.setter + def decryption_properties(self, config): + if not parquet_encryption_enabled: + raise NotImplementedError( + "Encryption is not enabled in your installation of pyarrow, but " + "decryption_properties were provided." + ) + set_decryption_properties(self, config) + self._decryption_properties = config + @property def parquet_decryption_config(self): if not parquet_encryption_enabled: diff --git a/python/pyarrow/_dataset_parquet_encryption.pyx b/python/pyarrow/_dataset_parquet_encryption.pyx index 11a7174eb3c9d..c8f5e5b01bf81 100644 --- a/python/pyarrow/_dataset_parquet_encryption.pyx +++ b/python/pyarrow/_dataset_parquet_encryption.pyx @@ -162,6 +162,14 @@ def set_encryption_config( opts.parquet_options.parquet_encryption_config = c_config +def set_decryption_properties( + ParquetFragmentScanOptions opts not None, + FileDecryptionProperties config not None +): + cdef CReaderProperties* reader_props = opts.reader_properties() + reader_props.file_decryption_properties(config.unwrap()) + + def set_decryption_config( ParquetFragmentScanOptions opts not None, ParquetDecryptionConfig config not None diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 69a1c9d19aae2..f54a203c8794c 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -1299,7 +1299,7 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None, f"local file systems, not {type(filesystem)}" ) - # check for single fragment dataset + # check for single fragment dataset or dataset directory single_file = None self._base_dir = None if not isinstance(path_or_paths, list): @@ -1313,8 +1313,6 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None, except ValueError: filesystem = LocalFileSystem(use_mmap=memory_map) finfo = filesystem.get_file_info(path_or_paths) - if finfo.is_file: - single_file = path_or_paths if finfo.type == FileType.Directory: self._base_dir = path_or_paths else: @@ -1771,6 +1769,7 @@ def read_table(source, *, columns=None, use_threads=True, ignore_prefixes=ignore_prefixes, pre_buffer=pre_buffer, coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, + decryption_properties=decryption_properties, thrift_string_size_limit=thrift_string_size_limit, 
thrift_container_size_limit=thrift_container_size_limit, page_checksum_verification=page_checksum_verification, diff --git a/python/pyarrow/tests/parquet/test_encryption.py b/python/pyarrow/tests/parquet/test_encryption.py index edb6410d2fa0d..ff388ef506997 100644 --- a/python/pyarrow/tests/parquet/test_encryption.py +++ b/python/pyarrow/tests/parquet/test_encryption.py @@ -65,6 +65,44 @@ def basic_encryption_config(): return basic_encryption_config +def setup_encryption_environment(custom_kms_conf): + """ + Sets up and returns the KMS connection configuration and crypto factory + based on provided KMS configuration parameters. + """ + kms_connection_config = pe.KmsConnectionConfig(custom_kms_conf=custom_kms_conf) + + def kms_factory(kms_connection_configuration): + return InMemoryKmsClient(kms_connection_configuration) + + # Create our CryptoFactory + crypto_factory = pe.CryptoFactory(kms_factory) + + return kms_connection_config, crypto_factory + + +def write_encrypted_file(path, data_table, footer_key_name, col_key_name, + footer_key, col_key, encryption_config): + """ + Writes an encrypted parquet file based on the provided parameters. + """ + # Setup the custom KMS configuration with provided keys + custom_kms_conf = { + footer_key_name: footer_key.decode("UTF-8"), + col_key_name: col_key.decode("UTF-8"), + } + + # Setup encryption environment + kms_connection_config, crypto_factory = setup_encryption_environment( + custom_kms_conf) + + # Write the encrypted parquet file + write_encrypted_parquet(path, data_table, encryption_config, + kms_connection_config, crypto_factory) + + return kms_connection_config, crypto_factory + + def test_encrypted_parquet_write_read(tempdir, data_table): """Write an encrypted parquet, verify it's encrypted, and then read it.""" path = tempdir / PARQUET_NAME @@ -81,20 +119,10 @@ def test_encrypted_parquet_write_read(tempdir, data_table): cache_lifetime=timedelta(minutes=5.0), data_key_length_bits=256) - kms_connection_config = pe.KmsConnectionConfig( - custom_kms_conf={ - FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"), - COL_KEY_NAME: COL_KEY.decode("UTF-8"), - } - ) - - def kms_factory(kms_connection_configuration): - return InMemoryKmsClient(kms_connection_configuration) + kms_connection_config, crypto_factory = write_encrypted_file( + path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY, + encryption_config) - crypto_factory = pe.CryptoFactory(kms_factory) - # Write with encryption properties - write_encrypted_parquet(path, data_table, encryption_config, - kms_connection_config, crypto_factory) verify_file_encrypted(path) # Read with decryption properties @@ -150,36 +178,22 @@ def test_encrypted_parquet_write_read_wrong_key(tempdir, data_table): cache_lifetime=timedelta(minutes=5.0), data_key_length_bits=256) - kms_connection_config = pe.KmsConnectionConfig( - custom_kms_conf={ - FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"), - COL_KEY_NAME: COL_KEY.decode("UTF-8"), - } - ) - - def kms_factory(kms_connection_configuration): - return InMemoryKmsClient(kms_connection_configuration) + write_encrypted_file(path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, + FOOTER_KEY, COL_KEY, encryption_config) - crypto_factory = pe.CryptoFactory(kms_factory) - # Write with encryption properties - write_encrypted_parquet(path, data_table, encryption_config, - kms_connection_config, crypto_factory) verify_file_encrypted(path) - # Read with decryption properties - wrong_kms_connection_config = pe.KmsConnectionConfig( - custom_kms_conf={ - # Wrong keys - mixup in 
names - FOOTER_KEY_NAME: COL_KEY.decode("UTF-8"), - COL_KEY_NAME: FOOTER_KEY.decode("UTF-8"), - } - ) + wrong_kms_connection_config, wrong_crypto_factory = setup_encryption_environment({ + FOOTER_KEY_NAME: COL_KEY.decode("UTF-8"), # Intentionally wrong + COL_KEY_NAME: FOOTER_KEY.decode("UTF-8"), # Intentionally wrong + }) + decryption_config = pe.DecryptionConfiguration( cache_lifetime=timedelta(minutes=5.0)) with pytest.raises(ValueError, match=r"Incorrect master key used"): read_encrypted_parquet( path, decryption_config, wrong_kms_connection_config, - crypto_factory) + wrong_crypto_factory) def test_encrypted_parquet_read_no_decryption_config(tempdir, data_table): @@ -219,23 +233,12 @@ def test_encrypted_parquet_write_no_col_key(tempdir, data_table): encryption_config = pe.EncryptionConfiguration( footer_key=FOOTER_KEY_NAME) - kms_connection_config = pe.KmsConnectionConfig( - custom_kms_conf={ - FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"), - COL_KEY_NAME: COL_KEY.decode("UTF-8"), - } - ) - - def kms_factory(kms_connection_configuration): - return InMemoryKmsClient(kms_connection_configuration) - - crypto_factory = pe.CryptoFactory(kms_factory) with pytest.raises(OSError, match="Either column_keys or uniform_encryption " "must be set"): # Write with encryption properties - write_encrypted_parquet(path, data_table, encryption_config, - kms_connection_config, crypto_factory) + write_encrypted_file(path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, + FOOTER_KEY, b"", encryption_config) def test_encrypted_parquet_write_kms_error(tempdir, data_table, @@ -497,24 +500,11 @@ def test_encrypted_parquet_loop(tempdir, data_table, basic_encryption_config): # Encrypt the footer with the footer key, # encrypt column `a` and column `b` with another key, - # keep `c` plaintext - encryption_config = basic_encryption_config + # keep `c` plaintext, defined in basic_encryption_config + kms_connection_config, crypto_factory = write_encrypted_file( + path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY, + basic_encryption_config) - kms_connection_config = pe.KmsConnectionConfig( - custom_kms_conf={ - FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"), - COL_KEY_NAME: COL_KEY.decode("UTF-8"), - } - ) - - def kms_factory(kms_connection_configuration): - return InMemoryKmsClient(kms_connection_configuration) - - crypto_factory = pe.CryptoFactory(kms_factory) - - # Write with encryption properties - write_encrypted_parquet(path, data_table, encryption_config, - kms_connection_config, crypto_factory) verify_file_encrypted(path) decryption_config = pe.DecryptionConfiguration( @@ -537,32 +527,46 @@ def test_read_with_deleted_crypto_factory(tempdir, data_table, basic_encryption_ Test that decryption properties can be used if the crypto factory is no longer alive """ path = tempdir / PARQUET_NAME - encryption_config = basic_encryption_config - kms_connection_config = pe.KmsConnectionConfig( - custom_kms_conf={ - FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"), - COL_KEY_NAME: COL_KEY.decode("UTF-8"), - } - ) - - def kms_factory(kms_connection_configuration): - return InMemoryKmsClient(kms_connection_configuration) - - encryption_crypto_factory = pe.CryptoFactory(kms_factory) - write_encrypted_parquet(path, data_table, encryption_config, - kms_connection_config, encryption_crypto_factory) + kms_connection_config, crypto_factory = write_encrypted_file( + path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY, + basic_encryption_config) verify_file_encrypted(path) - # Use a local function to get 
decryption properties, so the crypto factory that - # creates the properties will be deleted after it returns. - def get_decryption_properties(): - decryption_crypto_factory = pe.CryptoFactory(kms_factory) - decryption_config = pe.DecryptionConfiguration( - cache_lifetime=timedelta(minutes=5.0)) - return decryption_crypto_factory.file_decryption_properties( - kms_connection_config, decryption_config) + # Create decryption properties and delete the crypto factory that created + # the properties afterwards. + decryption_config = pe.DecryptionConfiguration( + cache_lifetime=timedelta(minutes=5.0)) + file_decryption_properties = crypto_factory.file_decryption_properties( + kms_connection_config, decryption_config) + del crypto_factory result = pq.ParquetFile( - path, decryption_properties=get_decryption_properties()) + path, decryption_properties=file_decryption_properties) result_table = result.read(use_threads=True) assert data_table.equals(result_table) + + +def test_encrypted_parquet_read_table(tempdir, data_table, basic_encryption_config): + """Write an encrypted parquet then read it back using read_table.""" + path = tempdir / PARQUET_NAME + + # Write the encrypted parquet file using the utility function + kms_connection_config, crypto_factory = write_encrypted_file( + path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY, + basic_encryption_config) + + decryption_config = pe.DecryptionConfiguration( + cache_lifetime=timedelta(minutes=5.0)) + file_decryption_properties = crypto_factory.file_decryption_properties( + kms_connection_config, decryption_config) + + # Read the encrypted parquet file using read_table + result_table = pq.read_table(path, decryption_properties=file_decryption_properties) + + # Assert that the read table matches the original data + assert data_table.equals(result_table) + + # Read the encrypted parquet folder using read_table + result_table = pq.read_table( + tempdir, decryption_properties=file_decryption_properties) + assert data_table.equals(result_table) diff --git a/python/pyarrow/tests/test_dataset_encryption.py b/python/pyarrow/tests/test_dataset_encryption.py index 2a631db9fc0fa..0d8b4a152ab9f 100644 --- a/python/pyarrow/tests/test_dataset_encryption.py +++ b/python/pyarrow/tests/test_dataset_encryption.py @@ -142,6 +142,18 @@ def test_dataset_encryption_decryption(): assert table.equals(dataset.to_table()) + # set decryption properties for parquet fragment scan options + decryption_properties = crypto_factory.file_decryption_properties( + kms_connection_config, decryption_config) + pq_scan_opts = ds.ParquetFragmentScanOptions( + decryption_properties=decryption_properties + ) + + pformat = pa.dataset.ParquetFileFormat(default_fragment_scan_options=pq_scan_opts) + dataset = ds.dataset("sample_dataset", format=pformat, filesystem=mockfs) + + assert table.equals(dataset.to_table()) + @pytest.mark.skipif( not encryption_unavailable, reason="Parquet Encryption is currently enabled" From 1c62df5255ced89171d5b846dc82f5a10d519f4a Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 9 May 2024 15:17:20 -0400 Subject: [PATCH 101/106] GH-41179: [Docs] Documentation for Dissociated IPC Protocol (#41180) ### Rationale for this change Adding documentation for the Experimental Dissociated IPC Protocol for splitting arrow IPC metadata and body data into separate streams for use with newer transports such as UCX and Libfabric. 
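To make the tag scheme in the added specification concrete (the sequence number occupies the low 32 bits of the 64-bit tag, and the message body type its most significant byte), here is a small illustrative sketch, not part of the patch itself:

```python
# Illustrative only: pack and unpack the 64-bit control-flow tag described
# in the specification below. Bits 0-31 carry the sequence number, bits
# 56-63 the message body type; the reserved bits 32-55 stay zero.
def make_tag(sequence_number: int, body_type: int) -> int:
    assert 0 <= sequence_number < 2**32 and 0 <= body_type < 2**8
    return (body_type << 56) | sequence_number

def split_tag(tag: int) -> tuple[int, int]:
    return tag & 0xFFFF_FFFF, (tag >> 56) & 0xFF

assert split_tag(make_tag(7, 1)) == (7, 1)
```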
The relevant mailing list discussion and vote: https://lists.apache.org/thread/k26n1h90b1wy1w5k53whh0t8o4nd0yx7 ### What changes are included in this PR? Only documentation changes and images for the Arrow docs site. I tagged people I thought might be relevant for reviewing, but feel free to tag and add anyone else that might seem relevant to reviewing this. Thanks! * GitHub Issue: #41179 Lead-authored-by: Matt Topol Co-authored-by: Benjamin Kietzman Co-authored-by: Sutou Kouhei Co-authored-by: Antoine Pitrou Signed-off-by: Matt Topol --- docs/source/format/Columnar.rst | 2 + docs/source/format/DissociatedIPC.rst | 403 ++++++++++++++++++ .../format/DissociatedIPC/ClientFlowchart.mmd | 37 ++ .../DissociatedIPC/SequenceDiagramSame.mmd | 43 ++ .../SequenceDiagramSeparate.mmd | 44 ++ docs/source/format/Flight.rst | 2 + docs/source/format/index.rst | 1 + 7 files changed, 532 insertions(+) create mode 100644 docs/source/format/DissociatedIPC.rst create mode 100644 docs/source/format/DissociatedIPC/ClientFlowchart.mmd create mode 100644 docs/source/format/DissociatedIPC/SequenceDiagramSame.mmd create mode 100644 docs/source/format/DissociatedIPC/SequenceDiagramSeparate.mmd diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 0cfece2586294..ec6a7fa5e334a 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -1108,6 +1108,8 @@ includes a serialized Flatbuffer type along with an optional message body. We define this message format before describing how to serialize each constituent IPC message type. +.. _ipc-message-format: + Encapsulated message format --------------------------- diff --git a/docs/source/format/DissociatedIPC.rst b/docs/source/format/DissociatedIPC.rst new file mode 100644 index 0000000000000..0b0861399cb2f --- /dev/null +++ b/docs/source/format/DissociatedIPC.rst @@ -0,0 +1,403 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _dissociated-ipc: + +======================== +Dissociated IPC Protocol +======================== + +.. warning:: + + Experimental: The Dissociated IPC Protocol is experimental in its current + form. Based on feedback and usage the protocol definition may change until + it is fully standardized. + +Rationale +========= + +The :ref:`Arrow IPC format ` describes a protocol for transferring +Arrow data as a stream of record batches. This protocol expects a continuous +stream of bytes divided into discrete messages (using a length prefix and +continuation indicator). 
Each discrete message consists of two portions: + +* A `Flatbuffers`_ header message +* A series of bytes consisting of the flattened and packed body buffers (some + message types, like Schema messages, do not have this section) + - This is referred to as the *message body* in the IPC format spec. + +For most cases, the existing IPC format as it currently exists is sufficiently efficient: + +* Receiving data in the IPC format allows zero-copy utilization of the body + buffer bytes, no deserialization is required to form Arrow Arrays +* An IPC file format can be memory-mapped because it is location agnostic + and the bytes of the file are exactly what is expected in memory. + +However, there are use cases that aren't handled by this: + +* Constructing the IPC record batch message requires allocating a contiguous + chunk of bytes and copying all of the data buffers into it, packed together + back-to-back. This pessimizes the common case of wrapping existing, directly + consumable data into an IPC message. +* Even if Arrow data is located in a memory accessible across process boundaries + or transports (such as UCX), there is no standard way to specify that shared + location to consumers which could take advantage of it. +* Arrow data located on a non-CPU device (such as a GPU) cannot be sent using + Arrow IPC without having to copy the data back to the host device or copying + the Flatbuffers metadata bytes into device memory. + + * By the same token, receiving IPC messages into device memory would require + performing a copy of the Flatbuffers metadata back to the host CPU device. This + is due to the fact that the IPC stream interleaves data and metadata across a + single stream. + +This protocol attempts to solve these use cases in an efficient manner. + +Goals +----- + +* Define a generic protocol for passing Arrow IPC data, not tied to any particular + transport, that also allows for utilizing non-CPU device memory, shared memory, and + newer "high performance" transports such as `UCX`_ or `libfabric`_. + + * This allows for the data in the body to be kept on non-CPU devices (like GPUs) + without expensive device-to-host copies. + +* Allow for using :ref:`Flight RPC ` purely for control flow by separating + the stream of IPC metadata from IPC body bytes + +Definitions +----------- + +IPC Metadata + The Flatbuffers message bytes that encompass the header of an Arrow IPC message + +Tag + A little-endian ``uint64`` value used for flow control and used in determining + how to interpret the body of a message. Specific bits can be masked to allow + identifying messages by only a portion of the tag, leaving the rest of the bits + to be used for control flow or other message metadata. Some transports, such as + UCX, have built-in support for such tag values and will provide them in CPU + memory regardless of whether or not the body of the message may reside on a + non-CPU device. + +Sequence Number + A little-endian, 4-byte unsigned integer starting at 0 for a stream, indicating + the sequence order of messages. It is also used to identify specific messages to + tie the IPC metadata header to its corresponding body since the metadata and body + can be sent across separate pipes/streams/transports. + + If a sequence number reaches ``UINT32_MAX``, it should be allowed to roll over as + it is unlikely there would be enough unprocessed messages waiting to be processed + that would cause an overlap of sequence numbers. 
+
+   The sequence number serves two purposes: to identify corresponding metadata and
+   tagged body data messages, and to ensure we do not rely on messages having to arrive
+   in order. A client should use the sequence number to correctly order messages as
+   they arrive for processing.
+
+The Protocol
+============
+
+A reference example implementation utilizing `libcudf`_ and `UCX`_ can be found in the
+`arrow-experiments repo <https://github.com/apache/arrow-experiments>`_.
+
+Requirements
+------------
+
+A transport implementing this protocol **MUST** provide two pieces of functionality:
+
+* Message sending
+
+  * Delimited messages (like gRPC) as opposed to non-delimited streams (like plain TCP
+    without further framing).
+
+  * Alternatively, a framing mechanism like the :ref:`encapsulated message format <ipc-message-format>`
+    for the IPC protocol can be used while leaving out the body bytes.
+
+* Tagged message sending
+
+  * Sending a message that has an attached little-endian, unsigned 64-bit integral tag
+    for control flow. A tag like this allows control flow to operate on a message whose body
+    is on a non-CPU device without requiring the message itself to get copied off of the device.
+
+URI Specification
+-----------------
+
+When providing a URI to a consumer to contact for use with this protocol (such as via
+the :ref:`Location URI for Flight <flight-location-uris>`), the URI should specify a scheme,
+like *ucx:* or *fabric:*, that is easily identifiable. In addition, the URI should
+encode the following URI query parameters:
+
+.. note::
+   As this protocol matures, this document will get updated with commonly recognized
+   transport schemes that get used with it.
+
+* ``want_data`` - **REQUIRED** - uint64 integer value
+
+  * This value should be used to tag an initial message to the server to initiate a
+    data transfer. The body of the initiating message should be an opaque binary identifier
+    of the data stream being requested (like the ``Ticket`` in the Flight RPC protocol)
+
+* ``free_data`` - **OPTIONAL** - uint64 integer value
+
+  * If the server might send messages using offsets / addresses for remote memory accessing
+    or shared memory locations, the URI should include this parameter. This value is used to
+    tag messages sent from the client to the data server, containing specific offsets / addresses
+    which were provided that are no longer required by the client (i.e. any operations that
+    directly reference those memory locations, such as copying the remote data into local memory,
+    have been completed).
+
+* ``remote_handle`` - **OPTIONAL** - base64-encoded string
+
+  * When working with shared memory or remote memory, this value indicates any required
+    handle or identifier that is necessary for accessing the memory.
+
+    * Using UCX, this would be an *rkey* value
+
+    * With CUDA IPC, this would be the value of the base GPU pointer or memory handle,
+      and subsequent addresses would be offsets from this base pointer.
+
+Handling of Backpressure
+------------------------
+
+*Currently* this proposal does not specify any way to manage the backpressure of
+messages to throttle for memory and bandwidth reasons. For now, this will be
+**transport-defined** rather than locking into something sub-optimal.
+
+As usage among different transports and libraries grows, common patterns will emerge
+that will allow for a generic, but efficient, way to handle backpressure across
+different use cases.
+
+.. note::
+   While the protocol itself is transport agnostic, the current usage and examples
+   have so far only been tested with the UCX and libfabric transports.
+
+Protocol Description
+====================
+
+There are two possibilities that can occur:
+
+1. The streams of metadata and body data are sent across separate connections
+
+.. mermaid:: ./DissociatedIPC/SequenceDiagramSeparate.mmd
+
+
+2. The streams of metadata and body data are sent simultaneously across the
+   same connection
+
+.. mermaid:: ./DissociatedIPC/SequenceDiagramSame.mmd
+
+Server Sequence
+---------------
+
+There can be either a single server handling both the IPC Metadata stream and the
+Body data streams, or separate servers for handling the IPC Metadata and the body
+data. This allows for streaming of data across either a single transport pipe or
+two pipes if desired.
+
+Metadata Stream Sequence
+''''''''''''''''''''''''
+
+The standing state of the server is waiting for a **tagged** message with a specific
+``<want_data>`` tag value to initiate a transfer. This ``<want_data>`` value is defined
+by the server and propagated to any clients via the URI they are provided. This protocol
+does not prescribe any particular value so that it will not interfere with any other
+existing protocols that rely on tag values. The body of that message will contain an
+opaque, binary identifier to indicate a particular dataset / data stream to send.
+
+.. note::
+
+   For instance, the **ticket** that was passed with a *FlightInfo* message would be
+   the body of this message. Because it is opaque, it can be anything the server wants
+   to use. The URI and identifier do not need to be given to the client via Flight RPC,
+   but could come across from any transport or protocol desired.
+
+Upon receiving a ``<want_data>`` request, the server *should* respond by sending a stream
+of messages consisting of the following:
+
+.. mermaid::
+
+    block-beta
+    columns 8
+
+    block:P["\n\n\n\nPrefix"]:5
+      T["Message type\nByte 0"]
+      S["Sequence number\nBytes 1-4"]
+    end
+    H["Flatbuffer bytes\nRest of the message"]:3
+
+* A 5-byte prefix
+
+  - The first byte of the message indicates the type of message; currently there are only
+    two allowed message types (more types may get added in the future):
+
+    0) End of Stream
+    1) Flatbuffers IPC Metadata Message
+
+  - the next 4 bytes are a little-endian, unsigned 32-bit integer indicating the sequence number of
+    the message. The first message in the stream (**MUST** always be a schema message) **MUST**
+    have a sequence number of ``0``. Each subsequent message **MUST** increment the number by
+    ``1``.
+
+* The full Flatbuffers bytes of an Arrow IPC header
+
+As defined in the Arrow IPC format, each metadata message can represent a chunk of data or
+dictionaries for use by the stream of data.
+
+After sending the last metadata message, the server **MUST** indicate the end of the stream
+by sending a message consisting of **exactly** 5 bytes:
+
+* The first byte is ``0``, indicating an **End of Stream** message
+* The last 4 bytes are the sequence number (4-byte, unsigned integer in little-endian byte order)
+
+Data Stream Sequence
+''''''''''''''''''''
+
+If a single server is handling both the data and metadata streams, then the data messages
+**should** begin being sent to the client in parallel with the metadata messages. Otherwise,
+as with the metadata sequence, the standing state of the server is to wait for a **tagged**
+message with the ``<want_data>`` tag value, whose body indicates the dataset / data stream
+to send to the client.
+
+For each IPC message in the stream of data, a **tagged** message **MUST** be sent on the data
+stream if that message has a body (i.e. a Record Batch or Dictionary message).
+The :term:`tag <Tag>` for each message should be structured as follows:
+
+.. mermaid::
+
+    block-beta
+    columns 8
+
+    S["Sequence number\nBytes 0-3"]:4
+    U["Unused (Reserved)\nBytes 4-6"]:3
+    T["Message type\nByte 7"]:1
+
+* The *least significant* 4 bytes (bits 0 - 31) of the tag should be the unsigned 32-bit, little-endian sequence
+  number of the message.
+* The *most significant* byte (bits 56 - 63) of the tag indicates the message body **type** as an 8-bit
+  unsigned integer. Currently only two message types are specified, but more can be added as
+  needed to expand the protocol:
+
+  0) The body contains the raw body buffer bytes as a packed buffer (i.e. the standard IPC
+     format body bytes)
+  1) The body contains a series of unsigned, little-endian 64-bit integer pairs to represent
+     either shared or remote memory, schematically structured as
+
+     * The first two integers (e.g. the first 16 bytes) represent the *total* size (in bytes)
+       of all buffers and the number of buffers in this message (and thus the number of following
+       pairs of ``uint64``)
+
+     * Each subsequent pair of ``uint64`` values is an address / offset followed by the length of
+       that particular buffer.
+
+* All unspecified bits (bits 32 - 55) of the tag are *reserved* for future use by potential updates
+  to this protocol. For now they **MUST** be 0.
+
+.. note::
+
+   Any shared/remote memory addresses that are sent across **MUST** be kept alive by the server
+   until a corresponding tagged ``<free_data>`` message is received. If the client disconnects
+   before sending any ``<free_data>`` messages, it can be assumed to be safe to clean up the memory
+   if desired by the server.
+
+After sending the last tagged IPC body message, the server should maintain the connection and wait
+for tagged ``<free_data>`` messages. The structure of these ``<free_data>`` messages is simple:
+one or more unsigned, little-endian 64-bit integers which indicate the addresses/offsets that can
+be freed.
+
+Once there are no more outstanding addresses to be freed, the work for this stream is complete.
+
+Client Sequence
+---------------
+
+A client for this protocol needs to concurrently handle both the data and metadata streams of
+messages, which may either both come from the same server or different servers. Below is a flowchart
+showing how a client might handle the metadata and data streams:
+
+.. mermaid:: ./DissociatedIPC/ClientFlowchart.mmd
+
+#. First the client sends a tagged message using the ``<want_data>`` value it was provided in the
+   URI as the tag, and the opaque ID as the body.
+
+   * If the metadata and data servers are separate, then a ``<want_data>`` message needs to be sent
+     separately to each.
+   * In either scenario, the metadata and data streams can be processed concurrently and/or asynchronously
+     depending on the nature of the transports.
+
+#. For each **untagged** message the client receives in the metadata stream:
+
+   * The first byte of the message indicates whether it is an *End of Stream* message (value ``0``)
+     or a metadata message (value ``1``).
+   * The next 4 bytes are the sequence number of the message, an unsigned 32-bit integer in
+     little-endian byte order.
+   * If it is **not** an *End of Stream* message, the remaining bytes are the IPC Flatbuffer bytes which
+     can be interpreted as normal.
+
+     * If the message has a body (i.e. Record Batch or Dictionary message) then the client should retrieve
+       a tagged message from the Data Stream using the same sequence number.
+
+   * If it **is** an *End of Stream* message, then it is safe to close the metadata connection if there are
+     no gaps in the sequence numbers received.
+
+#. When a metadata message that requires a body is received, the tag mask of ``0x00000000FFFFFFFF`` **should**
+   be used alongside the sequence number to match the message regardless of the higher bytes (e.g. we only
+   care about matching the lower 4 bytes to the sequence number)
+
+   * Once received, the Most Significant Byte's value determines how the client processes the body data:
+
+     * If the most significant byte is 0: Then the body of the message is the raw IPC packed body buffers,
+       allowing it to easily be processed with the corresponding metadata header bytes.
+
+     * If the most significant byte is 1: The body of the message will consist of a series of pairs of
+       unsigned, 64-bit integers in little-endian byte order.
+
+       * The first two integers represent *1)* the total size of all the body buffers together, to allow
+         for easy allocation if an intermediate buffer is needed, and *2)* the number of buffers being sent (``nbuf``).
+
+       * The rest of the message will be ``nbuf`` pairs of integers, one for each buffer. Each pair is
+         *1)* the address / offset of the buffer and *2)* the length of that buffer. Memory can then be retrieved
+         via shared or remote memory routines based on the underlying transport. These addresses / offsets **MUST**
+         be retained so they can be sent back in ``<free_data>`` messages later, indicating to the server that
+         the client no longer needs the shared memory.
+
+#. Once an *End of Stream* message is received, the client should process any remaining unprocessed
+   IPC metadata messages.
+
+#. After individual memory addresses / offsets are able to be freed by the remote server (in the case where
+   it has sent these rather than the full body bytes), the client should send corresponding ``<free_data>`` messages
+   to the server.
+
+   * A single ``<free_data>`` message consists of an arbitrary number of unsigned 64-bit integer values, representing
+     the addresses / offsets which can be freed. The reason for it being an *arbitrary number* is to allow a client
+     to choose whether to send multiple messages to free multiple addresses or to coalesce multiple addresses into
+     fewer messages to be freed (thus making the protocol less "chatty" if desired)
+
+Continuing Development
+======================
+
+If you decide to try this protocol in your own environments and systems, we'd love feedback and to learn about
+your use case. As this is an **experimental** protocol currently, we need real-world usage in order to facilitate
+improving it and finding the right generalizations to standardize on across transports.
+
+Please chime in using the Arrow Developers Mailing list: https://arrow.apache.org/community/#mailing-lists
+
+.. _Flatbuffers: http://github.com/google/flatbuffers
+.. _UCX: https://openucx.org/
+.. _libfabric: https://ofiwg.github.io/libfabric/
+.. _libcudf: https://docs.rapids.ai/api
diff --git a/docs/source/format/DissociatedIPC/ClientFlowchart.mmd b/docs/source/format/DissociatedIPC/ClientFlowchart.mmd
new file mode 100644
index 0000000000000..652cabc1c7425
--- /dev/null
+++ b/docs/source/format/DissociatedIPC/ClientFlowchart.mmd
@@ -0,0 +1,37 @@
+%% Licensed to the Apache Software Foundation (ASF) under one
+%% or more contributor license agreements. See the NOTICE file
+%% distributed with this work for additional information
+%% regarding copyright ownership. The ASF licenses this file
+%% to you under the Apache License, Version 2.0 (the
+%% "License"); you may not use this file except in compliance
+%% with the License. You may obtain a copy of the License at
+
+%% http://www.apache.org/licenses/LICENSE-2.0
+
+%% Unless required by applicable law or agreed to in writing,
+%% software distributed under the License is distributed on an
+%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+%% KIND, either express or implied. See the License for the
+%% specific language governing permissions and limitations
+%% under the License.
+
+graph LR
+client((Client))-->c1{{Send #60;want_data#gt; Msg}}
+subgraph meta [Meta Message]
+    direction LR
+    m1[/Msg Type #40;byte 0#41;<br/>Seq Num #40;bytes 1-5#41;/]-- type 1 -->m2[[Process IPC Header]]
+    m2-- IPC has body -->m3[Get Corresponding<br/>Tagged Msg]
+    m2-- Schema Msg -->m4[/Store Schema/]
+    m1-- type 0 -->e[Indicate End of Stream]
+end
+subgraph data [Data Stream]
+    direction LR
+    d1[Request Msg<br/>for Seq Num]-->d2{Most Significant<br/>Byte}
+    d2-- 0 -->d3[Construct from<br/>Metadata and Body]
+    d2-- 1 -->d4[Get shared/remote<br/>buffers]
+    d4 -->d5[Construct from<br/>
Metadata and buffers] + d3 & d5 -->e2[Output Batch] +end + +client -- recv untagged msg --> meta +client -- get tagged msg --> data diff --git a/docs/source/format/DissociatedIPC/SequenceDiagramSame.mmd b/docs/source/format/DissociatedIPC/SequenceDiagramSame.mmd new file mode 100644 index 0000000000000..adf26bdc32767 --- /dev/null +++ b/docs/source/format/DissociatedIPC/SequenceDiagramSame.mmd @@ -0,0 +1,43 @@ +%% Licensed to the Apache Software Foundation (ASF) under one +%% or more contributor license agreements. See the NOTICE file +%% distributed with this work for additional information +%% regarding copyright ownership. The ASF licenses this file +%% to you under the Apache License, Version 2.0 (the +%% "License"); you may not use this file except in compliance +%% with the License. You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. + +sequenceDiagram + participant C as Client + participant S as Server + activate C + C-->>+S: TaggedMessage(server.want_data, bytes=ID_of_desired_data) + S-->>C: Message(bytes([1]) + le_bytes(sequence_number) + schema_metadata) + par + loop each chunk + S-->>C: Message(bytes([1]) + le_bytes(sequence_number) + batch_metadata) + end + S-->>C: Message(bytes([0]) + le_bytes(sequence_number)) + and + loop each chunk + alt + S-->>C: TaggedMessage((bytes[0] << 55) | le_bytes(sequence_number),
bytes=batch_data)
+      else
+      S-->>C: TaggedMessage((bytes[1] << 55) | le_bytes(sequence_number),<br/>
bytes=uint64_pairs) + end + end + end + + loop + C-->>S: TaggedMessage(server.free_data, bytes=uint64_list) + end + deactivate S + deactivate C diff --git a/docs/source/format/DissociatedIPC/SequenceDiagramSeparate.mmd b/docs/source/format/DissociatedIPC/SequenceDiagramSeparate.mmd new file mode 100644 index 0000000000000..11d2d9d6387eb --- /dev/null +++ b/docs/source/format/DissociatedIPC/SequenceDiagramSeparate.mmd @@ -0,0 +1,44 @@ +%% Licensed to the Apache Software Foundation (ASF) under one +%% or more contributor license agreements. See the NOTICE file +%% distributed with this work for additional information +%% regarding copyright ownership. The ASF licenses this file +%% to you under the Apache License, Version 2.0 (the +%% "License"); you may not use this file except in compliance +%% with the License. You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. + +sequenceDiagram + participant D as Data Stream + participant C as Client + participant M as Metadata Stream + + activate C + C-->>+M: TaggedMessage(server.want_data, bytes=ID_of_desired_data) + C-->>+D: TaggedMessage(server.want_data, bytes=ID_of_desired_data) + M-->>C: Message(bytes([1]) + le_bytes(sequence_number) + schema_metadata) + loop each batch + par + M-->>C: Message(bytes([1]) + le_bytes(sequence_number) + batch_metadata) + and + alt + D-->>C: TaggedMessage((bytes[0] << 55) | le_bytes(sequence_number),
bytes=batch_data)
+      else
+      D-->>C: TaggedMessage((bytes[1] << 55) | le_bytes(sequence_number),<br/>
bytes=uint64_pairs) + end + end + end + M-->>C: Message(bytes([0]) + le_bytes(sequence_number)) + deactivate M + loop + C-->>D: TaggedMessage(server.free_data, bytes=uint64_list) + end + deactivate D + deactivate C diff --git a/docs/source/format/Flight.rst b/docs/source/format/Flight.rst index c65a1f70bde7f..2c5487d857ea4 100644 --- a/docs/source/format/Flight.rst +++ b/docs/source/format/Flight.rst @@ -310,6 +310,8 @@ well, in which case any `authentication method supported by gRPC .. _Mutual TLS (mTLS): https://grpc.io/docs/guides/auth/#supported-auth-mechanisms +.. _flight-location-uris: + Location URIs ============= diff --git a/docs/source/format/index.rst b/docs/source/format/index.rst index 856830d863243..44ea3e8e7e608 100644 --- a/docs/source/format/index.rst +++ b/docs/source/format/index.rst @@ -30,6 +30,7 @@ Specifications CDataInterface CStreamInterface CDeviceDataInterface + DissociatedIPC Flight FlightSql ADBC From 5255adc5139d6094a7b3b04f273f3ef11d49ec38 Mon Sep 17 00:00:00 2001 From: ZhangHuiGui <106943008+ZhangHuiGui@users.noreply.github.com> Date: Thu, 9 May 2024 21:26:16 -0400 Subject: [PATCH 102/106] GH-41529: [C++][Compute] Remove redundant logic for ArrayData as ExecResults in ExecScalarCaseWhen (#41380) ### Rationale for this change Remove useless path in `ExecScalarCaseWhen` ### What changes are included in this PR? Refactor : remove processing logic for ArrayData as ExecResults in ExecScalarCaseWhen. ### Are these changes tested? Yes, by exists. ### Are there any user-facing changes? No * GitHub Issue: #41529 Authored-by: ZhangHuiGui <2689496754@qq.com> Signed-off-by: Felipe Oliveira Carvalho --- .../arrow/compute/kernels/scalar_if_else.cc | 50 +++++++------------ 1 file changed, 19 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index 13874d9d65e70..6368ef525ff9c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1483,39 +1483,27 @@ Status ExecScalarCaseWhen(KernelContext* ctx, const ExecSpan& batch, ExecResult* result = temp.get(); } - // TODO(wesm): clean this up to have less duplication - if (out->is_array_data()) { - ArrayData* output = out->array_data().get(); - if (is_dictionary_type::value) { - const ExecValue& dict_from = has_result ? result : batch[1]; - if (dict_from.is_scalar()) { - output->dictionary = checked_cast(*dict_from.scalar) - .value.dictionary->data(); - } else { - output->dictionary = dict_from.array.ToArrayData()->dictionary; - } - } - CopyValues(result, /*in_offset=*/0, batch.length, - output->GetMutableValues(0, 0), - output->GetMutableValues(1, 0), output->offset); - } else { - // ArraySpan - ArraySpan* output = out->array_span_mutable(); - if (is_dictionary_type::value) { - const ExecValue& dict_from = has_result ? result : batch[1]; - output->child_data.resize(1); - if (dict_from.is_scalar()) { - output->child_data[0].SetMembers( - *checked_cast(*dict_from.scalar) - .value.dictionary->data()); - } else { - output->child_data[0] = dict_from.array; - } + // Only input types of non-fixed length (which cannot be pre-allocated) + // will save the output data in ArrayData. And make sure the FixedLength + // types must be output in ArraySpan. + static_assert(is_fixed_width(Type::type_id)); + DCHECK(out->is_array_span()); + + ArraySpan* output = out->array_span_mutable(); + if (is_dictionary_type::value) { + const ExecValue& dict_from = has_result ? 
result : batch[1]; + output->child_data.resize(1); + if (dict_from.is_scalar()) { + output->child_data[0].SetMembers( + *checked_cast(*dict_from.scalar) + .value.dictionary->data()); + } else { + output->child_data[0] = dict_from.array; } - CopyValues(result, /*in_offset=*/0, batch.length, - output->GetValues(0, 0), output->GetValues(1, 0), - output->offset); } + CopyValues(result, /*in_offset=*/0, batch.length, + output->GetValues(0, 0), output->GetValues(1, 0), + output->offset); return Status::OK(); } From f8d3b10b4b89b47f6e7a594b95c82e2ff161f1a5 Mon Sep 17 00:00:00 2001 From: Tai Le Manh <49281946+tlm365@users.noreply.github.com> Date: Fri, 10 May 2024 12:42:25 +0700 Subject: [PATCH 103/106] GH-41590: [Java] Improve BaseRepeatedValueVector function on isEmpty and isNull operations (#41601) ### Rationale for this change Resolves #41590 . ### What changes are included in this PR? Make `abstract` on `isNull` and `isEmpty` of `BaseRepeatedValueVector`. ### Are these changes tested? Existing tests pass. ### Are there any user-facing changes? No. * GitHub Issue: #41590 Authored-by: Tai Le Manh Signed-off-by: David Li --- .../vector/complex/BaseRepeatedValueVector.java | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java index 7906d90c2fff0..7c4015299a6cd 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java @@ -355,16 +355,8 @@ public int getInnerValueCountAt(int index) { offsetBuffer.getInt(index * OFFSET_WIDTH); } - /** Return if value at index is null (this implementation is always false). */ - @Override - public boolean isNull(int index) { - return false; - } - - /** Return if value at index is empty (this implementation is always false). */ - public boolean isEmpty(int index) { - return false; - } + /** Return if value at index is empty. */ + public abstract boolean isEmpty(int index); /** Starts a new repeated value. */ public int startNewValue(int index) { From 7aea8bf7a65d679bd71d973b358f997eb3b6c6af Mon Sep 17 00:00:00 2001 From: Hyunseok Seo Date: Fri, 10 May 2024 23:58:10 +0900 Subject: [PATCH 104/106] GH-41316: [CI][Python] Reduce CI time on macOS (#41378) ### Rationale for this change Reduce CI time for python build on macos-12 and macos-14 using ccache ### What changes are included in this PR? Add ccache for macos-12 and macos-14 ### Are these changes tested? Check the cache hit rate ### Are there any user-facing changes? 
No * GitHub Issue: #41316 Authored-by: Hyunseok Seo Signed-off-by: Sutou Kouhei --- .github/workflows/python.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 15056961f8cf4..a568f8346e7fc 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -182,6 +182,19 @@ jobs: python -m pip install \ -r python/requirements-build.txt \ -r python/requirements-test.txt + - name: Setup ccache + shell: bash + run: ci/scripts/ccache_setup.sh + - name: ccache info + id: ccache-info + shell: bash + run: echo "cache-dir=$(ccache --get-config cache_dir)" >> $GITHUB_OUTPUT + - name: Cache ccache + uses: actions/cache@v4 + with: + path: ${{ steps.ccache-info.outputs.cache-dir }} + key: python-ccache-macos-${{ matrix.macos-version }}-${{ hashFiles('cpp/**', 'python/**') }} + restore-keys: python-ccache-macos-${{ matrix.macos-version }}- - name: Build shell: bash run: | From 899422e16e3f1f71819f52fc627359d79f7d3662 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 10 May 2024 16:42:21 -0300 Subject: [PATCH 105/106] GH-39301: [Archery][CI][Integration] Add nanoarrow to archery + integration setup (#39302) ### Rationale for this change The ability to add integration testing was added in nanoarrow however, the infrastructure for running these tests currently lives in the arrow monorepo. ### What changes are included in this PR? - Added the relevant code to Archery such that these tests can be run - Added the relevant scripts/environment variables to CI such that these tests run in the integration CI job ### Are these changes tested? Yes, via the "Integration" CI job. ### Are there any user-facing changes? No. This PR still needs https://github.com/apache/arrow/pull/41264 for the integration tests to pass. 
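For local verification, an invocation along the lines of `archery integration --with-nanoarrow=1 --with-cpp=1` (with the `ARROW_NANOARROW_PATH` environment variable pointing at the built `libnanoarrow_c_data_integration` library) should exercise the new tester; the `--with-nanoarrow` flag and the `ARCHERY_INTEGRATION_WITH_NANOARROW=1` environment variable are the ones added in this PR, while the rest of the command line is illustrative.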
* Closes: #39301 * GitHub Issue: #39301 Lead-authored-by: Dewey Dunnington Co-authored-by: Dewey Dunnington Signed-off-by: Dewey Dunnington --- .github/workflows/integration.yml | 6 + ci/scripts/integration_arrow_build.sh | 2 + ci/scripts/nanoarrow_build.sh | 52 ++++++ dev/archery/archery/cli.py | 5 +- dev/archery/archery/integration/datagen.py | 3 + dev/archery/archery/integration/runner.py | 8 +- .../archery/integration/tester_nanoarrow.py | 148 ++++++++++++++++++ docker-compose.yml | 2 + 8 files changed, 223 insertions(+), 3 deletions(-) create mode 100755 ci/scripts/nanoarrow_build.sh create mode 100644 dev/archery/archery/integration/tester_nanoarrow.py diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 6e09ad61480a6..f53f4aeb505d2 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -75,6 +75,11 @@ jobs: with: repository: apache/arrow-rs path: rust + - name: Checkout Arrow nanoarrow + uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + with: + repository: apache/arrow-nanoarrow + path: nanoarrow - name: Free up disk space run: | ci/scripts/util_free_space.sh @@ -97,6 +102,7 @@ jobs: run: > archery docker run \ -e ARCHERY_DEFAULT_BRANCH=${{ github.event.repository.default_branch }} \ + -e ARCHERY_INTEGRATION_WITH_NANOARROW=1 \ -e ARCHERY_INTEGRATION_WITH_RUST=1 \ conda-integration - name: Docker Push diff --git a/ci/scripts/integration_arrow_build.sh b/ci/scripts/integration_arrow_build.sh index e5c31527aedff..9b54049a2b803 100755 --- a/ci/scripts/integration_arrow_build.sh +++ b/ci/scripts/integration_arrow_build.sh @@ -30,6 +30,8 @@ build_dir=${2} ${arrow_dir}/ci/scripts/rust_build.sh ${arrow_dir} ${build_dir} +${arrow_dir}/ci/scripts/nanoarrow_build.sh ${arrow_dir} ${build_dir} + if [ "${ARROW_INTEGRATION_CPP}" == "ON" ]; then ${arrow_dir}/ci/scripts/cpp_build.sh ${arrow_dir} ${build_dir} fi diff --git a/ci/scripts/nanoarrow_build.sh b/ci/scripts/nanoarrow_build.sh new file mode 100755 index 0000000000000..1612b9a2d0102 --- /dev/null +++ b/ci/scripts/nanoarrow_build.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +arrow_dir=${1} +source_dir=${1}/nanoarrow +build_dir=${2}/nanoarrow + +# This file is used to build the nanoarrow binaries needed for the archery +# integration tests. Testing of the nanoarrow implementation in normal CI is handled +# by github workflows in the arrow-nanoarrow repository. + +if [ "${ARCHERY_INTEGRATION_WITH_NANOARROW}" -eq "0" ]; then + echo "=====================================================================" + echo "Not building nanoarrow" + echo "=====================================================================" + exit 0; +elif [ ! 
-d "${source_dir}" ]; then + echo "=====================================================================" + echo "The nanoarrow source is missing. Please clone the arrow-nanoarrow repository" + echo "to arrow/nanoarrow before running the integration tests:" + echo " git clone https://github.com/apache/arrow-nanoarrow.git path/to/arrow/nanoarrow" + echo "=====================================================================" + exit 1; +fi + +set -x + +mkdir -p ${build_dir} +pushd ${build_dir} + +cmake ${source_dir} -DNANOARROW_BUILD_INTEGRATION_TESTS=ON +cmake --build . + +popd diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 8a26d9266f22d..cd746f9c4499a 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -738,6 +738,9 @@ def _set_default(opt, default): help='Include JavaScript in integration tests') @click.option('--with-go', type=bool, default=False, help='Include Go in integration tests') +@click.option('--with-nanoarrow', type=bool, default=False, + help='Include nanoarrow in integration tests', + envvar="ARCHERY_INTEGRATION_WITH_NANOARROW") @click.option('--with-rust', type=bool, default=False, help='Include Rust in integration tests', envvar="ARCHERY_INTEGRATION_WITH_RUST") @@ -776,7 +779,7 @@ def integration(with_all=False, random_seed=12345, **args): gen_path = args['write_generated_json'] - languages = ['cpp', 'csharp', 'java', 'js', 'go', 'rust'] + languages = ['cpp', 'csharp', 'java', 'js', 'go', 'nanoarrow', 'rust'] formats = ['ipc', 'flight', 'c_data'] enabled_languages = 0 diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 5cae907a4aa71..f6302165cd5a0 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1928,17 +1928,20 @@ def _temp_path(): .skip_tester('C#') .skip_tester('Java') .skip_tester('JS') + .skip_tester('nanoarrow') .skip_tester('Rust'), generate_binary_view_case() .skip_tester('Java') .skip_tester('JS') + .skip_tester('nanoarrow') .skip_tester('Rust'), generate_list_view_case() .skip_tester('C#') # Doesn't support large list views .skip_tester('Java') .skip_tester('JS') + .skip_tester('nanoarrow') .skip_tester('Rust'), generate_extension_case() diff --git a/dev/archery/archery/integration/runner.py b/dev/archery/archery/integration/runner.py index 5b66842b25926..0ea244720cc1d 100644 --- a/dev/archery/archery/integration/runner.py +++ b/dev/archery/archery/integration/runner.py @@ -36,6 +36,7 @@ from .tester_java import JavaTester from .tester_js import JSTester from .tester_csharp import CSharpTester +from .tester_nanoarrow import NanoarrowTester from .util import guid, printer from .util import SKIP_C_ARRAY, SKIP_C_SCHEMA, SKIP_FLIGHT, SKIP_IPC from ..utils.source import ARROW_ROOT_DEFAULT @@ -541,8 +542,8 @@ def get_static_json_files(): def run_all_tests(with_cpp=True, with_java=True, with_js=True, with_csharp=True, with_go=True, with_rust=False, - run_ipc=False, run_flight=False, run_c_data=False, - tempdir=None, **kwargs): + with_nanoarrow=False, run_ipc=False, run_flight=False, + run_c_data=False, tempdir=None, **kwargs): tempdir = tempdir or tempfile.mkdtemp(prefix='arrow-integration-') testers: List[Tester] = [] @@ -562,6 +563,9 @@ def run_all_tests(with_cpp=True, with_java=True, with_js=True, if with_go: testers.append(GoTester(**kwargs)) + if with_nanoarrow: + testers.append(NanoarrowTester(**kwargs)) + if with_rust: testers.append(RustTester(**kwargs)) diff --git 
a/dev/archery/archery/integration/tester_nanoarrow.py b/dev/archery/archery/integration/tester_nanoarrow.py new file mode 100644 index 0000000000000..30ff1bb6e50a7 --- /dev/null +++ b/dev/archery/archery/integration/tester_nanoarrow.py @@ -0,0 +1,148 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import functools +import os + +from . import cdata +from .tester import Tester, CDataExporter, CDataImporter +from ..utils.source import ARROW_ROOT_DEFAULT + + +_NANOARROW_PATH = os.environ.get( + "ARROW_NANOARROW_PATH", + os.path.join(ARROW_ROOT_DEFAULT, "nanoarrow/cdata"), +) + +_INTEGRATION_DLL = os.path.join( + _NANOARROW_PATH, "libnanoarrow_c_data_integration" + cdata.dll_suffix +) + + +class NanoarrowTester(Tester): + PRODUCER = False + CONSUMER = False + FLIGHT_SERVER = False + FLIGHT_CLIENT = False + C_DATA_SCHEMA_EXPORTER = True + C_DATA_ARRAY_EXPORTER = True + C_DATA_SCHEMA_IMPORTER = True + C_DATA_ARRAY_IMPORTER = True + + name = "nanoarrow" + + def validate(self, json_path, arrow_path, quirks=None): + raise NotImplementedError() + + def json_to_file(self, json_path, arrow_path): + raise NotImplementedError() + + def stream_to_file(self, stream_path, file_path): + raise NotImplementedError() + + def file_to_stream(self, file_path, stream_path): + raise NotImplementedError() + + def make_c_data_exporter(self): + return NanoarrowCDataExporter(self.debug, self.args) + + def make_c_data_importer(self): + return NanoarrowCDataImporter(self.debug, self.args) + + +_nanoarrow_c_data_entrypoints = """ + const char* nanoarrow_CDataIntegration_ExportSchemaFromJson( + const char* json_path, struct ArrowSchema* out); + + const char* nanoarrow_CDataIntegration_ImportSchemaAndCompareToJson( + const char* json_path, struct ArrowSchema* schema); + + const char* nanoarrow_CDataIntegration_ExportBatchFromJson( + const char* json_path, int num_batch, struct ArrowArray* out); + + const char* nanoarrow_CDataIntegration_ImportBatchAndCompareToJson( + const char* json_path, int num_batch, struct ArrowArray* batch); + + int64_t nanoarrow_BytesAllocated(void); + """ + + +@functools.lru_cache +def _load_ffi(ffi, lib_path=_INTEGRATION_DLL): + ffi.cdef(_nanoarrow_c_data_entrypoints) + dll = ffi.dlopen(lib_path) + return dll + + +class _CDataBase: + def __init__(self, debug, args): + self.debug = debug + self.args = args + self.ffi = cdata.ffi() + self.dll = _load_ffi(self.ffi) + + def _check_nanoarrow_error(self, na_error): + """ + Check a `const char*` error return from an integration entrypoint. + + A null means success, a non-empty string is an error message. + The string is statically allocated on the nanoarrow side and does not + need to be released. 
+        """
+        assert self.ffi.typeof(na_error) is self.ffi.typeof("const char*")
+        if na_error != self.ffi.NULL:
+            error = self.ffi.string(na_error).decode("utf8", errors="replace")
+            raise RuntimeError(f"nanoarrow C Data Integration call failed: {error}")
+
+
+class NanoarrowCDataExporter(CDataExporter, _CDataBase):
+    def export_schema_from_json(self, json_path, c_schema_ptr):
+        na_error = self.dll.nanoarrow_CDataIntegration_ExportSchemaFromJson(
+            str(json_path).encode(), c_schema_ptr
+        )
+        self._check_nanoarrow_error(na_error)
+
+    def export_batch_from_json(self, json_path, num_batch, c_array_ptr):
+        na_error = self.dll.nanoarrow_CDataIntegration_ExportBatchFromJson(
+            str(json_path).encode(), num_batch, c_array_ptr
+        )
+        self._check_nanoarrow_error(na_error)
+
+    @property
+    def supports_releasing_memory(self):
+        return True
+
+    def record_allocation_state(self):
+        return self.dll.nanoarrow_BytesAllocated()
+
+
+class NanoarrowCDataImporter(CDataImporter, _CDataBase):
+    def import_schema_and_compare_to_json(self, json_path, c_schema_ptr):
+        na_error = self.dll.nanoarrow_CDataIntegration_ImportSchemaAndCompareToJson(
+            str(json_path).encode(), c_schema_ptr
+        )
+        self._check_nanoarrow_error(na_error)
+
+    def import_batch_and_compare_to_json(self, json_path, num_batch, c_array_ptr):
+        na_error = self.dll.nanoarrow_CDataIntegration_ImportBatchAndCompareToJson(
+            str(json_path).encode(), num_batch, c_array_ptr
+        )
+        self._check_nanoarrow_error(na_error)
+
+    @property
+    def supports_releasing_memory(self):
+        return True
diff --git a/docker-compose.yml b/docker-compose.yml
index 9bedb59a77be8..7a4d455dfe723 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1751,9 +1751,11 @@ services:
     volumes: *conda-volumes
     environment:
       <<: [*common, *ccache]
+      ARCHERY_INTEGRATION_WITH_NANOARROW: 0
       ARCHERY_INTEGRATION_WITH_RUST: 0
       # Tell Archery where Arrow binaries are located
       ARROW_CPP_EXE_PATH: /build/cpp/debug
+      ARROW_NANOARROW_PATH: /build/nanoarrow
       ARROW_RUST_EXE_PATH: /build/rust/debug
     command:
       ["/arrow/ci/scripts/integration_arrow_build.sh /arrow /build &&

From 1e3772cac5f45edb6ada3d20140b77cc86208346 Mon Sep 17 00:00:00 2001
From: Gang Wu
Date: Sat, 11 May 2024 12:41:57 +0800
Subject: [PATCH 106/106] GH-41343: [C++][CMake] Remove unused ARROW_NO_DEPRECATED_API (#41345)

### Rationale for this change

ARROW_NO_DEPRECATED_API is not used in the source code.

### What changes are included in this PR?

Remove the unused ARROW_NO_DEPRECATED_API CMake variable.

### Are these changes tested?

Pass CIs.

### Are there any user-facing changes?

Perhaps yes: users who used to set ARROW_NO_DEPRECATED_API when building Arrow will see a warning about an unknown CMake variable.
* GitHub Issue: #41343 Authored-by: Gang Wu Signed-off-by: Gang Wu --- ci/docker/debian-12-cpp.dockerfile | 1 - ci/docker/linux-apt-r.dockerfile | 1 - ci/docker/ubuntu-20.04-cpp-minimal.dockerfile | 1 - ci/docker/ubuntu-20.04-cpp.dockerfile | 1 - ci/docker/ubuntu-22.04-cpp-minimal.dockerfile | 1 - ci/docker/ubuntu-22.04-cpp.dockerfile | 1 - ci/docker/ubuntu-24.04-cpp.dockerfile | 1 - ci/scripts/c_glib_build.sh | 3 --- ci/scripts/cpp_build.sh | 1 - cpp/CMakeLists.txt | 4 ---- cpp/cmake_modules/DefineOptions.cmake | 2 -- dev/conbench_envs/benchmarks.env | 1 - docs/source/developers/cpp/building.rst | 7 ++++--- 13 files changed, 4 insertions(+), 21 deletions(-) diff --git a/ci/docker/debian-12-cpp.dockerfile b/ci/docker/debian-12-cpp.dockerfile index 7036ddf27d52a..d7a6f9df2c2ee 100644 --- a/ci/docker/debian-12-cpp.dockerfile +++ b/ci/docker/debian-12-cpp.dockerfile @@ -119,7 +119,6 @@ ENV ARROW_ACERO=ON \ ARROW_GANDIVA=ON \ ARROW_GCS=ON \ ARROW_HOME=/usr/local \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/linux-apt-r.dockerfile b/ci/docker/linux-apt-r.dockerfile index a68354e3abf8d..630b96e1007b9 100644 --- a/ci/docker/linux-apt-r.dockerfile +++ b/ci/docker/linux-apt-r.dockerfile @@ -113,7 +113,6 @@ ENV \ ARROW_GANDIVA=OFF \ ARROW_HDFS=OFF \ ARROW_JSON=ON \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=OFF \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile index ae2ba9421cd55..e17c0306f115d 100644 --- a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile @@ -85,7 +85,6 @@ ENV ARROW_ACERO=ON \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index 124256378b287..d78c7a99cf4d6 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -158,7 +158,6 @@ ENV absl_SOURCE=BUNDLED \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile index dd887a6d00ceb..341d8a87e8661 100644 --- a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile @@ -85,7 +85,6 @@ ENV ARROW_ACERO=ON \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile index eb189841cd344..f12e7456add8e 100644 --- a/ci/docker/ubuntu-22.04-cpp.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp.dockerfile @@ -196,7 +196,6 @@ ENV absl_SOURCE=BUNDLED \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-24.04-cpp.dockerfile b/ci/docker/ubuntu-24.04-cpp.dockerfile index 4a37818f94396..ecfb5e2f5096d 100644 --- a/ci/docker/ubuntu-24.04-cpp.dockerfile +++ b/ci/docker/ubuntu-24.04-cpp.dockerfile @@ -178,7 +178,6 @@ ENV ARROW_ACERO=ON \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git 
a/ci/scripts/c_glib_build.sh b/ci/scripts/c_glib_build.sh
index c4d2c4fdb5617..6a6295e4ff0bd 100755
--- a/ci/scripts/c_glib_build.sh
+++ b/ci/scripts/c_glib_build.sh
@@ -30,9 +30,6 @@ with_doc=$([ "${BUILD_DOCS_C_GLIB}" == "ON" ] && echo "true" || echo "false")

 export PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig

-export CFLAGS="-DARROW_NO_DEPRECATED_API"
-export CXXFLAGS="-DARROW_NO_DEPRECATED_API"
-
 mkdir -p ${build_dir}

 # Build with Meson
diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh
index ceeab2455bef6..a1f40fc360e2f 100755
--- a/ci/scripts/cpp_build.sh
+++ b/ci/scripts/cpp_build.sh
@@ -152,7 +152,6 @@ else
        -DARROW_JSON=${ARROW_JSON:-ON} \
        -DARROW_LARGE_MEMORY_TESTS=${ARROW_LARGE_MEMORY_TESTS:-OFF} \
        -DARROW_MIMALLOC=${ARROW_MIMALLOC:-OFF} \
-       -DARROW_NO_DEPRECATED_API=${ARROW_NO_DEPRECATED_API:-OFF} \
        -DARROW_ORC=${ARROW_ORC:-OFF} \
        -DARROW_PARQUET=${ARROW_PARQUET:-OFF} \
        -DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index df83f56dd2f70..679842c31e0b1 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -431,10 +431,6 @@ endif()
 # Compiler flags
 #

-if(ARROW_NO_DEPRECATED_API)
-  add_definitions(-DARROW_NO_DEPRECATED_API)
-endif()
-
 if(ARROW_EXTRA_ERROR_CONTEXT)
   add_definitions(-DARROW_EXTRA_ERROR_CONTEXT)
 endif()
diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake
index dc0e5da63adb7..41466a1c22404 100644
--- a/cpp/cmake_modules/DefineOptions.cmake
+++ b/cpp/cmake_modules/DefineOptions.cmake
@@ -158,8 +158,6 @@ if(ARROW_DEFINE_OPTIONS)
   define_option_string(ARROW_GIT_DESCRIPTION "The Arrow git commit description (if any)" "")

-  define_option(ARROW_NO_DEPRECATED_API "Exclude deprecated APIs from build" OFF)
-
   define_option(ARROW_POSITION_INDEPENDENT_CODE
                 "Whether to create position-independent target" ON)

diff --git a/dev/conbench_envs/benchmarks.env b/dev/conbench_envs/benchmarks.env
index 2a5a9c32a86ec..3af29491a8345 100644
--- a/dev/conbench_envs/benchmarks.env
+++ b/dev/conbench_envs/benchmarks.env
@@ -31,7 +31,6 @@ ARROW_HOME=$CONDA_PREFIX
 ARROW_INSTALL_NAME_RPATH=ON
 ARROW_JEMALLOC=OFF
 ARROW_MIMALLOC=ON
-ARROW_NO_DEPRECATED_API=ON
 ARROW_ORC=ON
 ARROW_PARQUET=ON
 ARROW_PYTHON=ON
diff --git a/docs/source/developers/cpp/building.rst b/docs/source/developers/cpp/building.rst
index 040a046c5153d..7b80d2138c33e 100644
--- a/docs/source/developers/cpp/building.rst
+++ b/docs/source/developers/cpp/building.rst
@@ -627,9 +627,10 @@ outputs like:
 Deprecations and API Changes
 ----------------------------

-We use the compiler definition ``ARROW_NO_DEPRECATED_API`` to disable APIs that
-have been deprecated. It is a good practice to compile third party applications
-with this flag to proactively catch and account for API changes.
+We use the macro ``ARROW_DEPRECATED``, which wraps the C++ ``[[deprecated]]``
+attribute, for APIs that have been deprecated. It is a good practice to compile
+third party applications with ``-Werror=deprecated-declarations`` (for GCC/Clang,
+or the equivalent flags of other compilers) to proactively catch and account for API changes.

 Modular Build Targets
 ---------------------
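
Note: the following is a minimal, self-contained sketch of the workflow the updated documentation recommends, assuming a GCC/Clang toolchain. The ``ARROW_DEPRECATED`` definition below is a stand-in that mirrors the documented pattern (Arrow ships its own definition in its headers), and ``OldApi``/``NewApi`` are hypothetical names, not Arrow APIs:

    // deprecation_demo.cc -- illustrative only; Arrow provides its own
    // ARROW_DEPRECATED macro, this stand-in just mirrors the pattern.
    #ifndef ARROW_DEPRECATED
    #define ARROW_DEPRECATED(...) [[deprecated(__VA_ARGS__)]]
    #endif

    ARROW_DEPRECATED("Use NewApi() instead.")
    inline int OldApi() { return 0; }

    inline int NewApi() { return 1; }

    int main() {
      // Built with `g++ -std=c++17 -Werror=deprecated-declarations
      // deprecation_demo.cc`, this call becomes a hard build error rather
      // than a warning, surfacing the deprecation before the API is removed.
      return OldApi();
    }

Building the same file without ``-Werror=deprecated-declarations`` only emits a warning, which is easy to miss in CI logs; promoting it to an error is what makes the check proactive.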
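Returning to the nanoarrow C Data tester added by PATCH 105 above: its ``_check_nanoarrow_error`` helper encodes a simple C-ABI convention in which every integration entrypoint returns a ``const char*`` that is null on success and points to a statically allocated message on failure. Below is a self-contained C++ sketch of that convention; ``nanoarrow_Demo_DoWork`` is a hypothetical name used for illustration, the real entrypoints live in the arrow-nanoarrow repository:

    #include <cstdio>

    // Hypothetical entrypoint following the tester's convention: return
    // nullptr on success, or a pointer to a string with static storage
    // duration that the caller reads but never frees.
    extern "C" const char* nanoarrow_Demo_DoWork(int should_fail) {
      if (should_fail) {
        return "Demo_DoWork failed";  // string literal: static storage
      }
      return nullptr;  // success
    }

    int main() {
      for (int should_fail = 0; should_fail < 2; ++should_fail) {
        const char* err = nanoarrow_Demo_DoWork(should_fail);
        if (err != nullptr) {
          // The Python tester raises RuntimeError at this point; a C++
          // caller can simply report the message, with no cleanup needed.
          std::fprintf(stderr, "call failed: %s\n", err);
        } else {
          std::puts("call succeeded");
        }
      }
      return 0;
    }

This convention keeps the ABI surface small (no error structs to allocate or release across the boundary), which is why the cffi-based tester only needs a null check and a string copy.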