diff --git a/.asf.yaml b/.asf.yaml index 760a830ef98c7..1eb019fea9af1 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -26,6 +26,8 @@ github: - jbonofre - js8544 - vibhatha + - zanmato1984 + - ZhangHuiGui notifications: commits: commits@arrow.apache.org diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index badcf4f2f26ac..fb778be113029 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -90,7 +90,8 @@ add_arrow_test(internals_test light_array_test.cc registry_test.cc key_hash_test.cc - row/compare_test.cc) + row/compare_test.cc + row/grouper_test.cc) add_arrow_compute_test(expression_test SOURCES expression_test.cc) diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc index 078a8287c71c0..98aea9011266c 100644 --- a/cpp/src/arrow/compute/row/compare_internal.cc +++ b/cpp/src/arrow/compute/row/compare_internal.cc @@ -36,22 +36,22 @@ void KeyCompare::NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_com const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows, - uint8_t* match_bytevector, - bool are_cols_in_encoding_order) { + bool are_cols_in_encoding_order, + uint8_t* match_bytevector) { if (!rows.has_any_nulls(ctx) && !col.data(0)) { return; } uint32_t num_processed = 0; #if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { - num_processed = NullUpdateColumnToRow_avx2(use_selection, id_col, num_rows_to_compare, - sel_left_maybe_null, left_to_right_map, - ctx, col, rows, match_bytevector); + num_processed = NullUpdateColumnToRow_avx2( + use_selection, id_col, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, match_bytevector); } #endif - uint32_t null_bit_id = - are_cols_in_encoding_order ? id_col : rows.metadata().pos_after_encoding(id_col); + const uint32_t null_bit_id = + ColIdInEncodingOrder(rows, id_col, are_cols_in_encoding_order); if (!col.data(0)) { // Remove rows from the result for which the column value is a null @@ -363,10 +363,9 @@ void KeyCompare::CompareColumnsToRows( continue; } - uint32_t offset_within_row = rows.metadata().encoded_field_offset( - are_cols_in_encoding_order - ? static_cast(icol) - : rows.metadata().pos_after_encoding(static_cast(icol))); + uint32_t offset_within_row = + rows.metadata().encoded_field_offset(ColIdInEncodingOrder( + rows, static_cast(icol), are_cols_in_encoding_order)); if (col.metadata().is_fixed_length) { if (sel_left_maybe_null) { CompareBinaryColumnToRow( @@ -375,9 +374,8 @@ void KeyCompare::CompareColumnsToRows( is_first_column ? match_bytevector_A : match_bytevector_B); NullUpdateColumnToRow( static_cast(icol), num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, - is_first_column ? match_bytevector_A : match_bytevector_B, - are_cols_in_encoding_order); + left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, + is_first_column ? match_bytevector_A : match_bytevector_B); } else { // Version without using selection vector CompareBinaryColumnToRow( @@ -386,9 +384,8 @@ void KeyCompare::CompareColumnsToRows( is_first_column ? match_bytevector_A : match_bytevector_B); NullUpdateColumnToRow( static_cast(icol), num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, - is_first_column ? match_bytevector_A : match_bytevector_B, - are_cols_in_encoding_order); + left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, + is_first_column ? match_bytevector_A : match_bytevector_B); } if (!is_first_column) { AndByteVectors(ctx, num_rows_to_compare, match_bytevector_A, match_bytevector_B); @@ -414,9 +411,8 @@ void KeyCompare::CompareColumnsToRows( } NullUpdateColumnToRow( static_cast(icol), num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, - is_first_column ? match_bytevector_A : match_bytevector_B, - are_cols_in_encoding_order); + left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, + is_first_column ? match_bytevector_A : match_bytevector_B); } else { if (ivarbinary == 0) { CompareVarBinaryColumnToRow( @@ -429,9 +425,8 @@ void KeyCompare::CompareColumnsToRows( } NullUpdateColumnToRow( static_cast(icol), num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, - is_first_column ? match_bytevector_A : match_bytevector_B, - are_cols_in_encoding_order); + left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, + is_first_column ? match_bytevector_A : match_bytevector_B); } if (!is_first_column) { AndByteVectors(ctx, num_rows_to_compare, match_bytevector_A, match_bytevector_B); diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h index b039ca97ff978..16002ee5184e9 100644 --- a/cpp/src/arrow/compute/row/compare_internal.h +++ b/cpp/src/arrow/compute/row/compare_internal.h @@ -43,13 +43,19 @@ class ARROW_EXPORT KeyCompare { uint8_t* out_match_bitvector_maybe_null = NULLPTR); private: + static uint32_t ColIdInEncodingOrder(const RowTableImpl& rows, uint32_t id_col, + bool are_cols_in_encoding_order) { + return are_cols_in_encoding_order ? id_col + : rows.metadata().pos_after_encoding(id_col); + } + template static void NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows, - uint8_t* match_bytevector, - bool are_cols_in_encoding_order); + bool are_cols_in_encoding_order, + uint8_t* match_bytevector); template static void CompareBinaryColumnToRowHelper( @@ -92,7 +98,8 @@ class ARROW_EXPORT KeyCompare { static uint32_t NullUpdateColumnToRowImp_avx2( uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, - const RowTableImpl& rows, uint8_t* match_bytevector); + const RowTableImpl& rows, bool are_cols_in_encoding_order, + uint8_t* match_bytevector); template static uint32_t CompareBinaryColumnToRowHelper_avx2( @@ -118,13 +125,11 @@ class ARROW_EXPORT KeyCompare { static uint32_t AndByteVectors_avx2(uint32_t num_elements, uint8_t* bytevector_A, const uint8_t* bytevector_B); - static uint32_t NullUpdateColumnToRow_avx2(bool use_selection, uint32_t id_col, - uint32_t num_rows_to_compare, - const uint16_t* sel_left_maybe_null, - const uint32_t* left_to_right_map, - LightContext* ctx, const KeyColumnArray& col, - const RowTableImpl& rows, - uint8_t* match_bytevector); + static uint32_t NullUpdateColumnToRow_avx2( + bool use_selection, uint32_t id_col, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows, + bool are_cols_in_encoding_order, uint8_t* match_bytevector); static uint32_t CompareBinaryColumnToRow_avx2( bool use_selection, uint32_t offset_within_row, uint32_t num_rows_to_compare, diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index ff407c51b83cb..18f656a2e458d 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -39,12 +39,14 @@ template uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, - const RowTableImpl& rows, uint8_t* match_bytevector) { + const RowTableImpl& rows, bool are_cols_in_encoding_order, + uint8_t* match_bytevector) { if (!rows.has_any_nulls(ctx) && !col.data(0)) { return num_rows_to_compare; } - uint32_t null_bit_id = rows.metadata().pos_after_encoding(id_col); + const uint32_t null_bit_id = + ColIdInEncodingOrder(rows, id_col, are_cols_in_encoding_order); if (!col.data(0)) { // Remove rows from the result for which the column value is a null @@ -569,7 +571,7 @@ uint32_t KeyCompare::NullUpdateColumnToRow_avx2( bool use_selection, uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows, - uint8_t* match_bytevector) { + bool are_cols_in_encoding_order, uint8_t* match_bytevector) { int64_t num_rows_safe = TailSkipForSIMD::FixBitAccess(sizeof(uint32_t), col.length(), col.bit_offset(0)); if (sel_left_maybe_null) { @@ -580,13 +582,13 @@ uint32_t KeyCompare::NullUpdateColumnToRow_avx2( } if (use_selection) { - return NullUpdateColumnToRowImp_avx2(id_col, num_rows_to_compare, - sel_left_maybe_null, left_to_right_map, - ctx, col, rows, match_bytevector); + return NullUpdateColumnToRowImp_avx2( + id_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, + rows, are_cols_in_encoding_order, match_bytevector); } else { - return NullUpdateColumnToRowImp_avx2(id_col, num_rows_to_compare, - sel_left_maybe_null, left_to_right_map, - ctx, col, rows, match_bytevector); + return NullUpdateColumnToRowImp_avx2( + id_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, + rows, are_cols_in_encoding_order, match_bytevector); } } diff --git a/cpp/src/arrow/compute/row/grouper_test.cc b/cpp/src/arrow/compute/row/grouper_test.cc new file mode 100644 index 0000000000000..1e853be5e4af7 --- /dev/null +++ b/cpp/src/arrow/compute/row/grouper_test.cc @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/compute/exec.h" +#include "arrow/compute/row/grouper.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" + +namespace arrow { +namespace compute { + +// Specialized case for GH-40997 +TEST(Grouper, ResortedColumnsWithLargeNullRows) { + const uint64_t num_rows = 1024; + + // construct random array with plenty of null values + const int32_t kSeed = 42; + const int32_t min = 0; + const int32_t max = 100; + const double null_probability = 0.3; + const double true_probability = 0.5; + auto rng = random::RandomArrayGenerator(kSeed); + auto b_arr = rng.Boolean(num_rows, true_probability, null_probability); + auto i32_arr = rng.Int32(num_rows, min, max, null_probability); + auto i64_arr = rng.Int64(num_rows, min, max * 10, null_probability); + + // construct batches with columns which will be resorted in the grouper make + std::vector exec_batches = {ExecBatch({i64_arr, i32_arr, b_arr}, num_rows), + ExecBatch({i32_arr, i64_arr, b_arr}, num_rows), + ExecBatch({i64_arr, b_arr, i32_arr}, num_rows), + ExecBatch({i32_arr, b_arr, i64_arr}, num_rows), + ExecBatch({b_arr, i32_arr, i64_arr}, num_rows), + ExecBatch({b_arr, i64_arr, i32_arr}, num_rows)}; + + const int num_batches = static_cast(exec_batches.size()); + std::vector group_num_vec; + group_num_vec.reserve(num_batches); + + for (const auto& exec_batch : exec_batches) { + ExecSpan span(exec_batch); + ASSERT_OK_AND_ASSIGN(auto grouper, Grouper::Make(span.GetTypes())); + ASSERT_OK_AND_ASSIGN(Datum group_ids, grouper->Consume(span)); + group_num_vec.emplace_back(grouper->num_groups()); + } + + for (int i = 1; i < num_batches; i++) { + ASSERT_EQ(group_num_vec[i - 1], group_num_vec[i]); + } +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/row/row_internal.cc b/cpp/src/arrow/compute/row/row_internal.cc index f6a62c09fcf24..469205e9b008d 100644 --- a/cpp/src/arrow/compute/row/row_internal.cc +++ b/cpp/src/arrow/compute/row/row_internal.cc @@ -66,7 +66,8 @@ void RowTableMetadata::FromColumnMetadataVector( // // Columns are sorted based on the size in bytes of their fixed-length part. // For the varying-length column, the fixed-length part is the 32-bit field storing - // cumulative length of varying-length fields. + // cumulative length of varying-length fields. This is to make the memory access of + // each individual column within the encoded row alignment-friendly. // // The rules are: // diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 52a235c688eda..126b0fcb6f76a 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -409,7 +409,7 @@ tasks: arrow_jemalloc: "ON" python_version: "{{ python_version }}" macos_deployment_target: "{{ macos_version }}" - runs_on: "macos-latest" + runs_on: "macos-13" vcpkg_arch: "amd64" artifacts: - pyarrow-{no_rc_version}-{{ python_tag }}-{{ abi_tag }}-{{ platform_tag }}.whl diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 6a11b19ffcdf5..946c82b258241 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3984,7 +3984,7 @@ cdef class RunEndEncodedArray(Array): ------- RunEndEncodedArray """ - logical_length = run_ends[-1] if len(run_ends) > 0 else 0 + logical_length = scalar(run_ends[-1]).as_py() if len(run_ends) > 0 else 0 return RunEndEncodedArray._from_arrays(type, True, logical_length, run_ends, values, 0) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 6a190957879d3..3754daeb9b4bd 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -3578,12 +3578,23 @@ def check_run_end_encoded_from_arrays_with_type(ree_type=None): check_run_end_encoded(ree_array, run_ends, values, 19, 4, 0) +def check_run_end_encoded_from_typed_arrays(ree_type): + run_ends = [3, 5, 10, 19] + values = [1, 2, 1, 3] + typed_run_ends = pa.array(run_ends, ree_type.run_end_type) + typed_values = pa.array(values, ree_type.value_type) + ree_array = pa.RunEndEncodedArray.from_arrays(typed_run_ends, typed_values) + assert ree_array.type == ree_type + check_run_end_encoded(ree_array, run_ends, values, 19, 4, 0) + + def test_run_end_encoded_from_arrays(): check_run_end_encoded_from_arrays_with_type() for run_end_type in [pa.int16(), pa.int32(), pa.int64()]: for value_type in [pa.uint32(), pa.int32(), pa.uint64(), pa.int64()]: ree_type = pa.run_end_encoded(run_end_type, value_type) check_run_end_encoded_from_arrays_with_type(ree_type) + check_run_end_encoded_from_typed_arrays(ree_type) def test_run_end_encoded_from_buffers(): diff --git a/r/R/dplyr-arrange.R b/r/R/dplyr-arrange.R index f91cd14211e0f..c8594c77df000 100644 --- a/r/R/dplyr-arrange.R +++ b/r/R/dplyr-arrange.R @@ -47,6 +47,14 @@ arrange.arrow_dplyr_query <- function(.data, ..., .by_group = FALSE) { msg <- paste("Expression", names(sorts)[i], "not supported in Arrow") return(abandon_ship(call, .data, msg)) } + if (length(mask$.aggregations)) { + # dplyr lets you arrange on e.g. x < mean(x), but we haven't implemented it. + # But we could, the same way it works in mutate() via join, if someone asks. + # Until then, just error. + # TODO: add a test for this + msg <- paste("Expression", format_expr(expr), "not supported in arrange() in Arrow") + return(abandon_ship(call, .data, msg)) + } descs[i] <- x[["desc"]] } .data$arrange_vars <- c(sorts, .data$arrange_vars) diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R index ff1619ce944d0..211c26cecce8c 100644 --- a/r/R/dplyr-eval.R +++ b/r/R/dplyr-eval.R @@ -121,24 +121,9 @@ arrow_not_supported <- function(msg) { } # Create a data mask for evaluating a dplyr expression -arrow_mask <- function(.data, aggregation = FALSE) { +arrow_mask <- function(.data) { f_env <- new_environment(.cache$functions) - if (aggregation) { - # Add the aggregation functions to the environment. - for (f in names(agg_funcs)) { - f_env[[f]] <- agg_funcs[[f]] - } - } else { - # Add functions that need to error hard and clear. - # Some R functions will still try to evaluate on an Expression - # and return NA with a warning :exploding_head: - fail <- function(...) stop("Not implemented") - for (f in c("mean", "sd")) { - f_env[[f]] <- fail - } - } - # Assign the schema to the expressions schema <- .data$.data$schema walk(.data$selected_columns, ~ (.$schema <- schema)) diff --git a/r/R/dplyr-filter.R b/r/R/dplyr-filter.R index d85fa16af2e71..69decbd76655f 100644 --- a/r/R/dplyr-filter.R +++ b/r/R/dplyr-filter.R @@ -35,48 +35,24 @@ filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) } # tidy-eval the filter expressions inside an Arrow data_mask - filters <- lapply(expanded_filters, arrow_eval, arrow_mask(out)) - bad_filters <- map_lgl(filters, ~ inherits(., "try-error")) - if (any(bad_filters)) { - # This is similar to abandon_ship() except that the filter eval is - # vectorized, and we apply filters that _did_ work before abandoning ship - # with the rest - expr_labs <- map_chr(expanded_filters[bad_filters], format_expr) - if (query_on_dataset(out)) { - # Abort. We don't want to auto-collect if this is a Dataset because that - # could blow up, too big. - stop( - "Filter expression not supported for Arrow Datasets: ", - oxford_paste(expr_labs, quote = FALSE), - "\nCall collect() first to pull data into R.", - call. = FALSE - ) - } else { - arrow_errors <- map2_chr( - filters[bad_filters], expr_labs, - handle_arrow_not_supported - ) - if (length(arrow_errors) == 1) { - msg <- paste0(arrow_errors, "; ") - } else { - msg <- paste0("* ", arrow_errors, "\n", collapse = "") - } - warning( - msg, "pulling data into R", - immediate. = TRUE, - call. = FALSE - ) - # Set any valid filters first, then collect and then apply the invalid ones in R - out <- dplyr::collect(set_filters(out, filters[!bad_filters])) - if (by$from_by) { - out <- dplyr::ungroup(out) - } - return(dplyr::filter(out, !!!expanded_filters[bad_filters], .by = {{ .by }})) + mask <- arrow_mask(out) + for (expr in expanded_filters) { + filt <- arrow_eval(expr, mask) + if (inherits(filt, "try-error")) { + msg <- handle_arrow_not_supported(filt, format_expr(expr)) + return(abandon_ship(match.call(), .data, msg)) + } + if (length(mask$.aggregations)) { + # dplyr lets you filter on e.g. x < mean(x), but we haven't implemented it. + # But we could, the same way it works in mutate() via join, if someone asks. + # Until then, just error. + # TODO: add a test for this + msg <- paste("Expression", format_expr(expr), "not supported in filter() in Arrow") + return(abandon_ship(match.call(), .data, msg)) } + out <- set_filters(out, filt) } - out <- set_filters(out, filters) - if (by$from_by) { out$group_by_vars <- character() } diff --git a/r/R/dplyr-funcs-agg.R b/r/R/dplyr-funcs-agg.R index 9411ce5ce6faf..c0c4eb3089425 100644 --- a/r/R/dplyr-funcs-agg.R +++ b/r/R/dplyr-funcs-agg.R @@ -29,56 +29,56 @@ # you can use list_compute_functions("^hash_") register_bindings_aggregate <- function() { - register_binding_agg("base::sum", function(..., na.rm = FALSE) { + register_binding("base::sum", function(..., na.rm = FALSE) { set_agg( fun = "sum", data = ensure_one_arg(list2(...), "sum"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("base::prod", function(..., na.rm = FALSE) { + register_binding("base::prod", function(..., na.rm = FALSE) { set_agg( fun = "product", data = ensure_one_arg(list2(...), "prod"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("base::any", function(..., na.rm = FALSE) { + register_binding("base::any", function(..., na.rm = FALSE) { set_agg( fun = "any", data = ensure_one_arg(list2(...), "any"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("base::all", function(..., na.rm = FALSE) { + register_binding("base::all", function(..., na.rm = FALSE) { set_agg( fun = "all", data = ensure_one_arg(list2(...), "all"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("base::mean", function(x, na.rm = FALSE) { + register_binding("base::mean", function(x, na.rm = FALSE) { set_agg( fun = "mean", data = list(x), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("stats::sd", function(x, na.rm = FALSE, ddof = 1) { + register_binding("stats::sd", function(x, na.rm = FALSE, ddof = 1) { set_agg( fun = "stddev", data = list(x), options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) ) }) - register_binding_agg("stats::var", function(x, na.rm = FALSE, ddof = 1) { + register_binding("stats::var", function(x, na.rm = FALSE, ddof = 1) { set_agg( fun = "variance", data = list(x), options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) ) }) - register_binding_agg( + register_binding( "stats::quantile", function(x, probs, na.rm = FALSE) { if (length(probs) != 1) { @@ -103,7 +103,7 @@ register_bindings_aggregate <- function() { "approximate quantile (t-digest) is computed" ) ) - register_binding_agg( + register_binding( "stats::median", function(x, na.rm = FALSE) { # TODO: Bind to the Arrow function that returns an exact median and remove @@ -122,28 +122,28 @@ register_bindings_aggregate <- function() { }, notes = "approximate median (t-digest) is computed" ) - register_binding_agg("dplyr::n_distinct", function(..., na.rm = FALSE) { + register_binding("dplyr::n_distinct", function(..., na.rm = FALSE) { set_agg( fun = "count_distinct", data = ensure_one_arg(list2(...), "n_distinct"), options = list(na.rm = na.rm) ) }) - register_binding_agg("dplyr::n", function() { + register_binding("dplyr::n", function() { set_agg( fun = "count_all", data = list(), options = list() ) }) - register_binding_agg("base::min", function(..., na.rm = FALSE) { + register_binding("base::min", function(..., na.rm = FALSE) { set_agg( fun = "min", data = ensure_one_arg(list2(...), "min"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("base::max", function(..., na.rm = FALSE) { + register_binding("base::max", function(..., na.rm = FALSE) { set_agg( fun = "max", data = ensure_one_arg(list2(...), "max"), diff --git a/r/R/dplyr-funcs.R b/r/R/dplyr-funcs.R index abf2362d0107f..c0eb47e428b7f 100644 --- a/r/R/dplyr-funcs.R +++ b/r/R/dplyr-funcs.R @@ -22,8 +22,8 @@ NULL #' Register compute bindings #' -#' The `register_binding()` and `register_binding_agg()` functions -#' are used to populate a list of functions that operate on (and return) +#' `register_binding()` is used to populate a list of functions that operate on +#' (and return) #' Expressions. These are the basis for the `.data` mask inside dplyr methods. #' #' @section Writing bindings: @@ -40,26 +40,10 @@ NULL #' * Inside your function, you can call any other binding with `call_binding()`. #' #' @param fun_name A string containing a function name in the form `"function"` or -#' `"package::function"`. The package name is currently not used but -#' may be used in the future to allow these types of function calls. -#' @param fun A function or `NULL` to un-register a previous function. +#' `"package::function"`. +#' @param fun A function, or `NULL` to un-register a previous function. #' This function must accept `Expression` objects as arguments and return #' `Expression` objects instead of regular R objects. -#' @param agg_fun An aggregate function or `NULL` to un-register a previous -#' aggregate function. This function must accept `Expression` objects as -#' arguments and return a `list()` with components: -#' - `fun`: string function name -#' - `data`: list of 0 or more `Expression`s -#' - `options`: list of function options, as passed to call_function -#' @param update_cache Update .cache$functions at the time of registration. -#' the default is FALSE because the majority of usage is to register -#' bindings at package load, after which we create the cache once. The -#' reason why .cache$functions is needed in addition to nse_funcs for -#' non-aggregate functions could be revisited...it is currently used -#' as the data mask in mutate, filter, and aggregate (but not -#' summarise) because the data mask has to be a list. -#' @param registry An environment in which the functions should be -#' assigned. #' @param notes string for the docs: note any limitations or differences in #' behavior between the Arrow version and the R function. #' @return The previously registered binding or `NULL` if no previously @@ -67,12 +51,10 @@ NULL #' @keywords internal register_binding <- function(fun_name, fun, - registry = nse_funcs, - update_cache = FALSE, notes = character(0)) { unqualified_name <- sub("^.*?:{+}", "", fun_name) - previous_fun <- registry[[unqualified_name]] + previous_fun <- .cache$functions[[unqualified_name]] # if the unqualified name exists in the registry, warn if (!is.null(previous_fun) && !identical(fun, previous_fun)) { @@ -87,58 +69,25 @@ register_binding <- function(fun_name, # register both as `pkg::fun` and as `fun` if `qualified_name` is prefixed # unqualified_name and fun_name will be the same if not prefixed - registry[[unqualified_name]] <- fun - registry[[fun_name]] <- fun - + .cache$functions[[unqualified_name]] <- fun + .cache$functions[[fun_name]] <- fun .cache$docs[[fun_name]] <- notes - - if (update_cache) { - fun_cache <- .cache$functions - fun_cache[[unqualified_name]] <- fun - fun_cache[[fun_name]] <- fun - .cache$functions <- fun_cache - } - invisible(previous_fun) } -unregister_binding <- function(fun_name, registry = nse_funcs, - update_cache = FALSE) { +unregister_binding <- function(fun_name) { unqualified_name <- sub("^.*?:{+}", "", fun_name) - previous_fun <- registry[[unqualified_name]] + previous_fun <- .cache$functions[[unqualified_name]] - rm( - list = unique(c(fun_name, unqualified_name)), - envir = registry, - inherits = FALSE - ) - - if (update_cache) { - fun_cache <- .cache$functions - fun_cache[[unqualified_name]] <- NULL - fun_cache[[fun_name]] <- NULL - .cache$functions <- fun_cache - } + .cache$functions[[unqualified_name]] <- NULL + .cache$functions[[fun_name]] <- NULL invisible(previous_fun) } -#' @rdname register_binding -#' @keywords internal -register_binding_agg <- function(fun_name, - agg_fun, - registry = agg_funcs, - notes = character(0)) { - register_binding(fun_name, agg_fun, registry = registry, notes = notes) -} - # Supports functions and tests that call previously-defined bindings call_binding <- function(fun_name, ...) { - nse_funcs[[fun_name]](...) -} - -call_binding_agg <- function(fun_name, ...) { - agg_funcs[[fun_name]](...) + .cache$functions[[fun_name]](...) } create_binding_cache <- function() { @@ -147,7 +96,7 @@ create_binding_cache <- function() { # Register all available Arrow Compute functions, namespaced as arrow_fun. all_arrow_funs <- list_compute_functions() - arrow_funcs <- set_names( + .cache$functions <- set_names( lapply(all_arrow_funs, function(fun) { force(fun) function(...) Expression$create(fun, ...) @@ -155,7 +104,7 @@ create_binding_cache <- function() { paste0("arrow_", all_arrow_funs) ) - # Register bindings into nse_funcs and agg_funcs + # Register bindings into the cache register_bindings_array_function_map() register_bindings_aggregate() register_bindings_conditional() @@ -165,37 +114,17 @@ create_binding_cache <- function() { register_bindings_type() register_bindings_augmented() - # We only create the cache for nse_funcs and not agg_funcs - .cache$functions <- c(as.list(nse_funcs), arrow_funcs) -} - -# environments in the arrow namespace used in the above functions -nse_funcs <- new.env(parent = emptyenv()) -agg_funcs <- new.env(parent = emptyenv()) -.cache <- new.env(parent = emptyenv()) - -# we register 2 versions of the "::" binding - one for use with nse_funcs -# and another one for use with agg_funcs (registered in dplyr-funcs-agg.R) -nse_funcs[["::"]] <- function(lhs, rhs) { - lhs_name <- as.character(substitute(lhs)) - rhs_name <- as.character(substitute(rhs)) + .cache$functions[["::"]] <- function(lhs, rhs) { + lhs_name <- as.character(substitute(lhs)) + rhs_name <- as.character(substitute(rhs)) - fun_name <- paste0(lhs_name, "::", rhs_name) + fun_name <- paste0(lhs_name, "::", rhs_name) - # if we do not have a binding for pkg::fun, then fall back on to the - # regular pkg::fun function - nse_funcs[[fun_name]] %||% asNamespace(lhs_name)[[rhs_name]] + # if we do not have a binding for pkg::fun, then fall back on to the + # regular pkg::fun function + .cache$functions[[fun_name]] %||% asNamespace(lhs_name)[[rhs_name]] + } } -agg_funcs[["::"]] <- function(lhs, rhs) { - lhs_name <- as.character(substitute(lhs)) - rhs_name <- as.character(substitute(rhs)) - - fun_name <- paste0(lhs_name, "::", rhs_name) - - # if we do not have a binding for pkg::fun, then fall back on to the - # nse_funcs (useful when we have a regular function inside an aggregating one) - # and then, if searching nse_funcs fails too, fall back to the - # regular `pkg::fun()` function - agg_funcs[[fun_name]] %||% nse_funcs[[fun_name]] %||% asNamespace(lhs_name)[[rhs_name]] -} +# environment in the arrow namespace used in the above functions +.cache <- new.env(parent = emptyenv()) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 880f7799e6316..f0a8c005676df 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -48,7 +48,7 @@ mutate.arrow_dplyr_query <- function(.data, # Create a mask with aggregation functions in it # If there are any aggregations, we will need to compute them and # and join the results back in, for "window functions" like x - mean(x) - mask <- arrow_mask(out, aggregation = TRUE) + mask <- arrow_mask(out) # Evaluate the mutate expressions results <- list() for (i in seq_along(exprs)) { @@ -84,12 +84,12 @@ mutate.arrow_dplyr_query <- function(.data, agg_query$aggregations <- mask$.aggregations agg_query <- collapse.arrow_dplyr_query(agg_query) if (length(grv)) { - out <- left_join(out, agg_query, by = grv) + out <- dplyr::left_join(out, agg_query, by = grv) } else { # If there are no group_by vars, add a scalar column to both and join on that agg_query$selected_columns[["..tempjoin"]] <- Expression$scalar(1L) out$selected_columns[["..tempjoin"]] <- Expression$scalar(1L) - out <- left_join(out, agg_query, by = "..tempjoin") + out <- dplyr::left_join(out, agg_query, by = "..tempjoin") } } diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 56de14db6dd44..58ca849152a75 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -84,7 +84,7 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { # and the aggregation functions will pull out those terms and insert into # that list. # nolint end - mask <- arrow_mask(.data, aggregation = TRUE) + mask <- arrow_mask(.data) # We'll collect any transformations after the aggregation here. # summarize_eval() returns NULL when the outer expression is an aggregation, diff --git a/r/R/udf.R b/r/R/udf.R index 922095cceba6a..0415fbac3c9fc 100644 --- a/r/R/udf.R +++ b/r/R/udf.R @@ -95,12 +95,7 @@ register_scalar_function <- function(name, fun, in_type, out_type, body(binding_fun) <- expr_substitute(body(binding_fun), sym("name"), name) environment(binding_fun) <- asNamespace("arrow") - register_binding( - name, - binding_fun, - update_cache = TRUE - ) - + register_binding(name, binding_fun) invisible(NULL) } diff --git a/r/man/register_binding.Rd b/r/man/register_binding.Rd index d10cd733bbe9d..b84cde3b8993a 100644 --- a/r/man/register_binding.Rd +++ b/r/man/register_binding.Rd @@ -2,63 +2,28 @@ % Please edit documentation in R/dplyr-funcs.R \name{register_binding} \alias{register_binding} -\alias{register_binding_agg} \title{Register compute bindings} \usage{ -register_binding( - fun_name, - fun, - registry = nse_funcs, - update_cache = FALSE, - notes = character(0) -) - -register_binding_agg( - fun_name, - agg_fun, - registry = agg_funcs, - notes = character(0) -) +register_binding(fun_name, fun, notes = character(0)) } \arguments{ \item{fun_name}{A string containing a function name in the form \code{"function"} or -\code{"package::function"}. The package name is currently not used but -may be used in the future to allow these types of function calls.} +\code{"package::function"}.} -\item{fun}{A function or \code{NULL} to un-register a previous function. +\item{fun}{A function, or \code{NULL} to un-register a previous function. This function must accept \code{Expression} objects as arguments and return \code{Expression} objects instead of regular R objects.} -\item{registry}{An environment in which the functions should be -assigned.} - -\item{update_cache}{Update .cache$functions at the time of registration. -the default is FALSE because the majority of usage is to register -bindings at package load, after which we create the cache once. The -reason why .cache$functions is needed in addition to nse_funcs for -non-aggregate functions could be revisited...it is currently used -as the data mask in mutate, filter, and aggregate (but not -summarise) because the data mask has to be a list.} - \item{notes}{string for the docs: note any limitations or differences in behavior between the Arrow version and the R function.} - -\item{agg_fun}{An aggregate function or \code{NULL} to un-register a previous -aggregate function. This function must accept \code{Expression} objects as -arguments and return a \code{list()} with components: -\itemize{ -\item \code{fun}: string function name -\item \code{data}: list of 0 or more \code{Expression}s -\item \code{options}: list of function options, as passed to call_function -}} } \value{ The previously registered binding or \code{NULL} if no previously registered function existed. } \description{ -The \code{register_binding()} and \code{register_binding_agg()} functions -are used to populate a list of functions that operate on (and return) +\code{register_binding()} is used to populate a list of functions that operate on +(and return) Expressions. These are the basis for the \code{.data} mask inside dplyr methods. } \section{Writing bindings}{ diff --git a/r/tests/testthat/test-dataset-dplyr.R b/r/tests/testthat/test-dataset-dplyr.R index 1e36ea8bd4966..493eac328e5cd 100644 --- a/r/tests/testthat/test-dataset-dplyr.R +++ b/r/tests/testthat/test-dataset-dplyr.R @@ -325,7 +325,7 @@ test_that("dplyr method not implemented messages", { # This one is more nuanced expect_error( ds %>% filter(int > 6, dbl > max(dbl)), - "Filter expression not supported for Arrow Datasets: dbl > max(dbl)\nCall collect() first to pull data into R.", + "Expression dbl > max(dbl) not supported in filter() in Arrow\nCall collect() first to pull data into R.", fixed = TRUE ) }) diff --git a/r/tests/testthat/test-dplyr-filter.R b/r/tests/testthat/test-dplyr-filter.R index bf23685362a82..535bcb70c4cab 100644 --- a/r/tests/testthat/test-dplyr-filter.R +++ b/r/tests/testthat/test-dplyr-filter.R @@ -324,13 +324,14 @@ test_that("Filtering with unsupported functions", { filter( nchar(chr, type = "bytes", allowNA = TRUE) == 1, # bad, Arrow msg int > 2, # good - pnorm(dbl) > .99 # bad, opaque + pnorm(dbl) > .99 # bad, opaque, but we'll error on the first one before we get here ) %>% collect(), tbl, - warning = '\\* In nchar\\(chr, type = "bytes", allowNA = TRUE\\) == 1, allowNA = TRUE not supported in Arrow -\\* Expression pnorm\\(dbl\\) > 0.99 not supported in Arrow -pulling data into R' + warning = paste( + 'In nchar\\(chr, type = "bytes", allowNA = TRUE\\) == 1,', + "allowNA = TRUE not supported in Arrow; pulling data into R" + ) ) }) diff --git a/r/tests/testthat/test-dplyr-funcs.R b/r/tests/testthat/test-dplyr-funcs.R index 039604a85ee0c..48c5d730f8493 100644 --- a/r/tests/testthat/test-dplyr-funcs.R +++ b/r/tests/testthat/test-dplyr-funcs.R @@ -19,35 +19,25 @@ skip_on_cran() test_that("register_binding()/unregister_binding() works", { - fake_registry <- new.env(parent = emptyenv()) fun1 <- function() NULL fun2 <- function() "Hello" - expect_null(register_binding("some.pkg::some_fun", fun1, fake_registry)) - expect_identical(fake_registry$some_fun, fun1) - expect_identical(fake_registry$`some.pkg::some_fun`, fun1) + expect_null(register_binding("some.pkg::some_fun", fun1)) + expect_identical(.cache$functions$some_fun, fun1) + expect_identical(.cache$functions$`some.pkg::some_fun`, fun1) - expect_identical(unregister_binding("some.pkg::some_fun", fake_registry), fun1) - expect_false("some.pkg::some_fun" %in% names(fake_registry)) - expect_false("some_fun" %in% names(fake_registry)) + expect_identical(unregister_binding("some.pkg::some_fun"), fun1) + expect_false("some.pkg::some_fun" %in% names(.cache$functions)) + expect_false("some_fun" %in% names(.cache$functions)) - expect_null(register_binding("somePkg::some_fun", fun1, fake_registry)) - expect_identical(fake_registry$some_fun, fun1) + expect_null(register_binding("somePkg::some_fun", fun1)) + expect_identical(.cache$functions$some_fun, fun1) expect_warning( - register_binding("some.pkg2::some_fun", fun2, fake_registry), + register_binding("some.pkg2::some_fun", fun2), "A \"some_fun\" binding already exists in the registry and will be overwritten." ) # No warning when an identical function is re-registered - expect_silent(register_binding("some.pkg2::some_fun", fun2, fake_registry)) -}) - -test_that("register_binding_agg() works", { - fake_registry <- new.env(parent = emptyenv()) - fun1 <- function() NULL - - expect_null(register_binding_agg("somePkg::some_fun", fun1, fake_registry)) - expect_identical(fake_registry$some_fun, fun1) - expect_identical(fake_registry$`somePkg::some_fun`, fun1) + expect_silent(register_binding("some.pkg2::some_fun", fun2)) }) diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index 87bb5e5fac959..a61ef95bee73d 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -337,20 +337,20 @@ test_that("Functions that take ... but we only accept a single arg", { ) # Now that we've demonstrated that the whole machinery works, let's test - # the agg_funcs directly - expect_error(call_binding_agg("n_distinct"), "n_distinct() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("sum"), "sum() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("prod"), "prod() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("any"), "any() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("all"), "all() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("min"), "min() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("max"), "max() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("n_distinct", 1, 2), "Multiple arguments to n_distinct()") - expect_error(call_binding_agg("sum", 1, 2), "Multiple arguments to sum") - expect_error(call_binding_agg("any", 1, 2), "Multiple arguments to any()") - expect_error(call_binding_agg("all", 1, 2), "Multiple arguments to all()") - expect_error(call_binding_agg("min", 1, 2), "Multiple arguments to min()") - expect_error(call_binding_agg("max", 1, 2), "Multiple arguments to max()") + # the agg funcs directly + expect_error(call_binding("n_distinct"), "n_distinct() with 0 arguments", fixed = TRUE) + expect_error(call_binding("sum"), "sum() with 0 arguments", fixed = TRUE) + expect_error(call_binding("prod"), "prod() with 0 arguments", fixed = TRUE) + expect_error(call_binding("any"), "any() with 0 arguments", fixed = TRUE) + expect_error(call_binding("all"), "all() with 0 arguments", fixed = TRUE) + expect_error(call_binding("min"), "min() with 0 arguments", fixed = TRUE) + expect_error(call_binding("max"), "max() with 0 arguments", fixed = TRUE) + expect_error(call_binding("n_distinct", 1, 2), "Multiple arguments to n_distinct()") + expect_error(call_binding("sum", 1, 2), "Multiple arguments to sum") + expect_error(call_binding("any", 1, 2), "Multiple arguments to any()") + expect_error(call_binding("all", 1, 2), "Multiple arguments to all()") + expect_error(call_binding("min", 1, 2), "Multiple arguments to min()") + expect_error(call_binding("max", 1, 2), "Multiple arguments to max()") }) test_that("median()", { diff --git a/r/tests/testthat/test-udf.R b/r/tests/testthat/test-udf.R index 0eb75b1dde6e5..8604dc610a435 100644 --- a/r/tests/testthat/test-udf.R +++ b/r/tests/testthat/test-udf.R @@ -90,7 +90,7 @@ test_that("register_scalar_function() adds a compute function to the registry", int32(), float64(), auto_convert = TRUE ) - on.exit(unregister_binding("times_32", update_cache = TRUE)) + on.exit(unregister_binding("times_32")) expect_true("times_32" %in% names(asNamespace("arrow")$.cache$functions)) expect_true("times_32" %in% list_compute_functions()) @@ -124,7 +124,7 @@ test_that("arrow_scalar_function() with bad return type errors", { int32(), float64() ) - on.exit(unregister_binding("times_32_bad_return_type_array", update_cache = TRUE)) + on.exit(unregister_binding("times_32_bad_return_type_array")) expect_error( call_function("times_32_bad_return_type_array", Array$create(1L)), @@ -137,7 +137,7 @@ test_that("arrow_scalar_function() with bad return type errors", { int32(), float64() ) - on.exit(unregister_binding("times_32_bad_return_type_scalar", update_cache = TRUE)) + on.exit(unregister_binding("times_32_bad_return_type_scalar")) expect_error( call_function("times_32_bad_return_type_scalar", Array$create(1L)), @@ -155,7 +155,7 @@ test_that("register_scalar_function() can register multiple kernels", { out_type = function(in_types) in_types[[1]], auto_convert = TRUE ) - on.exit(unregister_binding("times_32", update_cache = TRUE)) + on.exit(unregister_binding("times_32")) expect_equal( call_function("times_32", Scalar$create(1L, int32())), @@ -238,7 +238,7 @@ test_that("user-defined functions work during multi-threaded execution", { float64(), auto_convert = TRUE ) - on.exit(unregister_binding("times_32", update_cache = TRUE)) + on.exit(unregister_binding("times_32")) # check a regular collect() result <- open_dataset(tf_dataset) %>% @@ -271,7 +271,7 @@ test_that("nested exec plans can contain user-defined functions", { float64(), auto_convert = TRUE ) - on.exit(unregister_binding("times_32", update_cache = TRUE)) + on.exit(unregister_binding("times_32")) stream_plan_with_udf <- function() { record_batch(a = 1:1000) %>% @@ -310,7 +310,7 @@ test_that("head() on exec plan containing user-defined functions", { float64(), auto_convert = TRUE ) - on.exit(unregister_binding("times_32", update_cache = TRUE)) + on.exit(unregister_binding("times_32")) result <- record_batch(a = 1:1000) %>% dplyr::mutate(b = times_32(a)) %>% diff --git a/r/vignettes/developers/writing_bindings.Rmd b/r/vignettes/developers/writing_bindings.Rmd index 443211b3c2b5e..e1ed92105dbc3 100644 --- a/r/vignettes/developers/writing_bindings.Rmd +++ b/r/vignettes/developers/writing_bindings.Rmd @@ -145,11 +145,10 @@ test_that("startsWith behaves identically in dplyr and Arrow", { df <- tibble(x = c("Foo", "bar", "baz", "qux")) compare_dplyr_binding( .input %>% - filter(startsWith(x, "b")) %>% - collect(), + filter(startsWith(x, "b")) %>% + collect(), df ) - }) ``` @@ -197,7 +196,7 @@ As `startsWith()` requires options, direct mapping is not appropriate. If the function cannot be mapped directly, some extra work may be needed to ensure that calling the arrow version of the function results in the same result as calling the R version of the function. In this case, the function will need -adding to the `nse_funcs` function registry. Here is how this might look for +adding to the `.cache$functions` function registry. Here is how this might look for `startsWith()`: ```{r, eval = FALSE}