diff --git a/NOTICE.txt b/NOTICE.txt index a609791374c28..2089c6fb20358 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -1,5 +1,5 @@ Apache Arrow -Copyright 2016-2019 The Apache Software Foundation +Copyright 2016-2024 The Apache Software Foundation This product includes software developed at The Apache Software Foundation (http://www.apache.org/). diff --git a/ci/scripts/integration_substrait.sh b/ci/scripts/integration_substrait.sh index f7208ae113814..164f0e80b9890 100755 --- a/ci/scripts/integration_substrait.sh +++ b/ci/scripts/integration_substrait.sh @@ -24,7 +24,7 @@ set -e echo "Substrait Integration Tests" echo "Validating imports" python -c "import pyarrow.substrait" -python -c "from substrait_consumer.consumers import AceroConsumer" +python -c "from substrait_consumer.consumers.acero_consumer import AceroConsumer" echo "Executing pytest" cd consumer-testing diff --git a/cpp/cmake_modules/FindLLVMAlt.cmake b/cpp/cmake_modules/FindLLVMAlt.cmake index 69f680824b082..2730f829817f6 100644 --- a/cpp/cmake_modules/FindLLVMAlt.cmake +++ b/cpp/cmake_modules/FindLLVMAlt.cmake @@ -93,8 +93,8 @@ if(LLVM_FOUND) debuginfodwarf ipo linker - mcjit native + orcjit target) if(LLVM_VERSION_MAJOR GREATER_EQUAL 14) list(APPEND LLVM_TARGET_COMPONENTS passes) diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index acdd0a0742468..0b0e340a67d4e 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -894,7 +894,8 @@ const std::shared_ptr& FixedSizeListArray::value_type() const { const std::shared_ptr& FixedSizeListArray::values() const { return values_; } Result> FixedSizeListArray::FromArrays( - const std::shared_ptr& values, int32_t list_size) { + const std::shared_ptr& values, int32_t list_size, + std::shared_ptr null_bitmap, int64_t null_count) { if (list_size <= 0) { return Status::Invalid("list_size needs to be a strict positive integer"); } @@ -905,14 +906,14 @@ Result> FixedSizeListArray::FromArrays( } int64_t length = values->length() / list_size; auto list_type = std::make_shared(values->type(), list_size); - std::shared_ptr validity_buf; - return std::make_shared(list_type, length, values, validity_buf, - /*null_count=*/0, /*offset=*/0); + return std::make_shared(list_type, length, values, null_bitmap, + null_count); } Result> FixedSizeListArray::FromArrays( - const std::shared_ptr& values, std::shared_ptr type) { + const std::shared_ptr& values, std::shared_ptr type, + std::shared_ptr null_bitmap, int64_t null_count) { if (type->id() != Type::FIXED_SIZE_LIST) { return Status::TypeError("Expected fixed size list type, got ", type->ToString()); } @@ -926,10 +927,9 @@ Result> FixedSizeListArray::FromArrays( "The length of the values Array needs to be a multiple of the list size"); } int64_t length = values->length() / list_type.list_size(); - std::shared_ptr validity_buf; - return std::make_shared(type, length, values, validity_buf, - /*null_count=*/0, /*offset=*/0); + return std::make_shared(type, length, values, null_bitmap, + null_count); } Result> FixedSizeListArray::Flatten( diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 61606e1592d61..768a630e0af54 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -599,17 +599,25 @@ class ARROW_EXPORT FixedSizeListArray : public Array { /// /// \param[in] values Array containing list values /// \param[in] list_size The fixed length of each list + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap /// \return Will have length equal to values.length() / list_size - static Result> FromArrays(const std::shared_ptr& values, - int32_t list_size); + static Result> FromArrays( + const std::shared_ptr& values, int32_t list_size, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); /// \brief Construct FixedSizeListArray from child value array and type /// /// \param[in] values Array containing list values /// \param[in] type The fixed sized list type + /// \param[in] null_bitmap Optional validity bitmap + /// \param[in] null_count Optional null count in null_bitmap /// \return Will have length equal to values.length() / type.list_size() - static Result> FromArrays(const std::shared_ptr& values, - std::shared_ptr type); + static Result> FromArrays( + const std::shared_ptr& values, std::shared_ptr type, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount); protected: void SetData(const std::shared_ptr& data); diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 4d2c814a69bbb..2e5210b073ee4 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -452,7 +452,7 @@ Result TDigest(const Datum& value, /// \brief Find the first index of a value in an array. /// /// \param[in] value The array to search. -/// \param[in] options The array to search for. See IndexOoptions. +/// \param[in] options The array to search for. See IndexOptions. /// \param[in] ctx the function execution context, optional /// \return out a Scalar containing the index (or -1 if not found). /// diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 26fbe64f74293..bad34f4a37881 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -491,7 +491,7 @@ struct ARROW_EXPORT AssumeTimezoneOptions : public FunctionOptions { /// How to interpret ambiguous local times (due to DST shifts) Ambiguous ambiguous; - /// How to interpret non-existent local times (due to DST shifts) + /// How to interpret nonexistent local times (due to DST shifts) Nonexistent nonexistent; }; @@ -1589,7 +1589,7 @@ ARROW_EXPORT Result MonthsBetween(const Datum& left, const Datum& right, ARROW_EXPORT Result WeeksBetween(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); -/// \brief Month Day Nano Between finds the number of months, days, and nonaseconds +/// \brief Month Day Nano Between finds the number of months, days, and nanoseconds /// between two values /// /// \param[in] left input treated as the start time diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 759f9e5c1a408..919572f16ee69 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -401,7 +401,7 @@ Result> NthToIndices(const Array& values, int64_t n, /// \brief Return indices that partition an array around n-th sorted element. /// -/// This overload takes a PartitionNthOptions specifiying the pivot index +/// This overload takes a PartitionNthOptions specifying the pivot index /// and the null handling. /// /// \param[in] values array to be partitioned @@ -452,7 +452,7 @@ Result> SortIndices(const Array& array, /// \brief Return the indices that would sort an array. /// -/// This overload takes a ArraySortOptions specifiying the sort order +/// This overload takes a ArraySortOptions specifying the sort order /// and the null handling. /// /// \param[in] array array to sort @@ -486,7 +486,7 @@ Result> SortIndices(const ChunkedArray& chunked_array, /// \brief Return the indices that would sort a chunked array. /// -/// This overload takes a ArraySortOptions specifiying the sort order +/// This overload takes a ArraySortOptions specifying the sort order /// and the null handling. /// /// \param[in] chunked_array chunked array to sort diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index c18dfa0952245..28dcf493fa294 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -1164,7 +1164,7 @@ class ScalarAggExecutor : public KernelExecutorImpl { // TODO(wesm): this is odd and should be examined soon -- only one state // "should" be needed per thread of execution - // FIXME(ARROW-11840) don't merge *any* aggegates for every batch + // FIXME(ARROW-11840) don't merge *any* aggregates for every batch ARROW_ASSIGN_OR_RAISE(auto batch_state, kernel_->init(kernel_ctx_, {kernel_, *input_types_, options_})); diff --git a/cpp/src/arrow/compute/exec_internal.h b/cpp/src/arrow/compute/exec_internal.h index 8beff2a6c63ac..7e4f364a9288e 100644 --- a/cpp/src/arrow/compute/exec_internal.h +++ b/cpp/src/arrow/compute/exec_internal.h @@ -46,7 +46,7 @@ class ARROW_EXPORT ExecSpanIterator { public: ExecSpanIterator() = default; - /// \brief Initialize itertor iterator and do basic argument validation + /// \brief Initialize iterator and do basic argument validation /// /// \param[in] batch the input ExecBatch /// \param[in] max_chunksize the maximum length of each ExecSpan. Depending diff --git a/cpp/src/arrow/compute/exec_test.cc b/cpp/src/arrow/compute/exec_test.cc index d661e5735fea6..cfce0c57fa416 100644 --- a/cpp/src/arrow/compute/exec_test.cc +++ b/cpp/src/arrow/compute/exec_test.cc @@ -1232,7 +1232,7 @@ void TestCallScalarFunctionPreallocationCases::DoTest(FunctionCallerMaker caller } // Set the exec_chunksize to be smaller, so now we have several invocations - // of the kernel, but still the output is onee array + // of the kernel, but still the output is one array { std::vector args = {Datum(arr)}; exec_ctx_->set_exec_chunksize(80); diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index ddd241652460e..1fbcd6a249093 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -1100,7 +1100,7 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { AddFirstLastKernels(FirstLastInit, TemporalTypes(), func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); - // Add first/last as convience functions + // Add first/last as convenience functions func = std::make_shared("first", Arity::Unary(), first_doc, &default_scalar_aggregate_options); AddFirstOrLastAggKernel(func.get(), first_last_func); diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h index 4966e9871d62c..f08e7aaa538bb 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h @@ -472,7 +472,7 @@ struct FirstLastImpl : public ScalarAggregator { this->count += arr.length() - null_count; if (null_count == 0) { - // If there are no null valus, we can just merge + // If there are no null values, we can just merge // the first and last element this->state.MergeOne(arr.GetView(0)); this->state.MergeOne(arr.GetView(arr.length() - 1)); diff --git a/cpp/src/arrow/compute/kernels/aggregate_mode.cc b/cpp/src/arrow/compute/kernels/aggregate_mode.cc index 7f359ead6cb83..3f84c0a5ee4c4 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_mode.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_mode.cc @@ -115,7 +115,7 @@ Status Finalize(KernelContext* ctx, const DataType& type, ExecResult* out, return Status::OK(); } -// count value occurances for integers with narrow value range +// count value occurrences for integers with narrow value range // O(1) space, O(n) time template struct CountModer { diff --git a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc index e675a1cec86c9..f4826229dd46c 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc @@ -120,7 +120,7 @@ struct SortQuantiler { }); // input array is partitioned around data point at `last_index` (pivot) - // for next quatile which is smaller, we only consider inputs left of the pivot + // for next quantile which is smaller, we only consider inputs left of the pivot uint64_t last_index = in_buffer.size(); if (is_datapoint) { CType* out_buffer = out_data->template GetMutableValues(1); diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index aa19fb3401232..65439af2748b5 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -478,7 +478,7 @@ TEST_F(TestSumKernelRoundOff, Basics) { // array = np.arange(321000, dtype='float64') // array -= np.mean(array) - // array *= arrray + // array *= array double index = 0; ASSERT_OK_AND_ASSIGN( auto array, ArrayFromBuilderVisitor( @@ -3653,7 +3653,7 @@ class TestPrimitiveQuantileKernel : public ::testing::Test { #define INTYPE(x) Datum(static_cast(x)) #define DOUBLE(x) Datum(static_cast(x)) -// output type per interplation: linear, lower, higher, nearest, midpoint +// output type per interpolation: linear, lower, higher, nearest, midpoint #define O(a, b, c, d, e) \ { DOUBLE(a), INTYPE(b), INTYPE(c), INTYPE(d), DOUBLE(e) } diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 47cae538e2e3f..c37e45513d040 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -1848,8 +1848,8 @@ struct GroupedFirstLastImpl final : public GroupedAggregator { const ArrayData& group_id_mapping) override { // The merge is asymmetric. "first" from this state gets pick over "first" from other // state. "last" from other state gets pick over from this state. This is so that when - // using with segmeneted aggregation, we still get the correct "first" and "last" - // value for the entire segement. + // using with segmented aggregation, we still get the correct "first" and "last" + // value for the entire segment. auto other = checked_cast(&raw_other); auto raw_firsts = firsts_.mutable_data(); diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic_benchmark.cc index 4b678da5f1b42..17e9951d69bc2 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic_benchmark.cc @@ -33,7 +33,7 @@ constexpr auto kSeed = 0x94378165; using BinaryOp = Result(const Datum&, const Datum&, ArithmeticOptions, ExecContext*); -// Add explicit overflow-checked shortcuts, for easy benchmark parametering. +// Add explicit overflow-checked shortcuts, for easy benchmark parameterizing. static Result AddChecked(const Datum& left, const Datum& right, ArithmeticOptions options = ArithmeticOptions(), ExecContext* ctx = NULLPTR) { diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc index 756b3028c4a59..37a1bcbc02d73 100644 --- a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc @@ -1857,7 +1857,7 @@ TEST_F(TestBinaryArithmeticDecimal, DispatchBest) { } } -// reference result from bc (precsion=100, scale=40) +// reference result from bc (precision=100, scale=40) TEST_F(TestBinaryArithmeticDecimal, AddSubtract) { // array array, decimal128 { diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index b429c8175b020..a8acf68f66c8b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2842,19 +2842,19 @@ TEST(Cast, StructToDifferentNullabilityStruct) { ::testing::HasSubstr("cannot cast nullable field to non-nullable field"), Cast(src_nullable, options1_non_nullable)); - std::vector> fields_dest2_non_nullble = { + std::vector> fields_dest2_non_nullable = { std::make_shared("a", int64(), false), std::make_shared("c", int64(), false)}; - const auto dest2_non_nullable = arrow::struct_(fields_dest2_non_nullble); + const auto dest2_non_nullable = arrow::struct_(fields_dest2_non_nullable); const auto options2_non_nullable = CastOptions::Safe(dest2_non_nullable); EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, ::testing::HasSubstr("cannot cast nullable field to non-nullable field"), Cast(src_nullable, options2_non_nullable)); - std::vector> fields_dest3_non_nullble = { + std::vector> fields_dest3_non_nullable = { std::make_shared("c", int64(), false)}; - const auto dest3_non_nullable = arrow::struct_(fields_dest3_non_nullble); + const auto dest3_non_nullable = arrow::struct_(fields_dest3_non_nullable); const auto options3_non_nullable = CastOptions::Safe(dest3_non_nullable); EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index 771261cac9140..c4c46b5efe84d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -69,7 +69,7 @@ template class TestIfElsePrimitive : public ::testing::Test {}; // There are a lot of tests here if we cover all the types and it gets slow on valgrind -// so we overrdie the standard type sets with a smaller range +// so we override the standard type sets with a smaller range #ifdef ARROW_VALGRIND using IfElseNumericBasedTypes = ::testing::Types::Round(round_val); } - // Equality check is ommitted so that the common case of 10^0 (integer rounding) + // Equality check is omitted so that the common case of 10^0 (integer rounding) // uses multiply-only round_val = ndigits > 0 ? (round_val / pow10) : (round_val * pow10); if (!std::isfinite(round_val)) { diff --git a/cpp/src/arrow/compute/kernels/scalar_string_internal.h b/cpp/src/arrow/compute/kernels/scalar_string_internal.h index 1a9969441655d..7a5d5a7c86e85 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_internal.h +++ b/cpp/src/arrow/compute/kernels/scalar_string_internal.h @@ -306,7 +306,7 @@ struct StringSplitExec { using ListOffsetsBuilderType = TypedBufferBuilder; using State = OptionsWrapper; - // Keep the temporary storage accross individual values, to minimize reallocations + // Keep the temporary storage across individual values, to minimize reallocations std::vector parts; Options options; diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index ff14f5e7a5c5d..5dec16d89e29c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -2060,7 +2060,7 @@ TYPED_TEST(TestStringKernels, SliceCodeunitsBasic) { this->CheckUnary("utf8_slice_codeunits", R"(["𝑓öõḍš"])", this->type(), R"([""])", &options_edgecase_1); - // this is a safeguard agains an optimization path possible, but actually a tricky case + // this is a safeguard against an optimization path possible, but actually a tricky case SliceOptions options_edgecase_2{-6, -2}; this->CheckUnary("utf8_slice_codeunits", R"(["𝑓öõḍš"])", this->type(), R"(["𝑓öõ"])", &options_edgecase_2); @@ -2189,7 +2189,7 @@ TYPED_TEST(TestBinaryKernels, SliceBytesBasic) { "ds\"]", this->type(), R"([""])", &options_edgecase_1); - // this is a safeguard agains an optimization path possible, but actually a tricky case + // this is a safeguard against an optimization path possible, but actually a tricky case SliceOptions options_edgecase_2{-6, -2}; this->CheckUnary("binary_slice", "[\"f\xc2\xa2" diff --git a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc index d8bbe5ca8a34c..d4482334285bc 100644 --- a/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_temporal_test.cc @@ -2101,9 +2101,9 @@ TEST_F(ScalarTemporalTest, StrftimeNoTimezone) { TEST_F(ScalarTemporalTest, StrftimeInvalidTimezone) { const char* seconds = R"(["1970-01-01T00:00:59", null])"; - auto arr = ArrayFromJSON(timestamp(TimeUnit::SECOND, "non-existent"), seconds); + auto arr = ArrayFromJSON(timestamp(TimeUnit::SECOND, "nonexistent"), seconds); EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, testing::HasSubstr("Cannot locate timezone 'non-existent'"), + Invalid, testing::HasSubstr("Cannot locate timezone 'nonexistent'"), Strftime(arr, StrftimeOptions())); } @@ -2159,12 +2159,12 @@ TEST_F(ScalarTemporalTest, StrftimeOtherLocale) { } TEST_F(ScalarTemporalTest, StrftimeInvalidLocale) { - auto options = StrftimeOptions("%d %B %Y %H:%M:%S", "non-existent"); + auto options = StrftimeOptions("%d %B %Y %H:%M:%S", "nonexistent"); const char* seconds = R"(["1970-01-01T00:00:59", null])"; auto arr = ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC"), seconds); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, - testing::HasSubstr("Cannot find locale 'non-existent'"), + testing::HasSubstr("Cannot find locale 'nonexistent'"), Strftime(arr, options)); } @@ -2601,7 +2601,7 @@ TEST_F(ScalarTemporalTestStrictCeil, TestCeilTemporalStrictCeil) { TEST_F(ScalarTemporalTestMultipleSinceGreaterUnit, CeilUTC) { std::string op = "ceil_temporal"; - // Data for tests below was generaed via lubridate with the exception + // Data for tests below was generated via lubridate with the exception // of week data because lubridate currently does not support rounding to // multiple of week. const char* ceil_15_nanosecond = @@ -2989,7 +2989,7 @@ TEST_F(ScalarTemporalTest, TestFloorTemporal) { TEST_F(ScalarTemporalTestMultipleSinceGreaterUnit, FloorUTC) { std::string op = "floor_temporal"; - // Data for tests below was generaed via lubridate with the exception + // Data for tests below was generated via lubridate with the exception // of week data because lubridate currently does not support rounding to // multiple of week. const char* floor_15_nanosecond = @@ -3402,7 +3402,7 @@ TEST_F(ScalarTemporalTest, TestCeilFloorRoundTemporalBrussels) { TEST_F(ScalarTemporalTestMultipleSinceGreaterUnit, RoundUTC) { std::string op = "round_temporal"; - // Data for tests below was generaed via lubridate with the exception + // Data for tests below was generated via lubridate with the exception // of week data because lubridate currently does not support rounding to // multiple of week. const char* round_15_nanosecond = diff --git a/cpp/src/arrow/compute/kernels/vector_run_end_encode.cc b/cpp/src/arrow/compute/kernels/vector_run_end_encode.cc index 943fdcd6b147f..811ed23e1134b 100644 --- a/cpp/src/arrow/compute/kernels/vector_run_end_encode.cc +++ b/cpp/src/arrow/compute/kernels/vector_run_end_encode.cc @@ -30,11 +30,11 @@ namespace compute { namespace internal { namespace { -struct RunEndEncondingState : public KernelState { - explicit RunEndEncondingState(std::shared_ptr run_end_type) +struct RunEndEncodingState : public KernelState { + explicit RunEndEncodingState(std::shared_ptr run_end_type) : run_end_type{std::move(run_end_type)} {} - ~RunEndEncondingState() override = default; + ~RunEndEncodingState() override = default; std::shared_ptr run_end_type; }; @@ -273,7 +273,7 @@ struct RunEndEncodeExec { template static Status Exec(KernelContext* ctx, const ExecSpan& span, ExecResult* result) { - auto state = checked_cast(ctx->state()); + auto state = checked_cast(ctx->state()); switch (state->run_end_type->id()) { case Type::INT16: return DoExec(ctx, span, result); @@ -290,7 +290,7 @@ struct RunEndEncodeExec { /// \brief The OutputType::Resolver of the "run_end_decode" function. static Result ResolveOutputType( KernelContext* ctx, const std::vector& input_types) { - auto state = checked_cast(ctx->state()); + auto state = checked_cast(ctx->state()); return TypeHolder(std::make_shared(state->run_end_type, input_types[0].GetSharedPtr())); } @@ -301,7 +301,7 @@ Result> RunEndEncodeInit(KernelContext*, auto* options = checked_cast(args.options); auto run_end_type = options ? options->run_end_type : RunEndEncodeOptions::Defaults().run_end_type; - return std::make_unique(std::move(run_end_type)); + return std::make_unique(std::move(run_end_type)); } template diff --git a/cpp/src/arrow/compute/kernels/vector_select_k.cc b/cpp/src/arrow/compute/kernels/vector_select_k.cc index 1740a9b7f0bb4..97996e6d52cc0 100644 --- a/cpp/src/arrow/compute/kernels/vector_select_k.cc +++ b/cpp/src/arrow/compute/kernels/vector_select_k.cc @@ -72,9 +72,9 @@ class SelectKComparator { } }; -class ArraySelecter : public TypeVisitor { +class ArraySelector : public TypeVisitor { public: - ArraySelecter(ExecContext* ctx, const Array& array, const SelectKOptions& options, + ArraySelector(ExecContext* ctx, const Array& array, const SelectKOptions& options, Datum* output) : TypeVisitor(), ctx_(ctx), @@ -164,9 +164,9 @@ struct TypedHeapItem { ArrayType* array; }; -class ChunkedArraySelecter : public TypeVisitor { +class ChunkedArraySelector : public TypeVisitor { public: - ChunkedArraySelecter(ExecContext* ctx, const ChunkedArray& chunked_array, + ChunkedArraySelector(ExecContext* ctx, const ChunkedArray& chunked_array, const SelectKOptions& options, Datum* output) : TypeVisitor(), chunked_array_(chunked_array), @@ -273,13 +273,13 @@ class ChunkedArraySelecter : public TypeVisitor { Datum* output_; }; -class RecordBatchSelecter : public TypeVisitor { +class RecordBatchSelector : public TypeVisitor { private: using ResolvedSortKey = ResolvedRecordBatchSortKey; using Comparator = MultipleKeyComparator; public: - RecordBatchSelecter(ExecContext* ctx, const RecordBatch& record_batch, + RecordBatchSelector(ExecContext* ctx, const RecordBatch& record_batch, const SelectKOptions& options, Datum* output) : TypeVisitor(), ctx_(ctx), @@ -391,7 +391,7 @@ class RecordBatchSelecter : public TypeVisitor { Comparator comparator_; }; -class TableSelecter : public TypeVisitor { +class TableSelector : public TypeVisitor { private: struct ResolvedSortKey { ResolvedSortKey(const std::shared_ptr& chunked_array, @@ -420,7 +420,7 @@ class TableSelecter : public TypeVisitor { using Comparator = MultipleKeyComparator; public: - TableSelecter(ExecContext* ctx, const Table& table, const SelectKOptions& options, + TableSelector(ExecContext* ctx, const Table& table, const SelectKOptions& options, Datum* output) : TypeVisitor(), ctx_(ctx), @@ -610,32 +610,32 @@ class SelectKUnstableMetaFunction : public MetaFunction { Result SelectKth(const Array& array, const SelectKOptions& options, ExecContext* ctx) const { Datum output; - ArraySelecter selecter(ctx, array, options, &output); - ARROW_RETURN_NOT_OK(selecter.Run()); + ArraySelector selector(ctx, array, options, &output); + ARROW_RETURN_NOT_OK(selector.Run()); return output; } Result SelectKth(const ChunkedArray& chunked_array, const SelectKOptions& options, ExecContext* ctx) const { Datum output; - ChunkedArraySelecter selecter(ctx, chunked_array, options, &output); - ARROW_RETURN_NOT_OK(selecter.Run()); + ChunkedArraySelector selector(ctx, chunked_array, options, &output); + ARROW_RETURN_NOT_OK(selector.Run()); return output; } Result SelectKth(const RecordBatch& record_batch, const SelectKOptions& options, ExecContext* ctx) const { ARROW_RETURN_NOT_OK(CheckConsistency(*record_batch.schema(), options.sort_keys)); Datum output; - RecordBatchSelecter selecter(ctx, record_batch, options, &output); - ARROW_RETURN_NOT_OK(selecter.Run()); + RecordBatchSelector selector(ctx, record_batch, options, &output); + ARROW_RETURN_NOT_OK(selector.Run()); return output; } Result SelectKth(const Table& table, const SelectKOptions& options, ExecContext* ctx) const { ARROW_RETURN_NOT_OK(CheckConsistency(*table.schema(), options.sort_keys)); Datum output; - TableSelecter selecter(ctx, table, options, &output); - ARROW_RETURN_NOT_OK(selecter.Run()); + TableSelector selector(ctx, table, options, &output); + ARROW_RETURN_NOT_OK(selector.Run()); return output; } }; diff --git a/cpp/src/arrow/compute/kernels/vector_selection_test.cc b/cpp/src/arrow/compute/kernels/vector_selection_test.cc index 30e85c1f71089..bdf9f5454fdef 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_test.cc @@ -2488,7 +2488,7 @@ TEST(TestIndicesNonZero, IndicesNonZeroBoolean) { Datum actual; std::shared_ptr result; - // boool + // bool ASSERT_OK_AND_ASSIGN( actual, CallFunction("indices_nonzero", {ArrayFromJSON(boolean(), "[null, true, false, true]")})); diff --git a/cpp/src/arrow/compute/key_map.cc b/cpp/src/arrow/compute/key_map.cc index 525dae850f19b..a027ec811cf24 100644 --- a/cpp/src/arrow/compute/key_map.cc +++ b/cpp/src/arrow/compute/key_map.cc @@ -505,7 +505,7 @@ void SwissTable::find(const int num_keys, const uint32_t* hashes, // Slow processing of input keys in the most generic case. // Handles inserting new keys. -// Pre-existing keys will be handled correctly, although the intended use is for this +// Preexisting keys will be handled correctly, although the intended use is for this // call to follow a call to find() method, which would only pass on new keys that were // not present in the hash table. // @@ -617,7 +617,7 @@ Status SwissTable::map_new_keys(uint32_t num_ids, uint16_t* ids, const uint32_t* ARROW_DCHECK(static_cast(num_ids) <= (1 << log_minibatch_)); ARROW_DCHECK(static_cast(max_id + 1) <= (1 << log_minibatch_)); - // Allocate temporary buffers for slot ids and intialize them + // Allocate temporary buffers for slot ids and initialize them auto slot_ids_buf = util::TempVectorHolder(temp_stack, max_id + 1); uint32_t* slot_ids = slot_ids_buf.mutable_data(); init_slot_ids_for_new_keys(num_ids, ids, hashes, slot_ids); diff --git a/cpp/src/arrow/compute/key_map.h b/cpp/src/arrow/compute/key_map.h index 85ef9029d6fc9..8e06dc83483aa 100644 --- a/cpp/src/arrow/compute/key_map.h +++ b/cpp/src/arrow/compute/key_map.h @@ -142,7 +142,7 @@ class ARROW_EXPORT SwissTable { void extract_group_ids_imp(const int num_keys, const uint16_t* selection, const uint32_t* hashes, const uint8_t* local_slots, uint32_t* out_group_ids, int elements_offset, - int element_mutltiplier) const; + int element_multiplier) const; inline uint64_t next_slot_to_visit(uint64_t block_index, int slot, int match_found) const; @@ -187,7 +187,7 @@ class ARROW_EXPORT SwissTable { // Slow processing of input keys in the most generic case. // Handles inserting new keys. - // Pre-existing keys will be handled correctly, although the intended use is for this + // Preexisting keys will be handled correctly, although the intended use is for this // call to follow a call to find() method, which would only pass on new keys that were // not present in the hash table. // diff --git a/cpp/src/arrow/compute/key_map_avx2.cc b/cpp/src/arrow/compute/key_map_avx2.cc index 731553511044f..3526a6cb0f344 100644 --- a/cpp/src/arrow/compute/key_map_avx2.cc +++ b/cpp/src/arrow/compute/key_map_avx2.cc @@ -117,7 +117,7 @@ int SwissTable::early_filter_imp_avx2_x8(const int num_hashes, const uint32_t* h vlocal_slot = _mm256_add_epi32(_mm256_and_si256(vlocal_slot, _mm256_set1_epi32(0xff)), _mm256_and_si256(vgt, _mm256_set1_epi32(4))); - // Convert slot id relative to the block to slot id relative to the beginnning of the + // Convert slot id relative to the block to slot id relative to the beginning of the // table // uint64_t local_slot = _mm256_extract_epi64( diff --git a/cpp/src/arrow/compute/light_array.cc b/cpp/src/arrow/compute/light_array.cc index 93a054de1957c..73ea01a03a8fa 100644 --- a/cpp/src/arrow/compute/light_array.cc +++ b/cpp/src/arrow/compute/light_array.cc @@ -89,7 +89,7 @@ KeyColumnArray KeyColumnArray::Slice(int64_t offset, int64_t length) const { sliced.bit_offset_[0] = (bit_offset_[0] + offset) % 8; if (metadata_.fixed_length == 0 && !metadata_.is_null_type) { - ARROW_DCHECK(is_bool_type()) << "Expected BOOL type type but got a different type."; + ARROW_DCHECK(is_bool_type()) << "Expected BOOL type but got a different type."; sliced.buffers_[1] = buffers_[1] ? buffers_[1] + (bit_offset_[1] + offset) / 8 : nullptr; sliced.mutable_buffers_[1] = mutable_buffers_[1] diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc index 52121530fe91d..3ceba43604b28 100644 --- a/cpp/src/arrow/compute/light_array_test.cc +++ b/cpp/src/arrow/compute/light_array_test.cc @@ -333,7 +333,7 @@ TEST(ResizableArrayData, Binary) { ASSERT_EQ(0, array.num_rows()); ASSERT_OK(array.ResizeFixedLengthBuffers(2)); ASSERT_EQ(2, array.num_rows()); - // At this point the offets memory has been allocated and needs to be filled + // At this point the offsets memory has been allocated and needs to be filled // in before we allocate the variable length memory int offsets_width = static_cast(arrow::internal::checked_pointer_cast(type) diff --git a/cpp/src/arrow/compute/ordering.h b/cpp/src/arrow/compute/ordering.h index e581269cc20dd..61caa2b570dd3 100644 --- a/cpp/src/arrow/compute/ordering.h +++ b/cpp/src/arrow/compute/ordering.h @@ -52,7 +52,7 @@ class ARROW_EXPORT SortKey : public util::EqualityComparable { bool Equals(const SortKey& other) const; std::string ToString() const; - /// A FieldRef targetting the sort column. + /// A FieldRef targeting the sort column. FieldRef target; /// How to order by this sort key. SortOrder order; diff --git a/cpp/src/arrow/compute/registry_test.cc b/cpp/src/arrow/compute/registry_test.cc index 2d69f119df1f4..3dc14bcff83ee 100644 --- a/cpp/src/arrow/compute/registry_test.cc +++ b/cpp/src/arrow/compute/registry_test.cc @@ -69,7 +69,7 @@ TEST_P(TestRegistry, Basics) { ASSERT_OK_AND_ASSIGN(std::shared_ptr f1, registry_->GetFunction("f1")); ASSERT_EQ("f1", f1->name()); - // Non-existent function + // Nonexistent function ASSERT_RAISES(KeyError, registry_->GetFunction("f2")); // Try adding a function with name collision diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc index b3d28ef19a1a0..5e23eda16fda2 100644 --- a/cpp/src/arrow/compute/row/grouper.cc +++ b/cpp/src/arrow/compute/row/grouper.cc @@ -210,7 +210,7 @@ struct SimpleKeySegmenter : public BaseRowSegmenter { private: TypeHolder key_type_; - std::vector save_key_data_; // previusly seen segment-key grouping data + std::vector save_key_data_; // previously seen segment-key grouping data bool extend_was_called_; }; diff --git a/cpp/src/arrow/compute/row/grouper.h b/cpp/src/arrow/compute/row/grouper.h index 15f00eaac2191..628a9c14f3e44 100644 --- a/cpp/src/arrow/compute/row/grouper.h +++ b/cpp/src/arrow/compute/row/grouper.h @@ -29,12 +29,12 @@ namespace arrow { namespace compute { /// \brief A segment -/// A segment group is a chunk of continous rows that have the same segment key. (For +/// A segment group is a chunk of continuous rows that have the same segment key. (For /// example, in ordered time series processing, segment key can be "date", and a segment /// group can be all the rows that belong to the same date.) A segment group can span -/// across multiple exec batches. A segment is a chunk of continous rows that has the same -/// segment key within a given batch. When a segment group span cross batches, it will -/// have multiple segments. A segment never spans cross batches. The segment data +/// across multiple exec batches. A segment is a chunk of continuous rows that has the +/// same segment key within a given batch. When a segment group span cross batches, it +/// will have multiple segments. A segment never spans cross batches. The segment data /// structure only makes sense when used along with a exec batch. struct ARROW_EXPORT Segment { /// \brief the offset into the batch where the segment starts @@ -92,7 +92,7 @@ class ARROW_EXPORT RowSegmenter { /// \brief Reset this segmenter /// /// A segmenter normally extends (see `Segment`) a segment from one batch to the next. - /// If segment-extenion is undesirable, for example when each batch is processed + /// If segment-extension is undesirable, for example when each batch is processed /// independently, then `Reset` should be invoked before processing the next batch. virtual Status Reset() = 0; diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 21350a490411a..029e19bc0e32a 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -18,7 +18,16 @@ #include "arrow/filesystem/azurefs.h" #include "arrow/filesystem/azurefs_internal.h" +// idenfity.hpp triggers -Wattributes warnings cause -Werror builds to fail, +// so disable it for this file with pragmas. +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wattributes" +#endif #include +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif #include #include diff --git a/cpp/src/gandiva/configuration.h b/cpp/src/gandiva/configuration.h index f43a2b190731f..620c58537f963 100644 --- a/cpp/src/gandiva/configuration.h +++ b/cpp/src/gandiva/configuration.h @@ -37,10 +37,12 @@ class GANDIVA_EXPORT Configuration { explicit Configuration(bool optimize, std::shared_ptr function_registry = - gandiva::default_function_registry()) + gandiva::default_function_registry(), + bool dump_ir = false) : optimize_(optimize), target_host_cpu_(true), - function_registry_(function_registry) {} + function_registry_(std::move(function_registry)), + dump_ir_(dump_ir) {} Configuration() : Configuration(true) {} @@ -50,11 +52,13 @@ class GANDIVA_EXPORT Configuration { bool optimize() const { return optimize_; } bool target_host_cpu() const { return target_host_cpu_; } + bool dump_ir() const { return dump_ir_; } std::shared_ptr function_registry() const { return function_registry_; } void set_optimize(bool optimize) { optimize_ = optimize; } + void set_dump_ir(bool dump_ir) { dump_ir_ = dump_ir; } void target_host_cpu(bool target_host_cpu) { target_host_cpu_ = target_host_cpu; } void set_function_registry(std::shared_ptr function_registry) { function_registry_ = std::move(function_registry); @@ -65,6 +69,9 @@ class GANDIVA_EXPORT Configuration { bool target_host_cpu_; /* set the mcpu flag to host cpu while compiling llvm ir */ std::shared_ptr function_registry_; /* function registry that may contain external functions */ + // flag indicating if IR dumping is needed, defaults to false, and turning it on will + // negatively affect performance + bool dump_ir_ = false; }; /// \brief configuration builder for gandiva @@ -83,6 +90,12 @@ class GANDIVA_EXPORT ConfigurationBuilder { return configuration; } + std::shared_ptr build_with_ir_dumping(bool dump_ir) { + std::shared_ptr configuration( + new Configuration(true, gandiva::default_function_registry(), dump_ir)); + return configuration; + } + std::shared_ptr build( std::shared_ptr function_registry) { std::shared_ptr configuration( diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc index 1cea1fd2cbf30..fc047f2ac0763 100644 --- a/cpp/src/gandiva/engine.cc +++ b/cpp/src/gandiva/engine.cc @@ -31,7 +31,8 @@ #include #include -#include "arrow/util/logging.h" +#include +#include #if defined(_MSC_VER) #pragma warning(push) @@ -46,13 +47,14 @@ #include #include #include -#include +#include #include #include #include #include #include #include +#include #if LLVM_VERSION_MAJOR >= 17 #include #else @@ -86,6 +88,13 @@ #include #include +// JITLink is available in LLVM 9+ +// but the `InProcessMemoryManager::Create` API was added since LLVM 14 +#if LLVM_VERSION_MAJOR >= 14 && !defined(_WIN32) +#define JIT_LINK_SUPPORTED +#include +#endif + #if defined(_MSC_VER) #pragma warning(pop) #endif @@ -103,9 +112,136 @@ extern const size_t kPrecompiledBitcodeSize; std::once_flag llvm_init_once_flag; static bool llvm_init = false; static llvm::StringRef cpu_name; -static llvm::SmallVector cpu_attrs; +static std::vector cpu_attrs; std::once_flag register_exported_funcs_flag; +template +arrow::Result AsArrowResult(llvm::Expected& expected, + const std::string& error_context) { + if (!expected) { + return Status::CodeGenError(error_context, llvm::toString(expected.takeError())); + } + return std::move(expected.get()); +} + +Result MakeTargetMachineBuilder( + const Configuration& conf) { + llvm::orc::JITTargetMachineBuilder jtmb( + (llvm::Triple(llvm::sys::getDefaultTargetTriple()))); + if (conf.target_host_cpu()) { + jtmb.setCPU(cpu_name.str()); + jtmb.addFeatures(cpu_attrs); + } + auto const opt_level = + conf.optimize() ? llvm::CodeGenOpt::Aggressive : llvm::CodeGenOpt::None; + jtmb.setCodeGenOptLevel(opt_level); + return jtmb; +} + +std::string DumpModuleIR(const llvm::Module& module) { + std::string ir; + llvm::raw_string_ostream stream(ir); + module.print(stream, nullptr); + return ir; +} + +void AddAbsoluteSymbol(llvm::orc::LLJIT& lljit, const std::string& name, + void* function_ptr) { + llvm::orc::MangleAndInterner mangle(lljit.getExecutionSession(), lljit.getDataLayout()); + + // https://github.com/llvm/llvm-project/commit/8b1771bd9f304be39d4dcbdcccedb6d3bcd18200#diff-77984a824d9182e5c67a481740f3bc5da78d5bd4cf6e1716a083ddb30a4a4931 + // LLVM 17 introduced ExecutorSymbolDef and move most of ORC APIs to ExecutorAddr +#if LLVM_VERSION_MAJOR >= 17 + llvm::orc::ExecutorSymbolDef symbol( + llvm::orc::ExecutorAddr(reinterpret_cast(function_ptr)), + llvm::JITSymbolFlags::Exported); +#else + llvm::JITEvaluatedSymbol symbol(reinterpret_cast(function_ptr), + llvm::JITSymbolFlags::Exported); +#endif + + auto error = lljit.getMainJITDylib().define( + llvm::orc::absoluteSymbols({{mangle(name), symbol}})); + llvm::cantFail(std::move(error)); +} + +// add current process symbol to dylib +// LLVM >= 18 does this automatically +void AddProcessSymbol(llvm::orc::LLJIT& lljit) { + lljit.getMainJITDylib().addGenerator( + llvm::cantFail(llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess( + lljit.getDataLayout().getGlobalPrefix()))); + // the `atexit` symbol cannot be found for ASAN +#ifdef ADDRESS_SANITIZER + if (!lljit.lookup("atexit")) { + AddAbsoluteSymbol(lljit, "atexit", reinterpret_cast(atexit)); + } +#endif +} + +#ifdef JIT_LINK_SUPPORTED +Result> CreateMemmoryManager() { + auto maybe_mem_manager = llvm::jitlink::InProcessMemoryManager::Create(); + return AsArrowResult(maybe_mem_manager, "Could not create memory manager: "); +} + +Status UseJITLinkIfEnabled(llvm::orc::LLJITBuilder& jit_builder) { + static auto maybe_use_jit_link = ::arrow::internal::GetEnvVar("GANDIVA_USE_JIT_LINK"); + if (maybe_use_jit_link.ok()) { + ARROW_ASSIGN_OR_RAISE(static auto memory_manager, CreateMemmoryManager()); + jit_builder.setObjectLinkingLayerCreator( + [&](llvm::orc::ExecutionSession& ES, const llvm::Triple& TT) { + return std::make_unique(ES, *memory_manager); + }); + } + return Status::OK(); +} +#endif + +Result> BuildJIT( + llvm::orc::JITTargetMachineBuilder jtmb, + std::optional>& object_cache) { + llvm::orc::LLJITBuilder jit_builder; + +#ifdef JIT_LINK_SUPPORTED + ARROW_RETURN_NOT_OK(UseJITLinkIfEnabled(jit_builder)); +#endif + + jit_builder.setJITTargetMachineBuilder(std::move(jtmb)); + if (object_cache.has_value()) { + jit_builder.setCompileFunctionCreator( + [&object_cache](llvm::orc::JITTargetMachineBuilder JTMB) + -> llvm::Expected> { + auto target_machine = JTMB.createTargetMachine(); + if (!target_machine) { + return target_machine.takeError(); + } + // after compilation, the object code will be stored into the given object + // cache + return std::make_unique( + std::move(*target_machine), &object_cache.value().get()); + }); + } + auto maybe_jit = jit_builder.create(); + ARROW_ASSIGN_OR_RAISE(auto jit, + AsArrowResult(maybe_jit, "Could not create LLJIT instance: ")); + + AddProcessSymbol(*jit); + return jit; +} + +Status Engine::SetLLVMObjectCache(GandivaObjectCache& object_cache) { + auto cached_buffer = object_cache.getObject(nullptr); + if (cached_buffer) { + auto error = lljit_->addObjectFile(std::move(cached_buffer)); + if (error) { + return Status::CodeGenError("Failed to add cached object file to LLJIT: ", + llvm::toString(std::move(error))); + } + } + return Status::OK(); +} + void Engine::InitOnce() { DCHECK_EQ(llvm_init, false); @@ -127,28 +263,34 @@ void Engine::InitOnce() { } } ARROW_LOG(INFO) << "Detected CPU Name : " << cpu_name.str(); - ARROW_LOG(INFO) << "Detected CPU Features:" << cpu_attrs_str; + ARROW_LOG(INFO) << "Detected CPU Features: [" << cpu_attrs_str << "]"; llvm_init = true; } Engine::Engine(const std::shared_ptr& conf, - std::unique_ptr ctx, - std::unique_ptr engine, llvm::Module* module, - bool cached) - : context_(std::move(ctx)), - execution_engine_(std::move(engine)), + std::unique_ptr lljit, + std::unique_ptr target_machine, bool cached) + : context_(std::make_unique()), + lljit_(std::move(lljit)), ir_builder_(std::make_unique>(*context_)), - module_(module), types_(*context_), optimize_(conf->optimize()), cached_(cached), - function_registry_(conf->function_registry()) {} + function_registry_(conf->function_registry()), + target_machine_(std::move(target_machine)), + conf_(conf) { + // LLVM 10 doesn't like the expr function name to be the same as the module name + auto module_id = "gdv_module_" + std::to_string(reinterpret_cast(this)); + module_ = std::make_unique(module_id, *context_); +} + +Engine::~Engine() {} Status Engine::Init() { std::call_once(register_exported_funcs_flag, gandiva::RegisterExportedFuncs); + // Add mappings for global functions that can be accessed from LLVM/IR module. ARROW_RETURN_NOT_OK(AddGlobalMappings()); - return Status::OK(); } @@ -163,101 +305,32 @@ Status Engine::LoadFunctionIRs() { } /// factory method to construct the engine. -Status Engine::Make(const std::shared_ptr& conf, bool cached, - std::unique_ptr* out) { +Result> Engine::Make( + const std::shared_ptr& conf, bool cached, + std::optional> object_cache) { std::call_once(llvm_init_once_flag, InitOnce); - auto ctx = std::make_unique(); - auto module = std::make_unique("codegen", *ctx); - - // Capture before moving, ExecutionEngine does not allow retrieving the - // original Module. - auto module_ptr = module.get(); - - auto opt_level = - conf->optimize() ? llvm::CodeGenOpt::Aggressive : llvm::CodeGenOpt::None; - - // Note that the lifetime of the error string is not captured by the - // ExecutionEngine but only for the lifetime of the builder. Found by - // inspecting LLVM sources. - std::string builder_error; - - llvm::EngineBuilder engine_builder(std::move(module)); - - engine_builder.setEngineKind(llvm::EngineKind::JIT) - .setOptLevel(opt_level) - .setErrorStr(&builder_error); - - if (conf->target_host_cpu()) { - engine_builder.setMCPU(cpu_name); - engine_builder.setMAttrs(cpu_attrs); - } - std::unique_ptr exec_engine{engine_builder.create()}; - - if (exec_engine == nullptr) { - return Status::CodeGenError("Could not instantiate llvm::ExecutionEngine: ", - builder_error); - } + ARROW_ASSIGN_OR_RAISE(auto jtmb, MakeTargetMachineBuilder(*conf)); + ARROW_ASSIGN_OR_RAISE(auto jit, BuildJIT(jtmb, object_cache)); + auto maybe_tm = jtmb.createTargetMachine(); + ARROW_ASSIGN_OR_RAISE(auto target_machine, + AsArrowResult(maybe_tm, "Could not create target machine: ")); std::unique_ptr engine{ - new Engine(conf, std::move(ctx), std::move(exec_engine), module_ptr, cached)}; - ARROW_RETURN_NOT_OK(engine->Init()); - *out = std::move(engine); - return Status::OK(); -} - -// This method was modified from its original version for a part of MLIR -// Original source from -// https://github.com/llvm/llvm-project/blob/9f2ce5b915a505a5488a5cf91bb0a8efa9ddfff7/mlir/lib/ExecutionEngine/ExecutionEngine.cpp -// The original copyright notice follows. - -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -static void SetDataLayout(llvm::Module* module) { - auto target_triple = llvm::sys::getDefaultTargetTriple(); - std::string error_message; - auto target = llvm::TargetRegistry::lookupTarget(target_triple, error_message); - if (!target) { - return; - } - - std::string cpu(llvm::sys::getHostCPUName()); - llvm::SubtargetFeatures features; - llvm::StringMap host_features; - - if (llvm::sys::getHostCPUFeatures(host_features)) { - for (auto& f : host_features) { - features.AddFeature(f.first(), f.second); - } - } + new Engine(conf, std::move(jit), std::move(target_machine), cached)}; - std::unique_ptr machine( - target->createTargetMachine(target_triple, cpu, features.getString(), {}, {})); - - module->setDataLayout(machine->createDataLayout()); -} -// end of the modified method from MLIR - -template -static arrow::Result AsArrowResult(llvm::Expected& expected) { - if (!expected) { - std::string str; - llvm::raw_string_ostream stream(str); - stream << expected.takeError(); - return Status::CodeGenError(stream.str()); - } - return std::move(expected.get()); + ARROW_RETURN_NOT_OK(engine->Init()); + return engine; } static arrow::Status VerifyAndLinkModule( - llvm::Module* dest_module, + llvm::Module& dest_module, llvm::Expected> src_module_or_error) { - ARROW_ASSIGN_OR_RAISE(auto src_ir_module, AsArrowResult(src_module_or_error)); + ARROW_ASSIGN_OR_RAISE( + auto src_ir_module, + AsArrowResult(src_module_or_error, "Failed to verify and link module: ")); - // set dataLayout - SetDataLayout(src_ir_module.get()); + src_ir_module->setDataLayout(dest_module.getDataLayout()); std::string error_info; llvm::raw_string_ostream error_stream(error_info); @@ -265,16 +338,21 @@ static arrow::Status VerifyAndLinkModule( llvm::verifyModule(*src_ir_module, &error_stream), Status::CodeGenError("verify of IR Module failed: " + error_stream.str())); - ARROW_RETURN_IF(llvm::Linker::linkModules(*dest_module, std::move(src_ir_module)), + ARROW_RETURN_IF(llvm::Linker::linkModules(dest_module, std::move(src_ir_module)), Status::CodeGenError("failed to link IR Modules")); return Status::OK(); } +llvm::Module* Engine::module() { + DCHECK(!module_finalized_) << "module cannot be accessed after finalized"; + return module_.get(); +} + // Handling for pre-compiled IR libraries. Status Engine::LoadPreCompiledIR() { - auto bitcode = llvm::StringRef(reinterpret_cast(kPrecompiledBitcode), - kPrecompiledBitcodeSize); + auto const bitcode = llvm::StringRef(reinterpret_cast(kPrecompiledBitcode), + kPrecompiledBitcodeSize); /// Read from file into memory buffer. llvm::ErrorOr> buffer_or_error = @@ -291,14 +369,14 @@ Status Engine::LoadPreCompiledIR() { llvm::getOwningLazyBitcodeModule(std::move(buffer), *context()); // NOTE: llvm::handleAllErrors() fails linking with RTTI-disabled LLVM builds // (ARROW-5148) - ARROW_RETURN_NOT_OK(VerifyAndLinkModule(module_, std::move(module_or_error))); + ARROW_RETURN_NOT_OK(VerifyAndLinkModule(*module_, std::move(module_or_error))); return Status::OK(); } static llvm::MemoryBufferRef AsLLVMMemoryBuffer(const arrow::Buffer& arrow_buffer) { - auto data = reinterpret_cast(arrow_buffer.data()); - auto size = arrow_buffer.size(); - return llvm::MemoryBufferRef(llvm::StringRef(data, size), "external_bitcode"); + auto const data = reinterpret_cast(arrow_buffer.data()); + auto const size = arrow_buffer.size(); + return {llvm::StringRef(data, size), "external_bitcode"}; } Status Engine::LoadExternalPreCompiledIR() { @@ -306,7 +384,7 @@ Status Engine::LoadExternalPreCompiledIR() { for (auto const& buffer : buffers) { auto llvm_memory_buffer_ref = AsLLVMMemoryBuffer(*buffer); auto module_or_error = llvm::parseBitcodeFile(llvm_memory_buffer_ref, *context()); - ARROW_RETURN_NOT_OK(VerifyAndLinkModule(module_, std::move(module_or_error))); + ARROW_RETURN_NOT_OK(VerifyAndLinkModule(*module_, std::move(module_or_error))); } return Status::OK(); @@ -386,7 +464,8 @@ static void OptimizeModuleWithLegacyPassManager(llvm::Module& module, std::unique_ptr pass_manager( new llvm::legacy::PassManager()); - pass_manager->add(llvm::createTargetTransformInfoWrapperPass(target_analysis)); + pass_manager->add( + llvm::createTargetTransformInfoWrapperPass(std::move(target_analysis))); pass_manager->add(llvm::createFunctionInliningPass()); pass_manager->add(llvm::createInstructionCombiningPass()); pass_manager->add(llvm::createPromoteMemoryToRegisterPass()); @@ -411,40 +490,64 @@ Status Engine::FinalizeModule() { ARROW_RETURN_NOT_OK(RemoveUnusedFunctions()); if (optimize_) { - auto target_analysis = execution_engine_->getTargetMachine()->getTargetIRAnalysis(); - + auto target_analysis = target_machine_->getTargetIRAnalysis(); // misc passes to allow for inlining, vectorization, .. #if LLVM_VERSION_MAJOR >= 14 - OptimizeModuleWithNewPassManager(*module_, target_analysis); + OptimizeModuleWithNewPassManager(*module_, std::move(target_analysis)); #else - OptimizeModuleWithLegacyPassManager(*module_, target_analysis); + OptimizeModuleWithLegacyPassManager(*module_, std::move(target_analysis)); #endif } ARROW_RETURN_IF(llvm::verifyModule(*module_, &llvm::errs()), Status::CodeGenError("Module verification failed after optimizer")); - } - // do the compilation - execution_engine_->finalizeObject(); + // print the module IR and save it for later use if IR dumping is needed + // since the module will be moved to construct LLJIT instance, and it is not + // available after LLJIT instance is constructed + if (conf_->dump_ir()) { + module_ir_ = DumpModuleIR(*module_); + } + + llvm::orc::ThreadSafeModule tsm(std::move(module_), std::move(context_)); + auto error = lljit_->addIRModule(std::move(tsm)); + if (error) { + return Status::CodeGenError("Failed to add IR module to LLJIT: ", + llvm::toString(std::move(error))); + } + } module_finalized_ = true; return Status::OK(); } -void* Engine::CompiledFunction(std::string& function) { - DCHECK(module_finalized_); - return reinterpret_cast(execution_engine_->getFunctionAddress(function)); +Result Engine::CompiledFunction(const std::string& function) { + DCHECK(module_finalized_) + << "module must be finalized before getting compiled function"; + auto sym = lljit_->lookup(function); + if (!sym) { + return Status::CodeGenError("Failed to look up function: " + function + + " error: " + llvm::toString(sym.takeError())); + } + // Since LLVM 15, `LLJIT::lookup` returns ExecutorAddrs rather than + // JITEvaluatedSymbols +#if LLVM_VERSION_MAJOR >= 15 + auto fn_addr = sym->getValue(); +#else + auto fn_addr = sym->getAddress(); +#endif + auto fn_ptr = reinterpret_cast(fn_addr); + if (fn_ptr == nullptr) { + return Status::CodeGenError("Failed to get address for function: " + function); + } + return fn_ptr; } void Engine::AddGlobalMappingForFunc(const std::string& name, llvm::Type* ret_type, - const std::vector& args, - void* function_ptr) { - constexpr bool is_var_arg = false; - auto prototype = llvm::FunctionType::get(ret_type, args, is_var_arg); - constexpr auto linkage = llvm::GlobalValue::ExternalLinkage; - auto fn = llvm::Function::Create(prototype, linkage, name, module()); - execution_engine_->addGlobalMapping(fn, function_ptr); + const std::vector& args, void* func) { + auto const prototype = llvm::FunctionType::get(ret_type, args, /*is_var_arg*/ false); + llvm::Function::Create(prototype, llvm::GlobalValue::ExternalLinkage, name, module()); + AddAbsoluteSymbol(*lljit_, name, func); } arrow::Status Engine::AddGlobalMappings() { @@ -453,11 +556,9 @@ arrow::Status Engine::AddGlobalMappings() { return c_funcs.AddMappings(this); } -std::string Engine::DumpIR() { - std::string ir; - llvm::raw_string_ostream stream(ir); - module_->print(stream, nullptr); - return ir; +const std::string& Engine::ir() { + DCHECK(!module_ir_.empty()) << "dump_ir in Configuration must be set for dumping IR"; + return module_ir_; } } // namespace gandiva diff --git a/cpp/src/gandiva/engine.h b/cpp/src/gandiva/engine.h index df2d8b36d9260..565c3f142502d 100644 --- a/cpp/src/gandiva/engine.h +++ b/cpp/src/gandiva/engine.h @@ -17,11 +17,16 @@ #pragma once +#include +#include #include +#include #include #include #include +#include + #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "gandiva/configuration.h" @@ -30,23 +35,34 @@ #include "gandiva/llvm_types.h" #include "gandiva/visibility.h" +namespace llvm::orc { +class LLJIT; +} // namespace llvm::orc + namespace gandiva { /// \brief LLVM Execution engine wrapper. class GANDIVA_EXPORT Engine { public: + ~Engine(); llvm::LLVMContext* context() { return context_.get(); } llvm::IRBuilder<>* ir_builder() { return ir_builder_.get(); } LLVMTypes* types() { return &types_; } - llvm::Module* module() { return module_; } + + /// Retrieve LLVM module in the engine. + /// This should only be called before `FinalizeModule` is called + llvm::Module* module(); /// Factory method to create and initialize the engine object. /// /// \param[in] config the engine configuration /// \param[in] cached flag to mark if the module is already compiled and cached - /// \param[out] engine the created engine - static Status Make(const std::shared_ptr& config, bool cached, - std::unique_ptr* engine); + /// \param[in] object_cache an optional object_cache used for building the module + /// \return arrow::Result containing the created engine + static Result> Make( + const std::shared_ptr& config, bool cached, + std::optional> object_cache = + std::nullopt); /// Add the function to the list of IR functions that need to be compiled. /// Compiling only the functions that are used by the module saves time. @@ -59,36 +75,31 @@ class GANDIVA_EXPORT Engine { Status FinalizeModule(); /// Set LLVM ObjectCache. - void SetLLVMObjectCache(GandivaObjectCache& object_cache) { - execution_engine_->setObjectCache(&object_cache); - } + Status SetLLVMObjectCache(GandivaObjectCache& object_cache); /// Get the compiled function corresponding to the irfunction. - void* CompiledFunction(std::string& function); + Result CompiledFunction(const std::string& function); // Create and add a mapping for the cpp function to make it accessible from LLVM. void AddGlobalMappingForFunc(const std::string& name, llvm::Type* ret_type, const std::vector& args, void* func); /// Return the generated IR for the module. - std::string DumpIR(); + const std::string& ir(); /// Load the function IRs that can be accessed in the module. Status LoadFunctionIRs(); private: Engine(const std::shared_ptr& conf, - std::unique_ptr ctx, - std::unique_ptr engine, llvm::Module* module, - bool cached); + std::unique_ptr lljit, + std::unique_ptr target_machine, bool cached); // Post construction init. This _must_ be called after the constructor. Status Init(); static void InitOnce(); - llvm::ExecutionEngine& execution_engine() { return *execution_engine_; } - /// load pre-compiled IR modules from precompiled_bitcode.cc and merge them into /// the main module. Status LoadPreCompiledIR(); @@ -103,9 +114,9 @@ class GANDIVA_EXPORT Engine { Status RemoveUnusedFunctions(); std::unique_ptr context_; - std::unique_ptr execution_engine_; + std::unique_ptr lljit_; std::unique_ptr> ir_builder_; - llvm::Module* module_; + std::unique_ptr module_; LLVMTypes types_; std::vector functions_to_compile_; @@ -115,6 +126,9 @@ class GANDIVA_EXPORT Engine { bool cached_; bool functions_loaded_ = false; std::shared_ptr function_registry_; + std::string module_ir_; + std::unique_ptr target_machine_; + const std::shared_ptr conf_; }; } // namespace gandiva diff --git a/cpp/src/gandiva/engine_llvm_test.cc b/cpp/src/gandiva/engine_llvm_test.cc index 9baaa82d2e0d3..78f468d13fa1f 100644 --- a/cpp/src/gandiva/engine_llvm_test.cc +++ b/cpp/src/gandiva/engine_llvm_test.cc @@ -24,14 +24,14 @@ namespace gandiva { -typedef int64_t (*add_vector_func_t)(int64_t* data, int n); +using add_vector_func_t = int64_t (*)(int64_t*, int); class TestEngine : public ::testing::Test { protected: - std::string BuildVecAdd(Engine* engine) { - auto types = engine->types(); - llvm::IRBuilder<>* builder = engine->ir_builder(); - llvm::LLVMContext* context = engine->context(); + std::string BuildVecAdd(Engine* gdv_engine) { + auto types = gdv_engine->types(); + llvm::IRBuilder<>* builder = gdv_engine->ir_builder(); + llvm::LLVMContext* context = gdv_engine->context(); // Create fn prototype : // int64_t add_longs(int64_t *elements, int32_t nelements) @@ -42,10 +42,10 @@ class TestEngine : public ::testing::Test { llvm::FunctionType::get(types->i64_type(), arguments, false /*isVarArg*/); // Create fn - std::string func_name = "add_longs"; - engine->AddFunctionToCompile(func_name); + std::string func_name = "add_longs_test_expr"; + gdv_engine->AddFunctionToCompile(func_name); llvm::Function* fn = llvm::Function::Create( - prototype, llvm::GlobalValue::ExternalLinkage, func_name, engine->module()); + prototype, llvm::GlobalValue::ExternalLinkage, func_name, gdv_engine->module()); assert(fn != nullptr); // Name the arguments @@ -99,7 +99,9 @@ class TestEngine : public ::testing::Test { return func_name; } - void BuildEngine() { ASSERT_OK(Engine::Make(TestConfiguration(), false, &engine)); } + void BuildEngine() { + ASSERT_OK_AND_ASSIGN(engine, Engine::Make(TestConfiguration(), false)); + } std::unique_ptr engine; std::shared_ptr configuration = TestConfiguration(); @@ -111,7 +113,8 @@ TEST_F(TestEngine, TestAddUnoptimised) { std::string fn_name = BuildVecAdd(engine.get()); ASSERT_OK(engine->FinalizeModule()); - auto add_func = reinterpret_cast(engine->CompiledFunction(fn_name)); + ASSERT_OK_AND_ASSIGN(auto fn_ptr, engine->CompiledFunction(fn_name)); + auto add_func = reinterpret_cast(fn_ptr); int64_t my_array[] = {1, 3, -5, 8, 10}; EXPECT_EQ(add_func(my_array, 5), 17); @@ -123,7 +126,8 @@ TEST_F(TestEngine, TestAddOptimised) { std::string fn_name = BuildVecAdd(engine.get()); ASSERT_OK(engine->FinalizeModule()); - auto add_func = reinterpret_cast(engine->CompiledFunction(fn_name)); + EXPECT_OK_AND_ASSIGN(auto fn_ptr, engine->CompiledFunction(fn_name)); + auto add_func = reinterpret_cast(fn_ptr); int64_t my_array[] = {1, 3, -5, 8, 10}; EXPECT_EQ(add_func(my_array, 5), 17); diff --git a/cpp/src/gandiva/filter.cc b/cpp/src/gandiva/filter.cc index 416d97b5dbd1d..8a270cfdc06f2 100644 --- a/cpp/src/gandiva/filter.cc +++ b/cpp/src/gandiva/filter.cc @@ -65,8 +65,8 @@ Status Filter::Make(SchemaPtr schema, ConditionPtr condition, GandivaObjectCache obj_cache(cache, cache_key); // Build LLVM generator, and generate code for the specified expression - std::unique_ptr llvm_gen; - ARROW_RETURN_NOT_OK(LLVMGenerator::Make(configuration, is_cached, &llvm_gen)); + ARROW_ASSIGN_OR_RAISE(auto llvm_gen, + LLVMGenerator::Make(configuration, is_cached, obj_cache)); if (!is_cached) { // Run the validation on the expression. @@ -77,7 +77,7 @@ Status Filter::Make(SchemaPtr schema, ConditionPtr condition, } // Set the object cache for LLVM - llvm_gen->SetLLVMObjectCache(obj_cache); + ARROW_RETURN_NOT_OK(llvm_gen->SetLLVMObjectCache(obj_cache)); ARROW_RETURN_NOT_OK(llvm_gen->Build({condition}, SelectionVector::Mode::MODE_NONE)); @@ -119,7 +119,7 @@ Status Filter::Evaluate(const arrow::RecordBatch& batch, return out_selection->PopulateFromBitMap(result, bitmap_size, num_rows - 1); } -std::string Filter::DumpIR() { return llvm_generator_->DumpIR(); } +const std::string& Filter::DumpIR() { return llvm_generator_->ir(); } void Filter::SetBuiltFromCache(bool flag) { built_from_cache_ = flag; } diff --git a/cpp/src/gandiva/filter.h b/cpp/src/gandiva/filter.h index cc536bca1bb3d..b4043d93c857a 100644 --- a/cpp/src/gandiva/filter.h +++ b/cpp/src/gandiva/filter.h @@ -76,7 +76,7 @@ class GANDIVA_EXPORT Filter { Status Evaluate(const arrow::RecordBatch& batch, std::shared_ptr out_selection); - std::string DumpIR(); + const std::string& DumpIR(); void SetBuiltFromCache(bool flag); diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index 41cbe0ffe3a3a..62ebab08f4d6b 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -42,15 +42,15 @@ LLVMGenerator::LLVMGenerator(bool cached, function_registry_(std::move(function_registry)), enable_ir_traces_(false) {} -Status LLVMGenerator::Make(const std::shared_ptr& config, bool cached, - std::unique_ptr* llvm_generator) { - std::unique_ptr llvmgen_obj( +Result> LLVMGenerator::Make( + const std::shared_ptr& config, bool cached, + std::optional> object_cache) { + std::unique_ptr llvm_generator( new LLVMGenerator(cached, config->function_registry())); - ARROW_RETURN_NOT_OK(Engine::Make(config, cached, &(llvmgen_obj->engine_))); - *llvm_generator = std::move(llvmgen_obj); - - return Status::OK(); + ARROW_ASSIGN_OR_RAISE(llvm_generator->engine_, + Engine::Make(config, cached, object_cache)); + return llvm_generator; } std::shared_ptr>> @@ -62,8 +62,8 @@ LLVMGenerator::GetCache() { return shared_cache; } -void LLVMGenerator::SetLLVMObjectCache(GandivaObjectCache& object_cache) { - engine_->SetLLVMObjectCache(object_cache); +Status LLVMGenerator::SetLLVMObjectCache(GandivaObjectCache& object_cache) { + return engine_->SetLLVMObjectCache(object_cache); } Status LLVMGenerator::Add(const ExpressionPtr expr, const FieldDescriptorPtr output) { @@ -73,7 +73,7 @@ Status LLVMGenerator::Add(const ExpressionPtr expr, const FieldDescriptorPtr out ValueValidityPairPtr value_validity; ARROW_RETURN_NOT_OK(decomposer.Decompose(*expr->root(), &value_validity)); // Generate the IR function for the decomposed expression. - std::unique_ptr compiled_expr(new CompiledExpr(value_validity, output)); + auto compiled_expr = std::make_unique(value_validity, output); std::string fn_name = "expr_" + std::to_string(idx) + "_" + std::to_string(static_cast(selection_vector_mode_)); if (!cached_) { @@ -103,7 +103,8 @@ Status LLVMGenerator::Build(const ExpressionVector& exprs, SelectionVector::Mode // setup the jit functions for each expression. for (auto& compiled_expr : compiled_exprs_) { auto fn_name = compiled_expr->GetFunctionName(mode); - auto jit_fn = reinterpret_cast(engine_->CompiledFunction(fn_name)); + ARROW_ASSIGN_OR_RAISE(auto fn_ptr, engine_->CompiledFunction(fn_name)); + auto jit_fn = reinterpret_cast(fn_ptr); compiled_expr->SetJITFunction(selection_vector_mode_, jit_fn); } diff --git a/cpp/src/gandiva/llvm_generator.h b/cpp/src/gandiva/llvm_generator.h index 250ab78fbfe28..0c532998e8b83 100644 --- a/cpp/src/gandiva/llvm_generator.h +++ b/cpp/src/gandiva/llvm_generator.h @@ -18,7 +18,9 @@ #pragma once #include +#include #include +#include #include #include @@ -47,15 +49,17 @@ class FunctionHolder; class GANDIVA_EXPORT LLVMGenerator { public: /// \brief Factory method to initialize the generator. - static Status Make(const std::shared_ptr& config, bool cached, - std::unique_ptr* llvm_generator); + static Result> Make( + const std::shared_ptr& config, bool cached, + std::optional> object_cache = + std::nullopt); /// \brief Get the cache to be used for LLVM ObjectCache. static std::shared_ptr>> GetCache(); /// \brief Set LLVM ObjectCache. - void SetLLVMObjectCache(GandivaObjectCache& object_cache); + Status SetLLVMObjectCache(GandivaObjectCache& object_cache); /// \brief Build the code for the expression trees for default mode with a LLVM /// ObjectCache. Each element in the vector represents an expression tree @@ -79,7 +83,7 @@ class GANDIVA_EXPORT LLVMGenerator { SelectionVector::Mode selection_vector_mode() { return selection_vector_mode_; } LLVMTypes* types() { return engine_->types(); } llvm::Module* module() { return engine_->module(); } - std::string DumpIR() { return engine_->DumpIR(); } + const std::string& ir() { return engine_->ir(); } private: explicit LLVMGenerator(bool cached, diff --git a/cpp/src/gandiva/llvm_generator_test.cc b/cpp/src/gandiva/llvm_generator_test.cc index 853d8ae6c3b8d..79654e7b78c7e 100644 --- a/cpp/src/gandiva/llvm_generator_test.cc +++ b/cpp/src/gandiva/llvm_generator_test.cc @@ -47,8 +47,7 @@ class TestLLVMGenerator : public ::testing::Test { auto external_registry = std::make_shared(); auto config = config_factory(std::move(external_registry)); - std::unique_ptr generator; - ASSERT_OK(LLVMGenerator::Make(config, false, &generator)); + ASSERT_OK_AND_ASSIGN(auto generator, LLVMGenerator::Make(config, false)); auto module = generator->module(); ASSERT_OK(generator->engine_->LoadFunctionIRs()); @@ -58,8 +57,7 @@ class TestLLVMGenerator : public ::testing::Test { // Verify that a valid pc function exists for every function in the registry. TEST_F(TestLLVMGenerator, VerifyPCFunctions) { - std::unique_ptr generator; - ASSERT_OK(LLVMGenerator::Make(TestConfiguration(), false, &generator)); + ASSERT_OK_AND_ASSIGN(auto generator, LLVMGenerator::Make(TestConfiguration(), false)); llvm::Module* module = generator->module(); ASSERT_OK(generator->engine_->LoadFunctionIRs()); @@ -70,8 +68,8 @@ TEST_F(TestLLVMGenerator, VerifyPCFunctions) { TEST_F(TestLLVMGenerator, TestAdd) { // Setup LLVM generator to do an arithmetic add of two vectors - std::unique_ptr generator; - ASSERT_OK(LLVMGenerator::Make(TestConfiguration(), false, &generator)); + ASSERT_OK_AND_ASSIGN(auto generator, + LLVMGenerator::Make(TestConfigWithIrDumping(), false)); Annotator annotator; auto field0 = std::make_shared("f0", arrow::int32()); @@ -100,18 +98,22 @@ TEST_F(TestLLVMGenerator, TestAdd) { auto field_sum = std::make_shared("out", arrow::int32()); auto desc_sum = annotator.CheckAndAddInputFieldDescriptor(field_sum); - std::string fn_name = "codegen"; + // LLVM 10 doesn't like the expr function name to be the same as the module name when + // LLJIT is used + std::string fn_name = "llvm_gen_test_add_expr"; ASSERT_OK(generator->engine_->LoadFunctionIRs()); ASSERT_OK(generator->CodeGenExprValue(func_dex, 4, desc_sum, 0, fn_name, SelectionVector::MODE_NONE)); ASSERT_OK(generator->engine_->FinalizeModule()); - auto ir = generator->engine_->DumpIR(); + auto const& ir = generator->engine_->ir(); EXPECT_THAT(ir, testing::HasSubstr("vector.body")); - EvalFunc eval_func = (EvalFunc)generator->engine_->CompiledFunction(fn_name); + ASSERT_OK_AND_ASSIGN(auto fn_ptr, generator->engine_->CompiledFunction(fn_name)); + ASSERT_TRUE(fn_ptr); + auto eval_func = reinterpret_cast(fn_ptr); constexpr size_t kNumRecords = 4; std::array a0{1, 2, 3, 4}; std::array a1{5, 6, 7, 8}; @@ -126,6 +128,7 @@ TEST_F(TestLLVMGenerator, TestAdd) { reinterpret_cast(out.data()), reinterpret_cast(&out_bitmap), }; std::array addr_offsets{0, 0, 0, 0, 0, 0}; + eval_func(addrs.data(), addr_offsets.data(), nullptr, nullptr, nullptr, 0 /* dummy context ptr */, kNumRecords); diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index e717e825dfc71..ec0302146fff5 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -80,8 +80,8 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, GandivaObjectCache obj_cache(cache, cache_key); // Build LLVM generator, and generate code for the specified expressions - std::unique_ptr llvm_gen; - ARROW_RETURN_NOT_OK(LLVMGenerator::Make(configuration, is_cached, &llvm_gen)); + ARROW_ASSIGN_OR_RAISE(auto llvm_gen, + LLVMGenerator::Make(configuration, is_cached, obj_cache)); // Run the validation on the expressions. // Return if any of the expression is invalid since @@ -95,7 +95,7 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, } // Set the object cache for LLVM - llvm_gen->SetLLVMObjectCache(obj_cache); + ARROW_RETURN_NOT_OK(llvm_gen->SetLLVMObjectCache(obj_cache)); ARROW_RETURN_NOT_OK(llvm_gen->Build(exprs, selection_vector_mode)); @@ -281,7 +281,7 @@ Status Projector::ValidateArrayDataCapacity(const arrow::ArrayData& array_data, return Status::OK(); } -std::string Projector::DumpIR() { return llvm_generator_->DumpIR(); } +const std::string& Projector::DumpIR() { return llvm_generator_->ir(); } void Projector::SetBuiltFromCache(bool flag) { built_from_cache_ = flag; } diff --git a/cpp/src/gandiva/projector.h b/cpp/src/gandiva/projector.h index 6801a7c9f3f3c..f1ae7e4dc8ccd 100644 --- a/cpp/src/gandiva/projector.h +++ b/cpp/src/gandiva/projector.h @@ -118,7 +118,7 @@ class GANDIVA_EXPORT Projector { const SelectionVector* selection_vector, const ArrayDataVector& output) const; - std::string DumpIR(); + const std::string& DumpIR(); void SetBuiltFromCache(bool flag); diff --git a/cpp/src/gandiva/tests/micro_benchmarks.cc b/cpp/src/gandiva/tests/micro_benchmarks.cc index f126b769b2010..450e691323cae 100644 --- a/cpp/src/gandiva/tests/micro_benchmarks.cc +++ b/cpp/src/gandiva/tests/micro_benchmarks.cc @@ -16,6 +16,7 @@ // under the License. #include + #include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/testing/gtest_util.h" @@ -420,6 +421,35 @@ static void DoDecimalAdd2(benchmark::State& state, int32_t precision, int32_t sc ASSERT_OK(status); } +static void TimedTestExprCompilation(benchmark::State& state) { + int64_t iteration = 0; + for (auto _ : state) { + // schema for input fields + auto field0 = field("f0", int64()); + auto field1 = field("f1", int64()); + auto literal = TreeExprBuilder::MakeLiteral(iteration); + auto schema = arrow::schema({field0, field1}); + + // output field + auto field_add = field("c1", int64()); + auto field_less_than = field("c2", boolean()); + + // Build expression + auto add_func = TreeExprBuilder::MakeFunction( + "add", {TreeExprBuilder::MakeField(field0), literal}, int64()); + auto less_than_func = TreeExprBuilder::MakeFunction( + "less_than", {TreeExprBuilder::MakeField(field1), literal}, boolean()); + + auto expr_0 = TreeExprBuilder::MakeExpression(add_func, field_add); + auto expr_1 = TreeExprBuilder::MakeExpression(less_than_func, field_less_than); + + std::shared_ptr projector; + ASSERT_OK(Projector::Make(schema, {expr_0, expr_1}, TestConfiguration(), &projector)); + + ++iteration; + } +} + static void DecimalAdd2Fast(benchmark::State& state) { // use lesser precision to test the fast-path DoDecimalAdd2(state, DecimalTypeUtil::kMaxPrecision - 6, 18); @@ -460,6 +490,7 @@ static void DecimalAdd3Large(benchmark::State& state) { DoDecimalAdd3(state, DecimalTypeUtil::kMaxPrecision, 18, true); } +BENCHMARK(TimedTestExprCompilation)->Unit(benchmark::kMicrosecond); BENCHMARK(TimedTestAdd3)->Unit(benchmark::kMicrosecond); BENCHMARK(TimedTestBigNested)->Unit(benchmark::kMicrosecond); BENCHMARK(TimedTestExtractYear)->Unit(benchmark::kMicrosecond); diff --git a/cpp/src/gandiva/tests/test_util.cc b/cpp/src/gandiva/tests/test_util.cc index 959ea3cd7a446..2ee49ffae0ed6 100644 --- a/cpp/src/gandiva/tests/test_util.cc +++ b/cpp/src/gandiva/tests/test_util.cc @@ -30,6 +30,10 @@ std::shared_ptr TestConfiguration() { return ConfigurationBuilder::DefaultConfiguration(); } +std::shared_ptr TestConfigWithIrDumping() { + return ConfigurationBuilder().build_with_ir_dumping(true); +} + #ifndef GANDIVA_EXTENSION_TEST_DIR #define GANDIVA_EXTENSION_TEST_DIR "." #endif diff --git a/cpp/src/gandiva/tests/test_util.h b/cpp/src/gandiva/tests/test_util.h index 69d63732aeeaa..d8181fe67516c 100644 --- a/cpp/src/gandiva/tests/test_util.h +++ b/cpp/src/gandiva/tests/test_util.h @@ -98,6 +98,8 @@ static inline ArrayPtr MakeArrowTypeArray(const std::shared_ptr std::shared_ptr TestConfiguration(); +std::shared_ptr TestConfigWithIrDumping(); + // helper function to create a Configuration with an external function registered to the // given function registry std::shared_ptr TestConfigWithFunctionRegistry( diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index a49e58afbdb83..99978e283b46a 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1370,6 +1370,26 @@ class TypedRecordReader : public TypedColumnReaderImpl, return bytes_for_values; } + const void* ReadDictionary(int32_t* dictionary_length) override { + if (this->current_decoder_ == nullptr && !this->HasNextInternal()) { + dictionary_length = 0; + return nullptr; + } + // Verify the current data page is dictionary encoded. The current_encoding_ should + // have been set as RLE_DICTIONARY if the page encoding is RLE_DICTIONARY or + // PLAIN_DICTIONARY. + if (this->current_encoding_ != Encoding::RLE_DICTIONARY) { + std::stringstream ss; + ss << "Data page is not dictionary encoded. Encoding: " + << EncodingToString(this->current_encoding_); + throw ParquetException(ss.str()); + } + auto decoder = dynamic_cast*>(this->current_decoder_); + const T* dictionary = nullptr; + decoder->GetDictionary(&dictionary, dictionary_length); + return reinterpret_cast(dictionary); + } + int64_t ReadRecords(int64_t num_records) override { if (num_records == 0) return 0; // Delimit records, then read values at the end diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 334b8bcffe0b8..086f6c0e55806 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -368,6 +368,16 @@ class PARQUET_EXPORT RecordReader { virtual void DebugPrintState() = 0; + /// \brief Returns the dictionary owned by the current decoder. Throws an + /// exception if the current decoder is not for dictionary encoding. The caller is + /// responsible for casting the returned pointer to proper type depending on the + /// column's physical type. An example: + /// const ByteArray* dict = reinterpret_cast(ReadDictionary(&len)); + /// or: + /// const float* dict = reinterpret_cast(ReadDictionary(&len)); + /// \param[out] dictionary_length The number of dictionary entries. + virtual const void* ReadDictionary(int32_t* dictionary_length) = 0; + /// \brief Decoded definition levels int16_t* def_levels() const { return reinterpret_cast(def_levels_->mutable_data()); diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 1d972b78fb99c..b3dd1d6054ac8 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -54,6 +54,36 @@ using arrow::internal::AddWithOverflow; namespace parquet { +namespace { +bool IsColumnChunkFullyDictionaryEncoded(const ColumnChunkMetaData& col) { + // Check the encoding_stats to see if all data pages are dictionary encoded. + const std::vector& encoding_stats = col.encoding_stats(); + if (encoding_stats.empty()) { + // Some parquet files may have empty encoding_stats. In this case we are + // not sure whether all data pages are dictionary encoded. + return false; + } + // The 1st page should be the dictionary page. + if (encoding_stats[0].page_type != PageType::DICTIONARY_PAGE || + (encoding_stats[0].encoding != Encoding::PLAIN && + encoding_stats[0].encoding != Encoding::PLAIN_DICTIONARY)) { + return false; + } + // The following pages should be dictionary encoded data pages. + for (size_t idx = 1; idx < encoding_stats.size(); ++idx) { + if ((encoding_stats[idx].encoding != Encoding::RLE_DICTIONARY && + encoding_stats[idx].encoding != Encoding::PLAIN_DICTIONARY) || + (encoding_stats[idx].page_type != PageType::DATA_PAGE && + encoding_stats[idx].page_type != PageType::DATA_PAGE_V2)) { + // Return false if any following page is not a dictionary encoded data + // page. + return false; + } + } + return true; +} +} // namespace + // PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file static constexpr int64_t kDefaultFooterReadSize = 64 * 1024; static constexpr uint32_t kFooterSize = 8; @@ -82,7 +112,8 @@ std::shared_ptr RowGroupReader::Column(int i) { const_cast(contents_->properties())->memory_pool()); } -std::shared_ptr RowGroupReader::RecordReader(int i) { +std::shared_ptr RowGroupReader::RecordReader( + int i, bool read_dictionary) { if (i >= metadata()->num_columns()) { std::stringstream ss; ss << "Trying to read column index " << i << " but row group metadata has only " @@ -96,8 +127,8 @@ std::shared_ptr RowGroupReader::RecordReader(int i) { internal::LevelInfo level_info = internal::LevelInfo::ComputeLevelInfo(descr); auto reader = internal::RecordReader::Make( - descr, level_info, contents_->properties()->memory_pool(), - /* read_dictionary = */ false, contents_->properties()->read_dense_for_nullable()); + descr, level_info, contents_->properties()->memory_pool(), read_dictionary, + contents_->properties()->read_dense_for_nullable()); reader->SetPageReader(std::move(page_reader)); return reader; } @@ -106,41 +137,23 @@ std::shared_ptr RowGroupReader::ColumnWithExposeEncoding( int i, ExposedEncoding encoding_to_expose) { std::shared_ptr reader = Column(i); - if (encoding_to_expose == ExposedEncoding::DICTIONARY) { - // Check the encoding_stats to see if all data pages are dictionary encoded. - std::unique_ptr col = metadata()->ColumnChunk(i); - const std::vector& encoding_stats = col->encoding_stats(); - if (encoding_stats.empty()) { - // Some parquet files may have empty encoding_stats. In this case we are - // not sure whether all data pages are dictionary encoded. So we do not - // enable exposing dictionary. - return reader; - } - // The 1st page should be the dictionary page. - if (encoding_stats[0].page_type != PageType::DICTIONARY_PAGE || - (encoding_stats[0].encoding != Encoding::PLAIN && - encoding_stats[0].encoding != Encoding::PLAIN_DICTIONARY)) { - return reader; - } - // The following pages should be dictionary encoded data pages. - for (size_t idx = 1; idx < encoding_stats.size(); ++idx) { - if ((encoding_stats[idx].encoding != Encoding::RLE_DICTIONARY && - encoding_stats[idx].encoding != Encoding::PLAIN_DICTIONARY) || - (encoding_stats[idx].page_type != PageType::DATA_PAGE && - encoding_stats[idx].page_type != PageType::DATA_PAGE_V2)) { - return reader; - } - } - } else { - // Exposing other encodings are not supported for now. - return reader; + if (encoding_to_expose == ExposedEncoding::DICTIONARY && + IsColumnChunkFullyDictionaryEncoded(*metadata()->ColumnChunk(i))) { + // Set exposed encoding. + reader->SetExposedEncoding(encoding_to_expose); } - // Set exposed encoding. - reader->SetExposedEncoding(encoding_to_expose); return reader; } +std::shared_ptr RowGroupReader::RecordReaderWithExposeEncoding( + int i, ExposedEncoding encoding_to_expose) { + return RecordReader( + i, + /*read_dictionary=*/encoding_to_expose == ExposedEncoding::DICTIONARY && + IsColumnChunkFullyDictionaryEncoded(*metadata()->ColumnChunk(i))); +} + std::unique_ptr RowGroupReader::GetColumnPageReader(int i) { if (i >= metadata()->num_columns()) { std::stringstream ss; diff --git a/cpp/src/parquet/file_reader.h b/cpp/src/parquet/file_reader.h index da85b73fc2dfe..b59b59f95c2d8 100644 --- a/cpp/src/parquet/file_reader.h +++ b/cpp/src/parquet/file_reader.h @@ -64,7 +64,8 @@ class PARQUET_EXPORT RowGroupReader { // EXPERIMENTAL: Construct a RecordReader for the indicated column of the row group. // Ownership is shared with the RowGroupReader. - std::shared_ptr RecordReader(int i); + std::shared_ptr RecordReader(int i, + bool read_dictionary = false); // Construct a ColumnReader, trying to enable exposed encoding. // @@ -80,6 +81,18 @@ class PARQUET_EXPORT RowGroupReader { std::shared_ptr ColumnWithExposeEncoding( int i, ExposedEncoding encoding_to_expose); + // Construct a RecordReader, trying to enable exposed encoding. + // + // For dictionary encoding, currently we only support column chunks that are + // fully dictionary encoded byte arrays. The caller should verify if the reader can read + // and expose the dictionary by checking the reader's read_dictionary(). If a column + // chunk uses dictionary encoding but then falls back to plain encoding, the returned + // reader will read decoded data without exposing the dictionary. + // + // \note API EXPERIMENTAL + std::shared_ptr RecordReaderWithExposeEncoding( + int i, ExposedEncoding encoding_to_expose); + std::unique_ptr GetColumnPageReader(int i); private: diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc index 5223158e5f4f9..2c2b62f5d12f6 100644 --- a/cpp/src/parquet/reader_test.cc +++ b/cpp/src/parquet/reader_test.cc @@ -542,6 +542,83 @@ TEST(TestFileReader, GetRecordReader) { ASSERT_EQ(8, col_record_reader_->levels_written()); } +TEST(TestFileReader, RecordReaderWithExposingDictionary) { + const int num_rows = 1000; + + // Make schema + schema::NodeVector fields; + fields.push_back(PrimitiveNode::Make("field", Repetition::REQUIRED, Type::BYTE_ARRAY, + ConvertedType::NONE)); + auto schema = std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); + + // Write small batches and small data pages + std::shared_ptr writer_props = WriterProperties::Builder() + .write_batch_size(64) + ->data_pagesize(128) + ->enable_dictionary() + ->build(); + + ASSERT_OK_AND_ASSIGN(auto out_file, ::arrow::io::BufferOutputStream::Create()); + std::shared_ptr file_writer = + ParquetFileWriter::Open(out_file, schema, writer_props); + + RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + + // write one column + ::arrow::random::RandomArrayGenerator rag(0); + ByteArrayWriter* writer = static_cast(rg_writer->NextColumn()); + std::vector raw_unique_data = {"a", "bc", "defg"}; + std::vector col_typed; + for (int i = 0; i < num_rows; i++) { + std::string_view chosed_data = raw_unique_data[i % raw_unique_data.size()]; + col_typed.emplace_back(chosed_data); + } + writer->WriteBatch(num_rows, nullptr, nullptr, col_typed.data()); + rg_writer->Close(); + file_writer->Close(); + + // Open the reader + ASSERT_OK_AND_ASSIGN(auto file_buf, out_file->Finish()); + auto in_file = std::make_shared<::arrow::io::BufferReader>(file_buf); + + ReaderProperties reader_props; + reader_props.enable_buffered_stream(); + reader_props.set_buffer_size(64); + std::unique_ptr file_reader = + ParquetFileReader::Open(in_file, reader_props); + + auto row_group = file_reader->RowGroup(0); + auto record_reader = std::dynamic_pointer_cast( + row_group->RecordReaderWithExposeEncoding(0, ExposedEncoding::DICTIONARY)); + ASSERT_NE(record_reader, nullptr); + ASSERT_TRUE(record_reader->read_dictionary()); + + int32_t dict_len = 0; + auto dict = + reinterpret_cast(record_reader->ReadDictionary(&dict_len)); + ASSERT_NE(dict, nullptr); + ASSERT_EQ(dict_len, raw_unique_data.size()); + ASSERT_EQ(record_reader->ReadRecords(num_rows), num_rows); + std::shared_ptr<::arrow::ChunkedArray> result_array = record_reader->GetResult(); + ASSERT_EQ(result_array->num_chunks(), 1); + const std::shared_ptr<::arrow::Array> chunk = result_array->chunk(0); + auto dictionary_array = std::dynamic_pointer_cast<::arrow::DictionaryArray>(chunk); + const int32_t* indices = + (std::dynamic_pointer_cast<::arrow::Int32Array>(dictionary_array->indices())) + ->raw_values(); + + // Verify values based on the dictionary from ReadDictionary(). + int64_t indices_read = chunk->length(); + ASSERT_EQ(indices_read, num_rows); + for (int i = 0; i < indices_read; ++i) { + ASSERT_LT(indices[i], dict_len); + ASSERT_EQ(std::string_view(reinterpret_cast(dict[indices[i]].ptr), + dict[indices[i]].len), + col_typed[i]); + } +} + class TestLocalFile : public ::testing::Test { public: void SetUp() { @@ -1064,6 +1141,56 @@ TEST(TestFileReader, BufferedReadsWithDictionary) { } } +TEST(TestFileReader, PartiallyDictionaryEncodingNotExposed) { + const int num_rows = 1000; + + // Make schema + schema::NodeVector fields; + fields.push_back(PrimitiveNode::Make("field", Repetition::REQUIRED, Type::DOUBLE, + ConvertedType::NONE)); + auto schema = std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, fields)); + + // Write small batches and small data pages. Explicitly set the dictionary page size + // limit such that the column chunk will not be fully dictionary encoded. + std::shared_ptr writer_props = WriterProperties::Builder() + .write_batch_size(64) + ->data_pagesize(128) + ->enable_dictionary() + ->dictionary_pagesize_limit(4) + ->build(); + + ASSERT_OK_AND_ASSIGN(auto out_file, ::arrow::io::BufferOutputStream::Create()); + std::shared_ptr file_writer = + ParquetFileWriter::Open(out_file, schema, writer_props); + + RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + + // write one column + ::arrow::random::RandomArrayGenerator rag(0); + DoubleWriter* writer = static_cast(rg_writer->NextColumn()); + std::shared_ptr<::arrow::Array> col = rag.Float64(num_rows, 0, 100); + const auto& col_typed = static_cast(*col); + writer->WriteBatch(num_rows, nullptr, nullptr, col_typed.raw_values()); + rg_writer->Close(); + file_writer->Close(); + + // Open the reader + ASSERT_OK_AND_ASSIGN(auto file_buf, out_file->Finish()); + auto in_file = std::make_shared<::arrow::io::BufferReader>(file_buf); + + ReaderProperties reader_props; + reader_props.enable_buffered_stream(); + reader_props.set_buffer_size(64); + std::unique_ptr file_reader = + ParquetFileReader::Open(in_file, reader_props); + + auto row_group = file_reader->RowGroup(0); + auto col_reader = std::static_pointer_cast( + row_group->ColumnWithExposeEncoding(0, ExposedEncoding::DICTIONARY)); + EXPECT_NE(col_reader->GetExposedEncoding(), ExposedEncoding::DICTIONARY); +} + TEST(TestFileReader, BufferedReads) { // PARQUET-1636: Buffered reads were broken before introduction of // RandomAccessFile::GetStream diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 01cb836ea2a86..e9df0c8d7566b 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -90,8 +90,8 @@ ARROW_OPENTELEMETRY_BUILD_VERSION=v1.8.1 ARROW_OPENTELEMETRY_BUILD_SHA256_CHECKSUM=3d640201594b07f08dade9cd1017bd0b59674daca26223b560b9bb6bf56264c2 ARROW_OPENTELEMETRY_PROTO_BUILD_VERSION=v0.17.0 ARROW_OPENTELEMETRY_PROTO_BUILD_SHA256_CHECKSUM=f269fbcb30e17b03caa1decd231ce826e59d7651c0f71c3b28eb5140b4bb5412 -ARROW_ORC_BUILD_VERSION=1.9.0 -ARROW_ORC_BUILD_SHA256_CHECKSUM=0dca8bbccdb2ee87e59ba964933436beebd02ea78c4134424828a8127fbc4faa +ARROW_ORC_BUILD_VERSION=1.9.2 +ARROW_ORC_BUILD_SHA256_CHECKSUM=7f46f2c184ecefd6791f1a53fb062286818bd8710c3f08b94dd3cac365e240ee ARROW_PROTOBUF_BUILD_VERSION=v21.3 ARROW_PROTOBUF_BUILD_SHA256_CHECKSUM=2f723218f6cb709ae4cdc4fb5ed56a5951fc5d466f0128ce4c946b8c78c8c49f # Because of https://github.com/Tencent/rapidjson/pull/1323, we require diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index ed6ea08894f10..04faef427e281 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1032,6 +1032,8 @@ tasks: ci: github template: docker-tests/github.linux.yml params: + env: + UBUNTU: 22.04 image: {{ image }} {% endfor %} diff --git a/docs/source/developers/java/development.rst b/docs/source/developers/java/development.rst index f7b19d73da2e2..261cd5702ae07 100644 --- a/docs/source/developers/java/development.rst +++ b/docs/source/developers/java/development.rst @@ -42,6 +42,7 @@ Unit Testing Unit tests are run by Maven during the build. To speed up the build, you can skip them by passing -DskipTests. + .. code-block:: $ cd arrow/java diff --git a/go/arrow/array/binary.go b/go/arrow/array/binary.go index c226297da04c6..9e26de7a6d820 100644 --- a/go/arrow/array/binary.go +++ b/go/arrow/array/binary.go @@ -30,6 +30,7 @@ import ( type BinaryLike interface { arrow.Array + ValueLen(int) int ValueBytes() []byte ValueOffset64(int) int64 } @@ -367,6 +368,11 @@ func (a *BinaryView) Value(i int) []byte { return buf.Bytes()[start : start+int32(s.Len())] } +func (a *BinaryView) ValueLen(i int) int { + s := a.ValueHeader(i) + return s.Len() +} + // ValueString returns the value at index i as a string instead of // a byte slice, without copying the underlying data. func (a *BinaryView) ValueString(i int) string { @@ -441,4 +447,7 @@ var ( _ arrow.Array = (*Binary)(nil) _ arrow.Array = (*LargeBinary)(nil) _ arrow.Array = (*BinaryView)(nil) + + _ BinaryLike = (*Binary)(nil) + _ BinaryLike = (*LargeBinary)(nil) ) diff --git a/go/arrow/array/string.go b/go/arrow/array/string.go index 90a4628f0d0fb..c8517ba3056df 100644 --- a/go/arrow/array/string.go +++ b/go/arrow/array/string.go @@ -31,6 +31,7 @@ import ( type StringLike interface { arrow.Array Value(int) string + ValueLen(int) int } // String represents an immutable sequence of variable-length UTF-8 strings. @@ -225,6 +226,14 @@ func (a *LargeString) ValueOffset64(i int) int64 { return a.ValueOffset(i) } +func (a *LargeString) ValueLen(i int) int { + if i < 0 || i >= a.array.data.length { + panic("arrow/array: index out of range") + } + beg := a.array.data.offset + i + return int(a.offsets[beg+1] - a.offsets[beg]) +} + func (a *LargeString) ValueOffsets() []int64 { beg := a.array.data.offset end := beg + a.array.data.length + 1 @@ -364,6 +373,11 @@ func (a *StringView) Value(i int) string { return *(*string)(unsafe.Pointer(&value)) } +func (a *StringView) ValueLen(i int) int { + s := a.ValueHeader(i) + return s.Len() +} + func (a *StringView) String() string { var o strings.Builder o.WriteString("[") @@ -698,4 +712,7 @@ var ( _ StringLikeBuilder = (*StringBuilder)(nil) _ StringLikeBuilder = (*LargeStringBuilder)(nil) _ StringLikeBuilder = (*StringViewBuilder)(nil) + _ StringLike = (*String)(nil) + _ StringLike = (*LargeString)(nil) + _ StringLike = (*StringView)(nil) ) diff --git a/java/adapter/orc/pom.xml b/java/adapter/orc/pom.xml index 803ae5a33826f..a42a458e2072a 100644 --- a/java/adapter/orc/pom.xml +++ b/java/adapter/orc/pom.xml @@ -34,7 +34,7 @@ org.apache.orc orc-core - 1.9.0 + 1.9.2 test diff --git a/java/dataset/pom.xml b/java/dataset/pom.xml index b533a1733521b..7d6092743bf4d 100644 --- a/java/dataset/pom.xml +++ b/java/dataset/pom.xml @@ -27,7 +27,7 @@ ../../../cpp/release-build/ 2.5.0 1.11.0 - 1.8.2 + 1.11.3 diff --git a/java/maven/pom.xml b/java/maven/pom.xml index 86ac402732bc4..0923984c8e5e5 100644 --- a/java/maven/pom.xml +++ b/java/maven/pom.xml @@ -281,6 +281,27 @@ + + + org.apache.maven.plugins + maven-assembly-plugin + + + package + + single + + + + + + src + + + diff --git a/java/pom.xml b/java/pom.xml index 523e5642720cd..522ee4abc7669 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -32,7 +32,7 @@ 1.9.0 5.10.1 2.0.9 - 32.1.3-jre + 33.0.0-jre 4.1.104.Final 1.60.0 3.23.1 diff --git a/js/examples/read_file.html b/js/examples/read_file.html index 1013fbe79ef65..cd4d58f542756 100644 --- a/js/examples/read_file.html +++ b/js/examples/read_file.html @@ -41,7 +41,7 @@ } reader.onload = function (evt) { - var arrowTable = Arrow.Table.from([new Uint8Array(evt.target.result)]); + var arrowTable = Arrow.tableFromIPC(evt.target.result); var thead = document.getElementById("thead"); var tbody = document.getElementById("tbody"); diff --git a/js/src/builder/largebinary.ts b/js/src/builder/largebinary.ts index 59aa7144d20a1..f737349ac1c49 100644 --- a/js/src/builder/largebinary.ts +++ b/js/src/builder/largebinary.ts @@ -24,7 +24,7 @@ import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; export class LargeBinaryBuilder extends VariableWidthBuilder { constructor(opts: BuilderOptions) { super(opts); - this._values = new BufferBuilder(new Uint8Array(0)); + this._values = new BufferBuilder(Uint8Array); } public get byteLength(): number { let size = this._pendingLength + (this.length * 4); diff --git a/js/src/table.ts b/js/src/table.ts index 58518257b30cb..e719b7ca9d313 100644 --- a/js/src/table.ts +++ b/js/src/table.ts @@ -73,6 +73,8 @@ export class Table { constructor(...batches: readonly RecordBatch[]); constructor(...columns: { [P in keyof T]: Vector }[]); constructor(...columns: { [P in keyof T]: Data | DataProps }[]); + constructor(schema: Schema, ...columns: { [P in keyof T]: Vector }[]); + constructor(schema: Schema, ...columns: { [P in keyof T]: Data | DataProps }[]); constructor(schema: Schema, data?: RecordBatch | RecordBatch[]); constructor(schema: Schema, data?: RecordBatch | RecordBatch[], offsets?: Uint32Array); constructor(...args: any[]) { @@ -112,8 +114,8 @@ export class Table { } else if (typeof x === 'object') { const keys = Object.keys(x) as (keyof T)[]; const vecs = keys.map((k) => new Vector([x[k]])); - const schema = new Schema(keys.map((k, i) => new Field(String(k), vecs[i].type, vecs[i].nullCount > 0))); - const [, batches] = distributeVectorsIntoRecordBatches(schema, vecs); + const batchSchema = schema ?? new Schema(keys.map((k, i) => new Field(String(k), vecs[i].type, vecs[i].nullable))); + const [, batches] = distributeVectorsIntoRecordBatches(batchSchema, vecs); return batches.length === 0 ? [new RecordBatch(x)] : batches; } } diff --git a/js/src/util/chunk.ts b/js/src/util/chunk.ts index 6098b04243422..36620627f197d 100644 --- a/js/src/util/chunk.ts +++ b/js/src/util/chunk.ts @@ -51,6 +51,11 @@ export class ChunkedIterator implements IterableIterator(chunks: ReadonlyArray>) { + return chunks.some(chunk => chunk.nullable); +} + /** @ignore */ export function computeChunkNullCounts(chunks: ReadonlyArray>) { return chunks.reduce((nullCount, chunk) => nullCount + chunk.nullCount, 0); diff --git a/js/src/vector.ts b/js/src/vector.ts index 7e1caa343562c..8b94b14e3fff7 100644 --- a/js/src/vector.ts +++ b/js/src/vector.ts @@ -24,6 +24,7 @@ import { BigIntArray, TypedArray, TypedArrayDataType } from './interfaces.js'; import { isChunkedValid, computeChunkOffsets, + computeChunkNullable, computeChunkNullCounts, sliceChunks, wrapChunkedCall1, @@ -132,6 +133,13 @@ export class Vector { return this.data.reduce((byteLength, data) => byteLength + data.byteLength, 0); } + /** + * Whether this Vector's elements can contain null values. + */ + public get nullable() { + return computeChunkNullable(this.data); + } + /** * The number of null elements in this Vector. */ diff --git a/js/test/unit/table-tests.ts b/js/test/unit/table-tests.ts index 6b34124abcaba..ffda47f473368 100644 --- a/js/test/unit/table-tests.ts +++ b/js/test/unit/table-tests.ts @@ -139,10 +139,29 @@ describe(`Table`, () => { const i32 = makeVector([i32s]); expect(i32).toHaveLength(i32s.length); expect(i32.nullCount).toBe(0); + expect(i32.nullable).toBe(true); const table = new Table({ i32 }); const i32Field = table.schema.fields[0]; + expect(i32Field.name).toBe('i32'); + expect(i32).toHaveLength(i32s.length); + expect(i32Field.nullable).toBe(true); + expect(i32.nullCount).toBe(0); + + expect(i32).toEqualVector(makeVector(i32s)); + }); + + test(`creates a new Table from a Typed Array and force not nullable`, () => { + const i32s = new Int32Array(arange(new Array(10))); + const i32 = makeVector([i32s]); + expect(i32).toHaveLength(i32s.length); + expect(i32.nullCount).toBe(0); + expect(i32.nullable).toBe(true); + + const table = new Table(new Schema([new Field('i32', new Int32, false)]), { i32 }); + const i32Field = table.schema.fields[0]; + expect(i32Field.name).toBe('i32'); expect(i32).toHaveLength(i32s.length); expect(i32Field.nullable).toBe(false); @@ -170,8 +189,8 @@ describe(`Table`, () => { expect(f32Field.name).toBe('f32'); expect(i32).toHaveLength(i32s.length); expect(f32).toHaveLength(f32s.length); - expect(i32Field.nullable).toBe(false); - expect(f32Field.nullable).toBe(false); + expect(i32Field.nullable).toBe(true); + expect(f32Field.nullable).toBe(true); expect(i32.nullCount).toBe(0); expect(f32.nullCount).toBe(0); @@ -205,7 +224,7 @@ describe(`Table`, () => { expect(i32Vector).toHaveLength(i32s.length); expect(f32Vector).toHaveLength(i32s.length); // new length should be the same as the longest sibling - expect(i32Field.nullable).toBe(false); + expect(i32Field.nullable).toBe(true); expect(f32Field.nullable).toBe(true); // true, with 12 additional nulls expect(i32Vector.nullCount).toBe(0); expect(f32Vector.nullCount).toBe(i32s.length - f32s.length); @@ -247,7 +266,7 @@ describe(`Table`, () => { expect(f32RenamedField.name).toBe('f32Renamed'); expect(i32Renamed).toHaveLength(i32s.length); expect(f32Renamed).toHaveLength(i32s.length); // new length should be the same as the longest sibling - expect(i32RenamedField.nullable).toBe(false); + expect(i32RenamedField.nullable).toBe(true); expect(f32RenamedField.nullable).toBe(true); // true, with 4 additional nulls expect(i32Renamed.nullCount).toBe(0); expect(f32Renamed.nullCount).toBe(i32s.length - f32s.length); diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 74a196002bfa6..751dfbcce4342 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2484,7 +2484,7 @@ cdef class MapArray(ListArray): Examples -------- - First, let's understand the structure of our dataset when viewed in a rectangular data model. + First, let's understand the structure of our dataset when viewed in a rectangular data model. The total of 5 respondents answered the question "How much did you like the movie x?". The value -1 in the integer array means that the value is missing. The boolean array represents the null bitmask corresponding to the missing values in the integer array. @@ -2590,7 +2590,7 @@ cdef class FixedSizeListArray(BaseListArray): """ @staticmethod - def from_arrays(values, list_size=None, DataType type=None): + def from_arrays(values, list_size=None, DataType type=None, mask=None): """ Construct FixedSizeListArray from array of values and a list length. @@ -2602,6 +2602,9 @@ cdef class FixedSizeListArray(BaseListArray): type : DataType, optional If not specified, a default ListType with the values' type and `list_size` length is used. + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + Returns ------- @@ -2652,19 +2655,21 @@ cdef class FixedSizeListArray(BaseListArray): _values = asarray(values) + c_mask = c_mask_inverted_from_obj(mask, None) + if type is not None: if list_size is not None: raise ValueError("Cannot specify both list_size and type") with nogil: c_result = CFixedSizeListArray.FromArraysAndType( - _values.sp_array, type.sp_type) + _values.sp_array, type.sp_type, c_mask) else: if list_size is None: raise ValueError("Should specify one of list_size and type") _list_size = list_size with nogil: c_result = CFixedSizeListArray.FromArrays( - _values.sp_array, _list_size) + _values.sp_array, _list_size, c_mask) cdef Array result = pyarrow_wrap_array(GetResultValue(c_result)) result.validate() return result diff --git a/python/pyarrow/gandiva.pyx b/python/pyarrow/gandiva.pyx index 35bbf5018f08a..2202ec64f2962 100644 --- a/python/pyarrow/gandiva.pyx +++ b/python/pyarrow/gandiva.pyx @@ -36,6 +36,7 @@ from pyarrow.includes.libgandiva cimport ( CNode, CProjector, CFilter, CSelectionVector, _ensure_selection_mode, + CConfiguration, CConfigurationBuilder, TreeExprBuilder_MakeExpression, TreeExprBuilder_MakeFunction, @@ -583,9 +584,47 @@ cdef class TreeExprBuilder(_Weakrefable): condition.node) return Condition.create(r) +cdef class Configuration(_Weakrefable): + cdef: + shared_ptr[CConfiguration] configuration + + def __cinit__(self, bint optimize=True, bint dump_ir=False): + """ + Initialize the configuration with specified options. + + Parameters + ---------- + optimize : bool, default True + Whether to enable optimizations. + dump_ir : bool, default False + Whether to dump LLVM IR. + """ + self.configuration = CConfigurationBuilder().build() + self.configuration.get().set_optimize(optimize) + self.configuration.get().set_dump_ir(dump_ir) + + @staticmethod + cdef create(shared_ptr[CConfiguration] configuration): + """ + Create a Configuration instance from an existing CConfiguration pointer. + + Parameters + ---------- + configuration : shared_ptr[CConfiguration] + Existing CConfiguration pointer. + + Returns + ------- + Configuration instance + """ + cdef Configuration self = Configuration.__new__(Configuration) + self.configuration = configuration + return self + cpdef make_projector(Schema schema, children, MemoryPool pool, - str selection_mode="NONE"): + str selection_mode="NONE", + Configuration configuration=None): """ Construct a projection using expressions. @@ -602,6 +641,8 @@ cpdef make_projector(Schema schema, children, MemoryPool pool, Memory pool used to allocate output arrays. selection_mode : str, default "NONE" Possible values are NONE, UINT16, UINT32, UINT64. + configuration : pyarrow.gandiva.Configuration, default None + Configuration for the projector. Returns ------- @@ -612,6 +653,9 @@ cpdef make_projector(Schema schema, children, MemoryPool pool, c_vector[shared_ptr[CGandivaExpression]] c_children shared_ptr[CProjector] result + if configuration is None: + configuration = Configuration() + for child in children: if child is None: raise TypeError("Expressions must not be None") @@ -620,12 +664,13 @@ cpdef make_projector(Schema schema, children, MemoryPool pool, check_status( Projector_Make(schema.sp_schema, c_children, _ensure_selection_mode(selection_mode), - CConfigurationBuilder.DefaultConfiguration(), + configuration.configuration, &result)) return Projector.create(result, pool) -cpdef make_filter(Schema schema, Condition condition): +cpdef make_filter(Schema schema, Condition condition, + Configuration configuration=None): """ Construct a filter based on a condition. @@ -638,6 +683,8 @@ cpdef make_filter(Schema schema, Condition condition): Schema for the record batches, and the condition. condition : pyarrow.gandiva.Condition Filter condition. + configuration : pyarrow.gandiva.Configuration, default None + Configuration for the filter. Returns ------- @@ -646,8 +693,12 @@ cpdef make_filter(Schema schema, Condition condition): cdef shared_ptr[CFilter] result if condition is None: raise TypeError("Condition must not be None") + + if configuration is None: + configuration = Configuration() + check_status( - Filter_Make(schema.sp_schema, condition.condition, &result)) + Filter_Make(schema.sp_schema, condition.condition, configuration.configuration, &result)) return Filter.create(result) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index bad5ec606c268..82b888f584813 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -673,11 +673,15 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CFixedSizeListArray" arrow::FixedSizeListArray"(CArray): @staticmethod CResult[shared_ptr[CArray]] FromArrays( - const shared_ptr[CArray]& values, int32_t list_size) + const shared_ptr[CArray]& values, + int32_t list_size, + shared_ptr[CBuffer] null_bitmap) @staticmethod CResult[shared_ptr[CArray]] FromArraysAndType" FromArrays"( - const shared_ptr[CArray]& values, shared_ptr[CDataType]) + const shared_ptr[CArray]& values, + shared_ptr[CDataType], + shared_ptr[CBuffer] null_bitmap) int64_t value_offset(int i) int64_t value_length(int i) diff --git a/python/pyarrow/includes/libgandiva.pxd b/python/pyarrow/includes/libgandiva.pxd index fa3b72bad61be..7d76576bef2b9 100644 --- a/python/pyarrow/includes/libgandiva.pxd +++ b/python/pyarrow/includes/libgandiva.pxd @@ -252,6 +252,7 @@ cdef extern from "gandiva/filter.h" namespace "gandiva" nogil: cdef CStatus Filter_Make \ "gandiva::Filter::Make"( shared_ptr[CSchema] schema, shared_ptr[CCondition] condition, + shared_ptr[CConfiguration] configuration, shared_ptr[CFilter]* filter) cdef extern from "gandiva/function_signature.h" namespace "gandiva" nogil: @@ -278,9 +279,20 @@ cdef extern from "gandiva/expression_registry.h" namespace "gandiva" nogil: cdef extern from "gandiva/configuration.h" namespace "gandiva" nogil: cdef cppclass CConfiguration" gandiva::Configuration": - pass + + CConfiguration() + + CConfiguration(bint optimize, bint dump_ir) + + void set_optimize(bint optimize) + + void set_dump_ir(bint dump_ir) cdef cppclass CConfigurationBuilder \ " gandiva::ConfigurationBuilder": @staticmethod shared_ptr[CConfiguration] DefaultConfiguration() + + CConfigurationBuilder() + + shared_ptr[CConfiguration] build() diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 599d15d023a55..d598630dc2103 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -1091,6 +1091,16 @@ def test_fixed_size_list_from_arrays(): assert result.type.equals(typ) assert result.type.value_field.name == "name" + result = pa.FixedSizeListArray.from_arrays(values, + type=typ, + mask=pa.array([False, True, False])) + assert result.to_pylist() == [[0, 1, 2, 3], None, [8, 9, 10, 11]] + + result = pa.FixedSizeListArray.from_arrays(values, + list_size=4, + mask=pa.array([False, True, False])) + assert result.to_pylist() == [[0, 1, 2, 3], None, [8, 9, 10, 11]] + # raise on invalid values / list_size with pytest.raises(ValueError): pa.FixedSizeListArray.from_arrays(values, -4) diff --git a/python/pyarrow/tests/test_gandiva.py b/python/pyarrow/tests/test_gandiva.py index 241cac4d83db4..80d119a48530d 100644 --- a/python/pyarrow/tests/test_gandiva.py +++ b/python/pyarrow/tests/test_gandiva.py @@ -47,8 +47,9 @@ def test_tree_exp_builder(): assert expr.result().type == pa.int32() + config = gandiva.Configuration(dump_ir=True) projector = gandiva.make_projector( - schema, [expr], pa.default_memory_pool()) + schema, [expr], pa.default_memory_pool(), "NONE", config) # Gandiva generates compute kernel function named `@expr_X` assert projector.llvm_ir.find("@expr_") != -1 @@ -104,7 +105,8 @@ def test_filter(): assert condition.result().type == pa.bool_() - filter = gandiva.make_filter(table.schema, condition) + config = gandiva.Configuration(dump_ir=True) + filter = gandiva.make_filter(table.schema, condition, config) # Gandiva generates compute kernel function named `@expr_X` assert filter.llvm_ir.find("@expr_") != -1 diff --git a/ruby/red-arrow/lib/arrow/table-loader.rb b/ruby/red-arrow/lib/arrow/table-loader.rb index 308eb16a37ad0..450be3fbe09ff 100644 --- a/ruby/red-arrow/lib/arrow/table-loader.rb +++ b/ruby/red-arrow/lib/arrow/table-loader.rb @@ -252,7 +252,13 @@ def load_as_feather def load_as_json open_input_stream do |input| - reader = JSONReader.new(input) + options = JSONReadOptions.new + @options.each do |key, value| + next if value.nil? + setter = :"#{key}=" + options.__send__(setter, value) if options.respond_to?(setter) + end + reader = JSONReader.new(input, options) table = reader.read table.refer_input(input) table diff --git a/ruby/red-arrow/test/helper.rb b/ruby/red-arrow/test/helper.rb index 7fa6764dd40c2..42732a5954a6d 100644 --- a/ruby/red-arrow/test/helper.rb +++ b/ruby/red-arrow/test/helper.rb @@ -18,6 +18,7 @@ require "arrow" require "fiddle" +require "json" require "pathname" require "tempfile" require "timeout" diff --git a/ruby/red-arrow/test/test-table.rb b/ruby/red-arrow/test/test-table.rb index 7c372bd44f14a..883cf70c269bb 100644 --- a/ruby/red-arrow/test/test-table.rb +++ b/ruby/red-arrow/test/test-table.rb @@ -677,6 +677,31 @@ def test_tsv format: :tsv, schema: @table.schema)) end + + def test_json + output = create_output(".json") + # TODO: Implement this. + # @table.save(output, format: :json) + columns = "" + @table.each_record.each do |record| + column = { + "count" => record.count, + "visible" => record.visible, + } + columns << column.to_json + columns << "\n" + end + if output.is_a?(String) + File.write(output, columns) + else + output.resize(columns.bytesize) + output.set_data(0, columns) + end + assert_equal(@table, + Arrow::Table.load(output, + format: :json, + schema: @table.schema)) + end end sub_test_case("path") do