diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 2e3c2a355a884..3d4fb10b10c39 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -340,8 +340,6 @@ jobs: fail-fast: false matrix: include: - - msystem_lower: mingw32 - msystem_upper: MINGW32 - msystem_lower: mingw64 msystem_upper: MINGW64 - msystem_lower: clang64 diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh index 8d818346faa6e..341c2dd0577ef 100755 --- a/ci/scripts/python_test.sh +++ b/ci/scripts/python_test.sh @@ -45,6 +45,7 @@ export ARROW_DEBUG_MEMORY_POOL=trap : ${PYARROW_TEST_HDFS:=${ARROW_HDFS:-ON}} : ${PYARROW_TEST_ORC:=${ARROW_ORC:-ON}} : ${PYARROW_TEST_PARQUET:=${ARROW_PARQUET:-ON}} +: ${PYARROW_TEST_PARQUET_ENCRYPTION:=${PARQUET_REQUIRE_ENCRYPTION:-ON}} : ${PYARROW_TEST_S3:=${ARROW_S3:-ON}} export PYARROW_TEST_ACERO @@ -56,6 +57,7 @@ export PYARROW_TEST_GCS export PYARROW_TEST_HDFS export PYARROW_TEST_ORC export PYARROW_TEST_PARQUET +export PYARROW_TEST_PARQUET_ENCRYPTION export PYARROW_TEST_S3 # Testing PyArrow diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh index a6cc3bb7b29b7..01250ff7ef40c 100755 --- a/ci/scripts/python_wheel_unix_test.sh +++ b/ci/scripts/python_wheel_unix_test.sh @@ -46,6 +46,7 @@ export PYARROW_TEST_HDFS=ON export PYARROW_TEST_ORC=ON export PYARROW_TEST_PANDAS=ON export PYARROW_TEST_PARQUET=ON +export PYARROW_TEST_PARQUET_ENCRYPTION=ON export PYARROW_TEST_SUBSTRAIT=${ARROW_SUBSTRAIT} export PYARROW_TEST_S3=${ARROW_S3} export PYARROW_TEST_TENSORFLOW=ON diff --git a/ci/scripts/python_wheel_windows_test.bat b/ci/scripts/python_wheel_windows_test.bat index c73b0cfd1b9bd..b14bfddfb36d3 100755 --- a/ci/scripts/python_wheel_windows_test.bat +++ b/ci/scripts/python_wheel_windows_test.bat @@ -26,6 +26,7 @@ set PYARROW_TEST_GCS=ON set PYARROW_TEST_HDFS=ON set PYARROW_TEST_ORC=OFF set PYARROW_TEST_PARQUET=ON +set PYARROW_TEST_PARQUET_ENCRYPTION=ON set PYARROW_TEST_SUBSTRAIT=ON set PYARROW_TEST_S3=OFF set PYARROW_TEST_TENSORFLOW=ON diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 89d046945e5fe..3f327ed64ff00 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -1015,6 +1015,10 @@ else() endif() include(FetchContent) +set(FC_DECLARE_COMMON_OPTIONS) +if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.28) + list(APPEND FC_DECLARE_COMMON_OPTIONS EXCLUDE_FROM_ALL TRUE) +endif() macro(prepare_fetchcontent) set(BUILD_SHARED_LIBS OFF) @@ -2146,6 +2150,9 @@ function(build_gtest) message(STATUS "Building gtest from source") set(GTEST_VENDORED TRUE) fetchcontent_declare(googletest + # We should not specify "EXCLUDE_FROM_ALL TRUE" here. + # Because we install GTest with custom path. 
+ # ${FC_DECLARE_COMMON_OPTIONS} URL ${GTEST_SOURCE_URL} URL_HASH "SHA256=${ARROW_GTEST_BUILD_SHA256_CHECKSUM}") prepare_fetchcontent() @@ -5096,8 +5103,7 @@ function(build_azure_sdk) endif() message(STATUS "Building Azure SDK for C++ from source") fetchcontent_declare(azure_sdk - # EXCLUDE_FROM_ALL is available since CMake 3.28 - # EXCLUDE_FROM_ALL TRUE + ${FC_DECLARE_COMMON_OPTIONS} URL ${ARROW_AZURE_SDK_URL} URL_HASH "SHA256=${ARROW_AZURE_SDK_BUILD_SHA256_CHECKSUM}") prepare_fetchcontent() diff --git a/cpp/examples/arrow/compute_and_write_csv_example.cc b/cpp/examples/arrow/compute_and_write_csv_example.cc index edf21e45b2bb7..7e0f6cdf1ce16 100644 --- a/cpp/examples/arrow/compute_and_write_csv_example.cc +++ b/cpp/examples/arrow/compute_and_write_csv_example.cc @@ -16,7 +16,7 @@ // under the License. #include -#include +#include #include #include #include diff --git a/cpp/src/arrow/acero/aggregate_internal.cc b/cpp/src/arrow/acero/aggregate_internal.cc index 3cd5491720dcd..9c4b7fe5ae98c 100644 --- a/cpp/src/arrow/acero/aggregate_internal.cc +++ b/cpp/src/arrow/acero/aggregate_internal.cc @@ -25,6 +25,7 @@ #include "arrow/acero/exec_plan.h" #include "arrow/acero/options.h" #include "arrow/compute/exec.h" +#include "arrow/compute/function.h" #include "arrow/compute/registry.h" #include "arrow/compute/row/grouper.h" #include "arrow/datum.h" diff --git a/cpp/src/arrow/acero/scalar_aggregate_node.cc b/cpp/src/arrow/acero/scalar_aggregate_node.cc index ae59aa692096a..c7805f4d24eb2 100644 --- a/cpp/src/arrow/acero/scalar_aggregate_node.cc +++ b/cpp/src/arrow/acero/scalar_aggregate_node.cc @@ -25,6 +25,7 @@ #include "arrow/acero/options.h" #include "arrow/acero/util.h" #include "arrow/compute/exec.h" +#include "arrow/compute/function.h" #include "arrow/compute/registry.h" #include "arrow/compute/row/grouper.h" #include "arrow/datum.h" diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index f29f164d19973..edd443adc43c4 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -451,6 +451,7 @@ struct ARROW_EXPORT ArraySpan { util::span GetSpan(int i, int64_t length) const { const int64_t buffer_length = buffers[i].size / static_cast(sizeof(T)); assert(i > 0 && length + offset <= buffer_length); + ARROW_UNUSED(buffer_length); return util::span(buffers[i].data_as() + this->offset, length); } @@ -466,6 +467,7 @@ struct ARROW_EXPORT ArraySpan { util::span GetSpan(int i, int64_t length) { const int64_t buffer_length = buffers[i].size / static_cast(sizeof(T)); assert(i > 0 && length + offset <= buffer_length); + ARROW_UNUSED(buffer_length); return util::span(buffers[i].mutable_data_as() + this->offset, length); } diff --git a/cpp/src/arrow/compute/api.h b/cpp/src/arrow/compute/api.h index 5b5dfdf69eb94..b701d9928691f 100644 --- a/cpp/src/arrow/compute/api.h +++ b/cpp/src/arrow/compute/api.h @@ -20,18 +20,23 @@ #pragma once +/// \defgroup compute-functions Abstract compute function API +/// @{ +/// @} + /// \defgroup compute-concrete-options Concrete option classes for compute functions /// @{ /// @} -#include "arrow/compute/api_aggregate.h" // IWYU pragma: export -#include "arrow/compute/api_scalar.h" // IWYU pragma: export -#include "arrow/compute/api_vector.h" // IWYU pragma: export -#include "arrow/compute/cast.h" // IWYU pragma: export -#include "arrow/compute/function.h" // IWYU pragma: export -#include "arrow/compute/kernel.h" // IWYU pragma: export -#include "arrow/compute/registry.h" // IWYU pragma: export -#include "arrow/datum.h" // IWYU pragma: export 
+#include "arrow/compute/api_aggregate.h" // IWYU pragma: export +#include "arrow/compute/api_scalar.h" // IWYU pragma: export +#include "arrow/compute/api_vector.h" // IWYU pragma: export +#include "arrow/compute/cast.h" // IWYU pragma: export +#include "arrow/compute/function.h" // IWYU pragma: export +#include "arrow/compute/function_options.h" // IWYU pragma: export +#include "arrow/compute/kernel.h" // IWYU pragma: export +#include "arrow/compute/registry.h" // IWYU pragma: export +#include "arrow/datum.h" // IWYU pragma: export #include "arrow/compute/expression.h" // IWYU pragma: export diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 3493c3146310d..4d2c814a69bbb 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -22,7 +22,7 @@ #include -#include "arrow/compute/function.h" +#include "arrow/compute/function_options.h" #include "arrow/datum.h" #include "arrow/result.h" #include "arrow/util/macros.h" diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 9f12471ddca14..26fbe64f74293 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -24,7 +24,7 @@ #include #include -#include "arrow/compute/function.h" +#include "arrow/compute/function_options.h" #include "arrow/compute/type_fwd.h" #include "arrow/datum.h" #include "arrow/result.h" diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 0233090ef6fb9..759f9e5c1a408 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -20,9 +20,8 @@ #include #include -#include "arrow/compute/function.h" +#include "arrow/compute/function_options.h" #include "arrow/compute/ordering.h" -#include "arrow/datum.h" #include "arrow/result.h" #include "arrow/type_fwd.h" diff --git a/cpp/src/arrow/compute/cast.h b/cpp/src/arrow/compute/cast.h index 613e8a55addd2..18e56092dda2a 100644 --- a/cpp/src/arrow/compute/cast.h +++ b/cpp/src/arrow/compute/cast.h @@ -22,6 +22,7 @@ #include #include "arrow/compute/function.h" +#include "arrow/compute/function_options.h" #include "arrow/compute/type_fwd.h" #include "arrow/result.h" #include "arrow/status.h" diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc index c0433145dd1d0..e1a2e8c5d8879 100644 --- a/cpp/src/arrow/compute/function.cc +++ b/cpp/src/arrow/compute/function.cc @@ -26,6 +26,7 @@ #include "arrow/compute/exec.h" #include "arrow/compute/exec_internal.h" #include "arrow/compute/function_internal.h" +#include "arrow/compute/function_options.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/registry.h" #include "arrow/datum.h" diff --git a/cpp/src/arrow/compute/function.h b/cpp/src/arrow/compute/function.h index 333c9a65c56c4..be934a3c5abfc 100644 --- a/cpp/src/arrow/compute/function.h +++ b/cpp/src/arrow/compute/function.h @@ -36,53 +36,9 @@ namespace arrow { namespace compute { -/// \defgroup compute-functions Abstract compute function API -/// +/// \addtogroup compute-functions /// @{ -/// \brief Extension point for defining options outside libarrow (but -/// still within this project). 
-class ARROW_EXPORT FunctionOptionsType { - public: - virtual ~FunctionOptionsType() = default; - - virtual const char* type_name() const = 0; - virtual std::string Stringify(const FunctionOptions&) const = 0; - virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0; - virtual Result> Serialize(const FunctionOptions&) const; - virtual Result> Deserialize( - const Buffer& buffer) const; - virtual std::unique_ptr Copy(const FunctionOptions&) const = 0; -}; - -/// \brief Base class for specifying options configuring a function's behavior, -/// such as error handling. -class ARROW_EXPORT FunctionOptions : public util::EqualityComparable { - public: - virtual ~FunctionOptions() = default; - - const FunctionOptionsType* options_type() const { return options_type_; } - const char* type_name() const { return options_type()->type_name(); } - - bool Equals(const FunctionOptions& other) const; - std::string ToString() const; - std::unique_ptr Copy() const; - /// \brief Serialize an options struct to a buffer. - Result> Serialize() const; - /// \brief Deserialize an options struct from a buffer. - /// Note: this will only look for `type_name` in the default FunctionRegistry; - /// to use a custom FunctionRegistry, look up the FunctionOptionsType, then - /// call FunctionOptionsType::Deserialize(). - static Result> Deserialize( - const std::string& type_name, const Buffer& buffer); - - protected: - explicit FunctionOptions(const FunctionOptionsType* type) : options_type_(type) {} - const FunctionOptionsType* options_type_; -}; - -ARROW_EXPORT void PrintTo(const FunctionOptions&, std::ostream*); - /// \brief Contains the number of required arguments for the function. /// /// Naming conventions taken from https://en.wikipedia.org/wiki/Arity. diff --git a/cpp/src/arrow/compute/function_options.h b/cpp/src/arrow/compute/function_options.h new file mode 100644 index 0000000000000..88ec2fd2d0679 --- /dev/null +++ b/cpp/src/arrow/compute/function_options.h @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// NOTE: API is EXPERIMENTAL and will change without going through a +// deprecation cycle. + +#pragma once + +#include "arrow/compute/type_fwd.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace compute { + +/// \addtogroup compute-functions +/// @{ + +/// \brief Extension point for defining options outside libarrow (but +/// still within this project). 
+class ARROW_EXPORT FunctionOptionsType { + public: + virtual ~FunctionOptionsType() = default; + + virtual const char* type_name() const = 0; + virtual std::string Stringify(const FunctionOptions&) const = 0; + virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0; + virtual Result> Serialize(const FunctionOptions&) const; + virtual Result> Deserialize( + const Buffer& buffer) const; + virtual std::unique_ptr Copy(const FunctionOptions&) const = 0; +}; + +/// \brief Base class for specifying options configuring a function's behavior, +/// such as error handling. +class ARROW_EXPORT FunctionOptions : public util::EqualityComparable { + public: + virtual ~FunctionOptions() = default; + + const FunctionOptionsType* options_type() const { return options_type_; } + const char* type_name() const { return options_type()->type_name(); } + + bool Equals(const FunctionOptions& other) const; + std::string ToString() const; + std::unique_ptr Copy() const; + /// \brief Serialize an options struct to a buffer. + Result> Serialize() const; + /// \brief Deserialize an options struct from a buffer. + /// Note: this will only look for `type_name` in the default FunctionRegistry; + /// to use a custom FunctionRegistry, look up the FunctionOptionsType, then + /// call FunctionOptionsType::Deserialize(). + static Result> Deserialize( + const std::string& type_name, const Buffer& buffer); + + protected: + explicit FunctionOptions(const FunctionOptionsType* type) : options_type_(type) {} + const FunctionOptionsType* options_type_; +}; + +ARROW_EXPORT void PrintTo(const FunctionOptions&, std::ostream*); + +/// @} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 6fd449a931381..ec5291ef608a3 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -401,7 +401,7 @@ void AddTypeToTypeCast(CastFunction* func) { kernel.exec = CastFunctor::Exec; kernel.signature = KernelSignature::Make({InputType(SrcT::type_id)}, kOutputTargetType); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; - DCHECK_OK(func->AddKernel(StructType::type_id, std::move(kernel))); + DCHECK_OK(func->AddKernel(SrcT::type_id, std::move(kernel))); } template @@ -480,14 +480,18 @@ std::vector> GetNestedCasts() { auto cast_list = std::make_shared("cast_list", Type::LIST); AddCommonCasts(Type::LIST, kOutputTargetType, cast_list.get()); AddListCast(cast_list.get()); + AddListCast(cast_list.get()); AddListCast(cast_list.get()); + AddListCast(cast_list.get()); AddTypeToTypeCast, FixedSizeListType>(cast_list.get()); auto cast_large_list = std::make_shared("cast_large_list", Type::LARGE_LIST); AddCommonCasts(Type::LARGE_LIST, kOutputTargetType, cast_large_list.get()); AddListCast(cast_large_list.get()); + AddListCast(cast_large_list.get()); AddListCast(cast_large_list.get()); + AddListCast(cast_large_list.get()); AddTypeToTypeCast, FixedSizeListType>( cast_large_list.get()); @@ -503,7 +507,11 @@ std::vector> GetNestedCasts() { AddCommonCasts(Type::FIXED_SIZE_LIST, kOutputTargetType, cast_fsl.get()); AddTypeToTypeCast(cast_fsl.get()); AddTypeToTypeCast, ListType>(cast_fsl.get()); + AddTypeToTypeCast, ListViewType>(cast_fsl.get()); AddTypeToTypeCast, LargeListType>(cast_fsl.get()); + AddTypeToTypeCast, LargeListViewType>( + cast_fsl.get()); + AddTypeToTypeCast, MapType>(cast_fsl.get()); // So is struct auto cast_struct = 
std::make_shared("cast_struct", Type::STRUCT); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index ebeb597207a81..a6576e4e4c26f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -20,11 +20,14 @@ #include "arrow/array/array_base.h" #include "arrow/array/builder_binary.h" +#include "arrow/compute/kernels/base_arithmetic_internal.h" #include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/scalar_cast_internal.h" #include "arrow/compute/kernels/temporal_internal.h" #include "arrow/result.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" #include "arrow/util/formatting.h" #include "arrow/util/int_util.h" #include "arrow/util/utf8_internal.h" @@ -284,9 +287,8 @@ Status CastBinaryToBinaryOffsets(KernelContext* ctx, } template -enable_if_base_binary BinaryToBinaryCastExec(KernelContext* ctx, - const ExecSpan& batch, - ExecResult* out) { +enable_if_t::value && !is_fixed_size_binary_type::value, Status> +BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const CastOptions& options = checked_cast(*ctx->state()).options; const ArraySpan& input = batch[0].array; @@ -387,6 +389,33 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou return ZeroCopyCastExec(ctx, batch, out); } +template +enable_if_t::value && std::is_same::value, + Status> +BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + const CastOptions& options = checked_cast(*ctx->state()).options; + FixedSizeBinaryBuilder builder(options.to_type.GetSharedPtr(), ctx->memory_pool()); + const ArraySpan& input = batch[0].array; + RETURN_NOT_OK(builder.Reserve(input.length)); + + RETURN_NOT_OK(VisitArraySpanInline( + input, + [&](std::string_view v) { + if (v.size() != static_cast(builder.byte_width())) { + return Status::Invalid("Failed casting from ", input.type->ToString(), " to ", + options.to_type.ToString(), ": widths must match"); + } + builder.UnsafeAppend(v); + return Status::OK(); + }, + [&] { + builder.UnsafeAppendNull(); + return Status::OK(); + })); + + return builder.FinishInternal(&std::get>(out->value)); +} + #if defined(_MSC_VER) #pragma warning(pop) #endif @@ -452,6 +481,26 @@ void AddBinaryToBinaryCast(CastFunction* func) { AddBinaryToBinaryCast(func); } +template +void AddBinaryToFixedSizeBinaryCast(CastFunction* func) { + auto resolver_fsb = [](KernelContext* ctx, const std::vector&) { + const CastOptions& options = checked_cast(*ctx->state()).options; + return options.to_type; + }; + + DCHECK_OK(func->AddKernel(InType::type_id, {InputType(InType::type_id)}, resolver_fsb, + BinaryToBinaryCastExec, + NullHandling::COMPUTED_NO_PREALLOCATE)); +} + +void AddBinaryToFixedSizeBinaryCast(CastFunction* func) { + AddBinaryToFixedSizeBinaryCast(func); + AddBinaryToFixedSizeBinaryCast(func); + AddBinaryToFixedSizeBinaryCast(func); + AddBinaryToFixedSizeBinaryCast(func); + AddBinaryToFixedSizeBinaryCast(func); +} + } // namespace std::vector> GetBinaryLikeCasts() { @@ -483,11 +532,7 @@ std::vector> GetBinaryLikeCasts() { std::make_shared("cast_fixed_size_binary", Type::FIXED_SIZE_BINARY); AddCommonCasts(Type::FIXED_SIZE_BINARY, OutputType(ResolveOutputFromOptions), cast_fsb.get()); - DCHECK_OK(cast_fsb->AddKernel( - Type::FIXED_SIZE_BINARY, {InputType(Type::FIXED_SIZE_BINARY)}, - OutputType(FirstType), - 
BinaryToBinaryCastExec, - NullHandling::COMPUTED_NO_PREALLOCATE)); + AddBinaryToFixedSizeBinaryCast(cast_fsb.get()); return {cast_binary, cast_large_binary, cast_string, cast_large_string, cast_fsb}; } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index c84125bbdd19e..b429c8175b020 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2171,6 +2171,22 @@ TEST(Cast, StringToString) { } } +TEST(Cast, BinaryOrStringToFixedSizeBinary) { + for (auto in_type : {utf8(), large_utf8(), binary(), large_binary()}) { + auto valid_input = ArrayFromJSON(in_type, R"(["foo", null, "bar", "baz", "quu"])"); + auto invalid_input = ArrayFromJSON(in_type, R"(["foo", null, "bar", "baz", "quux"])"); + + CheckCast(valid_input, ArrayFromJSON(fixed_size_binary(3), R"(["foo", null, "bar", + "baz", "quu"])")); + CheckCastFails(invalid_input, CastOptions::Safe(fixed_size_binary(3))); + CheckCastFails(valid_input, CastOptions::Safe(fixed_size_binary(5))); + + auto empty_input = ArrayFromJSON(in_type, "[]"); + CheckCast(empty_input, ArrayFromJSON(fixed_size_binary(3), "[]")); + CheckCast(empty_input, ArrayFromJSON(fixed_size_binary(5), "[]")); + } +} + TEST(Cast, IntToString) { for (auto string_type : {utf8(), large_utf8()}) { CheckCast(ArrayFromJSON(int8(), "[0, 1, 127, -128, null]"), diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc index b72402bbccd4e..58bc560f52842 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc @@ -21,6 +21,7 @@ #include "arrow/array/concatenate.h" #include "arrow/array/util.h" #include "arrow/compute/api_scalar.h" +#include "arrow/compute/function.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/util/key_value_metadata.h" diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index 65e59d1a2eb14..800deba3a5ed2 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -26,17 +26,20 @@ #include "arrow/array/concatenate.h" #include "arrow/array/dict_internal.h" #include "arrow/array/util.h" +#include "arrow/buffer.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/cast.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" #include "arrow/util/hashing.h" +#include "arrow/util/int_util.h" #include "arrow/util/unreachable.h" namespace arrow { using internal::DictionaryTraits; using internal::HashTraits; +using internal::TransposeInts; namespace compute { namespace internal { @@ -448,9 +451,9 @@ class DictionaryHashKernel : public HashKernel { Status Append(const ArraySpan& arr) override { auto arr_dict = arr.dictionary().ToArray(); - if (!dictionary_) { - dictionary_ = arr_dict; - } else if (!dictionary_->Equals(*arr_dict)) { + if (!first_dictionary_) { + first_dictionary_ = arr_dict; + } else if (!first_dictionary_->Equals(*arr_dict)) { // NOTE: This approach computes a new dictionary unification per chunk. // This is in effect O(n*k) where n is the total chunked array length and // k is the number of chunks (therefore O(n**2) if chunks have a fixed size). 
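[Editor's note] The scalar_cast_string.cc and scalar_cast_test.cc hunks above add casting from binary/string arrays to fixed_size_binary, rejecting values whose width does not match the target width. The following is a minimal sketch (not part of the patch) of exercising that path through the public `arrow::compute::Cast` API; it assumes an Arrow build that includes this change, and the values are placeholders.

```cpp
// Editor's sketch: binary -> fixed_size_binary casting added by this patch.
#include <iostream>
#include <memory>

#include <arrow/api.h>
#include <arrow/compute/api.h>

arrow::Status CastToFixedSizeBinary() {
  // Build a variable-width binary array whose non-null values are all 3 bytes.
  arrow::BinaryBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Append("foo"));
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  ARROW_RETURN_NOT_OK(builder.Append("bar"));
  std::shared_ptr<arrow::Array> input;
  ARROW_RETURN_NOT_OK(builder.Finish(&input));

  // Succeeds: every non-null value matches the target width of 3.
  ARROW_ASSIGN_OR_RAISE(
      arrow::Datum fixed,
      arrow::compute::Cast(input, arrow::fixed_size_binary(3),
                           arrow::compute::CastOptions::Safe()));
  std::cout << fixed.make_array()->ToString() << std::endl;

  // Fails with Status::Invalid: widths must match, as the new test asserts.
  auto mismatched = arrow::compute::Cast(input, arrow::fixed_size_binary(5),
                                         arrow::compute::CastOptions::Safe());
  std::cout << mismatched.status().ToString() << std::endl;
  return arrow::Status::OK();
}

int main() { return CastToFixedSizeBinary().ok() ? 0 : 1; }
```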
@@ -458,21 +461,23 @@ class DictionaryHashKernel : public HashKernel { // A better approach may be to run the kernel over each individual chunk, // and then hash-aggregate all results (for example sum-group-by for // the "value_counts" kernel). - auto out_dict_type = dictionary_->type(); + if (dictionary_unifier_ == nullptr) { + ARROW_ASSIGN_OR_RAISE(dictionary_unifier_, + DictionaryUnifier::Make(first_dictionary_->type())); + RETURN_NOT_OK(dictionary_unifier_->Unify(*first_dictionary_)); + } + auto out_dict_type = first_dictionary_->type(); std::shared_ptr transpose_map; - std::shared_ptr out_dict; - ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(out_dict_type)); - ARROW_CHECK_OK(unifier->Unify(*dictionary_)); - ARROW_CHECK_OK(unifier->Unify(*arr_dict, &transpose_map)); - ARROW_CHECK_OK(unifier->GetResult(&out_dict_type, &out_dict)); + RETURN_NOT_OK(dictionary_unifier_->Unify(*arr_dict, &transpose_map)); - dictionary_ = out_dict; auto transpose = reinterpret_cast(transpose_map->data()); - auto in_dict_array = arr.ToArray(); + auto in_array = arr.ToArray(); + const auto& in_dict_array = + arrow::internal::checked_cast(*in_array); ARROW_ASSIGN_OR_RAISE( - auto tmp, arrow::internal::checked_cast(*in_dict_array) - .Transpose(arr.type->GetSharedPtr(), out_dict, transpose)); + auto tmp, in_dict_array.Transpose(arr.type->GetSharedPtr(), + in_dict_array.dictionary(), transpose)); return indices_kernel_->Append(*tmp->data()); } @@ -495,12 +500,27 @@ class DictionaryHashKernel : public HashKernel { return dictionary_value_type_; } - std::shared_ptr dictionary() const { return dictionary_; } + /// This can't be called more than once because DictionaryUnifier::GetResult() + /// can't be called more than once and produce the same output. + Result> dictionary() const { + if (!first_dictionary_) { // Append was never called + return nullptr; + } + if (!dictionary_unifier_) { // Append was called only once + return first_dictionary_; + } + + auto out_dict_type = first_dictionary_->type(); + std::shared_ptr out_dict; + RETURN_NOT_OK(dictionary_unifier_->GetResult(&out_dict_type, &out_dict)); + return out_dict; + } private: std::unique_ptr indices_kernel_; - std::shared_ptr dictionary_; + std::shared_ptr first_dictionary_; std::shared_ptr dictionary_value_type_; + std::unique_ptr dictionary_unifier_; }; // ---------------------------------------------------------------------- @@ -630,8 +650,9 @@ Status ValueCountsFinalize(KernelContext* ctx, std::vector* out) { // hence have no dictionary. 
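[Editor's note] The DictionaryHashKernel changes above keep a single DictionaryUnifier across chunks instead of re-unifying every previously seen dictionary on each new chunk; the ValueCountsDictionaryChunks benchmark added below measures exactly this case. A rough sketch (not part of the patch) of the affected scenario using only public compute APIs follows; the helper name and sample values are illustrative only.

```cpp
// Editor's sketch: value_counts over a chunked dictionary array whose chunks
// were dictionary-encoded independently and therefore carry different dictionaries.
#include <iostream>
#include <memory>
#include <string>
#include <vector>

#include <arrow/api.h>
#include <arrow/compute/api.h>

arrow::Status ValueCountsOverDictionaryChunks() {
  namespace cp = arrow::compute;

  // Encode each chunk separately so the chunks end up with distinct dictionaries
  // ("a","b") vs ("b","c"), forcing unification inside the hash kernel.
  auto encode_chunk = [](const std::vector<std::string>& values)
      -> arrow::Result<std::shared_ptr<arrow::Array>> {
    arrow::StringBuilder builder;
    ARROW_RETURN_NOT_OK(builder.AppendValues(values));
    ARROW_ASSIGN_OR_RAISE(auto plain, builder.Finish());
    ARROW_ASSIGN_OR_RAISE(arrow::Datum encoded, cp::DictionaryEncode(plain));
    return encoded.make_array();
  };

  arrow::ArrayVector chunks(2);
  ARROW_ASSIGN_OR_RAISE(chunks[0], encode_chunk({"a", "b", "a"}));
  ARROW_ASSIGN_OR_RAISE(chunks[1], encode_chunk({"b", "c", "c"}));
  auto chunked = std::make_shared<arrow::ChunkedArray>(chunks);

  // Returns a struct array of {values, counts}; with this patch the per-chunk
  // dictionaries are unified incrementally rather than from scratch each time.
  ARROW_ASSIGN_OR_RAISE(auto counts, cp::ValueCounts(chunked));
  std::cout << counts->ToString() << std::endl;
  return arrow::Status::OK();
}

int main() { return ValueCountsOverDictionaryChunks().ok() ? 0 : 1; }
```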
Result> EnsureHashDictionary(KernelContext* ctx, DictionaryHashKernel* hash) { - if (hash->dictionary()) { - return hash->dictionary()->data(); + ARROW_ASSIGN_OR_RAISE(auto dict, hash->dictionary()); + if (dict) { + return dict->data(); } ARROW_ASSIGN_OR_RAISE(auto null, MakeArrayOfNull(hash->dictionary_value_type(), /*length=*/0, ctx->memory_pool())); diff --git a/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc index e9548e133aa00..472f50db8cf92 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc @@ -25,6 +25,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/testing/util.h" +#include "arrow/util/logging.h" #include "arrow/compute/api.h" @@ -226,6 +227,33 @@ static void UniqueString100bytes(benchmark::State& state) { BenchUnique(state, HashParams{general_bench_cases[state.range(0)], 100}); } +template +void BenchValueCountsDictionaryChunks(benchmark::State& state, const ParamType& params) { + std::shared_ptr arr; + params.GenerateTestData(&arr); + // chunk arr to 100 slices + std::vector> chunks; + const int64_t chunk_size = arr->length() / 100; + for (int64_t i = 0; i < 100; ++i) { + auto slice = arr->Slice(i * chunk_size, chunk_size); + auto datum = DictionaryEncode(slice).ValueOrDie(); + ARROW_CHECK(datum.is_array()); + chunks.push_back(datum.make_array()); + } + auto chunked_array = std::make_shared(chunks); + + while (state.KeepRunning()) { + ABORT_NOT_OK(ValueCounts(chunked_array).status()); + } + params.SetMetadata(state); +} + +static void ValueCountsDictionaryChunks(benchmark::State& state) { + // Dictionary of byte strings with 10 bytes each + BenchValueCountsDictionaryChunks( + state, HashParams{general_bench_cases[state.range(0)], 10}); +} + void HashSetArgs(benchmark::internal::Benchmark* bench) { for (int i = 0; i < static_cast(general_bench_cases.size()); ++i) { bench->Arg(i); @@ -239,6 +267,14 @@ BENCHMARK(UniqueInt64)->Apply(HashSetArgs); BENCHMARK(UniqueString10bytes)->Apply(HashSetArgs); BENCHMARK(UniqueString100bytes)->Apply(HashSetArgs); +void DictionaryChunksHashSetArgs(benchmark::internal::Benchmark* bench) { + for (int i = 0; i < static_cast(general_bench_cases.size()); ++i) { + bench->Arg(i); + } +} + +BENCHMARK(ValueCountsDictionaryChunks)->Apply(DictionaryChunksHashSetArgs); + void UInt8SetArgs(benchmark::internal::Benchmark* bench) { for (int i = 0; i < static_cast(uint8_bench_cases.size()); ++i) { bench->Arg(i); diff --git a/cpp/src/arrow/compute/kernels/vector_rank.cc b/cpp/src/arrow/compute/kernels/vector_rank.cc index 780ae25d96360..0cea7246e516c 100644 --- a/cpp/src/arrow/compute/kernels/vector_rank.cc +++ b/cpp/src/arrow/compute/kernels/vector_rank.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+#include "arrow/compute/function.h" #include "arrow/compute/kernels/vector_sort_internal.h" #include "arrow/compute/registry.h" diff --git a/cpp/src/arrow/compute/kernels/vector_replace_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_replace_benchmark.cc index 719969d46ea7c..971a841de0773 100644 --- a/cpp/src/arrow/compute/kernels/vector_replace_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_replace_benchmark.cc @@ -18,6 +18,7 @@ #include #include "arrow/array.h" +#include "arrow/datum.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" diff --git a/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc b/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc index 0bd8e3386e7cc..f02aee1b35996 100644 --- a/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc @@ -21,6 +21,7 @@ #include "arrow/array/validate.h" #include "arrow/builder.h" #include "arrow/compute/api_vector.h" +#include "arrow/datum.h" #include "arrow/testing/gtest_util.h" #include "arrow/type_fwd.h" #include "arrow/util/logging.h" diff --git a/cpp/src/arrow/compute/kernels/vector_select_k.cc b/cpp/src/arrow/compute/kernels/vector_select_k.cc index 5000de8996280..1740a9b7f0bb4 100644 --- a/cpp/src/arrow/compute/kernels/vector_select_k.cc +++ b/cpp/src/arrow/compute/kernels/vector_select_k.cc @@ -17,6 +17,7 @@ #include +#include "arrow/compute/function.h" #include "arrow/compute/kernels/vector_sort_internal.h" #include "arrow/compute/registry.h" diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index 8ddcbb9905cb2..e08a2bc10372f 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -17,6 +17,7 @@ #include +#include "arrow/compute/function.h" #include "arrow/compute/kernels/vector_sort_internal.h" #include "arrow/compute/registry.h" diff --git a/cpp/src/arrow/compute/light_array.cc b/cpp/src/arrow/compute/light_array.cc index 4e8b2b2d7cc3a..93a054de1957c 100644 --- a/cpp/src/arrow/compute/light_array.cc +++ b/cpp/src/arrow/compute/light_array.cc @@ -398,9 +398,12 @@ int ExecBatchBuilder::NumRowsToSkip(const std::shared_ptr& column, } else { --num_rows_left; int row_id_removed = row_ids[num_rows_left]; - const uint32_t* offsets = - reinterpret_cast(column->buffers[1]->data()); + const int32_t* offsets = column->GetValues(1); num_bytes_skipped += offsets[row_id_removed + 1] - offsets[row_id_removed]; + // Skip consecutive rows with the same id + while (num_rows_left > 0 && row_id_removed == row_ids[num_rows_left - 1]) { + --num_rows_left; + } } } diff --git a/cpp/src/arrow/compute/light_array.h b/cpp/src/arrow/compute/light_array.h index 87f6b6c76a12c..84aa86d64bb62 100644 --- a/cpp/src/arrow/compute/light_array.h +++ b/cpp/src/arrow/compute/light_array.h @@ -416,7 +416,9 @@ class ARROW_EXPORT ExecBatchBuilder { // without checking buffer bounds (useful with SIMD or fixed size memory loads // and stores). // - // The sequence of row_ids provided must be non-decreasing. + // The sequence of row_ids provided must be non-decreasing. In case of consecutive rows + // with the same row id, they are skipped all at once because they occupy the same + // space. 
// static int NumRowsToSkip(const std::shared_ptr& column, int num_rows, const uint16_t* row_ids, int num_tail_bytes_to_skip); diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc index 4e33f7b578ea8..52121530fe91d 100644 --- a/cpp/src/arrow/compute/light_array_test.cc +++ b/cpp/src/arrow/compute/light_array_test.cc @@ -471,6 +471,32 @@ TEST(ExecBatchBuilder, AppendBatchesSomeRows) { ASSERT_EQ(0, pool->bytes_allocated()); } +TEST(ExecBatchBuilder, AppendBatchDupRows) { + std::unique_ptr owned_pool = MemoryPool::CreateDefault(); + MemoryPool* pool = owned_pool.get(); + // Case of cross-word copying for the last row, which may exceed the buffer boundary. + // This is a simplified case of GH-32570 + { + // 64-byte data fully occupying one minimal 64-byte aligned memory region. + ExecBatch batch_string = JSONToExecBatch({binary()}, R"([["123456789ABCDEF0"], + ["123456789ABCDEF0"], + ["123456789ABCDEF0"], + ["ABCDEF0"], + ["123456789"]])"); // 9-byte tail row, larger than a word. + ASSERT_EQ(batch_string[0].array()->buffers[1]->capacity(), 64); + ASSERT_EQ(batch_string[0].array()->buffers[2]->capacity(), 64); + ExecBatchBuilder builder; + uint16_t row_ids[2] = {4, 4}; + ASSERT_OK(builder.AppendSelected(pool, batch_string, 2, row_ids, /*num_cols=*/1)); + ExecBatch built = builder.Flush(); + ExecBatch batch_string_appended = + JSONToExecBatch({binary()}, R"([["123456789"], ["123456789"]])"); + ASSERT_EQ(batch_string_appended, built); + ASSERT_NE(0, pool->bytes_allocated()); + } + ASSERT_EQ(0, pool->bytes_allocated()); +} + TEST(ExecBatchBuilder, AppendBatchesSomeCols) { std::unique_ptr owned_pool = MemoryPool::CreateDefault(); MemoryPool* pool = owned_pool.get(); diff --git a/cpp/src/arrow/compute/registry_test.cc b/cpp/src/arrow/compute/registry_test.cc index 7fee136de7a0b..2d69f119df1f4 100644 --- a/cpp/src/arrow/compute/registry_test.cc +++ b/cpp/src/arrow/compute/registry_test.cc @@ -22,6 +22,7 @@ #include #include "arrow/compute/function.h" +#include "arrow/compute/function_options.h" #include "arrow/compute/registry.h" #include "arrow/result.h" #include "arrow/status.h" diff --git a/cpp/src/arrow/compute/type_fwd.h b/cpp/src/arrow/compute/type_fwd.h index 3f990b1814311..89f32ceb0f906 100644 --- a/cpp/src/arrow/compute/type_fwd.h +++ b/cpp/src/arrow/compute/type_fwd.h @@ -27,6 +27,7 @@ struct TypeHolder; namespace compute { class Function; +class ScalarAggregateFunction; class FunctionExecutor; class FunctionOptions; class FunctionRegistry; diff --git a/cpp/src/arrow/dataset/dataset_writer_test.cc b/cpp/src/arrow/dataset/dataset_writer_test.cc index e62e779f71797..1ac0ec3f39e97 100644 --- a/cpp/src/arrow/dataset/dataset_writer_test.cc +++ b/cpp/src/arrow/dataset/dataset_writer_test.cc @@ -290,12 +290,12 @@ TEST_F(DatasetWriterTestFixture, MaxRowsOneWriteBackpresure) { write_options_.max_open_files = 2; write_options_.min_rows_per_group = kFileSizeLimit - 1; auto dataset_writer = MakeDatasetWriter(/*max_rows=*/kFileSizeLimit); - for (int i = 0; i < 20; ++i) { - dataset_writer->WriteRecordBatch(MakeBatch(kFileSizeLimit * 5), ""); + for (int i = 0; i < 5; ++i) { + dataset_writer->WriteRecordBatch(MakeBatch(kFileSizeLimit * 2), ""); } EndWriterChecked(dataset_writer.get()); std::vector expected_files; - for (int i = 0; i < 100; ++i) { + for (int i = 0; i < 10; ++i) { expected_files.emplace_back("testdir/chunk-" + std::to_string(i) + ".arrow", kFileSizeLimit * i, kFileSizeLimit); } diff --git a/cpp/src/arrow/dataset/file_parquet.cc 
b/cpp/src/arrow/dataset/file_parquet.cc index 3afe4ec85cf49..1c2fd2dea6307 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -24,6 +24,7 @@ #include #include +#include "arrow/compute/cast.h" #include "arrow/compute/exec.h" #include "arrow/dataset/dataset_internal.h" #include "arrow/dataset/parquet_encryption_config.h" @@ -58,6 +59,8 @@ using parquet::arrow::SchemaField; using parquet::arrow::SchemaManifest; using parquet::arrow::StatisticsAsScalars; +using compute::Cast; + namespace { parquet::ReaderProperties MakeReaderProperties( @@ -370,12 +373,12 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr return std::nullopt; } - auto maybe_min = min->CastTo(field.type()); - auto maybe_max = max->CastTo(field.type()); + auto maybe_min = Cast(min, field.type()); + auto maybe_max = Cast(max, field.type()); if (maybe_min.ok() && maybe_max.ok()) { - min = maybe_min.MoveValueUnsafe(); - max = maybe_max.MoveValueUnsafe(); + min = maybe_min.MoveValueUnsafe().scalar(); + max = maybe_max.MoveValueUnsafe().scalar(); if (min->Equals(*max)) { auto single_value = compute::equal(field_expr, compute::literal(std::move(min))); diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index d72ead92ed111..21350a490411a 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -58,7 +58,7 @@ bool AzureOptions::Equals(const AzureOptions& other) const { blob_storage_scheme == other.blob_storage_scheme && dfs_storage_scheme == other.dfs_storage_scheme && default_metadata == other.default_metadata && - account_name_ == other.account_name_ && + account_name == other.account_name && credential_kind_ == other.credential_kind_; if (!equals) { return false; @@ -104,23 +104,39 @@ std::string AzureOptions::AccountDfsUrl(const std::string& account_name) const { return BuildBaseUrl(dfs_storage_scheme, dfs_storage_authority, account_name); } -Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_name, - const std::string& account_key) { +Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_key) { credential_kind_ = CredentialKind::kStorageSharedKeyCredential; - account_name_ = account_name; + if (account_name.empty()) { + return Status::Invalid("AzureOptions doesn't contain a valid account name"); + } storage_shared_key_credential_ = std::make_shared(account_name, account_key); return Status::OK(); } -Status AzureOptions::ConfigureDefaultCredential(const std::string& account_name) { +Status AzureOptions::ConfigureClientSecretCredential(const std::string& tenant_id, + const std::string& client_id, + const std::string& client_secret) { + credential_kind_ = CredentialKind::kTokenCredential; + token_credential_ = std::make_shared( + tenant_id, client_id, client_secret); + return Status::OK(); +} + +Status AzureOptions::ConfigureDefaultCredential() { credential_kind_ = CredentialKind::kTokenCredential; token_credential_ = std::make_shared(); return Status::OK(); } -Status AzureOptions::ConfigureWorkloadIdentityCredential( - const std::string& account_name) { +Status AzureOptions::ConfigureManagedIdentityCredential(const std::string& client_id) { + credential_kind_ = CredentialKind::kTokenCredential; + token_credential_ = + std::make_shared(client_id); + return Status::OK(); +} + +Status AzureOptions::ConfigureWorkloadIdentityCredential() { credential_kind_ = CredentialKind::kTokenCredential; token_credential_ = std::make_shared(); return Status::OK(); @@ 
-128,14 +144,17 @@ Status AzureOptions::ConfigureWorkloadIdentityCredential( Result> AzureOptions::MakeBlobServiceClient() const { + if (account_name.empty()) { + return Status::Invalid("AzureOptions doesn't contain a valid account name"); + } switch (credential_kind_) { case CredentialKind::kAnonymous: break; case CredentialKind::kTokenCredential: - return std::make_unique(AccountBlobUrl(account_name_), + return std::make_unique(AccountBlobUrl(account_name), token_credential_); case CredentialKind::kStorageSharedKeyCredential: - return std::make_unique(AccountBlobUrl(account_name_), + return std::make_unique(AccountBlobUrl(account_name), storage_shared_key_credential_); } return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); @@ -143,15 +162,18 @@ Result> AzureOptions::MakeBlobServiceC Result> AzureOptions::MakeDataLakeServiceClient() const { + if (account_name.empty()) { + return Status::Invalid("AzureOptions doesn't contain a valid account name"); + } switch (credential_kind_) { case CredentialKind::kAnonymous: break; case CredentialKind::kTokenCredential: return std::make_unique( - AccountDfsUrl(account_name_), token_credential_); + AccountDfsUrl(account_name), token_credential_); case CredentialKind::kStorageSharedKeyCredential: return std::make_unique( - AccountDfsUrl(account_name_), storage_shared_key_credential_); + AccountDfsUrl(account_name), storage_shared_key_credential_); } return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); } @@ -941,14 +963,38 @@ class AzureFileSystem::Impl { break; } ARROW_ASSIGN_OR_RAISE( - cached_hns_support_, + auto hns_support, internal::CheckIfHierarchicalNamespaceIsEnabled(adlfs_client, options_)); - DCHECK_NE(cached_hns_support_, HNSSupport::kUnknown); - // Caller should handle kContainerNotFound case appropriately. - return cached_hns_support_; + DCHECK_NE(hns_support, HNSSupport::kUnknown); + if (hns_support == HNSSupport::kContainerNotFound) { + // Caller should handle kContainerNotFound case appropriately as it knows the + // container this refers to, but the cached value in that case should remain + // kUnknown before we get a CheckIfHierarchicalNamespaceIsEnabled result that + // is not kContainerNotFound. + cached_hns_support_ = HNSSupport::kUnknown; + } else { + cached_hns_support_ = hns_support; + } + return hns_support; } public: + /// This is used from unit tests to ensure we perform operations on all the + /// possible states of cached_hns_support_. + void ForceCachedHierarchicalNamespaceSupport(int support) { + auto hns_support = static_cast(support); + switch (hns_support) { + case HNSSupport::kUnknown: + case HNSSupport::kContainerNotFound: + case HNSSupport::kDisabled: + case HNSSupport::kEnabled: + cached_hns_support_ = hns_support; + return; + } + // This is reachable if an invalid int is cast to enum class HNSSupport. 
+ DCHECK(false) << "Invalid enum HierarchicalNamespaceSupport value."; + } + Result GetFileInfo(const AzureLocation& location) { if (location.container.empty()) { DCHECK(location.path.empty()); @@ -1560,6 +1606,10 @@ AzureFileSystem::AzureFileSystem(std::unique_ptr&& impl) default_async_is_sync_ = false; } +void AzureFileSystem::ForceCachedHierarchicalNamespaceSupport(int hns_support) { + impl_->ForceCachedHierarchicalNamespaceSupport(hns_support); +} + Result> AzureFileSystem::Make( const AzureOptions& options, const io::IOContext& io_context) { ARROW_ASSIGN_OR_RAISE(auto impl, AzureFileSystem::Impl::Make(options, io_context)); diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index be3ca5ba238ae..78e0a8148c616 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -44,8 +44,13 @@ class DataLakeServiceClient; namespace arrow::fs { +class TestAzureFileSystem; + /// Options for the AzureFileSystem implementation. struct ARROW_EXPORT AzureOptions { + /// \brief account name of the Azure Storage account. + std::string account_name; + /// \brief hostname[:port] of the Azure Blob Storage Service. /// /// If the hostname is a relative domain name (one that starts with a '.'), then storage @@ -92,7 +97,6 @@ struct ARROW_EXPORT AzureOptions { kStorageSharedKeyCredential, } credential_kind_ = CredentialKind::kAnonymous; - std::string account_name_; std::shared_ptr token_credential_; std::shared_ptr storage_shared_key_credential_; @@ -101,12 +105,17 @@ struct ARROW_EXPORT AzureOptions { AzureOptions(); ~AzureOptions(); - Status ConfigureDefaultCredential(const std::string& account_name); + Status ConfigureDefaultCredential(); + + Status ConfigureManagedIdentityCredential(const std::string& client_id = std::string()); - Status ConfigureWorkloadIdentityCredential(const std::string& account_name); + Status ConfigureWorkloadIdentityCredential(); - Status ConfigureAccountKeyCredential(const std::string& account_name, - const std::string& account_key); + Status ConfigureAccountKeyCredential(const std::string& account_key); + + Status ConfigureClientSecretCredential(const std::string& tenant_id, + const std::string& client_id, + const std::string& client_secret); bool Equals(const AzureOptions& other) const; @@ -156,6 +165,9 @@ class ARROW_EXPORT AzureFileSystem : public FileSystem { explicit AzureFileSystem(std::unique_ptr&& impl); + friend class TestAzureFileSystem; + void ForceCachedHierarchicalNamespaceSupport(int hns_support); + public: ~AzureFileSystem() override = default; diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index ecf7522b98eef..f6af9f722dbac 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -62,7 +62,6 @@ namespace arrow { using internal::TemporaryDir; namespace fs { using internal::ConcatAbstractPath; -namespace { namespace bp = boost::process; using ::testing::IsEmpty; @@ -272,15 +271,44 @@ class AzureHierarchicalNSEnv : public AzureEnvImpl { bool WithHierarchicalNamespace() const final { return true; } }; +TEST(AzureFileSystem, InitializingFilesystemWithoutAccountNameFails) { + AzureOptions options; + ASSERT_RAISES(Invalid, options.ConfigureAccountKeyCredential("account_key")); + + ARROW_EXPECT_OK( + options.ConfigureClientSecretCredential("tenant_id", "client_id", "client_secret")); + ASSERT_RAISES(Invalid, AzureFileSystem::Make(options)); +} + +TEST(AzureFileSystem, 
InitializeFilesystemWithClientSecretCredential) { + AzureOptions options; + options.account_name = "dummy-account-name"; + ARROW_EXPECT_OK( + options.ConfigureClientSecretCredential("tenant_id", "client_id", "client_secret")); + EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); +} + TEST(AzureFileSystem, InitializeFilesystemWithDefaultCredential) { AzureOptions options; - ARROW_EXPECT_OK(options.ConfigureDefaultCredential("dummy-account-name")); + options.account_name = "dummy-account-name"; + ARROW_EXPECT_OK(options.ConfigureDefaultCredential()); + EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); +} + +TEST(AzureFileSystem, InitializeFilesystemWithManagedIdentityCredential) { + AzureOptions options; + options.account_name = "dummy-account-name"; + ARROW_EXPECT_OK(options.ConfigureManagedIdentityCredential()); EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); + + ARROW_EXPECT_OK(options.ConfigureManagedIdentityCredential("specific-client-id")); + EXPECT_OK_AND_ASSIGN(fs, AzureFileSystem::Make(options)); } TEST(AzureFileSystem, InitializeFilesystemWithWorkloadIdentityCredential) { AzureOptions options; - ARROW_EXPECT_OK(options.ConfigureWorkloadIdentityCredential("dummy-account-name")); + options.account_name = "dummy-account-name"; + ARROW_EXPECT_OK(options.ConfigureWorkloadIdentityCredential()); EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); } @@ -354,7 +382,7 @@ class TestAzureFileSystem : public ::testing::Test { bool set_up_succeeded_ = false; AzureOptions options_; - std::shared_ptr fs_; + std::shared_ptr fs_dont_use_directly_; // use fs() std::unique_ptr blob_service_client_; std::unique_ptr datalake_service_client_; @@ -362,9 +390,22 @@ class TestAzureFileSystem : public ::testing::Test { TestAzureFileSystem() : rng_(std::random_device()()) {} virtual Result GetAzureEnv() const = 0; + virtual HNSSupport CachedHNSSupport(const BaseAzureEnv& env) const = 0; + + FileSystem* fs(HNSSupport cached_hns_support) const { + auto* fs_ptr = fs_dont_use_directly_.get(); + fs_ptr->ForceCachedHierarchicalNamespaceSupport(static_cast(cached_hns_support)); + return fs_ptr; + } + + FileSystem* fs() const { + EXPECT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + return fs(CachedHNSSupport(*env)); + } static Result MakeOptions(BaseAzureEnv* env) { AzureOptions options; + options.account_name = env->account_name(); switch (env->backend()) { case AzureBackend::kAzurite: options.blob_storage_authority = "127.0.0.1:10000"; @@ -376,8 +417,7 @@ class TestAzureFileSystem : public ::testing::Test { // Use the default values break; } - ARROW_EXPECT_OK( - options.ConfigureAccountKeyCredential(env->account_name(), env->account_key())); + ARROW_EXPECT_OK(options.ConfigureAccountKeyCredential(env->account_key())); return options; } @@ -395,7 +435,7 @@ class TestAzureFileSystem : public ::testing::Test { EXPECT_OK_AND_ASSIGN(options_, options_res); } - ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); + ASSERT_OK_AND_ASSIGN(fs_dont_use_directly_, AzureFileSystem::Make(options_)); EXPECT_OK_AND_ASSIGN(blob_service_client_, options_.MakeBlobServiceClient()); EXPECT_OK_AND_ASSIGN(datalake_service_client_, options_.MakeDataLakeServiceClient()); set_up_succeeded_ = true; @@ -435,7 +475,7 @@ class TestAzureFileSystem : public ::testing::Test { void UploadLines(const std::vector& lines, const std::string& path, int total_size) { - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, 
{})); const auto all_lines = std::accumulate(lines.begin(), lines.end(), std::string("")); ASSERT_OK(output->Write(all_lines)); ASSERT_OK(output->Close()); @@ -461,19 +501,19 @@ class TestAzureFileSystem : public ::testing::Test { const auto sub_directory_path = ConcatAbstractPath(directory_path, "new-sub"); const auto sub_blob_path = ConcatAbstractPath(sub_directory_path, "sub.txt"); const auto top_blob_path = ConcatAbstractPath(directory_path, "top.txt"); - ASSERT_OK(fs_->CreateDir(sub_directory_path, true)); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(sub_blob_path)); + ASSERT_OK(fs()->CreateDir(sub_directory_path, true)); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(sub_blob_path)); ASSERT_OK(output->Write(std::string_view("sub"))); ASSERT_OK(output->Close()); - ASSERT_OK_AND_ASSIGN(output, fs_->OpenOutputStream(top_blob_path)); + ASSERT_OK_AND_ASSIGN(output, fs()->OpenOutputStream(top_blob_path)); ASSERT_OK(output->Write(std::string_view("top"))); ASSERT_OK(output->Close()); - AssertFileInfo(fs_.get(), data.container_name, FileType::Directory); - AssertFileInfo(fs_.get(), directory_path, FileType::Directory); - AssertFileInfo(fs_.get(), sub_directory_path, FileType::Directory); - AssertFileInfo(fs_.get(), sub_blob_path, FileType::File); - AssertFileInfo(fs_.get(), top_blob_path, FileType::File); + AssertFileInfo(fs(), data.container_name, FileType::Directory); + AssertFileInfo(fs(), directory_path, FileType::Directory); + AssertFileInfo(fs(), sub_directory_path, FileType::Directory); + AssertFileInfo(fs(), sub_blob_path, FileType::File); + AssertFileInfo(fs(), top_blob_path, FileType::File); paths->container = data.container_name; paths->directory = directory_path; @@ -538,52 +578,52 @@ class TestAzureFileSystem : public ::testing::Test { const auto directory_path = data.RandomDirectoryPath(rng_); if (WithHierarchicalNamespace()) { - ASSERT_OK(fs_->CreateDir(directory_path, true)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::Directory); - ASSERT_OK(fs_->DeleteDir(directory_path)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(directory_path, true)); + AssertFileInfo(fs(), directory_path, FileType::Directory); + ASSERT_OK(fs()->DeleteDir(directory_path)); + AssertFileInfo(fs(), directory_path, FileType::NotFound); } else { // There is only virtual directory without hierarchical namespace // support. So the CreateDir() and DeleteDir() do nothing. - ASSERT_OK(fs_->CreateDir(directory_path)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); - ASSERT_OK(fs_->DeleteDir(directory_path)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(directory_path)); + AssertFileInfo(fs(), directory_path, FileType::NotFound); + ASSERT_OK(fs()->DeleteDir(directory_path)); + AssertFileInfo(fs(), directory_path, FileType::NotFound); } } void TestCreateDirSuccessContainerAndDirectory() { auto data = SetUpPreexistingData(); const auto path = data.RandomDirectoryPath(rng_); - ASSERT_OK(fs_->CreateDir(path, false)); + ASSERT_OK(fs()->CreateDir(path, false)); if (WithHierarchicalNamespace()) { - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory); + AssertFileInfo(fs(), path, FileType::Directory); } else { // There is only virtual directory without hierarchical namespace // support. So the CreateDir() does nothing. 
- arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound); + AssertFileInfo(fs(), path, FileType::NotFound); } } void TestCreateDirRecursiveSuccessContainerOnly() { auto container_name = PreexistingData::RandomContainerName(rng_); - ASSERT_OK(fs_->CreateDir(container_name, true)); - arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory); + ASSERT_OK(fs()->CreateDir(container_name, true)); + AssertFileInfo(fs(), container_name, FileType::Directory); } void TestCreateDirRecursiveSuccessDirectoryOnly() { auto data = SetUpPreexistingData(); const auto parent = data.RandomDirectoryPath(rng_); const auto path = ConcatAbstractPath(parent, "new-sub"); - ASSERT_OK(fs_->CreateDir(path, true)); + ASSERT_OK(fs()->CreateDir(path, true)); if (WithHierarchicalNamespace()) { - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory); + AssertFileInfo(fs(), path, FileType::Directory); + AssertFileInfo(fs(), parent, FileType::Directory); } else { // There is only virtual directory without hierarchical namespace // support. So the CreateDir() does nothing. - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound); + AssertFileInfo(fs(), path, FileType::NotFound); + AssertFileInfo(fs(), parent, FileType::NotFound); } } @@ -591,31 +631,31 @@ class TestAzureFileSystem : public ::testing::Test { auto data = SetUpPreexistingData(); const auto parent = data.RandomDirectoryPath(rng_); const auto path = ConcatAbstractPath(parent, "new-sub"); - ASSERT_OK(fs_->CreateDir(path, true)); + ASSERT_OK(fs()->CreateDir(path, true)); if (WithHierarchicalNamespace()) { - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory); - arrow::fs::AssertFileInfo(fs_.get(), data.container_name, FileType::Directory); + AssertFileInfo(fs(), path, FileType::Directory); + AssertFileInfo(fs(), parent, FileType::Directory); + AssertFileInfo(fs(), data.container_name, FileType::Directory); } else { // There is only virtual directory without hierarchical namespace // support. So the CreateDir() does nothing. 
- arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound); - arrow::fs::AssertFileInfo(fs_.get(), data.container_name, FileType::Directory); + AssertFileInfo(fs(), path, FileType::NotFound); + AssertFileInfo(fs(), parent, FileType::NotFound); + AssertFileInfo(fs(), data.container_name, FileType::Directory); } } void TestDeleteDirContentsSuccessNonexistent() { auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); - ASSERT_OK(fs_->DeleteDirContents(directory_path, true)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); + ASSERT_OK(fs()->DeleteDirContents(directory_path, true)); + AssertFileInfo(fs(), directory_path, FileType::NotFound); } void TestDeleteDirContentsFailureNonexistent() { auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); - ASSERT_RAISES(IOError, fs_->DeleteDirContents(directory_path, false)); + ASSERT_RAISES(IOError, fs()->DeleteDirContents(directory_path, false)); } }; @@ -672,12 +712,12 @@ void TestAzureFileSystem::TestGetFileInfoObject() { .GetProperties() .Value; - AssertFileInfo(fs_.get(), data.ObjectPath(), FileType::File, + AssertFileInfo(fs(), data.ObjectPath(), FileType::File, std::chrono::system_clock::time_point{object_properties.LastModified}, static_cast(object_properties.BlobSize)); // URI - ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + std::string{data.kObjectName})); + ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://" + std::string{data.kObjectName})); } void TestAzureFileSystem::TestGetFileInfoObjectWithNestedStructure() { @@ -685,37 +725,37 @@ void TestAzureFileSystem::TestGetFileInfoObjectWithNestedStructure() { // Adds detailed tests to handle cases of different edge cases // with directory naming conventions (e.g. with and without slashes). const std::string kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(data.ContainerPath(kObjectName), - /*metadata=*/{})); + ASSERT_OK_AND_ASSIGN(auto output, + fs()->OpenOutputStream(data.ContainerPath(kObjectName), + /*metadata=*/{})); const std::string_view lorem_ipsum(PreexistingData::kLoremIpsum); ASSERT_OK(output->Write(lorem_ipsum)); ASSERT_OK(output->Close()); // 0 is immediately after "/" lexicographically, ensure that this doesn't // cause unexpected issues. 
- ASSERT_OK_AND_ASSIGN( - output, fs_->OpenOutputStream(data.ContainerPath("test-object-dir/some_other_dir0"), - /*metadata=*/{})); + ASSERT_OK_AND_ASSIGN(output, fs()->OpenOutputStream( + data.ContainerPath("test-object-dir/some_other_dir0"), + /*metadata=*/{})); ASSERT_OK(output->Write(lorem_ipsum)); ASSERT_OK(output->Close()); ASSERT_OK_AND_ASSIGN(output, - fs_->OpenOutputStream(data.ContainerPath(kObjectName + "0"), - /*metadata=*/{})); + fs()->OpenOutputStream(data.ContainerPath(kObjectName + "0"), + /*metadata=*/{})); ASSERT_OK(output->Write(lorem_ipsum)); ASSERT_OK(output->Close()); - AssertFileInfo(fs_.get(), data.ContainerPath(kObjectName), FileType::File); - AssertFileInfo(fs_.get(), data.ContainerPath(kObjectName) + "/", FileType::NotFound); - AssertFileInfo(fs_.get(), data.ContainerPath("test-object-dir"), FileType::Directory); - AssertFileInfo(fs_.get(), data.ContainerPath("test-object-dir") + "/", - FileType::Directory); - AssertFileInfo(fs_.get(), data.ContainerPath("test-object-dir/some_other_dir"), + AssertFileInfo(fs(), data.ContainerPath(kObjectName), FileType::File); + AssertFileInfo(fs(), data.ContainerPath(kObjectName) + "/", FileType::NotFound); + AssertFileInfo(fs(), data.ContainerPath("test-object-dir"), FileType::Directory); + AssertFileInfo(fs(), data.ContainerPath("test-object-dir") + "/", FileType::Directory); + AssertFileInfo(fs(), data.ContainerPath("test-object-dir/some_other_dir"), FileType::Directory); - AssertFileInfo(fs_.get(), data.ContainerPath("test-object-dir/some_other_dir") + "/", + AssertFileInfo(fs(), data.ContainerPath("test-object-dir/some_other_dir") + "/", FileType::Directory); - AssertFileInfo(fs_.get(), data.ContainerPath("test-object-di"), FileType::NotFound); - AssertFileInfo(fs_.get(), data.ContainerPath("test-object-dir/some_other_di"), + AssertFileInfo(fs(), data.ContainerPath("test-object-di"), FileType::NotFound); + AssertFileInfo(fs(), data.ContainerPath("test-object-dir/some_other_di"), FileType::NotFound); if (WithHierarchicalNamespace()) { @@ -723,17 +763,45 @@ void TestAzureFileSystem::TestGetFileInfoObjectWithNestedStructure() { .GetDirectoryClient("test-empty-object-dir") .Create(); - AssertFileInfo(fs_.get(), data.ContainerPath("test-empty-object-dir"), + AssertFileInfo(fs(), data.ContainerPath("test-empty-object-dir"), FileType::Directory); } } -template +template +struct TestingScenario { + using AzureEnvClass = AzureEnv; + static constexpr bool kHNSSupportShouldBeKnown = HNSSupportShouldBeKnown; +}; + +template class AzureFileSystemTestImpl : public TestAzureFileSystem { public: + using AzureEnvClass = typename TestingScenario::AzureEnvClass; + using TestAzureFileSystem::TestAzureFileSystem; Result GetAzureEnv() const final { return AzureEnvClass::GetInstance(); } + + /// \brief HNSSupport value that should be assumed as the cached + /// HNSSupport on every fs()->Operation(...) call in tests. + /// + /// If TestingScenario::kHNSSupportShouldBeKnown is true, this value + /// will be HNSSupport::kEnabled or HNSSupport::kDisabled, depending + /// on the environment. Otherwise, this value will be HNSSupport::kUnknown. + /// + /// This ensures all the branches in the AzureFileSystem code operations are tested. + /// For instance, many operations executed on a missing container, wouldn't + /// get a HNSSupport::kContainerNotFound error if the cached HNSSupport was + /// already known due to a previous operation that cached the HNSSupport value. 
+ HNSSupport CachedHNSSupport(const BaseAzureEnv& env) const final { + if constexpr (TestingScenario::kHNSSupportShouldBeKnown) { + return env.WithHierarchicalNamespace() ? HNSSupport::kEnabled + : HNSSupport::kDisabled; + } else { + return HNSSupport::kUnknown; + } + } }; // How to enable the non-Azurite tests: @@ -762,54 +830,71 @@ class AzureFileSystemTestImpl : public TestAzureFileSystem { // [1]: https://azure.microsoft.com/en-gb/free/ // [2]: // https://learn.microsoft.com/en-us/azure/storage/blobs/create-data-lake-storage-account -using TestAzureFlatNSFileSystem = AzureFileSystemTestImpl; -using TestAzureHierarchicalNSFileSystem = AzureFileSystemTestImpl; -using TestAzuriteFileSystem = AzureFileSystemTestImpl; +using TestAzureFlatNSFileSystem = + AzureFileSystemTestImpl>; +using TestAzureHierarchicalNSFileSystem = + AzureFileSystemTestImpl>; +using TestAzuriteFileSystem = AzureFileSystemTestImpl>; -// Tests using all the 3 environments (Azurite, Azure w/o HNS (flat), Azure w/ HNS) - -template -using AzureFileSystemTestOnAllEnvs = AzureFileSystemTestImpl; +// Tests using all the 3 environments (Azurite, Azure w/o HNS (flat), Azure w/ HNS). +template +using TestAzureFileSystemOnAllEnvs = AzureFileSystemTestImpl; using AllEnvironments = - ::testing::Types; + ::testing::Types, TestingScenario, + TestingScenario>; -TYPED_TEST_SUITE(AzureFileSystemTestOnAllEnvs, AllEnvironments); +TYPED_TEST_SUITE(TestAzureFileSystemOnAllEnvs, AllEnvironments); -TYPED_TEST(AzureFileSystemTestOnAllEnvs, DetectHierarchicalNamespace) { +TYPED_TEST(TestAzureFileSystemOnAllEnvs, DetectHierarchicalNamespace) { this->TestDetectHierarchicalNamespace(true); this->TestDetectHierarchicalNamespace(false); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, DetectHierarchicalNamespaceOnMissingContainer) { +TYPED_TEST(TestAzureFileSystemOnAllEnvs, DetectHierarchicalNamespaceOnMissingContainer) { this->TestDetectHierarchicalNamespaceOnMissingContainer(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, GetFileInfoObject) { +// Tests using all the 3 environments (Azurite, Azure w/o HNS (flat), Azure w/ HNS) +// combined with the two scenarios for AzureFileSystem::cached_hns_support_ -- unknown and +// known according to the environment. 
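Editor's note: for readers unfamiliar with the pattern used above, here is a minimal, self-contained sketch (hypothetical names, not the Arrow test code) of how a scenario trait that pairs an environment type with a compile-time flag lets one typed fixture exercise both the "cached HNS support already known" and "unknown" branches via `if constexpr`.

```cpp
// Sketch of the compile-time "scenario" pattern: both branches are compiled,
// and each scenario instantiation runs the full typed test suite.
#include <gtest/gtest.h>

enum class Support { kUnknown, kEnabled, kDisabled };

template <class Env, bool kKnown>
struct Scenario {
  using EnvClass = Env;
  static constexpr bool kSupportShouldBeKnown = kKnown;
};

template <class TScenario>
class FixtureImpl : public ::testing::Test {
 public:
  Support CachedSupport(bool env_has_feature) const {
    if constexpr (TScenario::kSupportShouldBeKnown) {
      return env_has_feature ? Support::kEnabled : Support::kDisabled;
    } else {
      return Support::kUnknown;
    }
  }
};

// A ::testing::Types<Scenario<..., false>, Scenario<..., true>, ...> list would
// then feed TYPED_TEST_SUITE so every test body runs under every scenario.
```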
+template +using TestAzureFileSystemOnAllScenarios = AzureFileSystemTestImpl; + +using AllScenarios = ::testing::Types< + TestingScenario, TestingScenario, + TestingScenario, TestingScenario, + TestingScenario, + TestingScenario>; + +TYPED_TEST_SUITE(TestAzureFileSystemOnAllScenarios, AllScenarios); + +TYPED_TEST(TestAzureFileSystemOnAllScenarios, GetFileInfoObject) { this->TestGetFileInfoObject(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, DeleteDirSuccessEmpty) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirSuccessEmpty) { this->TestDeleteDirSuccessEmpty(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, GetFileInfoObjectWithNestedStructure) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, GetFileInfoObjectWithNestedStructure) { this->TestGetFileInfoObjectWithNestedStructure(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, CreateDirSuccessContainerAndDirectory) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirSuccessContainerAndDirectory) { this->TestCreateDirSuccessContainerAndDirectory(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, CreateDirRecursiveSuccessContainerOnly) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirRecursiveSuccessContainerOnly) { this->TestCreateDirRecursiveSuccessContainerOnly(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, CreateDirRecursiveSuccessDirectoryOnly) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirRecursiveSuccessDirectoryOnly) { this->TestCreateDirRecursiveSuccessDirectoryOnly(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, CreateDirRecursiveSuccessContainerAndDirectory) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, + CreateDirRecursiveSuccessContainerAndDirectory) { this->TestCreateDirRecursiveSuccessContainerAndDirectory(); } @@ -818,41 +903,41 @@ TYPED_TEST(AzureFileSystemTestOnAllEnvs, CreateDirRecursiveSuccessContainerAndDi TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirFailureNonexistent) { auto data = SetUpPreexistingData(); const auto path = data.RandomDirectoryPath(rng_); - ASSERT_RAISES(IOError, fs_->DeleteDir(path)); + ASSERT_RAISES(IOError, fs()->DeleteDir(path)); } TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirSuccessHaveBlob) { auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); const auto blob_path = ConcatAbstractPath(directory_path, "hello.txt"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(blob_path)); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(blob_path)); ASSERT_OK(output->Write(std::string_view("hello"))); ASSERT_OK(output->Close()); - arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::File); - ASSERT_OK(fs_->DeleteDir(directory_path)); - arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::NotFound); + AssertFileInfo(fs(), blob_path, FileType::File); + ASSERT_OK(fs()->DeleteDir(directory_path)); + AssertFileInfo(fs(), blob_path, FileType::NotFound); } TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirSuccessHaveDirectory) { auto data = SetUpPreexistingData(); const auto parent = data.RandomDirectoryPath(rng_); const auto path = ConcatAbstractPath(parent, "new-sub"); - ASSERT_OK(fs_->CreateDir(path, true)); - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory); - ASSERT_OK(fs_->DeleteDir(parent)); - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(path, true)); + AssertFileInfo(fs(), path, FileType::Directory); + 
AssertFileInfo(fs(), parent, FileType::Directory); + ASSERT_OK(fs()->DeleteDir(parent)); + AssertFileInfo(fs(), path, FileType::NotFound); + AssertFileInfo(fs(), parent, FileType::NotFound); } TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirContentsSuccessExist) { auto preexisting_data = SetUpPreexistingData(); HierarchicalPaths paths; CreateHierarchicalData(&paths); - ASSERT_OK(fs_->DeleteDirContents(paths.directory)); - arrow::fs::AssertFileInfo(fs_.get(), paths.directory, FileType::Directory); + ASSERT_OK(fs()->DeleteDirContents(paths.directory)); + AssertFileInfo(fs(), paths.directory, FileType::Directory); for (const auto& sub_path : paths.sub_paths) { - arrow::fs::AssertFileInfo(fs_.get(), sub_path, FileType::NotFound); + AssertFileInfo(fs(), sub_path, FileType::NotFound); } } @@ -867,20 +952,20 @@ TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirContentsFailureNonexistent) { // Tests using Azurite (the local Azure emulator) TEST_F(TestAzuriteFileSystem, GetFileInfoAccount) { - AssertFileInfo(fs_.get(), "", FileType::Directory); + AssertFileInfo(fs(), "", FileType::Directory); // URI - ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://")); + ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://")); } TEST_F(TestAzuriteFileSystem, GetFileInfoContainer) { auto data = SetUpPreexistingData(); - AssertFileInfo(fs_.get(), data.container_name, FileType::Directory); + AssertFileInfo(fs(), data.container_name, FileType::Directory); - AssertFileInfo(fs_.get(), "nonexistent-container", FileType::NotFound); + AssertFileInfo(fs(), "nonexistent-container", FileType::NotFound); // URI - ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + data.container_name)); + ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://" + data.container_name)); } TEST_F(TestAzuriteFileSystem, GetFileInfoSelector) { @@ -891,7 +976,7 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelector) { // Root dir select.base_dir = ""; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 2); ASSERT_EQ(infos, SortedInfos(infos)); AssertFileInfo(infos[0], "container", FileType::Directory); @@ -899,18 +984,18 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelector) { // Empty container select.base_dir = "empty-container"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); // Nonexistent container select.base_dir = "nonexistent-container"; - ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + ASSERT_RAISES(IOError, fs()->GetFileInfo(select)); select.allow_not_found = true; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); select.allow_not_found = false; // Non-empty container select.base_dir = "container"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos, SortedInfos(infos)); ASSERT_EQ(infos.size(), 4); AssertFileInfo(infos[0], "container/emptydir", FileType::Directory); @@ -920,33 +1005,33 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelector) { // Empty "directory" select.base_dir = "container/emptydir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); // Non-empty "directories" select.base_dir = "container/somedir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, 
fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 1); AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory); select.base_dir = "container/somedir/subdir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 1); AssertFileInfo(infos[0], "container/somedir/subdir/subfile", FileType::File, 8); // Nonexistent select.base_dir = "container/nonexistent"; - ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + ASSERT_RAISES(IOError, fs()->GetFileInfo(select)); select.allow_not_found = true; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); select.allow_not_found = false; // Trailing slashes select.base_dir = "empty-container/"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); select.base_dir = "nonexistent-container/"; - ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + ASSERT_RAISES(IOError, fs()->GetFileInfo(select)); select.base_dir = "container/"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos, SortedInfos(infos)); ASSERT_EQ(infos.size(), 4); } @@ -960,19 +1045,19 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelectorRecursive) { std::vector infos; // Root dir select.base_dir = ""; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 12); ASSERT_EQ(infos, SortedInfos(infos)); AssertInfoAllContainersRecursive(infos); // Empty container select.base_dir = "empty-container"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); // Non-empty container select.base_dir = "container"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos, SortedInfos(infos)); ASSERT_EQ(infos.size(), 10); AssertFileInfo(infos[0], "container/emptydir", FileType::Directory); @@ -988,19 +1073,19 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelectorRecursive) { // Empty "directory" select.base_dir = "container/emptydir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); // Non-empty "directories" select.base_dir = "container/somedir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos, SortedInfos(infos)); ASSERT_EQ(infos.size(), 2); AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory); AssertFileInfo(infos[1], "container/somedir/subdir/subfile", FileType::File, 8); select.base_dir = "container/otherdir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos, SortedInfos(infos)); ASSERT_EQ(infos.size(), 4); AssertFileInfo(infos[0], "container/otherdir/1", FileType::Directory); @@ -1023,13 +1108,13 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelectorExplicitImplicitDirDedup) { FileSelector select; // non-recursive select.base_dir = "container"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 1); ASSERT_EQ(infos, SortedInfos(infos)); 
AssertFileInfo(infos[0], "container/mydir", FileType::Directory); select.base_dir = "container/mydir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 4); ASSERT_EQ(infos, SortedInfos(infos)); AssertFileInfo(infos[0], "container/mydir/emptydir1", FileType::Directory); @@ -1038,55 +1123,55 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelectorExplicitImplicitDirDedup) { AssertFileInfo(infos[3], "container/mydir/nonemptydir2", FileType::Directory); select.base_dir = "container/mydir/emptydir1"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); select.base_dir = "container/mydir/emptydir2"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); select.base_dir = "container/mydir/nonemptydir1"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 1); AssertFileInfo(infos[0], "container/mydir/nonemptydir1/somefile", FileType::File); select.base_dir = "container/mydir/nonemptydir2"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 1); AssertFileInfo(infos[0], "container/mydir/nonemptydir2/somefile", FileType::File); } TEST_F(TestAzuriteFileSystem, CreateDirFailureNoContainer) { - ASSERT_RAISES(Invalid, fs_->CreateDir("", false)); + ASSERT_RAISES(Invalid, fs()->CreateDir("", false)); } TEST_F(TestAzuriteFileSystem, CreateDirSuccessContainerOnly) { auto container_name = PreexistingData::RandomContainerName(rng_); - ASSERT_OK(fs_->CreateDir(container_name, false)); - arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory); + ASSERT_OK(fs()->CreateDir(container_name, false)); + AssertFileInfo(fs(), container_name, FileType::Directory); } TEST_F(TestAzuriteFileSystem, CreateDirFailureDirectoryWithMissingContainer) { const auto path = std::string("not-a-container/new-directory"); - ASSERT_RAISES(IOError, fs_->CreateDir(path, false)); + ASSERT_RAISES(IOError, fs()->CreateDir(path, false)); } TEST_F(TestAzuriteFileSystem, CreateDirRecursiveFailureNoContainer) { - ASSERT_RAISES(Invalid, fs_->CreateDir("", true)); + ASSERT_RAISES(Invalid, fs()->CreateDir("", true)); } TEST_F(TestAzuriteFileSystem, CreateDirUri) { ASSERT_RAISES( Invalid, - fs_->CreateDir("abfs://" + PreexistingData::RandomContainerName(rng_), true)); + fs()->CreateDir("abfs://" + PreexistingData::RandomContainerName(rng_), true)); } TEST_F(TestAzuriteFileSystem, DeleteDirSuccessContainer) { const auto container_name = PreexistingData::RandomContainerName(rng_); - ASSERT_OK(fs_->CreateDir(container_name)); - arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory); - ASSERT_OK(fs_->DeleteDir(container_name)); - arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(container_name)); + AssertFileInfo(fs(), container_name, FileType::Directory); + ASSERT_OK(fs()->DeleteDir(container_name)); + AssertFileInfo(fs(), container_name, FileType::NotFound); } TEST_F(TestAzuriteFileSystem, DeleteDirSuccessNonexistent) { @@ -1094,8 +1179,8 @@ TEST_F(TestAzuriteFileSystem, DeleteDirSuccessNonexistent) { const auto directory_path = data.RandomDirectoryPath(rng_); // There is only virtual directory without hierarchical namespace // 
support. So the DeleteDir() for nonexistent directory does nothing. - ASSERT_OK(fs_->DeleteDir(directory_path)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); + ASSERT_OK(fs()->DeleteDir(directory_path)); + AssertFileInfo(fs(), directory_path, FileType::NotFound); } TEST_F(TestAzuriteFileSystem, DeleteDirSuccessHaveBlobs) { @@ -1110,21 +1195,21 @@ TEST_F(TestAzuriteFileSystem, DeleteDirSuccessHaveBlobs) { int64_t n_blobs = 257; for (int64_t i = 0; i < n_blobs; ++i) { const auto blob_path = ConcatAbstractPath(directory_path, std::to_string(i) + ".txt"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(blob_path)); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(blob_path)); ASSERT_OK(output->Write(std::string_view(std::to_string(i)))); ASSERT_OK(output->Close()); - arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::File); + AssertFileInfo(fs(), blob_path, FileType::File); } - ASSERT_OK(fs_->DeleteDir(directory_path)); + ASSERT_OK(fs()->DeleteDir(directory_path)); for (int64_t i = 0; i < n_blobs; ++i) { const auto blob_path = ConcatAbstractPath(directory_path, std::to_string(i) + ".txt"); - arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::NotFound); + AssertFileInfo(fs(), blob_path, FileType::NotFound); } } TEST_F(TestAzuriteFileSystem, DeleteDirUri) { auto data = SetUpPreexistingData(); - ASSERT_RAISES(Invalid, fs_->DeleteDir("abfs://" + data.container_name + "/")); + ASSERT_RAISES(Invalid, fs()->DeleteDir("abfs://" + data.container_name + "/")); } TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessContainer) { @@ -1135,11 +1220,11 @@ TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessContainer) { auto data = SetUpPreexistingData(); HierarchicalPaths paths; CreateHierarchicalData(&paths); - ASSERT_OK(fs_->DeleteDirContents(paths.container)); - arrow::fs::AssertFileInfo(fs_.get(), paths.container, FileType::Directory); - arrow::fs::AssertFileInfo(fs_.get(), paths.directory, FileType::NotFound); + ASSERT_OK(fs()->DeleteDirContents(paths.container)); + AssertFileInfo(fs(), paths.container, FileType::Directory); + AssertFileInfo(fs(), paths.directory, FileType::NotFound); for (const auto& sub_path : paths.sub_paths) { - arrow::fs::AssertFileInfo(fs_.get(), sub_path, FileType::NotFound); + AssertFileInfo(fs(), sub_path, FileType::NotFound); } } @@ -1151,11 +1236,11 @@ TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessDirectory) { auto data = SetUpPreexistingData(); HierarchicalPaths paths; CreateHierarchicalData(&paths); - ASSERT_OK(fs_->DeleteDirContents(paths.directory)); + ASSERT_OK(fs()->DeleteDirContents(paths.directory)); // GH-38772: We may change this to FileType::Directory. 
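Editor's note: the 257-blob count in DeleteDirSuccessHaveBlobs above is presumably chosen to cross a one-request batch boundary (Azure Blob Batch accepts at most 256 subrequests per request). A generic sketch of the chunking such a deletion needs, with a hypothetical `delete_batch` callback standing in for the actual bulk-delete call:

```cpp
// Sketch only: delete a directory's blobs in batches of at most kMaxBatch.
#include <algorithm>
#include <cstddef>
#include <string>
#include <vector>

constexpr std::size_t kMaxBatch = 256;  // assumed per-request limit

template <typename DeleteBatchFn>
void DeleteAll(const std::vector<std::string>& blob_paths,
               DeleteBatchFn&& delete_batch) {
  for (std::size_t begin = 0; begin < blob_paths.size(); begin += kMaxBatch) {
    const std::size_t end = std::min(begin + kMaxBatch, blob_paths.size());
    // Each call issues one batched request with at most kMaxBatch deletions.
    delete_batch(std::vector<std::string>(blob_paths.begin() + begin,
                                          blob_paths.begin() + end));
  }
}
```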
- arrow::fs::AssertFileInfo(fs_.get(), paths.directory, FileType::NotFound); + AssertFileInfo(fs(), paths.directory, FileType::NotFound); for (const auto& sub_path : paths.sub_paths) { - arrow::fs::AssertFileInfo(fs_.get(), sub_path, FileType::NotFound); + AssertFileInfo(fs(), sub_path, FileType::NotFound); } } @@ -1170,52 +1255,52 @@ TEST_F(TestAzuriteFileSystem, DeleteDirContentsFailureNonexistent) { TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationNonexistent) { auto data = SetUpPreexistingData(); const auto destination_path = data.ContainerPath("copy-destionation"); - ASSERT_OK(fs_->CopyFile(data.ObjectPath(), destination_path)); - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(destination_path)); - ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(info)); + ASSERT_OK(fs()->CopyFile(data.ObjectPath(), destination_path)); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(destination_path)); + ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputStream(info)); ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); EXPECT_EQ(PreexistingData::kLoremIpsum, buffer->ToString()); } TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationSame) { auto data = SetUpPreexistingData(); - ASSERT_OK(fs_->CopyFile(data.ObjectPath(), data.ObjectPath())); - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(data.ObjectPath())); - ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(info)); + ASSERT_OK(fs()->CopyFile(data.ObjectPath(), data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputStream(info)); ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); EXPECT_EQ(PreexistingData::kLoremIpsum, buffer->ToString()); } TEST_F(TestAzuriteFileSystem, CopyFileFailureDestinationTrailingSlash) { auto data = SetUpPreexistingData(); - ASSERT_RAISES(IOError, fs_->CopyFile(data.ObjectPath(), - internal::EnsureTrailingSlash(data.ObjectPath()))); + ASSERT_RAISES(IOError, fs()->CopyFile(data.ObjectPath(), internal::EnsureTrailingSlash( + data.ObjectPath()))); } TEST_F(TestAzuriteFileSystem, CopyFileFailureSourceNonexistent) { auto data = SetUpPreexistingData(); const auto destination_path = data.ContainerPath("copy-destionation"); - ASSERT_RAISES(IOError, fs_->CopyFile(data.NotFoundObjectPath(), destination_path)); + ASSERT_RAISES(IOError, fs()->CopyFile(data.NotFoundObjectPath(), destination_path)); } TEST_F(TestAzuriteFileSystem, CopyFileFailureDestinationParentNonexistent) { auto data = SetUpPreexistingData(); const auto destination_path = ConcatAbstractPath(PreexistingData::RandomContainerName(rng_), "copy-destionation"); - ASSERT_RAISES(IOError, fs_->CopyFile(data.ObjectPath(), destination_path)); + ASSERT_RAISES(IOError, fs()->CopyFile(data.ObjectPath(), destination_path)); } TEST_F(TestAzuriteFileSystem, CopyFileUri) { auto data = SetUpPreexistingData(); const auto destination_path = data.ContainerPath("copy-destionation"); - ASSERT_RAISES(Invalid, fs_->CopyFile("abfs://" + data.ObjectPath(), destination_path)); - ASSERT_RAISES(Invalid, fs_->CopyFile(data.ObjectPath(), "abfs://" + destination_path)); + ASSERT_RAISES(Invalid, fs()->CopyFile("abfs://" + data.ObjectPath(), destination_path)); + ASSERT_RAISES(Invalid, fs()->CopyFile(data.ObjectPath(), "abfs://" + destination_path)); } TEST_F(TestAzuriteFileSystem, OpenInputStreamString) { auto data = SetUpPreexistingData(); std::shared_ptr stream; - ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(stream, 
fs()->OpenInputStream(data.ObjectPath())); ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); EXPECT_EQ(buffer->ToString(), PreexistingData::kLoremIpsum); @@ -1224,7 +1309,7 @@ TEST_F(TestAzuriteFileSystem, OpenInputStreamString) { TEST_F(TestAzuriteFileSystem, OpenInputStreamStringBuffers) { auto data = SetUpPreexistingData(); std::shared_ptr stream; - ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(stream, fs()->OpenInputStream(data.ObjectPath())); std::string contents; std::shared_ptr buffer; @@ -1238,10 +1323,10 @@ TEST_F(TestAzuriteFileSystem, OpenInputStreamStringBuffers) { TEST_F(TestAzuriteFileSystem, OpenInputStreamInfo) { auto data = SetUpPreexistingData(); - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(data.ObjectPath())); std::shared_ptr stream; - ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(info)); + ASSERT_OK_AND_ASSIGN(stream, fs()->OpenInputStream(info)); ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); EXPECT_EQ(buffer->ToString(), PreexistingData::kLoremIpsum); @@ -1255,7 +1340,7 @@ TEST_F(TestAzuriteFileSystem, OpenInputStreamEmpty) { .GetBlockBlobClient(path_to_file) .UploadFrom(nullptr, 0); - ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputStream(path)); std::array buffer{}; std::int64_t size; ASSERT_OK_AND_ASSIGN(size, stream->Read(buffer.size(), buffer.data())); @@ -1264,26 +1349,26 @@ TEST_F(TestAzuriteFileSystem, OpenInputStreamEmpty) { TEST_F(TestAzuriteFileSystem, OpenInputStreamNotFound) { auto data = SetUpPreexistingData(); - ASSERT_RAISES(IOError, fs_->OpenInputStream(data.NotFoundObjectPath())); + ASSERT_RAISES(IOError, fs()->OpenInputStream(data.NotFoundObjectPath())); } TEST_F(TestAzuriteFileSystem, OpenInputStreamInfoInvalid) { auto data = SetUpPreexistingData(); - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(data.container_name + "/")); - ASSERT_RAISES(IOError, fs_->OpenInputStream(info)); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(data.container_name + "/")); + ASSERT_RAISES(IOError, fs()->OpenInputStream(info)); - ASSERT_OK_AND_ASSIGN(auto info2, fs_->GetFileInfo(data.NotFoundObjectPath())); - ASSERT_RAISES(IOError, fs_->OpenInputStream(info2)); + ASSERT_OK_AND_ASSIGN(auto info2, fs()->GetFileInfo(data.NotFoundObjectPath())); + ASSERT_RAISES(IOError, fs()->OpenInputStream(info2)); } TEST_F(TestAzuriteFileSystem, OpenInputStreamUri) { auto data = SetUpPreexistingData(); - ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfs://" + data.ObjectPath())); + ASSERT_RAISES(Invalid, fs()->OpenInputStream("abfs://" + data.ObjectPath())); } TEST_F(TestAzuriteFileSystem, OpenInputStreamTrailingSlash) { auto data = SetUpPreexistingData(); - ASSERT_RAISES(IOError, fs_->OpenInputStream(data.ObjectPath() + '/')); + ASSERT_RAISES(IOError, fs()->OpenInputStream(data.ObjectPath() + '/')); } namespace { @@ -1324,7 +1409,7 @@ std::shared_ptr NormalizerKeyValueMetadata( TEST_F(TestAzuriteFileSystem, OpenInputStreamReadMetadata) { auto data = SetUpPreexistingData(); std::shared_ptr stream; - ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(stream, fs()->OpenInputStream(data.ObjectPath())); std::shared_ptr actual; ASSERT_OK_AND_ASSIGN(actual, stream->ReadMetadata()); @@ -1354,7 +1439,7 @@ TEST_F(TestAzuriteFileSystem, OpenInputStreamReadMetadata) { TEST_F(TestAzuriteFileSystem, OpenInputStreamClosed) { auto data = 
SetUpPreexistingData(); - ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputStream(data.ObjectPath())); ASSERT_OK(stream->Close()); std::array buffer{}; ASSERT_RAISES(Invalid, stream->Read(buffer.size(), buffer.data())); @@ -1399,13 +1484,13 @@ TEST_F(TestAzuriteFileSystem, WriteMetadata) { TEST_F(TestAzuriteFileSystem, OpenOutputStreamSmall) { auto data = SetUpPreexistingData(); const auto path = data.ContainerPath("test-write-object"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); const std::string_view expected(PreexistingData::kLoremIpsum); ASSERT_OK(output->Write(expected)); ASSERT_OK(output->Close()); // Verify we can read the object back. - ASSERT_OK_AND_ASSIGN(auto input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(auto input, fs()->OpenInputStream(path)); std::array inbuf{}; ASSERT_OK_AND_ASSIGN(auto size, input->Read(inbuf.size(), inbuf.data())); @@ -1416,7 +1501,7 @@ TEST_F(TestAzuriteFileSystem, OpenOutputStreamSmall) { TEST_F(TestAzuriteFileSystem, OpenOutputStreamLarge) { auto data = SetUpPreexistingData(); const auto path = data.ContainerPath("test-write-object"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); std::array sizes{257 * 1024, 258 * 1024, 259 * 1024}; std::array buffers{ std::string(sizes[0], 'A'), @@ -1432,7 +1517,7 @@ TEST_F(TestAzuriteFileSystem, OpenOutputStreamLarge) { ASSERT_OK(output->Close()); // Verify we can read the object back. - ASSERT_OK_AND_ASSIGN(auto input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(auto input, fs()->OpenInputStream(path)); std::string contents; std::shared_ptr buffer; @@ -1448,26 +1533,26 @@ TEST_F(TestAzuriteFileSystem, OpenOutputStreamLarge) { TEST_F(TestAzuriteFileSystem, OpenOutputStreamTruncatesExistingFile) { auto data = SetUpPreexistingData(); const auto path = data.ContainerPath("test-write-object"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); const std::string_view expected0("Existing blob content"); ASSERT_OK(output->Write(expected0)); ASSERT_OK(output->Close()); // Check that the initial content has been written - if not this test is not achieving // what it's meant to. - ASSERT_OK_AND_ASSIGN(auto input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(auto input, fs()->OpenInputStream(path)); std::array inbuf{}; ASSERT_OK_AND_ASSIGN(auto size, input->Read(inbuf.size(), inbuf.data())); EXPECT_EQ(expected0, std::string_view(inbuf.data(), size)); - ASSERT_OK_AND_ASSIGN(output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(output, fs()->OpenOutputStream(path, {})); const std::string_view expected1(PreexistingData::kLoremIpsum); ASSERT_OK(output->Write(expected1)); ASSERT_OK(output->Close()); // Verify that the initial content has been overwritten. 
- ASSERT_OK_AND_ASSIGN(input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(input, fs()->OpenInputStream(path)); ASSERT_OK_AND_ASSIGN(size, input->Read(inbuf.size(), inbuf.data())); EXPECT_EQ(expected1, std::string_view(inbuf.data(), size)); } @@ -1475,27 +1560,27 @@ TEST_F(TestAzuriteFileSystem, OpenOutputStreamTruncatesExistingFile) { TEST_F(TestAzuriteFileSystem, OpenAppendStreamDoesNotTruncateExistingFile) { auto data = SetUpPreexistingData(); const auto path = data.ContainerPath("test-write-object"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); const std::string_view expected0("Existing blob content"); ASSERT_OK(output->Write(expected0)); ASSERT_OK(output->Close()); // Check that the initial content has been written - if not this test is not achieving // what it's meant to. - ASSERT_OK_AND_ASSIGN(auto input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(auto input, fs()->OpenInputStream(path)); std::array inbuf{}; ASSERT_OK_AND_ASSIGN(auto size, input->Read(inbuf.size(), inbuf.data())); EXPECT_EQ(expected0, std::string_view(inbuf.data())); - ASSERT_OK_AND_ASSIGN(output, fs_->OpenAppendStream(path, {})); + ASSERT_OK_AND_ASSIGN(output, fs()->OpenAppendStream(path, {})); const std::string_view expected1(PreexistingData::kLoremIpsum); ASSERT_OK(output->Write(expected1)); ASSERT_OK(output->Close()); // Verify that the initial content has not been overwritten and that the block from // the other client was not committed. - ASSERT_OK_AND_ASSIGN(input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(input, fs()->OpenInputStream(path)); ASSERT_OK_AND_ASSIGN(size, input->Read(inbuf.size(), inbuf.data())); EXPECT_EQ(std::string(inbuf.data(), size), std::string(expected0) + std::string(expected1)); @@ -1504,7 +1589,7 @@ TEST_F(TestAzuriteFileSystem, OpenAppendStreamDoesNotTruncateExistingFile) { TEST_F(TestAzuriteFileSystem, OpenOutputStreamClosed) { auto data = SetUpPreexistingData(); const auto path = data.ContainerPath("open-output-stream-closed.txt"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); ASSERT_OK(output->Close()); ASSERT_RAISES(Invalid, output->Write(PreexistingData::kLoremIpsum, std::strlen(PreexistingData::kLoremIpsum))); @@ -1515,7 +1600,7 @@ TEST_F(TestAzuriteFileSystem, OpenOutputStreamClosed) { TEST_F(TestAzuriteFileSystem, OpenOutputStreamUri) { auto data = SetUpPreexistingData(); const auto path = data.ContainerPath("open-output-stream-uri.txt"); - ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfs://" + path)); + ASSERT_RAISES(Invalid, fs()->OpenInputStream("abfs://" + path)); } TEST_F(TestAzuriteFileSystem, OpenInputFileMixedReadVsReadAt) { @@ -1534,7 +1619,7 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileMixedReadVsReadAt) { UploadLines(lines, path, kLineCount * kLineWidth); std::shared_ptr file; - ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(path)); + ASSERT_OK_AND_ASSIGN(file, fs()->OpenInputFile(path)); for (int i = 0; i != 32; ++i) { SCOPED_TRACE("Iteration " + std::to_string(i)); // Verify sequential reads work as expected. 
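Editor's note: the OpenInputFile tests above build the object from fixed-width lines, so line i starts at byte i * kLineWidth and a positional read of kLineWidth bytes at that offset should reproduce lines[i] regardless of where the sequential cursor is. A small sketch of that invariant against a generic positional-read stand-in (hypothetical, not the Arrow io API):

```cpp
// Sketch only: fixed-width lines make any line addressable by offset arithmetic.
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

struct PositionalReader {
  std::string contents;
  // Positional read: does not depend on, or move, any sequential cursor.
  std::string ReadAt(int64_t position, int64_t nbytes) const {
    return contents.substr(static_cast<std::size_t>(position),
                           static_cast<std::size_t>(nbytes));
  }
};

bool CheckRandomAccessByLine(const PositionalReader& file,
                             const std::vector<std::string>& lines,
                             int64_t line_width) {
  for (std::size_t i = 0; i < lines.size(); ++i) {
    if (file.ReadAt(static_cast<int64_t>(i) * line_width, line_width) != lines[i]) {
      return false;  // line i must live at offset i * line_width
    }
  }
  return true;
}
```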
@@ -1582,7 +1667,7 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileRandomSeek) { UploadLines(lines, path, kLineCount * kLineWidth); std::shared_ptr file; - ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(path)); + ASSERT_OK_AND_ASSIGN(file, fs()->OpenInputFile(path)); for (int i = 0; i != 32; ++i) { SCOPED_TRACE("Iteration " + std::to_string(i)); // Verify sequential reads work as expected. @@ -1607,16 +1692,16 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileIoContext) { contents.length()); std::shared_ptr file; - ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(path)); - EXPECT_EQ(fs_->io_context().external_id(), file->io_context().external_id()); + ASSERT_OK_AND_ASSIGN(file, fs()->OpenInputFile(path)); + EXPECT_EQ(fs()->io_context().external_id(), file->io_context().external_id()); } TEST_F(TestAzuriteFileSystem, OpenInputFileInfo) { auto data = SetUpPreexistingData(); - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(data.ObjectPath())); std::shared_ptr file; - ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(info)); + ASSERT_OK_AND_ASSIGN(file, fs()->OpenInputFile(info)); std::array buffer{}; std::int64_t size; @@ -1629,21 +1714,21 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileInfo) { TEST_F(TestAzuriteFileSystem, OpenInputFileNotFound) { auto data = SetUpPreexistingData(); - ASSERT_RAISES(IOError, fs_->OpenInputFile(data.NotFoundObjectPath())); + ASSERT_RAISES(IOError, fs()->OpenInputFile(data.NotFoundObjectPath())); } TEST_F(TestAzuriteFileSystem, OpenInputFileInfoInvalid) { auto data = SetUpPreexistingData(); - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(data.container_name)); - ASSERT_RAISES(IOError, fs_->OpenInputFile(info)); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(data.container_name)); + ASSERT_RAISES(IOError, fs()->OpenInputFile(info)); - ASSERT_OK_AND_ASSIGN(auto info2, fs_->GetFileInfo(data.NotFoundObjectPath())); - ASSERT_RAISES(IOError, fs_->OpenInputFile(info2)); + ASSERT_OK_AND_ASSIGN(auto info2, fs()->GetFileInfo(data.NotFoundObjectPath())); + ASSERT_RAISES(IOError, fs()->OpenInputFile(info2)); } TEST_F(TestAzuriteFileSystem, OpenInputFileClosed) { auto data = SetUpPreexistingData(); - ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputFile(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputFile(data.ObjectPath())); ASSERT_OK(stream->Close()); std::array buffer{}; ASSERT_RAISES(Invalid, stream->Tell()); @@ -1654,6 +1739,5 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileClosed) { ASSERT_RAISES(Invalid, stream->Seek(2)); } -} // namespace } // namespace fs } // namespace arrow diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index ac740f92c8527..e8b8784e7a314 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -1077,7 +1077,8 @@ std::shared_ptr MakeListType( template void CheckListCast(const ScalarType& scalar, const std::shared_ptr& to_type) { - EXPECT_OK_AND_ASSIGN(auto cast_scalar, scalar.CastTo(to_type)); + EXPECT_OK_AND_ASSIGN(auto cast_scalar_datum, Cast(scalar, to_type)); + const auto& cast_scalar = cast_scalar_datum.scalar(); ASSERT_OK(cast_scalar->ValidateFull()); ASSERT_EQ(*cast_scalar->type, *to_type); @@ -1087,11 +1088,25 @@ void CheckListCast(const ScalarType& scalar, const std::shared_ptr& to *checked_cast(*cast_scalar).value); } -void CheckInvalidListCast(const Scalar& scalar, const std::shared_ptr& to_type, - const std::string& expected_message) { - EXPECT_RAISES_WITH_CODE_AND_MESSAGE_THAT(StatusCode::Invalid, - 
::testing::HasSubstr(expected_message), - scalar.CastTo(to_type)); +template +void CheckListCastError(const ScalarType& scalar, + const std::shared_ptr& to_type) { + StatusCode code; + std::string expected_message; + if (scalar.type->id() == Type::FIXED_SIZE_LIST) { + code = StatusCode::TypeError; + expected_message = + "Size of FixedSizeList is not the same. input list: " + scalar.type->ToString() + + " output list: " + to_type->ToString(); + } else { + code = StatusCode::Invalid; + expected_message = + "ListType can only be casted to FixedSizeListType if the lists are all the " + "expected size."; + } + + EXPECT_RAISES_WITH_CODE_AND_MESSAGE_THAT(code, ::testing::HasSubstr(expected_message), + Cast(scalar, to_type)); } template @@ -1178,10 +1193,8 @@ class TestListLikeScalar : public ::testing::Test { CheckListCast( scalar, fixed_size_list(value_->type(), static_cast(value_->length()))); - CheckInvalidListCast(scalar, fixed_size_list(value_->type(), 5), - "Cannot cast " + scalar.type->ToString() + " of length " + - std::to_string(value_->length()) + - " to fixed size list of length 5"); + auto invalid_cast_type = fixed_size_list(value_->type(), 5); + CheckListCastError(scalar, invalid_cast_type); } protected: @@ -1238,10 +1251,8 @@ TEST(TestMapScalar, Cast) { CheckListCast(scalar, large_list(key_value_type)); CheckListCast(scalar, fixed_size_list(key_value_type, 2)); - CheckInvalidListCast(scalar, fixed_size_list(key_value_type, 5), - "Cannot cast " + scalar.type->ToString() + " of length " + - std::to_string(value->length()) + - " to fixed size list of length 5"); + auto invalid_cast_type = fixed_size_list(key_value_type, 5); + CheckListCastError(scalar, invalid_cast_type); } TEST(TestStructScalar, FieldAccess) { diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 9ad1ee6efc12a..840efa12cc3c1 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1080,9 +1080,7 @@ inline int DecodePlain(const uint8_t* data, int64_t data_size ParquetException::EofException(); } for (int i = 0; i < num_values; ++i) { - out[i].ptr = data; - data += type_length; - data_size -= type_length; + out[i].ptr = data + i * type_length; } return static_cast(bytes_to_decode); } diff --git a/csharp/src/Apache.Arrow.Flight/Client/FlightClient.cs b/csharp/src/Apache.Arrow.Flight/Client/FlightClient.cs index 5dc0d1b434b6d..a7c459935c240 100644 --- a/csharp/src/Apache.Arrow.Flight/Client/FlightClient.cs +++ b/csharp/src/Apache.Arrow.Flight/Client/FlightClient.cs @@ -16,10 +16,8 @@ using System.Threading.Tasks; using Apache.Arrow.Flight.Internal; using Apache.Arrow.Flight.Protocol; -using Apache.Arrow.Flight.Server; using Apache.Arrow.Flight.Server.Internal; using Grpc.Core; -using Grpc.Net.Client; namespace Apache.Arrow.Flight.Client { @@ -29,7 +27,7 @@ public class FlightClient private readonly FlightService.FlightServiceClient _client; - public FlightClient(GrpcChannel grpcChannel) + public FlightClient(ChannelBase grpcChannel) { _client = new FlightService.FlightServiceClient(grpcChannel); } diff --git a/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs b/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs index 463ca49e29c94..698d74e4bac84 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs @@ -14,6 +14,7 @@ // limitations under the License. 
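Editor's note: the parquet change above (in DecodePlain for FIXED_LEN_BYTE_ARRAY) stops mutating the data cursor inside the loop and instead derives each element pointer from the base, which is safe because the total byte count is validated once before the loop. A self-contained sketch of the resulting shape, with hypothetical types rather than the parquet-cpp signatures:

```cpp
// Sketch only: plain-decode fixed-length byte arrays by indexing from the base
// pointer after a single up-front bounds check.
#include <cstdint>
#include <stdexcept>

struct ByteArrayRef { const uint8_t* ptr; };

int DecodeFixedLenPlain(const uint8_t* data, int64_t data_size, int num_values,
                        int type_length, ByteArrayRef* out) {
  const int64_t bytes_to_decode = int64_t{type_length} * num_values;
  if (bytes_to_decode > data_size) {
    throw std::runtime_error("EOF: not enough bytes for the requested values");
  }
  for (int i = 0; i < num_values; ++i) {
    out[i].ptr = data + i * int64_t{type_length};  // no cursor mutation needed
  }
  return static_cast<int>(bytes_to_decode);
}
```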
using Apache.Arrow.Memory; +using Apache.Arrow.Scalars; using Apache.Arrow.Types; using System; using System.Collections.Generic; @@ -46,8 +47,11 @@ private class ArrayDataConcatenationVisitor : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -84,17 +88,50 @@ public void Visit(FixedWidthType type) { CheckData(type, 2); ArrowBuffer validityBuffer = ConcatenateValidityBuffer(); - ArrowBuffer valueBuffer = ConcatenateFixedWidthTypeValueBuffer(type); + ArrowBuffer valueBuffer = ConcatenateFixedWidthTypeValueBuffer(1, type); Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, valueBuffer }); } public void Visit(BinaryType type) => ConcatenateVariableBinaryArrayData(type); + public void Visit(BinaryViewType type) => ConcatenateBinaryViewArrayData(type); + public void Visit(StringType type) => ConcatenateVariableBinaryArrayData(type); + public void Visit(StringViewType type) => ConcatenateBinaryViewArrayData(type); + public void Visit(ListType type) => ConcatenateLists(type); + public void Visit(ListViewType type) + { + CheckData(type, 3); + ArrowBuffer validityBuffer = ConcatenateValidityBuffer(); + + var offsetsBuilder = new ArrowBuffer.Builder(_totalLength); + int baseOffset = 0; + + foreach (ArrayData arrayData in _arrayDataList) + { + if (arrayData.Length > 0) + { + ReadOnlySpan span = arrayData.Buffers[1].Span.CastTo().Slice(0, arrayData.Length); + foreach (int offset in span) + { + offsetsBuilder.Append(baseOffset + offset); + } + } + + baseOffset += arrayData.Children[0].Length; + } + + ArrowBuffer offsetBuffer = offsetsBuilder.Build(_allocator); + ArrowBuffer sizesBuffer = ConcatenateFixedWidthTypeValueBuffer(2, Int32Type.Default); + ArrayData child = Concatenate(SelectChildren(0), _allocator); + + Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, offsetBuffer, sizesBuffer }, new[] { child }); + } + public void Visit(FixedSizeListType type) { CheckData(type, 1); @@ -161,6 +198,15 @@ private void CheckData(IArrowType type, int expectedBufferCount) } } + private void CheckDataVariadicCount(IArrowType type, int expectedBufferCount) + { + foreach (ArrayData arrayData in _arrayDataList) + { + arrayData.EnsureDataType(type.TypeId); + arrayData.EnsureVariadicBufferCount(expectedBufferCount); + } + } + private void ConcatenateVariableBinaryArrayData(IArrowType type) { CheckData(type, 3); @@ -171,6 +217,26 @@ private void ConcatenateVariableBinaryArrayData(IArrowType type) Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, offsetBuffer, valueBuffer }); } + private void ConcatenateBinaryViewArrayData(IArrowType type) + { + CheckDataVariadicCount(type, 2); + ArrowBuffer validityBuffer = ConcatenateValidityBuffer(); + ArrowBuffer viewBuffer = ConcatenateViewBuffer(out int variadicBufferCount); + ArrowBuffer[] buffers = new ArrowBuffer[2 + variadicBufferCount]; + buffers[0] = validityBuffer; + buffers[1] = viewBuffer; + int index = 2; + foreach (ArrayData arrayData in _arrayDataList) + { + for (int i = 2; i < arrayData.Buffers.Length; i++) + { + buffers[index++] = arrayData.Buffers[i]; + } + } + + Result = new ArrayData(type, _totalLength, _totalNullCount, 0, buffers); + } + private void ConcatenateLists(NestedType type) { CheckData(type, 2); @@ -206,7 +272,7 @@ private ArrowBuffer 
ConcatenateBitmapBuffer(int bufferIndex) return builder.Build(_allocator); } - private ArrowBuffer ConcatenateFixedWidthTypeValueBuffer(FixedWidthType type) + private ArrowBuffer ConcatenateFixedWidthTypeValueBuffer(int bufferIndex, FixedWidthType type) { int typeByteWidth = type.BitWidth / 8; var builder = new ArrowBuffer.Builder(_totalLength * typeByteWidth); @@ -216,7 +282,7 @@ private ArrowBuffer ConcatenateFixedWidthTypeValueBuffer(FixedWidthType type) int length = arrayData.Length; int byteLength = length * typeByteWidth; - builder.Append(arrayData.Buffers[1].Span.Slice(0, byteLength)); + builder.Append(arrayData.Buffers[bufferIndex].Span.Slice(0, byteLength)); } return builder.Build(_allocator); @@ -265,6 +331,36 @@ private ArrowBuffer ConcatenateOffsetBuffer() return builder.Build(_allocator); } + private ArrowBuffer ConcatenateViewBuffer(out int variadicBufferCount) + { + var builder = new ArrowBuffer.Builder(_totalLength); + variadicBufferCount = 0; + foreach (ArrayData arrayData in _arrayDataList) + { + if (arrayData.Length == 0) + { + continue; + } + + ReadOnlySpan span = arrayData.Buffers[1].Span.CastTo().Slice(0, arrayData.Length); + foreach (BinaryView view in span) + { + if (view.Length > BinaryView.MaxInlineLength) + { + builder.Append(view.AdjustBufferIndex(variadicBufferCount)); + } + else + { + builder.Append(view); + } + } + + variadicBufferCount += (arrayData.Buffers.Length - 2); + } + + return builder.Build(_allocator); + } + private ArrowBuffer ConcatenateUnionTypeBuffer() { var builder = new ArrowBuffer.Builder(_totalLength); diff --git a/csharp/src/Apache.Arrow/Arrays/ArrowArrayBuilderFactory.cs b/csharp/src/Apache.Arrow/Arrays/ArrowArrayBuilderFactory.cs index af5a524798396..f8367102082f5 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrowArrayBuilderFactory.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrowArrayBuilderFactory.cs @@ -54,8 +54,12 @@ internal static IArrowArrayBuilder> return new DoubleArray.Builder(); case ArrowTypeId.String: return new StringArray.Builder(); + case ArrowTypeId.StringView: + return new StringViewArray.Builder(); case ArrowTypeId.Binary: return new BinaryArray.Builder(); + case ArrowTypeId.BinaryView: + return new BinaryViewArray.Builder(); case ArrowTypeId.Timestamp: return new TimestampArray.Builder(); case ArrowTypeId.Date64: @@ -70,6 +74,8 @@ internal static IArrowArrayBuilder> return new DurationArray.Builder(dataType as DurationType); case ArrowTypeId.List: return new ListArray.Builder(dataType as ListType); + case ArrowTypeId.ListView: + return new ListViewArray.Builder(dataType as ListViewType); case ArrowTypeId.FixedSizeList: return new FixedSizeListArray.Builder(dataType as FixedSizeListType); case ArrowTypeId.Decimal128: diff --git a/csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs b/csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs index d6577260bb82d..3d2ab1d2129f1 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs @@ -51,14 +51,20 @@ public static IArrowArray BuildArray(ArrayData data) return new DoubleArray(data); case ArrowTypeId.String: return new StringArray(data); + case ArrowTypeId.StringView: + return new StringViewArray(data); case ArrowTypeId.FixedSizedBinary: return new FixedSizeBinaryArray(data); case ArrowTypeId.Binary: return new BinaryArray(data); + case ArrowTypeId.BinaryView: + return new BinaryViewArray(data); case ArrowTypeId.Timestamp: return new TimestampArray(data); case ArrowTypeId.List: return new ListArray(data); + case 
ArrowTypeId.ListView: + return new ListViewArray(data); case ArrowTypeId.Map: return new MapArray(data); case ArrowTypeId.Struct: diff --git a/csharp/src/Apache.Arrow/Arrays/BinaryViewArray.cs b/csharp/src/Apache.Arrow/Arrays/BinaryViewArray.cs new file mode 100644 index 0000000000000..4f62dffd1ddeb --- /dev/null +++ b/csharp/src/Apache.Arrow/Arrays/BinaryViewArray.cs @@ -0,0 +1,344 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using Apache.Arrow.Memory; +using Apache.Arrow.Scalars; +using Apache.Arrow.Types; +using System; +using System.Collections.Generic; +using System.Runtime.CompilerServices; +using System.Collections; + +namespace Apache.Arrow +{ + public class BinaryViewArray : Array, IReadOnlyList + { + public class Builder : BuilderBase + { + public Builder() : base(BinaryViewType.Default) { } + public Builder(IArrowType dataType) : base(dataType) { } + + protected override BinaryViewArray Build(ArrayData data) + { + return new BinaryViewArray(data); + } + } + + public BinaryViewArray(ArrayData data) + : base(data) + { + data.EnsureDataType(ArrowTypeId.BinaryView); + data.EnsureVariadicBufferCount(2); + } + + public BinaryViewArray(ArrowTypeId typeId, ArrayData data) + : base(data) + { + data.EnsureDataType(typeId); + data.EnsureVariadicBufferCount(2); + } + + public abstract class BuilderBase : IArrowArrayBuilder + where TArray : IArrowArray + where TBuilder : class, IArrowArrayBuilder + { + protected IArrowType DataType { get; } + protected TBuilder Instance => this as TBuilder; + protected ArrowBuffer.Builder BinaryViews { get; } + protected ArrowBuffer.Builder ValueBuffer { get; } + protected ArrowBuffer.BitmapBuilder ValidityBuffer { get; } + protected int NullCount => this.ValidityBuffer.UnsetBitCount; + + protected BuilderBase(IArrowType dataType) + { + DataType = dataType; + BinaryViews = new ArrowBuffer.Builder(); + ValueBuffer = new ArrowBuffer.Builder(); + ValidityBuffer = new ArrowBuffer.BitmapBuilder(); + } + + protected abstract TArray Build(ArrayData data); + + /// + /// Gets the length of the array built so far. + /// + public int Length => BinaryViews.Length; + + /// + /// Build an Arrow array from the appended contents so far. + /// + /// Optional memory allocator. + /// Returns an array of type . + public TArray Build(MemoryAllocator allocator = default) + { + bool hasValues = ValueBuffer.Length > 0; + var bufs = new ArrowBuffer[hasValues ? 3 : 2]; + bufs[0] = NullCount > 0 ? ValidityBuffer.Build(allocator) : ArrowBuffer.Empty; + bufs[1] = BinaryViews.Build(allocator); + if (hasValues) { bufs[2] = ValueBuffer.Build(allocator); } + + var data = new ArrayData( + DataType, + length: Length, + NullCount, + offset: 0, + bufs); + + return Build(data); + } + + /// + /// Append a single null value to the array. 
+ /// + /// Returns the builder (for fluent-style composition). + public TBuilder AppendNull() + { + // Do not add to the value buffer in the case of a null. + // Note that we do not need to increment the offset as a result. + ValidityBuffer.Append(false); + BinaryViews.Append(default(BinaryView)); + return Instance; + } + + /// + /// Appends a value, consisting of a single byte, to the array. + /// + /// Byte value to append. + /// Returns the builder (for fluent-style composition). + public TBuilder Append(byte value) + { + ValidityBuffer.Append(true); + Span buf = stackalloc[] { value }; + BinaryViews.Append(new BinaryView(buf)); + return Instance; + } + + /// + /// Append a value, consisting of a span of bytes, to the array. + /// + /// + /// Note that a single value is added, which consists of arbitrarily many bytes. If multiple values are + /// to be added, use the method. + /// + /// Span of bytes to add. + /// Returns the builder (for fluent-style composition). + public TBuilder Append(ReadOnlySpan span) + { + if (span.Length > BinaryView.MaxInlineLength) + { + int offset = ValueBuffer.Length; + ValueBuffer.Append(span); + BinaryViews.Append(new BinaryView(span.Length, span.Slice(0, 4), 0, offset)); + } + else + { + BinaryViews.Append(new BinaryView(span)); + } + ValidityBuffer.Append(true); + return Instance; + } + + /// + /// Append an enumerable collection of single-byte values to the array. + /// + /// + /// Note that this method appends multiple values, each of which is a single byte + /// + /// Single-byte values to add. + /// Returns the builder (for fluent-style composition). + public TBuilder AppendRange(IEnumerable values) + { + if (values == null) + { + throw new ArgumentNullException(nameof(values)); + } + + foreach (byte b in values) + { + Append(b); + } + + return Instance; + } + + /// + /// Append an enumerable collection of values to the array. + /// + /// Values to add. + /// Returns the builder (for fluent-style composition). + public TBuilder AppendRange(IEnumerable values) + { + if (values == null) + { + throw new ArgumentNullException(nameof(values)); + } + + foreach (byte[] arr in values) + { + if (arr == null) + { + AppendNull(); + } + else + { + Append((ReadOnlySpan)arr); + } + } + + return Instance; + } + + public TBuilder Reserve(int capacity) + { + // TODO: [ARROW-9366] Reserve capacity in the value buffer in a more sensible way. + BinaryViews.Reserve(capacity); + ValueBuffer.Reserve(capacity); + ValidityBuffer.Reserve(capacity); + return Instance; + } + + public TBuilder Resize(int length) + { + // TODO: [ARROW-9366] Resize the value buffer to a safe length based on offsets, not `length`. + BinaryViews.Resize(length); + ValueBuffer.Resize(length); + ValidityBuffer.Resize(length); + return Instance; + } + + public TBuilder Swap(int i, int j) + { + ValidityBuffer.Swap(i, j); + BinaryView view = BinaryViews.Span[i]; + BinaryViews.Span[i] = BinaryViews.Span[j]; + BinaryViews.Span[j] = view; + return Instance; + } + + public TBuilder Set(int index, byte value) + { + // TODO: Implement + throw new NotImplementedException(); + } + + /// + /// Clear all contents appended so far. + /// + /// Returns the builder (for fluent-style composition). 
+ public TBuilder Clear() + { + BinaryViews.Clear(); + ValueBuffer.Clear(); + ValidityBuffer.Clear(); + return Instance; + } + } + + public BinaryViewArray(IArrowType dataType, int length, + ArrowBuffer binaryViewsBuffer, + ArrowBuffer dataBuffer, + ArrowBuffer nullBitmapBuffer, + int nullCount = 0, int offset = 0) + : this(new ArrayData(dataType, length, nullCount, offset, + new[] { nullBitmapBuffer, binaryViewsBuffer, dataBuffer })) + { } + + public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); + + public ArrowBuffer ViewsBuffer => Data.Buffers[1]; + + public int DataBufferCount => Data.Buffers.Length - 2; + + public ArrowBuffer DataBuffer(int index) => Data.Buffers[index + 2]; + + public ReadOnlySpan Views => ViewsBuffer.Span.CastTo().Slice(Offset, Length); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetValueLength(int index) + { + if (index < 0 || index >= Length) + { + throw new ArgumentOutOfRangeException(nameof(index)); + } + if (!IsValid(index)) + { + return 0; + } + + return Views[index].Length; + } + + /// + /// Get the collection of bytes, as a read-only span, at a given index in the array. + /// + /// + /// Note that this method cannot reliably identify null values, which are indistinguishable from empty byte + /// collection values when seen in the context of this method's return type of . + /// Use the method or the overload instead + /// to reliably determine null values. + /// + /// Index at which to get bytes. + /// Returns a object. + /// If the index is negative or beyond the length of the array. + /// + public ReadOnlySpan GetBytes(int index) => GetBytes(index, out _); + + /// + /// Get the collection of bytes, as a read-only span, at a given index in the array. + /// + /// Index at which to get bytes. + /// Set to if the value at the given index is null. + /// Returns a object. + /// If the index is negative or beyond the length of the array. + /// + public ReadOnlySpan GetBytes(int index, out bool isNull) + { + if (index < 0 || index >= Length) + { + throw new ArgumentOutOfRangeException(nameof(index)); + } + + isNull = IsNull(index); + + if (isNull) + { + // Note that `return null;` is valid syntax, but would be misleading as `null` in the context of a span + // is actually returned as an empty span. + return ReadOnlySpan.Empty; + } + + BinaryView binaryView = Views[index]; + if (binaryView.IsInline) + { + return ViewsBuffer.Span.Slice(16 * index + 4, binaryView.Length); + } + + return DataBuffer(binaryView._bufferIndex).Span.Slice(binaryView._bufferOffset, binaryView.Length); + } + + int IReadOnlyCollection.Count => Length; + byte[] IReadOnlyList.this[int index] => GetBytes(index).ToArray(); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetBytes(index).ToArray(); + } + } + + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); + } +} diff --git a/csharp/src/Apache.Arrow/Arrays/ListViewArray.cs b/csharp/src/Apache.Arrow/Arrays/ListViewArray.cs new file mode 100644 index 0000000000000..081385d9211a4 --- /dev/null +++ b/csharp/src/Apache.Arrow/Arrays/ListViewArray.cs @@ -0,0 +1,217 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. 
+// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using Apache.Arrow.Memory; +using Apache.Arrow.Types; + +namespace Apache.Arrow +{ + public class ListViewArray : Array + { + public class Builder : IArrowArrayBuilder + { + public IArrowArrayBuilder> ValueBuilder { get; } + + public int Length => ValueOffsetsBufferBuilder.Length; + + private ArrowBuffer.Builder ValueOffsetsBufferBuilder { get; } + + private ArrowBuffer.Builder SizesBufferBuilder { get; } + + private ArrowBuffer.BitmapBuilder ValidityBufferBuilder { get; } + + public int NullCount { get; protected set; } + + private IArrowType DataType { get; } + + private int Start { get; set; } + + public Builder(IArrowType valueDataType) : this(new ListViewType(valueDataType)) + { + } + + public Builder(Field valueField) : this(new ListViewType(valueField)) + { + } + + internal Builder(ListViewType dataType) + { + ValueBuilder = ArrowArrayBuilderFactory.Build(dataType.ValueDataType); + ValueOffsetsBufferBuilder = new ArrowBuffer.Builder(); + SizesBufferBuilder = new ArrowBuffer.Builder(); + ValidityBufferBuilder = new ArrowBuffer.BitmapBuilder(); + DataType = dataType; + Start = -1; + } + + /// + /// Start a new variable-length list slot + /// + /// This function should be called before beginning to append elements to the + /// value builder. TODO: Consider adding builder APIs to support construction + /// of overlapping lists. + /// + public Builder Append() + { + AppendPrevious(); + + ValidityBufferBuilder.Append(true); + + return this; + } + + public Builder AppendNull() + { + AppendPrevious(); + + ValidityBufferBuilder.Append(false); + ValueOffsetsBufferBuilder.Append(Start); + SizesBufferBuilder.Append(0); + NullCount++; + Start = -1; + + return this; + } + + private void AppendPrevious() + { + if (Start >= 0) + { + ValueOffsetsBufferBuilder.Append(Start); + SizesBufferBuilder.Append(ValueBuilder.Length - Start); + } + Start = ValueBuilder.Length; + } + + public ListViewArray Build(MemoryAllocator allocator = default) + { + AppendPrevious(); + + ArrowBuffer validityBuffer = NullCount > 0 + ? 
ValidityBufferBuilder.Build(allocator) + : ArrowBuffer.Empty; + + return new ListViewArray(DataType, Length, + ValueOffsetsBufferBuilder.Build(allocator), SizesBufferBuilder.Build(allocator), + ValueBuilder.Build(allocator), + validityBuffer, NullCount, 0); + } + + public Builder Reserve(int capacity) + { + ValueOffsetsBufferBuilder.Reserve(capacity); + SizesBufferBuilder.Reserve(capacity); + ValidityBufferBuilder.Reserve(capacity); + return this; + } + + public Builder Resize(int length) + { + ValueOffsetsBufferBuilder.Resize(length); + SizesBufferBuilder.Resize(length); + ValidityBufferBuilder.Resize(length); + return this; + } + + public Builder Clear() + { + ValueOffsetsBufferBuilder.Clear(); + SizesBufferBuilder.Clear(); + ValueBuilder.Clear(); + ValidityBufferBuilder.Clear(); + return this; + } + + } + + public IArrowArray Values { get; } + + public ArrowBuffer ValueOffsetsBuffer => Data.Buffers[1]; + + public ReadOnlySpan ValueOffsets => ValueOffsetsBuffer.Span.CastTo().Slice(Offset, Length); + + public ArrowBuffer SizesBuffer => Data.Buffers[2]; + + public ReadOnlySpan Sizes => SizesBuffer.Span.CastTo().Slice(Offset, Length); + + public ListViewArray(IArrowType dataType, int length, + ArrowBuffer valueOffsetsBuffer, ArrowBuffer sizesBuffer, IArrowArray values, + ArrowBuffer nullBitmapBuffer, int nullCount = 0, int offset = 0) + : this(new ArrayData(dataType, length, nullCount, offset, + new[] { nullBitmapBuffer, valueOffsetsBuffer, sizesBuffer }, new[] { values.Data }), + values) + { + } + + public ListViewArray(ArrayData data) + : this(data, ArrowArrayFactory.BuildArray(data.Children[0])) + { + } + + private ListViewArray(ArrayData data, IArrowArray values) : base(data) + { + data.EnsureBufferCount(3); + data.EnsureDataType(ArrowTypeId.ListView); + Values = values; + } + + public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); + + public int GetValueLength(int index) + { + if (index < 0 || index >= Length) + { + throw new ArgumentOutOfRangeException(nameof(index)); + } + + if (IsNull(index)) + { + return 0; + } + + return Sizes[index]; + } + + public IArrowArray GetSlicedValues(int index) + { + if (index < 0 || index >= Length) + { + throw new ArgumentOutOfRangeException(nameof(index)); + } + + if (IsNull(index)) + { + return null; + } + + if (!(Values is Array array)) + { + return default; + } + + return array.Slice(ValueOffsets[index], GetValueLength(index)); + } + + protected override void Dispose(bool disposing) + { + if (disposing) + { + Values?.Dispose(); + } + base.Dispose(disposing); + } + } +} diff --git a/csharp/src/Apache.Arrow/Arrays/StringViewArray.cs b/csharp/src/Apache.Arrow/Arrays/StringViewArray.cs new file mode 100644 index 0000000000000..88644761535d9 --- /dev/null +++ b/csharp/src/Apache.Arrow/Arrays/StringViewArray.cs @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
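A short sketch of the ListView builder flow above: Append() opens a new list slot, child values go to ValueBuilder, and AppendNull() records a null slot with size zero. The cast of ValueBuilder to Int64Array.Builder is an assumption about the builder that ArrowArrayBuilderFactory returns for an Int64 child; that factory is not shown in this hunk.

using System;
using Apache.Arrow;
using Apache.Arrow.Types;

// Build the logical value [[1, 2], null, [3]].
var builder = new ListViewArray.Builder(Int64Type.Default);
var values = (Int64Array.Builder)builder.ValueBuilder;   // assumed concrete child builder

builder.Append();            // open slot 0
values.Append(1).Append(2);

builder.AppendNull();        // slot 1: null, size 0

builder.Append();            // open slot 2
values.Append(3);

ListViewArray array = builder.Build();

Console.WriteLine(array.GetValueLength(0));   // 2
Console.WriteLine(array.IsNull(1));           // True
var last = (Int64Array)array.GetSlicedValues(2);
Console.WriteLine(last.GetValue(0));          // 3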
+// See the License for the specific language governing permissions and +// limitations under the License. + +using Apache.Arrow.Types; +using System; +using System.Collections; +using System.Collections.Generic; +using System.Runtime.InteropServices; +using System.Text; + +namespace Apache.Arrow +{ + public class StringViewArray: BinaryViewArray, IReadOnlyList + { + public static readonly Encoding DefaultEncoding = Encoding.UTF8; + + public new class Builder : BuilderBase + { + public Builder() : base(StringViewType.Default) { } + + protected override StringViewArray Build(ArrayData data) + { + return new StringViewArray(data); + } + + public Builder Append(string value, Encoding encoding = null) + { + if (value == null) + { + return AppendNull(); + } + encoding = encoding ?? DefaultEncoding; + byte[] span = encoding.GetBytes(value); + return Append(span.AsSpan()); + } + + public Builder AppendRange(IEnumerable values, Encoding encoding = null) + { + foreach (string value in values) + { + Append(value, encoding); + } + + return this; + } + } + + public StringViewArray(ArrayData data) + : base(ArrowTypeId.StringView, data) { } + + public StringViewArray(int length, + ArrowBuffer valueOffsetsBuffer, + ArrowBuffer dataBuffer, + ArrowBuffer nullBitmapBuffer, + int nullCount = 0, int offset = 0) + : this(new ArrayData(StringViewType.Default, length, nullCount, offset, + new[] { nullBitmapBuffer, valueOffsetsBuffer, dataBuffer })) + { } + + public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); + + public string GetString(int index, Encoding encoding = default) + { + encoding ??= DefaultEncoding; + + ReadOnlySpan bytes = GetBytes(index, out bool isNull); + + if (isNull) + { + return null; + } + if (bytes.Length == 0) + { + return string.Empty; + } + + unsafe + { + fixed (byte* data = &MemoryMarshal.GetReference(bytes)) + return encoding.GetString(data, bytes.Length); + } + } + + int IReadOnlyCollection.Count => Length; + + string IReadOnlyList.this[int index] => GetString(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetString(index); + }; + } + + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); + } +} diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs index 2d9febea33f54..03059eaf5d4df 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs @@ -15,10 +15,12 @@ using System; +using System.Buffers; using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using Apache.Arrow.Memory; +using Apache.Arrow.Types; namespace Apache.Arrow.C { @@ -121,7 +123,16 @@ private unsafe static void ConvertArray(ExportedAllocationOwner sharedOwner, Arr cArray->buffers = null; if (cArray->n_buffers > 0) { - cArray->buffers = (byte**)sharedOwner.Allocate(array.Buffers.Length * IntPtr.Size); + long* lengths = null; + int bufferCount = array.Buffers.Length; + if (array.DataType.TypeId == ArrowTypeId.BinaryView || array.DataType.TypeId == ArrowTypeId.StringView) + { + lengths = (long*)sharedOwner.Allocate(8 * bufferCount); // overallocation to avoid edge case + bufferCount++; + cArray->n_buffers++; + } + + cArray->buffers = (byte**)sharedOwner.Allocate(bufferCount * IntPtr.Size); for (int i = 0; i < array.Buffers.Length; i++) { ArrowBuffer buffer = array.Buffers[i]; @@ -131,6 +142,15 @@ private unsafe static void 
ConvertArray(ExportedAllocationOwner sharedOwner, Arr throw new NotSupportedException($"An ArrowArray of type {array.DataType.TypeId} could not be exported: failed on buffer #{i}"); } cArray->buffers[i] = (byte*)ptr; + if (lengths != null && i >= 2) + { + lengths[i - 2] = array.Buffers[i].Length; + } + } + + if (lengths != null) + { + cArray->buffers[array.Buffers.Length] = (byte*)lengths; } } diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs index 1b40ec49658bb..fbb2be661fc5d 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs @@ -157,10 +157,18 @@ private ArrayData GetAsArrayData(CArrowArray* cArray, IArrowType type) case ArrowTypeId.Binary: buffers = ImportByteArrayBuffers(cArray); break; + case ArrowTypeId.StringView: + case ArrowTypeId.BinaryView: + buffers = ImportByteArrayViewBuffers(cArray); + break; case ArrowTypeId.List: children = ProcessListChildren(cArray, ((ListType)type).ValueDataType); buffers = ImportListBuffers(cArray); break; + case ArrowTypeId.ListView: + children = ProcessListChildren(cArray, ((ListViewType)type).ValueDataType); + buffers = ImportListViewBuffers(cArray); + break; case ArrowTypeId.FixedSizeList: children = ProcessListChildren(cArray, ((FixedSizeListType)type).ValueDataType); buffers = ImportFixedSizeListBuffers(cArray); @@ -268,6 +276,28 @@ private ArrowBuffer[] ImportByteArrayBuffers(CArrowArray* cArray) return buffers; } + private ArrowBuffer[] ImportByteArrayViewBuffers(CArrowArray* cArray) + { + if (cArray->n_buffers < 3) + { + throw new InvalidOperationException("Byte array views are expected to have at least three buffers"); + } + + int length = checked((int)cArray->length); + int viewsLength = length * 16; + + long* bufferLengths = (long*)cArray->buffers[cArray->n_buffers - 1]; + ArrowBuffer[] buffers = new ArrowBuffer[cArray->n_buffers - 1]; + buffers[0] = ImportValidityBuffer(cArray); + buffers[1] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[1], 0, viewsLength)); + for (int i = 2; i < buffers.Length; i++) + { + buffers[i] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[i], 0, checked((int)bufferLengths[i - 2]))); + } + + return buffers; + } + private ArrowBuffer[] ImportListBuffers(CArrowArray* cArray) { if (cArray->n_buffers != 2) @@ -285,6 +315,24 @@ private ArrowBuffer[] ImportListBuffers(CArrowArray* cArray) return buffers; } + private ArrowBuffer[] ImportListViewBuffers(CArrowArray* cArray) + { + if (cArray->n_buffers != 3) + { + throw new InvalidOperationException("List view arrays are expected to have exactly three buffers"); + } + + int length = checked((int)cArray->length); + int offsetsLength = length * 4; + + ArrowBuffer[] buffers = new ArrowBuffer[3]; + buffers[0] = ImportValidityBuffer(cArray); + buffers[1] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[1], 0, offsetsLength)); + buffers[2] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[2], 0, offsetsLength)); + + return buffers; + } + private ArrowBuffer[] ImportFixedSizeListBuffers(CArrowArray* cArray) { if (cArray->n_buffers != 1) diff --git a/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs b/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs index c9b45a8eb2d87..3bb7134af3ba9 100644 --- a/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs @@ -167,7 +167,9 @@ private static string GetFormat(IArrowType datatype) return $"d:{decimalType.Precision},{decimalType.Scale},256"; // 
Binary case BinaryType _: return "z"; + case BinaryViewType _: return "vz"; case StringType _: return "u"; + case StringViewType _: return "vu"; case FixedSizeBinaryType binaryType: return $"w:{binaryType.ByteWidth}"; // Date @@ -196,6 +198,7 @@ private static string GetFormat(IArrowType datatype) }; // Nested case ListType _: return "+l"; + case ListViewType _: return "+vl"; case FixedSizeListType fixedListType: return $"+w:{fixedListType.ListSize}"; case StructType _: return "+s"; diff --git a/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs b/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs index 9c81195771bae..f1acc007bcef7 100644 --- a/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs @@ -165,7 +165,7 @@ public ArrowType GetAsType() } // Special handling for nested types - if (format == "+l") + if (format == "+l" || format == "+vl") { if (_cSchema->n_children != 1) { @@ -180,7 +180,7 @@ public ArrowType GetAsType() Field childField = childSchema.GetAsField(); - return new ListType(childField); + return format[1] == 'v' ? new ListViewType(childField) : new ListType(childField); } else if (format == "+s") { @@ -303,8 +303,10 @@ public ArrowType GetAsType() "g" => DoubleType.Default, // Binary data "z" => BinaryType.Default, + "vz" => BinaryViewType.Default, //"Z" => new LargeBinaryType() // Not yet implemented "u" => StringType.Default, + "vu" => StringViewType.Default, //"U" => new LargeStringType(), // Not yet implemented // Date and time "tdD" => Date32Type.Default, diff --git a/csharp/src/Apache.Arrow/Extensions/ArrayDataExtensions.cs b/csharp/src/Apache.Arrow/Extensions/ArrayDataExtensions.cs index 399d9bf5e6bf1..2b6742a3d0cb2 100644 --- a/csharp/src/Apache.Arrow/Extensions/ArrayDataExtensions.cs +++ b/csharp/src/Apache.Arrow/Extensions/ArrayDataExtensions.cs @@ -23,6 +23,17 @@ internal static class ArrayDataExtensions public static void EnsureBufferCount(this ArrayData data, int count) { if (data.Buffers.Length != count) + { + // TODO: Use localizable string resource + throw new ArgumentException( + $"Buffer count <{data.Buffers.Length}> must be at exactly <{count}>", + nameof(data.Buffers.Length)); + } + } + + public static void EnsureVariadicBufferCount(this ArrayData data, int count) + { + if (data.Buffers.Length < count) { // TODO: Use localizable string resource throw new ArgumentException( diff --git a/csharp/src/Apache.Arrow/Extensions/FlatbufExtensions.cs b/csharp/src/Apache.Arrow/Extensions/FlatbufExtensions.cs index 5f39680b90ebc..b44c02d854077 100644 --- a/csharp/src/Apache.Arrow/Extensions/FlatbufExtensions.cs +++ b/csharp/src/Apache.Arrow/Extensions/FlatbufExtensions.cs @@ -19,25 +19,6 @@ namespace Apache.Arrow { internal static class FlatbufExtensions { - public static bool IsFixedPrimitive(this Flatbuf.Type t) - { - if (t == Flatbuf.Type.Utf8 || t == Flatbuf.Type.Binary) - return false; - return true; - } - - public static bool IsFixedPrimitive(this Types.IArrowType t) - { - return t.TypeId.IsFixedPrimitive(); - } - - public static bool IsFixedPrimitive(this Types.ArrowTypeId t) - { - if (t == Types.ArrowTypeId.String || t == Types.ArrowTypeId.Binary) - return false; - return true; - } - public static Types.IntervalUnit ToArrow(this Flatbuf.IntervalUnit unit) { switch (unit) diff --git a/csharp/src/Apache.Arrow/Flatbuf/BinaryView.cs b/csharp/src/Apache.Arrow/Flatbuf/BinaryView.cs new file mode 100644 index 0000000000000..2f9cca51737f8 --- /dev/null +++ b/csharp/src/Apache.Arrow/Flatbuf/BinaryView.cs @@ -0,0 
+1,47 @@ +// +// automatically generated by the FlatBuffers compiler, do not modify +// + +namespace Apache.Arrow.Flatbuf +{ + +using global::System; +using global::System.Collections.Generic; +using global::Google.FlatBuffers; + +/// Logically the same as Binary, but the internal representation uses a view +/// struct that contains the string length and either the string's entire data +/// inline (for small strings) or an inlined prefix, an index of another buffer, +/// and an offset pointing to a slice in that buffer (for non-small strings). +/// +/// Since it uses a variable number of data buffers, each Field with this type +/// must have a corresponding entry in `variadicBufferCounts`. +internal struct BinaryView : IFlatbufferObject +{ + private Table __p; + public ByteBuffer ByteBuffer { get { return __p.bb; } } + public static void ValidateVersion() { FlatBufferConstants.FLATBUFFERS_23_5_9(); } + public static BinaryView GetRootAsBinaryView(ByteBuffer _bb) { return GetRootAsBinaryView(_bb, new BinaryView()); } + public static BinaryView GetRootAsBinaryView(ByteBuffer _bb, BinaryView obj) { return (obj.__assign(_bb.GetInt(_bb.Position) + _bb.Position, _bb)); } + public void __init(int _i, ByteBuffer _bb) { __p = new Table(_i, _bb); } + public BinaryView __assign(int _i, ByteBuffer _bb) { __init(_i, _bb); return this; } + + + public static void StartBinaryView(FlatBufferBuilder builder) { builder.StartTable(0); } + public static Offset EndBinaryView(FlatBufferBuilder builder) { + int o = builder.EndTable(); + return new Offset(o); + } +} + + +static internal class BinaryViewVerify +{ + static public bool Verify(Google.FlatBuffers.Verifier verifier, uint tablePos) + { + return verifier.VerifyTableStart(tablePos) + && verifier.VerifyTableEnd(tablePos); + } +} + +} diff --git a/csharp/src/Apache.Arrow/Flatbuf/Enums/MetadataVersion.cs b/csharp/src/Apache.Arrow/Flatbuf/Enums/MetadataVersion.cs index 1e893e8cb6ffc..13b5315805dc9 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/Enums/MetadataVersion.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/Enums/MetadataVersion.cs @@ -8,21 +8,21 @@ namespace Apache.Arrow.Flatbuf internal enum MetadataVersion : short { /// 0.1.0 (October 2016). - V1 = 0, + V1 = 0, /// 0.2.0 (February 2017). Non-backwards compatible with V1. - V2 = 1, + V2 = 1, /// 0.3.0 -> 0.7.1 (May - December 2017). Non-backwards compatible with V2. - V3 = 2, + V3 = 2, /// >= 0.8.0 (December 2017). Non-backwards compatible with V3. - V4 = 3, - /// >= 1.0.0 (July 2020. Backwards compatible with V4 (V5 readers can read V4 + V4 = 3, + /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4 /// metadata and IPC messages). Implementations are recommended to provide a /// V4 compatibility mode with V5 format changes disabled. /// /// Incompatible changes between V4 and V5: /// - Union buffer layout has changed. In V5, Unions don't have a validity /// bitmap buffer. 
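For reference, the new C Data Interface format strings wired up above are "vz" for BinaryView, "vu" for Utf8View and "+vl" for ListView. A hedged round-trip sketch follows; the CArrowSchema.Create/Free, ExportType and ImportType entry points are assumed from the existing C interface surface and are not part of this change.

using System;
using Apache.Arrow.C;
using Apache.Arrow.Types;

unsafe
{
    // Export a listview<utf8view> type and import it back through the C interface.
    CArrowSchema* cSchema = CArrowSchema.Create();   // assumed existing helper
    try
    {
        CArrowSchemaExporter.ExportType(new ListViewType(StringViewType.Default), cSchema);
        ArrowType imported = CArrowSchemaImporter.ImportType(cSchema);    // assumed entry point
        Console.WriteLine(imported.TypeId);                               // ListView
        Console.WriteLine(((ListViewType)imported).ValueDataType.TypeId); // StringView
    }
    finally
    {
        CArrowSchema.Free(cSchema);   // assumed existing helper
    }
}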
- V5 = 4, + V5 = 4, }; diff --git a/csharp/src/Apache.Arrow/Flatbuf/Enums/Type.cs b/csharp/src/Apache.Arrow/Flatbuf/Enums/Type.cs index 10f852efb9b96..9c04288648dea 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/Enums/Type.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/Enums/Type.cs @@ -33,6 +33,10 @@ internal enum Type : byte LargeUtf8 = 20, LargeList = 21, RunEndEncoded = 22, + BinaryView = 23, + Utf8View = 24, + ListView = 25, + LargeListView = 26, }; @@ -110,6 +114,18 @@ static public bool Verify(Google.FlatBuffers.Verifier verifier, byte typeId, uin case Type.RunEndEncoded: result = RunEndEncodedVerify.Verify(verifier, tablePos); break; + case Type.BinaryView: + result = BinaryViewVerify.Verify(verifier, tablePos); + break; + case Type.Utf8View: + result = Utf8ViewVerify.Verify(verifier, tablePos); + break; + case Type.ListView: + result = ListViewVerify.Verify(verifier, tablePos); + break; + case Type.LargeListView: + result = LargeListViewVerify.Verify(verifier, tablePos); + break; default: result = true; break; } diff --git a/csharp/src/Apache.Arrow/Flatbuf/Field.cs b/csharp/src/Apache.Arrow/Flatbuf/Field.cs index c5c6c0a165598..efbc6afb06d03 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/Field.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/Field.cs @@ -57,6 +57,10 @@ internal struct Field : IFlatbufferObject public LargeUtf8 TypeAsLargeUtf8() { return Type().Value; } public LargeList TypeAsLargeList() { return Type().Value; } public RunEndEncoded TypeAsRunEndEncoded() { return Type().Value; } + public BinaryView TypeAsBinaryView() { return Type().Value; } + public Utf8View TypeAsUtf8View() { return Type().Value; } + public ListView TypeAsListView() { return Type().Value; } + public LargeListView TypeAsLargeListView() { return Type().Value; } /// Present only if the field is dictionary encoded. public DictionaryEncoding? Dictionary { get { int o = __p.__offset(12); return o != 0 ? (DictionaryEncoding?)(new DictionaryEncoding()).__assign(__p.__indirect(o + __p.bb_pos), __p.bb) : null; } } /// children apply only to nested data types like Struct, List and Union. For diff --git a/csharp/src/Apache.Arrow/Flatbuf/LargeListView.cs b/csharp/src/Apache.Arrow/Flatbuf/LargeListView.cs new file mode 100644 index 0000000000000..685e91333c38c --- /dev/null +++ b/csharp/src/Apache.Arrow/Flatbuf/LargeListView.cs @@ -0,0 +1,42 @@ +// +// automatically generated by the FlatBuffers compiler, do not modify +// + +namespace Apache.Arrow.Flatbuf +{ + +using global::System; +using global::System.Collections.Generic; +using global::Google.FlatBuffers; + +/// Same as ListView, but with 64-bit offsets and sizes, allowing to represent +/// extremely large data values. 
+internal struct LargeListView : IFlatbufferObject +{ + private Table __p; + public ByteBuffer ByteBuffer { get { return __p.bb; } } + public static void ValidateVersion() { FlatBufferConstants.FLATBUFFERS_23_5_9(); } + public static LargeListView GetRootAsLargeListView(ByteBuffer _bb) { return GetRootAsLargeListView(_bb, new LargeListView()); } + public static LargeListView GetRootAsLargeListView(ByteBuffer _bb, LargeListView obj) { return (obj.__assign(_bb.GetInt(_bb.Position) + _bb.Position, _bb)); } + public void __init(int _i, ByteBuffer _bb) { __p = new Table(_i, _bb); } + public LargeListView __assign(int _i, ByteBuffer _bb) { __init(_i, _bb); return this; } + + + public static void StartLargeListView(FlatBufferBuilder builder) { builder.StartTable(0); } + public static Offset EndLargeListView(FlatBufferBuilder builder) { + int o = builder.EndTable(); + return new Offset(o); + } +} + + +static internal class LargeListViewVerify +{ + static public bool Verify(Google.FlatBuffers.Verifier verifier, uint tablePos) + { + return verifier.VerifyTableStart(tablePos) + && verifier.VerifyTableEnd(tablePos); + } +} + +} diff --git a/csharp/src/Apache.Arrow/Flatbuf/ListView.cs b/csharp/src/Apache.Arrow/Flatbuf/ListView.cs new file mode 100644 index 0000000000000..d2e54e428524b --- /dev/null +++ b/csharp/src/Apache.Arrow/Flatbuf/ListView.cs @@ -0,0 +1,43 @@ +// +// automatically generated by the FlatBuffers compiler, do not modify +// + +namespace Apache.Arrow.Flatbuf +{ + +using global::System; +using global::System.Collections.Generic; +using global::Google.FlatBuffers; + +/// Represents the same logical types that List can, but contains offsets and +/// sizes allowing for writes in any order and sharing of child values among +/// list values. +internal struct ListView : IFlatbufferObject +{ + private Table __p; + public ByteBuffer ByteBuffer { get { return __p.bb; } } + public static void ValidateVersion() { FlatBufferConstants.FLATBUFFERS_23_5_9(); } + public static ListView GetRootAsListView(ByteBuffer _bb) { return GetRootAsListView(_bb, new ListView()); } + public static ListView GetRootAsListView(ByteBuffer _bb, ListView obj) { return (obj.__assign(_bb.GetInt(_bb.Position) + _bb.Position, _bb)); } + public void __init(int _i, ByteBuffer _bb) { __p = new Table(_i, _bb); } + public ListView __assign(int _i, ByteBuffer _bb) { __init(_i, _bb); return this; } + + + public static void StartListView(FlatBufferBuilder builder) { builder.StartTable(0); } + public static Offset EndListView(FlatBufferBuilder builder) { + int o = builder.EndTable(); + return new Offset(o); + } +} + + +static internal class ListViewVerify +{ + static public bool Verify(Google.FlatBuffers.Verifier verifier, uint tablePos) + { + return verifier.VerifyTableStart(tablePos) + && verifier.VerifyTableEnd(tablePos); + } +} + +} diff --git a/csharp/src/Apache.Arrow/Flatbuf/RecordBatch.cs b/csharp/src/Apache.Arrow/Flatbuf/RecordBatch.cs index 9ab9715165ddc..2df8716bc1655 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/RecordBatch.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/RecordBatch.cs @@ -38,27 +38,57 @@ internal struct RecordBatch : IFlatbufferObject public int BuffersLength { get { int o = __p.__offset(8); return o != 0 ? __p.__vector_len(o) : 0; } } /// Optional compression of the message body public BodyCompression? Compression { get { int o = __p.__offset(10); return o != 0 ? 
(BodyCompression?)(new BodyCompression()).__assign(__p.__indirect(o + __p.bb_pos), __p.bb) : null; } } + /// Some types such as Utf8View are represented using a variable number of buffers. + /// For each such Field in the pre-ordered flattened logical schema, there will be + /// an entry in variadicBufferCounts to indicate the number of number of variadic + /// buffers which belong to that Field in the current RecordBatch. + /// + /// For example, the schema + /// col1: Struct + /// col2: Utf8View + /// contains two Fields with variadic buffers so variadicBufferCounts will have + /// two entries, the first counting the variadic buffers of `col1.beta` and the + /// second counting `col2`'s. + /// + /// This field may be omitted if and only if the schema contains no Fields with + /// a variable number of buffers, such as BinaryView and Utf8View. + public long VariadicBufferCounts(int j) { int o = __p.__offset(12); return o != 0 ? __p.bb.GetLong(__p.__vector(o) + j * 8) : (long)0; } + public int VariadicBufferCountsLength { get { int o = __p.__offset(12); return o != 0 ? __p.__vector_len(o) : 0; } } +#if ENABLE_SPAN_T + public Span GetVariadicCountsBytes() { return __p.__vector_as_span(12, 8); } +#else + public ArraySegment? GetVariadicCountsBytes() { return __p.__vector_as_arraysegment(12); } +#endif + public long[] GetVariadicCountsArray() { return __p.__vector_as_array(12); } public static Offset CreateRecordBatch(FlatBufferBuilder builder, long length = 0, VectorOffset nodesOffset = default(VectorOffset), VectorOffset buffersOffset = default(VectorOffset), - Offset compressionOffset = default(Offset)) { - builder.StartTable(4); + Offset compressionOffset = default(Offset), + VectorOffset variadicCountsOffset = default(VectorOffset)) { + builder.StartTable(5); RecordBatch.AddLength(builder, length); + RecordBatch.AddVariadicCounts(builder, variadicCountsOffset); RecordBatch.AddCompression(builder, compressionOffset); RecordBatch.AddBuffers(builder, buffersOffset); RecordBatch.AddNodes(builder, nodesOffset); return RecordBatch.EndRecordBatch(builder); } - public static void StartRecordBatch(FlatBufferBuilder builder) { builder.StartTable(4); } + public static void StartRecordBatch(FlatBufferBuilder builder) { builder.StartTable(5); } public static void AddLength(FlatBufferBuilder builder, long length) { builder.AddLong(0, length, 0); } public static void AddNodes(FlatBufferBuilder builder, VectorOffset nodesOffset) { builder.AddOffset(1, nodesOffset.Value, 0); } public static void StartNodesVector(FlatBufferBuilder builder, int numElems) { builder.StartVector(16, numElems, 8); } public static void AddBuffers(FlatBufferBuilder builder, VectorOffset buffersOffset) { builder.AddOffset(2, buffersOffset.Value, 0); } public static void StartBuffersVector(FlatBufferBuilder builder, int numElems) { builder.StartVector(16, numElems, 8); } public static void AddCompression(FlatBufferBuilder builder, Offset compressionOffset) { builder.AddOffset(3, compressionOffset.Value, 0); } + public static void AddVariadicCounts(FlatBufferBuilder builder, VectorOffset variadicCountsOffset) { builder.AddOffset(4, variadicCountsOffset.Value, 0); } + public static VectorOffset CreateVariadicCountsVector(FlatBufferBuilder builder, long[] data) { builder.StartVector(8, data.Length, 8); for (int i = data.Length - 1; i >= 0; i--) builder.AddLong(data[i]); return builder.EndVector(); } + public static VectorOffset CreateVariadicCountsVectorBlock(FlatBufferBuilder builder, long[] data) { builder.StartVector(8, 
data.Length, 8); builder.Add(data); return builder.EndVector(); } + public static VectorOffset CreateVariadicCountsVectorBlock(FlatBufferBuilder builder, ArraySegment data) { builder.StartVector(8, data.Count, 8); builder.Add(data); return builder.EndVector(); } + public static VectorOffset CreateVariadicCountsVectorBlock(FlatBufferBuilder builder, IntPtr dataPtr, int sizeInBytes) { builder.StartVector(1, sizeInBytes, 1); builder.Add(dataPtr, sizeInBytes); return builder.EndVector(); } + public static void StartVariadicCountsVector(FlatBufferBuilder builder, int numElems) { builder.StartVector(8, numElems, 8); } public static Offset EndRecordBatch(FlatBufferBuilder builder) { int o = builder.EndTable(); return new Offset(o); @@ -75,6 +105,7 @@ static public bool Verify(Google.FlatBuffers.Verifier verifier, uint tablePos) && verifier.VerifyVectorOfData(tablePos, 6 /*Nodes*/, 16 /*FieldNode*/, false) && verifier.VerifyVectorOfData(tablePos, 8 /*Buffers*/, 16 /*Buffer*/, false) && verifier.VerifyTable(tablePos, 10 /*Compression*/, BodyCompressionVerify.Verify, false) + && verifier.VerifyVectorOfData(tablePos, 12 /*VariadicCounts*/, 8 /*long*/, false) && verifier.VerifyTableEnd(tablePos); } } diff --git a/csharp/src/Apache.Arrow/Flatbuf/SparseTensor.cs b/csharp/src/Apache.Arrow/Flatbuf/SparseTensor.cs index 3f9e1de7c00a9..099950fafe4ee 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/SparseTensor.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/SparseTensor.cs @@ -47,6 +47,10 @@ internal struct SparseTensor : IFlatbufferObject public LargeUtf8 TypeAsLargeUtf8() { return Type().Value; } public LargeList TypeAsLargeList() { return Type().Value; } public RunEndEncoded TypeAsRunEndEncoded() { return Type().Value; } + public BinaryView TypeAsBinaryView() { return Type().Value; } + public Utf8View TypeAsUtf8View() { return Type().Value; } + public ListView TypeAsListView() { return Type().Value; } + public LargeListView TypeAsLargeListView() { return Type().Value; } /// The dimensions of the tensor, optionally named. public TensorDim? Shape(int j) { int o = __p.__offset(8); return o != 0 ? (TensorDim?)(new TensorDim()).__assign(__p.__indirect(__p.__vector(o) + j * 4), __p.bb) : null; } public int ShapeLength { get { int o = __p.__offset(8); return o != 0 ? __p.__vector_len(o) : 0; } } diff --git a/csharp/src/Apache.Arrow/Flatbuf/Tensor.cs b/csharp/src/Apache.Arrow/Flatbuf/Tensor.cs index f8c213768a3fc..eb39257d861ca 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/Tensor.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/Tensor.cs @@ -46,6 +46,10 @@ internal struct Tensor : IFlatbufferObject public LargeUtf8 TypeAsLargeUtf8() { return Type().Value; } public LargeList TypeAsLargeList() { return Type().Value; } public RunEndEncoded TypeAsRunEndEncoded() { return Type().Value; } + public BinaryView TypeAsBinaryView() { return Type().Value; } + public Utf8View TypeAsUtf8View() { return Type().Value; } + public ListView TypeAsListView() { return Type().Value; } + public LargeListView TypeAsLargeListView() { return Type().Value; } /// The dimensions of the tensor, optionally named public TensorDim? Shape(int j) { int o = __p.__offset(8); return o != 0 ? (TensorDim?)(new TensorDim()).__assign(__p.__indirect(__p.__vector(o) + j * 4), __p.bb) : null; } public int ShapeLength { get { int o = __p.__offset(8); return o != 0 ? 
__p.__vector_len(o) : 0; } } diff --git a/csharp/src/Apache.Arrow/Flatbuf/Utf8View.cs b/csharp/src/Apache.Arrow/Flatbuf/Utf8View.cs new file mode 100644 index 0000000000000..e85c5374a9acc --- /dev/null +++ b/csharp/src/Apache.Arrow/Flatbuf/Utf8View.cs @@ -0,0 +1,47 @@ +// +// automatically generated by the FlatBuffers compiler, do not modify +// + +namespace Apache.Arrow.Flatbuf +{ + +using global::System; +using global::System.Collections.Generic; +using global::Google.FlatBuffers; + +/// Logically the same as Utf8, but the internal representation uses a view +/// struct that contains the string length and either the string's entire data +/// inline (for small strings) or an inlined prefix, an index of another buffer, +/// and an offset pointing to a slice in that buffer (for non-small strings). +/// +/// Since it uses a variable number of data buffers, each Field with this type +/// must have a corresponding entry in `variadicBufferCounts`. +internal struct Utf8View : IFlatbufferObject +{ + private Table __p; + public ByteBuffer ByteBuffer { get { return __p.bb; } } + public static void ValidateVersion() { FlatBufferConstants.FLATBUFFERS_23_5_9(); } + public static Utf8View GetRootAsUtf8View(ByteBuffer _bb) { return GetRootAsUtf8View(_bb, new Utf8View()); } + public static Utf8View GetRootAsUtf8View(ByteBuffer _bb, Utf8View obj) { return (obj.__assign(_bb.GetInt(_bb.Position) + _bb.Position, _bb)); } + public void __init(int _i, ByteBuffer _bb) { __p = new Table(_i, _bb); } + public Utf8View __assign(int _i, ByteBuffer _bb) { __init(_i, _bb); return this; } + + + public static void StartUtf8View(FlatBufferBuilder builder) { builder.StartTable(0); } + public static Offset EndUtf8View(FlatBufferBuilder builder) { + int o = builder.EndTable(); + return new Offset(o); + } +} + + +static internal class Utf8ViewVerify +{ + static public bool Verify(Google.FlatBuffers.Verifier verifier, uint tablePos) + { + return verifier.VerifyTableStart(tablePos) + && verifier.VerifyTableEnd(tablePos); + } +} + +} diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs b/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs index d3115da52cc6c..eb7349a570786 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs @@ -191,9 +191,7 @@ private List BuildArrays( Field field = schema.GetFieldByIndex(schemaFieldIndex++); Flatbuf.FieldNode fieldNode = recordBatchEnumerator.CurrentNode; - ArrayData arrayData = field.DataType.IsFixedPrimitive() - ? LoadPrimitiveField(version, ref recordBatchEnumerator, field, in fieldNode, messageBuffer, bufferCreator) - : LoadVariableField(version, ref recordBatchEnumerator, field, in fieldNode, messageBuffer, bufferCreator); + ArrayData arrayData = LoadField(version, ref recordBatchEnumerator, field, in fieldNode, messageBuffer, bufferCreator); arrays.Add(ArrowArrayFactory.BuildArray(arrayData)); } while (recordBatchEnumerator.MoveNextNode()); @@ -229,7 +227,7 @@ private IBufferCreator GetBufferCreator(BodyCompression? 
compression) return new DecompressingBufferCreator(decompressor, _allocator); } - private ArrayData LoadPrimitiveField( + private ArrayData LoadField( MetadataVersion version, ref RecordBatchEnumerator recordBatchEnumerator, Field field, @@ -276,6 +274,16 @@ private ArrayData LoadPrimitiveField( case ArrowTypeId.FixedSizeList: buffers = 1; break; + case ArrowTypeId.String: + case ArrowTypeId.Binary: + case ArrowTypeId.ListView: + buffers = 3; + break; + case ArrowTypeId.StringView: + case ArrowTypeId.BinaryView: + buffers = checked((int)(2 + recordBatchEnumerator.CurrentVariadicCount)); + recordBatchEnumerator.MoveNextVariadicCount(); + break; default: buffers = 2; break; @@ -300,54 +308,6 @@ private ArrayData LoadPrimitiveField( return new ArrayData(field.DataType, fieldLength, fieldNullCount, 0, arrowBuff, children, dictionary?.Data); } - private ArrayData LoadVariableField( - MetadataVersion version, - ref RecordBatchEnumerator recordBatchEnumerator, - Field field, - in Flatbuf.FieldNode fieldNode, - ByteBuffer bodyData, - IBufferCreator bufferCreator) - { - - ArrowBuffer nullArrowBuffer = BuildArrowBuffer(bodyData, recordBatchEnumerator.CurrentBuffer, bufferCreator); - if (!recordBatchEnumerator.MoveNextBuffer()) - { - throw new Exception("Unable to move to the next buffer."); - } - ArrowBuffer offsetArrowBuffer = BuildArrowBuffer(bodyData, recordBatchEnumerator.CurrentBuffer, bufferCreator); - if (!recordBatchEnumerator.MoveNextBuffer()) - { - throw new Exception("Unable to move to the next buffer."); - } - ArrowBuffer valueArrowBuffer = BuildArrowBuffer(bodyData, recordBatchEnumerator.CurrentBuffer, bufferCreator); - recordBatchEnumerator.MoveNextBuffer(); - - int fieldLength = (int)fieldNode.Length; - int fieldNullCount = (int)fieldNode.NullCount; - - if (fieldLength < 0) - { - throw new InvalidDataException("Field length must be >= 0"); // TODO: Localize exception message - } - - if (fieldNullCount < 0) - { - throw new InvalidDataException("Null count length must be >= 0"); //TODO: Localize exception message - } - - ArrowBuffer[] arrowBuff = new[] { nullArrowBuffer, offsetArrowBuffer, valueArrowBuffer }; - ArrayData[] children = GetChildren(version, ref recordBatchEnumerator, field, bodyData, bufferCreator); - - IArrowArray dictionary = null; - if (field.DataType.TypeId == ArrowTypeId.Dictionary) - { - long id = DictionaryMemo.GetId(field); - dictionary = DictionaryMemo.GetDictionary(id); - } - - return new ArrayData(field.DataType, fieldLength, fieldNullCount, 0, arrowBuff, children, dictionary?.Data); - } - private ArrayData[] GetChildren( MetadataVersion version, ref RecordBatchEnumerator recordBatchEnumerator, @@ -365,11 +325,7 @@ private ArrayData[] GetChildren( Flatbuf.FieldNode childFieldNode = recordBatchEnumerator.CurrentNode; Field childField = type.Fields[index]; - ArrayData child = childField.DataType.IsFixedPrimitive() - ? 
LoadPrimitiveField(version, ref recordBatchEnumerator, childField, in childFieldNode, bodyData, bufferCreator) - : LoadVariableField(version, ref recordBatchEnumerator, childField, in childFieldNode, bodyData, bufferCreator); - - children[index] = child; + children[index] = LoadField(version, ref recordBatchEnumerator, childField, in childFieldNode, bodyData, bufferCreator); } return children; } @@ -394,11 +350,14 @@ internal struct RecordBatchEnumerator private Flatbuf.RecordBatch RecordBatch { get; } internal int CurrentBufferIndex { get; private set; } internal int CurrentNodeIndex { get; private set; } + internal int CurrentVariadicCountIndex { get; private set; } internal Flatbuf.Buffer CurrentBuffer => RecordBatch.Buffers(CurrentBufferIndex).GetValueOrDefault(); internal Flatbuf.FieldNode CurrentNode => RecordBatch.Nodes(CurrentNodeIndex).GetValueOrDefault(); + internal long CurrentVariadicCount => RecordBatch.VariadicBufferCounts(CurrentVariadicCountIndex); + internal bool MoveNextBuffer() { return ++CurrentBufferIndex < RecordBatch.BuffersLength; @@ -409,11 +368,17 @@ internal bool MoveNextNode() return ++CurrentNodeIndex < RecordBatch.NodesLength; } + internal bool MoveNextVariadicCount() + { + return ++CurrentVariadicCountIndex < RecordBatch.VariadicBufferCountsLength; + } + internal RecordBatchEnumerator(in Flatbuf.RecordBatch recordBatch) { RecordBatch = recordBatch; CurrentBufferIndex = 0; CurrentNodeIndex = 0; + CurrentVariadicCountIndex = 0; } } } diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs index 5f490019b2133..07d1dcfdb171d 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs @@ -54,9 +54,12 @@ internal class ArrowRecordBatchFlatBufferBuilder : IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, @@ -81,6 +84,7 @@ public Buffer(ArrowBuffer buffer, int offset) public IReadOnlyList Buffers => _buffers; + public List VariadicCounts { get; private set; } public int TotalLength { get; private set; } public ArrowRecordBatchFlatBufferBuilder() @@ -121,6 +125,15 @@ public void Visit(ListArray array) array.Values.Accept(this); } + public void Visit(ListViewArray array) + { + _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); + _buffers.Add(CreateBuffer(array.ValueOffsetsBuffer)); + _buffers.Add(CreateBuffer(array.SizesBuffer)); + + array.Values.Accept(this); + } + public void Visit(FixedSizeListArray array) { _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); @@ -130,6 +143,8 @@ public void Visit(FixedSizeListArray array) public void Visit(StringArray array) => Visit(array as BinaryArray); + public void Visit(StringViewArray array) => Visit(array as BinaryViewArray); + public void Visit(BinaryArray array) { _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); @@ -137,6 +152,18 @@ public void Visit(BinaryArray array) _buffers.Add(CreateBuffer(array.ValueBuffer)); } + public void Visit(BinaryViewArray array) + { + _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); + _buffers.Add(CreateBuffer(array.ViewsBuffer)); + for (int i = 0; i < array.DataBufferCount; i++) + { + _buffers.Add(CreateBuffer(array.DataBuffer(i))); + } + VariadicCounts = VariadicCounts ?? 
new List(); + VariadicCounts.Add(array.DataBufferCount); + } + public void Visit(FixedSizeBinaryArray array) { _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); @@ -328,7 +355,7 @@ private protected void WriteRecordBatchInternal(RecordBatch recordBatch) HasWrittenDictionaryBatch = true; } - (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset) = + (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset, VectorOffset variadicCountsOffset) = PreparingWritingRecordBatch(recordBatch); VectorOffset buffersVectorOffset = Builder.EndVector(); @@ -339,7 +366,9 @@ private protected void WriteRecordBatchInternal(RecordBatch recordBatch) Offset recordBatchOffset = Flatbuf.RecordBatch.CreateRecordBatch(Builder, recordBatch.Length, fieldNodesVectorOffset, - buffersVectorOffset); + buffersVectorOffset, + default, + variadicCountsOffset); long metadataLength = WriteMessage(Flatbuf.MessageHeader.RecordBatch, recordBatchOffset, recordBatchBuilder.TotalLength); @@ -367,7 +396,7 @@ private protected async Task WriteRecordBatchInternalAsync(RecordBatch recordBat HasWrittenDictionaryBatch = true; } - (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset) = + (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset, VectorOffset variadicCountsOffset) = PreparingWritingRecordBatch(recordBatch); VectorOffset buffersVectorOffset = Builder.EndVector(); @@ -378,7 +407,9 @@ private protected async Task WriteRecordBatchInternalAsync(RecordBatch recordBat Offset recordBatchOffset = Flatbuf.RecordBatch.CreateRecordBatch(Builder, recordBatch.Length, fieldNodesVectorOffset, - buffersVectorOffset); + buffersVectorOffset, + default, + variadicCountsOffset); long metadataLength = await WriteMessageAsync(Flatbuf.MessageHeader.RecordBatch, recordBatchOffset, recordBatchBuilder.TotalLength, @@ -451,12 +482,12 @@ private async ValueTask WriteBufferDataAsync(IReadOnlyList PreparingWritingRecordBatch(RecordBatch recordBatch) + private Tuple PreparingWritingRecordBatch(RecordBatch recordBatch) { return PreparingWritingRecordBatch(recordBatch.Schema.FieldsList, recordBatch.ArrayList); } - private Tuple PreparingWritingRecordBatch(IReadOnlyList fields, IReadOnlyList arrays) + private Tuple PreparingWritingRecordBatch(IReadOnlyList fields, IReadOnlyList arrays) { Builder.Clear(); @@ -483,6 +514,12 @@ private Tuple PreparingWritingR fieldArray.Accept(recordBatchBuilder); } + VectorOffset variadicCountOffset = default; + if (recordBatchBuilder.VariadicCounts != null) + { + variadicCountOffset = Flatbuf.RecordBatch.CreateVariadicCountsVectorBlock(Builder, recordBatchBuilder.VariadicCounts.ToArray()); + } + IReadOnlyList buffers = recordBatchBuilder.Buffers; Flatbuf.RecordBatch.StartBuffersVector(Builder, buffers.Count); @@ -494,7 +531,7 @@ private Tuple PreparingWritingR buffers[i].Offset, buffers[i].DataBuffer.Length); } - return Tuple.Create(recordBatchBuilder, fieldNodesVectorOffset); + return Tuple.Create(recordBatchBuilder, fieldNodesVectorOffset, variadicCountOffset); } private protected virtual void StartingWritingDictionary() @@ -561,7 +598,7 @@ private protected async Task WriteDictionaryAsync(long id, IArrowType valueType, var arrays = new List { dictionary }; - (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset) = + (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset, VectorOffset variadicCountsOffset) = 
PreparingWritingRecordBatch(fields, arrays); VectorOffset buffersVectorOffset = Builder.EndVector(); @@ -569,7 +606,9 @@ private protected async Task WriteDictionaryAsync(long id, IArrowType valueType, // Serialize record batch Offset recordBatchOffset = Flatbuf.RecordBatch.CreateRecordBatch(Builder, dictionary.Length, fieldNodesVectorOffset, - buffersVectorOffset); + buffersVectorOffset, + default, + variadicCountsOffset); // TODO: Support delta. Offset dictionaryBatchOffset = Flatbuf.DictionaryBatch.CreateDictionaryBatch(Builder, id, recordBatchOffset, false); diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs b/csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs index 84ff4f9cc7202..473e18968f8cb 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs @@ -50,9 +50,13 @@ class TypeVisitor : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, +#if NET5_0_OR_GREATER + IArrowTypeVisitor, +#endif IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -60,8 +64,10 @@ class TypeVisitor : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -106,6 +112,14 @@ public void Visit(BinaryType type) Flatbuf.Binary.EndBinary(Builder)); } + public void Visit(BinaryViewType type) + { + Flatbuf.BinaryView.StartBinaryView(Builder); + Offset offset = Flatbuf.BinaryView.EndBinaryView(Builder); + Result = FieldType.Build( + Flatbuf.Type.BinaryView, offset); + } + public void Visit(ListType type) { Flatbuf.List.StartList(Builder); @@ -114,6 +128,14 @@ public void Visit(ListType type) Flatbuf.List.EndList(Builder)); } + public void Visit(ListViewType type) + { + Flatbuf.ListView.StartListView(Builder); + Result = FieldType.Build( + Flatbuf.Type.ListView, + Flatbuf.ListView.EndListView(Builder)); + } + public void Visit(FixedSizeListType type) { Result = FieldType.Build( @@ -136,6 +158,14 @@ public void Visit(StringType type) Flatbuf.Type.Utf8, offset); } + public void Visit(StringViewType type) + { + Flatbuf.Utf8View.StartUtf8View(Builder); + Offset offset = Flatbuf.Utf8View.EndUtf8View(Builder); + Result = FieldType.Build( + Flatbuf.Type.Utf8View, offset); + } + public void Visit(TimestampType type) { StringOffset timezoneStringOffset = default; @@ -169,6 +199,15 @@ public void Visit(Time32Type type) Flatbuf.Time.CreateTime(Builder, ToFlatBuffer(type.Unit))); } +#if NET5_0_OR_GREATER + public void Visit(HalfFloatType type) + { + Result = FieldType.Build( + Flatbuf.Type.FloatingPoint, + Flatbuf.FloatingPoint.CreateFloatingPoint(Builder, Precision.HALF)); + } +#endif + public void Visit(FloatType type) { Result = FieldType.Build( diff --git a/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs b/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs index 633554fc53261..0e6f330aef091 100644 --- a/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs +++ b/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs @@ -184,17 +184,27 @@ private static Types.IArrowType GetFieldArrowType(Flatbuf.Field field, Field[] c return Types.IntervalType.FromIntervalUnit(intervalMetadata.Unit.ToArrow()); case Flatbuf.Type.Utf8: return Types.StringType.Default; + case Flatbuf.Type.Utf8View: + return Types.StringViewType.Default; case Flatbuf.Type.FixedSizeBinary: Flatbuf.FixedSizeBinary fixedSizeBinaryMetadata = 
field.Type().Value; return new Types.FixedSizeBinaryType(fixedSizeBinaryMetadata.ByteWidth); case Flatbuf.Type.Binary: return Types.BinaryType.Default; + case Flatbuf.Type.BinaryView: + return Types.BinaryViewType.Default; case Flatbuf.Type.List: if (childFields == null || childFields.Length != 1) { throw new InvalidDataException($"List type must have exactly one child."); } return new Types.ListType(childFields[0]); + case Flatbuf.Type.ListView: + if (childFields == null || childFields.Length != 1) + { + throw new InvalidDataException($"List view type must have exactly one child."); + } + return new Types.ListViewType(childFields[0]); case Flatbuf.Type.FixedSizeList: if (childFields == null || childFields.Length != 1) { diff --git a/csharp/src/Apache.Arrow/Scalars/BinaryView.cs b/csharp/src/Apache.Arrow/Scalars/BinaryView.cs new file mode 100644 index 0000000000000..eaba89c7a3a8e --- /dev/null +++ b/csharp/src/Apache.Arrow/Scalars/BinaryView.cs @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace Apache.Arrow.Scalars +{ + [StructLayout(LayoutKind.Explicit)] + public unsafe struct BinaryView : IEquatable + { + public const int PrefixLength = 4; + public const int MaxInlineLength = 12; + + [FieldOffset(0)] + public readonly int Length; + + [FieldOffset(4)] + internal readonly int _prefix; + + [FieldOffset(8)] + internal readonly int _bufferIndex; + + [FieldOffset(12)] + internal readonly int _bufferOffset; + + [FieldOffset(4)] + internal fixed byte _inline[MaxInlineLength]; + + public unsafe BinaryView(ReadOnlySpan inline) : this() + { + if (inline.Length > MaxInlineLength) + { + throw new ArgumentException("invalid inline data length", nameof(inline)); + } + + Length = inline.Length; + fixed (byte* dest = _inline) + fixed (byte* src = inline) + { + Buffer.MemoryCopy(src, dest, MaxInlineLength, inline.Length); + } + } + + public BinaryView(int length, ReadOnlySpan prefix, int bufferIndex, int bufferOffset) + { + if (length < MaxInlineLength) + { + throw new ArgumentException("invalid length", nameof(length)); + } + if (prefix.Length != PrefixLength) + { + throw new ArgumentException("invalid prefix length", nameof(prefix)); + } + + Length = length; + _bufferIndex = bufferIndex; + _bufferOffset = bufferOffset; + _prefix = prefix.CastTo()[0]; + } + + private BinaryView(int length, int prefix, int bufferIndex, int offset) + { + Length = length; + _prefix = prefix; + _bufferIndex = bufferIndex; + _bufferOffset = offset; + } + + public bool IsInline => Length <= MaxInlineLength; + +#if NET5_0_OR_GREATER + public ReadOnlySpan Bytes => MemoryMarshal.CreateReadOnlySpan(ref Unsafe.AsRef(_inline[0]), IsInline ? 
Length : PrefixLength); +#else + public unsafe ReadOnlySpan Bytes => new ReadOnlySpan(Unsafe.AsPointer(ref _inline[0]), IsInline ? Length : PrefixLength); +#endif + + public int BufferIndex => IsInline ? -1 : _bufferIndex; + + public int BufferOffset => IsInline ? -1 : _bufferOffset; + + public override int GetHashCode() => Length ^ _prefix ^ _bufferIndex ^ _bufferOffset; + + public override bool Equals(object obj) + { + BinaryView? other = obj as BinaryView?; + return other != null && Equals(other.Value); + } + + public bool Equals(BinaryView other) => + Length == other.Length && _prefix == other._prefix && _bufferIndex == other._bufferIndex && _bufferOffset == other._bufferOffset; + + internal BinaryView AdjustBufferIndex(int bufferOffset) + { + return new BinaryView(Length, _prefix, _bufferIndex + bufferOffset, _bufferOffset); + } + } +} diff --git a/csharp/src/Apache.Arrow/Types/BinaryViewType.cs b/csharp/src/Apache.Arrow/Types/BinaryViewType.cs new file mode 100644 index 0000000000000..f5cfc034dc967 --- /dev/null +++ b/csharp/src/Apache.Arrow/Types/BinaryViewType.cs @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +namespace Apache.Arrow.Types +{ + public class BinaryViewType: ArrowType + { + public static readonly BinaryViewType Default = new BinaryViewType(); + + public override ArrowTypeId TypeId => ArrowTypeId.BinaryView; + public override string Name => "binaryview"; + + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); + } +} diff --git a/csharp/src/Apache.Arrow/Types/IArrowType.cs b/csharp/src/Apache.Arrow/Types/IArrowType.cs index 5e107813be828..cf520391fe1e6 100644 --- a/csharp/src/Apache.Arrow/Types/IArrowType.cs +++ b/csharp/src/Apache.Arrow/Types/IArrowType.cs @@ -50,6 +50,9 @@ public enum ArrowTypeId FixedSizeList, Duration, RecordBatch, + BinaryView, + StringView, + ListView, } public interface IArrowType diff --git a/csharp/src/Apache.Arrow/Types/ListViewType.cs b/csharp/src/Apache.Arrow/Types/ListViewType.cs new file mode 100644 index 0000000000000..ecf745723c4ae --- /dev/null +++ b/csharp/src/Apache.Arrow/Types/ListViewType.cs @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +namespace Apache.Arrow.Types +{ + public sealed class ListViewType : NestedType + { + public override ArrowTypeId TypeId => ArrowTypeId.ListView; + public override string Name => "listview"; + + public Field ValueField => Fields[0]; + + public IArrowType ValueDataType => Fields[0].DataType; + + public ListViewType(Field valueField) + : base(valueField) { } + + public ListViewType(IArrowType valueDataType) + : this(new Field("item", valueDataType, true)) { } + + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); + } +} diff --git a/csharp/src/Apache.Arrow/Types/StringViewType.cs b/csharp/src/Apache.Arrow/Types/StringViewType.cs new file mode 100644 index 0000000000000..0c539a56b03b5 --- /dev/null +++ b/csharp/src/Apache.Arrow/Types/StringViewType.cs @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
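The BinaryView scalar introduced earlier in this patch (Scalars/BinaryView.cs) is the 16-byte struct that the view arrays and types here are built on. A small sketch of its two shapes, based on the constructors shown above:

using System;
using Apache.Arrow.Scalars;

// Inline view: up to BinaryView.MaxInlineLength (12) bytes live inside the struct.
var inline = new BinaryView(new byte[] { 10, 20, 30, 40, 50 });
Console.WriteLine(inline.IsInline);     // True
Console.WriteLine(inline.Length);       // 5
Console.WriteLine(inline.BufferIndex);  // -1 (no data buffer referenced)

// Out-of-line view: only the length, a 4-byte prefix, a data-buffer index and an
// offset are stored; the 20 value bytes themselves live in variadic data buffer 0.
byte[] prefix = { 1, 2, 3, 4 };
var longView = new BinaryView(20, prefix, bufferIndex: 0, bufferOffset: 128);
Console.WriteLine(longView.IsInline);      // False
Console.WriteLine(longView.BufferOffset);  // 128
Console.WriteLine(longView.Bytes.Length);  // 4 (just the prefix)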
+ + +namespace Apache.Arrow.Types +{ + public sealed class StringViewType : ArrowType + { + public static StringViewType Default = new StringViewType(); + + public override ArrowTypeId TypeId => ArrowTypeId.StringView; + public override string Name => "utf8view"; + + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); + } +} diff --git a/csharp/test/Apache.Arrow.Benchmarks/ArrowWriterBenchmark.cs b/csharp/test/Apache.Arrow.Benchmarks/ArrowWriterBenchmark.cs index c791c9969356a..f35c2a5d78d79 100644 --- a/csharp/test/Apache.Arrow.Benchmarks/ArrowWriterBenchmark.cs +++ b/csharp/test/Apache.Arrow.Benchmarks/ArrowWriterBenchmark.cs @@ -38,7 +38,7 @@ public class ArrowWriterBenchmark [GlobalSetup] public void GlobalSetup() { - _batch = TestData.CreateSampleRecordBatch(BatchLength, ColumnSetCount, false); + _batch = TestData.CreateSampleRecordBatch(BatchLength, ColumnSetCount); _memoryStream = new MemoryStream(); } diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index 94ef4b5f3c5f5..dd2c75dd3df90 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -8,8 +8,8 @@ - - + + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index 46d0a59b5d8e1..0e9c02d61977c 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -7,8 +7,8 @@ - - + + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index 99c772770d6c6..d38413ba45b3a 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -7,8 +7,8 @@ - - + + diff --git a/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs b/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs index f3fe73588a7bb..31a5676f01315 100644 --- a/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs +++ b/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs @@ -21,6 +21,7 @@ using System.Numerics; using System.Text; using System.Text.Json; +using System.Text.Json.Nodes; using System.Text.Json.Serialization; using System.Threading.Tasks; using Apache.Arrow.Arrays; @@ -175,7 +176,9 @@ private static IArrowType ToArrowType(JsonArrowType type, Field[] children) "floatingpoint" => ToFloatingPointArrowType(type), "decimal" => ToDecimalArrowType(type), "binary" => BinaryType.Default, + "binaryview" => BinaryViewType.Default, "utf8" => StringType.Default, + "utf8view" => StringViewType.Default, "fixedsizebinary" => new FixedSizeBinaryType(type.ByteWidth), "date" => ToDateArrowType(type), "time" => ToTimeArrowType(type), @@ -184,6 +187,7 @@ private static IArrowType ToArrowType(JsonArrowType type, Field[] children) "interval_mdn" => ToIntervalArrowType(type), "timestamp" => ToTimestampArrowType(type), "list" => ToListArrowType(type, children), + "listview" => ToListViewArrowType(type, children), "fixedsizelist" => ToFixedSizeListArrowType(type, children), "struct" => ToStructArrowType(type, children), "union" => ToUnionArrowType(type, children), @@ -294,6 +298,11 
@@ private static IArrowType ToListArrowType(JsonArrowType type, Field[] children) return new ListType(children[0]); } + private static IArrowType ToListViewArrowType(JsonArrowType type, Field[] children) + { + return new ListViewType(children[0]); + } + private static IArrowType ToFixedSizeListArrowType(JsonArrowType type, Field[] children) { return new FixedSizeListType(children[0], type.ListSize); @@ -451,9 +460,12 @@ private class ArrayCreator : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -652,6 +664,38 @@ public void Visit(StringType type) Array = new StringArray(JsonFieldData.Count, offsetBuffer, valueBuffer, validityBuffer, nullCount); } + public void Visit(StringViewType type) + { + ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); + + // ArrowBuffer viewsBuffer = GetViewsBuffer(); + ArrowBuffer viewsBuffer = ArrowBuffer.Empty; + if (JsonFieldData.Views != null) + { + ArrowBuffer.Builder viewBuilder = new ArrowBuffer.Builder(JsonFieldData.Views.Count); + foreach (JsonView jsonView in JsonFieldData.Views) + { + BinaryView view = (jsonView.BufferIndex == null) ? + new BinaryView(Encoding.UTF8.GetBytes(jsonView.Inlined)) : + new BinaryView(jsonView.Size, Convert.FromHexString(jsonView.PrefixHex), jsonView.BufferIndex.Value, jsonView.Offset.Value); + viewBuilder.Append(view); + } + viewsBuffer = viewBuilder.Build(); + } + + int bufferCount = JsonFieldData.VariadicDataBuffers?.Count ?? 0; + ArrowBuffer[] buffers = new ArrowBuffer[2 + bufferCount]; + buffers[0] = validityBuffer; + buffers[1] = viewsBuffer; + for (int i = 0; i < bufferCount; i++) + { + buffers[i + 2] = new ArrowBuffer(Convert.FromHexString(JsonFieldData.VariadicDataBuffers[i])).Clone(); + } + + ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0, buffers); + Array = new StringViewArray(arrayData); + } + public void Visit(BinaryType type) { ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); @@ -671,6 +715,38 @@ public void Visit(BinaryType type) Array = new BinaryArray(arrayData); } + public void Visit(BinaryViewType type) + { + ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); + + // ArrowBuffer viewsBuffer = GetViewsBuffer(); + ArrowBuffer viewsBuffer = ArrowBuffer.Empty; + if (JsonFieldData.Views != null) + { + ArrowBuffer.Builder viewBuilder = new ArrowBuffer.Builder(JsonFieldData.Views.Count); + foreach (JsonView jsonView in JsonFieldData.Views) + { + BinaryView view = (jsonView.BufferIndex == null) ? + new BinaryView(Convert.FromHexString(jsonView.Inlined)) : + new BinaryView(jsonView.Size, Convert.FromHexString(jsonView.PrefixHex), jsonView.BufferIndex.Value, jsonView.Offset.Value); + viewBuilder.Append(view); + } + viewsBuffer = viewBuilder.Build(); + } + + int bufferCount = JsonFieldData.VariadicDataBuffers?.Count ?? 
0; + ArrowBuffer[] buffers = new ArrowBuffer[2 + bufferCount]; + buffers[0] = validityBuffer; + buffers[1] = viewsBuffer; + for (int i = 0; i < bufferCount; i++) + { + buffers[i + 2] = new ArrowBuffer(Convert.FromHexString(JsonFieldData.VariadicDataBuffers[i])).Clone(); + } + + ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0, buffers); + Array = new BinaryViewArray(arrayData); + } + public void Visit(FixedSizeBinaryType type) { ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); @@ -704,6 +780,22 @@ public void Visit(ListType type) Array = new ListArray(arrayData); } + public void Visit(ListViewType type) + { + ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); + ArrowBuffer offsetBuffer = GetOffsetBuffer(); + ArrowBuffer sizeBuffer = GetSizeBuffer(); + + var data = JsonFieldData; + JsonFieldData = data.Children[0]; + type.ValueDataType.Accept(this); + JsonFieldData = data; + + ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0, + new[] { validityBuffer, offsetBuffer, sizeBuffer }, new[] { Array.Data }); + Array = new ListViewArray(arrayData); + } + public void Visit(FixedSizeListType type) { ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); @@ -878,11 +970,18 @@ private void GenerateArray(Func valueOffsets = new ArrowBuffer.Builder(JsonFieldData.Offset.Length); - valueOffsets.AppendRange(JsonFieldData.Offset); + ArrowBuffer.Builder valueOffsets = new ArrowBuffer.Builder(JsonFieldData.Offset.Count); + valueOffsets.AppendRange(JsonFieldData.IntOffset); return valueOffsets.Build(default); } + private ArrowBuffer GetSizeBuffer() + { + ArrowBuffer.Builder valueSizes = new ArrowBuffer.Builder(JsonFieldData.Size.Count); + valueSizes.AppendRange(JsonFieldData.IntSize); + return valueSizes.Build(default); + } + private ArrowBuffer GetTypeIdBuffer() { ArrowBuffer.Builder typeIds = new ArrowBuffer.Builder(JsonFieldData.TypeId.Length); @@ -920,10 +1019,61 @@ public class JsonFieldData public string Name { get; set; } public int Count { get; set; } public bool[] Validity { get; set; } - public int[] Offset { get; set; } + public JsonArray Offset { get; set; } + + [JsonPropertyName("SIZE")] + public JsonArray Size { get; set; } public int[] TypeId { get; set; } public JsonElement Data { get; set; } public List Children { get; set; } + + [JsonPropertyName("VIEWS")] + public List Views { get; set; } + + [JsonPropertyName("VARIADIC_DATA_BUFFERS")] + public List VariadicDataBuffers { get; set; } + + [JsonIgnore] + public IEnumerable IntOffset + { + get { return Offset.Select(GetInt); } + } + + [JsonIgnore] + public IEnumerable IntSize + { + get { return Size.Select(GetInt); } + } + + static int GetInt(JsonNode node) + { + try + { + return node.GetValue(); + } + catch + { + return int.Parse(node.GetValue()); + } + } + } + + public class JsonView + { + [JsonPropertyName("SIZE")] + public int Size { get; set; } + + [JsonPropertyName("INLINED")] + public string Inlined { get; set; } + + [JsonPropertyName("PREFIX_HEX")] + public string PrefixHex { get; set; } + + [JsonPropertyName("BUFFER_INDEX")] + public int? BufferIndex { get; set; } + + [JsonPropertyName("OFFSET")] + public int? 
Offset { get; set; } } internal sealed class ValidityConverter : JsonConverter diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index fde30a90e6479..0afd1490e7b69 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -15,8 +15,8 @@ - - + + all runtime; build; native; contentfiles; analyzers diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs index 137dc16d473a4..25ef289f0dc25 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs @@ -64,13 +64,16 @@ private static IEnumerable, IArrowArray>> GenerateTestDa FloatType.Default, DoubleType.Default, BinaryType.Default, + BinaryViewType.Default, StringType.Default, + StringViewType.Default, Date32Type.Default, Date64Type.Default, TimestampType.Default, new Decimal128Type(14, 10), new Decimal256Type(14,10), new ListType(Int64Type.Default), + new ListViewType(Int64Type.Default), new StructType(new List{ new Field.Builder().Name("Strings").DataType(StringType.Default).Nullable(true).Build(), new Field.Builder().Name("Ints").DataType(Int32Type.Default).Nullable(true).Build() @@ -122,7 +125,9 @@ private class TestDataGenerator : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -131,6 +136,7 @@ private class TestDataGenerator : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -368,6 +374,34 @@ public void Visit(BinaryType type) ExpectedArray = resultBuilder.Build(); } + public void Visit(BinaryViewType type) + { + BinaryViewArray.Builder resultBuilder = new BinaryViewArray.Builder().Reserve(_baseDataTotalElementCount); + + for (int i = 0; i < _baseDataListCount; i++) + { + List dataList = _baseData[i]; + BinaryViewArray.Builder builder = new BinaryViewArray.Builder().Reserve(dataList.Count); + + foreach (byte? value in dataList) + { + if (value.HasValue) + { + builder.Append(value.Value); + resultBuilder.Append(value.Value); + } + else + { + builder.AppendNull(); + resultBuilder.AppendNull(); + } + } + TestTargetArrayList.Add(builder.Build()); + } + + ExpectedArray = resultBuilder.Build(); + } + public void Visit(StringType type) { StringArray.Builder resultBuilder = new StringArray.Builder().Reserve(_baseDataTotalElementCount); @@ -388,6 +422,26 @@ public void Visit(StringType type) ExpectedArray = resultBuilder.Build(); } + public void Visit(StringViewType type) + { + StringViewArray.Builder resultBuilder = new StringViewArray.Builder().Reserve(_baseDataTotalElementCount); + + for (int i = 0; i < _baseDataListCount; i++) + { + List dataList = _baseData[i]; + StringViewArray.Builder builder = new StringViewArray.Builder().Reserve(dataList.Count); + + foreach (string value in dataList.Select(_ => _.ToString() ?? 
null)) + { + builder.Append(value); + resultBuilder.Append(value); + } + TestTargetArrayList.Add(builder.Build()); + } + + ExpectedArray = resultBuilder.Build(); + } + public void Visit(ListType type) { ListArray.Builder resultBuilder = new ListArray.Builder(type.ValueDataType).Reserve(_baseDataTotalElementCount); @@ -423,6 +477,41 @@ public void Visit(ListType type) ExpectedArray = resultBuilder.Build(); } + public void Visit(ListViewType type) + { + ListViewArray.Builder resultBuilder = new ListViewArray.Builder(type.ValueDataType).Reserve(_baseDataTotalElementCount); + Int64Array.Builder resultValueBuilder = (Int64Array.Builder)resultBuilder.ValueBuilder.Reserve(_baseDataTotalElementCount); + + for (int i = 0; i < _baseDataListCount; i++) + { + List dataList = _baseData[i]; + + ListViewArray.Builder builder = new ListViewArray.Builder(type.ValueField).Reserve(dataList.Count); + Int64Array.Builder valueBuilder = (Int64Array.Builder)builder.ValueBuilder.Reserve(dataList.Count); + + foreach (long? value in dataList) + { + if (value.HasValue) + { + builder.Append(); + resultBuilder.Append(); + + valueBuilder.Append(value.Value); + resultValueBuilder.Append(value.Value); + } + else + { + builder.AppendNull(); + resultBuilder.AppendNull(); + } + } + + TestTargetArrayList.Add(builder.Build()); + } + + ExpectedArray = resultBuilder.Build(); + } + public void Visit(FixedSizeListType type) { FixedSizeListArray.Builder resultBuilder = new FixedSizeListArray.Builder(type.ValueDataType, type.ListSize).Reserve(_baseDataTotalElementCount); diff --git a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs index 2aaffe7835258..10315ff287c0b 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs @@ -20,6 +20,7 @@ using System.Threading.Tasks; using Apache.Arrow.Arrays; using Xunit; +using System.Diagnostics; namespace Apache.Arrow.Tests { @@ -90,10 +91,13 @@ private class ArrayComparer : IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, @@ -136,12 +140,15 @@ public ArrayComparer(IArrowArray expectedArray, bool strictCompare) public void Visit(DayTimeIntervalArray array) => CompareArrays(array); public void Visit(MonthDayNanosecondIntervalArray array) => CompareArrays(array); public void Visit(ListArray array) => CompareArrays(array); + public void Visit(ListViewArray array) => CompareArrays(array); public void Visit(FixedSizeListArray array) => CompareArrays(array); public void Visit(FixedSizeBinaryArray array) => CompareArrays(array); public void Visit(Decimal128Array array) => CompareArrays(array); public void Visit(Decimal256Array array) => CompareArrays(array); public void Visit(StringArray array) => CompareBinaryArrays(array); + public void Visit(StringViewArray array) => CompareVariadicArrays(array); public void Visit(BinaryArray array) => CompareBinaryArrays(array); + public void Visit(BinaryViewArray array) => CompareVariadicArrays(array); public void Visit(StructArray array) { @@ -230,6 +237,32 @@ private void CompareBinaryArrays(BinaryArray actualArray) } } + private void CompareVariadicArrays(BinaryViewArray actualArray) + where T : IArrowArray + { + Assert.IsAssignableFrom(_expectedArray); + Assert.IsAssignableFrom(actualArray); + + var expectedArray = 
(BinaryViewArray)_expectedArray; + + actualArray.Data.DataType.Accept(_arrayTypeComparer); + + Assert.Equal(expectedArray.Length, actualArray.Length); + Assert.Equal(expectedArray.NullCount, actualArray.NullCount); + Assert.Equal(expectedArray.Offset, actualArray.Offset); + + CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, actualArray.NullBitmapBuffer); + + Assert.True(expectedArray.Views.SequenceEqual(actualArray.Views)); + + for (int i = 0; i < expectedArray.Length; i++) + { + Assert.True( + expectedArray.GetBytes(i).SequenceEqual(actualArray.GetBytes(i)), + $"BinaryArray values do not match at index {i}."); + } + } + private void CompareArrays(FixedSizeBinaryArray actualArray) { Assert.IsAssignableFrom(_expectedArray); @@ -346,6 +379,34 @@ private void CompareArrays(ListArray actualArray) actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare)); } + private void CompareArrays(ListViewArray actualArray) + { + Assert.IsAssignableFrom(_expectedArray); + ListViewArray expectedArray = (ListViewArray)_expectedArray; + + actualArray.Data.DataType.Accept(_arrayTypeComparer); + + Assert.Equal(expectedArray.Length, actualArray.Length); + Assert.Equal(expectedArray.NullCount, actualArray.NullCount); + Assert.Equal(expectedArray.Offset, actualArray.Offset); + + CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, actualArray.NullBitmapBuffer); + + if (_strictCompare) + { + Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span)); + Assert.True(expectedArray.SizesBuffer.Span.SequenceEqual(actualArray.SizesBuffer.Span)); + } + else + { + int length = expectedArray.Length * sizeof(int); + Assert.True(expectedArray.ValueOffsetsBuffer.Span.Slice(0, length).SequenceEqual(actualArray.ValueOffsetsBuffer.Span.Slice(0, length))); + Assert.True(expectedArray.SizesBuffer.Span.Slice(0, length).SequenceEqual(actualArray.SizesBuffer.Span.Slice(0, length))); + } + + actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare)); + } + private void CompareArrays(FixedSizeListArray actualArray) { Assert.IsAssignableFrom(_expectedArray); diff --git a/csharp/test/Apache.Arrow.Tests/BinaryViewTests.cs b/csharp/test/Apache.Arrow.Tests/BinaryViewTests.cs new file mode 100644 index 0000000000000..eb617b4dedc75 --- /dev/null +++ b/csharp/test/Apache.Arrow.Tests/BinaryViewTests.cs @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
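[Editorial aside, not part of the patch.] Before the BinaryView tests that follow, here is a minimal sketch of the builder-level API the updated tests exercise for the three new array kinds. Every member used below (Reserve, Append, AppendNull, Build, ValueBuilder, GetBytes) appears in the test code in this diff; the assumption that Build() returns the concrete array type mirrors the existing Binary/String builders.

using System;
using Apache.Arrow;
using Apache.Arrow.Types;

class ViewArraySketch
{
    static void Main()
    {
        // StringViewArray: strings of 12 bytes or less are stored inline in the
        // views buffer, longer ones go into variadic data buffers.
        var stringBuilder = new StringViewArray.Builder().Reserve(3);
        stringBuilder.Append("length=ten");
        stringBuilder.Append("length=tenlength=ten");
        stringBuilder.AppendNull();
        StringViewArray strings = stringBuilder.Build();
        Console.WriteLine($"{strings.Length} strings, {strings.NullCount} null");

        // BinaryViewArray: same pattern for raw bytes.
        var binaryBuilder = new BinaryViewArray.Builder().Reserve(2);
        ReadOnlySpan<byte> payload = new byte[] { 1, 2, 3, 4 };
        binaryBuilder.Append(payload);
        binaryBuilder.AppendNull();
        BinaryViewArray binaries = binaryBuilder.Build();
        Console.WriteLine($"first value has {binaries.GetBytes(0).Length} bytes");

        // ListViewArray: each Append() opens a list view whose elements are fed
        // through the child ValueBuilder, as in the concatenator tests above.
        var listBuilder = new ListViewArray.Builder(Int64Type.Default).Reserve(2);
        var valueBuilder = (Int64Array.Builder)listBuilder.ValueBuilder.Reserve(2);
        listBuilder.Append();
        valueBuilder.Append(1L);
        listBuilder.AppendNull();
        ListViewArray lists = listBuilder.Build();
        Console.WriteLine($"{lists.Length} lists, {lists.NullCount} null");
    }
}

Under the hood each of these arrays carries a validity buffer plus either a views buffer and any number of variadic data buffers (binary/string views) or offsets and sizes buffers (list views), which is the same ArrayData layout the JSON integration reader above assembles.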
+ +using System; +using Apache.Arrow.Scalars; +using Xunit; + +namespace Apache.Arrow.Tests +{ + public class BinaryViewTests + { + private static readonly byte[] empty = new byte[0]; + private static readonly byte[] oneByte = new byte[1]; + private static readonly byte[] fourBytes = new byte[] { 1, 2, 3, 4 }; + private static readonly byte[] fiveBytes = new byte[] { 5, 4, 3, 2, 1 }; + private static readonly byte[] twelveBytes = new byte[] { 1, 2, 3, 4, 8, 7, 6, 5, 9, 10, 11, 12 }; + private static readonly byte[] thirteenBytes = new byte[13]; + + [Fact] + public void Equality() + { + BinaryView one = new BinaryView(oneByte); + BinaryView four = new BinaryView(fourBytes); + BinaryView twelve = new BinaryView(twelveBytes); + BinaryView twelvePlus = new BinaryView(13, fourBytes, 0, 0); + Assert.Equal(one, one); + Assert.NotEqual(one, four); + Assert.NotEqual(four, twelve); + Assert.NotEqual(four, twelvePlus); + } + + [Fact] + public void ConstructorThrows() + { + Assert.Throws(() => new BinaryView(thirteenBytes)); + Assert.Throws(() => new BinaryView(20, empty, 0, 0)); + Assert.Throws(() => new BinaryView(20, fiveBytes, 0, 0)); + Assert.Throws(() => new BinaryView(13, thirteenBytes, 0, 0)); + Assert.Throws(() => new BinaryView(4, fourBytes, 0, 0)); + } + + [Fact] + public void ConstructInline() + { + BinaryView zero = new BinaryView(empty); + Assert.Equal(-1, zero.BufferIndex); + Assert.Equal(-1, zero.BufferOffset); + Assert.Equal(0, zero.Length); + Assert.Equal(0, zero.Bytes.Length); + + BinaryView one = new BinaryView(oneByte); + Assert.Equal(-1, one.BufferIndex); + Assert.Equal(-1, one.BufferOffset); + Assert.Equal(1, one.Length); + Assert.Equal(1, one.Bytes.Length); + Assert.Equal((byte)0, one.Bytes[0]); + + BinaryView twelve = new BinaryView(twelveBytes); + Assert.Equal(-1, one.BufferIndex); + Assert.Equal(-1, one.BufferOffset); + Assert.Equal(12, twelve.Length); + Assert.Equal(12, twelve.Bytes.Length); + Assert.Equal((byte)8, twelve.Bytes[4]); + } + + [Fact] + public void ConstructPrefix() + { + BinaryView four = new BinaryView(14, fourBytes, 2, 3); + Assert.Equal(2, four.BufferIndex); + Assert.Equal(3, four.BufferOffset); + Assert.Equal(14, four.Length); + Assert.Equal(4, four.Bytes.Length); + Assert.Equal((byte)2, four.Bytes[1]); + } + } +} diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs index 83902d8d93c70..274434e4bab09 100644 --- a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs +++ b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs @@ -741,7 +741,9 @@ public unsafe void ExportBatch() [SkippableFact] public unsafe void RoundTripTestBatch() { - RecordBatch batch1 = TestData.CreateSampleRecordBatch(4, createDictionaryArray: true); + // TODO: Enable these once this the version of pyarrow referenced during testing supports them + HashSet unsupported = new HashSet { ArrowTypeId.ListView, ArrowTypeId.BinaryView, ArrowTypeId.StringView }; + RecordBatch batch1 = TestData.CreateSampleRecordBatch(4, excludedTypes: unsupported); RecordBatch batch2 = batch1.Clone(); CArrowArray* cExportArray = CArrowArray.Create(); diff --git a/csharp/test/Apache.Arrow.Tests/TableTests.cs b/csharp/test/Apache.Arrow.Tests/TableTests.cs index d52b514e092d9..83c88265d172b 100644 --- a/csharp/test/Apache.Arrow.Tests/TableTests.cs +++ b/csharp/test/Apache.Arrow.Tests/TableTests.cs @@ -62,7 +62,11 @@ public void TestTableFromRecordBatches() Table table1 = Table.TableFromRecordBatches(recordBatch1.Schema, 
recordBatches); Assert.Equal(20, table1.RowCount); - Assert.Equal(30, table1.ColumnCount); +#if NET5_0_OR_GREATER + Assert.Equal(35, table1.ColumnCount); +#else + Assert.Equal(34, table1.ColumnCount); +#endif Assert.Equal("ChunkedArray: Length=20, DataType=list", table1.Column(0).Data.ToString()); FixedSizeBinaryType type = new FixedSizeBinaryType(17); diff --git a/csharp/test/Apache.Arrow.Tests/TestData.cs b/csharp/test/Apache.Arrow.Tests/TestData.cs index b43321abd7499..29ddef2864862 100644 --- a/csharp/test/Apache.Arrow.Tests/TestData.cs +++ b/csharp/test/Apache.Arrow.Tests/TestData.cs @@ -24,53 +24,66 @@ namespace Apache.Arrow.Tests { public static class TestData { - public static RecordBatch CreateSampleRecordBatch(int length, bool createDictionaryArray = true) + public static RecordBatch CreateSampleRecordBatch(int length, bool createDictionaryArray) { - return CreateSampleRecordBatch(length, columnSetCount: 1, createDictionaryArray); + HashSet excluded = createDictionaryArray ? null : new HashSet { ArrowTypeId.Dictionary }; + return CreateSampleRecordBatch(length, columnSetCount: 1, excluded); } - public static RecordBatch CreateSampleRecordBatch(int length, int columnSetCount, bool createAdvancedTypeArrays) + public static RecordBatch CreateSampleRecordBatch( + int length, + int columnSetCount = 1, + HashSet excludedTypes = null) { Schema.Builder builder = new Schema.Builder(); - for (int i = 0; i < columnSetCount; i++) + + void AddField(Field field) { - builder.Field(CreateField(new ListType(Int64Type.Default), i)); - builder.Field(CreateField(BooleanType.Default, i)); - builder.Field(CreateField(UInt8Type.Default, i)); - builder.Field(CreateField(Int8Type.Default, i)); - builder.Field(CreateField(UInt16Type.Default, i)); - builder.Field(CreateField(Int16Type.Default, i)); - builder.Field(CreateField(UInt32Type.Default, i)); - builder.Field(CreateField(Int32Type.Default, i)); - builder.Field(CreateField(UInt64Type.Default, i)); - builder.Field(CreateField(Int64Type.Default, i)); - builder.Field(CreateField(FloatType.Default, i)); - builder.Field(CreateField(DoubleType.Default, i)); - builder.Field(CreateField(Date32Type.Default, i)); - builder.Field(CreateField(Date64Type.Default, i)); - builder.Field(CreateField(Time32Type.Default, i)); - builder.Field(CreateField(Time64Type.Default, i)); - builder.Field(CreateField(TimestampType.Default, i)); - builder.Field(CreateField(StringType.Default, i)); - builder.Field(CreateField(new StructType(new List { CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }), i)); - builder.Field(CreateField(new Decimal128Type(10, 6), i)); - builder.Field(CreateField(new Decimal256Type(16, 8), i)); - builder.Field(CreateField(new MapType(StringType.Default, Int32Type.Default), i)); - builder.Field(CreateField(IntervalType.YearMonth, i)); - builder.Field(CreateField(IntervalType.DayTime, i)); - builder.Field(CreateField(IntervalType.MonthDayNanosecond, i)); - - if (createAdvancedTypeArrays) + if (excludedTypes == null || !excludedTypes.Contains(field.DataType.TypeId)) { - builder.Field(CreateField(new DictionaryType(Int32Type.Default, StringType.Default, false), i)); - builder.Field(CreateField(new FixedSizeBinaryType(16), i)); - builder.Field(CreateField(new FixedSizeListType(Int32Type.Default, 3), i)); - builder.Field(CreateField(new UnionType(new[] { CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }, new[] { 0, 1 }, UnionMode.Sparse), i)); - builder.Field(CreateField(new UnionType(new[] { 
CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }, new[] { 0, 1 }, UnionMode.Dense), -i)); + builder.Field(field); } + } - //builder.Field(CreateField(HalfFloatType.Default)); - //builder.Field(CreateField(StringType.Default)); + for (int i = 0; i < columnSetCount; i++) + { + AddField(CreateField(new ListType(Int64Type.Default), i)); + AddField(CreateField(new ListViewType(Int64Type.Default), i)); + AddField(CreateField(BooleanType.Default, i)); + AddField(CreateField(UInt8Type.Default, i)); + AddField(CreateField(Int8Type.Default, i)); + AddField(CreateField(UInt16Type.Default, i)); + AddField(CreateField(Int16Type.Default, i)); + AddField(CreateField(UInt32Type.Default, i)); + AddField(CreateField(Int32Type.Default, i)); + AddField(CreateField(UInt64Type.Default, i)); + AddField(CreateField(Int64Type.Default, i)); +#if NET5_0_OR_GREATER + AddField(CreateField(HalfFloatType.Default, i)); +#endif + AddField(CreateField(FloatType.Default, i)); + AddField(CreateField(DoubleType.Default, i)); + AddField(CreateField(Date32Type.Default, i)); + AddField(CreateField(Date64Type.Default, i)); + AddField(CreateField(Time32Type.Default, i)); + AddField(CreateField(Time64Type.Default, i)); + AddField(CreateField(TimestampType.Default, i)); + AddField(CreateField(StringType.Default, i)); + AddField(CreateField(StringViewType.Default, i)); + AddField(CreateField(new StructType(new List { CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }), i)); + AddField(CreateField(new Decimal128Type(10, 6), i)); + AddField(CreateField(new Decimal256Type(16, 8), i)); + AddField(CreateField(new MapType(StringType.Default, Int32Type.Default), i)); + AddField(CreateField(IntervalType.YearMonth, i)); + AddField(CreateField(IntervalType.DayTime, i)); + AddField(CreateField(IntervalType.MonthDayNanosecond, i)); + AddField(CreateField(BinaryType.Default, i)); + AddField(CreateField(BinaryViewType.Default, i)); + AddField(CreateField(new FixedSizeBinaryType(16), i)); + AddField(CreateField(new FixedSizeListType(Int32Type.Default, 3), i)); + AddField(CreateField(new UnionType(new[] { CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }, new[] { 0, 1 }, UnionMode.Sparse), i)); + AddField(CreateField(new UnionType(new[] { CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }, new[] { 0, 1 }, UnionMode.Dense), -i)); + AddField(CreateField(new DictionaryType(Int32Type.Default, StringType.Default, false), i)); } Schema schema = builder.Build(); @@ -130,16 +143,23 @@ private class ArrayCreator : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, +#if NET5_0_OR_GREATER + IArrowTypeVisitor, +#endif IArrowTypeVisitor { private int Length { get; } @@ -160,6 +180,9 @@ public ArrayCreator(int length) public void Visit(UInt32Type type) => GenerateArray(new UInt32Array.Builder(), x => (uint)x); public void Visit(UInt64Type type) => GenerateArray(new UInt64Array.Builder(), x => (ulong)x); public void Visit(FloatType type) => GenerateArray(new FloatArray.Builder(), x => ((float)x / Length)); +#if NET5_0_OR_GREATER + public void Visit(HalfFloatType type) => GenerateArray(new HalfFloatArray.Builder(), x => ((Half)x / (Half)Length)); +#endif public void Visit(DoubleType type) => 
GenerateArray(new DoubleArray.Builder(), x => ((double)x / Length)); public void Visit(Decimal128Type type) { @@ -277,6 +300,30 @@ public void Visit(StringType type) Array = builder.Build(); } + public void Visit(StringViewType type) + { + var str = "length=ten"; + var builder = new StringViewArray.Builder(); + + for (var i = 0; i < Length; i++) + { + switch (i % 3) + { + case 0: + builder.AppendNull(); + break; + case 1: + builder.Append(str); + break; + case 2: + builder.Append(str + str); + break; + } + } + + Array = builder.Build(); + } + public void Visit(ListType type) { var builder = new ListArray.Builder(type.ValueField).Reserve(Length); @@ -294,6 +341,23 @@ public void Visit(ListType type) Array = builder.Build(); } + public void Visit(ListViewType type) + { + var builder = new ListViewArray.Builder(type.ValueField).Reserve(Length); + + var valueBuilder = (Int64Array.Builder)builder.ValueBuilder.Reserve(Length + 1); + + for (var i = 0; i < Length; i++) + { + builder.Append(); + valueBuilder.Append(i); + } + //Add a value to check if Values.Length can exceed ListArray.Length + valueBuilder.Append(0); + + Array = builder.Build(); + } + public void Visit(FixedSizeListType type) { var builder = new FixedSizeListArray.Builder(type.ValueField, type.ListSize).Reserve(Length); @@ -411,6 +475,64 @@ public void Visit(DictionaryType type) Array = new DictionaryArray(type, indicesBuilder.Build(), valueBuilder.Build()); } + public void Visit(BinaryType type) + { + ReadOnlySpan shortData = new[] { (byte)0, (byte)1, (byte)2, (byte)3, (byte)4, (byte)5, (byte)6, (byte)7, (byte)8, (byte)9 }; + ReadOnlySpan longData = new[] + { + (byte)0, (byte)1, (byte)2, (byte)3, (byte)4, (byte)5, (byte)6, (byte)7, (byte)8, (byte)9, + (byte)10, (byte)11, (byte)12, (byte)13, (byte)14, (byte)15, (byte)16, (byte)17, (byte)18, (byte)19 + }; + var builder = new BinaryArray.Builder(); + + for (var i = 0; i < Length; i++) + { + switch (i % 3) + { + case 0: + builder.AppendNull(); + break; + case 1: + builder.Append(shortData); + break; + case 2: + builder.Append(longData); + break; + } + } + + Array = builder.Build(); + } + + public void Visit(BinaryViewType type) + { + ReadOnlySpan shortData = new[] { (byte)0, (byte)1, (byte)2, (byte)3, (byte)4, (byte)5, (byte)6, (byte)7, (byte)8, (byte)9 }; + ReadOnlySpan longData = new[] + { + (byte)0, (byte)1, (byte)2, (byte)3, (byte)4, (byte)5, (byte)6, (byte)7, (byte)8, (byte)9, + (byte)10, (byte)11, (byte)12, (byte)13, (byte)14, (byte)15, (byte)16, (byte)17, (byte)18, (byte)19 + }; + var builder = new BinaryViewArray.Builder(); + + for (var i = 0; i < Length; i++) + { + switch (i % 3) + { + case 0: + builder.AppendNull(); + break; + case 1: + builder.Append(shortData); + break; + case 2: + builder.Append(longData); + break; + } + } + + Array = builder.Build(); + } + public void Visit(FixedSizeBinaryType type) { ArrowBuffer.Builder valueBuilder = new ArrowBuffer.Builder(); diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 2bbc843836af9..230ec5b3effff 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1932,13 +1932,12 @@ def _temp_path(): .skip_tester('Rust'), generate_binary_view_case() - .skip_tester('C#') .skip_tester('Java') .skip_tester('JS') .skip_tester('Rust'), generate_list_view_case() - .skip_tester('C#') + .skip_tester('C#') # Doesn't support large list views .skip_tester('Java') .skip_tester('JS') .skip_tester('Rust'), diff --git 
a/docs/source/format/CDataInterface/PyCapsuleInterface.rst b/docs/source/format/CDataInterface/PyCapsuleInterface.rst index 0c1a01d7c6778..03095aa2e9356 100644 --- a/docs/source/format/CDataInterface/PyCapsuleInterface.rst +++ b/docs/source/format/CDataInterface/PyCapsuleInterface.rst @@ -16,6 +16,8 @@ .. under the License. +.. _arrow-pycapsule-interface: + ============================= The Arrow PyCapsule Interface ============================= diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index ee92cebcb549c..b7261005e66ee 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -21,6 +21,38 @@ Extending pyarrow ================= +Controlling conversion to (Py)Arrow with the PyCapsule Interface +---------------------------------------------------------------- + +The :ref:`Arrow C data interface ` allows moving Arrow data between +different implementations of Arrow. This is a generic, cross-language interface not +specific to Python, but for Python libraries this interface is extended with a Python +specific layer: :ref:`arrow-pycapsule-interface`. + +This Python interface ensures that different libraries that support the C Data interface +can export Arrow data structures in a standard way and recognize each other's objects. + +If you have a Python library providing data structures that hold Arrow-compatible data +under the hood, you can implement the following methods on those objects: + +- ``__arrow_c_schema__`` for schema or type-like objects. +- ``__arrow_c_array__`` for arrays and record batches (contiguous tables). +- ``__arrow_c_stream__`` for chunked tables or streams of data. + +Those methods return `PyCapsule `__ +objects, and more details on the exact semantics can be found in the +:ref:`specification `. + +When your data structures have those methods defined, the PyArrow constructors +(such as :func:`pyarrow.array` or :func:`pyarrow.table`) will recognize those objects as +supporting this protocol, and convert them to PyArrow data structures zero-copy. The +same can be true for any other library that supports this protocol when ingesting data. + +Similarly, if your library has functions that accept user-provided data, you can add +support for this protocol by checking for the presence of those methods, and +therefore accept any Arrow data (instead of hardcoding support for a specific +Arrow producer such as PyArrow). + .. _arrow_array_protocol: Controlling conversion to pyarrow.Array with the ``__arrow_array__`` protocol diff --git a/docs/source/python/parquet.rst b/docs/source/python/parquet.rst index 85a9674a689ca..d4717897660b6 100644 --- a/docs/source/python/parquet.rst +++ b/docs/source/python/parquet.rst @@ -511,36 +511,20 @@ from a remote filesystem into a pandas dataframe you may need to run ``sort_index`` to maintain row ordering (as long as the ``preserve_index`` option was enabled on write). -.. note:: - - The ParquetDataset is being reimplemented based on the new generic Dataset - API (see the :ref:`dataset` docs for an overview). This is not yet the - default, but can already be enabled by passing the ``use_legacy_dataset=False`` - keyword to :class:`ParquetDataset` or :func:`read_table`:: - - pq.ParquetDataset('dataset_name/', use_legacy_dataset=False) - - Enabling this gives the following new features: - - - Filtering on all columns (using row group statistics) instead of only on - the partition keys.
- - More fine-grained partitioning: support for a directory partitioning scheme - in addition to the Hive-like partitioning (e.g. "/2019/11/15/" instead of - "/year=2019/month=11/day=15/"), and the ability to specify a schema for - the partition keys. - - General performance improvement and bug fixes. +Other features: - It also has the following changes in behaviour: +- Filtering on all columns (using row group statistics) instead of only on + the partition keys. +- Fine-grained partitioning: support for a directory partitioning scheme + in addition to the Hive-like partitioning (e.g. "/2019/11/15/" instead of + "/year=2019/month=11/day=15/"), and the ability to specify a schema for + the partition keys. - - The partition keys need to be explicitly included in the ``columns`` - keyword when you want to include them in the result while reading a - subset of the columns +Note: - This new implementation is already enabled in ``read_table``, and in the - future, this will be turned on by default for ``ParquetDataset``. The new - implementation does not yet cover all existing ParquetDataset features (e.g. - specifying the ``metadata``, or the ``pieces`` property API). Feedback is - very welcome. +- The partition keys need to be explicitly included in the ``columns`` + keyword when you want to include them in the result while reading a + subset of the columns Using with Spark diff --git a/docs/source/status.rst b/docs/source/status.rst index e860aceb76e15..03a87012342c2 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -68,9 +68,13 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Large Utf8 | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Binary View | ✓ | | ✓ | | | | | | +| Binary View | ✓ | | ✓ | | ✓ | | | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| String View | ✓ | | ✓ | | | | | | +| Large Binary View | ✓ | | ✓ | | | | | | ++-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| Utf8 View | ✓ | | ✓ | | ✓ | | | | ++-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| Large Utf8 View | ✓ | | ✓ | | | | | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ @@ -83,7 +87,7 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Large List | ✓ | ✓ | ✓ | | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| List View | ✓ | | ✓ | | | | | | +| List View | ✓ | | ✓ | | ✓ | | | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Large List View | ✓ | | ✓ | | | | | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ diff --git a/java/compression/src/main/java/org/apache/arrow/compression/Lz4CompressionCodec.java b/java/compression/src/main/java/org/apache/arrow/compression/Lz4CompressionCodec.java index daa35b7e15be6..e8b780638e2c1 100644 --- a/java/compression/src/main/java/org/apache/arrow/compression/Lz4CompressionCodec.java +++ b/java/compression/src/main/java/org/apache/arrow/compression/Lz4CompressionCodec.java @@ -79,6 +79,7 @@ protected ArrowBuf doDecompress(BufferAllocator allocator, ArrowBuf compressedBu 
byte[] outBytes = out.toByteArray(); ArrowBuf decompressedBuffer = allocator.buffer(outBytes.length); decompressedBuffer.setBytes(/*index=*/0, outBytes); + decompressedBuffer.writerIndex(decompressedLength); return decompressedBuffer; } diff --git a/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodec.java b/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodec.java index 403130edba52e..01156fa2b0e0b 100644 --- a/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodec.java +++ b/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodec.java @@ -117,6 +117,12 @@ private List deCompressBuffers(CompressionCodec codec, List return outputBuffers; } + private void assertWriterIndex(List decompressedBuffers) { + for (ArrowBuf decompressedBuf : decompressedBuffers) { + assertTrue(decompressedBuf.writerIndex() > 0); + } + } + @ParameterizedTest @MethodSource("codecs") void testCompressFixedWidthBuffers(int vectorLength, CompressionCodec codec) throws Exception { @@ -139,6 +145,7 @@ void testCompressFixedWidthBuffers(int vectorLength, CompressionCodec codec) thr List decompressedBuffers = deCompressBuffers(codec, compressedBuffers); assertEquals(2, decompressedBuffers.size()); + assertWriterIndex(decompressedBuffers); // orchestrate new vector IntVector newVec = new IntVector("new vec", allocator); @@ -180,6 +187,7 @@ void testCompressVariableWidthBuffers(int vectorLength, CompressionCodec codec) List decompressedBuffers = deCompressBuffers(codec, compressedBuffers); assertEquals(3, decompressedBuffers.size()); + assertWriterIndex(decompressedBuffers); // orchestrate new vector VarCharVector newVec = new VarCharVector("new vec", allocator); diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtils.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtils.java index d50dc385a62e1..ffb0048181c7c 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtils.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtils.java @@ -115,6 +115,16 @@ static KeyStore getKeyStoreInstance(String instance) return keyStore; } + @VisibleForTesting + static KeyStore getDefaultKeyStoreInstance(String password) + throws KeyStoreException, CertificateException, NoSuchAlgorithmException, IOException { + try (InputStream fileInputStream = getKeystoreInputStream()) { + KeyStore keyStore = KeyStore.getInstance(KeyStore.getDefaultType()); + keyStore.load(fileInputStream, password == null ? 
null : password.toCharArray()); + return keyStore; + } + } + static String getOperatingSystem() { return System.getProperty("os.name"); } @@ -156,16 +166,9 @@ public static InputStream getCertificateInputStreamFromSystem(String password) t keyStoreList.add(getKeyStoreInstance("Windows-MY")); } else if (isMac()) { keyStoreList.add(getKeyStoreInstance("KeychainStore")); + keyStoreList.add(getDefaultKeyStoreInstance(password)); } else { - try (InputStream fileInputStream = getKeystoreInputStream()) { - KeyStore keyStore = KeyStore.getInstance(KeyStore.getDefaultType()); - if (password == null) { - keyStore.load(fileInputStream, null); - } else { - keyStore.load(fileInputStream, password.toCharArray()); - } - keyStoreList.add(keyStore); - } + keyStoreList.add(getDefaultKeyStoreInstance(password)); } return getCertificatesInputStream(keyStoreList); diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java index 27bba64587367..b7977462e9c01 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java @@ -77,6 +77,33 @@ public void testGetKeyStoreInstance() throws IOException, } } + @Test + public void testGetDefaultKeyStoreInstancePassword() throws IOException, + KeyStoreException, CertificateException, NoSuchAlgorithmException { + try (MockedStatic keyStoreMockedStatic = Mockito.mockStatic(KeyStore.class)) { + + keyStoreMockedStatic + .when(() -> ClientAuthenticationUtils.getDefaultKeyStoreInstance("changeit")) + .thenReturn(keyStoreMock); + KeyStore receiveKeyStore = ClientAuthenticationUtils.getDefaultKeyStoreInstance("changeit"); + Assert.assertEquals(receiveKeyStore, keyStoreMock); + } + } + + @Test + public void testGetDefaultKeyStoreInstanceNoPassword() throws IOException, + KeyStoreException, CertificateException, NoSuchAlgorithmException { + try (MockedStatic keyStoreMockedStatic = Mockito.mockStatic(KeyStore.class)) { + + keyStoreMockedStatic + .when(() -> ClientAuthenticationUtils.getDefaultKeyStoreInstance(null)) + .thenReturn(keyStoreMock); + KeyStore receiveKeyStore = ClientAuthenticationUtils.getDefaultKeyStoreInstance(null); + Assert.assertEquals(receiveKeyStore, keyStoreMock); + } + } + + @Test public void testGetCertificateInputStreamFromMacSystem() throws IOException, KeyStoreException, CertificateException, NoSuchAlgorithmException { @@ -90,11 +117,18 @@ public void testGetCertificateInputStreamFromMacSystem() throws IOException, keyStoreMockedStatic.when(() -> ClientAuthenticationUtils .getKeyStoreInstance("KeychainStore")) .thenReturn(keyStoreMock); + keyStoreMockedStatic.when(() -> ClientAuthenticationUtils + .getDefaultKeyStoreInstance("changeit")) + .thenReturn(keyStoreMock); + clientAuthenticationUtilsMockedStatic + .when(ClientAuthenticationUtils::getKeystoreInputStream) + .thenCallRealMethod(); + keyStoreMockedStatic.when(KeyStore::getDefaultType).thenCallRealMethod(); keyStoreMockedStatic.when(() -> ClientAuthenticationUtils .getCertificatesInputStream(Mockito.any())) .thenReturn(mock); - InputStream inputStream = ClientAuthenticationUtils.getCertificateInputStreamFromSystem("test"); + InputStream inputStream = 
ClientAuthenticationUtils.getCertificateInputStreamFromSystem("changeit"); Assert.assertEquals(inputStream, mock); } } @@ -136,9 +170,11 @@ public void testGetCertificateInputStreamFromLinuxSystem() throws IOException, setOperatingSystemMock(clientAuthenticationUtilsMockedStatic, false, false); keyStoreMockedStatic.when(() -> ClientAuthenticationUtils - .getCertificatesInputStream(Mockito.any())) + .getCertificatesInputStream(Mockito.any())) .thenReturn(mock); - + keyStoreMockedStatic.when(() -> ClientAuthenticationUtils + .getDefaultKeyStoreInstance(Mockito.any())) + .thenReturn(keyStoreMock); clientAuthenticationUtilsMockedStatic .when(ClientAuthenticationUtils::getKeystoreInputStream) .thenCallRealMethod(); diff --git a/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java b/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java index 06c6669cfd162..ba9aba353c351 100644 --- a/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java +++ b/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java @@ -71,7 +71,7 @@ public UnsafeDirectLittleEndian allocate(long size) { } public int getChunkSize() { - return allocator.chunkSize; + return allocator.chunkSize(); } public long getHugeBufferSize() { @@ -137,7 +137,6 @@ private class InnerAllocator extends PooledByteBufAllocator { private final PoolArena[] directArenas; private final MemoryStatusThread statusThread; - private final int chunkSize; public InnerAllocator() { super(true); @@ -150,8 +149,6 @@ public InnerAllocator() { throw new RuntimeException("Failure while initializing allocator. Unable to retrieve direct arenas field.", e); } - this.chunkSize = directArenas[0].chunkSize; - if (memoryLogger.isTraceEnabled()) { statusThread = new MemoryStatusThread(this); statusThread.start(); @@ -166,7 +163,7 @@ private UnsafeDirectLittleEndian newDirectBufferL(int initialCapacity, int maxCa if (directArena != null) { - if (initialCapacity > directArena.chunkSize) { + if (initialCapacity > chunkSize()) { // This is beyond chunk size so we'll allocate separately. 
ByteBuf buf = UnpooledByteBufAllocator.DEFAULT.directBuffer(initialCapacity, maxCapacity); diff --git a/java/performance/pom.xml b/java/performance/pom.xml index a3e4da85b4321..4d449af46b6b1 100644 --- a/java/performance/pom.xml +++ b/java/performance/pom.xml @@ -199,7 +199,7 @@ maven-resources-plugin - 2.6 + 3.3.1 maven-site-plugin @@ -211,7 +211,7 @@ maven-surefire-plugin - 3.0.0-M7 + 3.2.3 diff --git a/java/pom.xml b/java/pom.xml index 75e0946f10811..523e5642720cd 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -33,7 +33,7 @@ 5.10.1 2.0.9 32.1.3-jre - 4.1.100.Final + 4.1.104.Final 1.60.0 3.23.1 2.16.0 @@ -412,7 +412,7 @@ org.apache.maven.plugins maven-resources-plugin - 2.6 + 3.3.1 org.apache.maven.plugins @@ -442,7 +442,7 @@ maven-surefire-plugin - 3.0.0-M7 + 3.2.3 org.junit.jupiter @@ -609,7 +609,7 @@ org.assertj assertj-core - 3.23.1 + 3.24.2 test diff --git a/js/src/builder.ts b/js/src/builder.ts index a4e2d4d89325c..1880db3818ca5 100644 --- a/js/src/builder.ts +++ b/js/src/builder.ts @@ -342,7 +342,7 @@ export abstract class Builder { export abstract class FixedWidthBuilder extends Builder { constructor(opts: BuilderOptions) { super(opts); - this._values = new DataBufferBuilder(new this.ArrayType(0), this.stride); + this._values = new DataBufferBuilder(this.ArrayType, 0, this.stride); } public setValue(index: number, value: T['TValue']) { const values = this._values; diff --git a/js/src/builder/binary.ts b/js/src/builder/binary.ts index 3c12ddf34abb0..fa9a11b24ec39 100644 --- a/js/src/builder/binary.ts +++ b/js/src/builder/binary.ts @@ -16,15 +16,15 @@ // under the License. import { Binary } from '../type.js'; -import { toUint8Array } from '../util/buffer.js'; import { BufferBuilder } from './buffer.js'; import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; +import { toUint8Array } from '../util/buffer.js'; /** @ignore */ export class BinaryBuilder extends VariableWidthBuilder { constructor(opts: BuilderOptions) { super(opts); - this._values = new BufferBuilder(new Uint8Array(0)); + this._values = new BufferBuilder(Uint8Array); } public get byteLength(): number { let size = this._pendingLength + (this.length * 4); diff --git a/js/src/builder/buffer.ts b/js/src/builder/buffer.ts index 402172059682c..18c6dcda738b9 100644 --- a/js/src/builder/buffer.ts +++ b/js/src/builder/buffer.ts @@ -24,20 +24,36 @@ function roundLengthUpToNearest64Bytes(len: number, BPE: number) { const bytesMinus1 = Math.ceil(len) * BPE - 1; return ((bytesMinus1 - bytesMinus1 % 64 + 64) || 64) / BPE; } + /** @ignore */ -const sliceOrExtendArray = (arr: T, len = 0) => ( - arr.length >= len ? arr.subarray(0, len) : memcpy(new (arr.constructor as any)(len), arr, 0) -) as T; +function resizeArray(arr: T, len = 0): T { + // TODO: remove when https://github.com/microsoft/TypeScript/issues/54636 is fixed + const buffer = arr.buffer as ArrayBufferLike & { resizable: boolean; resize: (byteLength: number) => void; maxByteLength: number }; + const byteLength = len * arr.BYTES_PER_ELEMENT; + if (buffer.resizable && byteLength <= buffer.maxByteLength) { + buffer.resize(byteLength); + return arr; + } + + // Fallback for non-resizable buffers + return arr.length >= len ? 
+ arr.subarray(0, len) as T : + memcpy(new (arr.constructor as any)(len), arr, 0); +} + +/** @ignore */ +export const SAFE_ARRAY_SIZE = 2 ** 32 - 1; /** @ignore */ export class BufferBuilder { - constructor(buffer: T, stride = 1) { - this.buffer = buffer; + constructor(bufferType: ArrayCtor, initialSize = 0, stride = 1) { + this.length = Math.ceil(initialSize / stride); + // TODO: remove as any when https://github.com/microsoft/TypeScript/issues/54636 is fixed + this.buffer = new bufferType(new (ArrayBuffer as any)(this.length * bufferType.BYTES_PER_ELEMENT, { maxByteLength: SAFE_ARRAY_SIZE })) as T; this.stride = stride; - this.BYTES_PER_ELEMENT = buffer.BYTES_PER_ELEMENT; - this.ArrayType = buffer.constructor as ArrayCtor; - this._resize(this.length = Math.ceil(buffer.length / stride)); + this.BYTES_PER_ELEMENT = bufferType.BYTES_PER_ELEMENT; + this.ArrayType = bufferType; } public buffer: T; @@ -72,17 +88,18 @@ export class BufferBuilder { } public flush(length = this.length) { length = roundLengthUpToNearest64Bytes(length * this.stride, this.BYTES_PER_ELEMENT); - const array = sliceOrExtendArray(this.buffer, length); + const array = resizeArray(this.buffer, length); this.clear(); return array; } public clear() { this.length = 0; - this._resize(0); + // TODO: remove as any when https://github.com/microsoft/TypeScript/issues/54636 is fixed + this.buffer = new this.ArrayType(new (ArrayBuffer as any)(0, { maxByteLength: SAFE_ARRAY_SIZE })) as T; return this; } protected _resize(newLength: number) { - return this.buffer = memcpy(new this.ArrayType(newLength), this.buffer); + return this.buffer = resizeArray(this.buffer, newLength); } } @@ -100,7 +117,7 @@ export class DataBufferBuilder extends Buffe /** @ignore */ export class BitmapBufferBuilder extends DataBufferBuilder { - constructor(data = new Uint8Array(0)) { super(data, 1 / 8); } + constructor() { super(Uint8Array, 0, 1 / 8); } public numValid = 0; public get numInvalid() { return this.length - this.numValid; } @@ -123,9 +140,8 @@ export class BitmapBufferBuilder extends DataBufferBuilder { /** @ignore */ export class OffsetsBufferBuilder extends DataBufferBuilder { constructor(type: T) { - super(new type.OffsetArrayType(1), 1); + super(type.OffsetArrayType as ArrayCtor, 1, 1); } - public append(value: T['TOffsetArray'][0]) { return this.set(this.length - 1, value); } diff --git a/js/src/builder/largeutf8.ts b/js/src/builder/largeutf8.ts index 51890100095c1..90a0bde9f3443 100644 --- a/js/src/builder/largeutf8.ts +++ b/js/src/builder/largeutf8.ts @@ -25,7 +25,7 @@ import { LargeBinaryBuilder } from './largebinary.js'; export class LargeUtf8Builder extends VariableWidthBuilder { constructor(opts: BuilderOptions) { super(opts); - this._values = new BufferBuilder(new Uint8Array(0)); + this._values = new BufferBuilder(Uint8Array); } public get byteLength(): number { let size = this._pendingLength + (this.length * 4); diff --git a/js/src/builder/union.ts b/js/src/builder/union.ts index ac8a13191a549..7bee460a77de1 100644 --- a/js/src/builder/union.ts +++ b/js/src/builder/union.ts @@ -31,7 +31,7 @@ export abstract class UnionBuilder extends Builder constructor(options: UnionBuilderOptions) { super(options); - this._typeIds = new DataBufferBuilder(new Int8Array(0), 1); + this._typeIds = new DataBufferBuilder(Int8Array, 0, 1); if (typeof options['valueToChildTypeId'] === 'function') { this._valueToChildTypeId = options['valueToChildTypeId']; } @@ -84,7 +84,7 @@ export class DenseUnionBuilder extends UnionB constructor(options: 
UnionBuilderOptions) { super(options); - this._offsets = new DataBufferBuilder(new Int32Array(0)); + this._offsets = new DataBufferBuilder(Int32Array); } /** @ignore */ diff --git a/js/src/builder/utf8.ts b/js/src/builder/utf8.ts index 53b8306cbaffd..aac0aec54fe90 100644 --- a/js/src/builder/utf8.ts +++ b/js/src/builder/utf8.ts @@ -25,7 +25,7 @@ import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; export class Utf8Builder extends VariableWidthBuilder { constructor(opts: BuilderOptions) { super(opts); - this._values = new BufferBuilder(new Uint8Array(0)); + this._values = new BufferBuilder(Uint8Array); } public get byteLength(): number { let size = this._pendingLength + (this.length * 4); diff --git a/js/src/type.ts b/js/src/type.ts index dea5301aed355..ae3aefa025999 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -79,7 +79,11 @@ export abstract class DataTypeType.NONE; } + declare public readonly typeId: TType; + + constructor(typeId: TType) { + this.typeId = typeId; + } protected static [Symbol.toStringTag] = ((proto: DataType) => { (proto).children = null; @@ -93,8 +97,10 @@ export abstract class DataType { TArray: void; TValue: null } /** @ignore */ export class Null extends DataType { + constructor() { + super(Type.Null); + } public toString() { return `Null`; } - public get typeId() { return Type.Null as Type.Null; } protected static [Symbol.toStringTag] = ((proto: Null) => proto[Symbol.toStringTag] = 'Null')(Null.prototype); } @@ -119,9 +125,8 @@ interface Int_ extends DataType { TArray: IType[T]['TA class Int_ extends DataType { constructor(public readonly isSigned: IType[T]['isSigned'], public readonly bitWidth: IType[T]['bitWidth']) { - super(); + super(Type.Int as T); } - public get typeId() { return Type.Int as T; } public get ArrayType() { switch (this.bitWidth) { case 8: return this.isSigned ? 
Int8Array : Uint8Array; @@ -206,9 +211,8 @@ export interface Float extends DataType { TArray: /** @ignore */ export class Float extends DataType { constructor(public readonly precision: Precision) { - super(); + super(Type.Float as T); } - public get typeId() { return Type.Float as T; } public get ArrayType(): TypedArrayConstructor { switch (this.precision) { case Precision.HALF: return Uint16Array; @@ -241,9 +245,8 @@ export interface Binary extends DataType { TArray: Uint8Array; TOff /** @ignore */ export class Binary extends DataType { constructor() { - super(); + super(Type.Binary); } - public get typeId() { return Type.Binary as Type.Binary; } public toString() { return `Binary`; } protected static [Symbol.toStringTag] = ((proto: Binary) => { (proto).ArrayType = Uint8Array; @@ -256,9 +259,8 @@ export interface LargeBinary extends DataType { TArray: Uint8A /** @ignore */ export class LargeBinary extends DataType { constructor() { - super(); + super(Type.LargeBinary); } - public get typeId() { return Type.LargeBinary as Type.LargeBinary; } public toString() { return `LargeBinary`; } protected static [Symbol.toStringTag] = ((proto: LargeBinary) => { (proto).ArrayType = Uint8Array; @@ -272,9 +274,8 @@ export interface Utf8 extends DataType { TArray: Uint8Array; TOffsetA /** @ignore */ export class Utf8 extends DataType { constructor() { - super(); + super(Type.Utf8); } - public get typeId() { return Type.Utf8 as Type.Utf8; } public toString() { return `Utf8`; } protected static [Symbol.toStringTag] = ((proto: Utf8) => { (proto).ArrayType = Uint8Array; @@ -287,9 +288,8 @@ export interface LargeUtf8 extends DataType { TArray: Uint8Array /** @ignore */ export class LargeUtf8 extends DataType { constructor() { - super(); + super(Type.LargeUtf8); } - public get typeId() { return Type.LargeUtf8 as Type.LargeUtf8; } public toString() { return `LargeUtf8`; } protected static [Symbol.toStringTag] = ((proto: LargeUtf8) => { (proto).ArrayType = Uint8Array; @@ -303,9 +303,8 @@ export interface Bool extends DataType { TArray: Uint8Array; TValue: /** @ignore */ export class Bool extends DataType { constructor() { - super(); + super(Type.Bool); } - public get typeId() { return Type.Bool as Type.Bool; } public toString() { return `Bool`; } protected static [Symbol.toStringTag] = ((proto: Bool) => { (proto).ArrayType = Uint8Array; @@ -320,9 +319,8 @@ export class Decimal extends DataType { constructor(public readonly scale: number, public readonly precision: number, public readonly bitWidth: number = 128) { - super(); + super(Type.Decimal); } - public get typeId() { return Type.Decimal as Type.Decimal; } public toString() { return `Decimal[${this.precision}e${this.scale > 0 ? 
`+` : ``}${this.scale}]`; } protected static [Symbol.toStringTag] = ((proto: Decimal) => { (proto).scale = null; @@ -339,9 +337,8 @@ export interface Date_ extends DataType { TArray: In /** @ignore */ export class Date_ extends DataType { constructor(public readonly unit: DateUnit) { - super(); + super(Type.Date as T); } - public get typeId() { return Type.Date as T; } public toString() { return `Date${(this.unit + 1) * 32}<${DateUnit[this.unit]}>`; } protected static [Symbol.toStringTag] = ((proto: Date_) => { (proto).unit = null; @@ -375,9 +372,8 @@ interface Time_ extends DataType { class Time_ extends DataType { constructor(public readonly unit: TimesType[T]['unit'], public readonly bitWidth: TimeBitWidth) { - super(); + super(Type.Time as T); } - public get typeId() { return Type.Time as T; } public toString() { return `Time${this.bitWidth}<${TimeUnit[this.unit]}>`; } public get ArrayType() { switch (this.bitWidth) { @@ -418,9 +414,8 @@ interface Timestamp_ extends DataType { class Timestamp_ extends DataType { constructor(public readonly unit: TimeUnit, public readonly timezone?: string | null) { - super(); + super(Type.Timestamp as T); } - public get typeId() { return Type.Timestamp as T; } public toString() { return `Timestamp<${TimeUnit[this.unit]}${this.timezone ? `, ${this.timezone}` : ``}>`; } protected static [Symbol.toStringTag] = ((proto: Timestamp_) => { (proto).unit = null; @@ -453,9 +448,8 @@ interface Interval_ extends DataType { /** @ignore */ class Interval_ extends DataType { constructor(public readonly unit: IntervalUnit) { - super(); + super(Type.Interval as T); } - public get typeId() { return Type.Interval as T; } public toString() { return `Interval<${IntervalUnit[this.unit]}>`; } protected static [Symbol.toStringTag] = ((proto: Interval_) => { (proto).unit = null; @@ -483,9 +477,8 @@ export interface Duration extends DataType { /** @ignore */ export class Duration extends DataType { constructor(public readonly unit: TimeUnit) { - super(); + super(Type.Duration as T); } - public get typeId() { return Type.Duration as T; } public toString() { return `Duration<${TimeUnit[this.unit]}>`; } protected static [Symbol.toStringTag] = ((proto: Duration) => { (proto).unit = null; @@ -513,11 +506,10 @@ export interface List extends DataType extends DataType { constructor(child: Field) { - super(); + super(Type.List); this.children = [child]; } public declare readonly children: Field[]; - public get typeId() { return Type.List as Type.List; } public toString() { return `List<${this.valueType}>`; } public get valueType(): T { return this.children[0].type as T; } public get valueField(): Field { return this.children[0] as Field; } @@ -540,10 +532,9 @@ export class Struct extends DataType { public declare _row: StructRow; public declare readonly children: Field[]; constructor(children: Field[]) { - super(); + super(Type.Struct); this.children = children; } - public get typeId() { return Type.Struct as Type.Struct; } public toString() { return `Struct<{${this.children.map((f) => `${f.name}:${f.type}`).join(`, `)}}>`; } protected static [Symbol.toStringTag] = ((proto: Struct) => { (proto).children = null; @@ -564,13 +555,12 @@ class Union_ extends DataType { constructor(mode: UnionMode, typeIds: number[] | Int32Array, children: Field[]) { - super(); + super(Type.Union as T); this.mode = mode; this.children = children; this.typeIds = typeIds = Int32Array.from(typeIds); this.typeIdToChildIndex = typeIds.reduce((typeIdToChildIndex, typeId, idx) => (typeIdToChildIndex[typeId] = idx) 
&& typeIdToChildIndex || typeIdToChildIndex, Object.create(null) as { [key: number]: number }); } - public get typeId() { return Type.Union as T; } public toString() { return `${this[Symbol.toStringTag]}<${this.children.map((x) => `${x.type}`).join(` | `) }>`; @@ -611,9 +601,8 @@ export interface FixedSizeBinary extends DataType { /** @ignore */ export class FixedSizeBinary extends DataType { constructor(public readonly byteWidth: number) { - super(); + super(Type.FixedSizeBinary); } - public get typeId() { return Type.FixedSizeBinary as Type.FixedSizeBinary; } public toString() { return `FixedSizeBinary[${this.byteWidth}]`; } protected static [Symbol.toStringTag] = ((proto: FixedSizeBinary) => { (proto).byteWidth = null; @@ -632,10 +621,9 @@ export interface FixedSizeList extends DataType extends DataType { public declare readonly children: Field[]; constructor(public readonly listSize: number, child: Field) { - super(); + super(Type.FixedSizeList); this.children = [child]; } - public get typeId() { return Type.FixedSizeList as Type.FixedSizeList; } public get valueType(): T { return this.children[0].type as T; } public get valueField(): Field { return this.children[0] as Field; } public get ArrayType(): T['ArrayType'] { return this.valueType.ArrayType; } @@ -657,7 +645,7 @@ export interface Map_ extends DataType }> { constructor(entries: Field>, keysSorted = false) { - super(); + super(Type.Map); this.children = [entries]; this.keysSorted = keysSorted; // ARROW-8716 @@ -678,7 +666,6 @@ export class Map_ ex } public declare readonly keysSorted: boolean; public declare readonly children: Field>[]; - public get typeId() { return Type.Map as Type.Map; } public get keyType(): TKey { return this.children[0].type.children[0].type as TKey; } public get valueType(): TValue { return this.children[0].type.children[1].type as TValue; } public get childType() { return this.children[0].type as Struct<{ key: TKey; value: TValue }>; } @@ -709,13 +696,12 @@ export class Dictionary ex public declare readonly dictionary: T; public declare readonly isOrdered: boolean; constructor(dictionary: T, indices: TKey, id?: bigint | number | null, isOrdered?: boolean | null) { - super(); + super(Type.Dictionary); this.indices = indices; this.dictionary = dictionary; this.isOrdered = isOrdered || false; this.id = id == null ? 
getId() : bigIntToNumber(id);
     }
-    public get typeId() { return Type.Dictionary as Type.Dictionary; }
     public get children() { return this.dictionary.children; }
     public get valueType(): T { return this.dictionary as T; }
     public get ArrayType(): T['ArrayType'] { return this.dictionary.ArrayType; }
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 3f810d27271e5..2df1e67b9f4c7 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -332,22 +332,6 @@ if(PYARROW_BUILD_PARQUET OR PYARROW_BUILD_PARQUET_ENCRYPTION)
   find_package(Parquet REQUIRED)
 endif()
 
-if(PYARROW_BUILD_PARQUET_ENCRYPTION)
-  if(PARQUET_REQUIRE_ENCRYPTION)
-    list(APPEND PYARROW_CPP_SRCS ${PYARROW_CPP_SOURCE_DIR}/parquet_encryption.cc)
-    if(ARROW_BUILD_SHARED)
-      list(APPEND PYARROW_CPP_LINK_LIBS Parquet::parquet_shared)
-    else()
-      list(APPEND PYARROW_CPP_LINK_LIBS Parquet::parquet_static)
-    endif()
-    message(STATUS "Parquet Encryption Enabled")
-  else()
-    message(FATAL_ERROR "You must build Arrow C++ with PARQUET_REQUIRE_ENCRYPTION=ON")
-  endif()
-else()
-  message(STATUS "Parquet Encryption is NOT Enabled")
-endif()
-
 if(PYARROW_BUILD_HDFS)
   if(NOT ARROW_HDFS)
     message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON")
@@ -391,6 +375,26 @@ install(TARGETS arrow_python
         LIBRARY DESTINATION .
         RUNTIME DESTINATION .)
 
+set(PYARROW_CPP_ENCRYPTION_SRCS ${PYARROW_CPP_SOURCE_DIR}/parquet_encryption.cc)
+if(NOT PYARROW_BUILD_PARQUET_ENCRYPTION)
+  message(STATUS "Parquet Encryption is NOT Enabled")
+else()
+  if(PARQUET_REQUIRE_ENCRYPTION)
+    add_library(arrow_python_parquet_encryption SHARED ${PYARROW_CPP_ENCRYPTION_SRCS})
+    target_link_libraries(arrow_python_parquet_encryption PUBLIC arrow_python
+                                                                 ${PARQUET_LINK_LIBS})
+    target_compile_definitions(arrow_python_parquet_encryption
+                               PRIVATE ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORTING)
+    install(TARGETS arrow_python_parquet_encryption
+            ARCHIVE DESTINATION .
+            LIBRARY DESTINATION .
+            RUNTIME DESTINATION .)
+    message(STATUS "Parquet Encryption Enabled")
+  else()
+    message(FATAL_ERROR "You must build Arrow C++ with PARQUET_REQUIRE_ENCRYPTION=ON")
+  endif()
+endif()
+
 set(PYARROW_CPP_FLIGHT_SRCS ${PYARROW_CPP_SOURCE_DIR}/flight.cc)
 if(PYARROW_BUILD_FLIGHT)
   if(NOT ARROW_FLIGHT)
@@ -814,6 +818,6 @@ endif()
 if(PYARROW_BUILD_PARQUET)
   target_link_libraries(_parquet PRIVATE ${PARQUET_LINK_LIBS})
   if(PYARROW_BUILD_PARQUET_ENCRYPTION)
-    target_link_libraries(_parquet_encryption PRIVATE ${PARQUET_LINK_LIBS})
+    target_link_libraries(_parquet_encryption PRIVATE arrow_python_parquet_encryption)
   endif()
 endif()
diff --git a/python/benchmarks/parquet.py b/python/benchmarks/parquet.py
index 3aeca425bc8f0..e459ea2c369b4 100644
--- a/python/benchmarks/parquet.py
+++ b/python/benchmarks/parquet.py
@@ -29,35 +29,6 @@
     pq = None
 
 
-class ParquetManifestCreation(object):
-    """Benchmark creating a parquet manifest."""
-
-    size = 10 ** 6
-    tmpdir = None
-
-    param_names = ('num_partitions', 'num_threads')
-    params = [(10, 100, 1000), (1, 8)]
-
-    def setup(self, num_partitions, num_threads):
-        if pq is None:
-            raise NotImplementedError("Parquet support not enabled")
-
-        self.tmpdir = tempfile.mkdtemp('benchmark_parquet')
-        rnd = np.random.RandomState(42)
-        num1 = rnd.randint(0, num_partitions, size=self.size)
-        num2 = rnd.randint(0, 1000, size=self.size)
-        output_df = pd.DataFrame({'num1': num1, 'num2': num2})
-        output_table = pa.Table.from_pandas(output_df)
-        pq.write_to_dataset(output_table, self.tmpdir, ['num1'])
-
-    def teardown(self, num_partitions, num_threads):
-        if self.tmpdir is not None:
-            shutil.rmtree(self.tmpdir)
-
-    def time_manifest_creation(self, num_partitions, num_threads):
-        pq.ParquetManifest(self.tmpdir, metadata_nthreads=num_threads)
-
-
 class ParquetWriteBinary(object):
 
     def setup(self):
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 852b339211b0d..98a4b2a1138c7 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -17,22 +17,17 @@
 
 from collections import defaultdict
-from concurrent import futures
 from contextlib import nullcontext
-from functools import partial, reduce
+from functools import reduce
 import inspect
 import json
-from collections.abc import Collection
-import numpy as np
 import os
 import re
 import operator
-import urllib.parse
 import warnings
 
 import pyarrow as pa
-import pyarrow.lib as lib
 
 try:
     import pyarrow._parquet as _parquet
@@ -55,28 +50,6 @@
 from pyarrow import filesystem as legacyfs
 from pyarrow.util import guid, _is_path_like, _stringify_path, _deprecate_api
 
-_URI_STRIP_SCHEMES = ('hdfs',)
-
-
-def _parse_uri(path):
-    path = _stringify_path(path)
-    parsed_uri = urllib.parse.urlparse(path)
-    if parsed_uri.scheme in _URI_STRIP_SCHEMES:
-        return parsed_uri.path
-    else:
-        # ARROW-4073: On Windows returning the path with the scheme
-        # stripped removes the drive letter, if any
-        return path
-
-
-def _get_filesystem_and_path(passed_filesystem, path):
-    if passed_filesystem is None:
-        return legacyfs.resolve_filesystem_and_path(path, passed_filesystem)
-    else:
-        passed_filesystem = legacyfs._ensure_filesystem(passed_filesystem)
-        parsed_path = _parse_uri(path)
-        return passed_filesystem, parsed_path
-
 
 def _check_contains_null(val):
     if isinstance(val, bytes):
@@ -1148,516 +1121,15 @@ def _get_pandas_index_columns(keyvalues):
                                      ['index_columns'])
 
 
-# ----------------------------------------------------------------------
-# Metadata container providing instructions about reading a single Parquet
-# file, possibly part of a
partitioned dataset - - -class ParquetDatasetPiece: - """ - DEPRECATED: A single chunk of a potentially larger Parquet dataset to read. - - The arguments will indicate to read either a single row group or all row - groups, and whether to add partition keys to the resulting pyarrow.Table. - - .. deprecated:: 5.0 - Directly constructing a ``ParquetDatasetPiece`` is deprecated, as well - as accessing the pieces of a ``ParquetDataset`` object. Specify - ``use_legacy_dataset=False`` when constructing the ``ParquetDataset`` - and use the ``ParquetDataset.fragments`` attribute instead. - - Parameters - ---------- - path : str or pathlib.Path - Path to file in the file system where this piece is located. - open_file_func : callable - Function to use for obtaining file handle to dataset piece. - file_options : dict - Options - row_group : int, default None - Row group to load. By default, reads all row groups. - partition_keys : list of tuples - Two-element tuples of ``(column name, ordinal index)``. - """ - - def __init__(self, path, open_file_func=partial(open, mode='rb'), - file_options=None, row_group=None, partition_keys=None): - warnings.warn( - "ParquetDatasetPiece is deprecated as of pyarrow 5.0.0 and will " - "be removed in a future version.", - FutureWarning, stacklevel=2) - self._init( - path, open_file_func, file_options, row_group, partition_keys) - - @staticmethod - def _create(path, open_file_func=partial(open, mode='rb'), - file_options=None, row_group=None, partition_keys=None): - self = ParquetDatasetPiece.__new__(ParquetDatasetPiece) - self._init( - path, open_file_func, file_options, row_group, partition_keys) - return self - - def _init(self, path, open_file_func, file_options, row_group, - partition_keys): - self.path = _stringify_path(path) - self.open_file_func = open_file_func - self.row_group = row_group - self.partition_keys = partition_keys or [] - self.file_options = file_options or {} - - def __eq__(self, other): - if not isinstance(other, ParquetDatasetPiece): - return False - return (self.path == other.path and - self.row_group == other.row_group and - self.partition_keys == other.partition_keys) - - def __repr__(self): - return ('{}({!r}, row_group={!r}, partition_keys={!r})' - .format(type(self).__name__, self.path, - self.row_group, - self.partition_keys)) - - def __str__(self): - result = '' - - if len(self.partition_keys) > 0: - partition_str = ', '.join('{}={}'.format(name, index) - for name, index in self.partition_keys) - result += 'partition[{}] '.format(partition_str) - - result += self.path - - if self.row_group is not None: - result += ' | row_group={}'.format(self.row_group) - - return result - - def get_metadata(self): - """ - Return the file's metadata. - - Returns - ------- - metadata : FileMetaData - The file's metadata - """ - with self.open() as parquet: - return parquet.metadata - - def open(self): - """ - Return instance of ParquetFile. - """ - reader = self.open_file_func(self.path) - if not isinstance(reader, ParquetFile): - reader = ParquetFile(reader, **self.file_options) - - # ensure reader knows it's responsible for closing source - # since we opened the source here internally. - reader._close_source = True - return reader - - def read(self, columns=None, use_threads=True, partitions=None, - file=None, use_pandas_metadata=False): - """ - Read this piece as a pyarrow.Table. - - Parameters - ---------- - columns : list of column names, default None - use_threads : bool, default True - Perform multi-threaded column reads. 
- partitions : ParquetPartitions, default None - file : file-like object - Passed to ParquetFile. - use_pandas_metadata : bool - If pandas metadata should be used or not. - - Returns - ------- - table : pyarrow.Table - The piece as a pyarrow.Table. - """ - if self.open_file_func is not None: - reader = self.open() - elif file is not None: - reader = ParquetFile(file, **self.file_options) - else: - # try to read the local path - reader = ParquetFile(self.path, **self.file_options) - - options = dict(columns=columns, - use_threads=use_threads, - use_pandas_metadata=use_pandas_metadata) - - if self.row_group is not None: - table = reader.read_row_group(self.row_group, **options) - else: - table = reader.read(**options) - - if len(self.partition_keys) > 0: - if partitions is None: - raise ValueError('Must pass partition sets') - - # Here, the index is the categorical code of the partition where - # this piece is located. Suppose we had - # - # /foo=a/0.parq - # /foo=b/0.parq - # /foo=c/0.parq - # - # Then we assign a=0, b=1, c=2. And the resulting Table pieces will - # have a DictionaryArray column named foo having the constant index - # value as indicated. The distinct categories of the partition have - # been computed in the ParquetManifest - for i, (name, index) in enumerate(self.partition_keys): - # The partition code is the same for all values in this piece - indices = np.full(len(table), index, dtype='i4') - - # This is set of all partition values, computed as part of the - # manifest, so ['a', 'b', 'c'] as in our example above. - dictionary = partitions.levels[i].dictionary - - arr = pa.DictionaryArray.from_arrays(indices, dictionary) - table = table.append_column(name, arr) - - # To ParquetFile the source looked like it was already open, so won't - # actually close it without overriding. - reader.close(force=True) - return table - - -class PartitionSet: - """ - A data structure for cataloguing the observed Parquet partitions at a - particular level. So if we have - - /foo=a/bar=0 - /foo=a/bar=1 - /foo=a/bar=2 - /foo=b/bar=0 - /foo=b/bar=1 - /foo=b/bar=2 - - Then we have two partition sets, one for foo, another for bar. As we visit - levels of the partition hierarchy, a PartitionSet tracks the distinct - values and assigns categorical codes to use when reading the pieces - - Parameters - ---------- - name : str - Name of the partition set. Under which key to collect all values. - keys : list - All possible values that have been collected for that partition set. - """ - - def __init__(self, name, keys=None): - self.name = name - self.keys = keys or [] - self.key_indices = {k: i for i, k in enumerate(self.keys)} - self._dictionary = None - - def get_index(self, key): - """ - Get the index of the partition value if it is known, otherwise assign - one - - Parameters - ---------- - key : str or int - The value for which we want to known the index. 
- """ - if key in self.key_indices: - return self.key_indices[key] - else: - index = len(self.key_indices) - self.keys.append(key) - self.key_indices[key] = index - return index - - @property - def dictionary(self): - if self._dictionary is not None: - return self._dictionary - - if len(self.keys) == 0: - raise ValueError('No known partition keys') - - # Only integer and string partition types are supported right now - try: - integer_keys = [int(x) for x in self.keys] - dictionary = lib.array(integer_keys) - except ValueError: - dictionary = lib.array(self.keys) - - self._dictionary = dictionary - return dictionary - - @property - def is_sorted(self): - return list(self.keys) == sorted(self.keys) - - -class ParquetPartitions: - - def __init__(self): - self.levels = [] - self.partition_names = set() - - def __len__(self): - return len(self.levels) - - def __getitem__(self, i): - return self.levels[i] - - def equals(self, other): - if not isinstance(other, ParquetPartitions): - raise TypeError('`other` must be an instance of ParquetPartitions') - - return (self.levels == other.levels and - self.partition_names == other.partition_names) - - def __eq__(self, other): - try: - return self.equals(other) - except TypeError: - return NotImplemented - - def get_index(self, level, name, key): - """ - Record a partition value at a particular level, returning the distinct - code for that value at that level. - - Examples - -------- - - partitions.get_index(1, 'foo', 'a') returns 0 - partitions.get_index(1, 'foo', 'b') returns 1 - partitions.get_index(1, 'foo', 'c') returns 2 - partitions.get_index(1, 'foo', 'a') returns 0 - - Parameters - ---------- - level : int - The nesting level of the partition we are observing - name : str - The partition name - key : str or int - The partition value - """ - if level == len(self.levels): - if name in self.partition_names: - raise ValueError('{} was the name of the partition in ' - 'another level'.format(name)) - - part_set = PartitionSet(name) - self.levels.append(part_set) - self.partition_names.add(name) - - return self.levels[level].get_index(key) - - def filter_accepts_partition(self, part_key, filter, level): - p_column, p_value_index = part_key - f_column, op, f_value = filter - if p_column != f_column: - return True - - f_type = type(f_value) - - if op in {'in', 'not in'}: - if not isinstance(f_value, Collection): - raise TypeError( - "'%s' object is not a collection", f_type.__name__) - if not f_value: - raise ValueError("Cannot use empty collection as filter value") - if len({type(item) for item in f_value}) != 1: - raise ValueError("All elements of the collection '%s' must be" - " of same type", f_value) - f_type = type(next(iter(f_value))) - - elif not isinstance(f_value, str) and isinstance(f_value, Collection): - raise ValueError( - "Op '%s' not supported with a collection value", op) - - p_value = f_type(self.levels[level] - .dictionary[p_value_index].as_py()) - - if op == "=" or op == "==": - return p_value == f_value - elif op == "!=": - return p_value != f_value - elif op == '<': - return p_value < f_value - elif op == '>': - return p_value > f_value - elif op == '<=': - return p_value <= f_value - elif op == '>=': - return p_value >= f_value - elif op == 'in': - return p_value in f_value - elif op == 'not in': - return p_value not in f_value - else: - raise ValueError("'%s' is not a valid operator in predicates.", - filter[1]) - - -class ParquetManifest: - - def __init__(self, dirpath, open_file_func=None, filesystem=None, - pathsep='/', 
partition_scheme='hive', metadata_nthreads=1): - filesystem, dirpath = _get_filesystem_and_path(filesystem, dirpath) - self.filesystem = filesystem - self.open_file_func = open_file_func - self.pathsep = pathsep - self.dirpath = _stringify_path(dirpath) - self.partition_scheme = partition_scheme - self.partitions = ParquetPartitions() - self.pieces = [] - self._metadata_nthreads = metadata_nthreads - self._thread_pool = futures.ThreadPoolExecutor( - max_workers=metadata_nthreads) - - self.common_metadata_path = None - self.metadata_path = None - - self._visit_level(0, self.dirpath, []) - - # Due to concurrency, pieces will potentially by out of order if the - # dataset is partitioned so we sort them to yield stable results - self.pieces.sort(key=lambda piece: piece.path) - - if self.common_metadata_path is None: - # _common_metadata is a subset of _metadata - self.common_metadata_path = self.metadata_path - - self._thread_pool.shutdown() - - def _visit_level(self, level, base_path, part_keys): - fs = self.filesystem - - _, directories, files = next(fs.walk(base_path)) - - filtered_files = [] - for path in files: - full_path = self.pathsep.join((base_path, path)) - if path.endswith('_common_metadata'): - self.common_metadata_path = full_path - elif path.endswith('_metadata'): - self.metadata_path = full_path - elif self._should_silently_exclude(path): - continue - else: - filtered_files.append(full_path) - - # ARROW-1079: Filter out "private" directories starting with underscore - filtered_directories = [self.pathsep.join((base_path, x)) - for x in directories - if not _is_private_directory(x)] - - filtered_files.sort() - filtered_directories.sort() - - if len(filtered_files) > 0 and len(filtered_directories) > 0: - raise ValueError('Found files in an intermediate ' - 'directory: {}'.format(base_path)) - elif len(filtered_directories) > 0: - self._visit_directories(level, filtered_directories, part_keys) - else: - self._push_pieces(filtered_files, part_keys) - - def _should_silently_exclude(self, file_name): - return (file_name.endswith('.crc') or # Checksums - file_name.endswith('_$folder$') or # HDFS directories in S3 - file_name.startswith('.') or # Hidden files starting with . - file_name.startswith('_') or # Hidden files starting with _ - file_name in EXCLUDED_PARQUET_PATHS) - - def _visit_directories(self, level, directories, part_keys): - futures_list = [] - for path in directories: - head, tail = _path_split(path, self.pathsep) - name, key = _parse_hive_partition(tail) - - index = self.partitions.get_index(level, name, key) - dir_part_keys = part_keys + [(name, index)] - # If you have less threads than levels, the wait call will block - # indefinitely due to multiple waits within a thread. 
- if level < self._metadata_nthreads: - future = self._thread_pool.submit(self._visit_level, - level + 1, - path, - dir_part_keys) - futures_list.append(future) - else: - self._visit_level(level + 1, path, dir_part_keys) - if futures_list: - futures.wait(futures_list) - - def _parse_partition(self, dirname): - if self.partition_scheme == 'hive': - return _parse_hive_partition(dirname) - else: - raise NotImplementedError('partition schema: {}' - .format(self.partition_scheme)) - - def _push_pieces(self, files, part_keys): - self.pieces.extend([ - ParquetDatasetPiece._create(path, partition_keys=part_keys, - open_file_func=self.open_file_func) - for path in files - ]) - - -def _parse_hive_partition(value): - if '=' not in value: - raise ValueError('Directory name did not appear to be a ' - 'partition: {}'.format(value)) - return value.split('=', 1) - - -def _is_private_directory(x): - _, tail = os.path.split(x) - return (tail.startswith('_') or tail.startswith('.')) and '=' not in tail - - -def _path_split(path, sep): - i = path.rfind(sep) + 1 - head, tail = path[:i], path[i:] - head = head.rstrip(sep) - return head, tail - - EXCLUDED_PARQUET_PATHS = {'_SUCCESS'} -class _ParquetDatasetMetadata: - __slots__ = ('fs', 'memory_map', 'read_dictionary', 'common_metadata', - 'buffer_size') - - -def _open_dataset_file(dataset, path, meta=None): - if (dataset.fs is not None and - not isinstance(dataset.fs, legacyfs.LocalFileSystem)): - path = dataset.fs.open(path, mode='rb') - return ParquetFile( - path, - metadata=meta, - memory_map=dataset.memory_map, - read_dictionary=dataset.read_dictionary, - common_metadata=dataset.common_metadata, - buffer_size=dataset.buffer_size +def _is_local_file_system(fs): + return isinstance(fs, LocalFileSystem) or isinstance( + fs, legacyfs.LocalFileSystem ) -_DEPR_MSG = ( - "'{}' attribute is deprecated as of pyarrow 5.0.0 and will be removed " - "in a future version.{}" -) - - _read_docstring_common = """\ read_dictionary : list, default None List of names or column paths (for nested types) to read directly @@ -1680,6 +1152,7 @@ def _open_dataset_file(dataset, path, meta=None): you need to specify the field names or a full schema. See the ``pyarrow.dataset.partitioning()`` function for more details.""" + _parquet_dataset_example = """\ Generate an example PyArrow Table and write it to a partitioned dataset: @@ -1688,15 +1161,13 @@ def _open_dataset_file(dataset, path, meta=None): ... 'n_legs': [2, 2, 4, 4, 5, 100], ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", ... "Brittle stars", "Centipede"]}) - >>> import pyarrow.parquet as pq ->>> pq.write_to_dataset(table, root_path='dataset_name', -... partition_cols=['year'], -... use_legacy_dataset=False) +>>> pq.write_to_dataset(table, root_path='dataset_v2', +... partition_cols=['year']) create a ParquetDataset object from the dataset source: ->>> dataset = pq.ParquetDataset('dataset_name/', use_legacy_dataset=False) +>>> dataset = pq.ParquetDataset('dataset_v2/') and read the data: @@ -1711,7 +1182,7 @@ def _open_dataset_file(dataset, path, meta=None): create a ParquetDataset object with filter: ->>> dataset = pq.ParquetDataset('dataset_name/', use_legacy_dataset=False, +>>> dataset = pq.ParquetDataset('dataset_v2/', ... 
filters=[('n_legs','=',4)]) >>> dataset.read().to_pandas() n_legs animal year @@ -1721,7 +1192,6 @@ def _open_dataset_file(dataset, path, meta=None): class ParquetDataset: - __doc__ = """ Encapsulates details of reading a complete Parquet dataset possibly consisting of multiple files and partitions in subdirectories. @@ -1735,39 +1205,26 @@ class ParquetDataset: Path will try to be found in the local on-disk filesystem otherwise it will be parsed as an URI to determine the filesystem. schema : pyarrow.parquet.Schema - Use schema obtained elsewhere to validate file schemas. Alternative to - metadata parameter. -metadata : pyarrow.parquet.FileMetaData - Use metadata obtained elsewhere to validate file schemas. -split_row_groups : bool, default False - Divide files into pieces for each row group in the file. -validate_schema : bool, default True - Check that individual file schemas are all the same / compatible. + Optionally provide the Schema for the Dataset, in which case it will + not be inferred from the source. filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None Rows which do not match the filter predicate will be removed from scanned data. Partition keys embedded in a nested directory structure will be exploited to avoid loading files at all if they contain no matching rows. - If `use_legacy_dataset` is True, filters can only reference partition - keys and only a hive-style directory structure is supported. When - setting `use_legacy_dataset` to False, also within-file level filtering - and different partitioning schemes are supported. + Within-file level filtering and different partitioning schemes are supported. {1} -metadata_nthreads : int, default 1 - How many threads to allow the thread pool which is used to read the - dataset metadata. Increasing this is helpful to read partitioned - datasets. {0} -use_legacy_dataset : bool, default False - Set to False to enable the new code path (using the - new Arrow Dataset API). Among other things, this allows to pass - `filters` for all columns and not only the partition keys, enables - different partitioning schemes, etc. +ignore_prefixes : list, optional + Files matching any of these prefixes will be ignored by the + discovery process. + This is matched to the basename of a path. + By default this is ['.', '_']. + Note that discovery happens only if a directory is passed as source. pre_buffer : bool, default True Coalesce and issue file reads in parallel to improve performance on high-latency filesystems (e.g. S3, GCS). If True, Arrow will use a - background I/O thread pool. This option is only supported for - use_legacy_dataset=False. If using a filesystem layer that itself + background I/O thread pool. If using a filesystem layer that itself performs readahead (e.g. fsspec's S3FS), disable readahead for best results. Set to False if you want to prioritize minimal memory usage over maximum speed. @@ -1775,6 +1232,10 @@ class ParquetDataset: Cast timestamps that are stored in INT96 format to a particular resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96 timestamps will be inferred as timestamps in nanoseconds. +decryption_properties : FileDecryptionProperties or None + File-level decryption properties. + The decryption properties can be created using + ``CryptoFactory.file_decryption_properties()``. thrift_string_size_limit : int, default None If not None, override the maximum total string size allocated when decoding Thrift structures. 
The default limit should be @@ -1785,739 +1246,95 @@ class ParquetDataset: sufficient for most Parquet files. page_checksum_verification : bool, default False If True, verify the page checksum for each page read from the file. +use_legacy_dataset : bool, optional + Deprecated and has no effect from PyArrow version 15.0.0. Examples -------- {2} """.format(_read_docstring_common, _DNF_filter_doc, _parquet_dataset_example) - def __new__(cls, path_or_paths=None, filesystem=None, schema=None, - metadata=None, split_row_groups=False, validate_schema=True, - filters=None, metadata_nthreads=None, read_dictionary=None, - memory_map=False, buffer_size=0, partitioning="hive", - use_legacy_dataset=None, pre_buffer=True, - coerce_int96_timestamp_unit=None, - thrift_string_size_limit=None, - thrift_container_size_limit=None, - page_checksum_verification=False): - - extra_msg = "" - if use_legacy_dataset is None: - # if an old filesystem is passed -> still use to old implementation - if isinstance(filesystem, legacyfs.FileSystem): - use_legacy_dataset = True - extra_msg = ( - " The legacy behaviour was still chosen because a " - "deprecated 'pyarrow.filesystem' filesystem was specified " - "(use the filesystems from pyarrow.fs instead)." - ) - # otherwise the default is already False - else: - use_legacy_dataset = False - - if not use_legacy_dataset: - return _ParquetDatasetV2( - path_or_paths, filesystem=filesystem, - filters=filters, - partitioning=partitioning, - read_dictionary=read_dictionary, - memory_map=memory_map, - buffer_size=buffer_size, - pre_buffer=pre_buffer, - coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, - # unsupported keywords - schema=schema, metadata=metadata, - split_row_groups=split_row_groups, - validate_schema=validate_schema, - metadata_nthreads=metadata_nthreads, - thrift_string_size_limit=thrift_string_size_limit, - thrift_container_size_limit=thrift_container_size_limit, - page_checksum_verification=page_checksum_verification, - ) - warnings.warn( - "Passing 'use_legacy_dataset=True' to get the legacy behaviour is " - "deprecated as of pyarrow 11.0.0, and the legacy implementation " - "will be removed in a future version." 
+ extra_msg, - FutureWarning, stacklevel=2) - self = object.__new__(cls) - return self - - def __init__(self, path_or_paths, filesystem=None, schema=None, - metadata=None, split_row_groups=False, validate_schema=True, - filters=None, metadata_nthreads=None, read_dictionary=None, - memory_map=False, buffer_size=0, partitioning="hive", - use_legacy_dataset=None, pre_buffer=True, + def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None, + read_dictionary=None, memory_map=False, buffer_size=None, + partitioning="hive", ignore_prefixes=None, pre_buffer=True, coerce_int96_timestamp_unit=None, - thrift_string_size_limit=None, + decryption_properties=None, thrift_string_size_limit=None, thrift_container_size_limit=None, - page_checksum_verification=False): - if partitioning != "hive": - raise ValueError( - 'Only "hive" for hive-like partitioning is supported when ' - 'using use_legacy_dataset=True') - if metadata_nthreads is not None: - warnings.warn( - "Specifying the 'metadata_nthreads' argument is deprecated as " - "of pyarrow 8.0.0, and the argument will be removed in a " - "future version", - FutureWarning, stacklevel=2, - ) - else: - metadata_nthreads = 1 - - self._ds_metadata = _ParquetDatasetMetadata() - a_path = path_or_paths - if isinstance(a_path, list): - a_path = a_path[0] - - self._ds_metadata.fs, _ = _get_filesystem_and_path(filesystem, a_path) - if isinstance(path_or_paths, list): - self.paths = [_parse_uri(path) for path in path_or_paths] - else: - self.paths = _parse_uri(path_or_paths) - - self._ds_metadata.read_dictionary = read_dictionary - self._ds_metadata.memory_map = memory_map - self._ds_metadata.buffer_size = buffer_size - - (self._pieces, - self._partitions, - self._common_metadata_path, - self._metadata_path) = _make_manifest( - path_or_paths, self._fs, metadata_nthreads=metadata_nthreads, - open_file_func=partial(_open_dataset_file, self._ds_metadata) - ) - - if self._common_metadata_path is not None: - with self._fs.open(self._common_metadata_path) as f: - self._ds_metadata.common_metadata = read_metadata( - f, - memory_map=memory_map - ) - else: - self._ds_metadata.common_metadata = None + page_checksum_verification=False, + use_legacy_dataset=None): - if metadata is not None: + if use_legacy_dataset is not None: warnings.warn( - "Specifying the 'metadata' argument with 'use_legacy_dataset=" - "True' is deprecated as of pyarrow 8.0.0.", + "Passing 'use_legacy_dataset' is deprecated as of pyarrow 15.0.0 " + "and will be removed in a future version.", FutureWarning, stacklevel=2) - if metadata is None and self._metadata_path is not None: - with self._fs.open(self._metadata_path) as f: - self._metadata = read_metadata(f, memory_map=memory_map) - else: - self._metadata = metadata - - if schema is not None: - warnings.warn( - "Specifying the 'schema' argument with 'use_legacy_dataset=" - "True' is deprecated as of pyarrow 8.0.0. 
You can still " - "specify it in combination with 'use_legacy_dataset=False', " - "but in that case you need to specify a pyarrow.Schema " - "instead of a ParquetSchema.", - FutureWarning, stacklevel=2) - self._schema = schema + import pyarrow.dataset as ds - self.split_row_groups = split_row_groups + # map format arguments + read_options = { + "pre_buffer": pre_buffer, + "coerce_int96_timestamp_unit": coerce_int96_timestamp_unit, + "thrift_string_size_limit": thrift_string_size_limit, + "thrift_container_size_limit": thrift_container_size_limit, + "page_checksum_verification": page_checksum_verification, + } + if buffer_size: + read_options.update(use_buffered_stream=True, + buffer_size=buffer_size) + if read_dictionary is not None: + read_options.update(dictionary_columns=read_dictionary) - if split_row_groups: - raise NotImplementedError("split_row_groups not yet implemented") + if decryption_properties is not None: + read_options.update(decryption_properties=decryption_properties) + self._filter_expression = None if filters is not None: - if hasattr(filters, "cast"): - raise TypeError( - "Expressions as filter not supported for legacy dataset") - filters = _check_filters(filters) - self._filter(filters) - - if validate_schema: - self.validate_schemas() - - def __getnewargs_ex__(self): - # when creating a new instance while unpickling, force to use the - # legacy code path to create a ParquetDataset instance - # instead of a _ParquetDatasetV2 instance - return ((), dict(use_legacy_dataset=True)) - - def equals(self, other): - if not isinstance(other, ParquetDataset): - raise TypeError('`other` must be an instance of ParquetDataset') + self._filter_expression = filters_to_expression(filters) - if self._fs.__class__ != other._fs.__class__: - return False - for prop in ('paths', '_pieces', '_partitions', - '_common_metadata_path', '_metadata_path', - '_common_metadata', '_metadata', '_schema', - 'split_row_groups'): - if getattr(self, prop) != getattr(other, prop): - return False - for prop in ('memory_map', 'buffer_size'): - if ( - getattr(self._ds_metadata, prop) != - getattr(other._ds_metadata, prop) - ): - return False - - return True + # map old filesystems to new one + if filesystem is not None: + filesystem = _ensure_filesystem( + filesystem, use_mmap=memory_map) + elif filesystem is None and memory_map: + # if memory_map is specified, assume local file system (string + # path can in principle be URI for any filesystem) + filesystem = LocalFileSystem(use_mmap=memory_map) - def __eq__(self, other): - try: - return self.equals(other) - except TypeError: - return NotImplemented + # This needs to be checked after _ensure_filesystem, because that + # handles the case of an fsspec LocalFileSystem + if ( + hasattr(path_or_paths, "__fspath__") and + filesystem is not None and + not _is_local_file_system(filesystem) + ): + raise TypeError( + "Path-like objects with __fspath__ must only be used with " + f"local file systems, not {type(filesystem)}" + ) - def validate_schemas(self): - if self._metadata is None and self._schema is None: - if self._common_metadata is not None: - self._schema = self._common_metadata.schema + # check for single fragment dataset + single_file = None + self._base_dir = None + if not isinstance(path_or_paths, list): + if _is_path_like(path_or_paths): + path_or_paths = _stringify_path(path_or_paths) + if filesystem is None: + # path might be a URI describing the FileSystem as well + try: + filesystem, path_or_paths = FileSystem.from_uri( + path_or_paths) + except 
ValueError: + filesystem = LocalFileSystem(use_mmap=memory_map) + finfo = filesystem.get_file_info(path_or_paths) + if finfo.is_file: + single_file = path_or_paths + if finfo.type == FileType.Directory: + self._base_dir = path_or_paths else: - self._schema = self._pieces[0].get_metadata().schema - elif self._schema is None: - self._schema = self._metadata.schema - - # Verify schemas are all compatible - dataset_schema = self._schema.to_arrow_schema() - # Exclude the partition columns from the schema, they are provided - # by the path, not the DatasetPiece - if self._partitions is not None: - for partition_name in self._partitions.partition_names: - if dataset_schema.get_field_index(partition_name) != -1: - field_idx = dataset_schema.get_field_index(partition_name) - dataset_schema = dataset_schema.remove(field_idx) - - for piece in self._pieces: - file_metadata = piece.get_metadata() - file_schema = file_metadata.schema.to_arrow_schema() - if not dataset_schema.equals(file_schema, check_metadata=False): - raise ValueError('Schema in {!s} was different. \n' - '{!s}\n\nvs\n\n{!s}' - .format(piece, file_schema, - dataset_schema)) + single_file = path_or_paths - def read(self, columns=None, use_threads=True, use_pandas_metadata=False): - """ - Read multiple Parquet files as a single pyarrow.Table. - - Parameters - ---------- - columns : List[str] - Names of columns to read from the file. - use_threads : bool, default True - Perform multi-threaded column reads - use_pandas_metadata : bool, default False - Passed through to each dataset piece. - - Returns - ------- - pyarrow.Table - Content of the file as a table (of columns). - - Examples - -------- - Generate an example dataset: - - >>> import pyarrow as pa - >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], - ... 'n_legs': [2, 2, 4, 4, 5, 100], - ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", - ... "Brittle stars", "Centipede"]}) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path='dataset_name_read', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_name_read/', - ... use_legacy_dataset=False) - - Read multiple Parquet files as a single pyarrow.Table: - - >>> dataset.read(columns=["n_legs"]) - pyarrow.Table - n_legs: int64 - ---- - n_legs: [[5],[2],[4,100],[2,4]] - """ - tables = [] - for piece in self._pieces: - table = piece.read(columns=columns, - use_threads=use_threads, - partitions=self._partitions, - use_pandas_metadata=use_pandas_metadata) - tables.append(table) - - all_data = lib.concat_tables(tables) - - if use_pandas_metadata: - # We need to ensure that this metadata is set in the Table's schema - # so that Table.to_pandas will construct pandas.DataFrame with the - # right index - common_metadata = self._get_common_pandas_metadata() - current_metadata = all_data.schema.metadata or {} - - if common_metadata and b'pandas' not in current_metadata: - all_data = all_data.replace_schema_metadata({ - b'pandas': common_metadata}) - - return all_data - - def read_pandas(self, **kwargs): - """ - Read dataset including pandas metadata, if any. Other arguments passed - through to ParquetDataset.read, see docstring for further details. - - Parameters - ---------- - **kwargs : optional - All additional options to pass to the reader. - - Returns - ------- - pyarrow.Table - Content of the file as a table (of columns). 
- - Examples - -------- - Generate an example PyArrow Table and write it to a partitioned - dataset: - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'year': [2020, 2022, 2021, 2022, 2019, 2021], - ... 'n_legs': [2, 2, 4, 4, 5, 100], - ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", - ... "Brittle stars", "Centipede"]}) - >>> table = pa.Table.from_pandas(df) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, 'table.parquet') - >>> dataset = pq.ParquetDataset('table.parquet', - ... use_legacy_dataset=False) - - Read dataset including pandas metadata: - - >>> dataset.read_pandas(columns=["n_legs"]) - pyarrow.Table - n_legs: int64 - ---- - n_legs: [[2,2,4,4,5,100]] - - Select pandas metadata: - - >>> dataset.read_pandas(columns=["n_legs"]).schema.pandas_metadata - {'index_columns': [{'kind': 'range', 'name': None, 'start': 0, ...} - """ - return self.read(use_pandas_metadata=True, **kwargs) - - def _get_common_pandas_metadata(self): - if self._common_metadata is None: - return None - - keyvalues = self._common_metadata.metadata - return keyvalues.get(b'pandas', None) - - def _filter(self, filters): - accepts_filter = self._partitions.filter_accepts_partition - - def one_filter_accepts(piece, filter): - return all(accepts_filter(part_key, filter, level) - for level, part_key in enumerate(piece.partition_keys)) - - def all_filters_accept(piece): - return any(all(one_filter_accepts(piece, f) for f in conjunction) - for conjunction in filters) - - self._pieces = [p for p in self._pieces if all_filters_accept(p)] - - @property - def pieces(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format( - "ParquetDataset.pieces", - " Specify 'use_legacy_dataset=False' while constructing the " - "ParquetDataset, and then use the '.fragments' attribute " - "instead."), - FutureWarning, stacklevel=2) - return self._pieces - - @property - def partitions(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format( - "ParquetDataset.partitions", - " Specify 'use_legacy_dataset=False' while constructing the " - "ParquetDataset, and then use the '.partitioning' attribute " - "instead."), - FutureWarning, stacklevel=2) - return self._partitions - - @property - def schema(self): - warnings.warn( - _DEPR_MSG.format( - "ParquetDataset.schema", - " Specify 'use_legacy_dataset=False' while constructing the " - "ParquetDataset, and then use the '.schema' attribute " - "instead (which will return an Arrow schema instead of a " - "Parquet schema)."), - FutureWarning, stacklevel=2) - return self._schema - - @property - def memory_map(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.memory_map", ""), - FutureWarning, stacklevel=2) - return self._ds_metadata.memory_map - - @property - def read_dictionary(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.read_dictionary", ""), - FutureWarning, stacklevel=2) - return self._ds_metadata.read_dictionary - - @property - def buffer_size(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.buffer_size", ""), - FutureWarning, stacklevel=2) - return self._ds_metadata.buffer_size - - _fs = property( - operator.attrgetter('_ds_metadata.fs') - ) - - @property - def fs(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format( - "ParquetDataset.fs", - " Specify 'use_legacy_dataset=False' while constructing the " - "ParquetDataset, and then use the '.filesystem' attribute " - "instead."), - FutureWarning, 
stacklevel=2) - return self._ds_metadata.fs - - @property - def metadata(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.metadata", ""), - FutureWarning, stacklevel=2) - return self._metadata - - @property - def metadata_path(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.metadata_path", ""), - FutureWarning, stacklevel=2) - return self._metadata_path - - @property - def common_metadata_path(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.common_metadata_path", ""), - FutureWarning, stacklevel=2) - return self._common_metadata_path - - _common_metadata = property( - operator.attrgetter('_ds_metadata.common_metadata') - ) - - @property - def common_metadata(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.common_metadata", ""), - FutureWarning, stacklevel=2) - return self._ds_metadata.common_metadata - - @property - def fragments(self): - """ - A list of the Dataset source fragments or pieces with absolute - file paths. To use this property set 'use_legacy_dataset=False' - while constructing ParquetDataset object. - - Examples - -------- - Generate an example dataset: - - >>> import pyarrow as pa - >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], - ... 'n_legs': [2, 2, 4, 4, 5, 100], - ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", - ... "Brittle stars", "Centipede"]}) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path='dataset_name_fragments', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_name_fragments/', - ... use_legacy_dataset=False) - - List the fragments: - - >>> dataset.fragments - [>> import pyarrow as pa - >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], - ... 'n_legs': [2, 2, 4, 4, 5, 100], - ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", - ... "Brittle stars", "Centipede"]}) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path='dataset_name_files', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_name_files/', - ... use_legacy_dataset=False) - - List the files: - - >>> dataset.files - ['dataset_name_files/year=2019/...-0.parquet', ... - """ - raise NotImplementedError( - "To use this property set 'use_legacy_dataset=False' while " - "constructing the ParquetDataset") - - @property - def filesystem(self): - """ - The filesystem type of the Dataset source. - To use this property set 'use_legacy_dataset=False' - while constructing ParquetDataset object. - """ - raise NotImplementedError( - "To use this property set 'use_legacy_dataset=False' while " - "constructing the ParquetDataset") - - @property - def partitioning(self): - """ - The partitioning of the Dataset source, if discovered. - To use this property set 'use_legacy_dataset=False' - while constructing ParquetDataset object. 
- """ - raise NotImplementedError( - "To use this property set 'use_legacy_dataset=False' while " - "constructing the ParquetDataset") - - -def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1, - open_file_func=None): - partitions = None - common_metadata_path = None - metadata_path = None - - if isinstance(path_or_paths, list) and len(path_or_paths) == 1: - # Dask passes a directory as a list of length 1 - path_or_paths = path_or_paths[0] - - if _is_path_like(path_or_paths) and fs.isdir(path_or_paths): - manifest = ParquetManifest(path_or_paths, filesystem=fs, - open_file_func=open_file_func, - pathsep=getattr(fs, "pathsep", "/"), - metadata_nthreads=metadata_nthreads) - common_metadata_path = manifest.common_metadata_path - metadata_path = manifest.metadata_path - pieces = manifest.pieces - partitions = manifest.partitions - else: - if not isinstance(path_or_paths, list): - path_or_paths = [path_or_paths] - - # List of paths - if len(path_or_paths) == 0: - raise ValueError('Must pass at least one file path') - - pieces = [] - for path in path_or_paths: - if not fs.isfile(path): - raise OSError('Passed non-file path: {}' - .format(path)) - piece = ParquetDatasetPiece._create( - path, open_file_func=open_file_func) - pieces.append(piece) - - return pieces, partitions, common_metadata_path, metadata_path - - -def _is_local_file_system(fs): - return isinstance(fs, LocalFileSystem) or isinstance( - fs, legacyfs.LocalFileSystem - ) - - -class _ParquetDatasetV2: - """ - ParquetDataset shim using the Dataset API under the hood. - - Examples - -------- - Generate an example PyArrow Table and write it to a partitioned dataset: - - >>> import pyarrow as pa - >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], - ... 'n_legs': [2, 2, 4, 4, 5, 100], - ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", - ... "Brittle stars", "Centipede"]}) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path='dataset_v2', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - - create a ParquetDataset object from the dataset source: - - >>> dataset = pq.ParquetDataset('dataset_v2/', use_legacy_dataset=False) - - and read the data: - - >>> dataset.read().to_pandas() - n_legs animal year - 0 5 Brittle stars 2019 - 1 2 Flamingo 2020 - 2 4 Dog 2021 - 3 100 Centipede 2021 - 4 2 Parrot 2022 - 5 4 Horse 2022 - - create a ParquetDataset object with filter: - - >>> dataset = pq.ParquetDataset('dataset_v2/', - ... filters=[('n_legs','=',4)], - ... 
use_legacy_dataset=False) - >>> dataset.read().to_pandas() - n_legs animal year - 0 4 Dog 2021 - 1 4 Horse 2022 - """ - - def __init__(self, path_or_paths, filesystem=None, *, filters=None, - partitioning="hive", read_dictionary=None, buffer_size=None, - memory_map=False, ignore_prefixes=None, pre_buffer=True, - coerce_int96_timestamp_unit=None, schema=None, - decryption_properties=None, thrift_string_size_limit=None, - thrift_container_size_limit=None, - page_checksum_verification=False, - **kwargs): - import pyarrow.dataset as ds - - # Raise error for not supported keywords - for keyword, default in [ - ("metadata", None), ("split_row_groups", False), - ("validate_schema", True), ("metadata_nthreads", None)]: - if keyword in kwargs and kwargs[keyword] is not default: - raise ValueError( - "Keyword '{0}' is not yet supported with the new " - "Dataset API".format(keyword)) - - # map format arguments - read_options = { - "pre_buffer": pre_buffer, - "coerce_int96_timestamp_unit": coerce_int96_timestamp_unit, - "thrift_string_size_limit": thrift_string_size_limit, - "thrift_container_size_limit": thrift_container_size_limit, - "page_checksum_verification": page_checksum_verification, - } - if buffer_size: - read_options.update(use_buffered_stream=True, - buffer_size=buffer_size) - if read_dictionary is not None: - read_options.update(dictionary_columns=read_dictionary) - - if decryption_properties is not None: - read_options.update(decryption_properties=decryption_properties) - - self._filter_expression = None - if filters is not None: - self._filter_expression = filters_to_expression(filters) - - # map old filesystems to new one - if filesystem is not None: - filesystem = _ensure_filesystem( - filesystem, use_mmap=memory_map) - elif filesystem is None and memory_map: - # if memory_map is specified, assume local file system (string - # path can in principle be URI for any filesystem) - filesystem = LocalFileSystem(use_mmap=memory_map) - - # This needs to be checked after _ensure_filesystem, because that - # handles the case of an fsspec LocalFileSystem - if ( - hasattr(path_or_paths, "__fspath__") and - filesystem is not None and - not _is_local_file_system(filesystem) - ): - raise TypeError( - "Path-like objects with __fspath__ must only be used with " - f"local file systems, not {type(filesystem)}" - ) - - # check for single fragment dataset - single_file = None - self._base_dir = None - if not isinstance(path_or_paths, list): - if _is_path_like(path_or_paths): - path_or_paths = _stringify_path(path_or_paths) - if filesystem is None: - # path might be a URI describing the FileSystem as well - try: - filesystem, path_or_paths = FileSystem.from_uri( - path_or_paths) - except ValueError: - filesystem = LocalFileSystem(use_mmap=memory_map) - finfo = filesystem.get_file_info(path_or_paths) - if finfo.is_file: - single_file = path_or_paths - if finfo.type == FileType.Directory: - self._base_dir = path_or_paths - else: - single_file = path_or_paths - - parquet_format = ds.ParquetFileFormat(**read_options) + parquet_format = ds.ParquetFileFormat(**read_options) if single_file is not None: fragment = parquet_format.make_fragment(single_file, filesystem) @@ -2540,12 +1357,7 @@ def __init__(self, path_or_paths, filesystem=None, *, filters=None, ignore_prefixes=ignore_prefixes) def equals(self, other): - if isinstance(other, ParquetDataset): - raise TypeError( - "`other` must be an instance of ParquetDataset constructed " - "with `use_legacy_dataset=False`" - ) - if not isinstance(other, 
_ParquetDatasetV2): + if not isinstance(other, ParquetDataset): raise TypeError('`other` must be an instance of ParquetDataset') return (self.schema == other.schema and @@ -2576,10 +1388,8 @@ def schema(self): ... "Brittle stars", "Centipede"]}) >>> import pyarrow.parquet as pq >>> pq.write_to_dataset(table, root_path='dataset_v2_schema', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_v2_schema/', - ... use_legacy_dataset=False) + ... partition_cols=['year']) + >>> dataset = pq.ParquetDataset('dataset_v2_schema/') Read the schema: @@ -2598,8 +1408,7 @@ def read(self, columns=None, use_threads=True, use_pandas_metadata=False): ---------- columns : List[str] Names of columns to read from the dataset. The partition fields - are not automatically included (in contrast to when setting - ``use_legacy_dataset=True``). + are not automatically included. use_threads : bool, default True Perform multi-threaded column reads. use_pandas_metadata : bool, default False @@ -2622,10 +1431,8 @@ def read(self, columns=None, use_threads=True, use_pandas_metadata=False): ... "Brittle stars", "Centipede"]}) >>> import pyarrow.parquet as pq >>> pq.write_to_dataset(table, root_path='dataset_v2_read', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_v2_read/', - ... use_legacy_dataset=False) + ... partition_cols=['year']) + >>> dataset = pq.ParquetDataset('dataset_v2_read/') Read the dataset: @@ -2694,7 +1501,12 @@ def _get_common_pandas_metadata(self): def read_pandas(self, **kwargs): """ Read dataset including pandas metadata, if any. Other arguments passed - through to ParquetDataset.read, see docstring for further details. + through to :func:`read`, see docstring for further details. + + Parameters + ---------- + **kwargs : optional + Additional options for :func:`read` Examples -------- @@ -2709,8 +1521,7 @@ def read_pandas(self, **kwargs): >>> table = pa.Table.from_pandas(df) >>> import pyarrow.parquet as pq >>> pq.write_table(table, 'table_V2.parquet') - >>> dataset = pq.ParquetDataset('table_V2.parquet', - ... use_legacy_dataset=False) + >>> dataset = pq.ParquetDataset('table_V2.parquet') Read the dataset with pandas metadata: @@ -2725,14 +1536,6 @@ def read_pandas(self, **kwargs): """ return self.read(use_pandas_metadata=True, **kwargs) - @property - def pieces(self): - warnings.warn( - _DEPR_MSG.format("ParquetDataset.pieces", - " Use the '.fragments' attribute instead"), - FutureWarning, stacklevel=2) - return list(self._dataset.get_fragments()) - @property def fragments(self): """ @@ -2750,10 +1553,8 @@ def fragments(self): ... "Brittle stars", "Centipede"]}) >>> import pyarrow.parquet as pq >>> pq.write_to_dataset(table, root_path='dataset_v2_fragments', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_v2_fragments/', - ... use_legacy_dataset=False) + ... partition_cols=['year']) + >>> dataset = pq.ParquetDataset('dataset_v2_fragments/') List the fragments: @@ -2778,10 +1579,8 @@ def files(self): ... "Brittle stars", "Centipede"]}) >>> import pyarrow.parquet as pq >>> pq.write_to_dataset(table, root_path='dataset_v2_files', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_v2_files/', - ... use_legacy_dataset=False) + ... partition_cols=['year']) + >>> dataset = pq.ParquetDataset('dataset_v2_files/') List the files: @@ -2822,8 +1621,6 @@ def partitioning(self): no columns. 
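# --- Editor's illustrative sketch; not part of this diff ---
# Minimal usage of the ParquetDataset API that the updated docstrings above
# describe: no 'use_legacy_dataset' keyword, and partition fields are returned
# only when explicitly listed in 'columns'. The path 'example_dataset' is a
# hypothetical scratch directory.
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({'year': [2020, 2021, 2021],
                  'n_legs': [2, 4, 100],
                  'animal': ['Flamingo', 'Dog', 'Centipede']})
pq.write_to_dataset(table, root_path='example_dataset', partition_cols=['year'])

dataset = pq.ParquetDataset('example_dataset/')
print(dataset.files)                                   # one file per partition
print(dataset.read(columns=['n_legs', 'year']))        # 'year' included only because listed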
use_threads : bool, default True Perform multi-threaded column reads. -metadata : FileMetaData - If separately computed schema : Schema, optional Optionally provide the Schema for the parquet dataset, in which case it will not be inferred from the source. @@ -2836,30 +1633,21 @@ def partitioning(self): Rows which do not match the filter predicate will be removed from scanned data. Partition keys embedded in a nested directory structure will be exploited to avoid loading files at all if they contain no matching rows. - If `use_legacy_dataset` is True, filters can only reference partition - keys and only a hive-style directory structure is supported. When - setting `use_legacy_dataset` to False, also within-file level filtering - and different partitioning schemes are supported. + Within-file level filtering and different partitioning schemes are supported. {3} -use_legacy_dataset : bool, default False - By default, `read_table` uses the new Arrow Datasets API since - pyarrow 1.0.0. Among other things, this allows to pass `filters` - for all columns and not only the partition keys, enables - different partitioning schemes, etc. - Set to True to use the legacy behaviour (this option is deprecated, - and the legacy implementation will be removed in a future version). +use_legacy_dataset : bool, optional + Deprecated and has no effect from PyArrow version 15.0.0. ignore_prefixes : list, optional Files matching any of these prefixes will be ignored by the - discovery process if use_legacy_dataset=False. + discovery process. This is matched to the basename of a path. By default this is ['.', '_']. Note that discovery happens only if a directory is passed as source. pre_buffer : bool, default True Coalesce and issue file reads in parallel to improve performance on high-latency filesystems (e.g. S3). If True, Arrow will use a - background I/O thread pool. This option is only supported for - use_legacy_dataset=False. If using a filesystem layer that itself + background I/O thread pool. If using a filesystem layer that itself performs readahead (e.g. fsspec's S3FS), disable readahead for best results. 
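# --- Editor's illustrative sketch; not part of this diff ---
# As the docstring above notes, filters always support within-file, row-level
# filtering now, so they may reference ordinary data columns rather than only
# partition keys. 'example_dataset' is the hypothetical directory from the
# previous sketch.
import pyarrow.parquet as pq

result = pq.read_table(
    'example_dataset',
    columns=['n_legs', 'animal'],
    filters=[('n_legs', '>=', 4)],   # predicate on a non-partition column
    pre_buffer=True,                 # coalesced reads help on high-latency stores
)
print(result)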
coerce_int96_timestamp_unit : str, default None @@ -2968,129 +1756,78 @@ def partitioning(self): """ -def read_table(source, *, columns=None, use_threads=True, metadata=None, +def read_table(source, *, columns=None, use_threads=True, schema=None, use_pandas_metadata=False, read_dictionary=None, memory_map=False, buffer_size=0, partitioning="hive", - filesystem=None, filters=None, use_legacy_dataset=False, + filesystem=None, filters=None, use_legacy_dataset=None, ignore_prefixes=None, pre_buffer=True, coerce_int96_timestamp_unit=None, decryption_properties=None, thrift_string_size_limit=None, thrift_container_size_limit=None, page_checksum_verification=False): - if not use_legacy_dataset: - if metadata is not None: + + if use_legacy_dataset is not None: + warnings.warn( + "Passing 'use_legacy_dataset' is deprecated as of pyarrow 15.0.0 " + "and will be removed in a future version.", + FutureWarning, stacklevel=2) + + try: + dataset = ParquetDataset( + source, + schema=schema, + filesystem=filesystem, + partitioning=partitioning, + memory_map=memory_map, + read_dictionary=read_dictionary, + buffer_size=buffer_size, + filters=filters, + ignore_prefixes=ignore_prefixes, + pre_buffer=pre_buffer, + coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, + thrift_string_size_limit=thrift_string_size_limit, + thrift_container_size_limit=thrift_container_size_limit, + page_checksum_verification=page_checksum_verification, + ) + except ImportError: + # fall back on ParquetFile for simple cases when pyarrow.dataset + # module is not available + if filters is not None: raise ValueError( - "The 'metadata' keyword is no longer supported with the new " - "datasets-based implementation. Specify " - "'use_legacy_dataset=True' to temporarily recover the old " - "behaviour." 
- ) - try: - dataset = _ParquetDatasetV2( - source, - schema=schema, - filesystem=filesystem, - partitioning=partitioning, - memory_map=memory_map, - read_dictionary=read_dictionary, - buffer_size=buffer_size, - filters=filters, - ignore_prefixes=ignore_prefixes, - pre_buffer=pre_buffer, - coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, - thrift_string_size_limit=thrift_string_size_limit, - thrift_container_size_limit=thrift_container_size_limit, - page_checksum_verification=page_checksum_verification, + "the 'filters' keyword is not supported when the " + "pyarrow.dataset module is not available" ) - except ImportError: - # fall back on ParquetFile for simple cases when pyarrow.dataset - # module is not available - if filters is not None: - raise ValueError( - "the 'filters' keyword is not supported when the " - "pyarrow.dataset module is not available" - ) - if partitioning != "hive": - raise ValueError( - "the 'partitioning' keyword is not supported when the " - "pyarrow.dataset module is not available" - ) - if schema is not None: - raise ValueError( - "the 'schema' argument is not supported when the " - "pyarrow.dataset module is not available" - ) - filesystem, path = _resolve_filesystem_and_path(source, filesystem) - if filesystem is not None: - source = filesystem.open_input_file(path) - # TODO test that source is not a directory or a list - dataset = ParquetFile( - source, metadata=metadata, read_dictionary=read_dictionary, - memory_map=memory_map, buffer_size=buffer_size, - pre_buffer=pre_buffer, - coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, - decryption_properties=decryption_properties, - thrift_string_size_limit=thrift_string_size_limit, - thrift_container_size_limit=thrift_container_size_limit, - page_checksum_verification=page_checksum_verification, + if partitioning != "hive": + raise ValueError( + "the 'partitioning' keyword is not supported when the " + "pyarrow.dataset module is not available" ) - - return dataset.read(columns=columns, use_threads=use_threads, - use_pandas_metadata=use_pandas_metadata) - - warnings.warn( - "Passing 'use_legacy_dataset=True' to get the legacy behaviour is " - "deprecated as of pyarrow 8.0.0, and the legacy implementation will " - "be removed in a future version.", - FutureWarning, stacklevel=2) - - if ignore_prefixes is not None: - raise ValueError( - "The 'ignore_prefixes' keyword is only supported when " - "use_legacy_dataset=False") - - if page_checksum_verification: - raise ValueError( - "The 'page_checksum_verification' keyword is only supported when " - "use_legacy_dataset=False") - - if schema is not None: - raise ValueError( - "The 'schema' argument is only supported when " - "use_legacy_dataset=False") - - if _is_path_like(source): - with warnings.catch_warnings(): - # Suppress second warning from ParquetDataset constructor - warnings.filterwarnings( - "ignore", "Passing 'use_legacy_dataset", FutureWarning) - pf = ParquetDataset( - source, metadata=metadata, memory_map=memory_map, - read_dictionary=read_dictionary, - buffer_size=buffer_size, - filesystem=filesystem, filters=filters, - partitioning=partitioning, - coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, - use_legacy_dataset=True, + if schema is not None: + raise ValueError( + "the 'schema' argument is not supported when the " + "pyarrow.dataset module is not available" ) - else: - pf = ParquetFile( - source, metadata=metadata, - read_dictionary=read_dictionary, - memory_map=memory_map, - buffer_size=buffer_size, + filesystem, path = 
_resolve_filesystem_and_path(source, filesystem) + if filesystem is not None: + source = filesystem.open_input_file(path) + # TODO test that source is not a directory or a list + dataset = ParquetFile( + source, read_dictionary=read_dictionary, + memory_map=memory_map, buffer_size=buffer_size, + pre_buffer=pre_buffer, coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, - decryption_properties=decryption_properties + decryption_properties=decryption_properties, + thrift_string_size_limit=thrift_string_size_limit, + thrift_container_size_limit=thrift_container_size_limit, + page_checksum_verification=page_checksum_verification, ) - return pf.read(columns=columns, use_threads=use_threads, - use_pandas_metadata=use_pandas_metadata) + return dataset.read(columns=columns, use_threads=use_threads, + use_pandas_metadata=use_pandas_metadata) -read_table.__doc__ = _read_table_docstring.format( - """Read a Table from Parquet format -Note: starting with pyarrow 1.0, the default for `use_legacy_dataset` is -switched to False.""", +read_table.__doc__ = _read_table_docstring.format( + """Read a Table from Parquet format""", "\n".join(("""use_pandas_metadata : bool, default False If True and file has custom pandas schema metadata, ensure that index columns are also loaded.""", _read_docstring_common)), @@ -3233,23 +1970,13 @@ def write_table(table, where, row_group_size=None, version='2.6', """.format(_parquet_writer_arg_docs, _write_table_example) -def _mkdir_if_not_exists(fs, path): - if fs._isfilestore() and not fs.exists(path): - try: - fs.mkdir(path) - except OSError: - assert fs.exists(path) - - def write_to_dataset(table, root_path, partition_cols=None, - partition_filename_cb=None, filesystem=None, - use_legacy_dataset=None, schema=None, - partitioning=None, basename_template=None, - use_threads=None, file_visitor=None, - existing_data_behavior=None, + filesystem=None, use_legacy_dataset=None, + schema=None, partitioning=None, + basename_template=None, use_threads=None, + file_visitor=None, existing_data_behavior=None, **kwargs): - """Wrapper around dataset.write_dataset (when use_legacy_dataset=False) or - parquet.write_table (when use_legacy_dataset=True) for writing a Table to + """Wrapper around dataset.write_dataset for writing a Table to Parquet format by partitions. For each combination of partition columns and values, a subdirectories are created in the following @@ -3271,45 +1998,31 @@ def write_to_dataset(table, root_path, partition_cols=None, ---------- table : pyarrow.Table root_path : str, pathlib.Path - The root directory of the dataset + The root directory of the dataset. partition_cols : list, Column names by which to partition the dataset. - Columns are partitioned in the order they are given - partition_filename_cb : callable, - A callback function that takes the partition key(s) as an argument - and allow you to override the partition filename. If nothing is - passed, the filename will consist of a uuid. - This option is only supported for use_legacy_dataset=True. - When use_legacy_dataset=None and this option is specified, - use_legacy_dataset will be set to True. + Columns are partitioned in the order they are given. filesystem : FileSystem, default None If nothing passed, will be inferred based on path. Path will try to be found in the local on-disk filesystem otherwise it will be parsed as an URI to determine the filesystem. - use_legacy_dataset : bool - Default is False. 
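# --- Editor's illustrative sketch; not part of this diff ---
# The fallback branch of read_table shown above drops to ParquetFile when the
# pyarrow.dataset module cannot be imported. For a single local file the
# equivalent explicit calls look roughly like this; 'single_file.parquet' is a
# hypothetical path.
import pyarrow as pa
import pyarrow.parquet as pq

pq.write_table(pa.table({'a': [1, 2, 3]}), 'single_file.parquet')

pf = pq.ParquetFile('single_file.parquet', buffer_size=4096, pre_buffer=True)
print(pf.metadata.num_rows, pf.schema_arrow)
print(pf.read(columns=['a'], use_threads=True))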
Set to True to use the legacy behaviour - (this option is deprecated, and the legacy implementation will be - removed in a future version). The legacy implementation still - supports the `partition_filename_cb` keyword but is less efficient - when using partition columns. + use_legacy_dataset : bool, optional + Deprecated and has no effect from PyArrow version 15.0.0. schema : Schema, optional - This option is only supported for use_legacy_dataset=False. + This Schema of the dataset. partitioning : Partitioning or list[str], optional The partitioning scheme specified with the ``pyarrow.dataset.partitioning()`` function or a list of field names. When providing a list of field names, you can use ``partitioning_flavor`` to drive which partitioning type should be used. - This option is only supported for use_legacy_dataset=False. basename_template : str, optional A template string used to generate basenames of written data files. The token '{i}' will be replaced with an automatically incremented integer. If not specified, it defaults to "guid-{i}.parquet". - This option is only supported for use_legacy_dataset=False. use_threads : bool, default True Write files in parallel. If enabled, then maximum parallelism will be used determined by the number of available CPU cores. - This option is only supported for use_legacy_dataset=False. file_visitor : function If set, this function will be called with a WrittenFile instance for each file created during the call. This object will have both @@ -3330,7 +2043,6 @@ def write_to_dataset(table, root_path, partition_cols=None, def file_visitor(written_file): visited_paths.append(written_file.path) - This option is only supported for use_legacy_dataset=False. existing_data_behavior : 'overwrite_or_ignore' | 'error' | \ 'delete_matching' Controls how the dataset will handle data that already exists in @@ -3348,15 +2060,12 @@ def file_visitor(written_file): dataset. The first time each partition directory is encountered the entire directory will be deleted. This allows you to overwrite old partitions completely. - This option is only supported for use_legacy_dataset=False. **kwargs : dict, - When use_legacy_dataset=False, used as additional kwargs for - `dataset.write_dataset` function for matching kwargs, and remainder to - `ParquetFileFormat.make_write_options`. See the docstring - of `write_table` and `dataset.write_dataset` for the available options. - When use_legacy_dataset=True, used as additional kwargs for - `parquet.write_table` function (See docstring for `write_table` - or `ParquetWriter` for more information). + Used as additional kwargs for :func:`pyarrow.dataset.write_dataset` + function for matching kwargs, and remainder to + :func:`pyarrow.dataset.ParquetFileFormat.make_write_options`. + See the docstring of :func:`write_table` and + :func:`pyarrow.dataset.write_dataset` for the available options. Using `metadata_collector` in kwargs allows one to collect the file metadata instances of dataset pieces. The file paths in the ColumnChunkMetaData will be set relative to `root_path`. @@ -3376,194 +2085,79 @@ def file_visitor(written_file): >>> import pyarrow.parquet as pq >>> pq.write_to_dataset(table, root_path='dataset_name_3', ... partition_cols=['year']) - >>> pq.ParquetDataset('dataset_name_3', use_legacy_dataset=False).files + >>> pq.ParquetDataset('dataset_name_3').files ['dataset_name_3/year=2019/...-0.parquet', ... 
Write a single Parquet file into the root folder: >>> pq.write_to_dataset(table, root_path='dataset_name_4') - >>> pq.ParquetDataset('dataset_name_4/', use_legacy_dataset=False).files + >>> pq.ParquetDataset('dataset_name_4/').files ['dataset_name_4/...-0.parquet'] """ - # Choose the implementation - if use_legacy_dataset is None: - # if partition_filename_cb is specified -> - # default to the old implementation - if partition_filename_cb: - use_legacy_dataset = True - # otherwise the default is False - else: - use_legacy_dataset = False + if use_legacy_dataset is not None: + warnings.warn( + "Passing 'use_legacy_dataset' is deprecated as of pyarrow 15.0.0 " + "and will be removed in a future version.", + FutureWarning, stacklevel=2) + + metadata_collector = kwargs.pop('metadata_collector', None) # Check for conflicting keywords - msg_confl_0 = ( - "The '{0}' argument is not supported by use_legacy_dataset={2}. " - "Use only '{1}' instead." - ) - msg_confl_1 = ( - "The '{1}' argument is not supported by use_legacy_dataset={2}. " + msg_confl = ( + "The '{1}' argument is not supported. " "Use only '{0}' instead." ) - msg_confl = msg_confl_0 if use_legacy_dataset else msg_confl_1 - if partition_filename_cb is not None and basename_template is not None: - raise ValueError(msg_confl.format("basename_template", - "partition_filename_cb", - use_legacy_dataset)) - if partition_cols is not None and partitioning is not None: raise ValueError(msg_confl.format("partitioning", - "partition_cols", - use_legacy_dataset)) + "partition_cols")) - metadata_collector = kwargs.pop('metadata_collector', None) if metadata_collector is not None and file_visitor is not None: raise ValueError(msg_confl.format("file_visitor", - "metadata_collector", - use_legacy_dataset)) + "metadata_collector")) - # New dataset implementation - if not use_legacy_dataset: - import pyarrow.dataset as ds + import pyarrow.dataset as ds - # extract write_dataset specific options - # reset assumed to go to make_write_options - write_dataset_kwargs = dict() - for key in inspect.signature(ds.write_dataset).parameters: - if key in kwargs: - write_dataset_kwargs[key] = kwargs.pop(key) - write_dataset_kwargs['max_rows_per_group'] = kwargs.pop( - 'row_group_size', kwargs.pop("chunk_size", None) - ) - # raise for unsupported keywords - msg = ( - "The '{}' argument is not supported with the new dataset " - "implementation." 
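# --- Editor's illustrative sketch; not part of this diff ---
# write_to_dataset now always forwards to pyarrow.dataset.write_dataset, as the
# rewritten body above shows. This sketch pairs partition_cols with a
# metadata_collector; 'collected_dataset' is a hypothetical output directory.
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({'year': [2020, 2021], 'value': [1.0, 2.0]})
collector = []
pq.write_to_dataset(
    table, 'collected_dataset',
    partition_cols=['year'],
    existing_data_behavior='overwrite_or_ignore',
    metadata_collector=collector,
)
# File paths in the collected metadata are set relative to the root path.
for md in collector:
    print(md.row_group(0).column(0).file_path)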
- ) - if metadata_collector is not None: - def file_visitor(written_file): - metadata_collector.append(written_file.metadata) - if partition_filename_cb is not None: - raise ValueError(msg.format("partition_filename_cb")) + # extract write_dataset specific options + # reset assumed to go to make_write_options + write_dataset_kwargs = dict() + for key in inspect.signature(ds.write_dataset).parameters: + if key in kwargs: + write_dataset_kwargs[key] = kwargs.pop(key) + write_dataset_kwargs['max_rows_per_group'] = kwargs.pop( + 'row_group_size', kwargs.pop("chunk_size", None) + ) - # map format arguments - parquet_format = ds.ParquetFileFormat() - write_options = parquet_format.make_write_options(**kwargs) + if metadata_collector is not None: + def file_visitor(written_file): + metadata_collector.append(written_file.metadata) - # map old filesystems to new one - if filesystem is not None: - filesystem = _ensure_filesystem(filesystem) - - if partition_cols: - part_schema = table.select(partition_cols).schema - partitioning = ds.partitioning(part_schema, flavor="hive") - - if basename_template is None: - basename_template = guid() + '-{i}.parquet' - - if existing_data_behavior is None: - existing_data_behavior = 'overwrite_or_ignore' - - ds.write_dataset( - table, root_path, filesystem=filesystem, - format=parquet_format, file_options=write_options, schema=schema, - partitioning=partitioning, use_threads=use_threads, - file_visitor=file_visitor, - basename_template=basename_template, - existing_data_behavior=existing_data_behavior, - **write_dataset_kwargs) - return - - # warnings and errors when using legacy implementation - if use_legacy_dataset: - warnings.warn( - "Passing 'use_legacy_dataset=True' to get the legacy behaviour is " - "deprecated as of pyarrow 8.0.0, and the legacy implementation " - "will be removed in a future version.", - FutureWarning, stacklevel=2) - msg2 = ( - "The '{}' argument is not supported with the legacy " - "implementation. To use this argument specify " - "'use_legacy_dataset=False' while constructing the " - "ParquetDataset." - ) - if schema is not None: - raise ValueError(msg2.format("schema")) - if partitioning is not None: - raise ValueError(msg2.format("partitioning")) - if use_threads is not None: - raise ValueError(msg2.format("use_threads")) - if file_visitor is not None: - raise ValueError(msg2.format("file_visitor")) - if existing_data_behavior is not None: - raise ValueError(msg2.format("existing_data_behavior")) - if basename_template is not None: - raise ValueError(msg2.format("basename_template")) - if partition_filename_cb is not None: - warnings.warn( - _DEPR_MSG.format("partition_filename_cb", " Specify " - "'use_legacy_dataset=False' while constructing " - "the ParquetDataset, and then use the " - "'basename_template' parameter instead. 
For " - "usage see `pyarrow.dataset.write_dataset`"), - FutureWarning, stacklevel=2) + # map format arguments + parquet_format = ds.ParquetFileFormat() + write_options = parquet_format.make_write_options(**kwargs) - # Legacy implementation - fs, root_path = legacyfs.resolve_filesystem_and_path(root_path, filesystem) - - _mkdir_if_not_exists(fs, root_path) - - if partition_cols is not None and len(partition_cols) > 0: - df = table.to_pandas() - partition_keys = [df[col] for col in partition_cols] - data_df = df.drop(partition_cols, axis='columns') - data_cols = df.columns.drop(partition_cols) - if len(data_cols) == 0: - raise ValueError('No data left to save outside partition columns') - - subschema = table.schema - - # ARROW-2891: Ensure the output_schema is preserved when writing a - # partitioned dataset - for col in table.schema.names: - if col in partition_cols: - subschema = subschema.remove(subschema.get_field_index(col)) - - # ARROW-17829: avoid deprecation warnings for df.groupby - # https://github.com/pandas-dev/pandas/issues/42795 - if len(partition_keys) == 1: - partition_keys = partition_keys[0] - - for keys, subgroup in data_df.groupby(partition_keys, observed=True): - if not isinstance(keys, tuple): - keys = (keys,) - subdir = '/'.join( - ['{colname}={value}'.format(colname=name, value=val) - for name, val in zip(partition_cols, keys)]) - subtable = pa.Table.from_pandas(subgroup, schema=subschema, - safe=False) - _mkdir_if_not_exists(fs, '/'.join([root_path, subdir])) - if partition_filename_cb: - outfile = partition_filename_cb(keys) - else: - outfile = guid() + '.parquet' - relative_path = '/'.join([subdir, outfile]) - full_path = '/'.join([root_path, relative_path]) - with fs.open(full_path, 'wb') as f: - write_table(subtable, f, metadata_collector=metadata_collector, - **kwargs) - if metadata_collector is not None: - metadata_collector[-1].set_file_path(relative_path) - else: - if partition_filename_cb: - outfile = partition_filename_cb(None) - else: - outfile = guid() + '.parquet' - full_path = '/'.join([root_path, outfile]) - with fs.open(full_path, 'wb') as f: - write_table(table, f, metadata_collector=metadata_collector, - **kwargs) - if metadata_collector is not None: - metadata_collector[-1].set_file_path(outfile) + # map old filesystems to new one + if filesystem is not None: + filesystem = _ensure_filesystem(filesystem) + + if partition_cols: + part_schema = table.select(partition_cols).schema + partitioning = ds.partitioning(part_schema, flavor="hive") + + if basename_template is None: + basename_template = guid() + '-{i}.parquet' + + if existing_data_behavior is None: + existing_data_behavior = 'overwrite_or_ignore' + + ds.write_dataset( + table, root_path, filesystem=filesystem, + format=parquet_format, file_options=write_options, schema=schema, + partitioning=partitioning, use_threads=use_threads, + file_visitor=file_visitor, + basename_template=basename_template, + existing_data_behavior=existing_data_behavior, + **write_dataset_kwargs) + return def write_metadata(schema, where, metadata_collector=None, filesystem=None, @@ -3741,15 +2335,11 @@ def read_schema(where, memory_map=False, decryption_properties=None, "FileEncryptionProperties", "FileMetaData", "ParquetDataset", - "ParquetDatasetPiece", "ParquetFile", "ParquetLogicalType", - "ParquetManifest", - "ParquetPartitions", "ParquetReader", "ParquetSchema", "ParquetWriter", - "PartitionSet", "RowGroupMetaData", "SortingColumn", "Statistics", diff --git a/python/pyarrow/src/arrow/python/parquet_encryption.h 
b/python/pyarrow/src/arrow/python/parquet_encryption.h index 23ee478348ecd..a1aaa30e260f5 100644 --- a/python/pyarrow/src/arrow/python/parquet_encryption.h +++ b/python/pyarrow/src/arrow/python/parquet_encryption.h @@ -26,6 +26,27 @@ #include "parquet/encryption/kms_client.h" #include "parquet/encryption/kms_client_factory.h" +#if defined(_WIN32) || defined(__CYGWIN__) // Windows +#if defined(_MSC_VER) +#pragma warning(disable : 4251) +#else +#pragma GCC diagnostic ignored "-Wattributes" +#endif + +#ifdef ARROW_PYTHON_STATIC +#define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT +#elif defined(ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORTING) +#define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT __declspec(dllexport) +#else +#define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT __declspec(dllimport) +#endif + +#else // Not Windows +#ifndef ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT +#define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT __attribute__((visibility("default"))) +#endif +#endif // Non-Windows + namespace arrow { namespace py { namespace parquet { @@ -33,7 +54,7 @@ namespace encryption { /// \brief A table of function pointers for calling from C++ into /// Python. -class ARROW_PYTHON_EXPORT PyKmsClientVtable { +class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClientVtable { public: std::function @@ -44,7 +65,8 @@ class ARROW_PYTHON_EXPORT PyKmsClientVtable { }; /// \brief A helper for KmsClient implementation in Python. -class ARROW_PYTHON_EXPORT PyKmsClient : public ::parquet::encryption::KmsClient { +class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClient + : public ::parquet::encryption::KmsClient { public: PyKmsClient(PyObject* handler, PyKmsClientVtable vtable); ~PyKmsClient() override; @@ -62,7 +84,7 @@ class ARROW_PYTHON_EXPORT PyKmsClient : public ::parquet::encryption::KmsClient /// \brief A table of function pointers for calling from C++ into /// Python. -class ARROW_PYTHON_EXPORT PyKmsClientFactoryVtable { +class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClientFactoryVtable { public: std::function> SafeGetFileEncryptionProperties( diff --git a/python/pyarrow/tests/parquet/__init__.py b/python/pyarrow/tests/parquet/__init__.py index 4c4e8240b8736..d08d67d2860f4 100644 --- a/python/pyarrow/tests/parquet/__init__.py +++ b/python/pyarrow/tests/parquet/__init__.py @@ -21,7 +21,4 @@ # Ignore these with pytest ... 
-m 'not parquet' pytestmark = [ pytest.mark.parquet, - pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning" - ), ] diff --git a/python/pyarrow/tests/parquet/common.py b/python/pyarrow/tests/parquet/common.py index 4401d3ca6bb75..8365ed5b28543 100644 --- a/python/pyarrow/tests/parquet/common.py +++ b/python/pyarrow/tests/parquet/common.py @@ -18,31 +18,10 @@ import io import numpy as np -import pytest import pyarrow as pa from pyarrow.tests import util -legacy_filter_mark = pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy:FutureWarning" -) - -parametrize_legacy_dataset = pytest.mark.parametrize( - "use_legacy_dataset", - [pytest.param(True, marks=legacy_filter_mark), - pytest.param(False, marks=pytest.mark.dataset)] -) -parametrize_legacy_dataset_not_supported = pytest.mark.parametrize( - "use_legacy_dataset", - [pytest.param(True, marks=legacy_filter_mark), - pytest.param(False, marks=pytest.mark.skip)] -) -parametrize_legacy_dataset_fixed = pytest.mark.parametrize( - "use_legacy_dataset", - [pytest.param(True, marks=[pytest.mark.xfail, legacy_filter_mark]), - pytest.param(False, marks=pytest.mark.dataset)] -) - def _write_table(table, path, **kwargs): # So we see the ImportError somewhere @@ -65,19 +44,18 @@ def _read_table(*args, **kwargs): def _roundtrip_table(table, read_table_kwargs=None, - write_table_kwargs=None, use_legacy_dataset=False): + write_table_kwargs=None): read_table_kwargs = read_table_kwargs or {} write_table_kwargs = write_table_kwargs or {} writer = pa.BufferOutputStream() _write_table(table, writer, **write_table_kwargs) reader = pa.BufferReader(writer.getvalue()) - return _read_table(reader, use_legacy_dataset=use_legacy_dataset, - **read_table_kwargs) + return _read_table(reader, **read_table_kwargs) def _check_roundtrip(table, expected=None, read_table_kwargs=None, - use_legacy_dataset=False, **write_table_kwargs): + **write_table_kwargs): if expected is None: expected = table @@ -85,20 +63,17 @@ def _check_roundtrip(table, expected=None, read_table_kwargs=None, # intentionally check twice result = _roundtrip_table(table, read_table_kwargs=read_table_kwargs, - write_table_kwargs=write_table_kwargs, - use_legacy_dataset=use_legacy_dataset) + write_table_kwargs=write_table_kwargs) assert result.equals(expected) result = _roundtrip_table(result, read_table_kwargs=read_table_kwargs, - write_table_kwargs=write_table_kwargs, - use_legacy_dataset=use_legacy_dataset) + write_table_kwargs=write_table_kwargs) assert result.equals(expected) -def _roundtrip_pandas_dataframe(df, write_kwargs, use_legacy_dataset=False): +def _roundtrip_pandas_dataframe(df, write_kwargs): table = pa.Table.from_pandas(df) result = _roundtrip_table( - table, write_table_kwargs=write_kwargs, - use_legacy_dataset=use_legacy_dataset) + table, write_table_kwargs=write_kwargs) return result.to_pandas() diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 83e6ebeb7a1fc..3c867776ac052 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -28,7 +28,6 @@ from pyarrow.filesystem import LocalFileSystem, FileSystem from pyarrow.tests import util from pyarrow.tests.parquet.common import (_check_roundtrip, _roundtrip_table, - parametrize_legacy_dataset, _test_dataframe) try: @@ -63,21 +62,18 @@ def test_parquet_invalid_version(tempdir): data_page_version="2.2") -@parametrize_legacy_dataset -def test_set_data_page_size(use_legacy_dataset): +def 
test_set_data_page_size(): arr = pa.array([1, 2, 3] * 100000) t = pa.Table.from_arrays([arr], names=['f0']) # 128K, 512K page_sizes = [2 << 16, 2 << 18] for target_page_size in page_sizes: - _check_roundtrip(t, data_page_size=target_page_size, - use_legacy_dataset=use_legacy_dataset) + _check_roundtrip(t, data_page_size=target_page_size) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_set_write_batch_size(use_legacy_dataset): +def test_set_write_batch_size(): df = _test_dataframe(100) table = pa.Table.from_pandas(df, preserve_index=False) @@ -87,8 +83,7 @@ def test_set_write_batch_size(use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_set_dictionary_pagesize_limit(use_legacy_dataset): +def test_set_dictionary_pagesize_limit(): df = _test_dataframe(100) table = pa.Table.from_pandas(df, preserve_index=False) @@ -101,8 +96,7 @@ def test_set_dictionary_pagesize_limit(use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_chunked_table_write(use_legacy_dataset): +def test_chunked_table_write(): # ARROW-232 tables = [] batch = pa.RecordBatch.from_pandas(alltypes_sample(size=10)) @@ -116,66 +110,56 @@ def test_chunked_table_write(use_legacy_dataset): for table in tables: _check_roundtrip( table, version='2.6', - use_legacy_dataset=use_legacy_dataset, data_page_version=data_page_version, use_dictionary=use_dictionary) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_memory_map(tempdir, use_legacy_dataset): +def test_memory_map(tempdir): df = alltypes_sample(size=10) table = pa.Table.from_pandas(df) _check_roundtrip(table, read_table_kwargs={'memory_map': True}, - version='2.6', use_legacy_dataset=use_legacy_dataset) + version='2.6') filename = str(tempdir / 'tmp_file') with open(filename, 'wb') as f: _write_table(table, f, version='2.6') - table_read = pq.read_pandas(filename, memory_map=True, - use_legacy_dataset=use_legacy_dataset) + table_read = pq.read_pandas(filename, memory_map=True) assert table_read.equals(table) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_enable_buffered_stream(tempdir, use_legacy_dataset): +def test_enable_buffered_stream(tempdir): df = alltypes_sample(size=10) table = pa.Table.from_pandas(df) _check_roundtrip(table, read_table_kwargs={'buffer_size': 1025}, - version='2.6', use_legacy_dataset=use_legacy_dataset) + version='2.6') filename = str(tempdir / 'tmp_file') with open(filename, 'wb') as f: _write_table(table, f, version='2.6') - table_read = pq.read_pandas(filename, buffer_size=4096, - use_legacy_dataset=use_legacy_dataset) + table_read = pq.read_pandas(filename, buffer_size=4096) assert table_read.equals(table) -@parametrize_legacy_dataset -def test_special_chars_filename(tempdir, use_legacy_dataset): +def test_special_chars_filename(tempdir): table = pa.Table.from_arrays([pa.array([42])], ["ints"]) filename = "foo # bar" path = tempdir / filename assert not path.exists() _write_table(table, str(path)) assert path.exists() - table_read = _read_table(str(path), use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(str(path)) assert table_read.equals(table) -@parametrize_legacy_dataset -def test_invalid_source(use_legacy_dataset): +def test_invalid_source(): # Test that we provide an helpful error message pointing out # that None wasn't expected when trying to open a Parquet None file. - # - # Depending on use_legacy_dataset the message changes slightly - # but in both cases it should point out that None wasn't expected. 
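# --- Editor's illustrative sketch; not part of this diff ---
# The simplified test helpers above reduce to a plain in-memory write/read
# round trip, with no dataset-implementation switch. A standalone equivalent:
import pyarrow as pa
import pyarrow.parquet as pq

def roundtrip(table, read_table_kwargs=None, **write_table_kwargs):
    read_table_kwargs = read_table_kwargs or {}
    sink = pa.BufferOutputStream()
    pq.write_table(table, sink, **write_table_kwargs)
    return pq.read_table(pa.BufferReader(sink.getvalue()), **read_table_kwargs)

t = pa.table({'f0': pa.array([1, 2, 3] * 1000)})
assert roundtrip(t, data_page_size=2 << 16).equals(t)                   # small data pages
assert roundtrip(t, read_table_kwargs={'buffer_size': 1025}).equals(t)  # buffered stream reads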
with pytest.raises(TypeError, match="None"): - pq.read_table(None, use_legacy_dataset=use_legacy_dataset) + pq.read_table(None) with pytest.raises(TypeError, match="None"): pq.ParquetFile(None) @@ -193,8 +177,7 @@ def test_file_with_over_int16_max_row_groups(): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_empty_table_roundtrip(use_legacy_dataset): +def test_empty_table_roundtrip(): df = alltypes_sample(size=10) # Create a non-empty table to infer the types correctly, then slice to 0 @@ -206,19 +189,17 @@ def test_empty_table_roundtrip(use_legacy_dataset): assert table.schema.field('null').type == pa.null() assert table.schema.field('null_list').type == pa.list_(pa.null()) _check_roundtrip( - table, version='2.6', use_legacy_dataset=use_legacy_dataset) + table, version='2.6') @pytest.mark.pandas -@parametrize_legacy_dataset -def test_empty_table_no_columns(use_legacy_dataset): +def test_empty_table_no_columns(): df = pd.DataFrame() empty = pa.Table.from_pandas(df, preserve_index=False) - _check_roundtrip(empty, use_legacy_dataset=use_legacy_dataset) + _check_roundtrip(empty) -@parametrize_legacy_dataset -def test_write_nested_zero_length_array_chunk_failure(use_legacy_dataset): +def test_write_nested_zero_length_array_chunk_failure(): # Bug report in ARROW-3792 cols = OrderedDict( int32=pa.int32(), @@ -243,17 +224,16 @@ def test_write_nested_zero_length_array_chunk_failure(use_legacy_dataset): my_batches = [pa.RecordBatch.from_arrays(batch, schema=pa.schema(cols)) for batch in my_arrays] tbl = pa.Table.from_batches(my_batches, pa.schema(cols)) - _check_roundtrip(tbl, use_legacy_dataset=use_legacy_dataset) + _check_roundtrip(tbl) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_multiple_path_types(tempdir, use_legacy_dataset): +def test_multiple_path_types(tempdir): # Test compatibility with PEP 519 path-like objects path = tempdir / 'zzz.parquet' df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)}) _write_table(df, path) - table_read = _read_table(path, use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(path) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -261,13 +241,12 @@ def test_multiple_path_types(tempdir, use_legacy_dataset): path = str(tempdir) + 'zzz.parquet' df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)}) _write_table(df, path) - table_read = _read_table(path, use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(path) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) -@parametrize_legacy_dataset -def test_fspath(tempdir, use_legacy_dataset): +def test_fspath(tempdir): # ARROW-12472 support __fspath__ objects without using str() path = tempdir / "test.parquet" table = pa.table({"a": [1, 2, 3]}) @@ -275,9 +254,7 @@ def test_fspath(tempdir, use_legacy_dataset): fs_protocol_obj = util.FSProtocolClass(path) - result = _read_table( - fs_protocol_obj, use_legacy_dataset=use_legacy_dataset - ) + result = _read_table(fs_protocol_obj) assert result.equals(table) # combined with non-local filesystem raises @@ -285,15 +262,11 @@ def test_fspath(tempdir, use_legacy_dataset): _read_table(fs_protocol_obj, filesystem=FileSystem()) -@pytest.mark.dataset -@parametrize_legacy_dataset @pytest.mark.parametrize("filesystem", [ None, fs.LocalFileSystem(), LocalFileSystem._get_instance() ]) @pytest.mark.parametrize("name", ("data.parquet", "例.parquet")) -def test_relative_paths(tempdir, use_legacy_dataset, filesystem, name): - if use_legacy_dataset and isinstance(filesystem, fs.FileSystem): - 
pytest.skip("Passing new filesystem not supported for legacy reader") +def test_relative_paths(tempdir, filesystem, name): # reading and writing from relative paths table = pa.table({"a": [1, 2, 3]}) path = tempdir / name @@ -301,8 +274,7 @@ def test_relative_paths(tempdir, use_legacy_dataset, filesystem, name): # reading pq.write_table(table, str(path)) with util.change_cwd(tempdir): - result = pq.read_table(name, filesystem=filesystem, - use_legacy_dataset=use_legacy_dataset) + result = pq.read_table(name, filesystem=filesystem) assert result.equals(table) path.unlink() @@ -334,24 +306,21 @@ def seek(self, *args): pq.read_table(BogusFile(b"")) -@parametrize_legacy_dataset -def test_parquet_read_from_buffer(tempdir, use_legacy_dataset): +def test_parquet_read_from_buffer(tempdir): # reading from a buffer from python's open() table = pa.table({"a": [1, 2, 3]}) pq.write_table(table, str(tempdir / "data.parquet")) with open(str(tempdir / "data.parquet"), "rb") as f: - result = pq.read_table(f, use_legacy_dataset=use_legacy_dataset) + result = pq.read_table(f) assert result.equals(table) with open(str(tempdir / "data.parquet"), "rb") as f: - result = pq.read_table(pa.PythonFile(f), - use_legacy_dataset=use_legacy_dataset) + result = pq.read_table(pa.PythonFile(f)) assert result.equals(table) -@parametrize_legacy_dataset -def test_byte_stream_split(use_legacy_dataset): +def test_byte_stream_split(): # This is only a smoke test. arr_float = pa.array(list(map(float, range(100)))) arr_int = pa.array(list(map(int, range(100)))) @@ -385,12 +354,10 @@ def test_byte_stream_split(use_legacy_dataset): table = pa.Table.from_arrays([arr_int], names=['tmp']) with pytest.raises(IOError): _check_roundtrip(table, expected=table, use_byte_stream_split=True, - use_dictionary=False, - use_legacy_dataset=use_legacy_dataset) + use_dictionary=False) -@parametrize_legacy_dataset -def test_column_encoding(use_legacy_dataset): +def test_column_encoding(): arr_float = pa.array(list(map(float, range(100)))) arr_int = pa.array(list(map(int, range(100)))) arr_bin = pa.array([str(x) for x in range(100)], type=pa.binary()) @@ -406,30 +373,26 @@ def test_column_encoding(use_legacy_dataset): _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, column_encoding={'a': "BYTE_STREAM_SPLIT", 'b': "PLAIN", - 'c': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "PLAIN"}) # Check "PLAIN" for all columns. _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, - column_encoding="PLAIN", - use_legacy_dataset=use_legacy_dataset) + column_encoding="PLAIN") # Check "DELTA_BINARY_PACKED" for integer columns. _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, column_encoding={'a': "PLAIN", 'b': "DELTA_BINARY_PACKED", - 'c': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "PLAIN"}) # Check "DELTA_LENGTH_BYTE_ARRAY" for byte columns. _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, column_encoding={'a': "PLAIN", 'b': "DELTA_BINARY_PACKED", - 'c': "DELTA_LENGTH_BYTE_ARRAY"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "DELTA_LENGTH_BYTE_ARRAY"}) # Check "DELTA_BYTE_ARRAY" for byte columns. _check_roundtrip(mixed_table, expected=mixed_table, @@ -437,14 +400,12 @@ def test_column_encoding(use_legacy_dataset): column_encoding={'a': "PLAIN", 'b': "DELTA_BINARY_PACKED", 'c': "DELTA_BYTE_ARRAY", - 'd': "DELTA_BYTE_ARRAY"}, - use_legacy_dataset=use_legacy_dataset) + 'd': "DELTA_BYTE_ARRAY"}) # Check "RLE" for boolean columns. 
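# --- Editor's illustrative sketch; not part of this diff ---
# Per-column encodings as exercised by test_column_encoding above; encodings
# are only accepted when dictionary encoding is disabled for those columns.
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({'a': pa.array([float(i) for i in range(100)]),
                  'b': pa.array(list(range(100))),
                  'c': pa.array([str(i) for i in range(100)], type=pa.binary())})

sink = pa.BufferOutputStream()
pq.write_table(table, sink,
               use_dictionary=False,
               column_encoding={'a': 'BYTE_STREAM_SPLIT',      # float/double columns
                                'b': 'DELTA_BINARY_PACKED',    # integer columns
                                'c': 'DELTA_LENGTH_BYTE_ARRAY'})
assert pq.read_table(pa.BufferReader(sink.getvalue())).equals(table)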
_check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, - column_encoding={'e': "RLE"}, - use_legacy_dataset=use_legacy_dataset) + column_encoding={'e': "RLE"}) # Try to pass "BYTE_STREAM_SPLIT" column encoding for integer column 'b'. # This should throw an error as it is only supports FLOAT and DOUBLE. @@ -455,8 +416,7 @@ def test_column_encoding(use_legacy_dataset): use_dictionary=False, column_encoding={'a': "PLAIN", 'b': "BYTE_STREAM_SPLIT", - 'c': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "PLAIN"}) # Try to pass use "DELTA_BINARY_PACKED" encoding on float column. # This should throw an error as only integers are supported. @@ -465,8 +425,7 @@ def test_column_encoding(use_legacy_dataset): use_dictionary=False, column_encoding={'a': "DELTA_BINARY_PACKED", 'b': "PLAIN", - 'c': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "PLAIN"}) # Try to pass "RLE_DICTIONARY". # This should throw an error as dictionary encoding is already used by @@ -474,30 +433,26 @@ def test_column_encoding(use_legacy_dataset): with pytest.raises(ValueError): _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, - column_encoding="RLE_DICTIONARY", - use_legacy_dataset=use_legacy_dataset) + column_encoding="RLE_DICTIONARY") # Try to pass unsupported encoding. with pytest.raises(ValueError): _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, - column_encoding={'a': "MADE_UP_ENCODING"}, - use_legacy_dataset=use_legacy_dataset) + column_encoding={'a': "MADE_UP_ENCODING"}) # Try to pass column_encoding and use_dictionary. # This should throw an error. with pytest.raises(ValueError): _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=['b'], - column_encoding={'b': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + column_encoding={'b': "PLAIN"}) # Try to pass column_encoding and use_dictionary=True (default value). # This should throw an error. with pytest.raises(ValueError): _check_roundtrip(mixed_table, expected=mixed_table, - column_encoding={'b': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + column_encoding={'b': "PLAIN"}) # Try to pass column_encoding and use_byte_stream_split on same column. # This should throw an error. @@ -507,8 +462,7 @@ def test_column_encoding(use_legacy_dataset): use_byte_stream_split=['a'], column_encoding={'a': "RLE", 'b': "BYTE_STREAM_SPLIT", - 'c': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "PLAIN"}) # Try to pass column_encoding and use_byte_stream_split=True. # This should throw an error. @@ -518,54 +472,45 @@ def test_column_encoding(use_legacy_dataset): use_byte_stream_split=True, column_encoding={'a': "RLE", 'b': "BYTE_STREAM_SPLIT", - 'c': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "PLAIN"}) # Try to pass column_encoding=True. # This should throw an error. with pytest.raises(TypeError): _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, - column_encoding=True, - use_legacy_dataset=use_legacy_dataset) + column_encoding=True) -@parametrize_legacy_dataset -def test_compression_level(use_legacy_dataset): +def test_compression_level(): arr = pa.array(list(map(int, range(1000)))) data = [arr, arr] table = pa.Table.from_arrays(data, names=['a', 'b']) # Check one compression level. 
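# --- Editor's illustrative sketch; not part of this diff ---
# Compression settings mirroring test_compression_level above: a per-column
# codec choice, and an explicit per-column gzip level (assuming a build with
# gzip and snappy enabled, as in these tests).
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({'a': list(range(1000)), 'b': list(range(1000))})

sink = pa.BufferOutputStream()
pq.write_table(table, sink, compression={'a': 'gzip', 'b': 'snappy'})
assert pq.read_table(pa.BufferReader(sink.getvalue())).equals(table)

sink = pa.BufferOutputStream()
pq.write_table(table, sink, compression='gzip', compression_level={'a': 2, 'b': 3})
assert pq.read_table(pa.BufferReader(sink.getvalue())).equals(table)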
_check_roundtrip(table, expected=table, compression="gzip", - compression_level=1, - use_legacy_dataset=use_legacy_dataset) + compression_level=1) # Check another one to make sure that compression_level=1 does not # coincide with the default one in Arrow. _check_roundtrip(table, expected=table, compression="gzip", - compression_level=5, - use_legacy_dataset=use_legacy_dataset) + compression_level=5) # Check that the user can provide a compression per column _check_roundtrip(table, expected=table, - compression={'a': "gzip", 'b': "snappy"}, - use_legacy_dataset=use_legacy_dataset) + compression={'a': "gzip", 'b': "snappy"}) # Check that the user can provide a compression level per column _check_roundtrip(table, expected=table, compression="gzip", - compression_level={'a': 2, 'b': 3}, - use_legacy_dataset=use_legacy_dataset) + compression_level={'a': 2, 'b': 3}) # Check if both LZ4 compressors are working # (level < 3 -> fast, level >= 3 -> HC) _check_roundtrip(table, expected=table, compression="lz4", - compression_level=1, - use_legacy_dataset=use_legacy_dataset) + compression_level=1) _check_roundtrip(table, expected=table, compression="lz4", - compression_level=9, - use_legacy_dataset=use_legacy_dataset) + compression_level=9) # Check that specifying a compression level for a codec which does allow # specifying one, results into an error. @@ -594,8 +539,7 @@ def test_sanitized_spark_field_names(): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_multithreaded_read(use_legacy_dataset): +def test_multithreaded_read(): df = alltypes_sample(size=10000) table = pa.Table.from_pandas(df) @@ -604,19 +548,16 @@ def test_multithreaded_read(use_legacy_dataset): _write_table(table, buf, compression='SNAPPY', version='2.6') buf.seek(0) - table1 = _read_table( - buf, use_threads=True, use_legacy_dataset=use_legacy_dataset) + table1 = _read_table(buf, use_threads=True) buf.seek(0) - table2 = _read_table( - buf, use_threads=False, use_legacy_dataset=use_legacy_dataset) + table2 = _read_table(buf, use_threads=False) assert table1.equals(table2) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_min_chunksize(use_legacy_dataset): +def test_min_chunksize(): data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D']) table = pa.Table.from_pandas(data.reset_index()) @@ -624,7 +565,7 @@ def test_min_chunksize(use_legacy_dataset): _write_table(table, buf, chunk_size=-1) buf.seek(0) - result = _read_table(buf, use_legacy_dataset=use_legacy_dataset) + result = _read_table(buf) assert result.equals(table) @@ -659,57 +600,46 @@ def test_write_error_deletes_incomplete_file(tempdir): assert not filename.exists() -@parametrize_legacy_dataset -def test_read_non_existent_file(tempdir, use_legacy_dataset): +def test_read_non_existent_file(tempdir): path = 'nonexistent-file.parquet' try: - pq.read_table(path, use_legacy_dataset=use_legacy_dataset) + pq.read_table(path) except Exception as e: assert path in e.args[0] -@parametrize_legacy_dataset -def test_read_table_doesnt_warn(datadir, use_legacy_dataset): - if use_legacy_dataset: - msg = "Passing 'use_legacy_dataset=True'" - with pytest.warns(FutureWarning, match=msg): - pq.read_table(datadir / 'v0.7.1.parquet', - use_legacy_dataset=use_legacy_dataset) - else: - with warnings.catch_warnings(): - warnings.simplefilter(action="error") - pq.read_table(datadir / 'v0.7.1.parquet', - use_legacy_dataset=use_legacy_dataset) +def test_read_table_doesnt_warn(datadir): + with warnings.catch_warnings(): + warnings.simplefilter(action="error") + 
pq.read_table(datadir / 'v0.7.1.parquet') @pytest.mark.pandas -@parametrize_legacy_dataset -def test_zlib_compression_bug(use_legacy_dataset): +def test_zlib_compression_bug(): # ARROW-3514: "zlib deflate failed, output buffer too small" table = pa.Table.from_arrays([pa.array(['abc', 'def'])], ['some_col']) f = io.BytesIO() pq.write_table(table, f, compression='gzip') f.seek(0) - roundtrip = pq.read_table(f, use_legacy_dataset=use_legacy_dataset) + roundtrip = pq.read_table(f) tm.assert_frame_equal(roundtrip.to_pandas(), table.to_pandas()) -@parametrize_legacy_dataset -def test_parquet_file_too_small(tempdir, use_legacy_dataset): +def test_parquet_file_too_small(tempdir): path = str(tempdir / "test.parquet") # TODO(dataset) with datasets API it raises OSError instead with pytest.raises((pa.ArrowInvalid, OSError), match='size is 0 bytes'): with open(path, 'wb') as f: pass - pq.read_table(path, use_legacy_dataset=use_legacy_dataset) + pq.read_table(path) with pytest.raises((pa.ArrowInvalid, OSError), match='size is 4 bytes'): with open(path, 'wb') as f: f.write(b'ffff') - pq.read_table(path, use_legacy_dataset=use_legacy_dataset) + pq.read_table(path) @pytest.mark.pandas @@ -752,17 +682,15 @@ def test_fastparquet_cross_compatibility(tempdir): tm.assert_frame_equal(table_fp.to_pandas(), df) -@parametrize_legacy_dataset @pytest.mark.parametrize('array_factory', [ lambda: pa.array([0, None] * 10), lambda: pa.array([0, None] * 10).dictionary_encode(), lambda: pa.array(["", None] * 10), lambda: pa.array(["", None] * 10).dictionary_encode(), ]) -@pytest.mark.parametrize('use_dictionary', [False, True]) @pytest.mark.parametrize('read_dictionary', [False, True]) def test_buffer_contents( - array_factory, use_dictionary, read_dictionary, use_legacy_dataset + array_factory, read_dictionary ): # Test that null values are deterministically initialized to zero # after a roundtrip through Parquet. 
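# --- Editor's illustrative sketch; not part of this diff ---
# Reading a column back as dictionary-encoded from an in-memory buffer, as in
# test_direct_read_dictionary / test_buffer_contents above.
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({'col': pa.array(['a', 'b', None] * 10)})
sink = pa.BufferOutputStream()
pq.write_table(table, sink)

result = pq.read_table(pa.BufferReader(sink.getvalue()),
                       read_dictionary=['col'], use_threads=False)
print(result['col'].type)   # typically dictionary<values=string, indices=int32>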
@@ -773,8 +701,7 @@ def test_buffer_contents( bio.seek(0) read_dictionary = ['col'] if read_dictionary else None table = pq.read_table(bio, use_threads=False, - read_dictionary=read_dictionary, - use_legacy_dataset=use_legacy_dataset) + read_dictionary=read_dictionary) for col in table.columns: [chunk] = col.chunks @@ -826,7 +753,6 @@ def test_reads_over_batch(tempdir): assert table == table2 -@pytest.mark.dataset def test_permutation_of_column_order(tempdir): # ARROW-2366 case = tempdir / "dataset_column_order_permutation" @@ -846,18 +772,6 @@ def test_permutation_of_column_order(tempdir): assert table == table2 -def test_read_table_legacy_deprecated(tempdir): - # ARROW-15870 - table = pa.table({'a': [1, 2, 3]}) - path = tempdir / 'data.parquet' - pq.write_table(table, path) - - with pytest.warns( - FutureWarning, match="Passing 'use_legacy_dataset=True'" - ): - pq.read_table(path, use_legacy_dataset=True) - - def test_thrift_size_limits(tempdir): path = tempdir / 'largethrift.parquet' @@ -942,28 +856,9 @@ def test_page_checksum_verification_write_table(tempdir): with pytest.raises(OSError, match="CRC checksum verification"): _ = corrupted_pq_file.read() - # Case 5: Check that enabling page checksum verification in combination - # with legacy dataset raises an exception - with pytest.raises(ValueError, match="page_checksum_verification"): - _ = pq.read_table(corrupted_path, - page_checksum_verification=True, - use_legacy_dataset=True) - @pytest.mark.dataset -@pytest.mark.parametrize( - "use_legacy_dataset", - [ - False, - pytest.param( - True, - marks=pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning" - ), - ), - ], -) -def test_checksum_write_to_dataset(tempdir, use_legacy_dataset): +def test_checksum_write_to_dataset(tempdir): """Check that checksum verification works for datasets created with pq.write_to_dataset""" @@ -973,8 +868,7 @@ def test_checksum_write_to_dataset(tempdir, use_legacy_dataset): original_dir_path = tempdir / 'correct_dir' pq.write_to_dataset(table_orig, original_dir_path, - write_page_checksum=True, - use_legacy_dataset=use_legacy_dataset) + write_page_checksum=True) # Read file and verify that the data is correct original_file_path_list = list(original_dir_path.iterdir()) @@ -1014,3 +908,23 @@ def test_checksum_write_to_dataset(tempdir, use_legacy_dataset): # checksum verification enabled raises an exception with pytest.raises(OSError, match="CRC checksum verification"): _ = pq.read_table(corrupted_file_path, page_checksum_verification=True) + + +@pytest.mark.dataset +def test_deprecated_use_legacy_dataset(tempdir): + # Test that specifying use_legacy_dataset in ParquetDataset, write_to_dataset + # and read_table doesn't raise an error but gives a warning. 
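# --- Editor's illustrative sketch; not part of this diff ---
# Page checksums as exercised by the checksum tests above: opt in at write time
# and verify at read time; a corrupted page would raise an error while reading.
# 'checksummed.parquet' is a hypothetical path.
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({'a': list(range(100))})
pq.write_table(table, 'checksummed.parquet', write_page_checksum=True)

result = pq.read_table('checksummed.parquet', page_checksum_verification=True)
assert result.equals(table)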
+ table = pa.table({"a": [1, 2, 3]}) + path = tempdir / "deprecate_legacy" + + msg = "Passing 'use_legacy_dataset'" + with pytest.warns(FutureWarning, match=msg): + pq.write_to_dataset(table, path, use_legacy_dataset=False) + + pq.write_to_dataset(table, path) + + with pytest.warns(FutureWarning, match=msg): + pq.read_table(path, use_legacy_dataset=False) + + with pytest.warns(FutureWarning, match=msg): + pq.ParquetDataset(path, use_legacy_dataset=False) diff --git a/python/pyarrow/tests/parquet/test_compliant_nested_type.py b/python/pyarrow/tests/parquet/test_compliant_nested_type.py index ca1ad7ee32255..2345855a3321b 100644 --- a/python/pyarrow/tests/parquet/test_compliant_nested_type.py +++ b/python/pyarrow/tests/parquet/test_compliant_nested_type.py @@ -18,7 +18,6 @@ import pytest import pyarrow as pa -from pyarrow.tests.parquet.common import parametrize_legacy_dataset try: import pyarrow.parquet as pq @@ -58,16 +57,13 @@ @pytest.mark.pandas -@parametrize_legacy_dataset @parametrize_test_data -def test_write_compliant_nested_type_enable(tempdir, - use_legacy_dataset, test_data): +def test_write_compliant_nested_type_enable(tempdir, test_data): # prepare dataframe for testing df = pd.DataFrame(data=test_data) # verify that we can read/write pandas df with new flag (default behaviour) _roundtrip_pandas_dataframe(df, - write_kwargs={}, - use_legacy_dataset=use_legacy_dataset) + write_kwargs={}) # Write to a parquet file with compliant nested type table = pa.Table.from_pandas(df, preserve_index=False) @@ -83,21 +79,17 @@ def test_write_compliant_nested_type_enable(tempdir, assert new_table.schema.types[0].value_field.name == 'element' # Verify that the new table can be read/written correctly - _check_roundtrip(new_table, - use_legacy_dataset=use_legacy_dataset) + _check_roundtrip(new_table) @pytest.mark.pandas -@parametrize_legacy_dataset @parametrize_test_data -def test_write_compliant_nested_type_disable(tempdir, - use_legacy_dataset, test_data): +def test_write_compliant_nested_type_disable(tempdir, test_data): # prepare dataframe for testing df = pd.DataFrame(data=test_data) # verify that we can read/write with new flag disabled _roundtrip_pandas_dataframe(df, write_kwargs={ - 'use_compliant_nested_type': False}, - use_legacy_dataset=use_legacy_dataset) + 'use_compliant_nested_type': False}) # Write to a parquet file while disabling compliant nested type table = pa.Table.from_pandas(df, preserve_index=False) @@ -114,5 +106,4 @@ def test_write_compliant_nested_type_disable(tempdir, # Verify that the new table can be read/written correctly _check_roundtrip(new_table, - use_legacy_dataset=use_legacy_dataset, use_compliant_nested_type=False) diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index 32fe128bbae9b..e6b66b00428fb 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -23,8 +23,7 @@ import pyarrow as pa from pyarrow.tests import util -from pyarrow.tests.parquet.common import (_check_roundtrip, - parametrize_legacy_dataset) +from pyarrow.tests.parquet.common import _check_roundtrip try: import pyarrow.parquet as pq @@ -54,9 +53,8 @@ @pytest.mark.pandas -@parametrize_legacy_dataset @pytest.mark.parametrize('chunk_size', [None, 1000]) -def test_parquet_2_0_roundtrip(tempdir, chunk_size, use_legacy_dataset): +def test_parquet_2_0_roundtrip(tempdir, chunk_size): df = alltypes_sample(size=10000, categorical=True) filename = tempdir / 'pandas_roundtrip.parquet' @@ 
-65,8 +63,7 @@ def test_parquet_2_0_roundtrip(tempdir, chunk_size, use_legacy_dataset): _write_table(arrow_table, filename, version='2.6', chunk_size=chunk_size) - table_read = pq.read_pandas( - filename, use_legacy_dataset=use_legacy_dataset) + table_read = pq.read_pandas(filename) assert table_read.schema.pandas_metadata is not None read_metadata = table_read.schema.metadata @@ -77,8 +74,7 @@ def test_parquet_2_0_roundtrip(tempdir, chunk_size, use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_parquet_1_0_roundtrip(tempdir, use_legacy_dataset): +def test_parquet_1_0_roundtrip(tempdir): size = 10000 np.random.seed(0) df = pd.DataFrame({ @@ -100,7 +96,7 @@ def test_parquet_1_0_roundtrip(tempdir, use_legacy_dataset): filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) _write_table(arrow_table, filename, version='1.0') - table_read = _read_table(filename, use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(filename) df_read = table_read.to_pandas() # We pass uint32_t as int64_t if we write Parquet version 1.0 @@ -113,18 +109,17 @@ def test_parquet_1_0_roundtrip(tempdir, use_legacy_dataset): # ----------------------------------------------------------------------------- -def _simple_table_write_read(table, use_legacy_dataset): +def _simple_table_write_read(table): bio = pa.BufferOutputStream() pq.write_table(table, bio) contents = bio.getvalue() return pq.read_table( - pa.BufferReader(contents), use_legacy_dataset=use_legacy_dataset + pa.BufferReader(contents) ) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_direct_read_dictionary(use_legacy_dataset): +def test_direct_read_dictionary(): # ARROW-3325 repeats = 10 nunique = 5 @@ -140,8 +135,7 @@ def test_direct_read_dictionary(use_legacy_dataset): contents = bio.getvalue() result = pq.read_table(pa.BufferReader(contents), - read_dictionary=['f0'], - use_legacy_dataset=use_legacy_dataset) + read_dictionary=['f0']) # Compute dictionary-encoded subfield expected = pa.table([table[0].dictionary_encode()], names=['f0']) @@ -149,8 +143,7 @@ def test_direct_read_dictionary(use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_direct_read_dictionary_subfield(use_legacy_dataset): +def test_direct_read_dictionary_subfield(): repeats = 10 nunique = 5 @@ -163,8 +156,7 @@ def test_direct_read_dictionary_subfield(use_legacy_dataset): pq.write_table(table, bio) contents = bio.getvalue() result = pq.read_table(pa.BufferReader(contents), - read_dictionary=['f0.list.element'], - use_legacy_dataset=use_legacy_dataset) + read_dictionary=['f0.list.element']) arr = pa.array(data[0]) values_as_dict = arr.values.dictionary_encode() @@ -181,8 +173,7 @@ def test_direct_read_dictionary_subfield(use_legacy_dataset): assert result[0].num_chunks == 1 -@parametrize_legacy_dataset -def test_dictionary_array_automatically_read(use_legacy_dataset): +def test_dictionary_array_automatically_read(): # ARROW-3246 # Make a large dictionary, a little over 4MB of data @@ -200,7 +191,7 @@ def test_dictionary_array_automatically_read(use_legacy_dataset): dict_values)) table = pa.table([pa.chunked_array(chunks)], names=['f0']) - result = _simple_table_write_read(table, use_legacy_dataset) + result = _simple_table_write_read(table) assert result.equals(table) @@ -213,8 +204,7 @@ def test_dictionary_array_automatically_read(use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_decimal_roundtrip(tempdir, use_legacy_dataset): +def 
test_decimal_roundtrip(tempdir): num_values = 10 columns = {} @@ -234,8 +224,7 @@ def test_decimal_roundtrip(tempdir, use_legacy_dataset): string_filename = str(filename) table = pa.Table.from_pandas(expected) _write_table(table, string_filename) - result_table = _read_table( - string_filename, use_legacy_dataset=use_legacy_dataset) + result_table = _read_table(string_filename) result = result_table.to_pandas() tm.assert_frame_equal(result, expected) @@ -259,14 +248,13 @@ def test_decimal_roundtrip_negative_scale(tempdir): # ----------------------------------------------------------------------------- -@parametrize_legacy_dataset @pytest.mark.parametrize('dtype', [int, float]) -def test_single_pylist_column_roundtrip(tempdir, dtype, use_legacy_dataset): +def test_single_pylist_column_roundtrip(tempdir, dtype,): filename = tempdir / 'single_{}_column.parquet'.format(dtype.__name__) data = [pa.array(list(map(dtype, range(5))))] table = pa.Table.from_arrays(data, names=['a']) _write_table(table, filename) - table_read = _read_table(filename, use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(filename) for i in range(table.num_columns): col_written = table[i] col_read = table_read[i] @@ -277,16 +265,14 @@ def test_single_pylist_column_roundtrip(tempdir, dtype, use_legacy_dataset): assert data_written.equals(data_read) -@parametrize_legacy_dataset -def test_empty_lists_table_roundtrip(use_legacy_dataset): +def test_empty_lists_table_roundtrip(): # ARROW-2744: Shouldn't crash when writing an array of empty lists arr = pa.array([[], []], type=pa.list_(pa.int32())) table = pa.Table.from_arrays([arr], ["A"]) - _check_roundtrip(table, use_legacy_dataset=use_legacy_dataset) + _check_roundtrip(table) -@parametrize_legacy_dataset -def test_nested_list_nonnullable_roundtrip_bug(use_legacy_dataset): +def test_nested_list_nonnullable_roundtrip_bug(): # Reproduce failure in ARROW-5630 typ = pa.list_(pa.field("item", pa.float32(), False)) num_rows = 10000 @@ -295,26 +281,22 @@ def test_nested_list_nonnullable_roundtrip_bug(use_legacy_dataset): (num_rows // 10)), type=typ) ], ['a']) _check_roundtrip( - t, data_page_size=4096, use_legacy_dataset=use_legacy_dataset) + t, data_page_size=4096) -@parametrize_legacy_dataset -def test_nested_list_struct_multiple_batches_roundtrip( - tempdir, use_legacy_dataset -): +def test_nested_list_struct_multiple_batches_roundtrip(tempdir): # Reproduce failure in ARROW-11024 data = [[{'x': 'abc', 'y': 'abc'}]]*100 + [[{'x': 'abc', 'y': 'gcb'}]]*100 table = pa.table([pa.array(data)], names=['column']) _check_roundtrip( - table, row_group_size=20, use_legacy_dataset=use_legacy_dataset) + table, row_group_size=20) # Reproduce failure in ARROW-11069 (plain non-nested structs with strings) data = pa.array( [{'a': '1', 'b': '2'}, {'a': '3', 'b': '4'}, {'a': '5', 'b': '6'}]*10 ) table = pa.table({'column': data}) - _check_roundtrip( - table, row_group_size=10, use_legacy_dataset=use_legacy_dataset) + _check_roundtrip(table, row_group_size=10) def test_writing_empty_lists(): @@ -366,8 +348,7 @@ def test_large_list_records(): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_parquet_nested_convenience(tempdir, use_legacy_dataset): +def test_parquet_nested_convenience(tempdir): # ARROW-1684 df = pd.DataFrame({ 'a': [[1, 2, 3], None, [4, 5], []], @@ -380,11 +361,11 @@ def test_parquet_nested_convenience(tempdir, use_legacy_dataset): _write_table(table, path) read = pq.read_table( - path, columns=['a'], use_legacy_dataset=use_legacy_dataset) + path, columns=['a']) 
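# --- editorial sketch, not part of the patch ---------------------------------
# A minimal illustration of the call pattern these tests converge on once the
# 'use_legacy_dataset' keyword is dropped: a plain write/read round trip with
# column selection and no dataset-related keywords. The file name below is a
# hypothetical stand-in.
import pyarrow as pa
import pyarrow.parquet as pq

sketch = pa.table({"a": [[1, 2, 3], None], "b": [[4.5], []]})
pq.write_table(sketch, "example.parquet")
subset = pq.read_table("example.parquet", columns=["a"])
assert subset.column_names == ["a"]
# ------------------------------------------------------------------------------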
tm.assert_frame_equal(read.to_pandas(), df[['a']]) read = pq.read_table( - path, columns=['a', 'b'], use_legacy_dataset=use_legacy_dataset) + path, columns=['a', 'b']) tm.assert_frame_equal(read.to_pandas(), df) @@ -420,17 +401,16 @@ def test_large_table_int32_overflow(): _write_table(table, f) -def _simple_table_roundtrip(table, use_legacy_dataset=False, **write_kwargs): +def _simple_table_roundtrip(table, **write_kwargs): stream = pa.BufferOutputStream() _write_table(table, stream, **write_kwargs) buf = stream.getvalue() - return _read_table(buf, use_legacy_dataset=use_legacy_dataset) + return _read_table(buf) @pytest.mark.slow @pytest.mark.large_memory -@parametrize_legacy_dataset -def test_byte_array_exactly_2gb(use_legacy_dataset): +def test_byte_array_exactly_2gb(): # Test edge case reported in ARROW-3762 val = b'x' * (1 << 10) @@ -444,15 +424,14 @@ def test_byte_array_exactly_2gb(use_legacy_dataset): values = pa.chunked_array([base, pa.array(case)]) t = pa.table([values], names=['f0']) result = _simple_table_roundtrip( - t, use_legacy_dataset=use_legacy_dataset, use_dictionary=False) + t, use_dictionary=False) assert t.equals(result) @pytest.mark.slow @pytest.mark.pandas @pytest.mark.large_memory -@parametrize_legacy_dataset -def test_binary_array_overflow_to_chunked(use_legacy_dataset): +def test_binary_array_overflow_to_chunked(): # ARROW-3762 # 2^31 + 1 bytes @@ -462,8 +441,7 @@ def test_binary_array_overflow_to_chunked(use_legacy_dataset): df = pd.DataFrame({'byte_col': values}) tbl = pa.Table.from_pandas(df, preserve_index=False) - read_tbl = _simple_table_roundtrip( - tbl, use_legacy_dataset=use_legacy_dataset) + read_tbl = _simple_table_roundtrip(tbl) col0_data = read_tbl[0] assert isinstance(col0_data, pa.ChunkedArray) @@ -477,8 +455,7 @@ def test_binary_array_overflow_to_chunked(use_legacy_dataset): @pytest.mark.slow @pytest.mark.pandas @pytest.mark.large_memory -@parametrize_legacy_dataset -def test_list_of_binary_large_cell(use_legacy_dataset): +def test_list_of_binary_large_cell(): # ARROW-4688 data = [] @@ -491,8 +468,7 @@ def test_list_of_binary_large_cell(use_legacy_dataset): arr = pa.array(data) table = pa.Table.from_arrays([arr], ['chunky_cells']) - read_table = _simple_table_roundtrip( - table, use_legacy_dataset=use_legacy_dataset) + read_table = _simple_table_roundtrip(table) assert table.equals(read_table) diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py index a9e99d5d65cf9..b6e351bdef9a7 100644 --- a/python/pyarrow/tests/parquet/test_dataset.py +++ b/python/pyarrow/tests/parquet/test_dataset.py @@ -29,9 +29,6 @@ from pyarrow import fs from pyarrow.filesystem import LocalFileSystem from pyarrow.tests import util -from pyarrow.tests.parquet.common import ( - parametrize_legacy_dataset, parametrize_legacy_dataset_fixed, - parametrize_legacy_dataset_not_supported) from pyarrow.util import guid from pyarrow.vendored.version import Version @@ -53,76 +50,10 @@ # Marks all of the tests in this module # Ignore these with pytest ... 
-m 'not parquet' -pytestmark = pytest.mark.parquet +pytestmark = [pytest.mark.parquet, pytest.mark.dataset] -@pytest.mark.pandas -def test_parquet_piece_read(tempdir): - df = _test_dataframe(1000) - table = pa.Table.from_pandas(df) - - path = tempdir / 'parquet_piece_read.parquet' - _write_table(table, path, version='2.6') - - with pytest.warns(FutureWarning): - piece1 = pq.ParquetDatasetPiece(path) - - result = piece1.read() - assert result.equals(table) - - -@pytest.mark.pandas -def test_parquet_piece_open_and_get_metadata(tempdir): - df = _test_dataframe(100) - table = pa.Table.from_pandas(df) - - path = tempdir / 'parquet_piece_read.parquet' - _write_table(table, path, version='2.6') - - with pytest.warns(FutureWarning): - piece = pq.ParquetDatasetPiece(path) - - table1 = piece.read() - assert isinstance(table1, pa.Table) - meta1 = piece.get_metadata() - assert isinstance(meta1, pq.FileMetaData) - - assert table.equals(table1) - - -@pytest.mark.filterwarnings("ignore:ParquetDatasetPiece:FutureWarning") -def test_parquet_piece_basics(): - path = '/baz.parq' - - piece1 = pq.ParquetDatasetPiece(path) - piece2 = pq.ParquetDatasetPiece(path, row_group=1) - piece3 = pq.ParquetDatasetPiece( - path, row_group=1, partition_keys=[('foo', 0), ('bar', 1)]) - - assert str(piece1) == path - assert str(piece2) == '/baz.parq | row_group=1' - assert str(piece3) == 'partition[foo=0, bar=1] /baz.parq | row_group=1' - - assert piece1 == piece1 - assert piece2 == piece2 - assert piece3 == piece3 - assert piece1 != piece3 - - -def test_partition_set_dictionary_type(): - set1 = pq.PartitionSet('key1', ['foo', 'bar', 'baz']) - set2 = pq.PartitionSet('key2', [2007, 2008, 2009]) - - assert isinstance(set1.dictionary, pa.StringArray) - assert isinstance(set2.dictionary, pa.IntegerArray) - - set3 = pq.PartitionSet('key2', [datetime.datetime(2007, 1, 1)]) - with pytest.raises(TypeError): - set3.dictionary - - -@parametrize_legacy_dataset_fixed -def test_filesystem_uri(tempdir, use_legacy_dataset): +def test_filesystem_uri(tempdir): table = pa.table({"a": [1, 2, 3]}) directory = tempdir / "data_dir" @@ -132,72 +63,36 @@ def test_filesystem_uri(tempdir, use_legacy_dataset): # filesystem object result = pq.read_table( - path, filesystem=fs.LocalFileSystem(), - use_legacy_dataset=use_legacy_dataset) + path, filesystem=fs.LocalFileSystem()) assert result.equals(table) # filesystem URI result = pq.read_table( - "data_dir/data.parquet", filesystem=util._filesystem_uri(tempdir), - use_legacy_dataset=use_legacy_dataset) + "data_dir/data.parquet", filesystem=util._filesystem_uri(tempdir)) assert result.equals(table) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_read_partitioned_directory(tempdir, use_legacy_dataset): +def test_read_partitioned_directory(tempdir): fs = LocalFileSystem._get_instance() - _partition_test_for_filesystem(fs, tempdir, use_legacy_dataset) + _partition_test_for_filesystem(fs, tempdir) -@pytest.mark.filterwarnings("ignore:'ParquetDataset:FutureWarning") @pytest.mark.pandas -def test_create_parquet_dataset_multi_threaded(tempdir): - fs = LocalFileSystem._get_instance() - base_path = tempdir - - _partition_test_for_filesystem(fs, base_path) - - manifest = pq.ParquetManifest(base_path, filesystem=fs, - metadata_nthreads=1) - with pytest.warns( - FutureWarning, match="Specifying the 'metadata_nthreads'" - ): - dataset = pq.ParquetDataset( - base_path, filesystem=fs, metadata_nthreads=16, - use_legacy_dataset=True - ) - assert len(dataset.pieces) > 0 - partitions = dataset.partitions - assert 
len(partitions.partition_names) > 0 - assert partitions.partition_names == manifest.partitions.partition_names - assert len(partitions.levels) == len(manifest.partitions.levels) - - -@pytest.mark.pandas -@parametrize_legacy_dataset -def test_read_partitioned_columns_selection(tempdir, use_legacy_dataset): +def test_read_partitioned_columns_selection(tempdir): # ARROW-3861 - do not include partition columns in resulting table when # `columns` keyword was passed without those columns fs = LocalFileSystem._get_instance() base_path = tempdir _partition_test_for_filesystem(fs, base_path) - dataset = pq.ParquetDataset( - base_path, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(base_path) result = dataset.read(columns=["values"]) - if use_legacy_dataset: - # ParquetDataset implementation always includes the partition columns - # automatically, and we can't easily "fix" this since dask relies on - # this behaviour (ARROW-8644) - assert result.column_names == ["values", "foo", "bar"] - else: - assert result.column_names == ["values"] + assert result.column_names == ["values"] @pytest.mark.pandas -@parametrize_legacy_dataset -def test_filters_equivalency(tempdir, use_legacy_dataset): +def test_filters_equivalency(tempdir): fs = LocalFileSystem._get_instance() base_path = tempdir @@ -225,7 +120,6 @@ def test_filters_equivalency(tempdir, use_legacy_dataset): base_path, filesystem=fs, filters=[('integer', '=', 1), ('string', '!=', 'b'), ('boolean', '==', 'True')], - use_legacy_dataset=use_legacy_dataset, ) table = dataset.read() result_df = (table.to_pandas().reset_index(drop=True)) @@ -247,8 +141,7 @@ def test_filters_equivalency(tempdir, use_legacy_dataset): [('integer', '=', 0), ('boolean', '==', 'False')] ] dataset = pq.ParquetDataset( - base_path, filesystem=fs, filters=filters, - use_legacy_dataset=use_legacy_dataset) + base_path, filesystem=fs, filters=filters) table = dataset.read() result_df = table.to_pandas().reset_index(drop=True) @@ -262,30 +155,15 @@ def test_filters_equivalency(tempdir, use_legacy_dataset): assert df_filter_2.sum() > 0 assert result_df.shape[0] == (df_filter_1.sum() + df_filter_2.sum()) - if use_legacy_dataset: - # Check for \0 in predicate values. Until they are correctly - # implemented in ARROW-3391, they would otherwise lead to weird - # results with the current code. 
- with pytest.raises(NotImplementedError): - filters = [[('string', '==', b'1\0a')]] - pq.ParquetDataset(base_path, filesystem=fs, filters=filters, - use_legacy_dataset=True) - with pytest.raises(NotImplementedError): - filters = [[('string', '==', '1\0a')]] - pq.ParquetDataset(base_path, filesystem=fs, filters=filters, - use_legacy_dataset=True) - else: - for filters in [[[('string', '==', b'1\0a')]], - [[('string', '==', '1\0a')]]]: - dataset = pq.ParquetDataset( - base_path, filesystem=fs, filters=filters, - use_legacy_dataset=False) - assert dataset.read().num_rows == 0 + for filters in [[[('string', '==', b'1\0a')]], + [[('string', '==', '1\0a')]]]: + dataset = pq.ParquetDataset( + base_path, filesystem=fs, filters=filters) + assert dataset.read().num_rows == 0 @pytest.mark.pandas -@parametrize_legacy_dataset -def test_filters_cutoff_exclusive_integer(tempdir, use_legacy_dataset): +def test_filters_cutoff_exclusive_integer(tempdir): fs = LocalFileSystem._get_instance() base_path = tempdir @@ -308,7 +186,6 @@ def test_filters_cutoff_exclusive_integer(tempdir, use_legacy_dataset): ('integers', '<', 4), ('integers', '>', 1), ], - use_legacy_dataset=use_legacy_dataset ) table = dataset.read() result_df = (table.to_pandas() @@ -319,15 +196,14 @@ def test_filters_cutoff_exclusive_integer(tempdir, use_legacy_dataset): assert result_list == [2, 3] -@pytest.mark.pandas -@parametrize_legacy_dataset @pytest.mark.xfail( # different error with use_legacy_datasets because result_df is no longer # categorical raises=(TypeError, AssertionError), reason='Loss of type information in creation of categoricals.' ) -def test_filters_cutoff_exclusive_datetime(tempdir, use_legacy_dataset): +@pytest.mark.pandas +def test_filters_cutoff_exclusive_datetime(tempdir): fs = LocalFileSystem._get_instance() base_path = tempdir @@ -356,7 +232,6 @@ def test_filters_cutoff_exclusive_datetime(tempdir, use_legacy_dataset): ('dates', '<', "2018-04-12"), ('dates', '>', "2018-04-10") ], - use_legacy_dataset=use_legacy_dataset ) table = dataset.read() result_df = (table.to_pandas() @@ -371,7 +246,6 @@ def test_filters_cutoff_exclusive_datetime(tempdir, use_legacy_dataset): @pytest.mark.pandas -@pytest.mark.dataset def test_filters_inclusive_datetime(tempdir): # ARROW-11480 path = tempdir / 'timestamps.parquet' @@ -389,8 +263,7 @@ def test_filters_inclusive_datetime(tempdir): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_filters_inclusive_integer(tempdir, use_legacy_dataset): +def test_filters_inclusive_integer(tempdir): fs = LocalFileSystem._get_instance() base_path = tempdir @@ -413,7 +286,6 @@ def test_filters_inclusive_integer(tempdir, use_legacy_dataset): ('integers', '<=', 3), ('integers', '>=', 2), ], - use_legacy_dataset=use_legacy_dataset ) table = dataset.read() result_df = (table.to_pandas() @@ -425,8 +297,7 @@ def test_filters_inclusive_integer(tempdir, use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_filters_inclusive_set(tempdir, use_legacy_dataset): +def test_filters_inclusive_set(tempdir): fs = LocalFileSystem._get_instance() base_path = tempdir @@ -451,7 +322,6 @@ def test_filters_inclusive_set(tempdir, use_legacy_dataset): dataset = pq.ParquetDataset( base_path, filesystem=fs, filters=[('string', 'in', 'ab')], - use_legacy_dataset=use_legacy_dataset ) table = dataset.read() result_df = (table.to_pandas().reset_index(drop=True)) @@ -464,7 +334,6 @@ def test_filters_inclusive_set(tempdir, use_legacy_dataset): base_path, filesystem=fs, filters=[('integer', 'in', [1]), 
('string', 'in', ('a', 'b')), ('boolean', 'not in', {'False'})], - use_legacy_dataset=use_legacy_dataset ) table = dataset.read() result_df = (table.to_pandas().reset_index(drop=True)) @@ -475,8 +344,7 @@ def test_filters_inclusive_set(tempdir, use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_filters_invalid_pred_op(tempdir, use_legacy_dataset): +def test_filters_invalid_pred_op(tempdir): fs = LocalFileSystem._get_instance() base_path = tempdir @@ -496,49 +364,30 @@ def test_filters_invalid_pred_op(tempdir, use_legacy_dataset): with pytest.raises(TypeError): pq.ParquetDataset(base_path, filesystem=fs, - filters=[('integers', 'in', 3), ], - use_legacy_dataset=use_legacy_dataset) + filters=[('integers', 'in', 3), ]) with pytest.raises(ValueError): pq.ParquetDataset(base_path, filesystem=fs, - filters=[('integers', '=<', 3), ], - use_legacy_dataset=use_legacy_dataset) - - if use_legacy_dataset: - with pytest.raises(ValueError): - pq.ParquetDataset(base_path, - filesystem=fs, - filters=[('integers', 'in', set()), ], - use_legacy_dataset=use_legacy_dataset) - else: - # Dataset API returns empty table instead - dataset = pq.ParquetDataset(base_path, - filesystem=fs, - filters=[('integers', 'in', set()), ], - use_legacy_dataset=use_legacy_dataset) - assert dataset.read().num_rows == 0 + filters=[('integers', '=<', 3), ]) - if use_legacy_dataset: - with pytest.raises(ValueError): - pq.ParquetDataset(base_path, - filesystem=fs, - filters=[('integers', '!=', {3})], - use_legacy_dataset=use_legacy_dataset) - else: - dataset = pq.ParquetDataset(base_path, - filesystem=fs, - filters=[('integers', '!=', {3})], - use_legacy_dataset=use_legacy_dataset) - with pytest.raises(NotImplementedError): - assert dataset.read().num_rows == 0 + # Dataset API returns empty table + dataset = pq.ParquetDataset(base_path, + filesystem=fs, + filters=[('integers', 'in', set()), ]) + assert dataset.read().num_rows == 0 + + dataset = pq.ParquetDataset(base_path, + filesystem=fs, + filters=[('integers', '!=', {3})]) + with pytest.raises(NotImplementedError): + assert dataset.read().num_rows == 0 @pytest.mark.pandas -@parametrize_legacy_dataset_fixed -def test_filters_invalid_column(tempdir, use_legacy_dataset): +def test_filters_invalid_column(tempdir): # ARROW-5572 - raise error on invalid name in filter specification - # works with new dataset / xfail with legacy implementation + # works with new dataset fs = LocalFileSystem._get_instance() base_path = tempdir @@ -556,12 +405,10 @@ def test_filters_invalid_column(tempdir, use_legacy_dataset): msg = r"No match for FieldRef.Name\(non_existent_column\)" with pytest.raises(ValueError, match=msg): pq.ParquetDataset(base_path, filesystem=fs, - filters=[('non_existent_column', '<', 3), ], - use_legacy_dataset=use_legacy_dataset).read() + filters=[('non_existent_column', '<', 3), ]).read() @pytest.mark.pandas -@parametrize_legacy_dataset @pytest.mark.parametrize("filters", ([('integers', '<', 3)], [[('integers', '<', 3)]], @@ -569,7 +416,7 @@ def test_filters_invalid_column(tempdir, use_legacy_dataset): pc.field('nested', 'a') < 3, pc.field('nested', 'b').cast(pa.int64()) < 3)) @pytest.mark.parametrize("read_method", ("read_table", "read_pandas")) -def test_filters_read_table(tempdir, use_legacy_dataset, filters, read_method): +def test_filters_read_table(tempdir, filters, read_method): read = getattr(pq, read_method) # test that filters keyword is passed through in read_table fs = LocalFileSystem._get_instance() @@ -589,24 +436,15 @@ def 
test_filters_read_table(tempdir, use_legacy_dataset, filters, read_method): _generate_partition_directories(fs, base_path, partition_spec, df) - kwargs = dict(filesystem=fs, filters=filters, - use_legacy_dataset=use_legacy_dataset) + kwargs = dict(filesystem=fs, filters=filters) - # Using Expression in legacy dataset not supported - if use_legacy_dataset and isinstance(filters, pc.Expression): - msg = "Expressions as filter not supported for legacy dataset" - with pytest.raises(TypeError, match=msg): - read(base_path, **kwargs) - else: - table = read(base_path, **kwargs) - assert table.num_rows == 3 + table = read(base_path, **kwargs) + assert table.num_rows == 3 @pytest.mark.pandas -@parametrize_legacy_dataset_fixed -def test_partition_keys_with_underscores(tempdir, use_legacy_dataset): +def test_partition_keys_with_underscores(tempdir): # ARROW-5666 - partition field values with underscores preserve underscores - # xfail with legacy dataset -> they get interpreted as integers fs = LocalFileSystem._get_instance() base_path = tempdir @@ -623,60 +461,47 @@ def test_partition_keys_with_underscores(tempdir, use_legacy_dataset): _generate_partition_directories(fs, base_path, partition_spec, df) - dataset = pq.ParquetDataset( - base_path, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(base_path) result = dataset.read() assert result.column("year_week").to_pylist() == string_keys @pytest.mark.s3 -@parametrize_legacy_dataset -def test_read_s3fs(s3_example_s3fs, use_legacy_dataset): +def test_read_s3fs(s3_example_s3fs, ): fs, path = s3_example_s3fs path = path + "/test.parquet" table = pa.table({"a": [1, 2, 3]}) _write_table(table, path, filesystem=fs) - result = _read_table( - path, filesystem=fs, use_legacy_dataset=use_legacy_dataset - ) + result = _read_table(path, filesystem=fs) assert result.equals(table) @pytest.mark.s3 -@parametrize_legacy_dataset -def test_read_directory_s3fs(s3_example_s3fs, use_legacy_dataset): +def test_read_directory_s3fs(s3_example_s3fs): fs, directory = s3_example_s3fs path = directory + "/test.parquet" table = pa.table({"a": [1, 2, 3]}) _write_table(table, path, filesystem=fs) - result = _read_table( - directory, filesystem=fs, use_legacy_dataset=use_legacy_dataset - ) + result = _read_table(directory, filesystem=fs) assert result.equals(table) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_read_single_file_list(tempdir, use_legacy_dataset): +def test_read_single_file_list(tempdir): data_path = str(tempdir / 'data.parquet') table = pa.table({"a": [1, 2, 3]}) _write_table(table, data_path) - result = pq.ParquetDataset( - [data_path], use_legacy_dataset=use_legacy_dataset - ).read() + result = pq.ParquetDataset([data_path]).read() assert result.equals(table) @pytest.mark.pandas @pytest.mark.s3 -@parametrize_legacy_dataset -def test_read_partitioned_directory_s3fs_wrapper( - s3_example_s3fs, use_legacy_dataset -): +def test_read_partitioned_directory_s3fs_wrapper(s3_example_s3fs): import s3fs from pyarrow.filesystem import S3FSWrapper @@ -690,23 +515,18 @@ def test_read_partitioned_directory_s3fs_wrapper( _partition_test_for_filesystem(wrapper, path) # Check that we can auto-wrap - dataset = pq.ParquetDataset( - path, filesystem=fs, use_legacy_dataset=use_legacy_dataset - ) + dataset = pq.ParquetDataset(path, filesystem=fs) dataset.read() @pytest.mark.pandas @pytest.mark.s3 -@parametrize_legacy_dataset -def test_read_partitioned_directory_s3fs(s3_example_s3fs, use_legacy_dataset): +def 
test_read_partitioned_directory_s3fs(s3_example_s3fs): fs, path = s3_example_s3fs - _partition_test_for_filesystem( - fs, path, use_legacy_dataset=use_legacy_dataset - ) + _partition_test_for_filesystem(fs, path) -def _partition_test_for_filesystem(fs, base_path, use_legacy_dataset=True): +def _partition_test_for_filesystem(fs, base_path): foo_keys = [0, 1] bar_keys = ['a', 'b', 'c'] partition_spec = [ @@ -724,8 +544,7 @@ def _partition_test_for_filesystem(fs, base_path, use_legacy_dataset=True): _generate_partition_directories(fs, base_path, partition_spec, df) - dataset = pq.ParquetDataset( - base_path, filesystem=fs, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(base_path, filesystem=fs) table = dataset.read() result_df = (table.to_pandas() .sort_values(by='index') @@ -735,15 +554,11 @@ def _partition_test_for_filesystem(fs, base_path, use_legacy_dataset=True): .reset_index(drop=True) .reindex(columns=result_df.columns)) - if use_legacy_dataset or Version(pd.__version__) < Version("2.0.0"): - expected_df['foo'] = pd.Categorical(df['foo'], categories=foo_keys) - expected_df['bar'] = pd.Categorical(df['bar'], categories=bar_keys) - else: - # With pandas 2.0.0 Index can store all numeric dtypes (not just - # int64/uint64/float64). Using astype() to create a categorical - # column preserves original dtype (int32) - expected_df['foo'] = expected_df['foo'].astype("category") - expected_df['bar'] = expected_df['bar'].astype("category") + # With pandas 2.0.0 Index can store all numeric dtypes (not just + # int64/uint64/float64). Using astype() to create a categorical + # column preserves original dtype (int32) + expected_df['foo'] = expected_df['foo'].astype("category") + expected_df['bar'] = expected_df['bar'].astype("category") assert (result_df.columns == ['index', 'values', 'foo', 'bar']).all() @@ -790,83 +605,6 @@ def _visit_level(base_dir, level, part_keys): _visit_level(base_dir, 0, []) -def _test_read_common_metadata_files(fs, base_path): - import pandas as pd - - import pyarrow.parquet as pq - - N = 100 - df = pd.DataFrame({ - 'index': np.arange(N), - 'values': np.random.randn(N) - }, columns=['index', 'values']) - - base_path = str(base_path) - data_path = os.path.join(base_path, 'data.parquet') - - table = pa.Table.from_pandas(df) - - with fs.open(data_path, 'wb') as f: - _write_table(table, f) - - metadata_path = os.path.join(base_path, '_common_metadata') - with fs.open(metadata_path, 'wb') as f: - pq.write_metadata(table.schema, f) - - dataset = pq.ParquetDataset(base_path, filesystem=fs, - use_legacy_dataset=True) - with pytest.warns(FutureWarning): - assert dataset.common_metadata_path == str(metadata_path) - - with fs.open(data_path) as f: - common_schema = pq.read_metadata(f).schema - assert dataset.schema.equals(common_schema) - - # handle list of one directory - dataset2 = pq.ParquetDataset([base_path], filesystem=fs, - use_legacy_dataset=True) - assert dataset2.schema.equals(dataset.schema) - - -@pytest.mark.pandas -@pytest.mark.filterwarnings("ignore:'ParquetDataset.schema:FutureWarning") -def test_read_common_metadata_files(tempdir): - fs = LocalFileSystem._get_instance() - _test_read_common_metadata_files(fs, tempdir) - - -@pytest.mark.pandas -@pytest.mark.filterwarnings("ignore:'ParquetDataset.schema:FutureWarning") -def test_read_metadata_files(tempdir): - fs = LocalFileSystem._get_instance() - - N = 100 - df = pd.DataFrame({ - 'index': np.arange(N), - 'values': np.random.randn(N) - }, columns=['index', 'values']) - - data_path = tempdir / 
'data.parquet' - - table = pa.Table.from_pandas(df) - - with fs.open(data_path, 'wb') as f: - _write_table(table, f) - - metadata_path = tempdir / '_metadata' - with fs.open(metadata_path, 'wb') as f: - pq.write_metadata(table.schema, f) - - dataset = pq.ParquetDataset(tempdir, filesystem=fs, - use_legacy_dataset=True) - with pytest.warns(FutureWarning): - assert dataset.metadata_path == str(metadata_path) - - with fs.open(data_path) as f: - metadata_schema = pq.read_metadata(f).schema - assert dataset.schema.equals(metadata_schema) - - def _filter_partition(df, part_keys): predicate = np.ones(len(df), dtype=bool) @@ -883,9 +621,8 @@ def _filter_partition(df, part_keys): return df[predicate].drop(to_drop, axis=1) -@parametrize_legacy_dataset @pytest.mark.pandas -def test_filter_before_validate_schema(tempdir, use_legacy_dataset): +def test_filter_before_validate_schema(tempdir): # ARROW-4076 apply filter before schema validation # to avoid checking unneeded schemas @@ -902,16 +639,12 @@ def test_filter_before_validate_schema(tempdir, use_legacy_dataset): pq.write_table(table2, dir2 / 'data.parquet') # read single file using filter - table = pq.read_table(tempdir, filters=[[('A', '==', 0)]], - use_legacy_dataset=use_legacy_dataset) + table = pq.read_table(tempdir, filters=[[('A', '==', 0)]]) assert table.column('B').equals(pa.chunked_array([[1, 2, 3]])) @pytest.mark.pandas -@pytest.mark.filterwarnings( - "ignore:Specifying the 'metadata':FutureWarning") -@parametrize_legacy_dataset -def test_read_multiple_files(tempdir, use_legacy_dataset): +def test_read_multiple_files(tempdir): nfiles = 10 size = 5 @@ -938,8 +671,7 @@ def test_read_multiple_files(tempdir, use_legacy_dataset): (dirpath / '_SUCCESS.crc').touch() def read_multiple_files(paths, columns=None, use_threads=True, **kwargs): - dataset = pq.ParquetDataset( - paths, use_legacy_dataset=use_legacy_dataset, **kwargs) + dataset = pq.ParquetDataset(paths, **kwargs) return dataset.read(columns=columns, use_threads=use_threads) result = read_multiple_files(paths) @@ -947,37 +679,18 @@ def read_multiple_files(paths, columns=None, use_threads=True, **kwargs): assert result.equals(expected) - # Read with provided metadata - # TODO(dataset) specifying metadata not yet supported - metadata = pq.read_metadata(paths[0]) - if use_legacy_dataset: - result2 = read_multiple_files(paths, metadata=metadata) - assert result2.equals(expected) - - with pytest.warns(FutureWarning, match="Specifying the 'schema'"): - result3 = pq.ParquetDataset(dirpath, schema=metadata.schema, - use_legacy_dataset=True).read() - assert result3.equals(expected) - else: - with pytest.raises(ValueError, match="no longer supported"): - pq.read_table(paths, metadata=metadata, use_legacy_dataset=False) - # Read column subset to_read = [0, 2, 6, result.num_columns - 1] col_names = [result.field(i).name for i in to_read] - out = pq.read_table( - dirpath, columns=col_names, use_legacy_dataset=use_legacy_dataset - ) + out = pq.read_table(dirpath, columns=col_names) expected = pa.Table.from_arrays([result.column(i) for i in to_read], names=col_names, metadata=result.schema.metadata) assert out.equals(expected) # Read with multiple threads - pq.read_table( - dirpath, use_threads=True, use_legacy_dataset=use_legacy_dataset - ) + pq.read_table(dirpath, use_threads=True) # Test failure modes with non-uniform metadata bad_apple = _test_dataframe(size, seed=i).iloc[:, :4] @@ -986,31 +699,24 @@ def read_multiple_files(paths, columns=None, use_threads=True, **kwargs): t = 
pa.Table.from_pandas(bad_apple) _write_table(t, bad_apple_path) - if not use_legacy_dataset: - # TODO(dataset) Dataset API skips bad files - return + # TODO(dataset) Dataset API skips bad files - bad_meta = pq.read_metadata(bad_apple_path) + # bad_meta = pq.read_metadata(bad_apple_path) - with pytest.raises(ValueError): - read_multiple_files(paths + [bad_apple_path]) + # with pytest.raises(ValueError): + # read_multiple_files(paths + [bad_apple_path]) - with pytest.raises(ValueError): - read_multiple_files(paths, metadata=bad_meta) + # with pytest.raises(ValueError): + # read_multiple_files(paths, metadata=bad_meta) - mixed_paths = [bad_apple_path, paths[0]] + # mixed_paths = [bad_apple_path, paths[0]] - with pytest.raises(ValueError): - with pytest.warns(FutureWarning, match="Specifying the 'schema'"): - read_multiple_files(mixed_paths, schema=bad_meta.schema) - - with pytest.raises(ValueError): - read_multiple_files(mixed_paths) + # with pytest.raises(ValueError): + # read_multiple_files(mixed_paths) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_dataset_read_pandas(tempdir, use_legacy_dataset): +def test_dataset_read_pandas(tempdir): nfiles = 5 size = 5 @@ -1033,7 +739,7 @@ def test_dataset_read_pandas(tempdir, use_legacy_dataset): frames.append(df) paths.append(path) - dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(dirpath) columns = ['uint8', 'strings'] result = dataset.read_pandas(columns=columns).to_pandas() expected = pd.concat([x[columns] for x in frames]) @@ -1047,10 +753,8 @@ def test_dataset_read_pandas(tempdir, use_legacy_dataset): tm.assert_frame_equal(result.reindex(columns=expected.columns), expected) -@pytest.mark.filterwarnings("ignore:'ParquetDataset:FutureWarning") @pytest.mark.pandas -@parametrize_legacy_dataset -def test_dataset_memory_map(tempdir, use_legacy_dataset): +def test_dataset_memory_map(tempdir): # ARROW-2627: Check that we can use ParquetDataset with memory-mapping dirpath = tempdir / guid() dirpath.mkdir() @@ -1061,15 +765,12 @@ def test_dataset_memory_map(tempdir, use_legacy_dataset): _write_table(table, path, version='2.6') dataset = pq.ParquetDataset( - dirpath, memory_map=True, use_legacy_dataset=use_legacy_dataset) + dirpath, memory_map=True) assert dataset.read().equals(table) - if use_legacy_dataset: - assert dataset.pieces[0].read().equals(table) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_dataset_enable_buffered_stream(tempdir, use_legacy_dataset): +def test_dataset_enable_buffered_stream(tempdir): dirpath = tempdir / guid() dirpath.mkdir() @@ -1080,19 +781,16 @@ def test_dataset_enable_buffered_stream(tempdir, use_legacy_dataset): with pytest.raises(ValueError): pq.ParquetDataset( - dirpath, buffer_size=-64, - use_legacy_dataset=use_legacy_dataset) + dirpath, buffer_size=-64) for buffer_size in [128, 1024]: dataset = pq.ParquetDataset( - dirpath, buffer_size=buffer_size, - use_legacy_dataset=use_legacy_dataset) + dirpath, buffer_size=buffer_size) assert dataset.read().equals(table) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_dataset_enable_pre_buffer(tempdir, use_legacy_dataset): +def test_dataset_enable_pre_buffer(tempdir): dirpath = tempdir / guid() dirpath.mkdir() @@ -1103,11 +801,9 @@ def test_dataset_enable_pre_buffer(tempdir, use_legacy_dataset): for pre_buffer in (True, False): dataset = pq.ParquetDataset( - dirpath, pre_buffer=pre_buffer, - use_legacy_dataset=use_legacy_dataset) + dirpath, pre_buffer=pre_buffer) assert 
dataset.read().equals(table) - actual = pq.read_table(dirpath, pre_buffer=pre_buffer, - use_legacy_dataset=use_legacy_dataset) + actual = pq.read_table(dirpath, pre_buffer=pre_buffer) assert actual.equals(table) @@ -1123,18 +819,14 @@ def _make_example_multifile_dataset(base_path, nfiles=10, file_nrows=5): return paths -def _assert_dataset_paths(dataset, paths, use_legacy_dataset): - if use_legacy_dataset: - assert set(map(str, paths)) == {x.path for x in dataset._pieces} - else: - paths = [str(path.as_posix()) for path in paths] - assert set(paths) == set(dataset._dataset.files) +def _assert_dataset_paths(dataset, paths): + paths = [str(path.as_posix()) for path in paths] + assert set(paths) == set(dataset.files) @pytest.mark.pandas -@parametrize_legacy_dataset @pytest.mark.parametrize('dir_prefix', ['_', '.']) -def test_ignore_private_directories(tempdir, dir_prefix, use_legacy_dataset): +def test_ignore_private_directories(tempdir, dir_prefix): dirpath = tempdir / guid() dirpath.mkdir() @@ -1144,14 +836,13 @@ def test_ignore_private_directories(tempdir, dir_prefix, use_legacy_dataset): # private directory (dirpath / '{}staging'.format(dir_prefix)).mkdir() - dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(dirpath) - _assert_dataset_paths(dataset, paths, use_legacy_dataset) + _assert_dataset_paths(dataset, paths) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_ignore_hidden_files_dot(tempdir, use_legacy_dataset): +def test_ignore_hidden_files_dot(tempdir): dirpath = tempdir / guid() dirpath.mkdir() @@ -1164,14 +855,13 @@ def test_ignore_hidden_files_dot(tempdir, use_legacy_dataset): with (dirpath / '.private').open('wb') as f: f.write(b'gibberish') - dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(dirpath) - _assert_dataset_paths(dataset, paths, use_legacy_dataset) + _assert_dataset_paths(dataset, paths) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_ignore_hidden_files_underscore(tempdir, use_legacy_dataset): +def test_ignore_hidden_files_underscore(tempdir): dirpath = tempdir / guid() dirpath.mkdir() @@ -1184,17 +874,14 @@ def test_ignore_hidden_files_underscore(tempdir, use_legacy_dataset): with (dirpath / '_started_321').open('wb') as f: f.write(b'abcd') - dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(dirpath) - _assert_dataset_paths(dataset, paths, use_legacy_dataset) + _assert_dataset_paths(dataset, paths) @pytest.mark.pandas -@parametrize_legacy_dataset @pytest.mark.parametrize('dir_prefix', ['_', '.']) -def test_ignore_no_private_directories_in_base_path( - tempdir, dir_prefix, use_legacy_dataset -): +def test_ignore_no_private_directories_in_base_path(tempdir, dir_prefix): # ARROW-8427 - don't ignore explicitly listed files if parent directory # is a private directory dirpath = tempdir / "{0}data".format(dir_prefix) / guid() @@ -1203,17 +890,15 @@ def test_ignore_no_private_directories_in_base_path( paths = _make_example_multifile_dataset(dirpath, nfiles=10, file_nrows=5) - dataset = pq.ParquetDataset(paths, use_legacy_dataset=use_legacy_dataset) - _assert_dataset_paths(dataset, paths, use_legacy_dataset) + dataset = pq.ParquetDataset(paths) + _assert_dataset_paths(dataset, paths) # ARROW-9644 - don't ignore full directory with underscore in base path - dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) - _assert_dataset_paths(dataset, paths, 
use_legacy_dataset) + dataset = pq.ParquetDataset(dirpath) + _assert_dataset_paths(dataset, paths) -@pytest.mark.pandas -@parametrize_legacy_dataset_fixed -def test_ignore_custom_prefixes(tempdir, use_legacy_dataset): +def test_ignore_custom_prefixes(tempdir): # ARROW-9573 - allow override of default ignore_prefixes part = ["xxx"] * 3 + ["yyy"] * 3 table = pa.table([ @@ -1221,7 +906,6 @@ def test_ignore_custom_prefixes(tempdir, use_legacy_dataset): pa.array(part).dictionary_encode(), ], names=['index', '_part']) - # TODO use_legacy_dataset ARROW-10247 pq.write_to_dataset(table, str(tempdir), partition_cols=['_part']) private_duplicate = tempdir / '_private_duplicate' @@ -1230,29 +914,23 @@ def test_ignore_custom_prefixes(tempdir, use_legacy_dataset): partition_cols=['_part']) read = pq.read_table( - tempdir, use_legacy_dataset=use_legacy_dataset, - ignore_prefixes=['_private']) + tempdir, ignore_prefixes=['_private']) assert read.equals(table) -@parametrize_legacy_dataset_fixed -def test_empty_directory(tempdir, use_legacy_dataset): - # ARROW-5310 - reading empty directory - # fails with legacy implementation +def test_empty_directory(tempdir): + # ARROW-5310 empty_dir = tempdir / 'dataset' empty_dir.mkdir() - dataset = pq.ParquetDataset( - empty_dir, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(empty_dir) result = dataset.read() assert result.num_rows == 0 assert result.num_columns == 0 -@pytest.mark.filterwarnings("ignore:'ParquetDataset.schema:FutureWarning") def _test_write_to_dataset_with_partitions(base_path, - use_legacy_dataset=True, filesystem=None, schema=None, index_name=None): @@ -1275,8 +953,7 @@ def _test_write_to_dataset_with_partitions(base_path, output_table = pa.Table.from_pandas(output_df, schema=schema, safe=False, preserve_index=False) pq.write_to_dataset(output_table, base_path, partition_by, - filesystem=filesystem, - use_legacy_dataset=use_legacy_dataset) + filesystem=filesystem) metadata_path = os.path.join(str(base_path), '_common_metadata') @@ -1286,19 +963,11 @@ def _test_write_to_dataset_with_partitions(base_path, else: pq.write_metadata(output_table.schema, metadata_path) - # ARROW-2891: Ensure the output_schema is preserved when writing a - # partitioned dataset dataset = pq.ParquetDataset(base_path, - filesystem=filesystem, - validate_schema=True, - use_legacy_dataset=use_legacy_dataset) + filesystem=filesystem) # ARROW-2209: Ensure the dataset schema also includes the partition columns - if use_legacy_dataset: - with pytest.warns(FutureWarning, match="'ParquetDataset.schema'"): - dataset_cols = set(dataset.schema.to_arrow_schema().names) - else: - # NB schema property is an arrow and not parquet schema - dataset_cols = set(dataset.schema.names) + # NB schema property is an arrow and not parquet schema + dataset_cols = set(dataset.schema.names) assert dataset_cols == set(output_table.schema.names) @@ -1323,7 +992,6 @@ def _test_write_to_dataset_with_partitions(base_path, def _test_write_to_dataset_no_partitions(base_path, - use_legacy_dataset=True, filesystem=None): import pandas as pd @@ -1347,7 +1015,6 @@ def _test_write_to_dataset_no_partitions(base_path, n = 5 for i in range(n): pq.write_to_dataset(output_table, base_path, - use_legacy_dataset=use_legacy_dataset, filesystem=filesystem) output_files = [file for file in filesystem.ls(str(base_path)) if file.endswith(".parquet")] @@ -1356,8 +1023,7 @@ def _test_write_to_dataset_no_partitions(base_path, # Deduplicated incoming DataFrame should match # original outgoing Dataframe 
input_table = pq.ParquetDataset( - base_path, filesystem=filesystem, - use_legacy_dataset=use_legacy_dataset + base_path, filesystem=filesystem ).read() input_df = input_table.to_pandas() input_df = input_df.drop_duplicates() @@ -1366,131 +1032,71 @@ def _test_write_to_dataset_no_partitions(base_path, @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_with_partitions(tempdir, use_legacy_dataset): - _test_write_to_dataset_with_partitions(str(tempdir), use_legacy_dataset) +def test_write_to_dataset_with_partitions(tempdir): + _test_write_to_dataset_with_partitions(str(tempdir)) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_with_partitions_and_schema( - tempdir, use_legacy_dataset -): +def test_write_to_dataset_with_partitions_and_schema(tempdir): schema = pa.schema([pa.field('group1', type=pa.string()), pa.field('group2', type=pa.string()), pa.field('num', type=pa.int64()), pa.field('nan', type=pa.int32()), pa.field('date', type=pa.timestamp(unit='us'))]) _test_write_to_dataset_with_partitions( - str(tempdir), use_legacy_dataset, schema=schema) + str(tempdir), schema=schema) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_with_partitions_and_index_name( - tempdir, use_legacy_dataset -): +def test_write_to_dataset_with_partitions_and_index_name(tempdir): _test_write_to_dataset_with_partitions( - str(tempdir), use_legacy_dataset, index_name='index_name') + str(tempdir), index_name='index_name') @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_no_partitions(tempdir, use_legacy_dataset): - _test_write_to_dataset_no_partitions(str(tempdir), use_legacy_dataset) +def test_write_to_dataset_no_partitions(tempdir): + _test_write_to_dataset_no_partitions(str(tempdir)) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_pathlib(tempdir, use_legacy_dataset): - _test_write_to_dataset_with_partitions( - tempdir / "test1", use_legacy_dataset) - _test_write_to_dataset_no_partitions( - tempdir / "test2", use_legacy_dataset) +def test_write_to_dataset_pathlib(tempdir): + _test_write_to_dataset_with_partitions(tempdir / "test1") + _test_write_to_dataset_no_partitions(tempdir / "test2") @pytest.mark.pandas @pytest.mark.s3 -@parametrize_legacy_dataset -def test_write_to_dataset_pathlib_nonlocal( - tempdir, s3_example_s3fs, use_legacy_dataset -): +def test_write_to_dataset_pathlib_nonlocal(tempdir, s3_example_s3fs): # pathlib paths are only accepted for local files fs, _ = s3_example_s3fs with pytest.raises(TypeError, match="path-like objects are only allowed"): _test_write_to_dataset_with_partitions( - tempdir / "test1", use_legacy_dataset, filesystem=fs) + tempdir / "test1", filesystem=fs) with pytest.raises(TypeError, match="path-like objects are only allowed"): _test_write_to_dataset_no_partitions( - tempdir / "test2", use_legacy_dataset, filesystem=fs) + tempdir / "test2", filesystem=fs) @pytest.mark.pandas @pytest.mark.s3 -@parametrize_legacy_dataset -def test_write_to_dataset_with_partitions_s3fs( - s3_example_s3fs, use_legacy_dataset -): +def test_write_to_dataset_with_partitions_s3fs(s3_example_s3fs): fs, path = s3_example_s3fs _test_write_to_dataset_with_partitions( - path, use_legacy_dataset, filesystem=fs) + path, filesystem=fs) @pytest.mark.pandas @pytest.mark.s3 -@parametrize_legacy_dataset -def test_write_to_dataset_no_partitions_s3fs( - s3_example_s3fs, use_legacy_dataset -): +def test_write_to_dataset_no_partitions_s3fs(s3_example_s3fs): fs, path = s3_example_s3fs 
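# --- editorial sketch, not part of the patch ---------------------------------
# For orientation on the surrounding tests: after the legacy path is removed,
# partitioned writes and the dataset-backed ParquetDataset keep the same
# 'filesystem' keyword, and the discovered file paths are exposed via '.files'.
# A temporary directory is used only to keep the sketch self-contained; names
# are illustrative.
import tempfile
import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow import fs

root = tempfile.mkdtemp()
local = fs.LocalFileSystem()
sketch = pa.table({"part": ["x", "x", "y"], "value": [1, 2, 3]})
pq.write_to_dataset(sketch, root, partition_cols=["part"], filesystem=local)
dataset = pq.ParquetDataset(root, filesystem=local)
print(dataset.files)                           # fragment paths under <root>/part=x, part=y
assert "part" in dataset.read().column_names   # partition column is included on read
# ------------------------------------------------------------------------------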
_test_write_to_dataset_no_partitions( - path, use_legacy_dataset, filesystem=fs) + path, filesystem=fs) -@pytest.mark.filterwarnings( - "ignore:'ParquetDataset:FutureWarning", - "ignore:'partition_filename_cb':FutureWarning") -@pytest.mark.pandas -@parametrize_legacy_dataset_not_supported -def test_write_to_dataset_with_partitions_and_custom_filenames( - tempdir, use_legacy_dataset -): - output_df = pd.DataFrame({'group1': list('aaabbbbccc'), - 'group2': list('eefeffgeee'), - 'num': list(range(10)), - 'nan': [np.nan] * 10, - 'date': np.arange('2017-01-01', '2017-01-11', - dtype='datetime64[D]')}) - partition_by = ['group1', 'group2'] - output_table = pa.Table.from_pandas(output_df) - path = str(tempdir) - - def partition_filename_callback(keys): - return "{}-{}.parquet".format(*keys) - - pq.write_to_dataset(output_table, path, - partition_by, partition_filename_callback, - use_legacy_dataset=use_legacy_dataset) - - dataset = pq.ParquetDataset(path, use_legacy_dataset=use_legacy_dataset) - - # ARROW-3538: Ensure partition filenames match the given pattern - # defined in the local function partition_filename_callback - expected_basenames = [ - 'a-e.parquet', 'a-f.parquet', - 'b-e.parquet', 'b-f.parquet', - 'b-g.parquet', 'c-e.parquet' - ] - output_basenames = [os.path.basename(p.path) for p in dataset.pieces] - - assert sorted(expected_basenames) == sorted(output_basenames) - - -@pytest.mark.dataset @pytest.mark.pandas def test_write_to_dataset_filesystem(tempdir): df = pd.DataFrame({'A': [1, 2, 3]}) @@ -1502,7 +1108,7 @@ def test_write_to_dataset_filesystem(tempdir): assert result.equals(table) -def _make_dataset_for_pickling(tempdir, use_legacy_dataset=False, N=100): +def _make_dataset_for_pickling(tempdir, N=100): path = tempdir / 'data.parquet' fs = LocalFileSystem._get_instance() @@ -1525,42 +1131,22 @@ def _make_dataset_for_pickling(tempdir, use_legacy_dataset=False, N=100): pq.write_metadata(table.schema, f) dataset = pq.ParquetDataset( - tempdir, filesystem=fs, use_legacy_dataset=use_legacy_dataset) - if use_legacy_dataset: - with pytest.warns(FutureWarning): - assert dataset.metadata_path == str(metadata_path) + tempdir, filesystem=fs) return dataset @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pickle_dataset(tempdir, datadir, use_legacy_dataset, pickle_module): +def test_pickle_dataset(tempdir, pickle_module): def is_pickleable(obj): return obj == pickle_module.loads(pickle_module.dumps(obj)) - dataset = _make_dataset_for_pickling(tempdir, use_legacy_dataset) + dataset = _make_dataset_for_pickling(tempdir) assert is_pickleable(dataset) - if use_legacy_dataset: - with pytest.warns(FutureWarning): - metadata = dataset.metadata - assert is_pickleable(metadata) - assert is_pickleable(metadata.schema) - assert len(metadata.schema) - for column in metadata.schema: - assert is_pickleable(column) - - for piece in dataset._pieces: - assert is_pickleable(piece) - metadata = piece.get_metadata() - assert metadata.num_row_groups - for i in range(metadata.num_row_groups): - assert is_pickleable(metadata.row_group(i)) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_partitioned_dataset(tempdir, use_legacy_dataset): +def test_partitioned_dataset(tempdir): # ARROW-3208: Segmentation fault when reading a Parquet partitioned dataset # to a Parquet file path = tempdir / "ARROW-3208" @@ -1571,27 +1157,20 @@ def test_partitioned_dataset(tempdir, use_legacy_dataset): }) table = pa.Table.from_pandas(df) pq.write_to_dataset(table, root_path=str(path), - partition_cols=['one', 
'two'], - use_legacy_dataset=use_legacy_dataset) - table = pq.ParquetDataset( - path, use_legacy_dataset=use_legacy_dataset).read() + partition_cols=['one', 'two']) + table = pq.ParquetDataset(path).read() pq.write_table(table, path / "output.parquet") -@pytest.mark.pandas -@parametrize_legacy_dataset -def test_dataset_read_dictionary(tempdir, use_legacy_dataset): +def test_dataset_read_dictionary(tempdir): path = tempdir / "ARROW-3325-dataset" t1 = pa.table([[util.rands(10) for i in range(5)] * 10], names=['f0']) t2 = pa.table([[util.rands(10) for i in range(5)] * 10], names=['f0']) - pq.write_to_dataset(t1, root_path=str(path), - use_legacy_dataset=use_legacy_dataset) - pq.write_to_dataset(t2, root_path=str(path), - use_legacy_dataset=use_legacy_dataset) + pq.write_to_dataset(t1, root_path=str(path)) + pq.write_to_dataset(t2, root_path=str(path)) result = pq.ParquetDataset( - path, read_dictionary=['f0'], - use_legacy_dataset=use_legacy_dataset).read() + path, read_dictionary=['f0']).read() # The order of the chunks is non-deterministic ex_chunks = [t1[0].chunk(0).dictionary_encode(), @@ -1606,9 +1185,6 @@ def test_dataset_read_dictionary(tempdir, use_legacy_dataset): assert c1.equals(ex_chunks[0]) -@pytest.mark.dataset -@pytest.mark.pandas -@pytest.mark.filterwarnings("ignore:Passing 'use_legacy:FutureWarning") def test_read_table_schema(tempdir): # test that schema keyword is passed through in read_table table = pa.table({'a': pa.array([1, 2, 3], pa.int32())}) @@ -1627,42 +1203,24 @@ def test_read_table_schema(tempdir): expected = pa.table({'a': [1, 2, 3, 1, 2, 3]}, schema=schema) assert result.equals(expected) - # don't allow it with the legacy reader - with pytest.raises( - ValueError, match="The 'schema' argument is only supported" - ): - pq.read_table(tempdir / "data.parquet", schema=schema, - use_legacy_dataset=True) - - # using ParquetDataset directory with non-legacy implementation - result = pq.ParquetDataset( - tempdir, schema=schema, use_legacy_dataset=False - ) + result = pq.ParquetDataset(tempdir, schema=schema) expected = pa.table({'a': [1, 2, 3, 1, 2, 3]}, schema=schema) assert result.read().equals(expected) -@pytest.mark.dataset -def test_dataset_unsupported_keywords(): - - with pytest.raises(ValueError, match="not yet supported with the new"): - pq.ParquetDataset("", use_legacy_dataset=False, metadata=pa.schema([])) +def test_read_table_duplicate_column_selection(tempdir): + # test that duplicate column selection gives duplicate columns + table = pa.table({'a': pa.array([1, 2, 3], pa.int32()), + 'b': pa.array([1, 2, 3], pa.uint8())}) + pq.write_table(table, tempdir / "data.parquet") - with pytest.raises(ValueError, match="not yet supported with the new"): - pq.ParquetDataset("", use_legacy_dataset=False, validate_schema=False) + result = pq.read_table(tempdir / "data.parquet", columns=['a', 'a']) + expected_schema = pa.schema([('a', 'int32'), ('a', 'int32')]) - with pytest.raises(ValueError, match="not yet supported with the new"): - pq.ParquetDataset("", use_legacy_dataset=False, split_row_groups=True) + assert result.column_names == ['a', 'a'] + assert result.schema == expected_schema - with pytest.raises(ValueError, match="not yet supported with the new"): - pq.ParquetDataset("", use_legacy_dataset=False, metadata_nthreads=4) - with pytest.raises(ValueError, match="no longer supported"): - pq.read_table("", use_legacy_dataset=False, metadata=pa.schema([])) - - -@pytest.mark.dataset -@pytest.mark.filterwarnings("ignore:Passing 'use_legacy:FutureWarning") def 
test_dataset_partitioning(tempdir): import pyarrow.dataset as ds @@ -1679,42 +1237,25 @@ def test_dataset_partitioning(tempdir): # read_table part = ds.partitioning(field_names=["year", "month", "day"]) result = pq.read_table( - str(root_path), partitioning=part, use_legacy_dataset=False) + str(root_path), partitioning=part) assert result.column_names == ["a", "year", "month", "day"] result = pq.ParquetDataset( - str(root_path), partitioning=part, use_legacy_dataset=False).read() + str(root_path), partitioning=part).read() assert result.column_names == ["a", "year", "month", "day"] - # This raises an error for legacy dataset - with pytest.raises(ValueError): - pq.read_table( - str(root_path), partitioning=part, use_legacy_dataset=True) - - with pytest.raises(ValueError): - pq.ParquetDataset( - str(root_path), partitioning=part, use_legacy_dataset=True) - -@pytest.mark.dataset def test_parquet_dataset_new_filesystem(tempdir): # Ensure we can pass new FileSystem object to ParquetDataset - # (use new implementation automatically without specifying - # use_legacy_dataset=False) table = pa.table({'a': [1, 2, 3]}) pq.write_table(table, tempdir / 'data.parquet') - # don't use simple LocalFileSystem (as that gets mapped to legacy one) filesystem = fs.SubTreeFileSystem(str(tempdir), fs.LocalFileSystem()) dataset = pq.ParquetDataset('.', filesystem=filesystem) result = dataset.read() assert result.equals(table) -@pytest.mark.filterwarnings("ignore:'ParquetDataset:FutureWarning") -@parametrize_legacy_dataset -def test_parquet_dataset_partitions_piece_path_with_fsspec( - tempdir, use_legacy_dataset -): +def test_parquet_dataset_partitions_piece_path_with_fsspec(tempdir): # ARROW-10462 ensure that on Windows we properly use posix-style paths # as used by fsspec fsspec = pytest.importorskip("fsspec") @@ -1725,109 +1266,12 @@ def test_parquet_dataset_partitions_piece_path_with_fsspec( # pass a posix-style path (using "/" also on Windows) path = str(tempdir).replace("\\", "/") dataset = pq.ParquetDataset( - path, filesystem=filesystem, use_legacy_dataset=use_legacy_dataset) + path, filesystem=filesystem) # ensure the piece path is also posix-style expected = path + "/data.parquet" - assert dataset.pieces[0].path == expected - - -@pytest.mark.dataset -def test_parquet_dataset_deprecated_properties(tempdir): - table = pa.table({'a': [1, 2, 3]}) - path = tempdir / 'data.parquet' - pq.write_table(table, path) - dataset = pq.ParquetDataset(path, use_legacy_dataset=True) - - with pytest.warns(FutureWarning, match="'ParquetDataset.pieces"): - dataset.pieces - - with pytest.warns(FutureWarning, match="'ParquetDataset.partitions"): - dataset.partitions - - with pytest.warns(FutureWarning, match="'ParquetDataset.memory_map"): - dataset.memory_map - - with pytest.warns(FutureWarning, match="'ParquetDataset.read_dictio"): - dataset.read_dictionary - - with pytest.warns(FutureWarning, match="'ParquetDataset.buffer_size"): - dataset.buffer_size - - with pytest.warns(FutureWarning, match="'ParquetDataset.fs"): - dataset.fs - - with pytest.warns(FutureWarning, match="'ParquetDataset.schema'"): - dataset.schema - - with pytest.warns(FutureWarning, match="'ParquetDataset.common_metadata'"): - dataset.common_metadata - - with pytest.warns(FutureWarning, match="'ParquetDataset.metadata"): - dataset.metadata + assert dataset.fragments[0].path == expected - with pytest.warns(FutureWarning, match="'ParquetDataset.metadata_path"): - dataset.metadata_path - with pytest.warns(FutureWarning, - 
match="'ParquetDataset.common_metadata_path"): - dataset.common_metadata_path - - dataset2 = pq.ParquetDataset(path, use_legacy_dataset=False) - - with pytest.warns(FutureWarning, match="'ParquetDataset.pieces"): - dataset2.pieces - - -@pytest.mark.dataset -def test_parquet_write_to_dataset_deprecated_properties(tempdir): - table = pa.table({'a': [1, 2, 3]}) - path = tempdir / 'data.parquet' - - with pytest.warns(FutureWarning, - match="Passing 'use_legacy_dataset=True'"): - pq.write_to_dataset(table, path, use_legacy_dataset=True) - - # check also that legacy implementation is set when - # partition_filename_cb is specified - with pytest.warns(FutureWarning, - match="Passing 'use_legacy_dataset=True'"): - pq.write_to_dataset(table, path, - partition_filename_cb=lambda x: 'filename.parquet') - - -@pytest.mark.dataset -def test_parquet_write_to_dataset_unsupported_keywords_in_legacy(tempdir): - table = pa.table({'a': [1, 2, 3]}) - path = tempdir / 'data.parquet' - - with pytest.raises(ValueError, match="schema"): - pq.write_to_dataset(table, path, use_legacy_dataset=True, - schema=pa.schema([ - ('a', pa.int32()) - ])) - - with pytest.raises(ValueError, match="partitioning"): - pq.write_to_dataset(table, path, use_legacy_dataset=True, - partitioning=["a"]) - - with pytest.raises(ValueError, match="use_threads"): - pq.write_to_dataset(table, path, use_legacy_dataset=True, - use_threads=False) - - with pytest.raises(ValueError, match="file_visitor"): - pq.write_to_dataset(table, path, use_legacy_dataset=True, - file_visitor=lambda x: x) - - with pytest.raises(ValueError, match="existing_data_behavior"): - pq.write_to_dataset(table, path, use_legacy_dataset=True, - existing_data_behavior='error') - - with pytest.raises(ValueError, match="basename_template"): - pq.write_to_dataset(table, path, use_legacy_dataset=True, - basename_template='part-{i}.parquet') - - -@pytest.mark.dataset def test_parquet_write_to_dataset_exposed_keywords(tempdir): table = pa.table({'a': [1, 2, 3]}) path = tempdir / 'partitioning' @@ -1841,8 +1285,7 @@ def file_visitor(written_file): pq.write_to_dataset(table, path, partitioning=["a"], file_visitor=file_visitor, - basename_template=basename_template, - use_legacy_dataset=False) + basename_template=basename_template) expected_paths = { path / '1' / 'part-0.parquet', @@ -1853,53 +1296,6 @@ def file_visitor(written_file): assert paths_written_set == expected_paths -@pytest.mark.dataset -def test_write_to_dataset_conflicting_keywords(tempdir): - table = pa.table({'a': [1, 2, 3]}) - path = tempdir / 'data.parquet' - - with pytest.raises(ValueError, match="'basename_template' argument " - "is not supported by use_legacy_dataset=True"): - pq.write_to_dataset(table, path, - use_legacy_dataset=True, - partition_filename_cb=lambda x: 'filename.parquet', - basename_template='file-{i}.parquet') - with pytest.raises(ValueError, match="'partition_filename_cb' argument " - "is not supported by use_legacy_dataset=False"): - pq.write_to_dataset(table, path, - use_legacy_dataset=False, - partition_filename_cb=lambda x: 'filename.parquet', - basename_template='file-{i}.parquet') - - with pytest.raises(ValueError, match="'partitioning' argument " - "is not supported by use_legacy_dataset=True"): - pq.write_to_dataset(table, path, - use_legacy_dataset=True, - partition_cols=["a"], - partitioning=["a"]) - - with pytest.raises(ValueError, match="'partition_cols' argument " - "is not supported by use_legacy_dataset=False"): - pq.write_to_dataset(table, path, - use_legacy_dataset=False, - 
partition_cols=["a"], - partitioning=["a"]) - - with pytest.raises(ValueError, match="'file_visitor' argument " - "is not supported by use_legacy_dataset=True"): - pq.write_to_dataset(table, path, - use_legacy_dataset=True, - metadata_collector=[], - file_visitor=lambda x: x) - with pytest.raises(ValueError, match="'metadata_collector' argument " - "is not supported by use_legacy_dataset=False"): - pq.write_to_dataset(table, path, - use_legacy_dataset=False, - metadata_collector=[], - file_visitor=lambda x: x) - - -@pytest.mark.dataset @pytest.mark.parametrize("write_dataset_kwarg", ( ("create_dir", True), ("create_dir", False), @@ -1926,8 +1322,7 @@ def test_write_to_dataset_kwargs_passed(tempdir, write_dataset_kwarg): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_category_observed(tempdir, use_legacy_dataset): +def test_write_to_dataset_category_observed(tempdir): # if we partition on a categorical variable with "unobserved" categories # (values present in the dictionary, but not in the actual data) # ensure those are not creating empty files/directories @@ -1938,8 +1333,7 @@ def test_write_to_dataset_category_observed(tempdir, use_legacy_dataset): table = pa.table(df) path = tempdir / "dataset" pq.write_to_dataset( - table, tempdir / "dataset", partition_cols=["cat"], - use_legacy_dataset=use_legacy_dataset + table, tempdir / "dataset", partition_cols=["cat"] ) subdirs = [f.name for f in path.iterdir() if f.is_dir()] assert len(subdirs) == 2 diff --git a/python/pyarrow/tests/parquet/test_datetime.py b/python/pyarrow/tests/parquet/test_datetime.py index f97c451df7ad7..6a9cbd4f73d4f 100644 --- a/python/pyarrow/tests/parquet/test_datetime.py +++ b/python/pyarrow/tests/parquet/test_datetime.py @@ -23,8 +23,7 @@ import pytest import pyarrow as pa -from pyarrow.tests.parquet.common import ( - _check_roundtrip, parametrize_legacy_dataset) +from pyarrow.tests.parquet.common import _check_roundtrip try: import pyarrow.parquet as pq @@ -48,8 +47,7 @@ @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_datetime_tz(use_legacy_dataset): +def test_pandas_parquet_datetime_tz(): # Pandas v2 defaults to [ns], but Arrow defaults to [us] time units # so we need to cast the pandas dtype. Pandas v1 will always silently # coerce to [ns] due to lack of non-[ns] support. 
@@ -69,21 +67,19 @@ def test_pandas_parquet_datetime_tz(use_legacy_dataset): _write_table(arrow_table, f) f.seek(0) - table_read = pq.read_pandas(f, use_legacy_dataset=use_legacy_dataset) + table_read = pq.read_pandas(f) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_datetime_timezone_tzinfo(use_legacy_dataset): +def test_datetime_timezone_tzinfo(): value = datetime.datetime(2018, 1, 1, 1, 23, 45, tzinfo=datetime.timezone.utc) df = pd.DataFrame({'foo': [value]}) - _roundtrip_pandas_dataframe( - df, write_kwargs={}, use_legacy_dataset=use_legacy_dataset) + _roundtrip_pandas_dataframe(df, write_kwargs={}) @pytest.mark.pandas diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py index 0ed305bff1945..f194d12876968 100644 --- a/python/pyarrow/tests/parquet/test_pandas.py +++ b/python/pyarrow/tests/parquet/test_pandas.py @@ -23,8 +23,6 @@ import pyarrow as pa from pyarrow.fs import LocalFileSystem, SubTreeFileSystem -from pyarrow.tests.parquet.common import ( - parametrize_legacy_dataset, parametrize_legacy_dataset_not_supported) from pyarrow.util import guid from pyarrow.vendored.version import Version @@ -101,8 +99,7 @@ def test_merging_parquet_tables_with_different_pandas_metadata(tempdir): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_column_multiindex(tempdir, use_legacy_dataset): +def test_pandas_parquet_column_multiindex(tempdir): df = alltypes_sample(size=10) df.columns = pd.MultiIndex.from_tuples( list(zip(df.columns, df.columns[::-1])), @@ -115,17 +112,13 @@ def test_pandas_parquet_column_multiindex(tempdir, use_legacy_dataset): _write_table(arrow_table, filename) - table_read = pq.read_pandas( - filename, use_legacy_dataset=use_legacy_dataset) + table_read = pq.read_pandas(filename) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written( - tempdir, use_legacy_dataset -): +def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(tempdir): df = alltypes_sample(size=10000) filename = tempdir / 'pandas_roundtrip.parquet' @@ -137,8 +130,7 @@ def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written( assert js['columns'] _write_table(arrow_table, filename) - table_read = pq.read_pandas( - filename, use_legacy_dataset=use_legacy_dataset) + table_read = pq.read_pandas(filename) js = table_read.schema.pandas_metadata assert not js['index_columns'] @@ -150,52 +142,20 @@ def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written( tm.assert_frame_equal(df, df_read) -# TODO(dataset) duplicate column selection actually gives duplicate columns now -@pytest.mark.pandas -@parametrize_legacy_dataset_not_supported -def test_pandas_column_selection(tempdir, use_legacy_dataset): - size = 10000 - np.random.seed(0) - df = pd.DataFrame({ - 'uint8': np.arange(size, dtype=np.uint8), - 'uint16': np.arange(size, dtype=np.uint16) - }) - filename = tempdir / 'pandas_roundtrip.parquet' - arrow_table = pa.Table.from_pandas(df) - _write_table(arrow_table, filename) - table_read = _read_table( - filename, columns=['uint8'], use_legacy_dataset=use_legacy_dataset) - df_read = table_read.to_pandas() - - tm.assert_frame_equal(df[['uint8']], df_read) - - # ARROW-4267: Selection of duplicate columns still leads to these columns - # being read uniquely. 
- table_read = _read_table( - filename, columns=['uint8', 'uint8'], - use_legacy_dataset=use_legacy_dataset) - df_read = table_read.to_pandas() - - tm.assert_frame_equal(df[['uint8']], df_read) - - @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_native_file_roundtrip(tempdir, use_legacy_dataset): +def test_pandas_parquet_native_file_roundtrip(): df = _test_dataframe(10000) arrow_table = pa.Table.from_pandas(df) imos = pa.BufferOutputStream() _write_table(arrow_table, imos, version='2.6') buf = imos.getvalue() reader = pa.BufferReader(buf) - df_read = _read_table( - reader, use_legacy_dataset=use_legacy_dataset).to_pandas() + df_read = _read_table(reader).to_pandas() tm.assert_frame_equal(df, df_read) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_read_pandas_column_subset(tempdir, use_legacy_dataset): +def test_read_pandas_column_subset(): df = _test_dataframe(10000) arrow_table = pa.Table.from_pandas(df) imos = pa.BufferOutputStream() @@ -204,27 +164,24 @@ def test_read_pandas_column_subset(tempdir, use_legacy_dataset): reader = pa.BufferReader(buf) df_read = pq.read_pandas( reader, columns=['strings', 'uint8'], - use_legacy_dataset=use_legacy_dataset ).to_pandas() tm.assert_frame_equal(df[['strings', 'uint8']], df_read) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_empty_roundtrip(tempdir, use_legacy_dataset): +def test_pandas_parquet_empty_roundtrip(): df = _test_dataframe(0) arrow_table = pa.Table.from_pandas(df) imos = pa.BufferOutputStream() _write_table(arrow_table, imos, version='2.6') buf = imos.getvalue() reader = pa.BufferReader(buf) - df_read = _read_table( - reader, use_legacy_dataset=use_legacy_dataset).to_pandas() + df_read = _read_table(reader).to_pandas() tm.assert_frame_equal(df, df_read) @pytest.mark.pandas -def test_pandas_can_write_nested_data(tempdir): +def test_pandas_can_write_nested_data(): data = { "agg_col": [ {"page_type": 1}, @@ -241,8 +198,7 @@ def test_pandas_can_write_nested_data(tempdir): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_pyfile_roundtrip(tempdir, use_legacy_dataset): +def test_pandas_parquet_pyfile_roundtrip(tempdir): filename = tempdir / 'pandas_pyfile_roundtrip.parquet' size = 5 df = pd.DataFrame({ @@ -260,14 +216,13 @@ def test_pandas_parquet_pyfile_roundtrip(tempdir, use_legacy_dataset): data = io.BytesIO(filename.read_bytes()) - table_read = _read_table(data, use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(data) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset): +def test_pandas_parquet_configuration_options(tempdir): size = 10000 np.random.seed(0) df = pd.DataFrame({ @@ -289,16 +244,14 @@ def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset): for use_dictionary in [True, False]: _write_table(arrow_table, filename, version='2.6', use_dictionary=use_dictionary) - table_read = _read_table( - filename, use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(filename) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) for write_statistics in [True, False]: _write_table(arrow_table, filename, version='2.6', write_statistics=write_statistics) - table_read = _read_table(filename, - use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(filename) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -308,8 +261,7 @@ def 
test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset): continue _write_table(arrow_table, filename, version='2.6', compression=compression) - table_read = _read_table( - filename, use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(filename) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -327,8 +279,7 @@ def test_spark_flavor_preserves_pandas_metadata(): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_index_column_name_duplicate(tempdir, use_legacy_dataset): +def test_index_column_name_duplicate(tempdir): data = { 'close': { pd.Timestamp('2017-06-30 01:31:00'): 154.99958999999998, @@ -352,14 +303,13 @@ def test_index_column_name_duplicate(tempdir, use_legacy_dataset): tdfx = pa.Table.from_pandas(dfx) _write_table(tdfx, path) - arrow_table = _read_table(path, use_legacy_dataset=use_legacy_dataset) + arrow_table = _read_table(path) result_df = arrow_table.to_pandas() tm.assert_frame_equal(result_df, dfx) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_multiindex_duplicate_values(tempdir, use_legacy_dataset): +def test_multiindex_duplicate_values(tempdir): num_rows = 3 numbers = list(range(num_rows)) index = pd.MultiIndex.from_arrays( @@ -373,7 +323,7 @@ def test_multiindex_duplicate_values(tempdir, use_legacy_dataset): filename = tempdir / 'dup_multi_index_levels.parquet' _write_table(table, filename) - result_table = _read_table(filename, use_legacy_dataset=use_legacy_dataset) + result_table = _read_table(filename) assert table.equals(result_table) result_df = result_table.to_pandas() @@ -381,8 +331,7 @@ def test_multiindex_duplicate_values(tempdir, use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_backwards_compatible_index_naming(datadir, use_legacy_dataset): +def test_backwards_compatible_index_naming(datadir): expected_string = b"""\ carat cut color clarity depth table price x y z 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 @@ -397,17 +346,13 @@ def test_backwards_compatible_index_naming(datadir, use_legacy_dataset): 0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39""" expected = pd.read_csv(io.BytesIO(expected_string), sep=r'\s{2,}', index_col=None, header=0, engine='python') - table = _read_table( - datadir / 'v0.7.1.parquet', use_legacy_dataset=use_legacy_dataset) + table = _read_table(datadir / 'v0.7.1.parquet') result = table.to_pandas() tm.assert_frame_equal(result, expected) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_backwards_compatible_index_multi_level_named( - datadir, use_legacy_dataset -): +def test_backwards_compatible_index_multi_level_named(datadir): expected_string = b"""\ carat cut color clarity depth table price x y z 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 @@ -426,17 +371,13 @@ def test_backwards_compatible_index_multi_level_named( header=0, engine='python' ).sort_index() - table = _read_table(datadir / 'v0.7.1.all-named-index.parquet', - use_legacy_dataset=use_legacy_dataset) + table = _read_table(datadir / 'v0.7.1.all-named-index.parquet') result = table.to_pandas() tm.assert_frame_equal(result, expected) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_backwards_compatible_index_multi_level_some_named( - datadir, use_legacy_dataset -): +def test_backwards_compatible_index_multi_level_some_named(datadir): expected_string = b"""\ carat cut color clarity depth table price x y z 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 @@ -456,17 +397,13 @@ def test_backwards_compatible_index_multi_level_some_named( ).sort_index() 
expected.index = expected.index.set_names(['cut', None, 'clarity']) - table = _read_table(datadir / 'v0.7.1.some-named-index.parquet', - use_legacy_dataset=use_legacy_dataset) + table = _read_table(datadir / 'v0.7.1.some-named-index.parquet') result = table.to_pandas() tm.assert_frame_equal(result, expected) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_backwards_compatible_column_metadata_handling( - datadir, use_legacy_dataset -): +def test_backwards_compatible_column_metadata_handling(datadir): expected = pd.DataFrame( {'a': [1, 2, 3], 'b': [.1, .2, .3], 'c': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')}) @@ -476,19 +413,18 @@ def test_backwards_compatible_column_metadata_handling( names=['index', None]) path = datadir / 'v0.7.1.column-metadata-handling.parquet' - table = _read_table(path, use_legacy_dataset=use_legacy_dataset) + table = _read_table(path) result = table.to_pandas() tm.assert_frame_equal(result, expected) table = _read_table( - path, columns=['a'], use_legacy_dataset=use_legacy_dataset) + path, columns=['a']) result = table.to_pandas() tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True)) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_categorical_index_survives_roundtrip(use_legacy_dataset): +def test_categorical_index_survives_roundtrip(): # ARROW-3652, addressed by ARROW-3246 df = pd.DataFrame([['a', 'b'], ['c', 'd']], columns=['c1', 'c2']) df['c1'] = df['c1'].astype('category') @@ -497,15 +433,13 @@ def test_categorical_index_survives_roundtrip(use_legacy_dataset): table = pa.Table.from_pandas(df) bos = pa.BufferOutputStream() pq.write_table(table, bos) - ref_df = pq.read_pandas( - bos.getvalue(), use_legacy_dataset=use_legacy_dataset).to_pandas() + ref_df = pq.read_pandas(bos.getvalue()).to_pandas() assert isinstance(ref_df.index, pd.CategoricalIndex) assert ref_df.index.equals(df.index) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_categorical_order_survives_roundtrip(use_legacy_dataset): +def test_categorical_order_survives_roundtrip(): # ARROW-6302 df = pd.DataFrame({"a": pd.Categorical( ["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=True)}) @@ -515,15 +449,13 @@ def test_categorical_order_survives_roundtrip(use_legacy_dataset): pq.write_table(table, bos) contents = bos.getvalue() - result = pq.read_pandas( - contents, use_legacy_dataset=use_legacy_dataset).to_pandas() + result = pq.read_pandas(contents).to_pandas() tm.assert_frame_equal(result, df) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_categorical_na_type_row_groups(use_legacy_dataset): +def test_pandas_categorical_na_type_row_groups(): # ARROW-5085 df = pd.DataFrame({"col": [None] * 100, "int": [1.0] * 100}) df_category = df.astype({"col": "category", "int": "category"}) @@ -533,8 +465,7 @@ def test_pandas_categorical_na_type_row_groups(use_legacy_dataset): # it works pq.write_table(table_cat, buf, version='2.6', chunk_size=10) - result = pq.read_table( - buf.getvalue(), use_legacy_dataset=use_legacy_dataset) + result = pq.read_table(buf.getvalue()) # Result is non-categorical assert result[0].equals(table[0]) @@ -542,8 +473,7 @@ def test_pandas_categorical_na_type_row_groups(use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_categorical_roundtrip(use_legacy_dataset): +def test_pandas_categorical_roundtrip(): # ARROW-5480, this was enabled by ARROW-3246 # Have one of the categories unobserved and include a null (-1) @@ -555,8 +485,7 @@ def 
test_pandas_categorical_roundtrip(use_legacy_dataset): buf = pa.BufferOutputStream() pq.write_table(pa.table(df), buf) - result = pq.read_table( - buf.getvalue(), use_legacy_dataset=use_legacy_dataset).to_pandas() + result = pq.read_table(buf.getvalue()).to_pandas() assert result.x.dtype == 'category' assert (result.x.cat.categories == categories).all() tm.assert_frame_equal(result, df) @@ -587,41 +516,28 @@ def test_categories_with_string_pyarrow_dtype(tempdir): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_pandas_preserve_extensiondtypes( - tempdir, use_legacy_dataset -): +def test_write_to_dataset_pandas_preserve_extensiondtypes(tempdir): df = pd.DataFrame({'part': 'a', "col": [1, 2, 3]}) df['col'] = df['col'].astype("Int64") table = pa.table(df) pq.write_to_dataset( table, str(tempdir / "case1"), partition_cols=['part'], - use_legacy_dataset=use_legacy_dataset ) - result = pq.read_table( - str(tempdir / "case1"), use_legacy_dataset=use_legacy_dataset - ).to_pandas() + result = pq.read_table(str(tempdir / "case1")).to_pandas() tm.assert_frame_equal(result[["col"]], df[["col"]]) - pq.write_to_dataset( - table, str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset - ) - result = pq.read_table( - str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset - ).to_pandas() + pq.write_to_dataset(table, str(tempdir / "case2")) + result = pq.read_table(str(tempdir / "case2")).to_pandas() tm.assert_frame_equal(result[["col"]], df[["col"]]) pq.write_table(table, str(tempdir / "data.parquet")) - result = pq.read_table( - str(tempdir / "data.parquet"), use_legacy_dataset=use_legacy_dataset - ).to_pandas() + result = pq.read_table(str(tempdir / "data.parquet")).to_pandas() tm.assert_frame_equal(result[["col"]], df[["col"]]) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_pandas_preserve_index(tempdir, use_legacy_dataset): +def test_write_to_dataset_pandas_preserve_index(tempdir): # ARROW-8251 - preserve pandas index in roundtrip df = pd.DataFrame({'part': ['a', 'a', 'b'], "col": [1, 2, 3]}) @@ -632,34 +548,24 @@ def test_write_to_dataset_pandas_preserve_index(tempdir, use_legacy_dataset): pq.write_to_dataset( table, str(tempdir / "case1"), partition_cols=['part'], - use_legacy_dataset=use_legacy_dataset ) - result = pq.read_table( - str(tempdir / "case1"), use_legacy_dataset=use_legacy_dataset - ).to_pandas() + result = pq.read_table(str(tempdir / "case1")).to_pandas() tm.assert_frame_equal(result, df_cat) - pq.write_to_dataset( - table, str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset - ) - result = pq.read_table( - str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset - ).to_pandas() + pq.write_to_dataset(table, str(tempdir / "case2")) + result = pq.read_table(str(tempdir / "case2")).to_pandas() tm.assert_frame_equal(result, df) pq.write_table(table, str(tempdir / "data.parquet")) - result = pq.read_table( - str(tempdir / "data.parquet"), use_legacy_dataset=use_legacy_dataset - ).to_pandas() + result = pq.read_table(str(tempdir / "data.parquet")).to_pandas() tm.assert_frame_equal(result, df) @pytest.mark.pandas -@parametrize_legacy_dataset @pytest.mark.parametrize('preserve_index', [True, False, None]) @pytest.mark.parametrize('metadata_fname', ["_metadata", "_common_metadata"]) def test_dataset_read_pandas_common_metadata( - tempdir, use_legacy_dataset, preserve_index, metadata_fname + tempdir, preserve_index, metadata_fname ): # ARROW-1103 nfiles = 5 @@ -696,7 +602,7 @@ def 
test_dataset_read_pandas_common_metadata( ) pq.write_metadata(table_for_metadata.schema, dirpath / metadata_fname) - dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(dirpath) columns = ['uint8', 'strings'] result = dataset.read_pandas(columns=columns).to_pandas() expected = pd.concat([x[columns] for x in frames]) diff --git a/python/pyarrow/tests/parquet/test_parquet_file.py b/python/pyarrow/tests/parquet/test_parquet_file.py index 9f920206a107e..93097a1afaac9 100644 --- a/python/pyarrow/tests/parquet/test_parquet_file.py +++ b/python/pyarrow/tests/parquet/test_parquet_file.py @@ -18,7 +18,6 @@ import io import os import sys -from unittest import mock import pytest @@ -296,28 +295,6 @@ def test_parquet_file_explicitly_closed(tempdir): table = pa.table({'col1': [0, 1], 'col2': [0, 1]}) pq.write_table(table, fn) - # read_table (legacy) with opened file (will leave open) - with open(fn, 'rb') as f: - pq.read_table(f, use_legacy_dataset=True) - assert not f.closed # Didn't close it internally after read_table - - # read_table (legacy) with unopened file (will close) - with mock.patch.object(pq.ParquetFile, "close") as mock_close: - pq.read_table(fn, use_legacy_dataset=True) - mock_close.assert_called() - - # ParquetDataset test (legacy) with unopened file (will close) - with mock.patch.object(pq.ParquetFile, "close") as mock_close: - pq.ParquetDataset(fn, use_legacy_dataset=True).read() - mock_close.assert_called() - - # ParquetDataset test (legacy) with opened file (will leave open) - with open(fn, 'rb') as f: - # ARROW-8075: support ParquetDataset from file-like, not just path-like - with pytest.raises(TypeError, match='not a path-like object'): - pq.ParquetDataset(f, use_legacy_dataset=True).read() - assert not f.closed - # ParquetFile with opened file (will leave open) with open(fn, 'rb') as f: with pq.ParquetFile(f) as p: @@ -338,7 +315,7 @@ def test_parquet_file_explicitly_closed(tempdir): @pytest.mark.s3 @pytest.mark.parametrize("use_uri", (True, False)) -def test_parquet_file_with_filesystem(tempdir, s3_example_fs, use_uri): +def test_parquet_file_with_filesystem(s3_example_fs, use_uri): s3_fs, s3_uri, s3_path = s3_example_fs args = (s3_uri if use_uri else s3_path,) diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py b/python/pyarrow/tests/parquet/test_parquet_writer.py index b902541015aa2..16584684f5c7f 100644 --- a/python/pyarrow/tests/parquet/test_parquet_writer.py +++ b/python/pyarrow/tests/parquet/test_parquet_writer.py @@ -20,7 +20,6 @@ import pyarrow as pa from pyarrow import fs from pyarrow.filesystem import FileSystem, LocalFileSystem -from pyarrow.tests.parquet.common import parametrize_legacy_dataset try: import pyarrow.parquet as pq @@ -44,8 +43,7 @@ @pytest.mark.pandas -@parametrize_legacy_dataset -def test_parquet_incremental_file_build(tempdir, use_legacy_dataset): +def test_parquet_incremental_file_build(tempdir): df = _test_dataframe(100) df['unique_id'] = 0 @@ -65,8 +63,7 @@ def test_parquet_incremental_file_build(tempdir, use_legacy_dataset): writer.close() buf = out.getvalue() - result = _read_table( - pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset) + result = _read_table(pa.BufferReader(buf)) expected = pd.concat(frames, ignore_index=True) tm.assert_frame_equal(result.to_pandas(), expected) @@ -105,8 +102,7 @@ def test_parquet_invalid_writer(tempdir): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_parquet_writer_context_obj(tempdir, use_legacy_dataset): +def 
test_parquet_writer_context_obj(tempdir): df = _test_dataframe(100) df['unique_id'] = 0 @@ -124,18 +120,14 @@ def test_parquet_writer_context_obj(tempdir, use_legacy_dataset): frames.append(df.copy()) buf = out.getvalue() - result = _read_table( - pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset) + result = _read_table(pa.BufferReader(buf)) expected = pd.concat(frames, ignore_index=True) tm.assert_frame_equal(result.to_pandas(), expected) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_parquet_writer_context_obj_with_exception( - tempdir, use_legacy_dataset -): +def test_parquet_writer_context_obj_with_exception(tempdir): df = _test_dataframe(100) df['unique_id'] = 0 @@ -160,8 +152,7 @@ def test_parquet_writer_context_obj_with_exception( assert str(e) == error_text buf = out.getvalue() - result = _read_table( - pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset) + result = _read_table(pa.BufferReader(buf)) expected = pd.concat(frames, ignore_index=True) tm.assert_frame_equal(result.to_pandas(), expected) @@ -340,8 +331,7 @@ def test_parquet_writer_filesystem_buffer_raises(): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_parquet_writer_with_caller_provided_filesystem(use_legacy_dataset): +def test_parquet_writer_with_caller_provided_filesystem(): out = pa.BufferOutputStream() class CustomFS(FileSystem): @@ -368,8 +358,7 @@ def open(self, path, mode='rb'): assert out.closed buf = out.getvalue() - table_read = _read_table( - pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(pa.BufferReader(buf)) df_read = table_read.to_pandas() tm.assert_frame_equal(df_read, df) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index a37eb1e426f7a..e2bb4400c8bde 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -1148,7 +1148,6 @@ def _create_dataset_for_fragments(tempdir, chunk_size=None, filesystem=None): path = str(tempdir / "test_parquet_dataset") - # write_to_dataset currently requires pandas pq.write_to_dataset(table, path, partition_cols=["part"], chunk_size=chunk_size) dataset = ds.dataset( @@ -1158,10 +1157,7 @@ def _create_dataset_for_fragments(tempdir, chunk_size=None, filesystem=None): return table, dataset -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments(tempdir, dataset_reader): table, dataset = _create_dataset_for_fragments(tempdir) @@ -1208,10 +1204,7 @@ def test_fragments_implicit_cast(tempdir): assert len(list(fragments)) == 1 -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_reconstruct(tempdir, dataset_reader, pickle_module): table, dataset = _create_dataset_for_fragments(tempdir) @@ -1272,10 +1265,7 @@ def assert_yields_projected(fragment, row_slice, dataset_reader.to_table(new_fragment, filter=ds.field('part') == 'a') -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_row_groups(tempdir, dataset_reader): table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2) @@ -1326,8 +1316,6 @@ def test_fragments_parquet_num_row_groups(tempdir): @pytest.mark.pandas @pytest.mark.parquet def test_fragments_parquet_row_groups_dictionary(tempdir, dataset_reader): - import pandas as pd - df = pd.DataFrame(dict(col1=['a', 
'b'], col2=[1, 2])) df['col1'] = df['col1'].astype("category") @@ -1340,10 +1328,7 @@ def test_fragments_parquet_row_groups_dictionary(tempdir, dataset_reader): assert (df.iloc[0] == result.to_pandas()).all().all() -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs, pickle_module): fs, assert_opens = open_logging_fs _, dataset = _create_dataset_for_fragments( @@ -1384,7 +1369,6 @@ def test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs, pickle_modu assert row_group.statistics is not None -@pytest.mark.pandas @pytest.mark.parquet def test_fragments_parquet_pickle_no_metadata(tempdir, open_logging_fs, pickle_module): # https://issues.apache.org/jira/browse/ARROW-15796 @@ -1454,16 +1438,13 @@ def _create_dataset_all_types(tempdir, chunk_size=None): path = str(tempdir / "test_parquet_dataset_all_types") # write_to_dataset currently requires pandas - pq.write_to_dataset(table, path, use_legacy_dataset=True, - chunk_size=chunk_size) + pq.write_to_dataset(table, path, chunk_size=chunk_size) return table, ds.dataset(path, format="parquet", partitioning="hive") @pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_parquet_fragment_statistics(tempdir): table, dataset = _create_dataset_all_types(tempdir) @@ -1529,10 +1510,7 @@ def test_parquet_empty_row_group_statistics(tempdir): assert fragments[0].row_groups[0].statistics == {} -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_row_groups_predicate(tempdir): table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2) @@ -1555,10 +1533,7 @@ def test_fragments_parquet_row_groups_predicate(tempdir): assert len(row_group_fragments) == 0 -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_row_groups_reconstruct(tempdir, dataset_reader, pickle_module): table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2) @@ -1600,10 +1575,7 @@ def test_fragments_parquet_row_groups_reconstruct(tempdir, dataset_reader, dataset_reader.to_table(new_fragment) -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_subset_ids(tempdir, open_logging_fs, dataset_reader): fs, assert_opens = open_logging_fs @@ -1631,10 +1603,7 @@ def test_fragments_parquet_subset_ids(tempdir, open_logging_fs, assert result.equals(table[:0]) -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_subset_filter(tempdir, open_logging_fs, dataset_reader): fs, assert_opens = open_logging_fs @@ -1666,10 +1635,7 @@ def test_fragments_parquet_subset_filter(tempdir, open_logging_fs, assert subfrag.num_row_groups == 4 -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_subset_invalid(tempdir): _, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1) fragment = list(dataset.get_fragments())[0] @@ -3591,10 +3557,7 @@ def test_parquet_dataset_factory_fsspec(tempdir): @pytest.mark.parquet @pytest.mark.pandas # 
write_to_dataset currently requires pandas -@pytest.mark.parametrize('use_legacy_dataset', [False, True]) -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") -def test_parquet_dataset_factory_roundtrip(tempdir, use_legacy_dataset): +def test_parquet_dataset_factory_roundtrip(tempdir): # Simple test to ensure we can roundtrip dataset to # _metadata/common_metadata and back. A more complex test # using partitioning will have to wait for ARROW-13269. The @@ -3606,7 +3569,6 @@ def test_parquet_dataset_factory_roundtrip(tempdir, use_legacy_dataset): metadata_collector = [] pq.write_to_dataset( table, str(root_path), metadata_collector=metadata_collector, - use_legacy_dataset=use_legacy_dataset ) metadata_path = str(root_path / '_metadata') # write _metadata file @@ -3820,7 +3782,6 @@ def test_dataset_project_only_partition_columns(tempdir, dataset_reader): @pytest.mark.parquet @pytest.mark.pandas def test_dataset_project_null_column(tempdir, dataset_reader): - import pandas as pd df = pd.DataFrame({"col": np.array([None, None, None], dtype='object')}) f = tempdir / "test_dataset_project_null_column.parquet" @@ -3930,8 +3891,7 @@ def test_write_to_dataset_given_null_just_works(tempdir): 'col': list(range(4))}, schema=schema) path = str(tempdir / 'test_dataset') - pq.write_to_dataset(table, path, partition_cols=[ - 'part'], use_legacy_dataset=False) + pq.write_to_dataset(table, path, partition_cols=['part']) actual_table = pq.read_table(tempdir / 'test_dataset') # column.equals can handle the difference in chunking but not the fact @@ -3941,28 +3901,6 @@ def test_write_to_dataset_given_null_just_works(tempdir): assert actual_table.column('col').equals(table.column('col')) -@pytest.mark.parquet -@pytest.mark.pandas -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") -def test_legacy_write_to_dataset_drops_null(tempdir): - schema = pa.schema([ - pa.field('col', pa.int64()), - pa.field('part', pa.dictionary(pa.int32(), pa.string())) - ]) - table = pa.table({'part': ['a', 'a', None, None], - 'col': list(range(4))}, schema=schema) - expected = pa.table( - {'part': ['a', 'a'], 'col': list(range(2))}, schema=schema) - - path = str(tempdir / 'test_dataset') - pq.write_to_dataset(table, path, partition_cols=[ - 'part'], use_legacy_dataset=True) - - actual = pq.read_table(tempdir / 'test_dataset') - assert actual == expected - - def _sort_table(tab, sort_col): import pyarrow.compute as pc sorted_indices = pc.sort_indices( diff --git a/python/pyarrow/tests/test_hdfs.py b/python/pyarrow/tests/test_hdfs.py index 511dbf9a1c4e1..5b94c200f35de 100644 --- a/python/pyarrow/tests/test_hdfs.py +++ b/python/pyarrow/tests/test_hdfs.py @@ -27,7 +27,7 @@ from pyarrow.tests import util from pyarrow.tests.parquet.common import _test_dataframe from pyarrow.tests.parquet.test_dataset import ( - _test_read_common_metadata_files, _test_write_to_dataset_with_partitions, + _test_write_to_dataset_with_partitions, _test_write_to_dataset_no_partitions ) from pyarrow.util import guid @@ -309,6 +309,9 @@ def _write_multiple_hdfs_pq_files(self, tmpdir): expected = pa.concat_tables(test_data) return expected + @pytest.mark.xfail(reason="legacy.FileSystem not supported with ParquetDataset " + "due to legacy path being removed in PyArrow 15.0.0.", + raises=TypeError) @pytest.mark.pandas @pytest.mark.parquet def test_read_multiple_parquet_files(self): @@ -343,6 +346,9 @@ def test_read_multiple_parquet_files_with_uri(self): expected.to_pandas() ) + 
@pytest.mark.xfail(reason="legacy.FileSystem not supported with ParquetDataset " + "due to legacy path being removed in PyArrow 15.0.0.", + raises=TypeError) @pytest.mark.pandas @pytest.mark.parquet def test_read_write_parquet_files_with_uri(self): @@ -360,19 +366,13 @@ def test_read_write_parquet_files_with_uri(self): pq.write_table(table, path, filesystem=self.hdfs) - result = pq.read_table( - path, filesystem=self.hdfs, use_legacy_dataset=True - ).to_pandas() + result = pq.read_table(path, filesystem=self.hdfs).to_pandas() assert_frame_equal(result, df) - @pytest.mark.parquet - @pytest.mark.pandas - def test_read_common_metadata_files(self): - tmpdir = pjoin(self.tmp_path, 'common-metadata-' + guid()) - self.hdfs.mkdir(tmpdir) - _test_read_common_metadata_files(self.hdfs, tmpdir) - + @pytest.mark.xfail(reason="legacy.FileSystem not supported with ParquetDataset " + "due to legacy path being removed in PyArrow 15.0.0.", + raises=TypeError) @pytest.mark.parquet @pytest.mark.pandas def test_write_to_dataset_with_partitions(self): @@ -381,6 +381,9 @@ def test_write_to_dataset_with_partitions(self): _test_write_to_dataset_with_partitions( tmpdir, filesystem=self.hdfs) + @pytest.mark.xfail(reason="legacy.FileSystem not supported with ParquetDataset " + "due to legacy path being removed in PyArrow 15.0.0.", + raises=TypeError) @pytest.mark.parquet @pytest.mark.pandas def test_write_to_dataset_no_partitions(self): diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index 9745393d01bbc..bdaac0a9ce5d2 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -275,7 +275,8 @@ struct AltrepVectorPrimitive : public AltrepVectorBase(R_ExternalPtrAddr(R_altrep_data1(alt))); auto resolve = altrep_data->locate(i); - const auto& array = altrep_data->chunked_array()->chunk(resolve.chunk_index); + const auto& array = + altrep_data->chunked_array()->chunk(static_cast(resolve.chunk_index)); auto j = resolve.index_in_chunk; return array->IsNull(j) ? 
cpp11::na() @@ -466,10 +467,10 @@ struct AltrepFactor : public AltrepVectorBase { std::unique_ptr unifier_ = ValueOrStop(DictionaryUnifier::Make(arr_type.value_type())); - size_t n_arrays = chunked_array->num_chunks(); + int n_arrays = chunked_array->num_chunks(); BufferVector arrays_transpose(n_arrays); - for (size_t i = 0; i < n_arrays; i++) { + for (int i = 0; i < n_arrays; i++) { const auto& dict_i = *internal::checked_cast(*chunked_array->chunk(i)) .dictionary(); @@ -559,17 +560,14 @@ struct AltrepFactor : public AltrepVectorBase { return dup; } - // The value at position i - static int Elt(SEXP alt, R_xlen_t i) { - if (Base::IsMaterialized(alt)) { - return INTEGER_ELT(Representation(alt), i); - } - + // The value at position i as an int64_t (to make bounds checking less verbose) + static int64_t Elt64(SEXP alt, R_xlen_t i) { auto altrep_data = reinterpret_cast(R_ExternalPtrAddr(R_altrep_data1(alt))); auto resolve = altrep_data->locate(i); - const auto& array = altrep_data->chunked_array()->chunk(resolve.chunk_index); + const auto& array = + altrep_data->chunked_array()->chunk(static_cast(resolve.chunk_index)); auto j = resolve.index_in_chunk; if (!array->IsNull(j)) { @@ -578,7 +576,7 @@ struct AltrepFactor : public AltrepVectorBase { if (WasUnified(alt)) { const auto* transpose_data = reinterpret_cast( - GetArrayTransposed(alt, resolve.chunk_index)->data()); + GetArrayTransposed(alt, static_cast(resolve.chunk_index))->data()); switch (indices->type_id()) { case Type::UINT8: @@ -617,7 +615,7 @@ struct AltrepFactor : public AltrepVectorBase { case Type::INT64: return indices->data()->GetValues(1)[j] + 1; case Type::UINT64: - return indices->data()->GetValues(1)[j] + 1; + return static_cast(indices->data()->GetValues(1)[j] + 1); default: break; } @@ -628,6 +626,18 @@ struct AltrepFactor : public AltrepVectorBase { return NA_INTEGER; } + // The value at position i as an int (which R needs because this is a factor) + static int Elt(SEXP alt, R_xlen_t i) { + if (Base::IsMaterialized(alt)) { + return INTEGER_ELT(Representation(alt), i); + } + + int64_t elt64 = Elt64(alt, i); + ARROW_R_DCHECK(elt64 == NA_INTEGER || elt64 >= 1); + ARROW_R_DCHECK(elt64 <= std::numeric_limits::max()); + return static_cast(elt64); + } + static R_xlen_t Get_region(SEXP alt, R_xlen_t start, R_xlen_t n, int* buf) { // If we have data2, we can just copy the region into buf // using the standard Get_region for this R type @@ -667,7 +677,7 @@ struct AltrepFactor : public AltrepVectorBase { // using the transpose data for this chunk const auto* transpose_data = reinterpret_cast(GetArrayTransposed(alt, j)->data()); - auto transpose = [transpose_data](int x) { return transpose_data[x]; }; + auto transpose = [transpose_data](int64_t x) { return transpose_data[x]; }; GetRegionDispatch(array, indices, transpose, out); @@ -677,7 +687,7 @@ struct AltrepFactor : public AltrepVectorBase { } else { // simpler case, identity transpose - auto transpose = [](int x) { return x; }; + auto transpose = [](int64_t x) { return static_cast(x); }; int* out = buf; for (const auto& array : slice->chunks()) { @@ -718,7 +728,13 @@ struct AltrepFactor : public AltrepVectorBase { VisitArraySpanInline( *array->data(), - /*valid_func=*/[&](index_type index) { *out++ = transpose(index) + 1; }, + /*valid_func=*/ + [&](index_type index) { + int64_t transposed = transpose(index) + 1; + ARROW_R_DCHECK(transposed >= 1); + ARROW_R_DCHECK(transposed <= std::numeric_limits::max()); + *out++ = static_cast(transposed); + }, /*null_func=*/[&]() { *out++ = 
cpp11::na(); }); } @@ -765,7 +781,8 @@ struct AltrepVectorString : public AltrepVectorBase> { bool no_nul = std::find(view_.begin(), view_.end(), '\0') == view_.end(); if (no_nul) { - return Rf_mkCharLenCE(view_.data(), view_.size(), CE_UTF8); + ARROW_R_DCHECK(view_.size() <= std::numeric_limits::max()); + return Rf_mkCharLenCE(view_.data(), static_cast(view_.size()), CE_UTF8); } else if (strip_out_nuls_) { return ConvertStripNul(); } else { @@ -802,7 +819,9 @@ struct AltrepVectorString : public AltrepVectorBase> { } nul_was_stripped_ = true; - return Rf_mkCharLenCE(stripped_string_.data(), stripped_len, CE_UTF8); + ARROW_R_DCHECK(stripped_len <= std::numeric_limits::max()); + return Rf_mkCharLenCE(stripped_string_.data(), static_cast(stripped_len), + CE_UTF8); } bool nul_was_stripped() const { return nul_was_stripped_; } @@ -847,7 +866,8 @@ struct AltrepVectorString : public AltrepVectorBase> { auto altrep_data = reinterpret_cast(R_ExternalPtrAddr(R_altrep_data1(alt))); auto resolve = altrep_data->locate(i); - const auto& array = altrep_data->chunked_array()->chunk(resolve.chunk_index); + const auto& array = + altrep_data->chunked_array()->chunk(static_cast(resolve.chunk_index)); auto j = resolve.index_in_chunk; SEXP s = NA_STRING; diff --git a/r/src/array.cpp b/r/src/array.cpp index ae76c01a94910..38406e494d67b 100644 --- a/r/src/array.cpp +++ b/r/src/array.cpp @@ -92,7 +92,7 @@ std::shared_ptr Array__Slice2(const std::shared_ptr& return array->Slice(offset, length); } -void arrow::r::validate_index(int i, int len) { +void arrow::r::validate_index(int64_t i, int64_t len) { if (i == NA_INTEGER) { cpp11::stop("'i' cannot be NA"); } @@ -119,10 +119,14 @@ r_vec_size Array__length(const std::shared_ptr& x) { } // [[arrow::export]] -int Array__offset(const std::shared_ptr& x) { return x->offset(); } +r_vec_size Array__offset(const std::shared_ptr& x) { + return r_vec_size(x->offset()); +} // [[arrow::export]] -int Array__null_count(const std::shared_ptr& x) { return x->null_count(); } +r_vec_size Array__null_count(const std::shared_ptr& x) { + return r_vec_size(x->null_count()); +} // [[arrow::export]] std::shared_ptr Array__type(const std::shared_ptr& x) { @@ -263,9 +267,9 @@ r_vec_size LargeListArray__value_length( } // [[arrow::export]] -r_vec_size FixedSizeListArray__value_length( +int FixedSizeListArray__value_length( const std::shared_ptr& array, int64_t i) { - return r_vec_size(array->value_length(i)); + return array->value_length(i); } // [[arrow::export]] @@ -294,10 +298,10 @@ cpp11::writable::integers ListArray__raw_value_offsets( } // [[arrow::export]] -cpp11::writable::integers LargeListArray__raw_value_offsets( +cpp11::writable::doubles LargeListArray__raw_value_offsets( const std::shared_ptr& array) { auto offsets = array->raw_value_offsets(); - return cpp11::writable::integers(offsets, offsets + array->length()); + return cpp11::writable::doubles(offsets, offsets + array->length()); } // [[arrow::export]] diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index bf026d2723a1a..2f0508eb7a47a 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -375,7 +375,7 @@ struct Converter_String : public Converter { private: static SEXP r_string_from_view(std::string_view view) { - return Rf_mkCharLenCE(view.data(), view.size(), CE_UTF8); + return Rf_mkCharLenCE(view.data(), static_cast(view.size()), CE_UTF8); } static SEXP r_string_from_view_strip_nul(std::string_view view, @@ -576,10 +576,10 @@ class Converter_Dictionary : public Converter { const auto& 
arr_type = checked_cast(*chunked_array->type()); unifier_ = ValueOrStop(DictionaryUnifier::Make(arr_type.value_type())); - size_t n_arrays = chunked_array->num_chunks(); + int n_arrays = chunked_array->num_chunks(); arrays_transpose_.resize(n_arrays); - for (size_t i = 0; i < n_arrays; i++) { + for (int i = 0; i < n_arrays; i++) { const auto& dict_i = *checked_cast(*chunked_array->chunk(i)).dictionary(); StopIfNotOk(unifier_->Unify(dict_i, &arrays_transpose_[i])); @@ -748,7 +748,7 @@ class Converter_Struct : public Converter { auto colnames = arrow::r::to_r_strings( type->fields(), [](const std::shared_ptr& field) { return field->name(); }); - out.attr(symbols::row_names) = arrow::r::short_row_names(n); + out.attr(symbols::row_names) = arrow::r::short_row_names(static_cast(n)); out.attr(R_NamesSymbol) = colnames; out.attr(R_ClassSymbol) = arrow::r::data::classes_tbl_df; @@ -756,7 +756,7 @@ class Converter_Struct : public Converter { } Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const { - int nf = converters.size(); + int nf = static_cast(converters.size()); for (int i = 0; i < nf; i++) { SEXP data_i = VECTOR_ELT(data, i); @@ -771,7 +771,7 @@ class Converter_Struct : public Converter { Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, R_xlen_t start, R_xlen_t n, size_t chunk_index) const { auto struct_array = checked_cast(array.get()); - int nf = converters.size(); + int nf = static_cast(converters.size()); // Flatten() deals with merging of nulls auto arrays = ValueOrStop(struct_array->Flatten(gc_memory_pool())); for (int i = 0; i < nf; i++) { @@ -1384,7 +1384,7 @@ cpp11::writable::list to_data_frame(const std::shared_ptr& data, tbl.attr(R_NamesSymbol) = names; tbl.attr(R_ClassSymbol) = arrow::r::data::classes_tbl_df; - tbl.attr(R_RowNamesSymbol) = arrow::r::short_row_names(nr); + tbl.attr(R_RowNamesSymbol) = arrow::r::short_row_names(static_cast(nr)); return tbl; } diff --git a/r/src/arraydata.cpp b/r/src/arraydata.cpp index cdab38f1147aa..d879e807323af 100644 --- a/r/src/arraydata.cpp +++ b/r/src/arraydata.cpp @@ -26,18 +26,18 @@ std::shared_ptr ArrayData__get_type( } // [[arrow::export]] -int ArrayData__get_length(const std::shared_ptr& x) { - return x->length; +r_vec_size ArrayData__get_length(const std::shared_ptr& x) { + return r_vec_size(x->length); } // [[arrow::export]] -int ArrayData__get_null_count(const std::shared_ptr& x) { - return x->null_count; +r_vec_size ArrayData__get_null_count(const std::shared_ptr& x) { + return r_vec_size(x->null_count); } // [[arrow::export]] -int ArrayData__get_offset(const std::shared_ptr& x) { - return x->offset; +r_vec_size ArrayData__get_offset(const std::shared_ptr& x) { + return r_vec_size(x->offset); } // [[arrow::export]] diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 790207efce1d2..75e0f27b4002e 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -110,7 +110,7 @@ BEGIN_CPP11 END_CPP11 } // array.cpp -int Array__offset(const std::shared_ptr& x); +r_vec_size Array__offset(const std::shared_ptr& x); extern "C" SEXP _arrow_Array__offset(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -118,7 +118,7 @@ BEGIN_CPP11 END_CPP11 } // array.cpp -int Array__null_count(const std::shared_ptr& x); +r_vec_size Array__null_count(const std::shared_ptr& x); extern "C" SEXP _arrow_Array__null_count(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -315,7 +315,7 @@ BEGIN_CPP11 END_CPP11 } // array.cpp -r_vec_size FixedSizeListArray__value_length(const 
std::shared_ptr& array, int64_t i); +int FixedSizeListArray__value_length(const std::shared_ptr& array, int64_t i); extern "C" SEXP _arrow_FixedSizeListArray__value_length(SEXP array_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type array(array_sexp); @@ -359,7 +359,7 @@ BEGIN_CPP11 END_CPP11 } // array.cpp -cpp11::writable::integers LargeListArray__raw_value_offsets(const std::shared_ptr& array); +cpp11::writable::doubles LargeListArray__raw_value_offsets(const std::shared_ptr& array); extern "C" SEXP _arrow_LargeListArray__raw_value_offsets(SEXP array_sexp){ BEGIN_CPP11 arrow::r::Input&>::type array(array_sexp); @@ -467,7 +467,7 @@ BEGIN_CPP11 END_CPP11 } // arraydata.cpp -int ArrayData__get_length(const std::shared_ptr& x); +r_vec_size ArrayData__get_length(const std::shared_ptr& x); extern "C" SEXP _arrow_ArrayData__get_length(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -475,7 +475,7 @@ BEGIN_CPP11 END_CPP11 } // arraydata.cpp -int ArrayData__get_null_count(const std::shared_ptr& x); +r_vec_size ArrayData__get_null_count(const std::shared_ptr& x); extern "C" SEXP _arrow_ArrayData__get_null_count(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -483,7 +483,7 @@ BEGIN_CPP11 END_CPP11 } // arraydata.cpp -int ArrayData__get_offset(const std::shared_ptr& x); +r_vec_size ArrayData__get_offset(const std::shared_ptr& x); extern "C" SEXP _arrow_ArrayData__get_offset(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -765,7 +765,7 @@ BEGIN_CPP11 END_CPP11 } // chunkedarray.cpp -r_vec_size ChunkedArray__num_chunks(const std::shared_ptr& chunked_array); +int ChunkedArray__num_chunks(const std::shared_ptr& chunked_array); extern "C" SEXP _arrow_ChunkedArray__num_chunks(SEXP chunked_array_sexp){ BEGIN_CPP11 arrow::r::Input&>::type chunked_array(chunked_array_sexp); @@ -869,11 +869,11 @@ BEGIN_CPP11 END_CPP11 } // compression.cpp -std::shared_ptr util___Codec__Create(arrow::Compression::type codec, R_xlen_t compression_level); +std::shared_ptr util___Codec__Create(arrow::Compression::type codec, int compression_level); extern "C" SEXP _arrow_util___Codec__Create(SEXP codec_sexp, SEXP compression_level_sexp){ BEGIN_CPP11 arrow::r::Input::type codec(codec_sexp); - arrow::r::Input::type compression_level(compression_level_sexp); + arrow::r::Input::type compression_level(compression_level_sexp); return cpp11::as_sexp(util___Codec__Create(codec, compression_level)); END_CPP11 } @@ -2024,14 +2024,14 @@ extern "C" SEXP _arrow_dataset___JsonFragmentScanOptions__Make(SEXP parse_option // dataset.cpp #if defined(ARROW_R_WITH_DATASET) -std::shared_ptr dataset___ParquetFragmentScanOptions__Make(bool use_buffered_stream, int64_t buffer_size, bool pre_buffer, int64_t thrift_string_size_limit, int64_t thrift_container_size_limit); +std::shared_ptr dataset___ParquetFragmentScanOptions__Make(bool use_buffered_stream, int64_t buffer_size, bool pre_buffer, int32_t thrift_string_size_limit, int32_t thrift_container_size_limit); extern "C" SEXP _arrow_dataset___ParquetFragmentScanOptions__Make(SEXP use_buffered_stream_sexp, SEXP buffer_size_sexp, SEXP pre_buffer_sexp, SEXP thrift_string_size_limit_sexp, SEXP thrift_container_size_limit_sexp){ BEGIN_CPP11 arrow::r::Input::type use_buffered_stream(use_buffered_stream_sexp); arrow::r::Input::type buffer_size(buffer_size_sexp); arrow::r::Input::type pre_buffer(pre_buffer_sexp); - arrow::r::Input::type thrift_string_size_limit(thrift_string_size_limit_sexp); - arrow::r::Input::type 
thrift_container_size_limit(thrift_container_size_limit_sexp); + arrow::r::Input::type thrift_string_size_limit(thrift_string_size_limit_sexp); + arrow::r::Input::type thrift_container_size_limit(thrift_container_size_limit_sexp); return cpp11::as_sexp(dataset___ParquetFragmentScanOptions__Make(use_buffered_stream, buffer_size, pre_buffer, thrift_string_size_limit, thrift_container_size_limit)); END_CPP11 } @@ -2567,10 +2567,10 @@ BEGIN_CPP11 END_CPP11 } // datatype.cpp -std::shared_ptr FixedSizeBinary__initialize(R_xlen_t byte_width); +std::shared_ptr FixedSizeBinary__initialize(int32_t byte_width); extern "C" SEXP _arrow_FixedSizeBinary__initialize(SEXP byte_width_sexp){ BEGIN_CPP11 - arrow::r::Input::type byte_width(byte_width_sexp); + arrow::r::Input::type byte_width(byte_width_sexp); return cpp11::as_sexp(FixedSizeBinary__initialize(byte_width)); END_CPP11 } @@ -3976,7 +3976,7 @@ BEGIN_CPP11 END_CPP11 } // message.cpp -r_vec_size ipc___Message__Verify(const std::unique_ptr& message); +bool ipc___Message__Verify(const std::unique_ptr& message); extern "C" SEXP _arrow_ipc___Message__Verify(SEXP message_sexp){ BEGIN_CPP11 arrow::r::Input&>::type message(message_sexp); @@ -4684,7 +4684,7 @@ BEGIN_CPP11 END_CPP11 } // recordbatch.cpp -r_vec_size RecordBatch__num_columns(const std::shared_ptr& x); +int RecordBatch__num_columns(const std::shared_ptr& x); extern "C" SEXP _arrow_RecordBatch__num_columns(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -4734,11 +4734,11 @@ BEGIN_CPP11 END_CPP11 } // recordbatch.cpp -std::shared_ptr RecordBatch__column(const std::shared_ptr& batch, R_xlen_t i); +std::shared_ptr RecordBatch__column(const std::shared_ptr& batch, int i); extern "C" SEXP _arrow_RecordBatch__column(SEXP batch_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type batch(batch_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); return cpp11::as_sexp(RecordBatch__column(batch, i)); END_CPP11 } @@ -4771,42 +4771,42 @@ BEGIN_CPP11 END_CPP11 } // recordbatch.cpp -std::shared_ptr RecordBatch__AddColumn(const std::shared_ptr& batch, R_xlen_t i, const std::shared_ptr& field, const std::shared_ptr& column); +std::shared_ptr RecordBatch__AddColumn(const std::shared_ptr& batch, int i, const std::shared_ptr& field, const std::shared_ptr& column); extern "C" SEXP _arrow_RecordBatch__AddColumn(SEXP batch_sexp, SEXP i_sexp, SEXP field_sexp, SEXP column_sexp){ BEGIN_CPP11 arrow::r::Input&>::type batch(batch_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); arrow::r::Input&>::type field(field_sexp); arrow::r::Input&>::type column(column_sexp); return cpp11::as_sexp(RecordBatch__AddColumn(batch, i, field, column)); END_CPP11 } // recordbatch.cpp -std::shared_ptr RecordBatch__SetColumn(const std::shared_ptr& batch, R_xlen_t i, const std::shared_ptr& field, const std::shared_ptr& column); +std::shared_ptr RecordBatch__SetColumn(const std::shared_ptr& batch, int i, const std::shared_ptr& field, const std::shared_ptr& column); extern "C" SEXP _arrow_RecordBatch__SetColumn(SEXP batch_sexp, SEXP i_sexp, SEXP field_sexp, SEXP column_sexp){ BEGIN_CPP11 arrow::r::Input&>::type batch(batch_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); arrow::r::Input&>::type field(field_sexp); arrow::r::Input&>::type column(column_sexp); return cpp11::as_sexp(RecordBatch__SetColumn(batch, i, field, column)); END_CPP11 } // recordbatch.cpp -std::shared_ptr RecordBatch__RemoveColumn(const std::shared_ptr& batch, R_xlen_t i); 
+std::shared_ptr RecordBatch__RemoveColumn(const std::shared_ptr& batch, int i); extern "C" SEXP _arrow_RecordBatch__RemoveColumn(SEXP batch_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type batch(batch_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); return cpp11::as_sexp(RecordBatch__RemoveColumn(batch, i)); END_CPP11 } // recordbatch.cpp -std::string RecordBatch__column_name(const std::shared_ptr& batch, R_xlen_t i); +std::string RecordBatch__column_name(const std::shared_ptr& batch, int i); extern "C" SEXP _arrow_RecordBatch__column_name(SEXP batch_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type batch(batch_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); return cpp11::as_sexp(RecordBatch__column_name(batch, i)); END_CPP11 } @@ -5346,7 +5346,7 @@ BEGIN_CPP11 END_CPP11 } // table.cpp -r_vec_size Table__num_columns(const std::shared_ptr& x); +int Table__num_columns(const std::shared_ptr& x); extern "C" SEXP _arrow_Table__num_columns(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -5379,20 +5379,20 @@ BEGIN_CPP11 END_CPP11 } // table.cpp -std::shared_ptr Table__column(const std::shared_ptr& table, R_xlen_t i); +std::shared_ptr Table__column(const std::shared_ptr& table, int i); extern "C" SEXP _arrow_Table__column(SEXP table_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type table(table_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); return cpp11::as_sexp(Table__column(table, i)); END_CPP11 } // table.cpp -std::shared_ptr Table__field(const std::shared_ptr& table, R_xlen_t i); +std::shared_ptr Table__field(const std::shared_ptr& table, int i); extern "C" SEXP _arrow_Table__field(SEXP table_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type table(table_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); return cpp11::as_sexp(Table__field(table, i)); END_CPP11 } @@ -5476,31 +5476,31 @@ BEGIN_CPP11 END_CPP11 } // table.cpp -std::shared_ptr Table__RemoveColumn(const std::shared_ptr& table, R_xlen_t i); +std::shared_ptr Table__RemoveColumn(const std::shared_ptr& table, int i); extern "C" SEXP _arrow_Table__RemoveColumn(SEXP table_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type table(table_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); return cpp11::as_sexp(Table__RemoveColumn(table, i)); END_CPP11 } // table.cpp -std::shared_ptr Table__AddColumn(const std::shared_ptr& table, R_xlen_t i, const std::shared_ptr& field, const std::shared_ptr& column); +std::shared_ptr Table__AddColumn(const std::shared_ptr& table, int i, const std::shared_ptr& field, const std::shared_ptr& column); extern "C" SEXP _arrow_Table__AddColumn(SEXP table_sexp, SEXP i_sexp, SEXP field_sexp, SEXP column_sexp){ BEGIN_CPP11 arrow::r::Input&>::type table(table_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); arrow::r::Input&>::type field(field_sexp); arrow::r::Input&>::type column(column_sexp); return cpp11::as_sexp(Table__AddColumn(table, i, field, column)); END_CPP11 } // table.cpp -std::shared_ptr Table__SetColumn(const std::shared_ptr& table, R_xlen_t i, const std::shared_ptr& field, const std::shared_ptr& column); +std::shared_ptr Table__SetColumn(const std::shared_ptr& table, int i, const std::shared_ptr& field, const std::shared_ptr& column); extern "C" SEXP _arrow_Table__SetColumn(SEXP table_sexp, SEXP i_sexp, SEXP field_sexp, SEXP column_sexp){ BEGIN_CPP11 arrow::r::Input&>::type table(table_sexp); - 
arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); arrow::r::Input&>::type field(field_sexp); arrow::r::Input&>::type column(column_sexp); return cpp11::as_sexp(Table__SetColumn(table, i, field, column)); diff --git a/r/src/arrow_cpp11.h b/r/src/arrow_cpp11.h index d8c4b719d1d3e..ab60586628164 100644 --- a/r/src/arrow_cpp11.h +++ b/r/src/arrow_cpp11.h @@ -27,6 +27,18 @@ #include "./nameof.h" +// Simple dcheck that doesn't use assert (i.e., won't crash the R session) +// Condition this on our own debug flag to avoid this ending up in any CRAN +// checks. +#if defined(ARROW_R_DEBUG) +#define ARROW_R_DCHECK(EXPR) \ + do { \ + if (!(EXPR)) Rf_error("Failed DCHECK: %s evaluated to false", #EXPR); \ + } while (false) +#else +#define ARROW_R_DCHECK(EXPR) +#endif + // borrowed from enc package // because R does not make these macros available (i.e. from Defn.h) #define UTF8_MASK (1 << 3) @@ -465,7 +477,7 @@ inline SEXP as_sexp(r_vec_size size) { if (x > std::numeric_limits::max()) { return Rf_ScalarReal(x); } else { - return Rf_ScalarInteger(x); + return Rf_ScalarInteger(static_cast(x)); } } diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index fadc39c75fc06..05c8f6062dabb 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -189,13 +189,13 @@ void validate_slice_offset(R_xlen_t offset, int64_t len); void validate_slice_length(R_xlen_t length, int64_t available); -void validate_index(int i, int len); +void validate_index(int64_t i, int64_t len); template void TraverseDots(cpp11::list dots, int num_fields, Lambda lambda) { cpp11::strings names(dots.attr(R_NamesSymbol)); - for (R_xlen_t i = 0, j = 0; j < num_fields; i++) { + for (int i = 0, j = 0; j < num_fields; i++) { auto name_i = names[i]; if (name_i.size() == 0) { diff --git a/r/src/chunkedarray.cpp b/r/src/chunkedarray.cpp index 36884bb531b62..258013fc4da57 100644 --- a/r/src/chunkedarray.cpp +++ b/r/src/chunkedarray.cpp @@ -34,9 +34,8 @@ r_vec_size ChunkedArray__null_count( } // [[arrow::export]] -r_vec_size ChunkedArray__num_chunks( - const std::shared_ptr& chunked_array) { - return r_vec_size(chunked_array->num_chunks()); +int ChunkedArray__num_chunks(const std::shared_ptr& chunked_array) { + return chunked_array->num_chunks(); } // [[arrow::export]] diff --git a/r/src/compression.cpp b/r/src/compression.cpp index 148c6e14002f5..bc893afd8d28b 100644 --- a/r/src/compression.cpp +++ b/r/src/compression.cpp @@ -22,7 +22,7 @@ // [[arrow::export]] std::shared_ptr util___Codec__Create(arrow::Compression::type codec, - R_xlen_t compression_level) { + int compression_level) { return ValueOrStop(arrow::util::Codec::Create(codec, compression_level)); } diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 87d1326ed3419..bd97e30005ca3 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -241,10 +241,10 @@ std::shared_ptr make_compute_options( interpolation); } if (!Rf_isNull(options["min_count"])) { - out->min_count = cpp11::as_cpp(options["min_count"]); + out->min_count = cpp11::as_cpp(options["min_count"]); } if (!Rf_isNull(options["skip_nulls"])) { - out->skip_nulls = cpp11::as_cpp(options["skip_nulls"]); + out->skip_nulls = cpp11::as_cpp(options["skip_nulls"]); } return out; } @@ -479,9 +479,9 @@ std::shared_ptr make_compute_options( func_name == "hash_stddev") { using Options = arrow::compute::VarianceOptions; auto out = std::make_shared(); - out->ddof = cpp11::as_cpp(options["ddof"]); + out->ddof = cpp11::as_cpp(options["ddof"]); if (!Rf_isNull(options["min_count"])) { - out->min_count = 
cpp11::as_cpp(options["min_count"]); + out->min_count = cpp11::as_cpp(options["min_count"]); } if (!Rf_isNull(options["skip_nulls"])) { out->skip_nulls = cpp11::as_cpp(options["skip_nulls"]); @@ -683,7 +683,7 @@ arrow::Status CallRScalarUDF(arrow::compute::KernelContext* context, } } - cpp11::sexp batch_length_sexp = cpp11::as_sexp(span.length); + cpp11::sexp batch_length_sexp = cpp11::as_sexp(static_cast(span.length)); std::shared_ptr output_type = result->type()->GetSharedPtr(); cpp11::sexp output_type_sexp = cpp11::to_r6(output_type); @@ -738,8 +738,7 @@ void RegisterScalarUDF(std::string name, cpp11::list func_sexp) { // Compute the Arity from the list of input kernels. We don't currently handle // variable numbers of arguments in a user-defined function. - int64_t n_args = - cpp11::as_cpp>(in_type_r[0])->num_fields(); + int n_args = cpp11::as_cpp>(in_type_r[0])->num_fields(); for (R_xlen_t i = 1; i < n_kernels; i++) { auto in_types = cpp11::as_cpp>(in_type_r[i]); if (in_types->num_fields() != n_args) { @@ -767,7 +766,7 @@ void RegisterScalarUDF(std::string name, cpp11::list func_sexp) { cpp11::sexp out_type_func = out_type_r[i]; std::vector compute_in_types(in_types->num_fields()); - for (int64_t j = 0; j < in_types->num_fields(); j++) { + for (int j = 0; j < in_types->num_fields(); j++) { compute_in_types[j] = arrow::compute::InputType(in_types->field(j)->type()); } diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp index 83c430fb634d3..e53fc03bdb413 100644 --- a/r/src/dataset.cpp +++ b/r/src/dataset.cpp @@ -343,8 +343,8 @@ std::shared_ptr dataset___JsonFragmentScanOptions__ std::shared_ptr dataset___ParquetFragmentScanOptions__Make(bool use_buffered_stream, int64_t buffer_size, bool pre_buffer, - int64_t thrift_string_size_limit, - int64_t thrift_container_size_limit) { + int32_t thrift_string_size_limit, + int32_t thrift_container_size_limit) { auto options = std::make_shared(); if (use_buffered_stream) { options->reader_properties->enable_buffered_stream(); diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp index f19ba92527157..2f2b89d658d91 100644 --- a/r/src/datatype.cpp +++ b/r/src/datatype.cpp @@ -201,7 +201,7 @@ std::shared_ptr DayTimeInterval__initialize() { } // [[arrow::export]] -std::shared_ptr FixedSizeBinary__initialize(R_xlen_t byte_width) { +std::shared_ptr FixedSizeBinary__initialize(int32_t byte_width) { if (byte_width == NA_INTEGER) { cpp11::stop("'byte_width' cannot be NA"); } diff --git a/r/src/io.cpp b/r/src/io.cpp index 321b1b17febc3..4d5ee31794ae8 100644 --- a/r/src/io.cpp +++ b/r/src/io.cpp @@ -253,11 +253,16 @@ class RConnectionFileInterface : public virtual arrow::io::FileInterface { return arrow::Status::IOError("R connection is closed"); } + if (nbytes > std::numeric_limits::max()) { + return arrow::Status::Invalid( + "Can't read more than INT_MAX bytes from an R connection"); + } + return SafeCallIntoR( [&] { cpp11::function read_bin = cpp11::package("base")["readBin"]; cpp11::writable::raws ptype((R_xlen_t)0); - cpp11::integers n = cpp11::as_sexp(nbytes); + cpp11::integers n = cpp11::as_sexp(static_cast(nbytes)); cpp11::sexp result = read_bin(connection_sexp_, ptype, n); @@ -512,8 +517,8 @@ struct ReencodeUTF8TransformFunctionWrapper { // UTF-16, and UTF-32. while (in_bytes_left > 0) { // Make enough place in the output to hopefully consume all of the input. 
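The io.cpp hunk above rejects reads larger than INT_MAX before calling base::readBin(), because the byte count is passed to R as an integer vector. Below is a standalone sketch of that guard-then-narrow pattern; the helper name checked_narrow_to_int is illustrative, and it throws an exception where the real binding returns arrow::Status::Invalid.

// Sketch of the guard added before readBin(): range-check the 64-bit
// request, then narrow explicitly rather than truncating silently.
#include <cstdint>
#include <limits>
#include <stdexcept>

// Hypothetical helper, not part of the Arrow R package.
int checked_narrow_to_int(int64_t nbytes) {
  if (nbytes > std::numeric_limits<int>::max()) {
    throw std::runtime_error(
        "Can't read more than INT_MAX bytes from an R connection");
  }
  return static_cast<int>(nbytes);
}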
- RETURN_NOT_OK( - builder.Reserve(std::max(in_bytes_left * kOversizeFactor, 4))); + RETURN_NOT_OK(builder.Reserve( + std::max(static_cast(in_bytes_left * kOversizeFactor), 4))); out_buf = builder.mutable_data() + builder.length(); out_bytes_left = builder.capacity() - builder.length(); diff --git a/r/src/message.cpp b/r/src/message.cpp index d9832ddc22a74..3f21873fea3b2 100644 --- a/r/src/message.cpp +++ b/r/src/message.cpp @@ -39,8 +39,8 @@ std::shared_ptr ipc___Message__body( } // [[arrow::export]] -r_vec_size ipc___Message__Verify(const std::unique_ptr& message) { - return r_vec_size(message->Verify()); +bool ipc___Message__Verify(const std::unique_ptr& message) { + return message->Verify(); } // [[arrow::export]] diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index d9bf848e24292..d2db11e14a787 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -335,7 +335,7 @@ struct RConvert { template static enable_if_integer> Convert(Type*, From from) { - return CIntFromRScalarImpl(from); + return CIntFromRScalarImpl(static_cast(from)); } // ---- convert R integer types to double @@ -461,7 +461,7 @@ class RPrimitiveConverter< if (std::is_same::value) { auto append_value = [this](r_value_type value) { - this->primitive_builder_->UnsafeAppend(value); + this->primitive_builder_->UnsafeAppend(static_cast(value)); return Status::OK(); }; return VisitVector(it, size, append_null, append_value); @@ -595,19 +595,21 @@ class RPrimitiveConverter::value>> return VisitVector(it, size, append_null, append_value); } - static int FromRDate(const Date32Type*, int from) { return from; } + static int FromRDate(const Date32Type*, double from) { return static_cast(from); } - static int64_t FromRDate(const Date64Type*, int from) { + static int64_t FromRDate(const Date64Type*, double from) { constexpr int64_t kMilliSecondsPerDay = 86400000; - return from * kMilliSecondsPerDay; + return static_cast(from * kMilliSecondsPerDay); } static int FromPosixct(const Date32Type*, double from) { constexpr int64_t kSecondsPerDay = 86400; - return from / kSecondsPerDay; + return static_cast(from / kSecondsPerDay); } - static int64_t FromPosixct(const Date64Type*, double from) { return from * 1000; } + static int64_t FromPosixct(const Date64Type*, double from) { + return static_cast(from * 1000); + } }; int64_t get_TimeUnit_multiplier(TimeUnit::type unit) { @@ -1081,7 +1083,7 @@ class RListConverter : public ListConverter { auto append_value = [this](SEXP value) { // TODO: if we decide that this can be run concurrently // we'll have to do vec_size() upfront - int n = arrow::r::vec_size(value); + R_xlen_t n = arrow::r::vec_size(value); RETURN_NOT_OK(this->list_builder_->ValidateOverflow(n)); RETURN_NOT_OK(this->list_builder_->Append()); diff --git a/r/src/recordbatch.cpp b/r/src/recordbatch.cpp index aca3a74fd81df..bf88e98ed1026 100644 --- a/r/src/recordbatch.cpp +++ b/r/src/recordbatch.cpp @@ -27,8 +27,8 @@ #include // [[arrow::export]] -r_vec_size RecordBatch__num_columns(const std::shared_ptr& x) { - return r_vec_size(x->num_columns()); +int RecordBatch__num_columns(const std::shared_ptr& x) { + return x->num_columns(); } // [[arrow::export]] @@ -80,7 +80,7 @@ cpp11::list RecordBatch__columns(const std::shared_ptr& batc // [[arrow::export]] std::shared_ptr RecordBatch__column( - const std::shared_ptr& batch, R_xlen_t i) { + const std::shared_ptr& batch, int i) { arrow::r::validate_index(i, batch->num_columns()); return batch->column(i); } @@ -106,7 +106,7 @@ bool RecordBatch__Equals(const std::shared_ptr& 
self, // [[arrow::export]] std::shared_ptr RecordBatch__AddColumn( - const std::shared_ptr& batch, R_xlen_t i, + const std::shared_ptr& batch, int i, const std::shared_ptr& field, const std::shared_ptr& column) { return ValueOrStop(batch->AddColumn(i, field, column)); @@ -114,7 +114,7 @@ std::shared_ptr RecordBatch__AddColumn( // [[arrow::export]] std::shared_ptr RecordBatch__SetColumn( - const std::shared_ptr& batch, R_xlen_t i, + const std::shared_ptr& batch, int i, const std::shared_ptr& field, const std::shared_ptr& column) { return ValueOrStop(batch->SetColumn(i, field, column)); @@ -122,14 +122,14 @@ std::shared_ptr RecordBatch__SetColumn( // [[arrow::export]] std::shared_ptr RecordBatch__RemoveColumn( - const std::shared_ptr& batch, R_xlen_t i) { + const std::shared_ptr& batch, int i) { arrow::r::validate_index(i, batch->num_columns()); return ValueOrStop(batch->RemoveColumn(i)); } // [[arrow::export]] std::string RecordBatch__column_name(const std::shared_ptr& batch, - R_xlen_t i) { + int i) { arrow::r::validate_index(i, batch->num_columns()); return batch->column_name(i); } diff --git a/r/src/schema.cpp b/r/src/schema.cpp index cf959707305a7..41d3d38d2eda3 100644 --- a/r/src/schema.cpp +++ b/r/src/schema.cpp @@ -29,14 +29,14 @@ std::shared_ptr Schema__from_fields( // [[arrow::export]] std::shared_ptr Schema__from_list(cpp11::list field_list) { - int n = field_list.size(); + R_xlen_t n = field_list.size(); bool nullable = true; cpp11::strings names(field_list.attr(R_NamesSymbol)); std::vector> fields(n); - for (int i = 0; i < n; i++) { + for (R_xlen_t i = 0; i < n; i++) { fields[i] = arrow::field( names[i], cpp11::as_cpp>(field_list[i]), nullable); diff --git a/r/src/table.cpp b/r/src/table.cpp index 04537000f5d48..04a8c7caf24fd 100644 --- a/r/src/table.cpp +++ b/r/src/table.cpp @@ -23,8 +23,8 @@ #include // [[arrow::export]] -r_vec_size Table__num_columns(const std::shared_ptr& x) { - return r_vec_size(x->num_columns()); +int Table__num_columns(const std::shared_ptr& x) { + return x->num_columns(); } // [[arrow::export]] @@ -49,14 +49,14 @@ std::shared_ptr Table__ReplaceSchemaMetadata( // [[arrow::export]] std::shared_ptr Table__column( - const std::shared_ptr& table, R_xlen_t i) { + const std::shared_ptr& table, int i) { arrow::r::validate_index(i, table->num_columns()); return table->column(i); } // [[arrow::export]] std::shared_ptr Table__field(const std::shared_ptr& table, - R_xlen_t i) { + int i) { arrow::r::validate_index(i, table->num_columns()); return table->field(i); } @@ -123,13 +123,13 @@ std::shared_ptr Table__GetColumnByName( // [[arrow::export]] std::shared_ptr Table__RemoveColumn( - const std::shared_ptr& table, R_xlen_t i) { + const std::shared_ptr& table, int i) { return ValueOrStop(table->RemoveColumn(i)); } // [[arrow::export]] std::shared_ptr Table__AddColumn( - const std::shared_ptr& table, R_xlen_t i, + const std::shared_ptr& table, int i, const std::shared_ptr& field, const std::shared_ptr& column) { return ValueOrStop(table->AddColumn(i, field, column)); @@ -137,7 +137,7 @@ std::shared_ptr Table__AddColumn( // [[arrow::export]] std::shared_ptr Table__SetColumn( - const std::shared_ptr& table, R_xlen_t i, + const std::shared_ptr& table, int i, const std::shared_ptr& field, const std::shared_ptr& column) { return ValueOrStop(table->SetColumn(i, field, column)); @@ -241,7 +241,7 @@ arrow::Status AddMetadataFromDots(SEXP lst, int num_fields, // Remove metadata for ExtensionType columns, because these have their own mechanism for // preserving R type 
information
-  for (R_xlen_t i = 0; i < schema->num_fields(); i++) {
+  for (int i = 0; i < schema->num_fields(); i++) {
     if (schema->field(i)->type()->id() == Type::EXTENSION) {
       metadata_columns[i] = R_NilValue;
     }
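Taken together, these hunks follow one convention: declare loop counters and indices with the same width the Arrow C++ API uses (schema->num_fields() returns int, for example), and narrow 64-bit values only through an explicit static_cast at a point where the range is known to be safe. A compact illustration with made-up names; count_extension_fields is not from the patch.

// Illustrative only: the loop counter matches the `int` width of its bound,
// and narrowing from a 64-bit size happens through one explicit cast.
#include <cstdint>
#include <vector>

// Hypothetical stand-in for iterating schema fields by integer index.
int count_extension_fields(const std::vector<int>& field_type_ids,
                           int extension_type_id) {
  const int num_fields = static_cast<int>(field_type_ids.size());
  int n = 0;
  for (int i = 0; i < num_fields; i++) {
    if (field_type_ids[i] == extension_type_id) {
      ++n;
    }
  }
  return n;
}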