From 596259ee47b5c675b71432743d9bfd196efe08e3 Mon Sep 17 00:00:00 2001 From: Vibhatha Lakmal Abeykoon Date: Thu, 21 Dec 2023 19:02:38 +0530 Subject: [PATCH 01/31] GH-38725: [Java] decompression in Lz4CompressionCodec.java does not set writer index (#38840) ### Rationale for this change The `doDecompress` function in `Lz4CompressionCodec` misses writing the index when it is compared with the functionality in `ZstdCompressionCodec`. This PR fixes that issue. ### What changes are included in this PR? Writes the index for the decompressed ArrowBuf. ### Are these changes tested? No ### Are there any user-facing changes? No * Closes: #38725 Lead-authored-by: Vibhatha Lakmal Abeykoon Co-authored-by: vibhatha Signed-off-by: David Li --- .../org/apache/arrow/compression/Lz4CompressionCodec.java | 1 + .../apache/arrow/compression/TestCompressionCodec.java | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/java/compression/src/main/java/org/apache/arrow/compression/Lz4CompressionCodec.java b/java/compression/src/main/java/org/apache/arrow/compression/Lz4CompressionCodec.java index daa35b7e15be6..e8b780638e2c1 100644 --- a/java/compression/src/main/java/org/apache/arrow/compression/Lz4CompressionCodec.java +++ b/java/compression/src/main/java/org/apache/arrow/compression/Lz4CompressionCodec.java @@ -79,6 +79,7 @@ protected ArrowBuf doDecompress(BufferAllocator allocator, ArrowBuf compressedBu byte[] outBytes = out.toByteArray(); ArrowBuf decompressedBuffer = allocator.buffer(outBytes.length); decompressedBuffer.setBytes(/*index=*/0, outBytes); + decompressedBuffer.writerIndex(decompressedLength); return decompressedBuffer; } diff --git a/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodec.java b/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodec.java index 403130edba52e..01156fa2b0e0b 100644 --- a/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodec.java +++ b/java/compression/src/test/java/org/apache/arrow/compression/TestCompressionCodec.java @@ -117,6 +117,12 @@ private List deCompressBuffers(CompressionCodec codec, List return outputBuffers; } + private void assertWriterIndex(List decompressedBuffers) { + for (ArrowBuf decompressedBuf : decompressedBuffers) { + assertTrue(decompressedBuf.writerIndex() > 0); + } + } + @ParameterizedTest @MethodSource("codecs") void testCompressFixedWidthBuffers(int vectorLength, CompressionCodec codec) throws Exception { @@ -139,6 +145,7 @@ void testCompressFixedWidthBuffers(int vectorLength, CompressionCodec codec) thr List decompressedBuffers = deCompressBuffers(codec, compressedBuffers); assertEquals(2, decompressedBuffers.size()); + assertWriterIndex(decompressedBuffers); // orchestrate new vector IntVector newVec = new IntVector("new vec", allocator); @@ -180,6 +187,7 @@ void testCompressVariableWidthBuffers(int vectorLength, CompressionCodec codec) List decompressedBuffers = deCompressBuffers(codec, compressedBuffers); assertEquals(3, decompressedBuffers.size()); + assertWriterIndex(decompressedBuffers); // orchestrate new vector VarCharVector newVec = new VarCharVector("new vec", allocator); From 2f9f892a0075d990a1b42dc97a97d490b6b08345 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 21 Dec 2023 15:53:41 +0100 Subject: [PATCH 02/31] GH-39196: [Python][Docs] Document the Arrow PyCapsule protocol in the 'extending pyarrow' section of the Python docs (#39199) ### Rationale for this change While the Arrow PyCapsule protocol itself is defined in the 
specification part of the docs, this PR adds a section about it in the Python user guide as well (referring to the specification for most details), where users might typically look for Python specific docs. * Closes: #39196 Lead-authored-by: Joris Van den Bossche Co-authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- .../CDataInterface/PyCapsuleInterface.rst | 2 ++ docs/source/python/extending_types.rst | 32 +++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/docs/source/format/CDataInterface/PyCapsuleInterface.rst b/docs/source/format/CDataInterface/PyCapsuleInterface.rst index 0c1a01d7c6778..03095aa2e9356 100644 --- a/docs/source/format/CDataInterface/PyCapsuleInterface.rst +++ b/docs/source/format/CDataInterface/PyCapsuleInterface.rst @@ -16,6 +16,8 @@ .. under the License. +.. _arrow-pycapsule-interface: + ============================= The Arrow PyCapsule Interface ============================= diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index ee92cebcb549c..b7261005e66ee 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -21,6 +21,38 @@ Extending pyarrow ================= +Controlling conversion to (Py)Arrow with the PyCapsule Interface +---------------------------------------------------------------- + +The :ref:`Arrow C data interface ` allows moving Arrow data between +different implementations of Arrow. This is a generic, cross-language interface not +specific to Python, but for Python libraries this interface is extended with a Python +specific layer: :ref:`arrow-pycapsule-interface`. + +This Python interface ensures that different libraries that support the C Data interface +can export Arrow data structures in a standard way and recognize each other's objects. + +If you have a Python library providing data structures that hold Arrow-compatible data +under the hood, you can implement the following methods on those objects: + +- ``__arrow_c_schema__`` for schema or type-like objects. +- ``__arrow_c_array__`` for arrays and record batches (contiguous tables). +- ``__arrow_c_stream__`` for chunked tables or streams of data. + +Those methods return `PyCapsule `__ +objects, and more details on the exact semantics can be found in the +:ref:`specification `. + +When your data structures have those methods defined, the PyArrow constructors +(such as :func:`pyarrow.array` or :func:`pyarrow.table`) will recognize those objects as +supporting this protocol, and convert them to PyArrow data structures zero-copy. And the +same can be true for any other library supporting this protocol on ingesting data. + +Similarly, if your library has functions that accept user-provided data, you can add +support for this protocol by checking for the presence of those methods, and +therefore accept any Arrow data (instead of harcoding support for a specific +Arrow producer such as PyArrow). + .. _arrow_array_protocol: Controlling conversion to pyarrow.Array with the ``__arrow_array__`` protocol From 535b925bf073fb1af4e6e23ab54027f30dc8751f Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Fri, 22 Dec 2023 01:34:06 +0800 Subject: [PATCH 03/31] GH-39232: [C++] Support binary to fixed_size_binary cast (#39236) ### Rationale for this change Add binary to fixed_size_binary cast. ### What changes are included in this PR? Add binary to fixed_size_binary cast. ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * Closes: #39232 Authored-by: Jin Shang Signed-off-by: Antoine Pitrou --- .../compute/kernels/scalar_cast_string.cc | 61 ++++++++++++++++--- .../arrow/compute/kernels/scalar_cast_test.cc | 16 +++++ 2 files changed, 69 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index ebeb597207a81..a6576e4e4c26f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -20,11 +20,14 @@ #include "arrow/array/array_base.h" #include "arrow/array/builder_binary.h" +#include "arrow/compute/kernels/base_arithmetic_internal.h" #include "arrow/compute/kernels/codegen_internal.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/kernels/scalar_cast_internal.h" #include "arrow/compute/kernels/temporal_internal.h" #include "arrow/result.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" #include "arrow/util/formatting.h" #include "arrow/util/int_util.h" #include "arrow/util/utf8_internal.h" @@ -284,9 +287,8 @@ Status CastBinaryToBinaryOffsets(KernelContext* ctx, } template -enable_if_base_binary BinaryToBinaryCastExec(KernelContext* ctx, - const ExecSpan& batch, - ExecResult* out) { +enable_if_t::value && !is_fixed_size_binary_type::value, Status> +BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const CastOptions& options = checked_cast(*ctx->state()).options; const ArraySpan& input = batch[0].array; @@ -387,6 +389,33 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou return ZeroCopyCastExec(ctx, batch, out); } +template +enable_if_t::value && std::is_same::value, + Status> +BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + const CastOptions& options = checked_cast(*ctx->state()).options; + FixedSizeBinaryBuilder builder(options.to_type.GetSharedPtr(), ctx->memory_pool()); + const ArraySpan& input = batch[0].array; + RETURN_NOT_OK(builder.Reserve(input.length)); + + RETURN_NOT_OK(VisitArraySpanInline( + input, + [&](std::string_view v) { + if (v.size() != static_cast(builder.byte_width())) { + return Status::Invalid("Failed casting from ", input.type->ToString(), " to ", + options.to_type.ToString(), ": widths must match"); + } + builder.UnsafeAppend(v); + return Status::OK(); + }, + [&] { + builder.UnsafeAppendNull(); + return Status::OK(); + })); + + return builder.FinishInternal(&std::get>(out->value)); +} + #if defined(_MSC_VER) #pragma warning(pop) #endif @@ -452,6 +481,26 @@ void AddBinaryToBinaryCast(CastFunction* func) { AddBinaryToBinaryCast(func); } +template +void AddBinaryToFixedSizeBinaryCast(CastFunction* func) { + auto resolver_fsb = [](KernelContext* ctx, const std::vector&) { + const CastOptions& options = checked_cast(*ctx->state()).options; + return options.to_type; + }; + + DCHECK_OK(func->AddKernel(InType::type_id, {InputType(InType::type_id)}, resolver_fsb, + BinaryToBinaryCastExec, + NullHandling::COMPUTED_NO_PREALLOCATE)); +} + +void AddBinaryToFixedSizeBinaryCast(CastFunction* func) { + AddBinaryToFixedSizeBinaryCast(func); + AddBinaryToFixedSizeBinaryCast(func); + AddBinaryToFixedSizeBinaryCast(func); + AddBinaryToFixedSizeBinaryCast(func); + AddBinaryToFixedSizeBinaryCast(func); +} + } // namespace std::vector> GetBinaryLikeCasts() { @@ -483,11 +532,7 @@ std::vector> GetBinaryLikeCasts() { std::make_shared("cast_fixed_size_binary", Type::FIXED_SIZE_BINARY); 
AddCommonCasts(Type::FIXED_SIZE_BINARY, OutputType(ResolveOutputFromOptions), cast_fsb.get()); - DCHECK_OK(cast_fsb->AddKernel( - Type::FIXED_SIZE_BINARY, {InputType(Type::FIXED_SIZE_BINARY)}, - OutputType(FirstType), - BinaryToBinaryCastExec, - NullHandling::COMPUTED_NO_PREALLOCATE)); + AddBinaryToFixedSizeBinaryCast(cast_fsb.get()); return {cast_binary, cast_large_binary, cast_string, cast_large_string, cast_fsb}; } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index c84125bbdd19e..b429c8175b020 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2171,6 +2171,22 @@ TEST(Cast, StringToString) { } } +TEST(Cast, BinaryOrStringToFixedSizeBinary) { + for (auto in_type : {utf8(), large_utf8(), binary(), large_binary()}) { + auto valid_input = ArrayFromJSON(in_type, R"(["foo", null, "bar", "baz", "quu"])"); + auto invalid_input = ArrayFromJSON(in_type, R"(["foo", null, "bar", "baz", "quux"])"); + + CheckCast(valid_input, ArrayFromJSON(fixed_size_binary(3), R"(["foo", null, "bar", + "baz", "quu"])")); + CheckCastFails(invalid_input, CastOptions::Safe(fixed_size_binary(3))); + CheckCastFails(valid_input, CastOptions::Safe(fixed_size_binary(5))); + + auto empty_input = ArrayFromJSON(in_type, "[]"); + CheckCast(empty_input, ArrayFromJSON(fixed_size_binary(3), "[]")); + CheckCast(empty_input, ArrayFromJSON(fixed_size_binary(5), "[]")); + } +} + TEST(Cast, IntToString) { for (auto string_type : {utf8(), large_utf8()}) { CheckCast(ArrayFromJSON(int8(), "[0, 1, 127, -128, null]"), From e5145bff901778360f6faba3be27efa3d9522976 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 21 Dec 2023 15:00:22 -0300 Subject: [PATCH 04/31] GH-39339: [C++] Add ForceCachedHierarchicalNamespaceSupport to help with testing (#39340) ### Rationale for this change This ensures all the branches in the `AzureFileSystem` code operations are tested. For instance, many operations executed on a missing container, wouldn't get a `HNSSupport::kContainerNotFound` error if the cached `HNSSupport` was already known due to a previous operation that cached the `HNSSupport` value. ### What changes are included in this PR? Introduction of the helper that overrides `cached_hns_support_` and enumeration of the scenarios. ### Are these changes tested? Yes. This is a test improvement PR. * Closes: #39339 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/filesystem/azurefs.cc | 36 +- cpp/src/arrow/filesystem/azurefs.h | 5 + cpp/src/arrow/filesystem/azurefs_test.cc | 453 +++++++++++++---------- 3 files changed, 291 insertions(+), 203 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index d72ead92ed111..27bdb5092a3ea 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -941,14 +941,38 @@ class AzureFileSystem::Impl { break; } ARROW_ASSIGN_OR_RAISE( - cached_hns_support_, + auto hns_support, internal::CheckIfHierarchicalNamespaceIsEnabled(adlfs_client, options_)); - DCHECK_NE(cached_hns_support_, HNSSupport::kUnknown); - // Caller should handle kContainerNotFound case appropriately. 
- return cached_hns_support_; + DCHECK_NE(hns_support, HNSSupport::kUnknown); + if (hns_support == HNSSupport::kContainerNotFound) { + // Caller should handle kContainerNotFound case appropriately as it knows the + // container this refers to, but the cached value in that case should remain + // kUnknown before we get a CheckIfHierarchicalNamespaceIsEnabled result that + // is not kContainerNotFound. + cached_hns_support_ = HNSSupport::kUnknown; + } else { + cached_hns_support_ = hns_support; + } + return hns_support; } public: + /// This is used from unit tests to ensure we perform operations on all the + /// possible states of cached_hns_support_. + void ForceCachedHierarchicalNamespaceSupport(int support) { + auto hns_support = static_cast(support); + switch (hns_support) { + case HNSSupport::kUnknown: + case HNSSupport::kContainerNotFound: + case HNSSupport::kDisabled: + case HNSSupport::kEnabled: + cached_hns_support_ = hns_support; + return; + } + // This is reachable if an invalid int is cast to enum class HNSSupport. + DCHECK(false) << "Invalid enum HierarchicalNamespaceSupport value."; + } + Result GetFileInfo(const AzureLocation& location) { if (location.container.empty()) { DCHECK(location.path.empty()); @@ -1560,6 +1584,10 @@ AzureFileSystem::AzureFileSystem(std::unique_ptr&& impl) default_async_is_sync_ = false; } +void AzureFileSystem::ForceCachedHierarchicalNamespaceSupport(int hns_support) { + impl_->ForceCachedHierarchicalNamespaceSupport(hns_support); +} + Result> AzureFileSystem::Make( const AzureOptions& options, const io::IOContext& io_context) { ARROW_ASSIGN_OR_RAISE(auto impl, AzureFileSystem::Impl::Make(options, io_context)); diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index be3ca5ba238ae..69f6295237043 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -44,6 +44,8 @@ class DataLakeServiceClient; namespace arrow::fs { +class TestAzureFileSystem; + /// Options for the AzureFileSystem implementation. struct ARROW_EXPORT AzureOptions { /// \brief hostname[:port] of the Azure Blob Storage Service. 
@@ -156,6 +158,9 @@ class ARROW_EXPORT AzureFileSystem : public FileSystem { explicit AzureFileSystem(std::unique_ptr&& impl); + friend class TestAzureFileSystem; + void ForceCachedHierarchicalNamespaceSupport(int hns_support); + public: ~AzureFileSystem() override = default; diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index ecf7522b98eef..3266c1bfda2dc 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -62,7 +62,6 @@ namespace arrow { using internal::TemporaryDir; namespace fs { using internal::ConcatAbstractPath; -namespace { namespace bp = boost::process; using ::testing::IsEmpty; @@ -354,7 +353,7 @@ class TestAzureFileSystem : public ::testing::Test { bool set_up_succeeded_ = false; AzureOptions options_; - std::shared_ptr fs_; + std::shared_ptr fs_dont_use_directly_; // use fs() std::unique_ptr blob_service_client_; std::unique_ptr datalake_service_client_; @@ -362,6 +361,18 @@ class TestAzureFileSystem : public ::testing::Test { TestAzureFileSystem() : rng_(std::random_device()()) {} virtual Result GetAzureEnv() const = 0; + virtual HNSSupport CachedHNSSupport(const BaseAzureEnv& env) const = 0; + + FileSystem* fs(HNSSupport cached_hns_support) const { + auto* fs_ptr = fs_dont_use_directly_.get(); + fs_ptr->ForceCachedHierarchicalNamespaceSupport(static_cast(cached_hns_support)); + return fs_ptr; + } + + FileSystem* fs() const { + EXPECT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + return fs(CachedHNSSupport(*env)); + } static Result MakeOptions(BaseAzureEnv* env) { AzureOptions options; @@ -395,7 +406,7 @@ class TestAzureFileSystem : public ::testing::Test { EXPECT_OK_AND_ASSIGN(options_, options_res); } - ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); + ASSERT_OK_AND_ASSIGN(fs_dont_use_directly_, AzureFileSystem::Make(options_)); EXPECT_OK_AND_ASSIGN(blob_service_client_, options_.MakeBlobServiceClient()); EXPECT_OK_AND_ASSIGN(datalake_service_client_, options_.MakeDataLakeServiceClient()); set_up_succeeded_ = true; @@ -435,7 +446,7 @@ class TestAzureFileSystem : public ::testing::Test { void UploadLines(const std::vector& lines, const std::string& path, int total_size) { - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); const auto all_lines = std::accumulate(lines.begin(), lines.end(), std::string("")); ASSERT_OK(output->Write(all_lines)); ASSERT_OK(output->Close()); @@ -461,19 +472,19 @@ class TestAzureFileSystem : public ::testing::Test { const auto sub_directory_path = ConcatAbstractPath(directory_path, "new-sub"); const auto sub_blob_path = ConcatAbstractPath(sub_directory_path, "sub.txt"); const auto top_blob_path = ConcatAbstractPath(directory_path, "top.txt"); - ASSERT_OK(fs_->CreateDir(sub_directory_path, true)); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(sub_blob_path)); + ASSERT_OK(fs()->CreateDir(sub_directory_path, true)); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(sub_blob_path)); ASSERT_OK(output->Write(std::string_view("sub"))); ASSERT_OK(output->Close()); - ASSERT_OK_AND_ASSIGN(output, fs_->OpenOutputStream(top_blob_path)); + ASSERT_OK_AND_ASSIGN(output, fs()->OpenOutputStream(top_blob_path)); ASSERT_OK(output->Write(std::string_view("top"))); ASSERT_OK(output->Close()); - AssertFileInfo(fs_.get(), data.container_name, FileType::Directory); - AssertFileInfo(fs_.get(), directory_path, FileType::Directory); - 
AssertFileInfo(fs_.get(), sub_directory_path, FileType::Directory); - AssertFileInfo(fs_.get(), sub_blob_path, FileType::File); - AssertFileInfo(fs_.get(), top_blob_path, FileType::File); + AssertFileInfo(fs(), data.container_name, FileType::Directory); + AssertFileInfo(fs(), directory_path, FileType::Directory); + AssertFileInfo(fs(), sub_directory_path, FileType::Directory); + AssertFileInfo(fs(), sub_blob_path, FileType::File); + AssertFileInfo(fs(), top_blob_path, FileType::File); paths->container = data.container_name; paths->directory = directory_path; @@ -538,52 +549,52 @@ class TestAzureFileSystem : public ::testing::Test { const auto directory_path = data.RandomDirectoryPath(rng_); if (WithHierarchicalNamespace()) { - ASSERT_OK(fs_->CreateDir(directory_path, true)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::Directory); - ASSERT_OK(fs_->DeleteDir(directory_path)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(directory_path, true)); + AssertFileInfo(fs(), directory_path, FileType::Directory); + ASSERT_OK(fs()->DeleteDir(directory_path)); + AssertFileInfo(fs(), directory_path, FileType::NotFound); } else { // There is only virtual directory without hierarchical namespace // support. So the CreateDir() and DeleteDir() do nothing. - ASSERT_OK(fs_->CreateDir(directory_path)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); - ASSERT_OK(fs_->DeleteDir(directory_path)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(directory_path)); + AssertFileInfo(fs(), directory_path, FileType::NotFound); + ASSERT_OK(fs()->DeleteDir(directory_path)); + AssertFileInfo(fs(), directory_path, FileType::NotFound); } } void TestCreateDirSuccessContainerAndDirectory() { auto data = SetUpPreexistingData(); const auto path = data.RandomDirectoryPath(rng_); - ASSERT_OK(fs_->CreateDir(path, false)); + ASSERT_OK(fs()->CreateDir(path, false)); if (WithHierarchicalNamespace()) { - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory); + AssertFileInfo(fs(), path, FileType::Directory); } else { // There is only virtual directory without hierarchical namespace // support. So the CreateDir() does nothing. - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound); + AssertFileInfo(fs(), path, FileType::NotFound); } } void TestCreateDirRecursiveSuccessContainerOnly() { auto container_name = PreexistingData::RandomContainerName(rng_); - ASSERT_OK(fs_->CreateDir(container_name, true)); - arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory); + ASSERT_OK(fs()->CreateDir(container_name, true)); + AssertFileInfo(fs(), container_name, FileType::Directory); } void TestCreateDirRecursiveSuccessDirectoryOnly() { auto data = SetUpPreexistingData(); const auto parent = data.RandomDirectoryPath(rng_); const auto path = ConcatAbstractPath(parent, "new-sub"); - ASSERT_OK(fs_->CreateDir(path, true)); + ASSERT_OK(fs()->CreateDir(path, true)); if (WithHierarchicalNamespace()) { - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory); + AssertFileInfo(fs(), path, FileType::Directory); + AssertFileInfo(fs(), parent, FileType::Directory); } else { // There is only virtual directory without hierarchical namespace // support. So the CreateDir() does nothing. 
- arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound); + AssertFileInfo(fs(), path, FileType::NotFound); + AssertFileInfo(fs(), parent, FileType::NotFound); } } @@ -591,31 +602,31 @@ class TestAzureFileSystem : public ::testing::Test { auto data = SetUpPreexistingData(); const auto parent = data.RandomDirectoryPath(rng_); const auto path = ConcatAbstractPath(parent, "new-sub"); - ASSERT_OK(fs_->CreateDir(path, true)); + ASSERT_OK(fs()->CreateDir(path, true)); if (WithHierarchicalNamespace()) { - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory); - arrow::fs::AssertFileInfo(fs_.get(), data.container_name, FileType::Directory); + AssertFileInfo(fs(), path, FileType::Directory); + AssertFileInfo(fs(), parent, FileType::Directory); + AssertFileInfo(fs(), data.container_name, FileType::Directory); } else { // There is only virtual directory without hierarchical namespace // support. So the CreateDir() does nothing. - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound); - arrow::fs::AssertFileInfo(fs_.get(), data.container_name, FileType::Directory); + AssertFileInfo(fs(), path, FileType::NotFound); + AssertFileInfo(fs(), parent, FileType::NotFound); + AssertFileInfo(fs(), data.container_name, FileType::Directory); } } void TestDeleteDirContentsSuccessNonexistent() { auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); - ASSERT_OK(fs_->DeleteDirContents(directory_path, true)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); + ASSERT_OK(fs()->DeleteDirContents(directory_path, true)); + AssertFileInfo(fs(), directory_path, FileType::NotFound); } void TestDeleteDirContentsFailureNonexistent() { auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); - ASSERT_RAISES(IOError, fs_->DeleteDirContents(directory_path, false)); + ASSERT_RAISES(IOError, fs()->DeleteDirContents(directory_path, false)); } }; @@ -672,12 +683,12 @@ void TestAzureFileSystem::TestGetFileInfoObject() { .GetProperties() .Value; - AssertFileInfo(fs_.get(), data.ObjectPath(), FileType::File, + AssertFileInfo(fs(), data.ObjectPath(), FileType::File, std::chrono::system_clock::time_point{object_properties.LastModified}, static_cast(object_properties.BlobSize)); // URI - ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + std::string{data.kObjectName})); + ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://" + std::string{data.kObjectName})); } void TestAzureFileSystem::TestGetFileInfoObjectWithNestedStructure() { @@ -685,37 +696,37 @@ void TestAzureFileSystem::TestGetFileInfoObjectWithNestedStructure() { // Adds detailed tests to handle cases of different edge cases // with directory naming conventions (e.g. with and without slashes). const std::string kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(data.ContainerPath(kObjectName), - /*metadata=*/{})); + ASSERT_OK_AND_ASSIGN(auto output, + fs()->OpenOutputStream(data.ContainerPath(kObjectName), + /*metadata=*/{})); const std::string_view lorem_ipsum(PreexistingData::kLoremIpsum); ASSERT_OK(output->Write(lorem_ipsum)); ASSERT_OK(output->Close()); // 0 is immediately after "/" lexicographically, ensure that this doesn't // cause unexpected issues. 
- ASSERT_OK_AND_ASSIGN( - output, fs_->OpenOutputStream(data.ContainerPath("test-object-dir/some_other_dir0"), - /*metadata=*/{})); + ASSERT_OK_AND_ASSIGN(output, fs()->OpenOutputStream( + data.ContainerPath("test-object-dir/some_other_dir0"), + /*metadata=*/{})); ASSERT_OK(output->Write(lorem_ipsum)); ASSERT_OK(output->Close()); ASSERT_OK_AND_ASSIGN(output, - fs_->OpenOutputStream(data.ContainerPath(kObjectName + "0"), - /*metadata=*/{})); + fs()->OpenOutputStream(data.ContainerPath(kObjectName + "0"), + /*metadata=*/{})); ASSERT_OK(output->Write(lorem_ipsum)); ASSERT_OK(output->Close()); - AssertFileInfo(fs_.get(), data.ContainerPath(kObjectName), FileType::File); - AssertFileInfo(fs_.get(), data.ContainerPath(kObjectName) + "/", FileType::NotFound); - AssertFileInfo(fs_.get(), data.ContainerPath("test-object-dir"), FileType::Directory); - AssertFileInfo(fs_.get(), data.ContainerPath("test-object-dir") + "/", - FileType::Directory); - AssertFileInfo(fs_.get(), data.ContainerPath("test-object-dir/some_other_dir"), + AssertFileInfo(fs(), data.ContainerPath(kObjectName), FileType::File); + AssertFileInfo(fs(), data.ContainerPath(kObjectName) + "/", FileType::NotFound); + AssertFileInfo(fs(), data.ContainerPath("test-object-dir"), FileType::Directory); + AssertFileInfo(fs(), data.ContainerPath("test-object-dir") + "/", FileType::Directory); + AssertFileInfo(fs(), data.ContainerPath("test-object-dir/some_other_dir"), FileType::Directory); - AssertFileInfo(fs_.get(), data.ContainerPath("test-object-dir/some_other_dir") + "/", + AssertFileInfo(fs(), data.ContainerPath("test-object-dir/some_other_dir") + "/", FileType::Directory); - AssertFileInfo(fs_.get(), data.ContainerPath("test-object-di"), FileType::NotFound); - AssertFileInfo(fs_.get(), data.ContainerPath("test-object-dir/some_other_di"), + AssertFileInfo(fs(), data.ContainerPath("test-object-di"), FileType::NotFound); + AssertFileInfo(fs(), data.ContainerPath("test-object-dir/some_other_di"), FileType::NotFound); if (WithHierarchicalNamespace()) { @@ -723,17 +734,45 @@ void TestAzureFileSystem::TestGetFileInfoObjectWithNestedStructure() { .GetDirectoryClient("test-empty-object-dir") .Create(); - AssertFileInfo(fs_.get(), data.ContainerPath("test-empty-object-dir"), + AssertFileInfo(fs(), data.ContainerPath("test-empty-object-dir"), FileType::Directory); } } -template +template +struct TestingScenario { + using AzureEnvClass = AzureEnv; + static constexpr bool kHNSSupportShouldBeKnown = HNSSupportShouldBeKnown; +}; + +template class AzureFileSystemTestImpl : public TestAzureFileSystem { public: + using AzureEnvClass = typename TestingScenario::AzureEnvClass; + using TestAzureFileSystem::TestAzureFileSystem; Result GetAzureEnv() const final { return AzureEnvClass::GetInstance(); } + + /// \brief HNSSupport value that should be assumed as the cached + /// HNSSupport on every fs()->Operation(...) call in tests. + /// + /// If TestingScenario::kHNSSupportShouldBeKnown is true, this value + /// will be HNSSupport::kEnabled or HNSSupport::kDisabled, depending + /// on the environment. Otherwise, this value will be HNSSupport::kUnknown. + /// + /// This ensures all the branches in the AzureFileSystem code operations are tested. + /// For instance, many operations executed on a missing container, wouldn't + /// get a HNSSupport::kContainerNotFound error if the cached HNSSupport was + /// already known due to a previous operation that cached the HNSSupport value. 
+ HNSSupport CachedHNSSupport(const BaseAzureEnv& env) const final { + if constexpr (TestingScenario::kHNSSupportShouldBeKnown) { + return env.WithHierarchicalNamespace() ? HNSSupport::kEnabled + : HNSSupport::kDisabled; + } else { + return HNSSupport::kUnknown; + } + } }; // How to enable the non-Azurite tests: @@ -762,54 +801,71 @@ class AzureFileSystemTestImpl : public TestAzureFileSystem { // [1]: https://azure.microsoft.com/en-gb/free/ // [2]: // https://learn.microsoft.com/en-us/azure/storage/blobs/create-data-lake-storage-account -using TestAzureFlatNSFileSystem = AzureFileSystemTestImpl; -using TestAzureHierarchicalNSFileSystem = AzureFileSystemTestImpl; -using TestAzuriteFileSystem = AzureFileSystemTestImpl; +using TestAzureFlatNSFileSystem = + AzureFileSystemTestImpl>; +using TestAzureHierarchicalNSFileSystem = + AzureFileSystemTestImpl>; +using TestAzuriteFileSystem = AzureFileSystemTestImpl>; -// Tests using all the 3 environments (Azurite, Azure w/o HNS (flat), Azure w/ HNS) - -template -using AzureFileSystemTestOnAllEnvs = AzureFileSystemTestImpl; +// Tests using all the 3 environments (Azurite, Azure w/o HNS (flat), Azure w/ HNS). +template +using TestAzureFileSystemOnAllEnvs = AzureFileSystemTestImpl; using AllEnvironments = - ::testing::Types; + ::testing::Types, TestingScenario, + TestingScenario>; -TYPED_TEST_SUITE(AzureFileSystemTestOnAllEnvs, AllEnvironments); +TYPED_TEST_SUITE(TestAzureFileSystemOnAllEnvs, AllEnvironments); -TYPED_TEST(AzureFileSystemTestOnAllEnvs, DetectHierarchicalNamespace) { +TYPED_TEST(TestAzureFileSystemOnAllEnvs, DetectHierarchicalNamespace) { this->TestDetectHierarchicalNamespace(true); this->TestDetectHierarchicalNamespace(false); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, DetectHierarchicalNamespaceOnMissingContainer) { +TYPED_TEST(TestAzureFileSystemOnAllEnvs, DetectHierarchicalNamespaceOnMissingContainer) { this->TestDetectHierarchicalNamespaceOnMissingContainer(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, GetFileInfoObject) { +// Tests using all the 3 environments (Azurite, Azure w/o HNS (flat), Azure w/ HNS) +// combined with the two scenarios for AzureFileSystem::cached_hns_support_ -- unknown and +// known according to the environment. 
+template +using TestAzureFileSystemOnAllScenarios = AzureFileSystemTestImpl; + +using AllScenarios = ::testing::Types< + TestingScenario, TestingScenario, + TestingScenario, TestingScenario, + TestingScenario, + TestingScenario>; + +TYPED_TEST_SUITE(TestAzureFileSystemOnAllScenarios, AllScenarios); + +TYPED_TEST(TestAzureFileSystemOnAllScenarios, GetFileInfoObject) { this->TestGetFileInfoObject(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, DeleteDirSuccessEmpty) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirSuccessEmpty) { this->TestDeleteDirSuccessEmpty(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, GetFileInfoObjectWithNestedStructure) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, GetFileInfoObjectWithNestedStructure) { this->TestGetFileInfoObjectWithNestedStructure(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, CreateDirSuccessContainerAndDirectory) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirSuccessContainerAndDirectory) { this->TestCreateDirSuccessContainerAndDirectory(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, CreateDirRecursiveSuccessContainerOnly) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirRecursiveSuccessContainerOnly) { this->TestCreateDirRecursiveSuccessContainerOnly(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, CreateDirRecursiveSuccessDirectoryOnly) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirRecursiveSuccessDirectoryOnly) { this->TestCreateDirRecursiveSuccessDirectoryOnly(); } -TYPED_TEST(AzureFileSystemTestOnAllEnvs, CreateDirRecursiveSuccessContainerAndDirectory) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, + CreateDirRecursiveSuccessContainerAndDirectory) { this->TestCreateDirRecursiveSuccessContainerAndDirectory(); } @@ -818,41 +874,41 @@ TYPED_TEST(AzureFileSystemTestOnAllEnvs, CreateDirRecursiveSuccessContainerAndDi TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirFailureNonexistent) { auto data = SetUpPreexistingData(); const auto path = data.RandomDirectoryPath(rng_); - ASSERT_RAISES(IOError, fs_->DeleteDir(path)); + ASSERT_RAISES(IOError, fs()->DeleteDir(path)); } TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirSuccessHaveBlob) { auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); const auto blob_path = ConcatAbstractPath(directory_path, "hello.txt"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(blob_path)); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(blob_path)); ASSERT_OK(output->Write(std::string_view("hello"))); ASSERT_OK(output->Close()); - arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::File); - ASSERT_OK(fs_->DeleteDir(directory_path)); - arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::NotFound); + AssertFileInfo(fs(), blob_path, FileType::File); + ASSERT_OK(fs()->DeleteDir(directory_path)); + AssertFileInfo(fs(), blob_path, FileType::NotFound); } TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirSuccessHaveDirectory) { auto data = SetUpPreexistingData(); const auto parent = data.RandomDirectoryPath(rng_); const auto path = ConcatAbstractPath(parent, "new-sub"); - ASSERT_OK(fs_->CreateDir(path, true)); - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory); - ASSERT_OK(fs_->DeleteDir(parent)); - arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound); - arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(path, true)); + AssertFileInfo(fs(), path, FileType::Directory); + 
AssertFileInfo(fs(), parent, FileType::Directory); + ASSERT_OK(fs()->DeleteDir(parent)); + AssertFileInfo(fs(), path, FileType::NotFound); + AssertFileInfo(fs(), parent, FileType::NotFound); } TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirContentsSuccessExist) { auto preexisting_data = SetUpPreexistingData(); HierarchicalPaths paths; CreateHierarchicalData(&paths); - ASSERT_OK(fs_->DeleteDirContents(paths.directory)); - arrow::fs::AssertFileInfo(fs_.get(), paths.directory, FileType::Directory); + ASSERT_OK(fs()->DeleteDirContents(paths.directory)); + AssertFileInfo(fs(), paths.directory, FileType::Directory); for (const auto& sub_path : paths.sub_paths) { - arrow::fs::AssertFileInfo(fs_.get(), sub_path, FileType::NotFound); + AssertFileInfo(fs(), sub_path, FileType::NotFound); } } @@ -867,20 +923,20 @@ TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirContentsFailureNonexistent) { // Tests using Azurite (the local Azure emulator) TEST_F(TestAzuriteFileSystem, GetFileInfoAccount) { - AssertFileInfo(fs_.get(), "", FileType::Directory); + AssertFileInfo(fs(), "", FileType::Directory); // URI - ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://")); + ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://")); } TEST_F(TestAzuriteFileSystem, GetFileInfoContainer) { auto data = SetUpPreexistingData(); - AssertFileInfo(fs_.get(), data.container_name, FileType::Directory); + AssertFileInfo(fs(), data.container_name, FileType::Directory); - AssertFileInfo(fs_.get(), "nonexistent-container", FileType::NotFound); + AssertFileInfo(fs(), "nonexistent-container", FileType::NotFound); // URI - ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + data.container_name)); + ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://" + data.container_name)); } TEST_F(TestAzuriteFileSystem, GetFileInfoSelector) { @@ -891,7 +947,7 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelector) { // Root dir select.base_dir = ""; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 2); ASSERT_EQ(infos, SortedInfos(infos)); AssertFileInfo(infos[0], "container", FileType::Directory); @@ -899,18 +955,18 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelector) { // Empty container select.base_dir = "empty-container"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); // Nonexistent container select.base_dir = "nonexistent-container"; - ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + ASSERT_RAISES(IOError, fs()->GetFileInfo(select)); select.allow_not_found = true; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); select.allow_not_found = false; // Non-empty container select.base_dir = "container"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos, SortedInfos(infos)); ASSERT_EQ(infos.size(), 4); AssertFileInfo(infos[0], "container/emptydir", FileType::Directory); @@ -920,33 +976,33 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelector) { // Empty "directory" select.base_dir = "container/emptydir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); // Non-empty "directories" select.base_dir = "container/somedir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, 
fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 1); AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory); select.base_dir = "container/somedir/subdir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 1); AssertFileInfo(infos[0], "container/somedir/subdir/subfile", FileType::File, 8); // Nonexistent select.base_dir = "container/nonexistent"; - ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + ASSERT_RAISES(IOError, fs()->GetFileInfo(select)); select.allow_not_found = true; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); select.allow_not_found = false; // Trailing slashes select.base_dir = "empty-container/"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); select.base_dir = "nonexistent-container/"; - ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + ASSERT_RAISES(IOError, fs()->GetFileInfo(select)); select.base_dir = "container/"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos, SortedInfos(infos)); ASSERT_EQ(infos.size(), 4); } @@ -960,19 +1016,19 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelectorRecursive) { std::vector infos; // Root dir select.base_dir = ""; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 12); ASSERT_EQ(infos, SortedInfos(infos)); AssertInfoAllContainersRecursive(infos); // Empty container select.base_dir = "empty-container"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); // Non-empty container select.base_dir = "container"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos, SortedInfos(infos)); ASSERT_EQ(infos.size(), 10); AssertFileInfo(infos[0], "container/emptydir", FileType::Directory); @@ -988,19 +1044,19 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelectorRecursive) { // Empty "directory" select.base_dir = "container/emptydir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); // Non-empty "directories" select.base_dir = "container/somedir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos, SortedInfos(infos)); ASSERT_EQ(infos.size(), 2); AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory); AssertFileInfo(infos[1], "container/somedir/subdir/subfile", FileType::File, 8); select.base_dir = "container/otherdir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos, SortedInfos(infos)); ASSERT_EQ(infos.size(), 4); AssertFileInfo(infos[0], "container/otherdir/1", FileType::Directory); @@ -1023,13 +1079,13 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelectorExplicitImplicitDirDedup) { FileSelector select; // non-recursive select.base_dir = "container"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 1); ASSERT_EQ(infos, SortedInfos(infos)); 
AssertFileInfo(infos[0], "container/mydir", FileType::Directory); select.base_dir = "container/mydir"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 4); ASSERT_EQ(infos, SortedInfos(infos)); AssertFileInfo(infos[0], "container/mydir/emptydir1", FileType::Directory); @@ -1038,55 +1094,55 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelectorExplicitImplicitDirDedup) { AssertFileInfo(infos[3], "container/mydir/nonemptydir2", FileType::Directory); select.base_dir = "container/mydir/emptydir1"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); select.base_dir = "container/mydir/emptydir2"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 0); select.base_dir = "container/mydir/nonemptydir1"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 1); AssertFileInfo(infos[0], "container/mydir/nonemptydir1/somefile", FileType::File); select.base_dir = "container/mydir/nonemptydir2"; - ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_OK_AND_ASSIGN(infos, fs()->GetFileInfo(select)); ASSERT_EQ(infos.size(), 1); AssertFileInfo(infos[0], "container/mydir/nonemptydir2/somefile", FileType::File); } TEST_F(TestAzuriteFileSystem, CreateDirFailureNoContainer) { - ASSERT_RAISES(Invalid, fs_->CreateDir("", false)); + ASSERT_RAISES(Invalid, fs()->CreateDir("", false)); } TEST_F(TestAzuriteFileSystem, CreateDirSuccessContainerOnly) { auto container_name = PreexistingData::RandomContainerName(rng_); - ASSERT_OK(fs_->CreateDir(container_name, false)); - arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory); + ASSERT_OK(fs()->CreateDir(container_name, false)); + AssertFileInfo(fs(), container_name, FileType::Directory); } TEST_F(TestAzuriteFileSystem, CreateDirFailureDirectoryWithMissingContainer) { const auto path = std::string("not-a-container/new-directory"); - ASSERT_RAISES(IOError, fs_->CreateDir(path, false)); + ASSERT_RAISES(IOError, fs()->CreateDir(path, false)); } TEST_F(TestAzuriteFileSystem, CreateDirRecursiveFailureNoContainer) { - ASSERT_RAISES(Invalid, fs_->CreateDir("", true)); + ASSERT_RAISES(Invalid, fs()->CreateDir("", true)); } TEST_F(TestAzuriteFileSystem, CreateDirUri) { ASSERT_RAISES( Invalid, - fs_->CreateDir("abfs://" + PreexistingData::RandomContainerName(rng_), true)); + fs()->CreateDir("abfs://" + PreexistingData::RandomContainerName(rng_), true)); } TEST_F(TestAzuriteFileSystem, DeleteDirSuccessContainer) { const auto container_name = PreexistingData::RandomContainerName(rng_); - ASSERT_OK(fs_->CreateDir(container_name)); - arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory); - ASSERT_OK(fs_->DeleteDir(container_name)); - arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(container_name)); + AssertFileInfo(fs(), container_name, FileType::Directory); + ASSERT_OK(fs()->DeleteDir(container_name)); + AssertFileInfo(fs(), container_name, FileType::NotFound); } TEST_F(TestAzuriteFileSystem, DeleteDirSuccessNonexistent) { @@ -1094,8 +1150,8 @@ TEST_F(TestAzuriteFileSystem, DeleteDirSuccessNonexistent) { const auto directory_path = data.RandomDirectoryPath(rng_); // There is only virtual directory without hierarchical namespace // 
support. So the DeleteDir() for nonexistent directory does nothing. - ASSERT_OK(fs_->DeleteDir(directory_path)); - arrow::fs::AssertFileInfo(fs_.get(), directory_path, FileType::NotFound); + ASSERT_OK(fs()->DeleteDir(directory_path)); + AssertFileInfo(fs(), directory_path, FileType::NotFound); } TEST_F(TestAzuriteFileSystem, DeleteDirSuccessHaveBlobs) { @@ -1110,21 +1166,21 @@ TEST_F(TestAzuriteFileSystem, DeleteDirSuccessHaveBlobs) { int64_t n_blobs = 257; for (int64_t i = 0; i < n_blobs; ++i) { const auto blob_path = ConcatAbstractPath(directory_path, std::to_string(i) + ".txt"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(blob_path)); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(blob_path)); ASSERT_OK(output->Write(std::string_view(std::to_string(i)))); ASSERT_OK(output->Close()); - arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::File); + AssertFileInfo(fs(), blob_path, FileType::File); } - ASSERT_OK(fs_->DeleteDir(directory_path)); + ASSERT_OK(fs()->DeleteDir(directory_path)); for (int64_t i = 0; i < n_blobs; ++i) { const auto blob_path = ConcatAbstractPath(directory_path, std::to_string(i) + ".txt"); - arrow::fs::AssertFileInfo(fs_.get(), blob_path, FileType::NotFound); + AssertFileInfo(fs(), blob_path, FileType::NotFound); } } TEST_F(TestAzuriteFileSystem, DeleteDirUri) { auto data = SetUpPreexistingData(); - ASSERT_RAISES(Invalid, fs_->DeleteDir("abfs://" + data.container_name + "/")); + ASSERT_RAISES(Invalid, fs()->DeleteDir("abfs://" + data.container_name + "/")); } TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessContainer) { @@ -1135,11 +1191,11 @@ TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessContainer) { auto data = SetUpPreexistingData(); HierarchicalPaths paths; CreateHierarchicalData(&paths); - ASSERT_OK(fs_->DeleteDirContents(paths.container)); - arrow::fs::AssertFileInfo(fs_.get(), paths.container, FileType::Directory); - arrow::fs::AssertFileInfo(fs_.get(), paths.directory, FileType::NotFound); + ASSERT_OK(fs()->DeleteDirContents(paths.container)); + AssertFileInfo(fs(), paths.container, FileType::Directory); + AssertFileInfo(fs(), paths.directory, FileType::NotFound); for (const auto& sub_path : paths.sub_paths) { - arrow::fs::AssertFileInfo(fs_.get(), sub_path, FileType::NotFound); + AssertFileInfo(fs(), sub_path, FileType::NotFound); } } @@ -1151,11 +1207,11 @@ TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessDirectory) { auto data = SetUpPreexistingData(); HierarchicalPaths paths; CreateHierarchicalData(&paths); - ASSERT_OK(fs_->DeleteDirContents(paths.directory)); + ASSERT_OK(fs()->DeleteDirContents(paths.directory)); // GH-38772: We may change this to FileType::Directory. 
- arrow::fs::AssertFileInfo(fs_.get(), paths.directory, FileType::NotFound); + AssertFileInfo(fs(), paths.directory, FileType::NotFound); for (const auto& sub_path : paths.sub_paths) { - arrow::fs::AssertFileInfo(fs_.get(), sub_path, FileType::NotFound); + AssertFileInfo(fs(), sub_path, FileType::NotFound); } } @@ -1170,52 +1226,52 @@ TEST_F(TestAzuriteFileSystem, DeleteDirContentsFailureNonexistent) { TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationNonexistent) { auto data = SetUpPreexistingData(); const auto destination_path = data.ContainerPath("copy-destionation"); - ASSERT_OK(fs_->CopyFile(data.ObjectPath(), destination_path)); - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(destination_path)); - ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(info)); + ASSERT_OK(fs()->CopyFile(data.ObjectPath(), destination_path)); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(destination_path)); + ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputStream(info)); ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); EXPECT_EQ(PreexistingData::kLoremIpsum, buffer->ToString()); } TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationSame) { auto data = SetUpPreexistingData(); - ASSERT_OK(fs_->CopyFile(data.ObjectPath(), data.ObjectPath())); - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(data.ObjectPath())); - ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(info)); + ASSERT_OK(fs()->CopyFile(data.ObjectPath(), data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputStream(info)); ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); EXPECT_EQ(PreexistingData::kLoremIpsum, buffer->ToString()); } TEST_F(TestAzuriteFileSystem, CopyFileFailureDestinationTrailingSlash) { auto data = SetUpPreexistingData(); - ASSERT_RAISES(IOError, fs_->CopyFile(data.ObjectPath(), - internal::EnsureTrailingSlash(data.ObjectPath()))); + ASSERT_RAISES(IOError, fs()->CopyFile(data.ObjectPath(), internal::EnsureTrailingSlash( + data.ObjectPath()))); } TEST_F(TestAzuriteFileSystem, CopyFileFailureSourceNonexistent) { auto data = SetUpPreexistingData(); const auto destination_path = data.ContainerPath("copy-destionation"); - ASSERT_RAISES(IOError, fs_->CopyFile(data.NotFoundObjectPath(), destination_path)); + ASSERT_RAISES(IOError, fs()->CopyFile(data.NotFoundObjectPath(), destination_path)); } TEST_F(TestAzuriteFileSystem, CopyFileFailureDestinationParentNonexistent) { auto data = SetUpPreexistingData(); const auto destination_path = ConcatAbstractPath(PreexistingData::RandomContainerName(rng_), "copy-destionation"); - ASSERT_RAISES(IOError, fs_->CopyFile(data.ObjectPath(), destination_path)); + ASSERT_RAISES(IOError, fs()->CopyFile(data.ObjectPath(), destination_path)); } TEST_F(TestAzuriteFileSystem, CopyFileUri) { auto data = SetUpPreexistingData(); const auto destination_path = data.ContainerPath("copy-destionation"); - ASSERT_RAISES(Invalid, fs_->CopyFile("abfs://" + data.ObjectPath(), destination_path)); - ASSERT_RAISES(Invalid, fs_->CopyFile(data.ObjectPath(), "abfs://" + destination_path)); + ASSERT_RAISES(Invalid, fs()->CopyFile("abfs://" + data.ObjectPath(), destination_path)); + ASSERT_RAISES(Invalid, fs()->CopyFile(data.ObjectPath(), "abfs://" + destination_path)); } TEST_F(TestAzuriteFileSystem, OpenInputStreamString) { auto data = SetUpPreexistingData(); std::shared_ptr stream; - ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(stream, 
fs()->OpenInputStream(data.ObjectPath())); ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); EXPECT_EQ(buffer->ToString(), PreexistingData::kLoremIpsum); @@ -1224,7 +1280,7 @@ TEST_F(TestAzuriteFileSystem, OpenInputStreamString) { TEST_F(TestAzuriteFileSystem, OpenInputStreamStringBuffers) { auto data = SetUpPreexistingData(); std::shared_ptr stream; - ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(stream, fs()->OpenInputStream(data.ObjectPath())); std::string contents; std::shared_ptr buffer; @@ -1238,10 +1294,10 @@ TEST_F(TestAzuriteFileSystem, OpenInputStreamStringBuffers) { TEST_F(TestAzuriteFileSystem, OpenInputStreamInfo) { auto data = SetUpPreexistingData(); - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(data.ObjectPath())); std::shared_ptr stream; - ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(info)); + ASSERT_OK_AND_ASSIGN(stream, fs()->OpenInputStream(info)); ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); EXPECT_EQ(buffer->ToString(), PreexistingData::kLoremIpsum); @@ -1255,7 +1311,7 @@ TEST_F(TestAzuriteFileSystem, OpenInputStreamEmpty) { .GetBlockBlobClient(path_to_file) .UploadFrom(nullptr, 0); - ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputStream(path)); std::array buffer{}; std::int64_t size; ASSERT_OK_AND_ASSIGN(size, stream->Read(buffer.size(), buffer.data())); @@ -1264,26 +1320,26 @@ TEST_F(TestAzuriteFileSystem, OpenInputStreamEmpty) { TEST_F(TestAzuriteFileSystem, OpenInputStreamNotFound) { auto data = SetUpPreexistingData(); - ASSERT_RAISES(IOError, fs_->OpenInputStream(data.NotFoundObjectPath())); + ASSERT_RAISES(IOError, fs()->OpenInputStream(data.NotFoundObjectPath())); } TEST_F(TestAzuriteFileSystem, OpenInputStreamInfoInvalid) { auto data = SetUpPreexistingData(); - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(data.container_name + "/")); - ASSERT_RAISES(IOError, fs_->OpenInputStream(info)); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(data.container_name + "/")); + ASSERT_RAISES(IOError, fs()->OpenInputStream(info)); - ASSERT_OK_AND_ASSIGN(auto info2, fs_->GetFileInfo(data.NotFoundObjectPath())); - ASSERT_RAISES(IOError, fs_->OpenInputStream(info2)); + ASSERT_OK_AND_ASSIGN(auto info2, fs()->GetFileInfo(data.NotFoundObjectPath())); + ASSERT_RAISES(IOError, fs()->OpenInputStream(info2)); } TEST_F(TestAzuriteFileSystem, OpenInputStreamUri) { auto data = SetUpPreexistingData(); - ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfs://" + data.ObjectPath())); + ASSERT_RAISES(Invalid, fs()->OpenInputStream("abfs://" + data.ObjectPath())); } TEST_F(TestAzuriteFileSystem, OpenInputStreamTrailingSlash) { auto data = SetUpPreexistingData(); - ASSERT_RAISES(IOError, fs_->OpenInputStream(data.ObjectPath() + '/')); + ASSERT_RAISES(IOError, fs()->OpenInputStream(data.ObjectPath() + '/')); } namespace { @@ -1324,7 +1380,7 @@ std::shared_ptr NormalizerKeyValueMetadata( TEST_F(TestAzuriteFileSystem, OpenInputStreamReadMetadata) { auto data = SetUpPreexistingData(); std::shared_ptr stream; - ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(stream, fs()->OpenInputStream(data.ObjectPath())); std::shared_ptr actual; ASSERT_OK_AND_ASSIGN(actual, stream->ReadMetadata()); @@ -1354,7 +1410,7 @@ TEST_F(TestAzuriteFileSystem, OpenInputStreamReadMetadata) { TEST_F(TestAzuriteFileSystem, OpenInputStreamClosed) { auto data = 
SetUpPreexistingData(); - ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputStream(data.ObjectPath())); ASSERT_OK(stream->Close()); std::array buffer{}; ASSERT_RAISES(Invalid, stream->Read(buffer.size(), buffer.data())); @@ -1399,13 +1455,13 @@ TEST_F(TestAzuriteFileSystem, WriteMetadata) { TEST_F(TestAzuriteFileSystem, OpenOutputStreamSmall) { auto data = SetUpPreexistingData(); const auto path = data.ContainerPath("test-write-object"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); const std::string_view expected(PreexistingData::kLoremIpsum); ASSERT_OK(output->Write(expected)); ASSERT_OK(output->Close()); // Verify we can read the object back. - ASSERT_OK_AND_ASSIGN(auto input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(auto input, fs()->OpenInputStream(path)); std::array inbuf{}; ASSERT_OK_AND_ASSIGN(auto size, input->Read(inbuf.size(), inbuf.data())); @@ -1416,7 +1472,7 @@ TEST_F(TestAzuriteFileSystem, OpenOutputStreamSmall) { TEST_F(TestAzuriteFileSystem, OpenOutputStreamLarge) { auto data = SetUpPreexistingData(); const auto path = data.ContainerPath("test-write-object"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); std::array sizes{257 * 1024, 258 * 1024, 259 * 1024}; std::array buffers{ std::string(sizes[0], 'A'), @@ -1432,7 +1488,7 @@ TEST_F(TestAzuriteFileSystem, OpenOutputStreamLarge) { ASSERT_OK(output->Close()); // Verify we can read the object back. - ASSERT_OK_AND_ASSIGN(auto input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(auto input, fs()->OpenInputStream(path)); std::string contents; std::shared_ptr buffer; @@ -1448,26 +1504,26 @@ TEST_F(TestAzuriteFileSystem, OpenOutputStreamLarge) { TEST_F(TestAzuriteFileSystem, OpenOutputStreamTruncatesExistingFile) { auto data = SetUpPreexistingData(); const auto path = data.ContainerPath("test-write-object"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); const std::string_view expected0("Existing blob content"); ASSERT_OK(output->Write(expected0)); ASSERT_OK(output->Close()); // Check that the initial content has been written - if not this test is not achieving // what it's meant to. - ASSERT_OK_AND_ASSIGN(auto input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(auto input, fs()->OpenInputStream(path)); std::array inbuf{}; ASSERT_OK_AND_ASSIGN(auto size, input->Read(inbuf.size(), inbuf.data())); EXPECT_EQ(expected0, std::string_view(inbuf.data(), size)); - ASSERT_OK_AND_ASSIGN(output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(output, fs()->OpenOutputStream(path, {})); const std::string_view expected1(PreexistingData::kLoremIpsum); ASSERT_OK(output->Write(expected1)); ASSERT_OK(output->Close()); // Verify that the initial content has been overwritten. 
- ASSERT_OK_AND_ASSIGN(input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(input, fs()->OpenInputStream(path)); ASSERT_OK_AND_ASSIGN(size, input->Read(inbuf.size(), inbuf.data())); EXPECT_EQ(expected1, std::string_view(inbuf.data(), size)); } @@ -1475,27 +1531,27 @@ TEST_F(TestAzuriteFileSystem, OpenOutputStreamTruncatesExistingFile) { TEST_F(TestAzuriteFileSystem, OpenAppendStreamDoesNotTruncateExistingFile) { auto data = SetUpPreexistingData(); const auto path = data.ContainerPath("test-write-object"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); const std::string_view expected0("Existing blob content"); ASSERT_OK(output->Write(expected0)); ASSERT_OK(output->Close()); // Check that the initial content has been written - if not this test is not achieving // what it's meant to. - ASSERT_OK_AND_ASSIGN(auto input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(auto input, fs()->OpenInputStream(path)); std::array inbuf{}; ASSERT_OK_AND_ASSIGN(auto size, input->Read(inbuf.size(), inbuf.data())); EXPECT_EQ(expected0, std::string_view(inbuf.data())); - ASSERT_OK_AND_ASSIGN(output, fs_->OpenAppendStream(path, {})); + ASSERT_OK_AND_ASSIGN(output, fs()->OpenAppendStream(path, {})); const std::string_view expected1(PreexistingData::kLoremIpsum); ASSERT_OK(output->Write(expected1)); ASSERT_OK(output->Close()); // Verify that the initial content has not been overwritten and that the block from // the other client was not committed. - ASSERT_OK_AND_ASSIGN(input, fs_->OpenInputStream(path)); + ASSERT_OK_AND_ASSIGN(input, fs()->OpenInputStream(path)); ASSERT_OK_AND_ASSIGN(size, input->Read(inbuf.size(), inbuf.data())); EXPECT_EQ(std::string(inbuf.data(), size), std::string(expected0) + std::string(expected1)); @@ -1504,7 +1560,7 @@ TEST_F(TestAzuriteFileSystem, OpenAppendStreamDoesNotTruncateExistingFile) { TEST_F(TestAzuriteFileSystem, OpenOutputStreamClosed) { auto data = SetUpPreexistingData(); const auto path = data.ContainerPath("open-output-stream-closed.txt"); - ASSERT_OK_AND_ASSIGN(auto output, fs_->OpenOutputStream(path, {})); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); ASSERT_OK(output->Close()); ASSERT_RAISES(Invalid, output->Write(PreexistingData::kLoremIpsum, std::strlen(PreexistingData::kLoremIpsum))); @@ -1515,7 +1571,7 @@ TEST_F(TestAzuriteFileSystem, OpenOutputStreamClosed) { TEST_F(TestAzuriteFileSystem, OpenOutputStreamUri) { auto data = SetUpPreexistingData(); const auto path = data.ContainerPath("open-output-stream-uri.txt"); - ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfs://" + path)); + ASSERT_RAISES(Invalid, fs()->OpenInputStream("abfs://" + path)); } TEST_F(TestAzuriteFileSystem, OpenInputFileMixedReadVsReadAt) { @@ -1534,7 +1590,7 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileMixedReadVsReadAt) { UploadLines(lines, path, kLineCount * kLineWidth); std::shared_ptr file; - ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(path)); + ASSERT_OK_AND_ASSIGN(file, fs()->OpenInputFile(path)); for (int i = 0; i != 32; ++i) { SCOPED_TRACE("Iteration " + std::to_string(i)); // Verify sequential reads work as expected. 
@@ -1582,7 +1638,7 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileRandomSeek) { UploadLines(lines, path, kLineCount * kLineWidth); std::shared_ptr file; - ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(path)); + ASSERT_OK_AND_ASSIGN(file, fs()->OpenInputFile(path)); for (int i = 0; i != 32; ++i) { SCOPED_TRACE("Iteration " + std::to_string(i)); // Verify sequential reads work as expected. @@ -1607,16 +1663,16 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileIoContext) { contents.length()); std::shared_ptr file; - ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(path)); - EXPECT_EQ(fs_->io_context().external_id(), file->io_context().external_id()); + ASSERT_OK_AND_ASSIGN(file, fs()->OpenInputFile(path)); + EXPECT_EQ(fs()->io_context().external_id(), file->io_context().external_id()); } TEST_F(TestAzuriteFileSystem, OpenInputFileInfo) { auto data = SetUpPreexistingData(); - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(data.ObjectPath())); std::shared_ptr file; - ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(info)); + ASSERT_OK_AND_ASSIGN(file, fs()->OpenInputFile(info)); std::array buffer{}; std::int64_t size; @@ -1629,21 +1685,21 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileInfo) { TEST_F(TestAzuriteFileSystem, OpenInputFileNotFound) { auto data = SetUpPreexistingData(); - ASSERT_RAISES(IOError, fs_->OpenInputFile(data.NotFoundObjectPath())); + ASSERT_RAISES(IOError, fs()->OpenInputFile(data.NotFoundObjectPath())); } TEST_F(TestAzuriteFileSystem, OpenInputFileInfoInvalid) { auto data = SetUpPreexistingData(); - ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(data.container_name)); - ASSERT_RAISES(IOError, fs_->OpenInputFile(info)); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(data.container_name)); + ASSERT_RAISES(IOError, fs()->OpenInputFile(info)); - ASSERT_OK_AND_ASSIGN(auto info2, fs_->GetFileInfo(data.NotFoundObjectPath())); - ASSERT_RAISES(IOError, fs_->OpenInputFile(info2)); + ASSERT_OK_AND_ASSIGN(auto info2, fs()->GetFileInfo(data.NotFoundObjectPath())); + ASSERT_RAISES(IOError, fs()->OpenInputFile(info2)); } TEST_F(TestAzuriteFileSystem, OpenInputFileClosed) { auto data = SetUpPreexistingData(); - ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputFile(data.ObjectPath())); + ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputFile(data.ObjectPath())); ASSERT_OK(stream->Close()); std::array buffer{}; ASSERT_RAISES(Invalid, stream->Tell()); @@ -1654,6 +1710,5 @@ TEST_F(TestAzuriteFileSystem, OpenInputFileClosed) { ASSERT_RAISES(Invalid, stream->Seek(2)); } -} // namespace } // namespace fs } // namespace arrow From b70ad0b8801d9ca0634c1937df1fc02c1609548e Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 21 Dec 2023 22:00:25 +0100 Subject: [PATCH 05/31] GH-31303: [Python] Remove the legacy ParquetDataset custom python-based implementation (#39112) ### Rationale for this change Legacy ParquetDataset has been deprecated for a while now, see https://github.com/apache/arrow/issues/31529. This PR is removing the legacy implementation from the code. ### What changes are included in this PR? The PR is removing: - `ParquetDatasetPiece ` - `ParquetManifest` - `_ParquetDatasetMetadata ` - `ParquetDataset` The PR is renaming `_ParquetDatasetV2` to `ParquetDataset` which was removed. It is also updating the docstrings. The PR is updating: - `read_table` - `write_to_dataset` The PR is updating all the tests to not use `use_legacy_dataset` keyword or legacy parametrisation. ### Are these changes tested? Yes. 
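For context, a minimal sketch (not part of this patch) of the dataset-API code path that remains once the legacy classes are removed. The root directory, column names, and values below are illustrative only, and the comments state assumptions rather than guarantees:

```python
# Sketch only: exercises the surviving ParquetDataset / write_to_dataset path.
# The dataset location and sample data are made up for illustration.
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({
    "year": [2020, 2022, 2021, 2022, 2019, 2021],
    "n_legs": [2, 2, 4, 4, 5, 100],
    "animal": ["Flamingo", "Parrot", "Dog", "Horse",
               "Brittle stars", "Centipede"],
})

root = tempfile.mkdtemp()

# No use_legacy_dataset keyword is needed anymore when writing.
pq.write_to_dataset(table, root_path=root, partition_cols=["year"])

# ParquetDataset is now the dataset-API implementation (formerly
# _ParquetDatasetV2); filters may reference any column, not only
# partition keys.
dataset = pq.ParquetDataset(root, filters=[("n_legs", "=", 4)])

# Partition keys are not added automatically; list them in `columns`
# if they are wanted in the result.
print(dataset.read(columns=["n_legs", "animal"]))
```

Reading the same directory through `pq.read_table(root, filters=...)` goes through the same dataset machinery.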
### Are there any user-facing changes? Deprecated code is removed. * Closes: #31303 --- docs/source/python/parquet.rst | 38 +- python/benchmarks/parquet.py | 29 - python/pyarrow/parquet/core.py | 1882 +++-------------- python/pyarrow/tests/parquet/__init__.py | 3 - python/pyarrow/tests/parquet/common.py | 39 +- python/pyarrow/tests/parquet/test_basic.py | 278 +-- .../parquet/test_compliant_nested_type.py | 19 +- .../pyarrow/tests/parquet/test_data_types.py | 94 +- python/pyarrow/tests/parquet/test_dataset.py | 926 ++------ python/pyarrow/tests/parquet/test_datetime.py | 14 +- python/pyarrow/tests/parquet/test_pandas.py | 192 +- .../tests/parquet/test_parquet_file.py | 25 +- .../tests/parquet/test_parquet_writer.py | 27 +- python/pyarrow/tests/test_dataset.py | 68 +- python/pyarrow/tests/test_hdfs.py | 25 +- 15 files changed, 630 insertions(+), 3029 deletions(-) diff --git a/docs/source/python/parquet.rst b/docs/source/python/parquet.rst index 85a9674a689ca..d4717897660b6 100644 --- a/docs/source/python/parquet.rst +++ b/docs/source/python/parquet.rst @@ -511,36 +511,20 @@ from a remote filesystem into a pandas dataframe you may need to run ``sort_index`` to maintain row ordering (as long as the ``preserve_index`` option was enabled on write). -.. note:: - - The ParquetDataset is being reimplemented based on the new generic Dataset - API (see the :ref:`dataset` docs for an overview). This is not yet the - default, but can already be enabled by passing the ``use_legacy_dataset=False`` - keyword to :class:`ParquetDataset` or :func:`read_table`:: - - pq.ParquetDataset('dataset_name/', use_legacy_dataset=False) - - Enabling this gives the following new features: - - - Filtering on all columns (using row group statistics) instead of only on - the partition keys. - - More fine-grained partitioning: support for a directory partitioning scheme - in addition to the Hive-like partitioning (e.g. "/2019/11/15/" instead of - "/year=2019/month=11/day=15/"), and the ability to specify a schema for - the partition keys. - - General performance improvement and bug fixes. +Other features: - It also has the following changes in behaviour: +- Filtering on all columns (using row group statistics) instead of only on + the partition keys. +- Fine-grained partitioning: support for a directory partitioning scheme + in addition to the Hive-like partitioning (e.g. "/2019/11/15/" instead of + "/year=2019/month=11/day=15/"), and the ability to specify a schema for + the partition keys. - - The partition keys need to be explicitly included in the ``columns`` - keyword when you want to include them in the result while reading a - subset of the columns +Note: - This new implementation is already enabled in ``read_table``, and in the - future, this will be turned on by default for ``ParquetDataset``. The new - implementation does not yet cover all existing ParquetDataset features (e.g. - specifying the ``metadata``, or the ``pieces`` property API). Feedback is - very welcome. 
+- The partition keys need to be explicitly included in the ``columns`` + keyword when you want to include them in the result while reading a + subset of the columns Using with Spark diff --git a/python/benchmarks/parquet.py b/python/benchmarks/parquet.py index 3aeca425bc8f0..e459ea2c369b4 100644 --- a/python/benchmarks/parquet.py +++ b/python/benchmarks/parquet.py @@ -29,35 +29,6 @@ pq = None -class ParquetManifestCreation(object): - """Benchmark creating a parquet manifest.""" - - size = 10 ** 6 - tmpdir = None - - param_names = ('num_partitions', 'num_threads') - params = [(10, 100, 1000), (1, 8)] - - def setup(self, num_partitions, num_threads): - if pq is None: - raise NotImplementedError("Parquet support not enabled") - - self.tmpdir = tempfile.mkdtemp('benchmark_parquet') - rnd = np.random.RandomState(42) - num1 = rnd.randint(0, num_partitions, size=self.size) - num2 = rnd.randint(0, 1000, size=self.size) - output_df = pd.DataFrame({'num1': num1, 'num2': num2}) - output_table = pa.Table.from_pandas(output_df) - pq.write_to_dataset(output_table, self.tmpdir, ['num1']) - - def teardown(self, num_partitions, num_threads): - if self.tmpdir is not None: - shutil.rmtree(self.tmpdir) - - def time_manifest_creation(self, num_partitions, num_threads): - pq.ParquetManifest(self.tmpdir, metadata_nthreads=num_threads) - - class ParquetWriteBinary(object): def setup(self): diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 852b339211b0d..98a4b2a1138c7 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -17,22 +17,17 @@ from collections import defaultdict -from concurrent import futures from contextlib import nullcontext -from functools import partial, reduce +from functools import reduce import inspect import json -from collections.abc import Collection -import numpy as np import os import re import operator -import urllib.parse import warnings import pyarrow as pa -import pyarrow.lib as lib try: import pyarrow._parquet as _parquet @@ -55,28 +50,6 @@ from pyarrow import filesystem as legacyfs from pyarrow.util import guid, _is_path_like, _stringify_path, _deprecate_api -_URI_STRIP_SCHEMES = ('hdfs',) - - -def _parse_uri(path): - path = _stringify_path(path) - parsed_uri = urllib.parse.urlparse(path) - if parsed_uri.scheme in _URI_STRIP_SCHEMES: - return parsed_uri.path - else: - # ARROW-4073: On Windows returning the path with the scheme - # stripped removes the drive letter, if any - return path - - -def _get_filesystem_and_path(passed_filesystem, path): - if passed_filesystem is None: - return legacyfs.resolve_filesystem_and_path(path, passed_filesystem) - else: - passed_filesystem = legacyfs._ensure_filesystem(passed_filesystem) - parsed_path = _parse_uri(path) - return passed_filesystem, parsed_path - def _check_contains_null(val): if isinstance(val, bytes): @@ -1148,516 +1121,15 @@ def _get_pandas_index_columns(keyvalues): ['index_columns']) -# ---------------------------------------------------------------------- -# Metadata container providing instructions about reading a single Parquet -# file, possibly part of a partitioned dataset - - -class ParquetDatasetPiece: - """ - DEPRECATED: A single chunk of a potentially larger Parquet dataset to read. - - The arguments will indicate to read either a single row group or all row - groups, and whether to add partition keys to the resulting pyarrow.Table. - - .. 
deprecated:: 5.0 - Directly constructing a ``ParquetDatasetPiece`` is deprecated, as well - as accessing the pieces of a ``ParquetDataset`` object. Specify - ``use_legacy_dataset=False`` when constructing the ``ParquetDataset`` - and use the ``ParquetDataset.fragments`` attribute instead. - - Parameters - ---------- - path : str or pathlib.Path - Path to file in the file system where this piece is located. - open_file_func : callable - Function to use for obtaining file handle to dataset piece. - file_options : dict - Options - row_group : int, default None - Row group to load. By default, reads all row groups. - partition_keys : list of tuples - Two-element tuples of ``(column name, ordinal index)``. - """ - - def __init__(self, path, open_file_func=partial(open, mode='rb'), - file_options=None, row_group=None, partition_keys=None): - warnings.warn( - "ParquetDatasetPiece is deprecated as of pyarrow 5.0.0 and will " - "be removed in a future version.", - FutureWarning, stacklevel=2) - self._init( - path, open_file_func, file_options, row_group, partition_keys) - - @staticmethod - def _create(path, open_file_func=partial(open, mode='rb'), - file_options=None, row_group=None, partition_keys=None): - self = ParquetDatasetPiece.__new__(ParquetDatasetPiece) - self._init( - path, open_file_func, file_options, row_group, partition_keys) - return self - - def _init(self, path, open_file_func, file_options, row_group, - partition_keys): - self.path = _stringify_path(path) - self.open_file_func = open_file_func - self.row_group = row_group - self.partition_keys = partition_keys or [] - self.file_options = file_options or {} - - def __eq__(self, other): - if not isinstance(other, ParquetDatasetPiece): - return False - return (self.path == other.path and - self.row_group == other.row_group and - self.partition_keys == other.partition_keys) - - def __repr__(self): - return ('{}({!r}, row_group={!r}, partition_keys={!r})' - .format(type(self).__name__, self.path, - self.row_group, - self.partition_keys)) - - def __str__(self): - result = '' - - if len(self.partition_keys) > 0: - partition_str = ', '.join('{}={}'.format(name, index) - for name, index in self.partition_keys) - result += 'partition[{}] '.format(partition_str) - - result += self.path - - if self.row_group is not None: - result += ' | row_group={}'.format(self.row_group) - - return result - - def get_metadata(self): - """ - Return the file's metadata. - - Returns - ------- - metadata : FileMetaData - The file's metadata - """ - with self.open() as parquet: - return parquet.metadata - - def open(self): - """ - Return instance of ParquetFile. - """ - reader = self.open_file_func(self.path) - if not isinstance(reader, ParquetFile): - reader = ParquetFile(reader, **self.file_options) - - # ensure reader knows it's responsible for closing source - # since we opened the source here internally. - reader._close_source = True - return reader - - def read(self, columns=None, use_threads=True, partitions=None, - file=None, use_pandas_metadata=False): - """ - Read this piece as a pyarrow.Table. - - Parameters - ---------- - columns : list of column names, default None - use_threads : bool, default True - Perform multi-threaded column reads. - partitions : ParquetPartitions, default None - file : file-like object - Passed to ParquetFile. - use_pandas_metadata : bool - If pandas metadata should be used or not. - - Returns - ------- - table : pyarrow.Table - The piece as a pyarrow.Table. 
- """ - if self.open_file_func is not None: - reader = self.open() - elif file is not None: - reader = ParquetFile(file, **self.file_options) - else: - # try to read the local path - reader = ParquetFile(self.path, **self.file_options) - - options = dict(columns=columns, - use_threads=use_threads, - use_pandas_metadata=use_pandas_metadata) - - if self.row_group is not None: - table = reader.read_row_group(self.row_group, **options) - else: - table = reader.read(**options) - - if len(self.partition_keys) > 0: - if partitions is None: - raise ValueError('Must pass partition sets') - - # Here, the index is the categorical code of the partition where - # this piece is located. Suppose we had - # - # /foo=a/0.parq - # /foo=b/0.parq - # /foo=c/0.parq - # - # Then we assign a=0, b=1, c=2. And the resulting Table pieces will - # have a DictionaryArray column named foo having the constant index - # value as indicated. The distinct categories of the partition have - # been computed in the ParquetManifest - for i, (name, index) in enumerate(self.partition_keys): - # The partition code is the same for all values in this piece - indices = np.full(len(table), index, dtype='i4') - - # This is set of all partition values, computed as part of the - # manifest, so ['a', 'b', 'c'] as in our example above. - dictionary = partitions.levels[i].dictionary - - arr = pa.DictionaryArray.from_arrays(indices, dictionary) - table = table.append_column(name, arr) - - # To ParquetFile the source looked like it was already open, so won't - # actually close it without overriding. - reader.close(force=True) - return table - - -class PartitionSet: - """ - A data structure for cataloguing the observed Parquet partitions at a - particular level. So if we have - - /foo=a/bar=0 - /foo=a/bar=1 - /foo=a/bar=2 - /foo=b/bar=0 - /foo=b/bar=1 - /foo=b/bar=2 - - Then we have two partition sets, one for foo, another for bar. As we visit - levels of the partition hierarchy, a PartitionSet tracks the distinct - values and assigns categorical codes to use when reading the pieces - - Parameters - ---------- - name : str - Name of the partition set. Under which key to collect all values. - keys : list - All possible values that have been collected for that partition set. - """ - - def __init__(self, name, keys=None): - self.name = name - self.keys = keys or [] - self.key_indices = {k: i for i, k in enumerate(self.keys)} - self._dictionary = None - - def get_index(self, key): - """ - Get the index of the partition value if it is known, otherwise assign - one - - Parameters - ---------- - key : str or int - The value for which we want to known the index. 
- """ - if key in self.key_indices: - return self.key_indices[key] - else: - index = len(self.key_indices) - self.keys.append(key) - self.key_indices[key] = index - return index - - @property - def dictionary(self): - if self._dictionary is not None: - return self._dictionary - - if len(self.keys) == 0: - raise ValueError('No known partition keys') - - # Only integer and string partition types are supported right now - try: - integer_keys = [int(x) for x in self.keys] - dictionary = lib.array(integer_keys) - except ValueError: - dictionary = lib.array(self.keys) - - self._dictionary = dictionary - return dictionary - - @property - def is_sorted(self): - return list(self.keys) == sorted(self.keys) - - -class ParquetPartitions: - - def __init__(self): - self.levels = [] - self.partition_names = set() - - def __len__(self): - return len(self.levels) - - def __getitem__(self, i): - return self.levels[i] - - def equals(self, other): - if not isinstance(other, ParquetPartitions): - raise TypeError('`other` must be an instance of ParquetPartitions') - - return (self.levels == other.levels and - self.partition_names == other.partition_names) - - def __eq__(self, other): - try: - return self.equals(other) - except TypeError: - return NotImplemented - - def get_index(self, level, name, key): - """ - Record a partition value at a particular level, returning the distinct - code for that value at that level. - - Examples - -------- - - partitions.get_index(1, 'foo', 'a') returns 0 - partitions.get_index(1, 'foo', 'b') returns 1 - partitions.get_index(1, 'foo', 'c') returns 2 - partitions.get_index(1, 'foo', 'a') returns 0 - - Parameters - ---------- - level : int - The nesting level of the partition we are observing - name : str - The partition name - key : str or int - The partition value - """ - if level == len(self.levels): - if name in self.partition_names: - raise ValueError('{} was the name of the partition in ' - 'another level'.format(name)) - - part_set = PartitionSet(name) - self.levels.append(part_set) - self.partition_names.add(name) - - return self.levels[level].get_index(key) - - def filter_accepts_partition(self, part_key, filter, level): - p_column, p_value_index = part_key - f_column, op, f_value = filter - if p_column != f_column: - return True - - f_type = type(f_value) - - if op in {'in', 'not in'}: - if not isinstance(f_value, Collection): - raise TypeError( - "'%s' object is not a collection", f_type.__name__) - if not f_value: - raise ValueError("Cannot use empty collection as filter value") - if len({type(item) for item in f_value}) != 1: - raise ValueError("All elements of the collection '%s' must be" - " of same type", f_value) - f_type = type(next(iter(f_value))) - - elif not isinstance(f_value, str) and isinstance(f_value, Collection): - raise ValueError( - "Op '%s' not supported with a collection value", op) - - p_value = f_type(self.levels[level] - .dictionary[p_value_index].as_py()) - - if op == "=" or op == "==": - return p_value == f_value - elif op == "!=": - return p_value != f_value - elif op == '<': - return p_value < f_value - elif op == '>': - return p_value > f_value - elif op == '<=': - return p_value <= f_value - elif op == '>=': - return p_value >= f_value - elif op == 'in': - return p_value in f_value - elif op == 'not in': - return p_value not in f_value - else: - raise ValueError("'%s' is not a valid operator in predicates.", - filter[1]) - - -class ParquetManifest: - - def __init__(self, dirpath, open_file_func=None, filesystem=None, - pathsep='/', 
partition_scheme='hive', metadata_nthreads=1): - filesystem, dirpath = _get_filesystem_and_path(filesystem, dirpath) - self.filesystem = filesystem - self.open_file_func = open_file_func - self.pathsep = pathsep - self.dirpath = _stringify_path(dirpath) - self.partition_scheme = partition_scheme - self.partitions = ParquetPartitions() - self.pieces = [] - self._metadata_nthreads = metadata_nthreads - self._thread_pool = futures.ThreadPoolExecutor( - max_workers=metadata_nthreads) - - self.common_metadata_path = None - self.metadata_path = None - - self._visit_level(0, self.dirpath, []) - - # Due to concurrency, pieces will potentially by out of order if the - # dataset is partitioned so we sort them to yield stable results - self.pieces.sort(key=lambda piece: piece.path) - - if self.common_metadata_path is None: - # _common_metadata is a subset of _metadata - self.common_metadata_path = self.metadata_path - - self._thread_pool.shutdown() - - def _visit_level(self, level, base_path, part_keys): - fs = self.filesystem - - _, directories, files = next(fs.walk(base_path)) - - filtered_files = [] - for path in files: - full_path = self.pathsep.join((base_path, path)) - if path.endswith('_common_metadata'): - self.common_metadata_path = full_path - elif path.endswith('_metadata'): - self.metadata_path = full_path - elif self._should_silently_exclude(path): - continue - else: - filtered_files.append(full_path) - - # ARROW-1079: Filter out "private" directories starting with underscore - filtered_directories = [self.pathsep.join((base_path, x)) - for x in directories - if not _is_private_directory(x)] - - filtered_files.sort() - filtered_directories.sort() - - if len(filtered_files) > 0 and len(filtered_directories) > 0: - raise ValueError('Found files in an intermediate ' - 'directory: {}'.format(base_path)) - elif len(filtered_directories) > 0: - self._visit_directories(level, filtered_directories, part_keys) - else: - self._push_pieces(filtered_files, part_keys) - - def _should_silently_exclude(self, file_name): - return (file_name.endswith('.crc') or # Checksums - file_name.endswith('_$folder$') or # HDFS directories in S3 - file_name.startswith('.') or # Hidden files starting with . - file_name.startswith('_') or # Hidden files starting with _ - file_name in EXCLUDED_PARQUET_PATHS) - - def _visit_directories(self, level, directories, part_keys): - futures_list = [] - for path in directories: - head, tail = _path_split(path, self.pathsep) - name, key = _parse_hive_partition(tail) - - index = self.partitions.get_index(level, name, key) - dir_part_keys = part_keys + [(name, index)] - # If you have less threads than levels, the wait call will block - # indefinitely due to multiple waits within a thread. 
- if level < self._metadata_nthreads: - future = self._thread_pool.submit(self._visit_level, - level + 1, - path, - dir_part_keys) - futures_list.append(future) - else: - self._visit_level(level + 1, path, dir_part_keys) - if futures_list: - futures.wait(futures_list) - - def _parse_partition(self, dirname): - if self.partition_scheme == 'hive': - return _parse_hive_partition(dirname) - else: - raise NotImplementedError('partition schema: {}' - .format(self.partition_scheme)) - - def _push_pieces(self, files, part_keys): - self.pieces.extend([ - ParquetDatasetPiece._create(path, partition_keys=part_keys, - open_file_func=self.open_file_func) - for path in files - ]) - - -def _parse_hive_partition(value): - if '=' not in value: - raise ValueError('Directory name did not appear to be a ' - 'partition: {}'.format(value)) - return value.split('=', 1) - - -def _is_private_directory(x): - _, tail = os.path.split(x) - return (tail.startswith('_') or tail.startswith('.')) and '=' not in tail - - -def _path_split(path, sep): - i = path.rfind(sep) + 1 - head, tail = path[:i], path[i:] - head = head.rstrip(sep) - return head, tail - - EXCLUDED_PARQUET_PATHS = {'_SUCCESS'} -class _ParquetDatasetMetadata: - __slots__ = ('fs', 'memory_map', 'read_dictionary', 'common_metadata', - 'buffer_size') - - -def _open_dataset_file(dataset, path, meta=None): - if (dataset.fs is not None and - not isinstance(dataset.fs, legacyfs.LocalFileSystem)): - path = dataset.fs.open(path, mode='rb') - return ParquetFile( - path, - metadata=meta, - memory_map=dataset.memory_map, - read_dictionary=dataset.read_dictionary, - common_metadata=dataset.common_metadata, - buffer_size=dataset.buffer_size +def _is_local_file_system(fs): + return isinstance(fs, LocalFileSystem) or isinstance( + fs, legacyfs.LocalFileSystem ) -_DEPR_MSG = ( - "'{}' attribute is deprecated as of pyarrow 5.0.0 and will be removed " - "in a future version.{}" -) - - _read_docstring_common = """\ read_dictionary : list, default None List of names or column paths (for nested types) to read directly @@ -1680,6 +1152,7 @@ def _open_dataset_file(dataset, path, meta=None): you need to specify the field names or a full schema. See the ``pyarrow.dataset.partitioning()`` function for more details.""" + _parquet_dataset_example = """\ Generate an example PyArrow Table and write it to a partitioned dataset: @@ -1688,15 +1161,13 @@ def _open_dataset_file(dataset, path, meta=None): ... 'n_legs': [2, 2, 4, 4, 5, 100], ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", ... "Brittle stars", "Centipede"]}) - >>> import pyarrow.parquet as pq ->>> pq.write_to_dataset(table, root_path='dataset_name', -... partition_cols=['year'], -... use_legacy_dataset=False) +>>> pq.write_to_dataset(table, root_path='dataset_v2', +... partition_cols=['year']) create a ParquetDataset object from the dataset source: ->>> dataset = pq.ParquetDataset('dataset_name/', use_legacy_dataset=False) +>>> dataset = pq.ParquetDataset('dataset_v2/') and read the data: @@ -1711,7 +1182,7 @@ def _open_dataset_file(dataset, path, meta=None): create a ParquetDataset object with filter: ->>> dataset = pq.ParquetDataset('dataset_name/', use_legacy_dataset=False, +>>> dataset = pq.ParquetDataset('dataset_v2/', ... 
filters=[('n_legs','=',4)]) >>> dataset.read().to_pandas() n_legs animal year @@ -1721,7 +1192,6 @@ def _open_dataset_file(dataset, path, meta=None): class ParquetDataset: - __doc__ = """ Encapsulates details of reading a complete Parquet dataset possibly consisting of multiple files and partitions in subdirectories. @@ -1735,39 +1205,26 @@ class ParquetDataset: Path will try to be found in the local on-disk filesystem otherwise it will be parsed as an URI to determine the filesystem. schema : pyarrow.parquet.Schema - Use schema obtained elsewhere to validate file schemas. Alternative to - metadata parameter. -metadata : pyarrow.parquet.FileMetaData - Use metadata obtained elsewhere to validate file schemas. -split_row_groups : bool, default False - Divide files into pieces for each row group in the file. -validate_schema : bool, default True - Check that individual file schemas are all the same / compatible. + Optionally provide the Schema for the Dataset, in which case it will + not be inferred from the source. filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None Rows which do not match the filter predicate will be removed from scanned data. Partition keys embedded in a nested directory structure will be exploited to avoid loading files at all if they contain no matching rows. - If `use_legacy_dataset` is True, filters can only reference partition - keys and only a hive-style directory structure is supported. When - setting `use_legacy_dataset` to False, also within-file level filtering - and different partitioning schemes are supported. + Within-file level filtering and different partitioning schemes are supported. {1} -metadata_nthreads : int, default 1 - How many threads to allow the thread pool which is used to read the - dataset metadata. Increasing this is helpful to read partitioned - datasets. {0} -use_legacy_dataset : bool, default False - Set to False to enable the new code path (using the - new Arrow Dataset API). Among other things, this allows to pass - `filters` for all columns and not only the partition keys, enables - different partitioning schemes, etc. +ignore_prefixes : list, optional + Files matching any of these prefixes will be ignored by the + discovery process. + This is matched to the basename of a path. + By default this is ['.', '_']. + Note that discovery happens only if a directory is passed as source. pre_buffer : bool, default True Coalesce and issue file reads in parallel to improve performance on high-latency filesystems (e.g. S3, GCS). If True, Arrow will use a - background I/O thread pool. This option is only supported for - use_legacy_dataset=False. If using a filesystem layer that itself + background I/O thread pool. If using a filesystem layer that itself performs readahead (e.g. fsspec's S3FS), disable readahead for best results. Set to False if you want to prioritize minimal memory usage over maximum speed. @@ -1775,6 +1232,10 @@ class ParquetDataset: Cast timestamps that are stored in INT96 format to a particular resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96 timestamps will be inferred as timestamps in nanoseconds. +decryption_properties : FileDecryptionProperties or None + File-level decryption properties. + The decryption properties can be created using + ``CryptoFactory.file_decryption_properties()``. thrift_string_size_limit : int, default None If not None, override the maximum total string size allocated when decoding Thrift structures. 
The default limit should be @@ -1785,739 +1246,95 @@ class ParquetDataset: sufficient for most Parquet files. page_checksum_verification : bool, default False If True, verify the page checksum for each page read from the file. +use_legacy_dataset : bool, optional + Deprecated and has no effect from PyArrow version 15.0.0. Examples -------- {2} """.format(_read_docstring_common, _DNF_filter_doc, _parquet_dataset_example) - def __new__(cls, path_or_paths=None, filesystem=None, schema=None, - metadata=None, split_row_groups=False, validate_schema=True, - filters=None, metadata_nthreads=None, read_dictionary=None, - memory_map=False, buffer_size=0, partitioning="hive", - use_legacy_dataset=None, pre_buffer=True, - coerce_int96_timestamp_unit=None, - thrift_string_size_limit=None, - thrift_container_size_limit=None, - page_checksum_verification=False): - - extra_msg = "" - if use_legacy_dataset is None: - # if an old filesystem is passed -> still use to old implementation - if isinstance(filesystem, legacyfs.FileSystem): - use_legacy_dataset = True - extra_msg = ( - " The legacy behaviour was still chosen because a " - "deprecated 'pyarrow.filesystem' filesystem was specified " - "(use the filesystems from pyarrow.fs instead)." - ) - # otherwise the default is already False - else: - use_legacy_dataset = False - - if not use_legacy_dataset: - return _ParquetDatasetV2( - path_or_paths, filesystem=filesystem, - filters=filters, - partitioning=partitioning, - read_dictionary=read_dictionary, - memory_map=memory_map, - buffer_size=buffer_size, - pre_buffer=pre_buffer, - coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, - # unsupported keywords - schema=schema, metadata=metadata, - split_row_groups=split_row_groups, - validate_schema=validate_schema, - metadata_nthreads=metadata_nthreads, - thrift_string_size_limit=thrift_string_size_limit, - thrift_container_size_limit=thrift_container_size_limit, - page_checksum_verification=page_checksum_verification, - ) - warnings.warn( - "Passing 'use_legacy_dataset=True' to get the legacy behaviour is " - "deprecated as of pyarrow 11.0.0, and the legacy implementation " - "will be removed in a future version." 
+ extra_msg, - FutureWarning, stacklevel=2) - self = object.__new__(cls) - return self - - def __init__(self, path_or_paths, filesystem=None, schema=None, - metadata=None, split_row_groups=False, validate_schema=True, - filters=None, metadata_nthreads=None, read_dictionary=None, - memory_map=False, buffer_size=0, partitioning="hive", - use_legacy_dataset=None, pre_buffer=True, + def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None, + read_dictionary=None, memory_map=False, buffer_size=None, + partitioning="hive", ignore_prefixes=None, pre_buffer=True, coerce_int96_timestamp_unit=None, - thrift_string_size_limit=None, + decryption_properties=None, thrift_string_size_limit=None, thrift_container_size_limit=None, - page_checksum_verification=False): - if partitioning != "hive": - raise ValueError( - 'Only "hive" for hive-like partitioning is supported when ' - 'using use_legacy_dataset=True') - if metadata_nthreads is not None: - warnings.warn( - "Specifying the 'metadata_nthreads' argument is deprecated as " - "of pyarrow 8.0.0, and the argument will be removed in a " - "future version", - FutureWarning, stacklevel=2, - ) - else: - metadata_nthreads = 1 - - self._ds_metadata = _ParquetDatasetMetadata() - a_path = path_or_paths - if isinstance(a_path, list): - a_path = a_path[0] - - self._ds_metadata.fs, _ = _get_filesystem_and_path(filesystem, a_path) - if isinstance(path_or_paths, list): - self.paths = [_parse_uri(path) for path in path_or_paths] - else: - self.paths = _parse_uri(path_or_paths) - - self._ds_metadata.read_dictionary = read_dictionary - self._ds_metadata.memory_map = memory_map - self._ds_metadata.buffer_size = buffer_size - - (self._pieces, - self._partitions, - self._common_metadata_path, - self._metadata_path) = _make_manifest( - path_or_paths, self._fs, metadata_nthreads=metadata_nthreads, - open_file_func=partial(_open_dataset_file, self._ds_metadata) - ) - - if self._common_metadata_path is not None: - with self._fs.open(self._common_metadata_path) as f: - self._ds_metadata.common_metadata = read_metadata( - f, - memory_map=memory_map - ) - else: - self._ds_metadata.common_metadata = None + page_checksum_verification=False, + use_legacy_dataset=None): - if metadata is not None: + if use_legacy_dataset is not None: warnings.warn( - "Specifying the 'metadata' argument with 'use_legacy_dataset=" - "True' is deprecated as of pyarrow 8.0.0.", + "Passing 'use_legacy_dataset' is deprecated as of pyarrow 15.0.0 " + "and will be removed in a future version.", FutureWarning, stacklevel=2) - if metadata is None and self._metadata_path is not None: - with self._fs.open(self._metadata_path) as f: - self._metadata = read_metadata(f, memory_map=memory_map) - else: - self._metadata = metadata - - if schema is not None: - warnings.warn( - "Specifying the 'schema' argument with 'use_legacy_dataset=" - "True' is deprecated as of pyarrow 8.0.0. 
You can still " - "specify it in combination with 'use_legacy_dataset=False', " - "but in that case you need to specify a pyarrow.Schema " - "instead of a ParquetSchema.", - FutureWarning, stacklevel=2) - self._schema = schema + import pyarrow.dataset as ds - self.split_row_groups = split_row_groups + # map format arguments + read_options = { + "pre_buffer": pre_buffer, + "coerce_int96_timestamp_unit": coerce_int96_timestamp_unit, + "thrift_string_size_limit": thrift_string_size_limit, + "thrift_container_size_limit": thrift_container_size_limit, + "page_checksum_verification": page_checksum_verification, + } + if buffer_size: + read_options.update(use_buffered_stream=True, + buffer_size=buffer_size) + if read_dictionary is not None: + read_options.update(dictionary_columns=read_dictionary) - if split_row_groups: - raise NotImplementedError("split_row_groups not yet implemented") + if decryption_properties is not None: + read_options.update(decryption_properties=decryption_properties) + self._filter_expression = None if filters is not None: - if hasattr(filters, "cast"): - raise TypeError( - "Expressions as filter not supported for legacy dataset") - filters = _check_filters(filters) - self._filter(filters) - - if validate_schema: - self.validate_schemas() - - def __getnewargs_ex__(self): - # when creating a new instance while unpickling, force to use the - # legacy code path to create a ParquetDataset instance - # instead of a _ParquetDatasetV2 instance - return ((), dict(use_legacy_dataset=True)) - - def equals(self, other): - if not isinstance(other, ParquetDataset): - raise TypeError('`other` must be an instance of ParquetDataset') + self._filter_expression = filters_to_expression(filters) - if self._fs.__class__ != other._fs.__class__: - return False - for prop in ('paths', '_pieces', '_partitions', - '_common_metadata_path', '_metadata_path', - '_common_metadata', '_metadata', '_schema', - 'split_row_groups'): - if getattr(self, prop) != getattr(other, prop): - return False - for prop in ('memory_map', 'buffer_size'): - if ( - getattr(self._ds_metadata, prop) != - getattr(other._ds_metadata, prop) - ): - return False - - return True + # map old filesystems to new one + if filesystem is not None: + filesystem = _ensure_filesystem( + filesystem, use_mmap=memory_map) + elif filesystem is None and memory_map: + # if memory_map is specified, assume local file system (string + # path can in principle be URI for any filesystem) + filesystem = LocalFileSystem(use_mmap=memory_map) - def __eq__(self, other): - try: - return self.equals(other) - except TypeError: - return NotImplemented + # This needs to be checked after _ensure_filesystem, because that + # handles the case of an fsspec LocalFileSystem + if ( + hasattr(path_or_paths, "__fspath__") and + filesystem is not None and + not _is_local_file_system(filesystem) + ): + raise TypeError( + "Path-like objects with __fspath__ must only be used with " + f"local file systems, not {type(filesystem)}" + ) - def validate_schemas(self): - if self._metadata is None and self._schema is None: - if self._common_metadata is not None: - self._schema = self._common_metadata.schema + # check for single fragment dataset + single_file = None + self._base_dir = None + if not isinstance(path_or_paths, list): + if _is_path_like(path_or_paths): + path_or_paths = _stringify_path(path_or_paths) + if filesystem is None: + # path might be a URI describing the FileSystem as well + try: + filesystem, path_or_paths = FileSystem.from_uri( + path_or_paths) + except 
ValueError: + filesystem = LocalFileSystem(use_mmap=memory_map) + finfo = filesystem.get_file_info(path_or_paths) + if finfo.is_file: + single_file = path_or_paths + if finfo.type == FileType.Directory: + self._base_dir = path_or_paths else: - self._schema = self._pieces[0].get_metadata().schema - elif self._schema is None: - self._schema = self._metadata.schema - - # Verify schemas are all compatible - dataset_schema = self._schema.to_arrow_schema() - # Exclude the partition columns from the schema, they are provided - # by the path, not the DatasetPiece - if self._partitions is not None: - for partition_name in self._partitions.partition_names: - if dataset_schema.get_field_index(partition_name) != -1: - field_idx = dataset_schema.get_field_index(partition_name) - dataset_schema = dataset_schema.remove(field_idx) - - for piece in self._pieces: - file_metadata = piece.get_metadata() - file_schema = file_metadata.schema.to_arrow_schema() - if not dataset_schema.equals(file_schema, check_metadata=False): - raise ValueError('Schema in {!s} was different. \n' - '{!s}\n\nvs\n\n{!s}' - .format(piece, file_schema, - dataset_schema)) + single_file = path_or_paths - def read(self, columns=None, use_threads=True, use_pandas_metadata=False): - """ - Read multiple Parquet files as a single pyarrow.Table. - - Parameters - ---------- - columns : List[str] - Names of columns to read from the file. - use_threads : bool, default True - Perform multi-threaded column reads - use_pandas_metadata : bool, default False - Passed through to each dataset piece. - - Returns - ------- - pyarrow.Table - Content of the file as a table (of columns). - - Examples - -------- - Generate an example dataset: - - >>> import pyarrow as pa - >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], - ... 'n_legs': [2, 2, 4, 4, 5, 100], - ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", - ... "Brittle stars", "Centipede"]}) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path='dataset_name_read', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_name_read/', - ... use_legacy_dataset=False) - - Read multiple Parquet files as a single pyarrow.Table: - - >>> dataset.read(columns=["n_legs"]) - pyarrow.Table - n_legs: int64 - ---- - n_legs: [[5],[2],[4,100],[2,4]] - """ - tables = [] - for piece in self._pieces: - table = piece.read(columns=columns, - use_threads=use_threads, - partitions=self._partitions, - use_pandas_metadata=use_pandas_metadata) - tables.append(table) - - all_data = lib.concat_tables(tables) - - if use_pandas_metadata: - # We need to ensure that this metadata is set in the Table's schema - # so that Table.to_pandas will construct pandas.DataFrame with the - # right index - common_metadata = self._get_common_pandas_metadata() - current_metadata = all_data.schema.metadata or {} - - if common_metadata and b'pandas' not in current_metadata: - all_data = all_data.replace_schema_metadata({ - b'pandas': common_metadata}) - - return all_data - - def read_pandas(self, **kwargs): - """ - Read dataset including pandas metadata, if any. Other arguments passed - through to ParquetDataset.read, see docstring for further details. - - Parameters - ---------- - **kwargs : optional - All additional options to pass to the reader. - - Returns - ------- - pyarrow.Table - Content of the file as a table (of columns). 
- - Examples - -------- - Generate an example PyArrow Table and write it to a partitioned - dataset: - - >>> import pyarrow as pa - >>> import pandas as pd - >>> df = pd.DataFrame({'year': [2020, 2022, 2021, 2022, 2019, 2021], - ... 'n_legs': [2, 2, 4, 4, 5, 100], - ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", - ... "Brittle stars", "Centipede"]}) - >>> table = pa.Table.from_pandas(df) - >>> import pyarrow.parquet as pq - >>> pq.write_table(table, 'table.parquet') - >>> dataset = pq.ParquetDataset('table.parquet', - ... use_legacy_dataset=False) - - Read dataset including pandas metadata: - - >>> dataset.read_pandas(columns=["n_legs"]) - pyarrow.Table - n_legs: int64 - ---- - n_legs: [[2,2,4,4,5,100]] - - Select pandas metadata: - - >>> dataset.read_pandas(columns=["n_legs"]).schema.pandas_metadata - {'index_columns': [{'kind': 'range', 'name': None, 'start': 0, ...} - """ - return self.read(use_pandas_metadata=True, **kwargs) - - def _get_common_pandas_metadata(self): - if self._common_metadata is None: - return None - - keyvalues = self._common_metadata.metadata - return keyvalues.get(b'pandas', None) - - def _filter(self, filters): - accepts_filter = self._partitions.filter_accepts_partition - - def one_filter_accepts(piece, filter): - return all(accepts_filter(part_key, filter, level) - for level, part_key in enumerate(piece.partition_keys)) - - def all_filters_accept(piece): - return any(all(one_filter_accepts(piece, f) for f in conjunction) - for conjunction in filters) - - self._pieces = [p for p in self._pieces if all_filters_accept(p)] - - @property - def pieces(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format( - "ParquetDataset.pieces", - " Specify 'use_legacy_dataset=False' while constructing the " - "ParquetDataset, and then use the '.fragments' attribute " - "instead."), - FutureWarning, stacklevel=2) - return self._pieces - - @property - def partitions(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format( - "ParquetDataset.partitions", - " Specify 'use_legacy_dataset=False' while constructing the " - "ParquetDataset, and then use the '.partitioning' attribute " - "instead."), - FutureWarning, stacklevel=2) - return self._partitions - - @property - def schema(self): - warnings.warn( - _DEPR_MSG.format( - "ParquetDataset.schema", - " Specify 'use_legacy_dataset=False' while constructing the " - "ParquetDataset, and then use the '.schema' attribute " - "instead (which will return an Arrow schema instead of a " - "Parquet schema)."), - FutureWarning, stacklevel=2) - return self._schema - - @property - def memory_map(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.memory_map", ""), - FutureWarning, stacklevel=2) - return self._ds_metadata.memory_map - - @property - def read_dictionary(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.read_dictionary", ""), - FutureWarning, stacklevel=2) - return self._ds_metadata.read_dictionary - - @property - def buffer_size(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.buffer_size", ""), - FutureWarning, stacklevel=2) - return self._ds_metadata.buffer_size - - _fs = property( - operator.attrgetter('_ds_metadata.fs') - ) - - @property - def fs(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format( - "ParquetDataset.fs", - " Specify 'use_legacy_dataset=False' while constructing the " - "ParquetDataset, and then use the '.filesystem' attribute " - "instead."), - FutureWarning, 
stacklevel=2) - return self._ds_metadata.fs - - @property - def metadata(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.metadata", ""), - FutureWarning, stacklevel=2) - return self._metadata - - @property - def metadata_path(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.metadata_path", ""), - FutureWarning, stacklevel=2) - return self._metadata_path - - @property - def common_metadata_path(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.common_metadata_path", ""), - FutureWarning, stacklevel=2) - return self._common_metadata_path - - _common_metadata = property( - operator.attrgetter('_ds_metadata.common_metadata') - ) - - @property - def common_metadata(self): - """ - DEPRECATED - """ - warnings.warn( - _DEPR_MSG.format("ParquetDataset.common_metadata", ""), - FutureWarning, stacklevel=2) - return self._ds_metadata.common_metadata - - @property - def fragments(self): - """ - A list of the Dataset source fragments or pieces with absolute - file paths. To use this property set 'use_legacy_dataset=False' - while constructing ParquetDataset object. - - Examples - -------- - Generate an example dataset: - - >>> import pyarrow as pa - >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], - ... 'n_legs': [2, 2, 4, 4, 5, 100], - ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", - ... "Brittle stars", "Centipede"]}) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path='dataset_name_fragments', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_name_fragments/', - ... use_legacy_dataset=False) - - List the fragments: - - >>> dataset.fragments - [>> import pyarrow as pa - >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], - ... 'n_legs': [2, 2, 4, 4, 5, 100], - ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", - ... "Brittle stars", "Centipede"]}) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path='dataset_name_files', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_name_files/', - ... use_legacy_dataset=False) - - List the files: - - >>> dataset.files - ['dataset_name_files/year=2019/...-0.parquet', ... - """ - raise NotImplementedError( - "To use this property set 'use_legacy_dataset=False' while " - "constructing the ParquetDataset") - - @property - def filesystem(self): - """ - The filesystem type of the Dataset source. - To use this property set 'use_legacy_dataset=False' - while constructing ParquetDataset object. - """ - raise NotImplementedError( - "To use this property set 'use_legacy_dataset=False' while " - "constructing the ParquetDataset") - - @property - def partitioning(self): - """ - The partitioning of the Dataset source, if discovered. - To use this property set 'use_legacy_dataset=False' - while constructing ParquetDataset object. 
- """ - raise NotImplementedError( - "To use this property set 'use_legacy_dataset=False' while " - "constructing the ParquetDataset") - - -def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1, - open_file_func=None): - partitions = None - common_metadata_path = None - metadata_path = None - - if isinstance(path_or_paths, list) and len(path_or_paths) == 1: - # Dask passes a directory as a list of length 1 - path_or_paths = path_or_paths[0] - - if _is_path_like(path_or_paths) and fs.isdir(path_or_paths): - manifest = ParquetManifest(path_or_paths, filesystem=fs, - open_file_func=open_file_func, - pathsep=getattr(fs, "pathsep", "/"), - metadata_nthreads=metadata_nthreads) - common_metadata_path = manifest.common_metadata_path - metadata_path = manifest.metadata_path - pieces = manifest.pieces - partitions = manifest.partitions - else: - if not isinstance(path_or_paths, list): - path_or_paths = [path_or_paths] - - # List of paths - if len(path_or_paths) == 0: - raise ValueError('Must pass at least one file path') - - pieces = [] - for path in path_or_paths: - if not fs.isfile(path): - raise OSError('Passed non-file path: {}' - .format(path)) - piece = ParquetDatasetPiece._create( - path, open_file_func=open_file_func) - pieces.append(piece) - - return pieces, partitions, common_metadata_path, metadata_path - - -def _is_local_file_system(fs): - return isinstance(fs, LocalFileSystem) or isinstance( - fs, legacyfs.LocalFileSystem - ) - - -class _ParquetDatasetV2: - """ - ParquetDataset shim using the Dataset API under the hood. - - Examples - -------- - Generate an example PyArrow Table and write it to a partitioned dataset: - - >>> import pyarrow as pa - >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021], - ... 'n_legs': [2, 2, 4, 4, 5, 100], - ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse", - ... "Brittle stars", "Centipede"]}) - >>> import pyarrow.parquet as pq - >>> pq.write_to_dataset(table, root_path='dataset_v2', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - - create a ParquetDataset object from the dataset source: - - >>> dataset = pq.ParquetDataset('dataset_v2/', use_legacy_dataset=False) - - and read the data: - - >>> dataset.read().to_pandas() - n_legs animal year - 0 5 Brittle stars 2019 - 1 2 Flamingo 2020 - 2 4 Dog 2021 - 3 100 Centipede 2021 - 4 2 Parrot 2022 - 5 4 Horse 2022 - - create a ParquetDataset object with filter: - - >>> dataset = pq.ParquetDataset('dataset_v2/', - ... filters=[('n_legs','=',4)], - ... 
use_legacy_dataset=False) - >>> dataset.read().to_pandas() - n_legs animal year - 0 4 Dog 2021 - 1 4 Horse 2022 - """ - - def __init__(self, path_or_paths, filesystem=None, *, filters=None, - partitioning="hive", read_dictionary=None, buffer_size=None, - memory_map=False, ignore_prefixes=None, pre_buffer=True, - coerce_int96_timestamp_unit=None, schema=None, - decryption_properties=None, thrift_string_size_limit=None, - thrift_container_size_limit=None, - page_checksum_verification=False, - **kwargs): - import pyarrow.dataset as ds - - # Raise error for not supported keywords - for keyword, default in [ - ("metadata", None), ("split_row_groups", False), - ("validate_schema", True), ("metadata_nthreads", None)]: - if keyword in kwargs and kwargs[keyword] is not default: - raise ValueError( - "Keyword '{0}' is not yet supported with the new " - "Dataset API".format(keyword)) - - # map format arguments - read_options = { - "pre_buffer": pre_buffer, - "coerce_int96_timestamp_unit": coerce_int96_timestamp_unit, - "thrift_string_size_limit": thrift_string_size_limit, - "thrift_container_size_limit": thrift_container_size_limit, - "page_checksum_verification": page_checksum_verification, - } - if buffer_size: - read_options.update(use_buffered_stream=True, - buffer_size=buffer_size) - if read_dictionary is not None: - read_options.update(dictionary_columns=read_dictionary) - - if decryption_properties is not None: - read_options.update(decryption_properties=decryption_properties) - - self._filter_expression = None - if filters is not None: - self._filter_expression = filters_to_expression(filters) - - # map old filesystems to new one - if filesystem is not None: - filesystem = _ensure_filesystem( - filesystem, use_mmap=memory_map) - elif filesystem is None and memory_map: - # if memory_map is specified, assume local file system (string - # path can in principle be URI for any filesystem) - filesystem = LocalFileSystem(use_mmap=memory_map) - - # This needs to be checked after _ensure_filesystem, because that - # handles the case of an fsspec LocalFileSystem - if ( - hasattr(path_or_paths, "__fspath__") and - filesystem is not None and - not _is_local_file_system(filesystem) - ): - raise TypeError( - "Path-like objects with __fspath__ must only be used with " - f"local file systems, not {type(filesystem)}" - ) - - # check for single fragment dataset - single_file = None - self._base_dir = None - if not isinstance(path_or_paths, list): - if _is_path_like(path_or_paths): - path_or_paths = _stringify_path(path_or_paths) - if filesystem is None: - # path might be a URI describing the FileSystem as well - try: - filesystem, path_or_paths = FileSystem.from_uri( - path_or_paths) - except ValueError: - filesystem = LocalFileSystem(use_mmap=memory_map) - finfo = filesystem.get_file_info(path_or_paths) - if finfo.is_file: - single_file = path_or_paths - if finfo.type == FileType.Directory: - self._base_dir = path_or_paths - else: - single_file = path_or_paths - - parquet_format = ds.ParquetFileFormat(**read_options) + parquet_format = ds.ParquetFileFormat(**read_options) if single_file is not None: fragment = parquet_format.make_fragment(single_file, filesystem) @@ -2540,12 +1357,7 @@ def __init__(self, path_or_paths, filesystem=None, *, filters=None, ignore_prefixes=ignore_prefixes) def equals(self, other): - if isinstance(other, ParquetDataset): - raise TypeError( - "`other` must be an instance of ParquetDataset constructed " - "with `use_legacy_dataset=False`" - ) - if not isinstance(other, 
_ParquetDatasetV2): + if not isinstance(other, ParquetDataset): raise TypeError('`other` must be an instance of ParquetDataset') return (self.schema == other.schema and @@ -2576,10 +1388,8 @@ def schema(self): ... "Brittle stars", "Centipede"]}) >>> import pyarrow.parquet as pq >>> pq.write_to_dataset(table, root_path='dataset_v2_schema', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_v2_schema/', - ... use_legacy_dataset=False) + ... partition_cols=['year']) + >>> dataset = pq.ParquetDataset('dataset_v2_schema/') Read the schema: @@ -2598,8 +1408,7 @@ def read(self, columns=None, use_threads=True, use_pandas_metadata=False): ---------- columns : List[str] Names of columns to read from the dataset. The partition fields - are not automatically included (in contrast to when setting - ``use_legacy_dataset=True``). + are not automatically included. use_threads : bool, default True Perform multi-threaded column reads. use_pandas_metadata : bool, default False @@ -2622,10 +1431,8 @@ def read(self, columns=None, use_threads=True, use_pandas_metadata=False): ... "Brittle stars", "Centipede"]}) >>> import pyarrow.parquet as pq >>> pq.write_to_dataset(table, root_path='dataset_v2_read', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_v2_read/', - ... use_legacy_dataset=False) + ... partition_cols=['year']) + >>> dataset = pq.ParquetDataset('dataset_v2_read/') Read the dataset: @@ -2694,7 +1501,12 @@ def _get_common_pandas_metadata(self): def read_pandas(self, **kwargs): """ Read dataset including pandas metadata, if any. Other arguments passed - through to ParquetDataset.read, see docstring for further details. + through to :func:`read`, see docstring for further details. + + Parameters + ---------- + **kwargs : optional + Additional options for :func:`read` Examples -------- @@ -2709,8 +1521,7 @@ def read_pandas(self, **kwargs): >>> table = pa.Table.from_pandas(df) >>> import pyarrow.parquet as pq >>> pq.write_table(table, 'table_V2.parquet') - >>> dataset = pq.ParquetDataset('table_V2.parquet', - ... use_legacy_dataset=False) + >>> dataset = pq.ParquetDataset('table_V2.parquet') Read the dataset with pandas metadata: @@ -2725,14 +1536,6 @@ def read_pandas(self, **kwargs): """ return self.read(use_pandas_metadata=True, **kwargs) - @property - def pieces(self): - warnings.warn( - _DEPR_MSG.format("ParquetDataset.pieces", - " Use the '.fragments' attribute instead"), - FutureWarning, stacklevel=2) - return list(self._dataset.get_fragments()) - @property def fragments(self): """ @@ -2750,10 +1553,8 @@ def fragments(self): ... "Brittle stars", "Centipede"]}) >>> import pyarrow.parquet as pq >>> pq.write_to_dataset(table, root_path='dataset_v2_fragments', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_v2_fragments/', - ... use_legacy_dataset=False) + ... partition_cols=['year']) + >>> dataset = pq.ParquetDataset('dataset_v2_fragments/') List the fragments: @@ -2778,10 +1579,8 @@ def files(self): ... "Brittle stars", "Centipede"]}) >>> import pyarrow.parquet as pq >>> pq.write_to_dataset(table, root_path='dataset_v2_files', - ... partition_cols=['year'], - ... use_legacy_dataset=False) - >>> dataset = pq.ParquetDataset('dataset_v2_files/', - ... use_legacy_dataset=False) + ... partition_cols=['year']) + >>> dataset = pq.ParquetDataset('dataset_v2_files/') List the files: @@ -2822,8 +1621,6 @@ def partitioning(self): no columns. 
use_threads : bool, default True Perform multi-threaded column reads. -metadata : FileMetaData - If separately computed schema : Schema, optional Optionally provide the Schema for the parquet dataset, in which case it will not be inferred from the source. @@ -2836,30 +1633,21 @@ def partitioning(self): Rows which do not match the filter predicate will be removed from scanned data. Partition keys embedded in a nested directory structure will be exploited to avoid loading files at all if they contain no matching rows. - If `use_legacy_dataset` is True, filters can only reference partition - keys and only a hive-style directory structure is supported. When - setting `use_legacy_dataset` to False, also within-file level filtering - and different partitioning schemes are supported. + Within-file level filtering and different partitioning schemes are supported. {3} -use_legacy_dataset : bool, default False - By default, `read_table` uses the new Arrow Datasets API since - pyarrow 1.0.0. Among other things, this allows to pass `filters` - for all columns and not only the partition keys, enables - different partitioning schemes, etc. - Set to True to use the legacy behaviour (this option is deprecated, - and the legacy implementation will be removed in a future version). +use_legacy_dataset : bool, optional + Deprecated and has no effect from PyArrow version 15.0.0. ignore_prefixes : list, optional Files matching any of these prefixes will be ignored by the - discovery process if use_legacy_dataset=False. + discovery process. This is matched to the basename of a path. By default this is ['.', '_']. Note that discovery happens only if a directory is passed as source. pre_buffer : bool, default True Coalesce and issue file reads in parallel to improve performance on high-latency filesystems (e.g. S3). If True, Arrow will use a - background I/O thread pool. This option is only supported for - use_legacy_dataset=False. If using a filesystem layer that itself + background I/O thread pool. If using a filesystem layer that itself performs readahead (e.g. fsspec's S3FS), disable readahead for best results. 
coerce_int96_timestamp_unit : str, default None @@ -2968,129 +1756,78 @@ def partitioning(self): """ -def read_table(source, *, columns=None, use_threads=True, metadata=None, +def read_table(source, *, columns=None, use_threads=True, schema=None, use_pandas_metadata=False, read_dictionary=None, memory_map=False, buffer_size=0, partitioning="hive", - filesystem=None, filters=None, use_legacy_dataset=False, + filesystem=None, filters=None, use_legacy_dataset=None, ignore_prefixes=None, pre_buffer=True, coerce_int96_timestamp_unit=None, decryption_properties=None, thrift_string_size_limit=None, thrift_container_size_limit=None, page_checksum_verification=False): - if not use_legacy_dataset: - if metadata is not None: + + if use_legacy_dataset is not None: + warnings.warn( + "Passing 'use_legacy_dataset' is deprecated as of pyarrow 15.0.0 " + "and will be removed in a future version.", + FutureWarning, stacklevel=2) + + try: + dataset = ParquetDataset( + source, + schema=schema, + filesystem=filesystem, + partitioning=partitioning, + memory_map=memory_map, + read_dictionary=read_dictionary, + buffer_size=buffer_size, + filters=filters, + ignore_prefixes=ignore_prefixes, + pre_buffer=pre_buffer, + coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, + thrift_string_size_limit=thrift_string_size_limit, + thrift_container_size_limit=thrift_container_size_limit, + page_checksum_verification=page_checksum_verification, + ) + except ImportError: + # fall back on ParquetFile for simple cases when pyarrow.dataset + # module is not available + if filters is not None: raise ValueError( - "The 'metadata' keyword is no longer supported with the new " - "datasets-based implementation. Specify " - "'use_legacy_dataset=True' to temporarily recover the old " - "behaviour." 
- ) - try: - dataset = _ParquetDatasetV2( - source, - schema=schema, - filesystem=filesystem, - partitioning=partitioning, - memory_map=memory_map, - read_dictionary=read_dictionary, - buffer_size=buffer_size, - filters=filters, - ignore_prefixes=ignore_prefixes, - pre_buffer=pre_buffer, - coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, - thrift_string_size_limit=thrift_string_size_limit, - thrift_container_size_limit=thrift_container_size_limit, - page_checksum_verification=page_checksum_verification, + "the 'filters' keyword is not supported when the " + "pyarrow.dataset module is not available" ) - except ImportError: - # fall back on ParquetFile for simple cases when pyarrow.dataset - # module is not available - if filters is not None: - raise ValueError( - "the 'filters' keyword is not supported when the " - "pyarrow.dataset module is not available" - ) - if partitioning != "hive": - raise ValueError( - "the 'partitioning' keyword is not supported when the " - "pyarrow.dataset module is not available" - ) - if schema is not None: - raise ValueError( - "the 'schema' argument is not supported when the " - "pyarrow.dataset module is not available" - ) - filesystem, path = _resolve_filesystem_and_path(source, filesystem) - if filesystem is not None: - source = filesystem.open_input_file(path) - # TODO test that source is not a directory or a list - dataset = ParquetFile( - source, metadata=metadata, read_dictionary=read_dictionary, - memory_map=memory_map, buffer_size=buffer_size, - pre_buffer=pre_buffer, - coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, - decryption_properties=decryption_properties, - thrift_string_size_limit=thrift_string_size_limit, - thrift_container_size_limit=thrift_container_size_limit, - page_checksum_verification=page_checksum_verification, + if partitioning != "hive": + raise ValueError( + "the 'partitioning' keyword is not supported when the " + "pyarrow.dataset module is not available" ) - - return dataset.read(columns=columns, use_threads=use_threads, - use_pandas_metadata=use_pandas_metadata) - - warnings.warn( - "Passing 'use_legacy_dataset=True' to get the legacy behaviour is " - "deprecated as of pyarrow 8.0.0, and the legacy implementation will " - "be removed in a future version.", - FutureWarning, stacklevel=2) - - if ignore_prefixes is not None: - raise ValueError( - "The 'ignore_prefixes' keyword is only supported when " - "use_legacy_dataset=False") - - if page_checksum_verification: - raise ValueError( - "The 'page_checksum_verification' keyword is only supported when " - "use_legacy_dataset=False") - - if schema is not None: - raise ValueError( - "The 'schema' argument is only supported when " - "use_legacy_dataset=False") - - if _is_path_like(source): - with warnings.catch_warnings(): - # Suppress second warning from ParquetDataset constructor - warnings.filterwarnings( - "ignore", "Passing 'use_legacy_dataset", FutureWarning) - pf = ParquetDataset( - source, metadata=metadata, memory_map=memory_map, - read_dictionary=read_dictionary, - buffer_size=buffer_size, - filesystem=filesystem, filters=filters, - partitioning=partitioning, - coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, - use_legacy_dataset=True, + if schema is not None: + raise ValueError( + "the 'schema' argument is not supported when the " + "pyarrow.dataset module is not available" ) - else: - pf = ParquetFile( - source, metadata=metadata, - read_dictionary=read_dictionary, - memory_map=memory_map, - buffer_size=buffer_size, + filesystem, path = 
_resolve_filesystem_and_path(source, filesystem) + if filesystem is not None: + source = filesystem.open_input_file(path) + # TODO test that source is not a directory or a list + dataset = ParquetFile( + source, read_dictionary=read_dictionary, + memory_map=memory_map, buffer_size=buffer_size, + pre_buffer=pre_buffer, coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, - decryption_properties=decryption_properties + decryption_properties=decryption_properties, + thrift_string_size_limit=thrift_string_size_limit, + thrift_container_size_limit=thrift_container_size_limit, + page_checksum_verification=page_checksum_verification, ) - return pf.read(columns=columns, use_threads=use_threads, - use_pandas_metadata=use_pandas_metadata) + return dataset.read(columns=columns, use_threads=use_threads, + use_pandas_metadata=use_pandas_metadata) -read_table.__doc__ = _read_table_docstring.format( - """Read a Table from Parquet format -Note: starting with pyarrow 1.0, the default for `use_legacy_dataset` is -switched to False.""", +read_table.__doc__ = _read_table_docstring.format( + """Read a Table from Parquet format""", "\n".join(("""use_pandas_metadata : bool, default False If True and file has custom pandas schema metadata, ensure that index columns are also loaded.""", _read_docstring_common)), @@ -3233,23 +1970,13 @@ def write_table(table, where, row_group_size=None, version='2.6', """.format(_parquet_writer_arg_docs, _write_table_example) -def _mkdir_if_not_exists(fs, path): - if fs._isfilestore() and not fs.exists(path): - try: - fs.mkdir(path) - except OSError: - assert fs.exists(path) - - def write_to_dataset(table, root_path, partition_cols=None, - partition_filename_cb=None, filesystem=None, - use_legacy_dataset=None, schema=None, - partitioning=None, basename_template=None, - use_threads=None, file_visitor=None, - existing_data_behavior=None, + filesystem=None, use_legacy_dataset=None, + schema=None, partitioning=None, + basename_template=None, use_threads=None, + file_visitor=None, existing_data_behavior=None, **kwargs): - """Wrapper around dataset.write_dataset (when use_legacy_dataset=False) or - parquet.write_table (when use_legacy_dataset=True) for writing a Table to + """Wrapper around dataset.write_dataset for writing a Table to Parquet format by partitions. For each combination of partition columns and values, a subdirectories are created in the following @@ -3271,45 +1998,31 @@ def write_to_dataset(table, root_path, partition_cols=None, ---------- table : pyarrow.Table root_path : str, pathlib.Path - The root directory of the dataset + The root directory of the dataset. partition_cols : list, Column names by which to partition the dataset. - Columns are partitioned in the order they are given - partition_filename_cb : callable, - A callback function that takes the partition key(s) as an argument - and allow you to override the partition filename. If nothing is - passed, the filename will consist of a uuid. - This option is only supported for use_legacy_dataset=True. - When use_legacy_dataset=None and this option is specified, - use_legacy_dataset will be set to True. + Columns are partitioned in the order they are given. filesystem : FileSystem, default None If nothing passed, will be inferred based on path. Path will try to be found in the local on-disk filesystem otherwise it will be parsed as an URI to determine the filesystem. - use_legacy_dataset : bool - Default is False. 
Set to True to use the legacy behaviour - (this option is deprecated, and the legacy implementation will be - removed in a future version). The legacy implementation still - supports the `partition_filename_cb` keyword but is less efficient - when using partition columns. + use_legacy_dataset : bool, optional + Deprecated and has no effect from PyArrow version 15.0.0. schema : Schema, optional - This option is only supported for use_legacy_dataset=False. + This Schema of the dataset. partitioning : Partitioning or list[str], optional The partitioning scheme specified with the ``pyarrow.dataset.partitioning()`` function or a list of field names. When providing a list of field names, you can use ``partitioning_flavor`` to drive which partitioning type should be used. - This option is only supported for use_legacy_dataset=False. basename_template : str, optional A template string used to generate basenames of written data files. The token '{i}' will be replaced with an automatically incremented integer. If not specified, it defaults to "guid-{i}.parquet". - This option is only supported for use_legacy_dataset=False. use_threads : bool, default True Write files in parallel. If enabled, then maximum parallelism will be used determined by the number of available CPU cores. - This option is only supported for use_legacy_dataset=False. file_visitor : function If set, this function will be called with a WrittenFile instance for each file created during the call. This object will have both @@ -3330,7 +2043,6 @@ def write_to_dataset(table, root_path, partition_cols=None, def file_visitor(written_file): visited_paths.append(written_file.path) - This option is only supported for use_legacy_dataset=False. existing_data_behavior : 'overwrite_or_ignore' | 'error' | \ 'delete_matching' Controls how the dataset will handle data that already exists in @@ -3348,15 +2060,12 @@ def file_visitor(written_file): dataset. The first time each partition directory is encountered the entire directory will be deleted. This allows you to overwrite old partitions completely. - This option is only supported for use_legacy_dataset=False. **kwargs : dict, - When use_legacy_dataset=False, used as additional kwargs for - `dataset.write_dataset` function for matching kwargs, and remainder to - `ParquetFileFormat.make_write_options`. See the docstring - of `write_table` and `dataset.write_dataset` for the available options. - When use_legacy_dataset=True, used as additional kwargs for - `parquet.write_table` function (See docstring for `write_table` - or `ParquetWriter` for more information). + Used as additional kwargs for :func:`pyarrow.dataset.write_dataset` + function for matching kwargs, and remainder to + :func:`pyarrow.dataset.ParquetFileFormat.make_write_options`. + See the docstring of :func:`write_table` and + :func:`pyarrow.dataset.write_dataset` for the available options. Using `metadata_collector` in kwargs allows one to collect the file metadata instances of dataset pieces. The file paths in the ColumnChunkMetaData will be set relative to `root_path`. @@ -3376,194 +2085,79 @@ def file_visitor(written_file): >>> import pyarrow.parquet as pq >>> pq.write_to_dataset(table, root_path='dataset_name_3', ... partition_cols=['year']) - >>> pq.ParquetDataset('dataset_name_3', use_legacy_dataset=False).files + >>> pq.ParquetDataset('dataset_name_3').files ['dataset_name_3/year=2019/...-0.parquet', ... 
Write a single Parquet file into the root folder: >>> pq.write_to_dataset(table, root_path='dataset_name_4') - >>> pq.ParquetDataset('dataset_name_4/', use_legacy_dataset=False).files + >>> pq.ParquetDataset('dataset_name_4/').files ['dataset_name_4/...-0.parquet'] """ - # Choose the implementation - if use_legacy_dataset is None: - # if partition_filename_cb is specified -> - # default to the old implementation - if partition_filename_cb: - use_legacy_dataset = True - # otherwise the default is False - else: - use_legacy_dataset = False + if use_legacy_dataset is not None: + warnings.warn( + "Passing 'use_legacy_dataset' is deprecated as of pyarrow 15.0.0 " + "and will be removed in a future version.", + FutureWarning, stacklevel=2) + + metadata_collector = kwargs.pop('metadata_collector', None) # Check for conflicting keywords - msg_confl_0 = ( - "The '{0}' argument is not supported by use_legacy_dataset={2}. " - "Use only '{1}' instead." - ) - msg_confl_1 = ( - "The '{1}' argument is not supported by use_legacy_dataset={2}. " + msg_confl = ( + "The '{1}' argument is not supported. " "Use only '{0}' instead." ) - msg_confl = msg_confl_0 if use_legacy_dataset else msg_confl_1 - if partition_filename_cb is not None and basename_template is not None: - raise ValueError(msg_confl.format("basename_template", - "partition_filename_cb", - use_legacy_dataset)) - if partition_cols is not None and partitioning is not None: raise ValueError(msg_confl.format("partitioning", - "partition_cols", - use_legacy_dataset)) + "partition_cols")) - metadata_collector = kwargs.pop('metadata_collector', None) if metadata_collector is not None and file_visitor is not None: raise ValueError(msg_confl.format("file_visitor", - "metadata_collector", - use_legacy_dataset)) + "metadata_collector")) - # New dataset implementation - if not use_legacy_dataset: - import pyarrow.dataset as ds + import pyarrow.dataset as ds - # extract write_dataset specific options - # reset assumed to go to make_write_options - write_dataset_kwargs = dict() - for key in inspect.signature(ds.write_dataset).parameters: - if key in kwargs: - write_dataset_kwargs[key] = kwargs.pop(key) - write_dataset_kwargs['max_rows_per_group'] = kwargs.pop( - 'row_group_size', kwargs.pop("chunk_size", None) - ) - # raise for unsupported keywords - msg = ( - "The '{}' argument is not supported with the new dataset " - "implementation." 
- ) - if metadata_collector is not None: - def file_visitor(written_file): - metadata_collector.append(written_file.metadata) - if partition_filename_cb is not None: - raise ValueError(msg.format("partition_filename_cb")) + # extract write_dataset specific options + # reset assumed to go to make_write_options + write_dataset_kwargs = dict() + for key in inspect.signature(ds.write_dataset).parameters: + if key in kwargs: + write_dataset_kwargs[key] = kwargs.pop(key) + write_dataset_kwargs['max_rows_per_group'] = kwargs.pop( + 'row_group_size', kwargs.pop("chunk_size", None) + ) - # map format arguments - parquet_format = ds.ParquetFileFormat() - write_options = parquet_format.make_write_options(**kwargs) + if metadata_collector is not None: + def file_visitor(written_file): + metadata_collector.append(written_file.metadata) - # map old filesystems to new one - if filesystem is not None: - filesystem = _ensure_filesystem(filesystem) - - if partition_cols: - part_schema = table.select(partition_cols).schema - partitioning = ds.partitioning(part_schema, flavor="hive") - - if basename_template is None: - basename_template = guid() + '-{i}.parquet' - - if existing_data_behavior is None: - existing_data_behavior = 'overwrite_or_ignore' - - ds.write_dataset( - table, root_path, filesystem=filesystem, - format=parquet_format, file_options=write_options, schema=schema, - partitioning=partitioning, use_threads=use_threads, - file_visitor=file_visitor, - basename_template=basename_template, - existing_data_behavior=existing_data_behavior, - **write_dataset_kwargs) - return - - # warnings and errors when using legacy implementation - if use_legacy_dataset: - warnings.warn( - "Passing 'use_legacy_dataset=True' to get the legacy behaviour is " - "deprecated as of pyarrow 8.0.0, and the legacy implementation " - "will be removed in a future version.", - FutureWarning, stacklevel=2) - msg2 = ( - "The '{}' argument is not supported with the legacy " - "implementation. To use this argument specify " - "'use_legacy_dataset=False' while constructing the " - "ParquetDataset." - ) - if schema is not None: - raise ValueError(msg2.format("schema")) - if partitioning is not None: - raise ValueError(msg2.format("partitioning")) - if use_threads is not None: - raise ValueError(msg2.format("use_threads")) - if file_visitor is not None: - raise ValueError(msg2.format("file_visitor")) - if existing_data_behavior is not None: - raise ValueError(msg2.format("existing_data_behavior")) - if basename_template is not None: - raise ValueError(msg2.format("basename_template")) - if partition_filename_cb is not None: - warnings.warn( - _DEPR_MSG.format("partition_filename_cb", " Specify " - "'use_legacy_dataset=False' while constructing " - "the ParquetDataset, and then use the " - "'basename_template' parameter instead. 
For " - "usage see `pyarrow.dataset.write_dataset`"), - FutureWarning, stacklevel=2) + # map format arguments + parquet_format = ds.ParquetFileFormat() + write_options = parquet_format.make_write_options(**kwargs) - # Legacy implementation - fs, root_path = legacyfs.resolve_filesystem_and_path(root_path, filesystem) - - _mkdir_if_not_exists(fs, root_path) - - if partition_cols is not None and len(partition_cols) > 0: - df = table.to_pandas() - partition_keys = [df[col] for col in partition_cols] - data_df = df.drop(partition_cols, axis='columns') - data_cols = df.columns.drop(partition_cols) - if len(data_cols) == 0: - raise ValueError('No data left to save outside partition columns') - - subschema = table.schema - - # ARROW-2891: Ensure the output_schema is preserved when writing a - # partitioned dataset - for col in table.schema.names: - if col in partition_cols: - subschema = subschema.remove(subschema.get_field_index(col)) - - # ARROW-17829: avoid deprecation warnings for df.groupby - # https://github.com/pandas-dev/pandas/issues/42795 - if len(partition_keys) == 1: - partition_keys = partition_keys[0] - - for keys, subgroup in data_df.groupby(partition_keys, observed=True): - if not isinstance(keys, tuple): - keys = (keys,) - subdir = '/'.join( - ['{colname}={value}'.format(colname=name, value=val) - for name, val in zip(partition_cols, keys)]) - subtable = pa.Table.from_pandas(subgroup, schema=subschema, - safe=False) - _mkdir_if_not_exists(fs, '/'.join([root_path, subdir])) - if partition_filename_cb: - outfile = partition_filename_cb(keys) - else: - outfile = guid() + '.parquet' - relative_path = '/'.join([subdir, outfile]) - full_path = '/'.join([root_path, relative_path]) - with fs.open(full_path, 'wb') as f: - write_table(subtable, f, metadata_collector=metadata_collector, - **kwargs) - if metadata_collector is not None: - metadata_collector[-1].set_file_path(relative_path) - else: - if partition_filename_cb: - outfile = partition_filename_cb(None) - else: - outfile = guid() + '.parquet' - full_path = '/'.join([root_path, outfile]) - with fs.open(full_path, 'wb') as f: - write_table(table, f, metadata_collector=metadata_collector, - **kwargs) - if metadata_collector is not None: - metadata_collector[-1].set_file_path(outfile) + # map old filesystems to new one + if filesystem is not None: + filesystem = _ensure_filesystem(filesystem) + + if partition_cols: + part_schema = table.select(partition_cols).schema + partitioning = ds.partitioning(part_schema, flavor="hive") + + if basename_template is None: + basename_template = guid() + '-{i}.parquet' + + if existing_data_behavior is None: + existing_data_behavior = 'overwrite_or_ignore' + + ds.write_dataset( + table, root_path, filesystem=filesystem, + format=parquet_format, file_options=write_options, schema=schema, + partitioning=partitioning, use_threads=use_threads, + file_visitor=file_visitor, + basename_template=basename_template, + existing_data_behavior=existing_data_behavior, + **write_dataset_kwargs) + return def write_metadata(schema, where, metadata_collector=None, filesystem=None, @@ -3741,15 +2335,11 @@ def read_schema(where, memory_map=False, decryption_properties=None, "FileEncryptionProperties", "FileMetaData", "ParquetDataset", - "ParquetDatasetPiece", "ParquetFile", "ParquetLogicalType", - "ParquetManifest", - "ParquetPartitions", "ParquetReader", "ParquetSchema", "ParquetWriter", - "PartitionSet", "RowGroupMetaData", "SortingColumn", "Statistics", diff --git a/python/pyarrow/tests/parquet/__init__.py 
b/python/pyarrow/tests/parquet/__init__.py index 4c4e8240b8736..d08d67d2860f4 100644 --- a/python/pyarrow/tests/parquet/__init__.py +++ b/python/pyarrow/tests/parquet/__init__.py @@ -21,7 +21,4 @@ # Ignore these with pytest ... -m 'not parquet' pytestmark = [ pytest.mark.parquet, - pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning" - ), ] diff --git a/python/pyarrow/tests/parquet/common.py b/python/pyarrow/tests/parquet/common.py index 4401d3ca6bb75..8365ed5b28543 100644 --- a/python/pyarrow/tests/parquet/common.py +++ b/python/pyarrow/tests/parquet/common.py @@ -18,31 +18,10 @@ import io import numpy as np -import pytest import pyarrow as pa from pyarrow.tests import util -legacy_filter_mark = pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy:FutureWarning" -) - -parametrize_legacy_dataset = pytest.mark.parametrize( - "use_legacy_dataset", - [pytest.param(True, marks=legacy_filter_mark), - pytest.param(False, marks=pytest.mark.dataset)] -) -parametrize_legacy_dataset_not_supported = pytest.mark.parametrize( - "use_legacy_dataset", - [pytest.param(True, marks=legacy_filter_mark), - pytest.param(False, marks=pytest.mark.skip)] -) -parametrize_legacy_dataset_fixed = pytest.mark.parametrize( - "use_legacy_dataset", - [pytest.param(True, marks=[pytest.mark.xfail, legacy_filter_mark]), - pytest.param(False, marks=pytest.mark.dataset)] -) - def _write_table(table, path, **kwargs): # So we see the ImportError somewhere @@ -65,19 +44,18 @@ def _read_table(*args, **kwargs): def _roundtrip_table(table, read_table_kwargs=None, - write_table_kwargs=None, use_legacy_dataset=False): + write_table_kwargs=None): read_table_kwargs = read_table_kwargs or {} write_table_kwargs = write_table_kwargs or {} writer = pa.BufferOutputStream() _write_table(table, writer, **write_table_kwargs) reader = pa.BufferReader(writer.getvalue()) - return _read_table(reader, use_legacy_dataset=use_legacy_dataset, - **read_table_kwargs) + return _read_table(reader, **read_table_kwargs) def _check_roundtrip(table, expected=None, read_table_kwargs=None, - use_legacy_dataset=False, **write_table_kwargs): + **write_table_kwargs): if expected is None: expected = table @@ -85,20 +63,17 @@ def _check_roundtrip(table, expected=None, read_table_kwargs=None, # intentionally check twice result = _roundtrip_table(table, read_table_kwargs=read_table_kwargs, - write_table_kwargs=write_table_kwargs, - use_legacy_dataset=use_legacy_dataset) + write_table_kwargs=write_table_kwargs) assert result.equals(expected) result = _roundtrip_table(result, read_table_kwargs=read_table_kwargs, - write_table_kwargs=write_table_kwargs, - use_legacy_dataset=use_legacy_dataset) + write_table_kwargs=write_table_kwargs) assert result.equals(expected) -def _roundtrip_pandas_dataframe(df, write_kwargs, use_legacy_dataset=False): +def _roundtrip_pandas_dataframe(df, write_kwargs): table = pa.Table.from_pandas(df) result = _roundtrip_table( - table, write_table_kwargs=write_kwargs, - use_legacy_dataset=use_legacy_dataset) + table, write_table_kwargs=write_kwargs) return result.to_pandas() diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 83e6ebeb7a1fc..3c867776ac052 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -28,7 +28,6 @@ from pyarrow.filesystem import LocalFileSystem, FileSystem from pyarrow.tests import util from pyarrow.tests.parquet.common import (_check_roundtrip, _roundtrip_table, - 
parametrize_legacy_dataset, _test_dataframe) try: @@ -63,21 +62,18 @@ def test_parquet_invalid_version(tempdir): data_page_version="2.2") -@parametrize_legacy_dataset -def test_set_data_page_size(use_legacy_dataset): +def test_set_data_page_size(): arr = pa.array([1, 2, 3] * 100000) t = pa.Table.from_arrays([arr], names=['f0']) # 128K, 512K page_sizes = [2 << 16, 2 << 18] for target_page_size in page_sizes: - _check_roundtrip(t, data_page_size=target_page_size, - use_legacy_dataset=use_legacy_dataset) + _check_roundtrip(t, data_page_size=target_page_size) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_set_write_batch_size(use_legacy_dataset): +def test_set_write_batch_size(): df = _test_dataframe(100) table = pa.Table.from_pandas(df, preserve_index=False) @@ -87,8 +83,7 @@ def test_set_write_batch_size(use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_set_dictionary_pagesize_limit(use_legacy_dataset): +def test_set_dictionary_pagesize_limit(): df = _test_dataframe(100) table = pa.Table.from_pandas(df, preserve_index=False) @@ -101,8 +96,7 @@ def test_set_dictionary_pagesize_limit(use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_chunked_table_write(use_legacy_dataset): +def test_chunked_table_write(): # ARROW-232 tables = [] batch = pa.RecordBatch.from_pandas(alltypes_sample(size=10)) @@ -116,66 +110,56 @@ def test_chunked_table_write(use_legacy_dataset): for table in tables: _check_roundtrip( table, version='2.6', - use_legacy_dataset=use_legacy_dataset, data_page_version=data_page_version, use_dictionary=use_dictionary) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_memory_map(tempdir, use_legacy_dataset): +def test_memory_map(tempdir): df = alltypes_sample(size=10) table = pa.Table.from_pandas(df) _check_roundtrip(table, read_table_kwargs={'memory_map': True}, - version='2.6', use_legacy_dataset=use_legacy_dataset) + version='2.6') filename = str(tempdir / 'tmp_file') with open(filename, 'wb') as f: _write_table(table, f, version='2.6') - table_read = pq.read_pandas(filename, memory_map=True, - use_legacy_dataset=use_legacy_dataset) + table_read = pq.read_pandas(filename, memory_map=True) assert table_read.equals(table) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_enable_buffered_stream(tempdir, use_legacy_dataset): +def test_enable_buffered_stream(tempdir): df = alltypes_sample(size=10) table = pa.Table.from_pandas(df) _check_roundtrip(table, read_table_kwargs={'buffer_size': 1025}, - version='2.6', use_legacy_dataset=use_legacy_dataset) + version='2.6') filename = str(tempdir / 'tmp_file') with open(filename, 'wb') as f: _write_table(table, f, version='2.6') - table_read = pq.read_pandas(filename, buffer_size=4096, - use_legacy_dataset=use_legacy_dataset) + table_read = pq.read_pandas(filename, buffer_size=4096) assert table_read.equals(table) -@parametrize_legacy_dataset -def test_special_chars_filename(tempdir, use_legacy_dataset): +def test_special_chars_filename(tempdir): table = pa.Table.from_arrays([pa.array([42])], ["ints"]) filename = "foo # bar" path = tempdir / filename assert not path.exists() _write_table(table, str(path)) assert path.exists() - table_read = _read_table(str(path), use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(str(path)) assert table_read.equals(table) -@parametrize_legacy_dataset -def test_invalid_source(use_legacy_dataset): +def test_invalid_source(): # Test that we provide an helpful error message pointing out # that None wasn't expected 
when trying to open a Parquet None file. - # - # Depending on use_legacy_dataset the message changes slightly - # but in both cases it should point out that None wasn't expected. with pytest.raises(TypeError, match="None"): - pq.read_table(None, use_legacy_dataset=use_legacy_dataset) + pq.read_table(None) with pytest.raises(TypeError, match="None"): pq.ParquetFile(None) @@ -193,8 +177,7 @@ def test_file_with_over_int16_max_row_groups(): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_empty_table_roundtrip(use_legacy_dataset): +def test_empty_table_roundtrip(): df = alltypes_sample(size=10) # Create a non-empty table to infer the types correctly, then slice to 0 @@ -206,19 +189,17 @@ def test_empty_table_roundtrip(use_legacy_dataset): assert table.schema.field('null').type == pa.null() assert table.schema.field('null_list').type == pa.list_(pa.null()) _check_roundtrip( - table, version='2.6', use_legacy_dataset=use_legacy_dataset) + table, version='2.6') @pytest.mark.pandas -@parametrize_legacy_dataset -def test_empty_table_no_columns(use_legacy_dataset): +def test_empty_table_no_columns(): df = pd.DataFrame() empty = pa.Table.from_pandas(df, preserve_index=False) - _check_roundtrip(empty, use_legacy_dataset=use_legacy_dataset) + _check_roundtrip(empty) -@parametrize_legacy_dataset -def test_write_nested_zero_length_array_chunk_failure(use_legacy_dataset): +def test_write_nested_zero_length_array_chunk_failure(): # Bug report in ARROW-3792 cols = OrderedDict( int32=pa.int32(), @@ -243,17 +224,16 @@ def test_write_nested_zero_length_array_chunk_failure(use_legacy_dataset): my_batches = [pa.RecordBatch.from_arrays(batch, schema=pa.schema(cols)) for batch in my_arrays] tbl = pa.Table.from_batches(my_batches, pa.schema(cols)) - _check_roundtrip(tbl, use_legacy_dataset=use_legacy_dataset) + _check_roundtrip(tbl) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_multiple_path_types(tempdir, use_legacy_dataset): +def test_multiple_path_types(tempdir): # Test compatibility with PEP 519 path-like objects path = tempdir / 'zzz.parquet' df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)}) _write_table(df, path) - table_read = _read_table(path, use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(path) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -261,13 +241,12 @@ def test_multiple_path_types(tempdir, use_legacy_dataset): path = str(tempdir) + 'zzz.parquet' df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)}) _write_table(df, path) - table_read = _read_table(path, use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(path) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) -@parametrize_legacy_dataset -def test_fspath(tempdir, use_legacy_dataset): +def test_fspath(tempdir): # ARROW-12472 support __fspath__ objects without using str() path = tempdir / "test.parquet" table = pa.table({"a": [1, 2, 3]}) @@ -275,9 +254,7 @@ def test_fspath(tempdir, use_legacy_dataset): fs_protocol_obj = util.FSProtocolClass(path) - result = _read_table( - fs_protocol_obj, use_legacy_dataset=use_legacy_dataset - ) + result = _read_table(fs_protocol_obj) assert result.equals(table) # combined with non-local filesystem raises @@ -285,15 +262,11 @@ def test_fspath(tempdir, use_legacy_dataset): _read_table(fs_protocol_obj, filesystem=FileSystem()) -@pytest.mark.dataset -@parametrize_legacy_dataset @pytest.mark.parametrize("filesystem", [ None, fs.LocalFileSystem(), LocalFileSystem._get_instance() ]) @pytest.mark.parametrize("name", 
("data.parquet", "例.parquet")) -def test_relative_paths(tempdir, use_legacy_dataset, filesystem, name): - if use_legacy_dataset and isinstance(filesystem, fs.FileSystem): - pytest.skip("Passing new filesystem not supported for legacy reader") +def test_relative_paths(tempdir, filesystem, name): # reading and writing from relative paths table = pa.table({"a": [1, 2, 3]}) path = tempdir / name @@ -301,8 +274,7 @@ def test_relative_paths(tempdir, use_legacy_dataset, filesystem, name): # reading pq.write_table(table, str(path)) with util.change_cwd(tempdir): - result = pq.read_table(name, filesystem=filesystem, - use_legacy_dataset=use_legacy_dataset) + result = pq.read_table(name, filesystem=filesystem) assert result.equals(table) path.unlink() @@ -334,24 +306,21 @@ def seek(self, *args): pq.read_table(BogusFile(b"")) -@parametrize_legacy_dataset -def test_parquet_read_from_buffer(tempdir, use_legacy_dataset): +def test_parquet_read_from_buffer(tempdir): # reading from a buffer from python's open() table = pa.table({"a": [1, 2, 3]}) pq.write_table(table, str(tempdir / "data.parquet")) with open(str(tempdir / "data.parquet"), "rb") as f: - result = pq.read_table(f, use_legacy_dataset=use_legacy_dataset) + result = pq.read_table(f) assert result.equals(table) with open(str(tempdir / "data.parquet"), "rb") as f: - result = pq.read_table(pa.PythonFile(f), - use_legacy_dataset=use_legacy_dataset) + result = pq.read_table(pa.PythonFile(f)) assert result.equals(table) -@parametrize_legacy_dataset -def test_byte_stream_split(use_legacy_dataset): +def test_byte_stream_split(): # This is only a smoke test. arr_float = pa.array(list(map(float, range(100)))) arr_int = pa.array(list(map(int, range(100)))) @@ -385,12 +354,10 @@ def test_byte_stream_split(use_legacy_dataset): table = pa.Table.from_arrays([arr_int], names=['tmp']) with pytest.raises(IOError): _check_roundtrip(table, expected=table, use_byte_stream_split=True, - use_dictionary=False, - use_legacy_dataset=use_legacy_dataset) + use_dictionary=False) -@parametrize_legacy_dataset -def test_column_encoding(use_legacy_dataset): +def test_column_encoding(): arr_float = pa.array(list(map(float, range(100)))) arr_int = pa.array(list(map(int, range(100)))) arr_bin = pa.array([str(x) for x in range(100)], type=pa.binary()) @@ -406,30 +373,26 @@ def test_column_encoding(use_legacy_dataset): _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, column_encoding={'a': "BYTE_STREAM_SPLIT", 'b': "PLAIN", - 'c': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "PLAIN"}) # Check "PLAIN" for all columns. _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, - column_encoding="PLAIN", - use_legacy_dataset=use_legacy_dataset) + column_encoding="PLAIN") # Check "DELTA_BINARY_PACKED" for integer columns. _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, column_encoding={'a': "PLAIN", 'b': "DELTA_BINARY_PACKED", - 'c': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "PLAIN"}) # Check "DELTA_LENGTH_BYTE_ARRAY" for byte columns. _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, column_encoding={'a': "PLAIN", 'b': "DELTA_BINARY_PACKED", - 'c': "DELTA_LENGTH_BYTE_ARRAY"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "DELTA_LENGTH_BYTE_ARRAY"}) # Check "DELTA_BYTE_ARRAY" for byte columns. 
_check_roundtrip(mixed_table, expected=mixed_table, @@ -437,14 +400,12 @@ def test_column_encoding(use_legacy_dataset): column_encoding={'a': "PLAIN", 'b': "DELTA_BINARY_PACKED", 'c': "DELTA_BYTE_ARRAY", - 'd': "DELTA_BYTE_ARRAY"}, - use_legacy_dataset=use_legacy_dataset) + 'd': "DELTA_BYTE_ARRAY"}) # Check "RLE" for boolean columns. _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, - column_encoding={'e': "RLE"}, - use_legacy_dataset=use_legacy_dataset) + column_encoding={'e': "RLE"}) # Try to pass "BYTE_STREAM_SPLIT" column encoding for integer column 'b'. # This should throw an error as it is only supports FLOAT and DOUBLE. @@ -455,8 +416,7 @@ def test_column_encoding(use_legacy_dataset): use_dictionary=False, column_encoding={'a': "PLAIN", 'b': "BYTE_STREAM_SPLIT", - 'c': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "PLAIN"}) # Try to pass use "DELTA_BINARY_PACKED" encoding on float column. # This should throw an error as only integers are supported. @@ -465,8 +425,7 @@ def test_column_encoding(use_legacy_dataset): use_dictionary=False, column_encoding={'a': "DELTA_BINARY_PACKED", 'b': "PLAIN", - 'c': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "PLAIN"}) # Try to pass "RLE_DICTIONARY". # This should throw an error as dictionary encoding is already used by @@ -474,30 +433,26 @@ def test_column_encoding(use_legacy_dataset): with pytest.raises(ValueError): _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, - column_encoding="RLE_DICTIONARY", - use_legacy_dataset=use_legacy_dataset) + column_encoding="RLE_DICTIONARY") # Try to pass unsupported encoding. with pytest.raises(ValueError): _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, - column_encoding={'a': "MADE_UP_ENCODING"}, - use_legacy_dataset=use_legacy_dataset) + column_encoding={'a': "MADE_UP_ENCODING"}) # Try to pass column_encoding and use_dictionary. # This should throw an error. with pytest.raises(ValueError): _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=['b'], - column_encoding={'b': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + column_encoding={'b': "PLAIN"}) # Try to pass column_encoding and use_dictionary=True (default value). # This should throw an error. with pytest.raises(ValueError): _check_roundtrip(mixed_table, expected=mixed_table, - column_encoding={'b': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + column_encoding={'b': "PLAIN"}) # Try to pass column_encoding and use_byte_stream_split on same column. # This should throw an error. @@ -507,8 +462,7 @@ def test_column_encoding(use_legacy_dataset): use_byte_stream_split=['a'], column_encoding={'a': "RLE", 'b': "BYTE_STREAM_SPLIT", - 'c': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "PLAIN"}) # Try to pass column_encoding and use_byte_stream_split=True. # This should throw an error. @@ -518,54 +472,45 @@ def test_column_encoding(use_legacy_dataset): use_byte_stream_split=True, column_encoding={'a': "RLE", 'b': "BYTE_STREAM_SPLIT", - 'c': "PLAIN"}, - use_legacy_dataset=use_legacy_dataset) + 'c': "PLAIN"}) # Try to pass column_encoding=True. # This should throw an error. 
with pytest.raises(TypeError): _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False, - column_encoding=True, - use_legacy_dataset=use_legacy_dataset) + column_encoding=True) -@parametrize_legacy_dataset -def test_compression_level(use_legacy_dataset): +def test_compression_level(): arr = pa.array(list(map(int, range(1000)))) data = [arr, arr] table = pa.Table.from_arrays(data, names=['a', 'b']) # Check one compression level. _check_roundtrip(table, expected=table, compression="gzip", - compression_level=1, - use_legacy_dataset=use_legacy_dataset) + compression_level=1) # Check another one to make sure that compression_level=1 does not # coincide with the default one in Arrow. _check_roundtrip(table, expected=table, compression="gzip", - compression_level=5, - use_legacy_dataset=use_legacy_dataset) + compression_level=5) # Check that the user can provide a compression per column _check_roundtrip(table, expected=table, - compression={'a': "gzip", 'b': "snappy"}, - use_legacy_dataset=use_legacy_dataset) + compression={'a': "gzip", 'b': "snappy"}) # Check that the user can provide a compression level per column _check_roundtrip(table, expected=table, compression="gzip", - compression_level={'a': 2, 'b': 3}, - use_legacy_dataset=use_legacy_dataset) + compression_level={'a': 2, 'b': 3}) # Check if both LZ4 compressors are working # (level < 3 -> fast, level >= 3 -> HC) _check_roundtrip(table, expected=table, compression="lz4", - compression_level=1, - use_legacy_dataset=use_legacy_dataset) + compression_level=1) _check_roundtrip(table, expected=table, compression="lz4", - compression_level=9, - use_legacy_dataset=use_legacy_dataset) + compression_level=9) # Check that specifying a compression level for a codec which does allow # specifying one, results into an error. 
@@ -594,8 +539,7 @@ def test_sanitized_spark_field_names(): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_multithreaded_read(use_legacy_dataset): +def test_multithreaded_read(): df = alltypes_sample(size=10000) table = pa.Table.from_pandas(df) @@ -604,19 +548,16 @@ def test_multithreaded_read(use_legacy_dataset): _write_table(table, buf, compression='SNAPPY', version='2.6') buf.seek(0) - table1 = _read_table( - buf, use_threads=True, use_legacy_dataset=use_legacy_dataset) + table1 = _read_table(buf, use_threads=True) buf.seek(0) - table2 = _read_table( - buf, use_threads=False, use_legacy_dataset=use_legacy_dataset) + table2 = _read_table(buf, use_threads=False) assert table1.equals(table2) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_min_chunksize(use_legacy_dataset): +def test_min_chunksize(): data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D']) table = pa.Table.from_pandas(data.reset_index()) @@ -624,7 +565,7 @@ def test_min_chunksize(use_legacy_dataset): _write_table(table, buf, chunk_size=-1) buf.seek(0) - result = _read_table(buf, use_legacy_dataset=use_legacy_dataset) + result = _read_table(buf) assert result.equals(table) @@ -659,57 +600,46 @@ def test_write_error_deletes_incomplete_file(tempdir): assert not filename.exists() -@parametrize_legacy_dataset -def test_read_non_existent_file(tempdir, use_legacy_dataset): +def test_read_non_existent_file(tempdir): path = 'nonexistent-file.parquet' try: - pq.read_table(path, use_legacy_dataset=use_legacy_dataset) + pq.read_table(path) except Exception as e: assert path in e.args[0] -@parametrize_legacy_dataset -def test_read_table_doesnt_warn(datadir, use_legacy_dataset): - if use_legacy_dataset: - msg = "Passing 'use_legacy_dataset=True'" - with pytest.warns(FutureWarning, match=msg): - pq.read_table(datadir / 'v0.7.1.parquet', - use_legacy_dataset=use_legacy_dataset) - else: - with warnings.catch_warnings(): - warnings.simplefilter(action="error") - pq.read_table(datadir / 'v0.7.1.parquet', - use_legacy_dataset=use_legacy_dataset) +def test_read_table_doesnt_warn(datadir): + with warnings.catch_warnings(): + warnings.simplefilter(action="error") + pq.read_table(datadir / 'v0.7.1.parquet') @pytest.mark.pandas -@parametrize_legacy_dataset -def test_zlib_compression_bug(use_legacy_dataset): +def test_zlib_compression_bug(): # ARROW-3514: "zlib deflate failed, output buffer too small" table = pa.Table.from_arrays([pa.array(['abc', 'def'])], ['some_col']) f = io.BytesIO() pq.write_table(table, f, compression='gzip') f.seek(0) - roundtrip = pq.read_table(f, use_legacy_dataset=use_legacy_dataset) + roundtrip = pq.read_table(f) tm.assert_frame_equal(roundtrip.to_pandas(), table.to_pandas()) -@parametrize_legacy_dataset -def test_parquet_file_too_small(tempdir, use_legacy_dataset): +def test_parquet_file_too_small(tempdir): path = str(tempdir / "test.parquet") # TODO(dataset) with datasets API it raises OSError instead with pytest.raises((pa.ArrowInvalid, OSError), match='size is 0 bytes'): with open(path, 'wb') as f: pass - pq.read_table(path, use_legacy_dataset=use_legacy_dataset) + pq.read_table(path) with pytest.raises((pa.ArrowInvalid, OSError), match='size is 4 bytes'): with open(path, 'wb') as f: f.write(b'ffff') - pq.read_table(path, use_legacy_dataset=use_legacy_dataset) + pq.read_table(path) @pytest.mark.pandas @@ -752,17 +682,15 @@ def test_fastparquet_cross_compatibility(tempdir): tm.assert_frame_equal(table_fp.to_pandas(), df) -@parametrize_legacy_dataset 
@pytest.mark.parametrize('array_factory', [ lambda: pa.array([0, None] * 10), lambda: pa.array([0, None] * 10).dictionary_encode(), lambda: pa.array(["", None] * 10), lambda: pa.array(["", None] * 10).dictionary_encode(), ]) -@pytest.mark.parametrize('use_dictionary', [False, True]) @pytest.mark.parametrize('read_dictionary', [False, True]) def test_buffer_contents( - array_factory, use_dictionary, read_dictionary, use_legacy_dataset + array_factory, read_dictionary ): # Test that null values are deterministically initialized to zero # after a roundtrip through Parquet. @@ -773,8 +701,7 @@ def test_buffer_contents( bio.seek(0) read_dictionary = ['col'] if read_dictionary else None table = pq.read_table(bio, use_threads=False, - read_dictionary=read_dictionary, - use_legacy_dataset=use_legacy_dataset) + read_dictionary=read_dictionary) for col in table.columns: [chunk] = col.chunks @@ -826,7 +753,6 @@ def test_reads_over_batch(tempdir): assert table == table2 -@pytest.mark.dataset def test_permutation_of_column_order(tempdir): # ARROW-2366 case = tempdir / "dataset_column_order_permutation" @@ -846,18 +772,6 @@ def test_permutation_of_column_order(tempdir): assert table == table2 -def test_read_table_legacy_deprecated(tempdir): - # ARROW-15870 - table = pa.table({'a': [1, 2, 3]}) - path = tempdir / 'data.parquet' - pq.write_table(table, path) - - with pytest.warns( - FutureWarning, match="Passing 'use_legacy_dataset=True'" - ): - pq.read_table(path, use_legacy_dataset=True) - - def test_thrift_size_limits(tempdir): path = tempdir / 'largethrift.parquet' @@ -942,28 +856,9 @@ def test_page_checksum_verification_write_table(tempdir): with pytest.raises(OSError, match="CRC checksum verification"): _ = corrupted_pq_file.read() - # Case 5: Check that enabling page checksum verification in combination - # with legacy dataset raises an exception - with pytest.raises(ValueError, match="page_checksum_verification"): - _ = pq.read_table(corrupted_path, - page_checksum_verification=True, - use_legacy_dataset=True) - @pytest.mark.dataset -@pytest.mark.parametrize( - "use_legacy_dataset", - [ - False, - pytest.param( - True, - marks=pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning" - ), - ), - ], -) -def test_checksum_write_to_dataset(tempdir, use_legacy_dataset): +def test_checksum_write_to_dataset(tempdir): """Check that checksum verification works for datasets created with pq.write_to_dataset""" @@ -973,8 +868,7 @@ def test_checksum_write_to_dataset(tempdir, use_legacy_dataset): original_dir_path = tempdir / 'correct_dir' pq.write_to_dataset(table_orig, original_dir_path, - write_page_checksum=True, - use_legacy_dataset=use_legacy_dataset) + write_page_checksum=True) # Read file and verify that the data is correct original_file_path_list = list(original_dir_path.iterdir()) @@ -1014,3 +908,23 @@ def test_checksum_write_to_dataset(tempdir, use_legacy_dataset): # checksum verification enabled raises an exception with pytest.raises(OSError, match="CRC checksum verification"): _ = pq.read_table(corrupted_file_path, page_checksum_verification=True) + + +@pytest.mark.dataset +def test_deprecated_use_legacy_dataset(tempdir): + # Test that specifying use_legacy_dataset in ParquetDataset, write_to_dataset + # and read_table doesn't raise an error but gives a warning. 
+    table = pa.table({"a": [1, 2, 3]})
+    path = tempdir / "deprecate_legacy"
+
+    msg = "Passing 'use_legacy_dataset'"
+    with pytest.warns(FutureWarning, match=msg):
+        pq.write_to_dataset(table, path, use_legacy_dataset=False)
+
+    pq.write_to_dataset(table, path)
+
+    with pytest.warns(FutureWarning, match=msg):
+        pq.read_table(path, use_legacy_dataset=False)
+
+    with pytest.warns(FutureWarning, match=msg):
+        pq.ParquetDataset(path, use_legacy_dataset=False)
diff --git a/python/pyarrow/tests/parquet/test_compliant_nested_type.py b/python/pyarrow/tests/parquet/test_compliant_nested_type.py
index ca1ad7ee32255..2345855a3321b 100644
--- a/python/pyarrow/tests/parquet/test_compliant_nested_type.py
+++ b/python/pyarrow/tests/parquet/test_compliant_nested_type.py
@@ -18,7 +18,6 @@
 import pytest
 
 import pyarrow as pa
-from pyarrow.tests.parquet.common import parametrize_legacy_dataset
 
 try:
     import pyarrow.parquet as pq
@@ -58,16 +57,13 @@
 
 
 @pytest.mark.pandas
-@parametrize_legacy_dataset
 @parametrize_test_data
-def test_write_compliant_nested_type_enable(tempdir,
-                                            use_legacy_dataset, test_data):
+def test_write_compliant_nested_type_enable(tempdir, test_data):
     # prepare dataframe for testing
     df = pd.DataFrame(data=test_data)
     # verify that we can read/write pandas df with new flag (default behaviour)
     _roundtrip_pandas_dataframe(df,
-                                write_kwargs={},
-                                use_legacy_dataset=use_legacy_dataset)
+                                write_kwargs={})
 
     # Write to a parquet file with compliant nested type
     table = pa.Table.from_pandas(df, preserve_index=False)
@@ -83,21 +79,17 @@ def test_write_compliant_nested_type_enable(tempdir,
     assert new_table.schema.types[0].value_field.name == 'element'
 
     # Verify that the new table can be read/written correctly
-    _check_roundtrip(new_table,
-                     use_legacy_dataset=use_legacy_dataset)
+    _check_roundtrip(new_table)
 
 
 @pytest.mark.pandas
-@parametrize_legacy_dataset
 @parametrize_test_data
-def test_write_compliant_nested_type_disable(tempdir,
-                                             use_legacy_dataset, test_data):
+def test_write_compliant_nested_type_disable(tempdir, test_data):
     # prepare dataframe for testing
     df = pd.DataFrame(data=test_data)
     # verify that we can read/write with new flag disabled
     _roundtrip_pandas_dataframe(df, write_kwargs={
-        'use_compliant_nested_type': False},
-        use_legacy_dataset=use_legacy_dataset)
+        'use_compliant_nested_type': False})
 
     # Write to a parquet file while disabling compliant nested type
     table = pa.Table.from_pandas(df, preserve_index=False)
@@ -114,5 +106,4 @@ def test_write_compliant_nested_type_disable(tempdir,
 
     # Verify that the new table can be read/written correctly
     _check_roundtrip(new_table,
-                     use_legacy_dataset=use_legacy_dataset,
                      use_compliant_nested_type=False)
diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py
index 32fe128bbae9b..e6b66b00428fb 100644
--- a/python/pyarrow/tests/parquet/test_data_types.py
+++ b/python/pyarrow/tests/parquet/test_data_types.py
@@ -23,8 +23,7 @@
 
 import pyarrow as pa
 from pyarrow.tests import util
-from pyarrow.tests.parquet.common import (_check_roundtrip,
-                                          parametrize_legacy_dataset)
+from pyarrow.tests.parquet.common import _check_roundtrip
 
 try:
     import pyarrow.parquet as pq
@@ -54,9 +53,8 @@
 
 
 @pytest.mark.pandas
-@parametrize_legacy_dataset
 @pytest.mark.parametrize('chunk_size', [None, 1000])
-def test_parquet_2_0_roundtrip(tempdir, chunk_size, use_legacy_dataset):
+def test_parquet_2_0_roundtrip(tempdir, chunk_size):
     df = alltypes_sample(size=10000, categorical=True)
 
     filename = tempdir / 'pandas_roundtrip.parquet'
@@
-65,8 +63,7 @@ def test_parquet_2_0_roundtrip(tempdir, chunk_size, use_legacy_dataset): _write_table(arrow_table, filename, version='2.6', chunk_size=chunk_size) - table_read = pq.read_pandas( - filename, use_legacy_dataset=use_legacy_dataset) + table_read = pq.read_pandas(filename) assert table_read.schema.pandas_metadata is not None read_metadata = table_read.schema.metadata @@ -77,8 +74,7 @@ def test_parquet_2_0_roundtrip(tempdir, chunk_size, use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_parquet_1_0_roundtrip(tempdir, use_legacy_dataset): +def test_parquet_1_0_roundtrip(tempdir): size = 10000 np.random.seed(0) df = pd.DataFrame({ @@ -100,7 +96,7 @@ def test_parquet_1_0_roundtrip(tempdir, use_legacy_dataset): filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) _write_table(arrow_table, filename, version='1.0') - table_read = _read_table(filename, use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(filename) df_read = table_read.to_pandas() # We pass uint32_t as int64_t if we write Parquet version 1.0 @@ -113,18 +109,17 @@ def test_parquet_1_0_roundtrip(tempdir, use_legacy_dataset): # ----------------------------------------------------------------------------- -def _simple_table_write_read(table, use_legacy_dataset): +def _simple_table_write_read(table): bio = pa.BufferOutputStream() pq.write_table(table, bio) contents = bio.getvalue() return pq.read_table( - pa.BufferReader(contents), use_legacy_dataset=use_legacy_dataset + pa.BufferReader(contents) ) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_direct_read_dictionary(use_legacy_dataset): +def test_direct_read_dictionary(): # ARROW-3325 repeats = 10 nunique = 5 @@ -140,8 +135,7 @@ def test_direct_read_dictionary(use_legacy_dataset): contents = bio.getvalue() result = pq.read_table(pa.BufferReader(contents), - read_dictionary=['f0'], - use_legacy_dataset=use_legacy_dataset) + read_dictionary=['f0']) # Compute dictionary-encoded subfield expected = pa.table([table[0].dictionary_encode()], names=['f0']) @@ -149,8 +143,7 @@ def test_direct_read_dictionary(use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_direct_read_dictionary_subfield(use_legacy_dataset): +def test_direct_read_dictionary_subfield(): repeats = 10 nunique = 5 @@ -163,8 +156,7 @@ def test_direct_read_dictionary_subfield(use_legacy_dataset): pq.write_table(table, bio) contents = bio.getvalue() result = pq.read_table(pa.BufferReader(contents), - read_dictionary=['f0.list.element'], - use_legacy_dataset=use_legacy_dataset) + read_dictionary=['f0.list.element']) arr = pa.array(data[0]) values_as_dict = arr.values.dictionary_encode() @@ -181,8 +173,7 @@ def test_direct_read_dictionary_subfield(use_legacy_dataset): assert result[0].num_chunks == 1 -@parametrize_legacy_dataset -def test_dictionary_array_automatically_read(use_legacy_dataset): +def test_dictionary_array_automatically_read(): # ARROW-3246 # Make a large dictionary, a little over 4MB of data @@ -200,7 +191,7 @@ def test_dictionary_array_automatically_read(use_legacy_dataset): dict_values)) table = pa.table([pa.chunked_array(chunks)], names=['f0']) - result = _simple_table_write_read(table, use_legacy_dataset) + result = _simple_table_write_read(table) assert result.equals(table) @@ -213,8 +204,7 @@ def test_dictionary_array_automatically_read(use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_decimal_roundtrip(tempdir, use_legacy_dataset): +def 
test_decimal_roundtrip(tempdir): num_values = 10 columns = {} @@ -234,8 +224,7 @@ def test_decimal_roundtrip(tempdir, use_legacy_dataset): string_filename = str(filename) table = pa.Table.from_pandas(expected) _write_table(table, string_filename) - result_table = _read_table( - string_filename, use_legacy_dataset=use_legacy_dataset) + result_table = _read_table(string_filename) result = result_table.to_pandas() tm.assert_frame_equal(result, expected) @@ -259,14 +248,13 @@ def test_decimal_roundtrip_negative_scale(tempdir): # ----------------------------------------------------------------------------- -@parametrize_legacy_dataset @pytest.mark.parametrize('dtype', [int, float]) -def test_single_pylist_column_roundtrip(tempdir, dtype, use_legacy_dataset): +def test_single_pylist_column_roundtrip(tempdir, dtype,): filename = tempdir / 'single_{}_column.parquet'.format(dtype.__name__) data = [pa.array(list(map(dtype, range(5))))] table = pa.Table.from_arrays(data, names=['a']) _write_table(table, filename) - table_read = _read_table(filename, use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(filename) for i in range(table.num_columns): col_written = table[i] col_read = table_read[i] @@ -277,16 +265,14 @@ def test_single_pylist_column_roundtrip(tempdir, dtype, use_legacy_dataset): assert data_written.equals(data_read) -@parametrize_legacy_dataset -def test_empty_lists_table_roundtrip(use_legacy_dataset): +def test_empty_lists_table_roundtrip(): # ARROW-2744: Shouldn't crash when writing an array of empty lists arr = pa.array([[], []], type=pa.list_(pa.int32())) table = pa.Table.from_arrays([arr], ["A"]) - _check_roundtrip(table, use_legacy_dataset=use_legacy_dataset) + _check_roundtrip(table) -@parametrize_legacy_dataset -def test_nested_list_nonnullable_roundtrip_bug(use_legacy_dataset): +def test_nested_list_nonnullable_roundtrip_bug(): # Reproduce failure in ARROW-5630 typ = pa.list_(pa.field("item", pa.float32(), False)) num_rows = 10000 @@ -295,26 +281,22 @@ def test_nested_list_nonnullable_roundtrip_bug(use_legacy_dataset): (num_rows // 10)), type=typ) ], ['a']) _check_roundtrip( - t, data_page_size=4096, use_legacy_dataset=use_legacy_dataset) + t, data_page_size=4096) -@parametrize_legacy_dataset -def test_nested_list_struct_multiple_batches_roundtrip( - tempdir, use_legacy_dataset -): +def test_nested_list_struct_multiple_batches_roundtrip(tempdir): # Reproduce failure in ARROW-11024 data = [[{'x': 'abc', 'y': 'abc'}]]*100 + [[{'x': 'abc', 'y': 'gcb'}]]*100 table = pa.table([pa.array(data)], names=['column']) _check_roundtrip( - table, row_group_size=20, use_legacy_dataset=use_legacy_dataset) + table, row_group_size=20) # Reproduce failure in ARROW-11069 (plain non-nested structs with strings) data = pa.array( [{'a': '1', 'b': '2'}, {'a': '3', 'b': '4'}, {'a': '5', 'b': '6'}]*10 ) table = pa.table({'column': data}) - _check_roundtrip( - table, row_group_size=10, use_legacy_dataset=use_legacy_dataset) + _check_roundtrip(table, row_group_size=10) def test_writing_empty_lists(): @@ -366,8 +348,7 @@ def test_large_list_records(): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_parquet_nested_convenience(tempdir, use_legacy_dataset): +def test_parquet_nested_convenience(tempdir): # ARROW-1684 df = pd.DataFrame({ 'a': [[1, 2, 3], None, [4, 5], []], @@ -380,11 +361,11 @@ def test_parquet_nested_convenience(tempdir, use_legacy_dataset): _write_table(table, path) read = pq.read_table( - path, columns=['a'], use_legacy_dataset=use_legacy_dataset) + path, columns=['a']) 
tm.assert_frame_equal(read.to_pandas(), df[['a']]) read = pq.read_table( - path, columns=['a', 'b'], use_legacy_dataset=use_legacy_dataset) + path, columns=['a', 'b']) tm.assert_frame_equal(read.to_pandas(), df) @@ -420,17 +401,16 @@ def test_large_table_int32_overflow(): _write_table(table, f) -def _simple_table_roundtrip(table, use_legacy_dataset=False, **write_kwargs): +def _simple_table_roundtrip(table, **write_kwargs): stream = pa.BufferOutputStream() _write_table(table, stream, **write_kwargs) buf = stream.getvalue() - return _read_table(buf, use_legacy_dataset=use_legacy_dataset) + return _read_table(buf) @pytest.mark.slow @pytest.mark.large_memory -@parametrize_legacy_dataset -def test_byte_array_exactly_2gb(use_legacy_dataset): +def test_byte_array_exactly_2gb(): # Test edge case reported in ARROW-3762 val = b'x' * (1 << 10) @@ -444,15 +424,14 @@ def test_byte_array_exactly_2gb(use_legacy_dataset): values = pa.chunked_array([base, pa.array(case)]) t = pa.table([values], names=['f0']) result = _simple_table_roundtrip( - t, use_legacy_dataset=use_legacy_dataset, use_dictionary=False) + t, use_dictionary=False) assert t.equals(result) @pytest.mark.slow @pytest.mark.pandas @pytest.mark.large_memory -@parametrize_legacy_dataset -def test_binary_array_overflow_to_chunked(use_legacy_dataset): +def test_binary_array_overflow_to_chunked(): # ARROW-3762 # 2^31 + 1 bytes @@ -462,8 +441,7 @@ def test_binary_array_overflow_to_chunked(use_legacy_dataset): df = pd.DataFrame({'byte_col': values}) tbl = pa.Table.from_pandas(df, preserve_index=False) - read_tbl = _simple_table_roundtrip( - tbl, use_legacy_dataset=use_legacy_dataset) + read_tbl = _simple_table_roundtrip(tbl) col0_data = read_tbl[0] assert isinstance(col0_data, pa.ChunkedArray) @@ -477,8 +455,7 @@ def test_binary_array_overflow_to_chunked(use_legacy_dataset): @pytest.mark.slow @pytest.mark.pandas @pytest.mark.large_memory -@parametrize_legacy_dataset -def test_list_of_binary_large_cell(use_legacy_dataset): +def test_list_of_binary_large_cell(): # ARROW-4688 data = [] @@ -491,8 +468,7 @@ def test_list_of_binary_large_cell(use_legacy_dataset): arr = pa.array(data) table = pa.Table.from_arrays([arr], ['chunky_cells']) - read_table = _simple_table_roundtrip( - table, use_legacy_dataset=use_legacy_dataset) + read_table = _simple_table_roundtrip(table) assert table.equals(read_table) diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py index a9e99d5d65cf9..b6e351bdef9a7 100644 --- a/python/pyarrow/tests/parquet/test_dataset.py +++ b/python/pyarrow/tests/parquet/test_dataset.py @@ -29,9 +29,6 @@ from pyarrow import fs from pyarrow.filesystem import LocalFileSystem from pyarrow.tests import util -from pyarrow.tests.parquet.common import ( - parametrize_legacy_dataset, parametrize_legacy_dataset_fixed, - parametrize_legacy_dataset_not_supported) from pyarrow.util import guid from pyarrow.vendored.version import Version @@ -53,76 +50,10 @@ # Marks all of the tests in this module # Ignore these with pytest ... 
-m 'not parquet' -pytestmark = pytest.mark.parquet +pytestmark = [pytest.mark.parquet, pytest.mark.dataset] -@pytest.mark.pandas -def test_parquet_piece_read(tempdir): - df = _test_dataframe(1000) - table = pa.Table.from_pandas(df) - - path = tempdir / 'parquet_piece_read.parquet' - _write_table(table, path, version='2.6') - - with pytest.warns(FutureWarning): - piece1 = pq.ParquetDatasetPiece(path) - - result = piece1.read() - assert result.equals(table) - - -@pytest.mark.pandas -def test_parquet_piece_open_and_get_metadata(tempdir): - df = _test_dataframe(100) - table = pa.Table.from_pandas(df) - - path = tempdir / 'parquet_piece_read.parquet' - _write_table(table, path, version='2.6') - - with pytest.warns(FutureWarning): - piece = pq.ParquetDatasetPiece(path) - - table1 = piece.read() - assert isinstance(table1, pa.Table) - meta1 = piece.get_metadata() - assert isinstance(meta1, pq.FileMetaData) - - assert table.equals(table1) - - -@pytest.mark.filterwarnings("ignore:ParquetDatasetPiece:FutureWarning") -def test_parquet_piece_basics(): - path = '/baz.parq' - - piece1 = pq.ParquetDatasetPiece(path) - piece2 = pq.ParquetDatasetPiece(path, row_group=1) - piece3 = pq.ParquetDatasetPiece( - path, row_group=1, partition_keys=[('foo', 0), ('bar', 1)]) - - assert str(piece1) == path - assert str(piece2) == '/baz.parq | row_group=1' - assert str(piece3) == 'partition[foo=0, bar=1] /baz.parq | row_group=1' - - assert piece1 == piece1 - assert piece2 == piece2 - assert piece3 == piece3 - assert piece1 != piece3 - - -def test_partition_set_dictionary_type(): - set1 = pq.PartitionSet('key1', ['foo', 'bar', 'baz']) - set2 = pq.PartitionSet('key2', [2007, 2008, 2009]) - - assert isinstance(set1.dictionary, pa.StringArray) - assert isinstance(set2.dictionary, pa.IntegerArray) - - set3 = pq.PartitionSet('key2', [datetime.datetime(2007, 1, 1)]) - with pytest.raises(TypeError): - set3.dictionary - - -@parametrize_legacy_dataset_fixed -def test_filesystem_uri(tempdir, use_legacy_dataset): +def test_filesystem_uri(tempdir): table = pa.table({"a": [1, 2, 3]}) directory = tempdir / "data_dir" @@ -132,72 +63,36 @@ def test_filesystem_uri(tempdir, use_legacy_dataset): # filesystem object result = pq.read_table( - path, filesystem=fs.LocalFileSystem(), - use_legacy_dataset=use_legacy_dataset) + path, filesystem=fs.LocalFileSystem()) assert result.equals(table) # filesystem URI result = pq.read_table( - "data_dir/data.parquet", filesystem=util._filesystem_uri(tempdir), - use_legacy_dataset=use_legacy_dataset) + "data_dir/data.parquet", filesystem=util._filesystem_uri(tempdir)) assert result.equals(table) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_read_partitioned_directory(tempdir, use_legacy_dataset): +def test_read_partitioned_directory(tempdir): fs = LocalFileSystem._get_instance() - _partition_test_for_filesystem(fs, tempdir, use_legacy_dataset) + _partition_test_for_filesystem(fs, tempdir) -@pytest.mark.filterwarnings("ignore:'ParquetDataset:FutureWarning") @pytest.mark.pandas -def test_create_parquet_dataset_multi_threaded(tempdir): - fs = LocalFileSystem._get_instance() - base_path = tempdir - - _partition_test_for_filesystem(fs, base_path) - - manifest = pq.ParquetManifest(base_path, filesystem=fs, - metadata_nthreads=1) - with pytest.warns( - FutureWarning, match="Specifying the 'metadata_nthreads'" - ): - dataset = pq.ParquetDataset( - base_path, filesystem=fs, metadata_nthreads=16, - use_legacy_dataset=True - ) - assert len(dataset.pieces) > 0 - partitions = dataset.partitions - assert 
len(partitions.partition_names) > 0 - assert partitions.partition_names == manifest.partitions.partition_names - assert len(partitions.levels) == len(manifest.partitions.levels) - - -@pytest.mark.pandas -@parametrize_legacy_dataset -def test_read_partitioned_columns_selection(tempdir, use_legacy_dataset): +def test_read_partitioned_columns_selection(tempdir): # ARROW-3861 - do not include partition columns in resulting table when # `columns` keyword was passed without those columns fs = LocalFileSystem._get_instance() base_path = tempdir _partition_test_for_filesystem(fs, base_path) - dataset = pq.ParquetDataset( - base_path, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(base_path) result = dataset.read(columns=["values"]) - if use_legacy_dataset: - # ParquetDataset implementation always includes the partition columns - # automatically, and we can't easily "fix" this since dask relies on - # this behaviour (ARROW-8644) - assert result.column_names == ["values", "foo", "bar"] - else: - assert result.column_names == ["values"] + assert result.column_names == ["values"] @pytest.mark.pandas -@parametrize_legacy_dataset -def test_filters_equivalency(tempdir, use_legacy_dataset): +def test_filters_equivalency(tempdir): fs = LocalFileSystem._get_instance() base_path = tempdir @@ -225,7 +120,6 @@ def test_filters_equivalency(tempdir, use_legacy_dataset): base_path, filesystem=fs, filters=[('integer', '=', 1), ('string', '!=', 'b'), ('boolean', '==', 'True')], - use_legacy_dataset=use_legacy_dataset, ) table = dataset.read() result_df = (table.to_pandas().reset_index(drop=True)) @@ -247,8 +141,7 @@ def test_filters_equivalency(tempdir, use_legacy_dataset): [('integer', '=', 0), ('boolean', '==', 'False')] ] dataset = pq.ParquetDataset( - base_path, filesystem=fs, filters=filters, - use_legacy_dataset=use_legacy_dataset) + base_path, filesystem=fs, filters=filters) table = dataset.read() result_df = table.to_pandas().reset_index(drop=True) @@ -262,30 +155,15 @@ def test_filters_equivalency(tempdir, use_legacy_dataset): assert df_filter_2.sum() > 0 assert result_df.shape[0] == (df_filter_1.sum() + df_filter_2.sum()) - if use_legacy_dataset: - # Check for \0 in predicate values. Until they are correctly - # implemented in ARROW-3391, they would otherwise lead to weird - # results with the current code. 
- with pytest.raises(NotImplementedError): - filters = [[('string', '==', b'1\0a')]] - pq.ParquetDataset(base_path, filesystem=fs, filters=filters, - use_legacy_dataset=True) - with pytest.raises(NotImplementedError): - filters = [[('string', '==', '1\0a')]] - pq.ParquetDataset(base_path, filesystem=fs, filters=filters, - use_legacy_dataset=True) - else: - for filters in [[[('string', '==', b'1\0a')]], - [[('string', '==', '1\0a')]]]: - dataset = pq.ParquetDataset( - base_path, filesystem=fs, filters=filters, - use_legacy_dataset=False) - assert dataset.read().num_rows == 0 + for filters in [[[('string', '==', b'1\0a')]], + [[('string', '==', '1\0a')]]]: + dataset = pq.ParquetDataset( + base_path, filesystem=fs, filters=filters) + assert dataset.read().num_rows == 0 @pytest.mark.pandas -@parametrize_legacy_dataset -def test_filters_cutoff_exclusive_integer(tempdir, use_legacy_dataset): +def test_filters_cutoff_exclusive_integer(tempdir): fs = LocalFileSystem._get_instance() base_path = tempdir @@ -308,7 +186,6 @@ def test_filters_cutoff_exclusive_integer(tempdir, use_legacy_dataset): ('integers', '<', 4), ('integers', '>', 1), ], - use_legacy_dataset=use_legacy_dataset ) table = dataset.read() result_df = (table.to_pandas() @@ -319,15 +196,14 @@ def test_filters_cutoff_exclusive_integer(tempdir, use_legacy_dataset): assert result_list == [2, 3] -@pytest.mark.pandas -@parametrize_legacy_dataset @pytest.mark.xfail( # different error with use_legacy_datasets because result_df is no longer # categorical raises=(TypeError, AssertionError), reason='Loss of type information in creation of categoricals.' ) -def test_filters_cutoff_exclusive_datetime(tempdir, use_legacy_dataset): +@pytest.mark.pandas +def test_filters_cutoff_exclusive_datetime(tempdir): fs = LocalFileSystem._get_instance() base_path = tempdir @@ -356,7 +232,6 @@ def test_filters_cutoff_exclusive_datetime(tempdir, use_legacy_dataset): ('dates', '<', "2018-04-12"), ('dates', '>', "2018-04-10") ], - use_legacy_dataset=use_legacy_dataset ) table = dataset.read() result_df = (table.to_pandas() @@ -371,7 +246,6 @@ def test_filters_cutoff_exclusive_datetime(tempdir, use_legacy_dataset): @pytest.mark.pandas -@pytest.mark.dataset def test_filters_inclusive_datetime(tempdir): # ARROW-11480 path = tempdir / 'timestamps.parquet' @@ -389,8 +263,7 @@ def test_filters_inclusive_datetime(tempdir): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_filters_inclusive_integer(tempdir, use_legacy_dataset): +def test_filters_inclusive_integer(tempdir): fs = LocalFileSystem._get_instance() base_path = tempdir @@ -413,7 +286,6 @@ def test_filters_inclusive_integer(tempdir, use_legacy_dataset): ('integers', '<=', 3), ('integers', '>=', 2), ], - use_legacy_dataset=use_legacy_dataset ) table = dataset.read() result_df = (table.to_pandas() @@ -425,8 +297,7 @@ def test_filters_inclusive_integer(tempdir, use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_filters_inclusive_set(tempdir, use_legacy_dataset): +def test_filters_inclusive_set(tempdir): fs = LocalFileSystem._get_instance() base_path = tempdir @@ -451,7 +322,6 @@ def test_filters_inclusive_set(tempdir, use_legacy_dataset): dataset = pq.ParquetDataset( base_path, filesystem=fs, filters=[('string', 'in', 'ab')], - use_legacy_dataset=use_legacy_dataset ) table = dataset.read() result_df = (table.to_pandas().reset_index(drop=True)) @@ -464,7 +334,6 @@ def test_filters_inclusive_set(tempdir, use_legacy_dataset): base_path, filesystem=fs, filters=[('integer', 'in', [1]), 
('string', 'in', ('a', 'b')), ('boolean', 'not in', {'False'})], - use_legacy_dataset=use_legacy_dataset ) table = dataset.read() result_df = (table.to_pandas().reset_index(drop=True)) @@ -475,8 +344,7 @@ def test_filters_inclusive_set(tempdir, use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_filters_invalid_pred_op(tempdir, use_legacy_dataset): +def test_filters_invalid_pred_op(tempdir): fs = LocalFileSystem._get_instance() base_path = tempdir @@ -496,49 +364,30 @@ def test_filters_invalid_pred_op(tempdir, use_legacy_dataset): with pytest.raises(TypeError): pq.ParquetDataset(base_path, filesystem=fs, - filters=[('integers', 'in', 3), ], - use_legacy_dataset=use_legacy_dataset) + filters=[('integers', 'in', 3), ]) with pytest.raises(ValueError): pq.ParquetDataset(base_path, filesystem=fs, - filters=[('integers', '=<', 3), ], - use_legacy_dataset=use_legacy_dataset) - - if use_legacy_dataset: - with pytest.raises(ValueError): - pq.ParquetDataset(base_path, - filesystem=fs, - filters=[('integers', 'in', set()), ], - use_legacy_dataset=use_legacy_dataset) - else: - # Dataset API returns empty table instead - dataset = pq.ParquetDataset(base_path, - filesystem=fs, - filters=[('integers', 'in', set()), ], - use_legacy_dataset=use_legacy_dataset) - assert dataset.read().num_rows == 0 + filters=[('integers', '=<', 3), ]) - if use_legacy_dataset: - with pytest.raises(ValueError): - pq.ParquetDataset(base_path, - filesystem=fs, - filters=[('integers', '!=', {3})], - use_legacy_dataset=use_legacy_dataset) - else: - dataset = pq.ParquetDataset(base_path, - filesystem=fs, - filters=[('integers', '!=', {3})], - use_legacy_dataset=use_legacy_dataset) - with pytest.raises(NotImplementedError): - assert dataset.read().num_rows == 0 + # Dataset API returns empty table + dataset = pq.ParquetDataset(base_path, + filesystem=fs, + filters=[('integers', 'in', set()), ]) + assert dataset.read().num_rows == 0 + + dataset = pq.ParquetDataset(base_path, + filesystem=fs, + filters=[('integers', '!=', {3})]) + with pytest.raises(NotImplementedError): + assert dataset.read().num_rows == 0 @pytest.mark.pandas -@parametrize_legacy_dataset_fixed -def test_filters_invalid_column(tempdir, use_legacy_dataset): +def test_filters_invalid_column(tempdir): # ARROW-5572 - raise error on invalid name in filter specification - # works with new dataset / xfail with legacy implementation + # works with new dataset fs = LocalFileSystem._get_instance() base_path = tempdir @@ -556,12 +405,10 @@ def test_filters_invalid_column(tempdir, use_legacy_dataset): msg = r"No match for FieldRef.Name\(non_existent_column\)" with pytest.raises(ValueError, match=msg): pq.ParquetDataset(base_path, filesystem=fs, - filters=[('non_existent_column', '<', 3), ], - use_legacy_dataset=use_legacy_dataset).read() + filters=[('non_existent_column', '<', 3), ]).read() @pytest.mark.pandas -@parametrize_legacy_dataset @pytest.mark.parametrize("filters", ([('integers', '<', 3)], [[('integers', '<', 3)]], @@ -569,7 +416,7 @@ def test_filters_invalid_column(tempdir, use_legacy_dataset): pc.field('nested', 'a') < 3, pc.field('nested', 'b').cast(pa.int64()) < 3)) @pytest.mark.parametrize("read_method", ("read_table", "read_pandas")) -def test_filters_read_table(tempdir, use_legacy_dataset, filters, read_method): +def test_filters_read_table(tempdir, filters, read_method): read = getattr(pq, read_method) # test that filters keyword is passed through in read_table fs = LocalFileSystem._get_instance() @@ -589,24 +436,15 @@ def 
test_filters_read_table(tempdir, use_legacy_dataset, filters, read_method): _generate_partition_directories(fs, base_path, partition_spec, df) - kwargs = dict(filesystem=fs, filters=filters, - use_legacy_dataset=use_legacy_dataset) + kwargs = dict(filesystem=fs, filters=filters) - # Using Expression in legacy dataset not supported - if use_legacy_dataset and isinstance(filters, pc.Expression): - msg = "Expressions as filter not supported for legacy dataset" - with pytest.raises(TypeError, match=msg): - read(base_path, **kwargs) - else: - table = read(base_path, **kwargs) - assert table.num_rows == 3 + table = read(base_path, **kwargs) + assert table.num_rows == 3 @pytest.mark.pandas -@parametrize_legacy_dataset_fixed -def test_partition_keys_with_underscores(tempdir, use_legacy_dataset): +def test_partition_keys_with_underscores(tempdir): # ARROW-5666 - partition field values with underscores preserve underscores - # xfail with legacy dataset -> they get interpreted as integers fs = LocalFileSystem._get_instance() base_path = tempdir @@ -623,60 +461,47 @@ def test_partition_keys_with_underscores(tempdir, use_legacy_dataset): _generate_partition_directories(fs, base_path, partition_spec, df) - dataset = pq.ParquetDataset( - base_path, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(base_path) result = dataset.read() assert result.column("year_week").to_pylist() == string_keys @pytest.mark.s3 -@parametrize_legacy_dataset -def test_read_s3fs(s3_example_s3fs, use_legacy_dataset): +def test_read_s3fs(s3_example_s3fs, ): fs, path = s3_example_s3fs path = path + "/test.parquet" table = pa.table({"a": [1, 2, 3]}) _write_table(table, path, filesystem=fs) - result = _read_table( - path, filesystem=fs, use_legacy_dataset=use_legacy_dataset - ) + result = _read_table(path, filesystem=fs) assert result.equals(table) @pytest.mark.s3 -@parametrize_legacy_dataset -def test_read_directory_s3fs(s3_example_s3fs, use_legacy_dataset): +def test_read_directory_s3fs(s3_example_s3fs): fs, directory = s3_example_s3fs path = directory + "/test.parquet" table = pa.table({"a": [1, 2, 3]}) _write_table(table, path, filesystem=fs) - result = _read_table( - directory, filesystem=fs, use_legacy_dataset=use_legacy_dataset - ) + result = _read_table(directory, filesystem=fs) assert result.equals(table) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_read_single_file_list(tempdir, use_legacy_dataset): +def test_read_single_file_list(tempdir): data_path = str(tempdir / 'data.parquet') table = pa.table({"a": [1, 2, 3]}) _write_table(table, data_path) - result = pq.ParquetDataset( - [data_path], use_legacy_dataset=use_legacy_dataset - ).read() + result = pq.ParquetDataset([data_path]).read() assert result.equals(table) @pytest.mark.pandas @pytest.mark.s3 -@parametrize_legacy_dataset -def test_read_partitioned_directory_s3fs_wrapper( - s3_example_s3fs, use_legacy_dataset -): +def test_read_partitioned_directory_s3fs_wrapper(s3_example_s3fs): import s3fs from pyarrow.filesystem import S3FSWrapper @@ -690,23 +515,18 @@ def test_read_partitioned_directory_s3fs_wrapper( _partition_test_for_filesystem(wrapper, path) # Check that we can auto-wrap - dataset = pq.ParquetDataset( - path, filesystem=fs, use_legacy_dataset=use_legacy_dataset - ) + dataset = pq.ParquetDataset(path, filesystem=fs) dataset.read() @pytest.mark.pandas @pytest.mark.s3 -@parametrize_legacy_dataset -def test_read_partitioned_directory_s3fs(s3_example_s3fs, use_legacy_dataset): +def 
test_read_partitioned_directory_s3fs(s3_example_s3fs): fs, path = s3_example_s3fs - _partition_test_for_filesystem( - fs, path, use_legacy_dataset=use_legacy_dataset - ) + _partition_test_for_filesystem(fs, path) -def _partition_test_for_filesystem(fs, base_path, use_legacy_dataset=True): +def _partition_test_for_filesystem(fs, base_path): foo_keys = [0, 1] bar_keys = ['a', 'b', 'c'] partition_spec = [ @@ -724,8 +544,7 @@ def _partition_test_for_filesystem(fs, base_path, use_legacy_dataset=True): _generate_partition_directories(fs, base_path, partition_spec, df) - dataset = pq.ParquetDataset( - base_path, filesystem=fs, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(base_path, filesystem=fs) table = dataset.read() result_df = (table.to_pandas() .sort_values(by='index') @@ -735,15 +554,11 @@ def _partition_test_for_filesystem(fs, base_path, use_legacy_dataset=True): .reset_index(drop=True) .reindex(columns=result_df.columns)) - if use_legacy_dataset or Version(pd.__version__) < Version("2.0.0"): - expected_df['foo'] = pd.Categorical(df['foo'], categories=foo_keys) - expected_df['bar'] = pd.Categorical(df['bar'], categories=bar_keys) - else: - # With pandas 2.0.0 Index can store all numeric dtypes (not just - # int64/uint64/float64). Using astype() to create a categorical - # column preserves original dtype (int32) - expected_df['foo'] = expected_df['foo'].astype("category") - expected_df['bar'] = expected_df['bar'].astype("category") + # With pandas 2.0.0 Index can store all numeric dtypes (not just + # int64/uint64/float64). Using astype() to create a categorical + # column preserves original dtype (int32) + expected_df['foo'] = expected_df['foo'].astype("category") + expected_df['bar'] = expected_df['bar'].astype("category") assert (result_df.columns == ['index', 'values', 'foo', 'bar']).all() @@ -790,83 +605,6 @@ def _visit_level(base_dir, level, part_keys): _visit_level(base_dir, 0, []) -def _test_read_common_metadata_files(fs, base_path): - import pandas as pd - - import pyarrow.parquet as pq - - N = 100 - df = pd.DataFrame({ - 'index': np.arange(N), - 'values': np.random.randn(N) - }, columns=['index', 'values']) - - base_path = str(base_path) - data_path = os.path.join(base_path, 'data.parquet') - - table = pa.Table.from_pandas(df) - - with fs.open(data_path, 'wb') as f: - _write_table(table, f) - - metadata_path = os.path.join(base_path, '_common_metadata') - with fs.open(metadata_path, 'wb') as f: - pq.write_metadata(table.schema, f) - - dataset = pq.ParquetDataset(base_path, filesystem=fs, - use_legacy_dataset=True) - with pytest.warns(FutureWarning): - assert dataset.common_metadata_path == str(metadata_path) - - with fs.open(data_path) as f: - common_schema = pq.read_metadata(f).schema - assert dataset.schema.equals(common_schema) - - # handle list of one directory - dataset2 = pq.ParquetDataset([base_path], filesystem=fs, - use_legacy_dataset=True) - assert dataset2.schema.equals(dataset.schema) - - -@pytest.mark.pandas -@pytest.mark.filterwarnings("ignore:'ParquetDataset.schema:FutureWarning") -def test_read_common_metadata_files(tempdir): - fs = LocalFileSystem._get_instance() - _test_read_common_metadata_files(fs, tempdir) - - -@pytest.mark.pandas -@pytest.mark.filterwarnings("ignore:'ParquetDataset.schema:FutureWarning") -def test_read_metadata_files(tempdir): - fs = LocalFileSystem._get_instance() - - N = 100 - df = pd.DataFrame({ - 'index': np.arange(N), - 'values': np.random.randn(N) - }, columns=['index', 'values']) - - data_path = tempdir / 
'data.parquet' - - table = pa.Table.from_pandas(df) - - with fs.open(data_path, 'wb') as f: - _write_table(table, f) - - metadata_path = tempdir / '_metadata' - with fs.open(metadata_path, 'wb') as f: - pq.write_metadata(table.schema, f) - - dataset = pq.ParquetDataset(tempdir, filesystem=fs, - use_legacy_dataset=True) - with pytest.warns(FutureWarning): - assert dataset.metadata_path == str(metadata_path) - - with fs.open(data_path) as f: - metadata_schema = pq.read_metadata(f).schema - assert dataset.schema.equals(metadata_schema) - - def _filter_partition(df, part_keys): predicate = np.ones(len(df), dtype=bool) @@ -883,9 +621,8 @@ def _filter_partition(df, part_keys): return df[predicate].drop(to_drop, axis=1) -@parametrize_legacy_dataset @pytest.mark.pandas -def test_filter_before_validate_schema(tempdir, use_legacy_dataset): +def test_filter_before_validate_schema(tempdir): # ARROW-4076 apply filter before schema validation # to avoid checking unneeded schemas @@ -902,16 +639,12 @@ def test_filter_before_validate_schema(tempdir, use_legacy_dataset): pq.write_table(table2, dir2 / 'data.parquet') # read single file using filter - table = pq.read_table(tempdir, filters=[[('A', '==', 0)]], - use_legacy_dataset=use_legacy_dataset) + table = pq.read_table(tempdir, filters=[[('A', '==', 0)]]) assert table.column('B').equals(pa.chunked_array([[1, 2, 3]])) @pytest.mark.pandas -@pytest.mark.filterwarnings( - "ignore:Specifying the 'metadata':FutureWarning") -@parametrize_legacy_dataset -def test_read_multiple_files(tempdir, use_legacy_dataset): +def test_read_multiple_files(tempdir): nfiles = 10 size = 5 @@ -938,8 +671,7 @@ def test_read_multiple_files(tempdir, use_legacy_dataset): (dirpath / '_SUCCESS.crc').touch() def read_multiple_files(paths, columns=None, use_threads=True, **kwargs): - dataset = pq.ParquetDataset( - paths, use_legacy_dataset=use_legacy_dataset, **kwargs) + dataset = pq.ParquetDataset(paths, **kwargs) return dataset.read(columns=columns, use_threads=use_threads) result = read_multiple_files(paths) @@ -947,37 +679,18 @@ def read_multiple_files(paths, columns=None, use_threads=True, **kwargs): assert result.equals(expected) - # Read with provided metadata - # TODO(dataset) specifying metadata not yet supported - metadata = pq.read_metadata(paths[0]) - if use_legacy_dataset: - result2 = read_multiple_files(paths, metadata=metadata) - assert result2.equals(expected) - - with pytest.warns(FutureWarning, match="Specifying the 'schema'"): - result3 = pq.ParquetDataset(dirpath, schema=metadata.schema, - use_legacy_dataset=True).read() - assert result3.equals(expected) - else: - with pytest.raises(ValueError, match="no longer supported"): - pq.read_table(paths, metadata=metadata, use_legacy_dataset=False) - # Read column subset to_read = [0, 2, 6, result.num_columns - 1] col_names = [result.field(i).name for i in to_read] - out = pq.read_table( - dirpath, columns=col_names, use_legacy_dataset=use_legacy_dataset - ) + out = pq.read_table(dirpath, columns=col_names) expected = pa.Table.from_arrays([result.column(i) for i in to_read], names=col_names, metadata=result.schema.metadata) assert out.equals(expected) # Read with multiple threads - pq.read_table( - dirpath, use_threads=True, use_legacy_dataset=use_legacy_dataset - ) + pq.read_table(dirpath, use_threads=True) # Test failure modes with non-uniform metadata bad_apple = _test_dataframe(size, seed=i).iloc[:, :4] @@ -986,31 +699,24 @@ def read_multiple_files(paths, columns=None, use_threads=True, **kwargs): t = 
pa.Table.from_pandas(bad_apple) _write_table(t, bad_apple_path) - if not use_legacy_dataset: - # TODO(dataset) Dataset API skips bad files - return + # TODO(dataset) Dataset API skips bad files - bad_meta = pq.read_metadata(bad_apple_path) + # bad_meta = pq.read_metadata(bad_apple_path) - with pytest.raises(ValueError): - read_multiple_files(paths + [bad_apple_path]) + # with pytest.raises(ValueError): + # read_multiple_files(paths + [bad_apple_path]) - with pytest.raises(ValueError): - read_multiple_files(paths, metadata=bad_meta) + # with pytest.raises(ValueError): + # read_multiple_files(paths, metadata=bad_meta) - mixed_paths = [bad_apple_path, paths[0]] + # mixed_paths = [bad_apple_path, paths[0]] - with pytest.raises(ValueError): - with pytest.warns(FutureWarning, match="Specifying the 'schema'"): - read_multiple_files(mixed_paths, schema=bad_meta.schema) - - with pytest.raises(ValueError): - read_multiple_files(mixed_paths) + # with pytest.raises(ValueError): + # read_multiple_files(mixed_paths) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_dataset_read_pandas(tempdir, use_legacy_dataset): +def test_dataset_read_pandas(tempdir): nfiles = 5 size = 5 @@ -1033,7 +739,7 @@ def test_dataset_read_pandas(tempdir, use_legacy_dataset): frames.append(df) paths.append(path) - dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(dirpath) columns = ['uint8', 'strings'] result = dataset.read_pandas(columns=columns).to_pandas() expected = pd.concat([x[columns] for x in frames]) @@ -1047,10 +753,8 @@ def test_dataset_read_pandas(tempdir, use_legacy_dataset): tm.assert_frame_equal(result.reindex(columns=expected.columns), expected) -@pytest.mark.filterwarnings("ignore:'ParquetDataset:FutureWarning") @pytest.mark.pandas -@parametrize_legacy_dataset -def test_dataset_memory_map(tempdir, use_legacy_dataset): +def test_dataset_memory_map(tempdir): # ARROW-2627: Check that we can use ParquetDataset with memory-mapping dirpath = tempdir / guid() dirpath.mkdir() @@ -1061,15 +765,12 @@ def test_dataset_memory_map(tempdir, use_legacy_dataset): _write_table(table, path, version='2.6') dataset = pq.ParquetDataset( - dirpath, memory_map=True, use_legacy_dataset=use_legacy_dataset) + dirpath, memory_map=True) assert dataset.read().equals(table) - if use_legacy_dataset: - assert dataset.pieces[0].read().equals(table) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_dataset_enable_buffered_stream(tempdir, use_legacy_dataset): +def test_dataset_enable_buffered_stream(tempdir): dirpath = tempdir / guid() dirpath.mkdir() @@ -1080,19 +781,16 @@ def test_dataset_enable_buffered_stream(tempdir, use_legacy_dataset): with pytest.raises(ValueError): pq.ParquetDataset( - dirpath, buffer_size=-64, - use_legacy_dataset=use_legacy_dataset) + dirpath, buffer_size=-64) for buffer_size in [128, 1024]: dataset = pq.ParquetDataset( - dirpath, buffer_size=buffer_size, - use_legacy_dataset=use_legacy_dataset) + dirpath, buffer_size=buffer_size) assert dataset.read().equals(table) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_dataset_enable_pre_buffer(tempdir, use_legacy_dataset): +def test_dataset_enable_pre_buffer(tempdir): dirpath = tempdir / guid() dirpath.mkdir() @@ -1103,11 +801,9 @@ def test_dataset_enable_pre_buffer(tempdir, use_legacy_dataset): for pre_buffer in (True, False): dataset = pq.ParquetDataset( - dirpath, pre_buffer=pre_buffer, - use_legacy_dataset=use_legacy_dataset) + dirpath, pre_buffer=pre_buffer) assert 
dataset.read().equals(table) - actual = pq.read_table(dirpath, pre_buffer=pre_buffer, - use_legacy_dataset=use_legacy_dataset) + actual = pq.read_table(dirpath, pre_buffer=pre_buffer) assert actual.equals(table) @@ -1123,18 +819,14 @@ def _make_example_multifile_dataset(base_path, nfiles=10, file_nrows=5): return paths -def _assert_dataset_paths(dataset, paths, use_legacy_dataset): - if use_legacy_dataset: - assert set(map(str, paths)) == {x.path for x in dataset._pieces} - else: - paths = [str(path.as_posix()) for path in paths] - assert set(paths) == set(dataset._dataset.files) +def _assert_dataset_paths(dataset, paths): + paths = [str(path.as_posix()) for path in paths] + assert set(paths) == set(dataset.files) @pytest.mark.pandas -@parametrize_legacy_dataset @pytest.mark.parametrize('dir_prefix', ['_', '.']) -def test_ignore_private_directories(tempdir, dir_prefix, use_legacy_dataset): +def test_ignore_private_directories(tempdir, dir_prefix): dirpath = tempdir / guid() dirpath.mkdir() @@ -1144,14 +836,13 @@ def test_ignore_private_directories(tempdir, dir_prefix, use_legacy_dataset): # private directory (dirpath / '{}staging'.format(dir_prefix)).mkdir() - dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(dirpath) - _assert_dataset_paths(dataset, paths, use_legacy_dataset) + _assert_dataset_paths(dataset, paths) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_ignore_hidden_files_dot(tempdir, use_legacy_dataset): +def test_ignore_hidden_files_dot(tempdir): dirpath = tempdir / guid() dirpath.mkdir() @@ -1164,14 +855,13 @@ def test_ignore_hidden_files_dot(tempdir, use_legacy_dataset): with (dirpath / '.private').open('wb') as f: f.write(b'gibberish') - dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(dirpath) - _assert_dataset_paths(dataset, paths, use_legacy_dataset) + _assert_dataset_paths(dataset, paths) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_ignore_hidden_files_underscore(tempdir, use_legacy_dataset): +def test_ignore_hidden_files_underscore(tempdir): dirpath = tempdir / guid() dirpath.mkdir() @@ -1184,17 +874,14 @@ def test_ignore_hidden_files_underscore(tempdir, use_legacy_dataset): with (dirpath / '_started_321').open('wb') as f: f.write(b'abcd') - dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(dirpath) - _assert_dataset_paths(dataset, paths, use_legacy_dataset) + _assert_dataset_paths(dataset, paths) @pytest.mark.pandas -@parametrize_legacy_dataset @pytest.mark.parametrize('dir_prefix', ['_', '.']) -def test_ignore_no_private_directories_in_base_path( - tempdir, dir_prefix, use_legacy_dataset -): +def test_ignore_no_private_directories_in_base_path(tempdir, dir_prefix): # ARROW-8427 - don't ignore explicitly listed files if parent directory # is a private directory dirpath = tempdir / "{0}data".format(dir_prefix) / guid() @@ -1203,17 +890,15 @@ def test_ignore_no_private_directories_in_base_path( paths = _make_example_multifile_dataset(dirpath, nfiles=10, file_nrows=5) - dataset = pq.ParquetDataset(paths, use_legacy_dataset=use_legacy_dataset) - _assert_dataset_paths(dataset, paths, use_legacy_dataset) + dataset = pq.ParquetDataset(paths) + _assert_dataset_paths(dataset, paths) # ARROW-9644 - don't ignore full directory with underscore in base path - dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) - _assert_dataset_paths(dataset, paths, 
use_legacy_dataset) + dataset = pq.ParquetDataset(dirpath) + _assert_dataset_paths(dataset, paths) -@pytest.mark.pandas -@parametrize_legacy_dataset_fixed -def test_ignore_custom_prefixes(tempdir, use_legacy_dataset): +def test_ignore_custom_prefixes(tempdir): # ARROW-9573 - allow override of default ignore_prefixes part = ["xxx"] * 3 + ["yyy"] * 3 table = pa.table([ @@ -1221,7 +906,6 @@ def test_ignore_custom_prefixes(tempdir, use_legacy_dataset): pa.array(part).dictionary_encode(), ], names=['index', '_part']) - # TODO use_legacy_dataset ARROW-10247 pq.write_to_dataset(table, str(tempdir), partition_cols=['_part']) private_duplicate = tempdir / '_private_duplicate' @@ -1230,29 +914,23 @@ def test_ignore_custom_prefixes(tempdir, use_legacy_dataset): partition_cols=['_part']) read = pq.read_table( - tempdir, use_legacy_dataset=use_legacy_dataset, - ignore_prefixes=['_private']) + tempdir, ignore_prefixes=['_private']) assert read.equals(table) -@parametrize_legacy_dataset_fixed -def test_empty_directory(tempdir, use_legacy_dataset): - # ARROW-5310 - reading empty directory - # fails with legacy implementation +def test_empty_directory(tempdir): + # ARROW-5310 empty_dir = tempdir / 'dataset' empty_dir.mkdir() - dataset = pq.ParquetDataset( - empty_dir, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(empty_dir) result = dataset.read() assert result.num_rows == 0 assert result.num_columns == 0 -@pytest.mark.filterwarnings("ignore:'ParquetDataset.schema:FutureWarning") def _test_write_to_dataset_with_partitions(base_path, - use_legacy_dataset=True, filesystem=None, schema=None, index_name=None): @@ -1275,8 +953,7 @@ def _test_write_to_dataset_with_partitions(base_path, output_table = pa.Table.from_pandas(output_df, schema=schema, safe=False, preserve_index=False) pq.write_to_dataset(output_table, base_path, partition_by, - filesystem=filesystem, - use_legacy_dataset=use_legacy_dataset) + filesystem=filesystem) metadata_path = os.path.join(str(base_path), '_common_metadata') @@ -1286,19 +963,11 @@ def _test_write_to_dataset_with_partitions(base_path, else: pq.write_metadata(output_table.schema, metadata_path) - # ARROW-2891: Ensure the output_schema is preserved when writing a - # partitioned dataset dataset = pq.ParquetDataset(base_path, - filesystem=filesystem, - validate_schema=True, - use_legacy_dataset=use_legacy_dataset) + filesystem=filesystem) # ARROW-2209: Ensure the dataset schema also includes the partition columns - if use_legacy_dataset: - with pytest.warns(FutureWarning, match="'ParquetDataset.schema'"): - dataset_cols = set(dataset.schema.to_arrow_schema().names) - else: - # NB schema property is an arrow and not parquet schema - dataset_cols = set(dataset.schema.names) + # NB schema property is an arrow and not parquet schema + dataset_cols = set(dataset.schema.names) assert dataset_cols == set(output_table.schema.names) @@ -1323,7 +992,6 @@ def _test_write_to_dataset_with_partitions(base_path, def _test_write_to_dataset_no_partitions(base_path, - use_legacy_dataset=True, filesystem=None): import pandas as pd @@ -1347,7 +1015,6 @@ def _test_write_to_dataset_no_partitions(base_path, n = 5 for i in range(n): pq.write_to_dataset(output_table, base_path, - use_legacy_dataset=use_legacy_dataset, filesystem=filesystem) output_files = [file for file in filesystem.ls(str(base_path)) if file.endswith(".parquet")] @@ -1356,8 +1023,7 @@ def _test_write_to_dataset_no_partitions(base_path, # Deduplicated incoming DataFrame should match # original outgoing Dataframe 
input_table = pq.ParquetDataset( - base_path, filesystem=filesystem, - use_legacy_dataset=use_legacy_dataset + base_path, filesystem=filesystem ).read() input_df = input_table.to_pandas() input_df = input_df.drop_duplicates() @@ -1366,131 +1032,71 @@ def _test_write_to_dataset_no_partitions(base_path, @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_with_partitions(tempdir, use_legacy_dataset): - _test_write_to_dataset_with_partitions(str(tempdir), use_legacy_dataset) +def test_write_to_dataset_with_partitions(tempdir): + _test_write_to_dataset_with_partitions(str(tempdir)) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_with_partitions_and_schema( - tempdir, use_legacy_dataset -): +def test_write_to_dataset_with_partitions_and_schema(tempdir): schema = pa.schema([pa.field('group1', type=pa.string()), pa.field('group2', type=pa.string()), pa.field('num', type=pa.int64()), pa.field('nan', type=pa.int32()), pa.field('date', type=pa.timestamp(unit='us'))]) _test_write_to_dataset_with_partitions( - str(tempdir), use_legacy_dataset, schema=schema) + str(tempdir), schema=schema) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_with_partitions_and_index_name( - tempdir, use_legacy_dataset -): +def test_write_to_dataset_with_partitions_and_index_name(tempdir): _test_write_to_dataset_with_partitions( - str(tempdir), use_legacy_dataset, index_name='index_name') + str(tempdir), index_name='index_name') @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_no_partitions(tempdir, use_legacy_dataset): - _test_write_to_dataset_no_partitions(str(tempdir), use_legacy_dataset) +def test_write_to_dataset_no_partitions(tempdir): + _test_write_to_dataset_no_partitions(str(tempdir)) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_pathlib(tempdir, use_legacy_dataset): - _test_write_to_dataset_with_partitions( - tempdir / "test1", use_legacy_dataset) - _test_write_to_dataset_no_partitions( - tempdir / "test2", use_legacy_dataset) +def test_write_to_dataset_pathlib(tempdir): + _test_write_to_dataset_with_partitions(tempdir / "test1") + _test_write_to_dataset_no_partitions(tempdir / "test2") @pytest.mark.pandas @pytest.mark.s3 -@parametrize_legacy_dataset -def test_write_to_dataset_pathlib_nonlocal( - tempdir, s3_example_s3fs, use_legacy_dataset -): +def test_write_to_dataset_pathlib_nonlocal(tempdir, s3_example_s3fs): # pathlib paths are only accepted for local files fs, _ = s3_example_s3fs with pytest.raises(TypeError, match="path-like objects are only allowed"): _test_write_to_dataset_with_partitions( - tempdir / "test1", use_legacy_dataset, filesystem=fs) + tempdir / "test1", filesystem=fs) with pytest.raises(TypeError, match="path-like objects are only allowed"): _test_write_to_dataset_no_partitions( - tempdir / "test2", use_legacy_dataset, filesystem=fs) + tempdir / "test2", filesystem=fs) @pytest.mark.pandas @pytest.mark.s3 -@parametrize_legacy_dataset -def test_write_to_dataset_with_partitions_s3fs( - s3_example_s3fs, use_legacy_dataset -): +def test_write_to_dataset_with_partitions_s3fs(s3_example_s3fs): fs, path = s3_example_s3fs _test_write_to_dataset_with_partitions( - path, use_legacy_dataset, filesystem=fs) + path, filesystem=fs) @pytest.mark.pandas @pytest.mark.s3 -@parametrize_legacy_dataset -def test_write_to_dataset_no_partitions_s3fs( - s3_example_s3fs, use_legacy_dataset -): +def test_write_to_dataset_no_partitions_s3fs(s3_example_s3fs): fs, path = s3_example_s3fs 
_test_write_to_dataset_no_partitions( - path, use_legacy_dataset, filesystem=fs) + path, filesystem=fs) -@pytest.mark.filterwarnings( - "ignore:'ParquetDataset:FutureWarning", - "ignore:'partition_filename_cb':FutureWarning") -@pytest.mark.pandas -@parametrize_legacy_dataset_not_supported -def test_write_to_dataset_with_partitions_and_custom_filenames( - tempdir, use_legacy_dataset -): - output_df = pd.DataFrame({'group1': list('aaabbbbccc'), - 'group2': list('eefeffgeee'), - 'num': list(range(10)), - 'nan': [np.nan] * 10, - 'date': np.arange('2017-01-01', '2017-01-11', - dtype='datetime64[D]')}) - partition_by = ['group1', 'group2'] - output_table = pa.Table.from_pandas(output_df) - path = str(tempdir) - - def partition_filename_callback(keys): - return "{}-{}.parquet".format(*keys) - - pq.write_to_dataset(output_table, path, - partition_by, partition_filename_callback, - use_legacy_dataset=use_legacy_dataset) - - dataset = pq.ParquetDataset(path, use_legacy_dataset=use_legacy_dataset) - - # ARROW-3538: Ensure partition filenames match the given pattern - # defined in the local function partition_filename_callback - expected_basenames = [ - 'a-e.parquet', 'a-f.parquet', - 'b-e.parquet', 'b-f.parquet', - 'b-g.parquet', 'c-e.parquet' - ] - output_basenames = [os.path.basename(p.path) for p in dataset.pieces] - - assert sorted(expected_basenames) == sorted(output_basenames) - - -@pytest.mark.dataset @pytest.mark.pandas def test_write_to_dataset_filesystem(tempdir): df = pd.DataFrame({'A': [1, 2, 3]}) @@ -1502,7 +1108,7 @@ def test_write_to_dataset_filesystem(tempdir): assert result.equals(table) -def _make_dataset_for_pickling(tempdir, use_legacy_dataset=False, N=100): +def _make_dataset_for_pickling(tempdir, N=100): path = tempdir / 'data.parquet' fs = LocalFileSystem._get_instance() @@ -1525,42 +1131,22 @@ def _make_dataset_for_pickling(tempdir, use_legacy_dataset=False, N=100): pq.write_metadata(table.schema, f) dataset = pq.ParquetDataset( - tempdir, filesystem=fs, use_legacy_dataset=use_legacy_dataset) - if use_legacy_dataset: - with pytest.warns(FutureWarning): - assert dataset.metadata_path == str(metadata_path) + tempdir, filesystem=fs) return dataset @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pickle_dataset(tempdir, datadir, use_legacy_dataset, pickle_module): +def test_pickle_dataset(tempdir, pickle_module): def is_pickleable(obj): return obj == pickle_module.loads(pickle_module.dumps(obj)) - dataset = _make_dataset_for_pickling(tempdir, use_legacy_dataset) + dataset = _make_dataset_for_pickling(tempdir) assert is_pickleable(dataset) - if use_legacy_dataset: - with pytest.warns(FutureWarning): - metadata = dataset.metadata - assert is_pickleable(metadata) - assert is_pickleable(metadata.schema) - assert len(metadata.schema) - for column in metadata.schema: - assert is_pickleable(column) - - for piece in dataset._pieces: - assert is_pickleable(piece) - metadata = piece.get_metadata() - assert metadata.num_row_groups - for i in range(metadata.num_row_groups): - assert is_pickleable(metadata.row_group(i)) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_partitioned_dataset(tempdir, use_legacy_dataset): +def test_partitioned_dataset(tempdir): # ARROW-3208: Segmentation fault when reading a Parquet partitioned dataset # to a Parquet file path = tempdir / "ARROW-3208" @@ -1571,27 +1157,20 @@ def test_partitioned_dataset(tempdir, use_legacy_dataset): }) table = pa.Table.from_pandas(df) pq.write_to_dataset(table, root_path=str(path), - partition_cols=['one', 
'two'], - use_legacy_dataset=use_legacy_dataset) - table = pq.ParquetDataset( - path, use_legacy_dataset=use_legacy_dataset).read() + partition_cols=['one', 'two']) + table = pq.ParquetDataset(path).read() pq.write_table(table, path / "output.parquet") -@pytest.mark.pandas -@parametrize_legacy_dataset -def test_dataset_read_dictionary(tempdir, use_legacy_dataset): +def test_dataset_read_dictionary(tempdir): path = tempdir / "ARROW-3325-dataset" t1 = pa.table([[util.rands(10) for i in range(5)] * 10], names=['f0']) t2 = pa.table([[util.rands(10) for i in range(5)] * 10], names=['f0']) - pq.write_to_dataset(t1, root_path=str(path), - use_legacy_dataset=use_legacy_dataset) - pq.write_to_dataset(t2, root_path=str(path), - use_legacy_dataset=use_legacy_dataset) + pq.write_to_dataset(t1, root_path=str(path)) + pq.write_to_dataset(t2, root_path=str(path)) result = pq.ParquetDataset( - path, read_dictionary=['f0'], - use_legacy_dataset=use_legacy_dataset).read() + path, read_dictionary=['f0']).read() # The order of the chunks is non-deterministic ex_chunks = [t1[0].chunk(0).dictionary_encode(), @@ -1606,9 +1185,6 @@ def test_dataset_read_dictionary(tempdir, use_legacy_dataset): assert c1.equals(ex_chunks[0]) -@pytest.mark.dataset -@pytest.mark.pandas -@pytest.mark.filterwarnings("ignore:Passing 'use_legacy:FutureWarning") def test_read_table_schema(tempdir): # test that schema keyword is passed through in read_table table = pa.table({'a': pa.array([1, 2, 3], pa.int32())}) @@ -1627,42 +1203,24 @@ def test_read_table_schema(tempdir): expected = pa.table({'a': [1, 2, 3, 1, 2, 3]}, schema=schema) assert result.equals(expected) - # don't allow it with the legacy reader - with pytest.raises( - ValueError, match="The 'schema' argument is only supported" - ): - pq.read_table(tempdir / "data.parquet", schema=schema, - use_legacy_dataset=True) - - # using ParquetDataset directory with non-legacy implementation - result = pq.ParquetDataset( - tempdir, schema=schema, use_legacy_dataset=False - ) + result = pq.ParquetDataset(tempdir, schema=schema) expected = pa.table({'a': [1, 2, 3, 1, 2, 3]}, schema=schema) assert result.read().equals(expected) -@pytest.mark.dataset -def test_dataset_unsupported_keywords(): - - with pytest.raises(ValueError, match="not yet supported with the new"): - pq.ParquetDataset("", use_legacy_dataset=False, metadata=pa.schema([])) +def test_read_table_duplicate_column_selection(tempdir): + # test that duplicate column selection gives duplicate columns + table = pa.table({'a': pa.array([1, 2, 3], pa.int32()), + 'b': pa.array([1, 2, 3], pa.uint8())}) + pq.write_table(table, tempdir / "data.parquet") - with pytest.raises(ValueError, match="not yet supported with the new"): - pq.ParquetDataset("", use_legacy_dataset=False, validate_schema=False) + result = pq.read_table(tempdir / "data.parquet", columns=['a', 'a']) + expected_schema = pa.schema([('a', 'int32'), ('a', 'int32')]) - with pytest.raises(ValueError, match="not yet supported with the new"): - pq.ParquetDataset("", use_legacy_dataset=False, split_row_groups=True) + assert result.column_names == ['a', 'a'] + assert result.schema == expected_schema - with pytest.raises(ValueError, match="not yet supported with the new"): - pq.ParquetDataset("", use_legacy_dataset=False, metadata_nthreads=4) - with pytest.raises(ValueError, match="no longer supported"): - pq.read_table("", use_legacy_dataset=False, metadata=pa.schema([])) - - -@pytest.mark.dataset -@pytest.mark.filterwarnings("ignore:Passing 'use_legacy:FutureWarning") def 
test_dataset_partitioning(tempdir): import pyarrow.dataset as ds @@ -1679,42 +1237,25 @@ def test_dataset_partitioning(tempdir): # read_table part = ds.partitioning(field_names=["year", "month", "day"]) result = pq.read_table( - str(root_path), partitioning=part, use_legacy_dataset=False) + str(root_path), partitioning=part) assert result.column_names == ["a", "year", "month", "day"] result = pq.ParquetDataset( - str(root_path), partitioning=part, use_legacy_dataset=False).read() + str(root_path), partitioning=part).read() assert result.column_names == ["a", "year", "month", "day"] - # This raises an error for legacy dataset - with pytest.raises(ValueError): - pq.read_table( - str(root_path), partitioning=part, use_legacy_dataset=True) - - with pytest.raises(ValueError): - pq.ParquetDataset( - str(root_path), partitioning=part, use_legacy_dataset=True) - -@pytest.mark.dataset def test_parquet_dataset_new_filesystem(tempdir): # Ensure we can pass new FileSystem object to ParquetDataset - # (use new implementation automatically without specifying - # use_legacy_dataset=False) table = pa.table({'a': [1, 2, 3]}) pq.write_table(table, tempdir / 'data.parquet') - # don't use simple LocalFileSystem (as that gets mapped to legacy one) filesystem = fs.SubTreeFileSystem(str(tempdir), fs.LocalFileSystem()) dataset = pq.ParquetDataset('.', filesystem=filesystem) result = dataset.read() assert result.equals(table) -@pytest.mark.filterwarnings("ignore:'ParquetDataset:FutureWarning") -@parametrize_legacy_dataset -def test_parquet_dataset_partitions_piece_path_with_fsspec( - tempdir, use_legacy_dataset -): +def test_parquet_dataset_partitions_piece_path_with_fsspec(tempdir): # ARROW-10462 ensure that on Windows we properly use posix-style paths # as used by fsspec fsspec = pytest.importorskip("fsspec") @@ -1725,109 +1266,12 @@ def test_parquet_dataset_partitions_piece_path_with_fsspec( # pass a posix-style path (using "/" also on Windows) path = str(tempdir).replace("\\", "/") dataset = pq.ParquetDataset( - path, filesystem=filesystem, use_legacy_dataset=use_legacy_dataset) + path, filesystem=filesystem) # ensure the piece path is also posix-style expected = path + "/data.parquet" - assert dataset.pieces[0].path == expected - - -@pytest.mark.dataset -def test_parquet_dataset_deprecated_properties(tempdir): - table = pa.table({'a': [1, 2, 3]}) - path = tempdir / 'data.parquet' - pq.write_table(table, path) - dataset = pq.ParquetDataset(path, use_legacy_dataset=True) - - with pytest.warns(FutureWarning, match="'ParquetDataset.pieces"): - dataset.pieces - - with pytest.warns(FutureWarning, match="'ParquetDataset.partitions"): - dataset.partitions - - with pytest.warns(FutureWarning, match="'ParquetDataset.memory_map"): - dataset.memory_map - - with pytest.warns(FutureWarning, match="'ParquetDataset.read_dictio"): - dataset.read_dictionary - - with pytest.warns(FutureWarning, match="'ParquetDataset.buffer_size"): - dataset.buffer_size - - with pytest.warns(FutureWarning, match="'ParquetDataset.fs"): - dataset.fs - - with pytest.warns(FutureWarning, match="'ParquetDataset.schema'"): - dataset.schema - - with pytest.warns(FutureWarning, match="'ParquetDataset.common_metadata'"): - dataset.common_metadata - - with pytest.warns(FutureWarning, match="'ParquetDataset.metadata"): - dataset.metadata + assert dataset.fragments[0].path == expected - with pytest.warns(FutureWarning, match="'ParquetDataset.metadata_path"): - dataset.metadata_path - with pytest.warns(FutureWarning, - 
match="'ParquetDataset.common_metadata_path"): - dataset.common_metadata_path - - dataset2 = pq.ParquetDataset(path, use_legacy_dataset=False) - - with pytest.warns(FutureWarning, match="'ParquetDataset.pieces"): - dataset2.pieces - - -@pytest.mark.dataset -def test_parquet_write_to_dataset_deprecated_properties(tempdir): - table = pa.table({'a': [1, 2, 3]}) - path = tempdir / 'data.parquet' - - with pytest.warns(FutureWarning, - match="Passing 'use_legacy_dataset=True'"): - pq.write_to_dataset(table, path, use_legacy_dataset=True) - - # check also that legacy implementation is set when - # partition_filename_cb is specified - with pytest.warns(FutureWarning, - match="Passing 'use_legacy_dataset=True'"): - pq.write_to_dataset(table, path, - partition_filename_cb=lambda x: 'filename.parquet') - - -@pytest.mark.dataset -def test_parquet_write_to_dataset_unsupported_keywords_in_legacy(tempdir): - table = pa.table({'a': [1, 2, 3]}) - path = tempdir / 'data.parquet' - - with pytest.raises(ValueError, match="schema"): - pq.write_to_dataset(table, path, use_legacy_dataset=True, - schema=pa.schema([ - ('a', pa.int32()) - ])) - - with pytest.raises(ValueError, match="partitioning"): - pq.write_to_dataset(table, path, use_legacy_dataset=True, - partitioning=["a"]) - - with pytest.raises(ValueError, match="use_threads"): - pq.write_to_dataset(table, path, use_legacy_dataset=True, - use_threads=False) - - with pytest.raises(ValueError, match="file_visitor"): - pq.write_to_dataset(table, path, use_legacy_dataset=True, - file_visitor=lambda x: x) - - with pytest.raises(ValueError, match="existing_data_behavior"): - pq.write_to_dataset(table, path, use_legacy_dataset=True, - existing_data_behavior='error') - - with pytest.raises(ValueError, match="basename_template"): - pq.write_to_dataset(table, path, use_legacy_dataset=True, - basename_template='part-{i}.parquet') - - -@pytest.mark.dataset def test_parquet_write_to_dataset_exposed_keywords(tempdir): table = pa.table({'a': [1, 2, 3]}) path = tempdir / 'partitioning' @@ -1841,8 +1285,7 @@ def file_visitor(written_file): pq.write_to_dataset(table, path, partitioning=["a"], file_visitor=file_visitor, - basename_template=basename_template, - use_legacy_dataset=False) + basename_template=basename_template) expected_paths = { path / '1' / 'part-0.parquet', @@ -1853,53 +1296,6 @@ def file_visitor(written_file): assert paths_written_set == expected_paths -@pytest.mark.dataset -def test_write_to_dataset_conflicting_keywords(tempdir): - table = pa.table({'a': [1, 2, 3]}) - path = tempdir / 'data.parquet' - - with pytest.raises(ValueError, match="'basename_template' argument " - "is not supported by use_legacy_dataset=True"): - pq.write_to_dataset(table, path, - use_legacy_dataset=True, - partition_filename_cb=lambda x: 'filename.parquet', - basename_template='file-{i}.parquet') - with pytest.raises(ValueError, match="'partition_filename_cb' argument " - "is not supported by use_legacy_dataset=False"): - pq.write_to_dataset(table, path, - use_legacy_dataset=False, - partition_filename_cb=lambda x: 'filename.parquet', - basename_template='file-{i}.parquet') - - with pytest.raises(ValueError, match="'partitioning' argument " - "is not supported by use_legacy_dataset=True"): - pq.write_to_dataset(table, path, - use_legacy_dataset=True, - partition_cols=["a"], - partitioning=["a"]) - - with pytest.raises(ValueError, match="'partition_cols' argument " - "is not supported by use_legacy_dataset=False"): - pq.write_to_dataset(table, path, - use_legacy_dataset=False, - 
partition_cols=["a"], - partitioning=["a"]) - - with pytest.raises(ValueError, match="'file_visitor' argument " - "is not supported by use_legacy_dataset=True"): - pq.write_to_dataset(table, path, - use_legacy_dataset=True, - metadata_collector=[], - file_visitor=lambda x: x) - with pytest.raises(ValueError, match="'metadata_collector' argument " - "is not supported by use_legacy_dataset=False"): - pq.write_to_dataset(table, path, - use_legacy_dataset=False, - metadata_collector=[], - file_visitor=lambda x: x) - - -@pytest.mark.dataset @pytest.mark.parametrize("write_dataset_kwarg", ( ("create_dir", True), ("create_dir", False), @@ -1926,8 +1322,7 @@ def test_write_to_dataset_kwargs_passed(tempdir, write_dataset_kwarg): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_category_observed(tempdir, use_legacy_dataset): +def test_write_to_dataset_category_observed(tempdir): # if we partition on a categorical variable with "unobserved" categories # (values present in the dictionary, but not in the actual data) # ensure those are not creating empty files/directories @@ -1938,8 +1333,7 @@ def test_write_to_dataset_category_observed(tempdir, use_legacy_dataset): table = pa.table(df) path = tempdir / "dataset" pq.write_to_dataset( - table, tempdir / "dataset", partition_cols=["cat"], - use_legacy_dataset=use_legacy_dataset + table, tempdir / "dataset", partition_cols=["cat"] ) subdirs = [f.name for f in path.iterdir() if f.is_dir()] assert len(subdirs) == 2 diff --git a/python/pyarrow/tests/parquet/test_datetime.py b/python/pyarrow/tests/parquet/test_datetime.py index f97c451df7ad7..6a9cbd4f73d4f 100644 --- a/python/pyarrow/tests/parquet/test_datetime.py +++ b/python/pyarrow/tests/parquet/test_datetime.py @@ -23,8 +23,7 @@ import pytest import pyarrow as pa -from pyarrow.tests.parquet.common import ( - _check_roundtrip, parametrize_legacy_dataset) +from pyarrow.tests.parquet.common import _check_roundtrip try: import pyarrow.parquet as pq @@ -48,8 +47,7 @@ @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_datetime_tz(use_legacy_dataset): +def test_pandas_parquet_datetime_tz(): # Pandas v2 defaults to [ns], but Arrow defaults to [us] time units # so we need to cast the pandas dtype. Pandas v1 will always silently # coerce to [ns] due to lack of non-[ns] support. 
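For context around these hunks: this commit removes the `use_legacy_dataset` code paths from the parquet test suite, leaving only the dataset-based implementation. A minimal sketch of the surviving API that the updated tests exercise is shown below; it is an illustration only, not part of the patch. The table contents and the `example_dataset` path are invented, and the printed column order is indicative.

```python
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"a": [1, 2, 3], "part": ["x", "x", "y"]})

# write_to_dataset now always uses the dataset implementation; the
# use_legacy_dataset flag and partition_filename_cb are gone.
pq.write_to_dataset(table, "example_dataset", partition_cols=["part"])

# ParquetDataset likewise has a single implementation: partition columns are
# discovered automatically and files are exposed as .fragments (the legacy
# .pieces/.partitions properties no longer exist).
dataset = pq.ParquetDataset("example_dataset")
print(dataset.read().column_names)              # e.g. ['a', 'part']
print([frag.path for frag in dataset.fragments])
```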
@@ -69,21 +67,19 @@ def test_pandas_parquet_datetime_tz(use_legacy_dataset): _write_table(arrow_table, f) f.seek(0) - table_read = pq.read_pandas(f, use_legacy_dataset=use_legacy_dataset) + table_read = pq.read_pandas(f) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_datetime_timezone_tzinfo(use_legacy_dataset): +def test_datetime_timezone_tzinfo(): value = datetime.datetime(2018, 1, 1, 1, 23, 45, tzinfo=datetime.timezone.utc) df = pd.DataFrame({'foo': [value]}) - _roundtrip_pandas_dataframe( - df, write_kwargs={}, use_legacy_dataset=use_legacy_dataset) + _roundtrip_pandas_dataframe(df, write_kwargs={}) @pytest.mark.pandas diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py index 0ed305bff1945..f194d12876968 100644 --- a/python/pyarrow/tests/parquet/test_pandas.py +++ b/python/pyarrow/tests/parquet/test_pandas.py @@ -23,8 +23,6 @@ import pyarrow as pa from pyarrow.fs import LocalFileSystem, SubTreeFileSystem -from pyarrow.tests.parquet.common import ( - parametrize_legacy_dataset, parametrize_legacy_dataset_not_supported) from pyarrow.util import guid from pyarrow.vendored.version import Version @@ -101,8 +99,7 @@ def test_merging_parquet_tables_with_different_pandas_metadata(tempdir): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_column_multiindex(tempdir, use_legacy_dataset): +def test_pandas_parquet_column_multiindex(tempdir): df = alltypes_sample(size=10) df.columns = pd.MultiIndex.from_tuples( list(zip(df.columns, df.columns[::-1])), @@ -115,17 +112,13 @@ def test_pandas_parquet_column_multiindex(tempdir, use_legacy_dataset): _write_table(arrow_table, filename) - table_read = pq.read_pandas( - filename, use_legacy_dataset=use_legacy_dataset) + table_read = pq.read_pandas(filename) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written( - tempdir, use_legacy_dataset -): +def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(tempdir): df = alltypes_sample(size=10000) filename = tempdir / 'pandas_roundtrip.parquet' @@ -137,8 +130,7 @@ def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written( assert js['columns'] _write_table(arrow_table, filename) - table_read = pq.read_pandas( - filename, use_legacy_dataset=use_legacy_dataset) + table_read = pq.read_pandas(filename) js = table_read.schema.pandas_metadata assert not js['index_columns'] @@ -150,52 +142,20 @@ def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written( tm.assert_frame_equal(df, df_read) -# TODO(dataset) duplicate column selection actually gives duplicate columns now -@pytest.mark.pandas -@parametrize_legacy_dataset_not_supported -def test_pandas_column_selection(tempdir, use_legacy_dataset): - size = 10000 - np.random.seed(0) - df = pd.DataFrame({ - 'uint8': np.arange(size, dtype=np.uint8), - 'uint16': np.arange(size, dtype=np.uint16) - }) - filename = tempdir / 'pandas_roundtrip.parquet' - arrow_table = pa.Table.from_pandas(df) - _write_table(arrow_table, filename) - table_read = _read_table( - filename, columns=['uint8'], use_legacy_dataset=use_legacy_dataset) - df_read = table_read.to_pandas() - - tm.assert_frame_equal(df[['uint8']], df_read) - - # ARROW-4267: Selection of duplicate columns still leads to these columns - # being read uniquely. 
- table_read = _read_table( - filename, columns=['uint8', 'uint8'], - use_legacy_dataset=use_legacy_dataset) - df_read = table_read.to_pandas() - - tm.assert_frame_equal(df[['uint8']], df_read) - - @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_native_file_roundtrip(tempdir, use_legacy_dataset): +def test_pandas_parquet_native_file_roundtrip(): df = _test_dataframe(10000) arrow_table = pa.Table.from_pandas(df) imos = pa.BufferOutputStream() _write_table(arrow_table, imos, version='2.6') buf = imos.getvalue() reader = pa.BufferReader(buf) - df_read = _read_table( - reader, use_legacy_dataset=use_legacy_dataset).to_pandas() + df_read = _read_table(reader).to_pandas() tm.assert_frame_equal(df, df_read) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_read_pandas_column_subset(tempdir, use_legacy_dataset): +def test_read_pandas_column_subset(): df = _test_dataframe(10000) arrow_table = pa.Table.from_pandas(df) imos = pa.BufferOutputStream() @@ -204,27 +164,24 @@ def test_read_pandas_column_subset(tempdir, use_legacy_dataset): reader = pa.BufferReader(buf) df_read = pq.read_pandas( reader, columns=['strings', 'uint8'], - use_legacy_dataset=use_legacy_dataset ).to_pandas() tm.assert_frame_equal(df[['strings', 'uint8']], df_read) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_empty_roundtrip(tempdir, use_legacy_dataset): +def test_pandas_parquet_empty_roundtrip(): df = _test_dataframe(0) arrow_table = pa.Table.from_pandas(df) imos = pa.BufferOutputStream() _write_table(arrow_table, imos, version='2.6') buf = imos.getvalue() reader = pa.BufferReader(buf) - df_read = _read_table( - reader, use_legacy_dataset=use_legacy_dataset).to_pandas() + df_read = _read_table(reader).to_pandas() tm.assert_frame_equal(df, df_read) @pytest.mark.pandas -def test_pandas_can_write_nested_data(tempdir): +def test_pandas_can_write_nested_data(): data = { "agg_col": [ {"page_type": 1}, @@ -241,8 +198,7 @@ def test_pandas_can_write_nested_data(tempdir): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_pyfile_roundtrip(tempdir, use_legacy_dataset): +def test_pandas_parquet_pyfile_roundtrip(tempdir): filename = tempdir / 'pandas_pyfile_roundtrip.parquet' size = 5 df = pd.DataFrame({ @@ -260,14 +216,13 @@ def test_pandas_parquet_pyfile_roundtrip(tempdir, use_legacy_dataset): data = io.BytesIO(filename.read_bytes()) - table_read = _read_table(data, use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(data) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset): +def test_pandas_parquet_configuration_options(tempdir): size = 10000 np.random.seed(0) df = pd.DataFrame({ @@ -289,16 +244,14 @@ def test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset): for use_dictionary in [True, False]: _write_table(arrow_table, filename, version='2.6', use_dictionary=use_dictionary) - table_read = _read_table( - filename, use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(filename) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) for write_statistics in [True, False]: _write_table(arrow_table, filename, version='2.6', write_statistics=write_statistics) - table_read = _read_table(filename, - use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(filename) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -308,8 +261,7 @@ def 
test_pandas_parquet_configuration_options(tempdir, use_legacy_dataset): continue _write_table(arrow_table, filename, version='2.6', compression=compression) - table_read = _read_table( - filename, use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(filename) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -327,8 +279,7 @@ def test_spark_flavor_preserves_pandas_metadata(): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_index_column_name_duplicate(tempdir, use_legacy_dataset): +def test_index_column_name_duplicate(tempdir): data = { 'close': { pd.Timestamp('2017-06-30 01:31:00'): 154.99958999999998, @@ -352,14 +303,13 @@ def test_index_column_name_duplicate(tempdir, use_legacy_dataset): tdfx = pa.Table.from_pandas(dfx) _write_table(tdfx, path) - arrow_table = _read_table(path, use_legacy_dataset=use_legacy_dataset) + arrow_table = _read_table(path) result_df = arrow_table.to_pandas() tm.assert_frame_equal(result_df, dfx) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_multiindex_duplicate_values(tempdir, use_legacy_dataset): +def test_multiindex_duplicate_values(tempdir): num_rows = 3 numbers = list(range(num_rows)) index = pd.MultiIndex.from_arrays( @@ -373,7 +323,7 @@ def test_multiindex_duplicate_values(tempdir, use_legacy_dataset): filename = tempdir / 'dup_multi_index_levels.parquet' _write_table(table, filename) - result_table = _read_table(filename, use_legacy_dataset=use_legacy_dataset) + result_table = _read_table(filename) assert table.equals(result_table) result_df = result_table.to_pandas() @@ -381,8 +331,7 @@ def test_multiindex_duplicate_values(tempdir, use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_backwards_compatible_index_naming(datadir, use_legacy_dataset): +def test_backwards_compatible_index_naming(datadir): expected_string = b"""\ carat cut color clarity depth table price x y z 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 @@ -397,17 +346,13 @@ def test_backwards_compatible_index_naming(datadir, use_legacy_dataset): 0.23 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39""" expected = pd.read_csv(io.BytesIO(expected_string), sep=r'\s{2,}', index_col=None, header=0, engine='python') - table = _read_table( - datadir / 'v0.7.1.parquet', use_legacy_dataset=use_legacy_dataset) + table = _read_table(datadir / 'v0.7.1.parquet') result = table.to_pandas() tm.assert_frame_equal(result, expected) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_backwards_compatible_index_multi_level_named( - datadir, use_legacy_dataset -): +def test_backwards_compatible_index_multi_level_named(datadir): expected_string = b"""\ carat cut color clarity depth table price x y z 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 @@ -426,17 +371,13 @@ def test_backwards_compatible_index_multi_level_named( header=0, engine='python' ).sort_index() - table = _read_table(datadir / 'v0.7.1.all-named-index.parquet', - use_legacy_dataset=use_legacy_dataset) + table = _read_table(datadir / 'v0.7.1.all-named-index.parquet') result = table.to_pandas() tm.assert_frame_equal(result, expected) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_backwards_compatible_index_multi_level_some_named( - datadir, use_legacy_dataset -): +def test_backwards_compatible_index_multi_level_some_named(datadir): expected_string = b"""\ carat cut color clarity depth table price x y z 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 @@ -456,17 +397,13 @@ def test_backwards_compatible_index_multi_level_some_named( ).sort_index() 
expected.index = expected.index.set_names(['cut', None, 'clarity']) - table = _read_table(datadir / 'v0.7.1.some-named-index.parquet', - use_legacy_dataset=use_legacy_dataset) + table = _read_table(datadir / 'v0.7.1.some-named-index.parquet') result = table.to_pandas() tm.assert_frame_equal(result, expected) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_backwards_compatible_column_metadata_handling( - datadir, use_legacy_dataset -): +def test_backwards_compatible_column_metadata_handling(datadir): expected = pd.DataFrame( {'a': [1, 2, 3], 'b': [.1, .2, .3], 'c': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')}) @@ -476,19 +413,18 @@ def test_backwards_compatible_column_metadata_handling( names=['index', None]) path = datadir / 'v0.7.1.column-metadata-handling.parquet' - table = _read_table(path, use_legacy_dataset=use_legacy_dataset) + table = _read_table(path) result = table.to_pandas() tm.assert_frame_equal(result, expected) table = _read_table( - path, columns=['a'], use_legacy_dataset=use_legacy_dataset) + path, columns=['a']) result = table.to_pandas() tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True)) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_categorical_index_survives_roundtrip(use_legacy_dataset): +def test_categorical_index_survives_roundtrip(): # ARROW-3652, addressed by ARROW-3246 df = pd.DataFrame([['a', 'b'], ['c', 'd']], columns=['c1', 'c2']) df['c1'] = df['c1'].astype('category') @@ -497,15 +433,13 @@ def test_categorical_index_survives_roundtrip(use_legacy_dataset): table = pa.Table.from_pandas(df) bos = pa.BufferOutputStream() pq.write_table(table, bos) - ref_df = pq.read_pandas( - bos.getvalue(), use_legacy_dataset=use_legacy_dataset).to_pandas() + ref_df = pq.read_pandas(bos.getvalue()).to_pandas() assert isinstance(ref_df.index, pd.CategoricalIndex) assert ref_df.index.equals(df.index) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_categorical_order_survives_roundtrip(use_legacy_dataset): +def test_categorical_order_survives_roundtrip(): # ARROW-6302 df = pd.DataFrame({"a": pd.Categorical( ["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=True)}) @@ -515,15 +449,13 @@ def test_categorical_order_survives_roundtrip(use_legacy_dataset): pq.write_table(table, bos) contents = bos.getvalue() - result = pq.read_pandas( - contents, use_legacy_dataset=use_legacy_dataset).to_pandas() + result = pq.read_pandas(contents).to_pandas() tm.assert_frame_equal(result, df) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_categorical_na_type_row_groups(use_legacy_dataset): +def test_pandas_categorical_na_type_row_groups(): # ARROW-5085 df = pd.DataFrame({"col": [None] * 100, "int": [1.0] * 100}) df_category = df.astype({"col": "category", "int": "category"}) @@ -533,8 +465,7 @@ def test_pandas_categorical_na_type_row_groups(use_legacy_dataset): # it works pq.write_table(table_cat, buf, version='2.6', chunk_size=10) - result = pq.read_table( - buf.getvalue(), use_legacy_dataset=use_legacy_dataset) + result = pq.read_table(buf.getvalue()) # Result is non-categorical assert result[0].equals(table[0]) @@ -542,8 +473,7 @@ def test_pandas_categorical_na_type_row_groups(use_legacy_dataset): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_pandas_categorical_roundtrip(use_legacy_dataset): +def test_pandas_categorical_roundtrip(): # ARROW-5480, this was enabled by ARROW-3246 # Have one of the categories unobserved and include a null (-1) @@ -555,8 +485,7 @@ def 
test_pandas_categorical_roundtrip(use_legacy_dataset): buf = pa.BufferOutputStream() pq.write_table(pa.table(df), buf) - result = pq.read_table( - buf.getvalue(), use_legacy_dataset=use_legacy_dataset).to_pandas() + result = pq.read_table(buf.getvalue()).to_pandas() assert result.x.dtype == 'category' assert (result.x.cat.categories == categories).all() tm.assert_frame_equal(result, df) @@ -587,41 +516,28 @@ def test_categories_with_string_pyarrow_dtype(tempdir): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_pandas_preserve_extensiondtypes( - tempdir, use_legacy_dataset -): +def test_write_to_dataset_pandas_preserve_extensiondtypes(tempdir): df = pd.DataFrame({'part': 'a', "col": [1, 2, 3]}) df['col'] = df['col'].astype("Int64") table = pa.table(df) pq.write_to_dataset( table, str(tempdir / "case1"), partition_cols=['part'], - use_legacy_dataset=use_legacy_dataset ) - result = pq.read_table( - str(tempdir / "case1"), use_legacy_dataset=use_legacy_dataset - ).to_pandas() + result = pq.read_table(str(tempdir / "case1")).to_pandas() tm.assert_frame_equal(result[["col"]], df[["col"]]) - pq.write_to_dataset( - table, str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset - ) - result = pq.read_table( - str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset - ).to_pandas() + pq.write_to_dataset(table, str(tempdir / "case2")) + result = pq.read_table(str(tempdir / "case2")).to_pandas() tm.assert_frame_equal(result[["col"]], df[["col"]]) pq.write_table(table, str(tempdir / "data.parquet")) - result = pq.read_table( - str(tempdir / "data.parquet"), use_legacy_dataset=use_legacy_dataset - ).to_pandas() + result = pq.read_table(str(tempdir / "data.parquet")).to_pandas() tm.assert_frame_equal(result[["col"]], df[["col"]]) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_write_to_dataset_pandas_preserve_index(tempdir, use_legacy_dataset): +def test_write_to_dataset_pandas_preserve_index(tempdir): # ARROW-8251 - preserve pandas index in roundtrip df = pd.DataFrame({'part': ['a', 'a', 'b'], "col": [1, 2, 3]}) @@ -632,34 +548,24 @@ def test_write_to_dataset_pandas_preserve_index(tempdir, use_legacy_dataset): pq.write_to_dataset( table, str(tempdir / "case1"), partition_cols=['part'], - use_legacy_dataset=use_legacy_dataset ) - result = pq.read_table( - str(tempdir / "case1"), use_legacy_dataset=use_legacy_dataset - ).to_pandas() + result = pq.read_table(str(tempdir / "case1")).to_pandas() tm.assert_frame_equal(result, df_cat) - pq.write_to_dataset( - table, str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset - ) - result = pq.read_table( - str(tempdir / "case2"), use_legacy_dataset=use_legacy_dataset - ).to_pandas() + pq.write_to_dataset(table, str(tempdir / "case2")) + result = pq.read_table(str(tempdir / "case2")).to_pandas() tm.assert_frame_equal(result, df) pq.write_table(table, str(tempdir / "data.parquet")) - result = pq.read_table( - str(tempdir / "data.parquet"), use_legacy_dataset=use_legacy_dataset - ).to_pandas() + result = pq.read_table(str(tempdir / "data.parquet")).to_pandas() tm.assert_frame_equal(result, df) @pytest.mark.pandas -@parametrize_legacy_dataset @pytest.mark.parametrize('preserve_index', [True, False, None]) @pytest.mark.parametrize('metadata_fname', ["_metadata", "_common_metadata"]) def test_dataset_read_pandas_common_metadata( - tempdir, use_legacy_dataset, preserve_index, metadata_fname + tempdir, preserve_index, metadata_fname ): # ARROW-1103 nfiles = 5 @@ -696,7 +602,7 @@ def 
test_dataset_read_pandas_common_metadata( ) pq.write_metadata(table_for_metadata.schema, dirpath / metadata_fname) - dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset) + dataset = pq.ParquetDataset(dirpath) columns = ['uint8', 'strings'] result = dataset.read_pandas(columns=columns).to_pandas() expected = pd.concat([x[columns] for x in frames]) diff --git a/python/pyarrow/tests/parquet/test_parquet_file.py b/python/pyarrow/tests/parquet/test_parquet_file.py index 9f920206a107e..93097a1afaac9 100644 --- a/python/pyarrow/tests/parquet/test_parquet_file.py +++ b/python/pyarrow/tests/parquet/test_parquet_file.py @@ -18,7 +18,6 @@ import io import os import sys -from unittest import mock import pytest @@ -296,28 +295,6 @@ def test_parquet_file_explicitly_closed(tempdir): table = pa.table({'col1': [0, 1], 'col2': [0, 1]}) pq.write_table(table, fn) - # read_table (legacy) with opened file (will leave open) - with open(fn, 'rb') as f: - pq.read_table(f, use_legacy_dataset=True) - assert not f.closed # Didn't close it internally after read_table - - # read_table (legacy) with unopened file (will close) - with mock.patch.object(pq.ParquetFile, "close") as mock_close: - pq.read_table(fn, use_legacy_dataset=True) - mock_close.assert_called() - - # ParquetDataset test (legacy) with unopened file (will close) - with mock.patch.object(pq.ParquetFile, "close") as mock_close: - pq.ParquetDataset(fn, use_legacy_dataset=True).read() - mock_close.assert_called() - - # ParquetDataset test (legacy) with opened file (will leave open) - with open(fn, 'rb') as f: - # ARROW-8075: support ParquetDataset from file-like, not just path-like - with pytest.raises(TypeError, match='not a path-like object'): - pq.ParquetDataset(f, use_legacy_dataset=True).read() - assert not f.closed - # ParquetFile with opened file (will leave open) with open(fn, 'rb') as f: with pq.ParquetFile(f) as p: @@ -338,7 +315,7 @@ def test_parquet_file_explicitly_closed(tempdir): @pytest.mark.s3 @pytest.mark.parametrize("use_uri", (True, False)) -def test_parquet_file_with_filesystem(tempdir, s3_example_fs, use_uri): +def test_parquet_file_with_filesystem(s3_example_fs, use_uri): s3_fs, s3_uri, s3_path = s3_example_fs args = (s3_uri if use_uri else s3_path,) diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py b/python/pyarrow/tests/parquet/test_parquet_writer.py index b902541015aa2..16584684f5c7f 100644 --- a/python/pyarrow/tests/parquet/test_parquet_writer.py +++ b/python/pyarrow/tests/parquet/test_parquet_writer.py @@ -20,7 +20,6 @@ import pyarrow as pa from pyarrow import fs from pyarrow.filesystem import FileSystem, LocalFileSystem -from pyarrow.tests.parquet.common import parametrize_legacy_dataset try: import pyarrow.parquet as pq @@ -44,8 +43,7 @@ @pytest.mark.pandas -@parametrize_legacy_dataset -def test_parquet_incremental_file_build(tempdir, use_legacy_dataset): +def test_parquet_incremental_file_build(tempdir): df = _test_dataframe(100) df['unique_id'] = 0 @@ -65,8 +63,7 @@ def test_parquet_incremental_file_build(tempdir, use_legacy_dataset): writer.close() buf = out.getvalue() - result = _read_table( - pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset) + result = _read_table(pa.BufferReader(buf)) expected = pd.concat(frames, ignore_index=True) tm.assert_frame_equal(result.to_pandas(), expected) @@ -105,8 +102,7 @@ def test_parquet_invalid_writer(tempdir): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_parquet_writer_context_obj(tempdir, use_legacy_dataset): +def 
test_parquet_writer_context_obj(tempdir): df = _test_dataframe(100) df['unique_id'] = 0 @@ -124,18 +120,14 @@ def test_parquet_writer_context_obj(tempdir, use_legacy_dataset): frames.append(df.copy()) buf = out.getvalue() - result = _read_table( - pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset) + result = _read_table(pa.BufferReader(buf)) expected = pd.concat(frames, ignore_index=True) tm.assert_frame_equal(result.to_pandas(), expected) @pytest.mark.pandas -@parametrize_legacy_dataset -def test_parquet_writer_context_obj_with_exception( - tempdir, use_legacy_dataset -): +def test_parquet_writer_context_obj_with_exception(tempdir): df = _test_dataframe(100) df['unique_id'] = 0 @@ -160,8 +152,7 @@ def test_parquet_writer_context_obj_with_exception( assert str(e) == error_text buf = out.getvalue() - result = _read_table( - pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset) + result = _read_table(pa.BufferReader(buf)) expected = pd.concat(frames, ignore_index=True) tm.assert_frame_equal(result.to_pandas(), expected) @@ -340,8 +331,7 @@ def test_parquet_writer_filesystem_buffer_raises(): @pytest.mark.pandas -@parametrize_legacy_dataset -def test_parquet_writer_with_caller_provided_filesystem(use_legacy_dataset): +def test_parquet_writer_with_caller_provided_filesystem(): out = pa.BufferOutputStream() class CustomFS(FileSystem): @@ -368,8 +358,7 @@ def open(self, path, mode='rb'): assert out.closed buf = out.getvalue() - table_read = _read_table( - pa.BufferReader(buf), use_legacy_dataset=use_legacy_dataset) + table_read = _read_table(pa.BufferReader(buf)) df_read = table_read.to_pandas() tm.assert_frame_equal(df_read, df) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index a37eb1e426f7a..e2bb4400c8bde 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -1148,7 +1148,6 @@ def _create_dataset_for_fragments(tempdir, chunk_size=None, filesystem=None): path = str(tempdir / "test_parquet_dataset") - # write_to_dataset currently requires pandas pq.write_to_dataset(table, path, partition_cols=["part"], chunk_size=chunk_size) dataset = ds.dataset( @@ -1158,10 +1157,7 @@ def _create_dataset_for_fragments(tempdir, chunk_size=None, filesystem=None): return table, dataset -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments(tempdir, dataset_reader): table, dataset = _create_dataset_for_fragments(tempdir) @@ -1208,10 +1204,7 @@ def test_fragments_implicit_cast(tempdir): assert len(list(fragments)) == 1 -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_reconstruct(tempdir, dataset_reader, pickle_module): table, dataset = _create_dataset_for_fragments(tempdir) @@ -1272,10 +1265,7 @@ def assert_yields_projected(fragment, row_slice, dataset_reader.to_table(new_fragment, filter=ds.field('part') == 'a') -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_row_groups(tempdir, dataset_reader): table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2) @@ -1326,8 +1316,6 @@ def test_fragments_parquet_num_row_groups(tempdir): @pytest.mark.pandas @pytest.mark.parquet def test_fragments_parquet_row_groups_dictionary(tempdir, dataset_reader): - import pandas as pd - df = pd.DataFrame(dict(col1=['a', 
'b'], col2=[1, 2])) df['col1'] = df['col1'].astype("category") @@ -1340,10 +1328,7 @@ def test_fragments_parquet_row_groups_dictionary(tempdir, dataset_reader): assert (df.iloc[0] == result.to_pandas()).all().all() -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs, pickle_module): fs, assert_opens = open_logging_fs _, dataset = _create_dataset_for_fragments( @@ -1384,7 +1369,6 @@ def test_fragments_parquet_ensure_metadata(tempdir, open_logging_fs, pickle_modu assert row_group.statistics is not None -@pytest.mark.pandas @pytest.mark.parquet def test_fragments_parquet_pickle_no_metadata(tempdir, open_logging_fs, pickle_module): # https://issues.apache.org/jira/browse/ARROW-15796 @@ -1454,16 +1438,13 @@ def _create_dataset_all_types(tempdir, chunk_size=None): path = str(tempdir / "test_parquet_dataset_all_types") # write_to_dataset currently requires pandas - pq.write_to_dataset(table, path, use_legacy_dataset=True, - chunk_size=chunk_size) + pq.write_to_dataset(table, path, chunk_size=chunk_size) return table, ds.dataset(path, format="parquet", partitioning="hive") @pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_parquet_fragment_statistics(tempdir): table, dataset = _create_dataset_all_types(tempdir) @@ -1529,10 +1510,7 @@ def test_parquet_empty_row_group_statistics(tempdir): assert fragments[0].row_groups[0].statistics == {} -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_row_groups_predicate(tempdir): table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2) @@ -1555,10 +1533,7 @@ def test_fragments_parquet_row_groups_predicate(tempdir): assert len(row_group_fragments) == 0 -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_row_groups_reconstruct(tempdir, dataset_reader, pickle_module): table, dataset = _create_dataset_for_fragments(tempdir, chunk_size=2) @@ -1600,10 +1575,7 @@ def test_fragments_parquet_row_groups_reconstruct(tempdir, dataset_reader, dataset_reader.to_table(new_fragment) -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_subset_ids(tempdir, open_logging_fs, dataset_reader): fs, assert_opens = open_logging_fs @@ -1631,10 +1603,7 @@ def test_fragments_parquet_subset_ids(tempdir, open_logging_fs, assert result.equals(table[:0]) -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_subset_filter(tempdir, open_logging_fs, dataset_reader): fs, assert_opens = open_logging_fs @@ -1666,10 +1635,7 @@ def test_fragments_parquet_subset_filter(tempdir, open_logging_fs, assert subfrag.num_row_groups == 4 -@pytest.mark.pandas @pytest.mark.parquet -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") def test_fragments_parquet_subset_invalid(tempdir): _, dataset = _create_dataset_for_fragments(tempdir, chunk_size=1) fragment = list(dataset.get_fragments())[0] @@ -3591,10 +3557,7 @@ def test_parquet_dataset_factory_fsspec(tempdir): @pytest.mark.parquet @pytest.mark.pandas # 
write_to_dataset currently requires pandas -@pytest.mark.parametrize('use_legacy_dataset', [False, True]) -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") -def test_parquet_dataset_factory_roundtrip(tempdir, use_legacy_dataset): +def test_parquet_dataset_factory_roundtrip(tempdir): # Simple test to ensure we can roundtrip dataset to # _metadata/common_metadata and back. A more complex test # using partitioning will have to wait for ARROW-13269. The @@ -3606,7 +3569,6 @@ def test_parquet_dataset_factory_roundtrip(tempdir, use_legacy_dataset): metadata_collector = [] pq.write_to_dataset( table, str(root_path), metadata_collector=metadata_collector, - use_legacy_dataset=use_legacy_dataset ) metadata_path = str(root_path / '_metadata') # write _metadata file @@ -3820,7 +3782,6 @@ def test_dataset_project_only_partition_columns(tempdir, dataset_reader): @pytest.mark.parquet @pytest.mark.pandas def test_dataset_project_null_column(tempdir, dataset_reader): - import pandas as pd df = pd.DataFrame({"col": np.array([None, None, None], dtype='object')}) f = tempdir / "test_dataset_project_null_column.parquet" @@ -3930,8 +3891,7 @@ def test_write_to_dataset_given_null_just_works(tempdir): 'col': list(range(4))}, schema=schema) path = str(tempdir / 'test_dataset') - pq.write_to_dataset(table, path, partition_cols=[ - 'part'], use_legacy_dataset=False) + pq.write_to_dataset(table, path, partition_cols=['part']) actual_table = pq.read_table(tempdir / 'test_dataset') # column.equals can handle the difference in chunking but not the fact @@ -3941,28 +3901,6 @@ def test_write_to_dataset_given_null_just_works(tempdir): assert actual_table.column('col').equals(table.column('col')) -@pytest.mark.parquet -@pytest.mark.pandas -@pytest.mark.filterwarnings( - "ignore:Passing 'use_legacy_dataset=True':FutureWarning") -def test_legacy_write_to_dataset_drops_null(tempdir): - schema = pa.schema([ - pa.field('col', pa.int64()), - pa.field('part', pa.dictionary(pa.int32(), pa.string())) - ]) - table = pa.table({'part': ['a', 'a', None, None], - 'col': list(range(4))}, schema=schema) - expected = pa.table( - {'part': ['a', 'a'], 'col': list(range(2))}, schema=schema) - - path = str(tempdir / 'test_dataset') - pq.write_to_dataset(table, path, partition_cols=[ - 'part'], use_legacy_dataset=True) - - actual = pq.read_table(tempdir / 'test_dataset') - assert actual == expected - - def _sort_table(tab, sort_col): import pyarrow.compute as pc sorted_indices = pc.sort_indices( diff --git a/python/pyarrow/tests/test_hdfs.py b/python/pyarrow/tests/test_hdfs.py index 511dbf9a1c4e1..5b94c200f35de 100644 --- a/python/pyarrow/tests/test_hdfs.py +++ b/python/pyarrow/tests/test_hdfs.py @@ -27,7 +27,7 @@ from pyarrow.tests import util from pyarrow.tests.parquet.common import _test_dataframe from pyarrow.tests.parquet.test_dataset import ( - _test_read_common_metadata_files, _test_write_to_dataset_with_partitions, + _test_write_to_dataset_with_partitions, _test_write_to_dataset_no_partitions ) from pyarrow.util import guid @@ -309,6 +309,9 @@ def _write_multiple_hdfs_pq_files(self, tmpdir): expected = pa.concat_tables(test_data) return expected + @pytest.mark.xfail(reason="legacy.FileSystem not supported with ParquetDataset " + "due to legacy path being removed in PyArrow 15.0.0.", + raises=TypeError) @pytest.mark.pandas @pytest.mark.parquet def test_read_multiple_parquet_files(self): @@ -343,6 +346,9 @@ def test_read_multiple_parquet_files_with_uri(self): expected.to_pandas() ) + 
@pytest.mark.xfail(reason="legacy.FileSystem not supported with ParquetDataset " + "due to legacy path being removed in PyArrow 15.0.0.", + raises=TypeError) @pytest.mark.pandas @pytest.mark.parquet def test_read_write_parquet_files_with_uri(self): @@ -360,19 +366,13 @@ def test_read_write_parquet_files_with_uri(self): pq.write_table(table, path, filesystem=self.hdfs) - result = pq.read_table( - path, filesystem=self.hdfs, use_legacy_dataset=True - ).to_pandas() + result = pq.read_table(path, filesystem=self.hdfs).to_pandas() assert_frame_equal(result, df) - @pytest.mark.parquet - @pytest.mark.pandas - def test_read_common_metadata_files(self): - tmpdir = pjoin(self.tmp_path, 'common-metadata-' + guid()) - self.hdfs.mkdir(tmpdir) - _test_read_common_metadata_files(self.hdfs, tmpdir) - + @pytest.mark.xfail(reason="legacy.FileSystem not supported with ParquetDataset " + "due to legacy path being removed in PyArrow 15.0.0.", + raises=TypeError) @pytest.mark.parquet @pytest.mark.pandas def test_write_to_dataset_with_partitions(self): @@ -381,6 +381,9 @@ def test_write_to_dataset_with_partitions(self): _test_write_to_dataset_with_partitions( tmpdir, filesystem=self.hdfs) + @pytest.mark.xfail(reason="legacy.FileSystem not supported with ParquetDataset " + "due to legacy path being removed in PyArrow 15.0.0.", + raises=TypeError) @pytest.mark.parquet @pytest.mark.pandas def test_write_to_dataset_no_partitions(self): From 2abb3fb7095241300e2bb2aadd953b0f23970237 Mon Sep 17 00:00:00 2001 From: "Rossi(Ruoxi) Sun" Date: Thu, 21 Dec 2023 14:14:45 -0800 Subject: [PATCH 06/31] GH-32570: [C++] Fix the issue of `ExecBatchBuilder` when appending consecutive tail rows with the same id may exceed buffer boundary (#39234) ### Rationale for this change Addressed in https://github.com/apache/arrow/issues/32570#issuecomment-1856473812 ### What changes are included in this PR? 1. Skip consecutive rows with the same id when calculating rows to skip when appending to `ExecBatchBuilder`. 2. Fix the bug that column offset is neglected when calculating rows to skip. ### Are these changes tested? Yes. New UT included and the change is also protected by the existing case mentioned in the issue. ### Are there any user-facing changes? No. **This PR contains a "Critical Fix".** Because #32570 is labeled critical, and causes a crash even when the API contract is upheld. 
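As a rough illustration of the first change, here is a small Python model of the tail-row skip computation. It is an approximation written for this description only (not the C++ implementation); the offsets mirror the new unit test below, and the tail-byte count of 8 is an assumed value.

```python
def num_rows_to_skip(offsets, row_ids, num_tail_bytes_to_skip):
    # Roughly: how many trailing entries of row_ids must take the slow,
    # bounds-checked path so the remaining rows can be copied with fixed-size
    # loads/stores without checking buffer bounds.
    num_rows_left = len(row_ids)
    num_bytes_skipped = 0
    while num_rows_left > 0 and num_bytes_skipped < num_tail_bytes_to_skip:
        num_rows_left -= 1
        row_id_removed = row_ids[num_rows_left]
        num_bytes_skipped += offsets[row_id_removed + 1] - offsets[row_id_removed]
        # The fix: consecutive entries with the same row id are dropped from the
        # tail together, since they occupy the same space.
        while num_rows_left > 0 and row_ids[num_rows_left - 1] == row_id_removed:
            num_rows_left -= 1
    return len(row_ids) - num_rows_left

offsets = [0, 16, 32, 48, 55, 64]  # three 16-byte rows, then "ABCDEF0", "123456789"
row_ids = [4, 4]                   # the 9-byte tail row selected twice
print(num_rows_to_skip(offsets, row_ids, num_tail_bytes_to_skip=8))  # 2 with the fix
# Without the inner while-loop only one of the two duplicate entries would be
# treated as a tail row, which is the boundary-crossing scenario this PR fixes.
```

The second change is related: reading the offsets through `column->GetValues<int32_t>(1)` instead of the raw buffer pointer makes the lookup respect the array's own offset, so the byte counts above are taken from the correct positions.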
* Closes: #32570 Authored-by: zanmato Signed-off-by: Antoine Pitrou --- cpp/src/arrow/compute/light_array.cc | 7 ++++-- cpp/src/arrow/compute/light_array.h | 4 +++- cpp/src/arrow/compute/light_array_test.cc | 26 +++++++++++++++++++++++ 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/light_array.cc b/cpp/src/arrow/compute/light_array.cc index 4e8b2b2d7cc3a..93a054de1957c 100644 --- a/cpp/src/arrow/compute/light_array.cc +++ b/cpp/src/arrow/compute/light_array.cc @@ -398,9 +398,12 @@ int ExecBatchBuilder::NumRowsToSkip(const std::shared_ptr& column, } else { --num_rows_left; int row_id_removed = row_ids[num_rows_left]; - const uint32_t* offsets = - reinterpret_cast(column->buffers[1]->data()); + const int32_t* offsets = column->GetValues(1); num_bytes_skipped += offsets[row_id_removed + 1] - offsets[row_id_removed]; + // Skip consecutive rows with the same id + while (num_rows_left > 0 && row_id_removed == row_ids[num_rows_left - 1]) { + --num_rows_left; + } } } diff --git a/cpp/src/arrow/compute/light_array.h b/cpp/src/arrow/compute/light_array.h index 87f6b6c76a12c..84aa86d64bb62 100644 --- a/cpp/src/arrow/compute/light_array.h +++ b/cpp/src/arrow/compute/light_array.h @@ -416,7 +416,9 @@ class ARROW_EXPORT ExecBatchBuilder { // without checking buffer bounds (useful with SIMD or fixed size memory loads // and stores). // - // The sequence of row_ids provided must be non-decreasing. + // The sequence of row_ids provided must be non-decreasing. In case of consecutive rows + // with the same row id, they are skipped all at once because they occupy the same + // space. // static int NumRowsToSkip(const std::shared_ptr& column, int num_rows, const uint16_t* row_ids, int num_tail_bytes_to_skip); diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc index 4e33f7b578ea8..52121530fe91d 100644 --- a/cpp/src/arrow/compute/light_array_test.cc +++ b/cpp/src/arrow/compute/light_array_test.cc @@ -471,6 +471,32 @@ TEST(ExecBatchBuilder, AppendBatchesSomeRows) { ASSERT_EQ(0, pool->bytes_allocated()); } +TEST(ExecBatchBuilder, AppendBatchDupRows) { + std::unique_ptr owned_pool = MemoryPool::CreateDefault(); + MemoryPool* pool = owned_pool.get(); + // Case of cross-word copying for the last row, which may exceed the buffer boundary. + // This is a simplified case of GH-32570 + { + // 64-byte data fully occupying one minimal 64-byte aligned memory region. + ExecBatch batch_string = JSONToExecBatch({binary()}, R"([["123456789ABCDEF0"], + ["123456789ABCDEF0"], + ["123456789ABCDEF0"], + ["ABCDEF0"], + ["123456789"]])"); // 9-byte tail row, larger than a word. 
+ ASSERT_EQ(batch_string[0].array()->buffers[1]->capacity(), 64); + ASSERT_EQ(batch_string[0].array()->buffers[2]->capacity(), 64); + ExecBatchBuilder builder; + uint16_t row_ids[2] = {4, 4}; + ASSERT_OK(builder.AppendSelected(pool, batch_string, 2, row_ids, /*num_cols=*/1)); + ExecBatch built = builder.Flush(); + ExecBatch batch_string_appended = + JSONToExecBatch({binary()}, R"([["123456789"], ["123456789"]])"); + ASSERT_EQ(batch_string_appended, built); + ASSERT_NE(0, pool->bytes_allocated()); + } + ASSERT_EQ(0, pool->bytes_allocated()); +} + TEST(ExecBatchBuilder, AppendBatchesSomeCols) { std::unique_ptr owned_pool = MemoryPool::CreateDefault(); MemoryPool* pool = owned_pool.get(); From 929c40bcbded7184a5f6894db208f16975de4d37 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Fri, 22 Dec 2023 00:37:29 +0000 Subject: [PATCH 07/31] GH-39343: [C++][FS][Azure] Add client secret auth configuration (#39346) ### Rationale for this change Client is a useful Azure authentication ### What changes are included in this PR? Implement `AzureOptions::ConfigureClientSecretCredential` ### Are these changes tested? Simple unittest ### Are there any user-facing changes? Client secret auth is now supported on the Azure filesystem. * Closes: #39343 Authored-by: Thomas Newton Signed-off-by: Sutou Kouhei --- cpp/src/arrow/filesystem/azurefs.cc | 10 ++++++++++ cpp/src/arrow/filesystem/azurefs.h | 5 +++++ cpp/src/arrow/filesystem/azurefs_test.cc | 7 +++++++ 3 files changed, 22 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 27bdb5092a3ea..26c2761886050 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -113,6 +113,16 @@ Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_na return Status::OK(); } +Status AzureOptions::ConfigureClientSecretCredential(const std::string& account_name, + const std::string& tenant_id, + const std::string& client_id, + const std::string& client_secret) { + credential_kind_ = CredentialKind::kTokenCredential; + token_credential_ = std::make_shared( + tenant_id, client_id, client_secret); + return Status::OK(); +} + Status AzureOptions::ConfigureDefaultCredential(const std::string& account_name) { credential_kind_ = CredentialKind::kTokenCredential; token_credential_ = std::make_shared(); diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 69f6295237043..346dd349e935c 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -110,6 +110,11 @@ struct ARROW_EXPORT AzureOptions { Status ConfigureAccountKeyCredential(const std::string& account_name, const std::string& account_key); + Status ConfigureClientSecretCredential(const std::string& account_name, + const std::string& tenant_id, + const std::string& client_id, + const std::string& client_secret); + bool Equals(const AzureOptions& other) const; std::string AccountBlobUrl(const std::string& account_name) const; diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 3266c1bfda2dc..62c5ef2232045 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -271,6 +271,13 @@ class AzureHierarchicalNSEnv : public AzureEnvImpl { bool WithHierarchicalNamespace() const final { return true; } }; +TEST(AzureFileSystem, InitializeFilesystemWithClientSecretCredential) { + AzureOptions options; + ARROW_EXPECT_OK(options.ConfigureClientSecretCredential( + 
"dummy-account-name", "tenant_id", "client_id", "client_secret")); + EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); +} + TEST(AzureFileSystem, InitializeFilesystemWithDefaultCredential) { AzureOptions options; ARROW_EXPECT_OK(options.ConfigureDefaultCredential("dummy-account-name")); From 51970e066e69ab01f9bdcc81219781ae07b9799b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Fri, 22 Dec 2023 02:06:50 +0100 Subject: [PATCH 08/31] GH-39006: [Python] Extract libparquet requirements out of libarrow_python.so to new libarrow_python_parquet_encryption.so (#39316) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change If I build pyarrow with everything and then I remove some of the Arrow CPP .so in order to have a minimal build I can't import pyarrow because it requires libarrow and libparquet. This is relevant in order to have a minimal build for Conda. Please see the related issue for more information. ### What changes are included in this PR? Move libarrow parquet encryption for pyarrow to its own shared object. ### Are these changes tested? I will run extensive CI with extra python archery tests. ### Are there any user-facing changes? No, and yes :) There will be a new .so on pyarrow but shouldn't be relevant in my opinion. * Closes: #39006 Lead-authored-by: Raúl Cumplido Co-authored-by: Antoine Pitrou Signed-off-by: Sutou Kouhei --- ci/scripts/python_test.sh | 2 + ci/scripts/python_wheel_unix_test.sh | 1 + ci/scripts/python_wheel_windows_test.bat | 1 + python/CMakeLists.txt | 38 ++++++++++--------- .../src/arrow/python/parquet_encryption.h | 33 +++++++++++++--- 5 files changed, 53 insertions(+), 22 deletions(-) diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh index 8d818346faa6e..341c2dd0577ef 100755 --- a/ci/scripts/python_test.sh +++ b/ci/scripts/python_test.sh @@ -45,6 +45,7 @@ export ARROW_DEBUG_MEMORY_POOL=trap : ${PYARROW_TEST_HDFS:=${ARROW_HDFS:-ON}} : ${PYARROW_TEST_ORC:=${ARROW_ORC:-ON}} : ${PYARROW_TEST_PARQUET:=${ARROW_PARQUET:-ON}} +: ${PYARROW_TEST_PARQUET_ENCRYPTION:=${PARQUET_REQUIRE_ENCRYPTION:-ON}} : ${PYARROW_TEST_S3:=${ARROW_S3:-ON}} export PYARROW_TEST_ACERO @@ -56,6 +57,7 @@ export PYARROW_TEST_GCS export PYARROW_TEST_HDFS export PYARROW_TEST_ORC export PYARROW_TEST_PARQUET +export PYARROW_TEST_PARQUET_ENCRYPTION export PYARROW_TEST_S3 # Testing PyArrow diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh index a6cc3bb7b29b7..01250ff7ef40c 100755 --- a/ci/scripts/python_wheel_unix_test.sh +++ b/ci/scripts/python_wheel_unix_test.sh @@ -46,6 +46,7 @@ export PYARROW_TEST_HDFS=ON export PYARROW_TEST_ORC=ON export PYARROW_TEST_PANDAS=ON export PYARROW_TEST_PARQUET=ON +export PYARROW_TEST_PARQUET_ENCRYPTION=ON export PYARROW_TEST_SUBSTRAIT=${ARROW_SUBSTRAIT} export PYARROW_TEST_S3=${ARROW_S3} export PYARROW_TEST_TENSORFLOW=ON diff --git a/ci/scripts/python_wheel_windows_test.bat b/ci/scripts/python_wheel_windows_test.bat index c73b0cfd1b9bd..b14bfddfb36d3 100755 --- a/ci/scripts/python_wheel_windows_test.bat +++ b/ci/scripts/python_wheel_windows_test.bat @@ -26,6 +26,7 @@ set PYARROW_TEST_GCS=ON set PYARROW_TEST_HDFS=ON set PYARROW_TEST_ORC=OFF set PYARROW_TEST_PARQUET=ON +set PYARROW_TEST_PARQUET_ENCRYPTION=ON set PYARROW_TEST_SUBSTRAIT=ON set PYARROW_TEST_S3=OFF set PYARROW_TEST_TENSORFLOW=ON diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 3f810d27271e5..2df1e67b9f4c7 100644 --- a/python/CMakeLists.txt 
+++ b/python/CMakeLists.txt @@ -332,22 +332,6 @@ if(PYARROW_BUILD_PARQUET OR PYARROW_BUILD_PARQUET_ENCRYPTION) find_package(Parquet REQUIRED) endif() -if(PYARROW_BUILD_PARQUET_ENCRYPTION) - if(PARQUET_REQUIRE_ENCRYPTION) - list(APPEND PYARROW_CPP_SRCS ${PYARROW_CPP_SOURCE_DIR}/parquet_encryption.cc) - if(ARROW_BUILD_SHARED) - list(APPEND PYARROW_CPP_LINK_LIBS Parquet::parquet_shared) - else() - list(APPEND PYARROW_CPP_LINK_LIBS Parquet::parquet_static) - endif() - message(STATUS "Parquet Encryption Enabled") - else() - message(FATAL_ERROR "You must build Arrow C++ with PARQUET_REQUIRE_ENCRYPTION=ON") - endif() -else() - message(STATUS "Parquet Encryption is NOT Enabled") -endif() - if(PYARROW_BUILD_HDFS) if(NOT ARROW_HDFS) message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON") @@ -391,6 +375,26 @@ install(TARGETS arrow_python LIBRARY DESTINATION . RUNTIME DESTINATION .) +set(PYARROW_CPP_ENCRYPTION_SRCS ${PYARROW_CPP_SOURCE_DIR}/parquet_encryption.cc) +if(NOT PYARROW_BUILD_PARQUET_ENCRYPTION) + message(STATUS "Parquet Encryption is NOT Enabled") +else() + if(PARQUET_REQUIRE_ENCRYPTION) + add_library(arrow_python_parquet_encryption SHARED ${PYARROW_CPP_ENCRYPTION_SRCS}) + target_link_libraries(arrow_python_parquet_encryption PUBLIC arrow_python + ${PARQUET_LINK_LIBS}) + target_compile_definitions(arrow_python_parquet_encryption + PRIVATE ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORTING) + install(TARGETS arrow_python_parquet_encryption + ARCHIVE DESTINATION . + LIBRARY DESTINATION . + RUNTIME DESTINATION .) + message(STATUS "Parquet Encryption Enabled") + else() + message(FATAL_ERROR "You must build Arrow C++ with PARQUET_REQUIRE_ENCRYPTION=ON") + endif() +endif() + set(PYARROW_CPP_FLIGHT_SRCS ${PYARROW_CPP_SOURCE_DIR}/flight.cc) if(PYARROW_BUILD_FLIGHT) if(NOT ARROW_FLIGHT) @@ -814,6 +818,6 @@ endif() if(PYARROW_BUILD_PARQUET) target_link_libraries(_parquet PRIVATE ${PARQUET_LINK_LIBS}) if(PYARROW_BUILD_PARQUET_ENCRYPTION) - target_link_libraries(_parquet_encryption PRIVATE ${PARQUET_LINK_LIBS}) + target_link_libraries(_parquet_encryption PRIVATE arrow_python_parquet_encryption) endif() endif() diff --git a/python/pyarrow/src/arrow/python/parquet_encryption.h b/python/pyarrow/src/arrow/python/parquet_encryption.h index 23ee478348ecd..a1aaa30e260f5 100644 --- a/python/pyarrow/src/arrow/python/parquet_encryption.h +++ b/python/pyarrow/src/arrow/python/parquet_encryption.h @@ -26,6 +26,27 @@ #include "parquet/encryption/kms_client.h" #include "parquet/encryption/kms_client_factory.h" +#if defined(_WIN32) || defined(__CYGWIN__) // Windows +#if defined(_MSC_VER) +#pragma warning(disable : 4251) +#else +#pragma GCC diagnostic ignored "-Wattributes" +#endif + +#ifdef ARROW_PYTHON_STATIC +#define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT +#elif defined(ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORTING) +#define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT __declspec(dllexport) +#else +#define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT __declspec(dllimport) +#endif + +#else // Not Windows +#ifndef ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT +#define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT __attribute__((visibility("default"))) +#endif +#endif // Non-Windows + namespace arrow { namespace py { namespace parquet { @@ -33,7 +54,7 @@ namespace encryption { /// \brief A table of function pointers for calling from C++ into /// Python. 
-class ARROW_PYTHON_EXPORT PyKmsClientVtable { +class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClientVtable { public: std::function @@ -44,7 +65,8 @@ class ARROW_PYTHON_EXPORT PyKmsClientVtable { }; /// \brief A helper for KmsClient implementation in Python. -class ARROW_PYTHON_EXPORT PyKmsClient : public ::parquet::encryption::KmsClient { +class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClient + : public ::parquet::encryption::KmsClient { public: PyKmsClient(PyObject* handler, PyKmsClientVtable vtable); ~PyKmsClient() override; @@ -62,7 +84,7 @@ class ARROW_PYTHON_EXPORT PyKmsClient : public ::parquet::encryption::KmsClient /// \brief A table of function pointers for calling from C++ into /// Python. -class ARROW_PYTHON_EXPORT PyKmsClientFactoryVtable { +class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClientFactoryVtable { public: std::function> SafeGetFileEncryptionProperties( From cd5a1bd259a95eb9342569fb01d41a5924aec30f Mon Sep 17 00:00:00 2001 From: Ravjot Brar <83892020+ravjotbrar@users.noreply.github.com> Date: Fri, 22 Dec 2023 07:03:32 -0800 Subject: [PATCH 09/31] GH-39014: [Java] Add default truststore along with KeychainStore when on Mac system (#39235) ### Rationale for this change As described in #39014, when using the system TrustStore on Mac, the certificates returned do not include Root CAs trusted by the system. This change adds the default KeyStore instance along with the KeyChainStore to include trusted Root CAs. The reason we add the default KeyStore instance is because there is no easy way to get the certificates from the System Roots keychain. ### What changes are included in this PR? I've updated ClientAuthenticationUtils to get the default KeyStore instance when the operating system is macOS and have updated the tests to include this change. ### Are these changes tested? See changes made in ClientAuthenticationUtilsTest.java. ### Are there any user-facing changes? No * Closes: #39014 Authored-by: Ravjot Brar Signed-off-by: David Li --- .../utils/ClientAuthenticationUtils.java | 21 ++++++---- .../utils/ClientAuthenticationUtilsTest.java | 42 +++++++++++++++++-- 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtils.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtils.java index d50dc385a62e1..ffb0048181c7c 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtils.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtils.java @@ -115,6 +115,16 @@ static KeyStore getKeyStoreInstance(String instance) return keyStore; } + @VisibleForTesting + static KeyStore getDefaultKeyStoreInstance(String password) + throws KeyStoreException, CertificateException, NoSuchAlgorithmException, IOException { + try (InputStream fileInputStream = getKeystoreInputStream()) { + KeyStore keyStore = KeyStore.getInstance(KeyStore.getDefaultType()); + keyStore.load(fileInputStream, password == null ? 
null : password.toCharArray()); + return keyStore; + } + } + static String getOperatingSystem() { return System.getProperty("os.name"); } @@ -156,16 +166,9 @@ public static InputStream getCertificateInputStreamFromSystem(String password) t keyStoreList.add(getKeyStoreInstance("Windows-MY")); } else if (isMac()) { keyStoreList.add(getKeyStoreInstance("KeychainStore")); + keyStoreList.add(getDefaultKeyStoreInstance(password)); } else { - try (InputStream fileInputStream = getKeystoreInputStream()) { - KeyStore keyStore = KeyStore.getInstance(KeyStore.getDefaultType()); - if (password == null) { - keyStore.load(fileInputStream, null); - } else { - keyStore.load(fileInputStream, password.toCharArray()); - } - keyStoreList.add(keyStore); - } + keyStoreList.add(getDefaultKeyStoreInstance(password)); } return getCertificatesInputStream(keyStoreList); diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java index 27bba64587367..b7977462e9c01 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java @@ -77,6 +77,33 @@ public void testGetKeyStoreInstance() throws IOException, } } + @Test + public void testGetDefaultKeyStoreInstancePassword() throws IOException, + KeyStoreException, CertificateException, NoSuchAlgorithmException { + try (MockedStatic keyStoreMockedStatic = Mockito.mockStatic(KeyStore.class)) { + + keyStoreMockedStatic + .when(() -> ClientAuthenticationUtils.getDefaultKeyStoreInstance("changeit")) + .thenReturn(keyStoreMock); + KeyStore receiveKeyStore = ClientAuthenticationUtils.getDefaultKeyStoreInstance("changeit"); + Assert.assertEquals(receiveKeyStore, keyStoreMock); + } + } + + @Test + public void testGetDefaultKeyStoreInstanceNoPassword() throws IOException, + KeyStoreException, CertificateException, NoSuchAlgorithmException { + try (MockedStatic keyStoreMockedStatic = Mockito.mockStatic(KeyStore.class)) { + + keyStoreMockedStatic + .when(() -> ClientAuthenticationUtils.getDefaultKeyStoreInstance(null)) + .thenReturn(keyStoreMock); + KeyStore receiveKeyStore = ClientAuthenticationUtils.getDefaultKeyStoreInstance(null); + Assert.assertEquals(receiveKeyStore, keyStoreMock); + } + } + + @Test public void testGetCertificateInputStreamFromMacSystem() throws IOException, KeyStoreException, CertificateException, NoSuchAlgorithmException { @@ -90,11 +117,18 @@ public void testGetCertificateInputStreamFromMacSystem() throws IOException, keyStoreMockedStatic.when(() -> ClientAuthenticationUtils .getKeyStoreInstance("KeychainStore")) .thenReturn(keyStoreMock); + keyStoreMockedStatic.when(() -> ClientAuthenticationUtils + .getDefaultKeyStoreInstance("changeit")) + .thenReturn(keyStoreMock); + clientAuthenticationUtilsMockedStatic + .when(ClientAuthenticationUtils::getKeystoreInputStream) + .thenCallRealMethod(); + keyStoreMockedStatic.when(KeyStore::getDefaultType).thenCallRealMethod(); keyStoreMockedStatic.when(() -> ClientAuthenticationUtils .getCertificatesInputStream(Mockito.any())) .thenReturn(mock); - InputStream inputStream = ClientAuthenticationUtils.getCertificateInputStreamFromSystem("test"); + InputStream inputStream = 
ClientAuthenticationUtils.getCertificateInputStreamFromSystem("changeit"); Assert.assertEquals(inputStream, mock); } } @@ -136,9 +170,11 @@ public void testGetCertificateInputStreamFromLinuxSystem() throws IOException, setOperatingSystemMock(clientAuthenticationUtilsMockedStatic, false, false); keyStoreMockedStatic.when(() -> ClientAuthenticationUtils - .getCertificatesInputStream(Mockito.any())) + .getCertificatesInputStream(Mockito.any())) .thenReturn(mock); - + keyStoreMockedStatic.when(() -> ClientAuthenticationUtils + .getDefaultKeyStoreInstance(Mockito.any())) + .thenReturn(keyStoreMock); clientAuthenticationUtilsMockedStatic .when(ClientAuthenticationUtils::getKeystoreInputStream) .thenCallRealMethod(); From a4a3d3f4825eb025657121e70c9d86e8d6ecff35 Mon Sep 17 00:00:00 2001 From: panbingkun Date: Fri, 22 Dec 2023 23:17:58 +0800 Subject: [PATCH 10/31] GH-39265: [Java] Make it run well with the netty newest version 4.1.104 (#39266) ### Describe the enhancement requested When I used `netty arrow memory 14.0.1` and `netty 4.1.104.Final` in Spark, the following error occurred, After pr: https://github.com/netty/netty/pull/13613, `PoolArena` no longer extends `SizeClasses`, but instead uses it as one of its fields, as follows: image in order to ensure that `netty arrow memory 14.0.1` works well with `netty 4.1.104.Final` version, I suggest making similar modifications here. 1.Compilation errors are as follows: https://github.com/panbingkun/spark/actions/runs/7237466030/job/19717162391 image 2.Some bugs have been fixed in `netty 4.1.104.Final` as follows: image image 4.1.104.Final release note: https://netty.io/news/2023/12/15/4-1-104-Final.html 4.1.103.Final release note: https://netty.io/news/2023/12/13/4-1-103-Final.html 4.1.101.Final release note: https://netty.io/news/2023/11/09/4-1-101-Final.html ### Component(s) Java * Closes: #39265 Authored-by: panbingkun Signed-off-by: David Li --- .../main/java/io/netty/buffer/PooledByteBufAllocatorL.java | 7 ++----- java/pom.xml | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java b/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java index 06c6669cfd162..ba9aba353c351 100644 --- a/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java +++ b/java/memory/memory-netty/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java @@ -71,7 +71,7 @@ public UnsafeDirectLittleEndian allocate(long size) { } public int getChunkSize() { - return allocator.chunkSize; + return allocator.chunkSize(); } public long getHugeBufferSize() { @@ -137,7 +137,6 @@ private class InnerAllocator extends PooledByteBufAllocator { private final PoolArena[] directArenas; private final MemoryStatusThread statusThread; - private final int chunkSize; public InnerAllocator() { super(true); @@ -150,8 +149,6 @@ public InnerAllocator() { throw new RuntimeException("Failure while initializing allocator. Unable to retrieve direct arenas field.", e); } - this.chunkSize = directArenas[0].chunkSize; - if (memoryLogger.isTraceEnabled()) { statusThread = new MemoryStatusThread(this); statusThread.start(); @@ -166,7 +163,7 @@ private UnsafeDirectLittleEndian newDirectBufferL(int initialCapacity, int maxCa if (directArena != null) { - if (initialCapacity > directArena.chunkSize) { + if (initialCapacity > chunkSize()) { // This is beyond chunk size so we'll allocate separately. 
ByteBuf buf = UnpooledByteBufAllocator.DEFAULT.directBuffer(initialCapacity, maxCapacity); diff --git a/java/pom.xml b/java/pom.xml index 75e0946f10811..4cca5e7245f0f 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -33,7 +33,7 @@ 5.10.1 2.0.9 32.1.3-jre - 4.1.100.Final + 4.1.104.Final 1.60.0 3.23.1 2.16.0 From 87971df049c09671bae8a207fe2b29704fe21e8d Mon Sep 17 00:00:00 2001 From: John Garland Date: Sat, 23 Dec 2023 04:27:20 +1100 Subject: [PATCH 11/31] GH-39335: [C#] Support creating FlightClient with Grpc.Core.Channel (#39348) as well as Grpc.Net.Client.GrpcChannel by changing our constructor arg to Grpc.Core.ChannelBase which both classes inherit from. ### Rationale for this change ### What changes are included in this PR? Changing the constructor of C#'s Flight Client to take in a ChannelBase which allows for multiple implementations of gRPC channels to be passed in. ### Are these changes tested? Existing tests already cover the use but have also manually tested in a separate app ( ### Are there any user-facing changes? No as we're just changing the constructor to take in a parent/base class instead. * Closes: #39335 Authored-by: John Garland Signed-off-by: Curt Hagenlocher --- csharp/src/Apache.Arrow.Flight/Client/FlightClient.cs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/csharp/src/Apache.Arrow.Flight/Client/FlightClient.cs b/csharp/src/Apache.Arrow.Flight/Client/FlightClient.cs index 5dc0d1b434b6d..a7c459935c240 100644 --- a/csharp/src/Apache.Arrow.Flight/Client/FlightClient.cs +++ b/csharp/src/Apache.Arrow.Flight/Client/FlightClient.cs @@ -16,10 +16,8 @@ using System.Threading.Tasks; using Apache.Arrow.Flight.Internal; using Apache.Arrow.Flight.Protocol; -using Apache.Arrow.Flight.Server; using Apache.Arrow.Flight.Server.Internal; using Grpc.Core; -using Grpc.Net.Client; namespace Apache.Arrow.Flight.Client { @@ -29,7 +27,7 @@ public class FlightClient private readonly FlightService.FlightServiceClient _client; - public FlightClient(GrpcChannel grpcChannel) + public FlightClient(ChannelBase grpcChannel) { _client = new FlightService.FlightServiceClient(grpcChannel); } From 7b71156d99557168d46292c010f82b812947ffb8 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 22 Dec 2023 17:02:31 -0400 Subject: [PATCH 12/31] GH-39138: [R] Fix implicit conversion warnings (#39250) ### Rationale for this change We have failing CRAN checks because this warning occurs on one check machine. ### What changes are included in this PR? Implicit integer casts are made explicit and/or variable declarations were fixed so that fewer implicit integer casts were performed. Fully solving the warnings also requires https://github.com/r-lib/cpp11/pull/349 since some errors occur in those headers. ### Are these changes tested? This particular test we can't do on CI because the MacOS runner we have doesn't have a new enough `clang` to support the requisite `-W` flags. I tested this locally by adding `PKG_CXXFLAGS=-Wconversion -Wno-sign-conversion -Wsign-compare -Werror` to `Makevars.in`. ### Are there any user-facing changes? 
No * Closes: #39138 Authored-by: Dewey Dunnington Signed-off-by: Dewey Dunnington --- r/src/altrep.cpp | 56 +++++++++++++++++++---------- r/src/array.cpp | 18 ++++++---- r/src/array_to_vector.cpp | 14 ++++---- r/src/arraydata.cpp | 12 +++---- r/src/arrowExports.cpp | 76 +++++++++++++++++++-------------------- r/src/arrow_cpp11.h | 14 +++++++- r/src/arrow_types.h | 4 +-- r/src/chunkedarray.cpp | 5 ++- r/src/compression.cpp | 2 +- r/src/compute.cpp | 15 ++++---- r/src/dataset.cpp | 4 +-- r/src/datatype.cpp | 2 +- r/src/io.cpp | 11 ++++-- r/src/message.cpp | 4 +-- r/src/r_to_arrow.cpp | 18 +++++----- r/src/recordbatch.cpp | 14 ++++---- r/src/schema.cpp | 4 +-- r/src/table.cpp | 16 ++++----- 18 files changed, 165 insertions(+), 124 deletions(-) diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index 9745393d01bbc..bdaac0a9ce5d2 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -275,7 +275,8 @@ struct AltrepVectorPrimitive : public AltrepVectorBase(R_ExternalPtrAddr(R_altrep_data1(alt))); auto resolve = altrep_data->locate(i); - const auto& array = altrep_data->chunked_array()->chunk(resolve.chunk_index); + const auto& array = + altrep_data->chunked_array()->chunk(static_cast(resolve.chunk_index)); auto j = resolve.index_in_chunk; return array->IsNull(j) ? cpp11::na() @@ -466,10 +467,10 @@ struct AltrepFactor : public AltrepVectorBase { std::unique_ptr unifier_ = ValueOrStop(DictionaryUnifier::Make(arr_type.value_type())); - size_t n_arrays = chunked_array->num_chunks(); + int n_arrays = chunked_array->num_chunks(); BufferVector arrays_transpose(n_arrays); - for (size_t i = 0; i < n_arrays; i++) { + for (int i = 0; i < n_arrays; i++) { const auto& dict_i = *internal::checked_cast(*chunked_array->chunk(i)) .dictionary(); @@ -559,17 +560,14 @@ struct AltrepFactor : public AltrepVectorBase { return dup; } - // The value at position i - static int Elt(SEXP alt, R_xlen_t i) { - if (Base::IsMaterialized(alt)) { - return INTEGER_ELT(Representation(alt), i); - } - + // The value at position i as an int64_t (to make bounds checking less verbose) + static int64_t Elt64(SEXP alt, R_xlen_t i) { auto altrep_data = reinterpret_cast(R_ExternalPtrAddr(R_altrep_data1(alt))); auto resolve = altrep_data->locate(i); - const auto& array = altrep_data->chunked_array()->chunk(resolve.chunk_index); + const auto& array = + altrep_data->chunked_array()->chunk(static_cast(resolve.chunk_index)); auto j = resolve.index_in_chunk; if (!array->IsNull(j)) { @@ -578,7 +576,7 @@ struct AltrepFactor : public AltrepVectorBase { if (WasUnified(alt)) { const auto* transpose_data = reinterpret_cast( - GetArrayTransposed(alt, resolve.chunk_index)->data()); + GetArrayTransposed(alt, static_cast(resolve.chunk_index))->data()); switch (indices->type_id()) { case Type::UINT8: @@ -617,7 +615,7 @@ struct AltrepFactor : public AltrepVectorBase { case Type::INT64: return indices->data()->GetValues(1)[j] + 1; case Type::UINT64: - return indices->data()->GetValues(1)[j] + 1; + return static_cast(indices->data()->GetValues(1)[j] + 1); default: break; } @@ -628,6 +626,18 @@ struct AltrepFactor : public AltrepVectorBase { return NA_INTEGER; } + // The value at position i as an int (which R needs because this is a factor) + static int Elt(SEXP alt, R_xlen_t i) { + if (Base::IsMaterialized(alt)) { + return INTEGER_ELT(Representation(alt), i); + } + + int64_t elt64 = Elt64(alt, i); + ARROW_R_DCHECK(elt64 == NA_INTEGER || elt64 >= 1); + ARROW_R_DCHECK(elt64 <= std::numeric_limits::max()); + return static_cast(elt64); + } + static R_xlen_t 
Get_region(SEXP alt, R_xlen_t start, R_xlen_t n, int* buf) { // If we have data2, we can just copy the region into buf // using the standard Get_region for this R type @@ -667,7 +677,7 @@ struct AltrepFactor : public AltrepVectorBase { // using the transpose data for this chunk const auto* transpose_data = reinterpret_cast(GetArrayTransposed(alt, j)->data()); - auto transpose = [transpose_data](int x) { return transpose_data[x]; }; + auto transpose = [transpose_data](int64_t x) { return transpose_data[x]; }; GetRegionDispatch(array, indices, transpose, out); @@ -677,7 +687,7 @@ struct AltrepFactor : public AltrepVectorBase { } else { // simpler case, identity transpose - auto transpose = [](int x) { return x; }; + auto transpose = [](int64_t x) { return static_cast(x); }; int* out = buf; for (const auto& array : slice->chunks()) { @@ -718,7 +728,13 @@ struct AltrepFactor : public AltrepVectorBase { VisitArraySpanInline( *array->data(), - /*valid_func=*/[&](index_type index) { *out++ = transpose(index) + 1; }, + /*valid_func=*/ + [&](index_type index) { + int64_t transposed = transpose(index) + 1; + ARROW_R_DCHECK(transposed >= 1); + ARROW_R_DCHECK(transposed <= std::numeric_limits::max()); + *out++ = static_cast(transposed); + }, /*null_func=*/[&]() { *out++ = cpp11::na(); }); } @@ -765,7 +781,8 @@ struct AltrepVectorString : public AltrepVectorBase> { bool no_nul = std::find(view_.begin(), view_.end(), '\0') == view_.end(); if (no_nul) { - return Rf_mkCharLenCE(view_.data(), view_.size(), CE_UTF8); + ARROW_R_DCHECK(view_.size() <= std::numeric_limits::max()); + return Rf_mkCharLenCE(view_.data(), static_cast(view_.size()), CE_UTF8); } else if (strip_out_nuls_) { return ConvertStripNul(); } else { @@ -802,7 +819,9 @@ struct AltrepVectorString : public AltrepVectorBase> { } nul_was_stripped_ = true; - return Rf_mkCharLenCE(stripped_string_.data(), stripped_len, CE_UTF8); + ARROW_R_DCHECK(stripped_len <= std::numeric_limits::max()); + return Rf_mkCharLenCE(stripped_string_.data(), static_cast(stripped_len), + CE_UTF8); } bool nul_was_stripped() const { return nul_was_stripped_; } @@ -847,7 +866,8 @@ struct AltrepVectorString : public AltrepVectorBase> { auto altrep_data = reinterpret_cast(R_ExternalPtrAddr(R_altrep_data1(alt))); auto resolve = altrep_data->locate(i); - const auto& array = altrep_data->chunked_array()->chunk(resolve.chunk_index); + const auto& array = + altrep_data->chunked_array()->chunk(static_cast(resolve.chunk_index)); auto j = resolve.index_in_chunk; SEXP s = NA_STRING; diff --git a/r/src/array.cpp b/r/src/array.cpp index ae76c01a94910..38406e494d67b 100644 --- a/r/src/array.cpp +++ b/r/src/array.cpp @@ -92,7 +92,7 @@ std::shared_ptr Array__Slice2(const std::shared_ptr& return array->Slice(offset, length); } -void arrow::r::validate_index(int i, int len) { +void arrow::r::validate_index(int64_t i, int64_t len) { if (i == NA_INTEGER) { cpp11::stop("'i' cannot be NA"); } @@ -119,10 +119,14 @@ r_vec_size Array__length(const std::shared_ptr& x) { } // [[arrow::export]] -int Array__offset(const std::shared_ptr& x) { return x->offset(); } +r_vec_size Array__offset(const std::shared_ptr& x) { + return r_vec_size(x->offset()); +} // [[arrow::export]] -int Array__null_count(const std::shared_ptr& x) { return x->null_count(); } +r_vec_size Array__null_count(const std::shared_ptr& x) { + return r_vec_size(x->null_count()); +} // [[arrow::export]] std::shared_ptr Array__type(const std::shared_ptr& x) { @@ -263,9 +267,9 @@ r_vec_size LargeListArray__value_length( } // 
[[arrow::export]] -r_vec_size FixedSizeListArray__value_length( +int FixedSizeListArray__value_length( const std::shared_ptr& array, int64_t i) { - return r_vec_size(array->value_length(i)); + return array->value_length(i); } // [[arrow::export]] @@ -294,10 +298,10 @@ cpp11::writable::integers ListArray__raw_value_offsets( } // [[arrow::export]] -cpp11::writable::integers LargeListArray__raw_value_offsets( +cpp11::writable::doubles LargeListArray__raw_value_offsets( const std::shared_ptr& array) { auto offsets = array->raw_value_offsets(); - return cpp11::writable::integers(offsets, offsets + array->length()); + return cpp11::writable::doubles(offsets, offsets + array->length()); } // [[arrow::export]] diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index bf026d2723a1a..2f0508eb7a47a 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -375,7 +375,7 @@ struct Converter_String : public Converter { private: static SEXP r_string_from_view(std::string_view view) { - return Rf_mkCharLenCE(view.data(), view.size(), CE_UTF8); + return Rf_mkCharLenCE(view.data(), static_cast(view.size()), CE_UTF8); } static SEXP r_string_from_view_strip_nul(std::string_view view, @@ -576,10 +576,10 @@ class Converter_Dictionary : public Converter { const auto& arr_type = checked_cast(*chunked_array->type()); unifier_ = ValueOrStop(DictionaryUnifier::Make(arr_type.value_type())); - size_t n_arrays = chunked_array->num_chunks(); + int n_arrays = chunked_array->num_chunks(); arrays_transpose_.resize(n_arrays); - for (size_t i = 0; i < n_arrays; i++) { + for (int i = 0; i < n_arrays; i++) { const auto& dict_i = *checked_cast(*chunked_array->chunk(i)).dictionary(); StopIfNotOk(unifier_->Unify(dict_i, &arrays_transpose_[i])); @@ -748,7 +748,7 @@ class Converter_Struct : public Converter { auto colnames = arrow::r::to_r_strings( type->fields(), [](const std::shared_ptr& field) { return field->name(); }); - out.attr(symbols::row_names) = arrow::r::short_row_names(n); + out.attr(symbols::row_names) = arrow::r::short_row_names(static_cast(n)); out.attr(R_NamesSymbol) = colnames; out.attr(R_ClassSymbol) = arrow::r::data::classes_tbl_df; @@ -756,7 +756,7 @@ class Converter_Struct : public Converter { } Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const { - int nf = converters.size(); + int nf = static_cast(converters.size()); for (int i = 0; i < nf; i++) { SEXP data_i = VECTOR_ELT(data, i); @@ -771,7 +771,7 @@ class Converter_Struct : public Converter { Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, R_xlen_t start, R_xlen_t n, size_t chunk_index) const { auto struct_array = checked_cast(array.get()); - int nf = converters.size(); + int nf = static_cast(converters.size()); // Flatten() deals with merging of nulls auto arrays = ValueOrStop(struct_array->Flatten(gc_memory_pool())); for (int i = 0; i < nf; i++) { @@ -1384,7 +1384,7 @@ cpp11::writable::list to_data_frame(const std::shared_ptr& data, tbl.attr(R_NamesSymbol) = names; tbl.attr(R_ClassSymbol) = arrow::r::data::classes_tbl_df; - tbl.attr(R_RowNamesSymbol) = arrow::r::short_row_names(nr); + tbl.attr(R_RowNamesSymbol) = arrow::r::short_row_names(static_cast(nr)); return tbl; } diff --git a/r/src/arraydata.cpp b/r/src/arraydata.cpp index cdab38f1147aa..d879e807323af 100644 --- a/r/src/arraydata.cpp +++ b/r/src/arraydata.cpp @@ -26,18 +26,18 @@ std::shared_ptr ArrayData__get_type( } // [[arrow::export]] -int ArrayData__get_length(const std::shared_ptr& x) { - return x->length; +r_vec_size 
ArrayData__get_length(const std::shared_ptr& x) { + return r_vec_size(x->length); } // [[arrow::export]] -int ArrayData__get_null_count(const std::shared_ptr& x) { - return x->null_count; +r_vec_size ArrayData__get_null_count(const std::shared_ptr& x) { + return r_vec_size(x->null_count); } // [[arrow::export]] -int ArrayData__get_offset(const std::shared_ptr& x) { - return x->offset; +r_vec_size ArrayData__get_offset(const std::shared_ptr& x) { + return r_vec_size(x->offset); } // [[arrow::export]] diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 790207efce1d2..75e0f27b4002e 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -110,7 +110,7 @@ BEGIN_CPP11 END_CPP11 } // array.cpp -int Array__offset(const std::shared_ptr& x); +r_vec_size Array__offset(const std::shared_ptr& x); extern "C" SEXP _arrow_Array__offset(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -118,7 +118,7 @@ BEGIN_CPP11 END_CPP11 } // array.cpp -int Array__null_count(const std::shared_ptr& x); +r_vec_size Array__null_count(const std::shared_ptr& x); extern "C" SEXP _arrow_Array__null_count(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -315,7 +315,7 @@ BEGIN_CPP11 END_CPP11 } // array.cpp -r_vec_size FixedSizeListArray__value_length(const std::shared_ptr& array, int64_t i); +int FixedSizeListArray__value_length(const std::shared_ptr& array, int64_t i); extern "C" SEXP _arrow_FixedSizeListArray__value_length(SEXP array_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type array(array_sexp); @@ -359,7 +359,7 @@ BEGIN_CPP11 END_CPP11 } // array.cpp -cpp11::writable::integers LargeListArray__raw_value_offsets(const std::shared_ptr& array); +cpp11::writable::doubles LargeListArray__raw_value_offsets(const std::shared_ptr& array); extern "C" SEXP _arrow_LargeListArray__raw_value_offsets(SEXP array_sexp){ BEGIN_CPP11 arrow::r::Input&>::type array(array_sexp); @@ -467,7 +467,7 @@ BEGIN_CPP11 END_CPP11 } // arraydata.cpp -int ArrayData__get_length(const std::shared_ptr& x); +r_vec_size ArrayData__get_length(const std::shared_ptr& x); extern "C" SEXP _arrow_ArrayData__get_length(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -475,7 +475,7 @@ BEGIN_CPP11 END_CPP11 } // arraydata.cpp -int ArrayData__get_null_count(const std::shared_ptr& x); +r_vec_size ArrayData__get_null_count(const std::shared_ptr& x); extern "C" SEXP _arrow_ArrayData__get_null_count(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -483,7 +483,7 @@ BEGIN_CPP11 END_CPP11 } // arraydata.cpp -int ArrayData__get_offset(const std::shared_ptr& x); +r_vec_size ArrayData__get_offset(const std::shared_ptr& x); extern "C" SEXP _arrow_ArrayData__get_offset(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -765,7 +765,7 @@ BEGIN_CPP11 END_CPP11 } // chunkedarray.cpp -r_vec_size ChunkedArray__num_chunks(const std::shared_ptr& chunked_array); +int ChunkedArray__num_chunks(const std::shared_ptr& chunked_array); extern "C" SEXP _arrow_ChunkedArray__num_chunks(SEXP chunked_array_sexp){ BEGIN_CPP11 arrow::r::Input&>::type chunked_array(chunked_array_sexp); @@ -869,11 +869,11 @@ BEGIN_CPP11 END_CPP11 } // compression.cpp -std::shared_ptr util___Codec__Create(arrow::Compression::type codec, R_xlen_t compression_level); +std::shared_ptr util___Codec__Create(arrow::Compression::type codec, int compression_level); extern "C" SEXP _arrow_util___Codec__Create(SEXP codec_sexp, SEXP compression_level_sexp){ BEGIN_CPP11 arrow::r::Input::type codec(codec_sexp); - 
arrow::r::Input::type compression_level(compression_level_sexp); + arrow::r::Input::type compression_level(compression_level_sexp); return cpp11::as_sexp(util___Codec__Create(codec, compression_level)); END_CPP11 } @@ -2024,14 +2024,14 @@ extern "C" SEXP _arrow_dataset___JsonFragmentScanOptions__Make(SEXP parse_option // dataset.cpp #if defined(ARROW_R_WITH_DATASET) -std::shared_ptr dataset___ParquetFragmentScanOptions__Make(bool use_buffered_stream, int64_t buffer_size, bool pre_buffer, int64_t thrift_string_size_limit, int64_t thrift_container_size_limit); +std::shared_ptr dataset___ParquetFragmentScanOptions__Make(bool use_buffered_stream, int64_t buffer_size, bool pre_buffer, int32_t thrift_string_size_limit, int32_t thrift_container_size_limit); extern "C" SEXP _arrow_dataset___ParquetFragmentScanOptions__Make(SEXP use_buffered_stream_sexp, SEXP buffer_size_sexp, SEXP pre_buffer_sexp, SEXP thrift_string_size_limit_sexp, SEXP thrift_container_size_limit_sexp){ BEGIN_CPP11 arrow::r::Input::type use_buffered_stream(use_buffered_stream_sexp); arrow::r::Input::type buffer_size(buffer_size_sexp); arrow::r::Input::type pre_buffer(pre_buffer_sexp); - arrow::r::Input::type thrift_string_size_limit(thrift_string_size_limit_sexp); - arrow::r::Input::type thrift_container_size_limit(thrift_container_size_limit_sexp); + arrow::r::Input::type thrift_string_size_limit(thrift_string_size_limit_sexp); + arrow::r::Input::type thrift_container_size_limit(thrift_container_size_limit_sexp); return cpp11::as_sexp(dataset___ParquetFragmentScanOptions__Make(use_buffered_stream, buffer_size, pre_buffer, thrift_string_size_limit, thrift_container_size_limit)); END_CPP11 } @@ -2567,10 +2567,10 @@ BEGIN_CPP11 END_CPP11 } // datatype.cpp -std::shared_ptr FixedSizeBinary__initialize(R_xlen_t byte_width); +std::shared_ptr FixedSizeBinary__initialize(int32_t byte_width); extern "C" SEXP _arrow_FixedSizeBinary__initialize(SEXP byte_width_sexp){ BEGIN_CPP11 - arrow::r::Input::type byte_width(byte_width_sexp); + arrow::r::Input::type byte_width(byte_width_sexp); return cpp11::as_sexp(FixedSizeBinary__initialize(byte_width)); END_CPP11 } @@ -3976,7 +3976,7 @@ BEGIN_CPP11 END_CPP11 } // message.cpp -r_vec_size ipc___Message__Verify(const std::unique_ptr& message); +bool ipc___Message__Verify(const std::unique_ptr& message); extern "C" SEXP _arrow_ipc___Message__Verify(SEXP message_sexp){ BEGIN_CPP11 arrow::r::Input&>::type message(message_sexp); @@ -4684,7 +4684,7 @@ BEGIN_CPP11 END_CPP11 } // recordbatch.cpp -r_vec_size RecordBatch__num_columns(const std::shared_ptr& x); +int RecordBatch__num_columns(const std::shared_ptr& x); extern "C" SEXP _arrow_RecordBatch__num_columns(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -4734,11 +4734,11 @@ BEGIN_CPP11 END_CPP11 } // recordbatch.cpp -std::shared_ptr RecordBatch__column(const std::shared_ptr& batch, R_xlen_t i); +std::shared_ptr RecordBatch__column(const std::shared_ptr& batch, int i); extern "C" SEXP _arrow_RecordBatch__column(SEXP batch_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type batch(batch_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); return cpp11::as_sexp(RecordBatch__column(batch, i)); END_CPP11 } @@ -4771,42 +4771,42 @@ BEGIN_CPP11 END_CPP11 } // recordbatch.cpp -std::shared_ptr RecordBatch__AddColumn(const std::shared_ptr& batch, R_xlen_t i, const std::shared_ptr& field, const std::shared_ptr& column); +std::shared_ptr RecordBatch__AddColumn(const std::shared_ptr& batch, int i, const std::shared_ptr& 
field, const std::shared_ptr& column); extern "C" SEXP _arrow_RecordBatch__AddColumn(SEXP batch_sexp, SEXP i_sexp, SEXP field_sexp, SEXP column_sexp){ BEGIN_CPP11 arrow::r::Input&>::type batch(batch_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); arrow::r::Input&>::type field(field_sexp); arrow::r::Input&>::type column(column_sexp); return cpp11::as_sexp(RecordBatch__AddColumn(batch, i, field, column)); END_CPP11 } // recordbatch.cpp -std::shared_ptr RecordBatch__SetColumn(const std::shared_ptr& batch, R_xlen_t i, const std::shared_ptr& field, const std::shared_ptr& column); +std::shared_ptr RecordBatch__SetColumn(const std::shared_ptr& batch, int i, const std::shared_ptr& field, const std::shared_ptr& column); extern "C" SEXP _arrow_RecordBatch__SetColumn(SEXP batch_sexp, SEXP i_sexp, SEXP field_sexp, SEXP column_sexp){ BEGIN_CPP11 arrow::r::Input&>::type batch(batch_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); arrow::r::Input&>::type field(field_sexp); arrow::r::Input&>::type column(column_sexp); return cpp11::as_sexp(RecordBatch__SetColumn(batch, i, field, column)); END_CPP11 } // recordbatch.cpp -std::shared_ptr RecordBatch__RemoveColumn(const std::shared_ptr& batch, R_xlen_t i); +std::shared_ptr RecordBatch__RemoveColumn(const std::shared_ptr& batch, int i); extern "C" SEXP _arrow_RecordBatch__RemoveColumn(SEXP batch_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type batch(batch_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); return cpp11::as_sexp(RecordBatch__RemoveColumn(batch, i)); END_CPP11 } // recordbatch.cpp -std::string RecordBatch__column_name(const std::shared_ptr& batch, R_xlen_t i); +std::string RecordBatch__column_name(const std::shared_ptr& batch, int i); extern "C" SEXP _arrow_RecordBatch__column_name(SEXP batch_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type batch(batch_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); return cpp11::as_sexp(RecordBatch__column_name(batch, i)); END_CPP11 } @@ -5346,7 +5346,7 @@ BEGIN_CPP11 END_CPP11 } // table.cpp -r_vec_size Table__num_columns(const std::shared_ptr& x); +int Table__num_columns(const std::shared_ptr& x); extern "C" SEXP _arrow_Table__num_columns(SEXP x_sexp){ BEGIN_CPP11 arrow::r::Input&>::type x(x_sexp); @@ -5379,20 +5379,20 @@ BEGIN_CPP11 END_CPP11 } // table.cpp -std::shared_ptr Table__column(const std::shared_ptr& table, R_xlen_t i); +std::shared_ptr Table__column(const std::shared_ptr& table, int i); extern "C" SEXP _arrow_Table__column(SEXP table_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type table(table_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); return cpp11::as_sexp(Table__column(table, i)); END_CPP11 } // table.cpp -std::shared_ptr Table__field(const std::shared_ptr& table, R_xlen_t i); +std::shared_ptr Table__field(const std::shared_ptr& table, int i); extern "C" SEXP _arrow_Table__field(SEXP table_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type table(table_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); return cpp11::as_sexp(Table__field(table, i)); END_CPP11 } @@ -5476,31 +5476,31 @@ BEGIN_CPP11 END_CPP11 } // table.cpp -std::shared_ptr Table__RemoveColumn(const std::shared_ptr& table, R_xlen_t i); +std::shared_ptr Table__RemoveColumn(const std::shared_ptr& table, int i); extern "C" SEXP _arrow_Table__RemoveColumn(SEXP table_sexp, SEXP i_sexp){ BEGIN_CPP11 arrow::r::Input&>::type table(table_sexp); - 
arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); return cpp11::as_sexp(Table__RemoveColumn(table, i)); END_CPP11 } // table.cpp -std::shared_ptr Table__AddColumn(const std::shared_ptr& table, R_xlen_t i, const std::shared_ptr& field, const std::shared_ptr& column); +std::shared_ptr Table__AddColumn(const std::shared_ptr& table, int i, const std::shared_ptr& field, const std::shared_ptr& column); extern "C" SEXP _arrow_Table__AddColumn(SEXP table_sexp, SEXP i_sexp, SEXP field_sexp, SEXP column_sexp){ BEGIN_CPP11 arrow::r::Input&>::type table(table_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); arrow::r::Input&>::type field(field_sexp); arrow::r::Input&>::type column(column_sexp); return cpp11::as_sexp(Table__AddColumn(table, i, field, column)); END_CPP11 } // table.cpp -std::shared_ptr Table__SetColumn(const std::shared_ptr& table, R_xlen_t i, const std::shared_ptr& field, const std::shared_ptr& column); +std::shared_ptr Table__SetColumn(const std::shared_ptr& table, int i, const std::shared_ptr& field, const std::shared_ptr& column); extern "C" SEXP _arrow_Table__SetColumn(SEXP table_sexp, SEXP i_sexp, SEXP field_sexp, SEXP column_sexp){ BEGIN_CPP11 arrow::r::Input&>::type table(table_sexp); - arrow::r::Input::type i(i_sexp); + arrow::r::Input::type i(i_sexp); arrow::r::Input&>::type field(field_sexp); arrow::r::Input&>::type column(column_sexp); return cpp11::as_sexp(Table__SetColumn(table, i, field, column)); diff --git a/r/src/arrow_cpp11.h b/r/src/arrow_cpp11.h index d8c4b719d1d3e..ab60586628164 100644 --- a/r/src/arrow_cpp11.h +++ b/r/src/arrow_cpp11.h @@ -27,6 +27,18 @@ #include "./nameof.h" +// Simple dcheck that doesn't use assert (i.e., won't crash the R session) +// Condition this on our own debug flag to avoid this ending up in any CRAN +// checks. +#if defined(ARROW_R_DEBUG) +#define ARROW_R_DCHECK(EXPR) \ + do { \ + if (!(EXPR)) Rf_error("Failed DCHECK: %s evaluated to false", #EXPR); \ + } while (false) +#else +#define ARROW_R_DCHECK(EXPR) +#endif + // borrowed from enc package // because R does not make these macros available (i.e. 
from Defn.h) #define UTF8_MASK (1 << 3) @@ -465,7 +477,7 @@ inline SEXP as_sexp(r_vec_size size) { if (x > std::numeric_limits::max()) { return Rf_ScalarReal(x); } else { - return Rf_ScalarInteger(x); + return Rf_ScalarInteger(static_cast(x)); } } diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index fadc39c75fc06..05c8f6062dabb 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -189,13 +189,13 @@ void validate_slice_offset(R_xlen_t offset, int64_t len); void validate_slice_length(R_xlen_t length, int64_t available); -void validate_index(int i, int len); +void validate_index(int64_t i, int64_t len); template void TraverseDots(cpp11::list dots, int num_fields, Lambda lambda) { cpp11::strings names(dots.attr(R_NamesSymbol)); - for (R_xlen_t i = 0, j = 0; j < num_fields; i++) { + for (int i = 0, j = 0; j < num_fields; i++) { auto name_i = names[i]; if (name_i.size() == 0) { diff --git a/r/src/chunkedarray.cpp b/r/src/chunkedarray.cpp index 36884bb531b62..258013fc4da57 100644 --- a/r/src/chunkedarray.cpp +++ b/r/src/chunkedarray.cpp @@ -34,9 +34,8 @@ r_vec_size ChunkedArray__null_count( } // [[arrow::export]] -r_vec_size ChunkedArray__num_chunks( - const std::shared_ptr& chunked_array) { - return r_vec_size(chunked_array->num_chunks()); +int ChunkedArray__num_chunks(const std::shared_ptr& chunked_array) { + return chunked_array->num_chunks(); } // [[arrow::export]] diff --git a/r/src/compression.cpp b/r/src/compression.cpp index 148c6e14002f5..bc893afd8d28b 100644 --- a/r/src/compression.cpp +++ b/r/src/compression.cpp @@ -22,7 +22,7 @@ // [[arrow::export]] std::shared_ptr util___Codec__Create(arrow::Compression::type codec, - R_xlen_t compression_level) { + int compression_level) { return ValueOrStop(arrow::util::Codec::Create(codec, compression_level)); } diff --git a/r/src/compute.cpp b/r/src/compute.cpp index 87d1326ed3419..bd97e30005ca3 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -241,10 +241,10 @@ std::shared_ptr make_compute_options( interpolation); } if (!Rf_isNull(options["min_count"])) { - out->min_count = cpp11::as_cpp(options["min_count"]); + out->min_count = cpp11::as_cpp(options["min_count"]); } if (!Rf_isNull(options["skip_nulls"])) { - out->skip_nulls = cpp11::as_cpp(options["skip_nulls"]); + out->skip_nulls = cpp11::as_cpp(options["skip_nulls"]); } return out; } @@ -479,9 +479,9 @@ std::shared_ptr make_compute_options( func_name == "hash_stddev") { using Options = arrow::compute::VarianceOptions; auto out = std::make_shared(); - out->ddof = cpp11::as_cpp(options["ddof"]); + out->ddof = cpp11::as_cpp(options["ddof"]); if (!Rf_isNull(options["min_count"])) { - out->min_count = cpp11::as_cpp(options["min_count"]); + out->min_count = cpp11::as_cpp(options["min_count"]); } if (!Rf_isNull(options["skip_nulls"])) { out->skip_nulls = cpp11::as_cpp(options["skip_nulls"]); @@ -683,7 +683,7 @@ arrow::Status CallRScalarUDF(arrow::compute::KernelContext* context, } } - cpp11::sexp batch_length_sexp = cpp11::as_sexp(span.length); + cpp11::sexp batch_length_sexp = cpp11::as_sexp(static_cast(span.length)); std::shared_ptr output_type = result->type()->GetSharedPtr(); cpp11::sexp output_type_sexp = cpp11::to_r6(output_type); @@ -738,8 +738,7 @@ void RegisterScalarUDF(std::string name, cpp11::list func_sexp) { // Compute the Arity from the list of input kernels. We don't currently handle // variable numbers of arguments in a user-defined function. 
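The hunks in this patch keep repeating one shape: a value is held as `int64_t` (or `R_xlen_t`) for as long as possible and only narrowed to `int` behind an explicit, guarded cast. A minimal standalone sketch of that shape follows; it is not Arrow code, the function name `NarrowToInt` is illustrative, and a plain `assert` stands in for the patch's debug-only `ARROW_R_DCHECK` macro so the snippet compiles on its own:

```cpp
#include <cassert>
#include <cstdint>
#include <limits>

// Keep the wide value as long as possible; narrow only at the boundary,
// with the cast made explicit so -Wconversion has nothing to warn about.
int NarrowToInt(int64_t value) {
  assert(value >= std::numeric_limits<int>::min());
  assert(value <= std::numeric_limits<int>::max());
  return static_cast<int>(value);
}
```

The patch's `ARROW_R_DCHECK` deliberately avoids `assert` so a failed check raises an R error rather than aborting the session, and it compiles away entirely outside `ARROW_R_DEBUG` builds.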
- int64_t n_args = - cpp11::as_cpp>(in_type_r[0])->num_fields(); + int n_args = cpp11::as_cpp>(in_type_r[0])->num_fields(); for (R_xlen_t i = 1; i < n_kernels; i++) { auto in_types = cpp11::as_cpp>(in_type_r[i]); if (in_types->num_fields() != n_args) { @@ -767,7 +766,7 @@ void RegisterScalarUDF(std::string name, cpp11::list func_sexp) { cpp11::sexp out_type_func = out_type_r[i]; std::vector compute_in_types(in_types->num_fields()); - for (int64_t j = 0; j < in_types->num_fields(); j++) { + for (int j = 0; j < in_types->num_fields(); j++) { compute_in_types[j] = arrow::compute::InputType(in_types->field(j)->type()); } diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp index 83c430fb634d3..e53fc03bdb413 100644 --- a/r/src/dataset.cpp +++ b/r/src/dataset.cpp @@ -343,8 +343,8 @@ std::shared_ptr dataset___JsonFragmentScanOptions__ std::shared_ptr dataset___ParquetFragmentScanOptions__Make(bool use_buffered_stream, int64_t buffer_size, bool pre_buffer, - int64_t thrift_string_size_limit, - int64_t thrift_container_size_limit) { + int32_t thrift_string_size_limit, + int32_t thrift_container_size_limit) { auto options = std::make_shared(); if (use_buffered_stream) { options->reader_properties->enable_buffered_stream(); diff --git a/r/src/datatype.cpp b/r/src/datatype.cpp index f19ba92527157..2f2b89d658d91 100644 --- a/r/src/datatype.cpp +++ b/r/src/datatype.cpp @@ -201,7 +201,7 @@ std::shared_ptr DayTimeInterval__initialize() { } // [[arrow::export]] -std::shared_ptr FixedSizeBinary__initialize(R_xlen_t byte_width) { +std::shared_ptr FixedSizeBinary__initialize(int32_t byte_width) { if (byte_width == NA_INTEGER) { cpp11::stop("'byte_width' cannot be NA"); } diff --git a/r/src/io.cpp b/r/src/io.cpp index 321b1b17febc3..4d5ee31794ae8 100644 --- a/r/src/io.cpp +++ b/r/src/io.cpp @@ -253,11 +253,16 @@ class RConnectionFileInterface : public virtual arrow::io::FileInterface { return arrow::Status::IOError("R connection is closed"); } + if (nbytes > std::numeric_limits::max()) { + return arrow::Status::Invalid( + "Can't read more than INT_MAX bytes from an R connection"); + } + return SafeCallIntoR( [&] { cpp11::function read_bin = cpp11::package("base")["readBin"]; cpp11::writable::raws ptype((R_xlen_t)0); - cpp11::integers n = cpp11::as_sexp(nbytes); + cpp11::integers n = cpp11::as_sexp(static_cast(nbytes)); cpp11::sexp result = read_bin(connection_sexp_, ptype, n); @@ -512,8 +517,8 @@ struct ReencodeUTF8TransformFunctionWrapper { // UTF-16, and UTF-32. while (in_bytes_left > 0) { // Make enough place in the output to hopefully consume all of the input. 
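The I/O hunk above shows the other half of that pattern: when the width of the value is decided by the caller (here a read size handed to R's `readBin()`), a debug-only check is not enough, so the patch returns an error `Status` instead of truncating. A reduced sketch of just that guard; only `arrow::Status` from the Arrow C++ API is assumed, and the function name `CheckFitsInInt32` is illustrative:

```cpp
#include <cstdint>
#include <limits>

#include <arrow/status.h>

// Reject sizes that cannot be represented by the 32-bit parameter the
// downstream API accepts, rather than narrowing them silently.
arrow::Status CheckFitsInInt32(int64_t nbytes) {
  if (nbytes > std::numeric_limits<int32_t>::max()) {
    return arrow::Status::Invalid(
        "Can't read more than INT_MAX bytes from an R connection");
  }
  return arrow::Status::OK();
}
```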
- RETURN_NOT_OK( - builder.Reserve(std::max(in_bytes_left * kOversizeFactor, 4))); + RETURN_NOT_OK(builder.Reserve( + std::max(static_cast(in_bytes_left * kOversizeFactor), 4))); out_buf = builder.mutable_data() + builder.length(); out_bytes_left = builder.capacity() - builder.length(); diff --git a/r/src/message.cpp b/r/src/message.cpp index d9832ddc22a74..3f21873fea3b2 100644 --- a/r/src/message.cpp +++ b/r/src/message.cpp @@ -39,8 +39,8 @@ std::shared_ptr ipc___Message__body( } // [[arrow::export]] -r_vec_size ipc___Message__Verify(const std::unique_ptr& message) { - return r_vec_size(message->Verify()); +bool ipc___Message__Verify(const std::unique_ptr& message) { + return message->Verify(); } // [[arrow::export]] diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index d9bf848e24292..d2db11e14a787 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -335,7 +335,7 @@ struct RConvert { template static enable_if_integer> Convert(Type*, From from) { - return CIntFromRScalarImpl(from); + return CIntFromRScalarImpl(static_cast(from)); } // ---- convert R integer types to double @@ -461,7 +461,7 @@ class RPrimitiveConverter< if (std::is_same::value) { auto append_value = [this](r_value_type value) { - this->primitive_builder_->UnsafeAppend(value); + this->primitive_builder_->UnsafeAppend(static_cast(value)); return Status::OK(); }; return VisitVector(it, size, append_null, append_value); @@ -595,19 +595,21 @@ class RPrimitiveConverter::value>> return VisitVector(it, size, append_null, append_value); } - static int FromRDate(const Date32Type*, int from) { return from; } + static int FromRDate(const Date32Type*, double from) { return static_cast(from); } - static int64_t FromRDate(const Date64Type*, int from) { + static int64_t FromRDate(const Date64Type*, double from) { constexpr int64_t kMilliSecondsPerDay = 86400000; - return from * kMilliSecondsPerDay; + return static_cast(from * kMilliSecondsPerDay); } static int FromPosixct(const Date32Type*, double from) { constexpr int64_t kSecondsPerDay = 86400; - return from / kSecondsPerDay; + return static_cast(from / kSecondsPerDay); } - static int64_t FromPosixct(const Date64Type*, double from) { return from * 1000; } + static int64_t FromPosixct(const Date64Type*, double from) { + return static_cast(from * 1000); + } }; int64_t get_TimeUnit_multiplier(TimeUnit::type unit) { @@ -1081,7 +1083,7 @@ class RListConverter : public ListConverter { auto append_value = [this](SEXP value) { // TODO: if we decide that this can be run concurrently // we'll have to do vec_size() upfront - int n = arrow::r::vec_size(value); + R_xlen_t n = arrow::r::vec_size(value); RETURN_NOT_OK(this->list_builder_->ValidateOverflow(n)); RETURN_NOT_OK(this->list_builder_->Append()); diff --git a/r/src/recordbatch.cpp b/r/src/recordbatch.cpp index aca3a74fd81df..bf88e98ed1026 100644 --- a/r/src/recordbatch.cpp +++ b/r/src/recordbatch.cpp @@ -27,8 +27,8 @@ #include // [[arrow::export]] -r_vec_size RecordBatch__num_columns(const std::shared_ptr& x) { - return r_vec_size(x->num_columns()); +int RecordBatch__num_columns(const std::shared_ptr& x) { + return x->num_columns(); } // [[arrow::export]] @@ -80,7 +80,7 @@ cpp11::list RecordBatch__columns(const std::shared_ptr& batc // [[arrow::export]] std::shared_ptr RecordBatch__column( - const std::shared_ptr& batch, R_xlen_t i) { + const std::shared_ptr& batch, int i) { arrow::r::validate_index(i, batch->num_columns()); return batch->column(i); } @@ -106,7 +106,7 @@ bool RecordBatch__Equals(const std::shared_ptr& 
self, // [[arrow::export]] std::shared_ptr RecordBatch__AddColumn( - const std::shared_ptr& batch, R_xlen_t i, + const std::shared_ptr& batch, int i, const std::shared_ptr& field, const std::shared_ptr& column) { return ValueOrStop(batch->AddColumn(i, field, column)); @@ -114,7 +114,7 @@ std::shared_ptr RecordBatch__AddColumn( // [[arrow::export]] std::shared_ptr RecordBatch__SetColumn( - const std::shared_ptr& batch, R_xlen_t i, + const std::shared_ptr& batch, int i, const std::shared_ptr& field, const std::shared_ptr& column) { return ValueOrStop(batch->SetColumn(i, field, column)); @@ -122,14 +122,14 @@ std::shared_ptr RecordBatch__SetColumn( // [[arrow::export]] std::shared_ptr RecordBatch__RemoveColumn( - const std::shared_ptr& batch, R_xlen_t i) { + const std::shared_ptr& batch, int i) { arrow::r::validate_index(i, batch->num_columns()); return ValueOrStop(batch->RemoveColumn(i)); } // [[arrow::export]] std::string RecordBatch__column_name(const std::shared_ptr& batch, - R_xlen_t i) { + int i) { arrow::r::validate_index(i, batch->num_columns()); return batch->column_name(i); } diff --git a/r/src/schema.cpp b/r/src/schema.cpp index cf959707305a7..41d3d38d2eda3 100644 --- a/r/src/schema.cpp +++ b/r/src/schema.cpp @@ -29,14 +29,14 @@ std::shared_ptr Schema__from_fields( // [[arrow::export]] std::shared_ptr Schema__from_list(cpp11::list field_list) { - int n = field_list.size(); + R_xlen_t n = field_list.size(); bool nullable = true; cpp11::strings names(field_list.attr(R_NamesSymbol)); std::vector> fields(n); - for (int i = 0; i < n; i++) { + for (R_xlen_t i = 0; i < n; i++) { fields[i] = arrow::field( names[i], cpp11::as_cpp>(field_list[i]), nullable); diff --git a/r/src/table.cpp b/r/src/table.cpp index 04537000f5d48..04a8c7caf24fd 100644 --- a/r/src/table.cpp +++ b/r/src/table.cpp @@ -23,8 +23,8 @@ #include // [[arrow::export]] -r_vec_size Table__num_columns(const std::shared_ptr& x) { - return r_vec_size(x->num_columns()); +int Table__num_columns(const std::shared_ptr& x) { + return x->num_columns(); } // [[arrow::export]] @@ -49,14 +49,14 @@ std::shared_ptr Table__ReplaceSchemaMetadata( // [[arrow::export]] std::shared_ptr Table__column( - const std::shared_ptr& table, R_xlen_t i) { + const std::shared_ptr& table, int i) { arrow::r::validate_index(i, table->num_columns()); return table->column(i); } // [[arrow::export]] std::shared_ptr Table__field(const std::shared_ptr& table, - R_xlen_t i) { + int i) { arrow::r::validate_index(i, table->num_columns()); return table->field(i); } @@ -123,13 +123,13 @@ std::shared_ptr Table__GetColumnByName( // [[arrow::export]] std::shared_ptr Table__RemoveColumn( - const std::shared_ptr& table, R_xlen_t i) { + const std::shared_ptr& table, int i) { return ValueOrStop(table->RemoveColumn(i)); } // [[arrow::export]] std::shared_ptr Table__AddColumn( - const std::shared_ptr& table, R_xlen_t i, + const std::shared_ptr& table, int i, const std::shared_ptr& field, const std::shared_ptr& column) { return ValueOrStop(table->AddColumn(i, field, column)); @@ -137,7 +137,7 @@ std::shared_ptr Table__AddColumn( // [[arrow::export]] std::shared_ptr Table__SetColumn( - const std::shared_ptr& table, R_xlen_t i, + const std::shared_ptr& table, int i, const std::shared_ptr& field, const std::shared_ptr& column) { return ValueOrStop(table->SetColumn(i, field, column)); @@ -241,7 +241,7 @@ arrow::Status AddMetadataFromDots(SEXP lst, int num_fields, // Remove metadata for ExtensionType columns, because these have their own mechanism for // preserving R type 
information - for (R_xlen_t i = 0; i < schema->num_fields(); i++) { + for (int i = 0; i < schema->num_fields(); i++) { if (schema->field(i)->type()->id() == Type::EXTENSION) { metadata_columns[i] = R_NilValue; } From d51954415882423584f2a95b0897aa4d073a4e1c Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 23 Dec 2023 15:03:47 +0000 Subject: [PATCH 13/31] GH-39320: [C++][FS][Azure] Add managed identity auth configuration (#39321) ### Rationale for this change Workload identity is a useful Azure authentication method. Also I failed to set the account_name correctly for a bunch of auths (I think this got lost in a rebase then I copy pasted the broken code). ### What changes are included in this PR? - Make filesystem initialisation fail if `account_name_.empty()`. This prevents the account name configuration bug we had. Also added a test asserting that filesystem initialization fails in this case. - Remove account name configuration on all auth configs, in favour of setting in separately from the auth configuration. - Implement `AzureOptions::ConfigureManagedIdentityCredential` ### Are these changes tested? Added a simple test initialising a filesystem using `ConfigureManagedIdentityCredential`. This is not the most comprehensive test but its the same as what we agreed on for https://github.com/apache/arrow/pull/39263. ### Are there any user-facing changes? Managed identity authentication is now supported. * Closes: #39320 Authored-by: Thomas Newton Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/filesystem/azurefs.cc | 38 ++++++++++++++++-------- cpp/src/arrow/filesystem/azurefs.h | 16 +++++----- cpp/src/arrow/filesystem/azurefs_test.cc | 34 +++++++++++++++++---- 3 files changed, 62 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 26c2761886050..21350a490411a 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -58,7 +58,7 @@ bool AzureOptions::Equals(const AzureOptions& other) const { blob_storage_scheme == other.blob_storage_scheme && dfs_storage_scheme == other.dfs_storage_scheme && default_metadata == other.default_metadata && - account_name_ == other.account_name_ && + account_name == other.account_name && credential_kind_ == other.credential_kind_; if (!equals) { return false; @@ -104,17 +104,17 @@ std::string AzureOptions::AccountDfsUrl(const std::string& account_name) const { return BuildBaseUrl(dfs_storage_scheme, dfs_storage_authority, account_name); } -Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_name, - const std::string& account_key) { +Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_key) { credential_kind_ = CredentialKind::kStorageSharedKeyCredential; - account_name_ = account_name; + if (account_name.empty()) { + return Status::Invalid("AzureOptions doesn't contain a valid account name"); + } storage_shared_key_credential_ = std::make_shared(account_name, account_key); return Status::OK(); } -Status AzureOptions::ConfigureClientSecretCredential(const std::string& account_name, - const std::string& tenant_id, +Status AzureOptions::ConfigureClientSecretCredential(const std::string& tenant_id, const std::string& client_id, const std::string& client_secret) { credential_kind_ = CredentialKind::kTokenCredential; @@ -123,14 +123,20 @@ Status AzureOptions::ConfigureClientSecretCredential(const std::string& account_ return Status::OK(); } -Status AzureOptions::ConfigureDefaultCredential(const 
std::string& account_name) { +Status AzureOptions::ConfigureDefaultCredential() { credential_kind_ = CredentialKind::kTokenCredential; token_credential_ = std::make_shared(); return Status::OK(); } -Status AzureOptions::ConfigureWorkloadIdentityCredential( - const std::string& account_name) { +Status AzureOptions::ConfigureManagedIdentityCredential(const std::string& client_id) { + credential_kind_ = CredentialKind::kTokenCredential; + token_credential_ = + std::make_shared(client_id); + return Status::OK(); +} + +Status AzureOptions::ConfigureWorkloadIdentityCredential() { credential_kind_ = CredentialKind::kTokenCredential; token_credential_ = std::make_shared(); return Status::OK(); @@ -138,14 +144,17 @@ Status AzureOptions::ConfigureWorkloadIdentityCredential( Result> AzureOptions::MakeBlobServiceClient() const { + if (account_name.empty()) { + return Status::Invalid("AzureOptions doesn't contain a valid account name"); + } switch (credential_kind_) { case CredentialKind::kAnonymous: break; case CredentialKind::kTokenCredential: - return std::make_unique(AccountBlobUrl(account_name_), + return std::make_unique(AccountBlobUrl(account_name), token_credential_); case CredentialKind::kStorageSharedKeyCredential: - return std::make_unique(AccountBlobUrl(account_name_), + return std::make_unique(AccountBlobUrl(account_name), storage_shared_key_credential_); } return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); @@ -153,15 +162,18 @@ Result> AzureOptions::MakeBlobServiceC Result> AzureOptions::MakeDataLakeServiceClient() const { + if (account_name.empty()) { + return Status::Invalid("AzureOptions doesn't contain a valid account name"); + } switch (credential_kind_) { case CredentialKind::kAnonymous: break; case CredentialKind::kTokenCredential: return std::make_unique( - AccountDfsUrl(account_name_), token_credential_); + AccountDfsUrl(account_name), token_credential_); case CredentialKind::kStorageSharedKeyCredential: return std::make_unique( - AccountDfsUrl(account_name_), storage_shared_key_credential_); + AccountDfsUrl(account_name), storage_shared_key_credential_); } return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); } diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 346dd349e935c..78e0a8148c616 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -48,6 +48,9 @@ class TestAzureFileSystem; /// Options for the AzureFileSystem implementation. struct ARROW_EXPORT AzureOptions { + /// \brief account name of the Azure Storage account. + std::string account_name; + /// \brief hostname[:port] of the Azure Blob Storage Service. 
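Taken together with the test changes further down, the new option surface looks roughly like the following sketch: the account name is set as a plain field and the credential is configured separately. The header path, the `arrow::fs` namespace, and the helper name `MakeAzureFileSystem` are assumptions for illustration; error handling is reduced to macro propagation:

```cpp
#include <memory>

#include <arrow/filesystem/azurefs.h>
#include <arrow/result.h>

arrow::Result<std::shared_ptr<arrow::fs::AzureFileSystem>> MakeAzureFileSystem() {
  arrow::fs::AzureOptions options;
  options.account_name = "dummy-account-name";  // now set up front, as a plain field
  // Pick one auth method; pass a client id to target a specific managed identity.
  ARROW_RETURN_NOT_OK(options.ConfigureManagedIdentityCredential());
  return arrow::fs::AzureFileSystem::Make(options);
}
```

With the account name split out of the `Configure*` calls, a missing name is caught when the filesystem is built, which is exactly what the new `InitializingFilesystemWithoutAccountNameFails` test asserts.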
/// /// If the hostname is a relative domain name (one that starts with a '.'), then storage @@ -94,7 +97,6 @@ struct ARROW_EXPORT AzureOptions { kStorageSharedKeyCredential, } credential_kind_ = CredentialKind::kAnonymous; - std::string account_name_; std::shared_ptr token_credential_; std::shared_ptr storage_shared_key_credential_; @@ -103,15 +105,15 @@ struct ARROW_EXPORT AzureOptions { AzureOptions(); ~AzureOptions(); - Status ConfigureDefaultCredential(const std::string& account_name); + Status ConfigureDefaultCredential(); + + Status ConfigureManagedIdentityCredential(const std::string& client_id = std::string()); - Status ConfigureWorkloadIdentityCredential(const std::string& account_name); + Status ConfigureWorkloadIdentityCredential(); - Status ConfigureAccountKeyCredential(const std::string& account_name, - const std::string& account_key); + Status ConfigureAccountKeyCredential(const std::string& account_key); - Status ConfigureClientSecretCredential(const std::string& account_name, - const std::string& tenant_id, + Status ConfigureClientSecretCredential(const std::string& tenant_id, const std::string& client_id, const std::string& client_secret); diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 62c5ef2232045..f6af9f722dbac 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -271,22 +271,44 @@ class AzureHierarchicalNSEnv : public AzureEnvImpl { bool WithHierarchicalNamespace() const final { return true; } }; +TEST(AzureFileSystem, InitializingFilesystemWithoutAccountNameFails) { + AzureOptions options; + ASSERT_RAISES(Invalid, options.ConfigureAccountKeyCredential("account_key")); + + ARROW_EXPECT_OK( + options.ConfigureClientSecretCredential("tenant_id", "client_id", "client_secret")); + ASSERT_RAISES(Invalid, AzureFileSystem::Make(options)); +} + TEST(AzureFileSystem, InitializeFilesystemWithClientSecretCredential) { AzureOptions options; - ARROW_EXPECT_OK(options.ConfigureClientSecretCredential( - "dummy-account-name", "tenant_id", "client_id", "client_secret")); + options.account_name = "dummy-account-name"; + ARROW_EXPECT_OK( + options.ConfigureClientSecretCredential("tenant_id", "client_id", "client_secret")); EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); } TEST(AzureFileSystem, InitializeFilesystemWithDefaultCredential) { AzureOptions options; - ARROW_EXPECT_OK(options.ConfigureDefaultCredential("dummy-account-name")); + options.account_name = "dummy-account-name"; + ARROW_EXPECT_OK(options.ConfigureDefaultCredential()); EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); } +TEST(AzureFileSystem, InitializeFilesystemWithManagedIdentityCredential) { + AzureOptions options; + options.account_name = "dummy-account-name"; + ARROW_EXPECT_OK(options.ConfigureManagedIdentityCredential()); + EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); + + ARROW_EXPECT_OK(options.ConfigureManagedIdentityCredential("specific-client-id")); + EXPECT_OK_AND_ASSIGN(fs, AzureFileSystem::Make(options)); +} + TEST(AzureFileSystem, InitializeFilesystemWithWorkloadIdentityCredential) { AzureOptions options; - ARROW_EXPECT_OK(options.ConfigureWorkloadIdentityCredential("dummy-account-name")); + options.account_name = "dummy-account-name"; + ARROW_EXPECT_OK(options.ConfigureWorkloadIdentityCredential()); EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); } @@ -383,6 +405,7 @@ class TestAzureFileSystem : public ::testing::Test { static Result 
MakeOptions(BaseAzureEnv* env) { AzureOptions options; + options.account_name = env->account_name(); switch (env->backend()) { case AzureBackend::kAzurite: options.blob_storage_authority = "127.0.0.1:10000"; @@ -394,8 +417,7 @@ class TestAzureFileSystem : public ::testing::Test { // Use the default values break; } - ARROW_EXPECT_OK( - options.ConfigureAccountKeyCredential(env->account_name(), env->account_key())); + ARROW_EXPECT_OK(options.ConfigureAccountKeyCredential(env->account_key())); return options; } From ec41209ea02bdb410bc7e049cb3100afedf4ba2f Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Sat, 23 Dec 2023 23:50:39 +0800 Subject: [PATCH 14/31] GH-37055: [C++] Optimize hash kernels for Dictionary ChunkedArrays (#38394) ### Rationale for this change When merging dictionaries across chunks, the hash kernels unnecessarily unify the existing dictionary, dragging down the performance. ### What changes are included in this PR? Reuse the dictionary unifier across chunks. ### Are these changes tested? Yes, with a new benchmark for dictionary chunked arrays. ### Are there any user-facing changes? No. * Closes: #37055 Lead-authored-by: Jin Shang Co-authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/compute/kernels/vector_hash.cc | 55 +++++++++++++------ .../compute/kernels/vector_hash_benchmark.cc | 36 ++++++++++++ 2 files changed, 74 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index 65e59d1a2eb14..800deba3a5ed2 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -26,17 +26,20 @@ #include "arrow/array/concatenate.h" #include "arrow/array/dict_internal.h" #include "arrow/array/util.h" +#include "arrow/buffer.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/cast.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" #include "arrow/util/hashing.h" +#include "arrow/util/int_util.h" #include "arrow/util/unreachable.h" namespace arrow { using internal::DictionaryTraits; using internal::HashTraits; +using internal::TransposeInts; namespace compute { namespace internal { @@ -448,9 +451,9 @@ class DictionaryHashKernel : public HashKernel { Status Append(const ArraySpan& arr) override { auto arr_dict = arr.dictionary().ToArray(); - if (!dictionary_) { - dictionary_ = arr_dict; - } else if (!dictionary_->Equals(*arr_dict)) { + if (!first_dictionary_) { + first_dictionary_ = arr_dict; + } else if (!first_dictionary_->Equals(*arr_dict)) { // NOTE: This approach computes a new dictionary unification per chunk. // This is in effect O(n*k) where n is the total chunked array length and // k is the number of chunks (therefore O(n**2) if chunks have a fixed size). @@ -458,21 +461,23 @@ class DictionaryHashKernel : public HashKernel { // A better approach may be to run the kernel over each individual chunk, // and then hash-aggregate all results (for example sum-group-by for // the "value_counts" kernel). 
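For readers skimming the diff, the reuse introduced here boils down to keeping a single unifier alive across chunks: each incoming dictionary is folded into it once, only the per-chunk transpose map is recomputed, and the unified dictionary is materialized a single time at the end. Below is a self-contained sketch of that flow using only the `DictionaryUnifier` calls visible in this hunk; the helper name `UnifyChunkDictionaries` is illustrative and the index transposition step is left as a comment:

```cpp
#include <memory>
#include <vector>

#include <arrow/api.h>

// `dictionaries` stands in for the per-chunk dictionaries of a chunked,
// dictionary-encoded array (assumed non-empty, all sharing one value type).
arrow::Result<std::shared_ptr<arrow::Array>> UnifyChunkDictionaries(
    const std::vector<std::shared_ptr<arrow::Array>>& dictionaries) {
  ARROW_ASSIGN_OR_RAISE(std::unique_ptr<arrow::DictionaryUnifier> unifier,
                        arrow::DictionaryUnifier::Make(dictionaries[0]->type()));
  for (const auto& dict : dictionaries) {
    std::shared_ptr<arrow::Buffer> transpose_map;  // old index -> unified index
    ARROW_RETURN_NOT_OK(unifier->Unify(*dict, &transpose_map));
    // ... a real kernel would transpose this chunk's indices here ...
  }
  std::shared_ptr<arrow::DataType> out_type = dictionaries[0]->type();
  std::shared_ptr<arrow::Array> unified;
  ARROW_RETURN_NOT_OK(unifier->GetResult(&out_type, &unified));  // once, at the end
  return unified;
}
```

This is also why the `dictionary()` accessor below can only be consulted once: as the new comment on it notes, `DictionaryUnifier::GetResult()` cannot be called twice and produce the same output.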
- auto out_dict_type = dictionary_->type(); + if (dictionary_unifier_ == nullptr) { + ARROW_ASSIGN_OR_RAISE(dictionary_unifier_, + DictionaryUnifier::Make(first_dictionary_->type())); + RETURN_NOT_OK(dictionary_unifier_->Unify(*first_dictionary_)); + } + auto out_dict_type = first_dictionary_->type(); std::shared_ptr transpose_map; - std::shared_ptr out_dict; - ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(out_dict_type)); - ARROW_CHECK_OK(unifier->Unify(*dictionary_)); - ARROW_CHECK_OK(unifier->Unify(*arr_dict, &transpose_map)); - ARROW_CHECK_OK(unifier->GetResult(&out_dict_type, &out_dict)); + RETURN_NOT_OK(dictionary_unifier_->Unify(*arr_dict, &transpose_map)); - dictionary_ = out_dict; auto transpose = reinterpret_cast(transpose_map->data()); - auto in_dict_array = arr.ToArray(); + auto in_array = arr.ToArray(); + const auto& in_dict_array = + arrow::internal::checked_cast(*in_array); ARROW_ASSIGN_OR_RAISE( - auto tmp, arrow::internal::checked_cast(*in_dict_array) - .Transpose(arr.type->GetSharedPtr(), out_dict, transpose)); + auto tmp, in_dict_array.Transpose(arr.type->GetSharedPtr(), + in_dict_array.dictionary(), transpose)); return indices_kernel_->Append(*tmp->data()); } @@ -495,12 +500,27 @@ class DictionaryHashKernel : public HashKernel { return dictionary_value_type_; } - std::shared_ptr dictionary() const { return dictionary_; } + /// This can't be called more than once because DictionaryUnifier::GetResult() + /// can't be called more than once and produce the same output. + Result> dictionary() const { + if (!first_dictionary_) { // Append was never called + return nullptr; + } + if (!dictionary_unifier_) { // Append was called only once + return first_dictionary_; + } + + auto out_dict_type = first_dictionary_->type(); + std::shared_ptr out_dict; + RETURN_NOT_OK(dictionary_unifier_->GetResult(&out_dict_type, &out_dict)); + return out_dict; + } private: std::unique_ptr indices_kernel_; - std::shared_ptr dictionary_; + std::shared_ptr first_dictionary_; std::shared_ptr dictionary_value_type_; + std::unique_ptr dictionary_unifier_; }; // ---------------------------------------------------------------------- @@ -630,8 +650,9 @@ Status ValueCountsFinalize(KernelContext* ctx, std::vector* out) { // hence have no dictionary. 
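From the caller's side nothing changes; the new benchmark simply exercises the path where every chunk carries its own dictionary. A hedged usage sketch along the lines of that benchmark follows: the helper name `CountDictionaryValues` is illustrative, and it assumes at least one input chunk and that `ValueCounts` yields the documented `{values, counts}` struct array:

```cpp
#include <memory>
#include <vector>

#include <arrow/api.h>
#include <arrow/compute/api.h>

// Dictionary-encode each chunk independently (so each chunk gets its own
// dictionary), then run value_counts over the whole chunked array.
arrow::Result<std::shared_ptr<arrow::StructArray>> CountDictionaryValues(
    const std::vector<std::shared_ptr<arrow::Array>>& plain_chunks) {
  std::vector<std::shared_ptr<arrow::Array>> encoded;
  for (const auto& chunk : plain_chunks) {
    ARROW_ASSIGN_OR_RAISE(arrow::Datum dict_chunk,
                          arrow::compute::DictionaryEncode(chunk));
    encoded.push_back(dict_chunk.make_array());
  }
  auto chunked = std::make_shared<arrow::ChunkedArray>(std::move(encoded));
  return arrow::compute::ValueCounts(chunked);
}
```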
Result> EnsureHashDictionary(KernelContext* ctx, DictionaryHashKernel* hash) { - if (hash->dictionary()) { - return hash->dictionary()->data(); + ARROW_ASSIGN_OR_RAISE(auto dict, hash->dictionary()); + if (dict) { + return dict->data(); } ARROW_ASSIGN_OR_RAISE(auto null, MakeArrayOfNull(hash->dictionary_value_type(), /*length=*/0, ctx->memory_pool())); diff --git a/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc index e9548e133aa00..472f50db8cf92 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash_benchmark.cc @@ -25,6 +25,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/testing/util.h" +#include "arrow/util/logging.h" #include "arrow/compute/api.h" @@ -226,6 +227,33 @@ static void UniqueString100bytes(benchmark::State& state) { BenchUnique(state, HashParams{general_bench_cases[state.range(0)], 100}); } +template +void BenchValueCountsDictionaryChunks(benchmark::State& state, const ParamType& params) { + std::shared_ptr arr; + params.GenerateTestData(&arr); + // chunk arr to 100 slices + std::vector> chunks; + const int64_t chunk_size = arr->length() / 100; + for (int64_t i = 0; i < 100; ++i) { + auto slice = arr->Slice(i * chunk_size, chunk_size); + auto datum = DictionaryEncode(slice).ValueOrDie(); + ARROW_CHECK(datum.is_array()); + chunks.push_back(datum.make_array()); + } + auto chunked_array = std::make_shared(chunks); + + while (state.KeepRunning()) { + ABORT_NOT_OK(ValueCounts(chunked_array).status()); + } + params.SetMetadata(state); +} + +static void ValueCountsDictionaryChunks(benchmark::State& state) { + // Dictionary of byte strings with 10 bytes each + BenchValueCountsDictionaryChunks( + state, HashParams{general_bench_cases[state.range(0)], 10}); +} + void HashSetArgs(benchmark::internal::Benchmark* bench) { for (int i = 0; i < static_cast(general_bench_cases.size()); ++i) { bench->Arg(i); @@ -239,6 +267,14 @@ BENCHMARK(UniqueInt64)->Apply(HashSetArgs); BENCHMARK(UniqueString10bytes)->Apply(HashSetArgs); BENCHMARK(UniqueString100bytes)->Apply(HashSetArgs); +void DictionaryChunksHashSetArgs(benchmark::internal::Benchmark* bench) { + for (int i = 0; i < static_cast(general_bench_cases.size()); ++i) { + bench->Arg(i); + } +} + +BENCHMARK(ValueCountsDictionaryChunks)->Apply(DictionaryChunksHashSetArgs); + void UInt8SetArgs(benchmark::internal::Benchmark* bench) { for (int i = 0; i < static_cast(uint8_bench_cases.size()); ++i) { bench->Arg(i); From 90f7ecab559870dc862d34b5ac323c77c7050353 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Mon, 25 Dec 2023 05:23:17 -0500 Subject: [PATCH 15/31] GH-39017: [JS] Add `typeId` as attribute (#39018) ### Rationale for this change Support reconstructing `DataType` after `postMessage`. ### What changes are included in this PR? Make `typeId` an attribute, not a getter. ### Are these changes tested? Passes all existing tests. ### Are there any user-facing changes? 
No * Closes: #39017 --- js/src/type.ts | 70 ++++++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 42 deletions(-) diff --git a/js/src/type.ts b/js/src/type.ts index dea5301aed355..ae3aefa025999 100644 --- a/js/src/type.ts +++ b/js/src/type.ts @@ -79,7 +79,11 @@ export abstract class DataTypeType.NONE; } + declare public readonly typeId: TType; + + constructor(typeId: TType) { + this.typeId = typeId; + } protected static [Symbol.toStringTag] = ((proto: DataType) => { (proto).children = null; @@ -93,8 +97,10 @@ export abstract class DataType { TArray: void; TValue: null } /** @ignore */ export class Null extends DataType { + constructor() { + super(Type.Null); + } public toString() { return `Null`; } - public get typeId() { return Type.Null as Type.Null; } protected static [Symbol.toStringTag] = ((proto: Null) => proto[Symbol.toStringTag] = 'Null')(Null.prototype); } @@ -119,9 +125,8 @@ interface Int_ extends DataType { TArray: IType[T]['TA class Int_ extends DataType { constructor(public readonly isSigned: IType[T]['isSigned'], public readonly bitWidth: IType[T]['bitWidth']) { - super(); + super(Type.Int as T); } - public get typeId() { return Type.Int as T; } public get ArrayType() { switch (this.bitWidth) { case 8: return this.isSigned ? Int8Array : Uint8Array; @@ -206,9 +211,8 @@ export interface Float extends DataType { TArray: /** @ignore */ export class Float extends DataType { constructor(public readonly precision: Precision) { - super(); + super(Type.Float as T); } - public get typeId() { return Type.Float as T; } public get ArrayType(): TypedArrayConstructor { switch (this.precision) { case Precision.HALF: return Uint16Array; @@ -241,9 +245,8 @@ export interface Binary extends DataType { TArray: Uint8Array; TOff /** @ignore */ export class Binary extends DataType { constructor() { - super(); + super(Type.Binary); } - public get typeId() { return Type.Binary as Type.Binary; } public toString() { return `Binary`; } protected static [Symbol.toStringTag] = ((proto: Binary) => { (proto).ArrayType = Uint8Array; @@ -256,9 +259,8 @@ export interface LargeBinary extends DataType { TArray: Uint8A /** @ignore */ export class LargeBinary extends DataType { constructor() { - super(); + super(Type.LargeBinary); } - public get typeId() { return Type.LargeBinary as Type.LargeBinary; } public toString() { return `LargeBinary`; } protected static [Symbol.toStringTag] = ((proto: LargeBinary) => { (proto).ArrayType = Uint8Array; @@ -272,9 +274,8 @@ export interface Utf8 extends DataType { TArray: Uint8Array; TOffsetA /** @ignore */ export class Utf8 extends DataType { constructor() { - super(); + super(Type.Utf8); } - public get typeId() { return Type.Utf8 as Type.Utf8; } public toString() { return `Utf8`; } protected static [Symbol.toStringTag] = ((proto: Utf8) => { (proto).ArrayType = Uint8Array; @@ -287,9 +288,8 @@ export interface LargeUtf8 extends DataType { TArray: Uint8Array /** @ignore */ export class LargeUtf8 extends DataType { constructor() { - super(); + super(Type.LargeUtf8); } - public get typeId() { return Type.LargeUtf8 as Type.LargeUtf8; } public toString() { return `LargeUtf8`; } protected static [Symbol.toStringTag] = ((proto: LargeUtf8) => { (proto).ArrayType = Uint8Array; @@ -303,9 +303,8 @@ export interface Bool extends DataType { TArray: Uint8Array; TValue: /** @ignore */ export class Bool extends DataType { constructor() { - super(); + super(Type.Bool); } - public get typeId() { return Type.Bool as Type.Bool; } public toString() { return `Bool`; } 
protected static [Symbol.toStringTag] = ((proto: Bool) => { (proto).ArrayType = Uint8Array; @@ -320,9 +319,8 @@ export class Decimal extends DataType { constructor(public readonly scale: number, public readonly precision: number, public readonly bitWidth: number = 128) { - super(); + super(Type.Decimal); } - public get typeId() { return Type.Decimal as Type.Decimal; } public toString() { return `Decimal[${this.precision}e${this.scale > 0 ? `+` : ``}${this.scale}]`; } protected static [Symbol.toStringTag] = ((proto: Decimal) => { (proto).scale = null; @@ -339,9 +337,8 @@ export interface Date_ extends DataType { TArray: In /** @ignore */ export class Date_ extends DataType { constructor(public readonly unit: DateUnit) { - super(); + super(Type.Date as T); } - public get typeId() { return Type.Date as T; } public toString() { return `Date${(this.unit + 1) * 32}<${DateUnit[this.unit]}>`; } protected static [Symbol.toStringTag] = ((proto: Date_) => { (proto).unit = null; @@ -375,9 +372,8 @@ interface Time_ extends DataType { class Time_ extends DataType { constructor(public readonly unit: TimesType[T]['unit'], public readonly bitWidth: TimeBitWidth) { - super(); + super(Type.Time as T); } - public get typeId() { return Type.Time as T; } public toString() { return `Time${this.bitWidth}<${TimeUnit[this.unit]}>`; } public get ArrayType() { switch (this.bitWidth) { @@ -418,9 +414,8 @@ interface Timestamp_ extends DataType { class Timestamp_ extends DataType { constructor(public readonly unit: TimeUnit, public readonly timezone?: string | null) { - super(); + super(Type.Timestamp as T); } - public get typeId() { return Type.Timestamp as T; } public toString() { return `Timestamp<${TimeUnit[this.unit]}${this.timezone ? `, ${this.timezone}` : ``}>`; } protected static [Symbol.toStringTag] = ((proto: Timestamp_) => { (proto).unit = null; @@ -453,9 +448,8 @@ interface Interval_ extends DataType { /** @ignore */ class Interval_ extends DataType { constructor(public readonly unit: IntervalUnit) { - super(); + super(Type.Interval as T); } - public get typeId() { return Type.Interval as T; } public toString() { return `Interval<${IntervalUnit[this.unit]}>`; } protected static [Symbol.toStringTag] = ((proto: Interval_) => { (proto).unit = null; @@ -483,9 +477,8 @@ export interface Duration extends DataType { /** @ignore */ export class Duration extends DataType { constructor(public readonly unit: TimeUnit) { - super(); + super(Type.Duration as T); } - public get typeId() { return Type.Duration as T; } public toString() { return `Duration<${TimeUnit[this.unit]}>`; } protected static [Symbol.toStringTag] = ((proto: Duration) => { (proto).unit = null; @@ -513,11 +506,10 @@ export interface List extends DataType extends DataType { constructor(child: Field) { - super(); + super(Type.List); this.children = [child]; } public declare readonly children: Field[]; - public get typeId() { return Type.List as Type.List; } public toString() { return `List<${this.valueType}>`; } public get valueType(): T { return this.children[0].type as T; } public get valueField(): Field { return this.children[0] as Field; } @@ -540,10 +532,9 @@ export class Struct extends DataType { public declare _row: StructRow; public declare readonly children: Field[]; constructor(children: Field[]) { - super(); + super(Type.Struct); this.children = children; } - public get typeId() { return Type.Struct as Type.Struct; } public toString() { return `Struct<{${this.children.map((f) => `${f.name}:${f.type}`).join(`, `)}}>`; } protected static 
[Symbol.toStringTag] = ((proto: Struct) => { (proto).children = null; @@ -564,13 +555,12 @@ class Union_ extends DataType { constructor(mode: UnionMode, typeIds: number[] | Int32Array, children: Field[]) { - super(); + super(Type.Union as T); this.mode = mode; this.children = children; this.typeIds = typeIds = Int32Array.from(typeIds); this.typeIdToChildIndex = typeIds.reduce((typeIdToChildIndex, typeId, idx) => (typeIdToChildIndex[typeId] = idx) && typeIdToChildIndex || typeIdToChildIndex, Object.create(null) as { [key: number]: number }); } - public get typeId() { return Type.Union as T; } public toString() { return `${this[Symbol.toStringTag]}<${this.children.map((x) => `${x.type}`).join(` | `) }>`; @@ -611,9 +601,8 @@ export interface FixedSizeBinary extends DataType { /** @ignore */ export class FixedSizeBinary extends DataType { constructor(public readonly byteWidth: number) { - super(); + super(Type.FixedSizeBinary); } - public get typeId() { return Type.FixedSizeBinary as Type.FixedSizeBinary; } public toString() { return `FixedSizeBinary[${this.byteWidth}]`; } protected static [Symbol.toStringTag] = ((proto: FixedSizeBinary) => { (proto).byteWidth = null; @@ -632,10 +621,9 @@ export interface FixedSizeList extends DataType extends DataType { public declare readonly children: Field[]; constructor(public readonly listSize: number, child: Field) { - super(); + super(Type.FixedSizeList); this.children = [child]; } - public get typeId() { return Type.FixedSizeList as Type.FixedSizeList; } public get valueType(): T { return this.children[0].type as T; } public get valueField(): Field { return this.children[0] as Field; } public get ArrayType(): T['ArrayType'] { return this.valueType.ArrayType; } @@ -657,7 +645,7 @@ export interface Map_ extends DataType }> { constructor(entries: Field>, keysSorted = false) { - super(); + super(Type.Map); this.children = [entries]; this.keysSorted = keysSorted; // ARROW-8716 @@ -678,7 +666,6 @@ export class Map_ ex } public declare readonly keysSorted: boolean; public declare readonly children: Field>[]; - public get typeId() { return Type.Map as Type.Map; } public get keyType(): TKey { return this.children[0].type.children[0].type as TKey; } public get valueType(): TValue { return this.children[0].type.children[1].type as TValue; } public get childType() { return this.children[0].type as Struct<{ key: TKey; value: TValue }>; } @@ -709,13 +696,12 @@ export class Dictionary ex public declare readonly dictionary: T; public declare readonly isOrdered: boolean; constructor(dictionary: T, indices: TKey, id?: bigint | number | null, isOrdered?: boolean | null) { - super(); + super(Type.Dictionary); this.indices = indices; this.dictionary = dictionary; this.isOrdered = isOrdered || false; this.id = id == null ? getId() : bigIntToNumber(id); } - public get typeId() { return Type.Dictionary as Type.Dictionary; } public get children() { return this.dictionary.children; } public get valueType(): T { return this.dictionary as T; } public get ArrayType(): T['ArrayType'] { return this.dictionary.ArrayType; } From 4d9a860196c2959c8595e117452ef5094ce7363c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 26 Dec 2023 11:09:10 +0900 Subject: [PATCH 16/31] MINOR: [C#] Bump xunit.runner.visualstudio from 2.5.5 to 2.5.6 in /csharp (#39369) Bumps [xunit.runner.visualstudio](https://github.com/xunit/visualstudio.xunit) from 2.5.5 to 2.5.6.
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=xunit.runner.visualstudio&package-manager=nuget&previous-version=2.5.5&new-version=2.5.6)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@ dependabot rebase` will rebase this PR - `@ dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@ dependabot merge` will merge this PR after your CI passes on it - `@ dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@ dependabot cancel merge` will cancel a previously requested merge and block automerging - `@ dependabot reopen` will reopen this PR if it is closed - `@ dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@ dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@ dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@ dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- .../Apache.Arrow.Compression.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Sql.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj | 2 +- csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index 94ef4b5f3c5f5..e3d86f0dd9992 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -9,7 +9,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index 46d0a59b5d8e1..4dd479545a74c 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -8,7 +8,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index 99c772770d6c6..114e76ad984f1 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -8,7 +8,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index fde30a90e6479..71f68fe2d49e3 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -16,7 +16,7 @@ - + all runtime; build; native; contentfiles; analyzers From 35db6f78a2e2b45e55109979c85649150d205326 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 26 Dec 2023 11:10:24 +0900 Subject: [PATCH 17/31] MINOR: [Java] Bump org.apache.maven.plugins:maven-surefire-plugin from 3.0.0-M7 to 3.2.3 in /java (#39372) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [org.apache.maven.plugins:maven-surefire-plugin](https://github.com/apache/maven-surefire) from 3.0.0-M7 to 3.2.3.
Release notes

Sourced from org.apache.maven.plugins:maven-surefire-plugin's releases.

  • 3.2.2: 🐛 Bug Fixes, 📦 Dependency updates, 🔧 Build
  • 3.2.1: 🚀 New features and improvements, 🐛 Bug Fixes, 📝 Documentation updates, 👻 Maintenance
  • 3.1.2: Release Notes - Maven Surefire - Version 3.1.2

... (truncated)

Commits
  • ac9e574 [maven-release-plugin] prepare release surefire-3.2.3
  • 2d6cbc6 [SUREFIRE-2220] SurefireForkChannel#getForkNodeConnectionString() returns inv...
  • 05322d9 [SUREFIRE-2212] OutOfMemoryError raised when parsing files with huge stderr/s...
  • 55ccd06 [SUREFIRE-2211] additionalClasspathElement with UNC path not working with Mav...
  • aa864f4 [SUREFIRE-2216] Upgrade plugins and components (in ITs)
  • 6662e07 [SUREFIRE-2215] Upgrade to Parent 41
  • f5b73ab [SUREFIRE-2214] Uprade to HtmlUnit 3.8.0
  • 47c5816 [SUREFIRE-2210] - Restore ordering of additional class path elements
  • 9b7ecf1 [maven-release-plugin] prepare for next development iteration
  • 2d76753 [maven-release-plugin] prepare release surefire-3.2.2
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.maven.plugins:maven-surefire-plugin&package-manager=maven&previous-version=3.0.0-M7&new-version=3.2.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/performance/pom.xml | 2 +- java/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/performance/pom.xml b/java/performance/pom.xml index a3e4da85b4321..888c0fb367932 100644 --- a/java/performance/pom.xml +++ b/java/performance/pom.xml @@ -211,7 +211,7 @@ maven-surefire-plugin - 3.0.0-M7 + 3.2.3 diff --git a/java/pom.xml b/java/pom.xml index 4cca5e7245f0f..27d1504016ee6 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -442,7 +442,7 @@ maven-surefire-plugin - 3.0.0-M7 + 3.2.3 org.junit.jupiter From 9126021e675e7e021a11a90a7ab7d67bd6529712 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 26 Dec 2023 11:10:49 +0900 Subject: [PATCH 18/31] MINOR: [Java] Bump org.apache.maven.plugins:maven-resources-plugin from 2.6 to 3.3.1 in /java (#39373) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [org.apache.maven.plugins:maven-resources-plugin](https://github.com/apache/maven-resources-plugin) from 2.6 to 3.3.1.
Release notes

Sourced from org.apache.maven.plugins:maven-resources-plugin's releases.

  • 3.3.1: 🚨 Removed, 📦 Dependency updates, 📝 Documentation updates (doc: adds alternative variable syntax using @ delimiters to the documentation (#36) @kevin0x90)
  • 3.3.0: 📦 Dependency updates, 📝 Documentation updates, 👻 Maintenance
  • 3.2.0: What's Changed, New Contributors

... (truncated)

Commits
  • 978ce1e [maven-release-plugin] prepare release maven-resources-plugin-3.3.1
  • b7cd080 [MRESOURCES-296] Upgrade to maven-filtering 3.3.1
  • 1c9f610 [MRESOURCES-288] Make tests-jar reproducible (#56)
  • 1946127 [MRESOURCES-293] Rollback
  • f7a6f22 [MRESOURCES-297] Update to parent POM 39, reformat (#55)
  • 22d64ca remove specific IDE m2e files (#40)
  • 02c2d01 [MRESOURCES-293] Make resources param not read-only (#54)
  • 6bb3e1f [MRESOURCES-295] Drop Plexus legacy code (#53)
  • df7e172 [MRESOURCES-294] Upgrade plexus-utils to 3.5.1
  • 9354ecd Bump apache/maven-gh-actions-shared from 2 to 3
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.apache.maven.plugins:maven-resources-plugin&package-manager=maven&previous-version=2.6&new-version=3.3.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/performance/pom.xml | 2 +- java/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/performance/pom.xml b/java/performance/pom.xml index 888c0fb367932..4d449af46b6b1 100644 --- a/java/performance/pom.xml +++ b/java/performance/pom.xml @@ -199,7 +199,7 @@
maven-resources-plugin - 2.6 + 3.3.1 maven-site-plugin diff --git a/java/pom.xml b/java/pom.xml index 27d1504016ee6..1776407e3d030 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -412,7 +412,7 @@ org.apache.maven.plugins maven-resources-plugin - 2.6 + 3.3.1 org.apache.maven.plugins From 6bb77464940bf97dbd042bbf1c6048439f4c0695 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 26 Dec 2023 14:23:37 +0900 Subject: [PATCH 19/31] MINOR: [C#] Bump xunit from 2.6.3 to 2.6.4 in /csharp (#39370) Bumps [xunit](https://github.com/xunit/xunit) from 2.6.3 to 2.6.4.
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=xunit&package-manager=nuget&previous-version=2.6.3&new-version=2.6.4)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- .../Apache.Arrow.Compression.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Sql.Tests.csproj | 2 +- .../Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj | 2 +- csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index e3d86f0dd9992..dd2c75dd3df90 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -8,7 +8,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index 4dd479545a74c..0e9c02d61977c 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -7,7 +7,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index 114e76ad984f1..d38413ba45b3a 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -7,7 +7,7 @@ - + diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index 71f68fe2d49e3..0afd1490e7b69 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -15,7 +15,7 @@ - + all runtime; build; native; contentfiles; analyzers From 526b2eb298292849b133f9ddae7facdf8ee1d35f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 26 Dec 2023 14:24:09 +0900 Subject: [PATCH 20/31] MINOR: [Java] Bump org.assertj:assertj-core from 3.23.1 to 3.24.2 in /java (#39375) Bumps org.assertj:assertj-core from 3.23.1 to 3.24.2. [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=org.assertj:assertj-core&package-manager=maven&previous-version=3.23.1&new-version=3.24.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@ dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Sutou Kouhei --- java/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/pom.xml b/java/pom.xml index 1776407e3d030..523e5642720cd 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -609,7 +609,7 @@ org.assertj assertj-core - 3.23.1 + 3.24.2 test From b32f71a157eb90a7eb107c540b9cadd343e5e388 Mon Sep 17 00:00:00 2001 From: Hyunseok Seo Date: Tue, 26 Dec 2023 15:25:51 +0900 Subject: [PATCH 21/31] GH-39363: [C++] Use Cast() instead of CastTo() for Parquet (#39364) ### Rationale for this change Remove legacy code ### What changes are included in this PR? Replace the legacy scalar CastTo implementation for Parquet. ### Are these changes tested? Yes. It is passed by existing all test cases for Parquet. ### Are there any user-facing changes? Maybe, Yes. There is a dependency on the Parquet schema that the user handles. There may be a problem if the user has to deal with a type for which Casting is not implemented. However, in this case, it should be treated as a new issue with an implementation that improves the `Cast` compute kernel. * Closes: #39363 Authored-by: Hyunseok Seo Signed-off-by: mwish --- cpp/src/arrow/dataset/file_parquet.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc index 3afe4ec85cf49..1c2fd2dea6307 100644 --- a/cpp/src/arrow/dataset/file_parquet.cc +++ b/cpp/src/arrow/dataset/file_parquet.cc @@ -24,6 +24,7 @@ #include #include +#include "arrow/compute/cast.h" #include "arrow/compute/exec.h" #include "arrow/dataset/dataset_internal.h" #include "arrow/dataset/parquet_encryption_config.h" @@ -58,6 +59,8 @@ using parquet::arrow::SchemaField; using parquet::arrow::SchemaManifest; using parquet::arrow::StatisticsAsScalars; +using compute::Cast; + namespace { parquet::ReaderProperties MakeReaderProperties( @@ -370,12 +373,12 @@ std::optional ParquetFileFragment::EvaluateStatisticsAsExpr return std::nullopt; } - auto maybe_min = min->CastTo(field.type()); - auto maybe_max = max->CastTo(field.type()); + auto maybe_min = Cast(min, field.type()); + auto maybe_max = Cast(max, field.type()); if (maybe_min.ok() && maybe_max.ok()) { - min = maybe_min.MoveValueUnsafe(); - max = maybe_max.MoveValueUnsafe(); + min = maybe_min.MoveValueUnsafe().scalar(); + max = maybe_max.MoveValueUnsafe().scalar(); if (min->Equals(*max)) { auto single_value = compute::equal(field_expr, compute::literal(std::move(min))); From cf44793204d88e0156669af102ff65f180a6b003 Mon Sep 17 00:00:00 2001 From: "Rossi(Ruoxi) Sun" Date: Tue, 26 Dec 2023 09:14:32 -0800 Subject: [PATCH 22/31] GH-39357: [C++] Reduce function.h includes (#39312) ### Rationale for this change As proposed in #36246 , by splitting function option structs from `function.h`, we can reduce the including of `function.h`. So that the total build time could be reduced. The total parser time could be reduced from 722.3s to 709.7s. And the `function.h` along with its transitive inclusion of `kernel.h` don't show up in expensive headers any more. The detailed analysis result before and after this PR are attached: [analyze-before.txt](https://github.com/apache/arrow/files/13756923/analyze-before.txt) [analyze-after.txt](https://github.com/apache/arrow/files/13756924/analyze-after.txt) Disclaimer (quote from https://github.com/apache/arrow/issues/36246#issuecomment-1866974963): > Note that the time diff is not absolute. 
The ClangBuildAnalyzer result differs from time to time. I guess it depends on the idle-ness of the building machine when doing the experiment. But the time reduction is almost certain, though sometimes more sometimes less. And the inclusion times of the questioning headers are reduced for sure, as shown in the attachments in my other comment. ### What changes are included in this PR? Move function option structs into own `compute/options.h`, and change including `function.h` to including `options.h` wherever fits. ### Are these changes tested? Build is testing. ### Are there any user-facing changes? There could be potential build failures for user code (quote from https://github.com/apache/arrow/issues/36246#issuecomment-1866980969): > The header function.h remains in compute/api.h, with and without this PR. The proposed PR removes function.h from api_xxx.h (then includes options.h instead), as proposed in the initial description of this issue. This results in compile failures for user code which includes only compute/api_xxx.h but not compute/api.h, and meanwhile uses CallFunction which is declared in function.h. But I think it's OK as described in https://github.com/apache/arrow/issues/36246#issuecomment-1867018578. * Closes: #39357 Authored-by: zanmato Signed-off-by: Felipe Oliveira Carvalho --- .../arrow/compute_and_write_csv_example.cc | 2 +- cpp/src/arrow/acero/aggregate_internal.cc | 1 + cpp/src/arrow/acero/scalar_aggregate_node.cc | 1 + cpp/src/arrow/compute/api.h | 21 +++-- cpp/src/arrow/compute/api_aggregate.h | 2 +- cpp/src/arrow/compute/api_scalar.h | 2 +- cpp/src/arrow/compute/api_vector.h | 3 +- cpp/src/arrow/compute/cast.h | 1 + cpp/src/arrow/compute/function.cc | 1 + cpp/src/arrow/compute/function.h | 46 +---------- cpp/src/arrow/compute/function_options.h | 81 +++++++++++++++++++ .../kernels/scalar_if_else_benchmark.cc | 1 + cpp/src/arrow/compute/kernels/vector_rank.cc | 1 + .../kernels/vector_replace_benchmark.cc | 1 + .../kernels/vector_run_end_encode_test.cc | 1 + .../arrow/compute/kernels/vector_select_k.cc | 1 + cpp/src/arrow/compute/kernels/vector_sort.cc | 1 + cpp/src/arrow/compute/registry_test.cc | 1 + cpp/src/arrow/compute/type_fwd.h | 1 + 19 files changed, 111 insertions(+), 58 deletions(-) create mode 100644 cpp/src/arrow/compute/function_options.h diff --git a/cpp/examples/arrow/compute_and_write_csv_example.cc b/cpp/examples/arrow/compute_and_write_csv_example.cc index edf21e45b2bb7..7e0f6cdf1ce16 100644 --- a/cpp/examples/arrow/compute_and_write_csv_example.cc +++ b/cpp/examples/arrow/compute_and_write_csv_example.cc @@ -16,7 +16,7 @@ // under the License. 
#include -#include +#include #include #include #include diff --git a/cpp/src/arrow/acero/aggregate_internal.cc b/cpp/src/arrow/acero/aggregate_internal.cc index 3cd5491720dcd..9c4b7fe5ae98c 100644 --- a/cpp/src/arrow/acero/aggregate_internal.cc +++ b/cpp/src/arrow/acero/aggregate_internal.cc @@ -25,6 +25,7 @@ #include "arrow/acero/exec_plan.h" #include "arrow/acero/options.h" #include "arrow/compute/exec.h" +#include "arrow/compute/function.h" #include "arrow/compute/registry.h" #include "arrow/compute/row/grouper.h" #include "arrow/datum.h" diff --git a/cpp/src/arrow/acero/scalar_aggregate_node.cc b/cpp/src/arrow/acero/scalar_aggregate_node.cc index ae59aa692096a..c7805f4d24eb2 100644 --- a/cpp/src/arrow/acero/scalar_aggregate_node.cc +++ b/cpp/src/arrow/acero/scalar_aggregate_node.cc @@ -25,6 +25,7 @@ #include "arrow/acero/options.h" #include "arrow/acero/util.h" #include "arrow/compute/exec.h" +#include "arrow/compute/function.h" #include "arrow/compute/registry.h" #include "arrow/compute/row/grouper.h" #include "arrow/datum.h" diff --git a/cpp/src/arrow/compute/api.h b/cpp/src/arrow/compute/api.h index 5b5dfdf69eb94..b701d9928691f 100644 --- a/cpp/src/arrow/compute/api.h +++ b/cpp/src/arrow/compute/api.h @@ -20,18 +20,23 @@ #pragma once +/// \defgroup compute-functions Abstract compute function API +/// @{ +/// @} + /// \defgroup compute-concrete-options Concrete option classes for compute functions /// @{ /// @} -#include "arrow/compute/api_aggregate.h" // IWYU pragma: export -#include "arrow/compute/api_scalar.h" // IWYU pragma: export -#include "arrow/compute/api_vector.h" // IWYU pragma: export -#include "arrow/compute/cast.h" // IWYU pragma: export -#include "arrow/compute/function.h" // IWYU pragma: export -#include "arrow/compute/kernel.h" // IWYU pragma: export -#include "arrow/compute/registry.h" // IWYU pragma: export -#include "arrow/datum.h" // IWYU pragma: export +#include "arrow/compute/api_aggregate.h" // IWYU pragma: export +#include "arrow/compute/api_scalar.h" // IWYU pragma: export +#include "arrow/compute/api_vector.h" // IWYU pragma: export +#include "arrow/compute/cast.h" // IWYU pragma: export +#include "arrow/compute/function.h" // IWYU pragma: export +#include "arrow/compute/function_options.h" // IWYU pragma: export +#include "arrow/compute/kernel.h" // IWYU pragma: export +#include "arrow/compute/registry.h" // IWYU pragma: export +#include "arrow/datum.h" // IWYU pragma: export #include "arrow/compute/expression.h" // IWYU pragma: export diff --git a/cpp/src/arrow/compute/api_aggregate.h b/cpp/src/arrow/compute/api_aggregate.h index 3493c3146310d..4d2c814a69bbb 100644 --- a/cpp/src/arrow/compute/api_aggregate.h +++ b/cpp/src/arrow/compute/api_aggregate.h @@ -22,7 +22,7 @@ #include -#include "arrow/compute/function.h" +#include "arrow/compute/function_options.h" #include "arrow/datum.h" #include "arrow/result.h" #include "arrow/util/macros.h" diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 9f12471ddca14..26fbe64f74293 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -24,7 +24,7 @@ #include #include -#include "arrow/compute/function.h" +#include "arrow/compute/function_options.h" #include "arrow/compute/type_fwd.h" #include "arrow/datum.h" #include "arrow/result.h" diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 0233090ef6fb9..759f9e5c1a408 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ 
-20,9 +20,8 @@ #include #include -#include "arrow/compute/function.h" +#include "arrow/compute/function_options.h" #include "arrow/compute/ordering.h" -#include "arrow/datum.h" #include "arrow/result.h" #include "arrow/type_fwd.h" diff --git a/cpp/src/arrow/compute/cast.h b/cpp/src/arrow/compute/cast.h index 613e8a55addd2..18e56092dda2a 100644 --- a/cpp/src/arrow/compute/cast.h +++ b/cpp/src/arrow/compute/cast.h @@ -22,6 +22,7 @@ #include #include "arrow/compute/function.h" +#include "arrow/compute/function_options.h" #include "arrow/compute/type_fwd.h" #include "arrow/result.h" #include "arrow/status.h" diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc index c0433145dd1d0..e1a2e8c5d8879 100644 --- a/cpp/src/arrow/compute/function.cc +++ b/cpp/src/arrow/compute/function.cc @@ -26,6 +26,7 @@ #include "arrow/compute/exec.h" #include "arrow/compute/exec_internal.h" #include "arrow/compute/function_internal.h" +#include "arrow/compute/function_options.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/compute/registry.h" #include "arrow/datum.h" diff --git a/cpp/src/arrow/compute/function.h b/cpp/src/arrow/compute/function.h index 333c9a65c56c4..be934a3c5abfc 100644 --- a/cpp/src/arrow/compute/function.h +++ b/cpp/src/arrow/compute/function.h @@ -36,53 +36,9 @@ namespace arrow { namespace compute { -/// \defgroup compute-functions Abstract compute function API -/// +/// \addtogroup compute-functions /// @{ -/// \brief Extension point for defining options outside libarrow (but -/// still within this project). -class ARROW_EXPORT FunctionOptionsType { - public: - virtual ~FunctionOptionsType() = default; - - virtual const char* type_name() const = 0; - virtual std::string Stringify(const FunctionOptions&) const = 0; - virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0; - virtual Result> Serialize(const FunctionOptions&) const; - virtual Result> Deserialize( - const Buffer& buffer) const; - virtual std::unique_ptr Copy(const FunctionOptions&) const = 0; -}; - -/// \brief Base class for specifying options configuring a function's behavior, -/// such as error handling. -class ARROW_EXPORT FunctionOptions : public util::EqualityComparable { - public: - virtual ~FunctionOptions() = default; - - const FunctionOptionsType* options_type() const { return options_type_; } - const char* type_name() const { return options_type()->type_name(); } - - bool Equals(const FunctionOptions& other) const; - std::string ToString() const; - std::unique_ptr Copy() const; - /// \brief Serialize an options struct to a buffer. - Result> Serialize() const; - /// \brief Deserialize an options struct from a buffer. - /// Note: this will only look for `type_name` in the default FunctionRegistry; - /// to use a custom FunctionRegistry, look up the FunctionOptionsType, then - /// call FunctionOptionsType::Deserialize(). - static Result> Deserialize( - const std::string& type_name, const Buffer& buffer); - - protected: - explicit FunctionOptions(const FunctionOptionsType* type) : options_type_(type) {} - const FunctionOptionsType* options_type_; -}; - -ARROW_EXPORT void PrintTo(const FunctionOptions&, std::ostream*); - /// \brief Contains the number of required arguments for the function. /// /// Naming conventions taken from https://en.wikipedia.org/wiki/Arity. 
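// ---------------------------------------------------------------------------
// Illustrative sketch, separate from the diff above, of the downstream effect
// described in the commit message: the api_*.h headers now pull in
// function_options.h instead of function.h, so user code that calls
// CallFunction() while including only an api_*.h header has to include
// function.h (or compute/api.h, which still provides it) itself. The file and
// function names below are hypothetical; CallFunction, Datum and
// ScalarAggregateOptions are the existing Arrow C++ compute APIs.
#include "arrow/compute/api_aggregate.h"  // option structs such as ScalarAggregateOptions
#include "arrow/compute/function.h"       // declares CallFunction(), per the quote above
#include "arrow/datum.h"

arrow::Result<arrow::Datum> SumOf(const arrow::Datum& input) {
  arrow::compute::ScalarAggregateOptions options(/*skip_nulls=*/true);
  return arrow::compute::CallFunction("sum", {input}, &options);
}
// ---------------------------------------------------------------------------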
diff --git a/cpp/src/arrow/compute/function_options.h b/cpp/src/arrow/compute/function_options.h new file mode 100644 index 0000000000000..88ec2fd2d0679 --- /dev/null +++ b/cpp/src/arrow/compute/function_options.h @@ -0,0 +1,81 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// NOTE: API is EXPERIMENTAL and will change without going through a +// deprecation cycle. + +#pragma once + +#include "arrow/compute/type_fwd.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace compute { + +/// \addtogroup compute-functions +/// @{ + +/// \brief Extension point for defining options outside libarrow (but +/// still within this project). +class ARROW_EXPORT FunctionOptionsType { + public: + virtual ~FunctionOptionsType() = default; + + virtual const char* type_name() const = 0; + virtual std::string Stringify(const FunctionOptions&) const = 0; + virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0; + virtual Result> Serialize(const FunctionOptions&) const; + virtual Result> Deserialize( + const Buffer& buffer) const; + virtual std::unique_ptr Copy(const FunctionOptions&) const = 0; +}; + +/// \brief Base class for specifying options configuring a function's behavior, +/// such as error handling. +class ARROW_EXPORT FunctionOptions : public util::EqualityComparable { + public: + virtual ~FunctionOptions() = default; + + const FunctionOptionsType* options_type() const { return options_type_; } + const char* type_name() const { return options_type()->type_name(); } + + bool Equals(const FunctionOptions& other) const; + std::string ToString() const; + std::unique_ptr Copy() const; + /// \brief Serialize an options struct to a buffer. + Result> Serialize() const; + /// \brief Deserialize an options struct from a buffer. + /// Note: this will only look for `type_name` in the default FunctionRegistry; + /// to use a custom FunctionRegistry, look up the FunctionOptionsType, then + /// call FunctionOptionsType::Deserialize(). 
+ static Result> Deserialize( + const std::string& type_name, const Buffer& buffer); + + protected: + explicit FunctionOptions(const FunctionOptionsType* type) : options_type_(type) {} + const FunctionOptionsType* options_type_; +}; + +ARROW_EXPORT void PrintTo(const FunctionOptions&, std::ostream*); + +/// @} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc index b72402bbccd4e..58bc560f52842 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc @@ -21,6 +21,7 @@ #include "arrow/array/concatenate.h" #include "arrow/array/util.h" #include "arrow/compute/api_scalar.h" +#include "arrow/compute/function.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/util/key_value_metadata.h" diff --git a/cpp/src/arrow/compute/kernels/vector_rank.cc b/cpp/src/arrow/compute/kernels/vector_rank.cc index 780ae25d96360..0cea7246e516c 100644 --- a/cpp/src/arrow/compute/kernels/vector_rank.cc +++ b/cpp/src/arrow/compute/kernels/vector_rank.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +#include "arrow/compute/function.h" #include "arrow/compute/kernels/vector_sort_internal.h" #include "arrow/compute/registry.h" diff --git a/cpp/src/arrow/compute/kernels/vector_replace_benchmark.cc b/cpp/src/arrow/compute/kernels/vector_replace_benchmark.cc index 719969d46ea7c..971a841de0773 100644 --- a/cpp/src/arrow/compute/kernels/vector_replace_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/vector_replace_benchmark.cc @@ -18,6 +18,7 @@ #include #include "arrow/array.h" +#include "arrow/datum.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" diff --git a/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc b/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc index 0bd8e3386e7cc..f02aee1b35996 100644 --- a/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc @@ -21,6 +21,7 @@ #include "arrow/array/validate.h" #include "arrow/builder.h" #include "arrow/compute/api_vector.h" +#include "arrow/datum.h" #include "arrow/testing/gtest_util.h" #include "arrow/type_fwd.h" #include "arrow/util/logging.h" diff --git a/cpp/src/arrow/compute/kernels/vector_select_k.cc b/cpp/src/arrow/compute/kernels/vector_select_k.cc index 5000de8996280..1740a9b7f0bb4 100644 --- a/cpp/src/arrow/compute/kernels/vector_select_k.cc +++ b/cpp/src/arrow/compute/kernels/vector_select_k.cc @@ -17,6 +17,7 @@ #include +#include "arrow/compute/function.h" #include "arrow/compute/kernels/vector_sort_internal.h" #include "arrow/compute/registry.h" diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index 8ddcbb9905cb2..e08a2bc10372f 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -17,6 +17,7 @@ #include +#include "arrow/compute/function.h" #include "arrow/compute/kernels/vector_sort_internal.h" #include "arrow/compute/registry.h" diff --git a/cpp/src/arrow/compute/registry_test.cc b/cpp/src/arrow/compute/registry_test.cc index 7fee136de7a0b..2d69f119df1f4 100644 --- a/cpp/src/arrow/compute/registry_test.cc +++ b/cpp/src/arrow/compute/registry_test.cc @@ -22,6 +22,7 @@ #include #include "arrow/compute/function.h" +#include 
"arrow/compute/function_options.h" #include "arrow/compute/registry.h" #include "arrow/result.h" #include "arrow/status.h" diff --git a/cpp/src/arrow/compute/type_fwd.h b/cpp/src/arrow/compute/type_fwd.h index 3f990b1814311..89f32ceb0f906 100644 --- a/cpp/src/arrow/compute/type_fwd.h +++ b/cpp/src/arrow/compute/type_fwd.h @@ -27,6 +27,7 @@ struct TypeHolder; namespace compute { class Function; +class ScalarAggregateFunction; class FunctionExecutor; class FunctionOptions; class FunctionRegistry; From ae627c09b08dbd9b4faac545170f4706645ca4ce Mon Sep 17 00:00:00 2001 From: Dominik Moritz Date: Wed, 27 Dec 2023 15:06:23 +0100 Subject: [PATCH 23/31] GH-39251: [JS] Use resizable buffer in builder (#39252) --- js/src/builder.ts | 2 +- js/src/builder/binary.ts | 4 ++-- js/src/builder/buffer.ts | 44 +++++++++++++++++++++++++------------ js/src/builder/largeutf8.ts | 2 +- js/src/builder/union.ts | 4 ++-- js/src/builder/utf8.ts | 2 +- 6 files changed, 37 insertions(+), 21 deletions(-) diff --git a/js/src/builder.ts b/js/src/builder.ts index a4e2d4d89325c..1880db3818ca5 100644 --- a/js/src/builder.ts +++ b/js/src/builder.ts @@ -342,7 +342,7 @@ export abstract class Builder { export abstract class FixedWidthBuilder extends Builder { constructor(opts: BuilderOptions) { super(opts); - this._values = new DataBufferBuilder(new this.ArrayType(0), this.stride); + this._values = new DataBufferBuilder(this.ArrayType, 0, this.stride); } public setValue(index: number, value: T['TValue']) { const values = this._values; diff --git a/js/src/builder/binary.ts b/js/src/builder/binary.ts index 3c12ddf34abb0..fa9a11b24ec39 100644 --- a/js/src/builder/binary.ts +++ b/js/src/builder/binary.ts @@ -16,15 +16,15 @@ // under the License. import { Binary } from '../type.js'; -import { toUint8Array } from '../util/buffer.js'; import { BufferBuilder } from './buffer.js'; import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; +import { toUint8Array } from '../util/buffer.js'; /** @ignore */ export class BinaryBuilder extends VariableWidthBuilder { constructor(opts: BuilderOptions) { super(opts); - this._values = new BufferBuilder(new Uint8Array(0)); + this._values = new BufferBuilder(Uint8Array); } public get byteLength(): number { let size = this._pendingLength + (this.length * 4); diff --git a/js/src/builder/buffer.ts b/js/src/builder/buffer.ts index 402172059682c..18c6dcda738b9 100644 --- a/js/src/builder/buffer.ts +++ b/js/src/builder/buffer.ts @@ -24,20 +24,36 @@ function roundLengthUpToNearest64Bytes(len: number, BPE: number) { const bytesMinus1 = Math.ceil(len) * BPE - 1; return ((bytesMinus1 - bytesMinus1 % 64 + 64) || 64) / BPE; } + /** @ignore */ -const sliceOrExtendArray = (arr: T, len = 0) => ( - arr.length >= len ? arr.subarray(0, len) : memcpy(new (arr.constructor as any)(len), arr, 0) -) as T; +function resizeArray(arr: T, len = 0): T { + // TODO: remove when https://github.com/microsoft/TypeScript/issues/54636 is fixed + const buffer = arr.buffer as ArrayBufferLike & { resizable: boolean; resize: (byteLength: number) => void; maxByteLength: number }; + const byteLength = len * arr.BYTES_PER_ELEMENT; + if (buffer.resizable && byteLength <= buffer.maxByteLength) { + buffer.resize(byteLength); + return arr; + } + + // Fallback for non-resizable buffers + return arr.length >= len ? 
+ arr.subarray(0, len) as T : + memcpy(new (arr.constructor as any)(len), arr, 0); +} + +/** @ignore */ +export const SAFE_ARRAY_SIZE = 2 ** 32 - 1; /** @ignore */ export class BufferBuilder { - constructor(buffer: T, stride = 1) { - this.buffer = buffer; + constructor(bufferType: ArrayCtor, initialSize = 0, stride = 1) { + this.length = Math.ceil(initialSize / stride); + // TODO: remove as any when https://github.com/microsoft/TypeScript/issues/54636 is fixed + this.buffer = new bufferType(new (ArrayBuffer as any)(this.length * bufferType.BYTES_PER_ELEMENT, { maxByteLength: SAFE_ARRAY_SIZE })) as T; this.stride = stride; - this.BYTES_PER_ELEMENT = buffer.BYTES_PER_ELEMENT; - this.ArrayType = buffer.constructor as ArrayCtor; - this._resize(this.length = Math.ceil(buffer.length / stride)); + this.BYTES_PER_ELEMENT = bufferType.BYTES_PER_ELEMENT; + this.ArrayType = bufferType; } public buffer: T; @@ -72,17 +88,18 @@ export class BufferBuilder { } public flush(length = this.length) { length = roundLengthUpToNearest64Bytes(length * this.stride, this.BYTES_PER_ELEMENT); - const array = sliceOrExtendArray(this.buffer, length); + const array = resizeArray(this.buffer, length); this.clear(); return array; } public clear() { this.length = 0; - this._resize(0); + // TODO: remove as any when https://github.com/microsoft/TypeScript/issues/54636 is fixed + this.buffer = new this.ArrayType(new (ArrayBuffer as any)(0, { maxByteLength: SAFE_ARRAY_SIZE })) as T; return this; } protected _resize(newLength: number) { - return this.buffer = memcpy(new this.ArrayType(newLength), this.buffer); + return this.buffer = resizeArray(this.buffer, newLength); } } @@ -100,7 +117,7 @@ export class DataBufferBuilder extends Buffe /** @ignore */ export class BitmapBufferBuilder extends DataBufferBuilder { - constructor(data = new Uint8Array(0)) { super(data, 1 / 8); } + constructor() { super(Uint8Array, 0, 1 / 8); } public numValid = 0; public get numInvalid() { return this.length - this.numValid; } @@ -123,9 +140,8 @@ export class BitmapBufferBuilder extends DataBufferBuilder { /** @ignore */ export class OffsetsBufferBuilder extends DataBufferBuilder { constructor(type: T) { - super(new type.OffsetArrayType(1), 1); + super(type.OffsetArrayType as ArrayCtor, 1, 1); } - public append(value: T['TOffsetArray'][0]) { return this.set(this.length - 1, value); } diff --git a/js/src/builder/largeutf8.ts b/js/src/builder/largeutf8.ts index 51890100095c1..90a0bde9f3443 100644 --- a/js/src/builder/largeutf8.ts +++ b/js/src/builder/largeutf8.ts @@ -25,7 +25,7 @@ import { LargeBinaryBuilder } from './largebinary.js'; export class LargeUtf8Builder extends VariableWidthBuilder { constructor(opts: BuilderOptions) { super(opts); - this._values = new BufferBuilder(new Uint8Array(0)); + this._values = new BufferBuilder(Uint8Array); } public get byteLength(): number { let size = this._pendingLength + (this.length * 4); diff --git a/js/src/builder/union.ts b/js/src/builder/union.ts index ac8a13191a549..7bee460a77de1 100644 --- a/js/src/builder/union.ts +++ b/js/src/builder/union.ts @@ -31,7 +31,7 @@ export abstract class UnionBuilder extends Builder constructor(options: UnionBuilderOptions) { super(options); - this._typeIds = new DataBufferBuilder(new Int8Array(0), 1); + this._typeIds = new DataBufferBuilder(Int8Array, 0, 1); if (typeof options['valueToChildTypeId'] === 'function') { this._valueToChildTypeId = options['valueToChildTypeId']; } @@ -84,7 +84,7 @@ export class DenseUnionBuilder extends UnionB constructor(options: 
UnionBuilderOptions) { super(options); - this._offsets = new DataBufferBuilder(new Int32Array(0)); + this._offsets = new DataBufferBuilder(Int32Array); } /** @ignore */ diff --git a/js/src/builder/utf8.ts b/js/src/builder/utf8.ts index 53b8306cbaffd..aac0aec54fe90 100644 --- a/js/src/builder/utf8.ts +++ b/js/src/builder/utf8.ts @@ -25,7 +25,7 @@ import { VariableWidthBuilder, BuilderOptions } from '../builder.js'; export class Utf8Builder extends VariableWidthBuilder { constructor(opts: BuilderOptions) { super(opts); - this._values = new BufferBuilder(new Uint8Array(0)); + this._values = new BufferBuilder(Uint8Array); } public get byteLength(): number { let size = this._pendingLength + (this.length * 4); From 9e33d12f1b022c902cc831026ceb3e0016ca4b3c Mon Sep 17 00:00:00 2001 From: Curt Hagenlocher Date: Wed, 27 Dec 2023 10:10:46 -0800 Subject: [PATCH 24/31] GH-39341: [C#] Support Utf8View, BinaryView and ListView (#39342) ### What changes are included in this PR? Support for reading, writing and representing Utf8View, BinaryView and ListView. ### Are these changes tested? Yes ### Are there any user-facing changes? New classes and APIs for Utf8View, BinaryView and ListView. * Closes: #39341 Authored-by: Curt Hagenlocher Signed-off-by: Curt Hagenlocher --- .../Arrays/ArrayDataConcatenator.cs | 102 +++++- .../Arrays/ArrowArrayBuilderFactory.cs | 6 + .../Apache.Arrow/Arrays/ArrowArrayFactory.cs | 6 + .../Apache.Arrow/Arrays/BinaryViewArray.cs | 344 ++++++++++++++++++ .../src/Apache.Arrow/Arrays/ListViewArray.cs | 217 +++++++++++ .../Apache.Arrow/Arrays/StringViewArray.cs | 110 ++++++ .../src/Apache.Arrow/C/CArrowArrayExporter.cs | 22 +- .../src/Apache.Arrow/C/CArrowArrayImporter.cs | 48 +++ .../Apache.Arrow/C/CArrowSchemaExporter.cs | 3 + .../Apache.Arrow/C/CArrowSchemaImporter.cs | 6 +- .../Extensions/ArrayDataExtensions.cs | 11 + .../Extensions/FlatbufExtensions.cs | 19 - csharp/src/Apache.Arrow/Flatbuf/BinaryView.cs | 47 +++ .../Flatbuf/Enums/MetadataVersion.cs | 12 +- csharp/src/Apache.Arrow/Flatbuf/Enums/Type.cs | 16 + csharp/src/Apache.Arrow/Flatbuf/Field.cs | 4 + .../src/Apache.Arrow/Flatbuf/LargeListView.cs | 42 +++ csharp/src/Apache.Arrow/Flatbuf/ListView.cs | 43 +++ .../src/Apache.Arrow/Flatbuf/RecordBatch.cs | 37 +- .../src/Apache.Arrow/Flatbuf/SparseTensor.cs | 4 + csharp/src/Apache.Arrow/Flatbuf/Tensor.cs | 4 + csharp/src/Apache.Arrow/Flatbuf/Utf8View.cs | 47 +++ .../Ipc/ArrowReaderImplementation.cs | 79 ++-- .../src/Apache.Arrow/Ipc/ArrowStreamWriter.cs | 57 ++- .../Ipc/ArrowTypeFlatbufferBuilder.cs | 39 ++ .../src/Apache.Arrow/Ipc/MessageSerializer.cs | 10 + csharp/src/Apache.Arrow/Scalars/BinaryView.cs | 111 ++++++ .../src/Apache.Arrow/Types/BinaryViewType.cs | 28 ++ csharp/src/Apache.Arrow/Types/IArrowType.cs | 3 + csharp/src/Apache.Arrow/Types/ListViewType.cs | 35 ++ .../src/Apache.Arrow/Types/StringViewType.cs | 28 ++ .../ArrowWriterBenchmark.cs | 2 +- .../Apache.Arrow.IntegrationTest/JsonFile.cs | 156 +++++++- .../Properties/launchSettings.json | 8 + .../ArrowArrayConcatenatorTests.cs | 89 +++++ .../Apache.Arrow.Tests/ArrowReaderVerifier.cs | 61 ++++ .../Apache.Arrow.Tests/BinaryViewTests.cs | 89 +++++ .../CDataInterfacePythonTests.cs | 4 +- csharp/test/Apache.Arrow.Tests/TableTests.cs | 6 +- csharp/test/Apache.Arrow.Tests/TestData.cs | 198 ++++++++-- dev/archery/archery/integration/datagen.py | 3 +- docs/source/status.rst | 10 +- 42 files changed, 2017 insertions(+), 149 deletions(-) create mode 100644 csharp/src/Apache.Arrow/Arrays/BinaryViewArray.cs create mode 100644 
csharp/src/Apache.Arrow/Arrays/ListViewArray.cs create mode 100644 csharp/src/Apache.Arrow/Arrays/StringViewArray.cs create mode 100644 csharp/src/Apache.Arrow/Flatbuf/BinaryView.cs create mode 100644 csharp/src/Apache.Arrow/Flatbuf/LargeListView.cs create mode 100644 csharp/src/Apache.Arrow/Flatbuf/ListView.cs create mode 100644 csharp/src/Apache.Arrow/Flatbuf/Utf8View.cs create mode 100644 csharp/src/Apache.Arrow/Scalars/BinaryView.cs create mode 100644 csharp/src/Apache.Arrow/Types/BinaryViewType.cs create mode 100644 csharp/src/Apache.Arrow/Types/ListViewType.cs create mode 100644 csharp/src/Apache.Arrow/Types/StringViewType.cs create mode 100644 csharp/test/Apache.Arrow.IntegrationTest/Properties/launchSettings.json create mode 100644 csharp/test/Apache.Arrow.Tests/BinaryViewTests.cs diff --git a/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs b/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs index 463ca49e29c94..698d74e4bac84 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs @@ -14,6 +14,7 @@ // limitations under the License. using Apache.Arrow.Memory; +using Apache.Arrow.Scalars; using Apache.Arrow.Types; using System; using System.Collections.Generic; @@ -46,8 +47,11 @@ private class ArrayDataConcatenationVisitor : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -84,17 +88,50 @@ public void Visit(FixedWidthType type) { CheckData(type, 2); ArrowBuffer validityBuffer = ConcatenateValidityBuffer(); - ArrowBuffer valueBuffer = ConcatenateFixedWidthTypeValueBuffer(type); + ArrowBuffer valueBuffer = ConcatenateFixedWidthTypeValueBuffer(1, type); Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, valueBuffer }); } public void Visit(BinaryType type) => ConcatenateVariableBinaryArrayData(type); + public void Visit(BinaryViewType type) => ConcatenateBinaryViewArrayData(type); + public void Visit(StringType type) => ConcatenateVariableBinaryArrayData(type); + public void Visit(StringViewType type) => ConcatenateBinaryViewArrayData(type); + public void Visit(ListType type) => ConcatenateLists(type); + public void Visit(ListViewType type) + { + CheckData(type, 3); + ArrowBuffer validityBuffer = ConcatenateValidityBuffer(); + + var offsetsBuilder = new ArrowBuffer.Builder(_totalLength); + int baseOffset = 0; + + foreach (ArrayData arrayData in _arrayDataList) + { + if (arrayData.Length > 0) + { + ReadOnlySpan span = arrayData.Buffers[1].Span.CastTo().Slice(0, arrayData.Length); + foreach (int offset in span) + { + offsetsBuilder.Append(baseOffset + offset); + } + } + + baseOffset += arrayData.Children[0].Length; + } + + ArrowBuffer offsetBuffer = offsetsBuilder.Build(_allocator); + ArrowBuffer sizesBuffer = ConcatenateFixedWidthTypeValueBuffer(2, Int32Type.Default); + ArrayData child = Concatenate(SelectChildren(0), _allocator); + + Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, offsetBuffer, sizesBuffer }, new[] { child }); + } + public void Visit(FixedSizeListType type) { CheckData(type, 1); @@ -161,6 +198,15 @@ private void CheckData(IArrowType type, int expectedBufferCount) } } + private void CheckDataVariadicCount(IArrowType type, int expectedBufferCount) + { + foreach (ArrayData arrayData in _arrayDataList) + { + 
arrayData.EnsureDataType(type.TypeId); + arrayData.EnsureVariadicBufferCount(expectedBufferCount); + } + } + private void ConcatenateVariableBinaryArrayData(IArrowType type) { CheckData(type, 3); @@ -171,6 +217,26 @@ private void ConcatenateVariableBinaryArrayData(IArrowType type) Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, offsetBuffer, valueBuffer }); } + private void ConcatenateBinaryViewArrayData(IArrowType type) + { + CheckDataVariadicCount(type, 2); + ArrowBuffer validityBuffer = ConcatenateValidityBuffer(); + ArrowBuffer viewBuffer = ConcatenateViewBuffer(out int variadicBufferCount); + ArrowBuffer[] buffers = new ArrowBuffer[2 + variadicBufferCount]; + buffers[0] = validityBuffer; + buffers[1] = viewBuffer; + int index = 2; + foreach (ArrayData arrayData in _arrayDataList) + { + for (int i = 2; i < arrayData.Buffers.Length; i++) + { + buffers[index++] = arrayData.Buffers[i]; + } + } + + Result = new ArrayData(type, _totalLength, _totalNullCount, 0, buffers); + } + private void ConcatenateLists(NestedType type) { CheckData(type, 2); @@ -206,7 +272,7 @@ private ArrowBuffer ConcatenateBitmapBuffer(int bufferIndex) return builder.Build(_allocator); } - private ArrowBuffer ConcatenateFixedWidthTypeValueBuffer(FixedWidthType type) + private ArrowBuffer ConcatenateFixedWidthTypeValueBuffer(int bufferIndex, FixedWidthType type) { int typeByteWidth = type.BitWidth / 8; var builder = new ArrowBuffer.Builder(_totalLength * typeByteWidth); @@ -216,7 +282,7 @@ private ArrowBuffer ConcatenateFixedWidthTypeValueBuffer(FixedWidthType type) int length = arrayData.Length; int byteLength = length * typeByteWidth; - builder.Append(arrayData.Buffers[1].Span.Slice(0, byteLength)); + builder.Append(arrayData.Buffers[bufferIndex].Span.Slice(0, byteLength)); } return builder.Build(_allocator); @@ -265,6 +331,36 @@ private ArrowBuffer ConcatenateOffsetBuffer() return builder.Build(_allocator); } + private ArrowBuffer ConcatenateViewBuffer(out int variadicBufferCount) + { + var builder = new ArrowBuffer.Builder(_totalLength); + variadicBufferCount = 0; + foreach (ArrayData arrayData in _arrayDataList) + { + if (arrayData.Length == 0) + { + continue; + } + + ReadOnlySpan span = arrayData.Buffers[1].Span.CastTo().Slice(0, arrayData.Length); + foreach (BinaryView view in span) + { + if (view.Length > BinaryView.MaxInlineLength) + { + builder.Append(view.AdjustBufferIndex(variadicBufferCount)); + } + else + { + builder.Append(view); + } + } + + variadicBufferCount += (arrayData.Buffers.Length - 2); + } + + return builder.Build(_allocator); + } + private ArrowBuffer ConcatenateUnionTypeBuffer() { var builder = new ArrowBuffer.Builder(_totalLength); diff --git a/csharp/src/Apache.Arrow/Arrays/ArrowArrayBuilderFactory.cs b/csharp/src/Apache.Arrow/Arrays/ArrowArrayBuilderFactory.cs index af5a524798396..f8367102082f5 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrowArrayBuilderFactory.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrowArrayBuilderFactory.cs @@ -54,8 +54,12 @@ internal static IArrowArrayBuilder> return new DoubleArray.Builder(); case ArrowTypeId.String: return new StringArray.Builder(); + case ArrowTypeId.StringView: + return new StringViewArray.Builder(); case ArrowTypeId.Binary: return new BinaryArray.Builder(); + case ArrowTypeId.BinaryView: + return new BinaryViewArray.Builder(); case ArrowTypeId.Timestamp: return new TimestampArray.Builder(); case ArrowTypeId.Date64: @@ -70,6 +74,8 @@ internal static IArrowArrayBuilder> return new 
DurationArray.Builder(dataType as DurationType); case ArrowTypeId.List: return new ListArray.Builder(dataType as ListType); + case ArrowTypeId.ListView: + return new ListViewArray.Builder(dataType as ListViewType); case ArrowTypeId.FixedSizeList: return new FixedSizeListArray.Builder(dataType as FixedSizeListType); case ArrowTypeId.Decimal128: diff --git a/csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs b/csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs index d6577260bb82d..3d2ab1d2129f1 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs @@ -51,14 +51,20 @@ public static IArrowArray BuildArray(ArrayData data) return new DoubleArray(data); case ArrowTypeId.String: return new StringArray(data); + case ArrowTypeId.StringView: + return new StringViewArray(data); case ArrowTypeId.FixedSizedBinary: return new FixedSizeBinaryArray(data); case ArrowTypeId.Binary: return new BinaryArray(data); + case ArrowTypeId.BinaryView: + return new BinaryViewArray(data); case ArrowTypeId.Timestamp: return new TimestampArray(data); case ArrowTypeId.List: return new ListArray(data); + case ArrowTypeId.ListView: + return new ListViewArray(data); case ArrowTypeId.Map: return new MapArray(data); case ArrowTypeId.Struct: diff --git a/csharp/src/Apache.Arrow/Arrays/BinaryViewArray.cs b/csharp/src/Apache.Arrow/Arrays/BinaryViewArray.cs new file mode 100644 index 0000000000000..4f62dffd1ddeb --- /dev/null +++ b/csharp/src/Apache.Arrow/Arrays/BinaryViewArray.cs @@ -0,0 +1,344 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
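// ----------------------------------------------------------------------------
// [Editor's illustration, not part of this patch] A minimal sketch of how the
// BinaryViewArray.Builder introduced in this file might be used once the patch
// is applied. It only calls APIs shown in this diff (Append(ReadOnlySpan),
// AppendNull, Build, GetValueLength). The 12-byte inline limit is an
// assumption based on the Arrow BinaryView format (the Scalars/BinaryView.cs
// struct is added by this commit but not shown here), and the class name
// BinaryViewSketch is hypothetical.
// ----------------------------------------------------------------------------
using System;
using System.Text;
using Apache.Arrow;

static class BinaryViewSketch
{
    public static BinaryViewArray BuildSample()
    {
        var builder = new BinaryViewArray.Builder();

        // Short value: expected to fit inline in the 16-byte view struct.
        builder.Append((ReadOnlySpan<byte>)Encoding.UTF8.GetBytes("tiny"));

        // Long value: expected to spill into a variadic data buffer,
        // with the view holding the length, a 4-byte prefix, and an offset.
        builder.Append((ReadOnlySpan<byte>)Encoding.UTF8.GetBytes(new string('x', 40)));

        // Null slot: validity bit cleared, no bytes written to the data buffer.
        builder.AppendNull();

        BinaryViewArray array = builder.Build();
        Console.WriteLine(array.GetValueLength(1)); // expected to print 40
        return array;
    }
}
// ----------------------------------------------------------------------------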
+ +using Apache.Arrow.Memory; +using Apache.Arrow.Scalars; +using Apache.Arrow.Types; +using System; +using System.Collections.Generic; +using System.Runtime.CompilerServices; +using System.Collections; + +namespace Apache.Arrow +{ + public class BinaryViewArray : Array, IReadOnlyList + { + public class Builder : BuilderBase + { + public Builder() : base(BinaryViewType.Default) { } + public Builder(IArrowType dataType) : base(dataType) { } + + protected override BinaryViewArray Build(ArrayData data) + { + return new BinaryViewArray(data); + } + } + + public BinaryViewArray(ArrayData data) + : base(data) + { + data.EnsureDataType(ArrowTypeId.BinaryView); + data.EnsureVariadicBufferCount(2); + } + + public BinaryViewArray(ArrowTypeId typeId, ArrayData data) + : base(data) + { + data.EnsureDataType(typeId); + data.EnsureVariadicBufferCount(2); + } + + public abstract class BuilderBase : IArrowArrayBuilder + where TArray : IArrowArray + where TBuilder : class, IArrowArrayBuilder + { + protected IArrowType DataType { get; } + protected TBuilder Instance => this as TBuilder; + protected ArrowBuffer.Builder BinaryViews { get; } + protected ArrowBuffer.Builder ValueBuffer { get; } + protected ArrowBuffer.BitmapBuilder ValidityBuffer { get; } + protected int NullCount => this.ValidityBuffer.UnsetBitCount; + + protected BuilderBase(IArrowType dataType) + { + DataType = dataType; + BinaryViews = new ArrowBuffer.Builder(); + ValueBuffer = new ArrowBuffer.Builder(); + ValidityBuffer = new ArrowBuffer.BitmapBuilder(); + } + + protected abstract TArray Build(ArrayData data); + + /// + /// Gets the length of the array built so far. + /// + public int Length => BinaryViews.Length; + + /// + /// Build an Arrow array from the appended contents so far. + /// + /// Optional memory allocator. + /// Returns an array of type . + public TArray Build(MemoryAllocator allocator = default) + { + bool hasValues = ValueBuffer.Length > 0; + var bufs = new ArrowBuffer[hasValues ? 3 : 2]; + bufs[0] = NullCount > 0 ? ValidityBuffer.Build(allocator) : ArrowBuffer.Empty; + bufs[1] = BinaryViews.Build(allocator); + if (hasValues) { bufs[2] = ValueBuffer.Build(allocator); } + + var data = new ArrayData( + DataType, + length: Length, + NullCount, + offset: 0, + bufs); + + return Build(data); + } + + /// + /// Append a single null value to the array. + /// + /// Returns the builder (for fluent-style composition). + public TBuilder AppendNull() + { + // Do not add to the value buffer in the case of a null. + // Note that we do not need to increment the offset as a result. + ValidityBuffer.Append(false); + BinaryViews.Append(default(BinaryView)); + return Instance; + } + + /// + /// Appends a value, consisting of a single byte, to the array. + /// + /// Byte value to append. + /// Returns the builder (for fluent-style composition). + public TBuilder Append(byte value) + { + ValidityBuffer.Append(true); + Span buf = stackalloc[] { value }; + BinaryViews.Append(new BinaryView(buf)); + return Instance; + } + + /// + /// Append a value, consisting of a span of bytes, to the array. + /// + /// + /// Note that a single value is added, which consists of arbitrarily many bytes. If multiple values are + /// to be added, use the method. + /// + /// Span of bytes to add. + /// Returns the builder (for fluent-style composition). 
+ public TBuilder Append(ReadOnlySpan span) + { + if (span.Length > BinaryView.MaxInlineLength) + { + int offset = ValueBuffer.Length; + ValueBuffer.Append(span); + BinaryViews.Append(new BinaryView(span.Length, span.Slice(0, 4), 0, offset)); + } + else + { + BinaryViews.Append(new BinaryView(span)); + } + ValidityBuffer.Append(true); + return Instance; + } + + /// + /// Append an enumerable collection of single-byte values to the array. + /// + /// + /// Note that this method appends multiple values, each of which is a single byte + /// + /// Single-byte values to add. + /// Returns the builder (for fluent-style composition). + public TBuilder AppendRange(IEnumerable values) + { + if (values == null) + { + throw new ArgumentNullException(nameof(values)); + } + + foreach (byte b in values) + { + Append(b); + } + + return Instance; + } + + /// + /// Append an enumerable collection of values to the array. + /// + /// Values to add. + /// Returns the builder (for fluent-style composition). + public TBuilder AppendRange(IEnumerable values) + { + if (values == null) + { + throw new ArgumentNullException(nameof(values)); + } + + foreach (byte[] arr in values) + { + if (arr == null) + { + AppendNull(); + } + else + { + Append((ReadOnlySpan)arr); + } + } + + return Instance; + } + + public TBuilder Reserve(int capacity) + { + // TODO: [ARROW-9366] Reserve capacity in the value buffer in a more sensible way. + BinaryViews.Reserve(capacity); + ValueBuffer.Reserve(capacity); + ValidityBuffer.Reserve(capacity); + return Instance; + } + + public TBuilder Resize(int length) + { + // TODO: [ARROW-9366] Resize the value buffer to a safe length based on offsets, not `length`. + BinaryViews.Resize(length); + ValueBuffer.Resize(length); + ValidityBuffer.Resize(length); + return Instance; + } + + public TBuilder Swap(int i, int j) + { + ValidityBuffer.Swap(i, j); + BinaryView view = BinaryViews.Span[i]; + BinaryViews.Span[i] = BinaryViews.Span[j]; + BinaryViews.Span[j] = view; + return Instance; + } + + public TBuilder Set(int index, byte value) + { + // TODO: Implement + throw new NotImplementedException(); + } + + /// + /// Clear all contents appended so far. + /// + /// Returns the builder (for fluent-style composition). + public TBuilder Clear() + { + BinaryViews.Clear(); + ValueBuffer.Clear(); + ValidityBuffer.Clear(); + return Instance; + } + } + + public BinaryViewArray(IArrowType dataType, int length, + ArrowBuffer binaryViewsBuffer, + ArrowBuffer dataBuffer, + ArrowBuffer nullBitmapBuffer, + int nullCount = 0, int offset = 0) + : this(new ArrayData(dataType, length, nullCount, offset, + new[] { nullBitmapBuffer, binaryViewsBuffer, dataBuffer })) + { } + + public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); + + public ArrowBuffer ViewsBuffer => Data.Buffers[1]; + + public int DataBufferCount => Data.Buffers.Length - 2; + + public ArrowBuffer DataBuffer(int index) => Data.Buffers[index + 2]; + + public ReadOnlySpan Views => ViewsBuffer.Span.CastTo().Slice(Offset, Length); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetValueLength(int index) + { + if (index < 0 || index >= Length) + { + throw new ArgumentOutOfRangeException(nameof(index)); + } + if (!IsValid(index)) + { + return 0; + } + + return Views[index].Length; + } + + /// + /// Get the collection of bytes, as a read-only span, at a given index in the array. 
+ /// + /// + /// Note that this method cannot reliably identify null values, which are indistinguishable from empty byte + /// collection values when seen in the context of this method's return type of . + /// Use the method or the overload instead + /// to reliably determine null values. + /// + /// Index at which to get bytes. + /// Returns a object. + /// If the index is negative or beyond the length of the array. + /// + public ReadOnlySpan GetBytes(int index) => GetBytes(index, out _); + + /// + /// Get the collection of bytes, as a read-only span, at a given index in the array. + /// + /// Index at which to get bytes. + /// Set to if the value at the given index is null. + /// Returns a object. + /// If the index is negative or beyond the length of the array. + /// + public ReadOnlySpan GetBytes(int index, out bool isNull) + { + if (index < 0 || index >= Length) + { + throw new ArgumentOutOfRangeException(nameof(index)); + } + + isNull = IsNull(index); + + if (isNull) + { + // Note that `return null;` is valid syntax, but would be misleading as `null` in the context of a span + // is actually returned as an empty span. + return ReadOnlySpan.Empty; + } + + BinaryView binaryView = Views[index]; + if (binaryView.IsInline) + { + return ViewsBuffer.Span.Slice(16 * index + 4, binaryView.Length); + } + + return DataBuffer(binaryView._bufferIndex).Span.Slice(binaryView._bufferOffset, binaryView.Length); + } + + int IReadOnlyCollection.Count => Length; + byte[] IReadOnlyList.this[int index] => GetBytes(index).ToArray(); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetBytes(index).ToArray(); + } + } + + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); + } +} diff --git a/csharp/src/Apache.Arrow/Arrays/ListViewArray.cs b/csharp/src/Apache.Arrow/Arrays/ListViewArray.cs new file mode 100644 index 0000000000000..081385d9211a4 --- /dev/null +++ b/csharp/src/Apache.Arrow/Arrays/ListViewArray.cs @@ -0,0 +1,217 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
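// ----------------------------------------------------------------------------
// [Editor's illustration, not part of this patch] A rough sketch of how the
// ListViewArray.Builder defined below might be used. Append() starts a new
// list slot, child values are appended through ValueBuilder, and the
// offset/size pair for a slot is finalized when the next slot (or Build) is
// started. Int32Type and Int32Array.Builder are pre-existing Arrow C# types
// assumed here, and the class name ListViewSketch is hypothetical.
// ----------------------------------------------------------------------------
using Apache.Arrow;
using Apache.Arrow.Types;

static class ListViewSketch
{
    public static ListViewArray BuildSample()
    {
        var builder = new ListViewArray.Builder(Int32Type.Default);
        var values = (Int32Array.Builder)builder.ValueBuilder;

        builder.Append();     // slot 0
        values.Append(1);
        values.Append(2);

        builder.AppendNull(); // slot 1 is null, recorded with size 0

        builder.Append();     // slot 2
        values.Append(3);

        // Expected layout for this sketch: offsets [0, 2, 2], sizes [2, 0, 1]
        return builder.Build();
    }
}
// ----------------------------------------------------------------------------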
+ +using System; +using Apache.Arrow.Memory; +using Apache.Arrow.Types; + +namespace Apache.Arrow +{ + public class ListViewArray : Array + { + public class Builder : IArrowArrayBuilder + { + public IArrowArrayBuilder> ValueBuilder { get; } + + public int Length => ValueOffsetsBufferBuilder.Length; + + private ArrowBuffer.Builder ValueOffsetsBufferBuilder { get; } + + private ArrowBuffer.Builder SizesBufferBuilder { get; } + + private ArrowBuffer.BitmapBuilder ValidityBufferBuilder { get; } + + public int NullCount { get; protected set; } + + private IArrowType DataType { get; } + + private int Start { get; set; } + + public Builder(IArrowType valueDataType) : this(new ListViewType(valueDataType)) + { + } + + public Builder(Field valueField) : this(new ListViewType(valueField)) + { + } + + internal Builder(ListViewType dataType) + { + ValueBuilder = ArrowArrayBuilderFactory.Build(dataType.ValueDataType); + ValueOffsetsBufferBuilder = new ArrowBuffer.Builder(); + SizesBufferBuilder = new ArrowBuffer.Builder(); + ValidityBufferBuilder = new ArrowBuffer.BitmapBuilder(); + DataType = dataType; + Start = -1; + } + + /// + /// Start a new variable-length list slot + /// + /// This function should be called before beginning to append elements to the + /// value builder. TODO: Consider adding builder APIs to support construction + /// of overlapping lists. + /// + public Builder Append() + { + AppendPrevious(); + + ValidityBufferBuilder.Append(true); + + return this; + } + + public Builder AppendNull() + { + AppendPrevious(); + + ValidityBufferBuilder.Append(false); + ValueOffsetsBufferBuilder.Append(Start); + SizesBufferBuilder.Append(0); + NullCount++; + Start = -1; + + return this; + } + + private void AppendPrevious() + { + if (Start >= 0) + { + ValueOffsetsBufferBuilder.Append(Start); + SizesBufferBuilder.Append(ValueBuilder.Length - Start); + } + Start = ValueBuilder.Length; + } + + public ListViewArray Build(MemoryAllocator allocator = default) + { + AppendPrevious(); + + ArrowBuffer validityBuffer = NullCount > 0 + ? 
ValidityBufferBuilder.Build(allocator) + : ArrowBuffer.Empty; + + return new ListViewArray(DataType, Length, + ValueOffsetsBufferBuilder.Build(allocator), SizesBufferBuilder.Build(allocator), + ValueBuilder.Build(allocator), + validityBuffer, NullCount, 0); + } + + public Builder Reserve(int capacity) + { + ValueOffsetsBufferBuilder.Reserve(capacity); + SizesBufferBuilder.Reserve(capacity); + ValidityBufferBuilder.Reserve(capacity); + return this; + } + + public Builder Resize(int length) + { + ValueOffsetsBufferBuilder.Resize(length); + SizesBufferBuilder.Resize(length); + ValidityBufferBuilder.Resize(length); + return this; + } + + public Builder Clear() + { + ValueOffsetsBufferBuilder.Clear(); + SizesBufferBuilder.Clear(); + ValueBuilder.Clear(); + ValidityBufferBuilder.Clear(); + return this; + } + + } + + public IArrowArray Values { get; } + + public ArrowBuffer ValueOffsetsBuffer => Data.Buffers[1]; + + public ReadOnlySpan ValueOffsets => ValueOffsetsBuffer.Span.CastTo().Slice(Offset, Length); + + public ArrowBuffer SizesBuffer => Data.Buffers[2]; + + public ReadOnlySpan Sizes => SizesBuffer.Span.CastTo().Slice(Offset, Length); + + public ListViewArray(IArrowType dataType, int length, + ArrowBuffer valueOffsetsBuffer, ArrowBuffer sizesBuffer, IArrowArray values, + ArrowBuffer nullBitmapBuffer, int nullCount = 0, int offset = 0) + : this(new ArrayData(dataType, length, nullCount, offset, + new[] { nullBitmapBuffer, valueOffsetsBuffer, sizesBuffer }, new[] { values.Data }), + values) + { + } + + public ListViewArray(ArrayData data) + : this(data, ArrowArrayFactory.BuildArray(data.Children[0])) + { + } + + private ListViewArray(ArrayData data, IArrowArray values) : base(data) + { + data.EnsureBufferCount(3); + data.EnsureDataType(ArrowTypeId.ListView); + Values = values; + } + + public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); + + public int GetValueLength(int index) + { + if (index < 0 || index >= Length) + { + throw new ArgumentOutOfRangeException(nameof(index)); + } + + if (IsNull(index)) + { + return 0; + } + + return Sizes[index]; + } + + public IArrowArray GetSlicedValues(int index) + { + if (index < 0 || index >= Length) + { + throw new ArgumentOutOfRangeException(nameof(index)); + } + + if (IsNull(index)) + { + return null; + } + + if (!(Values is Array array)) + { + return default; + } + + return array.Slice(ValueOffsets[index], GetValueLength(index)); + } + + protected override void Dispose(bool disposing) + { + if (disposing) + { + Values?.Dispose(); + } + base.Dispose(disposing); + } + } +} diff --git a/csharp/src/Apache.Arrow/Arrays/StringViewArray.cs b/csharp/src/Apache.Arrow/Arrays/StringViewArray.cs new file mode 100644 index 0000000000000..88644761535d9 --- /dev/null +++ b/csharp/src/Apache.Arrow/Arrays/StringViewArray.cs @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +using Apache.Arrow.Types; +using System; +using System.Collections; +using System.Collections.Generic; +using System.Runtime.InteropServices; +using System.Text; + +namespace Apache.Arrow +{ + public class StringViewArray: BinaryViewArray, IReadOnlyList + { + public static readonly Encoding DefaultEncoding = Encoding.UTF8; + + public new class Builder : BuilderBase + { + public Builder() : base(StringViewType.Default) { } + + protected override StringViewArray Build(ArrayData data) + { + return new StringViewArray(data); + } + + public Builder Append(string value, Encoding encoding = null) + { + if (value == null) + { + return AppendNull(); + } + encoding = encoding ?? DefaultEncoding; + byte[] span = encoding.GetBytes(value); + return Append(span.AsSpan()); + } + + public Builder AppendRange(IEnumerable values, Encoding encoding = null) + { + foreach (string value in values) + { + Append(value, encoding); + } + + return this; + } + } + + public StringViewArray(ArrayData data) + : base(ArrowTypeId.StringView, data) { } + + public StringViewArray(int length, + ArrowBuffer valueOffsetsBuffer, + ArrowBuffer dataBuffer, + ArrowBuffer nullBitmapBuffer, + int nullCount = 0, int offset = 0) + : this(new ArrayData(StringViewType.Default, length, nullCount, offset, + new[] { nullBitmapBuffer, valueOffsetsBuffer, dataBuffer })) + { } + + public override void Accept(IArrowArrayVisitor visitor) => Accept(this, visitor); + + public string GetString(int index, Encoding encoding = default) + { + encoding ??= DefaultEncoding; + + ReadOnlySpan bytes = GetBytes(index, out bool isNull); + + if (isNull) + { + return null; + } + if (bytes.Length == 0) + { + return string.Empty; + } + + unsafe + { + fixed (byte* data = &MemoryMarshal.GetReference(bytes)) + return encoding.GetString(data, bytes.Length); + } + } + + int IReadOnlyCollection.Count => Length; + + string IReadOnlyList.this[int index] => GetString(index); + + IEnumerator IEnumerable.GetEnumerator() + { + for (int index = 0; index < Length; index++) + { + yield return GetString(index); + }; + } + + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); + } +} diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs index 2d9febea33f54..03059eaf5d4df 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs @@ -15,10 +15,12 @@ using System; +using System.Buffers; using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using Apache.Arrow.Memory; +using Apache.Arrow.Types; namespace Apache.Arrow.C { @@ -121,7 +123,16 @@ private unsafe static void ConvertArray(ExportedAllocationOwner sharedOwner, Arr cArray->buffers = null; if (cArray->n_buffers > 0) { - cArray->buffers = (byte**)sharedOwner.Allocate(array.Buffers.Length * IntPtr.Size); + long* lengths = null; + int bufferCount = array.Buffers.Length; + if (array.DataType.TypeId == ArrowTypeId.BinaryView || array.DataType.TypeId == ArrowTypeId.StringView) + { + lengths = (long*)sharedOwner.Allocate(8 * bufferCount); // overallocation to avoid edge case + bufferCount++; + cArray->n_buffers++; + } + + cArray->buffers = (byte**)sharedOwner.Allocate(bufferCount * IntPtr.Size); for (int i = 0; i < array.Buffers.Length; i++) { ArrowBuffer buffer = array.Buffers[i]; @@ -131,6 +142,15 @@ private unsafe static void 
ConvertArray(ExportedAllocationOwner sharedOwner, Arr throw new NotSupportedException($"An ArrowArray of type {array.DataType.TypeId} could not be exported: failed on buffer #{i}"); } cArray->buffers[i] = (byte*)ptr; + if (lengths != null && i >= 2) + { + lengths[i - 2] = array.Buffers[i].Length; + } + } + + if (lengths != null) + { + cArray->buffers[array.Buffers.Length] = (byte*)lengths; } } diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs index 1b40ec49658bb..fbb2be661fc5d 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs @@ -157,10 +157,18 @@ private ArrayData GetAsArrayData(CArrowArray* cArray, IArrowType type) case ArrowTypeId.Binary: buffers = ImportByteArrayBuffers(cArray); break; + case ArrowTypeId.StringView: + case ArrowTypeId.BinaryView: + buffers = ImportByteArrayViewBuffers(cArray); + break; case ArrowTypeId.List: children = ProcessListChildren(cArray, ((ListType)type).ValueDataType); buffers = ImportListBuffers(cArray); break; + case ArrowTypeId.ListView: + children = ProcessListChildren(cArray, ((ListViewType)type).ValueDataType); + buffers = ImportListViewBuffers(cArray); + break; case ArrowTypeId.FixedSizeList: children = ProcessListChildren(cArray, ((FixedSizeListType)type).ValueDataType); buffers = ImportFixedSizeListBuffers(cArray); @@ -268,6 +276,28 @@ private ArrowBuffer[] ImportByteArrayBuffers(CArrowArray* cArray) return buffers; } + private ArrowBuffer[] ImportByteArrayViewBuffers(CArrowArray* cArray) + { + if (cArray->n_buffers < 3) + { + throw new InvalidOperationException("Byte array views are expected to have at least three buffers"); + } + + int length = checked((int)cArray->length); + int viewsLength = length * 16; + + long* bufferLengths = (long*)cArray->buffers[cArray->n_buffers - 1]; + ArrowBuffer[] buffers = new ArrowBuffer[cArray->n_buffers - 1]; + buffers[0] = ImportValidityBuffer(cArray); + buffers[1] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[1], 0, viewsLength)); + for (int i = 2; i < buffers.Length; i++) + { + buffers[i] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[i], 0, checked((int)bufferLengths[i - 2]))); + } + + return buffers; + } + private ArrowBuffer[] ImportListBuffers(CArrowArray* cArray) { if (cArray->n_buffers != 2) @@ -285,6 +315,24 @@ private ArrowBuffer[] ImportListBuffers(CArrowArray* cArray) return buffers; } + private ArrowBuffer[] ImportListViewBuffers(CArrowArray* cArray) + { + if (cArray->n_buffers != 3) + { + throw new InvalidOperationException("List view arrays are expected to have exactly three buffers"); + } + + int length = checked((int)cArray->length); + int offsetsLength = length * 4; + + ArrowBuffer[] buffers = new ArrowBuffer[3]; + buffers[0] = ImportValidityBuffer(cArray); + buffers[1] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[1], 0, offsetsLength)); + buffers[2] = new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[2], 0, offsetsLength)); + + return buffers; + } + private ArrowBuffer[] ImportFixedSizeListBuffers(CArrowArray* cArray) { if (cArray->n_buffers != 1) diff --git a/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs b/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs index c9b45a8eb2d87..3bb7134af3ba9 100644 --- a/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowSchemaExporter.cs @@ -167,7 +167,9 @@ private static string GetFormat(IArrowType datatype) return $"d:{decimalType.Precision},{decimalType.Scale},256"; // 
Binary case BinaryType _: return "z"; + case BinaryViewType _: return "vz"; case StringType _: return "u"; + case StringViewType _: return "vu"; case FixedSizeBinaryType binaryType: return $"w:{binaryType.ByteWidth}"; // Date @@ -196,6 +198,7 @@ private static string GetFormat(IArrowType datatype) }; // Nested case ListType _: return "+l"; + case ListViewType _: return "+vl"; case FixedSizeListType fixedListType: return $"+w:{fixedListType.ListSize}"; case StructType _: return "+s"; diff --git a/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs b/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs index 9c81195771bae..f1acc007bcef7 100644 --- a/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowSchemaImporter.cs @@ -165,7 +165,7 @@ public ArrowType GetAsType() } // Special handling for nested types - if (format == "+l") + if (format == "+l" || format == "+vl") { if (_cSchema->n_children != 1) { @@ -180,7 +180,7 @@ public ArrowType GetAsType() Field childField = childSchema.GetAsField(); - return new ListType(childField); + return format[1] == 'v' ? new ListViewType(childField) : new ListType(childField); } else if (format == "+s") { @@ -303,8 +303,10 @@ public ArrowType GetAsType() "g" => DoubleType.Default, // Binary data "z" => BinaryType.Default, + "vz" => BinaryViewType.Default, //"Z" => new LargeBinaryType() // Not yet implemented "u" => StringType.Default, + "vu" => StringViewType.Default, //"U" => new LargeStringType(), // Not yet implemented // Date and time "tdD" => Date32Type.Default, diff --git a/csharp/src/Apache.Arrow/Extensions/ArrayDataExtensions.cs b/csharp/src/Apache.Arrow/Extensions/ArrayDataExtensions.cs index 399d9bf5e6bf1..2b6742a3d0cb2 100644 --- a/csharp/src/Apache.Arrow/Extensions/ArrayDataExtensions.cs +++ b/csharp/src/Apache.Arrow/Extensions/ArrayDataExtensions.cs @@ -23,6 +23,17 @@ internal static class ArrayDataExtensions public static void EnsureBufferCount(this ArrayData data, int count) { if (data.Buffers.Length != count) + { + // TODO: Use localizable string resource + throw new ArgumentException( + $"Buffer count <{data.Buffers.Length}> must be at exactly <{count}>", + nameof(data.Buffers.Length)); + } + } + + public static void EnsureVariadicBufferCount(this ArrayData data, int count) + { + if (data.Buffers.Length < count) { // TODO: Use localizable string resource throw new ArgumentException( diff --git a/csharp/src/Apache.Arrow/Extensions/FlatbufExtensions.cs b/csharp/src/Apache.Arrow/Extensions/FlatbufExtensions.cs index 5f39680b90ebc..b44c02d854077 100644 --- a/csharp/src/Apache.Arrow/Extensions/FlatbufExtensions.cs +++ b/csharp/src/Apache.Arrow/Extensions/FlatbufExtensions.cs @@ -19,25 +19,6 @@ namespace Apache.Arrow { internal static class FlatbufExtensions { - public static bool IsFixedPrimitive(this Flatbuf.Type t) - { - if (t == Flatbuf.Type.Utf8 || t == Flatbuf.Type.Binary) - return false; - return true; - } - - public static bool IsFixedPrimitive(this Types.IArrowType t) - { - return t.TypeId.IsFixedPrimitive(); - } - - public static bool IsFixedPrimitive(this Types.ArrowTypeId t) - { - if (t == Types.ArrowTypeId.String || t == Types.ArrowTypeId.Binary) - return false; - return true; - } - public static Types.IntervalUnit ToArrow(this Flatbuf.IntervalUnit unit) { switch (unit) diff --git a/csharp/src/Apache.Arrow/Flatbuf/BinaryView.cs b/csharp/src/Apache.Arrow/Flatbuf/BinaryView.cs new file mode 100644 index 0000000000000..2f9cca51737f8 --- /dev/null +++ b/csharp/src/Apache.Arrow/Flatbuf/BinaryView.cs @@ -0,0 
+1,47 @@ +// +// automatically generated by the FlatBuffers compiler, do not modify +// + +namespace Apache.Arrow.Flatbuf +{ + +using global::System; +using global::System.Collections.Generic; +using global::Google.FlatBuffers; + +/// Logically the same as Binary, but the internal representation uses a view +/// struct that contains the string length and either the string's entire data +/// inline (for small strings) or an inlined prefix, an index of another buffer, +/// and an offset pointing to a slice in that buffer (for non-small strings). +/// +/// Since it uses a variable number of data buffers, each Field with this type +/// must have a corresponding entry in `variadicBufferCounts`. +internal struct BinaryView : IFlatbufferObject +{ + private Table __p; + public ByteBuffer ByteBuffer { get { return __p.bb; } } + public static void ValidateVersion() { FlatBufferConstants.FLATBUFFERS_23_5_9(); } + public static BinaryView GetRootAsBinaryView(ByteBuffer _bb) { return GetRootAsBinaryView(_bb, new BinaryView()); } + public static BinaryView GetRootAsBinaryView(ByteBuffer _bb, BinaryView obj) { return (obj.__assign(_bb.GetInt(_bb.Position) + _bb.Position, _bb)); } + public void __init(int _i, ByteBuffer _bb) { __p = new Table(_i, _bb); } + public BinaryView __assign(int _i, ByteBuffer _bb) { __init(_i, _bb); return this; } + + + public static void StartBinaryView(FlatBufferBuilder builder) { builder.StartTable(0); } + public static Offset EndBinaryView(FlatBufferBuilder builder) { + int o = builder.EndTable(); + return new Offset(o); + } +} + + +static internal class BinaryViewVerify +{ + static public bool Verify(Google.FlatBuffers.Verifier verifier, uint tablePos) + { + return verifier.VerifyTableStart(tablePos) + && verifier.VerifyTableEnd(tablePos); + } +} + +} diff --git a/csharp/src/Apache.Arrow/Flatbuf/Enums/MetadataVersion.cs b/csharp/src/Apache.Arrow/Flatbuf/Enums/MetadataVersion.cs index 1e893e8cb6ffc..13b5315805dc9 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/Enums/MetadataVersion.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/Enums/MetadataVersion.cs @@ -8,21 +8,21 @@ namespace Apache.Arrow.Flatbuf internal enum MetadataVersion : short { /// 0.1.0 (October 2016). - V1 = 0, + V1 = 0, /// 0.2.0 (February 2017). Non-backwards compatible with V1. - V2 = 1, + V2 = 1, /// 0.3.0 -> 0.7.1 (May - December 2017). Non-backwards compatible with V2. - V3 = 2, + V3 = 2, /// >= 0.8.0 (December 2017). Non-backwards compatible with V3. - V4 = 3, - /// >= 1.0.0 (July 2020. Backwards compatible with V4 (V5 readers can read V4 + V4 = 3, + /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4 /// metadata and IPC messages). Implementations are recommended to provide a /// V4 compatibility mode with V5 format changes disabled. /// /// Incompatible changes between V4 and V5: /// - Union buffer layout has changed. In V5, Unions don't have a validity /// bitmap buffer. 
- V5 = 4, + V5 = 4, }; diff --git a/csharp/src/Apache.Arrow/Flatbuf/Enums/Type.cs b/csharp/src/Apache.Arrow/Flatbuf/Enums/Type.cs index 10f852efb9b96..9c04288648dea 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/Enums/Type.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/Enums/Type.cs @@ -33,6 +33,10 @@ internal enum Type : byte LargeUtf8 = 20, LargeList = 21, RunEndEncoded = 22, + BinaryView = 23, + Utf8View = 24, + ListView = 25, + LargeListView = 26, }; @@ -110,6 +114,18 @@ static public bool Verify(Google.FlatBuffers.Verifier verifier, byte typeId, uin case Type.RunEndEncoded: result = RunEndEncodedVerify.Verify(verifier, tablePos); break; + case Type.BinaryView: + result = BinaryViewVerify.Verify(verifier, tablePos); + break; + case Type.Utf8View: + result = Utf8ViewVerify.Verify(verifier, tablePos); + break; + case Type.ListView: + result = ListViewVerify.Verify(verifier, tablePos); + break; + case Type.LargeListView: + result = LargeListViewVerify.Verify(verifier, tablePos); + break; default: result = true; break; } diff --git a/csharp/src/Apache.Arrow/Flatbuf/Field.cs b/csharp/src/Apache.Arrow/Flatbuf/Field.cs index c5c6c0a165598..efbc6afb06d03 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/Field.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/Field.cs @@ -57,6 +57,10 @@ internal struct Field : IFlatbufferObject public LargeUtf8 TypeAsLargeUtf8() { return Type().Value; } public LargeList TypeAsLargeList() { return Type().Value; } public RunEndEncoded TypeAsRunEndEncoded() { return Type().Value; } + public BinaryView TypeAsBinaryView() { return Type().Value; } + public Utf8View TypeAsUtf8View() { return Type().Value; } + public ListView TypeAsListView() { return Type().Value; } + public LargeListView TypeAsLargeListView() { return Type().Value; } /// Present only if the field is dictionary encoded. public DictionaryEncoding? Dictionary { get { int o = __p.__offset(12); return o != 0 ? (DictionaryEncoding?)(new DictionaryEncoding()).__assign(__p.__indirect(o + __p.bb_pos), __p.bb) : null; } } /// children apply only to nested data types like Struct, List and Union. For diff --git a/csharp/src/Apache.Arrow/Flatbuf/LargeListView.cs b/csharp/src/Apache.Arrow/Flatbuf/LargeListView.cs new file mode 100644 index 0000000000000..685e91333c38c --- /dev/null +++ b/csharp/src/Apache.Arrow/Flatbuf/LargeListView.cs @@ -0,0 +1,42 @@ +// +// automatically generated by the FlatBuffers compiler, do not modify +// + +namespace Apache.Arrow.Flatbuf +{ + +using global::System; +using global::System.Collections.Generic; +using global::Google.FlatBuffers; + +/// Same as ListView, but with 64-bit offsets and sizes, allowing to represent +/// extremely large data values. 
+internal struct LargeListView : IFlatbufferObject +{ + private Table __p; + public ByteBuffer ByteBuffer { get { return __p.bb; } } + public static void ValidateVersion() { FlatBufferConstants.FLATBUFFERS_23_5_9(); } + public static LargeListView GetRootAsLargeListView(ByteBuffer _bb) { return GetRootAsLargeListView(_bb, new LargeListView()); } + public static LargeListView GetRootAsLargeListView(ByteBuffer _bb, LargeListView obj) { return (obj.__assign(_bb.GetInt(_bb.Position) + _bb.Position, _bb)); } + public void __init(int _i, ByteBuffer _bb) { __p = new Table(_i, _bb); } + public LargeListView __assign(int _i, ByteBuffer _bb) { __init(_i, _bb); return this; } + + + public static void StartLargeListView(FlatBufferBuilder builder) { builder.StartTable(0); } + public static Offset EndLargeListView(FlatBufferBuilder builder) { + int o = builder.EndTable(); + return new Offset(o); + } +} + + +static internal class LargeListViewVerify +{ + static public bool Verify(Google.FlatBuffers.Verifier verifier, uint tablePos) + { + return verifier.VerifyTableStart(tablePos) + && verifier.VerifyTableEnd(tablePos); + } +} + +} diff --git a/csharp/src/Apache.Arrow/Flatbuf/ListView.cs b/csharp/src/Apache.Arrow/Flatbuf/ListView.cs new file mode 100644 index 0000000000000..d2e54e428524b --- /dev/null +++ b/csharp/src/Apache.Arrow/Flatbuf/ListView.cs @@ -0,0 +1,43 @@ +// +// automatically generated by the FlatBuffers compiler, do not modify +// + +namespace Apache.Arrow.Flatbuf +{ + +using global::System; +using global::System.Collections.Generic; +using global::Google.FlatBuffers; + +/// Represents the same logical types that List can, but contains offsets and +/// sizes allowing for writes in any order and sharing of child values among +/// list values. +internal struct ListView : IFlatbufferObject +{ + private Table __p; + public ByteBuffer ByteBuffer { get { return __p.bb; } } + public static void ValidateVersion() { FlatBufferConstants.FLATBUFFERS_23_5_9(); } + public static ListView GetRootAsListView(ByteBuffer _bb) { return GetRootAsListView(_bb, new ListView()); } + public static ListView GetRootAsListView(ByteBuffer _bb, ListView obj) { return (obj.__assign(_bb.GetInt(_bb.Position) + _bb.Position, _bb)); } + public void __init(int _i, ByteBuffer _bb) { __p = new Table(_i, _bb); } + public ListView __assign(int _i, ByteBuffer _bb) { __init(_i, _bb); return this; } + + + public static void StartListView(FlatBufferBuilder builder) { builder.StartTable(0); } + public static Offset EndListView(FlatBufferBuilder builder) { + int o = builder.EndTable(); + return new Offset(o); + } +} + + +static internal class ListViewVerify +{ + static public bool Verify(Google.FlatBuffers.Verifier verifier, uint tablePos) + { + return verifier.VerifyTableStart(tablePos) + && verifier.VerifyTableEnd(tablePos); + } +} + +} diff --git a/csharp/src/Apache.Arrow/Flatbuf/RecordBatch.cs b/csharp/src/Apache.Arrow/Flatbuf/RecordBatch.cs index 9ab9715165ddc..2df8716bc1655 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/RecordBatch.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/RecordBatch.cs @@ -38,27 +38,57 @@ internal struct RecordBatch : IFlatbufferObject public int BuffersLength { get { int o = __p.__offset(8); return o != 0 ? __p.__vector_len(o) : 0; } } /// Optional compression of the message body public BodyCompression? Compression { get { int o = __p.__offset(10); return o != 0 ? 
(BodyCompression?)(new BodyCompression()).__assign(__p.__indirect(o + __p.bb_pos), __p.bb) : null; } } + /// Some types such as Utf8View are represented using a variable number of buffers. + /// For each such Field in the pre-ordered flattened logical schema, there will be + /// an entry in variadicBufferCounts to indicate the number of number of variadic + /// buffers which belong to that Field in the current RecordBatch. + /// + /// For example, the schema + /// col1: Struct + /// col2: Utf8View + /// contains two Fields with variadic buffers so variadicBufferCounts will have + /// two entries, the first counting the variadic buffers of `col1.beta` and the + /// second counting `col2`'s. + /// + /// This field may be omitted if and only if the schema contains no Fields with + /// a variable number of buffers, such as BinaryView and Utf8View. + public long VariadicBufferCounts(int j) { int o = __p.__offset(12); return o != 0 ? __p.bb.GetLong(__p.__vector(o) + j * 8) : (long)0; } + public int VariadicBufferCountsLength { get { int o = __p.__offset(12); return o != 0 ? __p.__vector_len(o) : 0; } } +#if ENABLE_SPAN_T + public Span GetVariadicCountsBytes() { return __p.__vector_as_span(12, 8); } +#else + public ArraySegment? GetVariadicCountsBytes() { return __p.__vector_as_arraysegment(12); } +#endif + public long[] GetVariadicCountsArray() { return __p.__vector_as_array(12); } public static Offset CreateRecordBatch(FlatBufferBuilder builder, long length = 0, VectorOffset nodesOffset = default(VectorOffset), VectorOffset buffersOffset = default(VectorOffset), - Offset compressionOffset = default(Offset)) { - builder.StartTable(4); + Offset compressionOffset = default(Offset), + VectorOffset variadicCountsOffset = default(VectorOffset)) { + builder.StartTable(5); RecordBatch.AddLength(builder, length); + RecordBatch.AddVariadicCounts(builder, variadicCountsOffset); RecordBatch.AddCompression(builder, compressionOffset); RecordBatch.AddBuffers(builder, buffersOffset); RecordBatch.AddNodes(builder, nodesOffset); return RecordBatch.EndRecordBatch(builder); } - public static void StartRecordBatch(FlatBufferBuilder builder) { builder.StartTable(4); } + public static void StartRecordBatch(FlatBufferBuilder builder) { builder.StartTable(5); } public static void AddLength(FlatBufferBuilder builder, long length) { builder.AddLong(0, length, 0); } public static void AddNodes(FlatBufferBuilder builder, VectorOffset nodesOffset) { builder.AddOffset(1, nodesOffset.Value, 0); } public static void StartNodesVector(FlatBufferBuilder builder, int numElems) { builder.StartVector(16, numElems, 8); } public static void AddBuffers(FlatBufferBuilder builder, VectorOffset buffersOffset) { builder.AddOffset(2, buffersOffset.Value, 0); } public static void StartBuffersVector(FlatBufferBuilder builder, int numElems) { builder.StartVector(16, numElems, 8); } public static void AddCompression(FlatBufferBuilder builder, Offset compressionOffset) { builder.AddOffset(3, compressionOffset.Value, 0); } + public static void AddVariadicCounts(FlatBufferBuilder builder, VectorOffset variadicCountsOffset) { builder.AddOffset(4, variadicCountsOffset.Value, 0); } + public static VectorOffset CreateVariadicCountsVector(FlatBufferBuilder builder, long[] data) { builder.StartVector(8, data.Length, 8); for (int i = data.Length - 1; i >= 0; i--) builder.AddLong(data[i]); return builder.EndVector(); } + public static VectorOffset CreateVariadicCountsVectorBlock(FlatBufferBuilder builder, long[] data) { builder.StartVector(8, 
data.Length, 8); builder.Add(data); return builder.EndVector(); } + public static VectorOffset CreateVariadicCountsVectorBlock(FlatBufferBuilder builder, ArraySegment data) { builder.StartVector(8, data.Count, 8); builder.Add(data); return builder.EndVector(); } + public static VectorOffset CreateVariadicCountsVectorBlock(FlatBufferBuilder builder, IntPtr dataPtr, int sizeInBytes) { builder.StartVector(1, sizeInBytes, 1); builder.Add(dataPtr, sizeInBytes); return builder.EndVector(); } + public static void StartVariadicCountsVector(FlatBufferBuilder builder, int numElems) { builder.StartVector(8, numElems, 8); } public static Offset EndRecordBatch(FlatBufferBuilder builder) { int o = builder.EndTable(); return new Offset(o); @@ -75,6 +105,7 @@ static public bool Verify(Google.FlatBuffers.Verifier verifier, uint tablePos) && verifier.VerifyVectorOfData(tablePos, 6 /*Nodes*/, 16 /*FieldNode*/, false) && verifier.VerifyVectorOfData(tablePos, 8 /*Buffers*/, 16 /*Buffer*/, false) && verifier.VerifyTable(tablePos, 10 /*Compression*/, BodyCompressionVerify.Verify, false) + && verifier.VerifyVectorOfData(tablePos, 12 /*VariadicCounts*/, 8 /*long*/, false) && verifier.VerifyTableEnd(tablePos); } } diff --git a/csharp/src/Apache.Arrow/Flatbuf/SparseTensor.cs b/csharp/src/Apache.Arrow/Flatbuf/SparseTensor.cs index 3f9e1de7c00a9..099950fafe4ee 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/SparseTensor.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/SparseTensor.cs @@ -47,6 +47,10 @@ internal struct SparseTensor : IFlatbufferObject public LargeUtf8 TypeAsLargeUtf8() { return Type().Value; } public LargeList TypeAsLargeList() { return Type().Value; } public RunEndEncoded TypeAsRunEndEncoded() { return Type().Value; } + public BinaryView TypeAsBinaryView() { return Type().Value; } + public Utf8View TypeAsUtf8View() { return Type().Value; } + public ListView TypeAsListView() { return Type().Value; } + public LargeListView TypeAsLargeListView() { return Type().Value; } /// The dimensions of the tensor, optionally named. public TensorDim? Shape(int j) { int o = __p.__offset(8); return o != 0 ? (TensorDim?)(new TensorDim()).__assign(__p.__indirect(__p.__vector(o) + j * 4), __p.bb) : null; } public int ShapeLength { get { int o = __p.__offset(8); return o != 0 ? __p.__vector_len(o) : 0; } } diff --git a/csharp/src/Apache.Arrow/Flatbuf/Tensor.cs b/csharp/src/Apache.Arrow/Flatbuf/Tensor.cs index f8c213768a3fc..eb39257d861ca 100644 --- a/csharp/src/Apache.Arrow/Flatbuf/Tensor.cs +++ b/csharp/src/Apache.Arrow/Flatbuf/Tensor.cs @@ -46,6 +46,10 @@ internal struct Tensor : IFlatbufferObject public LargeUtf8 TypeAsLargeUtf8() { return Type().Value; } public LargeList TypeAsLargeList() { return Type().Value; } public RunEndEncoded TypeAsRunEndEncoded() { return Type().Value; } + public BinaryView TypeAsBinaryView() { return Type().Value; } + public Utf8View TypeAsUtf8View() { return Type().Value; } + public ListView TypeAsListView() { return Type().Value; } + public LargeListView TypeAsLargeListView() { return Type().Value; } /// The dimensions of the tensor, optionally named public TensorDim? Shape(int j) { int o = __p.__offset(8); return o != 0 ? (TensorDim?)(new TensorDim()).__assign(__p.__indirect(__p.__vector(o) + j * 4), __p.bb) : null; } public int ShapeLength { get { int o = __p.__offset(8); return o != 0 ? 
__p.__vector_len(o) : 0; } } diff --git a/csharp/src/Apache.Arrow/Flatbuf/Utf8View.cs b/csharp/src/Apache.Arrow/Flatbuf/Utf8View.cs new file mode 100644 index 0000000000000..e85c5374a9acc --- /dev/null +++ b/csharp/src/Apache.Arrow/Flatbuf/Utf8View.cs @@ -0,0 +1,47 @@ +// +// automatically generated by the FlatBuffers compiler, do not modify +// + +namespace Apache.Arrow.Flatbuf +{ + +using global::System; +using global::System.Collections.Generic; +using global::Google.FlatBuffers; + +/// Logically the same as Utf8, but the internal representation uses a view +/// struct that contains the string length and either the string's entire data +/// inline (for small strings) or an inlined prefix, an index of another buffer, +/// and an offset pointing to a slice in that buffer (for non-small strings). +/// +/// Since it uses a variable number of data buffers, each Field with this type +/// must have a corresponding entry in `variadicBufferCounts`. +internal struct Utf8View : IFlatbufferObject +{ + private Table __p; + public ByteBuffer ByteBuffer { get { return __p.bb; } } + public static void ValidateVersion() { FlatBufferConstants.FLATBUFFERS_23_5_9(); } + public static Utf8View GetRootAsUtf8View(ByteBuffer _bb) { return GetRootAsUtf8View(_bb, new Utf8View()); } + public static Utf8View GetRootAsUtf8View(ByteBuffer _bb, Utf8View obj) { return (obj.__assign(_bb.GetInt(_bb.Position) + _bb.Position, _bb)); } + public void __init(int _i, ByteBuffer _bb) { __p = new Table(_i, _bb); } + public Utf8View __assign(int _i, ByteBuffer _bb) { __init(_i, _bb); return this; } + + + public static void StartUtf8View(FlatBufferBuilder builder) { builder.StartTable(0); } + public static Offset EndUtf8View(FlatBufferBuilder builder) { + int o = builder.EndTable(); + return new Offset(o); + } +} + + +static internal class Utf8ViewVerify +{ + static public bool Verify(Google.FlatBuffers.Verifier verifier, uint tablePos) + { + return verifier.VerifyTableStart(tablePos) + && verifier.VerifyTableEnd(tablePos); + } +} + +} diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs b/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs index d3115da52cc6c..eb7349a570786 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs @@ -191,9 +191,7 @@ private List BuildArrays( Field field = schema.GetFieldByIndex(schemaFieldIndex++); Flatbuf.FieldNode fieldNode = recordBatchEnumerator.CurrentNode; - ArrayData arrayData = field.DataType.IsFixedPrimitive() - ? LoadPrimitiveField(version, ref recordBatchEnumerator, field, in fieldNode, messageBuffer, bufferCreator) - : LoadVariableField(version, ref recordBatchEnumerator, field, in fieldNode, messageBuffer, bufferCreator); + ArrayData arrayData = LoadField(version, ref recordBatchEnumerator, field, in fieldNode, messageBuffer, bufferCreator); arrays.Add(ArrowArrayFactory.BuildArray(arrayData)); } while (recordBatchEnumerator.MoveNextNode()); @@ -229,7 +227,7 @@ private IBufferCreator GetBufferCreator(BodyCompression? 
compression) return new DecompressingBufferCreator(decompressor, _allocator); } - private ArrayData LoadPrimitiveField( + private ArrayData LoadField( MetadataVersion version, ref RecordBatchEnumerator recordBatchEnumerator, Field field, @@ -276,6 +274,16 @@ private ArrayData LoadPrimitiveField( case ArrowTypeId.FixedSizeList: buffers = 1; break; + case ArrowTypeId.String: + case ArrowTypeId.Binary: + case ArrowTypeId.ListView: + buffers = 3; + break; + case ArrowTypeId.StringView: + case ArrowTypeId.BinaryView: + buffers = checked((int)(2 + recordBatchEnumerator.CurrentVariadicCount)); + recordBatchEnumerator.MoveNextVariadicCount(); + break; default: buffers = 2; break; @@ -300,54 +308,6 @@ private ArrayData LoadPrimitiveField( return new ArrayData(field.DataType, fieldLength, fieldNullCount, 0, arrowBuff, children, dictionary?.Data); } - private ArrayData LoadVariableField( - MetadataVersion version, - ref RecordBatchEnumerator recordBatchEnumerator, - Field field, - in Flatbuf.FieldNode fieldNode, - ByteBuffer bodyData, - IBufferCreator bufferCreator) - { - - ArrowBuffer nullArrowBuffer = BuildArrowBuffer(bodyData, recordBatchEnumerator.CurrentBuffer, bufferCreator); - if (!recordBatchEnumerator.MoveNextBuffer()) - { - throw new Exception("Unable to move to the next buffer."); - } - ArrowBuffer offsetArrowBuffer = BuildArrowBuffer(bodyData, recordBatchEnumerator.CurrentBuffer, bufferCreator); - if (!recordBatchEnumerator.MoveNextBuffer()) - { - throw new Exception("Unable to move to the next buffer."); - } - ArrowBuffer valueArrowBuffer = BuildArrowBuffer(bodyData, recordBatchEnumerator.CurrentBuffer, bufferCreator); - recordBatchEnumerator.MoveNextBuffer(); - - int fieldLength = (int)fieldNode.Length; - int fieldNullCount = (int)fieldNode.NullCount; - - if (fieldLength < 0) - { - throw new InvalidDataException("Field length must be >= 0"); // TODO: Localize exception message - } - - if (fieldNullCount < 0) - { - throw new InvalidDataException("Null count length must be >= 0"); //TODO: Localize exception message - } - - ArrowBuffer[] arrowBuff = new[] { nullArrowBuffer, offsetArrowBuffer, valueArrowBuffer }; - ArrayData[] children = GetChildren(version, ref recordBatchEnumerator, field, bodyData, bufferCreator); - - IArrowArray dictionary = null; - if (field.DataType.TypeId == ArrowTypeId.Dictionary) - { - long id = DictionaryMemo.GetId(field); - dictionary = DictionaryMemo.GetDictionary(id); - } - - return new ArrayData(field.DataType, fieldLength, fieldNullCount, 0, arrowBuff, children, dictionary?.Data); - } - private ArrayData[] GetChildren( MetadataVersion version, ref RecordBatchEnumerator recordBatchEnumerator, @@ -365,11 +325,7 @@ private ArrayData[] GetChildren( Flatbuf.FieldNode childFieldNode = recordBatchEnumerator.CurrentNode; Field childField = type.Fields[index]; - ArrayData child = childField.DataType.IsFixedPrimitive() - ? 
LoadPrimitiveField(version, ref recordBatchEnumerator, childField, in childFieldNode, bodyData, bufferCreator) - : LoadVariableField(version, ref recordBatchEnumerator, childField, in childFieldNode, bodyData, bufferCreator); - - children[index] = child; + children[index] = LoadField(version, ref recordBatchEnumerator, childField, in childFieldNode, bodyData, bufferCreator); } return children; } @@ -394,11 +350,14 @@ internal struct RecordBatchEnumerator private Flatbuf.RecordBatch RecordBatch { get; } internal int CurrentBufferIndex { get; private set; } internal int CurrentNodeIndex { get; private set; } + internal int CurrentVariadicCountIndex { get; private set; } internal Flatbuf.Buffer CurrentBuffer => RecordBatch.Buffers(CurrentBufferIndex).GetValueOrDefault(); internal Flatbuf.FieldNode CurrentNode => RecordBatch.Nodes(CurrentNodeIndex).GetValueOrDefault(); + internal long CurrentVariadicCount => RecordBatch.VariadicBufferCounts(CurrentVariadicCountIndex); + internal bool MoveNextBuffer() { return ++CurrentBufferIndex < RecordBatch.BuffersLength; @@ -409,11 +368,17 @@ internal bool MoveNextNode() return ++CurrentNodeIndex < RecordBatch.NodesLength; } + internal bool MoveNextVariadicCount() + { + return ++CurrentVariadicCountIndex < RecordBatch.VariadicBufferCountsLength; + } + internal RecordBatchEnumerator(in Flatbuf.RecordBatch recordBatch) { RecordBatch = recordBatch; CurrentBufferIndex = 0; CurrentNodeIndex = 0; + CurrentVariadicCountIndex = 0; } } } diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs index 5f490019b2133..07d1dcfdb171d 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs @@ -54,9 +54,12 @@ internal class ArrowRecordBatchFlatBufferBuilder : IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, @@ -81,6 +84,7 @@ public Buffer(ArrowBuffer buffer, int offset) public IReadOnlyList Buffers => _buffers; + public List VariadicCounts { get; private set; } public int TotalLength { get; private set; } public ArrowRecordBatchFlatBufferBuilder() @@ -121,6 +125,15 @@ public void Visit(ListArray array) array.Values.Accept(this); } + public void Visit(ListViewArray array) + { + _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); + _buffers.Add(CreateBuffer(array.ValueOffsetsBuffer)); + _buffers.Add(CreateBuffer(array.SizesBuffer)); + + array.Values.Accept(this); + } + public void Visit(FixedSizeListArray array) { _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); @@ -130,6 +143,8 @@ public void Visit(FixedSizeListArray array) public void Visit(StringArray array) => Visit(array as BinaryArray); + public void Visit(StringViewArray array) => Visit(array as BinaryViewArray); + public void Visit(BinaryArray array) { _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); @@ -137,6 +152,18 @@ public void Visit(BinaryArray array) _buffers.Add(CreateBuffer(array.ValueBuffer)); } + public void Visit(BinaryViewArray array) + { + _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); + _buffers.Add(CreateBuffer(array.ViewsBuffer)); + for (int i = 0; i < array.DataBufferCount; i++) + { + _buffers.Add(CreateBuffer(array.DataBuffer(i))); + } + VariadicCounts = VariadicCounts ?? 
new List(); + VariadicCounts.Add(array.DataBufferCount); + } + public void Visit(FixedSizeBinaryArray array) { _buffers.Add(CreateBuffer(array.NullBitmapBuffer)); @@ -328,7 +355,7 @@ private protected void WriteRecordBatchInternal(RecordBatch recordBatch) HasWrittenDictionaryBatch = true; } - (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset) = + (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset, VectorOffset variadicCountsOffset) = PreparingWritingRecordBatch(recordBatch); VectorOffset buffersVectorOffset = Builder.EndVector(); @@ -339,7 +366,9 @@ private protected void WriteRecordBatchInternal(RecordBatch recordBatch) Offset recordBatchOffset = Flatbuf.RecordBatch.CreateRecordBatch(Builder, recordBatch.Length, fieldNodesVectorOffset, - buffersVectorOffset); + buffersVectorOffset, + default, + variadicCountsOffset); long metadataLength = WriteMessage(Flatbuf.MessageHeader.RecordBatch, recordBatchOffset, recordBatchBuilder.TotalLength); @@ -367,7 +396,7 @@ private protected async Task WriteRecordBatchInternalAsync(RecordBatch recordBat HasWrittenDictionaryBatch = true; } - (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset) = + (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset, VectorOffset variadicCountsOffset) = PreparingWritingRecordBatch(recordBatch); VectorOffset buffersVectorOffset = Builder.EndVector(); @@ -378,7 +407,9 @@ private protected async Task WriteRecordBatchInternalAsync(RecordBatch recordBat Offset recordBatchOffset = Flatbuf.RecordBatch.CreateRecordBatch(Builder, recordBatch.Length, fieldNodesVectorOffset, - buffersVectorOffset); + buffersVectorOffset, + default, + variadicCountsOffset); long metadataLength = await WriteMessageAsync(Flatbuf.MessageHeader.RecordBatch, recordBatchOffset, recordBatchBuilder.TotalLength, @@ -451,12 +482,12 @@ private async ValueTask WriteBufferDataAsync(IReadOnlyList PreparingWritingRecordBatch(RecordBatch recordBatch) + private Tuple PreparingWritingRecordBatch(RecordBatch recordBatch) { return PreparingWritingRecordBatch(recordBatch.Schema.FieldsList, recordBatch.ArrayList); } - private Tuple PreparingWritingRecordBatch(IReadOnlyList fields, IReadOnlyList arrays) + private Tuple PreparingWritingRecordBatch(IReadOnlyList fields, IReadOnlyList arrays) { Builder.Clear(); @@ -483,6 +514,12 @@ private Tuple PreparingWritingR fieldArray.Accept(recordBatchBuilder); } + VectorOffset variadicCountOffset = default; + if (recordBatchBuilder.VariadicCounts != null) + { + variadicCountOffset = Flatbuf.RecordBatch.CreateVariadicCountsVectorBlock(Builder, recordBatchBuilder.VariadicCounts.ToArray()); + } + IReadOnlyList buffers = recordBatchBuilder.Buffers; Flatbuf.RecordBatch.StartBuffersVector(Builder, buffers.Count); @@ -494,7 +531,7 @@ private Tuple PreparingWritingR buffers[i].Offset, buffers[i].DataBuffer.Length); } - return Tuple.Create(recordBatchBuilder, fieldNodesVectorOffset); + return Tuple.Create(recordBatchBuilder, fieldNodesVectorOffset, variadicCountOffset); } private protected virtual void StartingWritingDictionary() @@ -561,7 +598,7 @@ private protected async Task WriteDictionaryAsync(long id, IArrowType valueType, var arrays = new List { dictionary }; - (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset) = + (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset, VectorOffset variadicCountsOffset) = 
PreparingWritingRecordBatch(fields, arrays); VectorOffset buffersVectorOffset = Builder.EndVector(); @@ -569,7 +606,9 @@ private protected async Task WriteDictionaryAsync(long id, IArrowType valueType, // Serialize record batch Offset recordBatchOffset = Flatbuf.RecordBatch.CreateRecordBatch(Builder, dictionary.Length, fieldNodesVectorOffset, - buffersVectorOffset); + buffersVectorOffset, + default, + variadicCountsOffset); // TODO: Support delta. Offset dictionaryBatchOffset = Flatbuf.DictionaryBatch.CreateDictionaryBatch(Builder, id, recordBatchOffset, false); diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs b/csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs index 84ff4f9cc7202..473e18968f8cb 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs @@ -50,9 +50,13 @@ class TypeVisitor : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, +#if NET5_0_OR_GREATER + IArrowTypeVisitor, +#endif IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -60,8 +64,10 @@ class TypeVisitor : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -106,6 +112,14 @@ public void Visit(BinaryType type) Flatbuf.Binary.EndBinary(Builder)); } + public void Visit(BinaryViewType type) + { + Flatbuf.BinaryView.StartBinaryView(Builder); + Offset offset = Flatbuf.BinaryView.EndBinaryView(Builder); + Result = FieldType.Build( + Flatbuf.Type.BinaryView, offset); + } + public void Visit(ListType type) { Flatbuf.List.StartList(Builder); @@ -114,6 +128,14 @@ public void Visit(ListType type) Flatbuf.List.EndList(Builder)); } + public void Visit(ListViewType type) + { + Flatbuf.ListView.StartListView(Builder); + Result = FieldType.Build( + Flatbuf.Type.ListView, + Flatbuf.ListView.EndListView(Builder)); + } + public void Visit(FixedSizeListType type) { Result = FieldType.Build( @@ -136,6 +158,14 @@ public void Visit(StringType type) Flatbuf.Type.Utf8, offset); } + public void Visit(StringViewType type) + { + Flatbuf.Utf8View.StartUtf8View(Builder); + Offset offset = Flatbuf.Utf8View.EndUtf8View(Builder); + Result = FieldType.Build( + Flatbuf.Type.Utf8View, offset); + } + public void Visit(TimestampType type) { StringOffset timezoneStringOffset = default; @@ -169,6 +199,15 @@ public void Visit(Time32Type type) Flatbuf.Time.CreateTime(Builder, ToFlatBuffer(type.Unit))); } +#if NET5_0_OR_GREATER + public void Visit(HalfFloatType type) + { + Result = FieldType.Build( + Flatbuf.Type.FloatingPoint, + Flatbuf.FloatingPoint.CreateFloatingPoint(Builder, Precision.HALF)); + } +#endif + public void Visit(FloatType type) { Result = FieldType.Build( diff --git a/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs b/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs index 633554fc53261..0e6f330aef091 100644 --- a/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs +++ b/csharp/src/Apache.Arrow/Ipc/MessageSerializer.cs @@ -184,17 +184,27 @@ private static Types.IArrowType GetFieldArrowType(Flatbuf.Field field, Field[] c return Types.IntervalType.FromIntervalUnit(intervalMetadata.Unit.ToArrow()); case Flatbuf.Type.Utf8: return Types.StringType.Default; + case Flatbuf.Type.Utf8View: + return Types.StringViewType.Default; case Flatbuf.Type.FixedSizeBinary: Flatbuf.FixedSizeBinary fixedSizeBinaryMetadata = 
field.Type().Value; return new Types.FixedSizeBinaryType(fixedSizeBinaryMetadata.ByteWidth); case Flatbuf.Type.Binary: return Types.BinaryType.Default; + case Flatbuf.Type.BinaryView: + return Types.BinaryViewType.Default; case Flatbuf.Type.List: if (childFields == null || childFields.Length != 1) { throw new InvalidDataException($"List type must have exactly one child."); } return new Types.ListType(childFields[0]); + case Flatbuf.Type.ListView: + if (childFields == null || childFields.Length != 1) + { + throw new InvalidDataException($"List view type must have exactly one child."); + } + return new Types.ListViewType(childFields[0]); case Flatbuf.Type.FixedSizeList: if (childFields == null || childFields.Length != 1) { diff --git a/csharp/src/Apache.Arrow/Scalars/BinaryView.cs b/csharp/src/Apache.Arrow/Scalars/BinaryView.cs new file mode 100644 index 0000000000000..eaba89c7a3a8e --- /dev/null +++ b/csharp/src/Apache.Arrow/Scalars/BinaryView.cs @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace Apache.Arrow.Scalars +{ + [StructLayout(LayoutKind.Explicit)] + public unsafe struct BinaryView : IEquatable + { + public const int PrefixLength = 4; + public const int MaxInlineLength = 12; + + [FieldOffset(0)] + public readonly int Length; + + [FieldOffset(4)] + internal readonly int _prefix; + + [FieldOffset(8)] + internal readonly int _bufferIndex; + + [FieldOffset(12)] + internal readonly int _bufferOffset; + + [FieldOffset(4)] + internal fixed byte _inline[MaxInlineLength]; + + public unsafe BinaryView(ReadOnlySpan inline) : this() + { + if (inline.Length > MaxInlineLength) + { + throw new ArgumentException("invalid inline data length", nameof(inline)); + } + + Length = inline.Length; + fixed (byte* dest = _inline) + fixed (byte* src = inline) + { + Buffer.MemoryCopy(src, dest, MaxInlineLength, inline.Length); + } + } + + public BinaryView(int length, ReadOnlySpan prefix, int bufferIndex, int bufferOffset) + { + if (length < MaxInlineLength) + { + throw new ArgumentException("invalid length", nameof(length)); + } + if (prefix.Length != PrefixLength) + { + throw new ArgumentException("invalid prefix length", nameof(prefix)); + } + + Length = length; + _bufferIndex = bufferIndex; + _bufferOffset = bufferOffset; + _prefix = prefix.CastTo()[0]; + } + + private BinaryView(int length, int prefix, int bufferIndex, int offset) + { + Length = length; + _prefix = prefix; + _bufferIndex = bufferIndex; + _bufferOffset = offset; + } + + public bool IsInline => Length <= MaxInlineLength; + +#if NET5_0_OR_GREATER + public ReadOnlySpan Bytes => MemoryMarshal.CreateReadOnlySpan(ref Unsafe.AsRef(_inline[0]), IsInline ? 
Length : PrefixLength); +#else + public unsafe ReadOnlySpan Bytes => new ReadOnlySpan(Unsafe.AsPointer(ref _inline[0]), IsInline ? Length : PrefixLength); +#endif + + public int BufferIndex => IsInline ? -1 : _bufferIndex; + + public int BufferOffset => IsInline ? -1 : _bufferOffset; + + public override int GetHashCode() => Length ^ _prefix ^ _bufferIndex ^ _bufferOffset; + + public override bool Equals(object obj) + { + BinaryView? other = obj as BinaryView?; + return other != null && Equals(other.Value); + } + + public bool Equals(BinaryView other) => + Length == other.Length && _prefix == other._prefix && _bufferIndex == other._bufferIndex && _bufferOffset == other._bufferOffset; + + internal BinaryView AdjustBufferIndex(int bufferOffset) + { + return new BinaryView(Length, _prefix, _bufferIndex + bufferOffset, _bufferOffset); + } + } +} diff --git a/csharp/src/Apache.Arrow/Types/BinaryViewType.cs b/csharp/src/Apache.Arrow/Types/BinaryViewType.cs new file mode 100644 index 0000000000000..f5cfc034dc967 --- /dev/null +++ b/csharp/src/Apache.Arrow/Types/BinaryViewType.cs @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +namespace Apache.Arrow.Types +{ + public class BinaryViewType: ArrowType + { + public static readonly BinaryViewType Default = new BinaryViewType(); + + public override ArrowTypeId TypeId => ArrowTypeId.BinaryView; + public override string Name => "binaryview"; + + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); + } +} diff --git a/csharp/src/Apache.Arrow/Types/IArrowType.cs b/csharp/src/Apache.Arrow/Types/IArrowType.cs index 5e107813be828..cf520391fe1e6 100644 --- a/csharp/src/Apache.Arrow/Types/IArrowType.cs +++ b/csharp/src/Apache.Arrow/Types/IArrowType.cs @@ -50,6 +50,9 @@ public enum ArrowTypeId FixedSizeList, Duration, RecordBatch, + BinaryView, + StringView, + ListView, } public interface IArrowType diff --git a/csharp/src/Apache.Arrow/Types/ListViewType.cs b/csharp/src/Apache.Arrow/Types/ListViewType.cs new file mode 100644 index 0000000000000..ecf745723c4ae --- /dev/null +++ b/csharp/src/Apache.Arrow/Types/ListViewType.cs @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +namespace Apache.Arrow.Types +{ + public sealed class ListViewType : NestedType + { + public override ArrowTypeId TypeId => ArrowTypeId.ListView; + public override string Name => "listview"; + + public Field ValueField => Fields[0]; + + public IArrowType ValueDataType => Fields[0].DataType; + + public ListViewType(Field valueField) + : base(valueField) { } + + public ListViewType(IArrowType valueDataType) + : this(new Field("item", valueDataType, true)) { } + + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); + } +} diff --git a/csharp/src/Apache.Arrow/Types/StringViewType.cs b/csharp/src/Apache.Arrow/Types/StringViewType.cs new file mode 100644 index 0000000000000..0c539a56b03b5 --- /dev/null +++ b/csharp/src/Apache.Arrow/Types/StringViewType.cs @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +namespace Apache.Arrow.Types +{ + public sealed class StringViewType : ArrowType + { + public static StringViewType Default = new StringViewType(); + + public override ArrowTypeId TypeId => ArrowTypeId.StringView; + public override string Name => "utf8view"; + + public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor); + } +} diff --git a/csharp/test/Apache.Arrow.Benchmarks/ArrowWriterBenchmark.cs b/csharp/test/Apache.Arrow.Benchmarks/ArrowWriterBenchmark.cs index c791c9969356a..f35c2a5d78d79 100644 --- a/csharp/test/Apache.Arrow.Benchmarks/ArrowWriterBenchmark.cs +++ b/csharp/test/Apache.Arrow.Benchmarks/ArrowWriterBenchmark.cs @@ -38,7 +38,7 @@ public class ArrowWriterBenchmark [GlobalSetup] public void GlobalSetup() { - _batch = TestData.CreateSampleRecordBatch(BatchLength, ColumnSetCount, false); + _batch = TestData.CreateSampleRecordBatch(BatchLength, ColumnSetCount); _memoryStream = new MemoryStream(); } diff --git a/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs b/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs index f3fe73588a7bb..31a5676f01315 100644 --- a/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs +++ b/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs @@ -21,6 +21,7 @@ using System.Numerics; using System.Text; using System.Text.Json; +using System.Text.Json.Nodes; using System.Text.Json.Serialization; using System.Threading.Tasks; using Apache.Arrow.Arrays; @@ -175,7 +176,9 @@ private static IArrowType ToArrowType(JsonArrowType type, Field[] children) "floatingpoint" => ToFloatingPointArrowType(type), "decimal" => ToDecimalArrowType(type), "binary" => BinaryType.Default, + "binaryview" => BinaryViewType.Default, "utf8" => StringType.Default, + "utf8view" => StringViewType.Default, "fixedsizebinary" => new FixedSizeBinaryType(type.ByteWidth), "date" => ToDateArrowType(type), "time" => ToTimeArrowType(type), @@ -184,6 +187,7 @@ private static IArrowType ToArrowType(JsonArrowType type, Field[] children) "interval_mdn" => ToIntervalArrowType(type), "timestamp" => ToTimestampArrowType(type), "list" => ToListArrowType(type, children), + "listview" => ToListViewArrowType(type, children), "fixedsizelist" => ToFixedSizeListArrowType(type, children), "struct" => ToStructArrowType(type, children), "union" => ToUnionArrowType(type, children), @@ -294,6 +298,11 @@ private static IArrowType ToListArrowType(JsonArrowType type, Field[] children) return new ListType(children[0]); } + private static IArrowType ToListViewArrowType(JsonArrowType type, Field[] children) + { + return new ListViewType(children[0]); + } + private static IArrowType ToFixedSizeListArrowType(JsonArrowType type, Field[] children) { return new FixedSizeListType(children[0], type.ListSize); @@ -451,9 +460,12 @@ private class ArrayCreator : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -652,6 +664,38 @@ public void Visit(StringType type) Array = new StringArray(JsonFieldData.Count, offsetBuffer, valueBuffer, validityBuffer, nullCount); } + public void Visit(StringViewType type) + { + ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); + + // ArrowBuffer viewsBuffer = GetViewsBuffer(); + ArrowBuffer viewsBuffer = ArrowBuffer.Empty; + if (JsonFieldData.Views != null) + { + ArrowBuffer.Builder viewBuilder = new ArrowBuffer.Builder(JsonFieldData.Views.Count); + foreach 
(JsonView jsonView in JsonFieldData.Views) + { + BinaryView view = (jsonView.BufferIndex == null) ? + new BinaryView(Encoding.UTF8.GetBytes(jsonView.Inlined)) : + new BinaryView(jsonView.Size, Convert.FromHexString(jsonView.PrefixHex), jsonView.BufferIndex.Value, jsonView.Offset.Value); + viewBuilder.Append(view); + } + viewsBuffer = viewBuilder.Build(); + } + + int bufferCount = JsonFieldData.VariadicDataBuffers?.Count ?? 0; + ArrowBuffer[] buffers = new ArrowBuffer[2 + bufferCount]; + buffers[0] = validityBuffer; + buffers[1] = viewsBuffer; + for (int i = 0; i < bufferCount; i++) + { + buffers[i + 2] = new ArrowBuffer(Convert.FromHexString(JsonFieldData.VariadicDataBuffers[i])).Clone(); + } + + ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0, buffers); + Array = new StringViewArray(arrayData); + } + public void Visit(BinaryType type) { ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); @@ -671,6 +715,38 @@ public void Visit(BinaryType type) Array = new BinaryArray(arrayData); } + public void Visit(BinaryViewType type) + { + ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); + + // ArrowBuffer viewsBuffer = GetViewsBuffer(); + ArrowBuffer viewsBuffer = ArrowBuffer.Empty; + if (JsonFieldData.Views != null) + { + ArrowBuffer.Builder viewBuilder = new ArrowBuffer.Builder(JsonFieldData.Views.Count); + foreach (JsonView jsonView in JsonFieldData.Views) + { + BinaryView view = (jsonView.BufferIndex == null) ? + new BinaryView(Convert.FromHexString(jsonView.Inlined)) : + new BinaryView(jsonView.Size, Convert.FromHexString(jsonView.PrefixHex), jsonView.BufferIndex.Value, jsonView.Offset.Value); + viewBuilder.Append(view); + } + viewsBuffer = viewBuilder.Build(); + } + + int bufferCount = JsonFieldData.VariadicDataBuffers?.Count ?? 
0; + ArrowBuffer[] buffers = new ArrowBuffer[2 + bufferCount]; + buffers[0] = validityBuffer; + buffers[1] = viewsBuffer; + for (int i = 0; i < bufferCount; i++) + { + buffers[i + 2] = new ArrowBuffer(Convert.FromHexString(JsonFieldData.VariadicDataBuffers[i])).Clone(); + } + + ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0, buffers); + Array = new BinaryViewArray(arrayData); + } + public void Visit(FixedSizeBinaryType type) { ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); @@ -704,6 +780,22 @@ public void Visit(ListType type) Array = new ListArray(arrayData); } + public void Visit(ListViewType type) + { + ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); + ArrowBuffer offsetBuffer = GetOffsetBuffer(); + ArrowBuffer sizeBuffer = GetSizeBuffer(); + + var data = JsonFieldData; + JsonFieldData = data.Children[0]; + type.ValueDataType.Accept(this); + JsonFieldData = data; + + ArrayData arrayData = new ArrayData(type, JsonFieldData.Count, nullCount, 0, + new[] { validityBuffer, offsetBuffer, sizeBuffer }, new[] { Array.Data }); + Array = new ListViewArray(arrayData); + } + public void Visit(FixedSizeListType type) { ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); @@ -878,11 +970,18 @@ private void GenerateArray(Func valueOffsets = new ArrowBuffer.Builder(JsonFieldData.Offset.Length); - valueOffsets.AppendRange(JsonFieldData.Offset); + ArrowBuffer.Builder valueOffsets = new ArrowBuffer.Builder(JsonFieldData.Offset.Count); + valueOffsets.AppendRange(JsonFieldData.IntOffset); return valueOffsets.Build(default); } + private ArrowBuffer GetSizeBuffer() + { + ArrowBuffer.Builder valueSizes = new ArrowBuffer.Builder(JsonFieldData.Size.Count); + valueSizes.AppendRange(JsonFieldData.IntSize); + return valueSizes.Build(default); + } + private ArrowBuffer GetTypeIdBuffer() { ArrowBuffer.Builder typeIds = new ArrowBuffer.Builder(JsonFieldData.TypeId.Length); @@ -920,10 +1019,61 @@ public class JsonFieldData public string Name { get; set; } public int Count { get; set; } public bool[] Validity { get; set; } - public int[] Offset { get; set; } + public JsonArray Offset { get; set; } + + [JsonPropertyName("SIZE")] + public JsonArray Size { get; set; } public int[] TypeId { get; set; } public JsonElement Data { get; set; } public List Children { get; set; } + + [JsonPropertyName("VIEWS")] + public List Views { get; set; } + + [JsonPropertyName("VARIADIC_DATA_BUFFERS")] + public List VariadicDataBuffers { get; set; } + + [JsonIgnore] + public IEnumerable IntOffset + { + get { return Offset.Select(GetInt); } + } + + [JsonIgnore] + public IEnumerable IntSize + { + get { return Size.Select(GetInt); } + } + + static int GetInt(JsonNode node) + { + try + { + return node.GetValue(); + } + catch + { + return int.Parse(node.GetValue()); + } + } + } + + public class JsonView + { + [JsonPropertyName("SIZE")] + public int Size { get; set; } + + [JsonPropertyName("INLINED")] + public string Inlined { get; set; } + + [JsonPropertyName("PREFIX_HEX")] + public string PrefixHex { get; set; } + + [JsonPropertyName("BUFFER_INDEX")] + public int? BufferIndex { get; set; } + + [JsonPropertyName("OFFSET")] + public int? 
Offset { get; set; } } internal sealed class ValidityConverter : JsonConverter diff --git a/csharp/test/Apache.Arrow.IntegrationTest/Properties/launchSettings.json b/csharp/test/Apache.Arrow.IntegrationTest/Properties/launchSettings.json new file mode 100644 index 0000000000000..46bdeff290e17 --- /dev/null +++ b/csharp/test/Apache.Arrow.IntegrationTest/Properties/launchSettings.json @@ -0,0 +1,8 @@ +{ + "profiles": { + "Apache.Arrow.IntegrationTest": { + "commandName": "Project", + "commandLineArgs": "--mode validate -j C:\\Users\\curt\\AppData\\Local\\Temp\\arrow-integration-9_cov7dz\\generated_binary_view.json -a C:\\Users\\curt\\AppData\\Local\\Temp\\tmpxicbzqpn\\460a151e_generated_binary_view.json_as_file" + } + } +} \ No newline at end of file diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs index 137dc16d473a4..25ef289f0dc25 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs @@ -64,13 +64,16 @@ private static IEnumerable, IArrowArray>> GenerateTestDa FloatType.Default, DoubleType.Default, BinaryType.Default, + BinaryViewType.Default, StringType.Default, + StringViewType.Default, Date32Type.Default, Date64Type.Default, TimestampType.Default, new Decimal128Type(14, 10), new Decimal256Type(14,10), new ListType(Int64Type.Default), + new ListViewType(Int64Type.Default), new StructType(new List{ new Field.Builder().Name("Strings").DataType(StringType.Default).Nullable(true).Build(), new Field.Builder().Name("Ints").DataType(Int32Type.Default).Nullable(true).Build() @@ -122,7 +125,9 @@ private class TestDataGenerator : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -131,6 +136,7 @@ private class TestDataGenerator : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, @@ -368,6 +374,34 @@ public void Visit(BinaryType type) ExpectedArray = resultBuilder.Build(); } + public void Visit(BinaryViewType type) + { + BinaryViewArray.Builder resultBuilder = new BinaryViewArray.Builder().Reserve(_baseDataTotalElementCount); + + for (int i = 0; i < _baseDataListCount; i++) + { + List dataList = _baseData[i]; + BinaryViewArray.Builder builder = new BinaryViewArray.Builder().Reserve(dataList.Count); + + foreach (byte? value in dataList) + { + if (value.HasValue) + { + builder.Append(value.Value); + resultBuilder.Append(value.Value); + } + else + { + builder.AppendNull(); + resultBuilder.AppendNull(); + } + } + TestTargetArrayList.Add(builder.Build()); + } + + ExpectedArray = resultBuilder.Build(); + } + public void Visit(StringType type) { StringArray.Builder resultBuilder = new StringArray.Builder().Reserve(_baseDataTotalElementCount); @@ -388,6 +422,26 @@ public void Visit(StringType type) ExpectedArray = resultBuilder.Build(); } + public void Visit(StringViewType type) + { + StringViewArray.Builder resultBuilder = new StringViewArray.Builder().Reserve(_baseDataTotalElementCount); + + for (int i = 0; i < _baseDataListCount; i++) + { + List dataList = _baseData[i]; + StringViewArray.Builder builder = new StringViewArray.Builder().Reserve(dataList.Count); + + foreach (string value in dataList.Select(_ => _.ToString() ?? 
null)) + { + builder.Append(value); + resultBuilder.Append(value); + } + TestTargetArrayList.Add(builder.Build()); + } + + ExpectedArray = resultBuilder.Build(); + } + public void Visit(ListType type) { ListArray.Builder resultBuilder = new ListArray.Builder(type.ValueDataType).Reserve(_baseDataTotalElementCount); @@ -423,6 +477,41 @@ public void Visit(ListType type) ExpectedArray = resultBuilder.Build(); } + public void Visit(ListViewType type) + { + ListViewArray.Builder resultBuilder = new ListViewArray.Builder(type.ValueDataType).Reserve(_baseDataTotalElementCount); + Int64Array.Builder resultValueBuilder = (Int64Array.Builder)resultBuilder.ValueBuilder.Reserve(_baseDataTotalElementCount); + + for (int i = 0; i < _baseDataListCount; i++) + { + List dataList = _baseData[i]; + + ListViewArray.Builder builder = new ListViewArray.Builder(type.ValueField).Reserve(dataList.Count); + Int64Array.Builder valueBuilder = (Int64Array.Builder)builder.ValueBuilder.Reserve(dataList.Count); + + foreach (long? value in dataList) + { + if (value.HasValue) + { + builder.Append(); + resultBuilder.Append(); + + valueBuilder.Append(value.Value); + resultValueBuilder.Append(value.Value); + } + else + { + builder.AppendNull(); + resultBuilder.AppendNull(); + } + } + + TestTargetArrayList.Add(builder.Build()); + } + + ExpectedArray = resultBuilder.Build(); + } + public void Visit(FixedSizeListType type) { FixedSizeListArray.Builder resultBuilder = new FixedSizeListArray.Builder(type.ValueDataType, type.ListSize).Reserve(_baseDataTotalElementCount); diff --git a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs index 2aaffe7835258..10315ff287c0b 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs @@ -20,6 +20,7 @@ using System.Threading.Tasks; using Apache.Arrow.Arrays; using Xunit; +using System.Diagnostics; namespace Apache.Arrow.Tests { @@ -90,10 +91,13 @@ private class ArrayComparer : IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, + IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, IArrowArrayVisitor, @@ -136,12 +140,15 @@ public ArrayComparer(IArrowArray expectedArray, bool strictCompare) public void Visit(DayTimeIntervalArray array) => CompareArrays(array); public void Visit(MonthDayNanosecondIntervalArray array) => CompareArrays(array); public void Visit(ListArray array) => CompareArrays(array); + public void Visit(ListViewArray array) => CompareArrays(array); public void Visit(FixedSizeListArray array) => CompareArrays(array); public void Visit(FixedSizeBinaryArray array) => CompareArrays(array); public void Visit(Decimal128Array array) => CompareArrays(array); public void Visit(Decimal256Array array) => CompareArrays(array); public void Visit(StringArray array) => CompareBinaryArrays(array); + public void Visit(StringViewArray array) => CompareVariadicArrays(array); public void Visit(BinaryArray array) => CompareBinaryArrays(array); + public void Visit(BinaryViewArray array) => CompareVariadicArrays(array); public void Visit(StructArray array) { @@ -230,6 +237,32 @@ private void CompareBinaryArrays(BinaryArray actualArray) } } + private void CompareVariadicArrays(BinaryViewArray actualArray) + where T : IArrowArray + { + Assert.IsAssignableFrom(_expectedArray); + Assert.IsAssignableFrom(actualArray); + + var expectedArray = 
(BinaryViewArray)_expectedArray; + + actualArray.Data.DataType.Accept(_arrayTypeComparer); + + Assert.Equal(expectedArray.Length, actualArray.Length); + Assert.Equal(expectedArray.NullCount, actualArray.NullCount); + Assert.Equal(expectedArray.Offset, actualArray.Offset); + + CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, actualArray.NullBitmapBuffer); + + Assert.True(expectedArray.Views.SequenceEqual(actualArray.Views)); + + for (int i = 0; i < expectedArray.Length; i++) + { + Assert.True( + expectedArray.GetBytes(i).SequenceEqual(actualArray.GetBytes(i)), + $"BinaryArray values do not match at index {i}."); + } + } + private void CompareArrays(FixedSizeBinaryArray actualArray) { Assert.IsAssignableFrom(_expectedArray); @@ -346,6 +379,34 @@ private void CompareArrays(ListArray actualArray) actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare)); } + private void CompareArrays(ListViewArray actualArray) + { + Assert.IsAssignableFrom(_expectedArray); + ListViewArray expectedArray = (ListViewArray)_expectedArray; + + actualArray.Data.DataType.Accept(_arrayTypeComparer); + + Assert.Equal(expectedArray.Length, actualArray.Length); + Assert.Equal(expectedArray.NullCount, actualArray.NullCount); + Assert.Equal(expectedArray.Offset, actualArray.Offset); + + CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, actualArray.NullBitmapBuffer); + + if (_strictCompare) + { + Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span)); + Assert.True(expectedArray.SizesBuffer.Span.SequenceEqual(actualArray.SizesBuffer.Span)); + } + else + { + int length = expectedArray.Length * sizeof(int); + Assert.True(expectedArray.ValueOffsetsBuffer.Span.Slice(0, length).SequenceEqual(actualArray.ValueOffsetsBuffer.Span.Slice(0, length))); + Assert.True(expectedArray.SizesBuffer.Span.Slice(0, length).SequenceEqual(actualArray.SizesBuffer.Span.Slice(0, length))); + } + + actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare)); + } + private void CompareArrays(FixedSizeListArray actualArray) { Assert.IsAssignableFrom(_expectedArray); diff --git a/csharp/test/Apache.Arrow.Tests/BinaryViewTests.cs b/csharp/test/Apache.Arrow.Tests/BinaryViewTests.cs new file mode 100644 index 0000000000000..eb617b4dedc75 --- /dev/null +++ b/csharp/test/Apache.Arrow.Tests/BinaryViewTests.cs @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +using System; +using Apache.Arrow.Scalars; +using Xunit; + +namespace Apache.Arrow.Tests +{ + public class BinaryViewTests + { + private static readonly byte[] empty = new byte[0]; + private static readonly byte[] oneByte = new byte[1]; + private static readonly byte[] fourBytes = new byte[] { 1, 2, 3, 4 }; + private static readonly byte[] fiveBytes = new byte[] { 5, 4, 3, 2, 1 }; + private static readonly byte[] twelveBytes = new byte[] { 1, 2, 3, 4, 8, 7, 6, 5, 9, 10, 11, 12 }; + private static readonly byte[] thirteenBytes = new byte[13]; + + [Fact] + public void Equality() + { + BinaryView one = new BinaryView(oneByte); + BinaryView four = new BinaryView(fourBytes); + BinaryView twelve = new BinaryView(twelveBytes); + BinaryView twelvePlus = new BinaryView(13, fourBytes, 0, 0); + Assert.Equal(one, one); + Assert.NotEqual(one, four); + Assert.NotEqual(four, twelve); + Assert.NotEqual(four, twelvePlus); + } + + [Fact] + public void ConstructorThrows() + { + Assert.Throws(() => new BinaryView(thirteenBytes)); + Assert.Throws(() => new BinaryView(20, empty, 0, 0)); + Assert.Throws(() => new BinaryView(20, fiveBytes, 0, 0)); + Assert.Throws(() => new BinaryView(13, thirteenBytes, 0, 0)); + Assert.Throws(() => new BinaryView(4, fourBytes, 0, 0)); + } + + [Fact] + public void ConstructInline() + { + BinaryView zero = new BinaryView(empty); + Assert.Equal(-1, zero.BufferIndex); + Assert.Equal(-1, zero.BufferOffset); + Assert.Equal(0, zero.Length); + Assert.Equal(0, zero.Bytes.Length); + + BinaryView one = new BinaryView(oneByte); + Assert.Equal(-1, one.BufferIndex); + Assert.Equal(-1, one.BufferOffset); + Assert.Equal(1, one.Length); + Assert.Equal(1, one.Bytes.Length); + Assert.Equal((byte)0, one.Bytes[0]); + + BinaryView twelve = new BinaryView(twelveBytes); + Assert.Equal(-1, one.BufferIndex); + Assert.Equal(-1, one.BufferOffset); + Assert.Equal(12, twelve.Length); + Assert.Equal(12, twelve.Bytes.Length); + Assert.Equal((byte)8, twelve.Bytes[4]); + } + + [Fact] + public void ConstructPrefix() + { + BinaryView four = new BinaryView(14, fourBytes, 2, 3); + Assert.Equal(2, four.BufferIndex); + Assert.Equal(3, four.BufferOffset); + Assert.Equal(14, four.Length); + Assert.Equal(4, four.Bytes.Length); + Assert.Equal((byte)2, four.Bytes[1]); + } + } +} diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs index 83902d8d93c70..274434e4bab09 100644 --- a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs +++ b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs @@ -741,7 +741,9 @@ public unsafe void ExportBatch() [SkippableFact] public unsafe void RoundTripTestBatch() { - RecordBatch batch1 = TestData.CreateSampleRecordBatch(4, createDictionaryArray: true); + // TODO: Enable these once this the version of pyarrow referenced during testing supports them + HashSet unsupported = new HashSet { ArrowTypeId.ListView, ArrowTypeId.BinaryView, ArrowTypeId.StringView }; + RecordBatch batch1 = TestData.CreateSampleRecordBatch(4, excludedTypes: unsupported); RecordBatch batch2 = batch1.Clone(); CArrowArray* cExportArray = CArrowArray.Create(); diff --git a/csharp/test/Apache.Arrow.Tests/TableTests.cs b/csharp/test/Apache.Arrow.Tests/TableTests.cs index d52b514e092d9..83c88265d172b 100644 --- a/csharp/test/Apache.Arrow.Tests/TableTests.cs +++ b/csharp/test/Apache.Arrow.Tests/TableTests.cs @@ -62,7 +62,11 @@ public void TestTableFromRecordBatches() Table table1 = Table.TableFromRecordBatches(recordBatch1.Schema, 
recordBatches); Assert.Equal(20, table1.RowCount); - Assert.Equal(30, table1.ColumnCount); +#if NET5_0_OR_GREATER + Assert.Equal(35, table1.ColumnCount); +#else + Assert.Equal(34, table1.ColumnCount); +#endif Assert.Equal("ChunkedArray: Length=20, DataType=list", table1.Column(0).Data.ToString()); FixedSizeBinaryType type = new FixedSizeBinaryType(17); diff --git a/csharp/test/Apache.Arrow.Tests/TestData.cs b/csharp/test/Apache.Arrow.Tests/TestData.cs index b43321abd7499..29ddef2864862 100644 --- a/csharp/test/Apache.Arrow.Tests/TestData.cs +++ b/csharp/test/Apache.Arrow.Tests/TestData.cs @@ -24,53 +24,66 @@ namespace Apache.Arrow.Tests { public static class TestData { - public static RecordBatch CreateSampleRecordBatch(int length, bool createDictionaryArray = true) + public static RecordBatch CreateSampleRecordBatch(int length, bool createDictionaryArray) { - return CreateSampleRecordBatch(length, columnSetCount: 1, createDictionaryArray); + HashSet excluded = createDictionaryArray ? null : new HashSet { ArrowTypeId.Dictionary }; + return CreateSampleRecordBatch(length, columnSetCount: 1, excluded); } - public static RecordBatch CreateSampleRecordBatch(int length, int columnSetCount, bool createAdvancedTypeArrays) + public static RecordBatch CreateSampleRecordBatch( + int length, + int columnSetCount = 1, + HashSet excludedTypes = null) { Schema.Builder builder = new Schema.Builder(); - for (int i = 0; i < columnSetCount; i++) + + void AddField(Field field) { - builder.Field(CreateField(new ListType(Int64Type.Default), i)); - builder.Field(CreateField(BooleanType.Default, i)); - builder.Field(CreateField(UInt8Type.Default, i)); - builder.Field(CreateField(Int8Type.Default, i)); - builder.Field(CreateField(UInt16Type.Default, i)); - builder.Field(CreateField(Int16Type.Default, i)); - builder.Field(CreateField(UInt32Type.Default, i)); - builder.Field(CreateField(Int32Type.Default, i)); - builder.Field(CreateField(UInt64Type.Default, i)); - builder.Field(CreateField(Int64Type.Default, i)); - builder.Field(CreateField(FloatType.Default, i)); - builder.Field(CreateField(DoubleType.Default, i)); - builder.Field(CreateField(Date32Type.Default, i)); - builder.Field(CreateField(Date64Type.Default, i)); - builder.Field(CreateField(Time32Type.Default, i)); - builder.Field(CreateField(Time64Type.Default, i)); - builder.Field(CreateField(TimestampType.Default, i)); - builder.Field(CreateField(StringType.Default, i)); - builder.Field(CreateField(new StructType(new List { CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }), i)); - builder.Field(CreateField(new Decimal128Type(10, 6), i)); - builder.Field(CreateField(new Decimal256Type(16, 8), i)); - builder.Field(CreateField(new MapType(StringType.Default, Int32Type.Default), i)); - builder.Field(CreateField(IntervalType.YearMonth, i)); - builder.Field(CreateField(IntervalType.DayTime, i)); - builder.Field(CreateField(IntervalType.MonthDayNanosecond, i)); - - if (createAdvancedTypeArrays) + if (excludedTypes == null || !excludedTypes.Contains(field.DataType.TypeId)) { - builder.Field(CreateField(new DictionaryType(Int32Type.Default, StringType.Default, false), i)); - builder.Field(CreateField(new FixedSizeBinaryType(16), i)); - builder.Field(CreateField(new FixedSizeListType(Int32Type.Default, 3), i)); - builder.Field(CreateField(new UnionType(new[] { CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }, new[] { 0, 1 }, UnionMode.Sparse), i)); - builder.Field(CreateField(new UnionType(new[] { 
CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }, new[] { 0, 1 }, UnionMode.Dense), -i)); + builder.Field(field); } + } - //builder.Field(CreateField(HalfFloatType.Default)); - //builder.Field(CreateField(StringType.Default)); + for (int i = 0; i < columnSetCount; i++) + { + AddField(CreateField(new ListType(Int64Type.Default), i)); + AddField(CreateField(new ListViewType(Int64Type.Default), i)); + AddField(CreateField(BooleanType.Default, i)); + AddField(CreateField(UInt8Type.Default, i)); + AddField(CreateField(Int8Type.Default, i)); + AddField(CreateField(UInt16Type.Default, i)); + AddField(CreateField(Int16Type.Default, i)); + AddField(CreateField(UInt32Type.Default, i)); + AddField(CreateField(Int32Type.Default, i)); + AddField(CreateField(UInt64Type.Default, i)); + AddField(CreateField(Int64Type.Default, i)); +#if NET5_0_OR_GREATER + AddField(CreateField(HalfFloatType.Default, i)); +#endif + AddField(CreateField(FloatType.Default, i)); + AddField(CreateField(DoubleType.Default, i)); + AddField(CreateField(Date32Type.Default, i)); + AddField(CreateField(Date64Type.Default, i)); + AddField(CreateField(Time32Type.Default, i)); + AddField(CreateField(Time64Type.Default, i)); + AddField(CreateField(TimestampType.Default, i)); + AddField(CreateField(StringType.Default, i)); + AddField(CreateField(StringViewType.Default, i)); + AddField(CreateField(new StructType(new List { CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }), i)); + AddField(CreateField(new Decimal128Type(10, 6), i)); + AddField(CreateField(new Decimal256Type(16, 8), i)); + AddField(CreateField(new MapType(StringType.Default, Int32Type.Default), i)); + AddField(CreateField(IntervalType.YearMonth, i)); + AddField(CreateField(IntervalType.DayTime, i)); + AddField(CreateField(IntervalType.MonthDayNanosecond, i)); + AddField(CreateField(BinaryType.Default, i)); + AddField(CreateField(BinaryViewType.Default, i)); + AddField(CreateField(new FixedSizeBinaryType(16), i)); + AddField(CreateField(new FixedSizeListType(Int32Type.Default, 3), i)); + AddField(CreateField(new UnionType(new[] { CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }, new[] { 0, 1 }, UnionMode.Sparse), i)); + AddField(CreateField(new UnionType(new[] { CreateField(StringType.Default, i), CreateField(Int32Type.Default, i) }, new[] { 0, 1 }, UnionMode.Dense), -i)); + AddField(CreateField(new DictionaryType(Int32Type.Default, StringType.Default, false), i)); } Schema schema = builder.Build(); @@ -130,16 +143,23 @@ private class ArrayCreator : IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, + IArrowTypeVisitor, + IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, IArrowTypeVisitor, +#if NET5_0_OR_GREATER + IArrowTypeVisitor, +#endif IArrowTypeVisitor { private int Length { get; } @@ -160,6 +180,9 @@ public ArrayCreator(int length) public void Visit(UInt32Type type) => GenerateArray(new UInt32Array.Builder(), x => (uint)x); public void Visit(UInt64Type type) => GenerateArray(new UInt64Array.Builder(), x => (ulong)x); public void Visit(FloatType type) => GenerateArray(new FloatArray.Builder(), x => ((float)x / Length)); +#if NET5_0_OR_GREATER + public void Visit(HalfFloatType type) => GenerateArray(new HalfFloatArray.Builder(), x => ((Half)x / (Half)Length)); +#endif public void Visit(DoubleType type) => 
GenerateArray(new DoubleArray.Builder(), x => ((double)x / Length)); public void Visit(Decimal128Type type) { @@ -277,6 +300,30 @@ public void Visit(StringType type) Array = builder.Build(); } + public void Visit(StringViewType type) + { + var str = "length=ten"; + var builder = new StringViewArray.Builder(); + + for (var i = 0; i < Length; i++) + { + switch (i % 3) + { + case 0: + builder.AppendNull(); + break; + case 1: + builder.Append(str); + break; + case 2: + builder.Append(str + str); + break; + } + } + + Array = builder.Build(); + } + public void Visit(ListType type) { var builder = new ListArray.Builder(type.ValueField).Reserve(Length); @@ -294,6 +341,23 @@ public void Visit(ListType type) Array = builder.Build(); } + public void Visit(ListViewType type) + { + var builder = new ListViewArray.Builder(type.ValueField).Reserve(Length); + + var valueBuilder = (Int64Array.Builder)builder.ValueBuilder.Reserve(Length + 1); + + for (var i = 0; i < Length; i++) + { + builder.Append(); + valueBuilder.Append(i); + } + //Add a value to check if Values.Length can exceed ListArray.Length + valueBuilder.Append(0); + + Array = builder.Build(); + } + public void Visit(FixedSizeListType type) { var builder = new FixedSizeListArray.Builder(type.ValueField, type.ListSize).Reserve(Length); @@ -411,6 +475,64 @@ public void Visit(DictionaryType type) Array = new DictionaryArray(type, indicesBuilder.Build(), valueBuilder.Build()); } + public void Visit(BinaryType type) + { + ReadOnlySpan shortData = new[] { (byte)0, (byte)1, (byte)2, (byte)3, (byte)4, (byte)5, (byte)6, (byte)7, (byte)8, (byte)9 }; + ReadOnlySpan longData = new[] + { + (byte)0, (byte)1, (byte)2, (byte)3, (byte)4, (byte)5, (byte)6, (byte)7, (byte)8, (byte)9, + (byte)10, (byte)11, (byte)12, (byte)13, (byte)14, (byte)15, (byte)16, (byte)17, (byte)18, (byte)19 + }; + var builder = new BinaryArray.Builder(); + + for (var i = 0; i < Length; i++) + { + switch (i % 3) + { + case 0: + builder.AppendNull(); + break; + case 1: + builder.Append(shortData); + break; + case 2: + builder.Append(longData); + break; + } + } + + Array = builder.Build(); + } + + public void Visit(BinaryViewType type) + { + ReadOnlySpan shortData = new[] { (byte)0, (byte)1, (byte)2, (byte)3, (byte)4, (byte)5, (byte)6, (byte)7, (byte)8, (byte)9 }; + ReadOnlySpan longData = new[] + { + (byte)0, (byte)1, (byte)2, (byte)3, (byte)4, (byte)5, (byte)6, (byte)7, (byte)8, (byte)9, + (byte)10, (byte)11, (byte)12, (byte)13, (byte)14, (byte)15, (byte)16, (byte)17, (byte)18, (byte)19 + }; + var builder = new BinaryViewArray.Builder(); + + for (var i = 0; i < Length; i++) + { + switch (i % 3) + { + case 0: + builder.AppendNull(); + break; + case 1: + builder.Append(shortData); + break; + case 2: + builder.Append(longData); + break; + } + } + + Array = builder.Build(); + } + public void Visit(FixedSizeBinaryType type) { ArrowBuffer.Builder valueBuilder = new ArrowBuffer.Builder(); diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 2bbc843836af9..230ec5b3effff 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1932,13 +1932,12 @@ def _temp_path(): .skip_tester('Rust'), generate_binary_view_case() - .skip_tester('C#') .skip_tester('Java') .skip_tester('JS') .skip_tester('Rust'), generate_list_view_case() - .skip_tester('C#') + .skip_tester('C#') # Doesn't support large list views .skip_tester('Java') .skip_tester('JS') .skip_tester('Rust'), diff --git a/docs/source/status.rst 
b/docs/source/status.rst index e860aceb76e15..03a87012342c2 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -68,9 +68,13 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Large Utf8 | ✓ | ✓ | ✓ | ✓ | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Binary View | ✓ | | ✓ | | | | | | +| Binary View | ✓ | | ✓ | | ✓ | | | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| String View | ✓ | | ✓ | | | | | | +| Large Binary View | ✓ | | ✓ | | | | | | ++-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| Utf8 View | ✓ | | ✓ | | ✓ | | | | ++-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| Large Utf8 View | ✓ | | ✓ | | | | | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ @@ -83,7 +87,7 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Large List | ✓ | ✓ | ✓ | | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| List View | ✓ | | ✓ | | | | | | +| List View | ✓ | | ✓ | | ✓ | | | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Large List View | ✓ | | ✓ | | | | | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ From bcaeaa8c2d970b81249cfba019475598e3d3109f Mon Sep 17 00:00:00 2001 From: Curt Hagenlocher Date: Wed, 27 Dec 2023 11:30:16 -0800 Subject: [PATCH 25/31] MINOR: [C#] Remove launchSettings.json (#39382) ### Rationale for this change A previous commit accidentally included a version of launchSettings.json used for local debugging. This file is not helpful to anyone. ### Are these changes tested? N/A ### Are there any user-facing changes? No. Authored-by: Curt Hagenlocher Signed-off-by: Curt Hagenlocher --- .../Properties/launchSettings.json | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 csharp/test/Apache.Arrow.IntegrationTest/Properties/launchSettings.json diff --git a/csharp/test/Apache.Arrow.IntegrationTest/Properties/launchSettings.json b/csharp/test/Apache.Arrow.IntegrationTest/Properties/launchSettings.json deleted file mode 100644 index 46bdeff290e17..0000000000000 --- a/csharp/test/Apache.Arrow.IntegrationTest/Properties/launchSettings.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "profiles": { - "Apache.Arrow.IntegrationTest": { - "commandName": "Project", - "commandLineArgs": "--mode validate -j C:\\Users\\curt\\AppData\\Local\\Temp\\arrow-integration-9_cov7dz\\generated_binary_view.json -a C:\\Users\\curt\\AppData\\Local\\Temp\\tmpxicbzqpn\\460a151e_generated_binary_view.json_as_file" - } - } -} \ No newline at end of file From 7c3480e2f028f5881242f227f42155cf833efee7 Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 29 Dec 2023 10:58:12 +0800 Subject: [PATCH 26/31] GH-39326: [C++] Flaky DatasetWriterTestFixture.MaxRowsOneWriteBackpresure test (#39379) ### Rationale for this change This patch reduce the number of open files in testing first. I've verify the test in 14.0.2, it hangs forever. ### What changes are included in this PR? Change the test file number from 100 to 20 ### Are these changes tested? Already ### Are there any user-facing changes? 
no * Closes: #39326 Authored-by: mwish Signed-off-by: mwish --- cpp/src/arrow/dataset/dataset_writer_test.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/dataset/dataset_writer_test.cc b/cpp/src/arrow/dataset/dataset_writer_test.cc index e62e779f71797..1ac0ec3f39e97 100644 --- a/cpp/src/arrow/dataset/dataset_writer_test.cc +++ b/cpp/src/arrow/dataset/dataset_writer_test.cc @@ -290,12 +290,12 @@ TEST_F(DatasetWriterTestFixture, MaxRowsOneWriteBackpresure) { write_options_.max_open_files = 2; write_options_.min_rows_per_group = kFileSizeLimit - 1; auto dataset_writer = MakeDatasetWriter(/*max_rows=*/kFileSizeLimit); - for (int i = 0; i < 20; ++i) { - dataset_writer->WriteRecordBatch(MakeBatch(kFileSizeLimit * 5), ""); + for (int i = 0; i < 5; ++i) { + dataset_writer->WriteRecordBatch(MakeBatch(kFileSizeLimit * 2), ""); } EndWriterChecked(dataset_writer.get()); std::vector expected_files; - for (int i = 0; i < 100; ++i) { + for (int i = 0; i < 10; ++i) { expected_files.emplace_back("testdir/chunk-" + std::to_string(i) + ".arrow", kFileSizeLimit * i, kFileSizeLimit); } From 8a9f877896644ef1629136e8428a2c21bce64ae3 Mon Sep 17 00:00:00 2001 From: Hyunseok Seo Date: Mon, 1 Jan 2024 22:35:58 +0900 Subject: [PATCH 27/31] GH-39051: [C++] Use Cast() instead of CastTo() for List Scalar in test (#39353) ### Rationale for this change Remove legacy code ### What changes are included in this PR? Replace the legacy scalar CastTo implementation for List Scalar in test. ### Are these changes tested? Yes. It is passed by existing test cases. ### Are there any user-facing changes? No. * Closes: #39051 Authored-by: Hyunseok Seo Signed-off-by: Sutou Kouhei --- .../compute/kernels/scalar_cast_nested.cc | 10 ++++- cpp/src/arrow/scalar_test.cc | 39 ++++++++++++------- 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 6fd449a931381..ec5291ef608a3 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -401,7 +401,7 @@ void AddTypeToTypeCast(CastFunction* func) { kernel.exec = CastFunctor::Exec; kernel.signature = KernelSignature::Make({InputType(SrcT::type_id)}, kOutputTargetType); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; - DCHECK_OK(func->AddKernel(StructType::type_id, std::move(kernel))); + DCHECK_OK(func->AddKernel(SrcT::type_id, std::move(kernel))); } template @@ -480,14 +480,18 @@ std::vector> GetNestedCasts() { auto cast_list = std::make_shared("cast_list", Type::LIST); AddCommonCasts(Type::LIST, kOutputTargetType, cast_list.get()); AddListCast(cast_list.get()); + AddListCast(cast_list.get()); AddListCast(cast_list.get()); + AddListCast(cast_list.get()); AddTypeToTypeCast, FixedSizeListType>(cast_list.get()); auto cast_large_list = std::make_shared("cast_large_list", Type::LARGE_LIST); AddCommonCasts(Type::LARGE_LIST, kOutputTargetType, cast_large_list.get()); AddListCast(cast_large_list.get()); + AddListCast(cast_large_list.get()); AddListCast(cast_large_list.get()); + AddListCast(cast_large_list.get()); AddTypeToTypeCast, FixedSizeListType>( cast_large_list.get()); @@ -503,7 +507,11 @@ std::vector> GetNestedCasts() { AddCommonCasts(Type::FIXED_SIZE_LIST, kOutputTargetType, cast_fsl.get()); AddTypeToTypeCast(cast_fsl.get()); AddTypeToTypeCast, ListType>(cast_fsl.get()); + AddTypeToTypeCast, ListViewType>(cast_fsl.get()); AddTypeToTypeCast, 
                    LargeListType>(cast_fsl.get());
+  AddTypeToTypeCast, LargeListViewType>(
+      cast_fsl.get());
+  AddTypeToTypeCast, MapType>(cast_fsl.get());

   // So is struct
   auto cast_struct = std::make_shared("cast_struct", Type::STRUCT);
diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc
index ac740f92c8527..e8b8784e7a314 100644
--- a/cpp/src/arrow/scalar_test.cc
+++ b/cpp/src/arrow/scalar_test.cc
@@ -1077,7 +1077,8 @@ std::shared_ptr MakeListType(
 template
 void CheckListCast(const ScalarType& scalar, const std::shared_ptr& to_type) {
-  EXPECT_OK_AND_ASSIGN(auto cast_scalar, scalar.CastTo(to_type));
+  EXPECT_OK_AND_ASSIGN(auto cast_scalar_datum, Cast(scalar, to_type));
+  const auto& cast_scalar = cast_scalar_datum.scalar();
   ASSERT_OK(cast_scalar->ValidateFull());
   ASSERT_EQ(*cast_scalar->type, *to_type);

@@ -1087,11 +1088,25 @@ void CheckListCast(const ScalarType& scalar, const std::shared_ptr& to
             *checked_cast(*cast_scalar).value);
 }

-void CheckInvalidListCast(const Scalar& scalar, const std::shared_ptr& to_type,
-                          const std::string& expected_message) {
-  EXPECT_RAISES_WITH_CODE_AND_MESSAGE_THAT(StatusCode::Invalid,
-                                           ::testing::HasSubstr(expected_message),
-                                           scalar.CastTo(to_type));
+template
+void CheckListCastError(const ScalarType& scalar,
+                        const std::shared_ptr& to_type) {
+  StatusCode code;
+  std::string expected_message;
+  if (scalar.type->id() == Type::FIXED_SIZE_LIST) {
+    code = StatusCode::TypeError;
+    expected_message =
+        "Size of FixedSizeList is not the same. input list: " + scalar.type->ToString() +
+        " output list: " + to_type->ToString();
+  } else {
+    code = StatusCode::Invalid;
+    expected_message =
+        "ListType can only be casted to FixedSizeListType if the lists are all the "
+        "expected size.";
+  }
+
+  EXPECT_RAISES_WITH_CODE_AND_MESSAGE_THAT(code, ::testing::HasSubstr(expected_message),
+                                           Cast(scalar, to_type));
 }

 template
@@ -1178,10 +1193,8 @@ class TestListLikeScalar : public ::testing::Test {
     CheckListCast(
         scalar, fixed_size_list(value_->type(), static_cast(value_->length())));

-    CheckInvalidListCast(scalar, fixed_size_list(value_->type(), 5),
-                         "Cannot cast " + scalar.type->ToString() + " of length " +
-                             std::to_string(value_->length()) +
-                             " to fixed size list of length 5");
+    auto invalid_cast_type = fixed_size_list(value_->type(), 5);
+    CheckListCastError(scalar, invalid_cast_type);
   }

  protected:
@@ -1238,10 +1251,8 @@ TEST(TestMapScalar, Cast) {
   CheckListCast(scalar, large_list(key_value_type));
   CheckListCast(scalar, fixed_size_list(key_value_type, 2));

-  CheckInvalidListCast(scalar, fixed_size_list(key_value_type, 5),
-                       "Cannot cast " + scalar.type->ToString() + " of length " +
-                           std::to_string(value->length()) +
-                           " to fixed size list of length 5");
+  auto invalid_cast_type = fixed_size_list(key_value_type, 5);
+  CheckListCastError(scalar, invalid_cast_type);
 }

 TEST(TestStructScalar, FieldAccess) {

From 13696304089217c7c1c9b84c497318f506eee67b Mon Sep 17 00:00:00 2001
From: Sutou Kouhei
Date: Mon, 1 Jan 2024 22:36:37 +0900
Subject: [PATCH 28/31] GH-39359: [CI][C++] Remove MinGW MINGW32 C++ job (#39376)

### Rationale for this change

MSYS2 stopped providing MINGW32 packages:

* https://github.com/msys2/MINGW-packages/pull/19517
* https://github.com/msys2/MINGW-packages/commit/f68162d5827fce41e7c2d4eb65cab6fcd8b9dd60

### What changes are included in this PR?

Remove the job.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

No.
* Closes: #39359

Authored-by: Sutou Kouhei
Signed-off-by: Sutou Kouhei
---
 .github/workflows/cpp.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml
index 2e3c2a355a884..3d4fb10b10c39 100644
--- a/.github/workflows/cpp.yml
+++ b/.github/workflows/cpp.yml
@@ -340,8 +340,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - msystem_lower: mingw32
-            msystem_upper: MINGW32
           - msystem_lower: mingw64
             msystem_upper: MINGW64
           - msystem_lower: clang64

From 4543f5d8394e221681c362f4e7c8a7268823b2cd Mon Sep 17 00:00:00 2001
From: Sutou Kouhei
Date: Mon, 1 Jan 2024 22:38:24 +0900
Subject: [PATCH 29/31] GH-39268: [C++] Don't install bundled Azure SDK for C++ with CMake 3.28+ (#39269)

### Rationale for this change

We can implement this by specifying `EXCLUDE_FROM_ALL TRUE` to
`fetchcontent_declare()`.

### What changes are included in this PR?

Specify `EXCLUDE_FROM_ALL TRUE` only with CMake 3.28+.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes.

* Closes: #39268

Authored-by: Sutou Kouhei
Signed-off-by: Sutou Kouhei
---
 cpp/cmake_modules/ThirdpartyToolchain.cmake | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index 89d046945e5fe..3f327ed64ff00 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -1015,6 +1015,10 @@ else()
 endif()

 include(FetchContent)
+set(FC_DECLARE_COMMON_OPTIONS)
+if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.28)
+  list(APPEND FC_DECLARE_COMMON_OPTIONS EXCLUDE_FROM_ALL TRUE)
+endif()

 macro(prepare_fetchcontent)
   set(BUILD_SHARED_LIBS OFF)
@@ -2146,6 +2150,9 @@ function(build_gtest)
   message(STATUS "Building gtest from source")
   set(GTEST_VENDORED TRUE)
   fetchcontent_declare(googletest
+                       # We should not specify "EXCLUDE_FROM_ALL TRUE" here.
+                       # Because we install GTest with custom path.
+                       # ${FC_DECLARE_COMMON_OPTIONS}
                        URL ${GTEST_SOURCE_URL}
                        URL_HASH "SHA256=${ARROW_GTEST_BUILD_SHA256_CHECKSUM}")
   prepare_fetchcontent()
@@ -5096,8 +5103,7 @@ function(build_azure_sdk)
   endif()
   message(STATUS "Building Azure SDK for C++ from source")
   fetchcontent_declare(azure_sdk
-                       # EXCLUDE_FROM_ALL is available since CMake 3.28
-                       # EXCLUDE_FROM_ALL TRUE
+                       ${FC_DECLARE_COMMON_OPTIONS}
                        URL ${ARROW_AZURE_SDK_URL}
                        URL_HASH "SHA256=${ARROW_AZURE_SDK_BUILD_SHA256_CHECKSUM}")
   prepare_fetchcontent()

From 3087c941699ea8485de619b8a36d98322fe20aa0 Mon Sep 17 00:00:00 2001
From: shibei
Date: Tue, 2 Jan 2024 09:23:56 +0800
Subject: [PATCH 30/31] GH-39387: [C++] Fix compile warning (#39389)

### Rationale for this change

Fix compile warning:

```bash
In file included from /workspace/arrow/cpp/src/arrow/array/array_base.h:26:
/workspace/arrow/cpp/src/arrow/array/data.h:452:19: warning: unused variable 'buffer_length' [-Wunused-variable]
    const int64_t buffer_length = buffers[i].size / static_cast(sizeof(T));
                  ^
/workspace/arrow/cpp/src/arrow/array/data.h:467:19: warning: unused variable 'buffer_length' [-Wunused-variable]
    const int64_t buffer_length = buffers[i].size / static_cast(sizeof(T));
                  ^
2 warnings generated.
```

### What changes are included in this PR?

### Are these changes tested?

### Are there any user-facing changes?
* Closes: #39387

Authored-by: shibei
Signed-off-by: Felipe Oliveira Carvalho
---
 cpp/src/arrow/array/data.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h
index f29f164d19973..edd443adc43c4 100644
--- a/cpp/src/arrow/array/data.h
+++ b/cpp/src/arrow/array/data.h
@@ -451,6 +451,7 @@ struct ARROW_EXPORT ArraySpan {
   util::span GetSpan(int i, int64_t length) const {
     const int64_t buffer_length = buffers[i].size / static_cast(sizeof(T));
     assert(i > 0 && length + offset <= buffer_length);
+    ARROW_UNUSED(buffer_length);
     return util::span(buffers[i].data_as() + this->offset, length);
   }

@@ -466,6 +467,7 @@ struct ARROW_EXPORT ArraySpan {
   util::span GetSpan(int i, int64_t length) {
     const int64_t buffer_length = buffers[i].size / static_cast(sizeof(T));
     assert(i > 0 && length + offset <= buffer_length);
+    ARROW_UNUSED(buffer_length);
     return util::span(buffers[i].mutable_data_as() + this->offset, length);
   }

From 98f677af3c281680b95093ceeab084b3e57e180a Mon Sep 17 00:00:00 2001
From: Hattonuri <53221537+Hattonuri@users.noreply.github.com>
Date: Tue, 2 Jan 2024 07:35:48 +0300
Subject: [PATCH 31/31] GH-39413: [C++][Parquet] Vectorize decode plain on FLBA (#39414)

### Rationale for this change

### What changes are included in this PR?

FLBA Decode Plain is not vectorized, so this parsing can be implemented faster:
https://godbolt.org/z/xWeb93xjW

### Are these changes tested?

Yes, covered by unit tests.

### Are there any user-facing changes?

* Closes: #39413

Authored-by: Dmitry Stasenko
Signed-off-by: mwish
---
 cpp/src/parquet/encoding.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 9ad1ee6efc12a..840efa12cc3c1 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -1080,9 +1080,7 @@ inline int DecodePlain(const uint8_t* data, int64_t data_size
     ParquetException::EofException();
   }
   for (int i = 0; i < num_values; ++i) {
-    out[i].ptr = data;
-    data += type_length;
-    data_size -= type_length;
+    out[i].ptr = data + i * type_length;
   }
   return static_cast(bytes_to_decode);
 }
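
A note on the hunk above: the speedup comes from removing the loop-carried updates of `data` and `data_size`, so every iteration computes its output pointer purely from the loop index and the compiler is free to unroll and vectorize the address arithmetic (see the godbolt link in the commit message). The sketch below is illustrative only and is not the Parquet decoder itself; `FLBASketch` and both function names are hypothetical stand-ins for `parquet::FixedLenByteArray` and the `DecodePlain` internals.

```cpp
// Minimal, self-contained sketch of the two loop shapes from the patch above.
// FLBASketch is a simplified stand-in: it only stores a pointer to the first
// byte of a fixed-length value, roughly like a fixed-length byte array view.
#include <cassert>
#include <cstdint>
#include <vector>

struct FLBASketch {
  const uint8_t* ptr = nullptr;
};

// Old shape: data/data_size are mutated every iteration, so each iteration
// depends on the previous one and the loop resists auto-vectorization.
void DecodeByAdvancingPointer(const uint8_t* data, int64_t data_size, int num_values,
                              int type_length, FLBASketch* out) {
  for (int i = 0; i < num_values; ++i) {
    out[i].ptr = data;
    data += type_length;
    data_size -= type_length;
  }
}

// New shape: each pointer is computed independently from the loop index, so
// the compiler can unroll and vectorize the address computation.
void DecodeByIndex(const uint8_t* data, int num_values, int type_length,
                   FLBASketch* out) {
  for (int i = 0; i < num_values; ++i) {
    out[i].ptr = data + i * type_length;
  }
}

int main() {
  constexpr int kTypeLength = 4;  // width of the fixed-length binary values
  constexpr int kNumValues = 1024;
  std::vector<uint8_t> buffer(kNumValues * kTypeLength, 0);

  std::vector<FLBASketch> old_out(kNumValues), new_out(kNumValues);
  DecodeByAdvancingPointer(buffer.data(), static_cast<int64_t>(buffer.size()),
                           kNumValues, kTypeLength, old_out.data());
  DecodeByIndex(buffer.data(), kNumValues, kTypeLength, new_out.data());

  // Both shapes produce identical pointers; only the dependency structure differs.
  for (int i = 0; i < kNumValues; ++i) {
    assert(old_out[i].ptr == new_out[i].ptr);
  }
  return 0;
}
```

Either loop yields the same pointers; the patch only changes how the addresses are derived, which is presumably also why dropping the `data_size` bookkeeping was safe once the size check before the loop has passed.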