From b2754832741d1566900e102e0add330a1d620f27 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 24 May 2024 09:11:09 -0300 Subject: [PATCH] GH-41741: [C++] Check that extension metadata key is present before attempting to delete it (#41763) ### Rationale for this change Neither Schema.fbs nor the Arrow C Data interface nor the columnar specification indicates that the ARROW:extension:metadata key must be present; however, the `ImportType()` implementation assumes that both `ARROW:extension:name` and `ARROW:extension:metadata` are both present and throws an exception if `ARROW:extension:metadata` is missing. This causes pyarrow to crash (see issue for reproducer). ### What changes are included in this PR? This PR checks that the extension metadata is present before attempting to delete it. ### Are these changes tested? Yes (test added). ### Are there any user-facing changes? No. * GitHub Issue: #41741 Authored-by: Dewey Dunnington Signed-off-by: Dewey Dunnington --- cpp/src/arrow/c/bridge.cc | 10 ++++++++-- cpp/src/arrow/c/bridge_test.cc | 17 +++++++++++++++++ cpp/src/arrow/testing/extension_type.h | 19 +++++++++++++++++++ 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 3e2e04ba0b6ec..afb664c3bc258 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -1059,8 +1059,14 @@ struct SchemaImporter { ARROW_ASSIGN_OR_RAISE( type_, registered_ext_type->Deserialize(std::move(type_), metadata_.extension_serialized)); - RETURN_NOT_OK(metadata_.metadata->DeleteMany( - {metadata_.extension_name_index, metadata_.extension_serialized_index})); + // If metadata is present, delete both metadata keys (otherwise, just remove + // the extension name key) + if (metadata_.extension_serialized_index >= 0) { + RETURN_NOT_OK(metadata_.metadata->DeleteMany( + {metadata_.extension_name_index, metadata_.extension_serialized_index})); + } else { + RETURN_NOT_OK(metadata_.metadata->Delete(metadata_.extension_name_index)); + } } } diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 0ecfb5a957760..e3ec262422ba6 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -4102,6 +4102,23 @@ TEST_F(TestArrayRoundtrip, RegisteredExtension) { TestWithArrayFactory(NestedFactory(ExampleDictExtension)); } +TEST_F(TestArrayRoundtrip, RegisteredExtensionNoMetadata) { + auto ext_type = std::make_shared(); + ExtensionTypeGuard guard(ext_type); + + auto ext_metadata = + KeyValueMetadata::Make({"ARROW:extension:name"}, {ext_type->extension_name()}); + auto ext_field = field("", ext_type->storage_type(), true, std::move(ext_metadata)); + + struct ArrowSchema c_schema {}; + SchemaExportGuard schema_guard(&c_schema); + ASSERT_OK(ExportField(*ext_field, &c_schema)); + + ASSERT_OK_AND_ASSIGN(auto ext_type_roundtrip, ImportType(&c_schema)); + ASSERT_EQ(ext_type_roundtrip->id(), Type::EXTENSION); + AssertTypeEqual(ext_type_roundtrip, ext_type); +} + TEST_F(TestArrayRoundtrip, UnregisteredExtension) { auto StorageExtractor = [](ArrayFactory factory) { return [factory]() -> Result> { diff --git a/cpp/src/arrow/testing/extension_type.h b/cpp/src/arrow/testing/extension_type.h index 846e3c7a16578..6515631f202ae 100644 --- a/cpp/src/arrow/testing/extension_type.h +++ b/cpp/src/arrow/testing/extension_type.h @@ -132,6 +132,25 @@ class ARROW_TESTING_EXPORT DictExtensionType : public ExtensionType { std::string Serialize() const override { return "dict-extension-serialized"; } }; +// A minimal extension type that does not error when passed blank extension information +class ARROW_TESTING_EXPORT MetadataOptionalExtensionType : public ExtensionType { + public: + MetadataOptionalExtensionType() : ExtensionType(null()) {} + std::string extension_name() const override { return "metadata.optional"; } + std::string Serialize() const override { return ""; } + std::shared_ptr MakeArray(std::shared_ptr data) const override { + return nullptr; + } + bool ExtensionEquals(const ExtensionType& other) const override { + return other.extension_name() == extension_name(); + } + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized_data) const override { + return std::make_shared(); + } +}; + class ARROW_TESTING_EXPORT Complex128Array : public ExtensionArray { public: using ExtensionArray::ExtensionArray;