From 4d664b52c85283bc92fd363724e557db5860ee4e Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 25 May 2024 02:48:11 +0800 Subject: [PATCH] Parquet: normalize dictionary encoding to use RLE_DICTIONARY --- cpp/src/parquet/column_reader.cc | 11 +++-------- cpp/src/parquet/column_writer.cc | 8 ++------ cpp/src/parquet/encoding.cc | 4 ++-- cpp/src/parquet/file_reader.cc | 3 +-- cpp/src/parquet/properties.h | 6 ++++++ 5 files changed, 14 insertions(+), 18 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 407201a89ef08..52add8f339fc3 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -642,12 +642,6 @@ namespace { // ---------------------------------------------------------------------- // Impl base class for TypedColumnReader and RecordReader -// PLAIN_DICTIONARY is deprecated but used to be used as a dictionary index -// encoding. -static bool IsDictionaryIndexEncoding(const Encoding::type& e) { - return e == Encoding::RLE_DICTIONARY || e == Encoding::PLAIN_DICTIONARY; -} - template class ColumnReaderImplBase { public: @@ -876,8 +870,9 @@ class ColumnReaderImplBase { } Encoding::type encoding = page.encoding(); - if (IsDictionaryIndexEncoding(encoding)) { + // Normalizing the PLAIN_DICTIONARY to RLE_DICTIONARY encoding + // in decoder. 
encoding = Encoding::RLE_DICTIONARY; } @@ -950,7 +945,7 @@ class ColumnReaderImplBase { /// Flag to signal when a new dictionary has been set, for the benefit of /// DictionaryRecordReader - bool new_dictionary_; + bool new_dictionary_ = false; // The exposed encoding ExposedEncoding exposed_encoding_ = ExposedEncoding::NO_ENCODING; diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 9059cd1641745..ac1c3ea2e3e20 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1205,10 +1205,6 @@ Status ConvertDictionaryToDense(const ::arrow::Array& array, MemoryPool* pool, return Status::OK(); } -static inline bool IsDictionaryEncoding(Encoding::type encoding) { - return encoding == Encoding::PLAIN_DICTIONARY; -} - template class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter { public: @@ -1565,7 +1561,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< } void FallbackToPlainEncoding() { - if (IsDictionaryEncoding(current_encoder_->encoding())) { + if (IsDictionaryIndexEncoding(current_encoder_->encoding())) { WriteDictionaryPage(); // Serialize the buffered Dictionary Indices FlushBufferedDataPages(); @@ -1661,7 +1657,7 @@ Status TypedColumnWriterImpl::WriteArrowDictionary( maybe_parent_nulls); }; - if (!IsDictionaryEncoding(current_encoder_->encoding()) || + if (!IsDictionaryIndexEncoding(current_encoder_->encoding()) || !DictionaryDirectWriteSupported(array)) { // No longer dictionary-encoding for whatever reason, maybe we never were // or we decided to stop. 
Note that WriteArrow can be invoked multiple diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 903faa92b6370..54e1e000040a1 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -442,12 +442,12 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { constexpr static int32_t kDataPageBitWidthBytes = 1; explicit DictEncoderImpl(const ColumnDescriptor* desc, MemoryPool* pool) - : EncoderImpl(desc, Encoding::PLAIN_DICTIONARY, pool), + : EncoderImpl(desc, Encoding::RLE_DICTIONARY, pool), buffered_indices_(::arrow::stl::allocator(pool)), dict_encoded_size_(0), memo_table_(pool, kInitialHashTableSize) {} - ~DictEncoderImpl() = default; + ~DictEncoderImpl() override = default; int dict_encoded_size() const override { return dict_encoded_size_; } diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index 8fcb0870ce4b6..3e9eeea6c6f67 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -71,8 +71,7 @@ bool IsColumnChunkFullyDictionaryEncoded(const ColumnChunkMetaData& col) { } // The following pages should be dictionary encoded data pages. 
for (size_t idx = 1; idx < encoding_stats.size(); ++idx) { - if ((encoding_stats[idx].encoding != Encoding::RLE_DICTIONARY && - encoding_stats[idx].encoding != Encoding::PLAIN_DICTIONARY) || + if (!IsDictionaryIndexEncoding(encoding_stats[idx].encoding) || (encoding_stats[idx].page_type != PageType::DATA_PAGE && encoding_stats[idx].page_type != PageType::DATA_PAGE_V2)) { // Return false if any following page is not a dictionary encoded data diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 4d3acb491e390..5b70e0b2001fd 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -1180,4 +1180,10 @@ struct ArrowWriteContext { PARQUET_EXPORT std::shared_ptr default_arrow_writer_properties(); +// PLAIN_DICTIONARY is deprecated but was formerly used as a dictionary index +// encoding. +constexpr bool IsDictionaryIndexEncoding(Encoding::type e) { + return e == Encoding::RLE_DICTIONARY || e == Encoding::PLAIN_DICTIONARY; +} + } // namespace parquet