From ac0c569ab62cb0347ea9c9828fd638b69b5fc708 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 21 Dec 2023 21:08:54 +0800 Subject: [PATCH] Minor: style enhancement for parquet FileMetadata Subset --- cpp/src/arrow/util/bit_stream_utils.h | 2 +- cpp/src/parquet/metadata.cc | 30 +++++++++++---------------- cpp/src/parquet/metadata.h | 8 ++++++- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/util/bit_stream_utils.h b/cpp/src/arrow/util/bit_stream_utils.h index 2afb2e5193697..811694e43b76c 100644 --- a/cpp/src/arrow/util/bit_stream_utils.h +++ b/cpp/src/arrow/util/bit_stream_utils.h @@ -183,7 +183,7 @@ class BitReader { /// Returns the number of bytes left in the stream, not including the current /// byte (i.e., there may be an additional fraction of a byte). - int bytes_left() { + int bytes_left() const { return max_bytes_ - (byte_offset_ + static_cast<int>(bit_util::BytesForBits(bit_offset_))); } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index d651ea5db0f18..8b3f7bfe1fce7 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -793,26 +793,19 @@ class FileMetaData::FileMetaDataImpl { std::shared_ptr<FileMetaData> out(new FileMetaData()); out->impl_ = std::make_unique<FileMetaDataImpl>(); - out->impl_->metadata_ = std::make_unique<format::FileMetaData>(); + out->impl_->metadata_ = std::make_unique<format::FileMetaData>(*metadata_); + auto output_metadata = out->impl_->metadata_.get(); - auto metadata = out->impl_->metadata_.get(); - metadata->version = metadata_->version; - metadata->schema = metadata_->schema; - - metadata->row_groups.resize(row_groups.size()); + // Discard row groups that are not in the subset + output_metadata->num_rows = 0; + output_metadata->row_groups.clear(); + output_metadata->row_groups.resize(row_groups.size()); int i = 0; for (int selected_index : row_groups) { - metadata->num_rows += row_group(selected_index).num_rows; - metadata->row_groups[i++] = row_group(selected_index); + output_metadata->num_rows += row_group(selected_index).num_rows;
+ output_metadata->row_groups[i++] = row_group(selected_index); } - metadata->key_value_metadata = metadata_->key_value_metadata; - metadata->created_by = metadata_->created_by; - metadata->column_orders = metadata_->column_orders; - metadata->encryption_algorithm = metadata_->encryption_algorithm; - metadata->footer_signing_key_metadata = metadata_->footer_signing_key_metadata; - metadata->__isset = metadata_->__isset; - out->impl_->schema_ = schema_; out->impl_->writer_version_ = writer_version_; out->impl_->key_value_metadata_ = key_value_metadata_; @@ -886,13 +879,14 @@ std::shared_ptr<FileMetaData> FileMetaData::Make( const void* metadata, uint32_t* metadata_len, std::shared_ptr<InternalFileDecryptor> file_decryptor) { return std::shared_ptr<FileMetaData>(new FileMetaData( - metadata, metadata_len, default_reader_properties(), file_decryptor)); + metadata, metadata_len, default_reader_properties(), std::move(file_decryptor))); } FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len, const ReaderProperties& properties, std::shared_ptr<InternalFileDecryptor> file_decryptor) - : impl_(new FileMetaDataImpl(metadata, metadata_len, properties, file_decryptor)) {} + : impl_(new FileMetaDataImpl(metadata, metadata_len, properties, + std::move(file_decryptor))) {} FileMetaData::FileMetaData() : impl_(new FileMetaDataImpl()) {} @@ -942,7 +936,7 @@ const std::string& FileMetaData::footer_signing_key_metadata() const { void FileMetaData::set_file_decryptor( std::shared_ptr<InternalFileDecryptor> file_decryptor) { - impl_->set_file_decryptor(file_decryptor); + impl_->set_file_decryptor(std::move(file_decryptor)); } ParquetVersion::type FileMetaData::version() const { diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index e47c45ff0492a..19eb510fd5b4c 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -306,9 +306,15 @@ class PARQUET_EXPORT FileMetaData { int num_schema_elements() const; /// \brief The total number of rows.
+ /// + /// If the FileMetaData is generated from a `Subset()`, the number of rows + /// will be the sum of the number of rows of the remaining RowGroups. int64_t num_rows() const; /// \brief The number of row groups in the file. + /// + /// If the FileMetaData is generated from a `Subset()`, the number of + /// row groups will be the number of remaining RowGroups. int num_row_groups() const; /// \brief Return the RowGroupMetaData of the corresponding row group ordinal. @@ -338,7 +344,7 @@ class PARQUET_EXPORT FileMetaData { /// \brief Size of the original thrift encoded metadata footer. uint32_t size() const; - /// \brief Indicate if all of the FileMetadata's RowGroups can be decompressed. + /// \brief Indicate if all of the FileMetaData's RowGroups can be decompressed. /// /// This will return false if any of the RowGroup's page is compressed with a /// compression format which is not compiled in the current parquet library.