From e4a8c51babb07b6e821ef746fcb7f4e649d83a9f Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 9 May 2024 16:38:50 +0800 Subject: [PATCH] Separate def level and rep level handling --- cpp/src/parquet/column_reader.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index d3f918186247f..c9def652a6dfc 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1699,27 +1699,28 @@ class TypedRecordReader : public TypedColumnReaderImpl, // Count logical records and number of non-null values to read ARROW_DCHECK(!at_record_start_); + // Scan repetition levels to find record end + int64_t level = levels_position_; while (levels_position_ < levels_written_) { int64_t stride = std::min(levels_written_ - levels_position_, num_records - records_read); const int64_t position_end = levels_position_ + stride; for (int64_t i = levels_position_; i < position_end; ++i) { records_read += rep_levels[i] == 0; - values_to_read += def_levels[i] == this->max_def_level_; } - levels_position_ = position_end; if (records_read == num_records) { // Last rep_level reaches the boundary ARROW_CHECK_EQ(rep_levels[levels_position_ - 1], 0); // We've found the number of records we were looking for. Set // at_record_start_ to true and break at_record_start_ = true; - // Remove last value if we have reaches the end of the record - levels_position_ = levels_position_ - 1; - values_to_read -= def_levels[levels_position_] == this->max_def_level_; break; } + levels_position_ = position_end; } + // Scan definition levels to find number of physical values + values_to_read += std::count(def_levels + level, def_levels + levels_position_, + this->max_def_level_); *values_seen = values_to_read; return records_read; }