From 6f7f796e7f8fb51d136594ecd5363f18f697434a Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 6 Apr 2024 00:11:53 +0800 Subject: [PATCH] bugfix for impl --- cpp/src/parquet/encoding.cc | 9 +-- cpp/src/parquet/encoding_test.cc | 108 +++++++++++++------------------ 2 files changed, 51 insertions(+), 66 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 7950b7434151f..adeb081953598 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1211,7 +1211,7 @@ int PlainBooleanDecoder::DecodeArrow( BitBlockCounter bit_counter(valid_bits, valid_bits_offset, num_values); int64_t value_position = 0; int64_t valid_bits_offset_position = valid_bits_offset; - int64_t previous_value_offset = 0; + int64_t previous_value_offset = total_num_values_ - num_values_; while (value_position < num_values) { auto block = bit_counter.NextWord(); if (block.AllSet()) { @@ -1227,8 +1227,7 @@ int PlainBooleanDecoder::DecodeArrow( } else { for (int64_t i = 0; i < block.length; ++i) { if (bit_util::GetBit(valid_bits, valid_bits_offset_position + i)) { - bool value = bit_util::GetBit( - data_, total_num_values_ - num_values_ + previous_value_offset); + bool value = bit_util::GetBit(data_, previous_value_offset); builder->UnsafeAppend(value); previous_value_offset += 1; } else { @@ -3180,7 +3179,9 @@ class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder { PARQUET_THROW_NOT_OK( out->AppendValues(values.begin(), values.begin() + current_batch_size)); num_values -= current_batch_size; - current_index_in_batch = 0; + // set current_index_in_batch to current_batch_size means + // the whole batch is totally consumed. + current_index_in_batch = current_batch_size; } while (num_values > 0); return num_non_null_values; } diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index ec75bc2667b3f..537511d063db7 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -638,22 +638,22 @@ TEST(BooleanArrayEncoding, AdHocRoundTrip) { class TestBooleanArrowDecoding : public ::testing::Test { public: void SetUp() override { - null_probabilities_ = {0.0, 0.5, 1.0}; + null_probabilities_ = {0.0, 0.001, 0.5, 0.9999, 1.0}; read_batch_sizes_ = {1024, 4096, 10000}; - true_probabilities_ = {0.0, 0.5, 1.0}; + true_probabilities_ = {0.0, 0.001, 0.5, 0.9999, 1.0}; } void TearDown() override {} - void InitTestCase(double null_probability, double true_probability) { + void InitTestCase(Encoding::type encoding, double null_probability, + double true_probability) { GenerateInputData(null_probability, true_probability); - SetupEncoderDecoder(); + SetupEncoderDecoder(encoding); } void GenerateInputData(double null_probability, double true_probability) { constexpr int num_values = 10000; ::arrow::random::RandomArrayGenerator rag(0); - expected_dense_ = - rag.Boolean(num_values, /*true_probability=*/true_probability, null_probability); + expected_dense_ = rag.Boolean(num_values, true_probability, null_probability); num_values_ = static_cast(expected_dense_->length()); null_count_ = static_cast(expected_dense_->null_count()); valid_bits_ = expected_dense_->null_bitmap_data(); @@ -668,20 +668,31 @@ class TestBooleanArrowDecoding : public ::testing::Test { } } - // Setup encoder/decoder pair for testing with - virtual void SetupEncoderDecoder() = 0; + // Setup encoder/decoder pair for testing with boolean encoding + void SetupEncoderDecoder(Encoding::type encoding) { + encoder_ = MakeTypedEncoder(encoding); + decoder_ = MakeTypedDecoder(encoding); + const auto* data_ptr = reinterpret_cast(input_data_.data()); + if (valid_bits_ != nullptr) { + ASSERT_NO_THROW(encoder_->PutSpaced(data_ptr, num_values_, valid_bits_, 0)); + } else { + ASSERT_NO_THROW(encoder_->Put(data_ptr, num_values_)); + } + buffer_ = encoder_->FlushValues(); + decoder_->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); + } void CheckDense(int actual_num_values, const ::arrow::Array& chunk) { ASSERT_EQ(actual_num_values, num_values_ - null_count_); ASSERT_ARRAYS_EQUAL(chunk, *expected_dense_); } - void CheckDecodeArrowUsingDenseBuilder() { + void CheckDecodeArrowUsingDenseBuilder(Encoding::type encoding) { for (double np : null_probabilities_) { for (double true_prob : true_probabilities_) { for (int read_batch_size : this->read_batch_sizes_) { // Resume the state of decoder - InitTestCase(np, true_prob); + InitTestCase(encoding, np, true_prob); int num_values_left = this->num_values_; ::arrow::BooleanBuilder acc; @@ -697,10 +708,6 @@ class TestBooleanArrowDecoding : public ::testing::Test { ::arrow::internal::CountSetBits( valid_bits_, num_values_ - num_values_left, batch_size); } - std::cout << "request batch_size: " << batch_size - << ", null_count:" << batch_null_count - << ", valid bits offset:" << this->num_values_ - num_values_left - << '\n'; int batch_num_values = decoder_->DecodeArrow( batch_size, static_cast(batch_null_count), valid_bits_, this->num_values_ - num_values_left, &acc); @@ -715,41 +722,28 @@ class TestBooleanArrowDecoding : public ::testing::Test { } } - void CheckDecodeArrowNonNullUsingDenseBuilder() { - for (auto np : null_probabilities_) { - InitTestCase(np, 0.5); - if (null_count_ > 0) { - continue; - } - ::arrow::BooleanBuilder acc; - int actual_num_values = 0; - int num_values_left = this->num_values_; + void CheckDecodeArrowNonNullUsingDenseBuilder(Encoding::type encoding) { + // NonNull skips tests for null_prob != 0. + for (auto true_prob : true_probabilities_) { for (int read_batch_size : this->read_batch_sizes_) { - int batch_size = std::min(num_values_left, read_batch_size); - int batch_num_values = decoder_->DecodeArrowNonNull(batch_size, &acc); - actual_num_values += batch_num_values; - num_values_left -= batch_num_values; + // Resume the decoder + InitTestCase(encoding, /*null_probability=*/0, true_prob); + ::arrow::BooleanBuilder acc; + int actual_num_values = 0; + int num_values_left = this->num_values_; + while (num_values_left > 0) { + int batch_size = std::min(num_values_left, read_batch_size); + int batch_num_values = decoder_->DecodeArrowNonNull(batch_size, &acc); + actual_num_values += batch_num_values; + num_values_left -= batch_num_values; + } + std::shared_ptr<::arrow::Array> chunk; + ASSERT_OK(acc.Finish(&chunk)); + CheckDense(actual_num_values, *chunk); } - std::shared_ptr<::arrow::Array> chunk; - ASSERT_OK(acc.Finish(&chunk)); - CheckDense(actual_num_values, *chunk); } } - protected: - void SetupEncoderDecoderImpl(Encoding::type encoding) { - encoder_ = MakeTypedEncoder(encoding); - decoder_ = MakeTypedDecoder(encoding); - const auto* data_ptr = reinterpret_cast(input_data_.data()); - if (valid_bits_ != nullptr) { - ASSERT_NO_THROW(encoder_->PutSpaced(data_ptr, num_values_, valid_bits_, 0)); - } else { - ASSERT_NO_THROW(encoder_->Put(data_ptr, num_values_)); - } - buffer_ = encoder_->FlushValues(); - decoder_->SetData(num_values_, buffer_->data(), static_cast(buffer_->size())); - } - protected: std::vector null_probabilities_; std::vector true_probabilities_; @@ -765,30 +759,20 @@ class TestBooleanArrowDecoding : public ::testing::Test { std::shared_ptr buffer_; }; -class PlainBooleanEncodingArrowTest : public TestBooleanArrowDecoding { - public: - void SetupEncoderDecoder() override { SetupEncoderDecoderImpl(Encoding::PLAIN); } -}; - -TEST_F(PlainBooleanEncodingArrowTest, CheckDecodeArrowUsingDenseBuilder) { - this->CheckDecodeArrowUsingDenseBuilder(); +TEST_F(TestBooleanArrowDecoding, CheckDecodeArrowUsingDenseBuilderPlain) { + this->CheckDecodeArrowUsingDenseBuilder(Encoding::PLAIN); } -TEST_F(PlainBooleanEncodingArrowTest, CheckDecodeArrowNonNullDenseBuilder) { - this->CheckDecodeArrowNonNullUsingDenseBuilder(); +TEST_F(TestBooleanArrowDecoding, CheckDecodeArrowNonNullDenseBuilderPlain) { + this->CheckDecodeArrowNonNullUsingDenseBuilder(Encoding::PLAIN); } -class RleBooleanEncodingArrowTest : public TestBooleanArrowDecoding { - public: - void SetupEncoderDecoder() override { SetupEncoderDecoderImpl(Encoding::RLE); } -}; - -TEST_F(RleBooleanEncodingArrowTest, CheckDecodeArrowUsingDenseBuilder) { - this->CheckDecodeArrowUsingDenseBuilder(); +TEST_F(TestBooleanArrowDecoding, CheckDecodeArrowUsingDenseBuilderRle) { + this->CheckDecodeArrowUsingDenseBuilder(Encoding::RLE); } -TEST_F(RleBooleanEncodingArrowTest, CheckDecodeArrowNonNullDenseBuilder) { - this->CheckDecodeArrowNonNullUsingDenseBuilder(); +TEST_F(TestBooleanArrowDecoding, CheckDecodeArrowNonNullDenseBuilder) { + this->CheckDecodeArrowNonNullUsingDenseBuilder(Encoding::PLAIN); } template