Skip to content

Commit

Permalink
bugfix for impl
Browse files Browse the repository at this point in the history
  • Loading branch information
mapleFU committed Apr 5, 2024
1 parent 54bdddc commit 825bb90
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 66 deletions.
9 changes: 5 additions & 4 deletions cpp/src/parquet/encoding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1211,7 +1211,7 @@ int PlainBooleanDecoder::DecodeArrow(
BitBlockCounter bit_counter(valid_bits, valid_bits_offset, num_values);
int64_t value_position = 0;
int64_t valid_bits_offset_position = valid_bits_offset;
int64_t previous_value_offset = 0;
int64_t previous_value_offset = total_num_values_ - num_values_;
while (value_position < num_values) {
auto block = bit_counter.NextWord();
if (block.AllSet()) {
Expand All @@ -1227,8 +1227,7 @@ int PlainBooleanDecoder::DecodeArrow(
} else {
for (int64_t i = 0; i < block.length; ++i) {
if (bit_util::GetBit(valid_bits, valid_bits_offset_position + i)) {
bool value = bit_util::GetBit(
data_, total_num_values_ - num_values_ + previous_value_offset);
bool value = bit_util::GetBit(data_, previous_value_offset);
builder->UnsafeAppend(value);
previous_value_offset += 1;
} else {
Expand Down Expand Up @@ -3180,7 +3179,9 @@ class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
PARQUET_THROW_NOT_OK(
out->AppendValues(values.begin(), values.begin() + current_batch_size));
num_values -= current_batch_size;
current_index_in_batch = 0;
// set current_index_in_batch to current_batch_size means
// the whole batch is totally consumed.
current_index_in_batch = current_batch_size;
} while (num_values > 0);
return num_non_null_values;
}
Expand Down
108 changes: 46 additions & 62 deletions cpp/src/parquet/encoding_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -638,22 +638,22 @@ TEST(BooleanArrayEncoding, AdHocRoundTrip) {
class TestBooleanArrowDecoding : public ::testing::Test {
public:
void SetUp() override {
null_probabilities_ = {0.0, 0.5, 1.0};
null_probabilities_ = {0.0, 0.001, 0.5, 0.999, 1.0};
read_batch_sizes_ = {1024, 4096, 10000};
true_probabilities_ = {0.0, 0.5, 1.0};
true_probabilities_ = {0.0, 0.001, 0.5, 0.999, 1.0};
}
void TearDown() override {}

void InitTestCase(double null_probability, double true_probability) {
void InitTestCase(Encoding::type encoding, double null_probability,
double true_probability) {
GenerateInputData(null_probability, true_probability);
SetupEncoderDecoder();
SetupEncoderDecoder(encoding);
}

void GenerateInputData(double null_probability, double true_probability) {
constexpr int num_values = 10000;
::arrow::random::RandomArrayGenerator rag(0);
expected_dense_ =
rag.Boolean(num_values, /*true_probability=*/true_probability, null_probability);
expected_dense_ = rag.Boolean(num_values, true_probability, null_probability);
num_values_ = static_cast<int>(expected_dense_->length());
null_count_ = static_cast<int>(expected_dense_->null_count());
valid_bits_ = expected_dense_->null_bitmap_data();
Expand All @@ -668,20 +668,31 @@ class TestBooleanArrowDecoding : public ::testing::Test {
}
}

// Setup encoder/decoder pair for testing with
virtual void SetupEncoderDecoder() = 0;
// Setup encoder/decoder pair for testing with boolean encoding
void SetupEncoderDecoder(Encoding::type encoding) {
encoder_ = MakeTypedEncoder<BooleanType>(encoding);
decoder_ = MakeTypedDecoder<BooleanType>(encoding);
const auto* data_ptr = reinterpret_cast<const bool*>(input_data_.data());
if (valid_bits_ != nullptr) {
ASSERT_NO_THROW(encoder_->PutSpaced(data_ptr, num_values_, valid_bits_, 0));
} else {
ASSERT_NO_THROW(encoder_->Put(data_ptr, num_values_));
}
buffer_ = encoder_->FlushValues();
decoder_->SetData(num_values_, buffer_->data(), static_cast<int>(buffer_->size()));
}

void CheckDense(int actual_num_values, const ::arrow::Array& chunk) {
ASSERT_EQ(actual_num_values, num_values_ - null_count_);
ASSERT_ARRAYS_EQUAL(chunk, *expected_dense_);
}

void CheckDecodeArrowUsingDenseBuilder() {
void CheckDecodeArrowUsingDenseBuilder(Encoding::type encoding) {
for (double np : null_probabilities_) {
for (double true_prob : true_probabilities_) {
for (int read_batch_size : this->read_batch_sizes_) {
// Resume the state of decoder
InitTestCase(np, true_prob);
InitTestCase(encoding, np, true_prob);

int num_values_left = this->num_values_;
::arrow::BooleanBuilder acc;
Expand All @@ -697,10 +708,6 @@ class TestBooleanArrowDecoding : public ::testing::Test {
::arrow::internal::CountSetBits(
valid_bits_, num_values_ - num_values_left, batch_size);
}
std::cout << "request batch_size: " << batch_size
<< ", null_count:" << batch_null_count
<< ", valid bits offset:" << this->num_values_ - num_values_left
<< '\n';
int batch_num_values = decoder_->DecodeArrow(
batch_size, static_cast<int>(batch_null_count), valid_bits_,
this->num_values_ - num_values_left, &acc);
Expand All @@ -715,41 +722,28 @@ class TestBooleanArrowDecoding : public ::testing::Test {
}
}

void CheckDecodeArrowNonNullUsingDenseBuilder() {
for (auto np : null_probabilities_) {
InitTestCase(np, 0.5);
if (null_count_ > 0) {
continue;
}
::arrow::BooleanBuilder acc;
int actual_num_values = 0;
int num_values_left = this->num_values_;
void CheckDecodeArrowNonNullUsingDenseBuilder(Encoding::type encoding) {
// NonNull skips tests for null_prob != 0.
for (auto true_prob : true_probabilities_) {
for (int read_batch_size : this->read_batch_sizes_) {
int batch_size = std::min(num_values_left, read_batch_size);
int batch_num_values = decoder_->DecodeArrowNonNull(batch_size, &acc);
actual_num_values += batch_num_values;
num_values_left -= batch_num_values;
// Resume the decoder
InitTestCase(encoding, /*null_probability=*/0, true_prob);
::arrow::BooleanBuilder acc;
int actual_num_values = 0;
int num_values_left = this->num_values_;
while (num_values_left > 0) {
int batch_size = std::min(num_values_left, read_batch_size);
int batch_num_values = decoder_->DecodeArrowNonNull(batch_size, &acc);
actual_num_values += batch_num_values;
num_values_left -= batch_num_values;
}
std::shared_ptr<::arrow::Array> chunk;
ASSERT_OK(acc.Finish(&chunk));
CheckDense(actual_num_values, *chunk);
}
std::shared_ptr<::arrow::Array> chunk;
ASSERT_OK(acc.Finish(&chunk));
CheckDense(actual_num_values, *chunk);
}
}

protected:
void SetupEncoderDecoderImpl(Encoding::type encoding) {
encoder_ = MakeTypedEncoder<BooleanType>(encoding);
decoder_ = MakeTypedDecoder<BooleanType>(encoding);
const auto* data_ptr = reinterpret_cast<const bool*>(input_data_.data());
if (valid_bits_ != nullptr) {
ASSERT_NO_THROW(encoder_->PutSpaced(data_ptr, num_values_, valid_bits_, 0));
} else {
ASSERT_NO_THROW(encoder_->Put(data_ptr, num_values_));
}
buffer_ = encoder_->FlushValues();
decoder_->SetData(num_values_, buffer_->data(), static_cast<int>(buffer_->size()));
}

protected:
std::vector<double> null_probabilities_;
std::vector<double> true_probabilities_;
Expand All @@ -765,30 +759,20 @@ class TestBooleanArrowDecoding : public ::testing::Test {
std::shared_ptr<Buffer> buffer_;
};

class PlainBooleanEncodingArrowTest : public TestBooleanArrowDecoding {
public:
void SetupEncoderDecoder() override { SetupEncoderDecoderImpl(Encoding::PLAIN); }
};

TEST_F(PlainBooleanEncodingArrowTest, CheckDecodeArrowUsingDenseBuilder) {
this->CheckDecodeArrowUsingDenseBuilder();
TEST_F(TestBooleanArrowDecoding, CheckDecodeArrowUsingDenseBuilderPlain) {
this->CheckDecodeArrowUsingDenseBuilder(Encoding::PLAIN);
}

TEST_F(PlainBooleanEncodingArrowTest, CheckDecodeArrowNonNullDenseBuilder) {
this->CheckDecodeArrowNonNullUsingDenseBuilder();
TEST_F(TestBooleanArrowDecoding, CheckDecodeArrowNonNullDenseBuilderPlain) {
this->CheckDecodeArrowNonNullUsingDenseBuilder(Encoding::PLAIN);
}

class RleBooleanEncodingArrowTest : public TestBooleanArrowDecoding {
public:
void SetupEncoderDecoder() override { SetupEncoderDecoderImpl(Encoding::RLE); }
};

TEST_F(RleBooleanEncodingArrowTest, CheckDecodeArrowUsingDenseBuilder) {
this->CheckDecodeArrowUsingDenseBuilder();
TEST_F(TestBooleanArrowDecoding, CheckDecodeArrowUsingDenseBuilderRle) {
this->CheckDecodeArrowUsingDenseBuilder(Encoding::RLE);
}

TEST_F(RleBooleanEncodingArrowTest, CheckDecodeArrowNonNullDenseBuilder) {
this->CheckDecodeArrowNonNullUsingDenseBuilder();
TEST_F(TestBooleanArrowDecoding, CheckDecodeArrowNonNullDenseBuilder) {
this->CheckDecodeArrowNonNullUsingDenseBuilder(Encoding::RLE);
}

template <typename T>
Expand Down

0 comments on commit 825bb90

Please sign in to comment.