Skip to content

Commit

Permalink
Encoding: Optimize DecodeArrow for PlainBooleanDecoder
Browse files Browse the repository at this point in the history
  • Loading branch information
mapleFU committed Mar 28, 2024
1 parent a9b2cc2 commit b03728b
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 23 deletions.
3 changes: 2 additions & 1 deletion cpp/src/parquet/column_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1316,7 +1316,8 @@ class TypedRecordReader : public TypedColumnReaderImpl<DType>,
levels_position_ = 0;
levels_capacity_ = 0;
read_dense_for_nullable_ = read_dense_for_nullable;
uses_values_ = !(descr->physical_type() == Type::BYTE_ARRAY);
// BYTE_ARRAY values are not stored in the `values_` buffer.
uses_values_ = descr->physical_type() != Type::BYTE_ARRAY;

if (uses_values_) {
values_ = AllocateBuffer(pool);
Expand Down
61 changes: 39 additions & 22 deletions cpp/src/parquet/encoding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1173,13 +1173,15 @@ class PlainBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {

private:
std::unique_ptr<::arrow::bit_util::BitReader> bit_reader_;
int total_num_values_{0};
};

// Constructs a boolean decoder for the column described by `descr`; the
// encoding handed to the DecoderImpl base is always Encoding::PLAIN.
PlainBooleanDecoder::PlainBooleanDecoder(const ColumnDescriptor* descr)
: DecoderImpl(descr, Encoding::PLAIN) {}

// Points the decoder at a new buffer of PLAIN-encoded boolean values.
//
// \param num_values value count for this buffer
// \param data start of the encoded buffer
// \param len length of `data`, passed through to the BitReader
void PlainBooleanDecoder::SetData(int num_values, const uint8_t* data, int len) {
  // Let the base class record the shared decoder state instead of assigning
  // num_values_ by hand, so the count and the buffer are always set together.
  // NOTE(review): DecoderImpl::SetData is not visible here -- confirm it
  // stores both num_values_ and data_, which the decode methods read.
  DecoderImpl::SetData(num_values, data, len);
  // Remember the initial count: `total_num_values_ - num_values_` is later
  // used as the bit offset of the next undecoded value inside data_.
  total_num_values_ = num_values;
  bit_reader_ = std::make_unique<bit_util::BitReader>(data, len);
}

Expand All @@ -1188,19 +1190,37 @@ int PlainBooleanDecoder::DecodeArrow(
typename EncodingTraits<BooleanType>::Accumulator* builder) {
int values_decoded = num_values - null_count;
if (ARROW_PREDICT_FALSE(num_values_ < values_decoded)) {
// A too large `num_values` was requested.
ParquetException::EofException();
}
if (ARROW_PREDICT_FALSE(!bit_reader_->Advance(values_decoded))) {
ParquetException::EofException();
}

PARQUET_THROW_NOT_OK(builder->Reserve(num_values));

VisitNullBitmapInline(
valid_bits, valid_bits_offset, num_values, null_count,
[&]() {
bool value;
ARROW_IGNORE_EXPR(bit_reader_->GetValue(1, &value));
builder->UnsafeAppend(value);
},
[&]() { builder->UnsafeAppendNull(); });
if (null_count == 0) {
// FastPath: can copy the data directly
PARQUET_THROW_NOT_OK(builder->AppendValues(data_, values_decoded, NULLPTR,
total_num_values_ - num_values_));
} else {
int64_t previous_offset = 0;
int64_t previous_value_offset = 0;
PARQUET_THROW_NOT_OK(::arrow::internal::VisitSetBitRuns(
valid_bits, valid_bits_offset, num_values, [&](int64_t position, int64_t length) {
if (position > previous_offset) {
RETURN_NOT_OK(builder->AppendNulls(position - previous_offset));
}
RETURN_NOT_OK(builder->AppendValues(
data_, length, NULLPTR,
total_num_values_ - num_values_ + previous_value_offset));
previous_offset = position + length;
previous_value_offset += length;
return Status::OK();
}));
// Epilogue: handle trailing nulls
if (previous_offset < num_values) {
PARQUET_THROW_NOT_OK(builder->AppendNulls(num_values - previous_offset));
}
}

num_values_ -= values_decoded;
return values_decoded;
Expand All @@ -1214,18 +1234,15 @@ inline int PlainBooleanDecoder::DecodeArrow(

// Decodes up to `max_values` booleans into `buffer` as a packed bitmap.
//
// \param buffer destination bitmap; bits are written starting at bit 0
// \param max_values upper bound on the number of values to decode; it is
//   clamped to the number of values still remaining (num_values_)
// \return the number of values actually decoded
//
// Calls ParquetException::EofException() when the bit reader cannot advance
// by the requested number of values.
int PlainBooleanDecoder::Decode(uint8_t* buffer, int max_values) {
  max_values = std::min(max_values, num_values_);
  // Advance the reader once for the whole batch; a failure here means the
  // buffer holds fewer bits than requested.
  if (ARROW_PREDICT_FALSE(!bit_reader_->Advance(max_values))) {
    ParquetException::EofException();
  }
  // Parquet's boolean encoding is bit-packed using LSB, so the source bits
  // can be copied straight into the output bitmap. The source offset is the
  // number of values already consumed since SetData().
  ::arrow::internal::CopyBitmap(this->data_, /*offset=*/total_num_values_ - num_values_,
                                /*length=*/max_values, /*dest=*/buffer,
                                /*dest_offset=*/0);
  num_values_ -= max_values;
  return max_values;
}
Expand Down

0 comments on commit b03728b

Please sign in to comment.