Skip to content

Commit

Permalink
add more tests
Browse files Browse the repository at this point in the history
  • Loading branch information
mapleFU committed May 16, 2024
1 parent c082f8d commit 4d252c0
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 20 deletions.
12 changes: 5 additions & 7 deletions cpp/src/parquet/column_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,7 @@ inline void CheckNumberDecoded(int64_t number_decoded, int64_t expected) {
}

constexpr std::string_view kErrorRepDefLevelNotMatchesNumValues =
"Number of decoded rep / def levels did more or less than num_values in page_header";
constexpr std::string_view kErrorRepDefLevelInEqual =
"Number of decoded rep / def levels did not match";
"Number of decoded rep / def levels do not match num_values in page header";

} // namespace

Expand Down Expand Up @@ -1063,7 +1061,7 @@ class TypedColumnReaderImpl : public TypedColumnReader<DType>,
if (this->max_rep_level_ > 0 && rep_levels != nullptr) {
int64_t num_rep_levels = this->ReadRepetitionLevels(batch_size, rep_levels);
if (batch_size != num_rep_levels) {
throw ParquetException(kErrorRepDefLevelInEqual);
throw ParquetException(kErrorRepDefLevelNotMatchesNumValues);
}
}
}
Expand Down Expand Up @@ -1183,7 +1181,7 @@ int64_t TypedColumnReaderImpl<DType>::ReadBatchSpaced(
if (this->max_rep_level_ > 0) {
int64_t num_rep_levels = this->ReadRepetitionLevels(batch_size, rep_levels);
if (num_def_levels != num_rep_levels) {
throw ParquetException(kErrorRepDefLevelInEqual);
throw ParquetException(kErrorRepDefLevelNotMatchesNumValues);
}
}

Expand Down Expand Up @@ -1429,7 +1427,7 @@ class TypedRecordReader : public TypedColumnReaderImpl<DType>,
if (this->max_rep_level_ > 0) {
int64_t rep_levels_read = this->ReadRepetitionLevels(batch_size, rep_levels);
if (rep_levels_read != levels_read) {
throw ParquetException(kErrorRepDefLevelInEqual);
throw ParquetException(kErrorRepDefLevelNotMatchesNumValues);
}
}

Expand Down Expand Up @@ -1597,7 +1595,7 @@ class TypedRecordReader : public TypedColumnReaderImpl<DType>,
int64_t levels_read = 0;
levels_read = this->ReadDefinitionLevels(batch_size, def_levels);
if (this->ReadRepetitionLevels(batch_size, rep_levels) != levels_read) {
throw ParquetException(kErrorRepDefLevelInEqual);
throw ParquetException(kErrorRepDefLevelNotMatchesNumValues);
}
if (ARROW_PREDICT_FALSE(batch_size != levels_read)) {
throw ParquetException(kErrorRepDefLevelNotMatchesNumValues);
Expand Down
47 changes: 35 additions & 12 deletions cpp/src/parquet/column_reader_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -431,24 +431,21 @@ TEST_F(TestPrimitiveReader, TestReadValuesMissing) {
ParquetException);
}

// GH-41321: When max_def_level > 0, and Page has more or less
// def-levels than the `num_values` in PageHeader. We should
// detect and throw exception.
TEST_F(TestPrimitiveReader, DefLevelNotExpected) {
max_def_level_ = 1;
max_rep_level_ = 0;

auto do_check = [&](const std::vector<int16_t>& input_def_levels, int num_values) {
// GH-41321: When max_def_level > 0 or max_rep_level > 0, and
// Page has more or less levels than the `num_values` in
// PageHeader. We should detect and throw exception.
TEST_F(TestPrimitiveReader, DefRepLevelNotExpected) {
auto do_check = [&](const NodePtr& type, const std::vector<int16_t>& input_def_levels,
const std::vector<int16_t>& input_rep_levels, int num_values) {
std::vector<bool> values(num_values, false);
NodePtr type = schema::Boolean("a", Repetition::OPTIONAL);
const ColumnDescriptor descr(type, max_def_level_, max_rep_level_);

// The data page falls back to plain encoding
std::shared_ptr<ResizableBuffer> dummy = AllocateBuffer();
std::shared_ptr<DataPageV1> data_page = MakeDataPage<BooleanType>(
&descr, values, /*num_values=*/num_values, Encoding::PLAIN, /*indices=*/{},
/*indices_size=*/0, /*def_levels=*/input_def_levels, max_def_level_,
/*rep_levels=*/{},
/*rep_levels=*/input_rep_levels,
/*max_rep_level=*/max_rep_level_);
pages_.push_back(data_page);
InitReader(&descr);
Expand All @@ -473,13 +470,39 @@ TEST_F(TestPrimitiveReader, DefLevelNotExpected) {
};
// storing def-levels less than value in page-header
{
max_def_level_ = 1;
max_rep_level_ = 0;
NodePtr type = schema::Boolean("a", Repetition::OPTIONAL);
std::vector<int16_t> input_def_levels(1, 1);
do_check(input_def_levels, /*num_values=*/3);
std::vector<int16_t> input_rep_levels{};
do_check(type, input_def_levels, input_rep_levels, /*num_values=*/3);
}
// storing def-levels more than value in page-header
{
max_def_level_ = 1;
max_rep_level_ = 0;
NodePtr type = schema::Boolean("a", Repetition::OPTIONAL);
std::vector<int16_t> input_def_levels(2, 1);
do_check(input_def_levels, /*num_values=*/1);
std::vector<int16_t> input_rep_levels{};
do_check(type, input_def_levels, input_rep_levels, /*num_values=*/1);
}
// storing rep-levels more than value in page-header
{
max_def_level_ = 1;
max_rep_level_ = 1;
NodePtr type = schema::Boolean("a", Repetition::REPEATED);
std::vector<int16_t> input_def_levels(1, 1);
std::vector<int16_t> input_rep_levels(2, 1);
do_check(type, input_def_levels, input_rep_levels, /*num_values=*/1);
}
// storing rep-levels more than value in page-header
{
max_def_level_ = 1;
max_rep_level_ = 1;
NodePtr type = schema::Boolean("a", Repetition::REPEATED);
std::vector<int16_t> input_def_levels(1, 1);
std::vector<int16_t> input_rep_levels(2, 1);
do_check(type, input_def_levels, input_rep_levels, /*num_values=*/1);
}
}

Expand Down
1 change: 0 additions & 1 deletion cpp/submodules/parquet-testing
Submodule parquet-testing deleted from 74278b

0 comments on commit 4d252c0

Please sign in to comment.