From 4d252c0fd5cfe776dd105d7b075eb220d4ac2e32 Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 17 May 2024 00:27:05 +0800 Subject: [PATCH] add more tests --- cpp/src/parquet/column_reader.cc | 12 +++---- cpp/src/parquet/column_reader_test.cc | 47 ++++++++++++++++++++------- cpp/submodules/parquet-testing | 1 - 3 files changed, 40 insertions(+), 20 deletions(-) delete mode 160000 cpp/submodules/parquet-testing diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index a053971f95624..b739f22791677 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -103,9 +103,7 @@ inline void CheckNumberDecoded(int64_t number_decoded, int64_t expected) { } constexpr std::string_view kErrorRepDefLevelNotMatchesNumValues = - "Number of decoded rep / def levels did more or less than num_values in page_header"; -constexpr std::string_view kErrorRepDefLevelInEqual = - "Number of decoded rep / def levels did not match"; + "Number of decoded rep / def levels do not match num_values in page header"; } // namespace @@ -1063,7 +1061,7 @@ class TypedColumnReaderImpl : public TypedColumnReader, if (this->max_rep_level_ > 0 && rep_levels != nullptr) { int64_t num_rep_levels = this->ReadRepetitionLevels(batch_size, rep_levels); if (batch_size != num_rep_levels) { - throw ParquetException(kErrorRepDefLevelInEqual); + throw ParquetException(kErrorRepDefLevelNotMatchesNumValues); } } } @@ -1183,7 +1181,7 @@ int64_t TypedColumnReaderImpl::ReadBatchSpaced( if (this->max_rep_level_ > 0) { int64_t num_rep_levels = this->ReadRepetitionLevels(batch_size, rep_levels); if (num_def_levels != num_rep_levels) { - throw ParquetException(kErrorRepDefLevelInEqual); + throw ParquetException(kErrorRepDefLevelNotMatchesNumValues); } } @@ -1429,7 +1427,7 @@ class TypedRecordReader : public TypedColumnReaderImpl, if (this->max_rep_level_ > 0) { int64_t rep_levels_read = this->ReadRepetitionLevels(batch_size, rep_levels); if (rep_levels_read != levels_read) { - throw ParquetException(kErrorRepDefLevelInEqual); + throw ParquetException(kErrorRepDefLevelNotMatchesNumValues); } } @@ -1597,7 +1595,7 @@ class TypedRecordReader : public TypedColumnReaderImpl, int64_t levels_read = 0; levels_read = this->ReadDefinitionLevels(batch_size, def_levels); if (this->ReadRepetitionLevels(batch_size, rep_levels) != levels_read) { - throw ParquetException(kErrorRepDefLevelInEqual); + throw ParquetException(kErrorRepDefLevelNotMatchesNumValues); } if (ARROW_PREDICT_FALSE(batch_size != levels_read)) { throw ParquetException(kErrorRepDefLevelNotMatchesNumValues); diff --git a/cpp/src/parquet/column_reader_test.cc b/cpp/src/parquet/column_reader_test.cc index 998f1c37be652..5632e9f923d28 100644 --- a/cpp/src/parquet/column_reader_test.cc +++ b/cpp/src/parquet/column_reader_test.cc @@ -431,16 +431,13 @@ TEST_F(TestPrimitiveReader, TestReadValuesMissing) { ParquetException); } -// GH-41321: When max_def_level > 0, and Page has more or less -// def-levels than the `num_values` in PageHeader. We should -// detect and throw exception. -TEST_F(TestPrimitiveReader, DefLevelNotExpected) { - max_def_level_ = 1; - max_rep_level_ = 0; - - auto do_check = [&](const std::vector& input_def_levels, int num_values) { +// GH-41321: When max_def_level > 0 or max_rep_level > 0, and +// Page has more or less levels than the `num_values` in +// PageHeader. We should detect and throw exception. +TEST_F(TestPrimitiveReader, DefRepLevelNotExpected) { + auto do_check = [&](const NodePtr& type, const std::vector& input_def_levels, + const std::vector& input_rep_levels, int num_values) { std::vector values(num_values, false); - NodePtr type = schema::Boolean("a", Repetition::OPTIONAL); const ColumnDescriptor descr(type, max_def_level_, max_rep_level_); // The data page falls back to plain encoding @@ -448,7 +445,7 @@ TEST_F(TestPrimitiveReader, DefLevelNotExpected) { std::shared_ptr data_page = MakeDataPage( &descr, values, /*num_values=*/num_values, Encoding::PLAIN, /*indices=*/{}, /*indices_size=*/0, /*def_levels=*/input_def_levels, max_def_level_, - /*rep_levels=*/{}, + /*rep_levels=*/input_rep_levels, /*max_rep_level=*/max_rep_level_); pages_.push_back(data_page); InitReader(&descr); @@ -473,13 +470,39 @@ TEST_F(TestPrimitiveReader, DefLevelNotExpected) { }; // storing def-levels less than value in page-header { + max_def_level_ = 1; + max_rep_level_ = 0; + NodePtr type = schema::Boolean("a", Repetition::OPTIONAL); std::vector input_def_levels(1, 1); - do_check(input_def_levels, /*num_values=*/3); + std::vector input_rep_levels{}; + do_check(type, input_def_levels, input_rep_levels, /*num_values=*/3); } // storing def-levels more than value in page-header { + max_def_level_ = 1; + max_rep_level_ = 0; + NodePtr type = schema::Boolean("a", Repetition::OPTIONAL); std::vector input_def_levels(2, 1); - do_check(input_def_levels, /*num_values=*/1); + std::vector input_rep_levels{}; + do_check(type, input_def_levels, input_rep_levels, /*num_values=*/1); + } + // storing rep-levels more than value in page-header + { + max_def_level_ = 1; + max_rep_level_ = 1; + NodePtr type = schema::Boolean("a", Repetition::REPEATED); + std::vector input_def_levels(1, 1); + std::vector input_rep_levels(2, 1); + do_check(type, input_def_levels, input_rep_levels, /*num_values=*/1); + } + // storing rep-levels more than value in page-header + { + max_def_level_ = 1; + max_rep_level_ = 1; + NodePtr type = schema::Boolean("a", Repetition::REPEATED); + std::vector input_def_levels(1, 1); + std::vector input_rep_levels(2, 1); + do_check(type, input_def_levels, input_rep_levels, /*num_values=*/1); } } diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing deleted file mode 160000 index 74278bc4a1122..0000000000000 --- a/cpp/submodules/parquet-testing +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 74278bc4a1122d74945969e6dec405abd1533ec3