From 51529e061f2eb0020c85264e0c1b4135d8143a1b Mon Sep 17 00:00:00 2001
From: mwish
Date: Fri, 19 Jan 2024 21:18:46 +0800
Subject: [PATCH] Add levels decoding basic benchmark

---
 cpp/src/parquet/column_reader_benchmark.cc | 95 ++++++++++++++++++++++
 cpp/src/parquet/column_writer_test.cc      |  4 +-
 2 files changed, 97 insertions(+), 2 deletions(-)

diff --git a/cpp/src/parquet/column_reader_benchmark.cc b/cpp/src/parquet/column_reader_benchmark.cc
index 49b2317ede187..bc96d19206aaa 100644
--- a/cpp/src/parquet/column_reader_benchmark.cc
+++ b/cpp/src/parquet/column_reader_benchmark.cc
@@ -219,5 +219,100 @@ BENCHMARK(RecordReaderReadRecords)
     ->Args({2, 1000, true})
     ->Args({2, 1000, false});
 
+// Appends `num_levels` pseudo-random levels in [0, max_level] to `input_levels`.
+// Levels are emitted in runs of `level_repeats` identical values (the last run
+// is truncated so that exactly `num_levels` values are appended). The seed is
+// fixed so every benchmark run decodes the same input.
+void GenerateLevels(int level_repeats, int max_level, int num_levels,
+                    std::vector<int16_t>& input_levels) {
+  std::default_random_engine gen(/*seed=*/1943);
+  std::uniform_int_distribution<int16_t> d(0, max_level);
+  // A non-positive repeat count would never make progress; treat it as 1.
+  const int run_length = level_repeats > 0 ? level_repeats : 1;
+  for (int i = 0; i < num_levels;) {
+    // Draw once per run so that the value is really repeated.
+    const int16_t level = d(gen);
+    for (int j = 0; j < run_length && i < num_levels; ++j, ++i) {
+      input_levels.push_back(level);
+    }
+  }
+}
+
+// Encodes the first `num_levels` entries of `input_levels` into `bytes` using
+// `encoding`. For RLE the payload is preceded by a 4-byte length prefix.
+// `bytes` is resized here; DecodeLevels passes it unchanged to the decoder.
+void EncodeLevels(Encoding::type encoding, int16_t max_level, int num_levels,
+                  const int16_t* input_levels, std::vector<uint8_t>& bytes) {
+  LevelEncoder encoder;
+  int levels_count = 0;
+  // Payload budget: two bytes per level.
+  const int payload_capacity = 2 * num_levels;
+  if (encoding == Encoding::RLE) {
+    // Reserve room for the length prefix in addition to the payload, and tell the
+    // encoder only about the payload bytes so it cannot write past the buffer.
+    bytes.resize(payload_capacity + sizeof(int32_t));
+    encoder.Init(encoding, max_level, num_levels, bytes.data() + sizeof(int32_t),
+                 payload_capacity);
+
+    levels_count = encoder.Encode(num_levels, input_levels);
+    (reinterpret_cast<int32_t*>(bytes.data()))[0] = encoder.len();
+  } else {
+    bytes.resize(payload_capacity);
+    encoder.Init(encoding, max_level, num_levels, bytes.data(), payload_capacity);
+    levels_count = encoder.Encode(num_levels, input_levels);
+  }
+  ASSERT_EQ(num_levels, levels_count);
+}
+
+// Benchmark body: generates and encodes `num_levels` levels once, then on every
+// iteration decodes the buffer, requesting `batch_size` levels per Decode call.
+static void DecodeLevels(Encoding::type level_encoding, int16_t max_level, int num_levels,
+                         int batch_size, int level_repeat_count,
+                         ::benchmark::State& state) {
+  std::vector<uint8_t> bytes;
+  {
+    std::vector<int16_t> input_levels;
+    GenerateLevels(/*level_repeats=*/level_repeat_count, /*max_level=*/max_level,
+                   num_levels, input_levels);
+    EncodeLevels(level_encoding, max_level, num_levels, input_levels.data(), bytes);
+  }
+
+  LevelDecoder decoder;
+  std::vector<int16_t> output_levels(num_levels);
+  for (auto _ : state) {
+    decoder.SetData(level_encoding, max_level, num_levels, bytes.data(),
+                    static_cast<int>(bytes.size()));
+    // Decode batch by batch until the decoder reports no more levels.
+    while (true) {
+      int levels_decoded = decoder.Decode(batch_size, output_levels.data());
+      if (levels_decoded == 0) {
+        break;
+      }
+    }
+  }
+  // Report throughput in levels per second.
+  state.SetItemsProcessed(state.iterations() * num_levels);
+}
+
+// Unpacks the benchmark arguments and runs the RLE level-decoding benchmark.
+static void ReadLevels(::benchmark::State& state) {
+  int16_t max_level = static_cast<int16_t>(state.range(0));
+  int num_levels = static_cast<int>(state.range(1));
+  int batch_size = static_cast<int>(state.range(2));
+  int level_repeat_count = static_cast<int>(state.range(3));
+  DecodeLevels(Encoding::RLE, max_level, num_levels, batch_size, level_repeat_count,
+               state);
+}
+
+BENCHMARK(ReadLevels)
+    ->ArgNames({"MaxLevel", "NumLevels", "BatchSize", "LevelRepeatCount"})
+    ->Args({1, 8096, 1024, 1})
+    ->Args({1, 8096, 1024, 7})
+    ->Args({1, 8096, 1024, 1024})
+    ->Args({1, 8096, 2048, 1})
+    ->Args({3, 8096, 1024, 1})
+    ->Args({3, 8096, 2048, 1})
+    ->Args({3, 8096, 1024, 7});
+
 }  // namespace benchmark
 }  // namespace parquet
diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc
index 59fc848d7fd57..6a01c0614e3ec 100644
--- a/cpp/src/parquet/column_writer_test.cc
+++ b/cpp/src/parquet/column_writer_test.cc
@@ -976,7 +976,7 @@ void EncodeLevels(Encoding::type encoding, int16_t max_level, int num_levels,
 }
 
 void VerifyDecodingLevels(Encoding::type encoding, int16_t max_level,
-                          std::vector<int16_t>& input_levels,
+                          const std::vector<int16_t>& input_levels,
                           std::vector<uint8_t>& bytes) {
   LevelDecoder decoder;
   int levels_count = 0;
@@ -1015,7 +1015,7 @@ void VerifyDecodingLevels(Encoding::type encoding, int16_t max_level,
 }
 
 void VerifyDecodingMultipleSetData(Encoding::type encoding, int16_t max_level,
-                                   std::vector<int16_t>& input_levels,
+                                   const std::vector<int16_t>& input_levels,
                                    std::vector<std::vector<uint8_t>>& bytes) {
   LevelDecoder decoder;
   int levels_count = 0;