Skip to content

Commit

Permalink
limit compression block size
Browse files Browse the repository at this point in the history
  • Loading branch information
ffacs committed Jan 29, 2024
1 parent 8f22732 commit ecaa636
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 0 deletions.
2 changes: 2 additions & 0 deletions c++/include/orc/Writer.hh
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ namespace orc {

/**
* Set the data compression block size.
* The size must be less than 1 << 23 bytes (8M). This limit comes from the
* 3-byte compression block header, which holds 1 bit for isOriginal and
* 23 bits for the block length.
*/
WriterOptions& setCompressionBlockSize(uint64_t size);

Expand Down
8 changes: 8 additions & 0 deletions c++/src/Writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,12 @@ namespace orc {

const WriterId WriterImpl::writerId = WriterId::ORC_CPP_WRITER;

// Checks writer options before a writer is built.
// Throws std::invalid_argument when the configured compression block size is
// 1 << 23 bytes (8M) or larger; returns normally otherwise.
static void validateOptions(const WriterOptions& opts) {
  // A compression block header is 3 bytes: 1 bit for isOriginal and 23 bits
  // for the length, so the block size has to stay below 1 << 23.
  constexpr int blockSizeLimit = 1 << 23;

  const auto requestedSize = opts.getCompressionBlockSize();
  if (requestedSize < blockSizeLimit) {
    return;
  }
  throw std::invalid_argument("Compression block size cannot be greater or equal than 8M");
}

WriterImpl::WriterImpl(const Type& t, OutputStream* stream, const WriterOptions& opts)
: outStream(stream), options(opts), type(t) {
streamsFactory = createStreamsFactory(options, outStream);
Expand All @@ -347,6 +353,8 @@ namespace orc {
stripesAtLastFlush = 0;
lastFlushOffset = 0;

validateOptions(opts);

useTightNumericVector = opts.getUseTightNumericVector();

// compression stream for stripe footer, file footer and metadata
Expand Down
20 changes: 20 additions & 0 deletions c++/test/TestWriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
* limitations under the License.
*/

#include "gtest/gtest.h"
#include "orc/ColumnPrinter.hh"
#include "orc/OrcFile.hh"

Expand All @@ -29,6 +30,7 @@
#include <cmath>
#include <ctime>
#include <sstream>
#include <stdexcept>

#ifdef __clang__
DIAGNOSTIC_IGNORE("-Wmissing-variable-declarations")
Expand Down Expand Up @@ -2191,6 +2193,24 @@ namespace orc {
}
}

TEST_P(WriterTest, testValidateOptions) {
  // Boundary under test: block sizes below 1 << 23 bytes are accepted, while
  // sizes equal to or above it must make writer creation throw.
  constexpr int blockSizeLimit = 1 << 23;

  MemoryOutputStream outputBuffer(DEFAULT_MEM_STREAM_SIZE);
  MemoryPool* memoryPool = getDefaultPool();
  std::unique_ptr<Type> schema(Type::buildTypeFromString("struct<col1:int>"));

  uint64_t stripeBytes = 16 * 1024;  // 16K
  auto kind = CompressionKind_NONE;

  // Largest permitted block size.
  EXPECT_NO_THROW(createWriter(stripeBytes, blockSizeLimit - 1, kind, *schema, memoryPool,
                               &outputBuffer, fileVersion));

  // Exactly at the limit: rejected.
  EXPECT_THROW(createWriter(stripeBytes, blockSizeLimit, kind, *schema, memoryPool,
                            &outputBuffer, fileVersion),
               std::invalid_argument);

  // One byte past the limit: rejected.
  EXPECT_THROW(createWriter(stripeBytes, blockSizeLimit + 1, kind, *schema, memoryPool,
                            &outputBuffer, fileVersion),
               std::invalid_argument);
}

INSTANTIATE_TEST_SUITE_P(OrcTest, WriterTest,
Values(FileVersion::v_0_11(), FileVersion::v_0_12(),
FileVersion::UNSTABLE_PRE_2_0()));
Expand Down

0 comments on commit ecaa636

Please sign in to comment.