From 41fa699385755486db44e18c10c704e211cb4d90 Mon Sep 17 00:00:00 2001 From: Zhichang Yu Date: Sat, 1 Jun 2024 13:44:21 +0800 Subject: [PATCH] fulltext Share PostingFormat among PostingWriters (#1270) Log MemIndexRecover. Share PostingFormat among PostingWriters. Updated builder image. Fulltext benchmark improvement - [x] Refactoring --- docs/getstarted/build_from_source.md | 2 +- scripts/Dockerfile_infinity_builder_centos7 | 16 ++++----- .../Dockerfile_infinity_builder_opencloudos | 2 +- .../Dockerfile_infinity_builder_ubuntu2310 | 12 +++---- .../download_deps_infinity_builder_centos7.sh | 4 +-- scripts/infinity-deps-ubuntu2004.sh | 8 ++--- scripts/infinity-deps-ubuntu2204.sh | 8 ++--- .../invertedindex/format/doc_list_encoder.cpp | 28 +++++---------- .../format/doc_list_encoder.cppm | 8 ++--- .../format/doc_list_format_option.cppm | 28 +++++++++------ .../format/position_list_format_option.cppm | 12 ++++--- .../format/posting_list_format.cppm | 8 +++-- src/storage/invertedindex/memory_indexer.cpp | 10 +++--- src/storage/invertedindex/memory_indexer.cppm | 2 ++ src/storage/invertedindex/posting_merger.cpp | 12 +++---- src/storage/invertedindex/posting_merger.cppm | 2 +- src/storage/invertedindex/posting_writer.cpp | 13 +++---- src/storage/invertedindex/posting_writer.cppm | 5 ++- src/storage/meta/entry/table_entry.cpp | 35 +++++++++++++------ .../storage/invertedindex/column_inverter.cpp | 3 +- .../format/inmem_doc_list_decoder.cpp | 6 ++-- .../format/posting_byte_slice.cpp | 3 +- .../format/posting_byte_slice_reader.cpp | 6 ++-- .../storage/invertedindex/posting_writer.cpp | 5 +-- 24 files changed, 125 insertions(+), 113 deletions(-) diff --git a/docs/getstarted/build_from_source.md b/docs/getstarted/build_from_source.md index b3e5647dab..9e0d741457 100644 --- a/docs/getstarted/build_from_source.md +++ b/docs/getstarted/build_from_source.md @@ -70,7 +70,7 @@ sudo apt update && sudo apt install git wget unzip software-properties-common wget https://github.com/Kitware/CMake/releases/download/v3.29.0/cmake-3.29.0-linux-x86_64.tar.gz tar zxvf cmake-3.29.0-linux-x86_64.tar.gz sudo cp -rf cmake-3.29.0-linux-x86_64/bin/* /usr/local/bin && sudo cp -rf cmake-3.29.0-linux-x86_64/share/* /usr/local/share && rm -rf cmake-3.29.0-linux-x86_64 -wget https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip +wget https://github.com/ninja-build/ninja/releases/download/v1.12.1/ninja-linux.zip unzip ninja-linux.zip && sudo cp ninja /usr/local/bin && rm ninja echo 'deb https://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main' | sudo tee /etc/apt/sources.list.d/llvm17.list wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key | sudo tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc diff --git a/scripts/Dockerfile_infinity_builder_centos7 b/scripts/Dockerfile_infinity_builder_centos7 index c6941d9f14..cbbd0c4382 100644 --- a/scripts/Dockerfile_infinity_builder_centos7 +++ b/scripts/Dockerfile_infinity_builder_centos7 @@ -4,7 +4,7 @@ # bison-3.8.2.tar.xz # binutils-2.41.tar.xz # gcc-13.2.0.tar.xz -# cmake-3.29.2-linux-x86_64.tar.gz +# cmake-3.29.3-linux-x86_64.tar.gz # ninja-linux.zip # llvm-project-17.0.6.src.tar.xz # boost_1_81_0.tar.bz2 @@ -52,14 +52,14 @@ RUN --mount=type=bind,source=gcc-13.2.0.tar.xz,target=/root/gcc-13.2.0.tar.xz \ ENV LIBRARY_PATH=/usr/local/lib:/usr/local/lib64 -# Install cmake-3.29.2 -RUN --mount=type=bind,source=cmake-3.29.2-linux-x86_64.tar.gz,target=/root/cmake-3.29.2-linux-x86_64.tar.gz \ - cd /root && tar xf cmake-3.29.2-linux-x86_64.tar.gz \ - && cp -rf cmake-3.29.2-linux-x86_64/bin/* /usr/local/bin \ - && cp -rf cmake-3.29.2-linux-x86_64/share/* /usr/local/share \ - && rm -rf cmake-3.29.2-linux-x86_64 +# Install cmake-3.29.3 +RUN --mount=type=bind,source=cmake-3.29.3-linux-x86_64.tar.gz,target=/root/cmake-3.29.3-linux-x86_64.tar.gz \ + cd /root && tar xf cmake-3.29.3-linux-x86_64.tar.gz \ + && cp -rf cmake-3.29.3-linux-x86_64/bin/* /usr/local/bin \ + && cp -rf cmake-3.29.3-linux-x86_64/share/* /usr/local/share \ + && rm -rf cmake-3.29.3-linux-x86_64 -# Install ninja-1.11.1 +# Install ninja-1.12.1 RUN --mount=type=bind,source=ninja-linux.zip,target=/root/ninja-linux.zip \ cd /root && unzip ninja-linux.zip \ && cp ninja /usr/local/bin && rm ninja diff --git a/scripts/Dockerfile_infinity_builder_opencloudos b/scripts/Dockerfile_infinity_builder_opencloudos index 55ec022c3b..76533375fb 100644 --- a/scripts/Dockerfile_infinity_builder_opencloudos +++ b/scripts/Dockerfile_infinity_builder_opencloudos @@ -55,7 +55,7 @@ RUN --mount=type=bind,source=cmake-3.29.0-linux-x86_64.tar.gz,target=/root/cmake && cp -rf cmake-3.29.0-linux-x86_64/share/* /usr/local/share \ && rm -rf cmake-3.29.0-linux-x86_64 -# Install ninja-1.11.1 +# Install ninja-1.12.1 RUN --mount=type=bind,source=ninja-linux.zip,target=/root/ninja-linux.zip \ cd /root && unzip ninja-linux.zip \ && cp ninja /usr/local/bin && rm ninja diff --git a/scripts/Dockerfile_infinity_builder_ubuntu2310 b/scripts/Dockerfile_infinity_builder_ubuntu2310 index 9b4c1b066c..0655b1c93b 100644 --- a/scripts/Dockerfile_infinity_builder_ubuntu2310 +++ b/scripts/Dockerfile_infinity_builder_ubuntu2310 @@ -17,17 +17,17 @@ RUN apt install -y wget curl emacs-nox vim git build-essential ninja-build bison RUN apt install -y liblz4-dev zlib1g-dev libboost1.81-dev liburing-dev libgflags-dev libevent-dev libjemalloc-dev # CMake 3.28+ is requrired for C++20 modules. -# download https://github.com/Kitware/CMake/releases/download/v3.29.2/cmake-3.29.2-linux-x86_64.tar.gz -RUN --mount=type=bind,source=cmake-3.29.2-linux-x86_64.tar.gz,target=/root/cmake-3.29.2-linux-x86_64.tar.gz \ - cd /root && tar xzf cmake-3.29.2-linux-x86_64.tar.gz && cp -rf cmake-3.29.2-linux-x86_64/bin/* /usr/local/bin && cp -rf cmake-3.29.2-linux-x86_64/share/* /usr/local/share && rm -fr cmake-3.29.2-linux-x86_64 +# download https://github.com/Kitware/CMake/releases/download/v3.29.3/cmake-3.29.3-linux-x86_64.tar.gz +RUN --mount=type=bind,source=cmake-3.29.3-linux-x86_64.tar.gz,target=/root/cmake-3.29.3-linux-x86_64.tar.gz \ + cd /root && tar xzf cmake-3.29.3-linux-x86_64.tar.gz && cp -rf cmake-3.29.3-linux-x86_64/bin/* /usr/local/bin && cp -rf cmake-3.29.3-linux-x86_64/share/* /usr/local/share && rm -fr cmake-3.29.3-linux-x86_64 # download https://github.com/gperftools/gperftools/releases/download/gperftools-2.15/gperftools-2.15.tar.gz RUN --mount=type=bind,source=gperftools-2.15.tar.gz,target=/root/gperftools-2.15.tar.gz \ cd /root && tar xzf gperftools-2.15.tar.gz && cd gperftools-2.15 && ./configure && make -j 8 && make install && rm -fr /root/gperftools-2.15 -# download https://github.com/risinglightdb/sqllogictest-rs/releases/download/v0.20.1/sqllogictest-bin-v0.20.1-x86_64-unknown-linux-musl.tar.gz -RUN --mount=type=bind,source=sqllogictest-bin-v0.20.1-x86_64-unknown-linux-musl.tar.gz,target=/root/sqllogictest-bin-v0.20.1-x86_64-unknown-linux-musl.tar.gz \ - cd /tmp && tar xzf /root/sqllogictest-bin-v0.20.1-x86_64-unknown-linux-musl.tar.gz && cp -rf sqllogictest /usr/local/bin && rm -fr /tmp/* +# download https://github.com/risinglightdb/sqllogictest-rs/releases/download/v0.20.2/sqllogictest-bin-v0.20.2-x86_64-unknown-linux-musl.tar.gz +RUN --mount=type=bind,source=sqllogictest-bin-v0.20.2-x86_64-unknown-linux-musl.tar.gz,target=/root/sqllogictest-bin-v0.20.2-x86_64-unknown-linux-musl.tar.gz \ + cd /tmp && tar xzf /root/sqllogictest-bin-v0.20.2-x86_64-unknown-linux-musl.tar.gz && cp -rf sqllogictest /usr/local/bin && rm -fr /tmp/* # Create a python virtual environment. Set PATH so that the shell activate this virtual environment automatically when entering a container from this image. RUN python3 -m venv /usr/local/venv diff --git a/scripts/download_deps_infinity_builder_centos7.sh b/scripts/download_deps_infinity_builder_centos7.sh index 4a67092741..7d56407ab8 100644 --- a/scripts/download_deps_infinity_builder_centos7.sh +++ b/scripts/download_deps_infinity_builder_centos7.sh @@ -14,8 +14,8 @@ download() names="https://ftp.gnu.org/gnu/bison/bison-3.8.2.tar.xz https://ftp.gnu.org/gnu/binutils/binutils-2.41.tar.xz https://ftp.gnu.org/gnu/gcc/gcc-13.2.0/gcc-13.2.0.tar.xz -https://github.com/Kitware/CMake/releases/download/v3.29.0/cmake-3.29.0-linux-x86_64.tar.gz -https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip +https://github.com/Kitware/CMake/releases/download/v3.29.3/cmake-3.29.3-linux-x86_64.tar.gz +https://github.com/ninja-build/ninja/releases/download/v1.12.1/ninja-linux.zip https://github.com/llvm/llvm-project/releases/download/llvmorg-17.0.6/llvm-project-17.0.6.src.tar.xz https://boostorg.jfrog.io/artifactory/main/release/1.81.0/source/boost_1_81_0.tar.bz2 https://github.com/westes/flex/releases/download/v2.6.4/flex-2.6.4.tar.gz diff --git a/scripts/infinity-deps-ubuntu2004.sh b/scripts/infinity-deps-ubuntu2004.sh index 67c4987be4..039747255f 100644 --- a/scripts/infinity-deps-ubuntu2004.sh +++ b/scripts/infinity-deps-ubuntu2004.sh @@ -26,13 +26,13 @@ tar zxvf cmake-3.29.0-linux-x86_64.tar.gz sudo cp -rf cmake-3.29.0-linux-x86_64/bin/* /usr/local/bin && sudo cp -rf cmake-3.29.0-linux-x86_64/share/* /usr/local/share && rm -rf cmake-3.29.0-linux-x86_64 echo -echo 'step [4/9] : download ninja-1.11.1' -echo 'command: wget https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip' +echo 'step [4/9] : download ninja-1.12.1' +echo 'command: wget https://github.com/ninja-build/ninja/releases/download/v1.12.1/ninja-linux.zip' echo -wget https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip +wget https://github.com/ninja-build/ninja/releases/download/v1.12.1/ninja-linux.zip echo -echo 'step [5/9] : install ninja-1.11.1 into /usr/local/bin' +echo 'step [5/9] : install ninja-1.12.1 into /usr/local/bin' echo 'command: unzip ninja-linux.zip && sudo cp ninja /usr/local/bin && rm ninja' echo unzip ninja-linux.zip && sudo cp ninja /usr/local/bin && rm ninja diff --git a/scripts/infinity-deps-ubuntu2204.sh b/scripts/infinity-deps-ubuntu2204.sh index 0ad7c53ac9..eb1d9d177b 100644 --- a/scripts/infinity-deps-ubuntu2204.sh +++ b/scripts/infinity-deps-ubuntu2204.sh @@ -26,13 +26,13 @@ tar zxvf cmake-3.29.0-linux-x86_64.tar.gz sudo cp -rf cmake-3.29.0-linux-x86_64/bin/* /usr/local/bin && sudo cp -rf cmake-3.29.0-linux-x86_64/share/* /usr/local/share && rm -rf cmake-3.29.0-linux-x86_64 echo -echo 'step [4/9] : download ninja-1.11.1' -echo 'command: wget https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip' +echo 'step [4/9] : download ninja-1.12.1' +echo 'command: wget https://github.com/ninja-build/ninja/releases/download/v1.12.1/ninja-linux.zip' echo -wget https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip +wget https://github.com/ninja-build/ninja/releases/download/v1.12.1/ninja-linux.zip echo -echo 'step [5/9] : install ninja-1.11.1 into /usr/local/bin' +echo 'step [5/9] : install ninja-1.12.1 into /usr/local/bin' echo 'command: unzip ninja-linux.zip && sudo cp ninja /usr/local/bin && rm ninja' echo unzip ninja-linux.zip && sudo cp ninja /usr/local/bin && rm ninja diff --git a/src/storage/invertedindex/format/doc_list_encoder.cpp b/src/storage/invertedindex/format/doc_list_encoder.cpp index e85660f71e..cdde189691 100644 --- a/src/storage/invertedindex/format/doc_list_encoder.cpp +++ b/src/storage/invertedindex/format/doc_list_encoder.cpp @@ -9,32 +9,22 @@ import file_reader; import posting_byte_slice; import skiplist_writer; import skiplist_reader; -import doc_list_format_option; import inmem_doc_list_decoder; import index_defines; import vbyte_compressor; import logger; +import doc_list_format_option; namespace infinity { -DocListEncoder::DocListEncoder(const DocListFormatOption &format_option, DocListFormat *doc_list_format) - : doc_list_buffer_(), own_doc_list_format_(false), format_option_(format_option), doc_list_format_(doc_list_format), last_doc_id_(0), - current_tf_(0), total_tf_(0), df_(0), doc_skiplist_writer_(nullptr) { - if (!doc_list_format) { - doc_list_format_ = new DocListFormat; - doc_list_format_->Init(format_option); - own_doc_list_format_ = true; - } +DocListEncoder::DocListEncoder(const DocListFormat *doc_list_format) + : doc_list_buffer_(), doc_list_format_(doc_list_format), last_doc_id_(0), current_tf_(0), total_tf_(0), df_(0), doc_skiplist_writer_(nullptr) { + assert(doc_list_format != nullptr); doc_list_buffer_.Init(doc_list_format_); CreateDocSkipListWriter(); } -DocListEncoder::~DocListEncoder() { - if (own_doc_list_format_) { - delete doc_list_format_; - doc_list_format_ = nullptr; - } -} +DocListEncoder::~DocListEncoder() {} void DocListEncoder::AddPosition() { current_tf_++; @@ -57,10 +47,10 @@ void DocListEncoder::Flush() { void DocListEncoder::AddDocument(docid_t doc_id, docpayload_t doc_payload, tf_t tf, u32 doc_len) { doc_list_buffer_.PushBack(0, doc_id - last_doc_id_); int n = 1; - if (format_option_.HasTfList()) { + if (doc_list_format_->GetOption().HasTfList()) { doc_list_buffer_.PushBack(n++, tf); } - if (format_option_.HasDocPayload()) { + if (doc_list_format_->GetOption().HasDocPayload()) { doc_list_buffer_.PushBack(n++, doc_payload); } doc_list_buffer_.EndPushBack(); @@ -163,14 +153,14 @@ InMemDocListDecoder *DocListEncoder::GetInMemDocListDecoder() const { df_t df = df_; SkipListReaderPostingByteSlice *skiplist_reader = nullptr; if (doc_skiplist_writer_) { - skiplist_reader = new SkipListReaderPostingByteSlice(format_option_); + skiplist_reader = new SkipListReaderPostingByteSlice(doc_list_format_->GetOption()); skiplist_reader->Load(doc_skiplist_writer_.get()); } PostingByteSlice *doc_list_buffer = new PostingByteSlice(); doc_list_buffer_.SnapShot(doc_list_buffer); - InMemDocListDecoder *decoder = new InMemDocListDecoder(format_option_); + InMemDocListDecoder *decoder = new InMemDocListDecoder(doc_list_format_->GetOption()); decoder->Init(df, skiplist_reader, doc_list_buffer); return decoder; diff --git a/src/storage/invertedindex/format/doc_list_encoder.cppm b/src/storage/invertedindex/format/doc_list_encoder.cppm index 6e22e7e0da..bf0f36d109 100644 --- a/src/storage/invertedindex/format/doc_list_encoder.cppm +++ b/src/storage/invertedindex/format/doc_list_encoder.cppm @@ -6,9 +6,9 @@ import file_writer; import file_reader; import posting_byte_slice; import skiplist_writer; -import doc_list_format_option; import inmem_doc_list_decoder; import index_defines; +import doc_list_format_option; export module doc_list_encoder; @@ -16,8 +16,7 @@ namespace infinity { export class DocListEncoder { public: - DocListEncoder(const DocListFormatOption &format_option, - DocListFormat *doc_list_format = nullptr); + DocListEncoder(const DocListFormat *doc_list_format); ~DocListEncoder(); @@ -64,8 +63,7 @@ private: private: PostingByteSlice doc_list_buffer_; bool own_doc_list_format_; - DocListFormatOption format_option_; - DocListFormat *doc_list_format_; + const DocListFormat *doc_list_format_; docid_t last_doc_id_; docpayload_t last_doc_payload_; diff --git a/src/storage/invertedindex/format/doc_list_format_option.cppm b/src/storage/invertedindex/format/doc_list_format_option.cppm index 21d78f8d82..3a14fe9a20 100644 --- a/src/storage/invertedindex/format/doc_list_format_option.cppm +++ b/src/storage/invertedindex/format/doc_list_format_option.cppm @@ -26,7 +26,7 @@ namespace infinity { export class DocListFormatOption { public: - explicit DocListFormatOption(optionflag_t option_flag) { Init(option_flag); } + explicit DocListFormatOption(optionflag_t option_flag = OPTION_FLAG_ALL) { Init(option_flag); } ~DocListFormatOption() = default; @@ -70,10 +70,14 @@ private: export class DocSkipListFormat : public PostingFields { public: - DocSkipListFormat() = default; + explicit DocSkipListFormat(const DocListFormatOption &option) { Init(option); } ~DocSkipListFormat() = default; + bool HasTfList() const { return has_tf_list_; } + bool HasBlockMax() const { return has_block_max_; } + +private: void Init(const DocListFormatOption &option) { u8 row_count = 0; u32 offset = 0; @@ -121,9 +125,6 @@ public: } } - bool HasTfList() const { return has_tf_list_; } - bool HasBlockMax() const { return has_block_max_; } - private: bool has_tf_list_ = false; bool has_block_max_ = false; @@ -131,8 +132,7 @@ private: export class DocListFormat : public PostingFields { public: - DocListFormat(const DocListFormatOption &option) : skiplist_format_(nullptr) { Init(option); } - DocListFormat() : skiplist_format_(nullptr) {} + explicit DocListFormat(const DocListFormatOption &option) : option_(option), skiplist_format_(nullptr) { Init(option_); } ~DocListFormat() { if (skiplist_format_) { @@ -141,6 +141,10 @@ public: } }; + const DocSkipListFormat *GetDocSkipListFormat() const { return skiplist_format_; } + const DocListFormatOption GetOption() const { return option_; } + +private: void Init(const DocListFormatOption &option) { u8 row_count = 0; u32 offset = 0; @@ -166,11 +170,15 @@ public: doc_payload_value->offset_ = offset; values_.push_back(doc_payload_value); } - skiplist_format_ = new DocSkipListFormat; - skiplist_format_->Init(option); + if (skiplist_format_) { + delete skiplist_format_; + skiplist_format_ = nullptr; + } + skiplist_format_ = new DocSkipListFormat(option); } - const DocSkipListFormat *GetDocSkipListFormat() const { return skiplist_format_; } +public: + const DocListFormatOption option_{OPTION_FLAG_ALL}; private: DocSkipListFormat *skiplist_format_; diff --git a/src/storage/invertedindex/format/position_list_format_option.cppm b/src/storage/invertedindex/format/position_list_format_option.cppm index cab2bc55bf..5cb69ead59 100644 --- a/src/storage/invertedindex/format/position_list_format_option.cppm +++ b/src/storage/invertedindex/format/position_list_format_option.cppm @@ -9,7 +9,7 @@ namespace infinity { export class PositionListFormatOption { public: - explicit PositionListFormatOption(optionflag_t option_flag) { Init(option_flag); } + explicit PositionListFormatOption(optionflag_t option_flag = OPTION_FLAG_ALL) { Init(option_flag); } ~PositionListFormatOption() {} inline void Init(optionflag_t option_flag) { @@ -57,8 +57,7 @@ public: export class PositionListFormat : public PostingFields { public: - PositionListFormat(const PositionListFormatOption &option) : skiplist_format_(nullptr) { Init(option); } - PositionListFormat() : skiplist_format_(nullptr) {} + explicit PositionListFormat(const PositionListFormatOption &option) : option_(option), skiplist_format_(nullptr) { Init(option); } ~PositionListFormat() { if (skiplist_format_) { @@ -67,6 +66,10 @@ public: } }; + const PositionSkipListFormat *GetPositionSkipListFormat() const { return skiplist_format_; } + const PositionListFormatOption GetOption() const { return option_; } + +private: void Init(const PositionListFormatOption &option) { u8 row_count = 0; u32 offset = 0; @@ -82,7 +85,8 @@ public: skiplist_format_->Init(option); } - const PositionSkipListFormat *GetPositionSkipListFormat() const { return skiplist_format_; } +public: + const PositionListFormatOption option_; private: PositionSkipListFormat *skiplist_format_; diff --git a/src/storage/invertedindex/format/posting_list_format.cppm b/src/storage/invertedindex/format/posting_list_format.cppm index 89cda5c7c8..5cc3dfc346 100644 --- a/src/storage/invertedindex/format/posting_list_format.cppm +++ b/src/storage/invertedindex/format/posting_list_format.cppm @@ -12,7 +12,7 @@ namespace infinity { export class PostingFormatOption { public: - inline PostingFormatOption(optionflag_t flag) + inline PostingFormatOption(optionflag_t flag = OPTION_FLAG_ALL) : has_term_payload_(flag & of_term_payload), doc_list_format_option_(flag), pos_list_format_option_(flag) {} bool HasTfList() const { return doc_list_format_option_.HasTfList(); } @@ -45,7 +45,7 @@ private: export class PostingFormat { public: - PostingFormat(const PostingFormatOption &option) : doc_list_format_(nullptr), pos_list_format_(nullptr) { + explicit PostingFormat(const PostingFormatOption &option) : option_(option), doc_list_format_(nullptr), pos_list_format_(nullptr) { doc_list_format_ = new DocListFormat(option.GetDocListFormatOption()); if (option.HasPositionList()) { pos_list_format_ = new PositionListFormat(option.GetPosListFormatOption()); @@ -64,6 +64,10 @@ public: DocListFormat *GetDocListFormat() const { return doc_list_format_; } PositionListFormat *GetPositionListFormat() const { return pos_list_format_; } + const PostingFormatOption GetOption() const { return option_; } + +private: + const PostingFormatOption option_; private: DocListFormat *doc_list_format_; diff --git a/src/storage/invertedindex/memory_indexer.cpp b/src/storage/invertedindex/memory_indexer.cpp index 96790895e9..0a2f8b5bde 100644 --- a/src/storage/invertedindex/memory_indexer.cpp +++ b/src/storage/invertedindex/memory_indexer.cpp @@ -71,11 +71,11 @@ bool MemoryIndexer::KeyComp::operator()(const String &lhs, const String &rhs) co MemoryIndexer::PostingTable::PostingTable() {} MemoryIndexer::MemoryIndexer(const String &index_dir, const String &base_name, RowID base_row_id, optionflag_t flag, const String &analyzer) - : index_dir_(index_dir), base_name_(base_name), base_row_id_(base_row_id), flag_(flag), analyzer_(analyzer), - inverting_thread_pool_(infinity::InfinityContext::instance().GetFulltextInvertingThreadPool()), + : index_dir_(index_dir), base_name_(base_name), base_row_id_(base_row_id), flag_(flag), posting_format_(PostingFormatOption(flag_)), + analyzer_(analyzer), inverting_thread_pool_(infinity::InfinityContext::instance().GetFulltextInvertingThreadPool()), commiting_thread_pool_(infinity::InfinityContext::instance().GetFulltextCommitingThreadPool()), ring_inverted_(15UL), ring_sorted_(13UL) { posting_table_ = MakeShared(); - prepared_posting_ = MakeShared(PostingFormatOption(flag_), column_lengths_); + prepared_posting_ = MakeShared(posting_format_, column_lengths_); Path path = Path(index_dir) / (base_name + ".tmp.merge"); spill_full_path_ = path.string(); } @@ -351,7 +351,7 @@ SharedPtr MemoryIndexer::GetOrAddPosting(const String &term) { PostingPtr posting; bool found = posting_store.GetOrAdd(term, posting, prepared_posting_); if (!found) { - prepared_posting_ = MakeShared(PostingFormatOption(flag_), column_lengths_); + prepared_posting_ = MakeShared(posting_format_, column_lengths_); } return posting; } @@ -438,7 +438,7 @@ void MemoryIndexer::OfflineDump() { term_meta_dumpler.Dump(dict_file_writer, term_meta); fst_builder.Insert((u8 *)last_term.data(), last_term.length(), term_meta_offset); } - posting = MakeUnique(PostingFormatOption(flag_), column_lengths_); + posting = MakeUnique(posting_format_, column_lengths_); last_term_str = String(term); last_term = std::string_view(last_term_str); last_doc_id = INVALID_DOCID; diff --git a/src/storage/invertedindex/memory_indexer.cppm b/src/storage/invertedindex/memory_indexer.cppm index 84b3dd1a2d..139e7783c5 100644 --- a/src/storage/invertedindex/memory_indexer.cppm +++ b/src/storage/invertedindex/memory_indexer.cppm @@ -31,6 +31,7 @@ import internal_types; import map_with_lock; import vector_with_lock; import buf_writer; +import posting_list_format; namespace infinity { @@ -118,6 +119,7 @@ private: String base_name_; RowID base_row_id_{INVALID_ROWID}; optionflag_t flag_; + PostingFormat posting_format_; String analyzer_; ThreadPool &inverting_thread_pool_; ThreadPool &commiting_thread_pool_; diff --git a/src/storage/invertedindex/posting_merger.cpp b/src/storage/invertedindex/posting_merger.cpp index 26c3890e6a..5a323b3cd0 100644 --- a/src/storage/invertedindex/posting_merger.cpp +++ b/src/storage/invertedindex/posting_merger.cpp @@ -115,9 +115,8 @@ bool DocMerger::HasNext() { class PostingDumper { public: - PostingDumper(const PostingFormatOption &format_option, VectorWithLock &column_length_array) - : format_option_(format_option), column_lengths_(column_length_array) { - posting_writer_ = MakeShared(format_option, column_lengths_); + PostingDumper(const PostingFormat &posting_format, VectorWithLock &column_length_array) : column_lengths_(column_length_array) { + posting_writer_ = MakeShared(posting_format, column_lengths_); } ~PostingDumper() {} @@ -135,7 +134,6 @@ class PostingDumper { SharedPtr GetPostingWriter() { return posting_writer_; } private: - PostingFormatOption format_option_; SharedPtr posting_writer_; // for column length info VectorWithLock &column_lengths_; @@ -172,8 +170,8 @@ class SortedPosting { }; PostingMerger::PostingMerger(optionflag_t flag, VectorWithLock &column_length_array) - : format_option_(flag), column_lengths_(column_length_array) { - posting_dumper_ = MakeShared(format_option_, column_lengths_); + : posting_format_(PostingFormatOption(flag)), column_lengths_(column_length_array) { + posting_dumper_ = MakeShared(posting_format_, column_lengths_); } PostingMerger::~PostingMerger() {} @@ -185,7 +183,7 @@ void PostingMerger::Merge(const Vector& segment_term_posti RowID base_row_id = term_posting->GetBaseRowId(); u32 base_doc_id = base_row_id - merge_base_rowid; PostingDecoder *decoder = term_posting->GetPostingDecoder(); - SortedPosting sorted_posting(format_option_, base_doc_id, decoder); + SortedPosting sorted_posting(posting_format_.GetOption(), base_doc_id, decoder); while (sorted_posting.Next()) { sorted_posting.Merge(posting_dumper_); } diff --git a/src/storage/invertedindex/posting_merger.cppm b/src/storage/invertedindex/posting_merger.cppm index 4204804a99..454547388f 100644 --- a/src/storage/invertedindex/posting_merger.cppm +++ b/src/storage/invertedindex/posting_merger.cppm @@ -32,7 +32,7 @@ public: u32 GetTotalTF(); private: - PostingFormatOption format_option_; + PostingFormat posting_format_; SharedPtr posting_dumper_; df_t df_; ttf_t ttf_; diff --git a/src/storage/invertedindex/posting_writer.cpp b/src/storage/invertedindex/posting_writer.cpp index 024b62a36b..cddcba5927 100644 --- a/src/storage/invertedindex/posting_writer.cpp +++ b/src/storage/invertedindex/posting_writer.cpp @@ -18,18 +18,15 @@ module posting_writer; namespace infinity { -PostingWriter::PostingWriter(PostingFormatOption posting_option, VectorWithLock &column_lengths) - : posting_option_(posting_option), posting_format_(new PostingFormat(posting_option)), column_lengths_(column_lengths) { - if (posting_option.HasPositionList()) { - position_list_encoder_ = new PositionListEncoder(posting_option_, posting_format_->GetPositionListFormat()); +PostingWriter::PostingWriter(const PostingFormat &posting_format, VectorWithLock &column_lengths) + : posting_format_(posting_format), column_lengths_(column_lengths) { + if (posting_format.GetOption().HasPositionList()) { + position_list_encoder_ = new PositionListEncoder(posting_format.GetOption(), posting_format.GetPositionListFormat()); } - doc_list_encoder_ = new DocListEncoder(posting_option_.GetDocListFormatOption(), posting_format_->GetDocListFormat()); + doc_list_encoder_ = new DocListEncoder(posting_format.GetDocListFormat()); } PostingWriter::~PostingWriter() { - if (posting_format_) { - delete posting_format_; - } if (position_list_encoder_) { delete position_list_encoder_; } diff --git a/src/storage/invertedindex/posting_writer.cppm b/src/storage/invertedindex/posting_writer.cppm index 65e6581fa3..7d46ffbd01 100644 --- a/src/storage/invertedindex/posting_writer.cppm +++ b/src/storage/invertedindex/posting_writer.cppm @@ -17,7 +17,7 @@ import vector_with_lock; namespace infinity { export class PostingWriter { public: - PostingWriter(PostingFormatOption posting_option, VectorWithLock &column_lengths); + PostingWriter(const PostingFormat &posting_format, VectorWithLock &column_lengths); ~PostingWriter(); @@ -48,8 +48,7 @@ public: u32 GetDocColumnLength(docid_t doc_id) { return column_lengths_.Get(doc_id); } private: - PostingFormatOption posting_option_; - PostingFormat *posting_format_{nullptr}; + const PostingFormat &posting_format_; DocListEncoder *doc_list_encoder_{nullptr}; PositionListEncoder *position_list_encoder_{nullptr}; // for column length info diff --git a/src/storage/meta/entry/table_entry.cpp b/src/storage/meta/entry/table_entry.cpp index 37bd02c780..a6e94708c1 100644 --- a/src/storage/meta/entry/table_entry.cpp +++ b/src/storage/meta/entry/table_entry.cpp @@ -706,6 +706,9 @@ void TableEntry::MemIndexInsertInner(TableIndexEntry *table_index_entry, Txn *tx if ((i == dump_idx && segment_index_entry->MemIndexRowCount() >= infinity::InfinityContext::instance().config()->MemIndexCapacity()) || (i == num_ranges - 1 && segment_entry->Room() <= 0)) { SharedPtr chunk_index_entry = segment_index_entry->MemIndexDump(); + String *index_name = index_base->index_name_.get(); + String message = fmt::format("Table {}.{} index {} segment {} MemIndex dumped.", *GetDBName(), *table_name_, *index_name, seg_id); + LOG_INFO(message); if (chunk_index_entry.get() != nullptr) { chunk_index_entry->Commit(txn->CommitTS()); txn_table_store->AddChunkIndexStore(table_index_entry, chunk_index_entry.get()); @@ -747,7 +750,7 @@ void TableEntry::MemIndexCommit() { void TableEntry::MemIndexRecover(BufferManager *buffer_manager) { auto index_meta_map_guard = index_meta_map_.GetMetaMap(); - for (auto &[_, table_index_meta] : *index_meta_map_guard) { + for (auto &[index_name, table_index_meta] : *index_meta_map_guard) { auto [table_index_entry, status] = table_index_meta->GetEntryNolock(0UL, MAX_TIMESTAMP); if (!status.ok()) continue; @@ -791,20 +794,30 @@ void TableEntry::MemIndexRecover(BufferManager *buffer_manager) { } } - // Insert block entries into MemIndexer SizeT num_ranges = append_ranges.size(); - for (SizeT i = 0; i < num_ranges; i++) { - AppendRange &range = append_ranges[i]; - segment_index_entry->MemIndexInsert(block_entries[range.block_id_], - range.start_offset_, - range.row_count_, - segment_index_entry->max_ts(), - buffer_manager); + if (num_ranges > 0) { + assert(segment_entry->Room() > 0); + // Insert block entries into MemIndexer + String message = fmt::format("Table {}.{} index {} segment {} MemIndex recovering from {} block entries.", + *GetDBName(), + *table_name_, + index_name, + segment_id, + num_ranges); + LOG_INFO(message); + for (SizeT i = 0; i < num_ranges; i++) { + AppendRange &range = append_ranges[i]; + segment_index_entry->MemIndexInsert(block_entries[range.block_id_], + range.start_offset_, + range.row_count_, + segment_index_entry->max_ts(), + buffer_manager); + } + message = fmt::format("Table {}.{} index {} segment {} MemIndex recovered.", *GetDBName(), *table_name_, index_name, segment_id); + LOG_INFO(message); } if (segment_id == unsealed_id_) { table_index_entry->last_segment_ = segment_index_entry; - } else { - segment_index_entry->MemIndexDump(); } } } diff --git a/src/unit_test/storage/invertedindex/column_inverter.cpp b/src/unit_test/storage/invertedindex/column_inverter.cpp index bcb13a3c19..e55db5e13c 100644 --- a/src/unit_test/storage/invertedindex/column_inverter.cpp +++ b/src/unit_test/storage/invertedindex/column_inverter.cpp @@ -37,6 +37,7 @@ using namespace infinity; class ColumnInverterTest : public BaseTest { protected: optionflag_t flag_{OPTION_FLAG_ALL}; + PostingFormat posting_format_{PostingFormatOption(flag_)}; Map> postings_; VectorWithLock column_lengths_; @@ -57,7 +58,7 @@ class ColumnInverterTest : public BaseTest { if (it != postings_.end()) { return it->second; } - SharedPtr posting = MakeShared(PostingFormatOption(flag_), column_lengths_); + SharedPtr posting = MakeShared(posting_format_, column_lengths_); postings_[term] = posting; return posting; } diff --git a/src/unit_test/storage/invertedindex/format/inmem_doc_list_decoder.cpp b/src/unit_test/storage/invertedindex/format/inmem_doc_list_decoder.cpp index 9c81478fd8..887fff6ef8 100644 --- a/src/unit_test/storage/invertedindex/format/inmem_doc_list_decoder.cpp +++ b/src/unit_test/storage/invertedindex/format/inmem_doc_list_decoder.cpp @@ -20,8 +20,8 @@ class InMemDocListDecoderTest : public BaseTest { } void SetUp() override { - DocListFormatOption doc_list_format_option(OPTION_FLAG_NONE); - doc_list_encoder_ = new DocListEncoder(doc_list_format_option); + doc_list_format_ptr_.reset(new DocListFormat(DocListFormatOption(OPTION_FLAG_NONE))); + doc_list_encoder_ = new DocListEncoder(doc_list_format_ptr_.get()); for (SizeT i = 0; i < 128; ++i) { doc_list_encoder_->AddPosition(); @@ -106,7 +106,7 @@ class InMemDocListDecoderTest : public BaseTest { DocListFormatOption format_option(op_flag); doc_list_format_ptr_.reset(new DocListFormat(format_option)); - doc_list_encoder_ptr_.reset(new DocListEncoder(format_option, doc_list_format_ptr_.get())); + doc_list_encoder_ptr_.reset(new DocListEncoder(doc_list_format_ptr_.get())); for (uint32_t i = 0; i < doc_count; ++i) { doc_list_encoder_ptr_->AddPosition(); doc_list_encoder_ptr_->AddPosition(); diff --git a/src/unit_test/storage/invertedindex/format/posting_byte_slice.cpp b/src/unit_test/storage/invertedindex/format/posting_byte_slice.cpp index 1bee587574..67083272af 100644 --- a/src/unit_test/storage/invertedindex/format/posting_byte_slice.cpp +++ b/src/unit_test/storage/invertedindex/format/posting_byte_slice.cpp @@ -16,8 +16,7 @@ class PostingByteSliceTest : public BaseTest { ~PostingByteSliceTest() {} void SetUp() override { DocListFormatOption option(NO_TERM_FREQUENCY); - doc_list_format_.reset(new DocListFormat()); - doc_list_format_->Init(option); + doc_list_format_.reset(new DocListFormat(option)); posting_byte_slice_.reset(new PostingByteSlice()); posting_byte_slice_->Init(doc_list_format_.get()); } diff --git a/src/unit_test/storage/invertedindex/format/posting_byte_slice_reader.cpp b/src/unit_test/storage/invertedindex/format/posting_byte_slice_reader.cpp index 72a5dad371..9373cea6e6 100644 --- a/src/unit_test/storage/invertedindex/format/posting_byte_slice_reader.cpp +++ b/src/unit_test/storage/invertedindex/format/posting_byte_slice_reader.cpp @@ -17,8 +17,7 @@ class PostingByteSliceReaderTest : public BaseTest { void SetUp() override { DocListFormatOption option(NO_TERM_FREQUENCY); - doc_list_format_.reset(new DocListFormat()); - doc_list_format_->Init(option); + doc_list_format_.reset(new DocListFormat(option)); posting_byte_slice_.reset(new PostingByteSlice()); posting_byte_slice_->Init(doc_list_format_.get()); } @@ -109,8 +108,7 @@ class PostingByteSliceReaderTest : public BaseTest { TEST_F(PostingByteSliceReaderTest, test1) { using namespace infinity; DocListFormatOption option(of_none); - DocListFormat doc_list_format; - doc_list_format.Init(option); + DocListFormat doc_list_format(option); { // empty posting buffer PostingByteSlice posting_buffer; diff --git a/src/unit_test/storage/invertedindex/posting_writer.cpp b/src/unit_test/storage/invertedindex/posting_writer.cpp index 1ea7a8645c..15483477e4 100644 --- a/src/unit_test/storage/invertedindex/posting_writer.cpp +++ b/src/unit_test/storage/invertedindex/posting_writer.cpp @@ -40,6 +40,7 @@ class PostingWriterTest : public BaseTest { protected: String file_; optionflag_t flag_{OPTION_FLAG_ALL}; + PostingFormat posting_format_{flag_}; LocalFileSystem fs_; }; @@ -47,7 +48,7 @@ TEST_F(PostingWriterTest, test1) { Vector expected = {1, 3, 5, 7, 9}; VectorWithLock column_length_array(20, 10); { - SharedPtr posting = MakeShared(PostingFormatOption(flag_), column_length_array); + SharedPtr posting = MakeShared(posting_format_, column_length_array); for (u32 i = 0; i < expected.size(); ++i) { posting->AddPosition(1); @@ -62,7 +63,7 @@ TEST_F(PostingWriterTest, test1) { file_writer->Sync(); } { - SharedPtr posting = MakeShared(PostingFormatOption(flag_), column_length_array); + SharedPtr posting = MakeShared(posting_format_, column_length_array); SharedPtr file_reader = MakeShared(fs_, file_, 128000); posting->Load(file_reader);