Skip to content

Commit

Permalink
fulltext Share PostingFormat among PostingWriters (#1270)
Browse files Browse the repository at this point in the history
Log MemIndexRecover. 
Share PostingFormat among PostingWriters. 
Updated builder image. 
Fulltext benchmark improvement

- [x] Refactoring
  • Loading branch information
yuzhichang authored Jun 1, 2024
1 parent dd28326 commit 41fa699
Show file tree
Hide file tree
Showing 24 changed files with 125 additions and 113 deletions.
2 changes: 1 addition & 1 deletion docs/getstarted/build_from_source.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ sudo apt update && sudo apt install git wget unzip software-properties-common
wget https://github.com/Kitware/CMake/releases/download/v3.29.0/cmake-3.29.0-linux-x86_64.tar.gz
tar zxvf cmake-3.29.0-linux-x86_64.tar.gz
sudo cp -rf cmake-3.29.0-linux-x86_64/bin/* /usr/local/bin && sudo cp -rf cmake-3.29.0-linux-x86_64/share/* /usr/local/share && rm -rf cmake-3.29.0-linux-x86_64
wget https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip
wget https://github.com/ninja-build/ninja/releases/download/v1.12.1/ninja-linux.zip
unzip ninja-linux.zip && sudo cp ninja /usr/local/bin && rm ninja
echo 'deb https://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main' | sudo tee /etc/apt/sources.list.d/llvm17.list
wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key | sudo tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc
Expand Down
16 changes: 8 additions & 8 deletions scripts/Dockerfile_infinity_builder_centos7
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# bison-3.8.2.tar.xz
# binutils-2.41.tar.xz
# gcc-13.2.0.tar.xz
# cmake-3.29.2-linux-x86_64.tar.gz
# cmake-3.29.3-linux-x86_64.tar.gz
# ninja-linux.zip
# llvm-project-17.0.6.src.tar.xz
# boost_1_81_0.tar.bz2
Expand Down Expand Up @@ -52,14 +52,14 @@ RUN --mount=type=bind,source=gcc-13.2.0.tar.xz,target=/root/gcc-13.2.0.tar.xz \

ENV LIBRARY_PATH=/usr/local/lib:/usr/local/lib64

# Install cmake-3.29.2
RUN --mount=type=bind,source=cmake-3.29.2-linux-x86_64.tar.gz,target=/root/cmake-3.29.2-linux-x86_64.tar.gz \
cd /root && tar xf cmake-3.29.2-linux-x86_64.tar.gz \
&& cp -rf cmake-3.29.2-linux-x86_64/bin/* /usr/local/bin \
&& cp -rf cmake-3.29.2-linux-x86_64/share/* /usr/local/share \
&& rm -rf cmake-3.29.2-linux-x86_64
# Install cmake-3.29.3
RUN --mount=type=bind,source=cmake-3.29.3-linux-x86_64.tar.gz,target=/root/cmake-3.29.3-linux-x86_64.tar.gz \
cd /root && tar xf cmake-3.29.3-linux-x86_64.tar.gz \
&& cp -rf cmake-3.29.3-linux-x86_64/bin/* /usr/local/bin \
&& cp -rf cmake-3.29.3-linux-x86_64/share/* /usr/local/share \
&& rm -rf cmake-3.29.3-linux-x86_64

# Install ninja-1.11.1
# Install ninja-1.12.1
RUN --mount=type=bind,source=ninja-linux.zip,target=/root/ninja-linux.zip \
cd /root && unzip ninja-linux.zip \
&& cp ninja /usr/local/bin && rm ninja
Expand Down
2 changes: 1 addition & 1 deletion scripts/Dockerfile_infinity_builder_opencloudos
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ RUN --mount=type=bind,source=cmake-3.29.0-linux-x86_64.tar.gz,target=/root/cmake
&& cp -rf cmake-3.29.0-linux-x86_64/share/* /usr/local/share \
&& rm -rf cmake-3.29.0-linux-x86_64

# Install ninja-1.11.1
# Install ninja-1.12.1
RUN --mount=type=bind,source=ninja-linux.zip,target=/root/ninja-linux.zip \
cd /root && unzip ninja-linux.zip \
&& cp ninja /usr/local/bin && rm ninja
Expand Down
12 changes: 6 additions & 6 deletions scripts/Dockerfile_infinity_builder_ubuntu2310
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,17 @@ RUN apt install -y wget curl emacs-nox vim git build-essential ninja-build bison
RUN apt install -y liblz4-dev zlib1g-dev libboost1.81-dev liburing-dev libgflags-dev libevent-dev libjemalloc-dev

# CMake 3.28+ is required for C++20 modules.
# download https://github.com/Kitware/CMake/releases/download/v3.29.2/cmake-3.29.2-linux-x86_64.tar.gz
RUN --mount=type=bind,source=cmake-3.29.2-linux-x86_64.tar.gz,target=/root/cmake-3.29.2-linux-x86_64.tar.gz \
cd /root && tar xzf cmake-3.29.2-linux-x86_64.tar.gz && cp -rf cmake-3.29.2-linux-x86_64/bin/* /usr/local/bin && cp -rf cmake-3.29.2-linux-x86_64/share/* /usr/local/share && rm -fr cmake-3.29.2-linux-x86_64
# download https://github.com/Kitware/CMake/releases/download/v3.29.3/cmake-3.29.3-linux-x86_64.tar.gz
RUN --mount=type=bind,source=cmake-3.29.3-linux-x86_64.tar.gz,target=/root/cmake-3.29.3-linux-x86_64.tar.gz \
cd /root && tar xzf cmake-3.29.3-linux-x86_64.tar.gz && cp -rf cmake-3.29.3-linux-x86_64/bin/* /usr/local/bin && cp -rf cmake-3.29.3-linux-x86_64/share/* /usr/local/share && rm -fr cmake-3.29.3-linux-x86_64

# download https://github.com/gperftools/gperftools/releases/download/gperftools-2.15/gperftools-2.15.tar.gz
RUN --mount=type=bind,source=gperftools-2.15.tar.gz,target=/root/gperftools-2.15.tar.gz \
cd /root && tar xzf gperftools-2.15.tar.gz && cd gperftools-2.15 && ./configure && make -j 8 && make install && rm -fr /root/gperftools-2.15

# download https://github.com/risinglightdb/sqllogictest-rs/releases/download/v0.20.1/sqllogictest-bin-v0.20.1-x86_64-unknown-linux-musl.tar.gz
RUN --mount=type=bind,source=sqllogictest-bin-v0.20.1-x86_64-unknown-linux-musl.tar.gz,target=/root/sqllogictest-bin-v0.20.1-x86_64-unknown-linux-musl.tar.gz \
cd /tmp && tar xzf /root/sqllogictest-bin-v0.20.1-x86_64-unknown-linux-musl.tar.gz && cp -rf sqllogictest /usr/local/bin && rm -fr /tmp/*
# download https://github.com/risinglightdb/sqllogictest-rs/releases/download/v0.20.2/sqllogictest-bin-v0.20.2-x86_64-unknown-linux-musl.tar.gz
RUN --mount=type=bind,source=sqllogictest-bin-v0.20.2-x86_64-unknown-linux-musl.tar.gz,target=/root/sqllogictest-bin-v0.20.2-x86_64-unknown-linux-musl.tar.gz \
cd /tmp && tar xzf /root/sqllogictest-bin-v0.20.2-x86_64-unknown-linux-musl.tar.gz && cp -rf sqllogictest /usr/local/bin && rm -fr /tmp/*

# Create a Python virtual environment. Set PATH so that the shell activates this virtual environment automatically when entering a container from this image.
RUN python3 -m venv /usr/local/venv
Expand Down
4 changes: 2 additions & 2 deletions scripts/download_deps_infinity_builder_centos7.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ download()
names="https://ftp.gnu.org/gnu/bison/bison-3.8.2.tar.xz
https://ftp.gnu.org/gnu/binutils/binutils-2.41.tar.xz
https://ftp.gnu.org/gnu/gcc/gcc-13.2.0/gcc-13.2.0.tar.xz
https://github.com/Kitware/CMake/releases/download/v3.29.0/cmake-3.29.0-linux-x86_64.tar.gz
https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip
https://github.com/Kitware/CMake/releases/download/v3.29.3/cmake-3.29.3-linux-x86_64.tar.gz
https://github.com/ninja-build/ninja/releases/download/v1.12.1/ninja-linux.zip
https://github.com/llvm/llvm-project/releases/download/llvmorg-17.0.6/llvm-project-17.0.6.src.tar.xz
https://boostorg.jfrog.io/artifactory/main/release/1.81.0/source/boost_1_81_0.tar.bz2
https://github.com/westes/flex/releases/download/v2.6.4/flex-2.6.4.tar.gz
Expand Down
8 changes: 4 additions & 4 deletions scripts/infinity-deps-ubuntu2004.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@ tar zxvf cmake-3.29.0-linux-x86_64.tar.gz
sudo cp -rf cmake-3.29.0-linux-x86_64/bin/* /usr/local/bin && sudo cp -rf cmake-3.29.0-linux-x86_64/share/* /usr/local/share && rm -rf cmake-3.29.0-linux-x86_64

echo
echo 'step [4/9] : download ninja-1.11.1'
echo 'command: wget https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip'
echo 'step [4/9] : download ninja-1.12.1'
echo 'command: wget https://github.com/ninja-build/ninja/releases/download/v1.12.1/ninja-linux.zip'
echo
wget https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip
wget https://github.com/ninja-build/ninja/releases/download/v1.12.1/ninja-linux.zip

echo
echo 'step [5/9] : install ninja-1.11.1 into /usr/local/bin'
echo 'step [5/9] : install ninja-1.12.1 into /usr/local/bin'
echo 'command: unzip ninja-linux.zip && sudo cp ninja /usr/local/bin && rm ninja'
echo
unzip ninja-linux.zip && sudo cp ninja /usr/local/bin && rm ninja
Expand Down
8 changes: 4 additions & 4 deletions scripts/infinity-deps-ubuntu2204.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@ tar zxvf cmake-3.29.0-linux-x86_64.tar.gz
sudo cp -rf cmake-3.29.0-linux-x86_64/bin/* /usr/local/bin && sudo cp -rf cmake-3.29.0-linux-x86_64/share/* /usr/local/share && rm -rf cmake-3.29.0-linux-x86_64

echo
echo 'step [4/9] : download ninja-1.11.1'
echo 'command: wget https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip'
echo 'step [4/9] : download ninja-1.12.1'
echo 'command: wget https://github.com/ninja-build/ninja/releases/download/v1.12.1/ninja-linux.zip'
echo
wget https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-linux.zip
wget https://github.com/ninja-build/ninja/releases/download/v1.12.1/ninja-linux.zip

echo
echo 'step [5/9] : install ninja-1.11.1 into /usr/local/bin'
echo 'step [5/9] : install ninja-1.12.1 into /usr/local/bin'
echo 'command: unzip ninja-linux.zip && sudo cp ninja /usr/local/bin && rm ninja'
echo
unzip ninja-linux.zip && sudo cp ninja /usr/local/bin && rm ninja
Expand Down
28 changes: 9 additions & 19 deletions src/storage/invertedindex/format/doc_list_encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,32 +9,22 @@ import file_reader;
import posting_byte_slice;
import skiplist_writer;
import skiplist_reader;
import doc_list_format_option;
import inmem_doc_list_decoder;
import index_defines;
import vbyte_compressor;
import logger;
import doc_list_format_option;

namespace infinity {

DocListEncoder::DocListEncoder(const DocListFormatOption &format_option, DocListFormat *doc_list_format)
: doc_list_buffer_(), own_doc_list_format_(false), format_option_(format_option), doc_list_format_(doc_list_format), last_doc_id_(0),
current_tf_(0), total_tf_(0), df_(0), doc_skiplist_writer_(nullptr) {
if (!doc_list_format) {
doc_list_format_ = new DocListFormat;
doc_list_format_->Init(format_option);
own_doc_list_format_ = true;
}
DocListEncoder::DocListEncoder(const DocListFormat *doc_list_format)
: doc_list_buffer_(), doc_list_format_(doc_list_format), last_doc_id_(0), current_tf_(0), total_tf_(0), df_(0), doc_skiplist_writer_(nullptr) {
assert(doc_list_format != nullptr);
doc_list_buffer_.Init(doc_list_format_);
CreateDocSkipListWriter();
}

DocListEncoder::~DocListEncoder() {
if (own_doc_list_format_) {
delete doc_list_format_;
doc_list_format_ = nullptr;
}
}
DocListEncoder::~DocListEncoder() {}

void DocListEncoder::AddPosition() {
current_tf_++;
Expand All @@ -57,10 +47,10 @@ void DocListEncoder::Flush() {
void DocListEncoder::AddDocument(docid_t doc_id, docpayload_t doc_payload, tf_t tf, u32 doc_len) {
doc_list_buffer_.PushBack(0, doc_id - last_doc_id_);
int n = 1;
if (format_option_.HasTfList()) {
if (doc_list_format_->GetOption().HasTfList()) {
doc_list_buffer_.PushBack(n++, tf);
}
if (format_option_.HasDocPayload()) {
if (doc_list_format_->GetOption().HasDocPayload()) {
doc_list_buffer_.PushBack(n++, doc_payload);
}
doc_list_buffer_.EndPushBack();
Expand Down Expand Up @@ -163,14 +153,14 @@ InMemDocListDecoder *DocListEncoder::GetInMemDocListDecoder() const {
df_t df = df_;
SkipListReaderPostingByteSlice *skiplist_reader = nullptr;
if (doc_skiplist_writer_) {
skiplist_reader = new SkipListReaderPostingByteSlice(format_option_);
skiplist_reader = new SkipListReaderPostingByteSlice(doc_list_format_->GetOption());
skiplist_reader->Load(doc_skiplist_writer_.get());
}

PostingByteSlice *doc_list_buffer = new PostingByteSlice();
doc_list_buffer_.SnapShot(doc_list_buffer);

InMemDocListDecoder *decoder = new InMemDocListDecoder(format_option_);
InMemDocListDecoder *decoder = new InMemDocListDecoder(doc_list_format_->GetOption());
decoder->Init(df, skiplist_reader, doc_list_buffer);

return decoder;
Expand Down
8 changes: 3 additions & 5 deletions src/storage/invertedindex/format/doc_list_encoder.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,17 @@ import file_writer;
import file_reader;
import posting_byte_slice;
import skiplist_writer;
import doc_list_format_option;
import inmem_doc_list_decoder;
import index_defines;
import doc_list_format_option;

export module doc_list_encoder;

namespace infinity {

export class DocListEncoder {
public:
DocListEncoder(const DocListFormatOption &format_option,
DocListFormat *doc_list_format = nullptr);
DocListEncoder(const DocListFormat *doc_list_format);

~DocListEncoder();

Expand Down Expand Up @@ -64,8 +63,7 @@ private:
private:
PostingByteSlice doc_list_buffer_;
bool own_doc_list_format_;
DocListFormatOption format_option_;
DocListFormat *doc_list_format_;
const DocListFormat *doc_list_format_;

docid_t last_doc_id_;
docpayload_t last_doc_payload_;
Expand Down
28 changes: 18 additions & 10 deletions src/storage/invertedindex/format/doc_list_format_option.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ namespace infinity {

export class DocListFormatOption {
public:
explicit DocListFormatOption(optionflag_t option_flag) { Init(option_flag); }
explicit DocListFormatOption(optionflag_t option_flag = OPTION_FLAG_ALL) { Init(option_flag); }

~DocListFormatOption() = default;

Expand Down Expand Up @@ -70,10 +70,14 @@ private:

export class DocSkipListFormat : public PostingFields {
public:
DocSkipListFormat() = default;
explicit DocSkipListFormat(const DocListFormatOption &option) { Init(option); }

~DocSkipListFormat() = default;

bool HasTfList() const { return has_tf_list_; }
bool HasBlockMax() const { return has_block_max_; }

private:
void Init(const DocListFormatOption &option) {
u8 row_count = 0;
u32 offset = 0;
Expand Down Expand Up @@ -121,18 +125,14 @@ public:
}
}

bool HasTfList() const { return has_tf_list_; }
bool HasBlockMax() const { return has_block_max_; }

private:
bool has_tf_list_ = false;
bool has_block_max_ = false;
};

export class DocListFormat : public PostingFields {
public:
DocListFormat(const DocListFormatOption &option) : skiplist_format_(nullptr) { Init(option); }
DocListFormat() : skiplist_format_(nullptr) {}
explicit DocListFormat(const DocListFormatOption &option) : option_(option), skiplist_format_(nullptr) { Init(option_); }

~DocListFormat() {
if (skiplist_format_) {
Expand All @@ -141,6 +141,10 @@ public:
}
};

const DocSkipListFormat *GetDocSkipListFormat() const { return skiplist_format_; }
const DocListFormatOption GetOption() const { return option_; }

private:
void Init(const DocListFormatOption &option) {
u8 row_count = 0;
u32 offset = 0;
Expand All @@ -166,11 +170,15 @@ public:
doc_payload_value->offset_ = offset;
values_.push_back(doc_payload_value);
}
skiplist_format_ = new DocSkipListFormat;
skiplist_format_->Init(option);
if (skiplist_format_) {
delete skiplist_format_;
skiplist_format_ = nullptr;
}
skiplist_format_ = new DocSkipListFormat(option);
}

const DocSkipListFormat *GetDocSkipListFormat() const { return skiplist_format_; }
public:
const DocListFormatOption option_{OPTION_FLAG_ALL};

private:
DocSkipListFormat *skiplist_format_;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ namespace infinity {

export class PositionListFormatOption {
public:
explicit PositionListFormatOption(optionflag_t option_flag) { Init(option_flag); }
explicit PositionListFormatOption(optionflag_t option_flag = OPTION_FLAG_ALL) { Init(option_flag); }
~PositionListFormatOption() {}

inline void Init(optionflag_t option_flag) {
Expand Down Expand Up @@ -57,8 +57,7 @@ public:

export class PositionListFormat : public PostingFields {
public:
PositionListFormat(const PositionListFormatOption &option) : skiplist_format_(nullptr) { Init(option); }
PositionListFormat() : skiplist_format_(nullptr) {}
explicit PositionListFormat(const PositionListFormatOption &option) : option_(option), skiplist_format_(nullptr) { Init(option); }

~PositionListFormat() {
if (skiplist_format_) {
Expand All @@ -67,6 +66,10 @@ public:
}
};

const PositionSkipListFormat *GetPositionSkipListFormat() const { return skiplist_format_; }
const PositionListFormatOption GetOption() const { return option_; }

private:
void Init(const PositionListFormatOption &option) {
u8 row_count = 0;
u32 offset = 0;
Expand All @@ -82,7 +85,8 @@ public:
skiplist_format_->Init(option);
}

const PositionSkipListFormat *GetPositionSkipListFormat() const { return skiplist_format_; }
public:
const PositionListFormatOption option_;

private:
PositionSkipListFormat *skiplist_format_;
Expand Down
8 changes: 6 additions & 2 deletions src/storage/invertedindex/format/posting_list_format.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ namespace infinity {

export class PostingFormatOption {
public:
inline PostingFormatOption(optionflag_t flag)
inline PostingFormatOption(optionflag_t flag = OPTION_FLAG_ALL)
: has_term_payload_(flag & of_term_payload), doc_list_format_option_(flag), pos_list_format_option_(flag) {}

bool HasTfList() const { return doc_list_format_option_.HasTfList(); }
Expand Down Expand Up @@ -45,7 +45,7 @@ private:

export class PostingFormat {
public:
PostingFormat(const PostingFormatOption &option) : doc_list_format_(nullptr), pos_list_format_(nullptr) {
explicit PostingFormat(const PostingFormatOption &option) : option_(option), doc_list_format_(nullptr), pos_list_format_(nullptr) {
doc_list_format_ = new DocListFormat(option.GetDocListFormatOption());
if (option.HasPositionList()) {
pos_list_format_ = new PositionListFormat(option.GetPosListFormatOption());
Expand All @@ -64,6 +64,10 @@ public:

DocListFormat *GetDocListFormat() const { return doc_list_format_; }
PositionListFormat *GetPositionListFormat() const { return pos_list_format_; }
const PostingFormatOption GetOption() const { return option_; }

private:
const PostingFormatOption option_;

private:
DocListFormat *doc_list_format_;
Expand Down
10 changes: 5 additions & 5 deletions src/storage/invertedindex/memory_indexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,11 +71,11 @@ bool MemoryIndexer::KeyComp::operator()(const String &lhs, const String &rhs) co
MemoryIndexer::PostingTable::PostingTable() {}

MemoryIndexer::MemoryIndexer(const String &index_dir, const String &base_name, RowID base_row_id, optionflag_t flag, const String &analyzer)
: index_dir_(index_dir), base_name_(base_name), base_row_id_(base_row_id), flag_(flag), analyzer_(analyzer),
inverting_thread_pool_(infinity::InfinityContext::instance().GetFulltextInvertingThreadPool()),
: index_dir_(index_dir), base_name_(base_name), base_row_id_(base_row_id), flag_(flag), posting_format_(PostingFormatOption(flag_)),
analyzer_(analyzer), inverting_thread_pool_(infinity::InfinityContext::instance().GetFulltextInvertingThreadPool()),
commiting_thread_pool_(infinity::InfinityContext::instance().GetFulltextCommitingThreadPool()), ring_inverted_(15UL), ring_sorted_(13UL) {
posting_table_ = MakeShared<PostingTable>();
prepared_posting_ = MakeShared<PostingWriter>(PostingFormatOption(flag_), column_lengths_);
prepared_posting_ = MakeShared<PostingWriter>(posting_format_, column_lengths_);
Path path = Path(index_dir) / (base_name + ".tmp.merge");
spill_full_path_ = path.string();
}
Expand Down Expand Up @@ -351,7 +351,7 @@ SharedPtr<PostingWriter> MemoryIndexer::GetOrAddPosting(const String &term) {
PostingPtr posting;
bool found = posting_store.GetOrAdd(term, posting, prepared_posting_);
if (!found) {
prepared_posting_ = MakeShared<PostingWriter>(PostingFormatOption(flag_), column_lengths_);
prepared_posting_ = MakeShared<PostingWriter>(posting_format_, column_lengths_);
}
return posting;
}
Expand Down Expand Up @@ -438,7 +438,7 @@ void MemoryIndexer::OfflineDump() {
term_meta_dumpler.Dump(dict_file_writer, term_meta);
fst_builder.Insert((u8 *)last_term.data(), last_term.length(), term_meta_offset);
}
posting = MakeUnique<PostingWriter>(PostingFormatOption(flag_), column_lengths_);
posting = MakeUnique<PostingWriter>(posting_format_, column_lengths_);
last_term_str = String(term);
last_term = std::string_view(last_term_str);
last_doc_id = INVALID_DOCID;
Expand Down
Loading

0 comments on commit 41fa699

Please sign in to comment.