From 79918f5df8bb7be0e2d41b68b7cc950e741c83dc Mon Sep 17 00:00:00 2001 From: Weilong Ma <65322990+Ma-cat@users.noreply.github.com> Date: Wed, 8 May 2024 12:37:31 +0800 Subject: [PATCH] support phrase query with block max (#1189) ### What problem does this PR solve? * support phrase query with block max * fix skiplist reader error in reading position Issue link:#639 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature - [x] Test cases --- .../fulltext/fulltext_benchmark.cpp | 10 +- python/benchmark/configs/infinity_enwiki.json | 2 +- src/executor/operator/physical_match.cpp | 49 +-- .../format/position_list_decoder.cpp | 8 +- .../invertedindex/format/skiplist_reader.cppm | 14 +- .../invertedindex/multi_posting_decoder.cpp | 1 + .../invertedindex/multi_posting_decoder.cppm | 4 + .../invertedindex/search/bm25_ranker.cpp | 22 +- .../invertedindex/search/bm25_ranker.cppm | 2 +- .../blockmax_and_iterator.cpp | 3 +- .../blockmax_maxscore_iterator.cppm | 2 +- .../blockmax_phrase_doc_iterator.cpp | 296 ++++++++++++++++-- .../blockmax_phrase_doc_iterator.cppm | 88 +++++- .../blockmax_term_doc_iterator.cppm | 1 + .../early_terminate_iterator.cppm | 2 + .../invertedindex/search/match_data.cpp | 7 +- .../search/phrase_doc_iterator.cpp | 5 + .../search/phrase_doc_iterator.cppm | 15 +- .../invertedindex/search/query_node.cpp | 17 +- .../invertedindex/search/search_driver.cpp | 4 +- src/storage/invertedindex/segment_posting.cpp | 3 - .../storage/invertedindex/memory_indexer.cpp | 4 + .../invertedindex/search/query_match.cpp | 7 +- test/sql/dql/fulltext.slt | 13 + 24 files changed, 435 insertions(+), 144 deletions(-) diff --git a/benchmark/local_infinity/fulltext/fulltext_benchmark.cpp b/benchmark/local_infinity/fulltext/fulltext_benchmark.cpp index 9c14414132..4f102f724c 100644 --- a/benchmark/local_infinity/fulltext/fulltext_benchmark.cpp +++ b/benchmark/local_infinity/fulltext/fulltext_benchmark.cpp @@ -222,8 +222,8 @@ void BenchmarkOptimize(SharedPtr infinity, const String &db_name, cons void BenchmarkQuery(SharedPtr infinity, const String &db_name, const String &table_name) { std::string fields = "text"; - //std::vector query_vec = {"one of", "is", "a", "\"is a\"", "\"one of\""};// {"Animalia", "Algorithms", "Animalia Algorithms", "network space", "harmful chemical anarchism"}; - std::vector query_vec = {"harmful chemical anarchism", "\"harmful chemical\"", "\"one of\"", "harmful chemical"}; + std::vector query_vec = {"harmful \"social custom\"", "social custom \"harmful chemical\"", "\"annual American awards\"", "harmful chemical", "\"one of\""}; + for (auto match_text : query_vec) { BaseProfiler profiler; profiler.Begin(); @@ -314,7 +314,7 @@ int main(int argc, char *argv[]) { }; Mode mode(Mode::kInsert); SizeT insert_batch = 500; - app.add_option("--mode", mode, "Bencmark mode, one of insert, import, merge, query") + app.add_option("--mode", mode, "Benchmark mode, one of insert, import, merge, query") ->required() ->transform(CLI::CheckedTransformer(mode_map, CLI::ignore_case)); app.add_option("--insert-batch", insert_batch, "batch size of each insert, valid only at insert and merge mode, default value 500"); @@ -331,7 +331,7 @@ int main(int argc, char *argv[]) { String srcfile = test_data_path(); srcfile += "/benchmark/dbpedia-entity/corpus.jsonl"; -// #define DEL_LOCAL_DATA +//#define DEL_LOCAL_DATA #ifdef DEL_LOCAL_DATA system("rm -rf /var/infinity/data /var/infinity/wal /var/infinity/log /var/infinity/tmp"); #endif @@ -358,7 +358,7 @@ int main(int argc, char *argv[]) { case Mode::kQuery: { BenchmarkCreateIndex(infinity, db_name, table_name, index_name); BenchmarkInsert(infinity, db_name, table_name, srcfile, insert_batch); - // BenchmarkOptimize(infinity, db_name, table_name); + BenchmarkOptimize(infinity, db_name, table_name); sleep(10); BenchmarkMoreQuery(infinity, db_name, table_name, 1); break; diff --git a/python/benchmark/configs/infinity_enwiki.json b/python/benchmark/configs/infinity_enwiki.json index f69ebb25cf..235a84c0a4 100644 --- a/python/benchmark/configs/infinity_enwiki.json +++ b/python/benchmark/configs/infinity_enwiki.json @@ -9,7 +9,7 @@ "query_link": "to_be_set", "mode": "fulltext", "topK": 10, - "use_import": false, + "use_import": true, "schema": { "doctitle": {"type": "varchar", "default":""}, "docdate": {"type": "varchar", "default":""}, diff --git a/src/executor/operator/physical_match.cpp b/src/executor/operator/physical_match.cpp index 1c9216bd74..e891589d9d 100644 --- a/src/executor/operator/physical_match.cpp +++ b/src/executor/operator/physical_match.cpp @@ -494,45 +494,6 @@ void ASSERT_FLOAT_EQ(float bar, u32 i, float a, float b) { } } -void AnalyzeFunc(const String &analyzer_name, String &&text, TermList &output_terms) { - UniquePtr analyzer = AnalyzerPool::instance().Get(analyzer_name); - // (dynamic_cast(analyzer.get()))->SetExtractEngStem(false); - if (analyzer.get() == nullptr) { - RecoverableError(Status::UnexpectedError(fmt::format("Invalid analyzer: {}", analyzer_name))); - } - Term input_term; - input_term.text_ = std::move(text); - TermList temp_output_terms; - analyzer->Analyze(input_term, temp_output_terms); - if (analyzer_name == AnalyzerPool::STANDARD) { - // remove duplicates and only keep the root words for query - const u32 INVALID_TERM_OFFSET = -1; - Term last_term; - last_term.word_offset_ = INVALID_TERM_OFFSET; - for (const Term &term : temp_output_terms) { - if (last_term.word_offset_ != INVALID_TERM_OFFSET) { - assert(term.word_offset_ >= last_term.word_offset_); - } - if (last_term.word_offset_ != term.word_offset_) { - if (last_term.word_offset_ != INVALID_TERM_OFFSET) { - output_terms.emplace_back(last_term); - } - last_term.text_ = term.text_; - last_term.word_offset_ = term.word_offset_; - last_term.stats_ = term.stats_; - } else { - if (term.text_.size() < last_term.text_.size()) { - last_term.text_ = term.text_; - last_term.stats_ = term.stats_; - } - } - } - if (last_term.word_offset_ != INVALID_TERM_OFFSET) { - output_terms.emplace_back(last_term); - } - } -} - void ExecuteFTSearch(UniquePtr &et_iter, FullTextScoreResultHeap &result_heap, u32 &blockmax_loop_cnt) { if (et_iter) { while (true) { @@ -584,12 +545,7 @@ bool PhysicalMatch::ExecuteInnerHomebrewed(QueryContext *query_context, Operator if (!query_tree) { RecoverableError(Status::ParseMatchExprFailed(match_expr_->fields_, match_expr_->matching_text_)); } - if (query_tree->type_ == QueryNodeType::PHRASE) { - // TODO: make sure there is no problem with block max phrase and delete this code - // LOG_INFO(fmt::format("Block max phrase not supported, use ordinary iterator, query: {}", match_expr_->matching_text_)); - use_block_max_iter = false; - use_ordinary_iter = true; - } + auto finish_parse_query_tree_time = std::chrono::high_resolution_clock::now(); TimeDurationType parse_query_tree_duration = finish_parse_query_tree_time - finish_init_query_builder_time; LOG_TRACE(fmt::format("PhysicalMatch Part 0.2: Parse QueryNode tree time: {} ms", parse_query_tree_duration.count())); @@ -868,8 +824,9 @@ bool PhysicalMatch::Execute(QueryContext *query_context, OperatorState *operator SharedPtr> PhysicalMatch::GetOutputNames() const { SharedPtr> result_names = MakeShared>(); result_names->reserve(base_table_ref_->column_names_->size() + 2); - for (auto &name : *base_table_ref_->column_names_) + for (auto &name : *base_table_ref_->column_names_) { result_names->emplace_back(name); + } result_names->emplace_back(COLUMN_NAME_SCORE); result_names->emplace_back(COLUMN_NAME_ROW_ID); return result_names; diff --git a/src/storage/invertedindex/format/position_list_decoder.cpp b/src/storage/invertedindex/format/position_list_decoder.cpp index fc55cad2f7..433aa05f6f 100644 --- a/src/storage/invertedindex/format/position_list_decoder.cpp +++ b/src/storage/invertedindex/format/position_list_decoder.cpp @@ -47,8 +47,8 @@ void PositionListDecoder::InitPositionSkipList(const ByteSliceList *pos_list, state->SetRecordOffset(pos_skiplist_end); } else { pos_skiplist_reader_ = session_pool_ ? new ((session_pool_)->Allocate(sizeof(SkipListReaderByteSlice))) - SkipListReaderByteSlice(option_.GetDocListFormatOption()) - : new SkipListReaderByteSlice(option_.GetDocListFormatOption()); + SkipListReaderByteSlice(option_.GetPosListFormatOption()) + : new SkipListReaderByteSlice(option_.GetPosListFormatOption()); skiplist_reader_real_size_ = sizeof(SkipListReaderByteSlice); static_cast(pos_skiplist_reader_)->Load(pos_list, pos_skiplist_start, pos_skiplist_end); decoded_pos_count_ = 0; @@ -66,8 +66,8 @@ void PositionListDecoder::InitPositionSkipList(ByteSlice *pos_list, state->SetRecordOffset(pos_skiplist_end); } else { pos_skiplist_reader_ = session_pool_ ? new ((session_pool_)->Allocate(sizeof(SkipListReaderByteSlice))) - SkipListReaderByteSlice(option_.GetDocListFormatOption()) - : new SkipListReaderByteSlice(option_.GetDocListFormatOption()); + SkipListReaderByteSlice(option_.GetPosListFormatOption()) + : new SkipListReaderByteSlice(option_.GetPosListFormatOption()); skiplist_reader_real_size_ = sizeof(SkipListReaderByteSlice); static_cast(pos_skiplist_reader_)->Load(pos_list, pos_skiplist_start, pos_skiplist_end); decoded_pos_count_ = 0; diff --git a/src/storage/invertedindex/format/skiplist_reader.cppm b/src/storage/invertedindex/format/skiplist_reader.cppm index f2e5294717..3650eaca58 100644 --- a/src/storage/invertedindex/format/skiplist_reader.cppm +++ b/src/storage/invertedindex/format/skiplist_reader.cppm @@ -9,12 +9,15 @@ import doc_list_format_option; import memory_pool; import posting_byte_slice; import posting_byte_slice_reader; +import position_list_format_option; namespace infinity { export class SkipListReader { public: - explicit SkipListReader(const DocListFormatOption &doc_list_format_option) : doc_list_format_option_(doc_list_format_option) { + explicit SkipListReader(const DocListFormatOption &doc_list_format_option) + : has_tf_list_(doc_list_format_option.HasTfList()), + has_block_max_(doc_list_format_option.HasBlockMax()) { if (has_tf_list_) { ttf_buffer_ = MakeUnique(SKIP_LIST_BUFFER_SIZE); } @@ -24,6 +27,8 @@ public: } } + explicit SkipListReader(const PositionListFormatOption &doc_list_format_option) {} + virtual ~SkipListReader() = default; bool SkipTo(u32 query_doc_id, u32 &doc_id, u32 &prev_doc_id, u32 &offset, u32 &delta); @@ -55,9 +60,8 @@ public: protected: virtual Pair LoadBuffer() = 0; - DocListFormatOption doc_list_format_option_; - const bool has_tf_list_ = doc_list_format_option_.HasTfList(); - const bool has_block_max_ = doc_list_format_option_.HasBlockMax(); + const bool has_tf_list_ = false; + const bool has_block_max_ = false; i32 skipped_item_count_ = 0; u32 current_doc_id_ = 0; u32 current_offset_ = 0; @@ -80,6 +84,8 @@ export class SkipListReaderByteSlice final : public SkipListReader { public: explicit SkipListReaderByteSlice(const DocListFormatOption &doc_list_format_option) : SkipListReader(doc_list_format_option) {} + explicit SkipListReaderByteSlice(const PositionListFormatOption &pos_list_format_option) : SkipListReader(pos_list_format_option) {} + void Load(const ByteSliceList *byte_slice_list, u32 start, u32 end); void Load(ByteSlice *byteSlice, u32 start, u32 end); diff --git a/src/storage/invertedindex/multi_posting_decoder.cpp b/src/storage/invertedindex/multi_posting_decoder.cpp index 918e7c0641..f278094cee 100644 --- a/src/storage/invertedindex/multi_posting_decoder.cpp +++ b/src/storage/invertedindex/multi_posting_decoder.cpp @@ -201,6 +201,7 @@ bool MultiPostingDecoder::DiskSegMoveToSegment(SegmentPosting &cur_segment_posti u32 doc_skiplist_end = doc_skiplist_start + doc_skiplist_size; index_decoder_->InitSkipList(doc_skiplist_start, doc_skiplist_end, posting_list, term_meta.GetDocFreq()); + if (format_option_.HasPositionList()) { u32 pos_list_begin = doc_list_reader.Tell() + doc_skiplist_size + doc_list_size; in_doc_state_keeper_.MoveToSegment(posting_list, term_meta.GetTotalTermFreq(), pos_list_begin, format_option_); diff --git a/src/storage/invertedindex/multi_posting_decoder.cppm b/src/storage/invertedindex/multi_posting_decoder.cppm index b8828a4b61..bfc11a2cab 100644 --- a/src/storage/invertedindex/multi_posting_decoder.cppm +++ b/src/storage/invertedindex/multi_posting_decoder.cppm @@ -71,9 +71,13 @@ private: } bool MoveToSegment(RowID start_row_id); + bool MemSegMoveToSegment(const SharedPtr &posting_writer); + bool DiskSegMoveToSegment(SegmentPosting &cur_segment_posting); + IndexDecoder *CreateDocIndexDecoder(u32 doc_list_begin_pos); + private: PostingFormatOption format_option_; bool need_decode_doc_id_ = false; diff --git a/src/storage/invertedindex/search/bm25_ranker.cpp b/src/storage/invertedindex/search/bm25_ranker.cpp index 90d88f6305..81bfec86e0 100644 --- a/src/storage/invertedindex/search/bm25_ranker.cpp +++ b/src/storage/invertedindex/search/bm25_ranker.cpp @@ -20,6 +20,7 @@ module bm25_ranker; import stl; import index_defines; +import third_party; namespace infinity { @@ -34,23 +35,10 @@ void BM25Ranker::AddTermParam(u64 tf, u64 df, float avg_column_len, u32 column_l score_ += smooth_idf * smooth_tf * weight; } -void BM25Ranker::AddPhraseParam(const Vector& all_tf, - const Vector& all_df, - float avg_column_len, - u32 column_len, - float weight, - SizeT term_num) { - for (SizeT i = 0; i < term_num; ++i) { - u64 tf = 0; - u64 df = 0; - if (i < all_tf.size()) { - tf = all_tf[i]; - } - if (i < all_df.size()) { - df = all_df[i]; - } - AddTermParam(tf, df, avg_column_len, column_len, weight); - } +void BM25Ranker::AddPhraseParam(tf_t tf, u64 df, float avg_column_len, u32 column_len, float weight) { + float smooth_idf = std::log(1.0F + (total_df_ - df + 0.5F) / (df + 0.5F)); + float smooth_tf = (k1 + 1.0F) * tf / (tf + k1 * (1.0F - b + b * column_len / avg_column_len)); + score_ += smooth_idf * smooth_tf * weight; } } // namespace infinity diff --git a/src/storage/invertedindex/search/bm25_ranker.cppm b/src/storage/invertedindex/search/bm25_ranker.cppm index 777108b731..07190b751e 100644 --- a/src/storage/invertedindex/search/bm25_ranker.cppm +++ b/src/storage/invertedindex/search/bm25_ranker.cppm @@ -27,7 +27,7 @@ public: void AddTermParam(u64 tf, u64 df, float avg_column_len, u32 column_len, float weight); - void AddPhraseParam(const Vector& all_tf, const Vector& all_df, float avg_column_len, u32 column_len, float weight, SizeT term_num); + void AddPhraseParam(tf_t tf, u64 df, float avg_colum_len, u32 column_len, float weight); float GetScore() { return score_; } diff --git a/src/storage/invertedindex/search/early_terminate_iterator/blockmax_and_iterator.cpp b/src/storage/invertedindex/search/early_terminate_iterator/blockmax_and_iterator.cpp index e3271882c7..e67fb0a6d6 100644 --- a/src/storage/invertedindex/search/early_terminate_iterator/blockmax_and_iterator.cpp +++ b/src/storage/invertedindex/search/early_terminate_iterator/blockmax_and_iterator.cpp @@ -21,6 +21,7 @@ import stl; import index_defines; import early_terminate_iterator; import internal_types; +import third_party; namespace infinity { @@ -79,7 +80,7 @@ bool BlockMaxAndIterator::BlockSkipTo(RowID doc_id, float threshold) { sum_score += sorted_iterators_[j - 1]->BlockMaxBM25Score(); common_block_max_bm25_score_parts_[j - 1] = prev_sum_score; } - assert((sum_score <= bm25_score_upper_bound_)); + // assert((sum_score <= bm25_score_upper_bound_)); if (sum_score >= threshold) { common_block_max_bm25_score_ = sum_score; common_block_min_possible_doc_id_ = doc_id; diff --git a/src/storage/invertedindex/search/early_terminate_iterator/blockmax_maxscore_iterator.cppm b/src/storage/invertedindex/search/early_terminate_iterator/blockmax_maxscore_iterator.cppm index a5ee298c99..524a966dd3 100644 --- a/src/storage/invertedindex/search/early_terminate_iterator/blockmax_maxscore_iterator.cppm +++ b/src/storage/invertedindex/search/early_terminate_iterator/blockmax_maxscore_iterator.cppm @@ -90,4 +90,4 @@ private: Vector> must_have_history_; }; -} // namespace infinity +} // namespace infinity \ No newline at end of file diff --git a/src/storage/invertedindex/search/early_terminate_iterator/blockmax_phrase_doc_iterator.cpp b/src/storage/invertedindex/search/early_terminate_iterator/blockmax_phrase_doc_iterator.cpp index 051e53c6e6..7d6aca1da6 100644 --- a/src/storage/invertedindex/search/early_terminate_iterator/blockmax_phrase_doc_iterator.cpp +++ b/src/storage/invertedindex/search/early_terminate_iterator/blockmax_phrase_doc_iterator.cpp @@ -1,6 +1,8 @@ module; #include +#include +#include module blockmax_phrase_doc_iterator; @@ -16,59 +18,155 @@ import third_party; namespace infinity { bool BlockMaxPhraseDocIterator::BlockSkipTo(RowID doc_id, float threshold) { - for (auto &iter : term_doc_iters_) { - if (!iter->BlockSkipTo(doc_id, threshold)) { - return false; + if (threshold > BM25ScoreUpperBound()) [[unlikely]] { + return false; + } + while (true) { + for (u32 i = 0; i < pos_iters_.size(); ++i) { + const auto &iter = pos_iters_[i]; + if (!iter->SkipTo(doc_id)) { + doc_id_ = INVALID_ROWID; + return false; + } + term_doc_id_[i] = doc_id; + } + if (BlockMaxBM25Score() >= threshold) { + return true; } + doc_id = BlockLastDocID() + 1; } - return true; } -RowID BlockMaxPhraseDocIterator::BlockMinPossibleDocID() const { return term_doc_iters_[0]->BlockMinPossibleDocID(); } +RowID BlockMaxPhraseDocIterator::BlockMinPossibleDocID() const { return pos_iters_[0]->BlockLowestPossibleDocID(); } -RowID BlockMaxPhraseDocIterator::BlockLastDocID() const { return term_doc_iters_[0]->BlockLastDocID(); } +RowID BlockMaxPhraseDocIterator::BlockLastDocID() const { return TermBlockLastDocID(0); } -float BlockMaxPhraseDocIterator::BlockMaxBM25Score() { return term_doc_iters_[0]->BlockMaxBM25Score(); } +void BlockMaxPhraseDocIterator::SeekDoc(RowID doc_id, RowID seek_end) { + assert(pos_iters_.size() > 0); + bool need_loop = true; + while (need_loop) { + if (doc_id == INVALID_ROWID || doc_id > seek_end) { + break; + } + RowID max_doc_id = pos_iters_[0]->SeekDoc(doc_id); + term_doc_id_[0] = max_doc_id; + need_loop = false; + for (SizeT i = 1; i < pos_iters_.size(); ++i) { + auto& iter = pos_iters_[i]; + auto tmp_doc_id = iter->SeekDoc(doc_id); + term_doc_id_[i] = tmp_doc_id; + if (max_doc_id != tmp_doc_id) { + max_doc_id = std::max(max_doc_id, tmp_doc_id); + need_loop = true; + break; + } + } + doc_id = max_doc_id; + } + doc_id_ = doc_id; +} -Tuple BlockMaxPhraseDocIterator::SeekInBlockRange(RowID doc_id, RowID doc_id_no_beyond, float threshold) { - Vector> all_res; - for (auto &iter : term_doc_iters_) { - auto res = iter->SeekInBlockRange(doc_id, doc_id_no_beyond, threshold); - if (!std::get<0>(res)) { - return {false, 0.0F, INVALID_ROWID}; +bool BlockMaxPhraseDocIterator::CheckBeginPosition(pos_t position) { + pos_t now_position = position; + for (SizeT i = 1; i < pos_iters_.size(); ++i) { + auto& iter = pos_iters_[i]; + pos_t next_position = 0; + iter->SeekPosition(now_position, next_position); + if (next_position != now_position + 1) { + return false; } - all_res.push_back(res); + now_position = next_position; } - if (all_res.empty()) { - return {false, 0.0F, INVALID_ROWID}; + return true; +} + +bool BlockMaxPhraseDocIterator::GetPhraseMatchData(PhraseColumnMatchData &match_data, RowID doc_id) { + if (doc_id != doc_id_) { + return false; } - return all_res[0]; + auto& iter = pos_iters_[0]; + pos_t beg_position = 0; + match_data.tf_ = 0; + while (true) { + pos_t position = INVALID_POSITION; + iter->SeekPosition(beg_position, position); + if (position == INVALID_POSITION) { + break; + } + if (CheckBeginPosition(position)) { + match_data.begin_positions_.push_back(position); + match_data.tf_ += 1; + } + beg_position = position + 1; + } + match_data.doc_id_ = doc_id_; + if (match_data.begin_positions_.size()) { + for (SizeT i = 0; i < pos_iters_.size(); ++i) { + auto& iter_temp = pos_iters_[i]; + match_data.all_tf_.emplace_back(iter_temp->GetCurrentTF()); + match_data.all_doc_payload_.emplace_back(iter_temp->GetCurrentDocPayload()); + } + if (all_doc_ids_.count(doc_id_) == 0) { + all_doc_ids_.insert(doc_id_); + doc_freq_++; + phrase_freq_ += match_data.begin_positions_.size(); + } + return true; + } + return false; } -Pair BlockMaxPhraseDocIterator::PeekInBlockRange(RowID doc_id, RowID doc_id_no_beyond) { - Vector> all_res; - for (auto &iter : term_doc_iters_) { - auto res = iter->PeekInBlockRange(doc_id, doc_id_no_beyond); - if (!std::get<0>(res)) { - return {false, INVALID_ROWID}; +Tuple BlockMaxPhraseDocIterator::SeekInBlockRange(RowID doc_id, RowID doc_id_no_beyond, float threshold) { + if (threshold > BlockMaxBM25Score()) [[unlikely]] { + return {false, 0.0F, INVALID_ROWID}; + } + const RowID block_last = BlockLastDocID(); + const RowID seek_end = std::min(doc_id_no_beyond, block_last); + while (doc_id <= seek_end) { + SeekDoc(doc_id, seek_end); + doc_id = doc_id_; + // assert((doc_id <= block_last)); + if (doc_id > seek_end) { + return {false, 0.0F, INVALID_ROWID}; + } + PhraseColumnMatchData phrase_match_data; + if (GetPhraseMatchData(phrase_match_data, doc_id)) { + current_phrase_freq_ = phrase_match_data.tf_; + if (const float score = BM25Score(); score >= threshold) { + return {true, score, doc_id}; + } } - all_res.push_back(res); + ++doc_id; } - if (all_res.empty()) { + return {false, 0.0F, INVALID_ROWID}; +} + +Pair BlockMaxPhraseDocIterator::SeekInBlockRange(RowID doc_id, RowID doc_id_no_beyond) { + const RowID block_last = BlockLastDocID(); + const RowID seek_end = std::min(doc_id_no_beyond, block_last); + if (doc_id > seek_end) { return {false, INVALID_ROWID}; } - return all_res[0]; + SeekDoc(doc_id, seek_end); + doc_id = doc_id_; + if (doc_id > seek_end) { + return {false, INVALID_ROWID}; + } + PhraseColumnMatchData phrase_match_data; + if (GetPhraseMatchData(phrase_match_data, doc_id)) { + current_phrase_freq_ = phrase_match_data.tf_; + return {true, doc_id}; + } + return {false, INVALID_ROWID}; } -void BlockMaxPhraseDocIterator::InitBM25Info(u64 total_df, float avg_column_len, FullTextColumnLengthReader *column_length_reader) { - for (auto &iter : term_doc_iters_) { - iter->InitBM25Info(total_df, avg_column_len, column_length_reader); - } +Pair BlockMaxPhraseDocIterator::PeekInBlockRange(RowID doc_id, RowID doc_id_no_beyond) { + return TermPeekInBlockRange(0, doc_id, doc_id_no_beyond); } bool BlockMaxPhraseDocIterator::NotPartCheckExist(RowID doc_id) { - for (auto &doc_iter : term_doc_iters_) { - if (!doc_iter->NotPartCheckExist(doc_id)) { + for (SizeT i = 0; i < pos_iters_.size(); ++i) { + if (!TermNotPartCheckExist(i, doc_id)) { return false; } } @@ -91,4 +189,136 @@ void BlockMaxPhraseDocIterator::PrintTree(std::ostream &os, const String &prefix os << '\n'; } -} \ No newline at end of file +// BM25 parameters +constexpr float k1 = 1.2F; +constexpr float b = 0.75F; + +void BlockMaxPhraseDocIterator::InitBM25Info(u64 total_df, float avg_column_len, FullTextColumnLengthReader *column_length_reader) { + avg_column_len_ = avg_column_len; + + column_length_reader_ = column_length_reader; + float smooth_idf = std::log(1.0F + (total_df - estimate_doc_freq_ + 0.5F) / (estimate_doc_freq_ + 0.5F)); + bm25_common_score_ = weight_ * smooth_idf * (k1 + 1.0F); + bm25_score_upper_bound_ = bm25_common_score_ / (1.0F + k1 * b / avg_column_len); + + for (SizeT i = 0; i < pos_iters_.size(); ++i) { + TermInitBM25Info(i, total_df, avg_column_len, column_length_reader); + } +} + +float BlockMaxPhraseDocIterator::BM25Score() { + auto tf = current_phrase_freq_; + auto doc_len = column_length_reader_->GetColumnLength(doc_id_); + return bm25_common_score_ * tf / (tf + k1 * (1.0F - b + b * doc_len / avg_column_len_)); +} + +float BlockMaxPhraseDocIterator::BlockMaxBM25Score() { + if (auto last_doc_id = BlockLastDocID(); last_doc_id != block_max_bm25_score_cache_end_id_) { + block_max_bm25_score_cache_end_id_ = last_doc_id; + // bm25_common_score_ / (1.0F + k1 * ((1.0F - b) / block_max_tf + b / block_max_percentage / avg_column_len)); + auto [block_max_tf, block_max_percentage_u16] = GetBlockMaxInfo(); + block_max_bm25_score_cache_ = + bm25_common_score_ / + (1.0F + k1 * ((1.0F - b) / block_max_tf + b * std::numeric_limits::max() / (block_max_percentage_u16 * avg_column_len_))); + } + + for (SizeT i = 0; i < pos_iters_.size(); ++i) { + TermBlockMaxBM25Score(i); + } + + return block_max_bm25_score_cache_; +} + +// for term block operator +float BlockMaxPhraseDocIterator::TermBlockMaxBM25Score(u32 term_id) { + if (auto last_doc_id = TermBlockLastDocID(term_id); last_doc_id == term_block_max_bm25_score_cache_end_id_[term_id]) { + return term_block_max_bm25_score_cache_[term_id]; + } else { + term_block_max_bm25_score_cache_end_id_[term_id] = last_doc_id; + // bm25_common_score_ / (1.0F + k1 * ((1.0F - b) / block_max_tf + b / block_max_percentage / avg_column_len)); + auto [block_max_tf, block_max_percentage_u16] = TermGetBlockMaxInfo(term_id); + return term_block_max_bm25_score_cache_[term_id] = + term_bm25_common_score_[term_id] / + (1.0F + k1 * ((1.0F - b) / block_max_tf + b * std::numeric_limits::max() / (block_max_percentage_u16 * avg_column_len_))); + } +} + +Pair BlockMaxPhraseDocIterator::TermPeekInBlockRange(u32 term_id, RowID doc_id, RowID doc_id_no_beyond) { + const RowID seek_end = std::min(doc_id_no_beyond, TermBlockLastDocID(term_id)); + if (doc_id > seek_end) { + return {false, INVALID_ROWID}; + } + // check cache + if (term_peek_doc_id_range_start_[term_id] <= doc_id) { + if (const RowID peek_cache = term_peek_doc_id_val_[term_id]; peek_cache >= doc_id) { + if (peek_cache <= seek_end) { + return {true, peek_cache}; + } + if (term_peek_doc_id_range_end_[term_id] >= seek_end) { + return {false, INVALID_ROWID}; + } + } + } + // need to decode + Pair result = pos_iters_[term_id]->PeekInBlockRange(doc_id, seek_end); + // update cache + term_peek_doc_id_range_start_[term_id] = doc_id; + term_peek_doc_id_range_end_[term_id] = seek_end; + term_peek_doc_id_val_[term_id] = result.second; + return result; +} + +Tuple BlockMaxPhraseDocIterator::TermSeekInBlockRange(u32 term_id, RowID doc_id, RowID doc_id_no_beyond, float threshold) { + if (threshold > TermBlockMaxBM25Score(term_id)) [[unlikely]] { + return {false, 0.0F, INVALID_ROWID}; + } + const RowID block_last = TermBlockLastDocID(term_id); + const RowID seek_end = std::min(doc_id_no_beyond, block_last); + while (doc_id <= seek_end) { + doc_id = pos_iters_[term_id]->SeekDoc(doc_id); + term_doc_id_[term_id] = doc_id; + assert((doc_id <= block_last)); + if (doc_id > seek_end) { + return {false, 0.0F, INVALID_ROWID}; + } + if (const float score = TermBM25Score(term_id); score >= threshold) { + return {true, score, doc_id}; + } + ++doc_id; + } + return {false, 0.0F, INVALID_ROWID}; +} + +float BlockMaxPhraseDocIterator::TermBM25Score(u32 term_id) { + // bm25_common_score_ * tf / (tf + k1 * (1.0F - b + b * column_len / avg_column_len)); + auto tf = pos_iters_[term_id]->GetCurrentTF(); + auto doc_len = term_column_length_reader_[term_id]->GetColumnLength(term_doc_id_[term_id]); + return term_bm25_common_score_[term_id] * tf / (tf + k1 * (1.0F - b + b * doc_len / avg_column_len_)); +} + +float BlockMaxPhraseDocIterator::TermBM25Score(infinity::u32 term_id, infinity::tf_t phrase_freq) { + // bm25_common_score_ * tf / (tf + k1 * (1.0F - b + b * column_len / avg_column_len)); + auto tf = phrase_freq; + auto doc_len = term_column_length_reader_[term_id]->GetColumnLength(term_doc_id_[term_id]); + return term_bm25_common_score_[term_id] * tf / (tf + k1 * (1.0F - b + b * doc_len / avg_column_len_)); +} + +void BlockMaxPhraseDocIterator::TermInitBM25Info(u32 term_id, u64 total_df, float avg_column_len, FullTextColumnLengthReader *column_length_reader) { + term_column_length_reader_[term_id] = column_length_reader; + float smooth_idf = std::log(1.0F + (total_df - estimate_doc_freq_ + 0.5F) / (estimate_doc_freq_ + 0.5F)); + term_bm25_common_score_[term_id] = weight_ * smooth_idf * (k1 + 1.0F); + term_bm25_score_upper_bound_[term_id] = term_bm25_common_score_[term_id] / (1.0F + k1 * b / avg_column_len); + bm25_score_upper_bound_ = term_bm25_score_upper_bound_[0]; +} + +bool BlockMaxPhraseDocIterator::TermNotPartCheckExist(u32 term_id, RowID doc_id) { + const RowID seek_result = pos_iters_[term_id]->SeekDoc(doc_id); + term_doc_id_[term_id] = seek_result; + return seek_result == doc_id; +} + +RowID BlockMaxPhraseDocIterator::TermBlockLastDocID(u32 term_id) const { + return pos_iters_[term_id]->BlockLastDocID(); +} + +} // namespace infinity diff --git a/src/storage/invertedindex/search/early_terminate_iterator/blockmax_phrase_doc_iterator.cppm b/src/storage/invertedindex/search/early_terminate_iterator/blockmax_phrase_doc_iterator.cppm index 19c07b7c17..cda8076cb2 100644 --- a/src/storage/invertedindex/search/early_terminate_iterator/blockmax_phrase_doc_iterator.cppm +++ b/src/storage/invertedindex/search/early_terminate_iterator/blockmax_phrase_doc_iterator.cppm @@ -9,13 +9,38 @@ import early_terminate_iterator; import phrase_doc_iterator; import blockmax_term_doc_iterator; import column_length_io; +import posting_iterator; +import match_data; +import third_party; namespace infinity { export class BlockMaxPhraseDocIterator final : public EarlyTerminateIterator { public: - BlockMaxPhraseDocIterator(Vector> &&iters, u64 column_id) - : term_doc_iters_(std::move(iters)), column_id_(column_id) {} + BlockMaxPhraseDocIterator(Vector> &&iters, float weight) + : pos_iters_(std::move(iters)), weight_(weight) { + auto iter_size = pos_iters_.size(); + term_column_length_reader_.resize(iter_size, nullptr); + term_block_max_bm25_score_cache_.resize(iter_size, 0.0f); + term_bm25_common_score_.resize(iter_size, 0.0f); + term_block_max_bm25_score_cache_end_id_.resize(iter_size, INVALID_ROWID); + term_bm25_score_upper_bound_.resize(iter_size, 0.0f); + term_column_length_reader_.resize(iter_size, nullptr); + // cache for PeekInBlockRange + term_peek_doc_id_range_start_.resize(iter_size, INVALID_ROWID); + term_peek_doc_id_range_end_.resize(iter_size, INVALID_ROWID); + term_peek_doc_id_val_.resize(iter_size, INVALID_ROWID); + term_doc_id_.resize(iter_size, INVALID_ROWID); + block_last_doc_id_ = 0; + if (iter_size) { + estimate_doc_freq_ = pos_iters_[0]->GetDocFreq(); + } else { + estimate_doc_freq_ = 0; + } + for (SizeT i = 0; i < pos_iters_.size(); ++i) { + estimate_doc_freq_ = std::min(estimate_doc_freq_, pos_iters_[i]->GetDocFreq()); + } + } void UpdateScoreThreshold(float threshold) override {} // do nothing @@ -27,26 +52,77 @@ public: float BlockMaxBM25Score() override; - Pair SeekInBlockRange(RowID doc_id, RowID doc_id_no_beyond) override { return {}; } + Pair SeekInBlockRange(RowID doc_id, RowID doc_id_no_beyond) override; + Tuple SeekInBlockRange(RowID doc_id, RowID doc_id_no_beyond, float threshold) override; - float BM25Score() override { return 0.0f; } Pair PeekInBlockRange(RowID doc_id, RowID doc_id_no_beyond) override; + Pair GetBlockMaxInfo() const { return TermGetBlockMaxInfo(0); } + bool NotPartCheckExist(RowID doc_id) override; void PrintTree(std::ostream &os, const String &prefix, bool is_final) const override; void InitBM25Info(u64 total_df, float avg_column_len, FullTextColumnLengthReader *column_length_reader); + float BM25Score() override; + + bool CheckBeginPosition(pos_t position); + + bool GetPhraseMatchData(PhraseColumnMatchData &match_data, RowID doc_id); + // debug info const Vector *terms_ptr_ = nullptr; const String *column_name_ptr_ = nullptr; private: - Vector> term_doc_iters_; - u64 column_id_; + float TermBlockMaxBM25Score(u32 term_id); + + Pair TermGetBlockMaxInfo(u32 term_id) const { return pos_iters_[term_id]->GetBlockMaxInfo(); } + + Pair TermPeekInBlockRange(u32 term_id, RowID doc_id, RowID doc_id_no_beyond); + + RowID TermBlockLastDocID(u32 term_id) const; + + Tuple TermSeekInBlockRange(u32 term_id, RowID doc_id, RowID doc_id_no_beyond, float threshold); + + float TermBM25Score(u32 term_id); + + float TermBM25Score(u32 term_id, tf_t phrase_freq); + + void TermInitBM25Info(u32 term_id, u64 total_df, float avg_column_len, FullTextColumnLengthReader *column_length_reader); + + bool TermNotPartCheckExist(u32 term_id, RowID doc_id); + + void SeekDoc(RowID doc_id, RowID seek_end); +private: + float avg_column_len_ = 0; + Vector> term_doc_iters_{}; + Vector> pos_iters_{}; + u64 phrase_freq_{0}; + u64 current_phrase_freq_{0}; + Set all_doc_ids_{}; + FullTextColumnLengthReader* column_length_reader_{nullptr}; + u32 estimate_doc_freq_{0}; float weight_ = 1.0f; + float bm25_common_score_ = 0.0f; + float block_max_bm25_score_cache_ = 0.0f; + RowID block_max_bm25_score_cache_end_id_{INVALID_ROWID}; + RowID block_last_doc_id_{INVALID_DOCID}; + + // info for sub terms + Vector term_column_length_reader_; + Vector term_block_max_bm25_score_cache_; + Vector term_bm25_common_score_; + Vector term_bm25_score_upper_bound_; + Vector term_doc_id_; + Vector term_block_max_bm25_score_cache_end_id_; + + // cache for PeekInBlockRange + Vector term_peek_doc_id_range_start_; + Vector term_peek_doc_id_range_end_; + Vector term_peek_doc_id_val_; }; } \ No newline at end of file diff --git a/src/storage/invertedindex/search/early_terminate_iterator/blockmax_term_doc_iterator.cppm b/src/storage/invertedindex/search/early_terminate_iterator/blockmax_term_doc_iterator.cppm index 2fd4eff91c..1dd1eee91c 100644 --- a/src/storage/invertedindex/search/early_terminate_iterator/blockmax_term_doc_iterator.cppm +++ b/src/storage/invertedindex/search/early_terminate_iterator/blockmax_term_doc_iterator.cppm @@ -56,6 +56,7 @@ public: float BlockMaxBM25Score() override; Pair SeekInBlockRange(RowID doc_id, RowID doc_id_no_beyond) override; + Tuple SeekInBlockRange(RowID doc_id, RowID doc_id_no_beyond, float threshold) override; Pair PeekInBlockRange(RowID doc_id, RowID doc_id_no_beyond) override; diff --git a/src/storage/invertedindex/search/early_terminate_iterator/early_terminate_iterator.cppm b/src/storage/invertedindex/search/early_terminate_iterator/early_terminate_iterator.cppm index 57447cccb2..ee4e4b294c 100644 --- a/src/storage/invertedindex/search/early_terminate_iterator/early_terminate_iterator.cppm +++ b/src/storage/invertedindex/search/early_terminate_iterator/early_terminate_iterator.cppm @@ -57,7 +57,9 @@ public: // if seek failed in current block, return false, doc_id_ may be unchanged or changed // if seek succeed in current block, return true, doc_id_ is updated virtual Pair SeekInBlockRange(RowID doc_id, RowID doc_id_no_beyond) = 0; + virtual Tuple SeekInBlockRange(RowID doc_id, RowID doc_id_no_beyond, float threshold) = 0; + virtual float BM25Score() = 0; virtual Pair PeekInBlockRange(RowID doc_id, RowID doc_id_no_beyond) = 0; diff --git a/src/storage/invertedindex/search/match_data.cpp b/src/storage/invertedindex/search/match_data.cpp index bb198fae73..8cb52abc25 100644 --- a/src/storage/invertedindex/search/match_data.cpp +++ b/src/storage/invertedindex/search/match_data.cpp @@ -88,12 +88,11 @@ float Scorer::Score(RowID doc_id) { PhraseColumnMatchData column_match_data; PhraseDocIterator* phrase_iter = dynamic_cast(column_iters[j]); if (phrase_iter->GetPhraseMatchData(column_match_data, doc_id)) { - ranker.AddPhraseParam(column_match_data.all_tf_, - phrase_iter->GetAllDF(), + ranker.AddPhraseParam(column_match_data.tf_, + phrase_iter->GetEstimateDF(), avg_column_length, column_len, - phrase_iter->GetWeight(), - column_iters.size()); + phrase_iter->GetWeight()); } } } diff --git a/src/storage/invertedindex/search/phrase_doc_iterator.cpp b/src/storage/invertedindex/search/phrase_doc_iterator.cpp index 9c0f11a7c6..da878e479b 100644 --- a/src/storage/invertedindex/search/phrase_doc_iterator.cpp +++ b/src/storage/invertedindex/search/phrase_doc_iterator.cpp @@ -36,6 +36,9 @@ namespace infinity { doc_id = max_doc_id; } doc_id_ = doc_id; + for (SizeT i = 0; i < iters_.size(); ++i) { + iters_[i]->SeekDoc(doc_id_); + } } void PhraseDocIterator::PrintTree(std::ostream &os, const String &prefix, bool is_final) const { @@ -73,6 +76,7 @@ namespace infinity { } auto& iter = iters_[0]; pos_t beg_position = 0; + match_data.tf_ = 0; while (true) { pos_t position = INVALID_POSITION; iter->SeekPosition(beg_position, position); @@ -81,6 +85,7 @@ namespace infinity { } if (CheckBeginPosition(position)) { match_data.begin_positions_.push_back(position); + ++(match_data.tf_); } beg_position = position + 1; } diff --git a/src/storage/invertedindex/search/phrase_doc_iterator.cppm b/src/storage/invertedindex/search/phrase_doc_iterator.cppm index 6947400ab9..b8438e1d65 100644 --- a/src/storage/invertedindex/search/phrase_doc_iterator.cppm +++ b/src/storage/invertedindex/search/phrase_doc_iterator.cppm @@ -13,14 +13,19 @@ import index_defines; namespace infinity { export class PhraseDocIterator final : public DocIterator { public: - PhraseDocIterator(Vector> &&iters, u64 column_id, float weight, u32 slop = 1) - : iters_(std::move(iters)), column_id_(column_id), weight_(weight), slop_(slop) { + PhraseDocIterator(Vector> &&iters, float weight, u32 slop = 1) + : iters_(std::move(iters)), weight_(weight), slop_(slop) { doc_ids_.resize(iters_.size()); doc_freq_ = 0; phrase_freq_ = 0; + if (iters_.size()) { + estimate_doc_freq_ = iters_[0]->GetDocFreq(); + } else { + estimate_doc_freq_ = 0; + } for (SizeT i = 0; i < iters_.size(); ++i) { all_df_.push_back(iters_[i]->GetDocFreq()); - // doc_freq_ = std::max(doc_freq_, iters_[i]->GetDocFreq()); + estimate_doc_freq_ = std::min(estimate_doc_freq_, iters_[i]->GetDocFreq()); } } @@ -28,6 +33,8 @@ public: u32 GetDF() const override; + u32 GetEstimateDF() const { return estimate_doc_freq_; } + void PrintTree(std::ostream &os, const String &prefix, bool is_final) const override; float GetWeight() const { return weight_; } @@ -49,8 +56,8 @@ public: private: Vector> iters_; Vector doc_ids_; - u64 column_id_; u32 doc_freq_{0}; + u32 estimate_doc_freq_{0}; u64 phrase_freq_{0}; Set all_doc_ids_{}; float weight_; diff --git a/src/storage/invertedindex/search/query_node.cpp b/src/storage/invertedindex/search/query_node.cpp index e68eb77572..152c0d94a8 100644 --- a/src/storage/invertedindex/search/query_node.cpp +++ b/src/storage/invertedindex/search/query_node.cpp @@ -359,7 +359,6 @@ std::unique_ptr AndNotQueryNode::InnerGetNewOptimizedQueryTree() { } // create search iterator - std::unique_ptr TermQueryNode::CreateSearch(const TableEntry *table_entry, IndexReader &index_reader, Scorer *scorer) const { ColumnID column_id = table_entry->GetColumnIdByName(column_); ColumnIndexReader *column_index_reader = index_reader.GetColumnIndexReader(column_id); @@ -430,7 +429,7 @@ std::unique_ptr PhraseQueryNode::CreateSearch(const TableEntry *tab } posting_iterators.emplace_back(std::move(posting_iterator)); } - auto search = MakeUnique(std::move(posting_iterators), column_id, GetWeight()); + auto search = MakeUnique(std::move(posting_iterators), GetWeight()); search->terms_ptr_ = &terms_; search->column_name_ptr_ = &column_; @@ -454,16 +453,16 @@ PhraseQueryNode::CreateEarlyTerminateSearch(const TableEntry *table_entry, Index fetch_position = true; } - Vector> term_doc_iterators; - for (auto &term : terms_) { - auto term_doc_iterator = column_index_reader->LookupBlockMax(term, index_reader.session_pool_.get(), GetWeight(), fetch_position); - if (nullptr == term_doc_iterator) { + Vector> posting_iterators; + for (auto& term : terms_) { + auto posting_iterator = column_index_reader->Lookup(term, index_reader.session_pool_.get(), fetch_position); + if (nullptr == posting_iterator) { + fmt::print("not found term = {}\n", term); return nullptr; } - term_doc_iterators.emplace_back(std::move(term_doc_iterator)); + posting_iterators.emplace_back(std::move(posting_iterator)); } - - auto search = MakeUnique(std::move(term_doc_iterators), column_id); + auto search = MakeUnique(std::move(posting_iterators), GetWeight()); if (!search) { return nullptr; } diff --git a/src/storage/invertedindex/search/search_driver.cpp b/src/storage/invertedindex/search/search_driver.cpp index 36594939da..42f2e39080 100644 --- a/src/storage/invertedindex/search/search_driver.cpp +++ b/src/storage/invertedindex/search/search_driver.cpp @@ -198,6 +198,7 @@ std::unique_ptr SearchDriver::AnalyzeAndBuildQueryNode(const std::str result->column_ = field; return result; } else { + /* fmt::print("Create Or Query Node\n"); auto result = std::make_unique(); for (auto &term : terms) { @@ -207,7 +208,7 @@ std::unique_ptr SearchDriver::AnalyzeAndBuildQueryNode(const std::str result->Add(std::move(subquery)); } return result; - /* + */ // create phrase query node auto result = std::make_unique(); for (auto term : terms) { @@ -215,7 +216,6 @@ std::unique_ptr SearchDriver::AnalyzeAndBuildQueryNode(const std::str } result->column_ = field; return result; - */ } } diff --git a/src/storage/invertedindex/segment_posting.cpp b/src/storage/invertedindex/segment_posting.cpp index 52a0ce313f..4da9009ac5 100644 --- a/src/storage/invertedindex/segment_posting.cpp +++ b/src/storage/invertedindex/segment_posting.cpp @@ -65,14 +65,11 @@ void SegmentPosting::Init(SharedPtr doc_slice_list, pos_begin_ = pos_begin; pos_size_ = pos_size; posting_reader_ = MakeShared(posting_reader->fs_, posting_reader->path_, 1024); - // fs_ = &posting_reader->fs_; - // path_ = &posting_reader->path_; session_pool_ = session_pool; } const SharedPtr &SegmentPosting::GetPosSliceListPtr() { if (pos_slice_list_.get() == nullptr) { - // auto posting_reader = MakeUnique(*fs_, *path_, 1024); ByteSlice *pos_slice = ByteSlice::CreateSlice(pos_size_, session_pool_); posting_reader_->Seek(doc_start_ + pos_begin_); diff --git a/src/unit_test/storage/invertedindex/memory_indexer.cpp b/src/unit_test/storage/invertedindex/memory_indexer.cpp index d19ee3327a..109ef03dae 100644 --- a/src/unit_test/storage/invertedindex/memory_indexer.cpp +++ b/src/unit_test/storage/invertedindex/memory_indexer.cpp @@ -96,6 +96,10 @@ class MemoryIndexerTest : public BaseTest { ASSERT_EQ(doc_id, expected.doc_ids[j]); u32 tf = post_iter->GetCurrentTF(); ASSERT_EQ(tf, expected.tfs[j]); + pos_t res_pos = INVALID_POSITION; + do { + post_iter->SeekPosition(0, res_pos); + } while(res_pos != INVALID_POSITION); } if (doc_id != INVALID_ROWID) { doc_id = post_iter->SeekDoc(doc_id + 1); diff --git a/src/unit_test/storage/invertedindex/search/query_match.cpp b/src/unit_test/storage/invertedindex/search/query_match.cpp index 7a9761a55d..7c4b1e98bf 100644 --- a/src/unit_test/storage/invertedindex/search/query_match.cpp +++ b/src/unit_test/storage/invertedindex/search/query_match.cpp @@ -97,7 +97,7 @@ void QueryMatchTest::InitData() { }; } -TEST_F(QueryMatchTest, DISABLED_basic_phrase) { +TEST_F(QueryMatchTest, basic_phrase) { CreateDBAndTable(db_name_, table_name_); CreateIndex(db_name_, table_name_, index_name_); InsertData(db_name_, table_name_); @@ -115,7 +115,7 @@ TEST_F(QueryMatchTest, DISABLED_basic_phrase) { } } -TEST_F(QueryMatchTest, DISABLED_basic_term) { +TEST_F(QueryMatchTest, basic_term) { CreateDBAndTable(db_name_, table_name_); CreateIndex(db_name_, table_name_, index_name_); InsertData(db_name_, table_name_); @@ -318,7 +318,8 @@ void QueryMatchTest::QueryMatch(const String& db_name, fmt::print("iter_row_id is INVALID_ROWID\n"); } else { do { - query_builder.Score(iter_row_id); + auto score = query_builder.Score(iter_row_id); + fmt::print("iter_row_id = {}, score = {}\n", iter_row_id.ToUint64(), score); iter_row_id = doc_iterator->Next(); } while (iter_row_id != INVALID_ROWID); if (query_type == DocIteratorType::kPhraseIterator) { diff --git a/test/sql/dql/fulltext.slt b/test/sql/dql/fulltext.slt index 4e14cd91e9..5316973b05 100644 --- a/test/sql/dql/fulltext.slt +++ b/test/sql/dql/fulltext.slt @@ -27,6 +27,19 @@ SELECT doctitle, docdate, ROW_ID(), SCORE() FROM enwiki SEARCH MATCH('body^5', ' ---- Anarchism 30-APR-2012 03:25:17.000 0 21.620300 +# only phrase +query TTI +SELECT doctitle, docdate, ROW_ID(), SCORE() FROM enwiki SEARCH MATCH('body^5', '\"social customs\"', 'topn=3;block_max=compare'); +---- +Anarchism 30-APR-2012 03:25:17.000 6 20.881144 + +# phrase and term +query TTI +SELECT doctitle, docdate, ROW_ID(), SCORE() FROM enwiki SEARCH MATCH('doctitle,body^5', '\"social customs\" harmful', 'topn=3'); +---- +Anarchism 30-APR-2012 03:25:17.000 0 21.620300 +Anarchism 30-APR-2012 03:25:17.000 6 20.881144 + # copy data from csv file query I COPY enwiki FROM '/var/infinity/test_data/enwiki_99.csv' WITH ( DELIMITER '\t' );