Skip to content

Commit 24ce96c

Browse files
authored
support phrase query with block max (#1189)
### What problem does this PR solve?

* Support phrase query with block max
* Fix skiplist reader error when reading positions

Issue link: #639

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature
- [x] Test cases
1 parent 4bfeae5 commit 24ce96c

24 files changed

+435
-144
lines changed

benchmark/local_infinity/fulltext/fulltext_benchmark.cpp

+5-5
Original file line numberDiff line numberDiff line change
@@ -222,8 +222,8 @@ void BenchmarkOptimize(SharedPtr<Infinity> infinity, const String &db_name, cons
222222

223223
void BenchmarkQuery(SharedPtr<Infinity> infinity, const String &db_name, const String &table_name) {
224224
std::string fields = "text";
225-
//std::vector<std::string> query_vec = {"one of", "is", "a", "\"is a\"", "\"one of\""};// {"Animalia", "Algorithms", "Animalia Algorithms", "network space", "harmful chemical anarchism"};
226-
std::vector<std::string> query_vec = {"harmful chemical anarchism", "\"harmful chemical\"", "\"one of\"", "harmful chemical"};
225+
std::vector<std::string> query_vec = {"harmful \"social custom\"", "social custom \"harmful chemical\"", "\"annual American awards\"", "harmful chemical", "\"one of\""};
226+
227227
for (auto match_text : query_vec) {
228228
BaseProfiler profiler;
229229
profiler.Begin();
@@ -314,7 +314,7 @@ int main(int argc, char *argv[]) {
314314
};
315315
Mode mode(Mode::kInsert);
316316
SizeT insert_batch = 500;
317-
app.add_option("--mode", mode, "Bencmark mode, one of insert, import, merge, query")
317+
app.add_option("--mode", mode, "Benchmark mode, one of insert, import, merge, query")
318318
->required()
319319
->transform(CLI::CheckedTransformer(mode_map, CLI::ignore_case));
320320
app.add_option("--insert-batch", insert_batch, "batch size of each insert, valid only at insert and merge mode, default value 500");
@@ -331,7 +331,7 @@ int main(int argc, char *argv[]) {
331331
String srcfile = test_data_path();
332332
srcfile += "/benchmark/dbpedia-entity/corpus.jsonl";
333333

334-
// #define DEL_LOCAL_DATA
334+
//#define DEL_LOCAL_DATA
335335
#ifdef DEL_LOCAL_DATA
336336
system("rm -rf /var/infinity/data /var/infinity/wal /var/infinity/log /var/infinity/tmp");
337337
#endif
@@ -358,7 +358,7 @@ int main(int argc, char *argv[]) {
358358
case Mode::kQuery: {
359359
BenchmarkCreateIndex(infinity, db_name, table_name, index_name);
360360
BenchmarkInsert(infinity, db_name, table_name, srcfile, insert_batch);
361-
// BenchmarkOptimize(infinity, db_name, table_name);
361+
BenchmarkOptimize(infinity, db_name, table_name);
362362
sleep(10);
363363
BenchmarkMoreQuery(infinity, db_name, table_name, 1);
364364
break;

python/benchmark/configs/infinity_enwiki.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
"query_link": "to_be_set",
1010
"mode": "fulltext",
1111
"topK": 10,
12-
"use_import": false,
12+
"use_import": true,
1313
"schema": {
1414
"doctitle": {"type": "varchar", "default":""},
1515
"docdate": {"type": "varchar", "default":""},

src/executor/operator/physical_match.cpp

+3-46
Original file line numberDiff line numberDiff line change
@@ -494,45 +494,6 @@ void ASSERT_FLOAT_EQ(float bar, u32 i, float a, float b) {
494494
}
495495
}
496496

497-
void AnalyzeFunc(const String &analyzer_name, String &&text, TermList &output_terms) {
498-
UniquePtr<Analyzer> analyzer = AnalyzerPool::instance().Get(analyzer_name);
499-
// (dynamic_cast<CommonLanguageAnalyzer*>(analyzer.get()))->SetExtractEngStem(false);
500-
if (analyzer.get() == nullptr) {
501-
RecoverableError(Status::UnexpectedError(fmt::format("Invalid analyzer: {}", analyzer_name)));
502-
}
503-
Term input_term;
504-
input_term.text_ = std::move(text);
505-
TermList temp_output_terms;
506-
analyzer->Analyze(input_term, temp_output_terms);
507-
if (analyzer_name == AnalyzerPool::STANDARD) {
508-
// remove duplicates and only keep the root words for query
509-
const u32 INVALID_TERM_OFFSET = -1;
510-
Term last_term;
511-
last_term.word_offset_ = INVALID_TERM_OFFSET;
512-
for (const Term &term : temp_output_terms) {
513-
if (last_term.word_offset_ != INVALID_TERM_OFFSET) {
514-
assert(term.word_offset_ >= last_term.word_offset_);
515-
}
516-
if (last_term.word_offset_ != term.word_offset_) {
517-
if (last_term.word_offset_ != INVALID_TERM_OFFSET) {
518-
output_terms.emplace_back(last_term);
519-
}
520-
last_term.text_ = term.text_;
521-
last_term.word_offset_ = term.word_offset_;
522-
last_term.stats_ = term.stats_;
523-
} else {
524-
if (term.text_.size() < last_term.text_.size()) {
525-
last_term.text_ = term.text_;
526-
last_term.stats_ = term.stats_;
527-
}
528-
}
529-
}
530-
if (last_term.word_offset_ != INVALID_TERM_OFFSET) {
531-
output_terms.emplace_back(last_term);
532-
}
533-
}
534-
}
535-
536497
void ExecuteFTSearch(UniquePtr<EarlyTerminateIterator> &et_iter, FullTextScoreResultHeap &result_heap, u32 &blockmax_loop_cnt) {
537498
if (et_iter) {
538499
while (true) {
@@ -585,12 +546,7 @@ bool PhysicalMatch::ExecuteInnerHomebrewed(QueryContext *query_context, Operator
585546
if (!query_tree) {
586547
RecoverableError(Status::ParseMatchExprFailed(match_expr_->fields_, match_expr_->matching_text_));
587548
}
588-
if (query_tree->type_ == QueryNodeType::PHRASE) {
589-
// TODO: make sure there is no problem with block max phrase and delete this code
590-
// LOG_INFO(fmt::format("Block max phrase not supported, use ordinary iterator, query: {}", match_expr_->matching_text_));
591-
use_block_max_iter = false;
592-
use_ordinary_iter = true;
593-
}
549+
594550
auto finish_parse_query_tree_time = std::chrono::high_resolution_clock::now();
595551
TimeDurationType parse_query_tree_duration = finish_parse_query_tree_time - finish_init_query_builder_time;
596552
LOG_TRACE(fmt::format("PhysicalMatch Part 0.2: Parse QueryNode tree time: {} ms", parse_query_tree_duration.count()));
@@ -868,8 +824,9 @@ bool PhysicalMatch::Execute(QueryContext *query_context, OperatorState *operator
868824
SharedPtr<Vector<String>> PhysicalMatch::GetOutputNames() const {
869825
SharedPtr<Vector<String>> result_names = MakeShared<Vector<String>>();
870826
result_names->reserve(base_table_ref_->column_names_->size() + 2);
871-
for (auto &name : *base_table_ref_->column_names_)
827+
for (auto &name : *base_table_ref_->column_names_) {
872828
result_names->emplace_back(name);
829+
}
873830
result_names->emplace_back(COLUMN_NAME_SCORE);
874831
result_names->emplace_back(COLUMN_NAME_ROW_ID);
875832
return result_names;

src/storage/invertedindex/format/position_list_decoder.cpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ void PositionListDecoder::InitPositionSkipList(const ByteSliceList *pos_list,
4747
state->SetRecordOffset(pos_skiplist_end);
4848
} else {
4949
pos_skiplist_reader_ = session_pool_ ? new ((session_pool_)->Allocate(sizeof(SkipListReaderByteSlice)))
50-
SkipListReaderByteSlice(option_.GetDocListFormatOption())
51-
: new SkipListReaderByteSlice(option_.GetDocListFormatOption());
50+
SkipListReaderByteSlice(option_.GetPosListFormatOption())
51+
: new SkipListReaderByteSlice(option_.GetPosListFormatOption());
5252
skiplist_reader_real_size_ = sizeof(SkipListReaderByteSlice);
5353
static_cast<SkipListReaderByteSlice *>(pos_skiplist_reader_)->Load(pos_list, pos_skiplist_start, pos_skiplist_end);
5454
decoded_pos_count_ = 0;
@@ -66,8 +66,8 @@ void PositionListDecoder::InitPositionSkipList(ByteSlice *pos_list,
6666
state->SetRecordOffset(pos_skiplist_end);
6767
} else {
6868
pos_skiplist_reader_ = session_pool_ ? new ((session_pool_)->Allocate(sizeof(SkipListReaderByteSlice)))
69-
SkipListReaderByteSlice(option_.GetDocListFormatOption())
70-
: new SkipListReaderByteSlice(option_.GetDocListFormatOption());
69+
SkipListReaderByteSlice(option_.GetPosListFormatOption())
70+
: new SkipListReaderByteSlice(option_.GetPosListFormatOption());
7171
skiplist_reader_real_size_ = sizeof(SkipListReaderByteSlice);
7272
static_cast<SkipListReaderByteSlice *>(pos_skiplist_reader_)->Load(pos_list, pos_skiplist_start, pos_skiplist_end);
7373
decoded_pos_count_ = 0;

src/storage/invertedindex/format/skiplist_reader.cppm

+10-4
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,15 @@ import doc_list_format_option;
99
import memory_pool;
1010
import posting_byte_slice;
1111
import posting_byte_slice_reader;
12+
import position_list_format_option;
1213

1314
namespace infinity {
1415

1516
export class SkipListReader {
1617
public:
17-
explicit SkipListReader(const DocListFormatOption &doc_list_format_option) : doc_list_format_option_(doc_list_format_option) {
18+
explicit SkipListReader(const DocListFormatOption &doc_list_format_option)
19+
: has_tf_list_(doc_list_format_option.HasTfList()),
20+
has_block_max_(doc_list_format_option.HasBlockMax()) {
1821
if (has_tf_list_) {
1922
ttf_buffer_ = MakeUnique<u32[]>(SKIP_LIST_BUFFER_SIZE);
2023
}
@@ -24,6 +27,8 @@ public:
2427
}
2528
}
2629

30+
explicit SkipListReader(const PositionListFormatOption &doc_list_format_option) {}
31+
2732
virtual ~SkipListReader() = default;
2833

2934
bool SkipTo(u32 query_doc_id, u32 &doc_id, u32 &prev_doc_id, u32 &offset, u32 &delta);
@@ -55,9 +60,8 @@ public:
5560
protected:
5661
virtual Pair<int, bool> LoadBuffer() = 0;
5762

58-
DocListFormatOption doc_list_format_option_;
59-
const bool has_tf_list_ = doc_list_format_option_.HasTfList();
60-
const bool has_block_max_ = doc_list_format_option_.HasBlockMax();
63+
const bool has_tf_list_ = false;
64+
const bool has_block_max_ = false;
6165
i32 skipped_item_count_ = 0;
6266
u32 current_doc_id_ = 0;
6367
u32 current_offset_ = 0;
@@ -80,6 +84,8 @@ export class SkipListReaderByteSlice final : public SkipListReader {
8084
public:
8185
explicit SkipListReaderByteSlice(const DocListFormatOption &doc_list_format_option) : SkipListReader(doc_list_format_option) {}
8286

87+
explicit SkipListReaderByteSlice(const PositionListFormatOption &pos_list_format_option) : SkipListReader(pos_list_format_option) {}
88+
8389
void Load(const ByteSliceList *byte_slice_list, u32 start, u32 end);
8490

8591
void Load(ByteSlice *byteSlice, u32 start, u32 end);

src/storage/invertedindex/multi_posting_decoder.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,7 @@ bool MultiPostingDecoder::DiskSegMoveToSegment(SegmentPosting &cur_segment_posti
201201
u32 doc_skiplist_end = doc_skiplist_start + doc_skiplist_size;
202202

203203
index_decoder_->InitSkipList(doc_skiplist_start, doc_skiplist_end, posting_list, term_meta.GetDocFreq());
204+
204205
if (format_option_.HasPositionList()) {
205206
u32 pos_list_begin = doc_list_reader.Tell() + doc_skiplist_size + doc_list_size;
206207
in_doc_state_keeper_.MoveToSegment(posting_list, term_meta.GetTotalTermFreq(), pos_list_begin, format_option_);

src/storage/invertedindex/multi_posting_decoder.cppm

+4
Original file line numberDiff line numberDiff line change
@@ -71,9 +71,13 @@ private:
7171
}
7272

7373
bool MoveToSegment(RowID start_row_id);
74+
7475
bool MemSegMoveToSegment(const SharedPtr<PostingWriter> &posting_writer);
76+
7577
bool DiskSegMoveToSegment(SegmentPosting &cur_segment_posting);
78+
7679
IndexDecoder *CreateDocIndexDecoder(u32 doc_list_begin_pos);
80+
7781
private:
7882
PostingFormatOption format_option_;
7983
bool need_decode_doc_id_ = false;

src/storage/invertedindex/search/bm25_ranker.cpp

+5-17
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ module bm25_ranker;
2020

2121
import stl;
2222
import index_defines;
23+
import third_party;
2324

2425
namespace infinity {
2526

@@ -34,23 +35,10 @@ void BM25Ranker::AddTermParam(u64 tf, u64 df, float avg_column_len, u32 column_l
3435
score_ += smooth_idf * smooth_tf * weight;
3536
}
3637

37-
void BM25Ranker::AddPhraseParam(const Vector<tf_t>& all_tf,
38-
const Vector<u32>& all_df,
39-
float avg_column_len,
40-
u32 column_len,
41-
float weight,
42-
SizeT term_num) {
43-
for (SizeT i = 0; i < term_num; ++i) {
44-
u64 tf = 0;
45-
u64 df = 0;
46-
if (i < all_tf.size()) {
47-
tf = all_tf[i];
48-
}
49-
if (i < all_df.size()) {
50-
df = all_df[i];
51-
}
52-
AddTermParam(tf, df, avg_column_len, column_len, weight);
53-
}
38+
void BM25Ranker::AddPhraseParam(tf_t tf, u64 df, float avg_column_len, u32 column_len, float weight) {
39+
float smooth_idf = std::log(1.0F + (total_df_ - df + 0.5F) / (df + 0.5F));
40+
float smooth_tf = (k1 + 1.0F) * tf / (tf + k1 * (1.0F - b + b * column_len / avg_column_len));
41+
score_ += smooth_idf * smooth_tf * weight;
5442
}
5543

5644
} // namespace infinity

src/storage/invertedindex/search/bm25_ranker.cppm

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ public:
2727

2828
void AddTermParam(u64 tf, u64 df, float avg_column_len, u32 column_len, float weight);
2929

30-
void AddPhraseParam(const Vector<tf_t>& all_tf, const Vector<u32>& all_df, float avg_column_len, u32 column_len, float weight, SizeT term_num);
30+
void AddPhraseParam(tf_t tf, u64 df, float avg_colum_len, u32 column_len, float weight);
3131

3232
float GetScore() { return score_; }
3333

src/storage/invertedindex/search/early_terminate_iterator/blockmax_and_iterator.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import stl;
2121
import index_defines;
2222
import early_terminate_iterator;
2323
import internal_types;
24+
import third_party;
2425

2526
namespace infinity {
2627

@@ -79,7 +80,7 @@ bool BlockMaxAndIterator::BlockSkipTo(RowID doc_id, float threshold) {
7980
sum_score += sorted_iterators_[j - 1]->BlockMaxBM25Score();
8081
common_block_max_bm25_score_parts_[j - 1] = prev_sum_score;
8182
}
82-
assert((sum_score <= bm25_score_upper_bound_));
83+
// assert((sum_score <= bm25_score_upper_bound_));
8384
if (sum_score >= threshold) {
8485
common_block_max_bm25_score_ = sum_score;
8586
common_block_min_possible_doc_id_ = doc_id;

src/storage/invertedindex/search/early_terminate_iterator/blockmax_maxscore_iterator.cppm

+1-1
Original file line numberDiff line numberDiff line change
@@ -90,4 +90,4 @@ private:
9090
Vector<Pair<u32, u64>> must_have_history_;
9191
};
9292

93-
} // namespace infinity
93+
} // namespace infinity

0 commit comments

Comments
 (0)