From 5e6b78db76c1538130fd239dc146fbf7e9d05622 Mon Sep 17 00:00:00 2001 From: shenyushi Date: Thu, 6 Mar 2025 19:30:16 +0800 Subject: [PATCH 1/2] Fix call GetFwd when doc_num is not the actual doc_num. --- src/storage/knn_index/sparse/bmp_alg.cppm | 6 +++++- src/storage/knn_index/sparse/bmp_fwd.cppm | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/storage/knn_index/sparse/bmp_alg.cppm b/src/storage/knn_index/sparse/bmp_alg.cppm index 6dd5ad1c32..c36245956f 100644 --- a/src/storage/knn_index/sparse/bmp_alg.cppm +++ b/src/storage/knn_index/sparse/bmp_alg.cppm @@ -265,10 +265,13 @@ public: void Optimize(const BMPOptimizeOptions &options) { std::unique_lock lock(mtx_); - if (options.bp_reorder_) { + while (options.bp_reorder_) { SizeT block_size = this->block_fwd_.block_size(); SizeT term_num = this->bm_ivt_.term_num(); SizeT doc_num = this->doc_ids_.size() - this->doc_ids_.size() % block_size; + if (doc_num == 0) { + break; + } this->bm_ivt_ = BMPIvt(term_num); Vector, Vector>> fwd = this->block_fwd_.GetFwd(doc_num, term_num); @@ -293,6 +296,7 @@ public: SparseVecRef doc((i32)indices.size(), indices.data(), data.data()); this->AddDoc(doc, doc_ids[i], false); } + break; } if (options.topk_ != 0) { SizeT term_num = this->bm_ivt_.term_num(); diff --git a/src/storage/knn_index/sparse/bmp_fwd.cppm b/src/storage/knn_index/sparse/bmp_fwd.cppm index a37088d400..fdc31091d9 100644 --- a/src/storage/knn_index/sparse/bmp_fwd.cppm +++ b/src/storage/knn_index/sparse/bmp_fwd.cppm @@ -441,6 +441,9 @@ public: const auto &[term_id, block_size, block_offsets, scores] = iter.Value(); for (SizeT i = 0; i < block_size; ++i) { BMPDocID doc_id = block_offsets[i] + block_id * block_size_; + if (doc_id >= doc_num) { + break; + } DataType score = scores[i]; fwd[doc_id].first.push_back(term_id); fwd[doc_id].second.push_back(score); From a8365f8029d16bd376215b97f2e839d13a26b7ed Mon Sep 17 00:00:00 2001 From: shenyushi Date: Thu, 6 Mar 2025 19:32:15 +0800 Subject: [PATCH 2/2] Add test. --- python/test_pysdk/test_knn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/test_pysdk/test_knn.py b/python/test_pysdk/test_knn.py index 764ce58d73..1bae209bfc 100644 --- a/python/test_pysdk/test_knn.py +++ b/python/test_pysdk/test_knn.py @@ -1069,7 +1069,7 @@ def test_sparse_knn_with_index(self, check_data, suffix): index.IndexType.BMP, {"block_size": "8", "compress_type": "compress"}), ConflictType.Error) - table_obj.optimize("idx1", {"topk": "3"}) + table_obj.optimize("idx1", {"topk": "3", "bp_reorder": ""}) res, extra_result = (table_obj .output(["*", "_row_id", "_similarity"])