Skip to content

Commit f8015ad

Browse files
committed
add benchmark phrase query
1 parent 569df82 commit f8015ad

File tree

7 files changed

+38
-23
lines changed

7 files changed

+38
-23
lines changed

docs/references/benchmark.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ Following are commands to issue a single query so that you can compare results a
114114
```bash
115115
curl -X GET "http://localhost:9200/elasticsearch_enwiki/_search" -H 'Content-Type: application/json' -d'{"size":10,"_source":"doctitle","query": {"match": { "body": "wraysbury istorijos" }}}'
116116
117-
curl -X GET "http://localhost:7280/api/v1/_elastic/quickwit_enwiki/_search" -H 'Content-Type: application/json' -d'{"query": {"query_string": {"query": "wraysbury istorijos", "fields": [ "body" ] } },"sort": ["_score"]}'
117+
curl -X GET "http://localhost:7280/api/v1/_elastic/quickwit_enwiki/_search" -H 'Content-Type: application/json' -d'{"query": {"query_string": {"query": "wraysbury istorijos", "fields": [ "body" ] } },"sort": ["_score"],"size":10}'
118118
119119
psql -h 0.0.0.0 -p 5432 -c "SELECT doctitle, ROW_ID(), SCORE() FROM infinity_enwiki SEARCH MATCH TEXT ('body', 'wraysbury istorijos', 'topn=10');"
120120
```

python/benchmark/clients/elasticsearch_client.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ def upload(self):
108108
self.client.indices.forcemerge(index=self.table_name, wait_for_completion=True)
109109

110110
# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html
111-
def get_fulltext_query_content(self, query: str, is_and: bool = False) -> Any:
111+
def get_fulltext_query_content(self, query: str, is_and: bool = False, is_phrase: bool = False) -> Any:
112112
ret = None
113113
if is_and:
114114
terms = query.split()
@@ -118,7 +118,10 @@ def get_fulltext_query_content(self, query: str, is_and: bool = False) -> Any:
118118
}
119119
}
120120
else:
121-
ret = {"query": {"match": {"body": query}}}
121+
if is_phrase:
122+
ret = {"query": {"match_phrase": {"body": query}}}
123+
else:
124+
ret = {"query": {"match": {"body": query}}}
122125
return ret
123126

124127
def setup_clients(self, num_threads=1):
@@ -149,7 +152,7 @@ def do_single_query(self, query_id, client_id) -> list[Any]:
149152
]
150153
return result
151154
elif self.data["mode"] == "fulltext":
152-
body = self.get_fulltext_query_content(query)
155+
body = self.get_fulltext_query_content(query, is_phrase=self.data["is_phrase_query"])
153156
result = self.client.search(
154157
index=self.table_name,
155158
body=body,

python/benchmark/clients/quickwit_client.py

+18-6
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ def upload(self):
176176
else:
177177
raise TypeError("Unsupported file type")
178178

179-
def get_fulltext_query_content(self, query: str, is_and: bool = False) -> Any:
179+
def get_fulltext_query_content(self, query: str, is_and: bool = False, is_phrase=False) -> Any:
180180
ret = None
181181
if is_and:
182182
terms = query.split()
@@ -186,10 +186,22 @@ def get_fulltext_query_content(self, query: str, is_and: bool = False) -> Any:
186186
}
187187
}
188188
else:
189-
ret = {
190-
"query": {"query_string": {"query": query, "fields": ["body"]}},
191-
"sort": ["_score"],
192-
}
189+
if is_phrase:
190+
ret = {
191+
"query": {
192+
"match_phrase": {"body": query}
193+
},
194+
"sort": ["_score"],
195+
"size": self.data["topK"]
196+
}
197+
else:
198+
ret = {
199+
"query": {
200+
"query_string": {"query": query, "fields": ["body"]}
201+
},
202+
"sort": ["_score"],
203+
"size": self.data["topK"]
204+
}
193205
return ret
194206

195207
def setup_clients(self, num_threads=1):
@@ -202,7 +214,7 @@ def do_single_query(self, query_id, client_id) -> list[Any]:
202214
query = self.queries[query_id]
203215
client = self.clients[client_id]
204216
if self.data["mode"] == "fulltext":
205-
body = self.get_fulltext_query_content(query)
217+
body = self.get_fulltext_query_content(query=query, is_phrase=self.data["is_phrase_query"])
206218
body["size"] = self.data["topK"]
207219

208220
result = client.search(

python/benchmark/configs/elasticsearch_tantivy.json

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"data_path": "datasets/tantivy/wiki-articles.json",
77
"insert_batch_size": 8192,
88
"query_path": "datasets/tantivy/operations.txt",
9+
"is_phrase_query": false,
910
"result_path": "datasets/tantivy/elasticsearch_result.jsonl",
1011
"mode": "fulltext",
1112
"topK": 10,

python/benchmark/configs/quickwit_tantivy.json

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"data_path": "datasets/tantivy/wiki-articles.json",
77
"insert_batch_size": 8192,
88
"query_path": "datasets/tantivy/operations.txt",
9+
"is_phrase_query": false,
910
"result_path": "datasets/tantivy/quickwit_result.jsonl",
1011
"mode": "fulltext",
1112
"topK": 10,

src/unit_test/storage/invertedindex/search/query_match.cpp

+2-4
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,6 @@ void QueryMatchTest::InsertData(const String& db_name, const String& table_name)
278278
auto& row = datas_[block_id];
279279
for (SizeT i = 0; i < column_vectors.size(); ++i) {
280280
auto &column = row[i];
281-
fmt::print("insert {}\n", column);
282281
column_vectors[i].AppendByStringView(column, ',');
283282
}
284283
block_entry->IncreaseRowCount(1);
@@ -352,9 +351,8 @@ void QueryMatchTest::QueryMatch(const String& db_name,
352351
auto phrase_iterator = dynamic_cast<PhraseDocIterator *>(doc_iterator.get());
353352
auto res_df = phrase_iterator->GetDF();
354353
auto res_phrase_freq = phrase_iterator->GetPhraseFreq();
355-
fmt::print("res_df: {}, res_phrase_freq: {}\n", res_df, res_phrase_freq);
356-
// EXPECT_EQ(res_df, expected_doc_freq);
357-
// EXPECT_EQ(res_phrase_freq, expected_matched_freq);
354+
EXPECT_EQ(res_df, expected_doc_freq);
355+
EXPECT_EQ(res_phrase_freq, expected_matched_freq);
358356
} else {
359357
EXPECT_EQ(doc_iterator->GetType(), DocIteratorType::kTermIterator);
360358
auto term_iterator = dynamic_cast<TermDocIterator *>(doc_iterator.get());

test/sql/dql/fulltext/fulltext.slt

+9-9
Original file line numberDiff line numberDiff line change
@@ -28,17 +28,17 @@ SELECT doctitle, docdate, ROW_ID(), SCORE() FROM enwiki SEARCH MATCH TEXT ('body
2828
Anarchism 30-APR-2012 03:25:17.000 0 21.620300
2929

3030
# only phrase
31-
# query TTI rowsort
32-
# SELECT doctitle, docdate, ROW_ID(), SCORE() FROM enwiki SEARCH MATCH TEXT ('body^5', '"social customs"', 'topn=3;block_max=compare');
33-
# ----
34-
# Anarchism 30-APR-2012 03:25:17.000 6 20.881144
31+
query TTI rowsort
32+
SELECT doctitle, docdate, ROW_ID(), SCORE() FROM enwiki SEARCH MATCH TEXT ('body^5', '"social customs"', 'topn=3;block_max=compare');
33+
----
34+
Anarchism 30-APR-2012 03:25:17.000 6 20.881144
3535

3636
# phrase and term
37-
# query TTI rowsort
38-
# SELECT doctitle, docdate, ROW_ID(), SCORE() FROM enwiki SEARCH MATCH TEXT ('doctitle,body^5', '"social customs" harmful', 'topn=3');
39-
# ----
40-
# Anarchism 30-APR-2012 03:25:17.000 0 21.620300
41-
# Anarchism 30-APR-2012 03:25:17.000 6 20.881144
37+
query TTI rowsort
38+
SELECT doctitle, docdate, ROW_ID(), SCORE() FROM enwiki SEARCH MATCH TEXT ('doctitle,body^5', '"social customs" harmful', 'topn=3');
39+
----
40+
Anarchism 30-APR-2012 03:25:17.000 0 21.620300
41+
Anarchism 30-APR-2012 03:25:17.000 6 20.881144
4242

4343
# copy data from csv file
4444
query I

0 commit comments

Comments
 (0)