From 3c3b04cbd86e42adaf370705dfed1b47bddf7cb8 Mon Sep 17 00:00:00 2001 From: Zhichang Yu Date: Wed, 22 May 2024 22:06:02 +0800 Subject: [PATCH] Fulltext performance improvement (#1234) Fulltext performance improvement - [x] Performance Improvement --- docs/references/benchmark.md | 27 ++++++++++++++----- .../benchmark/clients/elasticsearch_client.py | 7 +++++ python/benchmark/clients/infinity_client.py | 20 +++++++------- .../configs/elasticsearch_enwiki.json | 1 + python/benchmark/configs/infinity_enwiki.json | 1 + .../invertedindex/column_index_reader.cpp | 2 +- 6 files changed, 42 insertions(+), 16 deletions(-) diff --git a/docs/references/benchmark.md b/docs/references/benchmark.md index cbc4c06722..2af491e185 100644 --- a/docs/references/benchmark.md +++ b/docs/references/benchmark.md @@ -64,7 +64,7 @@ sudo mkdir -p /var/infinity && sudo chown -R $USER /var/infinity docker run -d --name infinity -v /var/infinity/:/var/infinity --ulimit nofile=500000:500000 --network=host infiniflow/infinity:0.1.0 ``` -1. Run Benchmark: +4. Run Benchmark: Drop file cache before benchmark query latency. @@ -96,7 +96,22 @@ options: --dataset DATASET data set to benchmark, one of: all, gist, sift, geonames, enwiki ``` -2. Navigate to the **results** folder to view the results and latency of each query. +Following are commands for engine `infinity` and dataset `enwiki`: + +```bash +python run.py --generate --engine infinity --dataset enwiki +python run.py --import --engine infinity --dataset enwiki +python run.py --query --engine infinity --dataset enwiki +python run.py --query-express=16 --engine infinity --dataset enwiki +``` + +Following are commands to issue a single query so that you can compare results among several engines. 
+ 

```bash
curl -X GET "http://localhost:9200/elasticsearch_enwiki/_search" -H 'Content-Type: application/json' -d'{"size":10,"_source":"doctitle","query": {"match": { "body": "wraysbury istorijos" }}}' 

psql -h 0.0.0.0 -p 5432 -c "SELECT doctitle, ROW_ID(), SCORE() FROM infinity_enwiki SEARCH MATCH TEXT ('body', 'wraysbury istorijos', 'topn=10;block_max=true');" 
``` ## Benchmark Results ### SIFT1M @@ -130,10 +145,10 @@ > - 33000000 documents > - 100000 `OR` queries generated based on the dataset. All terms are extracted from the dataset and very rare(occurrence < 100) terms are excluded. The number of terms of each query match the weight `[0.03, 0.15, 0.25, 0.25, 0.15, 0.08, 0.04, 0.03, 0.02]`. -| | Time to insert & build index | Time to import & build index | P95 Latency(ms)| QPS (8 python clients) | Memory | vCPU | -| ----------------- | ---------------------------- | ---------------------------- | ---------------| -----------------------| --------| ----- | -| **Elasticsearch** | 2289 s | N/A | 14.75 | 1174 | 21.0GB | 10.0 | -| **Infinity** | 2321 s | 944 s | 3.51 | 3925 | 9.0GB | 4.2 | +| | Time to insert & build index | Time to import & build index | P95 Latency(ms)| QPS (16 python clients) | Memory | vCPU | +| ----------------- | ---------------------------- | ---------------------------- | ---------------| ------------------------| --------| ----- | +| **Elasticsearch** | 2289 s | N/A | 14.75 | 1340 | 21.0GB | 10.6 | +| **Infinity** | 2321 s | 2890 s | 1.86 | 12328 | 10.0GB | 11.0 | --- diff --git a/python/benchmark/clients/elasticsearch_client.py b/python/benchmark/clients/elasticsearch_client.py index 69444de76a..86030c665b 100644 --- a/python/benchmark/clients/elasticsearch_client.py +++ b/python/benchmark/clients/elasticsearch_client.py @@ -263,6 +263,13 @@ def search(self) -> list[list[Any]]: return results def check_and_save_results(self, results: List[List[Any]]): + if "result_path" in self.data: + result_path = self.data["result_path"] + 
with open(result_path, "w") as f: + for result in results: + line = json.dumps(result) + f.write(line + "\n") + logging.info("query_result_path: {0}".format(result_path)) if "ground_truth_path" in self.data: ground_truth_path = self.data["ground_truth_path"] _, ext = os.path.splitext(ground_truth_path) diff --git a/python/benchmark/clients/infinity_client.py b/python/benchmark/clients/infinity_client.py index 8e7de74bfb..c84b7fed99 100644 --- a/python/benchmark/clients/infinity_client.py +++ b/python/benchmark/clients/infinity_client.py @@ -238,6 +238,13 @@ def search(self) -> list[list[Any]]: return results def check_and_save_results(self, results: List[List[Any]]): + if "result_path" in self.data: + result_path = self.data["result_path"] + with open(result_path, "w") as f: + for result in results: + line = json.dumps(result) + f.write(line + "\n") + logging.info("query_result_path: {0}".format(result_path)) if "ground_truth_path" in self.data: ground_truth_path = self.data["ground_truth_path"] _, ext = os.path.splitext(ground_truth_path) @@ -263,17 +270,12 @@ def check_and_save_results(self, results: List[List[Any]]): with open(ground_truth_path, "r") as f: for i, line in enumerate(f): expected_result = json.loads(line) + exp_ids = set(x[0] for x in expected_result[:-1]) result = results[i] ids = set(x[0] for x in result[:-1]) - precision = ( - len( - ids.intersection( - expected_result["expected_results"][ - : self.data["topK"] - ] - ) - ) - / self.data["topK"] + precision = len(ids.intersection(exp_ids)) / self.data["topK"] + logging.info( + f"expected_ids: {exp_ids}, ids: {ids}, precision: {precision}" ) precisions.append(precision) latencies.append(result[-1]) diff --git a/python/benchmark/configs/elasticsearch_enwiki.json b/python/benchmark/configs/elasticsearch_enwiki.json index ce4c729506..0ae9456065 100644 --- a/python/benchmark/configs/elasticsearch_enwiki.json +++ b/python/benchmark/configs/elasticsearch_enwiki.json @@ -6,6 +6,7 @@ "data_path": 
"datasets/enwiki/enwiki.csv", "insert_batch_size": 8192, "query_path": "datasets/enwiki/operations.txt", + "result_path": "datasets/enwiki/elasticsearch_result.jsonl", "mode": "fulltext", "topK": 10, "index": { diff --git a/python/benchmark/configs/infinity_enwiki.json b/python/benchmark/configs/infinity_enwiki.json index 235a84c0a4..e13660af62 100644 --- a/python/benchmark/configs/infinity_enwiki.json +++ b/python/benchmark/configs/infinity_enwiki.json @@ -6,6 +6,7 @@ "data_link": "http://192.168.200.183:8000/enwiki-20120502-lines-10.csv", "insert_batch_size": 8192, "query_path": "datasets/enwiki/operations.txt", + "result_path": "datasets/enwiki/infinity_result.jsonl", "query_link": "to_be_set", "mode": "fulltext", "topK": 10, diff --git a/src/storage/invertedindex/column_index_reader.cpp b/src/storage/invertedindex/column_index_reader.cpp index 3f81eafc57..56e133c124 100644 --- a/src/storage/invertedindex/column_index_reader.cpp +++ b/src/storage/invertedindex/column_index_reader.cpp @@ -140,7 +140,7 @@ IndexReader TableIndexReaderCache::GetIndexReader(Txn *txn, TableEntry *self_tab std::scoped_lock lock(mutex_); assert(cache_ts_ <= first_known_update_ts_); assert(first_known_update_ts_ == MAX_TIMESTAMP || first_known_update_ts_ <= last_known_update_ts_); - if (cache_ts_ != 0 && begin_ts >= cache_ts_ && begin_ts < first_known_update_ts_) [[likely]] { + if (first_known_update_ts_ != 0 && begin_ts >= cache_ts_ && begin_ts < first_known_update_ts_) [[likely]] { // no need to build, use cache result.column_index_readers_ = cache_column_readers_; result.column2analyzer_ = column2analyzer_;