Comparing changes

base repository: zilliztech/VectorDBBench
base: main
head repository: yugabyte/VectorDBBench
compare: main-yb
Can’t automatically merge.
  • 6 commits
  • 31 files changed
  • 6 contributors

Commits on Jan 10, 2025

  1. synchronise fork (#4)

    * added HNSW params to index creation and search
    
    * fixed types
    
    * fixed query
    
    * fixed ef_runtime
    
    * enhance: Refine the coding style and enable lint-action
    
    Signed-off-by: yangxuan <xuan.yang@zilliz.com>
    
    * fix bug
    
    Signed-off-by: min.tian <min.tian.cn@gmail.com>
    
    ---------
    
    Signed-off-by: yangxuan <xuan.yang@zilliz.com>
    Signed-off-by: min.tian <min.tian.cn@gmail.com>
    Co-authored-by: Luka958Pixion <luka.panic@pixion.co>
    Co-authored-by: yangxuan <xuan.yang@zilliz.com>
    Co-authored-by: min.tian <min.tian.cn@gmail.com>
    4 people authored Jan 10, 2025
    SHA: 3aa8417
  2. synchronize (#5)

    * added HNSW params to index creation and search
    
    * fixed types
    
    * fixed query
    
    * fixed ef_runtime
    
    * enhance: Refine the coding style and enable lint-action
    
    Signed-off-by: yangxuan <xuan.yang@zilliz.com>
    
    * fix bug
    
    Signed-off-by: min.tian <min.tian.cn@gmail.com>
    
    ---------
    
    Signed-off-by: yangxuan <xuan.yang@zilliz.com>
    Signed-off-by: min.tian <min.tian.cn@gmail.com>
    Co-authored-by: Luka958Pixion <luka.panic@pixion.co>
    Co-authored-by: yangxuan <xuan.yang@zilliz.com>
    Co-authored-by: min.tian <min.tian.cn@gmail.com>
    4 people authored Jan 10, 2025
    SHA: 9253eba
  3. Merge Main (#6)

    * added HNSW params to index creation and search
    
    * fixed types
    
    * fixed query
    
    * fixed ef_runtime
    
    * enhance: Refine the coding style and enable lint-action
    
    Signed-off-by: yangxuan <xuan.yang@zilliz.com>
    
    * fix bug
    
    Signed-off-by: min.tian <min.tian.cn@gmail.com>
    
    ---------
    
    Signed-off-by: yangxuan <xuan.yang@zilliz.com>
    Signed-off-by: min.tian <min.tian.cn@gmail.com>
    Co-authored-by: Luka958Pixion <luka.panic@pixion.co>
    Co-authored-by: yangxuan <xuan.yang@zilliz.com>
    Co-authored-by: min.tian <min.tian.cn@gmail.com>
    4 people authored Jan 10, 2025
    SHA: b08bf0d
  4. SHA: 5ebc3e3

Commits on Jan 16, 2025

  1. sync fork (#7)

    * fix: Unable to run vebbench and cli
    
    fix: remove comma of logging str
    fix cli unable to run #444
    
    Signed-off-by: yangxuan <xuan.yang@zilliz.com>
    
    * enhance: Unify optimize and remove ready_to_load
    
    PyMilvus used to be the only client that used ready_to_load.
    Now it loads the collection when creating it, so
    this PR removes `ready_to_load` from the client API.
    
    This PR also enhances optimize and removes optimize_with_size.
    
    Signed-off-by: yangxuan <xuan.yang@zilliz.com>
    
    * add mongodb client
    
    Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
    
    * add mongodb client in readme
    
    Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
    
    ---------
    
    Signed-off-by: yangxuan <xuan.yang@zilliz.com>
    Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
    Co-authored-by: yangxuan <xuan.yang@zilliz.com>
    Co-authored-by: zhuwenxing <wenxing.zhu@zilliz.com>
    3 people authored Jan 16, 2025
    SHA: 729b5b9

Commits on Feb 17, 2025

  1. SHA: 6636c06
Showing with 384 additions and 261 deletions.
  1. +1 −0 README.md
  2. +3 −2 pyproject.toml
  3. +28 −2 vectordb_bench/backend/clients/__init__.py
  4. +1 −7 vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py
  5. +1 −4 vectordb_bench/backend/clients/alloydb/alloydb.py
  6. +8 −15 vectordb_bench/backend/clients/api.py
  7. +4 −7 vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py
  8. +1 −4 vectordb_bench/backend/clients/chroma/chroma.py
  9. +1 −4 vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py
  10. +2 −2 vectordb_bench/backend/clients/memorydb/cli.py
  11. +2 −5 vectordb_bench/backend/clients/memorydb/memorydb.py
  12. +1 −20 vectordb_bench/backend/clients/milvus/milvus.py
  13. +44 −0 vectordb_bench/backend/clients/mongodb/config.py
  14. +201 −0 vectordb_bench/backend/clients/mongodb/mongodb.py
  15. +1 −4 vectordb_bench/backend/clients/pgdiskann/pgdiskann.py
  16. +3 −11 vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py
  17. +2 −7 vectordb_bench/backend/clients/pgvector/pgvector.py
  18. +2 −7 vectordb_bench/backend/clients/pgvectorscale/pgvectorscale.py
  19. +1 −4 vectordb_bench/backend/clients/pinecone/pinecone.py
  20. +3 −6 vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py
  21. +1 −4 vectordb_bench/backend/clients/redis/redis.py
  22. +1 −1 vectordb_bench/backend/clients/test/cli.py
  23. +1 −4 vectordb_bench/backend/clients/test/test.py
  24. +1 −4 vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py
  25. +4 −12 vectordb_bench/backend/data_source.py
  26. +16 −34 vectordb_bench/backend/runner/mp_runner.py
  27. +4 −4 vectordb_bench/backend/runner/rate_runner.py
  28. +11 −15 vectordb_bench/backend/runner/read_write_runner.py
  29. +20 −28 vectordb_bench/backend/runner/serial_runner.py
  30. +6 −26 vectordb_bench/backend/task_runner.py
  31. +9 −18 vectordb_bench/interface.py
1 change: 1 addition & 0 deletions README.md
@@ -52,6 +52,7 @@ All the database client supported
| chromadb | `pip install vectordb-bench[chromadb]` |
| awsopensearch | `pip install vectordb-bench[opensearch]` |
| aliyun_opensearch | `pip install vectordb-bench[aliyun_opensearch]` |
| mongodb | `pip install vectordb-bench[mongodb]` |

### Run

5 changes: 3 additions & 2 deletions pyproject.toml
@@ -35,7 +35,7 @@ dependencies = [
"psutil",
"polars",
"plotly",
"environs",
"environs<14.1.0",
"pydantic<v2",
"scikit-learn",
"pymilvus", # with pandas, numpy, ujson
@@ -85,6 +85,7 @@ memorydb = [ "memorydb" ]
chromadb = [ "chromadb" ]
opensearch = [ "opensearch-py" ]
aliyun_opensearch = [ "alibabacloud_ha3engine_vector", "alibabacloud_searchengine20211025"]
mongodb = [ "pymongo" ]

[project.urls]
"repository" = "https://github.com/zilliztech/VectorDBBench"
@@ -133,6 +134,7 @@ lint.ignore = [
"RUF017",
"C416",
"PLW0603",
"COM812",
]

# Allow autofix for all enabled rules (when `--fix`) is provided.
@@ -206,4 +208,3 @@ builtins-ignorelist = [
# "dict", # TODO
# "filter",
]

30 changes: 28 additions & 2 deletions vectordb_bench/backend/clients/__init__.py
@@ -40,9 +40,10 @@ class DB(Enum):
AliyunElasticsearch = "AliyunElasticsearch"
Test = "test"
AliyunOpenSearch = "AliyunOpenSearch"
MongoDB = "MongoDB"

@property
def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912
def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912, C901
"""Import while in use"""
if self == DB.Milvus:
from .milvus.milvus import Milvus
@@ -129,11 +130,21 @@ def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912

return AliyunOpenSearch

if self == DB.MongoDB:
from .mongodb.mongodb import MongoDB

return MongoDB

if self == DB.Test:
from .test.test import Test

return Test

msg = f"Unknown DB: {self.name}"
raise ValueError(msg)

@property
def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912
def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912, C901
"""Import while in use"""
if self == DB.Milvus:
from .milvus.config import MilvusConfig
@@ -220,6 +231,16 @@ def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912

return AliyunOpenSearchConfig

if self == DB.MongoDB:
from .mongodb.config import MongoDBConfig

return MongoDBConfig

if self == DB.Test:
from .test.config import TestConfig

return TestConfig

msg = f"Unknown DB: {self.name}"
raise ValueError(msg)

@@ -292,6 +313,11 @@ def case_config_cls( # noqa: PLR0911

return AliyunOpenSearchIndexConfig

if self == DB.MongoDB:
from .mongodb.config import MongoDBIndexConfig

return MongoDBIndexConfig

# DB.Pinecone, DB.Chroma, DB.Redis
return EmptyDBCaseConfig

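The new enum member plugs MongoDB into the same lazy-import dispatch every other backend uses. A minimal sketch of how the registered classes resolve, assuming the defaults from config.py further down; the connection string below is a placeholder, not a real cluster:

from vectordb_bench.backend.clients import DB

db = DB.MongoDB
client_cls = db.init_cls    # lazily imports .mongodb.mongodb.MongoDB
config_cls = db.config_cls  # lazily imports .mongodb.config.MongoDBConfig

# Build a config directly; the URI is a placeholder.
config = config_cls(
    connection_string="mongodb+srv://user:password@cluster0.example.mongodb.net",
    database="vdb_bench",
)
print(client_cls.__name__, config.to_dict()["database"])  # MongoDB vdb_bench
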
8 changes: 1 addition & 7 deletions vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py
@@ -325,10 +325,7 @@ def need_normalize_cosine(self) -> bool:

return False

def optimize(self):
pass

def optimize_with_size(self, data_size: int):
def optimize(self, data_size: int):
log.info(f"optimize count: {data_size}")
retry_times = 0
while True:
@@ -340,6 +337,3 @@ def optimize_with_size(self, data_size: int):
if total_count == data_size:
log.info("optimize table finish.")
return

def ready_to_load(self):
"""ready_to_load will be called before load in load cases."""
5 changes: 1 addition & 4 deletions vectordb_bench/backend/clients/alloydb/alloydb.py
@@ -149,10 +149,7 @@ def _drop_table(self):
)
self.conn.commit()

def ready_to_load(self):
pass

def optimize(self):
def optimize(self, data_size: int | None = None):
self._post_insert()

def _post_insert(self):
23 changes: 8 additions & 15 deletions vectordb_bench/backend/clients/api.py
@@ -137,6 +137,13 @@ def __init__(
@contextmanager
def init(self) -> None:
"""create and destory connections to database.
Why contextmanager:
In multiprocessing search tasks, vectordbbench might init
totally hundreds of thousands of connections with DB server.
Too many connections may drain local FDs or server connection resources.
If the DB client doesn't have `close()` method, just set the object to None.
Examples:
>>> with self.init():
@@ -187,9 +194,8 @@ def search_embedding(
"""
raise NotImplementedError

# TODO: remove
@abstractmethod
def optimize(self):
def optimize(self, data_size: int | None = None):
"""optimize will be called between insertion and search in performance cases.
Should be blocked until the vectorDB is ready to be tested on
@@ -199,16 +205,3 @@ def optimize(self):
Optimize's execution time is limited, the limited time is based on cases.
"""
raise NotImplementedError

def optimize_with_size(self, data_size: int):
self.optimize()

# TODO: remove
@abstractmethod
def ready_to_load(self):
"""ready_to_load will be called before load in load cases.
Should be blocked until the vectorDB is ready to be tested on
heavy load cases.
"""
raise NotImplementedError
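With optimize_with_size and ready_to_load gone, every backend now implements a single optimize(data_size) hook. A hedged sketch of what a conforming client looks like; the class name and the _post_insert helper are illustrative, and the other abstract methods are omitted:

from vectordb_bench.backend.clients.api import VectorDB


class ExampleClient(VectorDB):
    # __init__, init, insert_embeddings, search_embedding, ... omitted for brevity

    def optimize(self, data_size: int | None = None):
        # Called once between insertion and search in performance cases.
        # Clients that poll for build completion (see aliyun_opensearch above)
        # compare their row count against data_size; others may simply ignore it.
        self._post_insert()
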
11 changes: 4 additions & 7 deletions vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py
@@ -145,15 +145,15 @@ def search_embedding(
docvalue_fields=[self.id_col_name],
stored_fields="_none_",
)
log.info(f'Search took: {resp["took"]}')
log.info(f'Search shards: {resp["_shards"]}')
log.info(f'Search hits total: {resp["hits"]["total"]}')
log.info(f"Search took: {resp['took']}")
log.info(f"Search shards: {resp['_shards']}")
log.info(f"Search hits total: {resp['hits']['total']}")
return [int(h["fields"][self.id_col_name][0]) for h in resp["hits"]["hits"]]
except Exception as e:
log.warning(f"Failed to search: {self.index_name} error: {e!s}")
raise e from None

def optimize(self):
def optimize(self, data_size: int | None = None):
"""optimize will be called between insertion and search in performance cases."""
# Call refresh first to ensure that all segments are created
self._refresh_index()
@@ -194,6 +194,3 @@ def _load_graphs_to_memory(self):
log.info("Calling warmup API to load graphs into memory")
warmup_endpoint = f"/_plugins/_knn/warmup/{self.index_name}"
self.client.transport.perform_request("GET", warmup_endpoint)

def ready_to_load(self):
"""ready_to_load will be called before load in load cases."""
5 changes: 1 addition & 4 deletions vectordb_bench/backend/clients/chroma/chroma.py
@@ -57,10 +57,7 @@ def init(self) -> None:
def ready_to_search(self) -> bool:
pass

def ready_to_load(self) -> bool:
pass

def optimize(self) -> None:
def optimize(self, data_size: int | None = None):
pass

def insert_embeddings(
5 changes: 1 addition & 4 deletions vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py
@@ -143,7 +143,7 @@ def search_embedding(
log.warning(f"Failed to search: {self.indice} error: {e!s}")
raise e from None

def optimize(self):
def optimize(self, data_size: int | None = None):
"""optimize will be called between insertion and search in performance cases."""
assert self.client is not None, "should self.init() first"
self.client.indices.refresh(index=self.indice)
@@ -158,6 +158,3 @@ def optimize(self):
task_status = self.client.tasks.get(task_id=force_merge_task_id)
if task_status["completed"]:
return

def ready_to_load(self):
"""ready_to_load will be called before load in load cases."""
4 changes: 2 additions & 2 deletions vectordb_bench/backend/clients/memorydb/cli.py
@@ -43,8 +43,8 @@ class MemoryDBTypedDict(TypedDict):
show_default=True,
default=False,
help=(
"Cluster Mode Disabled (CMD), use this flag when testing locally on a single node instance.",
" In production, MemoryDB only supports cluster mode (CME)",
"Cluster Mode Disabled (CMD), use this flag when testing locally on a single node instance."
" In production, MemoryDB only supports cluster mode (CME)"
),
),
]
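The cli.py change is more than cosmetics: with the trailing commas, the parenthesized help text was a two-element tuple rather than one implicitly concatenated string. A standalone illustration, independent of the project code:

broken = (
    "Cluster Mode Disabled (CMD), use this flag when testing locally on a single node instance.",
    " In production, MemoryDB only supports cluster mode (CME)",
)
fixed = (
    "Cluster Mode Disabled (CMD), use this flag when testing locally on a single node instance."
    " In production, MemoryDB only supports cluster mode (CME)"
)
print(type(broken).__name__)  # tuple -- not the single help string click expects
print(type(fixed).__name__)   # str   -- adjacent literals are joined at compile time
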
7 changes: 2 additions & 5 deletions vectordb_bench/backend/clients/memorydb/memorydb.py
@@ -157,17 +157,14 @@ def init(self) -> Generator[None, None, None]:
self.conn = self.get_client()
search_param = self.case_config.search_param()
if search_param["ef_runtime"]:
self.ef_runtime_str = f'EF_RUNTIME {search_param["ef_runtime"]}'
self.ef_runtime_str = f"EF_RUNTIME {search_param['ef_runtime']}"
else:
self.ef_runtime_str = ""
yield
self.conn.close()
self.conn = None

def ready_to_load(self) -> bool:
pass

def optimize(self) -> None:
def optimize(self, data_size: int | None = None):
self._post_insert()

def insert_embeddings(
21 changes: 1 addition & 20 deletions vectordb_bench/backend/clients/milvus/milvus.py
@@ -138,26 +138,7 @@ def wait_index():
log.warning(f"{self.name} optimize error: {e}")
raise e from None

def ready_to_load(self):
assert self.col, "Please call self.init() before"
self._pre_load(self.col)

def _pre_load(self, coll: Collection):
try:
if not coll.has_index(index_name=self._index_name):
log.info(f"{self.name} create index")
coll.create_index(
self._vector_field,
self.case_config.index_param(),
index_name=self._index_name,
)
coll.load()
log.info(f"{self.name} load")
except Exception as e:
log.warning(f"{self.name} pre load error: {e}")
raise e from None

def optimize(self):
def optimize(self, data_size: int | None = None):
assert self.col, "Please call self.init() before"
self._optimize()

44 changes: 44 additions & 0 deletions vectordb_bench/backend/clients/mongodb/config.py
@@ -0,0 +1,44 @@
from pydantic import BaseModel, SecretStr

from ..api import DBCaseConfig, DBConfig, IndexType, MetricType


class MongoDBConfig(DBConfig, BaseModel):
connection_string: SecretStr = "mongodb+srv://<user>:<password>@<cluster_name>.heatl.mongodb.net"
database: str = "vdb_bench"

def to_dict(self) -> dict:
return {
"connection_string": self.connection_string.get_secret_value(),
"database": self.database,
}


class MongoDBIndexConfig(BaseModel, DBCaseConfig):
index: IndexType = IndexType.HNSW # MongoDB uses HNSW for vector search
metric_type: MetricType | None = None
num_candidates: int | None = 1500 # Default numCandidates for vector search
exact_search: bool = False # Whether to use exact (ENN) search

def parse_metric(self) -> str:
if self.metric_type == MetricType.L2:
return "euclidean"
if self.metric_type == MetricType.IP:
return "dotProduct"
return "cosine" # Default to cosine similarity

def index_param(self) -> dict:
return {
"type": "vectorSearch",
"fields": [
{
"type": "vector",
"similarity": self.parse_metric(),
"numDimensions": None, # Will be set in MongoDB class
"path": "vector", # Vector field name
}
],
}

def search_param(self) -> dict:
return {"numCandidates": self.num_candidates if not self.exact_search else None, "exact": self.exact_search}
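A brief usage sketch for the new case config; the dimension value and the exact call pattern of the consuming MongoDB client are assumptions, not taken from this diff:

from vectordb_bench.backend.clients.api import MetricType
from vectordb_bench.backend.clients.mongodb.config import MongoDBIndexConfig

case_config = MongoDBIndexConfig(metric_type=MetricType.COSINE, num_candidates=1500)

index_def = case_config.index_param()
index_def["fields"][0]["numDimensions"] = 768  # the client fills this in once the dim is known

search = case_config.search_param()
# {"numCandidates": 1500, "exact": False}; with exact_search=True,
# numCandidates becomes None and an exact (ENN) $vectorSearch is requested.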