Skip to content

add clickhouse engine to benchmark #233

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions engine/clients/clickhouse/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from engine.clients.clickhouse.configure import CHVectorConfigurator
from engine.clients.clickhouse.search import CHVectorSearcher
from engine.clients.clickhouse.upload import CHVectorUploader

__all__ = ["CHVectorUploader", "CHVectorSearcher", "CHVectorConfigurator"]
16 changes: 16 additions & 0 deletions engine/clients/clickhouse/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import os

# Connection defaults; each can be overridden via environment variables.
CLICKHOUSE_PORT = int(os.getenv("CLICKHOUSE_PORT", 8123))
CLICKHOUSE_HOST = os.getenv("CLICKHOUSE_HOST", "localhost")
CLICKHOUSE_USER = os.getenv("CLICKHOUSE_USER", "clickhouse")
CLICKHOUSE_PASSWORD = os.getenv("CLICKHOUSE_PASSWORD", "passwd")


def get_db_config(connection_params):
    """Build the keyword arguments for a clickhouse-connect client.

    Environment-derived defaults form the base configuration; any key
    present in *connection_params* overrides the matching default.
    """
    db_config = {
        "host": CLICKHOUSE_HOST,
        "port": CLICKHOUSE_PORT,
        "user": CLICKHOUSE_USER,
        "password": CLICKHOUSE_PASSWORD,
    }
    db_config.update(connection_params)
    return db_config
39 changes: 39 additions & 0 deletions engine/clients/clickhouse/configure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import clickhouse_connect

from benchmark.dataset import Dataset
from engine.base_client import IncompatibilityError
from engine.base_client.configure import BaseConfigurator
from engine.base_client.distances import Distance
from engine.clients.clickhouse.config import get_db_config


class CHVectorConfigurator(BaseConfigurator):
    """Creates and drops the ClickHouse table used by the benchmark."""

    def __init__(self, host, collection_params: dict, connection_params: dict):
        super().__init__(host, collection_params, connection_params)
        # Removed leftover debug print("configure connection created").
        self.client = clickhouse_connect.driver.create_client(
            **get_db_config(connection_params)
        )

    def clean(self):
        """Drop the benchmark table if it exists."""
        self.client.command(
            cmd="DROP TABLE IF EXISTS items;",
        )

    def recreate(self, dataset: Dataset, collection_params):
        """Create the ``items`` table for the given dataset.

        Raises:
            IncompatibilityError: DOT distance is not supported (only
                cosine and L2 queries are implemented by the searcher).
        """
        if dataset.config.distance == Distance.DOT:
            raise IncompatibilityError

        self.client.command(
            cmd="""CREATE TABLE items (
            id UInt64,
            embedding Array(Float64)
            )
            ENGINE = MergeTree()
            ORDER BY id
            ;"""
        )
        # Bug fix: do not close the client here. delete_client() owns the
        # connection lifecycle; closing in recreate() left the configurator
        # with a dead client for any subsequent command and caused a
        # redundant double close.

    def delete_client(self):
        """Close the underlying clickhouse-connect client."""
        self.client.close()
46 changes: 46 additions & 0 deletions engine/clients/clickhouse/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import json
from typing import Any, List, Optional

from engine.base_client import IncompatibilityError
from engine.base_client.parser import BaseConditionParser, FieldValue


class CHVectorConditionParser(BaseConditionParser):
    """Translates benchmark filter trees into ClickHouse SQL WHERE fragments."""

    def build_condition(
        self, and_subfilters: Optional[List[Any]], or_subfilters: Optional[List[Any]]
    ) -> Optional[Any]:
        """Combine OR- and AND-subfilters into one boolean SQL expression."""
        clauses = []
        if or_subfilters is not None and len(or_subfilters) > 0:
            clauses.append(f"( {' OR '.join(or_subfilters)} )")
        if and_subfilters is not None and len(and_subfilters) > 0:
            clauses.append(f"( {' AND '.join(and_subfilters)} )")

        return " AND ".join(clauses)

    def build_exact_match_filter(self, field_name: str, value: FieldValue) -> Any:
        """Render an equality comparison against a literal value.

        Bug fix: json.dumps() wraps strings in double quotes, but ClickHouse
        treats double-quoted tokens as identifiers (column names), not string
        literals — string filters silently compared the wrong thing. String
        values must be single-quoted with backslash/quote escaping.
        """
        if isinstance(value, str):
            escaped = value.replace("\\", "\\\\").replace("'", "\\'")
            return f"{field_name} == '{escaped}'"
        # Numbers and booleans serialize correctly via JSON (true/false are
        # valid ClickHouse literals).
        return f"{field_name} == {json.dumps(value)}"

    def build_range_filter(
        self,
        field_name: str,
        lt: Optional[FieldValue],
        gt: Optional[FieldValue],
        lte: Optional[FieldValue],
        gte: Optional[FieldValue],
    ) -> Any:
        """AND together whichever of the four range bounds are present."""
        clauses = []
        if lt is not None:
            clauses.append(f"{field_name} < {lt}")
        if gt is not None:
            clauses.append(f"{field_name} > {gt}")
        if lte is not None:
            clauses.append(f"{field_name} <= {lte}")
        if gte is not None:
            clauses.append(f"{field_name} >= {gte}")
        return f"( {' AND '.join(clauses)} )"

    def build_geo_filter(
        self, field_name: str, lat: float, lon: float, radius: float
    ) -> Any:
        # TODO: Implement this
        raise IncompatibilityError
48 changes: 48 additions & 0 deletions engine/clients/clickhouse/search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from typing import List, Tuple

import clickhouse_connect
from clickhouse_connect.driver.query import QueryResult

from dataset_reader.base_reader import Query
from engine.base_client.distances import Distance
from engine.base_client.search import BaseSearcher
from engine.clients.clickhouse.config import get_db_config
from engine.clients.clickhouse.parser import CHVectorConditionParser


class CHVectorSearcher(BaseSearcher):
    """Runs nearest-neighbour queries against the ``items`` table."""

    distance = None
    search_params = {}
    parser = CHVectorConditionParser()

    @classmethod
    def init_client(cls, host, distance, connection_params: dict, search_params: dict):
        """Open the client and pre-build the distance-specific query string."""
        cls.client = clickhouse_connect.driver.create_client(
            **get_db_config(connection_params)
        )
        # NOTE(review): OFFSET 1 drops the single nearest row — presumably to
        # exclude the query point when it is part of the dataset; confirm,
        # because for query vectors absent from the table this discards the
        # true top hit and hurts recall. {top:UInt8} also caps `top` at 255.
        if distance == Distance.COSINE:
            cls.query: str = (
                "SELECT id, cosineDistance(embedding, {vector:Array(Float64)}) AS _score FROM items ORDER BY _score LIMIT {top:UInt8} OFFSET 1"
            )
        elif distance == Distance.L2:
            cls.query: str = (
                "SELECT id, L2Distance(embedding, {vector:Array(Float64)}) AS _score FROM items ORDER BY _score LIMIT {top:UInt8} OFFSET 1"
            )
        else:
            # Bug fix: report the metric actually passed in; cls.distance is
            # still None at this point, so the old message always printed
            # "Unsupported distance metric None".
            raise NotImplementedError(f"Unsupported distance metric {distance}")

    @classmethod
    def search_one(cls, query: Query, top) -> List[Tuple[int, float]]:
        """Return (id, score) rows for one query vector.

        TODO: Use query.metaconditions for datasets with filtering.
        """
        query_summary: QueryResult = cls.client.query(
            cls.query, parameters={"vector": query.vector, "top": top}
        )
        return query_summary.result_rows

    @classmethod
    def delete_client(cls):
        """Close the underlying clickhouse-connect client."""
        cls.client.close()
60 changes: 60 additions & 0 deletions engine/clients/clickhouse/upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from typing import List

import clickhouse_connect

from dataset_reader.base_reader import Record
from engine.base_client import IncompatibilityError
from engine.base_client.distances import Distance
from engine.base_client.upload import BaseUploader
from engine.clients.clickhouse.config import get_db_config


class CHVectorUploader(BaseUploader):
    """Bulk-inserts benchmark records into the ``items`` table."""

    # NOTE(review): the values are pgvector operator-class names copied from
    # the pgvector client; only the KEYS are used (as the supported-distance
    # set in post_upload) until a ClickHouse vector index is implemented.
    DISTANCE_MAPPING = {
        Distance.L2: "vector_l2_ops",
        Distance.COSINE: "vector_cosine_ops",
    }
    client = None
    upload_params = {}

    @classmethod
    def init_client(cls, host, distance, connection_params, upload_params):
        """Open the clickhouse-connect client and remember upload params."""
        cls.client = clickhouse_connect.driver.create_client(
            **get_db_config(connection_params)
        )
        cls.upload_params = upload_params

    @classmethod
    def upload_batch(cls, batch: List[Record]):
        """Insert one batch column-oriented (ids column + embeddings column)."""
        ids, vectors = [], []
        for record in batch:
            ids.append(record.id)
            vectors.append(record.vector)

        cls.client.insert(
            table="items",
            data=[ids, vectors],
            column_names=["id", "embedding"],
            column_type_names=["UInt64", "Array(Float64)"],
            column_oriented=True,
        )

    @classmethod
    def post_upload(cls, distance):
        """Validate the distance metric after upload.

        Raises:
            IncompatibilityError: for metrics outside DISTANCE_MAPPING.
        """
        # Explicit membership check instead of computing an unused local
        # just to trigger KeyError.
        if distance not in cls.DISTANCE_MAPPING:
            raise IncompatibilityError(f"Unsupported distance metric: {distance}")

        # TODO: create a vector similarity index here; the commented-out
        # pgvector CREATE INDEX from the original does not apply to ClickHouse.
        return {}

    @classmethod
    def delete_client(cls):
        """Close the underlying clickhouse-connect client."""
        cls.client.close()
8 changes: 8 additions & 0 deletions engine/clients/client_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@
BaseSearcher,
BaseUploader,
)
from engine.clients.clickhouse import (
CHVectorConfigurator,
CHVectorSearcher,
CHVectorUploader,
)
from engine.clients.elasticsearch import (
ElasticConfigurator,
ElasticSearcher,
Expand Down Expand Up @@ -39,6 +44,7 @@
"opensearch": OpenSearchConfigurator,
"redis": RedisConfigurator,
"pgvector": PgVectorConfigurator,
"clickhouse": CHVectorConfigurator,
}

ENGINE_UPLOADERS = {
Expand All @@ -49,6 +55,7 @@
"opensearch": OpenSearchUploader,
"redis": RedisUploader,
"pgvector": PgVectorUploader,
"clickhouse": CHVectorUploader,
}

ENGINE_SEARCHERS = {
Expand All @@ -59,6 +66,7 @@
"opensearch": OpenSearchSearcher,
"redis": RedisSearcher,
"pgvector": PgVectorSearcher,
"clickhouse": CHVectorSearcher,
}


Expand Down
22 changes: 22 additions & 0 deletions engine/servers/clickhouse-single-node/docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
services:
  clickhouse:
    container_name: clickhouse
    user: "clickhouse:clickhouse"
    hostname: clickhouse
    image: clickhouse/clickhouse-server:25.1-alpine
    environment:
      - CLICKHOUSE_DB=clickhouse
      - CLICKHOUSE_USER=clickhouse
      - CLICKHOUSE_PASSWORD=passwd
      - CLICKHOUSE_PORT=8123
    ports:
      - 8123:8123
    logging:
      driver: "json-file"
      options:
        # json-file driver options must be strings; unquoted YAML scalars
        # (1, 10m) are rejected/misparsed by the logging driver.
        max-file: "1"
        max-size: "10m"
    deploy:
      resources:
        limits:
          memory: 9Gb
90 changes: 90 additions & 0 deletions experiments/configurations/clickhouse-single-node.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
[
{
"name": "clickhouse-default",
"engine": "clickhouse",
"connection_params": {},
"collection_params": {},
"search_params": [
{ "parallel": 8, "config": { "hnsw_ef": 128 } }
],
"upload_params": { "parallel": 16, "batch_size": 4096}
},
{
"name": "clickhouse-parallel",
"engine": "clickhouse",
"connection_params": {},
"collection_params": {},
"search_params": [
{ "parallel": 8, "config": { "hnsw_ef": 128 } },
{ "parallel": 16, "config": { "hnsw_ef": 128 } },
{ "parallel": 100, "config": { "hnsw_ef": 128 } }
],
"upload_params": { "parallel": 1, "batch_size": 2048}
},
{
"name": "clickhouse-m-16-ef-128",
"engine": "clickhouse",
"connection_params": {},
"collection_params": {},
"search_params": [
{ "parallel": 1, "config": { "hnsw_ef": 64 } }, { "parallel": 1, "config": { "hnsw_ef": 128 } }, { "parallel": 1, "config": { "hnsw_ef": 256 } }, { "parallel": 1, "config": { "hnsw_ef": 512 } },
{ "parallel": 100, "config": { "hnsw_ef": 64 } }, { "parallel": 100, "config": { "hnsw_ef": 128 } }, { "parallel": 100, "config": { "hnsw_ef": 256 } }, { "parallel": 100, "config": { "hnsw_ef": 512 } }
],
"upload_params": { "parallel": 16}
},
{
"name": "clickhouse-m-32-ef-128",
"engine": "clickhouse",
"connection_params": {},
"collection_params": {},
"search_params": [
{ "parallel": 1, "config": { "hnsw_ef": 64 } }, { "parallel": 1, "config": { "hnsw_ef": 128 } }, { "parallel": 1, "config": { "hnsw_ef": 256 } }, { "parallel": 1, "config": { "hnsw_ef": 512 } },
{ "parallel": 100, "config": { "hnsw_ef": 64 } }, { "parallel": 100, "config": { "hnsw_ef": 128 } }, { "parallel": 100, "config": { "hnsw_ef": 256 } }, { "parallel": 100, "config": { "hnsw_ef": 512 } }
],
"upload_params": { "parallel": 16 }
},
{
"name": "clickhouse-m-32-ef-256",
"engine": "clickhouse",
"connection_params": {},
"collection_params": {},
"search_params": [
{ "parallel": 1, "config": { "hnsw_ef": 64 } }, { "parallel": 1, "config": { "hnsw_ef": 128 } }, { "parallel": 1, "config": { "hnsw_ef": 256 } }, { "parallel": 1, "config": { "hnsw_ef": 512 } },
{ "parallel": 100, "config": { "hnsw_ef": 64 } }, { "parallel": 100, "config": { "hnsw_ef": 128 } }, { "parallel": 100, "config": { "hnsw_ef": 256 } }, { "parallel": 100, "config": { "hnsw_ef": 512 } }
],
"upload_params": { "parallel": 16 }
},
{
"name": "clickhouse-m-32-ef-512",
"engine": "clickhouse",
"connection_params": {},
"collection_params": {},
"search_params": [
{ "parallel": 1, "config": { "hnsw_ef": 64 } }, { "parallel": 1, "config": { "hnsw_ef": 128 } }, { "parallel": 1, "config": { "hnsw_ef": 256 } }, { "parallel": 1, "config": { "hnsw_ef": 512 } },
{ "parallel": 100, "config": { "hnsw_ef": 64 } }, { "parallel": 100, "config": { "hnsw_ef": 128 } }, { "parallel": 100, "config": { "hnsw_ef": 256 } }, { "parallel": 100, "config": { "hnsw_ef": 512 } }
],
"upload_params": { "parallel": 16 }
},
{
"name": "clickhouse-m-64-ef-256",
"engine": "clickhouse",
"connection_params": {},
"collection_params": {},
"search_params": [
{ "parallel": 1, "config": { "hnsw_ef": 64 } }, { "parallel": 1, "config": { "hnsw_ef": 128 } }, { "parallel": 1, "config": { "hnsw_ef": 256 } }, { "parallel": 1, "config": { "hnsw_ef": 512 } },
{ "parallel": 100, "config": { "hnsw_ef": 64 } }, { "parallel": 100, "config": { "hnsw_ef": 128 } }, { "parallel": 100, "config": { "hnsw_ef": 256 } }, { "parallel": 100, "config": { "hnsw_ef": 512 } }
],
"upload_params": { "parallel": 16}
},
{
"name": "clickhouse-m-64-ef-512",
"engine": "clickhouse",
"connection_params": {},
"collection_params": {},
"search_params": [
{ "parallel": 1, "config": { "hnsw_ef": 64 } }, { "parallel": 1, "config": { "hnsw_ef": 128 } }, { "parallel": 1, "config": { "hnsw_ef": 256 } }, { "parallel": 1, "config": { "hnsw_ef": 512 } },
{ "parallel": 100, "config": { "hnsw_ef": 64 } }, { "parallel": 100, "config": { "hnsw_ef": 128 } }, { "parallel": 100, "config": { "hnsw_ef": 256 } }, { "parallel": 100, "config": { "hnsw_ef": 512 } }
],
"upload_params": { "parallel": 16 }
}
]
Loading