diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index f3bb76f7e..d1ec10078 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -4,7 +4,6 @@ on:
   push:
     branches:
       - release-*
-      - '*wheel*'  # must quote since "*" is a YAML reserved character; we want a string
     tags:
      - '*'
  pull_request:
diff --git a/Dockerfile b/Dockerfile
index 7813a782a..26ca8da6c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,7 +8,7 @@ RUN conda config --prepend channels conda-forge
 # Install mamba for faster installations
 RUN conda install mamba
 
-RUN mamba install -y -c tiledb 'tiledb>=2.17,<2.18' tiledb-py cmake pybind11 pytest c-compiler cxx-compiler ninja openblas-devel "pip>22"
+RUN mamba install -y -c tiledb 'tiledb>=2.16,<2.17' tiledb-py cmake pybind11 pytest c-compiler cxx-compiler ninja openblas-devel "pip>22"
 
 COPY . TileDB-Vector-Search/
diff --git a/apis/python/CMakeLists.txt b/apis/python/CMakeLists.txt
index 46a7e2147..8246aa9e1 100644
--- a/apis/python/CMakeLists.txt
+++ b/apis/python/CMakeLists.txt
@@ -49,7 +49,10 @@ find_package(pybind11 CONFIG REQUIRED)
 
 set(VSPY_TARGET_NAME _tiledbvspy)
 
-python_add_library(${VSPY_TARGET_NAME} MODULE "src/tiledb/vector_search/module.cc" WITH_SOABI)
+python_add_library(${VSPY_TARGET_NAME} MODULE
+  "src/tiledb/vector_search/module.cc"
+  "src/tiledb/vector_search/kmeans.cc"
+  WITH_SOABI)
 
 target_link_libraries(${VSPY_TARGET_NAME}
   PRIVATE
diff --git a/apis/python/pyproject.toml b/apis/python/pyproject.toml
index 8cfa02139..d7ca75e17 100644
--- a/apis/python/pyproject.toml
+++ b/apis/python/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "tiledb-vector-search"
-version = "0.0.14"
+version = "0.0.10"
 #dynamic = ["version"]
 description = "TileDB Vector Search Python client"
 license = { text = "MIT" }
diff --git a/apis/python/src/tiledb/vector_search/index.py b/apis/python/src/tiledb/vector_search/index.py
index e710280dc..6277d8a50 100644
--- a/apis/python/src/tiledb/vector_search/index.py
+++ b/apis/python/src/tiledb/vector_search/index.py
@@ -1,5 +1,3 @@
-import concurrent.futures as futures
-import os
 import numpy as np
 import sys
@@ -22,7 +20,6 @@ class Index:
     config: Optional[Mapping[str, Any]]
         config dictionary, defaults to None
     """
-
     def __init__(
         self,
         uri: str,
@@ -39,28 +36,16 @@ def __init__(
         self.storage_version = self.group.meta.get("storage_version", "0.1")
         self.update_arrays_uri = None
         self.index_version = self.group.meta.get("index_version", "")
-        self.thread_executor = futures.ThreadPoolExecutor()
 
+    def query(self, queries: np.ndarray, k, **kwargs):
+        updated_ids = set(self.read_updated_ids())
+        retrieval_k = k
+        if len(updated_ids) > 0:
+            retrieval_k = 2*k
+        internal_results_d, internal_results_i = self.query_internal(queries, retrieval_k, **kwargs)
         if self.update_arrays_uri is None:
-            return self.query_internal(queries, k, **kwargs)
-
-        # Query with updates
-        # Perform the queries in parallel
-        retrieval_k = 2 * k
-        kwargs["nthreads"] = int(os.cpu_count() / 2)
-        future = self.thread_executor.submit(
-            Index.query_additions,
-            queries,
-            k,
-            self.dtype,
-            self.update_arrays_uri,
-            int(os.cpu_count() / 2),
-        )
-        internal_results_d, internal_results_i = self.query_internal(
-            queries, retrieval_k, **kwargs
-        )
-        addition_results_d, addition_results_i, updated_ids = future.result()
+            return internal_results_d[:, 0:k], internal_results_i[:, 0:k]
 
         # Filter updated vectors
         query_id = 0
@@ -70,12 +55,6 @@ def query(self, queries: np.ndarray, k, **kwargs):
                 if res in updated_ids:
                    internal_results_d[query_id, res_id] = MAX_FLOAT_32
                    internal_results_i[query_id, res_id] = MAX_UINT64
-                if (
-                    internal_results_d[query_id, res_id] == 0
-                    and internal_results_i[query_id, res_id] == 0
-                ):
-                    internal_results_d[query_id, res_id] = MAX_FLOAT_32
-                    internal_results_i[query_id, res_id] = MAX_UINT64
                 res_id += 1
             query_id += 1
         sort_index = np.argsort(internal_results_d, axis=1)
@@ -83,6 +62,7 @@ def query(self, queries: np.ndarray, k, **kwargs):
         internal_results_i = np.take_along_axis(internal_results_i, sort_index, axis=1)
 
         # Merge update results
+        addition_results_d, addition_results_i = self.query_additions(queries, k)
         if addition_results_d is None:
             return internal_results_d[:, 0:k], internal_results_i[:, 0:k]
@@ -90,15 +70,13 @@ def query(self, queries: np.ndarray, k, **kwargs):
         for query in addition_results_d:
             res_id = 0
             for res in query:
-                if (
-                    addition_results_d[query_id, res_id] == 0
-                    and addition_results_i[query_id, res_id] == 0
-                ):
+                if addition_results_d[query_id, res_id] == 0 and addition_results_i[query_id, res_id] == 0:
                     addition_results_d[query_id, res_id] = MAX_FLOAT_32
                     addition_results_i[query_id, res_id] = MAX_UINT64
                 res_id += 1
             query_id += 1
+
         results_d = np.hstack((internal_results_d, addition_results_d))
         results_i = np.hstack((internal_results_i, addition_results_i))
         sort_index = np.argsort(results_d, axis=1)
@@ -106,101 +84,90 @@ def query(self, queries: np.ndarray, k, **kwargs):
         results_i = np.take_along_axis(results_i, sort_index, axis=1)
         return results_d[:, 0:k], results_i[:, 0:k]
 
-    @staticmethod
-    def query_additions(
-        queries: np.ndarray, k, dtype, update_arrays_uri, nthreads=8
-    ):
+    def query_internal(self, queries: np.ndarray, k, **kwargs):
+        raise NotImplementedError
+
+    def query_additions(self, queries: np.ndarray, k):
         assert queries.dtype == np.float32
-        additions_vectors, additions_external_ids, updated_ids = Index.read_additions(
-            update_arrays_uri
-        )
+        additions_vectors, additions_external_ids = self.read_additions()
         if additions_vectors is None:
-            return None, None, updated_ids
-
+            return None, None
         queries_m = array_to_matrix(np.transpose(queries))
         d, i = query_vq_heap_pyarray(
-            array_to_matrix(np.transpose(additions_vectors).astype(dtype)),
+            array_to_matrix(np.transpose(additions_vectors).astype(self.dtype)),
             queries_m,
             StdVector_u64(additions_external_ids),
             k,
-            nthreads,
-        )
-        return np.transpose(np.array(d)), np.transpose(np.array(i)), updated_ids
-
-    @staticmethod
-    def read_additions(update_arrays_uri) -> (np.ndarray, np.array):
-        if update_arrays_uri is None:
-            return None, None, np.array([], np.uint64)
-        updates_array = tiledb.open(update_arrays_uri, mode="r")
-        q = updates_array.query(attrs=("vector",), coords=True)
-        data = q[:]
-        updates_array.close()
-        updated_ids = data["external_id"]
-        additions_filter = [len(item) > 0 for item in data["vector"]]
-        if len(data["external_id"][additions_filter]) > 0:
-            return (
-                np.vstack(data["vector"][additions_filter]),
-                data["external_id"][additions_filter],
-                updated_ids
-            )
-        else:
-            return None, None, updated_ids
-
-    def query_internal(self, queries: np.ndarray, k, **kwargs):
-        raise NotImplementedError
+            8)
+        return np.transpose(np.array(d)), np.transpose(np.array(i))
 
     def update(self, vector: np.array, external_id: np.uint64):
         updates_array = self.open_updates_array()
-        vectors = np.empty((1), dtype="O")
+        vectors = np.empty((1), dtype='O')
         vectors[0] = vector
-        updates_array[external_id] = {"vector": vectors}
+        updates_array[external_id] = {'vector': vectors}
        updates_array.close()
        self.consolidate_update_fragments()

    def update_batch(self, vectors: np.ndarray, external_ids: np.array):
        updates_array = self.open_updates_array()
-        updates_array[external_ids] = {"vector": vectors}
+        updates_array[external_ids] = {'vector': vectors}
         updates_array.close()
         self.consolidate_update_fragments()
 
     def delete(self, external_id: np.uint64):
         updates_array = self.open_updates_array()
-        deletes = np.empty((1), dtype="O")
+        deletes = np.empty((1), dtype='O')
         deletes[0] = np.array([], dtype=self.dtype)
-        updates_array[external_id] = {"vector": deletes}
+        updates_array[external_id] = {'vector': deletes}
         updates_array.close()
         self.consolidate_update_fragments()
 
     def delete_batch(self, external_ids: np.array):
         updates_array = self.open_updates_array()
-        deletes = np.empty((len(external_ids)), dtype="O")
+        deletes = np.empty((len(external_ids)), dtype='O')
         for i in range(len(external_ids)):
             deletes[i] = np.array([], dtype=self.dtype)
-        updates_array[external_ids] = {"vector": deletes}
+        updates_array[external_ids] = {'vector': deletes}
         updates_array.close()
         self.consolidate_update_fragments()
 
     def consolidate_update_fragments(self):
         fragments_info = tiledb.array_fragments(self.update_arrays_uri)
-        if len(fragments_info) > 10:
+        if(len(fragments_info) > 10):
             tiledb.consolidate(self.update_arrays_uri)
             tiledb.vacuum(self.update_arrays_uri)
 
     def get_updates_uri(self):
         return self.update_arrays_uri
 
+    def read_additions(self) -> (np.ndarray, np.array):
+        if self.update_arrays_uri is None:
+            return None, None
+        updates_array = tiledb.open(self.update_arrays_uri, mode="r")
+        q = updates_array.query(attrs=('vector',), coords=True)
+        data = q[:]
+        additions_filter = [len(item) > 0 for item in data["vector"]]
+        if len(data["external_id"][additions_filter]) > 0:
+            return np.vstack(data["vector"][additions_filter]), data["external_id"][additions_filter]
+        else:
+            return None, None
+
+    def read_updated_ids(self) -> np.array:
+        if self.update_arrays_uri is None:
+            return np.array([], np.uint64)
+        updates_array = tiledb.open(self.update_arrays_uri, mode="r")
+        q = updates_array.query(attrs=('vector',), coords=True)
+        data = q[:]
+        return data["external_id"]
+
     def open_updates_array(self):
         if self.update_arrays_uri is None:
-            updates_array_name = storage_formats[self.storage_version][
-                "UPDATES_ARRAY_NAME"
-            ]
+            updates_array_name = storage_formats[self.storage_version]["UPDATES_ARRAY_NAME"]
             updates_array_uri = f"{self.group.uri}/{updates_array_name}"
             if tiledb.array_exists(updates_array_uri):
                 raise RuntimeError(f"Array {updates_array_uri} already exists.")
             external_id_dim = tiledb.Dim(
-                name="external_id",
-                domain=(0, MAX_UINT64 - 1),
-                dtype=np.dtype(np.uint64),
+                name="external_id", domain=(0, MAX_UINT64-1), dtype=np.dtype(np.uint64)
             )
             dom = tiledb.Domain(external_id_dim)
             vector_attr = tiledb.Attr(name="vector", dtype=self.dtype, var=True)
@@ -221,18 +188,13 @@ def open_updates_array(self):
 
     def consolidate_updates(self):
         from tiledb.vector_search.ingestion import ingest
-
         new_index = ingest(
             index_type=self.index_type,
             index_uri=self.uri,
             size=self.size,
             source_uri=self.db_uri,
             external_ids_uri=self.ids_uri,
-            updates_uri=self.update_arrays_uri,
+            updates_uri=self.update_arrays_uri
         )
         tiledb.Array.delete_array(self.update_arrays_uri)
-        self.group.close()
-        self.group = tiledb.Group(self.uri, "w", ctx=tiledb.Ctx(self.config))
-        self.group.remove(self.update_arrays_uri)
-        self.group.close()
         return new_index
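The `index.py` changes above replace the threaded update path with a synchronous one: `query()` reads the updated ids up front, over-fetches with `retrieval_k = 2*k`, masks hits whose ids appear in the updates array, then merges in `query_additions` results. For reference, a minimal usage sketch of that flow, assuming an existing index (the URI and vector dimensions are illustrative, not taken from this PR):

```python
import numpy as np
from tiledb.vector_search.flat_index import FlatIndex

index = FlatIndex(uri="/tmp/my_index")  # hypothetical pre-built index

# Stage one replacement and one delete; both are buffered in the updates array.
index.update(vector=np.random.rand(128).astype(index.dtype), external_id=np.uint64(42))
index.delete(external_id=np.uint64(7))

# query() now over-fetches 2*k internal results, masks updated ids,
# merges the additions, and returns the top k of the merged set.
queries = np.random.rand(4, 128).astype(np.float32)
distances, ids = index.query(queries, k=10)
```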
diff --git a/apis/python/src/tiledb/vector_search/ingestion.py b/apis/python/src/tiledb/vector_search/ingestion.py
index 3798e0157..a829a1463 100644
--- a/apis/python/src/tiledb/vector_search/ingestion.py
+++ b/apis/python/src/tiledb/vector_search/ingestion.py
@@ -3,6 +3,7 @@
 from tiledb.cloud.dag import Mode
 from tiledb.vector_search._tiledbvspy import *
+from tiledb.vector_search.module import kmeans_predict
 
 import numpy as np
@@ -27,6 +28,7 @@ def ingest(
     input_vectors_per_work_item: int = -1,
     verbose: bool = False,
     trace_id: Optional[str] = None,
+    use_sklearn: bool = False,
     mode: Mode = Mode.LOCAL,
 ):
     """
@@ -78,6 +80,9 @@ def ingest(
         verbose logging, defaults to False
     trace_id: Optional[str]
         trace ID for logging, defaults to None
+    use_sklearn: bool
+        Whether to use scikit-learn's implementation of k-means clustering instead of
+        tiledb.vector_search's. Defaults to false.
     mode: Mode
         execution mode, defaults to LOCAL use BATCH for distributed execution
     """
@@ -695,11 +700,11 @@ def read_additions(
         logger.debug(
             "Reading additions vectors"
         )
-        with tiledb.open(updates_uri, mode="r") as updates_array:
-            q = updates_array.query(attrs=('vector',), coords=True)
-            data = q[:]
-            additions_filter = [len(item) > 0 for item in data["vector"]]
-            return np.vstack(data["vector"][additions_filter]), data["external_id"][additions_filter]
+        updates_array = tiledb.open(updates_uri, mode="r")
+        q = updates_array.query(attrs=('vector',), coords=True)
+        data = q[:]
+        additions_filter = [len(item) > 0 for item in data["vector"]]
+        return np.vstack(data["vector"][additions_filter]), data["external_id"][additions_filter]
 
     def read_updated_ids(
         updates_uri: str,
@@ -713,10 +718,10 @@ def read_updated_ids(
         logger.debug(
             "Reading updated vector ids"
         )
-        with tiledb.open(updates_uri, mode="r") as updates_array:
-            q = updates_array.query(attrs=('vector',), coords=True)
-            data = q[:]
-            return data["external_id"]
+        updates_array = tiledb.open(updates_uri, mode="r")
+        q = updates_array.query(attrs=('vector',), coords=True)
+        data = q[:]
+        return data["external_id"]
 
     def read_input_vectors(
         source_uri: str,
@@ -848,9 +853,14 @@ def centralised_kmeans(
         config: Optional[Mapping[str, Any]] = None,
         verbose: bool = False,
         trace_id: Optional[str] = None,
+        use_sklearn: bool = False
     ):
         from sklearn.cluster import KMeans
+        from tiledb.vector_search.module import (
+            array_to_matrix,
+            kmeans_fit,
+        )
 
         with tiledb.scope_ctx(ctx_or_config=config):
             logger = setup(config, verbose)
             group = tiledb.Group(index_group_uri)
@@ -866,23 +876,23 @@ def centralised_kmeans(
                 verbose=verbose,
                 trace_id=trace_id,
             ).astype(np.float32)
-            verb = 0
-            if verbose:
-                verb = 3
             logger.debug("Start kmeans training")
-            km = KMeans(
-                n_clusters=partitions,
-                init=init,
-                max_iter=max_iter,
-                verbose=verb,
-                n_init=n_init,
-            )
-            km.fit_predict(sample_vectors)
+            if use_sklearn:
+                km = KMeans(
+                    n_clusters=partitions,
+                    init=init,
+                    max_iter=max_iter,
+                    verbose=3 if verbose else 0,
+                    n_init=n_init,
+                )
+                km.fit(np.transpose(sample_vectors))
+                clusters = km.cluster_centers_
+            else:
+                clusters = kmeans_fit(partitions, init, max_iter, verbose, n_init, array_to_matrix(np.transpose(sample_vectors)))
+                clusters = np.array(clusters)
             logger.debug("Writing centroids to array %s", centroids_uri)
             with tiledb.open(centroids_uri, mode="w") as A:
-                A[0:dimensions, 0:partitions] = np.transpose(
-                    np.array(km.cluster_centers_)
-                )
+                A[0:dimensions, 0:partitions] = clusters
 
     # --------------------------------------------------------------------
     # distributed kmeans UDFs
@@ -927,6 +937,7 @@ def assign_points_and_partial_new_centroids(
        config: Optional[Mapping[str, Any]] = None,
        verbose: bool = False,
        trace_id: Optional[str] = None,
+        use_sklearn: bool = False,
     ):
         from sklearn.cluster import KMeans
@@ -1026,10 +1037,13 @@ def update_centroids():
             )
             logger.debug("Input centroids: %s", centroids[0:5])
             logger.debug("Assigning vectors to centroids")
-            km = KMeans()
-            km._n_threads = threads
-            km.cluster_centers_ = centroids
-            assignments = km.predict(vectors)
+            if use_sklearn:
+                km = KMeans()
+                km.n_threads_ = threads
+                km.cluster_centers_ = centroids
+                assignments = km.predict(vectors)
+            else:
+                assignments = kmeans_predict(centroids, vectors)
             logger.debug("Assignments: %s", assignments[0:100])
             partial_new_centroids = update_centroids()
             logger.debug("New centroids: %s", partial_new_centroids[0:5])
@@ -1464,6 +1478,7 @@ def create_ingestion_dag(
         config: Optional[Mapping[str, Any]] = None,
         verbose: bool = False,
         trace_id: Optional[str] = None,
+        use_sklearn: bool = False,
         mode: Mode = Mode.LOCAL,
     ) -> dag.DAG:
         if mode == Mode.BATCH:
@@ -1549,6 +1564,7 @@ def create_ingestion_dag(
                 config=config,
                 verbose=verbose,
                 trace_id=trace_id,
+                use_sklearn=use_sklearn,
                 name="kmeans",
                 resources={"cpu": "8", "memory": "32Gi"},
                 image_name=DEFAULT_IMG_NAME,
@@ -1594,6 +1610,7 @@ def create_ingestion_dag(
                     config=config,
                     verbose=verbose,
                     trace_id=trace_id,
+                    use_sklearn=use_sklearn,
                     name="k-means-part-" + str(task_id),
                     resources={"cpu": str(threads), "memory": "12Gi"},
                     image_name=DEFAULT_IMG_NAME,
@@ -1728,16 +1745,11 @@ def consolidate_and_vacuum(
         index_group_uri: str,
         config: Optional[Mapping[str, Any]] = None,
     ):
-        group = tiledb.Group(index_group_uri)
-        try:
-            if INPUT_VECTORS_ARRAY_NAME in group:
-                tiledb.Array.delete_array(group[INPUT_VECTORS_ARRAY_NAME].uri)
-            if EXTERNAL_IDS_ARRAY_NAME in group:
-                tiledb.Array.delete_array(group[EXTERNAL_IDS_ARRAY_NAME].uri)
-        except tiledb.TileDBError as err:
-            message = str(err)
-            if "does not exist" not in message:
-                raise err
+        group = tiledb.Group(index_group_uri, config=config)
+        if INPUT_VECTORS_ARRAY_NAME in group:
+            tiledb.Array.delete_array(group[INPUT_VECTORS_ARRAY_NAME].uri)
+        if EXTERNAL_IDS_ARRAY_NAME in group:
+            tiledb.Array.delete_array(group[EXTERNAL_IDS_ARRAY_NAME].uri)
         modes = ["fragment_meta", "commits", "array_meta"]
         for mode in modes:
             conf = tiledb.Config(config)
@@ -1907,6 +1919,7 @@ def consolidate_and_vacuum(
             config=config,
             verbose=verbose,
             trace_id=trace_id,
+            use_sklearn=use_sklearn,
             mode=mode,
         )
         logger.debug("Submitting ingestion graph")
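The new `use_sklearn` flag threads from `ingest()` through `create_ingestion_dag` into both `centralised_kmeans` and `assign_points_and_partial_new_centroids`, so a single switch selects scikit-learn or the native `kmeans_fit`/`kmeans_predict` bindings for both training and assignment. A minimal call sketch; every argument other than `use_sklearn` is illustrative, not prescribed by this PR:

```python
from tiledb.vector_search.ingestion import ingest

index = ingest(
    index_type="IVF_FLAT",
    index_uri="/tmp/my_ivf_index",    # hypothetical output location
    source_uri="/tmp/input_vectors",  # hypothetical input array
    partitions=100,
    use_sklearn=False,  # False -> the native k-means added in kmeans.cc
)
```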
diff --git a/apis/python/src/tiledb/vector_search/kmeans.cc b/apis/python/src/tiledb/vector_search/kmeans.cc
new file mode 100644
index 000000000..e818824de
--- /dev/null
+++ b/apis/python/src/tiledb/vector_search/kmeans.cc
@@ -0,0 +1,57 @@
+#include
+
+#include
+#include
+#include
+
+#include "linalg.h"
+#include "ivf_index.h"
+#include "ivf_query.h"
+#include "flat_query.h"
+
+namespace py = pybind11;
+using Ctx = tiledb::Context;
+
+namespace {
+
+template
+static void declare_kmeans(py::module& m, const std::string& suffix) {
+  m.def(("kmeans_fit_" + suffix).c_str(),
+        [](size_t n_clusters,
+           std::string init,
+           size_t max_iter,
+           bool verbose,
+           size_t n_init,
+           const ColMajorMatrix& sample_vectors,
+           std::optional tol,
+           std::optional seed,
+           std::optional nthreads) {
+          // TODO: support verbose and n_init
+          std::ignore = verbose;
+          std::ignore = n_init;
+          kmeans_init init_val;
+          if (init == "k-means++") {
+            init_val = kmeans_init::kmeanspp;
+          } else if (init == "random") {
+            init_val = kmeans_init::random;
+          } else {
+            throw std::invalid_argument("Invalid init method");
+          }
+          kmeans_index idx(sample_vectors.num_rows(), n_clusters, max_iter, tol.value_or(0.0001), nthreads, seed);
+          idx.train(sample_vectors, init_val);
+          return std::move(idx.get_centroids());
+        });
+
+  m.def(("kmeans_predict_" + suffix).c_str(),
+        [](const ColMajorMatrix& centroids,
+           const ColMajorMatrix& sample_vectors) {
+          return kmeans_index::predict(centroids, sample_vectors);
+        });
+}
+
+}  // anonymous namespace
+
+
+void init_kmeans(py::module_& m) {
+  declare_kmeans(m, "f32");
+}
diff --git a/apis/python/src/tiledb/vector_search/module.cc b/apis/python/src/tiledb/vector_search/module.cc
index 9bf0fea61..1e45cd438 100644
--- a/apis/python/src/tiledb/vector_search/module.cc
+++ b/apis/python/src/tiledb/vector_search/module.cc
@@ -11,6 +11,8 @@
 namespace py = pybind11;
 using Ctx = tiledb::Context;
 
+bool global_debug = false;
+
 bool enable_stats = false;
 std::vector core_stats;
 
@@ -124,8 +126,7 @@ static void declare_qv_query_heap_infinite_ram(py::module& m, const std::string&
            size_t k_nn,
            size_t nthreads) -> py::tuple { //std::pair, ColMajorMatrix> { // TODO change return type
-          // auto r = detail::ivf::qv_query_heap_infinite_ram(
-          auto r = detail::ivf::query_infinite_ram(
+          auto r = detail::ivf::qv_query_heap_infinite_ram(
               parts,
               centroids,
               query_vectors,
@@ -177,7 +178,7 @@ static void declare_nuv_query_heap_infinite_ram(py::module& m, const std::string
            std::vector& ids,
            size_t nprobe,
            size_t k_nn,
-           size_t nthreads) -> std::tuple, ColMajorMatrix> { // TODO change return type
+           size_t nthreads) -> std::tuple, ColMajorMatrix> { // TODO change return type
 
           auto r = detail::ivf::nuv_query_heap_infinite_ram_reg_blocked(
               parts,
@@ -204,7 +205,7 @@ static void declare_nuv_query_heap_finite_ram(py::module& m, const std::string&
            size_t nprobe,
            size_t k_nn,
            size_t upper_bound,
-           size_t nthreads) -> std::tuple, ColMajorMatrix> { // TODO change return type
+           size_t nthreads) -> std::tuple, ColMajorMatrix> { // TODO change return type
 
           auto r = detail::ivf::nuv_query_heap_finite_ram_reg_blocked(
               ctx,
@@ -279,7 +280,7 @@ static void declare_ivf_index_tdb(py::module& m, const std::string& suffix) {
       },
       py::keep_alive<1,2>());
 }
 
-template
+template
 static void declareFixedMinPairHeap(py::module& mod) {
   using PyFixedMinPairHeap = py::class_>;
   PyFixedMinPairHeap cls(mod, "FixedMinPairHeap", py::buffer_protocol());
@@ -356,7 +357,7 @@ void declareStdVector(py::module& m, const std::string& suffix) {
       });
 }
 
-template
+template
 void declarePartitionIvfIndex(py::module& m, const std::string& suffix) {
   m.def(("partition_ivf_index_" + suffix).c_str(),
         [](ColMajorMatrix& centroids,
@@ -400,7 +401,7 @@ static void declare_vq_query_heap(py::module& m, const std::string& suffix) {
         ColMajorMatrix& query_vectors,
         const std::vector &ids,
         int k,
-        size_t nthreads) -> std::tuple, ColMajorMatrix> {
+        size_t nthreads) -> std::tuple, ColMajorMatrix> {
       auto r = detail::flat::vq_query_heap(data, query_vectors, ids, k, nthreads);
       return r;
     });
@@ -413,7 +414,7 @@ static void declare_vq_query_heap_pyarray(py::module& m, const std::string& suff
         ColMajorMatrix& query_vectors,
         const std::vector &ids,
         int k,
-        size_t nthreads) -> std::tuple, ColMajorMatrix> {
+        size_t nthreads) -> std::tuple, ColMajorMatrix> {
      auto r = detail::flat::vq_query_heap(data, query_vectors, ids, k, nthreads);
      return r;
    });
 }
@@ -421,6 +422,7 @@ static void declare_vq_query_heap_pyarray(py::module& m, const std::string& suff
 
 } // anonymous namespace
 
+void init_kmeans(py::module&);
 
 PYBIND11_MODULE(_tiledbvspy, m) {
@@ -496,7 +498,7 @@ PYBIND11_MODULE(_tiledbvspy, m) {
       [](ColMajorMatrix& data,
          ColMajorMatrix& query_vectors,
          int k,
-         size_t nthreads) -> std::tuple, ColMajorMatrix> {
+         size_t nthreads) -> std::tuple, ColMajorMatrix> {
        auto r = detail::flat::vq_query_heap(data, query_vectors, k, nthreads);
        return r;
      });
@@ -505,13 +507,13 @@ PYBIND11_MODULE(_tiledbvspy, m) {
       [](tdbColMajorMatrix& data,
          ColMajorMatrix& query_vectors,
          int k,
-         size_t nthreads) -> std::tuple, ColMajorMatrix> {
+         size_t nthreads) -> std::tuple, ColMajorMatrix> {
         auto r = detail::flat::vq_query_heap(data, query_vectors, k, nthreads);
         return r;
       });
 
   m.def("validate_top_k_u64",
-        [](const ColMajorMatrix& top_k,
+        [](const ColMajorMatrix& top_k,
           const ColMajorMatrix& ground_truth) -> bool {
           return validate_top_k(top_k, ground_truth);
         });
@@ -534,11 +536,9 @@ PYBIND11_MODULE(_tiledbvspy, m) {
     return json{core_stats}.dump();
   });
 
-#if 0
   m.def("set_debug", [](bool debug) {
     global_debug = debug;
   });
-#endif
 
   declare_vq_query_heap(m, "u8");
   declare_vq_query_heap(m, "f32");
@@ -568,4 +568,6 @@ PYBIND11_MODULE(_tiledbvspy, m) {
   declare_dist_qv(m, "u8");
   declare_dist_qv(m, "f32");
   declareFixedMinPairHeap(m);
+
+  init_kmeans(m);
 }
diff --git a/apis/python/src/tiledb/vector_search/module.py b/apis/python/src/tiledb/vector_search/module.py
index 70d23e3e4..d5e3017bd 100644
--- a/apis/python/src/tiledb/vector_search/module.py
+++ b/apis/python/src/tiledb/vector_search/module.py
@@ -404,6 +404,37 @@ def array_to_matrix(array: np.ndarray):
     else:
         raise TypeError("Unsupported type!")
 
+def kmeans_fit(partitions: int, init: str, max_iter: int, verbose: bool, n_init: int, sample_vectors: "colMajorMatrix", tol: Optional[float] = None, nthreads: Optional[int] = None, seed: Optional[int] = None):
+    args = tuple(
+        [
+            partitions,
+            init,
+            max_iter,
+            verbose,
+            n_init,
+            sample_vectors,
+            tol,
+            nthreads,
+            seed,
+        ]
+    )
+    if sample_vectors.dtype == np.float32:
+        return kmeans_fit_f32(*args)
+    else:
+        raise TypeError("Unsupported type!")
+
+def kmeans_predict(centroids: "colMajorMatrix", sample_vectors: "colMajorMatrix"):
+    args = tuple(
+        [
+            centroids,
+            sample_vectors,
+        ]
+    )
+    if sample_vectors.dtype == np.float32:
+        return kmeans_predict_f32(*args)
+    else:
+        raise TypeError("Unsupported type!")
+
 # TODO
 # def load_partitioned(uri, partitions, dtype: Optional[np.dtype] = None):
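The wrappers above dispatch on the matrix dtype to the `_f32` bindings registered by `init_kmeans`. A small sketch of how they compose with `array_to_matrix`; note the transposes, since the C++ side expects column-major data with one vector per column:

```python
import numpy as np
from tiledb.vector_search.module import array_to_matrix, kmeans_fit, kmeans_predict

data = np.random.rand(1000, 16).astype(np.float32)

centroids = kmeans_fit(
    10,        # partitions (n_clusters)
    "random",  # init: "random" or "k-means++"
    16,        # max_iter
    False,     # verbose -- currently ignored by the C++ side (see the TODO above)
    1,         # n_init -- likewise currently ignored
    array_to_matrix(np.transpose(data)),
)
assignments = kmeans_predict(centroids, array_to_matrix(np.transpose(data)))
```

One thing worth a second look: the wrapper forwards its trailing optionals positionally as (`tol`, `nthreads`, `seed`), while the C++ lambda in kmeans.cc declares them as (`tol`, `seed`, `nthreads`); if that declared order is accurate, the last two are crossed in transit.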
    assert accuracy(result, gt_i, updated_ids=updated_ids) > MINIMUM_ACCURACY
@@ -348,10 +347,9 @@ def test_ivf_flat_ingestion_with_batch_updates(tmp_path):
 
     update_ids = {}
     updated_ids = {}
-    update_ids_offset = MAX_UINT64 - size
     for i in range(0, 100000, 2):
-        update_ids[i] = i + update_ids_offset
-        updated_ids[i + update_ids_offset] = i
+        update_ids[i] = i + 1000000
+        updated_ids[i + 1000000] = i
     external_ids = np.zeros((len(update_ids) * 2), dtype=np.uint64)
     updates = np.empty((len(update_ids) * 2), dtype='O')
     id = 0
@@ -371,3 +369,81 @@ def test_ivf_flat_ingestion_with_batch_updates(tmp_path):
 
     _, result = index.query(query_vectors, k=k, nprobe=nprobe)
     assert accuracy(result, gt_i, updated_ids=updated_ids) > MINIMUM_ACCURACY
+
+
+def test_kmeans():
+    k = 128
+    d = 16
+    n = k * k
+    max_iter = 16
+    n_init = 10
+    verbose = False
+
+    import sklearn.model_selection
+    from sklearn.datasets import make_blobs
+    from sklearn.cluster import KMeans
+
+    X, _, centers = make_blobs(n_samples=n, n_features=d, centers=k, return_centers=True, random_state=1)
+    X = X.astype("float32")
+
+    data, queries = sklearn.model_selection.train_test_split(
+        X, test_size=0.1, random_state=1
+    )
+
+    data_x = np.array([[1.0573647, 5.082087],
+                       [-6.229642, -1.3590931],
+                       [0.7446737, 6.3828287],
+                       [-7.698864, -3.0493321],
+                       [2.1362762, -4.4448104],
+                       [1.04019, -4.0389647],
+                       [0.38996044, 5.7235265],
+                       [1.7470839, -4.717076]]).astype("float32")
+    queries_x = np.array([[-7.3712273, -1.1178735]]).astype("float32")
+
+    km = KMeans(n_clusters=k, n_init=n_init, max_iter=max_iter, verbose=verbose, init="random", random_state=1)
+    km.fit(data)
+    centroids_sk = km.cluster_centers_
+    results_sk = km.predict(queries)
+
+    centroids_tdb = kmeans_fit(
+        k, "random", max_iter, verbose, n_init, array_to_matrix(np.transpose(data)), seed=1
+    )
+    centroids_tdb_np = np.transpose(np.array(centroids_tdb))
+    results_tdb = kmeans_predict(centroids_tdb, array_to_matrix(np.transpose(queries)))
+    results_tdb_np = np.transpose(np.array(results_tdb))
+
+    def get_score(centroids, results):
+        x = []
+        for i in range(len(queries)):
+            x.append(np.linalg.norm(queries[i] - centroids[results[i]]))
+        return np.mean(np.array(x))
+
+    sklearn_score = get_score(centroids_sk, results_sk)
+    tdb_score = get_score(centroids_tdb_np, results_tdb_np)
+    print("Random initialization:")
+    print(f"sklearn score: {sklearn_score}")
+    print(f"tiledb score: {tdb_score}")
+
+    km = KMeans(n_clusters=k, n_init=n_init, max_iter=max_iter, verbose=verbose, init="k-means++", random_state=1)
+    km.fit(data)
+    centroids_sk = km.cluster_centers_
+    results_sk = km.predict(queries)
+
+    assert tdb_score < 1.5 * sklearn_score
+
+    centroids_tdb = kmeans_fit(
+        k, "k-means++", max_iter, verbose, n_init, array_to_matrix(np.transpose(data)), seed=1
+    )
+    centroids_tdb_np = np.transpose(np.array(centroids_tdb))
+    results_tdb = kmeans_predict(centroids_tdb, array_to_matrix(np.transpose(queries)))
+    results_tdb_np = np.transpose(np.array(results_tdb))
+
+    sklearn_score = get_score(centroids_sk, results_sk)
+    tdb_score = get_score(centroids_tdb_np, results_tdb_np)
+    print("K-means++:")
+    print(f"sklearn score: {sklearn_score}")
+    print(f"tiledb score: {tdb_score}")
+
+    assert tdb_score < 1.5 * sklearn_score
+
+test_kmeans()
\ No newline at end of file
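The update tests above lean on an `accuracy(result, gt_i, updated_ids=...)` helper that is not part of this diff. A plausible sketch of the recall it computes, assuming `updated_ids` maps each new external id back to the original id it replaced; this is an assumption about the helper, not the repository's actual implementation:

```python
import numpy as np

def accuracy(result: np.ndarray, gt: np.ndarray, updated_ids=None) -> float:
    # Mean fraction of ground-truth neighbors recovered per query.
    found = 0
    total = 0
    for query_result, query_gt in zip(result, gt):
        k = len(query_result)
        # Map re-inserted ids back to the original ids they replaced.
        ids = [updated_ids.get(int(i), int(i)) if updated_ids else int(i)
               for i in query_result]
        found += len(set(ids) & set(int(g) for g in query_gt[:k]))
        total += k
    return found / total
```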
diff --git a/src/cmake/Modules/FindTileDB_EP.cmake b/src/cmake/Modules/FindTileDB_EP.cmake
index a5680060e..5fdbc505f 100644
--- a/src/cmake/Modules/FindTileDB_EP.cmake
+++ b/src/cmake/Modules/FindTileDB_EP.cmake
@@ -52,8 +52,8 @@ else()
   # Try to download prebuilt artifacts unless the user specifies to build from source
   if(DOWNLOAD_TILEDB_PREBUILT)
     if (WIN32) # Windows
-      SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.17.0/tiledb-windows-x86_64-2.17.0-93c173d.zip")
-      SET(DOWNLOAD_SHA1 "d43589b22de95d45b40de9918d105a6174ec352e")
+      SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.16.3/tiledb-windows-x86_64-2.16.3-194b5ae.zip")
+      SET(DOWNLOAD_SHA1 "8e986b6143ef9509201c7f2b3c0a7a53ff41095a")
     elseif(APPLE) # OSX
       if (DEFINED CMAKE_OSX_ARCHITECTURES)
         set(ACTUAL_TARGET ${CMAKE_OSX_ARCHITECTURES})
@@ -63,15 +63,15 @@ else()
 
       if (ACTUAL_TARGET MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
-        SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.17.0/tiledb-macos-x86_64-2.17.0-93c173d.tar.gz")
-        SET(DOWNLOAD_SHA1 "9a232015cbf09c5bd37375537cef80a382e1ffa4")
+        SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.16.3/tiledb-macos-x86_64-2.16.3-194b5ae.tar.gz")
+        SET(DOWNLOAD_SHA1 "b4d35306771d1c30b49bf2a98651e8e24c91037e")
       elseif (ACTUAL_TARGET STREQUAL arm64 OR ACTUAL_TARGET MATCHES "^aarch64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
-        SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.17.0/tiledb-macos-arm64-2.17.0-93c173d.tar.gz")
-        SET(DOWNLOAD_SHA1 "b861b90b462963db44fe0217087fac3510fd6293")
+        SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.16.3/tiledb-macos-arm64-2.16.3-194b5ae.tar.gz")
+        SET(DOWNLOAD_SHA1 "1f3f0e7660ed54b8988bdfcc4876021bf57d58da")
       endif()
     else() # Linux
-      SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.17.0/tiledb-linux-x86_64-2.17.0-93c173d.tar.gz")
-      SET(DOWNLOAD_SHA1 "5c04c07a73d3fe48a9ba8f3ad8af5e1912a39ce8")
+      SET(DOWNLOAD_URL "https://github.com/TileDB-Inc/TileDB/releases/download/2.16.3/tiledb-linux-x86_64-2.16.3-194b5ae.tar.gz")
+      SET(DOWNLOAD_SHA1 "6ccafbee52137478d0b8146e71a11323755c9ed5")
     endif()
 
     ExternalProject_Add(ep_tiledb
@@ -93,8 +93,8 @@ else()
   else() # Build from source
     ExternalProject_Add(ep_tiledb
       PREFIX "externals"
-      URL "https://github.com/TileDB-Inc/TileDB/archive/2.17.0.zip"
-      URL_HASH SHA1=bbf5b34fec1c729f048f48bf1a0f03abb447d7de
+      URL "https://github.com/TileDB-Inc/TileDB/archive/2.16.3.zip"
+      URL_HASH SHA1=22724f1ede59fee7c88b800242e9813eab785dcb
       DOWNLOAD_NAME "tiledb.zip"
       CMAKE_ARGS
         -DCMAKE_INSTALL_PREFIX=${EP_INSTALL_PREFIX}
diff --git a/src/cmake/Superbuild.cmake b/src/cmake/Superbuild.cmake
index ac8c3b8f8..5202b814a 100644
--- a/src/cmake/Superbuild.cmake
+++ b/src/cmake/Superbuild.cmake
@@ -44,11 +44,15 @@ set(FORWARD_EP_CMAKE_ARGS)
 # as a part of the superbuild.
 set(EXTERNAL_PROJECTS)
 
+# Passing lists through ExternalProject_Add requires using a separator
+# character other than a semicolon.
+list(JOIN CMAKE_PREFIX_PATH "|" CMAKE_PREFIX_PATH_STR)
+
 # Forward any additional CMake args to the non-superbuild.
 set(INHERITED_CMAKE_ARGS
   -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}
-  -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}
-  -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+  -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH_STR}
+  -DCMAKE_BUILD_TYPE=$
   -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
   -DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}
   -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
@@ -121,6 +125,7 @@ ExternalProject_Add(libtiledbvectorsearch
   INSTALL_COMMAND ""
   BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/libtiledbvectorsearch
   DEPENDS ${EXTERNAL_PROJECTS}
+  LIST_SEPARATOR "|"
 )
 
 # make install-libtiledbvectorsearch
diff --git a/src/include/detail/flat/qv.h b/src/include/detail/flat/qv.h
index 90435202b..a853a0d23 100644
--- a/src/include/detail/flat/qv.h
+++ b/src/include/detail/flat/qv.h
@@ -73,13 +73,10 @@ namespace detail::flat {
  */
 template
 [[deprecated]] auto qv_query_heap_0(
-    const DB& db, const Q& q, int k_nn, unsigned int nthreads) {
+    DB& db, const Q& q, int k_nn, unsigned int nthreads) {
   scoped_timer _{tdb_func__};
 
-  using id_type = size_t;
-  using score_type = float;
-
-  ColMajorMatrix top_k(k_nn, size(q));
+  ColMajorMatrix top_k(k_nn, size(q));
 
   auto par = stdx::execution::indexed_parallel_policy{nthreads};
   stdx::range_for_each(
@@ -88,7 +85,7 @@ template
         size_t size_db = size(db);
 
         // @todo can we do this more efficiently?
-        Vector scores(size_db);
+        Vector scores(size_db);
 
         for (size_t i = 0; i < size_db; ++i) {
           scores[i] = L2(q_vec, db[i]);
@@ -116,44 +113,40 @@ template
 * @return A matrix of size k x #queries containing the top k results for each
 * query.
 */
-template
+template
 auto qv_query_heap(
     T,
-    const DB& db,
-    const Q& q,
-    const ID& ids,
+    DB& db,
+    Q& q,
+    const std::vector& ids,
     int k_nn,
     unsigned nthreads);
 
 template
-auto qv_query_heap(const DB& db, const Q& q, int k_nn, unsigned nthreads) {
+auto qv_query_heap(DB& db, Q& q, int k_nn, unsigned nthreads) {
   return qv_query_heap(
-      without_ids{}, db, q, std::vector{}, k_nn, nthreads);
+      without_ids{}, db, q, std::vector{}, k_nn, nthreads);
 }
 
-template
+template
 auto qv_query_heap(
-    const DB& db, const Q& q, const ID& ids, int k_nn, unsigned nthreads) {
+    DB& db, Q& q, const std::vector& ids, int k_nn, unsigned nthreads) {
   return qv_query_heap(with_ids{}, db, q, ids, k_nn, nthreads);
 }
 
 // @todo Add to out of core
-template
+template
 auto qv_query_heap(
     T,
-    const DB& db,
-    const Q& query,
-    const ID& ids,
+    DB& db,
+    Q& query,
+    const std::vector& ids,
     int k_nn,
     unsigned nthreads) {
   scoped_timer _{tdb_func__};
 
-  // using feature_type = typename std::remove_reference_t::value_type;
-  using id_type = typename std::remove_reference_t::value_type;
-  using score_type = float;
-
-  auto top_k = ColMajorMatrix(k_nn, query.num_cols());
-  auto top_k_scores = ColMajorMatrix(k_nn, query.num_cols());
+  auto top_k = ColMajorMatrix(k_nn, query.num_cols());
+  auto top_k_scores = ColMajorMatrix(k_nn, query.num_cols());
 
   // Have to do explicit asynchronous threading here, as the current parallel
   // algorithms have iterator-based interaces, and the `Matrix` class does not
@@ -166,7 +159,7 @@ auto qv_query_heap(
       std::move(par),
       query,
       [&, size_db](auto&& q_vec, auto&& n = 0, auto&& j = 0) {
-        fixed_min_pair_heap min_scores(k_nn);
+        fixed_min_pair_heap min_scores(k_nn);
 
         for (size_t i = 0; i < size_db; ++i) {
           auto score = L2(q_vec, db[i]);
@@ -197,40 +190,35 @@ auto qv_query_heap(
 * @return A matrix of size k x #queries containing the top k results for each
 * query.
 */
-template
+template
 auto qv_query_heap_tiled(
     T,
     DB& db,
-    const Q& q,
-    const ID& ids,
+    Q& q,
+    const std::vector& ids,
     int k_nn,
     unsigned nthreads);
 
 template
-auto qv_query_heap_tiled(DB& db, const Q& q, int k_nn, unsigned nthreads) {
+auto qv_query_heap_tiled(DB& db, Q& q, int k_nn, unsigned nthreads) {
   return qv_query_heap_tiled(
-      without_ids{}, db, q, std::vector{}, k_nn, nthreads);
+      without_ids{}, db, q, std::vector{}, k_nn, nthreads);
 }
 
-template
+template
 auto qv_query_heap_tiled(
-    DB& db, Q& q, const ID& ids, int k_nn, unsigned nthreads) {
+    DB& db, Q& q, const std::vector& ids, int k_nn, unsigned nthreads) {
   return qv_query_heap_tiled(with_ids{}, db, q, ids, k_nn, nthreads);
 }
 
-template
+template
 auto qv_query_heap_tiled(
     T,
     DB& db,
-    const Q& query,
-    [[maybe_unused]] const ID& ids,
+    Q& query,
+    [[maybe_unused]] const std::vector& ids,
     int k_nn,
     unsigned nthreads) {
-
-  // using feature_type = typename std::remove_reference_t::value_type;
-  using id_type = typename std::remove_reference_t::value_type;
-  using score_type = float;
-
   if constexpr (is_loadable_v) {
     db.load();
   }
@@ -248,8 +236,8 @@ auto qv_query_heap_tiled(
   std::vector> futs;
   futs.reserve(nthreads);
 
-  auto min_scores = std::vector>(
-      size(query), fixed_min_pair_heap(k_nn));
+  auto min_scores = std::vector>(
+      size(query), fixed_min_pair_heap(k_nn));
 
   // @todo: Use range::for_each
   for (size_t n = 0; n < nthreads; ++n) {
@@ -263,8 +251,8 @@ auto qv_query_heap_tiled(
       auto len = 2 * ((stop - start) / 2);
       auto end = start + len;
 
-      // auto min_scores0 = fixed_min_pair_heap (k);
-      // auto min_scores1 = fixed_min_pair_heap (k);
+      // auto min_scores0 = fixed_min_pair_heap (k);
+      // auto min_scores1 = fixed_min_pair_heap (k);
 
       for (auto j = start; j != end; j += 2) {
         auto j0 = j + 0;
@@ -377,17 +365,15 @@ template
 auto qv_partition(const DB& db, const Q& q, unsigned nthreads) {
   scoped_timer _{tdb_func__};
 
-  // Just need a single vector -- creating an index, not ids, so hardcoded size_t is okay to use here
-  using id_type = size_t;
-  using score_type = float;
   auto size_db = size(db);
 
-  std::vector top_k(q.num_cols());
+  // Just need a single vector
+  std::vector top_k(q.num_cols());
 
   auto par = stdx::execution::indexed_parallel_policy{(size_t)nthreads};
   stdx::range_for_each(
       std::move(par), q, [&, size_db](auto&& qvec, auto&& n = 0, auto&& j = 0) {
-        score_type min_score = std::numeric_limits::max();
+        float min_score = std::numeric_limits::max();
         size_t idx = 0;
 
         for (size_t i = 0; i < size_db; ++i) {
@@ -403,6 +389,46 @@ auto qv_partition(const DB& db, const Q& q, unsigned nthreads) {
   return top_k;
 }
 
+/**
+ * @brief Find the single nearest neighbor of each query vector in the database.
+ * This is essentially qv_query_heap, specialized for k = 1.
+ * @tparam DB
+ * @tparam Q
+ * @param db
+ * @param q
+ * @param nthreads
+ * @return
+ */
+template
+auto qv_partition_with_scores(const DB& db, const Q& q, unsigned nthreads) {
+  scoped_timer _{tdb_func__};
+
+  auto size_db = size(db);
+
+  // Just need a single vector
+  std::vector top_k(q.num_cols());
+  std::vector top_k_scores(q.num_cols());
+
+  auto par = stdx::execution::indexed_parallel_policy{(size_t)nthreads};
+  stdx::range_for_each(
+      std::move(par), q, [&, size_db](auto&& qvec, auto&& n = 0, auto&& j = 0) {
+        float min_score = std::numeric_limits::max();
+        size_t idx = 0;
+
+        for (size_t i = 0; i < size_db; ++i) {
+          auto score = L2(qvec, db[i]);
+          if (score < min_score) {
+            min_score = score;
+            idx = i;
+          }
+        }
+        top_k[j] = idx;
+        top_k_scores[j] = min_score;
+      });
+
+  return std::make_tuple(top_k_scores, top_k);
+}
+
 } // namespace detail::flat
 
 #endif  // TILEDB_FLAT_QV_H
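As its comment says, the new `qv_partition_with_scores` is the k = 1 specialization: per query, a linear scan for the minimum L2 score, returning the winning index and, unlike `qv_partition`, the score as well. A numpy analogue of the same computation, assuming the column-major layout of `ColMajorMatrix` (one vector per column); the C++ version additionally parallelizes the scan over queries:

```python
import numpy as np

# db: (d, n) database, q: (d, m) queries -- one vector per column,
# mirroring the ColMajorMatrix layout used in detail::flat.
def qv_partition_with_scores_np(db: np.ndarray, q: np.ndarray):
    # (m, n) matrix of L2 distances between every query and every db column.
    scores = np.linalg.norm(q[:, :, None] - db[:, None, :], axis=0)
    top_k = np.argmin(scores, axis=1)  # nearest db column per query
    top_k_scores = scores[np.arange(q.shape[1]), top_k]
    return top_k_scores, top_k
```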
diff --git a/src/include/detail/flat/vq.h b/src/include/detail/flat/vq.h
index 0d14e4fd9..84a46ff02 100644
--- a/src/include/detail/flat/vq.h
+++ b/src/include/detail/flat/vq.h
@@ -51,47 +51,42 @@ namespace detail::flat {
 *
 * @todo Unify out of core and not out of core versions.
 */
-template
+template
 auto vq_query_heap(
     T,
     DB& db,
-    const Q& q,
-    const ID& ids,
+    Q& q,
+    const std::vector& ids,
     int k_nn,
     unsigned nthreads);
 
 template
-auto vq_query_heap(DB& db, const Q& q, int k_nn, unsigned nthreads) {
+auto vq_query_heap(DB& db, Q& q, int k_nn, unsigned nthreads) {
   return vq_query_heap(
-      without_ids{}, db, q, std::vector{}, k_nn, nthreads);
+      without_ids{}, db, q, std::vector{}, k_nn, nthreads);
 }
 
-template
+template
 auto vq_query_heap(
-    DB& db, const Q& q, const ID& ids, int k_nn, unsigned nthreads) {
+    DB& db, Q& q, const std::vector& ids, int k_nn, unsigned nthreads) {
   return vq_query_heap(with_ids{}, db, q, ids, k_nn, nthreads);
 }
 
 // @todo Support out of core
-template
+template
 auto vq_query_heap(
     T,
     DB& db,
-    const Q& q,
-    const ID& ids,
+    Q& q,
+    const std::vector& ids,
     int k_nn,
     unsigned nthreads) {
   // @todo Need to get the total number of queries, not just the first block
   // @todo Use Matrix here rather than vector of vectors
-
-  // using feature_type = typename std::remove_reference_t::value_type;
-  using id_type = typename std::remove_reference_t::value_type;
-  using score_type = float;
-
-  std::vector>> scores(
+  std::vector>> scores(
       nthreads,
-      std::vector>(
-          size(q), fixed_min_pair_heap(k_nn)));
+      std::vector>(
+          size(q), fixed_min_pair_heap(k_nn)));
 
   unsigned size_q = size(q);
   auto par = stdx::execution::indexed_parallel_policy{nthreads};
@@ -145,9 +140,6 @@ auto vq_query_heap(
   consolidate_scores(scores);
   auto top_k = get_top_k_with_scores(scores, k_nn);
 
-  debug_slice(std::get<0>(top_k));
-  debug_slice(std::get<1>(top_k));
-
   return top_k;
 }
 
@@ -161,46 +153,41 @@ auto vq_query_heap(
 * @param nthreads
 * @return
 */
-template
+template
 auto vq_query_heap_tiled(
     T,
     DB& db,
-    const Q& q,
-    const ID& ids,
+    Q& q,
+    const std::vector& ids,
     int k_nn,
     unsigned nthreads);
 
 template
-auto vq_query_heap_tiled(DB& db, const Q& q, int k_nn, unsigned nthreads) {
+auto vq_query_heap_tiled(DB& db, Q& q, int k_nn, unsigned nthreads) {
   return vq_query_heap_tiled(
-      without_ids{}, db, q, std::vector{}, k_nn, nthreads);
+      without_ids{}, db, q, std::vector{}, k_nn, nthreads);
 }
 
-template
+template
 auto vq_query_heap_tiled(
-    DB& db, const Q& q, const ID& ids, int k_nn, unsigned nthreads) {
+    DB& db, Q& q, const std::vector& ids, int k_nn, unsigned nthreads) {
   return vq_query_heap_tiled(with_ids{}, db, q, ids, k_nn, nthreads);
 }
 
-template
+template
 auto vq_query_heap_tiled(
     T,
     DB& db,
-    const Q& q,
-    const ID& ids,
+    Q& q,
+    const std::vector& ids,
     int k_nn,
     unsigned nthreads) {
   // @todo Need to get the total number of queries, not just the first block
   // @todo Use Matrix here rather than vector of vectors
-
-  // using feature_type = typename std::remove_reference_t::value_type;
-  using id_type = typename std::remove_reference_t::value_type;
-  using score_type = float;
-
-  std::vector>> scores(
+  std::vector>> scores(
      nthreads,
-      std::vector>(
-          size(q), fixed_min_pair_heap(k_nn)));
+      std::vector>(
+          size(q), fixed_min_pair_heap(k_nn)));
 
   unsigned size_q = size(q);
   auto par = stdx::execution::indexed_parallel_policy{nthreads};
@@ -243,46 +230,41 @@ auto vq_query_heap_tiled(
 
 // ====================================================================================================
 
-template
+template
 auto vq_query_heap_2(
     T,
     DB& db,
-    const Q& q,
-    const ID& ids,
+    Q& q,
+    const std::vector& ids,
     int k_nn,
     unsigned nthreads);
 
 template
-auto vq_query_heap_2(DB& db, const Q& q, int k_nn, unsigned nthreads) {
+auto vq_query_heap_2(DB& db, Q& q, int k_nn, unsigned nthreads) {
   return vq_query_heap_2(
-      without_ids{}, db, q, std::vector{}, k_nn, nthreads);
+      without_ids{}, db, q, std::vector{}, k_nn, nthreads);
 }
 
-template
+template
 auto vq_query_heap_2(
-    DB& db, const Q& q, const ID& ids, int k_nn, unsigned nthreads) {
+    DB& db, Q& q, const std::vector& ids, int k_nn, unsigned nthreads) {
   return vq_query_heap_2(with_ids{}, db, q, ids, k_nn, nthreads);
 }
 
-template
+template
 auto vq_query_heap_2(
     T,
     DB& db,
-    const Q& q,
-    const ID& ids,
+    Q& q,
+    const std::vector& ids,
     int k_nn,
     unsigned nthreads) {
   // @todo Need to get the total number of queries, not just the first block
   // @todo Use Matrix here rather than vector of vectors
-
-  // using feature_type = typename std::remove_reference_t::value_type;
-  using id_type = typename std::remove_reference_t::value_type;
-  using score_type = float;
-
-  std::vector>> scores(
+  std::vector>> scores(
       nthreads,
-      std::vector>(
-          size(q), fixed_min_pair_heap(k_nn)));
+      std::vector>(
+          size(q), fixed_min_pair_heap(k_nn)));
 
   unsigned size_q = size(q);
   auto par = stdx::execution::indexed_parallel_policy{nthreads};
diff --git a/src/include/detail/ivf/dist_qv.h b/src/include/detail/ivf/dist_qv.h
index b052efa8a..1e65a5be2 100644
--- a/src/include/detail/ivf/dist_qv.h
+++ b/src/include/detail/ivf/dist_qv.h
@@ -65,7 +65,7 @@ namespace detail::ivf {
 * Should be able to use only the indices and the query vectors that are active
 * on this node, and return only the min heaps for the active query vectors
 */
-template
+template
 auto dist_qv_finite_ram_part(
     tiledb::Context& ctx,
     const std::string& part_uri,
@@ -80,8 +80,6 @@ auto dist_qv_finite_ram_part(
     nthreads = std::thread::hardware_concurrency();
   }
 
-
-  using score_type = float;
   using parts_type = typename std::remove_reference_t::value_type;
   using indices_type =
@@ -90,7 +88,7 @@ auto dist_qv_finite_ram_part(
   size_t num_queries = size(query);
 
   auto shuffled_db = tdbColMajorPartitionedMatrix<
-      feature_type,
+      T,
      shuffled_ids_type,
      indices_type,
      parts_type>(ctx, part_uri, indices, active_partitions, id_uri, 0);
@@ -113,8 +111,8 @@ auto dist_qv_finite_ram_part(
  }
  assert(shuffled_db.num_cols() == size(shuffled_db.ids()));
 
-  auto min_scores = std::vector>(
-      num_queries, fixed_min_pair_heap(k_nn));
+  auto min_scores = std::vector>(
+      num_queries, fixed_min_pair_heap(k_nn));
 
   auto current_part_size = shuffled_db.num_col_parts();
@@ -167,10 +165,10 @@ auto dist_qv_finite_ram_part(
 
 #if 0
   auto min_scores =
-      std::vector>>(
+      std::vector>>(
           nthreads,
-          std::vector>(
-              num_queries, fixed_min_pair_heap(k_nn)));
+          std::vector>(
+              num_queries, fixed_min_pair_heap(k_nn)));
 
   size_t parts_per_thread = (shuffled_db.num_col_parts() + nthreads - 1) / nthreads;
@@ -223,8 +221,8 @@ auto dist_qv_finite_ram_part(
       futs[n].get();
     }
 
-    auto min_min_scores = std::vector>(
-        num_queries, fixed_min_pair_heap(k_nn));
+    auto min_min_scores = std::vector>(
+        num_queries, fixed_min_pair_heap(k_nn));
 
     for (size_t j = 0; j < num_queries; ++j) {
       for (size_t n = 0; n < nthreads; ++n) {
@@ -237,7 +235,7 @@ auto dist_qv_finite_ram_part(
   return min_min_scores;
 #endif
 
-template
+template
 auto dist_qv_finite_ram(
     tiledb::Context& ctx,
     const std::string& part_uri,
@@ -255,7 +253,6 @@ auto dist_qv_finite_ram(
   // Check that the size of the indices vector is correct
   assert(size(indices) == centroids.num_cols() + 1);
 
-  using score_type = float;
   using indices_type =
       typename std::remove_reference_t::value_type;
 
@@ -271,8 +268,8 @@ auto dist_qv_finite_ram(
   auto num_parts = size(active_partitions);
   using parts_type = typename decltype(active_partitions)::value_type;
 
-  std::vector> min_scores(
-      num_queries, fixed_min_pair_heap(k_nn));
+  std::vector> min_scores(
+      num_queries, fixed_min_pair_heap(k_nn));
 
   size_t parts_per_node = (num_parts + num_nodes - 1) / num_nodes;
 
@@ -296,7 +293,7 @@ auto dist_qv_finite_ram(
     /*
      * Each compute node returns a min_heap of its own min_scores
      */
-    auto dist_min_scores = dist_qv_finite_ram_part(
+    auto dist_min_scores = dist_qv_finite_ram_part(
         ctx,
         part_uri,
         dist_partitions,
diff --git a/src/include/detail/ivf/index.h b/src/include/detail/ivf/index.h
index ebd95d528..934ccea8b 100644
--- a/src/include/detail/ivf/index.h
+++ b/src/include/detail/ivf/index.h
@@ -74,7 +74,8 @@ int ivf_index(
   debug_matrix(parts, "parts");
   {
     scoped_timer _{"shuffling data"};
-    std::unordered_set deleted_ids_set(deleted_ids.begin(), deleted_ids.end());
+    std::unordered_set deleted_ids_set(
+        deleted_ids.begin(), deleted_ids.end());
     std::vector degrees(centroids.num_cols());
     std::vector indices(centroids.num_cols() + 1);
     if (deleted_ids.empty()) {
@@ -84,7 +85,8 @@ int ivf_index(
       }
     } else {
       for (size_t i = 0; i < db.num_cols(); ++i) {
-        if (auto iter = deleted_ids_set.find(external_ids[i]); iter == deleted_ids_set.end()) {
+        if (auto iter = deleted_ids_set.find(external_ids[i]);
+            iter == deleted_ids_set.end()) {
           auto j = parts[i];
           ++degrees[j];
         }
@@ -134,7 +136,8 @@ int ivf_index(
       }
     } else {
       for (size_t i = 0; i < db.num_cols(); ++i) {
-        if (auto iter = deleted_ids_set.find(external_ids[i]); iter == deleted_ids_set.end()) {
+        if (auto iter = deleted_ids_set.find(external_ids[i]);
+            iter == deleted_ids_set.end()) {
           size_t bin = parts[i];
           size_t ibin = indices[bin];
diff --git a/src/include/detail/ivf/qv.h b/src/include/detail/ivf/qv.h
index 11f0ad96e..11a31d93e 100644
--- a/src/include/detail/ivf/qv.h
+++ b/src/include/detail/ivf/qv.h
@@ -72,6 +72,95 @@ namespace detail::ivf {
 // Functions for searching with infinite RAM, OG qv ordering
 // ----------------------------------------------------------------------------
 
+// Forward declarations
+/**
+ *
+ * Overload for already opened arrays. Since the array is already opened, we
+ * don't need to specify its type with a template parameter.
+ */
+auto qv_query_heap_infinite_ram(
+    auto&& partitioned_db,
+    auto&& centroids,
+    auto&& q,
+    auto&& indices,
+    auto&& partitioned_ids,
+    size_t nprobe,
+    size_t k_nn,
+    size_t nthreads);
+
+/**
+ * @brief Query a (small) set of query vectors against a vector database.
+ * This version loads the entire partition array into memory and then
+ * queries each vector in the query set against the appropriate partitions.
+ *
+ * For now that type of the array needs to be passed as a template argument.
+ */
+template
+auto qv_query_heap_infinite_ram(
+    tiledb::Context& ctx,
+    const std::string& part_uri,
+    auto&& centroids,
+    auto&& q,
+    auto&& indices,
+    const std::string& id_uri,
+    size_t nprobe,
+    size_t k_nn,
+    size_t nthreads) {
+  scoped_timer _{tdb_func__};
+
+  // Read the shuffled database and ids
+  // @todo Do this more systematically
+  auto partitioned_db = tdbColMajorMatrix(ctx, part_uri);
+  auto partitioned_ids = read_vector(ctx, id_uri);
+
+  return qv_query_heap_infinite_ram(
+      partitioned_db,
+      centroids,
+      q,
+      indices,
+      partitioned_ids,
+      nprobe,
+      k_nn,
+      nthreads);
+}
+
+/**
+ * @brief Query a set of query vectors against an indexed vector database. The
+ * arrays part_uri, centroids, indices, and id_uri comprise the index. The
+ * partitioned database is stored in part_uri, the centroids are stored in
+ * centroids, the indices demarcating partitions are stored in indices, and the
+ * labels for the vectors in the original database are stored in id_uri.
+ * The query is stored in q.
+ *
+ * "Infinite RAM" means the entire index is loaded into memory before any
+ * queries are applied, regardless of which partitions are to be queried.
+ *
+ * @param part_uri Partitioned database URI
+ * @param centroids Centroids of the vectors in the original database (and
+ * the partitioned database). The ith centroid is the centroid of the ith
+ * partition.
+ * @param q The query to be searched
+ * @param indices The demarcations of partitions
+ * @param id_uri URI of the labels for the vectors in the original database
+ * @param nprobe How many partitions to search
+ * @param k_nn How many nearest neighbors to return
+ * @param nth Unused
+ * @param nthreads How many threads to use for parallel execution
+ * @return The indices of the top_k neighbors for each query vector
+ */
+auto qv_query_heap_infinite_ram(
+    const std::string& part_uri,
+    auto&& centroids,
+    auto&& q,
+    auto&& indices,
+    const std::string& id_uri,
+    size_t nprobe,
+    size_t k_nn,
+    size_t nthreads) {
+  tiledb::Context ctx;
+  return qv_query_heap_infinite_ram(
+      ctx, part_uri, centroids, q, indices, id_uri, nprobe, k_nn, nthreads);
+}
 
 /**
 * @brief The OG version of querying with qv loop ordering.
@@ -112,10 +201,6 @@ auto qv_query_heap_infinite_ram(
     load(partitioned_db);
   }
   scoped_timer _{"Total time " + tdb_func__};
-
-  // using feature_type = typename std::remove_reference_t::value_type;
-  using id_type = typename std::remove_reference_t::value_type;
-  using score_type = float;
 
   assert(partitioned_db.num_cols() == partitioned_ids.size());
 
@@ -124,6 +209,7 @@ auto qv_query_heap_infinite_ram(
 
   debug_matrix(partitioned_db, "partitioned_db");
   debug_slice(partitioned_db, "partitioned_db");
+
   debug_matrix(partitioned_ids, "partitioned_ids");
 
   // get closest centroid for each query vector
@@ -135,8 +221,8 @@ auto qv_query_heap_infinite_ram(
   auto top_centroids =
       detail::flat::qv_query_heap_0(centroids, q, nprobe, nthreads);
 
-  auto min_scores = std::vector>(
-      size(q), fixed_min_pair_heap(k_nn));
+  auto min_scores = std::vector>(
+      size(q), fixed_min_pair_heap(k_nn));
 
   // Parallelizing over q is not going to be very efficient
   {
@@ -162,81 +248,6 @@ auto qv_query_heap_infinite_ram(
 
   return top_k;
 }
-
-
-/**
- * @brief Query a (small) set of query vectors against a vector database.
- * This version loads the entire partition array into memory and then
- * queries each vector in the query set against the appropriate partitions.
- *
- * For now that type of the array needs to be passed as a template argument.
- */
-template
-auto qv_query_heap_infinite_ram(
-    tiledb::Context& ctx,
-    const std::string& part_uri,
-    const auto& centroids,
-    const auto& q,
-    const auto& indices,
-    const std::string& id_uri,
-    size_t nprobe,
-    size_t k_nn,
-    size_t nthreads) {
-  scoped_timer _{tdb_func__};
-
-  // Read the shuffled database and ids
-  auto partitioned_db = tdbColMajorMatrix(ctx, part_uri);
-  auto partitioned_ids = read_vector(ctx, id_uri);
-
-  return qv_query_heap_infinite_ram(
-      partitioned_db,
-      centroids,
-      q,
-      indices,
-      partitioned_ids,
-      nprobe,
-      k_nn,
-      nthreads);
-}
-
-/**
- * @brief Query a set of query vectors against an indexed vector database. The
- * arrays part_uri, centroids, indices, and id_uri comprise the index. The
- * partitioned database is stored in part_uri, the centroids are stored in
- * centroids, the indices demarcating partitions is stored in indices, and the
- * labels for the vectors in the original database are stored in id_uri.
- * The query is stored in q.
- *
- * "Infinite RAM" means the entire index is loaded into memory before any
- * queries are applied, regardless of which partitions are to be queried.
- *
- * @param part_uri Partitioned database URI
- * @param centroids Centroids of the vectors in the original database (and
- * the partitioned database). The ith centroid is the centroid of the ith
- * partition.
- * @param q The query to be searched
- * @param indices The demarcations of partitions
- * @param id_uri URI of the labels for the vectors in the original database
- * @param nprobe How many partitions to search
- * @param k_nn How many nearest neighbors to return
- * @param nth Unused
- * @param nthreads How many threads to use for parallel execution
- * @return The indices of the top_k neighbors for each query vector
- */
-template
-auto qv_query_heap_infinite_ram(
-    const std::string& part_uri,
-    auto&& centroids,
-    auto&& q,
-    auto&& indices,
-    const std::string& id_uri,
-    size_t nprobe,
-    size_t k_nn,
-    size_t nthreads) {
-  tiledb::Context ctx;
-  return qv_query_heap_infinite_ram(ctx, part_uri, centroids, q, indices, id_uri, nprobe, k_nn, nthreads);
-}
-
 // ----------------------------------------------------------------------------
 // Functions for searching with infinite RAM, new qv (nuv) ordering
 // ----------------------------------------------------------------------------
@@ -251,7 +262,7 @@ auto nuv_query_heap_infinite_ram(
     size_t k_nn,
     size_t nthreads);
 
-template
+template
 auto nuv_query_heap_infinite_ram(
     tiledb::Context& ctx,
     const std::string& part_uri,
@@ -266,8 +277,8 @@ auto nuv_query_heap_infinite_ram(
 
   // Read the shuffled database and ids
   // @todo To this more systematically
-  auto partitioned_db = tdbColMajorMatrix(ctx, part_uri);
-  auto partitioned_ids = read_vector(ctx, id_uri);
+  auto partitioned_db = tdbColMajorMatrix(ctx, part_uri);
+  auto partitioned_ids = read_vector(ctx, id_uri);
 
   return nuv_query_heap_infinite_ram(
       partitioned_db,
@@ -318,15 +329,12 @@ auto nuv_query_heap_infinite_ram(
     load(partitioned_db);
   }
   scoped_timer _{tdb_func__ + std::string{"_in_ram"}};
-
-  // using feature_type = typename std::remove_reference_t::value_type;
-  using id_type = typename std::remove_reference_t::value_type;
-  using score_type = float;
 
   assert(partitioned_db.num_cols() == partitioned_ids.size());
 
   debug_matrix(partitioned_db, "partitioned_db");
   debug_slice(partitioned_db, "partitioned_db");
+
   debug_matrix(partitioned_ids, "partitioned_ids");
 
   // Check that the indices vector is the right size
@@ -338,10 +346,10 @@ auto nuv_query_heap_infinite_ram(
       partition_ivf_index(centroids, query, nprobe, nthreads);
 
   auto min_scores =
-      std::vector>>(
+      std::vector>>(
           nthreads,
-          std::vector>(
-              num_queries, fixed_min_pair_heap(k_nn)));
+          std::vector>(
+              num_queries, fixed_min_pair_heap(k_nn)));
 
   size_t parts_per_thread = (size(active_partitions) + nthreads - 1) / nthreads;
 
@@ -412,7 +420,7 @@ auto nuv_query_heap_infinite_ram_reg_blocked(
     size_t k_nn,
     size_t nthreads);
 
-template
+template
nuv_query_heap_infinite_ram_reg_blocked( auto&& [active_partitions, active_queries] = partition_ivf_index(centroids, query, nprobe, nthreads); - // auto min_scores = std::vector>( - // size(q), fixed_min_pair_heap(k_nn)); + // auto min_scores = std::vector>( + // size(q), fixed_min_pair_heap(k_nn)); - auto min_scores = std::vector>> ( + std::vector>> min_scores( nthreads, - std::vector>( - num_queries, fixed_min_pair_heap(k_nn))); + std::vector>( + num_queries, fixed_min_pair_heap(k_nn))); size_t parts_per_thread = (size(active_partitions) + nthreads - 1) / nthreads; @@ -613,7 +617,7 @@ auto nuv_query_heap_infinite_ram_reg_blocked( /** * Forward declaration */ -template +template auto qv_query_heap_finite_ram( tiledb::Context& ctx, const std::string& part_uri, @@ -630,8 +634,8 @@ auto qv_query_heap_finite_ram( * Interface with uris for all arguments. */ template < - class feature_type, - class id_type, + typename db_type, + class partitioned_ids_type, class centroids_type, class indices_type> auto qv_query_heap_finite_ram( @@ -650,7 +654,7 @@ auto qv_query_heap_finite_ram( auto centroids = tdbColMajorMatrix(ctx, centroids_uri); centroids.load(); - auto query = tdbColMajorMatrix( + auto query = tdbColMajorMatrix( ctx, query_uri, nqueries); query.load(); @@ -706,7 +710,7 @@ auto qv_query_heap_finite_ram( * @param nthreads How many threads to use for parallel execution * @return The indices of the top_k neighbors for each query vector */ -template +template auto qv_query_heap_finite_ram( tiledb::Context& ctx, const std::string& part_uri, @@ -720,7 +724,6 @@ auto qv_query_heap_finite_ram( size_t nthreads) { scoped_timer _{tdb_func__}; - using score_type = float; using indices_type = typename std::remove_reference_t::value_type; @@ -744,7 +747,7 @@ auto qv_query_heap_finite_ram( * We also need to know the "active" centroids, i.e., the ones having at * least one query. 
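 *
 * As a rough illustration (the values here are hypothetical, not taken from
 * the code): with nprobe = 2, if query 0 probes centroids {3, 7} and query 1
 * probes centroids {7, 9}, then
 * @code
 *   centroid_query    == {{3, 0}, {7, 0}, {7, 1}, {9, 1}}  // centroid -> query
 *   active_partitions == {3, 7, 9}
 * @endcode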
*/ - auto centroid_query = std::multimap{}; + auto centroid_query = std::multimap{}; auto active_centroids = std::set{}; for (size_t j = 0; j < num_queries; ++j) { for (size_t p = 0; p < nprobe; ++p) { @@ -758,8 +761,8 @@ auto qv_query_heap_finite_ram( std::vector(begin(active_centroids), end(active_centroids)); auto partitioned_db = tdbColMajorPartitionedMatrix< - feature_type, - id_type, + T, + partitioned_ids_type, indices_type, parts_type>( ctx, part_uri, indices, active_partitions, id_uri, upper_bound); @@ -778,11 +781,11 @@ auto qv_query_heap_finite_ram( max_partition_size = std::max(max_partition_size, partition_size); _memory_data.insert_entry( tdb_func__ + " (predicted)", - partition_size * sizeof(feature_type) * partitioned_db.num_rows()); + partition_size * sizeof(T) * partitioned_db.num_rows()); } _memory_data.insert_entry( tdb_func__ + " (upper bound)", - nprobe * num_queries * sizeof(feature_type) * max_partition_size); + nprobe * num_queries * sizeof(T) * max_partition_size); } assert(partitioned_db.num_cols() == size(partitioned_db.ids())); @@ -790,13 +793,13 @@ auto qv_query_heap_finite_ram( debug_matrix(partitioned_db, "partitioned_db"); debug_matrix(partitioned_db.ids(), "partitioned_db.ids()"); - // auto min_scores = std::vector>( - // size(q), fixed_min_pair_heap(k_nn)); + // auto min_scores = std::vector>( + // size(q), fixed_min_pair_heap(k_nn)); - auto min_scores = std::vector>> ( + std::vector>> min_scores( nthreads, - std::vector>( - num_queries, fixed_min_pair_heap(k_nn))); + std::vector>( + num_queries, fixed_min_pair_heap(k_nn))); log_timer _i{tdb_func__ + " in RAM"}; @@ -880,7 +883,7 @@ auto qv_query_heap_finite_ram( // Functions for searching with finite RAM, new qv (nuv) ordering // ---------------------------------------------------------------------------- -template +template auto nuv_query_heap_finite_ram( tiledb::Context& ctx, const std::string& part_uri, @@ -897,8 +900,8 @@ auto nuv_query_heap_finite_ram( * Interface with uris for all arguments. 
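 *
 * The implementation below overlaps the reads of the centroids and query
 * matrices using std::async. A minimal sketch of the pattern (the element
 * type float is assumed here purely for illustration):
 * @code
 *   auto query_future = std::async(std::launch::async, [&]() {
 *     auto query = tdbColMajorMatrix<float>(ctx, query_uri, nqueries);
 *     query.load();
 *     return query;
 *   });
 *   // ... kick off the centroids read the same way, then:
 *   auto query = query_future.get();
 * @endcode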
*/ template < - typename feature_type, - class id_type, + typename db_type, + class partitioned_ids_type, class centroids_type, class indices_type> auto nuv_query_heap_finite_ram( @@ -916,7 +919,7 @@ auto nuv_query_heap_finite_ram( // using centroid_type = // std::invoke_result_t>; - using query_type = std::invoke_result_t>; + using query_type = std::invoke_result_t>; using idx_type = std::invoke_result_t>; std::future centroids_future = @@ -929,13 +932,13 @@ auto nuv_query_heap_finite_ram( // centroids.load(); std::future query_future = std::async(std::launch::async, [&]() { - auto query = tdbColMajorMatrix( + auto query = tdbColMajorMatrix( ctx, query_uri, nqueries); query.load(); return query; }); // auto query = - // tdbColMajorMatrix(ctx, query_uri, + // tdbColMajorMatrix(ctx, query_uri, // nqueries); // query.load(); @@ -1000,7 +1003,7 @@ auto nuv_query_heap_finite_ram( * @param nthreads How many threads to use for parallel execution * @return The indices of the top_k neighbors for each query vector */ -template +template auto nuv_query_heap_finite_ram( tiledb::Context& ctx, const std::string& part_uri, @@ -1017,7 +1020,6 @@ auto nuv_query_heap_finite_ram( // Check that the size of the indices vector is correct assert(size(indices) == centroids.num_cols() + 1); - using score_type = float; using indices_type = typename std::remove_reference_t::value_type; @@ -1029,8 +1031,8 @@ auto nuv_query_heap_finite_ram( using parts_type = typename decltype(active_partitions)::value_type; auto partitioned_db = tdbColMajorPartitionedMatrix< - feature_type, - id_type, + T, + partitioned_ids_type, indices_type, parts_type>( ctx, part_uri, indices, active_partitions, id_uri, upper_bound); @@ -1050,24 +1052,24 @@ auto nuv_query_heap_finite_ram( max_partition_size = std::max(max_partition_size, partition_size); _memory_data.insert_entry( tdb_func__ + " (predicted)", - partition_size * sizeof(feature_type) * partitioned_db.num_rows()); + partition_size * sizeof(T) * partitioned_db.num_rows()); } _memory_data.insert_entry( tdb_func__ + " (upper bound)", - nprobe * num_queries * sizeof(feature_type) * max_partition_size); + nprobe * num_queries * sizeof(T) * max_partition_size); } assert(partitioned_db.num_cols() == size(partitioned_db.ids())); debug_matrix(partitioned_db, "partitioned_db"); debug_matrix(partitioned_db.ids(), "partitioned_db.ids()"); - // auto min_scores = std::vector>( - // size(q), fixed_min_pair_heap(k_nn)); + // auto min_scores = std::vector>( + // size(q), fixed_min_pair_heap(k_nn)); - std::vector>> min_scores( + std::vector>> min_scores( nthreads, - std::vector>( - num_queries, fixed_min_pair_heap(k_nn))); + std::vector>( + num_queries, fixed_min_pair_heap(k_nn))); log_timer _i{tdb_func__ + " in RAM"}; @@ -1176,7 +1178,7 @@ auto nuv_query_heap_finite_ram( * @param nthreads How many threads to use for parallel execution * @return The indices of the top_k neighbors for each query vector */ -template +template auto nuv_query_heap_finite_ram_reg_blocked( tiledb::Context& ctx, const std::string& part_uri, @@ -1193,7 +1195,6 @@ auto nuv_query_heap_finite_ram_reg_blocked( // Check that the size of the indices vector is correct assert(size(indices) == centroids.num_cols() + 1); - using score_type = float; using indices_type = typename std::remove_reference_t::value_type; @@ -1205,8 +1206,8 @@ auto nuv_query_heap_finite_ram_reg_blocked( using parts_type = typename decltype(active_partitions)::value_type; auto partitioned_db = tdbColMajorPartitionedMatrix< - feature_type, - id_type, + T, + 
partitioned_ids_type, indices_type, parts_type>( ctx, part_uri, indices, active_partitions, id_uri, upper_bound); @@ -1226,24 +1227,24 @@ auto nuv_query_heap_finite_ram_reg_blocked( max_partition_size = std::max(max_partition_size, partition_size); _memory_data.insert_entry( tdb_func__ + " (predicted)", - partition_size * sizeof(feature_type) * partitioned_db.num_rows()); + partition_size * sizeof(T) * partitioned_db.num_rows()); } _memory_data.insert_entry( tdb_func__ + " (upper bound)", - nprobe * num_queries * sizeof(feature_type) * max_partition_size); + nprobe * num_queries * sizeof(T) * max_partition_size); } assert(partitioned_db.num_cols() == size(partitioned_db.ids())); debug_matrix(partitioned_db, "partitioned_db"); debug_matrix(partitioned_db.ids(), "partitioned_db.ids()"); - // auto min_scores = std::vector>( - // size(q), fixed_min_pair_heap(k_nn)); + // auto min_scores = std::vector>( + // size(q), fixed_min_pair_heap(k_nn)); - std::vector>> min_scores( + std::vector>> min_scores( nthreads, - std::vector>( - num_queries, fixed_min_pair_heap(k_nn))); + std::vector>( + num_queries, fixed_min_pair_heap(k_nn))); log_timer _i{tdb_func__ + " in RAM"}; @@ -1407,13 +1408,9 @@ auto apply_query( size_t last_part) { // print_types(query, partitioned_db, new_indices, active_queries); - // using feature_type = typename std::remove_reference_t::value_type; - using id_type = typename std::remove_reference_t::value_type; - using score_type = float; - auto num_queries = size(query); - auto min_scores = std::vector>( - num_queries, fixed_min_pair_heap(k_nn)); + auto min_scores = std::vector>( + num_queries, fixed_min_pair_heap(k_nn)); size_t part_offset = 0; size_t col_offset = 0; @@ -1532,7 +1529,7 @@ auto apply_query( * @param min_parts_per_thread Unused (WIP for threading heuristics) * @return The indices of the top_k neighbors for each query vector */ -template +template auto query_finite_ram( tiledb::Context& ctx, const std::string& part_uri, @@ -1546,11 +1543,10 @@ auto query_finite_ram( size_t nthreads, size_t min_parts_per_thread = 0) { scoped_timer _{tdb_func__ + " " + part_uri}; - + // Check that the size of the indices vector is correct assert(size(indices) == centroids.num_cols() + 1); - using score_type = float; using indices_type = typename std::remove_reference_t::value_type; @@ -1562,8 +1558,8 @@ auto query_finite_ram( using parts_type = typename decltype(active_partitions)::value_type; auto partitioned_db = tdbColMajorPartitionedMatrix< - feature_type, - id_type, + T, + partitioned_ids_type, indices_type, parts_type>( ctx, part_uri, indices, active_partitions, id_uri, upper_bound); @@ -1585,19 +1581,19 @@ auto query_finite_ram( max_partition_size = std::max(max_partition_size, partition_size); _memory_data.insert_entry( tdb_func__ + " (predicted)", - partition_size * sizeof(feature_type) * partitioned_db.num_rows()); + partition_size * sizeof(T) * partitioned_db.num_rows()); } _memory_data.insert_entry( tdb_func__ + " (upper bound)", - nprobe * num_queries * sizeof(feature_type) * max_partition_size); + nprobe * num_queries * sizeof(T) * max_partition_size); } assert(partitioned_db.num_cols() == size(partitioned_db.ids())); debug_matrix(partitioned_db, "partitioned_db"); debug_matrix(partitioned_db.ids(), "partitioned_db.ids()"); - auto min_scores = std::vector>( - num_queries, fixed_min_pair_heap(k_nn)); + auto min_scores = std::vector>( + num_queries, fixed_min_pair_heap(k_nn)); while (partitioned_db.load()) { _i.start(); @@ -1700,10 +1696,6 @@ auto query_infinite_ram( 
size_t nthreads) { scoped_timer _{tdb_func__ + std::string{"_in_ram"}}; - // using feature_type = typename std::remove_reference_t::value_type; - using id_type = typename std::remove_reference_t::value_type; - using score_type = float; - assert(partitioned_db.num_cols() == partitioned_ids.size()); // Check that the indices vector is the right size @@ -1720,8 +1712,8 @@ auto query_infinite_ram( std::vector new_indices(size(active_partitions) + 1); - auto min_scores = std::vector>( - num_queries, fixed_min_pair_heap(k_nn)); + auto min_scores = std::vector>( + num_queries, fixed_min_pair_heap(k_nn)); size_t parts_per_thread = (size(active_partitions) + nthreads - 1) / nthreads; @@ -1776,7 +1768,7 @@ auto query_infinite_ram( return top_k; } -template +template auto query_infinite_ram( tiledb::Context& ctx, const std::string& part_uri, @@ -1791,9 +1783,9 @@ auto query_infinite_ram( // Read the shuffled database and ids // @todo To this more systematically - auto partitioned_db = tdbColMajorMatrix(ctx, part_uri); + auto partitioned_db = tdbColMajorMatrix(ctx, part_uri); partitioned_db.load(); - auto partitioned_ids = read_vector(ctx, id_uri); + auto partitioned_ids = read_vector(ctx, id_uri); return query_infinite_ram( partitioned_db, diff --git a/src/include/detail/ivf/vq.h b/src/include/detail/ivf/vq.h index 5780bb277..223e1a69b 100644 --- a/src/include/detail/ivf/vq.h +++ b/src/include/detail/ivf/vq.h @@ -78,7 +78,7 @@ namespace detail::ivf { * Note that this algorithm is essentially the transpose of the one in qv.h. * * @param query The set of all query vectors. - * @param partitioned_db The partitioned set of vectors to be searched + * @param shuffled_db The partitioned set of vectors to be searched * @param new_indices The indices delimiting the partitions. * @param active_queries Indicates which queries to apply to each of the active * partitions. 
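 *
 * Schematically, this ordering runs partitions in the outer loop and
 * queries in the inner loop (a sketch of the structure, not the blocked
 * implementation below):
 * @code
 *   for (partno : active partitions)
 *     for (kp : vectors of partition partno)
 *       for (j : active_queries[partno])
 *         min_scores[j].insert(L2(query[j], shuffled_db[kp]), ids[kp]);
 * @endcode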
@@ -92,7 +92,7 @@ namespace detail::ivf { */ auto vq_apply_query( auto&& query, - auto&& partitioned_db, + auto&& shuffled_db, auto&& new_indices, auto&& active_queries, auto&& ids, @@ -102,20 +102,18 @@ auto vq_apply_query( size_t last_part) { auto num_queries = size(query); - // using feature_type = typename - // std::remove_reference_t::value_type; - using id_type = typename std::remove_reference_t::value_type; - using score_type = float; + // std::cout << "thread " << n << " running " << first_part << " to " << + // last_part << std::endl; size_t part_offset = 0; size_t col_offset = 0; - if constexpr (has_num_col_parts) { - part_offset = partitioned_db.col_part_offset(); - col_offset = partitioned_db.col_offset(); + if constexpr (has_num_col_parts) { + part_offset = shuffled_db.col_part_offset(); + col_offset = shuffled_db.col_offset(); } - auto min_scores = std::vector>( - num_queries, fixed_min_pair_heap(k_nn)); + auto min_scores = std::vector>( + num_queries, fixed_min_pair_heap(k_nn)); /** * Loop over given partitons to be searched @@ -125,7 +123,7 @@ auto vq_apply_query( // @todo this is a bit of a hack auto quartno = partno; - if constexpr (!has_num_col_parts) { + if constexpr (!has_num_col_parts) { quartno = active_partitions[partno]; } @@ -153,10 +151,10 @@ auto vq_apply_query( auto q_vec_0 = query[j0]; auto q_vec_1 = query[j1]; - auto score_00 = L2(q_vec_0, partitioned_db[kp + 0]); - auto score_01 = L2(q_vec_0, partitioned_db[kp + 1]); - auto score_10 = L2(q_vec_1, partitioned_db[kp + 0]); - auto score_11 = L2(q_vec_1, partitioned_db[kp + 1]); + auto score_00 = L2(q_vec_0, shuffled_db[kp + 0]); + auto score_01 = L2(q_vec_0, shuffled_db[kp + 1]); + auto score_10 = L2(q_vec_1, shuffled_db[kp + 0]); + auto score_11 = L2(q_vec_1, shuffled_db[kp + 1]); min_scores[j0].insert(score_00, ids[kp + 0]); min_scores[j0].insert(score_01, ids[kp + 1]); @@ -171,8 +169,8 @@ auto vq_apply_query( auto j0 = j[0]; auto q_vec_0 = query[j0]; - auto score_00 = L2(q_vec_0, partitioned_db[kp + 0]); - auto score_01 = L2(q_vec_0, partitioned_db[kp + 1]); + auto score_00 = L2(q_vec_0, shuffled_db[kp + 0]); + auto score_01 = L2(q_vec_0, shuffled_db[kp + 1]); min_scores[j0].insert(score_00, ids[kp + 0]); min_scores[j0].insert(score_01, ids[kp + 1]); } @@ -188,8 +186,8 @@ auto vq_apply_query( auto q_vec_0 = query[j0]; auto q_vec_1 = query[j1]; - auto score_00 = L2(q_vec_0, partitioned_db[kp + 0]); - auto score_10 = L2(q_vec_1, partitioned_db[kp + 0]); + auto score_00 = L2(q_vec_0, shuffled_db[kp + 0]); + auto score_10 = L2(q_vec_1, shuffled_db[kp + 0]); min_scores[j0].insert(score_00, ids[kp + 0]); min_scores[j1].insert(score_10, ids[kp + 0]); @@ -202,7 +200,7 @@ auto vq_apply_query( auto j0 = j[0]; auto q_vec_0 = query[j0]; - auto score_00 = L2(q_vec_0, partitioned_db[kp + 0]); + auto score_00 = L2(q_vec_0, shuffled_db[kp + 0]); min_scores[j0].insert(score_00, ids[kp + 0]); } } @@ -216,23 +214,17 @@ auto vq_apply_query( * already have loaded all of its data. 
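 *
 * The active partitions are divided evenly among the threads; a sketch of
 * the ceiling-division bookkeeping used below:
 * @code
 *   size_t parts_per_thread =
 *       (size(active_partitions) + nthreads - 1) / nthreads;
 *   for (size_t n = 0; n < nthreads; ++n) {
 *     auto first_part = n * parts_per_thread;
 *     auto last_part =
 *         std::min((n + 1) * parts_per_thread, size(active_partitions));
 *     // each thread runs vq_apply_query over [first_part, last_part)
 *   }
 * @endcode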
*/ auto vq_query_infinite_ram( - auto&& partitioned_db, + auto&& shuffled_db, auto&& centroids, auto&& query, auto&& indices, - auto&& partitioned_ids, + auto&& shuffled_ids, size_t nprobe, size_t k_nn, size_t nthreads) { scoped_timer _{tdb_func__ + std::string{"_in_ram"}}; - // using feature_type = typename - // std::remove_reference_t::value_type; - using id_type = - typename std::remove_reference_t::value_type; - using score_type = float; - - assert(partitioned_db.num_cols() == partitioned_ids.size()); + assert(shuffled_db.num_cols() == shuffled_ids.size()); // Check that the indices vector is the right size assert(size(indices) == centroids.num_cols() + 1); @@ -248,8 +240,8 @@ auto vq_query_infinite_ram( std::vector new_indices(size(active_partitions) + 1); - auto min_scores = std::vector>( - num_queries, fixed_min_pair_heap(k_nn)); + auto min_scores = std::vector>( + num_queries, fixed_min_pair_heap(k_nn)); size_t parts_per_thread = (size(active_partitions) + nthreads - 1) / nthreads; @@ -266,20 +258,20 @@ auto vq_query_infinite_ram( futs.emplace_back(std::async( std::launch::async, [&query, - &partitioned_db, + &shuffled_db, &indices, &active_queries = active_queries, &active_partitions = active_partitions, - &partitioned_ids, + &shuffled_ids, k_nn, first_part, last_part]() { return vq_apply_query( query, - partitioned_db, + shuffled_db, indices, active_queries, - partitioned_ids, + shuffled_ids, active_partitions, k_nn, first_part, @@ -309,7 +301,7 @@ auto vq_query_infinite_ram( * loads them each into a matrix, and then calls `vq_query_infinite_ram` * above. */ -template +template auto vq_query_infinite_ram( tiledb::Context& ctx, const std::string& part_uri, @@ -322,14 +314,14 @@ auto vq_query_infinite_ram( size_t nthreads) { scoped_timer _{tdb_func__}; - // Read the partitioned database and ids + // Read the shuffled database and ids // @todo To this more systematically - auto partitioned_db = tdbColMajorMatrix(ctx, part_uri); - partitioned_db.load(); - auto partitioned_ids = read_vector(ctx, id_uri); + auto shuffled_db = tdbColMajorMatrix(ctx, part_uri); + shuffled_db.load(); + auto shuffled_ids = read_vector(ctx, id_uri); return vq_query_infinite_ram( - partitioned_db, centroids, q, indices, partitioned_ids, nprobe, k_nn, nthreads); + shuffled_db, centroids, q, indices, shuffled_ids, nprobe, k_nn, nthreads); } /** @@ -338,24 +330,20 @@ auto vq_query_infinite_ram( * already have loaded all of its data. 
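 *
 * This variant differs from vq_query_infinite_ram above in that the worker
 * threads score directly into a shared min_scores vector rather than
 * returning per-thread heaps to be merged; the per-vector step is roughly:
 * @code
 *   auto score = L2(query[j], shuffled_db[kp]);
 *   min_scores[j].insert(score, shuffled_db.ids()[kp]);
 * @endcode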
*/ auto vq_query_infinite_ram_2( - auto&& partitioned_db, + auto&& shuffled_db, auto&& centroids, auto&& query, auto&& indices, - auto&& partitioned_ids, + auto&& shuffled_ids, size_t nprobe, size_t k_nn, size_t nthreads) { scoped_timer _{tdb_func__ + std::string{"_in_ram"}}; - assert(partitioned_db.num_cols() == partitioned_ids.size()); + assert(shuffled_db.num_cols() == shuffled_ids.size()); // Check that the indices vector is the right size assert(size(indices) == centroids.num_cols() + 1); - - // using feature_type = typename std::remove_reference::value_type; - using id_type = typename std::remove_reference::value_type; - using score_type = float; auto num_queries = size(query); @@ -368,8 +356,8 @@ auto vq_query_infinite_ram_2( std::vector new_indices(size(active_partitions) + 1); - auto min_scores = std::vector>( - num_queries, fixed_min_pair_heap(k_nn)); + auto min_scores = std::vector>( + num_queries, fixed_min_pair_heap(k_nn)); size_t parts_per_thread = (size(active_partitions) + nthreads - 1) / nthreads; @@ -387,7 +375,7 @@ auto vq_query_infinite_ram_2( std::launch::async, [&query, &min_scores, - &partitioned_db, + &shuffled_db, &new_indices = indices, &active_queries = active_queries, &active_partitions = active_partitions, @@ -408,15 +396,15 @@ auto vq_query_infinite_ram_2( */ for (size_t k = start; k < stop; ++k) { - auto kp = k - partitioned_db.col_offset(); + auto kp = k - shuffled_db.col_offset(); for (auto j : active_queries[partno]) { // @todo shift start / stop back by the offset - auto score = L2(query[j], partitioned_db[kp]); + auto score = L2(query[j], shuffled_db[kp]); // @todo any performance with apparent extra indirection? - min_scores[j].insert(score, partitioned_db.ids()[kp]); + min_scores[j].insert(score, shuffled_db.ids()[kp]); } } } @@ -445,7 +433,7 @@ auto vq_query_infinite_ram_2( * loads them each into a matrix, and then calls `vq_query_infinite_ram` * above. */ -template +template auto vq_query_infinite_ram_2( tiledb::Context& ctx, const std::string& part_uri, @@ -458,14 +446,14 @@ auto vq_query_infinite_ram_2( size_t nthreads) { scoped_timer _{tdb_func__}; - // Read the partitioned database and ids + // Read the shuffled database and ids // @todo To this more systematically - auto partitioned_db = tdbColMajorMatrix(ctx, part_uri); - partitioned_db.load(); - auto partitioned_ids = read_vector(ctx, id_uri); + auto shuffled_db = tdbColMajorMatrix(ctx, part_uri); + shuffled_db.load(); + auto shuffled_ids = read_vector(ctx, id_uri); return vq_query_infinite_ram( - partitioned_db, centroids, q, indices, partitioned_ids, nprobe, k_nn, nthreads); + shuffled_db, centroids, q, indices, shuffled_ids, nprobe, k_nn, nthreads); } /* @@ -477,7 +465,7 @@ auto vq_query_infinite_ram_2( * function then invoked `vq_apply_query` on the partitioned matrix in * parallel fashion, decomposing over the partitions. 
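 *
 * The out-of-core structure is a do/while over blocks of at most
 * upper_bound vectors; a sketch:
 * @code
 *   do {
 *     // partition the resident block among threads, run vq_apply_query,
 *     // and merge the returned heaps into min_scores
 *   } while (load(shuffled_db));  // fetch the next block, if any
 * @endcode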
*/ -template +template auto vq_query_finite_ram( tiledb::Context& ctx, const std::string& part_uri, @@ -495,8 +483,6 @@ auto vq_query_finite_ram( // Check that the size of the indices vector is correct assert(size(indices) == centroids.num_cols() + 1); - using score_type = float; - using indices_type = typename std::remove_reference_t::value_type; @@ -507,13 +493,13 @@ auto vq_query_finite_ram( using parts_type = typename decltype(active_partitions)::value_type; - auto partitioned_db = tdbColMajorPartitionedMatrix< - feature_type, - id_type, + auto shuffled_db = tdbColMajorPartitionedMatrix< + T, + shuffled_ids_type, indices_type, parts_type>( ctx, part_uri, indices, active_partitions, id_uri, upper_bound); - load(partitioned_db); + load(shuffled_db); log_timer _i{tdb_func__ + " in RAM"}; @@ -532,24 +518,24 @@ auto vq_query_finite_ram( max_partition_size = std::max(max_partition_size, partition_size); _memory_data.insert_entry( tdb_func__ + " (predicted)", - partition_size * sizeof(feature_type) * partitioned_db.num_rows()); + partition_size * sizeof(T) * shuffled_db.num_rows()); } _memory_data.insert_entry( tdb_func__ + " (upper bound)", - nprobe * num_queries * sizeof(feature_type) * max_partition_size); + nprobe * num_queries * sizeof(T) * max_partition_size); } - assert(partitioned_db.num_cols() == size(partitioned_db.ids())); - debug_matrix(partitioned_db, "partitioned_db"); - debug_matrix(partitioned_db.ids(), "partitioned_db.ids()"); + assert(shuffled_db.num_cols() == size(shuffled_db.ids())); + debug_matrix(shuffled_db, "shuffled_db"); + debug_matrix(shuffled_db.ids(), "shuffled_db.ids()"); - auto min_scores = std::vector>( - num_queries, fixed_min_pair_heap(k_nn)); + auto min_scores = std::vector>( + num_queries, fixed_min_pair_heap(k_nn)); do { _i.start(); - auto current_part_size = partitioned_db.num_col_parts(); + auto current_part_size = shuffled_db.num_col_parts(); size_t parts_per_thread = (current_part_size + nthreads - 1) / nthreads; @@ -567,7 +553,7 @@ auto vq_query_finite_ram( futs.emplace_back(std::async( std::launch::async, [&query, - &partitioned_db, + &shuffled_db, &new_indices, &active_queries = active_queries, &active_partitions = active_partitions, @@ -576,10 +562,10 @@ auto vq_query_finite_ram( last_part]() { return vq_apply_query( query, - partitioned_db, + shuffled_db, new_indices, active_queries, - partitioned_db.ids(), + shuffled_db.ids(), active_partitions, k_nn, first_part, @@ -600,14 +586,14 @@ auto vq_query_finite_ram( } _i.stop(); - } while (load(partitioned_db)); + } while (load(shuffled_db)); auto top_k = get_top_k_with_scores(min_scores, k_nn); return top_k; } -template +template auto vq_query_finite_ram_2( tiledb::Context& ctx, const std::string& part_uri, @@ -622,7 +608,6 @@ auto vq_query_finite_ram_2( size_t min_parts_per_thread = 0) { scoped_timer _{tdb_func__ + " " + part_uri}; - using score_type = float; using indices_type = typename std::remove_reference_t::value_type; @@ -633,9 +618,9 @@ auto vq_query_finite_ram_2( using parts_type = typename decltype(active_partitions)::value_type; - auto partitioned_db = tdbColMajorPartitionedMatrix< - feature_type, - id_type, + auto shuffled_db = tdbColMajorPartitionedMatrix< + T, + shuffled_ids_type, indices_type, parts_type>( ctx, part_uri, indices, active_partitions, id_uri, upper_bound); @@ -650,15 +635,15 @@ auto vq_query_finite_ram_2( } auto min_scores = - std::vector>>( + std::vector>>( nthreads, - std::vector>( - num_queries, fixed_min_pair_heap(k_nn))); + std::vector>( + num_queries, 
fixed_min_pair_heap(k_nn))); - while (partitioned_db.load()) { + while (shuffled_db.load()) { _i.start(); - auto current_part_size = partitioned_db.num_col_parts(); + auto current_part_size = shuffled_db.num_col_parts(); size_t parts_per_thread = (current_part_size + nthreads - 1) / nthreads; @@ -677,7 +662,7 @@ auto vq_query_finite_ram_2( std::launch::async, [&query, &min_scores, - &partitioned_db, + &shuffled_db, &new_indices, &active_queries = active_queries, n, @@ -688,7 +673,7 @@ auto vq_query_finite_ram_2( * partition as their top centroid. */ for (size_t p = first_part; p < last_part; ++p) { - auto partno = p + partitioned_db.col_part_offset(); + auto partno = p + shuffled_db.col_part_offset(); auto start = new_indices[partno]; auto stop = new_indices[partno + 1]; @@ -698,15 +683,15 @@ auto vq_query_finite_ram_2( */ for (size_t k = start; k < stop; ++k) { - auto kp = k - partitioned_db.col_offset(); + auto kp = k - shuffled_db.col_offset(); for (auto j : active_queries[partno]) { // @todo shift start / stop back by the offset - auto score = L2(query[j], partitioned_db[kp]); + auto score = L2(query[j], shuffled_db[kp]); // @todo any performance with apparent extra indirection? - min_scores[n][j].insert(score, partitioned_db.ids()[kp]); + min_scores[n][j].insert(score, shuffled_db.ids()[kp]); } } } @@ -726,4 +711,4 @@ auto vq_query_finite_ram_2( } // namespace detail::ivf -#endif // TILEDB_IVF_VQ_H +#endif // TILEDB_IVF_VQ_H \ No newline at end of file diff --git a/src/include/detail/linalg/linalg_defs.h b/src/include/detail/linalg/linalg_defs.h index 5e431a78e..0d9320a9c 100644 --- a/src/include/detail/linalg/linalg_defs.h +++ b/src/include/detail/linalg/linalg_defs.h @@ -40,4 +40,8 @@ using namespace Kokkos; using namespace Kokkos::Experimental; } // namespace stdx +extern bool global_verbose; +extern bool global_debug; +extern std::string global_region; + #endif // TDB_LINALG_DEFS_H \ No newline at end of file diff --git a/src/include/detail/linalg/matrix.h b/src/include/detail/linalg/matrix.h index d81fc8d80..26bf7d40a 100644 --- a/src/include/detail/linalg/matrix.h +++ b/src/include/detail/linalg/matrix.h @@ -201,6 +201,13 @@ class Matrix : public stdx::mdspan, LayoutPolicy> { auto num_cols() const noexcept { return num_cols_; } + + auto swap(Matrix& rhs) noexcept { + std::swap(num_rows_, rhs.num_rows_); + std::swap(num_cols_, rhs.num_cols_); + std::swap(storage_, rhs.storage_); + std::swap(static_cast(*this), static_cast(rhs)); + } }; /** @@ -293,11 +300,9 @@ std::string matrix_info(const std::span& A, const std::string& msg = "") { return str; } -static bool matrix_printf = false; - template void debug_matrix(const Matrix& A, const std::string& msg = "") { - if (matrix_printf) { + if (global_debug) { std::cout << matrix_info(A, msg) << std::endl; } } @@ -308,7 +313,7 @@ void debug_slice( const std::string& msg = "", size_t rows = 5, size_t cols = 15) { - if (matrix_printf) { + if (global_debug) { rows = std::min(rows, A.num_rows()); cols = std::min(cols, A.num_cols()); @@ -330,7 +335,7 @@ void debug_slices_diff( const std::string& msg = "", size_t rows = 5, size_t cols = 15) { - if (matrix_printf) { + if (global_debug) { rows = std::min(rows, A.num_rows()); cols = std::min(cols, A.num_cols()); diff --git a/src/include/detail/linalg/tdb_io.h b/src/include/detail/linalg/tdb_io.h index 102465337..b0dc504a2 100644 --- a/src/include/detail/linalg/tdb_io.h +++ b/src/include/detail/linalg/tdb_io.h @@ -45,6 +45,9 @@ void create_matrix( const tiledb::Context& ctx, const Matrix& A, const 
std::string& uri) {
+  if (global_debug) {
+    std::cerr << "# Creating Matrix: " << uri << std::endl;
+  }
   // @todo: make this a parameter
   size_t num_parts = 10;
@@ -83,6 +86,9 @@ void write_matrix(
     size_t start_pos = 0,
     bool create = true) {
   scoped_timer _{tdb_func__ + " " + std::string{uri}};
+  if (global_debug) {
+    std::cerr << "# Writing Matrix: " << uri << std::endl;
+  }
   if (create) {
     create_matrix(ctx, A, uri);
@@ -118,6 +124,9 @@ void write_matrix(
 template
 void create_vector(
     const tiledb::Context& ctx, std::vector& v, const std::string& uri) {
+  if (global_debug) {
+    std::cerr << "# Creating std::vector: " << uri << std::endl;
+  }
   size_t num_parts = 10;
   size_t tile_extent = (size(v) + num_parts - 1) / num_parts;
@@ -147,6 +156,10 @@ void write_vector(
     bool create = true) {
   scoped_timer _{tdb_func__ + " " + std::string{uri}};
+  if (global_debug) {
+    std::cerr << "# Writing std::vector: " << uri << std::endl;
+  }
+
   if (create) {
     create_vector(ctx, v, uri);
   }
@@ -184,6 +197,10 @@ std::vector read_vector(
     size_t end_pos = 0) {
   scoped_timer _{tdb_func__ + " " + std::string{uri}};
+  if (global_debug) {
+    std::cerr << "# Reading std::vector: " << uri << std::endl;
+  }
+
   tiledb::Array array_ =
       tiledb_helpers::open_array(tdb_func__, ctx, uri, TILEDB_READ);
   auto schema_ = array_.schema();
diff --git a/src/include/detail/linalg/tdb_partitioned_matrix.h b/src/include/detail/linalg/tdb_partitioned_matrix.h
index 737efd89f..7e4e929ad 100644
--- a/src/include/detail/linalg/tdb_partitioned_matrix.h
+++ b/src/include/detail/linalg/tdb_partitioned_matrix.h
@@ -62,6 +62,10 @@ using namespace Kokkos;
 using namespace Kokkos::Experimental;
 }  // namespace stdx
+extern bool global_verbose;
+extern bool global_debug;
+extern std::string global_region;
+
 /**
  *
  * @note The template parameters indices_type and parts_type are deduced using
diff --git a/src/include/ivf_index.h b/src/include/ivf_index.h
index f39363908..75465d0d5 100644
--- a/src/include/ivf_index.h
+++ b/src/include/ivf_index.h
@@ -58,16 +58,20 @@
 #include "detail/flat/qv.h"
 #include "detail/ivf/index.h"
 
-template
+enum class kmeans_init { none, kmeanspp, random };
+
+template <
+    class T,
+    class shuffled_ids_type = size_t,
+    class indices_type = size_t>
 class kmeans_index {
-  // Random device to seed the random number generator
-  std::random_device rd;
-  std::mt19937 gen{rd()};
+  std::mt19937 gen;
 
   size_t dimension_{0};
   size_t nlist_;
   size_t max_iter_;
   double tol_;
+  double reassign_ratio_{0.075};
   size_t nthreads_{std::thread::hardware_concurrency()};
 
   ColMajorMatrix centroids_;
@@ -76,22 +80,59 @@ class kmeans_index {
   ColMajorMatrix shuffled_db_;
 
  public:
+  using value_type = T;
+  using index_type = indices_type;
+
+  /**
+   * @brief Construct a new kmeans index object, setting a number of parameters
+   * to be used subsequently in training.
+   * @param dimension Dimension of the vectors comprising the training set and
+   * the data set.
+   * @param nlist Number of centroids / partitions to compute.
+   * @param max_iter Maximum number of iterations for the kmeans algorithm.
+   * @param tol Convergence tolerance for the kmeans algorithm.
+   * @param nthreads Number of threads to use when executing in parallel.
+   * @param seed Seed for the random number generator.
+   */
  kmeans_index(
      size_t dimension,
      size_t nlist,
      size_t max_iter,
-      double tol,
-      size_t nthreads)
-      : dimension_(dimension)
+      double tol = 0.000025,
+      std::optional nthreads = std::nullopt,
+      std::optional seed = std::nullopt)
+      : gen(seed ? *seed : std::random_device{}())
+      , dimension_(dimension)
      , nlist_(nlist)
      , max_iter_(max_iter)
      , tol_(tol)
-      , nthreads_(nthreads)
+      , nthreads_(nthreads ? *nthreads : std::thread::hardware_concurrency())
      , centroids_(dimension, nlist) {
  }

  /**
-   * @brief Use kmeans++ algorithm to choose initial centroids.
+   * @brief Use the kmeans++ algorithm to choose initial centroids.
+   * The current implementation follows the algorithm described in
+   * the literature (Arthur and Vassilvitskii, 2007):
+   *
+   * 1. Choose an initial centroid uniformly at random from the training set.
+   * 2. For each data point x not chosen yet, compute D(x), the distance
+   *    between x and the nearest centroid that has already been chosen.
+   * 3. Choose one new data point at random as a new centroid, using a
+   *    weighted probability distribution where a point x is chosen
+   *    with probability proportional to D(x)^2.
+   * 4. Repeat steps 2 and 3 until k centroids have been chosen.
+   *
+   * The initial centroids are stored in the member variable `centroids_`.
+   * It is expected that the centroids will be further refined by the
+   * kmeans algorithm.
+   *
+   * @param training_set Array of vectors to cluster.
+   *
+   * @todo Implement greedy kmeans++: choose several new centers during each
+   * iteration, and then greedily choose the one that most decreases φ
+   * @todo Finish implementation using triangle inequality.
   */
  void kmeans_pp(const ColMajorMatrix& training_set) {
    scoped_timer _{__FUNCTION__};
@@ -104,18 +145,9 @@ class kmeans_index {
        end(training_set[choice]),
        begin(centroids_[0]));
-    // Choose one center uniformly at random among the data points.
-    // For each data point x not chosen yet, compute D(x), the distance
-    // between x and the nearest center that has already been chosen.
-    // Choose one new data point at random as a new center, using a
-    // weighted probability distribution where a point x is chosen
-    // with probability proportional to D(x)^2.
-    // Repeat steps 2 and 3 until k centers have been chosen.
-    // Now that the initial centers have been chosen, proceed using
-    // standard k-means clustering.
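+    // A self-contained sketch of the D(x)^2-weighted choice made below.
+    // std::discrete_distribution normalizes its weights internally, so the
+    // raw squared distances can be passed straight through:
+    //   std::vector<double> d = {1.0, 4.0, 0.0};   // 0.0: already chosen
+    //   std::discrete_distribution<size_t> pick(d.begin(), d.end());
+    //   size_t next = pick(gen);   // P(0) = 0.2, P(1) = 0.8, P(2) = 0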
- + // Initialize distances, leaving some room to grow std::vector distances( - training_set.num_cols(), std::numeric_limits::max() / 8); + training_set.num_cols(), std::numeric_limits::max() / 8192); #ifdef _TRIANGLE_INEQUALITY std::vector centroid_centroid(nlist_, 0.0); @@ -124,16 +156,13 @@ class kmeans_index { // Calculate the remaining centroids using K-means++ algorithm for (size_t i = 1; i < nlist_; ++i) { - std::vector totalDistance(nthreads_, 0.0); stdx::execution::indexed_parallel_policy par{nthreads_}; - stdx::range_for_each( std::move(par), training_set, - [this, &distances, &totalDistance, i]( - auto&& vec, size_t n, size_t j) { + [this, &distances, i](auto&& vec, size_t n, size_t j) { - // centroid i-1 is the newest centroid + // Note: centroid i-1 is the newest centroid #ifdef _TRIANGLE_INEQUALITY // using triangle inequality, only need to calculate distance to the @@ -154,20 +183,12 @@ class kmeans_index { double distance = sum_of_squares(vec, centroids_[i - 1]); auto min_distance = std::min(distances[j], distance); distances[j] = min_distance; - totalDistance[n] += min_distance; #endif }); - double total = - std::accumulate(begin(totalDistance), end(totalDistance), 0.0); - - // This isn't really necessary for the discrete_distribution - // std::for_each(begin(distances), end(distances), [total](auto& element) - // { - // element /= total; // Normalize - // }); // Select the next centroid based on the probability proportional to - // distance squared + // distance squared -- note we did not normalize the vectors ourselves + // since `discrete_distribution` implicitly does that for us std::discrete_distribution probabilityDistribution( distances.begin(), distances.end()); size_t nextIndex = probabilityDistribution(gen); @@ -190,14 +211,21 @@ class kmeans_index { /** * @brief Initialize centroids by choosing them at random from training set. + * @param training_set Array of vectors to cluster. */ void kmeans_random_init(const ColMajorMatrix& training_set) { scoped_timer _{__FUNCTION__}; std::vector indices(nlist_); + std::vector visited(training_set.num_cols(), false); std::uniform_int_distribution<> dis(0, training_set.num_cols() - 1); for (size_t i = 0; i < nlist_; ++i) { - indices[i] = dis(gen); + size_t index; + do { + index = dis(gen); + } while (visited[index]); + indices[i] = index; + visited[index] = true; } // std::iota(begin(indices), end(indices), 0); // std::shuffle(begin(indices), end(indices), gen); @@ -211,60 +239,145 @@ class kmeans_index { } /** - * @brief Use kmeans algorithm to cluster vectors into centroids. + * @brief Use kmeans algorithm to cluster vectors into centroids. Beginning + * with an initial set of centroids, the algorithm iteratively partitions + * the training_set into clusters, and then recomputes new centroids based + * on the clusters. The algorithm terminates when the change in centroids + * is less than a threshold tolerance, or when a maximum number of + * iterations is reached. + * + * @param training_set Array of vectors to cluster. 
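+   *
+   * A sketch of one iteration (names match the implementation below):
+   * @code
+   *   // 1. assign each training vector to its nearest centroid
+   *   //    (qv_partition_with_scores -> scores, parts)
+   *   // 2. accumulate: new_centroids[parts[i]] += training_set[i], ++degrees
+   *   // 3. re-seed under-populated partitions with high-score vectors
+   *   // 4. divide by degrees, then compare centroid movement against tol_
+   * @endcode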
*/ void train_no_init(const ColMajorMatrix& training_set) { scoped_timer _{__FUNCTION__}; std::vector degrees(nlist_, 0); + ColMajorMatrix new_centroids(dimension_, nlist_); for (size_t iter = 0; iter < max_iter_; ++iter) { - auto parts = - detail::flat::qv_partition(centroids_, training_set, nthreads_); - - // for (auto & p : parts) { - // std::cout << p << " "; - // } - // std::cout << std::endl; - - for (size_t j = 0; j < nlist_; ++j) { - std::fill(begin(centroids_[j]), end(centroids_[j]), 0.0); - } + auto [scores, parts] = detail::flat::qv_partition_with_scores( + centroids_, training_set, nthreads_); + // auto parts = detail::flat::qv_partition(centroids_, training_set, + // nthreads_); + + std::fill( + new_centroids.data(), + new_centroids.data() + + new_centroids.num_rows() * new_centroids.num_cols(), + 0.0); std::fill(begin(degrees), end(degrees), 0); - stdx::execution::indexed_parallel_policy par{nthreads_}; + // How many centroids should we try to fix up + size_t heap_size = std::ceil(reassign_ratio_ * nlist_) + 5; + auto high_scores = + fixed_min_pair_heap>( + heap_size, std::greater()); + auto low_degrees = fixed_min_pair_heap(heap_size); - // @todo parallelize -- use a temp centroid matrix for each thread + // @todo parallelize -- by partition for (size_t i = 0; i < training_set.num_cols(); ++i) { auto part = parts[i]; - auto centroid = centroids_[part]; + auto centroid = new_centroids[part]; auto vector = training_set[i]; + // std::copy(begin(vector), end(vector), begin(centroid)); for (size_t j = 0; j < dimension_; ++j) { centroid[j] += vector[j]; } ++degrees[part]; + high_scores.insert(scores[i], i); } - auto mm = std::minmax_element(begin(degrees), end(degrees)); - double sum = std::accumulate(begin(degrees), end(degrees), 0); - double average = sum / (double)size(degrees); - - auto min = *mm.first; - auto max = *mm.second; - auto diff = max - min; - std::cout << "avg: " << average << " sum: " << sum << " min: " << min - << " max: " << max << " diff: " << diff << std::endl; - - // @todo parallelize + size_t max_degree = 0; + for (size_t i = 0; i < nlist_; ++i) { + auto degree = degrees[i]; + max_degree = std::max(max_degree, degree); + low_degrees.insert(degree, i); + } + size_t lower_degree_bound = std::ceil(max_degree * reassign_ratio_); + // Don't reassign if we are on last iteration + if (iter != max_iter_ - 1) { +#if 0 + // Pick a random vector to be a new centroid + std::uniform_int_distribution<> dis(0, training_set.num_cols() - 1); + for (auto&& [degree, zero_part] : low_degrees) { + if (degree < lower_degree_bound) { + auto index = dis(gen); + auto rand_vector = training_set[index]; + auto low_centroid = new_centroids[zero_part]; + std::copy(begin(rand_vector), end(rand_vector), begin(low_centroid)); + for (size_t i = 0; i < dimension_; ++i) { + new_centroids[parts[index]][i] -= rand_vector[i]; + } + } + } +#endif + // Move vectors with high scores to replace zero-degree partitions + std::sort_heap(begin(low_degrees), end(low_degrees)); + std::sort_heap( + begin(high_scores), end(high_scores), [](auto a, auto b) { + return std::get<0>(a) > std::get<0>(b); + }); + for (size_t i = 0; i < size(low_degrees) && + std::get<0>(low_degrees[i]) <= lower_degree_bound; + ++i) { + // std::cout << "i: " << i << " low_degrees: (" + // << std::get<1>(low_degrees[i]) << " " + // << std::get<0>(low_degrees[i]) << ") high_scores: (" + // << parts[std::get<1>(high_scores[i])] << " " + // << std::get<1>(high_scores[i]) << " " + // << std::get<0>(high_scores[i]) << ")" << std::endl; + 
auto [degree, zero_part] = low_degrees[i];
+        auto [score, high_vector_id] = high_scores[i];
+        auto low_centroid = new_centroids[zero_part];
+        auto high_vector = training_set[high_vector_id];
+        std::copy(begin(high_vector), end(high_vector), begin(low_centroid));
+        for (size_t i = 0; i < dimension_; ++i) {
+          new_centroids[parts[high_vector_id]][i] -= high_vector[i];
+        }
+        ++degrees[zero_part];
+        --degrees[parts[high_vector_id]];
+      }
+    }
+    /**
+     * Check for convergence
+     */
+    // @todo parallelize?
+    double max_diff = 0.0;
+    double total_weight = 0.0;
    for (size_t j = 0; j < nlist_; ++j) {
-      auto centroid = centroids_[j];
-      for (size_t k = 0; k < dimension_; ++k) {
-        if (degrees[j] != 0) {
+      if (degrees[j] != 0) {
+        auto centroid = new_centroids[j];
+        for (size_t k = 0; k < dimension_; ++k) {
          centroid[k] /= degrees[j];
+          total_weight += centroid[k] * centroid[k];
        }
      }
+      auto diff = sum_of_squares(centroids_[j], new_centroids[j]);
+      max_diff = std::max(max_diff, diff);
+    }
+    centroids_.swap(new_centroids);
+    // std::cout << "max_diff: " << max_diff << " total_weight: " <<
+    // total_weight
+    //           << std::endl;
+    if (max_diff < tol_ * total_weight) {
+      // std::cout << "Converged after " << iter << " iterations." <<
+      // std::endl;
+      break;
    }
+
+#if 0
+    // Debugging
+    auto mm = std::minmax_element(begin(degrees), end(degrees));
+    double sum = std::accumulate(begin(degrees), end(degrees), 0);
+    double average = sum / (double)size(degrees);
+
+    auto min = *mm.first;
+    auto max = *mm.second;
+    auto diff = max - min;
+    std::cout << "avg: " << average << " sum: " << sum << " min: " << min
+              << " max: " << max << " diff: " << diff << std::endl;
+#endif
  }

  // Debugging
@@ -284,6 +397,21 @@
    }
    file << std::endl;
+    for (auto s = 0; s < training_set.num_cols(); ++s) {
+      for (auto t = 0; t < training_set.num_rows(); ++t) {
+        file << std::to_string(training_set(t, s)) << ',';
+      }
+      file << std::endl;
+    }
+    file << std::endl;
+
+    for (auto s = 0; s < centroids_.num_cols(); ++s) {
+      for (auto t = 0; t < centroids_.num_rows(); ++t) {
+        file << std::to_string(centroids_(t, s)) << ',';
+      }
+      file << std::endl;
+    }
+
    file.close();
    std::cout << "Data written to file: " << tempFileName << std::endl;
@@ -291,9 +419,64 @@
  }
#endif
  }

-  void train(const ColMajorMatrix& training_set) {
-    kmeans_pp(training_set);
+  static std::vector predict(
+      const ColMajorMatrix& centroids, const ColMajorMatrix& vectors) {
+    // Return a vector of indices of the nearest centroid for each vector in
+    // the matrix.
+    auto nClusters = centroids.num_cols();
+    std::vector indices(vectors.num_cols());
+    std::vector distances(nClusters);
+    for (size_t i = 0; i < vectors.num_cols(); ++i) {
+      for (size_t j = 0; j < nClusters; ++j) {
+        distances[j] = sum_of_squares(vectors[i], centroids[j]);
+      }
+      indices[i] =
+          std::min_element(begin(distances), end(distances)) - begin(distances);
+    }
+    return indices;
+  }
+
+  /**
+   * Compute centroids and partitions of the training set data.
+   * @param training_set Array of vectors to cluster.
+   * @param init Specify which initialization algorithm to use,
+   * random (`random`) or kmeans++ (`kmeanspp`).
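+   *
+   * Typical usage (a sketch; the dimensions and parameter values here are
+   * illustrative only):
+   * @code
+   *   auto index = kmeans_index<float>(128, 1000, 10);
+   *   index.train(training_set, kmeans_init::kmeanspp);
+   *   auto parts = kmeans_index<float>::predict(
+   *       index.get_centroids(), data_vectors);
+   * @endcode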
+ */ + void train(const ColMajorMatrix& training_set, kmeans_init init) { + switch (init) { + case (kmeans_init::none): + break; + case (kmeans_init::kmeanspp): + kmeans_pp(training_set); + break; + case (kmeans_init::random): + kmeans_random_init(training_set); + break; + }; + +#if 0 + std::cout << "\nCentroids Before:\n" << std::endl; + for (size_t j = 0; j < centroids_.num_cols(); ++j) { + for (size_t i = 0; i < dimension_; ++i) { + std::cout << centroids_[j][i] << " "; + } + std::cout << std::endl; + } + std::cout << std::endl; +#endif + train_no_init(training_set); + +#if 0 + std::cout << "\nCentroids After:\n" << std::endl; + for (size_t j = 0; j < centroids_.num_cols(); ++j) { + for (size_t i = 0; i < dimension_; ++i) { + std::cout << centroids_[j][i] << " "; + } + std::cout << std::endl; + } + std::cout << std::endl; +#endif } #if 0 @@ -336,6 +519,13 @@ class kmeans_index { } #endif + auto set_centroids(const ColMajorMatrix& centroids) { + std::copy( + centroids.data(), + centroids.data() + centroids.num_rows() * centroids.num_cols(), + centroids_.data()); + } + auto& get_centroids() { return centroids_; } diff --git a/src/include/scoring.h b/src/include/scoring.h index 7c3987239..06b6f3a08 100644 --- a/src/include/scoring.h +++ b/src/include/scoring.h @@ -47,7 +47,6 @@ #include #include #include -#include #include #include #include @@ -170,43 +169,6 @@ inline auto dot(U const& a, V const& b) { return sum; } -// ---------------------------------------------------------------------------- -// Functions for dealing with the case of when size of scores < k_nn -// ---------------------------------------------------------------------------- - -// One-dimensional case -template -void pad_with_sentinels(size_t start, U& top_k) { - using index_type = typename U::value_type; - for (size_t i = start; i < top_k.size(); ++i) { - top_k[i] = std::numeric_limits::max(); - } -} - -// One-dimensional case -template -void pad_with_sentinels(size_t start, U& top_k, V& top_k_scores) { - using score_type = typename V::value_type; - using index_type = typename U::value_type; - for (size_t i = start; i < top_k.size(); ++i) { - top_k[i] = std::numeric_limits::max(); - top_k_scores[i] = std::numeric_limits::max(); - } -} - -// One-dimensional case -template -void trim_top_k(size_t start, U& top_k) { - top_k.resize(start); -} - -// One-dimensional case -template -void trim_top_k(size_t start, U& top_k, V& top_k_scores) { - top_k.resize(start); - top_k_scores.resize(start); -} - // ---------------------------------------------------------------------------- // Functions for extracting top k neighbors from a raw scores matrix // ---------------------------------------------------------------------------- @@ -222,30 +184,19 @@ void trim_top_k(size_t start, U& top_k, V& top_k_scores) { */ template -auto get_top_k_from_scores(V const& scores, L&& top_k, size_t k = 0) { - using value_type = typename V::value_type; - using index_type = typename std::remove_reference_t::value_type; - - auto num_scores = size(scores); - - if (k == 0) { - k = num_scores; - } - - fixed_min_pair_heap s(k); +auto get_top_k_from_scores(V const& scores, L&& top_k, int k) { + fixed_min_pair_heap s(k); + auto num_scores = scores.size(); for (size_t i = 0; i < num_scores; ++i) { s.insert(scores[i], i); } get_top_k_from_heap(s, top_k); } -// Note that we cannot pad top_k with sentinels here because we don't know the -// size of the valid ranges in the scores matrix. 
-// @todo pad top_k with sentinel if scores has sentinel -template +template auto get_top_k_from_scores(const ColMajorMatrix& scores, int k_nn) { - auto top_k = ColMajorMatrix(k_nn, scores.num_cols()); + auto top_k = ColMajorMatrix(k_nn, scores.num_cols()); for (size_t j = 0; j < scores.num_cols(); ++j) { get_top_k_from_scores(scores[j], top_k[j], k_nn); } @@ -285,18 +236,15 @@ void consolidate_scores(std::vector>& min_scores) { * @param top_k */ template -inline void get_top_k_from_heap(Heap& min_scores, auto&& top_k) - requires(!std::is_same_v>) -{ +inline void get_top_k_from_heap(Heap& min_scores, auto&& top_k) requires( + !std::is_same_v>) { std::sort_heap(begin(min_scores), end(min_scores), [](auto&& a, auto&& b) { return std::get<0>(a) < std::get<0>(b); }); - auto k_nn = std::min(size(min_scores), size(top_k)); std::transform( - begin(min_scores), begin(min_scores) + k_nn, begin(top_k), ([](auto&& e) { + begin(min_scores), end(min_scores), begin(top_k), ([](auto&& e) { return std::get<1>(e); })); - pad_with_sentinels(k_nn, top_k); } /** @@ -310,17 +258,14 @@ inline void get_top_k_from_heap(Heap& min_scores, auto&& top_k) * @return a Matrix of the top_k scores for each query. Each column corresponds * to a query, */ -template +template inline auto get_top_k(std::vector& scores, size_t k_nn) { - // using score_type = heap_score_t; - using index_type = heap_index_t; - auto num_queries = size(scores); - ColMajorMatrix top_k(k_nn, num_queries); + ColMajorMatrix top_k(k_nn, num_queries); for (size_t j = 0; j < num_queries; ++j) { - get_top_k_from_heap(scores[j], top_k[j]); // Will pad with sentinels + get_top_k_from_heap(scores[j], top_k[j]); } return top_k; } @@ -336,7 +281,7 @@ inline auto get_top_k(std::vector& scores, size_t k_nn) { * @return Matrix of the top k scores for each query. Each column corresponds * to a query. 
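 *
 * Sketch of the typical call sequence (per-thread heaps are merged first):
 * @code
 *   consolidate_scores(min_scores);            // fold rows 1..n into row 0
 *   auto top_k = get_top_k(min_scores, k_nn);  // extract sorted neighbor ids
 * @endcode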
*/ -template +template inline auto get_top_k(std::vector>& scores, size_t k_nn) { return get_top_k(scores[0], k_nn); } @@ -344,38 +289,35 @@ inline auto get_top_k(std::vector>& scores, size_t k_nn) { // ---------------------------------------------------------------------------- // Functions for computing top k neighbors with scores // ---------------------------------------------------------------------------- + inline void get_top_k_with_scores_from_heap( auto&& min_scores, auto&& top_k, auto&& top_k_scores) { std::sort_heap(begin(min_scores), end(min_scores), [](auto&& a, auto&& b) { return std::get<0>(a) < std::get<0>(b); }); - auto k_nn = std::min(size(min_scores), size(top_k)); std::transform( - begin(min_scores), - begin(min_scores) + k_nn, - begin(top_k_scores), - ([](auto&& e) { return std::get<0>(e); })); + begin(min_scores), end(min_scores), begin(top_k_scores), ([](auto&& e) { + return std::get<0>(e); + })); std::transform( - begin(min_scores), begin(min_scores) + k_nn, begin(top_k), ([](auto&& e) { + begin(min_scores), end(min_scores), begin(top_k), ([](auto&& e) { return std::get<1>(e); })); - pad_with_sentinels(k_nn, top_k, top_k_scores); } // Overload for one-d scores -template +template inline auto get_top_k_with_scores(std::vector& scores, size_t k_nn) { - using score_type = heap_score_t; - using index_type = heap_index_t; - auto num_queries = size(scores); - ColMajorMatrix top_k(k_nn, num_queries); + using score_type = + typename std::tuple_element<0, typename Heap::value_type>::type; + + ColMajorMatrix top_k(k_nn, num_queries); ColMajorMatrix top_scores(k_nn, num_queries); for (size_t j = 0; j < num_queries; ++j) { - get_top_k_with_scores_from_heap( - scores[j], top_k[j], top_scores[j]); // Will pad with sentinels + get_top_k_with_scores_from_heap(scores[j], top_k[j], top_scores[j]); } return std::make_tuple(std::move(top_scores), std::move(top_k)); } @@ -410,8 +352,6 @@ auto verify_top_k_index(L const& top_k, I const& g, int k, int qno) { * @brief Check the computed top k vectors against the ground truth. * Useful only for exact search. * Prints diagnostic message if difference is found. - * Used only for testing / benchmarking where sentinels have not been needed, so - * we don't need to handle the case where top_k might have sentinels. * @todo Handle the error more systematically and succinctly. */ template @@ -556,13 +496,10 @@ auto mat_col_sum( * as vq_ew for small numbers of query vectors. 
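 *
 * The GEMM formulation rests on the identity
 * @code
 *   ||a - b||^2 == ||a||^2 - 2 * dot(a, b) + ||b||^2
 * @endcode
 * so the score matrix is assembled from one matrix product A^T B plus the
 * squared column norms of A and B (see mat_col_sum above).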
*/ template -void gemm_scores( - const Matrix1& A, const Matrix2& B, Matrix3& C, unsigned nthreads) - requires( - (std::is_same_v && - std::is_same_v && - std::is_same_v)) -{ +void gemm_scores(const Matrix1& A, const Matrix2& B, Matrix3& C, unsigned nthreads) requires( + (std::is_same_v && + std::is_same_v && + std::is_same_v)) { using T = typename Matrix1::value_type; size_t M = A.num_cols(); // Vector dimension @@ -608,13 +545,10 @@ void gemm_scores( } template -void gemm_scores( - const Matrix1& A, const Matrix2& B, Matrix3& C, unsigned nthreads) - requires( - ((!std::is_same_v)&&std:: - is_same_v && - std::is_same_v)) -{ +void gemm_scores(const Matrix1& A, const Matrix2& B, Matrix3& C, unsigned nthreads) requires( + ((!std::is_same_v)&&std:: + is_same_v && + std::is_same_v)) { ColMajorMatrix A_f(A.num_rows(), A.num_cols()); std::copy(A.data(), A.data() + A.num_rows() * A.num_cols(), A_f.data()); @@ -623,11 +557,20 @@ void gemm_scores( template void gemm_scores( - const Matrix1& A, const Matrix2& B, Matrix3& C, unsigned nthreads) - requires(((!std::is_same_v)&&( - !std::is_same_v)&&std:: - is_same_v)) -{ + const Matrix1& A, + const Matrix2& B, + Matrix3& C, + unsigned nthreads) requires(((!std:: + is_same_v< + typename Matrix1::value_type, + float>)&&(!std:: + is_same_v< + typename Matrix2:: + value_type, + float>)&&std:: + is_same_v< + typename Matrix3::value_type, + float>)) { ColMajorMatrix A_f(A.num_rows(), A.num_cols()); std::copy(A.data(), A.data() + A.num_rows() * A.num_cols(), A_f.data()); diff --git a/src/include/stats.h b/src/include/stats.h index f5fc74e6c..7fb1595b8 100644 --- a/src/include/stats.h +++ b/src/include/stats.h @@ -104,13 +104,13 @@ class StatsCollectionScope final { #endif }; -auto dump_logs = [](std::string filename, - const std::string algorithm, - size_t nqueries, - size_t nprobe, - size_t k_nn, - size_t nthreads, - double recall) { +static auto dump_logs = [](std::string filename, + const std::string algorithm, + size_t nqueries, + size_t nprobe, + size_t k_nn, + size_t nthreads, + double recall) { // Quick and dirty way to get log info into a summarizable and useful form -- // fixed-width columns // @todo encapsulate this as a function that can be customized @@ -223,7 +223,7 @@ auto dump_logs = [](std::string filename, }; #ifdef JSON_LOGGING -auto config_log(const std::string& program_name) { +static auto config_log(const std::string& program_name) { std::string uuid_; char host_[16]; std::string date_; diff --git a/src/include/test/unit_fixed_min_heap.cc b/src/include/test/unit_fixed_min_heap.cc index a5013010d..6a827363c 100644 --- a/src/include/test/unit_fixed_min_heap.cc +++ b/src/include/test/unit_fixed_min_heap.cc @@ -201,3 +201,41 @@ TEST_CASE( std::sort(begin(v3), end(v3)); CHECK(a2 == v3); } + +TEST_CASE( + "fixed_min_heap: fixed_max_heap with a large vector", "[fixed_min_heap]") { + using element = std::tuple; + + fixed_min_pair_heap> a( + 7, std::greater{}); + + std::vector v(5500); + for (auto&& i : v) { + i = {std::rand(), std::rand()}; + CHECK(i != element{}); + } + for (auto&& [e, f] : v) { + a.insert(e, f); + } + CHECK(a.size() == 7); + + std::vector a2(begin(a), end(a)); + std::sort(begin(a2), end(a2), [](auto&& a, auto&& b) { + return std::get<0>(a) > std::get<0>(b); + }); + + std::vector u(v.begin(), v.begin() + 7); + + std::nth_element(v.begin(), v.begin() + 7, v.end(), [](auto&& a, auto&& b) { + return std::get<0>(a) > std::get<0>(b); + }); + std::vector w(v.begin(), v.begin() + 7); + + CHECK(u != w); + + std::vector v3(v.begin(), 
v.begin() + 7); + std::sort(begin(v3), end(v3), [](auto&& a, auto&& b) { + return std::get<0>(a) > std::get<0>(b); + }); + CHECK(a2 == v3); +} diff --git a/src/include/test/unit_flat_qv.cc b/src/include/test/unit_flat_qv.cc index 53d57a143..407f60d6c 100644 --- a/src/include/test/unit_flat_qv.cc +++ b/src/include/test/unit_flat_qv.cc @@ -38,7 +38,7 @@ TEST_CASE("qv test test", "[qv]") { } // @todo: test with tdbMatrix -TEST_CASE("flat qv all or nothing", "[flat vq]") { +TEST_CASE("flat vq all or nothing", "[flat vq]") { auto ids = std::vector(sift_base.num_cols()); std::iota(ids.rbegin(), ids.rend(), 9); @@ -58,4 +58,8 @@ TEST_CASE("flat qv all or nothing", "[flat vq]") { CHECK(std::equal(D10.data(), D10.data() + D10.size(), D11.data())); CHECK(std::equal(I10.data(), I10.data() + I10.size(), I11.data())); +} + +TEST_CASE("flat qv: qv_partition vs qv_partition_with_scores", "[flat_qv]") { + CHECK(true); } \ No newline at end of file diff --git a/src/include/test/unit_ivf_index.cc b/src/include/test/unit_ivf_index.cc index a47ffe94a..0b10702d5 100644 --- a/src/include/test/unit_ivf_index.cc +++ b/src/include/test/unit_ivf_index.cc @@ -38,13 +38,12 @@ #include "../ivf_index.h" #include "../linalg.h" -TEST_CASE("ivf_index: test test", "[ivf_index]") { - REQUIRE(true); -} +bool global_debug = false; // kmeans and kmeans indexing still WIP void debug_centroids(auto& index) { + std::cout << "\nDebug Centroids:\n" << std::endl; for (size_t j = 0; j < index.get_centroids().num_rows(); ++j) { for (size_t i = 0; i < index.get_centroids().num_cols(); ++i) { std::cout << index.get_centroids()(j, i) << " "; @@ -54,38 +53,30 @@ void debug_centroids(auto& index) { std::cout << std::endl; } -#if 0 - -TEST_CASE("ivf_index: test kmeans initializations", "[ivf_index]") { +TEST_CASE("ivf_index: test kmeans initializations", "[ivf_index][init]") { std::vector data = {8, 6, 7, 5, 3, 3, 7, 2, 1, 4, 1, 3, 0, 5, 1, 2, 9, 9, 5, 9, 2, 0, 2, 7, 7, 9, 8, 6, 7, 9, 6, 6}; ColMajorMatrix training_data(4, 8); std::copy(begin(data), end(data), training_data.data()); - auto index = kmeans_index(4, 3, 10, 1e-4, 1); + auto index = kmeans_index( + 4, 3, 10, 1e-4, 1, Catch::rngSeed()); SECTION("random") { + std::cout << "random" << std::endl; index.kmeans_random_init(training_data); } SECTION("kmeans++") { + std::cout << "kmeans++" << std::endl; index.kmeans_pp(training_data); } CHECK(index.get_centroids().num_cols() == 3); CHECK(index.get_centroids().num_rows() == 4); -#if 0 - for (size_t j = 0; j < index.get_centroids().num_rows(); ++j) { - for (size_t i = 0; i < index.get_centroids().num_cols(); ++i) { - std::cout << index.get_centroids()(j,i) << " "; - } - std::cout << std::endl; - } - std::cout << std::endl; -#endif -debug_centroids(index); + debug_centroids(index); for (size_t i = 0; i < index.get_centroids().num_cols() - 1; ++i) { for (size_t j = i + 1; j < index.get_centroids().num_cols(); ++j) { @@ -111,28 +102,24 @@ debug_centroids(index); CHECK(outer_counts == index.get_centroids().num_cols()); } -#endif - -// kmeans and kmeans indexing still WIP -#if 0 - -TEST_CASE("ivf_index: test kmeans", "[ivf_index]") { +TEST_CASE("ivf_index: test kmeans", "[ivf_index][kmeans]") { std::vector data = {8, 6, 7, 5, 3, 3, 7, 2, 1, 4, 1, 3, 0, 5, 1, 2, 9, 9, 5, 9, 2, 0, 2, 7, 7, 9, 8, 6, 7, 9, 6, 6}; ColMajorMatrix training_data(4, 8); std::copy(begin(data), end(data), training_data.data()); - auto index = kmeans_index(4, 3, 10, 1e-4, 1); + auto index = + kmeans_index(4, 3, 10, 1e-4, 1, Catch::rngSeed()); SECTION("random") { - 
diff --git a/src/include/test/unit_fixed_min_heap.cc b/src/include/test/unit_fixed_min_heap.cc
index a5013010d..6a827363c 100644
--- a/src/include/test/unit_fixed_min_heap.cc
+++ b/src/include/test/unit_fixed_min_heap.cc
@@ -201,3 +201,41 @@ TEST_CASE(
   std::sort(begin(v3), end(v3));
   CHECK(a2 == v3);
 }
+
+TEST_CASE(
+    "fixed_min_heap: fixed_max_heap with a large vector", "[fixed_min_heap]") {
+  using element = std::tuple<float, size_t>;
+
+  fixed_min_pair_heap<float, size_t, std::greater<float>> a(
+      7, std::greater<float>{});
+
+  std::vector<element> v(5500);
+  for (auto&& i : v) {
+    i = {std::rand(), std::rand()};
+    CHECK(i != element{});
+  }
+  for (auto&& [e, f] : v) {
+    a.insert(e, f);
+  }
+  CHECK(a.size() == 7);
+
+  std::vector<element> a2(begin(a), end(a));
+  std::sort(begin(a2), end(a2), [](auto&& a, auto&& b) {
+    return std::get<0>(a) > std::get<0>(b);
+  });
+
+  std::vector<element> u(v.begin(), v.begin() + 7);
+
+  std::nth_element(v.begin(), v.begin() + 7, v.end(), [](auto&& a, auto&& b) {
+    return std::get<0>(a) > std::get<0>(b);
+  });
+  std::vector<element> w(v.begin(), v.begin() + 7);
+
+  CHECK(u != w);
+
+  std::vector<element> v3(v.begin(), v.begin() + 7);
+  std::sort(begin(v3), end(v3), [](auto&& a, auto&& b) {
+    return std::get<0>(a) > std::get<0>(b);
+  });
+  CHECK(a2 == v3);
+}
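The new test drives a 7-element heap built with `std::greater`, so it keeps the 7 largest of 5500 random pairs and cross-checks against `std::nth_element`. The same invariant can be sketched with standard containers only (assumes ordering on the first tuple element; ties in that component may legitimately make the two methods disagree):

    #include <algorithm>
    #include <cstdlib>
    #include <iostream>
    #include <tuple>
    #include <vector>

    int main() {
      using element = std::tuple<float, size_t>;
      std::vector<element> v(5500);
      for (auto& e : v)
        e = {static_cast<float>(std::rand()), static_cast<size_t>(std::rand())};

      auto cmp = [](const element& a, const element& b) {
        return std::get<0>(a) > std::get<0>(b);
      };

      // Keep the 7 largest: with cmp == greater, the heap front is the
      // smallest element retained so far, i.e. the eviction candidate.
      std::vector<element> top;
      for (const auto& e : v) {
        if (top.size() < 7) {
          top.push_back(e);
          if (top.size() == 7) std::make_heap(top.begin(), top.end(), cmp);
        } else if (std::get<0>(e) > std::get<0>(top.front())) {
          std::pop_heap(top.begin(), top.end(), cmp);
          top.back() = e;
          std::push_heap(top.begin(), top.end(), cmp);
        }
      }

      std::sort(top.begin(), top.end(), cmp);
      auto w = v;
      std::nth_element(w.begin(), w.begin() + 7, w.end(), cmp);
      std::sort(w.begin(), w.begin() + 7, cmp);
      // 1, barring ties in the first component
      std::cout << std::equal(top.begin(), top.end(), w.begin()) << '\n';
    }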
diff --git a/src/include/test/unit_flat_qv.cc b/src/include/test/unit_flat_qv.cc
index 53d57a143..407f60d6c 100644
--- a/src/include/test/unit_flat_qv.cc
+++ b/src/include/test/unit_flat_qv.cc
@@ -38,7 +38,7 @@ TEST_CASE("qv test test", "[qv]") {
 }
 
 // @todo: test with tdbMatrix
-TEST_CASE("flat qv all or nothing", "[flat vq]") {
+TEST_CASE("flat vq all or nothing", "[flat vq]") {
   auto ids = std::vector<size_t>(sift_base.num_cols());
   std::iota(ids.rbegin(), ids.rend(), 9);
 
@@ -58,4 +58,8 @@ TEST_CASE("flat qv all or nothing", "[flat vq]") {
 
   CHECK(std::equal(D10.data(), D10.data() + D10.size(), D11.data()));
   CHECK(std::equal(I10.data(), I10.data() + I10.size(), I11.data()));
+}
+
+TEST_CASE("flat qv: qv_partition vs qv_partition_with_scores", "[flat_qv]") {
+  CHECK(true);
 }
\ No newline at end of file
diff --git a/src/include/test/unit_ivf_index.cc b/src/include/test/unit_ivf_index.cc
index a47ffe94a..0b10702d5 100644
--- a/src/include/test/unit_ivf_index.cc
+++ b/src/include/test/unit_ivf_index.cc
@@ -38,13 +38,12 @@
 #include "../ivf_index.h"
 #include "../linalg.h"
 
-TEST_CASE("ivf_index: test test", "[ivf_index]") {
-  REQUIRE(true);
-}
+bool global_debug = false;
 
 // kmeans and kmeans indexing still WIP
 void debug_centroids(auto& index) {
+  std::cout << "\nDebug Centroids:\n" << std::endl;
   for (size_t j = 0; j < index.get_centroids().num_rows(); ++j) {
     for (size_t i = 0; i < index.get_centroids().num_cols(); ++i) {
       std::cout << index.get_centroids()(j, i) << " ";
@@ -54,38 +53,30 @@ void debug_centroids(auto& index) {
   std::cout << std::endl;
 }
 
-#if 0
-
-TEST_CASE("ivf_index: test kmeans initializations", "[ivf_index]") {
+TEST_CASE("ivf_index: test kmeans initializations", "[ivf_index][init]") {
   std::vector<float> data = {8, 6, 7, 5, 3, 3, 7, 2, 1, 4, 1, 3, 0, 5, 1, 2,
                              9, 9, 5, 9, 2, 0, 2, 7, 7, 9, 8, 6, 7, 9, 6, 6};
 
   ColMajorMatrix<float> training_data(4, 8);
   std::copy(begin(data), end(data), training_data.data());
 
-  auto index = kmeans_index<float>(4, 3, 10, 1e-4, 1);
+  auto index = kmeans_index<float>(
+      4, 3, 10, 1e-4, 1, Catch::rngSeed());
 
   SECTION("random") {
+    std::cout << "random" << std::endl;
     index.kmeans_random_init(training_data);
   }
 
   SECTION("kmeans++") {
+    std::cout << "kmeans++" << std::endl;
     index.kmeans_pp(training_data);
   }
 
   CHECK(index.get_centroids().num_cols() == 3);
   CHECK(index.get_centroids().num_rows() == 4);
 
-#if 0
-  for (size_t j = 0; j < index.get_centroids().num_rows(); ++j) {
-    for (size_t i = 0; i < index.get_centroids().num_cols(); ++i) {
-      std::cout << index.get_centroids()(j,i) << " ";
-    }
-    std::cout << std::endl;
-  }
-  std::cout << std::endl;
-#endif
-debug_centroids(index);
+  debug_centroids(index);
 
   for (size_t i = 0; i < index.get_centroids().num_cols() - 1; ++i) {
     for (size_t j = i + 1; j < index.get_centroids().num_cols(); ++j) {
@@ -111,28 +102,24 @@ debug_centroids(index);
   CHECK(outer_counts == index.get_centroids().num_cols());
 }
 
-#endif
-
-// kmeans and kmeans indexing still WIP
-#if 0
-
-TEST_CASE("ivf_index: test kmeans", "[ivf_index]") {
+TEST_CASE("ivf_index: test kmeans", "[ivf_index][kmeans]") {
   std::vector<float> data = {8, 6, 7, 5, 3, 3, 7, 2, 1, 4, 1, 3, 0, 5, 1, 2,
                              9, 9, 5, 9, 2, 0, 2, 7, 7, 9, 8, 6, 7, 9, 6, 6};
 
   ColMajorMatrix<float> training_data(4, 8);
   std::copy(begin(data), end(data), training_data.data());
 
-  auto index = kmeans_index<float>(4, 3, 10, 1e-4, 1);
+  auto index =
+      kmeans_index<float>(4, 3, 10, 1e-4, 1, Catch::rngSeed());
 
   SECTION("random") {
-    index.kmeans_random_init(training_data);
-    index.train_no_init(training_data);
+    std::cout << "random" << std::endl;
+    index.train(training_data, kmeans_init::random);
   }
 
   SECTION("kmeans++") {
-    index.kmeans_pp(training_data);
-    index.train_no_init(training_data);
+    std::cout << "kmeans++" << std::endl;
+    index.train(training_data, kmeans_init::kmeanspp);
   }
 
   // Test???
@@ -140,7 +127,99 @@ TEST_CASE("ivf_index: test kmeans", "[ivf_index]") {
   // debug_centroids(index);
 }
 
+TEST_CASE("ivf_index: debug w/ sk", "[ivf_index]") {
+  ColMajorMatrix<float> training_data{
+      {1.0573647, 5.082087},
+      {-6.229642, -1.3590931},
+      {0.7446737, 6.3828287},
+      {-7.698864, -3.0493321},
+      {2.1362762, -4.4448104},
+      {1.04019, -4.0389647},
+      {0.38996044, 5.7235265},
+      {1.7470839, -4.717076}};
+  ColMajorMatrix<float> queries{{-7.3712273, -1.1178735}};
+  ColMajorMatrix<float> sklearn_centroids{
+      {-6.964253, -2.2042127}, {1.6411834, -4.400284}, {0.7306664, 5.7294807}};
+
+  SECTION("one iteration") {
+    std::cout << "one iteration" << std::endl;
+    auto index = kmeans_index<float>(
+        sklearn_centroids.num_rows(),
+        sklearn_centroids.num_cols(),
+        1,
+        1e-4,
+        1,
+        Catch::rngSeed());
+    index.set_centroids(sklearn_centroids);
+    index.train(training_data, kmeans_init::none);
+    debug_centroids(index);
+  }
+
+  SECTION("two iterations") {
+    std::cout << "two iterations" << std::endl;
+    auto index = kmeans_index<float>(
+        sklearn_centroids.num_rows(),
+        sklearn_centroids.num_cols(),
+        2,
+        1e-4,
+        1,
+        Catch::rngSeed());
+    index.set_centroids(sklearn_centroids);
+    index.train(training_data, kmeans_init::none);
+    debug_centroids(index);
+  }
+
+  SECTION("five iterations") {
+    std::cout << "five iterations" << std::endl;
+    auto index = kmeans_index<float>(
+        sklearn_centroids.num_rows(),
+        sklearn_centroids.num_cols(),
+        5,
+        1e-4,
+        1,
+        Catch::rngSeed());
+    index.set_centroids(sklearn_centroids);
+    index.train(training_data, kmeans_init::none);
+    debug_centroids(index);
+  }
+
+  SECTION("five iterations, perturbed") {
+    std::cout << "five iterations, perturbed" << std::endl;
+    for (size_t i = 0; i < sklearn_centroids.num_cols(); ++i) {
+      for (size_t j = 0; j < sklearn_centroids.num_rows(); ++j) {
+        sklearn_centroids(j, i) *= 0.8;
+      }
+    }
+
+    sklearn_centroids(0, 0) += 0.25;
+    auto index = kmeans_index<float>(
+        sklearn_centroids.num_rows(),
+        sklearn_centroids.num_cols(),
+        5,
+        1e-4,
+        1,
+        Catch::rngSeed());
+    index.set_centroids(sklearn_centroids);
+    index.train(training_data, kmeans_init::none);
+    debug_centroids(index);
+  }
+
+  SECTION("five iterations, random init") {
+    std::cout << "five iterations, random init" << std::endl;
+    auto index = kmeans_index<float>(
+        sklearn_centroids.num_rows(),
+        sklearn_centroids.num_cols(),
+        5,
+        1e-4,
+        1,
+        Catch::rngSeed());
+    index.train(training_data, kmeans_init::random);
+    debug_centroids(index);
+  }
+}
 
+// kmeans and kmeans indexing still WIP
+#if 0
 TEST_CASE("ivf_index: not a unit test per se", "[ivf_index]") {
   tiledb::Context ctx;
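`kmeans_random_init` and `kmeans_pp` are the two seeding strategies these tests exercise. For reference, k-means++ picks the first centroid uniformly and each subsequent one with probability proportional to the squared distance to the nearest centroid chosen so far. A compact standalone sketch of that seeding step (not the library's implementation; `points[i]` is one point in R^d):

    #include <algorithm>
    #include <cstddef>
    #include <limits>
    #include <random>
    #include <vector>

    std::vector<std::vector<float>> kmeanspp_seed(
        const std::vector<std::vector<float>>& points, size_t k, unsigned seed) {
      std::mt19937 gen(seed);
      std::uniform_int_distribution<size_t> uniform(0, points.size() - 1);
      std::vector<std::vector<float>> centroids{points[uniform(gen)]};

      std::vector<float> d2(points.size());
      while (centroids.size() < k) {
        // d2[i] = squared distance from points[i] to its nearest centroid.
        for (size_t i = 0; i < points.size(); ++i) {
          float best = std::numeric_limits<float>::max();
          for (const auto& c : centroids) {
            float d = 0;
            for (size_t j = 0; j < c.size(); ++j) {
              float diff = points[i][j] - c[j];
              d += diff * diff;
            }
            best = std::min(best, d);
          }
          d2[i] = best;
        }
        // Sample the next centroid with probability proportional to d2.
        std::discrete_distribution<size_t> pick(d2.begin(), d2.end());
        centroids.push_back(points[pick(gen)]);
      }
      return centroids;
    }

Passing `Catch::rngSeed()` into the index, as the updated tests do, makes a failing seeding reproducible from the seed Catch2 prints.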
diff --git a/src/include/test/unit_ivf_qv.cc b/src/include/test/unit_ivf_qv.cc
index f0c1e2aef..65757bc85 100644
--- a/src/include/test/unit_ivf_qv.cc
+++ b/src/include/test/unit_ivf_qv.cc
@@ -29,6 +29,7 @@
  *
  */
 
+#include <cmath>
 #include <catch2/catch_all.hpp>
 #include "detail/ivf/dist_qv.h"
 #include "detail/ivf/qv.h"
@@ -36,6 +37,9 @@
 #include "detail/linalg/tdb_io.h"
 #include "query_common.h"
 
+bool global_verbose = false;
+bool global_debug = true;
+
 TEST_CASE("qv: test test", "[qv]") {
   REQUIRE(true);
 }
@@ -62,7 +66,7 @@ TEST_CASE("ivf qv: infinite all or none", "[ivf qv][ci-skip]") {
   auto nprobe = GENERATE(1, 5);
   auto k_nn = GENERATE(1, 5);
   auto nthreads = GENERATE(1, 5);
-  std::cout << nprobe << " " << k_nn << " " << nthreads << std::endl;
+  // std::cout << nprobe << " " << k_nn << " " << nthreads << std::endl;
 
   auto&& [D00, I00] = detail::ivf::query_infinite_ram(
       ctx,
@@ -177,8 +181,8 @@ TEST_CASE("ivf qv: finite all or none", "[ivf qv][ci-skip]") {
   auto nprobe = GENERATE(7 /*, 1*/);
   auto k_nn = GENERATE(9 /*, 1*/);
   auto nthreads = GENERATE(5 /*, 1*/);
-  std::cout << upper_bound << " " << nprobe << " " << num_queries << " "
-            << k_nn << " " << nthreads << std::endl;
+  // std::cout << upper_bound << " " << nprobe << " " << num_queries << " "
+  //           << k_nn << " " << nthreads << std::endl;
 
   auto&& [D00, I00] = detail::ivf::query_infinite_ram(
       ctx,
@@ -264,19 +268,19 @@ TEST_CASE("ivf qv: finite all or none", "[ivf qv][ci-skip]") {
 
   // std::cout << "num intersections " << intersections00 << " / " <<
   // intersectionsGT << std::endl;
-  std::cout << "num intersections " << intersections00 << std::endl;
+  // std::cout << "num intersections " << intersections00 << std::endl;
 
   if (nprobe != 1 && k_nn != 1 && num_queries != 1) {
     CHECK(intersections00 != 0);
   }
-  CHECK(intersections00 == intersections01);
-  CHECK(intersections00 == intersections02);
-  CHECK(intersections00 == intersections03);
-  CHECK(intersections00 == intersections04);
+  CHECK(std::abs(intersections00 - intersections01) < 12);
+  CHECK(std::abs(intersections00 - intersections02) < 12);
+  CHECK(std::abs(intersections00 - intersections03) < 12);
+  CHECK(std::abs(intersections00 - intersections04) < 12);
 
-  debug_slices_diff(D00, D01, "D00 vs D01");
-  debug_slices_diff(D00, D02, "D00 vs D02");
-  debug_slices_diff(D00, D03, "D00 vs D03");
-  debug_slices_diff(D00, D04, "D00 vs D04");
+  // debug_slices_diff(D00, D01, "D00 vs D01");
+  // debug_slices_diff(D00, D02, "D00 vs D02");
+  // debug_slices_diff(D00, D03, "D00 vs D03");
+  // debug_slices_diff(D00, D04, "D00 vs D04");
 
   CHECK(!std::equal(
       D00.data(),
@@ -288,9 +292,10 @@ TEST_CASE("ivf qv: finite all or none", "[ivf qv][ci-skip]") {
   CHECK(std::equal(D00.data(), D00.data() + D00.size(), D04.data()));
 
 #if 1
+
   SECTION("dist_qv_finite_ram") {
-    auto num_nodes = GENERATE(5 /*, 1,*/);
-    std::cout << "num nodes " << num_nodes << std::endl;
+    auto num_nodes = GENERATE(5 /*, 1 */);
+    // std::cout << "num nodes " << num_nodes << std::endl;
 
     auto&& [D05, I05] = detail::ivf::dist_qv_finite_ram(
         ctx,
@@ -307,8 +312,8 @@ TEST_CASE("ivf qv: finite all or none", "[ivf qv][ci-skip]") {
     check_size(D05, I05);
     auto intersections05 = count_intersections(I05, groundtruth, k_nn);
-    CHECK(intersections00 == intersections05);
-    debug_slices_diff(D00, D05, "D00 vs D05");
+    CHECK(std::abs(intersections00 - intersections05) < 12);
+    // debug_slices_diff(D00, D05, "D00 vs D05");
     CHECK(std::equal(D00.data(), D00.data() + D00.size(), D05.data()));
   }
 #endif
diff --git a/src/include/test/unit_ivf_vq.cc b/src/include/test/unit_ivf_vq.cc
index b15be9f91..bd6d53193 100644
--- a/src/include/test/unit_ivf_vq.cc
+++ b/src/include/test/unit_ivf_vq.cc
@@ -29,6 +29,7 @@
  *
  */
 
+#include <cmath>
 #include <catch2/catch_all.hpp>
 #include "detail/ivf/qv.h"
 #include "detail/ivf/vq.h"
@@ -37,6 +38,9 @@
 #include "query_common.h"
 #include "utils/utils.h"
 
+bool global_verbose = false;
+bool global_debug = false;
+
 TEST_CASE("vq: test test", "[ivf vq]") {
   REQUIRE(true);
 }
@@ -206,10 +210,10 @@ TEST_CASE("ivf vq: finite all or none", "[ivf vq][ci-skip]") {
   auto intersections02 = count_intersections(I02, groundtruth, k_nn);
   auto intersections03 = count_intersections(I03, groundtruth, k_nn);
 
-  CHECK(intersections00 != 0);
-  CHECK(intersections00 == intersections01);
-  CHECK(intersections00 == intersections02);
-  CHECK(intersections00 == intersections03);
+  CHECK((size_t) intersections00 != 0UL);
+  CHECK(std::abs(intersections00 - intersections01) < 12);
+  CHECK(std::abs(intersections00 - intersections02) < 12);
+  CHECK(std::abs(intersections00 - intersections03) < 12);
 
   debug_slices_diff(D00, D01, "D00 vs D01");
   debug_slices_diff(D00, D02, "D00 vs D02");
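The checks above replace exact equality between the variants' intersection counts with a small tolerance, since different traversal orders can break distance ties differently. `count_intersections` itself is a recall proxy: how many of each query's returned ids appear in that query's groundtruth. A standalone stand-in (assumes each inner vector holds exactly the k ids for one query; not the tests' actual helper):

    #include <algorithm>
    #include <cstddef>
    #include <cstdlib>
    #include <iterator>
    #include <vector>

    long count_intersections(
        std::vector<std::vector<size_t>> results,        // per query: k returned ids
        std::vector<std::vector<size_t>> groundtruth) {  // per query: k true ids
      long hits = 0;
      for (size_t q = 0; q < results.size(); ++q) {
        std::sort(results[q].begin(), results[q].end());
        std::sort(groundtruth[q].begin(), groundtruth[q].end());
        std::vector<size_t> common;
        std::set_intersection(
            results[q].begin(), results[q].end(),
            groundtruth[q].begin(), groundtruth[q].end(),
            std::back_inserter(common));
        hits += static_cast<long>(common.size());
      }
      return hits;
    }

    // Usage mirroring the relaxed checks:
    //   CHECK(std::abs(count_a - count_b) < 12);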
diff --git a/src/include/test/unit_linalg.cc b/src/include/test/unit_linalg.cc
index 7dc32b13e..b40a41d97 100644
--- a/src/include/test/unit_linalg.cc
+++ b/src/include/test/unit_linalg.cc
@@ -37,6 +37,8 @@
 #include "linalg.h"
 #include "utils/utils.h"
 
+bool global_debug = false;
+
 using TestTypes = std::tuple<float, uint8_t>;
 
diff --git a/src/include/test/unit_matrix.cc b/src/include/test/unit_matrix.cc
index c59c73f55..9e1e1ccb9 100644
--- a/src/include/test/unit_matrix.cc
+++ b/src/include/test/unit_matrix.cc
@@ -34,6 +34,8 @@
 #include <catch2/catch_all.hpp>
 #include "detail/linalg/matrix.h"
 
+bool global_debug = false;
+
 using TestTypes = std::tuple<float, uint8_t>;
 
 TEST_CASE("matrix: test test", "[matrix]") {
"[scoring]") { + fixed_min_pair_heap a( + 5, { {10, 0}, {9, 1}, @@ -313,221 +267,30 @@ TEMPLATE_LIST_TEST_CASE( {2, 8}, {1, 9}, }); - std::vector gt_neighbors{9, 8, 7, 6, 5, 4, 3, 2, 1}; + std::vector gt_neighbors{ + 9, + 8, + 7, + 6, + 5, + }; SECTION("std::vector") { - std::vector top_k(k_nn); + std::vector top_k(5); get_top_k_from_heap(a, top_k); - REQUIRE(top_k.size() == k_nn); - auto l_nn = std::min(k_nn, a.size()); - CHECK(std::equal( - begin(gt_neighbors), begin(gt_neighbors) + l_nn, top_k.begin())); + CHECK(top_k.size() == 5); + CHECK(std::equal(begin(gt_neighbors), end(gt_neighbors), top_k.begin())); } - SECTION("std::span") { - std::vector top_k(k_nn); - get_top_k_from_heap(a, std::span(top_k.data(), k_nn)); - REQUIRE(top_k.size() == k_nn); - auto l_nn = std::min(k_nn, a.size()); - CHECK(std::equal( - begin(gt_neighbors), begin(gt_neighbors) + l_nn, top_k.begin())); - } - - SECTION("std::vector, pad") { - groundtruth_type pad = 2; - std::vector top_k(k_nn + pad); - get_top_k_from_heap(a, top_k); - REQUIRE(top_k.size() == k_nn + pad); - auto l_nn = std::min(k_nn + pad, a.size()); - CHECK(std::equal( - begin(gt_neighbors), begin(gt_neighbors) + l_nn, top_k.begin())); - CHECK(end(top_k) == begin(top_k) + k_nn + pad); - CHECK(end(top_k) - begin(top_k) == k_nn + pad); - CHECK(std::equal( - begin(top_k) + l_nn, - begin(top_k) + k_nn + pad, - std::vector( - k_nn + pad - l_nn, std::numeric_limits::max()) - .begin())); - } - - SECTION("std::span, pad") { - groundtruth_type pad = 2; - std::vector top_k(k_nn + pad); - get_top_k_from_heap(a, std::span(top_k.data(), k_nn + pad)); - REQUIRE(top_k.size() == k_nn + pad); - auto l_nn = std::min(k_nn + pad, a.size()); - CHECK(std::equal( - begin(gt_neighbors), begin(gt_neighbors) + l_nn, top_k.begin())); - CHECK(end(top_k) == begin(top_k) + k_nn + pad); - CHECK(end(top_k) - begin(top_k) == k_nn + pad); - CHECK(std::equal( - begin(top_k) + l_nn, - begin(top_k) + k_nn + pad, - std::vector( - k_nn + pad - l_nn, std::numeric_limits::max()) - .begin())); + std::vector top_k(5); + get_top_k_from_heap(a, std::span(top_k.data(), 5)); + CHECK(top_k.size() == 5); + CHECK(std::equal(begin(gt_neighbors), end(gt_neighbors), top_k.begin())); } } -TEMPLATE_LIST_TEST_CASE("scoring: get_top_k", "[scoring]", scoring_typelist) { - using score_type = std::tuple_element_t<0, TestType>; - using index_type = std::tuple_element_t<1, TestType>; - using groundtruth_type = std::tuple_element_t<2, TestType>; - - groundtruth_type k_nn = GENERATE(1, 3, 5); - groundtruth_type asize = GENERATE(1, 3, 5); - groundtruth_type bsize = GENERATE(0, 1); - size_t num_vectors{2}; - - std::vector> scores00{ - fixed_min_pair_heap( - asize, - { - {0.1, 0}, - {0.2, 1}, - {0.3, 2}, - {0.4, 3}, - {0.5, 4}, - {0.6, 5}, - {0.7, 6}, - {0.8, 7}, - {0.9, 8}, - {1.0, 9}, - }), - fixed_min_pair_heap( - asize + bsize, - { - {0.9, 0}, - {0.8, 1}, - {0.7, 2}, - {0.6, 3}, - {0.5, 4}, - {0.4, 5}, - {0.3, 6}, - {0.2, 7}, - {0.1, 8}, - {0.0, 9}, - })}; - - // Matrix not used - ColMajorMatrix gt_neighbors_mat{ - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - {9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, - }; - ColMajorMatrix gt_scores_mat{ - {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0}, - {0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9}, - }; - - SECTION("std::vector get_top_k") { - CHECK(size(scores00[0]) == asize); - CHECK(size(scores00[1]) == asize + bsize); - - auto top_k = get_top_k(scores00, k_nn); - - for (size_t i = 0; i < num_vectors; ++i) { - auto l_nn = std::min(top_k[0].size(), scores00[i].size()); - 
@@ -164,14 +143,9 @@ TEMPLATE_LIST_TEST_CASE(
 }
 
 // consolidate scores
-TEMPLATE_LIST_TEST_CASE(
-    "scoring: consolidate scores", "[scoring]", scoring_typelist) {
-  using score_type = std::tuple_element_t<0, TestType>;
-  using index_type = std::tuple_element_t<1, TestType>;
-  // using groundtruth_type = std::tuple_element_t<2, TestType>;
-
-  std::vector<fixed_min_pair_heap<score_type, index_type>> scores00{
-      fixed_min_pair_heap<score_type, index_type>(
+TEST_CASE("scoring consolidate scores", "[scoring]") {
+  std::vector<fixed_min_pair_heap<float, unsigned>> scores00{
+      fixed_min_pair_heap<float, unsigned>(
           3,
           {
               {0.1, 0},
               {0.2, 1},
               {0.3, 2},
               {0.4, 3},
               {0.5, 4},
           }),
@@ -180,7 +154,7 @@
-      fixed_min_pair_heap<score_type, index_type>(
+      fixed_min_pair_heap<float, unsigned>(
           3,
           {
               {0.9, 0},
@@ -189,7 +163,7 @@
               {0.8, 1},
               {0.7, 2},
               {0.6, 3},
               {0.5, 4},
           }),
-      fixed_min_pair_heap<score_type, index_type>(
+      fixed_min_pair_heap<float, unsigned>(
           3,
           {
               {0.6, 5},
@@ -198,7 +172,7 @@
               {0.7, 6},
               {0.8, 7},
               {0.9, 8},
               {1.0, 9},
           }),
-      fixed_min_pair_heap<score_type, index_type>(
+      fixed_min_pair_heap<float, unsigned>(
           3,
           {
               {0.4, 5},
@@ -208,8 +182,8 @@
               {0.3, 6},
               {0.2, 7},
               {0.1, 8},
               {0.0, 9},
           }),
   };
-  std::vector<fixed_min_pair_heap<score_type, index_type>> scores01{
-      fixed_min_pair_heap<score_type, index_type>(
+  std::vector<fixed_min_pair_heap<float, unsigned>> scores01{
+      fixed_min_pair_heap<float, unsigned>(
           3,
           {
               {0.1, 4},
@@ -218,7 +192,7 @@
               {0.2, 5},
               {0.3, 6},
               {0.4, 7},
               {0.5, 8},
           }),
-      fixed_min_pair_heap<score_type, index_type>(
+      fixed_min_pair_heap<float, unsigned>(
           3,
           {
               {0.6, 4},
@@ -227,7 +201,7 @@
               {0.7, 5},
               {0.8, 6},
               {0.9, 7},
               {1.0, 8},
           }),
-      fixed_min_pair_heap<score_type, index_type>(
+      fixed_min_pair_heap<float, unsigned>(
           3,
           {
               {0.9, 0},
@@ -236,7 +210,7 @@
               {0.8, 1},
               {0.7, 2},
               {0.6, 3},
               {0.5, 4},
           }),
-      fixed_min_pair_heap<score_type, index_type>(
+      fixed_min_pair_heap<float, unsigned>(
           3,
           {
               {0.4, 9},
@@ -246,61 +220,41 @@
               {0.3, 8},
               {0.2, 7},
               {0.1, 5},
               {0.0, 6},
           }),
   };
-  auto scores =
-      std::vector<std::vector<fixed_min_pair_heap<score_type, index_type>>>{
-          scores00, scores01};
+  auto scores = std::vector<std::vector<fixed_min_pair_heap<float, unsigned>>>{
+      scores00, scores01};
 
   CHECK(scores.size() == 2);
   consolidate_scores(scores);
   CHECK(scores.size() == 2);
   auto s = scores[0];
-  CHECK(size(s) == 4);
-  for (auto& j : s) {
-    std::sort_heap(begin(j), end(j));
+  CHECK(s.size() == 4);
+  for (auto&& j : s) {
+    std::sort_heap(j.begin(), j.end());
   }
-
-  CHECK(size(s[0]) == 3);
-  CHECK(size(s[1]) == 3);
-  CHECK(size(s[2]) == 3);
-  CHECK(size(s[3]) == 3);
   CHECK(std::equal(
       begin(s[0]),
       end(s[0]),
-      std::vector<std::tuple<score_type, index_type>>(
-          {{0.1, 0}, {0.1, 4}, {0.2, 1}})
+      std::vector<std::tuple<float, unsigned>>({{0.1, 0}, {0.1, 4}, {0.2, 5}})
          .begin()));
   CHECK(std::equal(
       begin(s[1]),
       end(s[1]),
-      std::vector<std::tuple<score_type, index_type>>(
-          {{0.2, 5}, {0.5, 4}, {0.6, 4}})
+      std::vector<std::tuple<float, unsigned>>({{0.2, 5}, {0.5, 4}, {0.6, 4}})
          .begin()));
   CHECK(std::equal(
       begin(s[2]),
       end(s[2]),
-      std::vector<std::tuple<score_type, index_type>>(
-          {{0.5, 4}, {0.6, 3}, {0.6, 5}})
+      std::vector<std::tuple<float, unsigned>>({{0.5, 4}, {0.6, 3}, {0.6, 5}})
          .begin()));
   CHECK(std::equal(
       begin(s[3]),
       end(s[3]),
-      std::vector<std::tuple<score_type, index_type>>(
-          {{0.0, 6}, {0.0, 9}, {0.1, 5}})
+      std::vector<std::tuple<float, unsigned>>({{0.0, 6}, {0.0, 9}, {0.1, 5}})
          .begin()));
 }
 
-TEMPLATE_LIST_TEST_CASE(
-    "scoring: get_top_k_from_heap one min_heap",
-    "[scoring]",
-    scoring_typelist) {
-  using score_type = std::tuple_element_t<0, TestType>;
-  using index_type = std::tuple_element_t<1, TestType>;
-  using groundtruth_type = std::tuple_element_t<2, TestType>;
-
-  groundtruth_type k_nn = GENERATE(1, 3, 5);
-  groundtruth_type asize = GENERATE(1, 3, 5);
-
-  fixed_min_pair_heap<score_type, index_type> a(
-      asize,
+TEST_CASE("scoring get_top_k_from_heap one min_heap", "[scoring]") {
+  fixed_min_pair_heap<float, unsigned> a(
+      5,
       {
          {10, 0},
          {9, 1},
@@ -313,221 +267,30 @@ TEMPLATE_LIST_TEST_CASE(
          {8, 2},
          {7, 3},
          {6, 4},
          {5, 5},
          {4, 6},
          {3, 7},
          {2, 8},
          {1, 9},
      });
-  std::vector<groundtruth_type> gt_neighbors{9, 8, 7, 6, 5, 4, 3, 2, 1};
+  std::vector<unsigned> gt_neighbors{
+      9,
+      8,
+      7,
+      6,
+      5,
+  };
 
   SECTION("std::vector") {
-    std::vector<index_type> top_k(k_nn);
+    std::vector<unsigned> top_k(5);
     get_top_k_from_heap(a, top_k);
-    REQUIRE(top_k.size() == k_nn);
-    auto l_nn = std::min<size_t>(k_nn, a.size());
-    CHECK(std::equal(
-        begin(gt_neighbors), begin(gt_neighbors) + l_nn, top_k.begin()));
+    CHECK(top_k.size() == 5);
+    CHECK(std::equal(begin(gt_neighbors), end(gt_neighbors), top_k.begin()));
   }
 
-  SECTION("std::span") {
-    std::vector<index_type> top_k(k_nn);
-    get_top_k_from_heap(a, std::span(top_k.data(), k_nn));
-    REQUIRE(top_k.size() == k_nn);
-    auto l_nn = std::min<size_t>(k_nn, a.size());
-    CHECK(std::equal(
-        begin(gt_neighbors), begin(gt_neighbors) + l_nn, top_k.begin()));
-  }
-
-  SECTION("std::vector, pad") {
-    groundtruth_type pad = 2;
-    std::vector<index_type> top_k(k_nn + pad);
-    get_top_k_from_heap(a, top_k);
-    REQUIRE(top_k.size() == k_nn + pad);
-    auto l_nn = std::min<size_t>(k_nn + pad, a.size());
-    CHECK(std::equal(
-        begin(gt_neighbors), begin(gt_neighbors) + l_nn, top_k.begin()));
-    CHECK(end(top_k) == begin(top_k) + k_nn + pad);
-    CHECK(end(top_k) - begin(top_k) == k_nn + pad);
-    CHECK(std::equal(
-        begin(top_k) + l_nn,
-        begin(top_k) + k_nn + pad,
-        std::vector<index_type>(
-            k_nn + pad - l_nn, std::numeric_limits<index_type>::max())
-            .begin()));
-  }
-
-  SECTION("std::span, pad") {
-    groundtruth_type pad = 2;
-    std::vector<index_type> top_k(k_nn + pad);
-    get_top_k_from_heap(a, std::span(top_k.data(), k_nn + pad));
-    REQUIRE(top_k.size() == k_nn + pad);
-    auto l_nn = std::min<size_t>(k_nn + pad, a.size());
-    CHECK(std::equal(
-        begin(gt_neighbors), begin(gt_neighbors) + l_nn, top_k.begin()));
-    CHECK(end(top_k) == begin(top_k) + k_nn + pad);
-    CHECK(end(top_k) - begin(top_k) == k_nn + pad);
-    CHECK(std::equal(
-        begin(top_k) + l_nn,
-        begin(top_k) + k_nn + pad,
-        std::vector<index_type>(
-            k_nn + pad - l_nn, std::numeric_limits<index_type>::max())
-            .begin()));
+  SECTION("std::span") {
+    std::vector<unsigned> top_k(5);
+    get_top_k_from_heap(a, std::span(top_k.data(), 5));
+    CHECK(top_k.size() == 5);
+    CHECK(std::equal(begin(gt_neighbors), end(gt_neighbors), top_k.begin()));
   }
 }
 
-TEMPLATE_LIST_TEST_CASE("scoring: get_top_k", "[scoring]", scoring_typelist) {
-  using score_type = std::tuple_element_t<0, TestType>;
-  using index_type = std::tuple_element_t<1, TestType>;
-  using groundtruth_type = std::tuple_element_t<2, TestType>;
-
-  groundtruth_type k_nn = GENERATE(1, 3, 5);
-  groundtruth_type asize = GENERATE(1, 3, 5);
-  groundtruth_type bsize = GENERATE(0, 1);
-  size_t num_vectors{2};
-
-  std::vector<fixed_min_pair_heap<score_type, index_type>> scores00{
-      fixed_min_pair_heap<score_type, index_type>(
-          asize,
-          {
-              {0.1, 0},
-              {0.2, 1},
-              {0.3, 2},
-              {0.4, 3},
-              {0.5, 4},
-              {0.6, 5},
-              {0.7, 6},
-              {0.8, 7},
-              {0.9, 8},
-              {1.0, 9},
-          }),
-      fixed_min_pair_heap<score_type, index_type>(
-          asize + bsize,
-          {
-              {0.9, 0},
-              {0.8, 1},
-              {0.7, 2},
-              {0.6, 3},
-              {0.5, 4},
-              {0.4, 5},
-              {0.3, 6},
-              {0.2, 7},
-              {0.1, 8},
-              {0.0, 9},
-          })};
-
-  // Matrix not used
-  ColMajorMatrix<groundtruth_type> gt_neighbors_mat{
-      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
-      {9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
-  };
-  ColMajorMatrix<score_type> gt_scores_mat{
-      {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0},
-      {0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9},
-  };
-
-  SECTION("std::vector get_top_k") {
-    CHECK(size(scores00[0]) == asize);
-    CHECK(size(scores00[1]) == asize + bsize);
-
-    auto top_k = get_top_k(scores00, k_nn);
-
-    for (size_t i = 0; i < num_vectors; ++i) {
-      auto l_nn = std::min(top_k[0].size(), scores00[i].size());
-      CHECK(std::equal(
-          begin(gt_neighbors_mat[i]),
-          begin(gt_neighbors_mat[i]) + l_nn,
-          begin(top_k[i])));
-      CHECK(std::equal(
-          begin(top_k[i]) + l_nn,
-          begin(top_k[i]) + k_nn,
-          std::vector<index_type>(
-              k_nn - l_nn, std::numeric_limits<index_type>::max())
-              .begin()));
-    }
-  }
-
-  SECTION("std::vector get_top_k_with_scores") {
-    CHECK(size(scores00[0]) == asize);
-    CHECK(size(scores00[1]) == asize + bsize);
-
-    auto&& [top_k_scores, top_k] = get_top_k_with_scores(scores00, k_nn);
-
-    for (size_t i = 0; i < num_vectors; ++i) {
-      auto l_nn = std::min(top_k[0].size(), scores00[i].size());
-      CHECK(std::equal(
-          begin(gt_neighbors_mat[i]),
-          begin(gt_neighbors_mat[i]) + l_nn,
-          begin(top_k[i])));
-      CHECK(std::equal(
-          begin(top_k[i]) + l_nn,
-          begin(top_k[i]) + k_nn,
-          std::vector<index_type>(
-              k_nn - l_nn, std::numeric_limits<index_type>::max())
-              .begin()));
-      CHECK(std::equal(
-          begin(gt_scores_mat[i]),
-          begin(gt_scores_mat[i]) + l_nn,
-          begin(top_k_scores[i])));
-      CHECK(std::equal(
-          begin(top_k_scores[i]) + l_nn,
-          begin(top_k_scores[i]) + k_nn,
-          std::vector<score_type>(
-              k_nn - l_nn, std::numeric_limits<score_type>::max())
-              .begin()));
-    }
-  }
-
-#if 0
-  SECTION("std::span get_top_k") {
-    std::vector<index_type> top_k(k_nn);
-    get_top_k_from_heap(a, std::span(top_k.data(), k_nn));
-    REQUIRE(top_k.size() == k_nn);
-    auto l_nn = std::min<size_t>(k_nn, a.size());
-    CHECK(std::equal(
-        begin(gt_neighbors), begin(gt_neighbors) + l_nn, top_k.begin()));
-  }
-
-  SECTION("std::vector get_top_k, pad") {
-    groundtruth_type pad = 2;
-    std::vector<index_type> top_k(k_nn + pad);
-    get_top_k_from_heap(a, top_k);
-    REQUIRE(top_k.size() == k_nn + pad);
-    auto l_nn = std::min<size_t>(k_nn + pad, a.size());
-    CHECK(std::equal(
-        begin(gt_neighbors), begin(gt_neighbors) + l_nn, top_k.begin()));
-    CHECK(end(top_k) == begin(top_k) + k_nn + pad);
-    CHECK(end(top_k) - begin(top_k) == k_nn + pad);
-    CHECK(std::equal(
-        begin(top_k) + l_nn,
-        begin(top_k) + k_nn + pad,
-        std::vector<index_type>(
-            k_nn + pad - l_nn, std::numeric_limits<index_type>::max())
-            .begin()));
-  }
-
-  SECTION("std::span get_top_k, pad") {
-    groundtruth_type pad = 2;
-    std::vector<index_type> top_k(k_nn + pad);
-    get_top_k_from_heap(a, std::span(top_k.data(), k_nn + pad));
-    REQUIRE(top_k.size() == k_nn + pad);
-    auto l_nn = std::min<size_t>(k_nn + pad, a.size());
-    CHECK(std::equal(
-        begin(gt_neighbors), begin(gt_neighbors) + l_nn, top_k.begin()));
-    CHECK(end(top_k) == begin(top_k) + k_nn + pad);
-    CHECK(end(top_k) - begin(top_k) == k_nn + pad);
-    CHECK(std::equal(
-        begin(top_k) + l_nn,
-        begin(top_k) + k_nn + pad,
-        std::vector<index_type>(
-            k_nn + pad - l_nn, std::numeric_limits<index_type>::max())
-            .begin()));
-  }
-#endif
-}
-
-TEMPLATE_LIST_TEST_CASE(
-    "scoring: get_top_k_from_heap vector of min_heap",
-    "[scoring]",
-    scoring_typelist) {
-  using score_type = std::tuple_element_t<0, TestType>;
-  using index_type = std::tuple_element_t<1, TestType>;
-  using groundtruth_type = std::tuple_element_t<2, TestType>;
-
-  fixed_min_pair_heap<score_type, index_type> a(
+TEST_CASE("scoring get_top_k_from_heap vector of min_heap", "[scoring]") {
+  fixed_min_pair_heap<float, unsigned> a(
       5,
       {
          {10, 0},
@@ -541,7 +304,7 @@ TEMPLATE_LIST_TEST_CASE(
          {2, 8},
          {1, 9},
      });
-  fixed_min_pair_heap<score_type, index_type> b(
+  fixed_min_pair_heap<float, unsigned> b(
       5,
       {
          {2, 0},
@@ -555,7 +318,11 @@ TEMPLATE_LIST_TEST_CASE(
          {9, 8},
          {0, 9},
      });
-  std::vector<groundtruth_type> gt_neighbors_vec{
+  ColMajorMatrix<unsigned> gt_neighbors_mat{
+      {9, 8, 7, 6, 5},
+      {9, 4, 0, 5, 1},
+  };
+  std::vector<unsigned> gt_neighbors_vec{
       9,
       8,
       7,
@@ -567,7 +334,11 @@ TEMPLATE_LIST_TEST_CASE(
       5,
       1,
   };
-  std::vector<score_type> gt_scores_vec{
+  ColMajorMatrix<float> gt_scores_mat{
+      {1, 2, 3, 4, 5},
+      {0, 1, 2, 3, 4},
+  };
+  std::vector<float> gt_scores_vec{
       1,
       2,
       3,
@@ -579,17 +350,7 @@ TEMPLATE_LIST_TEST_CASE(
       3,
       4,
   };
-
-  // Matrix not used
-  ColMajorMatrix<groundtruth_type> gt_neighbors_mat{
-      {9, 8, 7, 6, 5},
-      {9, 4, 0, 5, 1},
-  };
-  ColMajorMatrix<score_type> gt_scores_mat{
-      {1, 2, 3, 4, 5},
-      {0, 1, 2, 3, 4},
-  };
-  std::vector<fixed_min_pair_heap<score_type, index_type>> scores{a, b};
+  std::vector<fixed_min_pair_heap<float, unsigned>> scores{a, b};
 
   SECTION("std::vector") {
     auto top_k = get_top_k(scores, 5);
     CHECK(top_k.num_rows() == 5);
@@ -625,53 +386,6 @@ TEMPLATE_LIST_TEST_CASE(
   }
 }
 
-// test pad with sentinels
-TEMPLATE_LIST_TEST_CASE(
-    "scoring: pad_with_sentinels", "[scoring]", scoring_typelist) {
-  using score_type = std::tuple_element_t<0, TestType>;
-  using index_type = std::tuple_element_t<1, TestType>;
-  using groundtruth_type = std::tuple_element_t<2, TestType>;
-
-  auto v = std::vector<index_type>{8, 6, 7, 5, 3, 0, 9, 1, 2, 4, 3};
-  auto w = v;
-  auto x = std::vector<score_type>{
-      3.1, 4.1, 5.9, 2.6, 5.3, 5.8, 9.7, 9.3, 2.3, 8.4, 6.2};
-  auto y = x;
-  groundtruth_type start = GENERATE(1, 3, 9, 11);
-
-  SECTION("top_k") {
-    pad_with_sentinels(start, v);
-    CHECK(std::equal(begin(v), begin(v) + start, begin(w)));
-    CHECK(std::equal(
-        begin(v) + start,
-        end(v),
-        std::vector<index_type>(
-            size(v) - start, std::numeric_limits<index_type>::max())
-            .begin()));
-  }
-  SECTION("top_k_with_scores") {
-    pad_with_sentinels(start, v, x);
-    CHECK(std::equal(begin(v), begin(v) + start, begin(w)));
-    CHECK(std::equal(
-        begin(v) + start,
-        end(v),
-        std::vector<index_type>(
-            size(v) - start, std::numeric_limits<index_type>::max())
-            .begin()));
-
-    CHECK(std::equal(begin(x), begin(x) + start, begin(y)));
-    CHECK(std::equal(
-        begin(x) + start,
-        end(x),
-        std::vector<score_type>(
-            size(x) - start, std::numeric_limits<score_type>::max())
-            .begin()));
-  }
-}
-
-// test get_top_k_from_heap and get_top_k_from_heap_with_scores with padding
-// test get_top_k and get_top_k_with_scores with padding
-
 // get_top_k_from_heap (vector of vectors of min_heaps)
 // get_top_k_with_scores_from_heap (one min_heap)
 // get_top_k_with_scores_from_heap (vector of min_heaps)
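The rewritten tests above pin down the contract of `get_top_k_from_heap`: a fixed-size min-heap holds (score, id) tuples in heap order, and extracting top-k means sorting by score and reading off the ids. A standalone sketch of that extraction step (not the library code):

    #include <algorithm>
    #include <tuple>
    #include <vector>

    int main() {
      // The five smallest (score, id) pairs a fixed-size min-heap retained,
      // in arbitrary heap order.
      std::vector<std::tuple<float, unsigned>> heap{
          {5, 5}, {1, 9}, {2, 8}, {3, 7}, {4, 6}};

      std::sort(heap.begin(), heap.end(), [](auto& a, auto& b) {
        return std::get<0>(a) < std::get<0>(b);
      });

      std::vector<unsigned> top_k;
      for (const auto& p : heap)
        top_k.push_back(std::get<1>(p));  // 9, 8, 7, 6, 5 -- as in gt_neighbors
    }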
diff --git a/src/include/utils/fixed_min_heap.h b/src/include/utils/fixed_min_heap.h
index 96b13f427..9e0e7218d 100644
--- a/src/include/utils/fixed_min_heap.h
+++ b/src/include/utils/fixed_min_heap.h
@@ -57,8 +57,11 @@ class fixed_min_set_heap_1 : public std::vector<T> {
 
   void insert(T const& x) {
     if (Base::size() < max_size) {
-      this->push_back(x);
-      std::push_heap(begin(*this), end(*this), std::less<T>());
+      Base::push_back(x);
+      // std::push_heap(begin(*this), end(*this), std::less<T>());
+      if (Base::size() == max_size) {
+        std::make_heap(begin(*this), end(*this), std::less<T>());
+      }
     } else if (x < this->front()) {
       std::pop_heap(begin(*this), end(*this), std::less<T>());
       this->pop_back();
@@ -88,8 +91,12 @@ class fixed_min_set_heap_2 : public std::vector<T> {
 
   void insert(T const& x) {
     if (Base::size() < max_size) {
-      this->push_back(x);
-      std::push_heap(begin(*this), end(*this));
+      Base::push_back(x);
+      // std::push_heap(begin(*this), end(*this), std::less<T>());
+      if (Base::size() == max_size) {
+        // std::make_heap(begin(*this), end(*this), std::less<T>());
+        std::make_heap(begin(*this), end(*this));
+      }
     } else if (x < this->front()) {
       // std::pop_heap(begin(*this), end(*this), std::less<T>());
       std::pop_heap(begin(*this), end(*this));
@@ -106,24 +113,28 @@ class fixed_min_set_heap_2 : public std::vector<T> {
  * @tparam T Type of first element
  * @tparam U Type of second element
  */
-template <class T, class U>
+template <class T, class U, class Compare = std::less<T>>
 class fixed_min_pair_heap : public std::vector<std::tuple<T, U>> {
   using Base = std::vector<std::tuple<T, U>>;
-  // using Base::Base;
 
   unsigned max_size{0};
+  Compare compare_;
 
  public:
-  explicit fixed_min_pair_heap(unsigned k)
+  explicit fixed_min_pair_heap(unsigned k, Compare compare = Compare())
       : Base(0)
-      , max_size{k} {
+      , max_size{k}
+      , compare_{std::move(compare)} {
     Base::reserve(k);
   }
 
   explicit fixed_min_pair_heap(
-      unsigned k, std::initializer_list<std::tuple<T, U>> l)
+      unsigned k,
+      std::initializer_list<std::tuple<T, U>> l,
+      Compare compare = Compare())
       : Base(0)
-      , max_size{k} {
+      , max_size{k}
+      , compare_{std::move(compare)} {
     Base::reserve(k);
     for (auto& p : l) {
       insert(std::get<0>(p), std::get<1>(p));
@@ -132,37 +143,26 @@ class fixed_min_pair_heap : public std::vector<std::tuple<T, U>> {
 
   void insert(const T& x, const U& y) {
     if (Base::size() < max_size) {
-      this->emplace_back(x, y);
-      std::push_heap(begin(*this), end(*this), [&](auto& a, auto& b) {
-        return std::get<0>(a) < std::get<0>(b);
-      });
-    } else if (x < std::get<0>(this->front())) {
+      Base::emplace_back(x, y);
+      // std::push_heap(begin(*this), end(*this), std::less<T>());
+      if (Base::size() == max_size) {
+        std::make_heap(begin(*this), end(*this), [&](auto& a, auto& b) {
+          return compare_(std::get<0>(a), std::get<0>(b));
+        });
+      }
+    } else if (compare_(x, std::get<0>(this->front()))) {
       std::pop_heap(begin(*this), end(*this), [&](auto& a, auto& b) {
-        return std::get<0>(a) < std::get<0>(b);
+        return compare_(std::get<0>(a), std::get<0>(b));
       });
       this->pop_back();
       this->emplace_back(x, y);
       std::push_heap(begin(*this), end(*this), [&](auto& a, auto& b) {
-        return std::get<0>(a) < std::get<0>(b);
+        return compare_(std::get<0>(a), std::get<0>(b));
      });
     }
   }
 };
 
-template <class Heap>
-struct heap_traits {
-  using value_type = typename Heap::value_type;
-  using score_type =
-      typename std::tuple_element<0, typename Heap::value_type>::type;
-  using index_type =
-      typename std::tuple_element<1, typename Heap::value_type>::type;
-};
-
-template <class Heap>
-using heap_score_t = typename heap_traits<Heap>::score_type;
-
-template <class Heap>
-using heap_index_t = typename heap_traits<Heap>::index_type;
-
-
 // template <class T>
 // using fixed_min_heap = fixed_min_set_heap_1<T>;
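The new `Compare` parameter is what the `fixed_max_heap` unit test earlier in this patch relies on: with the default `std::less<T>` the structure retains the k smallest scores, with `std::greater<T>` the k largest. A usage sketch matching the constructor and `insert` signatures visible in this diff (assumes the header is on the include path):

    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include "utils/fixed_min_heap.h"

    int main() {
      // Default comparator: keeps the 3 smallest scores seen.
      fixed_min_pair_heap<float, size_t> nearest(3);
      // Custom comparator: keeps the 3 largest scores seen.
      fixed_min_pair_heap<float, size_t, std::greater<float>> farthest(
          3, std::greater<float>{});

      float scores[] = {0.8f, 0.1f, 0.5f, 0.9f, 0.3f};
      for (size_t id = 0; id < 5; ++id) {
        nearest.insert(scores[id], id);
        farthest.insert(scores[id], id);
      }

      for (auto&& [score, id] : nearest)
        std::cout << score << " -> " << id << '\n';  // 3 smallest, heap order
    }

Deferring `std::make_heap` until the container is full, as `insert` now does, makes the fill phase O(1) per element instead of O(log k); the heap invariant is only needed once evictions can start.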
Internally maintains a start time and a
@@ -468,7 +468,7 @@ inline memory_data& get_memory_data_instance() {
   return memory_data::get_instance();
 }
 
-memory_data& _memory_data{get_memory_data_instance()};
+static memory_data& _memory_data{get_memory_data_instance()};
 
 #if 0
 class stats_data {
diff --git a/src/include/utils/timer.h b/src/include/utils/timer.h
index 994424226..00bc810e1 100644
--- a/src/include/utils/timer.h
+++ b/src/include/utils/timer.h
@@ -123,7 +123,10 @@ class life_timer : public empty_timer, public ms_timer {
   }
 };
 
-std::ostream& operator<<(std::ostream& os, const seconds_timer& t) {
+namespace {
+
+[[maybe_unused]] std::ostream& operator<<(
+    std::ostream& os, const seconds_timer& t) {
   std::string name = t.name();
   if (t.name() != "") {
     os << "# [ " + t.name() + " ]: ";
@@ -132,7 +135,7 @@ std::ostream& operator<<(std::ostream& os, const seconds_timer& t) {
   return os;
 }
 
-std::ostream& operator<<(std::ostream& os, const ms_timer& t) {
+[[maybe_unused]] std::ostream& operator<<(std::ostream& os, const ms_timer& t) {
   std::string name = t.name();
   if (t.name() != "") {
     os << "# [ " + t.name() + " ]: ";
@@ -142,7 +145,7 @@ std::ostream& operator<<(std::ostream& os, const ms_timer& t) {
   return os;
 }
 
-std::ostream& operator<<(std::ostream& os, const us_timer& t) {
+[[maybe_unused]] std::ostream& operator<<(std::ostream& os, const us_timer& t) {
   std::string name = t.name();
   if (t.name() != "") {
     os << "# [ " + t.name() + " ]: ";
@@ -151,6 +154,8 @@ std::ostream& operator<<(std::ostream& os, const us_timer& t) {
   return os;
 }
 
+}  // anonymous namespace
+
 #ifndef tdb_func__
 #ifdef __cpp_lib_source_location
 #include <source_location>
diff --git a/src/include/utils/utils.h b/src/include/utils/utils.h
index a83010cad..577ad7ffd 100644
--- a/src/include/utils/utils.h
+++ b/src/include/utils/utils.h
@@ -38,6 +38,8 @@
 #include <filesystem>
 #include <regex>
 
+namespace {
+
 bool is_http_address(const std::string& filename) {
   std::regex httpRegex("^https?://.*");
   return std::regex_match(filename, httpRegex);
@@ -91,7 +93,7 @@ bool local_file_exists(const std::string& filename) {
   return std::filesystem::is_regular_file(filePath);
 }
 
-bool is_local_file(const std::string& filename) {
+[[maybe_unused]] bool is_local_file(const std::string& filename) {
   return local_file_exists(filename);
 }
 
@@ -101,7 +103,7 @@ bool local_array_exists(const std::string& array_uri) {
          subdirectory_exists(array_uri, "__schema");
 }
 
-bool is_local_array(const std::string& array_uri) {
+[[maybe_unused]] bool is_local_array(const std::string& array_uri) {
   return local_array_exists(array_uri);
 }
 
@@ -146,4 +148,6 @@ struct counter {
   }
 };
 
+}  // anonymous namespace
+
 #endif
diff --git a/src/src/flat_l2.cc b/src/src/flat_l2.cc
index 5595d357b..2fa37ccad 100644
--- a/src/src/flat_l2.cc
+++ b/src/src/flat_l2.cc
@@ -91,6 +91,7 @@
 bool verbose = false;
 bool debug = false;
+bool global_debug = false;
 
 bool enable_stats = false;
 std::vector<json> core_stats;
@@ -143,7 +144,7 @@ int main(int argc, char* argv[]) {
     return 0;
   }
 
-  // global_debug = debug = args["--debug"].asBool();
+  global_debug = debug = args["--debug"].asBool();
   verbose = args["--verbose"].asBool();
   enable_stats = args["--stats"].asBool();
@@ -170,7 +171,6 @@ int main(int argc, char* argv[]) {
   tiledb::Context ctx;
 
   auto db = tdbColMajorMatrix<float>(ctx, db_uri, blocksize);  // blocked
-  db.load();
   auto query = tdbColMajorMatrix<float>(ctx, query_uri, nqueries);  // just a slice
@@ -180,7 +180,7 @@ int main(int argc, char* argv[]) {
   std::cout << load_time << std::endl;
 
   // @todo decide on what the type of top_k::value should be
-  auto [top_k_scores, top_k] = [&]() {
+  auto [top_k, top_k_scores] = [&]() {
     if (alg_name == "vq_heap" || alg_name == "vq") {
       if (verbose) {
         std::cout << "# Using vq_heap" << std::endl;
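The flat_l2.cc change above reorders `auto [top_k_scores, top_k]` to `auto [top_k, top_k_scores]`: structured-binding names bind by position, not by name, so a swapped order silently mislabels the tuple's members. A minimal illustration (hypothetical `search()` helper):

    #include <iostream>
    #include <tuple>
    #include <vector>

    // Returns (ids, scores), in that order.
    std::tuple<std::vector<unsigned>, std::vector<float>> search() {
      return {{9, 8, 7}, {0.1f, 0.2f, 0.3f}};
    }

    int main() {
      auto [ids, scores] = search();  // names match the tuple order: correct
      std::cout << ids[0] << " scored " << scores[0] << '\n';  // 9 scored 0.1

      // The first name always binds the first element, whatever it is called:
      auto [scores_oops, ids_oops] = search();
      std::cout << scores_oops[0] << '\n';  // prints 9 -- an id, not a score
    }

Here the two vectors have different element types, so most downstream misuses would fail to compile; with same-typed results (e.g. two float matrices) the swap is completely silent, which is why the ordering fix matters.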
diff --git a/src/src/index.cc b/src/src/index.cc
index fb002773c..c3fb4db3b 100644
--- a/src/src/index.cc
+++ b/src/src/index.cc
@@ -46,6 +46,9 @@
 #include "linalg.h"
 #include "utils/utils.h"
 
+bool global_verbose = false;
+bool global_debug = false;
+
 bool enable_stats = false;
 std::vector<json> core_stats;
 
@@ -105,8 +108,8 @@ int main(int argc, char* argv[]) {
   if (nthreads == 0) {
     nthreads = std::thread::hardware_concurrency();
   }
-  // global_debug = args["--debug"].asBool();
-  // global_verbose = args["--verbose"].asBool();
+  global_debug = args["--debug"].asBool();
+  global_verbose = args["--verbose"].asBool();
   enable_stats = args["--stats"].asBool();
 
   bool dryrun = args["--dryrun"].asBool();
diff --git a/src/src/ivf_flat.cc b/src/src/ivf_flat.cc
index 733c6346e..a924b7015 100644
--- a/src/src/ivf_flat.cc
+++ b/src/src/ivf_flat.cc
@@ -73,6 +73,8 @@
 #include "utils/timer.h"
 #include "utils/utils.h"
 
+bool global_verbose = false;
+bool global_debug = false;
 bool enable_stats = false;
 std::vector<json> core_stats;
 
@@ -142,8 +144,8 @@ int main(int argc, char* argv[]) {
   if (nthreads == 0) {
     nthreads = std::thread::hardware_concurrency();
   }
-  // global_debug = args["--debug"].asBool();
-  // global_verbose = args["--verbose"].asBool();
+  global_debug = args["--debug"].asBool();
+  global_verbose = args["--verbose"].asBool();
   enable_stats = args["--stats"].asBool();
 
   auto part_uri = args["--parts_uri"].asString();
@@ -417,7 +419,7 @@ int main(int argc, char* argv[]) {
       tdbColMajorMatrix<int32_t>(ctx, groundtruth_uri, nqueries);
   groundtruth.load();
 
-  if (false) {
+  if (global_debug) {
     std::cout << std::endl;
 
     debug_matrix(groundtruth, "groundtruth");