Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Type-erased IVFFlatIndex class #154

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion apis/python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ find_package(pybind11 CONFIG REQUIRED)

set(VSPY_TARGET_NAME _tiledbvspy)

python_add_library(${VSPY_TARGET_NAME} MODULE "src/tiledb/vector_search/module.cc" WITH_SOABI)
python_add_library(${VSPY_TARGET_NAME} MODULE
"src/tiledb/vector_search/module.cc"
"src/tiledb/vector_search/kmeans.cc"
WITH_SOABI)

target_link_libraries(${VSPY_TARGET_NAME}
PRIVATE
Expand Down
3 changes: 3 additions & 0 deletions apis/python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ def get_cmake_overrides():
if val:
conf.append("-DUSE_MKL_CBLAS={}".format(val))

conf.append("-DTileDB_DIR=/Users/lums/Contrib/dist")


try:
# Make sure we use pybind11 from this python environment if available,
# required for windows wheels due to:
Expand Down
55 changes: 55 additions & 0 deletions apis/python/src/tiledb/vector_search/kmeans.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#include <tiledb/tiledb>

#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>

#include "index/ivf_flat_index.h"
#include "linalg.h"

namespace py = pybind11;
using Ctx = tiledb::Context;

namespace {

/**
 * Registers the k-means Python bindings for element type `T` on module `m`.
 *
 * Two functions are added, both with `suffix` appended to their name:
 *   - `kmeans_fit_<suffix>`: constructs an `ivf_flat_index<T>`, trains it on
 *     the supplied sample vectors and returns the index's centroids.
 *   - `kmeans_predict_<suffix>`: forwards to `ivf_flat_index<T>::predict`.
 *
 * @param m      Module that receives the bindings.
 * @param suffix Text appended to the binding names (e.g. "f32").
 */
template <typename T, typename shuffled_ids_type = uint64_t>
static void declare_kmeans(py::module& m, const std::string& suffix) {
  const std::string fit_name = "kmeans_fit_" + suffix;
  const std::string predict_name = "kmeans_predict_" + suffix;

  m.def(
      fit_name.c_str(),
      [](size_t n_clusters,
         std::string init,
         size_t max_iter,
         bool verbose,
         size_t n_init,
         const ColMajorMatrix<T>& sample_vectors,
         std::optional<double> tol,
         std::optional<unsigned int> seed,
         std::optional<size_t> nthreads) {
        // TODO: support verbose and n_init
        std::ignore = verbose;
        std::ignore = n_init;

        // Only "k-means++" and "random" are accepted; reject anything else
        // before the index is constructed.
        const bool use_kmeanspp = (init == "k-means++");
        if (!use_kmeanspp && init != "random") {
          throw std::invalid_argument("Invalid init method");
        }
        kmeans_init init_val =
            use_kmeanspp ? kmeans_init::kmeanspp : kmeans_init::random;

        // A missing tolerance falls back to 0.0001; nthreads and seed are
        // handed to the index as optionals.
        ivf_flat_index<T> idx(
            /*sample_vectors.num_rows(),*/ n_clusters,
            max_iter,
            tol.value_or(0.0001),
            nthreads,
            seed);
        idx.train(sample_vectors, init_val);
        return std::move(idx.get_centroids());
      });

  m.def(
      predict_name.c_str(),
      [](const ColMajorMatrix<T>& centroids,
         const ColMajorMatrix<T>& sample_vectors) {
        return ivf_flat_index<T>::predict(centroids, sample_vectors);
      });
}

} // anonymous namespace


// Registers the k-means bindings on module `m`. Only the `float`
// instantiation is registered, under the "f32" suffix.
void init_kmeans(py::module_& m) {
declare_kmeans<float>(m, "f32");
}
11 changes: 9 additions & 2 deletions apis/python/src/tiledb/vector_search/module.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,12 @@
#include <pybind11/stl.h>

#include "linalg.h"
#include "ivf_query.h"
#include "flat_query.h"

// @todo Replace
#include "detail/flat/qv.h"
#include "detail/flat/vq.h"
#include "detail/ivf/qv.h"
#include "detail/ivf/vq.h"

namespace py = pybind11;
using Ctx = tiledb::Context;
Expand Down Expand Up @@ -431,6 +435,7 @@ static void declare_vq_query_heap_pyarray(py::module& m, const std::string& suff

} // anonymous namespace

// Forward declaration: registers the k-means bindings on the given module.
// Called from the PYBIND11_MODULE body below.
void init_kmeans(py::module&);

PYBIND11_MODULE(_tiledbvspy, m) {

Expand Down Expand Up @@ -596,4 +601,6 @@ PYBIND11_MODULE(_tiledbvspy, m) {
declare_dist_qv<uint8_t>(m, "u8");
declare_dist_qv<float>(m, "f32");
declareFixedMinPairHeap(m);

init_kmeans(m);
}
31 changes: 31 additions & 0 deletions apis/python/src/tiledb/vector_search/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,37 @@ def array_to_matrix(array: np.ndarray):
else:
raise TypeError("Unsupported type!")

def kmeans_fit(partitions: int, init: str, max_iter: int, verbose: bool, n_init: int, sample_vectors: "colMajorMatrix", tol: Optional[float] = None, nthreads: Optional[int] = None, seed: Optional[int] = None):
    """Train k-means centroids by dispatching to the dtype-specific native binding.

    Parameters
    ----------
    partitions:
        Number of clusters to fit.
    init:
        Initialisation method; the native binding accepts "k-means++" or
        "random" and rejects anything else.
    max_iter:
        Maximum number of iterations.
    verbose, n_init:
        Forwarded to the native binding, which currently ignores them.
    sample_vectors:
        Training vectors; must expose a ``dtype`` of ``np.float32``.
    tol:
        Convergence tolerance; ``None`` lets the native binding use 0.0001.
    nthreads:
        Optional thread count forwarded to the native binding.
    seed:
        Optional random seed forwarded to the native binding.

    Returns
    -------
    Whatever ``kmeans_fit_f32`` returns (the trained centroids).

    Raises
    ------
    TypeError
        If ``sample_vectors.dtype`` is not ``np.float32``.
    """
    # The native function takes positional arguments only, declared in the
    # order (..., tol, seed, nthreads). The trailing optionals must therefore
    # be forwarded as tol, seed, nthreads -- not in this wrapper's own
    # parameter order -- or seed and nthreads end up swapped.
    args = (
        partitions,
        init,
        max_iter,
        verbose,
        n_init,
        sample_vectors,
        tol,
        seed,
        nthreads,
    )
    if sample_vectors.dtype == np.float32:
        return kmeans_fit_f32(*args)
    else:
        raise TypeError("Unsupported type!")

def kmeans_predict(centroids: "colMajorMatrix", sample_vectors: "colMajorMatrix"):
    """Run the dtype-specific native predict binding on ``sample_vectors``.

    Dispatch is keyed on ``sample_vectors.dtype``; only ``np.float32`` is
    handled, in which case ``kmeans_predict_f32(centroids, sample_vectors)``
    is returned. Any other dtype raises ``TypeError``.
    """
    if sample_vectors.dtype != np.float32:
        raise TypeError("Unsupported type!")
    return kmeans_predict_f32(centroids, sample_vectors)


# TODO
# def load_partitioned(uri, partitions, dtype: Optional[np.dtype] = None):
Expand Down
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ endif()
include(Docopt)
include(mdspan)
include(nlohmann_json)
# include(HighFive)

add_subdirectory(src)

Expand Down
46 changes: 46 additions & 0 deletions src/benchmarks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@

| Abbrv | Vector Set | Dimension | \# Vectors | dtype |
|-----------|-------------------|-----------|------------|-----------|
| siftsmall | siftsmall_base | 128 | 10,000 | float32 |
| siftsmall | siftsmall_learn | 128 | 25,000 | float32 |
| sift | sift_base | 128 | 1,000,000 | float32 |
| 1M | bigann1M_base | 128 | 1M | uint8 |
| 10M | bigann10M_base | 128 | 10M | uint8 |
| 100M | bigann100M_base | 128 | 100M | uint8 |
| 1B | bigann1B_base | 128 | 1B | uint8 |

| Abbrv | Index Set | \# Vectors | Index dtype | IDs | \# IDs | ID dtype |
|-------|-----------|--------------|-------------|---------|-----------|----------|
| siftsmall | | | | | | |
|sift | index | 2,001 | uint64 | ids | 1,000,000 | uint64 |
|1M | index.tdb | 1,000 | uint64 | ids.tdb | 1M | uint64 |

| Abbrv | Query | \#Queries | dtype | Groundtruth | Groundtruth dtype |
|--|------------------|----------------|---------|-------------|-------------------|
| siftsmall | siftsmall_query | 100 | float32 |
| siftsmall | siftsmall_query | 100 | float32 |
| sift | sift_query | 10,000 | float32 |
| 1M | query_public_10k | 10,000 | uint8 |
| 10M | query_public_10k | 10,000 | uint8 |
| 100M | query_public_10k | 10,000 | uint8 |
| 1B | query_public_10k | 10,000 | uint8 |

| Vector set | Download | descriptor | dimension | nb base vectors | nb query vectors | nb learn vectors | file format |
|------------|----------|------------|-----------|-----------------|------------------|------------------|-------------|
| ANN_SIFT10K | siftsmall.tar.gz (5.1MB) | SIFT (1) | 128 | 10,000 | 100 | 25,000 | fvecs |
| ANN_SIFT1M | sift.tar.gz (161MB) | SIFT (1) | 128 | 1,000,000 | 10,000 | 100,000 | fvecs |
| ANN_GIST1M | gist.tar.gz (2.6GB) | GIST (2) | 960 | 1,000,000 | 1,000 | 500,000 | fvecs |
| ANN_SIFT1B | Base set (92 GB) Learning set (9.1 GB) Query set (964 KB) Groundtruth (512 MB) | SIFT (3) | 128 | 1,000,000,000 | 10,000 | 100,000,000 | bvecs |


.bvecs, .fvecs and .ivecs vector file formats:

The vectors are stored in raw little-endian byte order.
Each vector takes 4+d*4 bytes in the .fvecs and .ivecs formats, and 4+d bytes in the .bvecs format,
where d is the dimensionality of the vector, as shown below.

| field | field type | description |
|-------|------------|-------------|
| d | int | the vector dimension |
| components | (unsigned char\|float \| int)\*d | the vector components |

18 changes: 18 additions & 0 deletions src/benchmarks/ivf/ivf_index.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
#
# Builds an IVF flat index by running ivf_index with the paths and type
# settings exported by ivf_init.bash (SIFT_LEARN, SIFT_INDEX, F_TYPE, ID_TYPE,
# PX_TYPE, IVFPATH). Exits with status 1 if the environment cannot be set up
# or the input vectors are missing.

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

# Without the shared environment every path below would be empty, so stop here
# if the init script reports a failure.
. "${SCRIPT_DIR}/ivf_init.bash" || exit 1

# Fail fast instead of launching the indexer against a path that is not there.
if [ ! -e "${SIFT_LEARN}" ]; then
  echo "File ${SIFT_LEARN} does not exist!"
  exit 1
fi


cmd="${IVFPATH}/ivf_index --db_uri ${SIFT_LEARN} --ftype ${F_TYPE} --index_uri ${SIFT_INDEX} --idtype ${ID_TYPE} --pxtype ${PX_TYPE} -v -d --log - --force"
echo ${cmd}
time ${cmd}

# cmd="${IVFPATH}/ivf_query --index_uri ${SIFT_INDEX} --query_uri ${SIFT_QUERY} --groundtruth_uri ${SIFT_GROUNDTRUTH} -k 10 -v -d --log -"
# echo ${cmd}
# time ${cmd}
44 changes: 44 additions & 0 deletions src/benchmarks/ivf/ivf_index_all.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash
#
# Builds IVF flat indexes for several data sets under ${GP3}, once for every
# combination of id type (uint32/uint64) and partition-index type
# (uint32/uint64). Each ivf_index command is echoed and then timed.

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

# GP3 and IVFPATH come from the shared init script; stop if it fails.
. "${SCRIPT_DIR}/ivf_init.bash" || exit 1

# Candidate data sets: siftsmall_base siftsmall_learn sift_base sift_learn 1M 10M
# (only siftsmall_base, sift_base, 1M and 10M are indexed below).

# run_index SUBDIR DB_NAME FTYPE
#   Indexes ${GP3}/SUBDIR/DB_NAME into
#   ${GP3}/SUBDIR/flatIVF_index_SUBDIR_base_<id_type>_<px_type>.
#   Reads pref, id_type and px_type from the enclosing loop.
run_index() {
  local subdir=$1
  local db_name=$2
  local ftype=$3
  local db_uri=${GP3}/${subdir}/${db_name}
  local index_name=${GP3}/${subdir}/flatIVF_index_${subdir}_base_${id_type}_${px_type}
  local cmd="${pref} --db_uri ${db_uri} --index_uri ${index_name} --ftype ${ftype}"
  echo ${cmd}
  time ${cmd}
}

for id_type in 32 64; do
  for px_type in 32 64; do

    # Options shared by every data set for this id/px type combination.
    pref="${IVFPATH}/ivf_index --idtype uint${id_type} --pxtype uint${px_type} \
--num_clusters 0 -v -d --log - --force"

    run_index siftsmall siftsmall_base float
    run_index sift sift_base float
    run_index 1M bigann1M_base uint8
    run_index 10M bigann10M_base uint8

  done
done
33 changes: 33 additions & 0 deletions src/benchmarks/ivf/ivf_init.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash
#
# Shared environment for the IVF benchmark scripts. Meant to be sourced: it
# exports the repository location, data set paths, tool path and type settings
# used by the other scripts, and uses `return 1` when no checkout is found.

# Preferred checkout location, with a fallback directly under ${HOME}.
export VECTOR_SEARCH=${HOME}/TileDB/TileDB-Vector-Search

if [ ! -d "${VECTOR_SEARCH}" ]; then
  echo "${VECTOR_SEARCH} does not exist"
  export VECTOR_SEARCH=${HOME}/TileDB-Vector-Search
  # The fallback must be validated too; give up if neither location exists.
  if [ ! -d "${VECTOR_SEARCH}" ]; then
    echo "${VECTOR_SEARCH} does not exist"
    return 1
  fi
fi
echo "VECTOR_SEARCH is ${VECTOR_SEARCH}"

export SRCROOT=${VECTOR_SEARCH}/src/

# Data set selection and the directories derived from it.
export SIFT=siftsmall
export SIFTPATH=${VECTOR_SEARCH}/external/data/gp3/${SIFT}
export GP3=${VECTOR_SEARCH}/external/data/gp3/
export DATAPATH=${SIFTPATH}

# Directory containing the ivf_index / ivf_query executables.
export IVFPATH=${SRCROOT}/cmake-build-relwithdebinfo/libtiledbvectorsearch/src/ivf/

export SIFT_LEARN=${SIFTPATH}/${SIFT}_base
export SIFT_QUERY=${SIFTPATH}/${SIFT}_query
export SIFT_GROUNDTRUTH=${SIFTPATH}/${SIFT}_groundtruth

export SIFT_INDEX=${SIFTPATH}/flatIVF_index_${SIFT}_base

# Type arguments passed as --idtype / --pxtype / --ftype.
export ID_TYPE=uint32
export PX_TYPE=uint64
export F_TYPE=float
19 changes: 19 additions & 0 deletions src/benchmarks/ivf/ivf_query.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash
#
# Runs ivf_query (k = 10) against the index, query set and ground truth named
# by ivf_init.bash. Exits with status 1 if the environment cannot be set up or
# any of the three inputs is missing.

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

# Without the shared environment every path below would be empty, so stop here
# if the init script reports a failure.
. "${SCRIPT_DIR}/ivf_init.bash" || exit 1

# Report every missing input, then abort rather than launching the query
# against paths that are not there.
missing=0
for required in "${SIFT_QUERY}" "${SIFT_INDEX}" "${SIFT_GROUNDTRUTH}"; do
  if [ ! -e "${required}" ]; then
    echo "File ${required} does not exist!"
    missing=1
  fi
done
if [ "${missing}" -ne 0 ]; then
  exit 1
fi

cmd="${IVFPATH}/ivf_query --index_uri ${SIFT_INDEX} --query_uri ${SIFT_QUERY} --ftype ${F_TYPE} --pxtype ${PX_TYPE} --groundtruth_uri ${SIFT_GROUNDTRUTH} --idtype ${ID_TYPE} -k 10 -v -d --log -"
echo ${cmd}
time ${cmd}
40 changes: 40 additions & 0 deletions src/benchmarks/pq/flat_l2.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/bash
#
# Runs the flat_l2 benchmark (--alg qv) on the sift data set: locates the
# TileDB-Vector-Search checkout, derives the data and executable paths, checks
# that the inputs exist, then echoes and times the command.

# Preferred checkout location, with a fallback directly under ${HOME}.
export VECTOR_SEARCH=${HOME}/TileDB/TileDB-Vector-Search

if [ ! -d "${VECTOR_SEARCH}" ]; then
  echo "${VECTOR_SEARCH} does not exist"
  export VECTOR_SEARCH=${HOME}/TileDB-Vector-Search
  # The fallback must be validated too; give up if neither location exists.
  if [ ! -d "${VECTOR_SEARCH}" ]; then
    echo "${VECTOR_SEARCH} does not exist"
    # `return` is only valid when this file is sourced; fall back to `exit`
    # when it is executed directly.
    return 1 2>/dev/null || exit 1
  fi
fi
echo "VECTOR_SEARCH is ${VECTOR_SEARCH}"

export PQROOT=${VECTOR_SEARCH}/src/

export SIFT=sift
export SIFTPATH=${VECTOR_SEARCH}/external/data/gp3/${SIFT}
export DATAPATH=${SIFTPATH}

# Directory containing the flat_l2 executable.
export PQPATH=${PQROOT}/cmake-build-relwithdebinfo/libtiledbvectorsearch/src/

export SIFT_LEARN=${SIFTPATH}/${SIFT}_base
export SIFT_QUERY=${SIFTPATH}/${SIFT}_query
export SIFT_GROUNDTRUTH=${SIFTPATH}/${SIFT}_groundtruth

# Report every missing input, then abort rather than launching the benchmark
# against paths that are not there.
missing=0
for required in "${SIFT_LEARN}" "${SIFT_QUERY}" "${SIFT_GROUNDTRUTH}"; do
  if [ ! -e "${required}" ]; then
    echo "File ${required} does not exist!"
    missing=1
  fi
done
if [ "${missing}" -ne 0 ]; then
  return 1 2>/dev/null || exit 1
fi

cmd="${PQPATH}/flat_l2 --db_uri ${SIFT_LEARN} --query_uri ${SIFT_QUERY} --groundtruth_uri ${SIFT_GROUNDTRUTH} --alg qv -v -d --log -"
echo ${cmd}
time ${cmd}
Loading
Loading