Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Type erased feature vector and feature vector array classes #210

Merged
merged 29 commits into from
Jan 24, 2024
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
92bcd89
Add concepts and cpos
lums658 Jan 19, 2024
fa16f09
Add include concepts.h to flat/vq.h
lums658 Jan 19, 2024
4df5c27
Cleanup code
jparismorgan Jan 19, 2024
cc53a0e
Do not compile CLI programs
lums658 Jan 19, 2024
85d23d6
Small changes to allow concepts to work
lums658 Jan 19, 2024
859dbe9
Incorporate graph search into main
lums658 Jan 19, 2024
cfae025
Add test data, additional source files
lums658 Jan 19, 2024
05f2d84
Remove ci-skip from unit_ivf_qv and unit_ivf_vq
lums658 Jan 19, 2024
4783860
clang-format
lums658 Jan 19, 2024
8dde86f
Cleanup code
jparismorgan Jan 19, 2024
b435c27
More cleanup
jparismorgan Jan 19, 2024
5bacf08
Merge graph-to-main and other small changes so unit tests pass
lums658 Jan 19, 2024
d7e6b16
Clang format
lums658 Jan 20, 2024
9e17cc7
Conceptify graph files
lums658 Jan 20, 2024
42e8a6e
Apply changes from 204
lums658 Jan 22, 2024
ba1a346
Address PR comments, clang-format
lums658 Jan 22, 2024
e27ab44
tdb vector checkpoint
lums658 Jan 23, 2024
8cd8a72
Type erased feature vector and feature vector array
lums658 Jan 23, 2024
9ad8c23
Removed will_fail from ivf_qv and ivf_vq, C++ unit tests all pass
lums658 Jan 23, 2024
e7fa90a
Add Python bindings for type-erased feature vector and feature vector…
lums658 Jan 23, 2024
6aaa2c1
Merge branch 'main' into lums/tmp/type-erased
lums658 Jan 23, 2024
bdb3d8c
Fix small bug in tdb_io.h
lums658 Jan 23, 2024
b7e1440
Remove will_fail (again) from CMakeLists.txt, small change to vector.h
lums658 Jan 23, 2024
7744ef7
Merge branch 'main' into lums/tmp/type-erased
lums658 Jan 24, 2024
9ec0ce8
Address comments from PR
lums658 Jan 24, 2024
9db4b84
Merge branch 'lums/tmp/type-erased' of github.com:TileDB-Inc/TileDB-V…
lums658 Jan 24, 2024
0056627
clang format
lums658 Jan 24, 2024
ad3499b
Fix incorrect python module target
dudoslav Jan 24, 2024
0082c31
Remove spurious comment
lums658 Jan 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions apis/python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ set(VSPY_TARGET_NAME _tiledbvspy)
python_add_library(${VSPY_TARGET_NAME} MODULE
"src/tiledb/vector_search/module.cc"
"src/tiledb/vector_search/kmeans.cc"
"src/tiledb/vector_search/module2.cc"
WITH_SOABI)

target_link_libraries(${VSPY_TARGET_NAME}
Expand Down
7 changes: 7 additions & 0 deletions apis/python/src/tiledb/vector_search/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@
validate_top_k)
from .storage_formats import STORAGE_VERSION, storage_formats

from ._tiledbvspy import FeatureVector
from ._tiledbvspy import FeatureVectorArray
# Indexes to be added in later PR
# from ._tiledbvspy import IndexFlatL2
# from ._tiledbvspy import IndexIVFFlat
from ._tiledbvspy import Ctx

try:
from tiledb.vector_search.version import version as __version__
except ImportError:
Expand Down
4 changes: 4 additions & 0 deletions apis/python/src/tiledb/vector_search/module.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ PYBIND11_MAKE_OPAQUE(std::vector<double>);
PYBIND11_MAKE_OPAQUE(std::vector<size_t>);
#endif

void init_module2(py::module&);

namespace {


Expand Down Expand Up @@ -436,6 +438,7 @@ static void declare_vq_query_heap_pyarray(py::module& m, const std::string& suff
} // anonymous namespace

void init_kmeans(py::module&);
void init_module2(py::module&);

PYBIND11_MODULE(_tiledbvspy, m) {

Expand Down Expand Up @@ -603,4 +606,5 @@ PYBIND11_MODULE(_tiledbvspy, m) {
declareFixedMinPairHeap(m);

init_kmeans(m);
init_module2(m);
}
318 changes: 318 additions & 0 deletions apis/python/src/tiledb/vector_search/module2.cc
lums658 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,318 @@
/**
* @file tiledb/vector_search/module2.cc
*
* @section LICENSE
*
* The MIT License
*
* @copyright Copyright (c) 2023 TileDB, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
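* @section DESCRIPTION
*
* pybind11 bindings for the type-erased FeatureVector and FeatureVectorArray
* classes. The bindings are registered on the _tiledbvspy module by
* init_module2(), which is called from module.cc.
*/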

#include <tiledb/tiledb>

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/functional.h>
#include <pybind11/numpy.h>


#include "api/feature_vector.h"
#include "api/feature_vector_array.h"

// Do not delete! This will be part of a future PR
#if 0
#include "api/flat_l2_index.h"
#include "api/ivf_flat_index.h"
#endif

#include "api/api_defs.h"

namespace py = pybind11;

// See https://chat.openai.com/share/0ec55abe-f5be-4988-a99b-017f27a1e129


namespace {
/**
 * Convert a two-element std::tuple into a Python 2-tuple.
 *
 * Each element is converted with py::cast using return_value_policy::move,
 * and the two resulting Python objects are packed with py::make_tuple.
 *
 * @tparam TArgs Element types of the tuple; exactly two are required
 *         (enforced at compile time).
 * @param arg The tuple whose two elements are handed over to Python.
 * @return A py::tuple holding the two converted elements, in order.
 */
template <typename... TArgs>
py::tuple make_python_pair(std::tuple<TArgs...>&& arg) {
  static_assert(sizeof...(TArgs) == 2, "Must have exactly two arguments");

  auto first = py::cast(std::get<0>(arg), py::return_value_policy::move);
  auto second = py::cast(std::get<1>(arg), py::return_value_policy::move);

  return py::make_tuple<py::return_value_policy::automatic>(first, second);
}

/**
 * Flatten Python keyword arguments into a string-to-string map.
 *
 * Every key and every value is passed through py::str and stored as a
 * std::string, so non-string values end up as their string representation.
 *
 * @param kwargs The keyword arguments received from Python.
 * @return A std::map with one entry per keyword argument.
 */
std::map<std::string, std::string> kwargs_to_map(py::kwargs kwargs) {
  std::map<std::string, std::string> string_map;

  for (auto entry : kwargs) {
    // Stringify both halves of the (name, value) pair before storing them.
    const std::string name = py::str(entry.first);
    const std::string text = py::str(entry.second);

    string_map[name] = text;
  }

  return string_map;
}

} // namespace

/**
 * Map a TileDB datatype to the matching Python buffer-protocol format
 * descriptor, as produced by py::format_descriptor<T>::format().
 *
 * @param datatype One of the TileDB float32/float64 or 8/16/32/64-bit signed
 *        or unsigned integer datatypes.
 * @return The format descriptor of the corresponding C++ scalar type.
 * @throws std::runtime_error if the datatype is not one of the types above.
 */
auto datatype_to_format(tiledb_datatype_t datatype) {
  switch (datatype) {
    // Floating point types
    case TILEDB_FLOAT32:
      return py::format_descriptor<float>::format();
    case TILEDB_FLOAT64:
      return py::format_descriptor<double>::format();

    // Signed integer types
    case TILEDB_INT8:
      return py::format_descriptor<int8_t>::format();
    case TILEDB_INT16:
      return py::format_descriptor<int16_t>::format();
    case TILEDB_INT32:
      return py::format_descriptor<int32_t>::format();
    case TILEDB_INT64:
      return py::format_descriptor<int64_t>::format();

    // Unsigned integer types
    case TILEDB_UINT8:
      return py::format_descriptor<uint8_t>::format();
    case TILEDB_UINT16:
      return py::format_descriptor<uint16_t>::format();
    case TILEDB_UINT32:
      return py::format_descriptor<uint32_t>::format();
    case TILEDB_UINT64:
      return py::format_descriptor<uint64_t>::format();

    default:
      throw std::runtime_error("Unsupported datatype");
  }
}

// Define Pybind11 bindings

// PYBIND11_MODULE(_tiledbvspy2, m) {
/**
 * Register the type-erased feature vector bindings on a Python module.
 *
 * Adds to `m`:
 *   - the free function `count_intersections`,
 *   - the class `FeatureVector` (1-D, exposes the buffer protocol),
 *   - the class `FeatureVectorArray` (2-D, exposes the buffer protocol).
 *
 * Both classes can be constructed from a NumPy array. The array contents are
 * copied with a single memcpy, so the array must be contiguous; arrays that
 * are neither C- nor F-contiguous are rejected with std::runtime_error
 * instead of being copied with the wrong layout.
 *
 * The sections guarded by `#if 0` (Ctx, IndexFlatL2, kmeans_init,
 * IndexIVFFlat) are intentionally disabled and kept for a future PR.
 *
 * @param m The pybind11 module to add the bindings to.
 */
void init_module2(py::module_& m) {
  m.def(
      "count_intersections",
      [](const FeatureVectorArray& a,
         const FeatureVectorArray& b,
         size_t k_nn) { return count_intersections(a, b, k_nn); });
#if 0
  py::class_<tiledb::Context> (m, "Ctx", py::module_local())
    .def(py::init([](std::optional<py::dict> maybe_config) {
      tiledb::Config cfg;
      if (maybe_config.has_value()) {
        for (auto item : maybe_config.value()) {
            cfg.set(item.first.cast<std::string>(), item.second.cast<std::string>());
        }
      }
      return tiledb::Context(cfg);
    }))
    ;
#endif
  py::class_<FeatureVector>(m, "FeatureVector", py::buffer_protocol())
      .def(py::init<const tiledb::Context&, const std::string&>())
      .def(py::init<size_t, const std::string&>())
      .def(py::init<size_t, void*, const std::string&>())
      .def("dimension", &FeatureVector::dimension)
      .def("feature_type", &FeatureVector::feature_type)
      .def("feature_type_string", &FeatureVector::feature_type_string)
      // Expose the vector's storage to Python as a 1-D buffer.
      .def_buffer([](FeatureVector& v) -> py::buffer_info {
        return py::buffer_info(
            v.data(),                           /* Pointer to buffer */
            datatype_to_size(v.feature_type()), /* Size of one scalar */
            datatype_to_format(
                v.feature_type()), /* Python struct-style format descriptor */
            1,                     /* Number of dimensions */
            {v.dimension()},       /* Buffer dimension */
            {datatype_to_size(v.feature_type())}
            /* Strides (in bytes) for each index */
        );
      })
      // Construct a FeatureVector by copying the contents of a 1-D array.
      .def(py::init([](py::array b) {
        /* Request a buffer descriptor from Python */
        py::buffer_info info = b.request();
        if (info.ndim != 1)
          throw std::runtime_error(
              "Incompatible buffer dimension!  Should be 1.");

        auto dtype_str = b.dtype().str();
        tiledb_datatype_t datatype = string_to_datatype(dtype_str);
        if (info.format != datatype_to_format(datatype))
          throw std::runtime_error(
              "Incompatible format: expected array of " +
              datatype_to_string(datatype));

        // The copy below is one flat memcpy of shape[0] elements, which is
        // only valid when the elements are adjacent in memory. A strided
        // view (e.g. a[::2]) would be copied incorrectly and read past the
        // end of its data, so reject it up front.
        if (!(b.flags() & (py::array::c_style | py::array::f_style)))
          throw std::runtime_error(
              "Incompatible buffer layout! Array must be contiguous.");

        size_t sz = datatype_to_size(datatype);

        auto v = FeatureVector(info.shape[0], dtype_str);

        auto data = (uint8_t*)v.data();
        std::memcpy(data, (uint8_t*)info.ptr, info.shape[0] * sz);

        return v;
      }));

  py::class_<FeatureVectorArray>(m, "FeatureVectorArray", py::buffer_protocol())
      .def(py::init<const tiledb::Context&, const std::string&>())
      // .def(py::init<size_t, size_t, const std::string&>())
      // .def(py::init<size_t, size_t void*, const std::string&>())
      .def("dimension", &FeatureVectorArray::dimension)
      .def("num_vectors", &FeatureVectorArray::num_vectors)
      .def("feature_type", &FeatureVectorArray::feature_type)
      .def("feature_type_string", &FeatureVectorArray::feature_type_string)
      // Expose the array's storage to Python as a 2-D buffer of shape
      // (num_vectors, dimension) with row-major strides.
      .def_buffer([](FeatureVectorArray& v) -> py::buffer_info {
        return py::buffer_info(
            v.data(),                           /* Pointer to buffer */
            datatype_to_size(v.feature_type()), /* Size of one scalar */
            datatype_to_format(
                v.feature_type()), /* Python struct-style format descriptor */
            2,                     /* Number of dimensions */
            {v.num_vectors(),
             v.dimension()}, /* Buffer dimensions -- row major */
            {datatype_to_size(v.feature_type()) *
                 v.dimension(), /* Strides (in bytes) for each index */
             datatype_to_size(v.feature_type())});
      })
      // Construct a FeatureVectorArray by copying the contents of a 2-D array.
      .def(py::init([](py::array b) {
        /* Request a buffer descriptor from Python */
        py::buffer_info info = b.request();
        if (info.ndim != 2)
          throw std::runtime_error(
              "Incompatible buffer dimension!  Should be 2.");

        auto dtype_str = b.dtype().str();
        tiledb_datatype_t datatype = string_to_datatype(dtype_str);
        if (info.format != datatype_to_format(datatype))
          throw std::runtime_error(
              "Incompatible format: expected array of " +
              datatype_to_string(datatype));

        // The copy below is one flat memcpy of shape[0] * shape[1] elements,
        // and the layout decision only distinguishes F-contiguous from
        // "everything else". A non-contiguous view (e.g. m[:, ::2]) would be
        // treated as row-major and copied incorrectly, so reject it up front.
        if (!(b.flags() & (py::array::c_style | py::array::f_style)))
          throw std::runtime_error(
              "Incompatible buffer layout! Array must be C- or "
              "F-contiguous.");

        size_t sz = datatype_to_size(datatype);

        // The two constructor arguments are swapped between the column-major
        // and row-major cases so that the flat copy below lands with the
        // layout the FeatureVectorArray expects.
        // NOTE(review): presumably the arguments are (rows, cols) of a
        // column-major matrix -- confirm against api/feature_vector_array.h.
        auto v = [&]() {
          auto order = b.flags() & py::array::f_style ? TILEDB_COL_MAJOR :
                                                        TILEDB_ROW_MAJOR;
          if (order == TILEDB_COL_MAJOR) {
            return FeatureVectorArray(info.shape[0], info.shape[1], dtype_str);
          } else {
            return FeatureVectorArray(info.shape[1], info.shape[0], dtype_str);
          }
        }();

        auto data = (uint8_t*)v.data();
        std::memcpy(
            data, (uint8_t*)info.ptr, info.shape[0] * info.shape[1] * sz);

        return v;
      }));

  // Do not delete!  This will be part of a future PR
#if 0
  py::class_<IndexFlatL2>(m, "IndexFlatL2")
      .def(py::init<const tiledb::Context&, const std::string&>())
      .def("add", &IndexFlatL2::add)
      .def("add_with_ids", &IndexFlatL2::add_with_ids)
      .def("train", &IndexFlatL2::train)
      .def("save", &IndexFlatL2::save)
      .def("feature_type_string", &IndexFlatL2::feature_type_string)
      .def("dimension", &IndexFlatL2::dimension)
      .def(
          "query",
          [](IndexFlatL2& index, FeatureVectorArray& vectors, size_t top_k) {
            auto r = index.query(vectors, top_k);
            return make_python_pair(std::move(r));
          });

  py::class_<kmeans_init>(m, "kmeans_init")
      .def(py::init([](const std::string& s) {
        if (s == "kmeanspp") {
          return kmeans_init::kmeanspp;
        } else if (s == "random") {
          return kmeans_init::random;
        } else {
          throw std::runtime_error("Invalid kmeans_init value");
        }
      }));

  py::class_<IndexIVFFlat>(m, "IndexIVFFlat")
      .def(py::init<const tiledb::Context&, const std::string&>())
      .def(
          "__init__",
          [](IndexIVFFlat& instance, py::kwargs kwargs) {
            auto args = kwargs_to_map(kwargs);
            new (&instance) IndexIVFFlat(args);
          })
      .def(
          "train",
          [](IndexIVFFlat& index,
             const FeatureVectorArray& vectors,
             py::str init_str) {
            kmeans_init init = kmeans_init::random;
            if (std::string(init_str) == "kmeans++") {
              init = kmeans_init::kmeanspp;
            } else if (std::string(init_str) == "random") {
              init = kmeans_init::random;
            } else {
              throw std::runtime_error("Invalid kmeans_init value");
            }
            index.train(vectors, init);
          },
          py::arg("vectors"),
          py::arg("init") = "random")
      .def(
          "add",
          [](IndexIVFFlat& index, const FeatureVectorArray& vectors) {
            index.add(vectors);
          })
      .def("add_with_ids", &IndexIVFFlat::add_with_ids)
      //  .def("save", &IndexIVFFlat::save)
      .def(
          "query_infinite_ram",
          [](IndexIVFFlat& index,
             const FeatureVectorArray& query,
             size_t top_k,
             size_t nprobe) {
            auto r = index.query_infinite_ram(query, top_k, nprobe);
            return make_python_pair(std::move(r));
          })  // , py::arg("vectors"), py::arg("top_k") = 1, py::arg("nprobe")
              // = 10)
      .def(
          "query_finite_ram",
          [](IndexIVFFlat& index,
             const FeatureVectorArray& query,
             size_t top_k,
             size_t nprobe,
             size_t upper_bound) {
            auto r = index.query_finite_ram(query, top_k, nprobe, upper_bound);
            return make_python_pair(std::move(r));
          },
          py::arg("vectors"),
          py::arg("top_k") = 1,
          py::arg("nprobe") = 10,
          py::arg("upper_bound") = 0)
      .def("feature_type_string", &IndexIVFFlat::feature_type_string)
      .def("id_type_string", &IndexIVFFlat::id_type_string)
      .def("px_type_string", &IndexIVFFlat::px_type_string)
      .def("dimension", &IndexIVFFlat::dimension);
#endif
}

Loading
Loading