From 0cdd4a1ba7c6ea120a4f293a2b44db4c860aff65 Mon Sep 17 00:00:00 2001 From: Paris Morgan Date: Thu, 11 Jul 2024 13:39:27 +0200 Subject: [PATCH 1/3] add ids to python FeatureVectorArray --- .../vector_search/type_erased_module.cc | 158 +++++++++++++----- apis/python/test/test_type_erased_module.py | 46 +++++ 2 files changed, 166 insertions(+), 38 deletions(-) diff --git a/apis/python/src/tiledb/vector_search/type_erased_module.cc b/apis/python/src/tiledb/vector_search/type_erased_module.cc index a0caec329..9ea1b2ad0 100644 --- a/apis/python/src/tiledb/vector_search/type_erased_module.cc +++ b/apis/python/src/tiledb/vector_search/type_erased_module.cc @@ -97,10 +97,36 @@ auto datatype_to_format(tiledb_datatype_t datatype) { case TILEDB_UINT64: return py::format_descriptor::format(); default: - throw std::runtime_error("Unsupported datatype"); + throw std::runtime_error( + "[type_erased_module@datatype_to_format] Unsupported datatype"); } } +bool check_datatype_format( + const std::string& dtype_format, const std::string& buffer_info_format) { + if (dtype_format == buffer_info_format) { + return true; + } + // We need to handle uint64 specifically because of a numpy quirk: + // a. dtype_format (i.e. + // `datatype_to_format(string_to_datatype(.dtype().str()))`) will + // give us 'Q' (numpy.ulonglong). See: + // https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.ulonglong + // b. buffer_info_format (i.e. `.request().format`) will + // give us 'L' (numpy.uint) because numpy.uint is an alias for numpy.uint64 on + // Darwin arm64. See: + // https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.uint + if (dtype_format == "Q" && buffer_info_format == "L") { + return true; + } + // The same thing happens with int64, but for it dtype_format will give 'q' + // (numpy.longlong), whereas buffer_info_format gives 'l' (numpy.int_).
+ if (dtype_format == "q" && buffer_info_format == "l") { + return true; + } + return false; +} + // Define Pybind11 bindings // PYBIND11_MODULE(_tiledbvspy2, m) { @@ -176,19 +202,26 @@ void init_type_erased_module(py::module_& m) { /* Strides (in bytes) for each index */ ); }) - .def(py::init([](py::array b) { + .def(py::init([](py::array vector) { /* Request a buffer descriptor from Python */ - py::buffer_info info = b.request(); - if (info.ndim != 1) + py::buffer_info info = vector.request(); + if (info.ndim != 1) { throw std::runtime_error( - "Incompatible buffer dimension! Should be 1."); + "[type_erased_module@FeatureVector] Incompatible buffer " + "dimension. Should be 1, but was " + + std::to_string(info.ndim) + "."); + } - auto dtype_str = b.dtype().str(); + auto dtype_str = vector.dtype().str(); tiledb_datatype_t datatype = string_to_datatype(dtype_str); - if (info.format != datatype_to_format(datatype)) + + if (!check_datatype_format(datatype_to_format(datatype), info.format)) { throw std::runtime_error( - "Incompatible format: expected array of " + - datatype_to_string(datatype)); + "[type_erased_module@FeatureVector] Incompatible format: " + "expected array of " + + datatype_to_string(datatype) + " (" + + datatype_to_format(datatype) + "), but was " + info.format + "."); + } size_t sz = datatype_to_size(datatype); @@ -243,38 +276,84 @@ void init_type_erased_module(py::module_& m) { v.dimensions(), /* Strides (in bytes) for each index */ datatype_to_size(v.feature_type())}); }) - .def(py::init([](py::array b) { - /* Request a buffer descriptor from Python */ - py::buffer_info info = b.request(); - if (info.ndim != 2) - throw std::runtime_error( - "Incompatible buffer dimension! Should be 2."); + .def( + py::init([](py::array vectors, py::array ids) { + // The vector buffer info. + py::buffer_info info = vectors.request(); + if (info.ndim != 2) { + throw std::runtime_error( + "[type_erased_module@FeatureVectorArray] Incompatible buffer " + "dimension. 
Should be 2, but was " + + std::to_string(info.ndim) + "."); + } - auto dtype_str = b.dtype().str(); - tiledb_datatype_t datatype = string_to_datatype(dtype_str); - if (info.format != datatype_to_format(datatype)) - throw std::runtime_error( - "Incompatible format: expected array of " + - datatype_to_string(datatype)); + auto dtype_str = vectors.dtype().str(); + tiledb_datatype_t datatype = string_to_datatype(dtype_str); + if (!check_datatype_format( + datatype_to_format(datatype), info.format)) { + throw std::runtime_error( + "[type_erased_module@FeatureVectorArray] Incompatible format " + "- expected array of " + + datatype_to_string(datatype) + " (" + + datatype_to_format(datatype) + "), but was " + info.format + + "."); + } - size_t sz = datatype_to_size(datatype); + // The ids vector buffer info. + py::buffer_info ids_info = ids.request(); + if (ids_info.ndim != 1) { + throw std::runtime_error( + "[type_erased_module@FeatureVectorArray] Incompatible ids " + "buffer dimension. Should be 1, but was " + + std::to_string(ids_info.ndim) + "."); + } - auto v = [&]() { - auto order = b.flags() & py::array::f_style ? 
TILEDB_COL_MAJOR : - TILEDB_ROW_MAJOR; - if (order == TILEDB_COL_MAJOR) { - return FeatureVectorArray(info.shape[0], info.shape[1], dtype_str); - } else { - return FeatureVectorArray(info.shape[1], info.shape[0], dtype_str); - } - }(); + std::string ids_dtype_str; + tiledb_datatype_t ids_datatype = TILEDB_ANY; + if (ids.size() != 0) { + ids_dtype_str = ids.dtype().str(); + ids_datatype = string_to_datatype(ids_dtype_str); + if (!check_datatype_format( + datatype_to_format(ids_datatype), ids_info.format)) { + throw std::runtime_error( + "[type_erased_module@FeatureVectorArray] Incompatible ids " + "format - expected array of " + + datatype_to_string(datatype) + " (" + + datatype_to_format(datatype) + "), but was " + info.format + + "."); + } + } - auto data = (uint8_t*)v.data(); - std::memcpy( - data, (uint8_t*)info.ptr, info.shape[0] * info.shape[1] * sz); + auto feature_vector_array = [&]() { + auto order = vectors.flags() & py::array::f_style ? + TILEDB_COL_MAJOR : + TILEDB_ROW_MAJOR; + if (order == TILEDB_COL_MAJOR) { + return FeatureVectorArray( + info.shape[0], info.shape[1], dtype_str, ids_dtype_str); + } else { + return FeatureVectorArray( + info.shape[1], info.shape[0], dtype_str, ids_dtype_str); + } + }(); + + auto data = (uint8_t*)feature_vector_array.data(); + std::memcpy( + data, + (uint8_t*)info.ptr, + info.shape[0] * info.shape[1] * datatype_to_size(datatype)); + + if (ids.size() != 0) { + std::memcpy( + feature_vector_array.ids(), + (uint8_t*)ids_info.ptr, + ids_info.shape[0] * datatype_to_size(ids_datatype)); + } - return v; - })); + return feature_vector_array; + }), + py::arg("vectors"), + py::arg("ids") = py::array()); py::class_(m, "IndexFlatL2") .def( @@ -301,7 +380,8 @@ void init_type_erased_module(py::module_& m) { } else if (s == "random") { return kmeans_init::random; } else { - throw std::runtime_error("Invalid kmeans_init value"); + throw std::runtime_error( + "[type_erased_module@kmeans_init] Invalid kmeans_init value"); } })); @@ -478,7 
+558,9 @@ void init_type_erased_module(py::module_& m) { } else if (std::string(init_str) == "random") { init = kmeans_init::random; } else { - throw std::runtime_error("Invalid kmeans_init value"); + throw std::runtime_error( + "[type_erased_module@IndexIVFFlat@train] Invalid kmeans_init " + "value"); } index.train(vectors, init); }, diff --git a/apis/python/test/test_type_erased_module.py b/apis/python/test/test_type_erased_module.py index 80fbd2bef..b87e7e00e 100644 --- a/apis/python/test/test_type_erased_module.py +++ b/apis/python/test/test_type_erased_module.py @@ -73,16 +73,53 @@ def test_feature_vector_array_to_numpy(): a = vspy.FeatureVectorArray(ctx, siftsmall_inputs_uri) assert a.num_vectors() == 10000 assert a.dimensions() == 128 + assert a.num_ids() == 0 + assert a.ids_type_string() == "any" b = np.array(a) assert b.shape == (10000, 128) a = vspy.FeatureVectorArray(ctx, bigann10k_inputs_uri) assert a.num_vectors() == 10000 assert a.dimensions() == 128 + assert a.num_ids() == 0 + assert a.ids_type_string() == "any" b = np.array(a) assert b.shape == (10000, 128) +def test_numpy_to_feature_vector_array_data_types(): + for dtype in [ + np.float32, + np.int8, + np.uint8, + np.int32, + np.uint32, + np.int64, + np.uint64, + ]: + for dtype_ids in [np.uint32, np.uint64]: + if np.issubdtype(dtype, np.integer): + max_val = np.iinfo(dtype).max + elif np.issubdtype(dtype, np.floating): + max_val = np.finfo(dtype).max + else: + raise TypeError(f"Unsupported data type {dtype}") + + if np.issubdtype(dtype_ids, np.integer): + max_val_ids = np.iinfo(dtype_ids).max + elif np.issubdtype(dtype, np.floating): + max_val_ids = np.finfo(dtype_ids).max + else: + raise TypeError(f"Unsupported ids data type {dtype_ids}") + + vectors = np.array([[max_val]], dtype=dtype) + ids = np.array([max_val_ids], dtype=dtype_ids) + feature_vector_array = vspy.FeatureVectorArray(vectors, ids) + assert np.array_equal( + vectors, np.array(feature_vector_array) + ), f"Arrays were not equal for 
dtype: {dtype}, dtype_ids: {dtype_ids}" + + def test_numpy_to_feature_vector_array(): a = np.array(np.random.rand(10000, 128), dtype=np.float32) b = vspy.FeatureVectorArray(a) @@ -162,6 +199,15 @@ def test_numpy_to_feature_vector_array(): assert np.array_equal(a, np.transpose(np.array(b))) +def test_numpy_to_feature_vector_array_with_ids(): + print() + a = np.array(np.random.rand(10000, 128), dtype=np.float32) + ids = np.arange(10000, dtype=np.uint64) + b = vspy.FeatureVectorArray(a, ids) + assert b.num_ids() == 10000 + assert b.ids_type_string() == "uint64" + + def test_TemporalPolicy(): temporal_policy = vspy.TemporalPolicy() assert temporal_policy.timestamp_start() == 0 From 1cbab1f526966260001025d24b6bbb2e6ba949a7 Mon Sep 17 00:00:00 2001 From: Paris Morgan Date: Thu, 11 Jul 2024 14:17:30 +0200 Subject: [PATCH 2/3] cleanup and more tests --- .../vector_search/type_erased_module.cc | 25 +++++++++--------- apis/python/test/test_type_erased_module.py | 26 +++++++++++++++++++ 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/apis/python/src/tiledb/vector_search/type_erased_module.cc b/apis/python/src/tiledb/vector_search/type_erased_module.cc index 9ea1b2ad0..abcc58d10 100644 --- a/apis/python/src/tiledb/vector_search/type_erased_module.cc +++ b/apis/python/src/tiledb/vector_search/type_erased_module.cc @@ -214,13 +214,13 @@ void init_type_erased_module(py::module_& m) { auto dtype_str = vector.dtype().str(); tiledb_datatype_t datatype = string_to_datatype(dtype_str); - - if (!check_datatype_format(datatype_to_format(datatype), info.format)) { + auto datatype_format = datatype_to_format(datatype); + if (!check_datatype_format(datatype_format, info.format)) { throw std::runtime_error( "[type_erased_module@FeatureVector] Incompatible format: " "expected array of " + - datatype_to_string(datatype) + " (" + - datatype_to_format(datatype) + "), but was " + info.format + "."); + dtype_str + " (" + datatype_format + "), but was " + info.format + + "."); } 
size_t sz = datatype_to_size(datatype); @@ -289,14 +289,13 @@ void init_type_erased_module(py::module_& m) { auto dtype_str = vectors.dtype().str(); tiledb_datatype_t datatype = string_to_datatype(dtype_str); - if (!check_datatype_format( - datatype_to_format(datatype), info.format)) { + auto datatype_format = datatype_to_format(datatype); + if (!check_datatype_format(datatype_format, info.format)) { throw std::runtime_error( "[type_erased_module@FeatureVectorArray] Incompatible format " "- expected array of " + - datatype_to_string(datatype) + " (" + - datatype_to_format(datatype) + "), but was " + info.format + - "."); + dtype_str + " (" + datatype_format + "), but was " + + info.format + "."); } // The ids vector buffer info. @@ -313,14 +312,14 @@ void init_type_erased_module(py::module_& m) { if (ids.size() != 0) { ids_dtype_str = ids.dtype().str(); ids_datatype = string_to_datatype(ids_dtype_str); + auto ids_datatype_format = datatype_to_format(ids_datatype); if (!check_datatype_format( - datatype_to_format(ids_datatype), ids_info.format)) { + ids_datatype_format, ids_info.format)) { throw std::runtime_error( "[type_erased_module@FeatureVectorArray] Incompatible ids " "format - expected array of " + - datatype_to_string(datatype) + " (" + - datatype_to_format(datatype) + "), but was " + info.format + - "."); + ids_dtype_str + " (" + ids_datatype_format + "), but was " + + ids_info.format + "."); } } diff --git a/apis/python/test/test_type_erased_module.py b/apis/python/test/test_type_erased_module.py index b87e7e00e..ddd266697 100644 --- a/apis/python/test/test_type_erased_module.py +++ b/apis/python/test/test_type_erased_module.py @@ -28,6 +28,30 @@ def test_feature_vector_to_numpy(): assert b.dtype == np.uint64 +def test_numpy_to_feature_vector_data_types(): + for dtype in [ + np.float32, + np.int8, + np.uint8, + np.int32, + np.uint32, + np.uint64, + ]: + if np.issubdtype(dtype, np.integer): + max_val = np.iinfo(dtype).max + elif np.issubdtype(dtype, 
np.floating): + max_val = np.finfo(dtype).max + else: + raise TypeError(f"Unsupported data type {dtype}") + + vector = np.array([max_val], dtype=dtype) + feature_vector = vspy.FeatureVector(vector) + assert feature_vector.feature_type_string() == np.dtype(dtype).name + assert np.array_equal( + vector, np.array(feature_vector) + ), f"Arrays were not equal for dtype: {dtype}" + + def test_numpy_to_feature_vector_array_simple(): a = np.array(np.random.rand(10000), dtype=np.float32) b = vspy.FeatureVector(a) @@ -115,6 +139,8 @@ def test_numpy_to_feature_vector_array_data_types(): vectors = np.array([[max_val]], dtype=dtype) ids = np.array([max_val_ids], dtype=dtype_ids) feature_vector_array = vspy.FeatureVectorArray(vectors, ids) + assert feature_vector_array.feature_type_string() == np.dtype(dtype).name + assert feature_vector_array.ids_type_string() == np.dtype(dtype_ids).name assert np.array_equal( vectors, np.array(feature_vector_array) ), f"Arrays were not equal for dtype: {dtype}, dtype_ids: {dtype_ids}" From 214c99ef6719b0e5bf4cde18d31e6efc27286e5b Mon Sep 17 00:00:00 2001 From: Paris Morgan Date: Thu, 11 Jul 2024 15:13:25 +0200 Subject: [PATCH 3/3] fix build --- .../src/tiledb/vector_search/type_erased_module.cc | 13 +++++++------ apis/python/test/test_type_erased_module.py | 1 - 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/apis/python/src/tiledb/vector_search/type_erased_module.cc b/apis/python/src/tiledb/vector_search/type_erased_module.cc index abcc58d10..9b38b4e42 100644 --- a/apis/python/src/tiledb/vector_search/type_erased_module.cc +++ b/apis/python/src/tiledb/vector_search/type_erased_module.cc @@ -219,8 +219,8 @@ void init_type_erased_module(py::module_& m) { throw std::runtime_error( "[type_erased_module@FeatureVector] Incompatible format: " "expected array of " + - dtype_str + " (" + datatype_format + "), but was " + info.format + - "."); + datatype_to_string(datatype) + " (" + datatype_format + + "), but was " + info.format + "."); 
} size_t sz = datatype_to_size(datatype); @@ -294,8 +294,8 @@ void init_type_erased_module(py::module_& m) { throw std::runtime_error( "[type_erased_module@FeatureVectorArray] Incompatible format " "- expected array of " + - dtype_str + " (" + datatype_format + "), but was " + - info.format + "."); + datatype_to_string(datatype) + " (" + datatype_format + + "), but was " + info.format + "."); } // The ids vector buffer info. @@ -318,8 +318,9 @@ void init_type_erased_module(py::module_& m) { throw std::runtime_error( "[type_erased_module@FeatureVectorArray] Incompatible ids " "format - expected array of " + - ids_dtype_str + " (" + ids_datatype_format + "), but was " + - ids_info.format + "."); + datatype_to_string(ids_datatype) + " (" + + ids_datatype_format + "), but was " + ids_info.format + + "."); } } diff --git a/apis/python/test/test_type_erased_module.py b/apis/python/test/test_type_erased_module.py index ddd266697..1713ed895 100644 --- a/apis/python/test/test_type_erased_module.py +++ b/apis/python/test/test_type_erased_module.py @@ -226,7 +226,6 @@ def test_numpy_to_feature_vector_array(): def test_numpy_to_feature_vector_array_with_ids(): - print() a = np.array(np.random.rand(10000, 128), dtype=np.float32) ids = np.arange(10000, dtype=np.uint64) b = vspy.FeatureVectorArray(a, ids)