From f3f6229357e31d3e3296be5d04a21a8e19fc22e7 Mon Sep 17 00:00:00 2001 From: Paris Morgan Date: Fri, 5 Jul 2024 11:15:32 +0200 Subject: [PATCH 1/6] specify ids array tile_size based on ids array attribute datatype --- apis/python/src/tiledb/vector_search/flat_index.py | 5 ++--- apis/python/src/tiledb/vector_search/ivf_flat_index.py | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/apis/python/src/tiledb/vector_search/flat_index.py b/apis/python/src/tiledb/vector_search/flat_index.py index 4d547f148..0f73ee71d 100644 --- a/apis/python/src/tiledb/vector_search/flat_index.py +++ b/apis/python/src/tiledb/vector_search/flat_index.py @@ -185,7 +185,6 @@ def create( ) with tiledb.scope_ctx(ctx_or_config=config): group = tiledb.Group(uri, "w") - tile_size = TILE_SIZE_BYTES / np.dtype(vector_type).itemsize / dimensions ids_array_name = storage_formats[storage_version]["IDS_ARRAY_NAME"] parts_array_name = storage_formats[storage_version]["PARTS_ARRAY_NAME"] updates_array_name = storage_formats[storage_version]["UPDATES_ARRAY_NAME"] @@ -196,7 +195,7 @@ def create( ids_array_rows_dim = tiledb.Dim( name="rows", domain=(0, MAX_INT32), - tile=tile_size, + tile=int(TILE_SIZE_BYTES / np.dtype(np.uint64).itemsize / dimensions), dtype=np.dtype(np.int32), ) ids_array_dom = tiledb.Domain(ids_array_rows_dim) @@ -224,7 +223,7 @@ def create( parts_array_cols_dim = tiledb.Dim( name="cols", domain=(0, MAX_INT32), - tile=tile_size, + tile=int(TILE_SIZE_BYTES / np.dtype(vector_type).itemsize / dimensions), dtype=np.dtype(np.int32), ) parts_array_dom = tiledb.Domain(parts_array_rows_dim, parts_array_cols_dim) diff --git a/apis/python/src/tiledb/vector_search/ivf_flat_index.py b/apis/python/src/tiledb/vector_search/ivf_flat_index.py index 25d7d528b..717f7e2b7 100644 --- a/apis/python/src/tiledb/vector_search/ivf_flat_index.py +++ b/apis/python/src/tiledb/vector_search/ivf_flat_index.py @@ -539,7 +539,6 @@ def create( ) with tiledb.scope_ctx(ctx_or_config=config): group = tiledb.Group(uri, "w") - tile_size = int(TILE_SIZE_BYTES / np.dtype(vector_type).itemsize / dimensions) group.meta["partition_history"] = json.dumps([0]) centroids_array_name = storage_formats[storage_version]["CENTROIDS_ARRAY_NAME"] index_array_name = storage_formats[storage_version]["INDEX_ARRAY_NAME"] @@ -607,7 +606,7 @@ def create( ids_array_rows_dim = tiledb.Dim( name="rows", domain=(0, MAX_INT32), - tile=tile_size, + tile=int(TILE_SIZE_BYTES / np.dtype(np.uint64).itemsize / dimensions), dtype=np.dtype(np.int32), ) ids_array_dom = tiledb.Domain(ids_array_rows_dim) @@ -635,7 +634,7 @@ def create( parts_array_cols_dim = tiledb.Dim( name="cols", domain=(0, MAX_INT32), - tile=tile_size, + tile=int(TILE_SIZE_BYTES / np.dtype(vector_type).itemsize / dimensions), dtype=np.dtype(np.int32), ) parts_array_dom = tiledb.Domain(parts_array_rows_dim, parts_array_cols_dim) From c8eff66e52dfa95f7c855bdd7b2fe508581f66f6 Mon Sep 17 00:00:00 2001 From: Paris Morgan Date: Fri, 5 Jul 2024 14:39:49 +0200 Subject: [PATCH 2/6] C++ changes --- src/include/index/ivf_pq_group.h | 6 +++--- src/include/index/vamana_group.h | 25 ++++++++++++++++++------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/src/include/index/ivf_pq_group.h b/src/include/index/ivf_pq_group.h index 4abfea049..3290800ff 100644 --- a/src/include/index/ivf_pq_group.h +++ b/src/include/index/ivf_pq_group.h @@ -245,8 +245,8 @@ class ivf_pq_group : public base_index_group { } this->init_valid_array_names(); - static const int32_t tile_size{ - (int32_t)(tile_size_bytes / sizeof(typename index_type::feature_type) / + static const int32_t tile_size_ids{ + (int32_t)(tile_size_bytes / sizeof(typename index_type::id_type) / this->get_dimensions())}; static const tiledb_filter_type_t default_compression{ string_to_filter(storage_formats[version_]["default_attr_filters"])}; @@ -336,7 +336,7 @@ class ivf_pq_group : public base_index_group { cached_ctx_, this->ids_uri(), default_domain, - tile_size, + tile_size_ids, default_compression); tiledb_helpers::add_to_group( write_group, this->ids_uri(), this->ids_array_name()); diff --git a/src/include/index/vamana_group.h b/src/include/index/vamana_group.h index e9076ccb0..a4050a775 100644 --- a/src/include/index/vamana_group.h +++ b/src/include/index/vamana_group.h @@ -198,9 +198,20 @@ class vamana_index_group : public base_index_group { } this->init_valid_array_names(); - static const int32_t tile_size{ - (int32_t)(tile_size_bytes / sizeof(typename index_type::feature_type) / - this->get_dimensions())}; + static const int32_t tile_size{static_cast( + tile_size_bytes / sizeof(typename index_type::feature_type) / + this->get_dimensions())}; + static const int32_t tile_size_ids{static_cast( + tile_size_bytes / sizeof(typename index_type::id_type) / + this->get_dimensions())}; + static const int32_t tile_size_adjacency_row_index{static_cast( + tile_size_bytes / + sizeof(typename index_type::adjacency_row_index_type) / + this->get_dimensions())}; + static const int32_t tile_size_adjacency_scores{static_cast( + tile_size_bytes / sizeof(typename index_type::adjacency_scores_type) / + this->get_dimensions())}; + static const tiledb_filter_type_t default_compression{ string_to_filter(storage_formats[version_]["default_attr_filters"])}; @@ -270,7 +281,7 @@ class vamana_index_group : public base_index_group { cached_ctx_, this->ids_uri(), default_domain, - tile_size, + tile_size_ids, default_compression); tiledb_helpers::add_to_group( write_group, this->ids_uri(), this->ids_array_name()); @@ -279,7 +290,7 @@ class vamana_index_group : public base_index_group { cached_ctx_, adjacency_scores_uri(), default_domain, - tile_size, + tile_size_adjacency_scores, default_compression); tiledb_helpers::add_to_group( write_group, adjacency_scores_uri(), adjacency_scores_array_name()); @@ -288,7 +299,7 @@ class vamana_index_group : public base_index_group { cached_ctx_, adjacency_ids_uri(), default_domain, - tile_size, + tile_size_ids, default_compression); tiledb_helpers::add_to_group( write_group, adjacency_ids_uri(), adjacency_ids_array_name()); @@ -297,7 +308,7 @@ class vamana_index_group : public base_index_group { cached_ctx_, adjacency_row_index_uri(), default_domain, - tile_size, + tile_size_adjacency_row_index, default_compression); tiledb_helpers::add_to_group( write_group, From 75b59cb31c914c5b6957b7a9de0c02a4ef67c2f0 Mon Sep 17 00:00:00 2001 From: Paris Morgan Date: Fri, 5 Jul 2024 16:00:18 +0200 Subject: [PATCH 3/6] Set filter when we create vector arrays in C++ --- src/include/detail/linalg/tdb_io.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/include/detail/linalg/tdb_io.h b/src/include/detail/linalg/tdb_io.h index 95abbbf65..b99045205 100644 --- a/src/include/detail/linalg/tdb_io.h +++ b/src/include/detail/linalg/tdb_io.h @@ -252,17 +252,19 @@ void create_empty_for_vector( const tiledb::Context& ctx, const std::string& uri, size_t rows, - size_t row_extent, - std::optional filter = std::nullopt) { + int32_t row_extent, + tiledb_filter_type_t filter) { tiledb::Domain domain(ctx); - domain.add_dimensions(tiledb::Dimension::create( + domain.add_dimensions(tiledb::Dimension::create( ctx, "rows", {{0, std::max(0, (int)rows - 1)}}, row_extent)); - // The array will be dense. tiledb::ArraySchema schema(ctx, TILEDB_DENSE); schema.set_domain(domain).set_order({{TILEDB_COL_MAJOR, TILEDB_COL_MAJOR}}); - schema.add_attribute(tiledb::Attribute::create(ctx, "values")); + tiledb::FilterList filter_list(ctx); + filter_list.add_filter({ctx, filter}); + schema.add_attribute( + tiledb::Attribute::create(ctx, "values", filter_list)); tiledb::Array::create(uri, schema); } @@ -279,7 +281,7 @@ void create_vector( const tiledb::Context& ctx, const V& v, const std::string& uri, - std::optional filter = std::nullopt) { + tiledb_filter_type_t filter) { using value_type = std::ranges::range_value_t; size_t num_parts = 10; @@ -313,7 +315,7 @@ void write_vector( using value_type = std::remove_const_t>; if (create) { - create_vector(ctx, v, uri); + create_vector(ctx, v, uri, TILEDB_FILTER_NONE); } if (size(v) == 0) { From a671daae669ebd73c12112fe0b7086b8da64ed04 Mon Sep 17 00:00:00 2001 From: Paris Morgan Date: Fri, 5 Jul 2024 16:19:09 +0200 Subject: [PATCH 4/6] undo old changes --- .../src/tiledb/vector_search/flat_index.py | 5 ++-- .../tiledb/vector_search/ivf_flat_index.py | 5 ++-- src/include/index/ivf_pq_group.h | 6 ++--- src/include/index/vamana_group.h | 25 ++++++------------- 4 files changed, 16 insertions(+), 25 deletions(-) diff --git a/apis/python/src/tiledb/vector_search/flat_index.py b/apis/python/src/tiledb/vector_search/flat_index.py index 0f73ee71d..4d547f148 100644 --- a/apis/python/src/tiledb/vector_search/flat_index.py +++ b/apis/python/src/tiledb/vector_search/flat_index.py @@ -185,6 +185,7 @@ def create( ) with tiledb.scope_ctx(ctx_or_config=config): group = tiledb.Group(uri, "w") + tile_size = TILE_SIZE_BYTES / np.dtype(vector_type).itemsize / dimensions ids_array_name = storage_formats[storage_version]["IDS_ARRAY_NAME"] parts_array_name = storage_formats[storage_version]["PARTS_ARRAY_NAME"] updates_array_name = storage_formats[storage_version]["UPDATES_ARRAY_NAME"] @@ -195,7 +196,7 @@ def create( ids_array_rows_dim = tiledb.Dim( name="rows", domain=(0, MAX_INT32), - tile=int(TILE_SIZE_BYTES / np.dtype(np.uint64).itemsize / dimensions), + tile=tile_size, dtype=np.dtype(np.int32), ) ids_array_dom = tiledb.Domain(ids_array_rows_dim) @@ -223,7 +224,7 @@ def create( parts_array_cols_dim = tiledb.Dim( name="cols", domain=(0, MAX_INT32), - tile=int(TILE_SIZE_BYTES / np.dtype(vector_type).itemsize / dimensions), + tile=tile_size, dtype=np.dtype(np.int32), ) parts_array_dom = tiledb.Domain(parts_array_rows_dim, parts_array_cols_dim) diff --git a/apis/python/src/tiledb/vector_search/ivf_flat_index.py b/apis/python/src/tiledb/vector_search/ivf_flat_index.py index 717f7e2b7..25d7d528b 100644 --- a/apis/python/src/tiledb/vector_search/ivf_flat_index.py +++ b/apis/python/src/tiledb/vector_search/ivf_flat_index.py @@ -539,6 +539,7 @@ def create( ) with tiledb.scope_ctx(ctx_or_config=config): group = tiledb.Group(uri, "w") + tile_size = int(TILE_SIZE_BYTES / np.dtype(vector_type).itemsize / dimensions) group.meta["partition_history"] = json.dumps([0]) centroids_array_name = storage_formats[storage_version]["CENTROIDS_ARRAY_NAME"] index_array_name = storage_formats[storage_version]["INDEX_ARRAY_NAME"] @@ -606,7 +607,7 @@ def create( ids_array_rows_dim = tiledb.Dim( name="rows", domain=(0, MAX_INT32), - tile=int(TILE_SIZE_BYTES / np.dtype(np.uint64).itemsize / dimensions), + tile=tile_size, dtype=np.dtype(np.int32), ) ids_array_dom = tiledb.Domain(ids_array_rows_dim) @@ -634,7 +635,7 @@ def create( parts_array_cols_dim = tiledb.Dim( name="cols", domain=(0, MAX_INT32), - tile=int(TILE_SIZE_BYTES / np.dtype(vector_type).itemsize / dimensions), + tile=tile_size, dtype=np.dtype(np.int32), ) parts_array_dom = tiledb.Domain(parts_array_rows_dim, parts_array_cols_dim) diff --git a/src/include/index/ivf_pq_group.h b/src/include/index/ivf_pq_group.h index 3290800ff..4abfea049 100644 --- a/src/include/index/ivf_pq_group.h +++ b/src/include/index/ivf_pq_group.h @@ -245,8 +245,8 @@ class ivf_pq_group : public base_index_group { } this->init_valid_array_names(); - static const int32_t tile_size_ids{ - (int32_t)(tile_size_bytes / sizeof(typename index_type::id_type) / + static const int32_t tile_size{ + (int32_t)(tile_size_bytes / sizeof(typename index_type::feature_type) / this->get_dimensions())}; static const tiledb_filter_type_t default_compression{ string_to_filter(storage_formats[version_]["default_attr_filters"])}; @@ -336,7 +336,7 @@ class ivf_pq_group : public base_index_group { cached_ctx_, this->ids_uri(), default_domain, - tile_size_ids, + tile_size, default_compression); tiledb_helpers::add_to_group( write_group, this->ids_uri(), this->ids_array_name()); diff --git a/src/include/index/vamana_group.h b/src/include/index/vamana_group.h index a4050a775..e9076ccb0 100644 --- a/src/include/index/vamana_group.h +++ b/src/include/index/vamana_group.h @@ -198,20 +198,9 @@ class vamana_index_group : public base_index_group { } this->init_valid_array_names(); - static const int32_t tile_size{static_cast( - tile_size_bytes / sizeof(typename index_type::feature_type) / - this->get_dimensions())}; - static const int32_t tile_size_ids{static_cast( - tile_size_bytes / sizeof(typename index_type::id_type) / - this->get_dimensions())}; - static const int32_t tile_size_adjacency_row_index{static_cast( - tile_size_bytes / - sizeof(typename index_type::adjacency_row_index_type) / - this->get_dimensions())}; - static const int32_t tile_size_adjacency_scores{static_cast( - tile_size_bytes / sizeof(typename index_type::adjacency_scores_type) / - this->get_dimensions())}; - + static const int32_t tile_size{ + (int32_t)(tile_size_bytes / sizeof(typename index_type::feature_type) / + this->get_dimensions())}; static const tiledb_filter_type_t default_compression{ string_to_filter(storage_formats[version_]["default_attr_filters"])}; @@ -281,7 +270,7 @@ class vamana_index_group : public base_index_group { cached_ctx_, this->ids_uri(), default_domain, - tile_size_ids, + tile_size, default_compression); tiledb_helpers::add_to_group( write_group, this->ids_uri(), this->ids_array_name()); @@ -290,7 +279,7 @@ class vamana_index_group : public base_index_group { cached_ctx_, adjacency_scores_uri(), default_domain, - tile_size_adjacency_scores, + tile_size, default_compression); tiledb_helpers::add_to_group( write_group, adjacency_scores_uri(), adjacency_scores_array_name()); @@ -299,7 +288,7 @@ class vamana_index_group : public base_index_group { cached_ctx_, adjacency_ids_uri(), default_domain, - tile_size_ids, + tile_size, default_compression); tiledb_helpers::add_to_group( write_group, adjacency_ids_uri(), adjacency_ids_array_name()); @@ -308,7 +297,7 @@ class vamana_index_group : public base_index_group { cached_ctx_, adjacency_row_index_uri(), default_domain, - tile_size_adjacency_row_index, + tile_size, default_compression); tiledb_helpers::add_to_group( write_group, From d93c41b376e4562b369f78ae79582fc31fcecc55 Mon Sep 17 00:00:00 2001 From: Paris Morgan Date: Fri, 5 Jul 2024 16:19:32 +0200 Subject: [PATCH 5/6] add fix create_matrix() --- src/include/detail/linalg/tdb_io.h | 27 +++++++++---------- .../test/unit_api_feature_vector_array.cc | 8 +++++- src/include/test/unit_tdb_matrix.cc | 3 ++- src/include/test/unit_tdb_matrix_with_ids.cc | 3 ++- 4 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/include/detail/linalg/tdb_io.h b/src/include/detail/linalg/tdb_io.h index b99045205..30f4ed3f2 100644 --- a/src/include/detail/linalg/tdb_io.h +++ b/src/include/detail/linalg/tdb_io.h @@ -128,7 +128,10 @@ void create_empty_for_matrix( size_t cols, size_t row_extent, size_t col_extent, - std::optional filter = std::nullopt) { + tiledb_filter_type_t filter) { + tiledb::FilterList filter_list(ctx); + filter_list.add_filter({ctx, filter}); + tiledb::Domain domain(ctx); domain .add_dimensions(tiledb::Dimension::create( @@ -137,18 +140,13 @@ void create_empty_for_matrix( ctx, "cols", {{0, std::max(0, (int)cols - 1)}}, col_extent)); tiledb::ArraySchema schema(ctx, TILEDB_DENSE); - auto order = std::is_same_v ? TILEDB_ROW_MAJOR : TILEDB_COL_MAJOR; schema.set_domain(domain).set_order({{order, order}}); - schema.add_attribute(tiledb::Attribute::create(ctx, "values")); - - if (filter) { - tiledb::FilterList fl(ctx); - fl.add_filter(tiledb::Filter(ctx, *filter)); - schema.set_coords_filter_list(fl); - } + schema.add_attribute( + tiledb::Attribute::create(ctx, "values", filter_list)); + schema.set_coords_filter_list(filter_list); tiledb::Array::create(uri, schema); } @@ -161,7 +159,7 @@ void create_matrix( const tiledb::Context& ctx, const Matrix& A, const std::string& uri, - std::optional filter = std::nullopt) { + tiledb_filter_type_t filter) { // @todo: make this a parameter size_t num_parts = 10; @@ -201,7 +199,7 @@ void write_matrix( scoped_timer _{tdb_func__ + " " + std::string{uri}}; if (create) { - create_matrix(ctx, A, uri); + create_matrix(ctx, A, uri, TILEDB_FILTER_NONE); } if (A.num_rows() == 0 || A.num_cols() == 0) { @@ -254,17 +252,18 @@ void create_empty_for_vector( size_t rows, int32_t row_extent, tiledb_filter_type_t filter) { + tiledb::FilterList filter_list(ctx); + filter_list.add_filter({ctx, filter}); + tiledb::Domain domain(ctx); domain.add_dimensions(tiledb::Dimension::create( ctx, "rows", {{0, std::max(0, (int)rows - 1)}}, row_extent)); tiledb::ArraySchema schema(ctx, TILEDB_DENSE); schema.set_domain(domain).set_order({{TILEDB_COL_MAJOR, TILEDB_COL_MAJOR}}); - - tiledb::FilterList filter_list(ctx); - filter_list.add_filter({ctx, filter}); schema.add_attribute( tiledb::Attribute::create(ctx, "values", filter_list)); + schema.set_coords_filter_list(filter_list); tiledb::Array::create(uri, schema); } diff --git a/src/include/test/unit_api_feature_vector_array.cc b/src/include/test/unit_api_feature_vector_array.cc index b9bcd969c..58a1b0128 100644 --- a/src/include/test/unit_api_feature_vector_array.cc +++ b/src/include/test/unit_api_feature_vector_array.cc @@ -541,7 +541,13 @@ TEST_CASE("load empty matrix", "[api][index]") { } create_empty_for_matrix( - ctx, tmp_matrix_uri, dimension, domain, dimension, tile_extent); + ctx, + tmp_matrix_uri, + dimension, + domain, + dimension, + tile_extent, + TILEDB_FILTER_NONE); auto X = FeatureVectorArray(ctx, tmp_matrix_uri); } diff --git a/src/include/test/unit_tdb_matrix.cc b/src/include/test/unit_tdb_matrix.cc index 02626fca6..e556be61e 100644 --- a/src/include/test/unit_tdb_matrix.cc +++ b/src/include/test/unit_tdb_matrix.cc @@ -286,7 +286,8 @@ TEST_CASE("empty matrix", "[tdb_matrix]") { matrix_dimension, matrix_domain, matrix_dimension, - tile_extent); + tile_extent, + TILEDB_FILTER_NONE); { // No rows and no cols. diff --git a/src/include/test/unit_tdb_matrix_with_ids.cc b/src/include/test/unit_tdb_matrix_with_ids.cc index 826afb8dd..7165f5f73 100644 --- a/src/include/test/unit_tdb_matrix_with_ids.cc +++ b/src/include/test/unit_tdb_matrix_with_ids.cc @@ -260,7 +260,8 @@ TEST_CASE("empty matrix", "[tdb_matrix_with_ids]") { matrix_dimension, matrix_domain, matrix_dimension, - tile_extent); + tile_extent, + TILEDB_FILTER_NONE); create_empty_for_vector( ctx, tmp_ids_uri, matrix_domain, tile_extent); From c48b1c4e1acc69f43b87913e2bd0870b9f8518a2 Mon Sep 17 00:00:00 2001 From: Paris Morgan Date: Fri, 5 Jul 2024 16:29:10 +0200 Subject: [PATCH 6/6] fix build --- src/include/test/unit_tdb_matrix_with_ids.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/test/unit_tdb_matrix_with_ids.cc b/src/include/test/unit_tdb_matrix_with_ids.cc index 7165f5f73..70c50a07f 100644 --- a/src/include/test/unit_tdb_matrix_with_ids.cc +++ b/src/include/test/unit_tdb_matrix_with_ids.cc @@ -263,7 +263,7 @@ TEST_CASE("empty matrix", "[tdb_matrix_with_ids]") { tile_extent, TILEDB_FILTER_NONE); create_empty_for_vector( - ctx, tmp_ids_uri, matrix_domain, tile_extent); + ctx, tmp_ids_uri, matrix_domain, tile_extent, TILEDB_FILTER_NONE); { // Empty.