From 211182e0d2a5c4f0fa560c8e4049cbbc1e393dac Mon Sep 17 00:00:00 2001 From: Paris Morgan Date: Mon, 11 Dec 2023 12:25:32 +0100 Subject: [PATCH 1/4] run queries twice to test for idempotence --- apis/python/test/test_index.py | 126 +++++++++++++++++++-------------- 1 file changed, 72 insertions(+), 54 deletions(-) diff --git a/apis/python/test/test_index.py b/apis/python/test/test_index.py index 6ae4b2259..d343ac1a7 100644 --- a/apis/python/test/test_index.py +++ b/apis/python/test/test_index.py @@ -9,8 +9,9 @@ def test_flat_index(tmp_path): uri = os.path.join(tmp_path, "array") index = flat_index.create(uri=uri, dimensions=3, vector_type=np.dtype(np.uint8)) - result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) - assert {ind.MAX_UINT64} == set(result_i[0]) + for _ in range(2): + result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) + assert {ind.MAX_UINT64} == set(result_i[0]) update_vectors = np.empty([5], dtype=object) update_vectors[0] = np.array([0, 0, 0], dtype=np.dtype(np.uint8)) @@ -19,39 +20,47 @@ def test_flat_index(tmp_path): update_vectors[3] = np.array([3, 3, 3], dtype=np.dtype(np.uint8)) update_vectors[4] = np.array([4, 4, 4], dtype=np.dtype(np.uint8)) index.update_batch(vectors=update_vectors, external_ids=np.array([0, 1, 2, 3, 4])) - result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) - assert {1, 2, 3}.issubset(set(result_i[0])) + for _ in range(2): + result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) + assert {1, 2, 3}.issubset(set(result_i[0])) index = index.consolidate_updates() - result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) - assert {1, 2, 3}.issubset(set(result_i[0])) + for _ in range(2): + result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) + assert {1, 2, 3}.issubset(set(result_i[0])) index.delete_batch(external_ids=np.array([1, 3])) - result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) - assert {0, 2, 4}.issubset(set(result_i[0])) + for _ in range(2): + result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) + assert {0, 2, 4}.issubset(set(result_i[0])) index = index.consolidate_updates() - result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) - assert {0, 2, 4}.issubset(set(result_i[0])) + for _ in range(2): + result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) + assert {0, 2, 4}.issubset(set(result_i[0])) update_vectors = np.empty([2], dtype=object) update_vectors[0] = np.array([1, 1, 1], dtype=np.dtype(np.uint8)) update_vectors[1] = np.array([3, 3, 3], dtype=np.dtype(np.uint8)) index.update_batch(vectors=update_vectors, external_ids=np.array([1, 3])) - result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) - assert {1, 2, 3}.issubset(set(result_i[0])) + for _ in range(2): + result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) + assert {1, 2, 3}.issubset(set(result_i[0])) index = index.consolidate_updates() - result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) - assert {1, 2, 3}.issubset(set(result_i[0])) + for _ in range(2): + result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) + assert {1, 2, 3}.issubset(set(result_i[0])) index.delete_batch(external_ids=np.array([1, 3])) - result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) - assert {0, 2, 4}.issubset(set(result_i[0])) + for _ in range(2): + result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) + assert {0, 2, 4}.issubset(set(result_i[0])) index = index.consolidate_updates() - result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) - assert {0, 2, 4}.issubset(set(result_i[0])) + for _ in range(2): + result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) + assert {0, 2, 4}.issubset(set(result_i[0])) def test_ivf_flat_index(tmp_path): @@ -60,10 +69,11 @@ def test_ivf_flat_index(tmp_path): index = ivf_flat_index.create( uri=uri, dimensions=3, vector_type=np.dtype(np.uint8), partitions=partitions ) - result_d, result_i = index.query( - np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions - ) - assert {ind.MAX_UINT64} == set(result_i[0]) + for _ in range(2): + result_d, result_i = index.query( + np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions + ) + assert {ind.MAX_UINT64} == set(result_i[0]) update_vectors = np.empty([5], dtype=object) update_vectors[0] = np.array([0, 0, 0], dtype=np.dtype(np.uint8)) @@ -72,52 +82,60 @@ def test_ivf_flat_index(tmp_path): update_vectors[3] = np.array([3, 3, 3], dtype=np.dtype(np.uint8)) update_vectors[4] = np.array([4, 4, 4], dtype=np.dtype(np.uint8)) index.update_batch(vectors=update_vectors, external_ids=np.array([0, 1, 2, 3, 4])) - result_d, result_i = index.query( - np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions - ) - assert {1, 2, 3}.issubset(set(result_i[0])) + for _ in range(2): + result_d, result_i = index.query( + np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions + ) + assert {1, 2, 3}.issubset(set(result_i[0])) index = index.consolidate_updates() - result_d, result_i = index.query( - np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions - ) - assert {1, 2, 3}.issubset(set(result_i[0])) + for _ in range(2): + result_d, result_i = index.query( + np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions + ) + assert {1, 2, 3}.issubset(set(result_i[0])) index.delete_batch(external_ids=np.array([1, 3])) - result_d, result_i = index.query( - np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions - ) - assert {0, 2, 4}.issubset(set(result_i[0])) + for _ in range(2): + result_d, result_i = index.query( + np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions + ) + assert {0, 2, 4}.issubset(set(result_i[0])) index = index.consolidate_updates() - result_d, result_i = index.query( - np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions - ) - assert {0, 2, 4}.issubset(set(result_i[0])) + for _ in range(2): + result_d, result_i = index.query( + np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions + ) + assert {0, 2, 4}.issubset(set(result_i[0])) update_vectors = np.empty([2], dtype=object) update_vectors[0] = np.array([1, 1, 1], dtype=np.dtype(np.uint8)) update_vectors[1] = np.array([3, 3, 3], dtype=np.dtype(np.uint8)) index.update_batch(vectors=update_vectors, external_ids=np.array([1, 3])) - result_d, result_i = index.query( - np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions - ) - assert {1, 2, 3}.issubset(set(result_i[0])) + for _ in range(2): + result_d, result_i = index.query( + np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions + ) + assert {1, 2, 3}.issubset(set(result_i[0])) index = index.consolidate_updates() - result_d, result_i = index.query( - np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions - ) - assert {1, 2, 3}.issubset(set(result_i[0])) + for _ in range(2): + result_d, result_i = index.query( + np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions + ) + assert {1, 2, 3}.issubset(set(result_i[0])) index.delete_batch(external_ids=np.array([1, 3])) - result_d, result_i = index.query( - np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions - ) - assert {0, 2, 4}.issubset(set(result_i[0])) + for _ in range(2): + result_d, result_i = index.query( + np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions + ) + assert {0, 2, 4}.issubset(set(result_i[0])) index = index.consolidate_updates() - result_d, result_i = index.query( - np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions - ) - assert {0, 2, 4}.issubset(set(result_i[0])) + for _ in range(2): + result_d, result_i = index.query( + np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions + ) + assert {0, 2, 4}.issubset(set(result_i[0])) From 57bb6e11e8e65b38b23f24b4640a5f70b8369edc Mon Sep 17 00:00:00 2001 From: Paris Morgan Date: Mon, 11 Dec 2023 21:31:20 +0100 Subject: [PATCH 2/4] refactor query and check into helper function --- apis/python/test/test_index.py | 90 ++++++++-------------------------- 1 file changed, 21 insertions(+), 69 deletions(-) diff --git a/apis/python/test/test_index.py b/apis/python/test/test_index.py index d343ac1a7..bd47d6f12 100644 --- a/apis/python/test/test_index.py +++ b/apis/python/test/test_index.py @@ -5,6 +5,10 @@ from tiledb.vector_search import flat_index, ivf_flat_index from tiledb.vector_search.index import Index +def query_and_check(index, queries, k, expected, **kwargs): + for _ in range(5): + result_d, result_i = index.query(queries, k=k, **kwargs) + assert expected.issubset(set(result_i[0])) def test_flat_index(tmp_path): uri = os.path.join(tmp_path, "array") @@ -20,47 +24,31 @@ def test_flat_index(tmp_path): update_vectors[3] = np.array([3, 3, 3], dtype=np.dtype(np.uint8)) update_vectors[4] = np.array([4, 4, 4], dtype=np.dtype(np.uint8)) index.update_batch(vectors=update_vectors, external_ids=np.array([0, 1, 2, 3, 4])) - for _ in range(2): - result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) - assert {1, 2, 3}.issubset(set(result_i[0])) + query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {1, 2, 3}) index = index.consolidate_updates() - for _ in range(2): - result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) - assert {1, 2, 3}.issubset(set(result_i[0])) + query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {1, 2, 3}) index.delete_batch(external_ids=np.array([1, 3])) - for _ in range(2): - result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) - assert {0, 2, 4}.issubset(set(result_i[0])) + query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {0, 2, 4}) index = index.consolidate_updates() - for _ in range(2): - result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) - assert {0, 2, 4}.issubset(set(result_i[0])) + query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {0, 2, 4}) update_vectors = np.empty([2], dtype=object) update_vectors[0] = np.array([1, 1, 1], dtype=np.dtype(np.uint8)) update_vectors[1] = np.array([3, 3, 3], dtype=np.dtype(np.uint8)) index.update_batch(vectors=update_vectors, external_ids=np.array([1, 3])) - for _ in range(2): - result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) - assert {1, 2, 3}.issubset(set(result_i[0])) + query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {1, 2, 3}) index = index.consolidate_updates() - for _ in range(2): - result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) - assert {1, 2, 3}.issubset(set(result_i[0])) + query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {1, 2, 3}) index.delete_batch(external_ids=np.array([1, 3])) - for _ in range(2): - result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) - assert {0, 2, 4}.issubset(set(result_i[0])) + query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {0, 2, 4}) index = index.consolidate_updates() - for _ in range(2): - result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) - assert {0, 2, 4}.issubset(set(result_i[0])) + query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {0, 2, 4}) def test_ivf_flat_index(tmp_path): @@ -69,11 +57,7 @@ def test_ivf_flat_index(tmp_path): index = ivf_flat_index.create( uri=uri, dimensions=3, vector_type=np.dtype(np.uint8), partitions=partitions ) - for _ in range(2): - result_d, result_i = index.query( - np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions - ) - assert {ind.MAX_UINT64} == set(result_i[0]) + query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {ind.MAX_UINT64}, nprobe=partitions) update_vectors = np.empty([5], dtype=object) update_vectors[0] = np.array([0, 0, 0], dtype=np.dtype(np.uint8)) @@ -82,60 +66,28 @@ def test_ivf_flat_index(tmp_path): update_vectors[3] = np.array([3, 3, 3], dtype=np.dtype(np.uint8)) update_vectors[4] = np.array([4, 4, 4], dtype=np.dtype(np.uint8)) index.update_batch(vectors=update_vectors, external_ids=np.array([0, 1, 2, 3, 4])) - for _ in range(2): - result_d, result_i = index.query( - np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions - ) - assert {1, 2, 3}.issubset(set(result_i[0])) + query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {1, 2, 3}, nprobe=partitions) index = index.consolidate_updates() - for _ in range(2): - result_d, result_i = index.query( - np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions - ) - assert {1, 2, 3}.issubset(set(result_i[0])) + query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {1, 2, 3}, nprobe=partitions) index.delete_batch(external_ids=np.array([1, 3])) - for _ in range(2): - result_d, result_i = index.query( - np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions - ) - assert {0, 2, 4}.issubset(set(result_i[0])) + query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {0, 2, 4}, nprobe=partitions) index = index.consolidate_updates() - for _ in range(2): - result_d, result_i = index.query( - np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions - ) - assert {0, 2, 4}.issubset(set(result_i[0])) + query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {0, 2, 4}, nprobe=partitions) update_vectors = np.empty([2], dtype=object) update_vectors[0] = np.array([1, 1, 1], dtype=np.dtype(np.uint8)) update_vectors[1] = np.array([3, 3, 3], dtype=np.dtype(np.uint8)) index.update_batch(vectors=update_vectors, external_ids=np.array([1, 3])) - for _ in range(2): - result_d, result_i = index.query( - np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions - ) - assert {1, 2, 3}.issubset(set(result_i[0])) + query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {1, 2, 3}, nprobe=partitions) index = index.consolidate_updates() - for _ in range(2): - result_d, result_i = index.query( - np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions - ) - assert {1, 2, 3}.issubset(set(result_i[0])) + query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {1, 2, 3}, nprobe=partitions) index.delete_batch(external_ids=np.array([1, 3])) - for _ in range(2): - result_d, result_i = index.query( - np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions - ) - assert {0, 2, 4}.issubset(set(result_i[0])) + query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {0, 2, 4}, nprobe=partitions) index = index.consolidate_updates() - for _ in range(2): - result_d, result_i = index.query( - np.array([[2, 2, 2]], dtype=np.float32), k=3, nprobe=partitions - ) - assert {0, 2, 4}.issubset(set(result_i[0])) + query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {0, 2, 4}, nprobe=partitions) From 86e5d3a7695e7dd51d8cc0b5ff409484b3d2e205 Mon Sep 17 00:00:00 2001 From: Paris Morgan Date: Mon, 11 Dec 2023 21:32:14 +0100 Subject: [PATCH 3/4] 3 runs of query --- apis/python/test/test_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apis/python/test/test_index.py b/apis/python/test/test_index.py index bd47d6f12..3077ea1aa 100644 --- a/apis/python/test/test_index.py +++ b/apis/python/test/test_index.py @@ -6,7 +6,7 @@ from tiledb.vector_search.index import Index def query_and_check(index, queries, k, expected, **kwargs): - for _ in range(5): + for _ in range(3): result_d, result_i = index.query(queries, k=k, **kwargs) assert expected.issubset(set(result_i[0])) From 3f0b3b72aa9927fb974557f3ee2b2815662e4383 Mon Sep 17 00:00:00 2001 From: Paris Morgan Date: Mon, 11 Dec 2023 21:33:26 +0100 Subject: [PATCH 4/4] add missing query() --- apis/python/test/test_index.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/apis/python/test/test_index.py b/apis/python/test/test_index.py index 3077ea1aa..6974a9432 100644 --- a/apis/python/test/test_index.py +++ b/apis/python/test/test_index.py @@ -13,9 +13,7 @@ def query_and_check(index, queries, k, expected, **kwargs): def test_flat_index(tmp_path): uri = os.path.join(tmp_path, "array") index = flat_index.create(uri=uri, dimensions=3, vector_type=np.dtype(np.uint8)) - for _ in range(2): - result_d, result_i = index.query(np.array([[2, 2, 2]], dtype=np.float32), k=3) - assert {ind.MAX_UINT64} == set(result_i[0]) + query_and_check(index, np.array([[2, 2, 2]], dtype=np.float32), 3, {ind.MAX_UINT64}) update_vectors = np.empty([5], dtype=object) update_vectors[0] = np.array([0, 0, 0], dtype=np.dtype(np.uint8))