From 748f2664840388b47bdb5ef252ebb232c6de948a Mon Sep 17 00:00:00 2001
From: Agisilaos Kounelis
Date: Tue, 11 Feb 2025 03:34:38 +0200
Subject: [PATCH 1/2] Consider Current Domain

---
 apis/python/src/tiledb/vector_search/ingestion.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/apis/python/src/tiledb/vector_search/ingestion.py b/apis/python/src/tiledb/vector_search/ingestion.py
index b5652be62..794f6c930 100644
--- a/apis/python/src/tiledb/vector_search/ingestion.py
+++ b/apis/python/src/tiledb/vector_search/ingestion.py
@@ -412,12 +412,20 @@ def read_source_metadata(
     if source_type == "TILEDB_ARRAY":
         schema = tiledb.ArraySchema.load(source_uri)
         size = np.int64(schema.domain.dim(1).domain[1]) + 1
-        dimensions = np.int64(schema.domain.dim(0).domain[1]) + 1
+        # If the array has a non-empty current domain, derive the dimensions from it.
+        if hasattr(schema, "current_domain") and not schema.current_domain.is_empty:
+            dimensions = np.int64(schema.current_domain.ndrectangle.range(0)[1]) + 1
+        else:
+            dimensions = np.int64(schema.domain.dim(0).domain[1]) + 1
         return size, dimensions, schema.attr(0).dtype
     if source_type == "TILEDB_SPARSE_ARRAY":
         schema = tiledb.ArraySchema.load(source_uri)
         size = np.int64(schema.domain.dim(0).domain[1]) + 1
-        dimensions = np.int64(schema.domain.dim(1).domain[1]) + 1
+        # If the array has a non-empty current domain, derive the dimensions from it.
+        if hasattr(schema, "current_domain") and not schema.current_domain.is_empty:
+            dimensions = np.int64(schema.current_domain.ndrectangle.range(1)[1]) + 1
+        else:
+            dimensions = np.int64(schema.domain.dim(1).domain[1]) + 1
         return size, dimensions, schema.attr(0).dtype
     if source_type == "TILEDB_PARTITIONED_ARRAY":
         with tiledb.open(source_uri, "r", config=config) as source_array:

From efd6d462c350f0e92b516913708a8019eaa743ec Mon Sep 17 00:00:00 2001
From: Agisilaos Kounelis
Date: Tue, 11 Feb 2025 03:34:41 +0200
Subject: [PATCH 2/2] Modify test

---
 apis/python/test/test_ingestion.py | 127 ++++++++++++++++++++++++++++------
 1 file changed, 111 insertions(+), 16 deletions(-)

diff --git a/apis/python/test/test_ingestion.py b/apis/python/test/test_ingestion.py
index 889d51e75..2cd6b8758 100644
--- a/apis/python/test/test_ingestion.py
+++ b/apis/python/test/test_ingestion.py
@@ -1536,23 +1536,24 @@ def test_ivf_flat_copy_centroids_uri(tmp_path):
     centroids = np.array([[1, 1, 1, 1], [2, 2, 2, 2]], dtype=np.float32)
     centroids_in_size = centroids.shape[0]
     dimensions = centroids.shape[1]
+    domain = tiledb.Domain(
+        *[
+            tiledb.Dim(
+                name="rows",
+                domain=(0, dimensions - 1),
+                tile=dimensions,
+                dtype=np.dtype(np.int32),
+            ),
+            tiledb.Dim(
+                name="cols",
+                domain=(0, np.iinfo(np.dtype("int32")).max),
+                tile=100000,
+                dtype=np.dtype(np.int32),
+            ),
+        ]
+    )
     schema = tiledb.ArraySchema(
-        domain=tiledb.Domain(
-            *[
-                tiledb.Dim(
-                    name="rows",
-                    domain=(0, dimensions - 1),
-                    tile=dimensions,
-                    dtype=np.dtype(np.int32),
-                ),
-                tiledb.Dim(
-                    name="cols",
-                    domain=(0, np.iinfo(np.dtype("int32")).max),
-                    tile=100000,
-                    dtype=np.dtype(np.int32),
-                ),
-            ]
-        ),
+        domain=domain,
         sparse=False,
         attrs=[
             tiledb.Attr(
@@ -1570,6 +1571,20 @@ def test_ivf_flat_copy_centroids_uri(tmp_path):
     with tiledb.open(centroids_uri, mode="w", timestamp=index_timestamp) as A:
         A[0:dimensions, 0:centroids_in_size] = centroids.transpose()
 
+    ctx = tiledb.Ctx()
+    ndrect = tiledb.NDRectangle(ctx, domain)
+    rows_range = (0, dimensions - 1)
+    cols_range = (0, centroids_in_size - 1)
+    ndrect.set_range(0, rows_range[0], rows_range[1])
+    ndrect.set_range(1, cols_range[0], cols_range[1])
+
+    current_domain = tiledb.CurrentDomain(ctx)
+    current_domain.set_ndrectangle(ndrect)
+    # The array already exists, so persist its current domain via schema evolution.
+    se = tiledb.ArraySchemaEvolution(ctx)
+    se.expand_current_domain(current_domain)
+    se.array_evolve(centroids_uri)
+
     # Create the index.
     index_uri = os.path.join(tmp_path, "array")
     index = ingest(
@@ -2010,3 +2025,83 @@ def test_ivf_flat_taskgraph_query(tmp_path):
         queries, k=k, nprobe=nprobe, nthreads=8, mode=Mode.LOCAL, num_partitions=10
     )
     assert accuracy(result, gt_i) > MINIMUM_ACCURACY
+
+
+# def test_ingestion_current_domain(tmp_path):
+#     # ################################################################################################
+#     # # First set up the data.
+#     # ################################################################################################
+#     # data = np.array(
+#     #     [
+#     #         [1.0, 1.1, 1.2, 1.3],
+#     #         [2.0, 2.1, 2.2, 2.3],
+#     #         [3.0, 3.1, 3.2, 3.3],
+#     #         [4.0, 4.1, 4.2, 4.3],
+#     #         [5.0, 5.1, 5.2, 5.3],
+#     #     ],
+#     #     dtype=np.float32,
+#     # )
+#     # training_data = data[1:3]
+
+#     # ################################################################################################
+#     # # Test we can ingest, query, update, and consolidate.
+#     # ################################################################################################
+#     # index_uri = os.path.join(tmp_path, "array")
+#     # index = ingest(
+#     #     index_type="IVF_FLAT",
+#     #     index_uri=index_uri,
+#     #     input_vectors=data,
+#     #     training_input_vectors=training_data,
+#     # )

+#     # ======

+#     dimensions = 128
+#     schema = tiledb.ArraySchema(
+#         domain=tiledb.Domain(
+#             *[
+#                 tiledb.Dim(
+#                     name="rows",
+#                     domain=(0, dimensions - 1),
+#                     tile=dimensions,
+#                     dtype=np.dtype(np.int32),
+#                 ),
+#                 tiledb.Dim(
+#                     name="cols",
+#                     domain=(0, np.iinfo(np.dtype("int32")).max),
+#                     tile=100000,
+#                     dtype=np.dtype(np.int32),
+#                 ),
+#             ]
+#         ),
+#         sparse=False,
+#         attrs=[
+#             tiledb.Attr(
+#                 name="attr",
+#                 dtype="float32",
+#                 filters=tiledb.FilterList([tiledb.ZstdFilter()]),
+#             )
+#         ],
+#         cell_order="col-major",
+#         tile_order="col-major",
+#     )
+#     uri = os.path.join(tmp_path, "array")
+#     tiledb.Array.create(uri, schema)

+#     index_timestamp = int(time.time() * 1000)
+#     with tiledb.open(uri, mode="w", timestamp=index_timestamp) as A:
+#         A[0:dimensions, 0:dimensions] = np.random.rand(dimensions, dimensions).astype(
+#             np.float32
+#         )

+#     data = np.random.rand(1000, dimensions).astype(np.float32)

+#     # Create the index.
+#     index_uri = os.path.join(tmp_path, "array")
+#     index = ingest(
+#         index_type="IVF_FLAT",
+#         index_uri=index_uri,
+#         input_vectors=data,
+#         copy_centroids_uri=uri,
+#         partitions=centroids_in_size,
+#     )
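
For reference, below is a minimal standalone sketch (not part of either patch) of the behavior PATCH 1/2 relies on: when a dense vector array carries a non-empty current domain, the vector dimensionality is derived from the current domain instead of the much larger full array domain. It assumes a TileDB / tiledb-py release with current-domain support for dense arrays; the attribute name, temporary URI, extent sizes, and the current_domain.ndrectangle.range() accessor are illustrative assumptions, not something the patches themselves confirm.

import os
import tempfile

import numpy as np
import tiledb

ctx = tiledb.Ctx()

# The full domain allows up to 1024 "rows" (vector components), but the vectors
# we intend to store are only 128-dimensional.
dom = tiledb.Domain(
    tiledb.Dim(name="rows", domain=(0, 1023), tile=128, dtype=np.int32),
    tiledb.Dim(name="cols", domain=(0, np.iinfo(np.int32).max), tile=100000, dtype=np.int32),
    ctx=ctx,
)
schema = tiledb.ArraySchema(
    domain=dom,
    sparse=False,
    attrs=[tiledb.Attr(name="values", dtype=np.float32)],
    ctx=ctx,
)

# Restrict the usable extent with a current domain before creating the array.
ndrect = tiledb.NDRectangle(ctx, dom)
ndrect.set_range(0, 0, 127)  # 128 vector components
ndrect.set_range(1, 0, 999)  # room for 1000 vectors
current_domain = tiledb.CurrentDomain(ctx)
current_domain.set_ndrectangle(ndrect)
schema.set_current_domain(current_domain)

uri = os.path.join(tempfile.mkdtemp(), "vectors")
tiledb.Array.create(uri, schema)

# Mirror of the dimension logic in read_source_metadata() after PATCH 1/2.
loaded = tiledb.ArraySchema.load(uri)
if hasattr(loaded, "current_domain") and not loaded.current_domain.is_empty:
    dimensions = np.int64(loaded.current_domain.ndrectangle.range(0)[1]) + 1
else:
    dimensions = np.int64(loaded.domain.dim(0).domain[1]) + 1

print(dimensions)  # 128 (from the current domain), not 1024 (from the full domain)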