diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index b7165088328..7d135d8622f 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -730,20 +730,20 @@ fn inner_release_native_dataset(env: &mut JNIEnv, obj: JObject) -> Result<()> { } #[no_mangle] -pub extern "system" fn Java_org_lance_Dataset_nativeCreateIndex( - mut env: JNIEnv, - java_dataset: JObject, - columns_jobj: JObject, // List +pub extern "system" fn Java_org_lance_Dataset_nativeCreateIndex<'local>( + mut env: JNIEnv<'local>, + java_dataset: JObject<'local>, + columns_jobj: JObject<'local>, // List index_type_code_jobj: jint, - name_jobj: JObject, // Optional - params_jobj: JObject, // IndexParams - replace_jobj: jboolean, // replace - train_jobj: jboolean, // train - fragments_jobj: JObject, // List - index_uuid_jobj: JObject, // String - arrow_stream_addr_jobj: JObject, // Optional -) { - ok_or_throw_without_return!( + name_jobj: JObject<'local>, // Optional + params_jobj: JObject<'local>, // IndexParams + replace_jobj: jboolean, // replace + train_jobj: jboolean, // train + fragments_jobj: JObject<'local>, // List + index_uuid_jobj: JObject<'local>, // String + arrow_stream_addr_jobj: JObject<'local>, // Optional +) -> JObject<'local> { + ok_or_throw!( env, inner_create_index( &mut env, @@ -758,23 +758,23 @@ pub extern "system" fn Java_org_lance_Dataset_nativeCreateIndex( index_uuid_jobj, arrow_stream_addr_jobj, ) - ); + ) } #[allow(clippy::too_many_arguments)] -fn inner_create_index( - env: &mut JNIEnv, - java_dataset: JObject, - columns_jobj: JObject, // List +fn inner_create_index<'local>( + env: &mut JNIEnv<'local>, + java_dataset: JObject<'local>, + columns_jobj: JObject<'local>, // List index_type_code_jobj: jint, - name_jobj: JObject, // Optional - params_jobj: JObject, // IndexParams - replace_jobj: jboolean, // replace - train_jobj: jboolean, // train - fragments_jobj: JObject, // Optional> - index_uuid_jobj: JObject, // Optional - arrow_stream_addr_jobj: JObject, // Optional -) -> Result<()> { + name_jobj: JObject<'local>, // Optional + params_jobj: JObject<'local>, // IndexParams + replace_jobj: jboolean, // replace + train_jobj: jboolean, // train + fragments_jobj: JObject<'local>, // Optional> + index_uuid_jobj: JObject<'local>, // Optional + arrow_stream_addr_jobj: JObject<'local>, // Optional +) -> Result> { let columns = env.get_strings(&columns_jobj)?; let index_type = IndexType::try_from(index_type_code_jobj)?; let name = env.get_string_opt(&name_jobj)?; @@ -837,38 +837,43 @@ fn inner_create_index( }; let params = params_result?; - let mut dataset_guard = - unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; - let mut index_builder = dataset_guard - .inner - .create_index_builder(&columns_slice, index_type, params.as_ref()) - .replace(replace) - .train(train); + // Execute index creation in a block to ensure dataset_guard is dropped + // before we call into_java (which needs to borrow env again) + let index_metadata = { + let mut dataset_guard = + unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; + + let mut index_builder = dataset_guard + .inner + .create_index_builder(&columns_slice, index_type, params.as_ref()) + .replace(replace) + .train(train); - if let Some(name) = name { - index_builder = index_builder.name(name); - } + if let Some(name) = name { + index_builder = index_builder.name(name); + } - if let Some(fragment_ids) = fragment_ids { - index_builder = index_builder.fragments(fragment_ids); - } + if let Some(fragment_ids) = fragment_ids { + index_builder = index_builder.fragments(fragment_ids); + } - if let Some(index_uuid) = index_uuid { - index_builder = index_builder.index_uuid(index_uuid); - } + if let Some(index_uuid) = index_uuid { + index_builder = index_builder.index_uuid(index_uuid); + } - if let Some(reader) = batch_reader { - index_builder = index_builder.preprocessed_data(Box::new(reader)); - } + if let Some(reader) = batch_reader { + index_builder = index_builder.preprocessed_data(Box::new(reader)); + } - if skip_commit { - RT.block_on(index_builder.execute_uncommitted())?; - } else { - RT.block_on(index_builder.into_future())? - } + if skip_commit { + RT.block_on(index_builder.execute_uncommitted())? + } else { + RT.block_on(index_builder.into_future())? + } + }; - Ok(()) + (&index_metadata).into_java(env) } fn should_skip_commit(index_type: IndexType, params_opt: &Option) -> Result { diff --git a/java/src/main/java/org/lance/Dataset.java b/java/src/main/java/org/lance/Dataset.java index eb66702ddf5..22c461b8b65 100644 --- a/java/src/main/java/org/lance/Dataset.java +++ b/java/src/main/java/org/lance/Dataset.java @@ -765,16 +765,17 @@ public void restore() { * @param name the name of the created index * @param params index params * @param replace whether to replace the existing index + * @return the metadata of the created index * @deprecated please use {@link Dataset#createIndex(IndexOptions)} instead. */ @Deprecated - public void createIndex( + public Index createIndex( List columns, IndexType indexType, Optional name, IndexParams params, boolean replace) { - createIndex( + return createIndex( IndexOptions.builder(columns, indexType, params) .replace(replace) .withIndexName(name.orElse(null)) @@ -785,11 +786,12 @@ public void createIndex( * Creates a new index on the dataset. * * @param options options for building index + * @return the metadata of the created index */ - public void createIndex(IndexOptions options) { + public Index createIndex(IndexOptions options) { try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) { Preconditions.checkArgument(nativeDatasetHandle != 0, "Dataset is closed"); - nativeCreateIndex( + return nativeCreateIndex( options.getColumns(), options.getIndexType().ordinal(), options.getIndexName(), @@ -802,7 +804,7 @@ public void createIndex(IndexOptions options) { } } - private native void nativeCreateIndex( + private native Index nativeCreateIndex( List columns, int indexTypeCode, Optional name, diff --git a/java/src/test/java/org/lance/ScalarIndexTest.java b/java/src/test/java/org/lance/ScalarIndexTest.java index fa18671323e..61ae66c94d2 100644 --- a/java/src/test/java/org/lance/ScalarIndexTest.java +++ b/java/src/test/java/org/lance/ScalarIndexTest.java @@ -53,6 +53,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; public class ScalarIndexTest { @@ -79,12 +80,18 @@ public void testCreateBTreeIndex() throws Exception { IndexParams indexParams = IndexParams.builder().setScalarIndexParams(scalarParams).build(); // Create BTree index on 'id' column - dataset.createIndex( - Collections.singletonList("id"), - IndexType.BTREE, - Optional.of("btree_id_index"), - indexParams, - true); + Index index = + dataset.createIndex( + Collections.singletonList("id"), + IndexType.BTREE, + Optional.of("btree_id_index"), + indexParams, + true); + + // Verify the returned Index object + assertEquals("btree_id_index", index.name()); + assertNotNull(index.uuid()); + assertFalse(index.fields().isEmpty()); // Verify index was created and is in the list assertTrue( @@ -343,12 +350,17 @@ public void testCreateZonemapIndex() throws Exception { IndexParams indexParams = IndexParams.builder().setScalarIndexParams(scalarParams).build(); // Create Zonemap index on 'value' column - dataset.createIndex( - Collections.singletonList("value"), - IndexType.ZONEMAP, - Optional.of("zonemap_value_index"), - indexParams, - true); + Index index = + dataset.createIndex( + Collections.singletonList("value"), + IndexType.ZONEMAP, + Optional.of("zonemap_value_index"), + indexParams, + true); + + // Verify the returned Index object + assertEquals("zonemap_value_index", index.name()); + assertNotNull(index.uuid()); // Verify index was created assertTrue( diff --git a/python/python/tests/test_column_names.py b/python/python/tests/test_column_names.py index 3b9b646d5f6..5ec38085896 100644 --- a/python/python/tests/test_column_names.py +++ b/python/python/tests/test_column_names.py @@ -87,6 +87,7 @@ def test_scalar_index_with_mixed_case(self, mixed_case_dataset): indices = mixed_case_dataset.list_indices() assert len(indices) == 1 assert indices[0]["fields"] == ["userId"] + assert indices[0]["name"] == "userId_idx" # Query using the indexed column result = mixed_case_dataset.to_table(filter="userId = 50") @@ -96,6 +97,9 @@ def test_scalar_index_with_mixed_case(self, mixed_case_dataset): plan = mixed_case_dataset.scanner(filter="userId = 50").explain_plan() assert "ScalarIndexQuery" in plan + stats = mixed_case_dataset.stats.index_stats("userId_idx") + assert stats["index_type"] == "BTree" + def test_alter_column_with_mixed_case(self, mixed_case_dataset): """Altering columns works with mixed-case column names.""" # alter_columns uses direct schema lookup, not SQL parsing @@ -347,6 +351,7 @@ def test_scalar_index_with_special_chars(self, special_char_dataset): assert len(indices) == 1 # Field with special chars is returned in quoted format for SQL compatibility assert indices[0]["fields"] == ["`user-id`"] + assert indices[0]["name"] == "user-id_idx" # Query using the indexed column (requires backticks in filter) result = special_char_dataset.to_table(filter="`user-id` = 50") @@ -356,6 +361,9 @@ def test_scalar_index_with_special_chars(self, special_char_dataset): plan = special_char_dataset.scanner(filter="`user-id` = 50").explain_plan() assert "ScalarIndexQuery" in plan + stats = special_char_dataset.stats.index_stats("user-id_idx") + assert stats["index_type"] == "BTree" + def test_alter_column_with_special_chars(self, special_char_dataset): """Altering columns works with special character column names.""" # alter_columns uses direct schema lookup @@ -455,6 +463,7 @@ def test_scalar_index_with_nested_mixed_case(self, nested_mixed_case_dataset): indices = nested_mixed_case_dataset.list_indices() assert len(indices) == 1 assert indices[0]["fields"] == ["MetaData.userId"] + assert indices[0]["name"] == "MetaData.userId_idx" # Query using the indexed column result = nested_mixed_case_dataset.to_table(filter="MetaData.userId = 50") @@ -466,6 +475,9 @@ def test_scalar_index_with_nested_mixed_case(self, nested_mixed_case_dataset): ).explain_plan() assert "ScalarIndexQuery" in plan + stats = nested_mixed_case_dataset.stats.index_stats("MetaData.userId_idx") + assert stats["index_type"] == "BTree" + def test_scalar_index_on_top_level_mixed_case(self, nested_mixed_case_dataset): """Scalar index on top-level mixed-case column works.""" nested_mixed_case_dataset.create_scalar_index("rowId", index_type="BTREE") @@ -473,6 +485,7 @@ def test_scalar_index_on_top_level_mixed_case(self, nested_mixed_case_dataset): indices = nested_mixed_case_dataset.list_indices() assert len(indices) == 1 assert indices[0]["fields"] == ["rowId"] + assert indices[0]["name"] == "rowId_idx" result = nested_mixed_case_dataset.to_table(filter="rowId = 50") assert result.num_rows == 1 @@ -480,6 +493,9 @@ def test_scalar_index_on_top_level_mixed_case(self, nested_mixed_case_dataset): plan = nested_mixed_case_dataset.scanner(filter="rowId = 50").explain_plan() assert "ScalarIndexQuery" in plan + stats = nested_mixed_case_dataset.stats.index_stats("rowId_idx") + assert stats["index_type"] == "BTree" + def test_scalar_index_with_lowercased_nested_path(self, nested_mixed_case_dataset): """Scalar index creation should work even when path is lowercased. @@ -562,6 +578,7 @@ def test_scalar_index_with_nested_special_chars(self, nested_special_char_datase assert len(indices) == 1 # Fields with special chars are returned in quoted format for SQL compatibility assert indices[0]["fields"] == ["`meta-data`.`user-id`"] + assert indices[0]["name"] == "meta-data.user-id_idx" # Query using the indexed column (backticks required in filter) result = nested_special_char_dataset.to_table( @@ -575,6 +592,9 @@ def test_scalar_index_with_nested_special_chars(self, nested_special_char_datase ).explain_plan() assert "ScalarIndexQuery" in plan + stats = nested_special_char_dataset.stats.index_stats("meta-data.user-id_idx") + assert stats["index_type"] == "BTree" + def test_scalar_index_on_top_level_special_chars(self, nested_special_char_dataset): """Scalar index on top-level special char column works.""" nested_special_char_dataset.create_scalar_index("`row-id`", index_type="BTREE") diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 49a6397e818..57f270ae2c7 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -81,7 +81,7 @@ use lance_index::{ }; use lance_io::object_store::ObjectStoreParams; use lance_linalg::distance::MetricType; -use lance_table::format::{BasePath, Fragment}; +use lance_table::format::{BasePath, Fragment, IndexMetadata}; use lance_table::io::commit::CommitHandler; use crate::error::PythonErrorExt; @@ -1809,7 +1809,7 @@ impl Dataset { train: Option, storage_options: Option>, kwargs: Option<&Bound>, - ) -> PyResult<()> { + ) -> PyResult> { let columns: Vec<&str> = columns.iter().map(|s| &**s).collect(); let index_type = index_type.to_uppercase(); let idx_type = match index_type.as_str() { @@ -1979,19 +1979,21 @@ impl Dataset { use std::future::IntoFuture; // Use execute_uncommitted if fragment_ids is provided, otherwise use execute - if has_fragment_ids { + let index_metadata = if has_fragment_ids { // For fragment-level indexing, use execute_uncommitted - let _index_metadata = rt() + let index_metadata = rt() .block_on(None, builder.execute_uncommitted())? .infer_error()?; // Note: We don't update self.ds here as the index is not committed + index_metadata } else { // For regular indexing, use the standard execute path - rt().block_on(None, builder.into_future())?.infer_error()?; + let index_metadata = rt().block_on(None, builder.into_future())?.infer_error()?; self.ds = Arc::new(new_self); - } + index_metadata + }; - Ok(()) + Ok(PyLance(index_metadata)) } fn drop_index(&mut self, name: &str) -> PyResult<()> { diff --git a/rust/lance-index/src/traits.rs b/rust/lance-index/src/traits.rs index 21647355039..efe5caf635a 100644 --- a/rust/lance-index/src/traits.rs +++ b/rust/lance-index/src/traits.rs @@ -153,6 +153,8 @@ pub trait DatasetIndexExt { /// if not provided, it will auto-generate one. /// - `params`: index parameters. /// - `replace`: replace the existing index if it exists. + /// + /// Returns the metadata of the created index. async fn create_index( &mut self, columns: &[&str], @@ -160,7 +162,7 @@ pub trait DatasetIndexExt { name: Option, params: &dyn IndexParams, replace: bool, - ) -> Result<()>; + ) -> Result; /// Drop indices by name. /// diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index fc5f636f5b1..db3e2b17847 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -4146,7 +4146,8 @@ pub mod test_dataset { ¶ms, true, ) - .await + .await?; + Ok(()) } pub async fn make_scalar_index(&mut self) -> Result<()> { @@ -4158,14 +4159,16 @@ pub mod test_dataset { &ScalarIndexParams::default(), true, ) - .await + .await?; + Ok(()) } pub async fn make_fts_index(&mut self) -> Result<()> { let params = InvertedIndexParams::default().with_position(true); self.dataset .create_index(&["s"], IndexType::Inverted, None, ¶ms, true) - .await + .await?; + Ok(()) } pub async fn append_new_data(&mut self) -> Result<()> { diff --git a/rust/lance/src/dataset/tests/dataset_index.rs b/rust/lance/src/dataset/tests/dataset_index.rs index ce8254cbd52..62a4302701a 100644 --- a/rust/lance/src/dataset/tests/dataset_index.rs +++ b/rust/lance/src/dataset/tests/dataset_index.rs @@ -90,18 +90,18 @@ async fn test_create_index( // Make sure valid arguments should create index successfully let params = VectorIndexParams::ivf_pq(10, 8, 2, MetricType::L2, 50); - dataset + let index_meta = dataset .create_index(&["embeddings"], IndexType::Vector, None, ¶ms, true) .await .unwrap(); dataset.validate().await.unwrap(); + // Verify the returned metadata + assert_eq!(index_meta.name, "embeddings_idx"); // The version should match the table version it was created from. - let indices = dataset.load_indices().await.unwrap(); - let actual = indices.first().unwrap().dataset_version; let expected = dataset.manifest.version - 1; - assert_eq!(actual, expected); - let fragment_bitmap = indices.first().unwrap().fragment_bitmap.as_ref().unwrap(); + assert_eq!(index_meta.dataset_version, expected); + let fragment_bitmap = index_meta.fragment_bitmap.as_ref().unwrap(); assert_eq!(fragment_bitmap.len(), 1); assert!(fragment_bitmap.contains(0)); @@ -313,18 +313,18 @@ async fn test_create_int8_index( // Make sure valid arguments should create index successfully let params = VectorIndexParams::ivf_pq(10, 8, 2, MetricType::L2, 50); - dataset + let index_meta = dataset .create_index(&["embeddings"], IndexType::Vector, None, ¶ms, true) .await .unwrap(); dataset.validate().await.unwrap(); + // Verify the returned metadata + assert_eq!(index_meta.name, "embeddings_idx"); // The version should match the table version it was created from. - let indices = dataset.load_indices().await.unwrap(); - let actual = indices.first().unwrap().dataset_version; let expected = dataset.manifest.version - 1; - assert_eq!(actual, expected); - let fragment_bitmap = indices.first().unwrap().fragment_bitmap.as_ref().unwrap(); + assert_eq!(index_meta.dataset_version, expected); + let fragment_bitmap = index_meta.fragment_bitmap.as_ref().unwrap(); assert_eq!(fragment_bitmap.len(), 1); assert!(fragment_bitmap.contains(0)); diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 1431d5687a8..a2b02dd5cf4 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -594,7 +594,7 @@ impl DatasetIndexExt for Dataset { name: Option, params: &dyn IndexParams, replace: bool, - ) -> Result<()> { + ) -> Result { // Use the builder pattern with default train=true for backward compatibility let mut builder = self.create_index_builder(columns, index_type, params); diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index acb735c8b6b..7447be0fdc4 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -31,6 +31,18 @@ use uuid::Uuid; use arrow_array::RecordBatchReader; +/// Generate default index name from field path. +/// +/// Joins field names with `.` to create the base index name. +/// For example: `["meta-data", "user-id"]` -> `"meta-data.user-id"` +fn default_index_name(fields: &[&str]) -> String { + if fields.iter().any(|f| f.contains('.')) { + format_field_path(fields) + } else { + fields.join(".") + } +} + pub struct CreateIndexBuilder<'a> { dataset: &'a mut Dataset, columns: Vec, @@ -135,7 +147,24 @@ impl<'a> CreateIndexBuilder<'a> { .dataset .open_frag_reuse_index(&NoOpMetricsCollector) .await?; - let index_name = self.name.take().unwrap_or(format!("{column}_idx")); + let index_name = if let Some(name) = self.name.take() { + name + } else { + // Generate default name with collision handling + let column_path = default_index_name(&names); + let base_name = format!("{column_path}_idx"); + let mut candidate = base_name.clone(); + let mut counter = 2; // Start with no suffix, then use _2, _3, ... + // Find unique name by appending numeric suffix if needed + while indices + .iter() + .any(|idx| idx.name == candidate && idx.fields != [field.id]) + { + candidate = format!("{base_name}_{counter}"); + counter += 1; + } + candidate + }; if let Some(idx) = indices.iter().find(|i| i.name == index_name) { if idx.fields == [field.id] && !self.replace { return Err(Error::Index { @@ -404,8 +433,9 @@ impl<'a> CreateIndexBuilder<'a> { } #[instrument(skip_all)] - async fn execute(mut self) -> Result<()> { + async fn execute(mut self) -> Result { let new_idx = self.execute_uncommitted().await?; + let index_uuid = new_idx.uuid; let transaction = Transaction::new( new_idx.dataset_version, Operation::CreateIndex { @@ -419,13 +449,23 @@ impl<'a> CreateIndexBuilder<'a> { .apply_commit(transaction, &Default::default(), &Default::default()) .await?; - Ok(()) + // Fetch the committed index metadata from the dataset. + // This ensures we return the version that may have been modified by the commit. + let indices = self.dataset.load_indices().await?; + indices + .iter() + .find(|idx| idx.uuid == index_uuid) + .cloned() + .ok_or_else(|| Error::Internal { + message: format!("Index with UUID {} not found after commit", index_uuid), + location: location!(), + }) } } impl<'a> IntoFuture for CreateIndexBuilder<'a> { - type Output = Result<()>; - type IntoFuture = BoxFuture<'a, Result<()>>; + type Output = Result; + type IntoFuture = BoxFuture<'a, Result>; fn into_future(self) -> Self::IntoFuture { Box::pin(self.execute()) @@ -436,17 +476,141 @@ impl<'a> IntoFuture for CreateIndexBuilder<'a> { mod tests { use super::*; use crate::dataset::{WriteMode, WriteParams}; + use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount}; use arrow::datatypes::{Float32Type, Int32Type}; use arrow_array::RecordBatchIterator; use arrow_array::{Int32Array, RecordBatch, StringArray}; use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema}; use lance_core::utils::tempfile::TempStrDir; - use lance_datagen; + use lance_datagen::{self, gen_batch}; use lance_index::optimize::OptimizeOptions; use lance_index::scalar::inverted::tokenizer::InvertedIndexParams; use lance_linalg::distance::MetricType; use std::sync::Arc; + #[test] + fn test_default_index_name() { + // Single field - preserved as-is + assert_eq!(default_index_name(&["user-id"]), "user-id"); + assert_eq!(default_index_name(&["user:id"]), "user:id"); + assert_eq!(default_index_name(&["userId"]), "userId"); + + // Nested paths - joined with dot + assert_eq!( + default_index_name(&["meta-data", "user-id"]), + "meta-data.user-id" + ); + assert_eq!( + default_index_name(&["MetaData", "userId"]), + "MetaData.userId" + ); + + // Path with dots in field names - escape + assert_eq!( + default_index_name(&["meta.data", "user.id"]), + "`meta.data`.`user.id`" + ); + + // Empty input + assert_eq!(default_index_name(&[]), ""); + } + + #[tokio::test] + async fn test_default_index_name_with_special_chars() { + // Verify default index names preserve special characters in column names. + let mut dataset = gen_batch() + .col("user-id", lance_datagen::array::step::()) + .col("user:id", lance_datagen::array::step::()) + .into_ram_dataset(FragmentCount::from(1), FragmentRowCount::from(100)) + .await + .unwrap(); + + let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree); + + // Create index on column with hyphen + let idx1 = CreateIndexBuilder::new(&mut dataset, &["user-id"], IndexType::BTree, ¶ms) + .execute() + .await + .unwrap(); + assert_eq!(idx1.name, "user-id_idx"); + + // Create index on column with colon + let idx2 = CreateIndexBuilder::new(&mut dataset, &["user:id"], IndexType::BTree, ¶ms) + .execute() + .await + .unwrap(); + assert_eq!(idx2.name, "user:id_idx"); + + // Verify both indices exist + let indices = dataset.load_indices().await.unwrap(); + assert_eq!(indices.len(), 2); + } + + #[tokio::test] + async fn test_index_name_collision_with_explicit_name() { + // Test collision handling when explicit name conflicts with default name. + let mut dataset = gen_batch() + .col("a", lance_datagen::array::step::()) + .col("b", lance_datagen::array::step::()) + .into_ram_dataset(FragmentCount::from(1), FragmentRowCount::from(100)) + .await + .unwrap(); + + let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree); + + // (a) Explicit name on first index, default on second that would collide + // Create index on "a" with explicit name "b_idx" + let idx1 = CreateIndexBuilder::new(&mut dataset, &["a"], IndexType::BTree, ¶ms) + .name("b_idx".to_string()) + .execute() + .await + .unwrap(); + assert_eq!(idx1.name, "b_idx"); + + // Create index on "b" with default name - would be "b_idx" but that's taken + // so it should get "b_idx_2" + let idx2 = CreateIndexBuilder::new(&mut dataset, &["b"], IndexType::BTree, ¶ms) + .execute() + .await + .unwrap(); + assert_eq!(idx2.name, "b_idx_2"); + + // Verify both indices exist + let indices = dataset.load_indices().await.unwrap(); + assert_eq!(indices.len(), 2); + } + + #[tokio::test] + async fn test_index_name_collision_explicit_errors() { + // Test that explicit name collision with existing index errors. + let mut dataset = gen_batch() + .col("a", lance_datagen::array::step::()) + .col("b", lance_datagen::array::step::()) + .into_ram_dataset(FragmentCount::from(1), FragmentRowCount::from(100)) + .await + .unwrap(); + + let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree); + + // (b) Default name on first, explicit same name on second should error + // Create index on "a" with default name "a_idx" + let idx1 = CreateIndexBuilder::new(&mut dataset, &["a"], IndexType::BTree, ¶ms) + .execute() + .await + .unwrap(); + assert_eq!(idx1.name, "a_idx"); + + // Try to create index on "b" with explicit name "a_idx" - should error + let result = CreateIndexBuilder::new(&mut dataset, &["b"], IndexType::BTree, ¶ms) + .name("a_idx".to_string()) + .execute() + .await; + + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!(err.to_string().contains("already exists")); + } + // Helper function to create test data with text field suitable for inverted index fn create_text_batch(start: i32, end: i32) -> RecordBatch { let schema = Arc::new(ArrowSchema::new(vec![