From 819cbea762de98f2b5cfac0767b3f6eae4790cc0 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 6 Jan 2026 10:07:09 -0800 Subject: [PATCH 1/7] write tests for current behavior --- python/python/tests/test_column_names.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/python/python/tests/test_column_names.py b/python/python/tests/test_column_names.py index 3b9b646d5f6..985310d3327 100644 --- a/python/python/tests/test_column_names.py +++ b/python/python/tests/test_column_names.py @@ -87,6 +87,7 @@ def test_scalar_index_with_mixed_case(self, mixed_case_dataset): indices = mixed_case_dataset.list_indices() assert len(indices) == 1 assert indices[0]["fields"] == ["userId"] + assert indices[0]["name"] == "userId_idx" # Query using the indexed column result = mixed_case_dataset.to_table(filter="userId = 50") @@ -96,6 +97,9 @@ def test_scalar_index_with_mixed_case(self, mixed_case_dataset): plan = mixed_case_dataset.scanner(filter="userId = 50").explain_plan() assert "ScalarIndexQuery" in plan + stats = mixed_case_dataset.stats.index_stats("userId_idx") + assert stats["index_type"] == "BTree" + def test_alter_column_with_mixed_case(self, mixed_case_dataset): """Altering columns works with mixed-case column names.""" # alter_columns uses direct schema lookup, not SQL parsing @@ -347,6 +351,7 @@ def test_scalar_index_with_special_chars(self, special_char_dataset): assert len(indices) == 1 # Field with special chars is returned in quoted format for SQL compatibility assert indices[0]["fields"] == ["`user-id`"] + assert indices[0]["name"] == "`user-id`_idx" # Query using the indexed column (requires backticks in filter) result = special_char_dataset.to_table(filter="`user-id` = 50") @@ -356,6 +361,9 @@ def test_scalar_index_with_special_chars(self, special_char_dataset): plan = special_char_dataset.scanner(filter="`user-id` = 50").explain_plan() assert "ScalarIndexQuery" in plan + stats = special_char_dataset.stats.index_stats("`user-id`_idx") + 
assert stats["index_type"] == "BTree" + def test_alter_column_with_special_chars(self, special_char_dataset): """Altering columns works with special character column names.""" # alter_columns uses direct schema lookup @@ -455,6 +463,7 @@ def test_scalar_index_with_nested_mixed_case(self, nested_mixed_case_dataset): indices = nested_mixed_case_dataset.list_indices() assert len(indices) == 1 assert indices[0]["fields"] == ["MetaData.userId"] + assert indices[0]["name"] == "MetaData.userId_idx" # Query using the indexed column result = nested_mixed_case_dataset.to_table(filter="MetaData.userId = 50") @@ -466,6 +475,9 @@ def test_scalar_index_with_nested_mixed_case(self, nested_mixed_case_dataset): ).explain_plan() assert "ScalarIndexQuery" in plan + stats = nested_mixed_case_dataset.stats.index_stats("MetaData.userId_idx") + assert stats["index_type"] == "BTree" + def test_scalar_index_on_top_level_mixed_case(self, nested_mixed_case_dataset): """Scalar index on top-level mixed-case column works.""" nested_mixed_case_dataset.create_scalar_index("rowId", index_type="BTREE") @@ -473,6 +485,7 @@ def test_scalar_index_on_top_level_mixed_case(self, nested_mixed_case_dataset): indices = nested_mixed_case_dataset.list_indices() assert len(indices) == 1 assert indices[0]["fields"] == ["rowId"] + assert indices[0]["name"] == "rowId_idx" result = nested_mixed_case_dataset.to_table(filter="rowId = 50") assert result.num_rows == 1 @@ -480,6 +493,9 @@ def test_scalar_index_on_top_level_mixed_case(self, nested_mixed_case_dataset): plan = nested_mixed_case_dataset.scanner(filter="rowId = 50").explain_plan() assert "ScalarIndexQuery" in plan + stats = nested_mixed_case_dataset.stats.index_stats("rowId_idx") + assert stats["index_type"] == "BTree" + def test_scalar_index_with_lowercased_nested_path(self, nested_mixed_case_dataset): """Scalar index creation should work even when path is lowercased. 
@@ -562,6 +578,7 @@ def test_scalar_index_with_nested_special_chars(self, nested_special_char_datase assert len(indices) == 1 # Fields with special chars are returned in quoted format for SQL compatibility assert indices[0]["fields"] == ["`meta-data`.`user-id`"] + assert indices[0]["name"] == "`meta-data`.`user-id`_idx" # Query using the indexed column (backticks required in filter) result = nested_special_char_dataset.to_table( @@ -575,6 +592,11 @@ def test_scalar_index_with_nested_special_chars(self, nested_special_char_datase ).explain_plan() assert "ScalarIndexQuery" in plan + stats = nested_special_char_dataset.stats.index_stats( + "`meta-data`.`user-id`_idx" + ) + assert stats["index_type"] == "BTree" + def test_scalar_index_on_top_level_special_chars(self, nested_special_char_dataset): """Scalar index on top-level special char column works.""" nested_special_char_dataset.create_scalar_index("`row-id`", index_type="BTREE") From 32e8b09a8a9a96d8bbfb0ae9d3f0c58876bd6a9b Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 6 Jan 2026 15:31:00 -0800 Subject: [PATCH 2/7] normalize index names --- python/python/tests/test_column_names.py | 14 +-- rust/lance/src/index/create.rs | 140 ++++++++++++++++++++++- 2 files changed, 145 insertions(+), 9 deletions(-) diff --git a/python/python/tests/test_column_names.py b/python/python/tests/test_column_names.py index 985310d3327..b36ab72c40c 100644 --- a/python/python/tests/test_column_names.py +++ b/python/python/tests/test_column_names.py @@ -351,7 +351,7 @@ def test_scalar_index_with_special_chars(self, special_char_dataset): assert len(indices) == 1 # Field with special chars is returned in quoted format for SQL compatibility assert indices[0]["fields"] == ["`user-id`"] - assert indices[0]["name"] == "`user-id`_idx" + assert indices[0]["name"] == "user_id_idx" # Query using the indexed column (requires backticks in filter) result = special_char_dataset.to_table(filter="`user-id` = 50") @@ -361,7 +361,7 @@ def 
test_scalar_index_with_special_chars(self, special_char_dataset): plan = special_char_dataset.scanner(filter="`user-id` = 50").explain_plan() assert "ScalarIndexQuery" in plan - stats = special_char_dataset.stats.index_stats("`user-id`_idx") + stats = special_char_dataset.stats.index_stats("user_id_idx") assert stats["index_type"] == "BTree" def test_alter_column_with_special_chars(self, special_char_dataset): @@ -463,7 +463,7 @@ def test_scalar_index_with_nested_mixed_case(self, nested_mixed_case_dataset): indices = nested_mixed_case_dataset.list_indices() assert len(indices) == 1 assert indices[0]["fields"] == ["MetaData.userId"] - assert indices[0]["name"] == "MetaData.userId_idx" + assert indices[0]["name"] == "MetaData_userId_idx" # Query using the indexed column result = nested_mixed_case_dataset.to_table(filter="MetaData.userId = 50") @@ -475,7 +475,7 @@ def test_scalar_index_with_nested_mixed_case(self, nested_mixed_case_dataset): ).explain_plan() assert "ScalarIndexQuery" in plan - stats = nested_mixed_case_dataset.stats.index_stats("MetaData.userId_idx") + stats = nested_mixed_case_dataset.stats.index_stats("MetaData_userId_idx") assert stats["index_type"] == "BTree" def test_scalar_index_on_top_level_mixed_case(self, nested_mixed_case_dataset): @@ -578,7 +578,7 @@ def test_scalar_index_with_nested_special_chars(self, nested_special_char_datase assert len(indices) == 1 # Fields with special chars are returned in quoted format for SQL compatibility assert indices[0]["fields"] == ["`meta-data`.`user-id`"] - assert indices[0]["name"] == "`meta-data`.`user-id`_idx" + assert indices[0]["name"] == "meta_data_user_id_idx" # Query using the indexed column (backticks required in filter) result = nested_special_char_dataset.to_table( @@ -592,9 +592,7 @@ def test_scalar_index_with_nested_special_chars(self, nested_special_char_datase ).explain_plan() assert "ScalarIndexQuery" in plan - stats = nested_special_char_dataset.stats.index_stats( - 
"`meta-data`.`user-id`_idx" - ) + stats = nested_special_char_dataset.stats.index_stats("meta_data_user_id_idx") assert stats["index_type"] == "BTree" def test_scalar_index_on_top_level_special_chars(self, nested_special_char_dataset): diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index acb735c8b6b..fa94f8fa18e 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -31,6 +31,43 @@ use uuid::Uuid; use arrow_array::RecordBatchReader; +/// Normalize a field path to create a valid index name. +/// +/// Converts field names to a safe identifier format: +/// - Replaces non-alphanumeric/underscore characters with `_` +/// - Collapses consecutive underscores +/// - Strips leading/trailing underscores +/// - Joins nested path segments with `_` +fn normalize_index_name(fields: &[&str]) -> String { + let normalized: String = fields + .iter() + .map(|field| { + field + .chars() + .map(|c| { + if c.is_alphanumeric() || c == '_' { + c + } else { + '_' + } + }) + .collect::() + }) + .collect::>() + .join("_") + .split('_') + .filter(|s| !s.is_empty()) + .collect::>() + .join("_"); + + // Handle edge case where everything is stripped (e.g., "---") + if normalized.is_empty() { + "index".to_string() + } else { + normalized + } +} + pub struct CreateIndexBuilder<'a> { dataset: &'a mut Dataset, columns: Vec, @@ -135,7 +172,24 @@ impl<'a> CreateIndexBuilder<'a> { .dataset .open_frag_reuse_index(&NoOpMetricsCollector) .await?; - let index_name = self.name.take().unwrap_or(format!("{column}_idx")); + let index_name = if let Some(name) = self.name.take() { + name + } else { + // Generate normalized default name with collision handling + let normalized_column = normalize_index_name(&names); + let base_name = format!("{normalized_column}_idx"); + let mut candidate = base_name.clone(); + let mut counter = 2; // Start with no suffix, then use _2, _3, ... 
+ // Find unique name by appending numeric suffix if needed + while indices + .iter() + .any(|idx| idx.name == candidate && idx.fields != [field.id]) + { + candidate = format!("{base_name}_{counter}"); + counter += 1; + } + candidate + }; if let Some(idx) = indices.iter().find(|i| i.name == index_name) { if idx.fields == [field.id] && !self.replace { return Err(Error::Index { @@ -447,6 +501,90 @@ mod tests { use lance_linalg::distance::MetricType; use std::sync::Arc; + #[test] + fn test_normalize_index_name() { + // Basic normalization - special chars become underscore + assert_eq!(normalize_index_name(&["user-id"]), "user_id"); + assert_eq!(normalize_index_name(&["user:id"]), "user_id"); + assert_eq!(normalize_index_name(&["user.id"]), "user_id"); + + // Nested paths - segments joined with underscore + assert_eq!( + normalize_index_name(&["meta-data", "user-id"]), + "meta_data_user_id" + ); + + // Consecutive special chars collapse to single underscore + assert_eq!(normalize_index_name(&["user--id"]), "user_id"); + assert_eq!(normalize_index_name(&["a..b"]), "a_b"); + + // Leading/trailing special chars are stripped + assert_eq!(normalize_index_name(&["--test--"]), "test"); + assert_eq!(normalize_index_name(&["-user-id-"]), "user_id"); + + // Normal columns unchanged + assert_eq!(normalize_index_name(&["userId"]), "userId"); + assert_eq!(normalize_index_name(&["user_id"]), "user_id"); + + // Empty input + assert_eq!(normalize_index_name(&[]), "index"); + + // All special chars -> fallback to "index" + assert_eq!(normalize_index_name(&["---"]), "index"); + + // Unicode alphanumeric preserved + assert_eq!(normalize_index_name(&["字段名"]), "字段名"); + assert_eq!(normalize_index_name(&["字段-名"]), "字段_名"); + } + + #[tokio::test] + async fn test_index_name_collision_suffix() { + // When two columns normalize to the same default index name, + // the second index should get a numeric suffix. 
+ let test_dir = TempStrDir::default(); + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("user-id", DataType::Int32, false), + ArrowField::new("user:id", DataType::Int32, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from_iter_values(0..100)), + Arc::new(Int32Array::from_iter_values(0..100)), + ], + ) + .unwrap(); + + let reader = RecordBatchIterator::new([Ok(batch)], schema); + let mut dataset = Dataset::write(reader, test_dir.as_str(), None) + .await + .unwrap(); + + let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree); + + // Create index on first column - gets user_id_idx + CreateIndexBuilder::new(&mut dataset, &["user-id"], IndexType::BTree, ¶ms) + .execute() + .await + .unwrap(); + + let indices = dataset.load_indices().await.unwrap(); + assert_eq!(indices.len(), 1); + assert_eq!(indices[0].name, "user_id_idx"); + + // Create index on second column - should get user_id_idx_2 + CreateIndexBuilder::new(&mut dataset, &["user:id"], IndexType::BTree, ¶ms) + .execute() + .await + .unwrap(); + + let indices = dataset.load_indices().await.unwrap(); + assert_eq!(indices.len(), 2); + let names: std::collections::HashSet<_> = indices.iter().map(|i| i.name.as_str()).collect(); + assert!(names.contains("user_id_idx")); + assert!(names.contains("user_id_idx_2")); + } + // Helper function to create test data with text field suitable for inverted index fn create_text_batch(start: i32, end: i32) -> RecordBatch { let schema = Arc::new(ArrowSchema::new(vec![ From e7bb14bcc31d1b5faa62deaa4143d783822c8602 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 7 Jan 2026 09:53:06 -0800 Subject: [PATCH 3/7] backwards compatible index name defaults --- python/python/tests/test_column_names.py | 12 +- rust/lance/src/index/create.rs | 192 ++++++++++++----------- 2 files changed, 110 insertions(+), 94 deletions(-) diff --git a/python/python/tests/test_column_names.py 
b/python/python/tests/test_column_names.py index b36ab72c40c..5ec38085896 100644 --- a/python/python/tests/test_column_names.py +++ b/python/python/tests/test_column_names.py @@ -351,7 +351,7 @@ def test_scalar_index_with_special_chars(self, special_char_dataset): assert len(indices) == 1 # Field with special chars is returned in quoted format for SQL compatibility assert indices[0]["fields"] == ["`user-id`"] - assert indices[0]["name"] == "user_id_idx" + assert indices[0]["name"] == "user-id_idx" # Query using the indexed column (requires backticks in filter) result = special_char_dataset.to_table(filter="`user-id` = 50") @@ -361,7 +361,7 @@ def test_scalar_index_with_special_chars(self, special_char_dataset): plan = special_char_dataset.scanner(filter="`user-id` = 50").explain_plan() assert "ScalarIndexQuery" in plan - stats = special_char_dataset.stats.index_stats("user_id_idx") + stats = special_char_dataset.stats.index_stats("user-id_idx") assert stats["index_type"] == "BTree" def test_alter_column_with_special_chars(self, special_char_dataset): @@ -463,7 +463,7 @@ def test_scalar_index_with_nested_mixed_case(self, nested_mixed_case_dataset): indices = nested_mixed_case_dataset.list_indices() assert len(indices) == 1 assert indices[0]["fields"] == ["MetaData.userId"] - assert indices[0]["name"] == "MetaData_userId_idx" + assert indices[0]["name"] == "MetaData.userId_idx" # Query using the indexed column result = nested_mixed_case_dataset.to_table(filter="MetaData.userId = 50") @@ -475,7 +475,7 @@ def test_scalar_index_with_nested_mixed_case(self, nested_mixed_case_dataset): ).explain_plan() assert "ScalarIndexQuery" in plan - stats = nested_mixed_case_dataset.stats.index_stats("MetaData_userId_idx") + stats = nested_mixed_case_dataset.stats.index_stats("MetaData.userId_idx") assert stats["index_type"] == "BTree" def test_scalar_index_on_top_level_mixed_case(self, nested_mixed_case_dataset): @@ -578,7 +578,7 @@ def 
test_scalar_index_with_nested_special_chars(self, nested_special_char_datase assert len(indices) == 1 # Fields with special chars are returned in quoted format for SQL compatibility assert indices[0]["fields"] == ["`meta-data`.`user-id`"] - assert indices[0]["name"] == "meta_data_user_id_idx" + assert indices[0]["name"] == "meta-data.user-id_idx" # Query using the indexed column (backticks required in filter) result = nested_special_char_dataset.to_table( @@ -592,7 +592,7 @@ def test_scalar_index_with_nested_special_chars(self, nested_special_char_datase ).explain_plan() assert "ScalarIndexQuery" in plan - stats = nested_special_char_dataset.stats.index_stats("meta_data_user_id_idx") + stats = nested_special_char_dataset.stats.index_stats("meta-data.user-id_idx") assert stats["index_type"] == "BTree" def test_scalar_index_on_top_level_special_chars(self, nested_special_char_dataset): diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index fa94f8fa18e..1240ad2be55 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -31,41 +31,12 @@ use uuid::Uuid; use arrow_array::RecordBatchReader; -/// Normalize a field path to create a valid index name. +/// Generate default index name from field path. 
/// -/// Converts field names to a safe identifier format: -/// - Replaces non-alphanumeric/underscore characters with `_` -/// - Collapses consecutive underscores -/// - Strips leading/trailing underscores -/// - Joins nested path segments with `_` -fn normalize_index_name(fields: &[&str]) -> String { - let normalized: String = fields - .iter() - .map(|field| { - field - .chars() - .map(|c| { - if c.is_alphanumeric() || c == '_' { - c - } else { - '_' - } - }) - .collect::() - }) - .collect::>() - .join("_") - .split('_') - .filter(|s| !s.is_empty()) - .collect::>() - .join("_"); - - // Handle edge case where everything is stripped (e.g., "---") - if normalized.is_empty() { - "index".to_string() - } else { - normalized - } +/// Joins field names with `.` to create the base index name. +/// For example: `["meta-data", "user-id"]` -> `"meta-data.user-id"` +fn default_index_name(fields: &[&str]) -> String { + fields.join(".") } pub struct CreateIndexBuilder<'a> { @@ -175,9 +146,9 @@ impl<'a> CreateIndexBuilder<'a> { let index_name = if let Some(name) = self.name.take() { name } else { - // Generate normalized default name with collision handling - let normalized_column = normalize_index_name(&names); - let base_name = format!("{normalized_column}_idx"); + // Generate default name with collision handling + let column_path = default_index_name(&names); + let base_name = format!("{column_path}_idx"); let mut candidate = base_name.clone(); let mut counter = 2; // Start with no suffix, then use _2, _3, ... 
// Find unique name by appending numeric suffix if needed @@ -490,90 +461,101 @@ impl<'a> IntoFuture for CreateIndexBuilder<'a> { mod tests { use super::*; use crate::dataset::{WriteMode, WriteParams}; + use crate::utils::test::{DatagenExt, FragmentCount, FragmentRowCount}; use arrow::datatypes::{Float32Type, Int32Type}; use arrow_array::RecordBatchIterator; use arrow_array::{Int32Array, RecordBatch, StringArray}; use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema}; use lance_core::utils::tempfile::TempStrDir; - use lance_datagen; + use lance_datagen::{self, gen_batch}; use lance_index::optimize::OptimizeOptions; use lance_index::scalar::inverted::tokenizer::InvertedIndexParams; use lance_linalg::distance::MetricType; use std::sync::Arc; #[test] - fn test_normalize_index_name() { - // Basic normalization - special chars become underscore - assert_eq!(normalize_index_name(&["user-id"]), "user_id"); - assert_eq!(normalize_index_name(&["user:id"]), "user_id"); - assert_eq!(normalize_index_name(&["user.id"]), "user_id"); + fn test_default_index_name() { + // Single field - preserved as-is + assert_eq!(default_index_name(&["user-id"]), "user-id"); + assert_eq!(default_index_name(&["user:id"]), "user:id"); + assert_eq!(default_index_name(&["userId"]), "userId"); - // Nested paths - segments joined with underscore + // Nested paths - joined with dot + assert_eq!( + default_index_name(&["meta-data", "user-id"]), + "meta-data.user-id" + ); assert_eq!( - normalize_index_name(&["meta-data", "user-id"]), - "meta_data_user_id" + default_index_name(&["MetaData", "userId"]), + "MetaData.userId" ); - // Consecutive special chars collapse to single underscore - assert_eq!(normalize_index_name(&["user--id"]), "user_id"); - assert_eq!(normalize_index_name(&["a..b"]), "a_b"); + // Empty input + assert_eq!(default_index_name(&[]), ""); + } - // Leading/trailing special chars are stripped - assert_eq!(normalize_index_name(&["--test--"]), "test"); - 
assert_eq!(normalize_index_name(&["-user-id-"]), "user_id"); + #[tokio::test] + async fn test_default_index_name_with_special_chars() { + // Verify default index names preserve special characters in column names. + let mut dataset = gen_batch() + .col("user-id", lance_datagen::array::step::()) + .col("user:id", lance_datagen::array::step::()) + .into_ram_dataset(FragmentCount::from(1), FragmentRowCount::from(100)) + .await + .unwrap(); - // Normal columns unchanged - assert_eq!(normalize_index_name(&["userId"]), "userId"); - assert_eq!(normalize_index_name(&["user_id"]), "user_id"); + let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree); - // Empty input - assert_eq!(normalize_index_name(&[]), "index"); + // Create index on column with hyphen + CreateIndexBuilder::new(&mut dataset, &["user-id"], IndexType::BTree, ¶ms) + .execute() + .await + .unwrap(); + + let indices = dataset.load_indices().await.unwrap(); + assert_eq!(indices.len(), 1); + assert_eq!(indices[0].name, "user-id_idx"); - // All special chars -> fallback to "index" - assert_eq!(normalize_index_name(&["---"]), "index"); + // Create index on column with colon + CreateIndexBuilder::new(&mut dataset, &["user:id"], IndexType::BTree, ¶ms) + .execute() + .await + .unwrap(); - // Unicode alphanumeric preserved - assert_eq!(normalize_index_name(&["字段名"]), "字段名"); - assert_eq!(normalize_index_name(&["字段-名"]), "字段_名"); + let indices = dataset.load_indices().await.unwrap(); + assert_eq!(indices.len(), 2); + let names: std::collections::HashSet<_> = indices.iter().map(|i| i.name.as_str()).collect(); + assert!(names.contains("user-id_idx")); + assert!(names.contains("user:id_idx")); } #[tokio::test] - async fn test_index_name_collision_suffix() { - // When two columns normalize to the same default index name, - // the second index should get a numeric suffix. 
- let test_dir = TempStrDir::default(); - let schema = Arc::new(ArrowSchema::new(vec![ - ArrowField::new("user-id", DataType::Int32, false), - ArrowField::new("user:id", DataType::Int32, false), - ])); - let batch = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(Int32Array::from_iter_values(0..100)), - Arc::new(Int32Array::from_iter_values(0..100)), - ], - ) - .unwrap(); - - let reader = RecordBatchIterator::new([Ok(batch)], schema); - let mut dataset = Dataset::write(reader, test_dir.as_str(), None) + async fn test_index_name_collision_with_explicit_name() { + // Test collision handling when explicit name conflicts with default name. + let mut dataset = gen_batch() + .col("a", lance_datagen::array::step::()) + .col("b", lance_datagen::array::step::()) + .into_ram_dataset(FragmentCount::from(1), FragmentRowCount::from(100)) .await .unwrap(); let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree); - // Create index on first column - gets user_id_idx - CreateIndexBuilder::new(&mut dataset, &["user-id"], IndexType::BTree, ¶ms) + // (a) Explicit name on first index, default on second that would collide + // Create index on "a" with explicit name "b_idx" + CreateIndexBuilder::new(&mut dataset, &["a"], IndexType::BTree, ¶ms) + .name("b_idx".to_string()) .execute() .await .unwrap(); let indices = dataset.load_indices().await.unwrap(); assert_eq!(indices.len(), 1); - assert_eq!(indices[0].name, "user_id_idx"); + assert_eq!(indices[0].name, "b_idx"); - // Create index on second column - should get user_id_idx_2 - CreateIndexBuilder::new(&mut dataset, &["user:id"], IndexType::BTree, ¶ms) + // Create index on "b" with default name - would be "b_idx" but that's taken + // so it should get "b_idx_2" + CreateIndexBuilder::new(&mut dataset, &["b"], IndexType::BTree, ¶ms) .execute() .await .unwrap(); @@ -581,8 +563,42 @@ mod tests { let indices = dataset.load_indices().await.unwrap(); assert_eq!(indices.len(), 2); let names: 
std::collections::HashSet<_> = indices.iter().map(|i| i.name.as_str()).collect(); - assert!(names.contains("user_id_idx")); - assert!(names.contains("user_id_idx_2")); + assert!(names.contains("b_idx")); + assert!(names.contains("b_idx_2")); + } + + #[tokio::test] + async fn test_index_name_collision_explicit_errors() { + // Test that explicit name collision with existing index errors. + let mut dataset = gen_batch() + .col("a", lance_datagen::array::step::()) + .col("b", lance_datagen::array::step::()) + .into_ram_dataset(FragmentCount::from(1), FragmentRowCount::from(100)) + .await + .unwrap(); + + let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree); + + // (b) Default name on first, explicit same name on second should error + // Create index on "a" with default name "a_idx" + CreateIndexBuilder::new(&mut dataset, &["a"], IndexType::BTree, ¶ms) + .execute() + .await + .unwrap(); + + let indices = dataset.load_indices().await.unwrap(); + assert_eq!(indices.len(), 1); + assert_eq!(indices[0].name, "a_idx"); + + // Try to create index on "b" with explicit name "a_idx" - should error + let result = CreateIndexBuilder::new(&mut dataset, &["b"], IndexType::BTree, ¶ms) + .name("a_idx".to_string()) + .execute() + .await; + + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!(err.to_string().contains("already exists")); } // Helper function to create test data with text field suitable for inverted index From 5907cecddea3118a1d0dee3b0c3442bcbd2023b2 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 7 Jan 2026 14:01:55 -0800 Subject: [PATCH 4/7] return index metadata --- rust/lance/src/index.rs | 2 +- rust/lance/src/index/create.rs | 54 +++++++++++++++++----------------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index 1431d5687a8..db935daedc5 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -602,7 +602,7 @@ impl 
DatasetIndexExt for Dataset { builder = builder.name(name); } - builder.replace(replace).await + builder.replace(replace).await.map(|_| ()) } async fn drop_index(&mut self, name: &str) -> Result<()> { diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index 1240ad2be55..1317f8c2a18 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -429,8 +429,9 @@ impl<'a> CreateIndexBuilder<'a> { } #[instrument(skip_all)] - async fn execute(mut self) -> Result<()> { + async fn execute(mut self) -> Result { let new_idx = self.execute_uncommitted().await?; + let index_uuid = new_idx.uuid; let transaction = Transaction::new( new_idx.dataset_version, Operation::CreateIndex { @@ -444,13 +445,23 @@ impl<'a> CreateIndexBuilder<'a> { .apply_commit(transaction, &Default::default(), &Default::default()) .await?; - Ok(()) + // Fetch the committed index metadata from the dataset. + // This ensures we return the version that may have been modified by the commit. 
+ let indices = self.dataset.load_indices().await?; + indices + .iter() + .find(|idx| idx.uuid == index_uuid) + .cloned() + .ok_or_else(|| Error::Internal { + message: format!("Index with UUID {} not found after commit", index_uuid), + location: location!(), + }) } } impl<'a> IntoFuture for CreateIndexBuilder<'a> { - type Output = Result<()>; - type IntoFuture = BoxFuture<'a, Result<()>>; + type Output = Result; + type IntoFuture = BoxFuture<'a, Result>; fn into_future(self) -> Self::IntoFuture { Box::pin(self.execute()) @@ -507,26 +518,22 @@ mod tests { let params = ScalarIndexParams::for_builtin(lance_index::scalar::BuiltinIndexType::BTree); // Create index on column with hyphen - CreateIndexBuilder::new(&mut dataset, &["user-id"], IndexType::BTree, ¶ms) + let idx1 = CreateIndexBuilder::new(&mut dataset, &["user-id"], IndexType::BTree, ¶ms) .execute() .await .unwrap(); - - let indices = dataset.load_indices().await.unwrap(); - assert_eq!(indices.len(), 1); - assert_eq!(indices[0].name, "user-id_idx"); + assert_eq!(idx1.name, "user-id_idx"); // Create index on column with colon - CreateIndexBuilder::new(&mut dataset, &["user:id"], IndexType::BTree, ¶ms) + let idx2 = CreateIndexBuilder::new(&mut dataset, &["user:id"], IndexType::BTree, ¶ms) .execute() .await .unwrap(); + assert_eq!(idx2.name, "user:id_idx"); + // Verify both indices exist let indices = dataset.load_indices().await.unwrap(); assert_eq!(indices.len(), 2); - let names: std::collections::HashSet<_> = indices.iter().map(|i| i.name.as_str()).collect(); - assert!(names.contains("user-id_idx")); - assert!(names.contains("user:id_idx")); } #[tokio::test] @@ -543,28 +550,24 @@ mod tests { // (a) Explicit name on first index, default on second that would collide // Create index on "a" with explicit name "b_idx" - CreateIndexBuilder::new(&mut dataset, &["a"], IndexType::BTree, ¶ms) + let idx1 = CreateIndexBuilder::new(&mut dataset, &["a"], IndexType::BTree, ¶ms) .name("b_idx".to_string()) .execute() .await 
.unwrap(); - - let indices = dataset.load_indices().await.unwrap(); - assert_eq!(indices.len(), 1); - assert_eq!(indices[0].name, "b_idx"); + assert_eq!(idx1.name, "b_idx"); // Create index on "b" with default name - would be "b_idx" but that's taken // so it should get "b_idx_2" - CreateIndexBuilder::new(&mut dataset, &["b"], IndexType::BTree, ¶ms) + let idx2 = CreateIndexBuilder::new(&mut dataset, &["b"], IndexType::BTree, ¶ms) .execute() .await .unwrap(); + assert_eq!(idx2.name, "b_idx_2"); + // Verify both indices exist let indices = dataset.load_indices().await.unwrap(); assert_eq!(indices.len(), 2); - let names: std::collections::HashSet<_> = indices.iter().map(|i| i.name.as_str()).collect(); - assert!(names.contains("b_idx")); - assert!(names.contains("b_idx_2")); } #[tokio::test] @@ -581,14 +584,11 @@ mod tests { // (b) Default name on first, explicit same name on second should error // Create index on "a" with default name "a_idx" - CreateIndexBuilder::new(&mut dataset, &["a"], IndexType::BTree, ¶ms) + let idx1 = CreateIndexBuilder::new(&mut dataset, &["a"], IndexType::BTree, ¶ms) .execute() .await .unwrap(); - - let indices = dataset.load_indices().await.unwrap(); - assert_eq!(indices.len(), 1); - assert_eq!(indices[0].name, "a_idx"); + assert_eq!(idx1.name, "a_idx"); // Try to create index on "b" with explicit name "a_idx" - should error let result = CreateIndexBuilder::new(&mut dataset, &["b"], IndexType::BTree, ¶ms) From aa81ce00fc2f08b696b18afaed601ef57bf025c2 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 7 Jan 2026 14:24:09 -0800 Subject: [PATCH 5/7] Return from create_index method too --- rust/lance-index/src/traits.rs | 4 +++- rust/lance/src/dataset/scanner.rs | 9 ++++++--- rust/lance/src/dataset/tests/dataset_index.rs | 20 +++++++++---------- rust/lance/src/index.rs | 4 ++-- 4 files changed, 21 insertions(+), 16 deletions(-) diff --git a/rust/lance-index/src/traits.rs b/rust/lance-index/src/traits.rs index 21647355039..efe5caf635a 100644 
--- a/rust/lance-index/src/traits.rs +++ b/rust/lance-index/src/traits.rs @@ -153,6 +153,8 @@ pub trait DatasetIndexExt { /// if not provided, it will auto-generate one. /// - `params`: index parameters. /// - `replace`: replace the existing index if it exists. + /// + /// Returns the metadata of the created index. async fn create_index( &mut self, columns: &[&str], @@ -160,7 +162,7 @@ pub trait DatasetIndexExt { name: Option, params: &dyn IndexParams, replace: bool, - ) -> Result<()>; + ) -> Result; /// Drop indices by name. /// diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index fc5f636f5b1..db3e2b17847 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -4146,7 +4146,8 @@ pub mod test_dataset { ¶ms, true, ) - .await + .await?; + Ok(()) } pub async fn make_scalar_index(&mut self) -> Result<()> { @@ -4158,14 +4159,16 @@ pub mod test_dataset { &ScalarIndexParams::default(), true, ) - .await + .await?; + Ok(()) } pub async fn make_fts_index(&mut self) -> Result<()> { let params = InvertedIndexParams::default().with_position(true); self.dataset .create_index(&["s"], IndexType::Inverted, None, ¶ms, true) - .await + .await?; + Ok(()) } pub async fn append_new_data(&mut self) -> Result<()> { diff --git a/rust/lance/src/dataset/tests/dataset_index.rs b/rust/lance/src/dataset/tests/dataset_index.rs index ce8254cbd52..62a4302701a 100644 --- a/rust/lance/src/dataset/tests/dataset_index.rs +++ b/rust/lance/src/dataset/tests/dataset_index.rs @@ -90,18 +90,18 @@ async fn test_create_index( // Make sure valid arguments should create index successfully let params = VectorIndexParams::ivf_pq(10, 8, 2, MetricType::L2, 50); - dataset + let index_meta = dataset .create_index(&["embeddings"], IndexType::Vector, None, ¶ms, true) .await .unwrap(); dataset.validate().await.unwrap(); + // Verify the returned metadata + assert_eq!(index_meta.name, "embeddings_idx"); // The version should match the table version it 
was created from. - let indices = dataset.load_indices().await.unwrap(); - let actual = indices.first().unwrap().dataset_version; let expected = dataset.manifest.version - 1; - assert_eq!(actual, expected); - let fragment_bitmap = indices.first().unwrap().fragment_bitmap.as_ref().unwrap(); + assert_eq!(index_meta.dataset_version, expected); + let fragment_bitmap = index_meta.fragment_bitmap.as_ref().unwrap(); assert_eq!(fragment_bitmap.len(), 1); assert!(fragment_bitmap.contains(0)); @@ -313,18 +313,18 @@ async fn test_create_int8_index( // Make sure valid arguments should create index successfully let params = VectorIndexParams::ivf_pq(10, 8, 2, MetricType::L2, 50); - dataset + let index_meta = dataset .create_index(&["embeddings"], IndexType::Vector, None, &params, true) .await .unwrap(); dataset.validate().await.unwrap(); + // Verify the returned metadata + assert_eq!(index_meta.name, "embeddings_idx"); // The version should match the table version it was created from. - let indices = dataset.load_indices().await.unwrap(); - let actual = indices.first().unwrap().dataset_version; let expected = dataset.manifest.version - 1; - assert_eq!(actual, expected); - let fragment_bitmap = indices.first().unwrap().fragment_bitmap.as_ref().unwrap(); + assert_eq!(index_meta.dataset_version, expected); + let fragment_bitmap = index_meta.fragment_bitmap.as_ref().unwrap(); assert_eq!(fragment_bitmap.len(), 1); assert!(fragment_bitmap.contains(0)); diff --git a/rust/lance/src/index.rs b/rust/lance/src/index.rs index db935daedc5..a2b02dd5cf4 100644 --- a/rust/lance/src/index.rs +++ b/rust/lance/src/index.rs @@ -594,7 +594,7 @@ impl DatasetIndexExt for Dataset { name: Option<String>, params: &dyn IndexParams, replace: bool, - ) -> Result<()> { + ) -> Result<IndexMetadata> { // Use the builder pattern with default train=true for backward compatibility let mut builder = self.create_index_builder(columns, index_type, params); if let Some(name) = name { builder = builder.name(name); }
- builder.replace(replace).await.map(|_| ()) + builder.replace(replace).await } async fn drop_index(&mut self, name: &str) -> Result<()> { From 8a46b24e1986023a0f9467df61e2af2656a2ebf7 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 7 Jan 2026 15:14:35 -0800 Subject: [PATCH 6/7] return IndexMetadata from Python and Java create_index APIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- java/lance-jni/src/blocking_dataset.rs | 107 +++++++++--------- java/src/main/java/org/lance/Dataset.java | 12 +- .../test/java/org/lance/ScalarIndexTest.java | 36 ++++-- python/src/dataset.rs | 16 +-- 4 files changed, 96 insertions(+), 75 deletions(-) diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index b7165088328..7d135d8622f 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -730,20 +730,20 @@ fn inner_release_native_dataset(env: &mut JNIEnv, obj: JObject) -> Result<()> { } #[no_mangle] -pub extern "system" fn Java_org_lance_Dataset_nativeCreateIndex( - mut env: JNIEnv, - java_dataset: JObject, - columns_jobj: JObject, // List<String> +pub extern "system" fn Java_org_lance_Dataset_nativeCreateIndex<'local>( + mut env: JNIEnv<'local>, + java_dataset: JObject<'local>, + columns_jobj: JObject<'local>, // List<String> index_type_code_jobj: jint, - name_jobj: JObject, // Optional<String> - params_jobj: JObject, // IndexParams - replace_jobj: jboolean, // replace - train_jobj: jboolean, // train - fragments_jobj: JObject, // List<Long> - index_uuid_jobj: JObject, // String - arrow_stream_addr_jobj: JObject, // Optional<Long> -) { - ok_or_throw_without_return!( + name_jobj: JObject<'local>, // Optional<String> + params_jobj: JObject<'local>, // IndexParams + replace_jobj: jboolean, // replace + train_jobj: jboolean, // train + fragments_jobj: JObject<'local>, // List<Long> + index_uuid_jobj:
JObject<'local>, // String + arrow_stream_addr_jobj: JObject<'local>, // Optional<Long> +) -> JObject<'local> { + ok_or_throw!( env, inner_create_index( &mut env, java_dataset, columns_jobj, index_type_code_jobj, name_jobj, params_jobj, replace_jobj, train_jobj, fragments_jobj, index_uuid_jobj, arrow_stream_addr_jobj, ) - ); + ) } #[allow(clippy::too_many_arguments)] -fn inner_create_index( - env: &mut JNIEnv, - java_dataset: JObject, - columns_jobj: JObject, // List<String> +fn inner_create_index<'local>( + env: &mut JNIEnv<'local>, + java_dataset: JObject<'local>, + columns_jobj: JObject<'local>, // List<String> index_type_code_jobj: jint, - name_jobj: JObject, // Optional<String> - params_jobj: JObject, // IndexParams - replace_jobj: jboolean, // replace - train_jobj: jboolean, // train - fragments_jobj: JObject, // Optional<List<Long>> - index_uuid_jobj: JObject, // Optional<String> - arrow_stream_addr_jobj: JObject, // Optional<Long> -) -> Result<()> { + name_jobj: JObject<'local>, // Optional<String> + params_jobj: JObject<'local>, // IndexParams + replace_jobj: jboolean, // replace + train_jobj: jboolean, // train + fragments_jobj: JObject<'local>, // Optional<List<Long>> + index_uuid_jobj: JObject<'local>, // Optional<String> + arrow_stream_addr_jobj: JObject<'local>, // Optional<Long> +) -> Result<JObject<'local>> { let columns = env.get_strings(&columns_jobj)?; let index_type = IndexType::try_from(index_type_code_jobj)?; let name = env.get_string_opt(&name_jobj)?; @@ -837,38 +837,43 @@ fn inner_create_index( }; let params = params_result?; - let mut dataset_guard = - unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; - let mut index_builder = dataset_guard - .inner - .create_index_builder(&columns_slice, index_type, params.as_ref()) - .replace(replace) - .train(train); + // Execute index creation in a block to ensure dataset_guard is dropped + // before we call into_java (which needs to borrow env again) + let index_metadata = { + let mut dataset_guard = + unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; + + let
mut index_builder = dataset_guard + .inner + .create_index_builder(&columns_slice, index_type, params.as_ref()) + .replace(replace) + .train(train); - if let Some(name) = name { - index_builder = index_builder.name(name); - } + if let Some(name) = name { + index_builder = index_builder.name(name); + } - if let Some(fragment_ids) = fragment_ids { - index_builder = index_builder.fragments(fragment_ids); - } + if let Some(fragment_ids) = fragment_ids { + index_builder = index_builder.fragments(fragment_ids); + } - if let Some(index_uuid) = index_uuid { - index_builder = index_builder.index_uuid(index_uuid); - } + if let Some(index_uuid) = index_uuid { + index_builder = index_builder.index_uuid(index_uuid); + } - if let Some(reader) = batch_reader { - index_builder = index_builder.preprocessed_data(Box::new(reader)); - } + if let Some(reader) = batch_reader { + index_builder = index_builder.preprocessed_data(Box::new(reader)); + } - if skip_commit { - RT.block_on(index_builder.execute_uncommitted())?; - } else { - RT.block_on(index_builder.into_future())? - } + if skip_commit { + RT.block_on(index_builder.execute_uncommitted())? + } else { + RT.block_on(index_builder.into_future())? + } + }; - Ok(()) + (&index_metadata).into_java(env) } fn should_skip_commit(index_type: IndexType, params_opt: &Option) -> Result { diff --git a/java/src/main/java/org/lance/Dataset.java b/java/src/main/java/org/lance/Dataset.java index eb66702ddf5..22c461b8b65 100644 --- a/java/src/main/java/org/lance/Dataset.java +++ b/java/src/main/java/org/lance/Dataset.java @@ -765,16 +765,17 @@ public void restore() { * @param name the name of the created index * @param params index params * @param replace whether to replace the existing index + * @return the metadata of the created index * @deprecated please use {@link Dataset#createIndex(IndexOptions)} instead. 
*/ @Deprecated - public void createIndex( + public Index createIndex( List<String> columns, IndexType indexType, Optional<String> name, IndexParams params, boolean replace) { - createIndex( + return createIndex( IndexOptions.builder(columns, indexType, params) .replace(replace) .withIndexName(name.orElse(null)) @@ -785,11 +786,12 @@ public void restore() { * Creates a new index on the dataset. * * @param options options for building index + * @return the metadata of the created index */ - public void createIndex(IndexOptions options) { + public Index createIndex(IndexOptions options) { try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) { Preconditions.checkArgument(nativeDatasetHandle != 0, "Dataset is closed"); - nativeCreateIndex( + return nativeCreateIndex( options.getColumns(), options.getIndexType().ordinal(), options.getIndexName(), @@ -802,7 +804,7 @@ public void createIndex(IndexOptions options) { } } - private native void nativeCreateIndex( + private native Index nativeCreateIndex( List<String> columns, int indexTypeCode, Optional<String> name, diff --git a/java/src/test/java/org/lance/ScalarIndexTest.java b/java/src/test/java/org/lance/ScalarIndexTest.java index fa18671323e..61ae66c94d2 100644 --- a/java/src/test/java/org/lance/ScalarIndexTest.java +++ b/java/src/test/java/org/lance/ScalarIndexTest.java @@ -53,6 +53,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; public class ScalarIndexTest { @@ -79,12 +80,18 @@ public void testCreateBTreeIndex() throws Exception { IndexParams indexParams = IndexParams.builder().setScalarIndexParams(scalarParams).build(); // Create BTree index on 'id' column - dataset.createIndex( - Collections.singletonList("id"), - IndexType.BTREE, - Optional.of("btree_id_index"), - indexParams, - true); + Index index = + dataset.createIndex( +
Collections.singletonList("id"), + IndexType.BTREE, + Optional.of("btree_id_index"), + indexParams, + true); + + // Verify the returned Index object + assertEquals("btree_id_index", index.name()); + assertNotNull(index.uuid()); + assertFalse(index.fields().isEmpty()); // Verify index was created and is in the list assertTrue( @@ -343,12 +350,17 @@ public void testCreateZonemapIndex() throws Exception { IndexParams indexParams = IndexParams.builder().setScalarIndexParams(scalarParams).build(); // Create Zonemap index on 'value' column - dataset.createIndex( - Collections.singletonList("value"), - IndexType.ZONEMAP, - Optional.of("zonemap_value_index"), - indexParams, - true); + Index index = + dataset.createIndex( + Collections.singletonList("value"), + IndexType.ZONEMAP, + Optional.of("zonemap_value_index"), + indexParams, + true); + + // Verify the returned Index object + assertEquals("zonemap_value_index", index.name()); + assertNotNull(index.uuid()); // Verify index was created assertTrue( diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 49a6397e818..57f270ae2c7 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -81,7 +81,7 @@ use lance_index::{ }; use lance_io::object_store::ObjectStoreParams; use lance_linalg::distance::MetricType; -use lance_table::format::{BasePath, Fragment}; +use lance_table::format::{BasePath, Fragment, IndexMetadata}; use lance_table::io::commit::CommitHandler; use crate::error::PythonErrorExt; @@ -1809,7 +1809,7 @@ impl Dataset { train: Option<bool>, storage_options: Option<HashMap<String, String>>, kwargs: Option<&Bound>, - ) -> PyResult<()> { + ) -> PyResult<PyLance<IndexMetadata>> { let columns: Vec<&str> = columns.iter().map(|s| &**s).collect(); let index_type = index_type.to_uppercase(); let idx_type = match index_type.as_str() { @@ -1979,19 +1979,21 @@ impl Dataset { use std::future::IntoFuture; // Use execute_uncommitted if fragment_ids is provided, otherwise use execute - if has_fragment_ids { + let index_metadata = if has_fragment_ids { // For
fragment-level indexing, use execute_uncommitted - let _index_metadata = rt() + let index_metadata = rt() .block_on(None, builder.execute_uncommitted())? .infer_error()?; // Note: We don't update self.ds here as the index is not committed + index_metadata } else { // For regular indexing, use the standard execute path - rt().block_on(None, builder.into_future())?.infer_error()?; + let index_metadata = rt().block_on(None, builder.into_future())?.infer_error()?; self.ds = Arc::new(new_self); - } + index_metadata + }; - Ok(()) + Ok(PyLance(index_metadata)) } fn drop_index(&mut self, name: &str) -> PyResult<()> { From 933402708329b243cacfba131d376b280692538b Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 15 Jan 2026 16:03:08 -0800 Subject: [PATCH 7/7] handle column with dots --- rust/lance/src/index/create.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/rust/lance/src/index/create.rs b/rust/lance/src/index/create.rs index 1317f8c2a18..7447be0fdc4 100644 --- a/rust/lance/src/index/create.rs +++ b/rust/lance/src/index/create.rs @@ -36,7 +36,11 @@ use arrow_array::RecordBatchReader; /// Joins field names with `.` to create the base index name. /// For example: `["meta-data", "user-id"]` -> `"meta-data.user-id"` fn default_index_name(fields: &[&str]) -> String { - fields.join(".") + if fields.iter().any(|f| f.contains('.')) { + format_field_path(fields) + } else { + fields.join(".") + } } pub struct CreateIndexBuilder<'a> { @@ -501,6 +505,12 @@ mod tests { "MetaData.userId" ); + // Path with dots in field names - escape + assert_eq!( + default_index_name(&["meta.data", "user.id"]), + "`meta.data`.`user.id`" + ); + // Empty input assert_eq!(default_index_name(&[]), ""); }