Skip to content

Commit c7f54b0

Browse files
wjones127 and claude
authored
feat!: define default index name and return IndexMetadata after building index (#5645)
BREAKING CHANGE: `create_index` now returns `IndexMetadata` of the new index in Rust and Java. Previously it returned nothing. (Python is unchanged, as it returns the Dataset itself, and changing that would be too disruptive.) Defines the behavior of default index names, particularly for ones with mixed case, non-alphanumeric, and nested fields. I tried to align it with how it was being done before as much as possible. For example, fields with dashes like `my-column` would get `my-column_idx`. This helps libraries that were relying on a predicable algorithm. However, I did notice we don't handle name collisions. So I added behavior for that. If there's already an existing index with that name, the default name gets a `_2` added. Finally, because we are choosing the name in the function, I made it so we return the `IndexMetadata` when you create the index in case you want to get back the name that was chosen. --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 92f3808 commit c7f54b0

10 files changed

Lines changed: 306 additions & 96 deletions

File tree

java/lance-jni/src/blocking_dataset.rs

Lines changed: 56 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -765,20 +765,20 @@ fn inner_release_native_dataset(env: &mut JNIEnv, obj: JObject) -> Result<()> {
765765
}
766766

767767
#[no_mangle]
768-
pub extern "system" fn Java_org_lance_Dataset_nativeCreateIndex(
769-
mut env: JNIEnv,
770-
java_dataset: JObject,
771-
columns_jobj: JObject, // List<String>
768+
pub extern "system" fn Java_org_lance_Dataset_nativeCreateIndex<'local>(
769+
mut env: JNIEnv<'local>,
770+
java_dataset: JObject<'local>,
771+
columns_jobj: JObject<'local>, // List<String>
772772
index_type_code_jobj: jint,
773-
name_jobj: JObject, // Optional<String>
774-
params_jobj: JObject, // IndexParams
775-
replace_jobj: jboolean, // replace
776-
train_jobj: jboolean, // train
777-
fragments_jobj: JObject, // List<Integer>
778-
index_uuid_jobj: JObject, // String
779-
arrow_stream_addr_jobj: JObject, // Optional<Long>
780-
) {
781-
ok_or_throw_without_return!(
773+
name_jobj: JObject<'local>, // Optional<String>
774+
params_jobj: JObject<'local>, // IndexParams
775+
replace_jobj: jboolean, // replace
776+
train_jobj: jboolean, // train
777+
fragments_jobj: JObject<'local>, // List<Integer>
778+
index_uuid_jobj: JObject<'local>, // String
779+
arrow_stream_addr_jobj: JObject<'local>, // Optional<Long>
780+
) -> JObject<'local> {
781+
ok_or_throw!(
782782
env,
783783
inner_create_index(
784784
&mut env,
@@ -793,23 +793,23 @@ pub extern "system" fn Java_org_lance_Dataset_nativeCreateIndex(
793793
index_uuid_jobj,
794794
arrow_stream_addr_jobj,
795795
)
796-
);
796+
)
797797
}
798798

799799
#[allow(clippy::too_many_arguments)]
800-
fn inner_create_index(
801-
env: &mut JNIEnv,
802-
java_dataset: JObject,
803-
columns_jobj: JObject, // List<String>
800+
fn inner_create_index<'local>(
801+
env: &mut JNIEnv<'local>,
802+
java_dataset: JObject<'local>,
803+
columns_jobj: JObject<'local>, // List<String>
804804
index_type_code_jobj: jint,
805-
name_jobj: JObject, // Optional<String>
806-
params_jobj: JObject, // IndexParams
807-
replace_jobj: jboolean, // replace
808-
train_jobj: jboolean, // train
809-
fragments_jobj: JObject, // Optional<List<String>>
810-
index_uuid_jobj: JObject, // Optional<String>
811-
arrow_stream_addr_jobj: JObject, // Optional<Long>
812-
) -> Result<()> {
805+
name_jobj: JObject<'local>, // Optional<String>
806+
params_jobj: JObject<'local>, // IndexParams
807+
replace_jobj: jboolean, // replace
808+
train_jobj: jboolean, // train
809+
fragments_jobj: JObject<'local>, // Optional<List<String>>
810+
index_uuid_jobj: JObject<'local>, // Optional<String>
811+
arrow_stream_addr_jobj: JObject<'local>, // Optional<Long>
812+
) -> Result<JObject<'local>> {
813813
let columns = env.get_strings(&columns_jobj)?;
814814
let index_type = IndexType::try_from(index_type_code_jobj)?;
815815
let name = env.get_string_opt(&name_jobj)?;
@@ -873,38 +873,43 @@ fn inner_create_index(
873873
};
874874

875875
let params = params_result?;
876-
let mut dataset_guard =
877-
unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?;
878876

879-
let mut index_builder = dataset_guard
880-
.inner
881-
.create_index_builder(&columns_slice, index_type, params.as_ref())
882-
.replace(replace)
883-
.train(train);
877+
// Execute index creation in a block to ensure dataset_guard is dropped
878+
// before we call into_java (which needs to borrow env again)
879+
let index_metadata = {
880+
let mut dataset_guard =
881+
unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?;
882+
883+
let mut index_builder = dataset_guard
884+
.inner
885+
.create_index_builder(&columns_slice, index_type, params.as_ref())
886+
.replace(replace)
887+
.train(train);
884888

885-
if let Some(name) = name {
886-
index_builder = index_builder.name(name);
887-
}
889+
if let Some(name) = name {
890+
index_builder = index_builder.name(name);
891+
}
888892

889-
if let Some(fragment_ids) = fragment_ids {
890-
index_builder = index_builder.fragments(fragment_ids);
891-
}
893+
if let Some(fragment_ids) = fragment_ids {
894+
index_builder = index_builder.fragments(fragment_ids);
895+
}
892896

893-
if let Some(index_uuid) = index_uuid {
894-
index_builder = index_builder.index_uuid(index_uuid);
895-
}
897+
if let Some(index_uuid) = index_uuid {
898+
index_builder = index_builder.index_uuid(index_uuid);
899+
}
896900

897-
if let Some(reader) = batch_reader {
898-
index_builder = index_builder.preprocessed_data(Box::new(reader));
899-
}
901+
if let Some(reader) = batch_reader {
902+
index_builder = index_builder.preprocessed_data(Box::new(reader));
903+
}
900904

901-
if skip_commit {
902-
RT.block_on(index_builder.execute_uncommitted())?;
903-
} else {
904-
RT.block_on(index_builder.into_future())?
905-
}
905+
if skip_commit {
906+
RT.block_on(index_builder.execute_uncommitted())?
907+
} else {
908+
RT.block_on(index_builder.into_future())?
909+
}
910+
};
906911

907-
Ok(())
912+
(&index_metadata).into_java(env)
908913
}
909914

910915
fn should_skip_commit(index_type: IndexType, params_opt: &Option<String>) -> Result<bool> {

java/src/main/java/org/lance/Dataset.java

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -823,16 +823,17 @@ public void restore() {
823823
* @param name the name of the created index
824824
* @param params index params
825825
* @param replace whether to replace the existing index
826+
* @return the metadata of the created index
826827
* @deprecated please use {@link Dataset#createIndex(IndexOptions)} instead.
827828
*/
828829
@Deprecated
829-
public void createIndex(
830+
public Index createIndex(
830831
List<String> columns,
831832
IndexType indexType,
832833
Optional<String> name,
833834
IndexParams params,
834835
boolean replace) {
835-
createIndex(
836+
return createIndex(
836837
IndexOptions.builder(columns, indexType, params)
837838
.replace(replace)
838839
.withIndexName(name.orElse(null))
@@ -843,11 +844,12 @@ public void createIndex(
843844
* Creates a new index on the dataset.
844845
*
845846
* @param options options for building index
847+
* @return the metadata of the created index
846848
*/
847-
public void createIndex(IndexOptions options) {
849+
public Index createIndex(IndexOptions options) {
848850
try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) {
849851
Preconditions.checkArgument(nativeDatasetHandle != 0, "Dataset is closed");
850-
nativeCreateIndex(
852+
return nativeCreateIndex(
851853
options.getColumns(),
852854
options.getIndexType().ordinal(),
853855
options.getIndexName(),
@@ -860,7 +862,7 @@ public void createIndex(IndexOptions options) {
860862
}
861863
}
862864

863-
private native void nativeCreateIndex(
865+
private native Index nativeCreateIndex(
864866
List<String> columns,
865867
int indexTypeCode,
866868
Optional<String> name,

java/src/test/java/org/lance/ScalarIndexTest.java

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353

5454
import static org.junit.jupiter.api.Assertions.assertEquals;
5555
import static org.junit.jupiter.api.Assertions.assertFalse;
56+
import static org.junit.jupiter.api.Assertions.assertNotNull;
5657
import static org.junit.jupiter.api.Assertions.assertTrue;
5758

5859
public class ScalarIndexTest {
@@ -79,12 +80,18 @@ public void testCreateBTreeIndex() throws Exception {
7980
IndexParams indexParams = IndexParams.builder().setScalarIndexParams(scalarParams).build();
8081

8182
// Create BTree index on 'id' column
82-
dataset.createIndex(
83-
Collections.singletonList("id"),
84-
IndexType.BTREE,
85-
Optional.of("btree_id_index"),
86-
indexParams,
87-
true);
83+
Index index =
84+
dataset.createIndex(
85+
Collections.singletonList("id"),
86+
IndexType.BTREE,
87+
Optional.of("btree_id_index"),
88+
indexParams,
89+
true);
90+
91+
// Verify the returned Index object
92+
assertEquals("btree_id_index", index.name());
93+
assertNotNull(index.uuid());
94+
assertFalse(index.fields().isEmpty());
8895

8996
// Verify index was created and is in the list
9097
assertTrue(
@@ -343,12 +350,17 @@ public void testCreateZonemapIndex() throws Exception {
343350
IndexParams indexParams = IndexParams.builder().setScalarIndexParams(scalarParams).build();
344351

345352
// Create Zonemap index on 'value' column
346-
dataset.createIndex(
347-
Collections.singletonList("value"),
348-
IndexType.ZONEMAP,
349-
Optional.of("zonemap_value_index"),
350-
indexParams,
351-
true);
353+
Index index =
354+
dataset.createIndex(
355+
Collections.singletonList("value"),
356+
IndexType.ZONEMAP,
357+
Optional.of("zonemap_value_index"),
358+
indexParams,
359+
true);
360+
361+
// Verify the returned Index object
362+
assertEquals("zonemap_value_index", index.name());
363+
assertNotNull(index.uuid());
352364

353365
// Verify index was created
354366
assertTrue(

python/python/tests/test_column_names.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ def test_scalar_index_with_mixed_case(self, mixed_case_dataset):
8787
indices = mixed_case_dataset.list_indices()
8888
assert len(indices) == 1
8989
assert indices[0]["fields"] == ["userId"]
90+
assert indices[0]["name"] == "userId_idx"
9091

9192
# Query using the indexed column
9293
result = mixed_case_dataset.to_table(filter="userId = 50")
@@ -96,6 +97,9 @@ def test_scalar_index_with_mixed_case(self, mixed_case_dataset):
9697
plan = mixed_case_dataset.scanner(filter="userId = 50").explain_plan()
9798
assert "ScalarIndexQuery" in plan
9899

100+
stats = mixed_case_dataset.stats.index_stats("userId_idx")
101+
assert stats["index_type"] == "BTree"
102+
99103
def test_alter_column_with_mixed_case(self, mixed_case_dataset):
100104
"""Altering columns works with mixed-case column names."""
101105
# alter_columns uses direct schema lookup, not SQL parsing
@@ -347,6 +351,7 @@ def test_scalar_index_with_special_chars(self, special_char_dataset):
347351
assert len(indices) == 1
348352
# Field with special chars is returned in quoted format for SQL compatibility
349353
assert indices[0]["fields"] == ["`user-id`"]
354+
assert indices[0]["name"] == "user-id_idx"
350355

351356
# Query using the indexed column (requires backticks in filter)
352357
result = special_char_dataset.to_table(filter="`user-id` = 50")
@@ -356,6 +361,9 @@ def test_scalar_index_with_special_chars(self, special_char_dataset):
356361
plan = special_char_dataset.scanner(filter="`user-id` = 50").explain_plan()
357362
assert "ScalarIndexQuery" in plan
358363

364+
stats = special_char_dataset.stats.index_stats("user-id_idx")
365+
assert stats["index_type"] == "BTree"
366+
359367
def test_alter_column_with_special_chars(self, special_char_dataset):
360368
"""Altering columns works with special character column names."""
361369
# alter_columns uses direct schema lookup
@@ -455,6 +463,7 @@ def test_scalar_index_with_nested_mixed_case(self, nested_mixed_case_dataset):
455463
indices = nested_mixed_case_dataset.list_indices()
456464
assert len(indices) == 1
457465
assert indices[0]["fields"] == ["MetaData.userId"]
466+
assert indices[0]["name"] == "MetaData.userId_idx"
458467

459468
# Query using the indexed column
460469
result = nested_mixed_case_dataset.to_table(filter="MetaData.userId = 50")
@@ -466,20 +475,27 @@ def test_scalar_index_with_nested_mixed_case(self, nested_mixed_case_dataset):
466475
).explain_plan()
467476
assert "ScalarIndexQuery" in plan
468477

478+
stats = nested_mixed_case_dataset.stats.index_stats("MetaData.userId_idx")
479+
assert stats["index_type"] == "BTree"
480+
469481
def test_scalar_index_on_top_level_mixed_case(self, nested_mixed_case_dataset):
470482
"""Scalar index on top-level mixed-case column works."""
471483
nested_mixed_case_dataset.create_scalar_index("rowId", index_type="BTREE")
472484

473485
indices = nested_mixed_case_dataset.list_indices()
474486
assert len(indices) == 1
475487
assert indices[0]["fields"] == ["rowId"]
488+
assert indices[0]["name"] == "rowId_idx"
476489

477490
result = nested_mixed_case_dataset.to_table(filter="rowId = 50")
478491
assert result.num_rows == 1
479492

480493
plan = nested_mixed_case_dataset.scanner(filter="rowId = 50").explain_plan()
481494
assert "ScalarIndexQuery" in plan
482495

496+
stats = nested_mixed_case_dataset.stats.index_stats("rowId_idx")
497+
assert stats["index_type"] == "BTree"
498+
483499
def test_scalar_index_with_lowercased_nested_path(self, nested_mixed_case_dataset):
484500
"""Scalar index creation should work even when path is lowercased.
485501
@@ -562,6 +578,7 @@ def test_scalar_index_with_nested_special_chars(self, nested_special_char_datase
562578
assert len(indices) == 1
563579
# Fields with special chars are returned in quoted format for SQL compatibility
564580
assert indices[0]["fields"] == ["`meta-data`.`user-id`"]
581+
assert indices[0]["name"] == "meta-data.user-id_idx"
565582

566583
# Query using the indexed column (backticks required in filter)
567584
result = nested_special_char_dataset.to_table(
@@ -575,6 +592,9 @@ def test_scalar_index_with_nested_special_chars(self, nested_special_char_datase
575592
).explain_plan()
576593
assert "ScalarIndexQuery" in plan
577594

595+
stats = nested_special_char_dataset.stats.index_stats("meta-data.user-id_idx")
596+
assert stats["index_type"] == "BTree"
597+
578598
def test_scalar_index_on_top_level_special_chars(self, nested_special_char_dataset):
579599
"""Scalar index on top-level special char column works."""
580600
nested_special_char_dataset.create_scalar_index("`row-id`", index_type="BTREE")

python/src/dataset.rs

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ use lance_index::{
8181
};
8282
use lance_io::object_store::ObjectStoreParams;
8383
use lance_linalg::distance::MetricType;
84-
use lance_table::format::{BasePath, Fragment};
84+
use lance_table::format::{BasePath, Fragment, IndexMetadata};
8585
use lance_table::io::commit::CommitHandler;
8686

8787
use crate::error::PythonErrorExt;
@@ -1834,7 +1834,7 @@ impl Dataset {
18341834
train: Option<bool>,
18351835
storage_options: Option<HashMap<String, String>>,
18361836
kwargs: Option<&Bound<PyDict>>,
1837-
) -> PyResult<()> {
1837+
) -> PyResult<PyLance<IndexMetadata>> {
18381838
let columns: Vec<&str> = columns.iter().map(|s| &**s).collect();
18391839
let index_type = index_type.to_uppercase();
18401840
let idx_type = match index_type.as_str() {
@@ -2009,19 +2009,21 @@ impl Dataset {
20092009
use std::future::IntoFuture;
20102010

20112011
// Use execute_uncommitted if fragment_ids is provided, otherwise use execute
2012-
if has_fragment_ids {
2012+
let index_metadata = if has_fragment_ids {
20132013
// For fragment-level indexing, use execute_uncommitted
2014-
let _index_metadata = rt()
2014+
let index_metadata = rt()
20152015
.block_on(None, builder.execute_uncommitted())?
20162016
.infer_error()?;
20172017
// Note: We don't update self.ds here as the index is not committed
2018+
index_metadata
20182019
} else {
20192020
// For regular indexing, use the standard execute path
2020-
rt().block_on(None, builder.into_future())?.infer_error()?;
2021+
let index_metadata = rt().block_on(None, builder.into_future())?.infer_error()?;
20212022
self.ds = Arc::new(new_self);
2022-
}
2023+
index_metadata
2024+
};
20232025

2024-
Ok(())
2026+
Ok(PyLance(index_metadata))
20252027
}
20262028

20272029
fn drop_index(&mut self, name: &str) -> PyResult<()> {

rust/lance-index/src/traits.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,14 +153,16 @@ pub trait DatasetIndexExt {
153153
/// if not provided, it will auto-generate one.
154154
/// - `params`: index parameters.
155155
/// - `replace`: replace the existing index if it exists.
156+
///
157+
/// Returns the metadata of the created index.
156158
async fn create_index(
157159
&mut self,
158160
columns: &[&str],
159161
index_type: IndexType,
160162
name: Option<String>,
161163
params: &dyn IndexParams,
162164
replace: bool,
163-
) -> Result<()>;
165+
) -> Result<IndexMetadata>;
164166

165167
/// Drop indices by name.
166168
///

0 commit comments

Comments
 (0)