diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py
index ca9045adaf..88c984c9a3 100644
--- a/python/python/lance/dataset.py
+++ b/python/python/lance/dataset.py
@@ -5144,6 +5144,7 @@ def compact_files(
         max_bytes_per_file: Optional[int] = None,
         materialize_deletions: bool = True,
         materialize_deletions_threshold: float = 0.1,
+        defer_index_remap: bool = False,
         num_threads: Optional[int] = None,
         batch_size: Optional[int] = None,
     ) -> CompactionMetrics:
@@ -5185,6 +5186,8 @@ def compact_files(
         materialize_deletions_threshold: float, default 0.1
            The fraction of original rows that are soft deleted in a fragment
            before the fragment is a candidate for compaction.
+        defer_index_remap: bool, default False
+            Whether to defer index remapping during compaction.
         num_threads: int, optional
            The number of threads to use when performing compaction. If not
            specified, defaults to the number of cores on the machine.
@@ -5208,6 +5211,7 @@ def compact_files(
             max_bytes_per_file=max_bytes_per_file,
             materialize_deletions=materialize_deletions,
             materialize_deletions_threshold=materialize_deletions_threshold,
+            defer_index_remap=defer_index_remap,
             num_threads=num_threads,
             batch_size=batch_size,
         )
diff --git a/python/python/lance/optimize.py b/python/python/lance/optimize.py
index f04e9264c3..8aa37a20c1 100644
--- a/python/python/lance/optimize.py
+++ b/python/python/lance/optimize.py
@@ -59,3 +59,7 @@ class CompactionOptions(TypedDict):
     The default will use the same default from ``scanner``.
     """
 
+    defer_index_remap: Optional[bool]
+    """
+    Whether to defer index remapping during compaction (default: False).
+    """
diff --git a/python/python/tests/test_optimize.py b/python/python/tests/test_optimize.py
index 1f23f3bac4..fded1a28aa 100644
--- a/python/python/tests/test_optimize.py
+++ b/python/python/tests/test_optimize.py
@@ -304,6 +304,23 @@ def test_index_remapping_multiple_rewrite_tasks(tmp_path: Path):
     assert index_frag_ids[0] in frag_ids
 
 
+def test_defer_index_remap(tmp_path: Path):
+    base_dir = tmp_path / "dataset"
+    data = pa.table({"i": range(6_000), "val": range(6_000)})
+    dataset = lance.write_dataset(data, base_dir, max_rows_per_file=1_000)
+
+    options = dict(
+        target_rows_per_fragment=2_000, defer_index_remap=True, num_threads=1
+    )
+
+    dataset.delete("i < 500")
+    dataset.optimize.compact_files(**options)
+
+    dataset = lance.dataset(base_dir)
+    indices = dataset.list_indices()
+    assert any(idx["name"] == "__lance_frag_reuse" for idx in indices)
+
+
 def test_dataset_distributed_optimize(tmp_path: Path):
     base_dir = tmp_path / "dataset"
     data = pa.table({"a": range(800), "b": range(800)})
diff --git a/python/src/dataset/optimize.rs b/python/src/dataset/optimize.rs
index d398f5b7ce..b62270f62b 100644
--- a/python/src/dataset/optimize.rs
+++ b/python/src/dataset/optimize.rs
@@ -45,6 +45,9 @@ fn parse_compaction_options(options: &Bound<'_, PyDict>) -> PyResult<CompactionOptions> {
             "materialize_deletions_threshold" => {
                 opts.materialize_deletions_threshold = value.extract()?;
             }
+            "defer_index_remap" => {
+                opts.defer_index_remap = value.extract()?;
+            }
             "num_threads" => {
                 opts.num_threads = value.extract()?;
             }