Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5144,6 +5144,7 @@ def compact_files(
max_bytes_per_file: Optional[int] = None,
materialize_deletions: bool = True,
materialize_deletions_threshold: float = 0.1,
defer_index_remap: bool = False,
num_threads: Optional[int] = None,
batch_size: Optional[int] = None,
) -> CompactionMetrics:
Expand Down Expand Up @@ -5185,6 +5186,8 @@ def compact_files(
materialize_deletions_threshold: float, default 0.1
The fraction of original rows that are soft deleted in a fragment
before the fragment is a candidate for compaction.
defer_index_remap: bool, default False
Whether to defer index remapping during compaction.
num_threads: int, optional
The number of threads to use when performing compaction. If not
specified, defaults to the number of cores on the machine.
Expand All @@ -5208,6 +5211,7 @@ def compact_files(
max_bytes_per_file=max_bytes_per_file,
materialize_deletions=materialize_deletions,
materialize_deletions_threshold=materialize_deletions_threshold,
defer_index_remap=defer_index_remap,
num_threads=num_threads,
batch_size=batch_size,
)
Expand Down
4 changes: 4 additions & 0 deletions python/python/lance/optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,7 @@ class CompactionOptions(TypedDict):

The default will use the same default from ``scanner``.
"""
defer_index_remap: Optional[bool]
"""
Whether to defer index remapping during compaction (default: False).
"""
17 changes: 17 additions & 0 deletions python/python/tests/test_optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,23 @@ def test_index_remapping_multiple_rewrite_tasks(tmp_path: Path):
assert index_frag_ids[0] in frag_ids


def test_defer_index_remap(tmp_path: Path):
base_dir = tmp_path / "dataset"
data = pa.table({"i": range(6_000), "val": range(6_000)})
dataset = lance.write_dataset(data, base_dir, max_rows_per_file=1_000)

options = dict(
target_rows_per_fragment=2_000, defer_index_remap=True, num_threads=1
)

dataset.delete("i < 500")
dataset.optimize.compact_files(**options)

dataset = lance.dataset(base_dir)
indices = dataset.list_indices()
assert any(idx["name"] == "__lance_frag_reuse" for idx in indices)


def test_dataset_distributed_optimize(tmp_path: Path):
base_dir = tmp_path / "dataset"
data = pa.table({"a": range(800), "b": range(800)})
Expand Down
3 changes: 3 additions & 0 deletions python/src/dataset/optimize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ fn parse_compaction_options(options: &Bound<'_, PyDict>) -> PyResult<CompactionO
"materialize_deletions_threshold" => {
opts.materialize_deletions_threshold = value.extract()?;
}
"defer_index_remap" => {
opts.defer_index_remap = value.extract()?;
}
"num_threads" => {
opts.num_threads = value.extract()?;
}
Expand Down