28 changes: 28 additions & 0 deletions python/python/lance/dataset.py
@@ -5189,6 +5189,34 @@ def analyze_plan(self) -> str:

return self._scanner.analyze_plan()

Author comment: Will need to update this to include both `max_split_size_bytes` and `max_row_count` options, with one taking precedence over the other if both are provided. I'm interested in whether people think this paradigm is useful. My intuition is that, since we estimate row sizes from the schema, we could be VERY wrong (we just use 64 B for anything without a known size; a string or blob could be anywhere from 1 B to 1 MB+). In these scenarios a user will know their data better and can use `max_row_count` to target a partition size. So hopefully the estimation works well for most use cases, and there are knobs to fine-tune in the others.

    def plan_splits(
        self, max_split_size_bytes: Optional[int] = None
    ) -> List[List["FragmentMetadata"]]:
"""Plan splits for distributed scanning.

This method analyzes the scanner's filter and uses indices to determine
which fragments need to be scanned and approximately how many rows each
fragment will return. It then groups fragments into splits that can be
processed independently.

The scanner estimates the size of each row based on the output schema
projection and uses that to determine how many rows fit within the
target split size.

Parameters
----------
max_split_size_bytes : int, optional
The target maximum size in bytes for each split. Defaults to 128MB.

Returns
-------
List[List[FragmentMetadata]]
A list of splits, where each split is a list of FragmentMetadata objects.
Each split can be processed independently for distributed scanning.
"""

return self._scanner.plan_splits(max_split_size_bytes=max_split_size_bytes)
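To make the estimation-plus-packing behavior described in the docstring concrete, here is a minimal self-contained sketch in plain Python. The field widths, the 64-byte fallback for variable-size types, and every name below are illustrative assumptions, not the actual Lance implementation:

```python
# Hypothetical sketch: estimate bytes-per-row from a schema, then greedily
# pack fragments into splits no larger than max_split_size_bytes.
FIXED_WIDTHS = {"int32": 4, "int64": 8, "float32": 4, "float64": 8, "bool": 1}
FALLBACK_WIDTH = 64  # guess for variable-size types (string, blob, ...)


def estimate_row_bytes(schema: dict) -> int:
    """Sum per-field widths, falling back to 64 B for unknown-size types."""
    return sum(FIXED_WIDTHS.get(t, FALLBACK_WIDTH) for t in schema.values())


def plan_splits(fragments, schema, max_split_size_bytes=128 * 1024 * 1024):
    """Greedy first-fit packing: start a new split when the next fragment
    would push the current split past the byte budget.

    `fragments` is a list of (fragment_id, estimated_row_count) pairs.
    """
    row_bytes = estimate_row_bytes(schema)
    splits, current, current_bytes = [], [], 0
    for frag_id, row_count in fragments:
        frag_bytes = row_count * row_bytes
        # Close the current split if adding this fragment would overflow it.
        if current and current_bytes + frag_bytes > max_split_size_bytes:
            splits.append(current)
            current, current_bytes = [], 0
        current.append(frag_id)
        current_bytes += frag_bytes
    if current:
        splits.append(current)
    return splits
```

With `{"id": "int64", "title": "string"}` the estimate is 8 + 64 = 72 B/row, so two fragments of one million rows each (~72 MB apiece) land in separate splits under the default 128 MB budget. This also illustrates the author comment above: if `title` values are actually kilobytes each, the estimate is far too low, which is where a row-count cap would help.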


class DatasetOptimizer:
def __init__(self, dataset: LanceDataset):
28 changes: 27 additions & 1 deletion python/src/scanner.rs
@@ -20,7 +20,7 @@ use std::sync::Arc;

use arrow::pyarrow::*;
use arrow_array::RecordBatchReader;
use lance::dataset::scanner::ExecutionSummaryCounts;
use lance::dataset::scanner::{ExecutionSummaryCounts, SplitPackStrategy};
use pyo3::prelude::*;
use pyo3::pyclass;

@@ -30,6 +30,7 @@ use pyo3::exceptions::PyValueError;
use crate::reader::LanceReader;
use crate::rt;
use crate::schema::logical_arrow_schema;
use crate::utils::PyLance;

/// This will be wrapped by a python class to provide
/// additional functionality
@@ -150,4 +151,29 @@ impl Scanner {

Ok(PyArrowType(Box::new(reader)))
}

#[pyo3(signature = (max_split_size_bytes=None))]
fn plan_splits<'py>(
    self_: PyRef<'py, Self>,
    max_split_size_bytes: Option<usize>,
) -> PyResult<Vec<Vec<Bound<'py, PyAny>>>> {
    let scanner = self_.scanner.clone();
    let strategy = max_split_size_bytes.map(SplitPackStrategy::MaxSizeBytes);
    // Run the async planning on the background runtime, then surface any
    // Lance error as a Python ValueError.
    let splits = rt()
        .spawn(Some(self_.py()), async move {
            scanner.plan_splits(strategy).await
        })?
        .map_err(|err| PyValueError::new_err(err.to_string()))?;

    // Convert each split's Rust fragments into Python objects.
    splits
        .into_iter()
        .map(|split| {
            split
                .fragments
                .into_iter()
                .map(|sf| PyLance(sf.fragment).into_pyobject(self_.py()))
                .collect::<Result<Vec<_>, _>>()
        })
        .collect::<Result<Vec<_>, _>>()
}
}
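On the consuming side, each split returned by `plan_splits` can be scanned by an independent worker. A minimal sketch of how a caller might fan splits out across a pool (plain Python; the round-robin policy and all names are illustrative, not part of the Lance API):

```python
# Hypothetical fan-out: split i goes to worker i % num_workers. Splits are
# represented here as plain lists of fragment ids rather than Lance objects.
def assign_splits(splits, num_workers):
    """Round-robin assignment of splits to a fixed worker pool."""
    assignment = {w: [] for w in range(num_workers)}
    for i, split in enumerate(splits):
        assignment[i % num_workers].append(split)
    return assignment
```

Because each split is self-contained, any scheduling policy works here; round-robin is just the simplest choice when splits are roughly equal in size, which is what the byte-budget packing aims for.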
4 changes: 4 additions & 0 deletions rust/lance-core/src/utils/mask.rs
@@ -657,6 +657,10 @@ impl RowAddrTreeMap {
}),
})
}

/// Returns the ids of all fragments covered by this map (the tree map's keys).
pub fn fragments(&self) -> Vec<u32> {
    self.inner.keys().cloned().collect()
}
}

impl std::ops::BitOr<Self> for RowAddrTreeMap {
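The `fragments()` accessor added above exposes the distinct fragment ids covered by a set of row addresses, which is what split planning needs from the index results. Assuming Lance's u64 row-address layout (fragment id in the upper 32 bits, row offset in the lower 32 — an assumption stated here, not shown in this diff), the equivalent in plain Python is:

```python
# Hypothetical stand-in for RowAddrTreeMap::fragments(): given packed u64
# row addresses, recover the distinct fragment ids from the upper 32 bits.
def fragment_ids(row_addrs):
    """Distinct fragment ids in sorted order (tree map keys are sorted)."""
    return sorted({addr >> 32 for addr in row_addrs})
```

Sorted, deduplicated output mirrors a BTreeMap's key iteration order.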