Merged

28 commits
4c34258
Support row group limit pruning
xudong963 Oct 31, 2025
1d78b6f
Support row group limit pruning
xudong963 Nov 21, 2025
d1fc3bd
Add fetch_order_sensitive during limit pushdown to decide if use limi…
xudong963 Nov 25, 2025
8170789
fix test format
xudong963 Nov 25, 2025
187e10b
Rename to preserve_order
xudong963 Nov 27, 2025
d6dc4b7
refactor pushdown limit
xudong963 Nov 27, 2025
62e1725
extract some logic into identify_fully_matched_row_groups
xudong963 Nov 27, 2025
0229bd5
resolve conflicts
xudong963 Nov 27, 2025
330775f
Add end to end sqllogictest
xudong963 Nov 28, 2025
e50361c
resolve conflicts
xudong963 Dec 18, 2025
321429c
redesign
xudong963 Dec 18, 2025
31ae9cf
use required_ordering
xudong963 Dec 22, 2025
4602a76
resolve conflicts
xudong963 Dec 24, 2025
e09a192
resolve newest review
xudong963 Jan 7, 2026
56cda2d
remove scratch
xudong963 Jan 7, 2026
a875d41
remove scratch
xudong963 Jan 7, 2026
719fa82
fix clippy
xudong963 Jan 7, 2026
3540fd3
refine comments
xudong963 Jan 7, 2026
8d60e96
fix test
xudong963 Jan 7, 2026
88c1c2e
refine comments
xudong963 Jan 8, 2026
f67193b
rich comment
xudong963 Jan 8, 2026
038285e
remove downcast
xudong963 Jan 9, 2026
661a2c2
remove dependency
xudong963 Jan 9, 2026
ca7de4f
add an example
xudong963 Jan 13, 2026
d2b84d4
fix doc test
xudong963 Jan 13, 2026
6c515b2
update doc
xudong963 Jan 15, 2026
d80a354
Merge branch 'main' into row_group_limit_pruning
xudong963 Jan 15, 2026
fd6e7c8
Merge branch 'main' into row_group_limit_pruning
xudong963 Jan 16, 2026
105 changes: 88 additions & 17 deletions datafusion/core/tests/parquet/mod.rs
@@ -30,6 +30,7 @@ use arrow::{
     record_batch::RecordBatch,
     util::pretty::pretty_format_batches,
 };
+use arrow_schema::SchemaRef;
 use chrono::{Datelike, Duration, TimeDelta};
 use datafusion::{
     datasource::{TableProvider, provider_as_source},
@@ -110,6 +111,26 @@ struct ContextWithParquet {
     ctx: SessionContext,
 }
 
+struct PruningMetric {
+    total_pruned: usize,
+    total_matched: usize,
+    total_fully_matched: usize,
+}
+
+impl PruningMetric {
+    pub fn total_pruned(&self) -> usize {
+        self.total_pruned
+    }
+
+    pub fn total_matched(&self) -> usize {
+        self.total_matched
+    }
+
+    pub fn total_fully_matched(&self) -> usize {
+        self.total_fully_matched
+    }
+}
+
 /// The output of running one of the test cases
 struct TestOutput {
     /// The input query SQL
@@ -127,8 +148,8 @@ impl TestOutput {
 impl TestOutput {
     /// retrieve the value of the named metric, if any
     fn metric_value(&self, metric_name: &str) -> Option<usize> {
-        if let Some((pruned, _matched)) = self.pruning_metric(metric_name) {
-            return Some(pruned);
+        if let Some(pm) = self.pruning_metric(metric_name) {
+            return Some(pm.total_pruned());
         }
 
         self.parquet_metrics
@@ -141,9 +162,10 @@ impl TestOutput {
             })
     }
 
-    fn pruning_metric(&self, metric_name: &str) -> Option<(usize, usize)> {
+    fn pruning_metric(&self, metric_name: &str) -> Option<PruningMetric> {
         let mut total_pruned = 0;
         let mut total_matched = 0;
+        let mut total_fully_matched = 0;
         let mut found = false;
 
         for metric in self.parquet_metrics.iter() {
@@ -155,12 +177,18 @@ impl TestOutput {
             {
                 total_pruned += pruning_metrics.pruned();
                 total_matched += pruning_metrics.matched();
+                total_fully_matched += pruning_metrics.fully_matched();
+
                 found = true;
             }
         }
 
         if found {
-            Some((total_pruned, total_matched))
+            Some(PruningMetric {
+                total_pruned,
+                total_matched,
+                total_fully_matched,
+            })
         } else {
             None
         }
@@ -172,27 +200,33 @@ impl TestOutput {
     }
 
     /// The number of row_groups pruned / matched by bloom filter
-    fn row_groups_bloom_filter(&self) -> Option<(usize, usize)> {
+    fn row_groups_bloom_filter(&self) -> Option<PruningMetric> {
         self.pruning_metric("row_groups_pruned_bloom_filter")
     }
 
     /// The number of row_groups matched by statistics
     fn row_groups_matched_statistics(&self) -> Option<usize> {
         self.pruning_metric("row_groups_pruned_statistics")
-            .map(|(_pruned, matched)| matched)
+            .map(|pm| pm.total_matched())
     }
 
+    /// The number of row_groups fully matched by statistics
+    fn row_groups_fully_matched_statistics(&self) -> Option<usize> {
+        self.pruning_metric("row_groups_pruned_statistics")
+            .map(|pm| pm.total_fully_matched())
+    }
+
     /// The number of row_groups pruned by statistics
     fn row_groups_pruned_statistics(&self) -> Option<usize> {
         self.pruning_metric("row_groups_pruned_statistics")
-            .map(|(pruned, _matched)| pruned)
+            .map(|pm| pm.total_pruned())
     }
 
     /// Metric `files_ranges_pruned_statistics` tracks both pruned and matched count,
     /// for testing purpose, here it only aggregate the `pruned` count.
     fn files_ranges_pruned_statistics(&self) -> Option<usize> {
         self.pruning_metric("files_ranges_pruned_statistics")
-            .map(|(pruned, _matched)| pruned)
+            .map(|pm| pm.total_pruned())
     }
 
     /// The number of row_groups matched by bloom filter or statistics
@@ -201,22 +235,27 @@ impl TestOutput {
     /// filter: 7 total -> 3 matched, this function returns 3 for the final matched
     /// count.
     fn row_groups_matched(&self) -> Option<usize> {
-        self.row_groups_bloom_filter()
-            .map(|(_pruned, matched)| matched)
+        self.row_groups_bloom_filter().map(|pm| pm.total_matched())
     }
 
     /// The number of row_groups pruned
     fn row_groups_pruned(&self) -> Option<usize> {
         self.row_groups_bloom_filter()
-            .map(|(pruned, _matched)| pruned)
+            .map(|pm| pm.total_pruned())
             .zip(self.row_groups_pruned_statistics())
             .map(|(a, b)| a + b)
     }
 
     /// The number of row pages pruned
     fn row_pages_pruned(&self) -> Option<usize> {
         self.pruning_metric("page_index_rows_pruned")
-            .map(|(pruned, _matched)| pruned)
+            .map(|pm| pm.total_pruned())
     }
 
+    /// The number of row groups pruned by limit pruning
+    fn limit_pruned_row_groups(&self) -> Option<usize> {
+        self.pruning_metric("limit_pruned_row_groups")
+            .map(|pm| pm.total_pruned())
+    }
+
     fn description(&self) -> String {
@@ -232,20 +271,41 @@ impl TestOutput {
 /// and the appropriate scenario
 impl ContextWithParquet {
     async fn new(scenario: Scenario, unit: Unit) -> Self {
-        Self::with_config(scenario, unit, SessionConfig::new()).await
+        Self::with_config(scenario, unit, SessionConfig::new(), None, None).await
     }
 
+    /// Set custom schema and batches for the test
+    pub async fn with_custom_data(
+        scenario: Scenario,
+        unit: Unit,
+        schema: Arc<Schema>,
+        batches: Vec<RecordBatch>,
+    ) -> Self {
+        Self::with_config(
+            scenario,
+            unit,
+            SessionConfig::new(),
+            Some(schema),
+            Some(batches),
+        )
+        .await
+    }
+
     async fn with_config(
         scenario: Scenario,
         unit: Unit,
         mut config: SessionConfig,
+        custom_schema: Option<SchemaRef>,
+        custom_batches: Option<Vec<RecordBatch>>,
     ) -> Self {
         // Use a single partition for deterministic results no matter how many CPUs the host has
         config = config.with_target_partitions(1);
         let file = match unit {
             Unit::RowGroup(row_per_group) => {
                 config = config.with_parquet_bloom_filter_pruning(true);
-                make_test_file_rg(scenario, row_per_group).await
+                config.options_mut().execution.parquet.pushdown_filters = true;
+                make_test_file_rg(scenario, row_per_group, custom_schema, custom_batches)
+                    .await
             }
             Unit::Page(row_per_page) => {
                 config = config.with_parquet_page_index_pruning(true);
@@ -1075,7 +1135,12 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
 }
 
 /// Create a test parquet file with various data types
-async fn make_test_file_rg(scenario: Scenario, row_per_group: usize) -> NamedTempFile {
+async fn make_test_file_rg(
+    scenario: Scenario,
+    row_per_group: usize,
+    custom_schema: Option<SchemaRef>,
+    custom_batches: Option<Vec<RecordBatch>>,
+) -> NamedTempFile {
     let mut output_file = tempfile::Builder::new()
         .prefix("parquet_pruning")
         .suffix(".parquet")
@@ -1088,8 +1153,14 @@ async fn make_test_file_rg(scenario: Scenario, row_per_group: usize) -> NamedTempFile {
         .set_statistics_enabled(EnabledStatistics::Page)
         .build();
 
-    let batches = create_data_batch(scenario);
-    let schema = batches[0].schema();
+    let (batches, schema) =
+        if let (Some(schema), Some(batches)) = (custom_schema, custom_batches) {
+            (batches, schema)
+        } else {
+            let batches = create_data_batch(scenario);
+            let schema = batches[0].schema();
+            (batches, schema)
+        };
 
     let mut writer = ArrowWriter::try_new(&mut output_file, schema, Some(props)).unwrap();
 
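For orientation, here is a minimal sketch of how the new pieces above might be exercised together in a test. It assumes, as the helpers suggest, that ContextWithParquet exposes an async query method returning TestOutput and registers the parquet data as table t; Scenario::Int32 is a placeholder variant, and the pruned-count assertion is illustrative rather than taken from this PR.

// A minimal usage sketch (assumptions noted in the comments): exercises the
// new `with_custom_data` constructor and the `limit_pruned_row_groups`
// metric helper added in this PR.
use std::sync::Arc;

use arrow::array::Int32Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;

#[tokio::test]
async fn limit_pruning_with_custom_data() {
    // One Int32 column with 100 rows; writing 10 rows per row group yields
    // ten row groups the reader could skip once a LIMIT is satisfied.
    let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, false)]));
    let batch = RecordBatch::try_new(
        Arc::clone(&schema),
        vec![Arc::new(Int32Array::from_iter_values(0..100))],
    )
    .unwrap();

    // `Scenario::Int32` is a placeholder; use whatever variant the suite defines.
    let mut ctx = ContextWithParquet::with_custom_data(
        Scenario::Int32,
        Unit::RowGroup(10),
        schema,
        vec![batch],
    )
    .await;

    // Assumes the harness registers the file as table `t` and that `query`
    // returns the `TestOutput` whose helpers are defined above.
    let output = ctx.query("SELECT i FROM t LIMIT 5").await;

    // With a LIMIT the first row group can satisfy, at least one of the
    // remaining row groups should be pruned by limit pruning.
    assert!(output.limit_pruned_row_groups().unwrap_or(0) > 0);
}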