diff --git a/datafusion/datasource-parquet/src/row_filter.rs b/datafusion/datasource-parquet/src/row_filter.rs index 04c11b887554..bc195f176787 100644 --- a/datafusion/datasource-parquet/src/row_filter.rs +++ b/datafusion/datasource-parquet/src/row_filter.rs @@ -276,7 +276,7 @@ struct PushdownChecker<'schema> { /// Does the expression reference any columns not present in the file schema? projected_columns: bool, /// Indices into the file schema of columns required to evaluate the expression. - required_columns: BTreeSet, + required_columns: Vec, /// Tracks the nested column behavior found during traversal. nested_behavior: NestedColumnSupport, /// Whether nested list columns are supported by the predicate semantics. @@ -290,7 +290,7 @@ impl<'schema> PushdownChecker<'schema> { Self { non_primitive_columns: false, projected_columns: false, - required_columns: BTreeSet::default(), + required_columns: Vec::new(), nested_behavior: NestedColumnSupport::PrimitiveOnly, allow_list_columns, file_schema, @@ -307,7 +307,8 @@ impl<'schema> PushdownChecker<'schema> { } }; - self.required_columns.insert(idx); + // Duplicates are handled by dedup() in into_sorted_columns() + self.required_columns.push(idx); let data_type = self.file_schema.field(idx).data_type(); if DataType::is_nested(data_type) { @@ -355,6 +356,21 @@ impl<'schema> PushdownChecker<'schema> { fn prevents_pushdown(&self) -> bool { self.non_primitive_columns || self.projected_columns } + + /// Consumes the checker and returns sorted, deduplicated column indices + /// wrapped in a `PushdownColumns` struct. + /// + /// This method sorts the column indices and removes duplicates. The sort + /// is required because downstream code relies on column indices being in + /// ascending order for correct schema projection. + fn into_sorted_columns(mut self) -> PushdownColumns { + self.required_columns.sort_unstable(); + self.required_columns.dedup(); + PushdownColumns { + required_columns: self.required_columns, + nested: self.nested_behavior, + } + } } impl TreeNodeVisitor<'_> for PushdownChecker<'_> { @@ -390,9 +406,13 @@ enum NestedColumnSupport { Unsupported, } +/// Result of checking which columns are required for filter pushdown. #[derive(Debug)] struct PushdownColumns { - required_columns: BTreeSet, + /// Sorted, unique column indices into the file schema required to evaluate + /// the filter expression. Must be in ascending order for correct schema + /// projection matching. + required_columns: Vec, nested: NestedColumnSupport, } @@ -411,10 +431,7 @@ fn pushdown_columns( let allow_list_columns = supports_list_predicates(expr); let mut checker = PushdownChecker::new(file_schema, allow_list_columns); expr.visit(&mut checker)?; - Ok((!checker.prevents_pushdown()).then_some(PushdownColumns { - required_columns: checker.required_columns, - nested: checker.nested_behavior, - })) + Ok((!checker.prevents_pushdown()).then(|| checker.into_sorted_columns())) } fn leaf_indices_for_roots(