diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader.cpp b/src/paimon/format/parquet/page_filtered_row_group_reader.cpp index 6a372e2e..4594717f 100644 --- a/src/paimon/format/parquet/page_filtered_row_group_reader.cpp +++ b/src/paimon/format/parquet/page_filtered_row_group_reader.cpp @@ -138,9 +138,10 @@ std::pair PageFilteredRowGroupReader::ComputeCompressedRowRa Result> PageFilteredRowGroupReader::ReadFilteredColumn( const std::shared_ptr<::parquet::RowGroupReader>& row_group_reader, ::parquet::ParquetFileReader* parquet_reader, - const std::shared_ptr<::parquet::PageIndexReader>& page_index_reader, int32_t row_group_index, - int32_t column_index, const RowRanges& row_ranges, const std::shared_ptr& field, - int64_t row_group_row_count, ::arrow::MemoryPool* pool) { + const std::shared_ptr<::parquet::RowGroupPageIndexReader>& rg_page_index_reader, + int32_t row_group_index, int32_t column_index, const RowRanges& row_ranges, + const std::shared_ptr& field, int64_t row_group_row_count, + ::arrow::MemoryPool* pool) { auto file_metadata = parquet_reader->metadata(); const auto* col_descriptor = file_metadata->schema()->Column(column_index); @@ -149,11 +150,8 @@ Result> PageFilteredRowGroupReader::ReadFil int64_t effective_row_count = row_group_row_count; std::shared_ptr<::parquet::OffsetIndex> offset_index; - if (page_index_reader) { - auto rg_page_index_reader = page_index_reader->RowGroup(row_group_index); - if (rg_page_index_reader) { - offset_index = rg_page_index_reader->GetOffsetIndex(column_index); - } + if (rg_page_index_reader) { + offset_index = rg_page_index_reader->GetOffsetIndex(column_index); } auto page_reader = row_group_reader->GetColumnPageReader(column_index); @@ -263,6 +261,13 @@ Result> PageFilteredRowGroupReader::Re int64_t row_group_row_count = rg_metadata->num_rows(); auto page_index_reader = parquet_reader->GetPageIndexReader(); + // reuse RowGroupPageIndexReader for multiple columns in the same row group to avoid redundant + // metadata reads + std::shared_ptr<::parquet::RowGroupPageIndexReader> rg_page_index_reader; + if (page_index_reader) { + rg_page_index_reader = page_index_reader->RowGroup(row_group_index); + } + // Read each column with page filtering std::vector> columns; columns.reserve(column_indices.size()); @@ -270,8 +275,8 @@ Result> PageFilteredRowGroupReader::Re for (size_t i = 0; i < column_indices.size(); ++i) { PAIMON_ASSIGN_OR_RAISE( std::shared_ptr chunked_array, - ReadFilteredColumn(row_group_reader, parquet_reader, page_index_reader, row_group_index, - column_indices[i], row_ranges, + ReadFilteredColumn(row_group_reader, parquet_reader, rg_page_index_reader, + row_group_index, column_indices[i], row_ranges, arrow_schema->field(static_cast(i)), row_group_row_count, pool)); diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader.h b/src/paimon/format/parquet/page_filtered_row_group_reader.h index 466f664c..24c82e25 100644 --- a/src/paimon/format/parquet/page_filtered_row_group_reader.h +++ b/src/paimon/format/parquet/page_filtered_row_group_reader.h @@ -90,7 +90,7 @@ class PageFilteredRowGroupReader { static Result> ReadFilteredColumn( const std::shared_ptr<::parquet::RowGroupReader>& row_group_reader, ::parquet::ParquetFileReader* parquet_reader, - const std::shared_ptr<::parquet::PageIndexReader>& page_index_reader, + const std::shared_ptr<::parquet::RowGroupPageIndexReader>& rg_page_index_reader, int32_t row_group_index, int32_t column_index, const RowRanges& row_ranges, const std::shared_ptr& field, int64_t row_group_row_count, ::arrow::MemoryPool* pool);