From 0ac67d56fbd432e7e51d7403ba16fe0afd8eca5f Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Thu, 28 May 2026 17:42:47 +0800 Subject: [PATCH 1/2] fix: Reuse RowGroupPageIndexReader for multiple columns in PageFilteredRowGroupReader::ReadFilteredRowGroup to avoid performance drop on wide tables --- .../parquet/page_filtered_row_group_reader.cpp | 17 ++++++++++------- .../parquet/page_filtered_row_group_reader.h | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader.cpp b/src/paimon/format/parquet/page_filtered_row_group_reader.cpp index 6a372e2e5..77ea92868 100644 --- a/src/paimon/format/parquet/page_filtered_row_group_reader.cpp +++ b/src/paimon/format/parquet/page_filtered_row_group_reader.cpp @@ -138,7 +138,7 @@ std::pair PageFilteredRowGroupReader::ComputeCompressedRowRa Result> PageFilteredRowGroupReader::ReadFilteredColumn( const std::shared_ptr<::parquet::RowGroupReader>& row_group_reader, ::parquet::ParquetFileReader* parquet_reader, - const std::shared_ptr<::parquet::PageIndexReader>& page_index_reader, int32_t row_group_index, + const std::shared_ptr<::parquet::RowGroupPageIndexReader>& rg_page_index_reader, int32_t row_group_index, int32_t column_index, const RowRanges& row_ranges, const std::shared_ptr& field, int64_t row_group_row_count, ::arrow::MemoryPool* pool) { auto file_metadata = parquet_reader->metadata(); @@ -149,11 +149,8 @@ Result> PageFilteredRowGroupReader::ReadFil int64_t effective_row_count = row_group_row_count; std::shared_ptr<::parquet::OffsetIndex> offset_index; - if (page_index_reader) { - auto rg_page_index_reader = page_index_reader->RowGroup(row_group_index); - if (rg_page_index_reader) { - offset_index = rg_page_index_reader->GetOffsetIndex(column_index); - } + if (rg_page_index_reader) { + offset_index = rg_page_index_reader->GetOffsetIndex(column_index); } auto page_reader = row_group_reader->GetColumnPageReader(column_index); @@ -263,6 +260,12 @@ Result> PageFilteredRowGroupReader::Re int64_t row_group_row_count = rg_metadata->num_rows(); auto page_index_reader = parquet_reader->GetPageIndexReader(); + // reuse RowGroupPageIndexReader for multiple columns in the same row group to avoid redundant metadata reads + std::shared_ptr<::parquet::RowGroupPageIndexReader> rg_page_index_reader; + if (page_index_reader) { + rg_page_index_reader = page_index_reader->RowGroup(row_group_index); + } + // Read each column with page filtering std::vector> columns; columns.reserve(column_indices.size()); @@ -270,7 +273,7 @@ Result> PageFilteredRowGroupReader::Re for (size_t i = 0; i < column_indices.size(); ++i) { PAIMON_ASSIGN_OR_RAISE( std::shared_ptr chunked_array, - ReadFilteredColumn(row_group_reader, parquet_reader, page_index_reader, row_group_index, + ReadFilteredColumn(row_group_reader, parquet_reader, rg_page_index_reader, row_group_index, column_indices[i], row_ranges, arrow_schema->field(static_cast(i)), row_group_row_count, pool)); diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader.h b/src/paimon/format/parquet/page_filtered_row_group_reader.h index 466f664c7..24c82e253 100644 --- a/src/paimon/format/parquet/page_filtered_row_group_reader.h +++ b/src/paimon/format/parquet/page_filtered_row_group_reader.h @@ -90,7 +90,7 @@ class PageFilteredRowGroupReader { static Result> ReadFilteredColumn( const std::shared_ptr<::parquet::RowGroupReader>& row_group_reader, ::parquet::ParquetFileReader* parquet_reader, - const std::shared_ptr<::parquet::PageIndexReader>& page_index_reader, + const std::shared_ptr<::parquet::RowGroupPageIndexReader>& rg_page_index_reader, int32_t row_group_index, int32_t column_index, const RowRanges& row_ranges, const std::shared_ptr& field, int64_t row_group_row_count, ::arrow::MemoryPool* pool); From f7c3ea2c00c6932ad5b4585ddac73560dc9fc2ec Mon Sep 17 00:00:00 2001 From: zhouhongfeng Date: Thu, 28 May 2026 18:24:16 +0800 Subject: [PATCH 2/2] style: clang-format --- .../parquet/page_filtered_row_group_reader.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/paimon/format/parquet/page_filtered_row_group_reader.cpp b/src/paimon/format/parquet/page_filtered_row_group_reader.cpp index 77ea92868..4594717f0 100644 --- a/src/paimon/format/parquet/page_filtered_row_group_reader.cpp +++ b/src/paimon/format/parquet/page_filtered_row_group_reader.cpp @@ -138,9 +138,10 @@ std::pair PageFilteredRowGroupReader::ComputeCompressedRowRa Result> PageFilteredRowGroupReader::ReadFilteredColumn( const std::shared_ptr<::parquet::RowGroupReader>& row_group_reader, ::parquet::ParquetFileReader* parquet_reader, - const std::shared_ptr<::parquet::RowGroupPageIndexReader>& rg_page_index_reader, int32_t row_group_index, - int32_t column_index, const RowRanges& row_ranges, const std::shared_ptr& field, - int64_t row_group_row_count, ::arrow::MemoryPool* pool) { + const std::shared_ptr<::parquet::RowGroupPageIndexReader>& rg_page_index_reader, + int32_t row_group_index, int32_t column_index, const RowRanges& row_ranges, + const std::shared_ptr& field, int64_t row_group_row_count, + ::arrow::MemoryPool* pool) { auto file_metadata = parquet_reader->metadata(); const auto* col_descriptor = file_metadata->schema()->Column(column_index); @@ -260,7 +261,8 @@ Result> PageFilteredRowGroupReader::Re int64_t row_group_row_count = rg_metadata->num_rows(); auto page_index_reader = parquet_reader->GetPageIndexReader(); - // reuse RowGroupPageIndexReader for multiple columns in the same row group to avoid redundant metadata reads + // reuse RowGroupPageIndexReader for multiple columns in the same row group to avoid redundant + // metadata reads std::shared_ptr<::parquet::RowGroupPageIndexReader> rg_page_index_reader; if (page_index_reader) { rg_page_index_reader = page_index_reader->RowGroup(row_group_index); @@ -273,8 +275,8 @@ Result> PageFilteredRowGroupReader::Re for (size_t i = 0; i < column_indices.size(); ++i) { PAIMON_ASSIGN_OR_RAISE( std::shared_ptr chunked_array, - ReadFilteredColumn(row_group_reader, parquet_reader, rg_page_index_reader, row_group_index, - column_indices[i], row_ranges, + ReadFilteredColumn(row_group_reader, parquet_reader, rg_page_index_reader, + row_group_index, column_indices[i], row_ranges, arrow_schema->field(static_cast(i)), row_group_row_count, pool));