Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 15 additions & 10 deletions src/paimon/format/parquet/page_filtered_row_group_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,9 +138,10 @@ std::pair<RowRanges, int64_t> PageFilteredRowGroupReader::ComputeCompressedRowRa
Result<std::shared_ptr<arrow::ChunkedArray>> PageFilteredRowGroupReader::ReadFilteredColumn(
const std::shared_ptr<::parquet::RowGroupReader>& row_group_reader,
::parquet::ParquetFileReader* parquet_reader,
const std::shared_ptr<::parquet::PageIndexReader>& page_index_reader, int32_t row_group_index,
int32_t column_index, const RowRanges& row_ranges, const std::shared_ptr<arrow::Field>& field,
int64_t row_group_row_count, ::arrow::MemoryPool* pool) {
const std::shared_ptr<::parquet::RowGroupPageIndexReader>& rg_page_index_reader,
int32_t row_group_index, int32_t column_index, const RowRanges& row_ranges,
const std::shared_ptr<arrow::Field>& field, int64_t row_group_row_count,
::arrow::MemoryPool* pool) {
auto file_metadata = parquet_reader->metadata();
const auto* col_descriptor = file_metadata->schema()->Column(column_index);

Expand All @@ -149,11 +150,8 @@ Result<std::shared_ptr<arrow::ChunkedArray>> PageFilteredRowGroupReader::ReadFil
int64_t effective_row_count = row_group_row_count;

std::shared_ptr<::parquet::OffsetIndex> offset_index;
if (page_index_reader) {
auto rg_page_index_reader = page_index_reader->RowGroup(row_group_index);
if (rg_page_index_reader) {
offset_index = rg_page_index_reader->GetOffsetIndex(column_index);
}
if (rg_page_index_reader) {
offset_index = rg_page_index_reader->GetOffsetIndex(column_index);
}

auto page_reader = row_group_reader->GetColumnPageReader(column_index);
Expand Down Expand Up @@ -263,15 +261,22 @@ Result<std::unique_ptr<arrow::RecordBatchReader>> PageFilteredRowGroupReader::Re
int64_t row_group_row_count = rg_metadata->num_rows();
auto page_index_reader = parquet_reader->GetPageIndexReader();

// reuse RowGroupPageIndexReader for multiple columns in the same row group to avoid redundant
// metadata reads
std::shared_ptr<::parquet::RowGroupPageIndexReader> rg_page_index_reader;
if (page_index_reader) {
rg_page_index_reader = page_index_reader->RowGroup(row_group_index);
}

// Read each column with page filtering
std::vector<std::shared_ptr<arrow::ChunkedArray>> columns;
columns.reserve(column_indices.size());

for (size_t i = 0; i < column_indices.size(); ++i) {
PAIMON_ASSIGN_OR_RAISE(
std::shared_ptr<arrow::ChunkedArray> chunked_array,
ReadFilteredColumn(row_group_reader, parquet_reader, page_index_reader, row_group_index,
column_indices[i], row_ranges,
ReadFilteredColumn(row_group_reader, parquet_reader, rg_page_index_reader,
row_group_index, column_indices[i], row_ranges,
arrow_schema->field(static_cast<int>(i)), row_group_row_count,
pool));

Expand Down
2 changes: 1 addition & 1 deletion src/paimon/format/parquet/page_filtered_row_group_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ class PageFilteredRowGroupReader {
static Result<std::shared_ptr<arrow::ChunkedArray>> ReadFilteredColumn(
const std::shared_ptr<::parquet::RowGroupReader>& row_group_reader,
::parquet::ParquetFileReader* parquet_reader,
const std::shared_ptr<::parquet::PageIndexReader>& page_index_reader,
const std::shared_ptr<::parquet::RowGroupPageIndexReader>& rg_page_index_reader,
int32_t row_group_index, int32_t column_index, const RowRanges& row_ranges,
const std::shared_ptr<arrow::Field>& field, int64_t row_group_row_count,
::arrow::MemoryPool* pool);
Expand Down
Loading