Skip to content

Commit 3e6e8f3

Browse files
authored
GH-45339: [Parquet][C++] Fix statistics load logic for no row group and multiple row groups (#45350)
### Rationale for this change

Loading `arrow::ArrayStatistics` depends on `parquet::ColumnChunkMetaData`. We can't get `parquet::ColumnChunkMetaData` when the requested row groups are empty, because no associated row group or column chunk exists. We also can't use multiple `parquet::ColumnChunkMetaData`s for now, because we don't have statistics merge logic; so we can't load statistics when multiple row groups are used.

### What changes are included in this PR?

* Don't load statistics when no row groups are used
* Don't load statistics when multiple row groups are used
* Add `parquet::ArrowReaderProperties::{set_,}should_load_statistics()` to enforce loading statistics by reading row groups one by one

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes.

* GitHub Issue: #45339

Authored-by: Sutou Kouhei <kou@clear-code.com>
Signed-off-by: Sutou Kouhei <kou@clear-code.com>
1 parent ca2f4d6 commit 3e6e8f3

7 files changed

Lines changed: 187 additions & 12 deletions

File tree

cpp/src/parquet/arrow/arrow_reader_writer_test.cc

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4296,6 +4296,108 @@ TEST(TestArrowReaderAdHoc, ReadFloat16Files) {
42964296
}
42974297
}
42984298

4299+
// Requesting an empty set of row groups must yield a record batch reader
// that immediately signals end-of-stream instead of failing while trying
// to load column-chunk metadata that does not exist.
TEST(TestArrowFileReader, RecordBatchReaderEmptyRowGroups) {
  constexpr int kNumColumns = 1;
  constexpr int kNumRows = 3;
  constexpr int kNumChunks = 1;

  std::shared_ptr<Table> written_table;
  ASSERT_NO_FATAL_FAILURE(
      MakeDoubleTable(kNumColumns, kNumRows, kNumChunks, &written_table));

  const int64_t row_group_length = kNumRows;
  std::shared_ptr<Buffer> serialized;
  ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(
      written_table, row_group_length, default_arrow_writer_properties(), &serialized));

  auto parquet_reader =
      ParquetFileReader::Open(std::make_shared<BufferReader>(serialized));
  std::unique_ptr<FileReader> arrow_reader;
  ASSERT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(parquet_reader),
                             &arrow_reader));
  // The empty row-group selection is the point of this test.
  std::vector<int> selected_row_groups;
  ASSERT_OK_AND_ASSIGN(auto batch_reader,
                       arrow_reader->GetRecordBatchReader(selected_row_groups));
  std::shared_ptr<::arrow::RecordBatch> batch;
  ASSERT_OK(batch_reader->ReadNext(&batch));
  // Nothing may be produced when no row groups were requested.
  ASSERT_FALSE(batch);
}
4325+
4326+
// A file that contains zero rows must produce a record batch reader that
// reports end-of-stream rather than a spurious batch or a failure.
TEST(TestArrowFileReader, RecordBatchReaderEmptyInput) {
  constexpr int kNumColumns = 1;
  // The zero-row input is the point of this test.
  constexpr int kNumRows = 0;
  constexpr int kNumChunks = 1;

  std::shared_ptr<Table> written_table;
  ASSERT_NO_FATAL_FAILURE(
      MakeDoubleTable(kNumColumns, kNumRows, kNumChunks, &written_table));

  const int64_t row_group_length = kNumRows;
  std::shared_ptr<Buffer> serialized;
  ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(
      written_table, row_group_length, default_arrow_writer_properties(), &serialized));

  auto parquet_reader =
      ParquetFileReader::Open(std::make_shared<BufferReader>(serialized));
  std::unique_ptr<FileReader> arrow_reader;
  ASSERT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(parquet_reader),
                             &arrow_reader));
  ASSERT_OK_AND_ASSIGN(auto batch_reader, arrow_reader->GetRecordBatchReader());
  std::shared_ptr<::arrow::RecordBatch> batch;
  ASSERT_OK(batch_reader->ReadNext(&batch));
  // Nothing to read from an empty file.
  ASSERT_FALSE(batch);
}
4350+
4351+
// NextBatch(0, ...) must succeed and hand back an empty chunked array,
// not fail or read any data.
TEST(TestArrowColumnReader, NextBatchZeroBatchSize) {
  constexpr int kNumColumns = 1;
  constexpr int kNumRows = 3;
  constexpr int kNumChunks = 1;

  std::shared_ptr<Table> written_table;
  ASSERT_NO_FATAL_FAILURE(
      MakeDoubleTable(kNumColumns, kNumRows, kNumChunks, &written_table));

  const int64_t row_group_length = kNumRows;
  std::shared_ptr<Buffer> serialized;
  ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(
      written_table, row_group_length, default_arrow_writer_properties(), &serialized));

  auto parquet_reader =
      ParquetFileReader::Open(std::make_shared<BufferReader>(serialized));
  std::unique_ptr<FileReader> arrow_reader;
  ASSERT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(parquet_reader),
                             &arrow_reader));
  std::unique_ptr<arrow::ColumnReader> column_reader;
  ASSERT_OK(arrow_reader->GetColumn(0, &column_reader));
  std::shared_ptr<ChunkedArray> values;
  // The zero batch size is the point of this test.
  ASSERT_OK(column_reader->NextBatch(0, &values));
  ASSERT_EQ(0, values->length());
}
4375+
4376+
// Asking a column reader for a batch from a zero-row file must succeed
// and return an empty chunked array.
TEST(TestArrowColumnReader, NextBatchEmptyInput) {
  constexpr int kNumColumns = 1;
  // The zero-row input is the point of this test.
  constexpr int kNumRows = 0;
  constexpr int kNumChunks = 1;

  std::shared_ptr<Table> written_table;
  ASSERT_NO_FATAL_FAILURE(
      MakeDoubleTable(kNumColumns, kNumRows, kNumChunks, &written_table));

  const int64_t row_group_length = kNumRows;
  std::shared_ptr<Buffer> serialized;
  ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(
      written_table, row_group_length, default_arrow_writer_properties(), &serialized));

  auto parquet_reader =
      ParquetFileReader::Open(std::make_shared<BufferReader>(serialized));
  std::unique_ptr<FileReader> arrow_reader;
  ASSERT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(parquet_reader),
                             &arrow_reader));
  std::unique_ptr<arrow::ColumnReader> column_reader;
  ASSERT_OK(arrow_reader->GetColumn(0, &column_reader));
  std::shared_ptr<ChunkedArray> values;
  ASSERT_OK(column_reader->NextBatch(10, &values));
  // A positive batch size against empty data yields zero rows.
  ASSERT_EQ(0, values->length());
}
4400+
42994401
// direct-as-possible translation of
43004402
// pyarrow/tests/test_parquet.py::test_validate_schema_write_table
43014403
TEST(TestArrowWriterAdHoc, SchemaMismatch) {

cpp/src/parquet/arrow/arrow_statistics_test.cc

Lines changed: 48 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -185,25 +185,26 @@ TEST(StatisticsTest, TruncateOnlyHalfMinMax) {
185185

186186
namespace {
187187
::arrow::Result<std::shared_ptr<::arrow::Array>> StatisticsReadArray(
188-
std::shared_ptr<::arrow::DataType> data_type, std::shared_ptr<::arrow::Array> array) {
188+
std::shared_ptr<::arrow::DataType> data_type, std::shared_ptr<::arrow::Array> array,
189+
std::shared_ptr<WriterProperties> writer_properties = default_writer_properties(),
190+
const ArrowReaderProperties& reader_properties = default_arrow_reader_properties()) {
189191
auto schema = ::arrow::schema({::arrow::field("column", data_type)});
190192
auto record_batch = ::arrow::RecordBatch::Make(schema, array->length(), {array});
191193
ARROW_ASSIGN_OR_RAISE(auto sink, ::arrow::io::BufferOutputStream::Create());
192194
const auto arrow_writer_properties =
193195
parquet::ArrowWriterProperties::Builder().store_schema()->build();
194-
ARROW_ASSIGN_OR_RAISE(
195-
auto writer,
196-
FileWriter::Open(*schema, ::arrow::default_memory_pool(), sink,
197-
default_writer_properties(), arrow_writer_properties));
196+
ARROW_ASSIGN_OR_RAISE(auto writer,
197+
FileWriter::Open(*schema, ::arrow::default_memory_pool(), sink,
198+
writer_properties, arrow_writer_properties));
198199
ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(*record_batch));
199200
ARROW_RETURN_NOT_OK(writer->Close());
200201
ARROW_ASSIGN_OR_RAISE(auto buffer, sink->Finish());
201202

202203
auto reader =
203204
ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer));
204205
std::unique_ptr<FileReader> file_reader;
205-
ARROW_RETURN_NOT_OK(
206-
FileReader::Make(::arrow::default_memory_pool(), std::move(reader), &file_reader));
206+
ARROW_RETURN_NOT_OK(FileReader::Make(::arrow::default_memory_pool(), std::move(reader),
207+
reader_properties, &file_reader));
207208
std::shared_ptr<::arrow::ChunkedArray> chunked_array;
208209
ARROW_RETURN_NOT_OK(file_reader->ReadColumn(0, &chunked_array));
209210
return chunked_array->chunk(0);
@@ -326,4 +327,44 @@ TEST(TestStatisticsRead, Duration) {
326327
::arrow::duration(::arrow::TimeUnit::NANO));
327328
}
328329

330+
// With default reader properties, values spread over several row groups
// are read back in a single pass, and since per-row-group statistics
// cannot be merged, no statistics are attached to the result.
TEST(TestStatisticsRead, MultipleRowGroupsDefault) {
  auto storage_type = ::arrow::int32();
  auto source_array = ArrayFromJSON(storage_type, R"([1, null, -1])");
  // max_row_group_length(2) splits the 3 values into 2 row groups.
  auto writer_properties = WriterProperties::Builder().max_row_group_length(2)->build();
  ASSERT_OK_AND_ASSIGN(
      auto round_tripped,
      StatisticsReadArray(storage_type, std::move(source_array), writer_properties));
  auto int32_array = std::static_pointer_cast<::arrow::Int32Array>(round_tripped);
  auto statistics = int32_array->statistics();
  // No statistics merge logic exists, so none are loaded here.
  ASSERT_EQ(nullptr, statistics);
}
341+
342+
// With should_load_statistics enabled, the reader stops at row-group
// boundaries so that each returned chunk keeps valid statistics from
// its single backing row group.
TEST(TestStatisticsRead, MultipleRowGroupsShouldLoadStatistics) {
  auto storage_type = ::arrow::int32();
  auto source_array = ArrayFromJSON(storage_type, R"([1, null, -1])");
  // max_row_group_length(2) splits the 3 values into 2 row groups.
  auto writer_properties = WriterProperties::Builder().max_row_group_length(2)->build();
  ArrowReaderProperties reader_properties;
  reader_properties.set_should_load_statistics(true);
  ASSERT_OK_AND_ASSIGN(auto round_tripped,
                       StatisticsReadArray(storage_type, std::move(source_array),
                                           writer_properties, reader_properties));
  // should_load_statistics prevents mixing row groups in one read, so the
  // first chunk contains only the 2 values of the first row group.
  ASSERT_EQ(2, round_tripped->length());
  auto int32_array = std::static_pointer_cast<::arrow::Int32Array>(round_tripped);
  auto statistics = int32_array->statistics();
  ASSERT_NE(nullptr, statistics);
  ASSERT_TRUE(statistics->null_count.has_value());
  ASSERT_EQ(1, statistics->null_count.value());
  ASSERT_FALSE(statistics->distinct_count.has_value());
  ASSERT_TRUE(statistics->min.has_value());
  // min is 1 rather than -1: the -1 lives in the second row group.
  ASSERT_EQ(1, std::get<int64_t>(*statistics->min));
  ASSERT_TRUE(statistics->is_min_exact);
  ASSERT_TRUE(statistics->max.has_value());
  ASSERT_EQ(1, std::get<int64_t>(*statistics->max));
  ASSERT_TRUE(statistics->is_max_exact);
}
369+
329370
} // namespace parquet::arrow

cpp/src/parquet/arrow/reader.cc

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ class FileReaderImpl : public FileReader {
218218
ctx->iterator_factory = SomeRowGroupsFactory(row_groups);
219219
ctx->filter_leaves = true;
220220
ctx->included_leaves = included_leaves;
221+
ctx->reader_properties = &reader_properties_;
221222
return GetReader(manifest_.schema_fields[i], ctx, out);
222223
}
223224

@@ -475,6 +476,8 @@ class LeafReader : public ColumnReaderImpl {
475476
record_reader_->Reset();
476477
// Pre-allocation gives much better performance for flat columns
477478
record_reader_->Reserve(records_to_read);
479+
const bool should_load_statistics = ctx_->reader_properties->should_load_statistics();
480+
int64_t num_target_row_groups = 0;
478481
while (records_to_read > 0) {
479482
if (!record_reader_->HasMoreData()) {
480483
break;
@@ -483,11 +486,21 @@ class LeafReader : public ColumnReaderImpl {
483486
records_to_read -= records_read;
484487
if (records_read == 0) {
485488
NextRowGroup();
489+
} else {
490+
num_target_row_groups++;
491+
// We can't mix multiple row groups when we load statistics
492+
// because statistics are associated with a row group. If we
493+
// want to mix multiple row groups and keep valid statistics,
494+
// we need to implement a statistics merge logic.
495+
if (should_load_statistics) {
496+
break;
497+
}
486498
}
487499
}
488-
RETURN_NOT_OK(TransferColumnData(record_reader_.get(),
489-
input_->column_chunk_metadata(), field_, descr_,
490-
ctx_.get(), &out_));
500+
RETURN_NOT_OK(TransferColumnData(
501+
record_reader_.get(),
502+
num_target_row_groups == 1 ? input_->column_chunk_metadata() : nullptr, field_,
503+
descr_, ctx_.get(), &out_));
491504
return Status::OK();
492505
END_PARQUET_CATCH_EXCEPTIONS
493506
}
@@ -1214,6 +1227,7 @@ Status FileReaderImpl::GetColumn(int i, FileColumnIteratorFactory iterator_facto
12141227
ctx->pool = pool_;
12151228
ctx->iterator_factory = iterator_factory;
12161229
ctx->filter_leaves = false;
1230+
ctx->reader_properties = &reader_properties_;
12171231
std::unique_ptr<ColumnReaderImpl> result;
12181232
RETURN_NOT_OK(GetReader(manifest_.schema_fields[i], ctx, &result));
12191233
*out = std::move(result);

cpp/src/parquet/arrow/reader_internal.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,10 @@ template <typename ArrowType, typename ParquetType>
322322
void AttachStatistics(::arrow::ArrayData* data,
323323
std::unique_ptr<::parquet::ColumnChunkMetaData> metadata,
324324
const ReaderContext* ctx) {
325+
if (!metadata) {
326+
return;
327+
}
328+
325329
using ArrowCType = typename ArrowType::c_type;
326330

327331
auto statistics = metadata->statistics().get();

cpp/src/parquet/arrow/reader_internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ struct ReaderContext {
117117
FileColumnIteratorFactory iterator_factory;
118118
bool filter_leaves;
119119
std::shared_ptr<std::unordered_set<int>> included_leaves;
120+
ArrowReaderProperties* reader_properties;
120121

121122
bool IncludesLeaf(int leaf_index) const {
122123
if (this->filter_leaves) {

cpp/src/parquet/arrow/test_util.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,9 @@ ::arrow::enable_if_floating_point<ArrowType, Status> NullableArray(
229229
}
230230

231231
::arrow::NumericBuilder<ArrowType> builder;
232-
RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
232+
if (values.size() > 0) {
233+
RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
234+
}
233235
return builder.Finish(out);
234236
}
235237

cpp/src/parquet/properties.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -913,7 +913,8 @@ class PARQUET_EXPORT ArrowReaderProperties {
913913
pre_buffer_(true),
914914
cache_options_(::arrow::io::CacheOptions::LazyDefaults()),
915915
coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO),
916-
arrow_extensions_enabled_(false) {}
916+
arrow_extensions_enabled_(false),
917+
should_load_statistics_(false) {}
917918

918919
/// \brief Set whether to use the IO thread pool to parse columns in parallel.
919920
///
@@ -996,6 +997,15 @@ class PARQUET_EXPORT ArrowReaderProperties {
996997
}
997998
bool get_arrow_extensions_enabled() const { return arrow_extensions_enabled_; }
998999

1000+
/// \brief Set whether to load statistics as much as possible.
1001+
///
1002+
/// Default is false.
1003+
void set_should_load_statistics(bool should_load_statistics) {
1004+
should_load_statistics_ = should_load_statistics;
1005+
}
1006+
/// Return whether loading statistics as much as possible.
1007+
bool should_load_statistics() const { return should_load_statistics_; }
1008+
9991009
private:
10001010
bool use_threads_;
10011011
std::unordered_set<int> read_dict_indices_;
@@ -1005,6 +1015,7 @@ class PARQUET_EXPORT ArrowReaderProperties {
10051015
::arrow::io::CacheOptions cache_options_;
10061016
::arrow::TimeUnit::type coerce_int96_timestamp_unit_;
10071017
bool arrow_extensions_enabled_;
1018+
bool should_load_statistics_;
10081019
};
10091020

10101021
/// EXPERIMENTAL: Constructs the default ArrowReaderProperties

0 commit comments

Comments
 (0)