Skip to content

Commit 43afdce

Browse files
wgtmacpitrou
andauthored
GH-45283: [C++][Parquet] Omit level histogram when max level is 0 (#45285)
### Rationale for this change The level histogram of size statistics can be omitted if its max level is 0. We haven't implemented this yet and enforces histogram size to be equal to `max_level + 1`. However, when reading a Parquet file with omitted level histogram, exception will be thrown. ### What changes are included in this PR? Omit level histogram when max level is 0. ### Are these changes tested? Yes, a test case has been added to reflect the change. ### Are there any user-facing changes? No. * GitHub Issue: #45283 Lead-authored-by: Gang Wu <ustcwg@gmail.com> Co-authored-by: Antoine Pitrou <pitrou@free.fr> Signed-off-by: Gang Wu <ustcwg@gmail.com>
1 parent 2d76789 commit 43afdce

File tree

3 files changed

+70
-22
lines changed

3 files changed

+70
-22
lines changed

cpp/src/parquet/column_writer.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1609,6 +1609,9 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
16091609

16101610
auto add_levels = [](std::vector<int64_t>& level_histogram,
16111611
::arrow::util::span<const int16_t> levels, int16_t max_level) {
1612+
if (max_level == 0) {
1613+
return;
1614+
}
16121615
ARROW_DCHECK_EQ(static_cast<size_t>(max_level) + 1, level_histogram.size());
16131616
::parquet::UpdateLevelHistogram(levels, level_histogram);
16141617
};

cpp/src/parquet/size_statistics.cc

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -64,23 +64,28 @@ void SizeStatistics::IncrementUnencodedByteArrayDataBytes(int64_t value) {
6464
}
6565

6666
void SizeStatistics::Validate(const ColumnDescriptor* descr) const {
67-
if (repetition_level_histogram.size() !=
68-
static_cast<size_t>(descr->max_repetition_level() + 1)) {
69-
throw ParquetException("Repetition level histogram size mismatch");
70-
}
71-
if (definition_level_histogram.size() !=
72-
static_cast<size_t>(descr->max_definition_level() + 1)) {
73-
throw ParquetException("Definition level histogram size mismatch");
74-
}
67+
auto validate_histogram = [](const std::vector<int64_t>& histogram, int16_t max_level,
68+
const std::string& name) {
69+
if (histogram.empty()) {
70+
// A levels histogram is always allowed to be missing.
71+
return;
72+
}
73+
if (histogram.size() != static_cast<size_t>(max_level + 1)) {
74+
std::stringstream ss;
75+
ss << name << " level histogram size mismatch, size: " << histogram.size()
76+
<< ", expected: " << (max_level + 1);
77+
throw ParquetException(ss.str());
78+
}
79+
};
80+
validate_histogram(repetition_level_histogram, descr->max_repetition_level(),
81+
"Repetition");
82+
validate_histogram(definition_level_histogram, descr->max_definition_level(),
83+
"Definition");
7584
if (unencoded_byte_array_data_bytes.has_value() &&
7685
descr->physical_type() != Type::BYTE_ARRAY) {
7786
throw ParquetException("Unencoded byte array data bytes does not support " +
7887
TypeToString(descr->physical_type()));
7988
}
80-
if (!unencoded_byte_array_data_bytes.has_value() &&
81-
descr->physical_type() == Type::BYTE_ARRAY) {
82-
throw ParquetException("Missing unencoded byte array data bytes");
83-
}
8489
}
8590

8691
void SizeStatistics::Reset() {
@@ -93,8 +98,15 @@ void SizeStatistics::Reset() {
9398

9499
std::unique_ptr<SizeStatistics> SizeStatistics::Make(const ColumnDescriptor* descr) {
95100
auto size_stats = std::make_unique<SizeStatistics>();
96-
size_stats->repetition_level_histogram.resize(descr->max_repetition_level() + 1, 0);
97-
size_stats->definition_level_histogram.resize(descr->max_definition_level() + 1, 0);
101+
// If the max level is 0, the level histogram can be omitted because it contains
102+
// only single level (a.k.a. 0) and its count is equivalent to `num_values` of the
103+
// column chunk or data page.
104+
if (descr->max_repetition_level() != 0) {
105+
size_stats->repetition_level_histogram.resize(descr->max_repetition_level() + 1, 0);
106+
}
107+
if (descr->max_definition_level() != 0) {
108+
size_stats->definition_level_histogram.resize(descr->max_definition_level() + 1, 0);
109+
}
98110
if (descr->physical_type() == Type::BYTE_ARRAY) {
99111
size_stats->unencoded_byte_array_data_bytes = 0;
100112
}

cpp/src/parquet/size_statistics_test.cc

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,20 @@ class SizeStatisticsRoundTripTest : public ::testing::Test {
216216
}
217217
}
218218

219+
void ReadData() {
220+
auto reader =
221+
ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer_));
222+
auto metadata = reader->metadata();
223+
for (int i = 0; i < metadata->num_row_groups(); ++i) {
224+
int64_t num_rows = metadata->RowGroup(i)->num_rows();
225+
auto row_group_reader = reader->RowGroup(i);
226+
for (int j = 0; j < metadata->num_columns(); ++j) {
227+
auto column_reader = row_group_reader->RecordReader(j);
228+
ASSERT_EQ(column_reader->ReadRecords(num_rows + 1), num_rows);
229+
}
230+
}
231+
}
232+
219233
void Reset() { buffer_.reset(); }
220234

221235
protected:
@@ -300,23 +314,23 @@ TEST_F(SizeStatisticsRoundTripTest, WriteDictionaryArray) {
300314
ReadSizeStatistics();
301315
EXPECT_THAT(row_group_stats_,
302316
::testing::ElementsAre(SizeStatistics{/*def_levels=*/{0, 2},
303-
/*rep_levels=*/{2},
317+
/*rep_levels=*/{},
304318
/*byte_array_bytes=*/5},
305319
SizeStatistics{/*def_levels=*/{1, 1},
306-
/*rep_levels=*/{2},
320+
/*rep_levels=*/{},
307321
/*byte_array_bytes=*/1},
308322
SizeStatistics{/*def_levels=*/{0, 2},
309-
/*rep_levels=*/{2},
323+
/*rep_levels=*/{},
310324
/*byte_array_bytes=*/4}));
311325
EXPECT_THAT(page_stats_,
312326
::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{0, 2},
313-
/*rep_levels=*/{2},
327+
/*rep_levels=*/{},
314328
/*byte_array_bytes=*/{5}},
315329
PageSizeStatistics{/*def_levels=*/{1, 1},
316-
/*rep_levels=*/{2},
330+
/*rep_levels=*/{},
317331
/*byte_array_bytes=*/{1}},
318332
PageSizeStatistics{/*def_levels=*/{0, 2},
319-
/*rep_levels=*/{2},
333+
/*rep_levels=*/{},
320334
/*byte_array_bytes=*/{4}}));
321335
}
322336

@@ -368,12 +382,31 @@ TEST_F(SizeStatisticsRoundTripTest, LargePage) {
368382
ReadSizeStatistics();
369383
EXPECT_THAT(row_group_stats_,
370384
::testing::ElementsAre(SizeStatistics{/*def_levels=*/{30000, 60000},
371-
/*rep_levels=*/{90000},
385+
/*rep_levels=*/{},
372386
/*byte_array_bytes=*/90000}));
373387
EXPECT_THAT(page_stats_,
374388
::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{30000, 60000},
375-
/*rep_levels=*/{90000},
389+
/*rep_levels=*/{},
376390
/*byte_array_bytes=*/{90000}}));
377391
}
378392

393+
TEST_F(SizeStatisticsRoundTripTest, MaxLevelZero) {
394+
auto schema =
395+
::arrow::schema({::arrow::field("a", ::arrow::utf8(), /*nullable=*/false)});
396+
WriteFile(SizeStatisticsLevel::PageAndColumnChunk,
397+
::arrow::TableFromJSON(schema, {R"([["foo"],["bar"]])"}),
398+
/*max_row_group_length=*/2,
399+
/*page_size=*/1024);
400+
ASSERT_NO_FATAL_FAILURE(ReadSizeStatistics());
401+
ASSERT_NO_FATAL_FAILURE(ReadData());
402+
EXPECT_THAT(row_group_stats_,
403+
::testing::ElementsAre(SizeStatistics{/*def_levels=*/{},
404+
/*rep_levels=*/{},
405+
/*byte_array_bytes=*/6}));
406+
EXPECT_THAT(page_stats_,
407+
::testing::ElementsAre(PageSizeStatistics{/*def_levels=*/{},
408+
/*rep_levels=*/{},
409+
/*byte_array_bytes=*/{6}}));
410+
}
411+
379412
} // namespace parquet

0 commit comments

Comments
 (0)