Skip to content

Commit 59cd8df

Browse files
committed
add can_use_stats() and revert nullable column index
1 parent ef0b430 commit 59cd8df

6 files changed

Lines changed: 61 additions & 138 deletions

File tree

cpp/src/parquet/metadata.cc

Lines changed: 17 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -92,75 +92,25 @@ std::string ParquetVersionToString(ParquetVersion::type ver) {
9292

9393
namespace {
9494

95-
enum class StatsMinMaxMode {
96-
// Ignore min/max fields because their ordering is unknown or unsupported.
97-
kDiscard,
98-
// Use legacy min/max fields for files without column orders.
99-
kLegacy,
100-
// Use min_value/max_value fields with the column's well-defined order.
101-
kNormal,
102-
};
103-
104-
StatsMinMaxMode GetStatsMinMaxMode(const ColumnDescriptor& descr) {
95+
StatisticsMinMaxField GetStatisticsMinMaxField(const ColumnDescriptor& descr) {
96+
if (descr.sort_order() == SortOrder::UNKNOWN) {
97+
return StatisticsMinMaxField::kInvalid;
98+
}
10599
switch (descr.column_order().get_order()) {
106100
case ColumnOrder::TYPE_DEFINED_ORDER:
107-
return descr.sort_order() != SortOrder::UNKNOWN ? StatsMinMaxMode::kNormal
108-
: StatsMinMaxMode::kDiscard;
101+
return StatisticsMinMaxField::kMinValueMaxValue;
109102
case ColumnOrder::UNDEFINED:
110-
return descr.sort_order() != SortOrder::UNKNOWN ? StatsMinMaxMode::kLegacy
111-
: StatsMinMaxMode::kDiscard;
103+
return StatisticsMinMaxField::kLegacyMinMax;
112104
case ColumnOrder::UNKNOWN:
113-
return StatsMinMaxMode::kDiscard;
105+
return StatisticsMinMaxField::kInvalid;
114106
}
115-
return StatsMinMaxMode::kDiscard;
116-
}
117-
118-
} // namespace
119-
120-
static EncodedStatistics EncodedStatisticsFromThrift(const format::Statistics& statistics,
121-
StatsMinMaxMode min_max) {
122-
EncodedStatistics out;
123-
124-
switch (min_max) {
125-
case StatsMinMaxMode::kNormal:
126-
if (statistics.__isset.max_value) {
127-
out.set_max(statistics.max_value);
128-
if (statistics.__isset.is_max_value_exact) {
129-
out.is_max_value_exact = statistics.is_max_value_exact;
130-
}
131-
}
132-
if (statistics.__isset.min_value) {
133-
out.set_min(statistics.min_value);
134-
if (statistics.__isset.is_min_value_exact) {
135-
out.is_min_value_exact = statistics.is_min_value_exact;
136-
}
137-
}
138-
break;
139-
case StatsMinMaxMode::kLegacy:
140-
if (statistics.__isset.max) {
141-
out.set_max(statistics.max);
142-
}
143-
if (statistics.__isset.min) {
144-
out.set_min(statistics.min);
145-
}
146-
break;
147-
case StatsMinMaxMode::kDiscard:
148-
break;
149-
}
150-
if (statistics.__isset.null_count) {
151-
out.set_null_count(statistics.null_count);
152-
}
153-
if (statistics.__isset.distinct_count) {
154-
out.set_distinct_count(statistics.distinct_count);
155-
}
156-
157-
return out;
107+
return StatisticsMinMaxField::kInvalid;
158108
}
159109

160110
template <typename DType>
161-
static std::shared_ptr<Statistics> MakeTypedColumnStats(
162-
const format::ColumnMetaData& metadata, const ColumnDescriptor* descr,
163-
::arrow::MemoryPool* pool) {
111+
std::shared_ptr<Statistics> MakeTypedColumnStats(const format::ColumnMetaData& metadata,
112+
const ColumnDescriptor* descr,
113+
::arrow::MemoryPool* pool) {
164114
const auto& statistics = metadata.statistics;
165115
const std::string kEmpty = "";
166116
const std::string* encoded_min = &kEmpty;
@@ -169,8 +119,8 @@ static std::shared_ptr<Statistics> MakeTypedColumnStats(
169119
std::optional<bool> min_exact = std::nullopt;
170120
std::optional<bool> max_exact = std::nullopt;
171121

172-
switch (GetStatsMinMaxMode(*descr)) {
173-
case StatsMinMaxMode::kNormal:
122+
switch (GetStatisticsMinMaxField(*descr)) {
123+
case StatisticsMinMaxField::kMinValueMaxValue:
174124
encoded_min = &statistics.min_value;
175125
encoded_max = &statistics.max_value;
176126
has_min_max = statistics.__isset.max_value && statistics.__isset.min_value;
@@ -181,12 +131,12 @@ static std::shared_ptr<Statistics> MakeTypedColumnStats(
181131
? std::optional<bool>(statistics.is_max_value_exact)
182132
: std::nullopt;
183133
break;
184-
case StatsMinMaxMode::kLegacy:
134+
case StatisticsMinMaxField::kLegacyMinMax:
185135
encoded_min = &statistics.min;
186136
encoded_max = &statistics.max;
187137
has_min_max = statistics.__isset.max && statistics.__isset.min;
188138
break;
189-
case StatsMinMaxMode::kDiscard:
139+
case StatisticsMinMaxField::kInvalid:
190140
break;
191141
}
192142

@@ -197,8 +147,6 @@ static std::shared_ptr<Statistics> MakeTypedColumnStats(
197147
max_exact, pool);
198148
}
199149

200-
namespace {
201-
202150
std::shared_ptr<geospatial::GeoStatistics> MakeColumnGeometryStats(
203151
const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) {
204152
if (metadata.__isset.geospatial_statistics) {
@@ -411,9 +359,8 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
411359
{
412360
const std::lock_guard<std::mutex> guard(stats_mutex_);
413361
if (possible_encoded_stats_ == nullptr) {
414-
possible_encoded_stats_ =
415-
std::make_shared<EncodedStatistics>(EncodedStatisticsFromThrift(
416-
column_metadata_->statistics, GetStatsMinMaxMode(*descr_)));
362+
possible_encoded_stats_ = std::make_shared<EncodedStatistics>(
363+
FromThrift(column_metadata_->statistics, GetStatisticsMinMaxField(*descr_)));
417364
}
418365
}
419366
return writer_version_->HasCorrectStatistics(type(), *possible_encoded_stats_,

cpp/src/parquet/page_index.cc

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,6 @@ namespace parquet {
3838

3939
namespace {
4040

41-
bool CanTrustPageIndexMinMax(const ColumnDescriptor& descr) {
42-
const auto column_order = descr.column_order().get_order();
43-
return column_order != ColumnOrder::UNKNOWN && column_order != ColumnOrder::UNDEFINED &&
44-
descr.sort_order() != SortOrder::UNKNOWN;
45-
}
46-
4741
template <typename DType>
4842
void Decode(std::unique_ptr<typename EncodingTraits<DType>::Decoder>& decoder,
4943
const std::string& input, std::vector<typename DType::c_type>* output,
@@ -979,9 +973,6 @@ std::unique_ptr<ColumnIndex> ColumnIndex::Make(const ColumnDescriptor& descr,
979973
// Guard against UB when moving column_index
980974
throw ParquetException("Invalid ColumnIndex boundary_order");
981975
}
982-
if (!CanTrustPageIndexMinMax(descr)) {
983-
return nullptr;
984-
}
985976
switch (descr.physical_type()) {
986977
case Type::BOOLEAN:
987978
return std::make_unique<TypedColumnIndexImpl<BooleanType>>(descr,

cpp/src/parquet/page_index_test.cc

Lines changed: 0 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -561,58 +561,6 @@ void TestWriteTypedColumnIndex(schema::NodePtr node,
561561
}
562562
}
563563

564-
template <typename T>
565-
std::shared_ptr<Buffer> SerializedColumnIndex(const T& min, const T& max) {
566-
auto encode = [](const T& value) {
567-
return std::string(reinterpret_cast<const char*>(&value), sizeof(value));
568-
};
569-
format::ColumnIndex column_index;
570-
column_index.__set_null_pages({false});
571-
column_index.__set_min_values({encode(min)});
572-
column_index.__set_max_values({encode(max)});
573-
column_index.__set_boundary_order(format::BoundaryOrder::UNORDERED);
574-
575-
auto sink = CreateOutputStream();
576-
ThriftSerializer{}.Serialize(&column_index, sink.get());
577-
PARQUET_ASSIGN_OR_THROW(auto buffer, sink->Finish());
578-
return buffer;
579-
}
580-
581-
void AssertColumnIndexIgnored(const ColumnDescriptor& read_descr,
582-
const std::shared_ptr<Buffer>& buffer) {
583-
auto column_index =
584-
ColumnIndex::Make(read_descr, buffer->data(), static_cast<uint32_t>(buffer->size()),
585-
default_reader_properties());
586-
ASSERT_EQ(nullptr, column_index);
587-
}
588-
589-
void AssertColumnIndexIgnoredWithColumnOrder(ColumnOrder column_order) {
590-
auto node = schema::Int32("c1");
591-
auto buffer = SerializedColumnIndex<int32_t>(/*min=*/1, /*max=*/2);
592-
593-
std::static_pointer_cast<schema::PrimitiveNode>(node)->SetColumnOrder(column_order);
594-
ColumnDescriptor read_descr(node, /*max_definition_level=*/1,
595-
/*max_repetition_level=*/0);
596-
AssertColumnIndexIgnored(read_descr, buffer);
597-
}
598-
599-
TEST(PageIndex, ReadColumnIndexWithUnsupportedColumnOrder) {
600-
AssertColumnIndexIgnoredWithColumnOrder(ColumnOrder::unknown_);
601-
AssertColumnIndexIgnoredWithColumnOrder(ColumnOrder::undefined_);
602-
}
603-
604-
TEST(PageIndex, ReadColumnIndexWithUnknownSortOrder) {
605-
auto node = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED, Type::INT96);
606-
ColumnDescriptor descr(node, /*max_definition_level=*/0, /*max_repetition_level=*/0);
607-
ASSERT_EQ(SortOrder::UNKNOWN, descr.sort_order());
608-
609-
Int96 min{{1, 2, 3}};
610-
Int96 max{{4, 5, 6}};
611-
auto buffer = SerializedColumnIndex(min, max);
612-
613-
AssertColumnIndexIgnored(descr, buffer);
614-
}
615-
616564
TEST(PageIndex, WriteInt32ColumnIndex) {
617565
auto encode = [=](int32_t value) {
618566
return std::string(reinterpret_cast<const char*>(&value), sizeof(int32_t));

cpp/src/parquet/schema.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,12 @@ class PARQUET_EXPORT ColumnDescriptor {
382382
return la ? GetSortOrder(la, pt) : GetSortOrder(converted_type(), pt);
383383
}
384384

385+
// Whether ColumnOrder-governed min/max values have a supported ordering.
386+
bool can_use_stats() const {
387+
return column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER &&
388+
sort_order() != SortOrder::UNKNOWN;
389+
}
390+
385391
const std::string& name() const { return primitive_node_->name(); }
386392

387393
const std::shared_ptr<schema::ColumnPath> path() const;

cpp/src/parquet/schema_test.cc

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -660,6 +660,22 @@ TEST(TestColumnDescriptor, TestAttrs) {
660660
ASSERT_EQ(expected_descr, descr2.ToString());
661661
}
662662

663+
TEST(TestColumnDescriptor, CanUseStats) {
664+
NodePtr node = Int32("name");
665+
ColumnDescriptor descr(node, 0, 0);
666+
EXPECT_TRUE(descr.can_use_stats());
667+
668+
auto primitive_node = std::static_pointer_cast<PrimitiveNode>(node);
669+
primitive_node->SetColumnOrder(ColumnOrder::undefined_);
670+
EXPECT_FALSE(ColumnDescriptor(node, 0, 0).can_use_stats());
671+
672+
primitive_node->SetColumnOrder(ColumnOrder::unknown_);
673+
EXPECT_FALSE(ColumnDescriptor(node, 0, 0).can_use_stats());
674+
675+
node = PrimitiveNode::Make("name", Repetition::REQUIRED, Type::INT96);
676+
EXPECT_FALSE(ColumnDescriptor(node, 0, 0).can_use_stats());
677+
}
678+
663679
class TestSchemaDescriptor : public ::testing::Test {
664680
public:
665681
void setUp() {}

cpp/src/parquet/thrift_internal.h

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -252,12 +252,21 @@ static inline AadMetadata FromThrift(format::AesGcmCtrV1 aesGcmCtrV1) {
252252
aesGcmCtrV1.supply_aad_prefix};
253253
}
254254

255-
static inline EncodedStatistics FromThrift(const format::Statistics& stats) {
255+
// Selects which thrift Statistics min/max fields should populate EncodedStatistics.
256+
enum class StatisticsMinMaxField {
257+
// Do not populate min/max, because the ordering is undefined or unsupported.
258+
kInvalid,
259+
// Populate min/max from the min_value/max_value fields.
260+
kMinValueMaxValue,
261+
// Populate min/max from the legacy min/max fields.
262+
kLegacyMinMax,
263+
};
264+
265+
static inline EncodedStatistics FromThrift(const format::Statistics& stats,
266+
StatisticsMinMaxField min_max) {
256267
EncodedStatistics out;
257268

258-
// Use the new V2 min-max statistics over the former one if it is filled
259-
if (stats.__isset.max_value || stats.__isset.min_value) {
260-
// TODO: check if the column_order is TYPE_DEFINED_ORDER.
269+
if (min_max == StatisticsMinMaxField::kMinValueMaxValue) {
261270
if (stats.__isset.max_value) {
262271
out.set_max(stats.max_value);
263272
if (stats.__isset.is_max_value_exact) {
@@ -270,9 +279,7 @@ static inline EncodedStatistics FromThrift(const format::Statistics& stats) {
270279
out.is_min_value_exact = stats.is_min_value_exact;
271280
}
272281
}
273-
} else if (stats.__isset.max || stats.__isset.min) {
274-
// TODO: check created_by to see if it is corrupted for some types.
275-
// TODO: check if the sort_order is SIGNED.
282+
} else if (min_max == StatisticsMinMaxField::kLegacyMinMax) {
276283
if (stats.__isset.max) {
277284
out.set_max(stats.max);
278285
}
@@ -290,6 +297,14 @@ static inline EncodedStatistics FromThrift(const format::Statistics& stats) {
290297
return out;
291298
}
292299

300+
static inline EncodedStatistics FromThrift(const format::Statistics& stats) {
301+
// Use the new V2 min-max statistics over the former one if it is filled.
302+
if (stats.__isset.max_value || stats.__isset.min_value) {
303+
return FromThrift(stats, StatisticsMinMaxField::kMinValueMaxValue);
304+
}
305+
return FromThrift(stats, StatisticsMinMaxField::kLegacyMinMax);
306+
}
307+
293308
static inline geospatial::EncodedGeoStatistics FromThrift(
294309
const format::GeospatialStatistics& geo_stats) {
295310
geospatial::EncodedGeoStatistics out;

0 commit comments

Comments
 (0)