diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index f930d3d7bdf..bc3d1087d96 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -649,6 +649,32 @@ TEST_F(TestConvertParquetSchema, ParquetLists) { arrow_fields.push_back(::arrow::field("my_list", arrow_list, true)); } + // Deep nested two-level encoding List>>: + // optional group my_list (LIST) { + // repeated group array (LIST) { + // repeated group array (LIST) { + // repeated int32 array; + // } + // } + // } + { + auto inner_array = + PrimitiveNode::Make("array", Repetition::REPEATED, ParquetType::INT32); + auto middle_array = GroupNode::Make("array", Repetition::REPEATED, {inner_array}, + ConvertedType::LIST); + auto outer_array = GroupNode::Make("array", Repetition::REPEATED, {middle_array}, + ConvertedType::LIST); + parquet_fields.push_back(GroupNode::Make("my_list", Repetition::OPTIONAL, + {outer_array}, ConvertedType::LIST)); + auto arrow_inner_array = ::arrow::field("array", INT32, /*nullable=*/false); + auto arrow_middle_array = ::arrow::field( + "array", list_case.type_factory(arrow_inner_array), /*nullable=*/false); + auto arrow_outer_array = ::arrow::field( + "array", list_case.type_factory(arrow_middle_array), /*nullable=*/false); + auto arrow_list = list_case.type_factory(arrow_outer_array); + arrow_fields.push_back(::arrow::field("my_list", arrow_list, true)); + } + // List> in three-level list encoding: // optional group my_list (LIST) { // repeated group list { @@ -681,6 +707,36 @@ TEST_F(TestConvertParquetSchema, ParquetLists) { arrow_fields.push_back(::arrow::field("my_list", arrow_list, /*nullable=*/true)); } + // List> in two-level list encoding: + // + // optional group my_list (LIST) { + // repeated group array (MAP) { + // repeated group key_value { + // required binary key (STRING); + // optional binary value (STRING); + // } + // } + // } + { + auto key = PrimitiveNode::Make("key", Repetition::REQUIRED, ParquetType::BYTE_ARRAY, + ConvertedType::UTF8); + auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL, + ParquetType::BYTE_ARRAY, ConvertedType::UTF8); + auto key_value = GroupNode::Make("key_value", Repetition::REPEATED, {key, value}); + auto array = + GroupNode::Make("array", Repetition::REPEATED, {key_value}, ConvertedType::MAP); + parquet_fields.push_back( + GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, ConvertedType::LIST)); + + auto arrow_key = ::arrow::field("key", UTF8, /*nullable=*/false); + auto arrow_value = ::arrow::field("value", UTF8, /*nullable=*/true); + auto arrow_element = ::arrow::field( + "array", std::make_shared<::arrow::MapType>(arrow_key, arrow_value), + /*nullable=*/false); + auto arrow_list = list_case.type_factory(arrow_element); + arrow_fields.push_back(::arrow::field("my_list", arrow_list, /*nullable=*/true)); + } + auto arrow_schema = ::arrow::schema(arrow_fields); ArrowReaderProperties props; @@ -844,34 +900,39 @@ TEST_F(TestConvertParquetSchema, ParquetRepeatedNestedSchema) { } TEST_F(TestConvertParquetSchema, IllegalParquetNestedSchema) { - // List> in two-level list encoding: + // Two-level list-annotated group cannot be repeated // - // optional group my_list (LIST) { - // repeated group array (MAP) { - // repeated group key_value { - // required binary key (STRING); - // optional binary value (STRING); - // } - // } + // repeated group my_list (LIST) { + // repeated int32 array; // } { - auto key = PrimitiveNode::Make("key", Repetition::REQUIRED, ParquetType::BYTE_ARRAY, - ConvertedType::UTF8); - auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL, - ParquetType::BYTE_ARRAY, ConvertedType::UTF8); - auto key_value = GroupNode::Make("key_value", Repetition::REPEATED, {key, value}); - auto array = - GroupNode::Make("array", Repetition::REPEATED, {key_value}, ConvertedType::MAP); + auto array = PrimitiveNode::Make("array", Repetition::REPEATED, ParquetType::INT32); std::vector parquet_fields; parquet_fields.push_back( - GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, ConvertedType::LIST)); + GroupNode::Make("my_list", Repetition::REPEATED, {array}, ConvertedType::LIST)); EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, - testing::HasSubstr("Group with one repeated child must be LIST-annotated."), + Invalid, testing::HasSubstr("LIST-annotated groups must not be repeated."), ConvertSchema(parquet_fields)); } + // Two-level list-annotated group cannot be repeated + // + // required group my_struct { + // repeated group my_list (LIST) { + // repeated int32 array; + // } + // } + { + auto array = PrimitiveNode::Make("array", Repetition::REPEATED, ParquetType::INT32); + auto list = + GroupNode::Make("my_list", Repetition::REPEATED, {array}, ConvertedType::LIST); + std::vector parquet_fields; + parquet_fields.push_back(GroupNode::Make("my_struct", Repetition::REQUIRED, {list})); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, testing::HasSubstr("LIST-annotated groups must not be repeated."), + ConvertSchema(parquet_fields)); + } // List>: outer list is two-level encoding, inner list is three-level // // optional group my_list (LIST) { @@ -912,8 +973,7 @@ TEST_F(TestConvertParquetSchema, IllegalParquetNestedSchema) { GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, ConvertedType::LIST)); EXPECT_RAISES_WITH_MESSAGE_THAT( - Invalid, - testing::HasSubstr("LIST-annotated groups must have at least one child."), + Invalid, testing::HasSubstr("Group must have at least one child."), ConvertSchema(parquet_fields)); } } diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 9c0db1d5335..33d788fa89d 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -28,6 +28,7 @@ #include "arrow/io/memory.h" #include "arrow/ipc/api.h" #include "arrow/result.h" +#include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/base64.h" #include "arrow/util/checked_cast.h" @@ -623,7 +624,10 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels, if (group.field_count() != 1) { return Status::Invalid("MAP-annotated groups must have a single child."); } - if (group.is_repeated()) { + // MAP-annotated groups cannot be repeated unless served as an element within a + // LIST-annotated group with 2-level structure. + if (group.is_repeated() && + (group.parent() == nullptr || !group.parent()->logical_type()->is_list())) { return Status::Invalid("MAP-annotated groups must not be repeated."); } @@ -654,7 +658,9 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels, return ListToSchemaField(group, current_levels, ctx, parent, out); } - current_levels.Increment(group); + if (group.is_optional()) { + current_levels.IncrementOptional(); + } int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated(); out->children.resize(1); @@ -697,41 +703,58 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels, return Status::OK(); } -Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels, - SchemaTreeContext* ctx, const SchemaField* parent, - SchemaField* out) { - if (group.field_count() != 1) { - return Status::Invalid("LIST-annotated groups must have a single child."); - } - if (group.is_repeated()) { - return Status::Invalid("LIST-annotated groups must not be repeated."); - } - - current_levels.Increment(group); +Status ResolveList(const GroupNode& group, LevelInfo current_levels, + SchemaTreeContext* ctx, SchemaField* out) { + auto check_two_level_list_repetition = [](const GroupNode& group) -> Status { + // When it is repeated, the LIST-annotated 2-level structure can only serve as an + // element within another LIST-annotated 2-level structure. + if (group.is_repeated() && + (group.parent() == nullptr || !group.parent()->logical_type()->is_list())) { + return Status::Invalid("LIST-annotated groups must not be repeated."); + } + return {}; + }; - out->children.resize(group.field_count()); SchemaField* child_field = &out->children[0]; - - ctx->LinkParent(out, parent); - ctx->LinkParent(child_field, out); - const Node& list_node = *group.field(0); - if (!list_node.is_repeated()) { return Status::Invalid( "Non-repeated nodes in a LIST-annotated group are not supported."); } - int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated(); if (list_node.is_group()) { const auto& list_group = static_cast(list_node); if (list_group.field_count() > 1) { // The inner type of the list should be a struct when there are multiple fields // in the repeated group - RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, child_field)); - } else if (list_group.field_count() == 1) { - const auto& repeated_field = list_group.field(0); - if (repeated_field->is_repeated()) { + RETURN_NOT_OK(check_two_level_list_repetition(group)); + return GroupToStruct(list_group, current_levels, ctx, out, child_field); + } + if (list_group.field_count() == 0) { + return Status::Invalid("Group must have at least one child."); + } + + if (list_group.logical_type()->is_none() && HasListElementName(list_group, group)) { + // Rule 4 at + // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules + // + // required/optional group name=SOMETHING { + // repeated group name=array or $SOMETHING_tuple { + // required/optional TYPE item; + // } + // } + // + // The inner type of the list should be a struct rather than a primitive value + // + // yields list not null> ?nullable + RETURN_NOT_OK(check_two_level_list_repetition(group)); + return GroupToStruct(list_group, current_levels, ctx, out, child_field); + } + + const auto& repeated_field = list_group.field(0); + if (!list_group.logical_type()->is_none() || repeated_field->is_repeated()) { + RETURN_NOT_OK(check_two_level_list_repetition(group)); + if (list_group.logical_type()->is_list()) { // Special case where the inner type might be a list with two-level encoding // like below: // @@ -742,69 +765,69 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels, // } // // yields list not null> ?nullable - if (!list_group.logical_type()->is_list()) { - return Status::Invalid("Group with one repeated child must be LIST-annotated."); - } - // LIST-annotated group with three-level encoding cannot be repeated. - if (repeated_field->is_group()) { - auto& repeated_group_field = static_cast(*repeated_field); - if (repeated_group_field.field_count() == 0) { - return Status::Invalid("LIST-annotated groups must have at least one child."); - } - if (!repeated_group_field.field(0)->is_repeated()) { - return Status::Invalid("LIST-annotated groups must not be repeated."); - } - } - RETURN_NOT_OK( - NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field)); - } else if (HasListElementName(list_group, group)) { - // We distinguish the special case that we have - // - // required/optional group name=SOMETHING { - // repeated group name=array or $SOMETHING_tuple { - // required/optional TYPE item; - // } - // } - // - // The inner type of the list should be a struct rather than a primitive value - // - // yields list not null> ?nullable - RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, child_field)); + return ListToSchemaField(list_group, current_levels, ctx, out, child_field); + } else if (list_group.logical_type()->is_map()) { + return MapToSchemaField(list_group, current_levels, ctx, out, child_field); } else { - // Resolve 3-level encoding - // - // required/optional group name=whatever { - // repeated group name=list { - // required/optional TYPE item; - // } - // } - // - // yields list ?nullable - RETURN_NOT_OK( - NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field)); + return GroupToStruct(list_group, current_levels, ctx, out, child_field); } - } else { - return Status::Invalid("Group must have at least one child."); } - } else { - // Two-level list encoding + + // Resolve normal 3-level encoding // - // required/optional group LIST { - // repeated TYPE; + // required/optional group name=whatever { + // repeated group name=list { + // required/optional TYPE item; + // } // } // - // TYPE is a primitive type - // - // yields list ?nullable - const auto& primitive_node = static_cast(list_node); - int column_index = ctx->schema->GetColumnIndex(primitive_node); - ARROW_ASSIGN_OR_RAISE(std::shared_ptr type, - GetTypeForNode(column_index, primitive_node, ctx)); - auto item_field = ::arrow::field(list_node.name(), type, /*nullable=*/false, - FieldIdMetadata(list_node.field_id())); - RETURN_NOT_OK( - PopulateLeaf(column_index, item_field, current_levels, ctx, out, child_field)); + // yields list ?nullable + if (group.is_repeated()) { + return Status::Invalid("LIST-annotated groups must not be repeated."); + } + + return NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field); + } + + // Two-level list encoding + // + // required/optional group LIST { + // repeated TYPE; + // } + // + // TYPE is a primitive type + // + // yields list ?nullable + RETURN_NOT_OK(check_two_level_list_repetition(group)); + const auto& primitive_node = static_cast(list_node); + int column_index = ctx->schema->GetColumnIndex(primitive_node); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr type, + GetTypeForNode(column_index, primitive_node, ctx)); + auto item_field = ::arrow::field(list_node.name(), type, /*nullable=*/false, + FieldIdMetadata(list_node.field_id())); + return PopulateLeaf(column_index, item_field, current_levels, ctx, out, child_field); +} + +Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels, + SchemaTreeContext* ctx, const SchemaField* parent, + SchemaField* out) { + if (group.field_count() != 1) { + return Status::Invalid("LIST-annotated groups must have a single child."); + } + + if (group.is_optional()) { + current_levels.IncrementOptional(); } + + out->children.resize(group.field_count()); + SchemaField* child_field = &out->children[0]; + + ctx->LinkParent(out, parent); + ctx->LinkParent(child_field, out); + + int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated(); + RETURN_NOT_OK(ResolveList(group, current_levels, ctx, out)); + ARROW_ASSIGN_OR_RAISE(auto list_type, MakeArrowList(child_field->field, ctx->properties)); out->field = ::arrow::field(group.name(), std::move(list_type), group.is_optional(),