Skip to content

Commit 4f7558f

Browse files
committed
Fix converting schema failure with deep nested two-level encoding list structure
1 parent 252a685 commit 4f7558f

2 files changed

Lines changed: 128 additions & 118 deletions

File tree

cpp/src/parquet/arrow/arrow_schema_test.cc

Lines changed: 57 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -649,6 +649,32 @@ TEST_F(TestConvertParquetSchema, ParquetLists) {
649649
arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
650650
}
651651

652+
// Deep nested two-level encoding List<List<List<Integer>>>:
653+
// optional group my_list (LIST) {
654+
// repeated group array (LIST) {
655+
// repeated group array (LIST) {
656+
// repeated int32 array;
657+
// }
658+
// }
659+
// }
660+
{
661+
auto inner_array =
662+
PrimitiveNode::Make("array", Repetition::REPEATED, ParquetType::INT32);
663+
auto middle_array = GroupNode::Make("array", Repetition::REPEATED, {inner_array},
664+
ConvertedType::LIST);
665+
auto outer_array = GroupNode::Make("array", Repetition::REPEATED, {middle_array},
666+
ConvertedType::LIST);
667+
parquet_fields.push_back(GroupNode::Make("my_list", Repetition::OPTIONAL,
668+
{outer_array}, ConvertedType::LIST));
669+
auto arrow_inner_array = ::arrow::field("array", INT32, /*nullable=*/false);
670+
auto arrow_middle_array = ::arrow::field(
671+
"array", list_case.type_factory(arrow_inner_array), /*nullable=*/false);
672+
auto arrow_outer_array = ::arrow::field(
673+
"array", list_case.type_factory(arrow_middle_array), /*nullable=*/false);
674+
auto arrow_list = list_case.type_factory(arrow_outer_array);
675+
arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
676+
}
677+
652678
// List<Map<String, String>> in three-level list encoding:
653679
// optional group my_list (LIST) {
654680
// repeated group list {
@@ -681,6 +707,36 @@ TEST_F(TestConvertParquetSchema, ParquetLists) {
681707
arrow_fields.push_back(::arrow::field("my_list", arrow_list, /*nullable=*/true));
682708
}
683709

710+
// List<Map<String, String>> in two-level list encoding:
711+
//
712+
// optional group my_list (LIST) {
713+
// repeated group array (MAP) {
714+
// repeated group key_value {
715+
// required binary key (STRING);
716+
// optional binary value (STRING);
717+
// }
718+
// }
719+
// }
720+
{
721+
auto key = PrimitiveNode::Make("key", Repetition::REQUIRED, ParquetType::BYTE_ARRAY,
722+
ConvertedType::UTF8);
723+
auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
724+
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
725+
auto key_value = GroupNode::Make("key_value", Repetition::REPEATED, {key, value});
726+
auto array =
727+
GroupNode::Make("array", Repetition::REPEATED, {key_value}, ConvertedType::MAP);
728+
parquet_fields.push_back(
729+
GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, ConvertedType::LIST));
730+
731+
auto arrow_key = ::arrow::field("key", UTF8, /*nullable=*/false);
732+
auto arrow_value = ::arrow::field("value", UTF8, /*nullable=*/true);
733+
auto arrow_element = ::arrow::field(
734+
"array", std::make_shared<::arrow::MapType>(arrow_key, arrow_value),
735+
/*nullable=*/false);
736+
auto arrow_list = list_case.type_factory(arrow_element);
737+
arrow_fields.push_back(::arrow::field("my_list", arrow_list, /*nullable=*/true));
738+
}
739+
684740
auto arrow_schema = ::arrow::schema(arrow_fields);
685741

686742
ArrowReaderProperties props;
@@ -844,34 +900,6 @@ TEST_F(TestConvertParquetSchema, ParquetRepeatedNestedSchema) {
844900
}
845901

846902
TEST_F(TestConvertParquetSchema, IllegalParquetNestedSchema) {
847-
// List<Map<String, String>> in two-level list encoding:
848-
//
849-
// optional group my_list (LIST) {
850-
// repeated group array (MAP) {
851-
// repeated group key_value {
852-
// required binary key (STRING);
853-
// optional binary value (STRING);
854-
// }
855-
// }
856-
// }
857-
{
858-
auto key = PrimitiveNode::Make("key", Repetition::REQUIRED, ParquetType::BYTE_ARRAY,
859-
ConvertedType::UTF8);
860-
auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
861-
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
862-
auto key_value = GroupNode::Make("key_value", Repetition::REPEATED, {key, value});
863-
auto array =
864-
GroupNode::Make("array", Repetition::REPEATED, {key_value}, ConvertedType::MAP);
865-
std::vector<NodePtr> parquet_fields;
866-
parquet_fields.push_back(
867-
GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, ConvertedType::LIST));
868-
869-
EXPECT_RAISES_WITH_MESSAGE_THAT(
870-
Invalid,
871-
testing::HasSubstr("Group with one repeated child must be LIST-annotated."),
872-
ConvertSchema(parquet_fields));
873-
}
874-
875903
// List<List<String>>: outer list is two-level encoding, inner list is three-level
876904
//
877905
// optional group my_list (LIST) {
@@ -912,8 +940,7 @@ TEST_F(TestConvertParquetSchema, IllegalParquetNestedSchema) {
912940
GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, ConvertedType::LIST));
913941

914942
EXPECT_RAISES_WITH_MESSAGE_THAT(
915-
Invalid,
916-
testing::HasSubstr("LIST-annotated groups must have at least one child."),
943+
Invalid, testing::HasSubstr("Group must have at least one child."),
917944
ConvertSchema(parquet_fields));
918945
}
919946
}

cpp/src/parquet/arrow/schema.cc

Lines changed: 71 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "arrow/io/memory.h"
2828
#include "arrow/ipc/api.h"
2929
#include "arrow/result.h"
30+
#include "arrow/status.h"
3031
#include "arrow/type.h"
3132
#include "arrow/util/base64.h"
3233
#include "arrow/util/checked_cast.h"
@@ -620,7 +621,8 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels,
620621
if (group.field_count() != 1) {
621622
return Status::Invalid("MAP-annotated groups must have a single child.");
622623
}
623-
if (group.is_repeated()) {
624+
if (group.is_repeated() &&
625+
(group.parent() == nullptr || !group.parent()->logical_type()->is_list())) {
624626
return Status::Invalid("MAP-annotated groups must not be repeated.");
625627
}
626628

@@ -651,7 +653,9 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels,
651653
return ListToSchemaField(group, current_levels, ctx, parent, out);
652654
}
653655

654-
current_levels.Increment(group);
656+
if (group.is_optional()) {
657+
current_levels.IncrementOptional();
658+
}
655659
int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
656660

657661
out->children.resize(1);
@@ -694,17 +698,76 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels,
694698
return Status::OK();
695699
}
696700

701+
Status ResolveList(const GroupNode& group, const Node& list_node,
702+
LevelInfo current_levels, SchemaTreeContext* ctx,
703+
const SchemaField* out, SchemaField* child_field) {
704+
auto check_two_level_list_repeated = [](const GroupNode& group) -> Status {
705+
// When it is repeated, the LIST-annotated 2-level structure can only serve as an
706+
// element within another LIST-annotated 2-level structure.
707+
if (group.is_repeated() &&
708+
(group.parent() == nullptr || !group.parent()->logical_type()->is_list())) {
709+
return Status::Invalid("LIST-annotated groups must not be repeated.");
710+
}
711+
return {};
712+
};
713+
714+
if (list_node.is_group()) {
715+
const auto& list_group = static_cast<const GroupNode&>(list_node);
716+
if (list_group.field_count() > 1) {
717+
// The inner type of the list should be a struct when there are multiple fields
718+
// in the repeated group
719+
RETURN_NOT_OK(check_two_level_list_repeated(group));
720+
return GroupToStruct(list_group, current_levels, ctx, out, child_field);
721+
}
722+
if (list_group.field_count() == 0) {
723+
return Status::Invalid("Group must have at least one child.");
724+
}
725+
726+
if (list_group.logical_type()->is_none() && HasListElementName(list_group, group)) {
727+
// Backward-compatibility rule 4
728+
RETURN_NOT_OK(check_two_level_list_repeated(group));
729+
return GroupToStruct(list_group, current_levels, ctx, out, child_field);
730+
}
731+
732+
const auto& repeated_field = list_group.field(0);
733+
if (!list_group.logical_type()->is_none() || repeated_field->is_repeated()) {
734+
RETURN_NOT_OK(check_two_level_list_repeated(group));
735+
if (list_group.logical_type()->is_list()) {
736+
return ListToSchemaField(list_group, current_levels, ctx, out, child_field);
737+
} else if (list_group.logical_type()->is_map()) {
738+
return MapToSchemaField(list_group, current_levels, ctx, out, child_field);
739+
} else {
740+
return GroupToStruct(list_group, current_levels, ctx, out, child_field);
741+
}
742+
}
743+
744+
if (group.is_repeated()) {
745+
return Status::Invalid("LIST-annotated groups must not be repeated.");
746+
}
747+
748+
return NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field);
749+
}
750+
751+
RETURN_NOT_OK(check_two_level_list_repeated(group));
752+
const auto& primitive_node = static_cast<const PrimitiveNode&>(list_node);
753+
int column_index = ctx->schema->GetColumnIndex(primitive_node);
754+
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
755+
GetTypeForNode(column_index, primitive_node, ctx));
756+
auto item_field = ::arrow::field(list_node.name(), type, /*nullable=*/false,
757+
FieldIdMetadata(list_node.field_id()));
758+
return PopulateLeaf(column_index, item_field, current_levels, ctx, out, child_field);
759+
}
760+
697761
Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
698762
SchemaTreeContext* ctx, const SchemaField* parent,
699763
SchemaField* out) {
700764
if (group.field_count() != 1) {
701765
return Status::Invalid("LIST-annotated groups must have a single child.");
702766
}
703-
if (group.is_repeated()) {
704-
return Status::Invalid("LIST-annotated groups must not be repeated.");
705-
}
706767

707-
current_levels.Increment(group);
768+
if (group.is_optional()) {
769+
current_levels.IncrementOptional();
770+
}
708771

709772
out->children.resize(group.field_count());
710773
SchemaField* child_field = &out->children[0];
@@ -720,88 +783,8 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
720783
}
721784

722785
int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
723-
if (list_node.is_group()) {
724-
const auto& list_group = static_cast<const GroupNode&>(list_node);
725-
if (list_group.field_count() > 1) {
726-
// The inner type of the list should be a struct when there are multiple fields
727-
// in the repeated group
728-
RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, child_field));
729-
} else if (list_group.field_count() == 1) {
730-
const auto& repeated_field = list_group.field(0);
731-
if (repeated_field->is_repeated()) {
732-
// Special case where the inner type might be a list with two-level encoding
733-
// like below:
734-
//
735-
// required/optional group name=SOMETHING (LIST) {
736-
// repeated group array (LIST) {
737-
// repeated TYPE item;
738-
// }
739-
// }
740-
//
741-
// yields list<item: list<item: TYPE not null> not null> ?nullable
742-
if (!list_group.logical_type()->is_list()) {
743-
return Status::Invalid("Group with one repeated child must be LIST-annotated.");
744-
}
745-
// LIST-annotated group with three-level encoding cannot be repeated.
746-
if (repeated_field->is_group()) {
747-
auto& repeated_group_field = static_cast<const GroupNode&>(*repeated_field);
748-
if (repeated_group_field.field_count() == 0) {
749-
return Status::Invalid("LIST-annotated groups must have at least one child.");
750-
}
751-
if (!repeated_group_field.field(0)->is_repeated()) {
752-
return Status::Invalid("LIST-annotated groups must not be repeated.");
753-
}
754-
}
755-
RETURN_NOT_OK(
756-
NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field));
757-
} else if (HasListElementName(list_group, group)) {
758-
// We distinguish the special case that we have
759-
//
760-
// required/optional group name=SOMETHING {
761-
// repeated group name=array or $SOMETHING_tuple {
762-
// required/optional TYPE item;
763-
// }
764-
// }
765-
//
766-
// The inner type of the list should be a struct rather than a primitive value
767-
//
768-
// yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
769-
RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, child_field));
770-
} else {
771-
// Resolve 3-level encoding
772-
//
773-
// required/optional group name=whatever {
774-
// repeated group name=list {
775-
// required/optional TYPE item;
776-
// }
777-
// }
778-
//
779-
// yields list<item: TYPE ?nullable> ?nullable
780-
RETURN_NOT_OK(
781-
NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field));
782-
}
783-
} else {
784-
return Status::Invalid("Group must have at least one child.");
785-
}
786-
} else {
787-
// Two-level list encoding
788-
//
789-
// required/optional group LIST {
790-
// repeated TYPE;
791-
// }
792-
//
793-
// TYPE is a primitive type
794-
//
795-
// yields list<item: TYPE not null> ?nullable
796-
const auto& primitive_node = static_cast<const PrimitiveNode&>(list_node);
797-
int column_index = ctx->schema->GetColumnIndex(primitive_node);
798-
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
799-
GetTypeForNode(column_index, primitive_node, ctx));
800-
auto item_field = ::arrow::field(list_node.name(), type, /*nullable=*/false,
801-
FieldIdMetadata(list_node.field_id()));
802-
RETURN_NOT_OK(
803-
PopulateLeaf(column_index, item_field, current_levels, ctx, out, child_field));
804-
}
786+
RETURN_NOT_OK(ResolveList(group, list_node, current_levels, ctx, out, child_field));
787+
805788
ARROW_ASSIGN_OR_RAISE(auto list_type,
806789
MakeArrowList(child_field->field, ctx->properties));
807790
out->field = ::arrow::field(group.name(), std::move(list_type), group.is_optional(),

0 commit comments

Comments
 (0)