Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 80 additions & 20 deletions cpp/src/parquet/arrow/arrow_schema_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,32 @@ TEST_F(TestConvertParquetSchema, ParquetLists) {
arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
}

// Deep nested two-level encoding List<List<List<Integer>>>:
// optional group my_list (LIST) {
// repeated group array (LIST) {
// repeated group array (LIST) {
// repeated int32 array;
// }
// }
// }
{
auto inner_array =
PrimitiveNode::Make("array", Repetition::REPEATED, ParquetType::INT32);
auto middle_array = GroupNode::Make("array", Repetition::REPEATED, {inner_array},
ConvertedType::LIST);
auto outer_array = GroupNode::Make("array", Repetition::REPEATED, {middle_array},
ConvertedType::LIST);
parquet_fields.push_back(GroupNode::Make("my_list", Repetition::OPTIONAL,
{outer_array}, ConvertedType::LIST));
auto arrow_inner_array = ::arrow::field("array", INT32, /*nullable=*/false);
auto arrow_middle_array = ::arrow::field(
"array", list_case.type_factory(arrow_inner_array), /*nullable=*/false);
auto arrow_outer_array = ::arrow::field(
"array", list_case.type_factory(arrow_middle_array), /*nullable=*/false);
auto arrow_list = list_case.type_factory(arrow_outer_array);
arrow_fields.push_back(::arrow::field("my_list", arrow_list, true));
}

// List<Map<String, String>> in three-level list encoding:
// optional group my_list (LIST) {
// repeated group list {
Expand Down Expand Up @@ -681,6 +707,36 @@ TEST_F(TestConvertParquetSchema, ParquetLists) {
arrow_fields.push_back(::arrow::field("my_list", arrow_list, /*nullable=*/true));
}

// List<Map<String, String>> in two-level list encoding:
//
// optional group my_list (LIST) {
// repeated group array (MAP) {
// repeated group key_value {
// required binary key (STRING);
// optional binary value (STRING);
// }
// }
// }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to add a case for the below which is still produced by parquet-java by default:

    // optional group my_list (LIST) {
    //   repeated group array (MAP) {
    //     repeated group key_value (MAP_KEY_VALUE) {
    //       required binary key (STRING);
    //       optional binary value (STRING);
    //     }
    //   }
    // }

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's not in backward-compatibility rules too. We may add it into document first.

{
auto key = PrimitiveNode::Make("key", Repetition::REQUIRED, ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto key_value = GroupNode::Make("key_value", Repetition::REPEATED, {key, value});
auto array =
GroupNode::Make("array", Repetition::REPEATED, {key_value}, ConvertedType::MAP);
parquet_fields.push_back(
GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, ConvertedType::LIST));

auto arrow_key = ::arrow::field("key", UTF8, /*nullable=*/false);
auto arrow_value = ::arrow::field("value", UTF8, /*nullable=*/true);
auto arrow_element = ::arrow::field(
"array", std::make_shared<::arrow::MapType>(arrow_key, arrow_value),
/*nullable=*/false);
auto arrow_list = list_case.type_factory(arrow_element);
arrow_fields.push_back(::arrow::field("my_list", arrow_list, /*nullable=*/true));
}

auto arrow_schema = ::arrow::schema(arrow_fields);

ArrowReaderProperties props;
Expand Down Expand Up @@ -844,34 +900,39 @@ TEST_F(TestConvertParquetSchema, ParquetRepeatedNestedSchema) {
}

TEST_F(TestConvertParquetSchema, IllegalParquetNestedSchema) {
// List<Map<String, String>> in two-level list encoding:
// Two-level list-annotated group cannot be repeated
//
// optional group my_list (LIST) {
// repeated group array (MAP) {
// repeated group key_value {
// required binary key (STRING);
// optional binary value (STRING);
// }
// }
// repeated group my_list (LIST) {
// repeated int32 array;
// }
{
auto key = PrimitiveNode::Make("key", Repetition::REQUIRED, ParquetType::BYTE_ARRAY,
ConvertedType::UTF8);
auto value = PrimitiveNode::Make("value", Repetition::OPTIONAL,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto key_value = GroupNode::Make("key_value", Repetition::REPEATED, {key, value});
auto array =
GroupNode::Make("array", Repetition::REPEATED, {key_value}, ConvertedType::MAP);
auto array = PrimitiveNode::Make("array", Repetition::REPEATED, ParquetType::INT32);
std::vector<NodePtr> parquet_fields;
parquet_fields.push_back(
GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, ConvertedType::LIST));
GroupNode::Make("my_list", Repetition::REPEATED, {array}, ConvertedType::LIST));

EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid,
testing::HasSubstr("Group with one repeated child must be LIST-annotated."),
Invalid, testing::HasSubstr("LIST-annotated groups must not be repeated."),
ConvertSchema(parquet_fields));
}
// Two-level list-annotated group cannot be repeated
//
// required group my_struct {
// repeated group my_list (LIST) {
// repeated int32 array;
// }
// }
{
auto array = PrimitiveNode::Make("array", Repetition::REPEATED, ParquetType::INT32);
auto list =
GroupNode::Make("my_list", Repetition::REPEATED, {array}, ConvertedType::LIST);
std::vector<NodePtr> parquet_fields;
parquet_fields.push_back(GroupNode::Make("my_struct", Repetition::REQUIRED, {list}));

EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid, testing::HasSubstr("LIST-annotated groups must not be repeated."),
ConvertSchema(parquet_fields));
}
// List<List<String>>: outer list is two-level encoding, inner list is three-level
//
// optional group my_list (LIST) {
Expand Down Expand Up @@ -912,8 +973,7 @@ TEST_F(TestConvertParquetSchema, IllegalParquetNestedSchema) {
GroupNode::Make("my_list", Repetition::OPTIONAL, {array}, ConvertedType::LIST));

EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid,
testing::HasSubstr("LIST-annotated groups must have at least one child."),
Invalid, testing::HasSubstr("Group must have at least one child."),
ConvertSchema(parquet_fields));
}
}
Expand Down
183 changes: 103 additions & 80 deletions cpp/src/parquet/arrow/schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "arrow/io/memory.h"
#include "arrow/ipc/api.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/base64.h"
#include "arrow/util/checked_cast.h"
Expand Down Expand Up @@ -623,7 +624,10 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels,
if (group.field_count() != 1) {
return Status::Invalid("MAP-annotated groups must have a single child.");
}
if (group.is_repeated()) {
// MAP-annotated groups cannot be repeated unless served as an element within a
// LIST-annotated group with 2-level structure.
if (group.is_repeated() &&
(group.parent() == nullptr || !group.parent()->logical_type()->is_list())) {
return Status::Invalid("MAP-annotated groups must not be repeated.");
}

Expand Down Expand Up @@ -654,7 +658,9 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels,
return ListToSchemaField(group, current_levels, ctx, parent, out);
}

current_levels.Increment(group);
if (group.is_optional()) {
current_levels.IncrementOptional();
}
int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();

out->children.resize(1);
Expand Down Expand Up @@ -697,41 +703,58 @@ Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels,
return Status::OK();
}

Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
SchemaTreeContext* ctx, const SchemaField* parent,
SchemaField* out) {
if (group.field_count() != 1) {
return Status::Invalid("LIST-annotated groups must have a single child.");
}
if (group.is_repeated()) {
return Status::Invalid("LIST-annotated groups must not be repeated.");
}

current_levels.Increment(group);
Status ResolveList(const GroupNode& group, LevelInfo current_levels,
SchemaTreeContext* ctx, SchemaField* out) {
auto check_two_level_list_repetition = [](const GroupNode& group) -> Status {
// When it is repeated, the LIST-annotated 2-level structure can only serve as an
// element within another LIST-annotated 2-level structure.
if (group.is_repeated() &&
(group.parent() == nullptr || !group.parent()->logical_type()->is_list())) {
return Status::Invalid("LIST-annotated groups must not be repeated.");
}
return {};
};

out->children.resize(group.field_count());
SchemaField* child_field = &out->children[0];

ctx->LinkParent(out, parent);
ctx->LinkParent(child_field, out);

const Node& list_node = *group.field(0);

if (!list_node.is_repeated()) {
return Status::Invalid(
"Non-repeated nodes in a LIST-annotated group are not supported.");
}

int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
if (list_node.is_group()) {
const auto& list_group = static_cast<const GroupNode&>(list_node);
if (list_group.field_count() > 1) {
// The inner type of the list should be a struct when there are multiple fields
// in the repeated group
RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, child_field));
} else if (list_group.field_count() == 1) {
const auto& repeated_field = list_group.field(0);
if (repeated_field->is_repeated()) {
RETURN_NOT_OK(check_two_level_list_repetition(group));
return GroupToStruct(list_group, current_levels, ctx, out, child_field);
}
if (list_group.field_count() == 0) {
return Status::Invalid("Group must have at least one child.");
}

if (list_group.logical_type()->is_none() && HasListElementName(list_group, group)) {
// Rule 4 at
// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules
//
// required/optional group name=SOMETHING {
// repeated group name=array or $SOMETHING_tuple {
// required/optional TYPE item;
// }
// }
//
// The inner type of the list should be a struct rather than a primitive value
//
// yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
RETURN_NOT_OK(check_two_level_list_repetition(group));
return GroupToStruct(list_group, current_levels, ctx, out, child_field);
}

const auto& repeated_field = list_group.field(0);
if (!list_group.logical_type()->is_none() || repeated_field->is_repeated()) {
RETURN_NOT_OK(check_two_level_list_repetition(group));
if (list_group.logical_type()->is_list()) {
// Special case where the inner type might be a list with two-level encoding
// like below:
//
Expand All @@ -742,69 +765,69 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
// }
//
// yields list<item: list<item: TYPE not null> not null> ?nullable
if (!list_group.logical_type()->is_list()) {
return Status::Invalid("Group with one repeated child must be LIST-annotated.");
}
// LIST-annotated group with three-level encoding cannot be repeated.
if (repeated_field->is_group()) {
auto& repeated_group_field = static_cast<const GroupNode&>(*repeated_field);
if (repeated_group_field.field_count() == 0) {
return Status::Invalid("LIST-annotated groups must have at least one child.");
}
if (!repeated_group_field.field(0)->is_repeated()) {
return Status::Invalid("LIST-annotated groups must not be repeated.");
}
}
RETURN_NOT_OK(
NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field));
} else if (HasListElementName(list_group, group)) {
// We distinguish the special case that we have
//
// required/optional group name=SOMETHING {
// repeated group name=array or $SOMETHING_tuple {
// required/optional TYPE item;
// }
// }
//
// The inner type of the list should be a struct rather than a primitive value
//
// yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, child_field));
return ListToSchemaField(list_group, current_levels, ctx, out, child_field);
} else if (list_group.logical_type()->is_map()) {
return MapToSchemaField(list_group, current_levels, ctx, out, child_field);
} else {
// Resolve 3-level encoding
//
// required/optional group name=whatever {
// repeated group name=list {
// required/optional TYPE item;
// }
// }
//
// yields list<item: TYPE ?nullable> ?nullable
RETURN_NOT_OK(
NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field));
return GroupToStruct(list_group, current_levels, ctx, out, child_field);
}
} else {
return Status::Invalid("Group must have at least one child.");
}
} else {
// Two-level list encoding

// Resolve normal 3-level encoding
//
// required/optional group LIST {
// repeated TYPE;
// required/optional group name=whatever {
// repeated group name=list {
// required/optional TYPE item;
// }
// }
//
// TYPE is a primitive type
//
// yields list<item: TYPE not null> ?nullable
const auto& primitive_node = static_cast<const PrimitiveNode&>(list_node);
int column_index = ctx->schema->GetColumnIndex(primitive_node);
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
GetTypeForNode(column_index, primitive_node, ctx));
auto item_field = ::arrow::field(list_node.name(), type, /*nullable=*/false,
FieldIdMetadata(list_node.field_id()));
RETURN_NOT_OK(
PopulateLeaf(column_index, item_field, current_levels, ctx, out, child_field));
// yields list<item: TYPE ?nullable> ?nullable
if (group.is_repeated()) {
return Status::Invalid("LIST-annotated groups must not be repeated.");
}

return NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field);
}

// Two-level list encoding
//
// required/optional group LIST {
// repeated TYPE;
// }
//
// TYPE is a primitive type
//
// yields list<item: TYPE not null> ?nullable
RETURN_NOT_OK(check_two_level_list_repetition(group));
const auto& primitive_node = static_cast<const PrimitiveNode&>(list_node);
int column_index = ctx->schema->GetColumnIndex(primitive_node);
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
GetTypeForNode(column_index, primitive_node, ctx));
auto item_field = ::arrow::field(list_node.name(), type, /*nullable=*/false,
FieldIdMetadata(list_node.field_id()));
return PopulateLeaf(column_index, item_field, current_levels, ctx, out, child_field);
}

Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
SchemaTreeContext* ctx, const SchemaField* parent,
SchemaField* out) {
if (group.field_count() != 1) {
return Status::Invalid("LIST-annotated groups must have a single child.");
}

if (group.is_optional()) {
current_levels.IncrementOptional();
}

out->children.resize(group.field_count());
SchemaField* child_field = &out->children[0];

ctx->LinkParent(out, parent);
ctx->LinkParent(child_field, out);

int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
RETURN_NOT_OK(ResolveList(group, current_levels, ctx, out));

ARROW_ASSIGN_OR_RAISE(auto list_type,
MakeArrowList(child_field->field, ctx->properties));
out->field = ::arrow::field(group.name(), std::move(list_type), group.is_optional(),
Expand Down
Loading