Skip to content

Commit 09c94ff

Browse files
committed
Convert RunEndEncoded field to Parquet
1 parent 4506998 commit 09c94ff

File tree

9 files changed

+312
-26
lines changed

9 files changed

+312
-26
lines changed

arrow-array/src/array/run_array.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,12 @@ use crate::{
3232

3333
/// An array of [run-end encoded values](https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout)
3434
///
35-
/// This encoding is variation on [run-length encoding (RLE)](https://en.wikipedia.org/wiki/Run-length_encoding)
35+
/// This encoding is a variation on [run-length encoding (RLE)](https://en.wikipedia.org/wiki/Run-length_encoding)
3636
/// and is good for representing data containing same values repeated consecutively.
3737
///
3838
/// [`RunArray`] contains `run_ends` array and `values` array of same length.
3939
/// The `run_ends` array stores the indexes at which the run ends. The `values` array
40-
/// stores the value of each run. Below example illustrates how a logical array is represented in
40+
/// stores the value of each run. The below example illustrates how a logical array is represented in
4141
/// [`RunArray`]
4242
///
4343
///

arrow-cast/src/cast/mod.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
130130
| FixedSizeList(_, _)
131131
| Struct(_)
132132
| Map(_, _)
133-
| Dictionary(_, _),
133+
| Dictionary(_, _)
134+
| RunEndEncoded(_, _),
134135
) => true,
135136
// Dictionary/List conditions should be put in front of others
136137
(Dictionary(_, from_value_type), Dictionary(_, to_value_type)) => {
@@ -179,6 +180,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
179180
_ => false,
180181
}
181182
}
183+
// TODO: RunEndEncoded here?
182184
// cast one decimal type to another decimal type
183185
(
184186
Decimal32(_, _) | Decimal64(_, _) | Decimal128(_, _) | Decimal256(_, _),
@@ -815,6 +817,7 @@ pub fn cast_with_options(
815817
"Casting from type {from_type:?} to dictionary type {to_type:?} not supported",
816818
))),
817819
},
820+
// TODO: RunEndEncoded here?
818821
(List(_), List(to)) => cast_list_values::<i32>(array, to, cast_options),
819822
(LargeList(_), LargeList(to)) => cast_list_values::<i64>(array, to, cast_options),
820823
(List(_), LargeList(list_to)) => cast_list::<i32, i64>(array, list_to, cast_options),

arrow-schema/src/datatype.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -354,7 +354,7 @@ pub enum DataType {
354354
/// that contain many repeated values using less memory, but with
355355
/// a higher CPU overhead for some operations.
356356
///
357-
/// This type mostly used to represent low cardinality string
357+
/// This type is mostly used to represent low cardinality string
358358
/// arrays or a limited set of primitive types as integers.
359359
Dictionary(Box<DataType>, Box<DataType>),
360360
/// Exact 32-bit width decimal value with precision and scale

parquet/src/arrow/array_reader/byte_array.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,28 @@ pub fn make_byte_array_reader(
6767
pages, data_type, reader,
6868
)))
6969
}
70+
// TODO eventually add a dedicated [`ArrayReader`] for REE
71+
ArrowType::RunEndEncoded(_, ref val_field) => match val_field.data_type() {
72+
ArrowType::Binary
73+
| ArrowType::Utf8
74+
| ArrowType::Decimal128(_, _)
75+
| ArrowType::Decimal256(_, _) => {
76+
let reader = GenericRecordReader::new(column_desc);
77+
Ok(Box::new(ByteArrayReader::<i32>::new(
78+
pages, data_type, reader,
79+
)))
80+
}
81+
ArrowType::LargeUtf8 | ArrowType::LargeBinary => {
82+
let reader = GenericRecordReader::new(column_desc);
83+
Ok(Box::new(ByteArrayReader::<i64>::new(
84+
pages, data_type, reader,
85+
)))
86+
}
87+
_ => Err(general_err!(
88+
"invalid run end encoded value type for byte array reader - {}",
89+
data_type
90+
)),
91+
},
7092
_ => Err(general_err!(
7193
"invalid data type for byte array reader - {}",
7294
data_type
@@ -147,6 +169,11 @@ impl<I: OffsetSizeTrait> ArrayReader for ByteArrayReader<I> {
147169
.with_precision_and_scale(p, s)?;
148170
Arc::new(decimal)
149171
}
172+
// TODO eventually add a dedicated [`ArrayReader`] for REE
173+
ArrowType::RunEndEncoded(_, ref val_field) => {
174+
let array = buffer.into_array(null_buffer, val_field.data_type().clone());
175+
arrow_cast::cast(&array, &self.data_type)?
176+
}
150177
_ => buffer.into_array(null_buffer, self.data_type.clone()),
151178
};
152179

parquet/src/arrow/arrow_writer/byte_array.rs

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ use crate::util::bit_util::num_required_bits;
2828
use crate::util::interner::{Interner, Storage};
2929
use arrow_array::{
3030
Array, ArrayAccessor, BinaryArray, BinaryViewArray, DictionaryArray, FixedSizeBinaryArray,
31-
LargeBinaryArray, LargeStringArray, StringArray, StringViewArray,
31+
LargeBinaryArray, LargeStringArray, RunArray, StringArray, StringViewArray,
3232
};
3333
use arrow_schema::DataType;
3434

@@ -59,6 +59,28 @@ macro_rules! downcast_dict_op {
5959
};
6060
}
6161

62+
macro_rules! downcast_ree_impl {
63+
($array:ident, $key:ident, $val:ident, $op:expr $(, $arg:expr)*) => {{
64+
$op($array
65+
.as_any()
66+
.downcast_ref::<RunArray<arrow_array::types::$key>>()
67+
.unwrap()
68+
.downcast::<$val>()
69+
.unwrap()$(, $arg)*)
70+
}};
71+
}
72+
73+
macro_rules! downcast_ree_op {
74+
($run_end_field:expr, $val:ident, $array:ident, $op:expr $(, $arg:expr)*) => {
75+
match $run_end_field.data_type() {
76+
DataType::Int16 => downcast_ree_impl!($array, Int16Type, $val, $op$(, $arg)*),
77+
DataType::Int32 => downcast_ree_impl!($array, Int32Type, $val, $op$(, $arg)*),
78+
DataType::Int64 => downcast_ree_impl!($array, Int64Type, $val, $op$(, $arg)*),
79+
_ => unreachable!(),
80+
}
81+
};
82+
}
83+
6284
macro_rules! downcast_op {
6385
($data_type:expr, $array:ident, $op:expr $(, $arg:expr)*) => {
6486
match $data_type {
@@ -90,6 +112,20 @@ macro_rules! downcast_op {
90112
}
91113
d => unreachable!("cannot downcast {} dictionary value to byte array", d),
92114
},
115+
DataType::RunEndEncoded(run_end, value) => match value.data_type() {
116+
DataType::Utf8 => downcast_ree_op!(run_end, StringArray, $array, $op$(, $arg)*),
117+
DataType::LargeUtf8 => {
118+
downcast_ree_op!(run_end, LargeStringArray, $array, $op$(, $arg)*)
119+
}
120+
DataType::Binary => downcast_ree_op!(run_end, BinaryArray, $array, $op$(, $arg)*),
121+
DataType::LargeBinary => {
122+
downcast_ree_op!(run_end, LargeBinaryArray, $array, $op$(, $arg)*)
123+
}
124+
DataType::FixedSizeBinary(_) => {
125+
downcast_ree_op!(run_end, FixedSizeBinaryArray, $array, $op$(, $arg)*)
126+
}
127+
d => unreachable!("cannot downcast {} run end encoded value to byte array", d),
128+
},
93129
d => unreachable!("cannot downcast {} to byte array", d),
94130
}
95131
};

parquet/src/arrow/arrow_writer/levels.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,10 @@ impl LevelInfoBuilder {
222222
_ => unreachable!(),
223223
})
224224
}
225+
DataType::RunEndEncoded(_, v) if is_leaf(v.data_type()) => {
226+
let levels = ArrayLevels::new(parent_ctx, is_nullable, array.clone());
227+
Ok(Self::Primitive(levels))
228+
}
225229
d => Err(nyi_err!("Datatype {} is not yet supported", d)),
226230
}
227231
}

0 commit comments

Comments
 (0)