@@ -1122,6 +1122,17 @@ impl ArrowColumnWriterFactory {
11221122 ArrowDataType :: FixedSizeBinary ( _) => out. push ( bytes ( leaves. next ( ) . unwrap ( ) ) ?) ,
11231123 _ => out. push ( col ( leaves. next ( ) . unwrap ( ) ) ?) ,
11241124 } ,
1125+ ArrowDataType :: RunEndEncoded ( _, value_type) => match value_type. data_type ( ) {
1126+ ArrowDataType :: Utf8
1127+ | ArrowDataType :: LargeUtf8
1128+ | ArrowDataType :: Binary
1129+ | ArrowDataType :: LargeBinary => out. push ( bytes ( leaves. next ( ) . unwrap ( ) ) ?) ,
1130+ ArrowDataType :: Utf8View | ArrowDataType :: BinaryView => {
1131+ out. push ( bytes ( leaves. next ( ) . unwrap ( ) ) ?)
1132+ }
1133+ ArrowDataType :: FixedSizeBinary ( _) => out. push ( bytes ( leaves. next ( ) . unwrap ( ) ) ?) ,
1134+ _ => out. push ( col ( leaves. next ( ) . unwrap ( ) ) ?) ,
1135+ } ,
11251136 _ => {
11261137 return Err ( ParquetError :: NYI ( format ! (
11271138 "Attempting to write an Arrow type {data_type} to parquet that is not yet implemented"
@@ -1215,6 +1226,41 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result<usi
12151226 write_primitive ( typed, array. values ( ) , levels)
12161227 }
12171228 } ,
1229+ ArrowDataType :: RunEndEncoded ( _, value_type) => match value_type. data_type ( ) {
1230+ ArrowDataType :: Decimal32 ( _, _) => {
1231+ let array = arrow_cast:: cast ( column, value_type. data_type ( ) ) ?;
1232+ let array = array
1233+ . as_primitive :: < Decimal32Type > ( )
1234+ . unary :: < _ , Int32Type > ( |v| v) ;
1235+ write_primitive ( typed, array. values ( ) , levels)
1236+ }
1237+ ArrowDataType :: Decimal64 ( _, _) => {
1238+ let array = arrow_cast:: cast ( column, value_type. data_type ( ) ) ?;
1239+ let array = array
1240+ . as_primitive :: < Decimal64Type > ( )
1241+ . unary :: < _ , Int32Type > ( |v| v as i32 ) ;
1242+ write_primitive ( typed, array. values ( ) , levels)
1243+ }
1244+ ArrowDataType :: Decimal128 ( _, _) => {
1245+ let array = arrow_cast:: cast ( column, value_type. data_type ( ) ) ?;
1246+ let array = array
1247+ . as_primitive :: < Decimal128Type > ( )
1248+ . unary :: < _ , Int32Type > ( |v| v as i32 ) ;
1249+ write_primitive ( typed, array. values ( ) , levels)
1250+ }
1251+ ArrowDataType :: Decimal256 ( _, _) => {
1252+ let array = arrow_cast:: cast ( column, value_type. data_type ( ) ) ?;
1253+ let array = array
1254+ . as_primitive :: < Decimal256Type > ( )
1255+ . unary :: < _ , Int32Type > ( |v| v. as_i128 ( ) as i32 ) ;
1256+ write_primitive ( typed, array. values ( ) , levels)
1257+ }
1258+ _ => {
1259+ let array = arrow_cast:: cast ( column, & ArrowDataType :: Int32 ) ?;
1260+ let array = array. as_primitive :: < Int32Type > ( ) ;
1261+ write_primitive ( typed, array. values ( ) , levels)
1262+ }
1263+ } ,
12181264 _ => {
12191265 let array = arrow_cast:: cast ( column, & ArrowDataType :: Int32 ) ?;
12201266 let array = array. as_primitive :: < Int32Type > ( ) ;
@@ -1297,6 +1343,12 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result<usi
12971343 write_primitive ( typed, array. values ( ) , levels)
12981344 }
12991345 } ,
1346+ ArrowDataType :: RunEndEncoded ( _run_ends, _values) => {
1347+ Err ( ParquetError :: NYI (
1348+ "Int64ColumnWriter: Attempting to write an Arrow REE type that is not yet implemented"
1349+ . to_string ( ) ,
1350+ ) )
1351+ }
13001352 _ => {
13011353 let array = arrow_cast:: cast ( column, & ArrowDataType :: Int64 ) ?;
13021354 let array = array. as_primitive :: < Int64Type > ( ) ;
@@ -1371,6 +1423,12 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result<usi
13711423 let array = column. as_primitive :: < Float16Type > ( ) ;
13721424 get_float_16_array_slice ( array, indices)
13731425 }
1426+ ArrowDataType :: RunEndEncoded ( _run_ends, _values) => {
1427+ return Err ( ParquetError :: NYI (
1428+ "FixedLenByteArrayColumnWriter: Attempting to write an Arrow REE type that is not yet implemented"
1429+ . to_string ( ) ,
1430+ ) ) ;
1431+ }
13741432 _ => {
13751433 return Err ( ParquetError :: NYI (
13761434 "Attempting to write an Arrow type that is not yet implemented" . to_string ( ) ,
@@ -4481,4 +4539,153 @@ mod tests {
44814539 assert_eq ! ( get_dict_page_size( col0_meta) , 1024 * 1024 ) ;
44824540 assert_eq ! ( get_dict_page_size( col1_meta) , 1024 * 1024 * 4 ) ;
44834541 }
4542+
/// Writes a run-end-encoded string column to an in-memory Parquet file and
/// reads it back, checking the recovered schema, the number of batches and
/// rows, and that the column chunk was dictionary encoded.
#[test]
fn arrow_writer_run_end_encoded_string() {
    // Two long runs: 100000 x "alpha" followed by 100000 x "beta".
    let alphas = vec![Some("alpha"); 100000];
    let betas = vec![Some("beta"); 100000];
    let mut ree_builder = StringRunBuilder::<Int32Type>::new();
    ree_builder.extend(alphas.into_iter().chain(betas));
    let ree_column: RunArray<Int32Type> = ree_builder.finish();

    // Single-field schema derived from the array itself.
    let ree_field = Field::new(
        "ree",
        ree_column.data_type().clone(),
        ree_column.is_nullable(),
    );
    let schema = Arc::new(Schema::new(vec![ree_field]));

    // Serialize the batch into an in-memory buffer.
    let mut parquet_bytes: Vec<u8> = Vec::new();
    let mut writer = ArrowWriter::try_new(&mut parquet_bytes, schema.clone(), None).unwrap();
    let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(ree_column)]).unwrap();
    writer.write(&batch).unwrap();
    writer.close().unwrap();

    // Open the buffer for reading.
    let reader = ParquetRecordBatchReaderBuilder::try_new(Bytes::from(parquet_bytes)).unwrap();

    // Capture the dictionary indicators of the first column chunk now; the
    // reader builder is consumed by `build()` further down.
    let column_chunk = &reader.metadata().row_groups()[0].columns()[0];
    let has_dict_encoding = column_chunk
        .encodings()
        .any(|encoding| encoding == Encoding::RLE_DICTIONARY);
    let has_dict_page = column_chunk.dictionary_page_offset().is_some();

    // The schema read back must still describe a run-end-encoded column.
    let run_ends_field = Field::new("run_ends", arrow_schema::DataType::Int32, false);
    let values_field = Field::new("values", arrow_schema::DataType::Utf8, true);
    let expected_schema = Arc::new(Schema::new(vec![Field::new(
        "ree",
        DataType::RunEndEncoded(Arc::new(run_ends_field), Arc::new(values_field)),
        false,
    )]));
    assert_eq!(&expected_schema, reader.schema());

    // Decode every batch and check the totals.
    let record_reader = reader.build().unwrap();
    let batches = record_reader.collect::<ArrowResult<Vec<_>>>().unwrap();
    assert_eq!(batches.len(), 196);
    let total_rows: usize = batches.iter().map(|decoded| decoded.num_rows()).sum();
    assert_eq!(total_rows, 200000);

    // Dictionary encoding shows up as an RLE_DICTIONARY encoding plus a
    // dictionary page offset in the column chunk metadata.
    assert!(has_dict_encoding, "RunArray should be dictionary encoded");
    assert!(has_dict_page, "RunArray should have dictionary page");
}
4606+
/// Writes a run-end-encoded Int32 column to an in-memory Parquet file and
/// reads it back, checking the recovered schema, the number of batches and
/// rows, and that the column chunk was dictionary encoded.
#[test]
fn arrow_writer_run_end_encoded_int() {
    // Create a run array of 32-bit integers: 100000 x 1 followed by 100000 x 2.
    let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new();
    builder.extend(
        vec![Some(1); 100000]
            .into_iter()
            .chain(vec![Some(2); 100000]),
    );
    let run_array: RunArray<Int32Type> = builder.finish();
    let schema = Arc::new(Schema::new(vec![Field::new(
        "ree",
        run_array.data_type().clone(),
        run_array.is_nullable(),
    )]));

    // Write to parquet
    let mut parquet_bytes: Vec<u8> = Vec::new();
    let mut writer = ArrowWriter::try_new(&mut parquet_bytes, schema.clone(), None).unwrap();
    let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(run_array)]).unwrap();
    writer.write(&batch).unwrap();
    writer.close().unwrap();

    // Read back and verify
    let bytes = Bytes::from(parquet_bytes);
    let reader = ParquetRecordBatchReaderBuilder::try_new(bytes).unwrap();

    // Check if dictionary was used by examining the metadata of the first
    // column chunk. If dictionary encoding worked, we should see the
    // RLE_DICTIONARY encoding and have a dictionary page offset.
    let metadata = reader.metadata();
    let row_group = &metadata.row_groups()[0];
    let col_meta = &row_group.columns()[0];
    let has_dict_encoding = col_meta.encodings().any(|e| e == Encoding::RLE_DICTIONARY);
    let has_dict_page = col_meta.dictionary_page_offset().is_some();

    // Verify the schema is REE encoded when we read it back
    let expected_schema = Arc::new(Schema::new(vec![Field::new(
        "ree",
        DataType::RunEndEncoded(
            Arc::new(Field::new("run_ends", arrow_schema::DataType::Int32, false)),
            Arc::new(Field::new("values", arrow_schema::DataType::Int32, true)),
        ),
        false,
    )]));
    assert_eq!(&expected_schema, reader.schema());

    // Read the data back
    let batches: Vec<_> = reader
        .build()
        .unwrap()
        .collect::<ArrowResult<Vec<_>>>()
        .unwrap();
    assert_eq!(batches.len(), 196);
    // Count rows in total
    let total_rows = batches.iter().map(|b| b.num_rows()).sum::<usize>();
    assert_eq!(total_rows, 200000);

    // Ensure dictionary encoding
    assert!(has_dict_encoding, "RunArray should be dictionary encoded");
    assert!(has_dict_page, "RunArray should have dictionary page");
}
4671+
/// Builds a small run-end-encoded string batch and hands it to the shared
/// `roundtrip` helper.
#[test]
fn arrow_writer_round_trip_run_end_encoded_string() {
    // 512 x "alpha" followed by 512 x "beta", 1024 values in total.
    // NOTE(review): the original author's note says the batch cannot hold more
    // than 1024 values here; the reason is not visible in this test, so
    // confirm it against `roundtrip` before raising the counts.
    let alphas = vec![Some("alpha"); 512];
    let betas = vec![Some("beta"); 512];
    let mut ree_builder = StringRunBuilder::<Int32Type>::new();
    ree_builder.extend(alphas.into_iter().chain(betas));
    let ree_column: RunArray<Int32Type> = ree_builder.finish();

    // Single-field schema derived from the array itself.
    let ree_field = Field::new(
        "ree",
        ree_column.data_type().clone(),
        ree_column.is_nullable(),
    );
    let schema = Arc::new(Schema::new(vec![ree_field]));

    let batch = RecordBatch::try_new(schema, vec![Arc::new(ree_column)]).unwrap();
    roundtrip(batch, None);
}
44844691}
0 commit comments