// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! This module provides the `UnpartitionedWriter` implementation.
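//!
//! A minimal usage sketch (illustrative only; `inner_builder` stands in for any
//! configured `IcebergWriterBuilder`, such as the `DataFileWriterBuilder` wired
//! up in the tests below):
//!
//! ```ignore
//! let mut writer = UnpartitionedWriter::new(inner_builder);
//! // The first write lazily builds the underlying writer.
//! writer.write(batch).await?;
//! // Closing consumes the writer and returns the written data files.
//! let data_files = writer.close().await?;
//! ```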
use std::marker::PhantomData;

use crate::Result;
use crate::writer::{DefaultInput, DefaultOutput, IcebergWriter, IcebergWriterBuilder};

/// A simple wrapper around `IcebergWriterBuilder` for unpartitioned tables.
///
/// The underlying writer is created lazily on the first write operation, and all
/// data is written to a single file (or to several files if the inner writer
/// rolls over).
///
/// # Type Parameters
///
/// * `B` - The inner writer builder type
/// * `I` - Input type (defaults to `RecordBatch`)
/// * `O` - Output collection type (defaults to `Vec<DataFile>`)
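///
/// An instantiation sketch (hypothetical; `MyBuilder` is a placeholder for any
/// concrete `IcebergWriterBuilder` implementation):
///
/// ```ignore
/// // With the defaults, `I = RecordBatch` and `O = Vec<DataFile>`.
/// let writer: UnpartitionedWriter<MyBuilder> = UnpartitionedWriter::new(builder);
/// ```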
pub struct UnpartitionedWriter<B, I = DefaultInput, O = DefaultOutput>
where
    B: IcebergWriterBuilder<I, O>,
    O: IntoIterator + FromIterator<<O as IntoIterator>::Item>,
    <O as IntoIterator>::Item: Clone,
{
    /// Builder used to lazily construct the inner writer.
    inner_builder: B,
    /// The inner writer; `None` until the first call to `write`.
    writer: Option<B::R>,
    /// Output items collected when the inner writer is closed.
    output: Vec<<O as IntoIterator>::Item>,
    _phantom: PhantomData<I>,
}

impl<B, I, O> UnpartitionedWriter<B, I, O>
where
    B: IcebergWriterBuilder<I, O>,
    I: Send + 'static,
    O: IntoIterator + FromIterator<<O as IntoIterator>::Item>,
    <O as IntoIterator>::Item: Send + Clone,
{
    /// Create a new `UnpartitionedWriter`.
    pub fn new(inner_builder: B) -> Self {
        Self {
            inner_builder,
            writer: None,
            output: Vec::new(),
            _phantom: PhantomData,
        }
    }

    /// Write data to the writer.
    ///
    /// The underlying writer is lazily created on the first write operation.
    ///
    /// # Parameters
    ///
    /// * `input` - The input data to write
    ///
    /// # Returns
    ///
    /// `Ok(())` on success, or an error if the write operation fails.
    pub async fn write(&mut self, input: I) -> Result<()> {
        // Lazily create writer on first write
        if self.writer.is_none() {
            self.writer = Some(self.inner_builder.clone().build(None).await?);
        }

        // Write directly to inner writer
        self.writer
            .as_mut()
            .expect("Writer should be initialized")
            .write(input)
            .await
    }

    /// Close the writer and return all written data files.
    ///
    /// This method consumes the writer to prevent further use.
    ///
    /// # Returns
    ///
    /// The accumulated output from all write operations, or an empty collection
    /// if no data was written.
    pub async fn close(mut self) -> Result<O> {
        // Close the inner writer, if one was ever created, and collect its output.
        if let Some(mut writer) = self.writer.take() {
            self.output.extend(writer.close().await?);
        }
        // If nothing was written, this yields an empty collection.
        Ok(O::from_iter(self.output))
    }
}

#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::sync::Arc;

    use arrow_array::{Int32Array, RecordBatch, StringArray};
    use arrow_schema::{DataType, Field, Schema};
    use parquet::arrow::PARQUET_FIELD_ID_META_KEY;
    use parquet::file::properties::WriterProperties;
    use tempfile::TempDir;

    use super::*;
    use crate::Result;
    use crate::io::FileIOBuilder;
    use crate::spec::{DataFileFormat, NestedField, PrimitiveType, Struct, Type};
    use crate::writer::base_writer::data_file_writer::DataFileWriterBuilder;
    use crate::writer::file_writer::ParquetWriterBuilder;
    use crate::writer::file_writer::location_generator::{
        DefaultFileNameGenerator, DefaultLocationGenerator,
    };
    use crate::writer::file_writer::rolling_writer::RollingFileWriterBuilder;

    #[tokio::test]
    async fn test_unpartitioned_writer() -> Result<()> {
        let temp_dir = TempDir::new()?;

        // Build the Iceberg schema.
        let schema = Arc::new(
            crate::spec::Schema::builder()
                .with_schema_id(1)
                .with_fields(vec![
                    NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(),
                    NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(),
                ])
                .build()?,
        );

        // Build the matching Arrow schema, carrying Parquet field-id metadata.
        let arrow_schema = Arc::new(Schema::new(vec![
            Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([(
                PARQUET_FIELD_ID_META_KEY.to_string(),
                "1".to_string(),
            )])),
            Field::new("name", DataType::Utf8, false).with_metadata(HashMap::from([(
                PARQUET_FIELD_ID_META_KEY.to_string(),
                "2".to_string(),
            )])),
        ]));

        // Build the writer stack: Parquet -> rolling -> data file -> unpartitioned.
        let file_io = FileIOBuilder::new_fs_io().build()?;
        let location_gen = DefaultLocationGenerator::with_data_location(
            temp_dir.path().to_str().unwrap().to_string(),
        );
        let file_name_gen =
            DefaultFileNameGenerator::new("test".to_string(), None, DataFileFormat::Parquet);
        let parquet_writer_builder =
            ParquetWriterBuilder::new(WriterProperties::builder().build(), schema.clone());
        let rolling_writer_builder = RollingFileWriterBuilder::new_with_default_file_size(
            parquet_writer_builder,
            file_io,
            location_gen,
            file_name_gen,
        );
        let writer_builder = DataFileWriterBuilder::new(rolling_writer_builder);

        let mut writer = UnpartitionedWriter::new(writer_builder);

        // Write two batches.
        let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![
            Arc::new(Int32Array::from(vec![1, 2])),
            Arc::new(StringArray::from(vec!["Alice", "Bob"])),
        ])?;
        let batch2 = RecordBatch::try_new(arrow_schema, vec![
            Arc::new(Int32Array::from(vec![3, 4])),
            Arc::new(StringArray::from(vec!["Charlie", "Dave"])),
        ])?;

        writer.write(batch1).await?;
        writer.write(batch2).await?;

        let data_files = writer.close().await?;

        // Verify the files carry an empty partition struct and the expected format.
        assert!(!data_files.is_empty());
        for file in &data_files {
            assert_eq!(file.partition, Struct::empty());
            assert_eq!(file.file_format, DataFileFormat::Parquet);
            assert_eq!(file.record_count, 4);
        }

        Ok(())
    }
}