Skip to content

feat: Add union_by_name, union_by_name_distinct to DataFrame api #15489

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Mar 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions datafusion/core/src/dataframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,46 @@ impl DataFrame {
})
}

/// Calculate the union of two [`DataFrame`]s using column names, preserving duplicate rows.
///
/// The two [`DataFrame`]s are combined using column names rather than position,
/// filling missing columns with null.
///
///
/// # Example
/// ```
/// # use datafusion::prelude::*;
/// # use datafusion::error::Result;
/// # use datafusion_common::assert_batches_sorted_eq;
/// # #[tokio::main]
/// # async fn main() -> Result<()> {
/// let ctx = SessionContext::new();
/// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
/// let d2 = df.clone().select_columns(&["b", "c", "a"])?.with_column("d", lit("77"))?;
/// let df = df.union_by_name(d2)?;
/// let expected = vec![
/// "+---+---+---+----+",
/// "| a | b | c | d |",
/// "+---+---+---+----+",
/// "| 1 | 2 | 3 | |",
/// "| 1 | 2 | 3 | 77 |",
/// "+---+---+---+----+"
/// ];
/// # assert_batches_sorted_eq!(expected, &df.collect().await?);
/// # Ok(())
/// # }
/// ```
pub fn union_by_name(self, dataframe: DataFrame) -> Result<DataFrame> {
let plan = LogicalPlanBuilder::from(self.plan)
.union_by_name(dataframe.plan)?
.build()?;
Ok(DataFrame {
session_state: self.session_state,
plan,
projection_requires_validation: true,
})
}

/// Calculate the distinct union of two [`DataFrame`]s.
///
/// The two [`DataFrame`]s must have exactly the same schema. Any duplicate
Expand Down Expand Up @@ -724,6 +764,45 @@ impl DataFrame {
})
}

/// Calculate the union of two [`DataFrame`]s using column names with all duplicated rows removed.
///
/// The two [`DataFrame`]s are combined using column names rather than position,
/// filling missing columns with null.
///
///
/// # Example
/// ```
/// # use datafusion::prelude::*;
/// # use datafusion::error::Result;
/// # use datafusion_common::assert_batches_sorted_eq;
/// # #[tokio::main]
/// # async fn main() -> Result<()> {
/// let ctx = SessionContext::new();
/// let df = ctx.read_csv("tests/data/example.csv", CsvReadOptions::new()).await?;
/// let d2 = df.clone().select_columns(&["b", "c", "a"])?;
/// let df = df.union_by_name_distinct(d2)?;
/// let expected = vec![
/// "+---+---+---+",
/// "| a | b | c |",
/// "+---+---+---+",
/// "| 1 | 2 | 3 |",
/// "+---+---+---+"
/// ];
/// # assert_batches_sorted_eq!(expected, &df.collect().await?);
/// # Ok(())
/// # }
/// ```
pub fn union_by_name_distinct(self, dataframe: DataFrame) -> Result<DataFrame> {
let plan = LogicalPlanBuilder::from(self.plan)
.union_by_name_distinct(dataframe.plan)?
.build()?;
Ok(DataFrame {
session_state: self.session_state,
plan,
projection_requires_validation: true,
})
}

/// Return a new `DataFrame` with all duplicated rows removed.
///
/// # Example
Expand Down
92 changes: 92 additions & 0 deletions datafusion/core/tests/dataframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5206,6 +5206,40 @@ fn union_fields() -> UnionFields {
.collect()
}

#[tokio::test]
async fn union_literal_is_null_and_not_null() -> Result<()> {
let str_array_1 = StringArray::from(vec![None::<String>]);
let str_array_2 = StringArray::from(vec![Some("a")]);

let batch_1 =
RecordBatch::try_from_iter(vec![("arr", Arc::new(str_array_1) as ArrayRef)])?;
let batch_2 =
RecordBatch::try_from_iter(vec![("arr", Arc::new(str_array_2) as ArrayRef)])?;

let ctx = SessionContext::new();
ctx.register_batch("union_batch_1", batch_1)?;
ctx.register_batch("union_batch_2", batch_2)?;

let df1 = ctx.table("union_batch_1").await?;
let df2 = ctx.table("union_batch_2").await?;

let batches = df1.union(df2)?.collect().await?;
let schema = batches[0].schema();

for batch in batches {
// Verify schema is the same for all batches
if !schema.contains(&batch.schema()) {
return Err(DataFusionError::Internal(format!(
"Schema mismatch. Previously had\n{:#?}\n\nGot:\n{:#?}",
&schema,
batch.schema()
)));
}
}

Ok(())
}

#[tokio::test]
async fn sparse_union_is_null() {
// union of [{A=1}, {A=}, {B=3.2}, {B=}, {C="a"}, {C=}]
Expand Down Expand Up @@ -5477,6 +5511,64 @@ async fn boolean_dictionary_as_filter() {
);
}

#[tokio::test]
async fn test_union_by_name() -> Result<()> {
let df = create_test_table("test")
.await?
.select(vec![col("a"), col("b"), lit(1).alias("c")])?
.alias("table_alias")?;

let df2 = df.clone().select_columns(&["c", "b", "a"])?;
let result = df.union_by_name(df2)?.sort_by(vec![col("a"), col("b")])?;

assert_snapshot!(
batches_to_sort_string(&result.collect().await?),
@r"
+-----------+-----+---+
| a | b | c |
+-----------+-----+---+
| 123AbcDef | 100 | 1 |
| 123AbcDef | 100 | 1 |
| CBAdef | 10 | 1 |
| CBAdef | 10 | 1 |
| abc123 | 10 | 1 |
| abc123 | 10 | 1 |
| abcDEF | 1 | 1 |
| abcDEF | 1 | 1 |
+-----------+-----+---+
"
);
Ok(())
}

#[tokio::test]
async fn test_union_by_name_distinct() -> Result<()> {
let df = create_test_table("test")
.await?
.select(vec![col("a"), col("b"), lit(1).alias("c")])?
.alias("table_alias")?;

let df2 = df.clone().select_columns(&["c", "b", "a"])?;
let result = df
.union_by_name_distinct(df2)?
.sort_by(vec![col("a"), col("b")])?;

assert_snapshot!(
batches_to_sort_string(&result.collect().await?),
@r"
+-----------+-----+---+
| a | b | c |
+-----------+-----+---+
| 123AbcDef | 100 | 1 |
| CBAdef | 10 | 1 |
| abc123 | 10 | 1 |
| abcDEF | 1 | 1 |
+-----------+-----+---+
"
);
Ok(())
}

#[tokio::test]
async fn test_alias() -> Result<()> {
let df = create_test_table("test")
Expand Down