Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 52 additions & 11 deletions datafusion/catalog/src/information_schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use crate::{CatalogProviderList, SchemaProvider, TableProvider};
use arrow::array::builder::{BooleanBuilder, UInt8Builder};
use arrow::{
array::{StringBuilder, UInt64Builder},
datatypes::{DataType, Field, Schema, SchemaRef},
datatypes::{DataType, Field, FieldRef, Schema, SchemaRef},
record_batch::RecordBatch,
};
use async_trait::async_trait;
Expand All @@ -34,7 +34,10 @@ use datafusion_common::error::Result;
use datafusion_common::types::NativeType;
use datafusion_execution::TaskContext;
use datafusion_execution::runtime_env::RuntimeEnv;
use datafusion_expr::{AggregateUDF, ScalarUDF, Signature, TypeSignature, WindowUDF};
use datafusion_expr::function::WindowUDFFieldArgs;
use datafusion_expr::{
AggregateUDF, ReturnFieldArgs, ScalarUDF, Signature, TypeSignature, WindowUDF,
};
use datafusion_expr::{TableType, Volatility};
use datafusion_physical_plan::SendableRecordBatchStream;
use datafusion_physical_plan::stream::RecordBatchStreamAdapter;
Expand Down Expand Up @@ -421,10 +424,24 @@ fn get_udf_args_and_return_types(
Ok(arg_types
.into_iter()
.map(|arg_types| {
// only handle the function which implemented [`ScalarUDFImpl::return_type`] method
let arg_fields: Vec<FieldRef> = arg_types
.iter()
.enumerate()
.map(|(i, t)| {
Arc::new(Field::new(format!("arg_{i}"), t.clone(), true))
})
.collect();
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Duplicated arg_fields construction logic across three functions

Low Severity

The same code block to build arg_fields from arg_types is duplicated verbatim in get_udf_args_and_return_types, get_udaf_args_and_return_types, and get_udwf_args_and_return_types. This could be extracted into a helper function like fn build_arg_fields(arg_types: &[DataType]) -> Vec<FieldRef>.

Fix in Cursor Fix in Web

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

value:good-to-have; category:bug; feedback: The Bugbot AI reviewer is correct! There is some duplicated code that would be good to be extracted to a helper function and reused. Prevents double maintenance and possible regressions if a fix is applied at only one of the copies.

let scalar_arguments = vec![None; arg_fields.len()];
let return_type = udf
.return_type(&arg_types)
.map(|t| remove_native_type_prefix(&NativeType::from(t)))
.return_field_from_args(ReturnFieldArgs {
arg_fields: &arg_fields,
scalar_arguments: &scalar_arguments,
})
.map(|f| {
remove_native_type_prefix(&NativeType::from(
f.data_type().clone(),
))
})
.ok();
let arg_types = arg_types
.into_iter()
Expand All @@ -447,11 +464,21 @@ fn get_udaf_args_and_return_types(
Ok(arg_types
.into_iter()
.map(|arg_types| {
// only handle the function which implemented [`ScalarUDFImpl::return_type`] method
let arg_fields: Vec<FieldRef> = arg_types
.iter()
.enumerate()
.map(|(i, t)| {
Arc::new(Field::new(format!("arg_{i}"), t.clone(), true))
})
.collect();
let return_type = udaf
.return_type(&arg_types)
.ok()
.map(|t| remove_native_type_prefix(&NativeType::from(t)));
.return_field(&arg_fields)
.map(|f| {
remove_native_type_prefix(&NativeType::from(
f.data_type().clone(),
))
})
.ok();
let arg_types = arg_types
.into_iter()
.map(|t| remove_native_type_prefix(&NativeType::from(t)))
Expand All @@ -473,12 +500,26 @@ fn get_udwf_args_and_return_types(
Ok(arg_types
.into_iter()
.map(|arg_types| {
// only handle the function which implemented [`ScalarUDFImpl::return_type`] method
let arg_fields: Vec<FieldRef> = arg_types
.iter()
.enumerate()
.map(|(i, t)| {
Arc::new(Field::new(format!("arg_{i}"), t.clone(), true))
})
.collect();
let return_type = udwf
.field(WindowUDFFieldArgs::new(&arg_fields, udwf.name()))
.map(|f| {
remove_native_type_prefix(&NativeType::from(
f.data_type().clone(),
))
})
.ok();
let arg_types = arg_types
.into_iter()
.map(|t| remove_native_type_prefix(&NativeType::from(t)))
.collect::<Vec<_>>();
(arg_types, None)
(arg_types, return_type)
})
.collect::<BTreeSet<_>>())
}
Expand Down
26 changes: 10 additions & 16 deletions datafusion/functions/src/datetime/date_trunc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ use arrow::datatypes::{Field, FieldRef};
use datafusion_common::cast::as_primitive_array;
use datafusion_common::types::{NativeType, logical_date, logical_string};
use datafusion_common::{
DataFusionError, Result, ScalarValue, exec_datafusion_err, exec_err,
DataFusionError, Result, ScalarValue, exec_datafusion_err, exec_err, internal_err,
};
use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
use datafusion_expr::{
Expand Down Expand Up @@ -223,27 +223,21 @@ impl ScalarUDFImpl for DateTruncFunc {
&self.signature
}

// keep return_type implementation for information schema generation
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
if arg_types[1].is_null() {
Ok(Timestamp(Nanosecond, None))
} else {
Ok(arg_types[1].clone())
}
fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
internal_err!("return_field_from_args should be called instead")
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since DateTruncFunc::return_type now always errors, datafusion/functions/benches/date_trunc.rs (around lines 60–63) will now panic when building return_field via udf.return_type(...). Consider updating the benchmark to derive the return field via return_field_from_args (or otherwise avoid calling return_type).

Other Locations
  • datafusion/functions/benches/date_trunc.rs:60

Fix This in Augment

🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

value:delightful; category:bug; feedback: The Augment AI reviewer is correct! The Augment AI reviewer is the only one that checked this unchanged but related file in the repository and found the issue! The benchmark tests still compile because the old method is still there but since they are not executed by the CI the runtime error is not detected by the Pull Request author. Prevents detecting that the benchmark test is broken in a later stage, possible by another developer.

}

fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
let data_types = args
.arg_fields
.iter()
.map(|f| f.data_type())
.cloned()
.collect::<Vec<_>>();
let return_type = self.return_type(&data_types)?;
let field = &args.arg_fields[1];
let return_type = if field.data_type().is_null() {
Timestamp(Nanosecond, None)
} else {
field.data_type().clone()
};
Ok(Arc::new(Field::new(
self.name(),
return_type,
args.arg_fields[1].is_nullable(),
field.is_nullable(),
)))
Comment on lines +226 to 241
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Guard against missing args to avoid panic.

Line 231 indexes args.arg_fields[1] directly. If this method is ever invoked with fewer than two args (planner bug or misuse), it will panic instead of returning a user-facing error.

🛠️ Proposed fix
-        let field = &args.arg_fields[1];
+        let field = args
+            .arg_fields
+            .get(1)
+            .ok_or_else(|| exec_err!("{} expects 2 arguments", self.name()))?;
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
internal_err!("return_field_from_args should be called instead")
}
fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
let data_types = args
.arg_fields
.iter()
.map(|f| f.data_type())
.cloned()
.collect::<Vec<_>>();
let return_type = self.return_type(&data_types)?;
let field = &args.arg_fields[1];
let return_type = if field.data_type().is_null() {
Timestamp(Nanosecond, None)
} else {
field.data_type().clone()
};
Ok(Arc::new(Field::new(
self.name(),
return_type,
args.arg_fields[1].is_nullable(),
field.is_nullable(),
)))
fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
internal_err!("return_field_from_args should be called instead")
}
fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
let field = args
.arg_fields
.get(1)
.ok_or_else(|| exec_err!("{} expects 2 arguments", self.name()))?;
let return_type = if field.data_type().is_null() {
Timestamp(Nanosecond, None)
} else {
field.data_type().clone()
};
Ok(Arc::new(Field::new(
self.name(),
return_type,
field.is_nullable(),
)))
🤖 Prompt for AI Agents
In `@datafusion/functions/src/datetime/date_trunc.rs` around lines 226 - 241, The
method return_field_from_args currently indexes args.arg_fields[1] directly and
can panic if fewer than two args are passed; update return_field_from_args to
first validate args.arg_fields has at least 2 elements (or use
args.arg_fields.get(1)) and return a Result::Err (use the existing internal_err!
or another appropriate error helper) when missing, then proceed to derive
return_type from the second arg as before (referencing return_field_from_args,
args.arg_fields, and self.name()) so the function returns a proper error instead
of panicking.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

value:incorrect-but-reasonable; category:bug; feedback: The CodeRabbit AI reviewer is correct to mark this as a possible issue but it is actually safe to do it because the method is called after an earlier check that all required function arguments are provided and they are from the allowed types (signature coercion check),

}

Expand Down