From ae3ebcb2769307fdf37d6723a451040e4dbebf16 Mon Sep 17 00:00:00 2001 From: Brijesh-Thakkar Date: Wed, 31 Dec 2025 19:07:21 +0530 Subject: [PATCH 1/5] perf: optimize octet_length for string arrays --- .../functions/src/string/octet_length.rs | 46 ++++++++++++++++++- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/src/string/octet_length.rs b/datafusion/functions/src/string/octet_length.rs index 3732897f3d372..2d89c7fab49e8 100644 --- a/datafusion/functions/src/string/octet_length.rs +++ b/datafusion/functions/src/string/octet_length.rs @@ -15,11 +15,14 @@ // specific language governing permissions and limitations // under the License. -use arrow::compute::kernels::length::length; use arrow::datatypes::DataType; use std::any::Any; use crate::utils::utf8_to_int_type; +use arrow::array::{ + Array, ArrayRef, Int32Builder, Int64Builder, LargeStringArray, StringArray, + StringViewArray, +}; use datafusion_common::types::logical_string; use datafusion_common::utils::take_function_args; use datafusion_common::{Result, ScalarValue}; @@ -28,6 +31,7 @@ use datafusion_expr::{ TypeSignatureClass, Volatility, }; use datafusion_macros::user_doc; +use std::sync::Arc; #[user_doc( doc_section(label = "String Functions"), @@ -90,7 +94,45 @@ impl ScalarUDFImpl for OctetLengthFunc { let [array] = take_function_args(self.name(), &args.args)?; match array { - ColumnarValue::Array(v) => Ok(ColumnarValue::Array(length(v.as_ref())?)), + ColumnarValue::Array(v) => { + let arr: ArrayRef = v.clone(); + + if let Some(arr) = arr.as_any().downcast_ref::() { + let mut builder = Int32Builder::with_capacity(arr.len()); + for i in 0..arr.len() { + if arr.is_null(i) { + builder.append_null(); + } else { + builder.append_value(arr.value_length(i) as i32); + } + } + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } else if let Some(arr) = arr.as_any().downcast_ref::() + { + let mut builder = Int64Builder::with_capacity(arr.len()); + for i in 0..arr.len() { + if arr.is_null(i) { + builder.append_null(); + } else { + builder.append_value(arr.value_length(i) as i64); + } + } + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } else if let Some(arr) = arr.as_any().downcast_ref::() { + let mut builder = Int32Builder::with_capacity(arr.len()); + for i in 0..arr.len() { + if arr.is_null(i) { + builder.append_null(); + } else { + builder.append_value(arr.value(i).len() as i32); + } + } + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } else { + unreachable!("octet_length expects string arrays") + } + } + ColumnarValue::Scalar(v) => match v { ScalarValue::Utf8(v) => Ok(ColumnarValue::Scalar(ScalarValue::Int32( v.as_ref().map(|x| x.len() as i32), From 63bc5fdce04950e3ddde7de0df595324fd85f314 Mon Sep 17 00:00:00 2001 From: Brijesh-Thakkar Date: Wed, 31 Dec 2025 19:31:20 +0530 Subject: [PATCH 2/5] refactor: simplify StringViewArray handling in octet_length --- .../functions/src/string/octet_length.rs | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/datafusion/functions/src/string/octet_length.rs b/datafusion/functions/src/string/octet_length.rs index 2d89c7fab49e8..40de5d9a341a6 100644 --- a/datafusion/functions/src/string/octet_length.rs +++ b/datafusion/functions/src/string/octet_length.rs @@ -20,8 +20,8 @@ use std::any::Any; use crate::utils::utf8_to_int_type; use arrow::array::{ - Array, ArrayRef, Int32Builder, Int64Builder, LargeStringArray, StringArray, - StringViewArray, + Array, ArrayRef, Int32Array, Int32Builder, Int64Builder, LargeStringArray, + StringArray, StringViewArray, }; use datafusion_common::types::logical_string; use datafusion_common::utils::take_function_args; @@ -119,15 +119,12 @@ impl ScalarUDFImpl for OctetLengthFunc { } Ok(ColumnarValue::Array(Arc::new(builder.finish()))) } else if let Some(arr) = arr.as_any().downcast_ref::() { - let mut builder = Int32Builder::with_capacity(arr.len()); - for i in 0..arr.len() { - if arr.is_null(i) { - builder.append_null(); - } else { - builder.append_value(arr.value(i).len() as i32); - } - } - Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + let result = arr + .iter() + .map(|s| s.map(|s| s.len() as i32)) + .collect::(); + + Ok(ColumnarValue::Array(Arc::new(result))) } else { unreachable!("octet_length expects string arrays") } From 7c89fa0ccaac30e2ab893f2d2dcbedeb27e89803 Mon Sep 17 00:00:00 2001 From: Brijesh-Thakkar Date: Wed, 31 Dec 2025 19:46:32 +0530 Subject: [PATCH 3/5] perf: avoid unnecessary ArrayRef clone in octet_length --- .../functions/src/string/octet_length.rs | 65 +++++++++---------- 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/datafusion/functions/src/string/octet_length.rs b/datafusion/functions/src/string/octet_length.rs index 40de5d9a341a6..112dafbc9ab9c 100644 --- a/datafusion/functions/src/string/octet_length.rs +++ b/datafusion/functions/src/string/octet_length.rs @@ -20,7 +20,7 @@ use std::any::Any; use crate::utils::utf8_to_int_type; use arrow::array::{ - Array, ArrayRef, Int32Array, Int32Builder, Int64Builder, LargeStringArray, + Array, Int32Array, Int32Builder, Int64Builder, LargeStringArray, StringArray, StringViewArray, }; use datafusion_common::types::logical_string; @@ -95,40 +95,37 @@ impl ScalarUDFImpl for OctetLengthFunc { match array { ColumnarValue::Array(v) => { - let arr: ArrayRef = v.clone(); - - if let Some(arr) = arr.as_any().downcast_ref::() { - let mut builder = Int32Builder::with_capacity(arr.len()); - for i in 0..arr.len() { - if arr.is_null(i) { - builder.append_null(); - } else { - builder.append_value(arr.value_length(i) as i32); - } - } - Ok(ColumnarValue::Array(Arc::new(builder.finish()))) - } else if let Some(arr) = arr.as_any().downcast_ref::() - { - let mut builder = Int64Builder::with_capacity(arr.len()); - for i in 0..arr.len() { - if arr.is_null(i) { - builder.append_null(); - } else { - builder.append_value(arr.value_length(i) as i64); - } - } - Ok(ColumnarValue::Array(Arc::new(builder.finish()))) - } else if let Some(arr) = arr.as_any().downcast_ref::() { - let result = arr - .iter() - .map(|s| s.map(|s| s.len() as i32)) - .collect::(); - - Ok(ColumnarValue::Array(Arc::new(result))) - } else { - unreachable!("octet_length expects string arrays") - } + if let Some(arr) = v.as_any().downcast_ref::() { + let mut builder = Int32Builder::with_capacity(arr.len()); + for i in 0..arr.len() { + if arr.is_null(i) { + builder.append_null(); + } else { + builder.append_value(arr.value_length(i) as i32); + } + } + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } else if let Some(arr) = v.as_any().downcast_ref::() { + let mut builder = Int64Builder::with_capacity(arr.len()); + for i in 0..arr.len() { + if arr.is_null(i) { + builder.append_null(); + } else { + builder.append_value(arr.value_length(i) as i64); } + } + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } else if let Some(arr) = v.as_any().downcast_ref::() { + let result = arr + .iter() + .map(|s| s.map(|s| s.len() as i32)) + .collect::(); + + Ok(ColumnarValue::Array(Arc::new(result))) + } else { + unreachable!("octet_length expects string arrays") + } +} ColumnarValue::Scalar(v) => match v { ScalarValue::Utf8(v) => Ok(ColumnarValue::Scalar(ScalarValue::Int32( From a54b5d178321b165adb27b2fd186d36fa8784c8c Mon Sep 17 00:00:00 2001 From: Brijesh-Thakkar Date: Wed, 31 Dec 2025 23:21:00 +0530 Subject: [PATCH 4/5] chore: rustfmt octet_length --- .../functions/src/string/octet_length.rs | 64 +++++++++---------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/datafusion/functions/src/string/octet_length.rs b/datafusion/functions/src/string/octet_length.rs index 112dafbc9ab9c..e2f0c9cb13149 100644 --- a/datafusion/functions/src/string/octet_length.rs +++ b/datafusion/functions/src/string/octet_length.rs @@ -20,8 +20,8 @@ use std::any::Any; use crate::utils::utf8_to_int_type; use arrow::array::{ - Array, Int32Array, Int32Builder, Int64Builder, LargeStringArray, - StringArray, StringViewArray, + Array, Int32Array, Int32Builder, Int64Builder, LargeStringArray, StringArray, + StringViewArray, }; use datafusion_common::types::logical_string; use datafusion_common::utils::take_function_args; @@ -95,37 +95,37 @@ impl ScalarUDFImpl for OctetLengthFunc { match array { ColumnarValue::Array(v) => { - if let Some(arr) = v.as_any().downcast_ref::() { - let mut builder = Int32Builder::with_capacity(arr.len()); - for i in 0..arr.len() { - if arr.is_null(i) { - builder.append_null(); - } else { - builder.append_value(arr.value_length(i) as i32); - } - } - Ok(ColumnarValue::Array(Arc::new(builder.finish()))) - } else if let Some(arr) = v.as_any().downcast_ref::() { - let mut builder = Int64Builder::with_capacity(arr.len()); - for i in 0..arr.len() { - if arr.is_null(i) { - builder.append_null(); - } else { - builder.append_value(arr.value_length(i) as i64); - } - } - Ok(ColumnarValue::Array(Arc::new(builder.finish()))) - } else if let Some(arr) = v.as_any().downcast_ref::() { - let result = arr - .iter() - .map(|s| s.map(|s| s.len() as i32)) - .collect::(); + if let Some(arr) = v.as_any().downcast_ref::() { + let mut builder = Int32Builder::with_capacity(arr.len()); + for i in 0..arr.len() { + if arr.is_null(i) { + builder.append_null(); + } else { + builder.append_value(arr.value_length(i) as i32); + } + } + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } else if let Some(arr) = v.as_any().downcast_ref::() { + let mut builder = Int64Builder::with_capacity(arr.len()); + for i in 0..arr.len() { + if arr.is_null(i) { + builder.append_null(); + } else { + builder.append_value(arr.value_length(i) as i64); + } + } + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } else if let Some(arr) = v.as_any().downcast_ref::() { + let result = arr + .iter() + .map(|s| s.map(|s| s.len() as i32)) + .collect::(); - Ok(ColumnarValue::Array(Arc::new(result))) - } else { - unreachable!("octet_length expects string arrays") - } -} + Ok(ColumnarValue::Array(Arc::new(result))) + } else { + unreachable!("octet_length expects string arrays") + } + } ColumnarValue::Scalar(v) => match v { ScalarValue::Utf8(v) => Ok(ColumnarValue::Scalar(ScalarValue::Int32( From 7540b63620137364515311fa1b1c5d15847ce2b8 Mon Sep 17 00:00:00 2001 From: Brijesh-Thakkar Date: Wed, 31 Dec 2025 23:42:59 +0530 Subject: [PATCH 5/5] fix: clippy warnings and rustfmt for octet_length --- datafusion/functions/src/string/octet_length.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/src/string/octet_length.rs b/datafusion/functions/src/string/octet_length.rs index e2f0c9cb13149..0a012b3215ba4 100644 --- a/datafusion/functions/src/string/octet_length.rs +++ b/datafusion/functions/src/string/octet_length.rs @@ -101,7 +101,7 @@ impl ScalarUDFImpl for OctetLengthFunc { if arr.is_null(i) { builder.append_null(); } else { - builder.append_value(arr.value_length(i) as i32); + builder.append_value(arr.value_length(i)); } } Ok(ColumnarValue::Array(Arc::new(builder.finish()))) @@ -111,7 +111,7 @@ impl ScalarUDFImpl for OctetLengthFunc { if arr.is_null(i) { builder.append_null(); } else { - builder.append_value(arr.value_length(i) as i64); + builder.append_value(arr.value_length(i)); } } Ok(ColumnarValue::Array(Arc::new(builder.finish())))