diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index a8c41121b29e1..5af901a6bfb6f 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -308,12 +308,7 @@ required-features = ["string_expressions"] [[bench]] harness = false -name = "left" -required-features = ["unicode_expressions"] - -[[bench]] -harness = false -name = "right" +name = "left_right" required-features = ["unicode_expressions"] [[bench]] diff --git a/datafusion/functions/benches/left.rs b/datafusion/functions/benches/left.rs deleted file mode 100644 index d208e7d4fc93a..0000000000000 --- a/datafusion/functions/benches/left.rs +++ /dev/null @@ -1,140 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -extern crate criterion; - -use std::hint::black_box; -use std::sync::Arc; - -use arrow::array::{ArrayRef, Int64Array}; -use arrow::datatypes::{DataType, Field}; -use arrow::util::bench_util::{ - create_string_array_with_len, create_string_view_array_with_len, -}; -use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; -use datafusion_common::config::ConfigOptions; -use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; -use datafusion_functions::unicode::left; - -fn create_args( - size: usize, - str_len: usize, - use_negative: bool, - is_string_view: bool, -) -> Vec { - let string_arg = if is_string_view { - ColumnarValue::Array(Arc::new(create_string_view_array_with_len( - size, 0.1, str_len, true, - ))) - } else { - ColumnarValue::Array(Arc::new(create_string_array_with_len::( - size, 0.1, str_len, - ))) - }; - - // For negative n, we want to trigger the double-iteration code path - let n_values: Vec = if use_negative { - (0..size).map(|i| -((i % 10 + 1) as i64)).collect() - } else { - (0..size).map(|i| (i % 10 + 1) as i64).collect() - }; - let n_array = Arc::new(Int64Array::from(n_values)); - - vec![ - string_arg, - ColumnarValue::Array(Arc::clone(&n_array) as ArrayRef), - ] -} - -fn criterion_benchmark(c: &mut Criterion) { - for is_string_view in [false, true] { - for size in [1024, 4096] { - let mut group = c.benchmark_group(format!("left size={size}")); - - // Benchmark with positive n (no optimization needed) - let mut function_name = if is_string_view { - "string_view_array positive n" - } else { - "string_array positive n" - }; - let args = create_args(size, 32, false, is_string_view); - group.bench_function(BenchmarkId::new(function_name, size), |b| { - let arg_fields = args - .iter() - .enumerate() - .map(|(idx, arg)| { - Field::new(format!("arg_{idx}"), arg.data_type(), true).into() - }) - .collect::>(); - let config_options = Arc::new(ConfigOptions::default()); - - b.iter(|| { - black_box( - left() - .invoke_with_args(ScalarFunctionArgs { - args: args.clone(), - arg_fields: arg_fields.clone(), - number_rows: size, - return_field: Field::new("f", DataType::Utf8, true) - .into(), - config_options: Arc::clone(&config_options), - }) - .expect("left should work"), - ) - }) - }); - - // Benchmark with negative n (triggers optimization) - function_name = if is_string_view { - "string_view_array negative n" - } else { - "string_array negative n" - }; - let args = create_args(size, 32, true, is_string_view); - group.bench_function(BenchmarkId::new(function_name, size), |b| { - let arg_fields = args - .iter() - .enumerate() - .map(|(idx, arg)| { - Field::new(format!("arg_{idx}"), arg.data_type(), true).into() - }) - .collect::>(); - let config_options = Arc::new(ConfigOptions::default()); - - b.iter(|| { - black_box( - left() - .invoke_with_args(ScalarFunctionArgs { - args: args.clone(), - arg_fields: arg_fields.clone(), - number_rows: size, - return_field: Field::new("f", DataType::Utf8, true) - .into(), - config_options: Arc::clone(&config_options), - }) - .expect("left should work"), - ) - }) - }); - - group.finish(); - } - } -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/datafusion/functions/benches/left_right.rs b/datafusion/functions/benches/left_right.rs new file mode 100644 index 0000000000000..9848c21ed7e55 --- /dev/null +++ b/datafusion/functions/benches/left_right.rs @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use std::hint::black_box; +use std::sync::Arc; + +use arrow::array::{ArrayRef, Int64Array}; +use arrow::datatypes::{DataType, Field}; +use arrow::util::bench_util::{ + create_string_array_with_len, create_string_view_array_with_len, +}; +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use datafusion_common::config::ConfigOptions; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; +use datafusion_functions::unicode::{left, right}; + +fn create_args( + size: usize, + str_len: usize, + use_negative: bool, + is_string_view: bool, +) -> Vec { + let string_arg = if is_string_view { + ColumnarValue::Array(Arc::new(create_string_view_array_with_len( + size, 0.1, str_len, true, + ))) + } else { + ColumnarValue::Array(Arc::new(create_string_array_with_len::( + size, 0.1, str_len, + ))) + }; + + // For negative n, we want to trigger the double-iteration code path + let n_values: Vec = if use_negative { + (0..size).map(|i| -((i % 10 + 1) as i64)).collect() + } else { + (0..size).map(|i| (i % 10 + 1) as i64).collect() + }; + let n_array = Arc::new(Int64Array::from(n_values)); + + vec![ + string_arg, + ColumnarValue::Array(Arc::clone(&n_array) as ArrayRef), + ] +} + +fn criterion_benchmark(c: &mut Criterion) { + let left_function = left(); + let right_function = right(); + + for function in [left_function, right_function] { + for is_string_view in [false, true] { + for size in [1024, 4096] { + let function_name = function.name(); + let mut group = c.benchmark_group(format!("{function_name} size={size}")); + + // Benchmark with positive n (no optimization needed) + let mut bench_name = if is_string_view { + "string_view_array positive n" + } else { + "string_array positive n" + }; + let return_type = if is_string_view { + DataType::Utf8View + } else { + DataType::Utf8 + }; + + let args = create_args(size, 32, false, is_string_view); + group.bench_function(BenchmarkId::new(bench_name, size), |b| { + let arg_fields = args + .iter() + .enumerate() + .map(|(idx, arg)| { + Field::new(format!("arg_{idx}"), arg.data_type(), true).into() + }) + .collect::>(); + let config_options = Arc::new(ConfigOptions::default()); + + b.iter(|| { + black_box( + function + .invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: size, + return_field: Field::new( + "f", + return_type.clone(), + true, + ) + .into(), + config_options: Arc::clone(&config_options), + }) + .expect("should work"), + ) + }) + }); + + // Benchmark with negative n (triggers optimization) + bench_name = if is_string_view { + "string_view_array negative n" + } else { + "string_array negative n" + }; + let return_type = if is_string_view { + DataType::Utf8View + } else { + DataType::Utf8 + }; + + let args = create_args(size, 32, true, is_string_view); + group.bench_function(BenchmarkId::new(bench_name, size), |b| { + let arg_fields = args + .iter() + .enumerate() + .map(|(idx, arg)| { + Field::new(format!("arg_{idx}"), arg.data_type(), true).into() + }) + .collect::>(); + let config_options = Arc::new(ConfigOptions::default()); + + b.iter(|| { + black_box( + function + .invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: size, + return_field: Field::new( + "f", + return_type.clone(), + true, + ) + .into(), + config_options: Arc::clone(&config_options), + }) + .expect("should work"), + ) + }) + }); + + group.finish(); + } + } + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/functions/benches/right.rs b/datafusion/functions/benches/right.rs deleted file mode 100644 index 80294ecc4b053..0000000000000 --- a/datafusion/functions/benches/right.rs +++ /dev/null @@ -1,150 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -extern crate criterion; - -use std::hint::black_box; -use std::sync::Arc; - -use arrow::array::{ArrayRef, Int64Array}; -use arrow::datatypes::{DataType, Field}; -use arrow::util::bench_util::{ - create_string_array_with_len, create_string_view_array_with_len, -}; -use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; -use datafusion_common::config::ConfigOptions; -use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; -use datafusion_functions::unicode::right; - -fn create_args( - size: usize, - str_len: usize, - use_negative: bool, - is_string_view: bool, -) -> Vec { - let string_arg = if is_string_view { - ColumnarValue::Array(Arc::new(create_string_view_array_with_len( - size, 0.1, str_len, true, - ))) - } else { - ColumnarValue::Array(Arc::new(create_string_array_with_len::( - size, 0.1, str_len, - ))) - }; - - // For negative n, we want to trigger the double-iteration code path - let n_values: Vec = if use_negative { - (0..size).map(|i| -((i % 10 + 1) as i64)).collect() - } else { - (0..size).map(|i| (i % 10 + 1) as i64).collect() - }; - let n_array = Arc::new(Int64Array::from(n_values)); - - vec![ - string_arg, - ColumnarValue::Array(Arc::clone(&n_array) as ArrayRef), - ] -} - -fn criterion_benchmark(c: &mut Criterion) { - for is_string_view in [false, true] { - for size in [1024, 4096] { - let mut group = c.benchmark_group(format!("right size={size}")); - - // Benchmark with positive n (no optimization needed) - let mut function_name = if is_string_view { - "string_view_array positive n" - } else { - "string_array positive n" - }; - let args = create_args(size, 32, false, is_string_view); - group.bench_function(BenchmarkId::new(function_name, size), |b| { - let arg_fields = args - .iter() - .enumerate() - .map(|(idx, arg)| { - Field::new(format!("arg_{idx}"), arg.data_type(), true).into() - }) - .collect::>(); - let return_type = if is_string_view { - DataType::Utf8View - } else { - DataType::Utf8 - }; - let config_options = Arc::new(ConfigOptions::default()); - - b.iter(|| { - black_box( - right() - .invoke_with_args(ScalarFunctionArgs { - args: args.clone(), - arg_fields: arg_fields.clone(), - number_rows: size, - return_field: Field::new("f", return_type.clone(), true) - .into(), - config_options: Arc::clone(&config_options), - }) - .expect("right should work"), - ) - }) - }); - - // Benchmark with negative n (triggers optimization) - function_name = if is_string_view { - "string_view_array negative n" - } else { - "string_array negative n" - }; - let args = create_args(size, 32, true, is_string_view); - group.bench_function(BenchmarkId::new(function_name, size), |b| { - let arg_fields = args - .iter() - .enumerate() - .map(|(idx, arg)| { - Field::new(format!("arg_{idx}"), arg.data_type(), true).into() - }) - .collect::>(); - let return_type = if is_string_view { - DataType::Utf8View - } else { - DataType::Utf8 - }; - let config_options = Arc::new(ConfigOptions::default()); - - b.iter(|| { - black_box( - right() - .invoke_with_args(ScalarFunctionArgs { - args: args.clone(), - arg_fields: arg_fields.clone(), - number_rows: size, - return_field: Field::new("f", return_type.clone(), true) - .into(), - config_options: Arc::clone(&config_options), - }) - .expect("right should work"), - ) - }) - }); - - group.finish(); - } - } -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/datafusion/functions/src/unicode/common.rs b/datafusion/functions/src/unicode/common.rs new file mode 100644 index 0000000000000..362101a695329 --- /dev/null +++ b/datafusion/functions/src/unicode/common.rs @@ -0,0 +1,188 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Common utilities for implementing unicode functions + +use arrow::array::{ + Array, ArrayAccessor, ArrayIter, ArrayRef, ByteView, GenericStringArray, Int64Array, + OffsetSizeTrait, StringViewArray, make_view, +}; +use arrow::datatypes::DataType; +use arrow_buffer::{NullBuffer, ScalarBuffer}; +use datafusion_common::cast::{ + as_generic_string_array, as_int64_array, as_string_view_array, +}; +use datafusion_common::exec_err; +use std::cmp::Ordering; +use std::ops::Range; +use std::sync::Arc; + +/// A trait for `left` and `right` byte slicing operations +pub(crate) trait LeftRightSlicer { + fn slice(string: &str, n: i64) -> Range; +} + +pub(crate) struct LeftSlicer {} + +impl LeftRightSlicer for LeftSlicer { + fn slice(string: &str, n: i64) -> Range { + if n == 0 { + // Return nothing for `n=0` + 0..0 + } else { + 0..left_right_byte_length(string, n) + } + } +} + +pub(crate) struct RightSlicer {} + +impl LeftRightSlicer for RightSlicer { + fn slice(string: &str, n: i64) -> Range { + if n == 0 { + // Return nothing for `n=0` + 0..0 + } else if n == i64::MIN { + // Special case for i64::MIN overflow + 0..0 + } else { + left_right_byte_length(string, -n)..string.len() + } + } +} + +/// Calculate the byte length of the substring of `n` chars from string `string` +#[inline] +fn left_right_byte_length(string: &str, n: i64) -> usize { + match n.cmp(&0) { + Ordering::Less => string + .char_indices() + .nth_back((n.unsigned_abs().min(usize::MAX as u64) - 1) as usize) + .map(|(index, _)| index) + .unwrap_or(0), + Ordering::Equal => 0, + Ordering::Greater => string + .char_indices() + .nth(n.unsigned_abs().min(usize::MAX as u64) as usize) + .map(|(index, _)| index) + .unwrap_or(string.len()), + } +} + +/// General implementation for `left` and `right` functions +pub(crate) fn general_left_right( + args: &[ArrayRef], +) -> datafusion_common::Result { + let n_array = as_int64_array(&args[1])?; + + match args[0].data_type() { + DataType::Utf8 => { + let string_array = as_generic_string_array::(&args[0])?; + general_left_right_array::(string_array, n_array) + } + DataType::LargeUtf8 => { + let string_array = as_generic_string_array::(&args[0])?; + general_left_right_array::(string_array, n_array) + } + DataType::Utf8View => { + let string_view_array = as_string_view_array(&args[0])?; + general_left_right_view::(string_view_array, n_array) + } + _ => exec_err!("Not supported"), + } +} + +/// `general_left_right` implementation for strings +fn general_left_right_array< + 'a, + T: OffsetSizeTrait, + V: ArrayAccessor, + F: LeftRightSlicer, +>( + string_array: V, + n_array: &Int64Array, +) -> datafusion_common::Result { + let iter = ArrayIter::new(string_array); + let result = iter + .zip(n_array.iter()) + .map(|(string, n)| match (string, n) { + (Some(string), Some(n)) => { + let range = F::slice(string, n); + // Extract a given range from a byte-indexed slice + Some(&string[range]) + } + _ => None, + }) + .collect::>(); + + Ok(Arc::new(result) as ArrayRef) +} + +/// `general_left_right` implementation for StringViewArray +fn general_left_right_view( + string_view_array: &StringViewArray, + n_array: &Int64Array, +) -> datafusion_common::Result { + let len = n_array.len(); + + let views = string_view_array.views(); + // Every string in StringViewArray has one corresponding view in `views` + debug_assert!(views.len() == string_view_array.len()); + + // Compose null buffer at once + let string_nulls = string_view_array.nulls(); + let n_nulls = n_array.nulls(); + let new_nulls = NullBuffer::union(string_nulls, n_nulls); + + let new_views = (0..len) + .map(|idx| { + let view = views[idx]; + + let is_valid = match &new_nulls { + Some(nulls_buf) => nulls_buf.is_valid(idx), + None => true, + }; + + if is_valid { + let string: &str = string_view_array.value(idx); + let n = n_array.value(idx); + + // Input string comes from StringViewArray, so it should fit in 32-bit length + let range = F::slice(string, n); + let result_bytes = &string.as_bytes()[range.clone()]; + + let byte_view = ByteView::from(view); + // New offsets starts at 0 for left, and at `range.start` for right, + // which is encoded in the given range + let new_offset = byte_view.offset + (range.start as u32); + // Reuse buffer + make_view(result_bytes, byte_view.buffer_index, new_offset) + } else { + // For nulls, keep the original view + view + } + }) + .collect::>(); + + // Buffers are unchanged + let result = StringViewArray::try_new( + ScalarBuffer::from(new_views), + Vec::from(string_view_array.data_buffers()), + new_nulls, + )?; + Ok(Arc::new(result) as ArrayRef) +} diff --git a/datafusion/functions/src/unicode/left.rs b/datafusion/functions/src/unicode/left.rs index 54f204993c46e..76873e7f5d3e1 100644 --- a/datafusion/functions/src/unicode/left.rs +++ b/datafusion/functions/src/unicode/left.rs @@ -16,20 +16,11 @@ // under the License. use std::any::Any; -use std::cmp::Ordering; -use std::sync::Arc; +use crate::unicode::common::{LeftSlicer, general_left_right}; use crate::utils::make_scalar_function; -use arrow::array::{ - Array, ArrayAccessor, ArrayIter, ArrayRef, ByteView, GenericStringArray, Int64Array, - OffsetSizeTrait, StringViewArray, -}; use arrow::datatypes::DataType; -use arrow_buffer::{NullBuffer, ScalarBuffer}; use datafusion_common::Result; -use datafusion_common::cast::{ - as_generic_string_array, as_int64_array, as_string_view_array, -}; use datafusion_common::exec_err; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ @@ -97,6 +88,10 @@ impl ScalarUDFImpl for LeftFunc { Ok(arg_types[0].clone()) } + /// Returns first n characters in the string, or when n is negative, returns all but last |n| characters. + /// left('abcde', 2) = 'ab' + /// left('abcde', -2) = 'abc' + /// The implementation uses UTF-8 code points as characters fn invoke_with_args( &self, args: datafusion_expr::ScalarFunctionArgs, @@ -104,7 +99,7 @@ impl ScalarUDFImpl for LeftFunc { let args = &args.args; match args[0].data_type() { DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => { - make_scalar_function(left, vec![])(args) + make_scalar_function(general_left_right::, vec![])(args) } other => exec_err!( "Unsupported data type {other:?} for function {},\ @@ -119,145 +114,6 @@ impl ScalarUDFImpl for LeftFunc { } } -/// Returns first n characters in the string, or when n is negative, returns all but last |n| characters. -/// left('abcde', 2) = 'ab' -/// left('abcde', -2) = 'ab' -/// The implementation uses UTF-8 code points as characters -fn left(args: &[ArrayRef]) -> Result { - let n_array = as_int64_array(&args[1])?; - - match args[0].data_type() { - DataType::Utf8 => { - let string_array = as_generic_string_array::(&args[0])?; - left_impl::(string_array, n_array) - } - DataType::LargeUtf8 => { - let string_array = as_generic_string_array::(&args[0])?; - left_impl::(string_array, n_array) - } - DataType::Utf8View => { - let string_view_array = as_string_view_array(&args[0])?; - left_impl_view(string_view_array, n_array) - } - _ => exec_err!("Not supported"), - } -} - -/// `left` implementation for strings -fn left_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor>( - string_array: V, - n_array: &Int64Array, -) -> Result { - let iter = ArrayIter::new(string_array); - let result = iter - .zip(n_array.iter()) - .map(|(string, n)| match (string, n) { - (Some(string), Some(n)) => { - let byte_length = left_byte_length(string, n); - // Extract first `byte_length` bytes from a byte-indexed slice - Some(&string[0..byte_length]) - } - _ => None, - }) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) -} - -/// `left` implementation for StringViewArray -fn left_impl_view( - string_view_array: &StringViewArray, - n_array: &Int64Array, -) -> Result { - let len = n_array.len(); - - let views = string_view_array.views(); - // Every string in StringViewArray has one corresponding view in `views` - debug_assert!(views.len() == string_view_array.len()); - - // Compose null buffer at once - let string_nulls = string_view_array.nulls(); - let n_nulls = n_array.nulls(); - let new_nulls = NullBuffer::union(string_nulls, n_nulls); - - let new_views = (0..len) - .map(|idx| { - let view = views[idx]; - - let is_valid = match &new_nulls { - Some(nulls_buf) => nulls_buf.is_valid(idx), - None => true, - }; - - if is_valid { - let string: &str = string_view_array.value(idx); - let n = n_array.value(idx); - - // Input string comes from StringViewArray, so it should fit in 32-bit length - let new_length: u32 = left_byte_length(string, n) as u32; - let byte_view = ByteView::from(view); - // Construct a new view - shrink_string_view_array_view(string, new_length, byte_view) - } else { - // For nulls, keep the original view - view - } - }) - .collect::>(); - - // Buffers are unchanged - let result = StringViewArray::try_new( - ScalarBuffer::from(new_views), - Vec::from(string_view_array.data_buffers()), - new_nulls, - )?; - Ok(Arc::new(result) as ArrayRef) -} - -/// Calculate the byte length of the substring of `n` chars from string `string` -fn left_byte_length(string: &str, n: i64) -> usize { - match n.cmp(&0) { - Ordering::Less => string - .char_indices() - .nth_back(n.unsigned_abs() as usize - 1) - .map(|(index, _)| index) - .unwrap_or(0), - Ordering::Equal => 0, - Ordering::Greater => string - .char_indices() - .nth(n.unsigned_abs() as usize) - .map(|(index, _)| index) - .unwrap_or(string.len()), - } -} - -/// Construct a new StringViewArray view from existing view `byte_view` and new length `len`. -/// Prefix is taken from the original string `string`. -/// Handles both inline and non-inline views, referencing the same buffers. -fn shrink_string_view_array_view(string: &str, len: u32, byte_view: ByteView) -> u128 { - debug_assert!(len <= byte_view.length); - // Acquire bytes view to string (no allocations) - let bytes = string.as_bytes(); - - if len <= 12 { - // Inline view - // Construct manually since ByteView cannot work with inline views - let mut view_buffer = [0u8; 16]; - // 4 bytes: length - view_buffer[0..4].copy_from_slice(&len.to_le_bytes()); - // 12 bytes: the whole zero-padded string - view_buffer[4..4 + len as usize].copy_from_slice(&bytes[..len as usize]); - u128::from_le_bytes(view_buffer) - } else { - // Non-inline view. - // Use ByteView constructor to reference existing buffers - let new_byte_view = ByteView::new(len, &bytes[..4]) - .with_buffer_index(byte_view.buffer_index) - .with_offset(byte_view.offset); - new_byte_view.as_u128() - } -} - #[cfg(test)] mod tests { use arrow::array::{Array, StringArray, StringViewArray}; diff --git a/datafusion/functions/src/unicode/mod.rs b/datafusion/functions/src/unicode/mod.rs index 4a0dd21d749af..7250b3915fb5c 100644 --- a/datafusion/functions/src/unicode/mod.rs +++ b/datafusion/functions/src/unicode/mod.rs @@ -22,6 +22,7 @@ use std::sync::Arc; use datafusion_expr::ScalarUDF; pub mod character_length; +pub mod common; pub mod find_in_set; pub mod initcap; pub mod left; diff --git a/datafusion/functions/src/unicode/right.rs b/datafusion/functions/src/unicode/right.rs index 569f20d3291b2..a97e242b73f9e 100644 --- a/datafusion/functions/src/unicode/right.rs +++ b/datafusion/functions/src/unicode/right.rs @@ -16,20 +16,11 @@ // under the License. use std::any::Any; -use std::cmp::Ordering; -use std::sync::Arc; +use crate::unicode::common::{RightSlicer, general_left_right}; use crate::utils::make_scalar_function; -use arrow::array::{ - Array, ArrayAccessor, ArrayIter, ArrayRef, ByteView, GenericStringArray, Int64Array, - OffsetSizeTrait, StringViewArray, make_view, -}; use arrow::datatypes::DataType; -use arrow_buffer::{NullBuffer, ScalarBuffer}; use datafusion_common::Result; -use datafusion_common::cast::{ - as_generic_string_array, as_int64_array, as_string_view_array, -}; use datafusion_common::exec_err; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ @@ -97,6 +88,10 @@ impl ScalarUDFImpl for RightFunc { Ok(arg_types[0].clone()) } + /// Returns right n characters in the string, or when n is negative, returns all but first |n| characters. + /// right('abcde', 2) = 'de' + /// right('abcde', -2) = 'cde' + /// The implementation uses UTF-8 code points as characters fn invoke_with_args( &self, args: datafusion_expr::ScalarFunctionArgs, @@ -104,7 +99,7 @@ impl ScalarUDFImpl for RightFunc { let args = &args.args; match args[0].data_type() { DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => { - make_scalar_function(right, vec![])(args) + make_scalar_function(general_left_right::, vec![])(args) } other => exec_err!( "Unsupported data type {other:?} for function {},\ @@ -119,129 +114,6 @@ impl ScalarUDFImpl for RightFunc { } } -/// Returns right n characters in the string, or when n is negative, returns all but first |n| characters. -/// right('abcde', 2) = 'de' -/// right('abcde', -2) = 'cde' -/// The implementation uses UTF-8 code points as characters -fn right(args: &[ArrayRef]) -> Result { - let n_array = as_int64_array(&args[1])?; - - match args[0].data_type() { - DataType::Utf8 => { - let string_array = as_generic_string_array::(&args[0])?; - right_impl::(string_array, n_array) - } - DataType::LargeUtf8 => { - let string_array = as_generic_string_array::(&args[0])?; - right_impl::(string_array, n_array) - } - DataType::Utf8View => { - let string_view_array = as_string_view_array(&args[0])?; - right_impl_view(string_view_array, n_array) - } - _ => exec_err!("Not supported"), - } -} - -/// `right` implementation for strings -fn right_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor>( - string_array: V, - n_array: &Int64Array, -) -> Result { - let iter = ArrayIter::new(string_array); - let result = iter - .zip(n_array.iter()) - .map(|(string, n)| match (string, n) { - (Some(string), Some(n)) => { - let byte_length = right_byte_length(string, n); - // Extract starting from `byte_length` bytes from a byte-indexed slice - Some(&string[byte_length..]) - } - _ => None, - }) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) -} - -/// `right` implementation for StringViewArray -fn right_impl_view( - string_view_array: &StringViewArray, - n_array: &Int64Array, -) -> Result { - let len = n_array.len(); - - let views = string_view_array.views(); - // Every string in StringViewArray has one corresponding view in `views` - debug_assert!(views.len() == string_view_array.len()); - - // Compose null buffer at once - let string_nulls = string_view_array.nulls(); - let n_nulls = n_array.nulls(); - let new_nulls = NullBuffer::union(string_nulls, n_nulls); - - let new_views = (0..len) - .map(|idx| { - let view = views[idx]; - - let is_valid = match &new_nulls { - Some(nulls_buf) => nulls_buf.is_valid(idx), - None => true, - }; - - if is_valid { - let string: &str = string_view_array.value(idx); - let n = n_array.value(idx); - - let new_offset = right_byte_length(string, n); - let result_bytes = &string.as_bytes()[new_offset..]; - - if result_bytes.len() > 12 { - let byte_view = ByteView::from(view); - // Reuse buffer, but adjust offset and length - make_view( - result_bytes, - byte_view.buffer_index, - byte_view.offset + new_offset as u32, - ) - } else { - // inline value does not need block id or offset - make_view(result_bytes, 0, 0) - } - } else { - // For nulls, keep the original view - view - } - }) - .collect::>(); - - // Buffers are unchanged - let result = StringViewArray::try_new( - ScalarBuffer::from(new_views), - Vec::from(string_view_array.data_buffers()), - new_nulls, - )?; - Ok(Arc::new(result) as ArrayRef) -} - -/// Calculate the byte length of the substring of last `n` chars from string `string` -/// (or all but first `|n|` chars if n is negative) -fn right_byte_length(string: &str, n: i64) -> usize { - match n.cmp(&0) { - Ordering::Less => string - .char_indices() - .nth(n.unsigned_abs().min(usize::MAX as u64) as usize) - .map(|(index, _)| index) - .unwrap_or(string.len()), - Ordering::Equal => string.len(), - Ordering::Greater => string - .char_indices() - .nth_back((n.unsigned_abs().min(usize::MAX as u64) - 1) as usize) - .map(|(index, _)| index) - .unwrap_or(0), - } -} - #[cfg(test)] mod tests { use arrow::array::{Array, StringArray, StringViewArray};