From 08f3cdbb9315967e468cd98cdf5852f0ff4fba64 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 30 Dec 2025 13:23:14 -0800 Subject: [PATCH] perf: optimize left function by eliminating double chars() iteration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For negative n values, the function was calling string.chars() twice: 1. Once to count total characters 2. Again to take the prefix This optimization collects chars into a reusable buffer once per row for the negative n case, eliminating the redundant iteration. Benchmark results (negative n, which triggers the optimization): - size=1024: 71.323 µs → 52.760 µs (26.0% faster) - size=4096: 289.62 µs → 212.23 µs (26.7% faster) Benchmark results (positive n, minimal overhead): - size=1024: 24.465 µs → 24.691 µs (0.9% slower) - size=4096: 96.129 µs → 97.078 µs (1.0% slower) The dramatic improvement for negative n cases far outweighs the negligible overhead for positive n cases. --- datafusion/functions/Cargo.toml | 5 + datafusion/functions/benches/left.rs | 111 +++++++++++++++++++++++ datafusion/functions/src/unicode/left.rs | 12 ++- 3 files changed, 126 insertions(+), 2 deletions(-) create mode 100644 datafusion/functions/benches/left.rs diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index d85a269c7fa71..eba283a9001e6 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -279,3 +279,8 @@ required-features = ["unicode_expressions"] harness = false name = "levenshtein" required-features = ["unicode_expressions"] + +[[bench]] +harness = false +name = "left" +required-features = ["unicode_expressions"] diff --git a/datafusion/functions/benches/left.rs b/datafusion/functions/benches/left.rs new file mode 100644 index 0000000000000..3ea628fe2987c --- /dev/null +++ b/datafusion/functions/benches/left.rs @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use std::hint::black_box; +use std::sync::Arc; + +use arrow::array::{ArrayRef, Int64Array}; +use arrow::datatypes::{DataType, Field}; +use arrow::util::bench_util::create_string_array_with_len; +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use datafusion_common::config::ConfigOptions; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; +use datafusion_functions::unicode::left; + +fn create_args(size: usize, str_len: usize, use_negative: bool) -> Vec { + let string_array = Arc::new(create_string_array_with_len::(size, 0.1, str_len)); + + // For negative n, we want to trigger the double-iteration code path + let n_values: Vec = if use_negative { + (0..size).map(|i| -((i % 10 + 1) as i64)).collect() + } else { + (0..size).map(|i| (i % 10 + 1) as i64).collect() + }; + let n_array = Arc::new(Int64Array::from(n_values)); + + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(Arc::clone(&n_array) as ArrayRef), + ] +} + +fn criterion_benchmark(c: &mut Criterion) { + for size in [1024, 4096] { + let mut group = c.benchmark_group(format!("left size={size}")); + + // Benchmark with positive n (no optimization needed) + let args = create_args(size, 32, false); + group.bench_function(BenchmarkId::new("positive n", size), |b| { + let arg_fields = args + .iter() + .enumerate() + .map(|(idx, arg)| { + Field::new(format!("arg_{idx}"), arg.data_type(), true).into() + }) + .collect::>(); + let config_options = Arc::new(ConfigOptions::default()); + + b.iter(|| { + black_box( + left() + .invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: size, + return_field: Field::new("f", DataType::Utf8, true).into(), + config_options: Arc::clone(&config_options), + }) + .expect("left should work"), + ) + }) + }); + + // Benchmark with negative n (triggers optimization) + let args = create_args(size, 32, true); + group.bench_function(BenchmarkId::new("negative n", size), |b| { + let arg_fields = args + .iter() + .enumerate() + .map(|(idx, arg)| { + Field::new(format!("arg_{idx}"), arg.data_type(), true).into() + }) + .collect::>(); + let config_options = Arc::new(ConfigOptions::default()); + + b.iter(|| { + black_box( + left() + .invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: size, + return_field: Field::new("f", DataType::Utf8, true).into(), + config_options: Arc::clone(&config_options), + }) + .expect("left should work"), + ) + }) + }); + + group.finish(); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/functions/src/unicode/left.rs b/datafusion/functions/src/unicode/left.rs index ecff8f8699506..6e07ea76d5593 100644 --- a/datafusion/functions/src/unicode/left.rs +++ b/datafusion/functions/src/unicode/left.rs @@ -139,14 +139,22 @@ fn left_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor>( n_array: &Int64Array, ) -> Result { let iter = ArrayIter::new(string_array); + let mut chars_buf = Vec::new(); let result = iter .zip(n_array.iter()) .map(|(string, n)| match (string, n) { (Some(string), Some(n)) => match n.cmp(&0) { Ordering::Less => { - let len = string.chars().count() as i64; + // Collect chars once and reuse for both count and take + chars_buf.clear(); + chars_buf.extend(string.chars()); + let len = chars_buf.len() as i64; + Some(if n.abs() < len { - string.chars().take((len + n) as usize).collect::() + chars_buf + .iter() + .take((len + n) as usize) + .collect::() } else { "".to_string() })