Skip to content

Commit 08f3cdb

Browse files
committed
perf: optimize left function by eliminating double chars() iteration
For negative n values, the function was calling string.chars() twice: 1. Once to count total characters 2. Again to take the prefix This optimization collects chars into a reusable buffer once per row for the negative n case, eliminating the redundant iteration. Benchmark results (negative n, which triggers the optimization): - size=1024: 71.323 µs → 52.760 µs (26.0% faster) - size=4096: 289.62 µs → 212.23 µs (26.7% faster) Benchmark results (positive n, minimal overhead): - size=1024: 24.465 µs → 24.691 µs (0.9% slower) - size=4096: 96.129 µs → 97.078 µs (1.0% slower) The dramatic improvement for negative n cases far outweighs the negligible overhead for positive n cases.
1 parent 7c50448 commit 08f3cdb

File tree

3 files changed

+126
-2
lines changed

3 files changed

+126
-2
lines changed

datafusion/functions/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,3 +279,8 @@ required-features = ["unicode_expressions"]
279279
harness = false
280280
name = "levenshtein"
281281
required-features = ["unicode_expressions"]
282+
283+
[[bench]]
284+
harness = false
285+
name = "left"
286+
required-features = ["unicode_expressions"]
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
extern crate criterion;
19+
20+
use std::hint::black_box;
21+
use std::sync::Arc;
22+
23+
use arrow::array::{ArrayRef, Int64Array};
24+
use arrow::datatypes::{DataType, Field};
25+
use arrow::util::bench_util::create_string_array_with_len;
26+
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
27+
use datafusion_common::config::ConfigOptions;
28+
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
29+
use datafusion_functions::unicode::left;
30+
31+
fn create_args(size: usize, str_len: usize, use_negative: bool) -> Vec<ColumnarValue> {
32+
let string_array = Arc::new(create_string_array_with_len::<i32>(size, 0.1, str_len));
33+
34+
// For negative n, we want to trigger the double-iteration code path
35+
let n_values: Vec<i64> = if use_negative {
36+
(0..size).map(|i| -((i % 10 + 1) as i64)).collect()
37+
} else {
38+
(0..size).map(|i| (i % 10 + 1) as i64).collect()
39+
};
40+
let n_array = Arc::new(Int64Array::from(n_values));
41+
42+
vec![
43+
ColumnarValue::Array(string_array),
44+
ColumnarValue::Array(Arc::clone(&n_array) as ArrayRef),
45+
]
46+
}
47+
48+
fn criterion_benchmark(c: &mut Criterion) {
49+
for size in [1024, 4096] {
50+
let mut group = c.benchmark_group(format!("left size={size}"));
51+
52+
// Benchmark with positive n (no optimization needed)
53+
let args = create_args(size, 32, false);
54+
group.bench_function(BenchmarkId::new("positive n", size), |b| {
55+
let arg_fields = args
56+
.iter()
57+
.enumerate()
58+
.map(|(idx, arg)| {
59+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
60+
})
61+
.collect::<Vec<_>>();
62+
let config_options = Arc::new(ConfigOptions::default());
63+
64+
b.iter(|| {
65+
black_box(
66+
left()
67+
.invoke_with_args(ScalarFunctionArgs {
68+
args: args.clone(),
69+
arg_fields: arg_fields.clone(),
70+
number_rows: size,
71+
return_field: Field::new("f", DataType::Utf8, true).into(),
72+
config_options: Arc::clone(&config_options),
73+
})
74+
.expect("left should work"),
75+
)
76+
})
77+
});
78+
79+
// Benchmark with negative n (triggers optimization)
80+
let args = create_args(size, 32, true);
81+
group.bench_function(BenchmarkId::new("negative n", size), |b| {
82+
let arg_fields = args
83+
.iter()
84+
.enumerate()
85+
.map(|(idx, arg)| {
86+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
87+
})
88+
.collect::<Vec<_>>();
89+
let config_options = Arc::new(ConfigOptions::default());
90+
91+
b.iter(|| {
92+
black_box(
93+
left()
94+
.invoke_with_args(ScalarFunctionArgs {
95+
args: args.clone(),
96+
arg_fields: arg_fields.clone(),
97+
number_rows: size,
98+
return_field: Field::new("f", DataType::Utf8, true).into(),
99+
config_options: Arc::clone(&config_options),
100+
})
101+
.expect("left should work"),
102+
)
103+
})
104+
});
105+
106+
group.finish();
107+
}
108+
}
109+
110+
criterion_group!(benches, criterion_benchmark);
111+
criterion_main!(benches);

datafusion/functions/src/unicode/left.rs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,14 +139,22 @@ fn left_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor<Item = &'a str>>(
139139
n_array: &Int64Array,
140140
) -> Result<ArrayRef> {
141141
let iter = ArrayIter::new(string_array);
142+
let mut chars_buf = Vec::new();
142143
let result = iter
143144
.zip(n_array.iter())
144145
.map(|(string, n)| match (string, n) {
145146
(Some(string), Some(n)) => match n.cmp(&0) {
146147
Ordering::Less => {
147-
let len = string.chars().count() as i64;
148+
// Collect chars once and reuse for both count and take
149+
chars_buf.clear();
150+
chars_buf.extend(string.chars());
151+
let len = chars_buf.len() as i64;
152+
148153
Some(if n.abs() < len {
149-
string.chars().take((len + n) as usize).collect::<String>()
154+
chars_buf
155+
.iter()
156+
.take((len + n) as usize)
157+
.collect::<String>()
150158
} else {
151159
"".to_string()
152160
})

0 commit comments

Comments
 (0)