Skip to content

Commit d2d2e29

Browse files
authored
byte_length() expression (#8298)
This PR addes byte_length() expression for strings and blobs. This PR does _not_ allow byte_length pushdown for Duckdb's strlen() due to SharedArray performance issues (will be added separately). Signed-off-by: Mikhail Kot <mikhail@spiraldb.com>
1 parent 5e3aedb commit d2d2e29

6 files changed

Lines changed: 326 additions & 0 deletions

File tree

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
use vortex_array::ArrayRef;
4+
use vortex_array::ArrayView;
5+
use vortex_array::ExecutionCtx;
6+
use vortex_array::IntoArray;
7+
use vortex_array::ValidityVTable;
8+
use vortex_array::arrays::ConstantArray;
9+
use vortex_array::builtins::ArrayBuiltins;
10+
use vortex_array::dtype::DType;
11+
use vortex_array::dtype::PType;
12+
use vortex_array::scalar::Scalar;
13+
use vortex_array::scalar_fn::fns::byte_length::ByteLengthKernel;
14+
use vortex_array::validity::Validity;
15+
use vortex_error::VortexResult;
16+
17+
use crate::FSST;
18+
use crate::array::FSSTArrayExt;
19+
20+
impl ByteLengthKernel for FSST {
21+
fn byte_length(
22+
array: ArrayView<'_, Self>,
23+
_ctx: &mut ExecutionCtx,
24+
) -> VortexResult<Option<ArrayRef>> {
25+
let nullable = array.dtype().nullability();
26+
let dtype = DType::Primitive(PType::U64, nullable);
27+
// Uncompressed lengths are non-nullable and may be less than u64 each
28+
let lengths = array.uncompressed_lengths().cast(dtype.clone())?;
29+
Ok(Some(match FSST::validity(array)? {
30+
Validity::NonNullable | Validity::AllValid => lengths,
31+
Validity::Array(v) => lengths.mask(v)?,
32+
Validity::AllInvalid => {
33+
ConstantArray::new(Scalar::null(dtype), lengths.len()).into_array()
34+
}
35+
}))
36+
}
37+
}

encodings/fsst/src/compute/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4+
mod byte_length;
45
mod cast;
56
mod compare;
67
mod filter;

encodings/fsst/src/kernel.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ use vortex_array::arrays::dict::TakeExecuteAdaptor;
55
use vortex_array::arrays::filter::FilterExecuteAdaptor;
66
use vortex_array::kernel::ParentKernelSet;
77
use vortex_array::scalar_fn::fns::binary::CompareExecuteAdaptor;
8+
use vortex_array::scalar_fn::fns::byte_length::ByteLengthExecuteAdaptor;
89
use vortex_array::scalar_fn::fns::cast::CastExecuteAdaptor;
910
use vortex_array::scalar_fn::fns::like::LikeExecuteAdaptor;
1011

@@ -16,6 +17,7 @@ pub(super) const PARENT_KERNELS: ParentKernelSet<FSST> = ParentKernelSet::new(&[
1617
ParentKernelSet::lift(&FilterExecuteAdaptor(FSST)),
1718
ParentKernelSet::lift(&TakeExecuteAdaptor(FSST)),
1819
ParentKernelSet::lift(&LikeExecuteAdaptor(FSST)),
20+
ParentKernelSet::lift(&ByteLengthExecuteAdaptor(FSST)),
1921
]);
2022

2123
#[cfg(test)]
@@ -27,10 +29,13 @@ mod tests {
2729
use vortex_array::IntoArray;
2830
use vortex_array::VortexSessionExecute;
2931
use vortex_array::arrays::FilterArray;
32+
use vortex_array::arrays::PrimitiveArray;
3033
use vortex_array::arrays::varbin::builder::VarBinBuilder;
3134
use vortex_array::assert_arrays_eq;
3235
use vortex_array::dtype::DType;
3336
use vortex_array::dtype::Nullability;
37+
use vortex_array::expr::byte_length;
38+
use vortex_array::expr::root;
3439
use vortex_array::session::ArraySession;
3540
use vortex_error::VortexResult;
3641
use vortex_mask::Mask;
@@ -205,4 +210,24 @@ mod tests {
205210
assert_arrays_eq!(result, fsst_array);
206211
Ok(())
207212
}
213+
214+
#[test]
215+
fn test_fsst_byte_length() -> VortexResult<()> {
216+
let mut builder = VarBinBuilder::<i32>::with_capacity(3);
217+
builder.append_value(b"hello");
218+
builder.append_value(b"world!!");
219+
builder.append_value("Пуховички"); // 9 characters, 18 bytes
220+
builder.append_value(b"");
221+
222+
let varbin = builder.finish(DType::Utf8(Nullability::NonNullable));
223+
let compressor = fsst_train_compressor(&varbin);
224+
let len = varbin.len();
225+
let dtype = varbin.dtype().clone();
226+
let mut ctx = SESSION.create_execution_ctx();
227+
let fsst = fsst_compress(varbin, len, &dtype, &compressor, &mut ctx).into_array();
228+
let result = fsst.apply(&byte_length(root()))?;
229+
let expected = PrimitiveArray::from_iter(vec![5u64, 7, 18, 0]);
230+
assert_arrays_eq!(result, expected);
231+
Ok(())
232+
}
208233
}

vortex-array/src/expr/exprs.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use crate::scalar_fn::ScalarFnVTableExt;
2121
use crate::scalar_fn::fns::between::Between;
2222
use crate::scalar_fn::fns::between::BetweenOptions;
2323
use crate::scalar_fn::fns::binary::Binary;
24+
use crate::scalar_fn::fns::byte_length::ByteLength;
2425
use crate::scalar_fn::fns::case_when::CaseWhen;
2526
use crate::scalar_fn::fns::case_when::CaseWhenOptions;
2627
use crate::scalar_fn::fns::cast::Cast;
@@ -718,3 +719,16 @@ pub fn dynamic(
718719
pub fn list_contains(list: Expression, value: Expression) -> Expression {
719720
ListContains.new_expr(EmptyOptions, [list, value])
720721
}
722+
723+
// ---- ByteLength ----
724+
725+
/// Creates an expression that computes the byte length of each element.
726+
/// This is akin to ANSI SQL OCTET_LENGTH(), or DuckDB's strlen().
727+
///
728+
/// ```rust
729+
/// # use vortex_array::expr::{byte_length, root};
730+
/// let expr = byte_length(root());
731+
/// ```
732+
pub fn byte_length(input: Expression) -> Expression {
733+
ByteLength.new_expr(EmptyOptions, [input])
734+
}
Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
use num_traits::AsPrimitive;
5+
use vortex_buffer::Buffer;
6+
use vortex_error::VortexExpect;
7+
use vortex_error::VortexResult;
8+
use vortex_error::vortex_bail;
9+
use vortex_error::vortex_ensure;
10+
use vortex_session::VortexSession;
11+
use vortex_session::registry::CachedId;
12+
13+
use crate::ArrayRef;
14+
use crate::ExecutionCtx;
15+
use crate::IntoArray;
16+
use crate::array::ArrayView;
17+
use crate::array::VTable;
18+
use crate::arrays::ConstantArray;
19+
use crate::arrays::PrimitiveArray;
20+
use crate::arrays::VarBinViewArray;
21+
use crate::arrays::scalar_fn::ExactScalarFn;
22+
use crate::arrays::scalar_fn::ScalarFnArrayView;
23+
use crate::arrays::varbinview::VarBinViewArrayExt;
24+
use crate::dtype::DType;
25+
use crate::dtype::Nullability;
26+
use crate::dtype::PType;
27+
use crate::kernel::ExecuteParentKernel;
28+
use crate::scalar::Scalar;
29+
use crate::scalar_fn::Arity;
30+
use crate::scalar_fn::ChildName;
31+
use crate::scalar_fn::EmptyOptions;
32+
use crate::scalar_fn::ExecutionArgs;
33+
use crate::scalar_fn::ScalarFnId;
34+
use crate::scalar_fn::ScalarFnVTable;
35+
36+
pub trait ByteLengthKernel: VTable {
37+
fn byte_length(
38+
array: ArrayView<'_, Self>,
39+
ctx: &mut ExecutionCtx,
40+
) -> VortexResult<Option<ArrayRef>>;
41+
}
42+
43+
#[derive(Default, Debug)]
44+
pub struct ByteLengthExecuteAdaptor<V>(pub V);
45+
46+
impl<V: ByteLengthKernel> ExecuteParentKernel<V> for ByteLengthExecuteAdaptor<V> {
47+
type Parent = ExactScalarFn<ByteLength>;
48+
49+
fn execute_parent(
50+
&self,
51+
array: ArrayView<'_, V>,
52+
_parent: ScalarFnArrayView<'_, ByteLength>,
53+
child_idx: usize,
54+
ctx: &mut ExecutionCtx,
55+
) -> VortexResult<Option<ArrayRef>> {
56+
vortex_ensure!(child_idx == 0);
57+
V::byte_length(array, ctx)
58+
}
59+
}
60+
61+
/// Byte length of each element in a Utf8 or Binary array.
62+
#[derive(Clone)]
63+
pub struct ByteLength;
64+
65+
impl ScalarFnVTable for ByteLength {
66+
type Options = EmptyOptions;
67+
68+
fn id(&self) -> ScalarFnId {
69+
static ID: CachedId = CachedId::new("vortex.byte_length");
70+
*ID
71+
}
72+
73+
fn serialize(&self, _instance: &Self::Options) -> VortexResult<Option<Vec<u8>>> {
74+
Ok(Some(vec![]))
75+
}
76+
77+
fn deserialize(
78+
&self,
79+
_metadata: &[u8],
80+
_session: &VortexSession,
81+
) -> VortexResult<Self::Options> {
82+
Ok(EmptyOptions)
83+
}
84+
85+
fn arity(&self, _options: &Self::Options) -> Arity {
86+
Arity::Exact(1)
87+
}
88+
89+
fn child_name(&self, _instance: &Self::Options, child_idx: usize) -> ChildName {
90+
match child_idx {
91+
0 => ChildName::from("input"),
92+
_ => unreachable!("Invalid child index {child_idx} for byte_length()"),
93+
}
94+
}
95+
96+
fn return_dtype(&self, _options: &Self::Options, arg_dtypes: &[DType]) -> VortexResult<DType> {
97+
match &arg_dtypes[0] {
98+
DType::Utf8(nullable) | DType::Binary(nullable) => {
99+
Ok(DType::Primitive(PType::U64, *nullable))
100+
}
101+
other => vortex_bail!("byte_length() requires Utf8 or Binary, got {other}"),
102+
}
103+
}
104+
105+
fn execute(
106+
&self,
107+
_options: &Self::Options,
108+
args: &dyn ExecutionArgs,
109+
ctx: &mut ExecutionCtx,
110+
) -> VortexResult<ArrayRef> {
111+
let input = args.get(0)?;
112+
let nullability = input.dtype().nullability();
113+
114+
if let Some(scalar) = input.as_constant() {
115+
let len_scalar = scalar_byte_length(&scalar, nullability)?;
116+
return Ok(ConstantArray::new(len_scalar, args.row_count()).into_array());
117+
}
118+
119+
match input.dtype() {
120+
DType::Utf8(_) | DType::Binary(_) => byte_length(&input, nullability, ctx),
121+
other => vortex_bail!("byte_length() requires Utf8 or Binary, got {other}"),
122+
}
123+
}
124+
125+
fn is_null_sensitive(&self, _options: &Self::Options) -> bool {
126+
false
127+
}
128+
129+
fn is_fallible(&self, _options: &Self::Options) -> bool {
130+
false
131+
}
132+
}
133+
134+
fn scalar_byte_length(scalar: &Scalar, nullability: Nullability) -> VortexResult<Scalar> {
135+
if scalar.is_null() {
136+
let dtype = DType::Primitive(PType::U64, Nullability::Nullable);
137+
return Ok(Scalar::null(dtype));
138+
}
139+
let len = match scalar.dtype() {
140+
DType::Utf8(_) => scalar
141+
.as_utf8()
142+
.value()
143+
.vortex_expect("null utf-8 scalar")
144+
.len(),
145+
DType::Binary(_) => scalar
146+
.as_binary()
147+
.value()
148+
.vortex_expect("null binary scalar")
149+
.len(),
150+
other => vortex_bail!("byte_length() requires Utf8 or Binary, got {other}"),
151+
};
152+
let len: u64 = len.as_();
153+
Ok(Scalar::primitive(len, nullability))
154+
}
155+
156+
pub(crate) fn byte_length(
157+
array: &ArrayRef,
158+
nullability: Nullability,
159+
ctx: &mut ExecutionCtx,
160+
) -> VortexResult<ArrayRef> {
161+
let array = array.clone().execute::<VarBinViewArray>(ctx)?;
162+
let validity = array.varbinview_validity();
163+
let lengths: Buffer<u64> = array.views().iter().map(|v| v.len() as u64).collect();
164+
Ok(PrimitiveArray::new(lengths, validity.union_nullability(nullability)).into_array())
165+
}
166+
167+
#[cfg(test)]
168+
mod tests {
169+
use rstest::rstest;
170+
use vortex_error::VortexResult;
171+
172+
use crate::ArrayRef;
173+
use crate::IntoArray;
174+
use crate::LEGACY_SESSION;
175+
use crate::VortexSessionExecute;
176+
use crate::arrays::ConstantArray;
177+
use crate::arrays::PrimitiveArray;
178+
use crate::arrays::VarBinArray;
179+
use crate::arrays::VarBinViewArray;
180+
use crate::assert_arrays_eq;
181+
use crate::dtype::DType;
182+
use crate::dtype::Nullability;
183+
use crate::expr::byte_length;
184+
use crate::expr::root;
185+
use crate::scalar::Scalar;
186+
187+
#[rstest]
188+
#[case(VarBinArray::from_strs(vec!["hello", "world", ""]).into_array(), vec![5u64, 5, 0])]
189+
#[case(VarBinArray::from_bytes(vec![b"ab".as_ref(), b"cde"]).into_array(), vec![2u64, 3])]
190+
#[case(VarBinArray::from_strs(vec!["Пуховички"]).into_array(), vec![18u64])]
191+
#[case(VarBinArray::from_bytes(vec!["Пуховички".as_ref()]).into_array(), vec![18u64])]
192+
fn test_bytes_byte_length(
193+
#[case] array: ArrayRef,
194+
#[case] expected_lens: Vec<u64>,
195+
) -> VortexResult<()> {
196+
let result = array.apply(&byte_length(root()))?;
197+
let expected = PrimitiveArray::from_iter(expected_lens);
198+
assert_arrays_eq!(result, expected);
199+
Ok(())
200+
}
201+
202+
#[test]
203+
fn test_varbinview_byte_length() -> VortexResult<()> {
204+
let array = VarBinViewArray::from_iter_str(["short", "a longer string here"]).into_array();
205+
let result = array.apply(&byte_length(root()))?;
206+
let expected = PrimitiveArray::from_iter(vec![5u64, 20]);
207+
assert_arrays_eq!(result, expected);
208+
Ok(())
209+
}
210+
211+
#[test]
212+
fn test_nullable_string_byte_length() -> VortexResult<()> {
213+
let array = VarBinArray::from_nullable_strs(vec![Some("hello"), None, Some("Пуховички")])
214+
.into_array();
215+
let result = array.apply(&byte_length(root()))?;
216+
217+
let mut ctx = LEGACY_SESSION.create_execution_ctx();
218+
assert!(result.is_valid(0, &mut ctx)?);
219+
assert!(!result.is_valid(1, &mut ctx)?);
220+
assert!(result.is_valid(2, &mut ctx)?);
221+
assert_eq!(
222+
result.execute_scalar(0, &mut LEGACY_SESSION.create_execution_ctx())?,
223+
Scalar::primitive(5u64, Nullability::Nullable),
224+
);
225+
assert_eq!(
226+
result.execute_scalar(2, &mut LEGACY_SESSION.create_execution_ctx())?,
227+
Scalar::primitive(18u64, Nullability::Nullable),
228+
);
229+
Ok(())
230+
}
231+
232+
#[test]
233+
fn test_null_scalar_byte_length() -> VortexResult<()> {
234+
let null_scalar = Scalar::null(DType::Utf8(Nullability::Nullable));
235+
let array = ConstantArray::new(null_scalar, 2).into_array();
236+
let result = array.apply(&byte_length(root()))?;
237+
let mut ctx = LEGACY_SESSION.create_execution_ctx();
238+
assert!(!result.is_valid(0, &mut ctx)?);
239+
assert!(!result.is_valid(1, &mut ctx)?);
240+
Ok(())
241+
}
242+
243+
#[test]
244+
fn test_display() {
245+
let expr = byte_length(root());
246+
assert_eq!(expr.to_string(), "vortex.byte_length($)");
247+
}
248+
}

vortex-array/src/scalar_fn/fns/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
pub mod between;
55
pub mod binary;
6+
pub mod byte_length;
67
pub mod case_when;
78
pub mod cast;
89
pub mod dynamic;

0 commit comments

Comments
 (0)