Skip to content

Commit dbb16c1

Browse files
committed
Avoid constructing an ExecutionCtx for non-null validity during validation
`VarBinViewArray`/`VarBinArray` validation only needs an execution context when validity is array-backed. Previously `is_null` short-circuited for the `NonNullable`/`AllValid` variants without ever creating a context, but the ctx-threaded version constructed a `LEGACY_SESSION` context unconditionally, adding overhead to the common non-null construction path (observed as a regression in the `varbinview_zip` benchmarks). Construct the context lazily, only for `Validity::Array`, and resolve the constant variants without one. The UTF-8 validation loop is extracted into `VarBinArray::validate_utf8` with an `is_null_at` helper to keep the function within the cognitive-complexity budget. Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
1 parent 15cd35a commit dbb16c1

2 files changed

Lines changed: 54 additions & 25 deletions

File tree

vortex-array/src/arrays/varbin/array.rs

Lines changed: 45 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ use vortex_error::vortex_err;
1414

1515
use crate::ArrayRef;
1616
use crate::ArraySlots;
17+
use crate::ExecutionCtx;
1718
use crate::LEGACY_SESSION;
1819
#[expect(deprecated)]
1920
use crate::ToCanonical as _;
@@ -244,34 +245,55 @@ impl VarBinData {
244245
&& matches!(dtype, DType::Utf8(_))
245246
&& let Some(bytes) = bytes.as_host_opt()
246247
{
247-
#[expect(deprecated)]
248-
let primitive_offsets = offsets.to_primitive();
249-
match_each_integer_ptype!(primitive_offsets.dtype().as_ptype(), |O| {
250-
let offsets_slice = primitive_offsets.as_slice::<O>();
251-
let mut ctx = LEGACY_SESSION.create_execution_ctx();
252-
for (i, (start, end)) in offsets_slice
253-
.windows(2)
254-
.map(|o| (o[0].as_(), o[1].as_()))
255-
.enumerate()
256-
{
257-
if validity.execute_is_null(i, &mut ctx)? {
258-
continue;
259-
}
260-
261-
let string_bytes = &bytes.as_ref()[start..end];
262-
simdutf8::basic::from_utf8(string_bytes).map_err(|_| {
263-
#[expect(clippy::unwrap_used)]
264-
// run validation using `compat` package to get more detailed error message
265-
let err = simdutf8::compat::from_utf8(string_bytes).unwrap_err();
266-
vortex_err!("invalid utf-8: {err} at index {i}")
267-
})?;
268-
}
269-
});
248+
Self::validate_utf8(offsets, bytes.as_ref(), validity)?;
270249
}
271250

272251
Ok(())
273252
}
274253

254+
/// Validates that every non-null value is valid UTF-8.
255+
fn validate_utf8(offsets: &ArrayRef, bytes: &[u8], validity: &Validity) -> VortexResult<()> {
256+
#[expect(deprecated)]
257+
let primitive_offsets = offsets.to_primitive();
258+
// Only array-backed validity needs an execution context; the constant variants resolve
259+
// null-ness without one, so avoid constructing a context otherwise.
260+
let mut ctx =
261+
matches!(validity, Validity::Array(_)).then(|| LEGACY_SESSION.create_execution_ctx());
262+
match_each_integer_ptype!(primitive_offsets.dtype().as_ptype(), |O| {
263+
let offsets_slice = primitive_offsets.as_slice::<O>();
264+
for (i, (start, end)) in offsets_slice
265+
.windows(2)
266+
.map(|o| (o[0].as_(), o[1].as_()))
267+
.enumerate()
268+
{
269+
if Self::is_null_at(validity, i, ctx.as_mut())? {
270+
continue;
271+
}
272+
273+
let string_bytes = &bytes[start..end];
274+
simdutf8::basic::from_utf8(string_bytes).map_err(|_| {
275+
#[expect(clippy::unwrap_used)]
276+
// run validation using `compat` package to get more detailed error message
277+
let err = simdutf8::compat::from_utf8(string_bytes).unwrap_err();
278+
vortex_err!("invalid utf-8: {err} at index {i}")
279+
})?;
280+
}
281+
});
282+
Ok(())
283+
}
284+
285+
/// Resolves null-ness at `index`, using `ctx` only for array-backed validity.
286+
fn is_null_at(
287+
validity: &Validity,
288+
index: usize,
289+
ctx: Option<&mut ExecutionCtx>,
290+
) -> VortexResult<bool> {
291+
match ctx {
292+
Some(ctx) => validity.execute_is_null(index, ctx),
293+
None => Ok(matches!(validity, Validity::AllInvalid)),
294+
}
295+
}
296+
275297
/// Access the value bytes child buffer
276298
///
277299
/// # Note

vortex-array/src/arrays/varbinview/array.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -318,9 +318,16 @@ impl VarBinViewData {
318318
where
319319
F: Fn(&[u8]) -> bool,
320320
{
321-
let mut ctx = LEGACY_SESSION.create_execution_ctx();
321+
// Only array-backed validity needs an execution context; the constant variants resolve
322+
// null-ness without one, so avoid constructing a context in the common (non-null) case.
323+
let mut ctx =
324+
matches!(validity, Validity::Array(_)).then(|| LEGACY_SESSION.create_execution_ctx());
322325
for (idx, &view) in views.iter().enumerate() {
323-
if validity.execute_is_null(idx, &mut ctx)? {
326+
let is_null = match ctx.as_mut() {
327+
Some(ctx) => validity.execute_is_null(idx, ctx)?,
328+
None => matches!(validity, Validity::AllInvalid),
329+
};
330+
if is_null {
324331
continue;
325332
}
326333

0 commit comments

Comments
 (0)