Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion arrow-data/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -980,7 +980,15 @@ impl ArrayData {
) -> Result<(), ArrowError> {
let offsets: &[T] = self.typed_buffer(0, self.len)?;
let sizes: &[T] = self.typed_buffer(1, self.len)?;
for i in 0..values_length {
if offsets.len() != sizes.len() {
return Err(ArrowError::ComputeError(format!(
"ListView offsets len {} does not match sizes len {}",
offsets.len(),
sizes.len()
)));
}

for i in 0..sizes.len() {
let size = sizes[i].to_usize().ok_or_else(|| {
ArrowError::InvalidArgumentError(format!(
"Error converting size[{}] ({}) to usize for {}",
Expand Down
129 changes: 129 additions & 0 deletions arrow-data/src/equal/list_view.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use crate::ArrayData;
use crate::data::count_nulls;
use crate::equal::equal_values;
use arrow_buffer::ArrowNativeType;
use num_integer::Integer;

pub(super) fn list_view_equal<T: ArrowNativeType + Integer>(
lhs: &ArrayData,
rhs: &ArrayData,
lhs_start: usize,
rhs_start: usize,
len: usize,
) -> bool {
let lhs_offsets = lhs.buffer::<T>(0);
let lhs_sizes = lhs.buffer::<T>(1);

let rhs_offsets = rhs.buffer::<T>(0);
let rhs_sizes = rhs.buffer::<T>(1);

let lhs_data = &lhs.child_data()[0];
let rhs_data = &rhs.child_data()[0];

let lhs_null_count = count_nulls(lhs.nulls(), lhs_start, len);
let rhs_null_count = count_nulls(rhs.nulls(), rhs_start, len);

if lhs_null_count != rhs_null_count {
return false;
}

if lhs_null_count == 0 {
// non-null pathway: all sizes must be equal, and all values must be equal
let lhs_range_sizes = &lhs_sizes[lhs_start..lhs_start + len];
let rhs_range_sizes = &rhs_sizes[rhs_start..rhs_start + len];

if lhs_range_sizes.len() != rhs_range_sizes.len() {
return false;
}

if lhs_range_sizes != rhs_range_sizes {
return false;
}

// Check values for equality
let lhs_range_offsets = &lhs_offsets[lhs_start..lhs_start + len];
let rhs_range_offsets = &rhs_offsets[rhs_start..rhs_start + len];

if lhs_range_offsets.len() != rhs_range_offsets.len() {
return false;
}

for ((&lhs_offset, &rhs_offset), &size) in lhs_range_offsets
.iter()
.zip(rhs_range_offsets)
.zip(lhs_sizes)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this loop you zip lhs_range_offsets with rhs_range_offsets and then with lhs_sizes, but lhs_sizes is the full buffer, not the sliced lhs_range_sizes. When lhs_start != 0 this will read sizes from the wrong indices and can produce incorrect equality results. Consider using the sliced sizes here (also applies to the similar loop in the else branch).

🤖 React with 👍 or 👎 to let us know if the comment was useful.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

value:useful; category:bug; feedback:The AI agent is correct that the zipped iterators might start from the wrong initial element with lhs_sizes. The third iterator should be lhs_range_sizes to use the same sub-slice of the iterator as the other two.

{
let lhs_offset = lhs_offset.to_usize().unwrap();
let rhs_offset = rhs_offset.to_usize().unwrap();
let size = size.to_usize().unwrap();

// Check if offsets are valid for the given range
if !equal_values(lhs_data, rhs_data, lhs_offset, rhs_offset, size) {
return false;
}
}
Comment on lines +68 to +81
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Fix wrong zip and use per-range sizes to avoid mismatches on slices.

You’re zipping offsets with the full lhs_sizes buffer, not the sliced range. This misaligns sizes when lhs_start > 0. Replace the loop with index-based access over the sliced ranges.

Apply:

-        for ((&lhs_offset, &rhs_offset), &size) in lhs_range_offsets
-            .iter()
-            .zip(rhs_range_offsets)
-            .zip(lhs_sizes)
-        {
-            let lhs_offset = lhs_offset.to_usize().unwrap();
-            let rhs_offset = rhs_offset.to_usize().unwrap();
-            let size = size.to_usize().unwrap();
-
-            // Check if offsets are valid for the given range
-            if !equal_values(lhs_data, rhs_data, lhs_offset, rhs_offset, size) {
-                return false;
-            }
-        }
+        for i in 0..len {
+            let lhs_offset = match lhs_range_offsets[i].to_usize() {
+                Some(v) => v,
+                None => return false,
+            };
+            let rhs_offset = match rhs_range_offsets[i].to_usize() {
+                Some(v) => v,
+                None => return false,
+            };
+            let size = match lhs_range_sizes[i].to_usize() {
+                Some(v) => v,
+                None => return false,
+            };
+            if !equal_values(lhs_data, rhs_data, lhs_offset, rhs_offset, size) {
+                return false;
+            }
+        }

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In arrow-data/src/equal/list_view.rs around lines 68 to 81, the loop zips
lhs_range_offsets and rhs_range_offsets with the entire lhs_sizes buffer,
causing size mismatches when lhs_start > 0; replace the zipped iteration with an
index-based loop over the sliced ranges (use the length of the range_offsets
slice), and for each index read lhs_range_offsets[i], rhs_range_offsets[i], and
the corresponding per-range size from the sliced lhs_sizes (or the correct
offset into lhs_sizes) converting each to usize and then call equal_values with
those values; ensure you iterate exactly over the number of ranges so offsets
and sizes stay aligned.

} else {
// Need to integrate validity check in the inner loop.
// non-null pathway: all sizes must be equal, and all values must be equal
let lhs_range_sizes = &lhs_sizes[lhs_start..lhs_start + len];
let rhs_range_sizes = &rhs_sizes[rhs_start..rhs_start + len];

let lhs_nulls = lhs.nulls().unwrap().slice(lhs_start, len);
let rhs_nulls = rhs.nulls().unwrap().slice(rhs_start, len);

// Sizes can differ if values are null
if lhs_range_sizes.len() != rhs_range_sizes.len() {
return false;
}

// Check values for equality, with null checking
let lhs_range_offsets = &lhs_offsets[lhs_start..lhs_start + len];
let rhs_range_offsets = &rhs_offsets[rhs_start..rhs_start + len];

if lhs_range_offsets.len() != rhs_range_offsets.len() {
return false;
}

for (index, ((&lhs_offset, &rhs_offset), &size)) in lhs_range_offsets
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the null-handling path, for non-null entries the per-element sizes of lhs and rhs should be equal before comparing values. Currently the loop uses only the lhs size when calling equal_values, which could mark two lists equal if rhs has extra values beyond the lhs size.

🤖 React with 👍 or 👎 to let us know if the comment was useful.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

value:annoying; category:bug; feedback:The length of the range offsets is compared earlier and it they do not match the equal_values() is not called at all. See https://github.com/martin-augment/arrow-rs/pull/8/files/d0da6c59482632690ff08dfd0c29d1f9de33c19f#diff-ba36cb5edf354d7753dceb65d0744b1bddf1ceb2ae1ef16b3417c83bc8d78aeeR92

.iter()
.zip(rhs_range_offsets)
.zip(lhs_sizes)
.enumerate()
{
let lhs_is_null = lhs_nulls.is_null(index);
let rhs_is_null = rhs_nulls.is_null(index);

if lhs_is_null != rhs_is_null {
return false;
}

let lhs_offset = lhs_offset.to_usize().unwrap();
let rhs_offset = rhs_offset.to_usize().unwrap();
let size = size.to_usize().unwrap();

// Check if values match in the range
if !lhs_is_null && !equal_values(lhs_data, rhs_data, lhs_offset, rhs_offset, size) {
return false;
}
}
Comment on lines +104 to +125
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Nullable path: enforce per-index size equality and fix the same zip bug.

When both sides are non-null you must first check sizes[i] equality; otherwise differing lengths can compare equal by truncation. Also switch to the sliced size range.

Apply:

-        for (index, ((&lhs_offset, &rhs_offset), &size)) in lhs_range_offsets
-            .iter()
-            .zip(rhs_range_offsets)
-            .zip(lhs_sizes)
-            .enumerate()
-        {
-            let lhs_is_null = lhs_nulls.is_null(index);
-            let rhs_is_null = rhs_nulls.is_null(index);
-
-            if lhs_is_null != rhs_is_null {
-                return false;
-            }
-
-            let lhs_offset = lhs_offset.to_usize().unwrap();
-            let rhs_offset = rhs_offset.to_usize().unwrap();
-            let size = size.to_usize().unwrap();
-
-            // Check if values match in the range
-            if !lhs_is_null && !equal_values(lhs_data, rhs_data, lhs_offset, rhs_offset, size) {
-                return false;
-            }
-        }
+        for index in 0..len {
+            let lhs_is_null = lhs_nulls.is_null(index);
+            let rhs_is_null = rhs_nulls.is_null(index);
+            if lhs_is_null != rhs_is_null {
+                return false;
+            }
+            if !lhs_is_null {
+                if lhs_range_sizes[index] != rhs_range_sizes[index] {
+                    return false;
+                }
+                let lhs_offset = match lhs_range_offsets[index].to_usize() {
+                    Some(v) => v,
+                    None => return false,
+                };
+                let rhs_offset = match rhs_range_offsets[index].to_usize() {
+                    Some(v) => v,
+                    None => return false,
+                };
+                let size = match lhs_range_sizes[index].to_usize() {
+                    Some(v) => v,
+                    None => return false,
+                };
+                if !equal_values(lhs_data, rhs_data, lhs_offset, rhs_offset, size) {
+                    return false;
+                }
+            }
+        }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
for (index, ((&lhs_offset, &rhs_offset), &size)) in lhs_range_offsets
.iter()
.zip(rhs_range_offsets)
.zip(lhs_sizes)
.enumerate()
{
let lhs_is_null = lhs_nulls.is_null(index);
let rhs_is_null = rhs_nulls.is_null(index);
if lhs_is_null != rhs_is_null {
return false;
}
let lhs_offset = lhs_offset.to_usize().unwrap();
let rhs_offset = rhs_offset.to_usize().unwrap();
let size = size.to_usize().unwrap();
// Check if values match in the range
if !lhs_is_null && !equal_values(lhs_data, rhs_data, lhs_offset, rhs_offset, size) {
return false;
}
}
for index in 0..len {
let lhs_is_null = lhs_nulls.is_null(index);
let rhs_is_null = rhs_nulls.is_null(index);
if lhs_is_null != rhs_is_null {
return false;
}
if !lhs_is_null {
if lhs_range_sizes[index] != rhs_range_sizes[index] {
return false;
}
let lhs_offset = match lhs_range_offsets[index].to_usize() {
Some(v) => v,
None => return false,
};
let rhs_offset = match rhs_range_offsets[index].to_usize() {
Some(v) => v,
None => return false,
};
let size = match lhs_range_sizes[index].to_usize() {
Some(v) => v,
None => return false,
};
if !equal_values(lhs_data, rhs_data, lhs_offset, rhs_offset, size) {
return false;
}
}
}
🤖 Prompt for AI Agents
In arrow-data/src/equal/list_view.rs around lines 104 to 125, the nullable
branch currently can compare lists of different lengths as equal and also uses
the wrong zip (causing misaligned size indexing); fix by iterating over the
three aligned iterators (lhs_range_offsets, rhs_range_offsets, and the sliced
sizes iterator) so each index gets its corresponding size, then for each index
when both sides are non-null first compare sizes[i] for equality and return
false if they differ, and only if sizes match call equal_values with the
per-index offsets and that size; ensure you use the correctly sliced sizes
iterator (not the whole lhs_sizes misaligned with zips) so indices line up.

}

true
}
10 changes: 5 additions & 5 deletions arrow-data/src/equal/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ mod dictionary;
mod fixed_binary;
mod fixed_list;
mod list;
mod list_view;
mod null;
mod primitive;
mod run;
Expand All @@ -41,6 +42,8 @@ mod variable_size;
// these methods assume the same type, len and null count.
// For this reason, they are not exposed and are instead used
// to build the generic functions below (`equal_range` and `equal`).
use self::run::run_equal;
use crate::equal::list_view::list_view_equal;
use boolean::boolean_equal;
use byte_view::byte_view_equal;
use dictionary::dictionary_equal;
Expand All @@ -53,8 +56,6 @@ use structure::struct_equal;
use union::union_equal;
use variable_size::variable_sized_equal;

use self::run::run_equal;

/// Compares the values of two [ArrayData] starting at `lhs_start` and `rhs_start` respectively
/// for `len` slots.
#[inline]
Expand Down Expand Up @@ -104,10 +105,9 @@ fn equal_values(
byte_view_equal(lhs, rhs, lhs_start, rhs_start, len)
}
DataType::List(_) => list_equal::<i32>(lhs, rhs, lhs_start, rhs_start, len),
DataType::ListView(_) | DataType::LargeListView(_) => {
unimplemented!("ListView/LargeListView not yet implemented")
}
DataType::LargeList(_) => list_equal::<i64>(lhs, rhs, lhs_start, rhs_start, len),
DataType::ListView(_) => list_view_equal::<i32>(lhs, rhs, lhs_start, rhs_start, len),
DataType::LargeListView(_) => list_view_equal::<i64>(lhs, rhs, lhs_start, rhs_start, len),
DataType::FixedSizeList(_, _) => fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len),
Comment on lines +109 to 111
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

ListView/LargeListView equality dispatch is correct; verify inner fn slices sizes correctly.

list_view_equal should iterate the sliced sizes for the range, not the full slice. Please confirm it zips with lhs_range_sizes/rhs_range_sizes, not the entire lhs_sizes.

If found, change:

  • zip(lhs_sizes) -> zip(lhs_range_sizes.iter()) (and analogously in the branch with enumerate()).

🏁 Script executed:

#!/bin/bash
# Locate potential misuse of full sizes slice inside list_view_equal
rg -nP 'list_view_equal.*\{(?s).*zip\(\s*lhs_sizes\s*\)' arrow-data/src/equal/list_view.rs -C2
rg -nP 'zip\(\s*lhs_sizes\s*\)' arrow-data/src/equal/list_view.rs -n -C3

Length of output: 589


This script will help retrieve the complete function implementation.


🏁 Script executed:

#!/bin/bash
# Extract the full list_view_equal function definition
rg -n 'fn list_view_equal' arrow-data/src/equal/list_view.rs -A 60

Length of output: 73


🏁 Script executed:

#!/bin/bash
# Extract the full list_view_equal function definition
rg -n 'fn list_view_equal' arrow-data/src/equal/list_view.rs -A 60

Length of output: 2225


🏁 Script executed:

#!/bin/bash
# Extract the rest of the function to see the null handling branch
rg -n 'fn list_view_equal' arrow-data/src/equal/list_view.rs -A 100 | tail -50

Length of output: 2071


Fix two instances of incorrect buffer slicing in list_view_equal.

The function correctly creates lhs_range_sizes and rhs_range_sizes slices at lines 49/85, but then incorrectly zips with the full lhs_sizes buffer at lines 71 and 107. This causes the loop to iterate over wrong indices when lhs_start > 0.

Required changes:

  • Line 71: .zip(lhs_sizes).zip(lhs_range_sizes.iter())
  • Line 107: .zip(lhs_sizes).zip(lhs_range_sizes.iter())
🤖 Prompt for AI Agents
In arrow-data/src/equal/mod.rs around lines 49–111, list_view_equal creates
lhs_range_sizes and rhs_range_sizes slices but later zips with the full
lhs_sizes buffer causing incorrect iteration when lhs_start > 0; change the two
zips at line 71 and line 107 to zip with lhs_range_sizes.iter() instead of
lhs_sizes so the loop iterates over the sliced ranges (i.e., replace
.zip(lhs_sizes) with .zip(lhs_range_sizes.iter()) at both places).

DataType::Struct(_) => struct_equal(lhs, rhs, lhs_start, rhs_start, len),
DataType::Union(_, _) => union_equal(lhs, rhs, lhs_start, rhs_start, len),
Expand Down
56 changes: 56 additions & 0 deletions arrow-data/src/transform/list_view.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use crate::ArrayData;
use crate::transform::_MutableArrayData;
use arrow_buffer::ArrowNativeType;
use num_integer::Integer;
use num_traits::CheckedAdd;

pub(super) fn build_extend<T: ArrowNativeType + Integer + CheckedAdd>(
array: &ArrayData,
) -> crate::transform::Extend<'_> {
let offsets = array.buffer::<T>(0);
let sizes = array.buffer::<T>(1);
Box::new(
move |mutable: &mut _MutableArrayData, _index: usize, start: usize, len: usize| {
let offset_buffer = &mut mutable.buffer1;
let sizes_buffer = &mut mutable.buffer2;

for &offset in &offsets[start..start + len] {
offset_buffer.push(offset);
}

// sizes
for &size in &sizes[start..start + len] {
sizes_buffer.push(size);
}

// the beauty of views is that we don't need to copy child_data, we just splat
// the offsets and sizes.
},
)
}

pub(super) fn extend_nulls<T: ArrowNativeType>(mutable: &mut _MutableArrayData, len: usize) {
let offset_buffer = &mut mutable.buffer1;
let sizes_buffer = &mut mutable.buffer2;

// We push 0 as a placeholder for NULL values in both the offsets and sizes
(0..len).for_each(|_| offset_buffer.push(T::default()));
(0..len).for_each(|_| sizes_buffer.push(T::default()));
}
33 changes: 21 additions & 12 deletions arrow-data/src/transform/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ mod boolean;
mod fixed_binary;
mod fixed_size_list;
mod list;
mod list_view;
mod null;
mod primitive;
mod run;
Expand Down Expand Up @@ -265,10 +266,9 @@ fn build_extend(array: &ArrayData) -> Extend<'_> {
DataType::LargeUtf8 | DataType::LargeBinary => variable_size::build_extend::<i64>(array),
DataType::BinaryView | DataType::Utf8View => unreachable!("should use build_extend_view"),
DataType::Map(_, _) | DataType::List(_) => list::build_extend::<i32>(array),
DataType::ListView(_) | DataType::LargeListView(_) => {
unimplemented!("ListView/LargeListView not implemented")
}
DataType::LargeList(_) => list::build_extend::<i64>(array),
DataType::ListView(_) => list_view::build_extend::<i32>(array),
DataType::LargeListView(_) => list_view::build_extend::<i64>(array),
DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"),
DataType::Struct(_) => structure::build_extend(array),
DataType::FixedSizeBinary(_) => fixed_binary::build_extend(array),
Expand Down Expand Up @@ -313,10 +313,9 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls {
DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::<i64>,
DataType::BinaryView | DataType::Utf8View => primitive::extend_nulls::<u128>,
DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::<i32>,
DataType::ListView(_) | DataType::LargeListView(_) => {
unimplemented!("ListView/LargeListView not implemented")
}
DataType::LargeList(_) => list::extend_nulls::<i64>,
DataType::ListView(_) => list_view::extend_nulls::<i32>,
DataType::LargeListView(_) => list_view::extend_nulls::<i64>,
DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() {
DataType::UInt8 => primitive::extend_nulls::<u8>,
DataType::UInt16 => primitive::extend_nulls::<u16>,
Expand Down Expand Up @@ -450,7 +449,11 @@ impl<'a> MutableArrayData<'a> {
new_buffers(data_type, *capacity)
}
(
DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _),
DataType::List(_)
| DataType::LargeList(_)
| DataType::ListView(_)
| DataType::LargeListView(_)
| DataType::FixedSizeList(_, _),
Capacities::List(capacity, _),
) => {
array_capacity = *capacity;
Expand Down Expand Up @@ -491,10 +494,11 @@ impl<'a> MutableArrayData<'a> {
| DataType::Utf8View
| DataType::Interval(_)
| DataType::FixedSizeBinary(_) => vec![],
DataType::ListView(_) | DataType::LargeListView(_) => {
unimplemented!("ListView/LargeListView not implemented")
}
DataType::Map(_, _) | DataType::List(_) | DataType::LargeList(_) => {
DataType::Map(_, _)
| DataType::List(_)
| DataType::LargeList(_)
| DataType::ListView(_)
| DataType::LargeListView(_) => {
let children = arrays
.iter()
.map(|array| &array.child_data()[0])
Expand Down Expand Up @@ -785,7 +789,12 @@ impl<'a> MutableArrayData<'a> {
b.insert(0, data.buffer1.into());
b
}
DataType::Utf8 | DataType::Binary | DataType::LargeUtf8 | DataType::LargeBinary => {
DataType::Utf8
| DataType::Binary
| DataType::LargeUtf8
| DataType::LargeBinary
| DataType::ListView(_)
| DataType::LargeListView(_) => {
vec![data.buffer1.into(), data.buffer2.into()]
}
DataType::Union(_, mode) => {
Expand Down
Loading
Loading