Merge remote-tracking branch 'upstream/main' into optimise-decimal-casting
aweltsch committed Feb 5, 2025
2 parents 50903f2 + 1019f5b commit 6599659
Showing 159 changed files with 7,744 additions and 1,157 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/parquet.yml
@@ -97,6 +97,8 @@ jobs:
run: cargo check -p parquet --no-default-features
- name: Check compilation --no-default-features --features arrow
run: cargo check -p parquet --no-default-features --features arrow
- name: Check compilation --no-default-features --features simdutf8
run: cargo check -p parquet --no-default-features --features simdutf8
- name: Check compilation --no-default-features --all-features
run: cargo check -p parquet --all-features
- name: Check compilation --all-targets
54 changes: 54 additions & 0 deletions .github/workflows/release.yml
@@ -0,0 +1,54 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Creates a github release on https://github.com/apache/arrow-rs/releases
# when a tag is pushed to the repository
name: Release
on:
push:
tags:
- '*'
- '!*-rc*'
permissions:
contents: write
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
jobs:
publish:
name: Publish
runs-on: ubuntu-latest
timeout-minutes: 5
steps:
- uses: actions/checkout@v4
- name: Create GitHub Releases
run: |
case "${GITHUB_REF_NAME}" in
object_store_*)
version=${GITHUB_REF_NAME#object_store_}
title="object_store ${version}"
notes_file=object_store/CHANGELOG.md
;;
*)
version=${GITHUB_REF_NAME}
title="arrow ${version}"
notes_file=CHANGELOG.md
;;
esac
gh release create ${GITHUB_REF_NAME} \
--title "${title}" \
--notes-file ${notes_file} \
--verify-tag
111 changes: 111 additions & 0 deletions CHANGELOG-old.md

Large diffs are not rendered by default.

173 changes: 74 additions & 99 deletions CHANGELOG.md

Large diffs are not rendered by default.

32 changes: 16 additions & 16 deletions Cargo.toml
@@ -62,7 +62,7 @@ exclude = [
]

[workspace.package]
version = "54.0.0"
version = "54.1.0"
homepage = "https://github.com/apache/arrow-rs"
repository = "https://github.com/apache/arrow-rs"
authors = ["Apache Arrow <[email protected]>"]
@@ -77,20 +77,20 @@ edition = "2021"
rust-version = "1.70"

[workspace.dependencies]
arrow = { version = "54.0.0", path = "./arrow", default-features = false }
arrow-arith = { version = "54.0.0", path = "./arrow-arith" }
arrow-array = { version = "54.0.0", path = "./arrow-array" }
arrow-buffer = { version = "54.0.0", path = "./arrow-buffer" }
arrow-cast = { version = "54.0.0", path = "./arrow-cast" }
arrow-csv = { version = "54.0.0", path = "./arrow-csv" }
arrow-data = { version = "54.0.0", path = "./arrow-data" }
arrow-ipc = { version = "54.0.0", path = "./arrow-ipc" }
arrow-json = { version = "54.0.0", path = "./arrow-json" }
arrow-ord = { version = "54.0.0", path = "./arrow-ord" }
arrow-row = { version = "54.0.0", path = "./arrow-row" }
arrow-schema = { version = "54.0.0", path = "./arrow-schema" }
arrow-select = { version = "54.0.0", path = "./arrow-select" }
arrow-string = { version = "54.0.0", path = "./arrow-string" }
parquet = { version = "54.0.0", path = "./parquet", default-features = false }
arrow = { version = "54.1.0", path = "./arrow", default-features = false }
arrow-arith = { version = "54.1.0", path = "./arrow-arith" }
arrow-array = { version = "54.1.0", path = "./arrow-array" }
arrow-buffer = { version = "54.1.0", path = "./arrow-buffer" }
arrow-cast = { version = "54.1.0", path = "./arrow-cast" }
arrow-csv = { version = "54.1.0", path = "./arrow-csv" }
arrow-data = { version = "54.1.0", path = "./arrow-data" }
arrow-ipc = { version = "54.1.0", path = "./arrow-ipc" }
arrow-json = { version = "54.1.0", path = "./arrow-json" }
arrow-ord = { version = "54.1.0", path = "./arrow-ord" }
arrow-row = { version = "54.1.0", path = "./arrow-row" }
arrow-schema = { version = "54.1.0", path = "./arrow-schema" }
arrow-select = { version = "54.1.0", path = "./arrow-select" }
arrow-string = { version = "54.1.0", path = "./arrow-string" }
parquet = { version = "54.1.0", path = "./parquet", default-features = false }

chrono = { version = "0.4.34", default-features = false, features = ["clock"] }
22 changes: 9 additions & 13 deletions README.md
@@ -53,7 +53,7 @@ as the [`parquet`] and [`parquet-derive`] crates.

This crate releases every month. We release new major versions (with potentially
breaking API changes) at most once a quarter, and release incremental minor
versions in the intervening months. See [this ticket] for more details.
versions in the intervening months. See [ticket #5368] for more details.

To keep our maintenance burden down, we do regularly scheduled releases (major
and minor) from the `main` branch. How we handle PRs with breaking API changes
@@ -63,16 +63,13 @@ is described in the [contributing] guide.

Planned Release Schedule

| Approximate Date | Version | Notes |
| ---------------- | -------- | ------------------------------------------ |
| Nov 2024 | `53.3.0` | Minor, NO breaking API changes |
| Dec 2024 | `54.0.0` | Major, potentially breaking API changes |
| Jan 2025 | `53.4.0` | Minor, NO breaking API changes (`53` line) |
| Jan 2025 | `54.1.0` | Minor, NO breaking API changes |
| Feb 2025 | `54.2.0` | Minor, NO breaking API changes |
| Mar 2025 | `55.0.0` | Major, potentially breaking API changes |
| Approximate Date | Version | Notes |
| ---------------- | -------- | --------------------------------------- |
| Jan 2025 | `54.1.0` | Minor, NO breaking API changes |
| Feb 2025 | `54.2.0` | Minor, NO breaking API changes |
| Mar 2025 | `55.0.0` | Major, potentially breaking API changes |

[this ticket]: https://github.com/apache/arrow-rs/issues/5368
[ticket #5368]: https://github.com/apache/arrow-rs/issues/5368
[semantic versioning]: https://semver.org/

### `object_store` crate
@@ -87,7 +84,6 @@ Planned Release Schedule

| Approximate Date | Version | Notes |
| ---------------- | -------- | --------------------------------------- |
| Dec 2024 | `0.11.2` | Minor, NO breaking API changes |
| Feb 2025 | `0.12.0` | Major, potentially breaking API changes |

### Guidelines for `panic` vs `Result`
@@ -96,9 +92,9 @@ In general, use panics for bad states that are unreachable, unrecoverable or har
For those caused by invalid user input, however, we prefer to report that invalidity
gracefully as an error result instead of panicking. In general, invalid input should result
in an `Error` as soon as possible. It _is_ ok for code paths after validation to assume
validation has already occurred and panic if not. See [this ticket] for more nuances.
validation has already occurred and panic if not. See [ticket #6737] for more nuances.

[this ticket]: https://github.com/apache/arrow-rs/issues/6737
[ticket #6737]: https://github.com/apache/arrow-rs/issues/6737
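
To make the guideline concrete, here is a minimal, editorially added sketch (the function names are hypothetical and not part of the arrow-rs codebase):

```rust
/// Invalid user input is reported gracefully as an `Err`, not a panic.
fn parse_positive(input: &str) -> Result<u32, String> {
    let n: u32 = input
        .parse()
        .map_err(|e| format!("'{input}' is not an integer: {e}"))?;
    if n == 0 {
        return Err("expected a positive integer, got 0".to_string());
    }
    Ok(n)
}

/// Code that runs *after* validation may assume validation already happened,
/// and it is ok for it to panic if that assumption is violated.
fn double_validated(n: u32) -> u32 {
    assert!(n > 0, "caller must pass an already-validated positive value");
    n * 2
}
```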

### Deprecation Guidelines

2 changes: 1 addition & 1 deletion arrow-array/Cargo.toml
@@ -55,7 +55,7 @@ ffi = ["arrow-schema/ffi", "arrow-data/ffi"]
force_validate = []

[dev-dependencies]
rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] }
rand = { version = "0.9", default-features = false, features = ["std", "std_rng", "thread_rng"] }
criterion = { version = "0.5", default-features = false }

[build-dependencies]
6 changes: 3 additions & 3 deletions arrow-array/benches/fixed_size_list_array.rs
@@ -18,13 +18,13 @@
use arrow_array::{Array, FixedSizeListArray, Int32Array};
use arrow_schema::Field;
use criterion::*;
use rand::{thread_rng, Rng};
use rand::{rng, Rng};
use std::sync::Arc;

fn gen_fsl(len: usize, value_len: usize) -> FixedSizeListArray {
let mut rng = thread_rng();
let mut rng = rng();
let values = Arc::new(Int32Array::from(
(0..len).map(|_| rng.gen::<i32>()).collect::<Vec<_>>(),
(0..len).map(|_| rng.random::<i32>()).collect::<Vec<_>>(),
));
let field = Arc::new(Field::new_list_field(values.data_type().clone(), true));
FixedSizeListArray::new(field, value_len as i32, values, None)
8 changes: 4 additions & 4 deletions arrow-array/benches/occupancy.rs
@@ -19,7 +19,7 @@ use arrow_array::types::Int32Type;
use arrow_array::{DictionaryArray, Int32Array};
use arrow_buffer::NullBuffer;
use criterion::*;
use rand::{thread_rng, Rng};
use rand::{rng, Rng};
use std::sync::Arc;

fn gen_dict(
@@ -28,11 +28,11 @@ fn gen_dict(
occupancy: f64,
null_percent: f64,
) -> DictionaryArray<Int32Type> {
let mut rng = thread_rng();
let mut rng = rng();
let values = Int32Array::from(vec![0; values_len]);
let max_key = (values_len as f64 * occupancy) as i32;
let keys = (0..len).map(|_| rng.gen_range(0..max_key)).collect();
let nulls = (0..len).map(|_| !rng.gen_bool(null_percent)).collect();
let keys = (0..len).map(|_| rng.random_range(0..max_key)).collect();
let nulls = (0..len).map(|_| !rng.random_bool(null_percent)).collect();

let keys = Int32Array::new(keys, Some(NullBuffer::new(nulls)));
DictionaryArray::new(keys, Arc::new(values))
12 changes: 6 additions & 6 deletions arrow-array/benches/union_array.rs
@@ -24,27 +24,27 @@ use arrow_array::{Array, ArrayRef, Int32Array, UnionArray};
use arrow_buffer::{NullBuffer, ScalarBuffer};
use arrow_schema::{DataType, Field, UnionFields};
use criterion::*;
use rand::{thread_rng, Rng};
use rand::{rng, Rng};

fn array_with_nulls() -> ArrayRef {
let mut rng = thread_rng();
let mut rng = rng();

let values = ScalarBuffer::from_iter(repeat_with(|| rng.gen()).take(4096));
let values = ScalarBuffer::from_iter(repeat_with(|| rng.random()).take(4096));

// nulls with at least one null and one valid
let nulls: NullBuffer = [true, false]
.into_iter()
.chain(repeat_with(|| rng.gen()))
.chain(repeat_with(|| rng.random()))
.take(4096)
.collect();

Arc::new(Int32Array::new(values.clone(), Some(nulls)))
}

fn array_without_nulls() -> ArrayRef {
let mut rng = thread_rng();
let mut rng = rng();

let values = ScalarBuffer::from_iter(repeat_with(|| rng.gen()).take(4096));
let values = ScalarBuffer::from_iter(repeat_with(|| rng.random()).take(4096));

Arc::new(Int32Array::new(values.clone(), None))
}
8 changes: 4 additions & 4 deletions arrow-array/src/array/boolean_array.rs
@@ -479,7 +479,7 @@ impl From<BooleanBuffer> for BooleanArray {
mod tests {
use super::*;
use arrow_buffer::Buffer;
use rand::{thread_rng, Rng};
use rand::{rng, Rng};

#[test]
fn test_boolean_fmt_debug() {
@@ -667,11 +667,11 @@ mod tests {
#[test]
#[cfg_attr(miri, ignore)] // Takes too long
fn test_true_false_count() {
let mut rng = thread_rng();
let mut rng = rng();

for _ in 0..10 {
// No nulls
let d: Vec<_> = (0..2000).map(|_| rng.gen_bool(0.5)).collect();
let d: Vec<_> = (0..2000).map(|_| rng.random_bool(0.5)).collect();
let b = BooleanArray::from(d.clone());

let expected_true = d.iter().filter(|x| **x).count();
@@ -680,7 +680,7 @@

// With nulls
let d: Vec<_> = (0..2000)
.map(|_| rng.gen_bool(0.5).then(|| rng.gen_bool(0.5)))
.map(|_| rng.random_bool(0.5).then(|| rng.random_bool(0.5)))
.collect();
let b = BooleanArray::from(d.clone());

63 changes: 58 additions & 5 deletions arrow-array/src/array/list_array.rs
@@ -55,7 +55,9 @@ impl OffsetSizeTrait for i64 {
}

/// An array of [variable length lists], similar to JSON arrays
/// (e.g. `["A", "B", "C"]`).
/// (e.g. `["A", "B", "C"]`). This struct specifically represents
/// the [list layout]. Refer to [`GenericListViewArray`] for the
/// [list-view layout].
///
/// Lists are represented using `offsets` into a `values` child
/// array. Offsets are stored in two adjacent entries of an
@@ -118,12 +120,48 @@ impl OffsetSizeTrait for i64 {
/// (offsets[i], │ ListArray (Array)
/// offsets[i+1]) └ ─ ─ ─ ─ ─ ─ ┘ │
/// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
/// ```
///
/// # Slicing
///
/// Slicing a `ListArray` creates a new `ListArray` without copying any data,
/// but this means the [`Self::values`] and [`Self::offsets`] may have "unused" data.
///
/// For example, calling `slice(1, 3)` on the `ListArray` in the above example
/// would result in the following. Note that:
///
/// 1. `Values` array is unchanged
/// 2. `Offsets` do not start at `0`, nor cover all values in the Values array.
///
/// ```text
/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
/// ┌ ─ ─ ─ ─ ─ ─ ┐ │ ╔═══╗
/// │ ╔═══╗ ╔═══╗ ║ ║ Not used
/// │ ║ 1 ║ ║ A ║ │ 0 │ ╚═══╝
/// ┌─────────────┐ ┌───────┐ │ ┌───┐ ┌───┐ ╠═══╣ ╠═══╣
/// │ [] (empty) │ │ (3,3) │ │ 1 │ │ 3 │ │ ║ 1 ║ ║ B ║ │ 1 │
/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ╠═══╣ ╠═══╣
/// │ NULL │ │ (3,4) │ │ 0 │ │ 3 │ │ ║ 1 ║ ║ C ║ │ 2 │
/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ╠───╣ ╠───╣
/// │ [D] │ │ (4,5) │ │ 1 │ │ 4 │ │ │ 0 │ │ ? │ │ 3 │
/// └─────────────┘ └───────┘ │ └───┘ ├───┤ ├───┤ ├───┤
/// │ 5 │ │ │ 1 │ │ D │ │ 4 │
/// │ └───┘ ├───┤ ├───┤
/// │ │ 0 │ │ ? │ │ 5 │
/// │ Validity ╠═══╣ ╠═══╣
/// Logical Logical (nulls) Offsets │ ║ 1 ║ ║ F ║ │ 6 │
/// Values Offsets │ ╚═══╝ ╚═══╝
/// │ Values │ │
/// (offsets[i], │ ListArray (Array)
/// offsets[i+1]) └ ─ ─ ─ ─ ─ ─ ┘ │
/// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
/// ```
///
/// [`StringArray`]: crate::array::StringArray
/// [`GenericListViewArray`]: crate::array::GenericListViewArray
/// [variable length lists]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout
/// [list layout]: https://arrow.apache.org/docs/format/Columnar.html#list-layout
/// [list-view layout]: https://arrow.apache.org/docs/format/Columnar.html#listview-layout
pub struct GenericListArray<OffsetSize: OffsetSizeTrait> {
data_type: DataType,
nulls: Option<NullBuffer>,
@@ -258,13 +296,22 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
/// Returns a reference to the offsets of this list
///
/// Unlike [`Self::value_offsets`] this returns the [`OffsetBuffer`]
/// allowing for zero-copy cloning
/// allowing for zero-copy cloning.
///
/// Note: The `offsets` may not start at 0 and may not cover all values in
/// [`Self::values`]. This can happen when the list array was sliced via
/// [`Self::slice`]. See documentation for [`Self`] for more details.
#[inline]
pub fn offsets(&self) -> &OffsetBuffer<OffsetSize> {
&self.value_offsets
}

/// Returns a reference to the values of this list
///
/// Note: The list array may not refer to all values in the `values` array.
/// For example, if the list array was sliced via [`Self::slice`], `values` will
/// still contain entries both before and after the slice. See the documentation
/// for [`Self`] for more details.
#[inline]
pub fn values(&self) -> &ArrayRef {
&self.values
Expand All @@ -291,7 +338,9 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
self.values.slice(start, end - start)
}

/// Returns the offset values in the offsets buffer
/// Returns the offset values in the offsets buffer.
///
/// See [`Self::offsets`] for more details.
#[inline]
pub fn value_offsets(&self) -> &[OffsetSize] {
&self.value_offsets
Expand Down Expand Up @@ -320,6 +369,10 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
}

/// Returns a zero-copy slice of this array with the indicated offset and length.
///
/// Note: this method does *NOT* slice the underlying values array or modify
/// the values in the offsets buffer. See [`Self::values`] and
/// [`Self::offsets`] for more information.
pub fn slice(&self, offset: usize, length: usize) -> Self {
Self {
data_type: self.data_type.clone(),
@@ -551,12 +604,12 @@ impl<OffsetSize: OffsetSizeTrait> std::fmt::Debug for GenericListArray<OffsetSiz

/// A [`GenericListArray`] of variable size lists, storing offsets as `i32`.
///
// See [`ListBuilder`](crate::builder::ListBuilder) for how to construct a [`ListArray`]
/// See [`ListBuilder`](crate::builder::ListBuilder) for how to construct a [`ListArray`]
pub type ListArray = GenericListArray<i32>;

/// A [`GenericListArray`] of variable size lists, storing offsets as `i64`.
///
// See [`LargeListBuilder`](crate::builder::LargeListBuilder) for how to construct a [`LargeListArray`]
/// See [`LargeListBuilder`](crate::builder::LargeListBuilder) for how to construct a [`LargeListArray`]
pub type LargeListArray = GenericListArray<i64>;

#[cfg(test)]