Merge remote-tracking branch 'upstream/main' into optimise-decimal-casting
aweltsch committed Feb 5, 2025
2 parents 50903f2 + 1019f5b commit 6599659
Showing 159 changed files with 7,744 additions and 1,157 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/parquet.yml
@@ -97,6 +97,8 @@ jobs:
run: cargo check -p parquet --no-default-features
- name: Check compilation --no-default-features --features arrow
run: cargo check -p parquet --no-default-features --features arrow
- name: Check compilation --no-default-features --features simdutf8
run: cargo check -p parquet --no-default-features --features simdutf8
- name: Check compilation --no-default-features --all-features
run: cargo check -p parquet --all-features
- name: Check compilation --all-targets
54 changes: 54 additions & 0 deletions .github/workflows/release.yml
@@ -0,0 +1,54 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Creates a github release on https://github.com/apache/arrow-rs/releases
# when a tag is pushed to the repository
name: Release
on:
push:
tags:
- '*'
- '!*-rc*'
permissions:
contents: write
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
jobs:
publish:
name: Publish
runs-on: ubuntu-latest
timeout-minutes: 5
steps:
- uses: actions/checkout@v4
- name: Create GitHub Releases
run: |
case "${GITHUB_REF_NAME}" in
object_store_*)
version=${GITHUB_REF_NAME#object_store_}
title="object_store ${version}"
notes_file=object_store/CHANGELOG.md
;;
*)
version=${GITHUB_REF_NAME}
title="arrow ${version}"
notes_file=CHANGELOG.md
;;
esac
gh release create ${GITHUB_REF_NAME} \
--title "${title}" \
--notes-file ${notes_file} \
--verify-tag
111 changes: 111 additions & 0 deletions CHANGELOG-old.md

Large diffs are not rendered by default.

173 changes: 74 additions & 99 deletions CHANGELOG.md

Large diffs are not rendered by default.

32 changes: 16 additions & 16 deletions Cargo.toml
@@ -62,7 +62,7 @@ exclude = [
]

[workspace.package]
version = "54.0.0"
version = "54.1.0"
homepage = "https://github.com/apache/arrow-rs"
repository = "https://github.com/apache/arrow-rs"
authors = ["Apache Arrow <[email protected]>"]
@@ -77,20 +77,20 @@ edition = "2021"
rust-version = "1.70"

[workspace.dependencies]
arrow = { version = "54.0.0", path = "./arrow", default-features = false }
arrow-arith = { version = "54.0.0", path = "./arrow-arith" }
arrow-array = { version = "54.0.0", path = "./arrow-array" }
arrow-buffer = { version = "54.0.0", path = "./arrow-buffer" }
arrow-cast = { version = "54.0.0", path = "./arrow-cast" }
arrow-csv = { version = "54.0.0", path = "./arrow-csv" }
arrow-data = { version = "54.0.0", path = "./arrow-data" }
arrow-ipc = { version = "54.0.0", path = "./arrow-ipc" }
arrow-json = { version = "54.0.0", path = "./arrow-json" }
arrow-ord = { version = "54.0.0", path = "./arrow-ord" }
arrow-row = { version = "54.0.0", path = "./arrow-row" }
arrow-schema = { version = "54.0.0", path = "./arrow-schema" }
arrow-select = { version = "54.0.0", path = "./arrow-select" }
arrow-string = { version = "54.0.0", path = "./arrow-string" }
parquet = { version = "54.0.0", path = "./parquet", default-features = false }
arrow = { version = "54.1.0", path = "./arrow", default-features = false }
arrow-arith = { version = "54.1.0", path = "./arrow-arith" }
arrow-array = { version = "54.1.0", path = "./arrow-array" }
arrow-buffer = { version = "54.1.0", path = "./arrow-buffer" }
arrow-cast = { version = "54.1.0", path = "./arrow-cast" }
arrow-csv = { version = "54.1.0", path = "./arrow-csv" }
arrow-data = { version = "54.1.0", path = "./arrow-data" }
arrow-ipc = { version = "54.1.0", path = "./arrow-ipc" }
arrow-json = { version = "54.1.0", path = "./arrow-json" }
arrow-ord = { version = "54.1.0", path = "./arrow-ord" }
arrow-row = { version = "54.1.0", path = "./arrow-row" }
arrow-schema = { version = "54.1.0", path = "./arrow-schema" }
arrow-select = { version = "54.1.0", path = "./arrow-select" }
arrow-string = { version = "54.1.0", path = "./arrow-string" }
parquet = { version = "54.1.0", path = "./parquet", default-features = false }

chrono = { version = "0.4.34", default-features = false, features = ["clock"] }
22 changes: 9 additions & 13 deletions README.md
@@ -53,7 +53,7 @@ as the [`parquet`] and [`parquet-derive`] crates.

This crate releases every month. We release new major versions (with potentially
breaking API changes) at most once a quarter, and release incremental minor
versions in the intervening months. See [this ticket] for more details.
versions in the intervening months. See [ticket #5368] for more details.

To keep our maintenance burden down, we do regularly scheduled releases (major
and minor) from the `main` branch. How we handle PRs with breaking API changes
@@ -63,16 +63,13 @@ is described in the [contributing] guide.

Planned Release Schedule

| Approximate Date | Version | Notes |
| ---------------- | -------- | ------------------------------------------ |
| Nov 2024 | `53.3.0` | Minor, NO breaking API changes |
| Dec 2024 | `54.0.0` | Major, potentially breaking API changes |
| Jan 2025 | `53.4.0` | Minor, NO breaking API changes (`53` line) |
| Jan 2025 | `54.1.0` | Minor, NO breaking API changes |
| Feb 2025 | `54.2.0` | Minor, NO breaking API changes |
| Mar 2025 | `55.0.0` | Major, potentially breaking API changes |
| Approximate Date | Version | Notes |
| ---------------- | -------- | --------------------------------------- |
| Jan 2025 | `54.1.0` | Minor, NO breaking API changes |
| Feb 2025 | `54.2.0` | Minor, NO breaking API changes |
| Mar 2025 | `55.0.0` | Major, potentially breaking API changes |

[this ticket]: https://github.com/apache/arrow-rs/issues/5368
[ticket #5368]: https://github.com/apache/arrow-rs/issues/5368
[semantic versioning]: https://semver.org/

### `object_store` crate
@@ -87,7 +84,6 @@ Planned Release Schedule

| Approximate Date | Version | Notes |
| ---------------- | -------- | --------------------------------------- |
| Dec 2024 | `0.11.2` | Minor, NO breaking API changes |
| Feb 2025 | `0.12.0` | Major, potentially breaking API changes |

### Guidelines for `panic` vs `Result`
@@ -96,9 +92,9 @@ In general, use panics for bad states that are unreachable, unrecoverable or har
For those caused by invalid user input, however, we prefer to report that invalidity
gracefully as an error result instead of panicking. In general, invalid input should result
in an `Error` as soon as possible. It _is_ ok for code paths after validation to assume
validation has already occurred and panic if not. See [this ticket] for more nuances.
validation has already occurred and panic if not. See [ticket #6737] for more nuances.

[this ticket]: https://github.com/apache/arrow-rs/issues/6737
[ticket #6737]: https://github.com/apache/arrow-rs/issues/6737
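
To make the guideline concrete, here is a minimal, editorially added sketch (the function names are hypothetical and not part of the arrow-rs codebase):

```rust
/// Invalid user input is reported gracefully as an `Err`, not a panic.
fn parse_positive(input: &str) -> Result<u32, String> {
    let n: u32 = input
        .parse()
        .map_err(|e| format!("'{input}' is not an integer: {e}"))?;
    if n == 0 {
        return Err("expected a positive integer, got 0".to_string());
    }
    Ok(n)
}

/// Code that runs *after* validation may assume validation already happened,
/// and it is ok for it to panic if that assumption is violated.
fn double_validated(n: u32) -> u32 {
    assert!(n > 0, "caller must pass an already-validated positive value");
    n * 2
}
```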

### Deprecation Guidelines

2 changes: 1 addition & 1 deletion arrow-array/Cargo.toml
@@ -55,7 +55,7 @@ ffi = ["arrow-schema/ffi", "arrow-data/ffi"]
force_validate = []

[dev-dependencies]
rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] }
rand = { version = "0.9", default-features = false, features = ["std", "std_rng", "thread_rng"] }
criterion = { version = "0.5", default-features = false }

[build-dependencies]
6 changes: 3 additions & 3 deletions arrow-array/benches/fixed_size_list_array.rs
@@ -18,13 +18,13 @@
use arrow_array::{Array, FixedSizeListArray, Int32Array};
use arrow_schema::Field;
use criterion::*;
use rand::{thread_rng, Rng};
use rand::{rng, Rng};
use std::sync::Arc;

fn gen_fsl(len: usize, value_len: usize) -> FixedSizeListArray {
let mut rng = thread_rng();
let mut rng = rng();
let values = Arc::new(Int32Array::from(
(0..len).map(|_| rng.gen::<i32>()).collect::<Vec<_>>(),
(0..len).map(|_| rng.random::<i32>()).collect::<Vec<_>>(),
));
let field = Arc::new(Field::new_list_field(values.data_type().clone(), true));
FixedSizeListArray::new(field, value_len as i32, values, None)
8 changes: 4 additions & 4 deletions arrow-array/benches/occupancy.rs
@@ -19,7 +19,7 @@ use arrow_array::types::Int32Type;
use arrow_array::{DictionaryArray, Int32Array};
use arrow_buffer::NullBuffer;
use criterion::*;
use rand::{thread_rng, Rng};
use rand::{rng, Rng};
use std::sync::Arc;

fn gen_dict(
@@ -28,11 +28,11 @@ fn gen_dict(
occupancy: f64,
null_percent: f64,
) -> DictionaryArray<Int32Type> {
let mut rng = thread_rng();
let mut rng = rng();
let values = Int32Array::from(vec![0; values_len]);
let max_key = (values_len as f64 * occupancy) as i32;
let keys = (0..len).map(|_| rng.gen_range(0..max_key)).collect();
let nulls = (0..len).map(|_| !rng.gen_bool(null_percent)).collect();
let keys = (0..len).map(|_| rng.random_range(0..max_key)).collect();
let nulls = (0..len).map(|_| !rng.random_bool(null_percent)).collect();

let keys = Int32Array::new(keys, Some(NullBuffer::new(nulls)));
DictionaryArray::new(keys, Arc::new(values))
12 changes: 6 additions & 6 deletions arrow-array/benches/union_array.rs
@@ -24,27 +24,27 @@ use arrow_array::{Array, ArrayRef, Int32Array, UnionArray};
use arrow_buffer::{NullBuffer, ScalarBuffer};
use arrow_schema::{DataType, Field, UnionFields};
use criterion::*;
use rand::{thread_rng, Rng};
use rand::{rng, Rng};

fn array_with_nulls() -> ArrayRef {
let mut rng = thread_rng();
let mut rng = rng();

let values = ScalarBuffer::from_iter(repeat_with(|| rng.gen()).take(4096));
let values = ScalarBuffer::from_iter(repeat_with(|| rng.random()).take(4096));

// nulls with at least one null and one valid
let nulls: NullBuffer = [true, false]
.into_iter()
.chain(repeat_with(|| rng.gen()))
.chain(repeat_with(|| rng.random()))
.take(4096)
.collect();

Arc::new(Int32Array::new(values.clone(), Some(nulls)))
}

fn array_without_nulls() -> ArrayRef {
let mut rng = thread_rng();
let mut rng = rng();

let values = ScalarBuffer::from_iter(repeat_with(|| rng.gen()).take(4096));
let values = ScalarBuffer::from_iter(repeat_with(|| rng.random()).take(4096));

Arc::new(Int32Array::new(values.clone(), None))
}
8 changes: 4 additions & 4 deletions arrow-array/src/array/boolean_array.rs
@@ -479,7 +479,7 @@ impl From<BooleanBuffer> for BooleanArray {
mod tests {
use super::*;
use arrow_buffer::Buffer;
use rand::{thread_rng, Rng};
use rand::{rng, Rng};

#[test]
fn test_boolean_fmt_debug() {
@@ -667,11 +667,11 @@ mod tests {
#[test]
#[cfg_attr(miri, ignore)] // Takes too long
fn test_true_false_count() {
let mut rng = thread_rng();
let mut rng = rng();

for _ in 0..10 {
// No nulls
let d: Vec<_> = (0..2000).map(|_| rng.gen_bool(0.5)).collect();
let d: Vec<_> = (0..2000).map(|_| rng.random_bool(0.5)).collect();
let b = BooleanArray::from(d.clone());

let expected_true = d.iter().filter(|x| **x).count();
@@ -680,7 +680,7 @@

// With nulls
let d: Vec<_> = (0..2000)
.map(|_| rng.gen_bool(0.5).then(|| rng.gen_bool(0.5)))
.map(|_| rng.random_bool(0.5).then(|| rng.random_bool(0.5)))
.collect();
let b = BooleanArray::from(d.clone());

63 changes: 58 additions & 5 deletions arrow-array/src/array/list_array.rs
@@ -55,7 +55,9 @@ impl OffsetSizeTrait for i64 {
}

/// An array of [variable length lists], similar to JSON arrays
/// (e.g. `["A", "B", "C"]`).
/// (e.g. `["A", "B", "C"]`). This struct specifically represents
/// the [list layout]. Refer to [`GenericListViewArray`] for the
/// [list-view layout].
///
/// Lists are represented using `offsets` into a `values` child
/// array. Offsets are stored in two adjacent entries of an
@@ -118,12 +120,48 @@ impl OffsetSizeTrait for i64 {
/// (offsets[i], │ ListArray (Array)
/// offsets[i+1]) └ ─ ─ ─ ─ ─ ─ ┘ │
/// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
/// ```
///
/// # Slicing
///
/// Slicing a `ListArray` creates a new `ListArray` without copying any data,
/// but this means the [`Self::values`] and [`Self::offsets`] may have "unused" data.
///
/// For example, calling `slice(1, 3)` on the `ListArray` in the above example
/// would result in the following. Note that:
///
/// 1. `Values` array is unchanged
/// 2. `Offsets` do not start at `0`, nor cover all values in the Values array.
///
/// ```text
/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
/// ┌ ─ ─ ─ ─ ─ ─ ┐ │ ╔═══╗
/// │ ╔═══╗ ╔═══╗ ║ ║ Not used
/// │ ║ 1 ║ ║ A ║ │ 0 │ ╚═══╝
/// ┌─────────────┐ ┌───────┐ │ ┌───┐ ┌───┐ ╠═══╣ ╠═══╣
/// │ [] (empty) │ │ (3,3) │ │ 1 │ │ 3 │ │ ║ 1 ║ ║ B ║ │ 1 │
/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ╠═══╣ ╠═══╣
/// │ NULL │ │ (3,4) │ │ 0 │ │ 3 │ │ ║ 1 ║ ║ C ║ │ 2 │
/// ├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ╠───╣ ╠───╣
/// │ [D] │ │ (4,5) │ │ 1 │ │ 4 │ │ │ 0 │ │ ? │ │ 3 │
/// └─────────────┘ └───────┘ │ └───┘ ├───┤ ├───┤ ├───┤
/// │ 5 │ │ │ 1 │ │ D │ │ 4 │
/// │ └───┘ ├───┤ ├───┤
/// │ │ 0 │ │ ? │ │ 5 │
/// │ Validity ╠═══╣ ╠═══╣
/// Logical Logical (nulls) Offsets │ ║ 1 ║ ║ F ║ │ 6 │
/// Values Offsets │ ╚═══╝ ╚═══╝
/// │ Values │ │
/// (offsets[i], │ ListArray (Array)
/// offsets[i+1]) └ ─ ─ ─ ─ ─ ─ ┘ │
/// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
/// ```
///
/// [`StringArray`]: crate::array::StringArray
/// [`GenericListViewArray`]: crate::array::GenericListViewArray
/// [variable length lists]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout
/// [list layout]: https://arrow.apache.org/docs/format/Columnar.html#list-layout
/// [list-view layout]: https://arrow.apache.org/docs/format/Columnar.html#listview-layout
pub struct GenericListArray<OffsetSize: OffsetSizeTrait> {
data_type: DataType,
nulls: Option<NullBuffer>,
@@ -258,13 +296,22 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
/// Returns a reference to the offsets of this list
///
/// Unlike [`Self::value_offsets`] this returns the [`OffsetBuffer`]
/// allowing for zero-copy cloning
/// allowing for zero-copy cloning.
///
/// Note: The `offsets` may not start at 0 and may not cover all values in
/// [`Self::values`]. This can happen when the list array was sliced via
/// [`Self::slice`]. See documentation for [`Self`] for more details.
#[inline]
pub fn offsets(&self) -> &OffsetBuffer<OffsetSize> {
&self.value_offsets
}

/// Returns a reference to the values of this list
///
/// Note: The list array may not refer to all values in the `values` array.
/// For example, if the list array was sliced via [`Self::slice`], `values` will
/// still contain entries both before and after the slice. See the documentation
/// for [`Self`] for more details.
#[inline]
pub fn values(&self) -> &ArrayRef {
&self.values
Expand All @@ -291,7 +338,9 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
self.values.slice(start, end - start)
}

/// Returns the offset values in the offsets buffer
/// Returns the offset values in the offsets buffer.
///
/// See [`Self::offsets`] for more details.
#[inline]
pub fn value_offsets(&self) -> &[OffsetSize] {
&self.value_offsets
Expand Down Expand Up @@ -320,6 +369,10 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
}

/// Returns a zero-copy slice of this array with the indicated offset and length.
///
/// Note: this method does *NOT* slice the underlying values array or modify
/// the values in the offsets buffer. See [`Self::values`] and
/// [`Self::offsets`] for more information.
pub fn slice(&self, offset: usize, length: usize) -> Self {
Self {
data_type: self.data_type.clone(),
@@ -551,12 +604,12 @@ impl<OffsetSize: OffsetSizeTrait> std::fmt::Debug for GenericListArray<OffsetSiz

/// A [`GenericListArray`] of variable size lists, storing offsets as `i32`.
///
// See [`ListBuilder`](crate::builder::ListBuilder) for how to construct a [`ListArray`]
/// See [`ListBuilder`](crate::builder::ListBuilder) for how to construct a [`ListArray`]
pub type ListArray = GenericListArray<i32>;

/// A [`GenericListArray`] of variable size lists, storing offsets as `i64`.
///
// See [`LargeListBuilder`](crate::builder::LargeListBuilder) for how to construct a [`LargeListArray`]
/// See [`LargeListBuilder`](crate::builder::LargeListBuilder) for how to construct a [`LargeListArray`]
pub type LargeListArray = GenericListArray<i64>;

#[cfg(test)]