Commit d5cda4a

GH-44084: [C++] Improve merge step in chunked sorting (apache#44217)
### Rationale for this change

When merge-sorting the chunks of a chunked array or table, we would currently repeatedly resolve the chunk indices for each individual value lookup. This requires `O(n*log k)` chunk resolutions, with `n` being the chunked array or table length and `k` the number of chunks. Instead, this PR translates the logical indices to physical indices all at once, without even requiring expensive chunk resolution, as the logical indices are initially chunk-partitioned.

This change yields significant speedups on chunked array and table sorting:

```
benchmark  baseline  contender  change %
ChunkedArraySortIndicesInt64Narrow/1048576/100  345.419 MiB/sec  628.334 MiB/sec  81.905
TableSortIndicesInt64Narrow/1048576/0/1/32  25.997M items/sec  44.550M items/sec  71.366
ChunkedArraySortIndicesInt64Wide/32768/10000  91.182 MiB/sec  153.756 MiB/sec  68.625
ChunkedArraySortIndicesInt64Wide/32768/10  96.536 MiB/sec  161.648 MiB/sec  67.449
TableSortIndicesInt64Narrow/1048576/100/1/32  24.290M items/sec  40.513M items/sec  66.791
ChunkedArraySortIndicesInt64Wide/32768/100  90.030 MiB/sec  149.633 MiB/sec  66.203
ChunkedArraySortIndicesInt64Wide/32768/0  91.982 MiB/sec  152.840 MiB/sec  66.163
ChunkedArraySortIndicesInt64Narrow/8388608/100  240.335 MiB/sec  387.423 MiB/sec  61.201
ChunkedArraySortIndicesInt64Wide/32768/2  172.376 MiB/sec  274.133 MiB/sec  59.032
TableSortIndicesInt64Wide/1048576/4/1/32  7.407M items/sec  11.621M items/sec  56.904
TableSortIndicesInt64Wide/1048576/100/1/32  5.788M items/sec  9.062M items/sec  56.565
TableSortIndicesInt64Wide/1048576/0/1/32  5.785M items/sec  9.049M items/sec  56.409
ChunkedArraySortIndicesInt64Narrow/32768/2  194.743 MiB/sec  291.432 MiB/sec  49.649
TableSortIndicesInt64Narrow/1048576/4/1/32  25.686M items/sec  38.087M items/sec  48.279
TableSortIndicesInt64Wide/1048576/0/8/32  5.766M items/sec  8.374M items/sec  45.240
TableSortIndicesInt64Wide/1048576/0/16/32  5.752M items/sec  8.352M items/sec  45.202
ChunkedArraySortIndicesInt64Narrow/32768/10000  121.253 MiB/sec  175.286 MiB/sec  44.562
TableSortIndicesInt64Wide/1048576/100/2/32  5.549M items/sec  7.984M items/sec  43.876
ChunkedArraySortIndicesInt64Wide/1048576/100  69.599 MiB/sec  99.666 MiB/sec  43.200
TableSortIndicesInt64Narrow/1048576/0/1/4  55.940M items/sec  79.984M items/sec  42.982
TableSortIndicesInt64Wide/1048576/100/16/32  5.554M items/sec  7.909M items/sec  42.417
ChunkedArraySortIndicesInt64Narrow/32768/10  127.758 MiB/sec  181.407 MiB/sec  41.992
TableSortIndicesInt64Wide/1048576/100/8/32  5.572M items/sec  7.775M items/sec  39.548
ChunkedArraySortIndicesInt64Narrow/32768/100  119.600 MiB/sec  166.454 MiB/sec  39.176
TableSortIndicesInt64Wide/1048576/0/2/32  5.781M items/sec  8.016M items/sec  38.669
TableSortIndicesInt64Narrow/1048576/100/1/4  52.252M items/sec  72.193M items/sec  38.162
ChunkedArraySortIndicesInt64Narrow/32768/0  121.868 MiB/sec  168.364 MiB/sec  38.152
TableSortIndicesInt64Wide/1048576/4/2/32  5.017M items/sec  6.720M items/sec  33.934
ChunkedArraySortIndicesInt64Wide/8388608/100  54.785 MiB/sec  72.642 MiB/sec  32.593
TableSortIndicesInt64Wide/1048576/4/8/32  4.222M items/sec  5.483M items/sec  29.861
ChunkedArraySortIndicesString/32768/10  146.866 MiB/sec  190.314 MiB/sec  29.583
TableSortIndicesInt64Wide/1048576/4/16/32  4.225M items/sec  5.433M items/sec  28.599
TableSortIndicesInt64Narrow/1048576/100/16/32  2.193M items/sec  2.711M items/sec  23.652
ChunkedArraySortIndicesString/32768/100  156.401 MiB/sec  191.910 MiB/sec  22.704
TableSortIndicesInt64Narrow/1048576/4/1/4  47.342M items/sec  58.062M items/sec  22.644
ChunkedArraySortIndicesString/32768/0  161.457 MiB/sec  195.782 MiB/sec  21.259
TableSortIndicesInt64Narrow/1048576/4/16/32  1.915M items/sec  2.309M items/sec  20.561
TableSortIndicesInt64Narrow/1048576/0/16/32  2.561M items/sec  3.079M items/sec  20.208
ChunkedArraySortIndicesString/32768/10000  157.786 MiB/sec  189.412 MiB/sec  20.043
ChunkedArraySortIndicesString/32768/2  139.241 MiB/sec  164.172 MiB/sec  17.904
TableSortIndicesInt64Narrow/1048576/0/8/32  2.595M items/sec  3.038M items/sec  17.081
TableSortIndicesInt64Narrow/1048576/4/8/32  1.999M items/sec  2.298M items/sec  14.936
ChunkedArraySortIndicesString/8388608/100  81.026 MiB/sec  93.120 MiB/sec  14.926
TableSortIndicesInt64Narrow/1048576/100/8/32  2.382M items/sec  2.719M items/sec  14.168
ChunkedArraySortIndicesString/1048576/100  107.722 MiB/sec  122.229 MiB/sec  13.467
TableSortIndicesInt64Narrow/1048576/100/2/32  4.019M items/sec  4.477M items/sec  11.383
TableSortIndicesInt64Wide/1048576/4/1/4  11.595M items/sec  12.791M items/sec  10.314
TableSortIndicesInt64Wide/1048576/0/1/4  9.231M items/sec  10.181M items/sec  10.294
```

However, performance also regresses when the input is all-nulls (which is probably rare):

```
benchmark  baseline  contender  change %
ChunkedArraySortIndicesString/32768/1  5.636 GiB/sec  4.336 GiB/sec  -23.068
ChunkedArraySortIndicesInt64Narrow/32768/1  3.963 GiB/sec  2.852 GiB/sec  -28.025
ChunkedArraySortIndicesInt64Wide/32768/1  4.038 GiB/sec  2.869 GiB/sec  -28.954
```

### Are these changes tested?

Yes, by existing tests.

### Are there any user-facing changes?

No.

* GitHub Issue: apache#44084

Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
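To make the rationale above concrete, here is a minimal, self-contained sketch (not the PR's actual code; `ResolveOnce` and `TranslateAll` are hypothetical names) contrasting per-lookup chunk resolution with the one-pass translation that chunk-partitioned indices make possible:

```cpp
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Before: resolve the chunk for every individual value lookup.
// Each call is a binary search over the chunk offsets, so a merge doing
// O(n log n) comparisons pays an extra O(log k) per value access.
std::pair<int64_t, int64_t> ResolveOnce(const std::vector<int64_t>& offsets,
                                        int64_t logical_index) {
  auto it = std::upper_bound(offsets.begin(), offsets.end(), logical_index);
  const int64_t chunk = static_cast<int64_t>(it - offsets.begin()) - 1;
  return {chunk, logical_index - offsets[chunk]};
}

// After: translate all logical indices into (chunk, index_in_chunk) pairs in a
// single pass. Because the indices produced by sorting each chunk are
// chunk-partitioned (positions [offsets[c], offsets[c+1]) all refer to chunk c),
// no search is needed at all.
void TranslateAll(const std::vector<int64_t>& offsets,
                  const std::vector<int64_t>& logical_indices,
                  std::vector<std::pair<int64_t, int64_t>>* physical) {
  physical->resize(logical_indices.size());
  for (size_t c = 0; c + 1 < offsets.size(); ++c) {
    for (int64_t i = offsets[c]; i < offsets[c + 1]; ++i) {
      (*physical)[i] = {static_cast<int64_t>(c), logical_indices[i] - offsets[c]};
    }
  }
}
```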
1 parent c4d17fd commit d5cda4a

8 files changed (+496, -196 lines)

cpp/src/arrow/CMakeLists.txt (+1)

```diff
@@ -731,6 +731,7 @@ set(ARROW_COMPUTE_SRCS
     compute/light_array_internal.cc
     compute/ordering.cc
     compute/registry.cc
+    compute/kernels/chunked_internal.cc
     compute/kernels/codegen_internal.cc
     compute/kernels/ree_util_internal.cc
     compute/kernels/scalar_cast_boolean.cc
```

cpp/src/arrow/chunk_resolver.cc (+6, -4)

```diff
@@ -28,6 +28,8 @@
 
 namespace arrow {
 
+using util::span;
+
 namespace {
 
 template <typename T>
 int64_t GetLength(const T& array) {
@@ -42,7 +44,7 @@ int64_t GetLength<std::shared_ptr<RecordBatch>>(
 }
 
 template <typename T>
-inline std::vector<int64_t> MakeChunksOffsets(const std::vector<T>& chunks) {
+inline std::vector<int64_t> MakeChunksOffsets(span<T> chunks) {
   std::vector<int64_t> offsets(chunks.size() + 1);
   int64_t offset = 0;
   std::transform(chunks.begin(), chunks.end(), offsets.begin(),
@@ -112,13 +114,13 @@ void ResolveManyInline(uint32_t num_offsets, const int64_t* signed_offsets,
 }  // namespace
 
 ChunkResolver::ChunkResolver(const ArrayVector& chunks) noexcept
-    : offsets_(MakeChunksOffsets(chunks)), cached_chunk_(0) {}
+    : offsets_(MakeChunksOffsets(span(chunks))), cached_chunk_(0) {}
 
-ChunkResolver::ChunkResolver(const std::vector<const Array*>& chunks) noexcept
+ChunkResolver::ChunkResolver(span<const Array* const> chunks) noexcept
     : offsets_(MakeChunksOffsets(chunks)), cached_chunk_(0) {}
 
 ChunkResolver::ChunkResolver(const RecordBatchVector& batches) noexcept
-    : offsets_(MakeChunksOffsets(batches)), cached_chunk_(0) {}
+    : offsets_(MakeChunksOffsets(span(batches))), cached_chunk_(0) {}
 
 ChunkResolver::ChunkResolver(ChunkResolver&& other) noexcept
     : offsets_(std::move(other.offsets_)),
```

cpp/src/arrow/chunk_resolver.h (+7, -3)

```diff
@@ -26,6 +26,7 @@
 
 #include "arrow/type_fwd.h"
 #include "arrow/util/macros.h"
+#include "arrow/util/span.h"
 
 namespace arrow {
 
@@ -76,11 +77,14 @@ class ARROW_EXPORT ChunkResolver {
 
  public:
   explicit ChunkResolver(const ArrayVector& chunks) noexcept;
-
-  explicit ChunkResolver(const std::vector<const Array*>& chunks) noexcept;
-
+  explicit ChunkResolver(util::span<const Array* const> chunks) noexcept;
   explicit ChunkResolver(const RecordBatchVector& batches) noexcept;
 
+  /// \brief Construct a ChunkResolver from a vector of chunks.size() + 1 offsets.
+  ///
+  /// The first offset must be 0 and the last offset must be the logical length of the
+  /// chunked array. Each offset before the last represents the starting logical index of
+  /// the corresponding chunk.
   explicit ChunkResolver(std::vector<int64_t> offsets) noexcept
       : offsets_(std::move(offsets)), cached_chunk_(0) {
 #ifndef NDEBUG
```
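For reference, the offsets-based constructor documented above can be exercised roughly as follows (a minimal sketch, assuming three chunks of lengths 5, 3 and 4; the helper function is hypothetical):

```cpp
#include <cstdint>
#include <vector>

#include "arrow/chunk_resolver.h"

void OffsetsConstructorSketch() {
  // Chunks of lengths 5, 3 and 4 give the offsets {0, 5, 8, 12}: the first
  // offset is 0 and the last is the total logical length of the chunked array.
  arrow::ChunkResolver resolver(std::vector<int64_t>{0, 5, 8, 12});

  // Logical index 6 lands in the second chunk (chunk_index == 1),
  // at position 1 within that chunk (index_in_chunk == 1).
  const auto loc = resolver.Resolve(6);
  (void)loc;
}
```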
cpp/src/arrow/compute/kernels/chunked_internal.cc (new file, +122)

```cpp
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/compute/kernels/chunked_internal.h"

#include <algorithm>

#include "arrow/record_batch.h"
#include "arrow/util/logging.h"

namespace arrow::compute::internal {

std::vector<const Array*> GetArrayPointers(const ArrayVector& arrays) {
  std::vector<const Array*> pointers(arrays.size());
  std::transform(arrays.begin(), arrays.end(), pointers.begin(),
                 [&](const std::shared_ptr<Array>& array) { return array.get(); });
  return pointers;
}

std::vector<int64_t> ChunkedIndexMapper::GetChunkLengths(
    util::span<const Array* const> chunks) {
  std::vector<int64_t> chunk_lengths(chunks.size());
  for (int64_t i = 0; i < static_cast<int64_t>(chunks.size()); ++i) {
    chunk_lengths[i] = chunks[i]->length();
  }
  return chunk_lengths;
}

std::vector<int64_t> ChunkedIndexMapper::GetChunkLengths(
    const RecordBatchVector& chunks) {
  std::vector<int64_t> chunk_lengths(chunks.size());
  for (int64_t i = 0; i < static_cast<int64_t>(chunks.size()); ++i) {
    chunk_lengths[i] = chunks[i]->num_rows();
  }
  return chunk_lengths;
}

Result<std::pair<CompressedChunkLocation*, CompressedChunkLocation*>>
ChunkedIndexMapper::LogicalToPhysical() {
  // Check that indices would fall in bounds for CompressedChunkLocation
  if (ARROW_PREDICT_FALSE(chunk_lengths_.size() >
                          CompressedChunkLocation::kMaxChunkIndex + 1)) {
    return Status::NotImplemented("Chunked array has more than ",
                                  CompressedChunkLocation::kMaxChunkIndex + 1, " chunks");
  }
  for (int64_t chunk_length : chunk_lengths_) {
    if (ARROW_PREDICT_FALSE(static_cast<uint64_t>(chunk_length) >
                            CompressedChunkLocation::kMaxIndexInChunk + 1)) {
      return Status::NotImplemented("Individual chunk in chunked array has more than ",
                                    CompressedChunkLocation::kMaxIndexInChunk + 1,
                                    " elements");
    }
  }

  const int64_t num_indices = static_cast<int64_t>(indices_end_ - indices_begin_);
  DCHECK_EQ(num_indices, std::accumulate(chunk_lengths_.begin(), chunk_lengths_.end(),
                                         static_cast<int64_t>(0)));
  CompressedChunkLocation* physical_begin =
      reinterpret_cast<CompressedChunkLocation*>(indices_begin_);
  DCHECK_EQ(physical_begin + num_indices,
            reinterpret_cast<CompressedChunkLocation*>(indices_end_));

  int64_t chunk_offset = 0;
  for (int64_t chunk_index = 0; chunk_index < static_cast<int64_t>(chunk_lengths_.size());
       ++chunk_index) {
    const int64_t chunk_length = chunk_lengths_[chunk_index];
    for (int64_t i = 0; i < chunk_length; ++i) {
      // Logical indices are expected to be chunk-partitioned, which avoids costly
      // chunked index resolution.
      DCHECK_GE(indices_begin_[chunk_offset + i], static_cast<uint64_t>(chunk_offset));
      DCHECK_LT(indices_begin_[chunk_offset + i],
                static_cast<uint64_t>(chunk_offset + chunk_length));
      physical_begin[chunk_offset + i] = CompressedChunkLocation{
          static_cast<uint64_t>(chunk_index),
          indices_begin_[chunk_offset + i] - static_cast<uint64_t>(chunk_offset)};
    }
    chunk_offset += chunk_length;
  }

  return std::pair{physical_begin, physical_begin + num_indices};
}

Status ChunkedIndexMapper::PhysicalToLogical() {
  std::vector<int64_t> chunk_offsets(chunk_lengths_.size());
  {
    int64_t offset = 0;
    for (int64_t i = 0; i < static_cast<int64_t>(chunk_lengths_.size()); ++i) {
      chunk_offsets[i] = offset;
      offset += chunk_lengths_[i];
    }
  }

  const int64_t num_indices = static_cast<int64_t>(indices_end_ - indices_begin_);
  CompressedChunkLocation* physical_begin =
      reinterpret_cast<CompressedChunkLocation*>(indices_begin_);
  for (int64_t i = 0; i < num_indices; ++i) {
    const auto loc = physical_begin[i];
    DCHECK_LT(loc.chunk_index(), chunk_offsets.size());
    DCHECK_LT(loc.index_in_chunk(),
              static_cast<uint64_t>(chunk_lengths_[loc.chunk_index()]));
    indices_begin_[i] =
        chunk_offsets[loc.chunk_index()] + static_cast<int64_t>(loc.index_in_chunk());
  }

  return Status::OK();
}

}  // namespace arrow::compute::internal
```
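The intended calling pattern for `ChunkedIndexMapper` defined above can be sketched as follows; this is a hedged illustration, not code from the PR (`MergeSketch` and its arguments are hypothetical, only the mapper calls mirror the implementation):

```cpp
#include <cstdint>

#include "arrow/compute/kernels/chunked_internal.h"
#include "arrow/result.h"

namespace arrow::compute::internal {

// Hypothetical caller: `chunks` and the indices buffer are assumed to come
// from the surrounding sort kernel, with the indices already chunk-partitioned
// (each chunk's range of positions was sorted separately).
Status MergeSketch(util::span<const Array* const> chunks, uint64_t* indices_begin,
                   uint64_t* indices_end) {
  ChunkedIndexMapper mapper(chunks, indices_begin, indices_end);

  // 1. Rewrite the logical uint64_t indices in place as compressed
  //    (chunk_index, index_in_chunk) pairs; the uint64_t view of the buffer
  //    is unusable until PhysicalToLogical() is called.
  ARROW_ASSIGN_OR_RAISE(auto physical, mapper.LogicalToPhysical());
  auto [physical_begin, physical_end] = physical;
  (void)physical_begin;
  (void)physical_end;

  // 2. Merge the per-chunk sorted runs here, comparing values through the
  //    physical locations: no per-comparison chunk resolution is needed.

  // 3. Restore plain logical indices for the caller.
  return mapper.PhysicalToLogical();
}

}  // namespace arrow::compute::internal
```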

cpp/src/arrow/compute/kernels/chunked_internal.h (+101, -20)

```diff
@@ -20,26 +20,32 @@
 #include <algorithm>
 #include <cstdint>
 #include <memory>
+#include <utility>
 #include <vector>
 
 #include "arrow/array.h"
 #include "arrow/chunk_resolver.h"
 #include "arrow/compute/kernels/codegen_internal.h"
+#include "arrow/util/span.h"
 
-namespace arrow {
-namespace compute {
-namespace internal {
+namespace arrow::compute::internal {
 
 // The target chunk in a chunked array.
 struct ResolvedChunk {
   // The target array in chunked array.
   const Array* array;
   // The index in the target array.
-  const int64_t index;
+  int64_t index;
 
   ResolvedChunk(const Array* array, int64_t index) : array(array), index(index) {}
 
- public:
+  friend bool operator==(const ResolvedChunk& left, const ResolvedChunk& right) {
+    return left.array == right.array && left.index == right.index;
+  }
+  friend bool operator!=(const ResolvedChunk& left, const ResolvedChunk& right) {
+    return left.array != right.array || left.index != right.index;
+  }
+
   bool IsNull() const { return array->IsNull(index); }
 
   template <typename ArrowType, typename ViewType = GetViewType<ArrowType>>
@@ -50,34 +56,109 @@ struct ResolvedChunk {
   }
 };
 
+// A compressed (chunk_index, index_in_chunk) pair.
+// The goal of compression is to make it fit in 64 bits, allowing in place
+// replacement of logical uint64_t indices with physical indices.
+// (see ChunkedIndexMapper)
+struct CompressedChunkLocation {
+  static constexpr int kChunkIndexBits = 24;
+  static constexpr int KIndexInChunkBits = 64 - kChunkIndexBits;
+
+  static constexpr uint64_t kMaxChunkIndex = (1ULL << kChunkIndexBits) - 1;
+  static constexpr uint64_t kMaxIndexInChunk = (1ULL << KIndexInChunkBits) - 1;
+
+  CompressedChunkLocation() = default;
+
+  constexpr uint64_t chunk_index() const { return data_ & kMaxChunkIndex; }
+  constexpr uint64_t index_in_chunk() const { return data_ >> kChunkIndexBits; }
+
+  explicit constexpr CompressedChunkLocation(uint64_t chunk_index,
+                                             uint64_t index_in_chunk)
+      : data_((index_in_chunk << kChunkIndexBits) | chunk_index) {}
+
+  template <typename IndexType>
+  explicit operator TypedChunkLocation<IndexType>() {
+    return {static_cast<IndexType>(chunk_index()),
+            static_cast<IndexType>(index_in_chunk())};
+  }
+
+ private:
+  uint64_t data_;
+};
+
+static_assert(sizeof(uint64_t) == sizeof(CompressedChunkLocation));
+
 class ChunkedArrayResolver {
  private:
   ChunkResolver resolver_;
-  std::vector<const Array*> chunks_;
+  util::span<const Array* const> chunks_;
+  std::vector<const Array*> owned_chunks_;
 
  public:
-  explicit ChunkedArrayResolver(const std::vector<const Array*>& chunks)
+  explicit ChunkedArrayResolver(std::vector<const Array*>&& chunks)
+      : resolver_(chunks), chunks_(chunks), owned_chunks_(std::move(chunks)) {}
+  explicit ChunkedArrayResolver(util::span<const Array* const> chunks)
       : resolver_(chunks), chunks_(chunks) {}
 
-  ChunkedArrayResolver(ChunkedArrayResolver&& other) = default;
-  ChunkedArrayResolver& operator=(ChunkedArrayResolver&& other) = default;
+  ARROW_DEFAULT_MOVE_AND_ASSIGN(ChunkedArrayResolver);
 
-  ChunkedArrayResolver(const ChunkedArrayResolver& other) = default;
-  ChunkedArrayResolver& operator=(const ChunkedArrayResolver& other) = default;
+  ChunkedArrayResolver(const ChunkedArrayResolver& other)
+      : resolver_(other.resolver_), owned_chunks_(other.owned_chunks_) {
+    // Rebind span to owned_chunks_ if necessary
+    chunks_ = owned_chunks_.empty() ? other.chunks_ : owned_chunks_;
+  }
+  ChunkedArrayResolver& operator=(const ChunkedArrayResolver& other) {
+    resolver_ = other.resolver_;
+    owned_chunks_ = other.owned_chunks_;
+    chunks_ = owned_chunks_.empty() ? other.chunks_ : owned_chunks_;
+    return *this;
+  }
 
   ResolvedChunk Resolve(int64_t index) const {
     const auto loc = resolver_.Resolve(index);
     return {chunks_[loc.chunk_index], loc.index_in_chunk};
   }
 };
 
-inline std::vector<const Array*> GetArrayPointers(const ArrayVector& arrays) {
-  std::vector<const Array*> pointers(arrays.size());
-  std::transform(arrays.begin(), arrays.end(), pointers.begin(),
-                 [&](const std::shared_ptr<Array>& array) { return array.get(); });
-  return pointers;
-}
+std::vector<const Array*> GetArrayPointers(const ArrayVector& arrays);
+
+// A class that turns logical (linear) indices into physical (chunked) indices,
+// and vice-versa.
+class ChunkedIndexMapper {
+ public:
+  ChunkedIndexMapper(const std::vector<const Array*>& chunks, uint64_t* indices_begin,
+                     uint64_t* indices_end)
+      : ChunkedIndexMapper(util::span(chunks), indices_begin, indices_end) {}
+  ChunkedIndexMapper(util::span<const Array* const> chunks, uint64_t* indices_begin,
+                     uint64_t* indices_end)
+      : chunk_lengths_(GetChunkLengths(chunks)),
+        indices_begin_(indices_begin),
+        indices_end_(indices_end) {}
+  ChunkedIndexMapper(const RecordBatchVector& chunks, uint64_t* indices_begin,
+                     uint64_t* indices_end)
+      : chunk_lengths_(GetChunkLengths(chunks)),
+        indices_begin_(indices_begin),
+        indices_end_(indices_end) {}
+
+  // Turn the original uint64_t logical indices into physical. This reuses the
+  // same memory area, so the logical indices cannot be used anymore until
+  // PhysicalToLogical() is called.
+  //
+  // This assumes that the logical indices are originally chunk-partitioned.
+  Result<std::pair<CompressedChunkLocation*, CompressedChunkLocation*>>
+  LogicalToPhysical();
+
+  // Turn the physical indices back into logical, making the uint64_t indices
+  // usable again.
+  Status PhysicalToLogical();
+
+ private:
+  static std::vector<int64_t> GetChunkLengths(util::span<const Array* const> chunks);
+  static std::vector<int64_t> GetChunkLengths(const RecordBatchVector& chunks);
+
+  std::vector<int64_t> chunk_lengths_;
+  uint64_t* indices_begin_;
+  uint64_t* indices_end_;
+};
 
-}  // namespace internal
-}  // namespace compute
-}  // namespace arrow
+}  // namespace arrow::compute::internal
```
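The bit layout chosen for `CompressedChunkLocation` above (chunk index in the low 24 bits, index-in-chunk in the high 40 bits) can be sanity-checked with a standalone sketch; the helper names below are made up, only the constants mirror the header:

```cpp
#include <cstdint>

constexpr int kChunkIndexBits = 24;  // as in CompressedChunkLocation
constexpr uint64_t kMaxChunkIndex = (1ULL << kChunkIndexBits) - 1;

constexpr uint64_t Pack(uint64_t chunk_index, uint64_t index_in_chunk) {
  // index_in_chunk occupies the high 40 bits, chunk_index the low 24 bits.
  return (index_in_chunk << kChunkIndexBits) | chunk_index;
}
constexpr uint64_t ChunkIndex(uint64_t data) { return data & kMaxChunkIndex; }
constexpr uint64_t IndexInChunk(uint64_t data) { return data >> kChunkIndexBits; }

// Round-tripping preserves both fields.
static_assert(ChunkIndex(Pack(3, 1000000)) == 3);
static_assert(IndexInChunk(Pack(3, 1000000)) == 1000000);
// Capacity: up to 2^24 (~16.8 million) chunks and 2^40 (~1.1e12) elements per
// chunk, which is why LogicalToPhysical() returns NotImplemented beyond these.
static_assert(kMaxChunkIndex + 1 == (1ULL << 24));
```

Fitting in exactly 64 bits is what allows LogicalToPhysical() to overwrite the uint64_t index buffer in place.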

cpp/src/arrow/compute/kernels/vector_rank.cc (+3, -1)

```diff
@@ -21,6 +21,8 @@
 
 namespace arrow::compute::internal {
 
+using ::arrow::util::span;
+
 namespace {
 
 // ----------------------------------------------------------------------
@@ -237,7 +239,7 @@ class Ranker<ChunkedArray> : public RankerMixin<ChunkedArray, Ranker<ChunkedArra
                                            physical_chunks_, order_, null_placement_));
 
     const auto arrays = GetArrayPointers(physical_chunks_);
-    auto value_selector = [resolver = ChunkedArrayResolver(arrays)](int64_t index) {
+    auto value_selector = [resolver = ChunkedArrayResolver(span(arrays))](int64_t index) {
      return resolver.Resolve(index).Value<InType>();
    };
    ARROW_ASSIGN_OR_RAISE(*output_, CreateRankings(ctx_, sorted, null_placement_,
```
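The only change in this file wraps the existing `arrays` vector in a span before handing it to `ChunkedArrayResolver`. With the two constructors introduced in chunked_internal.h, a caller can either keep ownership of the pointer vector or hand it over; a minimal sketch under that assumption (the function and its argument are hypothetical):

```cpp
#include <vector>

#include "arrow/compute/kernels/chunked_internal.h"

namespace arrow::compute::internal {

void ResolverConstructionSketch(const ArrayVector& physical_chunks) {
  // Non-owning: the resolver only views `arrays`, which must stay alive
  // alongside it (this is what the rank kernel above does).
  const std::vector<const Array*> arrays = GetArrayPointers(physical_chunks);
  ChunkedArrayResolver viewing{util::span(arrays)};

  // Owning: the resolver takes the pointer vector itself and rebinds its
  // internal span to the vector it now owns.
  ChunkedArrayResolver owning{GetArrayPointers(physical_chunks)};

  // Either resolver maps a logical index to (array, index-in-array);
  // assumes physical_chunks is non-empty.
  const ResolvedChunk chunk = owning.Resolve(0);
  (void)viewing;
  (void)chunk;
}

}  // namespace arrow::compute::internal
```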
