20
20
#include < algorithm>
21
21
#include < cstdint>
22
22
#include < memory>
23
+ #include < utility>
23
24
#include < vector>
24
25
25
26
#include " arrow/array.h"
26
27
#include " arrow/chunk_resolver.h"
27
28
#include " arrow/compute/kernels/codegen_internal.h"
29
+ #include " arrow/util/span.h"
28
30
29
- namespace arrow {
30
- namespace compute {
31
- namespace internal {
31
+ namespace arrow ::compute::internal {
32
32
33
33
// The target chunk in a chunked array.
34
34
struct ResolvedChunk {
35
35
// The target array in chunked array.
36
36
const Array* array;
37
37
// The index in the target array.
38
- const int64_t index;
38
+ int64_t index;
39
39
40
40
// Store the target array pointer and the index within that array as given.
ResolvedChunk (const Array* array, int64_t index) : array(array), index(index) {}
41
41
42
- public:
42
+ friend bool operator ==(const ResolvedChunk& left, const ResolvedChunk& right) {
43
+ return left.array == right.array && left.index == right.index ;
44
+ }
45
+ friend bool operator !=(const ResolvedChunk& left, const ResolvedChunk& right) {
46
+ return left.array != right.array || left.index != right.index ;
47
+ }
48
+
43
49
// Forward to Array::IsNull for the element at `index` of the target array.
bool IsNull () const { return array->IsNull (index ); }
44
50
45
51
template <typename ArrowType, typename ViewType = GetViewType<ArrowType>>
@@ -50,34 +56,109 @@ struct ResolvedChunk {
50
56
}
51
57
};
52
58
59
+ // A compressed (chunk_index, index_in_chunk) pair.
60
+ // The goal of compression is to make it fit in 64 bits, allowing in place
61
+ // replacement of logical uint64_t indices with physical indices.
62
+ // (see ChunkedIndexMapper)
63
+ struct CompressedChunkLocation {
64
+ static constexpr int kChunkIndexBits = 24 ;
65
+ static constexpr int KIndexInChunkBits = 64 - kChunkIndexBits ;
66
+
67
+ static constexpr uint64_t kMaxChunkIndex = (1ULL << kChunkIndexBits ) - 1 ;
68
+ static constexpr uint64_t kMaxIndexInChunk = (1ULL << KIndexInChunkBits) - 1 ;
69
+
70
+ CompressedChunkLocation () = default ;
71
+
72
+ constexpr uint64_t chunk_index () const { return data_ & kMaxChunkIndex ; }
73
+ constexpr uint64_t index_in_chunk () const { return data_ >> kChunkIndexBits ; }
74
+
75
+ explicit constexpr CompressedChunkLocation (uint64_t chunk_index,
76
+ uint64_t index_in_chunk)
77
+ : data_((index_in_chunk << kChunkIndexBits ) | chunk_index) {}
78
+
79
+ template <typename IndexType>
80
+ explicit operator TypedChunkLocation<IndexType>() {
81
+ return {static_cast <IndexType>(chunk_index ()),
82
+ static_cast <IndexType>(index_in_chunk ())};
83
+ }
84
+
85
+ private:
86
+ uint64_t data_;
87
+ };
88
+
89
// Compile-time size check: a CompressedChunkLocation must occupy exactly the
// storage of one uint64_t.
+ static_assert (sizeof (uint64_t ) == sizeof (CompressedChunkLocation));
90
+
53
91
class ChunkedArrayResolver {
54
92
private:
55
93
ChunkResolver resolver_;
56
- std::vector<const Array*> chunks_;
94
+ util::span<const Array* const > chunks_;
95
+ std::vector<const Array*> owned_chunks_;
57
96
58
97
public:
59
- explicit ChunkedArrayResolver (const std::vector<const Array*>& chunks)
98
+ explicit ChunkedArrayResolver (std::vector<const Array*>&& chunks)
99
+ : resolver_(chunks), chunks_(chunks), owned_chunks_(std::move(chunks)) {}
100
+ explicit ChunkedArrayResolver (util::span<const Array* const > chunks)
60
101
: resolver_(chunks), chunks_(chunks) {}
61
102
62
- ChunkedArrayResolver (ChunkedArrayResolver&& other) = default ;
63
- ChunkedArrayResolver& operator =(ChunkedArrayResolver&& other) = default ;
103
+ ARROW_DEFAULT_MOVE_AND_ASSIGN (ChunkedArrayResolver);
64
104
65
- ChunkedArrayResolver (const ChunkedArrayResolver& other) = default ;
66
- ChunkedArrayResolver& operator =(const ChunkedArrayResolver& other) = default ;
105
+ ChunkedArrayResolver (const ChunkedArrayResolver& other)
106
+ : resolver_(other.resolver_), owned_chunks_(other.owned_chunks_) {
107
+ // Rebind span to owned_chunks_ if necessary
108
+ chunks_ = owned_chunks_.empty () ? other.chunks_ : owned_chunks_;
109
+ }
110
+ ChunkedArrayResolver& operator =(const ChunkedArrayResolver& other) {
111
+ resolver_ = other.resolver_ ;
112
+ owned_chunks_ = other.owned_chunks_ ;
113
+ chunks_ = owned_chunks_.empty () ? other.chunks_ : owned_chunks_;
114
+ return *this ;
115
+ }
67
116
68
117
ResolvedChunk Resolve (int64_t index) const {
69
118
const auto loc = resolver_.Resolve (index );
70
119
return {chunks_[loc.chunk_index ], loc.index_in_chunk };
71
120
}
72
121
};
73
122
74
- inline std::vector<const Array*> GetArrayPointers (const ArrayVector& arrays) {
75
- std::vector<const Array*> pointers (arrays.size ());
76
- std::transform (arrays.begin (), arrays.end (), pointers.begin (),
77
- [&](const std::shared_ptr<Array>& array) { return array.get (); });
78
- return pointers;
79
- }
123
// Build a vector of raw Array pointers from `arrays`. Declaration only: the
// definition is not in this header.
// NOTE(review): presumably one non-owning pointer per element, in order
// (shared_ptr::get()) -- confirm against the out-of-line definition.
+ std::vector<const Array*> GetArrayPointers (const ArrayVector& arrays);
124
+
125
+ // A class that turns logical (linear) indices into physical (chunked) indices,
126
+ // and vice-versa.
127
+ class ChunkedIndexMapper {
128
+ public:
129
+ ChunkedIndexMapper (const std::vector<const Array*>& chunks, uint64_t * indices_begin,
130
+ uint64_t * indices_end)
131
+ : ChunkedIndexMapper(util::span(chunks), indices_begin, indices_end) {}
132
+ ChunkedIndexMapper (util::span<const Array* const > chunks, uint64_t * indices_begin,
133
+ uint64_t * indices_end)
134
+ : chunk_lengths_(GetChunkLengths(chunks)),
135
+ indices_begin_ (indices_begin),
136
+ indices_end_(indices_end) {}
137
+ ChunkedIndexMapper (const RecordBatchVector& chunks, uint64_t * indices_begin,
138
+ uint64_t * indices_end)
139
+ : chunk_lengths_(GetChunkLengths(chunks)),
140
+ indices_begin_(indices_begin),
141
+ indices_end_(indices_end) {}
142
+
143
+ // Turn the original uint64_t logical indices into physical. This reuses the
144
+ // same memory area, so the logical indices cannot be used anymore until
145
+ // PhysicalToLogical() is called.
146
+ //
147
+ // This assumes that the logical indices are originally chunk-partitioned.
148
+ Result<std::pair<CompressedChunkLocation*, CompressedChunkLocation*>>
149
+ LogicalToPhysical ();
150
+
151
+ // Turn the physical indices back into logical, making the uint64_t indices
152
+ // usable again.
153
+ Status PhysicalToLogical ();
154
+
155
+ private:
156
+ static std::vector<int64_t > GetChunkLengths (util::span<const Array* const > chunks);
157
+ static std::vector<int64_t > GetChunkLengths (const RecordBatchVector& chunks);
158
+
159
+ std::vector<int64_t > chunk_lengths_;
160
+ uint64_t * indices_begin_;
161
+ uint64_t * indices_end_;
162
+ };
80
163
81
- } // namespace internal
82
- } // namespace compute
83
- } // namespace arrow
164
+ } // namespace arrow::compute::internal
0 commit comments