Skip to content

Commit 211f115

Browse files
authored
Merge pull request ClickHouse#171 from Enmk/optimize_array_access
Fix Nested Array
2 parents 323adbe + 6708b82 commit 211f115

37 files changed

+1109
-91
lines changed

clickhouse/base/wire_format.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ void WireFormat::WriteAll(OutputStream& output, const void* buf, size_t len) {
4040
}
4141

4242
if (len) {
43-
throw Error("Failed to write " + std::to_string(original_len)
43+
throw ProtocolError("Failed to write " + std::to_string(original_len)
4444
+ " bytes, only written " + std::to_string(original_len - len));
4545
}
4646
}

clickhouse/columns/array.cpp

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,21 @@
55
namespace clickhouse {
66

77
ColumnArray::ColumnArray(ColumnRef data)
8+
: ColumnArray(data, std::make_shared<ColumnUInt64>())
9+
{
10+
}
11+
12+
ColumnArray::ColumnArray(ColumnRef data, std::shared_ptr<ColumnUInt64> offsets)
813
: Column(Type::CreateArray(data->Type()))
914
, data_(data)
10-
, offsets_(std::make_shared<ColumnUInt64>())
15+
, offsets_(offsets)
16+
{
17+
}
18+
19+
ColumnArray::ColumnArray(ColumnArray&& other)
20+
: Column(other.Type())
21+
, data_(std::move(other.data_))
22+
, offsets_(std::move(other.offsets_))
1123
{
1224
}
1325

@@ -18,30 +30,33 @@ void ColumnArray::AppendAsColumn(ColumnRef array) {
1830
"to column type " + data_->Type()->GetName());
1931
}
2032

21-
if (offsets_->Size() == 0) {
22-
offsets_->Append(array->Size());
23-
} else {
24-
offsets_->Append((*offsets_)[offsets_->Size() - 1] + array->Size());
25-
}
26-
33+
AddOffset(array->Size());
2734
data_->Append(array);
2835
}
2936

3037
ColumnRef ColumnArray::GetAsColumn(size_t n) const {
38+
if (n >= Size())
39+
throw ValidationError("Index is out ouf bounds: " + std::to_string(n));
40+
3141
return data_->Slice(GetOffset(n), GetSize(n));
3242
}
3343

3444
ColumnRef ColumnArray::Slice(size_t begin, size_t size) const {
35-
auto result = std::make_shared<ColumnArray>(GetAsColumn(begin));
36-
result->OffsetsIncrease(1);
45+
if (size && begin + size > Size())
46+
throw ValidationError("Slice indexes are out of bounds");
3747

38-
for (size_t i = 1; i < size; i++) {
39-
result->Append(std::make_shared<ColumnArray>(GetAsColumn(begin + i)));
48+
auto result = std::make_shared<ColumnArray>(data_->CloneEmpty());
49+
for (size_t i = 0; i < size; i++) {
50+
result->AppendAsColumn(GetAsColumn(begin + i));
4051
}
4152

4253
return result;
4354
}
4455

56+
ColumnRef ColumnArray::CloneEmpty() const {
57+
return std::make_shared<ColumnArray>(data_->CloneEmpty());
58+
}
59+
4560
void ColumnArray::Append(ColumnRef column) {
4661
if (auto col = column->As<ColumnArray>()) {
4762
if (!col->data_->Type()->IsEqual(data_->Type())) {
@@ -108,8 +123,25 @@ size_t ColumnArray::GetOffset(size_t n) const {
108123
return (n == 0) ? 0 : (*offsets_)[n - 1];
109124
}
110125

126+
void ColumnArray::AddOffset(size_t n) {
127+
if (offsets_->Size() == 0) {
128+
offsets_->Append(n);
129+
} else {
130+
offsets_->Append((*offsets_)[offsets_->Size() - 1] + n);
131+
}
132+
}
133+
111134
size_t ColumnArray::GetSize(size_t n) const {
112135
return (n == 0) ? (*offsets_)[n] : ((*offsets_)[n] - (*offsets_)[n - 1]);
113136
}
114137

138+
ColumnRef ColumnArray::GetData() {
139+
return data_;
140+
}
141+
142+
void ColumnArray::Reset() {
143+
data_.reset();
144+
offsets_.reset();
145+
}
146+
115147
}

clickhouse/columns/array.h

Lines changed: 226 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,48 @@
33
#include "column.h"
44
#include "numeric.h"
55

6+
#include <memory>
7+
68
namespace clickhouse {
79

10+
template <typename NestedColumnType>
11+
class ColumnArrayT;
12+
813
/**
914
* Represents column of Array(T).
1015
*/
1116
class ColumnArray : public Column {
1217
public:
13-
ColumnArray(ColumnRef data);
18+
using ValueType = ColumnRef;
19+
20+
/** Create an array of given type.
21+
*
22+
* `data` is used internally (and modified) by ColumnArray.
23+
* Users are strongly advised against supplying non-empty columns and/or modifying
24+
* contents of `data` afterwards.
25+
*/
26+
explicit ColumnArray(ColumnRef data);
27+
28+
/** Create an array of given type, with actual values and offsets.
29+
*
30+
* Both `data` and `offsets` are used (and modified) internally by ColumnArray.
31+
* Users are strongly advised against modifying contents of `data` or `offsets` afterwards.
32+
*/
33+
ColumnArray(ColumnRef data, std::shared_ptr<ColumnUInt64> offsets);
1434

15-
/// Converts input column to array and appends
16-
/// as one row to the current column.
35+
/// Converts input column to array and appends as one row to the current column.
1736
void AppendAsColumn(ColumnRef array);
1837

1938
/// Converts array at pos n to column.
2039
/// Type of element of result column same as type of array element.
2140
ColumnRef GetAsColumn(size_t n) const;
2241

42+
/// Shorthand to get a column cast to a proper type.
43+
template <typename T>
44+
auto GetAsColumnTyped(size_t n) const {
45+
return GetAsColumn(n)->AsStrict<T>();
46+
}
47+
2348
public:
2449
/// Appends content of given column to the end of current one.
2550
void Append(ColumnRef column) override;
@@ -44,19 +69,214 @@ class ColumnArray : public Column {
4469

4570
/// Makes slice of the current column.
4671
ColumnRef Slice(size_t, size_t) const override;
47-
72+
ColumnRef CloneEmpty() const override;
4873
void Swap(Column&) override;
4974

5075
void OffsetsIncrease(size_t);
5176

52-
private:
53-
size_t GetOffset(size_t n) const;
77+
protected:
78+
template<typename T> friend class ColumnArrayT;
79+
80+
ColumnArray(ColumnArray&& array);
5481

82+
size_t GetOffset(size_t n) const;
5583
size_t GetSize(size_t n) const;
84+
ColumnRef GetData();
85+
void AddOffset(size_t n);
86+
void Reset();
5687

5788
private:
5889
ColumnRef data_;
5990
std::shared_ptr<ColumnUInt64> offsets_;
6091
};
6192

93+
template <typename ColumnType>
94+
class ColumnArrayT : public ColumnArray {
95+
public:
96+
class ArrayValueView;
97+
using ValueType = ArrayValueView;
98+
using NestedColumnType = ColumnType;
99+
100+
explicit ColumnArrayT(std::shared_ptr<NestedColumnType> data)
101+
: ColumnArray(data)
102+
, typed_nested_data_(data)
103+
{}
104+
105+
ColumnArrayT(std::shared_ptr<NestedColumnType> data, std::shared_ptr<ColumnUInt64> offsets)
106+
: ColumnArray(data, offsets)
107+
, typed_nested_data_(data)
108+
{}
109+
110+
template <typename ...Args>
111+
explicit ColumnArrayT(Args &&... args)
112+
: ColumnArrayT(std::make_shared<NestedColumnType>(std::forward<Args>(args)...))
113+
{}
114+
115+
/** Create a ColumnArrayT from a ColumnArray, without copying data and offsets, but by 'stealing' those from `col`.
116+
*
117+
* Ownership of column internals is transferred to returned object, original (argument) object
118+
* MUST NOT BE USED IN ANY WAY, it is only safe to dispose it.
119+
*
120+
* Throws an exception if `col` is of wrong type; it is safe to use original col in this case.
121+
* This is a static method to make such conversion verbose.
122+
*/
123+
static auto Wrap(ColumnArray&& col) {
124+
if constexpr (std::is_base_of_v<ColumnArray, NestedColumnType> && !std::is_same_v<ColumnArray, NestedColumnType>) {
125+
// assuming NestedColumnType is ArrayT specialization
126+
return std::make_shared<ColumnArrayT<NestedColumnType>>(NestedColumnType::Wrap(col.GetData()), col.offsets_);
127+
} else {
128+
auto nested_data = col.GetData()->template AsStrict<NestedColumnType>();
129+
return std::make_shared<ColumnArrayT<NestedColumnType>>(nested_data, col.offsets_);
130+
}
131+
}
132+
133+
static auto Wrap(Column&& col) {
134+
return Wrap(std::move(dynamic_cast<ColumnArray&&>(col)));
135+
}
136+
137+
// Helper to simplify integration with other APIs
138+
static auto Wrap(ColumnRef&& col) {
139+
return Wrap(std::move(*col->AsStrict<ColumnArray>()));
140+
}
141+
142+
/// A single (row) value of the Array-column, i.e. readonly array of items.
143+
class ArrayValueView {
144+
const std::shared_ptr<NestedColumnType> typed_nested_data_;
145+
const size_t offset_;
146+
const size_t size_;
147+
148+
public:
149+
using ValueType = typename NestedColumnType::ValueType;
150+
151+
ArrayValueView(std::shared_ptr<NestedColumnType> data, size_t offset = 0, size_t size = std::numeric_limits<size_t>::max())
152+
: typed_nested_data_(data)
153+
, offset_(offset)
154+
, size_(std::min(typed_nested_data_->Size() - offset, size))
155+
{}
156+
157+
inline auto operator[](size_t index) const {
158+
return (*typed_nested_data_)[offset_ + index];
159+
}
160+
161+
inline auto At(size_t index) const {
162+
if (index >= size_)
163+
throw ValidationError("ColumnArray value index out of bounds: "
164+
+ std::to_string(index) + ", max is " + std::to_string(size_));
165+
return typed_nested_data_->At(offset_ + index);
166+
}
167+
168+
class Iterator {
169+
const std::shared_ptr<NestedColumnType> typed_nested_data_;
170+
const size_t offset_;
171+
const size_t size_;
172+
size_t index_;
173+
public:
174+
Iterator(std::shared_ptr<NestedColumnType> typed_nested_data, size_t offset, size_t size, size_t index)
175+
: typed_nested_data_(typed_nested_data)
176+
, offset_(offset)
177+
, size_(size)
178+
, index_(index)
179+
{}
180+
181+
using ValueType = typename NestedColumnType::ValueType;
182+
183+
inline auto operator*() const {
184+
return typed_nested_data_->At(offset_ + index_);
185+
}
186+
187+
inline Iterator& operator++() {
188+
++index_;
189+
return *this;
190+
}
191+
192+
inline bool operator==(const Iterator& other) const {
193+
return this->typed_nested_data_ == other.typed_nested_data_
194+
&& this->offset_ == other.offset_
195+
&& this->size_ == other.size_
196+
&& this->index_ == other.index_;
197+
}
198+
199+
inline bool operator!=(const Iterator& other) const {
200+
return !(*this == other);
201+
}
202+
};
203+
204+
// minimalistic stl-like container interface, hence the lowercase
205+
inline Iterator begin() const {
206+
return Iterator{typed_nested_data_, offset_, size_, 0};
207+
}
208+
209+
inline Iterator cbegin() const {
210+
return Iterator{typed_nested_data_, offset_, size_, 0};
211+
}
212+
213+
inline Iterator end() const {
214+
return Iterator{typed_nested_data_, offset_, size_, size_};
215+
}
216+
217+
inline Iterator cend() const {
218+
return Iterator{typed_nested_data_, offset_, size_, size_};
219+
}
220+
221+
inline size_t size() const {
222+
return size_;
223+
}
224+
225+
// It is ugly to have both size() and Size(), but it is for compatibility with both STL and rest of the clickhouse-cpp.
226+
inline size_t Size() const {
227+
return size_;
228+
}
229+
};
230+
231+
inline auto At(size_t index) const {
232+
if (index >= Size())
233+
throw ValidationError("ColumnArray row index out of bounds: "
234+
+ std::to_string(index) + ", max is " + std::to_string(Size()));
235+
236+
return ArrayValueView{typed_nested_data_, GetOffset(index), GetSize(index)};
237+
}
238+
239+
inline auto operator[](size_t index) const {
240+
return ArrayValueView{typed_nested_data_, GetOffset(index), GetSize(index)};
241+
}
242+
243+
using ColumnArray::Append;
244+
245+
template <typename Container>
246+
inline void Append(const Container& container) {
247+
Append(std::begin(container), std::end(container));
248+
}
249+
250+
template <typename ValueType>
251+
inline void Append(const std::initializer_list<ValueType>& container) {
252+
Append(std::begin(container), std::end(container));
253+
}
254+
255+
template <typename Begin, typename End>
256+
inline void Append(Begin begin, const End & end) {
257+
auto & nested_data = *typed_nested_data_;
258+
size_t counter = 0;
259+
260+
while (begin != end) {
261+
nested_data.Append(*begin);
262+
++begin;
263+
++counter;
264+
}
265+
266+
// Even if there are 0 items, increase counter, creating empty array item.
267+
AddOffset(counter);
268+
}
269+
270+
private:
271+
/// Helper to allow wrapping a "typeless" ColumnArray
272+
ColumnArrayT(ColumnArray&& array, std::shared_ptr<NestedColumnType> nested_data)
273+
: ColumnArray(std::move(array))
274+
, typed_nested_data_(std::move(nested_data))
275+
{}
276+
277+
278+
private:
279+
std::shared_ptr<NestedColumnType> typed_nested_data_;
280+
};
281+
62282
}

clickhouse/columns/column.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,16 @@ class Column : public std::enable_shared_from_this<Column> {
3535
return std::dynamic_pointer_cast<const T>(shared_from_this());
3636
}
3737

38+
/// Downcast pointer to the specific column's subtype.
39+
template <typename T>
40+
inline std::shared_ptr<T> AsStrict() {
41+
auto result = std::dynamic_pointer_cast<T>(shared_from_this());
42+
if (!result) {
43+
throw ValidationError("Can't cast from " + type_->GetName());
44+
}
45+
return result;
46+
}
47+
3848
/// Get type object of the column.
3949
inline TypeRef Type() const { return type_; }
4050
inline const class Type& GetType() const { return *type_; }
@@ -73,6 +83,8 @@ class Column : public std::enable_shared_from_this<Column> {
7383
/// Makes slice of the current column.
7484
virtual ColumnRef Slice(size_t begin, size_t len) const = 0;
7585

86+
virtual ColumnRef CloneEmpty() const = 0;
87+
7688
virtual void Swap(Column&) = 0;
7789

7890
/// Get a view on raw item data if it is supported by column, will throw an exception if index is out of range.

0 commit comments

Comments
 (0)