Skip to content

Commit 4ceb5d1

Browse files
authored
Merge pull request ClickHouse#49 from Enmk/LowCardinality-Clear-fix
Fixed `ColumnLowCardinality::Clear()`
2 parents 2cdb0d8 + 7b3fcdc commit 4ceb5d1

File tree

5 files changed

+148
-39
lines changed

5 files changed

+148
-39
lines changed

clickhouse/columns/column.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ class Column : public std::enable_shared_from_this<Column> {
5757
virtual void Swap(Column&) = 0;
5858

5959
/// Get a view on raw item data if it is supported by column, will throw an exception if index is out of range.
60-
/// Please note that view is invalidated once column is items are added or deleted, column is loaded from strean or destroyed.
60+
/// Please note that view is invalidated once column items are added or deleted, column is loaded from strean or destroyed.
6161
virtual ItemView GetItem(size_t) const {
6262
throw std::runtime_error("GetItem() is not supported for column of " + type_->GetName());
6363
}

clickhouse/columns/lowcardinality.cpp

Lines changed: 35 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
#include <string_view>
1111
#include <type_traits>
1212

13+
#include <cassert>
14+
1315
namespace {
1416
using namespace clickhouse;
1517

@@ -105,13 +107,13 @@ inline void AppendToDictionary(Column& dictionary, const ItemView & item) {
105107
}
106108
}
107109

108-
// Add special NULL-item, which is expected at pos(0) in dictionary,
110+
// A special NULL-item, which is expected at pos(0) in dictionary,
109111
// note that we distinguish empty string from NULL-value.
110-
inline void AppendNullItemToDictionary(ColumnRef dictionary) {
112+
inline auto GetNullItemForDictionary(const ColumnRef dictionary) {
111113
if (auto n = dictionary->As<ColumnNullable>()) {
112-
AppendToDictionary(*dictionary, ItemView{});
114+
return ItemView{};
113115
} else {
114-
AppendToDictionary(*dictionary, ItemView{dictionary->Type()->GetCode(), std::string_view{}});
116+
return ItemView{dictionary->Type()->GetCode(), std::string_view{}};
115117
}
116118
}
117119

@@ -120,23 +122,21 @@ inline void AppendNullItemToDictionary(ColumnRef dictionary) {
120122
namespace clickhouse {
121123
ColumnLowCardinality::ColumnLowCardinality(ColumnRef dictionary_column)
122124
: Column(Type::CreateLowCardinality(dictionary_column->Type())),
123-
dictionary_column_(dictionary_column),
125+
dictionary_column_(dictionary_column->Slice(0, 0)), // safe way to get an column of the same type.
124126
index_column_(std::make_shared<ColumnUInt32>())
125127
{
126-
if (dictionary_column_->Size() != 0) {
127-
// When dictionary column was constructed with values, re-add values by copying to update index and unique_items_map.
128-
129-
// Steal values into temporary column.
130-
auto values = dictionary_column_->Slice(0, 0);
131-
values->Swap(*dictionary_column_);
132-
133-
AppendNullItemToDictionary(dictionary_column_);
134-
135-
// Re-add values, updating index and unique_items_map.
136-
for (size_t i = 0; i < values->Size(); ++i)
137-
AppendUnsafe(values->GetItem(i));
128+
if (dictionary_column->Size() != 0) {
129+
AppendNullItemToEmptyColumn();
130+
131+
// Add values, updating index_column_ and unique_items_map_.
132+
for (size_t i = 0; i < dictionary_column->Size(); ++i) {
133+
// TODO: it would be possible to eliminate copying
134+
// by adding InsertUnsafe(pos, ItemView) method to a Column,
135+
// but that is too much work for now.
136+
AppendUnsafe(dictionary_column->GetItem(i));
137+
}
138138
} else {
139-
AppendNullItemToDictionary(dictionary_column_);
139+
AppendNullItemToEmptyColumn();
140140
}
141141
}
142142

@@ -288,6 +288,9 @@ void ColumnLowCardinality::Save(CodedOutputStream* output) {
288288
void ColumnLowCardinality::Clear() {
289289
index_column_->Clear();
290290
dictionary_column_->Clear();
291+
unique_items_map_.clear();
292+
293+
AppendNullItemToEmptyColumn();
291294
}
292295

293296
size_t ColumnLowCardinality::Size() const {
@@ -298,8 +301,7 @@ ColumnRef ColumnLowCardinality::Slice(size_t begin, size_t len) {
298301
begin = std::min(begin, Size());
299302
len = std::min(len, Size() - begin);
300303

301-
ColumnRef new_dictionary = dictionary_column_->Slice(0, 0);
302-
auto result = std::make_shared<ColumnLowCardinality>(new_dictionary);
304+
auto result = std::make_shared<ColumnLowCardinality>(dictionary_column_->Slice(0, 0));
303305

304306
for (size_t i = begin; i < begin + len; ++i)
305307
result->AppendUnsafe(this->GetItem(i));
@@ -353,6 +355,19 @@ void ColumnLowCardinality::AppendUnsafe(const ItemView & value) {
353355
}
354356
}
355357

358+
void ColumnLowCardinality::AppendNullItemToEmptyColumn()
359+
{
360+
// INVARIANT: Empty LC column has an (invisible) null-item at pos 0, which MUST be present in
361+
// unique_items_map_ in order to reuse dictionary posistion on subsequent Append()-s.
362+
363+
// Should be only performed on empty LC column.
364+
assert(dictionary_column_->Size() == 0 && unique_items_map_.empty());
365+
366+
const auto null_item = GetNullItemForDictionary(dictionary_column_);
367+
AppendToDictionary(*dictionary_column_, null_item);
368+
unique_items_map_.emplace(computeHashKey(null_item), dictionary_column_->Size());
369+
}
370+
356371
size_t ColumnLowCardinality::GetDictionarySize() const {
357372
return dictionary_column_->Size();
358373
}

clickhouse/columns/lowcardinality.h

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,14 @@ class ColumnLowCardinality : public Column {
4040
friend class ColumnLowCardinalityT;
4141

4242
private:
43-
// Please note that ColumnLowCardinalityT takes reference to underlying dictionary column object,
43+
// IMPLEMENTATION NOTE: ColumnLowCardinalityT takes reference to underlying dictionary column object,
4444
// so make sure to NOT change address of the dictionary object (with reset(), swap()) or with anything else.
4545
ColumnRef dictionary_column_;
4646
ColumnRef index_column_;
4747
UniqueItems unique_items_map_;
4848

4949
public:
50+
// c-tor makes a deep copy of the dictionary_column.
5051
explicit ColumnLowCardinality(ColumnRef dictionary_column);
5152
~ColumnLowCardinality();
5253

@@ -82,6 +83,9 @@ class ColumnLowCardinality : public Column {
8283
ColumnRef GetDictionary();
8384
void AppendUnsafe(const ItemView &);
8485

86+
private:
87+
void AppendNullItemToEmptyColumn();
88+
8589
public:
8690
static details::LowCardinalityHashKey computeHashKey(const ItemView &);
8791
};
@@ -90,7 +94,9 @@ class ColumnLowCardinality : public Column {
9094
*/
9195
template <typename DictionaryColumnType>
9296
class ColumnLowCardinalityT : public ColumnLowCardinality {
97+
9398
DictionaryColumnType& typed_dictionary_;
99+
const Type::Code type_;
94100

95101
public:
96102
using WrappedColumnType = DictionaryColumnType;
@@ -100,7 +106,8 @@ class ColumnLowCardinalityT : public ColumnLowCardinality {
100106
template <typename ...Args>
101107
explicit ColumnLowCardinalityT(Args &&... args)
102108
: ColumnLowCardinality(std::make_shared<DictionaryColumnType>(std::forward<Args>(args)...)),
103-
typed_dictionary_(dynamic_cast<DictionaryColumnType &>(*GetDictionary()))
109+
typed_dictionary_(dynamic_cast<DictionaryColumnType &>(*GetDictionary())),
110+
type_(typed_dictionary_.Type()->GetCode())
104111
{}
105112

106113
/// Extended interface to simplify reading/adding individual items.
@@ -119,7 +126,7 @@ class ColumnLowCardinalityT : public ColumnLowCardinality {
119126
using ColumnLowCardinality::Append;
120127

121128
inline void Append(const ValueType & value) {
122-
AppendUnsafe(ItemView{typed_dictionary_.Type()->GetCode(), value});
129+
AppendUnsafe(ItemView{type_, value});
123130
}
124131

125132
template <typename T>

ut/client_ut.cpp

Lines changed: 65 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,29 @@ class ClientCase : public testing::TestWithParam<ClientOptions> {
3030
client_->Execute("DROP DATABASE test_clickhouse_cpp");
3131
}
3232

33+
template <typename T>
34+
std::shared_ptr<T> createTableWithOneColumn(Block & block)
35+
{
36+
auto col = std::make_shared<T>();
37+
const auto type_name = col->GetType().GetName();
38+
39+
client_->Execute("DROP TABLE IF EXISTS " + table_name + ";");
40+
client_->Execute("CREATE TABLE IF NOT EXISTS " + table_name + "( " + column_name + " " + type_name + " )"
41+
"ENGINE = Memory");
42+
43+
block.AppendColumn("test_column", col);
44+
45+
return col;
46+
}
47+
48+
std::string getOneColumnSelectQuery() const
49+
{
50+
return "SELECT " + column_name + " FROM " + table_name;
51+
}
52+
3353
std::unique_ptr<Client> client_;
54+
const std::string table_name = "test_clickhouse_cpp.test_ut_table";
55+
const std::string column_name = "test_column";
3456
};
3557

3658
TEST_P(ClientCase, Array) {
@@ -117,22 +139,57 @@ TEST_P(ClientCase, Date) {
117139

118140
TEST_P(ClientCase, LowCardinality) {
119141
Block block;
120-
client_->Execute("DROP TABLE IF EXISTS test_clickhouse_cpp.low_cardinality;");
142+
auto lc = createTableWithOneColumn<ColumnLowCardinalityT<ColumnString>>(block);
121143

122-
client_->Execute("CREATE TABLE IF NOT EXISTS "
123-
"test_clickhouse_cpp.low_cardinality (lc LowCardinality(String)) "
124-
"ENGINE = Memory");
144+
const std::vector<std::string> data{{"FooBar", "1", "2", "Foo", "4", "Bar", "Foo", "7", "8", "Foo"}};
145+
lc->AppendMany(data);
146+
147+
block.RefreshRowCount();
148+
client_->Insert(table_name, block);
149+
150+
size_t total_rows = 0;
151+
client_->Select(getOneColumnSelectQuery(),
152+
[&total_rows, &data](const Block& block) {
153+
total_rows += block.GetRowCount();
154+
if (block.GetRowCount() == 0) {
155+
return;
156+
}
157+
158+
ASSERT_EQ(1U, block.GetColumnCount());
159+
if (auto col = block[0]->As<ColumnLowCardinalityT<ColumnString>>()) {
160+
ASSERT_EQ(data.size(), col->Size());
161+
for (size_t i = 0; i < col->Size(); ++i) {
162+
EXPECT_EQ(data[i], (*col)[i]) << " at index: " << i;
163+
}
164+
}
165+
}
166+
);
167+
168+
ASSERT_EQ(total_rows, data.size());
169+
}
170+
171+
TEST_P(ClientCase, LowCardinality_InsertAfterClear) {
172+
// User can successfully insert values after invoking Clear() on LC column.
173+
Block block;
174+
auto lc = createTableWithOneColumn<ColumnLowCardinalityT<ColumnString>>(block);
175+
176+
// Add some data, but don't care about it much.
177+
lc->AppendMany(std::vector<std::string_view>{"abc", "def", "123", "abc", "123", "def", "ghi"});
178+
EXPECT_GT(lc->Size(), 0u);
179+
EXPECT_GT(lc->GetDictionarySize(), 0u);
125180

126-
auto lc = std::make_shared<ColumnLowCardinalityT<ColumnString>>();
181+
lc->Clear();
127182

183+
// Now ensure that all data appended after Clear() is inserted properly
128184
const std::vector<std::string> data{{"FooBar", "1", "2", "Foo", "4", "Bar", "Foo", "7", "8", "Foo"}};
129185
lc->AppendMany(data);
130186

131-
block.AppendColumn("lc", lc);
132-
client_->Insert("test_clickhouse_cpp.low_cardinality", block);
187+
block.RefreshRowCount();
188+
client_->Insert(table_name, block);
133189

190+
// Now validate that data was properly inserted
134191
size_t total_rows = 0;
135-
client_->Select("SELECT lc FROM test_clickhouse_cpp.low_cardinality",
192+
client_->Select(getOneColumnSelectQuery(),
136193
[&total_rows, &data](const Block& block) {
137194
total_rows += block.GetRowCount();
138195
if (block.GetRowCount() == 0) {

ut/columns_ut.cpp

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,14 @@
1111
#include <contrib/gtest/gtest.h>
1212
#include "utils.h"
1313

14+
#include <string_view>
15+
16+
17+
namespace
18+
{
19+
1420
using namespace clickhouse;
21+
using namespace std::literals::string_view_literals;
1522

1623
static std::vector<uint32_t> MakeNumbers() {
1724
return std::vector<uint32_t>
@@ -40,6 +47,17 @@ static std::vector<uint64_t> MakeUUIDs() {
4047
0x3507213c178649f9llu, 0x9faf035d662f60aellu};
4148
}
4249

50+
#define BINARY_STRING(x) std::string_view(x, sizeof(x) - 1)
51+
52+
static const auto LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY =
53+
"\x01\x00\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x00\x00\x00"
54+
"\x09\x00\x00\x00\x00\x00\x00\x00\x00\x06\x46\x6f\x6f\x42\x61\x72"
55+
"\x01\x31\x01\x32\x03\x46\x6f\x6f\x01\x34\x03\x42\x61\x72\x01\x37"
56+
"\x01\x38\x0a\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\x05\x06"
57+
"\x04\x07\x08\x04"sv;
58+
59+
}
60+
4361
// TODO: add tests for ColumnDecimal.
4462

4563
TEST(ColumnsCase, NumericInit) {
@@ -217,14 +235,26 @@ TEST(ColumnsCase, LowCardinalityWrapperString_Append_and_Read) {
217235
}
218236
}
219237

220-
#define BINARY_STRING(x) std::string_view(x, sizeof(x) - 1)
238+
TEST(ColumnsCase, ColumnLowCardinalityT_String_Clear_and_Append) {
239+
const size_t items_count = 11;
240+
ColumnLowCardinalityT<ColumnString> col;
241+
for (const auto & item : build_vector(&foobar, items_count))
242+
{
243+
col.Append(item);
244+
}
221245

222-
static const auto LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY =
223-
BINARY_STRING("\x01\x00\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x00\x00\x00"
224-
"\x09\x00\x00\x00\x00\x00\x00\x00\x00\x06\x46\x6f\x6f\x42\x61\x72"
225-
"\x01\x31\x01\x32\x03\x46\x6f\x6f\x01\x34\x03\x42\x61\x72\x01\x37"
226-
"\x01\x38\x0a\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\x05\x06"
227-
"\x04\x07\x08\x04");
246+
col.Clear();
247+
ASSERT_EQ(col.Size(), 0u);
248+
ASSERT_EQ(col.GetDictionarySize(), 1u); // null-item
249+
250+
for (const auto & item : build_vector(&foobar, items_count))
251+
{
252+
col.Append(item);
253+
}
254+
255+
ASSERT_EQ(col.Size(), items_count);
256+
ASSERT_EQ(col.GetDictionarySize(), 8u + 1); // 8 unique items from sequence + 1 null-item
257+
}
228258

229259
TEST(ColumnsCase, LowCardinalityString_Load) {
230260
const size_t items_count = 10;

0 commit comments

Comments
 (0)