Skip to content

Commit 620adbe

Browse files
committed
fix: fix undefined behavior in bitwise left-shift operations across codebase (#308)
1 parent b715776 commit 620adbe

29 files changed

Lines changed: 206 additions & 94 deletions

src/paimon/common/compression/block_decompressor.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,10 @@
2020
namespace paimon {
2121

2222
int32_t BlockDecompressor::ReadIntLE(const char* buf) {
23-
return (buf[0] & 0xFF) | ((buf[1] & 0xFF) << 8) | ((buf[2] & 0xFF) << 16) |
24-
((buf[3] & 0xFF) << 24);
23+
return static_cast<int32_t>(static_cast<uint32_t>(static_cast<uint8_t>(buf[0])) |
24+
(static_cast<uint32_t>(static_cast<uint8_t>(buf[1])) << 8) |
25+
(static_cast<uint32_t>(static_cast<uint8_t>(buf[2])) << 16) |
26+
(static_cast<uint32_t>(static_cast<uint8_t>(buf[3])) << 24));
2527
}
2628

2729
Status BlockDecompressor::ValidateLength(int32_t compressed_len, int32_t original_len) {

src/paimon/common/data/abstract_binary_writer.cpp

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -199,20 +199,18 @@ int32_t AbstractBinaryWriter::RoundNumberOfBytesToNearestWord(int32_t num_bytes)
199199
template <typename T>
200200
void AbstractBinaryWriter::WriteBytesToFixLenPart(MemorySegment* segment, int32_t field_offset,
201201
const T& bytes, int32_t len) {
202-
int64_t first_byte = len | 0x80; // first bit is 1, other bits is len
203-
int64_t seven_bytes = 0L; // real data
202+
uint64_t first_byte = static_cast<uint64_t>(len) | 0x80u; // first bit is 1, other bits is len
203+
uint64_t seven_bytes = 0; // real data
204204
if ((SystemByteOrder() == ByteOrder::PAIMON_LITTLE_ENDIAN)) {
205205
for (int32_t i = 0; i < len; i++) {
206-
seven_bytes |= ((0x00000000000000FFL & bytes[i]) << (i * 8L));
206+
seven_bytes |= (static_cast<uint64_t>(static_cast<uint8_t>(bytes[i])) << (i * 8));
207207
}
208208
} else {
209209
for (int32_t i = 0; i < len; i++) {
210-
seven_bytes |= ((0x00000000000000FFL & bytes[i]) << ((6 - i) * 8L));
210+
seven_bytes |= (static_cast<uint64_t>(static_cast<uint8_t>(bytes[i])) << ((6 - i) * 8));
211211
}
212212
}
213-
const int64_t offset_and_size =
214-
(first_byte << 56) | // NOLINT(clang-analyzer-core.UndefinedBinaryOperatorResult)
215-
seven_bytes;
213+
const auto offset_and_size = static_cast<int64_t>(first_byte << 56 | seven_bytes);
216214
segment->PutValue<int64_t>(field_offset, offset_and_size);
217215
}
218216

src/paimon/common/data/binary_array_writer.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,8 @@ int32_t BinaryArrayWriter::GetFieldOffset(int32_t pos) const {
9595
}
9696

9797
void BinaryArrayWriter::SetOffsetAndSize(int32_t pos, int32_t offset, int64_t size) {
98-
const int64_t offset_and_size = (static_cast<int64_t>(offset) << 32) | size;
98+
const auto offset_and_size =
99+
static_cast<int64_t>((static_cast<uint64_t>(offset) << 32) | static_cast<uint64_t>(size));
99100
segment_.PutValue<int64_t>(GetElementOffset(pos, 8), offset_and_size);
100101
}
101102

src/paimon/common/data/binary_row.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,9 @@
2828

2929
namespace paimon {
3030
const int64_t BinaryRow::FIRST_BYTE_ZERO =
31-
(SystemByteOrder() == ByteOrder::PAIMON_LITTLE_ENDIAN) ? (~0xFFL) : (~(0xFFL << 56L));
31+
(SystemByteOrder() == ByteOrder::PAIMON_LITTLE_ENDIAN)
32+
? static_cast<int64_t>(~static_cast<uint64_t>(0xFF))
33+
: static_cast<int64_t>(~(static_cast<uint64_t>(0xFF) << 56));
3234

3335
const BinaryRow& BinaryRow::EmptyRow() {
3436
static const BinaryRow empty_row = GetEmptyRow();

src/paimon/common/data/binary_row_writer.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,8 @@ class BinaryRowWriter : public AbstractBinaryWriter {
100100
}
101101

102102
void SetOffsetAndSize(int32_t pos, int32_t offset, int64_t size) override {
103-
const int64_t offset_and_size = (static_cast<int64_t>(offset) << 32) | size;
103+
const auto offset_and_size = static_cast<int64_t>((static_cast<uint64_t>(offset) << 32) |
104+
static_cast<uint64_t>(size));
104105
segment_.PutValue<int64_t>(GetFieldOffset(pos), offset_and_size);
105106
}
106107

src/paimon/common/data/columnar/columnar_array.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,11 @@ Decimal ColumnarArray::GetDecimal(int32_t pos, int32_t precision, int32_t scale)
4646
auto array = arrow::internal::checked_cast<const ArrayType*>(array_);
4747
assert(array);
4848
arrow::Decimal128 decimal(array->GetValue(offset_ + pos));
49-
return Decimal(precision, scale,
50-
static_cast<Decimal::int128_t>(decimal.high_bits()) << 64 | decimal.low_bits());
49+
return Decimal(
50+
precision, scale,
51+
static_cast<Decimal::int128_t>(
52+
static_cast<Decimal::uint128_t>(static_cast<uint64_t>(decimal.high_bits())) << 64 |
53+
decimal.low_bits()));
5154
}
5255

5356
Timestamp ColumnarArray::GetTimestamp(int32_t pos, int32_t precision) const {

src/paimon/common/data/columnar/columnar_row.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,9 @@
2626
#include "arrow/util/checked_cast.h"
2727
#include "arrow/util/decimal.h"
2828
#include "paimon/common/data/columnar/columnar_array.h"
29+
#include "paimon/common/data/columnar/columnar_batch_context.h"
2930
#include "paimon/common/data/columnar/columnar_map.h"
31+
#include "paimon/common/data/columnar/columnar_row_ref.h"
3032
#include "paimon/common/utils/date_time_utils.h"
3133

3234
namespace paimon {
@@ -35,8 +37,11 @@ Decimal ColumnarRow::GetDecimal(int32_t pos, int32_t precision, int32_t scale) c
3537
auto array = arrow::internal::checked_cast<const ArrayType*>(array_vec_[pos]);
3638
assert(array);
3739
arrow::Decimal128 decimal(array->GetValue(row_id_));
38-
return Decimal(precision, scale,
39-
static_cast<Decimal::int128_t>(decimal.high_bits()) << 64 | decimal.low_bits());
40+
return Decimal(
41+
precision, scale,
42+
static_cast<Decimal::int128_t>(
43+
static_cast<Decimal::uint128_t>(static_cast<uint64_t>(decimal.high_bits())) << 64 |
44+
decimal.low_bits()));
4045
}
4146

4247
Timestamp ColumnarRow::GetTimestamp(int32_t pos, int32_t precision) const {
@@ -57,6 +62,9 @@ Timestamp ColumnarRow::GetTimestamp(int32_t pos, int32_t precision) const {
5762
std::shared_ptr<InternalRow> ColumnarRow::GetRow(int32_t pos, int32_t num_fields) const {
5863
auto struct_array = arrow::internal::checked_cast<const arrow::StructArray*>(array_vec_[pos]);
5964
assert(struct_array);
65+
// NOTE: For performance, the returned nested row does NOT hold shared ownership of the parent
66+
// StructArray. Callers must ensure the parent ColumnarRow (or its underlying RecordBatch)
67+
// outlives the returned row to avoid dangling pointers.
6068
return std::make_shared<ColumnarRow>(struct_array->fields(), pool_, row_id_);
6169
}
6270

src/paimon/common/data/columnar/columnar_row.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,19 @@ class MemoryPool;
4646
/// Columnar row to support access to vector column data. It is a row view in arrow::Array.
4747
class ColumnarRow : public InternalRow {
4848
public:
49+
/// @brief Construct a ColumnarRow without holding ownership of the underlying arrays.
50+
/// @warning The caller MUST ensure the data source (e.g., RecordBatch or parent StructArray)
51+
/// outlives this ColumnarRow. The internal array_vec_ stores raw pointers only; if the
52+
/// source is freed first, these pointers will dangle. This design is intentional for
53+
/// performance—avoiding per-row shared_ptr ref-count overhead on the hot read path.
4954
ColumnarRow(const arrow::ArrayVector& array_vec, const std::shared_ptr<MemoryPool>& pool,
5055
int64_t row_id)
5156
: ColumnarRow(/*struct_array holder*/ nullptr, array_vec, pool, row_id) {}
5257

58+
/// @brief Construct a ColumnarRow that holds shared ownership of a StructArray.
59+
/// @note When struct_array is non-null it keeps the underlying buffers alive, making it safe
60+
/// to outlive the original batch. Prefer this overload when the row may escape the scope of
61+
/// its parent container.
5362
ColumnarRow(const std::shared_ptr<arrow::StructArray>& struct_array,
5463
const arrow::ArrayVector& array_vec, const std::shared_ptr<MemoryPool>& pool,
5564
int64_t row_id)

src/paimon/common/data/columnar/columnar_row_ref.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,11 @@ Decimal ColumnarRowRef::GetDecimal(int32_t pos, int32_t precision, int32_t scale
3434
auto array = arrow::internal::checked_cast<const ArrayType*>(ctx_->array_vec[pos].get());
3535
assert(array);
3636
arrow::Decimal128 decimal(array->GetValue(row_id_));
37-
return Decimal(precision, scale,
38-
static_cast<Decimal::int128_t>(decimal.high_bits()) << 64 | decimal.low_bits());
37+
return Decimal(
38+
precision, scale,
39+
static_cast<Decimal::int128_t>(
40+
static_cast<Decimal::uint128_t>(static_cast<uint64_t>(decimal.high_bits())) << 64 |
41+
decimal.low_bits()));
3942
}
4043

4144
Timestamp ColumnarRowRef::GetTimestamp(int32_t pos, int32_t precision) const {

src/paimon/common/data/columnar/columnar_utils.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,17 @@ class ColumnarUtils {
6565
auto value_type_id = dict_type->value_type()->id();
6666
auto index_type_id = dict_type->index_type()->id();
6767
int64_t dict_index = -1;
68-
if (index_type_id == arrow::Type::type::INT32) {
68+
if (index_type_id == arrow::Type::type::INT8) {
69+
auto indices =
70+
arrow::internal::checked_cast<arrow::Int8Array*>(typed_array->indices().get());
71+
assert(indices);
72+
dict_index = indices->Value(pos);
73+
} else if (index_type_id == arrow::Type::type::INT16) {
74+
auto indices =
75+
arrow::internal::checked_cast<arrow::Int16Array*>(typed_array->indices().get());
76+
assert(indices);
77+
dict_index = indices->Value(pos);
78+
} else if (index_type_id == arrow::Type::type::INT32) {
6979
auto indices =
7080
arrow::internal::checked_cast<arrow::Int32Array*>(typed_array->indices().get());
7181
assert(indices);

0 commit comments

Comments
 (0)