diff --git a/src/paimon/common/utils/bloom_filter.cpp b/src/paimon/common/utils/bloom_filter.cpp index 1e9b9a69..85f9afcd 100644 --- a/src/paimon/common/utils/bloom_filter.cpp +++ b/src/paimon/common/utils/bloom_filter.cpp @@ -59,7 +59,10 @@ Status BloomFilter::AddHash(int32_t hash1) { auto hash2 = static_cast(static_cast(hash1) >> 16); for (int32_t i = 1; i <= num_hash_functions_; i++) { - int32_t combined_hash = hash1 + (i * hash2); + // Use uint32_t arithmetic to avoid signed overflow UB (matches Java int wrap semantics) + auto combined_hash = + static_cast(static_cast(hash1) + + (static_cast(i) * static_cast(hash2))); // hashcode should be positive, flip all the bits if it's negative if (combined_hash < 0) { combined_hash = ~combined_hash; @@ -74,7 +77,10 @@ bool BloomFilter::TestHash(int32_t hash1) const { auto hash2 = static_cast(static_cast(hash1) >> 16); for (int32_t i = 1; i <= num_hash_functions_; i++) { - int32_t combined_hash = hash1 + (i * hash2); + // Use uint32_t arithmetic to avoid signed overflow UB (matches Java int wrap semantics) + auto combined_hash = + static_cast(static_cast(hash1) + + (static_cast(i) * static_cast(hash2))); // hashcode should be positive, flip all the bits if it's negative if (combined_hash < 0) { combined_hash = ~combined_hash; diff --git a/src/paimon/common/utils/bloom_filter64.cpp b/src/paimon/common/utils/bloom_filter64.cpp index 4c740b9e..02ee654f 100644 --- a/src/paimon/common/utils/bloom_filter64.cpp +++ b/src/paimon/common/utils/bloom_filter64.cpp @@ -68,7 +68,10 @@ void BloomFilter64::AddHash(int64_t hash64) { auto hash2 = static_cast(static_cast(hash64) >> 32); for (int32_t i = 1; i <= num_hash_functions_; i++) { - int32_t combined_hash = hash1 + (i * hash2); + // Use uint32_t arithmetic to avoid signed overflow UB (matches Java int wrap semantics) + auto combined_hash = + static_cast(static_cast(hash1) + + (static_cast(i) * static_cast(hash2))); // hashcode should be positive, flip all the bits if it's negative if (combined_hash < 0) { combined_hash = ~combined_hash; @@ -83,7 +86,10 @@ bool BloomFilter64::TestHash(int64_t hash64) const { auto hash2 = static_cast(static_cast(hash64) >> 32); for (int32_t i = 1; i <= num_hash_functions_; i++) { - int32_t combined_hash = hash1 + (i * hash2); + // Use uint32_t arithmetic to avoid signed overflow UB (matches Java int wrap semantics) + auto combined_hash = + static_cast(static_cast(hash1) + + (static_cast(i) * static_cast(hash2))); // hashcode should be positive, flip all the bits if it's negative if (combined_hash < 0) { combined_hash = ~combined_hash; diff --git a/src/paimon/common/utils/delta_varint_compressor.cpp b/src/paimon/common/utils/delta_varint_compressor.cpp index e8f02d39..fdf9fa87 100644 --- a/src/paimon/common/utils/delta_varint_compressor.cpp +++ b/src/paimon/common/utils/delta_varint_compressor.cpp @@ -30,12 +30,14 @@ std::vector DeltaVarintCompressor::Compress(const std::vector& da return {}; } - // 1. Delta encoding + // 1. Delta encoding (use unsigned subtraction to avoid signed overflow UB) std::vector deltas; deltas.reserve(data.size()); deltas.push_back(data[0]); for (size_t i = 1; i < data.size(); i++) { - deltas.push_back(data[i] - data[i - 1]); + uint64_t unsigned_delta = + static_cast(data[i]) - static_cast(data[i - 1]); + deltas.push_back(static_cast(unsigned_delta)); } // 2. ZigZag + Varint @@ -61,11 +63,13 @@ Result> DeltaVarintCompressor::Decompress(const std::vector deltas.push_back(delta); } - // 2. Delta decoding + // 2. Delta decoding (use unsigned addition to avoid signed overflow UB) std::vector result(deltas.size()); result[0] = deltas[0]; for (size_t i = 1; i < result.size(); i++) { - result[i] = result[i - 1] + deltas[i]; + uint64_t reconstructed = + static_cast(result[i - 1]) + static_cast(deltas[i]); + result[i] = static_cast(reconstructed); } return result; }