Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions src/common/dbpa_local_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
// under the License.

#include "dbpa_local.h"
#include "../processing/parquet_utils.h"
#include <gtest/gtest.h>
#include <memory>
#include <vector>
Expand Down Expand Up @@ -45,9 +46,9 @@ TEST_F(LocalDataBatchProtectionAgentTest, SuccessfulEncryption) {
std::string app_context = R"({"user_id": "test_user"})";

EXPECT_NO_THROW(agent.init("test_column", configuration_map, app_context, "test_key",
Type::UNDEFINED, std::nullopt, CompressionCodec::UNCOMPRESSED, std::nullopt));
Type::BYTE_ARRAY, std::nullopt, CompressionCodec::UNCOMPRESSED, std::nullopt));

std::vector<uint8_t> test_data = {1, 2, 3, 4};
std::vector<uint8_t> test_data = BuildByteArrayValueBytes("test_ABC");
std::map<std::string, std::string> encoding_attributes = {{"page_encoding", "PLAIN"}, {"page_type", "DICTIONARY_PAGE"}};
auto result = agent.Encrypt(test_data, encoding_attributes);

Expand Down Expand Up @@ -88,9 +89,9 @@ TEST_F(LocalDataBatchProtectionAgentTest, SuccessfulDecryption) {
std::string app_context = R"({"user_id": "test_user"})";

EXPECT_NO_THROW(agent.init("test_column", configuration_map, app_context, "test_key",
Type::UNDEFINED, std::nullopt, CompressionCodec::UNCOMPRESSED, DBPS_ENCRYPTION_METADATA));
Type::BYTE_ARRAY, std::nullopt, CompressionCodec::UNCOMPRESSED, DBPS_ENCRYPTION_METADATA));

std::vector<uint8_t> test_data = {1, 2, 3, 4};
std::vector<uint8_t> test_data = BuildByteArrayValueBytes("test_EFG");
std::map<std::string, std::string> encoding_attributes = {{"page_encoding", "PLAIN"}, {"page_type", "DICTIONARY_PAGE"}};
auto result = agent.Decrypt(test_data, encoding_attributes);

Expand All @@ -107,10 +108,10 @@ TEST_F(LocalDataBatchProtectionAgentTest, RoundTripEncryptDecrypt) {
std::string app_context = R"({"user_id": "test_user"})";

EXPECT_NO_THROW(encrypt_agent.init("test_column", configuration_map, app_context, "test_key",
Type::UNDEFINED, std::nullopt, CompressionCodec::UNCOMPRESSED, std::nullopt));
Type::BYTE_ARRAY, std::nullopt, CompressionCodec::UNCOMPRESSED, std::nullopt));

// Original data to encrypt
std::vector<uint8_t> original_data = {1, 2, 3, 4, 5};
std::vector<uint8_t> original_data = BuildByteArrayValueBytes("roundtrip_XYZ");
std::map<std::string, std::string> encoding_attributes = {{"page_encoding", "PLAIN"}, {"page_type", "DICTIONARY_PAGE"}};

// Encrypt the data
Expand All @@ -133,7 +134,7 @@ TEST_F(LocalDataBatchProtectionAgentTest, RoundTripEncryptDecrypt) {
// Create a new agent for decryption with the encryption_metadata from the encryption result
LocalDataBatchProtectionAgent decrypt_agent;
EXPECT_NO_THROW(decrypt_agent.init("test_column", configuration_map, app_context, "test_key",
Type::UNDEFINED, std::nullopt, CompressionCodec::UNCOMPRESSED, encryption_metadata));
Type::BYTE_ARRAY, std::nullopt, CompressionCodec::UNCOMPRESSED, encryption_metadata));

// Decrypt the ciphertext
auto decrypt_result = decrypt_agent.Decrypt(ciphertext, encoding_attributes);
Expand Down
3 changes: 1 addition & 2 deletions src/common/enum_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ constexpr std::optional<E> from_string_impl(
// For dbps::external::Type
namespace {
using T = ::dbps::external::Type::type;
inline constexpr std::array<std::pair<T, std::string_view>, 9> kTypePairs{{
inline constexpr std::array<std::pair<T, std::string_view>, 8> kTypePairs{{
{T::BOOLEAN, "BOOLEAN"},
{T::INT32, "INT32"},
{T::INT64, "INT64"},
Expand All @@ -61,7 +61,6 @@ inline constexpr std::array<std::pair<T, std::string_view>, 9> kTypePairs{{
{T::DOUBLE, "DOUBLE"},
{T::BYTE_ARRAY, "BYTE_ARRAY"},
{T::FIXED_LEN_BYTE_ARRAY, "FIXED_LEN_BYTE_ARRAY"},
{T::UNDEFINED, "UNDEFINED"},
}};
} // anon

Expand Down
21 changes: 6 additions & 15 deletions src/common/enum_utils_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ TEST(EnumUtils, TypeToStringConversion) {
ASSERT_EQ("DOUBLE", std::string(to_string(Type::DOUBLE)));
ASSERT_EQ("BYTE_ARRAY", std::string(to_string(Type::BYTE_ARRAY)));
ASSERT_EQ("FIXED_LEN_BYTE_ARRAY", std::string(to_string(Type::FIXED_LEN_BYTE_ARRAY)));
ASSERT_EQ("UNDEFINED", std::string(to_string(Type::UNDEFINED)));
}

TEST(EnumUtils, TypeFromStringConversion) {
Expand Down Expand Up @@ -69,10 +68,6 @@ TEST(EnumUtils, TypeFromStringConversion) {
result = to_datatype_enum("FIXED_LEN_BYTE_ARRAY");
ASSERT_TRUE(result.has_value());
ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, result.value());

result = to_datatype_enum("UNDEFINED");
ASSERT_TRUE(result.has_value());
ASSERT_EQ(Type::UNDEFINED, result.value());
}

TEST(EnumUtils, TypeInvalidFromString) {
Expand Down Expand Up @@ -246,7 +241,7 @@ TEST(EnumUtils, RoundTripTypeConversion) {
// Test all Type enum values
Type::type types[] = {
Type::BOOLEAN, Type::INT32, Type::INT64, Type::INT96,
Type::FLOAT, Type::DOUBLE, Type::BYTE_ARRAY, Type::FIXED_LEN_BYTE_ARRAY, Type::UNDEFINED
Type::FLOAT, Type::DOUBLE, Type::BYTE_ARRAY, Type::FIXED_LEN_BYTE_ARRAY
};

for (auto type : types) {
Expand Down Expand Up @@ -347,7 +342,7 @@ TEST(EnumUtils, TypeEnumCompleteness) {
// Define all known Type enum values
Type::type all_types[] = {
Type::BOOLEAN, Type::INT32, Type::INT64, Type::INT96,
Type::FLOAT, Type::DOUBLE, Type::BYTE_ARRAY, Type::FIXED_LEN_BYTE_ARRAY, Type::UNDEFINED
Type::FLOAT, Type::DOUBLE, Type::BYTE_ARRAY, Type::FIXED_LEN_BYTE_ARRAY
};

// Test that every enum value can be converted to string and back
Expand Down Expand Up @@ -406,12 +401,12 @@ TEST(EnumUtils, StringUniqueness) {
// Collect all Type strings
Type::type all_types[] = {
Type::BOOLEAN, Type::INT32, Type::INT64, Type::INT96,
Type::FLOAT, Type::DOUBLE, Type::BYTE_ARRAY, Type::FIXED_LEN_BYTE_ARRAY, Type::UNDEFINED
Type::FLOAT, Type::DOUBLE, Type::BYTE_ARRAY, Type::FIXED_LEN_BYTE_ARRAY
};
for (auto type : all_types) {
type_strings.insert(std::string(to_string(type)));
}
ASSERT_EQ(9, type_strings.size()); // All strings should be unique
ASSERT_EQ(8, type_strings.size()); // All strings should be unique

// Collect all CompressionCodec strings
CompressionCodec::type all_codecs[] = {
Expand Down Expand Up @@ -444,7 +439,7 @@ TEST(EnumUtils, CrossEnumStringCollision) {
// Collect all strings from all enums
Type::type all_types[] = {
Type::BOOLEAN, Type::INT32, Type::INT64, Type::INT96,
Type::FLOAT, Type::DOUBLE, Type::BYTE_ARRAY, Type::FIXED_LEN_BYTE_ARRAY, Type::UNDEFINED
Type::FLOAT, Type::DOUBLE, Type::BYTE_ARRAY, Type::FIXED_LEN_BYTE_ARRAY
};
for (auto type : all_types) {
all_strings.insert(std::string(to_string(type)));
Expand All @@ -467,12 +462,8 @@ TEST(EnumUtils, CrossEnumStringCollision) {
all_strings.insert(std::string(to_string(encoding)));
}

// Verify two equally named enums are handled correctly.
ASSERT_EQ("UNDEFINED", std::string(to_string(Type::UNDEFINED)));
ASSERT_EQ("UNDEFINED", std::string(to_string(Encoding::UNDEFINED)));

// Total should be 9 + 10 + 11 = 30 unique strings, but we have 1 collision
// (Type::UNDEFINED and Encoding::UNDEFINED both map to "UNDEFINED")
// So we expect 29 unique strings
// Total should be 8 + 10 + 11 = 29 unique strings
ASSERT_EQ(29, all_strings.size());
}
3 changes: 1 addition & 2 deletions src/common/enums.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,7 @@ struct Type {
FLOAT = 4,
DOUBLE = 5,
BYTE_ARRAY = 6,
FIXED_LEN_BYTE_ARRAY = 7,
UNDEFINED = 8
FIXED_LEN_BYTE_ARRAY = 7
};
};

Expand Down
38 changes: 5 additions & 33 deletions src/processing/encryption_sequencer_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include "encryption_sequencer.h"
#include "compression_utils.h"
#include "parquet_utils.h"
#include "../common/enums.h"
#include "../common/bytes_utils.h"
#include <iostream>
Expand All @@ -30,40 +31,11 @@ using namespace dbps::compression;

using namespace dbps::external;

// TODO: Move this to a common test utility file.
// Helpers that prefix each string (or raw byte payload) with a 4-byte
// little-endian length. Needed because this is how Parquet encodings
// represent BYTE_ARRAY data.
// Serializes every string as a 4-byte little-endian length followed by the
// string's bytes, concatenating all entries into one buffer.
// An empty input list produces an empty buffer.
std::vector<uint8_t> EncodeStringByteArray(const std::vector<std::string>& strings) {
    std::vector<uint8_t> encoded;
    for (const std::string& text : strings) {
        const uint32_t length = static_cast<uint32_t>(text.size());
        // Emit the length least-significant byte first.
        for (int shift = 0; shift < 32; shift += 8) {
            encoded.push_back(static_cast<uint8_t>((length >> shift) & 0xFF));
        }
        // Payload bytes follow the length prefix unchanged.
        encoded.insert(encoded.end(), text.begin(), text.end());
    }
    return encoded;
}

// Frames a raw byte payload as one length-prefixed value: a 4-byte
// little-endian length followed by the payload bytes.
std::vector<uint8_t> EncodePlainByteArray(const std::vector<uint8_t>& payload) {
    const auto length = static_cast<uint32_t>(payload.size());
    // Start with the length prefix, least-significant byte first.
    std::vector<uint8_t> framed{
        static_cast<uint8_t>(length & 0xFF),
        static_cast<uint8_t>((length >> 8) & 0xFF),
        static_cast<uint8_t>((length >> 16) & 0xFF),
        static_cast<uint8_t>((length >> 24) & 0xFF)};
    framed.insert(framed.end(), payload.begin(), payload.end());
    return framed;
}

// Test data constants - pure binary data
const std::vector<uint8_t> HELLO_WORLD_DATA = EncodeStringByteArray({"Hello, World!"});
const std::vector<uint8_t> BINARY_DATA = EncodePlainByteArray({0x00, 0x01, 0x02, 0x03, 0x04, 0x05});
const std::vector<uint8_t> SINGLE_CHAR_DATA = EncodeStringByteArray({"A"});
const std::vector<uint8_t> HELLO_WORLD_DATA = BuildByteArrayValueBytes("Hello, World!");
const std::vector<uint8_t> BINARY_DATA = BuildByteArrayValueBytes(
std::string("\x00\x01\x02\x03\x04\x05", 6));
const std::vector<uint8_t> SINGLE_CHAR_DATA = BuildByteArrayValueBytes("A");
const std::vector<uint8_t> EMPTY_DATA = {};
const std::vector<uint8_t> FIXED_LEN_BYTE_ARRAY_DATA = {
'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P',
Expand Down
19 changes: 17 additions & 2 deletions src/processing/parquet_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,6 @@ inline static size_t GetFixedElemSizeOrThrow(Type::type datatype, const std::opt
}
return static_cast<size_t>(datatype_length.value());
}
case Type::UNDEFINED:
return 1;
case Type::BOOLEAN:
throw InvalidInputException("BOOLEAN is bit-sized; not fixed byte-sized");
case Type::BYTE_ARRAY:
Expand Down Expand Up @@ -254,6 +252,23 @@ std::vector<uint8_t> CombineRawBytesIntoValueBytes(
return out;
}

std::vector<uint8_t> BuildByteArrayValueBytes(const std::string& payload) {
std::vector<RawValueBytes> elements;
elements.emplace_back(payload.begin(), payload.end());
return CombineRawBytesIntoValueBytes(
elements, Type::BYTE_ARRAY, std::nullopt, Encoding::PLAIN);
}

std::vector<std::string> ParseByteArrayListValueBytes(const std::vector<uint8_t>& bytes) {
TypedListValues list = ParseValueBytesIntoTypedList(
bytes, Type::BYTE_ARRAY, std::nullopt, Encoding::PLAIN);
const auto* values = std::get_if<std::vector<std::string>>(&list);
if (!values) {
throw InvalidInputException("Expected BYTE_ARRAY values");
}
return *values;
}

LevelAndValueBytes DecompressAndSplit(
const std::vector<uint8_t>& plaintext,
CompressionCodec::type compression,
Expand Down
10 changes: 10 additions & 0 deletions src/processing/parquet_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,16 @@ std::vector<uint8_t> CombineRawBytesIntoValueBytes(
const std::optional<int>& datatype_length,
Encoding::type encoding);

/**
 * Build BYTE_ARRAY value bytes for a single string payload.
 *
 * The payload is treated as the only element and serialized through
 * CombineRawBytesIntoValueBytes with Type::BYTE_ARRAY and Encoding::PLAIN.
 *
 * @param payload Bytes of the single value to serialize, carried in a std::string.
 * @return The serialized value bytes for that one element.
 */
std::vector<uint8_t> BuildByteArrayValueBytes(const std::string& payload);

/**
 * Parse BYTE_ARRAY value bytes into a list of string payloads.
 *
 * The bytes are parsed through ParseValueBytesIntoTypedList with
 * Type::BYTE_ARRAY and Encoding::PLAIN.
 *
 * @param bytes Serialized BYTE_ARRAY value bytes.
 * @return One string per parsed value, in parsed order.
 * @throws InvalidInputException if the parsed list does not hold string values.
 */
std::vector<std::string> ParseByteArrayListValueBytes(const std::vector<uint8_t>& bytes);

/**
* Decompresses and splits a Parquet page into level and value bytes.
* Handles DATA_PAGE_V1, DATA_PAGE_V2 (including optional compression on value bytes),
Expand Down
29 changes: 29 additions & 0 deletions src/processing/parquet_utils_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,35 @@ TEST(ParquetUtils, CombineRawBytesIntoValueBytes_BYTE_ARRAY) {
EXPECT_EQ(out, expected);
}

// Checks the byte layout produced for a single 3-character payload:
// a 4-byte length field followed by the payload bytes.
TEST(ParquetUtils, BuildByteArrayValueBytes_SinglePayload) {
std::vector<uint8_t> out = BuildByteArrayValueBytes("abc");
// 4 length bytes + 3 payload bytes; fatal so the indexing below stays in range.
ASSERT_EQ(out.size(), 7u);
// The u32 read at offset 0 must equal the payload length.
EXPECT_EQ(read_u32_le(out, 0), 3u);
// Payload bytes follow the length field unchanged.
EXPECT_EQ(out[4], static_cast<uint8_t>('a'));
EXPECT_EQ(out[5], static_cast<uint8_t>('b'));
EXPECT_EQ(out[6], static_cast<uint8_t>('c'));
}

// Round trip: bytes built from one payload must parse back to that same
// single string.
TEST(ParquetUtils, ParseByteArrayListValueBytes_SinglePayload) {
std::vector<uint8_t> bytes = BuildByteArrayValueBytes("hello");
std::vector<std::string> out = ParseByteArrayListValueBytes(bytes);
// Fatal size check guards the out[0] access below.
ASSERT_EQ(out.size(), 1u);
EXPECT_EQ(out[0], "hello");
}

// Two payloads of different lengths, combined as PLAIN BYTE_ARRAY value bytes,
// must parse back into two strings in the same order.
TEST(ParquetUtils, ParseByteArrayListValueBytes_MultiplePayloads) {
std::vector<RawValueBytes> elems = {
{'a','b'},
{'x','y','z'}
};
// Serialize both elements with the general combiner rather than the
// single-payload builder.
std::vector<uint8_t> bytes = CombineRawBytesIntoValueBytes(
elems, Type::BYTE_ARRAY, std::nullopt, Encoding::PLAIN);
std::vector<std::string> out = ParseByteArrayListValueBytes(bytes);
// Fatal size check guards the indexed accesses below.
ASSERT_EQ(out.size(), 2u);
EXPECT_EQ(out[0], "ab");
EXPECT_EQ(out[1], "xyz");
}

TEST(ParquetUtils, CombineRawBytesIntoValueBytes_FIXED_LEN_BYTE_ARRAY_SizeMismatch) {
// Expect length 3, but provide a 2-byte element -> should throw
std::vector<RawValueBytes> elems = {
Expand Down
34 changes: 0 additions & 34 deletions src/processing/typed_list_values.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,17 +135,6 @@ TypedListValues BuildTypedListFromRawBytes(
}
return out;
}
case Type::UNDEFINED: {
std::vector<uint8_t> out;
for (size_t i = 0; i < elements_bytes.size(); ++i) {
const std::vector<uint8_t>& bytes = elements_bytes[i];
if (bytes.size() != 1) {
throw std::runtime_error("DecryptTypedListValues: invalid UNDEFINED element size");
}
out.push_back(bytes[0]);
}
return out;
}
default:
throw std::runtime_error("DecryptTypedListValues: unsupported datatype");
}
Expand Down Expand Up @@ -203,7 +192,6 @@ const char* GetTypeName() {
else if constexpr (std::is_same_v<T, std::vector<std::array<uint32_t, 3>>>) return "INT96";
else if constexpr (std::is_same_v<T, std::vector<std::string>>)
return "string (BYTE_ARRAY/FIXED_LEN_BYTE_ARRAY)";
else if constexpr (std::is_same_v<T, std::vector<uint8_t>>) return "UNDEFINED (raw bytes)";
else if constexpr (std::is_same_v<T, std::monostate>) return "empty/error";
else return "unknown";
}
Expand All @@ -227,28 +215,6 @@ std::string TypedListToString(const TypedListValues& list) {
<< values[i][1] << ", " << values[i][2] << "]\n";
}
}
else if constexpr (std::is_same_v<T, std::vector<uint8_t>>) {
// Special case for UNDEFINED - raw bytes as hex
out << "Decoded UNDEFINED type (raw bytes):\n";
out << " Hex: ";
for (size_t i = 0; i < values.size(); ++i) {
out << std::hex << std::setw(2) << std::setfill('0')
<< static_cast<int>(values[i]);
if (i < values.size() - 1) out << " ";
}
out << std::dec << "\n"; // Reset to decimal

// Also show as string if printable
out << " String: \"";
for (uint8_t byte : values) {
if (byte >= 32 && byte < 127) {
out << static_cast<char>(byte);
} else {
out << ".";
}
}
out << "\"\n";
}
else if constexpr (std::is_same_v<T, std::vector<std::string>>) {
// String values with quotes and the length of the string.
out << "Decoded " << GetTypeName<T>() << " values:\n";
Expand Down
3 changes: 1 addition & 2 deletions src/processing/typed_list_values.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@ using TypedListValues = std::variant<
std::vector<float>,
std::vector<double>,
std::vector<std::array<uint32_t, 3>>, // For INT96
std::vector<std::string>, // For BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY
std::vector<uint8_t> // For UNDEFINED, a plain untyped byte sequence.
std::vector<std::string> // For BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY
>;

/**
Expand Down
19 changes: 0 additions & 19 deletions src/processing/typed_list_values_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,19 +124,6 @@ TEST(TypedListValuesTest, BuildRawBytesFromTypedListValues_BuildTypedListFromRaw
}
}

// Round trip for the raw-byte (std::vector<uint8_t>) alternative of
// TypedListValues: typed list -> raw per-element bytes -> typed list built
// with Type::UNDEFINED must reproduce the input.
TEST(TypedListValuesTest, BuildRawBytesFromTypedListValues_BuildTypedListFromRawBytes_RoundTrip_UNDEFINED) {
TypedListValues input = std::vector<uint8_t>{0u, 255u, 42u};
std::vector<RawValueBytes> raw = BuildRawBytesFromTypedListValues(input);
// One raw element per input byte.
ASSERT_EQ(raw.size(), 3u);
// Each raw element is expected to carry exactly one byte.
for (const auto& r : raw) {
EXPECT_EQ(r.size(), 1u);
}
auto out = BuildTypedListFromRawBytes(Type::UNDEFINED, raw);
// std::get throws if either variant holds a different alternative.
const auto& out_vec = std::get<std::vector<uint8_t>>(out);
const auto& in_vec = std::get<std::vector<uint8_t>>(input);
ASSERT_EQ(out_vec, in_vec);
}

TEST(TypedListValuesTest, BuildTypedListFromRawBytes_InvalidElementSizes_Throws) {
// INT32 wrong size
{
Expand Down Expand Up @@ -168,12 +155,6 @@ TEST(TypedListValuesTest, BuildTypedListFromRawBytes_InvalidElementSizes_Throws)
std::vector<RawValueBytes> v{r};
EXPECT_THROW((void)BuildTypedListFromRawBytes(Type::INT96, v), std::runtime_error);
}
// UNDEFINED wrong size (expects exactly 1)
{
RawValueBytes r = {0xAA, 0xBB};
std::vector<RawValueBytes> v{r};
EXPECT_THROW((void)BuildTypedListFromRawBytes(Type::UNDEFINED, v), std::runtime_error);
}
}

TEST(TypedListValuesTest, BuildTypedListFromRawBytes_UnsupportedType_Throws) {
Expand Down
Loading