Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 32 additions & 10 deletions cpp/src/parquet/encryption/crypto_factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@
#pragma once

#include <memory>
#include <unordered_map>

#include "parquet/encryption/encryption.h"
#include "parquet/encryption/file_key_wrapper.h"
#include "parquet/encryption/key_toolkit.h"
#include "parquet/encryption/kms_client_factory.h"
#include "parquet/platform.h"
#include "parquet/types.h"

namespace parquet::encryption {

Expand Down Expand Up @@ -96,7 +98,7 @@ struct PARQUET_EXPORT ColumnEncryptionAttributes {
std::string key_id;
};

/// Encryption Configuration for use with External Encryption services.
/// Encryption Configuration for use with External Encryptions.
/// Extends the already existing EncryptionConfiguration with more context and with
/// the capability of specifying encryption algorithm per column.
struct PARQUET_EXPORT ExternalEncryptionConfiguration : public EncryptionConfiguration {
Expand All @@ -118,21 +120,21 @@ struct PARQUET_EXPORT ExternalEncryptionConfiguration : public EncryptionConfigu
/// If a column name appears in both, an exception will be thrown.
std::unordered_map<std::string, ColumnEncryptionAttributes> per_column_encryption;

/// External encryption services may use additional context provided by the application to
/// enforce robust access control. The values sent to the external service depend on each
/// External encryptors may use additional context provided by the application to
/// enforce robust access control. The values sent to the external encryptor depend on each
/// implementation.
/// This value must be a valid JSON-formatted string.
/// Validation of the string will be done by the external encryption service, Arrow will only
/// Validation of the string will be done by the external encryptor, Arrow will only
/// forward this value.
/// Format: "{\"user_id\": \"abc123\", \"location\": {\"lat\": 9.7489, \"lon\": -83.7534}}"
std::string app_context;

/// Key/value map of the location of configuration files needed by the external
/// encryption service. This may include location of a dynamically-linked library, or the
/// location of a file where the external service can find urls, certificates, and parameters
/// needed to make a remote service call.
/// encryptors. This may include location of a dynamically-linked library, or the
/// location of a file where the external encryptor can find urls, certificates, and parameters
/// needed to make a remote call.
/// For security, these values should never be sent in this config, only the locations of
/// the files that the external service will know how to access.
/// the files that the external encryptor will know how to access.
std::unordered_map<std::string, std::string> connection_config;
};

Expand All @@ -143,6 +145,26 @@ struct PARQUET_EXPORT DecryptionConfiguration {
double cache_lifetime_seconds = kDefaultCacheLifetimeSeconds;
};

struct PARQUET_EXPORT ExternalDecryptionConfiguration : public DecryptionConfiguration {
/// External decryptors may use additional context provided by the application to
/// enforce robust access control. The values sent to the external decryptor depend on each
/// implementation.
/// This value must be a valid JSON-formatted string.
/// Validation of the string will be done by the external decryptors, Arrow will only
/// forward this value.
/// Format: "{\"user_id\": \"abc123\", \"location\": {\"lat\": 9.7489, \"lon\": -83.7534}}"
std::string app_context;

/// Map of the encryption algorithms to the key/value map of the location of configuration files
/// needed by the external decryptors. This may include location of a dynamically-linked
/// library, or the location of a file where the external decryptor can find urls, certificates,
/// and parameters needed to make a remote call.
/// For security, these values should never be sent in this config, only the locations of
/// the files that the external decryptor will know how to access.
std::unordered_map<ParquetCipher::type, std::unordered_map<std::string, std::string>>
connection_config;
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still think this should be "instantiation_config" :) (not every external agent will be network based). But again, I'm OK with a later conversation on the topic (independent of this PR)

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Punting for later discussion.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we can do a global renaming if we find it necessary. Can be an isolated change.

};

/// This is a core class, that translates the parameters of high level encryption (like
/// the names of encrypted columns, names of master keys, etc), into parameters of low
/// level encryption (like the key metadata, DEK, etc). A factory that produces the low
Expand All @@ -162,8 +184,8 @@ class PARQUET_EXPORT CryptoFactory {
const EncryptionConfiguration& encryption_config, const std::string& file_path = "",
const std::shared_ptr<::arrow::fs::FileSystem>& file_system = NULLPTR);

/// Get the external encryption properties for a Parquet file. Used when encryption
/// will be provided by an external service.
/// Get the external encryption properties for a Parquet file. Used when an external encryptor
/// will be used to encrypt the file.
std::shared_ptr<ExternalFileEncryptionProperties> GetExternalFileEncryptionProperties(
const KmsConnectionConfig& kms_connection_config,
const ExternalEncryptionConfiguration& external_encryption_config,
Expand Down
67 changes: 63 additions & 4 deletions cpp/src/parquet/encryption/encryption.cc
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,58 @@ FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::aad_prefix
return this;
}

ExternalFileDecryptionProperties::Builder* ExternalFileDecryptionProperties::Builder::app_context(
const std::string& context) {
if (!app_context_.empty()) {
throw ParquetException("App context already set");
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add one-line comment to say in what conditions could the app_context hit this condition. Basically what this is guarding against.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is also following Arrow's builder patterns, it only allows the builder to set a particular property once.

}

if (context.empty()) {
return this;
}

app_context_ = context;
return this;
}

ExternalFileDecryptionProperties::Builder*
ExternalFileDecryptionProperties::Builder::connection_config(
const std::map<ParquetCipher::type, std::map<std::string, std::string>>& config) {
if (connection_config_.size() != 0) {
throw ParquetException("Connection config already set");
}

if (config.size() == 0) {
return this;
}

connection_config_ = config;
return this;
}

std::shared_ptr<ExternalFileDecryptionProperties>
ExternalFileDecryptionProperties::Builder::build_external() {
return std::shared_ptr<ExternalFileDecryptionProperties>(new ExternalFileDecryptionProperties(
footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_,
aad_prefix_verifier_, column_decryption_properties_, plaintext_files_allowed_,
app_context_, connection_config_));
}

ExternalFileDecryptionProperties::ExternalFileDecryptionProperties(
const std::string& footer_key,
std::shared_ptr<DecryptionKeyRetriever> key_retriever,
bool check_plaintext_footer_integrity, const std::string& aad_prefix,
std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier,
const ColumnPathToDecryptionPropertiesMap& column_decryption_properties,
bool plaintext_files_allowed,
const std::string& app_context,
const std::map<ParquetCipher::type, std::map<std::string, std::string>>& connection_config)
: FileDecryptionProperties(footer_key, key_retriever, check_plaintext_footer_integrity,
aad_prefix, aad_prefix_verifier, column_decryption_properties,
plaintext_files_allowed),
app_context_(app_context),
connection_config_(connection_config) {}

ColumnDecryptionProperties::Builder* ColumnDecryptionProperties::Builder::key(
const std::string& key) {
if (key.empty()) return this;
Expand All @@ -145,9 +197,15 @@ ColumnDecryptionProperties::Builder* ColumnDecryptionProperties::Builder::key(
return this;
}

ColumnDecryptionProperties::Builder* ColumnDecryptionProperties::Builder::parquet_cipher(
ParquetCipher::type parquet_cipher) {
parquet_cipher_ = parquet_cipher;
return this;
}

std::shared_ptr<ColumnDecryptionProperties> ColumnDecryptionProperties::Builder::build() {
return std::shared_ptr<ColumnDecryptionProperties>(
new ColumnDecryptionProperties(column_path_, key_));
new ColumnDecryptionProperties(column_path_, key_, parquet_cipher_));
}

FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_metadata(
Expand Down Expand Up @@ -211,9 +269,10 @@ ColumnEncryptionProperties::ColumnEncryptionProperties(
key_ = key;
}

ColumnDecryptionProperties::ColumnDecryptionProperties(const std::string& column_path,
const std::string& key)
: column_path_(column_path) {
ColumnDecryptionProperties::ColumnDecryptionProperties(
const std::string& column_path, const std::string& key,
std::optional<ParquetCipher::type> parquet_cipher)
: column_path_(column_path), parquet_cipher_(parquet_cipher) {
Comment on lines +272 to +275
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is it that parquet_cipher is optional?

I understand if the config (passed from the App, IIRC) does not have a per-column cipher already in-place, but when we resolve the properties, it should be part of each column, isn't it? (maybe I'm missing something).

If it must not be optional, I believe the proper order of the params would be column_path, parquet_cipher, key.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We thought about propagating the parquet_cipher to each column even if no per_column_encryption was specified, but decided against it.

We don't want to change too much Arrow's current way of life, and the parquet_cipher is defined somewhere else for all these cases.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got that. But it reads a bit weird that parquet_cipher is optional. Could you add a comment why this is? (basically that is an overwrite of the file-level algorithm value) <= right?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In encryption.h I already have put a comment above its definition saying that if the value is not set, then the ParquetCipher defined in the FileEncryptionProperties or the InternalFileDecryptor will be used.

I added a bit more comment to make it clearer though.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know this PR has been merged, but I think we need a bit of discussion still. We're conflating two dimensions here: (a) how users express the configuration (i.e. how configuration is generated by the App), (b) how the configuration is 'parsed' into an Arrow-usable object. Having the hierarchical stuff at the App level is fine IMO. Having to understand the hierarchical stuff "everywhere" that the config is read, not so OK IMO.

DCHECK(!column_path.empty());

if (!key.empty()) {
Expand Down
62 changes: 58 additions & 4 deletions cpp/src/parquet/encryption/encryption.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,9 @@ class PARQUET_EXPORT ColumnEncryptionProperties {
Builder* key_id(const std::string& key_id);

/// Set ParquetCipher type to use.
/// This field is declared as optional. If the value is not set, then the ParquetCipher
/// declared in the FileEncryptionProperties will be used.
/// This field is declared as optional, present when per column encryption was used. If the
/// value is not set, then the ParquetCipher declared in the FileEncryptionProperties will be
/// used.
Builder* parquet_cipher(ParquetCipher::type parquet_cipher);

std::shared_ptr<ColumnEncryptionProperties> build() {
Expand Down Expand Up @@ -189,11 +190,17 @@ class PARQUET_EXPORT ColumnDecryptionProperties {
/// key length must be either 16, 24 or 32 bytes.
Builder* key(const std::string& key);

/// Set ParquetCipher type to use.
/// This field is declared as optional, present when per column encryption was used. If the
/// value is not set, then the ParquetCipher declared in the InternalFileDecryptor will be used.
Builder* parquet_cipher(ParquetCipher::type parquet_cipher);

std::shared_ptr<ColumnDecryptionProperties> build();

private:
const std::string column_path_;
std::string key_;
std::optional<ParquetCipher::type> parquet_cipher_;
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or maybe the comment on the overwrite could go here. (Please 2-check my assumption that is this actually the column-level overwrite of the file-level attribute. If it's something else, let me know)

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comments are above the builder attributes (following the rest of the code).

};

ColumnDecryptionProperties() = default;
Expand All @@ -205,15 +212,20 @@ class PARQUET_EXPORT ColumnDecryptionProperties {
std::string column_path() const { return column_path_; }
std::string key() const { return key_; }

/// Check whether the optional has a value before using.
std::optional<ParquetCipher::type> parquet_cipher() const { return parquet_cipher_; }

private:
const std::string column_path_;
std::string key_;
std::optional<ParquetCipher::type> parquet_cipher_;

/// This class is only required for setting explicit column decryption keys -
/// to override key retriever (or to provide keys when key metadata and/or
/// key retriever are not available)
explicit ColumnDecryptionProperties(const std::string& column_path,
const std::string& key);
const std::string& key,
std::optional<ParquetCipher::type> parquet_cipher);
};

class PARQUET_EXPORT AADPrefixVerifier {
Expand Down Expand Up @@ -304,7 +316,7 @@ class PARQUET_EXPORT FileDecryptionProperties {
aad_prefix_verifier_, column_decryption_properties_, plaintext_files_allowed_));
}

private:
protected:
std::string footer_key_;
std::string aad_prefix_;
std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
Expand Down Expand Up @@ -349,6 +361,7 @@ class PARQUET_EXPORT FileDecryptionProperties {
bool check_plaintext_footer_integrity_;
bool plaintext_files_allowed_;

protected:
FileDecryptionProperties(
const std::string& footer_key,
std::shared_ptr<DecryptionKeyRetriever> key_retriever,
Expand All @@ -358,6 +371,47 @@ class PARQUET_EXPORT FileDecryptionProperties {
bool plaintext_files_allowed);
};

class PARQUET_EXPORT ExternalFileDecryptionProperties : public FileDecryptionProperties {
public:
class PARQUET_EXPORT Builder : public FileDecryptionProperties::Builder {
public:
Builder() : FileDecryptionProperties::Builder() {}

Builder* app_context(const std::string& context);

Builder* connection_config(
const std::map<ParquetCipher::type, std::map<std::string, std::string>>& config);

std::shared_ptr<ExternalFileDecryptionProperties> build_external();

private:
std::string app_context_;
std::map<ParquetCipher::type, std::map<std::string, std::string>> connection_config_;
};

const std::string& app_context() const {
return app_context_;
}

const std::map<ParquetCipher::type, std::map<std::string, std::string>>& connection_config() const {
return connection_config_;
}

private:
std::string app_context_;
std::map<ParquetCipher::type, std::map<std::string, std::string>> connection_config_;

ExternalFileDecryptionProperties(
const std::string& footer_key,
std::shared_ptr<DecryptionKeyRetriever> key_retriever,
bool check_plaintext_footer_integrity, const std::string& aad_prefix,
std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier,
const ColumnPathToDecryptionPropertiesMap& column_decryption_properties,
bool plaintext_files_allowed,
const std::string& app_context,
const std::map<ParquetCipher::type, std::map<std::string, std::string>>& connection_config);
};

class PARQUET_EXPORT FileEncryptionProperties {
public:
class PARQUET_EXPORT Builder {
Expand Down
20 changes: 20 additions & 0 deletions cpp/src/parquet/encryption/properties_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,26 @@ TEST(TestColumnEncryptionProperties, ColumnParquetCipherSpecified) {
ASSERT_EQ(ParquetCipher::AES_GCM_CTR_V1, properties->parquet_cipher().value());
}

TEST(TestColumnDecryptionProperties, ColumnParquetCipherNotSpecified) {
std::string column_path = "column_path";
ColumnDecryptionProperties::Builder column_builder(column_path);
std::shared_ptr<ColumnDecryptionProperties> properties = column_builder.build();

ASSERT_EQ(column_path, properties->column_path());
ASSERT_EQ(false, properties->parquet_cipher().has_value());
}

TEST(TestColumnDecryptionProperties, ColumnParquetCipherSpecified) {
std::string column_path = "column_path";
ColumnDecryptionProperties::Builder column_builder(column_path);
column_builder.parquet_cipher(ParquetCipher::AES_GCM_CTR_V1);
std::shared_ptr<ColumnDecryptionProperties> properties = column_builder.build();

ASSERT_EQ(column_path, properties->column_path());
ASSERT_EQ(true, properties->parquet_cipher().has_value());
ASSERT_EQ(ParquetCipher::AES_GCM_CTR_V1, properties->parquet_cipher().value());
}

// Encrypt all columns and the footer with the same key.
// (uniform encryption)
TEST(TestEncryptionProperties, UniformEncryption) {
Expand Down
Loading