diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 7aa3e58c1d3b..20b0a655d8e2 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -248,13 +248,17 @@ Result AzureOptions::FromUri(const std::string& uri_string, } bool AzureOptions::Equals(const AzureOptions& other) const { - const bool equals = blob_storage_authority == other.blob_storage_authority && - dfs_storage_authority == other.dfs_storage_authority && - blob_storage_scheme == other.blob_storage_scheme && - dfs_storage_scheme == other.dfs_storage_scheme && - default_metadata == other.default_metadata && - account_name == other.account_name && - credential_kind_ == other.credential_kind_; + const bool equals = + account_name == other.account_name && + blob_storage_authority == other.blob_storage_authority && + dfs_storage_authority == other.dfs_storage_authority && + blob_storage_scheme == other.blob_storage_scheme && + dfs_storage_scheme == other.dfs_storage_scheme && + default_metadata == other.default_metadata && + background_writes == other.background_writes && + credential_kind_ == other.credential_kind_ && account_key_ == other.account_key_ && + sas_token_ == other.sas_token_ && tenant_id_ == other.tenant_id_ && + client_id_ == other.client_id_ && client_secret_ == other.client_secret_; if (!equals) { return false; } @@ -318,39 +322,59 @@ std::string AzureOptions::AccountDfsUrl(const std::string& account_name) const { return BuildBaseUrl(dfs_storage_scheme, dfs_storage_authority, account_name); } +void AzureOptions::ClearCredentials() { + credential_kind_ = CredentialKind::kDefault; + storage_shared_key_credential_ = nullptr; + account_key_.clear(); + sas_token_.clear(); + tenant_id_.clear(); + client_id_.clear(); + client_secret_.clear(); + token_credential_ = nullptr; +} + Status AzureOptions::ConfigureDefaultCredential() { + ClearCredentials(); credential_kind_ = CredentialKind::kDefault; token_credential_ = std::make_shared(); return Status::OK(); } Status AzureOptions::ConfigureAnonymousCredential() { + ClearCredentials(); credential_kind_ = CredentialKind::kAnonymous; return Status::OK(); } Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_key) { + ClearCredentials(); credential_kind_ = CredentialKind::kStorageSharedKey; if (account_name.empty()) { return Status::Invalid("AzureOptions doesn't contain a valid account name"); } + account_key_ = account_key; storage_shared_key_credential_ = std::make_shared(account_name, account_key); return Status::OK(); } Status AzureOptions::ConfigureSASCredential(const std::string& sas_token) { - credential_kind_ = CredentialKind::kSASToken; + ClearCredentials(); if (account_name.empty()) { return Status::Invalid("AzureOptions doesn't contain a valid account name"); } sas_token_ = sas_token; + credential_kind_ = CredentialKind::kSASToken; return Status::OK(); } Status AzureOptions::ConfigureClientSecretCredential(const std::string& tenant_id, const std::string& client_id, const std::string& client_secret) { + ClearCredentials(); + tenant_id_ = tenant_id; + client_id_ = client_id; + client_secret_ = client_secret; credential_kind_ = CredentialKind::kClientSecret; token_credential_ = std::make_shared( tenant_id, client_id, client_secret); @@ -358,6 +382,8 @@ Status AzureOptions::ConfigureClientSecretCredential(const std::string& tenant_i } Status AzureOptions::ConfigureManagedIdentityCredential(const std::string& client_id) { + ClearCredentials(); + client_id_ = client_id; credential_kind_ = CredentialKind::kManagedIdentity; token_credential_ = std::make_shared(client_id); @@ -365,18 +391,21 @@ Status AzureOptions::ConfigureManagedIdentityCredential(const std::string& clien } Status AzureOptions::ConfigureCLICredential() { + ClearCredentials(); credential_kind_ = CredentialKind::kCLI; token_credential_ = std::make_shared(); return Status::OK(); } Status AzureOptions::ConfigureWorkloadIdentityCredential() { + ClearCredentials(); credential_kind_ = CredentialKind::kWorkloadIdentity; token_credential_ = std::make_shared(); return Status::OK(); } Status AzureOptions::ConfigureEnvironmentCredential() { + ClearCredentials(); credential_kind_ = CredentialKind::kEnvironment; token_credential_ = std::make_shared(); return Status::OK(); diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index ae374d487b1a..d21bcd08df81 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -99,10 +99,6 @@ struct ARROW_EXPORT AzureOptions { /// Default: "https" std::string dfs_storage_scheme = "https"; - // TODO(GH-38598): Add support for more auth methods. - // std::string connection_string; - // std::string sas_token; - /// \brief Default metadata for OpenOutputStream. /// /// This will be ignored if non-empty metadata is passed to OpenOutputStream. @@ -126,7 +122,11 @@ struct ARROW_EXPORT AzureOptions { std::shared_ptr storage_shared_key_credential_; + std::string account_key_; std::string sas_token_; + std::string tenant_id_; + std::string client_id_; + std::string client_secret_; mutable std::shared_ptr token_credential_; public: @@ -136,6 +136,7 @@ struct ARROW_EXPORT AzureOptions { private: void ExtractFromUriSchemeAndHierPart(const Uri& uri, std::string* out_path); Status ExtractFromUriQuery(const Uri& uri); + void ClearCredentials(); public: /// \brief Construct a new AzureOptions from an URI. @@ -204,6 +205,12 @@ struct ARROW_EXPORT AzureOptions { std::string AccountBlobUrl(const std::string& account_name) const; std::string AccountDfsUrl(const std::string& account_name) const; + std::string AccountKey() const { return account_key_; } + std::string SasToken() const { return sas_token_; } + std::string TenantId() const { return tenant_id_; } + std::string ClientId() const { return client_id_; } + std::string ClientSecret() const { return client_secret_; } + Result> MakeBlobServiceClient() const; diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index c3af6fb0797f..4cd425055540 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -493,6 +493,11 @@ TEST(AzureFileSystem, InitializeWithDefaultCredential) { AzureOptions options; options.account_name = "dummy-account-name"; ARROW_EXPECT_OK(options.ConfigureDefaultCredential()); + ASSERT_EQ(options.AccountKey(), ""); + ASSERT_EQ(options.SasToken(), ""); + ASSERT_EQ(options.TenantId(), ""); + ASSERT_EQ(options.ClientId(), ""); + ASSERT_EQ(options.ClientSecret(), ""); EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); } @@ -509,6 +514,23 @@ TEST(AzureFileSystem, InitializeWithAnonymousCredential) { AzureOptions options; options.account_name = "dummy-account-name"; ARROW_EXPECT_OK(options.ConfigureAnonymousCredential()); + ASSERT_EQ(options.AccountKey(), ""); + ASSERT_EQ(options.SasToken(), ""); + ASSERT_EQ(options.TenantId(), ""); + ASSERT_EQ(options.ClientId(), ""); + ASSERT_EQ(options.ClientSecret(), ""); + EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); +} + +TEST(AzureFileSystem, InitializeWithAccountKeyCredential) { + AzureOptions options; + options.account_name = "dummy-account-name"; + ARROW_EXPECT_OK(options.ConfigureAccountKeyCredential("account_key")); + ASSERT_EQ(options.AccountKey(), "account_key"); + ASSERT_EQ(options.SasToken(), ""); + ASSERT_EQ(options.TenantId(), ""); + ASSERT_EQ(options.ClientId(), ""); + ASSERT_EQ(options.ClientSecret(), ""); EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); } @@ -517,6 +539,11 @@ TEST(AzureFileSystem, InitializeWithClientSecretCredential) { options.account_name = "dummy-account-name"; ARROW_EXPECT_OK( options.ConfigureClientSecretCredential("tenant_id", "client_id", "client_secret")); + ASSERT_EQ(options.AccountKey(), ""); + ASSERT_EQ(options.SasToken(), ""); + ASSERT_EQ(options.TenantId(), "tenant_id"); + ASSERT_EQ(options.ClientId(), "client_id"); + ASSERT_EQ(options.ClientSecret(), "client_secret"); EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); } @@ -524,9 +551,19 @@ TEST(AzureFileSystem, InitializeWithManagedIdentityCredential) { AzureOptions options; options.account_name = "dummy-account-name"; ARROW_EXPECT_OK(options.ConfigureManagedIdentityCredential()); + ASSERT_EQ(options.AccountKey(), ""); + ASSERT_EQ(options.SasToken(), ""); + ASSERT_EQ(options.TenantId(), ""); + ASSERT_EQ(options.ClientId(), ""); + ASSERT_EQ(options.ClientSecret(), ""); EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); ARROW_EXPECT_OK(options.ConfigureManagedIdentityCredential("specific-client-id")); + ASSERT_EQ(options.AccountKey(), ""); + ASSERT_EQ(options.SasToken(), ""); + ASSERT_EQ(options.TenantId(), ""); + ASSERT_EQ(options.ClientId(), "specific-client-id"); + ASSERT_EQ(options.ClientSecret(), ""); EXPECT_OK_AND_ASSIGN(fs, AzureFileSystem::Make(options)); } @@ -534,6 +571,11 @@ TEST(AzureFileSystem, InitializeWithCLICredential) { AzureOptions options; options.account_name = "dummy-account-name"; ARROW_EXPECT_OK(options.ConfigureCLICredential()); + ASSERT_EQ(options.AccountKey(), ""); + ASSERT_EQ(options.SasToken(), ""); + ASSERT_EQ(options.TenantId(), ""); + ASSERT_EQ(options.ClientId(), ""); + ASSERT_EQ(options.ClientSecret(), ""); EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); } @@ -541,6 +583,11 @@ TEST(AzureFileSystem, InitializeWithWorkloadIdentityCredential) { AzureOptions options; options.account_name = "dummy-account-name"; ARROW_EXPECT_OK(options.ConfigureWorkloadIdentityCredential()); + ASSERT_EQ(options.AccountKey(), ""); + ASSERT_EQ(options.SasToken(), ""); + ASSERT_EQ(options.TenantId(), ""); + ASSERT_EQ(options.ClientId(), ""); + ASSERT_EQ(options.ClientSecret(), ""); EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); } @@ -548,12 +595,42 @@ TEST(AzureFileSystem, InitializeWithEnvironmentCredential) { AzureOptions options; options.account_name = "dummy-account-name"; ARROW_EXPECT_OK(options.ConfigureEnvironmentCredential()); + ASSERT_EQ(options.AccountKey(), ""); + ASSERT_EQ(options.SasToken(), ""); + ASSERT_EQ(options.TenantId(), ""); + ASSERT_EQ(options.ClientId(), ""); + ASSERT_EQ(options.ClientSecret(), ""); EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); } TEST(AzureFileSystem, OptionsCompare) { - AzureOptions options; - EXPECT_TRUE(options.Equals(options)); + AzureOptions options0; + EXPECT_TRUE(options0.Equals(options0)); + + AzureOptions options1; + options1.account_name = "account_name"; + EXPECT_FALSE(options1.Equals(options0)); + + AzureOptions options2; + options2.account_name = "account_name"; + ASSERT_OK(options2.ConfigureAccountKeyCredential("fake_account_key")); + EXPECT_FALSE(options2.Equals(options1)); + + AzureOptions options3; + options3.account_name = "account_name"; + ASSERT_OK(options3.ConfigureAccountKeyCredential("different_fake_account_key")); + EXPECT_FALSE(options3.Equals(options2)); + + AzureOptions options4; + options4.account_name = "account_name"; + ASSERT_OK(options4.ConfigureSASCredential("fake_sas_token")); + EXPECT_FALSE(options4.Equals(options3)); + + AzureOptions options5; + options5.account_name = "account_name"; + ASSERT_OK(options5.ConfigureClientSecretCredential("fake_tenant_id", "fake_client_id", + "fake_client_secret")); + EXPECT_FALSE(options5.Equals(options4)); } class TestAzureOptions : public ::testing::Test { @@ -1679,9 +1756,14 @@ class TestAzureFileSystem : public ::testing::Test { env->account_name(), env->account_key()))); // AzureOptions::FromUri will not cut off extra query parameters that it consumes, so // make sure these don't cause problems. - ARROW_EXPECT_OK(options.ConfigureSASCredential( - "?blob_storage_authority=dummy_value0&" + sas_token.substr(1) + - "&credential_kind=dummy-value1")); + auto polluted_sas_token = "?blob_storage_authority=dummy_value0&" + + sas_token.substr(1) + "&credential_kind=dummy-value1"; + ARROW_EXPECT_OK(options.ConfigureSASCredential(polluted_sas_token)); + ASSERT_EQ(options.AccountKey(), ""); + ASSERT_EQ(options.SasToken(), polluted_sas_token); + ASSERT_EQ(options.TenantId(), ""); + ASSERT_EQ(options.ClientId(), ""); + ASSERT_EQ(options.ClientSecret(), ""); EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); AssertFileInfo(fs.get(), data.ObjectPath(), FileType::File); diff --git a/python/pyarrow/_azurefs.pyx b/python/pyarrow/_azurefs.pyx index aa6ad1d90e07..deb58b0aed84 100644 --- a/python/pyarrow/_azurefs.pyx +++ b/python/pyarrow/_azurefs.pyx @@ -28,33 +28,33 @@ cdef class AzureFileSystem(FileSystem): Azure Blob Storage backed FileSystem implementation This implementation supports flat namespace and hierarchical namespace (HNS) a.k.a. - Data Lake Gen2 storage accounts. HNS will be automatically detected and HNS specific - features will be used when they provide a performance advantage. Azurite emulator is + Data Lake Gen2 storage accounts. HNS will be automatically detected and HNS specific + features will be used when they provide a performance advantage. Azurite emulator is also supported. Note: `/` is the only supported delimiter. - The storage account is considered the root of the filesystem. When enabled, containers - will be created or deleted during relevant directory operations. Obviously, this also - requires authentication with the additional permissions. + The storage account is considered the root of the filesystem. When enabled, containers + will be created or deleted during relevant directory operations. Obviously, this also + requires authentication with the additional permissions. - By default `DefaultAzureCredential `__ + By default `DefaultAzureCredential `__ is used for authentication. This means it will try several types of authentication - and go with the first one that works. If any authentication parameters are provided when + and go with the first one that works. If any authentication parameters are provided when initialising the FileSystem, they will be used instead of the default credential. Parameters ---------- account_name : str - Azure Blob Storage account name. This is the globally unique identifier for the + Azure Blob Storage account name. This is the globally unique identifier for the storage account. account_key : str, default None - Account key of the storage account. If sas_token and account_key are None the + Account key of the storage account. If sas_token and account_key are None the default credential will be used. The parameters account_key and sas_token are mutually exclusive. blob_storage_authority : str, default None hostname[:port] of the Blob Service. Defaults to `.blob.core.windows.net`. Useful for connecting to a local emulator, like Azurite. blob_storage_scheme : str, default None - Either `http` or `https`. Defaults to `https`. Useful for connecting to a local + Either `http` or `https`. Defaults to `https`. Useful for connecting to a local emulator, like Azurite. client_id : str, default None The client ID (Application ID) for Azure Active Directory authentication. @@ -101,11 +101,6 @@ cdef class AzureFileSystem(FileSystem): """ cdef: CAzureFileSystem* azurefs - c_string account_key - c_string sas_token - c_string tenant_id - c_string client_id - c_string client_secret def __init__(self, account_name, *, account_key=None, blob_storage_authority=None, blob_storage_scheme=None, client_id=None, client_secret=None, @@ -133,14 +128,10 @@ cdef class AzureFileSystem(FileSystem): raise ValueError("client_id must be specified") if not tenant_id and not client_secret: options.ConfigureManagedIdentityCredential(tobytes(client_id)) - self.client_id = tobytes(client_id) elif tenant_id and client_secret: options.ConfigureClientSecretCredential( tobytes(tenant_id), tobytes(client_id), tobytes(client_secret) ) - self.tenant_id = tobytes(tenant_id) - self.client_id = tobytes(client_id) - self.client_secret = tobytes(client_secret) else: raise ValueError( "Invalid Azure credential configuration: " @@ -149,10 +140,8 @@ cdef class AzureFileSystem(FileSystem): ) elif account_key: options.ConfigureAccountKeyCredential(tobytes(account_key)) - self.account_key = tobytes(account_key) elif sas_token: options.ConfigureSASCredential(tobytes(sas_token)) - self.sas_token = tobytes(sas_token) else: options.ConfigureDefaultCredential() @@ -176,13 +165,13 @@ cdef class AzureFileSystem(FileSystem): return ( AzureFileSystem._reconstruct, (dict( account_name=frombytes(opts.account_name), - account_key=frombytes(self.account_key), + account_key=frombytes(opts.AccountKey()), blob_storage_authority=frombytes(opts.blob_storage_authority), blob_storage_scheme=frombytes(opts.blob_storage_scheme), - client_id=frombytes(self.client_id), - client_secret=frombytes(self.client_secret), + client_id=frombytes(opts.ClientId()), + client_secret=frombytes(opts.ClientSecret()), dfs_storage_authority=frombytes(opts.dfs_storage_authority), dfs_storage_scheme=frombytes(opts.dfs_storage_scheme), - sas_token=frombytes(self.sas_token), - tenant_id=frombytes(self.tenant_id) + sas_token=frombytes(opts.SasToken()), + tenant_id=frombytes(opts.TenantId()) ),)) diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index af01c47c8c7b..d18dc2d2bde1 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -259,6 +259,11 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: CStatus ConfigureClientSecretCredential(c_string tenant_id, c_string client_id, c_string client_secret) + c_string SasToken() + c_string AccountKey() + c_string TenantId() + c_string ClientId() + c_string ClientSecret() cdef cppclass CAzureFileSystem "arrow::fs::AzureFileSystem": @staticmethod diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 376398baa07e..5bf1950c0654 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -636,7 +636,29 @@ def test_subtree_filesystem(): ' base_fs=