From e145ef3b66deca65785553339bcae4f6433d4443 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Tue, 28 Oct 2025 14:12:38 +0100 Subject: [PATCH 01/44] Implement databricks creds manager --- .../assessments/configure_assessment.py | 10 +-- .../labs/lakebridge/assessments/profiler.py | 4 +- .../connections/credential_manager.py | 66 +++++++++++--- .../reconcile/connectors/databricks.py | 2 + .../lakebridge/reconcile/connectors/oracle.py | 2 + .../reconcile/connectors/secrets.py | 29 ++---- .../reconcile/connectors/snowflake.py | 2 + .../lakebridge/reconcile/connectors/tsql.py | 2 + .../synapse/dedicated_sqlpool_extract.py | 3 +- .../synapse/monitoring_metrics_extract.py | 3 +- .../synapse/serverless_sqlpool_extract.py | 3 +- .../assessments/synapse/workspace_extract.py | 3 +- .../connections/test_credential_manager.py | 90 ------------------- 13 files changed, 77 insertions(+), 142 deletions(-) delete mode 100644 tests/unit/connections/test_credential_manager.py diff --git a/src/databricks/labs/lakebridge/assessments/configure_assessment.py b/src/databricks/labs/lakebridge/assessments/configure_assessment.py index 0da1c28efa..1c9d0081c1 100644 --- a/src/databricks/labs/lakebridge/assessments/configure_assessment.py +++ b/src/databricks/labs/lakebridge/assessments/configure_assessment.py @@ -8,11 +8,9 @@ from databricks.labs.lakebridge.connections.credential_manager import ( cred_file as creds, - CredentialManager, create_credential_manager, ) from databricks.labs.lakebridge.connections.database_manager import DatabaseManager -from databricks.labs.lakebridge.connections.env_getter import EnvGetter from databricks.labs.lakebridge.assessments import CONNECTOR_REQUIRED logger = logging.getLogger(__name__) @@ -44,8 +42,8 @@ def __init__( def _configure_credentials(self) -> str: pass - @staticmethod - def _test_connection(source: str, cred_manager: CredentialManager): + def _test_connection(self, source: str): + cred_manager = create_credential_manager(self._credential_file) config = cred_manager.get_credentials(source) try: @@ -67,9 +65,7 @@ def run(self): logger.info(f"{source.capitalize()} details and credentials received.") if CONNECTOR_REQUIRED.get(self._source_name, True): if self.prompts.confirm(f"Do you want to test the connection to {source}?"): - cred_manager = create_credential_manager("lakebridge", EnvGetter()) - if cred_manager: - self._test_connection(source, cred_manager) + self._test_connection(source) logger.info(f"{source.capitalize()} Assessment Configuration Completed") diff --git a/src/databricks/labs/lakebridge/assessments/profiler.py b/src/databricks/labs/lakebridge/assessments/profiler.py index 053e7aabce..6b8b57c6b5 100644 --- a/src/databricks/labs/lakebridge/assessments/profiler.py +++ b/src/databricks/labs/lakebridge/assessments/profiler.py @@ -6,8 +6,8 @@ from databricks.labs.lakebridge.connections.database_manager import DatabaseManager from databricks.labs.lakebridge.connections.credential_manager import ( create_credential_manager, + cred_file, ) -from databricks.labs.lakebridge.connections.env_getter import EnvGetter from databricks.labs.lakebridge.assessments import ( PRODUCT_NAME, PRODUCT_PATH_PREFIX, @@ -62,7 +62,7 @@ def profile( def _setup_extractor(platform: str) -> DatabaseManager | None: if not CONNECTOR_REQUIRED[platform]: return None - cred_manager = create_credential_manager(PRODUCT_NAME, EnvGetter()) + cred_manager = create_credential_manager(cred_file(PRODUCT_NAME)) connect_config = cred_manager.get_credentials(platform) return DatabaseManager(platform, 
connect_config) diff --git a/src/databricks/labs/lakebridge/connections/credential_manager.py b/src/databricks/labs/lakebridge/connections/credential_manager.py index b9b3bde974..f5eef803bf 100644 --- a/src/databricks/labs/lakebridge/connections/credential_manager.py +++ b/src/databricks/labs/lakebridge/connections/credential_manager.py @@ -1,9 +1,15 @@ +from collections.abc import Callable +from functools import partial from pathlib import Path import logging from typing import Protocol +import base64 import yaml +from databricks.sdk import WorkspaceClient +from databricks.sdk.errors import NotFound + from databricks.labs.lakebridge.connections.env_getter import EnvGetter @@ -32,18 +38,48 @@ def get_secret(self, key: str) -> str: return key -class DatabricksSecretProvider: +class DatabricksSecretProvider(SecretProvider): + def __init__(self, ws: WorkspaceClient): + self._ws = ws + def get_secret(self, key: str) -> str: - raise NotImplementedError("Databricks secret vault not implemented") + """Get the secret value given a secret scope & secret key. + + :param key: key in the format 'scope/secret_key' + :return: The decoded UTF-8 secret value. + + Raises: + NotFound: The secret could not be found. + UnicodeDecodeError: The secret value was not Base64-encoded UTF-8. + """ + scope, key_only = key.split(sep="/") + assert scope and key_only, "Secret key must be in the format 'scope/secret_key'" + + try: + secret = self._ws.secrets.get_secret(scope, key_only) + assert secret.value is not None + return base64.b64decode(secret.value).decode("utf-8") + except NotFound as e: + raise NotFound(f'Secret does not exist with scope: {scope} and key: {key_only} : {e}') from e + except UnicodeDecodeError as e: + raise UnicodeDecodeError( + "utf-8", + key_only.encode(), + 0, + 1, + f"Secret {key} has Base64 bytes that cannot be decoded to utf-8 string: {e}.", + ) from e class CredentialManager: - def __init__(self, credentials: dict, secret_providers: dict[str, SecretProvider]): + SecretProviderFactory = Callable[[], SecretProvider] + def __init__(self, credentials: dict, secret_providers: dict[str, SecretProviderFactory]): self._credentials = credentials self._default_vault = self._credentials.get('secret_vault_type', 'local').lower() - self._provider = secret_providers.get(self._default_vault) - if not self._provider: + provider_factory = secret_providers.get(self._default_vault) + if not provider_factory: raise ValueError(f"Unsupported secret vault type: {self._default_vault}") + self._provider = provider_factory() def get_credentials(self, source: str) -> dict: if source not in self._credentials: @@ -76,14 +112,22 @@ def _load_credentials(path: Path) -> dict: raise FileNotFoundError(f"Credentials file not found at {path}") from e -def create_credential_manager(product_name: str, env_getter: EnvGetter) -> CredentialManager: - creds_path = cred_file(product_name) +def create_databricks_secret_provider() -> DatabricksSecretProvider: + ws = WorkspaceClient() + return DatabricksSecretProvider(ws) + + + +def create_credential_manager(creds_path: Path | str) -> CredentialManager: + if isinstance(creds_path, str): + creds_path = Path(creds_path) creds = _load_credentials(creds_path) - secret_providers = { - 'local': LocalSecretProvider(), - 'env': EnvSecretProvider(env_getter), - 'databricks': DatabricksSecretProvider(), + # Lazily initialize secret providers + secret_providers: dict[str, CredentialManager.SecretProviderFactory] = { + 'local': LocalSecretProvider, + 'env': partial(EnvSecretProvider, 
EnvGetter()), + 'databricks': create_databricks_secret_provider, } return CredentialManager(creds, secret_providers) diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py b/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py index 89d05b3e4c..fcc67c0ccf 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py @@ -7,6 +7,7 @@ from pyspark.sql.functions import col from sqlglot import Dialect +from databricks.labs.lakebridge.connections.credential_manager import DatabricksSecretProvider from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin @@ -50,6 +51,7 @@ def __init__( self._spark = spark self._ws = ws self._secret_scope = secret_scope + self._secrets = DatabricksSecretProvider(self._ws) def read_data( self, diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py b/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py index 3e809a55b8..c7205afabe 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py @@ -7,6 +7,7 @@ from pyspark.sql.functions import col from sqlglot import Dialect +from databricks.labs.lakebridge.connections.credential_manager import DatabricksSecretProvider from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier @@ -45,6 +46,7 @@ def __init__( self._spark = spark self._ws = ws self._secret_scope = secret_scope + self._secrets = DatabricksSecretProvider(self._ws) @property def get_jdbc_url(self) -> str: diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/secrets.py b/src/databricks/labs/lakebridge/reconcile/connectors/secrets.py index daa213afc8..ed64aa9925 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/secrets.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/secrets.py @@ -1,15 +1,15 @@ -import base64 import logging -from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound +from databricks.labs.lakebridge.connections.credential_manager import DatabricksSecretProvider + logger = logging.getLogger(__name__) # TODO use CredentialManager to allow for changing secret provider for tests class SecretsMixin: - _ws: WorkspaceClient + _secrets: DatabricksSecretProvider _secret_scope: str def _get_secret_or_none(self, secret_key: str) -> str | None: @@ -26,24 +26,5 @@ def _get_secret_or_none(self, secret_key: str) -> str | None: return None def _get_secret(self, secret_key: str) -> str: - """Get the secret value given a secret scope & secret key. - - Raises: - NotFound: The secret could not be found. - UnicodeDecodeError: The secret value was not Base64-encoded UTF-8. 
- """ - try: - # Return the decoded secret value in string format - secret = self._ws.secrets.get_secret(self._secret_scope, secret_key) - assert secret.value is not None - return base64.b64decode(secret.value).decode("utf-8") - except NotFound as e: - raise NotFound(f'Secret does not exist with scope: {self._secret_scope} and key: {secret_key} : {e}') from e - except UnicodeDecodeError as e: - raise UnicodeDecodeError( - "utf-8", - secret_key.encode(), - 0, - 1, - f"Secret {self._secret_scope}/{secret_key} has Base64 bytes that cannot be decoded to utf-8 string: {e}.", - ) from e + _secret_full_key = f"{self._secret_scope}/{secret_key}" + return self._secrets.get_secret(_secret_full_key) diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py b/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py index e66751d29b..f9822e90ad 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py @@ -9,6 +9,7 @@ from cryptography.hazmat.backends import default_backend from cryptography.hazmat.primitives import serialization +from databricks.labs.lakebridge.connections.credential_manager import DatabricksSecretProvider from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier @@ -62,6 +63,7 @@ def __init__( self._spark = spark self._ws = ws self._secret_scope = secret_scope + self._secrets = DatabricksSecretProvider(self._ws) @property def get_jdbc_url(self) -> str: diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py b/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py index ec32d87765..1d65e993cf 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py @@ -7,6 +7,7 @@ from pyspark.sql.functions import col from sqlglot import Dialect +from databricks.labs.lakebridge.connections.credential_manager import DatabricksSecretProvider from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier @@ -64,6 +65,7 @@ def __init__( self._spark = spark self._ws = ws self._secret_scope = secret_scope + self._secrets = DatabricksSecretProvider(self._ws) @property def get_jdbc_url(self) -> str: diff --git a/src/databricks/labs/lakebridge/resources/assessments/synapse/dedicated_sqlpool_extract.py b/src/databricks/labs/lakebridge/resources/assessments/synapse/dedicated_sqlpool_extract.py index e2c9569ca9..9418c0dc5f 100644 --- a/src/databricks/labs/lakebridge/resources/assessments/synapse/dedicated_sqlpool_extract.py +++ b/src/databricks/labs/lakebridge/resources/assessments/synapse/dedicated_sqlpool_extract.py @@ -13,7 +13,6 @@ import zoneinfo from databricks.labs.lakebridge.connections.credential_manager import create_credential_manager -from databricks.labs.lakebridge.assessments import PRODUCT_NAME from databricks.labs.lakebridge.resources.assessments.synapse.common.profiler_classes import SynapseWorkspace from databricks.labs.lakebridge.resources.assessments.synapse.common.queries import SynapseQueries @@ -23,7 +22,7 @@ def execute(): db_path, creds_file = arguments_loader(desc="Synapse Synapse 
Dedicated SQL Pool Extract Script") - cred_manager = create_credential_manager(PRODUCT_NAME, creds_file) + cred_manager = create_credential_manager(creds_file) synapse_workspace_settings = cred_manager.get_credentials("synapse") config = synapse_workspace_settings["workspace"] auth_type = synapse_workspace_settings["jdbc"].get("auth_type", "sql_authentication") diff --git a/src/databricks/labs/lakebridge/resources/assessments/synapse/monitoring_metrics_extract.py b/src/databricks/labs/lakebridge/resources/assessments/synapse/monitoring_metrics_extract.py index b9df9258e0..a4f29a3a0d 100644 --- a/src/databricks/labs/lakebridge/resources/assessments/synapse/monitoring_metrics_extract.py +++ b/src/databricks/labs/lakebridge/resources/assessments/synapse/monitoring_metrics_extract.py @@ -5,7 +5,6 @@ import pandas as pd from databricks.labs.lakebridge.connections.credential_manager import create_credential_manager -from databricks.labs.lakebridge.assessments import PRODUCT_NAME from databricks.labs.lakebridge.resources.assessments.synapse.common.profiler_classes import ( SynapseWorkspace, SynapseMetrics, @@ -23,7 +22,7 @@ def execute(): logger = set_logger(__name__) db_path, creds_file = arguments_loader(desc="Monitoring Metrics Extract Script") - cred_manager = create_credential_manager(PRODUCT_NAME, creds_file) + cred_manager = create_credential_manager(creds_file) synapse_workspace_settings = cred_manager.get_credentials("synapse") synapse_profiler_settings = synapse_workspace_settings["profiler"] diff --git a/src/databricks/labs/lakebridge/resources/assessments/synapse/serverless_sqlpool_extract.py b/src/databricks/labs/lakebridge/resources/assessments/synapse/serverless_sqlpool_extract.py index f9c8085dcd..81421ae5a9 100644 --- a/src/databricks/labs/lakebridge/resources/assessments/synapse/serverless_sqlpool_extract.py +++ b/src/databricks/labs/lakebridge/resources/assessments/synapse/serverless_sqlpool_extract.py @@ -3,7 +3,6 @@ import duckdb from databricks.labs.lakebridge.connections.credential_manager import create_credential_manager -from databricks.labs.lakebridge.assessments import PRODUCT_NAME from databricks.labs.lakebridge.resources.assessments.synapse.common.functions import ( arguments_loader, @@ -54,7 +53,7 @@ def execute(): logger = set_logger(__name__) db_path, creds_file = arguments_loader(desc="Synapse Synapse Serverless SQL Pool Extract Script") - cred_manager = create_credential_manager(PRODUCT_NAME, creds_file) + cred_manager = create_credential_manager(creds_file) synapse_workspace_settings = cred_manager.get_credentials("synapse") config = synapse_workspace_settings["workspace"] auth_type = synapse_workspace_settings["jdbc"].get("auth_type", "sql_authentication") diff --git a/src/databricks/labs/lakebridge/resources/assessments/synapse/workspace_extract.py b/src/databricks/labs/lakebridge/resources/assessments/synapse/workspace_extract.py index f36e27dd66..25d15745ce 100644 --- a/src/databricks/labs/lakebridge/resources/assessments/synapse/workspace_extract.py +++ b/src/databricks/labs/lakebridge/resources/assessments/synapse/workspace_extract.py @@ -5,7 +5,6 @@ import pandas as pd from databricks.labs.lakebridge.connections.credential_manager import create_credential_manager -from databricks.labs.lakebridge.assessments import PRODUCT_NAME from databricks.labs.lakebridge.resources.assessments.synapse.common.functions import ( arguments_loader, @@ -21,7 +20,7 @@ def execute(): db_path, creds_file = arguments_loader(desc="Workspace Extract") - cred_manager = 
create_credential_manager(PRODUCT_NAME, creds_file) + cred_manager = create_credential_manager(creds_file) synapse_workspace_settings = cred_manager.get_credentials("synapse") tz_info = synapse_workspace_settings["workspace"]["tz_info"] workspace_tz = zoneinfo.ZoneInfo(tz_info) diff --git a/tests/unit/connections/test_credential_manager.py b/tests/unit/connections/test_credential_manager.py deleted file mode 100644 index 32ed07cfc0..0000000000 --- a/tests/unit/connections/test_credential_manager.py +++ /dev/null @@ -1,90 +0,0 @@ -import pytest -from unittest.mock import patch, MagicMock -from pathlib import Path -from databricks.labs.lakebridge.connections.credential_manager import create_credential_manager -from databricks.labs.lakebridge.connections.env_getter import EnvGetter -import os - -product_name = "remorph" - - -@pytest.fixture -def env_getter(): - return MagicMock(spec=EnvGetter) - - -@pytest.fixture -def local_credentials(): - return { - 'secret_vault_type': 'local', - 'mssql': { - 'database': 'DB_NAME', - 'driver': 'ODBC Driver 18 for SQL Server', - 'server': 'example_host', - 'user': 'local_user', - 'password': 'local_password', - }, - } - - -@pytest.fixture -def env_credentials(): - return { - 'secret_vault_type': 'env', - 'mssql': { - 'database': 'DB_NAME', - 'driver': 'ODBC Driver 18 for SQL Server', - 'server': 'example_host', - 'user': 'MSSQL_USER_ENV', - 'password': 'MSSQL_PASSWORD_ENV', - }, - } - - -@pytest.fixture -def databricks_credentials(): - return { - 'secret_vault_type': 'databricks', - 'secret_vault_name': 'databricks_vault_name', - 'mssql': { - 'database': 'DB_NAME', - 'driver': 'ODBC Driver 18 for SQL Server', - 'server': 'example_host', - 'user': 'databricks_user', - 'password': 'databricks_password', - }, - } - - -@patch('databricks.labs.lakebridge.connections.credential_manager._load_credentials') -@patch('databricks.labs.lakebridge.connections.credential_manager._get_home') -def test_local_credentials(mock_get_home, mock_load_credentials, local_credentials, env_getter): - mock_load_credentials.return_value = local_credentials - mock_get_home.return_value = Path("/fake/home") - credentials = create_credential_manager(product_name, env_getter) - creds = credentials.get_credentials('mssql') - assert creds['user'] == 'local_user' - assert creds['password'] == 'local_password' - - -@patch('databricks.labs.lakebridge.connections.credential_manager._load_credentials') -@patch('databricks.labs.lakebridge.connections.credential_manager._get_home') -@patch.dict('os.environ', {'MSSQL_USER_ENV': 'env_user', 'MSSQL_PASSWORD_ENV': 'env_password'}) -def test_env_credentials(mock_get_home, mock_load_credentials, env_credentials, env_getter): - mock_load_credentials.return_value = env_credentials - mock_get_home.return_value = Path("/fake/home") - env_getter.get.side_effect = lambda key: os.environ[key] - credentials = create_credential_manager(product_name, env_getter) - creds = credentials.get_credentials('mssql') - assert creds['user'] == 'env_user' - assert creds['password'] == 'env_password' - - -@patch('databricks.labs.lakebridge.connections.credential_manager._load_credentials') -@patch('databricks.labs.lakebridge.connections.credential_manager._get_home') -def test_databricks_credentials(mock_get_home, mock_load_credentials, databricks_credentials, env_getter): - mock_load_credentials.return_value = databricks_credentials - mock_get_home.return_value = Path("/fake/home") - credentials = create_credential_manager(product_name, env_getter) - with 
pytest.raises(NotImplementedError): - credentials.get_credentials('mssql') From 6c75f6c30ee5b655cb68a05b7bc23033a973e211 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Wed, 29 Oct 2025 14:05:47 +0100 Subject: [PATCH 02/44] [WIP] use `DatabricksSecretProvider` in reconcile datasources --- .../connections/credential_manager.py | 28 ++++++++--- src/databricks/labs/lakebridge/install.py | 2 +- .../reconcile/connectors/data_source.py | 3 +- .../reconcile/connectors/databricks.py | 10 +--- .../reconcile/connectors/dialect_utils.py | 8 ++- .../reconcile/connectors/jdbc_reader.py | 1 - .../lakebridge/reconcile/connectors/models.py | 7 --- .../lakebridge/reconcile/connectors/oracle.py | 15 +++--- .../reconcile/connectors/secrets.py | 30 ----------- .../reconcile/connectors/snowflake.py | 50 ++++++++++--------- .../reconcile/connectors/source_adapter.py | 10 ++-- .../lakebridge/reconcile/connectors/tsql.py | 21 ++++---- tests/conftest.py | 3 +- .../reconcile/connectors/test_read_schema.py | 2 +- .../reconcile/query_builder/test_execute.py | 2 +- .../reconcile/connectors/test_databricks.py | 4 +- .../unit/reconcile/connectors/test_oracle.py | 2 +- .../unit/reconcile/connectors/test_secrets.py | 44 ++++++++-------- .../reconcile/connectors/test_snowflake.py | 2 +- .../reconcile/connectors/test_sql_server.py | 2 +- 20 files changed, 111 insertions(+), 135 deletions(-) delete mode 100644 src/databricks/labs/lakebridge/reconcile/connectors/models.py delete mode 100644 src/databricks/labs/lakebridge/reconcile/connectors/secrets.py diff --git a/src/databricks/labs/lakebridge/connections/credential_manager.py b/src/databricks/labs/lakebridge/connections/credential_manager.py index f5eef803bf..3cd0e50e97 100644 --- a/src/databricks/labs/lakebridge/connections/credential_manager.py +++ b/src/databricks/labs/lakebridge/connections/credential_manager.py @@ -20,6 +20,12 @@ class SecretProvider(Protocol): def get_secret(self, key: str) -> str: pass + def get_secret_or_none(self, key: str) -> str | None: + try: + return self.get_secret(key) + except KeyError: + return None + class LocalSecretProvider(SecretProvider): def get_secret(self, key: str) -> str: @@ -42,10 +48,13 @@ class DatabricksSecretProvider(SecretProvider): def __init__(self, ws: WorkspaceClient): self._ws = ws + def get_databricks_secret(self, scope: str, key: str) -> str: + return self.get_secret(f"{scope}/{key}") + def get_secret(self, key: str) -> str: """Get the secret value given a secret scope & secret key. - :param key: key in the format 'scope/secret_key' + :param key: key in the format 'scope/secret' :return: The decoded UTF-8 secret value. Raises: @@ -53,14 +62,14 @@ def get_secret(self, key: str) -> str: UnicodeDecodeError: The secret value was not Base64-encoded UTF-8. 
""" scope, key_only = key.split(sep="/") - assert scope and key_only, "Secret key must be in the format 'scope/secret_key'" + assert scope and key_only, "Secret name must be in the format 'scope/secret'" try: secret = self._ws.secrets.get_secret(scope, key_only) assert secret.value is not None return base64.b64decode(secret.value).decode("utf-8") except NotFound as e: - raise NotFound(f'Secret does not exist with scope: {scope} and key: {key_only} : {e}') from e + raise KeyError(f'Secret does not exist with scope: {scope} and key: {key_only} : {e}') from e except UnicodeDecodeError as e: raise UnicodeDecodeError( "utf-8", @@ -73,6 +82,7 @@ def get_secret(self, key: str) -> str: class CredentialManager: SecretProviderFactory = Callable[[], SecretProvider] + def __init__(self, credentials: dict, secret_providers: dict[str, SecretProviderFactory]): self._credentials = credentials self._default_vault = self._credentials.get('secret_vault_type', 'local').lower() @@ -117,11 +127,13 @@ def create_databricks_secret_provider() -> DatabricksSecretProvider: return DatabricksSecretProvider(ws) - -def create_credential_manager(creds_path: Path | str) -> CredentialManager: - if isinstance(creds_path, str): - creds_path = Path(creds_path) - creds = _load_credentials(creds_path) +def create_credential_manager(creds_or_path: dict | Path | str) -> CredentialManager: + if isinstance(creds_or_path, str): + creds_or_path = Path(creds_or_path) + if isinstance(creds_or_path, Path): + creds = _load_credentials(creds_or_path) + else: + creds = creds_or_path # Lazily initialize secret providers secret_providers: dict[str, CredentialManager.SecretProviderFactory] = { diff --git a/src/databricks/labs/lakebridge/install.py b/src/databricks/labs/lakebridge/install.py index ccd60772a9..6f8a57ab82 100644 --- a/src/databricks/labs/lakebridge/install.py +++ b/src/databricks/labs/lakebridge/install.py @@ -316,7 +316,7 @@ def _prompt_for_new_reconcile_installation(self) -> ReconcileConfig: report_type = self._prompts.choice( "Select the report type:", [report_type.value for report_type in ReconReportType] ) - scope_name = self._prompts.question( + scope_name = self._prompts.question( # TODO deprecate f"Enter Secret scope name to store `{data_source.capitalize()}` connection details / secrets", default=f"remorph_{data_source}", ) diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/data_source.py b/src/databricks/labs/lakebridge/reconcile/connectors/data_source.py index 9294768b77..47aab8bc89 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/data_source.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/data_source.py @@ -3,8 +3,7 @@ from pyspark.sql import DataFrame -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py b/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py index fcc67c0ccf..10f9bdf748 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py @@ -7,11 +7,8 @@ from pyspark.sql.functions import 
col from sqlglot import Dialect -from databricks.labs.lakebridge.connections.credential_manager import DatabricksSecretProvider from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier -from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema from databricks.sdk import WorkspaceClient @@ -37,7 +34,7 @@ def _get_schema_query(catalog: str, schema: str, table: str): return re.sub(r'\s+', ' ', query) -class DatabricksDataSource(DataSource, SecretsMixin): +class DatabricksDataSource(DataSource): _IDENTIFIER_DELIMITER = "`" def __init__( @@ -45,13 +42,10 @@ def __init__( engine: Dialect, spark: SparkSession, ws: WorkspaceClient, - secret_scope: str, ): self._engine = engine self._spark = spark self._ws = ws - self._secret_scope = secret_scope - self._secrets = DatabricksSecretProvider(self._ws) def read_data( self, diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py b/src/databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py index 665755e85c..2785fd8002 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py @@ -1,4 +1,10 @@ -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier +import dataclasses + + +@dataclasses.dataclass() +class NormalizedIdentifier: + ansi_normalized: str + source_normalized: str class DialectUtils: diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py b/src/databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py index 261ab9b11a..fbc520f963 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py @@ -8,7 +8,6 @@ class JDBCReaderMixin: _spark: SparkSession - # TODO update the url def _get_jdbc_reader(self, query, jdbc_url, driver, prepare_query=None): driver_class = { "oracle": "oracle.jdbc.driver.OracleDriver", diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/models.py b/src/databricks/labs/lakebridge/reconcile/connectors/models.py deleted file mode 100644 index c98cbef7dd..0000000000 --- a/src/databricks/labs/lakebridge/reconcile/connectors/models.py +++ /dev/null @@ -1,7 +0,0 @@ -import dataclasses - - -@dataclasses.dataclass -class NormalizedIdentifier: - ansi_normalized: str - source_normalized: str diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py b/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py index c7205afabe..a0b1162dc2 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py @@ -10,16 +10,14 @@ from databricks.labs.lakebridge.connections.credential_manager import DatabricksSecretProvider from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier -from databricks.labs.lakebridge.reconcile.connectors.secrets 
import SecretsMixin -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema from databricks.sdk import WorkspaceClient logger = logging.getLogger(__name__) -class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin): +class OracleDataSource(DataSource, JDBCReaderMixin): _DRIVER = "oracle" _IDENTIFIER_DELIMITER = "\"" _SCHEMA_QUERY = """select column_name, case when (data_precision is not null @@ -41,19 +39,20 @@ def __init__( spark: SparkSession, ws: WorkspaceClient, secret_scope: str, + secrets: DatabricksSecretProvider, # only Databricks secrets are supported currently ): self._engine = engine self._spark = spark self._ws = ws self._secret_scope = secret_scope - self._secrets = DatabricksSecretProvider(self._ws) + self._secrets = secrets @property def get_jdbc_url(self) -> str: return ( - f"jdbc:{OracleDataSource._DRIVER}:thin:{self._get_secret('user')}" - f"/{self._get_secret('password')}@//{self._get_secret('host')}" - f":{self._get_secret('port')}/{self._get_secret('database')}" + f"jdbc:{OracleDataSource._DRIVER}:thin:{self._secrets.get_databricks_secret(self._secret_scope, 'user')}" + f"/{self._secrets.get_databricks_secret(self._secret_scope, 'password')}@//{self._secrets.get_databricks_secret(self._secret_scope, 'host')}" + f":{self._secrets.get_databricks_secret(self._secret_scope, 'port')}/{self._secrets.get_databricks_secret(self._secret_scope, 'database')}" ) def read_data( diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/secrets.py b/src/databricks/labs/lakebridge/reconcile/connectors/secrets.py deleted file mode 100644 index ed64aa9925..0000000000 --- a/src/databricks/labs/lakebridge/reconcile/connectors/secrets.py +++ /dev/null @@ -1,30 +0,0 @@ -import logging - -from databricks.sdk.errors import NotFound - -from databricks.labs.lakebridge.connections.credential_manager import DatabricksSecretProvider - -logger = logging.getLogger(__name__) - - -# TODO use CredentialManager to allow for changing secret provider for tests -class SecretsMixin: - _secrets: DatabricksSecretProvider - _secret_scope: str - - def _get_secret_or_none(self, secret_key: str) -> str | None: - """ - Get the secret value given a secret scope & secret key. 
Log a warning if secret does not exist - Used To ensure backwards compatibility when supporting new secrets - """ - try: - # Return the decoded secret value in string format - return self._get_secret(secret_key) - except NotFound as e: - logger.warning(f"Secret not found: key={secret_key}") - logger.debug("Secret lookup failed", exc_info=e) - return None - - def _get_secret(self, secret_key: str) -> str: - _secret_full_key = f"{self._secret_scope}/{secret_key}" - return self._secrets.get_secret(_secret_full_key) diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py b/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py index f9822e90ad..c41de52486 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py @@ -12,9 +12,7 @@ from databricks.labs.lakebridge.connections.credential_manager import DatabricksSecretProvider from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier -from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.exception import InvalidSnowflakePemPrivateKey from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema from databricks.sdk import WorkspaceClient @@ -23,7 +21,7 @@ logger = logging.getLogger(__name__) -class SnowflakeDataSource(DataSource, SecretsMixin, JDBCReaderMixin): +class SnowflakeDataSource(DataSource, JDBCReaderMixin): _DRIVER = "snowflake" _IDENTIFIER_DELIMITER = "\"" @@ -58,27 +56,32 @@ def __init__( spark: SparkSession, ws: WorkspaceClient, secret_scope: str, + secrets: DatabricksSecretProvider, # only Databricks secrets are supported currently ): self._engine = engine self._spark = spark self._ws = ws self._secret_scope = secret_scope - self._secrets = DatabricksSecretProvider(self._ws) + self._secrets = secrets @property def get_jdbc_url(self) -> str: - try: - sf_password = self._get_secret('sfPassword') - except (NotFound, KeyError) as e: - message = "sfPassword is mandatory for jdbc connectivity with Snowflake." - logger.error(message) - raise NotFound(message) from e + creds = self._get_snowflake_options() + sf_password = creds.get('sfPassword') + if not sf_password: + try: + sf_password = self._secrets.get_databricks_secret(self._secret_scope, 'sfPassword') + except (NotFound, KeyError) as e: + message = "sfPassword is mandatory for jdbc connectivity with Snowflake." 
+ logger.error(message) + raise NotFound(message) from e + # TODO Support PEM key auth return ( - f"jdbc:{SnowflakeDataSource._DRIVER}://{self._get_secret('sfAccount')}.snowflakecomputing.com" - f"/?user={self._get_secret('sfUser')}&password={sf_password}" - f"&db={self._get_secret('sfDatabase')}&schema={self._get_secret('sfSchema')}" - f"&warehouse={self._get_secret('sfWarehouse')}&role={self._get_secret('sfRole')}" + f"jdbc:{SnowflakeDataSource._DRIVER}://{creds['sfUrl']}" + f"/?user={creds['sfUser']}&password={sf_password}" + f"&db={creds['sfDatabase']}&schema={creds['sfSchema']}" + f"&warehouse={creds['sfWarehouse']}&role={creds['sfRole']}" ) def read_data( @@ -141,12 +144,12 @@ def reader(self, query: str) -> DataFrameReader: # Pay attention to https://pylint.pycqa.org/en/latest/user_guide/messages/warning/method-cache-max-size-none.html def _get_snowflake_options(self): options = { - "sfUrl": self._get_secret('sfUrl'), - "sfUser": self._get_secret('sfUser'), - "sfDatabase": self._get_secret('sfDatabase'), - "sfSchema": self._get_secret('sfSchema'), - "sfWarehouse": self._get_secret('sfWarehouse'), - "sfRole": self._get_secret('sfRole'), + "sfUrl": self._secrets.get_databricks_secret(self._secret_scope, 'sfUrl'), + "sfUser": self._secrets.get_databricks_secret(self._secret_scope, 'sfUser'), + "sfDatabase": self._secrets.get_databricks_secret(self._secret_scope, 'sfDatabase'), + "sfSchema": self._secrets.get_databricks_secret(self._secret_scope, 'sfSchema'), + "sfWarehouse": self._secrets.get_databricks_secret(self._secret_scope, 'sfWarehouse'), + "sfRole": self._secrets.get_databricks_secret(self._secret_scope, 'sfRole'), } options = options | self._get_snowflake_auth_options() @@ -155,13 +158,14 @@ def _get_snowflake_options(self): def _get_snowflake_auth_options(self): try: key = SnowflakeDataSource._get_private_key( - self._get_secret('pem_private_key'), self._get_secret_or_none('pem_private_key_password') + self._secrets.get_databricks_secret(self._secret_scope, 'pem_private_key'), + self._secrets.get_secret_or_none(f"{self._secret_scope}/pem_private_key_password"), ) return {"pem_private_key": key} except (NotFound, KeyError): logger.warning("pem_private_key not found. Checking for sfPassword") try: - password = self._get_secret('sfPassword') + password = self._secrets.get_databricks_secret(self._secret_scope, 'sfPassword') return {"sfPassword": password} except (NotFound, KeyError) as e: message = "sfPassword and pem_private_key not found. Either one is required for snowflake auth." 
diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/source_adapter.py b/src/databricks/labs/lakebridge/reconcile/connectors/source_adapter.py index 71039f4494..df4c066634 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/source_adapter.py @@ -2,6 +2,7 @@ from sqlglot import Dialect from sqlglot.dialects import TSQL +from databricks.labs.lakebridge.connections.credential_manager import DatabricksSecretProvider from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.databricks import DatabricksDataSource from databricks.labs.lakebridge.reconcile.connectors.oracle import OracleDataSource @@ -19,12 +20,13 @@ def create_adapter( ws: WorkspaceClient, secret_scope: str, ) -> DataSource: + secrets = DatabricksSecretProvider(ws) if isinstance(engine, Snowflake): - return SnowflakeDataSource(engine, spark, ws, secret_scope) + return SnowflakeDataSource(engine, spark, ws, secret_scope, secrets) if isinstance(engine, Oracle): - return OracleDataSource(engine, spark, ws, secret_scope) + return OracleDataSource(engine, spark, ws, secret_scope, secrets) if isinstance(engine, Databricks): - return DatabricksDataSource(engine, spark, ws, secret_scope) + return DatabricksDataSource(engine, spark, ws) if isinstance(engine, TSQL): - return TSQLServerDataSource(engine, spark, ws, secret_scope) + return TSQLServerDataSource(engine, spark, ws, secret_scope, secrets) raise ValueError(f"Unsupported source type --> {engine}") diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py b/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py index 1d65e993cf..52a857914c 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py @@ -10,9 +10,7 @@ from databricks.labs.lakebridge.connections.credential_manager import DatabricksSecretProvider from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier -from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema from databricks.sdk import WorkspaceClient @@ -50,7 +48,7 @@ """ -class TSQLServerDataSource(DataSource, SecretsMixin, JDBCReaderMixin): +class TSQLServerDataSource(DataSource, JDBCReaderMixin): _DRIVER = "sqlserver" _IDENTIFIER_DELIMITER = {"prefix": "[", "suffix": "]"} @@ -60,23 +58,24 @@ def __init__( spark: SparkSession, ws: WorkspaceClient, secret_scope: str, + secrets: DatabricksSecretProvider, # only Databricks secrets are supported currently ): self._engine = engine self._spark = spark self._ws = ws self._secret_scope = secret_scope - self._secrets = DatabricksSecretProvider(self._ws) + self._secrets = secrets @property def get_jdbc_url(self) -> str: # Construct the JDBC URL return ( - f"jdbc:{self._DRIVER}://{self._get_secret('host')}:{self._get_secret('port')};" - f"databaseName={self._get_secret('database')};" - f"user={self._get_secret('user')};" - 
f"password={self._get_secret('password')};" - f"encrypt={self._get_secret('encrypt')};" - f"trustServerCertificate={self._get_secret('trustServerCertificate')};" + f"jdbc:{self._DRIVER}://{self._secrets.get_databricks_secret(self._secret_scope, 'host')}:{self._secrets.get_databricks_secret(self._secret_scope, 'port')};" + f"databaseName={self._secrets.get_databricks_secret(self._secret_scope, 'database')};" + f"user={self._secrets.get_databricks_secret(self._secret_scope, 'user')};" + f"password={self._secrets.get_databricks_secret(self._secret_scope, 'password')};" + f"encrypt={self._secrets.get_databricks_secret(self._secret_scope, 'encrypt')};" + f"trustServerCertificate={self._secrets.get_databricks_secret(self._secret_scope, 'trustServerCertificate')};" ) def read_data( diff --git a/tests/conftest.py b/tests/conftest.py index a6a3f114cf..64e54775ab 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,8 +17,7 @@ from databricks.sdk import WorkspaceClient from databricks.sdk.service import iam -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource, MockDataSource from databricks.labs.lakebridge.reconcile.recon_config import ( Table, diff --git a/tests/integration/reconcile/connectors/test_read_schema.py b/tests/integration/reconcile/connectors/test_read_schema.py index 9178c5623a..285cf1ea38 100644 --- a/tests/integration/reconcile/connectors/test_read_schema.py +++ b/tests/integration/reconcile/connectors/test_read_schema.py @@ -36,7 +36,7 @@ def test_tsql_server_read_schema_happy(mock_spark): def test_databricks_read_schema_happy(mock_spark): mock_ws = create_autospec(WorkspaceClient) - connector = DatabricksDataSource(get_dialect("databricks"), mock_spark, mock_ws, "my_secret") + connector = DatabricksDataSource(get_dialect("databricks"), mock_spark, mock_ws) mock_spark.sql("CREATE DATABASE IF NOT EXISTS my_test_db") mock_spark.sql("CREATE TABLE IF NOT EXISTS my_test_db.my_test_table (id INT, name STRING) USING parquet") diff --git a/tests/integration/reconcile/query_builder/test_execute.py b/tests/integration/reconcile/query_builder/test_execute.py index 0015ff9dbb..0c3bd774e6 100644 --- a/tests/integration/reconcile/query_builder/test_execute.py +++ b/tests/integration/reconcile/query_builder/test_execute.py @@ -1906,7 +1906,7 @@ def test_initialise_data_source(mock_workspace_client, mock_spark): source, target = initialise_data_source(mock_workspace_client, mock_spark, src_engine, secret_scope) snowflake_data_source = SnowflakeDataSource(src_engine, mock_spark, mock_workspace_client, secret_scope).__class__ - databricks_data_source = DatabricksDataSource(src_engine, mock_spark, mock_workspace_client, secret_scope).__class__ + databricks_data_source = DatabricksDataSource(src_engine, mock_spark, mock_workspace_client).__class__ assert isinstance(source, snowflake_data_source) assert isinstance(target, databricks_data_source) diff --git a/tests/unit/reconcile/connectors/test_databricks.py b/tests/unit/reconcile/connectors/test_databricks.py index 7f89612e85..3e3e37e169 100644 --- a/tests/unit/reconcile/connectors/test_databricks.py +++ b/tests/unit/reconcile/connectors/test_databricks.py @@ -3,7 +3,7 @@ import pytest -from 
databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import NormalizedIdentifier from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect from databricks.labs.lakebridge.reconcile.connectors.databricks import DatabricksDataSource from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException @@ -26,7 +26,7 @@ def test_get_schema(): engine, spark, ws, scope = initial_setup() # catalog as catalog - ddds = DatabricksDataSource(engine, spark, ws, scope) + ddds = DatabricksDataSource(engine, spark, ws) ddds.get_schema("catalog", "schema", "supplier") spark.sql.assert_called_with( re.sub( diff --git a/tests/unit/reconcile/connectors/test_oracle.py b/tests/unit/reconcile/connectors/test_oracle.py index b2b65c5350..12d1c15071 100644 --- a/tests/unit/reconcile/connectors/test_oracle.py +++ b/tests/unit/reconcile/connectors/test_oracle.py @@ -4,7 +4,7 @@ import pytest -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import NormalizedIdentifier from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect from databricks.labs.lakebridge.reconcile.connectors.oracle import OracleDataSource from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException diff --git a/tests/unit/reconcile/connectors/test_secrets.py b/tests/unit/reconcile/connectors/test_secrets.py index dea7515b09..c50035ee72 100644 --- a/tests/unit/reconcile/connectors/test_secrets.py +++ b/tests/unit/reconcile/connectors/test_secrets.py @@ -3,27 +3,16 @@ import pytest -from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.sdk.service.workspace import GetSecretResponse - -class SecretsMixinUnderTest(SecretsMixin): - def __init__(self, ws: WorkspaceClient, secret_scope: str): - self._ws = ws - self._secret_scope = secret_scope - - def get_secret(self, secret_key: str) -> str: - return self._get_secret(secret_key) - - def get_secret_or_none(self, secret_key: str) -> str | None: - return self._get_secret_or_none(secret_key) +from databricks.labs.lakebridge.connections.credential_manager import DatabricksSecretProvider def mock_secret(scope, key): secret_mock = { - "scope": { + "some_scope": { 'user_name': GetSecretResponse( key='user_name', value=base64.b64encode(bytes('my_user', 'utf-8')).decode('utf-8') ), @@ -40,26 +29,37 @@ def test_get_secrets_happy(): ws = create_autospec(WorkspaceClient) ws.secrets.get_secret.side_effect = mock_secret - sut = SecretsMixinUnderTest(ws, "scope") + sut = DatabricksSecretProvider(ws) - assert sut.get_secret("user_name") == "my_user" - assert sut.get_secret_or_none("user_name") == "my_user" - assert sut.get_secret("password") == "my_password" - assert sut.get_secret_or_none("password") == "my_password" + assert sut.get_secret("some_scope/user_name") == "my_user" + assert sut.get_secret_or_none("some_scope/user_name") == "my_user" + assert sut.get_secret("some_scope/password") == "my_password" + assert sut.get_secret_or_none("some_scope/password") == "my_password" def test_get_secrets_not_found_exception(): ws = create_autospec(WorkspaceClient) ws.secrets.get_secret.side_effect = NotFound("Test Exception") - sut = SecretsMixinUnderTest(ws, "scope") + sut = DatabricksSecretProvider(ws) 
with pytest.raises(NotFound, match="Secret does not exist with scope: scope and key: unknown : Test Exception"): - sut.get_secret("unknown") + sut.get_secret("some_scope/unknown") def test_get_secrets_not_found_swallow(): ws = create_autospec(WorkspaceClient) ws.secrets.get_secret.side_effect = NotFound("Test Exception") - sut = SecretsMixinUnderTest(ws, "scope") + sut = DatabricksSecretProvider(ws) + + assert sut.get_secret_or_none("some_scope/unknown") is None + + +def test_get_secrets_invalid_name(): + ws = create_autospec(WorkspaceClient) + sut = DatabricksSecretProvider(ws) + + with pytest.raises(AssertionError, match="Secret name must be in the format 'scope/secret'"): + sut.get_secret("just_key") - assert sut.get_secret_or_none("unknown") is None + with pytest.raises(AssertionError, match="Secret name must be in the format 'scope/secret'"): + sut.get_secret_or_none("just_key") diff --git a/tests/unit/reconcile/connectors/test_snowflake.py b/tests/unit/reconcile/connectors/test_snowflake.py index 114aa42f2a..0464ffba0d 100644 --- a/tests/unit/reconcile/connectors/test_snowflake.py +++ b/tests/unit/reconcile/connectors/test_snowflake.py @@ -6,7 +6,7 @@ from cryptography.hazmat.primitives.asymmetric import rsa from cryptography.hazmat.primitives import serialization -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import NormalizedIdentifier from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect from databricks.labs.lakebridge.reconcile.connectors.snowflake import SnowflakeDataSource from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException, InvalidSnowflakePemPrivateKey diff --git a/tests/unit/reconcile/connectors/test_sql_server.py b/tests/unit/reconcile/connectors/test_sql_server.py index fa6ad90415..056539f3f7 100644 --- a/tests/unit/reconcile/connectors/test_sql_server.py +++ b/tests/unit/reconcile/connectors/test_sql_server.py @@ -4,7 +4,7 @@ import pytest -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import NormalizedIdentifier from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect from databricks.labs.lakebridge.reconcile.connectors.tsql import TSQLServerDataSource from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException From 3f81eb498caeb6f9394b99f5b60f47f2afec54eb Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Wed, 29 Oct 2025 14:09:56 +0100 Subject: [PATCH 03/44] update oracle data source after merge --- src/databricks/labs/lakebridge/reconcile/connectors/oracle.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py b/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py index 5726557611..0143337be2 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py @@ -109,8 +109,8 @@ def _get_timestamp_options() -> dict[str, str]: } def reader(self, query: str) -> DataFrameReader: - user = self._get_secret('user') - password = self._get_secret('password') + user = self._secrets.get_databricks_secret(self._secret_scope, 'user') + password = self._secrets.get_databricks_secret(self._secret_scope, 'password') logger.debug(f"Using user: {user} to connect to Oracle") return 
self._get_jdbc_reader( query, self.get_jdbc_url, OracleDataSource._DRIVER, {"user": user, "password": password} From 374b63f9cd0ebada7e9d58fafb9a89743e5063be Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Mon, 10 Nov 2025 13:00:21 +0100 Subject: [PATCH 04/44] add load_credentials interface for data sources and impl for snowflake plus major cleanup --- src/databricks/labs/lakebridge/config.py | 21 +++- .../connections/credential_manager.py | 9 +- .../reconcile/connectors/data_source.py | 25 ++++ .../reconcile/connectors/databricks.py | 4 + .../lakebridge/reconcile/connectors/oracle.py | 25 ++-- .../reconcile/connectors/snowflake.py | 113 ++++++++--------- .../reconcile/connectors/source_adapter.py | 9 +- .../lakebridge/reconcile/connectors/tsql.py | 22 ++-- .../reconcile/trigger_recon_service.py | 2 +- .../labs/lakebridge/reconcile/utils.py | 10 +- tests/conftest.py | 4 + .../reconcile/connectors/test_read_schema.py | 12 +- .../reconcile/query_builder/test_execute.py | 8 +- .../reconcile/test_oracle_reconcile.py | 2 +- .../reconcile/connectors/test_databricks.py | 22 ++-- .../unit/reconcile/connectors/test_oracle.py | 20 +-- .../reconcile/connectors/test_snowflake.py | 115 ++++++++++-------- .../reconcile/connectors/test_sql_server.py | 24 ++-- tests/unit/reconcile/test_source_adapter.py | 18 ++- 19 files changed, 259 insertions(+), 206 deletions(-) diff --git a/src/databricks/labs/lakebridge/config.py b/src/databricks/labs/lakebridge/config.py index 346b3f7457..9a14123424 100644 --- a/src/databricks/labs/lakebridge/config.py +++ b/src/databricks/labs/lakebridge/config.py @@ -258,6 +258,12 @@ class ReconcileMetadataConfig: volume: str = "reconcile_volume" +@dataclass +class ReconcileCredentialConfig: + vault_type: str + source_creds: dict[str, str] + + @dataclass class ReconcileConfig: __file__ = "reconcile.yml" @@ -265,11 +271,24 @@ class ReconcileConfig: data_source: str report_type: str - secret_scope: str + secret_scope: str # TODO deprecate in favor of creds_or_secret_scope or remove altogether database_config: DatabaseConfig metadata_config: ReconcileMetadataConfig job_id: str | None = None tables: ReconcileTablesConfig | None = None + creds_or_secret_scope: ReconcileCredentialConfig | str | None = None + # supports local, env, databricks creds or direct secret scope string (old behavior) + # TODO make not optional + + @property + def creds(self): + if self.creds_or_secret_scope is not None and isinstance(self.creds_or_secret_scope, ReconcileCredentialConfig): + return self.creds_or_secret_scope + + return ReconcileCredentialConfig( + vault_type="databricks", + source_creds={"__secret_scope": self.creds_or_secret_scope or self.secret_scope}, + ) @dataclass diff --git a/src/databricks/labs/lakebridge/connections/credential_manager.py b/src/databricks/labs/lakebridge/connections/credential_manager.py index 3cd0e50e97..556e429fba 100644 --- a/src/databricks/labs/lakebridge/connections/credential_manager.py +++ b/src/databricks/labs/lakebridge/connections/credential_manager.py @@ -122,12 +122,11 @@ def _load_credentials(path: Path) -> dict: raise FileNotFoundError(f"Credentials file not found at {path}") from e -def create_databricks_secret_provider() -> DatabricksSecretProvider: - ws = WorkspaceClient() +def create_databricks_secret_provider(ws) -> DatabricksSecretProvider: return DatabricksSecretProvider(ws) -def create_credential_manager(creds_or_path: dict | Path | str) -> CredentialManager: +def create_credential_manager(creds_or_path: dict | Path | str, ws: WorkspaceClient | None = 
None) -> CredentialManager: if isinstance(creds_or_path, str): creds_or_path = Path(creds_or_path) if isinstance(creds_or_path, Path): @@ -139,7 +138,9 @@ def create_credential_manager(creds_or_path: dict | Path | str) -> CredentialMan secret_providers: dict[str, CredentialManager.SecretProviderFactory] = { 'local': LocalSecretProvider, 'env': partial(EnvSecretProvider, EnvGetter()), - 'databricks': create_databricks_secret_provider, } + if ws: + secret_providers['databricks'] = partial(create_databricks_secret_provider, ws) + return CredentialManager(creds, secret_providers) diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/data_source.py b/src/databricks/labs/lakebridge/reconcile/connectors/data_source.py index 47aab8bc89..0b3c6e6388 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/data_source.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/data_source.py @@ -3,6 +3,7 @@ from pyspark.sql import DataFrame +from databricks.labs.lakebridge.config import ReconcileCredentialConfig from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema @@ -10,6 +11,23 @@ logger = logging.getLogger(__name__) +def build_credentials(vault_type: str, source: str, credentials: dict) -> dict: + """Build credentials dictionary with secret vault type included. + + Args: + vault_type: The type of secret vault (e.g., 'local', 'databricks'). + source: The source system name. + credentials: The original credentials dictionary. + + Returns: + A new credentials dictionary including the secret vault type. + """ + return { + source: credentials, + 'secret_vault_type': vault_type.lower(), + } + + class DataSource(ABC): @abstractmethod @@ -33,6 +51,10 @@ def get_schema( ) -> list[Schema]: return NotImplemented + @abstractmethod + def load_credentials(self, creds: ReconcileCredentialConfig) -> "DataSource": + return NotImplemented + @abstractmethod def normalize_identifier(self, identifier: str) -> NormalizedIdentifier: pass @@ -93,5 +115,8 @@ def get_schema(self, catalog: str | None, schema: str, table: str, normalize: bo return self.log_and_throw_exception(self._exception, "schema", f"({catalog}, {schema}, {table})") return mock_schema + def load_credentials(self, creds: ReconcileCredentialConfig) -> "MockDataSource": + return self + def normalize_identifier(self, identifier: str) -> NormalizedIdentifier: return DialectUtils.normalize_identifier(identifier, self._delimiter, self._delimiter) diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py b/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py index 10f9bdf748..ef7fe9c7ce 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py @@ -7,6 +7,7 @@ from pyspark.sql.functions import col from sqlglot import Dialect +from databricks.labs.lakebridge.config import ReconcileCredentialConfig from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema @@ -92,6 +93,9 @@ def get_schema( except (RuntimeError, PySparkException) as e: return self.log_and_throw_exception(e, 
"schema", schema_query) + def load_credentials(self, creds: ReconcileCredentialConfig) -> "DatabricksDataSource": + return self + def normalize_identifier(self, identifier: str) -> NormalizedIdentifier: return DialectUtils.normalize_identifier( identifier, diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py b/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py index 0143337be2..83b751d1c8 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py @@ -7,7 +7,7 @@ from pyspark.sql.functions import col from sqlglot import Dialect -from databricks.labs.lakebridge.connections.credential_manager import DatabricksSecretProvider +from databricks.labs.lakebridge.config import ReconcileCredentialConfig from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier @@ -33,25 +33,17 @@ class OracleDataSource(DataSource, JDBCReaderMixin): FROM ALL_TAB_COLUMNS WHERE lower(TABLE_NAME) = '{table}' and lower(owner) = '{owner}'""" - def __init__( - self, - engine: Dialect, - spark: SparkSession, - ws: WorkspaceClient, - secret_scope: str, - secrets: DatabricksSecretProvider, # only Databricks secrets are supported currently - ): + def __init__(self, engine: Dialect, spark: SparkSession, ws: WorkspaceClient): self._engine = engine self._spark = spark self._ws = ws - self._secret_scope = secret_scope - self._secrets = secrets + self._creds: dict[str, str] = {} @property def get_jdbc_url(self) -> str: return ( - f"jdbc:{OracleDataSource._DRIVER}:thin:@//{self._secrets.get_databricks_secret(self._secret_scope, 'host')}" - f":{self._secrets.get_databricks_secret(self._secret_scope, 'port')}/{self._secrets.get_databricks_secret(self._secret_scope, 'database')}" + f"jdbc:{OracleDataSource._DRIVER}:thin:@//{self._creds.get('host')}" + f":{self._creds.get('port')}/{self._creds.get('database')}" ) def read_data( @@ -109,13 +101,16 @@ def _get_timestamp_options() -> dict[str, str]: } def reader(self, query: str) -> DataFrameReader: - user = self._secrets.get_databricks_secret(self._secret_scope, 'user') - password = self._secrets.get_databricks_secret(self._secret_scope, 'password') + user = self._creds.get('user') + password = self._creds.get('password') logger.debug(f"Using user: {user} to connect to Oracle") return self._get_jdbc_reader( query, self.get_jdbc_url, OracleDataSource._DRIVER, {"user": user, "password": password} ) + def load_credentials(self, creds: ReconcileCredentialConfig) -> "OracleDataSource": + return self + def normalize_identifier(self, identifier: str) -> NormalizedIdentifier: normalized = DialectUtils.normalize_identifier( identifier, diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py b/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py index c41de52486..efa6978eee 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py @@ -9,14 +9,16 @@ from cryptography.hazmat.backends import default_backend from cryptography.hazmat.primitives import serialization -from databricks.labs.lakebridge.connections.credential_manager import DatabricksSecretProvider -from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource +from 
databricks.labs.lakebridge.config import ReconcileCredentialConfig +from databricks.labs.lakebridge.connections.credential_manager import ( + create_credential_manager, +) +from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource, build_credentials from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.exception import InvalidSnowflakePemPrivateKey from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema from databricks.sdk import WorkspaceClient -from databricks.sdk.errors import NotFound logger = logging.getLogger(__name__) @@ -50,38 +52,62 @@ class SnowflakeDataSource(DataSource, JDBCReaderMixin): where lower(table_name)='{table}' and table_schema = '{schema}' order by ordinal_position""" - def __init__( - self, - engine: Dialect, - spark: SparkSession, - ws: WorkspaceClient, - secret_scope: str, - secrets: DatabricksSecretProvider, # only Databricks secrets are supported currently - ): + def __init__(self, engine: Dialect, spark: SparkSession, ws: WorkspaceClient): self._engine = engine self._spark = spark self._ws = ws - self._secret_scope = secret_scope - self._secrets = secrets + self._creds: dict[str, str] = {} + + def load_credentials(self, creds: ReconcileCredentialConfig) -> "SnowflakeDataSource": + connector_creds = [ + "sfUser", + "sfUrl", + "sfDatabase", + "sfSchema", + "sfWarehouse", + "sfRole", + ] + connector_additional_creds = ["sfPassword", "pem_private_key", "pem_private_key_password"] + + use_scope = creds.source_creds.get("__secret_scope") + if use_scope: + source_creds = {key: f"{use_scope}/{key}" for key in connector_creds + connector_additional_creds} + + assert creds.vault_type == "databricks", "Secret scope provided, vault_type must be 'databricks'" + parsed_creds = build_credentials(creds.vault_type, "snowflake", source_creds) + else: + parsed_creds = build_credentials(creds.vault_type, "snowflake", creds.source_creds) + + self._creds = create_credential_manager(parsed_creds, self._ws).get_credentials("snowflake") + assert all( + self._creds.get(k) for k in connector_creds + ), f"Missing mandatory Snowflake credentials. Please configure all of {connector_creds}." + assert any( + self._creds.get(k) for k in connector_additional_creds + ), f"Missing Snowflake credentials. Please configure any of {connector_additional_creds}." + + if self._creds.get("pem_private_key"): + self._creds["pem_private_key"] = SnowflakeDataSource._get_private_key( + self._creds["pem_private_key"], + self._creds.get("pem_private_key_password"), + ) + + return self @property def get_jdbc_url(self) -> str: - creds = self._get_snowflake_options() - sf_password = creds.get('sfPassword') + sf_password = self._creds.get('sfPassword') if not sf_password: - try: - sf_password = self._secrets.get_databricks_secret(self._secret_scope, 'sfPassword') - except (NotFound, KeyError) as e: - message = "sfPassword is mandatory for jdbc connectivity with Snowflake." - logger.error(message) - raise NotFound(message) from e - # TODO Support PEM key auth + message = "sfPassword is mandatory for jdbc connectivity with Snowflake." 
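# A minimal sketch, assuming key-pair auth is used instead of sfPassword: producing a
# PKCS#8 PEM string for the pem_private_key credential, mirroring the generate_pkcs8_pem_key
# helper in the unit tests. A passphrase-protected key would additionally supply
# pem_private_key_password; all values here are illustrative.
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.primitives.asymmetric import rsa

private_key = rsa.generate_private_key(public_exponent=65537, key_size=2048)
pem_private_key = private_key.private_bytes(
    encoding=serialization.Encoding.PEM,
    format=serialization.PrivateFormat.PKCS8,
    encryption_algorithm=serialization.NoEncryption(),
).decode("utf-8")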
+ logger.error(message) + raise ValueError(message) + # TODO Support PEM key auth return ( - f"jdbc:{SnowflakeDataSource._DRIVER}://{creds['sfUrl']}" - f"/?user={creds['sfUser']}&password={sf_password}" - f"&db={creds['sfDatabase']}&schema={creds['sfSchema']}" - f"&warehouse={creds['sfWarehouse']}&role={creds['sfRole']}" + f"jdbc:{SnowflakeDataSource._DRIVER}://{self._creds['sfUrl']}" + f"/?user={self._creds['sfUser']}&password={sf_password}" + f"&db={self._creds['sfDatabase']}&schema={self._creds['sfSchema']}" + f"&warehouse={self._creds['sfWarehouse']}&role={self._creds['sfRole']}" ) def read_data( @@ -137,40 +163,7 @@ def get_schema( return self.log_and_throw_exception(e, "schema", schema_query) def reader(self, query: str) -> DataFrameReader: - options = self._get_snowflake_options() - return self._spark.read.format("snowflake").option("dbtable", f"({query}) as tmp").options(**options) - - # TODO cache this method using @functools.cache - # Pay attention to https://pylint.pycqa.org/en/latest/user_guide/messages/warning/method-cache-max-size-none.html - def _get_snowflake_options(self): - options = { - "sfUrl": self._secrets.get_databricks_secret(self._secret_scope, 'sfUrl'), - "sfUser": self._secrets.get_databricks_secret(self._secret_scope, 'sfUser'), - "sfDatabase": self._secrets.get_databricks_secret(self._secret_scope, 'sfDatabase'), - "sfSchema": self._secrets.get_databricks_secret(self._secret_scope, 'sfSchema'), - "sfWarehouse": self._secrets.get_databricks_secret(self._secret_scope, 'sfWarehouse'), - "sfRole": self._secrets.get_databricks_secret(self._secret_scope, 'sfRole'), - } - options = options | self._get_snowflake_auth_options() - - return options - - def _get_snowflake_auth_options(self): - try: - key = SnowflakeDataSource._get_private_key( - self._secrets.get_databricks_secret(self._secret_scope, 'pem_private_key'), - self._secrets.get_secret_or_none(f"{self._secret_scope}/pem_private_key_password"), - ) - return {"pem_private_key": key} - except (NotFound, KeyError): - logger.warning("pem_private_key not found. Checking for sfPassword") - try: - password = self._secrets.get_databricks_secret(self._secret_scope, 'sfPassword') - return {"sfPassword": password} - except (NotFound, KeyError) as e: - message = "sfPassword and pem_private_key not found. Either one is required for snowflake auth." 
- logger.error(message) - raise NotFound(message) from e + return self._spark.read.format("snowflake").option("dbtable", f"({query}) as tmp").options(**self._creds) @staticmethod def _get_private_key(pem_private_key: str, pem_private_key_password: str | None) -> str: diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/source_adapter.py b/src/databricks/labs/lakebridge/reconcile/connectors/source_adapter.py index df4c066634..286bb36a8f 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/source_adapter.py @@ -2,7 +2,6 @@ from sqlglot import Dialect from sqlglot.dialects import TSQL -from databricks.labs.lakebridge.connections.credential_manager import DatabricksSecretProvider from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.databricks import DatabricksDataSource from databricks.labs.lakebridge.reconcile.connectors.oracle import OracleDataSource @@ -18,15 +17,13 @@ def create_adapter( engine: Dialect, spark: SparkSession, ws: WorkspaceClient, - secret_scope: str, ) -> DataSource: - secrets = DatabricksSecretProvider(ws) if isinstance(engine, Snowflake): - return SnowflakeDataSource(engine, spark, ws, secret_scope, secrets) + return SnowflakeDataSource(engine, spark, ws) if isinstance(engine, Oracle): - return OracleDataSource(engine, spark, ws, secret_scope, secrets) + return OracleDataSource(engine, spark, ws) if isinstance(engine, Databricks): return DatabricksDataSource(engine, spark, ws) if isinstance(engine, TSQL): - return TSQLServerDataSource(engine, spark, ws, secret_scope, secrets) + return TSQLServerDataSource(engine, spark, ws) raise ValueError(f"Unsupported source type --> {engine}") diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py b/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py index 5326fd4c18..8c66ba3d0e 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py @@ -7,7 +7,7 @@ from pyspark.sql.functions import col from sqlglot import Dialect -from databricks.labs.lakebridge.connections.credential_manager import DatabricksSecretProvider +from databricks.labs.lakebridge.config import ReconcileCredentialConfig from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier @@ -57,25 +57,22 @@ def __init__( engine: Dialect, spark: SparkSession, ws: WorkspaceClient, - secret_scope: str, - secrets: DatabricksSecretProvider, # only Databricks secrets are supported currently ): self._engine = engine self._spark = spark self._ws = ws - self._secret_scope = secret_scope - self._secrets = secrets + self._creds: dict[str, str] = {} @property def get_jdbc_url(self) -> str: # Construct the JDBC URL return ( - f"jdbc:{self._DRIVER}://{self._secrets.get_databricks_secret(self._secret_scope, 'host')}:{self._secrets.get_databricks_secret(self._secret_scope, 'port')};" - f"databaseName={self._secrets.get_databricks_secret(self._secret_scope, 'database')};" - f"user={self._secrets.get_databricks_secret(self._secret_scope, 'user')};" - f"password={self._secrets.get_databricks_secret(self._secret_scope, 'password')};" - f"encrypt={self._secrets.get_databricks_secret(self._secret_scope, 
'encrypt')};" - f"trustServerCertificate={self._secrets.get_databricks_secret(self._secret_scope, 'trustServerCertificate')};" + f"jdbc:{self._DRIVER}://{self._creds.get('host')}:{self._creds.get('port')};" + f"databaseName={self._creds.get('database')};" + f"user={self._creds.get('user')};" + f"password={self._creds.get('password')};" + f"encrypt={self._creds.get('encrypt')};" + f"trustServerCertificate={self._creds.get('trustServerCertificate')};" ) def read_data( @@ -105,6 +102,9 @@ def read_data( except (RuntimeError, PySparkException) as e: return self.log_and_throw_exception(e, "data", table_query) + def load_credentials(self, creds: ReconcileCredentialConfig) -> "TSQLServerDataSource": + return self + def get_schema( self, catalog: str | None, diff --git a/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py b/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py index 3fd837d668..9873a177ca 100644 --- a/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py +++ b/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py @@ -74,7 +74,7 @@ def create_recon_dependencies( engine=reconcile_config.data_source, spark=spark, ws=ws_client, - secret_scope=reconcile_config.secret_scope, + creds=reconcile_config.creds, ) recon_id = str(uuid4()) diff --git a/src/databricks/labs/lakebridge/reconcile/utils.py b/src/databricks/labs/lakebridge/reconcile/utils.py index 42a309d8da..1fa80b6a37 100644 --- a/src/databricks/labs/lakebridge/reconcile/utils.py +++ b/src/databricks/labs/lakebridge/reconcile/utils.py @@ -4,7 +4,7 @@ from databricks.sdk import WorkspaceClient -from databricks.labs.lakebridge.config import ReconcileMetadataConfig +from databricks.labs.lakebridge.config import ReconcileMetadataConfig, ReconcileCredentialConfig from databricks.labs.lakebridge.reconcile.connectors.source_adapter import create_adapter from databricks.labs.lakebridge.reconcile.exception import InvalidInputException from databricks.labs.lakebridge.reconcile.recon_config import Table @@ -17,10 +17,12 @@ def initialise_data_source( ws: WorkspaceClient, spark: SparkSession, engine: str, - secret_scope: str, + creds: ReconcileCredentialConfig, ): - source = create_adapter(engine=get_dialect(engine), spark=spark, ws=ws, secret_scope=secret_scope) - target = create_adapter(engine=get_dialect("databricks"), spark=spark, ws=ws, secret_scope=secret_scope) + source = create_adapter(engine=get_dialect(engine), spark=spark, ws=ws) + target = create_adapter(engine=get_dialect("databricks"), spark=spark, ws=ws) + source.load_credentials(creds) + target.load_credentials(creds) return source, target diff --git a/tests/conftest.py b/tests/conftest.py index 64e54775ab..e06ddaa514 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,6 +17,7 @@ from databricks.sdk import WorkspaceClient from databricks.sdk.service import iam +from databricks.labs.lakebridge.config import ReconcileCredentialConfig from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource, MockDataSource from databricks.labs.lakebridge.reconcile.recon_config import ( @@ -328,6 +329,9 @@ def read_data( ) -> DataFrame: raise RuntimeError("Not implemented") + def load_credentials(self, creds: ReconcileCredentialConfig) -> "FakeDataSource": + raise RuntimeError("Not implemented") + @pytest.fixture def fake_oracle_datasource() -> FakeDataSource: diff --git 
a/tests/integration/reconcile/connectors/test_read_schema.py b/tests/integration/reconcile/connectors/test_read_schema.py index c1484a093c..5f5a7f3f77 100644 --- a/tests/integration/reconcile/connectors/test_read_schema.py +++ b/tests/integration/reconcile/connectors/test_read_schema.py @@ -17,8 +17,8 @@ class TSQLServerDataSourceUnderTest(TSQLServerDataSource): def __init__(self, spark, ws): - super().__init__(get_dialect("tsql"), spark, ws, "secret_scope") - self._test_env = TestEnvGetter(True) + super().__init__(get_dialect("tsql"), spark, ws) + self._test_env = TestEnvGetter(True) # TODO use load_credentials @property def get_jdbc_url(self) -> str: @@ -31,8 +31,8 @@ def get_jdbc_url(self) -> str: class OracleDataSourceUnderTest(OracleDataSource): def __init__(self, spark, ws): - super().__init__(get_dialect("oracle"), spark, ws, "secret_scope") - self._test_env = TestEnvGetter(False) + super().__init__(get_dialect("oracle"), spark, ws) + self._test_env = TestEnvGetter(False) # TODO use load_credentials @property def get_jdbc_url(self) -> str: @@ -48,8 +48,8 @@ def reader(self, query: str) -> DataFrameReader: class SnowflakeDataSourceUnderTest(SnowflakeDataSource): def __init__(self, spark, ws): - super().__init__(get_dialect("snowflake"), spark, ws, "secret_scope") - self._test_env = TestEnvGetter(True) + super().__init__(get_dialect("snowflake"), spark, ws) + self._test_env = TestEnvGetter(True) # TODO use load_credentials @property def get_jdbc_url(self) -> str: diff --git a/tests/integration/reconcile/query_builder/test_execute.py b/tests/integration/reconcile/query_builder/test_execute.py index 0c3bd774e6..96cc6e73aa 100644 --- a/tests/integration/reconcile/query_builder/test_execute.py +++ b/tests/integration/reconcile/query_builder/test_execute.py @@ -13,6 +13,7 @@ TableRecon, ReconcileMetadataConfig, ReconcileConfig, + ReconcileCredentialConfig, ) from databricks.labs.lakebridge.reconcile.reconciliation import Reconciliation from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService @@ -1901,11 +1902,12 @@ def test_data_recon_with_source_exception( def test_initialise_data_source(mock_workspace_client, mock_spark): src_engine = get_dialect("snowflake") - secret_scope = "test" - source, target = initialise_data_source(mock_workspace_client, mock_spark, src_engine, secret_scope) + source, target = initialise_data_source( + mock_workspace_client, mock_spark, "snowflake", ReconcileCredentialConfig("xx", {}) + ) - snowflake_data_source = SnowflakeDataSource(src_engine, mock_spark, mock_workspace_client, secret_scope).__class__ + snowflake_data_source = SnowflakeDataSource(src_engine, mock_spark, mock_workspace_client).__class__ databricks_data_source = DatabricksDataSource(src_engine, mock_spark, mock_workspace_client).__class__ assert isinstance(source, snowflake_data_source) diff --git a/tests/integration/reconcile/test_oracle_reconcile.py b/tests/integration/reconcile/test_oracle_reconcile.py index 841b130168..29ceb47549 100644 --- a/tests/integration/reconcile/test_oracle_reconcile.py +++ b/tests/integration/reconcile/test_oracle_reconcile.py @@ -17,7 +17,7 @@ class DatabricksDataSourceUnderTest(DatabricksDataSource): def __init__(self, databricks, ws, local_spark): - super().__init__(get_dialect("databricks"), databricks, ws, "not used") + super().__init__(get_dialect("databricks"), databricks, ws) self._local_spark = local_spark def read_data( diff --git a/tests/unit/reconcile/connectors/test_databricks.py 
b/tests/unit/reconcile/connectors/test_databricks.py index 3e3e37e169..2f69dbd317 100644 --- a/tests/unit/reconcile/connectors/test_databricks.py +++ b/tests/unit/reconcile/connectors/test_databricks.py @@ -23,7 +23,7 @@ def initial_setup(): def test_get_schema(): # initial setup - engine, spark, ws, scope = initial_setup() + engine, spark, ws, _ = initial_setup() # catalog as catalog ddds = DatabricksDataSource(engine, spark, ws) @@ -56,10 +56,10 @@ def test_get_schema(): def test_read_data_from_uc(): # initial setup - engine, spark, ws, scope = initial_setup() + engine, spark, ws, _ = initial_setup() # create object for DatabricksDataSource - ddds = DatabricksDataSource(engine, spark, ws, scope) + ddds = DatabricksDataSource(engine, spark, ws) # Test with query ddds.read_data("org", "data", "employee", "select id as id, name as name from :tbl", None) @@ -72,10 +72,10 @@ def test_read_data_from_uc(): def test_read_data_from_hive(): # initial setup - engine, spark, ws, scope = initial_setup() + engine, spark, ws, _ = initial_setup() # create object for DatabricksDataSource - ddds = DatabricksDataSource(engine, spark, ws, scope) + ddds = DatabricksDataSource(engine, spark, ws) # Test with query ddds.read_data("hive_metastore", "data", "employee", "select id as id, name as name from :tbl", None) @@ -88,10 +88,10 @@ def test_read_data_from_hive(): def test_read_data_exception_handling(): # initial setup - engine, spark, ws, scope = initial_setup() + engine, spark, ws, _ = initial_setup() # create object for DatabricksDataSource - ddds = DatabricksDataSource(engine, spark, ws, scope) + ddds = DatabricksDataSource(engine, spark, ws) spark.sql.side_effect = RuntimeError("Test Exception") with pytest.raises( @@ -104,10 +104,10 @@ def test_read_data_exception_handling(): def test_get_schema_exception_handling(): # initial setup - engine, spark, ws, scope = initial_setup() + engine, spark, ws, _ = initial_setup() # create object for DatabricksDataSource - ddds = DatabricksDataSource(engine, spark, ws, scope) + ddds = DatabricksDataSource(engine, spark, ws) spark.sql.side_effect = RuntimeError("Test Exception") with pytest.raises(DataSourceRuntimeException) as exception: ddds.get_schema("org", "data", "employee") @@ -121,8 +121,8 @@ def test_get_schema_exception_handling(): def test_normalize_identifier(): - engine, spark, ws, scope = initial_setup() - data_source = DatabricksDataSource(engine, spark, ws, scope) + engine, spark, ws, _ = initial_setup() + data_source = DatabricksDataSource(engine, spark, ws) assert data_source.normalize_identifier("a") == NormalizedIdentifier("`a`", '`a`') assert data_source.normalize_identifier('`b`') == NormalizedIdentifier("`b`", '`b`') diff --git a/tests/unit/reconcile/connectors/test_oracle.py b/tests/unit/reconcile/connectors/test_oracle.py index cc348fd79e..f98ef9c927 100644 --- a/tests/unit/reconcile/connectors/test_oracle.py +++ b/tests/unit/reconcile/connectors/test_oracle.py @@ -45,10 +45,10 @@ def initial_setup(): def test_read_data_with_options(): # initial setup - engine, spark, ws, scope = initial_setup() + engine, spark, ws, _ = initial_setup() # create object for SnowflakeDataSource - ords = OracleDataSource(engine, spark, ws, scope) + ords = OracleDataSource(engine, spark, ws) # Create a Tables configuration object with JDBC reader options table_conf = Table( source_name="supplier", @@ -96,10 +96,10 @@ def test_read_data_with_options(): def test_get_schema(): # initial setup - engine, spark, ws, scope = initial_setup() + engine, spark, ws, _ = 
initial_setup() # create object for SnowflakeDataSource - ords = OracleDataSource(engine, spark, ws, scope) + ords = OracleDataSource(engine, spark, ws) # call test method ords.get_schema(None, "data", "employee") # spark assertions @@ -127,8 +127,8 @@ def test_get_schema(): def test_read_data_exception_handling(): # initial setup - engine, spark, ws, scope = initial_setup() - ords = OracleDataSource(engine, spark, ws, scope) + engine, spark, ws, _ = initial_setup() + ords = OracleDataSource(engine, spark, ws) # Create a Tables configuration object table_conf = Table( source_name="supplier", @@ -157,8 +157,8 @@ def test_read_data_exception_handling(): def test_get_schema_exception_handling(): # initial setup - engine, spark, ws, scope = initial_setup() - ords = OracleDataSource(engine, spark, ws, scope) + engine, spark, ws, _ = initial_setup() + ords = OracleDataSource(engine, spark, ws) spark.read.format().option().option().option().option().option().load.side_effect = RuntimeError("Test Exception") @@ -184,8 +184,8 @@ def test_get_schema_exception_handling(): @pytest.mark.skip("Turned off till we can handle case sensitivity.") def test_normalize_identifier(): - engine, spark, ws, scope = initial_setup() - data_source = OracleDataSource(engine, spark, ws, scope) + engine, spark, ws, _ = initial_setup() + data_source = OracleDataSource(engine, spark, ws) assert data_source.normalize_identifier("a") == NormalizedIdentifier("`a`", '"a"') assert data_source.normalize_identifier('"b"') == NormalizedIdentifier("`b`", '"b"') diff --git a/tests/unit/reconcile/connectors/test_snowflake.py b/tests/unit/reconcile/connectors/test_snowflake.py index 0464ffba0d..566f58dd45 100644 --- a/tests/unit/reconcile/connectors/test_snowflake.py +++ b/tests/unit/reconcile/connectors/test_snowflake.py @@ -6,6 +6,7 @@ from cryptography.hazmat.primitives.asymmetric import rsa from cryptography.hazmat.primitives import serialization +from databricks.labs.lakebridge.config import ReconcileCredentialConfig from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import NormalizedIdentifier from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect from databricks.labs.lakebridge.reconcile.connectors.snowflake import SnowflakeDataSource @@ -19,9 +20,6 @@ def mock_secret(scope, key): secret_mock = { "scope": { - 'sfAccount': GetSecretResponse( - key='sfAccount', value=base64.b64encode(bytes('my_account', 'utf-8')).decode('utf-8') - ), 'sfUser': GetSecretResponse( key='sfUser', value=base64.b64encode(bytes('my_user', 'utf-8')).decode('utf-8') ), @@ -40,13 +38,39 @@ def mock_secret(scope, key): 'sfRole': GetSecretResponse( key='sfRole', value=base64.b64encode(bytes('my_role', 'utf-8')).decode('utf-8') ), - 'sfUrl': GetSecretResponse(key='sfUrl', value=base64.b64encode(bytes('my_url', 'utf-8')).decode('utf-8')), + 'sfUrl': GetSecretResponse( + key='sfUrl', value=base64.b64encode(bytes('my_account.snowflakecomputing.com', 'utf-8')).decode('utf-8') + ), } } return secret_mock[scope][key] +@pytest.fixture() +def snowflake_creds(): + def _snowflake_creds(scope, use_private_key=False, use_pem_password=False): + creds = { + 'sfUser': f'{scope}/sfUser', + 'sfDatabase': f'{scope}/sfDatabase', + 'sfSchema': f'{scope}/sfSchema', + 'sfWarehouse': f'{scope}/sfWarehouse', + 'sfRole': f'{scope}/sfRole', + 'sfUrl': f'{scope}/sfUrl', + } + + if use_private_key: + creds['pem_private_key'] = f'{scope}/pem_private_key' + if use_pem_password: + creds['pem_private_key_password'] = 
f'{scope}/pem_private_key_password' + else: + creds['sfPassword'] = f'{scope}/sfPassword' + + return creds + + return _snowflake_creds + + def generate_pkcs8_pem_key(malformed=False): private_key = rsa.generate_private_key(public_exponent=65537, key_size=2048) pem_key = private_key.private_bytes( @@ -91,11 +115,12 @@ def initial_setup(): return engine, spark, ws, scope -def test_get_jdbc_url_happy(): +def test_get_jdbc_url_happy(snowflake_creds): # initial setup engine, spark, ws, scope = initial_setup() # create object for SnowflakeDataSource - dfds = SnowflakeDataSource(engine, spark, ws, scope) + dfds = SnowflakeDataSource(engine, spark, ws) + dfds.load_credentials(ReconcileCredentialConfig("databricks", snowflake_creds(scope))) url = dfds.get_jdbc_url # Assert that the URL is generated correctly assert url == ( @@ -106,28 +131,13 @@ def test_get_jdbc_url_happy(): ) -def test_get_jdbc_url_fail(): - # initial setup - engine, spark, ws, scope = initial_setup() - ws.secrets.get_secret.side_effect = mock_secret - # create object for SnowflakeDataSource - dfds = SnowflakeDataSource(engine, spark, ws, scope) - url = dfds.get_jdbc_url - # Assert that the URL is generated correctly - assert url == ( - "jdbc:snowflake://my_account.snowflakecomputing.com" - "/?user=my_user&password=my_password" - "&db=my_database&schema=my_schema" - "&warehouse=my_warehouse&role=my_role" - ) - - -def test_read_data_with_out_options(): +def test_read_data_with_out_options(snowflake_creds): # initial setup engine, spark, ws, scope = initial_setup() # create object for SnowflakeDataSource - dfds = SnowflakeDataSource(engine, spark, ws, scope) + dfds = SnowflakeDataSource(engine, spark, ws) + dfds.load_credentials(ReconcileCredentialConfig("databricks", snowflake_creds(scope))) # Create a Tables configuration object with no JDBC reader options table_conf = Table( source_name="supplier", @@ -141,7 +151,7 @@ def test_read_data_with_out_options(): spark.read.format.assert_called_with("snowflake") spark.read.format().option.assert_called_with("dbtable", "(select 1 from org.data.employee) as tmp") spark.read.format().option().options.assert_called_with( - sfUrl="my_url", + sfUrl="my_account.snowflakecomputing.com", sfUser="my_user", sfPassword="my_password", sfDatabase="my_database", @@ -152,12 +162,13 @@ def test_read_data_with_out_options(): spark.read.format().option().options().load.assert_called_once() -def test_read_data_with_options(): +def test_read_data_with_options(snowflake_creds): # initial setup engine, spark, ws, scope = initial_setup() # create object for SnowflakeDataSource - dfds = SnowflakeDataSource(engine, spark, ws, scope) + dfds = SnowflakeDataSource(engine, spark, ws) + dfds.load_credentials(ReconcileCredentialConfig("databricks", snowflake_creds(scope))) # Create a Tables configuration object with JDBC reader options table_conf = Table( source_name="supplier", @@ -192,12 +203,13 @@ def test_read_data_with_options(): spark.read.format().option().option().option().options().load.assert_called_once() -def test_get_schema(): +def test_get_schema(snowflake_creds): # initial setup engine, spark, ws, scope = initial_setup() # Mocking get secret method to return the required values # create object for SnowflakeDataSource - dfds = SnowflakeDataSource(engine, spark, ws, scope) + dfds = SnowflakeDataSource(engine, spark, ws) + dfds.load_credentials(ReconcileCredentialConfig("databricks", snowflake_creds(scope))) # call test method dfds.get_schema("catalog", "schema", "supplier") # spark assertions @@ -215,7 
+227,7 @@ def test_get_schema(): ), ) spark.read.format().option().options.assert_called_with( - sfUrl="my_url", + sfUrl="my_account.snowflakecomputing.com", sfUser="my_user", sfPassword="my_password", sfDatabase="my_database", @@ -226,10 +238,11 @@ def test_get_schema(): spark.read.format().option().options().load.assert_called_once() -def test_read_data_exception_handling(): +def test_read_data_exception_handling(snowflake_creds): # initial setup engine, spark, ws, scope = initial_setup() - dfds = SnowflakeDataSource(engine, spark, ws, scope) + dfds = SnowflakeDataSource(engine, spark, ws) + dfds.load_credentials(ReconcileCredentialConfig("databricks", snowflake_creds(scope))) # Create a Tables configuration object table_conf = Table( source_name="supplier", @@ -254,11 +267,12 @@ def test_read_data_exception_handling(): dfds.read_data("org", "data", "employee", "select 1 from :tbl", table_conf.jdbc_reader_options) -def test_get_schema_exception_handling(): +def test_get_schema_exception_handling(snowflake_creds): # initial setup engine, spark, ws, scope = initial_setup() - dfds = SnowflakeDataSource(engine, spark, ws, scope) + dfds = SnowflakeDataSource(engine, spark, ws) + dfds.load_credentials(ReconcileCredentialConfig("databricks", snowflake_creds(scope))) spark.read.format().option().options().load.side_effect = RuntimeError("Test Exception") @@ -276,16 +290,17 @@ def test_get_schema_exception_handling(): dfds.get_schema("catalog", "schema", "supplier") -def test_read_data_without_options_private_key(): +def test_read_data_without_options_private_key(snowflake_creds): engine, spark, ws, scope = initial_setup() ws.secrets.get_secret.side_effect = mock_private_key_secret - dfds = SnowflakeDataSource(engine, spark, ws, scope) + dfds = SnowflakeDataSource(engine, spark, ws) + dfds.load_credentials(ReconcileCredentialConfig("databricks", snowflake_creds(scope, use_private_key=True))) table_conf = Table(source_name="supplier", target_name="supplier") dfds.read_data("org", "data", "employee", "select 1 from :tbl", table_conf.jdbc_reader_options) spark.read.format.assert_called_with("snowflake") spark.read.format().option.assert_called_with("dbtable", "(select 1 from org.data.employee) as tmp") expected_options = { - "sfUrl": "my_url", + "sfUrl": "my_account.snowflakecomputing.com", "sfUser": "my_user", "sfDatabase": "my_database", "sfSchema": "my_schema", @@ -298,30 +313,30 @@ def test_read_data_without_options_private_key(): spark.read.format().option().options().load.assert_called_once() -def test_read_data_without_options_malformed_private_key(): +def test_read_data_without_options_malformed_private_key(snowflake_creds): engine, spark, ws, scope = initial_setup() ws.secrets.get_secret.side_effect = mock_malformed_private_key_secret - dfds = SnowflakeDataSource(engine, spark, ws, scope) - table_conf = Table(source_name="supplier", target_name="supplier") + dfds = SnowflakeDataSource(engine, spark, ws) + with pytest.raises(InvalidSnowflakePemPrivateKey, match="Failed to load or process the provided PEM private key."): - dfds.read_data("org", "data", "employee", "select 1 from :tbl", table_conf.jdbc_reader_options) + dfds.load_credentials(ReconcileCredentialConfig("databricks", snowflake_creds(scope, use_private_key=True))) -def test_read_data_without_any_auth(): +def test_read_data_without_any_auth(snowflake_creds): engine, spark, ws, scope = initial_setup() ws.secrets.get_secret.side_effect = mock_no_auth_key_secret - dfds = SnowflakeDataSource(engine, spark, ws, scope) - table_conf = 
Table(source_name="supplier", target_name="supplier") - with pytest.raises( - NotFound, match='sfPassword and pem_private_key not found. Either one is required for snowflake auth.' - ): - dfds.read_data("org", "data", "employee", "select 1 from :tbl", table_conf.jdbc_reader_options) + dfds = SnowflakeDataSource(engine, spark, ws) + creds = snowflake_creds(scope) + creds.pop('sfPassword') + + with pytest.raises(AssertionError, match='Missing Snowflake credentials. Please configure any of .*'): + dfds.load_credentials(ReconcileCredentialConfig("databricks", creds)) @pytest.mark.skip("Turned off till we can handle case sensitivity.") def test_normalize_identifier(): - engine, spark, ws, scope = initial_setup() - data_source = SnowflakeDataSource(engine, spark, ws, scope) + engine, spark, ws, _ = initial_setup() + data_source = SnowflakeDataSource(engine, spark, ws) assert data_source.normalize_identifier("a") == NormalizedIdentifier("`a`", '"a"') assert data_source.normalize_identifier('"b"') == NormalizedIdentifier("`b`", '"b"') diff --git a/tests/unit/reconcile/connectors/test_sql_server.py b/tests/unit/reconcile/connectors/test_sql_server.py index 056539f3f7..997cd70c58 100644 --- a/tests/unit/reconcile/connectors/test_sql_server.py +++ b/tests/unit/reconcile/connectors/test_sql_server.py @@ -49,9 +49,9 @@ def initial_setup(): def test_get_jdbc_url_happy(): # initial setup - engine, spark, ws, scope = initial_setup() + engine, spark, ws, _ = initial_setup() # create object for TSQLServerDataSource - data_source = TSQLServerDataSource(engine, spark, ws, scope) + data_source = TSQLServerDataSource(engine, spark, ws) url = data_source.get_jdbc_url # Assert that the URL is generated correctly assert url == ( @@ -61,10 +61,10 @@ def test_get_jdbc_url_happy(): def test_get_jdbc_url_fail(): # initial setup - engine, spark, ws, scope = initial_setup() + engine, spark, ws, _ = initial_setup() ws.secrets.get_secret.side_effect = mock_secret # create object for TSQLServerDataSource - data_source = TSQLServerDataSource(engine, spark, ws, scope) + data_source = TSQLServerDataSource(engine, spark, ws) url = data_source.get_jdbc_url # Assert that the URL is generated correctly assert url == ( @@ -74,10 +74,10 @@ def test_get_jdbc_url_fail(): def test_read_data_with_options(): # initial setup - engine, spark, ws, scope = initial_setup() + engine, spark, ws, _ = initial_setup() # create object for MSSQLServerDataSource - data_source = TSQLServerDataSource(engine, spark, ws, scope) + data_source = TSQLServerDataSource(engine, spark, ws) # Create a Tables configuration object with JDBC reader options table_conf = Table( source_name="src_supplier", @@ -116,9 +116,9 @@ def test_read_data_with_options(): def test_get_schema(): # initial setup - engine, spark, ws, scope = initial_setup() + engine, spark, ws, _ = initial_setup() # Mocking get secret method to return the required values - data_source = TSQLServerDataSource(engine, spark, ws, scope) + data_source = TSQLServerDataSource(engine, spark, ws) # call test method data_source.get_schema("org", "schema", "supplier") # spark assertions @@ -163,8 +163,8 @@ def test_get_schema(): def test_get_schema_exception_handling(): # initial setup - engine, spark, ws, scope = initial_setup() - data_source = TSQLServerDataSource(engine, spark, ws, scope) + engine, spark, ws, _ = initial_setup() + data_source = TSQLServerDataSource(engine, spark, ws) spark.read.format().option().option().option().option().load.side_effect = RuntimeError("Test Exception") @@ -180,8 +180,8 
@@ def test_get_schema_exception_handling(): def test_normalize_identifier(): - engine, spark, ws, scope = initial_setup() - data_source = TSQLServerDataSource(engine, spark, ws, scope) + engine, spark, ws, _ = initial_setup() + data_source = TSQLServerDataSource(engine, spark, ws) assert data_source.normalize_identifier("a") == NormalizedIdentifier("`a`", "[a]") assert data_source.normalize_identifier('"b"') == NormalizedIdentifier("`b`", "[b]") diff --git a/tests/unit/reconcile/test_source_adapter.py b/tests/unit/reconcile/test_source_adapter.py index 5a9cc4032d..68b093e2da 100644 --- a/tests/unit/reconcile/test_source_adapter.py +++ b/tests/unit/reconcile/test_source_adapter.py @@ -15,10 +15,9 @@ def test_create_adapter_for_snowflake_dialect(): spark = create_autospec(DatabricksSession) engine = get_dialect("snowflake") ws = create_autospec(WorkspaceClient) - scope = "scope" - data_source = create_adapter(engine, spark, ws, scope) - snowflake_data_source = SnowflakeDataSource(engine, spark, ws, scope).__class__ + data_source = create_adapter(engine, spark, ws) + snowflake_data_source = SnowflakeDataSource(engine, spark, ws).__class__ assert isinstance(data_source, snowflake_data_source) @@ -27,10 +26,9 @@ def test_create_adapter_for_oracle_dialect(): spark = create_autospec(DatabricksSession) engine = get_dialect("oracle") ws = create_autospec(WorkspaceClient) - scope = "scope" - data_source = create_adapter(engine, spark, ws, scope) - oracle_data_source = OracleDataSource(engine, spark, ws, scope).__class__ + data_source = create_adapter(engine, spark, ws) + oracle_data_source = OracleDataSource(engine, spark, ws).__class__ assert isinstance(data_source, oracle_data_source) @@ -39,10 +37,9 @@ def test_create_adapter_for_databricks_dialect(): spark = create_autospec(DatabricksSession) engine = get_dialect("databricks") ws = create_autospec(WorkspaceClient) - scope = "scope" - data_source = create_adapter(engine, spark, ws, scope) - databricks_data_source = DatabricksDataSource(engine, spark, ws, scope).__class__ + data_source = create_adapter(engine, spark, ws) + databricks_data_source = DatabricksDataSource(engine, spark, ws).__class__ assert isinstance(data_source, databricks_data_source) @@ -51,7 +48,6 @@ def test_raise_exception_for_unknown_dialect(): spark = create_autospec(DatabricksSession) engine = get_dialect("trino") ws = create_autospec(WorkspaceClient) - scope = "scope" with pytest.raises(ValueError, match=f"Unsupported source type --> {engine}"): - create_adapter(engine, spark, ws, scope) + create_adapter(engine, spark, ws) From 5102b3ba6c04e6de0b4bd1ef45aa2b69bfd14323 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Mon, 10 Nov 2025 14:31:05 +0100 Subject: [PATCH 05/44] implement for tsql and oracle --- .../connections/credential_manager.py | 5 +-- .../lakebridge/reconcile/connectors/oracle.py | 25 ++++++++++++++- .../lakebridge/reconcile/connectors/tsql.py | 27 +++++++++++++++- .../unit/reconcile/connectors/test_oracle.py | 18 +++++++++-- .../unit/reconcile/connectors/test_secrets.py | 4 ++- .../reconcile/connectors/test_sql_server.py | 31 ++++++++++--------- 6 files changed, 87 insertions(+), 23 deletions(-) diff --git a/src/databricks/labs/lakebridge/connections/credential_manager.py b/src/databricks/labs/lakebridge/connections/credential_manager.py index 556e429fba..204c85bef1 100644 --- a/src/databricks/labs/lakebridge/connections/credential_manager.py +++ b/src/databricks/labs/lakebridge/connections/credential_manager.py @@ -61,8 +61,9 @@ def get_secret(self, key: str) 
-> str: NotFound: The secret could not be found. UnicodeDecodeError: The secret value was not Base64-encoded UTF-8. """ - scope, key_only = key.split(sep="/") - assert scope and key_only, "Secret name must be in the format 'scope/secret'" + key_parts = key.split(sep="/") + assert len(key_parts) == 2, "Secret name must be in the format 'scope/secret'" + scope, key_only = key_parts[0], key_parts[1] try: secret = self._ws.secrets.get_secret(scope, key_only) diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py b/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py index 83b751d1c8..26c042c9f3 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py @@ -8,7 +8,8 @@ from sqlglot import Dialect from databricks.labs.lakebridge.config import ReconcileCredentialConfig -from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource +from databricks.labs.lakebridge.connections.credential_manager import create_credential_manager +from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource, build_credentials from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema @@ -109,6 +110,28 @@ def reader(self, query: str) -> DataFrameReader: ) def load_credentials(self, creds: ReconcileCredentialConfig) -> "OracleDataSource": + connector_creds = [ + "host", + "port", + "database", + "user", + "password", + ] + + use_scope = creds.source_creds.get("__secret_scope") + if use_scope: + source_creds = {key: f"{use_scope}/{key}" for key in connector_creds} + + assert creds.vault_type == "databricks", "Secret scope provided, vault_type must be 'databricks'" + parsed_creds = build_credentials(creds.vault_type, "oracle", source_creds) + else: + parsed_creds = build_credentials(creds.vault_type, "oracle", creds.source_creds) + + self._creds = create_credential_manager(parsed_creds, self._ws).get_credentials("oracle") + assert all( + self._creds.get(k) for k in connector_creds + ), f"Missing mandatory Oracle credentials. Please configure all of {connector_creds}." 
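# A minimal sketch, assuming the same convention as the Snowflake connector: a bare
# secret scope maps every key above (host, port, database, user, password) to
# "<scope>/<key>" and requires vault_type="databricks"; the scope name is hypothetical.
from databricks.labs.lakebridge.config import ReconcileCredentialConfig

oracle_creds = ReconcileCredentialConfig(
    vault_type="databricks",
    source_creds={"__secret_scope": "oracle_scope"},
)
# OracleDataSource(engine, spark, ws).load_credentials(oracle_creds) then assembles the
# thin-driver JDBC URL from the resolved host, port and database values.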
+ return self def normalize_identifier(self, identifier: str) -> NormalizedIdentifier: diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py b/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py index 8c66ba3d0e..c0ddf524db 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py @@ -8,7 +8,8 @@ from sqlglot import Dialect from databricks.labs.lakebridge.config import ReconcileCredentialConfig -from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource +from databricks.labs.lakebridge.connections.credential_manager import create_credential_manager +from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource, build_credentials from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema @@ -103,6 +104,30 @@ def read_data( return self.log_and_throw_exception(e, "data", table_query) def load_credentials(self, creds: ReconcileCredentialConfig) -> "TSQLServerDataSource": + connector_creds = [ + "host", + "port", + "database", + "user", + "password", + "encrypt", + "trustServerCertificate", + ] + + use_scope = creds.source_creds.get("__secret_scope") + if use_scope: + source_creds = {key: f"{use_scope}/{key}" for key in connector_creds} + + assert creds.vault_type == "databricks", "Secret scope provided, vault_type must be 'databricks'" + parsed_creds = build_credentials(creds.vault_type, "mssql", source_creds) + else: + parsed_creds = build_credentials(creds.vault_type, "mssql", creds.source_creds) + + self._creds = create_credential_manager(parsed_creds, self._ws).get_credentials("mssql") + assert all( + self._creds.get(k) for k in connector_creds + ), f"Missing mandatory MS SQL credentials. Please configure all of {connector_creds}." 
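# A minimal sketch, assuming inline credentials resolved by the 'local' vault; the MS SQL
# loader requires every key below, and encrypt/trustServerCertificate are interpolated
# into the JDBC URL as plain strings, not booleans. Values are hypothetical.
from databricks.labs.lakebridge.config import ReconcileCredentialConfig

mssql_creds = ReconcileCredentialConfig(
    vault_type="local",
    source_creds={
        "host": "my_host",
        "port": "1433",
        "database": "my_database",
        "user": "my_user",
        "password": "my_password",
        "encrypt": "true",
        "trustServerCertificate": "true",
    },
)
# TSQLServerDataSource(engine, spark, ws).load_credentials(mssql_creds)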
+ return self def get_schema( diff --git a/tests/unit/reconcile/connectors/test_oracle.py b/tests/unit/reconcile/connectors/test_oracle.py index f98ef9c927..11b917c3e1 100644 --- a/tests/unit/reconcile/connectors/test_oracle.py +++ b/tests/unit/reconcile/connectors/test_oracle.py @@ -4,6 +4,7 @@ import pytest +from databricks.labs.lakebridge.config import ReconcileCredentialConfig from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import NormalizedIdentifier from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect from databricks.labs.lakebridge.reconcile.connectors.oracle import OracleDataSource @@ -31,6 +32,16 @@ def mock_secret(scope, key): return secret_mock[scope][key] +def oracle_creds(scope): + return { + "host": f"{scope}/host", + "port": f"{scope}/port", + "database": f"{scope}/database", + "user": f"{scope}/user", + "password": f"{scope}/password", + } + + def initial_setup(): pyspark_sql_session = MagicMock() spark = pyspark_sql_session.SparkSession.builder.getOrCreate() @@ -45,10 +56,11 @@ def initial_setup(): def test_read_data_with_options(): # initial setup - engine, spark, ws, _ = initial_setup() + engine, spark, ws, scope = initial_setup() - # create object for SnowflakeDataSource + # create object for OracleDataSource ords = OracleDataSource(engine, spark, ws) + ords.load_credentials(ReconcileCredentialConfig("databricks", oracle_creds(scope))) # Create a Tables configuration object with JDBC reader options table_conf = Table( source_name="supplier", @@ -98,7 +110,7 @@ def test_get_schema(): # initial setup engine, spark, ws, _ = initial_setup() - # create object for SnowflakeDataSource + # create object for OracleDataSource ords = OracleDataSource(engine, spark, ws) # call test method ords.get_schema(None, "data", "employee") diff --git a/tests/unit/reconcile/connectors/test_secrets.py b/tests/unit/reconcile/connectors/test_secrets.py index c50035ee72..c515bb9add 100644 --- a/tests/unit/reconcile/connectors/test_secrets.py +++ b/tests/unit/reconcile/connectors/test_secrets.py @@ -42,7 +42,9 @@ def test_get_secrets_not_found_exception(): ws.secrets.get_secret.side_effect = NotFound("Test Exception") sut = DatabricksSecretProvider(ws) - with pytest.raises(NotFound, match="Secret does not exist with scope: scope and key: unknown : Test Exception"): + with pytest.raises( + KeyError, match="Secret does not exist with scope: some_scope and key: unknown : Test Exception" + ): sut.get_secret("some_scope/unknown") diff --git a/tests/unit/reconcile/connectors/test_sql_server.py b/tests/unit/reconcile/connectors/test_sql_server.py index 997cd70c58..101e9c1b71 100644 --- a/tests/unit/reconcile/connectors/test_sql_server.py +++ b/tests/unit/reconcile/connectors/test_sql_server.py @@ -4,6 +4,7 @@ import pytest +from databricks.labs.lakebridge.config import ReconcileCredentialConfig from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import NormalizedIdentifier from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect from databricks.labs.lakebridge.reconcile.connectors.tsql import TSQLServerDataSource @@ -35,6 +36,18 @@ def mock_secret(scope, key): return scope_secret_mock[scope][key] +def mssql_creds(scope): + return { + "host": f"{scope}/host", + "port": f"{scope}/port", + "database": f"{scope}/database", + "user": f"{scope}/user", + "password": f"{scope}/password", + "encrypt": f"{scope}/encrypt", + "trustServerCertificate": f"{scope}/trustServerCertificate", + } + + def initial_setup(): 
pyspark_sql_session = MagicMock() spark = pyspark_sql_session.SparkSession.builder.getOrCreate() @@ -49,22 +62,10 @@ def initial_setup(): def test_get_jdbc_url_happy(): # initial setup - engine, spark, ws, _ = initial_setup() - # create object for TSQLServerDataSource - data_source = TSQLServerDataSource(engine, spark, ws) - url = data_source.get_jdbc_url - # Assert that the URL is generated correctly - assert url == ( - """jdbc:sqlserver://my_host:777;databaseName=my_database;user=my_user;password=my_password;encrypt=true;trustServerCertificate=true;""" - ) - - -def test_get_jdbc_url_fail(): - # initial setup - engine, spark, ws, _ = initial_setup() - ws.secrets.get_secret.side_effect = mock_secret + engine, spark, ws, scope = initial_setup() # create object for TSQLServerDataSource data_source = TSQLServerDataSource(engine, spark, ws) + data_source.load_credentials(ReconcileCredentialConfig("databricks", mssql_creds(scope))) url = data_source.get_jdbc_url # Assert that the URL is generated correctly assert url == ( @@ -94,7 +95,7 @@ def test_read_data_with_options(): # spark assertions spark.read.format.assert_called_with("jdbc") - spark.read.format().option.assert_called_with( + spark.read.format().option.assert_called_with( # FIXME "url", "jdbc:sqlserver://my_host:777;databaseName=my_database;user=my_user;password=my_password;encrypt=true;trustServerCertificate=true;", ) From e07e443c3c19c32b46a386286c348ef4d9ca2150 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Mon, 10 Nov 2025 15:17:12 +0100 Subject: [PATCH 06/44] improve snowflake load credentials --- .../lakebridge/reconcile/connectors/snowflake.py | 9 +++++---- .../reconcile/query_builder/test_execute.py | 13 ++++++++++--- tests/unit/reconcile/connectors/test_sql_server.py | 5 +++-- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py b/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py index efa6978eee..98443eb63b 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py @@ -67,11 +67,12 @@ def load_credentials(self, creds: ReconcileCredentialConfig) -> "SnowflakeDataSo "sfWarehouse", "sfRole", ] - connector_additional_creds = ["sfPassword", "pem_private_key", "pem_private_key_password"] use_scope = creds.source_creds.get("__secret_scope") if use_scope: - source_creds = {key: f"{use_scope}/{key}" for key in connector_creds + connector_additional_creds} + # to use pem key and/or pem password, migrate to source_creds approach + connector_creds += ["sfPassword"] + source_creds = {key: f"{use_scope}/{key}" for key in connector_creds} assert creds.vault_type == "databricks", "Secret scope provided, vault_type must be 'databricks'" parsed_creds = build_credentials(creds.vault_type, "snowflake", source_creds) @@ -83,8 +84,8 @@ def load_credentials(self, creds: ReconcileCredentialConfig) -> "SnowflakeDataSo self._creds.get(k) for k in connector_creds ), f"Missing mandatory Snowflake credentials. Please configure all of {connector_creds}." assert any( - self._creds.get(k) for k in connector_additional_creds - ), f"Missing Snowflake credentials. Please configure any of {connector_additional_creds}." + self._creds.get(k) for k in ["sfPassword", "pem_private_key"] + ), "Missing Snowflake credentials. Please configure any of [sfPassword, pem_private_key]." 
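# A minimal sketch of driving the refined Snowflake loader: sfUser, sfUrl, sfDatabase,
# sfSchema, sfWarehouse and sfRole are mandatory, plus either sfPassword or
# pem_private_key. vault_type="local" uses the values as-is; a {"__secret_scope": "<scope>"}
# entry with vault_type="databricks" resolves each key from Databricks secrets instead.
# Values are hypothetical.
from databricks.labs.lakebridge.config import ReconcileCredentialConfig

snowflake_creds = ReconcileCredentialConfig(
    vault_type="local",
    source_creds={
        "sfUser": "my_user",
        "sfPassword": "my_password",
        "sfUrl": "my_account.snowflakecomputing.com",
        "sfDatabase": "my_database",
        "sfSchema": "my_schema",
        "sfWarehouse": "my_warehouse",
        "sfRole": "my_role",
    },
)
# SnowflakeDataSource(engine, spark, ws).load_credentials(snowflake_creds)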
if self._creds.get("pem_private_key"): self._creds["pem_private_key"] = SnowflakeDataSource._get_private_key( diff --git a/tests/integration/reconcile/query_builder/test_execute.py b/tests/integration/reconcile/query_builder/test_execute.py index 96cc6e73aa..80092d16d1 100644 --- a/tests/integration/reconcile/query_builder/test_execute.py +++ b/tests/integration/reconcile/query_builder/test_execute.py @@ -1,13 +1,17 @@ +import base64 from pathlib import Path from dataclasses import dataclass from datetime import datetime -from unittest.mock import patch, MagicMock +from unittest.mock import patch, MagicMock, create_autospec import pytest from pyspark import Row from pyspark.errors import PySparkException from pyspark.testing import assertDataFrameEqual +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.workspace import GetSecretResponse + from databricks.labs.lakebridge.config import ( DatabaseConfig, TableRecon, @@ -1904,7 +1908,7 @@ def test_initialise_data_source(mock_workspace_client, mock_spark): src_engine = get_dialect("snowflake") source, target = initialise_data_source( - mock_workspace_client, mock_spark, "snowflake", ReconcileCredentialConfig("xx", {}) + mock_workspace_client, mock_spark, "snowflake", ReconcileCredentialConfig("local", {}) ) snowflake_data_source = SnowflakeDataSource(src_engine, mock_spark, mock_workspace_client).__class__ @@ -2022,7 +2026,10 @@ def test_reconcile_data_with_threshold_and_row_report_type( @patch('databricks.labs.lakebridge.reconcile.recon_capture.generate_final_reconcile_output') def test_recon_output_without_exception(mock_gen_final_recon_output): - mock_workspace_client = MagicMock() + mock_workspace_client = create_autospec(WorkspaceClient) + mock_workspace_client.secrets.get_secret.return_value = GetSecretResponse( + key="key", value=base64.b64encode(bytes('value', 'utf-8')).decode('utf-8') + ) mock_spark = MagicMock() mock_table_recon = MagicMock() mock_gen_final_recon_output.return_value = ReconcileOutput( diff --git a/tests/unit/reconcile/connectors/test_sql_server.py b/tests/unit/reconcile/connectors/test_sql_server.py index 101e9c1b71..32c81d3bd7 100644 --- a/tests/unit/reconcile/connectors/test_sql_server.py +++ b/tests/unit/reconcile/connectors/test_sql_server.py @@ -75,10 +75,11 @@ def test_get_jdbc_url_happy(): def test_read_data_with_options(): # initial setup - engine, spark, ws, _ = initial_setup() + engine, spark, ws, scope = initial_setup() # create object for MSSQLServerDataSource data_source = TSQLServerDataSource(engine, spark, ws) + data_source.load_credentials(ReconcileCredentialConfig("databricks", mssql_creds(scope))) # Create a Tables configuration object with JDBC reader options table_conf = Table( source_name="src_supplier", @@ -95,7 +96,7 @@ def test_read_data_with_options(): # spark assertions spark.read.format.assert_called_with("jdbc") - spark.read.format().option.assert_called_with( # FIXME + spark.read.format().option.assert_called_with( "url", "jdbc:sqlserver://my_host:777;databaseName=my_database;user=my_user;password=my_password;encrypt=true;trustServerCertificate=true;", ) From fb6692dde917c1b59890303a3aad9b7834315503 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Mon, 10 Nov 2025 15:58:08 +0100 Subject: [PATCH 07/44] add check if user loaded credentials in snowflake and use local creds in one test for it to be green --- .../reconcile/connectors/snowflake.py | 17 ++++++++--------- .../reconcile/query_builder/test_execute.py | 11 ++++++++++- 2 files changed, 18 insertions(+), 10 
deletions(-) diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py b/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py index 98443eb63b..173e834043 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py @@ -84,7 +84,7 @@ def load_credentials(self, creds: ReconcileCredentialConfig) -> "SnowflakeDataSo self._creds.get(k) for k in connector_creds ), f"Missing mandatory Snowflake credentials. Please configure all of {connector_creds}." assert any( - self._creds.get(k) for k in ["sfPassword", "pem_private_key"] + self._creds.get(k) for k in ("sfPassword", "pem_private_key") ), "Missing Snowflake credentials. Please configure any of [sfPassword, pem_private_key]." if self._creds.get("pem_private_key"): @@ -97,19 +97,15 @@ def load_credentials(self, creds: ReconcileCredentialConfig) -> "SnowflakeDataSo @property def get_jdbc_url(self) -> str: - sf_password = self._creds.get('sfPassword') - if not sf_password: - message = "sfPassword is mandatory for jdbc connectivity with Snowflake." - logger.error(message) - raise ValueError(message) - # TODO Support PEM key auth + if not self._creds: + raise RuntimeError("Credentials not loaded. Please call `load_credentials(ReconcileCredentialConfig)`.") return ( f"jdbc:{SnowflakeDataSource._DRIVER}://{self._creds['sfUrl']}" - f"/?user={self._creds['sfUser']}&password={sf_password}" + f"/?user={self._creds['sfUser']}&password={self._creds['sfPassword']}" f"&db={self._creds['sfDatabase']}&schema={self._creds['sfSchema']}" f"&warehouse={self._creds['sfWarehouse']}&role={self._creds['sfRole']}" - ) + ) # TODO Support PEM key auth def read_data( self, @@ -164,6 +160,9 @@ def get_schema( return self.log_and_throw_exception(e, "schema", schema_query) def reader(self, query: str) -> DataFrameReader: + if not self._creds: + raise RuntimeError("Credentials not loaded. 
Please call `load_credentials(ReconcileCredentialConfig)`.") + return self._spark.read.format("snowflake").option("dbtable", f"({query}) as tmp").options(**self._creds) @staticmethod diff --git a/tests/integration/reconcile/query_builder/test_execute.py b/tests/integration/reconcile/query_builder/test_execute.py index 80092d16d1..c510a1d04b 100644 --- a/tests/integration/reconcile/query_builder/test_execute.py +++ b/tests/integration/reconcile/query_builder/test_execute.py @@ -1907,8 +1907,17 @@ def test_data_recon_with_source_exception( def test_initialise_data_source(mock_workspace_client, mock_spark): src_engine = get_dialect("snowflake") + sf_creds = { + "sfUser": "user", + "sfPassword": "password", + "sfUrl": "account.snowflakecomputing.com", + "sfDatabase": "database", + "sfSchema": "schema", + "sfWarehouse": "warehouse", + "sfRole": "role", + } source, target = initialise_data_source( - mock_workspace_client, mock_spark, "snowflake", ReconcileCredentialConfig("local", {}) + mock_workspace_client, mock_spark, "snowflake", ReconcileCredentialConfig("local", sf_creds) ) snowflake_data_source = SnowflakeDataSource(src_engine, mock_spark, mock_workspace_client).__class__ From a16455d2aecaf0866e8d70486d6a0bb0d84b9035 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Fri, 21 Nov 2025 09:48:13 +0100 Subject: [PATCH 08/44] revert changes to reconcile before moving to a new branch --- .../reconcile/connectors/data_source.py | 28 +---- .../reconcile/connectors/databricks.py | 12 +- .../reconcile/connectors/dialect_utils.py | 8 +- .../reconcile/connectors/jdbc_reader.py | 1 + .../lakebridge/reconcile/connectors/models.py | 7 ++ .../lakebridge/reconcile/connectors/oracle.py | 53 +++----- .../reconcile/connectors/secrets.py | 49 ++++++++ .../reconcile/connectors/snowflake.py | 115 ++++++++--------- .../reconcile/connectors/source_adapter.py | 9 +- .../lakebridge/reconcile/connectors/tsql.py | 52 ++------ .../reconcile/trigger_recon_service.py | 2 +- .../labs/lakebridge/reconcile/utils.py | 10 +- .../reconcile/connectors/test_read_schema.py | 14 +-- .../reconcile/query_builder/test_execute.py | 30 +---- .../reconcile/test_oracle_reconcile.py | 2 +- .../reconcile/connectors/test_databricks.py | 26 ++-- .../unit/reconcile/connectors/test_oracle.py | 36 ++---- .../unit/reconcile/connectors/test_secrets.py | 48 ++++--- .../reconcile/connectors/test_snowflake.py | 117 ++++++++---------- .../reconcile/connectors/test_sql_server.py | 46 ++++--- tests/unit/reconcile/test_source_adapter.py | 18 +-- 21 files changed, 310 insertions(+), 373 deletions(-) create mode 100644 src/databricks/labs/lakebridge/reconcile/connectors/models.py create mode 100644 src/databricks/labs/lakebridge/reconcile/connectors/secrets.py diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/data_source.py b/src/databricks/labs/lakebridge/reconcile/connectors/data_source.py index 0b3c6e6388..9294768b77 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/data_source.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/data_source.py @@ -3,31 +3,14 @@ from pyspark.sql import DataFrame -from databricks.labs.lakebridge.config import ReconcileCredentialConfig -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils +from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier from databricks.labs.lakebridge.reconcile.exception import 
DataSourceRuntimeException from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema logger = logging.getLogger(__name__) -def build_credentials(vault_type: str, source: str, credentials: dict) -> dict: - """Build credentials dictionary with secret vault type included. - - Args: - vault_type: The type of secret vault (e.g., 'local', 'databricks'). - source: The source system name. - credentials: The original credentials dictionary. - - Returns: - A new credentials dictionary including the secret vault type. - """ - return { - source: credentials, - 'secret_vault_type': vault_type.lower(), - } - - class DataSource(ABC): @abstractmethod @@ -51,10 +34,6 @@ def get_schema( ) -> list[Schema]: return NotImplemented - @abstractmethod - def load_credentials(self, creds: ReconcileCredentialConfig) -> "DataSource": - return NotImplemented - @abstractmethod def normalize_identifier(self, identifier: str) -> NormalizedIdentifier: pass @@ -115,8 +94,5 @@ def get_schema(self, catalog: str | None, schema: str, table: str, normalize: bo return self.log_and_throw_exception(self._exception, "schema", f"({catalog}, {schema}, {table})") return mock_schema - def load_credentials(self, creds: ReconcileCredentialConfig) -> "MockDataSource": - return self - def normalize_identifier(self, identifier: str) -> NormalizedIdentifier: return DialectUtils.normalize_identifier(identifier, self._delimiter, self._delimiter) diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py b/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py index ef7fe9c7ce..89d05b3e4c 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py @@ -7,9 +7,10 @@ from pyspark.sql.functions import col from sqlglot import Dialect -from databricks.labs.lakebridge.config import ReconcileCredentialConfig from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema from databricks.sdk import WorkspaceClient @@ -35,7 +36,7 @@ def _get_schema_query(catalog: str, schema: str, table: str): return re.sub(r'\s+', ' ', query) -class DatabricksDataSource(DataSource): +class DatabricksDataSource(DataSource, SecretsMixin): _IDENTIFIER_DELIMITER = "`" def __init__( @@ -43,10 +44,12 @@ def __init__( engine: Dialect, spark: SparkSession, ws: WorkspaceClient, + secret_scope: str, ): self._engine = engine self._spark = spark self._ws = ws + self._secret_scope = secret_scope def read_data( self, @@ -93,9 +96,6 @@ def get_schema( except (RuntimeError, PySparkException) as e: return self.log_and_throw_exception(e, "schema", schema_query) - def load_credentials(self, creds: ReconcileCredentialConfig) -> "DatabricksDataSource": - return self - def normalize_identifier(self, identifier: str) -> NormalizedIdentifier: return DialectUtils.normalize_identifier( identifier, diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py b/src/databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py index 2785fd8002..665755e85c 
100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py @@ -1,10 +1,4 @@ -import dataclasses - - -@dataclasses.dataclass() -class NormalizedIdentifier: - ansi_normalized: str - source_normalized: str +from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier class DialectUtils: diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py b/src/databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py index 7159d04368..f2313e7a90 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py @@ -8,6 +8,7 @@ class JDBCReaderMixin: _spark: SparkSession + # TODO update the url def _get_jdbc_reader(self, query, jdbc_url, driver, additional_options: dict | None = None): driver_class = { "oracle": "oracle.jdbc.OracleDriver", diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/models.py b/src/databricks/labs/lakebridge/reconcile/connectors/models.py new file mode 100644 index 0000000000..c98cbef7dd --- /dev/null +++ b/src/databricks/labs/lakebridge/reconcile/connectors/models.py @@ -0,0 +1,7 @@ +import dataclasses + + +@dataclasses.dataclass +class NormalizedIdentifier: + ansi_normalized: str + source_normalized: str diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py b/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py index 26c042c9f3..ebcd5f0991 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py @@ -7,18 +7,18 @@ from pyspark.sql.functions import col from sqlglot import Dialect -from databricks.labs.lakebridge.config import ReconcileCredentialConfig -from databricks.labs.lakebridge.connections.credential_manager import create_credential_manager -from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource, build_credentials +from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema from databricks.sdk import WorkspaceClient logger = logging.getLogger(__name__) -class OracleDataSource(DataSource, JDBCReaderMixin): +class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin): _DRIVER = "oracle" _IDENTIFIER_DELIMITER = "\"" _SCHEMA_QUERY = """select column_name, case when (data_precision is not null @@ -34,17 +34,23 @@ class OracleDataSource(DataSource, JDBCReaderMixin): FROM ALL_TAB_COLUMNS WHERE lower(TABLE_NAME) = '{table}' and lower(owner) = '{owner}'""" - def __init__(self, engine: Dialect, spark: SparkSession, ws: WorkspaceClient): + def __init__( + self, + engine: Dialect, + spark: SparkSession, + ws: WorkspaceClient, + secret_scope: str, + ): self._engine = engine self._spark = spark self._ws = ws - self._creds: dict[str, str] = {} + self._secret_scope = secret_scope @property def get_jdbc_url(self) -> str: return ( - 
f"jdbc:{OracleDataSource._DRIVER}:thin:@//{self._creds.get('host')}" - f":{self._creds.get('port')}/{self._creds.get('database')}" + f"jdbc:{OracleDataSource._DRIVER}:thin:@//{self._get_secret('host')}" + f":{self._get_secret('port')}/{self._get_secret('database')}" ) def read_data( @@ -102,38 +108,13 @@ def _get_timestamp_options() -> dict[str, str]: } def reader(self, query: str) -> DataFrameReader: - user = self._creds.get('user') - password = self._creds.get('password') + user = self._get_secret('user') + password = self._get_secret('password') logger.debug(f"Using user: {user} to connect to Oracle") return self._get_jdbc_reader( query, self.get_jdbc_url, OracleDataSource._DRIVER, {"user": user, "password": password} ) - def load_credentials(self, creds: ReconcileCredentialConfig) -> "OracleDataSource": - connector_creds = [ - "host", - "port", - "database", - "user", - "password", - ] - - use_scope = creds.source_creds.get("__secret_scope") - if use_scope: - source_creds = {key: f"{use_scope}/{key}" for key in connector_creds} - - assert creds.vault_type == "databricks", "Secret scope provided, vault_type must be 'databricks'" - parsed_creds = build_credentials(creds.vault_type, "oracle", source_creds) - else: - parsed_creds = build_credentials(creds.vault_type, "oracle", creds.source_creds) - - self._creds = create_credential_manager(parsed_creds, self._ws).get_credentials("oracle") - assert all( - self._creds.get(k) for k in connector_creds - ), f"Missing mandatory Oracle credentials. Please configure all of {connector_creds}." - - return self - def normalize_identifier(self, identifier: str) -> NormalizedIdentifier: normalized = DialectUtils.normalize_identifier( identifier, diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/secrets.py b/src/databricks/labs/lakebridge/reconcile/connectors/secrets.py new file mode 100644 index 0000000000..daa213afc8 --- /dev/null +++ b/src/databricks/labs/lakebridge/reconcile/connectors/secrets.py @@ -0,0 +1,49 @@ +import base64 +import logging + +from databricks.sdk import WorkspaceClient +from databricks.sdk.errors import NotFound + +logger = logging.getLogger(__name__) + + +# TODO use CredentialManager to allow for changing secret provider for tests +class SecretsMixin: + _ws: WorkspaceClient + _secret_scope: str + + def _get_secret_or_none(self, secret_key: str) -> str | None: + """ + Get the secret value given a secret scope & secret key. Log a warning if secret does not exist + Used To ensure backwards compatibility when supporting new secrets + """ + try: + # Return the decoded secret value in string format + return self._get_secret(secret_key) + except NotFound as e: + logger.warning(f"Secret not found: key={secret_key}") + logger.debug("Secret lookup failed", exc_info=e) + return None + + def _get_secret(self, secret_key: str) -> str: + """Get the secret value given a secret scope & secret key. + + Raises: + NotFound: The secret could not be found. + UnicodeDecodeError: The secret value was not Base64-encoded UTF-8. 
+ """ + try: + # Return the decoded secret value in string format + secret = self._ws.secrets.get_secret(self._secret_scope, secret_key) + assert secret.value is not None + return base64.b64decode(secret.value).decode("utf-8") + except NotFound as e: + raise NotFound(f'Secret does not exist with scope: {self._secret_scope} and key: {secret_key} : {e}') from e + except UnicodeDecodeError as e: + raise UnicodeDecodeError( + "utf-8", + secret_key.encode(), + 0, + 1, + f"Secret {self._secret_scope}/{secret_key} has Base64 bytes that cannot be decoded to utf-8 string: {e}.", + ) from e diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py b/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py index 173e834043..e66751d29b 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py @@ -9,21 +9,20 @@ from cryptography.hazmat.backends import default_backend from cryptography.hazmat.primitives import serialization -from databricks.labs.lakebridge.config import ReconcileCredentialConfig -from databricks.labs.lakebridge.connections.credential_manager import ( - create_credential_manager, -) -from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource, build_credentials +from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils from databricks.labs.lakebridge.reconcile.exception import InvalidSnowflakePemPrivateKey from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema from databricks.sdk import WorkspaceClient +from databricks.sdk.errors import NotFound logger = logging.getLogger(__name__) -class SnowflakeDataSource(DataSource, JDBCReaderMixin): +class SnowflakeDataSource(DataSource, SecretsMixin, JDBCReaderMixin): _DRIVER = "snowflake" _IDENTIFIER_DELIMITER = "\"" @@ -52,60 +51,33 @@ class SnowflakeDataSource(DataSource, JDBCReaderMixin): where lower(table_name)='{table}' and table_schema = '{schema}' order by ordinal_position""" - def __init__(self, engine: Dialect, spark: SparkSession, ws: WorkspaceClient): + def __init__( + self, + engine: Dialect, + spark: SparkSession, + ws: WorkspaceClient, + secret_scope: str, + ): self._engine = engine self._spark = spark self._ws = ws - self._creds: dict[str, str] = {} - - def load_credentials(self, creds: ReconcileCredentialConfig) -> "SnowflakeDataSource": - connector_creds = [ - "sfUser", - "sfUrl", - "sfDatabase", - "sfSchema", - "sfWarehouse", - "sfRole", - ] - - use_scope = creds.source_creds.get("__secret_scope") - if use_scope: - # to use pem key and/or pem password, migrate to source_creds approach - connector_creds += ["sfPassword"] - source_creds = {key: f"{use_scope}/{key}" for key in connector_creds} - - assert creds.vault_type == "databricks", "Secret scope provided, vault_type must be 'databricks'" - parsed_creds = build_credentials(creds.vault_type, "snowflake", source_creds) - else: - parsed_creds = build_credentials(creds.vault_type, "snowflake", creds.source_creds) - - self._creds = 
create_credential_manager(parsed_creds, self._ws).get_credentials("snowflake") - assert all( - self._creds.get(k) for k in connector_creds - ), f"Missing mandatory Snowflake credentials. Please configure all of {connector_creds}." - assert any( - self._creds.get(k) for k in ("sfPassword", "pem_private_key") - ), "Missing Snowflake credentials. Please configure any of [sfPassword, pem_private_key]." - - if self._creds.get("pem_private_key"): - self._creds["pem_private_key"] = SnowflakeDataSource._get_private_key( - self._creds["pem_private_key"], - self._creds.get("pem_private_key_password"), - ) - - return self + self._secret_scope = secret_scope @property def get_jdbc_url(self) -> str: - if not self._creds: - raise RuntimeError("Credentials not loaded. Please call `load_credentials(ReconcileCredentialConfig)`.") + try: + sf_password = self._get_secret('sfPassword') + except (NotFound, KeyError) as e: + message = "sfPassword is mandatory for jdbc connectivity with Snowflake." + logger.error(message) + raise NotFound(message) from e return ( - f"jdbc:{SnowflakeDataSource._DRIVER}://{self._creds['sfUrl']}" - f"/?user={self._creds['sfUser']}&password={self._creds['sfPassword']}" - f"&db={self._creds['sfDatabase']}&schema={self._creds['sfSchema']}" - f"&warehouse={self._creds['sfWarehouse']}&role={self._creds['sfRole']}" - ) # TODO Support PEM key auth + f"jdbc:{SnowflakeDataSource._DRIVER}://{self._get_secret('sfAccount')}.snowflakecomputing.com" + f"/?user={self._get_secret('sfUser')}&password={sf_password}" + f"&db={self._get_secret('sfDatabase')}&schema={self._get_secret('sfSchema')}" + f"&warehouse={self._get_secret('sfWarehouse')}&role={self._get_secret('sfRole')}" + ) def read_data( self, @@ -160,10 +132,39 @@ def get_schema( return self.log_and_throw_exception(e, "schema", schema_query) def reader(self, query: str) -> DataFrameReader: - if not self._creds: - raise RuntimeError("Credentials not loaded. Please call `load_credentials(ReconcileCredentialConfig)`.") - - return self._spark.read.format("snowflake").option("dbtable", f"({query}) as tmp").options(**self._creds) + options = self._get_snowflake_options() + return self._spark.read.format("snowflake").option("dbtable", f"({query}) as tmp").options(**options) + + # TODO cache this method using @functools.cache + # Pay attention to https://pylint.pycqa.org/en/latest/user_guide/messages/warning/method-cache-max-size-none.html + def _get_snowflake_options(self): + options = { + "sfUrl": self._get_secret('sfUrl'), + "sfUser": self._get_secret('sfUser'), + "sfDatabase": self._get_secret('sfDatabase'), + "sfSchema": self._get_secret('sfSchema'), + "sfWarehouse": self._get_secret('sfWarehouse'), + "sfRole": self._get_secret('sfRole'), + } + options = options | self._get_snowflake_auth_options() + + return options + + def _get_snowflake_auth_options(self): + try: + key = SnowflakeDataSource._get_private_key( + self._get_secret('pem_private_key'), self._get_secret_or_none('pem_private_key_password') + ) + return {"pem_private_key": key} + except (NotFound, KeyError): + logger.warning("pem_private_key not found. Checking for sfPassword") + try: + password = self._get_secret('sfPassword') + return {"sfPassword": password} + except (NotFound, KeyError) as e: + message = "sfPassword and pem_private_key not found. Either one is required for snowflake auth." 
+ logger.error(message) + raise NotFound(message) from e @staticmethod def _get_private_key(pem_private_key: str, pem_private_key_password: str | None) -> str: diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/source_adapter.py b/src/databricks/labs/lakebridge/reconcile/connectors/source_adapter.py index 286bb36a8f..71039f4494 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/source_adapter.py @@ -17,13 +17,14 @@ def create_adapter( engine: Dialect, spark: SparkSession, ws: WorkspaceClient, + secret_scope: str, ) -> DataSource: if isinstance(engine, Snowflake): - return SnowflakeDataSource(engine, spark, ws) + return SnowflakeDataSource(engine, spark, ws, secret_scope) if isinstance(engine, Oracle): - return OracleDataSource(engine, spark, ws) + return OracleDataSource(engine, spark, ws, secret_scope) if isinstance(engine, Databricks): - return DatabricksDataSource(engine, spark, ws) + return DatabricksDataSource(engine, spark, ws, secret_scope) if isinstance(engine, TSQL): - return TSQLServerDataSource(engine, spark, ws) + return TSQLServerDataSource(engine, spark, ws, secret_scope) raise ValueError(f"Unsupported source type --> {engine}") diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py b/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py index c0ddf524db..a5241b7ac8 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py @@ -7,11 +7,11 @@ from pyspark.sql.functions import col from sqlglot import Dialect -from databricks.labs.lakebridge.config import ReconcileCredentialConfig -from databricks.labs.lakebridge.connections.credential_manager import create_credential_manager -from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource, build_credentials +from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema from databricks.sdk import WorkspaceClient @@ -49,7 +49,7 @@ """ -class TSQLServerDataSource(DataSource, JDBCReaderMixin): +class TSQLServerDataSource(DataSource, SecretsMixin, JDBCReaderMixin): _DRIVER = "sqlserver" _IDENTIFIER_DELIMITER = {"prefix": "[", "suffix": "]"} @@ -58,22 +58,23 @@ def __init__( engine: Dialect, spark: SparkSession, ws: WorkspaceClient, + secret_scope: str, ): self._engine = engine self._spark = spark self._ws = ws - self._creds: dict[str, str] = {} + self._secret_scope = secret_scope @property def get_jdbc_url(self) -> str: # Construct the JDBC URL return ( - f"jdbc:{self._DRIVER}://{self._creds.get('host')}:{self._creds.get('port')};" - f"databaseName={self._creds.get('database')};" - f"user={self._creds.get('user')};" - f"password={self._creds.get('password')};" - f"encrypt={self._creds.get('encrypt')};" - f"trustServerCertificate={self._creds.get('trustServerCertificate')};" + f"jdbc:{self._DRIVER}://{self._get_secret('host')}:{self._get_secret('port')};" 
+ f"databaseName={self._get_secret('database')};" + f"user={self._get_secret('user')};" + f"password={self._get_secret('password')};" + f"encrypt={self._get_secret('encrypt')};" + f"trustServerCertificate={self._get_secret('trustServerCertificate')};" ) def read_data( @@ -103,33 +104,6 @@ def read_data( except (RuntimeError, PySparkException) as e: return self.log_and_throw_exception(e, "data", table_query) - def load_credentials(self, creds: ReconcileCredentialConfig) -> "TSQLServerDataSource": - connector_creds = [ - "host", - "port", - "database", - "user", - "password", - "encrypt", - "trustServerCertificate", - ] - - use_scope = creds.source_creds.get("__secret_scope") - if use_scope: - source_creds = {key: f"{use_scope}/{key}" for key in connector_creds} - - assert creds.vault_type == "databricks", "Secret scope provided, vault_type must be 'databricks'" - parsed_creds = build_credentials(creds.vault_type, "mssql", source_creds) - else: - parsed_creds = build_credentials(creds.vault_type, "mssql", creds.source_creds) - - self._creds = create_credential_manager(parsed_creds, self._ws).get_credentials("mssql") - assert all( - self._creds.get(k) for k in connector_creds - ), f"Missing mandatory MS SQL credentials. Please configure all of {connector_creds}." - - return self - def get_schema( self, catalog: str | None, diff --git a/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py b/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py index 9873a177ca..3fd837d668 100644 --- a/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py +++ b/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py @@ -74,7 +74,7 @@ def create_recon_dependencies( engine=reconcile_config.data_source, spark=spark, ws=ws_client, - creds=reconcile_config.creds, + secret_scope=reconcile_config.secret_scope, ) recon_id = str(uuid4()) diff --git a/src/databricks/labs/lakebridge/reconcile/utils.py b/src/databricks/labs/lakebridge/reconcile/utils.py index 1fa80b6a37..42a309d8da 100644 --- a/src/databricks/labs/lakebridge/reconcile/utils.py +++ b/src/databricks/labs/lakebridge/reconcile/utils.py @@ -4,7 +4,7 @@ from databricks.sdk import WorkspaceClient -from databricks.labs.lakebridge.config import ReconcileMetadataConfig, ReconcileCredentialConfig +from databricks.labs.lakebridge.config import ReconcileMetadataConfig from databricks.labs.lakebridge.reconcile.connectors.source_adapter import create_adapter from databricks.labs.lakebridge.reconcile.exception import InvalidInputException from databricks.labs.lakebridge.reconcile.recon_config import Table @@ -17,12 +17,10 @@ def initialise_data_source( ws: WorkspaceClient, spark: SparkSession, engine: str, - creds: ReconcileCredentialConfig, + secret_scope: str, ): - source = create_adapter(engine=get_dialect(engine), spark=spark, ws=ws) - target = create_adapter(engine=get_dialect("databricks"), spark=spark, ws=ws) - source.load_credentials(creds) - target.load_credentials(creds) + source = create_adapter(engine=get_dialect(engine), spark=spark, ws=ws, secret_scope=secret_scope) + target = create_adapter(engine=get_dialect("databricks"), spark=spark, ws=ws, secret_scope=secret_scope) return source, target diff --git a/tests/integration/reconcile/connectors/test_read_schema.py b/tests/integration/reconcile/connectors/test_read_schema.py index 5f5a7f3f77..b1d2752c7f 100644 --- a/tests/integration/reconcile/connectors/test_read_schema.py +++ b/tests/integration/reconcile/connectors/test_read_schema.py @@ -17,8 +17,8 @@ class 
TSQLServerDataSourceUnderTest(TSQLServerDataSource): def __init__(self, spark, ws): - super().__init__(get_dialect("tsql"), spark, ws) - self._test_env = TestEnvGetter(True) # TODO use load_credentials + super().__init__(get_dialect("tsql"), spark, ws, "secret_scope") + self._test_env = TestEnvGetter(True) @property def get_jdbc_url(self) -> str: @@ -31,8 +31,8 @@ def get_jdbc_url(self) -> str: class OracleDataSourceUnderTest(OracleDataSource): def __init__(self, spark, ws): - super().__init__(get_dialect("oracle"), spark, ws) - self._test_env = TestEnvGetter(False) # TODO use load_credentials + super().__init__(get_dialect("oracle"), spark, ws, "secret_scope") + self._test_env = TestEnvGetter(False) @property def get_jdbc_url(self) -> str: @@ -48,8 +48,8 @@ def reader(self, query: str) -> DataFrameReader: class SnowflakeDataSourceUnderTest(SnowflakeDataSource): def __init__(self, spark, ws): - super().__init__(get_dialect("snowflake"), spark, ws) - self._test_env = TestEnvGetter(True) # TODO use load_credentials + super().__init__(get_dialect("snowflake"), spark, ws, "secret_scope") + self._test_env = TestEnvGetter(True) @property def get_jdbc_url(self) -> str: @@ -86,7 +86,7 @@ def test_sql_server_read_schema_happy(mock_spark): def test_databricks_read_schema_happy(mock_spark): mock_ws = create_autospec(WorkspaceClient) - connector = DatabricksDataSource(get_dialect("databricks"), mock_spark, mock_ws) + connector = DatabricksDataSource(get_dialect("databricks"), mock_spark, mock_ws, "my_secret") mock_spark.sql("CREATE DATABASE IF NOT EXISTS my_test_db") mock_spark.sql("CREATE TABLE IF NOT EXISTS my_test_db.my_test_table (id INT, name STRING) USING parquet") diff --git a/tests/integration/reconcile/query_builder/test_execute.py b/tests/integration/reconcile/query_builder/test_execute.py index c510a1d04b..0015ff9dbb 100644 --- a/tests/integration/reconcile/query_builder/test_execute.py +++ b/tests/integration/reconcile/query_builder/test_execute.py @@ -1,23 +1,18 @@ -import base64 from pathlib import Path from dataclasses import dataclass from datetime import datetime -from unittest.mock import patch, MagicMock, create_autospec +from unittest.mock import patch, MagicMock import pytest from pyspark import Row from pyspark.errors import PySparkException from pyspark.testing import assertDataFrameEqual -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.workspace import GetSecretResponse - from databricks.labs.lakebridge.config import ( DatabaseConfig, TableRecon, ReconcileMetadataConfig, ReconcileConfig, - ReconcileCredentialConfig, ) from databricks.labs.lakebridge.reconcile.reconciliation import Reconciliation from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService @@ -1906,22 +1901,12 @@ def test_data_recon_with_source_exception( def test_initialise_data_source(mock_workspace_client, mock_spark): src_engine = get_dialect("snowflake") + secret_scope = "test" - sf_creds = { - "sfUser": "user", - "sfPassword": "password", - "sfUrl": "account.snowflakecomputing.com", - "sfDatabase": "database", - "sfSchema": "schema", - "sfWarehouse": "warehouse", - "sfRole": "role", - } - source, target = initialise_data_source( - mock_workspace_client, mock_spark, "snowflake", ReconcileCredentialConfig("local", sf_creds) - ) + source, target = initialise_data_source(mock_workspace_client, mock_spark, src_engine, secret_scope) - snowflake_data_source = SnowflakeDataSource(src_engine, mock_spark, mock_workspace_client).__class__ - databricks_data_source 
= DatabricksDataSource(src_engine, mock_spark, mock_workspace_client).__class__ + snowflake_data_source = SnowflakeDataSource(src_engine, mock_spark, mock_workspace_client, secret_scope).__class__ + databricks_data_source = DatabricksDataSource(src_engine, mock_spark, mock_workspace_client, secret_scope).__class__ assert isinstance(source, snowflake_data_source) assert isinstance(target, databricks_data_source) @@ -2035,10 +2020,7 @@ def test_reconcile_data_with_threshold_and_row_report_type( @patch('databricks.labs.lakebridge.reconcile.recon_capture.generate_final_reconcile_output') def test_recon_output_without_exception(mock_gen_final_recon_output): - mock_workspace_client = create_autospec(WorkspaceClient) - mock_workspace_client.secrets.get_secret.return_value = GetSecretResponse( - key="key", value=base64.b64encode(bytes('value', 'utf-8')).decode('utf-8') - ) + mock_workspace_client = MagicMock() mock_spark = MagicMock() mock_table_recon = MagicMock() mock_gen_final_recon_output.return_value = ReconcileOutput( diff --git a/tests/integration/reconcile/test_oracle_reconcile.py b/tests/integration/reconcile/test_oracle_reconcile.py index 29ceb47549..841b130168 100644 --- a/tests/integration/reconcile/test_oracle_reconcile.py +++ b/tests/integration/reconcile/test_oracle_reconcile.py @@ -17,7 +17,7 @@ class DatabricksDataSourceUnderTest(DatabricksDataSource): def __init__(self, databricks, ws, local_spark): - super().__init__(get_dialect("databricks"), databricks, ws) + super().__init__(get_dialect("databricks"), databricks, ws, "not used") self._local_spark = local_spark def read_data( diff --git a/tests/unit/reconcile/connectors/test_databricks.py b/tests/unit/reconcile/connectors/test_databricks.py index 2f69dbd317..7f89612e85 100644 --- a/tests/unit/reconcile/connectors/test_databricks.py +++ b/tests/unit/reconcile/connectors/test_databricks.py @@ -3,7 +3,7 @@ import pytest -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect from databricks.labs.lakebridge.reconcile.connectors.databricks import DatabricksDataSource from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException @@ -23,10 +23,10 @@ def initial_setup(): def test_get_schema(): # initial setup - engine, spark, ws, _ = initial_setup() + engine, spark, ws, scope = initial_setup() # catalog as catalog - ddds = DatabricksDataSource(engine, spark, ws) + ddds = DatabricksDataSource(engine, spark, ws, scope) ddds.get_schema("catalog", "schema", "supplier") spark.sql.assert_called_with( re.sub( @@ -56,10 +56,10 @@ def test_get_schema(): def test_read_data_from_uc(): # initial setup - engine, spark, ws, _ = initial_setup() + engine, spark, ws, scope = initial_setup() # create object for DatabricksDataSource - ddds = DatabricksDataSource(engine, spark, ws) + ddds = DatabricksDataSource(engine, spark, ws, scope) # Test with query ddds.read_data("org", "data", "employee", "select id as id, name as name from :tbl", None) @@ -72,10 +72,10 @@ def test_read_data_from_uc(): def test_read_data_from_hive(): # initial setup - engine, spark, ws, _ = initial_setup() + engine, spark, ws, scope = initial_setup() # create object for DatabricksDataSource - ddds = DatabricksDataSource(engine, spark, ws) + ddds = DatabricksDataSource(engine, spark, ws, scope) # Test with query ddds.read_data("hive_metastore", 
"data", "employee", "select id as id, name as name from :tbl", None) @@ -88,10 +88,10 @@ def test_read_data_from_hive(): def test_read_data_exception_handling(): # initial setup - engine, spark, ws, _ = initial_setup() + engine, spark, ws, scope = initial_setup() # create object for DatabricksDataSource - ddds = DatabricksDataSource(engine, spark, ws) + ddds = DatabricksDataSource(engine, spark, ws, scope) spark.sql.side_effect = RuntimeError("Test Exception") with pytest.raises( @@ -104,10 +104,10 @@ def test_read_data_exception_handling(): def test_get_schema_exception_handling(): # initial setup - engine, spark, ws, _ = initial_setup() + engine, spark, ws, scope = initial_setup() # create object for DatabricksDataSource - ddds = DatabricksDataSource(engine, spark, ws) + ddds = DatabricksDataSource(engine, spark, ws, scope) spark.sql.side_effect = RuntimeError("Test Exception") with pytest.raises(DataSourceRuntimeException) as exception: ddds.get_schema("org", "data", "employee") @@ -121,8 +121,8 @@ def test_get_schema_exception_handling(): def test_normalize_identifier(): - engine, spark, ws, _ = initial_setup() - data_source = DatabricksDataSource(engine, spark, ws) + engine, spark, ws, scope = initial_setup() + data_source = DatabricksDataSource(engine, spark, ws, scope) assert data_source.normalize_identifier("a") == NormalizedIdentifier("`a`", '`a`') assert data_source.normalize_identifier('`b`') == NormalizedIdentifier("`b`", '`b`') diff --git a/tests/unit/reconcile/connectors/test_oracle.py b/tests/unit/reconcile/connectors/test_oracle.py index 11b917c3e1..086b48f19c 100644 --- a/tests/unit/reconcile/connectors/test_oracle.py +++ b/tests/unit/reconcile/connectors/test_oracle.py @@ -4,8 +4,7 @@ import pytest -from databricks.labs.lakebridge.config import ReconcileCredentialConfig -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect from databricks.labs.lakebridge.reconcile.connectors.oracle import OracleDataSource from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException @@ -32,16 +31,6 @@ def mock_secret(scope, key): return secret_mock[scope][key] -def oracle_creds(scope): - return { - "host": f"{scope}/host", - "port": f"{scope}/port", - "database": f"{scope}/database", - "user": f"{scope}/user", - "password": f"{scope}/password", - } - - def initial_setup(): pyspark_sql_session = MagicMock() spark = pyspark_sql_session.SparkSession.builder.getOrCreate() @@ -58,9 +47,8 @@ def test_read_data_with_options(): # initial setup engine, spark, ws, scope = initial_setup() - # create object for OracleDataSource - ords = OracleDataSource(engine, spark, ws) - ords.load_credentials(ReconcileCredentialConfig("databricks", oracle_creds(scope))) + # create object for SnowflakeDataSource + ords = OracleDataSource(engine, spark, ws, scope) # Create a Tables configuration object with JDBC reader options table_conf = Table( source_name="supplier", @@ -108,10 +96,10 @@ def test_read_data_with_options(): def test_get_schema(): # initial setup - engine, spark, ws, _ = initial_setup() + engine, spark, ws, scope = initial_setup() - # create object for OracleDataSource - ords = OracleDataSource(engine, spark, ws) + # create object for SnowflakeDataSource + ords = OracleDataSource(engine, spark, ws, scope) # call test method ords.get_schema(None, "data", "employee") 
# spark assertions @@ -139,8 +127,8 @@ def test_get_schema(): def test_read_data_exception_handling(): # initial setup - engine, spark, ws, _ = initial_setup() - ords = OracleDataSource(engine, spark, ws) + engine, spark, ws, scope = initial_setup() + ords = OracleDataSource(engine, spark, ws, scope) # Create a Tables configuration object table_conf = Table( source_name="supplier", @@ -169,8 +157,8 @@ def test_read_data_exception_handling(): def test_get_schema_exception_handling(): # initial setup - engine, spark, ws, _ = initial_setup() - ords = OracleDataSource(engine, spark, ws) + engine, spark, ws, scope = initial_setup() + ords = OracleDataSource(engine, spark, ws, scope) spark.read.format().option().option().option().option().option().load.side_effect = RuntimeError("Test Exception") @@ -196,8 +184,8 @@ def test_get_schema_exception_handling(): @pytest.mark.skip("Turned off till we can handle case sensitivity.") def test_normalize_identifier(): - engine, spark, ws, _ = initial_setup() - data_source = OracleDataSource(engine, spark, ws) + engine, spark, ws, scope = initial_setup() + data_source = OracleDataSource(engine, spark, ws, scope) assert data_source.normalize_identifier("a") == NormalizedIdentifier("`a`", '"a"') assert data_source.normalize_identifier('"b"') == NormalizedIdentifier("`b`", '"b"') diff --git a/tests/unit/reconcile/connectors/test_secrets.py b/tests/unit/reconcile/connectors/test_secrets.py index c515bb9add..dea7515b09 100644 --- a/tests/unit/reconcile/connectors/test_secrets.py +++ b/tests/unit/reconcile/connectors/test_secrets.py @@ -3,16 +3,27 @@ import pytest +from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.sdk.service.workspace import GetSecretResponse -from databricks.labs.lakebridge.connections.credential_manager import DatabricksSecretProvider + +class SecretsMixinUnderTest(SecretsMixin): + def __init__(self, ws: WorkspaceClient, secret_scope: str): + self._ws = ws + self._secret_scope = secret_scope + + def get_secret(self, secret_key: str) -> str: + return self._get_secret(secret_key) + + def get_secret_or_none(self, secret_key: str) -> str | None: + return self._get_secret_or_none(secret_key) def mock_secret(scope, key): secret_mock = { - "some_scope": { + "scope": { 'user_name': GetSecretResponse( key='user_name', value=base64.b64encode(bytes('my_user', 'utf-8')).decode('utf-8') ), @@ -29,39 +40,26 @@ def test_get_secrets_happy(): ws = create_autospec(WorkspaceClient) ws.secrets.get_secret.side_effect = mock_secret - sut = DatabricksSecretProvider(ws) + sut = SecretsMixinUnderTest(ws, "scope") - assert sut.get_secret("some_scope/user_name") == "my_user" - assert sut.get_secret_or_none("some_scope/user_name") == "my_user" - assert sut.get_secret("some_scope/password") == "my_password" - assert sut.get_secret_or_none("some_scope/password") == "my_password" + assert sut.get_secret("user_name") == "my_user" + assert sut.get_secret_or_none("user_name") == "my_user" + assert sut.get_secret("password") == "my_password" + assert sut.get_secret_or_none("password") == "my_password" def test_get_secrets_not_found_exception(): ws = create_autospec(WorkspaceClient) ws.secrets.get_secret.side_effect = NotFound("Test Exception") - sut = DatabricksSecretProvider(ws) + sut = SecretsMixinUnderTest(ws, "scope") - with pytest.raises( - KeyError, match="Secret does not exist with scope: some_scope and key: unknown : Test Exception" - ): - 
sut.get_secret("some_scope/unknown") + with pytest.raises(NotFound, match="Secret does not exist with scope: scope and key: unknown : Test Exception"): + sut.get_secret("unknown") def test_get_secrets_not_found_swallow(): ws = create_autospec(WorkspaceClient) ws.secrets.get_secret.side_effect = NotFound("Test Exception") - sut = DatabricksSecretProvider(ws) - - assert sut.get_secret_or_none("some_scope/unknown") is None - - -def test_get_secrets_invalid_name(): - ws = create_autospec(WorkspaceClient) - sut = DatabricksSecretProvider(ws) - - with pytest.raises(AssertionError, match="Secret name must be in the format 'scope/secret'"): - sut.get_secret("just_key") + sut = SecretsMixinUnderTest(ws, "scope") - with pytest.raises(AssertionError, match="Secret name must be in the format 'scope/secret'"): - sut.get_secret_or_none("just_key") + assert sut.get_secret_or_none("unknown") is None diff --git a/tests/unit/reconcile/connectors/test_snowflake.py b/tests/unit/reconcile/connectors/test_snowflake.py index 566f58dd45..114aa42f2a 100644 --- a/tests/unit/reconcile/connectors/test_snowflake.py +++ b/tests/unit/reconcile/connectors/test_snowflake.py @@ -6,8 +6,7 @@ from cryptography.hazmat.primitives.asymmetric import rsa from cryptography.hazmat.primitives import serialization -from databricks.labs.lakebridge.config import ReconcileCredentialConfig -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect from databricks.labs.lakebridge.reconcile.connectors.snowflake import SnowflakeDataSource from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException, InvalidSnowflakePemPrivateKey @@ -20,6 +19,9 @@ def mock_secret(scope, key): secret_mock = { "scope": { + 'sfAccount': GetSecretResponse( + key='sfAccount', value=base64.b64encode(bytes('my_account', 'utf-8')).decode('utf-8') + ), 'sfUser': GetSecretResponse( key='sfUser', value=base64.b64encode(bytes('my_user', 'utf-8')).decode('utf-8') ), @@ -38,39 +40,13 @@ def mock_secret(scope, key): 'sfRole': GetSecretResponse( key='sfRole', value=base64.b64encode(bytes('my_role', 'utf-8')).decode('utf-8') ), - 'sfUrl': GetSecretResponse( - key='sfUrl', value=base64.b64encode(bytes('my_account.snowflakecomputing.com', 'utf-8')).decode('utf-8') - ), + 'sfUrl': GetSecretResponse(key='sfUrl', value=base64.b64encode(bytes('my_url', 'utf-8')).decode('utf-8')), } } return secret_mock[scope][key] -@pytest.fixture() -def snowflake_creds(): - def _snowflake_creds(scope, use_private_key=False, use_pem_password=False): - creds = { - 'sfUser': f'{scope}/sfUser', - 'sfDatabase': f'{scope}/sfDatabase', - 'sfSchema': f'{scope}/sfSchema', - 'sfWarehouse': f'{scope}/sfWarehouse', - 'sfRole': f'{scope}/sfRole', - 'sfUrl': f'{scope}/sfUrl', - } - - if use_private_key: - creds['pem_private_key'] = f'{scope}/pem_private_key' - if use_pem_password: - creds['pem_private_key_password'] = f'{scope}/pem_private_key_password' - else: - creds['sfPassword'] = f'{scope}/sfPassword' - - return creds - - return _snowflake_creds - - def generate_pkcs8_pem_key(malformed=False): private_key = rsa.generate_private_key(public_exponent=65537, key_size=2048) pem_key = private_key.private_bytes( @@ -115,12 +91,11 @@ def initial_setup(): return engine, spark, ws, scope -def test_get_jdbc_url_happy(snowflake_creds): +def test_get_jdbc_url_happy(): # initial setup 
engine, spark, ws, scope = initial_setup() # create object for SnowflakeDataSource - dfds = SnowflakeDataSource(engine, spark, ws) - dfds.load_credentials(ReconcileCredentialConfig("databricks", snowflake_creds(scope))) + dfds = SnowflakeDataSource(engine, spark, ws, scope) url = dfds.get_jdbc_url # Assert that the URL is generated correctly assert url == ( @@ -131,13 +106,28 @@ def test_get_jdbc_url_happy(snowflake_creds): ) -def test_read_data_with_out_options(snowflake_creds): +def test_get_jdbc_url_fail(): + # initial setup + engine, spark, ws, scope = initial_setup() + ws.secrets.get_secret.side_effect = mock_secret + # create object for SnowflakeDataSource + dfds = SnowflakeDataSource(engine, spark, ws, scope) + url = dfds.get_jdbc_url + # Assert that the URL is generated correctly + assert url == ( + "jdbc:snowflake://my_account.snowflakecomputing.com" + "/?user=my_user&password=my_password" + "&db=my_database&schema=my_schema" + "&warehouse=my_warehouse&role=my_role" + ) + + +def test_read_data_with_out_options(): # initial setup engine, spark, ws, scope = initial_setup() # create object for SnowflakeDataSource - dfds = SnowflakeDataSource(engine, spark, ws) - dfds.load_credentials(ReconcileCredentialConfig("databricks", snowflake_creds(scope))) + dfds = SnowflakeDataSource(engine, spark, ws, scope) # Create a Tables configuration object with no JDBC reader options table_conf = Table( source_name="supplier", @@ -151,7 +141,7 @@ def test_read_data_with_out_options(snowflake_creds): spark.read.format.assert_called_with("snowflake") spark.read.format().option.assert_called_with("dbtable", "(select 1 from org.data.employee) as tmp") spark.read.format().option().options.assert_called_with( - sfUrl="my_account.snowflakecomputing.com", + sfUrl="my_url", sfUser="my_user", sfPassword="my_password", sfDatabase="my_database", @@ -162,13 +152,12 @@ def test_read_data_with_out_options(snowflake_creds): spark.read.format().option().options().load.assert_called_once() -def test_read_data_with_options(snowflake_creds): +def test_read_data_with_options(): # initial setup engine, spark, ws, scope = initial_setup() # create object for SnowflakeDataSource - dfds = SnowflakeDataSource(engine, spark, ws) - dfds.load_credentials(ReconcileCredentialConfig("databricks", snowflake_creds(scope))) + dfds = SnowflakeDataSource(engine, spark, ws, scope) # Create a Tables configuration object with JDBC reader options table_conf = Table( source_name="supplier", @@ -203,13 +192,12 @@ def test_read_data_with_options(snowflake_creds): spark.read.format().option().option().option().options().load.assert_called_once() -def test_get_schema(snowflake_creds): +def test_get_schema(): # initial setup engine, spark, ws, scope = initial_setup() # Mocking get secret method to return the required values # create object for SnowflakeDataSource - dfds = SnowflakeDataSource(engine, spark, ws) - dfds.load_credentials(ReconcileCredentialConfig("databricks", snowflake_creds(scope))) + dfds = SnowflakeDataSource(engine, spark, ws, scope) # call test method dfds.get_schema("catalog", "schema", "supplier") # spark assertions @@ -227,7 +215,7 @@ def test_get_schema(snowflake_creds): ), ) spark.read.format().option().options.assert_called_with( - sfUrl="my_account.snowflakecomputing.com", + sfUrl="my_url", sfUser="my_user", sfPassword="my_password", sfDatabase="my_database", @@ -238,11 +226,10 @@ def test_get_schema(snowflake_creds): spark.read.format().option().options().load.assert_called_once() -def 
test_read_data_exception_handling(snowflake_creds): +def test_read_data_exception_handling(): # initial setup engine, spark, ws, scope = initial_setup() - dfds = SnowflakeDataSource(engine, spark, ws) - dfds.load_credentials(ReconcileCredentialConfig("databricks", snowflake_creds(scope))) + dfds = SnowflakeDataSource(engine, spark, ws, scope) # Create a Tables configuration object table_conf = Table( source_name="supplier", @@ -267,12 +254,11 @@ def test_read_data_exception_handling(snowflake_creds): dfds.read_data("org", "data", "employee", "select 1 from :tbl", table_conf.jdbc_reader_options) -def test_get_schema_exception_handling(snowflake_creds): +def test_get_schema_exception_handling(): # initial setup engine, spark, ws, scope = initial_setup() - dfds = SnowflakeDataSource(engine, spark, ws) - dfds.load_credentials(ReconcileCredentialConfig("databricks", snowflake_creds(scope))) + dfds = SnowflakeDataSource(engine, spark, ws, scope) spark.read.format().option().options().load.side_effect = RuntimeError("Test Exception") @@ -290,17 +276,16 @@ def test_get_schema_exception_handling(snowflake_creds): dfds.get_schema("catalog", "schema", "supplier") -def test_read_data_without_options_private_key(snowflake_creds): +def test_read_data_without_options_private_key(): engine, spark, ws, scope = initial_setup() ws.secrets.get_secret.side_effect = mock_private_key_secret - dfds = SnowflakeDataSource(engine, spark, ws) - dfds.load_credentials(ReconcileCredentialConfig("databricks", snowflake_creds(scope, use_private_key=True))) + dfds = SnowflakeDataSource(engine, spark, ws, scope) table_conf = Table(source_name="supplier", target_name="supplier") dfds.read_data("org", "data", "employee", "select 1 from :tbl", table_conf.jdbc_reader_options) spark.read.format.assert_called_with("snowflake") spark.read.format().option.assert_called_with("dbtable", "(select 1 from org.data.employee) as tmp") expected_options = { - "sfUrl": "my_account.snowflakecomputing.com", + "sfUrl": "my_url", "sfUser": "my_user", "sfDatabase": "my_database", "sfSchema": "my_schema", @@ -313,30 +298,30 @@ def test_read_data_without_options_private_key(snowflake_creds): spark.read.format().option().options().load.assert_called_once() -def test_read_data_without_options_malformed_private_key(snowflake_creds): +def test_read_data_without_options_malformed_private_key(): engine, spark, ws, scope = initial_setup() ws.secrets.get_secret.side_effect = mock_malformed_private_key_secret - dfds = SnowflakeDataSource(engine, spark, ws) - + dfds = SnowflakeDataSource(engine, spark, ws, scope) + table_conf = Table(source_name="supplier", target_name="supplier") with pytest.raises(InvalidSnowflakePemPrivateKey, match="Failed to load or process the provided PEM private key."): - dfds.load_credentials(ReconcileCredentialConfig("databricks", snowflake_creds(scope, use_private_key=True))) + dfds.read_data("org", "data", "employee", "select 1 from :tbl", table_conf.jdbc_reader_options) -def test_read_data_without_any_auth(snowflake_creds): +def test_read_data_without_any_auth(): engine, spark, ws, scope = initial_setup() ws.secrets.get_secret.side_effect = mock_no_auth_key_secret - dfds = SnowflakeDataSource(engine, spark, ws) - creds = snowflake_creds(scope) - creds.pop('sfPassword') - - with pytest.raises(AssertionError, match='Missing Snowflake credentials. 
Please configure any of .*'): - dfds.load_credentials(ReconcileCredentialConfig("databricks", creds)) + dfds = SnowflakeDataSource(engine, spark, ws, scope) + table_conf = Table(source_name="supplier", target_name="supplier") + with pytest.raises( + NotFound, match='sfPassword and pem_private_key not found. Either one is required for snowflake auth.' + ): + dfds.read_data("org", "data", "employee", "select 1 from :tbl", table_conf.jdbc_reader_options) @pytest.mark.skip("Turned off till we can handle case sensitivity.") def test_normalize_identifier(): - engine, spark, ws, _ = initial_setup() - data_source = SnowflakeDataSource(engine, spark, ws) + engine, spark, ws, scope = initial_setup() + data_source = SnowflakeDataSource(engine, spark, ws, scope) assert data_source.normalize_identifier("a") == NormalizedIdentifier("`a`", '"a"') assert data_source.normalize_identifier('"b"') == NormalizedIdentifier("`b`", '"b"') diff --git a/tests/unit/reconcile/connectors/test_sql_server.py b/tests/unit/reconcile/connectors/test_sql_server.py index 32c81d3bd7..fa6ad90415 100644 --- a/tests/unit/reconcile/connectors/test_sql_server.py +++ b/tests/unit/reconcile/connectors/test_sql_server.py @@ -4,8 +4,7 @@ import pytest -from databricks.labs.lakebridge.config import ReconcileCredentialConfig -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect from databricks.labs.lakebridge.reconcile.connectors.tsql import TSQLServerDataSource from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException @@ -36,18 +35,6 @@ def mock_secret(scope, key): return scope_secret_mock[scope][key] -def mssql_creds(scope): - return { - "host": f"{scope}/host", - "port": f"{scope}/port", - "database": f"{scope}/database", - "user": f"{scope}/user", - "password": f"{scope}/password", - "encrypt": f"{scope}/encrypt", - "trustServerCertificate": f"{scope}/trustServerCertificate", - } - - def initial_setup(): pyspark_sql_session = MagicMock() spark = pyspark_sql_session.SparkSession.builder.getOrCreate() @@ -64,8 +51,20 @@ def test_get_jdbc_url_happy(): # initial setup engine, spark, ws, scope = initial_setup() # create object for TSQLServerDataSource - data_source = TSQLServerDataSource(engine, spark, ws) - data_source.load_credentials(ReconcileCredentialConfig("databricks", mssql_creds(scope))) + data_source = TSQLServerDataSource(engine, spark, ws, scope) + url = data_source.get_jdbc_url + # Assert that the URL is generated correctly + assert url == ( + """jdbc:sqlserver://my_host:777;databaseName=my_database;user=my_user;password=my_password;encrypt=true;trustServerCertificate=true;""" + ) + + +def test_get_jdbc_url_fail(): + # initial setup + engine, spark, ws, scope = initial_setup() + ws.secrets.get_secret.side_effect = mock_secret + # create object for TSQLServerDataSource + data_source = TSQLServerDataSource(engine, spark, ws, scope) url = data_source.get_jdbc_url # Assert that the URL is generated correctly assert url == ( @@ -78,8 +77,7 @@ def test_read_data_with_options(): engine, spark, ws, scope = initial_setup() # create object for MSSQLServerDataSource - data_source = TSQLServerDataSource(engine, spark, ws) - data_source.load_credentials(ReconcileCredentialConfig("databricks", mssql_creds(scope))) + data_source = TSQLServerDataSource(engine, spark, ws, scope) # Create a Tables 
configuration object with JDBC reader options table_conf = Table( source_name="src_supplier", @@ -118,9 +116,9 @@ def test_read_data_with_options(): def test_get_schema(): # initial setup - engine, spark, ws, _ = initial_setup() + engine, spark, ws, scope = initial_setup() # Mocking get secret method to return the required values - data_source = TSQLServerDataSource(engine, spark, ws) + data_source = TSQLServerDataSource(engine, spark, ws, scope) # call test method data_source.get_schema("org", "schema", "supplier") # spark assertions @@ -165,8 +163,8 @@ def test_get_schema(): def test_get_schema_exception_handling(): # initial setup - engine, spark, ws, _ = initial_setup() - data_source = TSQLServerDataSource(engine, spark, ws) + engine, spark, ws, scope = initial_setup() + data_source = TSQLServerDataSource(engine, spark, ws, scope) spark.read.format().option().option().option().option().load.side_effect = RuntimeError("Test Exception") @@ -182,8 +180,8 @@ def test_get_schema_exception_handling(): def test_normalize_identifier(): - engine, spark, ws, _ = initial_setup() - data_source = TSQLServerDataSource(engine, spark, ws) + engine, spark, ws, scope = initial_setup() + data_source = TSQLServerDataSource(engine, spark, ws, scope) assert data_source.normalize_identifier("a") == NormalizedIdentifier("`a`", "[a]") assert data_source.normalize_identifier('"b"') == NormalizedIdentifier("`b`", "[b]") diff --git a/tests/unit/reconcile/test_source_adapter.py b/tests/unit/reconcile/test_source_adapter.py index 68b093e2da..5a9cc4032d 100644 --- a/tests/unit/reconcile/test_source_adapter.py +++ b/tests/unit/reconcile/test_source_adapter.py @@ -15,9 +15,10 @@ def test_create_adapter_for_snowflake_dialect(): spark = create_autospec(DatabricksSession) engine = get_dialect("snowflake") ws = create_autospec(WorkspaceClient) + scope = "scope" - data_source = create_adapter(engine, spark, ws) - snowflake_data_source = SnowflakeDataSource(engine, spark, ws).__class__ + data_source = create_adapter(engine, spark, ws, scope) + snowflake_data_source = SnowflakeDataSource(engine, spark, ws, scope).__class__ assert isinstance(data_source, snowflake_data_source) @@ -26,9 +27,10 @@ def test_create_adapter_for_oracle_dialect(): spark = create_autospec(DatabricksSession) engine = get_dialect("oracle") ws = create_autospec(WorkspaceClient) + scope = "scope" - data_source = create_adapter(engine, spark, ws) - oracle_data_source = OracleDataSource(engine, spark, ws).__class__ + data_source = create_adapter(engine, spark, ws, scope) + oracle_data_source = OracleDataSource(engine, spark, ws, scope).__class__ assert isinstance(data_source, oracle_data_source) @@ -37,9 +39,10 @@ def test_create_adapter_for_databricks_dialect(): spark = create_autospec(DatabricksSession) engine = get_dialect("databricks") ws = create_autospec(WorkspaceClient) + scope = "scope" - data_source = create_adapter(engine, spark, ws) - databricks_data_source = DatabricksDataSource(engine, spark, ws).__class__ + data_source = create_adapter(engine, spark, ws, scope) + databricks_data_source = DatabricksDataSource(engine, spark, ws, scope).__class__ assert isinstance(data_source, databricks_data_source) @@ -48,6 +51,7 @@ def test_raise_exception_for_unknown_dialect(): spark = create_autospec(DatabricksSession) engine = get_dialect("trino") ws = create_autospec(WorkspaceClient) + scope = "scope" with pytest.raises(ValueError, match=f"Unsupported source type --> {engine}"): - create_adapter(engine, spark, ws) + create_adapter(engine, spark, ws, 
scope) From 79b072195aec17c58a1c03d6a9f203cfd8d7f827 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Fri, 21 Nov 2025 10:33:33 +0100 Subject: [PATCH 09/44] revert two more changes --- tests/conftest.py | 7 +- .../connections/test_credential_manager.py | 90 +++++++++++++++++++ 2 files changed, 92 insertions(+), 5 deletions(-) create mode 100644 tests/unit/connections/test_credential_manager.py diff --git a/tests/conftest.py b/tests/conftest.py index 86cdf726e0..2b2419231e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,8 +17,8 @@ from databricks.sdk import WorkspaceClient from databricks.sdk.service import iam -from databricks.labs.lakebridge.config import ReconcileCredentialConfig -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils +from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource, MockDataSource from databricks.labs.lakebridge.reconcile.recon_config import ( Table, @@ -344,9 +344,6 @@ def read_data( ) -> DataFrame: raise RuntimeError("Not implemented") - def load_credentials(self, creds: ReconcileCredentialConfig) -> "FakeDataSource": - raise RuntimeError("Not implemented") - @pytest.fixture def fake_oracle_datasource() -> FakeDataSource: diff --git a/tests/unit/connections/test_credential_manager.py b/tests/unit/connections/test_credential_manager.py new file mode 100644 index 0000000000..32ed07cfc0 --- /dev/null +++ b/tests/unit/connections/test_credential_manager.py @@ -0,0 +1,90 @@ +import pytest +from unittest.mock import patch, MagicMock +from pathlib import Path +from databricks.labs.lakebridge.connections.credential_manager import create_credential_manager +from databricks.labs.lakebridge.connections.env_getter import EnvGetter +import os + +product_name = "remorph" + + +@pytest.fixture +def env_getter(): + return MagicMock(spec=EnvGetter) + + +@pytest.fixture +def local_credentials(): + return { + 'secret_vault_type': 'local', + 'mssql': { + 'database': 'DB_NAME', + 'driver': 'ODBC Driver 18 for SQL Server', + 'server': 'example_host', + 'user': 'local_user', + 'password': 'local_password', + }, + } + + +@pytest.fixture +def env_credentials(): + return { + 'secret_vault_type': 'env', + 'mssql': { + 'database': 'DB_NAME', + 'driver': 'ODBC Driver 18 for SQL Server', + 'server': 'example_host', + 'user': 'MSSQL_USER_ENV', + 'password': 'MSSQL_PASSWORD_ENV', + }, + } + + +@pytest.fixture +def databricks_credentials(): + return { + 'secret_vault_type': 'databricks', + 'secret_vault_name': 'databricks_vault_name', + 'mssql': { + 'database': 'DB_NAME', + 'driver': 'ODBC Driver 18 for SQL Server', + 'server': 'example_host', + 'user': 'databricks_user', + 'password': 'databricks_password', + }, + } + + +@patch('databricks.labs.lakebridge.connections.credential_manager._load_credentials') +@patch('databricks.labs.lakebridge.connections.credential_manager._get_home') +def test_local_credentials(mock_get_home, mock_load_credentials, local_credentials, env_getter): + mock_load_credentials.return_value = local_credentials + mock_get_home.return_value = Path("/fake/home") + credentials = create_credential_manager(product_name, env_getter) + creds = credentials.get_credentials('mssql') + assert creds['user'] == 'local_user' + assert creds['password'] == 'local_password' + + 
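The three fixtures above cover the supported `secret_vault_type` values. For orientation, a minimal standalone sketch of the resolution behaviour these tests exercise (illustrative helper names, not the patched module): with the `local` vault a configured value is returned verbatim, while the `env` vault treats each configured value as the name of an environment variable.

import os


def _resolve_local(value: str) -> str:
    # 'local' vault: the configured value already is the secret.
    return value


def _resolve_env(value: str) -> str:
    # 'env' vault: the configured value names an environment variable.
    return os.environ[value]


def resolve_credentials(credentials: dict, source: str) -> dict:
    # Pick a resolver from secret_vault_type and map it over the source's values.
    vault = credentials.get("secret_vault_type", "local").lower()
    resolver = {"local": _resolve_local, "env": _resolve_env}[vault]
    return {key: resolver(value) for key, value in credentials[source].items()}


# Usage mirroring the env fixture above (values are placeholders).
os.environ["MSSQL_USER_ENV"] = "env_user"
example = {"secret_vault_type": "env", "mssql": {"user": "MSSQL_USER_ENV"}}
assert resolve_credentials(example, "mssql")["user"] == "env_user"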
+@patch('databricks.labs.lakebridge.connections.credential_manager._load_credentials') +@patch('databricks.labs.lakebridge.connections.credential_manager._get_home') +@patch.dict('os.environ', {'MSSQL_USER_ENV': 'env_user', 'MSSQL_PASSWORD_ENV': 'env_password'}) +def test_env_credentials(mock_get_home, mock_load_credentials, env_credentials, env_getter): + mock_load_credentials.return_value = env_credentials + mock_get_home.return_value = Path("/fake/home") + env_getter.get.side_effect = lambda key: os.environ[key] + credentials = create_credential_manager(product_name, env_getter) + creds = credentials.get_credentials('mssql') + assert creds['user'] == 'env_user' + assert creds['password'] == 'env_password' + + +@patch('databricks.labs.lakebridge.connections.credential_manager._load_credentials') +@patch('databricks.labs.lakebridge.connections.credential_manager._get_home') +def test_databricks_credentials(mock_get_home, mock_load_credentials, databricks_credentials, env_getter): + mock_load_credentials.return_value = databricks_credentials + mock_get_home.return_value = Path("/fake/home") + credentials = create_credential_manager(product_name, env_getter) + with pytest.raises(NotImplementedError): + credentials.get_credentials('mssql') From c284f5038960f65d1a6cb4ec92844ec78268ff11 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Fri, 21 Nov 2025 11:38:34 +0100 Subject: [PATCH 10/44] implement `test_databricks_credentials` --- .../connections/test_credential_manager.py | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/tests/unit/connections/test_credential_manager.py b/tests/unit/connections/test_credential_manager.py index 32ed07cfc0..6d9c29dba0 100644 --- a/tests/unit/connections/test_credential_manager.py +++ b/tests/unit/connections/test_credential_manager.py @@ -1,9 +1,13 @@ -import pytest -from unittest.mock import patch, MagicMock +import os +import base64 from pathlib import Path +from unittest.mock import patch, MagicMock + +import pytest + from databricks.labs.lakebridge.connections.credential_manager import create_credential_manager from databricks.labs.lakebridge.connections.env_getter import EnvGetter -import os +from databricks.sdk.service.workspace import GetSecretResponse product_name = "remorph" @@ -45,13 +49,12 @@ def env_credentials(): def databricks_credentials(): return { 'secret_vault_type': 'databricks', - 'secret_vault_name': 'databricks_vault_name', 'mssql': { - 'database': 'DB_NAME', - 'driver': 'ODBC Driver 18 for SQL Server', - 'server': 'example_host', - 'user': 'databricks_user', - 'password': 'databricks_password', + 'database': 'databricks_vault_name/DB_NAME', + 'driver': 'databricks_vault_name/ODBC Driver 18 for SQL Server', + 'server': 'databricks_vault_name/example_host', + 'user': 'databricks_vault_name/databricks_user', + 'password': 'databricks_vault_name/databricks_password', }, } @@ -72,19 +75,20 @@ def test_local_credentials(mock_get_home, mock_load_credentials, local_credentia @patch.dict('os.environ', {'MSSQL_USER_ENV': 'env_user', 'MSSQL_PASSWORD_ENV': 'env_password'}) def test_env_credentials(mock_get_home, mock_load_credentials, env_credentials, env_getter): mock_load_credentials.return_value = env_credentials - mock_get_home.return_value = Path("/fake/home") env_getter.get.side_effect = lambda key: os.environ[key] - credentials = create_credential_manager(product_name, env_getter) + credentials = create_credential_manager(Path("/fake/home")) creds = credentials.get_credentials('mssql') assert creds['user'] 
== 'env_user' assert creds['password'] == 'env_password' @patch('databricks.labs.lakebridge.connections.credential_manager._load_credentials') -@patch('databricks.labs.lakebridge.connections.credential_manager._get_home') -def test_databricks_credentials(mock_get_home, mock_load_credentials, databricks_credentials, env_getter): +def test_databricks_credentials(mock_load_credentials, databricks_credentials, env_getter, mock_workspace_client): + mock_workspace_client.secrets.get_secret.return_value = GetSecretResponse( + key='some_key', value=base64.b64encode(bytes('some_secret', 'utf-8')).decode('utf-8') + ) mock_load_credentials.return_value = databricks_credentials - mock_get_home.return_value = Path("/fake/home") - credentials = create_credential_manager(product_name, env_getter) - with pytest.raises(NotImplementedError): - credentials.get_credentials('mssql') + credentials = create_credential_manager(Path("/fake/home"), mock_workspace_client) + creds = credentials.get_credentials('mssql') + assert creds['user'] == 'some_secret' + assert creds['password'] == 'some_secret' From 287c6540783b293e43756ee884cee01d7da7a6b7 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Fri, 21 Nov 2025 16:39:29 +0100 Subject: [PATCH 11/44] do not patch in specs and do not accept str on the factory method --- .../connections/credential_manager.py | 4 +- .../synapse/dedicated_sqlpool_extract.py | 4 +- .../synapse/monitoring_metrics_extract.py | 4 +- .../synapse/serverless_sqlpool_extract.py | 4 +- .../assessments/synapse/workspace_extract.py | 4 +- .../connections/test_credential_manager.py | 41 +++++-------------- 6 files changed, 24 insertions(+), 37 deletions(-) diff --git a/src/databricks/labs/lakebridge/connections/credential_manager.py b/src/databricks/labs/lakebridge/connections/credential_manager.py index 204c85bef1..e7d690d8c9 100644 --- a/src/databricks/labs/lakebridge/connections/credential_manager.py +++ b/src/databricks/labs/lakebridge/connections/credential_manager.py @@ -127,9 +127,7 @@ def create_databricks_secret_provider(ws) -> DatabricksSecretProvider: return DatabricksSecretProvider(ws) -def create_credential_manager(creds_or_path: dict | Path | str, ws: WorkspaceClient | None = None) -> CredentialManager: - if isinstance(creds_or_path, str): - creds_or_path = Path(creds_or_path) +def create_credential_manager(creds_or_path: dict | Path, ws: WorkspaceClient | None = None) -> CredentialManager: if isinstance(creds_or_path, Path): creds = _load_credentials(creds_or_path) else: diff --git a/src/databricks/labs/lakebridge/resources/assessments/synapse/dedicated_sqlpool_extract.py b/src/databricks/labs/lakebridge/resources/assessments/synapse/dedicated_sqlpool_extract.py index 9418c0dc5f..23c04136f9 100644 --- a/src/databricks/labs/lakebridge/resources/assessments/synapse/dedicated_sqlpool_extract.py +++ b/src/databricks/labs/lakebridge/resources/assessments/synapse/dedicated_sqlpool_extract.py @@ -1,5 +1,7 @@ import json import sys +from pathlib import Path + from databricks.labs.lakebridge.resources.assessments.synapse.common.functions import ( arguments_loader, create_synapse_artifacts_client, @@ -22,7 +24,7 @@ def execute(): db_path, creds_file = arguments_loader(desc="Synapse Synapse Dedicated SQL Pool Extract Script") - cred_manager = create_credential_manager(creds_file) + cred_manager = create_credential_manager(Path(creds_file)) synapse_workspace_settings = cred_manager.get_credentials("synapse") config = synapse_workspace_settings["workspace"] auth_type = 
synapse_workspace_settings["jdbc"].get("auth_type", "sql_authentication") diff --git a/src/databricks/labs/lakebridge/resources/assessments/synapse/monitoring_metrics_extract.py b/src/databricks/labs/lakebridge/resources/assessments/synapse/monitoring_metrics_extract.py index a4f29a3a0d..2077a98dd2 100644 --- a/src/databricks/labs/lakebridge/resources/assessments/synapse/monitoring_metrics_extract.py +++ b/src/databricks/labs/lakebridge/resources/assessments/synapse/monitoring_metrics_extract.py @@ -1,5 +1,7 @@ import json import sys +from pathlib import Path + import urllib3 import zoneinfo import pandas as pd @@ -22,7 +24,7 @@ def execute(): logger = set_logger(__name__) db_path, creds_file = arguments_loader(desc="Monitoring Metrics Extract Script") - cred_manager = create_credential_manager(creds_file) + cred_manager = create_credential_manager(Path(creds_file)) synapse_workspace_settings = cred_manager.get_credentials("synapse") synapse_profiler_settings = synapse_workspace_settings["profiler"] diff --git a/src/databricks/labs/lakebridge/resources/assessments/synapse/serverless_sqlpool_extract.py b/src/databricks/labs/lakebridge/resources/assessments/synapse/serverless_sqlpool_extract.py index 81421ae5a9..dca95d1954 100644 --- a/src/databricks/labs/lakebridge/resources/assessments/synapse/serverless_sqlpool_extract.py +++ b/src/databricks/labs/lakebridge/resources/assessments/synapse/serverless_sqlpool_extract.py @@ -1,5 +1,7 @@ import json import sys +from pathlib import Path + import duckdb from databricks.labs.lakebridge.connections.credential_manager import create_credential_manager @@ -53,7 +55,7 @@ def execute(): logger = set_logger(__name__) db_path, creds_file = arguments_loader(desc="Synapse Synapse Serverless SQL Pool Extract Script") - cred_manager = create_credential_manager(creds_file) + cred_manager = create_credential_manager(Path(creds_file)) synapse_workspace_settings = cred_manager.get_credentials("synapse") config = synapse_workspace_settings["workspace"] auth_type = synapse_workspace_settings["jdbc"].get("auth_type", "sql_authentication") diff --git a/src/databricks/labs/lakebridge/resources/assessments/synapse/workspace_extract.py b/src/databricks/labs/lakebridge/resources/assessments/synapse/workspace_extract.py index 25d15745ce..0c9087c9d8 100644 --- a/src/databricks/labs/lakebridge/resources/assessments/synapse/workspace_extract.py +++ b/src/databricks/labs/lakebridge/resources/assessments/synapse/workspace_extract.py @@ -2,6 +2,8 @@ import sys from datetime import date, timedelta import zoneinfo +from pathlib import Path + import pandas as pd from databricks.labs.lakebridge.connections.credential_manager import create_credential_manager @@ -20,7 +22,7 @@ def execute(): db_path, creds_file = arguments_loader(desc="Workspace Extract") - cred_manager = create_credential_manager(creds_file) + cred_manager = create_credential_manager(Path(creds_file)) synapse_workspace_settings = cred_manager.get_credentials("synapse") tz_info = synapse_workspace_settings["workspace"]["tz_info"] workspace_tz = zoneinfo.ZoneInfo(tz_info) diff --git a/tests/unit/connections/test_credential_manager.py b/tests/unit/connections/test_credential_manager.py index 6d9c29dba0..5227f57302 100644 --- a/tests/unit/connections/test_credential_manager.py +++ b/tests/unit/connections/test_credential_manager.py @@ -1,22 +1,14 @@ -import os import base64 -from pathlib import Path -from unittest.mock import patch, MagicMock +from unittest.mock import patch import pytest from 
databricks.labs.lakebridge.connections.credential_manager import create_credential_manager -from databricks.labs.lakebridge.connections.env_getter import EnvGetter from databricks.sdk.service.workspace import GetSecretResponse product_name = "remorph" -@pytest.fixture -def env_getter(): - return MagicMock(spec=EnvGetter) - - @pytest.fixture def local_credentials(): return { @@ -50,45 +42,34 @@ def databricks_credentials(): return { 'secret_vault_type': 'databricks', 'mssql': { - 'database': 'databricks_vault_name/DB_NAME', - 'driver': 'databricks_vault_name/ODBC Driver 18 for SQL Server', - 'server': 'databricks_vault_name/example_host', - 'user': 'databricks_vault_name/databricks_user', - 'password': 'databricks_vault_name/databricks_password', + 'database': 'databricks_vault_name/db_key', + 'server': 'databricks_vault_name/host_key', + 'user': 'databricks_vault_name/user_key', + 'password': 'databricks_vault_name/pass_key', }, } -@patch('databricks.labs.lakebridge.connections.credential_manager._load_credentials') -@patch('databricks.labs.lakebridge.connections.credential_manager._get_home') -def test_local_credentials(mock_get_home, mock_load_credentials, local_credentials, env_getter): - mock_load_credentials.return_value = local_credentials - mock_get_home.return_value = Path("/fake/home") - credentials = create_credential_manager(product_name, env_getter) +def test_local_credentials(local_credentials): + credentials = create_credential_manager(local_credentials) creds = credentials.get_credentials('mssql') assert creds['user'] == 'local_user' assert creds['password'] == 'local_password' -@patch('databricks.labs.lakebridge.connections.credential_manager._load_credentials') -@patch('databricks.labs.lakebridge.connections.credential_manager._get_home') @patch.dict('os.environ', {'MSSQL_USER_ENV': 'env_user', 'MSSQL_PASSWORD_ENV': 'env_password'}) -def test_env_credentials(mock_get_home, mock_load_credentials, env_credentials, env_getter): - mock_load_credentials.return_value = env_credentials - env_getter.get.side_effect = lambda key: os.environ[key] - credentials = create_credential_manager(Path("/fake/home")) +def test_env_credentials(env_credentials): + credentials = create_credential_manager(env_credentials) creds = credentials.get_credentials('mssql') assert creds['user'] == 'env_user' assert creds['password'] == 'env_password' -@patch('databricks.labs.lakebridge.connections.credential_manager._load_credentials') -def test_databricks_credentials(mock_load_credentials, databricks_credentials, env_getter, mock_workspace_client): +def test_databricks_credentials(databricks_credentials, mock_workspace_client): mock_workspace_client.secrets.get_secret.return_value = GetSecretResponse( key='some_key', value=base64.b64encode(bytes('some_secret', 'utf-8')).decode('utf-8') ) - mock_load_credentials.return_value = databricks_credentials - credentials = create_credential_manager(Path("/fake/home"), mock_workspace_client) + credentials = create_credential_manager(databricks_credentials, mock_workspace_client) creds = credentials.get_credentials('mssql') assert creds['user'] == 'some_secret' assert creds['password'] == 'some_secret' From 53422f41d290bb7e3ce89df5ccbfc9b5c23dc03c Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Mon, 24 Nov 2025 13:42:22 +0100 Subject: [PATCH 12/44] add test to increase coverage --- tests/unit/connections/test_credential_manager.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/unit/connections/test_credential_manager.py 
b/tests/unit/connections/test_credential_manager.py index 5227f57302..07e62a7b0f 100644 --- a/tests/unit/connections/test_credential_manager.py +++ b/tests/unit/connections/test_credential_manager.py @@ -4,8 +4,10 @@ import pytest from databricks.labs.lakebridge.connections.credential_manager import create_credential_manager +from databricks.sdk.errors import NotFound from databricks.sdk.service.workspace import GetSecretResponse + product_name = "remorph" @@ -73,3 +75,11 @@ def test_databricks_credentials(databricks_credentials, mock_workspace_client): creds = credentials.get_credentials('mssql') assert creds['user'] == 'some_secret' assert creds['password'] == 'some_secret' + + +def test_databricks_credentials_not_found(databricks_credentials, mock_workspace_client): + mock_workspace_client.secrets.get_secret.side_effect = NotFound("Test Exception") + credentials = create_credential_manager(databricks_credentials, mock_workspace_client) + + with pytest.raises(KeyError, match="Source system: unknown credentials not found"): + credentials.get_credentials("unknown") From 88b5c14dbc2172191e527d624a2a8cabef47da8b Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Mon, 24 Nov 2025 13:50:20 +0100 Subject: [PATCH 13/44] remove unused addition --- .../labs/lakebridge/connections/credential_manager.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/databricks/labs/lakebridge/connections/credential_manager.py b/src/databricks/labs/lakebridge/connections/credential_manager.py index e7d690d8c9..2f29b35bd1 100644 --- a/src/databricks/labs/lakebridge/connections/credential_manager.py +++ b/src/databricks/labs/lakebridge/connections/credential_manager.py @@ -20,13 +20,6 @@ class SecretProvider(Protocol): def get_secret(self, key: str) -> str: pass - def get_secret_or_none(self, key: str) -> str | None: - try: - return self.get_secret(key) - except KeyError: - return None - - class LocalSecretProvider(SecretProvider): def get_secret(self, key: str) -> str: return key From 4957affa0c376d7c3ecf31c1a3691fffd76c1bbc Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Mon, 24 Nov 2025 14:55:59 +0100 Subject: [PATCH 14/44] Bump reconcile config to v2 and only use creds with a v1_migrate helper implemented --- src/databricks/labs/lakebridge/config.py | 24 +++++-------- .../connections/credential_manager.py | 1 + .../labs/lakebridge/deployment/recon.py | 2 +- src/databricks/labs/lakebridge/install.py | 3 +- .../reconcile/trigger_recon_service.py | 2 +- .../reconcile/query_builder/test_execute.py | 13 +++---- .../reconcile/test_oracle_reconcile.py | 9 +++-- tests/unit/deployment/test_installation.py | 7 ++-- tests/unit/deployment/test_job.py | 5 +-- tests/unit/deployment/test_recon.py | 5 +-- tests/unit/test_install.py | 34 +++++++++++++------ 11 files changed, 61 insertions(+), 44 deletions(-) diff --git a/src/databricks/labs/lakebridge/config.py b/src/databricks/labs/lakebridge/config.py index 695901762e..40a225ddc4 100644 --- a/src/databricks/labs/lakebridge/config.py +++ b/src/databricks/labs/lakebridge/config.py @@ -261,35 +261,29 @@ class ReconcileMetadataConfig: @dataclass class ReconcileCredentialConfig: - vault_type: str + vault_type: str # supports local, env, databricks creds.
source_creds: dict[str, str] @dataclass class ReconcileConfig: __file__ = "reconcile.yml" - __version__ = 1 + __version__ = 2 data_source: str report_type: str - secret_scope: str # TODO deprecate in favor of creds_or_secret_scope or remove altogether + creds: ReconcileCredentialConfig database_config: DatabaseConfig metadata_config: ReconcileMetadataConfig job_id: str | None = None tables: ReconcileTablesConfig | None = None - creds_or_secret_scope: ReconcileCredentialConfig | str | None = None - # supports local, env, databricks creds or direct secret scope string (old behavior) - # TODO make not optional - @property - def creds(self): - if self.creds_or_secret_scope is not None and isinstance(self.creds_or_secret_scope, ReconcileCredentialConfig): - return self.creds_or_secret_scope - - return ReconcileCredentialConfig( - vault_type="databricks", - source_creds={"__secret_scope": self.creds_or_secret_scope or self.secret_scope}, - ) + @classmethod + def v1_migrate(cls, raw: dict[str, Any]) -> dict[str, Any]: + secret_scope = raw.pop("secret_scope") + raw["version"] = 2 + raw["creds"] = {"vault_type": "local", "source_creds": {"__secret_scope": secret_scope}} + return raw @dataclass diff --git a/src/databricks/labs/lakebridge/connections/credential_manager.py b/src/databricks/labs/lakebridge/connections/credential_manager.py index 2f29b35bd1..e1994f59d1 100644 --- a/src/databricks/labs/lakebridge/connections/credential_manager.py +++ b/src/databricks/labs/lakebridge/connections/credential_manager.py @@ -20,6 +20,7 @@ class SecretProvider(Protocol): def get_secret(self, key: str) -> str: pass + class LocalSecretProvider(SecretProvider): def get_secret(self, key: str) -> str: return key diff --git a/src/databricks/labs/lakebridge/deployment/recon.py b/src/databricks/labs/lakebridge/deployment/recon.py index 98235eb677..093823ef58 100644 --- a/src/databricks/labs/lakebridge/deployment/recon.py +++ b/src/databricks/labs/lakebridge/deployment/recon.py @@ -61,7 +61,7 @@ def uninstall(self, recon_config: ReconcileConfig | None): f"from catalog `{recon_config.metadata_config.catalog}`. Please remove it and the tables inside manually." ) logging.info( - f"Won't remove configured reconcile secret scope `{recon_config.secret_scope}`. " + f"Won't remove configured reconcile creds from `{recon_config.creds.vault_type}`. " f"Please remove it manually." 
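The `v1_migrate` hook above is the upgrade path for configs written by earlier installs: a v1 `reconcile.yml` that only carries `secret_scope` is rewritten into the v2 shape with an explicit `creds` block. A rough sketch of the transformation it performs (sample values invented):

# A v1-style payload as an older installer would have persisted it (sample values).
raw_v1 = {
    "data_source": "snowflake",
    "report_type": "all",
    "secret_scope": "remorph_snowflake",
    "version": 1,
}

# Equivalent of ReconcileConfig.v1_migrate(raw_v1): drop secret_scope, bump the
# version, and wrap the scope in a creds block as the helper above does.
migrated = dict(raw_v1)
scope = migrated.pop("secret_scope")
migrated["version"] = 2
migrated["creds"] = {"vault_type": "local", "source_creds": {"__secret_scope": scope}}

assert migrated["creds"]["source_creds"]["__secret_scope"] == "remorph_snowflake"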
) diff --git a/src/databricks/labs/lakebridge/install.py b/src/databricks/labs/lakebridge/install.py index 9957d6426d..c8841c177d 100644 --- a/src/databricks/labs/lakebridge/install.py +++ b/src/databricks/labs/lakebridge/install.py @@ -20,6 +20,7 @@ LakebridgeConfiguration, ReconcileMetadataConfig, TranspileConfig, + ReconcileCredentialConfig, ) from databricks.labs.lakebridge.contexts.application import ApplicationContext from databricks.labs.lakebridge.deployment.configurator import ResourceConfigurator @@ -336,7 +337,7 @@ def _prompt_for_new_reconcile_installation(self) -> ReconcileConfig: return ReconcileConfig( data_source=data_source, report_type=report_type, - secret_scope=scope_name, + creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": scope_name}), database_config=db_config, metadata_config=metadata_config, ) diff --git a/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py b/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py index 3fd837d668..ffa5184319 100644 --- a/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py +++ b/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py @@ -74,7 +74,7 @@ def create_recon_dependencies( engine=reconcile_config.data_source, spark=spark, ws=ws_client, - secret_scope=reconcile_config.secret_scope, + secret_scope=reconcile_config.creds.source_creds["__secret_scope"], ) recon_id = str(uuid4()) diff --git a/tests/integration/reconcile/query_builder/test_execute.py b/tests/integration/reconcile/query_builder/test_execute.py index 0015ff9dbb..e61efe750c 100644 --- a/tests/integration/reconcile/query_builder/test_execute.py +++ b/tests/integration/reconcile/query_builder/test_execute.py @@ -13,6 +13,7 @@ TableRecon, ReconcileMetadataConfig, ReconcileConfig, + ReconcileCredentialConfig, ) from databricks.labs.lakebridge.reconcile.reconciliation import Reconciliation from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService @@ -735,7 +736,7 @@ def mock_for_report_type_data( reconcile_config_data = ReconcileConfig( data_source="databricks", report_type="data", - secret_scope="remorph_databricks", + creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, @@ -936,7 +937,7 @@ def mock_for_report_type_schema( reconcile_config_schema = ReconcileConfig( data_source="databricks", report_type="schema", - secret_scope="remorph_databricks", + creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, @@ -1152,7 +1153,7 @@ def mock_for_report_type_all( reconcile_config_all = ReconcileConfig( data_source="snowflake", report_type="all", - secret_scope="remorph_snowflake", + creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, @@ -1431,7 +1432,7 @@ def mock_for_report_type_row( reconcile_config_row = ReconcileConfig( data_source="snowflake", report_type="row", - secret_scope="remorph_snowflake", + creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, @@ -1581,7 +1582,7 @@ def mock_for_recon_exception(normalized_table_conf_with_opts, setup_metadata_tab reconcile_config_exception = ReconcileConfig( data_source="snowflake", 
report_type="all", - secret_scope="remorph_snowflake", + creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, @@ -2041,7 +2042,7 @@ def test_recon_output_without_exception(mock_gen_final_recon_output): reconcile_config = ReconcileConfig( data_source="snowflake", report_type="all", - secret_scope="remorph_snowflake", + creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, diff --git a/tests/integration/reconcile/test_oracle_reconcile.py b/tests/integration/reconcile/test_oracle_reconcile.py index 841b130168..5393eb3ae7 100644 --- a/tests/integration/reconcile/test_oracle_reconcile.py +++ b/tests/integration/reconcile/test_oracle_reconcile.py @@ -4,7 +4,12 @@ from pyspark.sql import DataFrame from databricks.connect import DatabricksSession -from databricks.labs.lakebridge.config import DatabaseConfig, ReconcileMetadataConfig, ReconcileConfig +from databricks.labs.lakebridge.config import ( + DatabaseConfig, + ReconcileMetadataConfig, + ReconcileConfig, + ReconcileCredentialConfig, +) from databricks.labs.lakebridge.reconcile.connectors.databricks import DatabricksDataSource from databricks.labs.lakebridge.reconcile.recon_capture import ReconCapture from databricks.labs.lakebridge.reconcile.recon_config import Table, JdbcReaderOptions @@ -46,7 +51,7 @@ def test_oracle_db_reconcile(mock_spark, mock_workspace_client, tmp_path): reconcile_config = ReconcileConfig( data_source="oracle", report_type=report, - secret_scope="not used", + creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), database_config=db_config, metadata_config=ReconcileMetadataConfig(catalog="tmp", schema="reconcile"), ) diff --git a/tests/unit/deployment/test_installation.py b/tests/unit/deployment/test_installation.py index 039e4a412b..f13ede1f23 100644 --- a/tests/unit/deployment/test_installation.py +++ b/tests/unit/deployment/test_installation.py @@ -16,6 +16,7 @@ ReconcileConfig, DatabaseConfig, ReconcileMetadataConfig, + ReconcileCredentialConfig, ) from databricks.labs.lakebridge.deployment.installation import WorkspaceInstallation from databricks.labs.lakebridge.deployment.recon import ReconDeployment @@ -55,7 +56,7 @@ def test_install_all(ws): reconcile_config = ReconcileConfig( data_source="oracle", report_type="all", - secret_scope="remorph_oracle6", + creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), database_config=DatabaseConfig( source_schema="tpch_sf10006", target_catalog="tpch6", @@ -110,7 +111,7 @@ def test_recon_component_installation(ws): reconcile_config = ReconcileConfig( data_source="oracle", report_type="all", - secret_scope="remorph_oracle8", + creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), database_config=DatabaseConfig( source_schema="tpch_sf10008", target_catalog="tpch8", @@ -193,7 +194,7 @@ def test_uninstall_configs_exist(ws): reconcile_config = ReconcileConfig( data_source="snowflake", report_type="all", - secret_scope="remorph_snowflake1", + creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), database_config=DatabaseConfig( source_catalog="snowflake_sample_data1", source_schema="tpch_sf10001", diff --git a/tests/unit/deployment/test_job.py b/tests/unit/deployment/test_job.py index e5f263e0f2..39e8849180 100644 --- a/tests/unit/deployment/test_job.py 
+++ b/tests/unit/deployment/test_job.py @@ -13,6 +13,7 @@ ReconcileConfig, DatabaseConfig, ReconcileMetadataConfig, + ReconcileCredentialConfig, ) from databricks.labs.lakebridge.deployment.job import JobDeployment @@ -22,7 +23,7 @@ def oracle_recon_config() -> ReconcileConfig: return ReconcileConfig( data_source="oracle", report_type="all", - secret_scope="remorph_oracle9", + creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), database_config=DatabaseConfig( source_schema="tpch_sf10009", target_catalog="tpch9", @@ -41,7 +42,7 @@ def snowflake_recon_config() -> ReconcileConfig: return ReconcileConfig( data_source="snowflake", report_type="all", - secret_scope="remorph_snowflake9", + creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), database_config=DatabaseConfig( source_schema="tpch_sf10009", target_catalog="tpch9", diff --git a/tests/unit/deployment/test_recon.py b/tests/unit/deployment/test_recon.py index f55c62b757..4eed4862fe 100644 --- a/tests/unit/deployment/test_recon.py +++ b/tests/unit/deployment/test_recon.py @@ -13,6 +13,7 @@ ReconcileConfig, DatabaseConfig, ReconcileMetadataConfig, + ReconcileCredentialConfig, ) from databricks.labs.lakebridge.deployment.dashboard import DashboardDeployment from databricks.labs.lakebridge.deployment.job import JobDeployment @@ -56,7 +57,7 @@ def test_install(ws): reconcile_config = ReconcileConfig( data_source="snowflake", report_type="all", - secret_scope="remorph_snowflake4", + creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), database_config=DatabaseConfig( source_catalog="snowflake_sample_data4", source_schema="tpch_sf10004", @@ -149,7 +150,7 @@ def test_uninstall(ws): recon_config = ReconcileConfig( data_source="snowflake", report_type="all", - secret_scope="remorph_snowflake5", + creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), database_config=DatabaseConfig( source_catalog="snowflake_sample_data5", source_schema="tpch_sf10005", diff --git a/tests/unit/test_install.py b/tests/unit/test_install.py index e599ca5f68..6f2f96a57b 100644 --- a/tests/unit/test_install.py +++ b/tests/unit/test_install.py @@ -17,6 +17,7 @@ ReconcileConfig, ReconcileMetadataConfig, TranspileConfig, + ReconcileCredentialConfig, ) from databricks.labs.lakebridge.contexts.application import ApplicationContext from databricks.labs.lakebridge.deployment.configurator import ResourceConfigurator @@ -617,7 +618,7 @@ def test_configure_reconcile_installation_config_error_continue_install(ws: Work "reconcile.yml": { "source_dialect": "oracle", # Invalid key "report_type": "all", - "secret_scope": "remorph_oracle", + "secret_scope": "remorph_oracle", # version 1 "database_config": { "source_schema": "tpch_sf1000", "target_catalog": "tpch", @@ -661,7 +662,7 @@ def test_configure_reconcile_installation_config_error_continue_install(ws: Work reconcile=ReconcileConfig( data_source="oracle", report_type="all", - secret_scope="remorph_oracle", + creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "remorph_oracle"}), database_config=DatabaseConfig( source_schema="tpch_sf1000", target_catalog="tpch", @@ -681,7 +682,10 @@ def test_configure_reconcile_installation_config_error_continue_install(ws: Work { "data_source": "oracle", "report_type": "all", - "secret_scope": "remorph_oracle", + "creds": { + "vault_type": "databricks", + "source_creds": {"__secret_scope": "remorph_oracle"}, + }, "database_config": { 
"source_schema": "tpch_sf1000", "target_catalog": "tpch", @@ -692,7 +696,7 @@ def test_configure_reconcile_installation_config_error_continue_install(ws: Work "schema": "reconcile", "volume": "reconcile_volume", }, - "version": 1, + "version": 2, }, ) @@ -740,7 +744,9 @@ def test_configure_reconcile_no_existing_installation(ws: WorkspaceClient) -> No reconcile=ReconcileConfig( data_source="snowflake", report_type="all", - secret_scope="remorph_snowflake", + creds=ReconcileCredentialConfig( + vault_type="databricks", source_creds={"__secret_scope": "remorph_snowflake"} + ), database_config=DatabaseConfig( source_schema="tpch_sf1000", target_catalog="tpch", @@ -761,7 +767,10 @@ def test_configure_reconcile_no_existing_installation(ws: WorkspaceClient) -> No { "data_source": "snowflake", "report_type": "all", - "secret_scope": "remorph_snowflake", + "creds": { + "vault_type": "databricks", + "source_creds": {"__secret_scope": "remorph_snowflake"}, + }, "database_config": { "source_catalog": "snowflake_sample_data", "source_schema": "tpch_sf1000", @@ -773,7 +782,7 @@ def test_configure_reconcile_no_existing_installation(ws: WorkspaceClient) -> No "schema": "reconcile", "volume": "reconcile_volume", }, - "version": 1, + "version": 2, }, ) @@ -819,7 +828,7 @@ def test_configure_all_override_installation( "reconcile.yml": { "data_source": "snowflake", "report_type": "all", - "secret_scope": "remorph_snowflake", + "secret_scope": "remorph_snowflake", # v1 "database_config": { "source_catalog": "snowflake_sample_data", "source_schema": "tpch_sf1000", @@ -876,7 +885,7 @@ def test_configure_all_override_installation( expected_reconcile_config = ReconcileConfig( data_source="snowflake", report_type="all", - secret_scope="remorph_snowflake", + creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "remorph_snowflake"}), database_config=DatabaseConfig( source_schema="tpch_sf1000", target_catalog="tpch", @@ -911,7 +920,10 @@ def test_configure_all_override_installation( { "data_source": "snowflake", "report_type": "all", - "secret_scope": "remorph_snowflake", + "creds": { + "vault_type": "databricks", + "source_creds": {"__secret_scope": "remorph_snowflake"}, + }, "database_config": { "source_catalog": "snowflake_sample_data", "source_schema": "tpch_sf1000", @@ -923,7 +935,7 @@ def test_configure_all_override_installation( "schema": "reconcile", "volume": "reconcile_volume", }, - "version": 1, + "version": 2, }, ) From be3f4bfa92b880bc302bb3d4b05ea0007fc79ee4 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Mon, 24 Nov 2025 15:39:35 +0100 Subject: [PATCH 15/44] fix test fixtures --- tests/integration/reconcile/query_builder/test_execute.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/reconcile/query_builder/test_execute.py b/tests/integration/reconcile/query_builder/test_execute.py index e61efe750c..26fdd74a8d 100644 --- a/tests/integration/reconcile/query_builder/test_execute.py +++ b/tests/integration/reconcile/query_builder/test_execute.py @@ -736,7 +736,7 @@ def mock_for_report_type_data( reconcile_config_data = ReconcileConfig( data_source="databricks", report_type="data", - creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), + creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, @@ -937,7 +937,7 @@ def mock_for_report_type_schema( reconcile_config_schema = 
ReconcileConfig( data_source="databricks", report_type="schema", - creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), + creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, @@ -1582,7 +1582,7 @@ def mock_for_recon_exception(normalized_table_conf_with_opts, setup_metadata_tab reconcile_config_exception = ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), + creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, From 9b1dd103658febd2756d20867ce744fb6634a173 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Mon, 24 Nov 2025 15:46:46 +0100 Subject: [PATCH 16/44] one more fix --- tests/integration/reconcile/query_builder/test_execute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/reconcile/query_builder/test_execute.py b/tests/integration/reconcile/query_builder/test_execute.py index 26fdd74a8d..96933a6ec7 100644 --- a/tests/integration/reconcile/query_builder/test_execute.py +++ b/tests/integration/reconcile/query_builder/test_execute.py @@ -2042,7 +2042,7 @@ def test_recon_output_without_exception(mock_gen_final_recon_output): reconcile_config = ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), + creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, From 76b044e13e338947817fb8eca8345f2e12d5f8a8 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Fri, 28 Nov 2025 15:54:09 +0100 Subject: [PATCH 17/44] change hardcoded vault_type value (it is only used in a log line so far) --- src/databricks/labs/lakebridge/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/lakebridge/config.py b/src/databricks/labs/lakebridge/config.py index 40a225ddc4..36e52f3e6a 100644 --- a/src/databricks/labs/lakebridge/config.py +++ b/src/databricks/labs/lakebridge/config.py @@ -282,7 +282,7 @@ class ReconcileConfig: def v1_migrate(cls, raw: dict[str, Any]) -> dict[str, Any]: secret_scope = raw.pop("secret_scope") raw["version"] = 2 - raw["creds"] = {"vault_type": "local", "source_creds": {"__secret_scope": secret_scope}} + raw["creds"] = {"vault_type": "databricks", "source_creds": {"__secret_scope": secret_scope}} return raw From 87ff1a60d568a311087984f600f41a6d61d8225a Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Wed, 3 Dec 2025 12:56:23 +0100 Subject: [PATCH 18/44] Apply suggestions from code review Co-authored-by: Andrew Snare --- src/databricks/labs/lakebridge/config.py | 2 +- .../connections/credential_manager.py | 18 ++++++++++++------ .../labs/lakebridge/deployment/recon.py | 2 +- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/databricks/labs/lakebridge/config.py b/src/databricks/labs/lakebridge/config.py index 36e52f3e6a..31de025b18 100644 --- a/src/databricks/labs/lakebridge/config.py +++ b/src/databricks/labs/lakebridge/config.py @@ -279,7 +279,7 @@ class ReconcileConfig: tables: ReconcileTablesConfig | None = None @classmethod - def v1_migrate(cls, raw: dict[str, Any]) -> dict[str, Any]: + def v1_migrate(cls, raw: dict[str, 
JsonValue]) -> dict[str, JsonValue]: secret_scope = raw.pop("secret_scope") raw["version"] = 2 raw["creds"] = {"vault_type": "databricks", "source_creds": {"__secret_scope": secret_scope}} diff --git a/src/databricks/labs/lakebridge/connections/credential_manager.py b/src/databricks/labs/lakebridge/connections/credential_manager.py index e1994f59d1..99a32c6f34 100644 --- a/src/databricks/labs/lakebridge/connections/credential_manager.py +++ b/src/databricks/labs/lakebridge/connections/credential_manager.py @@ -48,16 +48,22 @@ def get_databricks_secret(self, scope: str, key: str) -> str: def get_secret(self, key: str) -> str: """Get the secret value given a secret scope & secret key. - :param key: key in the format 'scope/secret' - :return: The decoded UTF-8 secret value. + Args: + key: key in the format 'scope/secret' + Returns: + The decoded UTF-8 secret value. Raises: NotFound: The secret could not be found. UnicodeDecodeError: The secret value was not Base64-encoded UTF-8. """ - key_parts = key.split(sep="/") - assert len(key_parts) == 2, "Secret name must be in the format 'scope/secret'" - scope, key_only = key_parts[0], key_parts[1] + match key.split(sep="/", maxsplit=3): + case scope, key_only: + scope = scope + key_only = key_only + case _: + msg = f"Secret key must be in the format 'scope/secret': {key}" + raise ValueError(msg) try: secret = self._ws.secrets.get_secret(scope, key_only) @@ -117,7 +123,7 @@ def _load_credentials(path: Path) -> dict: raise FileNotFoundError(f"Credentials file not found at {path}") from e -def create_databricks_secret_provider(ws) -> DatabricksSecretProvider: +def create_databricks_secret_provider(ws: WorkspaceClient) -> DatabricksSecretProvider: return DatabricksSecretProvider(ws) diff --git a/src/databricks/labs/lakebridge/deployment/recon.py b/src/databricks/labs/lakebridge/deployment/recon.py index 093823ef58..4cbe51e24b 100644 --- a/src/databricks/labs/lakebridge/deployment/recon.py +++ b/src/databricks/labs/lakebridge/deployment/recon.py @@ -61,7 +61,7 @@ def uninstall(self, recon_config: ReconcileConfig | None): f"from catalog `{recon_config.metadata_config.catalog}`. Please remove it and the tables inside manually." ) logging.info( - f"Won't remove configured reconcile creds from `{recon_config.creds.vault_type}`. " + f"Won't remove configured reconcile credentials from `{recon_config.creds.vault_type}`. " f"Please remove it manually." ) From 6d8230c036b89fa3bb82e894c735fad7b8528ecb Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Wed, 3 Dec 2025 13:06:06 +0100 Subject: [PATCH 19/44] fmt after applying suggestions --- .../labs/lakebridge/connections/credential_manager.py | 9 +++++---- tests/unit/test_install.py | 4 +++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/databricks/labs/lakebridge/connections/credential_manager.py b/src/databricks/labs/lakebridge/connections/credential_manager.py index 99a32c6f34..deab385728 100644 --- a/src/databricks/labs/lakebridge/connections/credential_manager.py +++ b/src/databricks/labs/lakebridge/connections/credential_manager.py @@ -54,13 +54,14 @@ def get_secret(self, key: str) -> str: The decoded UTF-8 secret value. Raises: - NotFound: The secret could not be found. + ValueError: The secret key must be in the format 'scope/secret'. + KeyError: The secret could not be found. UnicodeDecodeError: The secret value was not Base64-encoded UTF-8. 
""" match key.split(sep="/", maxsplit=3): - case scope, key_only: - scope = scope - key_only = key_only + case _scope, _key_only: + scope = _scope + key_only = _key_only case _: msg = f"Secret key must be in the format 'scope/secret': {key}" raise ValueError(msg) diff --git a/tests/unit/test_install.py b/tests/unit/test_install.py index f65dd039ce..a7c895dd87 100644 --- a/tests/unit/test_install.py +++ b/tests/unit/test_install.py @@ -830,7 +830,6 @@ def test_configure_reconcile_databricks_no_existing_installation(ws: WorkspaceCl reconcile=ReconcileConfig( data_source="databricks", report_type="all", - secret_scope="remorph_databricks", database_config=DatabaseConfig( source_schema="some_schema", target_catalog="tpch", @@ -842,6 +841,9 @@ def test_configure_reconcile_databricks_no_existing_installation(ws: WorkspaceCl schema="reconcile", volume="reconcile_volume", ), + creds=ReconcileCredentialConfig( + vault_type="databricks", source_creds={"__secret_scope": "remorph_databricks"} + ), ), transpile=None, ) From e31a720290d20a465e88ad53c23d0393c0e6f75f Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Wed, 3 Dec 2025 13:09:44 +0100 Subject: [PATCH 20/44] fix test --- tests/unit/test_install.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_install.py b/tests/unit/test_install.py index a7c895dd87..0c80635485 100644 --- a/tests/unit/test_install.py +++ b/tests/unit/test_install.py @@ -853,7 +853,10 @@ def test_configure_reconcile_databricks_no_existing_installation(ws: WorkspaceCl { "data_source": "databricks", "report_type": "all", - "secret_scope": "remorph_databricks", + "creds": { + "vault_type": "databricks", + "source_creds": {"__secret_scope": "remorph_databricks"}, + }, "database_config": { "source_catalog": "databricks_catalog", "source_schema": "some_schema", @@ -865,7 +868,7 @@ def test_configure_reconcile_databricks_no_existing_installation(ws: WorkspaceCl "schema": "reconcile", "volume": "reconcile_volume", }, - "version": 1, + "version": 2, }, ) From 3ef81cc26445a0a8cead72f5a635e4964078307f Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Wed, 3 Dec 2025 13:42:34 +0100 Subject: [PATCH 21/44] do not lazy init and remove unused method --- .../connections/credential_manager.py | 23 +++++-------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/src/databricks/labs/lakebridge/connections/credential_manager.py b/src/databricks/labs/lakebridge/connections/credential_manager.py index deab385728..3816beb86e 100644 --- a/src/databricks/labs/lakebridge/connections/credential_manager.py +++ b/src/databricks/labs/lakebridge/connections/credential_manager.py @@ -1,5 +1,3 @@ -from collections.abc import Callable -from functools import partial from pathlib import Path import logging from typing import Protocol @@ -42,9 +40,6 @@ class DatabricksSecretProvider(SecretProvider): def __init__(self, ws: WorkspaceClient): self._ws = ws - def get_databricks_secret(self, scope: str, key: str) -> str: - return self.get_secret(f"{scope}/{key}") - def get_secret(self, key: str) -> str: """Get the secret value given a secret scope & secret key. 
@@ -83,15 +78,10 @@ def get_secret(self, key: str) -> str: class CredentialManager: - SecretProviderFactory = Callable[[], SecretProvider] - - def __init__(self, credentials: dict, secret_providers: dict[str, SecretProviderFactory]): + def __init__(self, credentials: dict, secret_providers: dict[str, SecretProvider]): self._credentials = credentials self._default_vault = self._credentials.get('secret_vault_type', 'local').lower() - provider_factory = secret_providers.get(self._default_vault) - if not provider_factory: - raise ValueError(f"Unsupported secret vault type: {self._default_vault}") - self._provider = provider_factory() + self._provider = secret_providers.get(self._default_vault) def get_credentials(self, source: str) -> dict: if source not in self._credentials: @@ -134,13 +124,12 @@ def create_credential_manager(creds_or_path: dict | Path, ws: WorkspaceClient | else: creds = creds_or_path - # Lazily initialize secret providers - secret_providers: dict[str, CredentialManager.SecretProviderFactory] = { - 'local': LocalSecretProvider, - 'env': partial(EnvSecretProvider, EnvGetter()), + secret_providers: dict[str, SecretProvider] = { + 'local': LocalSecretProvider(), + 'env': EnvSecretProvider(EnvGetter()), } if ws: - secret_providers['databricks'] = partial(create_databricks_secret_provider, ws) + secret_providers['databricks'] = create_databricks_secret_provider(ws) return CredentialManager(creds, secret_providers) From 178fe0faec3ad45ee6b232a3ff5bd4a2c8a0ed5b Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Wed, 3 Dec 2025 15:35:20 +0100 Subject: [PATCH 22/44] add validation for vault_type --- src/databricks/labs/lakebridge/config.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/lakebridge/config.py b/src/databricks/labs/lakebridge/config.py index 31de025b18..2cfd28b436 100644 --- a/src/databricks/labs/lakebridge/config.py +++ b/src/databricks/labs/lakebridge/config.py @@ -261,9 +261,13 @@ class ReconcileMetadataConfig: @dataclass class ReconcileCredentialConfig: - vault_type: str # supports local, env, databricks creds. 
+ vault_type: str source_creds: dict[str, str] + def __post_init__(self): + if self.vault_type not in {"local", "env", "databricks"}: + raise ValueError(f"Unsupported vault_type: {self.vault_type}") + @dataclass class ReconcileConfig: From 0f3e3ffe3e08ea6fdcc0602089309b517ace80ff Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Wed, 3 Dec 2025 17:09:56 +0100 Subject: [PATCH 23/44] type test_credential_manager.py --- tests/unit/connections/test_credential_manager.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/unit/connections/test_credential_manager.py b/tests/unit/connections/test_credential_manager.py index 07e62a7b0f..08d94c4d0f 100644 --- a/tests/unit/connections/test_credential_manager.py +++ b/tests/unit/connections/test_credential_manager.py @@ -8,9 +8,6 @@ from databricks.sdk.service.workspace import GetSecretResponse -product_name = "remorph" - - @pytest.fixture def local_credentials(): return { @@ -52,7 +49,7 @@ def databricks_credentials(): } -def test_local_credentials(local_credentials): +def test_local_credentials(local_credentials: dict[str, str]) -> None: credentials = create_credential_manager(local_credentials) creds = credentials.get_credentials('mssql') assert creds['user'] == 'local_user' @@ -60,14 +57,14 @@ def test_local_credentials(local_credentials): @patch.dict('os.environ', {'MSSQL_USER_ENV': 'env_user', 'MSSQL_PASSWORD_ENV': 'env_password'}) -def test_env_credentials(env_credentials): +def test_env_credentials(env_credentials: dict[str, str]) -> None: credentials = create_credential_manager(env_credentials) creds = credentials.get_credentials('mssql') assert creds['user'] == 'env_user' assert creds['password'] == 'env_password' -def test_databricks_credentials(databricks_credentials, mock_workspace_client): +def test_databricks_credentials(databricks_credentials: dict[str, str], mock_workspace_client) -> None: mock_workspace_client.secrets.get_secret.return_value = GetSecretResponse( key='some_key', value=base64.b64encode(bytes('some_secret', 'utf-8')).decode('utf-8') ) @@ -77,7 +74,7 @@ def test_databricks_credentials(databricks_credentials, mock_workspace_client): assert creds['password'] == 'some_secret' -def test_databricks_credentials_not_found(databricks_credentials, mock_workspace_client): +def test_databricks_credentials_not_found(databricks_credentials: dict[str, str], mock_workspace_client) -> None: mock_workspace_client.secrets.get_secret.side_effect = NotFound("Test Exception") credentials = create_credential_manager(databricks_credentials, mock_workspace_client) From 2f66882c136d31be2bc13dc05883fd014297e81a Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Thu, 4 Dec 2025 14:31:55 +0100 Subject: [PATCH 24/44] add tests --- .../connections/credential_manager.py | 4 ++-- .../connections/test_credential_manager.py | 21 +++++++++++++++++-- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/lakebridge/connections/credential_manager.py b/src/databricks/labs/lakebridge/connections/credential_manager.py index 3816beb86e..d3fcf29ad6 100644 --- a/src/databricks/labs/lakebridge/connections/credential_manager.py +++ b/src/databricks/labs/lakebridge/connections/credential_manager.py @@ -58,7 +58,7 @@ def get_secret(self, key: str) -> str: scope = _scope key_only = _key_only case _: - msg = f"Secret key must be in the format 'scope/secret': {key}" + msg = f"Secret key must be in the format 'scope/secret': Got {key}" raise ValueError(msg) try: @@ -66,7 +66,7 @@ def get_secret(self, key: str) -> str: 
assert secret.value is not None return base64.b64decode(secret.value).decode("utf-8") except NotFound as e: - raise KeyError(f'Secret does not exist with scope: {scope} and key: {key_only} : {e}') from e + raise KeyError(f'Secret does not exist with scope: {scope} and key: {key_only}') from e except UnicodeDecodeError as e: raise UnicodeDecodeError( "utf-8", diff --git a/tests/unit/connections/test_credential_manager.py b/tests/unit/connections/test_credential_manager.py index 08d94c4d0f..a2fa4202d1 100644 --- a/tests/unit/connections/test_credential_manager.py +++ b/tests/unit/connections/test_credential_manager.py @@ -49,6 +49,16 @@ def databricks_credentials(): } +@pytest.fixture +def databricks_invalid_key(): + return { + 'secret_vault_type': 'databricks', + 'mssql': { + 'database': 'without_scope', + }, + } + + def test_local_credentials(local_credentials: dict[str, str]) -> None: credentials = create_credential_manager(local_credentials) creds = credentials.get_credentials('mssql') @@ -78,5 +88,12 @@ def test_databricks_credentials_not_found(databricks_credentials: dict[str, str] mock_workspace_client.secrets.get_secret.side_effect = NotFound("Test Exception") credentials = create_credential_manager(databricks_credentials, mock_workspace_client) - with pytest.raises(KeyError, match="Source system: unknown credentials not found"): - credentials.get_credentials("unknown") + with pytest.raises(KeyError, match="Secret does not exist with scope: databricks_vault_name and key: db_key"): + credentials.get_credentials("mssql") + + +def test_databricks_invalid_key(databricks_invalid_key: dict[str, str], mock_workspace_client) -> None: + credentials = create_credential_manager(databricks_invalid_key, mock_workspace_client) + + with pytest.raises(ValueError, match="Secret key must be in the format 'scope/secret': Got without_scope"): + credentials.get_credentials("mssql") From fef02c5a08de64c015d95cc1ea4e453698c3a4c5 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Thu, 4 Dec 2025 15:53:22 +0100 Subject: [PATCH 25/44] remove one unnecessary indirection --- .../labs/lakebridge/connections/credential_manager.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/databricks/labs/lakebridge/connections/credential_manager.py b/src/databricks/labs/lakebridge/connections/credential_manager.py index d3fcf29ad6..f6ab1e7132 100644 --- a/src/databricks/labs/lakebridge/connections/credential_manager.py +++ b/src/databricks/labs/lakebridge/connections/credential_manager.py @@ -114,10 +114,6 @@ def _load_credentials(path: Path) -> dict: raise FileNotFoundError(f"Credentials file not found at {path}") from e -def create_databricks_secret_provider(ws: WorkspaceClient) -> DatabricksSecretProvider: - return DatabricksSecretProvider(ws) - - def create_credential_manager(creds_or_path: dict | Path, ws: WorkspaceClient | None = None) -> CredentialManager: if isinstance(creds_or_path, Path): creds = _load_credentials(creds_or_path) @@ -130,6 +126,6 @@ def create_credential_manager(creds_or_path: dict | Path, ws: WorkspaceClient | } if ws: - secret_providers['databricks'] = create_databricks_secret_provider(ws) + secret_providers['databricks'] = DatabricksSecretProvider(ws) return CredentialManager(creds, secret_providers) From 235bd1ab8b8998998f13bde9979273d6b5ca86ef Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Mon, 8 Dec 2025 17:30:52 +0100 Subject: [PATCH 26/44] show user value error on decode errors --- .../labs/lakebridge/connections/credential_manager.py | 9 ++------- 1 file changed, 2 
insertions(+), 7 deletions(-) diff --git a/src/databricks/labs/lakebridge/connections/credential_manager.py b/src/databricks/labs/lakebridge/connections/credential_manager.py index f6ab1e7132..f42e087600 100644 --- a/src/databricks/labs/lakebridge/connections/credential_manager.py +++ b/src/databricks/labs/lakebridge/connections/credential_manager.py @@ -68,13 +68,8 @@ def get_secret(self, key: str) -> str: except NotFound as e: raise KeyError(f'Secret does not exist with scope: {scope} and key: {key_only}') from e except UnicodeDecodeError as e: - raise UnicodeDecodeError( - "utf-8", - key_only.encode(), - 0, - 1, - f"Secret {key} has Base64 bytes that cannot be decoded to utf-8 string: {e}.", - ) from e + msg = f"Secret {key} has Base64 bytes that cannot be decoded to UTF-8 string" + raise ValueError(msg) from e class CredentialManager: From 4a9067342469d0b7ae699024c6dec56841ea7158 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Mon, 8 Dec 2025 18:49:23 +0100 Subject: [PATCH 27/44] add factory method --- .../assessments/configure_assessment.py | 4 +- .../labs/lakebridge/assessments/profiler.py | 5 +-- .../connections/credential_manager.py | 39 +++++++++++-------- .../synapse/dedicated_sqlpool_extract.py | 4 +- .../synapse/monitoring_metrics_extract.py | 4 +- .../synapse/serverless_sqlpool_extract.py | 4 +- .../assessments/synapse/workspace_extract.py | 4 +- .../connections/test_credential_manager.py | 12 +++--- 8 files changed, 40 insertions(+), 36 deletions(-) diff --git a/src/databricks/labs/lakebridge/assessments/configure_assessment.py b/src/databricks/labs/lakebridge/assessments/configure_assessment.py index 1c9d0081c1..edcbe64e8b 100644 --- a/src/databricks/labs/lakebridge/assessments/configure_assessment.py +++ b/src/databricks/labs/lakebridge/assessments/configure_assessment.py @@ -8,7 +8,7 @@ from databricks.labs.lakebridge.connections.credential_manager import ( cred_file as creds, - create_credential_manager, + CredentialManager, ) from databricks.labs.lakebridge.connections.database_manager import DatabaseManager from databricks.labs.lakebridge.assessments import CONNECTOR_REQUIRED @@ -43,7 +43,7 @@ def _configure_credentials(self) -> str: pass def _test_connection(self, source: str): - cred_manager = create_credential_manager(self._credential_file) + cred_manager = CredentialManager.from_file(self._credential_file) config = cred_manager.get_credentials(source) try: diff --git a/src/databricks/labs/lakebridge/assessments/profiler.py b/src/databricks/labs/lakebridge/assessments/profiler.py index 6b8b57c6b5..5160badbfe 100644 --- a/src/databricks/labs/lakebridge/assessments/profiler.py +++ b/src/databricks/labs/lakebridge/assessments/profiler.py @@ -5,8 +5,7 @@ from databricks.labs.lakebridge.assessments.profiler_config import PipelineConfig from databricks.labs.lakebridge.connections.database_manager import DatabaseManager from databricks.labs.lakebridge.connections.credential_manager import ( - create_credential_manager, - cred_file, + CredentialManager, ) from databricks.labs.lakebridge.assessments import ( PRODUCT_NAME, @@ -62,7 +61,7 @@ def profile( def _setup_extractor(platform: str) -> DatabaseManager | None: if not CONNECTOR_REQUIRED[platform]: return None - cred_manager = create_credential_manager(cred_file(PRODUCT_NAME)) + cred_manager = CredentialManager.from_product_name(PRODUCT_NAME) connect_config = cred_manager.get_credentials(platform) return DatabaseManager(platform, connect_config) diff --git a/src/databricks/labs/lakebridge/connections/credential_manager.py 
b/src/databricks/labs/lakebridge/connections/credential_manager.py index f42e087600..60f71c36ac 100644 --- a/src/databricks/labs/lakebridge/connections/credential_manager.py +++ b/src/databricks/labs/lakebridge/connections/credential_manager.py @@ -78,6 +78,28 @@ def __init__(self, credentials: dict, secret_providers: dict[str, SecretProvider self._default_vault = self._credentials.get('secret_vault_type', 'local').lower() self._provider = secret_providers.get(self._default_vault) + @classmethod + def from_product_name(cls, product_name: str, ws: WorkspaceClient | None = None) -> "CredentialManager": + path = cred_file(product_name) + credentials = _load_credentials(path) + return cls.from_credentials(credentials, ws) + + @classmethod + def from_file(cls, path: Path, ws: WorkspaceClient | None = None) -> "CredentialManager": + credentials = _load_credentials(path) + return cls.from_credentials(credentials, ws) + + @classmethod + def from_credentials(cls, credentials: dict, ws: WorkspaceClient | None = None) -> "CredentialManager": + secret_providers: dict[str, SecretProvider] = { + 'local': LocalSecretProvider(), + 'env': EnvSecretProvider(EnvGetter()), + } + + if ws: + secret_providers['databricks'] = DatabricksSecretProvider(ws) + return cls(credentials, secret_providers) + def get_credentials(self, source: str) -> dict: if source not in self._credentials: raise KeyError(f"Source system: {source} credentials not found") @@ -107,20 +129,3 @@ def _load_credentials(path: Path) -> dict: return yaml.safe_load(f) except FileNotFoundError as e: raise FileNotFoundError(f"Credentials file not found at {path}") from e - - -def create_credential_manager(creds_or_path: dict | Path, ws: WorkspaceClient | None = None) -> CredentialManager: - if isinstance(creds_or_path, Path): - creds = _load_credentials(creds_or_path) - else: - creds = creds_or_path - - secret_providers: dict[str, SecretProvider] = { - 'local': LocalSecretProvider(), - 'env': EnvSecretProvider(EnvGetter()), - } - - if ws: - secret_providers['databricks'] = DatabricksSecretProvider(ws) - - return CredentialManager(creds, secret_providers) diff --git a/src/databricks/labs/lakebridge/resources/assessments/synapse/dedicated_sqlpool_extract.py b/src/databricks/labs/lakebridge/resources/assessments/synapse/dedicated_sqlpool_extract.py index 23c04136f9..5f2bfe2ebc 100644 --- a/src/databricks/labs/lakebridge/resources/assessments/synapse/dedicated_sqlpool_extract.py +++ b/src/databricks/labs/lakebridge/resources/assessments/synapse/dedicated_sqlpool_extract.py @@ -14,7 +14,7 @@ from databricks.labs.lakebridge.resources.assessments.synapse.common.connector import get_sqlpool_reader import zoneinfo -from databricks.labs.lakebridge.connections.credential_manager import create_credential_manager +from databricks.labs.lakebridge.connections.credential_manager import CredentialManager from databricks.labs.lakebridge.resources.assessments.synapse.common.profiler_classes import SynapseWorkspace from databricks.labs.lakebridge.resources.assessments.synapse.common.queries import SynapseQueries @@ -24,7 +24,7 @@ def execute(): db_path, creds_file = arguments_loader(desc="Synapse Synapse Dedicated SQL Pool Extract Script") - cred_manager = create_credential_manager(Path(creds_file)) + cred_manager = CredentialManager.from_file(Path(creds_file)) synapse_workspace_settings = cred_manager.get_credentials("synapse") config = synapse_workspace_settings["workspace"] auth_type = synapse_workspace_settings["jdbc"].get("auth_type", "sql_authentication") diff 
--git a/src/databricks/labs/lakebridge/resources/assessments/synapse/monitoring_metrics_extract.py b/src/databricks/labs/lakebridge/resources/assessments/synapse/monitoring_metrics_extract.py index 2077a98dd2..71f404d03f 100644 --- a/src/databricks/labs/lakebridge/resources/assessments/synapse/monitoring_metrics_extract.py +++ b/src/databricks/labs/lakebridge/resources/assessments/synapse/monitoring_metrics_extract.py @@ -6,7 +6,7 @@ import zoneinfo import pandas as pd -from databricks.labs.lakebridge.connections.credential_manager import create_credential_manager +from databricks.labs.lakebridge.connections.credential_manager import CredentialManager from databricks.labs.lakebridge.resources.assessments.synapse.common.profiler_classes import ( SynapseWorkspace, SynapseMetrics, @@ -24,7 +24,7 @@ def execute(): logger = set_logger(__name__) db_path, creds_file = arguments_loader(desc="Monitoring Metrics Extract Script") - cred_manager = create_credential_manager(Path(creds_file)) + cred_manager = CredentialManager.from_file(Path(creds_file)) synapse_workspace_settings = cred_manager.get_credentials("synapse") synapse_profiler_settings = synapse_workspace_settings["profiler"] diff --git a/src/databricks/labs/lakebridge/resources/assessments/synapse/serverless_sqlpool_extract.py b/src/databricks/labs/lakebridge/resources/assessments/synapse/serverless_sqlpool_extract.py index dca95d1954..8fd83b1a54 100644 --- a/src/databricks/labs/lakebridge/resources/assessments/synapse/serverless_sqlpool_extract.py +++ b/src/databricks/labs/lakebridge/resources/assessments/synapse/serverless_sqlpool_extract.py @@ -4,7 +4,7 @@ import duckdb -from databricks.labs.lakebridge.connections.credential_manager import create_credential_manager +from databricks.labs.lakebridge.connections.credential_manager import CredentialManager from databricks.labs.lakebridge.resources.assessments.synapse.common.functions import ( arguments_loader, @@ -55,7 +55,7 @@ def execute(): logger = set_logger(__name__) db_path, creds_file = arguments_loader(desc="Synapse Synapse Serverless SQL Pool Extract Script") - cred_manager = create_credential_manager(Path(creds_file)) + cred_manager = CredentialManager.from_file(Path(creds_file)) synapse_workspace_settings = cred_manager.get_credentials("synapse") config = synapse_workspace_settings["workspace"] auth_type = synapse_workspace_settings["jdbc"].get("auth_type", "sql_authentication") diff --git a/src/databricks/labs/lakebridge/resources/assessments/synapse/workspace_extract.py b/src/databricks/labs/lakebridge/resources/assessments/synapse/workspace_extract.py index 0c9087c9d8..9e277877af 100644 --- a/src/databricks/labs/lakebridge/resources/assessments/synapse/workspace_extract.py +++ b/src/databricks/labs/lakebridge/resources/assessments/synapse/workspace_extract.py @@ -6,7 +6,7 @@ import pandas as pd -from databricks.labs.lakebridge.connections.credential_manager import create_credential_manager +from databricks.labs.lakebridge.connections.credential_manager import CredentialManager from databricks.labs.lakebridge.resources.assessments.synapse.common.functions import ( arguments_loader, @@ -22,7 +22,7 @@ def execute(): db_path, creds_file = arguments_loader(desc="Workspace Extract") - cred_manager = create_credential_manager(Path(creds_file)) + cred_manager = CredentialManager.from_file(Path(creds_file)) synapse_workspace_settings = cred_manager.get_credentials("synapse") tz_info = synapse_workspace_settings["workspace"]["tz_info"] workspace_tz = zoneinfo.ZoneInfo(tz_info) diff --git 
a/tests/unit/connections/test_credential_manager.py b/tests/unit/connections/test_credential_manager.py index a2fa4202d1..9de95a76b9 100644 --- a/tests/unit/connections/test_credential_manager.py +++ b/tests/unit/connections/test_credential_manager.py @@ -3,7 +3,7 @@ import pytest -from databricks.labs.lakebridge.connections.credential_manager import create_credential_manager +from databricks.labs.lakebridge.connections.credential_manager import CredentialManager from databricks.sdk.errors import NotFound from databricks.sdk.service.workspace import GetSecretResponse @@ -60,7 +60,7 @@ def databricks_invalid_key(): def test_local_credentials(local_credentials: dict[str, str]) -> None: - credentials = create_credential_manager(local_credentials) + credentials = CredentialManager.from_credentials(local_credentials) creds = credentials.get_credentials('mssql') assert creds['user'] == 'local_user' assert creds['password'] == 'local_password' @@ -68,7 +68,7 @@ def test_local_credentials(local_credentials: dict[str, str]) -> None: @patch.dict('os.environ', {'MSSQL_USER_ENV': 'env_user', 'MSSQL_PASSWORD_ENV': 'env_password'}) def test_env_credentials(env_credentials: dict[str, str]) -> None: - credentials = create_credential_manager(env_credentials) + credentials = CredentialManager.from_credentials(env_credentials) creds = credentials.get_credentials('mssql') assert creds['user'] == 'env_user' assert creds['password'] == 'env_password' @@ -78,7 +78,7 @@ def test_databricks_credentials(databricks_credentials: dict[str, str], mock_wor mock_workspace_client.secrets.get_secret.return_value = GetSecretResponse( key='some_key', value=base64.b64encode(bytes('some_secret', 'utf-8')).decode('utf-8') ) - credentials = create_credential_manager(databricks_credentials, mock_workspace_client) + credentials = CredentialManager.from_credentials(databricks_credentials, mock_workspace_client) creds = credentials.get_credentials('mssql') assert creds['user'] == 'some_secret' assert creds['password'] == 'some_secret' @@ -86,14 +86,14 @@ def test_databricks_credentials(databricks_credentials: dict[str, str], mock_wor def test_databricks_credentials_not_found(databricks_credentials: dict[str, str], mock_workspace_client) -> None: mock_workspace_client.secrets.get_secret.side_effect = NotFound("Test Exception") - credentials = create_credential_manager(databricks_credentials, mock_workspace_client) + credentials = CredentialManager.from_credentials(databricks_credentials, mock_workspace_client) with pytest.raises(KeyError, match="Secret does not exist with scope: databricks_vault_name and key: db_key"): credentials.get_credentials("mssql") def test_databricks_invalid_key(databricks_invalid_key: dict[str, str], mock_workspace_client) -> None: - credentials = create_credential_manager(databricks_invalid_key, mock_workspace_client) + credentials = CredentialManager.from_credentials(databricks_invalid_key, mock_workspace_client) with pytest.raises(ValueError, match="Secret key must be in the format 'scope/secret': Got without_scope"): credentials.get_credentials("mssql") From 4870029eb8c6d237c3b2530b95a9f5745dc656d9 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Mon, 8 Dec 2025 19:17:20 +0100 Subject: [PATCH 28/44] allow databricks vaults only --- src/databricks/labs/lakebridge/config.py | 2 +- tests/integration/reconcile/query_builder/test_execute.py | 4 ++-- tests/integration/reconcile/test_oracle_reconcile.py | 2 +- tests/unit/deployment/test_installation.py | 6 +++--- tests/unit/deployment/test_job.py | 4 ++-- 
tests/unit/deployment/test_recon.py | 4 ++-- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/databricks/labs/lakebridge/config.py b/src/databricks/labs/lakebridge/config.py index 0b39b64838..ac3c2b25c0 100644 --- a/src/databricks/labs/lakebridge/config.py +++ b/src/databricks/labs/lakebridge/config.py @@ -257,7 +257,7 @@ class ReconcileCredentialConfig: source_creds: dict[str, str] def __post_init__(self): - if self.vault_type not in {"local", "env", "databricks"}: + if self.vault_type != "databricks": raise ValueError(f"Unsupported vault_type: {self.vault_type}") diff --git a/tests/integration/reconcile/query_builder/test_execute.py b/tests/integration/reconcile/query_builder/test_execute.py index 07335dd3e6..935d35eb1c 100644 --- a/tests/integration/reconcile/query_builder/test_execute.py +++ b/tests/integration/reconcile/query_builder/test_execute.py @@ -1141,7 +1141,7 @@ def mock_for_report_type_all( reconcile_config_all = ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), + creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, @@ -1416,7 +1416,7 @@ def mock_for_report_type_row( reconcile_config_row = ReconcileConfig( data_source="snowflake", report_type="row", - creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), + creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, diff --git a/tests/integration/reconcile/test_oracle_reconcile.py b/tests/integration/reconcile/test_oracle_reconcile.py index 5393eb3ae7..b47468a6aa 100644 --- a/tests/integration/reconcile/test_oracle_reconcile.py +++ b/tests/integration/reconcile/test_oracle_reconcile.py @@ -51,7 +51,7 @@ def test_oracle_db_reconcile(mock_spark, mock_workspace_client, tmp_path): reconcile_config = ReconcileConfig( data_source="oracle", report_type=report, - creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), + creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), database_config=db_config, metadata_config=ReconcileMetadataConfig(catalog="tmp", schema="reconcile"), ) diff --git a/tests/unit/deployment/test_installation.py b/tests/unit/deployment/test_installation.py index f13ede1f23..0e24ba9b0b 100644 --- a/tests/unit/deployment/test_installation.py +++ b/tests/unit/deployment/test_installation.py @@ -56,7 +56,7 @@ def test_install_all(ws): reconcile_config = ReconcileConfig( data_source="oracle", report_type="all", - creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), + creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_schema="tpch_sf10006", target_catalog="tpch6", @@ -111,7 +111,7 @@ def test_recon_component_installation(ws): reconcile_config = ReconcileConfig( data_source="oracle", report_type="all", - creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), + creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_schema="tpch_sf10008", target_catalog="tpch8", @@ -194,7 +194,7 @@ def test_uninstall_configs_exist(ws): reconcile_config = 
ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), + creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog="snowflake_sample_data1", source_schema="tpch_sf10001", diff --git a/tests/unit/deployment/test_job.py b/tests/unit/deployment/test_job.py index 39e8849180..bdde15b202 100644 --- a/tests/unit/deployment/test_job.py +++ b/tests/unit/deployment/test_job.py @@ -23,7 +23,7 @@ def oracle_recon_config() -> ReconcileConfig: return ReconcileConfig( data_source="oracle", report_type="all", - creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), + creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_schema="tpch_sf10009", target_catalog="tpch9", @@ -42,7 +42,7 @@ def snowflake_recon_config() -> ReconcileConfig: return ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), + creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_schema="tpch_sf10009", target_catalog="tpch9", diff --git a/tests/unit/deployment/test_recon.py b/tests/unit/deployment/test_recon.py index 4eed4862fe..c0ca1bacba 100644 --- a/tests/unit/deployment/test_recon.py +++ b/tests/unit/deployment/test_recon.py @@ -57,7 +57,7 @@ def test_install(ws): reconcile_config = ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), + creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog="snowflake_sample_data4", source_schema="tpch_sf10004", @@ -150,7 +150,7 @@ def test_uninstall(ws): recon_config = ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialConfig(vault_type="local", source_creds={"fake": "fake"}), + creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog="snowflake_sample_data5", source_schema="tpch_sf10005", From 3507cca4aa5253a76f7dd6a0ff4cb4c270b80150 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Mon, 15 Dec 2025 11:30:01 +0100 Subject: [PATCH 29/44] rename class and property --- src/databricks/labs/lakebridge/config.py | 8 +++--- src/databricks/labs/lakebridge/install.py | 6 +++-- .../reconcile/trigger_recon_service.py | 2 +- .../reconcile/query_builder/test_execute.py | 14 +++++----- .../reconcile/test_oracle_reconcile.py | 4 +-- tests/unit/deployment/test_installation.py | 8 +++--- tests/unit/deployment/test_job.py | 6 ++--- tests/unit/deployment/test_recon.py | 6 ++--- tests/unit/test_install.py | 26 +++++++++++-------- 9 files changed, 43 insertions(+), 37 deletions(-) diff --git a/src/databricks/labs/lakebridge/config.py b/src/databricks/labs/lakebridge/config.py index ac3c2b25c0..4088e7911e 100644 --- a/src/databricks/labs/lakebridge/config.py +++ b/src/databricks/labs/lakebridge/config.py @@ -252,9 +252,9 @@ class ReconcileMetadataConfig: @dataclass -class ReconcileCredentialConfig: +class ReconcileCredentialsConfig: vault_type: str - source_creds: dict[str, str] + vault_secret_names: dict[str, str] def __post_init__(self): if self.vault_type 
!= "databricks": @@ -268,7 +268,7 @@ class ReconcileConfig: data_source: str report_type: str - creds: ReconcileCredentialConfig + creds: ReconcileCredentialsConfig database_config: DatabaseConfig metadata_config: ReconcileMetadataConfig @@ -276,7 +276,7 @@ class ReconcileConfig: def v1_migrate(cls, raw: dict[str, JsonValue]) -> dict[str, JsonValue]: secret_scope = raw.pop("secret_scope") raw["version"] = 2 - raw["creds"] = {"vault_type": "databricks", "source_creds": {"__secret_scope": secret_scope}} + raw["creds"] = {"vault_type": "databricks", "vault_secret_names": {"__secret_scope": secret_scope}} return raw diff --git a/src/databricks/labs/lakebridge/install.py b/src/databricks/labs/lakebridge/install.py index 9d5cdb2c91..0d1970a1b0 100644 --- a/src/databricks/labs/lakebridge/install.py +++ b/src/databricks/labs/lakebridge/install.py @@ -20,7 +20,7 @@ LakebridgeConfiguration, ReconcileMetadataConfig, TranspileConfig, - ReconcileCredentialConfig, + ReconcileCredentialsConfig, ) from databricks.labs.lakebridge.contexts.application import ApplicationContext from databricks.labs.lakebridge.deployment.configurator import ResourceConfigurator @@ -337,7 +337,9 @@ def _prompt_for_new_reconcile_installation(self) -> ReconcileConfig: return ReconcileConfig( data_source=data_source, report_type=report_type, - creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": scope_name}), + creds=ReconcileCredentialsConfig( + vault_type="databricks", vault_secret_names={"__secret_scope": scope_name} + ), database_config=db_config, metadata_config=metadata_config, ) diff --git a/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py b/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py index ffa5184319..ae631fda49 100644 --- a/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py +++ b/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py @@ -74,7 +74,7 @@ def create_recon_dependencies( engine=reconcile_config.data_source, spark=spark, ws=ws_client, - secret_scope=reconcile_config.creds.source_creds["__secret_scope"], + secret_scope=reconcile_config.creds.vault_secret_names["__secret_scope"], ) recon_id = str(uuid4()) diff --git a/tests/integration/reconcile/query_builder/test_execute.py b/tests/integration/reconcile/query_builder/test_execute.py index 935d35eb1c..04b2196085 100644 --- a/tests/integration/reconcile/query_builder/test_execute.py +++ b/tests/integration/reconcile/query_builder/test_execute.py @@ -13,7 +13,7 @@ TableRecon, ReconcileMetadataConfig, ReconcileConfig, - ReconcileCredentialConfig, + ReconcileCredentialsConfig, ) from databricks.labs.lakebridge.reconcile.reconciliation import Reconciliation from databricks.labs.lakebridge.reconcile.trigger_recon_service import TriggerReconService @@ -732,7 +732,7 @@ def mock_for_report_type_data( reconcile_config_data = ReconcileConfig( data_source="databricks", report_type="data", - creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), + creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, @@ -929,7 +929,7 @@ def mock_for_report_type_schema( reconcile_config_schema = ReconcileConfig( data_source="databricks", report_type="schema", - creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), + creds=ReconcileCredentialsConfig(vault_type="databricks", 
vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, @@ -1141,7 +1141,7 @@ def mock_for_report_type_all( reconcile_config_all = ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), + creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, @@ -1416,7 +1416,7 @@ def mock_for_report_type_row( reconcile_config_row = ReconcileConfig( data_source="snowflake", report_type="row", - creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), + creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, @@ -1562,7 +1562,7 @@ def mock_for_recon_exception(normalized_table_conf_with_opts, setup_metadata_tab reconcile_config_exception = ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), + creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, @@ -2022,7 +2022,7 @@ def test_recon_output_without_exception(mock_gen_final_recon_output): reconcile_config = ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), + creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, diff --git a/tests/integration/reconcile/test_oracle_reconcile.py b/tests/integration/reconcile/test_oracle_reconcile.py index b47468a6aa..fd6aeb03a0 100644 --- a/tests/integration/reconcile/test_oracle_reconcile.py +++ b/tests/integration/reconcile/test_oracle_reconcile.py @@ -8,7 +8,7 @@ DatabaseConfig, ReconcileMetadataConfig, ReconcileConfig, - ReconcileCredentialConfig, + ReconcileCredentialsConfig, ) from databricks.labs.lakebridge.reconcile.connectors.databricks import DatabricksDataSource from databricks.labs.lakebridge.reconcile.recon_capture import ReconCapture @@ -51,7 +51,7 @@ def test_oracle_db_reconcile(mock_spark, mock_workspace_client, tmp_path): reconcile_config = ReconcileConfig( data_source="oracle", report_type=report, - creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), + creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=db_config, metadata_config=ReconcileMetadataConfig(catalog="tmp", schema="reconcile"), ) diff --git a/tests/unit/deployment/test_installation.py b/tests/unit/deployment/test_installation.py index 0e24ba9b0b..61647fac71 100644 --- a/tests/unit/deployment/test_installation.py +++ b/tests/unit/deployment/test_installation.py @@ -16,7 +16,7 @@ ReconcileConfig, DatabaseConfig, ReconcileMetadataConfig, - ReconcileCredentialConfig, + ReconcileCredentialsConfig, ) from databricks.labs.lakebridge.deployment.installation import WorkspaceInstallation from databricks.labs.lakebridge.deployment.recon import ReconDeployment @@ -56,7 +56,7 @@ def test_install_all(ws): 
reconcile_config = ReconcileConfig( data_source="oracle", report_type="all", - creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), + creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_schema="tpch_sf10006", target_catalog="tpch6", @@ -111,7 +111,7 @@ def test_recon_component_installation(ws): reconcile_config = ReconcileConfig( data_source="oracle", report_type="all", - creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), + creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_schema="tpch_sf10008", target_catalog="tpch8", @@ -194,7 +194,7 @@ def test_uninstall_configs_exist(ws): reconcile_config = ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), + creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog="snowflake_sample_data1", source_schema="tpch_sf10001", diff --git a/tests/unit/deployment/test_job.py b/tests/unit/deployment/test_job.py index bdde15b202..3e60a8058f 100644 --- a/tests/unit/deployment/test_job.py +++ b/tests/unit/deployment/test_job.py @@ -13,7 +13,7 @@ ReconcileConfig, DatabaseConfig, ReconcileMetadataConfig, - ReconcileCredentialConfig, + ReconcileCredentialsConfig, ) from databricks.labs.lakebridge.deployment.job import JobDeployment @@ -23,7 +23,7 @@ def oracle_recon_config() -> ReconcileConfig: return ReconcileConfig( data_source="oracle", report_type="all", - creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), + creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_schema="tpch_sf10009", target_catalog="tpch9", @@ -42,7 +42,7 @@ def snowflake_recon_config() -> ReconcileConfig: return ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), + creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_schema="tpch_sf10009", target_catalog="tpch9", diff --git a/tests/unit/deployment/test_recon.py b/tests/unit/deployment/test_recon.py index c0ca1bacba..a53a82134b 100644 --- a/tests/unit/deployment/test_recon.py +++ b/tests/unit/deployment/test_recon.py @@ -13,7 +13,7 @@ ReconcileConfig, DatabaseConfig, ReconcileMetadataConfig, - ReconcileCredentialConfig, + ReconcileCredentialsConfig, ) from databricks.labs.lakebridge.deployment.dashboard import DashboardDeployment from databricks.labs.lakebridge.deployment.job import JobDeployment @@ -57,7 +57,7 @@ def test_install(ws): reconcile_config = ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), + creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog="snowflake_sample_data4", source_schema="tpch_sf10004", @@ -150,7 +150,7 @@ def test_uninstall(ws): recon_config = ReconcileConfig( data_source="snowflake", 
report_type="all", - creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "fake"}), + creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog="snowflake_sample_data5", source_schema="tpch_sf10005", diff --git a/tests/unit/test_install.py b/tests/unit/test_install.py index 0c80635485..7dc62b43a9 100644 --- a/tests/unit/test_install.py +++ b/tests/unit/test_install.py @@ -17,7 +17,7 @@ ReconcileConfig, ReconcileMetadataConfig, TranspileConfig, - ReconcileCredentialConfig, + ReconcileCredentialsConfig, ) from databricks.labs.lakebridge.contexts.application import ApplicationContext from databricks.labs.lakebridge.deployment.configurator import ResourceConfigurator @@ -662,7 +662,9 @@ def test_configure_reconcile_installation_config_error_continue_install(ws: Work reconcile=ReconcileConfig( data_source="oracle", report_type="all", - creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "remorph_oracle"}), + creds=ReconcileCredentialsConfig( + vault_type="databricks", vault_secret_names={"__secret_scope": "remorph_oracle"} + ), database_config=DatabaseConfig( source_schema="tpch_sf1000", target_catalog="tpch", @@ -684,7 +686,7 @@ def test_configure_reconcile_installation_config_error_continue_install(ws: Work "report_type": "all", "creds": { "vault_type": "databricks", - "source_creds": {"__secret_scope": "remorph_oracle"}, + "vault_secret_names": {"__secret_scope": "remorph_oracle"}, }, "database_config": { "source_schema": "tpch_sf1000", @@ -744,8 +746,8 @@ def test_configure_reconcile_no_existing_installation(ws: WorkspaceClient) -> No reconcile=ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialConfig( - vault_type="databricks", source_creds={"__secret_scope": "remorph_snowflake"} + creds=ReconcileCredentialsConfig( + vault_type="databricks", vault_secret_names={"__secret_scope": "remorph_snowflake"} ), database_config=DatabaseConfig( source_schema="tpch_sf1000", @@ -769,7 +771,7 @@ def test_configure_reconcile_no_existing_installation(ws: WorkspaceClient) -> No "report_type": "all", "creds": { "vault_type": "databricks", - "source_creds": {"__secret_scope": "remorph_snowflake"}, + "vault_secret_names": {"__secret_scope": "remorph_snowflake"}, }, "database_config": { "source_catalog": "snowflake_sample_data", @@ -841,8 +843,8 @@ def test_configure_reconcile_databricks_no_existing_installation(ws: WorkspaceCl schema="reconcile", volume="reconcile_volume", ), - creds=ReconcileCredentialConfig( - vault_type="databricks", source_creds={"__secret_scope": "remorph_databricks"} + creds=ReconcileCredentialsConfig( + vault_type="databricks", vault_secret_names={"__secret_scope": "remorph_databricks"} ), ), transpile=None, @@ -855,7 +857,7 @@ def test_configure_reconcile_databricks_no_existing_installation(ws: WorkspaceCl "report_type": "all", "creds": { "vault_type": "databricks", - "source_creds": {"__secret_scope": "remorph_databricks"}, + "vault_secret_names": {"__secret_scope": "remorph_databricks"}, }, "database_config": { "source_catalog": "databricks_catalog", @@ -971,7 +973,9 @@ def test_configure_all_override_installation( expected_reconcile_config = ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialConfig(vault_type="databricks", source_creds={"__secret_scope": "remorph_snowflake"}), + creds=ReconcileCredentialsConfig( + vault_type="databricks", 
vault_secret_names={"__secret_scope": "remorph_snowflake"} + ), database_config=DatabaseConfig( source_schema="tpch_sf1000", target_catalog="tpch", @@ -1008,7 +1012,7 @@ def test_configure_all_override_installation( "report_type": "all", "creds": { "vault_type": "databricks", - "source_creds": {"__secret_scope": "remorph_snowflake"}, + "vault_secret_names": {"__secret_scope": "remorph_snowflake"}, }, "database_config": { "source_catalog": "snowflake_sample_data", From fd793ff1b05121022a3ec247c30f73c8d5f5f08d Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Mon, 15 Dec 2025 11:44:37 +0100 Subject: [PATCH 30/44] add utility method --- src/databricks/labs/lakebridge/config.py | 4 ++++ .../labs/lakebridge/reconcile/trigger_recon_service.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/lakebridge/config.py b/src/databricks/labs/lakebridge/config.py index 4088e7911e..9f49f89a35 100644 --- a/src/databricks/labs/lakebridge/config.py +++ b/src/databricks/labs/lakebridge/config.py @@ -260,6 +260,10 @@ def __post_init__(self): if self.vault_type != "databricks": raise ValueError(f"Unsupported vault_type: {self.vault_type}") + def get_databricks_secret_scope(self) -> str: + """Utility to support older installations that only allowed secret scopes.""" + return self.vault_secret_names["__secret_scope"] + @dataclass class ReconcileConfig: diff --git a/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py b/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py index ae631fda49..f11f32a60a 100644 --- a/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py +++ b/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py @@ -74,7 +74,7 @@ def create_recon_dependencies( engine=reconcile_config.data_source, spark=spark, ws=ws_client, - secret_scope=reconcile_config.creds.vault_secret_names["__secret_scope"], + secret_scope=reconcile_config.creds.get_databricks_secret_scope(), ) recon_id = str(uuid4()) From f1fed749715c3cbd98724eb8d8ef27353f1bdfa8 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Wed, 17 Dec 2025 12:54:42 +0100 Subject: [PATCH 31/44] add TODO --- .../labs/lakebridge/connections/credential_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/lakebridge/connections/credential_manager.py b/src/databricks/labs/lakebridge/connections/credential_manager.py index 60f71c36ac..77a186bfc6 100644 --- a/src/databricks/labs/lakebridge/connections/credential_manager.py +++ b/src/databricks/labs/lakebridge/connections/credential_manager.py @@ -46,12 +46,11 @@ def get_secret(self, key: str) -> str: Args: key: key in the format 'scope/secret' Returns: - The decoded UTF-8 secret value. + The secret value. Raises: ValueError: The secret key must be in the format 'scope/secret'. KeyError: The secret could not be found. - UnicodeDecodeError: The secret value was not Base64-encoded UTF-8. """ match key.split(sep="/", maxsplit=3): case _scope, _key_only: @@ -66,6 +65,7 @@ def get_secret(self, key: str) -> str: assert secret.value is not None return base64.b64decode(secret.value).decode("utf-8") except NotFound as e: + # TODO do not raise KeyError and standardize across all secret providers. Caller should handle missing secrets. 
raise KeyError(f'Secret does not exist with scope: {scope} and key: {key_only}') from e except UnicodeDecodeError as e: msg = f"Secret {key} has Base64 bytes that cannot be decoded to UTF-8 string" From 77d3741d9bce8460356825e572830e468d5211d0 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Wed, 17 Dec 2025 22:25:10 +0100 Subject: [PATCH 32/44] add unit test after introducing fixtures to remove duplicate code warning --- tests/conftest.py | 75 +++++++++ tests/unit/test_config.py | 21 ++- tests/unit/test_install.py | 307 +++++-------------------------------- 3 files changed, 131 insertions(+), 272 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 2b2419231e..a6a9639bbe 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,6 +17,12 @@ from databricks.sdk import WorkspaceClient from databricks.sdk.service import iam +from databricks.labs.lakebridge.config import ( + ReconcileConfig, + ReconcileCredentialsConfig, + DatabaseConfig, + ReconcileMetadataConfig, +) from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource, MockDataSource @@ -420,3 +426,72 @@ def table_schema_tsql_ansi(table_schema): src_schema = [tsql_schema_fixture_factory(s.column_name, s.data_type) for s in src_schema] tgt_schema = [ansi_schema_fixture_factory(s.column_name, s.data_type) for s in tgt_schema] return src_schema, tgt_schema + + +@pytest.fixture +def reconcile_config(datasource: str) -> ReconcileConfig: + return ReconcileConfig( + data_source=datasource, + report_type="all", + creds=ReconcileCredentialsConfig( + vault_type="databricks", vault_secret_names={"__secret_scope": f"remorph_{datasource}"} + ), + database_config=DatabaseConfig( + source_schema="tpch_sf1000", + target_catalog="tpch", + target_schema="1000gb", + source_catalog=f"{datasource}_sample_data", + ), + metadata_config=ReconcileMetadataConfig( + catalog="remorph", + schema="reconcile", + volume="reconcile_volume", + ), + ) + + +@pytest.fixture +def reconcile_config_v1_yml(datasource: str) -> dict: + return { + "reconcile.yml": { + "data_source": datasource, + "report_type": "all", + "secret_scope": f"remorph_{datasource}", # v1 + "database_config": { + "source_catalog": f"{datasource}_sample_data", + "source_schema": "tpch_sf1000", + "target_catalog": "tpch", + "target_schema": "1000gb", + }, + "metadata_config": { + "catalog": "remorph", + "schema": "reconcile", + "volume": "reconcile_volume", + }, + "version": 1, + }, + } + + +@pytest.fixture +def reconcile_config_v2_yml(datasource: str) -> dict: + return { + "data_source": datasource, + "report_type": "all", + "creds": { + "vault_type": "databricks", + "vault_secret_names": {"__secret_scope": f"remorph_{datasource}"}, + }, + "database_config": { + "source_catalog": f"{datasource}_sample_data", + "source_schema": "tpch_sf1000", + "target_catalog": "tpch", + "target_schema": "1000gb", + }, + "metadata_config": { + "catalog": "remorph", + "schema": "reconcile", + "volume": "reconcile_volume", + }, + "version": 2, + } diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index 3202bb86f3..033cd1b1e1 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -1,6 +1,12 @@ +import pytest + from databricks.labs.blueprint.installation import MockInstallation -from databricks.labs.lakebridge.config import TranspileConfig, TableRecon +from 
databricks.labs.lakebridge.config import ( + TranspileConfig, + TableRecon, + ReconcileConfig, +) from databricks.labs.lakebridge.reconcile.recon_config import Table @@ -95,3 +101,16 @@ def test_reconcile_table_config_default_serialization() -> None: loaded = installation.load(TableRecon) assert loaded.tables == config.tables + + +@pytest.mark.parametrize("datasource", ["snowflake"]) +def test_reconcile_config_default_serialization( + datasource, reconcile_config: ReconcileConfig, reconcile_config_v1_yml: dict +) -> None: + """Verify that older config that had extra keys still works""" + installation = MockInstallation( + reconcile_config_v1_yml, + ) + + loaded = installation.load(ReconcileConfig) + assert loaded == reconcile_config diff --git a/tests/unit/test_install.py b/tests/unit/test_install.py index 7dc62b43a9..65cfadbd14 100644 --- a/tests/unit/test_install.py +++ b/tests/unit/test_install.py @@ -9,15 +9,13 @@ from databricks.sdk.service import iam from databricks.labs.blueprint.tui import MockPrompts from databricks.labs.blueprint.wheels import ProductInfo, WheelsV2 + from databricks.labs.lakebridge.config import ( - DatabaseConfig, LSPConfigOptionV1, LSPPromptMethod, LakebridgeConfiguration, ReconcileConfig, - ReconcileMetadataConfig, TranspileConfig, - ReconcileCredentialsConfig, ) from databricks.labs.lakebridge.contexts.application import ApplicationContext from databricks.labs.lakebridge.deployment.configurator import ResourceConfigurator @@ -555,7 +553,7 @@ def test_configure_transpile_installation_with_validation_and_warehouse_id_from_ ) -def test_configure_reconcile_installation_no_override(ws: WorkspaceClient) -> None: +def test_configure_reconcile_installation_no_override(ws: WorkspaceClient, reconcile_config_v1_yml: dict) -> None: prompts = MockPrompts( { r"Do you want to override the existing installation?": "no", @@ -566,27 +564,7 @@ def test_configure_reconcile_installation_no_override(ws: WorkspaceClient) -> No prompts=prompts, resource_configurator=create_autospec(ResourceConfigurator), workspace_installation=create_autospec(WorkspaceInstallation), - installation=MockInstallation( - { - "reconcile.yml": { - "data_source": "snowflake", - "report_type": "all", - "secret_scope": "remorph_snowflake", - "database_config": { - "source_catalog": "snowflake_sample_data", - "source_schema": "tpch_sf1000", - "target_catalog": "tpch", - "target_schema": "1000gb", - }, - "metadata_config": { - "catalog": "remorph", - "schema": "reconcile", - "volume": "reconcile_volume", - }, - "version": 1, - } - } - ), + installation=MockInstallation(reconcile_config_v1_yml), ) workspace_installer = WorkspaceInstaller( ctx.workspace_client, @@ -601,12 +579,19 @@ def test_configure_reconcile_installation_no_override(ws: WorkspaceClient) -> No workspace_installer.configure(module="reconcile") -def test_configure_reconcile_installation_config_error_continue_install(ws: WorkspaceClient) -> None: +@pytest.mark.parametrize("datasource", ["oracle"]) +def test_configure_reconcile_installation_config_error_continue_install( + datasource: str, + ws: WorkspaceClient, + reconcile_config: ReconcileConfig, + reconcile_config_v2_yml: dict, + reconcile_config_v1_yml: dict, +) -> None: prompts = MockPrompts( { - r"Select the Data Source": str(RECONCILE_DATA_SOURCES.index("oracle")), + r"Select the Data Source": str(RECONCILE_DATA_SOURCES.index(datasource)), r"Select the report type": str(RECONCILE_REPORT_TYPES.index("all")), - r"Enter Secret scope name to store .* connection details / secrets": 
"remorph_oracle", + r"Enter Secret scope name to store .* connection details / secrets": f"remorph_{datasource}", r"Enter source database name for .*": "tpch_sf1000", r"Enter target catalog name for Databricks": "tpch", r"Enter target schema name for Databricks": "1000gb", @@ -617,20 +602,8 @@ def test_configure_reconcile_installation_config_error_continue_install(ws: Work { "reconcile.yml": { "source_dialect": "oracle", # Invalid key - "report_type": "all", - "secret_scope": "remorph_oracle", # version 1 - "database_config": { - "source_schema": "tpch_sf1000", - "target_catalog": "tpch", - "target_schema": "1000gb", - }, - "metadata_config": { - "catalog": "remorph", - "schema": "reconcile", - "volume": "reconcile_volume", - }, - "version": 1, - } + **reconcile_config_v1_yml["reconcile.yml"], + }, } ) @@ -658,59 +631,29 @@ def test_configure_reconcile_installation_config_error_continue_install(ws: Work ) config = workspace_installer.configure(module="reconcile") + reconcile_config.database_config.source_catalog = None expected_config = LakebridgeConfiguration( - reconcile=ReconcileConfig( - data_source="oracle", - report_type="all", - creds=ReconcileCredentialsConfig( - vault_type="databricks", vault_secret_names={"__secret_scope": "remorph_oracle"} - ), - database_config=DatabaseConfig( - source_schema="tpch_sf1000", - target_catalog="tpch", - target_schema="1000gb", - ), - metadata_config=ReconcileMetadataConfig( - catalog="remorph", - schema="reconcile", - volume="reconcile_volume", - ), - ), + reconcile=reconcile_config, transpile=None, ) assert config == expected_config - installation.assert_file_written( - "reconcile.yml", - { - "data_source": "oracle", - "report_type": "all", - "creds": { - "vault_type": "databricks", - "vault_secret_names": {"__secret_scope": "remorph_oracle"}, - }, - "database_config": { - "source_schema": "tpch_sf1000", - "target_catalog": "tpch", - "target_schema": "1000gb", - }, - "metadata_config": { - "catalog": "remorph", - "schema": "reconcile", - "volume": "reconcile_volume", - }, - "version": 2, - }, - ) + dbc = reconcile_config_v2_yml["database_config"] + dbc.pop("source_catalog") + reconcile_config_v2_yml["database_config"] = dbc + installation.assert_file_written("reconcile.yml", reconcile_config_v2_yml) +@pytest.mark.parametrize("datasource", ["snowflake", "databricks"]) @patch("webbrowser.open") -def test_configure_reconcile_no_existing_installation(ws: WorkspaceClient) -> None: +def test_configure_reconcile_no_existing_installation( + _, datasource: str, ws: WorkspaceClient, reconcile_config: ReconcileConfig, reconcile_config_v2_yml: dict +) -> None: prompts = MockPrompts( { - r"Select the Data Source": str(RECONCILE_DATA_SOURCES.index("snowflake")), + r"Select the Data Source": str(RECONCILE_DATA_SOURCES.index(datasource)), r"Select the report type": str(RECONCILE_REPORT_TYPES.index("all")), - r"Enter Secret scope name to store .* connection details / secrets": "remorph_snowflake", - r"Enter source catalog name for .*": "snowflake_sample_data", + r"Enter Secret scope name to store .* connection details / secrets": f"remorph_{datasource}", + r"Enter source catalog name for .*": f"{datasource}_sample_data", r"Enter source schema name for .*": "tpch_sf1000", r"Enter target catalog name for Databricks": "tpch", r"Enter target schema name for Databricks": "1000gb", @@ -743,141 +686,19 @@ def test_configure_reconcile_no_existing_installation(ws: WorkspaceClient) -> No config = workspace_installer.configure(module="reconcile") expected_config = 
LakebridgeConfiguration( - reconcile=ReconcileConfig( - data_source="snowflake", - report_type="all", - creds=ReconcileCredentialsConfig( - vault_type="databricks", vault_secret_names={"__secret_scope": "remorph_snowflake"} - ), - database_config=DatabaseConfig( - source_schema="tpch_sf1000", - target_catalog="tpch", - target_schema="1000gb", - source_catalog="snowflake_sample_data", - ), - metadata_config=ReconcileMetadataConfig( - catalog="remorph", - schema="reconcile", - volume="reconcile_volume", - ), - ), - transpile=None, - ) - assert config == expected_config - installation.assert_file_written( - "reconcile.yml", - { - "data_source": "snowflake", - "report_type": "all", - "creds": { - "vault_type": "databricks", - "vault_secret_names": {"__secret_scope": "remorph_snowflake"}, - }, - "database_config": { - "source_catalog": "snowflake_sample_data", - "source_schema": "tpch_sf1000", - "target_catalog": "tpch", - "target_schema": "1000gb", - }, - "metadata_config": { - "catalog": "remorph", - "schema": "reconcile", - "volume": "reconcile_volume", - }, - "version": 2, - }, - ) - - -@patch("webbrowser.open") -def test_configure_reconcile_databricks_no_existing_installation(ws: WorkspaceClient) -> None: - prompts = MockPrompts( - { - r"Select the Data Source": str(RECONCILE_DATA_SOURCES.index("databricks")), - r"Enter Secret scope name to store .* connection details / secrets": "remorph_databricks", - r"Select the report type": str(RECONCILE_REPORT_TYPES.index("all")), - r"Enter source catalog name for .*": "databricks_catalog", - r"Enter source schema name for .*": "some_schema", - r"Enter target catalog name for Databricks": "tpch", - r"Enter target schema name for Databricks": "1000gb", - r"Open .* in the browser?": "yes", - } - ) - installation = MockInstallation() - resource_configurator = create_autospec(ResourceConfigurator) - resource_configurator.prompt_for_catalog_setup.return_value = "remorph" - resource_configurator.prompt_for_schema_setup.return_value = "reconcile" - resource_configurator.prompt_for_volume_setup.return_value = "reconcile_volume" - - ctx = ApplicationContext(ws) - ctx.replace( - prompts=prompts, - installation=installation, - resource_configurator=resource_configurator, - workspace_installation=create_autospec(WorkspaceInstallation), - ) - - workspace_installer = WorkspaceInstaller( - ctx.workspace_client, - ctx.prompts, - ctx.installation, - ctx.install_state, - ctx.product_info, - ctx.resource_configurator, - ctx.workspace_installation, - ) - config = workspace_installer.configure(module="reconcile") - - expected_config = LakebridgeConfiguration( - reconcile=ReconcileConfig( - data_source="databricks", - report_type="all", - database_config=DatabaseConfig( - source_schema="some_schema", - target_catalog="tpch", - target_schema="1000gb", - source_catalog="databricks_catalog", - ), - metadata_config=ReconcileMetadataConfig( - catalog="remorph", - schema="reconcile", - volume="reconcile_volume", - ), - creds=ReconcileCredentialsConfig( - vault_type="databricks", vault_secret_names={"__secret_scope": "remorph_databricks"} - ), - ), + reconcile=reconcile_config, transpile=None, ) assert config == expected_config - installation.assert_file_written( - "reconcile.yml", - { - "data_source": "databricks", - "report_type": "all", - "creds": { - "vault_type": "databricks", - "vault_secret_names": {"__secret_scope": "remorph_databricks"}, - }, - "database_config": { - "source_catalog": "databricks_catalog", - "source_schema": "some_schema", - "target_catalog": "tpch", - 
"target_schema": "1000gb", - }, - "metadata_config": { - "catalog": "remorph", - "schema": "reconcile", - "volume": "reconcile_volume", - }, - "version": 2, - }, - ) + installation.assert_file_written("reconcile.yml", reconcile_config_v2_yml) def test_configure_all_override_installation( ws_installer: Callable[..., WorkspaceInstaller], ws: WorkspaceClient, + reconcile_config: ReconcileConfig, + reconcile_config_v1_yml: dict, + reconcile_config_v2_yml: dict, ) -> None: prompts = MockPrompts( { @@ -913,23 +734,7 @@ def test_configure_all_override_installation( }, "version": 3, }, - "reconcile.yml": { - "data_source": "snowflake", - "report_type": "all", - "secret_scope": "remorph_snowflake", # v1 - "database_config": { - "source_catalog": "snowflake_sample_data", - "source_schema": "tpch_sf1000", - "target_catalog": "tpch", - "target_schema": "1000gb", - }, - "metadata_config": { - "catalog": "remorph", - "schema": "reconcile", - "volume": "reconcile_volume", - }, - "version": 1, - }, + **reconcile_config_v1_yml, } ) @@ -970,25 +775,7 @@ def test_configure_all_override_installation( schema_name="transpiler", ) - expected_reconcile_config = ReconcileConfig( - data_source="snowflake", - report_type="all", - creds=ReconcileCredentialsConfig( - vault_type="databricks", vault_secret_names={"__secret_scope": "remorph_snowflake"} - ), - database_config=DatabaseConfig( - source_schema="tpch_sf1000", - target_catalog="tpch", - target_schema="1000gb", - source_catalog="snowflake_sample_data", - ), - metadata_config=ReconcileMetadataConfig( - catalog="remorph", - schema="reconcile", - volume="reconcile_volume", - ), - ) - expected_config = LakebridgeConfiguration(transpile=expected_transpile_config, reconcile=expected_reconcile_config) + expected_config = LakebridgeConfiguration(transpile=expected_transpile_config, reconcile=reconcile_config) assert config == expected_config installation.assert_file_written( "config.yml", @@ -1005,29 +792,7 @@ def test_configure_all_override_installation( }, ) - installation.assert_file_written( - "reconcile.yml", - { - "data_source": "snowflake", - "report_type": "all", - "creds": { - "vault_type": "databricks", - "vault_secret_names": {"__secret_scope": "remorph_snowflake"}, - }, - "database_config": { - "source_catalog": "snowflake_sample_data", - "source_schema": "tpch_sf1000", - "target_catalog": "tpch", - "target_schema": "1000gb", - }, - "metadata_config": { - "catalog": "remorph", - "schema": "reconcile", - "volume": "reconcile_volume", - }, - "version": 2, - }, - ) + installation.assert_file_written("reconcile.yml", reconcile_config_v2_yml) def test_runs_upgrades_on_more_recent_version( From ed200383cc99f2b3f66b04fe049930eec8984cf4 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Wed, 17 Dec 2025 22:28:59 +0100 Subject: [PATCH 33/44] add missing spec parameter needed for fixture --- tests/unit/test_install.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_install.py b/tests/unit/test_install.py index 65cfadbd14..c8d6b8c281 100644 --- a/tests/unit/test_install.py +++ b/tests/unit/test_install.py @@ -552,7 +552,7 @@ def test_configure_transpile_installation_with_validation_and_warehouse_id_from_ }, ) - +@pytest.mark.parametrize("datasource", ["snowflake"]) def test_configure_reconcile_installation_no_override(ws: WorkspaceClient, reconcile_config_v1_yml: dict) -> None: prompts = MockPrompts( { @@ -692,7 +692,7 @@ def test_configure_reconcile_no_existing_installation( assert config == expected_config 
installation.assert_file_written("reconcile.yml", reconcile_config_v2_yml) - +@pytest.mark.parametrize("datasource", ["snowflake"]) def test_configure_all_override_installation( ws_installer: Callable[..., WorkspaceInstaller], ws: WorkspaceClient, From 63a710435626669ae8c651cab8de5657f0988d95 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Thu, 18 Dec 2025 09:47:03 +0100 Subject: [PATCH 34/44] fmt --- tests/unit/test_install.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/test_install.py b/tests/unit/test_install.py index c8d6b8c281..54eea46669 100644 --- a/tests/unit/test_install.py +++ b/tests/unit/test_install.py @@ -552,6 +552,7 @@ def test_configure_transpile_installation_with_validation_and_warehouse_id_from_ }, ) + @pytest.mark.parametrize("datasource", ["snowflake"]) def test_configure_reconcile_installation_no_override(ws: WorkspaceClient, reconcile_config_v1_yml: dict) -> None: prompts = MockPrompts( @@ -692,6 +693,7 @@ def test_configure_reconcile_no_existing_installation( assert config == expected_config installation.assert_file_written("reconcile.yml", reconcile_config_v2_yml) + @pytest.mark.parametrize("datasource", ["snowflake"]) def test_configure_all_override_installation( ws_installer: Callable[..., WorkspaceInstaller], From 3c6f6f8cecddb585489db6641171ad619bba42d0 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Mon, 22 Dec 2025 10:43:31 +0100 Subject: [PATCH 35/44] Use configurable credentials in reconcile in place of db secret scope (#2159) ## Changes ### What does this PR do? * Move away from hardcoded secrets in reconcile * use credential manager which enables local, env and databricks ### Relevant implementation details * add `load_credentials` to `DataSource` which takes care of loading the credentials ### Caveats/things to watch out for when reviewing: ### Linked issues Progresses https://github.com/databrickslabs/lakebridge/issues/1008, https://github.com/databrickslabs/lakebridge/pull/2123, https://github.com/databrickslabs/lakebridge/pull/2157 ### Functionality - [ ] added relevant user documentation - [ ] added new CLI command - [X] modified existing command: `databricks labs lakebridge reconcile` - [ ] ... 
+add your own

### Tests
- [ ] manually tested
- [X] added unit tests
- [X] added integration tests

---------

Co-authored-by: Guenia Izquierdo
---
 .../connections/credential_manager.py | 17 +++
 .../reconcile/connectors/data_source.py | 12 +-
 .../reconcile/connectors/databricks.py | 12 +-
 .../reconcile/connectors/dialect_utils.py | 8 +-
 .../reconcile/connectors/jdbc_reader.py | 1 -
 .../lakebridge/reconcile/connectors/models.py | 7 -
 .../lakebridge/reconcile/connectors/oracle.py | 60 ++++++---
 .../reconcile/connectors/secrets.py | 49 -------
 .../reconcile/connectors/snowflake.py | 120 +++++++++--------
 .../reconcile/connectors/source_adapter.py | 9 +-
 .../lakebridge/reconcile/connectors/tsql.py | 59 ++++++--
 .../reconcile/trigger_recon_service.py | 2 +-
 .../labs/lakebridge/reconcile/utils.py | 10 +-
 tests/conftest.py | 6 +-
 .../reconcile/connectors/test_read_schema.py | 14 +-
 .../reconcile/query_builder/test_execute.py | 29 +++-
 .../reconcile/test_oracle_reconcile.py | 2 +-
 .../reconcile/connectors/test_databricks.py | 26 ++--
 .../unit/reconcile/connectors/test_oracle.py | 53 ++++++--
 .../unit/reconcile/connectors/test_secrets.py | 65 ---------
 .../reconcile/connectors/test_snowflake.py | 126 +++++++++++-------
 .../reconcile/connectors/test_sql_server.py | 55 ++++++--
 tests/unit/reconcile/test_source_adapter.py | 18 +--
 23 files changed, 418 insertions(+), 342 deletions(-)
 delete mode 100644 src/databricks/labs/lakebridge/reconcile/connectors/models.py
 delete mode 100644 src/databricks/labs/lakebridge/reconcile/connectors/secrets.py
 delete mode 100644 tests/unit/reconcile/connectors/test_secrets.py

diff --git a/src/databricks/labs/lakebridge/connections/credential_manager.py b/src/databricks/labs/lakebridge/connections/credential_manager.py
index 77a186bfc6..3ff84fd845 100644
--- a/src/databricks/labs/lakebridge/connections/credential_manager.py
+++ b/src/databricks/labs/lakebridge/connections/credential_manager.py
@@ -115,6 +115,23 @@ def _get_secret_value(self, key: str) -> str:
         return self._provider.get_secret(key)
 
 
+def build_credentials(vault_type: str, source: str, credentials: dict) -> dict:
+    """Build credentials dictionary with secret vault type included.
+
+    Args:
+        vault_type: The type of secret vault (e.g., 'local', 'databricks').
+        source: The source system name.
+        credentials: The original credentials dictionary.
+
+    Returns:
+        A new credentials dictionary including the secret vault type.
+ """ + return { + source: credentials, + 'secret_vault_type': vault_type.lower(), + } + + def _get_home() -> Path: return Path(__file__).home() diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/data_source.py b/src/databricks/labs/lakebridge/reconcile/connectors/data_source.py index 9294768b77..0de291b072 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/data_source.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/data_source.py @@ -3,8 +3,8 @@ from pyspark.sql import DataFrame -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier +from databricks.labs.lakebridge.config import ReconcileCredentialsConfig +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema @@ -12,6 +12,7 @@ class DataSource(ABC): + _DOCS_URL = "https://databrickslabs.github.io/lakebridge/docs/reconcile/" @abstractmethod def read_data( @@ -34,6 +35,10 @@ def get_schema( ) -> list[Schema]: return NotImplemented + @abstractmethod + def load_credentials(self, creds: ReconcileCredentialsConfig) -> "DataSource": + return NotImplemented + @abstractmethod def normalize_identifier(self, identifier: str) -> NormalizedIdentifier: pass @@ -94,5 +99,8 @@ def get_schema(self, catalog: str | None, schema: str, table: str, normalize: bo return self.log_and_throw_exception(self._exception, "schema", f"({catalog}, {schema}, {table})") return mock_schema + def load_credentials(self, creds: ReconcileCredentialsConfig) -> "MockDataSource": + return self + def normalize_identifier(self, identifier: str) -> NormalizedIdentifier: return DialectUtils.normalize_identifier(identifier, self._delimiter, self._delimiter) diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py b/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py index 89d05b3e4c..07cddaccb0 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py @@ -7,10 +7,9 @@ from pyspark.sql.functions import col from sqlglot import Dialect +from databricks.labs.lakebridge.config import ReconcileCredentialsConfig from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier -from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema from databricks.sdk import WorkspaceClient @@ -36,7 +35,7 @@ def _get_schema_query(catalog: str, schema: str, table: str): return re.sub(r'\s+', ' ', query) -class DatabricksDataSource(DataSource, SecretsMixin): +class DatabricksDataSource(DataSource): _IDENTIFIER_DELIMITER = "`" def __init__( @@ -44,12 +43,10 @@ def __init__( engine: Dialect, spark: SparkSession, ws: WorkspaceClient, - secret_scope: str, ): self._engine = engine self._spark = spark self._ws = ws - self._secret_scope = secret_scope def read_data( self, @@ -96,6 +93,9 @@ def 
get_schema( except (RuntimeError, PySparkException) as e: return self.log_and_throw_exception(e, "schema", schema_query) + def load_credentials(self, creds: ReconcileCredentialsConfig) -> "DatabricksDataSource": + return self + def normalize_identifier(self, identifier: str) -> NormalizedIdentifier: return DialectUtils.normalize_identifier( identifier, diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py b/src/databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py index 665755e85c..2785fd8002 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py @@ -1,4 +1,10 @@ -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier +import dataclasses + + +@dataclasses.dataclass() +class NormalizedIdentifier: + ansi_normalized: str + source_normalized: str class DialectUtils: diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py b/src/databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py index 3b9ec6b1e4..98726359f0 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py @@ -8,7 +8,6 @@ class JDBCReaderMixin: _spark: SparkSession - # TODO update the url def _get_jdbc_reader(self, query, jdbc_url, driver, additional_options: dict | None = None): driver_class = { "oracle": "oracle.jdbc.OracleDriver", diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/models.py b/src/databricks/labs/lakebridge/reconcile/connectors/models.py deleted file mode 100644 index c98cbef7dd..0000000000 --- a/src/databricks/labs/lakebridge/reconcile/connectors/models.py +++ /dev/null @@ -1,7 +0,0 @@ -import dataclasses - - -@dataclasses.dataclass -class NormalizedIdentifier: - ansi_normalized: str - source_normalized: str diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py b/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py index b7e78e71ce..2787d1c4cb 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py @@ -8,18 +8,18 @@ from pyspark.sql.functions import col from sqlglot import Dialect +from databricks.labs.lakebridge.config import ReconcileCredentialsConfig +from databricks.labs.lakebridge.connections.credential_manager import build_credentials, CredentialManager from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier -from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema, OptionalPrimitiveType from databricks.sdk import WorkspaceClient logger = logging.getLogger(__name__) -class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin): +class OracleDataSource(DataSource, JDBCReaderMixin): _DRIVER = "oracle" _IDENTIFIER_DELIMITER = "\"" _SCHEMA_QUERY = """select column_name, case when (data_precision is not null @@ -35,23 +35,23 @@ class OracleDataSource(DataSource, SecretsMixin, 
JDBCReaderMixin): FROM ALL_TAB_COLUMNS WHERE lower(TABLE_NAME) = '{table}' and lower(owner) = '{owner}'""" - def __init__( - self, - engine: Dialect, - spark: SparkSession, - ws: WorkspaceClient, - secret_scope: str, - ): + def __init__(self, engine: Dialect, spark: SparkSession, ws: WorkspaceClient): self._engine = engine self._spark = spark self._ws = ws - self._secret_scope = secret_scope + self._creds_or_empty: dict[str, str] = {} + + @property + def _creds(self): + if self._creds_or_empty: + return self._creds_or_empty + raise RuntimeError("Oracle credentials have not been loaded. Please call load_credentials() first.") @property def get_jdbc_url(self) -> str: return ( - f"jdbc:{OracleDataSource._DRIVER}:thin:@//{self._get_secret('host')}" - f":{self._get_secret('port')}/{self._get_secret('database')}" + f"jdbc:{OracleDataSource._DRIVER}:thin:@//{self._creds.get('host')}" + f":{self._creds.get('port')}/{self._creds.get('database')}" ) def read_data( @@ -111,13 +111,41 @@ def _get_timestamp_options() -> dict[str, str]: def reader(self, query: str, options: Mapping[str, OptionalPrimitiveType] | None = None) -> DataFrameReader: if options is None: options = {} - user = self._get_secret('user') - password = self._get_secret('password') + user = self._creds.get('user') + password = self._creds.get('password') logger.debug(f"Using user: {user} to connect to Oracle") return self._get_jdbc_reader( query, self.get_jdbc_url, OracleDataSource._DRIVER, {**options, "user": user, "password": password} ) + def load_credentials(self, creds: ReconcileCredentialsConfig) -> "OracleDataSource": + connector_creds = [ + "host", + "port", + "database", + "user", + "password", + ] + + use_scope = creds.vault_secret_names.get("__secret_scope") + if use_scope: + vault_secret_names = {key: f"{use_scope}/{key}" for key in connector_creds} + logger.warning( + f"Secret scope configuration is deprecated. Please refer to the docs {self._DOCS_URL} to update." + ) + + assert creds.vault_type == "databricks", "Secret scope provided, vault_type must be 'databricks'" + parsed_creds = build_credentials(creds.vault_type, "oracle", vault_secret_names) + else: + parsed_creds = build_credentials(creds.vault_type, "oracle", creds.vault_secret_names) + + self._creds_or_empty = CredentialManager.from_credentials(parsed_creds, self._ws).get_credentials("oracle") + assert all( + self._creds.get(k) for k in connector_creds + ), f"Missing mandatory Oracle credentials. Please configure all of {connector_creds}." + + return self + def normalize_identifier(self, identifier: str) -> NormalizedIdentifier: normalized = DialectUtils.normalize_identifier( identifier, diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/secrets.py b/src/databricks/labs/lakebridge/reconcile/connectors/secrets.py deleted file mode 100644 index daa213afc8..0000000000 --- a/src/databricks/labs/lakebridge/reconcile/connectors/secrets.py +++ /dev/null @@ -1,49 +0,0 @@ -import base64 -import logging - -from databricks.sdk import WorkspaceClient -from databricks.sdk.errors import NotFound - -logger = logging.getLogger(__name__) - - -# TODO use CredentialManager to allow for changing secret provider for tests -class SecretsMixin: - _ws: WorkspaceClient - _secret_scope: str - - def _get_secret_or_none(self, secret_key: str) -> str | None: - """ - Get the secret value given a secret scope & secret key. 
Log a warning if secret does not exist - Used To ensure backwards compatibility when supporting new secrets - """ - try: - # Return the decoded secret value in string format - return self._get_secret(secret_key) - except NotFound as e: - logger.warning(f"Secret not found: key={secret_key}") - logger.debug("Secret lookup failed", exc_info=e) - return None - - def _get_secret(self, secret_key: str) -> str: - """Get the secret value given a secret scope & secret key. - - Raises: - NotFound: The secret could not be found. - UnicodeDecodeError: The secret value was not Base64-encoded UTF-8. - """ - try: - # Return the decoded secret value in string format - secret = self._ws.secrets.get_secret(self._secret_scope, secret_key) - assert secret.value is not None - return base64.b64decode(secret.value).decode("utf-8") - except NotFound as e: - raise NotFound(f'Secret does not exist with scope: {self._secret_scope} and key: {secret_key} : {e}') from e - except UnicodeDecodeError as e: - raise UnicodeDecodeError( - "utf-8", - secret_key.encode(), - 0, - 1, - f"Secret {self._secret_scope}/{secret_key} has Base64 bytes that cannot be decoded to utf-8 string: {e}.", - ) from e diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py b/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py index e66751d29b..eb2a77816a 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py @@ -9,20 +9,19 @@ from cryptography.hazmat.backends import default_backend from cryptography.hazmat.primitives import serialization +from databricks.labs.lakebridge.config import ReconcileCredentialsConfig +from databricks.labs.lakebridge.connections.credential_manager import build_credentials, CredentialManager from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier -from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.exception import InvalidSnowflakePemPrivateKey from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema from databricks.sdk import WorkspaceClient -from databricks.sdk.errors import NotFound logger = logging.getLogger(__name__) -class SnowflakeDataSource(DataSource, SecretsMixin, JDBCReaderMixin): +class SnowflakeDataSource(DataSource, JDBCReaderMixin): _DRIVER = "snowflake" _IDENTIFIER_DELIMITER = "\"" @@ -51,33 +50,69 @@ class SnowflakeDataSource(DataSource, SecretsMixin, JDBCReaderMixin): where lower(table_name)='{table}' and table_schema = '{schema}' order by ordinal_position""" - def __init__( - self, - engine: Dialect, - spark: SparkSession, - ws: WorkspaceClient, - secret_scope: str, - ): + def __init__(self, engine: Dialect, spark: SparkSession, ws: WorkspaceClient): self._engine = engine self._spark = spark self._ws = ws - self._secret_scope = secret_scope + self._creds_or_empty: dict[str, str] = {} + + @property + def _creds(self): + if self._creds_or_empty: + return self._creds_or_empty + raise RuntimeError("Snowflake credentials have not been loaded. 
Please call load_credentials() first.") + + def load_credentials(self, creds: ReconcileCredentialsConfig) -> "SnowflakeDataSource": + connector_creds = [ + "sfUser", + "sfUrl", + "sfDatabase", + "sfSchema", + "sfWarehouse", + "sfRole", + ] + + use_scope = creds.vault_secret_names.get("__secret_scope") + if use_scope: + # to use pem key and/or pem password, migrate to vault_secret_names approach + logger.warning( + f"Secret scope configuration is deprecated. Using secret scopes supports password authentication only. Please refer to the docs {self._DOCS_URL} to update and to access full features." + ) + connector_creds += ["sfPassword"] + vault_secret_names = {key: f"{use_scope}/{key}" for key in connector_creds} + + assert creds.vault_type == "databricks", "Secret scope provided, vault_type must be 'databricks'" + parsed_creds = build_credentials(creds.vault_type, "snowflake", vault_secret_names) + else: + parsed_creds = build_credentials(creds.vault_type, "snowflake", creds.vault_secret_names) + + self._creds_or_empty = CredentialManager.from_credentials(parsed_creds, self._ws).get_credentials("snowflake") + assert all( + self._creds.get(k) for k in connector_creds + ), f"Missing mandatory Snowflake credentials. Please configure all of {connector_creds}." + assert any( + self._creds.get(k) for k in ("sfPassword", "pem_private_key") + ), "Missing Snowflake credentials. Please configure any of [sfPassword, pem_private_key]." + + if self._creds.get("pem_private_key"): + self._creds["pem_private_key"] = SnowflakeDataSource._get_private_key( + self._creds["pem_private_key"], + self._creds.get("pem_private_key_password"), + ) + + return self @property def get_jdbc_url(self) -> str: - try: - sf_password = self._get_secret('sfPassword') - except (NotFound, KeyError) as e: - message = "sfPassword is mandatory for jdbc connectivity with Snowflake." - logger.error(message) - raise NotFound(message) from e + if not self._creds: + raise RuntimeError("Credentials not loaded. 
Please call `load_credentials(ReconcileCredentialsConfig)`.") return ( - f"jdbc:{SnowflakeDataSource._DRIVER}://{self._get_secret('sfAccount')}.snowflakecomputing.com" - f"/?user={self._get_secret('sfUser')}&password={sf_password}" - f"&db={self._get_secret('sfDatabase')}&schema={self._get_secret('sfSchema')}" - f"&warehouse={self._get_secret('sfWarehouse')}&role={self._get_secret('sfRole')}" - ) + f"jdbc:{SnowflakeDataSource._DRIVER}://{self._creds['sfUrl']}" + f"/?user={self._creds['sfUser']}&password={self._creds['sfPassword']}" + f"&db={self._creds['sfDatabase']}&schema={self._creds['sfSchema']}" + f"&warehouse={self._creds['sfWarehouse']}&role={self._creds['sfRole']}" + ) # TODO Support PEM key auth def read_data( self, @@ -132,39 +167,10 @@ def get_schema( return self.log_and_throw_exception(e, "schema", schema_query) def reader(self, query: str) -> DataFrameReader: - options = self._get_snowflake_options() - return self._spark.read.format("snowflake").option("dbtable", f"({query}) as tmp").options(**options) - - # TODO cache this method using @functools.cache - # Pay attention to https://pylint.pycqa.org/en/latest/user_guide/messages/warning/method-cache-max-size-none.html - def _get_snowflake_options(self): - options = { - "sfUrl": self._get_secret('sfUrl'), - "sfUser": self._get_secret('sfUser'), - "sfDatabase": self._get_secret('sfDatabase'), - "sfSchema": self._get_secret('sfSchema'), - "sfWarehouse": self._get_secret('sfWarehouse'), - "sfRole": self._get_secret('sfRole'), - } - options = options | self._get_snowflake_auth_options() - - return options - - def _get_snowflake_auth_options(self): - try: - key = SnowflakeDataSource._get_private_key( - self._get_secret('pem_private_key'), self._get_secret_or_none('pem_private_key_password') - ) - return {"pem_private_key": key} - except (NotFound, KeyError): - logger.warning("pem_private_key not found. Checking for sfPassword") - try: - password = self._get_secret('sfPassword') - return {"sfPassword": password} - except (NotFound, KeyError) as e: - message = "sfPassword and pem_private_key not found. Either one is required for snowflake auth." - logger.error(message) - raise NotFound(message) from e + if not self._creds: + raise RuntimeError("Credentials not loaded. 
Please call `load_credentials(ReconcileCredentialsConfig)`.") + + return self._spark.read.format("snowflake").option("dbtable", f"({query}) as tmp").options(**self._creds) @staticmethod def _get_private_key(pem_private_key: str, pem_private_key_password: str | None) -> str: diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/source_adapter.py b/src/databricks/labs/lakebridge/reconcile/connectors/source_adapter.py index 71039f4494..286bb36a8f 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/source_adapter.py @@ -17,14 +17,13 @@ def create_adapter( engine: Dialect, spark: SparkSession, ws: WorkspaceClient, - secret_scope: str, ) -> DataSource: if isinstance(engine, Snowflake): - return SnowflakeDataSource(engine, spark, ws, secret_scope) + return SnowflakeDataSource(engine, spark, ws) if isinstance(engine, Oracle): - return OracleDataSource(engine, spark, ws, secret_scope) + return OracleDataSource(engine, spark, ws) if isinstance(engine, Databricks): - return DatabricksDataSource(engine, spark, ws, secret_scope) + return DatabricksDataSource(engine, spark, ws) if isinstance(engine, TSQL): - return TSQLServerDataSource(engine, spark, ws, secret_scope) + return TSQLServerDataSource(engine, spark, ws) raise ValueError(f"Unsupported source type --> {engine}") diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py b/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py index 3b3394441a..704cb93231 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py @@ -8,11 +8,11 @@ from pyspark.sql.functions import col from sqlglot import Dialect +from databricks.labs.lakebridge.config import ReconcileCredentialsConfig +from databricks.labs.lakebridge.connections.credential_manager import build_credentials, CredentialManager from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier -from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema, OptionalPrimitiveType from databricks.sdk import WorkspaceClient @@ -50,7 +50,7 @@ """ -class TSQLServerDataSource(DataSource, SecretsMixin, JDBCReaderMixin): +class TSQLServerDataSource(DataSource, JDBCReaderMixin): _DRIVER = "sqlserver" _IDENTIFIER_DELIMITER = {"prefix": "[", "suffix": "]"} @@ -59,21 +59,26 @@ def __init__( engine: Dialect, spark: SparkSession, ws: WorkspaceClient, - secret_scope: str, ): self._engine = engine self._spark = spark self._ws = ws - self._secret_scope = secret_scope + self._creds_or_empty: dict[str, str] = {} + + @property + def _creds(self): + if self._creds_or_empty: + return self._creds_or_empty + raise RuntimeError("MS SQL/Synapse credentials have not been loaded. 
Please call load_credentials() first.") @property def get_jdbc_url(self) -> str: # Construct the JDBC URL return ( - f"jdbc:{self._DRIVER}://{self._get_secret('host')}:{self._get_secret('port')};" - f"databaseName={self._get_secret('database')};" - f"encrypt={self._get_secret('encrypt')};" - f"trustServerCertificate={self._get_secret('trustServerCertificate')};" + f"jdbc:{self._DRIVER}://{self._creds.get('host')}:{self._creds.get('port')};" + f"databaseName={self._creds.get('database')};" + f"encrypt={self._creds.get('encrypt')};" + f"trustServerCertificate={self._creds.get('trustServerCertificate')};" ) def read_data( @@ -103,6 +108,36 @@ def read_data( except (RuntimeError, PySparkException) as e: return self.log_and_throw_exception(e, "data", table_query) + def load_credentials(self, creds: ReconcileCredentialsConfig) -> "TSQLServerDataSource": + connector_creds = [ + "host", + "port", + "database", + "user", + "password", + "encrypt", + "trustServerCertificate", + ] + + use_scope = creds.vault_secret_names.get("__secret_scope") + if use_scope: + logger.warning( + f"Secret scope configuration is deprecated. Please refer to the docs {self._DOCS_URL} to update." + ) + vault_secret_names = {key: f"{use_scope}/{key}" for key in connector_creds} + + assert creds.vault_type == "databricks", "Secret scope provided, vault_type must be 'databricks'" + parsed_creds = build_credentials(creds.vault_type, "mssql", vault_secret_names) + else: + parsed_creds = build_credentials(creds.vault_type, "mssql", creds.vault_secret_names) + + self._creds_or_empty = CredentialManager.from_credentials(parsed_creds, self._ws).get_credentials("mssql") + assert all( + self._creds.get(k) for k in connector_creds + ), f"Missing mandatory MS SQL credentials. Please configure all of {connector_creds}." 
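A minimal usage sketch of the flow above, assuming a Databricks-backed vault, a hypothetical `recon_scope` secret scope, and an already-initialised SparkSession; the import paths follow this patch (a later patch in this series relocates `ReconcileCredentialsConfig`):

    from pyspark.sql import SparkSession
    from databricks.sdk import WorkspaceClient

    from databricks.labs.lakebridge.config import ReconcileCredentialsConfig
    from databricks.labs.lakebridge.reconcile.connectors.tsql import TSQLServerDataSource
    from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect

    # Each value is a "<scope>/<key>" reference that the credential manager resolves;
    # "recon_scope" is illustrative only.
    creds = ReconcileCredentialsConfig(
        vault_type="databricks",
        vault_secret_names={
            "host": "recon_scope/host",
            "port": "recon_scope/port",
            "database": "recon_scope/database",
            "user": "recon_scope/user",
            "password": "recon_scope/password",
            "encrypt": "recon_scope/encrypt",
            "trustServerCertificate": "recon_scope/trustServerCertificate",
        },
    )

    spark = SparkSession.builder.getOrCreate()
    source = TSQLServerDataSource(get_dialect("tsql"), spark, WorkspaceClient()).load_credentials(creds)
    jdbc_url = source.get_jdbc_url  # built from the resolved credentials, no secret scope argument

Because `load_credentials` returns the data source, callers can chain it and hit the fail-fast check on missing keys before any Spark read is attempted.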
+ + return self + def get_schema( self, catalog: str | None, @@ -141,8 +176,8 @@ def reader(self, query: str, options: Mapping[str, OptionalPrimitiveType] | None def _get_user_password(self) -> Mapping[str, str]: return { - "user": self._get_secret("user"), - "password": self._get_secret("password"), + "user": self._creds.get("user"), + "password": self._creds.get("password"), } def normalize_identifier(self, identifier: str) -> NormalizedIdentifier: diff --git a/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py b/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py index f11f32a60a..9873a177ca 100644 --- a/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py +++ b/src/databricks/labs/lakebridge/reconcile/trigger_recon_service.py @@ -74,7 +74,7 @@ def create_recon_dependencies( engine=reconcile_config.data_source, spark=spark, ws=ws_client, - secret_scope=reconcile_config.creds.get_databricks_secret_scope(), + creds=reconcile_config.creds, ) recon_id = str(uuid4()) diff --git a/src/databricks/labs/lakebridge/reconcile/utils.py b/src/databricks/labs/lakebridge/reconcile/utils.py index 42a309d8da..7389dfe131 100644 --- a/src/databricks/labs/lakebridge/reconcile/utils.py +++ b/src/databricks/labs/lakebridge/reconcile/utils.py @@ -4,7 +4,7 @@ from databricks.sdk import WorkspaceClient -from databricks.labs.lakebridge.config import ReconcileMetadataConfig +from databricks.labs.lakebridge.config import ReconcileMetadataConfig, ReconcileCredentialsConfig from databricks.labs.lakebridge.reconcile.connectors.source_adapter import create_adapter from databricks.labs.lakebridge.reconcile.exception import InvalidInputException from databricks.labs.lakebridge.reconcile.recon_config import Table @@ -17,10 +17,12 @@ def initialise_data_source( ws: WorkspaceClient, spark: SparkSession, engine: str, - secret_scope: str, + creds: ReconcileCredentialsConfig, ): - source = create_adapter(engine=get_dialect(engine), spark=spark, ws=ws, secret_scope=secret_scope) - target = create_adapter(engine=get_dialect("databricks"), spark=spark, ws=ws, secret_scope=secret_scope) + source = create_adapter(engine=get_dialect(engine), spark=spark, ws=ws) + target = create_adapter(engine=get_dialect("databricks"), spark=spark, ws=ws) + source.load_credentials(creds) + target.load_credentials(creds) return source, target diff --git a/tests/conftest.py b/tests/conftest.py index a6a9639bbe..98bf0dfefb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,8 +23,7 @@ DatabaseConfig, ReconcileMetadataConfig, ) -from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource, MockDataSource from databricks.labs.lakebridge.reconcile.recon_config import ( Table, @@ -350,6 +349,9 @@ def read_data( ) -> DataFrame: raise RuntimeError("Not implemented") + def load_credentials(self, creds: ReconcileCredentialsConfig) -> "FakeDataSource": + raise RuntimeError("Not implemented") + @pytest.fixture def fake_oracle_datasource() -> FakeDataSource: diff --git a/tests/integration/reconcile/connectors/test_read_schema.py b/tests/integration/reconcile/connectors/test_read_schema.py index a3e509bdfc..fa47ac51ea 100644 --- a/tests/integration/reconcile/connectors/test_read_schema.py +++ 
b/tests/integration/reconcile/connectors/test_read_schema.py @@ -19,8 +19,8 @@ class TSQLServerDataSourceUnderTest(TSQLServerDataSource): def __init__(self, spark, ws): - super().__init__(get_dialect("tsql"), spark, ws, "secret_scope") - self._test_env = TestEnvGetter(True) + super().__init__(get_dialect("tsql"), spark, ws) + self._test_env = TestEnvGetter(True) # TODO use load_credentials @property def get_jdbc_url(self) -> str: @@ -34,8 +34,8 @@ def _get_user_password(self) -> dict: class OracleDataSourceUnderTest(OracleDataSource): def __init__(self, spark, ws): - super().__init__(get_dialect("oracle"), spark, ws, "secret_scope") - self._test_env = TestEnvGetter(False) + super().__init__(get_dialect("oracle"), spark, ws) + self._test_env = TestEnvGetter(False) # TODO use load_credentials @property def get_jdbc_url(self) -> str: @@ -53,8 +53,8 @@ def reader(self, query: str, options: Mapping[str, OptionalPrimitiveType] | None class SnowflakeDataSourceUnderTest(SnowflakeDataSource): def __init__(self, spark, ws): - super().__init__(get_dialect("snowflake"), spark, ws, "secret_scope") - self._test_env = TestEnvGetter(True) + super().__init__(get_dialect("snowflake"), spark, ws) + self._test_env = TestEnvGetter(True) # TODO use load_credentials @property def get_jdbc_url(self) -> str: @@ -91,7 +91,7 @@ def test_sql_server_read_schema_happy(mock_spark): def test_databricks_read_schema_happy(mock_spark): mock_ws = create_autospec(WorkspaceClient) - connector = DatabricksDataSource(get_dialect("databricks"), mock_spark, mock_ws, "my_secret") + connector = DatabricksDataSource(get_dialect("databricks"), mock_spark, mock_ws) mock_spark.sql("CREATE DATABASE IF NOT EXISTS my_test_db") mock_spark.sql("CREATE TABLE IF NOT EXISTS my_test_db.my_test_table (id INT, name STRING) USING parquet") diff --git a/tests/integration/reconcile/query_builder/test_execute.py b/tests/integration/reconcile/query_builder/test_execute.py index 04b2196085..94edd5ec5c 100644 --- a/tests/integration/reconcile/query_builder/test_execute.py +++ b/tests/integration/reconcile/query_builder/test_execute.py @@ -1,13 +1,17 @@ +import base64 from pathlib import Path from dataclasses import dataclass from datetime import datetime -from unittest.mock import patch, MagicMock +from unittest.mock import patch, MagicMock, create_autospec import pytest from pyspark import Row from pyspark.errors import PySparkException from pyspark.testing import assertDataFrameEqual +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.workspace import GetSecretResponse + from databricks.labs.lakebridge.config import ( DatabaseConfig, TableRecon, @@ -1882,12 +1886,22 @@ def test_data_recon_with_source_exception( def test_initialise_data_source(mock_workspace_client, mock_spark): src_engine = get_dialect("snowflake") - secret_scope = "test" - source, target = initialise_data_source(mock_workspace_client, mock_spark, src_engine, secret_scope) + sf_creds = { + "sfUser": "user", + "sfPassword": "password", + "sfUrl": "account.snowflakecomputing.com", + "sfDatabase": "database", + "sfSchema": "schema", + "sfWarehouse": "warehouse", + "sfRole": "role", + } + source, target = initialise_data_source( + mock_workspace_client, mock_spark, "snowflake", ReconcileCredentialsConfig("local", sf_creds) + ) - snowflake_data_source = SnowflakeDataSource(src_engine, mock_spark, mock_workspace_client, secret_scope).__class__ - databricks_data_source = DatabricksDataSource(src_engine, mock_spark, mock_workspace_client, secret_scope).__class__ + 
snowflake_data_source = SnowflakeDataSource(src_engine, mock_spark, mock_workspace_client).__class__ + databricks_data_source = DatabricksDataSource(src_engine, mock_spark, mock_workspace_client).__class__ assert isinstance(source, snowflake_data_source) assert isinstance(target, databricks_data_source) @@ -2001,7 +2015,10 @@ def test_reconcile_data_with_threshold_and_row_report_type( @patch('databricks.labs.lakebridge.reconcile.recon_capture.generate_final_reconcile_output') def test_recon_output_without_exception(mock_gen_final_recon_output): - mock_workspace_client = MagicMock() + mock_workspace_client = create_autospec(WorkspaceClient) + mock_workspace_client.secrets.get_secret.return_value = GetSecretResponse( + key="key", value=base64.b64encode(bytes('value', 'utf-8')).decode('utf-8') + ) mock_spark = MagicMock() mock_table_recon = MagicMock() mock_gen_final_recon_output.return_value = ReconcileOutput( diff --git a/tests/integration/reconcile/test_oracle_reconcile.py b/tests/integration/reconcile/test_oracle_reconcile.py index fd6aeb03a0..5325fcc8dc 100644 --- a/tests/integration/reconcile/test_oracle_reconcile.py +++ b/tests/integration/reconcile/test_oracle_reconcile.py @@ -22,7 +22,7 @@ class DatabricksDataSourceUnderTest(DatabricksDataSource): def __init__(self, databricks, ws, local_spark): - super().__init__(get_dialect("databricks"), databricks, ws, "not used") + super().__init__(get_dialect("databricks"), databricks, ws) self._local_spark = local_spark def read_data( diff --git a/tests/unit/reconcile/connectors/test_databricks.py b/tests/unit/reconcile/connectors/test_databricks.py index 7f89612e85..2f69dbd317 100644 --- a/tests/unit/reconcile/connectors/test_databricks.py +++ b/tests/unit/reconcile/connectors/test_databricks.py @@ -3,7 +3,7 @@ import pytest -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import NormalizedIdentifier from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect from databricks.labs.lakebridge.reconcile.connectors.databricks import DatabricksDataSource from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException @@ -23,10 +23,10 @@ def initial_setup(): def test_get_schema(): # initial setup - engine, spark, ws, scope = initial_setup() + engine, spark, ws, _ = initial_setup() # catalog as catalog - ddds = DatabricksDataSource(engine, spark, ws, scope) + ddds = DatabricksDataSource(engine, spark, ws) ddds.get_schema("catalog", "schema", "supplier") spark.sql.assert_called_with( re.sub( @@ -56,10 +56,10 @@ def test_get_schema(): def test_read_data_from_uc(): # initial setup - engine, spark, ws, scope = initial_setup() + engine, spark, ws, _ = initial_setup() # create object for DatabricksDataSource - ddds = DatabricksDataSource(engine, spark, ws, scope) + ddds = DatabricksDataSource(engine, spark, ws) # Test with query ddds.read_data("org", "data", "employee", "select id as id, name as name from :tbl", None) @@ -72,10 +72,10 @@ def test_read_data_from_uc(): def test_read_data_from_hive(): # initial setup - engine, spark, ws, scope = initial_setup() + engine, spark, ws, _ = initial_setup() # create object for DatabricksDataSource - ddds = DatabricksDataSource(engine, spark, ws, scope) + ddds = DatabricksDataSource(engine, spark, ws) # Test with query ddds.read_data("hive_metastore", "data", "employee", "select id as id, name as name from :tbl", None) @@ -88,10 +88,10 @@ def 
test_read_data_from_hive(): def test_read_data_exception_handling(): # initial setup - engine, spark, ws, scope = initial_setup() + engine, spark, ws, _ = initial_setup() # create object for DatabricksDataSource - ddds = DatabricksDataSource(engine, spark, ws, scope) + ddds = DatabricksDataSource(engine, spark, ws) spark.sql.side_effect = RuntimeError("Test Exception") with pytest.raises( @@ -104,10 +104,10 @@ def test_read_data_exception_handling(): def test_get_schema_exception_handling(): # initial setup - engine, spark, ws, scope = initial_setup() + engine, spark, ws, _ = initial_setup() # create object for DatabricksDataSource - ddds = DatabricksDataSource(engine, spark, ws, scope) + ddds = DatabricksDataSource(engine, spark, ws) spark.sql.side_effect = RuntimeError("Test Exception") with pytest.raises(DataSourceRuntimeException) as exception: ddds.get_schema("org", "data", "employee") @@ -121,8 +121,8 @@ def test_get_schema_exception_handling(): def test_normalize_identifier(): - engine, spark, ws, scope = initial_setup() - data_source = DatabricksDataSource(engine, spark, ws, scope) + engine, spark, ws, _ = initial_setup() + data_source = DatabricksDataSource(engine, spark, ws) assert data_source.normalize_identifier("a") == NormalizedIdentifier("`a`", '`a`') assert data_source.normalize_identifier('`b`') == NormalizedIdentifier("`b`", '`b`') diff --git a/tests/unit/reconcile/connectors/test_oracle.py b/tests/unit/reconcile/connectors/test_oracle.py index 07ee7e1d2f..4e45da62c6 100644 --- a/tests/unit/reconcile/connectors/test_oracle.py +++ b/tests/unit/reconcile/connectors/test_oracle.py @@ -4,7 +4,8 @@ import pytest -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier +from databricks.labs.lakebridge.config import ReconcileCredentialsConfig +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import NormalizedIdentifier from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect from databricks.labs.lakebridge.reconcile.connectors.oracle import OracleDataSource from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException @@ -31,6 +32,16 @@ def mock_secret(scope, key): return secret_mock[scope][key] +def oracle_creds(scope): + return { + "host": f"{scope}/host", + "port": f"{scope}/port", + "database": f"{scope}/database", + "user": f"{scope}/user", + "password": f"{scope}/password", + } + + def initial_setup(): pyspark_sql_session = MagicMock() spark = pyspark_sql_session.SparkSession.builder.getOrCreate() @@ -47,8 +58,9 @@ def test_read_data_with_options(): # initial setup engine, spark, ws, scope = initial_setup() - # create object for SnowflakeDataSource - ords = OracleDataSource(engine, spark, ws, scope) + # create object for OracleDataSource + ords = OracleDataSource(engine, spark, ws) + ords.load_credentials(ReconcileCredentialsConfig("databricks", oracle_creds(scope))) # Create a Tables configuration object with JDBC reader options table_conf = Table( source_name="supplier", @@ -96,10 +108,11 @@ def test_read_data_with_options(): def test_get_schema(): # initial setup - engine, spark, ws, scope = initial_setup() + engine, spark, ws, _ = initial_setup() - # create object for SnowflakeDataSource - ords = OracleDataSource(engine, spark, ws, scope) + # create object for OracleDataSource + ords = OracleDataSource(engine, spark, ws) + ords.load_credentials(ReconcileCredentialsConfig("databricks", oracle_creds("scope"))) # call test method ords.get_schema(None, "data", "employee") # 
spark assertions @@ -127,8 +140,9 @@ def test_get_schema(): def test_read_data_exception_handling(): # initial setup - engine, spark, ws, scope = initial_setup() - ords = OracleDataSource(engine, spark, ws, scope) + engine, spark, ws, _ = initial_setup() + ords = OracleDataSource(engine, spark, ws) + ords.load_credentials(ReconcileCredentialsConfig("databricks", oracle_creds("scope"))) # Create a Tables configuration object table_conf = Table( source_name="supplier", @@ -155,9 +169,9 @@ def test_read_data_exception_handling(): def test_get_schema_exception_handling(): # initial setup - engine, spark, ws, scope = initial_setup() - ords = OracleDataSource(engine, spark, ws, scope) - + engine, spark, ws, _ = initial_setup() + ords = OracleDataSource(engine, spark, ws) + ords.load_credentials(ReconcileCredentialsConfig("databricks", oracle_creds("scope"))) spark.read.format().option().option().option().options().load.side_effect = RuntimeError("Test Exception") # Call the get_schema method with predefined table, schema, and catalog names and assert that a PySparkException @@ -180,10 +194,23 @@ def test_get_schema_exception_handling(): ords.get_schema(None, "data", "employee") +def test_credentials_not_loaded_fails(): + engine, spark, ws, _ = initial_setup() + data_source = OracleDataSource(engine, spark, ws) + + # Call the get_schema method with predefined table, schema, and catalog names and assert that a PySparkException + # is raised + with pytest.raises( + DataSourceRuntimeException, + match=re.escape("Oracle credentials have not been loaded. Please call load_credentials() first."), + ): + data_source.get_schema("org", "schema", "supplier") + + @pytest.mark.skip("Turned off till we can handle case sensitivity.") def test_normalize_identifier(): - engine, spark, ws, scope = initial_setup() - data_source = OracleDataSource(engine, spark, ws, scope) + engine, spark, ws, _ = initial_setup() + data_source = OracleDataSource(engine, spark, ws) assert data_source.normalize_identifier("a") == NormalizedIdentifier("`a`", '"a"') assert data_source.normalize_identifier('"b"') == NormalizedIdentifier("`b`", '"b"') diff --git a/tests/unit/reconcile/connectors/test_secrets.py b/tests/unit/reconcile/connectors/test_secrets.py deleted file mode 100644 index dea7515b09..0000000000 --- a/tests/unit/reconcile/connectors/test_secrets.py +++ /dev/null @@ -1,65 +0,0 @@ -import base64 -from unittest.mock import create_autospec - -import pytest - -from databricks.labs.lakebridge.reconcile.connectors.secrets import SecretsMixin -from databricks.sdk import WorkspaceClient -from databricks.sdk.errors import NotFound -from databricks.sdk.service.workspace import GetSecretResponse - - -class SecretsMixinUnderTest(SecretsMixin): - def __init__(self, ws: WorkspaceClient, secret_scope: str): - self._ws = ws - self._secret_scope = secret_scope - - def get_secret(self, secret_key: str) -> str: - return self._get_secret(secret_key) - - def get_secret_or_none(self, secret_key: str) -> str | None: - return self._get_secret_or_none(secret_key) - - -def mock_secret(scope, key): - secret_mock = { - "scope": { - 'user_name': GetSecretResponse( - key='user_name', value=base64.b64encode(bytes('my_user', 'utf-8')).decode('utf-8') - ), - 'password': GetSecretResponse( - key='password', value=base64.b64encode(bytes('my_password', 'utf-8')).decode('utf-8') - ), - } - } - - return secret_mock.get(scope).get(key) - - -def test_get_secrets_happy(): - ws = create_autospec(WorkspaceClient) - ws.secrets.get_secret.side_effect = mock_secret - 
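For comparison, the lookup that replaces these mixin helpers goes through the credential manager with vault references, roughly as the connectors in this patch do (a sketch; `recon_scope` is a hypothetical scope and the client must be able to read it):

    from databricks.sdk import WorkspaceClient

    from databricks.labs.lakebridge.connections.credential_manager import build_credentials, CredentialManager

    ws = WorkspaceClient()
    vault_refs = {"user": "recon_scope/user", "password": "recon_scope/password"}
    parsed = build_credentials("databricks", "oracle", vault_refs)  # wraps the refs with the vault type
    resolved = CredentialManager.from_credentials(parsed, ws).get_credentials("oracle")
    user, password = resolved["user"], resolved["password"]  # decoded secret values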
- sut = SecretsMixinUnderTest(ws, "scope") - - assert sut.get_secret("user_name") == "my_user" - assert sut.get_secret_or_none("user_name") == "my_user" - assert sut.get_secret("password") == "my_password" - assert sut.get_secret_or_none("password") == "my_password" - - -def test_get_secrets_not_found_exception(): - ws = create_autospec(WorkspaceClient) - ws.secrets.get_secret.side_effect = NotFound("Test Exception") - sut = SecretsMixinUnderTest(ws, "scope") - - with pytest.raises(NotFound, match="Secret does not exist with scope: scope and key: unknown : Test Exception"): - sut.get_secret("unknown") - - -def test_get_secrets_not_found_swallow(): - ws = create_autospec(WorkspaceClient) - ws.secrets.get_secret.side_effect = NotFound("Test Exception") - sut = SecretsMixinUnderTest(ws, "scope") - - assert sut.get_secret_or_none("unknown") is None diff --git a/tests/unit/reconcile/connectors/test_snowflake.py b/tests/unit/reconcile/connectors/test_snowflake.py index 114aa42f2a..3674bd5340 100644 --- a/tests/unit/reconcile/connectors/test_snowflake.py +++ b/tests/unit/reconcile/connectors/test_snowflake.py @@ -6,7 +6,8 @@ from cryptography.hazmat.primitives.asymmetric import rsa from cryptography.hazmat.primitives import serialization -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier +from databricks.labs.lakebridge.config import ReconcileCredentialsConfig +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import NormalizedIdentifier from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect from databricks.labs.lakebridge.reconcile.connectors.snowflake import SnowflakeDataSource from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException, InvalidSnowflakePemPrivateKey @@ -19,9 +20,6 @@ def mock_secret(scope, key): secret_mock = { "scope": { - 'sfAccount': GetSecretResponse( - key='sfAccount', value=base64.b64encode(bytes('my_account', 'utf-8')).decode('utf-8') - ), 'sfUser': GetSecretResponse( key='sfUser', value=base64.b64encode(bytes('my_user', 'utf-8')).decode('utf-8') ), @@ -40,13 +38,39 @@ def mock_secret(scope, key): 'sfRole': GetSecretResponse( key='sfRole', value=base64.b64encode(bytes('my_role', 'utf-8')).decode('utf-8') ), - 'sfUrl': GetSecretResponse(key='sfUrl', value=base64.b64encode(bytes('my_url', 'utf-8')).decode('utf-8')), + 'sfUrl': GetSecretResponse( + key='sfUrl', value=base64.b64encode(bytes('my_account.snowflakecomputing.com', 'utf-8')).decode('utf-8') + ), } } return secret_mock[scope][key] +@pytest.fixture() +def snowflake_creds(): + def _snowflake_creds(scope, use_private_key=False, use_pem_password=False): + creds = { + 'sfUser': f'{scope}/sfUser', + 'sfDatabase': f'{scope}/sfDatabase', + 'sfSchema': f'{scope}/sfSchema', + 'sfWarehouse': f'{scope}/sfWarehouse', + 'sfRole': f'{scope}/sfRole', + 'sfUrl': f'{scope}/sfUrl', + } + + if use_private_key: + creds['pem_private_key'] = f'{scope}/pem_private_key' + if use_pem_password: + creds['pem_private_key_password'] = f'{scope}/pem_private_key_password' + else: + creds['sfPassword'] = f'{scope}/sfPassword' + + return creds + + return _snowflake_creds + + def generate_pkcs8_pem_key(malformed=False): private_key = rsa.generate_private_key(public_exponent=65537, key_size=2048) pem_key = private_key.private_bytes( @@ -91,11 +115,12 @@ def initial_setup(): return engine, spark, ws, scope -def test_get_jdbc_url_happy(): +def test_get_jdbc_url_happy(snowflake_creds): # initial setup engine, spark, ws, scope = initial_setup() 
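A point these tests rely on: the JDBC URL is no longer assembled from an `sfAccount` secret with a hard-coded `.snowflakecomputing.com` suffix; the resolved `sfUrl` value is used verbatim as the host. A sketch of the resulting construction, with values mirroring the mocked secrets above:

    # Resolved credentials as the connector sees them after load_credentials().
    creds = {
        "sfUrl": "my_account.snowflakecomputing.com",  # full host, no suffix appended
        "sfUser": "my_user",
        "sfPassword": "my_password",
        "sfDatabase": "my_database",
        "sfSchema": "my_schema",
        "sfWarehouse": "my_warehouse",
        "sfRole": "my_role",
    }
    jdbc_url = (
        f"jdbc:snowflake://{creds['sfUrl']}"
        f"/?user={creds['sfUser']}&password={creds['sfPassword']}"
        f"&db={creds['sfDatabase']}&schema={creds['sfSchema']}"
        f"&warehouse={creds['sfWarehouse']}&role={creds['sfRole']}"
    )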
# create object for SnowflakeDataSource - dfds = SnowflakeDataSource(engine, spark, ws, scope) + dfds = SnowflakeDataSource(engine, spark, ws) + dfds.load_credentials(ReconcileCredentialsConfig("databricks", snowflake_creds(scope))) url = dfds.get_jdbc_url # Assert that the URL is generated correctly assert url == ( @@ -106,28 +131,13 @@ def test_get_jdbc_url_happy(): ) -def test_get_jdbc_url_fail(): - # initial setup - engine, spark, ws, scope = initial_setup() - ws.secrets.get_secret.side_effect = mock_secret - # create object for SnowflakeDataSource - dfds = SnowflakeDataSource(engine, spark, ws, scope) - url = dfds.get_jdbc_url - # Assert that the URL is generated correctly - assert url == ( - "jdbc:snowflake://my_account.snowflakecomputing.com" - "/?user=my_user&password=my_password" - "&db=my_database&schema=my_schema" - "&warehouse=my_warehouse&role=my_role" - ) - - -def test_read_data_with_out_options(): +def test_read_data_with_out_options(snowflake_creds): # initial setup engine, spark, ws, scope = initial_setup() # create object for SnowflakeDataSource - dfds = SnowflakeDataSource(engine, spark, ws, scope) + dfds = SnowflakeDataSource(engine, spark, ws) + dfds.load_credentials(ReconcileCredentialsConfig("databricks", snowflake_creds(scope))) # Create a Tables configuration object with no JDBC reader options table_conf = Table( source_name="supplier", @@ -141,7 +151,7 @@ def test_read_data_with_out_options(): spark.read.format.assert_called_with("snowflake") spark.read.format().option.assert_called_with("dbtable", "(select 1 from org.data.employee) as tmp") spark.read.format().option().options.assert_called_with( - sfUrl="my_url", + sfUrl="my_account.snowflakecomputing.com", sfUser="my_user", sfPassword="my_password", sfDatabase="my_database", @@ -152,12 +162,13 @@ def test_read_data_with_out_options(): spark.read.format().option().options().load.assert_called_once() -def test_read_data_with_options(): +def test_read_data_with_options(snowflake_creds): # initial setup engine, spark, ws, scope = initial_setup() # create object for SnowflakeDataSource - dfds = SnowflakeDataSource(engine, spark, ws, scope) + dfds = SnowflakeDataSource(engine, spark, ws) + dfds.load_credentials(ReconcileCredentialsConfig("databricks", snowflake_creds(scope))) # Create a Tables configuration object with JDBC reader options table_conf = Table( source_name="supplier", @@ -192,12 +203,13 @@ def test_read_data_with_options(): spark.read.format().option().option().option().options().load.assert_called_once() -def test_get_schema(): +def test_get_schema(snowflake_creds): # initial setup engine, spark, ws, scope = initial_setup() # Mocking get secret method to return the required values # create object for SnowflakeDataSource - dfds = SnowflakeDataSource(engine, spark, ws, scope) + dfds = SnowflakeDataSource(engine, spark, ws) + dfds.load_credentials(ReconcileCredentialsConfig("databricks", snowflake_creds(scope))) # call test method dfds.get_schema("catalog", "schema", "supplier") # spark assertions @@ -215,7 +227,7 @@ def test_get_schema(): ), ) spark.read.format().option().options.assert_called_with( - sfUrl="my_url", + sfUrl="my_account.snowflakecomputing.com", sfUser="my_user", sfPassword="my_password", sfDatabase="my_database", @@ -226,10 +238,11 @@ def test_get_schema(): spark.read.format().option().options().load.assert_called_once() -def test_read_data_exception_handling(): +def test_read_data_exception_handling(snowflake_creds): # initial setup engine, spark, ws, scope = initial_setup() - dfds = 
SnowflakeDataSource(engine, spark, ws, scope) + dfds = SnowflakeDataSource(engine, spark, ws) + dfds.load_credentials(ReconcileCredentialsConfig("databricks", snowflake_creds(scope))) # Create a Tables configuration object table_conf = Table( source_name="supplier", @@ -254,11 +267,12 @@ def test_read_data_exception_handling(): dfds.read_data("org", "data", "employee", "select 1 from :tbl", table_conf.jdbc_reader_options) -def test_get_schema_exception_handling(): +def test_get_schema_exception_handling(snowflake_creds): # initial setup engine, spark, ws, scope = initial_setup() - dfds = SnowflakeDataSource(engine, spark, ws, scope) + dfds = SnowflakeDataSource(engine, spark, ws) + dfds.load_credentials(ReconcileCredentialsConfig("databricks", snowflake_creds(scope))) spark.read.format().option().options().load.side_effect = RuntimeError("Test Exception") @@ -276,16 +290,17 @@ def test_get_schema_exception_handling(): dfds.get_schema("catalog", "schema", "supplier") -def test_read_data_without_options_private_key(): +def test_read_data_without_options_private_key(snowflake_creds): engine, spark, ws, scope = initial_setup() ws.secrets.get_secret.side_effect = mock_private_key_secret - dfds = SnowflakeDataSource(engine, spark, ws, scope) + dfds = SnowflakeDataSource(engine, spark, ws) + dfds.load_credentials(ReconcileCredentialsConfig("databricks", snowflake_creds(scope, use_private_key=True))) table_conf = Table(source_name="supplier", target_name="supplier") dfds.read_data("org", "data", "employee", "select 1 from :tbl", table_conf.jdbc_reader_options) spark.read.format.assert_called_with("snowflake") spark.read.format().option.assert_called_with("dbtable", "(select 1 from org.data.employee) as tmp") expected_options = { - "sfUrl": "my_url", + "sfUrl": "my_account.snowflakecomputing.com", "sfUser": "my_user", "sfDatabase": "my_database", "sfSchema": "my_schema", @@ -298,30 +313,43 @@ def test_read_data_without_options_private_key(): spark.read.format().option().options().load.assert_called_once() -def test_read_data_without_options_malformed_private_key(): +def test_read_data_without_options_malformed_private_key(snowflake_creds): engine, spark, ws, scope = initial_setup() ws.secrets.get_secret.side_effect = mock_malformed_private_key_secret - dfds = SnowflakeDataSource(engine, spark, ws, scope) - table_conf = Table(source_name="supplier", target_name="supplier") + dfds = SnowflakeDataSource(engine, spark, ws) + with pytest.raises(InvalidSnowflakePemPrivateKey, match="Failed to load or process the provided PEM private key."): - dfds.read_data("org", "data", "employee", "select 1 from :tbl", table_conf.jdbc_reader_options) + dfds.load_credentials(ReconcileCredentialsConfig("databricks", snowflake_creds(scope, use_private_key=True))) -def test_read_data_without_any_auth(): +def test_read_data_without_any_auth(snowflake_creds): engine, spark, ws, scope = initial_setup() ws.secrets.get_secret.side_effect = mock_no_auth_key_secret - dfds = SnowflakeDataSource(engine, spark, ws, scope) - table_conf = Table(source_name="supplier", target_name="supplier") + dfds = SnowflakeDataSource(engine, spark, ws) + creds = snowflake_creds(scope) + creds.pop('sfPassword') + + with pytest.raises(AssertionError, match='Missing Snowflake credentials. 
Please configure any of .*'): + dfds.load_credentials(ReconcileCredentialsConfig("databricks", creds)) + + +def test_credentials_not_loaded_fails(): + engine, spark, ws, _ = initial_setup() + data_source = SnowflakeDataSource(engine, spark, ws) + + # Call the get_schema method with predefined table, schema, and catalog names and assert that a PySparkException + # is raised with pytest.raises( - NotFound, match='sfPassword and pem_private_key not found. Either one is required for snowflake auth.' + DataSourceRuntimeException, + match=re.escape("Snowflake credentials have not been loaded. Please call load_credentials() first."), ): - dfds.read_data("org", "data", "employee", "select 1 from :tbl", table_conf.jdbc_reader_options) + data_source.get_schema("org", "schema", "supplier") @pytest.mark.skip("Turned off till we can handle case sensitivity.") def test_normalize_identifier(): - engine, spark, ws, scope = initial_setup() - data_source = SnowflakeDataSource(engine, spark, ws, scope) + engine, spark, ws, _ = initial_setup() + data_source = SnowflakeDataSource(engine, spark, ws) assert data_source.normalize_identifier("a") == NormalizedIdentifier("`a`", '"a"') assert data_source.normalize_identifier('"b"') == NormalizedIdentifier("`b`", '"b"') diff --git a/tests/unit/reconcile/connectors/test_sql_server.py b/tests/unit/reconcile/connectors/test_sql_server.py index 175a91086b..f21bffc5cf 100644 --- a/tests/unit/reconcile/connectors/test_sql_server.py +++ b/tests/unit/reconcile/connectors/test_sql_server.py @@ -4,7 +4,8 @@ import pytest -from databricks.labs.lakebridge.reconcile.connectors.models import NormalizedIdentifier +from databricks.labs.lakebridge.config import ReconcileCredentialsConfig +from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import NormalizedIdentifier from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect from databricks.labs.lakebridge.reconcile.connectors.tsql import TSQLServerDataSource from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException @@ -35,6 +36,18 @@ def mock_secret(scope, key): return scope_secret_mock[scope][key] +def mssql_creds(scope): + return { + "host": f"{scope}/host", + "port": f"{scope}/port", + "database": f"{scope}/database", + "user": f"{scope}/user", + "password": f"{scope}/password", + "encrypt": f"{scope}/encrypt", + "trustServerCertificate": f"{scope}/trustServerCertificate", + } + + def initial_setup(): pyspark_sql_session = MagicMock() spark = pyspark_sql_session.SparkSession.builder.getOrCreate() @@ -51,7 +64,8 @@ def test_get_jdbc_url_happy(): # initial setup engine, spark, ws, scope = initial_setup() # create object for TSQLServerDataSource - data_source = TSQLServerDataSource(engine, spark, ws, scope) + data_source = TSQLServerDataSource(engine, spark, ws) + data_source.load_credentials(ReconcileCredentialsConfig("databricks", mssql_creds(scope))) url = data_source.get_jdbc_url # Assert that the URL is generated correctly assert url == ( @@ -64,7 +78,8 @@ def test_read_data_with_options(): engine, spark, ws, scope = initial_setup() # create object for MSSQLServerDataSource - data_source = TSQLServerDataSource(engine, spark, ws, scope) + data_source = TSQLServerDataSource(engine, spark, ws) + data_source.load_credentials(ReconcileCredentialsConfig("databricks", mssql_creds(scope))) # Create a Tables configuration object with JDBC reader options table_conf = Table( source_name="src_supplier", @@ -104,13 +119,12 @@ def test_read_data_with_options(): def 
test_get_schema(): - # initial setup - engine, spark, ws, scope = initial_setup() - # Mocking get secret method to return the required values - data_source = TSQLServerDataSource(engine, spark, ws, scope) - # call test method + engine, spark, ws, _ = initial_setup() + data_source = TSQLServerDataSource(engine, spark, ws) + data_source.load_credentials(ReconcileCredentialsConfig("databricks", mssql_creds("scope"))) + data_source.get_schema("org", "schema", "supplier") - # spark assertions + spark.read.format.assert_called_with("jdbc") spark.read.format().option().option().option.assert_called_with( "dbtable", @@ -151,9 +165,9 @@ def test_get_schema(): def test_get_schema_exception_handling(): - # initial setup - engine, spark, ws, scope = initial_setup() - data_source = TSQLServerDataSource(engine, spark, ws, scope) + engine, spark, ws, _ = initial_setup() + data_source = TSQLServerDataSource(engine, spark, ws) + data_source.load_credentials(ReconcileCredentialsConfig("databricks", mssql_creds("scope"))) spark.read.format().option().option().option().options().load.side_effect = RuntimeError("Test Exception") @@ -168,9 +182,22 @@ def test_get_schema_exception_handling(): data_source.get_schema("org", "schema", "supplier") +def test_credentials_not_loaded_fails(): + engine, spark, ws, _ = initial_setup() + data_source = TSQLServerDataSource(engine, spark, ws) + + # Call the get_schema method with predefined table, schema, and catalog names and assert that a PySparkException + # is raised + with pytest.raises( + DataSourceRuntimeException, + match=re.escape("MS SQL/Synapse credentials have not been loaded. Please call load_credentials() first."), + ): + data_source.get_schema("org", "schema", "supplier") + + def test_normalize_identifier(): - engine, spark, ws, scope = initial_setup() - data_source = TSQLServerDataSource(engine, spark, ws, scope) + engine, spark, ws, _ = initial_setup() + data_source = TSQLServerDataSource(engine, spark, ws) assert data_source.normalize_identifier("a") == NormalizedIdentifier("`a`", "[a]") assert data_source.normalize_identifier('"b"') == NormalizedIdentifier("`b`", "[b]") diff --git a/tests/unit/reconcile/test_source_adapter.py b/tests/unit/reconcile/test_source_adapter.py index 5a9cc4032d..68b093e2da 100644 --- a/tests/unit/reconcile/test_source_adapter.py +++ b/tests/unit/reconcile/test_source_adapter.py @@ -15,10 +15,9 @@ def test_create_adapter_for_snowflake_dialect(): spark = create_autospec(DatabricksSession) engine = get_dialect("snowflake") ws = create_autospec(WorkspaceClient) - scope = "scope" - data_source = create_adapter(engine, spark, ws, scope) - snowflake_data_source = SnowflakeDataSource(engine, spark, ws, scope).__class__ + data_source = create_adapter(engine, spark, ws) + snowflake_data_source = SnowflakeDataSource(engine, spark, ws).__class__ assert isinstance(data_source, snowflake_data_source) @@ -27,10 +26,9 @@ def test_create_adapter_for_oracle_dialect(): spark = create_autospec(DatabricksSession) engine = get_dialect("oracle") ws = create_autospec(WorkspaceClient) - scope = "scope" - data_source = create_adapter(engine, spark, ws, scope) - oracle_data_source = OracleDataSource(engine, spark, ws, scope).__class__ + data_source = create_adapter(engine, spark, ws) + oracle_data_source = OracleDataSource(engine, spark, ws).__class__ assert isinstance(data_source, oracle_data_source) @@ -39,10 +37,9 @@ def test_create_adapter_for_databricks_dialect(): spark = create_autospec(DatabricksSession) engine = get_dialect("databricks") ws = 
create_autospec(WorkspaceClient) - scope = "scope" - data_source = create_adapter(engine, spark, ws, scope) - databricks_data_source = DatabricksDataSource(engine, spark, ws, scope).__class__ + data_source = create_adapter(engine, spark, ws) + databricks_data_source = DatabricksDataSource(engine, spark, ws).__class__ assert isinstance(data_source, databricks_data_source) @@ -51,7 +48,6 @@ def test_raise_exception_for_unknown_dialect(): spark = create_autospec(DatabricksSession) engine = get_dialect("trino") ws = create_autospec(WorkspaceClient) - scope = "scope" with pytest.raises(ValueError, match=f"Unsupported source type --> {engine}"): - create_adapter(engine, spark, ws, scope) + create_adapter(engine, spark, ws) From 2723dc28f189f9551733e7ccd5b8d698fc476766 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Mon, 22 Dec 2025 13:43:38 +0100 Subject: [PATCH 36/44] [WIP] introduce credentials.py module and migrate secret scope to creds --- src/databricks/labs/lakebridge/config.py | 24 ++---- .../reconcile/connectors/credentials.py | 77 +++++++++++++++++++ .../reconcile/connectors/data_source.py | 2 +- .../reconcile/connectors/databricks.py | 2 +- .../lakebridge/reconcile/connectors/oracle.py | 31 ++------ .../reconcile/connectors/snowflake.py | 37 ++------- .../lakebridge/reconcile/connectors/tsql.py | 33 ++------ .../unit/reconcile/connectors/test_oracle.py | 2 +- .../reconcile/connectors/test_snowflake.py | 2 +- .../reconcile/connectors/test_sql_server.py | 2 +- 10 files changed, 108 insertions(+), 104 deletions(-) create mode 100644 src/databricks/labs/lakebridge/reconcile/connectors/credentials.py diff --git a/src/databricks/labs/lakebridge/config.py b/src/databricks/labs/lakebridge/config.py index 9f49f89a35..dd24eaad4b 100644 --- a/src/databricks/labs/lakebridge/config.py +++ b/src/databricks/labs/lakebridge/config.py @@ -7,6 +7,8 @@ from databricks.labs.blueprint.installation import JsonValue from databricks.labs.blueprint.tui import Prompts + +from databricks.labs.lakebridge.reconcile.connectors.credentials import build_source_creds, ReconcileCredentialsConfig from databricks.labs.lakebridge.transpiler.transpile_status import TranspileError from databricks.labs.lakebridge.reconcile.recon_config import Table @@ -251,20 +253,6 @@ class ReconcileMetadataConfig: volume: str = "reconcile_volume" -@dataclass -class ReconcileCredentialsConfig: - vault_type: str - vault_secret_names: dict[str, str] - - def __post_init__(self): - if self.vault_type != "databricks": - raise ValueError(f"Unsupported vault_type: {self.vault_type}") - - def get_databricks_secret_scope(self) -> str: - """Utility to support older installations that only allowed secret scopes.""" - return self.vault_secret_names["__secret_scope"] - - @dataclass class ReconcileConfig: __file__ = "reconcile.yml" @@ -278,9 +266,13 @@ class ReconcileConfig: @classmethod def v1_migrate(cls, raw: dict[str, JsonValue]) -> dict[str, JsonValue]: - secret_scope = raw.pop("secret_scope") + secret_scope = str(raw.pop("secret_scope")) + data_source = str(raw["data_source"]) raw["version"] = 2 - raw["creds"] = {"vault_type": "databricks", "vault_secret_names": {"__secret_scope": secret_scope}} + raw["creds"] = { + "vault_type": "databricks", + "vault_secret_names": build_source_creds(data_source, secret_scope), + } return raw diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/credentials.py b/src/databricks/labs/lakebridge/reconcile/connectors/credentials.py new file mode 100644 index 0000000000..300d6f2d5a --- /dev/null +++ 
b/src/databricks/labs/lakebridge/reconcile/connectors/credentials.py @@ -0,0 +1,77 @@ +from dataclasses import dataclass + +from databricks.sdk import WorkspaceClient + +from databricks.labs.lakebridge.connections.credential_manager import build_credentials, CredentialManager + + +@dataclass +class ReconcileCredentialsConfig: + vault_type: str + vault_secret_names: dict[str, str] + + def __post_init__(self): + if self.vault_type != "databricks": + raise ValueError(f"Unsupported vault_type: {self.vault_type}") + + +_REQUIRED_JDBC_CREDS = [ + "host", + "port", + "database", + "user", + "password", +] + +_TSQL_REQUIRED_CREDS = [*_REQUIRED_JDBC_CREDS, "encrypt", "trustServerCertificate"] + +_ORACLE_REQUIRED_CREDS = [*_REQUIRED_JDBC_CREDS] + +_SNOWFLAKE_REQUIRED_CREDS = [ + "sfUser", + "sfUrl", + "sfDatabase", + "sfSchema", + "sfWarehouse", + "sfRole", + "sfPassword", +] + +_SOURCE_CREDENTIALS_MAP = { + "databricks": [], + "snowflake": _SNOWFLAKE_REQUIRED_CREDS, + "oracle": _ORACLE_REQUIRED_CREDS, + "tsql": _TSQL_REQUIRED_CREDS, + "synapse": _TSQL_REQUIRED_CREDS, +} + + +def build_source_creds(source: str, secret_scope: str) -> dict: + keys = _SOURCE_CREDENTIALS_MAP.get(source) + if not keys: + raise ValueError(f"Unsupported source system: {source}") + return {key: f"{secret_scope}/{key}" for key in keys} + + +def validate_creds(creds: ReconcileCredentialsConfig, source: str) -> None: + required_keys = _SOURCE_CREDENTIALS_MAP.get(source) + if not required_keys: + raise ValueError(f"Unsupported source system: {source}") + + missing = [k for k in required_keys if not creds.vault_secret_names.get(k)] + if missing: + raise ValueError( + f"Missing mandatory {source} credentials. " f"Please configure all of {required_keys}. Missing: {missing}" + ) + + +def load_and_validate_credentials( + creds: ReconcileCredentialsConfig, + ws: WorkspaceClient, + source: str, +) -> dict[str, str]: + validate_creds(creds, source) + + parsed = build_credentials(creds.vault_type, source, creds.vault_secret_names) + resolved = CredentialManager.from_credentials(parsed, ws).get_credentials(source) + return resolved diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/data_source.py b/src/databricks/labs/lakebridge/reconcile/connectors/data_source.py index 0de291b072..abcccddb85 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/data_source.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/data_source.py @@ -3,7 +3,7 @@ from pyspark.sql import DataFrame -from databricks.labs.lakebridge.config import ReconcileCredentialsConfig +from databricks.labs.lakebridge.reconcile.connectors.credentials import ReconcileCredentialsConfig from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.exception import DataSourceRuntimeException from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py b/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py index 07cddaccb0..40dbcd2afe 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/databricks.py @@ -7,7 +7,7 @@ from pyspark.sql.functions import col from sqlglot import Dialect -from databricks.labs.lakebridge.config import ReconcileCredentialsConfig +from databricks.labs.lakebridge.reconcile.connectors.credentials import ReconcileCredentialsConfig from 
databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.recon_config import JdbcReaderOptions, Schema diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py b/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py index 2787d1c4cb..760b284d2b 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/oracle.py @@ -8,8 +8,10 @@ from pyspark.sql.functions import col from sqlglot import Dialect -from databricks.labs.lakebridge.config import ReconcileCredentialsConfig -from databricks.labs.lakebridge.connections.credential_manager import build_credentials, CredentialManager +from databricks.labs.lakebridge.reconcile.connectors.credentials import ( + load_and_validate_credentials, + ReconcileCredentialsConfig, +) from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier @@ -119,30 +121,7 @@ def reader(self, query: str, options: Mapping[str, OptionalPrimitiveType] | None ) def load_credentials(self, creds: ReconcileCredentialsConfig) -> "OracleDataSource": - connector_creds = [ - "host", - "port", - "database", - "user", - "password", - ] - - use_scope = creds.vault_secret_names.get("__secret_scope") - if use_scope: - vault_secret_names = {key: f"{use_scope}/{key}" for key in connector_creds} - logger.warning( - f"Secret scope configuration is deprecated. Please refer to the docs {self._DOCS_URL} to update." - ) - - assert creds.vault_type == "databricks", "Secret scope provided, vault_type must be 'databricks'" - parsed_creds = build_credentials(creds.vault_type, "oracle", vault_secret_names) - else: - parsed_creds = build_credentials(creds.vault_type, "oracle", creds.vault_secret_names) - - self._creds_or_empty = CredentialManager.from_credentials(parsed_creds, self._ws).get_credentials("oracle") - assert all( - self._creds.get(k) for k in connector_creds - ), f"Missing mandatory Oracle credentials. Please configure all of {connector_creds}." 
+ self._creds_or_empty = load_and_validate_credentials(creds, self._ws, "oracle") return self diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py b/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py index eb2a77816a..855bb925d6 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py @@ -9,8 +9,10 @@ from cryptography.hazmat.backends import default_backend from cryptography.hazmat.primitives import serialization -from databricks.labs.lakebridge.config import ReconcileCredentialsConfig -from databricks.labs.lakebridge.connections.credential_manager import build_credentials, CredentialManager +from databricks.labs.lakebridge.reconcile.connectors.credentials import ( + load_and_validate_credentials, + ReconcileCredentialsConfig, +) from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier @@ -63,37 +65,14 @@ def _creds(self): raise RuntimeError("Snowflake credentials have not been loaded. Please call load_credentials() first.") def load_credentials(self, creds: ReconcileCredentialsConfig) -> "SnowflakeDataSource": - connector_creds = [ - "sfUser", - "sfUrl", - "sfDatabase", - "sfSchema", - "sfWarehouse", - "sfRole", - ] - - use_scope = creds.vault_secret_names.get("__secret_scope") - if use_scope: - # to use pem key and/or pem password, migrate to vault_secret_names approach - logger.warning( - f"Secret scope configuration is deprecated. Using secret scopes supports password authentication only. Please refer to the docs {self._DOCS_URL} to update and to access full features." - ) - connector_creds += ["sfPassword"] - vault_secret_names = {key: f"{use_scope}/{key}" for key in connector_creds} - - assert creds.vault_type == "databricks", "Secret scope provided, vault_type must be 'databricks'" - parsed_creds = build_credentials(creds.vault_type, "snowflake", vault_secret_names) - else: - parsed_creds = build_credentials(creds.vault_type, "snowflake", creds.vault_secret_names) - - self._creds_or_empty = CredentialManager.from_credentials(parsed_creds, self._ws).get_credentials("snowflake") - assert all( - self._creds.get(k) for k in connector_creds - ), f"Missing mandatory Snowflake credentials. Please configure all of {connector_creds}." + self._creds_or_empty = load_and_validate_credentials(creds, self._ws, "snowflake") + + # Ensure at least one authentication method is provided assert any( self._creds.get(k) for k in ("sfPassword", "pem_private_key") ), "Missing Snowflake credentials. Please configure any of [sfPassword, pem_private_key]." 
+ # Process PEM private key if provided if self._creds.get("pem_private_key"): self._creds["pem_private_key"] = SnowflakeDataSource._get_private_key( self._creds["pem_private_key"], diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py b/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py index 704cb93231..f569c7b5ce 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py @@ -8,8 +8,10 @@ from pyspark.sql.functions import col from sqlglot import Dialect -from databricks.labs.lakebridge.config import ReconcileCredentialsConfig -from databricks.labs.lakebridge.connections.credential_manager import build_credentials, CredentialManager +from databricks.labs.lakebridge.reconcile.connectors.credentials import ( + load_and_validate_credentials, + ReconcileCredentialsConfig, +) from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource from databricks.labs.lakebridge.reconcile.connectors.jdbc_reader import JDBCReaderMixin from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier @@ -109,32 +111,7 @@ def read_data( return self.log_and_throw_exception(e, "data", table_query) def load_credentials(self, creds: ReconcileCredentialsConfig) -> "TSQLServerDataSource": - connector_creds = [ - "host", - "port", - "database", - "user", - "password", - "encrypt", - "trustServerCertificate", - ] - - use_scope = creds.vault_secret_names.get("__secret_scope") - if use_scope: - logger.warning( - f"Secret scope configuration is deprecated. Please refer to the docs {self._DOCS_URL} to update." - ) - vault_secret_names = {key: f"{use_scope}/{key}" for key in connector_creds} - - assert creds.vault_type == "databricks", "Secret scope provided, vault_type must be 'databricks'" - parsed_creds = build_credentials(creds.vault_type, "mssql", vault_secret_names) - else: - parsed_creds = build_credentials(creds.vault_type, "mssql", creds.vault_secret_names) - - self._creds_or_empty = CredentialManager.from_credentials(parsed_creds, self._ws).get_credentials("mssql") - assert all( - self._creds.get(k) for k in connector_creds - ), f"Missing mandatory MS SQL credentials. Please configure all of {connector_creds}." 
+ self._creds_or_empty = load_and_validate_credentials(creds, self._ws, "tsql") return self diff --git a/tests/unit/reconcile/connectors/test_oracle.py b/tests/unit/reconcile/connectors/test_oracle.py index 4e45da62c6..d5d3a39b94 100644 --- a/tests/unit/reconcile/connectors/test_oracle.py +++ b/tests/unit/reconcile/connectors/test_oracle.py @@ -4,7 +4,7 @@ import pytest -from databricks.labs.lakebridge.config import ReconcileCredentialsConfig +from databricks.labs.lakebridge.reconcile.connectors.credentials import ReconcileCredentialsConfig from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import NormalizedIdentifier from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect from databricks.labs.lakebridge.reconcile.connectors.oracle import OracleDataSource diff --git a/tests/unit/reconcile/connectors/test_snowflake.py b/tests/unit/reconcile/connectors/test_snowflake.py index 3674bd5340..ca46eeecfc 100644 --- a/tests/unit/reconcile/connectors/test_snowflake.py +++ b/tests/unit/reconcile/connectors/test_snowflake.py @@ -6,7 +6,7 @@ from cryptography.hazmat.primitives.asymmetric import rsa from cryptography.hazmat.primitives import serialization -from databricks.labs.lakebridge.config import ReconcileCredentialsConfig +from databricks.labs.lakebridge.reconcile.connectors.credentials import ReconcileCredentialsConfig from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import NormalizedIdentifier from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect from databricks.labs.lakebridge.reconcile.connectors.snowflake import SnowflakeDataSource diff --git a/tests/unit/reconcile/connectors/test_sql_server.py b/tests/unit/reconcile/connectors/test_sql_server.py index f21bffc5cf..28cacc4b5a 100644 --- a/tests/unit/reconcile/connectors/test_sql_server.py +++ b/tests/unit/reconcile/connectors/test_sql_server.py @@ -4,7 +4,7 @@ import pytest -from databricks.labs.lakebridge.config import ReconcileCredentialsConfig +from databricks.labs.lakebridge.reconcile.connectors.credentials import ReconcileCredentialsConfig from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import NormalizedIdentifier from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import get_dialect from databricks.labs.lakebridge.reconcile.connectors.tsql import TSQLServerDataSource From 234185ff985d160fd62683aec5ffa953c217bb5d Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Mon, 22 Dec 2025 15:07:35 +0100 Subject: [PATCH 37/44] [WIP] handle snowflake auth --- .../lakebridge/reconcile/connectors/credentials.py | 12 ++++++++++-- .../lakebridge/reconcile/connectors/snowflake.py | 8 ++++++++ tests/conftest.py | 12 +++++++++++- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/credentials.py b/src/databricks/labs/lakebridge/reconcile/connectors/credentials.py index 300d6f2d5a..844f34df30 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/credentials.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/credentials.py @@ -1,9 +1,12 @@ +import logging from dataclasses import dataclass from databricks.sdk import WorkspaceClient from databricks.labs.lakebridge.connections.credential_manager import build_credentials, CredentialManager +logger = logging.getLogger(__name__) + @dataclass class ReconcileCredentialsConfig: @@ -34,7 +37,7 @@ def __post_init__(self): "sfSchema", "sfWarehouse", "sfRole", - "sfPassword", + # sfPassword is not required here; auth is 
validated separately ] _SOURCE_CREDENTIALS_MAP = { @@ -50,7 +53,12 @@ def build_source_creds(source: str, secret_scope: str) -> dict: keys = _SOURCE_CREDENTIALS_MAP.get(source) if not keys: raise ValueError(f"Unsupported source system: {source}") - return {key: f"{secret_scope}/{key}" for key in keys} + parsed = {key: f"{secret_scope}/{key}" for key in keys} + if source == "snowflake": + logger.warning("Please specify the Snowflake authentication method in the credentials config.") + parsed["pem_private_key"] = f"{secret_scope}/pem_private_key" + parsed["sfPassword"] = f"{secret_scope}/sfPassword" + return parsed def validate_creds(creds: ReconcileCredentialsConfig, source: str) -> None: diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py b/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py index 855bb925d6..12159c050c 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py @@ -65,6 +65,14 @@ def _creds(self): raise RuntimeError("Snowflake credentials have not been loaded. Please call load_credentials() first.") def load_credentials(self, creds: ReconcileCredentialsConfig) -> "SnowflakeDataSource": + password = creds.vault_secret_names.pop("sfPassword", None) + pem_key = creds.vault_secret_names.pop("pem_private_key", None) + if password and pem_key: # user did not specify auth method after migrating from secret scope + logger.warning( + f"Snowflake auth not specified after migrating from secret scope so defaulting to sfPassword. " + f"Please update the creds config and include the necessary keys. Docs: {self._DOCS_URL}." + ) + creds.vault_secret_names["sfPassword"] = password self._creds_or_empty = load_and_validate_credentials(creds, self._ws, "snowflake") # Ensure at least one authentication method is provided diff --git a/tests/conftest.py b/tests/conftest.py index 98bf0dfefb..11d650def0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -432,11 +432,21 @@ def table_schema_tsql_ansi(table_schema): @pytest.fixture def reconcile_config(datasource: str) -> ReconcileConfig: + return ReconcileConfig( data_source=datasource, report_type="all", creds=ReconcileCredentialsConfig( - vault_type="databricks", vault_secret_names={"__secret_scope": f"remorph_{datasource}"} + vault_type="databricks", + vault_secret_names={ + "sfDatabase": "remorph_snowflake/sfDatabase", + "sfPassword": "remorph_snowflake/sfPassword", + "sfRole": "remorph_snowflake/sfRole", + "sfSchema": "remorph_snowflake/sfSchema", + "sfUrl": "remorph_snowflake/sfUrl", + "sfUser": "remorph_snowflake/sfUser", + "sfWarehouse": "remorph_snowflake/sfWarehouse", + }, ), database_config=DatabaseConfig( source_schema="tpch_sf1000", From 4ae37c5e9e6a2133ac1b74bf0b91c5042439e622 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Mon, 22 Dec 2025 16:59:18 +0100 Subject: [PATCH 38/44] fix snowflake auth handling --- .../labs/lakebridge/reconcile/connectors/snowflake.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py b/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py index 12159c050c..6acd786681 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/snowflake.py @@ -65,14 +65,15 @@ def _creds(self): raise RuntimeError("Snowflake credentials have not been loaded. 
Please call load_credentials() first.") def load_credentials(self, creds: ReconcileCredentialsConfig) -> "SnowflakeDataSource": - password = creds.vault_secret_names.pop("sfPassword", None) - pem_key = creds.vault_secret_names.pop("pem_private_key", None) + password = creds.vault_secret_names.get("sfPassword") + pem_key = creds.vault_secret_names.get("pem_private_key") if password and pem_key: # user did not specify auth method after migrating from secret scope logger.warning( f"Snowflake auth not specified after migrating from secret scope so defaulting to sfPassword. " f"Please update the creds config and include the necessary keys. Docs: {self._DOCS_URL}." ) - creds.vault_secret_names["sfPassword"] = password + creds.vault_secret_names.pop("pem_private_key") + self._creds_or_empty = load_and_validate_credentials(creds, self._ws, "snowflake") # Ensure at least one authentication method is provided From 611aef141121cdc5c95a85451e3936f7c8c97f69 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Mon, 22 Dec 2025 17:00:05 +0100 Subject: [PATCH 39/44] make credentials optional and create test fixtures correctly for green tests --- src/databricks/labs/lakebridge/config.py | 15 ++++--- .../labs/lakebridge/deployment/recon.py | 9 ++-- src/databricks/labs/lakebridge/install.py | 7 ++-- .../reconcile/connectors/credentials.py | 9 +++- .../labs/lakebridge/reconcile/utils.py | 7 ++-- tests/conftest.py | 42 +++++++++---------- tests/unit/contexts/test_application.py | 3 +- 7 files changed, 50 insertions(+), 42 deletions(-) diff --git a/src/databricks/labs/lakebridge/config.py b/src/databricks/labs/lakebridge/config.py index dd24eaad4b..1250e79d08 100644 --- a/src/databricks/labs/lakebridge/config.py +++ b/src/databricks/labs/lakebridge/config.py @@ -8,7 +8,7 @@ from databricks.labs.blueprint.installation import JsonValue from databricks.labs.blueprint.tui import Prompts -from databricks.labs.lakebridge.reconcile.connectors.credentials import build_source_creds, ReconcileCredentialsConfig +from databricks.labs.lakebridge.reconcile.connectors.credentials import build_recon_creds, ReconcileCredentialsConfig from databricks.labs.lakebridge.transpiler.transpile_status import TranspileError from databricks.labs.lakebridge.reconcile.recon_config import Table @@ -260,19 +260,22 @@ class ReconcileConfig: data_source: str report_type: str - creds: ReconcileCredentialsConfig database_config: DatabaseConfig metadata_config: ReconcileMetadataConfig + creds: ReconcileCredentialsConfig | None + # databricks does not require creds @classmethod def v1_migrate(cls, raw: dict[str, JsonValue]) -> dict[str, JsonValue]: secret_scope = str(raw.pop("secret_scope")) data_source = str(raw["data_source"]) + maybe_creds = build_recon_creds(data_source, secret_scope) + if maybe_creds: + raw["creds"] = { + "vault_secret_names": dict(maybe_creds.vault_secret_names), + "vault_type": maybe_creds.vault_type, + } raw["version"] = 2 - raw["creds"] = { - "vault_type": "databricks", - "vault_secret_names": build_source_creds(data_source, secret_scope), - } return raw diff --git a/src/databricks/labs/lakebridge/deployment/recon.py b/src/databricks/labs/lakebridge/deployment/recon.py index 4cbe51e24b..9dbd1bb337 100644 --- a/src/databricks/labs/lakebridge/deployment/recon.py +++ b/src/databricks/labs/lakebridge/deployment/recon.py @@ -60,10 +60,11 @@ def uninstall(self, recon_config: ReconcileConfig | None): f"Won't remove reconcile metadata schema `{recon_config.metadata_config.schema}` " f"from catalog 
`{recon_config.metadata_config.catalog}`. Please remove it and the tables inside manually." ) - logging.info( - f"Won't remove configured reconcile credentials from `{recon_config.creds.vault_type}`. " - f"Please remove it manually." - ) + if recon_config.creds: + logging.info( + f"Won't remove configured reconcile credentials from `{recon_config.creds.vault_type}`. " + f"Please remove it manually." + ) def _deploy_tables(self, recon_config: ReconcileConfig): logger.info("Deploying reconciliation metadata tables.") diff --git a/src/databricks/labs/lakebridge/install.py b/src/databricks/labs/lakebridge/install.py index 0d1970a1b0..ea4458f893 100644 --- a/src/databricks/labs/lakebridge/install.py +++ b/src/databricks/labs/lakebridge/install.py @@ -20,11 +20,11 @@ LakebridgeConfiguration, ReconcileMetadataConfig, TranspileConfig, - ReconcileCredentialsConfig, ) from databricks.labs.lakebridge.contexts.application import ApplicationContext from databricks.labs.lakebridge.deployment.configurator import ResourceConfigurator from databricks.labs.lakebridge.deployment.installation import WorkspaceInstallation +from databricks.labs.lakebridge.reconcile.connectors.credentials import build_recon_creds from databricks.labs.lakebridge.reconcile.constants import ReconReportType, ReconSourceType from databricks.labs.lakebridge.transpiler.installers import ( BladebridgeInstaller, @@ -330,6 +330,7 @@ def _prompt_for_new_reconcile_installation(self) -> ReconcileConfig: f"Enter Secret scope name to store `{data_source.capitalize()}` connection details / secrets", default=f"remorph_{data_source}", ) + creds = build_recon_creds(data_source, scope_name) db_config = self._prompt_for_reconcile_database_config(data_source) metadata_config = self._prompt_for_reconcile_metadata_config() @@ -337,9 +338,7 @@ def _prompt_for_new_reconcile_installation(self) -> ReconcileConfig: return ReconcileConfig( data_source=data_source, report_type=report_type, - creds=ReconcileCredentialsConfig( - vault_type="databricks", vault_secret_names={"__secret_scope": scope_name} - ), + creds=creds, database_config=db_config, metadata_config=metadata_config, ) diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/credentials.py b/src/databricks/labs/lakebridge/reconcile/connectors/credentials.py index 844f34df30..b9f028d988 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/credentials.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/credentials.py @@ -49,16 +49,21 @@ def __post_init__(self): } -def build_source_creds(source: str, secret_scope: str) -> dict: +def build_recon_creds(source: str, secret_scope: str) -> ReconcileCredentialsConfig | None: + if source == "databricks": + return None + keys = _SOURCE_CREDENTIALS_MAP.get(source) if not keys: raise ValueError(f"Unsupported source system: {source}") parsed = {key: f"{secret_scope}/{key}" for key in keys} + if source == "snowflake": logger.warning("Please specify the Snowflake authentication method in the credentials config.") parsed["pem_private_key"] = f"{secret_scope}/pem_private_key" parsed["sfPassword"] = f"{secret_scope}/sfPassword" - return parsed + + return ReconcileCredentialsConfig("databricks", parsed) def validate_creds(creds: ReconcileCredentialsConfig, source: str) -> None: diff --git a/src/databricks/labs/lakebridge/reconcile/utils.py b/src/databricks/labs/lakebridge/reconcile/utils.py index 7389dfe131..8d43445237 100644 --- a/src/databricks/labs/lakebridge/reconcile/utils.py +++ b/src/databricks/labs/lakebridge/reconcile/utils.py @@ 
-17,12 +17,13 @@ def initialise_data_source( ws: WorkspaceClient, spark: SparkSession, engine: str, - creds: ReconcileCredentialsConfig, + creds: ReconcileCredentialsConfig | None, ): source = create_adapter(engine=get_dialect(engine), spark=spark, ws=ws) target = create_adapter(engine=get_dialect("databricks"), spark=spark, ws=ws) - source.load_credentials(creds) - target.load_credentials(creds) + if engine != "databricks": + assert creds, "Credentials must be provided for non-Databricks sources" + source.load_credentials(creds) return source, target diff --git a/tests/conftest.py b/tests/conftest.py index 11d650def0..105f1e8c6c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,6 +23,7 @@ DatabaseConfig, ReconcileMetadataConfig, ) +from databricks.labs.lakebridge.reconcile.connectors.credentials import build_recon_creds from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils, NormalizedIdentifier from databricks.labs.lakebridge.reconcile.connectors.data_source import DataSource, MockDataSource from databricks.labs.lakebridge.reconcile.recon_config import ( @@ -431,23 +432,17 @@ def table_schema_tsql_ansi(table_schema): @pytest.fixture -def reconcile_config(datasource: str) -> ReconcileConfig: +def secret_scope(datasource: str) -> str: + return f"remorph_{datasource}" + + +@pytest.fixture +def reconcile_config(datasource: str, secret_scope: str) -> ReconcileConfig: return ReconcileConfig( data_source=datasource, report_type="all", - creds=ReconcileCredentialsConfig( - vault_type="databricks", - vault_secret_names={ - "sfDatabase": "remorph_snowflake/sfDatabase", - "sfPassword": "remorph_snowflake/sfPassword", - "sfRole": "remorph_snowflake/sfRole", - "sfSchema": "remorph_snowflake/sfSchema", - "sfUrl": "remorph_snowflake/sfUrl", - "sfUser": "remorph_snowflake/sfUser", - "sfWarehouse": "remorph_snowflake/sfWarehouse", - }, - ), + creds=build_recon_creds(datasource, secret_scope), database_config=DatabaseConfig( source_schema="tpch_sf1000", target_catalog="tpch", @@ -463,12 +458,12 @@ def reconcile_config(datasource: str) -> ReconcileConfig: @pytest.fixture -def reconcile_config_v1_yml(datasource: str) -> dict: +def reconcile_config_v1_yml(datasource: str, secret_scope: str) -> dict: return { "reconcile.yml": { "data_source": datasource, "report_type": "all", - "secret_scope": f"remorph_{datasource}", # v1 + "secret_scope": secret_scope, # v1 "database_config": { "source_catalog": f"{datasource}_sample_data", "source_schema": "tpch_sf1000", @@ -486,14 +481,10 @@ def reconcile_config_v1_yml(datasource: str) -> dict: @pytest.fixture -def reconcile_config_v2_yml(datasource: str) -> dict: - return { +def reconcile_config_v2_yml(datasource: str, secret_scope: str) -> dict: + yml = { "data_source": datasource, "report_type": "all", - "creds": { - "vault_type": "databricks", - "vault_secret_names": {"__secret_scope": f"remorph_{datasource}"}, - }, "database_config": { "source_catalog": f"{datasource}_sample_data", "source_schema": "tpch_sf1000", @@ -507,3 +498,12 @@ def reconcile_config_v2_yml(datasource: str) -> dict: }, "version": 2, } + + maybe_creds = build_recon_creds(datasource, secret_scope) + if maybe_creds: + yml["creds"] = { + "vault_secret_names": dict(maybe_creds.vault_secret_names), + "vault_type": maybe_creds.vault_type, + } + + return yml diff --git a/tests/unit/contexts/test_application.py b/tests/unit/contexts/test_application.py index 79ac8eaf5c..bfbc0d45b7 100644 --- a/tests/unit/contexts/test_application.py +++ 
b/tests/unit/contexts/test_application.py @@ -62,7 +62,7 @@ def test_workspace_context_attributes_not_none(ws): "target_schema": "1000gb", }, "report_type": "all", - "secret_scope": "remorph_snowflake", + "secret_scope": "remorph_snowflake", # v1 "tables": { "filter_type": "exclude", "tables_list": ["ORDERS", "PART"], @@ -72,7 +72,6 @@ def test_workspace_context_attributes_not_none(ws): "schema": "reconcile", "volume": "reconcile_volume", }, - "job_id": "12345", # removed as it was never used "version": 1, }, "state.json": { From ac95e5281da77f73b85b71c74c8ae8d482784e84 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Wed, 24 Dec 2025 13:47:45 +0100 Subject: [PATCH 40/44] fix serializing dataclass in production and tests --- src/databricks/labs/lakebridge/config.py | 9 +++------ .../labs/lakebridge/reconcile/connectors/credentials.py | 2 +- .../labs/lakebridge/reconcile/connectors/tsql.py | 2 +- tests/conftest.py | 8 +++----- tests/unit/test_config.py | 3 ++- tests/unit/test_install.py | 2 +- 6 files changed, 11 insertions(+), 15 deletions(-) diff --git a/src/databricks/labs/lakebridge/config.py b/src/databricks/labs/lakebridge/config.py index 1250e79d08..bb51c422fc 100644 --- a/src/databricks/labs/lakebridge/config.py +++ b/src/databricks/labs/lakebridge/config.py @@ -1,6 +1,6 @@ import logging from collections.abc import Mapping, Sequence -from dataclasses import dataclass +from dataclasses import dataclass, asdict from enum import Enum, auto from pathlib import Path from typing import Any, Literal, TypeVar, cast @@ -262,7 +262,7 @@ class ReconcileConfig: report_type: str database_config: DatabaseConfig metadata_config: ReconcileMetadataConfig - creds: ReconcileCredentialsConfig | None + creds: ReconcileCredentialsConfig | None = None # databricks does not require creds @classmethod @@ -271,10 +271,7 @@ def v1_migrate(cls, raw: dict[str, JsonValue]) -> dict[str, JsonValue]: data_source = str(raw["data_source"]) maybe_creds = build_recon_creds(data_source, secret_scope) if maybe_creds: - raw["creds"] = { - "vault_secret_names": dict(maybe_creds.vault_secret_names), - "vault_type": maybe_creds.vault_type, - } + raw["creds"] = asdict(maybe_creds) raw["version"] = 2 return raw diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/credentials.py b/src/databricks/labs/lakebridge/reconcile/connectors/credentials.py index b9f028d988..e33dac122e 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/credentials.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/credentials.py @@ -44,7 +44,7 @@ def __post_init__(self): "databricks": [], "snowflake": _SNOWFLAKE_REQUIRED_CREDS, "oracle": _ORACLE_REQUIRED_CREDS, - "tsql": _TSQL_REQUIRED_CREDS, + "mssql": _TSQL_REQUIRED_CREDS, "synapse": _TSQL_REQUIRED_CREDS, } diff --git a/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py b/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py index f569c7b5ce..8dbdd3eb4d 100644 --- a/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py +++ b/src/databricks/labs/lakebridge/reconcile/connectors/tsql.py @@ -111,7 +111,7 @@ def read_data( return self.log_and_throw_exception(e, "data", table_query) def load_credentials(self, creds: ReconcileCredentialsConfig) -> "TSQLServerDataSource": - self._creds_or_empty = load_and_validate_credentials(creds, self._ws, "tsql") + self._creds_or_empty = load_and_validate_credentials(creds, self._ws, "mssql") return self diff --git a/tests/conftest.py b/tests/conftest.py index 105f1e8c6c..29b85a5c45 100644 --- a/tests/conftest.py +++ 
b/tests/conftest.py @@ -1,5 +1,6 @@ from pathlib import Path from unittest.mock import create_autospec +from dataclasses import asdict import pytest from pyspark.sql import DataFrame @@ -463,7 +464,7 @@ def reconcile_config_v1_yml(datasource: str, secret_scope: str) -> dict: "reconcile.yml": { "data_source": datasource, "report_type": "all", - "secret_scope": secret_scope, # v1 + "secret_scope": secret_scope, "database_config": { "source_catalog": f"{datasource}_sample_data", "source_schema": "tpch_sf1000", @@ -501,9 +502,6 @@ def reconcile_config_v2_yml(datasource: str, secret_scope: str) -> dict: maybe_creds = build_recon_creds(datasource, secret_scope) if maybe_creds: - yml["creds"] = { - "vault_secret_names": dict(maybe_creds.vault_secret_names), - "vault_type": maybe_creds.vault_type, - } + yml["creds"] = asdict(maybe_creds) return yml diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index 033cd1b1e1..9e1ab4a82d 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -7,6 +7,7 @@ TableRecon, ReconcileConfig, ) +from databricks.labs.lakebridge.reconcile.constants import ReconSourceType from databricks.labs.lakebridge.reconcile.recon_config import Table @@ -103,7 +104,7 @@ def test_reconcile_table_config_default_serialization() -> None: assert loaded.tables == config.tables -@pytest.mark.parametrize("datasource", ["snowflake"]) +@pytest.mark.parametrize("datasource", [source.value for source in ReconSourceType]) def test_reconcile_config_default_serialization( datasource, reconcile_config: ReconcileConfig, reconcile_config_v1_yml: dict ) -> None: diff --git a/tests/unit/test_install.py b/tests/unit/test_install.py index 54eea46669..7cba331d02 100644 --- a/tests/unit/test_install.py +++ b/tests/unit/test_install.py @@ -553,7 +553,7 @@ def test_configure_transpile_installation_with_validation_and_warehouse_id_from_ ) -@pytest.mark.parametrize("datasource", ["snowflake"]) +@pytest.mark.parametrize("datasource", [source.value for source in ReconSourceType]) def test_configure_reconcile_installation_no_override(ws: WorkspaceClient, reconcile_config_v1_yml: dict) -> None: prompts = MockPrompts( { From 204af6bc72f2a19c0dd9e57a66e7b321191d783c Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Wed, 24 Dec 2025 14:22:33 +0100 Subject: [PATCH 41/44] add more tests --- tests/unit/reconcile/test_credentials.py | 67 ++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 tests/unit/reconcile/test_credentials.py diff --git a/tests/unit/reconcile/test_credentials.py b/tests/unit/reconcile/test_credentials.py new file mode 100644 index 0000000000..a7b9db1dac --- /dev/null +++ b/tests/unit/reconcile/test_credentials.py @@ -0,0 +1,67 @@ +import logging +import pytest + +from databricks.labs.lakebridge.reconcile.connectors.credentials import ( + ReconcileCredentialsConfig, + build_recon_creds, + validate_creds, +) + + +def test_databricks_source_returns_none(): + assert build_recon_creds("databricks", "scope") is None + + +def test_build_unsupported_source_raises(): + with pytest.raises(ValueError, match="Unsupported source system: unknown"): + build_recon_creds("unknown", "scope") + + +@pytest.mark.parametrize("source", ["oracle", "mssql", "synapse"]) +def test_non_snowflake_sources_build_expected_mapping(source): + scope = "my-scope" + cfg = build_recon_creds(source, scope) + + assert isinstance(cfg, ReconcileCredentialsConfig) + assert cfg.vault_type == "databricks" + + required = [ + "host", + "port", + "database", + "user", + "password", + ] + for 
k in required: + assert cfg.vault_secret_names[k] == f"{scope}/{k}" + + +def test_snowflake_adds_extra_keys_and_logs_warning(caplog): + logger = "databricks.labs.lakebridge.reconcile.connectors.credentials" + scope = "sf-scope" + with caplog.at_level(logging.WARNING, logger): + cfg = build_recon_creds("snowflake", scope) + + # warning logged + assert any("Please specify the Snowflake authentication method" in r.message for r in caplog.records) + + # snowflake adds pem_private_key and sfPassword + assert cfg.vault_secret_names["pem_private_key"] == f"{scope}/pem_private_key" + assert cfg.vault_secret_names["sfPassword"] == f"{scope}/sfPassword" + + +def test_validate_unsupported_source_raises(): + cfg = ReconcileCredentialsConfig("databricks", {}) + with pytest.raises(ValueError, match="Unsupported source system: unknown"): + validate_creds(cfg, "unknown") + + +@pytest.mark.parametrize("source", ["oracle", "mssql", "synapse"]) +def test_missing_required_keys_raise(source): + creds = ReconcileCredentialsConfig( + "databricks", + {"host": "scope/host", "user": "scope/user"}, + ) + + with pytest.raises(ValueError): + validate_creds(creds, source) From 9f6d58ccc2730093e6ccbe5d9e64cd0db1726d0f Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Wed, 24 Dec 2025 15:15:32 +0100 Subject: [PATCH 42/44] introduce oracle specific fixture for better readability --- tests/conftest.py | 9 +++++++++ tests/unit/test_install.py | 7 ++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 707d3c40d9..5c5ec91778 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -490,3 +490,12 @@ def reconcile_config_v2_yml(datasource: str, secret_scope: str) -> dict: yml["creds"] = asdict(maybe_creds) return yml + + +@pytest.fixture +def oracle_reconcile_config_v2_yml(reconcile_config_v2_yml: dict) -> dict: + dbc = reconcile_config_v2_yml["database_config"] + dbc.pop("source_catalog") + reconcile_config_v2_yml["database_config"] = dbc + + return reconcile_config_v2_yml diff --git a/tests/unit/test_install.py b/tests/unit/test_install.py index 7cba331d02..fd97261692 100644 --- a/tests/unit/test_install.py +++ b/tests/unit/test_install.py @@ -585,7 +585,7 @@ def test_configure_reconcile_installation_config_error_continue_install( datasource: str, ws: WorkspaceClient, reconcile_config: ReconcileConfig, - reconcile_config_v2_yml: dict, + oracle_reconcile_config_v2_yml: dict, reconcile_config_v1_yml: dict, ) -> None: prompts = MockPrompts( @@ -638,10 +638,7 @@ def test_configure_reconcile_installation_config_error_continue_install( transpile=None, ) assert config == expected_config - dbc = reconcile_config_v2_yml["database_config"] - dbc.pop("source_catalog") - reconcile_config_v2_yml["database_config"] = dbc - installation.assert_file_written("reconcile.yml", reconcile_config_v2_yml) + installation.assert_file_written("reconcile.yml", oracle_reconcile_config_v2_yml) @pytest.mark.parametrize("datasource", ["snowflake", "databricks"]) From fe90e437d4e02306861b61f64729f3253fe3998a Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Thu, 25 Dec 2025 14:50:00 +0100 Subject: [PATCH 43/44] add test if both auth methods are there --- .../reconcile/connectors/test_snowflake.py | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tests/unit/reconcile/connectors/test_snowflake.py b/tests/unit/reconcile/connectors/test_snowflake.py index ca46eeecfc..43b05e173f 100644 --- a/tests/unit/reconcile/connectors/test_snowflake.py +++ 
b/tests/unit/reconcile/connectors/test_snowflake.py @@ -162,6 +162,48 @@ def test_read_data_with_out_options(snowflake_creds): spark.read.format().option().options().load.assert_called_once() +def test_read_data_with_out_options_both_password_and_pemkey_exist(snowflake_creds, caplog): + # initial setup + engine, spark, ws, scope = initial_setup() + + # create object for SnowflakeDataSource + dfds = SnowflakeDataSource(engine, spark, ws) + creds = snowflake_creds(scope) + creds['pem_private_key'] = f'{scope}/pem_private_key' # both exist + # Create a Tables configuration object with no JDBC reader options + table_conf = Table( + source_name="supplier", + target_name="supplier", + ) + + with caplog.at_level("WARNING", logger="databricks.labs.lakebridge.reconcile.connectors.snowflake"): + dfds.load_credentials(ReconcileCredentialsConfig("databricks", creds)) + + assert any( + "Snowflake auth not specified after migrating from secret scope so defaulting to sfPassword." in record.message + for record in caplog.records + ) + + # Call the read_data method with the Tables configuration + dfds.read_data("org", "data", "employee", "select 1 from :tbl", table_conf.jdbc_reader_options) + + # Check that the warning was logged + + # spark assertions + spark.read.format.assert_called_with("snowflake") + spark.read.format().option.assert_called_with("dbtable", "(select 1 from org.data.employee) as tmp") + spark.read.format().option().options.assert_called_with( + sfUrl="my_account.snowflakecomputing.com", + sfUser="my_user", + sfPassword="my_password", + sfDatabase="my_database", + sfSchema="my_schema", + sfWarehouse="my_warehouse", + sfRole="my_role", + ) + spark.read.format().option().options().load.assert_called_once() + + def test_read_data_with_options(snowflake_creds): # initial setup engine, spark, ws, scope = initial_setup() From c568d808b3ed7d2bac88c8882c4162d9d1175788 Mon Sep 17 00:00:00 2001 From: M Abulazm Date: Fri, 9 Jan 2026 16:12:34 +0100 Subject: [PATCH 44/44] remove incomplete creds from test fixtures --- .../integration/reconcile/connectors/test_read_schema.py | 2 +- tests/integration/reconcile/query_builder/test_execute.py | 8 +------- tests/integration/reconcile/test_oracle_reconcile.py | 2 -- tests/integration/reconcile/test_recon_databricks.py | 1 - tests/unit/deployment/test_installation.py | 4 ---- tests/unit/deployment/test_job.py | 3 --- tests/unit/deployment/test_recon.py | 3 --- 7 files changed, 2 insertions(+), 21 deletions(-) diff --git a/tests/integration/reconcile/connectors/test_read_schema.py b/tests/integration/reconcile/connectors/test_read_schema.py index ba31f64eb1..74113ee968 100644 --- a/tests/integration/reconcile/connectors/test_read_schema.py +++ b/tests/integration/reconcile/connectors/test_read_schema.py @@ -110,7 +110,7 @@ def test_databricks_read_schema_happy_sandbox( spark: SparkSession, ws: WorkspaceClient, recon_tables: tuple[TableInfo, TableInfo] ) -> None: test_table, _ = recon_tables - connector = DatabricksDataSource(get_dialect("databricks"), spark, ws, "my_secret") + connector = DatabricksDataSource(get_dialect("databricks"), spark, ws) assert test_table.catalog_name assert test_table.schema_name diff --git a/tests/integration/reconcile/query_builder/test_execute.py b/tests/integration/reconcile/query_builder/test_execute.py index 94edd5ec5c..6a1314208a 100644 --- a/tests/integration/reconcile/query_builder/test_execute.py +++ b/tests/integration/reconcile/query_builder/test_execute.py @@ -736,7 +736,6 @@ def mock_for_report_type_data( 
reconcile_config_data = ReconcileConfig( data_source="databricks", report_type="data", - creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, @@ -933,7 +932,6 @@ def mock_for_report_type_schema( reconcile_config_schema = ReconcileConfig( data_source="databricks", report_type="schema", - creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, @@ -1145,7 +1143,6 @@ def mock_for_report_type_all( reconcile_config_all = ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, @@ -1420,7 +1417,6 @@ def mock_for_report_type_row( reconcile_config_row = ReconcileConfig( data_source="snowflake", report_type="row", - creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, @@ -1566,7 +1562,6 @@ def mock_for_recon_exception(normalized_table_conf_with_opts, setup_metadata_tab reconcile_config_exception = ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, @@ -1897,7 +1892,7 @@ def test_initialise_data_source(mock_workspace_client, mock_spark): "sfRole": "role", } source, target = initialise_data_source( - mock_workspace_client, mock_spark, "snowflake", ReconcileCredentialsConfig("local", sf_creds) + mock_workspace_client, mock_spark, "snowflake", ReconcileCredentialsConfig("databricks", sf_creds) ) snowflake_data_source = SnowflakeDataSource(src_engine, mock_spark, mock_workspace_client).__class__ @@ -2039,7 +2034,6 @@ def test_recon_output_without_exception(mock_gen_final_recon_output): reconcile_config = ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog=CATALOG, source_schema=SCHEMA, diff --git a/tests/integration/reconcile/test_oracle_reconcile.py b/tests/integration/reconcile/test_oracle_reconcile.py index 5203504619..0042356d45 100644 --- a/tests/integration/reconcile/test_oracle_reconcile.py +++ b/tests/integration/reconcile/test_oracle_reconcile.py @@ -8,7 +8,6 @@ DatabaseConfig, ReconcileMetadataConfig, ReconcileConfig, - ReconcileCredentialsConfig, ) from databricks.labs.lakebridge.reconcile.connectors.databricks import DatabricksDataSource from databricks.labs.lakebridge.reconcile.recon_capture import ReconCapture @@ -55,7 +54,6 @@ def test_oracle_db_reconcile(mock_spark, mock_workspace_client, tmp_path): reconcile_config = ReconcileConfig( data_source="oracle", report_type=report, - creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=db_config, metadata_config=ReconcileMetadataConfig(catalog="tmp", schema="reconcile"), ) diff --git a/tests/integration/reconcile/test_recon_databricks.py b/tests/integration/reconcile/test_recon_databricks.py index e669b5d8ea..faab4c9e99 100644 --- 
a/tests/integration/reconcile/test_recon_databricks.py +++ b/tests/integration/reconcile/test_recon_databricks.py @@ -56,7 +56,6 @@ def recon_config(watchdog_remove_after: str, recon_schema: SchemaInfo, make_volu conf = ReconcileConfig( data_source="databricks", report_type="all", - secret_scope="NOT_NEEDED", database_config=DatabaseConfig( source_catalog=recon_schema.catalog_name, source_schema=recon_schema.name, diff --git a/tests/unit/deployment/test_installation.py b/tests/unit/deployment/test_installation.py index 458b6a31dc..1d25487edd 100644 --- a/tests/unit/deployment/test_installation.py +++ b/tests/unit/deployment/test_installation.py @@ -15,7 +15,6 @@ ReconcileConfig, DatabaseConfig, ReconcileMetadataConfig, - ReconcileCredentialsConfig, ) from databricks.labs.lakebridge.deployment.installation import WorkspaceInstallation from databricks.labs.lakebridge.deployment.recon import ReconDeployment @@ -50,7 +49,6 @@ def test_install_all(ws): reconcile_config = ReconcileConfig( data_source="oracle", report_type="all", - creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_schema="tpch_sf10006", target_catalog="tpch6", @@ -99,7 +97,6 @@ def test_recon_component_installation(ws): reconcile_config = ReconcileConfig( data_source="oracle", report_type="all", - creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_schema="tpch_sf10008", target_catalog="tpch8", @@ -147,7 +144,6 @@ def test_uninstall_configs_exist(ws): reconcile_config = ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog="snowflake_sample_data1", source_schema="tpch_sf10001", diff --git a/tests/unit/deployment/test_job.py b/tests/unit/deployment/test_job.py index 8f7f0caa5e..a1e766202c 100644 --- a/tests/unit/deployment/test_job.py +++ b/tests/unit/deployment/test_job.py @@ -13,7 +13,6 @@ ReconcileConfig, DatabaseConfig, ReconcileMetadataConfig, - ReconcileCredentialsConfig, ) from databricks.labs.lakebridge.deployment.job import JobDeployment @@ -23,7 +22,6 @@ def oracle_recon_config() -> ReconcileConfig: return ReconcileConfig( data_source="oracle", report_type="all", - creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_schema="tpch_sf10009", target_catalog="tpch9", @@ -42,7 +40,6 @@ def snowflake_recon_config() -> ReconcileConfig: return ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_schema="tpch_sf10009", target_catalog="tpch9", diff --git a/tests/unit/deployment/test_recon.py b/tests/unit/deployment/test_recon.py index a53a82134b..486e5a7c03 100644 --- a/tests/unit/deployment/test_recon.py +++ b/tests/unit/deployment/test_recon.py @@ -13,7 +13,6 @@ ReconcileConfig, DatabaseConfig, ReconcileMetadataConfig, - ReconcileCredentialsConfig, ) from databricks.labs.lakebridge.deployment.dashboard import DashboardDeployment from databricks.labs.lakebridge.deployment.job import JobDeployment @@ -57,7 +56,6 @@ def test_install(ws): reconcile_config = ReconcileConfig( data_source="snowflake", report_type="all", - 
creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog="snowflake_sample_data4", source_schema="tpch_sf10004", @@ -150,7 +148,6 @@ def test_uninstall(ws): recon_config = ReconcileConfig( data_source="snowflake", report_type="all", - creds=ReconcileCredentialsConfig(vault_type="databricks", vault_secret_names={"__secret_scope": "fake"}), database_config=DatabaseConfig( source_catalog="snowflake_sample_data5", source_schema="tpch_sf10005",