-
Notifications
You must be signed in to change notification settings - Fork 81
Handle empty SQL resultsets gracefully in assessment pipeline #2172
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
1e0a883
57a3f9d
3cad168
c46b6d3
839af11
7f151f3
aae9455
f495f05
0ff326e
71211a6
514a752
ee008b3
67abfde
e536890
a873787
6d649ce
55b3b87
7aa7761
dd357cb
e6d426e
66e1525
8801636
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,14 +1,21 @@ | ||
| from pathlib import Path | ||
| from logging import Logger | ||
|
|
||
| import duckdb | ||
| import pytest | ||
|
|
||
| from databricks.labs.lakebridge.assessments.pipeline import PipelineClass, DB_NAME, StepExecutionStatus | ||
|
|
||
| from databricks.labs.lakebridge.assessments.pipeline import ( | ||
| PipelineClass, | ||
| DB_NAME, | ||
| StepExecutionStatus, | ||
| StepExecutionResult, | ||
| ) | ||
| from databricks.labs.lakebridge.assessments.profiler_config import Step, PipelineConfig | ||
| from databricks.labs.lakebridge.connections.database_manager import DatabaseManager | ||
|
|
||
|
|
||
| @pytest.fixture(scope="module") | ||
| def pipeline_config(): | ||
| def pipeline_config() -> PipelineConfig: | ||
| prefix = Path(__file__).parent | ||
| config_path = f"{prefix}/../../resources/assessments/pipeline_config.yml" | ||
| config = PipelineClass.load_config_from_yaml(config_path) | ||
|
|
@@ -19,7 +26,7 @@ def pipeline_config(): | |
|
|
||
|
|
||
| @pytest.fixture(scope="module") | ||
| def pipeline_dep_failure_config(): | ||
| def pipeline_dep_failure_config() -> PipelineConfig: | ||
| prefix = Path(__file__).parent | ||
| config_path = f"{prefix}/../../resources/assessments/pipeline_config_failure_dependency.yml" | ||
| config = PipelineClass.load_config_from_yaml(config_path) | ||
|
|
@@ -30,7 +37,7 @@ def pipeline_dep_failure_config(): | |
|
|
||
|
|
||
| @pytest.fixture(scope="module") | ||
| def sql_failure_config(): | ||
| def sql_failure_config() -> PipelineConfig: | ||
| prefix = Path(__file__).parent | ||
| config_path = f"{prefix}/../../resources/assessments/pipeline_config_sql_failure.yml" | ||
| config = PipelineClass.load_config_from_yaml(config_path) | ||
|
|
@@ -40,7 +47,7 @@ def sql_failure_config(): | |
|
|
||
|
|
||
| @pytest.fixture(scope="module") | ||
| def python_failure_config(): | ||
| def python_failure_config() -> PipelineConfig: | ||
| prefix = Path(__file__).parent | ||
| config_path = f"{prefix}/../../resources/assessments/pipeline_config_python_failure.yml" | ||
| config = PipelineClass.load_config_from_yaml(config_path) | ||
|
|
@@ -49,7 +56,21 @@ def python_failure_config(): | |
| return config | ||
|
|
||
|
|
||
| def test_run_pipeline(sandbox_sqlserver, pipeline_config, get_logger): | ||
| @pytest.fixture(scope="module") | ||
| def empty_result_config() -> PipelineConfig: | ||
| prefix = Path(__file__).parent | ||
| config_path = f"{prefix}/../../resources/assessments/pipeline_config_empty_result.yml" | ||
| config = PipelineClass.load_config_from_yaml(config_path) | ||
| for step in config.steps: | ||
| step.extract_source = f"{prefix}/../../{step.extract_source}" | ||
|
Comment on lines
+61
to
+65
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note to self, will need adjustment w.r.t. #2210. |
||
| return config | ||
|
|
||
|
|
||
| def test_run_pipeline( | ||
| sandbox_sqlserver: DatabaseManager, | ||
| pipeline_config: PipelineConfig, | ||
| get_logger: Logger, | ||
| ) -> None: | ||
| pipeline = PipelineClass(config=pipeline_config, executor=sandbox_sqlserver) | ||
| results = pipeline.execute() | ||
|
|
||
|
|
@@ -63,7 +84,11 @@ def test_run_pipeline(sandbox_sqlserver, pipeline_config, get_logger): | |
| assert verify_output(get_logger, pipeline_config.extract_folder) | ||
|
|
||
|
|
||
| def test_run_sql_failure_pipeline(sandbox_sqlserver, sql_failure_config, get_logger): | ||
| def test_run_sql_failure_pipeline( | ||
| sandbox_sqlserver: DatabaseManager, | ||
| sql_failure_config: PipelineConfig, | ||
| get_logger: Logger, | ||
| ) -> None: | ||
| pipeline = PipelineClass(config=sql_failure_config, executor=sandbox_sqlserver) | ||
| with pytest.raises(RuntimeError) as e: | ||
| pipeline.execute() | ||
|
|
@@ -72,7 +97,11 @@ def test_run_sql_failure_pipeline(sandbox_sqlserver, sql_failure_config, get_log | |
| assert "Pipeline execution failed due to errors in steps: invalid_sql_step" in str(e.value) | ||
|
|
||
|
|
||
| def test_run_python_failure_pipeline(sandbox_sqlserver, python_failure_config, get_logger): | ||
| def test_run_python_failure_pipeline( | ||
| sandbox_sqlserver: DatabaseManager, | ||
| python_failure_config: PipelineConfig, | ||
| get_logger: Logger, | ||
| ) -> None: | ||
| pipeline = PipelineClass(config=python_failure_config, executor=sandbox_sqlserver) | ||
| with pytest.raises(RuntimeError) as e: | ||
| pipeline.execute() | ||
|
|
@@ -81,7 +110,9 @@ def test_run_python_failure_pipeline(sandbox_sqlserver, python_failure_config, g | |
| assert "Pipeline execution failed due to errors in steps: invalid_python_step" in str(e.value) | ||
|
|
||
|
|
||
| def test_run_python_dep_failure_pipeline(sandbox_sqlserver, pipeline_dep_failure_config, get_logger): | ||
| def test_run_python_dep_failure_pipeline( | ||
| sandbox_sqlserver: DatabaseManager, pipeline_dep_failure_config: PipelineConfig, get_logger | ||
| ): | ||
| pipeline = PipelineClass(config=pipeline_dep_failure_config, executor=sandbox_sqlserver) | ||
| with pytest.raises(RuntimeError) as e: | ||
| pipeline.execute() | ||
|
|
@@ -90,7 +121,11 @@ def test_run_python_dep_failure_pipeline(sandbox_sqlserver, pipeline_dep_failure | |
| assert "Pipeline execution failed due to errors in steps: package_status" in str(e.value) | ||
|
|
||
|
|
||
| def test_skipped_steps(sandbox_sqlserver, pipeline_config, get_logger): | ||
| def test_skipped_steps( | ||
| sandbox_sqlserver: DatabaseManager, | ||
| pipeline_config: PipelineConfig, | ||
| get_logger: Logger, | ||
| ) -> None: | ||
| # Modify config to have some inactive steps | ||
| for step in pipeline_config.steps: | ||
| step.flag = "inactive" | ||
|
|
@@ -126,7 +161,7 @@ def verify_output(get_logger, path): | |
| return True | ||
|
|
||
|
|
||
| def test_pipeline_config_comments(): | ||
| def test_pipeline_config_comments() -> None: | ||
| pipeline_w_comments = PipelineConfig( | ||
| name="warehouse_profiler", | ||
| version="1.0", | ||
|
|
@@ -140,7 +175,7 @@ def test_pipeline_config_comments(): | |
| assert pipeline_wo_comments.comment is None | ||
|
|
||
|
|
||
| def test_pipeline_step_comments(): | ||
| def test_pipeline_step_comments() -> None: | ||
| step_w_comment = Step( | ||
| name="step_w_comment", | ||
| type="sql", | ||
|
|
@@ -160,3 +195,26 @@ def test_pipeline_step_comments(): | |
| ) | ||
| assert step_w_comment.comment == "This is a step comment." | ||
| assert step_wo_comment.comment is None | ||
|
|
||
|
|
||
| def test_run_empty_result_pipeline( | ||
| sandbox_sqlserver: DatabaseManager, | ||
| empty_result_config: PipelineConfig, | ||
| get_logger: Logger, | ||
| ) -> None: | ||
| pipeline = PipelineClass(config=empty_result_config, executor=sandbox_sqlserver) | ||
| results = pipeline.execute() | ||
|
|
||
| # Verify step completed successfully despite empty results | ||
| assert len(results) == 1 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nit: unnecessary assertion. |
||
| assert results == [ | ||
| StepExecutionResult(step_name="empty_result_step", status=StepExecutionStatus.COMPLETE, error_message=None) | ||
| ] | ||
|
|
||
| # Verify that no table was created (processing was skipped for empty resultset) | ||
| with duckdb.connect(str(Path(empty_result_config.extract_folder)) + "/" + DB_NAME) as conn: | ||
| tables = conn.execute("SHOW TABLES").fetchall() | ||
| table_names = [table[0] for table in tables] | ||
|
|
||
| # Table should NOT be created when resultset is empty | ||
| assert "empty_result_step" not in table_names, "Empty resultset should skip table creation" | ||
sundarshankar89 marked this conversation as resolved.
Show resolved
Hide resolved
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| -- Query that returns valid schema but 0 rows | ||
| SELECT | ||
| 'test' as col1, | ||
| 'test' as col2, | ||
| 'test' as col3 | ||
| WHERE 1 = 0 |
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,10 @@ | ||
| name: test_empty_result_pipeline | ||
| version: 1.0 | ||
| extract_folder: /tmp/lakebridge_test_empty_result | ||
| steps: | ||
| - name: empty_result_step | ||
| type: sql | ||
| extract_source: resources/assessments/empty_resultset.sql | ||
| mode: overwrite | ||
| frequency: once | ||
| flag: active |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In switching to the DF approach I deliberately preserved the existing all-string behaviour. However this
`TODO` might be resolved trivially if the `.astype(str)` is dropped: Pandas will infer the column types, which DuckDB will take over.