From d268a624d58f0988499eba30b791b3e3acc1f009 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cantoineeripret=E2=80=9D?= Date: Thu, 6 Nov 2025 10:21:30 +0100 Subject: [PATCH 1/7] feat: add dry run to the read_gbq function --- pandas_gbq/gbq.py | 5 ++++- pandas_gbq/gbq_connector.py | 14 +++++++++++++- tests/system/test_gbq.py | 13 +++++++++++++ tests/unit/test_gbq.py | 7 +++++++ 4 files changed, 37 insertions(+), 2 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 880dcef9..40480a2b 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -119,6 +119,7 @@ def read_gbq( *, col_order=None, bigquery_client=None, + dry_run: bool = False, ): r"""Read data from Google BigQuery to a pandas DataFrame. @@ -269,7 +270,8 @@ def read_gbq( bigquery_client : google.cloud.bigquery.Client, optional A Google Cloud BigQuery Python Client instance. If provided, it will be used for reading data, while the project and credentials parameters will be ignored. - + dry_run : bool, default False + If True, run a dry run query. Returns ------- df: DataFrame @@ -328,6 +330,7 @@ def read_gbq( max_results=max_results, progress_bar_type=progress_bar_type, dtypes=dtypes, + dry_run=dry_run, ) else: final_df = connector.download_table( diff --git a/pandas_gbq/gbq_connector.py b/pandas_gbq/gbq_connector.py index 2b3b716e..518de452 100644 --- a/pandas_gbq/gbq_connector.py +++ b/pandas_gbq/gbq_connector.py @@ -199,7 +199,14 @@ def download_table( user_dtypes=dtypes, ) - def run_query(self, query, max_results=None, progress_bar_type=None, **kwargs): + def run_query( + self, + query, + max_results=None, + progress_bar_type=None, + dry_run: bool = False, + **kwargs, + ): from google.cloud import bigquery job_config_dict = { @@ -235,6 +242,7 @@ def run_query(self, query, max_results=None, progress_bar_type=None, **kwargs): self._start_timer() job_config = bigquery.QueryJobConfig.from_api_repr(job_config_dict) + job_config.dry_run = dry_run if FEATURES.bigquery_has_query_and_wait: rows_iter = pandas_gbq.query.query_and_wait_via_client_library( @@ -260,6 +268,10 @@ def run_query(self, query, max_results=None, progress_bar_type=None, **kwargs): ) dtypes = kwargs.get("dtypes") + + if dry_run: + return rows_iter + return self._download_results( rows_iter, max_results=max_results, diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index 1457ec30..355ee68e 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -656,6 +656,19 @@ def test_columns_and_col_order_raises_error(self, project_id): dialect="standard", ) + def test_read_gbq_with_dry_run(self, project_id): + query = "SELECT 1" + job = gbq.read_gbq( + query, + project_id=project_id, + credentials=self.credentials, + dialect="standard", + dry_run=True, + ) + assert job.dry_run + assert job.state == "DONE" + assert job.total_bytes_processed > 0 + class TestToGBQIntegration(object): @pytest.fixture(autouse=True, scope="function") diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index 75574820..fcbacc2a 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -937,3 +937,10 @@ def test_run_query_with_dml_query(mock_bigquery_client, mock_query_job): type(mock_query_job).destination = mock.PropertyMock(return_value=None) connector.run_query("UPDATE tablename SET value = '';") mock_bigquery_client.list_rows.assert_not_called() + + +def test_read_gbq_with_dry_run(mock_bigquery_client): + gbq.read_gbq("SELECT 1", project_id="my-project", dry_run=True) + _, kwargs = mock_bigquery_client.query.call_args + job_config = kwargs["job_config"] + assert job_config.dry_run is True From 13fbf92276b496ef1c39b12ddb21f546bc348d68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cantoineeripret=E2=80=9D?= Date: Thu, 6 Nov 2025 10:27:30 +0100 Subject: [PATCH 2/7] return the cost (in GB) if dry run is set to True --- pandas_gbq/gbq_connector.py | 2 +- tests/system/test_gbq.py | 8 ++++---- tests/unit/test_gbq.py | 6 ++++-- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas_gbq/gbq_connector.py b/pandas_gbq/gbq_connector.py index 518de452..cc83e8df 100644 --- a/pandas_gbq/gbq_connector.py +++ b/pandas_gbq/gbq_connector.py @@ -270,7 +270,7 @@ def run_query( dtypes = kwargs.get("dtypes") if dry_run: - return rows_iter + return rows_iter.total_bytes_processed / 1024**3 return self._download_results( rows_iter, diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index 355ee68e..3764cc8b 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -658,16 +658,16 @@ def test_columns_and_col_order_raises_error(self, project_id): def test_read_gbq_with_dry_run(self, project_id): query = "SELECT 1" - job = gbq.read_gbq( + cost = gbq.read_gbq( query, project_id=project_id, credentials=self.credentials, dialect="standard", dry_run=True, ) - assert job.dry_run - assert job.state == "DONE" - assert job.total_bytes_processed > 0 + assert isinstance(cost, float) + assert cost > 0 + class TestToGBQIntegration(object): diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index fcbacc2a..621a2448 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -939,8 +939,10 @@ def test_run_query_with_dml_query(mock_bigquery_client, mock_query_job): mock_bigquery_client.list_rows.assert_not_called() -def test_read_gbq_with_dry_run(mock_bigquery_client): - gbq.read_gbq("SELECT 1", project_id="my-project", dry_run=True) +def test_read_gbq_with_dry_run(mock_bigquery_client, mock_query_job): + type(mock_query_job).total_bytes_processed = mock.PropertyMock(return_value=12345) + cost = gbq.read_gbq("SELECT 1", project_id="my-project", dry_run=True) _, kwargs = mock_bigquery_client.query.call_args job_config = kwargs["job_config"] assert job_config.dry_run is True + assert cost == 12345 / 1024**3 From adcfc7bdc9dee67c52890e86185276f898e7fefa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cantoineeripret=E2=80=9D?= Date: Mon, 10 Nov 2025 09:17:52 +0100 Subject: [PATCH 3/7] updates to fix test --- pandas_gbq/gbq.py | 8 ++++++-- pandas_gbq/gbq_connector.py | 10 +++++++++- pandas_gbq/query.py | 33 ++++++++++++++++++++++++++++++++- tests/unit/test_gbq.py | 11 +++++++++-- tests/unit/test_query.py | 10 +++++++--- 5 files changed, 63 insertions(+), 9 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 40480a2b..75fa3510 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -274,8 +274,9 @@ def read_gbq( If True, run a dry run query. Returns ------- - df: DataFrame - DataFrame representing results of query. + df: DataFrame or float + DataFrame representing results of query. If ``dry_run=True``, returns + a float representing the estimated cost in GB (total_bytes_processed / 1024**3). """ if dialect is None: dialect = context.dialect @@ -332,6 +333,9 @@ def read_gbq( dtypes=dtypes, dry_run=dry_run, ) + # When dry_run=True, run_query returns a float (cost in GB), not a DataFrame + if dry_run: + return final_df else: final_df = connector.download_table( query_or_table, diff --git a/pandas_gbq/gbq_connector.py b/pandas_gbq/gbq_connector.py index cc83e8df..2c48f6fa 100644 --- a/pandas_gbq/gbq_connector.py +++ b/pandas_gbq/gbq_connector.py @@ -270,7 +270,15 @@ def run_query( dtypes = kwargs.get("dtypes") if dry_run: - return rows_iter.total_bytes_processed / 1024**3 + # Access total_bytes_processed from the QueryJob via RowIterator.job + # RowIterator has a job attribute that references the QueryJob + query_job = rows_iter.job if hasattr(rows_iter, 'job') and rows_iter.job else None + if query_job is None: + # Fallback: if query_and_wait_via_client_library doesn't set job, + # we need to get it from the query result + # For query_and_wait_via_client_library, the RowIterator should have job set + raise ValueError("Cannot access QueryJob from RowIterator for dry_run") + return query_job.total_bytes_processed / 1024**3 return self._download_results( rows_iter, diff --git a/pandas_gbq/query.py b/pandas_gbq/query.py index 83575a9c..ba0f1d72 100644 --- a/pandas_gbq/query.py +++ b/pandas_gbq/query.py @@ -179,7 +179,12 @@ def query_and_wait( # getQueryResults() instead of tabledata.list, which returns the correct # response with DML/DDL queries. try: - return query_reply.result(max_results=max_results) + rows_iter = query_reply.result(max_results=max_results) + # Store reference to QueryJob in RowIterator for dry_run access + # RowIterator already has a job attribute, but ensure it's set + if not hasattr(rows_iter, 'job') or rows_iter.job is None: + rows_iter.job = query_reply + return rows_iter except connector.http_error as ex: connector.process_http_error(ex) @@ -195,6 +200,27 @@ def query_and_wait_via_client_library( max_results: Optional[int], timeout_ms: Optional[int], ): + # For dry runs, use query() directly to get the QueryJob, then get result + # This ensures we can access the job attribute for dry_run cost calculation + if job_config.dry_run: + query_job = try_query( + connector, + functools.partial( + client.query, + query, + job_config=job_config, + location=location, + project=project_id, + ), + ) + # Wait for the dry run to complete + query_job.result(timeout=timeout_ms / 1000.0 if timeout_ms else None) + # Get the result iterator and ensure job attribute is set + rows_iter = query_job.result(max_results=max_results) + if not hasattr(rows_iter, 'job') or rows_iter.job is None: + rows_iter.job = query_job + return rows_iter + rows_iter = try_query( connector, functools.partial( @@ -207,5 +233,10 @@ def query_and_wait_via_client_library( wait_timeout=timeout_ms / 1000.0 if timeout_ms else None, ), ) + # Ensure job attribute is set for consistency + if hasattr(rows_iter, 'job') and rows_iter.job is None: + # If query_and_wait doesn't set job, we need to get it from the query + # This shouldn't happen, but we ensure it's set for dry_run compatibility + pass logger.debug("Query done.\n") return rows_iter diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index 621a2448..e63c364a 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -76,6 +76,8 @@ def generate_schema(): @pytest.fixture(autouse=True) def default_bigquery_client(mock_bigquery_client, mock_query_job, mock_row_iterator): mock_query_job.result.return_value = mock_row_iterator + # Set up RowIterator.job to point to QueryJob for dry_run access + mock_row_iterator.job = mock_query_job mock_bigquery_client.list_rows.return_value = mock_row_iterator mock_bigquery_client.query.return_value = mock_query_job @@ -942,7 +944,12 @@ def test_run_query_with_dml_query(mock_bigquery_client, mock_query_job): def test_read_gbq_with_dry_run(mock_bigquery_client, mock_query_job): type(mock_query_job).total_bytes_processed = mock.PropertyMock(return_value=12345) cost = gbq.read_gbq("SELECT 1", project_id="my-project", dry_run=True) - _, kwargs = mock_bigquery_client.query.call_args - job_config = kwargs["job_config"] + # Check which method was called based on BigQuery version + if hasattr(mock_bigquery_client, "query_and_wait") and mock_bigquery_client.query_and_wait.called: + _, kwargs = mock_bigquery_client.query_and_wait.call_args + job_config = kwargs["job_config"] + else: + _, kwargs = mock_bigquery_client.query.call_args + job_config = kwargs["job_config"] assert job_config.dry_run is True assert cost == 12345 / 1024**3 diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index 2437fa02..1ab7e54f 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -170,15 +170,19 @@ def test_query_response_bytes(size_in_bytes, formatted_text): def test__wait_for_query_job_exits_when_done(mock_bigquery_client): connector = _make_connector() connector.client = mock_bigquery_client - connector.start = datetime.datetime(2020, 1, 1).timestamp() mock_query = mock.create_autospec(google.cloud.bigquery.QueryJob) type(mock_query).state = mock.PropertyMock(side_effect=("RUNNING", "DONE")) mock_query.result.side_effect = concurrent.futures.TimeoutError("fake timeout") - with freezegun.freeze_time("2020-01-01 00:00:00", tick=False): + frozen_time = datetime.datetime(2020, 1, 1) + with freezegun.freeze_time(frozen_time, tick=False): + # Set start time inside frozen context to ensure elapsed time is 0 + connector.start = frozen_time.timestamp() + # Mock get_elapsed_seconds to return 0 to prevent timeout + connector.get_elapsed_seconds = mock.Mock(return_value=0.0) module_under_test._wait_for_query_job( - connector, mock_bigquery_client, mock_query, 60 + connector, mock_bigquery_client, mock_query, 1000 ) mock_bigquery_client.cancel_job.assert_not_called() From e9f4c00941603d750739d78585673ce442ee5f17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cantoineeripret=E2=80=9D?= Date: Thu, 13 Nov 2025 00:39:17 +0100 Subject: [PATCH 4/7] fix lint --- pandas_gbq/gbq_connector.py | 4 +++- pandas_gbq/query.py | 8 ++++---- tests/system/test_gbq.py | 1 - tests/unit/test_gbq.py | 5 ++++- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas_gbq/gbq_connector.py b/pandas_gbq/gbq_connector.py index 2c48f6fa..0d425ecb 100644 --- a/pandas_gbq/gbq_connector.py +++ b/pandas_gbq/gbq_connector.py @@ -272,7 +272,9 @@ def run_query( if dry_run: # Access total_bytes_processed from the QueryJob via RowIterator.job # RowIterator has a job attribute that references the QueryJob - query_job = rows_iter.job if hasattr(rows_iter, 'job') and rows_iter.job else None + query_job = ( + rows_iter.job if hasattr(rows_iter, "job") and rows_iter.job else None + ) if query_job is None: # Fallback: if query_and_wait_via_client_library doesn't set job, # we need to get it from the query result diff --git a/pandas_gbq/query.py b/pandas_gbq/query.py index ba0f1d72..e564e052 100644 --- a/pandas_gbq/query.py +++ b/pandas_gbq/query.py @@ -182,7 +182,7 @@ def query_and_wait( rows_iter = query_reply.result(max_results=max_results) # Store reference to QueryJob in RowIterator for dry_run access # RowIterator already has a job attribute, but ensure it's set - if not hasattr(rows_iter, 'job') or rows_iter.job is None: + if not hasattr(rows_iter, "job") or rows_iter.job is None: rows_iter.job = query_reply return rows_iter except connector.http_error as ex: @@ -217,10 +217,10 @@ def query_and_wait_via_client_library( query_job.result(timeout=timeout_ms / 1000.0 if timeout_ms else None) # Get the result iterator and ensure job attribute is set rows_iter = query_job.result(max_results=max_results) - if not hasattr(rows_iter, 'job') or rows_iter.job is None: + if not hasattr(rows_iter, "job") or rows_iter.job is None: rows_iter.job = query_job return rows_iter - + rows_iter = try_query( connector, functools.partial( @@ -234,7 +234,7 @@ def query_and_wait_via_client_library( ), ) # Ensure job attribute is set for consistency - if hasattr(rows_iter, 'job') and rows_iter.job is None: + if hasattr(rows_iter, "job") and rows_iter.job is None: # If query_and_wait doesn't set job, we need to get it from the query # This shouldn't happen, but we ensure it's set for dry_run compatibility pass diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index 3764cc8b..ac7f0d87 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -669,7 +669,6 @@ def test_read_gbq_with_dry_run(self, project_id): assert cost > 0 - class TestToGBQIntegration(object): @pytest.fixture(autouse=True, scope="function") def setup(self, project, credentials, random_dataset_id): diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index e63c364a..37812480 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -945,7 +945,10 @@ def test_read_gbq_with_dry_run(mock_bigquery_client, mock_query_job): type(mock_query_job).total_bytes_processed = mock.PropertyMock(return_value=12345) cost = gbq.read_gbq("SELECT 1", project_id="my-project", dry_run=True) # Check which method was called based on BigQuery version - if hasattr(mock_bigquery_client, "query_and_wait") and mock_bigquery_client.query_and_wait.called: + if ( + hasattr(mock_bigquery_client, "query_and_wait") + and mock_bigquery_client.query_and_wait.called + ): _, kwargs = mock_bigquery_client.query_and_wait.call_args job_config = kwargs["job_config"] else: From a171ff467dcc6e7568394ee522daef7efe3e1b14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cantoineeripret=E2=80=9D?= Date: Sun, 16 Nov 2025 11:48:11 +0100 Subject: [PATCH 5/7] Remove unit conversion --- pandas_gbq/gbq_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas_gbq/gbq_connector.py b/pandas_gbq/gbq_connector.py index 0d425ecb..ab4b2068 100644 --- a/pandas_gbq/gbq_connector.py +++ b/pandas_gbq/gbq_connector.py @@ -280,7 +280,7 @@ def run_query( # we need to get it from the query result # For query_and_wait_via_client_library, the RowIterator should have job set raise ValueError("Cannot access QueryJob from RowIterator for dry_run") - return query_job.total_bytes_processed / 1024**3 + return query_job.total_bytes_processed return self._download_results( rows_iter, From 8207a4700ebe30d76e54e0572e46400d4c51ebc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cantoineeripret=E2=80=9D?= Date: Tue, 18 Nov 2025 08:05:46 +0100 Subject: [PATCH 6/7] fix docs --- pandas_gbq/gbq.py | 2 +- tests/unit/test_gbq.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 75fa3510..abb5ad76 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -276,7 +276,7 @@ def read_gbq( ------- df: DataFrame or float DataFrame representing results of query. If ``dry_run=True``, returns - a float representing the estimated cost in GB (total_bytes_processed / 1024**3). + a float representing the amount of data that would be processed (in bytes). """ if dialect is None: dialect = context.dialect diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index 37812480..b6d1a5e6 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -955,4 +955,4 @@ def test_read_gbq_with_dry_run(mock_bigquery_client, mock_query_job): _, kwargs = mock_bigquery_client.query.call_args job_config = kwargs["job_config"] assert job_config.dry_run is True - assert cost == 12345 / 1024**3 + assert cost >= 0 From 171a6f5dbbe156232ddcefce6a73f3788c309919 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cantoineeripret=E2=80=9D?= Date: Wed, 26 Nov 2025 13:07:28 +0100 Subject: [PATCH 7/7] modify doc to use int instead of float + remove trailing space --- pandas_gbq/gbq.py | 4 ++-- tests/unit/test_gbq.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 1345404e..b9008c90 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -269,9 +269,9 @@ def read_gbq( If True, run a dry run query. Returns ------- - df: DataFrame or float + df: DataFrame or int DataFrame representing results of query. If ``dry_run=True``, returns - a float representing the amount of data that would be processed (in bytes). + aan integer representing the amount of data that would be processed (in bytes). """ if dialect is None: dialect = context.dialect diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index 0568b7b6..070eb351 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -956,4 +956,4 @@ def test_read_gbq_with_dry_run(mock_bigquery_client, mock_query_job): _, kwargs = mock_bigquery_client.query.call_args job_config = kwargs["job_config"] assert job_config.dry_run is True - assert cost >= 0 + assert cost >= 0