diff --git a/pandas_gbq/schema/bigquery_to_pandas.py b/pandas_gbq/schema/bigquery_to_pandas.py
new file mode 100644
index 00000000..2b2b65e3
--- /dev/null
+++ b/pandas_gbq/schema/bigquery_to_pandas.py
@@ -0,0 +1,3 @@
+# Copyright (c) 2025 pandas-gbq Authors All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
diff --git a/pandas_gbq/schema/bigquery_to_pyarrow.py b/pandas_gbq/schema/bigquery_to_pyarrow.py
new file mode 100644
index 00000000..d03b3154
--- /dev/null
+++ b/pandas_gbq/schema/bigquery_to_pyarrow.py
@@ -0,0 +1,275 @@
+# Copyright (c) 2025 pandas-gbq Authors All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+from typing import Any, Union
+import warnings
+
+import db_dtypes
+from google.cloud import bigquery
+import pyarrow
+
+
+def pyarrow_datetime():
+    return pyarrow.timestamp("us", tz=None)
+
+
+def pyarrow_numeric():
+    return pyarrow.decimal128(38, 9)
+
+
+def pyarrow_bignumeric():
+    # 77th digit is partial.
+    # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types
+    return pyarrow.decimal256(76, 38)
+
+
+def pyarrow_time():
+    return pyarrow.time64("us")
+
+
+def pyarrow_timestamp():
+    return pyarrow.timestamp("us", tz="UTC")
+
+
+# Prefer the JSON type built in to pyarrow (added in 19.0.0), if available.
+# Otherwise, fall back to db-dtypes, where JSONArrowType was added in 1.4.0.
+# If an older db-dtypes is installed, fall back to plain string.
+if hasattr(pyarrow, "json_"):
+    json_arrow_type = pyarrow.json_(pyarrow.string())
+elif hasattr(db_dtypes, "JSONArrowType"):
+    json_arrow_type = db_dtypes.JSONArrowType()
+else:
+    json_arrow_type = pyarrow.string()
+
+
+# This dictionary is duplicated in bigquery_storage/test/unit/test_reader.py.
+# When modifying it, be sure to update it there as well.
+# NOTE(TODO): the pyarrow type matching "BIGNUMERIC" is also added in _pandas_helpers.py.
+_BQ_TO_ARROW_SCALARS = {
+    "BIGNUMERIC": pyarrow_bignumeric,
+    "BOOL": pyarrow.bool_,
+    "BOOLEAN": pyarrow.bool_,
+    "BYTES": pyarrow.binary,
+    "DATE": pyarrow.date32,
+    "DATETIME": pyarrow_datetime,
+    "FLOAT": pyarrow.float64,
+    "FLOAT64": pyarrow.float64,
+    "GEOGRAPHY": pyarrow.string,
+    "INT64": pyarrow.int64,
+    "INTEGER": pyarrow.int64,
+    "JSON": lambda: json_arrow_type,  # wrap the instance so the value is callable like the other entries
+    "NUMERIC": pyarrow_numeric,
+    "STRING": pyarrow.string,
+    "TIME": pyarrow_time,
+    "TIMESTAMP": pyarrow_timestamp,
+}
+
+_STRUCT_TYPES = ("RECORD", "STRUCT")
+
+
+def bq_to_arrow_scalars(bq_scalar: str):
+    """
+    Returns:
+        The constructor for the Arrow scalar type that the input BigQuery
+        scalar type maps to, or None if the BigQuery type is unknown.
+    """
+    return _BQ_TO_ARROW_SCALARS.get(bq_scalar)
+
+
+BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA = {
+    "GEOGRAPHY": {
+        b"ARROW:extension:name": b"google:sqlType:geography",
+        b"ARROW:extension:metadata": b'{"encoding": "WKT"}',
+    },
+    "DATETIME": {b"ARROW:extension:name": b"google:sqlType:datetime"},
+    "JSON": {b"ARROW:extension:name": b"google:sqlType:json"},
+}
+
+
+def bq_to_arrow_struct_data_type(field):
+    arrow_fields = []
+    for subfield in field.fields:
+        arrow_subfield = bq_to_arrow_field(subfield)
+        if arrow_subfield:
+            arrow_fields.append(arrow_subfield)
+        else:
+            # Could not determine a subfield type. Fall back to type
+            # inference.
+            return None
+    return pyarrow.struct(arrow_fields)
+
+
+def bq_to_arrow_range_data_type(field):
+    if field is None:
+        raise ValueError(
+            "Range element type cannot be None, must be one of "
+            "DATE, DATETIME, or TIMESTAMP"
+        )
+    element_type = field.element_type.upper()
+    arrow_element_type = bq_to_arrow_scalars(element_type)()
+    return pyarrow.struct([("start", arrow_element_type), ("end", arrow_element_type)])
+
+
+def bq_to_arrow_data_type(field):
+    """Return the Arrow data type corresponding to a given BigQuery column.
+
+    Returns:
+        None: if default Arrow type inspection should be used.
+    """
+    if field.mode is not None and field.mode.upper() == "REPEATED":
+        inner_type = bq_to_arrow_data_type(
+            bigquery.SchemaField(field.name, field.field_type, fields=field.fields)
+        )
+        if inner_type:
+            return pyarrow.list_(inner_type)
+        return None
+
+    field_type_upper = field.field_type.upper() if field.field_type else ""
+    if field_type_upper in _STRUCT_TYPES:
+        return bq_to_arrow_struct_data_type(field)
+
+    if field_type_upper == "RANGE":
+        return bq_to_arrow_range_data_type(field.range_element_type)
+
+    data_type_constructor = bq_to_arrow_scalars(field_type_upper)
+    if data_type_constructor is None:
+        return None
+    return data_type_constructor()
+
+
+def bq_to_arrow_field(bq_field, array_type=None):
+    """Return the Arrow field corresponding to a given BigQuery column.
+
+    Returns:
+        None: if the Arrow type cannot be determined.
+    """
+    arrow_type = bq_to_arrow_data_type(bq_field)
+    if arrow_type is not None:
+        if array_type is not None:
+            arrow_type = array_type  # For GEOGRAPHY, at least initially
+        metadata = BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA.get(
+            bq_field.field_type.upper() if bq_field.field_type else ""
+        )
+        return pyarrow.field(
+            bq_field.name,
+            arrow_type,
+            # Even if the remote schema is REQUIRED, there's a chance there
+            # are local NULL values. Arrow will gladly interpret these NULL
+            # values as non-NULL and give you an arbitrary value. See:
+            # https://github.com/googleapis/python-bigquery/issues/1692
+            nullable=False if bq_field.mode.upper() == "REPEATED" else True,
+            metadata=metadata,
+        )
+
+    warnings.warn(
+        "Unable to determine Arrow type for field '{}'.".format(bq_field.name)
+    )
+    return None
+
+
+def bq_to_arrow_schema(bq_schema):
+    """Return the Arrow schema corresponding to a given BigQuery schema.
+
+    Returns:
+        None: if any Arrow type cannot be determined.
+    """
+    arrow_fields = []
+    for bq_field in bq_schema:
+        arrow_field = bq_to_arrow_field(bq_field)
+        if arrow_field is None:
+            # Auto-detect the schema if there is an unknown field type.
+            return None
+        arrow_fields.append(arrow_field)
+    return pyarrow.schema(arrow_fields)
+
+
+def default_types_mapper(
+    date_as_object: bool = False,
+    bool_dtype: Union[Any, None] = None,
+    int_dtype: Union[Any, None] = None,
+    float_dtype: Union[Any, None] = None,
+    string_dtype: Union[Any, None] = None,
+    date_dtype: Union[Any, None] = None,
+    datetime_dtype: Union[Any, None] = None,
+    time_dtype: Union[Any, None] = None,
+    timestamp_dtype: Union[Any, None] = None,
+    range_date_dtype: Union[Any, None] = None,
+    range_datetime_dtype: Union[Any, None] = None,
+    range_timestamp_dtype: Union[Any, None] = None,
+):
+    """Create a mapping from pyarrow types to pandas types.
+
+    This overrides the pandas defaults to use null-safe extension types where
+    available.
+
+    See: https://arrow.apache.org/docs/python/api/datatypes.html for a list of
+    data types.
+    See: tests/unit/schema/test_bigquery_to_pyarrow.py::test_bq_to_arrow_data_type
+    for the BigQuery to Arrow type mapping.
+
+    Note to pandas-gbq developers: If you update the default dtypes, also
+    update the user-facing documentation.
+    """
+
+    def types_mapper(arrow_data_type):
+        if bool_dtype is not None and pyarrow.types.is_boolean(arrow_data_type):
+            return bool_dtype
+
+        elif int_dtype is not None and pyarrow.types.is_integer(arrow_data_type):
+            return int_dtype
+
+        elif float_dtype is not None and pyarrow.types.is_floating(arrow_data_type):
+            return float_dtype
+
+        elif string_dtype is not None and pyarrow.types.is_string(arrow_data_type):
+            return string_dtype
+
+        elif (
+            # If date_as_object is True, we know some DATE columns are
+            # out-of-bounds of what is supported by pandas.
+            date_dtype is not None
+            and not date_as_object
+            and pyarrow.types.is_date(arrow_data_type)
+        ):
+            return date_dtype
+
+        elif (
+            datetime_dtype is not None
+            and pyarrow.types.is_timestamp(arrow_data_type)
+            and arrow_data_type.tz is None
+        ):
+            return datetime_dtype
+
+        elif (
+            timestamp_dtype is not None
+            and pyarrow.types.is_timestamp(arrow_data_type)
+            and arrow_data_type.tz is not None
+        ):
+            return timestamp_dtype
+
+        elif time_dtype is not None and pyarrow.types.is_time(arrow_data_type):
+            return time_dtype
+
+        elif pyarrow.types.is_struct(arrow_data_type):
+            if range_datetime_dtype is not None and arrow_data_type.equals(
+                range_datetime_dtype.pyarrow_dtype
+            ):
+                return range_datetime_dtype
+
+            elif range_date_dtype is not None and arrow_data_type.equals(
+                range_date_dtype.pyarrow_dtype
+            ):
+                return range_date_dtype
+
+            # TODO: this section does not have a test yet, or at least not one
+            # that is recognized by coverage, hence the pragma. See issue #2132.
+            elif (
+                range_timestamp_dtype is not None
+                and arrow_data_type.equals(  # pragma: NO COVER
+                    range_timestamp_dtype.pyarrow_dtype
+                )
+            ):
+                return range_timestamp_dtype
+
+    return types_mapper
diff --git a/pandas_gbq/schema/pyarrow_to_bigquery.py b/pandas_gbq/schema/pyarrow_to_bigquery.py
index d917499f..67f2e20d 100644
--- a/pandas_gbq/schema/pyarrow_to_bigquery.py
+++ b/pandas_gbq/schema/pyarrow_to_bigquery.py
@@ -4,6 +4,7 @@
 
 from typing import Optional, cast
 
+import db_dtypes
 from google.cloud.bigquery import schema
 import pyarrow
 import pyarrow.types
@@ -34,27 +35,20 @@
     # the type ID matters, and it's the same for all decimal256 instances.
     pyarrow.decimal128(38, scale=9).id: "NUMERIC",
     pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC",
+    # NOTE: all extension types (e.g. json_, uuid, db_dtypes.JSONArrowType)
+    # have the same id (31 as of version 19.0.1), so these should not be
+    # matched by id.
 }
 
 
-def arrow_type_to_bigquery_field(
+def arrow_scalar_type_to_bigquery_field(
     name, type_, default_type="STRING"
 ) -> Optional[schema.SchemaField]:
-    """Infers the BigQuery schema field type from an arrow type.
-
-    Args:
-        name (str):
-            Name of the column/field.
-        type_:
-            A pyarrow type object.
+    """Infers the BigQuery schema field type from a scalar arrow type.
 
     Returns:
-        Optional[schema.SchemaField]:
-            The schema field, or None if a type cannot be inferred, such as if
-            it is a type that doesn't have a clear mapping in BigQuery.
-
-            null() are assumed to be the ``default_type``, since there are no
-            values that contradict that.
+        The schema field with the BigQuery scalar type that the input arrow
+        scalar type maps to, or None if the arrow type is not a known scalar.
""" # If a sub-field is the null type, then assume it's the default type, as # that's the best we can do. @@ -82,6 +76,46 @@ def arrow_type_to_bigquery_field( if detected_type is not None: return schema.SchemaField(name, detected_type) + # NOTE: all extension types (e.g. json_, uuid, db_dtypes.JSONArrowType) + # have the same id (31 as of version 19.0.1), so these should not be + # matched by id. + if (hasattr(pyarrow, "JsonType") and isinstance(type_, pyarrow.JsonType)) or ( + hasattr(db_dtypes, "JSONArrowType") + and isinstance(type_, db_dtypes.JSONArrowType) + ): + return schema.SchemaField(name, "JSON") + + # Could not identify a type. + return None + + +def arrow_type_to_bigquery_field( + name, type_, default_type="STRING" +) -> Optional[schema.SchemaField]: + """Infers the BigQuery schema field type from an arrow type. + + Args: + name (str): + Name of the column/field. + type_: + A pyarrow type object. + + Returns: + Optional[schema.SchemaField]: + The schema field, or None if a type cannot be inferred, such as if + it is a type that doesn't have a clear mapping in BigQuery. + + null() are assumed to be the ``default_type``, since there are no + values that contradict that. + """ + scalar_field = arrow_scalar_type_to_bigquery_field( + name, + type_, + default_type=default_type, + ) + if scalar_field is not None: + return scalar_field + if pyarrow.types.is_list(type_): return arrow_list_type_to_bigquery(name, type_, default_type=default_type) diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py index ad7c58ec..4db4d924 100644 --- a/tests/system/test_to_gbq.py +++ b/tests/system/test_to_gbq.py @@ -574,6 +574,66 @@ def test_series_round_trip( ), id="struct", ), + pytest.param( + *DataFrameRoundTripTestCase( + input_df=pandas.DataFrame( + { + "row_num": [0, 1, 2, 3, 4], + "json": pandas.Series( + [ + '{"key": "value"}', + None, + "123", + "[123]", + '"string"', + ], + dtype=( + pandas.ArrowDtype(db_dtypes.JSONArrowType()) + # pandas.ArrowDtype(pyarrow.json_(pyarrow.string())) + # if hasattr(pandas, "ArrowDtype") + # and hasattr(pyarrow, "json_") + # else "object" + ), + ), + }, + ), + # Writes with parquet currently blocked by internal issue 374784249. + api_methods={"load_csv"}, + expected_df=pandas.DataFrame( + { + "row_num": [0, 1, 2, 3, 4], + "json": pandas.Series( + [ + '{"key":"value"}', + None, + "123", + "[123]", + '"string"', + ], + # TODO(https://github.com/googleapis/python-bigquery/pull/1876): + # if pyarrow.json_() type is supported by both reads and writes, then + # round-trip of a JSON column should work. + # Reads currently blocked by + # https://github.com/googleapis/python-bigquery/pull/1876 + # though, we should probably move the BQ -> pandas type mapping to this + # package so the logic can be consolidated soon. + dtype=( + pandas.ArrowDtype(db_dtypes.JSONArrowType()) + # pandas.ArrowDtype(pyarrow.json_(pyarrow.string())) + # if hasattr(pandas, "ArrowDtype") + # and hasattr(pyarrow, "json_") + # else "object" + ), + ), + }, + ), + ), + marks=pytest.mark.skipif( + not hasattr(pyarrow, "json_"), + reason="no canonical JSON extension type available", + ), + id="json", + ), ] diff --git a/tests/unit/schema/test_bigquery_to_pyarrow.py b/tests/unit/schema/test_bigquery_to_pyarrow.py new file mode 100644 index 00000000..d1cc469f --- /dev/null +++ b/tests/unit/schema/test_bigquery_to_pyarrow.py @@ -0,0 +1,520 @@ +# Copyright (c) 2025 pandas-gbq Authors All rights reserved. 
+# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import warnings + +from google.cloud import bigquery +from google.cloud.bigquery import schema +import pyarrow +import pytest + + +@pytest.fixture +def module_under_test(): + from pandas_gbq.schema import bigquery_to_pyarrow + + return bigquery_to_pyarrow + + +def is_none(value): + return value is None + + +def is_datetime(type_): + # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#datetime-type + return all_( + pyarrow.types.is_timestamp, + lambda type_: type_.unit == "us", + lambda type_: type_.tz is None, + )(type_) + + +def is_numeric(type_): + # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#numeric-type + return all_( + pyarrow.types.is_decimal, + lambda type_: type_.precision == 38, + lambda type_: type_.scale == 9, + )(type_) + + +def is_bignumeric(type_): + # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#numeric-type + return all_( + pyarrow.types.is_decimal, + lambda type_: type_.precision == 76, + lambda type_: type_.scale == 38, + )(type_) + + +def is_timestamp(type_): + # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#timestamp-type + return all_( + pyarrow.types.is_timestamp, + lambda type_: type_.unit == "us", + lambda type_: type_.tz == "UTC", + )(type_) + + +def do_all(functions, value): + return all((func(value) for func in functions)) + + +def all_(*functions): + return functools.partial(do_all, functions) + + +def test_is_datetime(): + assert is_datetime(pyarrow.timestamp("us", tz=None)) + assert not is_datetime(pyarrow.timestamp("ms", tz=None)) + assert not is_datetime(pyarrow.timestamp("us", tz="UTC")) + assert not is_datetime(pyarrow.timestamp("ns", tz="UTC")) + assert not is_datetime(pyarrow.string()) + + +def test_do_all(): + assert do_all((lambda _: True, lambda _: True), None) + assert not do_all((lambda _: True, lambda _: False), None) + assert not do_all((lambda _: False,), None) + + +def test_all_(): + assert all_(lambda _: True, lambda _: True)(None) + assert not all_(lambda _: True, lambda _: False)(None) + + +@pytest.mark.parametrize( + "bq_type,bq_mode,is_correct_type", + [ + ("STRING", "NULLABLE", pyarrow.types.is_string), + ("STRING", None, pyarrow.types.is_string), + ("string", "NULLABLE", pyarrow.types.is_string), + ("StRiNg", "NULLABLE", pyarrow.types.is_string), + ("BYTES", "NULLABLE", pyarrow.types.is_binary), + ("INTEGER", "NULLABLE", pyarrow.types.is_int64), + ("INT64", "NULLABLE", pyarrow.types.is_int64), + ("FLOAT", "NULLABLE", pyarrow.types.is_float64), + ("FLOAT64", "NULLABLE", pyarrow.types.is_float64), + ("NUMERIC", "NULLABLE", is_numeric), + ( + "BIGNUMERIC", + "NULLABLE", + is_bignumeric, + ), + ("BOOLEAN", "NULLABLE", pyarrow.types.is_boolean), + ("BOOL", "NULLABLE", pyarrow.types.is_boolean), + ("TIMESTAMP", "NULLABLE", 
is_timestamp), + ("DATE", "NULLABLE", pyarrow.types.is_date32), + ("TIME", "NULLABLE", pyarrow.types.is_time64), + ("DATETIME", "NULLABLE", is_datetime), + ("GEOGRAPHY", "NULLABLE", pyarrow.types.is_string), + ("UNKNOWN_TYPE", "NULLABLE", is_none), + # Use pyarrow.list_(item_type) for repeated (array) fields. + ( + "STRING", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_string(type_.value_type), + ), + ), + ( + "STRING", + "repeated", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_string(type_.value_type), + ), + ), + ( + "STRING", + "RePeAtEd", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_string(type_.value_type), + ), + ), + ( + "BYTES", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_binary(type_.value_type), + ), + ), + ( + "INTEGER", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_int64(type_.value_type), + ), + ), + ( + "INT64", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_int64(type_.value_type), + ), + ), + ( + "FLOAT", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_float64(type_.value_type), + ), + ), + ( + "FLOAT64", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_float64(type_.value_type), + ), + ), + ( + "NUMERIC", + "REPEATED", + all_(pyarrow.types.is_list, lambda type_: is_numeric(type_.value_type)), + ), + ( + "BIGNUMERIC", + "REPEATED", + all_(pyarrow.types.is_list, lambda type_: is_bignumeric(type_.value_type)), + ), + ( + "BOOLEAN", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_boolean(type_.value_type), + ), + ), + ( + "BOOL", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_boolean(type_.value_type), + ), + ), + ( + "TIMESTAMP", + "REPEATED", + all_(pyarrow.types.is_list, lambda type_: is_timestamp(type_.value_type)), + ), + ( + "DATE", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_date32(type_.value_type), + ), + ), + ( + "TIME", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_time64(type_.value_type), + ), + ), + ( + "DATETIME", + "REPEATED", + all_(pyarrow.types.is_list, lambda type_: is_datetime(type_.value_type)), + ), + ( + "GEOGRAPHY", + "REPEATED", + all_( + pyarrow.types.is_list, + lambda type_: pyarrow.types.is_string(type_.value_type), + ), + ), + ("RECORD", "REPEATED", is_none), + ("UNKNOWN_TYPE", "REPEATED", is_none), + ], +) +def test_bq_to_arrow_data_type(module_under_test, bq_type, bq_mode, is_correct_type): + field = bigquery.SchemaField("ignored_name", bq_type, mode=bq_mode) + actual = module_under_test.bq_to_arrow_data_type(field) + assert is_correct_type(actual) + + +@pytest.mark.parametrize("bq_type", ["RECORD", "record", "STRUCT", "struct"]) +def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type): + fields = ( + bigquery.SchemaField("field01", "STRING"), + bigquery.SchemaField("field02", "BYTES"), + bigquery.SchemaField("field03", "INTEGER"), + bigquery.SchemaField("field04", "INT64"), + bigquery.SchemaField("field05", "FLOAT"), + bigquery.SchemaField("field06", "FLOAT64"), + bigquery.SchemaField("field07", "NUMERIC"), + bigquery.SchemaField("field08", "BIGNUMERIC"), + bigquery.SchemaField("field09", "BOOLEAN"), + bigquery.SchemaField("field10", "BOOL"), + bigquery.SchemaField("field11", "TIMESTAMP"), + bigquery.SchemaField("field12", "DATE"), + 
bigquery.SchemaField("field13", "TIME"), + bigquery.SchemaField("field14", "DATETIME"), + bigquery.SchemaField("field15", "GEOGRAPHY"), + ) + + field = bigquery.SchemaField( + "ignored_name", bq_type, mode="NULLABLE", fields=fields + ) + actual = module_under_test.bq_to_arrow_data_type(field) + + expected = ( + pyarrow.field("field01", pyarrow.string()), + pyarrow.field("field02", pyarrow.binary()), + pyarrow.field("field03", pyarrow.int64()), + pyarrow.field("field04", pyarrow.int64()), + pyarrow.field("field05", pyarrow.float64()), + pyarrow.field("field06", pyarrow.float64()), + pyarrow.field("field07", module_under_test.pyarrow_numeric()), + pyarrow.field("field08", module_under_test.pyarrow_bignumeric()), + pyarrow.field("field09", pyarrow.bool_()), + pyarrow.field("field10", pyarrow.bool_()), + pyarrow.field("field11", module_under_test.pyarrow_timestamp()), + pyarrow.field("field12", pyarrow.date32()), + pyarrow.field("field13", module_under_test.pyarrow_time()), + pyarrow.field("field14", module_under_test.pyarrow_datetime()), + pyarrow.field("field15", pyarrow.string()), + ) + expected = pyarrow.struct(expected) + + assert pyarrow.types.is_struct(actual) + assert actual.num_fields == len(fields) + assert actual.equals(expected) + + +@pytest.mark.parametrize("bq_type", ["RECORD", "record", "STRUCT", "struct"]) +def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type): + fields = ( + bigquery.SchemaField("field01", "STRING"), + bigquery.SchemaField("field02", "BYTES"), + bigquery.SchemaField("field03", "INTEGER"), + bigquery.SchemaField("field04", "INT64"), + bigquery.SchemaField("field05", "FLOAT"), + bigquery.SchemaField("field06", "FLOAT64"), + bigquery.SchemaField("field07", "NUMERIC"), + bigquery.SchemaField("field08", "BIGNUMERIC"), + bigquery.SchemaField("field09", "BOOLEAN"), + bigquery.SchemaField("field10", "BOOL"), + bigquery.SchemaField("field11", "TIMESTAMP"), + bigquery.SchemaField("field12", "DATE"), + bigquery.SchemaField("field13", "TIME"), + bigquery.SchemaField("field14", "DATETIME"), + bigquery.SchemaField("field15", "GEOGRAPHY"), + ) + + field = bigquery.SchemaField( + "ignored_name", bq_type, mode="REPEATED", fields=fields + ) + actual = module_under_test.bq_to_arrow_data_type(field) + + expected = ( + pyarrow.field("field01", pyarrow.string()), + pyarrow.field("field02", pyarrow.binary()), + pyarrow.field("field03", pyarrow.int64()), + pyarrow.field("field04", pyarrow.int64()), + pyarrow.field("field05", pyarrow.float64()), + pyarrow.field("field06", pyarrow.float64()), + pyarrow.field("field07", module_under_test.pyarrow_numeric()), + pyarrow.field("field08", module_under_test.pyarrow_bignumeric()), + pyarrow.field("field09", pyarrow.bool_()), + pyarrow.field("field10", pyarrow.bool_()), + pyarrow.field("field11", module_under_test.pyarrow_timestamp()), + pyarrow.field("field12", pyarrow.date32()), + pyarrow.field("field13", module_under_test.pyarrow_time()), + pyarrow.field("field14", module_under_test.pyarrow_datetime()), + pyarrow.field("field15", pyarrow.string()), + ) + expected_value_type = pyarrow.struct(expected) + + assert pyarrow.types.is_list(actual) + assert pyarrow.types.is_struct(actual.value_type) + assert actual.value_type.num_fields == len(fields) + assert actual.value_type.equals(expected_value_type) + + +def test_bq_to_arrow_data_type_w_struct_unknown_subfield(module_under_test): + fields = ( + bigquery.SchemaField("field1", "STRING"), + bigquery.SchemaField("field2", "INTEGER"), + # Don't know what to convert UNKNOWN_TYPE 
to; let type inference work
+        # instead.
+        bigquery.SchemaField("field3", "UNKNOWN_TYPE"),
+    )
+    field = bigquery.SchemaField(
+        "ignored_name", "RECORD", mode="NULLABLE", fields=fields
+    )
+
+    with warnings.catch_warnings(record=True) as warned:
+        actual = module_under_test.bq_to_arrow_data_type(field)
+
+    assert actual is None
+    assert len(warned) == 1
+    warning = warned[0]
+    assert "field3" in str(warning)
+
+
+@pytest.mark.parametrize(
+    "bq_schema,expected",
+    [
+        (
+            bigquery.SchemaField(
+                "field1",
+                "RANGE",
+                range_element_type=schema.FieldElementType("DATE"),
+                mode="NULLABLE",
+            ),
+            pyarrow.struct(
+                [
+                    ("start", pyarrow.date32()),
+                    ("end", pyarrow.date32()),
+                ]
+            ),
+        ),
+        (
+            bigquery.SchemaField(
+                "field2",
+                "RANGE",
+                range_element_type=schema.FieldElementType("DATETIME"),
+                mode="NULLABLE",
+            ),
+            pyarrow.struct(
+                [
+                    ("start", pyarrow.timestamp("us", tz=None)),
+                    ("end", pyarrow.timestamp("us", tz=None)),
+                ]
+            ),
+        ),
+        (
+            bigquery.SchemaField(
+                "field3",
+                "RANGE",
+                range_element_type=schema.FieldElementType("TIMESTAMP"),
+                mode="NULLABLE",
+            ),
+            pyarrow.struct(
+                [
+                    ("start", pyarrow.timestamp("us", tz="UTC")),
+                    ("end", pyarrow.timestamp("us", tz="UTC")),
+                ]
+            ),
+        ),
+    ],
+)
+def test_bq_to_arrow_data_type_w_range(module_under_test, bq_schema, expected):
+    actual = module_under_test.bq_to_arrow_data_type(bq_schema)
+    assert actual.equals(expected)
+
+
+def test_bq_to_arrow_data_type_w_range_no_element(module_under_test):
+    field = bigquery.SchemaField("field1", "RANGE", mode="NULLABLE")
+    with pytest.raises(ValueError, match="Range element type cannot be None"):
+        module_under_test.bq_to_arrow_data_type(field)
+
+
+def test_bq_to_arrow_schema_w_unknown_type(module_under_test):
+    fields = (
+        bigquery.SchemaField("field1", "STRING"),
+        bigquery.SchemaField("field2", "INTEGER"),
+        # Don't know what to convert UNKNOWN_TYPE to; let type inference
+        # work instead.
+        bigquery.SchemaField("field3", "UNKNOWN_TYPE"),
+    )
+    with warnings.catch_warnings(record=True) as warned:
+        actual = module_under_test.bq_to_arrow_schema(fields)
+        assert actual is None
+
+    assert len(warned) == 1
+    warning = warned[0]
+    assert "field3" in str(warning)
+
+
+def test_bq_to_arrow_field_type_override(module_under_test):
+    # When loading pandas data, we may need to override the type
+    # decision based on data contents, because GEOGRAPHY data can be
+    # stored as either text or binary.
+ + assert ( + module_under_test.bq_to_arrow_field(bigquery.SchemaField("g", "GEOGRAPHY")).type + == pyarrow.string() + ) + + assert ( + module_under_test.bq_to_arrow_field( + bigquery.SchemaField("g", "GEOGRAPHY"), + pyarrow.binary(), + ).type + == pyarrow.binary() + ) + + +def test_bq_to_arrow_field_set_repeated_nullable_false(module_under_test): + assert ( + module_under_test.bq_to_arrow_field( + bigquery.SchemaField("name", "STRING", mode="REPEATED") + ).nullable + is False + ) + + assert ( + module_under_test.bq_to_arrow_field( + bigquery.SchemaField("name", "STRING", mode="NULLABLE") + ).nullable + is True + ) + + +@pytest.mark.parametrize( + "field_type, metadata", + [ + ("datetime", {b"ARROW:extension:name": b"google:sqlType:datetime"}), + ( + "geography", + { + b"ARROW:extension:name": b"google:sqlType:geography", + b"ARROW:extension:metadata": b'{"encoding": "WKT"}', + }, + ), + ], +) +def test_bq_to_arrow_field_metadata(module_under_test, field_type, metadata): + assert ( + module_under_test.bq_to_arrow_field( + bigquery.SchemaField("g", field_type) + ).metadata + == metadata + ) + + +def test_bq_to_arrow_scalars(module_under_test): + assert ( + module_under_test.bq_to_arrow_scalars("BIGNUMERIC") + == module_under_test.pyarrow_bignumeric + ) + assert module_under_test.bq_to_arrow_scalars("UNKNOWN_TYPE") is None
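A minimal sketch of the behavior the new bigquery_to_pyarrow module encodes, mirroring the unit tests above. The schema values here are illustrative, not part of the change:

    from google.cloud import bigquery

    from pandas_gbq.schema import bigquery_to_pyarrow

    # Known scalar types map to concrete Arrow types; REPEATED fields become
    # non-nullable list fields.
    bq_schema = [
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("scores", "FLOAT64", mode="REPEATED"),
        bigquery.SchemaField("created", "TIMESTAMP"),
    ]
    arrow_schema = bigquery_to_pyarrow.bq_to_arrow_schema(bq_schema)
    # name: string, scores: list<item: double> not null,
    # created: timestamp[us, tz=UTC]

    # Any unknown field type makes the whole schema fall back to type
    # inference: bq_to_arrow_schema warns, naming the field, and returns None.
    unknown = bq_schema + [bigquery.SchemaField("mystery", "UNKNOWN_TYPE")]
    assert bigquery_to_pyarrow.bq_to_arrow_schema(unknown) is None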
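Similarly, a hedged sketch of how default_types_mapper plugs into pyarrow's Table.to_pandas. The dtype choices below are examples, not the defaults pandas-gbq ships:

    import pandas
    import pyarrow

    from pandas_gbq.schema import bigquery_to_pyarrow

    # Build a mapper that prefers pandas' null-safe extension dtypes.
    mapper = bigquery_to_pyarrow.default_types_mapper(
        bool_dtype=pandas.BooleanDtype(),
        int_dtype=pandas.Int64Dtype(),
        float_dtype=pandas.Float64Dtype(),
        string_dtype=pandas.StringDtype(),
    )

    table = pyarrow.table({"x": pyarrow.array([1, None, 3], type=pyarrow.int64())})
    df = table.to_pandas(types_mapper=mapper)
    # df["x"] is Int64 (nullable) rather than float64 with NaN for the null.
    # Arrow types with no configured dtype fall through to pyarrow's default
    # conversion, because the mapper returns None for them.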
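And a sketch of the new JSON detection in pyarrow_to_bigquery, assuming pyarrow >= 19.0 (for pyarrow.json_) and db-dtypes >= 1.4 (for JSONArrowType):

    import db_dtypes
    import pyarrow

    from pandas_gbq.schema import pyarrow_to_bigquery

    # Extension types all share one type id, so the JSON check is
    # isinstance-based rather than id-based.
    field = pyarrow_to_bigquery.arrow_type_to_bigquery_field(
        "payload", pyarrow.json_(pyarrow.string())
    )
    assert field.field_type == "JSON"

    # The db-dtypes extension type is detected the same way.
    field = pyarrow_to_bigquery.arrow_type_to_bigquery_field(
        "payload", db_dtypes.JSONArrowType()
    )
    assert field.field_type == "JSON"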
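Finally, the GEOGRAPHY override exercised by test_bq_to_arrow_field_type_override, restated as a sketch: the array_type argument lets a caller force a binary Arrow type when the local data is binary rather than text:

    import pyarrow
    from google.cloud import bigquery

    from pandas_gbq.schema import bigquery_to_pyarrow

    geo = bigquery.SchemaField("g", "GEOGRAPHY")

    # Default: GEOGRAPHY reads as string (WKT), with extension metadata attached.
    assert bigquery_to_pyarrow.bq_to_arrow_field(geo).type == pyarrow.string()

    # Override: pass the Arrow type to use when the data is stored as binary.
    assert (
        bigquery_to_pyarrow.bq_to_arrow_field(geo, pyarrow.binary()).type
        == pyarrow.binary()
    )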