googleapis · tswast · Mar 10, 2025 · Mar 10, 2025 · Mar 10, 2025 · Mar 10, 2025
@@ -0,0 +1,3 @@
+# Copyright (c) 2025 pandas-gbq Authors All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
@@ -0,0 +1,275 @@
+# Copyright (c) 2025 pandas-gbq Authors All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+from typing import Any, Union
+import warnings
+
+import db_dtypes
+from google.cloud import bigquery
+import pyarrow
+
+
+def pyarrow_datetime():
+    return pyarrow.timestamp("us", tz=None)
+
+
+def pyarrow_numeric():
+    return pyarrow.decimal128(38, 9)
+
+
+def pyarrow_bignumeric():
+    # 77th digit is partial.
+    # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types
+    return pyarrow.decimal256(76, 38)
+
+
+def pyarrow_time():
+    return pyarrow.time64("us")
+
+
+def pyarrow_timestamp():
+    return pyarrow.timestamp("us", tz="UTC")
+
+
+# Prefer JSON type built-in to pyarrow (adding in 19.0.0), if available.
+# Otherwise, fallback to db-dtypes, where the JSONArrowType was added in 1.4.0,
+# but since they might have an older db-dtypes, have string as a fallback for that.
+if hasattr(pyarrow, "json_"):
+    json_arrow_type = pyarrow.json_(pyarrow.string())
+elif hasattr(db_dtypes, "JSONArrowType"):
+    json_arrow_type = db_dtypes.JSONArrowType()
+else:
+    json_arrow_type = pyarrow.string()
+
+
+# This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py
+# When modifying it be sure to update it there as well.
+# Note(todo!!): type "BIGNUMERIC"'s matching pyarrow type is added in _pandas_helpers.py
+_BQ_TO_ARROW_SCALARS = {
+    "BIGNUMERIC": pyarrow_bignumeric,
+    "BOOL": pyarrow.bool_,
+    "BOOLEAN": pyarrow.bool_,
+    "BYTES": pyarrow.binary,
+    "DATE": pyarrow.date32,
+    "DATETIME": pyarrow_datetime,
+    "FLOAT": pyarrow.float64,
+    "FLOAT64": pyarrow.float64,
+    "GEOGRAPHY": pyarrow.string,
+    "INT64": pyarrow.int64,
+    "INTEGER": pyarrow.int64,
+    "JSON": json_arrow_type,
+    "NUMERIC": pyarrow_numeric,
+    "STRING": pyarrow.string,
+    "TIME": pyarrow_time,
+    "TIMESTAMP": pyarrow_timestamp,
+}
+
+_STRUCT_TYPES = ("RECORD", "STRUCT")
+
+
+def bq_to_arrow_scalars(bq_scalar: str):
+    """
+    Returns:
+        The Arrow scalar type that the input BigQuery scalar type maps to.
+        If it cannot find the BigQuery scalar, return None.
+    """
+    return _BQ_TO_ARROW_SCALARS.get(bq_scalar)
+
+
+BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA = {
+    "GEOGRAPHY": {
+        b"ARROW:extension:name": b"google:sqlType:geography",
+        b"ARROW:extension:metadata": b'{"encoding": "WKT"}',
+    },
+    "DATETIME": {b"ARROW:extension:name": b"google:sqlType:datetime"},
+    "JSON": {b"ARROW:extension:name": b"google:sqlType:json"},
+}
+
+
+def bq_to_arrow_struct_data_type(field):
+    arrow_fields = []
+    for subfield in field.fields:
+        arrow_subfield = bq_to_arrow_field(subfield)
+        if arrow_subfield:
+            arrow_fields.append(arrow_subfield)
+        else:
+            # Could not determine a subfield type. Fallback to type
+            # inference.
+            return None
+    return pyarrow.struct(arrow_fields)
+
+
+def bq_to_arrow_range_data_type(field):
+    if field is None:
+        raise ValueError(
+            "Range element type cannot be None, must be one of "
+            "DATE, DATETIME, or TIMESTAMP"
+        )
+    element_type = field.element_type.upper()
+    arrow_element_type = bq_to_arrow_scalars(element_type)()
+    return pyarrow.struct([("start", arrow_element_type), ("end", arrow_element_type)])
+
+
+def bq_to_arrow_data_type(field):
+    """Return the Arrow data type, corresponding to a given BigQuery column.
+
+    Returns:
+        None: if default Arrow type inspection should be used.
+    """
+    if field.mode is not None and field.mode.upper() == "REPEATED":
+        inner_type = bq_to_arrow_data_type(
+            bigquery.SchemaField(field.name, field.field_type, fields=field.fields)
+        )
+        if inner_type:
+            return pyarrow.list_(inner_type)
+        return None
+
+    field_type_upper = field.field_type.upper() if field.field_type else ""
+    if field_type_upper in _STRUCT_TYPES:
+        return bq_to_arrow_struct_data_type(field)
+
+    if field_type_upper == "RANGE":
+        return bq_to_arrow_range_data_type(field.range_element_type)
+
+    data_type_constructor = bq_to_arrow_scalars(field_type_upper)
+    if data_type_constructor is None:
+        return None
+    return data_type_constructor()
+
+
+def bq_to_arrow_field(bq_field, array_type=None):
+    """Return the Arrow field, corresponding to a given BigQuery column.
+
+    Returns:
+        None: if the Arrow type cannot be determined.
+    """
+    arrow_type = bq_to_arrow_data_type(bq_field)
+    if arrow_type is not None:
+        if array_type is not None:
+            arrow_type = array_type  # For GEOGRAPHY, at least initially
+        metadata = BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA.get(
+            bq_field.field_type.upper() if bq_field.field_type else ""
+        )
+        return pyarrow.field(
+            bq_field.name,
+            arrow_type,
+            # Even if the remote schema is REQUIRED, there's a chance there's
+            # local NULL values. Arrow will gladly interpret these NULL values
+            # as non-NULL and give you an arbitrary value. See:
+            # https://github.com/googleapis/python-bigquery/issues/1692
+            nullable=False if bq_field.mode.upper() == "REPEATED" else True,
+            metadata=metadata,
+        )
+
+    warnings.warn(
+        "Unable to determine Arrow type for field '{}'.".format(bq_field.name)
+    )
+    return None
+
+
+def bq_to_arrow_schema(bq_schema):
+    """Return the Arrow schema, corresponding to a given BigQuery schema.
+
+    Returns:
+        None: if any Arrow type cannot be determined.
+    """
+    arrow_fields = []
+    for bq_field in bq_schema:
+        arrow_field = bq_to_arrow_field(bq_field)
+        if arrow_field is None:
+            # Auto-detect the schema if there is an unknown field type.
+            return None
+        arrow_fields.append(arrow_field)
+    return pyarrow.schema(arrow_fields)
+
+
+def default_types_mapper(
+    date_as_object: bool = False,
+    bool_dtype: Union[Any, None] = None,
+    int_dtype: Union[Any, None] = None,
+    float_dtype: Union[Any, None] = None,
+    string_dtype: Union[Any, None] = None,
+    date_dtype: Union[Any, None] = None,
+    datetime_dtype: Union[Any, None] = None,
+    time_dtype: Union[Any, None] = None,
+    timestamp_dtype: Union[Any, None] = None,
+    range_date_dtype: Union[Any, None] = None,
+    range_datetime_dtype: Union[Any, None] = None,
+    range_timestamp_dtype: Union[Any, None] = None,
+):
+    """Create a mapping from pyarrow types to pandas types.
+
+    This overrides the pandas defaults to use null-safe extension types where
+    available.
+
+    See: https://arrow.apache.org/docs/python/api/datatypes.html for a list of
+    data types. See:
+    tests/unit/test__pandas_helpers.py::test_bq_to_arrow_data_type for
+    BigQuery to Arrow type mapping.
+
+    Note to google-cloud-bigquery developers: If you update the default dtypes,
+    also update the docs at docs/usage/pandas.rst.
+    """
+
+    def types_mapper(arrow_data_type):
+        if bool_dtype is not None and pyarrow.types.is_boolean(arrow_data_type):
+            return bool_dtype
+
+        elif int_dtype is not None and pyarrow.types.is_integer(arrow_data_type):
+            return int_dtype
+
+        elif float_dtype is not None and pyarrow.types.is_floating(arrow_data_type):
+            return float_dtype
+
+        elif string_dtype is not None and pyarrow.types.is_string(arrow_data_type):
+            return string_dtype
+
+        elif (
+            # If date_as_object is True, we know some DATE columns are
+            # out-of-bounds of what is supported by pandas.
+            date_dtype is not None
+            and not date_as_object
+            and pyarrow.types.is_date(arrow_data_type)
+        ):
+            return date_dtype
+
+        elif (
+            datetime_dtype is not None
+            and pyarrow.types.is_timestamp(arrow_data_type)
+            and arrow_data_type.tz is None
+        ):
+            return datetime_dtype
+
+        elif (
+            timestamp_dtype is not None
+            and pyarrow.types.is_timestamp(arrow_data_type)
+            and arrow_data_type.tz is not None
+        ):
+            return timestamp_dtype
+
+        elif time_dtype is not None and pyarrow.types.is_time(arrow_data_type):
+            return time_dtype
+
+        elif pyarrow.types.is_struct(arrow_data_type):
+            if range_datetime_dtype is not None and arrow_data_type.equals(
+                range_datetime_dtype.pyarrow_dtype
+            ):
+                return range_datetime_dtype
+
+            elif range_date_dtype is not None and arrow_data_type.equals(
+                range_date_dtype.pyarrow_dtype
+            ):
+                return range_date_dtype
+
+            # TODO: this section does not have a test yet OR at least not one that is
+            # recognized by coverage, hence the pragma. See Issue: #2132
+            elif (
+                range_timestamp_dtype is not None
+                and arrow_data_type.equals(  # pragma: NO COVER
+                    range_timestamp_dtype.pyarrow_dtype
+                )
+            ):
+                return range_timestamp_dtype
+
+    return types_mapper
@@ -4,6 +4,7 @@
 
 from typing import Optional, cast
 
+import db_dtypes
 from google.cloud.bigquery import schema
 import pyarrow
 import pyarrow.types
@@ -34,27 +35,20 @@
     # the type ID matters, and it's the same for all decimal256 instances.
     pyarrow.decimal128(38, scale=9).id: "NUMERIC",
     pyarrow.decimal256(76, scale=38).id: "BIGNUMERIC",
+    # NOTE: all extension types (e.g. json_, uuid, db_dtypes.JSONArrowType)
+    # have the same id (31 as of version 19.0.1), so these should not be
+    # matched by id.
 }
 
 
-def arrow_type_to_bigquery_field(
+def arrow_scalar_type_to_bigquery_field(
     name, type_, default_type="STRING"
 ) -> Optional[schema.SchemaField]:
-    """Infers the BigQuery schema field type from an arrow type.
-
-    Args:
-        name (str):
-            Name of the column/field.
-        type_:
-            A pyarrow type object.
+    """Infers the BigQuery schema field type from a scalar arrow type.
 
     Returns:
-        Optional[schema.SchemaField]:
-            The schema field, or None if a type cannot be inferred, such as if
-            it is a type that doesn't have a clear mapping in BigQuery.
-
-            null() are assumed to be the ``default_type``, since there are no
-            values that contradict that.
+        The BigQuery scalar type that the input arrow scalar type maps to.
+        If it cannot find the arrow scalar, return None.
     """
     # If a sub-field is the null type, then assume it's the default type, as
     # that's the best we can do.
@@ -82,6 +76,46 @@ def arrow_type_to_bigquery_field(
     if detected_type is not None:
         return schema.SchemaField(name, detected_type)
 
+    # NOTE: all extension types (e.g. json_, uuid, db_dtypes.JSONArrowType)
+    # have the same id (31 as of version 19.0.1), so these should not be
+    # matched by id.
+    if (hasattr(pyarrow, "JsonType") and isinstance(type_, pyarrow.JsonType)) or (
+        hasattr(db_dtypes, "JSONArrowType")
+        and isinstance(type_, db_dtypes.JSONArrowType)
+    ):
+        return schema.SchemaField(name, "JSON")
+
+    # Could not identify a type.
+    return None
+
+
+def arrow_type_to_bigquery_field(
+    name, type_, default_type="STRING"
+) -> Optional[schema.SchemaField]:
+    """Infers the BigQuery schema field type from an arrow type.
+
+    Args:
+        name (str):
+            Name of the column/field.
+        type_:
+            A pyarrow type object.
+
+    Returns:
+        Optional[schema.SchemaField]:
+            The schema field, or None if a type cannot be inferred, such as if
+            it is a type that doesn't have a clear mapping in BigQuery.
+
+            null() are assumed to be the ``default_type``, since there are no
+            values that contradict that.
+    """
+    scalar_field = arrow_scalar_type_to_bigquery_field(
+        name,
+        type_,
+        default_type=default_type,
+    )
+    if scalar_field is not None:
+        return scalar_field
+
     if pyarrow.types.is_list(type_):
         return arrow_list_type_to_bigquery(name, type_, default_type=default_type)