1# Copyright 2023 Google LLC
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15"""Shared helper functions for connecting BigQuery and pyarrow.
16
17NOTE: This module is DEPRECATED. Please make updates in the pandas-gbq package,
18instead. See: go/pandas-gbq-and-bigframes-redundancy,
19https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/bigquery_to_pyarrow.py
20and
21https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/pyarrow_to_bigquery.py
22"""
23
24from typing import Any
25
26try:
27 import pyarrow # type: ignore
28except ImportError:
29 pyarrow = None
30
31try:
32 import db_dtypes # type: ignore
33
34 db_dtypes_import_exception = None
35except ImportError as exc:
36 db_dtypes = None
37 db_dtypes_import_exception = exc
38
39
def pyarrow_datetime():
    """Return a microsecond-resolution pyarrow timestamp type without a time zone."""
    naive_microseconds = pyarrow.timestamp("us", tz=None)
    return naive_microseconds
42
43
def pyarrow_numeric():
    """Return the pyarrow decimal128 type with precision 38 and scale 9."""
    numeric_type = pyarrow.decimal128(38, scale=9)
    return numeric_type
46
47
def pyarrow_bignumeric():
    """Return the pyarrow decimal256 type with precision 76 and scale 38."""
    # Precision is 76 rather than 77 because the 77th digit is only partial.
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types
    bignumeric_type = pyarrow.decimal256(76, scale=38)
    return bignumeric_type
52
53
def pyarrow_time():
    """Return the microsecond-resolution pyarrow time64 type."""
    microsecond_time = pyarrow.time64("us")
    return microsecond_time
56
57
def pyarrow_timestamp():
    """Return a microsecond-resolution pyarrow timestamp type in the UTC time zone."""
    utc_microseconds = pyarrow.timestamp("us", tz="UTC")
    return utc_microseconds
60
61
# Lookup tables between BigQuery scalar type names and pyarrow. Both remain
# empty dictionaries when pyarrow could not be imported.
_BQ_TO_ARROW_SCALARS = {}
_ARROW_SCALAR_IDS_TO_BQ = {}

if pyarrow:
    # BigQuery scalar type name -> callable that takes no arguments and
    # returns the corresponding pyarrow data type.
    #
    # NOTE: the bigquery_storage reader tests (test_reader.py) carry a copy of
    # this mapping; update that copy whenever this one changes.
    _BQ_TO_ARROW_SCALARS.update(
        {
            "BOOL": pyarrow.bool_,
            "BOOLEAN": pyarrow.bool_,
            "BYTES": pyarrow.binary,
            "DATE": pyarrow.date32,
            "DATETIME": pyarrow_datetime,
            "FLOAT": pyarrow.float64,
            "FLOAT64": pyarrow.float64,
            "GEOGRAPHY": pyarrow.string,
            "INT64": pyarrow.int64,
            "INTEGER": pyarrow.int64,
            # pyarrow ships a built-in JSON type (added in 19.0.0) that would
            # normally be preferred. string() is used instead so the mapping
            # stays as close as possible to the BQ Storage API, which uses
            # the string() dtype because JSON support in Arrow predates JSON
            # support in BigQuery by several years.
            "JSON": pyarrow.string,
            "NUMERIC": pyarrow_numeric,
            "STRING": pyarrow.string,
            "TIME": pyarrow_time,
            "TIMESTAMP": pyarrow_timestamp,
            "BIGNUMERIC": pyarrow_bignumeric,
        }
    )

    # DEPRECATED: update pandas_gbq.schema.pyarrow_to_bigquery, instead.
    #
    # pyarrow type id -> BigQuery scalar type name, built from sample type
    # instances paired with the BigQuery name their id maps to.
    # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
    #
    # NOTE: all extension types (e.g. json_, uuid, db_dtypes.JSONArrowType)
    # have the same id (31 as of version 19.0.1), so these should not be
    # matched by id and are deliberately left out.
    _ARROW_SCALAR_IDS_TO_BQ = {
        arrow_type.id: bq_name
        for arrow_type, bq_name in (
            (pyarrow.bool_(), "BOOL"),
            (pyarrow.int8(), "INT64"),
            (pyarrow.int16(), "INT64"),
            (pyarrow.int32(), "INT64"),
            (pyarrow.int64(), "INT64"),
            (pyarrow.uint8(), "INT64"),
            (pyarrow.uint16(), "INT64"),
            (pyarrow.uint32(), "INT64"),
            (pyarrow.uint64(), "INT64"),
            (pyarrow.float16(), "FLOAT64"),
            (pyarrow.float32(), "FLOAT64"),
            (pyarrow.float64(), "FLOAT64"),
            (pyarrow.time32("ms"), "TIME"),
            (pyarrow.time64("ns"), "TIME"),
            (pyarrow.timestamp("ns"), "TIMESTAMP"),
            (pyarrow.date32(), "DATE"),
            # date64 maps to DATETIME because of its millisecond resolution.
            (pyarrow.date64(), "DATETIME"),
            (pyarrow.binary(), "BYTES"),
            # pyarrow.string() is also the alias for pyarrow.utf8().
            (pyarrow.string(), "STRING"),
            (pyarrow.large_string(), "STRING"),
            # For the decimal entries the exact precision and scale are not
            # important: only the type id matters, and it is the same for
            # every instance of a given decimal type.
            (pyarrow.decimal128(38, scale=9), "NUMERIC"),
            (pyarrow.decimal256(76, scale=38), "BIGNUMERIC"),
        )
    }
125
126
def bq_to_arrow_scalars(bq_scalar: str):
    """Look up the pyarrow type factory for a BigQuery scalar type name.

    DEPRECATED: update pandas_gbq.schema.bigquery_to_pyarrow, instead, which is
    to be added in https://github.com/googleapis/python-bigquery-pandas/pull/893.

    Args:
        bq_scalar: BigQuery scalar type name, used as the key into the
            module-level ``_BQ_TO_ARROW_SCALARS`` table (e.g. ``"INT64"``).

    Returns:
        The entry stored for ``bq_scalar``: a callable that takes no
        arguments and returns the matching pyarrow data type. ``None`` when
        the name is not in the table, which is always the case when pyarrow
        is not installed because the table is then empty.
    """
    try:
        return _BQ_TO_ARROW_SCALARS[bq_scalar]
    except KeyError:
        # Unknown names are reported as None rather than raising.
        return None
137
138
def arrow_scalar_ids_to_bq(arrow_scalar: Any):
    """Look up the BigQuery scalar type name for a pyarrow type id.

    DEPRECATED: update pandas_gbq.schema.pyarrow_to_bigquery, instead.

    Args:
        arrow_scalar: Key into the module-level ``_ARROW_SCALAR_IDS_TO_BQ``
            table, whose keys are the ``id`` attributes of pyarrow data types
            (e.g. ``pyarrow.int64().id``).

    Returns:
        The BigQuery scalar type name stored for that id (e.g. ``"INT64"``),
        or ``None`` when the id is not in the table.
    """
    bq_type_name = _ARROW_SCALAR_IDS_TO_BQ.get(arrow_scalar, None)
    return bq_type_name