23
23
import warnings
24
24
from typing import Any , Union
25
25
26
- from packaging import version
27
-
28
26
from google .cloud .bigquery import _helpers
27
+ from google .cloud .bigquery import _pyarrow_helpers
28
+ from google .cloud .bigquery import _versions_helpers
29
29
from google .cloud .bigquery import schema
30
30
31
31
try :
49
49
db_dtypes_import_exception = exc
50
50
date_dtype_name = time_dtype_name = "" # Use '' rather than None because pytype
51
51
52
- pyarrow = _helpers .PYARROW_VERSIONS .try_import ()
52
+ pyarrow = _versions_helpers .PYARROW_VERSIONS .try_import ()
53
+
54
+ _BIGNUMERIC_SUPPORT = False
55
+ if pyarrow is not None :
56
+ _BIGNUMERIC_SUPPORT = True
53
57
54
58
try :
55
59
# _BaseGeometry is used to detect shapely objects in `bq_to_arrow_array`
@@ -119,87 +123,6 @@ def __init__(self):
119
123
self .done = False
120
124
121
125
122
- def pyarrow_datetime ():
123
- return pyarrow .timestamp ("us" , tz = None )
124
-
125
-
126
- def pyarrow_numeric ():
127
- return pyarrow .decimal128 (38 , 9 )
128
-
129
-
130
- def pyarrow_bignumeric ():
131
- # 77th digit is partial.
132
- # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types
133
- return pyarrow .decimal256 (76 , 38 )
134
-
135
-
136
- def pyarrow_time ():
137
- return pyarrow .time64 ("us" )
138
-
139
-
140
- def pyarrow_timestamp ():
141
- return pyarrow .timestamp ("us" , tz = "UTC" )
142
-
143
-
144
- if pyarrow :
145
- # This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py
146
- # When modifying it be sure to update it there as well.
147
- BQ_TO_ARROW_SCALARS = {
148
- "BOOL" : pyarrow .bool_ ,
149
- "BOOLEAN" : pyarrow .bool_ ,
150
- "BYTES" : pyarrow .binary ,
151
- "DATE" : pyarrow .date32 ,
152
- "DATETIME" : pyarrow_datetime ,
153
- "FLOAT" : pyarrow .float64 ,
154
- "FLOAT64" : pyarrow .float64 ,
155
- "GEOGRAPHY" : pyarrow .string ,
156
- "INT64" : pyarrow .int64 ,
157
- "INTEGER" : pyarrow .int64 ,
158
- "NUMERIC" : pyarrow_numeric ,
159
- "STRING" : pyarrow .string ,
160
- "TIME" : pyarrow_time ,
161
- "TIMESTAMP" : pyarrow_timestamp ,
162
- }
163
- ARROW_SCALAR_IDS_TO_BQ = {
164
- # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
165
- pyarrow .bool_ ().id : "BOOL" ,
166
- pyarrow .int8 ().id : "INT64" ,
167
- pyarrow .int16 ().id : "INT64" ,
168
- pyarrow .int32 ().id : "INT64" ,
169
- pyarrow .int64 ().id : "INT64" ,
170
- pyarrow .uint8 ().id : "INT64" ,
171
- pyarrow .uint16 ().id : "INT64" ,
172
- pyarrow .uint32 ().id : "INT64" ,
173
- pyarrow .uint64 ().id : "INT64" ,
174
- pyarrow .float16 ().id : "FLOAT64" ,
175
- pyarrow .float32 ().id : "FLOAT64" ,
176
- pyarrow .float64 ().id : "FLOAT64" ,
177
- pyarrow .time32 ("ms" ).id : "TIME" ,
178
- pyarrow .time64 ("ns" ).id : "TIME" ,
179
- pyarrow .timestamp ("ns" ).id : "TIMESTAMP" ,
180
- pyarrow .date32 ().id : "DATE" ,
181
- pyarrow .date64 ().id : "DATETIME" , # because millisecond resolution
182
- pyarrow .binary ().id : "BYTES" ,
183
- pyarrow .string ().id : "STRING" , # also alias for pyarrow.utf8()
184
- # The exact scale and precision don't matter, see below.
185
- pyarrow .decimal128 (38 , scale = 9 ).id : "NUMERIC" ,
186
- }
187
-
188
- if version .parse (pyarrow .__version__ ) >= version .parse ("3.0.0" ):
189
- BQ_TO_ARROW_SCALARS ["BIGNUMERIC" ] = pyarrow_bignumeric
190
- # The exact decimal's scale and precision are not important, as only
191
- # the type ID matters, and it's the same for all decimal256 instances.
192
- ARROW_SCALAR_IDS_TO_BQ [pyarrow .decimal256 (76 , scale = 38 ).id ] = "BIGNUMERIC"
193
- _BIGNUMERIC_SUPPORT = True
194
- else :
195
- _BIGNUMERIC_SUPPORT = False # pragma: NO COVER
196
-
197
- else : # pragma: NO COVER
198
- BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER
199
- ARROW_SCALAR_IDS_TO_BQ = {} # pragma: NO_COVER
200
- _BIGNUMERIC_SUPPORT = False # pragma: NO COVER
201
-
202
-
203
126
BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA = {
204
127
"GEOGRAPHY" : {
205
128
b"ARROW:extension:name" : b"google:sqlType:geography" ,
@@ -240,7 +163,7 @@ def bq_to_arrow_data_type(field):
240
163
if field_type_upper in schema ._STRUCT_TYPES :
241
164
return bq_to_arrow_struct_data_type (field )
242
165
243
- data_type_constructor = BQ_TO_ARROW_SCALARS . get (field_type_upper )
166
+ data_type_constructor = _pyarrow_helpers . bq_to_arrow_scalars (field_type_upper )
244
167
if data_type_constructor is None :
245
168
return None
246
169
return data_type_constructor ()
@@ -568,7 +491,9 @@ def augment_schema(dataframe, current_bq_schema):
568
491
if pyarrow .types .is_list (arrow_table .type ):
569
492
# `pyarrow.ListType`
570
493
detected_mode = "REPEATED"
571
- detected_type = ARROW_SCALAR_IDS_TO_BQ .get (arrow_table .values .type .id )
494
+ detected_type = _pyarrow_helpers .arrow_scalar_ids_to_bq (
495
+ arrow_table .values .type .id
496
+ )
572
497
573
498
# For timezone-naive datetimes, pyarrow assumes the UTC timezone and adds
574
499
# it to such datetimes, causing them to be recognized as TIMESTAMP type.
@@ -584,7 +509,7 @@ def augment_schema(dataframe, current_bq_schema):
584
509
detected_type = "DATETIME"
585
510
else :
586
511
detected_mode = field .mode
587
- detected_type = ARROW_SCALAR_IDS_TO_BQ . get (arrow_table .type .id )
512
+ detected_type = _pyarrow_helpers . arrow_scalar_ids_to_bq (arrow_table .type .id )
588
513
589
514
if detected_type is None :
590
515
unknown_type_fields .append (field )
@@ -705,13 +630,13 @@ def dataframe_to_parquet(
705
630
706
631
This argument is ignored for ``pyarrow`` versions earlier than ``4.0.0``.
707
632
"""
708
- pyarrow = _helpers .PYARROW_VERSIONS .try_import (raise_if_error = True )
633
+ pyarrow = _versions_helpers .PYARROW_VERSIONS .try_import (raise_if_error = True )
709
634
710
635
import pyarrow .parquet # type: ignore
711
636
712
637
kwargs = (
713
638
{"use_compliant_nested_type" : parquet_use_compliant_nested_type }
714
- if _helpers .PYARROW_VERSIONS .use_compliant_nested_type
639
+ if _versions_helpers .PYARROW_VERSIONS .use_compliant_nested_type
715
640
else {}
716
641
)
717
642
0 commit comments