1# Copyright 2015 Google LLC
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15"""Define API Tables."""
16
17from __future__ import absolute_import
18
19import copy
20import datetime
21import functools
22import operator
23import typing
24from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union, Sequence
25
26import warnings
27
28try:
29 import pandas # type: ignore
30except ImportError:
31 pandas = None
32
33try:
34 import pyarrow # type: ignore
35except ImportError:
36 pyarrow = None
37
38try:
39 import db_dtypes # type: ignore
40except ImportError:
41 db_dtypes = None
42
43try:
44 import geopandas # type: ignore
45except ImportError:
46 geopandas = None
47finally:
48 _COORDINATE_REFERENCE_SYSTEM = "EPSG:4326"
49
50try:
51 import shapely # type: ignore
52 from shapely import wkt # type: ignore
53except ImportError:
54 shapely = None
55else:
56 _read_wkt = wkt.loads
57
58import google.api_core.exceptions
59from google.api_core.page_iterator import HTTPIterator
60
61import google.cloud._helpers # type: ignore
62from google.cloud.bigquery import _helpers
63from google.cloud.bigquery import _pandas_helpers
64from google.cloud.bigquery import _versions_helpers
65from google.cloud.bigquery import exceptions as bq_exceptions
66from google.cloud.bigquery._tqdm_helpers import get_progress_bar
67from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
68from google.cloud.bigquery.enums import DefaultPandasDTypes
69from google.cloud.bigquery.external_config import ExternalConfig
70from google.cloud.bigquery import schema as _schema
71from google.cloud.bigquery.schema import _build_schema_resource
72from google.cloud.bigquery.schema import _parse_schema_resource
73from google.cloud.bigquery.schema import _to_schema_fields
74from google.cloud.bigquery import external_config
75
76if typing.TYPE_CHECKING: # pragma: NO COVER
77 # Unconditionally import optional dependencies again to tell pytype that
78 # they are not None, avoiding false "no attribute" errors.
79 import pandas
80 import pyarrow
81 import geopandas # type: ignore
82 from google.cloud import bigquery_storage # type: ignore
83 from google.cloud.bigquery.dataset import DatasetReference
84
85
86_NO_GEOPANDAS_ERROR = (
87 "The geopandas library is not installed, please install "
88 "geopandas to use the to_geodataframe() function."
89)
90_NO_PYARROW_ERROR = (
91 "The pyarrow library is not installed, please install "
92 "pyarrow to use the to_arrow() function."
93)
94_NO_SHAPELY_ERROR = (
95 "The shapely library is not installed, please install "
96 "shapely to use the geography_as_object option."
97)
98
99_TABLE_HAS_NO_SCHEMA = 'Table has no schema: call "client.get_table()"'
100
101_NO_SUPPORTED_DTYPE = (
102 "The dtype cannot to be converted to a pandas ExtensionArray "
103 "because the necessary `__from_arrow__` attribute is missing."
104)
105
106_RANGE_PYARROW_WARNING = (
107 "Unable to represent RANGE schema as struct using pandas ArrowDtype. Using "
108 "`object` instead. To use ArrowDtype, use pandas >= 1.5 and "
109 "pyarrow >= 10.0.1."
110)
111
112# How many of the total rows need to be downloaded already for us to skip
113# calling the BQ Storage API?
114#
115# In microbenchmarks on 2024-05-21, I (tswast@) measure that at about 2 MB of
116# remaining results, it's faster to use the BQ Storage Read API to download
117# the results than use jobs.getQueryResults. Since we don't have a good way to
118# know the remaining bytes, we estimate by remaining number of rows.
119#
120# Except when rows themselves are larger, I observe that the a single page of
121# results will be around 10 MB. Therefore, the proportion of rows already
122# downloaded should be 10 (first page) / 12 (all results) or less for it to be
123# worth it to make a call to jobs.getQueryResults.
124ALMOST_COMPLETELY_CACHED_RATIO = 0.833333
125
126
127def _reference_getter(table):
128 """A :class:`~google.cloud.bigquery.table.TableReference` pointing to
129 this table.
130
131 Returns:
132 google.cloud.bigquery.table.TableReference: pointer to this table.
133 """
134 from google.cloud.bigquery import dataset
135
136 dataset_ref = dataset.DatasetReference(table.project, table.dataset_id)
137 return TableReference(dataset_ref, table.table_id)
138
139
140def _view_use_legacy_sql_getter(
141 table: Union["Table", "TableListItem"]
142) -> Optional[bool]:
143 """bool: Specifies whether to execute the view with Legacy or Standard SQL.
144
145 This boolean specifies whether to execute the view with Legacy SQL
146 (:data:`True`) or Standard SQL (:data:`False`). The client side default is
147 :data:`False`. The server-side default is :data:`True`. If this table is
148 not a view, :data:`None` is returned.
149
150 Raises:
151 ValueError: For invalid value types.
152 """
153
154 view: Optional[Dict[str, Any]] = table._properties.get("view")
155 if view is not None:
156 # The server-side default for useLegacySql is True.
157 return view.get("useLegacySql", True) if view is not None else True
158 # In some cases, such as in a table list no view object is present, but the
159 # resource still represents a view. Use the type as a fallback.
160 if table.table_type == "VIEW":
161 # The server-side default for useLegacySql is True.
162 return True
163 return None # explicit return statement to appease mypy
164
165
166class _TableBase:
167 """Base class for Table-related classes with common functionality."""
168
169 _PROPERTY_TO_API_FIELD: Dict[str, Union[str, List[str]]] = {
170 "dataset_id": ["tableReference", "datasetId"],
171 "project": ["tableReference", "projectId"],
172 "table_id": ["tableReference", "tableId"],
173 }
174
175 def __init__(self):
176 self._properties = {}
177
178 @property
179 def project(self) -> str:
180 """Project bound to the table."""
181 return _helpers._get_sub_prop(
182 self._properties, self._PROPERTY_TO_API_FIELD["project"]
183 )
184
185 @property
186 def dataset_id(self) -> str:
187 """ID of dataset containing the table."""
188 return _helpers._get_sub_prop(
189 self._properties, self._PROPERTY_TO_API_FIELD["dataset_id"]
190 )
191
192 @property
193 def table_id(self) -> str:
194 """The table ID."""
195 return _helpers._get_sub_prop(
196 self._properties, self._PROPERTY_TO_API_FIELD["table_id"]
197 )
198
199 @property
200 def path(self) -> str:
201 """URL path for the table's APIs."""
202 return (
203 f"/projects/{self.project}/datasets/{self.dataset_id}"
204 f"/tables/{self.table_id}"
205 )
206
207 def __eq__(self, other):
208 if isinstance(other, _TableBase):
209 return (
210 self.project == other.project
211 and self.dataset_id == other.dataset_id
212 and self.table_id == other.table_id
213 )
214 else:
215 return NotImplemented
216
217 def __hash__(self):
218 return hash((self.project, self.dataset_id, self.table_id))
219
220
221class TableReference(_TableBase):
222 """TableReferences are pointers to tables.
223
224 See
225 https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#tablereference
226
227 Args:
228 dataset_ref: A pointer to the dataset
229 table_id: The ID of the table
230 """
231
232 _PROPERTY_TO_API_FIELD = {
233 "dataset_id": "datasetId",
234 "project": "projectId",
235 "table_id": "tableId",
236 }
237
238 def __init__(self, dataset_ref: "DatasetReference", table_id: str):
239 self._properties = {}
240
241 _helpers._set_sub_prop(
242 self._properties,
243 self._PROPERTY_TO_API_FIELD["project"],
244 dataset_ref.project,
245 )
246 _helpers._set_sub_prop(
247 self._properties,
248 self._PROPERTY_TO_API_FIELD["dataset_id"],
249 dataset_ref.dataset_id,
250 )
251 _helpers._set_sub_prop(
252 self._properties,
253 self._PROPERTY_TO_API_FIELD["table_id"],
254 table_id,
255 )
256
257 @classmethod
258 def from_string(
259 cls, table_id: str, default_project: Optional[str] = None
260 ) -> "TableReference":
261 """Construct a table reference from table ID string.
262
263 Args:
264 table_id (str):
265 A table ID in standard SQL format. If ``default_project``
266 is not specified, this must included a project ID, dataset
267 ID, and table ID, each separated by ``.``.
268 default_project (Optional[str]):
269 The project ID to use when ``table_id`` does not
270 include a project ID.
271
272 Returns:
273 TableReference: Table reference parsed from ``table_id``.
274
275 Examples:
276 >>> TableReference.from_string('my-project.mydataset.mytable')
277 TableRef...(DatasetRef...('my-project', 'mydataset'), 'mytable')
278
279 Raises:
280 ValueError:
281 If ``table_id`` is not a fully-qualified table ID in
282 standard SQL format.
283 """
284 from google.cloud.bigquery.dataset import DatasetReference
285
286 (
287 output_project_id,
288 output_dataset_id,
289 output_table_id,
290 ) = _helpers._parse_3_part_id(
291 table_id, default_project=default_project, property_name="table_id"
292 )
293
294 return cls(
295 DatasetReference(output_project_id, output_dataset_id), output_table_id
296 )
297
298 @classmethod
299 def from_api_repr(cls, resource: dict) -> "TableReference":
300 """Factory: construct a table reference given its API representation
301
302 Args:
303 resource (Dict[str, object]):
304 Table reference representation returned from the API
305
306 Returns:
307 google.cloud.bigquery.table.TableReference:
308 Table reference parsed from ``resource``.
309 """
310 from google.cloud.bigquery.dataset import DatasetReference
311
312 project = resource["projectId"]
313 dataset_id = resource["datasetId"]
314 table_id = resource["tableId"]
315
316 return cls(DatasetReference(project, dataset_id), table_id)
317
318 def to_api_repr(self) -> dict:
319 """Construct the API resource representation of this table reference.
320
321 Returns:
322 Dict[str, object]: Table reference represented as an API resource
323 """
324 return copy.deepcopy(self._properties)
325
326 def to_bqstorage(self) -> str:
327 """Construct a BigQuery Storage API representation of this table.
328
329 Install the ``google-cloud-bigquery-storage`` package to use this
330 feature.
331
332 If the ``table_id`` contains a partition identifier (e.g.
333 ``my_table$201812``) or a snapshot identifier (e.g.
334 ``mytable@1234567890``), it is ignored. Use
335 :class:`google.cloud.bigquery_storage.types.ReadSession.TableReadOptions`
336 to filter rows by partition. Use
337 :class:`google.cloud.bigquery_storage.types.ReadSession.TableModifiers`
338 to select a specific snapshot to read from.
339
340 Returns:
341 str: A reference to this table in the BigQuery Storage API.
342 """
343
344 table_id, _, _ = self.table_id.partition("@")
345 table_id, _, _ = table_id.partition("$")
346
347 table_ref = (
348 f"projects/{self.project}/datasets/{self.dataset_id}/tables/{table_id}"
349 )
350 return table_ref
351
352 def __str__(self):
353 return f"{self.project}.{self.dataset_id}.{self.table_id}"
354
355 def __repr__(self):
356 from google.cloud.bigquery.dataset import DatasetReference
357
358 dataset_ref = DatasetReference(self.project, self.dataset_id)
359 return f"TableReference({dataset_ref!r}, '{self.table_id}')"
360
361
362class Table(_TableBase):
363 """Tables represent a set of rows whose values correspond to a schema.
364
365 See
366 https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#resource-table
367
368 Args:
369 table_ref (Union[google.cloud.bigquery.table.TableReference, str]):
370 A pointer to a table. If ``table_ref`` is a string, it must
371 included a project ID, dataset ID, and table ID, each separated
372 by ``.``.
373 schema (Optional[Sequence[Union[ \
374 :class:`~google.cloud.bigquery.schema.SchemaField`, \
375 Mapping[str, Any] \
376 ]]]):
377 The table's schema. If any item is a mapping, its content must be
378 compatible with
379 :meth:`~google.cloud.bigquery.schema.SchemaField.from_api_repr`.
380 """
381
382 _PROPERTY_TO_API_FIELD: Dict[str, Any] = {
383 **_TableBase._PROPERTY_TO_API_FIELD,
384 "biglake_configuration": "biglakeConfiguration",
385 "clustering_fields": "clustering",
386 "created": "creationTime",
387 "description": "description",
388 "encryption_configuration": "encryptionConfiguration",
389 "etag": "etag",
390 "expires": "expirationTime",
391 "external_data_configuration": "externalDataConfiguration",
392 "friendly_name": "friendlyName",
393 "full_table_id": "id",
394 "labels": "labels",
395 "location": "location",
396 "modified": "lastModifiedTime",
397 "mview_enable_refresh": "materializedView",
398 "mview_last_refresh_time": ["materializedView", "lastRefreshTime"],
399 "mview_query": "materializedView",
400 "mview_refresh_interval": "materializedView",
401 "mview_allow_non_incremental_definition": "materializedView",
402 "num_bytes": "numBytes",
403 "num_rows": "numRows",
404 "partition_expiration": "timePartitioning",
405 "partitioning_type": "timePartitioning",
406 "range_partitioning": "rangePartitioning",
407 "time_partitioning": "timePartitioning",
408 "schema": ["schema", "fields"],
409 "snapshot_definition": "snapshotDefinition",
410 "clone_definition": "cloneDefinition",
411 "streaming_buffer": "streamingBuffer",
412 "self_link": "selfLink",
413 "type": "type",
414 "view_use_legacy_sql": "view",
415 "view_query": "view",
416 "require_partition_filter": "requirePartitionFilter",
417 "table_constraints": "tableConstraints",
418 "max_staleness": "maxStaleness",
419 "resource_tags": "resourceTags",
420 "external_catalog_table_options": "externalCatalogTableOptions",
421 "foreign_type_info": ["schema", "foreignTypeInfo"],
422 }
423
424 def __init__(self, table_ref, schema=None) -> None:
425 table_ref = _table_arg_to_table_ref(table_ref)
426 self._properties: Dict[str, Any] = {
427 "tableReference": table_ref.to_api_repr(),
428 "labels": {},
429 }
430 # Let the @property do validation.
431 if schema is not None:
432 self.schema = schema
433
434 reference = property(_reference_getter)
435
436 @property
437 def biglake_configuration(self):
438 """google.cloud.bigquery.table.BigLakeConfiguration: Configuration
439 for managed tables for Apache Iceberg.
440
441 See https://cloud.google.com/bigquery/docs/iceberg-tables for more information.
442 """
443 prop = self._properties.get(
444 self._PROPERTY_TO_API_FIELD["biglake_configuration"]
445 )
446 if prop is not None:
447 prop = BigLakeConfiguration.from_api_repr(prop)
448 return prop
449
450 @biglake_configuration.setter
451 def biglake_configuration(self, value):
452 api_repr = value
453 if value is not None:
454 api_repr = value.to_api_repr()
455 self._properties[
456 self._PROPERTY_TO_API_FIELD["biglake_configuration"]
457 ] = api_repr
458
459 @property
460 def require_partition_filter(self):
461 """bool: If set to true, queries over the partitioned table require a
462 partition filter that can be used for partition elimination to be
463 specified.
464 """
465 return self._properties.get(
466 self._PROPERTY_TO_API_FIELD["require_partition_filter"]
467 )
468
469 @require_partition_filter.setter
470 def require_partition_filter(self, value):
471 self._properties[
472 self._PROPERTY_TO_API_FIELD["require_partition_filter"]
473 ] = value
474
475 @property
476 def schema(self):
477 """Sequence[Union[ \
478 :class:`~google.cloud.bigquery.schema.SchemaField`, \
479 Mapping[str, Any] \
480 ]]:
481 Table's schema.
482
483 Raises:
484 Exception:
485 If ``schema`` is not a sequence, or if any item in the sequence
486 is not a :class:`~google.cloud.bigquery.schema.SchemaField`
487 instance or a compatible mapping representation of the field.
488
489 .. Note::
490 If you are referencing a schema for an external catalog table such
491 as a Hive table, it will also be necessary to populate the foreign_type_info
492 attribute. This is not necessary if defining the schema for a BigQuery table.
493
494 For details, see:
495 https://cloud.google.com/bigquery/docs/external-tables
496 https://cloud.google.com/bigquery/docs/datasets-intro#external_datasets
497
498 """
499 prop = _helpers._get_sub_prop(
500 self._properties, self._PROPERTY_TO_API_FIELD["schema"]
501 )
502 if not prop:
503 return []
504 else:
505 return _parse_schema_resource(prop)
506
507 @schema.setter
508 def schema(self, value):
509 api_field = self._PROPERTY_TO_API_FIELD["schema"]
510
511 if value is None:
512 _helpers._set_sub_prop(
513 self._properties,
514 api_field,
515 None,
516 )
517 elif isinstance(value, Sequence):
518 value = _to_schema_fields(value)
519 value = _build_schema_resource(value)
520 _helpers._set_sub_prop(
521 self._properties,
522 api_field,
523 value,
524 )
525 else:
526 raise TypeError("Schema must be a Sequence (e.g. a list) or None.")
527
528 @property
529 def labels(self):
530 """Dict[str, str]: Labels for the table.
531
532 This method always returns a dict. To change a table's labels,
533 modify the dict, then call ``Client.update_table``. To delete a
534 label, set its value to :data:`None` before updating.
535
536 Raises:
537 ValueError: If ``value`` type is invalid.
538 """
539 return self._properties.setdefault(self._PROPERTY_TO_API_FIELD["labels"], {})
540
541 @labels.setter
542 def labels(self, value):
543 if not isinstance(value, dict):
544 raise ValueError("Pass a dict")
545 self._properties[self._PROPERTY_TO_API_FIELD["labels"]] = value
546
547 @property
548 def encryption_configuration(self):
549 """google.cloud.bigquery.encryption_configuration.EncryptionConfiguration: Custom
550 encryption configuration for the table.
551
552 Custom encryption configuration (e.g., Cloud KMS keys) or :data:`None`
553 if using default encryption.
554
555 See `protecting data with Cloud KMS keys
556 <https://cloud.google.com/bigquery/docs/customer-managed-encryption>`_
557 in the BigQuery documentation.
558 """
559 prop = self._properties.get(
560 self._PROPERTY_TO_API_FIELD["encryption_configuration"]
561 )
562 if prop is not None:
563 prop = EncryptionConfiguration.from_api_repr(prop)
564 return prop
565
566 @encryption_configuration.setter
567 def encryption_configuration(self, value):
568 api_repr = value
569 if value is not None:
570 api_repr = value.to_api_repr()
571 self._properties[
572 self._PROPERTY_TO_API_FIELD["encryption_configuration"]
573 ] = api_repr
574
575 @property
576 def created(self):
577 """Union[datetime.datetime, None]: Datetime at which the table was
578 created (:data:`None` until set from the server).
579 """
580 creation_time = self._properties.get(self._PROPERTY_TO_API_FIELD["created"])
581 if creation_time is not None:
582 # creation_time will be in milliseconds.
583 return google.cloud._helpers._datetime_from_microseconds(
584 1000.0 * float(creation_time)
585 )
586
587 @property
588 def etag(self):
589 """Union[str, None]: ETag for the table resource (:data:`None` until
590 set from the server).
591 """
592 return self._properties.get(self._PROPERTY_TO_API_FIELD["etag"])
593
594 @property
595 def modified(self):
596 """Union[datetime.datetime, None]: Datetime at which the table was last
597 modified (:data:`None` until set from the server).
598 """
599 modified_time = self._properties.get(self._PROPERTY_TO_API_FIELD["modified"])
600 if modified_time is not None:
601 # modified_time will be in milliseconds.
602 return google.cloud._helpers._datetime_from_microseconds(
603 1000.0 * float(modified_time)
604 )
605
606 @property
607 def num_bytes(self):
608 """Union[int, None]: The size of the table in bytes (:data:`None` until
609 set from the server).
610 """
611 return _helpers._int_or_none(
612 self._properties.get(self._PROPERTY_TO_API_FIELD["num_bytes"])
613 )
614
615 @property
616 def num_rows(self):
617 """Union[int, None]: The number of rows in the table (:data:`None`
618 until set from the server).
619 """
620 return _helpers._int_or_none(
621 self._properties.get(self._PROPERTY_TO_API_FIELD["num_rows"])
622 )
623
624 @property
625 def self_link(self):
626 """Union[str, None]: URL for the table resource (:data:`None` until set
627 from the server).
628 """
629 return self._properties.get(self._PROPERTY_TO_API_FIELD["self_link"])
630
631 @property
632 def full_table_id(self):
633 """Union[str, None]: ID for the table (:data:`None` until set from the
634 server).
635
636 In the format ``project-id:dataset_id.table_id``.
637 """
638 return self._properties.get(self._PROPERTY_TO_API_FIELD["full_table_id"])
639
640 @property
641 def table_type(self):
642 """Union[str, None]: The type of the table (:data:`None` until set from
643 the server).
644
645 Possible values are ``'TABLE'``, ``'VIEW'``, ``'MATERIALIZED_VIEW'`` or
646 ``'EXTERNAL'``.
647 """
648 return self._properties.get(self._PROPERTY_TO_API_FIELD["type"])
649
650 @property
651 def range_partitioning(self):
652 """Optional[google.cloud.bigquery.table.RangePartitioning]:
653 Configures range-based partitioning for a table.
654
655 .. note::
656 **Beta**. The integer range partitioning feature is in a
657 pre-release state and might change or have limited support.
658
659 Only specify at most one of
660 :attr:`~google.cloud.bigquery.table.Table.time_partitioning` or
661 :attr:`~google.cloud.bigquery.table.Table.range_partitioning`.
662
663 Raises:
664 ValueError:
665 If the value is not
666 :class:`~google.cloud.bigquery.table.RangePartitioning` or
667 :data:`None`.
668 """
669 resource = self._properties.get(
670 self._PROPERTY_TO_API_FIELD["range_partitioning"]
671 )
672 if resource is not None:
673 return RangePartitioning(_properties=resource)
674
675 @range_partitioning.setter
676 def range_partitioning(self, value):
677 resource = value
678 if isinstance(value, RangePartitioning):
679 resource = value._properties
680 elif value is not None:
681 raise ValueError(
682 "Expected value to be RangePartitioning or None, got {}.".format(value)
683 )
684 self._properties[self._PROPERTY_TO_API_FIELD["range_partitioning"]] = resource
685
686 @property
687 def time_partitioning(self):
688 """Optional[google.cloud.bigquery.table.TimePartitioning]: Configures time-based
689 partitioning for a table.
690
691 Only specify at most one of
692 :attr:`~google.cloud.bigquery.table.Table.time_partitioning` or
693 :attr:`~google.cloud.bigquery.table.Table.range_partitioning`.
694
695 Raises:
696 ValueError:
697 If the value is not
698 :class:`~google.cloud.bigquery.table.TimePartitioning` or
699 :data:`None`.
700 """
701 prop = self._properties.get(self._PROPERTY_TO_API_FIELD["time_partitioning"])
702 if prop is not None:
703 return TimePartitioning.from_api_repr(prop)
704
705 @time_partitioning.setter
706 def time_partitioning(self, value):
707 api_repr = value
708 if isinstance(value, TimePartitioning):
709 api_repr = value.to_api_repr()
710 elif value is not None:
711 raise ValueError(
712 "value must be google.cloud.bigquery.table.TimePartitioning " "or None"
713 )
714 self._properties[self._PROPERTY_TO_API_FIELD["time_partitioning"]] = api_repr
715
716 @property
717 def partitioning_type(self):
718 """Union[str, None]: Time partitioning of the table if it is
719 partitioned (Defaults to :data:`None`).
720
721 """
722 warnings.warn(
723 "This method will be deprecated in future versions. Please use "
724 "Table.time_partitioning.type_ instead.",
725 PendingDeprecationWarning,
726 stacklevel=2,
727 )
728 if self.time_partitioning is not None:
729 return self.time_partitioning.type_
730
731 @partitioning_type.setter
732 def partitioning_type(self, value):
733 warnings.warn(
734 "This method will be deprecated in future versions. Please use "
735 "Table.time_partitioning.type_ instead.",
736 PendingDeprecationWarning,
737 stacklevel=2,
738 )
739 api_field = self._PROPERTY_TO_API_FIELD["partitioning_type"]
740 if self.time_partitioning is None:
741 self._properties[api_field] = {}
742 self._properties[api_field]["type"] = value
743
744 @property
745 def partition_expiration(self):
746 """Union[int, None]: Expiration time in milliseconds for a partition.
747
748 If :attr:`partition_expiration` is set and :attr:`type_` is
749 not set, :attr:`type_` will default to
750 :attr:`~google.cloud.bigquery.table.TimePartitioningType.DAY`.
751 """
752 warnings.warn(
753 "This method will be deprecated in future versions. Please use "
754 "Table.time_partitioning.expiration_ms instead.",
755 PendingDeprecationWarning,
756 stacklevel=2,
757 )
758 if self.time_partitioning is not None:
759 return self.time_partitioning.expiration_ms
760
761 @partition_expiration.setter
762 def partition_expiration(self, value):
763 warnings.warn(
764 "This method will be deprecated in future versions. Please use "
765 "Table.time_partitioning.expiration_ms instead.",
766 PendingDeprecationWarning,
767 stacklevel=2,
768 )
769 api_field = self._PROPERTY_TO_API_FIELD["partition_expiration"]
770
771 if self.time_partitioning is None:
772 self._properties[api_field] = {"type": TimePartitioningType.DAY}
773
774 if value is None:
775 self._properties[api_field]["expirationMs"] = None
776 else:
777 self._properties[api_field]["expirationMs"] = str(value)
778
779 @property
780 def clustering_fields(self):
781 """Union[List[str], None]: Fields defining clustering for the table
782
783 (Defaults to :data:`None`).
784
785 Clustering fields are immutable after table creation.
786
787 .. note::
788
789 BigQuery supports clustering for both partitioned and
790 non-partitioned tables.
791 """
792 prop = self._properties.get(self._PROPERTY_TO_API_FIELD["clustering_fields"])
793 if prop is not None:
794 return list(prop.get("fields", ()))
795
796 @clustering_fields.setter
797 def clustering_fields(self, value):
798 """Union[List[str], None]: Fields defining clustering for the table
799
800 (Defaults to :data:`None`).
801 """
802 api_field = self._PROPERTY_TO_API_FIELD["clustering_fields"]
803
804 if value is not None:
805 prop = self._properties.setdefault(api_field, {})
806 prop["fields"] = value
807 else:
808 # In order to allow unsetting clustering fields completely, we explicitly
809 # set this property to None (as oposed to merely removing the key).
810 self._properties[api_field] = None
811
812 @property
813 def description(self):
814 """Union[str, None]: Description of the table (defaults to
815 :data:`None`).
816
817 Raises:
818 ValueError: For invalid value types.
819 """
820 return self._properties.get(self._PROPERTY_TO_API_FIELD["description"])
821
822 @description.setter
823 def description(self, value):
824 if not isinstance(value, str) and value is not None:
825 raise ValueError("Pass a string, or None")
826 self._properties[self._PROPERTY_TO_API_FIELD["description"]] = value
827
828 @property
829 def expires(self):
830 """Union[datetime.datetime, None]: Datetime at which the table will be
831 deleted.
832
833 Raises:
834 ValueError: For invalid value types.
835 """
836 expiration_time = self._properties.get(self._PROPERTY_TO_API_FIELD["expires"])
837 if expiration_time is not None:
838 # expiration_time will be in milliseconds.
839 return google.cloud._helpers._datetime_from_microseconds(
840 1000.0 * float(expiration_time)
841 )
842
843 @expires.setter
844 def expires(self, value):
845 if not isinstance(value, datetime.datetime) and value is not None:
846 raise ValueError("Pass a datetime, or None")
847 value_ms = google.cloud._helpers._millis_from_datetime(value)
848 self._properties[
849 self._PROPERTY_TO_API_FIELD["expires"]
850 ] = _helpers._str_or_none(value_ms)
851
852 @property
853 def friendly_name(self):
854 """Union[str, None]: Title of the table (defaults to :data:`None`).
855
856 Raises:
857 ValueError: For invalid value types.
858 """
859 return self._properties.get(self._PROPERTY_TO_API_FIELD["friendly_name"])
860
861 @friendly_name.setter
862 def friendly_name(self, value):
863 if not isinstance(value, str) and value is not None:
864 raise ValueError("Pass a string, or None")
865 self._properties[self._PROPERTY_TO_API_FIELD["friendly_name"]] = value
866
867 @property
868 def location(self):
869 """Union[str, None]: Location in which the table is hosted
870
871 Defaults to :data:`None`.
872 """
873 return self._properties.get(self._PROPERTY_TO_API_FIELD["location"])
874
875 @property
876 def view_query(self):
877 """Union[str, None]: SQL query defining the table as a view (defaults
878 to :data:`None`).
879
880 By default, the query is treated as Standard SQL. To use Legacy
881 SQL, set :attr:`view_use_legacy_sql` to :data:`True`.
882
883 Raises:
884 ValueError: For invalid value types.
885 """
886 api_field = self._PROPERTY_TO_API_FIELD["view_query"]
887 return _helpers._get_sub_prop(self._properties, [api_field, "query"])
888
889 @view_query.setter
890 def view_query(self, value):
891 if not isinstance(value, str):
892 raise ValueError("Pass a string")
893
894 api_field = self._PROPERTY_TO_API_FIELD["view_query"]
895 _helpers._set_sub_prop(self._properties, [api_field, "query"], value)
896 view = self._properties[api_field]
897 # The service defaults useLegacySql to True, but this
898 # client uses Standard SQL by default.
899 if view.get("useLegacySql") is None:
900 view["useLegacySql"] = False
901
902 @view_query.deleter
903 def view_query(self):
904 """Delete SQL query defining the table as a view."""
905 self._properties.pop(self._PROPERTY_TO_API_FIELD["view_query"], None)
906
907 view_use_legacy_sql = property(_view_use_legacy_sql_getter)
908
909 @view_use_legacy_sql.setter # type: ignore # (redefinition from above)
910 def view_use_legacy_sql(self, value):
911 if not isinstance(value, bool):
912 raise ValueError("Pass a boolean")
913
914 api_field = self._PROPERTY_TO_API_FIELD["view_query"]
915 if self._properties.get(api_field) is None:
916 self._properties[api_field] = {}
917 self._properties[api_field]["useLegacySql"] = value
918
919 @property
920 def mview_query(self):
921 """Optional[str]: SQL query defining the table as a materialized
922 view (defaults to :data:`None`).
923 """
924 api_field = self._PROPERTY_TO_API_FIELD["mview_query"]
925 return _helpers._get_sub_prop(self._properties, [api_field, "query"])
926
927 @mview_query.setter
928 def mview_query(self, value):
929 api_field = self._PROPERTY_TO_API_FIELD["mview_query"]
930 _helpers._set_sub_prop(self._properties, [api_field, "query"], str(value))
931
932 @mview_query.deleter
933 def mview_query(self):
934 """Delete SQL query defining the table as a materialized view."""
935 self._properties.pop(self._PROPERTY_TO_API_FIELD["mview_query"], None)
936
937 @property
938 def mview_last_refresh_time(self):
939 """Optional[datetime.datetime]: Datetime at which the materialized view was last
940 refreshed (:data:`None` until set from the server).
941 """
942 refresh_time = _helpers._get_sub_prop(
943 self._properties, self._PROPERTY_TO_API_FIELD["mview_last_refresh_time"]
944 )
945 if refresh_time is not None:
946 # refresh_time will be in milliseconds.
947 return google.cloud._helpers._datetime_from_microseconds(
948 1000 * int(refresh_time)
949 )
950
951 @property
952 def mview_enable_refresh(self):
953 """Optional[bool]: Enable automatic refresh of the materialized view
954 when the base table is updated. The default value is :data:`True`.
955 """
956 api_field = self._PROPERTY_TO_API_FIELD["mview_enable_refresh"]
957 return _helpers._get_sub_prop(self._properties, [api_field, "enableRefresh"])
958
959 @mview_enable_refresh.setter
960 def mview_enable_refresh(self, value):
961 api_field = self._PROPERTY_TO_API_FIELD["mview_enable_refresh"]
962 return _helpers._set_sub_prop(
963 self._properties, [api_field, "enableRefresh"], value
964 )
965
966 @property
967 def mview_refresh_interval(self):
968 """Optional[datetime.timedelta]: The maximum frequency at which this
969 materialized view will be refreshed. The default value is 1800000
970 milliseconds (30 minutes).
971 """
972 api_field = self._PROPERTY_TO_API_FIELD["mview_refresh_interval"]
973 refresh_interval = _helpers._get_sub_prop(
974 self._properties, [api_field, "refreshIntervalMs"]
975 )
976 if refresh_interval is not None:
977 return datetime.timedelta(milliseconds=int(refresh_interval))
978
979 @mview_refresh_interval.setter
980 def mview_refresh_interval(self, value):
981 if value is None:
982 refresh_interval_ms = None
983 else:
984 refresh_interval_ms = str(value // datetime.timedelta(milliseconds=1))
985
986 api_field = self._PROPERTY_TO_API_FIELD["mview_refresh_interval"]
987 _helpers._set_sub_prop(
988 self._properties,
989 [api_field, "refreshIntervalMs"],
990 refresh_interval_ms,
991 )
992
993 @property
994 def mview_allow_non_incremental_definition(self):
995 """Optional[bool]: This option declares the intention to construct a
996 materialized view that isn't refreshed incrementally.
997 The default value is :data:`False`.
998 """
999 api_field = self._PROPERTY_TO_API_FIELD[
1000 "mview_allow_non_incremental_definition"
1001 ]
1002 return _helpers._get_sub_prop(
1003 self._properties, [api_field, "allowNonIncrementalDefinition"]
1004 )
1005
1006 @mview_allow_non_incremental_definition.setter
1007 def mview_allow_non_incremental_definition(self, value):
1008 api_field = self._PROPERTY_TO_API_FIELD[
1009 "mview_allow_non_incremental_definition"
1010 ]
1011 _helpers._set_sub_prop(
1012 self._properties, [api_field, "allowNonIncrementalDefinition"], value
1013 )
1014
1015 @property
1016 def streaming_buffer(self):
1017 """google.cloud.bigquery.StreamingBuffer: Information about a table's
1018 streaming buffer.
1019 """
1020 sb = self._properties.get(self._PROPERTY_TO_API_FIELD["streaming_buffer"])
1021 if sb is not None:
1022 return StreamingBuffer(sb)
1023
1024 @property
1025 def external_data_configuration(self):
1026 """Union[google.cloud.bigquery.ExternalConfig, None]: Configuration for
1027 an external data source (defaults to :data:`None`).
1028
1029 Raises:
1030 ValueError: For invalid value types.
1031 """
1032 prop = self._properties.get(
1033 self._PROPERTY_TO_API_FIELD["external_data_configuration"]
1034 )
1035 if prop is not None:
1036 prop = ExternalConfig.from_api_repr(prop)
1037 return prop
1038
1039 @external_data_configuration.setter
1040 def external_data_configuration(self, value):
1041 if not (value is None or isinstance(value, ExternalConfig)):
1042 raise ValueError("Pass an ExternalConfig or None")
1043 api_repr = value
1044 if value is not None:
1045 api_repr = value.to_api_repr()
1046 self._properties[
1047 self._PROPERTY_TO_API_FIELD["external_data_configuration"]
1048 ] = api_repr
1049
1050 @property
1051 def snapshot_definition(self) -> Optional["SnapshotDefinition"]:
1052 """Information about the snapshot. This value is set via snapshot creation.
1053
1054 See: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#Table.FIELDS.snapshot_definition
1055 """
1056 snapshot_info = self._properties.get(
1057 self._PROPERTY_TO_API_FIELD["snapshot_definition"]
1058 )
1059 if snapshot_info is not None:
1060 snapshot_info = SnapshotDefinition(snapshot_info)
1061 return snapshot_info
1062
1063 @property
1064 def clone_definition(self) -> Optional["CloneDefinition"]:
1065 """Information about the clone. This value is set via clone creation.
1066
1067 See: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#Table.FIELDS.clone_definition
1068 """
1069 clone_info = self._properties.get(
1070 self._PROPERTY_TO_API_FIELD["clone_definition"]
1071 )
1072 if clone_info is not None:
1073 clone_info = CloneDefinition(clone_info)
1074 return clone_info
1075
1076 @property
1077 def table_constraints(self) -> Optional["TableConstraints"]:
1078 """Tables Primary Key and Foreign Key information."""
1079 table_constraints = self._properties.get(
1080 self._PROPERTY_TO_API_FIELD["table_constraints"]
1081 )
1082 if table_constraints is not None:
1083 table_constraints = TableConstraints.from_api_repr(table_constraints)
1084 return table_constraints
1085
1086 @table_constraints.setter
1087 def table_constraints(self, value):
1088 """Tables Primary Key and Foreign Key information."""
1089 api_repr = value
1090 if not isinstance(value, TableConstraints) and value is not None:
1091 raise ValueError(
1092 "value must be google.cloud.bigquery.table.TableConstraints or None"
1093 )
1094 api_repr = value.to_api_repr() if value else None
1095 self._properties[self._PROPERTY_TO_API_FIELD["table_constraints"]] = api_repr
1096
1097 @property
1098 def resource_tags(self):
1099 """Dict[str, str]: Resource tags for the table.
1100
1101 See: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#Table.FIELDS.resource_tags
1102 """
1103 return self._properties.setdefault(
1104 self._PROPERTY_TO_API_FIELD["resource_tags"], {}
1105 )
1106
1107 @resource_tags.setter
1108 def resource_tags(self, value):
1109 if not isinstance(value, dict) and value is not None:
1110 raise ValueError("resource_tags must be a dict or None")
1111 self._properties[self._PROPERTY_TO_API_FIELD["resource_tags"]] = value
1112
1113 @property
1114 def external_catalog_table_options(
1115 self,
1116 ) -> Optional[external_config.ExternalCatalogTableOptions]:
1117 """Options defining open source compatible datasets living in the
1118 BigQuery catalog. Contains metadata of open source database, schema
1119 or namespace represented by the current dataset."""
1120
1121 prop = self._properties.get(
1122 self._PROPERTY_TO_API_FIELD["external_catalog_table_options"]
1123 )
1124 if prop is not None:
1125 return external_config.ExternalCatalogTableOptions.from_api_repr(prop)
1126 return None
1127
1128 @external_catalog_table_options.setter
1129 def external_catalog_table_options(
1130 self, value: Union[external_config.ExternalCatalogTableOptions, dict, None]
1131 ):
1132 value = _helpers._isinstance_or_raise(
1133 value,
1134 (external_config.ExternalCatalogTableOptions, dict),
1135 none_allowed=True,
1136 )
1137 if isinstance(value, external_config.ExternalCatalogTableOptions):
1138 self._properties[
1139 self._PROPERTY_TO_API_FIELD["external_catalog_table_options"]
1140 ] = value.to_api_repr()
1141 else:
1142 self._properties[
1143 self._PROPERTY_TO_API_FIELD["external_catalog_table_options"]
1144 ] = value
1145
1146 @property
1147 def foreign_type_info(self) -> Optional[_schema.ForeignTypeInfo]:
1148 """Optional. Specifies metadata of the foreign data type definition in
1149 field schema (TableFieldSchema.foreign_type_definition).
1150 Returns:
1151 Optional[schema.ForeignTypeInfo]:
1152 Foreign type information, or :data:`None` if not set.
1153 .. Note::
1154 foreign_type_info is only required if you are referencing an
1155 external catalog such as a Hive table.
1156 For details, see:
1157 https://cloud.google.com/bigquery/docs/external-tables
1158 https://cloud.google.com/bigquery/docs/datasets-intro#external_datasets
1159 """
1160
1161 prop = _helpers._get_sub_prop(
1162 self._properties, self._PROPERTY_TO_API_FIELD["foreign_type_info"]
1163 )
1164 if prop is not None:
1165 return _schema.ForeignTypeInfo.from_api_repr(prop)
1166 return None
1167
1168 @foreign_type_info.setter
1169 def foreign_type_info(self, value: Union[_schema.ForeignTypeInfo, dict, None]):
1170 value = _helpers._isinstance_or_raise(
1171 value,
1172 (_schema.ForeignTypeInfo, dict),
1173 none_allowed=True,
1174 )
1175 if isinstance(value, _schema.ForeignTypeInfo):
1176 value = value.to_api_repr()
1177 _helpers._set_sub_prop(
1178 self._properties, self._PROPERTY_TO_API_FIELD["foreign_type_info"], value
1179 )
1180
1181 @classmethod
1182 def from_string(cls, full_table_id: str) -> "Table":
1183 """Construct a table from fully-qualified table ID.
1184
1185 Args:
1186 full_table_id (str):
1187 A fully-qualified table ID in standard SQL format. Must
1188 included a project ID, dataset ID, and table ID, each
1189 separated by ``.``.
1190
1191 Returns:
1192 Table: Table parsed from ``full_table_id``.
1193
1194 Examples:
1195 >>> Table.from_string('my-project.mydataset.mytable')
1196 Table(TableRef...(D...('my-project', 'mydataset'), 'mytable'))
1197
1198 Raises:
1199 ValueError:
1200 If ``full_table_id`` is not a fully-qualified table ID in
1201 standard SQL format.
1202 """
1203 return cls(TableReference.from_string(full_table_id))
1204
1205 @classmethod
1206 def from_api_repr(cls, resource: dict) -> "Table":
1207 """Factory: construct a table given its API representation
1208
1209 Args:
1210 resource (Dict[str, object]):
1211 Table resource representation from the API
1212
1213 Returns:
1214 google.cloud.bigquery.table.Table: Table parsed from ``resource``.
1215
1216 Raises:
1217 KeyError:
1218 If the ``resource`` lacks the key ``'tableReference'``, or if
1219 the ``dict`` stored within the key ``'tableReference'`` lacks
1220 the keys ``'tableId'``, ``'projectId'``, or ``'datasetId'``.
1221 """
1222 from google.cloud.bigquery import dataset
1223
1224 if (
1225 "tableReference" not in resource
1226 or "tableId" not in resource["tableReference"]
1227 ):
1228 raise KeyError(
1229 "Resource lacks required identity information:"
1230 '["tableReference"]["tableId"]'
1231 )
1232 project_id = _helpers._get_sub_prop(
1233 resource, cls._PROPERTY_TO_API_FIELD["project"]
1234 )
1235 table_id = _helpers._get_sub_prop(
1236 resource, cls._PROPERTY_TO_API_FIELD["table_id"]
1237 )
1238 dataset_id = _helpers._get_sub_prop(
1239 resource, cls._PROPERTY_TO_API_FIELD["dataset_id"]
1240 )
1241 dataset_ref = dataset.DatasetReference(project_id, dataset_id)
1242
1243 table = cls(dataset_ref.table(table_id))
1244 table._properties = resource
1245
1246 return table
1247
1248 def to_api_repr(self) -> dict:
1249 """Constructs the API resource of this table
1250
1251 Returns:
1252 Dict[str, object]: Table represented as an API resource
1253 """
1254 return copy.deepcopy(self._properties)
1255
1256 def to_bqstorage(self) -> str:
1257 """Construct a BigQuery Storage API representation of this table.
1258
1259 Returns:
1260 str: A reference to this table in the BigQuery Storage API.
1261 """
1262 return self.reference.to_bqstorage()
1263
1264 def _build_resource(self, filter_fields):
1265 """Generate a resource for ``update``."""
1266 return _helpers._build_resource_from_properties(self, filter_fields)
1267
1268 def __repr__(self):
1269 return "Table({})".format(repr(self.reference))
1270
1271 def __str__(self):
1272 return f"{self.project}.{self.dataset_id}.{self.table_id}"
1273
1274 @property
1275 def max_staleness(self):
1276 """Union[str, None]: The maximum staleness of data that could be returned when the table is queried.
1277
1278 Staleness encoded as a string encoding of sql IntervalValue type.
1279 This property is optional and defaults to None.
1280
1281 According to the BigQuery API documentation, maxStaleness specifies the maximum time
1282 interval for which stale data can be returned when querying the table.
1283 It helps control data freshness in scenarios like metadata-cached external tables.
1284
1285 Returns:
1286 Optional[str]: A string representing the maximum staleness interval
1287 (e.g., '1h', '30m', '15s' for hours, minutes, seconds respectively).
1288 """
1289 return self._properties.get(self._PROPERTY_TO_API_FIELD["max_staleness"])
1290
1291 @max_staleness.setter
1292 def max_staleness(self, value):
1293 """Set the maximum staleness for the table.
1294
1295 Args:
1296 value (Optional[str]): A string representing the maximum staleness interval.
1297 Must be a valid time interval string.
1298 Examples include '1h' (1 hour), '30m' (30 minutes), '15s' (15 seconds).
1299
1300 Raises:
1301 ValueError: If the value is not None and not a string.
1302 """
1303 if value is not None and not isinstance(value, str):
1304 raise ValueError("max_staleness must be a string or None")
1305
1306 self._properties[self._PROPERTY_TO_API_FIELD["max_staleness"]] = value
1307
1308
1309class TableListItem(_TableBase):
1310 """A read-only table resource from a list operation.
1311
1312 For performance reasons, the BigQuery API only includes some of the table
1313 properties when listing tables. Notably,
1314 :attr:`~google.cloud.bigquery.table.Table.schema` and
1315 :attr:`~google.cloud.bigquery.table.Table.num_rows` are missing.
1316
1317 For a full list of the properties that the BigQuery API returns, see the
1318 `REST documentation for tables.list
1319 <https://cloud.google.com/bigquery/docs/reference/rest/v2/tables/list>`_.
1320
1321
1322 Args:
1323 resource (Dict[str, object]):
1324 A table-like resource object from a table list response. A
1325 ``tableReference`` property is required.
1326
1327 Raises:
1328 ValueError:
1329 If ``tableReference`` or one of its required members is missing
1330 from ``resource``.
1331 """
1332
1333 def __init__(self, resource):
1334 if "tableReference" not in resource:
1335 raise ValueError("resource must contain a tableReference value")
1336 if "projectId" not in resource["tableReference"]:
1337 raise ValueError(
1338 "resource['tableReference'] must contain a projectId value"
1339 )
1340 if "datasetId" not in resource["tableReference"]:
1341 raise ValueError(
1342 "resource['tableReference'] must contain a datasetId value"
1343 )
1344 if "tableId" not in resource["tableReference"]:
1345 raise ValueError("resource['tableReference'] must contain a tableId value")
1346
1347 self._properties = resource
1348
1349 @property
1350 def created(self):
1351 """Union[datetime.datetime, None]: Datetime at which the table was
1352 created (:data:`None` until set from the server).
1353 """
1354 creation_time = self._properties.get("creationTime")
1355 if creation_time is not None:
1356 # creation_time will be in milliseconds.
1357 return google.cloud._helpers._datetime_from_microseconds(
1358 1000.0 * float(creation_time)
1359 )
1360
1361 @property
1362 def expires(self):
1363 """Union[datetime.datetime, None]: Datetime at which the table will be
1364 deleted.
1365 """
1366 expiration_time = self._properties.get("expirationTime")
1367 if expiration_time is not None:
1368 # expiration_time will be in milliseconds.
1369 return google.cloud._helpers._datetime_from_microseconds(
1370 1000.0 * float(expiration_time)
1371 )
1372
1373 reference = property(_reference_getter)
1374
1375 @property
1376 def labels(self):
1377 """Dict[str, str]: Labels for the table.
1378
1379 This method always returns a dict. To change a table's labels,
1380 modify the dict, then call ``Client.update_table``. To delete a
1381 label, set its value to :data:`None` before updating.
1382 """
1383 return self._properties.setdefault("labels", {})
1384
1385 @property
1386 def full_table_id(self):
1387 """Union[str, None]: ID for the table (:data:`None` until set from the
1388 server).
1389
1390 In the format ``project_id:dataset_id.table_id``.
1391 """
1392 return self._properties.get("id")
1393
1394 @property
1395 def table_type(self):
1396 """Union[str, None]: The type of the table (:data:`None` until set from
1397 the server).
1398
1399 Possible values are ``'TABLE'``, ``'VIEW'``, or ``'EXTERNAL'``.
1400 """
1401 return self._properties.get("type")
1402
1403 @property
1404 def time_partitioning(self):
1405 """google.cloud.bigquery.table.TimePartitioning: Configures time-based
1406 partitioning for a table.
1407 """
1408 prop = self._properties.get("timePartitioning")
1409 if prop is not None:
1410 return TimePartitioning.from_api_repr(prop)
1411
1412 @property
1413 def partitioning_type(self):
1414 """Union[str, None]: Time partitioning of the table if it is
1415 partitioned (Defaults to :data:`None`).
1416 """
1417 warnings.warn(
1418 "This method will be deprecated in future versions. Please use "
1419 "TableListItem.time_partitioning.type_ instead.",
1420 PendingDeprecationWarning,
1421 stacklevel=2,
1422 )
1423 if self.time_partitioning is not None:
1424 return self.time_partitioning.type_
1425
1426 @property
1427 def partition_expiration(self):
1428 """Union[int, None]: Expiration time in milliseconds for a partition.
1429
1430 If this property is set and :attr:`type_` is not set, :attr:`type_`
1431 will default to :attr:`TimePartitioningType.DAY`.
1432 """
1433 warnings.warn(
1434 "This method will be deprecated in future versions. Please use "
1435 "TableListItem.time_partitioning.expiration_ms instead.",
1436 PendingDeprecationWarning,
1437 stacklevel=2,
1438 )
1439 if self.time_partitioning is not None:
1440 return self.time_partitioning.expiration_ms
1441
1442 @property
1443 def friendly_name(self):
1444 """Union[str, None]: Title of the table (defaults to :data:`None`)."""
1445 return self._properties.get("friendlyName")
1446
1447 view_use_legacy_sql = property(_view_use_legacy_sql_getter)
1448
1449 @property
1450 def clustering_fields(self):
1451 """Union[List[str], None]: Fields defining clustering for the table
1452
1453 (Defaults to :data:`None`).
1454
1455 Clustering fields are immutable after table creation.
1456
1457 .. note::
1458
1459 BigQuery supports clustering for both partitioned and
1460 non-partitioned tables.
1461 """
1462 prop = self._properties.get("clustering")
1463 if prop is not None:
1464 return list(prop.get("fields", ()))
1465
1466 @classmethod
1467 def from_string(cls, full_table_id: str) -> "TableListItem":
1468 """Construct a table from fully-qualified table ID.
1469
1470 Args:
1471 full_table_id (str):
1472 A fully-qualified table ID in standard SQL format. Must
1473 included a project ID, dataset ID, and table ID, each
1474 separated by ``.``.
1475
1476 Returns:
1477 Table: Table parsed from ``full_table_id``.
1478
1479 Examples:
1480 >>> Table.from_string('my-project.mydataset.mytable')
1481 Table(TableRef...(D...('my-project', 'mydataset'), 'mytable'))
1482
1483 Raises:
1484 ValueError:
1485 If ``full_table_id`` is not a fully-qualified table ID in
1486 standard SQL format.
1487 """
1488 return cls(
1489 {"tableReference": TableReference.from_string(full_table_id).to_api_repr()}
1490 )
1491
1492 def to_bqstorage(self) -> str:
1493 """Construct a BigQuery Storage API representation of this table.
1494
1495 Returns:
1496 str: A reference to this table in the BigQuery Storage API.
1497 """
1498 return self.reference.to_bqstorage()
1499
1500 def to_api_repr(self) -> dict:
1501 """Constructs the API resource of this table
1502
1503 Returns:
1504 Dict[str, object]: Table represented as an API resource
1505 """
1506 return copy.deepcopy(self._properties)
1507
1508
1509def _row_from_mapping(mapping, schema):
1510 """Convert a mapping to a row tuple using the schema.
1511
1512 Args:
1513 mapping (Dict[str, object])
1514 Mapping of row data: must contain keys for all required fields in
1515 the schema. Keys which do not correspond to a field in the schema
1516 are ignored.
1517 schema (List[google.cloud.bigquery.schema.SchemaField]):
1518 The schema of the table destination for the rows
1519
1520 Returns:
1521 Tuple[object]:
1522 Tuple whose elements are ordered according to the schema.
1523
1524 Raises:
1525 ValueError: If schema is empty.
1526 """
1527 if len(schema) == 0:
1528 raise ValueError(_TABLE_HAS_NO_SCHEMA)
1529
1530 row = []
1531 for field in schema:
1532 if field.mode == "REQUIRED":
1533 row.append(mapping[field.name])
1534 elif field.mode == "REPEATED":
1535 row.append(mapping.get(field.name, ()))
1536 elif field.mode == "NULLABLE":
1537 row.append(mapping.get(field.name))
1538 else:
1539 raise ValueError("Unknown field mode: {}".format(field.mode))
1540 return tuple(row)
1541
1542
1543class StreamingBuffer(object):
1544 """Information about a table's streaming buffer.
1545
1546 See https://cloud.google.com/bigquery/streaming-data-into-bigquery.
1547
1548 Args:
1549 resource (Dict[str, object]):
1550 streaming buffer representation returned from the API
1551 """
1552
1553 def __init__(self, resource):
1554 self.estimated_bytes = None
1555 if "estimatedBytes" in resource:
1556 self.estimated_bytes = int(resource["estimatedBytes"])
1557 self.estimated_rows = None
1558 if "estimatedRows" in resource:
1559 self.estimated_rows = int(resource["estimatedRows"])
1560 self.oldest_entry_time = None
1561 if "oldestEntryTime" in resource:
1562 self.oldest_entry_time = google.cloud._helpers._datetime_from_microseconds(
1563 1000.0 * int(resource["oldestEntryTime"])
1564 )
1565
1566
1567class SnapshotDefinition:
1568 """Information about base table and snapshot time of the snapshot.
1569
1570 See https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#snapshotdefinition
1571
1572 Args:
1573 resource: Snapshot definition representation returned from the API.
1574 """
1575
1576 def __init__(self, resource: Dict[str, Any]):
1577 self.base_table_reference = None
1578 if "baseTableReference" in resource:
1579 self.base_table_reference = TableReference.from_api_repr(
1580 resource["baseTableReference"]
1581 )
1582
1583 self.snapshot_time = None
1584 if "snapshotTime" in resource:
1585 self.snapshot_time = google.cloud._helpers._rfc3339_to_datetime(
1586 resource["snapshotTime"]
1587 )
1588
1589
1590class CloneDefinition:
1591 """Information about base table and clone time of the clone.
1592
1593 See https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#clonedefinition
1594
1595 Args:
1596 resource: Clone definition representation returned from the API.
1597 """
1598
1599 def __init__(self, resource: Dict[str, Any]):
1600 self.base_table_reference = None
1601 if "baseTableReference" in resource:
1602 self.base_table_reference = TableReference.from_api_repr(
1603 resource["baseTableReference"]
1604 )
1605
1606 self.clone_time = None
1607 if "cloneTime" in resource:
1608 self.clone_time = google.cloud._helpers._rfc3339_to_datetime(
1609 resource["cloneTime"]
1610 )
1611
1612
1613class Row(object):
1614 """A BigQuery row.
1615
1616 Values can be accessed by position (index), by key like a dict,
1617 or as properties.
1618
1619 Args:
1620 values (Sequence[object]): The row values
1621 field_to_index (Dict[str, int]):
1622 A mapping from schema field names to indexes
1623 """
1624
1625 # Choose unusual field names to try to avoid conflict with schema fields.
1626 __slots__ = ("_xxx_values", "_xxx_field_to_index")
1627
1628 def __init__(self, values, field_to_index) -> None:
1629 self._xxx_values = values
1630 self._xxx_field_to_index = field_to_index
1631
1632 def values(self):
1633 """Return the values included in this row.
1634
1635 Returns:
1636 Sequence[object]: A sequence of length ``len(row)``.
1637 """
1638 return copy.deepcopy(self._xxx_values)
1639
1640 def keys(self) -> Iterable[str]:
1641 """Return the keys for using a row as a dict.
1642
1643 Returns:
1644 Iterable[str]: The keys corresponding to the columns of a row
1645
1646 Examples:
1647
1648 >>> list(Row(('a', 'b'), {'x': 0, 'y': 1}).keys())
1649 ['x', 'y']
1650 """
1651 return self._xxx_field_to_index.keys()
1652
1653 def items(self) -> Iterable[Tuple[str, Any]]:
1654 """Return items as ``(key, value)`` pairs.
1655
1656 Returns:
1657 Iterable[Tuple[str, object]]:
1658 The ``(key, value)`` pairs representing this row.
1659
1660 Examples:
1661
1662 >>> list(Row(('a', 'b'), {'x': 0, 'y': 1}).items())
1663 [('x', 'a'), ('y', 'b')]
1664 """
1665 for key, index in self._xxx_field_to_index.items():
1666 yield (key, copy.deepcopy(self._xxx_values[index]))
1667
1668 def get(self, key: str, default: Any = None) -> Any:
1669 """Return a value for key, with a default value if it does not exist.
1670
1671 Args:
1672 key (str): The key of the column to access
1673 default (object):
1674 The default value to use if the key does not exist. (Defaults
1675 to :data:`None`.)
1676
1677 Returns:
1678 object:
1679 The value associated with the provided key, or a default value.
1680
1681 Examples:
1682 When the key exists, the value associated with it is returned.
1683
1684 >>> Row(('a', 'b'), {'x': 0, 'y': 1}).get('x')
1685 'a'
1686
1687 The default value is :data:`None` when the key does not exist.
1688
1689 >>> Row(('a', 'b'), {'x': 0, 'y': 1}).get('z')
1690 None
1691
1692 The default value can be overridden with the ``default`` parameter.
1693
1694 >>> Row(('a', 'b'), {'x': 0, 'y': 1}).get('z', '')
1695 ''
1696
1697 >>> Row(('a', 'b'), {'x': 0, 'y': 1}).get('z', default = '')
1698 ''
1699 """
1700 index = self._xxx_field_to_index.get(key)
1701 if index is None:
1702 return default
1703 return self._xxx_values[index]
1704
1705 def __getattr__(self, name):
1706 value = self._xxx_field_to_index.get(name)
1707 if value is None:
1708 raise AttributeError("no row field {!r}".format(name))
1709 return self._xxx_values[value]
1710
1711 def __len__(self):
1712 return len(self._xxx_values)
1713
1714 def __getitem__(self, key):
1715 if isinstance(key, str):
1716 value = self._xxx_field_to_index.get(key)
1717 if value is None:
1718 raise KeyError("no row field {!r}".format(key))
1719 key = value
1720 return self._xxx_values[key]
1721
1722 def __eq__(self, other):
1723 if not isinstance(other, Row):
1724 return NotImplemented
1725 return (
1726 self._xxx_values == other._xxx_values
1727 and self._xxx_field_to_index == other._xxx_field_to_index
1728 )
1729
1730 def __ne__(self, other):
1731 return not self == other
1732
1733 def __repr__(self):
1734 # sort field dict by value, for determinism
1735 items = sorted(self._xxx_field_to_index.items(), key=operator.itemgetter(1))
1736 f2i = "{" + ", ".join("%r: %d" % item for item in items) + "}"
1737 return "Row({}, {})".format(self._xxx_values, f2i)
1738
1739
1740class _NoopProgressBarQueue(object):
1741 """A fake Queue class that does nothing.
1742
1743 This is used when there is no progress bar to send updates to.
1744 """
1745
1746 def put_nowait(self, item):
1747 """Don't actually do anything with the item."""
1748
1749
1750class RowIterator(HTTPIterator):
1751 """A class for iterating through HTTP/JSON API row list responses.
1752
1753 Args:
1754 client (Optional[google.cloud.bigquery.Client]):
1755 The API client instance. This should always be non-`None`, except for
1756 subclasses that do not use it, namely the ``_EmptyRowIterator``.
1757 api_request (Callable[google.cloud._http.JSONConnection.api_request]):
1758 The function to use to make API requests.
1759 path (str): The method path to query for the list of items.
1760 schema (Sequence[Union[ \
1761 :class:`~google.cloud.bigquery.schema.SchemaField`, \
1762 Mapping[str, Any] \
1763 ]]):
1764 The table's schema. If any item is a mapping, its content must be
1765 compatible with
1766 :meth:`~google.cloud.bigquery.schema.SchemaField.from_api_repr`.
1767 page_token (str): A token identifying a page in a result set to start
1768 fetching results from.
1769 max_results (Optional[int]): The maximum number of results to fetch.
1770 page_size (Optional[int]): The maximum number of rows in each page
1771 of results from this request. Non-positive values are ignored.
1772 Defaults to a sensible value set by the API.
1773 extra_params (Optional[Dict[str, object]]):
1774 Extra query string parameters for the API call.
1775 table (Optional[Union[ \
1776 google.cloud.bigquery.table.Table, \
1777 google.cloud.bigquery.table.TableReference, \
1778 ]]):
1779 The table which these rows belong to, or a reference to it. Used to
1780 call the BigQuery Storage API to fetch rows.
1781 selected_fields (Optional[Sequence[google.cloud.bigquery.schema.SchemaField]]):
1782 A subset of columns to select from this table.
1783 total_rows (Optional[int]):
1784 Total number of rows in the table.
1785 first_page_response (Optional[dict]):
1786 API response for the first page of results. These are returned when
1787 the first page is requested.
1788 query (Optional[str]):
1789 The query text used.
1790 total_bytes_processed (Optional[int]):
1791 If representing query results, the total bytes processed by the associated query.
1792 slot_millis (Optional[int]):
1793 If representing query results, the number of slot ms billed for the associated query.
1794 created (Optional[datetime.datetime]):
1795 If representing query results, the creation time of the associated query.
1796 started (Optional[datetime.datetime]):
1797 If representing query results, the start time of the associated query.
1798 ended (Optional[datetime.datetime]):
1799 If representing query results, the end time of the associated query.
1800 """
1801
1802 def __init__(
1803 self,
1804 client,
1805 api_request,
1806 path,
1807 schema,
1808 page_token=None,
1809 max_results=None,
1810 page_size=None,
1811 extra_params=None,
1812 table=None,
1813 selected_fields=None,
1814 total_rows=None,
1815 first_page_response=None,
1816 location: Optional[str] = None,
1817 job_id: Optional[str] = None,
1818 query_id: Optional[str] = None,
1819 project: Optional[str] = None,
1820 num_dml_affected_rows: Optional[int] = None,
1821 query: Optional[str] = None,
1822 total_bytes_processed: Optional[int] = None,
1823 slot_millis: Optional[int] = None,
1824 created: Optional[datetime.datetime] = None,
1825 started: Optional[datetime.datetime] = None,
1826 ended: Optional[datetime.datetime] = None,
1827 ):
1828 super(RowIterator, self).__init__(
1829 client,
1830 api_request,
1831 path,
1832 item_to_value=_item_to_row,
1833 items_key="rows",
1834 page_token=page_token,
1835 max_results=max_results,
1836 extra_params=extra_params,
1837 page_start=_rows_page_start,
1838 next_token="pageToken",
1839 )
1840 schema = _to_schema_fields(schema) if schema else ()
1841 self._field_to_index = _helpers._field_to_index_mapping(schema)
1842 self._page_size = page_size
1843 self._preserve_order = False
1844 self._schema = schema
1845 self._selected_fields = selected_fields
1846 self._table = table
1847 self._total_rows = total_rows
1848 self._first_page_response = first_page_response
1849 self._location = location
1850 self._job_id = job_id
1851 self._query_id = query_id
1852 self._project = project
1853 self._num_dml_affected_rows = num_dml_affected_rows
1854 self._query = query
1855 self._total_bytes_processed = total_bytes_processed
1856 self._slot_millis = slot_millis
1857 self._job_created = created
1858 self._job_started = started
1859 self._job_ended = ended
1860
1861 @property
1862 def _billing_project(self) -> Optional[str]:
1863 """GCP Project ID where BQ API will bill to (if applicable)."""
1864 client = self.client
1865 return client.project if client is not None else None
1866
1867 @property
1868 def job_id(self) -> Optional[str]:
1869 """ID of the query job (if applicable).
1870
1871 To get the job metadata, call
1872 ``job = client.get_job(rows.job_id, location=rows.location)``.
1873 """
1874 return self._job_id
1875
1876 @property
1877 def location(self) -> Optional[str]:
1878 """Location where the query executed (if applicable).
1879
1880 See: https://cloud.google.com/bigquery/docs/locations
1881 """
1882 return self._location
1883
1884 @property
1885 def num_dml_affected_rows(self) -> Optional[int]:
1886 """If this RowIterator is the result of a DML query, the number of
1887 rows that were affected.
1888
1889 See:
1890 https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query#body.QueryResponse.FIELDS.num_dml_affected_rows
1891 """
1892 return self._num_dml_affected_rows
1893
1894 @property
1895 def project(self) -> Optional[str]:
1896 """GCP Project ID where these rows are read from."""
1897 return self._project
1898
1899 @property
1900 def query_id(self) -> Optional[str]:
1901 """[Preview] ID of a completed query.
1902
1903 This ID is auto-generated and not guaranteed to be populated.
1904 """
1905 return self._query_id
1906
1907 @property
1908 def query(self) -> Optional[str]:
1909 """The query text used."""
1910 return self._query
1911
1912 @property
1913 def total_bytes_processed(self) -> Optional[int]:
1914 """total bytes processed from job statistics, if present."""
1915 return self._total_bytes_processed
1916
1917 @property
1918 def slot_millis(self) -> Optional[int]:
1919 """Number of slot ms the user is actually billed for."""
1920 return self._slot_millis
1921
1922 @property
1923 def created(self) -> Optional[datetime.datetime]:
1924 """If representing query results, the creation time of the associated query."""
1925 return self._job_created
1926
1927 @property
1928 def started(self) -> Optional[datetime.datetime]:
1929 """If representing query results, the start time of the associated query."""
1930 return self._job_started
1931
1932 @property
1933 def ended(self) -> Optional[datetime.datetime]:
1934 """If representing query results, the end time of the associated query."""
1935 return self._job_ended
1936
1937 def _is_almost_completely_cached(self):
1938 """Check if all results are completely cached.
1939
1940 This is useful to know, because we can avoid alternative download
1941 mechanisms.
1942 """
1943 if (
1944 not hasattr(self, "_first_page_response")
1945 or self._first_page_response is None
1946 ):
1947 return False
1948
1949 total_cached_rows = len(self._first_page_response.get(self._items_key, []))
1950 if self.max_results is not None and total_cached_rows >= self.max_results:
1951 return True
1952
1953 if (
1954 self.next_page_token is None
1955 and self._first_page_response.get(self._next_token) is None
1956 ):
1957 return True
1958
1959 if self._total_rows is not None:
1960 almost_completely = self._total_rows * ALMOST_COMPLETELY_CACHED_RATIO
1961 if total_cached_rows >= almost_completely:
1962 return True
1963
1964 return False
1965
1966 def _should_use_bqstorage(self, bqstorage_client, create_bqstorage_client):
1967 """Returns True if the BigQuery Storage API can be used.
1968
1969 Returns:
1970 bool
1971 True if the BigQuery Storage client can be used or created.
1972 """
1973 using_bqstorage_api = bqstorage_client or create_bqstorage_client
1974 if not using_bqstorage_api:
1975 return False
1976
1977 if self._table is None:
1978 return False
1979
1980 # The developer has already started paging through results if
1981 # next_page_token is set.
1982 if hasattr(self, "next_page_token") and self.next_page_token is not None:
1983 return False
1984
1985 if self._is_almost_completely_cached():
1986 return False
1987
1988 if self.max_results is not None:
1989 return False
1990
1991 try:
1992 _versions_helpers.BQ_STORAGE_VERSIONS.try_import(raise_if_error=True)
1993 except bq_exceptions.BigQueryStorageNotFoundError:
1994 warnings.warn(
1995 "BigQuery Storage module not found, fetch data with the REST "
1996 "endpoint instead."
1997 )
1998 return False
1999 except bq_exceptions.LegacyBigQueryStorageError as exc:
2000 warnings.warn(str(exc))
2001 return False
2002
2003 return True
2004
2005 def _get_next_page_response(self):
2006 """Requests the next page from the path provided.
2007
2008 Returns:
2009 Dict[str, object]:
2010 The parsed JSON response of the next page's contents.
2011 """
2012 if self._first_page_response:
2013 rows = self._first_page_response.get(self._items_key, [])[
2014 : self.max_results
2015 ]
2016 response = {
2017 self._items_key: rows,
2018 }
2019 if self._next_token in self._first_page_response:
2020 response[self._next_token] = self._first_page_response[self._next_token]
2021
2022 self._first_page_response = None
2023 return response
2024
2025 params = self._get_query_params()
2026
2027 # If the user has provided page_size and start_index, we need to pass
2028 # start_index for the first page, but for all subsequent pages, we
2029 # should not pass start_index. We make a shallow copy of params and do
2030 # not alter the original, so if the user iterates the results again,
2031 # start_index is preserved.
2032 params_copy = copy.copy(params)
2033 if self._page_size is not None:
2034 if self.page_number and "startIndex" in params:
2035 del params_copy["startIndex"]
2036
2037 return self.api_request(
2038 method=self._HTTP_METHOD, path=self.path, query_params=params_copy
2039 )
2040
2041 @property
2042 def schema(self):
2043 """List[google.cloud.bigquery.schema.SchemaField]: The subset of
2044 columns to be read from the table."""
2045 return list(self._schema)
2046
2047 @property
2048 def total_rows(self):
2049 """int: The total number of rows in the table or query results."""
2050 return self._total_rows
2051
2052 def _maybe_warn_max_results(
2053 self,
2054 bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"],
2055 ):
2056 """Issue a warning if BQ Storage client is not ``None`` with ``max_results`` set.
2057
2058 This helper method should be used directly in the relevant top-level public
2059 methods, so that the warning is issued for the correct line in user code.
2060
2061 Args:
2062 bqstorage_client:
2063 The BigQuery Storage client intended to use for downloading result rows.
2064 """
2065 if bqstorage_client is not None and self.max_results is not None:
2066 warnings.warn(
2067 "Cannot use bqstorage_client if max_results is set, "
2068 "reverting to fetching data with the REST endpoint.",
2069 stacklevel=3,
2070 )
2071
2072 def _to_page_iterable(
2073 self, bqstorage_download, tabledata_list_download, bqstorage_client=None
2074 ):
2075 if not self._should_use_bqstorage(bqstorage_client, False):
2076 bqstorage_client = None
2077
2078 result_pages = (
2079 bqstorage_download()
2080 if bqstorage_client is not None
2081 else tabledata_list_download()
2082 )
2083 yield from result_pages
2084
2085 def to_arrow_iterable(
2086 self,
2087 bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
2088 max_queue_size: int = _pandas_helpers._MAX_QUEUE_SIZE_DEFAULT, # type: ignore
2089 max_stream_count: Optional[int] = None,
2090 timeout: Optional[float] = None,
2091 ) -> Iterator["pyarrow.RecordBatch"]:
2092 """[Beta] Create an iterable of class:`pyarrow.RecordBatch`, to process the table as a stream.
2093
2094 Args:
2095 bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]):
2096 A BigQuery Storage API client. If supplied, use the faster
2097 BigQuery Storage API to fetch rows from BigQuery.
2098
2099 This method requires the ``pyarrow`` and
2100 ``google-cloud-bigquery-storage`` libraries.
2101
2102 This method only exposes a subset of the capabilities of the
2103 BigQuery Storage API. For full access to all features
2104 (projections, filters, snapshots) use the Storage API directly.
2105
2106 max_queue_size (Optional[int]):
2107 The maximum number of result pages to hold in the internal queue when
2108 streaming query results over the BigQuery Storage API. Ignored if
2109 Storage API is not used.
2110
2111 By default, the max queue size is set to the number of BQ Storage streams
2112 created by the server. If ``max_queue_size`` is :data:`None`, the queue
2113 size is infinite.
2114
2115 max_stream_count (Optional[int]):
2116 The maximum number of parallel download streams when
2117 using BigQuery Storage API. Ignored if
2118 BigQuery Storage API is not used.
2119
2120 This setting also has no effect if the query result
2121 is deterministically ordered with ORDER BY,
2122 in which case, the number of download stream is always 1.
2123
2124 If set to 0 or None (the default), the number of download
2125 streams is determined by BigQuery the server. However, this behaviour
2126 can require a lot of memory to store temporary download result,
2127 especially with very large queries. In that case,
2128 setting this parameter value to a value > 0 can help
2129 reduce system resource consumption.
2130
2131 timeout (Optional[float]):
2132 The number of seconds to wait for the underlying download to complete.
2133 If ``None``, wait indefinitely.
2134
2135 Returns:
2136 pyarrow.RecordBatch:
2137 A generator of :class:`~pyarrow.RecordBatch`.
2138
2139 .. versionadded:: 2.31.0
2140 """
2141 self._maybe_warn_max_results(bqstorage_client)
2142
2143 bqstorage_download = functools.partial(
2144 _pandas_helpers.download_arrow_bqstorage,
2145 self._billing_project,
2146 self._table,
2147 bqstorage_client,
2148 preserve_order=self._preserve_order,
2149 selected_fields=self._selected_fields,
2150 max_queue_size=max_queue_size,
2151 max_stream_count=max_stream_count,
2152 timeout=timeout,
2153 )
2154 tabledata_list_download = functools.partial(
2155 _pandas_helpers.download_arrow_row_iterator, iter(self.pages), self.schema
2156 )
2157 return self._to_page_iterable(
2158 bqstorage_download,
2159 tabledata_list_download,
2160 bqstorage_client=bqstorage_client,
2161 )
2162
2163 # If changing the signature of this method, make sure to apply the same
2164 # changes to job.QueryJob.to_arrow()
2165 def to_arrow(
2166 self,
2167 progress_bar_type: Optional[str] = None,
2168 bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
2169 create_bqstorage_client: bool = True,
2170 timeout: Optional[float] = None,
2171 ) -> "pyarrow.Table":
2172 """[Beta] Create a class:`pyarrow.Table` by loading all pages of a
2173 table or query.
2174
2175 Args:
2176 progress_bar_type (Optional[str]):
2177 If set, use the `tqdm <https://tqdm.github.io/>`_ library to
2178 display a progress bar while the data downloads. Install the
2179 ``tqdm`` package to use this feature.
2180
2181 Possible values of ``progress_bar_type`` include:
2182
2183 ``None``
2184 No progress bar.
2185 ``'tqdm'``
2186 Use the :func:`tqdm.tqdm` function to print a progress bar
2187 to :data:`sys.stdout`.
2188 ``'tqdm_notebook'``
2189 Use the :func:`tqdm.notebook.tqdm` function to display a
2190 progress bar as a Jupyter notebook widget.
2191 ``'tqdm_gui'``
2192 Use the :func:`tqdm.tqdm_gui` function to display a
2193 progress bar as a graphical dialog box.
2194 bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]):
2195 A BigQuery Storage API client. If supplied, use the faster BigQuery
2196 Storage API to fetch rows from BigQuery. This API is a billable API.
2197
2198 This method requires ``google-cloud-bigquery-storage`` library.
2199
2200 This method only exposes a subset of the capabilities of the
2201 BigQuery Storage API. For full access to all features
2202 (projections, filters, snapshots) use the Storage API directly.
2203 create_bqstorage_client (Optional[bool]):
2204 If ``True`` (default), create a BigQuery Storage API client using
2205 the default API settings. The BigQuery Storage API is a faster way
2206 to fetch rows from BigQuery. See the ``bqstorage_client`` parameter
2207 for more information.
2208
2209 This argument does nothing if ``bqstorage_client`` is supplied.
2210
2211 .. versionadded:: 1.24.0
2212 timeout (Optional[float]):
2213 The number of seconds to wait for the underlying download to complete.
2214 If ``None``, wait indefinitely.
2215
2216 Returns:
2217 pyarrow.Table
2218 A :class:`pyarrow.Table` populated with row data and column
2219 headers from the query results. The column headers are derived
2220 from the destination table's schema.
2221
2222 Raises:
2223 ValueError: If the :mod:`pyarrow` library cannot be imported.
2224
2225
2226 .. versionadded:: 1.17.0
2227 """
2228 if pyarrow is None:
2229 raise ValueError(_NO_PYARROW_ERROR)
2230
2231 self._maybe_warn_max_results(bqstorage_client)
2232
2233 if not self._should_use_bqstorage(bqstorage_client, create_bqstorage_client):
2234 create_bqstorage_client = False
2235 bqstorage_client = None
2236
2237 owns_bqstorage_client = False
2238 if not bqstorage_client and create_bqstorage_client:
2239 bqstorage_client = self.client._ensure_bqstorage_client()
2240 owns_bqstorage_client = bqstorage_client is not None
2241
2242 try:
2243 progress_bar = get_progress_bar(
2244 progress_bar_type, "Downloading", self.total_rows, "rows"
2245 )
2246
2247 record_batches = []
2248 for record_batch in self.to_arrow_iterable(
2249 bqstorage_client=bqstorage_client, timeout=timeout
2250 ):
2251 record_batches.append(record_batch)
2252
2253 if progress_bar is not None:
2254 # In some cases, the number of total rows is not populated
2255 # until the first page of rows is fetched. Update the
2256 # progress bar's total to keep an accurate count.
2257 progress_bar.total = progress_bar.total or self.total_rows
2258 progress_bar.update(record_batch.num_rows)
2259
2260 if progress_bar is not None:
2261 # Indicate that the download has finished.
2262 progress_bar.close()
2263 finally:
2264 if owns_bqstorage_client:
2265 bqstorage_client._transport.grpc_channel.close() # type: ignore
2266
2267 if record_batches and bqstorage_client is not None:
2268 return pyarrow.Table.from_batches(record_batches)
2269 else:
2270 # No records (not record_batches), use schema based on BigQuery schema
2271 # **or**
2272 # we used the REST API (bqstorage_client is None),
2273 # which doesn't add arrow extension metadata, so we let
2274 # `bq_to_arrow_schema` do it.
2275 arrow_schema = _pandas_helpers.bq_to_arrow_schema(self._schema)
2276 return pyarrow.Table.from_batches(record_batches, schema=arrow_schema)
2277
2278 def to_dataframe_iterable(
2279 self,
2280 bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
2281 dtypes: Optional[Dict[str, Any]] = None,
2282 max_queue_size: int = _pandas_helpers._MAX_QUEUE_SIZE_DEFAULT, # type: ignore
2283 max_stream_count: Optional[int] = None,
2284 timeout: Optional[float] = None,
2285 ) -> "pandas.DataFrame":
2286 """Create an iterable of pandas DataFrames, to process the table as a stream.
2287
2288 Args:
2289 bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]):
2290 A BigQuery Storage API client. If supplied, use the faster
2291 BigQuery Storage API to fetch rows from BigQuery.
2292
2293 This method requires ``google-cloud-bigquery-storage`` library.
2294
2295 This method only exposes a subset of the capabilities of the
2296 BigQuery Storage API. For full access to all features
2297 (projections, filters, snapshots) use the Storage API directly.
2298
2299 dtypes (Optional[Map[str, Union[str, pandas.Series.dtype]]]):
2300 A dictionary of column names pandas ``dtype``s. The provided
2301 ``dtype`` is used when constructing the series for the column
2302 specified. Otherwise, the default pandas behavior is used.
2303
2304 max_queue_size (Optional[int]):
2305 The maximum number of result pages to hold in the internal queue when
2306 streaming query results over the BigQuery Storage API. Ignored if
2307 Storage API is not used.
2308
2309 By default, the max queue size is set to the number of BQ Storage streams
2310 created by the server. If ``max_queue_size`` is :data:`None`, the queue
2311 size is infinite.
2312
2313 .. versionadded:: 2.14.0
2314
2315 max_stream_count (Optional[int]):
2316 The maximum number of parallel download streams when
2317 using BigQuery Storage API. Ignored if
2318 BigQuery Storage API is not used.
2319
2320 This setting also has no effect if the query result
2321 is deterministically ordered with ORDER BY,
2322 in which case, the number of download stream is always 1.
2323
2324 If set to 0 or None (the default), the number of download
2325 streams is determined by BigQuery the server. However, this behaviour
2326 can require a lot of memory to store temporary download result,
2327 especially with very large queries. In that case,
2328 setting this parameter value to a value > 0 can help
2329 reduce system resource consumption.
2330
2331 timeout (Optional[float]):
2332 The number of seconds to wait for the underlying download to complete.
2333 If ``None``, wait indefinitely.
2334
2335 Returns:
2336 pandas.DataFrame:
2337 A generator of :class:`~pandas.DataFrame`.
2338
2339 Raises:
2340 ValueError:
2341 If the :mod:`pandas` library cannot be imported.
2342 """
2343 _pandas_helpers.verify_pandas_imports()
2344
2345 if dtypes is None:
2346 dtypes = {}
2347
2348 self._maybe_warn_max_results(bqstorage_client)
2349
2350 column_names = [field.name for field in self._schema]
2351 bqstorage_download = functools.partial(
2352 _pandas_helpers.download_dataframe_bqstorage,
2353 self._billing_project,
2354 self._table,
2355 bqstorage_client,
2356 column_names,
2357 dtypes,
2358 preserve_order=self._preserve_order,
2359 selected_fields=self._selected_fields,
2360 max_queue_size=max_queue_size,
2361 max_stream_count=max_stream_count,
2362 timeout=timeout,
2363 )
2364 tabledata_list_download = functools.partial(
2365 _pandas_helpers.download_dataframe_row_iterator,
2366 iter(self.pages),
2367 self.schema,
2368 dtypes,
2369 )
2370 return self._to_page_iterable(
2371 bqstorage_download,
2372 tabledata_list_download,
2373 bqstorage_client=bqstorage_client,
2374 )
2375
2376 # If changing the signature of this method, make sure to apply the same
2377 # changes to job.QueryJob.to_dataframe()
2378 def to_dataframe(
2379 self,
2380 bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
2381 dtypes: Optional[Dict[str, Any]] = None,
2382 progress_bar_type: Optional[str] = None,
2383 create_bqstorage_client: bool = True,
2384 geography_as_object: bool = False,
2385 bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE,
2386 int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
2387 float_dtype: Union[Any, None] = None,
2388 string_dtype: Union[Any, None] = None,
2389 date_dtype: Union[Any, None] = DefaultPandasDTypes.DATE_DTYPE,
2390 datetime_dtype: Union[Any, None] = None,
2391 time_dtype: Union[Any, None] = DefaultPandasDTypes.TIME_DTYPE,
2392 timestamp_dtype: Union[Any, None] = None,
2393 range_date_dtype: Union[Any, None] = DefaultPandasDTypes.RANGE_DATE_DTYPE,
2394 range_datetime_dtype: Union[
2395 Any, None
2396 ] = DefaultPandasDTypes.RANGE_DATETIME_DTYPE,
2397 range_timestamp_dtype: Union[
2398 Any, None
2399 ] = DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE,
2400 timeout: Optional[float] = None,
2401 ) -> "pandas.DataFrame":
2402 """Create a pandas DataFrame by loading all pages of a query.
2403
2404 Args:
2405 bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]):
2406 A BigQuery Storage API client. If supplied, use the faster
2407 BigQuery Storage API to fetch rows from BigQuery.
2408
2409 This method requires ``google-cloud-bigquery-storage`` library.
2410
2411 This method only exposes a subset of the capabilities of the
2412 BigQuery Storage API. For full access to all features
2413 (projections, filters, snapshots) use the Storage API directly.
2414
2415 dtypes (Optional[Map[str, Union[str, pandas.Series.dtype]]]):
2416 A dictionary of column names pandas ``dtype``s. The provided
2417 ``dtype`` is used when constructing the series for the column
2418 specified. Otherwise, the default pandas behavior is used.
2419 progress_bar_type (Optional[str]):
2420 If set, use the `tqdm <https://tqdm.github.io/>`_ library to
2421 display a progress bar while the data downloads. Install the
2422 ``tqdm`` package to use this feature.
2423
2424 Possible values of ``progress_bar_type`` include:
2425
2426 ``None``
2427 No progress bar.
2428 ``'tqdm'``
2429 Use the :func:`tqdm.tqdm` function to print a progress bar
2430 to :data:`sys.stdout`.
2431 ``'tqdm_notebook'``
2432 Use the :func:`tqdm.notebook.tqdm` function to display a
2433 progress bar as a Jupyter notebook widget.
2434 ``'tqdm_gui'``
2435 Use the :func:`tqdm.tqdm_gui` function to display a
2436 progress bar as a graphical dialog box.
2437
2438 .. versionadded:: 1.11.0
2439
2440 create_bqstorage_client (Optional[bool]):
2441 If ``True`` (default), create a BigQuery Storage API client
2442 using the default API settings. The BigQuery Storage API
2443 is a faster way to fetch rows from BigQuery. See the
2444 ``bqstorage_client`` parameter for more information.
2445
2446 This argument does nothing if ``bqstorage_client`` is supplied.
2447
2448 .. versionadded:: 1.24.0
2449
2450 geography_as_object (Optional[bool]):
2451 If ``True``, convert GEOGRAPHY data to :mod:`shapely`
2452 geometry objects. If ``False`` (default), don't cast
2453 geography data to :mod:`shapely` geometry objects.
2454
2455 .. versionadded:: 2.24.0
2456
2457 bool_dtype (Optional[pandas.Series.dtype, None]):
2458 If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``)
2459 to convert BigQuery Boolean type, instead of relying on the default
2460 ``pandas.BooleanDtype()``. If you explicitly set the value to ``None``,
2461 then the data type will be ``numpy.dtype("bool")``. BigQuery Boolean
2462 type can be found at:
2463 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
2464
2465 .. versionadded:: 3.8.0
2466
2467 int_dtype (Optional[pandas.Series.dtype, None]):
2468 If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
2469 to convert BigQuery Integer types, instead of relying on the default
2470 ``pandas.Int64Dtype()``. If you explicitly set the value to ``None``,
2471 then the data type will be ``numpy.dtype("int64")``. A list of BigQuery
2472 Integer types can be found at:
2473 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
2474
2475 .. versionadded:: 3.8.0
2476
2477 float_dtype (Optional[pandas.Series.dtype, None]):
2478 If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
2479 to convert BigQuery Float type, instead of relying on the default
2480 ``numpy.dtype("float64")``. If you explicitly set the value to ``None``,
2481 then the data type will be ``numpy.dtype("float64")``. BigQuery Float
2482 type can be found at:
2483 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
2484
2485 .. versionadded:: 3.8.0
2486
2487 string_dtype (Optional[pandas.Series.dtype, None]):
2488 If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
2489 convert BigQuery String type, instead of relying on the default
2490 ``numpy.dtype("object")``. If you explicitly set the value to ``None``,
2491 then the data type will be ``numpy.dtype("object")``. BigQuery String
2492 type can be found at:
2493 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
2494
2495 .. versionadded:: 3.8.0
2496
2497 date_dtype (Optional[pandas.Series.dtype, None]):
2498 If set, indicate a pandas ExtensionDtype (e.g.
2499 ``pandas.ArrowDtype(pyarrow.date32())``) to convert BigQuery Date
2500 type, instead of relying on the default ``db_dtypes.DateDtype()``.
2501 If you explicitly set the value to ``None``, then the data type will be
2502 ``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bound. BigQuery
2503 Date type can be found at:
2504 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#date_type
2505
2506 .. versionadded:: 3.10.0
2507
2508 datetime_dtype (Optional[pandas.Series.dtype, None]):
2509 If set, indicate a pandas ExtensionDtype (e.g.
2510 ``pandas.ArrowDtype(pyarrow.timestamp("us"))``) to convert BigQuery Datetime
2511 type, instead of relying on the default ``numpy.dtype("datetime64[ns]``.
2512 If you explicitly set the value to ``None``, then the data type will be
2513 ``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bound. BigQuery
2514 Datetime type can be found at:
2515 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#datetime_type
2516
2517 .. versionadded:: 3.10.0
2518
2519 time_dtype (Optional[pandas.Series.dtype, None]):
2520 If set, indicate a pandas ExtensionDtype (e.g.
2521 ``pandas.ArrowDtype(pyarrow.time64("us"))``) to convert BigQuery Time
2522 type, instead of relying on the default ``db_dtypes.TimeDtype()``.
2523 If you explicitly set the value to ``None``, then the data type will be
2524 ``numpy.dtype("object")``. BigQuery Time type can be found at:
2525 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#time_type
2526
2527 .. versionadded:: 3.10.0
2528
2529 timestamp_dtype (Optional[pandas.Series.dtype, None]):
2530 If set, indicate a pandas ExtensionDtype (e.g.
2531 ``pandas.ArrowDtype(pyarrow.timestamp("us", tz="UTC"))``) to convert BigQuery Timestamp
2532 type, instead of relying on the default ``numpy.dtype("datetime64[ns, UTC]")``.
2533 If you explicitly set the value to ``None``, then the data type will be
2534 ``numpy.dtype("datetime64[ns, UTC]")`` or ``object`` if out of bound. BigQuery
2535 Datetime type can be found at:
2536 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#timestamp_type
2537
2538 .. versionadded:: 3.10.0
2539
2540 range_date_dtype (Optional[pandas.Series.dtype, None]):
2541 If set, indicate a pandas ExtensionDtype, such as:
2542
2543 .. code-block:: python
2544
2545 pandas.ArrowDtype(pyarrow.struct(
2546 [("start", pyarrow.date32()), ("end", pyarrow.date32())]
2547 ))
2548
2549 to convert BigQuery RANGE<DATE> type, instead of relying on
2550 the default ``object``. If you explicitly set the value to
2551 ``None``, the data type will be ``object``. BigQuery Range type
2552 can be found at:
2553 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type
2554
2555 .. versionadded:: 3.21.0
2556
2557 range_datetime_dtype (Optional[pandas.Series.dtype, None]):
2558 If set, indicate a pandas ExtensionDtype, such as:
2559
2560 .. code-block:: python
2561
2562 pandas.ArrowDtype(pyarrow.struct(
2563 [
2564 ("start", pyarrow.timestamp("us")),
2565 ("end", pyarrow.timestamp("us")),
2566 ]
2567 ))
2568
2569 to convert BigQuery RANGE<DATETIME> type, instead of relying on
2570 the default ``object``. If you explicitly set the value to
2571 ``None``, the data type will be ``object``. BigQuery Range type
2572 can be found at:
2573 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type
2574
2575 .. versionadded:: 3.21.0
2576
2577 range_timestamp_dtype (Optional[pandas.Series.dtype, None]):
2578 If set, indicate a pandas ExtensionDtype, such as:
2579
2580 .. code-block:: python
2581
2582 pandas.ArrowDtype(pyarrow.struct(
2583 [
2584 ("start", pyarrow.timestamp("us", tz="UTC")),
2585 ("end", pyarrow.timestamp("us", tz="UTC")),
2586 ]
2587 ))
2588
2589 to convert BigQuery RANGE<TIMESTAMP> type, instead of relying
2590 on the default ``object``. If you explicitly set the value to
2591 ``None``, the data type will be ``object``. BigQuery Range type
2592 can be found at:
2593 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#range_type
2594
2595 .. versionadded:: 3.21.0
2596
2597 timeout (Optional[float]):
2598 The number of seconds to wait for the underlying download to complete.
2599 If ``None``, wait indefinitely.
2600
2601 Returns:
2602 pandas.DataFrame:
2603 A :class:`~pandas.DataFrame` populated with row data and column
2604 headers from the query results. The column headers are derived
2605 from the destination table's schema.
2606
2607 Raises:
2608 ValueError:
2609 If the :mod:`pandas` library cannot be imported, or
2610 the :mod:`google.cloud.bigquery_storage_v1` module is
2611 required but cannot be imported. Also if
2612 `geography_as_object` is `True`, but the
2613 :mod:`shapely` library cannot be imported. Also if
2614 `bool_dtype`, `int_dtype` or other dtype parameters
2615 is not supported dtype.
2616
2617 """
2618 _pandas_helpers.verify_pandas_imports()
2619
2620 if geography_as_object and shapely is None:
2621 raise ValueError(_NO_SHAPELY_ERROR)
2622
2623 if bool_dtype is DefaultPandasDTypes.BOOL_DTYPE:
2624 bool_dtype = pandas.BooleanDtype()
2625
2626 if int_dtype is DefaultPandasDTypes.INT_DTYPE:
2627 int_dtype = pandas.Int64Dtype()
2628
2629 if time_dtype is DefaultPandasDTypes.TIME_DTYPE:
2630 time_dtype = db_dtypes.TimeDtype()
2631
2632 if range_date_dtype is DefaultPandasDTypes.RANGE_DATE_DTYPE:
2633 if _versions_helpers.SUPPORTS_RANGE_PYARROW:
2634 range_date_dtype = pandas.ArrowDtype(
2635 pyarrow.struct(
2636 [("start", pyarrow.date32()), ("end", pyarrow.date32())]
2637 )
2638 )
2639 else:
2640 warnings.warn(_RANGE_PYARROW_WARNING)
2641 range_date_dtype = None
2642
2643 if range_datetime_dtype is DefaultPandasDTypes.RANGE_DATETIME_DTYPE:
2644 if _versions_helpers.SUPPORTS_RANGE_PYARROW:
2645 range_datetime_dtype = pandas.ArrowDtype(
2646 pyarrow.struct(
2647 [
2648 ("start", pyarrow.timestamp("us")),
2649 ("end", pyarrow.timestamp("us")),
2650 ]
2651 )
2652 )
2653 else:
2654 warnings.warn(_RANGE_PYARROW_WARNING)
2655 range_datetime_dtype = None
2656
2657 if range_timestamp_dtype is DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE:
2658 if _versions_helpers.SUPPORTS_RANGE_PYARROW:
2659 range_timestamp_dtype = pandas.ArrowDtype(
2660 pyarrow.struct(
2661 [
2662 ("start", pyarrow.timestamp("us", tz="UTC")),
2663 ("end", pyarrow.timestamp("us", tz="UTC")),
2664 ]
2665 )
2666 )
2667 else:
2668 warnings.warn(_RANGE_PYARROW_WARNING)
2669 range_timestamp_dtype = None
2670
2671 if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"):
2672 raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE)
2673
2674 if int_dtype is not None and not hasattr(int_dtype, "__from_arrow__"):
2675 raise ValueError("int_dtype", _NO_SUPPORTED_DTYPE)
2676
2677 if float_dtype is not None and not hasattr(float_dtype, "__from_arrow__"):
2678 raise ValueError("float_dtype", _NO_SUPPORTED_DTYPE)
2679
2680 if string_dtype is not None and not hasattr(string_dtype, "__from_arrow__"):
2681 raise ValueError("string_dtype", _NO_SUPPORTED_DTYPE)
2682
2683 if (
2684 date_dtype is not None
2685 and date_dtype is not DefaultPandasDTypes.DATE_DTYPE
2686 and not hasattr(date_dtype, "__from_arrow__")
2687 ):
2688 raise ValueError("date_dtype", _NO_SUPPORTED_DTYPE)
2689
2690 if datetime_dtype is not None and not hasattr(datetime_dtype, "__from_arrow__"):
2691 raise ValueError("datetime_dtype", _NO_SUPPORTED_DTYPE)
2692
2693 if time_dtype is not None and not hasattr(time_dtype, "__from_arrow__"):
2694 raise ValueError("time_dtype", _NO_SUPPORTED_DTYPE)
2695
2696 if timestamp_dtype is not None and not hasattr(
2697 timestamp_dtype, "__from_arrow__"
2698 ):
2699 raise ValueError("timestamp_dtype", _NO_SUPPORTED_DTYPE)
2700
2701 if dtypes is None:
2702 dtypes = {}
2703
2704 self._maybe_warn_max_results(bqstorage_client)
2705
2706 if not self._should_use_bqstorage(bqstorage_client, create_bqstorage_client):
2707 create_bqstorage_client = False
2708 bqstorage_client = None
2709
2710 record_batch = self.to_arrow(
2711 progress_bar_type=progress_bar_type,
2712 bqstorage_client=bqstorage_client,
2713 create_bqstorage_client=create_bqstorage_client,
2714 timeout=timeout,
2715 )
2716
2717 # Default date dtype is `db_dtypes.DateDtype()` that could cause out of bounds error,
2718 # when pyarrow converts date values to nanosecond precision. To avoid the error, we
2719 # set the date_as_object parameter to True, if necessary.
2720 date_as_object = False
2721 if date_dtype is DefaultPandasDTypes.DATE_DTYPE:
2722 date_dtype = db_dtypes.DateDtype()
2723 date_as_object = not all(
2724 self.__can_cast_timestamp_ns(col)
2725 for col in record_batch
2726 # Type can be date32 or date64 (plus units).
2727 # See: https://arrow.apache.org/docs/python/api/datatypes.html
2728 if pyarrow.types.is_date(col.type)
2729 )
2730
2731 timestamp_as_object = False
2732 if datetime_dtype is None and timestamp_dtype is None:
2733 timestamp_as_object = not all(
2734 self.__can_cast_timestamp_ns(col)
2735 for col in record_batch
2736 # Type can be datetime and timestamp (plus units and time zone).
2737 # See: https://arrow.apache.org/docs/python/api/datatypes.html
2738 if pyarrow.types.is_timestamp(col.type)
2739 )
2740
2741 df = record_batch.to_pandas(
2742 date_as_object=date_as_object,
2743 timestamp_as_object=timestamp_as_object,
2744 integer_object_nulls=True,
2745 types_mapper=_pandas_helpers.default_types_mapper(
2746 date_as_object=date_as_object,
2747 bool_dtype=bool_dtype,
2748 int_dtype=int_dtype,
2749 float_dtype=float_dtype,
2750 string_dtype=string_dtype,
2751 date_dtype=date_dtype,
2752 datetime_dtype=datetime_dtype,
2753 time_dtype=time_dtype,
2754 timestamp_dtype=timestamp_dtype,
2755 range_date_dtype=range_date_dtype,
2756 range_datetime_dtype=range_datetime_dtype,
2757 range_timestamp_dtype=range_timestamp_dtype,
2758 ),
2759 )
2760
2761 for column in dtypes:
2762 df[column] = pandas.Series(df[column], dtype=dtypes[column], copy=False)
2763
2764 if geography_as_object:
2765 for field in self.schema:
2766 if field.field_type.upper() == "GEOGRAPHY" and field.mode != "REPEATED":
2767 df[field.name] = df[field.name].dropna().apply(_read_wkt)
2768
2769 return df
2770
2771 @staticmethod
2772 def __can_cast_timestamp_ns(column):
2773 try:
2774 column.cast("timestamp[ns]")
2775 except pyarrow.lib.ArrowInvalid:
2776 return False
2777 else:
2778 return True
2779
2780 # If changing the signature of this method, make sure to apply the same
2781 # changes to job.QueryJob.to_geodataframe()
2782 def to_geodataframe(
2783 self,
2784 bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
2785 dtypes: Optional[Dict[str, Any]] = None,
2786 progress_bar_type: Optional[str] = None,
2787 create_bqstorage_client: bool = True,
2788 geography_column: Optional[str] = None,
2789 bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE,
2790 int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
2791 float_dtype: Union[Any, None] = None,
2792 string_dtype: Union[Any, None] = None,
2793 timeout: Optional[float] = None,
2794 ) -> "geopandas.GeoDataFrame":
2795 """Create a GeoPandas GeoDataFrame by loading all pages of a query.
2796
2797 Args:
2798 bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]):
2799 A BigQuery Storage API client. If supplied, use the faster
2800 BigQuery Storage API to fetch rows from BigQuery.
2801
2802 This method requires the ``pyarrow`` and
2803 ``google-cloud-bigquery-storage`` libraries.
2804
2805 This method only exposes a subset of the capabilities of the
2806 BigQuery Storage API. For full access to all features
2807 (projections, filters, snapshots) use the Storage API directly.
2808
2809 dtypes (Optional[Map[str, Union[str, pandas.Series.dtype]]]):
2810 A dictionary of column names pandas ``dtype``s. The provided
2811 ``dtype`` is used when constructing the series for the column
2812 specified. Otherwise, the default pandas behavior is used.
2813 progress_bar_type (Optional[str]):
2814 If set, use the `tqdm <https://tqdm.github.io/>`_ library to
2815 display a progress bar while the data downloads. Install the
2816 ``tqdm`` package to use this feature.
2817
2818 Possible values of ``progress_bar_type`` include:
2819
2820 ``None``
2821 No progress bar.
2822 ``'tqdm'``
2823 Use the :func:`tqdm.tqdm` function to print a progress bar
2824 to :data:`sys.stdout`.
2825 ``'tqdm_notebook'``
2826 Use the :func:`tqdm.notebook.tqdm` function to display a
2827 progress bar as a Jupyter notebook widget.
2828 ``'tqdm_gui'``
2829 Use the :func:`tqdm.tqdm_gui` function to display a
2830 progress bar as a graphical dialog box.
2831
2832 create_bqstorage_client (Optional[bool]):
2833 If ``True`` (default), create a BigQuery Storage API client
2834 using the default API settings. The BigQuery Storage API
2835 is a faster way to fetch rows from BigQuery. See the
2836 ``bqstorage_client`` parameter for more information.
2837
2838 This argument does nothing if ``bqstorage_client`` is supplied.
2839
2840 geography_column (Optional[str]):
2841 If there are more than one GEOGRAPHY column,
2842 identifies which one to use to construct a geopandas
2843 GeoDataFrame. This option can be ommitted if there's
2844 only one GEOGRAPHY column.
2845 bool_dtype (Optional[pandas.Series.dtype, None]):
2846 If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``)
2847 to convert BigQuery Boolean type, instead of relying on the default
2848 ``pandas.BooleanDtype()``. If you explicitly set the value to ``None``,
2849 then the data type will be ``numpy.dtype("bool")``. BigQuery Boolean
2850 type can be found at:
2851 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
2852 int_dtype (Optional[pandas.Series.dtype, None]):
2853 If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
2854 to convert BigQuery Integer types, instead of relying on the default
2855 ``pandas.Int64Dtype()``. If you explicitly set the value to ``None``,
2856 then the data type will be ``numpy.dtype("int64")``. A list of BigQuery
2857 Integer types can be found at:
2858 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
2859 float_dtype (Optional[pandas.Series.dtype, None]):
2860 If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
2861 to convert BigQuery Float type, instead of relying on the default
2862 ``numpy.dtype("float64")``. If you explicitly set the value to ``None``,
2863 then the data type will be ``numpy.dtype("float64")``. BigQuery Float
2864 type can be found at:
2865 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
2866 string_dtype (Optional[pandas.Series.dtype, None]):
2867 If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
2868 convert BigQuery String type, instead of relying on the default
2869 ``numpy.dtype("object")``. If you explicitly set the value to ``None``,
2870 then the data type will be ``numpy.dtype("object")``. BigQuery String
2871 type can be found at:
2872 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
2873
2874 Returns:
2875 geopandas.GeoDataFrame:
2876 A :class:`geopandas.GeoDataFrame` populated with row
2877 data and column headers from the query results. The
2878 column headers are derived from the destination
2879 table's schema.
2880
2881 Raises:
2882 ValueError:
2883 If the :mod:`geopandas` library cannot be imported, or the
2884 :mod:`google.cloud.bigquery_storage_v1` module is
2885 required but cannot be imported.
2886
2887 .. versionadded:: 2.24.0
2888 """
2889 if geopandas is None:
2890 raise ValueError(_NO_GEOPANDAS_ERROR)
2891
2892 geography_columns = set(
2893 field.name
2894 for field in self.schema
2895 if field.field_type.upper() == "GEOGRAPHY"
2896 )
2897 if not geography_columns:
2898 raise TypeError(
2899 "There must be at least one GEOGRAPHY column"
2900 " to create a GeoDataFrame"
2901 )
2902
2903 if geography_column:
2904 if geography_column not in geography_columns:
2905 raise ValueError(
2906 f"The given geography column, {geography_column}, doesn't name"
2907 f" a GEOGRAPHY column in the result."
2908 )
2909 elif len(geography_columns) == 1:
2910 [geography_column] = geography_columns
2911 else:
2912 raise ValueError(
2913 "There is more than one GEOGRAPHY column in the result. "
2914 "The geography_column argument must be used to specify which "
2915 "one to use to create a GeoDataFrame"
2916 )
2917
2918 df = self.to_dataframe(
2919 bqstorage_client,
2920 dtypes,
2921 progress_bar_type,
2922 create_bqstorage_client,
2923 geography_as_object=True,
2924 bool_dtype=bool_dtype,
2925 int_dtype=int_dtype,
2926 float_dtype=float_dtype,
2927 string_dtype=string_dtype,
2928 timeout=timeout,
2929 )
2930
2931 return geopandas.GeoDataFrame(
2932 df, crs=_COORDINATE_REFERENCE_SYSTEM, geometry=geography_column
2933 )
2934
2935
2936class _EmptyRowIterator(RowIterator):
2937 """An empty row iterator.
2938
2939 This class prevents API requests when there are no rows to fetch or rows
2940 are impossible to fetch, such as with query results for DDL CREATE VIEW
2941 statements.
2942 """
2943
2944 def __init__(
2945 self, client=None, api_request=None, path=None, schema=(), *args, **kwargs
2946 ):
2947 super().__init__(
2948 client=client,
2949 api_request=api_request,
2950 path=path,
2951 schema=schema,
2952 *args,
2953 **kwargs,
2954 )
2955 self._total_rows = 0
2956
2957 def to_arrow(
2958 self,
2959 progress_bar_type=None,
2960 bqstorage_client=None,
2961 create_bqstorage_client=True,
2962 timeout: Optional[float] = None,
2963 ) -> "pyarrow.Table":
2964 """[Beta] Create an empty class:`pyarrow.Table`.
2965
2966 Args:
2967 progress_bar_type (str): Ignored. Added for compatibility with RowIterator.
2968 bqstorage_client (Any): Ignored. Added for compatibility with RowIterator.
2969 create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
2970 timeout (Optional[float]): Ignored. Added for compatibility with RowIterator.
2971
2972 Returns:
2973 pyarrow.Table: An empty :class:`pyarrow.Table`.
2974 """
2975 if pyarrow is None:
2976 raise ValueError(_NO_PYARROW_ERROR)
2977 return pyarrow.Table.from_arrays(())
2978
2979 def to_dataframe(
2980 self,
2981 bqstorage_client=None,
2982 dtypes=None,
2983 progress_bar_type=None,
2984 create_bqstorage_client=True,
2985 geography_as_object=False,
2986 bool_dtype=None,
2987 int_dtype=None,
2988 float_dtype=None,
2989 string_dtype=None,
2990 date_dtype=None,
2991 datetime_dtype=None,
2992 time_dtype=None,
2993 timestamp_dtype=None,
2994 range_date_dtype=None,
2995 range_datetime_dtype=None,
2996 range_timestamp_dtype=None,
2997 timeout: Optional[float] = None,
2998 ) -> "pandas.DataFrame":
2999 """Create an empty dataframe.
3000
3001 Args:
3002 bqstorage_client (Any): Ignored. Added for compatibility with RowIterator.
3003 dtypes (Any): Ignored. Added for compatibility with RowIterator.
3004 progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
3005 create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
3006 geography_as_object (bool): Ignored. Added for compatibility with RowIterator.
3007 bool_dtype (Any): Ignored. Added for compatibility with RowIterator.
3008 int_dtype (Any): Ignored. Added for compatibility with RowIterator.
3009 float_dtype (Any): Ignored. Added for compatibility with RowIterator.
3010 string_dtype (Any): Ignored. Added for compatibility with RowIterator.
3011 date_dtype (Any): Ignored. Added for compatibility with RowIterator.
3012 datetime_dtype (Any): Ignored. Added for compatibility with RowIterator.
3013 time_dtype (Any): Ignored. Added for compatibility with RowIterator.
3014 timestamp_dtype (Any): Ignored. Added for compatibility with RowIterator.
3015 range_date_dtype (Any): Ignored. Added for compatibility with RowIterator.
3016 range_datetime_dtype (Any): Ignored. Added for compatibility with RowIterator.
3017 range_timestamp_dtype (Any): Ignored. Added for compatibility with RowIterator.
3018 timeout (Optional[float]): Ignored. Added for compatibility with RowIterator.
3019
3020 Returns:
3021 pandas.DataFrame: An empty :class:`~pandas.DataFrame`.
3022 """
3023 _pandas_helpers.verify_pandas_imports()
3024 return pandas.DataFrame()
3025
3026 def to_geodataframe(
3027 self,
3028 bqstorage_client=None,
3029 dtypes=None,
3030 progress_bar_type=None,
3031 create_bqstorage_client=True,
3032 geography_column: Optional[str] = None,
3033 bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE,
3034 int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
3035 float_dtype: Union[Any, None] = None,
3036 string_dtype: Union[Any, None] = None,
3037 timeout: Optional[float] = None,
3038 ) -> "pandas.DataFrame":
3039 """Create an empty dataframe.
3040
3041 Args:
3042 bqstorage_client (Any): Ignored. Added for compatibility with RowIterator.
3043 dtypes (Any): Ignored. Added for compatibility with RowIterator.
3044 progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
3045 create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
3046 geography_column (str): Ignored. Added for compatibility with RowIterator.
3047 bool_dtype (Any): Ignored. Added for compatibility with RowIterator.
3048 int_dtype (Any): Ignored. Added for compatibility with RowIterator.
3049 float_dtype (Any): Ignored. Added for compatibility with RowIterator.
3050 string_dtype (Any): Ignored. Added for compatibility with RowIterator.
3051 timeout (Optional[float]): Ignored. Added for compatibility with RowIterator.
3052
3053 Returns:
3054 pandas.DataFrame: An empty :class:`~pandas.DataFrame`.
3055 """
3056 if geopandas is None:
3057 raise ValueError(_NO_GEOPANDAS_ERROR)
3058
3059 # Since an empty GeoDataFrame has no geometry column, we do not CRS on it,
3060 # because that's deprecated.
3061 return geopandas.GeoDataFrame()
3062
3063 def to_dataframe_iterable(
3064 self,
3065 bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
3066 dtypes: Optional[Dict[str, Any]] = None,
3067 max_queue_size: Optional[int] = None,
3068 max_stream_count: Optional[int] = None,
3069 timeout: Optional[float] = None,
3070 ) -> Iterator["pandas.DataFrame"]:
3071 """Create an iterable of pandas DataFrames, to process the table as a stream.
3072
3073 .. versionadded:: 2.21.0
3074
3075 Args:
3076 bqstorage_client:
3077 Ignored. Added for compatibility with RowIterator.
3078
3079 dtypes (Optional[Map[str, Union[str, pandas.Series.dtype]]]):
3080 Ignored. Added for compatibility with RowIterator.
3081
3082 max_queue_size:
3083 Ignored. Added for compatibility with RowIterator.
3084
3085 max_stream_count:
3086 Ignored. Added for compatibility with RowIterator.
3087
3088 timeout (Optional[float]):
3089 Ignored. Added for compatibility with RowIterator.
3090
3091 Returns:
3092 An iterator yielding a single empty :class:`~pandas.DataFrame`.
3093
3094 Raises:
3095 ValueError:
3096 If the :mod:`pandas` library cannot be imported.
3097 """
3098 _pandas_helpers.verify_pandas_imports()
3099 return iter((pandas.DataFrame(),))
3100
3101 def to_arrow_iterable(
3102 self,
3103 bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
3104 max_queue_size: Optional[int] = None,
3105 max_stream_count: Optional[int] = None,
3106 timeout: Optional[float] = None,
3107 ) -> Iterator["pyarrow.RecordBatch"]:
3108 """Create an iterable of pandas DataFrames, to process the table as a stream.
3109
3110 .. versionadded:: 2.31.0
3111
3112 Args:
3113 bqstorage_client:
3114 Ignored. Added for compatibility with RowIterator.
3115
3116 max_queue_size:
3117 Ignored. Added for compatibility with RowIterator.
3118
3119 max_stream_count:
3120 Ignored. Added for compatibility with RowIterator.
3121
3122 timeout (Optional[float]):
3123 Ignored. Added for compatibility with RowIterator.
3124
3125 Returns:
3126 An iterator yielding a single empty :class:`~pyarrow.RecordBatch`.
3127 """
3128 return iter((pyarrow.record_batch([]),))
3129
3130 def __iter__(self):
3131 return iter(())
3132
3133
3134class PartitionRange(object):
3135 """Definition of the ranges for range partitioning.
3136
3137 .. note::
3138 **Beta**. The integer range partitioning feature is in a pre-release
3139 state and might change or have limited support.
3140
3141 Args:
3142 start (Optional[int]):
3143 Sets the
3144 :attr:`~google.cloud.bigquery.table.PartitionRange.start`
3145 property.
3146 end (Optional[int]):
3147 Sets the
3148 :attr:`~google.cloud.bigquery.table.PartitionRange.end`
3149 property.
3150 interval (Optional[int]):
3151 Sets the
3152 :attr:`~google.cloud.bigquery.table.PartitionRange.interval`
3153 property.
3154 _properties (Optional[dict]):
3155 Private. Used to construct object from API resource.
3156 """
3157
3158 def __init__(self, start=None, end=None, interval=None, _properties=None) -> None:
3159 if _properties is None:
3160 _properties = {}
3161 self._properties = _properties
3162
3163 if start is not None:
3164 self.start = start
3165 if end is not None:
3166 self.end = end
3167 if interval is not None:
3168 self.interval = interval
3169
3170 @property
3171 def start(self):
3172 """int: The start of range partitioning, inclusive."""
3173 return _helpers._int_or_none(self._properties.get("start"))
3174
3175 @start.setter
3176 def start(self, value):
3177 self._properties["start"] = _helpers._str_or_none(value)
3178
3179 @property
3180 def end(self):
3181 """int: The end of range partitioning, exclusive."""
3182 return _helpers._int_or_none(self._properties.get("end"))
3183
3184 @end.setter
3185 def end(self, value):
3186 self._properties["end"] = _helpers._str_or_none(value)
3187
3188 @property
3189 def interval(self):
3190 """int: The width of each interval."""
3191 return _helpers._int_or_none(self._properties.get("interval"))
3192
3193 @interval.setter
3194 def interval(self, value):
3195 self._properties["interval"] = _helpers._str_or_none(value)
3196
3197 def _key(self):
3198 return tuple(sorted(self._properties.items()))
3199
3200 def __eq__(self, other):
3201 if not isinstance(other, PartitionRange):
3202 return NotImplemented
3203 return self._key() == other._key()
3204
3205 def __ne__(self, other):
3206 return not self == other
3207
3208 def __repr__(self):
3209 key_vals = ["{}={}".format(key, val) for key, val in self._key()]
3210 return "PartitionRange({})".format(", ".join(key_vals))
3211
3212
3213class RangePartitioning(object):
3214 """Range-based partitioning configuration for a table.
3215
3216 .. note::
3217 **Beta**. The integer range partitioning feature is in a pre-release
3218 state and might change or have limited support.
3219
3220 Args:
3221 range_ (Optional[google.cloud.bigquery.table.PartitionRange]):
3222 Sets the
3223 :attr:`google.cloud.bigquery.table.RangePartitioning.range_`
3224 property.
3225 field (Optional[str]):
3226 Sets the
3227 :attr:`google.cloud.bigquery.table.RangePartitioning.field`
3228 property.
3229 _properties (Optional[dict]):
3230 Private. Used to construct object from API resource.
3231 """
3232
3233 def __init__(self, range_=None, field=None, _properties=None) -> None:
3234 if _properties is None:
3235 _properties = {}
3236 self._properties: Dict[str, Any] = _properties
3237
3238 if range_ is not None:
3239 self.range_ = range_
3240 if field is not None:
3241 self.field = field
3242
3243 # Trailing underscore to prevent conflict with built-in range() function.
3244 @property
3245 def range_(self):
3246 """google.cloud.bigquery.table.PartitionRange: Defines the
3247 ranges for range partitioning.
3248
3249 Raises:
3250 ValueError:
3251 If the value is not a :class:`PartitionRange`.
3252 """
3253 range_properties = self._properties.setdefault("range", {})
3254 return PartitionRange(_properties=range_properties)
3255
3256 @range_.setter
3257 def range_(self, value):
3258 if not isinstance(value, PartitionRange):
3259 raise ValueError("Expected a PartitionRange, but got {}.".format(value))
3260 self._properties["range"] = value._properties
3261
3262 @property
3263 def field(self):
3264 """str: The table is partitioned by this field.
3265
3266 The field must be a top-level ``NULLABLE`` / ``REQUIRED`` field. The
3267 only supported type is ``INTEGER`` / ``INT64``.
3268 """
3269 return self._properties.get("field")
3270
3271 @field.setter
3272 def field(self, value):
3273 self._properties["field"] = value
3274
3275 def _key(self):
3276 return (("field", self.field), ("range_", self.range_))
3277
3278 def __eq__(self, other):
3279 if not isinstance(other, RangePartitioning):
3280 return NotImplemented
3281 return self._key() == other._key()
3282
3283 def __ne__(self, other):
3284 return not self == other
3285
3286 def __repr__(self):
3287 key_vals = ["{}={}".format(key, repr(val)) for key, val in self._key()]
3288 return "RangePartitioning({})".format(", ".join(key_vals))
3289
3290
3291class TimePartitioningType(object):
3292 """Specifies the type of time partitioning to perform."""
3293
3294 DAY = "DAY"
3295 """str: Generates one partition per day."""
3296
3297 HOUR = "HOUR"
3298 """str: Generates one partition per hour."""
3299
3300 MONTH = "MONTH"
3301 """str: Generates one partition per month."""
3302
3303 YEAR = "YEAR"
3304 """str: Generates one partition per year."""
3305
3306
3307class TimePartitioning(object):
3308 """Configures time-based partitioning for a table.
3309
3310 Args:
3311 type_ (Optional[google.cloud.bigquery.table.TimePartitioningType]):
3312 Specifies the type of time partitioning to perform. Defaults to
3313 :attr:`~google.cloud.bigquery.table.TimePartitioningType.DAY`.
3314
3315 Supported values are:
3316
3317 * :attr:`~google.cloud.bigquery.table.TimePartitioningType.HOUR`
3318 * :attr:`~google.cloud.bigquery.table.TimePartitioningType.DAY`
3319 * :attr:`~google.cloud.bigquery.table.TimePartitioningType.MONTH`
3320 * :attr:`~google.cloud.bigquery.table.TimePartitioningType.YEAR`
3321
3322 field (Optional[str]):
3323 If set, the table is partitioned by this field. If not set, the
3324 table is partitioned by pseudo column ``_PARTITIONTIME``. The field
3325 must be a top-level ``TIMESTAMP``, ``DATETIME``, or ``DATE``
3326 field. Its mode must be ``NULLABLE`` or ``REQUIRED``.
3327
3328 See the `time-unit column-partitioned tables guide
3329 <https://cloud.google.com/bigquery/docs/creating-column-partitions>`_
3330 in the BigQuery documentation.
3331 expiration_ms(Optional[int]):
3332 Number of milliseconds for which to keep the storage for a
3333 partition.
3334 require_partition_filter (Optional[bool]):
3335 DEPRECATED: Use
3336 :attr:`~google.cloud.bigquery.table.Table.require_partition_filter`,
3337 instead.
3338 """
3339
3340 def __init__(
3341 self, type_=None, field=None, expiration_ms=None, require_partition_filter=None
3342 ) -> None:
3343 self._properties: Dict[str, Any] = {}
3344 if type_ is None:
3345 self.type_ = TimePartitioningType.DAY
3346 else:
3347 self.type_ = type_
3348 if field is not None:
3349 self.field = field
3350 if expiration_ms is not None:
3351 self.expiration_ms = expiration_ms
3352 if require_partition_filter is not None:
3353 self.require_partition_filter = require_partition_filter
3354
3355 @property
3356 def type_(self):
3357 """google.cloud.bigquery.table.TimePartitioningType: The type of time
3358 partitioning to use.
3359 """
3360 return self._properties.get("type")
3361
3362 @type_.setter
3363 def type_(self, value):
3364 self._properties["type"] = value
3365
3366 @property
3367 def field(self):
3368 """str: Field in the table to use for partitioning"""
3369 return self._properties.get("field")
3370
3371 @field.setter
3372 def field(self, value):
3373 self._properties["field"] = value
3374
3375 @property
3376 def expiration_ms(self):
3377 """int: Number of milliseconds to keep the storage for a partition."""
3378 return _helpers._int_or_none(self._properties.get("expirationMs"))
3379
3380 @expiration_ms.setter
3381 def expiration_ms(self, value):
3382 if value is not None:
3383 # Allow explicitly setting the expiration to None.
3384 value = str(value)
3385 self._properties["expirationMs"] = value
3386
3387 @property
3388 def require_partition_filter(self):
3389 """bool: Specifies whether partition filters are required for queries
3390
3391 DEPRECATED: Use
3392 :attr:`~google.cloud.bigquery.table.Table.require_partition_filter`,
3393 instead.
3394 """
3395 warnings.warn(
3396 (
3397 "TimePartitioning.require_partition_filter will be removed in "
3398 "future versions. Please use Table.require_partition_filter "
3399 "instead."
3400 ),
3401 PendingDeprecationWarning,
3402 stacklevel=2,
3403 )
3404 return self._properties.get("requirePartitionFilter")
3405
3406 @require_partition_filter.setter
3407 def require_partition_filter(self, value):
3408 warnings.warn(
3409 (
3410 "TimePartitioning.require_partition_filter will be removed in "
3411 "future versions. Please use Table.require_partition_filter "
3412 "instead."
3413 ),
3414 PendingDeprecationWarning,
3415 stacklevel=2,
3416 )
3417 self._properties["requirePartitionFilter"] = value
3418
3419 @classmethod
3420 def from_api_repr(cls, api_repr: dict) -> "TimePartitioning":
3421 """Return a :class:`TimePartitioning` object deserialized from a dict.
3422
3423 This method creates a new ``TimePartitioning`` instance that points to
3424 the ``api_repr`` parameter as its internal properties dict. This means
3425 that when a ``TimePartitioning`` instance is stored as a property of
3426 another object, any changes made at the higher level will also appear
3427 here::
3428
3429 >>> time_partitioning = TimePartitioning()
3430 >>> table.time_partitioning = time_partitioning
3431 >>> table.time_partitioning.field = 'timecolumn'
3432 >>> time_partitioning.field
3433 'timecolumn'
3434
3435 Args:
3436 api_repr (Mapping[str, str]):
3437 The serialized representation of the TimePartitioning, such as
3438 what is output by :meth:`to_api_repr`.
3439
3440 Returns:
3441 google.cloud.bigquery.table.TimePartitioning:
3442 The ``TimePartitioning`` object.
3443 """
3444 instance = cls()
3445 instance._properties = api_repr
3446 return instance
3447
3448 def to_api_repr(self) -> dict:
3449 """Return a dictionary representing this object.
3450
3451 This method returns the properties dict of the ``TimePartitioning``
3452 instance rather than making a copy. This means that when a
3453 ``TimePartitioning`` instance is stored as a property of another
3454 object, any changes made at the higher level will also appear here.
3455
3456 Returns:
3457 dict:
3458 A dictionary representing the TimePartitioning object in
3459 serialized form.
3460 """
3461 return self._properties
3462
3463 def _key(self):
3464 # because we are only "renaming" top level keys shallow copy is sufficient here.
3465 properties = self._properties.copy()
3466 # calling repr for non built-in type objects.
3467 properties["type_"] = repr(properties.pop("type"))
3468 if "field" in properties:
3469 # calling repr for non built-in type objects.
3470 properties["field"] = repr(properties["field"])
3471 if "requirePartitionFilter" in properties:
3472 properties["require_partition_filter"] = properties.pop(
3473 "requirePartitionFilter"
3474 )
3475 if "expirationMs" in properties:
3476 properties["expiration_ms"] = properties.pop("expirationMs")
3477 return tuple(sorted(properties.items()))
3478
3479 def __eq__(self, other):
3480 if not isinstance(other, TimePartitioning):
3481 return NotImplemented
3482 return self._key() == other._key()
3483
3484 def __ne__(self, other):
3485 return not self == other
3486
3487 def __hash__(self):
3488 return hash(self._key())
3489
3490 def __repr__(self):
3491 key_vals = ["{}={}".format(key, val) for key, val in self._key()]
3492 return "TimePartitioning({})".format(",".join(key_vals))
3493
3494
3495class PrimaryKey:
3496 """Represents the primary key constraint on a table's columns.
3497
3498 Args:
3499 columns: The columns that are composed of the primary key constraint.
3500 """
3501
3502 def __init__(self, columns: List[str]):
3503 self.columns = columns
3504
3505 def __eq__(self, other):
3506 if not isinstance(other, PrimaryKey):
3507 raise TypeError("The value provided is not a BigQuery PrimaryKey.")
3508 return self.columns == other.columns
3509
3510
3511class ColumnReference:
3512 """The pair of the foreign key column and primary key column.
3513
3514 Args:
3515 referencing_column: The column that composes the foreign key.
3516 referenced_column: The column in the primary key that are referenced by the referencingColumn.
3517 """
3518
3519 def __init__(self, referencing_column: str, referenced_column: str):
3520 self.referencing_column = referencing_column
3521 self.referenced_column = referenced_column
3522
3523 def __eq__(self, other):
3524 if not isinstance(other, ColumnReference):
3525 raise TypeError("The value provided is not a BigQuery ColumnReference.")
3526 return (
3527 self.referencing_column == other.referencing_column
3528 and self.referenced_column == other.referenced_column
3529 )
3530
3531
3532class ForeignKey:
3533 """Represents a foreign key constraint on a table's columns.
3534
3535 Args:
3536 name: Set only if the foreign key constraint is named.
3537 referenced_table: The table that holds the primary key and is referenced by this foreign key.
3538 column_references: The columns that compose the foreign key.
3539 """
3540
3541 def __init__(
3542 self,
3543 name: str,
3544 referenced_table: TableReference,
3545 column_references: List[ColumnReference],
3546 ):
3547 self.name = name
3548 self.referenced_table = referenced_table
3549 self.column_references = column_references
3550
3551 def __eq__(self, other):
3552 if not isinstance(other, ForeignKey):
3553 raise TypeError("The value provided is not a BigQuery ForeignKey.")
3554 return (
3555 self.name == other.name
3556 and self.referenced_table == other.referenced_table
3557 and self.column_references == other.column_references
3558 )
3559
3560 @classmethod
3561 def from_api_repr(cls, api_repr: Dict[str, Any]) -> "ForeignKey":
3562 """Create an instance from API representation."""
3563 return cls(
3564 name=api_repr["name"],
3565 referenced_table=TableReference.from_api_repr(api_repr["referencedTable"]),
3566 column_references=[
3567 ColumnReference(
3568 column_reference_resource["referencingColumn"],
3569 column_reference_resource["referencedColumn"],
3570 )
3571 for column_reference_resource in api_repr["columnReferences"]
3572 ],
3573 )
3574
3575 def to_api_repr(self) -> Dict[str, Any]:
3576 """Return a dictionary representing this object."""
3577 return {
3578 "name": self.name,
3579 "referencedTable": self.referenced_table.to_api_repr(),
3580 "columnReferences": [
3581 {
3582 "referencingColumn": column_reference.referencing_column,
3583 "referencedColumn": column_reference.referenced_column,
3584 }
3585 for column_reference in self.column_references
3586 ],
3587 }
3588
3589
3590class TableConstraints:
3591 """The TableConstraints defines the primary key and foreign key.
3592
3593 Args:
3594 primary_key:
3595 Represents a primary key constraint on a table's columns. Present only if the table
3596 has a primary key. The primary key is not enforced.
3597 foreign_keys:
3598 Present only if the table has a foreign key. The foreign key is not enforced.
3599
3600 """
3601
3602 def __init__(
3603 self,
3604 primary_key: Optional[PrimaryKey],
3605 foreign_keys: Optional[List[ForeignKey]],
3606 ):
3607 self.primary_key = primary_key
3608 self.foreign_keys = foreign_keys
3609
3610 def __eq__(self, other):
3611 if not isinstance(other, TableConstraints) and other is not None:
3612 raise TypeError("The value provided is not a BigQuery TableConstraints.")
3613 return self.primary_key == (
3614 other.primary_key if other.primary_key else None
3615 ) and self.foreign_keys == (other.foreign_keys if other.foreign_keys else None)
3616
3617 @classmethod
3618 def from_api_repr(cls, resource: Dict[str, Any]) -> "TableConstraints":
3619 """Create an instance from API representation."""
3620 primary_key = None
3621 if "primaryKey" in resource:
3622 primary_key = PrimaryKey(resource["primaryKey"]["columns"])
3623
3624 foreign_keys = None
3625 if "foreignKeys" in resource:
3626 foreign_keys = [
3627 ForeignKey.from_api_repr(foreign_key_resource)
3628 for foreign_key_resource in resource["foreignKeys"]
3629 ]
3630 return cls(primary_key, foreign_keys)
3631
3632 def to_api_repr(self) -> Dict[str, Any]:
3633 """Return a dictionary representing this object."""
3634 resource: Dict[str, Any] = {}
3635 if self.primary_key:
3636 resource["primaryKey"] = {"columns": self.primary_key.columns}
3637 if self.foreign_keys:
3638 resource["foreignKeys"] = [
3639 foreign_key.to_api_repr() for foreign_key in self.foreign_keys
3640 ]
3641 return resource
3642
3643
3644class BigLakeConfiguration(object):
3645 """Configuration for managed tables for Apache Iceberg, formerly
3646 known as BigLake.
3647
3648 Args:
3649 connection_id (Optional[str]):
3650 The connection specifying the credentials to be used to read and write to external
3651 storage, such as Cloud Storage. The connection_id can have the form
3652 ``{project}.{location}.{connection_id}`` or
3653 ``projects/{project}/locations/{location}/connections/{connection_id}``.
3654 storage_uri (Optional[str]):
3655 The fully qualified location prefix of the external folder where table data is
3656 stored. The '*' wildcard character is not allowed. The URI should be in the
3657 format ``gs://bucket/path_to_table/``.
3658 file_format (Optional[str]):
3659 The file format the table data is stored in. See BigLakeFileFormat for available
3660 values.
3661 table_format (Optional[str]):
3662 The table format the metadata only snapshots are stored in. See BigLakeTableFormat
3663 for available values.
3664 _properties (Optional[dict]):
3665 Private. Used to construct object from API resource.
3666 """
3667
3668 def __init__(
3669 self,
3670 connection_id: Optional[str] = None,
3671 storage_uri: Optional[str] = None,
3672 file_format: Optional[str] = None,
3673 table_format: Optional[str] = None,
3674 _properties: Optional[dict] = None,
3675 ) -> None:
3676 if _properties is None:
3677 _properties = {}
3678 self._properties = _properties
3679 if connection_id is not None:
3680 self.connection_id = connection_id
3681 if storage_uri is not None:
3682 self.storage_uri = storage_uri
3683 if file_format is not None:
3684 self.file_format = file_format
3685 if table_format is not None:
3686 self.table_format = table_format
3687
3688 @property
3689 def connection_id(self) -> Optional[str]:
3690 """str: The connection specifying the credentials to be used to read and write to external
3691 storage, such as Cloud Storage."""
3692 return self._properties.get("connectionId")
3693
3694 @connection_id.setter
3695 def connection_id(self, value: Optional[str]):
3696 self._properties["connectionId"] = value
3697
3698 @property
3699 def storage_uri(self) -> Optional[str]:
3700 """str: The fully qualified location prefix of the external folder where table data is
3701 stored."""
3702 return self._properties.get("storageUri")
3703
3704 @storage_uri.setter
3705 def storage_uri(self, value: Optional[str]):
3706 self._properties["storageUri"] = value
3707
3708 @property
3709 def file_format(self) -> Optional[str]:
3710 """str: The file format the table data is stored in. See BigLakeFileFormat for available
3711 values."""
3712 return self._properties.get("fileFormat")
3713
3714 @file_format.setter
3715 def file_format(self, value: Optional[str]):
3716 self._properties["fileFormat"] = value
3717
3718 @property
3719 def table_format(self) -> Optional[str]:
3720 """str: The table format the metadata only snapshots are stored in. See BigLakeTableFormat
3721 for available values."""
3722 return self._properties.get("tableFormat")
3723
3724 @table_format.setter
3725 def table_format(self, value: Optional[str]):
3726 self._properties["tableFormat"] = value
3727
3728 def _key(self):
3729 return tuple(sorted(self._properties.items()))
3730
3731 def __eq__(self, other):
3732 if not isinstance(other, BigLakeConfiguration):
3733 return NotImplemented
3734 return self._key() == other._key()
3735
3736 def __ne__(self, other):
3737 return not self == other
3738
3739 def __hash__(self):
3740 return hash(self._key())
3741
3742 def __repr__(self):
3743 key_vals = ["{}={}".format(key, val) for key, val in self._key()]
3744 return "BigLakeConfiguration({})".format(",".join(key_vals))
3745
3746 @classmethod
3747 def from_api_repr(cls, resource: Dict[str, Any]) -> "BigLakeConfiguration":
3748 """Factory: construct a BigLakeConfiguration given its API representation.
3749
3750 Args:
3751 resource:
3752 BigLakeConfiguration representation returned from the API
3753
3754 Returns:
3755 BigLakeConfiguration parsed from ``resource``.
3756 """
3757 ref = cls()
3758 ref._properties = resource
3759 return ref
3760
3761 def to_api_repr(self) -> Dict[str, Any]:
3762 """Construct the API resource representation of this BigLakeConfiguration.
3763
3764 Returns:
3765 BigLakeConfiguration represented as an API resource.
3766 """
3767 return copy.deepcopy(self._properties)
3768
3769
3770def _item_to_row(iterator, resource):
3771 """Convert a JSON row to the native object.
3772
3773 .. note::
3774
3775 This assumes that the ``schema`` attribute has been
3776 added to the iterator after being created, which
3777 should be done by the caller.
3778
3779 Args:
3780 iterator (google.api_core.page_iterator.Iterator): The iterator that is currently in use.
3781 resource (Dict): An item to be converted to a row.
3782
3783 Returns:
3784 google.cloud.bigquery.table.Row: The next row in the page.
3785 """
3786 return Row(
3787 _helpers._row_tuple_from_json(resource, iterator.schema),
3788 iterator._field_to_index,
3789 )
3790
3791
3792def _row_iterator_page_columns(schema, response):
3793 """Make a generator of all the columns in a page from tabledata.list.
3794
3795 This enables creating a :class:`pandas.DataFrame` and other
3796 column-oriented data structures such as :class:`pyarrow.RecordBatch`
3797 """
3798 columns = []
3799 rows = response.get("rows", [])
3800
3801 def get_column_data(field_index, field):
3802 for row in rows:
3803 yield _helpers.DATA_FRAME_CELL_DATA_PARSER.to_py(
3804 row["f"][field_index]["v"], field
3805 )
3806
3807 for field_index, field in enumerate(schema):
3808 columns.append(get_column_data(field_index, field))
3809
3810 return columns
3811
3812
3813# pylint: disable=unused-argument
3814def _rows_page_start(iterator, page, response):
3815 """Grab total rows when :class:`~google.cloud.iterator.Page` starts.
3816
3817 Args:
3818 iterator (google.api_core.page_iterator.Iterator): The iterator that is currently in use.
3819 page (google.api_core.page_iterator.Page): The page that was just created.
3820 response (Dict): The JSON API response for a page of rows in a table.
3821 """
3822 # Make a (lazy) copy of the page in column-oriented format for use in data
3823 # science packages.
3824 page._columns = _row_iterator_page_columns(iterator._schema, response)
3825
3826 total_rows = response.get("totalRows")
3827 # Don't reset total_rows if it's not present in the next API response.
3828 if total_rows is not None:
3829 iterator._total_rows = int(total_rows)
3830
3831
3832# pylint: enable=unused-argument
3833
3834
3835def _table_arg_to_table_ref(value, default_project=None) -> TableReference:
3836 """Helper to convert a string or Table to TableReference.
3837
3838 This function keeps TableReference and other kinds of objects unchanged.
3839 """
3840 if isinstance(value, str):
3841 value = TableReference.from_string(value, default_project=default_project)
3842 if isinstance(value, (Table, TableListItem)):
3843 value = value.reference
3844 return value
3845
3846
3847def _table_arg_to_table(value, default_project=None) -> Table:
3848 """Helper to convert a string or TableReference to a Table.
3849
3850 This function keeps Table and other kinds of objects unchanged.
3851 """
3852 if isinstance(value, str):
3853 value = TableReference.from_string(value, default_project=default_project)
3854 if isinstance(value, TableReference):
3855 value = Table(value)
3856 if isinstance(value, TableListItem):
3857 newvalue = Table(value.reference)
3858 newvalue._properties = value._properties
3859 value = newvalue
3860
3861 return value