Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/google/cloud/bigquery/table.py: 37%
901 statements, coverage.py v7.2.2, created at 2023-03-26 06:07 +0000
1# Copyright 2015 Google LLC
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
15"""Define API Tables."""
17from __future__ import absolute_import
19import copy
20import datetime
21import functools
22import operator
23import typing
24from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union
25import warnings
27try:
28 import pandas # type: ignore
29except ImportError: # pragma: NO COVER
30 pandas = None
32try:
33 import pyarrow # type: ignore
34except ImportError: # pragma: NO COVER
35 pyarrow = None
37try:
38 import db_dtypes # type: ignore
39except ImportError: # pragma: NO COVER
40 db_dtypes = None
42try:
43 import geopandas # type: ignore
44except ImportError:
45 geopandas = None
46else:
47 _COORDINATE_REFERENCE_SYSTEM = "EPSG:4326"
49try:
50 import shapely # type: ignore
51 from shapely import wkt # type: ignore
52except ImportError:
53 shapely = None
54else:
55 _read_wkt = wkt.loads
57import google.api_core.exceptions
58from google.api_core.page_iterator import HTTPIterator
60import google.cloud._helpers # type: ignore
61from google.cloud.bigquery import _helpers
62from google.cloud.bigquery import _pandas_helpers
63from google.cloud.bigquery.enums import DefaultPandasDTypes
64from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
65from google.cloud.bigquery.schema import _build_schema_resource
66from google.cloud.bigquery.schema import _parse_schema_resource
67from google.cloud.bigquery.schema import _to_schema_fields
68from google.cloud.bigquery._tqdm_helpers import get_progress_bar
69from google.cloud.bigquery.external_config import ExternalConfig
70from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
72if typing.TYPE_CHECKING: # pragma: NO COVER
73 # Unconditionally import optional dependencies again to tell pytype that
74 # they are not None, avoiding false "no attribute" errors.
75 import pandas
76 import pyarrow
77 import geopandas # type: ignore
78 from google.cloud import bigquery_storage # type: ignore
79 from google.cloud.bigquery.dataset import DatasetReference
82_NO_GEOPANDAS_ERROR = (
83 "The geopandas library is not installed, please install "
84 "geopandas to use the to_geodataframe() function."
85)
86_NO_PYARROW_ERROR = (
87 "The pyarrow library is not installed, please install "
88 "pyarrow to use the to_arrow() function."
89)
90_NO_SHAPELY_ERROR = (
91 "The shapely library is not installed, please install "
92 "shapely to use the geography_as_object option."
93)
95_TABLE_HAS_NO_SCHEMA = 'Table has no schema: call "client.get_table()"'
97_NO_SUPPORTED_DTYPE = (
98 "The dtype cannot to be converted to a pandas ExtensionArray "
99 "because the necessary `__from_arrow__` attribute is missing."
100)
103def _reference_getter(table):
104 """A :class:`~google.cloud.bigquery.table.TableReference` pointing to
105 this table.
107 Returns:
108 google.cloud.bigquery.table.TableReference: pointer to this table.
109 """
110 from google.cloud.bigquery import dataset
112 dataset_ref = dataset.DatasetReference(table.project, table.dataset_id)
113 return TableReference(dataset_ref, table.table_id)
116def _view_use_legacy_sql_getter(table):
117 """bool: Specifies whether to execute the view with Legacy or Standard SQL.
119 This boolean specifies whether to execute the view with Legacy SQL
120 (:data:`True`) or Standard SQL (:data:`False`). The client side default is
121 :data:`False`. The server-side default is :data:`True`. If this table is
122 not a view, :data:`None` is returned.
124 Raises:
125 ValueError: For invalid value types.
126 """
127 view = table._properties.get("view")
128 if view is not None:
129 # The server-side default for useLegacySql is True.
130 return view.get("useLegacySql", True)
131 # In some cases, such as in a table list, no view object is present, but the
132 # resource still represents a view. Use the type as a fallback.
133 if table.table_type == "VIEW":
134 # The server-side default for useLegacySql is True.
135 return True
138class _TableBase:
139 """Base class for Table-related classes with common functionality."""
141 _PROPERTY_TO_API_FIELD: Dict[str, Union[str, List[str]]] = {
142 "dataset_id": ["tableReference", "datasetId"],
143 "project": ["tableReference", "projectId"],
144 "table_id": ["tableReference", "tableId"],
145 }
147 def __init__(self):
148 self._properties = {}
150 @property
151 def project(self) -> str:
152 """Project bound to the table."""
153 return _helpers._get_sub_prop(
154 self._properties, self._PROPERTY_TO_API_FIELD["project"]
155 )
157 @property
158 def dataset_id(self) -> str:
159 """ID of dataset containing the table."""
160 return _helpers._get_sub_prop(
161 self._properties, self._PROPERTY_TO_API_FIELD["dataset_id"]
162 )
164 @property
165 def table_id(self) -> str:
166 """The table ID."""
167 return _helpers._get_sub_prop(
168 self._properties, self._PROPERTY_TO_API_FIELD["table_id"]
169 )
171 @property
172 def path(self) -> str:
173 """URL path for the table's APIs."""
174 return (
175 f"/projects/{self.project}/datasets/{self.dataset_id}"
176 f"/tables/{self.table_id}"
177 )
179 def __eq__(self, other):
180 if isinstance(other, _TableBase):
181 return (
182 self.project == other.project
183 and self.dataset_id == other.dataset_id
184 and self.table_id == other.table_id
185 )
186 else:
187 return NotImplemented
189 def __hash__(self):
190 return hash((self.project, self.dataset_id, self.table_id))
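# Illustrative sketch, not part of the library source: equality and hashing for
# _TableBase subclasses depend only on (project, dataset_id, table_id), so a
# Table and a TableReference that point at the same table compare equal.
#
#   >>> ref = TableReference.from_string("my-project.mydataset.mytable")
#   >>> Table(ref) == ref
#   True
#   >>> len({ref, Table(ref)})
#   1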
193class TableReference(_TableBase):
194 """TableReferences are pointers to tables.
196 See
197 https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#tablereference
199 Args:
200 dataset_ref: A pointer to the dataset
201 table_id: The ID of the table
202 """
204 _PROPERTY_TO_API_FIELD = {
205 "dataset_id": "datasetId",
206 "project": "projectId",
207 "table_id": "tableId",
208 }
210 def __init__(self, dataset_ref: "DatasetReference", table_id: str):
211 self._properties = {}
213 _helpers._set_sub_prop(
214 self._properties,
215 self._PROPERTY_TO_API_FIELD["project"],
216 dataset_ref.project,
217 )
218 _helpers._set_sub_prop(
219 self._properties,
220 self._PROPERTY_TO_API_FIELD["dataset_id"],
221 dataset_ref.dataset_id,
222 )
223 _helpers._set_sub_prop(
224 self._properties,
225 self._PROPERTY_TO_API_FIELD["table_id"],
226 table_id,
227 )
229 @classmethod
230 def from_string(
231 cls, table_id: str, default_project: str = None
232 ) -> "TableReference":
233 """Construct a table reference from table ID string.
235 Args:
236 table_id (str):
237 A table ID in standard SQL format. If ``default_project``
238 is not specified, this must include a project ID, dataset
239 ID, and table ID, each separated by ``.``.
240 default_project (Optional[str]):
241 The project ID to use when ``table_id`` does not
242 include a project ID.
244 Returns:
245 TableReference: Table reference parsed from ``table_id``.
247 Examples:
248 >>> TableReference.from_string('my-project.mydataset.mytable')
249 TableRef...(DatasetRef...('my-project', 'mydataset'), 'mytable')
251 Raises:
252 ValueError:
253 If ``table_id`` is not a fully-qualified table ID in
254 standard SQL format.
255 """
256 from google.cloud.bigquery.dataset import DatasetReference
258 (
259 output_project_id,
260 output_dataset_id,
261 output_table_id,
262 ) = _helpers._parse_3_part_id(
263 table_id, default_project=default_project, property_name="table_id"
264 )
266 return cls(
267 DatasetReference(output_project_id, output_dataset_id), output_table_id
268 )
270 @classmethod
271 def from_api_repr(cls, resource: dict) -> "TableReference":
272 """Factory: construct a table reference given its API representation
274 Args:
275 resource (Dict[str, object]):
276 Table reference representation returned from the API
278 Returns:
279 google.cloud.bigquery.table.TableReference:
280 Table reference parsed from ``resource``.
281 """
282 from google.cloud.bigquery.dataset import DatasetReference
284 project = resource["projectId"]
285 dataset_id = resource["datasetId"]
286 table_id = resource["tableId"]
288 return cls(DatasetReference(project, dataset_id), table_id)
290 def to_api_repr(self) -> dict:
291 """Construct the API resource representation of this table reference.
293 Returns:
294 Dict[str, object]: Table reference represented as an API resource
295 """
296 return copy.deepcopy(self._properties)
298 def to_bqstorage(self) -> str:
299 """Construct a BigQuery Storage API representation of this table.
301 Install the ``google-cloud-bigquery-storage`` package to use this
302 feature.
304 If the ``table_id`` contains a partition identifier (e.g.
305 ``my_table$201812``) or a snapshot identifier (e.g.
306 ``mytable@1234567890``), it is ignored. Use
307 :class:`google.cloud.bigquery_storage.types.ReadSession.TableReadOptions`
308 to filter rows by partition. Use
309 :class:`google.cloud.bigquery_storage.types.ReadSession.TableModifiers`
310 to select a specific snapshot to read from.
312 Returns:
313 str: A reference to this table in the BigQuery Storage API.
314 """
316 table_id, _, _ = self.table_id.partition("@")
317 table_id, _, _ = table_id.partition("$")
319 table_ref = (
320 f"projects/{self.project}/datasets/{self.dataset_id}/tables/{table_id}"
321 )
322 return table_ref
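# Illustrative sketch, not part of the library source: partition and snapshot
# decorators in the table ID are dropped from the BigQuery Storage path.
#
#   >>> ref = TableReference.from_string("my-project.mydataset.mytable$20181231")
#   >>> ref.to_bqstorage()
#   'projects/my-project/datasets/mydataset/tables/mytable'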
324 def __str__(self):
325 return f"{self.project}.{self.dataset_id}.{self.table_id}"
327 def __repr__(self):
328 from google.cloud.bigquery.dataset import DatasetReference
330 dataset_ref = DatasetReference(self.project, self.dataset_id)
331 return f"TableReference({dataset_ref!r}, '{self.table_id}')"
334class Table(_TableBase):
335 """Tables represent a set of rows whose values correspond to a schema.
337 See
338 https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#resource-table
340 Args:
341 table_ref (Union[google.cloud.bigquery.table.TableReference, str]):
342 A pointer to a table. If ``table_ref`` is a string, it must
343 include a project ID, dataset ID, and table ID, each separated
344 by ``.``.
345 schema (Optional[Sequence[Union[ \
346 :class:`~google.cloud.bigquery.schema.SchemaField`, \
347 Mapping[str, Any] \
348 ]]]):
349 The table's schema. If any item is a mapping, its content must be
350 compatible with
351 :meth:`~google.cloud.bigquery.schema.SchemaField.from_api_repr`.
352 """
354 _PROPERTY_TO_API_FIELD = {
355 **_TableBase._PROPERTY_TO_API_FIELD,
356 "clustering_fields": "clustering",
357 "created": "creationTime",
358 "description": "description",
359 "encryption_configuration": "encryptionConfiguration",
360 "etag": "etag",
361 "expires": "expirationTime",
362 "external_data_configuration": "externalDataConfiguration",
363 "friendly_name": "friendlyName",
364 "full_table_id": "id",
365 "labels": "labels",
366 "location": "location",
367 "modified": "lastModifiedTime",
368 "mview_enable_refresh": "materializedView",
369 "mview_last_refresh_time": ["materializedView", "lastRefreshTime"],
370 "mview_query": "materializedView",
371 "mview_refresh_interval": "materializedView",
372 "num_bytes": "numBytes",
373 "num_rows": "numRows",
374 "partition_expiration": "timePartitioning",
375 "partitioning_type": "timePartitioning",
376 "range_partitioning": "rangePartitioning",
377 "time_partitioning": "timePartitioning",
378 "schema": "schema",
379 "snapshot_definition": "snapshotDefinition",
380 "clone_definition": "cloneDefinition",
381 "streaming_buffer": "streamingBuffer",
382 "self_link": "selfLink",
383 "time_partitioning": "timePartitioning",
384 "type": "type",
385 "view_use_legacy_sql": "view",
386 "view_query": "view",
387 "require_partition_filter": "requirePartitionFilter",
388 }
390 def __init__(self, table_ref, schema=None) -> None:
391 table_ref = _table_arg_to_table_ref(table_ref)
392 self._properties = {"tableReference": table_ref.to_api_repr(), "labels": {}}
393 # Let the @property do validation.
394 if schema is not None:
395 self.schema = schema
397 reference = property(_reference_getter)
399 @property
400 def require_partition_filter(self):
401 """bool: If set to true, queries over the partitioned table require a
402 partition filter that can be used for partition elimination to be
403 specified.
404 """
405 return self._properties.get(
406 self._PROPERTY_TO_API_FIELD["require_partition_filter"]
407 )
409 @require_partition_filter.setter
410 def require_partition_filter(self, value):
411 self._properties[
412 self._PROPERTY_TO_API_FIELD["require_partition_filter"]
413 ] = value
415 @property
416 def schema(self):
417 """Sequence[Union[ \
418 :class:`~google.cloud.bigquery.schema.SchemaField`, \
419 Mapping[str, Any] \
420 ]]:
421 Table's schema.
423 Raises:
424 Exception:
425 If ``schema`` is not a sequence, or if any item in the sequence
426 is not a :class:`~google.cloud.bigquery.schema.SchemaField`
427 instance or a compatible mapping representation of the field.
428 """
429 prop = self._properties.get(self._PROPERTY_TO_API_FIELD["schema"])
430 if not prop:
431 return []
432 else:
433 return _parse_schema_resource(prop)
435 @schema.setter
436 def schema(self, value):
437 api_field = self._PROPERTY_TO_API_FIELD["schema"]
439 if value is None:
440 self._properties[api_field] = None
441 else:
442 value = _to_schema_fields(value)
443 self._properties[api_field] = {"fields": _build_schema_resource(value)}
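# Illustrative sketch, not part of the library source: the schema setter accepts
# SchemaField objects as well as mappings compatible with
# SchemaField.from_api_repr(); "my-project.mydataset.mytable" is a placeholder.
#
#   >>> from google.cloud.bigquery import SchemaField
#   >>> table = Table("my-project.mydataset.mytable")
#   >>> table.schema = [
#   ...     SchemaField("full_name", "STRING", mode="REQUIRED"),
#   ...     {"name": "age", "type": "INTEGER", "mode": "NULLABLE"},
#   ... ]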
445 @property
446 def labels(self):
447 """Dict[str, str]: Labels for the table.
449 This method always returns a dict. To change a table's labels,
450 modify the dict, then call ``Client.update_table``. To delete a
451 label, set its value to :data:`None` before updating.
453 Raises:
454 ValueError: If ``value`` type is invalid.
455 """
456 return self._properties.setdefault(self._PROPERTY_TO_API_FIELD["labels"], {})
458 @labels.setter
459 def labels(self, value):
460 if not isinstance(value, dict):
461 raise ValueError("Pass a dict")
462 self._properties[self._PROPERTY_TO_API_FIELD["labels"]] = value
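# Illustrative sketch, not part of the library source: labels are changed by
# mutating the dict and then calling Client.update_table() (a Client instance
# is assumed); setting a label's value to None deletes it on update.
#
#   >>> table.labels["env"] = "dev"
#   >>> table.labels["obsolete"] = None
#   >>> # table = client.update_table(table, ["labels"])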
464 @property
465 def encryption_configuration(self):
466 """google.cloud.bigquery.encryption_configuration.EncryptionConfiguration: Custom
467 encryption configuration for the table.
469 Custom encryption configuration (e.g., Cloud KMS keys) or :data:`None`
470 if using default encryption.
472 See `protecting data with Cloud KMS keys
473 <https://cloud.google.com/bigquery/docs/customer-managed-encryption>`_
474 in the BigQuery documentation.
475 """
476 prop = self._properties.get(
477 self._PROPERTY_TO_API_FIELD["encryption_configuration"]
478 )
479 if prop is not None:
480 prop = EncryptionConfiguration.from_api_repr(prop)
481 return prop
483 @encryption_configuration.setter
484 def encryption_configuration(self, value):
485 api_repr = value
486 if value is not None:
487 api_repr = value.to_api_repr()
488 self._properties[
489 self._PROPERTY_TO_API_FIELD["encryption_configuration"]
490 ] = api_repr
492 @property
493 def created(self):
494 """Union[datetime.datetime, None]: Datetime at which the table was
495 created (:data:`None` until set from the server).
496 """
497 creation_time = self._properties.get(self._PROPERTY_TO_API_FIELD["created"])
498 if creation_time is not None:
499 # creation_time will be in milliseconds.
500 return google.cloud._helpers._datetime_from_microseconds(
501 1000.0 * float(creation_time)
502 )
504 @property
505 def etag(self):
506 """Union[str, None]: ETag for the table resource (:data:`None` until
507 set from the server).
508 """
509 return self._properties.get(self._PROPERTY_TO_API_FIELD["etag"])
511 @property
512 def modified(self):
513 """Union[datetime.datetime, None]: Datetime at which the table was last
514 modified (:data:`None` until set from the server).
515 """
516 modified_time = self._properties.get(self._PROPERTY_TO_API_FIELD["modified"])
517 if modified_time is not None:
518 # modified_time will be in milliseconds.
519 return google.cloud._helpers._datetime_from_microseconds(
520 1000.0 * float(modified_time)
521 )
523 @property
524 def num_bytes(self):
525 """Union[int, None]: The size of the table in bytes (:data:`None` until
526 set from the server).
527 """
528 return _helpers._int_or_none(
529 self._properties.get(self._PROPERTY_TO_API_FIELD["num_bytes"])
530 )
532 @property
533 def num_rows(self):
534 """Union[int, None]: The number of rows in the table (:data:`None`
535 until set from the server).
536 """
537 return _helpers._int_or_none(
538 self._properties.get(self._PROPERTY_TO_API_FIELD["num_rows"])
539 )
541 @property
542 def self_link(self):
543 """Union[str, None]: URL for the table resource (:data:`None` until set
544 from the server).
545 """
546 return self._properties.get(self._PROPERTY_TO_API_FIELD["self_link"])
548 @property
549 def full_table_id(self):
550 """Union[str, None]: ID for the table (:data:`None` until set from the
551 server).
553 In the format ``project-id:dataset_id.table_id``.
554 """
555 return self._properties.get(self._PROPERTY_TO_API_FIELD["full_table_id"])
557 @property
558 def table_type(self):
559 """Union[str, None]: The type of the table (:data:`None` until set from
560 the server).
562 Possible values are ``'TABLE'``, ``'VIEW'``, ``'MATERIALIZED_VIEW'`` or
563 ``'EXTERNAL'``.
564 """
565 return self._properties.get(self._PROPERTY_TO_API_FIELD["type"])
567 @property
568 def range_partitioning(self):
569 """Optional[google.cloud.bigquery.table.RangePartitioning]:
570 Configures range-based partitioning for a table.
572 .. note::
573 **Beta**. The integer range partitioning feature is in a
574 pre-release state and might change or have limited support.
576 Only specify at most one of
577 :attr:`~google.cloud.bigquery.table.Table.time_partitioning` or
578 :attr:`~google.cloud.bigquery.table.Table.range_partitioning`.
580 Raises:
581 ValueError:
582 If the value is not
583 :class:`~google.cloud.bigquery.table.RangePartitioning` or
584 :data:`None`.
585 """
586 resource = self._properties.get(
587 self._PROPERTY_TO_API_FIELD["range_partitioning"]
588 )
589 if resource is not None:
590 return RangePartitioning(_properties=resource)
592 @range_partitioning.setter
593 def range_partitioning(self, value):
594 resource = value
595 if isinstance(value, RangePartitioning):
596 resource = value._properties
597 elif value is not None:
598 raise ValueError(
599 "Expected value to be RangePartitioning or None, got {}.".format(value)
600 )
601 self._properties[self._PROPERTY_TO_API_FIELD["range_partitioning"]] = resource
603 @property
604 def time_partitioning(self):
605 """Optional[google.cloud.bigquery.table.TimePartitioning]: Configures time-based
606 partitioning for a table.
608 Only specify at most one of
609 :attr:`~google.cloud.bigquery.table.Table.time_partitioning` or
610 :attr:`~google.cloud.bigquery.table.Table.range_partitioning`.
612 Raises:
613 ValueError:
614 If the value is not
615 :class:`~google.cloud.bigquery.table.TimePartitioning` or
616 :data:`None`.
617 """
618 prop = self._properties.get(self._PROPERTY_TO_API_FIELD["time_partitioning"])
619 if prop is not None:
620 return TimePartitioning.from_api_repr(prop)
622 @time_partitioning.setter
623 def time_partitioning(self, value):
624 api_repr = value
625 if isinstance(value, TimePartitioning):
626 api_repr = value.to_api_repr()
627 elif value is not None:
628 raise ValueError(
629 "value must be google.cloud.bigquery.table.TimePartitioning " "or None"
630 )
631 self._properties[self._PROPERTY_TO_API_FIELD["time_partitioning"]] = api_repr
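# Illustrative sketch, not part of the library source: daily time partitioning
# on a column, with a 90-day partition expiration; "transaction_date" is a
# placeholder column name.
#
#   >>> from google.cloud.bigquery import TimePartitioning, TimePartitioningType
#   >>> table.time_partitioning = TimePartitioning(
#   ...     type_=TimePartitioningType.DAY,
#   ...     field="transaction_date",
#   ...     expiration_ms=90 * 24 * 60 * 60 * 1000,
#   ... )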
633 @property
634 def partitioning_type(self):
635 """Union[str, None]: Time partitioning of the table if it is
636 partitioned (Defaults to :data:`None`).
638 """
639 warnings.warn(
640 "This method will be deprecated in future versions. Please use "
641 "Table.time_partitioning.type_ instead.",
642 PendingDeprecationWarning,
643 stacklevel=2,
644 )
645 if self.time_partitioning is not None:
646 return self.time_partitioning.type_
648 @partitioning_type.setter
649 def partitioning_type(self, value):
650 warnings.warn(
651 "This method will be deprecated in future versions. Please use "
652 "Table.time_partitioning.type_ instead.",
653 PendingDeprecationWarning,
654 stacklevel=2,
655 )
656 api_field = self._PROPERTY_TO_API_FIELD["partitioning_type"]
657 if self.time_partitioning is None:
658 self._properties[api_field] = {}
659 self._properties[api_field]["type"] = value
661 @property
662 def partition_expiration(self):
663 """Union[int, None]: Expiration time in milliseconds for a partition.
665 If :attr:`partition_expiration` is set and :attr:`type_` is
666 not set, :attr:`type_` will default to
667 :attr:`~google.cloud.bigquery.table.TimePartitioningType.DAY`.
668 """
669 warnings.warn(
670 "This method will be deprecated in future versions. Please use "
671 "Table.time_partitioning.expiration_ms instead.",
672 PendingDeprecationWarning,
673 stacklevel=2,
674 )
675 if self.time_partitioning is not None:
676 return self.time_partitioning.expiration_ms
678 @partition_expiration.setter
679 def partition_expiration(self, value):
680 warnings.warn(
681 "This method will be deprecated in future versions. Please use "
682 "Table.time_partitioning.expiration_ms instead.",
683 PendingDeprecationWarning,
684 stacklevel=2,
685 )
686 api_field = self._PROPERTY_TO_API_FIELD["partition_expiration"]
688 if self.time_partitioning is None:
689 self._properties[api_field] = {"type": TimePartitioningType.DAY}
690 self._properties[api_field]["expirationMs"] = str(value)
692 @property
693 def clustering_fields(self):
694 """Union[List[str], None]: Fields defining clustering for the table
696 (Defaults to :data:`None`).
698 Clustering fields are immutable after table creation.
700 .. note::
702 BigQuery supports clustering for both partitioned and
703 non-partitioned tables.
704 """
705 prop = self._properties.get(self._PROPERTY_TO_API_FIELD["clustering_fields"])
706 if prop is not None:
707 return list(prop.get("fields", ()))
709 @clustering_fields.setter
710 def clustering_fields(self, value):
711 """Union[List[str], None]: Fields defining clustering for the table
713 (Defaults to :data:`None`).
714 """
715 api_field = self._PROPERTY_TO_API_FIELD["clustering_fields"]
717 if value is not None:
718 prop = self._properties.setdefault(api_field, {})
719 prop["fields"] = value
720 else:
721 # In order to allow unsetting clustering fields completely, we explicitly
722 # set this property to None (as opposed to merely removing the key).
723 self._properties[api_field] = None
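# Illustrative sketch, not part of the library source: clustering fields are
# assigned as a list of column names and cleared by assigning None.
#
#   >>> table.clustering_fields = ["customer_id", "transaction_date"]
#   >>> table.clustering_fields = None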
725 @property
726 def description(self):
727 """Union[str, None]: Description of the table (defaults to
728 :data:`None`).
730 Raises:
731 ValueError: For invalid value types.
732 """
733 return self._properties.get(self._PROPERTY_TO_API_FIELD["description"])
735 @description.setter
736 def description(self, value):
737 if not isinstance(value, str) and value is not None:
738 raise ValueError("Pass a string, or None")
739 self._properties[self._PROPERTY_TO_API_FIELD["description"]] = value
741 @property
742 def expires(self):
743 """Union[datetime.datetime, None]: Datetime at which the table will be
744 deleted.
746 Raises:
747 ValueError: For invalid value types.
748 """
749 expiration_time = self._properties.get(self._PROPERTY_TO_API_FIELD["expires"])
750 if expiration_time is not None:
751 # expiration_time will be in milliseconds.
752 return google.cloud._helpers._datetime_from_microseconds(
753 1000.0 * float(expiration_time)
754 )
756 @expires.setter
757 def expires(self, value):
758 if not isinstance(value, datetime.datetime) and value is not None:
759 raise ValueError("Pass a datetime, or None")
760 value_ms = google.cloud._helpers._millis_from_datetime(value)
761 self._properties[
762 self._PROPERTY_TO_API_FIELD["expires"]
763 ] = _helpers._str_or_none(value_ms)
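# Illustrative sketch, not part of the library source: setting an absolute
# expiration time; a timezone-aware datetime avoids ambiguity.
#
#   >>> import datetime
#   >>> table.expires = datetime.datetime.now(
#   ...     tz=datetime.timezone.utc
#   ... ) + datetime.timedelta(days=7)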
765 @property
766 def friendly_name(self):
767 """Union[str, None]: Title of the table (defaults to :data:`None`).
769 Raises:
770 ValueError: For invalid value types.
771 """
772 return self._properties.get(self._PROPERTY_TO_API_FIELD["friendly_name"])
774 @friendly_name.setter
775 def friendly_name(self, value):
776 if not isinstance(value, str) and value is not None:
777 raise ValueError("Pass a string, or None")
778 self._properties[self._PROPERTY_TO_API_FIELD["friendly_name"]] = value
780 @property
781 def location(self):
782 """Union[str, None]: Location in which the table is hosted
784 Defaults to :data:`None`.
785 """
786 return self._properties.get(self._PROPERTY_TO_API_FIELD["location"])
788 @property
789 def view_query(self):
790 """Union[str, None]: SQL query defining the table as a view (defaults
791 to :data:`None`).
793 By default, the query is treated as Standard SQL. To use Legacy
794 SQL, set :attr:`view_use_legacy_sql` to :data:`True`.
796 Raises:
797 ValueError: For invalid value types.
798 """
799 api_field = self._PROPERTY_TO_API_FIELD["view_query"]
800 return _helpers._get_sub_prop(self._properties, [api_field, "query"])
802 @view_query.setter
803 def view_query(self, value):
804 if not isinstance(value, str):
805 raise ValueError("Pass a string")
807 api_field = self._PROPERTY_TO_API_FIELD["view_query"]
808 _helpers._set_sub_prop(self._properties, [api_field, "query"], value)
809 view = self._properties[api_field]
810 # The service defaults useLegacySql to True, but this
811 # client uses Standard SQL by default.
812 if view.get("useLegacySql") is None:
813 view["useLegacySql"] = False
815 @view_query.deleter
816 def view_query(self):
817 """Delete SQL query defining the table as a view."""
818 self._properties.pop(self._PROPERTY_TO_API_FIELD["view_query"], None)
820 view_use_legacy_sql = property(_view_use_legacy_sql_getter)
822 @view_use_legacy_sql.setter # type: ignore # (redefinition from above)
823 def view_use_legacy_sql(self, value):
824 if not isinstance(value, bool):
825 raise ValueError("Pass a boolean")
827 api_field = self._PROPERTY_TO_API_FIELD["view_query"]
828 if self._properties.get(api_field) is None:
829 self._properties[api_field] = {}
830 self._properties[api_field]["useLegacySql"] = value
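# Illustrative sketch, not part of the library source: defining a view; setting
# view_query defaults useLegacySql to False on the client side.
#
#   >>> view = Table("my-project.mydataset.myview")
#   >>> view.view_query = "SELECT name, age FROM `my-project.mydataset.mytable`"
#   >>> view.view_use_legacy_sql
#   False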
832 @property
833 def mview_query(self):
834 """Optional[str]: SQL query defining the table as a materialized
835 view (defaults to :data:`None`).
836 """
837 api_field = self._PROPERTY_TO_API_FIELD["mview_query"]
838 return _helpers._get_sub_prop(self._properties, [api_field, "query"])
840 @mview_query.setter
841 def mview_query(self, value):
842 api_field = self._PROPERTY_TO_API_FIELD["mview_query"]
843 _helpers._set_sub_prop(self._properties, [api_field, "query"], str(value))
845 @mview_query.deleter
846 def mview_query(self):
847 """Delete SQL query defining the table as a materialized view."""
848 self._properties.pop(self._PROPERTY_TO_API_FIELD["mview_query"], None)
850 @property
851 def mview_last_refresh_time(self):
852 """Optional[datetime.datetime]: Datetime at which the materialized view was last
853 refreshed (:data:`None` until set from the server).
854 """
855 refresh_time = _helpers._get_sub_prop(
856 self._properties, self._PROPERTY_TO_API_FIELD["mview_last_refresh_time"]
857 )
858 if refresh_time is not None:
859 # refresh_time will be in milliseconds.
860 return google.cloud._helpers._datetime_from_microseconds(
861 1000 * int(refresh_time)
862 )
864 @property
865 def mview_enable_refresh(self):
866 """Optional[bool]: Enable automatic refresh of the materialized view
867 when the base table is updated. The default value is :data:`True`.
868 """
869 api_field = self._PROPERTY_TO_API_FIELD["mview_enable_refresh"]
870 return _helpers._get_sub_prop(self._properties, [api_field, "enableRefresh"])
872 @mview_enable_refresh.setter
873 def mview_enable_refresh(self, value):
874 api_field = self._PROPERTY_TO_API_FIELD["mview_enable_refresh"]
875 return _helpers._set_sub_prop(
876 self._properties, [api_field, "enableRefresh"], value
877 )
879 @property
880 def mview_refresh_interval(self):
881 """Optional[datetime.timedelta]: The maximum frequency at which this
882 materialized view will be refreshed. The default value is 1800000
883 milliseconds (30 minutes).
884 """
885 api_field = self._PROPERTY_TO_API_FIELD["mview_refresh_interval"]
886 refresh_interval = _helpers._get_sub_prop(
887 self._properties, [api_field, "refreshIntervalMs"]
888 )
889 if refresh_interval is not None:
890 return datetime.timedelta(milliseconds=int(refresh_interval))
892 @mview_refresh_interval.setter
893 def mview_refresh_interval(self, value):
894 if value is None:
895 refresh_interval_ms = None
896 else:
897 refresh_interval_ms = str(value // datetime.timedelta(milliseconds=1))
899 api_field = self._PROPERTY_TO_API_FIELD["mview_refresh_interval"]
900 _helpers._set_sub_prop(
901 self._properties,
902 [api_field, "refreshIntervalMs"],
903 refresh_interval_ms,
904 )
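# Illustrative sketch, not part of the library source: configuring a
# materialized view with automatic refresh every hour; table and column names
# are placeholders.
#
#   >>> import datetime
#   >>> mview = Table("my-project.mydataset.my_mview")
#   >>> mview.mview_query = (
#   ...     "SELECT customer_id, SUM(amount) AS total "
#   ...     "FROM `my-project.mydataset.sales` GROUP BY customer_id"
#   ... )
#   >>> mview.mview_enable_refresh = True
#   >>> mview.mview_refresh_interval = datetime.timedelta(hours=1)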
906 @property
907 def streaming_buffer(self):
908 """google.cloud.bigquery.StreamingBuffer: Information about a table's
909 streaming buffer.
910 """
911 sb = self._properties.get(self._PROPERTY_TO_API_FIELD["streaming_buffer"])
912 if sb is not None:
913 return StreamingBuffer(sb)
915 @property
916 def external_data_configuration(self):
917 """Union[google.cloud.bigquery.ExternalConfig, None]: Configuration for
918 an external data source (defaults to :data:`None`).
920 Raises:
921 ValueError: For invalid value types.
922 """
923 prop = self._properties.get(
924 self._PROPERTY_TO_API_FIELD["external_data_configuration"]
925 )
926 if prop is not None:
927 prop = ExternalConfig.from_api_repr(prop)
928 return prop
930 @external_data_configuration.setter
931 def external_data_configuration(self, value):
932 if not (value is None or isinstance(value, ExternalConfig)):
933 raise ValueError("Pass an ExternalConfig or None")
934 api_repr = value
935 if value is not None:
936 api_repr = value.to_api_repr()
937 self._properties[
938 self._PROPERTY_TO_API_FIELD["external_data_configuration"]
939 ] = api_repr
941 @property
942 def snapshot_definition(self) -> Optional["SnapshotDefinition"]:
943 """Information about the snapshot. This value is set via snapshot creation.
945 See: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#Table.FIELDS.snapshot_definition
946 """
947 snapshot_info = self._properties.get(
948 self._PROPERTY_TO_API_FIELD["snapshot_definition"]
949 )
950 if snapshot_info is not None:
951 snapshot_info = SnapshotDefinition(snapshot_info)
952 return snapshot_info
954 @property
955 def clone_definition(self) -> Optional["CloneDefinition"]:
956 """Information about the clone. This value is set via clone creation.
958 See: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#Table.FIELDS.clone_definition
959 """
960 clone_info = self._properties.get(
961 self._PROPERTY_TO_API_FIELD["clone_definition"]
962 )
963 if clone_info is not None:
964 clone_info = CloneDefinition(clone_info)
965 return clone_info
967 @classmethod
968 def from_string(cls, full_table_id: str) -> "Table":
969 """Construct a table from fully-qualified table ID.
971 Args:
972 full_table_id (str):
973 A fully-qualified table ID in standard SQL format. Must
974 include a project ID, dataset ID, and table ID, each
975 separated by ``.``.
977 Returns:
978 Table: Table parsed from ``full_table_id``.
980 Examples:
981 >>> Table.from_string('my-project.mydataset.mytable')
982 Table(TableRef...(D...('my-project', 'mydataset'), 'mytable'))
984 Raises:
985 ValueError:
986 If ``full_table_id`` is not a fully-qualified table ID in
987 standard SQL format.
988 """
989 return cls(TableReference.from_string(full_table_id))
991 @classmethod
992 def from_api_repr(cls, resource: dict) -> "Table":
993 """Factory: construct a table given its API representation
995 Args:
996 resource (Dict[str, object]):
997 Table resource representation from the API
999 Returns:
1000 google.cloud.bigquery.table.Table: Table parsed from ``resource``.
1002 Raises:
1003 KeyError:
1004 If the ``resource`` lacks the key ``'tableReference'``, or if
1005 the ``dict`` stored within the key ``'tableReference'`` lacks
1006 the keys ``'tableId'``, ``'projectId'``, or ``'datasetId'``.
1007 """
1008 from google.cloud.bigquery import dataset
1010 if (
1011 "tableReference" not in resource
1012 or "tableId" not in resource["tableReference"]
1013 ):
1014 raise KeyError(
1015 "Resource lacks required identity information:"
1016 '["tableReference"]["tableId"]'
1017 )
1018 project_id = _helpers._get_sub_prop(
1019 resource, cls._PROPERTY_TO_API_FIELD["project"]
1020 )
1021 table_id = _helpers._get_sub_prop(
1022 resource, cls._PROPERTY_TO_API_FIELD["table_id"]
1023 )
1024 dataset_id = _helpers._get_sub_prop(
1025 resource, cls._PROPERTY_TO_API_FIELD["dataset_id"]
1026 )
1027 dataset_ref = dataset.DatasetReference(project_id, dataset_id)
1029 table = cls(dataset_ref.table(table_id))
1030 table._properties = resource
1032 return table
1034 def to_api_repr(self) -> dict:
1035 """Constructs the API resource of this table
1037 Returns:
1038 Dict[str, object]: Table represented as an API resource
1039 """
1040 return copy.deepcopy(self._properties)
1042 def to_bqstorage(self) -> str:
1043 """Construct a BigQuery Storage API representation of this table.
1045 Returns:
1046 str: A reference to this table in the BigQuery Storage API.
1047 """
1048 return self.reference.to_bqstorage()
1050 def _build_resource(self, filter_fields):
1051 """Generate a resource for ``update``."""
1052 return _helpers._build_resource_from_properties(self, filter_fields)
1054 def __repr__(self):
1055 return "Table({})".format(repr(self.reference))
1057 def __str__(self):
1058 return f"{self.project}.{self.dataset_id}.{self.table_id}"
1061class TableListItem(_TableBase):
1062 """A read-only table resource from a list operation.
1064 For performance reasons, the BigQuery API only includes some of the table
1065 properties when listing tables. Notably,
1066 :attr:`~google.cloud.bigquery.table.Table.schema` and
1067 :attr:`~google.cloud.bigquery.table.Table.num_rows` are missing.
1069 For a full list of the properties that the BigQuery API returns, see the
1070 `REST documentation for tables.list
1071 <https://cloud.google.com/bigquery/docs/reference/rest/v2/tables/list>`_.
1074 Args:
1075 resource (Dict[str, object]):
1076 A table-like resource object from a table list response. A
1077 ``tableReference`` property is required.
1079 Raises:
1080 ValueError:
1081 If ``tableReference`` or one of its required members is missing
1082 from ``resource``.
1083 """
1085 def __init__(self, resource):
1086 if "tableReference" not in resource:
1087 raise ValueError("resource must contain a tableReference value")
1088 if "projectId" not in resource["tableReference"]:
1089 raise ValueError(
1090 "resource['tableReference'] must contain a projectId value"
1091 )
1092 if "datasetId" not in resource["tableReference"]:
1093 raise ValueError(
1094 "resource['tableReference'] must contain a datasetId value"
1095 )
1096 if "tableId" not in resource["tableReference"]:
1097 raise ValueError("resource['tableReference'] must contain a tableId value")
1099 self._properties = resource
1101 @property
1102 def created(self):
1103 """Union[datetime.datetime, None]: Datetime at which the table was
1104 created (:data:`None` until set from the server).
1105 """
1106 creation_time = self._properties.get("creationTime")
1107 if creation_time is not None:
1108 # creation_time will be in milliseconds.
1109 return google.cloud._helpers._datetime_from_microseconds(
1110 1000.0 * float(creation_time)
1111 )
1113 @property
1114 def expires(self):
1115 """Union[datetime.datetime, None]: Datetime at which the table will be
1116 deleted.
1117 """
1118 expiration_time = self._properties.get("expirationTime")
1119 if expiration_time is not None:
1120 # expiration_time will be in milliseconds.
1121 return google.cloud._helpers._datetime_from_microseconds(
1122 1000.0 * float(expiration_time)
1123 )
1125 reference = property(_reference_getter)
1127 @property
1128 def labels(self):
1129 """Dict[str, str]: Labels for the table.
1131 This method always returns a dict. To change a table's labels,
1132 modify the dict, then call ``Client.update_table``. To delete a
1133 label, set its value to :data:`None` before updating.
1134 """
1135 return self._properties.setdefault("labels", {})
1137 @property
1138 def full_table_id(self):
1139 """Union[str, None]: ID for the table (:data:`None` until set from the
1140 server).
1142 In the format ``project_id:dataset_id.table_id``.
1143 """
1144 return self._properties.get("id")
1146 @property
1147 def table_type(self):
1148 """Union[str, None]: The type of the table (:data:`None` until set from
1149 the server).
1151 Possible values are ``'TABLE'``, ``'VIEW'``, or ``'EXTERNAL'``.
1152 """
1153 return self._properties.get("type")
1155 @property
1156 def time_partitioning(self):
1157 """google.cloud.bigquery.table.TimePartitioning: Configures time-based
1158 partitioning for a table.
1159 """
1160 prop = self._properties.get("timePartitioning")
1161 if prop is not None:
1162 return TimePartitioning.from_api_repr(prop)
1164 @property
1165 def partitioning_type(self):
1166 """Union[str, None]: Time partitioning of the table if it is
1167 partitioned (Defaults to :data:`None`).
1168 """
1169 warnings.warn(
1170 "This method will be deprecated in future versions. Please use "
1171 "TableListItem.time_partitioning.type_ instead.",
1172 PendingDeprecationWarning,
1173 stacklevel=2,
1174 )
1175 if self.time_partitioning is not None:
1176 return self.time_partitioning.type_
1178 @property
1179 def partition_expiration(self):
1180 """Union[int, None]: Expiration time in milliseconds for a partition.
1182 If this property is set and :attr:`type_` is not set, :attr:`type_`
1183 will default to :attr:`TimePartitioningType.DAY`.
1184 """
1185 warnings.warn(
1186 "This method will be deprecated in future versions. Please use "
1187 "TableListItem.time_partitioning.expiration_ms instead.",
1188 PendingDeprecationWarning,
1189 stacklevel=2,
1190 )
1191 if self.time_partitioning is not None:
1192 return self.time_partitioning.expiration_ms
1194 @property
1195 def friendly_name(self):
1196 """Union[str, None]: Title of the table (defaults to :data:`None`)."""
1197 return self._properties.get("friendlyName")
1199 view_use_legacy_sql = property(_view_use_legacy_sql_getter)
1201 @property
1202 def clustering_fields(self):
1203 """Union[List[str], None]: Fields defining clustering for the table
1205 (Defaults to :data:`None`).
1207 Clustering fields are immutable after table creation.
1209 .. note::
1211 BigQuery supports clustering for both partitioned and
1212 non-partitioned tables.
1213 """
1214 prop = self._properties.get("clustering")
1215 if prop is not None:
1216 return list(prop.get("fields", ()))
1218 @classmethod
1219 def from_string(cls, full_table_id: str) -> "TableListItem":
1220 """Construct a table from fully-qualified table ID.
1222 Args:
1223 full_table_id (str):
1224 A fully-qualified table ID in standard SQL format. Must
1225 include a project ID, dataset ID, and table ID, each
1226 separated by ``.``.
1228 Returns:
1229 Table: Table parsed from ``full_table_id``.
1231 Examples:
1232 >>> Table.from_string('my-project.mydataset.mytable')
1233 Table(TableRef...(D...('my-project', 'mydataset'), 'mytable'))
1235 Raises:
1236 ValueError:
1237 If ``full_table_id`` is not a fully-qualified table ID in
1238 standard SQL format.
1239 """
1240 return cls(
1241 {"tableReference": TableReference.from_string(full_table_id).to_api_repr()}
1242 )
1244 def to_bqstorage(self) -> str:
1245 """Construct a BigQuery Storage API representation of this table.
1247 Returns:
1248 str: A reference to this table in the BigQuery Storage API.
1249 """
1250 return self.reference.to_bqstorage()
1252 def to_api_repr(self) -> dict:
1253 """Constructs the API resource of this table
1255 Returns:
1256 Dict[str, object]: Table represented as an API resource
1257 """
1258 return copy.deepcopy(self._properties)
1261def _row_from_mapping(mapping, schema):
1262 """Convert a mapping to a row tuple using the schema.
1264 Args:
1265 mapping (Dict[str, object]):
1266 Mapping of row data: must contain keys for all required fields in
1267 the schema. Keys which do not correspond to a field in the schema
1268 are ignored.
1269 schema (List[google.cloud.bigquery.schema.SchemaField]):
1270 The schema of the table destination for the rows
1272 Returns:
1273 Tuple[object]:
1274 Tuple whose elements are ordered according to the schema.
1276 Raises:
1277 ValueError: If schema is empty.
1278 """
1279 if len(schema) == 0:
1280 raise ValueError(_TABLE_HAS_NO_SCHEMA)
1282 row = []
1283 for field in schema:
1284 if field.mode == "REQUIRED":
1285 row.append(mapping[field.name])
1286 elif field.mode == "REPEATED":
1287 row.append(mapping.get(field.name, ()))
1288 elif field.mode == "NULLABLE":
1289 row.append(mapping.get(field.name))
1290 else:
1291 raise ValueError("Unknown field mode: {}".format(field.mode))
1292 return tuple(row)
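# Illustrative sketch, not part of the library source: ordering a mapping's
# values according to a schema.
#
#   >>> from google.cloud.bigquery import SchemaField
#   >>> schema = [
#   ...     SchemaField("full_name", "STRING", mode="REQUIRED"),
#   ...     SchemaField("age", "INTEGER", mode="NULLABLE"),
#   ... ]
#   >>> _row_from_mapping({"full_name": "Phred", "age": 33}, schema)
#   ('Phred', 33)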
1295class StreamingBuffer(object):
1296 """Information about a table's streaming buffer.
1298 See https://cloud.google.com/bigquery/streaming-data-into-bigquery.
1300 Args:
1301 resource (Dict[str, object]):
1302 streaming buffer representation returned from the API
1303 """
1305 def __init__(self, resource):
1306 self.estimated_bytes = None
1307 if "estimatedBytes" in resource:
1308 self.estimated_bytes = int(resource["estimatedBytes"])
1309 self.estimated_rows = None
1310 if "estimatedRows" in resource:
1311 self.estimated_rows = int(resource["estimatedRows"])
1312 self.oldest_entry_time = None
1313 if "oldestEntryTime" in resource:
1314 self.oldest_entry_time = google.cloud._helpers._datetime_from_microseconds(
1315 1000.0 * int(resource["oldestEntryTime"])
1316 )
1319class SnapshotDefinition:
1320 """Information about base table and snapshot time of the snapshot.
1322 See https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#snapshotdefinition
1324 Args:
1325 resource: Snapshot definition representation returned from the API.
1326 """
1328 def __init__(self, resource: Dict[str, Any]):
1329 self.base_table_reference = None
1330 if "baseTableReference" in resource:
1331 self.base_table_reference = TableReference.from_api_repr(
1332 resource["baseTableReference"]
1333 )
1335 self.snapshot_time = None
1336 if "snapshotTime" in resource:
1337 self.snapshot_time = google.cloud._helpers._rfc3339_to_datetime(
1338 resource["snapshotTime"]
1339 )
1342class CloneDefinition:
1343 """Information about base table and clone time of the clone.
1345 See https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#clonedefinition
1347 Args:
1348 resource: Clone definition representation returned from the API.
1349 """
1351 def __init__(self, resource: Dict[str, Any]):
1352 self.base_table_reference = None
1353 if "baseTableReference" in resource:
1354 self.base_table_reference = TableReference.from_api_repr(
1355 resource["baseTableReference"]
1356 )
1358 self.clone_time = None
1359 if "cloneTime" in resource:
1360 self.clone_time = google.cloud._helpers._rfc3339_to_datetime(
1361 resource["cloneTime"]
1362 )
1365class Row(object):
1366 """A BigQuery row.
1368 Values can be accessed by position (index), by key like a dict,
1369 or as properties.
1371 Args:
1372 values (Sequence[object]): The row values
1373 field_to_index (Dict[str, int]):
1374 A mapping from schema field names to indexes
1375 """
1377 # Choose unusual field names to try to avoid conflict with schema fields.
1378 __slots__ = ("_xxx_values", "_xxx_field_to_index")
1380 def __init__(self, values, field_to_index) -> None:
1381 self._xxx_values = values
1382 self._xxx_field_to_index = field_to_index
1384 def values(self):
1385 """Return the values included in this row.
1387 Returns:
1388 Sequence[object]: A sequence of length ``len(row)``.
1389 """
1390 return copy.deepcopy(self._xxx_values)
1392 def keys(self) -> Iterable[str]:
1393 """Return the keys for using a row as a dict.
1395 Returns:
1396 Iterable[str]: The keys corresponding to the columns of a row
1398 Examples:
1400 >>> list(Row(('a', 'b'), {'x': 0, 'y': 1}).keys())
1401 ['x', 'y']
1402 """
1403 return self._xxx_field_to_index.keys()
1405 def items(self) -> Iterable[Tuple[str, Any]]:
1406 """Return items as ``(key, value)`` pairs.
1408 Returns:
1409 Iterable[Tuple[str, object]]:
1410 The ``(key, value)`` pairs representing this row.
1412 Examples:
1414 >>> list(Row(('a', 'b'), {'x': 0, 'y': 1}).items())
1415 [('x', 'a'), ('y', 'b')]
1416 """
1417 for key, index in self._xxx_field_to_index.items():
1418 yield (key, copy.deepcopy(self._xxx_values[index]))
1420 def get(self, key: str, default: Any = None) -> Any:
1421 """Return a value for key, with a default value if it does not exist.
1423 Args:
1424 key (str): The key of the column to access
1425 default (object):
1426 The default value to use if the key does not exist. (Defaults
1427 to :data:`None`.)
1429 Returns:
1430 object:
1431 The value associated with the provided key, or a default value.
1433 Examples:
1434 When the key exists, the value associated with it is returned.
1436 >>> Row(('a', 'b'), {'x': 0, 'y': 1}).get('x')
1437 'a'
1439 The default value is :data:`None` when the key does not exist.
1441 >>> Row(('a', 'b'), {'x': 0, 'y': 1}).get('z')
1442 None
1444 The default value can be overridden with the ``default`` parameter.
1446 >>> Row(('a', 'b'), {'x': 0, 'y': 1}).get('z', '')
1447 ''
1449 >>> Row(('a', 'b'), {'x': 0, 'y': 1}).get('z', default = '')
1450 ''
1451 """
1452 index = self._xxx_field_to_index.get(key)
1453 if index is None:
1454 return default
1455 return self._xxx_values[index]
1457 def __getattr__(self, name):
1458 value = self._xxx_field_to_index.get(name)
1459 if value is None:
1460 raise AttributeError("no row field {!r}".format(name))
1461 return self._xxx_values[value]
1463 def __len__(self):
1464 return len(self._xxx_values)
1466 def __getitem__(self, key):
1467 if isinstance(key, str):
1468 value = self._xxx_field_to_index.get(key)
1469 if value is None:
1470 raise KeyError("no row field {!r}".format(key))
1471 key = value
1472 return self._xxx_values[key]
1474 def __eq__(self, other):
1475 if not isinstance(other, Row):
1476 return NotImplemented
1477 return (
1478 self._xxx_values == other._xxx_values
1479 and self._xxx_field_to_index == other._xxx_field_to_index
1480 )
1482 def __ne__(self, other):
1483 return not self == other
1485 def __repr__(self):
1486 # sort field dict by value, for determinism
1487 items = sorted(self._xxx_field_to_index.items(), key=operator.itemgetter(1))
1488 f2i = "{" + ", ".join("%r: %d" % item for item in items) + "}"
1489 return "Row({}, {})".format(self._xxx_values, f2i)
1492class _NoopProgressBarQueue(object):
1493 """A fake Queue class that does nothing.
1495 This is used when there is no progress bar to send updates to.
1496 """
1498 def put_nowait(self, item):
1499 """Don't actually do anything with the item."""
1502class RowIterator(HTTPIterator):
1503 """A class for iterating through HTTP/JSON API row list responses.
1505 Args:
1506 client (Optional[google.cloud.bigquery.Client]):
1507 The API client instance. This should always be non-`None`, except for
1508 subclasses that do not use it, namely the ``_EmptyRowIterator``.
1509 api_request (Callable[google.cloud._http.JSONConnection.api_request]):
1510 The function to use to make API requests.
1511 path (str): The method path to query for the list of items.
1512 schema (Sequence[Union[ \
1513 :class:`~google.cloud.bigquery.schema.SchemaField`, \
1514 Mapping[str, Any] \
1515 ]]):
1516 The table's schema. If any item is a mapping, its content must be
1517 compatible with
1518 :meth:`~google.cloud.bigquery.schema.SchemaField.from_api_repr`.
1519 page_token (str): A token identifying a page in a result set to start
1520 fetching results from.
1521 max_results (Optional[int]): The maximum number of results to fetch.
1522 page_size (Optional[int]): The maximum number of rows in each page
1523 of results from this request. Non-positive values are ignored.
1524 Defaults to a sensible value set by the API.
1525 extra_params (Optional[Dict[str, object]]):
1526 Extra query string parameters for the API call.
1527 table (Optional[Union[ \
1528 google.cloud.bigquery.table.Table, \
1529 google.cloud.bigquery.table.TableReference, \
1530 ]]):
1531 The table which these rows belong to, or a reference to it. Used to
1532 call the BigQuery Storage API to fetch rows.
1533 selected_fields (Optional[Sequence[google.cloud.bigquery.schema.SchemaField]]):
1534 A subset of columns to select from this table.
1535 total_rows (Optional[int]):
1536 Total number of rows in the table.
1537 first_page_response (Optional[dict]):
1538 API response for the first page of results. These are returned when
1539 the first page is requested.
1540 """
1542 def __init__(
1543 self,
1544 client,
1545 api_request,
1546 path,
1547 schema,
1548 page_token=None,
1549 max_results=None,
1550 page_size=None,
1551 extra_params=None,
1552 table=None,
1553 selected_fields=None,
1554 total_rows=None,
1555 first_page_response=None,
1556 ):
1557 super(RowIterator, self).__init__(
1558 client,
1559 api_request,
1560 path,
1561 item_to_value=_item_to_row,
1562 items_key="rows",
1563 page_token=page_token,
1564 max_results=max_results,
1565 extra_params=extra_params,
1566 page_start=_rows_page_start,
1567 next_token="pageToken",
1568 )
1569 schema = _to_schema_fields(schema)
1570 self._field_to_index = _helpers._field_to_index_mapping(schema)
1571 self._page_size = page_size
1572 self._preserve_order = False
1573 self._project = client.project if client is not None else None
1574 self._schema = schema
1575 self._selected_fields = selected_fields
1576 self._table = table
1577 self._total_rows = total_rows
1578 self._first_page_response = first_page_response
1580 def _is_completely_cached(self):
1581 """Check if all results are completely cached.
1583 This is useful to know, because we can avoid alternative download
1584 mechanisms.
1585 """
1586 if self._first_page_response is None or self.next_page_token:
1587 return False
1589 return self._first_page_response.get(self._next_token) is None
1591 def _validate_bqstorage(self, bqstorage_client, create_bqstorage_client):
1592 """Returns if the BigQuery Storage API can be used.
1594 Returns:
1595 bool
1596 True if the BigQuery Storage client can be used or created.
1597 """
1598 using_bqstorage_api = bqstorage_client or create_bqstorage_client
1599 if not using_bqstorage_api:
1600 return False
1602 if self._is_completely_cached():
1603 return False
1605 if self.max_results is not None:
1606 return False
1608 try:
1609 from google.cloud import bigquery_storage # noqa: F401
1610 except ImportError:
1611 return False
1613 try:
1614 _helpers.BQ_STORAGE_VERSIONS.verify_version()
1615 except LegacyBigQueryStorageError as exc:
1616 warnings.warn(str(exc))
1617 return False
1619 return True
1621 def _get_next_page_response(self):
1622 """Requests the next page from the path provided.
1624 Returns:
1625 Dict[str, object]:
1626 The parsed JSON response of the next page's contents.
1627 """
1628 if self._first_page_response:
1629 response = self._first_page_response
1630 self._first_page_response = None
1631 return response
1633 params = self._get_query_params()
1634 if self._page_size is not None:
1635 if self.page_number and "startIndex" in params:
1636 del params["startIndex"]
1637 params["maxResults"] = self._page_size
1638 return self.api_request(
1639 method=self._HTTP_METHOD, path=self.path, query_params=params
1640 )
1642 @property
1643 def schema(self):
1644 """List[google.cloud.bigquery.schema.SchemaField]: The subset of
1645 columns to be read from the table."""
1646 return list(self._schema)
1648 @property
1649 def total_rows(self):
1650 """int: The total number of rows in the table."""
1651 return self._total_rows
1653 def _maybe_warn_max_results(
1654 self,
1655 bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"],
1656 ):
1657 """Issue a warning if BQ Storage client is not ``None`` with ``max_results`` set.
1659 This helper method should be used directly in the relevant top-level public
1660 methods, so that the warning is issued for the correct line in user code.
1662 Args:
1663 bqstorage_client:
1664 The BigQuery Storage client intended to use for downloading result rows.
1665 """
1666 if bqstorage_client is not None and self.max_results is not None:
1667 warnings.warn(
1668 "Cannot use bqstorage_client if max_results is set, "
1669 "reverting to fetching data with the REST endpoint.",
1670 stacklevel=3,
1671 )
1673 def _to_page_iterable(
1674 self, bqstorage_download, tabledata_list_download, bqstorage_client=None
1675 ):
1676 if not self._validate_bqstorage(bqstorage_client, False):
1677 bqstorage_client = None
1679 result_pages = (
1680 bqstorage_download()
1681 if bqstorage_client is not None
1682 else tabledata_list_download()
1683 )
1684 yield from result_pages
1686 def to_arrow_iterable(
1687 self,
1688 bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
1689 max_queue_size: int = _pandas_helpers._MAX_QUEUE_SIZE_DEFAULT, # type: ignore
1690 ) -> Iterator["pyarrow.RecordBatch"]:
1691 """[Beta] Create an iterable of class:`pyarrow.RecordBatch`, to process the table as a stream.
1693 Args:
1694 bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]):
1695 A BigQuery Storage API client. If supplied, use the faster
1696 BigQuery Storage API to fetch rows from BigQuery.
1698 This method requires the ``pyarrow`` and
1699 ``google-cloud-bigquery-storage`` libraries.
1701 This method only exposes a subset of the capabilities of the
1702 BigQuery Storage API. For full access to all features
1703 (projections, filters, snapshots) use the Storage API directly.
1705 max_queue_size (Optional[int]):
1706 The maximum number of result pages to hold in the internal queue when
1707 streaming query results over the BigQuery Storage API. Ignored if
1708 Storage API is not used.
1710 By default, the max queue size is set to the number of BQ Storage streams
1711 created by the server. If ``max_queue_size`` is :data:`None`, the queue
1712 size is infinite.
1714 Returns:
1715 Iterator[pyarrow.RecordBatch]:
1716 A generator of :class:`~pyarrow.RecordBatch`.
1718 .. versionadded:: 2.31.0
1719 """
1720 self._maybe_warn_max_results(bqstorage_client)
1722 bqstorage_download = functools.partial(
1723 _pandas_helpers.download_arrow_bqstorage,
1724 self._project,
1725 self._table,
1726 bqstorage_client,
1727 preserve_order=self._preserve_order,
1728 selected_fields=self._selected_fields,
1729 max_queue_size=max_queue_size,
1730 )
1731 tabledata_list_download = functools.partial(
1732 _pandas_helpers.download_arrow_row_iterator, iter(self.pages), self.schema
1733 )
1734 return self._to_page_iterable(
1735 bqstorage_download,
1736 tabledata_list_download,
1737 bqstorage_client=bqstorage_client,
1738 )
1740 # If changing the signature of this method, make sure to apply the same
1741 # changes to job.QueryJob.to_arrow()
1742 def to_arrow(
1743 self,
1744 progress_bar_type: str = None,
1745 bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
1746 create_bqstorage_client: bool = True,
1747 ) -> "pyarrow.Table":
1748 """[Beta] Create a class:`pyarrow.Table` by loading all pages of a
1749 table or query.
1751 Args:
1752 progress_bar_type (Optional[str]):
1753 If set, use the `tqdm <https://tqdm.github.io/>`_ library to
1754 display a progress bar while the data downloads. Install the
1755 ``tqdm`` package to use this feature.
1757 Possible values of ``progress_bar_type`` include:
1759 ``None``
1760 No progress bar.
1761 ``'tqdm'``
1762 Use the :func:`tqdm.tqdm` function to print a progress bar
1763 to :data:`sys.stdout`.
1764 ``'tqdm_notebook'``
1765 Use the :func:`tqdm.notebook.tqdm` function to display a
1766 progress bar as a Jupyter notebook widget.
1767 ``'tqdm_gui'``
1768 Use the :func:`tqdm.tqdm_gui` function to display a
1769 progress bar as a graphical dialog box.
1770 bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]):
1771 A BigQuery Storage API client. If supplied, use the faster BigQuery
1772 Storage API to fetch rows from BigQuery. This API is a billable API.
1774 This method requires the ``google-cloud-bigquery-storage`` library.
1776 This method only exposes a subset of the capabilities of the
1777 BigQuery Storage API. For full access to all features
1778 (projections, filters, snapshots) use the Storage API directly.
1779 create_bqstorage_client (Optional[bool]):
1780 If ``True`` (default), create a BigQuery Storage API client using
1781 the default API settings. The BigQuery Storage API is a faster way
1782 to fetch rows from BigQuery. See the ``bqstorage_client`` parameter
1783 for more information.
1785 This argument does nothing if ``bqstorage_client`` is supplied.
1787 .. versionadded:: 1.24.0
1789 Returns:
1790 pyarrow.Table
1791 A :class:`pyarrow.Table` populated with row data and column
1792 headers from the query results. The column headers are derived
1793 from the destination table's schema.
1795 Raises:
1796 ValueError: If the :mod:`pyarrow` library cannot be imported.
1799 .. versionadded:: 1.17.0
1800 """
1801 if pyarrow is None:
1802 raise ValueError(_NO_PYARROW_ERROR)
1804 self._maybe_warn_max_results(bqstorage_client)
1806 if not self._validate_bqstorage(bqstorage_client, create_bqstorage_client):
1807 create_bqstorage_client = False
1808 bqstorage_client = None
1810 owns_bqstorage_client = False
1811 if not bqstorage_client and create_bqstorage_client:
1812 bqstorage_client = self.client._ensure_bqstorage_client()
1813 owns_bqstorage_client = bqstorage_client is not None
1815 try:
1816 progress_bar = get_progress_bar(
1817 progress_bar_type, "Downloading", self.total_rows, "rows"
1818 )
1820 record_batches = []
1821 for record_batch in self.to_arrow_iterable(
1822 bqstorage_client=bqstorage_client
1823 ):
1824 record_batches.append(record_batch)
1826 if progress_bar is not None:
1827 # In some cases, the number of total rows is not populated
1828 # until the first page of rows is fetched. Update the
1829 # progress bar's total to keep an accurate count.
1830 progress_bar.total = progress_bar.total or self.total_rows
1831 progress_bar.update(record_batch.num_rows)
1833 if progress_bar is not None:
1834 # Indicate that the download has finished.
1835 progress_bar.close()
1836 finally:
1837 if owns_bqstorage_client:
1838 bqstorage_client._transport.grpc_channel.close() # type: ignore
1840 if record_batches and bqstorage_client is not None:
1841 return pyarrow.Table.from_batches(record_batches)
1842 else:
1843 # Either there are no records (record_batches is empty), so we build
1844 # the schema from the BigQuery schema,
1845 # **or** we used the REST API (bqstorage_client is None),
1846 # which doesn't add arrow extension metadata, so we let
1847 # `bq_to_arrow_schema` do it.
1848 arrow_schema = _pandas_helpers.bq_to_arrow_schema(self._schema)
1849 return pyarrow.Table.from_batches(record_batches, schema=arrow_schema)
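# A minimal sketch of to_arrow(), assuming pyarrow is installed; the table ID
# is a placeholder and tqdm is only needed if a progress bar is requested.
#
#     from google.cloud import bigquery
#
#     client = bigquery.Client()
#     rows = client.list_rows("my-project.my_dataset.my_table")
#     arrow_table = rows.to_arrow(progress_bar_type="tqdm")
#     print(arrow_table.schema)
#     print(arrow_table.num_rows)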
1851 def to_dataframe_iterable(
1852 self,
1853 bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
1854 dtypes: Dict[str, Any] = None,
1855 max_queue_size: int = _pandas_helpers._MAX_QUEUE_SIZE_DEFAULT, # type: ignore
1856 ) -> "pandas.DataFrame":
1857 """Create an iterable of pandas DataFrames, to process the table as a stream.
1859 Args:
1860 bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]):
1861 A BigQuery Storage API client. If supplied, use the faster
1862 BigQuery Storage API to fetch rows from BigQuery.
1864 This method requires the ``google-cloud-bigquery-storage`` library.
1866 This method only exposes a subset of the capabilities of the
1867 BigQuery Storage API. For full access to all features
1868 (projections, filters, snapshots) use the Storage API directly.
1870 dtypes (Optional[Map[str, Union[str, pandas.Series.dtype]]]):
1871 A dictionary of column names and pandas ``dtype``s. The provided
1872 ``dtype`` is used when constructing the series for the column
1873 specified. Otherwise, the default pandas behavior is used.
1875 max_queue_size (Optional[int]):
1876 The maximum number of result pages to hold in the internal queue when
1877 streaming query results over the BigQuery Storage API. Ignored if
1878 Storage API is not used.
1880 By default, the max queue size is set to the number of BQ Storage streams
1881 created by the server. If ``max_queue_size`` is :data:`None`, the queue
1882 size is infinite.
1884 .. versionadded:: 2.14.0
1886 Returns:
1887 pandas.DataFrame:
1888 A generator of :class:`~pandas.DataFrame`.
1890 Raises:
1891 ValueError:
1892 If the :mod:`pandas` library cannot be imported.
1893 """
1894 _pandas_helpers.verify_pandas_imports()
1896 if dtypes is None:
1897 dtypes = {}
1899 self._maybe_warn_max_results(bqstorage_client)
1901 column_names = [field.name for field in self._schema]
1902 bqstorage_download = functools.partial(
1903 _pandas_helpers.download_dataframe_bqstorage,
1904 self._project,
1905 self._table,
1906 bqstorage_client,
1907 column_names,
1908 dtypes,
1909 preserve_order=self._preserve_order,
1910 selected_fields=self._selected_fields,
1911 max_queue_size=max_queue_size,
1912 )
1913 tabledata_list_download = functools.partial(
1914 _pandas_helpers.download_dataframe_row_iterator,
1915 iter(self.pages),
1916 self.schema,
1917 dtypes,
1918 )
1919 return self._to_page_iterable(
1920 bqstorage_download,
1921 tabledata_list_download,
1922 bqstorage_client=bqstorage_client,
1923 )
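# A minimal sketch of to_dataframe_iterable(), streaming pages as DataFrames;
# the table ID is a placeholder and the "age" dtype override is illustrative.
#
#     from google.cloud import bigquery
#
#     client = bigquery.Client()
#     rows = client.list_rows("my-project.my_dataset.my_table")
#     for df in rows.to_dataframe_iterable(dtypes={"age": "Int64"}):
#         # Each df holds one page of rows, so the whole table never needs to
#         # be resident in memory at once.
#         print(len(df.index))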
1925 # If changing the signature of this method, make sure to apply the same
1926 # changes to job.QueryJob.to_dataframe()
1927 def to_dataframe(
1928 self,
1929 bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
1930 dtypes: Dict[str, Any] = None,
1931 progress_bar_type: str = None,
1932 create_bqstorage_client: bool = True,
1933 geography_as_object: bool = False,
1934 bool_dtype: Union[Any, None] = DefaultPandasDTypes.BOOL_DTYPE,
1935 int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
1936 float_dtype: Union[Any, None] = None,
1937 string_dtype: Union[Any, None] = None,
1938 ) -> "pandas.DataFrame":
1939 """Create a pandas DataFrame by loading all pages of a query.
1941 Args:
1942 bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]):
1943 A BigQuery Storage API client. If supplied, use the faster
1944 BigQuery Storage API to fetch rows from BigQuery.
1946 This method requires the ``google-cloud-bigquery-storage`` library.
1948 This method only exposes a subset of the capabilities of the
1949 BigQuery Storage API. For full access to all features
1950 (projections, filters, snapshots) use the Storage API directly.
1952 dtypes (Optional[Map[str, Union[str, pandas.Series.dtype]]]):
1953 A dictionary of column names and pandas ``dtype``s. The provided
1954 ``dtype`` is used when constructing the series for the column
1955 specified. Otherwise, the default pandas behavior is used.
1956 progress_bar_type (Optional[str]):
1957 If set, use the `tqdm <https://tqdm.github.io/>`_ library to
1958 display a progress bar while the data downloads. Install the
1959 ``tqdm`` package to use this feature.
1961 Possible values of ``progress_bar_type`` include:
1963 ``None``
1964 No progress bar.
1965 ``'tqdm'``
1966 Use the :func:`tqdm.tqdm` function to print a progress bar
1967 to :data:`sys.stdout`.
1968 ``'tqdm_notebook'``
1969 Use the :func:`tqdm.notebook.tqdm` function to display a
1970 progress bar as a Jupyter notebook widget.
1971 ``'tqdm_gui'``
1972 Use the :func:`tqdm.tqdm_gui` function to display a
1973 progress bar as a graphical dialog box.
1975 .. versionadded:: 1.11.0
1977 create_bqstorage_client (Optional[bool]):
1978 If ``True`` (default), create a BigQuery Storage API client
1979 using the default API settings. The BigQuery Storage API
1980 is a faster way to fetch rows from BigQuery. See the
1981 ``bqstorage_client`` parameter for more information.
1983 This argument does nothing if ``bqstorage_client`` is supplied.
1985 .. versionadded:: 1.24.0
1987 geography_as_object (Optional[bool]):
1988 If ``True``, convert GEOGRAPHY data to :mod:`shapely`
1989 geometry objects. If ``False`` (default), don't cast
1990 geography data to :mod:`shapely` geometry objects.
1992 .. versionadded:: 2.24.0
1994 bool_dtype (Optional[pandas.Series.dtype, None]):
1995 If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``)
1996 to convert the BigQuery Boolean type, instead of relying on the default
1997 ``pandas.BooleanDtype()``. If you explicitly set the value to ``None``,
1998 then the data type will be ``numpy.dtype("bool")``. The BigQuery Boolean
1999 type is documented at:
2000 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
2002 .. versionadded:: 3.7.1
2004 int_dtype (Optional[pandas.Series.dtype, None]):
2005 If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
2006 to convert BigQuery Integer types, instead of relying on the default
2007 ``pandas.Int64Dtype()``. If you explicitly set the value to ``None``,
2008 then the data type will be ``numpy.dtype("int64")``. A list of BigQuery
2009 Integer types can be found at:
2010 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
2012 .. versionadded:: 3.7.1
2014 float_dtype (Optional[pandas.Series.dtype, None]):
2015 If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
2016 to convert the BigQuery Float type, instead of relying on the default
2017 ``numpy.dtype("float64")``. If you explicitly set the value to ``None``,
2018 then the data type will be ``numpy.dtype("float64")``. The BigQuery Float
2019 type is documented at:
2020 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
2022 .. versionadded:: 3.7.1
2024 string_dtype (Optional[pandas.Series.dtype, None]):
2025 If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
2026 convert the BigQuery String type, instead of relying on the default
2027 ``numpy.dtype("object")``. If you explicitly set the value to ``None``,
2028 then the data type will be ``numpy.dtype("object")``. The BigQuery String
2029 type is documented at:
2030 https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
2032 .. versionadded:: 3.7.1
2034 Returns:
2035 pandas.DataFrame:
2036 A :class:`~pandas.DataFrame` populated with row data and column
2037 headers from the query results. The column headers are derived
2038 from the destination table's schema.
2040 Raises:
2041 ValueError:
2042 If the :mod:`pandas` library cannot be imported, or
2043 the :mod:`google.cloud.bigquery_storage_v1` module is
2044 required but cannot be imported. Also if
2045 `geography_as_object` is `True`, but the
2046 :mod:`shapely` library cannot be imported. Also if
2047 `bool_dtype`, `int_dtype`, or another dtype parameter
2048 is not a supported dtype.
2050 """
2051 _pandas_helpers.verify_pandas_imports()
2053 if geography_as_object and shapely is None:
2054 raise ValueError(_NO_SHAPELY_ERROR)
2056 if bool_dtype is DefaultPandasDTypes.BOOL_DTYPE:
2057 bool_dtype = pandas.BooleanDtype()
2059 if int_dtype is DefaultPandasDTypes.INT_DTYPE:
2060 int_dtype = pandas.Int64Dtype()
2062 if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"):
2063 raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE)
2065 if int_dtype is not None and not hasattr(int_dtype, "__from_arrow__"):
2066 raise ValueError("int_dtype", _NO_SUPPORTED_DTYPE)
2068 if float_dtype is not None and not hasattr(float_dtype, "__from_arrow__"):
2069 raise ValueError("float_dtype", _NO_SUPPORTED_DTYPE)
2071 if string_dtype is not None and not hasattr(string_dtype, "__from_arrow__"):
2072 raise ValueError("string_dtype", _NO_SUPPORTED_DTYPE)
2074 if dtypes is None:
2075 dtypes = {}
2077 self._maybe_warn_max_results(bqstorage_client)
2079 if not self._validate_bqstorage(bqstorage_client, create_bqstorage_client):
2080 create_bqstorage_client = False
2081 bqstorage_client = None
2083 record_batch = self.to_arrow(
2084 progress_bar_type=progress_bar_type,
2085 bqstorage_client=bqstorage_client,
2086 create_bqstorage_client=create_bqstorage_client,
2087 )
2089 # When converting date or timestamp values to nanosecond precision, the result
2090 # can be out of pyarrow bounds. To avoid the error when converting to
2091 # Pandas, we set the date_as_object or timestamp_as_object parameter to True,
2092 # if necessary.
2093 date_as_object = not all(
2094 self.__can_cast_timestamp_ns(col)
2095 for col in record_batch
2096 # Type can be date32 or date64 (plus units).
2097 # See: https://arrow.apache.org/docs/python/api/datatypes.html
2098 if pyarrow.types.is_date(col.type)
2099 )
2101 timestamp_as_object = not all(
2102 self.__can_cast_timestamp_ns(col)
2103 for col in record_batch
2104 # Type can be datetime and timestamp (plus units and time zone).
2105 # See: https://arrow.apache.org/docs/python/api/datatypes.html
2106 if pyarrow.types.is_timestamp(col.type)
2107 )
2109 if len(record_batch) > 0:
2110 df = record_batch.to_pandas(
2111 date_as_object=date_as_object,
2112 timestamp_as_object=timestamp_as_object,
2113 integer_object_nulls=True,
2114 types_mapper=_pandas_helpers.default_types_mapper(
2115 date_as_object=date_as_object,
2116 bool_dtype=bool_dtype,
2117 int_dtype=int_dtype,
2118 float_dtype=float_dtype,
2119 string_dtype=string_dtype,
2120 ),
2121 )
2122 else:
2123 # Avoid "ValueError: need at least one array to concatenate" on
2124 # older versions of pandas when converting empty RecordBatch to
2125 # DataFrame. See: https://github.com/pandas-dev/pandas/issues/41241
2126 df = pandas.DataFrame([], columns=record_batch.schema.names)
2128 for column in dtypes:
2129 df[column] = pandas.Series(df[column], dtype=dtypes[column], copy=False)
2131 if geography_as_object:
2132 for field in self.schema:
2133 if field.field_type.upper() == "GEOGRAPHY" and field.mode != "REPEATED":
2134 df[field.name] = df[field.name].dropna().apply(_read_wkt)
2136 return df
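# A minimal sketch of to_dataframe(), assuming pandas is installed; the table
# ID is a placeholder, and the explicit dtype arguments are optional (the
# defaults described in the docstring apply otherwise).
#
#     import pandas
#     from google.cloud import bigquery
#
#     client = bigquery.Client()
#     rows = client.list_rows("my-project.my_dataset.my_table")
#     df = rows.to_dataframe(
#         create_bqstorage_client=False,  # force the REST API download path
#         bool_dtype=pandas.BooleanDtype(),
#         int_dtype=pandas.Int64Dtype(),
#     )
#     print(df.dtypes)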
2138 @staticmethod
2139 def __can_cast_timestamp_ns(column):
2140 try:
2141 column.cast("timestamp[ns]")
2142 except pyarrow.lib.ArrowInvalid:
2143 return False
2144 else:
2145 return True
2147 # If changing the signature of this method, make sure to apply the same
2148 # changes to job.QueryJob.to_geodataframe()
2149 def to_geodataframe(
2150 self,
2151 bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
2152 dtypes: Dict[str, Any] = None,
2153 progress_bar_type: str = None,
2154 create_bqstorage_client: bool = True,
2155 geography_column: Optional[str] = None,
2156 ) -> "geopandas.GeoDataFrame":
2157 """Create a GeoPandas GeoDataFrame by loading all pages of a query.
2159 Args:
2160 bqstorage_client (Optional[google.cloud.bigquery_storage_v1.BigQueryReadClient]):
2161 A BigQuery Storage API client. If supplied, use the faster
2162 BigQuery Storage API to fetch rows from BigQuery.
2164 This method requires the ``pyarrow`` and
2165 ``google-cloud-bigquery-storage`` libraries.
2167 This method only exposes a subset of the capabilities of the
2168 BigQuery Storage API. For full access to all features
2169 (projections, filters, snapshots) use the Storage API directly.
2171 dtypes (Optional[Map[str, Union[str, pandas.Series.dtype]]]):
2172 A dictionary of column names and pandas ``dtype``s. The provided
2173 ``dtype`` is used when constructing the series for the column
2174 specified. Otherwise, the default pandas behavior is used.
2175 progress_bar_type (Optional[str]):
2176 If set, use the `tqdm <https://tqdm.github.io/>`_ library to
2177 display a progress bar while the data downloads. Install the
2178 ``tqdm`` package to use this feature.
2180 Possible values of ``progress_bar_type`` include:
2182 ``None``
2183 No progress bar.
2184 ``'tqdm'``
2185 Use the :func:`tqdm.tqdm` function to print a progress bar
2186 to :data:`sys.stdout`.
2187 ``'tqdm_notebook'``
2188 Use the :func:`tqdm.notebook.tqdm` function to display a
2189 progress bar as a Jupyter notebook widget.
2190 ``'tqdm_gui'``
2191 Use the :func:`tqdm.tqdm_gui` function to display a
2192 progress bar as a graphical dialog box.
2194 create_bqstorage_client (Optional[bool]):
2195 If ``True`` (default), create a BigQuery Storage API client
2196 using the default API settings. The BigQuery Storage API
2197 is a faster way to fetch rows from BigQuery. See the
2198 ``bqstorage_client`` parameter for more information.
2200 This argument does nothing if ``bqstorage_client`` is supplied.
2202 geography_column (Optional[str]):
2203 If there is more than one GEOGRAPHY column,
2204 identifies which one to use to construct a geopandas
2205 GeoDataFrame. This option can be omitted if there's
2206 only one GEOGRAPHY column.
2208 Returns:
2209 geopandas.GeoDataFrame:
2210 A :class:`geopandas.GeoDataFrame` populated with row
2211 data and column headers from the query results. The
2212 column headers are derived from the destination
2213 table's schema.
2215 Raises:
2216 ValueError:
2217 If the :mod:`geopandas` library cannot be imported, or the
2218 :mod:`google.cloud.bigquery_storage_v1` module is
2219 required but cannot be imported.
2221 .. versionadded:: 2.24.0
2222 """
2223 if geopandas is None:
2224 raise ValueError(_NO_GEOPANDAS_ERROR)
2226 geography_columns = set(
2227 field.name
2228 for field in self.schema
2229 if field.field_type.upper() == "GEOGRAPHY"
2230 )
2231 if not geography_columns:
2232 raise TypeError(
2233 "There must be at least one GEOGRAPHY column"
2234 " to create a GeoDataFrame"
2235 )
2237 if geography_column:
2238 if geography_column not in geography_columns:
2239 raise ValueError(
2240 f"The given geography column, {geography_column}, doesn't name"
2241 f" a GEOGRAPHY column in the result."
2242 )
2243 elif len(geography_columns) == 1:
2244 [geography_column] = geography_columns
2245 else:
2246 raise ValueError(
2247 "There is more than one GEOGRAPHY column in the result. "
2248 "The geography_column argument must be used to specify which "
2249 "one to use to create a GeoDataFrame"
2250 )
2252 df = self.to_dataframe(
2253 bqstorage_client,
2254 dtypes,
2255 progress_bar_type,
2256 create_bqstorage_client,
2257 geography_as_object=True,
2258 )
2260 return geopandas.GeoDataFrame(
2261 df, crs=_COORDINATE_REFERENCE_SYSTEM, geometry=geography_column
2262 )
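# A minimal sketch of to_geodataframe(), assuming geopandas and shapely are
# installed; the query is a placeholder and "location" is assumed to be a
# GEOGRAPHY column in the result.
#
#     from google.cloud import bigquery
#
#     client = bigquery.Client()
#     rows = client.query(
#         "SELECT name, location FROM my_dataset.places"
#     ).result()
#     gdf = rows.to_geodataframe(geography_column="location")
#     print(gdf.crs)  # EPSG:4326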
2265class _EmptyRowIterator(RowIterator):
2266 """An empty row iterator.
2268 This class prevents API requests when there are no rows to fetch or rows
2269 are impossible to fetch, such as with query results for DDL CREATE VIEW
2270 statements.
2271 """
2273 schema = ()
2274 pages = ()
2275 total_rows = 0
2277 def __init__(
2278 self, client=None, api_request=None, path=None, schema=(), *args, **kwargs
2279 ):
2280 super().__init__(
2281 client=client,
2282 api_request=api_request,
2283 path=path,
2284 schema=schema,
2285 *args,
2286 **kwargs,
2287 )
2289 def to_arrow(
2290 self,
2291 progress_bar_type=None,
2292 bqstorage_client=None,
2293 create_bqstorage_client=True,
2294 ) -> "pyarrow.Table":
2295 """[Beta] Create an empty class:`pyarrow.Table`.
2297 Args:
2298 progress_bar_type (str): Ignored. Added for compatibility with RowIterator.
2299 bqstorage_client (Any): Ignored. Added for compatibility with RowIterator.
2300 create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
2302 Returns:
2303 pyarrow.Table: An empty :class:`pyarrow.Table`.
2304 """
2305 if pyarrow is None:
2306 raise ValueError(_NO_PYARROW_ERROR)
2307 return pyarrow.Table.from_arrays(())
2309 def to_dataframe(
2310 self,
2311 bqstorage_client=None,
2312 dtypes=None,
2313 progress_bar_type=None,
2314 create_bqstorage_client=True,
2315 geography_as_object=False,
2316 bool_dtype=None,
2317 int_dtype=None,
2318 float_dtype=None,
2319 string_dtype=None,
2320 ) -> "pandas.DataFrame":
2321 """Create an empty dataframe.
2323 Args:
2324 bqstorage_client (Any): Ignored. Added for compatibility with RowIterator.
2325 dtypes (Any): Ignored. Added for compatibility with RowIterator.
2326 progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
2327 create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
2328 geography_as_object (bool): Ignored. Added for compatibility with RowIterator.
2329 bool_dtype (Any): Ignored. Added for compatibility with RowIterator.
2330 int_dtype (Any): Ignored. Added for compatibility with RowIterator.
2331 float_dtype (Any): Ignored. Added for compatibility with RowIterator.
2332 string_dtype (Any): Ignored. Added for compatibility with RowIterator.
2334 Returns:
2335 pandas.DataFrame: An empty :class:`~pandas.DataFrame`.
2336 """
2337 _pandas_helpers.verify_pandas_imports()
2338 return pandas.DataFrame()
2340 def to_geodataframe(
2341 self,
2342 bqstorage_client=None,
2343 dtypes=None,
2344 progress_bar_type=None,
2345 create_bqstorage_client=True,
2346 geography_column: Optional[str] = None,
2347 ) -> "pandas.DataFrame":
2348 """Create an empty dataframe.
2350 Args:
2351 bqstorage_client (Any): Ignored. Added for compatibility with RowIterator.
2352 dtypes (Any): Ignored. Added for compatibility with RowIterator.
2353 progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
2354 create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
2356 Returns:
2357 geopandas.GeoDataFrame: An empty :class:`~geopandas.GeoDataFrame`.
2358 """
2359 if geopandas is None:
2360 raise ValueError(_NO_GEOPANDAS_ERROR)
2362 # Since an empty GeoDataFrame has no geometry column, we do not set a CRS
2363 # on it, because that's deprecated.
2364 return geopandas.GeoDataFrame()
2366 def to_dataframe_iterable(
2367 self,
2368 bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
2369 dtypes: Optional[Dict[str, Any]] = None,
2370 max_queue_size: Optional[int] = None,
2371 ) -> Iterator["pandas.DataFrame"]:
2372 """Create an iterable of pandas DataFrames, to process the table as a stream.
2374 .. versionadded:: 2.21.0
2376 Args:
2377 bqstorage_client:
2378 Ignored. Added for compatibility with RowIterator.
2380 dtypes (Optional[Map[str, Union[str, pandas.Series.dtype]]]):
2381 Ignored. Added for compatibility with RowIterator.
2383 max_queue_size:
2384 Ignored. Added for compatibility with RowIterator.
2386 Returns:
2387 An iterator yielding a single empty :class:`~pandas.DataFrame`.
2389 Raises:
2390 ValueError:
2391 If the :mod:`pandas` library cannot be imported.
2392 """
2393 _pandas_helpers.verify_pandas_imports()
2394 return iter((pandas.DataFrame(),))
2396 def to_arrow_iterable(
2397 self,
2398 bqstorage_client: Optional["bigquery_storage.BigQueryReadClient"] = None,
2399 max_queue_size: Optional[int] = None,
2400 ) -> Iterator["pyarrow.RecordBatch"]:
2401 """Create an iterable of pandas DataFrames, to process the table as a stream.
2403 .. versionadded:: 2.31.0
2405 Args:
2406 bqstorage_client:
2407 Ignored. Added for compatibility with RowIterator.
2409 max_queue_size:
2410 Ignored. Added for compatibility with RowIterator.
2412 Returns:
2413 An iterator yielding a single empty :class:`~pyarrow.RecordBatch`.
2414 """
2415 return iter((pyarrow.record_batch([]),))
2417 def __iter__(self):
2418 return iter(())
2421class PartitionRange(object):
2422 """Definition of the ranges for range partitioning.
2424 .. note::
2425 **Beta**. The integer range partitioning feature is in a pre-release
2426 state and might change or have limited support.
2428 Args:
2429 start (Optional[int]):
2430 Sets the
2431 :attr:`~google.cloud.bigquery.table.PartitionRange.start`
2432 property.
2433 end (Optional[int]):
2434 Sets the
2435 :attr:`~google.cloud.bigquery.table.PartitionRange.end`
2436 property.
2437 interval (Optional[int]):
2438 Sets the
2439 :attr:`~google.cloud.bigquery.table.PartitionRange.interval`
2440 property.
2441 _properties (Optional[dict]):
2442 Private. Used to construct object from API resource.
2443 """
2445 def __init__(self, start=None, end=None, interval=None, _properties=None) -> None:
2446 if _properties is None:
2447 _properties = {}
2448 self._properties = _properties
2450 if start is not None:
2451 self.start = start
2452 if end is not None:
2453 self.end = end
2454 if interval is not None:
2455 self.interval = interval
2457 @property
2458 def start(self):
2459 """int: The start of range partitioning, inclusive."""
2460 return _helpers._int_or_none(self._properties.get("start"))
2462 @start.setter
2463 def start(self, value):
2464 self._properties["start"] = _helpers._str_or_none(value)
2466 @property
2467 def end(self):
2468 """int: The end of range partitioning, exclusive."""
2469 return _helpers._int_or_none(self._properties.get("end"))
2471 @end.setter
2472 def end(self, value):
2473 self._properties["end"] = _helpers._str_or_none(value)
2475 @property
2476 def interval(self):
2477 """int: The width of each interval."""
2478 return _helpers._int_or_none(self._properties.get("interval"))
2480 @interval.setter
2481 def interval(self, value):
2482 self._properties["interval"] = _helpers._str_or_none(value)
2484 def _key(self):
2485 return tuple(sorted(self._properties.items()))
2487 def __eq__(self, other):
2488 if not isinstance(other, PartitionRange):
2489 return NotImplemented
2490 return self._key() == other._key()
2492 def __ne__(self, other):
2493 return not self == other
2495 def __repr__(self):
2496 key_vals = ["{}={}".format(key, val) for key, val in self._key()]
2497 return "PartitionRange({})".format(", ".join(key_vals))
2500class RangePartitioning(object):
2501 """Range-based partitioning configuration for a table.
2503 .. note::
2504 **Beta**. The integer range partitioning feature is in a pre-release
2505 state and might change or have limited support.
2507 Args:
2508 range_ (Optional[google.cloud.bigquery.table.PartitionRange]):
2509 Sets the
2510 :attr:`google.cloud.bigquery.table.RangePartitioning.range_`
2511 property.
2512 field (Optional[str]):
2513 Sets the
2514 :attr:`google.cloud.bigquery.table.RangePartitioning.field`
2515 property.
2516 _properties (Optional[dict]):
2517 Private. Used to construct object from API resource.
2518 """
2520 def __init__(self, range_=None, field=None, _properties=None) -> None:
2521 if _properties is None:
2522 _properties = {}
2523 self._properties: Dict[str, Any] = _properties
2525 if range_ is not None:
2526 self.range_ = range_
2527 if field is not None:
2528 self.field = field
2530 # Trailing underscore to prevent conflict with built-in range() function.
2531 @property
2532 def range_(self):
2533 """google.cloud.bigquery.table.PartitionRange: Defines the
2534 ranges for range partitioning.
2536 Raises:
2537 ValueError:
2538 If the value is not a :class:`PartitionRange`.
2539 """
2540 range_properties = self._properties.setdefault("range", {})
2541 return PartitionRange(_properties=range_properties)
2543 @range_.setter
2544 def range_(self, value):
2545 if not isinstance(value, PartitionRange):
2546 raise ValueError("Expected a PartitionRange, but got {}.".format(value))
2547 self._properties["range"] = value._properties
2549 @property
2550 def field(self):
2551 """str: The table is partitioned by this field.
2553 The field must be a top-level ``NULLABLE`` / ``REQUIRED`` field. The
2554 only supported type is ``INTEGER`` / ``INT64``.
2555 """
2556 return self._properties.get("field")
2558 @field.setter
2559 def field(self, value):
2560 self._properties["field"] = value
2562 def _key(self):
2563 return (("field", self.field), ("range_", self.range_))
2565 def __eq__(self, other):
2566 if not isinstance(other, RangePartitioning):
2567 return NotImplemented
2568 return self._key() == other._key()
2570 def __ne__(self, other):
2571 return not self == other
2573 def __repr__(self):
2574 key_vals = ["{}={}".format(key, repr(val)) for key, val in self._key()]
2575 return "RangePartitioning({})".format(", ".join(key_vals))
2578class TimePartitioningType(object):
2579 """Specifies the type of time partitioning to perform."""
2581 DAY = "DAY"
2582 """str: Generates one partition per day."""
2584 HOUR = "HOUR"
2585 """str: Generates one partition per hour."""
2587 MONTH = "MONTH"
2588 """str: Generates one partition per month."""
2590 YEAR = "YEAR"
2591 """str: Generates one partition per year."""
2594class TimePartitioning(object):
2595 """Configures time-based partitioning for a table.
2597 Args:
2598 type_ (Optional[google.cloud.bigquery.table.TimePartitioningType]):
2599 Specifies the type of time partitioning to perform. Defaults to
2600 :attr:`~google.cloud.bigquery.table.TimePartitioningType.DAY`.
2602 Supported values are:
2604 * :attr:`~google.cloud.bigquery.table.TimePartitioningType.HOUR`
2605 * :attr:`~google.cloud.bigquery.table.TimePartitioningType.DAY`
2606 * :attr:`~google.cloud.bigquery.table.TimePartitioningType.MONTH`
2607 * :attr:`~google.cloud.bigquery.table.TimePartitioningType.YEAR`
2609 field (Optional[str]):
2610 If set, the table is partitioned by this field. If not set, the
2611 table is partitioned by pseudo column ``_PARTITIONTIME``. The field
2612 must be a top-level ``TIMESTAMP``, ``DATETIME``, or ``DATE``
2613 field. Its mode must be ``NULLABLE`` or ``REQUIRED``.
2615 See the `time-unit column-partitioned tables guide
2616 <https://cloud.google.com/bigquery/docs/creating-column-partitions>`_
2617 in the BigQuery documentation.
2618 expiration_ms (Optional[int]):
2619 Number of milliseconds for which to keep the storage for a
2620 partition.
2621 require_partition_filter (Optional[bool]):
2622 DEPRECATED: Use
2623 :attr:`~google.cloud.bigquery.table.Table.require_partition_filter`,
2624 instead.
2625 """
2627 def __init__(
2628 self, type_=None, field=None, expiration_ms=None, require_partition_filter=None
2629 ) -> None:
2630 self._properties: Dict[str, Any] = {}
2631 if type_ is None:
2632 self.type_ = TimePartitioningType.DAY
2633 else:
2634 self.type_ = type_
2635 if field is not None:
2636 self.field = field
2637 if expiration_ms is not None:
2638 self.expiration_ms = expiration_ms
2639 if require_partition_filter is not None:
2640 self.require_partition_filter = require_partition_filter
2642 @property
2643 def type_(self):
2644 """google.cloud.bigquery.table.TimePartitioningType: The type of time
2645 partitioning to use.
2646 """
2647 return self._properties.get("type")
2649 @type_.setter
2650 def type_(self, value):
2651 self._properties["type"] = value
2653 @property
2654 def field(self):
2655 """str: Field in the table to use for partitioning"""
2656 return self._properties.get("field")
2658 @field.setter
2659 def field(self, value):
2660 self._properties["field"] = value
2662 @property
2663 def expiration_ms(self):
2664 """int: Number of milliseconds to keep the storage for a partition."""
2665 return _helpers._int_or_none(self._properties.get("expirationMs"))
2667 @expiration_ms.setter
2668 def expiration_ms(self, value):
2669 if value is not None:
2670 # Allow explicitly setting the expiration to None.
2671 value = str(value)
2672 self._properties["expirationMs"] = value
2674 @property
2675 def require_partition_filter(self):
2676 """bool: Specifies whether partition filters are required for queries
2678 DEPRECATED: Use
2679 :attr:`~google.cloud.bigquery.table.Table.require_partition_filter`,
2680 instead.
2681 """
2682 warnings.warn(
2683 (
2684 "TimePartitioning.require_partition_filter will be removed in "
2685 "future versions. Please use Table.require_partition_filter "
2686 "instead."
2687 ),
2688 PendingDeprecationWarning,
2689 stacklevel=2,
2690 )
2691 return self._properties.get("requirePartitionFilter")
2693 @require_partition_filter.setter
2694 def require_partition_filter(self, value):
2695 warnings.warn(
2696 (
2697 "TimePartitioning.require_partition_filter will be removed in "
2698 "future versions. Please use Table.require_partition_filter "
2699 "instead."
2700 ),
2701 PendingDeprecationWarning,
2702 stacklevel=2,
2703 )
2704 self._properties["requirePartitionFilter"] = value
2706 @classmethod
2707 def from_api_repr(cls, api_repr: dict) -> "TimePartitioning":
2708 """Return a :class:`TimePartitioning` object deserialized from a dict.
2710 This method creates a new ``TimePartitioning`` instance that points to
2711 the ``api_repr`` parameter as its internal properties dict. This means
2712 that when a ``TimePartitioning`` instance is stored as a property of
2713 another object, any changes made at the higher level will also appear
2714 here::
2716 >>> time_partitioning = TimePartitioning()
2717 >>> table.time_partitioning = time_partitioning
2718 >>> table.time_partitioning.field = 'timecolumn'
2719 >>> time_partitioning.field
2720 'timecolumn'
2722 Args:
2723 api_repr (Mapping[str, str]):
2724 The serialized representation of the TimePartitioning, such as
2725 what is output by :meth:`to_api_repr`.
2727 Returns:
2728 google.cloud.bigquery.table.TimePartitioning:
2729 The ``TimePartitioning`` object.
2730 """
2731 instance = cls()
2732 instance._properties = api_repr
2733 return instance
2735 def to_api_repr(self) -> dict:
2736 """Return a dictionary representing this object.
2738 This method returns the properties dict of the ``TimePartitioning``
2739 instance rather than making a copy. This means that when a
2740 ``TimePartitioning`` instance is stored as a property of another
2741 object, any changes made at the higher level will also appear here.
2743 Returns:
2744 dict:
2745 A dictionary representing the TimePartitioning object in
2746 serialized form.
2747 """
2748 return self._properties
2750 def _key(self):
2751 # Because we are only "renaming" top-level keys, a shallow copy is sufficient here.
2752 properties = self._properties.copy()
2753 # calling repr for non built-in type objects.
2754 properties["type_"] = repr(properties.pop("type"))
2755 if "field" in properties:
2756 # calling repr for non built-in type objects.
2757 properties["field"] = repr(properties["field"])
2758 if "requirePartitionFilter" in properties:
2759 properties["require_partition_filter"] = properties.pop(
2760 "requirePartitionFilter"
2761 )
2762 if "expirationMs" in properties:
2763 properties["expiration_ms"] = properties.pop("expirationMs")
2764 return tuple(sorted(properties.items()))
2766 def __eq__(self, other):
2767 if not isinstance(other, TimePartitioning):
2768 return NotImplemented
2769 return self._key() == other._key()
2771 def __ne__(self, other):
2772 return not self == other
2774 def __hash__(self):
2775 return hash(self._key())
2777 def __repr__(self):
2778 key_vals = ["{}={}".format(key, val) for key, val in self._key()]
2779 return "TimePartitioning({})".format(",".join(key_vals))
2782def _item_to_row(iterator, resource):
2783 """Convert a JSON row to the native object.
2785 .. note::
2787 This assumes that the ``schema`` attribute has been
2788 added to the iterator after being created, which
2789 should be done by the caller.
2791 Args:
2792 iterator (google.api_core.page_iterator.Iterator): The iterator that is currently in use.
2793 resource (Dict): An item to be converted to a row.
2795 Returns:
2796 google.cloud.bigquery.table.Row: The next row in the page.
2797 """
2798 return Row(
2799 _helpers._row_tuple_from_json(resource, iterator.schema),
2800 iterator._field_to_index,
2801 )
2804def _row_iterator_page_columns(schema, response):
2805 """Make a generator of all the columns in a page from tabledata.list.
2807 This enables creating a :class:`pandas.DataFrame` and other
2808 column-oriented data structures such as :class:`pyarrow.RecordBatch`
2809 """
2810 columns = []
2811 rows = response.get("rows", [])
2813 def get_column_data(field_index, field):
2814 for row in rows:
2815 yield _helpers._field_from_json(row["f"][field_index]["v"], field)
2817 for field_index, field in enumerate(schema):
2818 columns.append(get_column_data(field_index, field))
2820 return columns
2823# pylint: disable=unused-argument
2824def _rows_page_start(iterator, page, response):
2825 """Grab total rows when :class:`~google.cloud.iterator.Page` starts.
2827 Args:
2828 iterator (google.api_core.page_iterator.Iterator): The iterator that is currently in use.
2829 page (google.api_core.page_iterator.Page): The page that was just created.
2830 response (Dict): The JSON API response for a page of rows in a table.
2831 """
2832 # Make a (lazy) copy of the page in column-oriented format for use in data
2833 # science packages.
2834 page._columns = _row_iterator_page_columns(iterator._schema, response)
2836 total_rows = response.get("totalRows")
2837 if total_rows is not None:
2838 total_rows = int(total_rows)
2839 iterator._total_rows = total_rows
2842# pylint: enable=unused-argument
2845def _table_arg_to_table_ref(value, default_project=None) -> TableReference:
2846 """Helper to convert a string or Table to TableReference.
2848 This function keeps TableReference and other kinds of objects unchanged.
2849 """
2850 if isinstance(value, str):
2851 value = TableReference.from_string(value, default_project=default_project)
2852 if isinstance(value, (Table, TableListItem)):
2853 value = value.reference
2854 return value
2857def _table_arg_to_table(value, default_project=None) -> Table:
2858 """Helper to convert a string or TableReference to a Table.
2860 This function keeps Table and other kinds of objects unchanged.
2861 """
2862 if isinstance(value, str):
2863 value = TableReference.from_string(value, default_project=default_project)
2864 if isinstance(value, TableReference):
2865 value = Table(value)
2866 if isinstance(value, TableListItem):
2867 newvalue = Table(value.reference)
2868 newvalue._properties = value._properties
2869 value = newvalue
2871 return value