Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/google/cloud/bigquery/client.py: 20%

3# Licensed under the Apache License, Version 2.0 (the "License");

4# you may not use this file except in compliance with the License.

5# You may obtain a copy of the License at

7# http://www.apache.org/licenses/LICENSE-2.0

9# Unless required by applicable law or agreed to in writing, software

10# distributed under the License is distributed on an "AS IS" BASIS,

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

12# See the License for the specific language governing permissions and

13# limitations under the License.

15"""Client for interacting with the Google BigQuery API."""

17from __future__ import absolute_import

18from __future__ import division

20from collections import abc as collections_abc

21import copy

22import datetime

23import functools

24import gzip

25import io

26import itertools

27import json

28import math

29import os

30import packaging.version

31import tempfile

32import typing

33from typing import (

34 Any,

35 Dict,

36 IO,

37 Iterable,

38 Mapping,

39 List,

40 Optional,

41 Sequence,

42 Tuple,

43 Union,

44)

45import uuid

46import warnings

48try:

49 import pyarrow # type: ignore

51 _PYARROW_VERSION = packaging.version.parse(pyarrow.__version__)

52except ImportError: # pragma: NO COVER

53 pyarrow = None

55from google import resumable_media # type: ignore

56from google.resumable_media.requests import MultipartUpload # type: ignore

57from google.resumable_media.requests import ResumableUpload

59import google.api_core.client_options

60import google.api_core.exceptions as core_exceptions

61from google.api_core.iam import Policy

62from google.api_core import page_iterator

63from google.api_core import retry as retries

64import google.cloud._helpers # type: ignore

65from google.cloud import exceptions # pytype: disable=import-error

66from google.cloud.client import ClientWithProject # type: ignore # pytype: disable=import-error

68try:

69 from google.cloud.bigquery_storage_v1.services.big_query_read.client import (

70 DEFAULT_CLIENT_INFO as DEFAULT_BQSTORAGE_CLIENT_INFO,

71 )

72except ImportError:

73 DEFAULT_BQSTORAGE_CLIENT_INFO = None # type: ignore

76from google.cloud.bigquery import _job_helpers

77from google.cloud.bigquery._job_helpers import make_job_id as _make_job_id

78from google.cloud.bigquery._helpers import _get_sub_prop

79from google.cloud.bigquery._helpers import _record_field_to_json

80from google.cloud.bigquery._helpers import _str_or_none

81from google.cloud.bigquery._helpers import _verify_job_config_type

82from google.cloud.bigquery._helpers import _get_bigquery_host

83from google.cloud.bigquery._helpers import BQ_STORAGE_VERSIONS

84from google.cloud.bigquery._helpers import _DEFAULT_HOST

85from google.cloud.bigquery._http import Connection

86from google.cloud.bigquery import _pandas_helpers

87from google.cloud.bigquery.dataset import Dataset

88from google.cloud.bigquery.dataset import DatasetListItem

89from google.cloud.bigquery.dataset import DatasetReference

90from google.cloud.bigquery import enums

91from google.cloud.bigquery.enums import AutoRowIDs

92from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError

93from google.cloud.bigquery.opentelemetry_tracing import create_span

94from google.cloud.bigquery import job

95from google.cloud.bigquery.job import (

96 CopyJob,

97 CopyJobConfig,

98 ExtractJob,

99 ExtractJobConfig,

100 LoadJob,

101 LoadJobConfig,

102 QueryJob,

103 QueryJobConfig,

104)

105from google.cloud.bigquery.model import Model

106from google.cloud.bigquery.model import ModelReference

107from google.cloud.bigquery.model import _model_arg_to_model_ref

108from google.cloud.bigquery.query import _QueryResults

109from google.cloud.bigquery.retry import (

110 DEFAULT_JOB_RETRY,

111 DEFAULT_RETRY,

112 DEFAULT_TIMEOUT,

113)

114from google.cloud.bigquery.routine import Routine

115from google.cloud.bigquery.routine import RoutineReference

116from google.cloud.bigquery.schema import SchemaField

117from google.cloud.bigquery.table import _table_arg_to_table

118from google.cloud.bigquery.table import _table_arg_to_table_ref

119from google.cloud.bigquery.table import Table

120from google.cloud.bigquery.table import TableListItem

121from google.cloud.bigquery.table import TableReference

122from google.cloud.bigquery.table import RowIterator

123from google.cloud.bigquery.format_options import ParquetOptions

124from google.cloud.bigquery import _helpers

125

126TimeoutType = Union[float, None]

127ResumableTimeoutType = Union[

128 None, float, Tuple[float, float]

129] # for resumable media methods

130

131if typing.TYPE_CHECKING: # pragma: NO COVER

132 # os.PathLike is only subscriptable in Python 3.9+, thus shielding with a condition.

133 PathType = Union[str, bytes, os.PathLike[str], os.PathLike[bytes]]

134 import pandas # type: ignore

135 import requests # required by api-core

136

137_DEFAULT_CHUNKSIZE = 100 * 1024 * 1024 # 100 MB

138_MAX_MULTIPART_SIZE = 5 * 1024 * 1024

139_DEFAULT_NUM_RETRIES = 6

140_BASE_UPLOAD_TEMPLATE = "{host}/upload/bigquery/v2/projects/{project}/jobs?uploadType="

141_MULTIPART_URL_TEMPLATE = _BASE_UPLOAD_TEMPLATE + "multipart"

142_RESUMABLE_URL_TEMPLATE = _BASE_UPLOAD_TEMPLATE + "resumable"

143_GENERIC_CONTENT_TYPE = "*/*"

144_READ_LESS_THAN_SIZE = (

145 "Size {:d} was specified but the file-like object only had " "{:d} bytes remaining."

146)

147_NEED_TABLE_ARGUMENT = (

148 "The table argument should be a table ID string, Table, or TableReference"

149)

150_LIST_ROWS_FROM_QUERY_RESULTS_FIELDS = "jobReference,totalRows,pageToken,rows"

151

152# In microbenchmarks, it's been shown that even in ideal conditions (query

153# finished, local data), requests to getQueryResults can take 10+ seconds.

154# In less-than-ideal situations, the response can take even longer, as it must

155# be able to download a full 100+ MB row in that time. Don't let the

156# connection timeout before data can be downloaded.

157# https://github.com/googleapis/python-bigquery/issues/438

158_MIN_GET_QUERY_RESULTS_TIMEOUT = 120

159

160TIMEOUT_HEADER = "X-Server-Timeout"

161

162# https://github.com/googleapis/python-bigquery/issues/781#issuecomment-883497414

163_PYARROW_BAD_VERSIONS = frozenset([packaging.version.Version("2.0.0")])

164

165

166class Project(object):

167 """Wrapper for resource describing a BigQuery project.

168

169 Args:

170 project_id (str): Opaque ID of the project

171

172 numeric_id (int): Numeric ID of the project

173

174 friendly_name (str): Display name of the project

175 """

176

177 def __init__(self, project_id, numeric_id, friendly_name):

178 self.project_id = project_id

179 self.numeric_id = numeric_id

180 self.friendly_name = friendly_name

181

182 @classmethod

183 def from_api_repr(cls, resource):

184 """Factory: construct an instance from a resource dict."""

185 return cls(resource["id"], resource["numericId"], resource["friendlyName"])

186

187

188class Client(ClientWithProject):

189 """Client to bundle configuration needed for API requests.

190

191 Args:

192 project (Optional[str]):

193 Project ID for the project which the client acts on behalf of.

194 Will be passed when creating a dataset / job. If not passed,

195 falls back to the default inferred from the environment.

196 credentials (Optional[google.auth.credentials.Credentials]):

197 The OAuth2 Credentials to use for this client. If not passed

198 (and if no ``_http`` object is passed), falls back to the

199 default inferred from the environment.

200 _http (Optional[requests.Session]):

201 HTTP object to make requests. Can be any object that

202 defines ``request()`` with the same interface as

203 :meth:`requests.Session.request`. If not passed, an ``_http``

204 object is created that is bound to the ``credentials`` for the

205 current object.

206 This parameter should be considered private, and could change in

207 the future.

208 location (Optional[str]):

209 Default location for jobs / datasets / tables.

210 default_query_job_config (Optional[google.cloud.bigquery.job.QueryJobConfig]):

211 Default ``QueryJobConfig``.

212 Will be merged into job configs passed into the ``query`` method.

213 default_load_job_config (Optional[google.cloud.bigquery.job.LoadJobConfig]):

214 Default ``LoadJobConfig``.

215 Will be merged into job configs passed into the ``load_table_*`` methods.

216 client_info (Optional[google.api_core.client_info.ClientInfo]):

217 The client info used to send a user-agent string along with API

218 requests. If ``None``, then default info will be used. Generally,

219 you only need to set this if you're developing your own library

220 or partner tool.

221 client_options (Optional[Union[google.api_core.client_options.ClientOptions, Dict]]):

222 Client options used to set user options on the client. API Endpoint

223 should be set through client_options.

224

225 Raises:

226 google.auth.exceptions.DefaultCredentialsError:

227 Raised if ``credentials`` is not specified and the library fails

228 to acquire default credentials.

229 """

230

231 SCOPE = ("https://www.googleapis.com/auth/cloud-platform",) # type: ignore

232 """The scopes required for authenticating as a BigQuery consumer."""

233

234 def __init__(

235 self,

236 project=None,

237 credentials=None,

238 _http=None,

239 location=None,

240 default_query_job_config=None,

241 default_load_job_config=None,

242 client_info=None,

243 client_options=None,

244 ) -> None:

245 super(Client, self).__init__(

246 project=project,

247 credentials=credentials,

248 client_options=client_options,

249 _http=_http,

250 )

251

252 kw_args = {"client_info": client_info}

253 bq_host = _get_bigquery_host()

254 kw_args["api_endpoint"] = bq_host if bq_host != _DEFAULT_HOST else None

255 if client_options:

256 if type(client_options) == dict:

257 client_options = google.api_core.client_options.from_dict(

258 client_options

259 )

260 if client_options.api_endpoint:

261 api_endpoint = client_options.api_endpoint

262 kw_args["api_endpoint"] = api_endpoint

263

264 self._connection = Connection(self, **kw_args)

265 self._location = location

266 self._default_query_job_config = copy.deepcopy(default_query_job_config)

267 self._default_load_job_config = copy.deepcopy(default_load_job_config)

268

269 @property

270 def location(self):

271 """Default location for jobs / datasets / tables."""

272 return self._location

273

274 @property

275 def default_query_job_config(self):

276 """Default ``QueryJobConfig``.

277 Will be merged into job configs passed into the ``query`` method.

278 """

279 return self._default_query_job_config

280

281 @default_query_job_config.setter

282 def default_query_job_config(self, value: QueryJobConfig):

283 self._default_query_job_config = copy.deepcopy(value)

284

285 @property

286 def default_load_job_config(self):

287 """Default ``LoadJobConfig``.

288 Will be merged into job configs passed into the ``load_table_*`` methods.

289 """

290 return self._default_load_job_config

291

292 @default_load_job_config.setter

293 def default_load_job_config(self, value: LoadJobConfig):

294 self._default_load_job_config = copy.deepcopy(value)

295

296 def close(self):

297 """Close the underlying transport objects, releasing system resources.

298

299 .. note::

300

301 The client instance can be used for making additional requests even

302 after closing, in which case the underlying connections are

303 automatically re-created.

304 """

305 self._http._auth_request.session.close()

306 self._http.close()

307

308 def get_service_account_email(

309 self,

310 project: str = None,

311 retry: retries.Retry = DEFAULT_RETRY,

312 timeout: TimeoutType = DEFAULT_TIMEOUT,

313 ) -> str:

314 """Get the email address of the project's BigQuery service account

315

316 Note:

317 This is the service account that BigQuery uses to manage tables

318 encrypted by a key in KMS.

319

320 Args:

321 project (Optional[str]):

322 Project ID to use for retreiving service account email.

323 Defaults to the client's project.

324 retry (Optional[google.api_core.retry.Retry]): How to retry the RPC.

325 timeout (Optional[float]):

326 The number of seconds to wait for the underlying HTTP transport

327 before using ``retry``.

328

329 Returns:

330 str: service account email address

331

332 Example:

333

334 >>> from google.cloud import bigquery

335 >>> client = bigquery.Client()

336 >>> client.get_service_account_email()

337 my_service_account@my-project.iam.gserviceaccount.com

338

339 """

340 if project is None:

341 project = self.project

342 path = "/projects/%s/serviceAccount" % (project,)

343 span_attributes = {"path": path}

344 api_response = self._call_api(

345 retry,

346 span_name="BigQuery.getServiceAccountEmail",

347 span_attributes=span_attributes,

348 method="GET",

349 path=path,

350 timeout=timeout,

351 )

352 return api_response["email"]

353

354 def list_projects(

355 self,

356 max_results: Optional[int] = None,

357 page_token: str = None,

358 retry: retries.Retry = DEFAULT_RETRY,

359 timeout: TimeoutType = DEFAULT_TIMEOUT,

360 page_size: Optional[int] = None,

361 ) -> page_iterator.Iterator:

362 """List projects for the project associated with this client.

363

364 See

365 https://cloud.google.com/bigquery/docs/reference/rest/v2/projects/list

366

367 Args:

368 max_results (Optional[int]):

369 Maximum number of projects to return.

370 Defaults to a value set by the API.

371

372 page_token (Optional[str]):

373 Token representing a cursor into the projects. If not passed,

374 the API will return the first page of projects. The token marks

375 the beginning of the iterator to be returned and the value of

376 the ``page_token`` can be accessed at ``next_page_token`` of the

377 :class:`~google.api_core.page_iterator.HTTPIterator`.

378

379 retry (Optional[google.api_core.retry.Retry]): How to retry the RPC.

380

381 timeout (Optional[float]):

382 The number of seconds to wait for the underlying HTTP transport

383 before using ``retry``.

384

385 page_size (Optional[int]):

386 Maximum number of projects to return in each page.

387 Defaults to a value set by the API.

388

389 Returns:

390 google.api_core.page_iterator.Iterator:

391 Iterator of :class:`~google.cloud.bigquery.client.Project`

392 accessible to the current client.

393 """

394 span_attributes = {"path": "/projects"}

395

396 def api_request(*args, **kwargs):

397 return self._call_api(

398 retry,

399 span_name="BigQuery.listProjects",

400 span_attributes=span_attributes,

401 *args,

402 timeout=timeout,

403 **kwargs,

404 )

405

406 return page_iterator.HTTPIterator(

407 client=self,

408 api_request=api_request,

409 path="/projects",

410 item_to_value=_item_to_project,

411 items_key="projects",

412 page_token=page_token,

413 max_results=max_results,

414 page_size=page_size,

415 )

416

417 def list_datasets(

418 self,

419 project: str = None,

420 include_all: bool = False,

421 filter: str = None,

422 max_results: Optional[int] = None,

423 page_token: str = None,

424 retry: retries.Retry = DEFAULT_RETRY,

425 timeout: TimeoutType = DEFAULT_TIMEOUT,

426 page_size: Optional[int] = None,

427 ) -> page_iterator.Iterator:

428 """List datasets for the project associated with this client.

429

430 See

431 https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets/list

432

433 Args:

434 project (Optional[str]):

435 Project ID to use for retreiving datasets. Defaults to the

436 client's project.

437 include_all (Optional[bool]):

438 True if results include hidden datasets. Defaults to False.

439 filter (Optional[str]):

440 An expression for filtering the results by label.

441 For syntax, see

442 https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets/list#body.QUERY_PARAMETERS.filter

443 max_results (Optional[int]):

444 Maximum number of datasets to return.

445 page_token (Optional[str]):

446 Token representing a cursor into the datasets. If not passed,

447 the API will return the first page of datasets. The token marks

448 the beginning of the iterator to be returned and the value of

449 the ``page_token`` can be accessed at ``next_page_token`` of the

450 :class:`~google.api_core.page_iterator.HTTPIterator`.

451 retry (Optional[google.api_core.retry.Retry]):

452 How to retry the RPC.

453 timeout (Optional[float]):

454 The number of seconds to wait for the underlying HTTP transport

455 before using ``retry``.

456 page_size (Optional[int]):

457 Maximum number of datasets to return per page.

458

459 Returns:

460 google.api_core.page_iterator.Iterator:

461 Iterator of :class:`~google.cloud.bigquery.dataset.DatasetListItem`.

462 associated with the project.

463 """

464 extra_params: Dict[str, Any] = {}

465 if project is None:

466 project = self.project

467 if include_all:

468 extra_params["all"] = True

469 if filter:

470 # TODO: consider supporting a dict of label -> value for filter,

471 # and converting it into a string here.

472 extra_params["filter"] = filter

473 path = "/projects/%s/datasets" % (project,)

474

475 span_attributes = {"path": path}

476

477 def api_request(*args, **kwargs):

478

479 return self._call_api(

480 retry,

481 span_name="BigQuery.listDatasets",

482 span_attributes=span_attributes,

483 *args,

484 timeout=timeout,

485 **kwargs,

486 )

487

488 return page_iterator.HTTPIterator(

489 client=self,

490 api_request=api_request,

491 path=path,

492 item_to_value=_item_to_dataset,

493 items_key="datasets",

494 page_token=page_token,

495 max_results=max_results,

496 extra_params=extra_params,

497 page_size=page_size,

498 )

499

500 def dataset(self, dataset_id: str, project: str = None) -> DatasetReference:

501 """Deprecated: Construct a reference to a dataset.

502

503 .. deprecated:: 1.24.0

504 Construct a

505 :class:`~google.cloud.bigquery.dataset.DatasetReference` using its

506 constructor or use a string where previously a reference object

507 was used.

508

509 As of ``google-cloud-bigquery`` version 1.7.0, all client methods

510 that take a

511 :class:`~google.cloud.bigquery.dataset.DatasetReference` or

512 :class:`~google.cloud.bigquery.table.TableReference` also take a

513 string in standard SQL format, e.g. ``project.dataset_id`` or

514 ``project.dataset_id.table_id``.

515

516 Args:

517 dataset_id (str): ID of the dataset.

518

519 project (Optional[str]):

520 Project ID for the dataset (defaults to the project of the client).

521

522 Returns:

523 google.cloud.bigquery.dataset.DatasetReference:

524 a new ``DatasetReference`` instance.

525 """

526 if project is None:

527 project = self.project

528

529 warnings.warn(

530 "Client.dataset is deprecated and will be removed in a future version. "

531 "Use a string like 'my_project.my_dataset' or a "

532 "cloud.google.bigquery.DatasetReference object, instead.",

533 PendingDeprecationWarning,

534 stacklevel=2,

535 )

536 return DatasetReference(project, dataset_id)

537

538 def _ensure_bqstorage_client(

539 self,

540 bqstorage_client: Optional[

541 "google.cloud.bigquery_storage.BigQueryReadClient"

542 ] = None,

543 client_options: Optional[google.api_core.client_options.ClientOptions] = None,

544 client_info: Optional[

545 "google.api_core.gapic_v1.client_info.ClientInfo"

546 ] = DEFAULT_BQSTORAGE_CLIENT_INFO,

547 ) -> Optional["google.cloud.bigquery_storage.BigQueryReadClient"]:

548 """Create a BigQuery Storage API client using this client's credentials.

549

550 Args:

551 bqstorage_client:

552 An existing BigQuery Storage client instance. If ``None``, a new

553 instance is created and returned.

554 client_options:

555 Custom options used with a new BigQuery Storage client instance if one

556 is created.

557 client_info:

558 The client info used with a new BigQuery Storage client instance if one

559 is created.

560

561 Returns:

562 A BigQuery Storage API client.

563 """

564 try:

565 from google.cloud import bigquery_storage # type: ignore

566 except ImportError:

567 warnings.warn(

568 "Cannot create BigQuery Storage client, the dependency "

569 "google-cloud-bigquery-storage is not installed."

570 )

571 return None

572

573 try:

574 BQ_STORAGE_VERSIONS.verify_version()

575 except LegacyBigQueryStorageError as exc:

576 warnings.warn(str(exc))

577 return None

578 if bqstorage_client is None:

579 bqstorage_client = bigquery_storage.BigQueryReadClient(

580 credentials=self._credentials,

581 client_options=client_options,

582 client_info=client_info, # type: ignore # (None is also accepted)

583 )

584

585 return bqstorage_client

586

587 def _dataset_from_arg(self, dataset) -> Union[Dataset, DatasetReference]:

588 if isinstance(dataset, str):

589 dataset = DatasetReference.from_string(

590 dataset, default_project=self.project

591 )

592

593 if not isinstance(dataset, (Dataset, DatasetReference)):

594 if isinstance(dataset, DatasetListItem):

595 dataset = dataset.reference

596 else:

597 raise TypeError(

598 "dataset must be a Dataset, DatasetReference, DatasetListItem,"

599 " or string"

600 )

601 return dataset

602

603 def create_dataset(

604 self,

605 dataset: Union[str, Dataset, DatasetReference, DatasetListItem],

606 exists_ok: bool = False,

607 retry: retries.Retry = DEFAULT_RETRY,

608 timeout: TimeoutType = DEFAULT_TIMEOUT,

609 ) -> Dataset:

610 """API call: create the dataset via a POST request.

611

612 See

613 https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets/insert

614

615 Args:

616 dataset (Union[ \

617 google.cloud.bigquery.dataset.Dataset, \

618 google.cloud.bigquery.dataset.DatasetReference, \

619 google.cloud.bigquery.dataset.DatasetListItem, \

620 str, \

621 ]):

622 A :class:`~google.cloud.bigquery.dataset.Dataset` to create.

623 If ``dataset`` is a reference, an empty dataset is created

624 with the specified ID and client's default location.

625 exists_ok (Optional[bool]):

626 Defaults to ``False``. If ``True``, ignore "already exists"

627 errors when creating the dataset.

628 retry (Optional[google.api_core.retry.Retry]):

629 How to retry the RPC.

630 timeout (Optional[float]):

631 The number of seconds to wait for the underlying HTTP transport

632 before using ``retry``.

633

634 Returns:

635 google.cloud.bigquery.dataset.Dataset:

636 A new ``Dataset`` returned from the API.

637

638 Raises:

639 google.cloud.exceptions.Conflict:

640 If the dataset already exists.

641

642 Example:

643

644 >>> from google.cloud import bigquery

645 >>> client = bigquery.Client()

646 >>> dataset = bigquery.Dataset('my_project.my_dataset')

647 >>> dataset = client.create_dataset(dataset)

648

649 """

650 dataset = self._dataset_from_arg(dataset)

651 if isinstance(dataset, DatasetReference):

652 dataset = Dataset(dataset)

653

654 path = "/projects/%s/datasets" % (dataset.project,)

655

656 data = dataset.to_api_repr()

657 if data.get("location") is None and self.location is not None:

658 data["location"] = self.location

659

660 try:

661 span_attributes = {"path": path}

662

663 api_response = self._call_api(

664 retry,

665 span_name="BigQuery.createDataset",

666 span_attributes=span_attributes,

667 method="POST",

668 path=path,

669 data=data,

670 timeout=timeout,

671 )

672 return Dataset.from_api_repr(api_response)

673 except core_exceptions.Conflict:

674 if not exists_ok:

675 raise

676 return self.get_dataset(dataset.reference, retry=retry)

677

678 def create_routine(

679 self,

680 routine: Routine,

681 exists_ok: bool = False,

682 retry: retries.Retry = DEFAULT_RETRY,

683 timeout: TimeoutType = DEFAULT_TIMEOUT,

684 ) -> Routine:

685 """[Beta] Create a routine via a POST request.

686

687 See

688 https://cloud.google.com/bigquery/docs/reference/rest/v2/routines/insert

689

690 Args:

691 routine (google.cloud.bigquery.routine.Routine):

692 A :class:`~google.cloud.bigquery.routine.Routine` to create.

693 The dataset that the routine belongs to must already exist.

694 exists_ok (Optional[bool]):

695 Defaults to ``False``. If ``True``, ignore "already exists"

696 errors when creating the routine.

697 retry (Optional[google.api_core.retry.Retry]):

698 How to retry the RPC.

699 timeout (Optional[float]):

700 The number of seconds to wait for the underlying HTTP transport

701 before using ``retry``.

702

703 Returns:

704 google.cloud.bigquery.routine.Routine:

705 A new ``Routine`` returned from the service.

706

707 Raises:

708 google.cloud.exceptions.Conflict:

709 If the routine already exists.

710 """

711 reference = routine.reference

712 path = "/projects/{}/datasets/{}/routines".format(

713 reference.project, reference.dataset_id

714 )

715 resource = routine.to_api_repr()

716 try:

717 span_attributes = {"path": path}

718 api_response = self._call_api(

719 retry,

720 span_name="BigQuery.createRoutine",

721 span_attributes=span_attributes,

722 method="POST",

723 path=path,

724 data=resource,

725 timeout=timeout,

726 )

727 return Routine.from_api_repr(api_response)

728 except core_exceptions.Conflict:

729 if not exists_ok:

730 raise

731 return self.get_routine(routine.reference, retry=retry)

732

733 def create_table(

734 self,

735 table: Union[str, Table, TableReference, TableListItem],

736 exists_ok: bool = False,

737 retry: retries.Retry = DEFAULT_RETRY,

738 timeout: TimeoutType = DEFAULT_TIMEOUT,

739 ) -> Table:

740 """API call: create a table via a PUT request

741

742 See

743 https://cloud.google.com/bigquery/docs/reference/rest/v2/tables/insert

744

745 Args:

746 table (Union[ \

747 google.cloud.bigquery.table.Table, \

748 google.cloud.bigquery.table.TableReference, \

749 google.cloud.bigquery.table.TableListItem, \

750 str, \

751 ]):

752 A :class:`~google.cloud.bigquery.table.Table` to create.

753 If ``table`` is a reference, an empty table is created

754 with the specified ID. The dataset that the table belongs to

755 must already exist.

756 exists_ok (Optional[bool]):

757 Defaults to ``False``. If ``True``, ignore "already exists"

758 errors when creating the table.

759 retry (Optional[google.api_core.retry.Retry]):

760 How to retry the RPC.

761 timeout (Optional[float]):

762 The number of seconds to wait for the underlying HTTP transport

763 before using ``retry``.

764

765 Returns:

766 google.cloud.bigquery.table.Table:

767 A new ``Table`` returned from the service.

768

769 Raises:

770 google.cloud.exceptions.Conflict:

771 If the table already exists.

772 """

773 table = _table_arg_to_table(table, default_project=self.project)

774 dataset_id = table.dataset_id

775 path = "/projects/%s/datasets/%s/tables" % (table.project, dataset_id)

776 data = table.to_api_repr()

777 try:

778 span_attributes = {"path": path, "dataset_id": dataset_id}

779 api_response = self._call_api(

780 retry,

781 span_name="BigQuery.createTable",

782 span_attributes=span_attributes,

783 method="POST",

784 path=path,

785 data=data,

786 timeout=timeout,

787 )

788 return Table.from_api_repr(api_response)

789 except core_exceptions.Conflict:

790 if not exists_ok:

791 raise

792 return self.get_table(table.reference, retry=retry)

793

794 def _call_api(

795 self,

796 retry,

797 span_name=None,

798 span_attributes=None,

799 job_ref=None,

800 headers: Optional[Dict[str, str]] = None,

801 **kwargs,

802 ):

803 kwargs = _add_server_timeout_header(headers, kwargs)

804 call = functools.partial(self._connection.api_request, **kwargs)

805

806 if retry:

807 call = retry(call)

808

809 if span_name is not None:

810 with create_span(

811 name=span_name, attributes=span_attributes, client=self, job_ref=job_ref

812 ):

813 return call()

814

815 return call()

816

817 def get_dataset(

818 self,

819 dataset_ref: Union[DatasetReference, str],

820 retry: retries.Retry = DEFAULT_RETRY,

821 timeout: TimeoutType = DEFAULT_TIMEOUT,

822 ) -> Dataset:

823 """Fetch the dataset referenced by ``dataset_ref``

824

825 Args:

826 dataset_ref (Union[ \

827 google.cloud.bigquery.dataset.DatasetReference, \

828 str, \

829 ]):

830 A reference to the dataset to fetch from the BigQuery API.

831 If a string is passed in, this method attempts to create a

832 dataset reference from a string using

833 :func:`~google.cloud.bigquery.dataset.DatasetReference.from_string`.

834 retry (Optional[google.api_core.retry.Retry]):

835 How to retry the RPC.

836 timeout (Optional[float]):

837 The number of seconds to wait for the underlying HTTP transport

838 before using ``retry``.

839

840 Returns:

841 google.cloud.bigquery.dataset.Dataset:

842 A ``Dataset`` instance.

843 """

844 if isinstance(dataset_ref, str):

845 dataset_ref = DatasetReference.from_string(

846 dataset_ref, default_project=self.project

847 )

848 path = dataset_ref.path

849 span_attributes = {"path": path}

850 api_response = self._call_api(

851 retry,

852 span_name="BigQuery.getDataset",

853 span_attributes=span_attributes,

854 method="GET",

855 path=path,

856 timeout=timeout,

857 )

858 return Dataset.from_api_repr(api_response)

859

860 def get_iam_policy(

861 self,

862 table: Union[Table, TableReference, TableListItem, str],

863 requested_policy_version: int = 1,

864 retry: retries.Retry = DEFAULT_RETRY,

865 timeout: TimeoutType = DEFAULT_TIMEOUT,

866 ) -> Policy:

867 table = _table_arg_to_table_ref(table, default_project=self.project)

868

869 if requested_policy_version != 1:

870 raise ValueError("only IAM policy version 1 is supported")

871

872 body = {"options": {"requestedPolicyVersion": 1}}

873

874 path = "{}:getIamPolicy".format(table.path)

875 span_attributes = {"path": path}

876 response = self._call_api(

877 retry,

878 span_name="BigQuery.getIamPolicy",

879 span_attributes=span_attributes,

880 method="POST",

881 path=path,

882 data=body,

883 timeout=timeout,

884 )

885

886 return Policy.from_api_repr(response)

887

888 def set_iam_policy(

889 self,

890 table: Union[Table, TableReference, TableListItem, str],

891 policy: Policy,

892 updateMask: str = None,

893 retry: retries.Retry = DEFAULT_RETRY,

894 timeout: TimeoutType = DEFAULT_TIMEOUT,

895 ) -> Policy:

896 table = _table_arg_to_table_ref(table, default_project=self.project)

897

898 if not isinstance(policy, (Policy)):

899 raise TypeError("policy must be a Policy")

900

901 body = {"policy": policy.to_api_repr()}

902

903 if updateMask is not None:

904 body["updateMask"] = updateMask

905

906 path = "{}:setIamPolicy".format(table.path)

907 span_attributes = {"path": path}

908

909 response = self._call_api(

910 retry,

911 span_name="BigQuery.setIamPolicy",

912 span_attributes=span_attributes,

913 method="POST",

914 path=path,

915 data=body,

916 timeout=timeout,

917 )

918

919 return Policy.from_api_repr(response)

920

921 def test_iam_permissions(

922 self,

923 table: Union[Table, TableReference, TableListItem, str],

924 permissions: Sequence[str],

925 retry: retries.Retry = DEFAULT_RETRY,

926 timeout: TimeoutType = DEFAULT_TIMEOUT,

927 ) -> Dict[str, Any]:

928 table = _table_arg_to_table_ref(table, default_project=self.project)

929

930 body = {"permissions": permissions}

931

932 path = "{}:testIamPermissions".format(table.path)

933 span_attributes = {"path": path}

934 response = self._call_api(

935 retry,

936 span_name="BigQuery.testIamPermissions",

937 span_attributes=span_attributes,

938 method="POST",

939 path=path,

940 data=body,

941 timeout=timeout,

942 )

943

944 return response

945

946 def get_model(

947 self,

948 model_ref: Union[ModelReference, str],

949 retry: retries.Retry = DEFAULT_RETRY,

950 timeout: TimeoutType = DEFAULT_TIMEOUT,

951 ) -> Model:

952 """[Beta] Fetch the model referenced by ``model_ref``.

953

954 Args:

955 model_ref (Union[ \

956 google.cloud.bigquery.model.ModelReference, \

957 str, \

958 ]):

959 A reference to the model to fetch from the BigQuery API.

960 If a string is passed in, this method attempts to create a

961 model reference from a string using

962 :func:`google.cloud.bigquery.model.ModelReference.from_string`.

963 retry (Optional[google.api_core.retry.Retry]):

964 How to retry the RPC.

965 timeout (Optional[float]):

966 The number of seconds to wait for the underlying HTTP transport

967 before using ``retry``.

968

969 Returns:

970 google.cloud.bigquery.model.Model: A ``Model`` instance.

971 """

972 if isinstance(model_ref, str):

973 model_ref = ModelReference.from_string(

974 model_ref, default_project=self.project

975 )

976 path = model_ref.path

977 span_attributes = {"path": path}

978

979 api_response = self._call_api(

980 retry,

981 span_name="BigQuery.getModel",

982 span_attributes=span_attributes,

983 method="GET",

984 path=path,

985 timeout=timeout,

986 )

987 return Model.from_api_repr(api_response)

988

989 def get_routine(

990 self,

991 routine_ref: Union[Routine, RoutineReference, str],

992 retry: retries.Retry = DEFAULT_RETRY,

993 timeout: TimeoutType = DEFAULT_TIMEOUT,

994 ) -> Routine:

995 """[Beta] Get the routine referenced by ``routine_ref``.

996

997 Args:

998 routine_ref (Union[ \

999 google.cloud.bigquery.routine.Routine, \

1000 google.cloud.bigquery.routine.RoutineReference, \

1001 str, \

1002 ]):

1003 A reference to the routine to fetch from the BigQuery API. If

1004 a string is passed in, this method attempts to create a

1005 reference from a string using

1006 :func:`google.cloud.bigquery.routine.RoutineReference.from_string`.

1007 retry (Optional[google.api_core.retry.Retry]):

1008 How to retry the API call.

1009 timeout (Optional[float]):

1010 The number of seconds to wait for the underlying HTTP transport

1011 before using ``retry``.

1012

1013 Returns:

1014 google.cloud.bigquery.routine.Routine:

1015 A ``Routine`` instance.

1016 """

1017 if isinstance(routine_ref, str):

1018 routine_ref = RoutineReference.from_string(

1019 routine_ref, default_project=self.project

1020 )

1021 path = routine_ref.path

1022 span_attributes = {"path": path}

1023 api_response = self._call_api(

1024 retry,

1025 span_name="BigQuery.getRoutine",

1026 span_attributes=span_attributes,

1027 method="GET",

1028 path=path,

1029 timeout=timeout,

1030 )

1031 return Routine.from_api_repr(api_response)

1032

1033 def get_table(

1034 self,

1035 table: Union[Table, TableReference, TableListItem, str],

1036 retry: retries.Retry = DEFAULT_RETRY,

1037 timeout: TimeoutType = DEFAULT_TIMEOUT,

1038 ) -> Table:

1039 """Fetch the table referenced by ``table``.

1040

1041 Args:

1042 table (Union[ \

1043 google.cloud.bigquery.table.Table, \

1044 google.cloud.bigquery.table.TableReference, \

1045 google.cloud.bigquery.table.TableListItem, \

1046 str, \

1047 ]):

1048 A reference to the table to fetch from the BigQuery API.

1049 If a string is passed in, this method attempts to create a

1050 table reference from a string using

1051 :func:`google.cloud.bigquery.table.TableReference.from_string`.

1052 retry (Optional[google.api_core.retry.Retry]):

1053 How to retry the RPC.

1054 timeout (Optional[float]):

1055 The number of seconds to wait for the underlying HTTP transport

1056 before using ``retry``.

1057

1058 Returns:

1059 google.cloud.bigquery.table.Table:

1060 A ``Table`` instance.

1061 """

1062 table_ref = _table_arg_to_table_ref(table, default_project=self.project)

1063 path = table_ref.path

1064 span_attributes = {"path": path}

1065 api_response = self._call_api(

1066 retry,

1067 span_name="BigQuery.getTable",

1068 span_attributes=span_attributes,

1069 method="GET",

1070 path=path,

1071 timeout=timeout,

1072 )

1073 return Table.from_api_repr(api_response)

1074

1075 def update_dataset(

1076 self,

1077 dataset: Dataset,

1078 fields: Sequence[str],

1079 retry: retries.Retry = DEFAULT_RETRY,

1080 timeout: TimeoutType = DEFAULT_TIMEOUT,

1081 ) -> Dataset:

1082 """Change some fields of a dataset.

1083

1084 Use ``fields`` to specify which fields to update. At least one field

1085 must be provided. If a field is listed in ``fields`` and is ``None`` in

1086 ``dataset``, it will be deleted.

1087

1088 If ``dataset.etag`` is not ``None``, the update will only

1089 succeed if the dataset on the server has the same ETag. Thus

1090 reading a dataset with ``get_dataset``, changing its fields,

1091 and then passing it to ``update_dataset`` will ensure that the changes

1092 will only be saved if no modifications to the dataset occurred

1093 since the read.

1094

1095 Args:

1096 dataset (google.cloud.bigquery.dataset.Dataset):

1097 The dataset to update.

1098 fields (Sequence[str]):

1099 The properties of ``dataset`` to change. These are strings

1100 corresponding to the properties of

1101 :class:`~google.cloud.bigquery.dataset.Dataset`.

1102

1103 For example, to update the default expiration times, specify

1104 both properties in the ``fields`` argument:

1105

1106 .. code-block:: python

1107

1108 bigquery_client.update_dataset(

1109 dataset,

1110 [

1111 "default_partition_expiration_ms",

1112 "default_table_expiration_ms",

1113 ]

1114 )

1115 retry (Optional[google.api_core.retry.Retry]):

1116 How to retry the RPC.

1117 timeout (Optional[float]):

1118 The number of seconds to wait for the underlying HTTP transport

1119 before using ``retry``.

1120

1121 Returns:

1122 google.cloud.bigquery.dataset.Dataset:

1123 The modified ``Dataset`` instance.

1124 """

1125 partial = dataset._build_resource(fields)

1126 if dataset.etag is not None:

1127 headers: Optional[Dict[str, str]] = {"If-Match": dataset.etag}

1128 else:

1129 headers = None

1130 path = dataset.path

1131 span_attributes = {"path": path, "fields": fields}

1132

1133 api_response = self._call_api(

1134 retry,

1135 span_name="BigQuery.updateDataset",

1136 span_attributes=span_attributes,

1137 method="PATCH",

1138 path=path,

1139 data=partial,

1140 headers=headers,

1141 timeout=timeout,

1142 )

1143 return Dataset.from_api_repr(api_response)

1144

1145 def update_model(

1146 self,

1147 model: Model,

1148 fields: Sequence[str],

1149 retry: retries.Retry = DEFAULT_RETRY,

1150 timeout: TimeoutType = DEFAULT_TIMEOUT,

1151 ) -> Model:

1152 """[Beta] Change some fields of a model.

1153

1154 Use ``fields`` to specify which fields to update. At least one field

1155 must be provided. If a field is listed in ``fields`` and is ``None``

1156 in ``model``, the field value will be deleted.

1157

1158 If ``model.etag`` is not ``None``, the update will only succeed if

1159 the model on the server has the same ETag. Thus reading a model with

1160 ``get_model``, changing its fields, and then passing it to

1161 ``update_model`` will ensure that the changes will only be saved if

1162 no modifications to the model occurred since the read.

1163

1164 Args:

1165 model (google.cloud.bigquery.model.Model): The model to update.

1166 fields (Sequence[str]):

1167 The properties of ``model`` to change. These are strings

1168 corresponding to the properties of

1169 :class:`~google.cloud.bigquery.model.Model`.

1170

1171 For example, to update the descriptive properties of the model,

1172 specify them in the ``fields`` argument:

1173

1174 .. code-block:: python

1175

1176 bigquery_client.update_model(

1177 model, ["description", "friendly_name"]

1178 )

1179 retry (Optional[google.api_core.retry.Retry]):

1180 A description of how to retry the API call.

1181 timeout (Optional[float]):

1182 The number of seconds to wait for the underlying HTTP transport

1183 before using ``retry``.

1184

1185 Returns:

1186 google.cloud.bigquery.model.Model:

1187 The model resource returned from the API call.

1188 """

1189 partial = model._build_resource(fields)

1190 if model.etag:

1191 headers: Optional[Dict[str, str]] = {"If-Match": model.etag}

1192 else:

1193 headers = None

1194 path = model.path

1195 span_attributes = {"path": path, "fields": fields}

1196

1197 api_response = self._call_api(

1198 retry,

1199 span_name="BigQuery.updateModel",

1200 span_attributes=span_attributes,

1201 method="PATCH",

1202 path=path,

1203 data=partial,

1204 headers=headers,

1205 timeout=timeout,

1206 )

1207 return Model.from_api_repr(api_response)

1208

1209 def update_routine(

1210 self,

1211 routine: Routine,

1212 fields: Sequence[str],

1213 retry: retries.Retry = DEFAULT_RETRY,

1214 timeout: TimeoutType = DEFAULT_TIMEOUT,

1215 ) -> Routine:

1216 """[Beta] Change some fields of a routine.

1217

1218 Use ``fields`` to specify which fields to update. At least one field

1219 must be provided. If a field is listed in ``fields`` and is ``None``

1220 in ``routine``, the field value will be deleted.

1221

1222 .. warning::

1223 During beta, partial updates are not supported. You must provide

1224 all fields in the resource.

1225

1226 If :attr:`~google.cloud.bigquery.routine.Routine.etag` is not

1227 ``None``, the update will only succeed if the resource on the server

1228 has the same ETag. Thus reading a routine with

1229 :func:`~google.cloud.bigquery.client.Client.get_routine`, changing

1230 its fields, and then passing it to this method will ensure that the

1231 changes will only be saved if no modifications to the resource

1232 occurred since the read.

1233

1234 Args:

1235 routine (google.cloud.bigquery.routine.Routine):

1236 The routine to update.

1237 fields (Sequence[str]):

1238 The fields of ``routine`` to change, spelled as the

1239 :class:`~google.cloud.bigquery.routine.Routine` properties.

1240

1241 For example, to update the description property of the routine,

1242 specify it in the ``fields`` argument:

1243

1244 .. code-block:: python

1245

1246 bigquery_client.update_routine(

1247 routine, ["description"]

1248 )

1249 retry (Optional[google.api_core.retry.Retry]):

1250 A description of how to retry the API call.

1251 timeout (Optional[float]):

1252 The number of seconds to wait for the underlying HTTP transport

1253 before using ``retry``.

1254

1255 Returns:

1256 google.cloud.bigquery.routine.Routine:

1257 The routine resource returned from the API call.

1258 """

1259 partial = routine._build_resource(fields)

1260 if routine.etag:

1261 headers: Optional[Dict[str, str]] = {"If-Match": routine.etag}

1262 else:

1263 headers = None

1264

1265 # TODO: remove when routines update supports partial requests.

1266 partial["routineReference"] = routine.reference.to_api_repr()

1267

1268 path = routine.path

1269 span_attributes = {"path": path, "fields": fields}

1270

1271 api_response = self._call_api(

1272 retry,

1273 span_name="BigQuery.updateRoutine",

1274 span_attributes=span_attributes,

1275 method="PUT",

1276 path=path,

1277 data=partial,

1278 headers=headers,

1279 timeout=timeout,

1280 )

1281 return Routine.from_api_repr(api_response)

1282

1283 def update_table(

1284 self,

1285 table: Table,

1286 fields: Sequence[str],

1287 retry: retries.Retry = DEFAULT_RETRY,

1288 timeout: TimeoutType = DEFAULT_TIMEOUT,

1289 ) -> Table:

1290 """Change some fields of a table.

1291

1292 Use ``fields`` to specify which fields to update. At least one field

1293 must be provided. If a field is listed in ``fields`` and is ``None``

1294 in ``table``, the field value will be deleted.

1295

1296 If ``table.etag`` is not ``None``, the update will only succeed if

1297 the table on the server has the same ETag. Thus reading a table with

1298 ``get_table``, changing its fields, and then passing it to

1299 ``update_table`` will ensure that the changes will only be saved if

1300 no modifications to the table occurred since the read.

1301

1302 Args:

1303 table (google.cloud.bigquery.table.Table): The table to update.

1304 fields (Sequence[str]):

1305 The fields of ``table`` to change, spelled as the

1306 :class:`~google.cloud.bigquery.table.Table` properties.

1307

1308 For example, to update the descriptive properties of the table,

1309 specify them in the ``fields`` argument:

1310

1311 .. code-block:: python

1312

1313 bigquery_client.update_table(

1314 table,

1315 ["description", "friendly_name"]

1316 )

1317 retry (Optional[google.api_core.retry.Retry]):

1318 A description of how to retry the API call.

1319 timeout (Optional[float]):

1320 The number of seconds to wait for the underlying HTTP transport

1321 before using ``retry``.

1322

1323 Returns:

1324 google.cloud.bigquery.table.Table:

1325 The table resource returned from the API call.

1326 """

1327 partial = table._build_resource(fields)

1328 if table.etag is not None:

1329 headers: Optional[Dict[str, str]] = {"If-Match": table.etag}

1330 else:

1331 headers = None

1332

1333 path = table.path

1334 span_attributes = {"path": path, "fields": fields}

1335

1336 api_response = self._call_api(

1337 retry,

1338 span_name="BigQuery.updateTable",

1339 span_attributes=span_attributes,

1340 method="PATCH",

1341 path=path,

1342 data=partial,

1343 headers=headers,

1344 timeout=timeout,

1345 )

1346 return Table.from_api_repr(api_response)

1347

1348 def list_models(

1349 self,

1350 dataset: Union[Dataset, DatasetReference, DatasetListItem, str],

1351 max_results: Optional[int] = None,

1352 page_token: str = None,

1353 retry: retries.Retry = DEFAULT_RETRY,

1354 timeout: TimeoutType = DEFAULT_TIMEOUT,

1355 page_size: Optional[int] = None,

1356 ) -> page_iterator.Iterator:

1357 """[Beta] List models in the dataset.

1358

1359 See

1360 https://cloud.google.com/bigquery/docs/reference/rest/v2/models/list

1361

1362 Args:

1363 dataset (Union[ \

1364 google.cloud.bigquery.dataset.Dataset, \

1365 google.cloud.bigquery.dataset.DatasetReference, \

1366 google.cloud.bigquery.dataset.DatasetListItem, \

1367 str, \

1368 ]):

1369 A reference to the dataset whose models to list from the

1370 BigQuery API. If a string is passed in, this method attempts

1371 to create a dataset reference from a string using

1372 :func:`google.cloud.bigquery.dataset.DatasetReference.from_string`.

1373 max_results (Optional[int]):

1374 Maximum number of models to return. Defaults to a

1375 value set by the API.

1376 page_token (Optional[str]):

1377 Token representing a cursor into the models. If not passed,

1378 the API will return the first page of models. The token marks

1379 the beginning of the iterator to be returned and the value of

1380 the ``page_token`` can be accessed at ``next_page_token`` of the

1381 :class:`~google.api_core.page_iterator.HTTPIterator`.

1382 retry (Optional[google.api_core.retry.Retry]):

1383 How to retry the RPC.

1384 timeout (Optional[float]):

1385 The number of seconds to wait for the underlying HTTP transport

1386 before using ``retry``.

1387 page_size (Optional[int]):

1388 Maximum number of models to return per page.

1389 Defaults to a value set by the API.

1390

1391 Returns:

1392 google.api_core.page_iterator.Iterator:

1393 Iterator of

1394 :class:`~google.cloud.bigquery.model.Model` contained

1395 within the requested dataset.

1396 """

1397 dataset = self._dataset_from_arg(dataset)

1398

1399 path = "%s/models" % dataset.path

1400 span_attributes = {"path": path}

1401

1402 def api_request(*args, **kwargs):

1403 return self._call_api(

1404 retry,

1405 span_name="BigQuery.listModels",

1406 span_attributes=span_attributes,

1407 *args,

1408 timeout=timeout,

1409 **kwargs,

1410 )

1411

1412 result = page_iterator.HTTPIterator(

1413 client=self,

1414 api_request=api_request,

1415 path=path,

1416 item_to_value=_item_to_model,

1417 items_key="models",

1418 page_token=page_token,

1419 max_results=max_results,

1420 page_size=page_size,

1421 )

1422 result.dataset = dataset # type: ignore

1423 return result

1424

1425 def list_routines(

1426 self,

1427 dataset: Union[Dataset, DatasetReference, DatasetListItem, str],

1428 max_results: Optional[int] = None,

1429 page_token: str = None,

1430 retry: retries.Retry = DEFAULT_RETRY,

1431 timeout: TimeoutType = DEFAULT_TIMEOUT,

1432 page_size: Optional[int] = None,

1433 ) -> page_iterator.Iterator:

1434 """[Beta] List routines in the dataset.

1435

1436 See

1437 https://cloud.google.com/bigquery/docs/reference/rest/v2/routines/list

1438

1439 Args:

1440 dataset (Union[ \

1441 google.cloud.bigquery.dataset.Dataset, \

1442 google.cloud.bigquery.dataset.DatasetReference, \

1443 google.cloud.bigquery.dataset.DatasetListItem, \

1444 str, \

1445 ]):

1446 A reference to the dataset whose routines to list from the

1447 BigQuery API. If a string is passed in, this method attempts

1448 to create a dataset reference from a string using

1449 :func:`google.cloud.bigquery.dataset.DatasetReference.from_string`.

1450 max_results (Optional[int]):

1451 Maximum number of routines to return. Defaults

1452 to a value set by the API.

1453 page_token (Optional[str]):

1454 Token representing a cursor into the routines. If not passed,

1455 the API will return the first page of routines. The token marks

1456 the beginning of the iterator to be returned and the value of the

1457 ``page_token`` can be accessed at ``next_page_token`` of the

1458 :class:`~google.api_core.page_iterator.HTTPIterator`.

1459 retry (Optional[google.api_core.retry.Retry]):

1460 How to retry the RPC.

1461 timeout (Optional[float]):

1462 The number of seconds to wait for the underlying HTTP transport

1463 before using ``retry``.

1464 page_size (Optional[int]):

1465 Maximum number of routines to return per page.

1466 Defaults to a value set by the API.

1467

1468 Returns:

1469 google.api_core.page_iterator.Iterator:

1470 Iterator of all

1471 :class:`~google.cloud.bigquery.routine.Routine`s contained

1472 within the requested dataset, limited by ``max_results``.

1473 """

1474 dataset = self._dataset_from_arg(dataset)

1475 path = "{}/routines".format(dataset.path)

1476

1477 span_attributes = {"path": path}

1478

1479 def api_request(*args, **kwargs):

1480 return self._call_api(

1481 retry,

1482 span_name="BigQuery.listRoutines",

1483 span_attributes=span_attributes,

1484 *args,

1485 timeout=timeout,

1486 **kwargs,

1487 )

1488

1489 result = page_iterator.HTTPIterator(

1490 client=self,

1491 api_request=api_request,

1492 path=path,

1493 item_to_value=_item_to_routine,

1494 items_key="routines",

1495 page_token=page_token,

1496 max_results=max_results,

1497 page_size=page_size,

1498 )

1499 result.dataset = dataset # type: ignore

1500 return result

1501

1502 def list_tables(

1503 self,

1504 dataset: Union[Dataset, DatasetReference, DatasetListItem, str],

1505 max_results: Optional[int] = None,

1506 page_token: str = None,

1507 retry: retries.Retry = DEFAULT_RETRY,

1508 timeout: TimeoutType = DEFAULT_TIMEOUT,

1509 page_size: Optional[int] = None,

1510 ) -> page_iterator.Iterator:

1511 """List tables in the dataset.

1512

1513 See

1514 https://cloud.google.com/bigquery/docs/reference/rest/v2/tables/list

1515

1516 Args:

1517 dataset (Union[ \

1518 google.cloud.bigquery.dataset.Dataset, \

1519 google.cloud.bigquery.dataset.DatasetReference, \

1520 google.cloud.bigquery.dataset.DatasetListItem, \

1521 str, \

1522 ]):

1523 A reference to the dataset whose tables to list from the

1524 BigQuery API. If a string is passed in, this method attempts

1525 to create a dataset reference from a string using

1526 :func:`google.cloud.bigquery.dataset.DatasetReference.from_string`.

1527 max_results (Optional[int]):

1528 Maximum number of tables to return. Defaults

1529 to a value set by the API.

1530 page_token (Optional[str]):

1531 Token representing a cursor into the tables. If not passed,

1532 the API will return the first page of tables. The token marks

1533 the beginning of the iterator to be returned and the value of

1534 the ``page_token`` can be accessed at ``next_page_token`` of the

1535 :class:`~google.api_core.page_iterator.HTTPIterator`.

1536 retry (Optional[google.api_core.retry.Retry]):

1537 How to retry the RPC.

1538 timeout (Optional[float]):

1539 The number of seconds to wait for the underlying HTTP transport

1540 before using ``retry``.

1541 page_size (Optional[int]):

1542 Maximum number of tables to return per page.

1543 Defaults to a value set by the API.

1544

1545 Returns:

1546 google.api_core.page_iterator.Iterator:

1547 Iterator of

1548 :class:`~google.cloud.bigquery.table.TableListItem` contained

1549 within the requested dataset.

1550 """

1551 dataset = self._dataset_from_arg(dataset)

1552 path = "%s/tables" % dataset.path

1553 span_attributes = {"path": path}

1554

1555 def api_request(*args, **kwargs):

1556 return self._call_api(

1557 retry,

1558 span_name="BigQuery.listTables",

1559 span_attributes=span_attributes,

1560 *args,

1561 timeout=timeout,

1562 **kwargs,

1563 )

1564

1565 result = page_iterator.HTTPIterator(

1566 client=self,

1567 api_request=api_request,

1568 path=path,

1569 item_to_value=_item_to_table,

1570 items_key="tables",

1571 page_token=page_token,

1572 max_results=max_results,

1573 page_size=page_size,

1574 )

1575 result.dataset = dataset # type: ignore

1576 return result

1577

1578 def delete_dataset(

1579 self,

1580 dataset: Union[Dataset, DatasetReference, DatasetListItem, str],

1581 delete_contents: bool = False,

1582 retry: retries.Retry = DEFAULT_RETRY,

1583 timeout: TimeoutType = DEFAULT_TIMEOUT,

1584 not_found_ok: bool = False,

1585 ) -> None:

1586 """Delete a dataset.

1587

1588 See

1589 https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets/delete

1590

1591 Args:

1592 dataset (Union[ \

1593 google.cloud.bigquery.dataset.Dataset, \

1594 google.cloud.bigquery.dataset.DatasetReference, \

1595 google.cloud.bigquery.dataset.DatasetListItem, \

1596 str, \

1597 ]):

1598 A reference to the dataset to delete. If a string is passed

1599 in, this method attempts to create a dataset reference from a

1600 string using

1601 :func:`google.cloud.bigquery.dataset.DatasetReference.from_string`.

1602 delete_contents (Optional[bool]):

1603 If True, delete all the tables in the dataset. If False and

1604 the dataset contains tables, the request will fail.

1605 Default is False.

1606 retry (Optional[google.api_core.retry.Retry]):

1607 How to retry the RPC.

1608 timeout (Optional[float]):

1609 The number of seconds to wait for the underlying HTTP transport

1610 before using ``retry``.

1611 not_found_ok (Optional[bool]):

1612 Defaults to ``False``. If ``True``, ignore "not found" errors

1613 when deleting the dataset.

1614 """

1615 dataset = self._dataset_from_arg(dataset)

1616 params = {}

1617 path = dataset.path

1618 if delete_contents:

1619 params["deleteContents"] = "true"

1620 span_attributes = {"path": path, "deleteContents": delete_contents}

1621 else:

1622 span_attributes = {"path": path}

1623

1624 try:

1625 self._call_api(

1626 retry,

1627 span_name="BigQuery.deleteDataset",

1628 span_attributes=span_attributes,

1629 method="DELETE",

1630 path=path,

1631 query_params=params,

1632 timeout=timeout,

1633 )

1634 except core_exceptions.NotFound:

1635 if not not_found_ok:

1636 raise

1637

1638 def delete_model(

1639 self,

1640 model: Union[Model, ModelReference, str],

1641 retry: retries.Retry = DEFAULT_RETRY,

1642 timeout: TimeoutType = DEFAULT_TIMEOUT,

1643 not_found_ok: bool = False,

1644 ) -> None:

1645 """[Beta] Delete a model

1646

1647 See

1648 https://cloud.google.com/bigquery/docs/reference/rest/v2/models/delete

1649

1650 Args:

1651 model (Union[ \

1652 google.cloud.bigquery.model.Model, \

1653 google.cloud.bigquery.model.ModelReference, \

1654 str, \

1655 ]):

1656 A reference to the model to delete. If a string is passed in,

1657 this method attempts to create a model reference from a

1658 string using

1659 :func:`google.cloud.bigquery.model.ModelReference.from_string`.

1660 retry (Optional[google.api_core.retry.Retry]):

1661 How to retry the RPC.

1662 timeout (Optional[float]):

1663 The number of seconds to wait for the underlying HTTP transport

1664 before using ``retry``.

1665 not_found_ok (Optional[bool]):

1666 Defaults to ``False``. If ``True``, ignore "not found" errors

1667 when deleting the model.

1668 """

1669 if isinstance(model, str):

1670 model = ModelReference.from_string(model, default_project=self.project)

1671

1672 if not isinstance(model, (Model, ModelReference)):

1673 raise TypeError("model must be a Model or a ModelReference")

1674

1675 path = model.path

1676 try:

1677 span_attributes = {"path": path}

1678 self._call_api(

1679 retry,

1680 span_name="BigQuery.deleteModel",

1681 span_attributes=span_attributes,

1682 method="DELETE",

1683 path=path,

1684 timeout=timeout,

1685 )

1686 except core_exceptions.NotFound:

1687 if not not_found_ok:

1688 raise

1689

1690 def delete_job_metadata(

1691 self,

1692 job_id: Union[str, LoadJob, CopyJob, ExtractJob, QueryJob],

1693 project: Optional[str] = None,

1694 location: Optional[str] = None,

1695 retry: retries.Retry = DEFAULT_RETRY,

1696 timeout: TimeoutType = DEFAULT_TIMEOUT,

1697 not_found_ok: bool = False,

1698 ):

1699 """[Beta] Delete job metadata from job history.

1700

1701 Note: This does not stop a running job. Use

1702 :func:`~google.cloud.bigquery.client.Client.cancel_job` instead.

1703

1704 Args:

1705 job_id: Job or job identifier.

1706

1707 Keyword Arguments:

1708 project:

1709 ID of the project which owns the job (defaults to the client's project).

1710 location:

1711 Location where the job was run. Ignored if ``job_id`` is a job

1712 object.

1713 retry:

1714 How to retry the RPC.

1715 timeout:

1716 The number of seconds to wait for the underlying HTTP transport

1717 before using ``retry``.

1718 not_found_ok:

1719 Defaults to ``False``. If ``True``, ignore "not found" errors

1720 when deleting the job.

1721 """

1722 extra_params = {}

1723

1724 project, location, job_id = _extract_job_reference(

1725 job_id, project=project, location=location

1726 )

1727

1728 if project is None:

1729 project = self.project

1730

1731 if location is None:

1732 location = self.location

1733

1734 # Location is always required for jobs.delete()

1735 extra_params["location"] = location

1736

1737 path = f"/projects/{project}/jobs/{job_id}/delete"

1738

1739 span_attributes = {"path": path, "job_id": job_id, "location": location}

1740

1741 try:

1742 self._call_api(

1743 retry,

1744 span_name="BigQuery.deleteJob",

1745 span_attributes=span_attributes,

1746 method="DELETE",

1747 path=path,

1748 query_params=extra_params,

1749 timeout=timeout,

1750 )

1751 except google.api_core.exceptions.NotFound:

1752 if not not_found_ok:

1753 raise

1754

1755 def delete_routine(

1756 self,

1757 routine: Union[Routine, RoutineReference, str],

1758 retry: retries.Retry = DEFAULT_RETRY,

1759 timeout: TimeoutType = DEFAULT_TIMEOUT,

1760 not_found_ok: bool = False,

1761 ) -> None:

1762 """[Beta] Delete a routine.

1763

1764 See

1765 https://cloud.google.com/bigquery/docs/reference/rest/v2/routines/delete

1766

1767 Args:

1768 routine (Union[ \

1769 google.cloud.bigquery.routine.Routine, \

1770 google.cloud.bigquery.routine.RoutineReference, \

1771 str, \

1772 ]):

1773 A reference to the routine to delete. If a string is passed

1774 in, this method attempts to create a routine reference from a

1775 string using

1776 :func:`google.cloud.bigquery.routine.RoutineReference.from_string`.

1777 retry (Optional[google.api_core.retry.Retry]):

1778 How to retry the RPC.

1779 timeout (Optional[float]):

1780 The number of seconds to wait for the underlying HTTP transport

1781 before using ``retry``.

1782 not_found_ok (Optional[bool]):

1783 Defaults to ``False``. If ``True``, ignore "not found" errors

1784 when deleting the routine.

1785 """

1786 if isinstance(routine, str):

1787 routine = RoutineReference.from_string(

1788 routine, default_project=self.project

1789 )

1790 path = routine.path

1791

1792 if not isinstance(routine, (Routine, RoutineReference)):

1793 raise TypeError("routine must be a Routine or a RoutineReference")

1794

1795 try:

1796 span_attributes = {"path": path}

1797 self._call_api(

1798 retry,

1799 span_name="BigQuery.deleteRoutine",

1800 span_attributes=span_attributes,

1801 method="DELETE",

1802 path=path,

1803 timeout=timeout,

1804 )

1805 except core_exceptions.NotFound:

1806 if not not_found_ok:

1807 raise

1808

1809 def delete_table(

1810 self,

1811 table: Union[Table, TableReference, TableListItem, str],

1812 retry: retries.Retry = DEFAULT_RETRY,

1813 timeout: TimeoutType = DEFAULT_TIMEOUT,

1814 not_found_ok: bool = False,

1815 ) -> None:

1816 """Delete a table

1817

1818 See

1819 https://cloud.google.com/bigquery/docs/reference/rest/v2/tables/delete

1820

1821 Args:

1822 table (Union[ \

1823 google.cloud.bigquery.table.Table, \

1824 google.cloud.bigquery.table.TableReference, \

1825 google.cloud.bigquery.table.TableListItem, \

1826 str, \

1827 ]):

1828 A reference to the table to delete. If a string is passed in,

1829 this method attempts to create a table reference from a

1830 string using

1831 :func:`google.cloud.bigquery.table.TableReference.from_string`.

1832 retry (Optional[google.api_core.retry.Retry]):

1833 How to retry the RPC.

1834 timeout (Optional[float]):

1835 The number of seconds to wait for the underlying HTTP transport

1836 before using ``retry``.

1837 not_found_ok (Optional[bool]):

1838 Defaults to ``False``. If ``True``, ignore "not found" errors

1839 when deleting the table.

1840 """

1841 table = _table_arg_to_table_ref(table, default_project=self.project)

1842 if not isinstance(table, TableReference):

1843 raise TypeError("Unable to get TableReference for table '{}'".format(table))

1844

1845 try:

1846 path = table.path

1847 span_attributes = {"path": path}

1848 self._call_api(

1849 retry,

1850 span_name="BigQuery.deleteTable",

1851 span_attributes=span_attributes,

1852 method="DELETE",

1853 path=path,

1854 timeout=timeout,

1855 )

1856 except core_exceptions.NotFound:

1857 if not not_found_ok:

1858 raise

1859

1860 def _get_query_results(

1861 self,

1862 job_id: str,

1863 retry: retries.Retry,

1864 project: str = None,

1865 timeout_ms: Optional[int] = None,

1866 location: str = None,

1867 timeout: TimeoutType = DEFAULT_TIMEOUT,

1868 ) -> _QueryResults:

1869 """Get the query results object for a query job.

1870

1871 Args:

1872 job_id (str): Name of the query job.

1873 retry (google.api_core.retry.Retry):

1874 How to retry the RPC.

1875 project (Optional[str]):

1876 Project ID for the query job (defaults to the project of the client).

1877 timeout_ms (Optional[int]):

1878 Number of milliseconds the the API call should wait for the query

1879 to complete before the request times out.

1880 location (Optional[str]): Location of the query job.

1881 timeout (Optional[float]):

1882 The number of seconds to wait for the underlying HTTP transport

1883 before using ``retry``. If set, this connection timeout may be

1884 increased to a minimum value. This prevents retries on what

1885 would otherwise be a successful response.

1886

1887 Returns:

1888 google.cloud.bigquery.query._QueryResults:

1889 A new ``_QueryResults`` instance.

1890 """

1891

1892 extra_params: Dict[str, Any] = {"maxResults": 0}

1893

1894 if timeout is not None:

1895 timeout = max(timeout, _MIN_GET_QUERY_RESULTS_TIMEOUT)

1896

1897 if project is None:

1898 project = self.project

1899

1900 if timeout_ms is not None:

1901 extra_params["timeoutMs"] = timeout_ms

1902

1903 if location is None:

1904 location = self.location

1905

1906 if location is not None:

1907 extra_params["location"] = location

1908

1909 path = "/projects/{}/queries/{}".format(project, job_id)

1910

1911 # This call is typically made in a polling loop that checks whether the

1912 # job is complete (from QueryJob.done(), called ultimately from

1913 # QueryJob.result()). So we don't need to poll here.

1914 span_attributes = {"path": path}

1915 resource = self._call_api(

1916 retry,

1917 span_name="BigQuery.getQueryResults",

1918 span_attributes=span_attributes,

1919 method="GET",

1920 path=path,

1921 query_params=extra_params,

1922 timeout=timeout,

1923 )

1924 return _QueryResults.from_api_repr(resource)

1925

1926 def job_from_resource(

1927 self, resource: dict

1928 ) -> Union[job.CopyJob, job.ExtractJob, job.LoadJob, job.QueryJob, job.UnknownJob]:

1929 """Detect correct job type from resource and instantiate.

1930

1931 Args:

1932 resource (Dict): one job resource from API response

1933

1934 Returns:

1935 The job instance, constructed via the resource.

1936 """

1937 config = resource.get("configuration", {})

1938 if "load" in config:

1939 return job.LoadJob.from_api_repr(resource, self)

1940 elif "copy" in config:

1941 return job.CopyJob.from_api_repr(resource, self)

1942 elif "extract" in config:

1943 return job.ExtractJob.from_api_repr(resource, self)

1944 elif "query" in config:

1945 return job.QueryJob.from_api_repr(resource, self)

1946 return job.UnknownJob.from_api_repr(resource, self)

1947

1948 def create_job(

1949 self,

1950 job_config: dict,

1951 retry: retries.Retry = DEFAULT_RETRY,

1952 timeout: TimeoutType = DEFAULT_TIMEOUT,

1953 ) -> Union[job.LoadJob, job.CopyJob, job.ExtractJob, job.QueryJob]:

1954 """Create a new job.

1955 Args:

1956 job_config (dict): configuration job representation returned from the API.

1957

1958 Keyword Arguments:

1959 retry (Optional[google.api_core.retry.Retry]):

1960 How to retry the RPC.

1961 timeout (Optional[float]):

1962 The number of seconds to wait for the underlying HTTP transport

1963 before using ``retry``.

1964

1965 Returns:

1966 Union[ \

1967 google.cloud.bigquery.job.LoadJob, \

1968 google.cloud.bigquery.job.CopyJob, \

1969 google.cloud.bigquery.job.ExtractJob, \

1970 google.cloud.bigquery.job.QueryJob \

1971 ]:

1972 A new job instance.

1973 """

1974

1975 if "load" in job_config:

1976 load_job_config = google.cloud.bigquery.job.LoadJobConfig.from_api_repr(

1977 job_config

1978 )

1979 destination = _get_sub_prop(job_config, ["load", "destinationTable"])

1980 source_uris = _get_sub_prop(job_config, ["load", "sourceUris"])

1981 destination = TableReference.from_api_repr(destination)

1982 return self.load_table_from_uri(

1983 source_uris,

1984 destination,

1985 job_config=typing.cast(LoadJobConfig, load_job_config),

1986 retry=retry,

1987 timeout=timeout,

1988 )

1989 elif "copy" in job_config:

1990 copy_job_config = google.cloud.bigquery.job.CopyJobConfig.from_api_repr(

1991 job_config

1992 )

1993 destination = _get_sub_prop(job_config, ["copy", "destinationTable"])

1994 destination = TableReference.from_api_repr(destination)

1995 return self.copy_table(

1996 [], # Source table(s) already in job_config resource.

1997 destination,

1998 job_config=typing.cast(CopyJobConfig, copy_job_config),

1999 retry=retry,

2000 timeout=timeout,

2001 )

2002 elif "extract" in job_config:

2003 extract_job_config = (

2004 google.cloud.bigquery.job.ExtractJobConfig.from_api_repr(job_config)

2005 )

2006 source = _get_sub_prop(job_config, ["extract", "sourceTable"])

2007 if source:

2008 source_type = "Table"

2009 source = TableReference.from_api_repr(source)

2010 else:

2011 source = _get_sub_prop(job_config, ["extract", "sourceModel"])

2012 source_type = "Model"

2013 source = ModelReference.from_api_repr(source)

2014 destination_uris = _get_sub_prop(job_config, ["extract", "destinationUris"])

2015 return self.extract_table(

2016 source,

2017 destination_uris,

2018 job_config=typing.cast(ExtractJobConfig, extract_job_config),

2019 retry=retry,

2020 timeout=timeout,

2021 source_type=source_type,

2022 )

2023 elif "query" in job_config:

2024 query_job_config = google.cloud.bigquery.job.QueryJobConfig.from_api_repr(

2025 job_config

2026 )

2027 query = _get_sub_prop(job_config, ["query", "query"])

2028 return self.query(

2029 query,

2030 job_config=typing.cast(QueryJobConfig, query_job_config),

2031 retry=retry,

2032 timeout=timeout,

2033 )

2034 else:

2035 raise TypeError("Invalid job configuration received.")

2036

2037 def get_job(

2038 self,

2039 job_id: Union[str, job.LoadJob, job.CopyJob, job.ExtractJob, job.QueryJob],

2040 project: str = None,

2041 location: str = None,

2042 retry: retries.Retry = DEFAULT_RETRY,

2043 timeout: TimeoutType = DEFAULT_TIMEOUT,

2044 ) -> Union[job.LoadJob, job.CopyJob, job.ExtractJob, job.QueryJob, job.UnknownJob]:

2045 """Fetch a job for the project associated with this client.

2046

2047 See

2048 https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/get

2049

2050 Args:

2051 job_id:

2052 Job identifier.

2053

2054 Keyword Arguments:

2055 project (Optional[str]):

2056 ID of the project which owns the job (defaults to the client's project).

2057 location (Optional[str]):

2058 Location where the job was run. Ignored if ``job_id`` is a job

2059 object.

2060 retry (Optional[google.api_core.retry.Retry]):

2061 How to retry the RPC.

2062 timeout (Optional[float]):

2063 The number of seconds to wait for the underlying HTTP transport

2064 before using ``retry``.

2065

2066 Returns:

2067 Job instance, based on the resource returned by the API.

2068 """

2069 extra_params = {"projection": "full"}

2070

2071 project, location, job_id = _extract_job_reference(

2072 job_id, project=project, location=location

2073 )

2074

2075 if project is None:

2076 project = self.project

2077

2078 if location is None:

2079 location = self.location

2080

2081 if location is not None:

2082 extra_params["location"] = location

2083

2084 path = "/projects/{}/jobs/{}".format(project, job_id)

2085

2086 span_attributes = {"path": path, "job_id": job_id, "location": location}

2087

2088 resource = self._call_api(

2089 retry,

2090 span_name="BigQuery.getJob",

2091 span_attributes=span_attributes,

2092 method="GET",

2093 path=path,

2094 query_params=extra_params,

2095 timeout=timeout,

2096 )

2097

2098 return self.job_from_resource(resource)

2099

2100 def cancel_job(

2101 self,

2102 job_id: str,

2103 project: str = None,

2104 location: str = None,

2105 retry: retries.Retry = DEFAULT_RETRY,

2106 timeout: TimeoutType = DEFAULT_TIMEOUT,

2107 ) -> Union[job.LoadJob, job.CopyJob, job.ExtractJob, job.QueryJob]:

2108 """Attempt to cancel a job from a job ID.

2109

2110 See

2111 https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/cancel

2112

2113 Args:

2114 job_id (Union[ \

2115 str, \

2116 google.cloud.bigquery.job.LoadJob, \

2117 google.cloud.bigquery.job.CopyJob, \

2118 google.cloud.bigquery.job.ExtractJob, \

2119 google.cloud.bigquery.job.QueryJob \

2120 ]): Job identifier.

2121

2122 Keyword Arguments:

2123 project (Optional[str]):

2124 ID of the project which owns the job (defaults to the client's project).

2125 location (Optional[str]):

2126 Location where the job was run. Ignored if ``job_id`` is a job

2127 object.

2128 retry (Optional[google.api_core.retry.Retry]):

2129 How to retry the RPC.

2130 timeout (Optional[float]):

2131 The number of seconds to wait for the underlying HTTP transport

2132 before using ``retry``.

2133

2134 Returns:

2135 Union[ \

2136 google.cloud.bigquery.job.LoadJob, \

2137 google.cloud.bigquery.job.CopyJob, \

2138 google.cloud.bigquery.job.ExtractJob, \

2139 google.cloud.bigquery.job.QueryJob, \

2140 ]:

2141 Job instance, based on the resource returned by the API.

2142 """

2143 extra_params = {"projection": "full"}

2144

2145 project, location, job_id = _extract_job_reference(

2146 job_id, project=project, location=location

2147 )

2148

2149 if project is None:

2150 project = self.project

2151

2152 if location is None:

2153 location = self.location

2154

2155 if location is not None:

2156 extra_params["location"] = location

2157

2158 path = "/projects/{}/jobs/{}/cancel".format(project, job_id)

2159

2160 span_attributes = {"path": path, "job_id": job_id, "location": location}

2161

2162 resource = self._call_api(

2163 retry,

2164 span_name="BigQuery.cancelJob",

2165 span_attributes=span_attributes,

2166 method="POST",

2167 path=path,

2168 query_params=extra_params,

2169 timeout=timeout,

2170 )

2171

2172 job_instance = self.job_from_resource(resource["job"]) # never an UnknownJob

2173

2174 return typing.cast(

2175 Union[job.LoadJob, job.CopyJob, job.ExtractJob, job.QueryJob],

2176 job_instance,

2177 )

2178

2179 def list_jobs(

2180 self,

2181 project: str = None,

2182 parent_job: Optional[Union[QueryJob, str]] = None,

2183 max_results: Optional[int] = None,

2184 page_token: str = None,

2185 all_users: bool = None,

2186 state_filter: str = None,

2187 retry: retries.Retry = DEFAULT_RETRY,

2188 timeout: TimeoutType = DEFAULT_TIMEOUT,

2189 min_creation_time: datetime.datetime = None,

2190 max_creation_time: datetime.datetime = None,

2191 page_size: Optional[int] = None,

2192 ) -> page_iterator.Iterator:

2193 """List jobs for the project associated with this client.

2194

2195 See

2196 https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/list

2197

2198 Args:

2199 project (Optional[str]):

2200 Project ID to use for retreiving datasets. Defaults

2201 to the client's project.

2202 parent_job (Optional[Union[ \

2203 google.cloud.bigquery.job._AsyncJob, \

2204 str, \

2205 ]]):

2206 If set, retrieve only child jobs of the specified parent.

2207 max_results (Optional[int]):

2208 Maximum number of jobs to return.

2209 page_token (Optional[str]):

2210 Opaque marker for the next "page" of jobs. If not

2211 passed, the API will return the first page of jobs. The token

2212 marks the beginning of the iterator to be returned and the

2213 value of the ``page_token`` can be accessed at

2214 ``next_page_token`` of

2215 :class:`~google.api_core.page_iterator.HTTPIterator`.

2216 all_users (Optional[bool]):

2217 If true, include jobs owned by all users in the project.

2218 Defaults to :data:`False`.

2219 state_filter (Optional[str]):

2220 If set, include only jobs matching the given state. One of:

2221 * ``"done"``

2222 * ``"pending"``

2223 * ``"running"``

2224 retry (Optional[google.api_core.retry.Retry]):

2225 How to retry the RPC.

2226 timeout (Optional[float]):

2227 The number of seconds to wait for the underlying HTTP transport

2228 before using ``retry``.

2229 min_creation_time (Optional[datetime.datetime]):

2230 Min value for job creation time. If set, only jobs created

2231 after or at this timestamp are returned. If the datetime has

2232 no time zone assumes UTC time.

2233 max_creation_time (Optional[datetime.datetime]):

2234 Max value for job creation time. If set, only jobs created

2235 before or at this timestamp are returned. If the datetime has

2236 no time zone assumes UTC time.

2237 page_size (Optional[int]):

2238 Maximum number of jobs to return per page.

2239

2240 Returns:

2241 google.api_core.page_iterator.Iterator:

2242 Iterable of job instances.

2243 """

2244 if isinstance(parent_job, job._AsyncJob):

2245 parent_job = parent_job.job_id # pytype: disable=attribute-error

2246

2247 extra_params = {

2248 "allUsers": all_users,

2249 "stateFilter": state_filter,

2250 "minCreationTime": _str_or_none(

2251 google.cloud._helpers._millis_from_datetime(min_creation_time)

2252 ),

2253 "maxCreationTime": _str_or_none(

2254 google.cloud._helpers._millis_from_datetime(max_creation_time)

2255 ),

2256 "projection": "full",

2257 "parentJobId": parent_job,

2258 }

2259

2260 extra_params = {

2261 param: value for param, value in extra_params.items() if value is not None

2262 }

2263

2264 if project is None:

2265 project = self.project

2266

2267 path = "/projects/%s/jobs" % (project,)

2268

2269 span_attributes = {"path": path}

2270

2271 def api_request(*args, **kwargs):

2272 return self._call_api(

2273 retry,

2274 span_name="BigQuery.listJobs",

2275 span_attributes=span_attributes,

2276 *args,

2277 timeout=timeout,

2278 **kwargs,

2279 )

2280

2281 return page_iterator.HTTPIterator(

2282 client=self,

2283 api_request=api_request,

2284 path=path,

2285 item_to_value=_item_to_job,

2286 items_key="jobs",

2287 page_token=page_token,

2288 max_results=max_results,

2289 extra_params=extra_params,

2290 page_size=page_size,

2291 )

2292

2293 def load_table_from_uri(

2294 self,

2295 source_uris: Union[str, Sequence[str]],

2296 destination: Union[Table, TableReference, TableListItem, str],

2297 job_id: str = None,

2298 job_id_prefix: str = None,

2299 location: str = None,

2300 project: str = None,

2301 job_config: LoadJobConfig = None,

2302 retry: retries.Retry = DEFAULT_RETRY,

2303 timeout: TimeoutType = DEFAULT_TIMEOUT,

2304 ) -> job.LoadJob:

2305 """Starts a job for loading data into a table from Cloud Storage.

2306

2307 See

2308 https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload

2309

2310 Args:

2311 source_uris (Union[str, Sequence[str]]):

2312 URIs of data files to be loaded; in format

2313 ``gs://<bucket_name>/<object_name_or_glob>``.

2314 destination (Union[ \

2315 google.cloud.bigquery.table.Table, \

2316 google.cloud.bigquery.table.TableReference, \

2317 google.cloud.bigquery.table.TableListItem, \

2318 str, \

2319 ]):

2320 Table into which data is to be loaded. If a string is passed

2321 in, this method attempts to create a table reference from a

2322 string using

2323 :func:`google.cloud.bigquery.table.TableReference.from_string`.

2324

2325 Keyword Arguments:

2326 job_id (Optional[str]): Name of the job.

2327 job_id_prefix (Optional[str]):

2328 The user-provided prefix for a randomly generated job ID.

2329 This parameter will be ignored if a ``job_id`` is also given.

2330 location (Optional[str]):

2331 Location where to run the job. Must match the location of the

2332 destination table.

2333 project (Optional[str]):

2334 Project ID of the project of where to run the job. Defaults

2335 to the client's project.

2336 job_config (Optional[google.cloud.bigquery.job.LoadJobConfig]):

2337 Extra configuration options for the job.

2338 retry (Optional[google.api_core.retry.Retry]):

2339 How to retry the RPC.

2340 timeout (Optional[float]):

2341 The number of seconds to wait for the underlying HTTP transport

2342 before using ``retry``.

2343

2344 Returns:

2345 google.cloud.bigquery.job.LoadJob: A new load job.

2346

2347 Raises:

2348 TypeError:

2349 If ``job_config`` is not an instance of

2350 :class:`~google.cloud.bigquery.job.LoadJobConfig` class.

2351 """

2352 job_id = _make_job_id(job_id, job_id_prefix)

2353

2354 if project is None:

2355 project = self.project

2356

2357 if location is None:

2358 location = self.location

2359

2360 job_ref = job._JobReference(job_id, project=project, location=location)

2361

2362 if isinstance(source_uris, str):

2363 source_uris = [source_uris]

2364

2365 destination = _table_arg_to_table_ref(destination, default_project=self.project)

2366

2367 if job_config is not None:

2368 _verify_job_config_type(job_config, LoadJobConfig)

2369 else:

2370 job_config = job.LoadJobConfig()

2371

2372 new_job_config = job_config._fill_from_default(self._default_load_job_config)

2373

2374 load_job = job.LoadJob(job_ref, source_uris, destination, self, new_job_config)

2375 load_job._begin(retry=retry, timeout=timeout)

2376

2377 return load_job

2378

2379 def load_table_from_file(

2380 self,

2381 file_obj: IO[bytes],

2382 destination: Union[Table, TableReference, TableListItem, str],

2383 rewind: bool = False,

2384 size: Optional[int] = None,

2385 num_retries: int = _DEFAULT_NUM_RETRIES,

2386 job_id: str = None,

2387 job_id_prefix: str = None,

2388 location: str = None,

2389 project: str = None,

2390 job_config: LoadJobConfig = None,

2391 timeout: ResumableTimeoutType = DEFAULT_TIMEOUT,

2392 ) -> job.LoadJob:

2393 """Upload the contents of this table from a file-like object.

2394

2395 Similar to :meth:`load_table_from_uri`, this method creates, starts and

2396 returns a :class:`~google.cloud.bigquery.job.LoadJob`.

2397

2398 Args:

2399 file_obj:

2400 A file handle opened in binary mode for reading.

2401 destination:

2402 Table into which data is to be loaded. If a string is passed

2403 in, this method attempts to create a table reference from a

2404 string using

2405 :func:`google.cloud.bigquery.table.TableReference.from_string`.

2406

2407 Keyword Arguments:

2408 rewind:

2409 If True, seek to the beginning of the file handle before

2410 reading the file.

2411 size:

2412 The number of bytes to read from the file handle. If size is

2413 ``None`` or large, resumable upload will be used. Otherwise,

2414 multipart upload will be used.

2415 num_retries: Number of upload retries. Defaults to 6.

2416 job_id: Name of the job.

2417 job_id_prefix:

2418 The user-provided prefix for a randomly generated job ID.

2419 This parameter will be ignored if a ``job_id`` is also given.

2420 location:

2421 Location where to run the job. Must match the location of the

2422 destination table.

2423 project:

2424 Project ID of the project of where to run the job. Defaults

2425 to the client's project.

2426 job_config:

2427 Extra configuration options for the job.

2428 timeout:

2429 The number of seconds to wait for the underlying HTTP transport

2430 before using ``retry``. Depending on the retry strategy, a request

2431 may be repeated several times using the same timeout each time.

2432

2433 Can also be passed as a tuple (connect_timeout, read_timeout).

2434 See :meth:`requests.Session.request` documentation for details.

2435

2436 Returns:

2437 google.cloud.bigquery.job.LoadJob: A new load job.

2438

2439 Raises:

2440 ValueError:

2441 If ``size`` is not passed in and can not be determined, or if

2442 the ``file_obj`` can be detected to be a file opened in text

2443 mode.

2444

2445 TypeError:

2446 If ``job_config`` is not an instance of

2447 :class:`~google.cloud.bigquery.job.LoadJobConfig` class.

2448 """

2449 job_id = _make_job_id(job_id, job_id_prefix)

2450

2451 if project is None:

2452 project = self.project

2453

2454 if location is None:

2455 location = self.location

2456

2457 destination = _table_arg_to_table_ref(destination, default_project=self.project)

2458 job_ref = job._JobReference(job_id, project=project, location=location)

2459

2460 if job_config is not None:

2461 _verify_job_config_type(job_config, LoadJobConfig)

2462 else:

2463 job_config = job.LoadJobConfig()

2464

2465 new_job_config = job_config._fill_from_default(self._default_load_job_config)

2466

2467 load_job = job.LoadJob(job_ref, None, destination, self, new_job_config)

2468 job_resource = load_job.to_api_repr()

2469

2470 if rewind:

2471 file_obj.seek(0, os.SEEK_SET)

2472

2473 _check_mode(file_obj)

2474

2475 try:

2476 if size is None or size >= _MAX_MULTIPART_SIZE:

2477 response = self._do_resumable_upload(

2478 file_obj, job_resource, num_retries, timeout, project=project

2479 )

2480 else:

2481 response = self._do_multipart_upload(

2482 file_obj, job_resource, size, num_retries, timeout, project=project

2483 )

2484 except resumable_media.InvalidResponse as exc:

2485 raise exceptions.from_http_response(exc.response)

2486

2487 return typing.cast(LoadJob, self.job_from_resource(response.json()))

2488

2489 def load_table_from_dataframe(

2490 self,

2491 dataframe: "pandas.DataFrame",

2492 destination: Union[Table, TableReference, str],

2493 num_retries: int = _DEFAULT_NUM_RETRIES,

2494 job_id: str = None,

2495 job_id_prefix: str = None,

2496 location: str = None,

2497 project: str = None,

2498 job_config: LoadJobConfig = None,

2499 parquet_compression: str = "snappy",

2500 timeout: ResumableTimeoutType = DEFAULT_TIMEOUT,

2501 ) -> job.LoadJob:

2502 """Upload the contents of a table from a pandas DataFrame.

2503

2504 Similar to :meth:`load_table_from_uri`, this method creates, starts and

2505 returns a :class:`~google.cloud.bigquery.job.LoadJob`.

2506

2507 .. note::

2508

2509 REPEATED fields are NOT supported when using the CSV source format.

2510 They are supported when using the PARQUET source format, but

2511 due to the way they are encoded in the ``parquet`` file,

2512 a mismatch with the existing table schema can occur, so

2513 REPEATED fields are not properly supported when using ``pyarrow<4.0.0``

2514 using the parquet format.

2515

2516 https://github.com/googleapis/python-bigquery/issues/19

2517

2518 Args:

2519 dataframe:

2520 A :class:`~pandas.DataFrame` containing the data to load.

2521 destination:

2522 The destination table to use for loading the data. If it is an

2523 existing table, the schema of the :class:`~pandas.DataFrame`

2524 must match the schema of the destination table. If the table

2525 does not yet exist, the schema is inferred from the

2526 :class:`~pandas.DataFrame`.

2527

2528 If a string is passed in, this method attempts to create a

2529 table reference from a string using

2530 :func:`google.cloud.bigquery.table.TableReference.from_string`.

2531

2532 Keyword Arguments:

2533 num_retries: Number of upload retries.

2534 job_id: Name of the job.

2535 job_id_prefix:

2536 The user-provided prefix for a randomly generated

2537 job ID. This parameter will be ignored if a ``job_id`` is

2538 also given.

2539 location:

2540 Location where to run the job. Must match the location of the

2541 destination table.

2542 project:

2543 Project ID of the project of where to run the job. Defaults

2544 to the client's project.

2545 job_config:

2546 Extra configuration options for the job.

2547

2548 To override the default pandas data type conversions, supply

2549 a value for

2550 :attr:`~google.cloud.bigquery.job.LoadJobConfig.schema` with

2551 column names matching those of the dataframe. The BigQuery

2552 schema is used to determine the correct data type conversion.

2553 Indexes are not loaded.

2554

2555 By default, this method uses the parquet source format. To

2556 override this, supply a value for

2557 :attr:`~google.cloud.bigquery.job.LoadJobConfig.source_format`

2558 with the format name. Currently only

2559 :attr:`~google.cloud.bigquery.job.SourceFormat.CSV` and

2560 :attr:`~google.cloud.bigquery.job.SourceFormat.PARQUET` are

2561 supported.

2562 parquet_compression:

2563 [Beta] The compression method to use if intermittently

2564 serializing ``dataframe`` to a parquet file.

2565

2566 The argument is directly passed as the ``compression``

2567 argument to the underlying ``pyarrow.parquet.write_table()``

2568 method (the default value "snappy" gets converted to uppercase).

2569 https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table

2570

2571 If the job config schema is missing, the argument is directly

2572 passed as the ``compression`` argument to the underlying

2573 ``DataFrame.to_parquet()`` method.

2574 https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet

2575 timeout:

2576 The number of seconds to wait for the underlying HTTP transport

2577 before using ``retry``. Depending on the retry strategy, a request may

2578 be repeated several times using the same timeout each time.

2579

2580 Can also be passed as a tuple (connect_timeout, read_timeout).

2581 See :meth:`requests.Session.request` documentation for details.

2582

2583 Returns:

2584 google.cloud.bigquery.job.LoadJob: A new load job.

2585

2586 Raises:

2587 ValueError:

2588 If a usable parquet engine cannot be found. This method

2589 requires :mod:`pyarrow` to be installed.

2590 TypeError:

2591 If ``job_config`` is not an instance of

2592 :class:`~google.cloud.bigquery.job.LoadJobConfig` class.

2593 """

2594 job_id = _make_job_id(job_id, job_id_prefix)

2595

2596 if job_config is not None:

2597 _verify_job_config_type(job_config, LoadJobConfig)

2598 else:

2599 job_config = job.LoadJobConfig()

2600

2601 new_job_config = job_config._fill_from_default(self._default_load_job_config)

2602

2603 supported_formats = {job.SourceFormat.CSV, job.SourceFormat.PARQUET}

2604 if new_job_config.source_format is None:

2605 # default value

2606 new_job_config.source_format = job.SourceFormat.PARQUET

2607

2608 if (

2609 new_job_config.source_format == job.SourceFormat.PARQUET

2610 and new_job_config.parquet_options is None

2611 ):

2612 parquet_options = ParquetOptions()

2613 # default value

2614 parquet_options.enable_list_inference = True

2615 new_job_config.parquet_options = parquet_options

2616

2617 if new_job_config.source_format not in supported_formats:

2618 raise ValueError(

2619 "Got unexpected source_format: '{}'. Currently, only PARQUET and CSV are supported".format(

2620 new_job_config.source_format

2621 )

2622 )

2623

2624 if pyarrow is None and new_job_config.source_format == job.SourceFormat.PARQUET:

2625 # pyarrow is now the only supported parquet engine.

2626 raise ValueError("This method requires pyarrow to be installed")

2627

2628 if location is None:

2629 location = self.location

2630

2631 # If table schema is not provided, we try to fetch the existing table

2632 # schema, and check if dataframe schema is compatible with it - except

2633 # for WRITE_TRUNCATE jobs, the existing schema does not matter then.

2634 if (

2635 not new_job_config.schema

2636 and new_job_config.write_disposition != job.WriteDisposition.WRITE_TRUNCATE

2637 ):

2638 try:

2639 table = self.get_table(destination)

2640 except core_exceptions.NotFound:

2641 pass

2642 else:

2643 columns_and_indexes = frozenset(

2644 name

2645 for name, _ in _pandas_helpers.list_columns_and_indexes(dataframe)

2646 )

2647 new_job_config.schema = [

2648 # Field description and policy tags are not needed to

2649 # serialize a data frame.

2650 SchemaField(

2651 field.name,

2652 field.field_type,

2653 mode=field.mode,

2654 fields=field.fields,

2655 )

2656 # schema fields not present in the dataframe are not needed

2657 for field in table.schema

2658 if field.name in columns_and_indexes

2659 ]

2660

2661 new_job_config.schema = _pandas_helpers.dataframe_to_bq_schema(

2662 dataframe, new_job_config.schema

2663 )

2664

2665 if not new_job_config.schema:

2666 # the schema could not be fully detected

2667 warnings.warn(

2668 "Schema could not be detected for all columns. Loading from a "

2669 "dataframe without a schema will be deprecated in the future, "

2670 "please provide a schema.",

2671 PendingDeprecationWarning,

2672 stacklevel=2,

2673 )

2674

2675 tmpfd, tmppath = tempfile.mkstemp(

2676 suffix="_job_{}.{}".format(job_id[:8], new_job_config.source_format.lower())

2677 )

2678 os.close(tmpfd)

2679

2680 try:

2681

2682 if new_job_config.source_format == job.SourceFormat.PARQUET:

2683 if _PYARROW_VERSION in _PYARROW_BAD_VERSIONS:

2684 msg = (

2685 "Loading dataframe data in PARQUET format with pyarrow "

2686 f"{_PYARROW_VERSION} can result in data corruption. It is "

2687 "therefore *strongly* advised to use a different pyarrow "

2688 "version or a different source format. "

2689 "See: https://github.com/googleapis/python-bigquery/issues/781"

2690 )

2691 warnings.warn(msg, category=RuntimeWarning)

2692

2693 if new_job_config.schema:

2694 if parquet_compression == "snappy": # adjust the default value

2695 parquet_compression = parquet_compression.upper()

2696

2697 _pandas_helpers.dataframe_to_parquet(

2698 dataframe,

2699 new_job_config.schema,

2700 tmppath,

2701 parquet_compression=parquet_compression,

2702 parquet_use_compliant_nested_type=True,

2703 )

2704 else:

2705 dataframe.to_parquet(

2706 tmppath,

2707 engine="pyarrow",

2708 compression=parquet_compression,

2709 **(

2710 {"use_compliant_nested_type": True}

2711 if _helpers.PYARROW_VERSIONS.use_compliant_nested_type

2712 else {}

2713 ),

2714 )

2715

2716 else:

2717

2718 dataframe.to_csv(

2719 tmppath,

2720 index=False,

2721 header=False,

2722 encoding="utf-8",

2723 float_format="%.17g",

2724 date_format="%Y-%m-%d %H:%M:%S.%f",

2725 )

2726

2727 with open(tmppath, "rb") as tmpfile:

2728 file_size = os.path.getsize(tmppath)

2729 return self.load_table_from_file(

2730 tmpfile,

2731 destination,

2732 num_retries=num_retries,

2733 rewind=True,

2734 size=file_size,

2735 job_id=job_id,

2736 job_id_prefix=job_id_prefix,

2737 location=location,

2738 project=project,

2739 job_config=new_job_config,

2740 timeout=timeout,

2741 )

2742

2743 finally:

2744 os.remove(tmppath)

2745

2746 def load_table_from_json(

2747 self,

2748 json_rows: Iterable[Dict[str, Any]],

2749 destination: Union[Table, TableReference, TableListItem, str],

2750 num_retries: int = _DEFAULT_NUM_RETRIES,

2751 job_id: str = None,

2752 job_id_prefix: str = None,

2753 location: str = None,

2754 project: str = None,

2755 job_config: LoadJobConfig = None,

2756 timeout: ResumableTimeoutType = DEFAULT_TIMEOUT,

2757 ) -> job.LoadJob:

2758 """Upload the contents of a table from a JSON string or dict.

2759

2760 Args:

2761 json_rows (Iterable[Dict[str, Any]]):

2762 Row data to be inserted. Keys must match the table schema fields

2763 and values must be JSON-compatible representations.

2764

2765 .. note::

2766

2767 If your data is already a newline-delimited JSON string,

2768 it is best to wrap it into a file-like object and pass it

2769 to :meth:`~google.cloud.bigquery.client.Client.load_table_from_file`::

2770

2771 import io

2772 from google.cloud import bigquery

2773

2774 data = u'{"foo": "bar"}'

2775 data_as_file = io.StringIO(data)

2776

2777 client = bigquery.Client()

2778 client.load_table_from_file(data_as_file, ...)

2779

2780 destination:

2781 Table into which data is to be loaded. If a string is passed

2782 in, this method attempts to create a table reference from a

2783 string using

2784 :func:`google.cloud.bigquery.table.TableReference.from_string`.

2785

2786 Keyword Arguments:

2787 num_retries: Number of upload retries.

2788 job_id: Name of the job.

2789 job_id_prefix:

2790 The user-provided prefix for a randomly generated job ID.

2791 This parameter will be ignored if a ``job_id`` is also given.

2792 location:

2793 Location where to run the job. Must match the location of the

2794 destination table.

2795 project:

2796 Project ID of the project of where to run the job. Defaults

2797 to the client's project.

2798 job_config:

2799 Extra configuration options for the job. The ``source_format``

2800 setting is always set to

2801 :attr:`~google.cloud.bigquery.job.SourceFormat.NEWLINE_DELIMITED_JSON`.

2802 timeout:

2803 The number of seconds to wait for the underlying HTTP transport

2804 before using ``retry``. Depending on the retry strategy, a request may

2805 be repeated several times using the same timeout each time.

2806

2807 Can also be passed as a tuple (connect_timeout, read_timeout).

2808 See :meth:`requests.Session.request` documentation for details.

2809

2810 Returns:

2811 google.cloud.bigquery.job.LoadJob: A new load job.

2812

2813 Raises:

2814 TypeError:

2815 If ``job_config`` is not an instance of

2816 :class:`~google.cloud.bigquery.job.LoadJobConfig` class.

2817 """

2818 job_id = _make_job_id(job_id, job_id_prefix)

2819

2820 if job_config is not None:

2821 _verify_job_config_type(job_config, LoadJobConfig)

2822 else:

2823 job_config = job.LoadJobConfig()

2824

2825 new_job_config = job_config._fill_from_default(self._default_load_job_config)

2826

2827 new_job_config.source_format = job.SourceFormat.NEWLINE_DELIMITED_JSON

2828

2829 if new_job_config.schema is None:

2830 new_job_config.autodetect = True

2831

2832 if project is None:

2833 project = self.project

2834

2835 if location is None:

2836 location = self.location

2837

2838 destination = _table_arg_to_table_ref(destination, default_project=self.project)

2839

2840 data_str = "\n".join(json.dumps(item, ensure_ascii=False) for item in json_rows)

2841 encoded_str = data_str.encode()

2842 data_file = io.BytesIO(encoded_str)

2843 return self.load_table_from_file(

2844 data_file,

2845 destination,

2846 size=len(encoded_str),

2847 num_retries=num_retries,

2848 job_id=job_id,

2849 job_id_prefix=job_id_prefix,

2850 location=location,

2851 project=project,

2852 job_config=new_job_config,

2853 timeout=timeout,

2854 )

2855

2856 def _do_resumable_upload(

2857 self,

2858 stream: IO[bytes],

2859 metadata: Mapping[str, str],

2860 num_retries: int,

2861 timeout: Optional[ResumableTimeoutType],

2862 project: Optional[str] = None,

2863 ) -> "requests.Response":

2864 """Perform a resumable upload.

2865

2866 Args:

2867 stream: A bytes IO object open for reading.

2868

2869 metadata: The metadata associated with the upload.

2870

2871 num_retries:

2872 Number of upload retries. (Deprecated: This

2873 argument will be removed in a future release.)

2874

2875 timeout:

2876 The number of seconds to wait for the underlying HTTP transport

2877 before using ``retry``. Depending on the retry strategy, a request may

2878 be repeated several times using the same timeout each time.

2879

2880 Can also be passed as a tuple (connect_timeout, read_timeout).

2881 See :meth:`requests.Session.request` documentation for details.

2882

2883 project:

2884 Project ID of the project of where to run the upload. Defaults

2885 to the client's project.

2886

2887 Returns:

2888 The "200 OK" response object returned after the final chunk

2889 is uploaded.

2890 """

2891 upload, transport = self._initiate_resumable_upload(

2892 stream, metadata, num_retries, timeout, project=project

2893 )

2894

2895 while not upload.finished:

2896 response = upload.transmit_next_chunk(transport, timeout=timeout)

2897

2898 return response

2899

2900 def _initiate_resumable_upload(

2901 self,

2902 stream: IO[bytes],

2903 metadata: Mapping[str, str],

2904 num_retries: int,

2905 timeout: Optional[ResumableTimeoutType],

2906 project: Optional[str] = None,

2907 ):

2908 """Initiate a resumable upload.

2909

2910 Args:

2911 stream: A bytes IO object open for reading.

2912

2913 metadata: The metadata associated with the upload.

2914

2915 num_retries:

2916 Number of upload retries. (Deprecated: This

2917 argument will be removed in a future release.)

2918

2919 timeout:

2920 The number of seconds to wait for the underlying HTTP transport

2921 before using ``retry``. Depending on the retry strategy, a request may

2922 be repeated several times using the same timeout each time.

2923

2924 Can also be passed as a tuple (connect_timeout, read_timeout).

2925 See :meth:`requests.Session.request` documentation for details.

2926

2927 project:

2928 Project ID of the project of where to run the upload. Defaults

2929 to the client's project.

2930

2931 Returns:

2932 Tuple:

2933 Pair of

2934

2935 * The :class:`~google.resumable_media.requests.ResumableUpload`

2936 that was created

2937 * The ``transport`` used to initiate the upload.

2938 """

2939 chunk_size = _DEFAULT_CHUNKSIZE

2940 transport = self._http

2941 headers = _get_upload_headers(self._connection.user_agent)

2942

2943 if project is None:

2944 project = self.project

2945 # TODO: Increase the minimum version of google-cloud-core to 1.6.0

2946 # and remove this logic. See:

2947 # https://github.com/googleapis/python-bigquery/issues/509

2948 hostname = (

2949 self._connection.API_BASE_URL

2950 if not hasattr(self._connection, "get_api_base_url_for_mtls")

2951 else self._connection.get_api_base_url_for_mtls()

2952 )

2953 upload_url = _RESUMABLE_URL_TEMPLATE.format(host=hostname, project=project)

2954

2955 # TODO: modify ResumableUpload to take a retry.Retry object

2956 # that it can use for the initial RPC.

2957 upload = ResumableUpload(upload_url, chunk_size, headers=headers)

2958

2959 if num_retries is not None:

2960 upload._retry_strategy = resumable_media.RetryStrategy(

2961 max_retries=num_retries

2962 )

2963

2964 upload.initiate(

2965 transport,

2966 stream,

2967 metadata,

2968 _GENERIC_CONTENT_TYPE,

2969 stream_final=False,

2970 timeout=timeout,

2971 )

2972

2973 return upload, transport

2974

2975 def _do_multipart_upload(

2976 self,

2977 stream: IO[bytes],

2978 metadata: Mapping[str, str],

2979 size: int,

2980 num_retries: int,

2981 timeout: Optional[ResumableTimeoutType],

2982 project: Optional[str] = None,

2983 ):

2984 """Perform a multipart upload.

2985

2986 Args:

2987 stream: A bytes IO object open for reading.

2988

2989 metadata: The metadata associated with the upload.

2990

2991 size:

2992 The number of bytes to be uploaded (which will be read

2993 from ``stream``). If not provided, the upload will be

2994 concluded once ``stream`` is exhausted (or :data:`None`).

2995

2996 num_retries:

2997 Number of upload retries. (Deprecated: This

2998 argument will be removed in a future release.)

2999

3000 timeout:

3001 The number of seconds to wait for the underlying HTTP transport

3002 before using ``retry``. Depending on the retry strategy, a request may

3003 be repeated several times using the same timeout each time.

3004

3005 Can also be passed as a tuple (connect_timeout, read_timeout).

3006 See :meth:`requests.Session.request` documentation for details.

3007

3008 project:

3009 Project ID of the project of where to run the upload. Defaults

3010 to the client's project.

3011

3012 Returns:

3013 requests.Response:

3014 The "200 OK" response object returned after the multipart

3015 upload request.

3016

3017 Raises:

3018 ValueError:

3019 if the ``stream`` has fewer than ``size``

3020 bytes remaining.

3021 """

3022 data = stream.read(size)

3023 if len(data) < size:

3024 msg = _READ_LESS_THAN_SIZE.format(size, len(data))

3025 raise ValueError(msg)

3026

3027 headers = _get_upload_headers(self._connection.user_agent)

3028

3029 if project is None:

3030 project = self.project

3031

3032 # TODO: Increase the minimum version of google-cloud-core to 1.6.0

3033 # and remove this logic. See:

3034 # https://github.com/googleapis/python-bigquery/issues/509

3035 hostname = (

3036 self._connection.API_BASE_URL

3037 if not hasattr(self._connection, "get_api_base_url_for_mtls")

3038 else self._connection.get_api_base_url_for_mtls()

3039 )

3040 upload_url = _MULTIPART_URL_TEMPLATE.format(host=hostname, project=project)

3041 upload = MultipartUpload(upload_url, headers=headers)

3042

3043 if num_retries is not None:

3044 upload._retry_strategy = resumable_media.RetryStrategy(

3045 max_retries=num_retries

3046 )

3047

3048 response = upload.transmit(

3049 self._http, data, metadata, _GENERIC_CONTENT_TYPE, timeout=timeout

3050 )

3051

3052 return response

3053

3054 def copy_table(

3055 self,

3056 sources: Union[

3057 Table,

3058 TableReference,

3059 TableListItem,

3060 str,

3061 Sequence[Union[Table, TableReference, TableListItem, str]],

3062 ],

3063 destination: Union[Table, TableReference, TableListItem, str],

3064 job_id: str = None,

3065 job_id_prefix: str = None,

3066 location: str = None,

3067 project: str = None,

3068 job_config: CopyJobConfig = None,

3069 retry: retries.Retry = DEFAULT_RETRY,

3070 timeout: TimeoutType = DEFAULT_TIMEOUT,

3071 ) -> job.CopyJob:

3072 """Copy one or more tables to another table.

3073

3074 See

3075 https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationtablecopy

3076

3077 Args:

3078 sources (Union[ \

3079 google.cloud.bigquery.table.Table, \

3080 google.cloud.bigquery.table.TableReference, \

3081 google.cloud.bigquery.table.TableListItem, \

3082 str, \

3083 Sequence[ \

3084 Union[ \

3085 google.cloud.bigquery.table.Table, \

3086 google.cloud.bigquery.table.TableReference, \

3087 google.cloud.bigquery.table.TableListItem, \

3088 str, \

3089 ] \

3090 ], \

3091 ]):

3092 Table or tables to be copied.

3093 destination (Union[ \

3094 google.cloud.bigquery.table.Table, \

3095 google.cloud.bigquery.table.TableReference, \

3096 google.cloud.bigquery.table.TableListItem, \

3097 str, \

3098 ]):

3099 Table into which data is to be copied.

3100

3101 Keyword Arguments:

3102 job_id (Optional[str]): The ID of the job.

3103 job_id_prefix (Optional[str]):

3104 The user-provided prefix for a randomly generated job ID.

3105 This parameter will be ignored if a ``job_id`` is also given.

3106 location (Optional[str]):

3107 Location where to run the job. Must match the location of any

3108 source table as well as the destination table.

3109 project (Optional[str]):

3110 Project ID of the project of where to run the job. Defaults

3111 to the client's project.

3112 job_config (Optional[google.cloud.bigquery.job.CopyJobConfig]):

3113 Extra configuration options for the job.

3114 retry (Optional[google.api_core.retry.Retry]):

3115 How to retry the RPC.

3116 timeout (Optional[float]):

3117 The number of seconds to wait for the underlying HTTP transport

3118 before using ``retry``.

3119

3120 Returns:

3121 google.cloud.bigquery.job.CopyJob: A new copy job instance.

3122

3123 Raises:

3124 TypeError:

3125 If ``job_config`` is not an instance of :class:`~google.cloud.bigquery.job.CopyJobConfig`

3126 class.

3127 """

3128 job_id = _make_job_id(job_id, job_id_prefix)

3129

3130 if project is None:

3131 project = self.project

3132

3133 if location is None:

3134 location = self.location

3135

3136 job_ref = job._JobReference(job_id, project=project, location=location)

3137

3138 # sources can be one of many different input types. (string, Table,

3139 # TableReference, or a sequence of any of those.) Convert them all to a

3140 # list of TableReferences.

3141 #

3142 # _table_arg_to_table_ref leaves lists unmodified.

3143 sources = _table_arg_to_table_ref(sources, default_project=self.project)

3144

3145 if not isinstance(sources, collections_abc.Sequence):

3146 sources = [sources]

3147

3148 sources = [

3149 _table_arg_to_table_ref(source, default_project=self.project)

3150 for source in sources

3151 ]

3152

3153 destination = _table_arg_to_table_ref(destination, default_project=self.project)

3154

3155 if job_config:

3156 _verify_job_config_type(job_config, google.cloud.bigquery.job.CopyJobConfig)

3157 job_config = copy.deepcopy(job_config)

3158

3159 copy_job = job.CopyJob(

3160 job_ref, sources, destination, client=self, job_config=job_config

3161 )

3162 copy_job._begin(retry=retry, timeout=timeout)

3163

3164 return copy_job

3165

3166 def extract_table(

3167 self,

3168 source: Union[Table, TableReference, TableListItem, Model, ModelReference, str],

3169 destination_uris: Union[str, Sequence[str]],

3170 job_id: str = None,

3171 job_id_prefix: str = None,

3172 location: str = None,

3173 project: str = None,

3174 job_config: ExtractJobConfig = None,

3175 retry: retries.Retry = DEFAULT_RETRY,

3176 timeout: TimeoutType = DEFAULT_TIMEOUT,

3177 source_type: str = "Table",

3178 ) -> job.ExtractJob:

3179 """Start a job to extract a table into Cloud Storage files.

3180

3181 See

3182 https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationextract

3183

3184 Args:

3185 source (Union[ \

3186 google.cloud.bigquery.table.Table, \

3187 google.cloud.bigquery.table.TableReference, \

3188 google.cloud.bigquery.table.TableListItem, \

3189 google.cloud.bigquery.model.Model, \

3190 google.cloud.bigquery.model.ModelReference, \

3191 src, \

3192 ]):

3193 Table or Model to be extracted.

3194 destination_uris (Union[str, Sequence[str]]):

3195 URIs of Cloud Storage file(s) into which table data is to be

3196 extracted; in format

3197 ``gs://<bucket_name>/<object_name_or_glob>``.

3198

3199 Keyword Arguments:

3200 job_id (Optional[str]): The ID of the job.

3201 job_id_prefix (Optional[str]):

3202 The user-provided prefix for a randomly generated job ID.

3203 This parameter will be ignored if a ``job_id`` is also given.

3204 location (Optional[str]):

3205 Location where to run the job. Must match the location of the

3206 source table.

3207 project (Optional[str]):

3208 Project ID of the project of where to run the job. Defaults

3209 to the client's project.

3210 job_config (Optional[google.cloud.bigquery.job.ExtractJobConfig]):

3211 Extra configuration options for the job.

3212 retry (Optional[google.api_core.retry.Retry]):

3213 How to retry the RPC.

3214 timeout (Optional[float]):

3215 The number of seconds to wait for the underlying HTTP transport

3216 before using ``retry``.

3217 source_type (Optional[str]):

3218 Type of source to be extracted.``Table`` or ``Model``. Defaults to ``Table``.

3219 Returns:

3220 google.cloud.bigquery.job.ExtractJob: A new extract job instance.

3221

3222 Raises:

3223 TypeError:

3224 If ``job_config`` is not an instance of :class:`~google.cloud.bigquery.job.ExtractJobConfig`

3225 class.

3226 ValueError:

3227 If ``source_type`` is not among ``Table``,``Model``.

3228 """

3229 job_id = _make_job_id(job_id, job_id_prefix)

3230

3231 if project is None:

3232 project = self.project

3233

3234 if location is None:

3235 location = self.location

3236

3237 job_ref = job._JobReference(job_id, project=project, location=location)

3238 src = source_type.lower()

3239 if src == "table":

3240 source = _table_arg_to_table_ref(source, default_project=self.project)

3241 elif src == "model":

3242 source = _model_arg_to_model_ref(source, default_project=self.project)

3243 else:

3244 raise ValueError(

3245 "Cannot pass `{}` as a ``source_type``, pass Table or Model".format(

3246 source_type

3247 )

3248 )

3249

3250 if isinstance(destination_uris, str):

3251 destination_uris = [destination_uris]

3252

3253 if job_config:

3254 _verify_job_config_type(

3255 job_config, google.cloud.bigquery.job.ExtractJobConfig

3256 )

3257 job_config = copy.deepcopy(job_config)

3258

3259 extract_job = job.ExtractJob(

3260 job_ref, source, destination_uris, client=self, job_config=job_config

3261 )

3262 extract_job._begin(retry=retry, timeout=timeout)

3263

3264 return extract_job

3265

3266 def query(

3267 self,

3268 query: str,

3269 job_config: QueryJobConfig = None,

3270 job_id: str = None,

3271 job_id_prefix: str = None,

3272 location: str = None,

3273 project: str = None,

3274 retry: retries.Retry = DEFAULT_RETRY,

3275 timeout: TimeoutType = DEFAULT_TIMEOUT,

3276 job_retry: retries.Retry = DEFAULT_JOB_RETRY,

3277 api_method: Union[str, enums.QueryApiMethod] = enums.QueryApiMethod.INSERT,

3278 ) -> job.QueryJob:

3279 """Run a SQL query.

3280

3281 See

3282 https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationquery

3283

3284 Args:

3285 query (str):

3286 SQL query to be executed. Defaults to the standard SQL

3287 dialect. Use the ``job_config`` parameter to change dialects.

3288

3289 Keyword Arguments:

3290 job_config (Optional[google.cloud.bigquery.job.QueryJobConfig]):

3291 Extra configuration options for the job.

3292 To override any options that were previously set in

3293 the ``default_query_job_config`` given to the

3294 ``Client`` constructor, manually set those options to ``None``,

3295 or whatever value is preferred.

3296 job_id (Optional[str]): ID to use for the query job.

3297 job_id_prefix (Optional[str]):

3298 The prefix to use for a randomly generated job ID. This parameter

3299 will be ignored if a ``job_id`` is also given.

3300 location (Optional[str]):

3301 Location where to run the job. Must match the location of the

3302 table used in the query as well as the destination table.

3303 project (Optional[str]):

3304 Project ID of the project of where to run the job. Defaults

3305 to the client's project.

3306 retry (Optional[google.api_core.retry.Retry]):

3307 How to retry the RPC. This only applies to making RPC

3308 calls. It isn't used to retry failed jobs. This has

3309 a reasonable default that should only be overridden

3310 with care.

3311 timeout (Optional[float]):

3312 The number of seconds to wait for the underlying HTTP transport

3313 before using ``retry``.

3314 job_retry (Optional[google.api_core.retry.Retry]):

3315 How to retry failed jobs. The default retries

3316 rate-limit-exceeded errors. Passing ``None`` disables

3317 job retry.

3318

3319 Not all jobs can be retried. If ``job_id`` is

3320 provided, then the job returned by the query will not

3321 be retryable, and an exception will be raised if a

3322 non-``None`` (and non-default) value for ``job_retry``

3323 is also provided.

3324

3325 Note that errors aren't detected until ``result()`` is

3326 called on the job returned. The ``job_retry``

3327 specified here becomes the default ``job_retry`` for

3328 ``result()``, where it can also be specified.

3329 api_method (Union[str, enums.QueryApiMethod]):

3330 Method with which to start the query job.

3331

3332 See :class:`google.cloud.bigquery.enums.QueryApiMethod` for

3333 details on the difference between the query start methods.

3334

3335 Returns:

3336 google.cloud.bigquery.job.QueryJob: A new query job instance.

3337

3338 Raises:

3339 TypeError:

3340 If ``job_config`` is not an instance of

3341 :class:`~google.cloud.bigquery.job.QueryJobConfig`

3342 class, or if both ``job_id`` and non-``None`` non-default

3343 ``job_retry`` are provided.

3344 """

3345 job_id_given = job_id is not None

3346 if (

3347 job_id_given

3348 and job_retry is not None

3349 and job_retry is not DEFAULT_JOB_RETRY

3350 ):

3351 raise TypeError(

3352 "`job_retry` was provided, but the returned job is"

3353 " not retryable, because a custom `job_id` was"

3354 " provided."

3355 )

3356

3357 if job_id_given and api_method == enums.QueryApiMethod.QUERY:

3358 raise TypeError(

3359 "`job_id` was provided, but the 'QUERY' `api_method` was requested."

3360 )

3361

3362 if project is None:

3363 project = self.project

3364

3365 if location is None:

3366 location = self.location

3367

3368 if self._default_query_job_config:

3369 if job_config:

3370 _verify_job_config_type(

3371 job_config, google.cloud.bigquery.job.QueryJobConfig

3372 )

3373 # anything that's not defined on the incoming

3374 # that is in the default,

3375 # should be filled in with the default

3376 # the incoming therefore has precedence

3377 #

3378 # Note that _fill_from_default doesn't mutate the receiver

3379 job_config = job_config._fill_from_default(

3380 self._default_query_job_config

3381 )

3382 else:

3383 _verify_job_config_type(

3384 self._default_query_job_config,

3385 google.cloud.bigquery.job.QueryJobConfig,

3386 )

3387 job_config = self._default_query_job_config

3388

3389 # Note that we haven't modified the original job_config (or

3390 # _default_query_job_config) up to this point.

3391 if api_method == enums.QueryApiMethod.QUERY:

3392 return _job_helpers.query_jobs_query(

3393 self,

3394 query,

3395 job_config,

3396 location,

3397 project,

3398 retry,

3399 timeout,

3400 job_retry,

3401 )

3402 elif api_method == enums.QueryApiMethod.INSERT:

3403 return _job_helpers.query_jobs_insert(

3404 self,

3405 query,

3406 job_config,

3407 job_id,

3408 job_id_prefix,

3409 location,

3410 project,

3411 retry,

3412 timeout,

3413 job_retry,

3414 )

3415 else:

3416 raise ValueError(f"Got unexpected value for api_method: {repr(api_method)}")

3417

3418 def insert_rows(

3419 self,

3420 table: Union[Table, TableReference, str],

3421 rows: Union[Iterable[Tuple], Iterable[Mapping[str, Any]]],

3422 selected_fields: Sequence[SchemaField] = None,

3423 **kwargs,

3424 ) -> Sequence[Dict[str, Any]]:

3425 """Insert rows into a table via the streaming API.

3426

3427 See

3428 https://cloud.google.com/bigquery/docs/reference/rest/v2/tabledata/insertAll

3429

3430 BigQuery will reject insertAll payloads that exceed a defined limit (10MB).

3431 Additionally, if a payload vastly exceeds this limit, the request is rejected

3432 by the intermediate architecture, which returns a 413 (Payload Too Large) status code.

3433

3434

3435 See

3436 https://cloud.google.com/bigquery/quotas#streaming_inserts

3437

3438 Args:

3439 table (Union[ \

3440 google.cloud.bigquery.table.Table, \

3441 google.cloud.bigquery.table.TableReference, \

3442 str, \

3443 ]):

3444 The destination table for the row data, or a reference to it.

3445 rows (Union[Sequence[Tuple], Sequence[Dict]]):

3446 Row data to be inserted. If a list of tuples is given, each

3447 tuple should contain data for each schema field on the

3448 current table and in the same order as the schema fields. If

3449 a list of dictionaries is given, the keys must include all

3450 required fields in the schema. Keys which do not correspond

3451 to a field in the schema are ignored.

3452 selected_fields (Sequence[google.cloud.bigquery.schema.SchemaField]):

3453 The fields to return. Required if ``table`` is a

3454 :class:`~google.cloud.bigquery.table.TableReference`.

3455 kwargs (dict):

3456 Keyword arguments to

3457 :meth:`~google.cloud.bigquery.client.Client.insert_rows_json`.

3458

3459 Returns:

3460 Sequence[Mappings]:

3461 One mapping per row with insert errors: the "index" key

3462 identifies the row, and the "errors" key contains a list of

3463 the mappings describing one or more problems with the row.

3464

3465 Raises:

3466 ValueError: if table's schema is not set or `rows` is not a `Sequence`.

3467 """

3468 if not isinstance(rows, (collections_abc.Sequence, collections_abc.Iterator)):

3469 raise TypeError("rows argument should be a sequence of dicts or tuples")

3470

3471 table = _table_arg_to_table(table, default_project=self.project)

3472

3473 if not isinstance(table, Table):

3474 raise TypeError(_NEED_TABLE_ARGUMENT)

3475

3476 schema = table.schema

3477

3478 # selected_fields can override the table schema.

3479 if selected_fields is not None:

3480 schema = selected_fields

3481

3482 if len(schema) == 0:

3483 raise ValueError(

3484 (

3485 "Could not determine schema for table '{}'. Call client.get_table() "

3486 "or pass in a list of schema fields to the selected_fields argument."

3487 ).format(table)

3488 )

3489

3490 json_rows = [_record_field_to_json(schema, row) for row in rows]

3491

3492 return self.insert_rows_json(table, json_rows, **kwargs)

3493

3494 def insert_rows_from_dataframe(

3495 self,

3496 table: Union[Table, TableReference, str],

3497 dataframe,

3498 selected_fields: Sequence[SchemaField] = None,

3499 chunk_size: int = 500,

3500 **kwargs: Dict,

3501 ) -> Sequence[Sequence[dict]]:

3502 """Insert rows into a table from a dataframe via the streaming API.

3503

3504 BigQuery will reject insertAll payloads that exceed a defined limit (10MB).

3505 Additionally, if a payload vastly exceeds this limit, the request is rejected

3506 by the intermediate architecture, which returns a 413 (Payload Too Large) status code.

3507

3508 See

3509 https://cloud.google.com/bigquery/quotas#streaming_inserts

3510

3511 Args:

3512 table (Union[ \

3513 google.cloud.bigquery.table.Table, \

3514 google.cloud.bigquery.table.TableReference, \

3515 str, \

3516 ]):

3517 The destination table for the row data, or a reference to it.

3518 dataframe (pandas.DataFrame):

3519 A :class:`~pandas.DataFrame` containing the data to load. Any

3520 ``NaN`` values present in the dataframe are omitted from the

3521 streaming API request(s).

3522 selected_fields (Sequence[google.cloud.bigquery.schema.SchemaField]):

3523 The fields to return. Required if ``table`` is a

3524 :class:`~google.cloud.bigquery.table.TableReference`.

3525 chunk_size (int):

3526 The number of rows to stream in a single chunk. Must be positive.

3527 kwargs (Dict):

3528 Keyword arguments to

3529 :meth:`~google.cloud.bigquery.client.Client.insert_rows_json`.

3530

3531 Returns:

3532 Sequence[Sequence[Mappings]]:

3533 A list with insert errors for each insert chunk. Each element

3534 is a list containing one mapping per row with insert errors:

3535 the "index" key identifies the row, and the "errors" key

3536 contains a list of the mappings describing one or more problems

3537 with the row.

3538

3539 Raises:

3540 ValueError: if table's schema is not set

3541 """

3542 insert_results = []

3543

3544 chunk_count = int(math.ceil(len(dataframe) / chunk_size))

3545 rows_iter = _pandas_helpers.dataframe_to_json_generator(dataframe)

3546

3547 for _ in range(chunk_count):

3548 rows_chunk = itertools.islice(rows_iter, chunk_size)

3549 result = self.insert_rows(table, rows_chunk, selected_fields, **kwargs)

3550 insert_results.append(result)

3551

3552 return insert_results

3553

3554 def insert_rows_json(

3555 self,

3556 table: Union[Table, TableReference, TableListItem, str],

3557 json_rows: Sequence[Mapping[str, Any]],

3558 row_ids: Union[

3559 Iterable[Optional[str]], AutoRowIDs, None

3560 ] = AutoRowIDs.GENERATE_UUID,

3561 skip_invalid_rows: bool = None,

3562 ignore_unknown_values: bool = None,

3563 template_suffix: str = None,

3564 retry: retries.Retry = DEFAULT_RETRY,

3565 timeout: TimeoutType = DEFAULT_TIMEOUT,

3566 ) -> Sequence[dict]:

3567 """Insert rows into a table without applying local type conversions.

3568

3569 See

3570 https://cloud.google.com/bigquery/docs/reference/rest/v2/tabledata/insertAll

3571

3572 BigQuery will reject insertAll payloads that exceed a defined limit (10MB).

3573 Additionally, if a payload vastly exceeds this limit, the request is rejected

3574 by the intermediate architecture, which returns a 413 (Payload Too Large) status code.

3575

3576 See

3577 https://cloud.google.com/bigquery/quotas#streaming_inserts

3578

3579 Args:

3580 table (Union[ \

3581 google.cloud.bigquery.table.Table \

3582 google.cloud.bigquery.table.TableReference, \

3583 google.cloud.bigquery.table.TableListItem, \

3584 str \

3585 ]):

3586 The destination table for the row data, or a reference to it.

3587 json_rows (Sequence[Dict]):

3588 Row data to be inserted. Keys must match the table schema fields

3589 and values must be JSON-compatible representations.

3590 row_ids (Union[Iterable[str], AutoRowIDs, None]):

3591 Unique IDs, one per row being inserted. An ID can also be

3592 ``None``, indicating that an explicit insert ID should **not**

3593 be used for that row. If the argument is omitted altogether,

3594 unique IDs are created automatically.

3595

3596 .. versionchanged:: 2.21.0

3597 Can also be an iterable, not just a sequence, or an

3598 :class:`AutoRowIDs` enum member.

3599

3600 .. deprecated:: 2.21.0

3601 Passing ``None`` to explicitly request autogenerating insert IDs is

3602 deprecated, use :attr:`AutoRowIDs.GENERATE_UUID` instead.

3603

3604 skip_invalid_rows (Optional[bool]):

3605 Insert all valid rows of a request, even if invalid rows exist.

3606 The default value is ``False``, which causes the entire request

3607 to fail if any invalid rows exist.

3608 ignore_unknown_values (Optional[bool]):

3609 Accept rows that contain values that do not match the schema.

3610 The unknown values are ignored. Default is ``False``, which

3611 treats unknown values as errors.

3612 template_suffix (Optional[str]):

3613 Treat ``name`` as a template table and provide a suffix.

3614 BigQuery will create the table ``<name> + <template_suffix>``

3615 based on the schema of the template table. See

3616 https://cloud.google.com/bigquery/streaming-data-into-bigquery#template-tables

3617 retry (Optional[google.api_core.retry.Retry]):

3618 How to retry the RPC.

3619 timeout (Optional[float]):

3620 The number of seconds to wait for the underlying HTTP transport

3621 before using ``retry``.

3622

3623 Returns:

3624 Sequence[Mappings]:

3625 One mapping per row with insert errors: the "index" key

3626 identifies the row, and the "errors" key contains a list of

3627 the mappings describing one or more problems with the row.

3628

3629 Raises:

3630 TypeError: if `json_rows` is not a `Sequence`.

3631 """

3632 if not isinstance(

3633 json_rows, (collections_abc.Sequence, collections_abc.Iterator)

3634 ):

3635 raise TypeError("json_rows argument should be a sequence of dicts")

3636 # Convert table to just a reference because unlike insert_rows,

3637 # insert_rows_json doesn't need the table schema. It's not doing any

3638 # type conversions.

3639 table = _table_arg_to_table_ref(table, default_project=self.project)

3640 rows_info: List[Any] = []

3641 data: Dict[str, Any] = {"rows": rows_info}

3642

3643 if row_ids is None:

3644 warnings.warn(

3645 "Passing None for row_ids is deprecated. To explicitly request "

3646 "autogenerated insert IDs, use AutoRowIDs.GENERATE_UUID instead",

3647 category=DeprecationWarning,

3648 )

3649 row_ids = AutoRowIDs.GENERATE_UUID

3650

3651 if not isinstance(row_ids, AutoRowIDs):

3652 try:

3653 row_ids_iter = iter(row_ids)

3654 except TypeError:

3655 msg = "row_ids is neither an iterable nor an AutoRowIDs enum member"

3656 raise TypeError(msg)

3657

3658 for i, row in enumerate(json_rows):

3659 info: Dict[str, Any] = {"json": row}

3660

3661 if row_ids is AutoRowIDs.GENERATE_UUID:

3662 info["insertId"] = str(uuid.uuid4())

3663 elif row_ids is AutoRowIDs.DISABLED:

3664 info["insertId"] = None

3665 else:

3666 try:

3667 insert_id = next(row_ids_iter)

3668 except StopIteration:

3669 msg = f"row_ids did not generate enough IDs, error at index {i}"

3670 raise ValueError(msg)

3671 else:

3672 info["insertId"] = insert_id

3673

3674 rows_info.append(info)

3675

3676 if skip_invalid_rows is not None:

3677 data["skipInvalidRows"] = skip_invalid_rows

3678

3679 if ignore_unknown_values is not None:

3680 data["ignoreUnknownValues"] = ignore_unknown_values

3681

3682 if template_suffix is not None:

3683 data["templateSuffix"] = template_suffix

3684

3685 path = "%s/insertAll" % table.path

3686 # We can always retry, because every row has an insert ID.

3687 span_attributes = {"path": path}

3688 response = self._call_api(

3689 retry,

3690 span_name="BigQuery.insertRowsJson",

3691 span_attributes=span_attributes,

3692 method="POST",

3693 path=path,

3694 data=data,

3695 timeout=timeout,

3696 )

3697 errors = []

3698

3699 for error in response.get("insertErrors", ()):

3700 errors.append({"index": int(error["index"]), "errors": error["errors"]})

3701

3702 return errors

3703

3704 def list_partitions(

3705 self,

3706 table: Union[Table, TableReference, TableListItem, str],

3707 retry: retries.Retry = DEFAULT_RETRY,

3708 timeout: TimeoutType = DEFAULT_TIMEOUT,

3709 ) -> Sequence[str]:

3710 """List the partitions in a table.

3711

3712 Args:

3713 table (Union[ \

3714 google.cloud.bigquery.table.Table, \

3715 google.cloud.bigquery.table.TableReference, \

3716 google.cloud.bigquery.table.TableListItem, \

3717 str, \

3718 ]):

3719 The table or reference from which to get partition info

3720 retry (Optional[google.api_core.retry.Retry]):

3721 How to retry the RPC.

3722 timeout (Optional[float]):

3723 The number of seconds to wait for the underlying HTTP transport

3724 before using ``retry``.

3725 If multiple requests are made under the hood, ``timeout``

3726 applies to each individual request.

3727

3728 Returns:

3729 List[str]:

3730 A list of the partition ids present in the partitioned table

3731 """

3732 table = _table_arg_to_table_ref(table, default_project=self.project)

3733 meta_table = self.get_table(

3734 TableReference(

3735 DatasetReference(table.project, table.dataset_id),

3736 "%s$__PARTITIONS_SUMMARY__" % table.table_id,

3737 ),

3738 retry=retry,

3739 timeout=timeout,

3740 )

3741

3742 subset = [col for col in meta_table.schema if col.name == "partition_id"]

3743 return [

3744 row[0]

3745 for row in self.list_rows(

3746 meta_table, selected_fields=subset, retry=retry, timeout=timeout

3747 )

3748 ]

3749

3750 def list_rows(

3751 self,

3752 table: Union[Table, TableListItem, TableReference, str],

3753 selected_fields: Sequence[SchemaField] = None,

3754 max_results: Optional[int] = None,

3755 page_token: str = None,

3756 start_index: Optional[int] = None,

3757 page_size: Optional[int] = None,

3758 retry: retries.Retry = DEFAULT_RETRY,

3759 timeout: TimeoutType = DEFAULT_TIMEOUT,

3760 ) -> RowIterator:

3761 """List the rows of the table.

3762

3763 See

3764 https://cloud.google.com/bigquery/docs/reference/rest/v2/tabledata/list

3765

3766 .. note::

3767

3768 This method assumes that the provided schema is up-to-date with the

3769 schema as defined on the back-end: if the two schemas are not

3770 identical, the values returned may be incomplete. To ensure that the

3771 local copy of the schema is up-to-date, call ``client.get_table``.

3772

3773 Args:

3774 table (Union[ \

3775 google.cloud.bigquery.table.Table, \

3776 google.cloud.bigquery.table.TableListItem, \

3777 google.cloud.bigquery.table.TableReference, \

3778 str, \

3779 ]):

3780 The table to list, or a reference to it. When the table

3781 object does not contain a schema and ``selected_fields`` is

3782 not supplied, this method calls ``get_table`` to fetch the

3783 table schema.

3784 selected_fields (Sequence[google.cloud.bigquery.schema.SchemaField]):

3785 The fields to return. If not supplied, data for all columns

3786 are downloaded.

3787 max_results (Optional[int]):

3788 Maximum number of rows to return.

3789 page_token (Optional[str]):

3790 Token representing a cursor into the table's rows.

3791 If not passed, the API will return the first page of the

3792 rows. The token marks the beginning of the iterator to be

3793 returned and the value of the ``page_token`` can be accessed

3794 at ``next_page_token`` of the

3795 :class:`~google.cloud.bigquery.table.RowIterator`.

3796 start_index (Optional[int]):

3797 The zero-based index of the starting row to read.

3798 page_size (Optional[int]):

3799 The maximum number of rows in each page of results from this request.

3800 Non-positive values are ignored. Defaults to a sensible value set by the API.

3801 retry (Optional[google.api_core.retry.Retry]):

3802 How to retry the RPC.

3803 timeout (Optional[float]):

3804 The number of seconds to wait for the underlying HTTP transport

3805 before using ``retry``.

3806 If multiple requests are made under the hood, ``timeout``

3807 applies to each individual request.

3808

3809 Returns:

3810 google.cloud.bigquery.table.RowIterator:

3811 Iterator of row data

3812 :class:`~google.cloud.bigquery.table.Row`-s. During each

3813 page, the iterator will have the ``total_rows`` attribute

3814 set, which counts the total number of rows **in the table**

3815 (this is distinct from the total number of rows in the

3816 current page: ``iterator.page.num_items``).

3817 """

3818 table = _table_arg_to_table(table, default_project=self.project)

3819

3820 if not isinstance(table, Table):

3821 raise TypeError(_NEED_TABLE_ARGUMENT)

3822

3823 schema = table.schema

3824

3825 # selected_fields can override the table schema.

3826 if selected_fields is not None:

3827 schema = selected_fields

3828

3829 # No schema, but no selected_fields. Assume the developer wants all

3830 # columns, so get the table resource for them rather than failing.

3831 elif len(schema) == 0:

3832 table = self.get_table(table.reference, retry=retry, timeout=timeout)

3833 schema = table.schema

3834

3835 params: Dict[str, Any] = {}

3836 if selected_fields is not None:

3837 params["selectedFields"] = ",".join(field.name for field in selected_fields)

3838 if start_index is not None:

3839 params["startIndex"] = start_index

3840

3841 params["formatOptions.useInt64Timestamp"] = True

3842 row_iterator = RowIterator(

3843 client=self,

3844 api_request=functools.partial(self._call_api, retry, timeout=timeout),

3845 path="%s/data" % (table.path,),

3846 schema=schema,

3847 page_token=page_token,

3848 max_results=max_results,

3849 page_size=page_size,

3850 extra_params=params,

3851 table=table,

3852 # Pass in selected_fields separately from schema so that full

3853 # tables can be fetched without a column filter.

3854 selected_fields=selected_fields,

3855 total_rows=getattr(table, "num_rows", None),

3856 )

3857 return row_iterator

3858

3859 def _list_rows_from_query_results(

3860 self,

3861 job_id: str,

3862 location: str,

3863 project: str,

3864 schema: SchemaField,

3865 total_rows: Optional[int] = None,

3866 destination: Union[Table, TableReference, TableListItem, str] = None,

3867 max_results: Optional[int] = None,

3868 start_index: Optional[int] = None,

3869 page_size: Optional[int] = None,

3870 retry: retries.Retry = DEFAULT_RETRY,

3871 timeout: TimeoutType = DEFAULT_TIMEOUT,

3872 ) -> RowIterator:

3873 """List the rows of a completed query.

3874 See

3875 https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/getQueryResults

3876 Args:

3877 job_id (str):

3878 ID of a query job.

3879 location (str): Location of the query job.

3880 project (str):

3881 ID of the project where the query job was run.

3882 schema (Sequence[google.cloud.bigquery.schema.SchemaField]):

3883 The fields expected in these query results. Used to convert

3884 from JSON to expected Python types.

3885 total_rows (Optional[int]):

3886 Total number of rows in the query results.

3887 destination (Optional[Union[ \

3888 google.cloud.bigquery.table.Table, \

3889 google.cloud.bigquery.table.TableListItem, \

3890 google.cloud.bigquery.table.TableReference, \

3891 str, \

3892 ]]):

3893 Destination table reference. Used to fetch the query results

3894 with the BigQuery Storage API.

3895 max_results (Optional[int]):

3896 Maximum number of rows to return across the whole iterator.

3897 start_index (Optional[int]):

3898 The zero-based index of the starting row to read.

3899 page_size (Optional[int]):

3900 The maximum number of rows in each page of results from this request.

3901 Non-positive values are ignored. Defaults to a sensible value set by the API.

3902 retry (Optional[google.api_core.retry.Retry]):

3903 How to retry the RPC.

3904 timeout (Optional[float]):

3905 The number of seconds to wait for the underlying HTTP transport

3906 before using ``retry``. If set, this connection timeout may be

3907 increased to a minimum value. This prevents retries on what

3908 would otherwise be a successful response.

3909 If multiple requests are made under the hood, ``timeout``

3910 applies to each individual request.

3911 Returns:

3912 google.cloud.bigquery.table.RowIterator:

3913 Iterator of row data

3914 :class:`~google.cloud.bigquery.table.Row`-s.

3915 """

3916 params: Dict[str, Any] = {

3917 "fields": _LIST_ROWS_FROM_QUERY_RESULTS_FIELDS,

3918 "location": location,

3919 }

3920

3921 if timeout is not None:

3922 timeout = max(timeout, _MIN_GET_QUERY_RESULTS_TIMEOUT)

3923

3924 if start_index is not None:

3925 params["startIndex"] = start_index

3926

3927 params["formatOptions.useInt64Timestamp"] = True

3928 row_iterator = RowIterator(

3929 client=self,

3930 api_request=functools.partial(self._call_api, retry, timeout=timeout),

3931 path=f"/projects/{project}/queries/{job_id}",

3932 schema=schema,

3933 max_results=max_results,

3934 page_size=page_size,

3935 table=destination,

3936 extra_params=params,

3937 total_rows=total_rows,

3938 )

3939 return row_iterator

3940

3941 def _schema_from_json_file_object(self, file_obj):

3942 """Helper function for schema_from_json that takes a

3943 file object that describes a table schema.

3944

3945 Returns:

3946 List of schema field objects.

3947 """

3948 json_data = json.load(file_obj)

3949 return [SchemaField.from_api_repr(field) for field in json_data]

3950

3951 def _schema_to_json_file_object(self, schema_list, file_obj):

3952 """Helper function for schema_to_json that takes a schema list and file

3953 object and writes the schema list to the file object with json.dump

3954 """

3955 json.dump(schema_list, file_obj, indent=2, sort_keys=True)

3956

3957 def schema_from_json(self, file_or_path: "PathType"):

3958 """Takes a file object or file path that contains json that describes

3959 a table schema.

3960

3961 Returns:

3962 List of schema field objects.

3963 """

3964 if isinstance(file_or_path, io.IOBase):

3965 return self._schema_from_json_file_object(file_or_path)

3966

3967 with open(file_or_path) as file_obj:

3968 return self._schema_from_json_file_object(file_obj)

3969

3970 def schema_to_json(

3971 self, schema_list: Sequence[SchemaField], destination: "PathType"

3972 ):

3973 """Takes a list of schema field objects.

3974

3975 Serializes the list of schema field objects as json to a file.

3976

3977 Destination is a file path or a file object.

3978 """

3979 json_schema_list = [f.to_api_repr() for f in schema_list]

3980

3981 if isinstance(destination, io.IOBase):

3982 return self._schema_to_json_file_object(json_schema_list, destination)

3983

3984 with open(destination, mode="w") as file_obj:

3985 return self._schema_to_json_file_object(json_schema_list, file_obj)

3986

3987 def __enter__(self):

3988 return self

3989

3990 def __exit__(self, exc_type, exc_value, traceback):

3991 self.close()

3992

3993

3994# pylint: disable=unused-argument

3995def _item_to_project(iterator, resource):

3996 """Convert a JSON project to the native object.

3997

3998 Args:

3999 iterator (google.api_core.page_iterator.Iterator): The iterator that is currently in use.

4000

4001 resource (Dict): An item to be converted to a project.

4002

4003 Returns:

4004 google.cloud.bigquery.client.Project: The next project in the page.

4005 """

4006 return Project.from_api_repr(resource)

4007

4008

4009# pylint: enable=unused-argument

4010

4011

4012def _item_to_dataset(iterator, resource):

4013 """Convert a JSON dataset to the native object.

4014

4015 Args:

4016 iterator (google.api_core.page_iterator.Iterator): The iterator that is currently in use.

4017

4018 resource (Dict): An item to be converted to a dataset.

4019

4020 Returns:

4021 google.cloud.bigquery.dataset.DatasetListItem: The next dataset in the page.

4022 """

4023 return DatasetListItem(resource)

4024

4025

4026def _item_to_job(iterator, resource):

4027 """Convert a JSON job to the native object.

4028

4029 Args:

4030 iterator (google.api_core.page_iterator.Iterator): The iterator that is currently in use.

4031

4032 resource (Dict): An item to be converted to a job.

4033

4034 Returns:

4035 job instance: The next job in the page.

4036 """

4037 return iterator.client.job_from_resource(resource)

4038

4039

4040def _item_to_model(iterator, resource):

4041 """Convert a JSON model to the native object.

4042

4043 Args:

4044 iterator (google.api_core.page_iterator.Iterator):

4045 The iterator that is currently in use.

4046 resource (Dict): An item to be converted to a model.

4047

4048 Returns:

4049 google.cloud.bigquery.model.Model: The next model in the page.

4050 """

4051 return Model.from_api_repr(resource)

4052

4053

4054def _item_to_routine(iterator, resource):

4055 """Convert a JSON model to the native object.

4056

4057 Args:

4058 iterator (google.api_core.page_iterator.Iterator):

4059 The iterator that is currently in use.

4060 resource (Dict): An item to be converted to a routine.

4061

4062 Returns:

4063 google.cloud.bigquery.routine.Routine: The next routine in the page.

4064 """

4065 return Routine.from_api_repr(resource)

4066

4067

4068def _item_to_table(iterator, resource):

4069 """Convert a JSON table to the native object.

4070

4071 Args:

4072 iterator (google.api_core.page_iterator.Iterator): The iterator that is currently in use.

4073

4074 resource (Dict): An item to be converted to a table.

4075

4076 Returns:

4077 google.cloud.bigquery.table.Table: The next table in the page.

4078 """

4079 return TableListItem(resource)

4080

4081

4082def _extract_job_reference(job, project=None, location=None):

4083 """Extract fully-qualified job reference from a job-like object.

4084

4085 Args:

4086 job_id (Union[ \

4087 str, \

4088 google.cloud.bigquery.job.LoadJob, \

4089 google.cloud.bigquery.job.CopyJob, \

4090 google.cloud.bigquery.job.ExtractJob, \

4091 google.cloud.bigquery.job.QueryJob \

4092 ]): Job identifier.

4093 project (Optional[str]):

4094 Project where the job was run. Ignored if ``job_id`` is a job

4095 object.

4096 location (Optional[str]):

4097 Location where the job was run. Ignored if ``job_id`` is a job

4098 object.

4099

4100 Returns:

4101 Tuple[str, str, str]: ``(project, location, job_id)``

4102 """

4103 if hasattr(job, "job_id"):

4104 project = job.project

4105 job_id = job.job_id

4106 location = job.location

4107 else:

4108 job_id = job

4109

4110 return (project, location, job_id)

4111

4112

4113def _check_mode(stream):

4114 """Check that a stream was opened in read-binary mode.

4115

4116 Args:

4117 stream (IO[bytes]): A bytes IO object open for reading.

4118

4119 Raises:

4120 ValueError:

4121 if the ``stream.mode`` is a valid attribute

4122 and is not among ``rb``, ``r+b`` or ``rb+``.

4123 """

4124 mode = getattr(stream, "mode", None)

4125

4126 if isinstance(stream, gzip.GzipFile):

4127 if mode != gzip.READ: # pytype: disable=module-attr

4128 raise ValueError(

4129 "Cannot upload gzip files opened in write mode: use "

4130 "gzip.GzipFile(filename, mode='rb')"

4131 )

4132 else:

4133 if mode is not None and mode not in ("rb", "r+b", "rb+"):

4134 raise ValueError(

4135 "Cannot upload files opened in text mode: use "

4136 "open(filename, mode='rb') or open(filename, mode='r+b')"

4137 )

4138

4139

4140def _get_upload_headers(user_agent):

4141 """Get the headers for an upload request.

4142

4143 Args:

4144 user_agent (str): The user-agent for requests.

4145

4146 Returns:

4147 Dict: The headers to be used for the request.

4148 """

4149 return {

4150 "Accept": "application/json",

4151 "Accept-Encoding": "gzip, deflate",

4152 "User-Agent": user_agent,

4153 "content-type": "application/json; charset=UTF-8",

4154 }

4155

4156

4157def _add_server_timeout_header(headers: Optional[Dict[str, str]], kwargs):

4158 timeout = kwargs.get("timeout")

4159 if timeout is not None:

4160 if headers is None:

4161 headers = {}

4162 headers[TIMEOUT_HEADER] = str(timeout)

4163

4164 if headers:

4165 kwargs["headers"] = headers

4166

4167 return kwargs