1# Copyright 2019 Google LLC
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15"""Shared helper functions for connecting BigQuery and pandas.
16
17NOTE: This module is DEPRECATED. Please make updates in the pandas-gbq package,
18instead. See: go/pandas-gbq-and-bigframes-redundancy and
19https://github.com/googleapis/python-bigquery-pandas/blob/main/pandas_gbq/schema/pandas_to_bigquery.py
20"""
21
22import concurrent.futures
23from datetime import datetime
24import functools
25from itertools import islice
26import logging
27import queue
28import threading
29import warnings
30from typing import Any, Union, Optional, Callable, Generator, List
31
32
33from google.cloud.bigquery import _pyarrow_helpers
34from google.cloud.bigquery import _versions_helpers
35from google.cloud.bigquery import schema
36
37
38try:
39 import pandas # type: ignore
40
41 pandas_import_exception = None
42except ImportError as exc:
43 pandas = None
44 pandas_import_exception = exc
45else:
46 import numpy
47
48
49try:
50 import pandas_gbq.schema.pandas_to_bigquery # type: ignore
51
52 pandas_gbq_import_exception = None
53except ImportError as exc:
54 pandas_gbq = None
55 pandas_gbq_import_exception = exc
56
57
58try:
59 import db_dtypes # type: ignore
60
61 date_dtype_name = db_dtypes.DateDtype.name
62 time_dtype_name = db_dtypes.TimeDtype.name
63 db_dtypes_import_exception = None
64except ImportError as exc:
65 db_dtypes = None
66 db_dtypes_import_exception = exc
    date_dtype_name = time_dtype_name = ""  # Use '' rather than None so pytype sees str, not Optional[str].
68
69pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import()
70
71try:
    # _BaseGeometry is used to detect shapely objects in `bq_to_arrow_array`
73 from shapely.geometry.base import BaseGeometry as _BaseGeometry # type: ignore
74except ImportError:
75 # No shapely, use NoneType for _BaseGeometry as a placeholder.
76 _BaseGeometry = type(None)
77else:
78 # We don't have any unit test sessions that install shapely but not pandas.
79 if pandas is not None: # pragma: NO COVER
80
81 def _to_wkb():
82 from shapely import wkb # type: ignore
83
84 write = wkb.dumps
85 notnull = pandas.notnull
86
87 def _to_wkb(v):
88 return write(v) if notnull(v) else v
89
90 return _to_wkb
91
92 _to_wkb = _to_wkb()
93
94try:
95 from google.cloud.bigquery_storage_v1.types import ArrowSerializationOptions
96except ImportError:
97 _ARROW_COMPRESSION_SUPPORT = False
98else:
99 # Having BQ Storage available implies that pyarrow >=1.0.0 is available, too.
100 _ARROW_COMPRESSION_SUPPORT = True
101
102_LOGGER = logging.getLogger(__name__)
103
104_PROGRESS_INTERVAL = 0.2 # Maximum time between download status checks, in seconds.
105
106_MAX_QUEUE_SIZE_DEFAULT = object() # max queue size sentinel for BQ Storage downloads
107
108_NO_PANDAS_ERROR = "Please install the 'pandas' package to use this function."
109_NO_DB_TYPES_ERROR = "Please install the 'db-dtypes' package to use this function."
110
111_PANDAS_DTYPE_TO_BQ = {
112 "bool": "BOOLEAN",
113 "datetime64[ns, UTC]": "TIMESTAMP",
114 "datetime64[ns]": "DATETIME",
115 "float32": "FLOAT",
116 "float64": "FLOAT",
117 "int8": "INTEGER",
118 "int16": "INTEGER",
119 "int32": "INTEGER",
120 "int64": "INTEGER",
121 "uint8": "INTEGER",
122 "uint16": "INTEGER",
123 "uint32": "INTEGER",
124 "geometry": "GEOGRAPHY",
125 date_dtype_name: "DATE",
126 time_dtype_name: "TIME",
127}
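
# Illustrative sketch (not part of the public API): this mapping is consulted
# with a pandas dtype name in dataframe_to_bq_schema(), e.g.
#
#     import pandas
#     series = pandas.Series([1, 2, 3], dtype="int64")
#     _PANDAS_DTYPE_TO_BQ.get(series.dtype.name)  # -> "INTEGER"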
128
129
130class _DownloadState(object):
131 """Flag to indicate that a thread should exit early."""
132
133 def __init__(self):
134 # No need for a lock because reading/replacing a variable is defined to
135 # be an atomic operation in the Python language definition (enforced by
136 # the global interpreter lock).
137 self.done = False
138 # To assist with testing and understanding the behavior of the
139 # download, use this object as shared state to track how many worker
140 # threads have started and have gracefully shutdown.
141 self._started_workers_lock = threading.Lock()
142 self.started_workers = 0
143 self._finished_workers_lock = threading.Lock()
144 self.finished_workers = 0
145
146 def start(self):
147 with self._started_workers_lock:
148 self.started_workers += 1
149
150 def finish(self):
151 with self._finished_workers_lock:
152 self.finished_workers += 1
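
# Rough usage sketch (mirrors how _download_table_bqstorage coordinates its
# worker threads; names below are illustrative only):
#
#     download_state = _DownloadState()
#     # Each worker calls download_state.start() on entry and
#     # download_state.finish() in a finally block; the parent generator sets
#     # download_state.done = True when workers should exit early.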
153
154
155BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA = {
156 "GEOGRAPHY": {
157 b"ARROW:extension:name": b"google:sqlType:geography",
158 b"ARROW:extension:metadata": b'{"encoding": "WKT"}',
159 },
160 "DATETIME": {b"ARROW:extension:name": b"google:sqlType:datetime"},
161 "JSON": {b"ARROW:extension:name": b"google:sqlType:json"},
162}
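
# Illustrative sketch: bq_to_arrow_field() attaches these bytes as Arrow field
# metadata, roughly equivalent to (field name and type here are just examples):
#
#     pyarrow.field(
#         "geo",
#         pyarrow.string(),
#         metadata=BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA["GEOGRAPHY"],
#     )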
163
164
165def bq_to_arrow_struct_data_type(field):
166 arrow_fields = []
167 for subfield in field.fields:
168 arrow_subfield = bq_to_arrow_field(subfield)
169 if arrow_subfield:
170 arrow_fields.append(arrow_subfield)
171 else:
172 # Could not determine a subfield type. Fallback to type
173 # inference.
174 return None
175 return pyarrow.struct(arrow_fields)
176
177
178def bq_to_arrow_range_data_type(field):
179 if field is None:
180 raise ValueError(
181 "Range element type cannot be None, must be one of "
182 "DATE, DATETIME, or TIMESTAMP"
183 )
184 element_type = field.element_type.upper()
185 arrow_element_type = _pyarrow_helpers.bq_to_arrow_scalars(element_type)()
186 return pyarrow.struct([("start", arrow_element_type), ("end", arrow_element_type)])
187
188
189def bq_to_arrow_data_type(field):
190 """Return the Arrow data type, corresponding to a given BigQuery column.
191
192 Returns:
193 None: if default Arrow type inspection should be used.
194 """
195 if field.mode is not None and field.mode.upper() == "REPEATED":
196 inner_type = bq_to_arrow_data_type(
197 schema.SchemaField(field.name, field.field_type, fields=field.fields)
198 )
199 if inner_type:
200 return pyarrow.list_(inner_type)
201 return None
202
203 field_type_upper = field.field_type.upper() if field.field_type else ""
204 if field_type_upper in schema._STRUCT_TYPES:
205 return bq_to_arrow_struct_data_type(field)
206
207 if field_type_upper == "RANGE":
208 return bq_to_arrow_range_data_type(field.range_element_type)
209
210 data_type_constructor = _pyarrow_helpers.bq_to_arrow_scalars(field_type_upper)
211 if data_type_constructor is None:
212 return None
213 return data_type_constructor()
214
215
216def bq_to_arrow_field(bq_field, array_type=None):
217 """Return the Arrow field, corresponding to a given BigQuery column.
218
219 Returns:
220 None: if the Arrow type cannot be determined.
221 """
222 arrow_type = bq_to_arrow_data_type(bq_field)
223 if arrow_type is not None:
224 if array_type is not None:
225 arrow_type = array_type # For GEOGRAPHY, at least initially
226 metadata = BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA.get(
227 bq_field.field_type.upper() if bq_field.field_type else ""
228 )
229 return pyarrow.field(
230 bq_field.name,
231 arrow_type,
232 # Even if the remote schema is REQUIRED, there's a chance there's
233 # local NULL values. Arrow will gladly interpret these NULL values
234 # as non-NULL and give you an arbitrary value. See:
235 # https://github.com/googleapis/python-bigquery/issues/1692
236 nullable=False if bq_field.mode.upper() == "REPEATED" else True,
237 metadata=metadata,
238 )
239
240 warnings.warn(
241 "Unable to determine Arrow type for field '{}'.".format(bq_field.name)
242 )
243 return None
244
245
246def bq_to_arrow_schema(bq_schema):
247 """Return the Arrow schema, corresponding to a given BigQuery schema.
248
249 Returns:
250 None: if any Arrow type cannot be determined.
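
    Example (a minimal sketch; field names are illustrative and ``pyarrow``
    is assumed to be installed):

        bq_schema = [
            schema.SchemaField("name", "STRING"),
            schema.SchemaField("age", "INTEGER"),
        ]
        arrow_schema = bq_to_arrow_schema(bq_schema)
        # arrow_schema is a pyarrow.Schema, or None if any type is unknown.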
251 """
252 arrow_fields = []
253 for bq_field in bq_schema:
254 arrow_field = bq_to_arrow_field(bq_field)
255 if arrow_field is None:
256 # Auto-detect the schema if there is an unknown field type.
257 return None
258 arrow_fields.append(arrow_field)
259 return pyarrow.schema(arrow_fields)
260
261
262def default_types_mapper(
263 date_as_object: bool = False,
264 bool_dtype: Union[Any, None] = None,
265 int_dtype: Union[Any, None] = None,
266 float_dtype: Union[Any, None] = None,
267 string_dtype: Union[Any, None] = None,
268 date_dtype: Union[Any, None] = None,
269 datetime_dtype: Union[Any, None] = None,
270 time_dtype: Union[Any, None] = None,
271 timestamp_dtype: Union[Any, None] = None,
272 range_date_dtype: Union[Any, None] = None,
273 range_datetime_dtype: Union[Any, None] = None,
274 range_timestamp_dtype: Union[Any, None] = None,
275):
276 """Create a mapping from pyarrow types to pandas types.
277
278 This overrides the pandas defaults to use null-safe extension types where
279 available.
280
281 See: https://arrow.apache.org/docs/python/api/datatypes.html for a list of
282 data types. See:
283 tests/unit/test__pandas_helpers.py::test_bq_to_arrow_data_type for
284 BigQuery to Arrow type mapping.
285
286 Note to google-cloud-bigquery developers: If you update the default dtypes,
287 also update the docs at docs/usage/pandas.rst.
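
    Example (an illustrative sketch; the extension dtypes shown are just
    possible choices, not required ones):

        import pandas
        mapper = default_types_mapper(
            bool_dtype=pandas.BooleanDtype(), int_dtype=pandas.Int64Dtype()
        )
        # Typically passed to pyarrow.Table.to_pandas(types_mapper=mapper).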
288 """
289
290 def types_mapper(arrow_data_type):
291 if bool_dtype is not None and pyarrow.types.is_boolean(arrow_data_type):
292 return bool_dtype
293
294 elif int_dtype is not None and pyarrow.types.is_integer(arrow_data_type):
295 return int_dtype
296
297 elif float_dtype is not None and pyarrow.types.is_floating(arrow_data_type):
298 return float_dtype
299
300 elif string_dtype is not None and pyarrow.types.is_string(arrow_data_type):
301 return string_dtype
302
303 elif (
304 # If date_as_object is True, we know some DATE columns are
305 # out-of-bounds of what is supported by pandas.
306 date_dtype is not None
307 and not date_as_object
308 and pyarrow.types.is_date(arrow_data_type)
309 ):
310 return date_dtype
311
312 elif (
313 datetime_dtype is not None
314 and pyarrow.types.is_timestamp(arrow_data_type)
315 and arrow_data_type.tz is None
316 ):
317 return datetime_dtype
318
319 elif (
320 timestamp_dtype is not None
321 and pyarrow.types.is_timestamp(arrow_data_type)
322 and arrow_data_type.tz is not None
323 ):
324 return timestamp_dtype
325
326 elif time_dtype is not None and pyarrow.types.is_time(arrow_data_type):
327 return time_dtype
328
329 elif pyarrow.types.is_struct(arrow_data_type):
330 if range_datetime_dtype is not None and arrow_data_type.equals(
331 range_datetime_dtype.pyarrow_dtype
332 ):
333 return range_datetime_dtype
334
335 elif range_date_dtype is not None and arrow_data_type.equals(
336 range_date_dtype.pyarrow_dtype
337 ):
338 return range_date_dtype
339
340 elif range_timestamp_dtype is not None and arrow_data_type.equals(
341 range_timestamp_dtype.pyarrow_dtype
342 ):
343 return range_timestamp_dtype
344
345 return types_mapper
346
347
348def bq_to_arrow_array(series, bq_field):
349 if bq_field.field_type.upper() == "GEOGRAPHY":
350 arrow_type = None
351 first = _first_valid(series)
352 if first is not None:
353 if series.dtype.name == "geometry" or isinstance(first, _BaseGeometry):
354 arrow_type = pyarrow.binary()
                # Convert shapely geometry to WKB binary format:
356 series = series.apply(_to_wkb)
357 elif isinstance(first, bytes):
358 arrow_type = pyarrow.binary()
359 elif series.dtype.name == "geometry":
                # We have a GeoSeries containing all nulls; convert it to a pandas Series.
361 series = pandas.Series(numpy.array(series))
362
363 if arrow_type is None:
364 arrow_type = bq_to_arrow_data_type(bq_field)
365 else:
366 arrow_type = bq_to_arrow_data_type(bq_field)
367
368 field_type_upper = bq_field.field_type.upper() if bq_field.field_type else ""
369
370 try:
371 if bq_field.mode.upper() == "REPEATED":
372 return pyarrow.ListArray.from_pandas(series, type=arrow_type)
373 if field_type_upper in schema._STRUCT_TYPES:
374 return pyarrow.StructArray.from_pandas(series, type=arrow_type)
375 return pyarrow.Array.from_pandas(series, type=arrow_type)
376 except pyarrow.ArrowTypeError:
377 msg = f"""Error converting Pandas column with name: "{series.name}" and datatype: "{series.dtype}" to an appropriate pyarrow datatype: Array, ListArray, or StructArray"""
378 _LOGGER.error(msg)
379 raise pyarrow.ArrowTypeError(msg)
380
381
382def get_column_or_index(dataframe, name):
383 """Return a column or index as a pandas series."""
384 if name in dataframe.columns:
385 return dataframe[name].reset_index(drop=True)
386
387 if isinstance(dataframe.index, pandas.MultiIndex):
388 if name in dataframe.index.names:
389 return (
390 dataframe.index.get_level_values(name)
391 .to_series()
392 .reset_index(drop=True)
393 )
394 else:
395 if name == dataframe.index.name:
396 return dataframe.index.to_series().reset_index(drop=True)
397
398 raise ValueError("column or index '{}' not found.".format(name))
399
400
401def list_columns_and_indexes(dataframe):
402 """Return all index and column names with dtypes.
403
404 Returns:
405 Sequence[Tuple[str, dtype]]:
            Returns a list of index and column names with their corresponding
            dtypes. If an index is missing a name or has the same name as a
            column, the index is omitted.
409 """
410 column_names = frozenset(dataframe.columns)
411 columns_and_indexes = []
412 if isinstance(dataframe.index, pandas.MultiIndex):
413 for name in dataframe.index.names:
414 if name and name not in column_names:
415 values = dataframe.index.get_level_values(name)
416 columns_and_indexes.append((name, values.dtype))
417 else:
418 if dataframe.index.name and dataframe.index.name not in column_names:
419 columns_and_indexes.append((dataframe.index.name, dataframe.index.dtype))
420
421 columns_and_indexes += zip(dataframe.columns, dataframe.dtypes)
422 return columns_and_indexes
423
424
425def _first_valid(series):
426 first_valid_index = series.first_valid_index()
427 if first_valid_index is not None:
428 return series.at[first_valid_index]
429
430
431def _first_array_valid(series):
432 """Return the first "meaningful" element from the array series.
433
434 Here, "meaningful" means the first non-None element in one of the arrays that can
    be used for type detection.
436 """
437 first_valid_index = series.first_valid_index()
438 if first_valid_index is None:
439 return None
440
441 valid_array = series.at[first_valid_index]
442 valid_item = next((item for item in valid_array if not pandas.isna(item)), None)
443
444 if valid_item is not None:
445 return valid_item
446
447 # Valid item is None because all items in the "valid" array are invalid. Try
448 # to find a true valid array manually.
449 for array in islice(series, first_valid_index + 1, None):
450 try:
451 array_iter = iter(array)
452 except TypeError:
453 continue # Not an array, apparently, e.g. None, thus skip.
454 valid_item = next((item for item in array_iter if not pandas.isna(item)), None)
455 if valid_item is not None:
456 break
457
458 return valid_item
459
460
461def dataframe_to_bq_schema(dataframe, bq_schema):
462 """Convert a pandas DataFrame schema to a BigQuery schema.
463
464 DEPRECATED: Use
465 pandas_gbq.schema.pandas_to_bigquery.dataframe_to_bigquery_fields(),
466 instead. See: go/pandas-gbq-and-bigframes-redundancy.
467
468 Args:
469 dataframe (pandas.DataFrame):
470 DataFrame for which the client determines the BigQuery schema.
471 bq_schema (Sequence[Union[ \
472 :class:`~google.cloud.bigquery.schema.SchemaField`, \
473 Mapping[str, Any] \
474 ]]):
475 A BigQuery schema. Use this argument to override the autodetected
476 type for some or all of the DataFrame columns.
477
478 Returns:
479 Optional[Sequence[google.cloud.bigquery.schema.SchemaField]]:
480 The automatically determined schema. Returns None if the type of
481 any column cannot be determined.
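
    Example (a minimal sketch; column names are illustrative, and the exact
    fields returned depend on whether pandas-gbq is installed):

        df = pandas.DataFrame({"name": ["a"], "score": [1.5]})
        dataframe_to_bq_schema(df, bq_schema=[])
        # -> roughly (SchemaField('name', 'STRING'), SchemaField('score', 'FLOAT'))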
482 """
483 if pandas_gbq is None:
484 warnings.warn(
485 "Loading pandas DataFrame into BigQuery will require pandas-gbq "
486 "package version 0.26.1 or greater in the future. "
487 f"Tried to import pandas-gbq and got: {pandas_gbq_import_exception}",
488 category=FutureWarning,
489 )
490 else:
491 return pandas_gbq.schema.pandas_to_bigquery.dataframe_to_bigquery_fields(
492 dataframe,
493 override_bigquery_fields=bq_schema,
494 index=True,
495 )
496
497 if bq_schema:
498 bq_schema = schema._to_schema_fields(bq_schema)
499 bq_schema_index = {field.name: field for field in bq_schema}
500 bq_schema_unused = set(bq_schema_index.keys())
501 else:
502 bq_schema_index = {}
503 bq_schema_unused = set()
504
505 bq_schema_out = []
506 unknown_type_columns = []
507 dataframe_reset_index = dataframe.reset_index()
508 for column, dtype in list_columns_and_indexes(dataframe):
509 # Step 1: use provided type from schema, if present.
510 bq_field = bq_schema_index.get(column)
511 if bq_field:
512 bq_schema_out.append(bq_field)
513 bq_schema_unused.discard(bq_field.name)
514 continue
515
516 # Step 2: try to automatically determine the type based on the
517 # pandas dtype.
518 bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name)
519 if bq_type is None:
520 sample_data = _first_valid(dataframe_reset_index[column])
521 if (
522 isinstance(sample_data, _BaseGeometry)
523 and sample_data is not None # Paranoia
524 ):
525 bq_type = "GEOGRAPHY"
526 if bq_type is not None:
527 bq_schema_out.append(schema.SchemaField(column, bq_type))
528 continue
529
530 # Step 3: try with pyarrow if available
531 bq_field = _get_schema_by_pyarrow(column, dataframe_reset_index[column])
532 if bq_field is not None:
533 bq_schema_out.append(bq_field)
534 continue
535
536 unknown_type_columns.append(column)
537
538 # Catch any schema mismatch. The developer explicitly asked to serialize a
539 # column, but it was not found.
540 if bq_schema_unused:
541 raise ValueError(
542 "bq_schema contains fields not present in dataframe: {}".format(
543 bq_schema_unused
544 )
545 )
546
    if unknown_type_columns:
548 msg = "Could not determine the type of columns: {}".format(
549 ", ".join(unknown_type_columns)
550 )
551 warnings.warn(msg)
552 return None # We cannot detect the schema in full.
553
554 return tuple(bq_schema_out)
555
556
557def _get_schema_by_pyarrow(name, series):
558 """Attempt to detect the type of the given series by leveraging PyArrow's
559 type detection capabilities.
560
561 This function requires the ``pyarrow`` library to be installed and
562 available. If the series type cannot be determined or ``pyarrow`` is not
563 available, ``None`` is returned.
564
565 Args:
566 name (str):
567 the column name of the SchemaField.
568 series (pandas.Series):
569 The Series data for which to detect the data type.
570 Returns:
571 Optional[google.cloud.bigquery.schema.SchemaField]:
            A ``SchemaField`` constructed from the detected BigQuery-compatible
            type string (e.g., "STRING", "INTEGER", "TIMESTAMP", "DATETIME",
            "NUMERIC", "BIGNUMERIC") and mode ("NULLABLE" or "REPEATED").
575 Returns ``None`` if the type cannot be determined or ``pyarrow``
576 is not imported.
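
    Example (illustrative):

        series = pandas.Series(["a", "b"])
        _get_schema_by_pyarrow("name", series)
        # -> SchemaField('name', 'STRING', 'NULLABLE'), or None without pyarrow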
577 """
578
579 if not pyarrow:
580 return None
581
582 arrow_table = pyarrow.array(series)
583 if pyarrow.types.is_list(arrow_table.type):
584 # `pyarrow.ListType`
585 mode = "REPEATED"
586 type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.values.type.id)
587
588 # For timezone-naive datetimes, pyarrow assumes the UTC timezone and adds
589 # it to such datetimes, causing them to be recognized as TIMESTAMP type.
590 # We thus additionally check the actual data to see if we need to overrule
591 # that and choose DATETIME instead.
592 # Note that this should only be needed for datetime values inside a list,
593 # since scalar datetime values have a proper Pandas dtype that allows
594 # distinguishing between timezone-naive and timezone-aware values before
595 # even requiring the additional schema augment logic in this method.
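        # Illustrative sketch of the case handled here: a list column such as
        #     pandas.Series([[datetime(2024, 1, 1, 12)]])
        # can be detected as TIMESTAMP even though its values are naive, so the
        # check below reports DATETIME instead.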
596 if type == "TIMESTAMP":
597 valid_item = _first_array_valid(series)
598 if isinstance(valid_item, datetime) and valid_item.tzinfo is None:
599 type = "DATETIME"
600 else:
601 mode = "NULLABLE" # default mode
602 type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.type.id)
603 if type == "NUMERIC" and arrow_table.type.scale > 9:
604 type = "BIGNUMERIC"
605
606 if type is not None:
607 return schema.SchemaField(name, type, mode)
608 else:
609 return None
610
611
612def dataframe_to_arrow(dataframe, bq_schema):
613 """Convert pandas dataframe to Arrow table, using BigQuery schema.
614
615 Args:
616 dataframe (pandas.DataFrame):
617 DataFrame to convert to Arrow table.
618 bq_schema (Sequence[Union[ \
619 :class:`~google.cloud.bigquery.schema.SchemaField`, \
620 Mapping[str, Any] \
621 ]]):
622 Desired BigQuery schema. The number of columns must match the
623 number of columns in the DataFrame.
624
625 Returns:
626 pyarrow.Table:
627 Table containing dataframe data, with schema derived from
628 BigQuery schema.
629 """
630 column_names = set(dataframe.columns)
631 column_and_index_names = set(
632 name for name, _ in list_columns_and_indexes(dataframe)
633 )
634
635 bq_schema = schema._to_schema_fields(bq_schema)
636 bq_field_names = set(field.name for field in bq_schema)
637
638 extra_fields = bq_field_names - column_and_index_names
639 if extra_fields:
640 raise ValueError(
641 "bq_schema contains fields not present in dataframe: {}".format(
642 extra_fields
643 )
644 )
645
646 # It's okay for indexes to be missing from bq_schema, but it's not okay to
647 # be missing columns.
648 missing_fields = column_names - bq_field_names
649 if missing_fields:
650 raise ValueError(
651 "bq_schema is missing fields from dataframe: {}".format(missing_fields)
652 )
653
654 arrow_arrays = []
655 arrow_names = []
656 arrow_fields = []
657 for bq_field in bq_schema:
658 arrow_names.append(bq_field.name)
659 arrow_arrays.append(
660 bq_to_arrow_array(get_column_or_index(dataframe, bq_field.name), bq_field)
661 )
662 arrow_fields.append(bq_to_arrow_field(bq_field, arrow_arrays[-1].type))
663
664 if all((field is not None for field in arrow_fields)):
665 return pyarrow.Table.from_arrays(
666 arrow_arrays, schema=pyarrow.schema(arrow_fields)
667 )
668 return pyarrow.Table.from_arrays(arrow_arrays, names=arrow_names)
669
670
671def dataframe_to_parquet(
672 dataframe,
673 bq_schema,
674 filepath,
675 parquet_compression="SNAPPY",
676 parquet_use_compliant_nested_type=True,
677):
678 """Write dataframe as a Parquet file, according to the desired BQ schema.
679
680 This function requires the :mod:`pyarrow` package. Arrow is used as an
681 intermediate format.
682
683 Args:
684 dataframe (pandas.DataFrame):
685 DataFrame to convert to Parquet file.
686 bq_schema (Sequence[Union[ \
687 :class:`~google.cloud.bigquery.schema.SchemaField`, \
688 Mapping[str, Any] \
689 ]]):
690 Desired BigQuery schema. Number of columns must match number of
691 columns in the DataFrame.
692 filepath (str):
693 Path to write Parquet file to.
694 parquet_compression (Optional[str]):
            The compression codec to use by the ``pyarrow.parquet.write_table``
696 serializing method. Defaults to "SNAPPY".
697 https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
698 parquet_use_compliant_nested_type (bool):
699 Whether the ``pyarrow.parquet.write_table`` serializing method should write
700 compliant Parquet nested type (lists). Defaults to ``True``.
701 https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types
702 https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
703
704 This argument is ignored for ``pyarrow`` versions earlier than ``4.0.0``.
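
    Example (a minimal sketch; "out.parquet" is an arbitrary path):

        df = pandas.DataFrame({"x": [1, 2, 3]})
        dataframe_to_parquet(df, [schema.SchemaField("x", "INTEGER")], "out.parquet")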
705 """
706 pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import(raise_if_error=True)
707
708 import pyarrow.parquet # type: ignore
709
710 kwargs = (
711 {"use_compliant_nested_type": parquet_use_compliant_nested_type}
712 if _versions_helpers.PYARROW_VERSIONS.use_compliant_nested_type
713 else {}
714 )
715
716 bq_schema = schema._to_schema_fields(bq_schema)
717 arrow_table = dataframe_to_arrow(dataframe, bq_schema)
718 pyarrow.parquet.write_table(
719 arrow_table,
720 filepath,
721 compression=parquet_compression,
722 **kwargs,
723 )
724
725
726def _row_iterator_page_to_arrow(page, column_names, arrow_types):
727 # Iterate over the page to force the API request to get the page data.
728 try:
729 next(iter(page))
730 except StopIteration:
731 pass
732
733 arrays = []
734 for column_index, arrow_type in enumerate(arrow_types):
735 arrays.append(pyarrow.array(page._columns[column_index], type=arrow_type))
736
737 if isinstance(column_names, pyarrow.Schema):
738 return pyarrow.RecordBatch.from_arrays(arrays, schema=column_names)
739 return pyarrow.RecordBatch.from_arrays(arrays, names=column_names)
740
741
742def download_arrow_row_iterator(pages, bq_schema):
743 """Use HTTP JSON RowIterator to construct an iterable of RecordBatches.
744
745 Args:
746 pages (Iterator[:class:`google.api_core.page_iterator.Page`]):
747 An iterator over the result pages.
748 bq_schema (Sequence[Union[ \
749 :class:`~google.cloud.bigquery.schema.SchemaField`, \
750 Mapping[str, Any] \
751 ]]):
            A description of the fields in result pages.
753 Yields:
754 :class:`pyarrow.RecordBatch`
755 The next page of records as a ``pyarrow`` record batch.
756 """
757 bq_schema = schema._to_schema_fields(bq_schema)
758 column_names = bq_to_arrow_schema(bq_schema) or [field.name for field in bq_schema]
759 arrow_types = [bq_to_arrow_data_type(field) for field in bq_schema]
760
761 for page in pages:
762 yield _row_iterator_page_to_arrow(page, column_names, arrow_types)
763
764
765def _row_iterator_page_to_dataframe(page, column_names, dtypes):
766 # Iterate over the page to force the API request to get the page data.
767 try:
768 next(iter(page))
769 except StopIteration:
770 pass
771
772 columns = {}
773 for column_index, column_name in enumerate(column_names):
774 dtype = dtypes.get(column_name)
775 columns[column_name] = pandas.Series(page._columns[column_index], dtype=dtype)
776
777 return pandas.DataFrame(columns, columns=column_names)
778
779
780def download_dataframe_row_iterator(pages, bq_schema, dtypes):
781 """Use HTTP JSON RowIterator to construct a DataFrame.
782
783 Args:
784 pages (Iterator[:class:`google.api_core.page_iterator.Page`]):
785 An iterator over the result pages.
786 bq_schema (Sequence[Union[ \
787 :class:`~google.cloud.bigquery.schema.SchemaField`, \
788 Mapping[str, Any] \
789 ]]):
            A description of the fields in result pages.
791 dtypes(Mapping[str, numpy.dtype]):
792 The types of columns in result data to hint construction of the
793 resulting DataFrame. Not all column types have to be specified.
794 Yields:
795 :class:`pandas.DataFrame`
            The next page of records as a ``pandas.DataFrame``.
797 """
798 bq_schema = schema._to_schema_fields(bq_schema)
799 column_names = [field.name for field in bq_schema]
800 for page in pages:
801 yield _row_iterator_page_to_dataframe(page, column_names, dtypes)
802
803
804def _bqstorage_page_to_arrow(page):
805 return page.to_arrow()
806
807
808def _bqstorage_page_to_dataframe(column_names, dtypes, page):
809 # page.to_dataframe() does not preserve column order in some versions
810 # of google-cloud-bigquery-storage. Access by column name to rearrange.
811 return page.to_dataframe(dtypes=dtypes)[column_names]
812
813
814def _download_table_bqstorage_stream(
815 download_state, bqstorage_client, session, stream, worker_queue, page_to_item
816):
817 download_state.start()
818 try:
819 reader = bqstorage_client.read_rows(stream.name)
820
821 # Avoid deprecation warnings for passing in unnecessary read session.
822 # https://github.com/googleapis/python-bigquery-storage/issues/229
823 if _versions_helpers.BQ_STORAGE_VERSIONS.is_read_session_optional:
824 rowstream = reader.rows()
825 else:
826 rowstream = reader.rows(session)
827
828 for page in rowstream.pages:
829 item = page_to_item(page)
830
831 # Make sure we set a timeout on put() so that we give the worker
832 # thread opportunities to shutdown gracefully, for example if the
833 # parent thread shuts down or the parent generator object which
834 # collects rows from all workers goes out of scope. See:
835 # https://github.com/googleapis/python-bigquery/issues/2032
836 while True:
837 if download_state.done:
838 return
839 try:
840 worker_queue.put(item, timeout=_PROGRESS_INTERVAL)
841 break
842 except queue.Full:
843 continue
844 finally:
845 download_state.finish()
846
847
848def _nowait(futures):
849 """Separate finished and unfinished threads, much like
850 :func:`concurrent.futures.wait`, but don't wait.
851 """
852 done = []
853 not_done = []
854 for future in futures:
855 if future.done():
856 done.append(future)
857 else:
858 not_done.append(future)
859 return done, not_done
860
861
862def _download_table_bqstorage(
863 project_id: str,
864 table: Any,
865 bqstorage_client: Any,
866 preserve_order: bool = False,
867 selected_fields: Optional[List[Any]] = None,
868 page_to_item: Optional[Callable] = None,
869 max_queue_size: Any = _MAX_QUEUE_SIZE_DEFAULT,
870 max_stream_count: Optional[int] = None,
871 download_state: Optional[_DownloadState] = None,
872) -> Generator[Any, None, None]:
873 """Downloads a BigQuery table using the BigQuery Storage API.
874
875 This method uses the faster, but potentially more expensive, BigQuery
876 Storage API to download a table as a Pandas DataFrame. It supports
877 parallel downloads and optional data transformations.
878
879 Args:
880 project_id (str): The ID of the Google Cloud project containing
881 the table.
882 table (Any): The BigQuery table to download.
883 bqstorage_client (Any): An
884 authenticated BigQuery Storage API client.
885 preserve_order (bool, optional): Whether to preserve the order
            of the rows as they are read from BigQuery. If True, this limits
887 the number of streams to one and overrides `max_stream_count`.
888 Defaults to False.
889 selected_fields (Optional[List[SchemaField]]):
890 A list of BigQuery schema fields to select for download. If None,
891 all fields are downloaded. Defaults to None.
        page_to_item (Optional[Callable]): An optional callable that takes a
            page of data from the BigQuery Storage API and converts it into the
            item (for example, an Arrow record batch or a pandas DataFrame)
            yielded by this generator.
        max_queue_size (Any): The maximum number of items the worker queue may
            hold before workers block. Defaults to a sentinel that bounds the
            queue to the number of streams; ``None`` means unbounded.
894 max_stream_count (Optional[int]): The maximum number of
895 concurrent streams to use for downloading data. If `preserve_order`
896 is True, the requested streams are limited to 1 regardless of the
897 `max_stream_count` value. If 0 or None, then the number of
898 requested streams will be unbounded. Defaults to None.
899 download_state (Optional[_DownloadState]):
900 A threadsafe state object which can be used to observe the
901 behavior of the worker threads created by this method.
902
903 Yields:
        Any: The objects produced by ``page_to_item`` (for example, Arrow
            record batches or pandas DataFrames), one for each chunk of data
            downloaded from BigQuery.
906
907 Raises:
908 ValueError: If attempting to read from a specific partition or snapshot.
909
910 Note:
911 This method requires the `google-cloud-bigquery-storage` library
912 to be installed.
913 """
914
915 from google.cloud import bigquery_storage
916
917 if "$" in table.table_id:
918 raise ValueError(
919 "Reading from a specific partition is not currently supported."
920 )
921 if "@" in table.table_id:
922 raise ValueError("Reading from a specific snapshot is not currently supported.")
923
924 requested_streams = determine_requested_streams(preserve_order, max_stream_count)
925
926 requested_session = bigquery_storage.types.stream.ReadSession(
927 table=table.to_bqstorage(),
928 data_format=bigquery_storage.types.stream.DataFormat.ARROW,
929 )
930 if selected_fields is not None:
931 for field in selected_fields:
932 requested_session.read_options.selected_fields.append(field.name)
933
934 if _ARROW_COMPRESSION_SUPPORT:
935 requested_session.read_options.arrow_serialization_options.buffer_compression = (
936 # CompressionCodec(1) -> LZ4_FRAME
937 ArrowSerializationOptions.CompressionCodec(1)
938 )
939
940 session = bqstorage_client.create_read_session(
941 parent="projects/{}".format(project_id),
942 read_session=requested_session,
943 max_stream_count=requested_streams,
944 )
945
946 _LOGGER.debug(
947 "Started reading table '{}.{}.{}' with BQ Storage API session '{}'.".format(
948 table.project, table.dataset_id, table.table_id, session.name
949 )
950 )
951
952 # Avoid reading rows from an empty table.
953 if not session.streams:
954 return
955
956 total_streams = len(session.streams)
957
958 # Use _DownloadState to notify worker threads when to quit.
959 # See: https://stackoverflow.com/a/29237343/101923
960 if download_state is None:
961 download_state = _DownloadState()
962
963 # Create a queue to collect frames as they are created in each thread.
964 #
965 # The queue needs to be bounded by default, because if the user code processes the
966 # fetched result pages too slowly, while at the same time new pages are rapidly being
967 # fetched from the server, the queue can grow to the point where the process runs
968 # out of memory.
969 if max_queue_size is _MAX_QUEUE_SIZE_DEFAULT:
970 max_queue_size = total_streams
971 elif max_queue_size is None:
972 max_queue_size = 0 # unbounded
973
    worker_queue: queue.Queue[Any] = queue.Queue(maxsize=max_queue_size)
975
976 with concurrent.futures.ThreadPoolExecutor(max_workers=total_streams) as pool:
977 try:
978 # Manually submit jobs and wait for download to complete rather
979 # than using pool.map because pool.map continues running in the
980 # background even if there is an exception on the main thread.
981 # See: https://github.com/googleapis/google-cloud-python/pull/7698
982 not_done = [
983 pool.submit(
984 _download_table_bqstorage_stream,
985 download_state,
986 bqstorage_client,
987 session,
988 stream,
989 worker_queue,
990 page_to_item,
991 )
992 for stream in session.streams
993 ]
994
995 while not_done:
996 # Don't block on the worker threads. For performance reasons,
997 # we want to block on the queue's get method, instead. This
998 # prevents the queue from filling up, because the main thread
999 # has smaller gaps in time between calls to the queue's get
1000 # method. For a detailed explanation, see:
1001 # https://friendliness.dev/2019/06/18/python-nowait/
1002 done, not_done = _nowait(not_done)
1003 for future in done:
1004 # Call result() on any finished threads to raise any
1005 # exceptions encountered.
1006 future.result()
1007
1008 try:
1009 frame = worker_queue.get(timeout=_PROGRESS_INTERVAL)
1010 yield frame
1011 except queue.Empty: # pragma: NO COVER
1012 continue
1013
1014 # Return any remaining values after the workers finished.
1015 while True: # pragma: NO COVER
1016 try:
1017 frame = worker_queue.get_nowait()
1018 yield frame
1019 except queue.Empty: # pragma: NO COVER
1020 break
1021 finally:
1022 # No need for a lock because reading/replacing a variable is
1023 # defined to be an atomic operation in the Python language
1024 # definition (enforced by the global interpreter lock).
1025 download_state.done = True
1026
1027 # Shutdown all background threads, now that they should know to
1028 # exit early.
1029 pool.shutdown(wait=True)
1030
1031
1032def download_arrow_bqstorage(
1033 project_id,
1034 table,
1035 bqstorage_client,
1036 preserve_order=False,
1037 selected_fields=None,
1038 max_queue_size=_MAX_QUEUE_SIZE_DEFAULT,
1039 max_stream_count=None,
1040):
1041 return _download_table_bqstorage(
1042 project_id,
1043 table,
1044 bqstorage_client,
1045 preserve_order=preserve_order,
1046 selected_fields=selected_fields,
1047 page_to_item=_bqstorage_page_to_arrow,
1048 max_queue_size=max_queue_size,
1049 max_stream_count=max_stream_count,
1050 )
1051
1052
1053def download_dataframe_bqstorage(
1054 project_id,
1055 table,
1056 bqstorage_client,
1057 column_names,
1058 dtypes,
1059 preserve_order=False,
1060 selected_fields=None,
1061 max_queue_size=_MAX_QUEUE_SIZE_DEFAULT,
1062 max_stream_count=None,
1063):
1064 page_to_item = functools.partial(_bqstorage_page_to_dataframe, column_names, dtypes)
1065 return _download_table_bqstorage(
1066 project_id,
1067 table,
1068 bqstorage_client,
1069 preserve_order=preserve_order,
1070 selected_fields=selected_fields,
1071 page_to_item=page_to_item,
1072 max_queue_size=max_queue_size,
1073 max_stream_count=max_stream_count,
1074 )
1075
1076
1077def dataframe_to_json_generator(dataframe):
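    """Convert each DataFrame row into a JSON-serializable dict, omitting NaNs.

    Example (illustrative; column names are arbitrary):

        df = pandas.DataFrame({"name": ["a"], "age": [5]})
        list(dataframe_to_json_generator(df))  # -> [{"name": "a", "age": 5}]
    """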
1078 for row in dataframe.itertuples(index=False, name=None):
1079 output = {}
1080 for column, value in zip(dataframe.columns, row):
1081 # Omit NaN values.
1082 is_nan = pandas.isna(value)
1083
1084 # isna() can also return an array-like of bools, but the latter's boolean
1085 # value is ambiguous, hence an extra check. An array-like value is *not*
1086 # considered a NaN, however.
1087 if isinstance(is_nan, bool) and is_nan:
1088 continue
1089
1090 # Convert numpy types to corresponding Python types.
1091 # https://stackoverflow.com/a/60441783/101923
1092 if isinstance(value, numpy.bool_):
1093 value = bool(value)
1094 elif isinstance(
1095 value,
1096 (
1097 numpy.int64,
1098 numpy.int32,
1099 numpy.int16,
1100 numpy.int8,
1101 numpy.uint64,
1102 numpy.uint32,
1103 numpy.uint16,
1104 numpy.uint8,
1105 ),
1106 ):
1107 value = int(value)
1108 output[column] = value
1109
1110 yield output
1111
1112
1113def verify_pandas_imports():
1114 if pandas is None:
1115 raise ValueError(_NO_PANDAS_ERROR) from pandas_import_exception
1116 if db_dtypes is None:
1117 raise ValueError(_NO_DB_TYPES_ERROR) from db_dtypes_import_exception
1118
1119
1120def determine_requested_streams(
1121 preserve_order: bool,
1122 max_stream_count: Union[int, None],
1123) -> int:
1124 """Determines the value of requested_streams based on the values of
1125 `preserve_order` and `max_stream_count`.
1126
1127 Args:
1128 preserve_order (bool): Whether to preserve the order of streams. If True,
1129 this limits the number of streams to one. `preserve_order` takes
1130 precedence over `max_stream_count`.
        max_stream_count (Union[int, None]): The maximum number of streams
            allowed. Must be a non-negative number or None, where None indicates
            the value is unset. NOTE: if `preserve_order` is also set, it takes
            precedence over `max_stream_count`; to ensure `max_stream_count` is
            used, leave `preserve_order` as False.
1136
1137 Returns:
1138 (int) The appropriate value for requested_streams.
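
    Example (illustrative):

        determine_requested_streams(True, max_stream_count=10)     # -> 1
        determine_requested_streams(False, max_stream_count=10)    # -> 10
        determine_requested_streams(False, max_stream_count=None)  # -> 0 (unbounded)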
1139 """
1140
1141 if preserve_order:
1142 # If preserve order is set, it takes precedence.
        # Limit the requested streams to 1, to ensure that order
        # is preserved.
1145 return 1
1146
1147 elif max_stream_count is not None:
1148 # If preserve_order is not set, only then do we consider max_stream_count
1149 if max_stream_count <= -1:
1150 raise ValueError("max_stream_count must be non-negative OR None")
1151 return max_stream_count
1152
1153 # Default to zero requested streams (unbounded).
1154 return 0