1# Copyright 2017 Google Inc. 
    2# 
    3# Licensed under the Apache License, Version 2.0 (the "License"); 
    4# you may not use this file except in compliance with the License. 
    5# You may obtain a copy of the License at 
    6# 
    7#     http://www.apache.org/licenses/LICENSE-2.0 
    8# 
    9# Unless required by applicable law or agreed to in writing, software 
    10# distributed under the License is distributed on an "AS IS" BASIS, 
    11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
    12# See the License for the specific language governing permissions and 
    13# limitations under the License. 
    14 
    15"""Shared utilities used by both downloads and uploads.""" 
    16 
    17from __future__ import absolute_import 
    18 
    19import base64 
    20import hashlib 
    21import logging 
    22 
    23from urllib.parse import parse_qs 
    24from urllib.parse import urlencode 
    25from urllib.parse import urlsplit 
    26from urllib.parse import urlunsplit 
    27 
    28from google.cloud.storage import retry 
    29from google.cloud.storage.exceptions import InvalidResponse 
    30 
    31 
# Standard HTTP header names, spelled in lowercase.
RANGE_HEADER = "range"
CONTENT_RANGE_HEADER = "content-range"
CONTENT_ENCODING_HEADER = "content-encoding"

# Warning text about falling back to the slow pure-Python crcmod
# implementation.
# NOTE(review): no use of this constant is visible in this section of the
# file -- confirm it is still referenced elsewhere.
_SLOW_CRC32C_WARNING = (
    "Currently using crcmod in pure python form. This is a slow "
    "implementation. Python 3 has a faster implementation, `google-crc32c`, "
    "which will be used if it is installed."
)
# Service-specific (``x-goog-*``) header names, spelled in lowercase.
_GENERATION_HEADER = "x-goog-generation"
_HASH_HEADER = "x-goog-hash"
_STORED_CONTENT_ENCODING_HEADER = "x-goog-stored-content-encoding"
# Log message template used by ``_get_expected_checksum``: the positional
# ``{}`` field receives the media URL and ``{checksum_type}`` receives the
# upper-cased checksum name.
_MISSING_CHECKSUM = """\
No {checksum_type} checksum was returned from the service while downloading {}
(which happens for composite objects), so client-side content integrity
checking is not being performed."""
# Module-level logger, named after this module.
_LOGGER = logging.getLogger(__name__)
    50 
    51 
    52def do_nothing(): 
    53    """Simple default callback.""" 
    54 
    55 
    56def header_required(response, name, get_headers, callback=do_nothing): 
    57    """Checks that a specific header is in a headers dictionary. 
    58 
    59    Args: 
    60        response (object): An HTTP response object, expected to have a 
    61            ``headers`` attribute that is a ``Mapping[str, str]``. 
    62        name (str): The name of a required header. 
    63        get_headers (Callable[Any, Mapping[str, str]]): Helper to get headers 
    64            from an HTTP response. 
    65        callback (Optional[Callable]): A callback that takes no arguments, 
    66            to be executed when an exception is being raised. 
    67 
    68    Returns: 
    69        str: The desired header. 
    70 
    71    Raises: 
    72        ~google.cloud.storage.exceptions.InvalidResponse: If the header 
    73            is missing. 
    74    """ 
    75    headers = get_headers(response) 
    76    if name not in headers: 
    77        callback() 
    78        raise InvalidResponse(response, "Response headers must contain header", name) 
    79 
    80    return headers[name] 
    81 
    82 
    83def require_status_code(response, status_codes, get_status_code, callback=do_nothing): 
    84    """Require a response has a status code among a list. 
    85 
    86    Args: 
    87        response (object): The HTTP response object. 
    88        status_codes (tuple): The acceptable status codes. 
    89        get_status_code (Callable[Any, int]): Helper to get a status code 
    90            from a response. 
    91        callback (Optional[Callable]): A callback that takes no arguments, 
    92            to be executed when an exception is being raised. 
    93 
    94    Returns: 
    95        int: The status code. 
    96 
    97    Raises: 
    98        ~google.cloud.storage.exceptions.InvalidResponse: If the status code 
    99            is not one of the values in ``status_codes``. 
    100    """ 
    101    status_code = get_status_code(response) 
    102    if status_code not in status_codes: 
    103        if status_code not in retry._RETRYABLE_STATUS_CODES: 
    104            callback() 
    105        raise InvalidResponse( 
    106            response, 
    107            "Request failed with status code", 
    108            status_code, 
    109            "Expected one of", 
    110            *status_codes 
    111        ) 
    112    return status_code 
    113 
    114 
    115def _get_metadata_key(checksum_type): 
    116    if checksum_type == "md5": 
    117        return "md5Hash" 
    118    else: 
    119        return checksum_type 
    120 
    121 
    122def prepare_checksum_digest(digest_bytestring): 
    123    """Convert a checksum object into a digest encoded for an HTTP header. 
    124 
    125    Args: 
    126        bytes: A checksum digest bytestring. 
    127 
    128    Returns: 
    129        str: A base64 string representation of the input. 
    130    """ 
    131    encoded_digest = base64.b64encode(digest_bytestring) 
    132    # NOTE: ``b64encode`` returns ``bytes``, but HTTP headers expect ``str``. 
    133    return encoded_digest.decode("utf-8") 
    134 
    135 
    136def _get_expected_checksum(response, get_headers, media_url, checksum_type): 
    137    """Get the expected checksum and checksum object for the download response. 
    138 
    139    Args: 
    140        response (~requests.Response): The HTTP response object. 
    141        get_headers (callable: response->dict): returns response headers. 
    142        media_url (str): The URL containing the media to be downloaded. 
    143        checksum_type Optional(str): The checksum type to read from the headers, 
    144            exactly as it will appear in the headers (case-sensitive). Must be 
    145            "md5", "crc32c" or None. 
    146 
    147    Returns: 
    148        Tuple (Optional[str], object): The expected checksum of the response, 
    149        if it can be detected from the ``X-Goog-Hash`` header, and the 
    150        appropriate checksum object for the expected checksum. 
    151    """ 
    152    if checksum_type not in ["md5", "crc32c", None]: 
    153        raise ValueError("checksum must be ``'md5'``, ``'crc32c'`` or ``None``") 
    154    elif checksum_type in ["md5", "crc32c"]: 
    155        headers = get_headers(response) 
    156        expected_checksum = _parse_checksum_header( 
    157            headers.get(_HASH_HEADER), response, checksum_label=checksum_type 
    158        ) 
    159 
    160        if expected_checksum is None: 
    161            msg = _MISSING_CHECKSUM.format( 
    162                media_url, checksum_type=checksum_type.upper() 
    163            ) 
    164            _LOGGER.info(msg) 
    165            checksum_object = _DoNothingHash() 
    166        else: 
    167            checksum_object = _get_checksum_object(checksum_type) 
    168    else: 
    169        expected_checksum = None 
    170        checksum_object = _DoNothingHash() 
    171 
    172    return (expected_checksum, checksum_object) 
    173 
    174 
    175def _get_uploaded_checksum_from_headers(response, get_headers, checksum_type): 
    176    """Get the computed checksum and checksum object from the response headers. 
    177 
    178    Args: 
    179        response (~requests.Response): The HTTP response object. 
    180        get_headers (callable: response->dict): returns response headers. 
    181        checksum_type Optional(str): The checksum type to read from the headers, 
    182            exactly as it will appear in the headers (case-sensitive). Must be 
    183            "md5", "crc32c" or None. 
    184 
    185    Returns: 
    186        Tuple (Optional[str], object): The checksum of the response, 
    187        if it can be detected from the ``X-Goog-Hash`` header, and the 
    188        appropriate checksum object for the expected checksum. 
    189    """ 
    190    if checksum_type not in ["md5", "crc32c", None]: 
    191        raise ValueError("checksum must be ``'md5'``, ``'crc32c'`` or ``None``") 
    192    elif checksum_type in ["md5", "crc32c"]: 
    193        headers = get_headers(response) 
    194        remote_checksum = _parse_checksum_header( 
    195            headers.get(_HASH_HEADER), response, checksum_label=checksum_type 
    196        ) 
    197    else: 
    198        remote_checksum = None 
    199 
    200    return remote_checksum 
    201 
    202 
    203def _parse_checksum_header(header_value, response, checksum_label): 
    204    """Parses the checksum header from an ``X-Goog-Hash`` value. 
    205 
    206    .. _header reference: https://cloud.google.com/storage/docs/\ 
    207                          xml-api/reference-headers#xgooghash 
    208 
    209    Expects ``header_value`` (if not :data:`None`) to be in one of the three 
    210    following formats: 
    211 
    212    * ``crc32c=n03x6A==`` 
    213    * ``md5=Ojk9c3dhfxgoKVVHYwFbHQ==`` 
    214    * ``crc32c=n03x6A==,md5=Ojk9c3dhfxgoKVVHYwFbHQ==`` 
    215 
    216    See the `header reference`_ for more information. 
    217 
    218    Args: 
    219        header_value (Optional[str]): The ``X-Goog-Hash`` header from 
    220            a download response. 
    221        response (~requests.Response): The HTTP response object. 
    222        checksum_label (str): The label of the header value to read, as in the 
    223            examples above. Typically "md5" or "crc32c" 
    224 
    225    Returns: 
    226        Optional[str]: The expected checksum of the response, if it 
    227        can be detected from the ``X-Goog-Hash`` header; otherwise, None. 
    228 
    229    Raises: 
    230        ~google.cloud.storage.exceptions.InvalidResponse: If there are 
    231            multiple checksums of the requested type in ``header_value``. 
    232    """ 
    233    if header_value is None: 
    234        return None 
    235 
    236    matches = [] 
    237    for checksum in header_value.split(","): 
    238        name, value = checksum.split("=", 1) 
    239        # Official docs say "," is the separator, but real-world responses have encountered ", " 
    240        if name.lstrip() == checksum_label: 
    241            matches.append(value) 
    242 
    243    if len(matches) == 0: 
    244        return None 
    245    elif len(matches) == 1: 
    246        return matches[0] 
    247    else: 
    248        raise InvalidResponse( 
    249            response, 
    250            "X-Goog-Hash header had multiple ``{}`` values.".format(checksum_label), 
    251            header_value, 
    252            matches, 
    253        ) 
    254 
    255 
    256def _get_checksum_object(checksum_type): 
    257    """Respond with a checksum object for a supported type, if not None. 
    258 
    259    Raises ValueError if checksum_type is unsupported. 
    260    """ 
    261    if checksum_type == "md5": 
    262        return hashlib.md5() 
    263    elif checksum_type == "crc32c": 
    264        # In order to support platforms that don't have google_crc32c 
    265        # support, only perform the import on demand. 
    266        import google_crc32c 
    267 
    268        return google_crc32c.Checksum() 
    269    elif checksum_type is None: 
    270        return None 
    271    else: 
    272        raise ValueError("checksum must be ``'md5'``, ``'crc32c'`` or ``None``") 
    273 
    274 
    275def _is_crc32c_available_and_fast(): 
    276    """Return True if the google_crc32c C extension is installed. 
    277 
    278    Return False if either the package is not installed, or if only the 
    279    pure-Python version is installed. 
    280    """ 
    281    try: 
    282        import google_crc32c 
    283 
    284        if google_crc32c.implementation == "c": 
    285            return True 
    286    except Exception: 
    287        pass 
    288    return False 
    289 
    290 
    291def _parse_generation_header(response, get_headers): 
    292    """Parses the generation header from an ``X-Goog-Generation`` value. 
    293 
    294    Args: 
    295        response (~requests.Response): The HTTP response object. 
    296        get_headers (callable: response->dict): returns response headers. 
    297 
    298    Returns: 
    299        Optional[long]: The object generation from the response, if it 
    300        can be detected from the ``X-Goog-Generation`` header; otherwise, None. 
    301    """ 
    302    headers = get_headers(response) 
    303    object_generation = headers.get(_GENERATION_HEADER, None) 
    304 
    305    if object_generation is None: 
    306        return None 
    307    else: 
    308        return int(object_generation) 
    309 
    310 
    311def _get_generation_from_url(media_url): 
    312    """Retrieve the object generation query param specified in the media url. 
    313 
    314    Args: 
    315        media_url (str): The URL containing the media to be downloaded. 
    316 
    317    Returns: 
    318        long: The object generation from the media url if exists; otherwise, None. 
    319    """ 
    320 
    321    _, _, _, query, _ = urlsplit(media_url) 
    322    query_params = parse_qs(query) 
    323    object_generation = query_params.get("generation", None) 
    324 
    325    if object_generation is None: 
    326        return None 
    327    else: 
    328        return int(object_generation[0]) 
    329 
    330 
    331def add_query_parameters(media_url, query_params): 
    332    """Add query parameters to a base url. 
    333 
    334    Args: 
    335        media_url (str): The URL containing the media to be downloaded. 
    336        query_params (dict): Names and values of the query parameters to add. 
    337 
    338    Returns: 
    339        str: URL with additional query strings appended. 
    340    """ 
    341 
    342    if len(query_params) == 0: 
    343        return media_url 
    344 
    345    scheme, netloc, path, query, frag = urlsplit(media_url) 
    346    params = parse_qs(query) 
    347    new_params = {**params, **query_params} 
    348    query = urlencode(new_params, doseq=True) 
    349    return urlunsplit((scheme, netloc, path, query, frag)) 
    350 
    351 
    352def _is_decompressive_transcoding(response, get_headers): 
    353    """Returns True if the object was served decompressed. This happens when the 
    354    "x-goog-stored-content-encoding" header is "gzip" and "content-encoding" header 
    355    is not "gzip". See more at: https://cloud.google.com/storage/docs/transcoding#transcoding_and_gzip 
    356    Args: 
    357        response (~requests.Response): The HTTP response object. 
    358        get_headers (callable: response->dict): returns response headers. 
    359    Returns: 
    360        bool: Returns True if decompressive transcoding has occurred; otherwise, False. 
    361    """ 
    362    headers = get_headers(response) 
    363    return ( 
    364        headers.get(_STORED_CONTENT_ENCODING_HEADER) == "gzip" 
    365        and headers.get(CONTENT_ENCODING_HEADER) != "gzip" 
    366    ) 
    367 
    368 
    369class _DoNothingHash(object): 
    370    """Do-nothing hash object. 
    371 
    372    Intended as a stand-in for ``hashlib.md5`` or a crc32c checksum 
    373    implementation in cases where it isn't necessary to compute the hash. 
    374    """ 
    375 
    376    def update(self, unused_chunk): 
    377        """Do-nothing ``update`` method. 
    378 
    379        Intended to match the interface of ``hashlib.md5`` and other checksums. 
    380 
    381        Args: 
    382            unused_chunk (bytes): A chunk of data. 
    383        """