1# Copyright 2017 Google Inc.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15"""Shared utilities used by both downloads and uploads."""
16
17from __future__ import absolute_import
18
19import base64
20import hashlib
21import logging
22import random
23import warnings
24
25from urllib.parse import parse_qs
26from urllib.parse import urlencode
27from urllib.parse import urlsplit
28from urllib.parse import urlunsplit
29
30from google.resumable_media import common
31
32
# Lower-case HTTP header names exposed to the rest of the package.
RANGE_HEADER = "range"
CONTENT_RANGE_HEADER = "content-range"
CONTENT_ENCODING_HEADER = "content-encoding"

# Text of the ``RuntimeWarning`` issued by ``_is_fast_crcmod`` when crcmod
# reports that its extension is not in use.
_SLOW_CRC32C_WARNING = (
    "Currently using crcmod in pure python form.  This is a slow "
    "implementation.  Python 3 has a faster implementation, `google-crc32c`, "
    "which will be used if it is installed."
)
# Header names read by the private helpers in this module.
_GENERATION_HEADER = "x-goog-generation"
_HASH_HEADER = "x-goog-hash"
_STORED_CONTENT_ENCODING_HEADER = "x-goog-stored-content-encoding"

# Log message template used by ``_get_expected_checksum``. It is formatted with
# the media URL as the single positional argument (``{}``) and the upper-cased
# checksum name as the ``checksum_type`` keyword argument.
_MISSING_CHECKSUM = """\
No {checksum_type} checksum was returned from the service while downloading {}
(which happens for composite objects), so client-side content integrity
checking is not being performed."""
# Module-level logger, named after this module.
_LOGGER = logging.getLogger(__name__)
51
52
def do_nothing():
    """No-op used as the default ``callback``; takes nothing, returns ``None``."""
    return None
55
56
def header_required(response, name, get_headers, callback=do_nothing):
    """Return the value of a header that must be present on a response.

    Args:
        response (object): An HTTP response object; it is only passed to
            ``get_headers`` and to the exception raised on failure.
        name (str): The name of the required header.
        get_headers (Callable[Any, Mapping[str, str]]): Helper that extracts
            the headers mapping from ``response``.
        callback (Optional[Callable]): A zero-argument callable invoked just
            before the exception is raised.

    Returns:
        str: The value stored under ``name`` in the response headers.

    Raises:
        ~google.resumable_media.common.InvalidResponse: If ``name`` is not
            among the response headers.
    """
    response_headers = get_headers(response)
    if name in response_headers:
        return response_headers[name]

    # The header is absent: give the caller a chance to react, then fail.
    callback()
    raise common.InvalidResponse(
        response, "Response headers must contain header", name
    )
84
85
def require_status_code(response, status_codes, get_status_code, callback=do_nothing):
    """Check that a response's status code is one of the accepted values.

    Args:
        response (object): The HTTP response object.
        status_codes (tuple): The acceptable status codes.
        get_status_code (Callable[Any, int]): Helper that extracts the status
            code from ``response``.
        callback (Optional[Callable]): A zero-argument callable. It is invoked
            before the exception is raised, but only when the status code is
            not a member of ``common.RETRYABLE``.

    Returns:
        int: The status code, when it is one of ``status_codes``.

    Raises:
        ~google.resumable_media.common.InvalidResponse: If the status code
            is not one of the values in ``status_codes``.
    """
    actual_code = get_status_code(response)
    if actual_code in status_codes:
        return actual_code

    # Codes listed in ``common.RETRYABLE`` skip the callback but still raise.
    if actual_code not in common.RETRYABLE:
        callback()

    raise common.InvalidResponse(
        response,
        "Request failed with status code",
        actual_code,
        "Expected one of",
        *status_codes
    )
116
117
def calculate_retry_wait(base_wait, max_sleep, multiplier=2.0):
    """Compute the next retry delay using capped exponential backoff.

    The base wait is scaled by ``multiplier`` and clamped to ``max_sleep``.
    A random jitter of 0 to 1000 milliseconds (inclusive) is then added so
    that retries from different clients are spread out.

    Args:
        base_wait (float): The current "base" wait time, without jitter.
        max_sleep (float): Upper bound for the new base wait time.
        multiplier (float): Factor applied to ``base_wait``.

    Returns:
        Tuple[float, float]: The new base wait time, and the wait time to
        actually apply (the new base wait plus the jitter, in seconds).
    """
    capped_wait = min(multiplier * base_wait, max_sleep)
    jitter_seconds = 0.001 * random.randint(0, 1000)
    return capped_wait, capped_wait + jitter_seconds
144
145
def _get_crc32c_object():
    """Build a CRC32C checksum object from whichever backend is importable.

    ``google_crc32c`` is tried first. If that raises ``ImportError``, ``crcmod``
    is used instead and ``_is_fast_crcmod`` is called, which issues a warning
    when crcmod reports that it is not using its extension.

    Returns:
        object: Either ``google_crc32c.Checksum()`` or
        ``crcmod.predefined.Crc("crc-32c")``.

    Raises:
        ImportError: If neither backend could be imported.
    """
    try:
        import google_crc32c  # type: ignore

        return google_crc32c.Checksum()
    except ImportError:
        try:
            import crcmod  # type: ignore

            fallback_checksum = crcmod.predefined.Crc("crc-32c")
            # Called only for its side effect (the slow-implementation warning).
            _is_fast_crcmod()
            return fallback_checksum
        except ImportError:
            raise ImportError("Failed to import either `google-crc32c` or `crcmod`")
166
167
def _is_fast_crcmod():
    """Report whether crcmod says it is using its extension.

    Reads ``_usingExtension`` from the ``crcmod.crcmod`` module (treated as
    ``False`` when the attribute is missing). When the value is falsy, a
    ``RuntimeWarning`` carrying ``_SLOW_CRC32C_WARNING`` is issued.

    Returns:
        The value of ``crcmod.crcmod._usingExtension``, or ``False`` if absent.
    """
    # A non-empty ``fromlist`` makes ``__import__`` hand back the
    # ``crcmod.crcmod`` submodule instead of the top-level package.
    crcmod_impl = __import__(
        "crcmod.crcmod", globals(), locals(), ["_usingExtension"], 0
    )
    using_extension = getattr(crcmod_impl, "_usingExtension", False)
    if using_extension:
        return using_extension

    # stacklevel=2 attributes the warning to the caller of this helper.
    warnings.warn(_SLOW_CRC32C_WARNING, RuntimeWarning, stacklevel=2)
    return using_extension
181
182
def _get_metadata_key(checksum_type):
    """Return the metadata key for a checksum type.

    ``"md5"`` is translated to ``"md5Hash"``; every other value is returned
    unchanged.
    """
    return "md5Hash" if checksum_type == "md5" else checksum_type
188
189
def prepare_checksum_digest(digest_bytestring):
    """Base64-encode a checksum digest so it can be placed in an HTTP header.

    Args:
        digest_bytestring (bytes): A checksum digest bytestring.

    Returns:
        str: The base64 representation of ``digest_bytestring``.
    """
    # ``b64encode`` yields ``bytes``; headers want ``str``, hence the decode.
    return base64.b64encode(digest_bytestring).decode("utf-8")
202
203
def _get_expected_checksum(response, get_headers, media_url, checksum_type):
    """Determine the checksum a download should match and how to compute it.

    Args:
        response (~requests.Response): The HTTP response object.
        get_headers (callable: response->dict): returns response headers.
        media_url (str): The URL containing the media to be downloaded; only
            used in the log message emitted when no checksum is found.
        checksum_type Optional(str): The checksum type to read from the headers,
            exactly as it will appear in the headers (case-sensitive). Must be
            "md5", "crc32c" or None.

    Returns:
        Tuple (Optional[str], object): The expected checksum read from the
        ``X-Goog-Hash`` header (``None`` when not requested or not present)
        and a checksum object. The object is a ``_DoNothingHash`` whenever
        the expected checksum is ``None``.

    Raises:
        ValueError: If ``checksum_type`` is not "md5", "crc32c" or None.
    """
    if checksum_type not in ("md5", "crc32c", None):
        raise ValueError("checksum must be ``'md5'``, ``'crc32c'`` or ``None``")

    # No checksum requested: nothing to look up, nothing to compute.
    if checksum_type not in ("md5", "crc32c"):
        return (None, _DoNothingHash())

    hash_header = get_headers(response).get(_HASH_HEADER)
    expected_checksum = _parse_checksum_header(
        hash_header, response, checksum_label=checksum_type
    )

    if expected_checksum is not None:
        if checksum_type == "md5":
            return (expected_checksum, hashlib.md5())
        return (expected_checksum, _get_crc32c_object())

    # The service did not send the requested checksum; log it and skip
    # client-side verification.
    _LOGGER.info(
        _MISSING_CHECKSUM.format(media_url, checksum_type=checksum_type.upper())
    )
    return (expected_checksum, _DoNothingHash())
244
245
def _get_uploaded_checksum_from_headers(response, get_headers, checksum_type):
    """Read the requested checksum from a response's ``X-Goog-Hash`` header.

    Args:
        response (~requests.Response): The HTTP response object.
        get_headers (callable: response->dict): returns response headers.
        checksum_type Optional(str): The checksum type to read from the headers,
            exactly as it will appear in the headers (case-sensitive). Must be
            "md5", "crc32c" or None.

    Returns:
        Optional[str]: The checksum found in the ``X-Goog-Hash`` header, or
        ``None`` when ``checksum_type`` is ``None`` or the header carries no
        value for that type.

    Raises:
        ValueError: If ``checksum_type`` is not "md5", "crc32c" or None.
    """
    if checksum_type not in ("md5", "crc32c", None):
        raise ValueError("checksum must be ``'md5'``, ``'crc32c'`` or ``None``")

    # ``None`` means the caller did not ask for any checksum.
    if checksum_type not in ("md5", "crc32c"):
        return None

    hash_header = get_headers(response).get(_HASH_HEADER)
    return _parse_checksum_header(
        hash_header, response, checksum_label=checksum_type
    )
272
273
def _parse_checksum_header(header_value, response, checksum_label):
    """Parses the checksum header from an ``X-Goog-Hash`` value.

    .. _header reference: https://cloud.google.com/storage/docs/xml-api/reference-headers#xgooghash

    Expects ``header_value`` (if not :data:`None`) to be in one of the three
    following formats:

    * ``crc32c=n03x6A==``
    * ``md5=Ojk9c3dhfxgoKVVHYwFbHQ==``
    * ``crc32c=n03x6A==,md5=Ojk9c3dhfxgoKVVHYwFbHQ==``

    See the `header reference`_ for more information.

    Segments that contain no ``=`` (an empty header, a trailing comma, or
    other malformed input) are ignored rather than raising.

    Args:
        header_value (Optional[str]): The ``X-Goog-Hash`` header from
            a download response.
        response (~requests.Response): The HTTP response object; only used
            when building the exception.
        checksum_label (str): The label of the header value to read, as in the
            examples above. Typically "md5" or "crc32c"

    Returns:
        Optional[str]: The expected checksum of the response, if it
        can be detected from the ``X-Goog-Hash`` header; otherwise, None.

    Raises:
        ~google.resumable_media.common.InvalidResponse: If there are
            multiple checksums of the requested type in ``header_value``.
    """
    if header_value is None:
        return None

    matches = []
    for checksum in header_value.split(","):
        # ``partition`` never fails, whereas unpacking ``split("=", 1)`` raises
        # ValueError for a segment without "=" (e.g. "" or a trailing comma).
        name, separator, value = checksum.partition("=")
        if not separator:
            continue
        # The documented separator is ",", but ", " has been seen in real
        # responses, so leading whitespace on the name is dropped.
        if name.lstrip() == checksum_label:
            matches.append(value)

    if not matches:
        return None
    if len(matches) == 1:
        return matches[0]

    raise common.InvalidResponse(
        response,
        "X-Goog-Hash header had multiple ``{}`` values.".format(checksum_label),
        header_value,
        matches,
    )
325
326
def _get_checksum_object(checksum_type):
    """Create a fresh checksum object for the requested type.

    Args:
        checksum_type (Optional[str]): "md5", "crc32c" or None.

    Returns:
        A new ``hashlib.md5()`` object for "md5", the result of
        ``_get_crc32c_object()`` for "crc32c", or ``None`` when
        ``checksum_type`` is ``None``.

    Raises:
        ValueError: If ``checksum_type`` is not one of the supported values.
    """
    if checksum_type is None:
        return None
    if checksum_type == "md5":
        return hashlib.md5()
    if checksum_type == "crc32c":
        return _get_crc32c_object()

    raise ValueError("checksum must be ``'md5'``, ``'crc32c'`` or ``None``")
340
341
def _parse_generation_header(response, get_headers):
    """Read the object generation from the ``X-Goog-Generation`` header.

    Args:
        response (~requests.Response): The HTTP response object.
        get_headers (callable: response->dict): returns response headers.

    Returns:
        Optional[int]: The header value converted with ``int()``, or ``None``
        when the header is not present.
    """
    raw_generation = get_headers(response).get(_GENERATION_HEADER, None)
    return None if raw_generation is None else int(raw_generation)
360
361
def _get_generation_from_url(media_url):
    """Extract the ``generation`` query parameter from a media URL.

    Args:
        media_url (str): The URL containing the media to be downloaded.

    Returns:
        Optional[int]: The first ``generation`` value in the query string,
        converted with ``int()``; ``None`` when the parameter is absent.
    """
    query_string = urlsplit(media_url).query
    generation_values = parse_qs(query_string).get("generation", None)
    if generation_values is None:
        return None

    # ``parse_qs`` maps each name to a list of values; the first one is used.
    return int(generation_values[0])
380
381
def add_query_parameters(media_url, query_params):
    """Add query parameters to a base url.

    Parameters already present in ``media_url`` are kept, including those
    with blank values (e.g. ``?marker=``). A name that appears in both
    places takes its value from ``query_params``.

    Args:
        media_url (str): The URL containing the media to be downloaded.
        query_params (Optional[dict]): Names and values of the query
            parameters to add. ``None`` or an empty mapping leaves the URL
            untouched.

    Returns:
        str: URL with additional query strings appended.
    """
    if not query_params:
        return media_url

    scheme, netloc, path, query, frag = urlsplit(media_url)
    # Without ``keep_blank_values=True``, ``parse_qs`` drops parameters whose
    # value is empty, silently removing them from the rebuilt URL.
    merged_params = parse_qs(query, keep_blank_values=True)
    merged_params.update(query_params)
    # ``doseq=True`` expands the value lists produced by ``parse_qs``.
    new_query = urlencode(merged_params, doseq=True)
    return urlunsplit((scheme, netloc, path, new_query, frag))
401
402
def _is_decompressive_transcoding(response, get_headers):
    """Tell whether the object was served decompressed.

    That is the case when the "x-goog-stored-content-encoding" header is "gzip"
    while the "content-encoding" header is not "gzip". See more at:
    https://cloud.google.com/storage/docs/transcoding#transcoding_and_gzip

    Args:
        response (~requests.Response): The HTTP response object.
        get_headers (callable: response->dict): returns response headers.

    Returns:
        bool: True if decompressive transcoding has occurred; otherwise, False.
    """
    response_headers = get_headers(response)
    if response_headers.get(_STORED_CONTENT_ENCODING_HEADER) != "gzip":
        # Not stored gzip-compressed, so there was nothing to decompress.
        return False

    return response_headers.get(CONTENT_ENCODING_HEADER) != "gzip"
418
419
class _DoNothingHash(object):
    """Placeholder hash object whose ``update`` discards its input.

    Stands in for ``hashlib.md5`` or a crc32c checksum implementation in
    cases where it isn't necessary to compute the hash.
    """

    def update(self, unused_chunk):
        """Accept a chunk and ignore it.

        Mirrors the ``update`` method of ``hashlib.md5`` and other checksums.

        Args:
            unused_chunk (bytes): A chunk of data; it is not inspected.
        """
        return None