1# Copyright 2017 Google Inc.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15"""Shared utilities used by both downloads and uploads."""
16
17from __future__ import absolute_import
18
19import base64
20import hashlib
21import logging
22
23from urllib.parse import parse_qs
24from urllib.parse import urlencode
25from urllib.parse import urlsplit
26from urllib.parse import urlunsplit
27
28from google.cloud.storage import retry
29from google.cloud.storage.exceptions import InvalidResponse
30
31
# Lower-case names of standard HTTP headers.
RANGE_HEADER = "range"
CONTENT_RANGE_HEADER = "content-range"
CONTENT_ENCODING_HEADER = "content-encoding"

# Warning text about the pure-Python crcmod implementation being slow.
# NOTE(review): not referenced by any function visible in this part of the
# module -- presumably consumed elsewhere; confirm before removing.
_SLOW_CRC32C_WARNING = (
    "Currently using crcmod in pure python form. This is a slow "
    "implementation. Python 3 has a faster implementation, `google-crc32c`, "
    "which will be used if it is installed."
)
# Lower-case names of the ``x-goog-*`` response headers read by the helpers
# in this module (generation, checksums, stored content encoding).
_GENERATION_HEADER = "x-goog-generation"
_HASH_HEADER = "x-goog-hash"
_STORED_CONTENT_ENCODING_HEADER = "x-goog-stored-content-encoding"

# Log message template used by ``_get_expected_checksum``. The positional
# ``{}`` field receives the media URL and ``{checksum_type}`` receives the
# upper-cased checksum type.
_MISSING_CHECKSUM = """\
No {checksum_type} checksum was returned from the service while downloading {}
(which happens for composite objects), so client-side content integrity
checking is not being performed."""
# Module-level logger, named after this module.
_LOGGER = logging.getLogger(__name__)
50
51
def do_nothing():
    """No-op callable.

    Serves as the default ``callback`` argument of :func:`header_required`
    and :func:`require_status_code`, so those helpers can always invoke a
    callback without checking whether one was supplied.
    """
    return None
54
55
def header_required(response, name, get_headers, callback=do_nothing):
    """Return the value of a header that must be present on a response.

    Args:
        response (object): An HTTP response object; it is only ever passed
            to ``get_headers`` and to the raised exception.
        name (str): The name of the required header.
        get_headers (Callable[Any, Mapping[str, str]]): Helper that extracts
            the headers mapping from ``response``.
        callback (Optional[Callable]): A zero-argument callable invoked just
            before the exception is raised. Defaults to a no-op.

    Returns:
        str: The value stored under ``name`` in the response headers.

    Raises:
        ~google.cloud.storage.exceptions.InvalidResponse: If ``name`` is not
            among the response headers.
    """
    response_headers = get_headers(response)
    if name in response_headers:
        return response_headers[name]

    # Missing header: give the caller a chance to react before failing.
    callback()
    raise InvalidResponse(response, "Response headers must contain header", name)
81
82
def require_status_code(response, status_codes, get_status_code, callback=do_nothing):
    """Ensure the status code of a response is one of the accepted values.

    Args:
        response (object): The HTTP response object.
        status_codes (tuple): The acceptable status codes.
        get_status_code (Callable[Any, int]): Helper that extracts the status
            code from ``response``.
        callback (Optional[Callable]): A zero-argument callable invoked
            before the exception is raised, but only when the status code is
            not listed in ``retry._RETRYABLE_STATUS_CODES``.

    Returns:
        int: The status code, when it is one of ``status_codes``.

    Raises:
        ~google.cloud.storage.exceptions.InvalidResponse: If the status code
            is not one of the values in ``status_codes``.
    """
    actual_code = get_status_code(response)
    if actual_code in status_codes:
        return actual_code

    # The callback is skipped for retryable status codes; the exception is
    # raised either way.
    # NOTE(review): presumably because a retry may still succeed -- confirm.
    is_retryable = actual_code in retry._RETRYABLE_STATUS_CODES
    if not is_retryable:
        callback()

    raise InvalidResponse(
        response,
        "Request failed with status code",
        actual_code,
        "Expected one of",
        *status_codes
    )
113
114
115def _get_metadata_key(checksum_type):
116 if checksum_type == "md5":
117 return "md5Hash"
118 else:
119 return checksum_type
120
121
def prepare_checksum_digest(digest_bytestring):
    """Encode a raw checksum digest as base64 text for use in an HTTP header.

    Args:
        digest_bytestring (bytes): A checksum digest bytestring.

    Returns:
        str: The base64 representation of ``digest_bytestring``.
    """
    # ``b64encode`` yields ``bytes``; HTTP headers expect ``str``, so decode.
    return base64.b64encode(digest_bytestring).decode("utf-8")
134
135
136def _get_expected_checksum(response, get_headers, media_url, checksum_type):
137 """Get the expected checksum and checksum object for the download response.
138
139 Args:
140 response (~requests.Response): The HTTP response object.
141 get_headers (callable: response->dict): returns response headers.
142 media_url (str): The URL containing the media to be downloaded.
143 checksum_type Optional(str): The checksum type to read from the headers,
144 exactly as it will appear in the headers (case-sensitive). Must be
145 "md5", "crc32c" or None.
146
147 Returns:
148 Tuple (Optional[str], object): The expected checksum of the response,
149 if it can be detected from the ``X-Goog-Hash`` header, and the
150 appropriate checksum object for the expected checksum.
151 """
152 if checksum_type not in ["md5", "crc32c", None]:
153 raise ValueError("checksum must be ``'md5'``, ``'crc32c'`` or ``None``")
154 elif checksum_type in ["md5", "crc32c"]:
155 headers = get_headers(response)
156 expected_checksum = _parse_checksum_header(
157 headers.get(_HASH_HEADER), response, checksum_label=checksum_type
158 )
159
160 if expected_checksum is None:
161 msg = _MISSING_CHECKSUM.format(
162 media_url, checksum_type=checksum_type.upper()
163 )
164 _LOGGER.info(msg)
165 checksum_object = _DoNothingHash()
166 else:
167 checksum_object = _get_checksum_object(checksum_type)
168 else:
169 expected_checksum = None
170 checksum_object = _DoNothingHash()
171
172 return (expected_checksum, checksum_object)
173
174
175def _get_uploaded_checksum_from_headers(response, get_headers, checksum_type):
176 """Get the computed checksum and checksum object from the response headers.
177
178 Args:
179 response (~requests.Response): The HTTP response object.
180 get_headers (callable: response->dict): returns response headers.
181 checksum_type Optional(str): The checksum type to read from the headers,
182 exactly as it will appear in the headers (case-sensitive). Must be
183 "md5", "crc32c" or None.
184
185 Returns:
186 Tuple (Optional[str], object): The checksum of the response,
187 if it can be detected from the ``X-Goog-Hash`` header, and the
188 appropriate checksum object for the expected checksum.
189 """
190 if checksum_type not in ["md5", "crc32c", None]:
191 raise ValueError("checksum must be ``'md5'``, ``'crc32c'`` or ``None``")
192 elif checksum_type in ["md5", "crc32c"]:
193 headers = get_headers(response)
194 remote_checksum = _parse_checksum_header(
195 headers.get(_HASH_HEADER), response, checksum_label=checksum_type
196 )
197 else:
198 remote_checksum = None
199
200 return remote_checksum
201
202
203def _parse_checksum_header(header_value, response, checksum_label):
204 """Parses the checksum header from an ``X-Goog-Hash`` value.
205
206 .. _header reference: https://cloud.google.com/storage/docs/\
207 xml-api/reference-headers#xgooghash
208
209 Expects ``header_value`` (if not :data:`None`) to be in one of the three
210 following formats:
211
212 * ``crc32c=n03x6A==``
213 * ``md5=Ojk9c3dhfxgoKVVHYwFbHQ==``
214 * ``crc32c=n03x6A==,md5=Ojk9c3dhfxgoKVVHYwFbHQ==``
215
216 See the `header reference`_ for more information.
217
218 Args:
219 header_value (Optional[str]): The ``X-Goog-Hash`` header from
220 a download response.
221 response (~requests.Response): The HTTP response object.
222 checksum_label (str): The label of the header value to read, as in the
223 examples above. Typically "md5" or "crc32c"
224
225 Returns:
226 Optional[str]: The expected checksum of the response, if it
227 can be detected from the ``X-Goog-Hash`` header; otherwise, None.
228
229 Raises:
230 ~google.cloud.storage.exceptions.InvalidResponse: If there are
231 multiple checksums of the requested type in ``header_value``.
232 """
233 if header_value is None:
234 return None
235
236 matches = []
237 for checksum in header_value.split(","):
238 name, value = checksum.split("=", 1)
239 # Official docs say "," is the separator, but real-world responses have encountered ", "
240 if name.lstrip() == checksum_label:
241 matches.append(value)
242
243 if len(matches) == 0:
244 return None
245 elif len(matches) == 1:
246 return matches[0]
247 else:
248 raise InvalidResponse(
249 response,
250 "X-Goog-Hash header had multiple ``{}`` values.".format(checksum_label),
251 header_value,
252 matches,
253 )
254
255
256def _get_checksum_object(checksum_type):
257 """Respond with a checksum object for a supported type, if not None.
258
259 Raises ValueError if checksum_type is unsupported.
260 """
261 if checksum_type == "md5":
262 return hashlib.md5()
263 elif checksum_type == "crc32c":
264 # In order to support platforms that don't have google_crc32c
265 # support, only perform the import on demand.
266 import google_crc32c
267
268 return google_crc32c.Checksum()
269 elif checksum_type is None:
270 return None
271 else:
272 raise ValueError("checksum must be ``'md5'``, ``'crc32c'`` or ``None``")
273
274
275def _is_crc32c_available_and_fast():
276 """Return True if the google_crc32c C extension is installed.
277
278 Return False if either the package is not installed, or if only the
279 pure-Python version is installed.
280 """
281 try:
282 import google_crc32c
283
284 if google_crc32c.implementation == "c":
285 return True
286 except Exception:
287 pass
288 return False
289
290
def _parse_generation_header(response, get_headers):
    """Extract the object generation from the ``X-Goog-Generation`` header.

    Args:
        response (~requests.Response): The HTTP response object.
        get_headers (callable: response->dict): returns response headers.

    Returns:
        Optional[int]: The header value converted with ``int``, or None when
        the response carries no ``X-Goog-Generation`` header.
    """
    raw_generation = get_headers(response).get(_GENERATION_HEADER, None)
    return None if raw_generation is None else int(raw_generation)
309
310
311def _get_generation_from_url(media_url):
312 """Retrieve the object generation query param specified in the media url.
313
314 Args:
315 media_url (str): The URL containing the media to be downloaded.
316
317 Returns:
318 long: The object generation from the media url if exists; otherwise, None.
319 """
320
321 _, _, _, query, _ = urlsplit(media_url)
322 query_params = parse_qs(query)
323 object_generation = query_params.get("generation", None)
324
325 if object_generation is None:
326 return None
327 else:
328 return int(object_generation[0])
329
330
def add_query_parameters(media_url, query_params):
    """Add query parameters to a base url.

    Parameters already present in ``media_url`` are preserved, including
    those with a blank value (``?flag=``). When a name appears both in the
    URL and in ``query_params``, the value from ``query_params`` wins.

    Args:
        media_url (str): The URL containing the media to be downloaded.
        query_params (Optional[dict]): Names and values of the query
            parameters to add. ``None`` or an empty mapping leaves the URL
            untouched.

    Returns:
        str: URL with additional query strings appended.
    """
    # Nothing to add (empty or None): hand back the URL exactly as given.
    if not query_params:
        return media_url

    scheme, netloc, path, query, frag = urlsplit(media_url)
    # ``parse_qs`` drops blank-valued parameters by default, which would
    # silently remove them from the caller's URL; keep them.
    existing_params = parse_qs(query, keep_blank_values=True)
    merged_params = {**existing_params, **query_params}
    # ``doseq=True`` expands the value lists produced by ``parse_qs``.
    new_query = urlencode(merged_params, doseq=True)
    return urlunsplit((scheme, netloc, path, new_query, frag))
350
351
def _is_decompressive_transcoding(response, get_headers):
    """Tell whether the object was served decompressed.

    This is the case when the "x-goog-stored-content-encoding" header is
    "gzip" and the "content-encoding" header is not "gzip". See more at:
    https://cloud.google.com/storage/docs/transcoding#transcoding_and_gzip

    Args:
        response (~requests.Response): The HTTP response object.
        get_headers (callable: response->dict): returns response headers.

    Returns:
        bool: True if decompressive transcoding has occurred; otherwise, False.
    """
    response_headers = get_headers(response)

    # Not stored gzip-compressed, so there was nothing to decompress.
    if response_headers.get(_STORED_CONTENT_ENCODING_HEADER) != "gzip":
        return False

    # Stored as gzip: transcoding happened unless it is still served as gzip.
    return response_headers.get(CONTENT_ENCODING_HEADER) != "gzip"
367
368
369class _DoNothingHash(object):
370 """Do-nothing hash object.
371
372 Intended as a stand-in for ``hashlib.md5`` or a crc32c checksum
373 implementation in cases where it isn't necessary to compute the hash.
374 """
375
376 def update(self, unused_chunk):
377 """Do-nothing ``update`` method.
378
379 Intended to match the interface of ``hashlib.md5`` and other checksums.
380
381 Args:
382 unused_chunk (bytes): A chunk of data.
383 """