1# Copyright 2017 Google Inc.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
"""Virtual base classes for downloading media from Google APIs."""
16
17import http.client
18import re
19
20from google.resumable_media import _helpers
21from google.resumable_media import common
22
23
# Parses a ``Content-Range`` value of the form ``bytes {start}-{end}/{total}``;
# matching ignores case.
_CONTENT_RANGE_RE = re.compile(
    r"bytes (?P<start_byte>\d+)-(?P<end_byte>\d+)/(?P<total_bytes>\d+)",
    re.IGNORECASE,
)
# Status codes accepted when validating a download response.
_ACCEPTABLE_STATUS_CODES = (
    http.client.OK,
    http.client.PARTIAL_CONTENT,
)
# HTTP verb used for every download request built in this module.
_GET = "GET"
# ``Content-Range`` value that, on a 416 response, identifies a zero-byte file.
_ZERO_CONTENT_RANGE_HEADER = "bytes */0"
31
32
class DownloadBase:
    """Common foundation for the download helpers.

    Stores the state shared by every download type and declares the
    transport-specific response accessors that concrete subclasses must
    provide.

    Args:
        media_url (str): The URL containing the media to be downloaded.
        stream (IO[bytes]): A write-able stream (i.e. file-like object) that
            the downloaded resource can be written to.
        start (int): The first byte in a range to be downloaded.
        end (int): The last byte in a range to be downloaded.
        headers (Optional[Mapping[str, str]]): Extra headers that should
            be sent with the request, e.g. headers for encrypted data.

    Attributes:
        media_url (str): The URL containing the media to be downloaded.
        start (Optional[int]): The first byte in a range to be downloaded.
        end (Optional[int]): The last byte in a range to be downloaded.
    """

    def __init__(self, media_url, stream=None, start=None, end=None, headers=None):
        self.media_url = media_url
        self._stream = stream
        self.start = start
        self.end = end
        # A caller-supplied mapping is stored as-is (not copied); only a
        # missing one is replaced by a fresh dictionary.
        self._headers = {} if headers is None else headers
        self._finished = False
        self._retry_strategy = common.RetryStrategy()

    @property
    def finished(self):
        """bool: Whether this download has completed."""
        return self._finished

    @staticmethod
    def _get_status_code(response):
        """Read the status code out of an HTTP response.

        Args:
            response (object): The HTTP response object.

        Raises:
            NotImplementedError: Always, since virtual.
        """
        raise NotImplementedError("This implementation is virtual.")

    @staticmethod
    def _get_headers(response):
        """Read the headers out of an HTTP response.

        Args:
            response (object): The HTTP response object.

        Raises:
            NotImplementedError: Always, since virtual.
        """
        raise NotImplementedError("This implementation is virtual.")

    @staticmethod
    def _get_body(response):
        """Read the body out of an HTTP response.

        Args:
            response (object): The HTTP response object.

        Raises:
            NotImplementedError: Always, since virtual.
        """
        raise NotImplementedError("This implementation is virtual.")
104
105
class Download(DownloadBase):
    """Helper to manage downloading a resource from a Google API.

    A "slice" of the resource can be requested by passing ``start`` and / or
    ``end``. In typical usage, however, neither of them is expected to be
    provided.

    Args:
        media_url (str): The URL containing the media to be downloaded.
        stream (IO[bytes]): A write-able stream (i.e. file-like object) that
            the downloaded resource can be written to.
        start (int): The first byte in a range to be downloaded. If not
            provided, but ``end`` is provided, will download from the
            beginning to ``end`` of the media.
        end (int): The last byte in a range to be downloaded. If not
            provided, but ``start`` is provided, will download from the
            ``start`` to the end of the media.
        headers (Optional[Mapping[str, str]]): Extra headers that should
            be sent with the request, e.g. headers for encrypted data.
        checksum (Optional[str]): The type of checksum to compute to verify
            the integrity of the object. The response headers must contain
            a checksum of the requested type. If the headers lack an
            appropriate checksum (for instance in the case of transcoded or
            ranged downloads where the remote service does not know the
            correct checksum) an INFO-level log will be emitted. Supported
            values are "md5", "crc32c" and None.
    """

    def __init__(
        self, media_url, stream=None, start=None, end=None, headers=None, checksum="md5"
    ):
        super().__init__(
            media_url, stream=stream, start=start, end=end, headers=headers
        )
        self.checksum = checksum
        self._bytes_downloaded = 0
        # Unset until populated elsewhere; this class only initialises them.
        self._expected_checksum = self._checksum_object = None
        self._object_generation = None

    def _prepare_request(self):
        """Build the pieces of the HTTP request for this download.

        Covers everything that has to happen before the request and needs
        no network I/O (or other I/O), following the `sans-I/O`_ philosophy.

        Returns:
            Tuple[str, str, NoneType, Mapping[str, str]]: The quadruple

              * HTTP verb for the request (always GET)
              * the URL for the request
              * the body of the request (always :data:`None`)
              * headers for the request

        Raises:
            ValueError: If the current :class:`Download` has already
                finished.

        .. _sans-I/O: https://sans-io.readthedocs.io/
        """
        if self.finished:
            raise ValueError("A download can only be used once.")

        # The stored headers mapping is updated in place with the range
        # (if any) and handed back to the caller.
        request_headers = self._headers
        add_bytes_range(self.start, self.end, request_headers)
        return _GET, self.media_url, None, request_headers

    def _process_response(self, response):
        """Handle the response to the download request.

        Covers everything that has to happen after the request and needs
        no network I/O (or other I/O), following the `sans-I/O`_ philosophy.

        Args:
            response (object): The HTTP response object.

        .. _sans-I/O: https://sans-io.readthedocs.io/
        """
        # Mark this instance as used up *before* validating, so it cannot be
        # re-used even when the status check below rejects the response.
        self._finished = True
        _helpers.require_status_code(
            response, _ACCEPTABLE_STATUS_CODES, self._get_status_code
        )

    def consume(self, transport, timeout=None):
        """Consume the resource to be downloaded.

        If a ``stream`` is attached to this download, then the downloaded
        resource will be written to the stream.

        Args:
            transport (object): An object which can make authenticated
                requests.
            timeout (Optional[Union[float, Tuple[float, float]]]):
                The number of seconds to wait for the server response.
                Depending on the retry strategy, a request may be repeated
                several times using the same timeout each time.

                Can also be passed as a tuple (connect_timeout, read_timeout).
                See :meth:`requests.Session.request` documentation for details.

        Raises:
            NotImplementedError: Always, since virtual.
        """
        raise NotImplementedError("This implementation is virtual.")
212
213
class ChunkedDownload(DownloadBase):
    """Download a resource in chunks from a Google API.

    Args:
        media_url (str): The URL containing the media to be downloaded.
        chunk_size (int): The number of bytes to be retrieved in each
            request.
        stream (IO[bytes]): A write-able stream (i.e. file-like object) that
            will be used to concatenate chunks of the resource as they are
            downloaded.
        start (int): The first byte in a range to be downloaded. If not
            provided, defaults to ``0``.
        end (int): The last byte in a range to be downloaded. If not
            provided, will download to the end of the media.
        headers (Optional[Mapping[str, str]]): Extra headers that should
            be sent with each request, e.g. headers for data encryption
            key headers.

    Attributes:
        media_url (str): The URL containing the media to be downloaded.
        start (Optional[int]): The first byte in a range to be downloaded.
        end (Optional[int]): The last byte in a range to be downloaded.
        chunk_size (int): The number of bytes to be retrieved in each request.

    Raises:
        ValueError: If ``start`` is negative.
    """

    def __init__(self, media_url, chunk_size, stream, start=0, end=None, headers=None):
        if start < 0:
            raise ValueError(
                "On a chunked download the starting value cannot be negative."
            )
        super().__init__(
            media_url, stream=stream, start=start, end=end, headers=headers
        )
        self.chunk_size = chunk_size
        self._bytes_downloaded = 0
        self._total_bytes = None
        self._invalid = False

    @property
    def bytes_downloaded(self):
        """int: Number of bytes that have been downloaded."""
        return self._bytes_downloaded

    @property
    def total_bytes(self):
        """Optional[int]: The total number of bytes to be downloaded."""
        return self._total_bytes

    @property
    def invalid(self):
        """bool: Indicates if the download is in an invalid state.

        This will occur if a call to :meth:`consume_next_chunk` fails.
        """
        return self._invalid

    def _get_byte_range(self):
        """Determines the byte range for the next request.

        Returns:
            Tuple[int, int]: The pair of begin and end byte for the next
            chunked request.
        """
        curr_start = self.start + self.bytes_downloaded
        curr_end = curr_start + self.chunk_size - 1
        # Make sure ``curr_end`` does not exceed ``end``.
        if self.end is not None:
            curr_end = min(curr_end, self.end)
        # Make sure ``curr_end`` does not exceed ``total_bytes - 1``.
        if self.total_bytes is not None:
            curr_end = min(curr_end, self.total_bytes - 1)
        return curr_start, curr_end

    def _prepare_request(self):
        """Prepare the contents of an HTTP request.

        This is everything that must be done before a request that doesn't
        require network I/O (or other I/O). This is based on the `sans-I/O`_
        philosophy.

        .. note::

            This method will be used multiple times, so ``headers`` will
            be mutated in between requests. However, we don't make a copy
            since the same keys are being updated.

        Returns:
            Tuple[str, str, NoneType, Mapping[str, str]]: The quadruple

              * HTTP verb for the request (always GET)
              * the URL for the request
              * the body of the request (always :data:`None`)
              * headers for the request

        Raises:
            ValueError: If the current download has finished.
            ValueError: If the current download is invalid.

        .. _sans-I/O: https://sans-io.readthedocs.io/
        """
        if self.finished:
            raise ValueError("Download has finished.")
        if self.invalid:
            raise ValueError("Download is invalid and cannot be re-used.")

        curr_start, curr_end = self._get_byte_range()
        add_bytes_range(curr_start, curr_end, self._headers)
        return _GET, self.media_url, None, self._headers

    def _make_invalid(self):
        """Simple setter for ``invalid``.

        This is intended to be passed along as a callback to helpers that
        raise an exception so they can mark this instance as invalid before
        raising.
        """
        self._invalid = True

    def _process_response(self, response):
        """Process the response from an HTTP request.

        This is everything that must be done after a request that doesn't
        require network I/O. This is based on the `sans-I/O`_ philosophy.

        For the time being, this **does require** some form of I/O to write
        a chunk to ``stream``. However, this will (almost) certainly not be
        network I/O.

        Updates the current state after consuming a chunk. First,
        increments ``bytes_downloaded`` by the number of bytes in the
        ``content-length`` header (or, when a ``transfer-encoding`` header
        is present, by the size of the range reported in ``content-range``).

        If ``total_bytes`` is already set, this assumes (but does not check)
        that we already have the correct value and doesn't bother to check
        that it agrees with the headers.

        We expect the **total** length to be in the ``content-range`` header,
        but this header is only present on requests which sent the ``range``
        header. This response header should be of the form
        ``bytes {start}-{end}/{total}`` and ``{end} - {start} + 1``
        should be the same as the ``Content-Length``.

        A 416 response whose ``content-range`` is ``bytes */0`` is treated as
        the successful download of a zero-byte resource: the download is
        marked finished and nothing is written to ``stream``.

        Whenever this method raises
        :class:`~google.resumable_media.common.InvalidResponse`, the
        instance is flagged as ``invalid``.

        Args:
            response (object): The HTTP response object (need headers).

        Raises:
            ~google.resumable_media.common.InvalidResponse: If the number
                of bytes in the body doesn't match the content length header.

        .. _sans-I/O: https://sans-io.readthedocs.io/
        """
        # Verify the response before updating the current instance.
        try:
            is_empty_resource = _check_for_zero_content_range(
                response, self._get_status_code, self._get_headers
            )
        except common.InvalidResponse:
            # The zero-range check validates headers with a no-op callback,
            # so flag this instance here if it rejects the response; every
            # other failure path in this method does the same.
            self._make_invalid()
            raise

        if is_empty_resource:
            self._finished = True
            return

        _helpers.require_status_code(
            response,
            _ACCEPTABLE_STATUS_CODES,
            self._get_status_code,
            callback=self._make_invalid,
        )
        headers = self._get_headers(response)
        response_body = self._get_body(response)

        start_byte, end_byte, total_bytes = get_range_info(
            response, self._get_headers, callback=self._make_invalid
        )

        transfer_encoding = headers.get("transfer-encoding")

        if transfer_encoding is None:
            content_length = _helpers.header_required(
                response,
                "content-length",
                self._get_headers,
                callback=self._make_invalid,
            )
            num_bytes = int(content_length)
            if len(response_body) != num_bytes:
                self._make_invalid()
                raise common.InvalidResponse(
                    response,
                    "Response is different size than content-length",
                    "Expected",
                    num_bytes,
                    "Received",
                    len(response_body),
                )
        else:
            # 'content-length' header not allowed with chunked encoding.
            num_bytes = end_byte - start_byte + 1

        # First update ``bytes_downloaded``.
        self._bytes_downloaded += num_bytes
        # If the end byte is past ``end`` or ``total_bytes - 1`` we are done.
        if self.end is not None and end_byte >= self.end:
            self._finished = True
        elif end_byte >= total_bytes - 1:
            self._finished = True
        # NOTE: We only use ``total_bytes`` if not already known.
        if self.total_bytes is None:
            self._total_bytes = total_bytes
        # Write the response body to the stream.
        self._stream.write(response_body)

    def consume_next_chunk(self, transport, timeout=None):
        """Consume the next chunk of the resource to be downloaded.

        Args:
            transport (object): An object which can make authenticated
                requests.
            timeout (Optional[Union[float, Tuple[float, float]]]):
                The number of seconds to wait for the server response.
                Depending on the retry strategy, a request may be repeated
                several times using the same timeout each time.

                Can also be passed as a tuple (connect_timeout, read_timeout).
                See :meth:`requests.Session.request` documentation for details.

        Raises:
            NotImplementedError: Always, since virtual.
        """
        raise NotImplementedError("This implementation is virtual.")
443
444
def add_bytes_range(start, end, headers):
    """Store a bytes range in a header dictionary.

    Sample inputs and the range each one produces::

       >>> headers = {}
       >>> add_bytes_range(None, None, headers)
       >>> headers
       {}
       >>> add_bytes_range(500, 999, headers)
       >>> headers['range']
       'bytes=500-999'
       >>> add_bytes_range(None, 499, headers)
       >>> headers['range']
       'bytes=0-499'
       >>> add_bytes_range(-500, None, headers)
       >>> headers['range']
       'bytes=-500'
       >>> add_bytes_range(9500, None, headers)
       >>> headers['range']
       'bytes=9500-'

    Args:
        start (Optional[int]): The first byte in a range. Can be zero,
            positive, negative or :data:`None`.
        end (Optional[int]): The last byte in a range. Assumed to be
            positive.
        headers (Mapping[str, str]): A headers mapping which can have the
            bytes range added if at least one of ``start`` or ``end``
            is not :data:`None`.
    """
    if start is None and end is None:
        # Nothing was requested, so ``headers`` is left untouched.
        return

    if start is None:
        # Only an end was given: read from the first byte.
        # NOTE: This assumes ``end`` is non-negative.
        range_spec = "0-{:d}".format(end)
    elif end is not None:
        # NOTE: This is invalid if ``start < 0``.
        range_spec = "{:d}-{:d}".format(start, end)
    elif start < 0:
        # Negative start with no end: the value is emitted as-is, so its
        # own minus sign forms the ``-N`` spec.
        range_spec = "{:d}".format(start)
    else:
        # Open-ended range beginning at ``start``.
        range_spec = "{:d}-".format(start)

    headers[_helpers.RANGE_HEADER] = "bytes=" + range_spec
494
495
def get_range_info(response, get_headers, callback=_helpers.do_nothing):
    """Extract the start, end and total bytes from a content range header.

    Args:
        response (object): An HTTP response object.
        get_headers (Callable[Any, Mapping[str, str]]): Helper to get headers
            from an HTTP response.
        callback (Optional[Callable]): A callback that takes no arguments,
            to be executed when an exception is being raised.

    Returns:
        Tuple[int, int, int]: The start byte, end byte and total bytes.

    Raises:
        ~google.resumable_media.common.InvalidResponse: If the
            ``Content-Range`` header is not of the form
            ``bytes {start}-{end}/{total}``.
    """
    header_value = _helpers.header_required(
        response, _helpers.CONTENT_RANGE_HEADER, get_headers, callback=callback
    )
    parsed = _CONTENT_RANGE_RE.match(header_value)
    if parsed is not None:
        # Each named group only matches digits, so ``int`` cannot fail here.
        return tuple(
            int(piece)
            for piece in parsed.group("start_byte", "end_byte", "total_bytes")
        )

    # Give the caller a chance to react before the error propagates.
    callback()
    raise common.InvalidResponse(
        response,
        "Unexpected content-range header",
        header_value,
        'Expected to be of the form "bytes {start}-{end}/{total}"',
    )
532
533
def _check_for_zero_content_range(response, get_status_code, get_headers):
    """Report whether a response is a 416 with a zero content range.

    This is the special case for handling zero bytes files.

    Args:
        response (object): An HTTP response object.
        get_status_code (Callable[Any, int]): Helper to get a status code
            from a response.
        get_headers (Callable[Any, Mapping[str, str]]): Helper to get headers
            from an HTTP response.

    Returns:
        bool: True if content range total bytes is zero, false otherwise.
    """
    if get_status_code(response) != http.client.REQUESTED_RANGE_NOT_SATISFIABLE:
        # Headers are only inspected for 416 responses.
        return False

    range_header = _helpers.header_required(
        response,
        _helpers.CONTENT_RANGE_HEADER,
        get_headers,
        callback=_helpers.do_nothing,
    )
    return range_header == _ZERO_CONTENT_RANGE_HEADER