Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/smart_open/smart_open_lib.py: 27%
165 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:57 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:57 +0000
1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2019 Radim Rehurek <me@radimrehurek.com>
4#
5# This code is distributed under the terms and conditions
6# from the MIT License (MIT).
7#
9"""Implements the majority of smart_open's top-level API.
11The main functions are:
13 * ``parse_uri()``
14 * ``open()``
16"""
18import collections
19import io
20import locale
21import logging
22import os
23import os.path as P
24import pathlib
25import urllib.parse
26import warnings
28#
29# This module defines a function called smart_open so we cannot use
30# smart_open.submodule to reference to the submodules.
31#
32import smart_open.local_file as so_file
33import smart_open.compression as so_compression
35from smart_open import doctools
36from smart_open import transport
38#
39# For backwards compatibility and keeping old unit tests happy.
40#
41from smart_open.compression import register_compressor # noqa: F401
42from smart_open.utils import check_kwargs as _check_kwargs # noqa: F401
43from smart_open.utils import inspect_kwargs as _inspect_kwargs # noqa: F401
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Text encoding used when the caller does not supply one.  Queried without
# letting the locale module call setlocale().
DEFAULT_ENCODING = locale.getpreferredencoding(False)
50def _sniff_scheme(uri_as_string):
51 """Returns the scheme of the URL only, as a string."""
52 #
53 # urlsplit doesn't work on Windows -- it parses the drive as the scheme...
54 # no protocol given => assume a local file
55 #
56 if os.name == 'nt' and '://' not in uri_as_string:
57 uri_as_string = 'file://' + uri_as_string
59 return urllib.parse.urlsplit(uri_as_string).scheme
def parse_uri(uri_as_string):
    """
    Parse the given URI from a string.

    Parameters
    ----------
    uri_as_string: str
        The URI to parse.

    Returns
    -------
    collections.namedtuple
        The parsed URI.

    Notes
    -----
    smart_open/doctools.py magic goes here
    """
    #
    # NOTE: this function is handed to doctools.tweak_parse_uri_docstring at
    # the bottom of the module, so keep the marker line in the Notes section
    # of the docstring above intact.
    #
    # The scheme selects the transport submodule, which does the real parsing.
    #
    transport_module = transport.get_transport(_sniff_scheme(uri_as_string))
    fields = transport_module.parse_uri(uri_as_string)

    #
    # The transport returns a dict; it is repackaged as a namedtuple (fields
    # in sorted order) purely to keep the old tests happy during refactoring.
    #
    uri_type = collections.namedtuple('Uri', sorted(fields))
    return uri_type(**fields)
#
# Private alias of parse_uri, retained so that old unit tests keep working
# while the refactoring is in progress.
#
_parse_uri = parse_uri

#
# Keep a reference to the built-in open before this module defines its own
# function named ``open`` below, which shadows the built-in from then on.
#
_builtin_open = open
def open(
    uri,
    mode='r',
    buffering=-1,
    encoding=None,
    errors=None,
    newline=None,
    closefd=True,
    opener=None,
    compression=so_compression.INFER_FROM_EXTENSION,
    transport_params=None,
):
    r"""Open the URI object, returning a file-like object.

    The URI is usually a string in a variety of formats.
    For a full list of examples, see the :func:`parse_uri` function.

    The URI may also be one of:

    - an instance of the pathlib.Path class
    - an integer file descriptor (opened with the built-in open, closefd=False)

    Parameters
    ----------
    uri: str or object
        The object to open.
    mode: str, optional
        Mimics built-in open parameter of the same name.
    buffering: int, optional
        Mimics built-in open parameter of the same name.
    encoding: str, optional
        Mimics built-in open parameter of the same name.
    errors: str, optional
        Mimics built-in open parameter of the same name.
    newline: str, optional
        Mimics built-in open parameter of the same name.
    closefd: boolean, optional
        Mimics built-in open parameter of the same name.  Ignored.
    opener: object, optional
        Mimics built-in open parameter of the same name.  Ignored.
    compression: str, optional (see smart_open.compression.get_supported_compression_types)
        Explicitly specify the compression/decompression behavior.
    transport_params: dict, optional
        Additional parameters for the transport layer (see notes below).

    Returns
    -------
    A file-like object.

    Raises
    ------
    TypeError
        If ``mode`` is not a string, or ``uri`` is neither a string, a
        pathlib.Path nor an integer.
    ValueError
        If ``compression`` is not one of the supported compression types.
    NotImplementedError
        If ``mode`` is invalid or cannot be mapped to a supported binary mode.

    Notes
    -----
    smart_open has several implementations for its transport layer (e.g. S3, HTTP).
    Each transport layer has a different set of keyword arguments for overriding
    default behavior.  If you specify a keyword argument that is *not* supported
    by the transport layer being used, smart_open will ignore that argument and
    log a warning message.

    smart_open/doctools.py magic goes here

    See Also
    --------
    - `Standard library reference <https://docs.python.org/3.7/library/functions.html#open>`__
    - `smart_open README.rst
      <https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst>`__

    """
    #
    # Must stay the first statement: locals() should contain only the
    # arguments, exactly as the caller passed them.
    #
    logger.debug('%r', locals())

    if not isinstance(mode, str):
        raise TypeError('mode should be a string')

    if compression not in so_compression.get_supported_compression_types():
        raise ValueError(f'invalid compression type: {compression}')

    if transport_params is None:
        transport_params = {}

    #
    # Fast path: plain local files without compression go straight to the
    # built-in open.  _shortcut_open returns None when it does not apply.
    #
    fobj = _shortcut_open(
        uri,
        mode,
        compression=compression,
        buffering=buffering,
        encoding=encoding,
        errors=errors,
        newline=newline,
    )
    if fobj is not None:
        return fobj

    #
    # This is a work-around for the problem described in Issue #144.
    # If the user has explicitly specified an encoding, then assume they want
    # us to open the destination in text mode, instead of the default binary.
    #
    # If we change the default mode to be text, and match the normal behavior
    # of Py2 and 3, then the above assumption will be unnecessary.
    #
    if encoding is not None and 'b' in mode:
        mode = mode.replace('b', '')

    if isinstance(uri, pathlib.Path):
        uri = str(uri)

    explicit_encoding = encoding
    encoding = explicit_encoding if explicit_encoding else DEFAULT_ENCODING

    #
    # This is how we get from the filename to the end result.  Decompression is
    # optional, but it always accepts bytes and returns bytes.
    #
    # Decoding is also optional, accepts bytes and returns text.  The diagram
    # below is for reading, for writing, the flow is from right to left, but
    # the code is identical.
    #
    #           open as binary         decompress?          decode?
    # filename ---------------> bytes -------------> bytes ---------> text
    #                          binary             decompressed       decode
    #
    try:
        binary_mode = _get_binary_mode(mode)
    except ValueError as ve:
        #
        # Callers historically expect NotImplementedError here.  Chain the
        # original error explicitly so the root cause stays visible.
        #
        raise NotImplementedError(ve.args[0]) from ve

    binary = _open_binary_stream(uri, binary_mode, transport_params)
    decompressed = so_compression.compression_wrapper(binary, binary_mode, compression)

    if 'b' not in mode or explicit_encoding is not None:
        decoded = _encoding_wrapper(
            decompressed,
            mode,
            encoding=encoding,
            errors=errors,
            newline=newline,
        )
    else:
        decoded = decompressed

    #
    # There are some useful methods in the binary readers, e.g. to_boto3, that get
    # hidden by the multiple layers of wrapping we just performed.  Promote
    # them so they are visible to the user.
    #
    # The question is whether any wrapping happened at all, so compare by
    # identity rather than by equality.
    #
    if decoded is not binary:
        promoted_attrs = ['to_boto3']
        for attr in promoted_attrs:
            try:
                setattr(decoded, attr, getattr(binary, attr))
            except AttributeError:
                # Best effort: not every binary stream offers these extras.
                pass

    return decoded
254def _get_binary_mode(mode_str):
255 #
256 # https://docs.python.org/3/library/functions.html#open
257 #
258 # The order of characters in the mode parameter appears to be unspecified.
259 # The implementation follows the examples, just to be safe.
260 #
261 mode = list(mode_str)
262 binmode = []
264 if 't' in mode and 'b' in mode:
265 raise ValueError("can't have text and binary mode at once")
267 counts = [mode.count(x) for x in 'rwa']
268 if sum(counts) > 1:
269 raise ValueError("must have exactly one of create/read/write/append mode")
271 def transfer(char):
272 binmode.append(mode.pop(mode.index(char)))
274 if 'a' in mode:
275 transfer('a')
276 elif 'w' in mode:
277 transfer('w')
278 elif 'r' in mode:
279 transfer('r')
280 else:
281 raise ValueError(
282 "Must have exactly one of create/read/write/append "
283 "mode and at most one plus"
284 )
286 if 'b' in mode:
287 transfer('b')
288 elif 't' in mode:
289 mode.pop(mode.index('t'))
290 binmode.append('b')
291 else:
292 binmode.append('b')
294 if '+' in mode:
295 transfer('+')
297 #
298 # There shouldn't be anything left in the mode list at this stage.
299 # If there is, then either we've missed something and the implementation
300 # of this function is broken, or the original input mode is invalid.
301 #
302 if mode:
303 raise ValueError('invalid mode: %r' % mode_str)
305 return ''.join(binmode)
308def _shortcut_open(
309 uri,
310 mode,
311 compression,
312 buffering=-1,
313 encoding=None,
314 errors=None,
315 newline=None,
316 ):
317 """Try to open the URI using the standard library io.open function.
319 This can be much faster than the alternative of opening in binary mode and
320 then decoding.
322 This is only possible under the following conditions:
324 1. Opening a local file; and
325 2. Compression is disabled
327 If it is not possible to use the built-in open for the specified URI, returns None.
329 :param str uri: A string indicating what to open.
330 :param str mode: The mode to pass to the open function.
331 :param str compression: The compression type selected.
332 :returns: The opened file
333 :rtype: file
334 """
335 if not isinstance(uri, str):
336 return None
338 scheme = _sniff_scheme(uri)
339 if scheme not in (transport.NO_SCHEME, so_file.SCHEME):
340 return None
342 local_path = so_file.extract_local_path(uri)
343 if compression == so_compression.INFER_FROM_EXTENSION:
344 _, extension = P.splitext(local_path)
345 if extension in so_compression.get_supported_extensions():
346 return None
347 elif compression != so_compression.NO_COMPRESSION:
348 return None
350 open_kwargs = {}
351 if encoding is not None:
352 open_kwargs['encoding'] = encoding
353 mode = mode.replace('b', '')
354 if newline is not None:
355 open_kwargs['newline'] = newline
357 #
358 # binary mode of the builtin/stdlib open function doesn't take an errors argument
359 #
360 if errors and 'b' not in mode:
361 open_kwargs['errors'] = errors
363 return _builtin_open(local_path, mode, buffering=buffering, **open_kwargs)
366def _open_binary_stream(uri, mode, transport_params):
367 """Open an arbitrary URI in the specified binary mode.
369 Not all modes are supported for all protocols.
371 :arg uri: The URI to open. May be a string, or something else.
372 :arg str mode: The mode to open with. Must be rb, wb or ab.
373 :arg transport_params: Keyword argumens for the transport layer.
374 :returns: A named file object
375 :rtype: file-like object with a .name attribute
376 """
377 if mode not in ('rb', 'rb+', 'wb', 'wb+', 'ab', 'ab+'):
378 #
379 # This should really be a ValueError, but for the sake of compatibility
380 # with older versions, which raise NotImplementedError, we do the same.
381 #
382 raise NotImplementedError('unsupported mode: %r' % mode)
384 if isinstance(uri, int):
385 #
386 # We're working with a file descriptor. If we open it, its name is
387 # just the integer value, which isn't helpful. Unfortunately, there's
388 # no easy cross-platform way to go from a file descriptor to the filename,
389 # so we just give up here. The user will have to handle their own
390 # compression, etc. explicitly.
391 #
392 fobj = _builtin_open(uri, mode, closefd=False)
393 return fobj
395 if not isinstance(uri, str):
396 raise TypeError("don't know how to handle uri %s" % repr(uri))
398 scheme = _sniff_scheme(uri)
399 submodule = transport.get_transport(scheme)
400 fobj = submodule.open_uri(uri, mode, transport_params)
401 if not hasattr(fobj, 'name'):
402 fobj.name = uri
404 return fobj
def _encoding_wrapper(fileobj, mode, encoding=None, errors=None, newline=None):
    """Wrap a byte stream so that it reads/writes text, when appropriate.

    A binary mode with no encoding leaves the stream untouched.  In every
    other case the stream is wrapped in an ``io.TextIOWrapper``; an explicit
    encoding therefore implies text mode even if the mode says binary.

    :arg fileobj: must quack like a filehandle object.
    :arg str mode: is the mode which was originally requested by the user.
    :arg str encoding: The text encoding to use.  If mode is binary, overrides mode.
    :arg str errors: The method to use when handling encoding/decoding errors.
    :arg str newline: Newline handling, passed on to the text wrapper.
    :returns: a file object
    """
    # Keep this first so that locals() holds nothing but the arguments.
    logger.debug('encoding_wrapper: %r', locals())

    #
    # If the mode is binary, but the user specified an encoding, assume they
    # want text.  Without that assumption the encoding would be ignored and
    # bytes returned, diverging from the built-in open:
    #
    #   open(filename, encoding='utf-8') returns a text stream in Py3
    #   smart_open(filename, encoding='utf-8') would return a byte stream
    #       without our assumption, because the default mode is rb.
    #
    wants_bytes = 'b' in mode
    if encoding is None:
        if wants_bytes:
            return fileobj
        encoding = DEFAULT_ENCODING

    return io.TextIOWrapper(
        fileobj,
        encoding=encoding,
        errors=errors,
        newline=newline,
        write_through=True,
    )
class patch_pathlib:
    """Replace `Path.open` with `smart_open.open`

    The replacement is installed as soon as the object is constructed.  When
    used as a context manager, the previous implementation is put back on
    exit from the ``with`` block.
    """

    def __init__(self):
        # Install this module's open and remember what it replaced.
        previous = _patch_pathlib(open)
        self.old_impl = previous

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Put the remembered implementation back; exceptions are not suppressed.
        _patch_pathlib(self.old_impl)
459def _patch_pathlib(func):
460 """Replace `Path.open` with `func`"""
461 old_impl = pathlib.Path.open
462 pathlib.Path.open = func
463 return old_impl
def smart_open(
    uri,
    mode='rb',
    buffering=-1,
    encoding=None,
    errors=None,
    newline=None,
    closefd=True,
    opener=None,
    ignore_extension=False,
    **kwargs
):
    """Deprecated thin wrapper around :func:`open`, kept for backward compatibility.

    It works exactly like ``open`` when the passed parameters are identical,
    and always emits a DeprecationWarning.  The main differences of this old
    entry point are:

    1. Default mode is read binary (mode='rb')
    2. The compression parameter is called ignore_extension
    3. Transport parameters used to be passed directly as kwargs; that is no
       longer supported and is rejected with an exception

    :param uri: Passed through to ``open``.
    :param str mode: Passed through to ``open``.
    :param int buffering: Passed through to ``open``.
    :param str encoding: Passed through to ``open``.
    :param str errors: Passed through to ``open``.
    :param str newline: Passed through to ``open``.
    :param bool closefd: Passed through to ``open``.
    :param opener: Passed through to ``open``.
    :param bool ignore_extension: If true, compression is disabled; otherwise
        it is inferred from the file extension.
    :param kwargs: Not supported.  Any extra keyword raises DeprecationWarning.
    :returns: Whatever ``open`` returns.
    :raises DeprecationWarning: If any unsupported keyword argument is given.
    """
    url = 'https://github.com/RaRe-Technologies/smart_open/blob/develop/MIGRATING_FROM_OLDER_VERSIONS.rst'
    if kwargs:
        raise DeprecationWarning(
            'The following keyword parameters are not supported: %r. '
            'See %s for more information.' % (sorted(kwargs), url)
        )
    message = 'This function is deprecated. See %s for more information' % url
    #
    # stacklevel=2 attributes the warning to the code that called smart_open,
    # not to this wrapper, so the user is pointed at the line to migrate.
    #
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)

    if ignore_extension:
        compression = so_compression.NO_COMPRESSION
    else:
        compression = so_compression.INFER_FROM_EXTENSION

    # Forward every argument by name rather than via locals().
    return open(
        uri=uri,
        mode=mode,
        buffering=buffering,
        encoding=encoding,
        errors=errors,
        newline=newline,
        closefd=closefd,
        opener=opener,
        compression=compression,
    )
#
# Docstring tweaking is a nicety.  A failure here must not take down the whole
# library.  None is expected, but contributed modules (e.g. new transport
# mechanisms) may not be as polished, so anything raised is logged instead.
#
try:
    doctools.tweak_open_docstring(open)
    doctools.tweak_parse_uri_docstring(parse_uri)
except Exception as err:
    logger.error(
        'Encountered a non-fatal error while building docstrings (see below). '
        'help(smart_open) will provide incomplete information as a result. '
        'For full help text, see <https://github.com/RaRe-Technologies/'
        'smart_open/blob/master/help.txt>.'
    )
    logger.exception(err)