1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2019 Radim Rehurek <me@radimrehurek.com>
4#
5# This code is distributed under the terms and conditions
6# from the MIT License (MIT).
7#
8
9"""Implements the majority of smart_open's top-level API."""
10
11import collections
12import locale
13import logging
14import os
15import os.path as P
16import pathlib
17import urllib.parse
18import warnings
19
20#
# This module defines a function called smart_open, so we cannot use
# smart_open.submodule to refer to the submodules.
23#
24import smart_open.local_file as so_file
25import smart_open.compression as so_compression
26import smart_open.utils as so_utils
27
28from smart_open import doctools
29from smart_open import transport
30
31#
32# For backwards compatibility and keeping old unit tests happy.
33#
34from smart_open.compression import register_compressor # noqa: F401
35from smart_open.utils import check_kwargs as _check_kwargs # noqa: F401
36from smart_open.utils import inspect_kwargs as _inspect_kwargs # noqa: F401
37
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Text encoding used when the caller of open() does not specify one.
# do_setlocale=False asks for the preferred encoding without touching the
# process-wide locale settings.
DEFAULT_ENCODING = locale.getpreferredencoding(do_setlocale=False)
41
42
def _sniff_scheme(uri_as_string):
    """Return only the scheme component of the given URL, as a string."""
    #
    # On Windows, urlsplit mistakes the drive letter for the scheme, so a
    # string that names no protocol is treated as a local file there.
    #
    on_windows = os.name == 'nt'
    if on_windows and '://' not in uri_as_string:
        uri_as_string = 'file://' + uri_as_string

    split_result = urllib.parse.urlsplit(uri_as_string)
    return split_result.scheme
53
54
def parse_uri(uri_as_string):
    """
    Parse the given URI from a string.

    The scheme of the URI selects the transport submodule, and that
    submodule does the actual parsing.

    Parameters
    ----------
    uri_as_string: str
        The URI to parse.

    Returns
    -------
    collections.namedtuple
        The parsed URI.  The field names are the (sorted) keys returned by
        the transport submodule.

    Notes
    -----
    smart_open/doctools.py magic goes here
    """
    transport_module = transport.get_transport(_sniff_scheme(uri_as_string))
    fields = transport_module.parse_uri(uri_as_string)

    #
    # The transport hands back a dict.  It is converted to a namedtuple only
    # to keep the old tests happy while the refactoring is in progress.
    #
    uri_type = collections.namedtuple('Uri', sorted(fields.keys()))
    return uri_type(**fields)
83
84
#
# Alias kept so that old unit tests keep working during the refactoring.
#
_parse_uri = parse_uri

#
# Keep a handle on the built-in open before the open() function defined
# in this module shadows the name.
#
_builtin_open = open
91
92
def open(
        uri,
        mode='r',
        buffering=-1,
        encoding=None,
        errors=None,
        newline=None,
        closefd=True,
        opener=None,
        compression=so_compression.INFER_FROM_EXTENSION,
        transport_params=None,
        ):
    r"""Open the URI object, returning a file-like object.

    The URI is usually a string in a variety of formats.
    For a full list of examples, see the :func:`parse_uri` function.

    The URI may also be one of:

    - an instance of the pathlib.Path class
    - a stream (anything that implements io.IOBase-like functionality)

    Parameters
    ----------
    uri: str or object
        The object to open.
    mode: str, optional
        Mimics built-in open parameter of the same name.
    buffering: int, optional
        Mimics built-in open parameter of the same name.
    encoding: str, optional
        Mimics built-in open parameter of the same name.
    errors: str, optional
        Mimics built-in open parameter of the same name.
    newline: str, optional
        Mimics built-in open parameter of the same name.
    closefd: boolean, optional
        Mimics built-in open parameter of the same name.  Ignored.
    opener: object, optional
        Mimics built-in open parameter of the same name.  Ignored.
    compression: str, optional (see smart_open.compression.get_supported_compression_types)
        Explicitly specify the compression/decompression behavior.
    transport_params: dict, optional
        Additional parameters for the transport layer (see notes below).

    Returns
    -------
    A file-like object.

    Raises
    ------
    TypeError
        If ``mode`` is not a string.
    ValueError
        If ``compression`` is not one of the supported compression types.
    NotImplementedError
        If ``mode`` cannot be translated to a supported binary mode.  The
        underlying ValueError is attached as the cause.

    Notes
    -----
    smart_open has several implementations for its transport layer (e.g. S3, HTTP).
    Each transport layer has a different set of keyword arguments for overriding
    default behavior.  If you specify a keyword argument that is *not* supported
    by the transport layer being used, smart_open will ignore that argument and
    log a warning message.

    smart_open/doctools.py magic goes here

    See Also
    --------
    - `Standard library reference <https://docs.python.org/3.13/library/functions.html#open>`__
    - `smart_open README.rst
      <https://github.com/piskvorky/smart_open/blob/master/README.rst>`__

    """
    # Must stay the first statement so that locals() holds only the arguments.
    logger.debug('%r', locals())

    if not isinstance(mode, str):
        raise TypeError('mode should be a string')

    if compression not in so_compression.get_supported_compression_types():
        raise ValueError(f'invalid compression type: {compression}')

    if transport_params is None:
        transport_params = {}

    #
    # Fast path: uncompressed local files go straight to the built-in open.
    #
    fobj = _shortcut_open(
        uri,
        mode,
        compression=compression,
        buffering=buffering,
        encoding=encoding,
        errors=errors,
        newline=newline,
    )
    if fobj is not None:
        return fobj

    #
    # This is a work-around for the problem described in Issue #144.
    # If the user has explicitly specified an encoding, then assume they want
    # us to open the destination in text mode, instead of the default binary.
    #
    # If we change the default mode to be text, and match the normal behavior
    # of Py2 and 3, then the above assumption will be unnecessary.
    #
    if encoding is not None and 'b' in mode:
        mode = mode.replace('b', '')

    if isinstance(uri, pathlib.Path):
        uri = str(uri)

    explicit_encoding = encoding
    encoding = explicit_encoding if explicit_encoding else DEFAULT_ENCODING

    #
    # This is how we get from the filename to the end result.  Decompression is
    # optional, but it always accepts bytes and returns bytes.
    #
    # Decoding is also optional, accepts bytes and returns text.  The diagram
    # below is for reading, for writing, the flow is from right to left, but
    # the code is identical.
    #
    #           open as binary         decompress?          decode?
    # filename ---------------> bytes -------------> bytes ---------> text
    #                          binary             decompressed       decode
    #

    try:
        binary_mode = _get_binary_mode(mode)
    except ValueError as ve:
        #
        # NotImplementedError is kept for compatibility with callers; chain
        # the original error so the traceback shows the real cause.
        #
        raise NotImplementedError(ve.args[0]) from ve

    binary = _open_binary_stream(uri, binary_mode, transport_params)
    filename = (
        binary.name
        # if name attribute is not string-like (e.g. ftp socket fileno)...
        if isinstance(getattr(binary, "name", None), (str, bytes))
        # ...fall back to uri
        else uri
    )
    decompressed = so_compression.compression_wrapper(
        binary,
        binary_mode,
        compression,
        filename=filename,
    )

    if 'b' not in mode or explicit_encoding is not None:
        decoded = _encoding_wrapper(
            decompressed,
            mode,
            encoding=encoding,
            errors=errors,
            newline=newline,
        )
    else:
        decoded = decompressed

    #
    # There are some useful methods in the binary readers, e.g. to_boto3, that get
    # hidden by the multiple layers of wrapping we just performed.  Promote
    # them so they are visible to the user.
    #
    # The question here is whether any wrapping happened at all, so compare
    # object identity rather than relying on the streams' equality operators.
    #
    if decoded is not binary:
        promoted_attrs = ['to_boto3']
        for attr in promoted_attrs:
            try:
                setattr(decoded, attr, getattr(binary, attr))
            except AttributeError:
                # Best effort: not every binary stream offers these attributes.
                pass

    return so_utils.FileLikeProxy(decoded, binary)
257
258
def _get_binary_mode(mode_str):
    """Translate a user-supplied mode into the equivalent binary mode.

    The result always has the shape ``<r|w|a>b[+]``, whatever the order of
    the characters in the input.  A ValueError is raised for modes that
    combine text and binary, name more than one of r/w/a, name none of
    them, or contain any other leftover character.
    """
    #
    # https://docs.python.org/3/library/functions.html#open
    #
    # The order of characters in the mode parameter appears to be unspecified.
    # The implementation follows the examples, just to be safe.
    #
    remaining = list(mode_str)

    if 't' in remaining and 'b' in remaining:
        raise ValueError("can't have text and binary mode at once")

    if sum(remaining.count(flag) for flag in 'rwa') > 1:
        raise ValueError("must have exactly one of create/read/write/append mode")

    # At most one of a/w/r can be present at this point; pick it up.
    primary = next((flag for flag in 'awr' if flag in remaining), None)
    if primary is None:
        raise ValueError(
            "Must have exactly one of create/read/write/append "
            "mode and at most one plus"
        )
    remaining.remove(primary)

    # Consume a single explicit 'b' or 't'; the result is binary either way.
    for flag in 'bt':
        if flag in remaining:
            remaining.remove(flag)
            break
    result = primary + 'b'

    if '+' in remaining:
        remaining.remove('+')
        result += '+'

    #
    # Anything still left over means the input mode was invalid (or this
    # function has missed a case).
    #
    if remaining:
        raise ValueError('invalid mode: %r' % mode_str)

    return result
311
312
def _shortcut_open(
        uri,
        mode,
        compression,
        buffering=-1,
        encoding=None,
        errors=None,
        newline=None,
        ):
    """Try to open the URI directly with the built-in open function.

    Skipping the binary-then-decode pipeline can be much faster, but it is
    only possible when both of these hold:

    1. The URI is a string that refers to a local file; and
    2. No compression is involved (either disabled explicitly, or inferred
       from an extension that is not a supported compression extension).

    Returns None whenever the shortcut cannot be taken.

    :param str uri: A string indicating what to open.
    :param str mode: The mode to pass to the open function.
    :param str compression: The compression type selected.
    :returns: The opened file
    :rtype: file
    """
    if not isinstance(uri, str):
        return None

    if _sniff_scheme(uri) not in (transport.NO_SCHEME, so_file.SCHEME):
        return None

    local_path = so_file.extract_local_path(uri)

    inferring = compression == so_compression.INFER_FROM_EXTENSION
    if inferring:
        extension = P.splitext(local_path)[1]
        if extension in so_compression.get_supported_extensions():
            return None
    elif compression != so_compression.NO_COMPRESSION:
        return None

    text_kwargs = {}
    if encoding is not None:
        # An explicit encoding implies text mode.
        text_kwargs['encoding'] = encoding
        mode = mode.replace('b', '')
    if newline is not None:
        text_kwargs['newline'] = newline

    #
    # The built-in open refuses an errors argument in binary mode, so only
    # forward it for text mode.
    #
    if errors and 'b' not in mode:
        text_kwargs['errors'] = errors

    return _builtin_open(local_path, mode, buffering=buffering, **text_kwargs)
369
370
def _open_binary_stream(uri, mode, transport_params):
    """Open an arbitrary URI in the specified binary mode.

    Not all modes are supported for all protocols.

    :arg uri: The URI to open.  May be a string or an integer file descriptor.
    :arg str mode: The mode to open with.  Must be one of rb, wb, ab or
        their '+' variants.
    :arg transport_params: Keyword arguments for the transport layer.
    :returns: A named file object
    :rtype: file-like object with a .name attribute
    :raises NotImplementedError: if the mode is not one of the accepted modes.
    :raises TypeError: if the uri is neither an int nor a str.
    """
    supported_modes = ('rb', 'rb+', 'wb', 'wb+', 'ab', 'ab+')
    if mode not in supported_modes:
        #
        # A ValueError would be more accurate, but older versions raised
        # NotImplementedError, so that is kept for compatibility.
        #
        raise NotImplementedError('unsupported mode: %r' % mode)

    if isinstance(uri, int):
        #
        # A file descriptor.  The opened object's name is just the integer,
        # which isn't helpful, and there's no easy cross-platform way to get
        # from a descriptor to a filename.  So we give up here: the user has
        # to handle their own compression, etc. explicitly.
        #
        return _builtin_open(uri, mode, closefd=False)

    if not isinstance(uri, str):
        raise TypeError("don't know how to handle uri %s" % repr(uri))

    transport_module = transport.get_transport(_sniff_scheme(uri))
    stream = transport_module.open_uri(uri, mode, transport_params)
    if not hasattr(stream, 'name'):
        # Give the stream a name when the transport didn't.
        stream.name = uri

    return stream
410
411
def _encoding_wrapper(fileobj, mode, encoding=None, errors=None, newline=None):
    """Wrap a binary stream so that it reads/writes text, if necessary.

    A binary mode without an encoding leaves the stream untouched.  An
    explicit encoding always implies text, even when the mode is binary.

    :arg fileobj: must quack like a filehandle object.
    :arg str mode: is the mode which was originally requested by the user.
    :arg str encoding: The text encoding to use.  If mode is binary, overrides mode.
    :arg str errors: The method to use when handling encoding/decoding errors.
    :arg str newline: Passed through to the text wrapper.
    :returns: a file object
    """
    logger.debug('encoding_wrapper: %r', locals())

    #
    # If the mode is binary but the user specified an encoding, assume they
    # want text.  Without that assumption we would ignore the encoding and
    # return bytes, and diverge from the built-in open:
    #
    #   open(filename, encoding='utf-8') returns a text stream in Py3
    #   smart_open(filename, encoding='utf-8') would return a byte stream
    #   without our assumption, because the default mode is rb.
    #
    binary_requested = 'b' in mode
    if encoding is None:
        if binary_requested:
            return fileobj
        encoding = DEFAULT_ENCODING

    return so_utils.TextIOWrapper(
        fileobj,
        encoding=encoding,
        errors=errors,
        newline=newline,
        write_through=True,
    )
449
450
class patch_pathlib:
    """Replace `Path.open` with `smart_open.open`

    The replacement is installed as soon as the object is constructed.
    Used as a context manager, the previous implementation is put back
    when the ``with`` block exits.
    """

    def __init__(self):
        # Install the patch right away, remembering what was there before.
        previous_open = _patch_pathlib(open)
        self.old_impl = previous_open

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore the remembered implementation; exceptions are not suppressed.
        _patch_pathlib(self.old_impl)
462
463
def _patch_pathlib(func):
    """Replace `Path.open` with `func` and return the previous implementation."""
    path_cls = pathlib.Path
    previous = path_cls.open
    path_cls.open = func
    return previous
469
470
def smart_open(
        uri,
        mode='rb',
        buffering=-1,
        encoding=None,
        errors=None,
        newline=None,
        closefd=True,
        opener=None,
        ignore_extension=False,
        **kwargs
        ):
    """Deprecated thin wrapper around :func:`open`, kept for backward compatibility.

    It behaves like ``open`` when given the same parameters, and always
    emits a DeprecationWarning.  The differences from ``open`` are:

    1. The default mode is read binary (mode='rb').
    2. Compression is controlled by ``ignore_extension`` instead of
       ``compression``: True disables compression, False infers it from the
       file extension.
    3. Transport parameters used to be passed directly as keyword arguments.
       That is no longer supported, so any extra keyword argument is rejected.

    :raises DeprecationWarning: if any unsupported keyword argument is given.
    :returns: whatever :func:`open` returns for the translated arguments.
    """
    url = 'https://github.com/piskvorky/smart_open/blob/develop/MIGRATING_FROM_OLDER_VERSIONS.rst'
    if kwargs:
        raise DeprecationWarning(
            'The following keyword parameters are not supported: %r. '
            'See %s for more information.' % (sorted(kwargs), url)
        )

    #
    # stacklevel=2 attributes the warning to the code that called smart_open.
    # Without it the warning points at this module, and the default filters
    # hide DeprecationWarnings that are not triggered from user code.
    #
    message = 'This function is deprecated. See %s for more information' % url
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)

    if ignore_extension:
        compression = so_compression.NO_COMPRESSION
    else:
        compression = so_compression.INFER_FROM_EXTENSION

    #
    # Forward explicitly instead of via locals(), so that adding a local
    # variable to this function can never leak into the call.
    #
    return open(
        uri=uri,
        mode=mode,
        buffering=buffering,
        encoding=encoding,
        errors=errors,
        newline=newline,
        closefd=closefd,
        opener=opener,
        compression=compression,
    )
509
510
511#
512# Prevent failures with doctools from messing up the entire library. We don't
513# expect such failures, but contributed modules (e.g. new transport mechanisms)
514# may not be as polished.
515#
try:
    # Let doctools rewrite the docstrings of the two public entry points
    # (presumably filling in their "magic goes here" placeholder lines).
    doctools.tweak_open_docstring(open)
    doctools.tweak_parse_uri_docstring(parse_uri)
except Exception as ex:
    # Deliberately broad: a documentation failure is logged, never raised,
    # so that importing the module still succeeds.
    logger.error(
        'Encountered a non-fatal error while building docstrings (see below). '
        'help(smart_open) will provide incomplete information as a result. '
        'For full help text, see '
        '<https://github.com/piskvorky/smart_open/blob/master/help.txt>.'
    )
    logger.exception(ex)