1#
2# Copyright (C) 2019 Radim Rehurek <me@radimrehurek.com>
3#
4# This code is distributed under the terms and conditions
5# from the MIT License (MIT).
6#
7
8"""Implements the majority of smart_open's top-level API."""
9
10from __future__ import annotations
11
12import collections
13import contextlib
14import locale
15import logging
16import os
17import os.path
18import pathlib
19import urllib.parse
20from typing import IO, TYPE_CHECKING, Any, BinaryIO, Literal, TextIO, cast, overload
21
22import smart_open.compression as so_compression
23
#
# This module defines a function called smart_open, so we cannot use
# smart_open.submodule to refer to the submodules.
#
28import smart_open.local_file as so_file
29import smart_open.utils as so_utils
30from smart_open import doctools, transport
31
32if TYPE_CHECKING:
33 from collections.abc import Callable
34
35 from typing_extensions import Self
36
37 from smart_open._typing import CompressionKwargs, TransportParams, Uri
38
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Text encoding used whenever the caller does not pass an explicit ``encoding``.
# ``do_setlocale=False`` asks for the preference without invoking setlocale().
DEFAULT_ENCODING = locale.getpreferredencoding(do_setlocale=False)
42
43
def _sniff_scheme(uri_as_string: str) -> str:
    """Return only the scheme component of ``uri_as_string``.

    On Windows, ``urlsplit`` would parse a drive letter as the scheme, so a
    string that contains no ``://`` is assumed to be a local file and is given
    an explicit ``file://`` prefix before it is split.
    """
    looks_like_local_path = os.name == "nt" and "://" not in uri_as_string
    target = "file://" + uri_as_string if looks_like_local_path else uri_as_string
    return urllib.parse.urlsplit(target).scheme
54
55
def parse_uri(uri_as_string: str) -> tuple[Any, ...]:
    """Split a URI string into its transport-specific components.

    The scheme of the URI selects the transport submodule, and that submodule
    performs the actual parsing.

    Args:
        uri_as_string: The URI to parse.

    Returns:
        The parsed URI as a ``collections.namedtuple`` called ``Uri``, with
        one field per key returned by the transport's parser (sorted by name).

    smart_open/doctools.py magic goes here
    """
    handler = transport.get_transport(_sniff_scheme(uri_as_string))
    fields = handler.parse_uri(uri_as_string)

    #
    # The transport hands back a dict; it is repackaged as a namedtuple only
    # to keep the old tests happy while the refactoring is in progress.
    #
    uri_type = collections.namedtuple("Uri", sorted(fields))  # noqa: PYI024  # legacy public type
    return uri_type(**fields)
77
78
#
# Private alias for parse_uri, kept to keep old unit tests happy while the
# refactoring is in progress.
#
_parse_uri = parse_uri

# Grab a reference to the built-in ``open`` now: this module defines its own
# ``open`` function further down, which shadows the built-in name from then on.
_builtin_open = open
85
86
# Typing overloads for ``open``. They only inform static type checkers; the
# single runtime implementation follows them.

# Text modes (no "b" in the mode string, and the default): a text stream.
@overload
def open(
    uri: Uri,
    mode: Literal["r", "w", "a", "x", "r+", "w+", "a+", "rt", "wt", "at", "xt"] = ...,
    buffering: int = ...,
    encoding: str | None = ...,
    errors: str | None = ...,
    newline: str | None = ...,
    closefd: bool = ...,  # noqa: FBT001  # public API
    opener: Callable[[str, int], int] | None = ...,
    compression: str = ...,
    compression_kwargs: CompressionKwargs | None = ...,
    transport_params: TransportParams | None = ...,
) -> TextIO: ...


# Binary modes: a binary stream. ``encoding`` is typed as ``None`` here because
# the implementation treats an explicit encoding as a request for text. In this
# overload everything after ``buffering`` is keyword-only.
@overload
def open(
    uri: Uri,
    mode: Literal["rb", "wb", "ab", "xb", "rb+", "wb+", "ab+", "br", "bw", "ba"],
    buffering: int = ...,
    *,
    encoding: None = ...,
    errors: str | None = ...,
    newline: str | None = ...,
    closefd: bool = ...,
    opener: Callable[[str, int], int] | None = ...,
    compression: str = ...,
    compression_kwargs: CompressionKwargs | None = ...,
    transport_params: TransportParams | None = ...,
) -> BinaryIO: ...


# Fallback for mode strings that are not known statically: the result could be
# either a text or a binary stream.
@overload
def open(
    uri: Uri,
    mode: str = ...,
    buffering: int = ...,
    encoding: str | None = ...,
    errors: str | None = ...,
    newline: str | None = ...,
    closefd: bool = ...,  # noqa: FBT001  # public API
    opener: Callable[[str, int], int] | None = ...,
    compression: str = ...,
    compression_kwargs: CompressionKwargs | None = ...,
    transport_params: TransportParams | None = ...,
) -> IO[Any]: ...
134
135
def open(  # noqa: C901, PLR0913  # legacy public API; refactor in a dedicated PR
    uri: Uri,
    mode: str = "r",
    buffering: int = -1,
    encoding: str | None = None,
    errors: str | None = None,
    newline: str | None = None,
    closefd: bool = True,  # noqa: FBT001, FBT002  # public API
    opener: Callable[[str, int], int] | None = None,
    compression: str = so_compression.INFER_FROM_EXTENSION,
    compression_kwargs: CompressionKwargs | None = None,
    transport_params: TransportParams | None = None,
) -> IO[Any]:
    r"""Open the URI object, returning a file-like object.

    The URI is usually a string in a variety of formats.
    For a full list of examples, see the :func:`parse_uri` function.

    The URI may also be one of:

    - an instance of the pathlib.Path class
    - an integer file descriptor

    Any other type of object is rejected with ``TypeError``.

    Args:
        uri: The object to open.
        mode: Mimics built-in open parameter of the same name.
        buffering: Mimics built-in open parameter of the same name.
        encoding: Mimics built-in open parameter of the same name.
        errors: Mimics built-in open parameter of the same name.
        newline: Mimics built-in open parameter of the same name.
        closefd: Mimics built-in open parameter of the same name. Ignored.
        opener: Mimics built-in open parameter of the same name. Ignored.
        compression: Explicitly specify the compression/decompression behavior.
            See ``smart_open.compression.get_supported_compression_types``.
        compression_kwargs: Keyword arguments forwarded to the registered
            compressor callback. Examples of each library's max-compression
            option: ``{'compresslevel': 9}`` for .gz/.bz2, ``{'preset': 9}`` for
            .xz, ``{'level': 22}`` for .zst, ``{'compression_level': 12}`` for
            .lz4. Ignored when compression is 'disable' or the URI's extension
            doesn't match a registered compressor.
        transport_params: Additional parameters for the transport layer (see
            notes below).

    Returns:
        A file-like object.

    Raises:
        TypeError: If ``mode`` is not a string or if the URI type is not
            recognized.
        ValueError: If ``compression`` is not a supported value.
        NotImplementedError: If ``mode`` cannot be parsed into a valid binary
            mode.

    Note:
        smart_open has several implementations for its transport layer
        (e.g. S3, HTTP). Each transport layer has a different set of keyword
        arguments for overriding default behavior. If you specify a keyword
        argument that is *not* supported by the transport layer being used,
        smart_open will ignore that argument and log a warning message.

    smart_open/doctools.py magic goes here

    See Also:
        - `Standard library reference <https://docs.python.org/3.14/library/functions.html#open>`__
        - `smart_open README.md
          <https://github.com/piskvorky/smart_open/blob/master/README.md>`__
    """
    # Must stay the first statement so that locals() holds only the arguments.
    logger.debug("%r", locals())

    if not isinstance(mode, str):
        msg = "mode should be a string"
        raise TypeError(msg)

    if compression not in so_compression.get_supported_compression_types():
        msg = f"invalid compression type: {compression}"
        raise ValueError(msg)

    if transport_params is None:
        transport_params = {}

    # Fast path: plain local files without compression go straight to the
    # built-in open. Only string URIs are eligible; everything else gets None.
    fobj = _shortcut_open(
        uri,
        mode,
        compression=compression,
        buffering=buffering,
        encoding=encoding,
        errors=errors,
        newline=newline,
    )
    if fobj is not None:
        return fobj

    #
    # This is a work-around for the problem described in Issue #144.
    # If the user has explicitly specified an encoding, then assume they want
    # us to open the destination in text mode, instead of the default binary.
    #
    # If we change the default mode to be text, and match the normal behavior
    # of Py2 and 3, then the above assumption will be unnecessary.
    #
    if encoding is not None and "b" in mode:
        mode = mode.replace("b", "")

    if isinstance(uri, pathlib.Path):
        uri = str(uri)

    explicit_encoding = encoding
    encoding = explicit_encoding or DEFAULT_ENCODING

    #
    # This is how we get from the filename to the end result. Decompression is
    # optional, but it always accepts bytes and returns bytes.
    #
    # Decoding is also optional, accepts bytes and returns text. The diagram
    # below is for reading, for writing, the flow is from right to left, but
    # the code is identical.
    #
    #           open as binary         decompress?          decode?
    # filename ---------------> bytes -------------> bytes ---------> text
    #                           binary             decompressed       decode
    #

    try:
        binary_mode = _get_binary_mode(mode)
    except ValueError as ve:
        # Callers have historically seen NotImplementedError for bad modes.
        raise NotImplementedError(ve.args[0]) from ve

    binary = _open_binary_stream(uri, binary_mode, transport_params)
    name = getattr(binary, "name", None)
    # prefer the stream's own name; if it's not string-like (e.g. ftp socket fileno), fall back to uri
    filename = name if isinstance(name, str) else uri if isinstance(uri, str) else None
    decompressed = so_compression.compression_wrapper(
        binary,
        binary_mode,
        compression,
        filename=filename,
        compression_kwargs=compression_kwargs,
    )

    if "b" not in mode or explicit_encoding is not None:
        decoded = _encoding_wrapper(
            decompressed,
            mode,
            encoding=encoding,
            errors=errors,
            newline=newline,
        )
    else:
        decoded = decompressed

    #
    # There are some useful methods in the binary readers, e.g. to_boto3, that get
    # hidden by the multiple layers of wrapping we just performed. Promote
    # them so they are visible to the user.
    #
    # This is an identity check: we only care whether any wrapper was added on
    # top of the binary stream, not whether the two objects compare equal.
    #
    if decoded is not binary:
        promoted_attrs = ["to_boto3"]
        for attr in promoted_attrs:
            # Not every binary stream has these attributes; skip the missing ones.
            with contextlib.suppress(AttributeError):
                setattr(decoded, attr, getattr(binary, attr))

    return cast("IO[Any]", so_utils.FileLikeProxy(decoded, binary))
298
299
def _get_binary_mode(mode_str: str) -> str:  # noqa: C901  # legacy internal helper; refactor in a dedicated PR
    """Translate a user-supplied mode string into its binary equivalent.

    The characters may come in any order (the built-in ``open`` documentation
    at https://docs.python.org/3/library/functions.html#open does not appear
    to specify one). The result is always normalised to the access character
    (``r``, ``w`` or ``a``), then ``b``, then an optional ``+``.

    Args:
        mode_str: The mode as given by the caller, e.g. ``"rt"`` or ``"w+"``.

    Returns:
        The matching binary mode, e.g. ``"rb"`` or ``"wb+"``.

    Raises:
        ValueError: If the mode mixes text and binary, has more than one or
            none of ``r``/``w``/``a``, or contains anything unrecognised.
    """
    leftover = list(mode_str)

    if "t" in leftover and "b" in leftover:
        msg = "can't have text and binary mode at once"
        raise ValueError(msg)

    if sum(leftover.count(flag) for flag in "rwa") > 1:
        msg = "must have exactly one of create/read/write/append mode"
        raise ValueError(msg)

    # At most one of r/w/a can be present at this point; find which one.
    access = next((flag for flag in "awr" if flag in leftover), None)
    if access is None:
        msg = "Must have exactly one of create/read/write/append mode and at most one plus"
        raise ValueError(msg)
    leftover.remove(access)

    # The result is binary no matter what, so a single "b" or "t" is simply
    # consumed. The two were shown to be mutually exclusive above.
    for flag in "bt":
        if flag in leftover:
            leftover.remove(flag)
            break

    plus = ""
    if "+" in leftover:
        leftover.remove("+")
        plus = "+"

    #
    # Every valid character has been consumed by now. Anything still left
    # means the original mode string was invalid (or this function is broken).
    #
    if leftover:
        msg = f"invalid mode: {mode_str!r}"
        raise ValueError(msg)

    return f"{access}b{plus}"
353
354
def _shortcut_open(  # noqa: PLR0913  # legacy internal helper; refactor in a dedicated PR
    uri: Uri,
    mode: str,
    compression: str,
    buffering: int = -1,
    encoding: str | None = None,
    errors: str | None = None,
    newline: str | None = None,
) -> IO[Any] | None:
    """Open a plain local file directly with the built-in ``open``, if possible.

    Going through the built-in function can be much faster than opening in
    binary mode and decoding afterwards. The shortcut is taken only when:

    1. the URI is a string that refers to a local file; and
    2. no compression applies (it is disabled, or it is inferred and the
       file extension is not a supported compression extension).

    Args:
        uri: A string indicating what to open.
        mode: The mode to pass to the open function.
        compression: The compression type selected.
        buffering: Mimics built-in open parameter of the same name.
        encoding: Mimics built-in open parameter of the same name.
        errors: Mimics built-in open parameter of the same name.
        newline: Mimics built-in open parameter of the same name.

    Returns:
        The opened file, or None if no shortcut is possible.
    """
    if not isinstance(uri, str):
        return None

    if _sniff_scheme(uri) not in (transport.NO_SCHEME, so_file.SCHEME):
        return None

    local_path = so_file.extract_local_path(uri)

    if compression == so_compression.INFER_FROM_EXTENSION:
        # Inferring: a recognised compression extension rules the shortcut out.
        suffix = pathlib.Path(local_path).suffix
        needs_codec = suffix in so_compression.get_supported_extensions()
    else:
        # Explicit: only "no compression" is compatible with the shortcut.
        needs_codec = compression != so_compression.NO_COMPRESSION
    if needs_codec:
        return None

    extra: dict[str, Any] = {}
    if encoding is not None:
        # An explicit encoding implies text mode, even if "b" was requested.
        mode = mode.replace("b", "")
        extra["encoding"] = encoding
    if newline is not None:
        extra["newline"] = newline

    #
    # binary mode of the builtin/stdlib open function doesn't take an errors argument
    #
    if errors and "b" not in mode:
        extra["errors"] = errors

    return _builtin_open(local_path, mode, buffering=buffering, **extra)
418
419
def _open_binary_stream(uri: Uri, mode: str, transport_params: TransportParams) -> IO[bytes]:
    """Open an arbitrary URI in the specified binary mode.

    Not all modes are supported for all protocols.

    Args:
        uri: The URI to open. Either a string or an integer file descriptor.
        mode: The mode to open with. Must be one of rb, rb+, wb, wb+, ab
            or ab+.
        transport_params: Keyword arguments for the transport layer.

    Returns:
        A file-like object with a ``.name`` attribute.

    Raises:
        NotImplementedError: If ``mode`` is not a supported binary mode.
        TypeError: If ``uri`` is not a string or integer file descriptor.
    """
    supported_modes = ("rb", "rb+", "wb", "wb+", "ab", "ab+")
    if mode not in supported_modes:
        #
        # This should really be a ValueError, but for the sake of compatibility
        # with older versions, which raise NotImplementedError, we do the same.
        #
        msg = f"unsupported mode: {mode!r}"
        raise NotImplementedError(msg)

    if isinstance(uri, int):
        #
        # A file descriptor. Once opened, its name is just the integer value,
        # which isn't helpful, and there's no easy cross-platform way to get
        # from a descriptor back to a filename. So we give up here: the user
        # will have to handle their own compression, etc. explicitly.
        #
        return _builtin_open(uri, mode, closefd=False)

    if isinstance(uri, str):
        opener = transport.get_transport(_sniff_scheme(uri))
        stream = opener.open_uri(uri, mode, transport_params)
        # Give the stream a name if the transport did not provide one.
        if not hasattr(stream, "name"):
            stream.name = uri
        return stream

    msg = f"don't know how to handle uri {uri!r}"
    raise TypeError(msg)
466
467
def _encoding_wrapper(
    fileobj: IO[Any],
    mode: str,
    encoding: str | None = None,
    errors: str | None = None,
    newline: str | None = None,
) -> IO[Any]:
    """Wrap a binary file object so that it reads and writes text, if needed.

    The object is returned untouched only when the mode is binary *and* no
    encoding was given. A non-null encoding implies text mode.

    Args:
        fileobj: Must quack like a filehandle object.
        mode: The mode which was originally requested by the user.
        encoding: The text encoding to use. If mode is binary, overrides mode.
        errors: The method to use when handling encoding/decoding errors.
        newline: Forwarded to the text wrapper.

    Returns:
        A file object.
    """
    # Keep this first so that locals() holds only the arguments.
    logger.debug("encoding_wrapper: %r", locals())

    #
    # A binary mode combined with an explicit encoding is taken to mean that
    # the caller wants text. Ignoring the encoding and returning bytes instead
    # would make smart_open diverge from the built-in open:
    #
    #   open(filename, encoding='utf-8') returns a text stream in Py3
    #   smart_open(filename, encoding='utf-8') would return a byte stream
    #   without our assumption, because the default mode is rb.
    #
    if "b" in mode and encoding is None:
        return fileobj

    text_encoding = DEFAULT_ENCODING if encoding is None else encoding
    return so_utils.TextIOWrapper(
        fileobj,
        encoding=text_encoding,
        errors=errors,
        newline=newline,
        write_through=True,
    )
514
515
class patch_pathlib:  # noqa: N801  # function-shaped name in public API
    """Replace `Path.open` with `smart_open.open`.

    The replacement happens as soon as the object is constructed, so a bare
    ``patch_pathlib()`` call leaves ``pathlib.Path.open`` patched. When the
    object is used as a context manager, the previous implementation is put
    back when the ``with`` block exits.
    """

    def __init__(self) -> None:
        # Patch right away and remember what was installed before, so that
        # __exit__ can restore it. ``open`` here is this module's function.
        self.old_impl = _patch_pathlib(open)

    def __enter__(self) -> Self:  # noqa: D105
        return self

    def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:  # noqa: D105
        # Restore the saved implementation. Returning None means an exception
        # raised inside the ``with`` block is not suppressed.
        _patch_pathlib(self.old_impl)
527
528
def _patch_pathlib(func: Callable[..., Any]) -> Callable[..., Any]:
    """Install ``func`` as ``pathlib.Path.open`` and return what it replaced."""
    # The right-hand side is evaluated first, so ``previous`` receives the
    # implementation that was in place before the assignment.
    previous, pathlib.Path.open = pathlib.Path.open, func  # ty: ignore[invalid-assignment]  # intentional monkeypatch
    return previous
534
535
#
# Prevent failures with doctools from messing up the entire library. We don't
# expect such failures, but contributed modules (e.g. new transport mechanisms)
# may not be as polished.
#
# This runs once, at import time, on the public ``open`` and ``parse_uri``
# functions defined above.
#
try:
    doctools.tweak_open_docstring(open)
    doctools.tweak_parse_uri_docstring(parse_uri)
except Exception:
    # Deliberately broad: a docstring problem must never stop the module from
    # importing. logger.exception records the traceback with the message.
    logger.exception(
        "Encountered a non-fatal error while building docstrings (see below). "
        "help(smart_open) will provide incomplete information as a result. "
        "For full help text, see "
        "<https://github.com/piskvorky/smart_open/blob/master/help.txt>."
    )