Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/smart_open/smart_open_lib.py: 67%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

168 statements  

1# 

2# Copyright (C) 2019 Radim Rehurek <me@radimrehurek.com> 

3# 

4# This code is distributed under the terms and conditions 

5# from the MIT License (MIT). 

6# 

7 

8"""Implements the majority of smart_open's top-level API.""" 

9 

10from __future__ import annotations 

11 

12import collections 

13import contextlib 

14import locale 

15import logging 

16import os 

17import os.path 

18import pathlib 

19import urllib.parse 

20from typing import IO, TYPE_CHECKING, Any, BinaryIO, Literal, TextIO, cast, overload 

21 

22import smart_open.compression as so_compression 

23 

24# 

25# This module defines a function called smart_open so we cannot use 

26# smart_open.submodule to refer to the submodules. 

27# 

28import smart_open.local_file as so_file 

29import smart_open.utils as so_utils 

30from smart_open import doctools, transport 

31 

32if TYPE_CHECKING: 

33 from collections.abc import Callable 

34 

35 from typing_extensions import Self 

36 

37 from smart_open._typing import CompressionKwargs, TransportParams, Uri 

38 

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Fallback text encoding used further down whenever the caller does not pass
# an explicit ``encoding``.  Queried with ``do_setlocale=False``.
DEFAULT_ENCODING = locale.getpreferredencoding(do_setlocale=False)

42 

43 

def _sniff_scheme(uri_as_string: str) -> str:
    """Return only the scheme component of the given URI string.

    Args:
        uri_as_string: The URI (or plain local path) to inspect.

    Returns:
        The scheme as reported by ``urllib.parse.urlsplit``.
    """
    #
    # urlsplit misbehaves on Windows paths: it takes the drive letter for the
    # scheme.  So on Windows, anything without an explicit protocol is treated
    # as a local file and gets a ``file://`` prefix before splitting.
    #
    on_windows = os.name == "nt"
    has_protocol = "://" in uri_as_string
    candidate = f"file://{uri_as_string}" if on_windows and not has_protocol else uri_as_string

    split_result = urllib.parse.urlsplit(candidate)
    return split_result.scheme

54 

55 

def parse_uri(uri_as_string: str) -> tuple[Any, ...]:
    """Parse the given URI from a string.

    Args:
        uri_as_string: The URI to parse.

    Returns:
        The parsed URI as a ``collections.namedtuple``.

    smart_open/doctools.py magic goes here
    """
    # NOTE(review): the "magic goes here" docstring line looks like a placeholder
    # consumed by doctools at import time (see the tweak_*_docstring calls at the
    # bottom of this module) -- keep it verbatim.

    # The scheme selects the transport; the transport does the actual parsing
    # and hands back a mapping of field name -> value.
    parsed_fields = transport.get_transport(_sniff_scheme(uri_as_string)).parse_uri(uri_as_string)

    #
    # The conversion to a namedtuple is just to keep the old tests happy while
    # the refactoring is still in progress.  Field names are sorted, so the
    # tuple layout is alphabetical.
    #
    field_names = sorted(parsed_fields)
    uri_type = collections.namedtuple("Uri", field_names)  # noqa: PYI024  # legacy public type
    return uri_type(**parsed_fields)

77 

78 

#
# To keep old unit tests happy while I'm refactoring.
#
_parse_uri = parse_uri

# Keep a handle on the built-in open() under a private name: the ``def open``
# that follows rebinds the module-level name ``open`` to this module's own
# function, and the helpers below still need the original.
_builtin_open = open

85 

86 

# Typing overload: text modes (the default) yield a text stream.
@overload
def open(
    uri: Uri,
    mode: Literal["r", "w", "a", "x", "r+", "w+", "a+", "rt", "wt", "at", "xt"] = ...,
    buffering: int = ...,
    encoding: str | None = ...,
    errors: str | None = ...,
    newline: str | None = ...,
    closefd: bool = ...,  # noqa: FBT001  # public API
    opener: Callable[[str, int], int] | None = ...,
    compression: str = ...,
    compression_kwargs: CompressionKwargs | None = ...,
    transport_params: TransportParams | None = ...,
) -> TextIO: ...


# Typing overload: an explicit binary mode with no encoding yields a binary stream.
@overload
def open(
    uri: Uri,
    mode: Literal["rb", "wb", "ab", "xb", "rb+", "wb+", "ab+", "br", "bw", "ba"],
    buffering: int = ...,
    *,
    encoding: None = ...,
    errors: str | None = ...,
    newline: str | None = ...,
    closefd: bool = ...,
    opener: Callable[[str, int], int] | None = ...,
    compression: str = ...,
    compression_kwargs: CompressionKwargs | None = ...,
    transport_params: TransportParams | None = ...,
) -> BinaryIO: ...


# Typing overload: any other mode string; the stream kind is not known statically.
@overload
def open(
    uri: Uri,
    mode: str = ...,
    buffering: int = ...,
    encoding: str | None = ...,
    errors: str | None = ...,
    newline: str | None = ...,
    closefd: bool = ...,  # noqa: FBT001  # public API
    opener: Callable[[str, int], int] | None = ...,
    compression: str = ...,
    compression_kwargs: CompressionKwargs | None = ...,
    transport_params: TransportParams | None = ...,
) -> IO[Any]: ...


def open(  # noqa: C901, PLR0913  # legacy public API; refactor in a dedicated PR
    uri: Uri,
    mode: str = "r",
    buffering: int = -1,
    encoding: str | None = None,
    errors: str | None = None,
    newline: str | None = None,
    closefd: bool = True,  # noqa: FBT001, FBT002  # public API
    opener: Callable[[str, int], int] | None = None,
    compression: str = so_compression.INFER_FROM_EXTENSION,
    compression_kwargs: CompressionKwargs | None = None,
    transport_params: TransportParams | None = None,
) -> IO[Any]:
    r"""Open the URI object, returning a file-like object.

    The URI is usually a string in a variety of formats.
    For a full list of examples, see the :func:`parse_uri` function.

    The URI may also be one of:

    - an instance of the pathlib.Path class
    - a stream (anything that implements io.IOBase-like functionality)

    Args:
        uri: The object to open.
        mode: Mimics built-in open parameter of the same name.
        buffering: Mimics built-in open parameter of the same name.
        encoding: Mimics built-in open parameter of the same name.
        errors: Mimics built-in open parameter of the same name.
        newline: Mimics built-in open parameter of the same name.
        closefd: Mimics built-in open parameter of the same name. Ignored.
        opener: Mimics built-in open parameter of the same name. Ignored.
        compression: Explicitly specify the compression/decompression behavior.
            See ``smart_open.compression.get_supported_compression_types``.
        compression_kwargs: Keyword arguments forwarded to the registered
            compressor callback. Examples of each library's max-compression
            option: ``{'compresslevel': 9}`` for .gz/.bz2, ``{'preset': 9}`` for
            .xz, ``{'level': 22}`` for .zst, ``{'compression_level': 12}`` for
            .lz4. Ignored when compression is 'disable' or the URI's extension
            doesn't match a registered compressor.
        transport_params: Additional parameters for the transport layer (see
            notes below).

    Returns:
        A file-like object.

    Raises:
        TypeError: If ``mode`` is not a string or if the URI type is not
            recognized.
        ValueError: If ``compression`` is not a supported value.
        NotImplementedError: If ``mode`` cannot be parsed into a valid binary
            mode.

    Note:
        smart_open has several implementations for its transport layer
        (e.g. S3, HTTP). Each transport layer has a different set of keyword
        arguments for overriding default behavior. If you specify a keyword
        argument that is *not* supported by the transport layer being used,
        smart_open will ignore that argument and log a warning message.

    smart_open/doctools.py magic goes here

    See Also:
        - `Standard library reference <https://docs.python.org/3.14/library/functions.html#open>`__
        - `smart_open README.md
          <https://github.com/piskvorky/smart_open/blob/master/README.md>`__
    """
    # Must stay the first statement: it logs exactly the call arguments.
    logger.debug("%r", locals())

    if not isinstance(mode, str):
        msg = "mode should be a string"
        raise TypeError(msg)

    if compression not in so_compression.get_supported_compression_types():
        msg = f"invalid compression type: {compression}"
        raise ValueError(msg)

    if transport_params is None:
        transport_params = {}

    # Fast path: a plain local file without compression goes straight to the
    # built-in open.  ``None`` means the shortcut does not apply.
    fobj = _shortcut_open(
        uri,
        mode,
        compression=compression,
        buffering=buffering,
        encoding=encoding,
        errors=errors,
        newline=newline,
    )
    if fobj is not None:
        return fobj

    #
    # This is a work-around for the problem described in Issue #144.
    # If the user has explicitly specified an encoding, then assume they want
    # us to open the destination in text mode, instead of the default binary.
    #
    # If we change the default mode to be text, and match the normal behavior
    # of Py2 and 3, then the above assumption will be unnecessary.
    #
    if encoding is not None and "b" in mode:
        mode = mode.replace("b", "")

    if isinstance(uri, pathlib.Path):
        uri = str(uri)

    explicit_encoding = encoding
    encoding = explicit_encoding or DEFAULT_ENCODING

    #
    # This is how we get from the filename to the end result.  Decompression is
    # optional, but it always accepts bytes and returns bytes.
    #
    # Decoding is also optional, accepts bytes and returns text.  The diagram
    # below is for reading, for writing, the flow is from right to left, but
    # the code is identical.
    #
    #           open as binary         decompress?          decode?
    # filename ---------------> bytes -------------> bytes ---------> text
    #                          binary             decompressed       decode
    #

    try:
        binary_mode = _get_binary_mode(mode)
    except ValueError as ve:
        # Callers of this public API expect NotImplementedError for bad modes.
        raise NotImplementedError(ve.args[0]) from ve

    binary = _open_binary_stream(uri, binary_mode, transport_params)
    # NOTE(review): if either wrapping step below raises, ``binary`` is left
    # open.  Closing it here could finalize a partial write on some transports,
    # so confirm the transport semantics before adding cleanup.

    # Prefer the stream's own name; if it's not string-like (e.g. ftp socket
    # fileno), fall back to the uri, and failing that to no filename at all.
    name = getattr(binary, "name", None)
    if isinstance(name, str):
        filename = name
    elif isinstance(uri, str):
        filename = uri
    else:
        filename = None

    decompressed = so_compression.compression_wrapper(
        binary,
        binary_mode,
        compression,
        filename=filename,
        compression_kwargs=compression_kwargs,
    )

    if "b" not in mode or explicit_encoding is not None:
        decoded = _encoding_wrapper(
            decompressed,
            mode,
            encoding=encoding,
            errors=errors,
            newline=newline,
        )
    else:
        decoded = decompressed

    #
    # There are some useful methods in the binary readers, e.g. to_boto3, that get
    # hidden by the multiple layers of wrapping we just performed.  Promote
    # them so they are visible to the user.
    #
    # The question here is "did we wrap at all?", which is an identity check;
    # ``!=`` would defer to whatever __eq__/__ne__ the stream objects define.
    #
    if decoded is not binary:
        promoted_attrs = ["to_boto3"]
        for attr in promoted_attrs:
            # Not every binary reader has these attributes; skip the missing ones.
            with contextlib.suppress(AttributeError):
                setattr(decoded, attr, getattr(binary, attr))

    return cast("IO[Any]", so_utils.FileLikeProxy(decoded, binary))

298 

299 

def _get_binary_mode(mode_str: str) -> str:  # noqa: C901  # legacy internal helper; refactor in a dedicated PR
    """Translate a user-supplied mode string into the matching binary mode.

    Args:
        mode_str: A mode such as ``"r"``, ``"wt"``, ``"rb+"`` or ``"+a"``;
            the order of the characters does not matter.

    Returns:
        The access flag (``r``, ``w`` or ``a``), followed by ``b``, followed
        by ``+`` if the input contained one.

    Raises:
        ValueError: If the mode mixes text and binary, has more than one or
            none of ``r``/``w``/``a``, or contains anything left over once one
            access flag, one text/binary marker and one plus are consumed.
    """
    #
    # https://docs.python.org/3/library/functions.html#open
    #
    # The order of characters in the mode parameter appears to be unspecified,
    # so tally the flags rather than scanning positions.  The output order
    # follows the examples in the docs, just to be safe.
    #
    pending = collections.Counter(mode_str)

    if pending["t"] and pending["b"]:
        msg = "can't have text and binary mode at once"
        raise ValueError(msg)

    if pending["r"] + pending["w"] + pending["a"] > 1:
        msg = "must have exactly one of create/read/write/append mode"
        raise ValueError(msg)

    access = next((flag for flag in "awr" if pending[flag]), None)
    if access is None:
        msg = "Must have exactly one of create/read/write/append mode and at most one plus"
        raise ValueError(msg)
    pending[access] -= 1

    # Use up a single text/binary marker if there is one; the result is
    # binary regardless of which (if any) was given.
    marker = "b" if pending["b"] else "t"
    if pending[marker]:
        pending[marker] -= 1

    plus = ""
    if pending["+"]:
        pending["+"] -= 1
        plus = "+"

    #
    # Every count should be back at zero by now.  Anything still outstanding
    # means the original input mode is invalid (or this function missed a case).
    #
    if any(pending.values()):
        msg = f"invalid mode: {mode_str!r}"
        raise ValueError(msg)

    return f"{access}b{plus}"

353 

354 

def _shortcut_open(  # noqa: PLR0913  # legacy internal helper; refactor in a dedicated PR
    uri: Uri,
    mode: str,
    compression: str,
    buffering: int = -1,
    encoding: str | None = None,
    errors: str | None = None,
    newline: str | None = None,
) -> IO[Any] | None:
    """Open the URI with the built-in open function when that is possible.

    Going through the built-in open can be much faster than opening in binary
    mode and decoding afterwards.  The shortcut is taken only when:

    1. the URI is a string naming a local file; and
    2. no compression is involved -- either compression is disabled, or it
       is inferred from the extension and the extension is not a supported
       compressed one.

    Args:
        uri: A string indicating what to open.
        mode: The mode to pass to the open function.
        compression: The compression type selected.
        buffering: Mimicks built-in open parameter of the same name.
        encoding: Mimicks built-in open parameter of the same name.
        errors: Mimicks built-in open parameter of the same name.
        newline: Mimicks built-in open parameter of the same name.

    Returns:
        The opened file, or None if no shortcut is possible.
    """
    if not isinstance(uri, str):
        return None

    if _sniff_scheme(uri) not in (transport.NO_SCHEME, so_file.SCHEME):
        return None

    local_path = so_file.extract_local_path(uri)

    if compression == so_compression.INFER_FROM_EXTENSION:
        if pathlib.Path(local_path).suffix in so_compression.get_supported_extensions():
            return None
    elif compression != so_compression.NO_COMPRESSION:
        return None

    # An explicit encoding implies text mode, so the binary flag is dropped.
    if encoding is not None:
        mode = mode.replace("b", "")

    # Forward only the optional arguments the caller actually supplied.
    optional = {"encoding": encoding, "newline": newline}
    open_kwargs: dict[str, Any] = {key: value for key, value in optional.items() if value is not None}

    #
    # binary mode of the builtin/stdlib open function doesn't take an errors argument
    #
    if errors and "b" not in mode:
        open_kwargs["errors"] = errors

    return _builtin_open(local_path, mode, buffering=buffering, **open_kwargs)

418 

419 

def _open_binary_stream(uri: Uri, mode: str, transport_params: TransportParams) -> IO[bytes]:
    """Open an arbitrary URI in the given binary mode.

    Not every protocol supports every mode.

    Args:
        uri: What to open: a string URI, or an integer file descriptor.
        mode: The binary mode: one of rb, wb, ab, optionally followed by ``+``.
        transport_params: Keyword arguments for the transport layer.

    Returns:
        A file-like object with a ``.name`` attribute.

    Raises:
        NotImplementedError: If ``mode`` is not a supported binary mode.
        TypeError: If ``uri`` is not a string or integer file descriptor.
    """
    supported_modes = ("rb", "rb+", "wb", "wb+", "ab", "ab+")
    if mode not in supported_modes:
        #
        # This should really be a ValueError, but for the sake of compatibility
        # with older versions, which raise NotImplementedError, we do the same.
        #
        msg = f"unsupported mode: {mode!r}"
        raise NotImplementedError(msg)

    if isinstance(uri, int):
        #
        # A file descriptor.  Once opened, its name is just the integer value,
        # which isn't helpful, and there's no easy cross-platform way to map a
        # descriptor back to a filename, so we give up here.  The user will
        # have to handle their own compression, etc. explicitly.
        #
        return _builtin_open(uri, mode, closefd=False)

    if isinstance(uri, str):
        # The scheme picks the transport, which then opens the stream.
        stream = transport.get_transport(_sniff_scheme(uri)).open_uri(uri, mode, transport_params)
        # Make sure callers can always read a name off the stream.
        if not hasattr(stream, "name"):
            stream.name = uri
        return stream

    msg = f"don't know how to handle uri {uri!r}"
    raise TypeError(msg)

466 

467 

def _encoding_wrapper(
    fileobj: IO[Any],
    mode: str,
    encoding: str | None = None,
    errors: str | None = None,
    newline: str | None = None,
) -> IO[Any]:
    """Wrap a byte stream so that it reads/writes text, when appropriate.

    A binary mode without an encoding leaves the stream untouched.  Supplying
    an encoding always produces a text stream, even for a binary mode.

    Args:
        fileobj: Must quack like a filehandle object.
        mode: The mode which was originally requested by the user.
        encoding: The text encoding to use. If mode is binary, overrides mode.
        errors: The method to use when handling encoding/decoding errors.
        newline: Forwarded to the text wrapper.

    Returns:
        Either ``fileobj`` itself or a text wrapper around it.
    """
    # Must stay the first statement: it logs exactly the call arguments.
    logger.debug("encoding_wrapper: %r", locals())

    if encoding is None:
        #
        # No encoding given: binary mode means the caller wants raw bytes back.
        #
        # (With an encoding given we never get here, even for a binary mode.
        # That is deliberate: otherwise smart_open would diverge from the
        # built-in open, where open(filename, encoding='utf-8') returns a text
        # stream in Py3, while smart_open(filename, encoding='utf-8') would
        # return a byte stream because the default mode is rb.)
        #
        if "b" in mode:
            return fileobj
        encoding = DEFAULT_ENCODING

    return so_utils.TextIOWrapper(
        fileobj,
        encoding=encoding,
        errors=errors,
        newline=newline,
        write_through=True,
    )

514 

515 

class patch_pathlib:  # noqa: N801  # function-shaped name in public API
    """Replace `Path.open` with `smart_open.open`.

    The replacement is installed as soon as the object is constructed.  Used
    as a context manager, the previous implementation is put back when the
    ``with`` block exits.
    """

    def __init__(self) -> None:
        # Patch eagerly and remember what was replaced so __exit__ can undo it.
        previous_impl = _patch_pathlib(open)
        self.old_impl = previous_impl

    def __enter__(self) -> Self:
        """Return this patcher; the patch is already in effect."""
        return self

    def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None:
        """Reinstall the ``Path.open`` that was active before patching."""
        _patch_pathlib(self.old_impl)

527 

528 

def _patch_pathlib(func: Callable[..., Any]) -> Callable[..., Any]:
    """Install ``func`` as ``pathlib.Path.open``.

    Args:
        func: The callable to use as ``Path.open`` from now on.

    Returns:
        The implementation that was in place before the swap, so the caller
        can restore it later.
    """
    path_cls = pathlib.Path
    replaced = path_cls.open
    path_cls.open = func  # ty: ignore[invalid-assignment]  # intentional monkeypatch
    return replaced

534 

535 

#
# Prevent failures with doctools from messing up the entire library.  We don't
# expect such failures, but contributed modules (e.g. new transport mechanisms)
# may not be as polished.
#
try:
    # Runs once at import time on the two public functions defined above.
    # NOTE(review): presumably these expand the "smart_open/doctools.py magic
    # goes here" line in each docstring -- confirm against doctools.
    doctools.tweak_open_docstring(open)
    doctools.tweak_parse_uri_docstring(parse_uri)
except Exception:
    # Deliberately broad: a docstring-building problem is logged with its
    # traceback and is not re-raised.
    logger.exception(
        "Encountered a non-fatal error while building docstrings (see below). "
        "help(smart_open) will provide incomplete information as a result. "
        "For full help text, see "
        "<https://github.com/piskvorky/smart_open/blob/master/help.txt>."
    )