Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/smart_open/smart_open_lib.py: 26%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

167 statements  

1# -*- coding: utf-8 -*- 

2# 

3# Copyright (C) 2019 Radim Rehurek <me@radimrehurek.com> 

4# 

5# This code is distributed under the terms and conditions 

6# from the MIT License (MIT). 

7# 

8 

9"""Implements the majority of smart_open's top-level API.""" 

10 

11import collections 

12import locale 

13import logging 

14import os 

15import os.path as P 

16import pathlib 

17import urllib.parse 

18import warnings 

19 

20# 

21# This module defines a function called smart_open so we cannot use 

22# smart_open.submodule to reference to the submodules. 

23# 

24import smart_open.local_file as so_file 

25import smart_open.compression as so_compression 

26import smart_open.utils as so_utils 

27 

28from smart_open import doctools 

29from smart_open import transport 

30 

31# 

32# For backwards compatibility and keeping old unit tests happy. 

33# 

34from smart_open.compression import register_compressor # noqa: F401 

35from smart_open.utils import check_kwargs as _check_kwargs # noqa: F401 

36from smart_open.utils import inspect_kwargs as _inspect_kwargs # noqa: F401 

37 

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Text encoding used when the caller does not supply one explicitly.  The
# locale is only queried, never modified (do_setlocale is False).
DEFAULT_ENCODING = locale.getpreferredencoding(False)

41 

42 

def _sniff_scheme(uri_as_string):
    """Return only the scheme component of the given URL, as a string.

    :param str uri_as_string: The URL (or local path) to inspect.
    :returns: The scheme as reported by ``urllib.parse.urlsplit``.
    :rtype: str
    """
    #
    # On Windows, urlsplit would mistake a drive letter for the scheme.
    # When no protocol separator is present we treat the input as a local
    # file and make that explicit before splitting.
    #
    looks_like_windows_path = os.name == 'nt' and '://' not in uri_as_string
    if looks_like_windows_path:
        return urllib.parse.urlsplit('file://' + uri_as_string).scheme

    return urllib.parse.urlsplit(uri_as_string).scheme

53 

54 

def parse_uri(uri_as_string):
    """
    Parse the given URI from a string.

    Parameters
    ----------
    uri_as_string: str
        The URI to parse.

    Returns
    -------
    collections.namedtuple
        The parsed URI.

    Notes
    -----
    smart_open/doctools.py magic goes here
    """
    # NOTE(review): the last line of the Notes section above looks like a
    # placeholder that doctools substitutes at import time -- keep it intact.

    # Delegate the actual parsing to the transport registered for the scheme.
    transport_module = transport.get_transport(_sniff_scheme(uri_as_string))
    fields = transport_module.parse_uri(uri_as_string)

    #
    # Callers (and older tests) expect attribute access, so the dict returned
    # by the transport is repackaged as a namedtuple with sorted field names.
    #
    uri_type = collections.namedtuple('Uri', sorted(fields))
    return uri_type(**fields)

83 

84 

#
# Underscore-prefixed alias of parse_uri, retained so that older unit tests
# keep working while the module is being refactored.
#
_parse_uri = parse_uri

#
# Keep a handle on the built-in open: this module defines its own function
# named `open`, which shadows the built-in from that point onwards.
#
_builtin_open = open

91 

92 

def open(
        uri,
        mode='r',
        buffering=-1,
        encoding=None,
        errors=None,
        newline=None,
        closefd=True,
        opener=None,
        compression=so_compression.INFER_FROM_EXTENSION,
        transport_params=None,
        ):
    r"""Open the URI object, returning a file-like object.

    The URI is usually a string in a variety of formats.
    For a full list of examples, see the :func:`parse_uri` function.

    The URI may also be one of:

    - an instance of the pathlib.Path class
    - a stream (anything that implements io.IOBase-like functionality)

    Parameters
    ----------
    uri: str or object
        The object to open.
    mode: str, optional
        Mimics built-in open parameter of the same name.
    buffering: int, optional
        Mimics built-in open parameter of the same name.
    encoding: str, optional
        Mimics built-in open parameter of the same name.
    errors: str, optional
        Mimics built-in open parameter of the same name.
    newline: str, optional
        Mimics built-in open parameter of the same name.
    closefd: boolean, optional
        Mimics built-in open parameter of the same name.  Ignored.
    opener: object, optional
        Mimics built-in open parameter of the same name.  Ignored.
    compression: str, optional (see smart_open.compression.get_supported_compression_types)
        Explicitly specify the compression/decompression behavior.
    transport_params: dict, optional
        Additional parameters for the transport layer (see notes below).

    Returns
    -------
    A file-like object.

    Raises
    ------
    TypeError
        If mode is not a string.
    ValueError
        If compression is not one of the supported compression types.
    NotImplementedError
        If the mode string is invalid or unsupported.

    Notes
    -----
    smart_open has several implementations for its transport layer (e.g. S3, HTTP).
    Each transport layer has a different set of keyword arguments for overriding
    default behavior.  If you specify a keyword argument that is *not* supported
    by the transport layer being used, smart_open will ignore that argument and
    log a warning message.

    smart_open/doctools.py magic goes here

    See Also
    --------
    - `Standard library reference <https://docs.python.org/3.13/library/functions.html#open>`__
    - `smart_open README.rst
      <https://github.com/piskvorky/smart_open/blob/master/README.rst>`__

    """
    # Must stay the first statement: it logs exactly the call arguments.
    logger.debug('%r', locals())

    if not isinstance(mode, str):
        raise TypeError('mode should be a string')

    if compression not in so_compression.get_supported_compression_types():
        raise ValueError(f'invalid compression type: {compression}')

    if transport_params is None:
        transport_params = {}

    # Fast path: returns None unless the built-in open can handle the request.
    fobj = _shortcut_open(
        uri,
        mode,
        compression=compression,
        buffering=buffering,
        encoding=encoding,
        errors=errors,
        newline=newline,
    )
    if fobj is not None:
        return fobj

    #
    # This is a work-around for the problem described in Issue #144.
    # If the user has explicitly specified an encoding, then assume they want
    # us to open the destination in text mode, instead of the default binary.
    #
    # If we change the default mode to be text, and match the normal behavior
    # of Py2 and 3, then the above assumption will be unnecessary.
    #
    if encoding is not None and 'b' in mode:
        mode = mode.replace('b', '')

    if isinstance(uri, pathlib.Path):
        uri = str(uri)

    explicit_encoding = encoding
    encoding = explicit_encoding if explicit_encoding else DEFAULT_ENCODING

    #
    # This is how we get from the filename to the end result.  Decompression is
    # optional, but it always accepts bytes and returns bytes.
    #
    # Decoding is also optional, accepts bytes and returns text.  The diagram
    # below is for reading, for writing, the flow is from right to left, but
    # the code is identical.
    #
    #           open as binary         decompress?          decode?
    # filename ---------------> bytes -------------> bytes ---------> text
    #                          binary             decompressed       decode
    #

    try:
        binary_mode = _get_binary_mode(mode)
    except ValueError as ve:
        # Callers expect NotImplementedError here; chain the original error so
        # the root cause stays visible in the traceback.
        raise NotImplementedError(ve.args[0]) from ve

    binary = _open_binary_stream(uri, binary_mode, transport_params)

    # If the name attribute is not string-like (e.g. ftp socket fileno),
    # fall back to the uri.
    if isinstance(getattr(binary, "name", None), (str, bytes)):
        filename = binary.name
    else:
        filename = uri

    try:
        decompressed = so_compression.compression_wrapper(
            binary,
            binary_mode,
            compression,
            filename=filename,
        )

        if 'b' not in mode or explicit_encoding is not None:
            decoded = _encoding_wrapper(
                decompressed,
                mode,
                encoding=encoding,
                errors=errors,
                newline=newline,
            )
        else:
            decoded = decompressed
    except Exception:
        #
        # The wrappers failed, so nothing will ever be returned to the caller
        # that could close the underlying stream.  Close it here rather than
        # leaking it, then let the original error propagate.
        #
        binary.close()
        raise

    #
    # There are some useful methods in the binary readers, e.g. to_boto3, that get
    # hidden by the multiple layers of wrapping we just performed.  Promote
    # them so they are visible to the user.
    #
    # Identity (not equality) is what matters: we only need to know whether
    # any wrapping layer was actually added on top of the binary stream.
    #
    if decoded is not binary:
        promoted_attrs = ['to_boto3']
        for attr in promoted_attrs:
            try:
                setattr(decoded, attr, getattr(binary, attr))
            except AttributeError:
                pass

    return so_utils.FileLikeProxy(decoded, binary)

257 

258 

def _get_binary_mode(mode_str):
    """Translate a user-supplied mode string into its binary equivalent.

    The result always has the shape ``<access>b[+]`` where ``<access>`` is one
    of ``r``, ``w`` or ``a``.  A text flag (``t``) is dropped and a binary
    flag is always present in the result.

    :param str mode_str: The mode as requested by the caller, e.g. ``'rt'``.
    :returns: The corresponding binary mode, e.g. ``'rb'``.
    :rtype: str
    :raises ValueError: If the mode mixes text and binary, has more or fewer
        than one access flag, or contains unrecognised/duplicate characters.
    """
    #
    # https://docs.python.org/3/library/functions.html#open
    #
    # The order of characters in the mode parameter appears to be unspecified.
    # The implementation follows the examples, just to be safe.
    #
    remaining = list(mode_str)

    if 't' in remaining and 'b' in remaining:
        raise ValueError("can't have text and binary mode at once")

    if sum(remaining.count(flag) for flag in 'rwa') > 1:
        raise ValueError("must have exactly one of create/read/write/append mode")

    # At most one access flag is present at this point; pick it out.
    access = next((flag for flag in 'awr' if flag in remaining), None)
    if access is None:
        raise ValueError(
            "Must have exactly one of create/read/write/append "
            "mode and at most one plus"
        )
    remaining.remove(access)

    # Consume a single binary-or-text flag if present; the result is binary
    # regardless of which (if any) was given.
    for flag in ('b', 't'):
        if flag in remaining:
            remaining.remove(flag)
            break
    parts = [access, 'b']

    if '+' in remaining:
        remaining.remove('+')
        parts.append('+')

    #
    # Every valid character has been consumed by now.  Anything left over is
    # either a duplicate or a character this function does not understand.
    #
    if remaining:
        raise ValueError('invalid mode: %r' % mode_str)

    return ''.join(parts)

311 

312 

def _shortcut_open(
        uri,
        mode,
        compression,
        buffering=-1,
        encoding=None,
        errors=None,
        newline=None,
        ):
    """Attempt to open the URI directly with the built-in open function.

    Going straight to the built-in open can be much faster than opening in
    binary mode and decoding afterwards.  The shortcut is taken only when:

    1. The URI is a string that refers to a local file; and
    2. No compression applies (either disabled explicitly, or inferred from
       an extension that is not a supported compression extension).

    Returns None whenever the shortcut cannot be used.

    :param str uri: A string indicating what to open.
    :param str mode: The mode to pass to the open function.
    :param str compression: The compression type selected.
    :param int buffering: Passed through to the built-in open.
    :param str encoding: If given, forces text mode and is passed through.
    :param str errors: Passed through in text mode only.
    :param str newline: Passed through when not None.
    :returns: The opened file, or None
    :rtype: file
    """
    if not isinstance(uri, str):
        return None

    if _sniff_scheme(uri) not in (transport.NO_SCHEME, so_file.SCHEME):
        return None

    local_path = so_file.extract_local_path(uri)
    if compression == so_compression.INFER_FROM_EXTENSION:
        extension = P.splitext(local_path)[1]
        if extension in so_compression.get_supported_extensions():
            return None
    elif compression != so_compression.NO_COMPRESSION:
        return None

    # An explicit encoding implies text mode, so strip any binary flag.
    if encoding is not None:
        mode = mode.replace('b', '')

    open_kwargs = {
        key: value
        for key, value in (('encoding', encoding), ('newline', newline))
        if value is not None
    }

    #
    # binary mode of the builtin/stdlib open function doesn't take an errors argument
    #
    if errors and 'b' not in mode:
        open_kwargs['errors'] = errors

    return _builtin_open(local_path, mode, buffering=buffering, **open_kwargs)

369 

370 

def _open_binary_stream(uri, mode, transport_params):
    """Open an arbitrary URI in the specified binary mode.

    Not all modes are supported for all protocols.

    :arg uri: The URI to open.  May be a string, or an integer file descriptor.
    :arg str mode: The mode to open with.  One of rb, rb+, wb, wb+, ab, ab+.
    :arg transport_params: Keyword arguments for the transport layer.
    :returns: A named file object
    :rtype: file-like object with a .name attribute
    :raises NotImplementedError: If the mode is not one of the supported modes.
    :raises TypeError: If the uri is neither a string nor an integer.
    """
    supported_modes = ('rb', 'rb+', 'wb', 'wb+', 'ab', 'ab+')
    if mode not in supported_modes:
        #
        # This should really be a ValueError, but for the sake of compatibility
        # with older versions, which raise NotImplementedError, we do the same.
        #
        raise NotImplementedError('unsupported mode: %r' % mode)

    if isinstance(uri, int):
        #
        # A file descriptor.  Once opened, its name is just the integer value,
        # which isn't helpful, and there is no easy cross-platform way to map a
        # descriptor back to a filename.  We give up on that here: the user has
        # to handle compression etc. explicitly.  The descriptor itself is left
        # open when the returned object is closed (closefd=False).
        #
        return _builtin_open(uri, mode, closefd=False)

    if not isinstance(uri, str):
        raise TypeError("don't know how to handle uri %s" % repr(uri))

    # Hand over to the transport registered for this URI's scheme.
    transport_module = transport.get_transport(_sniff_scheme(uri))
    stream = transport_module.open_uri(uri, mode, transport_params)

    # Guarantee the .name attribute promised by the return contract.
    if not hasattr(stream, 'name'):
        stream.name = uri

    return stream

410 

411 

def _encoding_wrapper(fileobj, mode, encoding=None, errors=None, newline=None):
    """Wrap a byte stream so that it yields/accepts text, when appropriate.

    A binary mode with no encoding leaves the stream untouched.  Supplying an
    encoding always results in a text stream, even if the mode says binary.

    :arg fileobj: must quack like a filehandle object.
    :arg str mode: is the mode which was originally requested by the user.
    :arg str encoding: The text encoding to use.  If mode is binary, overrides mode.
    :arg str errors: The method to use when handling encoding/decoding errors.
    :arg str newline: Newline handling, forwarded to the text wrapper.
    :returns: a file object
    """
    # Must stay the first statement: it logs exactly the call arguments.
    logger.debug('encoding_wrapper: %r', locals())

    #
    # A binary mode combined with an explicit encoding is treated as a request
    # for text.  Ignoring the encoding and returning bytes instead would make
    # smart_open diverge from the built-in open:
    #
    #   open(filename, encoding='utf-8') returns a text stream in Py3
    #   smart_open(filename, encoding='utf-8') would return a byte stream
    #   without our assumption, because the default mode is rb.
    #
    if encoding is None:
        if 'b' in mode:
            return fileobj
        encoding = DEFAULT_ENCODING

    return so_utils.TextIOWrapper(
        fileobj,
        encoding=encoding,
        errors=errors,
        newline=newline,
        write_through=True,
    )

449 

450 

class patch_pathlib:
    """Replace `Path.open` with `smart_open.open`

    The replacement is installed as soon as the object is constructed.  Used
    as a context manager, the previous implementation is put back on exit.
    """

    def __init__(self):
        # Patch immediately and remember what was there before.
        self.old_impl = _patch_pathlib(open)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore the implementation captured at construction time.
        _patch_pathlib(self.old_impl)

462 

463 

def _patch_pathlib(func):
    """Replace `Path.open` with `func`

    :param func: The callable to install as ``pathlib.Path.open``.
    :returns: The implementation that was installed before the call.
    """
    previous, pathlib.Path.open = pathlib.Path.open, func
    return previous

469 

470 

def smart_open(
        uri,
        mode='rb',
        buffering=-1,
        encoding=None,
        errors=None,
        newline=None,
        closefd=True,
        opener=None,
        ignore_extension=False,
        **kwargs
        ):
    """Deprecated thin wrapper around :func:`open`, kept for backward compatibility.

    Always emits a DeprecationWarning.  Differences from :func:`open`:

    1. The default mode is read binary (mode='rb')
    2. The compression parameter is called ignore_extension
    3. Transport parameters used to be passed directly as kwargs; they are
       no longer accepted, and passing any raises DeprecationWarning.
    """
    url = 'https://github.com/piskvorky/smart_open/blob/develop/MIGRATING_FROM_OLDER_VERSIONS.rst'
    if kwargs:
        raise DeprecationWarning(
            'The following keyword parameters are not supported: %r. '
            'See %s for more information.' % (sorted(kwargs), url)
        )

    warnings.warn(
        'This function is deprecated. See %s for more information' % url,
        category=DeprecationWarning,
    )

    # Translate the legacy boolean flag into the compression setting.
    compression = (
        so_compression.NO_COMPRESSION
        if ignore_extension
        else so_compression.INFER_FROM_EXTENSION
    )

    return open(
        uri=uri,
        mode=mode,
        buffering=buffering,
        encoding=encoding,
        errors=errors,
        newline=newline,
        closefd=closefd,
        opener=opener,
        compression=compression,
    )

509 

510 

#
# Docstring enrichment is best-effort: a failure inside doctools must never
# break importing the library.  Such failures are not expected, but contributed
# modules (e.g. new transport mechanisms) may not be as polished.
#
try:
    doctools.tweak_open_docstring(open)
    doctools.tweak_parse_uri_docstring(parse_uri)
except Exception as ex:
    _docstring_failure_message = (
        'Encountered a non-fatal error while building docstrings (see below). '
        'help(smart_open) will provide incomplete information as a result. '
        'For full help text, see '
        '<https://github.com/piskvorky/smart_open/blob/master/help.txt>.'
    )
    logger.error(_docstring_failure_message)
    logger.exception(ex)