Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/smart_open/smart_open_lib.py: 27%

165 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-07 06:57 +0000

1# -*- coding: utf-8 -*- 

2# 

3# Copyright (C) 2019 Radim Rehurek <me@radimrehurek.com> 

4# 

5# This code is distributed under the terms and conditions 

6# from the MIT License (MIT). 

7# 

8 

9"""Implements the majority of smart_open's top-level API. 

10 

11The main functions are: 

12 

13 * ``parse_uri()`` 

14 * ``open()`` 

15 

16""" 

17 

18import collections 

19import io 

20import locale 

21import logging 

22import os 

23import os.path as P 

24import pathlib 

25import urllib.parse 

26import warnings 

27 

28# 

29# This module defines a function called smart_open so we cannot use 

30# smart_open.submodule to reference to the submodules. 

31# 

32import smart_open.local_file as so_file 

33import smart_open.compression as so_compression 

34 

35from smart_open import doctools 

36from smart_open import transport 

37 

38# 

39# For backwards compatibility and keeping old unit tests happy. 

40# 

41from smart_open.compression import register_compressor # noqa: F401 

42from smart_open.utils import check_kwargs as _check_kwargs # noqa: F401 

43from smart_open.utils import inspect_kwargs as _inspect_kwargs # noqa: F401 

44 

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Text encoding that open() falls back to when the caller does not pass one.
# Queried with do_setlocale=False.
DEFAULT_ENCODING = locale.getpreferredencoding(do_setlocale=False)

48 

49 

def _sniff_scheme(uri_as_string):
    """Return only the scheme component of the given URL, as a string.

    :param str uri_as_string: The URL (or local path) to inspect.
    :returns: The scheme reported by urllib.parse.urlsplit.
    :rtype: str
    """
    #
    # On Windows, urlsplit mistakes the drive letter for the scheme, so a
    # string that carries no explicit protocol is treated as a local file.
    #
    needs_file_prefix = os.name == 'nt' and '://' not in uri_as_string
    target = 'file://' + uri_as_string if needs_file_prefix else uri_as_string
    return urllib.parse.urlsplit(target).scheme

60 

61 

def parse_uri(uri_as_string):
    """
    Parse the given URI from a string.

    Parameters
    ----------
    uri_as_string: str
        The URI to parse.

    Returns
    -------
    collections.namedtuple
        The parsed URI.

    Notes
    -----
    smart_open/doctools.py magic goes here
    """
    # NOTE: the "magic goes here" line in the docstring above is a marker that
    # is handed to doctools at import time -- keep it verbatim.
    transport_module = transport.get_transport(_sniff_scheme(uri_as_string))
    fields = transport_module.parse_uri(uri_as_string)

    #
    # The transport hands back a plain dict.  It is repackaged as a namedtuple
    # (fields in sorted order) purely so the old tests keep working while the
    # refactoring is in progress.
    #
    uri_type = collections.namedtuple('Uri', sorted(fields))
    return uri_type(**fields)

90 

91 

#
# To keep old unit tests happy while I'm refactoring.
#
_parse_uri = parse_uri

# Capture the built-in open here, before this module defines its own open()
# further down, so the helpers can still call the standard implementation.
_builtin_open = open

98 

99 

def open(
        uri,
        mode='r',
        buffering=-1,
        encoding=None,
        errors=None,
        newline=None,
        closefd=True,
        opener=None,
        compression=so_compression.INFER_FROM_EXTENSION,
        transport_params=None,
        ):
    r"""Open the URI object, returning a file-like object.

    The URI is usually a string in a variety of formats.
    For a full list of examples, see the :func:`parse_uri` function.

    The URI may also be one of:

    - an instance of the pathlib.Path class
    - a stream (anything that implements io.IOBase-like functionality)

    Parameters
    ----------
    uri: str or object
        The object to open.
    mode: str, optional
        Mimics built-in open parameter of the same name.
    buffering: int, optional
        Mimics built-in open parameter of the same name.
    encoding: str, optional
        Mimics built-in open parameter of the same name.
    errors: str, optional
        Mimics built-in open parameter of the same name.
    newline: str, optional
        Mimics built-in open parameter of the same name.
    closefd: boolean, optional
        Mimics built-in open parameter of the same name.  Ignored.
    opener: object, optional
        Mimics built-in open parameter of the same name.  Ignored.
    compression: str, optional (see smart_open.compression.get_supported_compression_types)
        Explicitly specify the compression/decompression behavior.
    transport_params: dict, optional
        Additional parameters for the transport layer (see notes below).

    Returns
    -------
    A file-like object.

    Raises
    ------
    TypeError
        If ``mode`` is not a string.
    ValueError
        If ``compression`` is not one of the supported compression types.
    NotImplementedError
        If ``mode`` cannot be translated into a supported binary mode.

    Notes
    -----
    smart_open has several implementations for its transport layer (e.g. S3, HTTP).
    Each transport layer has a different set of keyword arguments for overriding
    default behavior.  If you specify a keyword argument that is *not* supported
    by the transport layer being used, smart_open will ignore that argument and
    log a warning message.

    smart_open/doctools.py magic goes here

    See Also
    --------
    - `Standard library reference <https://docs.python.org/3.7/library/functions.html#open>`__
    - `smart_open README.rst
      <https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst>`__

    """
    # Must stay the first statement so that locals() holds only the arguments.
    logger.debug('%r', locals())

    if not isinstance(mode, str):
        raise TypeError('mode should be a string')

    if compression not in so_compression.get_supported_compression_types():
        raise ValueError(f'invalid compression type: {compression}')

    if transport_params is None:
        transport_params = {}

    # Fast path: defer to the built-in open when the helper says it can.
    fobj = _shortcut_open(
        uri,
        mode,
        compression=compression,
        buffering=buffering,
        encoding=encoding,
        errors=errors,
        newline=newline,
    )
    if fobj is not None:
        return fobj

    #
    # This is a work-around for the problem described in Issue #144.
    # If the user has explicitly specified an encoding, then assume they want
    # us to open the destination in text mode, instead of the default binary.
    #
    # If we change the default mode to be text, and match the normal behavior
    # of Py2 and 3, then the above assumption will be unnecessary.
    #
    if encoding is not None and 'b' in mode:
        mode = mode.replace('b', '')

    if isinstance(uri, pathlib.Path):
        uri = str(uri)

    explicit_encoding = encoding
    encoding = explicit_encoding if explicit_encoding else DEFAULT_ENCODING

    #
    # This is how we get from the filename to the end result.  Decompression is
    # optional, but it always accepts bytes and returns bytes.
    #
    # Decoding is also optional, accepts bytes and returns text.  The diagram
    # below is for reading, for writing, the flow is from right to left, but
    # the code is identical.
    #
    #           open as binary         decompress?          decode?
    # filename ---------------> bytes -------------> bytes ---------> text
    #                          binary             decompressed       decode
    #

    try:
        binary_mode = _get_binary_mode(mode)
    except ValueError as ve:
        # Chain the original error so the traceback shows why the mode was
        # rejected; the raised type stays NotImplementedError for callers.
        raise NotImplementedError(ve.args[0]) from ve

    binary = _open_binary_stream(uri, binary_mode, transport_params)
    decompressed = so_compression.compression_wrapper(binary, binary_mode, compression)

    if 'b' not in mode or explicit_encoding is not None:
        decoded = _encoding_wrapper(
            decompressed,
            mode,
            encoding=encoding,
            errors=errors,
            newline=newline,
        )
    else:
        decoded = decompressed

    #
    # There are some useful methods in the binary readers, e.g. to_boto3, that get
    # hidden by the multiple layers of wrapping we just performed.  Promote
    # them so they are visible to the user.
    #
    # Identity, not equality: the question is whether any wrapper was added.
    #
    if decoded is not binary:
        promoted_attrs = ['to_boto3']
        for attr in promoted_attrs:
            try:
                setattr(decoded, attr, getattr(binary, attr))
            except AttributeError:
                # Best effort: the attribute is missing on the binary stream
                # or cannot be set on the wrapper.
                pass

    return decoded

252 

253 

def _get_binary_mode(mode_str):
    """Translate a mode string into the equivalent binary mode string.

    The result is assembled in the order: access flag (a, w or r), then 'b',
    then an optional '+'.  A 't' in the input is dropped and replaced by 'b'.

    :param str mode_str: The mode as requested by the user.
    :returns: The binary mode, e.g. 'rb' or 'wb+'.
    :rtype: str
    :raises ValueError: If the mode mixes text and binary, names more than one
        of r/w/a, names none of them, or contains leftover characters.
    """
    #
    # https://docs.python.org/3/library/functions.html#open
    #
    # The order of characters in the mode parameter appears to be unspecified.
    # The implementation follows the examples, just to be safe.
    #
    remaining = list(mode_str)
    result = []

    if 't' in remaining and 'b' in remaining:
        raise ValueError("can't have text and binary mode at once")

    if sum(remaining.count(flag) for flag in 'rwa') > 1:
        raise ValueError("must have exactly one of create/read/write/append mode")

    # Pick the access flag; append wins over write, which wins over read.
    access = next((flag for flag in 'awr' if flag in remaining), None)
    if access is None:
        raise ValueError(
            "Must have exactly one of create/read/write/append "
            "mode and at most one plus"
        )
    remaining.remove(access)
    result.append(access)

    # Consume an explicit 'b' or 't' (never both, see above); the output is
    # binary in every case.
    for flag in 'bt':
        if flag in remaining:
            remaining.remove(flag)
            break
    result.append('b')

    if '+' in remaining:
        remaining.remove('+')
        result.append('+')

    #
    # Nothing should be left over by now.  Anything that is means either this
    # function missed a case or the caller passed an invalid mode.
    #
    if remaining:
        raise ValueError('invalid mode: %r' % mode_str)

    return ''.join(result)

306 

307 

def _shortcut_open(
        uri,
        mode,
        compression,
        buffering=-1,
        encoding=None,
        errors=None,
        newline=None,
        ):
    """Attempt to open the URI directly with the built-in open function.

    Going straight to the built-in open can be much faster than opening in
    binary mode and decoding afterwards.  The shortcut is taken only when:

    1. The URI is a string that refers to a local file; and
    2. No compression is in effect (either disabled explicitly, or inferred
       from an extension that is not a supported compression extension).

    When the shortcut does not apply, returns None.

    :param str uri: A string indicating what to open.
    :param str mode: The mode to pass to the open function.
    :param str compression: The compression type selected.
    :param int buffering: Passed through to the built-in open.
    :param str encoding: Passed through when not None; also strips 'b' from the mode.
    :param str errors: Passed through when truthy and the mode is not binary.
    :param str newline: Passed through when not None.
    :returns: The opened file, or None
    :rtype: file
    """
    if not isinstance(uri, str):
        return None

    if _sniff_scheme(uri) not in (transport.NO_SCHEME, so_file.SCHEME):
        return None

    path = so_file.extract_local_path(uri)
    if compression == so_compression.INFER_FROM_EXTENSION:
        suffix = P.splitext(path)[1]
        if suffix in so_compression.get_supported_extensions():
            return None
    elif compression != so_compression.NO_COMPRESSION:
        return None

    # An explicit encoding means text mode, so drop any binary flag.
    if encoding is not None:
        mode = mode.replace('b', '')

    optional = {'encoding': encoding, 'newline': newline}
    extra = {key: value for key, value in optional.items() if value is not None}

    #
    # binary mode of the builtin/stdlib open function doesn't take an errors argument
    #
    if errors and 'b' not in mode:
        extra['errors'] = errors

    return _builtin_open(path, mode, buffering=buffering, **extra)

364 

365 

def _open_binary_stream(uri, mode, transport_params):
    """Open an arbitrary URI in the specified binary mode.

    Not all modes are supported for all protocols.

    :arg uri: The URI to open.  May be a string or an integer file descriptor.
    :arg str mode: The mode to open with.  One of rb, rb+, wb, wb+, ab, ab+.
    :arg transport_params: Keyword arguments for the transport layer.
    :returns: A named file object
    :rtype: file-like object with a .name attribute
    :raises NotImplementedError: If the mode is not one of the above.
    :raises TypeError: If the uri is neither an int nor a str.
    """
    supported_modes = ('rb', 'rb+', 'wb', 'wb+', 'ab', 'ab+')
    if mode not in supported_modes:
        #
        # A ValueError would be the natural choice, but older versions raise
        # NotImplementedError here, so we keep doing that for compatibility.
        #
        raise NotImplementedError('unsupported mode: %r' % mode)

    if isinstance(uri, int):
        #
        # A file descriptor: opening it yields a name that is just the integer,
        # which isn't helpful, and there is no easy cross-platform way to map a
        # descriptor back to a filename.  We give up on that; the user has to
        # handle compression etc. explicitly.
        #
        return _builtin_open(uri, mode, closefd=False)

    if not isinstance(uri, str):
        raise TypeError("don't know how to handle uri %s" % repr(uri))

    submodule = transport.get_transport(_sniff_scheme(uri))
    stream = submodule.open_uri(uri, mode, transport_params)

    # Give the stream a name when the transport did not provide one.
    if not hasattr(stream, 'name'):
        stream.name = uri

    return stream

405 

406 

def _encoding_wrapper(fileobj, mode, encoding=None, errors=None, newline=None):
    """Wrap a binary stream in a text layer, unless plain bytes were requested.

    The stream is returned untouched only when the mode is binary *and* no
    encoding was given.  A non-null encoding implies text mode.

    :arg fileobj: must quack like a filehandle object.
    :arg str mode: is the mode which was originally requested by the user.
    :arg str encoding: The text encoding to use.  If mode is binary, overrides mode.
    :arg str errors: The method to use when handling encoding/decoding errors.
    :arg str newline: Passed through to io.TextIOWrapper.
    :returns: a file object
    """
    # Must stay the first statement so that locals() holds only the arguments.
    logger.debug('encoding_wrapper: %r', locals())

    #
    # A binary mode combined with an explicit encoding is taken to mean "text".
    # Without that assumption the encoding would be ignored and bytes returned,
    # and smart_open would diverge from the built-in open:
    #
    #   open(filename, encoding='utf-8') returns a text stream in Py3
    #   smart_open(filename, encoding='utf-8') would return a byte stream
    #   without our assumption, because the default mode is rb.
    #
    binary_requested = 'b' in mode
    if encoding is None:
        if binary_requested:
            return fileobj
        encoding = DEFAULT_ENCODING

    return io.TextIOWrapper(
        fileobj,
        encoding=encoding,
        errors=errors,
        newline=newline,
        write_through=True,
    )

444 

445 

class patch_pathlib:
    """Replace `Path.open` with `smart_open.open`

    The replacement is installed when the object is constructed and undone
    when the context manager exits.
    """

    def __init__(self):
        # Patch right away and remember what was there before.
        self.old_impl = _patch_pathlib(open)

    def __enter__(self):
        """Return this object; the patch is already in place."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Put the remembered `Path.open` implementation back."""
        _patch_pathlib(self.old_impl)

457 

458 

def _patch_pathlib(func):
    """Install `func` as `Path.open` and return the implementation it replaced."""
    previous, pathlib.Path.open = pathlib.Path.open, func
    return previous

464 

465 

def smart_open(
        uri,
        mode='rb',
        buffering=-1,
        encoding=None,
        errors=None,
        newline=None,
        closefd=True,
        opener=None,
        ignore_extension=False,
        **kwargs
        ):
    """Deprecated thin wrapper around :func:`open`, kept for backward compatibility.

    Differences from :func:`open`:

    1. The default mode is read binary (``mode='rb'``).
    2. Compression is controlled by ``ignore_extension`` instead of ``compression``.
    3. Transport parameters used to be passed directly as keyword arguments;
       that is no longer supported and any extra keyword argument is rejected.

    Parameters
    ----------
    uri, mode, buffering, encoding, errors, newline, closefd, opener
        Forwarded unchanged to :func:`open`.
    ignore_extension: boolean, optional
        If true, compression is disabled; otherwise it is inferred from the
        file extension.
    **kwargs
        Not supported.  Passing any raises a DeprecationWarning.

    Returns
    -------
    A file-like object, as returned by :func:`open`.

    Raises
    ------
    DeprecationWarning
        Raised (as an exception) if any unsupported keyword argument is passed.
    """
    url = 'https://github.com/RaRe-Technologies/smart_open/blob/develop/MIGRATING_FROM_OLDER_VERSIONS.rst'
    if kwargs:
        raise DeprecationWarning(
            'The following keyword parameters are not supported: %r. '
            'See %s for more information.' % (sorted(kwargs), url)
        )
    message = 'This function is deprecated. See %s for more information' % url
    #
    # stacklevel=2 attributes the warning to the code that called smart_open,
    # not to this module, so the report points at the line that needs changing.
    #
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)

    if ignore_extension:
        compression = so_compression.NO_COMPRESSION
    else:
        compression = so_compression.INFER_FROM_EXTENSION

    # Forward each argument by name rather than through locals(), so adding a
    # local variable here can never leak into the call.
    return open(
        uri=uri,
        mode=mode,
        buffering=buffering,
        encoding=encoding,
        errors=errors,
        newline=newline,
        closefd=closefd,
        opener=opener,
        compression=compression,
    )

504 

505 

#
# Prevent failures with doctools from messing up the entire library.  We don't
# expect such failures, but contributed modules (e.g. new transport mechanisms)
# may not be as polished.
#
try:
    doctools.tweak_open_docstring(open)
    doctools.tweak_parse_uri_docstring(parse_uri)
except Exception as ex:
    # Deliberately broad: any failure here is logged and swallowed, so the
    # module still finishes importing.
    logger.error(
        'Encountered a non-fatal error while building docstrings (see below). '
        'help(smart_open) will provide incomplete information as a result. '
        'For full help text, see '
        '<https://github.com/RaRe-Technologies/smart_open/blob/master/help.txt>.'
    )
    logger.exception(ex)