Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/docutils/io.py: 34%

268 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-07 06:06 +0000

1# $Id$ 

2# Author: David Goodger <goodger@python.org> 

3# Copyright: This module has been placed in the public domain. 

4 

5""" 

6I/O classes provide a uniform API for low-level input and output. Subclasses 

7exist for a variety of input/output mechanisms. 

8""" 

9 

10__docformat__ = 'reStructuredText' 

11 

12import codecs 

13import locale 

14import os 

15import re 

16import sys 

17import warnings 

18 

19from docutils import TransformSpec 

20 

21 

# Guess the locale's preferred encoding.
# If no valid guess can be made, `_locale_encoding` is set to `None`.
#
# TODO: check whether this is set correctly with every OS and Python version
#       or whether front-end tools need to call `locale.setlocale()`
#       before importing this module
try:
    # Return locale encoding also in UTF-8 mode.
    # Warnings emitted by the locale functions are suppressed.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        _locale_encoding = (locale.getlocale()[1]
                            or locale.getdefaultlocale()[1])
        # `None.lower()` raises AttributeError -> handled below
        _locale_encoding = _locale_encoding.lower()
except ValueError as error:  # OS X may set UTF-8 without language code
    # See https://bugs.python.org/issue18378 fixed in 3.8
    # and https://sourceforge.net/p/docutils/bugs/298/.
    # Drop the special case after requiring Python >= 3.8
    if "unknown locale: UTF-8" in error.args:
        _locale_encoding = "utf-8"
    else:
        _locale_encoding = None
except Exception:
    # Any other problem determining the locale -> use None.
    # (`Exception` rather than a bare ``except`` so that KeyboardInterrupt
    # and SystemExit are not swallowed at import time.)
    _locale_encoding = None
# Discard values that are not known to the codecs registry
# (`codecs.lookup(None)` raises TypeError).
try:
    codecs.lookup(_locale_encoding)
except (LookupError, TypeError):
    _locale_encoding = None

49 

50 

class InputError(OSError):
    """`OSError` subclass for failures to open an input source."""

class OutputError(OSError):
    """`OSError` subclass for failures to open an output destination."""

53 

54 

def check_encoding(stream, encoding):
    """Compare the encoding of `stream` with `encoding`.

    Both names are resolved with `codecs.lookup()`, so aliases of the
    same codec (e.g. "utf8" and "UTF-8") are regarded as equal.

    Returns

    :None:  if `encoding` or `stream.encoding` is not a valid encoding
            argument (e.g. ``None``) or `stream.encoding` is missing,
    :True:  if both resolve to the same codec,
    :False: if the encodings differ.
    """
    try:
        stream_codec = codecs.lookup(stream.encoding)
        wanted_codec = codecs.lookup(encoding)
    except (LookupError, AttributeError, TypeError):
        # unknown codec name, no `encoding` attribute, or a non-string name
        return None
    return stream_codec == wanted_codec

69 

70 

def error_string(err):
    """Format Exception `err` as "<class name>: <message>"."""
    class_name = err.__class__.__name__
    return '{}: {}'.format(class_name, err)

75 

76 

class Input(TransformSpec):
    """
    Abstract base class for input wrappers.

    Docutils input objects must provide a `read()` method that
    returns the source, typically as `str` instance.

    Inheriting `TransformSpec` allows input objects to add
    "transforms" and "unknown_reference_resolvers" to the "Transformer".
    (Optional for custom input objects since Docutils 0.19.)
    """

    component_type = 'input'

    default_source_path = None

    def __init__(self, source=None, source_path=None, encoding=None,
                 error_handler='strict'):
        self.encoding = encoding
        """Text encoding for the input source."""

        self.error_handler = error_handler
        """Text decoding error handler."""

        self.source = source
        """The source of input data."""

        self.source_path = source_path
        """A text reference to the source."""

        if not source_path:
            self.source_path = self.default_source_path

        self.successful_encoding = None
        """The encoding that successfully decoded the source data."""

    def __repr__(self):
        return '%s: source=%r, source_path=%r' % (self.__class__, self.source,
                                                  self.source_path)

    def read(self):
        """Return input as `str`. Define in subclasses."""
        raise NotImplementedError

    def decode(self, data):
        """
        Decode `data` if required.

        Return Unicode `str` instances unchanged (nothing to decode).

        If `self.encoding` is None, determine encoding from data
        or try UTF-8, locale encoding, and (as last resort) 'latin-1'.
        The client application should call ``locale.setlocale`` at the
        beginning of processing::

            locale.setlocale(locale.LC_ALL, '')

        On success, the encoding used is stored in
        `self.successful_encoding` and all U+FEFF characters are removed
        from the result.

        Raise UnicodeError if unsuccessful. The error of the last
        decoding attempt is attached as the exception's cause.

        Provisional:
          - Raise UnicodeError (instead of falling back to the locale
            encoding) if decoding the source with the default encoding (UTF-8)
            fails and Python is started in `UTF-8 mode`.

            Raise UnicodeError (instead of falling back to "latin1") if both,
            default and locale encoding, fail.

          - Only remove BOM (U+FEFF ZWNBSP at start of data),
            no other ZWNBSPs.
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, str), ('input encoding is "unicode" '
                                           'but `data` is no `str` instance')
        if isinstance(data, str):
            # nothing to decode
            return data
        if self.encoding:
            # We believe the user/application when the encoding is
            # explicitly given.
            encoding_candidates = [self.encoding]
        else:
            data_encoding = self.determine_encoding_from_data(data)
            if data_encoding:
                # If the data declares its encoding (explicitly or via a BOM),
                # we believe it.
                encoding_candidates = [data_encoding]
            else:
                # Apply heuristics only if no encoding is explicitly given and
                # no BOM found.  Start with UTF-8, because that only matches
                # data that *IS* UTF-8:
                encoding_candidates = ['utf-8']
                # TODO: use `locale.getpreferredlocale(do_setlocale=True)`
                #       to respect UTF-8 mode (API change).
                # (Check if it is a valid encoding and not UTF-8)
                if _locale_encoding and _locale_encoding != 'utf-8':
                    encoding_candidates.append(_locale_encoding)
                # TODO: don't fall back to 'latin-1' (API change).
                encoding_candidates.append('latin-1')
        # `encoding_candidates` is never empty, so `last_error` is always
        # set when the loop finishes without returning.
        last_error = None
        for enc in encoding_candidates:
            try:
                decoded = str(data, enc, self.error_handler)
            except (UnicodeError, LookupError) as err:
                # keep exception instance for use outside of the "for" loop.
                last_error = err
                continue
            self.successful_encoding = enc
            # Return decoded, removing BOM and other ZWNBSPs.
            # TODO: only remove BOM (ZWNBSP at start of data)
            #       and only if 'self.encoding' is None. (API change)
            return decoded.replace('\ufeff', '')
        # Chain the last failure so the traceback shows the actual cause.
        raise UnicodeError(
            'Unable to decode input data.  Tried the following encodings: '
            f'{", ".join(repr(enc) for enc in encoding_candidates)}.\n'
            f'({error_string(last_error)})') from last_error

    coding_slug = re.compile(br"coding[:=]\s*([-\w.]+)")
    """Encoding declaration pattern."""

    byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'),
                        (codecs.BOM_UTF16_BE, 'utf-16-be'),
                        (codecs.BOM_UTF16_LE, 'utf-16-le'),)
    """Sequence of (start_bytes, encoding) tuples for encoding detection.
    The first bytes of input data are checked against the start_bytes strings.
    A match indicates the given encoding."""

    def determine_encoding_from_data(self, data):
        """
        Try to determine the encoding of `data` by looking *in* `data`.
        Check for a byte order mark (BOM) or an encoding declaration.

        Return the encoding name or None if nothing is found.
        """
        # check for a byte order mark:
        for start_bytes, encoding in self.byte_order_marks:
            if data.startswith(start_bytes):
                return encoding
        # check for an encoding declaration pattern in first 2 lines of file:
        for line in data.splitlines()[:2]:
            match = self.coding_slug.search(line)
            if match:
                return match.group(1).decode('ascii')
        return None

    def isatty(self):
        """Return True, if the input source is connected to a TTY device."""
        try:
            return self.source.isatty()
        except AttributeError:
            # source has no `isatty()` method
            return False

223 

224 

class Output(TransformSpec):
    """
    Abstract base class for output wrappers.

    A Docutils output object has to implement `write()`, a method that
    takes the output as its single argument and handles it.

    As `TransformSpec` subclass, an output object may contribute
    "transforms" and "unknown_reference_resolvers" to the "Transformer".
    (Optional for custom output objects since Docutils 0.19.)
    """

    component_type = 'output'

    default_destination_path = None

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict'):
        self.encoding = encoding
        """Text encoding for the output destination."""

        self.error_handler = error_handler or 'strict'
        """Text encoding error handler."""

        self.destination = destination
        """The destination for output data."""

        # an empty/missing path is replaced by the class default
        self.destination_path = (destination_path
                                 or self.default_destination_path)
        """A text reference to the destination."""

    def __repr__(self):
        return (f'{self.__class__}: destination={self.destination!r}, '
                f'destination_path={self.destination_path!r}')

    def write(self, data):
        """Write `data`. To be implemented by subclasses."""
        raise NotImplementedError

    def encode(self, data):
        """
        Return `data`, encoded if required.

        `str` instances are encoded with `self.encoding` and
        `self.error_handler`; anything else (e.g. `bytes`) is
        passed through unchanged.

        Provisional: with the pseudo encoding name "unicode" as
        `self.encoding`, `data` must be a `str` instance, which is
        returned unchanged.
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, str), ('output encoding is "unicode" '
                                           'but `data` is no `str` instance')
            return data
        if isinstance(data, str):
            return data.encode(self.encoding, self.error_handler)
        # Non-unicode (e.g. bytes) output.
        return data

285 

286 

class ErrorOutput:
    """
    Wrapper class for file-like error streams with
    failsafe de- and encoding of `str`, `bytes`, `unicode` and
    `Exception` instances.
    """

    def __init__(self, destination=None, encoding=None,
                 encoding_errors='backslashreplace',
                 decoding_errors='replace'):
        """
        :Parameters:
            - `destination`: a file-like object,
                        a string (path to a file),
                        `None` (write to `sys.stderr`, default), or
                        evaluating to `False` (write() requests are ignored).
            - `encoding`: `destination` text encoding. Guessed if None.
            - `encoding_errors`: how to treat encoding errors.
            - `decoding_errors`: how to treat decoding errors
              (used when `bytes` must be converted to `str`).
        """
        if destination is None:
            destination = sys.stderr
        elif not destination:
            destination = False
        # if `destination` is a file name, open it
        elif isinstance(destination, str):
            destination = open(destination, 'w')

        self.destination = destination
        """Where warning output is sent."""

        self.encoding = (encoding or getattr(destination, 'encoding', None)
                         or _locale_encoding or 'ascii')
        """The output character encoding."""

        self.encoding_errors = encoding_errors
        """Encoding error handler."""

        self.decoding_errors = decoding_errors
        """Decoding error handler."""

    def write(self, data):
        """
        Write `data` to self.destination. Ignore, if self.destination is False.

        `data` can be a `bytes`, `str`, or `Exception` instance.

        A `str` that the destination fails to encode is written with the
        offending characters handled by `self.encoding_errors`.
        """
        if not self.destination:
            return
        if isinstance(data, Exception):
            data = str(data)
        try:
            self.destination.write(data)
        except UnicodeEncodeError:
            self._write_escaped(data)
        except TypeError:
            if isinstance(data, str):  # destination may expect bytes
                self.destination.write(data.encode(self.encoding,
                                                   self.encoding_errors))
            elif self.destination in (sys.stderr, sys.stdout):
                # write bytes to raw stream
                self.destination.buffer.write(data)
            else:
                self.destination.write(str(data, self.encoding,
                                           self.decoding_errors))

    def _write_escaped(self, data):
        """Write `str` `data` after the destination raised UnicodeEncodeError.

        First hand the encoded `bytes` to the destination. A text stream
        rejects `bytes` with a TypeError; in this case write a `str` in
        which the characters the stream cannot encode are already
        replaced according to `self.encoding_errors`.
        """
        encoded = data.encode(self.encoding, self.encoding_errors)
        try:
            self.destination.write(encoded)
        except TypeError:
            # Round-trip through the stream's own encoding (if it tells)
            # so that the result is guaranteed to be encodable by it.
            stream_encoding = (getattr(self.destination, 'encoding', None)
                               or self.encoding)
            escaped = data.encode(stream_encoding, self.encoding_errors)
            self.destination.write(escaped.decode(stream_encoding,
                                                  self.decoding_errors))

    def close(self):
        """
        Close the error-output stream.

        Ignored if the destination is `sys.stderr` or `sys.stdout` or has no
        close() method.
        """
        if self.destination in (sys.stdout, sys.stderr):
            return
        try:
            self.destination.close()
        except AttributeError:
            pass

    def isatty(self):
        """Return True, if the destination is connected to a TTY device."""
        try:
            return self.destination.isatty()
        except AttributeError:
            # destination has no `isatty()` method (e.g. it is False)
            return False

373 

374 

class FileInput(Input):

    """
    Input for single, simple file-like objects.
    """
    def __init__(self, source=None, source_path=None,
                 encoding=None, error_handler='strict',
                 autoclose=True, mode='r'):
        """
        :Parameters:
            - `source`: either a file-like object (which is read directly), or
              `None` (which implies `sys.stdin` if no `source_path` given).
            - `source_path`: a path to a file, which is opened and then read.
            - `encoding`: the expected text encoding of the input file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after read (except when
              `sys.stdin` is the source).
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is read only ('r').

        Raise `InputError` if the file at `source_path` cannot be opened and
        `UnicodeError` if `encoding` clashes with the encoding of `source`.
        """
        Input.__init__(self, source, source_path, encoding, error_handler)
        self.autoclose = autoclose
        self._stderr = ErrorOutput()

        if source is None:
            if source_path:
                try:
                    # "utf-8-sig" is used when no encoding is specified
                    self.source = open(source_path, mode,
                                       encoding=self.encoding or 'utf-8-sig',
                                       errors=self.error_handler)
                except OSError as error:
                    raise InputError(error.errno, error.strerror, source_path)
            else:
                self.source = sys.stdin
        elif check_encoding(self.source, self.encoding) is False:
            # TODO: re-open, warn or raise error?
            raise UnicodeError('Encoding clash: encoding given is "%s" '
                               'but source is opened with encoding "%s".' %
                               (self.encoding, self.source.encoding))
        if not source_path:
            # fall back to the name of the file-like object (if it has one)
            try:
                self.source_path = self.source.name
            except AttributeError:
                pass

    def read(self):
        """
        Read and decode a single file and return the data (Unicode string).

        Line endings are normalised to "\\n".
        """
        try:
            if self.source is sys.stdin:
                # read as binary data to circumvent auto-decoding
                data = self.source.buffer.read()
            else:
                data = self.source.read()
        except (UnicodeError, LookupError):
            if not self.encoding and self.source_path:
                # re-read in binary mode and decode with heuristics
                # (the `with` block closes the file also if reading fails)
                with open(self.source_path, 'rb') as b_source:
                    data = b_source.read()
            else:
                raise
        finally:
            if self.autoclose:
                self.close()
        data = self.decode(data)
        # normalise newlines
        return '\n'.join(data.splitlines()+[''])

    def readlines(self):
        """
        Return lines of a single file as list of Unicode strings.
        """
        return self.read().splitlines(True)

    def close(self):
        """Close the source unless it is `sys.stdin`."""
        if self.source is not sys.stdin:
            self.source.close()

454 

455 

class FileOutput(Output):

    """Output for single, simple file-like objects."""

    default_destination_path = '<file>'

    mode = 'w'
    """The mode argument for `open()`."""
    # 'wb' for binary (e.g. OpenOffice) files (see also `BinaryFileOutput`).
    # (Do not use binary mode ('wb') for text files, as this prevents the
    # conversion of newlines to the system specific default.)

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict', autoclose=True,
                 handle_io_errors=None, mode=None):
        """
        :Parameters:
            - `destination`: either a file-like object (which is written
              directly) or `None` (which implies `sys.stdout` if no
              `destination_path` given).
            - `destination_path`: a path to a file, which is opened and then
              written.
            - `encoding`: the text encoding of the output file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after write (except when
              `sys.stdout` or `sys.stderr` is the destination).
            - `handle_io_errors`: ignored, deprecated, will be removed.
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is 'w', providing universal newline
              support for text files.
        """
        Output.__init__(self, destination, destination_path,
                        encoding, error_handler)
        self.opened = True
        self.autoclose = autoclose
        if handle_io_errors is not None:
            warnings.warn('io.FileOutput: init argument "handle_io_errors" '
                          'is ignored and will be removed in '
                          'Docutils 2.0.', DeprecationWarning, stacklevel=2)
        if mode is not None:
            self.mode = mode
        self._stderr = ErrorOutput()
        if destination is None:
            if destination_path:
                # the file is opened on the first call to `write()`
                self.opened = False
            else:
                self.destination = sys.stdout
        elif (  # destination is file-type object -> check mode:
              mode and hasattr(self.destination, 'mode')
              and mode != self.destination.mode):
            print('Warning: Destination mode "%s" differs from specified '
                  'mode "%s"' % (self.destination.mode, mode),
                  file=self._stderr)
        if not destination_path:
            # fall back to the name of the file-like object (if it has one)
            try:
                self.destination_path = self.destination.name
            except AttributeError:
                pass

    def open(self):
        """Open the file at `self.destination_path` with `self.mode`.

        Raise `OutputError` if the file cannot be opened.
        """
        # Specify encoding
        if 'b' not in self.mode:
            kwargs = {'encoding': self.encoding,
                      'errors': self.error_handler}
        else:
            kwargs = {}
        try:
            self.destination = open(self.destination_path, self.mode, **kwargs)
        except OSError as error:
            raise OutputError(error.errno, error.strerror,
                              self.destination_path)
        self.opened = True

    def write(self, data):
        """Write `data` to a single file, also return it.

        `data` can be a `str` or `bytes` instance.
        If writing `bytes` fails, an attempt is made to write to
        the low-level interface ``self.destination.buffer``.

        If `data` is a `str` instance and `self.encoding` and
        `self.destination.encoding` are set to different values, `data`
        is encoded to a `bytes` instance using `self.encoding`.

        A TypeError raised by the destination for anything but `bytes`
        data (e.g. `str` data and a binary destination) is propagated;
        nothing has been written in this case.

        Provisional: future versions may raise an error if `self.encoding`
        and `self.destination.encoding` are set to different values.
        """
        if not self.opened:
            self.open()
        if (isinstance(data, str)
            and check_encoding(self.destination, self.encoding) is False):
            if os.linesep != '\n':
                data = data.replace('\n', os.linesep)  # fix endings
            data = self.encode(data)

        try:
            self.destination.write(data)
        except TypeError as err:
            if not isinstance(data, bytes):
                # Only the "bytes for a text stream" case is handled below.
                # Do not hide other failures (the data would be lost).
                raise
            try:
                self.destination.buffer.write(data)
            except AttributeError:
                if check_encoding(self.destination,
                                  self.encoding) is False:
                    raise ValueError(
                        f'Encoding of {self.destination_path} '
                        f'({self.destination.encoding}) differs \n'
                        f' from specified encoding ({self.encoding})')
                else:
                    raise err
        except (UnicodeError, LookupError) as err:
            raise UnicodeError(
                'Unable to encode output data. output-encoding is: '
                f'{self.encoding}.\n({error_string(err)})')
        finally:
            if self.autoclose:
                self.close()
        return data

    def close(self):
        """Close the destination unless it is `sys.stdout` or `sys.stderr`."""
        if self.destination not in (sys.stdout, sys.stderr):
            self.destination.close()
            self.opened = False

579 

580 

class BinaryFileOutput(FileOutput):
    """
    Variant of `docutils.io.FileOutput` that opens its file in binary mode.
    """
    # Users: core.publish_cmdline_to_binary() (called by tools/rst2odt.py,
    # but not by core.rst2odt()).
    mode = 'wb'

588 

589 

class StringInput(Input):
    """Input wrapper for a source given as `str` or `bytes` instance."""

    default_source_path = '<string>'

    def read(self):
        """Return the source as `str` instance.

        Decoding (if required) is delegated to `Input.decode`.
        """
        source = self.source
        return self.decode(source)

601 

602 

class StringOutput(Output):
    """Output wrapper that keeps the result as `bytes` or `str` instance.

    Provisional.
    """

    default_destination_path = '<string>'

    def write(self, data):
        """Store `data` in `self.destination`, and return it.

        With the pseudo encoding name "unicode" as `self.encoding`,
        `data` must be a `str` instance; it is stored/returned as is
        (cf. `Output.encode`).

        With any other encoding, `data` may be a `bytes` or `str`
        instance; the stored/returned value is a `bytes` instance
        (`str` data is encoded with `self.encode()`).

        Attention: the `output_encoding`_ setting may affect the content
        of the output (e.g. an encoding declaration in HTML or XML or the
        representation of characters as LaTeX macro vs. literal character).
        """
        result = self.encode(data)
        self.destination = result
        return result

628 

629 

class NullInput(Input):

    """Degenerate input that never provides any content."""

    default_source_path = 'null input'

    def read(self):
        """Return the empty string, whatever the source is."""
        return str()

639 

640 

class NullOutput(Output):

    """Degenerate output that discards everything."""

    default_destination_path = 'null output'

    def write(self, data):
        """Discard `data` and return None."""
        return None

650 

651 

class DocTreeInput(Input):

    """
    Adapter that passes on a document tree as input.

    Pass the document tree in the ``source`` parameter.
    """

    default_source_path = 'doctree input'

    def read(self):
        """Return the document tree (``self.source``, unchanged)."""
        doctree = self.source
        return doctree

664 return self.source