Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/docutils/io.py: 32%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

290 statements  

1# $Id$ 

2# Author: David Goodger <goodger@python.org> 

3# Copyright: This module has been placed in the public domain. 

4 

5""" 

6I/O classes provide a uniform API for low-level input and output. Subclasses 

7exist for a variety of input/output mechanisms. 

8""" 

9 

10from __future__ import annotations 

11 

12__docformat__ = 'reStructuredText' 

13 

14import codecs 

15import locale 

16import os 

17import re 

18import sys 

19import warnings 

20from typing import TYPE_CHECKING 

21 

22from docutils import TransformSpec 

23 

24if TYPE_CHECKING: 

25 from typing import Any, BinaryIO, ClassVar, Final, Literal, TextIO 

26 

27 from docutils import nodes 

28 from docutils.nodes import StrPath 

29 

30# Guess the locale's preferred encoding. 

31# If no valid guess can be made, _locale_encoding is set to `None`: 

32# 

33# TODO: check whether this is set correctly with every OS and Python version 

34# or whether front-end tools need to call `locale.setlocale()` 

35# before importing this module 

# Guess the locale's encoding once, at import time.
# `_locale_encoding` ends up as a lower-case codec name known to `codecs`,
# or as `None` if no usable guess could be made.
try:
    # Return locale encoding also in UTF-8 mode.
    # Warnings emitted by the locale queries are silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        _locale_encoding: str | None = (
            locale.getlocale()[1] or locale.getdefaultlocale()[1]
        ).lower()
except:  # NoQA: E722 (catchall)
    # Any problem determining the locale: use None
    _locale_encoding = None
else:
    # Discard names that Python has no codec for.
    try:
        codecs.lookup(_locale_encoding)
    except (LookupError, TypeError):
        _locale_encoding = None

50 

51 

class InputError(OSError):
    """`OSError` variant raised by input classes (e.g. a source file
    that cannot be opened)."""

class OutputError(OSError):
    """`OSError` variant raised by output classes (e.g. a destination file
    that cannot be opened)."""

54 

55 

56def check_encoding(stream: TextIO, encoding: str) -> bool | None: 

57 """Test, whether the encoding of `stream` matches `encoding`. 

58 

59 Returns 

60 

61 :None: if `encoding` or `stream.encoding` are not a valid encoding 

62 argument (e.g. ``None``) or `stream.encoding is missing. 

63 :True: if the encoding argument resolves to the same value as `encoding`, 

64 :False: if the encodings differ. 

65 """ 

66 try: 

67 return codecs.lookup(stream.encoding) == codecs.lookup(encoding) 

68 except (LookupError, AttributeError, TypeError): 

69 return None 

70 

71 

def error_string(err: BaseException) -> str:
    """Format Exception `err` as "<class name>: <message>"."""
    return '%s: %s' % (type(err).__name__, err)

76 

77 

class Input(TransformSpec):
    """
    Abstract base class for input wrappers.

    Docutils input objects must provide a `read()` method that
    returns the source, typically as `str` instance.

    Inheriting `TransformSpec` allows input objects to add
    "transforms" and "unknown_reference_resolvers" to the "Transformer".
    (Optional for custom input objects since Docutils 0.19.)
    """

    component_type: Final = 'input'

    default_source_path: ClassVar[str | None] = None

    def __init__(
        self,
        source: str | TextIO | nodes.document | None = None,
        source_path: StrPath | None = None,
        encoding: str | Literal['unicode'] | None = 'utf-8',
        error_handler: str | None = 'strict',
    ) -> None:
        """
        :Parameters:
            - `source`: the source of input data.
            - `source_path`: a text reference to the source; if empty,
              `default_source_path` is used.
            - `encoding`: text encoding of the source, the pseudo encoding
              "unicode" (no decoding), or `None` (detect, see `decode()`).
            - `error_handler`: text decoding error handler;
              `None` is treated as 'strict' when decoding.
        """
        self.encoding = encoding
        """Text encoding for the input source."""

        self.error_handler = error_handler
        """Text decoding error handler."""

        self.source = source
        """The source of input data."""

        self.source_path = source_path
        """A text reference to the source."""

        if not source_path:
            self.source_path = self.default_source_path

        self.successful_encoding = None
        """The encoding that successfully decoded the source data."""

    def __repr__(self) -> str:
        return '%s: source=%r, source_path=%r' % (self.__class__, self.source,
                                                  self.source_path)

    def read(self) -> str:
        """Return input as `str`. Define in subclasses."""
        raise NotImplementedError

    def decode(self, data: str | bytes) -> str:
        """
        Decode `data` if required.

        Return Unicode `str` instances unchanged (nothing to decode).

        If `self.encoding` is None, determine encoding from data
        or try UTF-8 and the locale's preferred encoding.
        The client application should call ``locale.setlocale()`` at the
        beginning of processing::

            locale.setlocale(locale.LC_ALL, '')

        If `self.error_handler` is None, the 'strict' handler is used.

        Raise UnicodeError if unsuccessful.

        Provisional: encoding detection will be removed in Docutils 1.0.
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, str), ('input encoding is "unicode" '
                                           'but `data` is no `str` instance')
        if isinstance(data, str):
            # nothing to decode
            return data
        if self.encoding:
            # We believe the user/application when the encoding is
            # explicitly given.
            encoding_candidates = [self.encoding]
        else:
            data_encoding = self.determine_encoding_from_data(data)
            if data_encoding:
                # `data` declares its encoding with "magic comment" or BOM,
                encoding_candidates = [data_encoding]
            else:
                # Apply heuristics if the encoding is not specified.
                # Start with UTF-8, because that only matches
                # data that *IS* UTF-8:
                encoding_candidates = ['utf-8']
                # If UTF-8 fails, fall back to the locale's preferred encoding:
                if sys.version_info[:2] >= (3, 11):
                    fallback = locale.getencoding()
                else:
                    fallback = locale.getpreferredencoding(do_setlocale=False)
                if fallback and fallback.lower() != 'utf-8':
                    encoding_candidates.append(fallback)
        # `str()` rejects ``None`` as `errors` argument (TypeError), but
        # ``None`` is an accepted value for `error_handler`.
        error_handler = self.error_handler or 'strict'
        error = None
        for enc in encoding_candidates:
            try:
                decoded = str(data, enc, error_handler)
            except (UnicodeError, LookupError) as err:
                # keep exception instance for use outside of the "for" loop.
                error = err
            else:
                self.successful_encoding = enc
                return decoded
        raise UnicodeError(
            'Unable to decode input data. Tried the following encodings: '
            f'{", ".join(repr(enc) for enc in encoding_candidates)}.\n'
            f'({error_string(error)})') from error

    coding_slug: ClassVar[re.Pattern[bytes]] = re.compile(
        br'coding[:=]\s*([-\w.]+)'
    )
    """Encoding declaration pattern."""

    byte_order_marks: ClassVar[tuple[tuple[bytes, str], ...]] = (
        (codecs.BOM_UTF32_BE, 'utf-32'),
        (codecs.BOM_UTF32_LE, 'utf-32'),
        (codecs.BOM_UTF8, 'utf-8-sig'),
        (codecs.BOM_UTF16_BE, 'utf-16'),
        (codecs.BOM_UTF16_LE, 'utf-16'),
    )
    """Sequence of (start_bytes, encoding) tuples for encoding detection.
    The first bytes of input data are checked against the start_bytes strings.
    A match indicates the given encoding."""

    def determine_encoding_from_data(self, data: bytes) -> str | None:
        """
        Try to determine the encoding of `data` by looking *in* `data`.
        Check for a byte order mark (BOM) or an encoding declaration.

        Return the encoding name or None if nothing was found.
        """
        # check for a byte order mark:
        for start_bytes, encoding in self.byte_order_marks:
            if data.startswith(start_bytes):
                return encoding
        # check for an encoding declaration pattern in first 2 lines of file:
        for line in data.splitlines()[:2]:
            match = self.coding_slug.search(line)
            if match:
                return match.group(1).decode('ascii')
        return None

    def isatty(self) -> bool:
        """Return True, if the input source is connected to a TTY device."""
        try:
            return self.source.isatty()
        except AttributeError:
            return False

222 

223 

class Output(TransformSpec):
    """
    Abstract base class for output wrappers.

    Docutils output objects must provide a `write()` method that
    expects and handles one argument (the output).

    Inheriting `TransformSpec` allows output objects to add
    "transforms" and "unknown_reference_resolvers" to the "Transformer".
    (Optional for custom output objects since Docutils 0.19.)
    """

    component_type: Final = 'output'

    default_destination_path: ClassVar[str | None] = None

    def __init__(
        self,
        destination: TextIO | str | bytes | None = None,
        destination_path: StrPath | None = None,
        encoding: str | None = None,
        error_handler: str | None = 'strict',
    ) -> None:
        """
        :Parameters:
            - `destination`: the destination for output data.
            - `destination_path`: a text reference to the destination;
              if empty, `default_destination_path` is used.
            - `encoding`: text encoding for the output destination.
            - `error_handler`: text encoding error handler
              (`None` is replaced by 'strict').
        """
        self.encoding: str | None = encoding
        """Text encoding for the output destination."""

        self.error_handler: str = error_handler or 'strict'
        """Text encoding error handler."""

        self.destination: TextIO | str | bytes | None = destination
        """The destination for output data."""

        self.destination_path: StrPath | None = destination_path
        """A text reference to the destination."""

        if not destination_path:
            self.destination_path = self.default_destination_path

    def __repr__(self) -> str:
        return ('%s: destination=%r, destination_path=%r'
                % (self.__class__, self.destination, self.destination_path))

    def write(self, data: str | bytes) -> str | bytes | None:
        """Write `data`. Define in subclasses."""
        raise NotImplementedError

    def encode(self, data: str | bytes) -> str | bytes:
        """
        Encode and return `data`.

        If `data` is a `bytes` instance, it is returned unchanged.
        Otherwise it is encoded with `self.encoding`
        (UTF-8 if `self.encoding` is not set).

        Provisional: If `self.encoding` is set to the pseudo encoding name
        "unicode", `data` must be a `str` instance and is returned unchanged.
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, str), ('output encoding is "unicode" '
                                           'but `data` is no `str` instance')
            return data
        if not isinstance(data, str):
            # Non-unicode (e.g. bytes) output.
            return data
        # `str.encode()` rejects ``None`` as encoding (TypeError), but
        # ``None`` is the default value of `self.encoding`.
        return data.encode(self.encoding or 'utf-8', self.error_handler)

289 

290 

291class ErrorOutput: 

292 """ 

293 Wrapper class for file-like error streams with 

294 failsafe de- and encoding of `str`, `bytes`, and `Exception` instances. 

295 """ 

296 

297 def __init__( 

298 self, 

299 destination: TextIO | BinaryIO | str | Literal[False] | None = None, 

300 encoding: str | None = None, 

301 encoding_errors: str = 'backslashreplace', 

302 decoding_errors: str = 'replace', 

303 ) -> None: 

304 """ 

305 :Parameters: 

306 - `destination`: a file-like object, 

307 a string (path to a file), 

308 `None` (write to `sys.stderr`, default), or 

309 evaluating to `False` (write() requests are ignored). 

310 - `encoding`: `destination` text encoding. Guessed if None. 

311 - `encoding_errors`: how to treat encoding errors. 

312 """ 

313 if destination is None: 

314 destination = sys.stderr 

315 elif not destination: 

316 destination = False 

317 # if `destination` is a file name, open it 

318 elif isinstance(destination, str): 

319 destination = open(destination, 'w') 

320 

321 self.destination: TextIO | BinaryIO | Literal[False] = destination 

322 """Where warning output is sent.""" 

323 

324 self.encoding: str = ( 

325 encoding 

326 or getattr(destination, 'encoding', None) 

327 or _locale_encoding 

328 or 'ascii' 

329 ) 

330 """The output character encoding.""" 

331 

332 self.encoding_errors: str = encoding_errors 

333 """Encoding error handler.""" 

334 

335 self.decoding_errors: str = decoding_errors 

336 """Decoding error handler.""" 

337 

338 def write(self, data: str | bytes | Exception) -> None: 

339 """ 

340 Write `data` to self.destination. Ignore, if self.destination is False. 

341 

342 `data` can be a `bytes`, `str`, or `Exception` instance. 

343 """ 

344 if not self.destination: 

345 return 

346 if isinstance(data, Exception): 

347 data = str(data) 

348 # The destination is either opened in text or binary mode. 

349 # If data has the wrong type, try to convert it. 

350 try: 

351 self.destination.write(data) 

352 except UnicodeEncodeError: 

353 # Encoding data from string to bytes failed with the 

354 # destination's encoding and error handler. 

355 # Try again with our own encoding and error handler. 

356 binary = data.encode(self.encoding, self.encoding_errors) 

357 self.destination.write(binary) 

358 except TypeError: 

359 if isinstance(data, str): # destination may expect bytes 

360 binary = data.encode(self.encoding, self.encoding_errors) 

361 self.destination.write(binary) 

362 elif self.destination in (sys.stderr, sys.stdout): 

363 # write bytes to raw stream 

364 self.destination.buffer.write(data) 

365 else: 

366 # destination in text mode, write str 

367 string = data.decode(self.encoding, self.decoding_errors) 

368 self.destination.write(string) 

369 

370 def close(self) -> None: 

371 """ 

372 Close the error-output stream. 

373 

374 Ignored if the destination is` sys.stderr` or `sys.stdout` or has no 

375 close() method. 

376 """ 

377 if self.destination in (sys.stdout, sys.stderr): 

378 return 

379 try: 

380 self.destination.close() 

381 except AttributeError: 

382 pass 

383 

384 def isatty(self) -> bool: 

385 """Return True, if the destination is connected to a TTY device.""" 

386 try: 

387 return self.destination.isatty() 

388 except AttributeError: 

389 return False 

390 

391 

class FileInput(Input):

    """
    Input for single, simple file-like objects.
    """
    def __init__(
        self,
        source: TextIO | None = None,
        source_path: StrPath | None = None,
        encoding: str | Literal['unicode'] | None = 'utf-8',
        error_handler: str | None = 'strict',
        autoclose: bool = True,
        mode: Literal['r', 'rb', 'br'] = 'r'
    ) -> None:
        """
        :Parameters:
            - `source`: either a file-like object (which is read directly), or
              `None` (which implies `sys.stdin` if no `source_path` given).
            - `source_path`: a path to a file, which is opened for reading.
            - `encoding`: the expected text encoding of the input file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after read (except when
              `sys.stdin` is the source).
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is read only ('r').

        Raise `InputError` if the file at `source_path` cannot be opened and
        `UnicodeError` if `source` is opened with an encoding that differs
        from `encoding`.
        """
        super().__init__(source, source_path, encoding, error_handler)
        self.autoclose = autoclose
        self._stderr = ErrorOutput()

        if source is None:
            if source_path:
                if 'b' in mode:
                    # `open()` raises ValueError if "encoding" or "errors"
                    # is passed in binary mode. The data is decoded
                    # in `read()` instead.
                    kwargs = {}
                else:
                    kwargs = {'encoding': self.encoding,
                              'errors': self.error_handler}
                try:
                    self.source = open(source_path, mode, **kwargs)
                except OSError as error:
                    raise InputError(error.errno, error.strerror,
                                     source_path) from error
            else:
                self.source = sys.stdin
        elif check_encoding(self.source, self.encoding) is False:
            # TODO: re-open, warn or raise error?
            raise UnicodeError('Encoding clash: encoding given is "%s" '
                               'but source is opened with encoding "%s".' %
                               (self.encoding, self.source.encoding))
        if not source_path:
            try:
                self.source_path = self.source.name
            except AttributeError:
                pass

    def read(self) -> str:
        """
        Read and decode a single file, return as `str`.

        `bytes` read from the source are decoded with `Input.decode()`
        and their newlines are normalized to "\\n".
        The source is closed afterwards if `self.autoclose` is true
        (cf. `close()`).
        """
        try:
            if not self.encoding and hasattr(self.source, 'buffer'):
                # read as binary data, decode with heuristics
                data = self.source.buffer.read()
                binary = True
            else:
                data = self.source.read()
                # sources opened in binary mode return `bytes`
                binary = isinstance(data, bytes)
            if binary:
                data = self.decode(data)
                # normalize newlines
                data = '\n'.join(data.splitlines()+[''])
        finally:
            if self.autoclose:
                self.close()
        return data

    def readlines(self) -> list[str]:
        """
        Return lines of a single file as list of strings.
        """
        return self.read().splitlines(True)

    def close(self) -> None:
        """Close the source unless it is `sys.stdin`."""
        if self.source is not sys.stdin:
            self.source.close()

471 

472 

class FileOutput(Output):

    """Output for single, simple file-like objects."""

    default_destination_path: Final = '<file>'

    mode: Literal['w', 'a', 'x', 'wb', 'ab', 'xb', 'bw', 'ba', 'bx'] = 'w'
    """The mode argument for `open()`."""
    # 'wb' for binary (e.g. OpenOffice) files (see also `BinaryFileOutput`).
    # (Do not use binary mode ('wb') for text files, as this prevents the
    # conversion of newlines to the system specific default.)

    def __init__(self,
                 destination: TextIO | None = None,
                 destination_path: StrPath | None = None,
                 encoding: str | None = None,
                 error_handler: str | None = 'strict',
                 autoclose: bool = True,
                 handle_io_errors: None = None,
                 mode=None,
                 ) -> None:
        """
        :Parameters:
            - `destination`: either a file-like object (which is written
              directly) or `None` (which implies `sys.stdout` if no
              `destination_path` given).
            - `destination_path`: a path to a file, which is opened and then
              written.
            - `encoding`: the text encoding of the output file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after write (except when
              `sys.stdout` or `sys.stderr` is the destination).
            - `handle_io_errors`: ignored, deprecated, will be removed.
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is 'w', providing universal newline
              support for text files.
        """
        super().__init__(
            destination, destination_path, encoding, error_handler)
        self.opened = True
        self.autoclose = autoclose
        if handle_io_errors is not None:
            warnings.warn('io.FileOutput: init argument "handle_io_errors" '
                          'is ignored and will be removed in '
                          'Docutils 2.0.', DeprecationWarning, stacklevel=2)
        if mode is not None:
            self.mode = mode
        self._stderr = ErrorOutput()
        if destination is None:
            if destination_path:
                # the file is opened on the first `write()`
                self.opened = False
            else:
                self.destination = sys.stdout
        elif (  # destination is file-type object -> check mode:
              mode and hasattr(self.destination, 'mode')
              and mode != self.destination.mode):
            print('Warning: Destination mode "%s" differs from specified '
                  'mode "%s"' % (self.destination.mode, mode),
                  file=self._stderr)
        if not destination_path:
            try:
                self.destination_path = self.destination.name
            except AttributeError:
                pass

    def open(self) -> None:
        """Open the file at `self.destination_path` with `self.mode`.

        Raise `OutputError` if the file cannot be opened.
        """
        # Specify encoding
        if 'b' not in self.mode:
            kwargs = {'encoding': self.encoding,
                      'errors': self.error_handler}
        else:
            kwargs = {}
        try:
            self.destination = open(self.destination_path, self.mode, **kwargs)
        except OSError as error:
            raise OutputError(error.errno, error.strerror,
                              self.destination_path) from error
        self.opened = True

    def write(self, data: str | bytes) -> str | bytes:
        """Write `data` to a single file, also return it.

        `data` can be a `str` or `bytes` instance.
        If writing `bytes` fails, an attempt is made to write to
        the low-level interface ``self.destination.buffer``.

        If `data` is a `str` instance and `self.encoding` and
        `self.destination.encoding` are set to different values, `data`
        is encoded to a `bytes` instance using `self.encoding`.

        Raise `TypeError` if the destination does not accept `data`
        (e.g. `str` data for a destination in binary mode).

        Provisional: future versions may raise an error if `self.encoding`
        and `self.destination.encoding` are set to different values.
        """
        if not self.opened:
            self.open()
        if (isinstance(data, str)
            and check_encoding(self.destination, self.encoding) is False):
            if os.linesep != '\n':
                data = data.replace('\n', os.linesep)  # fix endings
            data = self.encode(data)

        try:
            self.destination.write(data)
        except TypeError as err:
            if not isinstance(data, bytes):
                # Nothing was written: report the failure instead of
                # silently dropping the data.
                raise
            try:
                self.destination.buffer.write(data)
            except AttributeError:
                if check_encoding(self.destination,
                                  self.encoding) is False:
                    raise ValueError(
                        f'Encoding of {self.destination_path} '
                        f'({self.destination.encoding}) differs \n'
                        f' from specified encoding ({self.encoding})')
                else:
                    raise err
        except (UnicodeError, LookupError) as err:
            raise UnicodeError(
                'Unable to encode output data. output-encoding is: '
                f'{self.encoding}.\n({error_string(err)})')
        finally:
            if self.autoclose:
                self.close()
        return data

    def close(self) -> None:
        """Close the destination unless it is `sys.stdout` or `sys.stderr`.

        Nothing is done if no destination has been opened yet.
        """
        if self.destination is None:
            return
        if self.destination not in (sys.stdout, sys.stderr):
            self.destination.close()
            self.opened = False

602 

603 

class BinaryFileOutput(FileOutput):
    """
    `FileOutput` variant that opens its destination in binary mode ('wb').

    Deprecated. Use `FileOutput` (works with `bytes` since Docutils 0.20).
    Will be removed in Docutils 0.24.
    """
    # Used by core.publish_cmdline_to_binary() which is also deprecated.
    mode = 'wb'

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Emit a `DeprecationWarning`, then set up like `FileOutput`."""
        message = ('"BinaryFileOutput" is obsoleted by "FileOutput"'
                   ' and will be removed in Docutils 0.24.')
        warnings.warn(message, DeprecationWarning, stacklevel=2)
        super().__init__(*args, **kwargs)

619 

620 

class StringInput(Input):
    """Input wrapper around a `str` or `bytes` instance."""

    default_source_path: Final = '<string>'

    source: str | bytes

    def read(self) -> str:
        """Return `self.source` as `str` instance.

        `bytes` are decoded, `str` is passed through (see `Input.decode`).
        """
        text = self.decode(self.source)
        return text

634 

635 

class StringOutput(Output):
    """Output wrapper that keeps the result as `bytes` or `str` instance.

    Provisional.
    """

    default_destination_path: Final = '<string>'

    destination: str | bytes

    def write(self, data: str | bytes) -> str | bytes:
        """Store `data` in `self.destination`, and return it.

        With the pseudo encoding name "unicode" in `self.encoding`,
        `data` must be a `str` instance; it is stored and returned
        unchanged (cf. `Output.encode`).

        With any other encoding, `data` may be a `bytes` or `str` instance;
        the stored/returned value is a `bytes` instance
        (`str` data is encoded with `self.encode()`).

        Attention: the `output_encoding`_ setting may affect the content
        of the output (e.g. an encoding declaration in HTML or XML or the
        representation of characters as LaTeX macro vs. literal character).
        """
        result = self.encode(data)
        self.destination = result
        return result

663 

664 

class NullInput(Input):

    """Input stub without a source: reading yields no data."""

    default_source_path: Final = 'null input'

    source: None

    def read(self) -> str:
        """Return the empty string (there is nothing to read)."""
        nothing = ''
        return nothing

676 

677 

class NullOutput(Output):

    """Output stub without a destination: written data is discarded."""

    default_destination_path: Final = 'null output'

    destination: None

    def write(self, data: str | bytes) -> None:
        """Discard `data` and return None."""
        return None

688 

689 

class DocTreeInput(Input):

    """
    Input wrapper for an existing document tree.

    The document tree must be passed in the ``source`` parameter.
    """

    default_source_path: Final = 'doctree input'

    source: nodes.document

    def read(self) -> nodes.document:
        """Return the document tree stored in `self.source`."""
        document = self.source
        return document