Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/docutils/io.py: 32%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# $Id$
2# Author: David Goodger <goodger@python.org>
3# Copyright: This module has been placed in the public domain.
5"""
6I/O classes provide a uniform API for low-level input and output. Subclasses
7exist for a variety of input/output mechanisms.
8"""
10from __future__ import annotations
12__docformat__ = 'reStructuredText'
14import codecs
15import locale
16import os
17import re
18import sys
19import warnings
20from typing import TYPE_CHECKING
22from docutils import TransformSpec
24if TYPE_CHECKING:
25 from typing import Any, BinaryIO, ClassVar, Final, Literal, TextIO
27 from docutils import nodes
28 from docutils.nodes import StrPath
# Guess the locale's preferred encoding at import time and store its
# lower-cased name in the module-level `_locale_encoding`.
# If no valid guess can be made, `_locale_encoding` is set to `None`.
#
# TODO: check whether this is set correctly with every OS and Python version
#       or whether front-end tools need to call `locale.setlocale()`
#       before importing this module
try:
    # Return locale encoding also in UTF-8 mode.
    # Any warning issued by the `locale` calls is suppressed for the
    # duration of the lookup only.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # `.lower()` raises AttributeError if both lookups yield `None`;
        # that case is handled by the catch-all below.
        _locale_encoding: str | None = (locale.getlocale()[1]
                                        or locale.getdefaultlocale()[1]
                                        ).lower()
except:  # NoQA: E722 (catchall)
    # Any problem determining the locale: use None
    _locale_encoding = None
# Discard the guess unless it names a codec known to Python
# (`codecs.lookup(None)` raises TypeError, an unknown name LookupError).
try:
    codecs.lookup(_locale_encoding)
except (LookupError, TypeError):
    _locale_encoding = None
class InputError(OSError):
    """`OSError` subclass raised when an input source cannot be opened."""
class OutputError(OSError):
    """`OSError` subclass raised when an output destination cannot be opened."""
def check_encoding(stream: TextIO, encoding: str) -> bool | None:
    """Compare the encoding of `stream` with `encoding`.

    Both names are resolved with `codecs.lookup()`, so aliases of the
    same codec compare equal.

    Returns

    :None:  if `encoding` or `stream.encoding` is not a valid encoding
            argument (e.g. ``None``) or `stream.encoding` is missing,
    :True:  if `stream.encoding` resolves to the same codec as `encoding`,
    :False: if the encodings differ.
    """
    try:
        stream_codec = codecs.lookup(stream.encoding)
        wanted_codec = codecs.lookup(encoding)
    except (LookupError, AttributeError, TypeError):
        # unknown codec name, no `encoding` attribute, or non-string value
        return None
    return stream_codec == wanted_codec
def error_string(err: BaseException) -> str:
    """Format exception `err` as ``"<class name>: <message>"``."""
    kind = err.__class__.__name__
    return '{}: {}'.format(kind, err)
class Input(TransformSpec):
    """
    Abstract base class for input wrappers.

    Docutils input objects must provide a `read()` method that
    returns the source, typically as `str` instance.

    Inheriting `TransformSpec` allows input objects to add
    "transforms" and "unknown_reference_resolvers" to the "Transformer".
    (Optional for custom input objects since Docutils 0.19.)
    """

    component_type: Final = 'input'

    default_source_path: ClassVar[str | None] = None

    def __init__(
        self,
        source: str | TextIO | nodes.document | None = None,
        source_path: StrPath | None = None,
        encoding: str | Literal['unicode'] | None = 'utf-8',
        error_handler: str | None = 'strict',
    ) -> None:
        """
        :Parameters:
            - `source`: the input data or an object providing it.
            - `source_path`: a text reference to the source; falls back
              to the class attribute `default_source_path` if empty.
            - `encoding`: text encoding of the source, the pseudo encoding
              "unicode", or `None` (auto-detect in `decode()`).
            - `error_handler`: text decoding error handler.
        """
        self.encoding = encoding
        """Text encoding for the input source."""

        self.error_handler = error_handler
        """Text decoding error handler."""

        self.source = source
        """The source of input data."""

        self.source_path = source_path
        """A text reference to the source."""

        if not source_path:
            self.source_path = self.default_source_path

        self.successful_encoding = None
        """The encoding that successfully decoded the source data."""

    def __repr__(self) -> str:
        return '%s: source=%r, source_path=%r' % (self.__class__, self.source,
                                                  self.source_path)

    def read(self) -> str:
        """Return input as `str`. Define in subclasses."""
        raise NotImplementedError

    def decode(self, data: str | bytes) -> str:
        """
        Decode `data` if required.

        Return Unicode `str` instances unchanged (nothing to decode).

        If `self.encoding` is None, determine encoding from data
        or try UTF-8 and the locale's preferred encoding.
        The client application should call ``locale.setlocale()`` at the
        beginning of processing::

            locale.setlocale(locale.LC_ALL, '')

        On success, the encoding used is stored in
        `self.successful_encoding`.

        Raise UnicodeError if unsuccessful.

        Provisional: encoding detection will be removed in Docutils 1.0.
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, str), ('input encoding is "unicode" '
                                           'but `data` is no `str` instance')
        if isinstance(data, str):
            # nothing to decode
            return data
        if self.encoding:
            # We believe the user/application when the encoding is
            # explicitly given.
            encoding_candidates = [self.encoding]
        else:
            data_encoding = self.determine_encoding_from_data(data)
            if data_encoding:
                # `data` declares its encoding with "magic comment" or BOM,
                encoding_candidates = [data_encoding]
            else:
                # Apply heuristics if the encoding is not specified.
                # Start with UTF-8, because that only matches
                # data that *IS* UTF-8:
                encoding_candidates = ['utf-8']
                # If UTF-8 fails, fall back to the locale's preferred encoding:
                if sys.version_info[:2] >= (3, 11):
                    fallback = locale.getencoding()
                else:
                    fallback = locale.getpreferredencoding(do_setlocale=False)
                if fallback and fallback.lower() != 'utf-8':
                    encoding_candidates.append(fallback)
        # `error_handler` may be None (see `__init__` signature) but
        # `str(bytes, encoding, errors)` requires a string: use the
        # default handler in this case (same fallback as in `Output`).
        error_handler = self.error_handler or 'strict'
        for enc in encoding_candidates:
            try:
                decoded = str(data, enc, error_handler)
                self.successful_encoding = enc
                return decoded
            except (UnicodeError, LookupError) as err:
                # keep exception instance for use outside of the "for" loop.
                error = err
        raise UnicodeError(
            'Unable to decode input data. Tried the following encodings: '
            f'{", ".join(repr(enc) for enc in encoding_candidates)}.\n'
            f'({error_string(error)})')

    coding_slug: ClassVar[re.Pattern[bytes]] = re.compile(
        br'coding[:=]\s*([-\w.]+)'
    )
    """Encoding declaration pattern."""

    byte_order_marks: ClassVar[tuple[tuple[bytes, str], ...]] = (
        (codecs.BOM_UTF32_BE, 'utf-32'),
        (codecs.BOM_UTF32_LE, 'utf-32'),
        (codecs.BOM_UTF8, 'utf-8-sig'),
        (codecs.BOM_UTF16_BE, 'utf-16'),
        (codecs.BOM_UTF16_LE, 'utf-16'),
    )
    """Sequence of (start_bytes, encoding) tuples for encoding detection.
    The first bytes of input data are checked against the start_bytes strings.
    A match indicates the given encoding."""

    def determine_encoding_from_data(self, data: bytes) -> str | None:
        """
        Try to determine the encoding of `data` by looking *in* `data`.
        Check for a byte order mark (BOM) or an encoding declaration.

        Return the encoding name or `None` if nothing was found.
        """
        # check for a byte order mark:
        for start_bytes, encoding in self.byte_order_marks:
            if data.startswith(start_bytes):
                return encoding
        # check for an encoding declaration pattern in first 2 lines of file:
        for line in data.splitlines()[:2]:
            match = self.coding_slug.search(line)
            if match:
                return match.group(1).decode('ascii')
        return None

    def isatty(self) -> bool:
        """Return True, if the input source is connected to a TTY device."""
        try:
            return self.source.isatty()
        except AttributeError:
            # source has no `isatty()` method (e.g. a string)
            return False
class Output(TransformSpec):
    """
    Abstract base class for output wrappers.

    Docutils output objects must provide a `write()` method that
    expects and handles one argument (the output).

    Inheriting `TransformSpec` allows output objects to add
    "transforms" and "unknown_reference_resolvers" to the "Transformer".
    (Optional for custom output objects since Docutils 0.19.)
    """

    component_type: Final = 'output'

    default_destination_path: ClassVar[str | None] = None

    def __init__(
        self,
        destination: TextIO | str | bytes | None = None,
        destination_path: StrPath | None = None,
        encoding: str | None = None,
        error_handler: str | None = 'strict',
    ) -> None:
        """Store destination, path, encoding and error handler.

        An empty `error_handler` is replaced by 'strict', an empty
        `destination_path` by the class default.
        """
        self.encoding: str | None = encoding
        """Text encoding for the output destination."""

        self.error_handler: str = error_handler or 'strict'
        """Text encoding error handler."""

        self.destination: TextIO | str | bytes | None = destination
        """The destination for output data."""

        self.destination_path: StrPath | None = (
            destination_path or self.default_destination_path)
        """A text reference to the destination."""

    def __repr__(self) -> str:
        details = (self.__class__, self.destination, self.destination_path)
        return '%s: destination=%r, destination_path=%r' % details

    def write(self, data: str | bytes) -> str | bytes | None:
        """Write `data`. Define in subclasses."""
        raise NotImplementedError

    def encode(self, data: str | bytes) -> str | bytes:
        """
        Return `data`, encoded if required.

        `str` data is encoded with `self.encoding` and `self.error_handler`;
        anything else (e.g. `bytes`) is passed through untouched.

        Provisional: If `self.encoding` is set to the pseudo encoding name
        "unicode", `data` must be a `str` instance and is returned unchanged.
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, str), ('output encoding is "unicode" '
                                           'but `data` is no `str` instance')
            return data
        if isinstance(data, str):
            return data.encode(self.encoding, self.error_handler)
        # Non-unicode (e.g. bytes) output.
        return data
class ErrorOutput:
    """
    Wrapper class for file-like error streams with
    failsafe de- and encoding of `str`, `bytes`, and `Exception` instances.
    """

    def __init__(
        self,
        destination: TextIO | BinaryIO | str | Literal[False] | None = None,
        encoding: str | None = None,
        encoding_errors: str = 'backslashreplace',
        decoding_errors: str = 'replace',
    ) -> None:
        """
        :Parameters:
            - `destination`: a file-like object,
                        a string (path to a file),
                        `None` (write to `sys.stderr`, default), or
                        evaluating to `False` (write() requests are ignored).
            - `encoding`: `destination` text encoding. Guessed if None.
            - `encoding_errors`: how to treat encoding errors.
            - `decoding_errors`: how to treat decoding errors.
        """
        if destination is None:
            destination = sys.stderr
        elif not destination:
            destination = False
        # if `destination` is a file name, open it
        elif isinstance(destination, str):
            destination = open(destination, 'w')

        self.destination: TextIO | BinaryIO | Literal[False] = destination
        """Where warning output is sent."""

        self.encoding: str = (
            encoding
            or getattr(destination, 'encoding', None)
            or _locale_encoding
            or 'ascii'
        )
        """The output character encoding."""

        self.encoding_errors: str = encoding_errors
        """Encoding error handler."""

        self.decoding_errors: str = decoding_errors
        """Decoding error handler."""

    def write(self, data: str | bytes | Exception) -> None:
        """
        Write `data` to self.destination. Ignore, if self.destination is False.

        `data` can be a `bytes`, `str`, or `Exception` instance.

        If the destination rejects `data` (wrong type or characters that
        cannot be encoded), `data` is converted with `self.encoding` and
        the configured error handlers and the write is retried.
        """
        if not self.destination:
            return
        if isinstance(data, Exception):
            data = str(data)
        # The destination is either opened in text or binary mode.
        # If data has the wrong type, try to convert it.
        try:
            self.destination.write(data)
        except UnicodeEncodeError:
            # Encoding data from string to bytes failed with the
            # destination's encoding and error handler.
            # Try again with our own encoding and error handler.
            binary = data.encode(self.encoding, self.encoding_errors)
            try:
                self.destination.write(binary)
            except TypeError:
                # Text-mode stream: it does not accept bytes.
                # Write text that is safe for the stream's own encoding
                # (unencodable characters replaced by `encoding_errors`).
                stream_encoding = (getattr(self.destination, 'encoding', None)
                                   or self.encoding)
                escaped = data.encode(stream_encoding, self.encoding_errors
                                      ).decode(stream_encoding,
                                               self.decoding_errors)
                self.destination.write(escaped)
        except TypeError:
            if isinstance(data, str):  # destination may expect bytes
                binary = data.encode(self.encoding, self.encoding_errors)
                self.destination.write(binary)
            elif self.destination in (sys.stderr, sys.stdout):
                # write bytes to raw stream
                self.destination.buffer.write(data)
            else:
                # destination in text mode, write str
                string = data.decode(self.encoding, self.decoding_errors)
                self.destination.write(string)

    def close(self) -> None:
        """
        Close the error-output stream.

        Ignored if the destination is `sys.stderr` or `sys.stdout` or has no
        close() method.
        """
        if self.destination in (sys.stdout, sys.stderr):
            return
        try:
            self.destination.close()
        except AttributeError:
            pass

    def isatty(self) -> bool:
        """Return True, if the destination is connected to a TTY device."""
        try:
            return self.destination.isatty()
        except AttributeError:
            return False
class FileInput(Input):

    """
    Input for single, simple file-like objects.
    """
    def __init__(
        self,
        source: TextIO | None = None,
        source_path: StrPath | None = None,
        encoding: str | Literal['unicode'] | None = 'utf-8',
        error_handler: str | None = 'strict',
        autoclose: bool = True,
        mode: Literal['r', 'rb', 'br'] = 'r'
    ) -> None:
        """
        :Parameters:
            - `source`: either a file-like object (which is read directly), or
              `None` (which implies `sys.stdin` if no `source_path` given).
            - `source_path`: a path to a file, which is opened for reading.
            - `encoding`: the expected text encoding of the input file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after read (except when
              `sys.stdin` is the source).
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is read only ('r').

        Raise `InputError` if the file at `source_path` cannot be opened
        and `UnicodeError` if `source` is opened with an encoding that
        differs from `encoding`.
        """
        super().__init__(source, source_path, encoding, error_handler)
        self.autoclose = autoclose
        self._stderr = ErrorOutput()

        if source is None:
            if source_path:
                # `open()` raises ValueError if `encoding` or `errors` is
                # passed in binary mode: specify them for text mode only
                # (binary data is decoded in `read()`).
                if 'b' not in mode:
                    kwargs = {'encoding': self.encoding,
                              'errors': self.error_handler}
                else:
                    kwargs = {}
                try:
                    self.source = open(source_path, mode, **kwargs)
                except OSError as error:
                    raise InputError(error.errno, error.strerror, source_path)
            else:
                self.source = sys.stdin
        elif check_encoding(self.source, self.encoding) is False:
            # TODO: re-open, warn or raise error?
            raise UnicodeError('Encoding clash: encoding given is "%s" '
                               'but source is opened with encoding "%s".' %
                               (self.encoding, self.source.encoding))
        if not source_path:
            try:
                self.source_path = self.source.name
            except AttributeError:
                pass

    def read(self) -> str:
        """
        Read and decode a single file, return as `str`.

        Binary data (from a source opened in binary mode or read from the
        source's `buffer` if no encoding is specified) is decoded with
        `self.decode()` and its newlines are normalized.

        The source is closed afterwards if `self.autoclose` is true.
        """
        try:
            if not self.encoding and hasattr(self.source, 'buffer'):
                # read as binary data
                data = self.source.buffer.read()
                # decode with heuristics
                data = self.decode(data)
                # normalize newlines
                data = '\n'.join(data.splitlines()+[''])
            else:
                data = self.source.read()
                if isinstance(data, bytes):
                    # binary source: decode to honour the `str` contract
                    data = self.decode(data)
                    # normalize newlines
                    data = '\n'.join(data.splitlines()+[''])
        finally:
            if self.autoclose:
                self.close()
        return data

    def readlines(self) -> list[str]:
        """
        Return lines of a single file as list of strings.
        """
        return self.read().splitlines(True)

    def close(self) -> None:
        """Close the source unless it is `sys.stdin`."""
        if self.source is not sys.stdin:
            self.source.close()
class FileOutput(Output):

    """Output for single, simple file-like objects."""

    default_destination_path: Final = '<file>'

    mode: Literal['w', 'a', 'x', 'wb', 'ab', 'xb', 'bw', 'ba', 'bx'] = 'w'
    """The mode argument for `open()`."""
    # 'wb' for binary (e.g. OpenOffice) files (see also `BinaryFileOutput`).
    # (Do not use binary mode ('wb') for text files, as this prevents the
    # conversion of newlines to the system specific default.)

    def __init__(self,
                 destination: TextIO | None = None,
                 destination_path: StrPath | None = None,
                 encoding: str | None = None,
                 error_handler: str | None = 'strict',
                 autoclose: bool = True,
                 handle_io_errors: None = None,
                 mode=None,
                 ) -> None:
        """
        :Parameters:
            - `destination`: either a file-like object (which is written
              directly) or `None` (which implies `sys.stdout` if no
              `destination_path` given).
            - `destination_path`: a path to a file, which is opened and then
              written.
            - `encoding`: the text encoding of the output file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after write (except when
              `sys.stdout` or `sys.stderr` is the destination).
            - `handle_io_errors`: ignored, deprecated, will be removed.
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is 'w', providing universal newline
              support for text files.
        """
        super().__init__(
            destination, destination_path, encoding, error_handler)
        self.opened = True
        self.autoclose = autoclose
        if handle_io_errors is not None:
            warnings.warn('io.FileOutput: init argument "handle_io_errors" '
                          'is ignored and will be removed in '
                          'Docutils 2.0.', DeprecationWarning, stacklevel=2)
        if mode is not None:
            self.mode = mode
        self._stderr = ErrorOutput()
        if destination is None:
            if destination_path:
                # the file is opened on demand (see `write()`)
                self.opened = False
            else:
                self.destination = sys.stdout
        elif (  # destination is file-type object -> check mode:
              mode and hasattr(self.destination, 'mode')
              and mode != self.destination.mode):
            print('Warning: Destination mode "%s" differs from specified '
                  'mode "%s"' % (self.destination.mode, mode),
                  file=self._stderr)
        if not destination_path:
            try:
                self.destination_path = self.destination.name
            except AttributeError:
                pass

    def open(self) -> None:
        """Open the file at `self.destination_path` with `self.mode`.

        Raise `OutputError` if the file cannot be opened.
        """
        # Specify encoding (not allowed in binary mode)
        if 'b' not in self.mode:
            kwargs = {'encoding': self.encoding,
                      'errors': self.error_handler}
        else:
            kwargs = {}
        try:
            self.destination = open(self.destination_path, self.mode, **kwargs)
        except OSError as error:
            raise OutputError(error.errno, error.strerror,
                              self.destination_path)
        self.opened = True

    def write(self, data: str | bytes) -> str | bytes:
        """Write `data` to a single file, also return it.

        `data` can be a `str` or `bytes` instance.
        If writing `bytes` fails, an attempt is made to write to
        the low-level interface ``self.destination.buffer``.

        If `data` is a `str` instance and `self.encoding` and
        `self.destination.encoding` are set to different values, `data`
        is encoded to a `bytes` instance using `self.encoding`.

        A `TypeError` raised by the destination for `str` data
        (e.g. a destination opened in binary mode) is propagated:
        the data cannot be written.

        Provisional: future versions may raise an error if `self.encoding`
        and `self.destination.encoding` are set to different values.
        """
        if not self.opened:
            self.open()
        if (isinstance(data, str)
            and check_encoding(self.destination, self.encoding) is False):
            if os.linesep != '\n':
                data = data.replace('\n', os.linesep)  # fix endings
            data = self.encode(data)

        try:
            self.destination.write(data)
        except TypeError as err:
            if not isinstance(data, bytes):
                # Nothing was written and there is no fallback for `str`:
                # report the failure instead of silently dropping `data`.
                raise
            try:
                self.destination.buffer.write(data)
            except AttributeError:
                if check_encoding(self.destination,
                                  self.encoding) is False:
                    raise ValueError(
                        f'Encoding of {self.destination_path} '
                        f'({self.destination.encoding}) differs \n'
                        f' from specified encoding ({self.encoding})')
                else:
                    raise err
        except (UnicodeError, LookupError) as err:
            raise UnicodeError(
                'Unable to encode output data. output-encoding is: '
                f'{self.encoding}.\n({error_string(err)})')
        finally:
            if self.autoclose:
                self.close()
        return data

    def close(self) -> None:
        """Close the destination unless it is `sys.stdout` or `sys.stderr`."""
        if self.destination not in (sys.stdout, sys.stderr):
            self.destination.close()
            self.opened = False
class BinaryFileOutput(FileOutput):
    """
    `FileOutput` variant that opens its destination in binary mode ('wb').

    Deprecated. Use `FileOutput` (works with `bytes` since Docutils 0.20).
    Will be removed in Docutils 0.24.
    """
    # Used by core.publish_cmdline_to_binary() which is also deprecated.
    mode = 'wb'

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Issue a `DeprecationWarning`, then set up as `FileOutput`."""
        message = ('"BinaryFileOutput" is obsoleted by "FileOutput"'
                   ' and will be removed in Docutils 0.24.')
        # stacklevel=2: attribute the warning to the instantiating code
        warnings.warn(message, DeprecationWarning, stacklevel=2)
        super().__init__(*args, **kwargs)
class StringInput(Input):
    """Input wrapper around an in-memory `str` or `bytes` instance."""

    source: str | bytes

    default_source_path: Final = '<string>'

    def read(self) -> str:
        """Return the source as `str` instance.

        The source is passed through `self.decode()`
        (see `Input.decode`).
        """
        text = self.decode(self.source)
        return text
class StringOutput(Output):
    """Output wrapper that keeps the result as a `bytes` or `str` instance.

    Provisional.
    """

    destination: str | bytes

    default_destination_path: Final = '<string>'

    def write(self, data: str | bytes) -> str | bytes:
        """Store `data` in `self.destination`, and return it.

        `data` is passed through `self.encode()` first (cf. `Output.encode`):

        * With the pseudo encoding name "unicode" in `self.encoding`,
          `data` must be a `str` instance and is stored/returned unchanged.

        * Otherwise, `data` can be a `bytes` or `str` instance and is
          stored/returned as a `bytes` instance.

        Attention: the `output_encoding`_ setting may affect the content
        of the output (e.g. an encoding declaration in HTML or XML or the
        representation of characters as LaTeX macro vs. literal character).
        """
        self.destination = self.encode(data)
        return self.destination
class NullInput(Input):

    """Degenerate input: `read()` always yields the empty string."""

    source: None

    default_source_path: Final = 'null input'

    def read(self) -> str:
        """Return an empty string; `self.source` is not consulted."""
        nothing = ''
        return nothing
class NullOutput(Output):

    """Degenerate output: `write()` discards its argument."""

    destination: None

    default_destination_path: Final = 'null output'

    def write(self, data: str | bytes) -> None:
        """Do nothing, return None."""
        return None
class DocTreeInput(Input):

    """
    Adapter that hands an existing document tree to the caller.

    The document tree must be passed in the ``source`` parameter.
    """

    source: nodes.document

    default_source_path: Final = 'doctree input'

    def read(self) -> nodes.document:
        """Return the document tree stored in `self.source`."""
        doctree = self.source
        return doctree