Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/docutils/io.py: 34%
268 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:06 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:06 +0000
# $Id$
# Author: David Goodger <goodger@python.org>
# Copyright: This module has been placed in the public domain.

"""
I/O classes provide a uniform API for low-level input and output.  Subclasses
exist for a variety of input/output mechanisms.
"""

__docformat__ = 'reStructuredText'
import codecs
import locale
import os
import re
import sys
import warnings

from docutils import TransformSpec
# Guess the locale's preferred encoding.
# If no valid guess can be made, _locale_encoding is set to `None`:
#
# TODO: check whether this is set correctly with every OS and Python version
#       or whether front-end tools need to call `locale.setlocale()`
#       before importing this module
try:
    # Return locale encoding also in UTF-8 mode
    with warnings.catch_warnings():
        # Silence any warning raised by the locale queries below.
        warnings.simplefilter("ignore")
        _locale_encoding = (locale.getlocale()[1]
                            or locale.getdefaultlocale()[1])
        # Raises AttributeError if both lookups returned None;
        # handled by the generic handler below.
        _locale_encoding = _locale_encoding.lower()
except ValueError as error:  # OS X may set UTF-8 without language code
    # See https://bugs.python.org/issue18378 fixed in 3.8
    # and https://sourceforge.net/p/docutils/bugs/298/.
    # Drop the special case after requiring Python >= 3.8
    if "unknown locale: UTF-8" in error.args:
        _locale_encoding = "utf-8"
    else:
        _locale_encoding = None
except Exception:  # any other problems determining the locale -> use None
    # `Exception` (not a bare ``except``) so that KeyboardInterrupt and
    # SystemExit raised during import are not swallowed.
    _locale_encoding = None
try:
    # Discard names that Python has no codec for
    # (TypeError: the guess is None).
    codecs.lookup(_locale_encoding)
except (LookupError, TypeError):
    _locale_encoding = None
class InputError(OSError):
    """Error raised by `FileInput` when the source file cannot be opened."""
class OutputError(OSError):
    """Error raised by `FileOutput` when the destination cannot be opened."""
def check_encoding(stream, encoding):
    """Test, whether the encoding of `stream` matches `encoding`.

    Both names are resolved with `codecs.lookup()`, so aliases of the
    same codec (e.g. "utf8" and "UTF-8") compare equal.

    Returns

    :None:  if `encoding` or `stream.encoding` are not a valid encoding
            argument (e.g. ``None``) or `stream.encoding` is missing.
    :True:  if `stream.encoding` resolves to the same codec as `encoding`,
    :False: if the encodings differ.
    """
    try:
        return codecs.lookup(stream.encoding) == codecs.lookup(encoding)
    except (LookupError, AttributeError, TypeError):
        # LookupError: unknown codec name,
        # AttributeError: `stream` has no `encoding` attribute,
        # TypeError: one of the names is not a string (e.g. None).
        return None
def error_string(err):
    """Return string representation of Exception `err`.

    The result has the form "<exception class name>: <message>".
    """
    return f'{err.__class__.__name__}: {err}'
class Input(TransformSpec):
    """
    Abstract base class for input wrappers.

    Docutils input objects must provide a `read()` method that
    returns the source, typically as `str` instance.

    Inheriting `TransformSpec` allows input objects to add
    "transforms" and "unknown_reference_resolvers" to the "Transformer".
    (Optional for custom input objects since Docutils 0.19.)
    """

    component_type = 'input'

    default_source_path = None

    def __init__(self, source=None, source_path=None, encoding=None,
                 error_handler='strict'):
        self.encoding = encoding
        """Text encoding for the input source."""

        self.error_handler = error_handler
        """Text decoding error handler."""

        self.source = source
        """The source of input data."""

        self.source_path = source_path
        """A text reference to the source."""

        if not source_path:
            self.source_path = self.default_source_path

        self.successful_encoding = None
        """The encoding that successfully decoded the source data."""

    def __repr__(self):
        return '%s: source=%r, source_path=%r' % (self.__class__, self.source,
                                                  self.source_path)

    def read(self):
        """Return input as `str`. Define in subclasses."""
        raise NotImplementedError

    def decode(self, data):
        """
        Decode `data` if required.

        Return Unicode `str` instances unchanged (nothing to decode).

        If `self.encoding` is None, determine encoding from data
        or try UTF-8, locale encoding, and (as last resort) 'latin-1'.
        The client application should call ``locale.setlocale`` at the
        beginning of processing::

            locale.setlocale(locale.LC_ALL, '')

        The encoding that worked is stored in `self.successful_encoding`.

        Raise UnicodeError if unsuccessful.

        Provisional (planned API changes, cf. the TODO notes in the code):
          - Raise UnicodeError (instead of falling back to the locale
            encoding) if decoding the source with the default encoding (UTF-8)
            fails and Python is started in `UTF-8 mode`.

            Raise UnicodeError (instead of falling back to "latin1") if both,
            default and locale encoding, fail.

          - Only remove BOM (U+FEFF ZWNBSP at start of data),
            no other ZWNBSPs.
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, str), ('input encoding is "unicode" '
                                           'but `data` is no `str` instance')
        if isinstance(data, str):
            # nothing to decode
            return data
        if self.encoding:
            # We believe the user/application when the encoding is
            # explicitly given.
            encoding_candidates = [self.encoding]
        else:
            data_encoding = self.determine_encoding_from_data(data)
            if data_encoding:
                # If the data declares its encoding (explicitly or via a BOM),
                # we believe it.
                encoding_candidates = [data_encoding]
            else:
                # Apply heuristics only if no encoding is explicitly given and
                # no BOM found.  Start with UTF-8, because that only matches
                # data that *IS* UTF-8:
                encoding_candidates = ['utf-8']
                # TODO: use `locale.getpreferredlocale(do_setlocale=True)`
                # to respect UTF-8 mode (API change).
                # (Check if it is a valid encoding and not UTF-8)
                if _locale_encoding and _locale_encoding != 'utf-8':
                    encoding_candidates.append(_locale_encoding)
                # TODO: don't fall back to 'latin-1' (API change).
                encoding_candidates.append('latin-1')
        for enc in encoding_candidates:
            try:
                decoded = str(data, enc, self.error_handler)
            except (UnicodeError, LookupError) as err:
                # keep exception instance for use outside of the "for" loop.
                error = err
                continue
            self.successful_encoding = enc
            # Return decoded, removing BOM and other ZWNBSPs.
            # TODO: only remove BOM (ZWNBSP at start of data)
            #       and only if 'self.encoding' is None. (API change)
            return decoded.replace('\ufeff', '')
        # Every candidate failed; report the last failure as the cause.
        raise UnicodeError(
            'Unable to decode input data.  Tried the following encodings: '
            f'{", ".join(repr(enc) for enc in encoding_candidates)}.\n'
            f'({error_string(error)})') from error

    coding_slug = re.compile(br"coding[:=]\s*([-\w.]+)")
    """Encoding declaration pattern."""

    byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'),
                        (codecs.BOM_UTF16_BE, 'utf-16-be'),
                        (codecs.BOM_UTF16_LE, 'utf-16-le'),)
    """Sequence of (start_bytes, encoding) tuples for encoding detection.
    The first bytes of input data are checked against the start_bytes strings.
    A match indicates the given encoding."""

    def determine_encoding_from_data(self, data):
        """
        Try to determine the encoding of `data` by looking *in* `data`.
        Check for a byte order mark (BOM) or an encoding declaration.

        Return the encoding name or None if neither is found.
        """
        # check for a byte order mark:
        for start_bytes, encoding in self.byte_order_marks:
            if data.startswith(start_bytes):
                return encoding
        # check for an encoding declaration pattern in first 2 lines of file:
        for line in data.splitlines()[:2]:
            match = self.coding_slug.search(line)
            if match:
                return match.group(1).decode('ascii')
        return None

    def isatty(self):
        """Return True, if the input source is connected to a TTY device."""
        try:
            return self.source.isatty()
        except AttributeError:
            # source has no `isatty()` method (e.g. a `str` or `None`)
            return False
class Output(TransformSpec):
    """
    Abstract base class for output wrappers.

    Docutils output objects must provide a `write()` method that
    expects and handles one argument (the output).

    Inheriting `TransformSpec` allows output objects to add
    "transforms" and "unknown_reference_resolvers" to the "Transformer".
    (Optional for custom output objects since Docutils 0.19.)
    """

    component_type = 'output'

    default_destination_path = None

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict'):
        self.encoding = encoding
        """Text encoding for the output destination."""

        # A false value (e.g. None or '') selects the 'strict' handler.
        self.error_handler = error_handler or 'strict'
        """Text encoding error handler."""

        self.destination = destination
        """The destination for output data."""

        self.destination_path = destination_path
        """A text reference to the destination."""

        if not destination_path:
            self.destination_path = self.default_destination_path

    def __repr__(self):
        return ('%s: destination=%r, destination_path=%r'
                % (self.__class__, self.destination, self.destination_path))

    def write(self, data):
        """Write `data`. Define in subclasses."""
        raise NotImplementedError

    def encode(self, data):
        """
        Encode and return `data`.

        If `data` is a `bytes` instance, it is returned unchanged.
        Otherwise it is encoded with `self.encoding`.

        Provisional: If `self.encoding` is set to the pseudo encoding name
        "unicode", `data` must be a `str` instance and is returned unchanged.
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, str), ('output encoding is "unicode" '
                                           'but `data` is no `str` instance')
            return data
        if not isinstance(data, str):
            # Non-unicode (e.g. bytes) output.
            return data
        return data.encode(self.encoding, self.error_handler)
class ErrorOutput:
    """
    Wrapper class for file-like error streams with
    failsafe de- and encoding of `str`, `bytes`, `unicode` and
    `Exception` instances.
    """

    def __init__(self, destination=None, encoding=None,
                 encoding_errors='backslashreplace',
                 decoding_errors='replace'):
        """
        :Parameters:
            - `destination`: a file-like object,
                        a string (path to a file),
                        `None` (write to `sys.stderr`, default), or
                        evaluating to `False` (write() requests are ignored).
            - `encoding`: `destination` text encoding. Guessed if None.
            - `encoding_errors`: how to treat encoding errors.
            - `decoding_errors`: how to treat decoding errors
              (used when `bytes` are written to a text destination).
        """
        if destination is None:
            destination = sys.stderr
        elif not destination:
            destination = False
        # if `destination` is a file name, open it
        elif isinstance(destination, str):
            destination = open(destination, 'w')

        self.destination = destination
        """Where warning output is sent."""

        # First true value of: argument, encoding of the destination,
        # locale encoding, 'ascii'.
        self.encoding = (encoding or getattr(destination, 'encoding', None)
                         or _locale_encoding or 'ascii')
        """The output character encoding."""

        self.encoding_errors = encoding_errors
        """Encoding error handler."""

        self.decoding_errors = decoding_errors
        """Decoding error handler."""

    def write(self, data):
        """
        Write `data` to self.destination. Ignore, if self.destination is False.

        `data` can be a `bytes`, `str`, or `Exception` instance.
        """
        if not self.destination:
            return
        if isinstance(data, Exception):
            data = str(data)
        try:
            self.destination.write(data)
        except UnicodeEncodeError:
            # Retry with explicitly encoded data.
            self.destination.write(data.encode(self.encoding,
                                               self.encoding_errors))
        except TypeError:
            if isinstance(data, str):    # destination may expect bytes
                self.destination.write(data.encode(self.encoding,
                                                   self.encoding_errors))
            elif self.destination in (sys.stderr, sys.stdout):
                # write bytes to raw stream
                self.destination.buffer.write(data)
            else:
                # `bytes` for a text destination: decode first
                self.destination.write(str(data, self.encoding,
                                           self.decoding_errors))

    def close(self):
        """
        Close the error-output stream.

        Ignored if the destination is `sys.stderr` or `sys.stdout` or has no
        close() method.
        """
        if self.destination in (sys.stdout, sys.stderr):
            return
        try:
            self.destination.close()
        except AttributeError:
            pass

    def isatty(self):
        """Return True, if the destination is connected to a TTY device."""
        try:
            return self.destination.isatty()
        except AttributeError:
            return False
class FileInput(Input):

    """
    Input for single, simple file-like objects.
    """
    def __init__(self, source=None, source_path=None,
                 encoding=None, error_handler='strict',
                 autoclose=True, mode='r'):
        """
        :Parameters:
            - `source`: either a file-like object (which is read directly), or
              `None` (which implies `sys.stdin` if no `source_path` given).
            - `source_path`: a path to a file, which is opened and then read.
            - `encoding`: the expected text encoding of the input file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after read (except when
              `sys.stdin` is the source).
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is read only ('r').

        Raise `InputError` if the file at `source_path` cannot be opened and
        `UnicodeError` if `encoding` differs from the encoding of `source`.
        """
        Input.__init__(self, source, source_path, encoding, error_handler)
        self.autoclose = autoclose
        self._stderr = ErrorOutput()

        if source is None:
            if source_path:
                try:
                    # Without explicit encoding, open as UTF-8
                    # (the "utf-8-sig" codec also skips a leading BOM).
                    self.source = open(source_path, mode,
                                       encoding=self.encoding or 'utf-8-sig',
                                       errors=self.error_handler)
                except OSError as error:
                    raise InputError(error.errno, error.strerror,
                                     source_path) from error
            else:
                self.source = sys.stdin
        elif check_encoding(self.source, self.encoding) is False:
            # TODO: re-open, warn or raise error?
            raise UnicodeError('Encoding clash: encoding given is "%s" '
                               'but source is opened with encoding "%s".' %
                               (self.encoding, self.source.encoding))
        if not source_path:
            try:
                self.source_path = self.source.name
            except AttributeError:
                pass

    def read(self):
        """
        Read and decode a single file and return the data (Unicode string).

        Line endings are normalised to "\\n" and the result ends with "\\n"
        unless it is empty.
        """
        try:
            if self.source is sys.stdin:
                # read as binary data to circumvent auto-decoding
                data = self.source.buffer.read()
            else:
                data = self.source.read()
        except (UnicodeError, LookupError):
            if not self.encoding and self.source_path:
                # re-read in binary mode and decode with heuristics
                # (`with` closes the file even if reading fails)
                with open(self.source_path, 'rb') as b_source:
                    data = b_source.read()
            else:
                raise
        finally:
            if self.autoclose:
                self.close()
        data = self.decode(data)
        # normalise newlines
        return '\n'.join(data.splitlines()+[''])

    def readlines(self):
        """
        Return lines of a single file as list of Unicode strings.
        """
        return self.read().splitlines(True)

    def close(self):
        """Close the source unless it is `sys.stdin`."""
        if self.source is not sys.stdin:
            self.source.close()
class FileOutput(Output):

    """Output for single, simple file-like objects."""

    default_destination_path = '<file>'

    mode = 'w'
    """The mode argument for `open()`."""
    # 'wb' for binary (e.g. OpenOffice) files (see also `BinaryFileOutput`).
    # (Do not use binary mode ('wb') for text files, as this prevents the
    # conversion of newlines to the system specific default.)

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict', autoclose=True,
                 handle_io_errors=None, mode=None):
        """
        :Parameters:
            - `destination`: either a file-like object (which is written
              directly) or `None` (which implies `sys.stdout` if no
              `destination_path` given).
            - `destination_path`: a path to a file, which is opened and then
              written.
            - `encoding`: the text encoding of the output file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after write (except when
              `sys.stdout` or `sys.stderr` is the destination).
            - `handle_io_errors`: ignored, deprecated, will be removed.
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is 'w', providing universal newline
              support for text files.
        """
        Output.__init__(self, destination, destination_path,
                        encoding, error_handler)
        self.opened = True
        self.autoclose = autoclose
        if handle_io_errors is not None:
            warnings.warn('io.FileOutput: init argument "handle_io_errors" '
                          'is ignored and will be removed in '
                          'Docutils 2.0.', DeprecationWarning, stacklevel=2)
        if mode is not None:
            self.mode = mode
        self._stderr = ErrorOutput()
        if destination is None:
            if destination_path:
                # The file is opened by the first call to `write()`.
                self.opened = False
            else:
                self.destination = sys.stdout
        elif (  # destination is file-type object -> check mode:
              mode and hasattr(self.destination, 'mode')
              and mode != self.destination.mode):
            print('Warning: Destination mode "%s" differs from specified '
                  'mode "%s"' % (self.destination.mode, mode),
                  file=self._stderr)
        if not destination_path:
            try:
                self.destination_path = self.destination.name
            except AttributeError:
                pass

    def open(self):
        """Open the file at `self.destination_path` with `self.mode`.

        Raise `OutputError` if the file cannot be opened.
        """
        # Specify encoding
        if 'b' not in self.mode:
            kwargs = {'encoding': self.encoding,
                      'errors': self.error_handler}
        else:
            kwargs = {}
        try:
            self.destination = open(self.destination_path, self.mode, **kwargs)
        except OSError as error:
            raise OutputError(error.errno, error.strerror,
                              self.destination_path) from error
        self.opened = True

    def write(self, data):
        """Write `data` to a single file, also return it.

        `data` can be a `str` or `bytes` instance.
        If writing `bytes` fails, an attempt is made to write to
        the low-level interface ``self.destination.buffer``.

        If `data` is a `str` instance and `self.encoding` and
        `self.destination.encoding` are set to different values, `data`
        is encoded to a `bytes` instance using `self.encoding`.

        Provisional: future versions may raise an error if `self.encoding`
        and `self.destination.encoding` are set to different values.
        """
        if not self.opened:
            self.open()
        if (isinstance(data, str)
            and check_encoding(self.destination, self.encoding) is False):
            if os.linesep != '\n':
                data = data.replace('\n', os.linesep)  # fix endings
            data = self.encode(data)

        try:
            self.destination.write(data)
        except TypeError as err:
            # NOTE(review): a TypeError for `str` data (e.g. written to a
            # binary destination) is silently ignored here -- nothing is
            # written.  Kept for backwards compatibility; confirm intent.
            if isinstance(data, bytes):
                try:
                    self.destination.buffer.write(data)
                except AttributeError:
                    if check_encoding(self.destination,
                                      self.encoding) is False:
                        raise ValueError(
                            f'Encoding of {self.destination_path} '
                            f'({self.destination.encoding}) differs \n'
                            f'  from specified encoding ({self.encoding})')
                    else:
                        raise err
        except (UnicodeError, LookupError) as err:
            raise UnicodeError(
                'Unable to encode output data. output-encoding is: '
                f'{self.encoding}.\n({error_string(err)})') from err
        finally:
            if self.autoclose:
                self.close()
        return data

    def close(self):
        """Close the destination unless it is `sys.stdout` or `sys.stderr`."""
        if self.destination not in (sys.stdout, sys.stderr):
            self.destination.close()
            self.opened = False
class BinaryFileOutput(FileOutput):
    """
    A version of docutils.io.FileOutput which writes to a binary file.
    """
    # Used by core.publish_cmdline_to_binary() which in turn is used by
    # tools/rst2odt.py but not by core.rst2odt().
    mode = 'wb'
class StringInput(Input):
    """Input from a `str` or `bytes` instance."""

    default_source_path = '<string>'

    def read(self):
        """Return the source as `str` instance.

        Decode, if required (see `Input.decode`).
        """
        return self.decode(self.source)
class StringOutput(Output):
    """Output to a `bytes` or `str` instance.

    Provisional.
    """

    default_destination_path = '<string>'

    def write(self, data):
        """Store `data` in `self.destination`, and return it.

        If `self.encoding` is set to the pseudo encoding name "unicode",
        `data` must be a `str` instance and is stored/returned unchanged
        (cf. `Output.encode`).

        Otherwise, `data` can be a `bytes` or `str` instance and is
        stored/returned as a `bytes` instance
        (`str` data is encoded with `self.encode()`).

        Attention: the `output_encoding`_ setting may affect the content
        of the output (e.g. an encoding declaration in HTML or XML or the
        representation of characters as LaTeX macro vs. literal character).
        """
        self.destination = self.encode(data)
        return self.destination
class NullInput(Input):

    """Degenerate input: read nothing."""

    default_source_path = 'null input'

    def read(self):
        """Return an empty string."""
        return ''
class NullOutput(Output):

    """Degenerate output: write nothing."""

    default_destination_path = 'null output'

    def write(self, data):
        """Do nothing, return None."""
        pass
class DocTreeInput(Input):

    """
    Adapter for document tree input.

    The document tree must be passed in the ``source`` parameter.
    """

    default_source_path = 'doctree input'

    def read(self):
        """Return the document tree."""
        return self.source