Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/docutils/io.py: 34%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# $Id$
2# Author: David Goodger <goodger@python.org>
3# Copyright: This module has been placed in the public domain.
5"""
6I/O classes provide a uniform API for low-level input and output. Subclasses
7exist for a variety of input/output mechanisms.
8"""
10__docformat__ = 'reStructuredText'
12import codecs
13import locale
14import os
15import re
16import sys
17import warnings
19from docutils import TransformSpec
# Guess the locale's preferred encoding.
# `_locale_encoding` ends up as a lower-case codec name that
# `codecs.lookup()` accepts, or as `None` if no valid guess can be made.
#
# TODO: check whether this is set correctly with every OS and Python version
#       or whether front-end tools need to call `locale.setlocale()`
#       before importing this module
try:
    # Return locale encoding also in UTF-8 mode
    with warnings.catch_warnings():
        # silence warnings issued by the locale functions called below
        warnings.simplefilter("ignore")
        _locale_encoding = (locale.getlocale()[1]
                            or locale.getdefaultlocale()[1])
        # raises AttributeError if both lookups returned None
        _locale_encoding = _locale_encoding.lower()
except Exception:
    # Any problem determining the locale -> use None.
    # `Exception` (rather than a bare ``except:``) lets KeyboardInterrupt
    # and SystemExit propagate instead of being swallowed at import time.
    _locale_encoding = None
try:
    # Discard names that Python has no codec for
    # (TypeError: the guess is already None).
    codecs.lookup(_locale_encoding)
except (LookupError, TypeError):
    _locale_encoding = None
class InputError(OSError):
    """`OSError` subclass raised when an input source cannot be opened."""
class OutputError(OSError):
    """`OSError` subclass raised when an output destination cannot be opened."""
def check_encoding(stream, encoding):
    """Compare the encoding of `stream` with `encoding`.

    Both names are resolved with `codecs.lookup()` before they are compared.

    Returns

    :None:  if `encoding` or `stream.encoding` is not a valid encoding
            argument (e.g. ``None``) or `stream.encoding` is missing,
    :True:  if `stream.encoding` resolves to the same value as `encoding`,
    :False: if the encodings differ.
    """
    try:
        stream_codec = codecs.lookup(stream.encoding)
        requested_codec = codecs.lookup(encoding)
    except (LookupError, AttributeError, TypeError):
        # unknown codec name, no `encoding` attribute, or a non-string name
        return None
    return stream_codec == requested_codec
def error_string(err):
    """Format Exception `err` as ``<class name>: <message>``."""
    class_name = err.__class__.__name__
    return f'{class_name}: {err}'
class Input(TransformSpec):
    """
    Abstract base class for input wrappers.

    Docutils input objects must provide a `read()` method that
    returns the source, typically as `str` instance.

    Inheriting `TransformSpec` allows input objects to add
    "transforms" and "unknown_reference_resolvers" to the "Transformer".
    (Optional for custom input objects since Docutils 0.19.)
    """

    component_type = 'input'

    default_source_path = None

    def __init__(self, source=None, source_path=None, encoding=None,
                 error_handler='strict'):
        """
        :Parameters:
            - `source`: the source of input data.
            - `source_path`: a text reference to the source; a false value
              is replaced by `self.default_source_path`.
            - `encoding`: text encoding of the input source.
            - `error_handler`: text decoding error handler.
        """
        self.encoding = encoding
        """Text encoding for the input source."""

        self.error_handler = error_handler
        """Text decoding error handler."""

        self.source = source
        """The source of input data."""

        self.source_path = source_path
        """A text reference to the source."""

        if not source_path:
            self.source_path = self.default_source_path

        self.successful_encoding = None
        """The encoding that successfully decoded the source data."""

    def __repr__(self):
        return '%s: source=%r, source_path=%r' % (self.__class__, self.source,
                                                  self.source_path)

    def read(self):
        """Return input as `str`. Define in subclasses."""
        raise NotImplementedError

    def decode(self, data):
        """
        Decode `data` if required.

        Return Unicode `str` instances unchanged (nothing to decode).

        If `self.encoding` is None, determine encoding from data
        or try UTF-8 and the locale's preferred encoding.
        The client application should call ``locale.setlocale()`` at the
        beginning of processing::

            locale.setlocale(locale.LC_ALL, '')

        On success, the encoding used is stored in
        `self.successful_encoding`.

        Raise UnicodeError if unsuccessful. The error of the last failed
        decoding attempt is attached as the exception's cause.

        Provisional: encoding detection will be removed in Docutils 1.0.
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, str), ('input encoding is "unicode" '
                                           'but `data` is no `str` instance')
        if isinstance(data, str):
            # nothing to decode
            return data
        if self.encoding:
            # We believe the user/application when the encoding is
            # explicitly given.
            encoding_candidates = [self.encoding]
        else:
            data_encoding = self.determine_encoding_from_data(data)
            if data_encoding:
                # `data` declares its encoding with "magic comment" or BOM,
                encoding_candidates = [data_encoding]
            else:
                # Apply heuristics if the encoding is not specified.
                # Start with UTF-8, because that only matches
                # data that *IS* UTF-8:
                encoding_candidates = ['utf-8']
                # If UTF-8 fails, fall back to the locale's preferred encoding:
                fallback = locale.getpreferredencoding(do_setlocale=False)
                if fallback and fallback.lower() != 'utf-8':
                    encoding_candidates.append(fallback)
        for enc in encoding_candidates:
            try:
                decoded = str(data, enc, self.error_handler)
            except (UnicodeError, LookupError) as err:
                # keep exception instance for use outside of the "for" loop.
                error = err
            else:
                self.successful_encoding = enc
                return decoded
        # `encoding_candidates` is never empty, so `error` is bound here.
        # Chain it explicitly: this `raise` is outside the "except" block,
        # so the original error would otherwise be missing from the traceback.
        raise UnicodeError(
            'Unable to decode input data.  Tried the following encodings: '
            f'{", ".join(repr(enc) for enc in encoding_candidates)}.\n'
            f'({error_string(error)})') from error

    coding_slug = re.compile(br"coding[:=]\s*([-\w.]+)")
    """Encoding declaration pattern."""

    byte_order_marks = ((codecs.BOM_UTF32_BE, 'utf-32'),
                        (codecs.BOM_UTF32_LE, 'utf-32'),
                        (codecs.BOM_UTF8, 'utf-8-sig'),
                        (codecs.BOM_UTF16_BE, 'utf-16'),
                        (codecs.BOM_UTF16_LE, 'utf-16'),
                        )
    """Sequence of (start_bytes, encoding) tuples for encoding detection.
    The first bytes of input data are checked against the start_bytes strings.
    A match indicates the given encoding."""

    def determine_encoding_from_data(self, data):
        """
        Try to determine the encoding of `data` by looking *in* `data`.
        Check for a byte order mark (BOM) or an encoding declaration.

        Return the encoding name as `str` or None if nothing was found.
        """
        # check for a byte order mark:
        for start_bytes, encoding in self.byte_order_marks:
            if data.startswith(start_bytes):
                return encoding
        # check for an encoding declaration pattern in first 2 lines of file:
        for line in data.splitlines()[:2]:
            match = self.coding_slug.search(line)
            if match:
                return match.group(1).decode('ascii')
        return None

    def isatty(self):
        """Return True, if the input source is connected to a TTY device."""
        try:
            return self.source.isatty()
        except AttributeError:
            # source without an `isatty()` method
            return False
class Output(TransformSpec):
    """
    Abstract base class for output wrappers.

    Docutils output objects must provide a `write()` method that
    expects and handles one argument (the output).

    Inheriting `TransformSpec` allows output objects to add
    "transforms" and "unknown_reference_resolvers" to the "Transformer".
    (Optional for custom output objects since Docutils 0.19.)
    """

    component_type = 'output'

    default_destination_path = None

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict'):
        """
        :Parameters:
            - `destination`: the destination for output data.
            - `destination_path`: a text reference to the destination;
              a false value is replaced by `self.default_destination_path`.
            - `encoding`: text encoding for the output destination.
            - `error_handler`: text encoding error handler;
              a false value selects 'strict'.
        """
        self.encoding = encoding
        """Text encoding for the output destination."""

        self.error_handler = error_handler or 'strict'
        """Text encoding error handler."""

        self.destination = destination
        """The destination for output data."""

        self.destination_path = destination_path
        """A text reference to the destination."""

        if not destination_path:
            self.destination_path = self.default_destination_path

    def __repr__(self):
        template = '%s: destination=%r, destination_path=%r'
        return template % (self.__class__, self.destination,
                           self.destination_path)

    def write(self, data):
        """Write `data`. Define in subclasses."""
        raise NotImplementedError

    def encode(self, data):
        """
        Encode and return `data`.

        If `data` is a `bytes` instance, it is returned unchanged.
        Otherwise it is encoded with `self.encoding`.

        Provisional: If `self.encoding` is set to the pseudo encoding name
        "unicode", `data` must be a `str` instance and is returned unchanged.
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, str), ('output encoding is "unicode" '
                                           'but `data` is no `str` instance')
            return data
        if isinstance(data, str):
            return data.encode(self.encoding, self.error_handler)
        # Non-unicode (e.g. bytes) output.
        return data
class ErrorOutput:
    """
    Wrapper class for file-like error streams with
    failsafe de- and encoding of `str`, `bytes`, `unicode` and
    `Exception` instances.
    """

    def __init__(self, destination=None, encoding=None,
                 encoding_errors='backslashreplace',
                 decoding_errors='replace'):
        """
        :Parameters:
            - `destination`: a file-like object,
                        a string (path to a file),
                        `None` (write to `sys.stderr`, default), or
                        evaluating to `False` (write() requests are ignored).
            - `encoding`: `destination` text encoding. Guessed if None.
            - `encoding_errors`: how to treat encoding errors.
            - `decoding_errors`: how to treat decoding errors.
        """
        if destination is None:
            destination = sys.stderr
        elif not destination:
            destination = False
        elif isinstance(destination, str):
            # `destination` is a file name: open it
            destination = open(destination, 'w')

        self.destination = destination
        """Where warning output is sent."""

        # first usable value of: argument, stream attribute, locale, ASCII
        self.encoding = (encoding
                         or getattr(destination, 'encoding', None)
                         or _locale_encoding
                         or 'ascii')
        """The output character encoding."""

        self.encoding_errors = encoding_errors
        """Encoding error handler."""

        self.decoding_errors = decoding_errors
        """Decoding error handler."""

    def write(self, data):
        """
        Write `data` to self.destination. Ignore, if self.destination is False.

        `data` can be a `bytes`, `str`, or `Exception` instance.
        """
        stream = self.destination
        if not stream:
            return
        if isinstance(data, Exception):
            data = str(data)
        try:
            stream.write(data)
        except UnicodeEncodeError:
            # the stream cannot represent `data`: encode it ourselves
            stream.write(data.encode(self.encoding, self.encoding_errors))
        except TypeError:
            if isinstance(data, str):
                # destination may expect bytes
                stream.write(data.encode(self.encoding,
                                         self.encoding_errors))
            elif stream in (sys.stderr, sys.stdout):
                # write bytes to raw stream
                stream.buffer.write(data)
            else:
                # destination expects text: decode the bytes
                stream.write(str(data, self.encoding, self.decoding_errors))

    def close(self):
        """
        Close the error-output stream.

        Ignored if the destination is `sys.stderr` or `sys.stdout` or has no
        close() method.
        """
        if self.destination in (sys.stdout, sys.stderr):
            return
        try:
            self.destination.close()
        except AttributeError:
            pass

    def isatty(self):
        """Return True, if the destination is connected to a TTY device."""
        try:
            return self.destination.isatty()
        except AttributeError:
            return False
class FileInput(Input):

    """
    Input for single, simple file-like objects.
    """
    def __init__(self, source=None, source_path=None,
                 encoding=None, error_handler='strict',
                 autoclose=True, mode='r'):
        """
        :Parameters:
            - `source`: either a file-like object (which is read directly), or
              `None` (which implies `sys.stdin` if no `source_path` given).
            - `source_path`: a path to a file, which is opened for reading.
            - `encoding`: the expected text encoding of the input file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after read (except when
              `sys.stdin` is the source).
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is read only ('r').

        Raise `InputError` if `source_path` cannot be opened and
        `UnicodeError` if `source` is opened with an encoding other
        than `encoding`.
        """
        Input.__init__(self, source, source_path, encoding, error_handler)
        self.autoclose = autoclose
        self._stderr = ErrorOutput()

        if source is not None:
            # file-like object given: make sure its encoding fits
            if check_encoding(self.source, self.encoding) is False:
                # TODO: re-open, warn or raise error?
                raise UnicodeError('Encoding clash: encoding given is "%s" '
                                   'but source is opened with encoding "%s".' %
                                   (self.encoding, self.source.encoding))
        elif source_path:
            try:
                self.source = open(source_path, mode,
                                   encoding=self.encoding,
                                   errors=self.error_handler)
            except OSError as error:
                raise InputError(error.errno, error.strerror, source_path)
        else:
            self.source = sys.stdin

        if not source_path:
            # fall back to the name of the file object (if it has one)
            try:
                self.source_path = self.source.name
            except AttributeError:
                pass

    def read(self):
        """
        Read and decode a single file, return as `str`.

        Without a specified encoding, sources with a ``buffer`` attribute
        are read as bytes and decoded with `Input.decode()`.
        The source is closed afterwards if `self.autoclose` is true.
        """
        try:
            if self.encoding or not hasattr(self.source, 'buffer'):
                data = self.source.read()
            else:
                # read as binary data
                raw = self.source.buffer.read()
                # decode with heuristics
                text = self.decode(raw)
                # normalize newlines
                data = '\n'.join(text.splitlines()+[''])
        finally:
            if self.autoclose:
                self.close()
        return data

    def readlines(self):
        """
        Return lines of a single file as list of strings.
        """
        content = self.read()
        return content.splitlines(True)

    def close(self):
        """Close the source, unless it is `sys.stdin`."""
        if self.source is sys.stdin:
            return
        self.source.close()
class FileOutput(Output):

    """Output for single, simple file-like objects."""

    default_destination_path = '<file>'

    mode = 'w'
    """The mode argument for `open()`."""
    # 'wb' for binary (e.g. OpenOffice) files (see also `BinaryFileOutput`).
    # (Do not use binary mode ('wb') for text files, as this prevents the
    # conversion of newlines to the system specific default.)

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict', autoclose=True,
                 handle_io_errors=None, mode=None):
        """
        :Parameters:
            - `destination`: either a file-like object (which is written
              directly) or `None` (which implies `sys.stdout` if no
              `destination_path` given).
            - `destination_path`: a path to a file, which is opened and then
              written.
            - `encoding`: the text encoding of the output file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after write (except when
              `sys.stdout` or `sys.stderr` is the destination).
            - `handle_io_errors`: ignored, deprecated, will be removed.
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is 'w', providing universal newline
              support for text files.
        """
        Output.__init__(self, destination, destination_path,
                        encoding, error_handler)
        self.opened = True
        self.autoclose = autoclose
        if handle_io_errors is not None:
            warnings.warn('io.FileOutput: init argument "handle_io_errors" '
                          'is ignored and will be removed in '
                          'Docutils 2.0.', DeprecationWarning, stacklevel=2)
        if mode is not None:
            self.mode = mode
        self._stderr = ErrorOutput()
        if destination is None:
            if destination_path:
                # opening is deferred until the first `write()`
                self.opened = False
            else:
                self.destination = sys.stdout
        elif (  # destination is file-type object -> check mode:
              mode and hasattr(self.destination, 'mode')
              and mode != self.destination.mode):
            print('Warning: Destination mode "%s" differs from specified '
                  'mode "%s"' % (self.destination.mode, mode),
                  file=self._stderr)
        if not destination_path:
            # fall back to the name of the file object (if it has one)
            try:
                self.destination_path = self.destination.name
            except AttributeError:
                pass

    def open(self):
        """Open `self.destination_path` with `self.mode`.

        Raise `OutputError` if the file cannot be opened.
        """
        # Specify encoding
        if 'b' not in self.mode:
            kwargs = {'encoding': self.encoding,
                      'errors': self.error_handler}
        else:
            # binary mode does not take text-related arguments
            kwargs = {}
        try:
            self.destination = open(self.destination_path, self.mode, **kwargs)
        except OSError as error:
            raise OutputError(error.errno, error.strerror,
                              self.destination_path)
        self.opened = True

    def write(self, data):
        """Write `data` to a single file, also return it.

        `data` can be a `str` or `bytes` instance.
        If writing `bytes` fails, an attempt is made to write to
        the low-level interface ``self.destination.buffer``.

        If `data` is a `str` instance and `self.encoding` and
        `self.destination.encoding` are set to different values, `data`
        is encoded to a `bytes` instance using `self.encoding`.

        Raise `TypeError` if the destination does not accept `data`
        (e.g. a `str` written to a binary stream), `ValueError` if `bytes`
        cannot be written because the destination's encoding differs from
        `self.encoding`, and `UnicodeError` if encoding fails.

        Provisional: future versions may raise an error if `self.encoding`
        and `self.destination.encoding` are set to different values.
        """
        if not self.opened:
            self.open()
        if (isinstance(data, str)
            and check_encoding(self.destination, self.encoding) is False):
            if os.linesep != '\n':
                data = data.replace('\n', os.linesep)  # fix endings
            data = self.encode(data)

        try:
            self.destination.write(data)
        except TypeError as err:
            if not isinstance(data, bytes):
                # The destination rejected a `str` (e.g. a binary stream).
                # There is no fallback for this case; re-raise instead of
                # silently dropping the output.
                raise
            try:
                self.destination.buffer.write(data)
            except AttributeError:
                if check_encoding(self.destination,
                                  self.encoding) is False:
                    raise ValueError(
                        f'Encoding of {self.destination_path} '
                        f'({self.destination.encoding}) differs \n'
                        f' from specified encoding ({self.encoding})')
                else:
                    raise err
        except (UnicodeError, LookupError) as err:
            raise UnicodeError(
                'Unable to encode output data. output-encoding is: '
                f'{self.encoding}.\n({error_string(err)})')
        finally:
            if self.autoclose:
                self.close()
        return data

    def close(self):
        """Close the destination, unless it is `sys.stdout` or `sys.stderr`."""
        if self.destination not in (sys.stdout, sys.stderr):
            self.destination.close()
            self.opened = False
class BinaryFileOutput(FileOutput):
    """
    A version of docutils.io.FileOutput which writes to a binary file.

    Only the default `mode` differs from the base class.
    """
    # Used by core.publish_cmdline_to_binary() which in turn is used by
    # tools/rst2odt.py but not by core.rst2odt().
    mode = 'wb'
class StringInput(Input):
    """Input from a `str` or `bytes` instance."""

    default_source_path = '<string>'

    def read(self):
        """Return the source as `str` instance.

        Decode, if required (see `Input.decode`).
        """
        text = self.decode(self.source)
        return text
class StringOutput(Output):
    """Output to a `bytes` or `str` instance.

    Provisional.
    """

    default_destination_path = '<string>'

    def write(self, data):
        """Store `data` in `self.destination`, and return it.

        If `self.encoding` is set to the pseudo encoding name "unicode",
        `data` must be a `str` instance and is stored/returned unchanged
        (cf. `Output.encode`).

        Otherwise, `data` can be a `bytes` or `str` instance and is
        stored/returned as a `bytes` instance
        (`str` data is encoded with `self.encode()`).

        Attention: the `output_encoding`_ setting may affect the content
        of the output (e.g. an encoding declaration in HTML or XML or the
        representation of characters as LaTeX macro vs. literal character).
        """
        result = self.encode(data)
        self.destination = result
        return result
class NullInput(Input):

    """Degenerate input: read nothing."""

    default_source_path = 'null input'

    def read(self):
        """Return an empty string."""
        empty = ''
        return empty
class NullOutput(Output):

    """Degenerate output: write nothing."""

    default_destination_path = 'null output'

    def write(self, data):
        """Do nothing, return None."""
        return None
class DocTreeInput(Input):

    """
    Adapter for document tree input.

    The document tree must be passed in the ``source`` parameter.
    """

    default_source_path = 'doctree input'

    def read(self):
        """Return the document tree."""
        doctree = self.source
        return doctree