Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/docutils/io.py: 34%

# $Id$
# Author: David Goodger <goodger@python.org>
# Copyright: This module has been placed in the public domain.

"""
I/O classes provide a uniform API for low-level input and output. Subclasses
exist for a variety of input/output mechanisms.
"""

10__docformat__ = 'reStructuredText'

12import codecs

13import locale

14import os

15import re

16import sys

17import warnings

19from docutils import TransformSpec

# Guess the locale's preferred encoding.
# If no valid guess can be made, `_locale_encoding` is set to `None`.
#
# TODO: check whether this is set correctly with every OS and Python version
#       or whether front-end tools need to call `locale.setlocale()`
#       before importing this module
try:
    # Return locale encoding also in UTF-8 mode
    with warnings.catch_warnings():
        # silence warnings emitted by the locale queries below
        warnings.simplefilter("ignore")
        _locale_encoding = (locale.getlocale()[1]
                            or locale.getdefaultlocale()[1])
        # raises AttributeError if neither call reported an encoding;
        # handled by the generic handler below
        _locale_encoding = _locale_encoding.lower()
except ValueError as error:  # OS X may set UTF-8 without language code
    # See https://bugs.python.org/issue18378 fixed in 3.8
    # and https://sourceforge.net/p/docutils/bugs/298/.
    # Drop the special case after requiring Python >= 3.8
    if "unknown locale: UTF-8" in error.args:
        _locale_encoding = "utf-8"
    else:
        _locale_encoding = None
except Exception:
    # Any other problem determining the locale -> use None.
    # `Exception` (instead of a bare ``except``) lets KeyboardInterrupt
    # and SystemExit propagate while the module is being imported.
    _locale_encoding = None
# Discard names that Python has no codec for
# (a `None` value makes `codecs.lookup()` raise TypeError).
try:
    codecs.lookup(_locale_encoding)
except (LookupError, TypeError):
    _locale_encoding = None

class InputError(OSError):
    """I/O error concerning an input source (subclass of `OSError`)."""

class OutputError(OSError):
    """I/O error concerning an output destination (subclass of `OSError`)."""

def check_encoding(stream, encoding):
    """Test, whether the encoding of `stream` matches `encoding`.

    Both names are resolved with `codecs.lookup()`, so aliases of the
    same codec (e.g. "utf8" and "UTF-8") compare equal.

    Returns

    :None:  if `encoding` or `stream.encoding` are not a valid encoding
            argument (e.g. ``None``) or `stream.encoding` is missing.
    :True:  if `stream.encoding` resolves to the same codec as `encoding`,
    :False: if the encodings differ.
    """
    try:
        stream_codec = codecs.lookup(stream.encoding)
        expected_codec = codecs.lookup(encoding)
    except (LookupError, AttributeError, TypeError):
        # unknown codec name, no `encoding` attribute, or a non-string name
        return None
    return stream_codec == expected_codec

def error_string(err):
    """Return string representation of Exception `err`.

    The result has the form "<class name>: <message>".
    """
    class_name = err.__class__.__name__
    message = format(err)
    return class_name + ': ' + message

class Input(TransformSpec):
    """
    Abstract base class for input wrappers.

    Docutils input objects must provide a `read()` method that
    returns the source, typically as `str` instance.

    Inheriting `TransformSpec` allows input objects to add
    "transforms" and "unknown_reference_resolvers" to the "Transformer".
    (Optional for custom input objects since Docutils 0.19.)
    """

    component_type = 'input'

    default_source_path = None

    def __init__(self, source=None, source_path=None, encoding=None,
                 error_handler='strict'):
        """
        :Parameters:
            - `source`: the source of input data.
            - `source_path`: a text reference to the source; a false value
              is replaced by the class attribute `default_source_path`.
            - `encoding`: text encoding for the input source.
            - `error_handler`: text decoding error handler.
        """
        self.encoding = encoding
        """Text encoding for the input source."""

        self.error_handler = error_handler
        """Text decoding error handler."""

        self.source = source
        """The source of input data."""

        self.source_path = source_path
        """A text reference to the source."""

        if not source_path:
            self.source_path = self.default_source_path

        self.successful_encoding = None
        """The encoding that successfully decoded the source data."""

    def __repr__(self):
        return '%s: source=%r, source_path=%r' % (self.__class__, self.source,
                                                  self.source_path)

    def read(self):
        """Return input as `str`. Define in subclasses."""
        raise NotImplementedError

    def decode(self, data):
        """
        Decode `data` if required.

        Return Unicode `str` instances unchanged (nothing to decode).

        If `self.encoding` is None, determine encoding from data
        or try UTF-8, locale encoding, and (as last resort) 'latin-1'.
        The client application should call ``locale.setlocale`` at the
        beginning of processing::

            locale.setlocale(locale.LC_ALL, '')

        On success, the encoding used is stored in
        `self.successful_encoding`.

        Raise UnicodeError if unsuccessful. The error raised by the last
        decoding attempt is attached as the new exception's cause.

        Provisional:
          - Raise UnicodeError (instead of falling back to the locale
            encoding) if decoding the source with the default encoding (UTF-8)
            fails and Python is started in `UTF-8 mode`.

            Raise UnicodeError (instead of falling back to "latin1") if both,
            default and locale encoding, fail.

          - Only remove BOM (U+FEFF ZWNBSP at start of data),
            no other ZWNBSPs.
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, str), ('input encoding is "unicode" '
                                           'but `data` is no `str` instance')
        if isinstance(data, str):
            # nothing to decode
            return data
        if self.encoding:
            # We believe the user/application when the encoding is
            # explicitly given.
            encoding_candidates = [self.encoding]
        else:
            data_encoding = self.determine_encoding_from_data(data)
            if data_encoding:
                # If the data declares its encoding (explicitly or via a BOM),
                # we believe it.
                encoding_candidates = [data_encoding]
            else:
                # Apply heuristics only if no encoding is explicitly given and
                # no BOM found. Start with UTF-8, because that only matches
                # data that *IS* UTF-8:
                encoding_candidates = ['utf-8']
                # TODO: use `locale.getpreferredlocale(do_setlocale=True)`
                # to respect UTF-8 mode (API change).
                # (Check if it is a valid encoding and not UTF-8)
                if _locale_encoding and _locale_encoding != 'utf-8':
                    encoding_candidates.append(_locale_encoding)
                # TODO: don't fall back to 'latin-1' (API change).
                encoding_candidates.append('latin-1')
        for enc in encoding_candidates:
            try:
                decoded = str(data, enc, self.error_handler)
                self.successful_encoding = enc
                # Return decoded, removing BOM and other ZWNBSPs.
                # TODO: only remove BOM (ZWNBSP at start of data)
                #       and only if 'self.encoding' is None. (API change)
                return decoded.replace('\ufeff', '')
            except (UnicodeError, LookupError) as err:
                # keep exception instance for use outside of the "for" loop.
                error = err
        # Chain the last failure explicitly: this statement is outside the
        # ``except`` block, so no implicit exception context would be set.
        raise UnicodeError(
            'Unable to decode input data. Tried the following encodings: '
            f'{", ".join(repr(enc) for enc in encoding_candidates)}.\n'
            f'({error_string(error)})') from error

    coding_slug = re.compile(br"coding[:=]\s*([-\w.]+)")
    """Encoding declaration pattern."""

    byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'),
                        (codecs.BOM_UTF16_BE, 'utf-16-be'),
                        (codecs.BOM_UTF16_LE, 'utf-16-le'),)
    """Sequence of (start_bytes, encoding) tuples for encoding detection.
    The first bytes of input data are checked against the start_bytes strings.
    A match indicates the given encoding."""

    def determine_encoding_from_data(self, data):
        """
        Try to determine the encoding of `data` by looking *in* `data`.
        Check for a byte order mark (BOM) or an encoding declaration.

        Return the encoding name as `str` or None if nothing was found.
        """
        # check for a byte order mark:
        for start_bytes, encoding in self.byte_order_marks:
            if data.startswith(start_bytes):
                return encoding
        # check for an encoding declaration pattern in first 2 lines of file:
        for line in data.splitlines()[:2]:
            match = self.coding_slug.search(line)
            if match:
                return match.group(1).decode('ascii')
        return None

    def isatty(self):
        """Return True, if the input source is connected to a TTY device."""
        try:
            return self.source.isatty()
        except AttributeError:
            # the source has no `isatty()` method
            return False

223

224

class Output(TransformSpec):
    """
    Abstract base class for output wrappers.

    Docutils output objects must provide a `write()` method that
    expects and handles one argument (the output).

    Inheriting `TransformSpec` allows output objects to add
    "transforms" and "unknown_reference_resolvers" to the "Transformer".
    (Optional for custom output objects since Docutils 0.19.)
    """

    component_type = 'output'

    default_destination_path = None

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict'):
        """
        Store the destination and the text encoding settings.

        A false `error_handler` is replaced by 'strict'; a false
        `destination_path` by the class default.
        """
        self.encoding = encoding
        """Text encoding for the output destination."""

        self.error_handler = error_handler if error_handler else 'strict'
        """Text encoding error handler."""

        self.destination = destination
        """The destination for output data."""

        self.destination_path = (destination_path
                                 or self.default_destination_path)
        """A text reference to the destination."""

    def __repr__(self):
        return (f'{self.__class__}: destination={self.destination!r}, '
                f'destination_path={self.destination_path!r}')

    def write(self, data):
        """Write `data`. Define in subclasses."""
        raise NotImplementedError

    def encode(self, data):
        """
        Encode and return `data`.

        A `str` instance is encoded with `self.encoding` and
        `self.error_handler`; anything else (e.g. `bytes`) is returned
        unchanged.

        Provisional: If `self.encoding` is set to the pseudo encoding name
        "unicode", `data` must be a `str` instance and is returned unchanged.
        """
        wants_text = bool(self.encoding) and self.encoding.lower() == 'unicode'
        if wants_text:
            assert isinstance(data, str), ('output encoding is "unicode" '
                                           'but `data` is no `str` instance')
            return data
        if isinstance(data, str):
            return data.encode(self.encoding, self.error_handler)
        # Non-unicode (e.g. bytes) output.
        return data

285

286

class ErrorOutput:
    """
    Wrapper class for file-like error streams with
    failsafe de- and encoding of `str`, `bytes`, `unicode` and
    `Exception` instances.
    """

    def __init__(self, destination=None, encoding=None,
                 encoding_errors='backslashreplace',
                 decoding_errors='replace'):
        """
        :Parameters:
            - `destination`: a file-like object,
                        a string (path to a file),
                        `None` (write to `sys.stderr`, default), or
                        evaluating to `False` (write() requests are ignored).
            - `encoding`: `destination` text encoding. Guessed if None.
            - `encoding_errors`: how to treat encoding errors.
            - `decoding_errors`: how to treat decoding errors.
        """
        if destination is None:
            destination = sys.stderr
        elif not destination:
            destination = False
        # if `destination` is a file name, open it
        elif isinstance(destination, str):
            # Open with the requested encoding and error handler, so that
            # the file agrees with `self.encoding` set below
            # (``encoding=None`` selects the platform default).
            destination = open(destination, 'w',
                               encoding=encoding, errors=encoding_errors)

        self.destination = destination
        """Where warning output is sent."""

        self.encoding = (encoding or getattr(destination, 'encoding', None)
                         or _locale_encoding or 'ascii')
        """The output character encoding."""

        self.encoding_errors = encoding_errors
        """Encoding error handler."""

        self.decoding_errors = decoding_errors
        """Decoding error handler."""

    def write(self, data):
        """
        Write `data` to self.destination. Ignore, if self.destination is False.

        `data` can be a `bytes`, `str`, or `Exception` instance.

        If the destination cannot encode a `str`, the text is encoded with
        `self.encoding` and `self.encoding_errors` and written as `bytes`
        or, if the destination only accepts `str`, as the resulting
        (escaped) text.
        """
        if not self.destination:
            return
        if isinstance(data, Exception):
            data = str(data)
        try:
            self.destination.write(data)
        except UnicodeEncodeError:
            encoded = data.encode(self.encoding, self.encoding_errors)
            try:
                self.destination.write(encoded)
            except TypeError:
                # Text streams reject `bytes`: write the text with the
                # unencodable characters already replaced by the handler.
                self.destination.write(encoded.decode(self.encoding))
        except TypeError:
            if isinstance(data, str):  # destination may expect bytes
                self.destination.write(data.encode(self.encoding,
                                                   self.encoding_errors))
            elif self.destination in (sys.stderr, sys.stdout):
                # write bytes to raw stream
                self.destination.buffer.write(data)
            else:
                self.destination.write(str(data, self.encoding,
                                           self.decoding_errors))

    def close(self):
        """
        Close the error-output stream.

        Ignored if the destination is `sys.stderr` or `sys.stdout` or has no
        close() method.
        """
        if self.destination in (sys.stdout, sys.stderr):
            return
        try:
            self.destination.close()
        except AttributeError:
            pass

    def isatty(self):
        """Return True, if the destination is connected to a TTY device."""
        try:
            return self.destination.isatty()
        except AttributeError:
            # the destination has no `isatty()` method
            return False

373

374

class FileInput(Input):

    """
    Input for single, simple file-like objects.
    """
    def __init__(self, source=None, source_path=None,
                 encoding=None, error_handler='strict',
                 autoclose=True, mode='r'):
        """
        :Parameters:
            - `source`: either a file-like object (which is read directly), or
              `None` (which implies `sys.stdin` if no `source_path` given).
            - `source_path`: a path to a file, which is opened and then read.
            - `encoding`: the expected text encoding of the input file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after read (except when
              `sys.stdin` is the source).
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is read only ('r').

        Raise `InputError` if the file at `source_path` cannot be opened
        and `UnicodeError` if `encoding` differs from the encoding of a
        given `source`.
        """
        Input.__init__(self, source, source_path, encoding, error_handler)
        self.autoclose = autoclose
        self._stderr = ErrorOutput()

        if source is None:
            if source_path:
                try:
                    # without explicit encoding, "utf-8-sig" is used
                    self.source = open(source_path, mode,
                                       encoding=self.encoding or 'utf-8-sig',
                                       errors=self.error_handler)
                except OSError as error:
                    raise InputError(error.errno, error.strerror,
                                     source_path) from error
            else:
                self.source = sys.stdin
        elif check_encoding(self.source, self.encoding) is False:
            # TODO: re-open, warn or raise error?
            raise UnicodeError('Encoding clash: encoding given is "%s" '
                               'but source is opened with encoding "%s".' %
                               (self.encoding, self.source.encoding))
        if not source_path:
            # fall back to the name of the source object (if it has one)
            try:
                self.source_path = self.source.name
            except AttributeError:
                pass

    def read(self):
        """
        Read and decode a single file and return the data (Unicode string).

        Line endings are normalised to "\\n" and the result ends with a
        newline unless it is empty.
        """
        try:
            if self.source is sys.stdin:
                # read as binary data to circumvent auto-decoding
                data = self.source.buffer.read()
            else:
                data = self.source.read()
        except (UnicodeError, LookupError):
            if not self.encoding and self.source_path:
                # re-read in binary mode and decode with heuristics;
                # the context manager closes the file even if reading fails
                with open(self.source_path, 'rb') as b_source:
                    data = b_source.read()
            else:
                raise
        finally:
            if self.autoclose:
                self.close()
        data = self.decode(data)
        # normalise newlines
        return '\n'.join(data.splitlines()+[''])

    def readlines(self):
        """
        Return lines of a single file as list of Unicode strings.
        """
        return self.read().splitlines(True)

    def close(self):
        """Close the source unless it is `sys.stdin`."""
        if self.source is not sys.stdin:
            self.source.close()

454

455

class FileOutput(Output):

    """Output for single, simple file-like objects."""

    default_destination_path = '<file>'

    mode = 'w'
    """The mode argument for `open()`."""
    # 'wb' for binary (e.g. OpenOffice) files (see also `BinaryFileOutput`).
    # (Do not use binary mode ('wb') for text files, as this prevents the
    # conversion of newlines to the system specific default.)

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict', autoclose=True,
                 handle_io_errors=None, mode=None):
        """
        :Parameters:
            - `destination`: either a file-like object (which is written
              directly) or `None` (which implies `sys.stdout` if no
              `destination_path` given).
            - `destination_path`: a path to a file, which is opened and then
              written.
            - `encoding`: the text encoding of the output file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after write (except when
              `sys.stdout` or `sys.stderr` is the destination).
            - `handle_io_errors`: ignored, deprecated, will be removed.
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is 'w', providing universal newline
              support for text files.
        """
        Output.__init__(self, destination, destination_path,
                        encoding, error_handler)
        self.opened = True
        self.autoclose = autoclose
        if handle_io_errors is not None:
            warnings.warn('io.FileOutput: init argument "handle_io_errors" '
                          'is ignored and will be removed in '
                          'Docutils 2.0.', DeprecationWarning, stacklevel=2)
        if mode is not None:
            self.mode = mode
        self._stderr = ErrorOutput()
        if destination is None:
            if destination_path:
                # the file is opened by the first call to `write()`
                self.opened = False
            else:
                self.destination = sys.stdout
        elif (mode and hasattr(self.destination, 'mode')
              and mode != self.destination.mode):
            # destination is a file-type object opened with another mode
            print('Warning: Destination mode "%s" differs from specified '
                  'mode "%s"' % (self.destination.mode, mode),
                  file=self._stderr)
        if not destination_path:
            # fall back to the name of the destination object (if any)
            try:
                self.destination_path = self.destination.name
            except AttributeError:
                pass

    def open(self):
        """Open the file at `self.destination_path` with `self.mode`.

        Encoding and error handler are passed on for text modes only.
        Raise `OutputError` if opening fails.
        """
        if 'b' in self.mode:
            open_args = {}
        else:
            open_args = {'encoding': self.encoding,
                         'errors': self.error_handler}
        try:
            self.destination = open(self.destination_path, self.mode,
                                    **open_args)
        except OSError as error:
            raise OutputError(error.errno, error.strerror,
                              self.destination_path)
        self.opened = True

    def write(self, data):
        """Write `data` to a single file, also return it.

        `data` can be a `str` or `bytes` instance.
        If writing `bytes` fails, an attempt is made to write to
        the low-level interface ``self.destination.buffer``.

        If `data` is a `str` instance and `self.encoding` and
        `self.destination.encoding` are set to different values, `data`
        is encoded to a `bytes` instance using `self.encoding`.

        Provisional: future versions may raise an error if `self.encoding`
        and `self.destination.encoding` are set to different values.
        """
        if not self.opened:
            self.open()
        encoding_differs = (
            isinstance(data, str)
            and check_encoding(self.destination, self.encoding) is False)
        if encoding_differs:
            if os.linesep != '\n':
                data = data.replace('\n', os.linesep)  # fix endings
            data = self.encode(data)

        try:
            self.destination.write(data)
        except TypeError as err:
            # NOTE: a TypeError for non-`bytes` data is not re-raised.
            if isinstance(data, bytes):
                try:
                    self.destination.buffer.write(data)
                except AttributeError:
                    # no low-level interface to fall back to
                    if check_encoding(self.destination,
                                      self.encoding) is False:
                        raise ValueError(
                            f'Encoding of {self.destination_path} '
                            f'({self.destination.encoding}) differs \n'
                            f' from specified encoding ({self.encoding})')
                    else:
                        raise err
        except (UnicodeError, LookupError) as err:
            raise UnicodeError(
                'Unable to encode output data. output-encoding is: '
                f'{self.encoding}.\n({error_string(err)})')
        finally:
            if self.autoclose:
                self.close()
        return data

    def close(self):
        """Close the destination unless it is `sys.stdout` or `sys.stderr`."""
        if self.destination in (sys.stdout, sys.stderr):
            return
        self.destination.close()
        self.opened = False

579

580

class BinaryFileOutput(FileOutput):
    """
    A `docutils.io.FileOutput` variant that opens its file in binary mode.
    """
    # Used by core.publish_cmdline_to_binary() which in turn is used by
    # tools/rst2odt.py but not by core.rst2odt().

    # overrides the text mode default of `FileOutput`
    mode = 'wb'

588

589

class StringInput(Input):
    """Input from a `str` or `bytes` instance."""

    default_source_path = '<string>'

    def read(self):
        """Return the source as `str` instance.

        The source is passed through `self.decode()`
        (see `Input.decode`), which decodes it if required.
        """
        text = self.decode(self.source)
        return text

601

602

class StringOutput(Output):
    """Output to a `bytes` or `str` instance.

    Provisional.
    """

    default_destination_path = '<string>'

    def write(self, data):
        """Store `data` in `self.destination`, and return it.

        If `self.encoding` is set to the pseudo encoding name "unicode",
        `data` must be a `str` instance and is stored/returned unchanged
        (cf. `Output.encode`).

        Otherwise, `data` can be a `bytes` or `str` instance and is
        stored/returned as a `bytes` instance
        (`str` data is encoded with `self.encode()`).

        Attention: the `output_encoding`_ setting may affect the content
        of the output (e.g. an encoding declaration in HTML or XML or the
        representation of characters as LaTeX macro vs. literal character).
        """
        result = self.encode(data)
        self.destination = result
        return result

628

629

class NullInput(Input):

    """Degenerate input: read nothing."""

    default_source_path = 'null input'

    def read(self):
        """Return an empty string (the source is not consulted)."""
        empty = ''
        return empty

639

640

class NullOutput(Output):

    """Degenerate output: write nothing."""

    default_destination_path = 'null output'

    def write(self, data):
        """Do nothing, return None (`data` is discarded)."""
        return None

650

651

class DocTreeInput(Input):

    """
    Adapter for document tree input.

    The document tree must be passed in the ``source`` parameter.
    """

    default_source_path = 'doctree input'

    def read(self):
        """Return the document tree (i.e. `self.source`, unchanged)."""
        doctree = self.source
        return doctree

664 return self.source