Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/core.py: 68%

1# Python Markdown

3# A Python implementation of John Gruber's Markdown.

5# Documentation: https://python-markdown.github.io/

6# GitHub: https://github.com/Python-Markdown/markdown/

7# PyPI: https://pypi.org/project/Markdown/

9# Started by Manfred Stienstra (http://www.dwerg.net/).

10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).

11# Currently maintained by Waylan Limberg (https://github.com/waylan),

12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

18# License: BSD (see LICENSE.md for details).

20from __future__ import annotations

22import codecs

23import sys

24import logging

25import importlib

26from typing import TYPE_CHECKING, Any, BinaryIO, Callable, ClassVar, Mapping, Sequence

27from . import util

28from .preprocessors import build_preprocessors

29from .blockprocessors import build_block_parser

30from .treeprocessors import build_treeprocessors

31from .inlinepatterns import build_inlinepatterns

32from .postprocessors import build_postprocessors

33from .extensions import Extension

34from .serializers import to_html_string, to_xhtml_string

35from .util import BLOCK_LEVEL_ELEMENTS

37if TYPE_CHECKING: # pragma: no cover

38 from xml.etree.ElementTree import Element

# Names exported by a star-import of this module.
__all__ = ['Markdown', 'markdown', 'markdownFromFile']

# Module-level logger, registered under the fixed name 'MARKDOWN'.
logger = logging.getLogger('MARKDOWN')

class Markdown:
    """
    A parser which converts Markdown to HTML.

    Attributes:
        Markdown.tab_length (int): The number of spaces which correspond to a single tab. Default: `4`.
        Markdown.ESCAPED_CHARS (list[str]): List of characters which get the backslash escape treatment.
        Markdown.block_level_elements (list[str]): List of HTML tags which get treated as block-level elements.
            See [`markdown.util.BLOCK_LEVEL_ELEMENTS`][] for the full list of elements.
        Markdown.registeredExtensions (list[Extension]): List of extensions which have called
            [`registerExtension`][markdown.Markdown.registerExtension] during setup.
        Markdown.doc_tag (str): Element used to wrap document. Default: `div`.
        Markdown.stripTopLevelTags (bool): Indicates whether the `doc_tag` should be removed. Default: `True`.
        Markdown.references (dict[str, tuple[str, str]]): A mapping of link references found in a parsed document
            where the key is the reference name and the value is a tuple of the URL and title.
        Markdown.htmlStash (util.HtmlStash): The instance of the `HtmlStash` used by an instance of this class.
        Markdown.output_formats (dict[str, Callable[xml.etree.ElementTree.Element]]): A mapping of known output
            formats by name and their respective serializers. Each serializer must be a callable which accepts an
            [`Element`][xml.etree.ElementTree.Element] and returns a `str`.
        Markdown.output_format (str): The output format set by
            [`set_output_format`][markdown.Markdown.set_output_format].
        Markdown.serializer (Callable[xml.etree.ElementTree.Element]): The serializer set by
            [`set_output_format`][markdown.Markdown.set_output_format].
        Markdown.preprocessors (util.Registry): A collection of [`preprocessors`][markdown.preprocessors].
        Markdown.parser (blockparser.BlockParser): A collection of [`blockprocessors`][markdown.blockprocessors].
        Markdown.inlinePatterns (util.Registry): A collection of [`inlinepatterns`][markdown.inlinepatterns].
        Markdown.treeprocessors (util.Registry): A collection of [`treeprocessors`][markdown.treeprocessors].
        Markdown.postprocessors (util.Registry): A collection of [`postprocessors`][markdown.postprocessors].

    """

    doc_tag = "div"     # Element used to wrap document - later removed

    output_formats: ClassVar[dict[str, Callable[[Element], str]]] = {
        'html':   to_html_string,
        'xhtml':  to_xhtml_string,
    }
    """
    A mapping of known output formats by name and their respective serializers. Each serializer must be a
    callable which accepts an [`Element`][xml.etree.ElementTree.Element] and returns a `str`.
    """

    def __init__(self, **kwargs):
        """
        Creates a new Markdown instance.

        Keyword Arguments:
            extensions (list[Extension | str]): A list of extensions.

                If an item is an instance of a subclass of [`markdown.extensions.Extension`][],
                the instance will be used as-is. If an item is of type `str`, it is passed
                to [`build_extension`][markdown.Markdown.build_extension] with its corresponding
                `extension_configs` and the returned instance of [`markdown.extensions.Extension`][]
                is used.
            extension_configs (dict[str, dict[str, Any]]): Configuration settings for extensions.
            output_format (str): Format of output. Supported formats are:

                * `xhtml`: Outputs XHTML style tags. Default.
                * `html`: Outputs HTML style tags.
            tab_length (int): Length of tabs in the source. Default: `4`

        """

        self.tab_length: int = kwargs.get('tab_length', 4)

        self.ESCAPED_CHARS: list[str] = [
            '\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '>', '#', '+', '-', '.', '!'
        ]
        """ List of characters which get the backslash escape treatment. """

        # Copy so that per-instance changes never leak into the shared module-level list.
        self.block_level_elements: list[str] = BLOCK_LEVEL_ELEMENTS.copy()

        self.registeredExtensions: list[Extension] = []
        self.docType = ""  # TODO: Maybe delete this. It does not appear to be used anymore.
        self.stripTopLevelTags: bool = True

        # The processor registries are built before extensions are loaded; each extension is
        # handed this instance through `extendMarkdown` below.
        self.build_parser()

        self.references: dict[str, tuple[str, str]] = {}
        self.htmlStash: util.HtmlStash = util.HtmlStash()
        self.registerExtensions(extensions=kwargs.get('extensions', []),
                                configs=kwargs.get('extension_configs', {}))
        self.set_output_format(kwargs.get('output_format', 'xhtml'))
        self.reset()

    def build_parser(self) -> Markdown:
        """
        Build the parser from the various parts.

        Assigns a value to each of the following attributes on the class instance:

        * **`Markdown.preprocessors`** ([`Registry`][markdown.util.Registry]) -- A collection of
          [`preprocessors`][markdown.preprocessors].
        * **`Markdown.parser`** ([`BlockParser`][markdown.blockparser.BlockParser]) -- A collection of
          [`blockprocessors`][markdown.blockprocessors].
        * **`Markdown.inlinePatterns`** ([`Registry`][markdown.util.Registry]) -- A collection of
          [`inlinepatterns`][markdown.inlinepatterns].
        * **`Markdown.treeprocessors`** ([`Registry`][markdown.util.Registry]) -- A collection of
          [`treeprocessors`][markdown.treeprocessors].
        * **`Markdown.postprocessors`** ([`Registry`][markdown.util.Registry]) -- A collection of
          [`postprocessors`][markdown.postprocessors].

        This method could be redefined in a subclass to build a custom parser which is made up of a different
        combination of processors and patterns.

        Returns:
            The instance itself, so that calls can be chained.

        """
        self.preprocessors = build_preprocessors(self)
        self.parser = build_block_parser(self)
        self.inlinePatterns = build_inlinepatterns(self)
        self.treeprocessors = build_treeprocessors(self)
        self.postprocessors = build_postprocessors(self)
        return self

    def registerExtensions(
        self,
        extensions: Sequence[Extension | str],
        configs: Mapping[str, dict[str, Any]]
    ) -> Markdown:
        """
        Load a list of extensions into an instance of the `Markdown` class.

        Arguments:
            extensions (list[Extension | str]): A list of extensions.

                If an item is an instance of a subclass of [`markdown.extensions.Extension`][],
                the instance will be used as-is. If an item is of type `str`, it is passed
                to [`build_extension`][markdown.Markdown.build_extension] with its corresponding `configs` and the
                returned instance of [`markdown.extensions.Extension`][] is used.
            configs (dict[str, dict[str, Any]]): Configuration settings for extensions.

        Returns:
            The instance itself, so that calls can be chained.

        Raises:
            TypeError: If an item is neither `None` nor (after string names are resolved) an instance of
                [`markdown.extensions.Extension`][].

        """
        for ext in extensions:
            if isinstance(ext, str):
                ext = self.build_extension(ext, configs.get(ext, {}))
            if isinstance(ext, Extension):
                ext.extendMarkdown(self)
                # Lazy %-arguments: the message is only formatted when debug logging is enabled.
                logger.debug(
                    'Successfully loaded extension "%s.%s".',
                    ext.__class__.__module__, ext.__class__.__name__
                )
            elif ext is not None:
                # `None` entries are silently skipped; anything else is a caller error.
                raise TypeError(
                    'Extension "{}.{}" must be of type: "{}.{}"'.format(
                        ext.__class__.__module__, ext.__class__.__name__,
                        Extension.__module__, Extension.__name__
                    )
                )
        return self

    def build_extension(self, ext_name: str, configs: Mapping[str, Any]) -> Extension:
        """
        Build extension from a string name, then return an instance using the given `configs`.

        Arguments:
            ext_name: Name of extension as a string.
            configs: Configuration settings for extension.

        Returns:
            An instance of the extension with the given configuration settings.

        Raises:
            ImportError: If the module named by `ext_name` cannot be imported.
            AttributeError: If no class name was given and calling the module's `makeExtension`
                raises `AttributeError` (for example because the function does not exist).

        First attempt to load an entry point. The string name must be registered as an entry point in the
        `markdown.extensions` group which points to a subclass of the [`markdown.extensions.Extension`][] class.
        If multiple distributions have registered the same name, the first one found is returned.

        If no entry point is found, assume dot notation (`path.to.module:ClassName`). Load the specified class and
        return an instance. If no class is specified, import the module and call a `makeExtension` function and return
        the [`markdown.extensions.Extension`][] instance returned by that function.
        """
        configs = dict(configs)

        # Stop at the first matching entry point instead of materializing every match.
        entry_point = next(
            (ep for ep in util.get_installed_extensions() if ep.name == ext_name),
            None
        )
        if entry_point is not None:
            ext = entry_point.load()
            return ext(**configs)

        # Get class name (if provided): `path.to.module:ClassName`
        # `partition` splits on the first colon only and yields '' when there is none.
        ext_name, _, class_name = ext_name.partition(':')

        try:
            module = importlib.import_module(ext_name)
            logger.debug('Successfully imported extension module "%s".', ext_name)
        except ImportError as e:
            # Replace the first argument with a friendlier message; keep the original traceback.
            message = 'Failed loading extension "%s".' % ext_name
            e.args = (message,) + e.args[1:]
            raise

        if class_name:
            # Load given class name from module.
            return getattr(module, class_name)(**configs)
        else:
            # Expect `makeExtension()` function to return an `Extension` instance.
            try:
                return module.makeExtension(**configs)
            except AttributeError as e:
                # An `AttributeError` raised without arguments must not be masked by an `IndexError`.
                message = e.args[0] if e.args else ''
                message = "Failed to initiate extension " \
                          "'%s': %s" % (ext_name, message)
                e.args = (message,) + e.args[1:]
                raise

    def registerExtension(self, extension: Extension) -> Markdown:
        """
        Register an extension as having a resettable state.

        Arguments:
            extension: An instance of the extension to register.

        Returns:
            The instance itself, so that calls can be chained.

        This should get called once by an extension during setup. A "registered" extension's
        `reset` method is called by [`Markdown.reset()`][markdown.Markdown.reset]. Not all extensions have or need a
        resettable state, and so it should not be assumed that all extensions are "registered."

        """
        self.registeredExtensions.append(extension)
        return self

    def reset(self) -> Markdown:
        """
        Resets all state variables to prepare the parser instance for new input.

        Called once upon creation of a class instance. Should be called manually between calls
        to [`Markdown.convert`][markdown.Markdown.convert].

        Returns:
            The instance itself, so that calls can be chained.
        """
        self.htmlStash.reset()
        self.references.clear()

        for extension in self.registeredExtensions:
            # Registered extensions are not required to define `reset`.
            if hasattr(extension, 'reset'):
                extension.reset()

        return self

    def set_output_format(self, format: str) -> Markdown:
        """
        Set the output format for the class instance.

        Arguments:
            format: Must be a known value in `Markdown.output_formats`. The name is lower-cased and any
                trailing `1`, `4` or `5` characters are ignored (so `html5` selects `html`).

        Returns:
            The instance itself, so that calls can be chained.

        Raises:
            KeyError: If the normalized name is not a key of `Markdown.output_formats`. In that case
                neither `output_format` nor `serializer` is modified.

        """
        name = format.lower().rstrip('145')  # ignore number
        try:
            serializer = self.output_formats[name]
        except KeyError as e:
            valid_formats = sorted(self.output_formats)
            message = 'Invalid Output Format: "%s". Use one of %s.' \
                % (name, '"' + '", "'.join(valid_formats) + '"')
            e.args = (message,) + e.args[1:]
            raise
        # Assign both attributes only after the lookup succeeded so that a failed call
        # cannot leave the instance with a format name that has no matching serializer.
        self.output_format = name
        self.serializer = serializer
        return self

    # Note: the `tag` argument is type annotated `Any` as ElementTree uses many various objects as tags.
    # As there is no standardization in ElementTree, the type of a given tag is unpredictable.
    def is_block_level(self, tag: Any) -> bool:
        """
        Check if the given `tag` is a block level HTML tag.

        Returns `True` for any string listed in `Markdown.block_level_elements`. A `tag` which is
        not a string always returns `False`.

        The comparison is made on the lower-cased tag with any trailing `/` characters removed.

        """
        if isinstance(tag, str):
            return tag.lower().rstrip('/') in self.block_level_elements
        # Some ElementTree tags are not strings, so return False.
        return False

    def convert(self, source: str) -> str:
        """
        Convert a Markdown string to a string in the specified output format.

        Arguments:
            source: Markdown formatted text as Unicode or ASCII string.

        Returns:
            A string in the specified output format.

        Raises:
            ValueError: If the wrapping `doc_tag` cannot be located in the serialized output and the
                output is not an empty document.

        Markdown parsing takes place in five steps:

        1. A bunch of [`preprocessors`][markdown.preprocessors] munge the input text.
        2. A [`BlockParser`][markdown.blockparser.BlockParser] parses the high-level structural elements of the
           pre-processed text into an [`ElementTree`][xml.etree.ElementTree.ElementTree] object.
        3. A bunch of [`treeprocessors`][markdown.treeprocessors] are run against the
           [`ElementTree`][xml.etree.ElementTree.ElementTree] object. One such `treeprocessor`
           ([`markdown.treeprocessors.InlineProcessor`][]) runs [`inlinepatterns`][markdown.inlinepatterns]
           against the [`ElementTree`][xml.etree.ElementTree.ElementTree] object, parsing inline markup.
        4. Some [`postprocessors`][markdown.postprocessors] are run against the text after the
           [`ElementTree`][xml.etree.ElementTree.ElementTree] object has been serialized into text.
        5. The output is returned as a string.

        !!! warning
            The Python-Markdown library does ***not*** sanitize its HTML output.
            If you are processing Markdown input from an untrusted source, it is your
            responsibility to ensure that it is properly sanitized. For more
            information see [Sanitizing HTML Output](../../sanitization.md).

        """

        # Fix up the source text
        if not source.strip():
            return ''  # a blank Unicode string

        try:
            source = str(source)
        except UnicodeDecodeError as e:  # pragma: no cover
            # Customize error message while maintaining original traceback
            e.reason += '. -- Note: Markdown only accepts Unicode input!'
            raise

        # Split into lines and run the line preprocessors.
        self.lines = source.split("\n")
        for prep in self.preprocessors:
            self.lines = prep.run(self.lines)

        # Parse the high-level elements.
        root = self.parser.parseDocument(self.lines).getroot()

        # Run the tree-processors
        for treeprocessor in self.treeprocessors:
            newRoot = treeprocessor.run(root)
            # A tree-processor may modify the tree in place (returning `None`) or return a new root.
            if newRoot is not None:
                root = newRoot

        # Serialize _properly_.  Strip top-level tags.
        output = self.serializer(root)
        if self.stripTopLevelTags:
            try:
                # The `+ 2` accounts for the `<` and `>` surrounding the opening tag name.
                start = output.index(
                    '<%s>' % self.doc_tag) + len(self.doc_tag) + 2
                end = output.rindex('</%s>' % self.doc_tag)
                output = output[start:end].strip()
            except ValueError as e:  # pragma: no cover
                if output.strip().endswith('<%s />' % self.doc_tag):
                    # We have an empty document
                    output = ''
                else:
                    # We have a serious problem
                    raise ValueError('Markdown failed to strip top-level '
                                     'tags. Document=%r' % output.strip()) from e

        # Run the text post-processors
        for pp in self.postprocessors:
            output = pp.run(output)

        return output.strip()

    def convertFile(
        self,
        input: str | BinaryIO | None = None,
        output: str | BinaryIO | None = None,
        encoding: str | None = None,
    ) -> Markdown:
        """
        Read Markdown text from a file or stream and write HTML output to a file or stream.

        Decodes the input file using the provided encoding (defaults to `utf-8`),
        passes the file content to markdown, and outputs the HTML to either
        the provided stream or the file with provided name, using the same
        encoding as the source file. The
        [`xmlcharrefreplace`](https://docs.python.org/3/library/codecs.html#error-handlers)
        error handler is used when encoding the output.

        **Note:** This is the only place that decoding and encoding of Unicode
        takes place in Python-Markdown.  (All other code is Unicode-in /
        Unicode-out.)

        Arguments:
            input: File object or path. Reads from `stdin` if `None`.
            output: File object or path. Writes to `stdout` if `None`.
            encoding: Encoding of input and output files. Defaults to `utf-8`.

        Returns:
            The instance itself, so that calls can be chained.

        An `input` file object is closed once it has been read (also when reading fails). An
        `output` file object is left open so that the caller may write more to it.

        !!! warning
            The Python-Markdown library does ***not*** sanitize its HTML output.
            As `Markdown.convertFile` writes directly to the file system, there is no
            easy way to sanitize the output from Python code. Therefore, it is
            recommended that the `Markdown.convertFile` method not be used on input
            from an untrusted source. For more information see [Sanitizing HTML
            Output](../../sanitization.md).

        """

        encoding = encoding or "utf-8"

        # Read the source
        if input:
            if isinstance(input, str):
                input_file = open(input, mode="r", encoding=encoding)
            else:
                input_file = codecs.getreader(encoding)(input)
            # The context manager guarantees the handle is closed even if `read()` raises.
            with input_file:
                text = input_file.read()
        else:
            text = sys.stdin.read()

        text = text.lstrip('\ufeff')  # remove the byte-order mark

        # Convert
        html = self.convert(text)

        # Write to file or stdout
        if output:
            if isinstance(output, str):
                # The context manager guarantees the file is closed even if `write()` raises.
                with codecs.open(output, "w",
                                 encoding=encoding,
                                 errors="xmlcharrefreplace") as output_file:
                    output_file.write(html)
            else:
                writer = codecs.getwriter(encoding)
                output_file = writer(output, errors="xmlcharrefreplace")
                output_file.write(html)
                # Don't close here. User may want to write more.
        else:
            # Encode manually and write bytes to stdout.
            encoded = html.encode(encoding, "xmlcharrefreplace")
            sys.stdout.buffer.write(encoded)

        return self

466

467

468"""

469EXPORTED FUNCTIONS

470=============================================================================

471

472These are the two functions we really mean to export: `markdown()` and

473`markdownFromFile()`.

474"""

475

476

def markdown(text: str, **kwargs: Any) -> str:
    """
    Render a Markdown string and return the result as a Unicode string.

    A convenience wrapper for the most basic use case: a fresh
    [`Markdown`][markdown.Markdown] instance is created from `kwargs` and its
    [`convert`][markdown.Markdown.convert] method is run once on `text`.

    Arguments:
        text: Markdown formatted text as Unicode or ASCII string.

    Keyword arguments:
        **kwargs: Any arguments accepted by the Markdown class.

    Returns:
        A string in the specified output format.

    !!! warning
        The Python-Markdown library does ***not*** sanitize its HTML output.
        If you are processing Markdown input from an untrusted source, it is your
        responsibility to ensure that it is properly sanitized. For more
        information see [Sanitizing HTML Output](../../sanitization.md).

    """
    return Markdown(**kwargs).convert(text)

503

504

def markdownFromFile(**kwargs: Any):
    """
    Convert Markdown read from a file or stream and write the HTML to a file or stream.

    A convenience wrapper which creates a [`Markdown`][markdown.Markdown] instance from
    `kwargs` and then invokes its [`convertFile`][markdown.Markdown.convertFile] method
    (instead of [`convert`][markdown.Markdown.convert]) with the `input`, `output` and
    `encoding` keyword arguments. Each of the three defaults to `None` when omitted.

    Keyword arguments:
        input (str | BinaryIO): A file name or readable object.
        output (str | BinaryIO): A file name or writable object.
        encoding (str): Encoding of input and output.
        **kwargs: Any arguments accepted by the `Markdown` class.

    !!! warning
        The Python-Markdown library does ***not*** sanitize its HTML output.
        As `markdown.markdownFromFile` writes directly to the file system, there is no
        easy way to sanitize the output from Python code. Therefore, it is
        recommended that the `markdown.markdownFromFile` function not be used on input
        from an untrusted source. For more information see [Sanitizing HTML
        Output](../../sanitization.md).

    """
    # The full keyword set (including the three I/O keys) is forwarded to the constructor.
    converter = Markdown(**kwargs)
    source = kwargs.get('input', None)
    target = kwargs.get('output', None)
    charset = kwargs.get('encoding', None)
    converter.convertFile(source, target, charset)