Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/core.py: 68%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Python Markdown
3# A Python implementation of John Gruber's Markdown.
5# Documentation: https://python-markdown.github.io/
6# GitHub: https://github.com/Python-Markdown/markdown/
7# PyPI: https://pypi.org/project/Markdown/
9# Started by Manfred Stienstra (http://www.dwerg.net/).
10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
11# Currently maintained by Waylan Limberg (https://github.com/waylan),
12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later)
15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
16# Copyright 2004 Manfred Stienstra (the original version)
18# License: BSD (see LICENSE.md for details).
20from __future__ import annotations
22import codecs
23import sys
24import logging
25import importlib
26from typing import TYPE_CHECKING, Any, BinaryIO, Callable, ClassVar, Mapping, Sequence
27from . import util
28from .preprocessors import build_preprocessors
29from .blockprocessors import build_block_parser
30from .treeprocessors import build_treeprocessors
31from .inlinepatterns import build_inlinepatterns
32from .postprocessors import build_postprocessors
33from .extensions import Extension
34from .serializers import to_html_string, to_xhtml_string
35from .util import BLOCK_LEVEL_ELEMENTS
37if TYPE_CHECKING: # pragma: no cover
38 from xml.etree.ElementTree import Element
# Public names exported by a star-import of this module.
__all__ = ['Markdown', 'markdown', 'markdownFromFile']

# Module-level logger; it is used below to emit debug messages when extensions are imported and loaded.
logger = logging.getLogger('MARKDOWN')
class Markdown:
    """
    A parser which converts Markdown to HTML.

    Attributes:
        Markdown.tab_length (int): The number of spaces which correspond to a single tab. Default: `4`.
        Markdown.ESCAPED_CHARS (list[str]): List of characters which get the backslash escape treatment.
        Markdown.block_level_elements (list[str]): List of HTML tags which get treated as block-level elements.
            See [`markdown.util.BLOCK_LEVEL_ELEMENTS`][] for the full list of elements.
        Markdown.registeredExtensions (list[Extension]): List of extensions which have called
            [`registerExtension`][markdown.Markdown.registerExtension] during setup.
        Markdown.doc_tag (str): Element used to wrap document. Default: `div`.
        Markdown.stripTopLevelTags (bool): Indicates whether the `doc_tag` should be removed. Default: `True`.
        Markdown.references (dict[str, tuple[str, str]]): A mapping of link references found in a parsed document
             where the key is the reference name and the value is a tuple of the URL and title.
        Markdown.htmlStash (util.HtmlStash): The instance of the `HtmlStash` used by an instance of this class.
        Markdown.output_formats (dict[str, Callable[xml.etree.ElementTree.Element]]): A mapping of known output
             formats by name and their respective serializers. Each serializer must be a callable which accepts an
            [`Element`][xml.etree.ElementTree.Element] and returns a `str`.
        Markdown.output_format (str): The output format set by
            [`set_output_format`][markdown.Markdown.set_output_format].
        Markdown.serializer (Callable[xml.etree.ElementTree.Element]): The serializer set by
            [`set_output_format`][markdown.Markdown.set_output_format].
        Markdown.preprocessors (util.Registry): A collection of [`preprocessors`][markdown.preprocessors].
        Markdown.parser (blockparser.BlockParser): A collection of [`blockprocessors`][markdown.blockprocessors].
        Markdown.inlinePatterns (util.Registry): A collection of [`inlinepatterns`][markdown.inlinepatterns].
        Markdown.treeprocessors (util.Registry): A collection of [`treeprocessors`][markdown.treeprocessors].
        Markdown.postprocessors (util.Registry): A collection of [`postprocessors`][markdown.postprocessors].

    """

    doc_tag = "div"     # Element used to wrap document - later removed

    output_formats: ClassVar[dict[str, Callable[[Element], str]]] = {
        'html':   to_html_string,
        'xhtml':  to_xhtml_string,
    }
    """
    A mapping of known output formats by name and their respective serializers. Each serializer must be a
    callable which accepts an [`Element`][xml.etree.ElementTree.Element] and returns a `str`.
    """

    def __init__(self, **kwargs):
        """
        Creates a new Markdown instance.

        Keyword Arguments:
            extensions (list[Extension | str]): A list of extensions.

                If an item is an instance of a subclass of [`markdown.extensions.Extension`][],
                the instance will be used as-is. If an item is of type `str`, it is passed
                to [`build_extension`][markdown.Markdown.build_extension] with its corresponding
                `extension_configs` and the returned instance  of [`markdown.extensions.Extension`][]
                is used.
            extension_configs (dict[str, dict[str, Any]]): Configuration settings for extensions.
            output_format (str): Format of output. Supported formats are:

                * `xhtml`: Outputs XHTML style tags. Default.
                * `html`: Outputs HTML style tags.
            tab_length (int): Length of tabs in the source. Default: `4`

        """

        self.tab_length: int = kwargs.get('tab_length', 4)

        self.ESCAPED_CHARS: list[str] = [
            '\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '>', '#', '+', '-', '.', '!'
        ]
        """ List of characters which get the backslash escape treatment. """

        # Copy so that per-instance changes do not mutate the shared module-level list.
        self.block_level_elements: list[str] = BLOCK_LEVEL_ELEMENTS.copy()

        self.registeredExtensions: list[Extension] = []
        self.docType = ""  # TODO: Maybe delete this. It does not appear to be used anymore.
        self.stripTopLevelTags: bool = True

        # The processor registries must exist before extensions are loaded, as
        # each extension is handed this instance via `extendMarkdown`.
        self.build_parser()

        self.references: dict[str, tuple[str, str]] = {}
        self.htmlStash: util.HtmlStash = util.HtmlStash()
        self.registerExtensions(extensions=kwargs.get('extensions', []),
                                configs=kwargs.get('extension_configs', {}))
        self.set_output_format(kwargs.get('output_format', 'xhtml'))
        self.reset()

    def build_parser(self) -> Markdown:
        """
        Build the parser from the various parts.

        Assigns a value to each of the following attributes on the class instance:

        * **`Markdown.preprocessors`** ([`Registry`][markdown.util.Registry]) -- A collection of
          [`preprocessors`][markdown.preprocessors].
        * **`Markdown.parser`** ([`BlockParser`][markdown.blockparser.BlockParser]) -- A collection of
          [`blockprocessors`][markdown.blockprocessors].
        * **`Markdown.inlinePatterns`** ([`Registry`][markdown.util.Registry]) -- A collection of
          [`inlinepatterns`][markdown.inlinepatterns].
        * **`Markdown.treeprocessors`** ([`Registry`][markdown.util.Registry]) -- A collection of
          [`treeprocessors`][markdown.treeprocessors].
        * **`Markdown.postprocessors`** ([`Registry`][markdown.util.Registry]) -- A collection of
          [`postprocessors`][markdown.postprocessors].

        This method could be redefined in a subclass to build a custom parser which is made up of a different
        combination of processors and patterns.

        Returns:
            This instance, so that calls can be chained.

        """
        self.preprocessors = build_preprocessors(self)
        self.parser = build_block_parser(self)
        self.inlinePatterns = build_inlinepatterns(self)
        self.treeprocessors = build_treeprocessors(self)
        self.postprocessors = build_postprocessors(self)
        return self

    def registerExtensions(
        self,
        extensions: Sequence[Extension | str],
        configs: Mapping[str, dict[str, Any]]
    ) -> Markdown:
        """
        Load a list of extensions into an instance of the `Markdown` class.

        Arguments:
            extensions (list[Extension | str]): A list of extensions.

                If an item is an instance of a subclass of [`markdown.extensions.Extension`][],
                the instance will be used as-is. If an item is of type `str`, it is passed
                to [`build_extension`][markdown.Markdown.build_extension] with its corresponding `configs` and the
                returned instance  of [`markdown.extensions.Extension`][] is used.
            configs (dict[str, dict[str, Any]]): Configuration settings for extensions.

        Returns:
            This instance, so that calls can be chained.

        Raises:
            TypeError: If an item is neither `None`, a `str`, nor an `Extension` instance.

        """
        for ext in extensions:
            if isinstance(ext, str):
                ext = self.build_extension(ext, configs.get(ext, {}))
            if isinstance(ext, Extension):
                ext.extendMarkdown(self)
                # Pass the values as logging arguments so the message is only
                # formatted when debug logging is actually enabled.
                logger.debug(
                    'Successfully loaded extension "%s.%s".',
                    ext.__class__.__module__, ext.__class__.__name__
                )
            elif ext is not None:
                # `None` entries are silently skipped; anything else is an error.
                raise TypeError(
                    'Extension "{}.{}" must be of type: "{}.{}"'.format(
                        ext.__class__.__module__, ext.__class__.__name__,
                        Extension.__module__, Extension.__name__
                    )
                )
        return self

    def build_extension(self, ext_name: str, configs: Mapping[str, Any]) -> Extension:
        """
        Build extension from a string name, then return an instance using the given `configs`.

        Arguments:
            ext_name: Name of extension as a string.
            configs: Configuration settings for extension.

        Returns:
            An instance of the extension with the given configuration settings.

        Raises:
            ImportError: If the module named by `ext_name` cannot be imported. The first
                argument of the exception is replaced with a message naming the extension.
            AttributeError: If no class name was given and calling the module's `makeExtension`
                raises `AttributeError`. The message is prefixed with the extension name.

        First attempt to load an entry point. The string name must be registered as an entry point in the
        `markdown.extensions` group which points to a subclass of the [`markdown.extensions.Extension`][] class.
        If multiple distributions have registered the same name, the first one found is returned.

        If no entry point is found, assume dot notation (`path.to.module:ClassName`). Load the specified class and
        return an instance. If no class is specified, import the module and call a `makeExtension` function and return
        the [`markdown.extensions.Extension`][] instance returned by that function.
        """
        # Work on a plain `dict` copy so the caller's mapping is never touched.
        configs = dict(configs)

        entry_points = [ep for ep in util.get_installed_extensions() if ep.name == ext_name]
        if entry_points:
            ext = entry_points[0].load()
            return ext(**configs)

        # Get class name (if provided): `path.to.module:ClassName`
        ext_name, _, class_name = ext_name.partition(':')

        try:
            module = importlib.import_module(ext_name)
            logger.debug('Successfully imported extension module "%s".', ext_name)
        except ImportError as e:
            # Rewrite the message in place and re-raise to keep the original traceback.
            message = 'Failed loading extension "%s".' % ext_name
            e.args = (message,) + e.args[1:]
            raise

        if class_name:
            # Load given class name from module.
            return getattr(module, class_name)(**configs)

        # Expect `makeExtension()` function to return a class.
        try:
            return module.makeExtension(**configs)
        except AttributeError as e:
            # Guard against an `AttributeError` raised without arguments, which
            # would otherwise surface as an unrelated `IndexError` here.
            original = e.args[0] if e.args else ''
            message = "Failed to initiate extension " \
                      "'%s': %s" % (ext_name, original)
            e.args = (message,) + e.args[1:]
            raise

    def registerExtension(self, extension: Extension) -> Markdown:
        """
        Register an extension as having a resettable state.

        Arguments:
            extension: An instance of the extension to register.

        Returns:
            This instance, so that calls can be chained.

        This should get called once by an extension during setup. A "registered" extension's
        `reset` method is called by [`Markdown.reset()`][markdown.Markdown.reset]. Not all extensions have or need a
        resettable state, and so it should not be assumed that all extensions are "registered."

        """
        self.registeredExtensions.append(extension)
        return self

    def reset(self) -> Markdown:
        """
        Resets all state variables to prepare the parser instance for new input.

        Called once upon creation of a class instance. Should be called manually between calls
        to [`Markdown.convert`][markdown.Markdown.convert].

        Returns:
            This instance, so that calls can be chained.
        """
        self.htmlStash.reset()
        self.references.clear()

        for extension in self.registeredExtensions:
            # Registered extensions without a `reset` method are tolerated.
            if hasattr(extension, 'reset'):
                extension.reset()

        return self

    def set_output_format(self, format: str) -> Markdown:
        """
        Set the output format for the class instance.

        Arguments:
            format: Must be a known value in `Markdown.output_formats`.

        Returns:
            This instance, so that calls can be chained.

        Raises:
            KeyError: If the normalized `format` is not a key of `Markdown.output_formats`.

        """
        # Lower-case the name and drop any trailing `1`, `4` or `5` characters,
        # so that e.g. `html5` and `xhtml1` resolve to `html` and `xhtml`.
        self.output_format = format.lower().rstrip('145')  # ignore number
        try:
            self.serializer = self.output_formats[self.output_format]
        except KeyError as e:
            valid_formats = sorted(self.output_formats)
            message = 'Invalid Output Format: "%s". Use one of %s.' \
                % (self.output_format,
                   '"' + '", "'.join(valid_formats) + '"')
            e.args = (message,) + e.args[1:]
            raise
        return self

    # Note: the `tag` argument is type annotated `Any` as ElementTree uses many various objects as tags.
    # As there is no standardization in ElementTree, the type of a given tag is unpredictable.
    def is_block_level(self, tag: Any) -> bool:
        """
        Check if the given `tag` is a block level HTML tag.

        Returns `True` for any string listed in `Markdown.block_level_elements`. A `tag` which is
        not a string always returns `False`.

        The comparison is made on the lower-cased tag with any trailing `/` characters removed.

        """
        if isinstance(tag, str):
            return tag.lower().rstrip('/') in self.block_level_elements
        # Some ElementTree tags are not strings, so return False.
        return False

    def convert(self, source: str) -> str:
        """
        Convert a Markdown string to a string in the specified output format.

        Arguments:
            source: Markdown formatted text as Unicode or ASCII string.

        Returns:
            A string in the specified output format.

        Raises:
            ValueError: If the wrapping `doc_tag` element cannot be located in the serialized output.

        Markdown parsing takes place in five steps:

        1. A bunch of [`preprocessors`][markdown.preprocessors] munge the input text.
        2. A [`BlockParser`][markdown.blockparser.BlockParser] parses the high-level structural elements of the
           pre-processed text into an [`ElementTree`][xml.etree.ElementTree.ElementTree] object.
        3. A bunch of [`treeprocessors`][markdown.treeprocessors] are run against the
           [`ElementTree`][xml.etree.ElementTree.ElementTree] object. One such `treeprocessor`
           ([`markdown.treeprocessors.InlineProcessor`][]) runs [`inlinepatterns`][markdown.inlinepatterns]
           against the [`ElementTree`][xml.etree.ElementTree.ElementTree] object, parsing inline markup.
        4. Some [`postprocessors`][markdown.postprocessors] are run against the text after the
           [`ElementTree`][xml.etree.ElementTree.ElementTree] object has been serialized into text.
        5. The output is returned as a string.

        !!! warning
            The Python-Markdown library does ***not*** sanitize its HTML output.
            If you are processing Markdown input from an untrusted source, it is your
            responsibility to ensure that it is properly sanitized. For more
            information see [Sanitizing HTML Output](../../sanitization.md).

        """

        # Fix up the source text
        if not source.strip():
            return ''  # a blank Unicode string

        try:
            source = str(source)
        except UnicodeDecodeError as e:  # pragma: no cover
            # Customize error message while maintaining original traceback
            e.reason += '. -- Note: Markdown only accepts Unicode input!'
            raise

        # Split into lines and run the line preprocessors.
        self.lines = source.split("\n")
        for prep in self.preprocessors:
            self.lines = prep.run(self.lines)

        # Parse the high-level elements.
        root = self.parser.parseDocument(self.lines).getroot()

        # Run the tree-processors
        for treeprocessor in self.treeprocessors:
            # A tree processor may return a replacement root, or `None` to keep the current one.
            newRoot = treeprocessor.run(root)
            if newRoot is not None:
                root = newRoot

        # Serialize _properly_.  Strip top-level tags.
        output = self.serializer(root)
        if self.stripTopLevelTags:
            try:
                # `+ 2` accounts for the `<` and `>` surrounding the tag name.
                start = output.index(
                    '<%s>' % self.doc_tag) + len(self.doc_tag) + 2
                end = output.rindex('</%s>' % self.doc_tag)
                output = output[start:end].strip()
            except ValueError as e:  # pragma: no cover
                if output.strip().endswith('<%s />' % self.doc_tag):
                    # We have an empty document
                    output = ''
                else:
                    # We have a serious problem
                    raise ValueError('Markdown failed to strip top-level '
                                     'tags. Document=%r' % output.strip()) from e

        # Run the text post-processors
        for pp in self.postprocessors:
            output = pp.run(output)

        return output.strip()

    def convertFile(
        self,
        input: str | BinaryIO | None = None,
        output: str | BinaryIO | None = None,
        encoding: str | None = None,
    ) -> Markdown:
        """
        Read Markdown text from a file or stream and write HTML output to a file or stream.

        Decodes the input file using the provided encoding (defaults to `utf-8`),
        passes the file content to markdown, and outputs the HTML to either
        the provided stream or the file with provided name, using the same
        encoding as the source file. The
        [`xmlcharrefreplace`](https://docs.python.org/3/library/codecs.html#error-handlers)
        error handler is used when encoding the output.

        **Note:** This is the only place that decoding and encoding of Unicode
        takes place in Python-Markdown.  (All other code is Unicode-in /
        Unicode-out.)

        Arguments:
            input: File object or path. Reads from `stdin` if `None`.
            output: File object or path. Writes to `stdout` if `None`.
            encoding: Encoding of input and output files. Defaults to `utf-8`.

        Returns:
            This instance, so that calls can be chained.

        An `input` stream is closed once it has been read. An `output` stream is
        left open so that the caller may continue writing to it.

        !!! warning
            The Python-Markdown library does ***not*** sanitize its HTML output.
            As `Markdown.convertFile` writes directly to the file system, there is no
            easy way to sanitize the output from Python code. Therefore, it is
            recommended that the `Markdown.convertFile` method not be used on input
            from an untrusted source. For more information see [Sanitizing HTML
            Output](../../sanitization.md).

        """

        encoding = encoding or "utf-8"

        # Read the source
        if input:
            if isinstance(input, str):
                input_file = open(input, mode="r", encoding=encoding)
            else:
                input_file = codecs.getreader(encoding)(input)
            # The context manager closes the reader even if `read()` fails.
            with input_file:
                text = input_file.read()
        else:
            text = sys.stdin.read()

        text = text.lstrip('\ufeff')  # remove the byte-order mark

        # Convert
        html = self.convert(text)

        # Write to file or stdout
        if output:
            if isinstance(output, str):
                # The context manager closes the file even if `write()` fails.
                with codecs.open(output, "w",
                                 encoding=encoding,
                                 errors="xmlcharrefreplace") as output_file:
                    output_file.write(html)
            else:
                writer = codecs.getwriter(encoding)
                output_file = writer(output, errors="xmlcharrefreplace")
                output_file.write(html)
                # Don't close here. User may want to write more.
        else:
            # Encode manually and write bytes to stdout.
            sys.stdout.buffer.write(html.encode(encoding, "xmlcharrefreplace"))

        return self
"""
EXPORTED FUNCTIONS
=============================================================================

These are the two functions we really mean to export: `markdown()` and
`markdownFromFile()`.
"""
def markdown(text: str, **kwargs: Any) -> str:
    """
    Render a Markdown string as HTML and hand the result back as a Unicode string.

    A convenience wrapper around the [`Markdown`][markdown.Markdown] class for the
    simplest use case: a fresh [`Markdown`][markdown.Markdown] instance is created
    from the keyword arguments (loading whichever extensions they name) and the
    given text is run through its parser.

    Arguments:
        text: Markdown formatted text as Unicode or ASCII string.

    Keyword arguments:
        **kwargs: Any arguments accepted by the Markdown class.

    Returns:
        A string in the specified output format.

    !!! warning
        The Python-Markdown library does ***not*** sanitize its HTML output.
        If you are processing Markdown input from an untrusted source, it is your
        responsibility to ensure that it is properly sanitized. For more
        information see [Sanitizing HTML Output](../../sanitization.md).

    """
    # One throw-away parser per call; nothing is shared between invocations.
    return Markdown(**kwargs).convert(text)
def markdownFromFile(**kwargs: Any):
    """
    Convert Markdown read from a file or stream into HTML written to a file or stream.

    A convenience wrapper which builds a [`Markdown`][markdown.Markdown] instance from
    the keyword arguments and then invokes its
    [`convertFile`][markdown.Markdown.convertFile] method instead of
    [`convert`][markdown.Markdown.convert].

    Keyword arguments:
        input (str | BinaryIO): A file name or readable object.
        output (str | BinaryIO): A file name or writable object.
        encoding (str): Encoding of input and output.
        **kwargs: Any arguments accepted by the `Markdown` class.

    !!! warning
        The Python-Markdown library does ***not*** sanitize its HTML output.
        As `markdown.markdownFromFile` writes directly to the file system, there is no
        easy way to sanitize the output from Python code. Therefore, it is
        recommended that the `markdown.markdownFromFile` function not be used on input
        from an untrusted source. For more information see [Sanitizing HTML
        Output](../../sanitization.md).

    """
    # The full keyword set (file options included) is handed to the constructor as-is.
    converter = Markdown(**kwargs)
    # Missing file options fall back to `None`; they are passed positionally
    # in the order `input`, `output`, `encoding`.
    file_options = [kwargs.get(option, None) for option in ('input', 'output', 'encoding')]
    converter.convertFile(*file_options)