Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/core.py: 68%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Python Markdown
3# A Python implementation of John Gruber's Markdown.
5# Documentation: https://python-markdown.github.io/
6# GitHub: https://github.com/Python-Markdown/markdown/
7# PyPI: https://pypi.org/project/Markdown/
9# Started by Manfred Stienstra (http://www.dwerg.net/).
10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
11# Currently maintained by Waylan Limberg (https://github.com/waylan),
12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later)
15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
16# Copyright 2004 Manfred Stienstra (the original version)
18# License: BSD (see LICENSE.md for details).
20from __future__ import annotations
22import codecs
23import sys
24import logging
25import importlib
26from typing import TYPE_CHECKING, Any, BinaryIO, Callable, ClassVar, Mapping, Sequence
27from . import util
28from .preprocessors import build_preprocessors
29from .blockprocessors import build_block_parser
30from .treeprocessors import build_treeprocessors
31from .inlinepatterns import build_inlinepatterns
32from .postprocessors import build_postprocessors
33from .extensions import Extension
34from .serializers import to_html_string, to_xhtml_string
35from .util import BLOCK_LEVEL_ELEMENTS
37if TYPE_CHECKING: # pragma: no cover
38 from xml.etree.ElementTree import Element
# Public names exported by a star-import of this module: the parser class and
# the two convenience functions defined at the bottom of the file.
__all__ = ['Markdown', 'markdown', 'markdownFromFile']


# Module-wide logger. It is registered under the fixed name 'MARKDOWN' (not
# `__name__`); the `Markdown` class uses it to emit debug messages when
# extensions are imported and loaded.
logger = logging.getLogger('MARKDOWN')
class Markdown:
    """
    A parser which converts Markdown to HTML.

    Attributes:
        Markdown.tab_length (int): The number of spaces which correspond to a single tab. Default: `4`.
        Markdown.ESCAPED_CHARS (list[str]): List of characters which get the backslash escape treatment.
        Markdown.block_level_elements (list[str]): List of HTML tags which get treated as block-level elements.
            See [`markdown.util.BLOCK_LEVEL_ELEMENTS`][] for the full list of elements.
        Markdown.registeredExtensions (list[Extension]): List of extensions which have called
            [`registerExtension`][markdown.Markdown.registerExtension] during setup.
        Markdown.doc_tag (str): Element used to wrap document. Default: `div`.
        Markdown.stripTopLevelTags (bool): Indicates whether the `doc_tag` should be removed. Default: `True`.
        Markdown.references (dict[str, tuple[str, str]]): A mapping of link references found in a parsed document
            where the key is the reference name and the value is a tuple of the URL and title.
        Markdown.htmlStash (util.HtmlStash): The instance of the `HtmlStash` used by an instance of this class.
        Markdown.output_formats (dict[str, Callable[xml.etree.ElementTree.Element]]): A mapping of known output
            formats by name and their respective serializers. Each serializer must be a callable which accepts an
            [`Element`][xml.etree.ElementTree.Element] and returns a `str`.
        Markdown.output_format (str): The output format set by
            [`set_output_format`][markdown.Markdown.set_output_format].
        Markdown.serializer (Callable[xml.etree.ElementTree.Element]): The serializer set by
            [`set_output_format`][markdown.Markdown.set_output_format].
        Markdown.preprocessors (util.Registry): A collection of [`preprocessors`][markdown.preprocessors].
        Markdown.parser (blockparser.BlockParser): A collection of [`blockprocessors`][markdown.blockprocessors].
        Markdown.inlinePatterns (util.Registry): A collection of [`inlinepatterns`][markdown.inlinepatterns].
        Markdown.treeprocessors (util.Registry): A collection of [`treeprocessors`][markdown.treeprocessors].
        Markdown.postprocessors (util.Registry): A collection of [`postprocessors`][markdown.postprocessors].

    """

    doc_tag = "div"     # Element used to wrap document - later removed

    output_formats: ClassVar[dict[str, Callable[[Element], str]]] = {
        'html':   to_html_string,
        'xhtml':  to_xhtml_string,
    }
    """
    A mapping of known output formats by name and their respective serializers. Each serializer must be a
    callable which accepts an [`Element`][xml.etree.ElementTree.Element] and returns a `str`.
    """

    def __init__(self, **kwargs: Any) -> None:
        """
        Creates a new Markdown instance.

        Keyword Arguments:
            extensions (list[Extension | str]): A list of extensions.

                If an item is an instance of a subclass of [`markdown.extensions.Extension`][],
                the instance will be used as-is. If an item is of type `str`, it is passed
                to [`build_extension`][markdown.Markdown.build_extension] with its corresponding
                `extension_configs` and the returned instance of [`markdown.extensions.Extension`][]
                is used.
            extension_configs (dict[str, dict[str, Any]]): Configuration settings for extensions.
            output_format (str): Format of output. Supported formats are:

                * `xhtml`: Outputs XHTML style tags. Default.
                * `html`: Outputs HTML style tags.
            tab_length (int): Length of tabs in the source. Default: `4`

        """

        self.tab_length: int = kwargs.get('tab_length', 4)

        self.ESCAPED_CHARS: list[str] = [
            '\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '>', '#', '+', '-', '.', '!'
        ]
        """ List of characters which get the backslash escape treatment. """

        # Copy so that per-instance changes never leak into the shared module-level list.
        self.block_level_elements: list[str] = BLOCK_LEVEL_ELEMENTS.copy()

        self.registeredExtensions: list[Extension] = []
        self.docType = ""  # TODO: Maybe delete this. It does not appear to be used anymore.
        self.stripTopLevelTags: bool = True

        # The processor registries must exist before any extension is loaded,
        # because `registerExtensions` hands `self` to each extension.
        self.build_parser()

        self.references: dict[str, tuple[str, str]] = {}
        self.htmlStash: util.HtmlStash = util.HtmlStash()
        self.registerExtensions(extensions=kwargs.get('extensions', []),
                                configs=kwargs.get('extension_configs', {}))
        self.set_output_format(kwargs.get('output_format', 'xhtml'))
        self.reset()

    def build_parser(self) -> Markdown:
        """
        Build the parser from the various parts.

        Assigns a value to each of the following attributes on the class instance:

        * **`Markdown.preprocessors`** ([`Registry`][markdown.util.Registry]) -- A collection of
          [`preprocessors`][markdown.preprocessors].
        * **`Markdown.parser`** ([`BlockParser`][markdown.blockparser.BlockParser]) -- A collection of
          [`blockprocessors`][markdown.blockprocessors].
        * **`Markdown.inlinePatterns`** ([`Registry`][markdown.util.Registry]) -- A collection of
          [`inlinepatterns`][markdown.inlinepatterns].
        * **`Markdown.treeprocessors`** ([`Registry`][markdown.util.Registry]) -- A collection of
          [`treeprocessors`][markdown.treeprocessors].
        * **`Markdown.postprocessors`** ([`Registry`][markdown.util.Registry]) -- A collection of
          [`postprocessors`][markdown.postprocessors].

        This method could be redefined in a subclass to build a custom parser which is made up of a different
        combination of processors and patterns.

        Returns:
            The instance itself, so that calls can be chained.

        """
        self.preprocessors = build_preprocessors(self)
        self.parser = build_block_parser(self)
        self.inlinePatterns = build_inlinepatterns(self)
        self.treeprocessors = build_treeprocessors(self)
        self.postprocessors = build_postprocessors(self)
        return self

    def registerExtensions(
        self,
        extensions: Sequence[Extension | str],
        configs: Mapping[str, dict[str, Any]]
    ) -> Markdown:
        """
        Load a list of extensions into an instance of the `Markdown` class.

        Arguments:
            extensions (list[Extension | str]): A list of extensions.

                If an item is an instance of a subclass of [`markdown.extensions.Extension`][],
                the instance will be used as-is. If an item is of type `str`, it is passed
                to [`build_extension`][markdown.Markdown.build_extension] with its corresponding `configs` and the
                returned instance of [`markdown.extensions.Extension`][] is used.
            configs (dict[str, dict[str, Any]]): Configuration settings for extensions.

        Returns:
            The instance itself, so that calls can be chained.

        Raises:
            TypeError: If an item is neither `None`, a `str`, nor an `Extension` instance.

        """
        for ext in extensions:
            if isinstance(ext, str):
                ext = self.build_extension(ext, configs.get(ext, {}))
            if isinstance(ext, Extension):
                ext.extendMarkdown(self)
                # Pass the values as logging arguments so the message is only
                # formatted when the debug level is actually enabled.
                logger.debug(
                    'Successfully loaded extension "%s.%s".',
                    ext.__class__.__module__, ext.__class__.__name__
                )
            elif ext is not None:
                # `None` entries are silently skipped; anything else is an error.
                raise TypeError(
                    'Extension "{}.{}" must be of type: "{}.{}"'.format(
                        ext.__class__.__module__, ext.__class__.__name__,
                        Extension.__module__, Extension.__name__
                    )
                )
        return self

    def build_extension(self, ext_name: str, configs: Mapping[str, Any]) -> Extension:
        """
        Build extension from a string name, then return an instance using the given `configs`.

        Arguments:
            ext_name: Name of extension as a string.
            configs: Configuration settings for extension.

        Returns:
            An instance of the extension with the given configuration settings.

        Raises:
            ImportError: If the module named by `ext_name` cannot be imported.
            AttributeError: If no class name was given and calling the module's `makeExtension`
                raises `AttributeError` (for example because the function is missing).

        First attempt to load an entry point. The string name must be registered as an entry point in the
        `markdown.extensions` group which points to a subclass of the [`markdown.extensions.Extension`][] class.
        If multiple distributions have registered the same name, the first one found is returned.

        If no entry point is found, assume dot notation (`path.to.module:ClassName`). Load the specified class and
        return an instance. If no class is specified, import the module and call a `makeExtension` function and return
        the [`markdown.extensions.Extension`][] instance returned by that function.
        """
        # Work on a private `dict` so the mapping can be expanded as keyword arguments.
        configs = dict(configs)

        entry_points = [ep for ep in util.get_installed_extensions() if ep.name == ext_name]
        if entry_points:
            ext = entry_points[0].load()
            return ext(**configs)

        # Get class name (if provided): `path.to.module:ClassName`
        ext_name, _, class_name = ext_name.partition(':')

        try:
            module = importlib.import_module(ext_name)
            logger.debug('Successfully imported extension module "%s".', ext_name)
        except ImportError as e:
            # Replace the message but re-raise the same exception to keep the traceback.
            message = 'Failed loading extension "%s".' % ext_name
            e.args = (message,) + e.args[1:]
            raise

        if class_name:
            # Load given class name from module.
            return getattr(module, class_name)(**configs)

        # Expect `makeExtension()` function to return an `Extension` instance.
        try:
            return module.makeExtension(**configs)
        except AttributeError as e:
            # Guard against an `AttributeError` raised without arguments, which
            # would otherwise turn into an unrelated `IndexError` here.
            original = e.args[0] if e.args else ''
            message = "Failed to initiate extension '%s': %s" % (ext_name, original)
            e.args = (message,) + e.args[1:]
            raise

    def registerExtension(self, extension: Extension) -> Markdown:
        """
        Register an extension as having a resettable state.

        Arguments:
            extension: An instance of the extension to register.

        Returns:
            The instance itself, so that calls can be chained.

        This should get called once by an extension during setup. A "registered" extension's
        `reset` method is called by [`Markdown.reset()`][markdown.Markdown.reset]. Not all extensions have or need a
        resettable state, and so it should not be assumed that all extensions are "registered."

        """
        self.registeredExtensions.append(extension)
        return self

    def reset(self) -> Markdown:
        """
        Resets all state variables to prepare the parser instance for new input.

        Called once upon creation of a class instance. Should be called manually between calls
        to [`Markdown.convert`][markdown.Markdown.convert].

        Returns:
            The instance itself, so that calls can be chained.
        """
        self.htmlStash.reset()
        self.references.clear()

        for extension in self.registeredExtensions:
            # Registered extensions are not required to implement `reset`.
            if hasattr(extension, 'reset'):
                extension.reset()

        return self

    def set_output_format(self, format: str) -> Markdown:
        """
        Set the output format for the class instance.

        Arguments:
            format: Must be a known value in `Markdown.output_formats`.

        Returns:
            The instance itself, so that calls can be chained.

        Raises:
            KeyError: If the normalized format name is not a key of `Markdown.output_formats`.

        """
        # Lower-case the name and drop any trailing `1`, `4` or `5` so that
        # version-suffixed names (e.g. `html5`, `xhtml1`) map to the base name.
        self.output_format = format.lower().rstrip('145')  # ignore number
        try:
            self.serializer = self.output_formats[self.output_format]
        except KeyError as e:
            valid_formats = sorted(self.output_formats)
            message = 'Invalid Output Format: "%s". Use one of %s.' \
                % (self.output_format,
                   '"' + '", "'.join(valid_formats) + '"')
            e.args = (message,) + e.args[1:]
            raise
        return self

    # Note: the `tag` argument is type annotated `Any` as ElementTree uses many various objects as tags.
    # As there is no standardization in ElementTree, the type of a given tag is unpredictable.
    def is_block_level(self, tag: Any) -> bool:
        """
        Check if the given `tag` is a block level HTML tag.

        Returns `True` for any string listed in `Markdown.block_level_elements`. A `tag` which is
        not a string always returns `False`.

        """
        if isinstance(tag, str):
            # Compare case-insensitively and ignore any trailing `/`.
            return tag.lower().rstrip('/') in self.block_level_elements
        # Some ElementTree tags are not strings, so return False.
        return False

    def convert(self, source: str) -> str:
        """
        Convert a Markdown string to a string in the specified output format.

        Arguments:
            source: Markdown formatted text as Unicode or ASCII string.

        Returns:
            A string in the specified output format.

        Raises:
            ValueError: If `stripTopLevelTags` is set and the `doc_tag` wrapper cannot be
                found in the serialized (non-empty) document.

        Markdown parsing takes place in five steps:

        1. A bunch of [`preprocessors`][markdown.preprocessors] munge the input text.
        2. A [`BlockParser`][markdown.blockparser.BlockParser] parses the high-level structural elements of the
           pre-processed text into an [`ElementTree`][xml.etree.ElementTree.ElementTree] object.
        3. A bunch of [`treeprocessors`][markdown.treeprocessors] are run against the
           [`ElementTree`][xml.etree.ElementTree.ElementTree] object. One such `treeprocessor`
           ([`markdown.treeprocessors.InlineProcessor`][]) runs [`inlinepatterns`][markdown.inlinepatterns]
           against the [`ElementTree`][xml.etree.ElementTree.ElementTree] object, parsing inline markup.
        4. Some [`postprocessors`][markdown.postprocessors] are run against the text after the
           [`ElementTree`][xml.etree.ElementTree.ElementTree] object has been serialized into text.
        5. The output is returned as a string.

        The pre-processed lines are also kept on the instance as `self.lines`.

        """

        # Fix up the source text
        if not source.strip():
            return ''  # a blank Unicode string

        try:
            source = str(source)
        except UnicodeDecodeError as e:  # pragma: no cover
            # Customize error message while maintaining original traceback
            e.reason += '. -- Note: Markdown only accepts Unicode input!'
            raise

        # Split into lines and run the line preprocessors.
        self.lines = source.split("\n")
        for prep in self.preprocessors:
            self.lines = prep.run(self.lines)

        # Parse the high-level elements.
        root = self.parser.parseDocument(self.lines).getroot()

        # Run the tree-processors
        for treeprocessor in self.treeprocessors:
            newRoot = treeprocessor.run(root)
            # A tree processor may modify the tree in place and return `None`.
            if newRoot is not None:
                root = newRoot

        # Serialize _properly_.  Strip top-level tags.
        output = self.serializer(root)
        if self.stripTopLevelTags:
            try:
                # Skip past the opening `<doc_tag>`: tag name plus the two angle brackets.
                start = output.index(
                    '<%s>' % self.doc_tag) + len(self.doc_tag) + 2
                end = output.rindex('</%s>' % self.doc_tag)
                output = output[start:end].strip()
            except ValueError as e:  # pragma: no cover
                if output.strip().endswith('<%s />' % self.doc_tag):
                    # We have an empty document
                    output = ''
                else:
                    # We have a serious problem
                    raise ValueError('Markdown failed to strip top-level '
                                     'tags. Document=%r' % output.strip()) from e

        # Run the text post-processors
        for pp in self.postprocessors:
            output = pp.run(output)

        return output.strip()

    def convertFile(
        self,
        input: str | BinaryIO | None = None,
        output: str | BinaryIO | None = None,
        encoding: str | None = None,
    ) -> Markdown:
        """
        Converts a Markdown file and writes the resulting HTML to a file or stream.

        Decodes the file using the provided encoding (defaults to `utf-8`),
        passes the file content to markdown, and outputs the HTML to either
        the provided stream or the file with provided name, using the same
        encoding as the source file. The
        [`xmlcharrefreplace`](https://docs.python.org/3/library/codecs.html#error-handlers)
        error handler is used when encoding the output.

        **Note:** This is the only place that decoding and encoding of Unicode
        takes place in Python-Markdown. (All other code is Unicode-in /
        Unicode-out.)

        Arguments:
            input: File object or path. Reads from `stdin` if `None`.
            output: File object or path. Writes to `stdout` if `None`.
            encoding: Encoding of input and output files. Defaults to `utf-8`.

        Returns:
            The instance itself, so that calls can be chained. The HTML is written
            to `output`, not returned.

        """

        encoding = encoding or "utf-8"

        # Read the source
        if input:
            if isinstance(input, str):
                input_file = open(input, mode="r", encoding=encoding)
            else:
                input_file = codecs.getreader(encoding)(input)
            # The context manager closes the file (or, for a wrapped stream, the
            # underlying stream) even when reading or decoding fails.
            with input_file:
                text = input_file.read()
        else:
            text = sys.stdin.read()

        text = text.lstrip('\ufeff')  # remove the byte-order mark

        # Convert
        html = self.convert(text)

        # Write to file or stdout
        if output:
            if isinstance(output, str):
                # `newline=""` disables newline translation, so the bytes written
                # match what a binary-mode encoded writer would produce.
                with open(output, "w",
                          encoding=encoding,
                          errors="xmlcharrefreplace",
                          newline="") as output_file:
                    output_file.write(html)
            else:
                writer = codecs.getwriter(encoding)
                output_file = writer(output, errors="xmlcharrefreplace")
                output_file.write(html)
                # Don't close here. User may want to write more.
        else:
            # Encode manually and write bytes to stdout.
            encoded = html.encode(encoding, "xmlcharrefreplace")
            sys.stdout.buffer.write(encoded)

        return self
454"""
455EXPORTED FUNCTIONS
456=============================================================================
458Those are the two functions we really mean to export: `markdown()` and
459`markdownFromFile()`.
460"""
def markdown(text: str, **kwargs: Any) -> str:
    """
    Convert a markdown string to HTML and return HTML as a Unicode string.

    Convenience wrapper for the simplest use case: a fresh
    [`Markdown`][markdown.Markdown] instance is created from the given keyword
    arguments (loading any requested extensions) and `text` is run through its
    [`convert`][markdown.Markdown.convert] method.

    Arguments:
        text: Markdown formatted text as Unicode or ASCII string.

    Keyword arguments:
        **kwargs: Any arguments accepted by the Markdown class.

    Returns:
        A string in the specified output format.

    """
    converter = Markdown(**kwargs)
    rendered = converter.convert(text)
    return rendered
def markdownFromFile(**kwargs: Any):
    """
    Read Markdown text from a file and write output to a file or a stream.

    Convenience wrapper which creates a [`Markdown`][markdown.Markdown] instance from
    the given keyword arguments and then calls its
    [`convertFile`][markdown.Markdown.convertFile] method (instead of
    [`convert`][markdown.Markdown.convert]) with the `input`, `output` and `encoding`
    values. Each of the three is passed as `None` when it was not supplied.

    Keyword arguments:
        input (str | BinaryIO): A file name or readable object.
        output (str | BinaryIO): A file name or writable object.
        encoding (str): Encoding of input and output.
        **kwargs: Any arguments accepted by the `Markdown` class.

    """
    # The full keyword set is forwarded to the constructor, exactly as before.
    md = Markdown(**kwargs)
    source, target, codec = (kwargs.get(key) for key in ('input', 'output', 'encoding'))
    md.convertFile(source, target, codec)