Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/preprocessors.py: 97%

# Python Markdown

# A Python implementation of John Gruber's Markdown.

# Documentation: https://python-markdown.github.io/
# GitHub: https://github.com/Python-Markdown/markdown/
# PyPI: https://pypi.org/project/Markdown/

# Started by Manfred Stienstra (http://www.dwerg.net/).
# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
# Currently maintained by Waylan Limberg (https://github.com/waylan),
# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

# License: BSD (see LICENSE.md for details).

20"""

Preprocessors work on source text before it is broken down into its individual parts.
This is an excellent place to clean up bad characters or to extract portions for later
processing that the parser may otherwise choke on.

24"""

26from __future__ import annotations

28from typing import TYPE_CHECKING, Any

29from . import util

30from .htmlparser import HTMLExtractor

31import re

33if TYPE_CHECKING: # pragma: no cover

34 from markdown import Markdown

def build_preprocessors(md: Markdown, **kwargs: Any) -> util.Registry[Preprocessor]:
    """
    Build and return the default set of preprocessors used by Markdown.

    Each default preprocessor is instantiated with `md` and registered under
    a name and a numeric priority:

    * `normalize_whitespace` (priority 30): `NormalizeWhitespace`
    * `html_block` (priority 20): `HtmlBlockPreprocessor`

    Arguments:
        md: The `Markdown` instance handed to every preprocessor.
        **kwargs: Accepted for signature compatibility; not used here.

    Returns:
        A `util.Registry` holding the registered preprocessors.
    """
    registry = util.Registry()
    # (preprocessor class, registry name, priority), kept in the order in
    # which the instances are created and registered.
    defaults = (
        (NormalizeWhitespace, 'normalize_whitespace', 30),
        (HtmlBlockPreprocessor, 'html_block', 20),
    )
    for factory, name, priority in defaults:
        registry.register(factory(md), name, priority)
    return registry

class Preprocessor(util.Processor):
    """
    Base class for all preprocessors.

    A preprocessor operates on the document after it has been split into a
    list of lines. Subclasses provide a `run` method which receives that
    list, changes it as needed, and hands back either the very same list or
    a newly built one.

    Every preprocessor must derive from `Preprocessor`.
    """

    def run(self, lines: list[str]) -> list[str]:
        """
        Process the document; subclasses are expected to override this.

        Arguments:
            lines: The document as a list of strings, one per line (the
                text split on newlines).

        Returns:
            The (possibly modified) list of lines. This base implementation
            is only a stub: it does nothing and implicitly returns `None`.
        """
        pass  # pragma: no cover

class NormalizeWhitespace(Preprocessor):
    """ Bring whitespace into a uniform shape so later parsing is consistent. """

    def run(self, lines: list[str]) -> list[str]:
        """
        Normalize the whitespace of the whole document.

        The lines are joined into one string, cleaned up, and split again.
        In order, the clean-up:

        1. removes every occurrence of `util.STX` and `util.ETX`;
        2. converts `\\r\\n` and lone `\\r` line endings to `\\n`;
        3. appends two trailing newlines;
        4. expands tabs using the `tab_length` of the `Markdown` instance;
        5. empties any line made up solely of spaces, provided a newline
           comes right before it.

        Arguments:
            lines: The document as a list of lines.

        Returns:
            The normalized document as a new list of lines.
        """
        text = '\n'.join(lines)

        # NOTE(review): STX/ETX are presumably reserved as internal
        # placeholder delimiters, hence stripped from user input - confirm
        # against `util`.
        for marker in (util.STX, util.ETX):
            text = text.replace(marker, "")

        # "\r\n" has to be handled before the lone "\r", otherwise each
        # Windows line ending would turn into two newlines.
        for ending in ("\r\n", "\r"):
            text = text.replace(ending, "\n")
        text += "\n\n"

        text = text.expandtabs(self.md.tab_length)

        # The lookbehind requires a preceding newline, so a spaces-only
        # first line is left as it is.
        text = re.sub(r'(?<=\n) +\n', '\n', text)
        return text.split('\n')

class HtmlBlockPreprocessor(Preprocessor):
    """
    Pull raw HTML blocks out of the text and keep them for later retrieval.

    The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the
    [`Markdown`][markdown.Markdown] instance.
    """

    def run(self, lines: list[str]) -> list[str]:
        """
        Run the document through an `HTMLExtractor`.

        The lines are joined with newlines and fed to a fresh
        `HTMLExtractor` created for this `Markdown` instance. After the
        extractor is closed, the pieces it collected in `cleandoc` are
        concatenated and split back into lines.

        Arguments:
            lines: The document as a list of lines.

        Returns:
            The extractor's cleaned document as a list of lines.
        """
        document = '\n'.join(lines)
        extractor = HTMLExtractor(self.md)
        extractor.feed(document)
        # close() is called before `cleandoc` is read; presumably it flushes
        # any input still buffered by the extractor - TODO confirm.
        extractor.close()
        cleaned = ''.join(extractor.cleandoc)
        return cleaned.split('\n')