1# Python Markdown 
    2 
    3# A Python implementation of John Gruber's Markdown. 
    4 
    5# Documentation: https://python-markdown.github.io/ 
    6# GitHub: https://github.com/Python-Markdown/markdown/ 
    7# PyPI: https://pypi.org/project/Markdown/ 
    8 
    9# Started by Manfred Stienstra (http://www.dwerg.net/). 
    10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). 
    11# Currently maintained by Waylan Limberg (https://github.com/waylan), 
    12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). 
    13 
    14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later) 
    15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) 
    16# Copyright 2004 Manfred Stienstra (the original version) 
    17 
    18# License: BSD (see LICENSE.md for details). 
    19 
    20""" 
    21In version 3.0, a new, more flexible inline processor was added, [`markdown.inlinepatterns.InlineProcessor`][].   The 
    22original inline patterns, which inherit from [`markdown.inlinepatterns.Pattern`][] or one of its children are still 
    23supported, though users are encouraged to migrate. 
    24 
    25The new `InlineProcessor` provides two major enhancements to `Patterns`: 
    26 
    271. Inline Processors no longer need to match the entire block, so regular expressions no longer need to start with 
   `r'^(.*?)'` and end with `r'(.*)$'`. This runs faster. The returned [`Match`][re.Match] object will only contain
    29   what is explicitly matched in the pattern, and extension pattern groups now start with `m.group(1)`. 
    30 
    312.  The `handleMatch` method now takes an additional input called `data`, which is the entire block under analysis, 
    32    not just what is matched with the specified pattern. The method now returns the element *and* the indexes relative 
    33    to `data` that the return element is replacing (usually `m.start(0)` and `m.end(0)`).  If the boundaries are 
    34    returned as `None`, it is assumed that the match did not take place, and nothing will be altered in `data`. 
    35 
    36    This allows handling of more complex constructs than regular expressions can handle, e.g., matching nested 
    37    brackets, and explicit control of the span "consumed" by the processor. 
    38 
    39""" 
    40 
    41from __future__ import annotations 
    42 
    43from . import util 
    44from typing import TYPE_CHECKING, Any, Collection, NamedTuple 
    45import re 
    46import xml.etree.ElementTree as etree 
    47from html import entities 
    48 
    49if TYPE_CHECKING:  # pragma: no cover 
    50    from markdown import Markdown 
    51 
    52 
    53def build_inlinepatterns(md: Markdown, **kwargs: Any) -> util.Registry[InlineProcessor]: 
    54    """ 
    55    Build the default set of inline patterns for Markdown. 
    56 
    57    The order in which processors and/or patterns are applied is very important - e.g. if we first replace 
    58    `http://.../` links with `<a>` tags and _then_ try to replace inline HTML, we would end up with a mess. So, we 
    59    apply the expressions in the following order: 
    60 
    61    * backticks and escaped characters have to be handled before everything else so that we can preempt any markdown 
    62      patterns by escaping them; 
    63 
    64    * then we handle the various types of links (auto-links must be handled before inline HTML); 
    65 
    66    * then we handle inline HTML.  At this point we will simply replace all inline HTML strings with a placeholder 
    67      and add the actual HTML to a stash; 
    68 
    69    * finally we apply strong, emphasis, etc. 
    70 
    71    """ 
    72    inlinePatterns = util.Registry() 
    73    inlinePatterns.register(BacktickInlineProcessor(BACKTICK_RE), 'backtick', 190) 
    74    inlinePatterns.register(EscapeInlineProcessor(ESCAPE_RE, md), 'escape', 180) 
    75    inlinePatterns.register(ReferenceInlineProcessor(REFERENCE_RE, md), 'reference', 170) 
    76    inlinePatterns.register(LinkInlineProcessor(LINK_RE, md), 'link', 160) 
    77    inlinePatterns.register(ImageInlineProcessor(IMAGE_LINK_RE, md), 'image_link', 150) 
    78    inlinePatterns.register( 
    79        ImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'image_reference', 140 
    80    ) 
    81    inlinePatterns.register( 
    82        ShortReferenceInlineProcessor(REFERENCE_RE, md), 'short_reference', 130 
    83    ) 
    84    inlinePatterns.register( 
    85        ShortImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'short_image_ref', 125 
    86    ) 
    87    inlinePatterns.register(AutolinkInlineProcessor(AUTOLINK_RE, md), 'autolink', 120) 
    88    inlinePatterns.register(AutomailInlineProcessor(AUTOMAIL_RE, md), 'automail', 110) 
    89    inlinePatterns.register(SubstituteTagInlineProcessor(LINE_BREAK_RE, 'br'), 'linebreak', 100) 
    90    inlinePatterns.register(HtmlInlineProcessor(HTML_RE, md), 'html', 90) 
    91    inlinePatterns.register(HtmlInlineProcessor(ENTITY_RE, md), 'entity', 80) 
    92    inlinePatterns.register(SimpleTextInlineProcessor(NOT_STRONG_RE), 'not_strong', 70) 
    93    inlinePatterns.register(AsteriskProcessor(r'\*'), 'em_strong', 60) 
    94    inlinePatterns.register(UnderscoreProcessor(r'_'), 'em_strong2', 50) 
    95    return inlinePatterns 
    96 
    97 
    98# The actual regular expressions for patterns 
    99# ----------------------------------------------------------------------------- 
    100 
    101NOIMG = r'(?<!\!)' 
    102""" Match not an image. Partial regular expression which matches if not preceded by `!`. """ 
    103 
    104BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\2(?!`))' 
    105""" Match backtick quoted string (`` `e=f()` `` or ``` ``e=f("`")`` ```). """ 
    106 
    107ESCAPE_RE = r'\\(.)' 
    108""" Match a backslash escaped character (`\\<` or `\\*`). """ 
    109 
    110EMPHASIS_RE = r'(\*)([^\*]+)\1' 
    111""" Match emphasis with an asterisk (`*emphasis*`). """ 
    112 
    113STRONG_RE = r'(\*{2})(.+?)\1' 
    114""" Match strong with an asterisk (`**strong**`). """ 
    115 
    116SMART_STRONG_RE = r'(?<!\w)(_{2})(?!_)(.+?)(?<!_)\1(?!\w)' 
    117""" Match strong with underscore while ignoring middle word underscores (`__smart__strong__`). """ 
    118 
    119SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\1(?!\w)' 
    120""" Match emphasis with underscore while ignoring middle word underscores (`_smart_emphasis_`). """ 
    121 
    122SMART_STRONG_EM_RE = r'(?<!\w)(\_)\1(?!\1)(.+?)(?<!\w)\1(?!\1)(.+?)\1{3}(?!\w)' 
    123""" Match strong emphasis with underscores (`__strong _em__`). """ 
    124 
    125EM_STRONG_RE = r'(\*)\1{2}(.+?)\1(.*?)\1{2}' 
    126""" Match emphasis strong with asterisk (`***strongem***` or `***em*strong**`). """ 
    127 
    128EM_STRONG2_RE = r'(_)\1{2}(.+?)\1(.*?)\1{2}' 
    129""" Match emphasis strong with underscores (`___emstrong___` or `___em_strong__`). """ 
    130 
    131STRONG_EM_RE = r'(\*)\1{2}(.+?)\1{2}(.*?)\1' 
    132""" Match strong emphasis with asterisk (`***strong**em*`). """ 
    133 
    134STRONG_EM2_RE = r'(_)\1{2}(.+?)\1{2}(.*?)\1' 
    135""" Match strong emphasis with underscores (`___strong__em_`). """ 
    136 
    137STRONG_EM3_RE = r'(\*)\1(?!\1)([^*]+?)\1(?!\1)(.+?)\1{3}' 
    138""" Match strong emphasis with asterisk (`**strong*em***`). """ 
    139 
    140LINK_RE = NOIMG + r'\[' 
    141""" Match start of in-line link (`[text](url)` or `[text](<url>)` or `[text](url "title")`). """ 
    142 
    143IMAGE_LINK_RE = r'\!\[' 
    144""" Match start of in-line image link (`` or ``). """ 
    145 
    146REFERENCE_RE = LINK_RE 
    147""" Match start of reference link (`[Label][3]`). """ 
    148 
    149IMAGE_REFERENCE_RE = IMAGE_LINK_RE 
    150""" Match start of image reference (`![alt text][2]`). """ 
    151 
    152NOT_STRONG_RE = r'((^|(?<=\s))(\*{1,3}|_{1,3})(?=\s|$))' 
    153""" Match a stand-alone `*` or `_`. """ 
    154 
    155AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^<>]*)>' 
    156""" Match an automatic link (`<http://www.example.com>`). """ 
    157 
    158AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>' 
    159""" Match an automatic email link (`<me@example.com>`). """ 
    160 
    161HTML_RE = ( 
    162    r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|'          # Tag 
    163    r'!--(?:(?!<!--|-->).)*--|'                   # Comment 
    164    r'[?](?:(?!<[?]|[?]>).)*[?]|'                 # Processing instruction 
    165    r'!\[CDATA\[(?:(?!<!\[CDATA\[|\]\]>).)*\]\]'  # `CDATA` 
    166    ')>)' 
    167) 
    168""" Match an HTML tag (`<...>`). """ 
    169 
    170ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)' 
    171""" Match an HTML entity (`&` (decimal) or `&` (hex) or `&` (named)). """ 
    172 
    173LINE_BREAK_RE = r'  \n' 
    174""" Match two spaces at end of line. """ 
    175 
    176 
    177def dequote(string: str) -> str: 
    178    """Remove quotes from around a string.""" 
    179    if ((string.startswith('"') and string.endswith('"')) or 
    180       (string.startswith("'") and string.endswith("'"))): 
    181        return string[1:-1] 
    182    else: 
    183        return string 
    184 
    185 
class EmStrongItem(NamedTuple):
    """Emphasis/strong pattern item: a compiled pattern plus the builder and tag(s) used to render a match."""
    # Compiled regular expression which is matched against the text at a given position.
    pattern: re.Pattern[str]
    # Name of the builder used to create the element(s): `'single'`, `'double'` or `'double2'`.
    builder: str
    # A single tag name, or two comma separated tag names for the double builders (e.g. `'strong,em'`).
    tags: str
    191 
    192 
    193# The pattern classes 
    194# ----------------------------------------------------------------------------- 
    195 
    196 
    197class Pattern:  # pragma: no cover 
    198    """ 
    199    Base class that inline patterns subclass. 
    200 
    201    Inline patterns are handled by means of `Pattern` subclasses, one per regular expression. 
    202    Each pattern object uses a single regular expression and must support the following methods: 
    203    [`getCompiledRegExp`][markdown.inlinepatterns.Pattern.getCompiledRegExp] and 
    204    [`handleMatch`][markdown.inlinepatterns.Pattern.handleMatch]. 
    205 
    206    All the regular expressions used by `Pattern` subclasses must capture the whole block.  For this 
    207    reason, they all start with `^(.*)` and end with `(.*)!`.  When passing a regular expression on 
    208    class initialization, the `^(.*)` and `(.*)!` are added automatically and the regular expression 
    209    is pre-compiled. 
    210 
    211    It is strongly suggested that the newer style [`markdown.inlinepatterns.InlineProcessor`][] that 
    212    use a more efficient and flexible search approach be used instead. However, the older style 
    213    `Pattern` remains for backward compatibility with many existing third-party extensions. 
    214 
    215    """ 
    216 
    217    ANCESTOR_EXCLUDES: Collection[str] = tuple() 
    218    """ 
    219    A collection of elements which are undesirable ancestors. The processor will be skipped if it 
    220    would cause the content to be a descendant of one of the listed tag names. 
    221    """ 
    222 
    223    compiled_re: re.Pattern[str] 
    224    md: Markdown | None 
    225 
    226    def __init__(self, pattern: str, md: Markdown | None = None): 
    227        """ 
    228        Create an instant of an inline pattern. 
    229 
    230        Arguments: 
    231            pattern: A regular expression that matches a pattern. 
    232            md: An optional pointer to the instance of `markdown.Markdown` and is available as 
    233                `self.md` on the class instance. 
    234 
    235 
    236        """ 
    237        self.pattern = pattern 
    238        self.compiled_re = re.compile(r"^(.*?)%s(.*)$" % pattern, 
    239                                      re.DOTALL | re.UNICODE) 
    240 
    241        self.md = md 
    242 
    243    def getCompiledRegExp(self) -> re.Pattern: 
    244        """ Return a compiled regular expression. """ 
    245        return self.compiled_re 
    246 
    247    def handleMatch(self, m: re.Match[str]) -> etree.Element | str: 
    248        """Return a ElementTree element from the given match. 
    249 
    250        Subclasses should override this method. 
    251 
    252        Arguments: 
    253            m: A match object containing a match of the pattern. 
    254 
    255        Returns: An ElementTree Element object. 
    256 
    257        """ 
    258        pass  # pragma: no cover 
    259 
    260    def type(self) -> str: 
    261        """ Return class name, to define pattern type """ 
    262        return self.__class__.__name__ 
    263 
    264    def unescape(self, text: str) -> str: 
    265        """ Return unescaped text given text with an inline placeholder. """ 
    266        try: 
    267            stash = self.md.treeprocessors['inline'].stashed_nodes 
    268        except KeyError:  # pragma: no cover 
    269            return text 
    270 
    271        def get_stash(m): 
    272            id = m.group(1) 
    273            if id in stash: 
    274                value = stash.get(id) 
    275                if isinstance(value, str): 
    276                    return value 
    277                else: 
    278                    # An `etree` Element - return text content only 
    279                    return ''.join(value.itertext()) 
    280        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text) 
    281 
    282 
    283class InlineProcessor(Pattern): 
    284    """ 
    285    Base class that inline processors subclass. 
    286 
    287    This is the newer style inline processor that uses a more 
    288    efficient and flexible search approach. 
    289 
    290    """ 
    291 
    292    def __init__(self, pattern: str, md: Markdown | None = None): 
    293        """ 
    294        Create an instant of an inline processor. 
    295 
    296        Arguments: 
    297            pattern: A regular expression that matches a pattern. 
    298            md: An optional pointer to the instance of `markdown.Markdown` and is available as 
    299                `self.md` on the class instance. 
    300 
    301        """ 
    302        self.pattern = pattern 
    303        self.compiled_re = re.compile(pattern, re.DOTALL | re.UNICODE) 
    304 
    305        # API for Markdown to pass `safe_mode` into instance 
    306        self.safe_mode = False 
    307        self.md = md 
    308 
    309    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str | None, int | None, int | None]: 
    310        """Return a ElementTree element from the given match and the 
    311        start and end index of the matched text. 
    312 
    313        If `start` and/or `end` are returned as `None`, it will be 
    314        assumed that the processor did not find a valid region of text. 
    315 
    316        Subclasses should override this method. 
    317 
    318        Arguments: 
    319            m: A re match object containing a match of the pattern. 
    320            data: The buffer currently under analysis. 
    321 
    322        Returns: 
    323            el: The ElementTree element, text or None. 
    324            start: The start of the region that has been matched or None. 
    325            end: The end of the region that has been matched or None. 
    326 
    327        """ 
    328        pass  # pragma: no cover 
    329 
    330 
    331class SimpleTextPattern(Pattern):  # pragma: no cover 
    332    """ Return a simple text of `group(2)` of a Pattern. """ 
    333    def handleMatch(self, m: re.Match[str]) -> str: 
    334        """ Return string content of `group(2)` of a matching pattern. """ 
    335        return m.group(2) 
    336 
    337 
    338class SimpleTextInlineProcessor(InlineProcessor): 
    339    """ Return a simple text of `group(1)` of a Pattern. """ 
    340    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]: 
    341        """ Return string content of `group(1)` of a matching pattern. """ 
    342        return m.group(1), m.start(0), m.end(0) 
    343 
    344 
    345class EscapeInlineProcessor(InlineProcessor): 
    346    """ Return an escaped character. """ 
    347 
    348    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str | None, int, int]: 
    349        """ 
    350        If the character matched by `group(1)` of a pattern is in [`ESCAPED_CHARS`][markdown.Markdown.ESCAPED_CHARS] 
    351        then return the integer representing the character's Unicode code point (as returned by [`ord`][]) wrapped 
    352        in [`util.STX`][markdown.util.STX] and [`util.ETX`][markdown.util.ETX]. 
    353 
    354        If the matched character is not in [`ESCAPED_CHARS`][markdown.Markdown.ESCAPED_CHARS], then return `None`. 
    355        """ 
    356 
    357        char = m.group(1) 
    358        if char in self.md.ESCAPED_CHARS: 
    359            return '{}{}{}'.format(util.STX, ord(char), util.ETX), m.start(0), m.end(0) 
    360        else: 
    361            return None, m.start(0), m.end(0) 
    362 
    363 
    364class SimpleTagPattern(Pattern):  # pragma: no cover 
    365    """ 
    366    Return element of type `tag` with a text attribute of `group(3)` 
    367    of a Pattern. 
    368 
    369    """ 
    370    def __init__(self, pattern: str, tag: str): 
    371        """ 
    372        Create an instant of an simple tag pattern. 
    373 
    374        Arguments: 
    375            pattern: A regular expression that matches a pattern. 
    376            tag: Tag of element. 
    377 
    378        """ 
    379        Pattern.__init__(self, pattern) 
    380        self.tag = tag 
    381        """ The tag of the rendered element. """ 
    382 
    383    def handleMatch(self, m: re.Match[str]) -> etree.Element: 
    384        """ 
    385        Return [`Element`][xml.etree.ElementTree.Element] of type `tag` with the string in `group(3)` of a 
    386        matching pattern as the Element's text. 
    387        """ 
    388        el = etree.Element(self.tag) 
    389        el.text = m.group(3) 
    390        return el 
    391 
    392 
    393class SimpleTagInlineProcessor(InlineProcessor): 
    394    """ 
    395    Return element of type `tag` with a text attribute of `group(2)` 
    396    of a Pattern. 
    397 
    398    """ 
    399    def __init__(self, pattern: str, tag: str): 
    400        """ 
    401        Create an instant of an simple tag processor. 
    402 
    403        Arguments: 
    404            pattern: A regular expression that matches a pattern. 
    405            tag: Tag of element. 
    406 
    407        """ 
    408        InlineProcessor.__init__(self, pattern) 
    409        self.tag = tag 
    410        """ The tag of the rendered element. """ 
    411 
    412    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:  # pragma: no cover 
    413        """ 
    414        Return [`Element`][xml.etree.ElementTree.Element] of type `tag` with the string in `group(2)` of a 
    415        matching pattern as the Element's text. 
    416        """ 
    417        el = etree.Element(self.tag) 
    418        el.text = m.group(2) 
    419        return el, m.start(0), m.end(0) 
    420 
    421 
    422class SubstituteTagPattern(SimpleTagPattern):  # pragma: no cover 
    423    """ Return an element of type `tag` with no children. """ 
    424    def handleMatch(self, m: re.Match[str]) -> etree.Element: 
    425        """ Return empty [`Element`][xml.etree.ElementTree.Element] of type `tag`. """ 
    426        return etree.Element(self.tag) 
    427 
    428 
    429class SubstituteTagInlineProcessor(SimpleTagInlineProcessor): 
    430    """ Return an element of type `tag` with no children. """ 
    431    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]: 
    432        """ Return empty [`Element`][xml.etree.ElementTree.Element] of type `tag`. """ 
    433        return etree.Element(self.tag), m.start(0), m.end(0) 
    434 
    435 
    436class BacktickInlineProcessor(InlineProcessor): 
    437    """ Return a `<code>` element containing the escaped matching text. """ 
    438    def __init__(self, pattern: str): 
    439        InlineProcessor.__init__(self, pattern) 
    440        self.ESCAPED_BSLASH = '{}{}{}'.format(util.STX, ord('\\'), util.ETX) 
    441        self.tag = 'code' 
    442        """ The tag of the rendered element. """ 
    443 
    444    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str, int, int]: 
    445        """ 
    446        If the match contains `group(3)` of a pattern, then return a `code` 
    447        [`Element`][xml.etree.ElementTree.Element] which contains HTML escaped text (with 
    448        [`code_escape`][markdown.util.code_escape]) as an [`AtomicString`][markdown.util.AtomicString]. 
    449 
    450        If the match does not contain `group(3)` then return the text of `group(1)` backslash escaped. 
    451 
    452        """ 
    453        if m.group(3): 
    454            el = etree.Element(self.tag) 
    455            el.text = util.AtomicString(util.code_escape(m.group(3).strip())) 
    456            return el, m.start(0), m.end(0) 
    457        else: 
    458            return m.group(1).replace('\\\\', self.ESCAPED_BSLASH), m.start(0), m.end(0) 
    459 
    460 
    461class DoubleTagPattern(SimpleTagPattern):  # pragma: no cover 
    462    """Return a ElementTree element nested in tag2 nested in tag1. 
    463 
    464    Useful for strong emphasis etc. 
    465 
    466    """ 
    467    def handleMatch(self, m: re.Match[str]) -> etree.Element: 
    468        """ 
    469        Return [`Element`][xml.etree.ElementTree.Element] in following format: 
    470        `<tag1><tag2>group(3)</tag2>group(4)</tag2>` where `group(4)` is optional. 
    471 
    472        """ 
    473        tag1, tag2 = self.tag.split(",") 
    474        el1 = etree.Element(tag1) 
    475        el2 = etree.SubElement(el1, tag2) 
    476        el2.text = m.group(3) 
    477        if len(m.groups()) == 5: 
    478            el2.tail = m.group(4) 
    479        return el1 
    480 
    481 
    482class DoubleTagInlineProcessor(SimpleTagInlineProcessor): 
    483    """Return a ElementTree element nested in tag2 nested in tag1. 
    484 
    485    Useful for strong emphasis etc. 
    486 
    487    """ 
    488    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:  # pragma: no cover 
    489        """ 
    490        Return [`Element`][xml.etree.ElementTree.Element] in following format: 
    491        `<tag1><tag2>group(2)</tag2>group(3)</tag2>` where `group(3)` is optional. 
    492 
    493        """ 
    494        tag1, tag2 = self.tag.split(",") 
    495        el1 = etree.Element(tag1) 
    496        el2 = etree.SubElement(el1, tag2) 
    497        el2.text = m.group(2) 
    498        if len(m.groups()) == 3: 
    499            el2.tail = m.group(3) 
    500        return el1, m.start(0), m.end(0) 
    501 
    502 
    503class HtmlInlineProcessor(InlineProcessor): 
    504    """ Store raw inline html and return a placeholder. """ 
    505    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]: 
    506        """ Store the text of `group(1)` of a pattern and return a placeholder string. """ 
    507        rawhtml = self.backslash_unescape(self.unescape(m.group(1))) 
    508        place_holder = self.md.htmlStash.store(rawhtml) 
    509        return place_holder, m.start(0), m.end(0) 
    510 
    511    def unescape(self, text: str) -> str: 
    512        """ Return unescaped text given text with an inline placeholder. """ 
    513        try: 
    514            stash = self.md.treeprocessors['inline'].stashed_nodes 
    515        except KeyError:  # pragma: no cover 
    516            return text 
    517 
    518        def get_stash(m: re.Match[str]) -> str: 
    519            id = m.group(1) 
    520            value = stash.get(id) 
    521            if value is not None: 
    522                try: 
    523                    return self.md.serializer(value) 
    524                except Exception: 
    525                    return r'\%s' % value 
    526 
    527        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text) 
    528 
    529    def backslash_unescape(self, text: str) -> str: 
    530        """ Return text with backslash escapes undone (backslashes are restored). """ 
    531        try: 
    532            RE = self.md.treeprocessors['unescape'].RE 
    533        except KeyError:  # pragma: no cover 
    534            return text 
    535 
    536        def _unescape(m: re.Match[str]) -> str: 
    537            return chr(int(m.group(1))) 
    538 
    539        return RE.sub(_unescape, text) 
    540 
    541 
    542class AsteriskProcessor(InlineProcessor): 
    543    """Emphasis processor for handling strong and em matches inside asterisks.""" 
    544 
    545    PATTERNS = [ 
    546        EmStrongItem(re.compile(EM_STRONG_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'), 
    547        EmStrongItem(re.compile(STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'), 
    548        EmStrongItem(re.compile(STRONG_EM3_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'), 
    549        EmStrongItem(re.compile(STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'), 
    550        EmStrongItem(re.compile(EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em') 
    551    ] 
    552    """ The various strong and emphasis patterns handled by this processor. """ 
    553 
    554    def build_single(self, m: re.Match[str], tag: str, idx: int) -> etree.Element: 
    555        """Return single tag.""" 
    556        el1 = etree.Element(tag) 
    557        text = m.group(2) 
    558        self.parse_sub_patterns(text, el1, None, idx) 
    559        return el1 
    560 
    561    def build_double(self, m: re.Match[str], tags: str, idx: int) -> etree.Element: 
    562        """Return double tag.""" 
    563 
    564        tag1, tag2 = tags.split(",") 
    565        el1 = etree.Element(tag1) 
    566        el2 = etree.Element(tag2) 
    567        text = m.group(2) 
    568        self.parse_sub_patterns(text, el2, None, idx) 
    569        el1.append(el2) 
    570        if len(m.groups()) == 3: 
    571            text = m.group(3) 
    572            self.parse_sub_patterns(text, el1, el2, idx) 
    573        return el1 
    574 
    575    def build_double2(self, m: re.Match[str], tags: str, idx: int) -> etree.Element: 
    576        """Return double tags (variant 2): `<strong>text <em>text</em></strong>`.""" 
    577 
    578        tag1, tag2 = tags.split(",") 
    579        el1 = etree.Element(tag1) 
    580        el2 = etree.Element(tag2) 
    581        text = m.group(2) 
    582        self.parse_sub_patterns(text, el1, None, idx) 
    583        text = m.group(3) 
    584        el1.append(el2) 
    585        self.parse_sub_patterns(text, el2, None, idx) 
    586        return el1 
    587 
    588    def parse_sub_patterns( 
    589        self, data: str, parent: etree.Element, last: etree.Element | None, idx: int 
    590    ) -> None: 
    591        """ 
    592        Parses sub patterns. 
    593 
    594        `data`: text to evaluate. 
    595 
    596        `parent`: Parent to attach text and sub elements to. 
    597 
    598        `last`: Last appended child to parent. Can also be None if parent has no children. 
    599 
    600        `idx`: Current pattern index that was used to evaluate the parent. 
    601        """ 
    602 
    603        offset = 0 
    604        pos = 0 
    605 
    606        length = len(data) 
    607        while pos < length: 
    608            # Find the start of potential emphasis or strong tokens 
    609            if self.compiled_re.match(data, pos): 
    610                matched = False 
    611                # See if the we can match an emphasis/strong pattern 
    612                for index, item in enumerate(self.PATTERNS): 
    613                    # Only evaluate patterns that are after what was used on the parent 
    614                    if index <= idx: 
    615                        continue 
    616                    m = item.pattern.match(data, pos) 
    617                    if m: 
    618                        # Append child nodes to parent 
    619                        # Text nodes should be appended to the last 
    620                        # child if present, and if not, it should 
    621                        # be added as the parent's text node. 
    622                        text = data[offset:m.start(0)] 
    623                        if text: 
    624                            if last is not None: 
    625                                last.tail = text 
    626                            else: 
    627                                parent.text = text 
    628                        el = self.build_element(m, item.builder, item.tags, index) 
    629                        parent.append(el) 
    630                        last = el 
    631                        # Move our position past the matched hunk 
    632                        offset = pos = m.end(0) 
    633                        matched = True 
    634                if not matched: 
    635                    # We matched nothing, move on to the next character 
    636                    pos += 1 
    637            else: 
    638                # Increment position as no potential emphasis start was found. 
    639                pos += 1 
    640 
    641        # Append any leftover text as a text node. 
    642        text = data[offset:] 
    643        if text: 
    644            if last is not None: 
    645                last.tail = text 
    646            else: 
    647                parent.text = text 
    648 
    649    def build_element(self, m: re.Match[str], builder: str, tags: str, index: int) -> etree.Element: 
    650        """Element builder.""" 
    651 
    652        if builder == 'double2': 
    653            return self.build_double2(m, tags, index) 
    654        elif builder == 'double': 
    655            return self.build_double(m, tags, index) 
    656        else: 
    657            return self.build_single(m, tags, index) 
    658 
    659    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]: 
    660        """Parse patterns.""" 
    661 
    662        el = None 
    663        start = None 
    664        end = None 
    665 
    666        for index, item in enumerate(self.PATTERNS): 
    667            m1 = item.pattern.match(data, m.start(0)) 
    668            if m1: 
    669                start = m1.start(0) 
    670                end = m1.end(0) 
    671                el = self.build_element(m1, item.builder, item.tags, index) 
    672                break 
    673        return el, start, end 
    674 
    675 
    676class UnderscoreProcessor(AsteriskProcessor): 
    677    """Emphasis processor for handling strong and em matches inside underscores.""" 
    678 
    679    PATTERNS = [ 
    680        EmStrongItem(re.compile(EM_STRONG2_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'), 
    681        EmStrongItem(re.compile(STRONG_EM2_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'), 
    682        EmStrongItem(re.compile(SMART_STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'), 
    683        EmStrongItem(re.compile(SMART_STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'), 
    684        EmStrongItem(re.compile(SMART_EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em') 
    685    ] 
    686    """ The various strong and emphasis patterns handled by this processor. """ 
    687 
    688 
class LinkInlineProcessor(InlineProcessor):
    """
    Return a link element from the given match.

    Handles the inline form `[text](href "title")`: `getText` collects the bracketed link text and `getLink` parses
    the parenthesized destination and optional title that follow it.
    """
    # Matches the opening `(` plus optional whitespace and, when present, a complete angle-bracketed destination
    # with an optional quoted title and the closing `)`. Group 1 is `<href>` (brackets included) and group 2 is the
    # quoted title (quotes included). Because the whole tail is optional, a bare `(` also matches with both groups
    # unset.
    RE_LINK = re.compile(r'''\(\s*(?:(<[^<>]*>)\s*(?:('[^']*'|"[^"]*")\s*)?\))?''', re.DOTALL | re.UNICODE)
    # Matches a single whitespace character; used to replace each one in a title with a plain space.
    RE_TITLE_CLEAN = re.compile(r'\s')

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Return an `a` [`Element`][xml.etree.ElementTree.Element] or `(None, None, None)`.

        The link text is read starting at `m.end(0)` and the destination is parsed from where the text ended. On
        success the element is returned with `m.start(0)` and the index just past the parsed link; if either the
        text or the destination cannot be resolved, `(None, None, None)` is returned.
        """
        text, index, handled = self.getText(data, m.end(0))

        if not handled:
            return None, None, None

        href, title, index, handled = self.getLink(data, index)
        if not handled:
            return None, None, None

        el = etree.Element("a")
        el.text = text

        el.set("href", href)

        # An empty title is still emitted; only a missing one (`None`) is skipped.
        if title is not None:
            el.set("title", title)

        return el, m.start(0), index

    def getLink(self, data: str, index: int) -> tuple[str, str | None, int, bool]:
        """
        Parse data between `()` of `[Text]()` allowing recursive `()`.

        Arguments:
            data: The text being scanned.
            index: Position in `data` at which the opening `(` is expected.

        Returns:
            A tuple of `(href, title, index, handled)`. `title` is `None` when no title was found, `index` is the
            position reached by the parser, and `handled` is `True` only when a complete link was parsed.
        """

        href = ''
        title: str | None = None
        handled = False

        m = self.RE_LINK.match(data, pos=index)
        if m and m.group(1):
            # Matches [Text](<link> "title")
            # Slicing `[1:-1]` drops the surrounding `<>` from the link and the quotes from the title.
            href = m.group(1)[1:-1].strip()
            if m.group(2):
                title = m.group(2)[1:-1]
            index = m.end(0)
            handled = True
        elif m:
            # Only the opening `(` (and any whitespace) matched, so scan the rest character by character.
            # Track bracket nesting and index in string
            bracket_count = 1
            backtrack_count = 1
            start_index = m.end()
            index = start_index
            last_bracket = -1

            # Primary (first found) quote tracking.
            quote: str | None = None
            start_quote = -1
            exit_quote = -1
            ignore_matches = False

            # Secondary (second found) quote tracking.
            alt_quote = None
            start_alt_quote = -1
            exit_alt_quote = -1

            # Track last character
            last = ''

            # `index` is advanced by hand inside the loop, so it equals `pos` until the `index += 1` below and
            # `pos + 1` afterwards.
            for pos in range(index, len(data)):
                c = data[pos]
                if c == '(':
                    # Count nested (
                    # Don't increment the bracket count if we are sure we're in a title.
                    if not ignore_matches:
                        bracket_count += 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                elif c == ')':
                    # Match nested ) to (
                    # Don't decrement if we are sure we are in a title that is unclosed.
                    # If the last non-space character was a closing quote, this `)` ends the link outright.
                    if ((exit_quote != -1 and quote == last) or (exit_alt_quote != -1 and alt_quote == last)):
                        bracket_count = 0
                    elif not ignore_matches:
                        bracket_count -= 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                        # We've found our backup end location if the title doesn't resolve.
                        if backtrack_count == 0:
                            last_bracket = index + 1

                elif c in ("'", '"'):
                    # Quote has started
                    if not quote:
                        # We'll assume we are now in a title.
                        # Brackets are quoted, so no need to match them (except for the final one).
                        ignore_matches = True
                        # Keep the nesting depth reached so far as the backup counter.
                        backtrack_count = bracket_count
                        bracket_count = 1
                        # `index + 1` is the position just after this quote character.
                        start_quote = index + 1
                        quote = c
                    # Secondary quote (in case the first doesn't resolve): [text](link'"title")
                    elif c != quote and not alt_quote:
                        start_alt_quote = index + 1
                        alt_quote = c
                    # Update primary quote match
                    elif c == quote:
                        exit_quote = index + 1
                    # Update secondary quote match
                    elif alt_quote and c == alt_quote:
                        exit_alt_quote = index + 1

                index += 1

                # Link is closed, so let's break out of the loop
                if bracket_count == 0:
                    # Get the title if we closed a title string right before link closed
                    # In each case `href` ends just before the opening quote and `title` is the text between the
                    # opening and closing quote.
                    if exit_quote >= 0 and quote == last:
                        href = data[start_index:start_quote - 1]
                        title = ''.join(data[start_quote:exit_quote - 1])
                    elif exit_alt_quote >= 0 and alt_quote == last:
                        href = data[start_index:start_alt_quote - 1]
                        title = ''.join(data[start_alt_quote:exit_alt_quote - 1])
                    else:
                        # No title: everything up to, but not including, the closing `)` is the link.
                        href = data[start_index:index - 1]
                    break

                # Remember the most recent non-space character so a `)` can tell whether it directly follows a
                # closing quote (ignoring spaces).
                if c != ' ':
                    last = c

            # We have a scenario: `[test](link"notitle)`
            # When we enter a string, we stop tracking bracket resolution in the main counter,
            # but we do keep a backup counter up until we discover where we might resolve all brackets
            # if the title string fails to resolve.
            if bracket_count != 0 and backtrack_count == 0:
                href = data[start_index:last_bracket - 1]
                index = last_bracket
                bracket_count = 0

            # The link is only valid when every bracket was resolved.
            handled = bracket_count == 0

        if title is not None:
            # Strip the title, pass it through `self.unescape` and `dequote`, then replace each whitespace
            # character with a space.
            title = self.RE_TITLE_CLEAN.sub(' ', dequote(self.unescape(title.strip())))

        href = self.unescape(href).strip()

        return href, title, index, handled

    def getText(self, data: str, index: int) -> tuple[str, int, bool]:
        """Parse the content between `[]` of the start of an image or link
        resolving nested square brackets.

        Scanning starts at `index` with one bracket already counted as open. Returns a tuple of the collected
        text (without the final closing bracket), the index just past the last character examined, and whether
        the brackets were balanced.

        """
        bracket_count = 1
        text = []
        for pos in range(index, len(data)):
            c = data[pos]
            if c == ']':
                bracket_count -= 1
            elif c == '[':
                bracket_count += 1
            index += 1
            # Stop before appending so the closing `]` is not included in the text.
            if bracket_count == 0:
                break
            text.append(c)
        return ''.join(text), index, bracket_count == 0
    849 
    850 
    851class ImageInlineProcessor(LinkInlineProcessor): 
    852    """ Return a `img` element from the given match. """ 
    853 
    854    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]: 
    855        """ Return an `img` [`Element`][xml.etree.ElementTree.Element] or `(None, None, None)`. """ 
    856        text, index, handled = self.getText(data, m.end(0)) 
    857        if not handled: 
    858            return None, None, None 
    859 
    860        src, title, index, handled = self.getLink(data, index) 
    861        if not handled: 
    862            return None, None, None 
    863 
    864        el = etree.Element("img") 
    865 
    866        el.set("src", src) 
    867 
    868        if title is not None: 
    869            el.set("title", title) 
    870 
    871        el.set('alt', self.unescape(text)) 
    872        return el, m.start(0), index 
    873 
    874 
    875class ReferenceInlineProcessor(LinkInlineProcessor): 
    876    """ Match to a stored reference and return link element. """ 
    877    NEWLINE_CLEANUP_RE = re.compile(r'\s+', re.MULTILINE) 
    878 
    879    RE_LINK = re.compile(r'\s?\[([^\]]*)\]', re.DOTALL | re.UNICODE) 
    880 
    881    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]: 
    882        """ 
    883        Return [`Element`][xml.etree.ElementTree.Element] returned by `makeTag` method or `(None, None, None)`. 
    884 
    885        """ 
    886        text, index, handled = self.getText(data, m.end(0)) 
    887        if not handled: 
    888            return None, None, None 
    889 
    890        id, end, handled = self.evalId(data, index, text) 
    891        if not handled: 
    892            return None, None, None 
    893 
    894        # Clean up line breaks in id 
    895        id = self.NEWLINE_CLEANUP_RE.sub(' ', id) 
    896        if id not in self.md.references:  # ignore undefined refs 
    897            return None, m.start(0), end 
    898 
    899        href, title = self.md.references[id] 
    900 
    901        return self.makeTag(href, title, text), m.start(0), end 
    902 
    903    def evalId(self, data: str, index: int, text: str) -> tuple[str | None, int, bool]: 
    904        """ 
    905        Evaluate the id portion of `[ref][id]`. 
    906 
    907        If `[ref][]` use `[ref]`. 
    908        """ 
    909        m = self.RE_LINK.match(data, pos=index) 
    910        if not m: 
    911            return None, index, False 
    912        else: 
    913            id = m.group(1).lower() 
    914            end = m.end(0) 
    915            if not id: 
    916                id = text.lower() 
    917        return id, end, True 
    918 
    919    def makeTag(self, href: str, title: str, text: str) -> etree.Element: 
    920        """ Return an `a` [`Element`][xml.etree.ElementTree.Element]. """ 
    921        el = etree.Element('a') 
    922 
    923        el.set('href', href) 
    924        if title: 
    925            el.set('title', title) 
    926 
    927        el.text = text 
    928        return el 
    929 
    930 
    931class ShortReferenceInlineProcessor(ReferenceInlineProcessor): 
    932    """Short form of reference: `[google]`. """ 
    933    def evalId(self, data: str, index: int, text: str) -> tuple[str, int, bool]: 
    934        """Evaluate the id of `[ref]`.  """ 
    935 
    936        return text.lower(), index, True 
    937 
    938 
    939class ImageReferenceInlineProcessor(ReferenceInlineProcessor): 
    940    """ Match to a stored reference and return `img` element. """ 
    941    def makeTag(self, href: str, title: str, text: str) -> etree.Element: 
    942        """ Return an `img` [`Element`][xml.etree.ElementTree.Element]. """ 
    943        el = etree.Element("img") 
    944        el.set("src", href) 
    945        if title: 
    946            el.set("title", title) 
    947        el.set("alt", self.unescape(text)) 
    948        return el 
    949 
    950 
    951class ShortImageReferenceInlineProcessor(ImageReferenceInlineProcessor): 
    952    """ Short form of image reference: `![ref]`. """ 
    953    def evalId(self, data: str, index: int, text: str) -> tuple[str, int, bool]: 
    954        """Evaluate the id of `[ref]`.  """ 
    955 
    956        return text.lower(), index, True 
    957 
    958 
    959class AutolinkInlineProcessor(InlineProcessor): 
    960    """ Return a link Element given an auto-link (`<http://example/com>`). """ 
    961    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]: 
    962        """ Return an `a` [`Element`][xml.etree.ElementTree.Element] of `group(1)`. """ 
    963        el = etree.Element("a") 
    964        el.set('href', self.unescape(m.group(1))) 
    965        el.text = util.AtomicString(m.group(1)) 
    966        return el, m.start(0), m.end(0) 
    967 
    968 
    969class AutomailInlineProcessor(InlineProcessor): 
    970    """ 
    971    Return a `mailto` link Element given an auto-mail link (`<foo@example.com>`). 
    972    """ 
    973    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]: 
    974        """ Return an [`Element`][xml.etree.ElementTree.Element] containing a `mailto` link  of `group(1)`. """ 
    975        el = etree.Element('a') 
    976        email = self.unescape(m.group(1)) 
    977        if email.startswith("mailto:"): 
    978            email = email[len("mailto:"):] 
    979 
    980        def codepoint2name(code: int) -> str: 
    981            """Return entity definition by code, or the code if not defined.""" 
    982            entity = entities.codepoint2name.get(code) 
    983            if entity: 
    984                return "{}{};".format(util.AMP_SUBSTITUTE, entity) 
    985            else: 
    986                return "%s#%d;" % (util.AMP_SUBSTITUTE, code) 
    987 
    988        letters = [codepoint2name(ord(letter)) for letter in email] 
    989        el.text = util.AtomicString(''.join(letters)) 
    990 
    991        mailto = "mailto:" + email 
    992        mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' % 
    993                          ord(letter) for letter in mailto]) 
    994        el.set('href', mailto) 
    995        return el, m.start(0), m.end(0)