1# Python Markdown 
    2 
    3# A Python implementation of John Gruber's Markdown. 
    4 
    5# Documentation: https://python-markdown.github.io/ 
    6# GitHub: https://github.com/Python-Markdown/markdown/ 
    7# PyPI: https://pypi.org/project/Markdown/ 
    8 
    9# Started by Manfred Stienstra (http://www.dwerg.net/). 
    10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). 
    11# Currently maintained by Waylan Limberg (https://github.com/waylan), 
    12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). 
    13 
    14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later) 
    15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) 
    16# Copyright 2004 Manfred Stienstra (the original version) 
    17 
    18# License: BSD (see LICENSE.md for details). 
    19 
    20""" 
Tree processors manipulate the tree created by block processors. They can even create an entirely
new `ElementTree` object. This is an excellent place for creating summaries, adding collected
references, or making last-minute adjustments.
    24 
    25""" 
    26 
    27from __future__ import annotations 
    28 
import re
import xml.etree.ElementTree as etree
from collections import deque
from typing import TYPE_CHECKING, Any

from . import util
from . import inlinepatterns
    34 
    35if TYPE_CHECKING:  # pragma: no cover 
    36    from markdown import Markdown 
    37 
    38 
    39def build_treeprocessors(md: Markdown, **kwargs: Any) -> util.Registry[Treeprocessor]: 
    40    """ Build the default  `treeprocessors` for Markdown. """ 
    41    treeprocessors = util.Registry() 
    42    treeprocessors.register(InlineProcessor(md), 'inline', 20) 
    43    treeprocessors.register(PrettifyTreeprocessor(md), 'prettify', 10) 
    44    treeprocessors.register(UnescapeTreeprocessor(md), 'unescape', 0) 
    45    return treeprocessors 
    46 
    47 
    48def isString(s: object) -> bool: 
    49    """ Return `True` if object is a string but not an  [`AtomicString`][markdown.util.AtomicString]. """ 
    50    if not isinstance(s, util.AtomicString): 
    51        return isinstance(s, str) 
    52    return False 
    53 
    54 
    55class Treeprocessor(util.Processor): 
    56    """ 
    57    `Treeprocessor`s are run on the `ElementTree` object before serialization. 
    58 
    59    Each `Treeprocessor` implements a `run` method that takes a pointer to an 
    60    `Element` and modifies it as necessary. 
    61 
    62    `Treeprocessors` must extend `markdown.Treeprocessor`. 
    63 
    64    """ 
    65    def run(self, root: etree.Element) -> etree.Element | None: 
    66        """ 
    67        Subclasses of `Treeprocessor` should implement a `run` method, which 
    68        takes a root `Element`. This method can return another `Element` 
    69        object, and the existing root `Element` will be replaced, or it can 
    70        modify the current tree and return `None`. 
    71        """ 
    72        pass  # pragma: no cover 
    73 
    74 
    75class InlineProcessor(Treeprocessor): 
    76    """ 
    77    A `Treeprocessor` that traverses a tree, applying inline patterns. 
    78    """ 
    79 
    80    def __init__(self, md: Markdown): 
    81        self.__placeholder_prefix = util.INLINE_PLACEHOLDER_PREFIX 
    82        self.__placeholder_suffix = util.ETX 
    83        self.__placeholder_length = 4 + len(self.__placeholder_prefix) \ 
    84                                      + len(self.__placeholder_suffix) 
    85        self.__placeholder_re = util.INLINE_PLACEHOLDER_RE 
    86        self.md = md 
    87        self.inlinePatterns = md.inlinePatterns 
    88        self.ancestors: list[str] = [] 
    89 
    90    def __makePlaceholder(self, type: str) -> tuple[str, str]: 
    91        """ Generate a placeholder """ 
    92        id = "%04d" % len(self.stashed_nodes) 
    93        hash = util.INLINE_PLACEHOLDER % id 
    94        return hash, id 
    95 
    96    def __findPlaceholder(self, data: str, index: int) -> tuple[str | None, int]: 
    97        """ 
    98        Extract id from data string, start from index. 
    99 
    100        Arguments: 
    101            data: String. 
    102            index: Index, from which we start search. 
    103 
    104        Returns: 
    105            Placeholder id and string index, after the found placeholder. 
    106 
    107        """ 
    108        m = self.__placeholder_re.search(data, index) 
    109        if m: 
    110            return m.group(1), m.end() 
    111        else: 
    112            return None, index + 1 
    113 
    114    def __stashNode(self, node: etree.Element | str, type: str) -> str: 
    115        """ Add node to stash. """ 
    116        placeholder, id = self.__makePlaceholder(type) 
    117        self.stashed_nodes[id] = node 
    118        return placeholder 
    119 
    120    def __handleInline(self, data: str, patternIndex: int = 0) -> str: 
    121        """ 
    122        Process string with inline patterns and replace it with placeholders. 
    123 
    124        Arguments: 
    125            data: A line of Markdown text. 
    126            patternIndex: The index of the `inlinePattern` to start with. 
    127 
    128        Returns: 
    129            String with placeholders. 
    130 
    131        """ 
    132        if not isinstance(data, util.AtomicString): 
    133            startIndex = 0 
    134            count = len(self.inlinePatterns) 
    135            while patternIndex < count: 
    136                data, matched, startIndex = self.__applyPattern( 
    137                    self.inlinePatterns[patternIndex], data, patternIndex, startIndex 
    138                ) 
    139                if not matched: 
    140                    patternIndex += 1 
    141        return data 
    142 
    143    def __processElementText(self, node: etree.Element, subnode: etree.Element, isText: bool = True) -> None: 
    144        """ 
    145        Process placeholders in `Element.text` or `Element.tail` 
    146        of Elements popped from `self.stashed_nodes`. 
    147 
    148        Arguments: 
    149            node: Parent node. 
    150            subnode: Processing node. 
    151            isText: Boolean variable, True - it's text, False - it's a tail. 
    152 
    153        """ 
    154        if isText: 
    155            text = subnode.text 
    156            subnode.text = None 
    157        else: 
    158            text = subnode.tail 
    159            subnode.tail = None 
    160 
    161        childResult = self.__processPlaceholders(text, subnode, isText) 
    162 
    163        if not isText and node is not subnode: 
    164            pos = list(node).index(subnode) + 1 
    165        else: 
    166            pos = 0 
    167 
    168        childResult.reverse() 
    169        for newChild in childResult: 
    170            node.insert(pos, newChild[0]) 
    171 
    172    def __processPlaceholders( 
    173        self, 
    174        data: str | None, 
    175        parent: etree.Element, 
    176        isText: bool = True 
    177    ) -> list[tuple[etree.Element, list[str]]]: 
    178        """ 
    179        Process string with placeholders and generate `ElementTree` tree. 
    180 
    181        Arguments: 
    182            data: String with placeholders instead of `ElementTree` elements. 
    183            parent: Element, which contains processing inline data. 
    184            isText: Boolean variable, True - it's text, False - it's a tail. 
    185 
    186        Returns: 
    187            List with `ElementTree` elements with applied inline patterns. 
    188 
    189        """ 
    190        def linkText(text: str | None) -> None: 
    191            if text: 
    192                if result: 
    193                    if result[-1][0].tail: 
    194                        result[-1][0].tail += text 
    195                    else: 
    196                        result[-1][0].tail = text 
    197                elif not isText: 
    198                    if parent.tail: 
    199                        parent.tail += text 
    200                    else: 
    201                        parent.tail = text 
    202                else: 
    203                    if parent.text: 
    204                        parent.text += text 
    205                    else: 
    206                        parent.text = text 
    207        result = [] 
    208        strartIndex = 0 
    209        while data: 
    210            index = data.find(self.__placeholder_prefix, strartIndex) 
    211            if index != -1: 
    212                id, phEndIndex = self.__findPlaceholder(data, index) 
    213 
    214                if id in self.stashed_nodes: 
    215                    node = self.stashed_nodes.get(id) 
    216 
    217                    if index > 0: 
    218                        text = data[strartIndex:index] 
    219                        linkText(text) 
    220 
    221                    if not isinstance(node, str):  # it's Element 
    222                        for child in [node] + list(node): 
    223                            if child.tail: 
    224                                if child.tail.strip(): 
    225                                    self.__processElementText( 
    226                                        node, child, False 
    227                                    ) 
    228                            if child.text: 
    229                                if child.text.strip(): 
    230                                    self.__processElementText(child, child) 
    231                    else:  # it's just a string 
    232                        linkText(node) 
    233                        strartIndex = phEndIndex 
    234                        continue 
    235 
    236                    strartIndex = phEndIndex 
    237                    result.append((node, self.ancestors[:])) 
    238 
    239                else:  # wrong placeholder 
    240                    end = index + len(self.__placeholder_prefix) 
    241                    linkText(data[strartIndex:end]) 
    242                    strartIndex = end 
    243            else: 
    244                text = data[strartIndex:] 
    245                if isinstance(data, util.AtomicString): 
    246                    # We don't want to loose the `AtomicString` 
    247                    text = util.AtomicString(text) 
    248                linkText(text) 
    249                data = "" 
    250 
    251        return result 
    252 
    253    def __applyPattern( 
    254        self, 
    255        pattern: inlinepatterns.Pattern, 
    256        data: str, 
    257        patternIndex: int, 
    258        startIndex: int = 0 
    259    ) -> tuple[str, bool, int]: 
    260        """ 
    261        Check if the line fits the pattern, create the necessary 
    262        elements, add it to `stashed_nodes`. 
    263 
    264        Arguments: 
    265            data: The text to be processed. 
    266            pattern: The pattern to be checked. 
    267            patternIndex: Index of current pattern. 
    268            startIndex: String index, from which we start searching. 
    269 
    270        Returns: 
    271            String with placeholders instead of `ElementTree` elements. 
    272 
    273        """ 
    274        new_style = isinstance(pattern, inlinepatterns.InlineProcessor) 
    275 
    276        for exclude in pattern.ANCESTOR_EXCLUDES: 
    277            if exclude.lower() in self.ancestors: 
    278                return data, False, 0 
    279 
    280        if new_style: 
    281            match = None 
    282            # Since `handleMatch` may reject our first match, 
    283            # we iterate over the buffer looking for matches 
    284            # until we can't find any more. 
    285            for match in pattern.getCompiledRegExp().finditer(data, startIndex): 
    286                node, start, end = pattern.handleMatch(match, data) 
    287                if start is None or end is None: 
    288                    startIndex += match.end(0) 
    289                    match = None 
    290                    continue 
    291                break 
    292        else:  # pragma: no cover 
    293            match = pattern.getCompiledRegExp().match(data[startIndex:]) 
    294            leftData = data[:startIndex] 
    295 
    296        if not match: 
    297            return data, False, 0 
    298 
    299        if not new_style:  # pragma: no cover 
    300            node = pattern.handleMatch(match) 
    301            start = match.start(0) 
    302            end = match.end(0) 
    303 
    304        if node is None: 
    305            return data, True, end 
    306 
    307        if not isinstance(node, str): 
    308            if not isinstance(node.text, util.AtomicString): 
    309                # We need to process current node too 
    310                for child in [node] + list(node): 
    311                    if not isString(node): 
    312                        if child.text: 
    313                            self.ancestors.append(child.tag.lower()) 
    314                            child.text = self.__handleInline( 
    315                                child.text, patternIndex + 1 
    316                            ) 
    317                            self.ancestors.pop() 
    318                        if child.tail: 
    319                            child.tail = self.__handleInline( 
    320                                child.tail, patternIndex 
    321                            ) 
    322 
    323        placeholder = self.__stashNode(node, pattern.type()) 
    324 
    325        if new_style: 
    326            return "{}{}{}".format(data[:start], 
    327                                   placeholder, data[end:]), True, 0 
    328        else:  # pragma: no cover 
    329            return "{}{}{}{}".format(leftData, 
    330                                     match.group(1), 
    331                                     placeholder, match.groups()[-1]), True, 0 
    332 
    333    def __build_ancestors(self, parent: etree.Element | None, parents: list[str]) -> None: 
    334        """Build the ancestor list.""" 
    335        ancestors = [] 
    336        while parent is not None: 
    337            if parent is not None: 
    338                ancestors.append(parent.tag.lower()) 
    339            parent = self.parent_map.get(parent) 
    340        ancestors.reverse() 
    341        parents.extend(ancestors) 
    342 
    343    def run(self, tree: etree.Element, ancestors: list[str] | None = None) -> etree.Element: 
    344        """Apply inline patterns to a parsed Markdown tree. 
    345 
    346        Iterate over `Element`, find elements with inline tag, apply inline 
    347        patterns and append newly created Elements to tree.  To avoid further 
    348        processing of string with inline patterns, instead of normal string, 
    349        use subclass [`AtomicString`][markdown.util.AtomicString]: 
    350 
    351            node.text = markdown.util.AtomicString("This will not be processed.") 
    352 
    353        Arguments: 
    354            tree: `Element` object, representing Markdown tree. 
    355            ancestors: List of parent tag names that precede the tree node (if needed). 
    356 
    357        Returns: 
    358            An element tree object with applied inline patterns. 
    359 
    360        """ 
    361        self.stashed_nodes: dict[str, etree.Element | str] = {} 
    362 
    363        # Ensure a valid parent list, but copy passed in lists 
    364        # to ensure we don't have the user accidentally change it on us. 
    365        tree_parents = [] if ancestors is None else ancestors[:] 
    366 
    367        self.parent_map = {c: p for p in tree.iter() for c in p} 
    368        stack = [(tree, tree_parents)] 
    369 
    370        while stack: 
    371            currElement, parents = stack.pop(0) 
    372 
    373            self.ancestors = parents 
    374            self.__build_ancestors(currElement, self.ancestors) 
    375 
    376            insertQueue = [] 
    377            for child in currElement: 
    378                if child.text and not isinstance( 
    379                    child.text, util.AtomicString 
    380                ): 
    381                    self.ancestors.append(child.tag.lower()) 
    382                    text = child.text 
    383                    child.text = None 
    384                    lst = self.__processPlaceholders( 
    385                        self.__handleInline(text), child 
    386                    ) 
    387                    for item in lst: 
    388                        self.parent_map[item[0]] = child 
    389                    stack += lst 
    390                    insertQueue.append((child, lst)) 
    391                    self.ancestors.pop() 
    392                if child.tail: 
    393                    tail = self.__handleInline(child.tail) 
    394                    dumby = etree.Element('d') 
    395                    child.tail = None 
    396                    tailResult = self.__processPlaceholders(tail, dumby, False) 
    397                    if dumby.tail: 
    398                        child.tail = dumby.tail 
    399                    pos = list(currElement).index(child) + 1 
    400                    tailResult.reverse() 
    401                    for newChild in tailResult: 
    402                        self.parent_map[newChild[0]] = currElement 
    403                        currElement.insert(pos, newChild[0]) 
    404                if len(child): 
    405                    self.parent_map[child] = currElement 
    406                    stack.append((child, self.ancestors[:])) 
    407 
    408            for element, lst in insertQueue: 
    409                for i, obj in enumerate(lst): 
    410                    newChild = obj[0] 
    411                    element.insert(i, newChild) 
    412        return tree 
    413 
    414 
    415class PrettifyTreeprocessor(Treeprocessor): 
    416    """ Add line breaks to the html document. """ 
    417 
    418    def _prettifyETree(self, elem: etree.Element) -> None: 
    419        """ Recursively add line breaks to `ElementTree` children. """ 
    420 
    421        i = "\n" 
    422        if self.md.is_block_level(elem.tag) and elem.tag not in ['code', 'pre']: 
    423            if (not elem.text or not elem.text.strip()) \ 
    424                    and len(elem) and self.md.is_block_level(elem[0].tag): 
    425                elem.text = i 
    426            for e in elem: 
    427                if self.md.is_block_level(e.tag): 
    428                    self._prettifyETree(e) 
    429        if not elem.tail or not elem.tail.strip(): 
    430            elem.tail = i 
    431 
    432    def run(self, root: etree.Element) -> None: 
    433        """ Add line breaks to `Element` object and its children. """ 
    434 
    435        self._prettifyETree(root) 
    436        # Do `<br />`'s separately as they are often in the middle of 
    437        # inline content and missed by `_prettifyETree`. 
    438        brs = root.iter('br') 
    439        for br in brs: 
    440            if not br.tail or not br.tail.strip(): 
    441                br.tail = '\n' 
    442            else: 
    443                br.tail = '\n%s' % br.tail 
    444        # Clean up extra empty lines at end of code blocks. 
    445        pres = root.iter('pre') 
    446        for pre in pres: 
    447            if len(pre) and pre[0].tag == 'code': 
    448                code = pre[0] 
    449                # Only prettify code containing text only 
    450                if not len(code) and code.text is not None: 
    451                    code.text = util.AtomicString(code.text.rstrip() + '\n') 
    452 
    453 
    454class UnescapeTreeprocessor(Treeprocessor): 
    455    """ Restore escaped chars """ 
    456 
    457    RE = re.compile(r'{}(\d+){}'.format(util.STX, util.ETX)) 
    458 
    459    def _unescape(self, m: re.Match[str]) -> str: 
    460        return chr(int(m.group(1))) 
    461 
    462    def unescape(self, text: str) -> str: 
    463        return self.RE.sub(self._unescape, text) 
    464 
    465    def run(self, root: etree.Element) -> None: 
    466        """ Loop over all elements and unescape all text. """ 
    467        for elem in root.iter(): 
    468            # Unescape text content 
    469            if elem.text and not elem.tag == 'code': 
    470                elem.text = self.unescape(elem.text) 
    471            # Unescape tail content 
    472            if elem.tail: 
    473                elem.tail = self.unescape(elem.tail) 
    474            # Unescape attribute values 
    475            for key, value in elem.items(): 
    476                elem.set(key, self.unescape(value))