1# Python Markdown
2
3# A Python implementation of John Gruber's Markdown.
4
5# Documentation: https://python-markdown.github.io/
6# GitHub: https://github.com/Python-Markdown/markdown/
7# PyPI: https://pypi.org/project/Markdown/
8
9# Started by Manfred Stienstra (http://www.dwerg.net/).
10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
11# Currently maintained by Waylan Limberg (https://github.com/waylan),
12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
13
14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later)
15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
16# Copyright 2004 Manfred Stienstra (the original version)
17
18# License: BSD (see LICENSE.md for details).
19
20"""
21Tree processors manipulate the tree created by block processors. They can even create an entirely
22new `ElementTree` object. This is an excellent place for creating summaries, adding collected
23references, or last minute adjustments.
24
25"""
26
27from __future__ import annotations
28
29import re
30import xml.etree.ElementTree as etree
31from typing import TYPE_CHECKING, Any
32from . import util
33from . import inlinepatterns
34
35if TYPE_CHECKING: # pragma: no cover
36 from markdown import Markdown
37
38
def build_treeprocessors(md: Markdown, **kwargs: Any) -> util.Registry[Treeprocessor]:
    """
    Assemble the registry of default tree processors for a `Markdown` instance.

    Arguments:
        md: The `Markdown` instance handed to each processor's constructor.
        **kwargs: Accepted for call compatibility; not read by this function.

    Returns:
        A `util.Registry` containing the `inline` (priority 20), `prettify`
        (priority 10) and `unescape` (priority 0) processors.

    """
    registry = util.Registry()
    # (processor class, registered name, priority) in registration order.
    defaults = (
        (InlineProcessor, 'inline', 20),
        (PrettifyTreeprocessor, 'prettify', 10),
        (UnescapeTreeprocessor, 'unescape', 0),
    )
    for processor_cls, name, priority in defaults:
        registry.register(processor_cls(md), name, priority)
    return registry
46
47
def isString(s: object) -> bool:
    """
    Tell whether `s` is an ordinary string.

    Returns `True` only when `s` is a `str` that is not an
    [`AtomicString`][markdown.util.AtomicString]; every other object,
    including an `AtomicString`, yields `False`.
    """
    return not isinstance(s, util.AtomicString) and isinstance(s, str)
53
54
class Treeprocessor(util.Processor):
    """
    Base class for processors that operate on the parsed `ElementTree`
    before it is serialized.

    A concrete `Treeprocessor` provides a `run` method which receives the
    root `Element` and either edits the tree in place or hands back a
    replacement root.

    `Treeprocessors` must extend `markdown.Treeprocessor`.

    """
    def run(self, root: etree.Element) -> etree.Element | None:
        """
        Process the tree rooted at `root`.

        Subclasses are expected to override this method. An override may
        return a new `Element`, in which case the existing root `Element`
        will be replaced, or it may modify the current tree and return
        `None`. This base implementation does nothing and returns `None`.
        """
        return None  # pragma: no cover
73
74
class InlineProcessor(Treeprocessor):
    """
    A `Treeprocessor` that traverses a tree, applying inline patterns.

    Matched inline constructs are swapped for textual placeholders while the
    remaining text is scanned; the nodes they stand for are kept in
    `self.stashed_nodes` (keyed by placeholder id) and are spliced back into
    the tree once a piece of text has been fully processed.
    """

    def __init__(self, md: Markdown):
        self.__placeholder_prefix = util.INLINE_PLACEHOLDER_PREFIX
        self.__placeholder_suffix = util.ETX
        # Not read anywhere in this class; kept for backward compatibility.
        # NOTE(review): the 4 presumably mirrors the "%04d" id width used in
        # `__makePlaceholder` -- confirm.
        self.__placeholder_length = 4 + len(self.__placeholder_prefix) \
            + len(self.__placeholder_suffix)
        self.__placeholder_re = util.INLINE_PLACEHOLDER_RE
        self.md = md
        self.inlinePatterns = md.inlinePatterns
        # Lower-cased tag names of the elements enclosing the text currently
        # being processed; consulted for `ANCESTOR_EXCLUDES` checks.
        self.ancestors: list[str] = []

    def __makePlaceholder(self, type: str) -> tuple[str, str]:
        """
        Generate a placeholder.

        Arguments:
            type: Not used; retained for interface compatibility.

        Returns:
            A `(placeholder, id)` tuple. The id is the current number of
            stashed nodes, zero-padded to four digits.

        """
        placeholder_id = "%04d" % len(self.stashed_nodes)
        placeholder = util.INLINE_PLACEHOLDER % placeholder_id
        return placeholder, placeholder_id

    def __findPlaceholder(self, data: str, index: int) -> tuple[str | None, int]:
        """
        Extract the placeholder id located at `index` in `data`.

        Arguments:
            data: String.
            index: Index at which a placeholder is expected to begin.

        Returns:
            Placeholder id and string index, after the found placeholder.
            When no well-formed placeholder begins at `index`, returns
            `(None, index + 1)`.

        """
        # Anchor the match at `index`. An unanchored `search` would skip a
        # malformed prefix at `index` and latch onto a later placeholder,
        # which made the caller silently drop the text between the two.
        m = self.__placeholder_re.match(data, index)
        if m:
            return m.group(1), m.end()
        return None, index + 1

    def __stashNode(self, node: etree.Element | str, type: str) -> str:
        """
        Add node to stash.

        Arguments:
            node: The element or string to store.
            type: Forwarded to `__makePlaceholder`.

        Returns:
            The placeholder text that now stands for `node`.

        """
        placeholder, placeholder_id = self.__makePlaceholder(type)
        self.stashed_nodes[placeholder_id] = node
        return placeholder

    def __handleInline(self, data: str, patternIndex: int = 0) -> str:
        """
        Process string with inline patterns and replace it with placeholders.

        Arguments:
            data: A line of Markdown text.
            patternIndex: The index of the `inlinePattern` to start with.

        Returns:
            String with placeholders. An `AtomicString` is returned untouched.

        """
        if not isinstance(data, util.AtomicString):
            startIndex = 0
            count = len(self.inlinePatterns)
            while patternIndex < count:
                data, matched, startIndex = self.__applyPattern(
                    self.inlinePatterns[patternIndex], data, patternIndex, startIndex
                )
                # Stay on the same pattern for as long as it keeps matching.
                if not matched:
                    patternIndex += 1
        return data

    def __processElementText(self, node: etree.Element, subnode: etree.Element, isText: bool = True) -> None:
        """
        Process placeholders in `Element.text` or `Element.tail`
        of Elements popped from `self.stashed_nodes`.

        Arguments:
            node: Parent node.
            subnode: Processing node.
            isText: Boolean variable, True - it's text, False - it's a tail.

        """
        # Detach the raw string first; `__processPlaceholders` rebuilds the
        # text/tail on `subnode` as it goes.
        if isText:
            text = subnode.text
            subnode.text = None
        else:
            text = subnode.tail
            subnode.tail = None

        childResult = self.__processPlaceholders(text, subnode, isText)

        # Elements recovered from a tail go right after `subnode` inside
        # `node`; everything else goes to the front of `node`.
        if not isText and node is not subnode:
            pos = list(node).index(subnode) + 1
        else:
            pos = 0

        # Inserting in reverse at a fixed position preserves document order.
        childResult.reverse()
        for newChild in childResult:
            node.insert(pos, newChild[0])

    def __processPlaceholders(
        self,
        data: str | None,
        parent: etree.Element,
        isText: bool = True
    ) -> list[tuple[etree.Element, list[str]]]:
        """
        Process string with placeholders and generate `ElementTree` tree.

        Arguments:
            data: String with placeholders instead of `ElementTree` elements.
            parent: Element, which contains processing inline data.
            isText: Boolean variable, True - it's text, False - it's a tail.

        Returns:
            List with `ElementTree` elements with applied inline patterns,
            each paired with a copy of the ancestor list in effect when it
            was recovered.

        """
        def linkText(text: str | None) -> None:
            # Attach plain text to the tail of the most recently recovered
            # element, or, when none has been recovered yet, to the tail or
            # text of `parent` depending on `isText`.
            if text:
                if result:
                    if result[-1][0].tail:
                        result[-1][0].tail += text
                    else:
                        result[-1][0].tail = text
                elif not isText:
                    if parent.tail:
                        parent.tail += text
                    else:
                        parent.tail = text
                else:
                    if parent.text:
                        parent.text += text
                    else:
                        parent.text = text

        result = []
        start_index = 0
        while data:
            index = data.find(self.__placeholder_prefix, start_index)
            if index != -1:
                placeholder_id, ph_end_index = self.__findPlaceholder(data, index)

                if placeholder_id in self.stashed_nodes:
                    node = self.stashed_nodes.get(placeholder_id)

                    if index > 0:
                        # Plain text that precedes the placeholder.
                        text = data[start_index:index]
                        linkText(text)

                    if not isinstance(node, str):  # it's Element
                        # The stashed element may itself carry placeholders
                        # in its own or its children's text and tails.
                        for child in [node] + list(node):
                            if child.tail:
                                if child.tail.strip():
                                    self.__processElementText(
                                        node, child, False
                                    )
                            if child.text:
                                if child.text.strip():
                                    self.__processElementText(child, child)
                    else:  # it's just a string
                        linkText(node)
                        start_index = ph_end_index
                        continue

                    start_index = ph_end_index
                    result.append((node, self.ancestors[:]))

                else:  # wrong placeholder
                    # Emit the prefix as literal text and resume after it.
                    end = index + len(self.__placeholder_prefix)
                    linkText(data[start_index:end])
                    start_index = end
            else:
                text = data[start_index:]
                if isinstance(data, util.AtomicString):
                    # We don't want to lose the `AtomicString`
                    text = util.AtomicString(text)
                linkText(text)
                data = ""

        return result

    def __applyPattern(
        self,
        pattern: inlinepatterns.Pattern,
        data: str,
        patternIndex: int,
        startIndex: int = 0
    ) -> tuple[str, bool, int]:
        """
        Check if the line fits the pattern, create the necessary
        elements, add it to `stashed_nodes`.

        Arguments:
            data: The text to be processed.
            pattern: The pattern to be checked.
            patternIndex: Index of current pattern.
            startIndex: String index, from which we start searching.

        Returns:
            A `(data, matched, index)` tuple: the text (with a placeholder
            substituted when a node was produced), whether the pattern
            matched, and the index from which the next search should start.

        """
        new_style = isinstance(pattern, inlinepatterns.InlineProcessor)

        for exclude in pattern.ANCESTOR_EXCLUDES:
            if exclude.lower() in self.ancestors:
                return data, False, 0

        if new_style:
            match = None
            # Since `handleMatch` may reject our first match,
            # we iterate over the buffer looking for matches
            # until we can't find any more.
            for match in pattern.getCompiledRegExp().finditer(data, startIndex):
                node, start, end = pattern.handleMatch(match, data)
                if start is None or end is None:
                    # Rejected; forget it so that an exhausted iterator
                    # leaves `match` as `None`.
                    match = None
                    continue
                break
        else:  # pragma: no cover
            match = pattern.getCompiledRegExp().match(data[startIndex:])
            leftData = data[:startIndex]

        if not match:
            return data, False, 0

        if not new_style:  # pragma: no cover
            node = pattern.handleMatch(match)
            start = match.start(0)
            end = match.end(0)

        if node is None:
            # Matched, but nothing to substitute: continue after the match.
            return data, True, end

        if not isinstance(node, str):
            if not isinstance(node.text, util.AtomicString):
                # We need to process current node too
                for child in [node] + list(node):
                    if child.text:
                        # Text nested inside `child` is only offered to the
                        # patterns that follow the current one.
                        self.ancestors.append(child.tag.lower())
                        child.text = self.__handleInline(
                            child.text, patternIndex + 1
                        )
                        self.ancestors.pop()
                    if child.tail:
                        child.tail = self.__handleInline(
                            child.tail, patternIndex
                        )

        placeholder = self.__stashNode(node, pattern.type())

        if new_style:
            return "{}{}{}".format(data[:start],
                                   placeholder, data[end:]), True, 0
        else:  # pragma: no cover
            return "{}{}{}{}".format(leftData,
                                     match.group(1),
                                     placeholder, match.groups()[-1]), True, 0

    def __build_ancestors(self, parent: etree.Element | None, parents: list[str]) -> None:
        """
        Build the ancestor list.

        Walks from `parent` up through `self.parent_map` and appends the
        lower-cased tag names to `parents`, outermost first.
        """
        ancestors = []
        while parent is not None:
            ancestors.append(parent.tag.lower())
            parent = self.parent_map.get(parent)
        ancestors.reverse()
        parents.extend(ancestors)

    def run(self, tree: etree.Element, ancestors: list[str] | None = None) -> etree.Element:
        """Apply inline patterns to a parsed Markdown tree.

        Iterate over `Element`, find elements with inline tag, apply inline
        patterns and append newly created Elements to tree. To avoid further
        processing of string with inline patterns, instead of normal string,
        use subclass [`AtomicString`][markdown.util.AtomicString]:

            node.text = markdown.util.AtomicString("This will not be processed.")

        Arguments:
            tree: `Element` object, representing Markdown tree.
            ancestors: List of parent tag names that precede the tree node (if needed).

        Returns:
            An element tree object with applied inline patterns.

        """
        self.stashed_nodes: dict[str, etree.Element | str] = {}

        # Ensure a valid parent list, but copy passed in lists
        # to ensure we don't have the user accidentally change it on us.
        tree_parents = [] if ancestors is None else ancestors[:]

        self.parent_map = {c: p for p in tree.iter() for c in p}
        stack = [(tree, tree_parents)]

        while stack:
            currElement, parents = stack.pop()

            self.ancestors = parents
            self.__build_ancestors(currElement, self.ancestors)

            # Elements recovered from child text are inserted only after the
            # loop over `currElement` has finished.
            insertQueue = []
            for child in currElement:
                if child.text and not isinstance(
                    child.text, util.AtomicString
                ):
                    self.ancestors.append(child.tag.lower())
                    text = child.text
                    child.text = None
                    lst = self.__processPlaceholders(
                        self.__handleInline(text), child
                    )
                    for item in lst:
                        self.parent_map[item[0]] = child
                    stack += lst
                    insertQueue.append((child, lst))
                    self.ancestors.pop()
                if child.tail:
                    tail = self.__handleInline(child.tail)
                    # Throwaway element that only collects the leading plain
                    # text of the tail.
                    dummy = etree.Element('d')
                    child.tail = None
                    tailResult = self.__processPlaceholders(tail, dummy, False)
                    if dummy.tail:
                        child.tail = dummy.tail
                    pos = list(currElement).index(child) + 1
                    # Reverse so that repeated inserts at `pos` keep order.
                    tailResult.reverse()
                    for newChild in tailResult:
                        self.parent_map[newChild[0]] = currElement
                        currElement.insert(pos, newChild[0])
                if len(child):
                    self.parent_map[child] = currElement
                    stack.append((child, self.ancestors[:]))

            for element, lst in insertQueue:
                for i, obj in enumerate(lst):
                    newChild = obj[0]
                    element.insert(i, newChild)
        return tree
413
414
class PrettifyTreeprocessor(Treeprocessor):
    """ Insert newlines into the tree so the serialized HTML is readable. """

    def _prettifyETree(self, elem: etree.Element) -> None:
        """
        Recursively add line breaks around block-level `ElementTree` nodes.

        For a block-level element other than `code` or `pre`, a newline is
        placed before a block-level first child when the element has no
        meaningful text, and block-level children are processed in turn.
        Any element reaching this method gets a newline tail when its tail
        is empty or whitespace only.
        """
        newline = "\n"
        is_block = self.md.is_block_level

        if is_block(elem.tag) and elem.tag not in ['code', 'pre']:
            text_is_blank = not elem.text or not elem.text.strip()
            if text_is_blank and len(elem) and is_block(elem[0].tag):
                elem.text = newline
            for child in elem:
                if is_block(child.tag):
                    self._prettifyETree(child)

        if not (elem.tail and elem.tail.strip()):
            elem.tail = newline

    def run(self, root: etree.Element) -> None:
        """ Add line breaks to `root` and its descendants, in place. """

        self._prettifyETree(root)

        # `<br />` elements usually sit inside inline content, which
        # `_prettifyETree` does not descend into, so handle them here.
        for br in root.iter('br'):
            if br.tail and br.tail.strip():
                br.tail = '\n%s' % br.tail
            else:
                br.tail = '\n'

        # Normalize trailing whitespace at the end of code blocks.
        for pre in root.iter('pre'):
            if not (len(pre) and pre[0].tag == 'code'):
                continue
            code = pre[0]
            # Only touch code elements that contain text and nothing else.
            if len(code) or code.text is None:
                continue
            code.text = util.AtomicString(code.text.rstrip() + '\n')
452
453
class UnescapeTreeprocessor(Treeprocessor):
    """ Turn escaped-character markers back into the characters they encode. """

    # A run of decimal digits wrapped in `util.STX` ... `util.ETX`.
    RE = re.compile(r'{}(\d+){}'.format(util.STX, util.ETX))

    def _unescape(self, m: re.Match[str]) -> str:
        """ Return the character whose code point is the captured number. """
        code_point = int(m.group(1))
        return chr(code_point)

    def unescape(self, text: str) -> str:
        """ Return `text` with every marker matched by `RE` replaced. """
        return self.RE.sub(self._unescape, text)

    def run(self, root: etree.Element) -> None:
        """ Visit every element under `root` and unescape its strings in place. """
        restore = self.unescape
        for elem in root.iter():
            # Text content; the text of `code` elements is left untouched.
            if elem.text and elem.tag != 'code':
                elem.text = restore(elem.text)
            # Tail content.
            if elem.tail:
                elem.tail = restore(elem.tail)
            # Attribute values.
            for key, value in elem.items():
                elem.set(key, restore(value))