1# Python Markdown
2
3# A Python implementation of John Gruber's Markdown.
4
5# Documentation: https://python-markdown.github.io/
6# GitHub: https://github.com/Python-Markdown/markdown/
7# PyPI: https://pypi.org/project/Markdown/
8
9# Started by Manfred Stienstra (http://www.dwerg.net/).
10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
11# Currently maintained by Waylan Limberg (https://github.com/waylan),
12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
13
14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later)
15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
16# Copyright 2004 Manfred Stienstra (the original version)
17
18# License: BSD (see LICENSE.md for details).
19
20"""
21In version 3.0, a new, more flexible inline processor was added, [`markdown.inlinepatterns.InlineProcessor`][]. The
22original inline patterns, which inherit from [`markdown.inlinepatterns.Pattern`][] or one of its children are still
23supported, though users are encouraged to migrate.
24
25The new `InlineProcessor` provides two major enhancements to `Patterns`:
26
1. Inline Processors no longer need to match the entire block, so regular expressions no longer need to start with
    `r'^(.*?)'` and end with `r'(.*)$'`. This runs faster. The returned [`Match`][re.Match] object will only contain
    what is explicitly matched in the pattern, and extension pattern groups now start with `m.group(1)`.
30
312. The `handleMatch` method now takes an additional input called `data`, which is the entire block under analysis,
32 not just what is matched with the specified pattern. The method now returns the element *and* the indexes relative
33 to `data` that the return element is replacing (usually `m.start(0)` and `m.end(0)`). If the boundaries are
34 returned as `None`, it is assumed that the match did not take place, and nothing will be altered in `data`.
35
36 This allows handling of more complex constructs than regular expressions can handle, e.g., matching nested
37 brackets, and explicit control of the span "consumed" by the processor.
38
39"""
40
41from __future__ import annotations
42
43from . import util
44from typing import TYPE_CHECKING, Any, Collection, NamedTuple
45import re
46import xml.etree.ElementTree as etree
47from html import entities
48
49if TYPE_CHECKING: # pragma: no cover
50 from markdown import Markdown
51
52
def build_inlinepatterns(md: Markdown, **kwargs: Any) -> util.Registry[InlineProcessor]:
    """
    Build the default set of inline patterns for Markdown.

    The order in which processors and/or patterns are applied is very important - e.g. replacing `http://.../`
    links with `<a>` tags and only _then_ replacing inline HTML would leave a mess behind. The defaults are
    therefore registered with priorities that give the following order:

    * backticks and escaped characters come before everything else, so that they can preempt any markdown
      pattern by escaping it;

    * next come the various types of links (auto-links must be handled before inline HTML);

    * then inline HTML, which at this point is simply replaced with a placeholder while the actual HTML is
      added to a stash;

    * finally strong, emphasis, etc.

    Arguments:
        md: The `Markdown` instance passed to every processor that takes one.
        **kwargs: Accepted but not used here.

    Returns: A registry containing the default inline processors.

    """
    inlinePatterns = util.Registry()
    # (processor, registered name, priority) - listed in the order they are registered.
    defaults = (
        (BacktickInlineProcessor(BACKTICK_RE), 'backtick', 190),
        (EscapeInlineProcessor(ESCAPE_RE, md), 'escape', 180),
        (ReferenceInlineProcessor(REFERENCE_RE, md), 'reference', 170),
        (LinkInlineProcessor(LINK_RE, md), 'link', 160),
        (ImageInlineProcessor(IMAGE_LINK_RE, md), 'image_link', 150),
        (ImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'image_reference', 140),
        (ShortReferenceInlineProcessor(REFERENCE_RE, md), 'short_reference', 130),
        (ShortImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'short_image_ref', 125),
        (AutolinkInlineProcessor(AUTOLINK_RE, md), 'autolink', 120),
        (AutomailInlineProcessor(AUTOMAIL_RE, md), 'automail', 110),
        (SubstituteTagInlineProcessor(LINE_BREAK_RE, 'br'), 'linebreak', 100),
        (HtmlInlineProcessor(HTML_RE, md), 'html', 90),
        (HtmlInlineProcessor(ENTITY_RE, md), 'entity', 80),
        (SimpleTextInlineProcessor(NOT_STRONG_RE), 'not_strong', 70),
        (AsteriskProcessor(r'\*'), 'em_strong', 60),
        (UnderscoreProcessor(r'_'), 'em_strong2', 50),
    )
    for processor, name, priority in defaults:
        inlinePatterns.register(processor, name, priority)
    return inlinePatterns
96
97
98# The actual regular expressions for patterns
99# -----------------------------------------------------------------------------
100
NOIMG = r'(?<!\!)'
""" Match not an image. Partial regular expression which matches if not preceded by `!`. """

BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\2(?!`))'
""" Match backtick quoted string (`` `e=f()` `` or ``` ``e=f("`")`` ```). """

ESCAPE_RE = r'\\(.)'
""" Match a backslash escaped character (`\\<` or `\\*`). """

EMPHASIS_RE = r'(\*)([^\*]+)\1'
""" Match emphasis with an asterisk (`*emphasis*`). """

STRONG_RE = r'(\*{2})(.+?)\1'
""" Match strong with an asterisk (`**strong**`). """

SMART_STRONG_RE = r'(?<!\w)(_{2})(?!_)(.+?)(?<!_)\1(?!\w)'
""" Match strong with underscore while ignoring middle word underscores (`__smart__strong__`). """

SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\1(?!\w)'
""" Match emphasis with underscore while ignoring middle word underscores (`_smart_emphasis_`). """

SMART_STRONG_EM_RE = r'(?<!\w)(\_)\1(?!\1)(.+?)(?<!\w)\1(?!\1)(.+?)\1{3}(?!\w)'
""" Match strong emphasis with underscores (`__strong _em___`). """

EM_STRONG_RE = r'(\*)\1{2}(.+?)\1(.*?)\1{2}'
""" Match emphasis strong with asterisk (`***strongem***` or `***em*strong**`). """

EM_STRONG2_RE = r'(_)\1{2}(.+?)\1(.*?)\1{2}'
""" Match emphasis strong with underscores (`___emstrong___` or `___em_strong__`). """

STRONG_EM_RE = r'(\*)\1{2}(.+?)\1{2}(.*?)\1'
""" Match strong emphasis with asterisk (`***strong**em*`). """

STRONG_EM2_RE = r'(_)\1{2}(.+?)\1{2}(.*?)\1'
""" Match strong emphasis with underscores (`___strong__em_`). """

STRONG_EM3_RE = r'(\*)\1(?!\1)([^*]+?)\1(?!\1)(.+?)\1{3}'
""" Match strong emphasis with asterisk (`**strong*em***`). """

LINK_RE = NOIMG + r'\['
""" Match start of in-line link (`[text](url)` or `[text](<url>)` or `[text](url "title")`). """

IMAGE_LINK_RE = r'\!\['
""" Match start of in-line image link (`![alt text](url)` or `![alt text](<url>)`). """

REFERENCE_RE = LINK_RE
""" Match start of reference link (`[Label][3]`). """

IMAGE_REFERENCE_RE = IMAGE_LINK_RE
""" Match start of image reference (`![alt text][2]`). """

NOT_STRONG_RE = r'((^|(?<=\s))(\*{1,3}|_{1,3})(?=\s|$))'
""" Match a stand-alone `*` or `_`. """

AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^<>]*)>'
""" Match an automatic link (`<http://www.example.com>`). """

AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>'
""" Match an automatic email link (`<me@example.com>`). """

HTML_RE = (
    r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|'               # Tag
    r'!--(?:(?!<!--|-->).)*--|'                         # Comment
    r'[?](?:(?!<[?]|[?]>).)*[?]|'                       # Processing instruction
    r'!\[CDATA\[(?:(?!<!\[CDATA\[|\]\]>).)*\]\]'        # `CDATA`
    ')>)'
)
""" Match an HTML tag (`<...>`). """

ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)'
""" Match an HTML entity (`&#38;` (decimal) or `&#x26;` (hex) or `&amp;` (named)). """

# A hard line break needs TWO trailing spaces before the newline; a single trailing space must not match.
LINE_BREAK_RE = r'  \n'
""" Match two spaces at end of line. """
175
176
def dequote(string: str) -> str:
    """Strip one pair of matching quotes (both `"` or both `'`) from around a string.

    A string which does not start and end with the same quote character is returned unchanged.
    """
    for mark in ('"', "'"):
        if string.startswith(mark) and string.endswith(mark):
            return string[1:-1]
    return string
184
185
class EmStrongItem(NamedTuple):
    """Emphasis/strong pattern item: a compiled pattern together with how to build an element from its match."""
    # Compiled regular expression that is matched against the text.
    pattern: re.Pattern[str]
    # Name of the builder used for a match; this module uses `'single'`, `'double'` and `'double2'`.
    builder: str
    # Tag name for the resulting element, or two comma separated tag names for the double builders.
    tags: str
191
192
193# The pattern classes
194# -----------------------------------------------------------------------------
195
196
class Pattern:  # pragma: no cover
    """
    Base class that inline patterns subclass.

    Inline patterns are handled by means of `Pattern` subclasses, one per regular expression.
    Each pattern object uses a single regular expression and must support the following methods:
    [`getCompiledRegExp`][markdown.inlinepatterns.Pattern.getCompiledRegExp] and
    [`handleMatch`][markdown.inlinepatterns.Pattern.handleMatch].

    All the regular expressions used by `Pattern` subclasses must capture the whole block. For this
    reason, they all start with `^(.*?)` and end with `(.*)$`. When passing a regular expression on
    class initialization, the `^(.*?)` and `(.*)$` are added automatically and the regular expression
    is pre-compiled. As a consequence, the first group of the regular expression passed in is
    `group(2)` of the resulting match.

    It is strongly suggested that the newer style [`markdown.inlinepatterns.InlineProcessor`][] that
    use a more efficient and flexible search approach be used instead. However, the older style
    `Pattern` remains for backward compatibility with many existing third-party extensions.

    """

    ANCESTOR_EXCLUDES: Collection[str] = tuple()
    """
    A collection of elements which are undesirable ancestors. The processor will be skipped if it
    would cause the content to be a descendant of one of the listed tag names.
    """

    compiled_re: re.Pattern[str]
    md: Markdown | None

    def __init__(self, pattern: str, md: Markdown | None = None):
        """
        Create an instance of an inline pattern.

        Arguments:
            pattern: A regular expression that matches a pattern.
            md: An optional pointer to the instance of `markdown.Markdown` and is available as
                `self.md` on the class instance.

        """
        self.pattern = pattern
        # Wrap the pattern so that the whole block is captured: group 1 holds the text before
        # the match and the last group holds the text after it.
        self.compiled_re = re.compile(r"^(.*?)%s(.*)$" % pattern,
                                      re.DOTALL | re.UNICODE)

        self.md = md

    def getCompiledRegExp(self) -> re.Pattern:
        """ Return a compiled regular expression. """
        return self.compiled_re

    def handleMatch(self, m: re.Match[str]) -> etree.Element | str:
        """Return an ElementTree element from the given match.

        Subclasses should override this method. This base implementation does nothing and
        returns `None`.

        Arguments:
            m: A match object containing a match of the pattern.

        Returns: An ElementTree Element object.

        """
        pass  # pragma: no cover

    def type(self) -> str:
        """ Return class name, to define pattern type """
        return self.__class__.__name__

    def unescape(self, text: str) -> str:
        """
        Return unescaped text given text with an inline placeholder.

        Each placeholder found in `text` is looked up in the stash of the `inline` tree processor. A
        stashed string is substituted as is; a stashed element is substituted by its text content only.

        Arguments:
            text: Text which may contain inline placeholders.

        Returns: The text with placeholders substituted. The text is returned unchanged when no
            `Markdown` instance was provided or no `inline` tree processor is registered.

        """
        if self.md is None:
            # `md` is optional; without it there is no stash to consult.
            return text
        try:
            stash = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m):
            id = m.group(1)
            if id in stash:
                value = stash.get(id)
                if isinstance(value, str):
                    return value
                else:
                    # An `etree` Element - return text content only
                    return ''.join(value.itertext())
            # NOTE(review): an id missing from the stash yields `None` here, as it always has.
        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
281
282
class InlineProcessor(Pattern):
    """
    Base class that inline processors subclass.

    This is the newer style inline processor. Unlike the legacy `Pattern`, the regular expression
    is compiled exactly as given rather than being wrapped to capture the whole block.

    """

    def __init__(self, pattern: str, md: Markdown | None = None):
        """
        Create an instance of an inline processor.

        Arguments:
            pattern: A regular expression that matches a pattern.
            md: An optional pointer to the instance of `markdown.Markdown` and is available as
                `self.md` on the class instance.

        """
        flags = re.DOTALL | re.UNICODE
        self.pattern = pattern
        self.compiled_re = re.compile(pattern, flags)
        self.md = md

        # API for Markdown to pass `safe_mode` into instance
        self.safe_mode = False

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str | None, int | None, int | None]:
        """Return an ElementTree element from the given match together with the
        start and end index of the text it replaces.

        Returning `None` for `start` and/or `end` signals that the processor did
        not find a valid region of text.

        Subclasses should override this method; this base implementation returns `None`.

        Arguments:
            m: A re match object containing a match of the pattern.
            data: The buffer currently under analysis.

        Returns:
            el: The ElementTree element, text or None.
            start: The start of the region that has been matched or None.
            end: The end of the region that has been matched or None.

        """
        return None  # pragma: no cover
329
330
class SimpleTextPattern(Pattern):  # pragma: no cover
    """ Legacy pattern which yields the text captured by `group(2)` of a match. """
    def handleMatch(self, m: re.Match[str]) -> str:
        """ Hand back, as a string, whatever `group(2)` of the match captured. """
        captured = m.group(2)
        return captured
336
337
class SimpleTextInlineProcessor(InlineProcessor):
    """ Inline processor which yields the text captured by `group(1)` of a match. """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]:
        """ Hand back the text of `group(1)` along with the span of the whole match. """
        start, end = m.span(0)
        return m.group(1), start, end
343
344
class EscapeInlineProcessor(InlineProcessor):
    """ Return an escaped character. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str | None, int, int]:
        """
        Handle a backslash followed by a character (captured as `group(1)`).

        When the character is one of [`ESCAPED_CHARS`][markdown.Markdown.ESCAPED_CHARS], the result is
        the character's Unicode code point (as returned by [`ord`][]) wrapped in
        [`util.STX`][markdown.util.STX] and [`util.ETX`][markdown.util.ETX].

        For any other character the result is `None`. In both cases the span of the whole match is
        returned as well.
        """
        start, end = m.span(0)
        char = m.group(1)
        if char not in self.md.ESCAPED_CHARS:
            return None, start, end
        return f'{util.STX}{ord(char)}{util.ETX}', start, end
362
363
class SimpleTagPattern(Pattern):  # pragma: no cover
    """
    Legacy pattern producing an element of type `tag` whose text is `group(3)`
    of the match.

    """
    def __init__(self, pattern: str, tag: str):
        """
        Create an instance of a simple tag pattern.

        Arguments:
            pattern: A regular expression that matches a pattern.
            tag: Tag of element.

        """
        Pattern.__init__(self, pattern)
        self.tag = tag
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """
        Build an [`Element`][xml.etree.ElementTree.Element] of type `tag` and use the string
        captured by `group(3)` of the match as its text.
        """
        element = etree.Element(self.tag)
        element.text = m.group(3)
        return element
391
392
class SimpleTagInlineProcessor(InlineProcessor):
    """
    Inline processor producing an element of type `tag` whose text is `group(2)`
    of the match.

    """
    def __init__(self, pattern: str, tag: str):
        """
        Create an instance of a simple tag processor.

        Arguments:
            pattern: A regular expression that matches a pattern.
            tag: Tag of element.

        """
        InlineProcessor.__init__(self, pattern)
        self.tag = tag
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:  # pragma: no cover
        """
        Build an [`Element`][xml.etree.ElementTree.Element] of type `tag` with the string captured by
        `group(2)` of the match as its text, and return it with the span of the whole match.
        """
        start, end = m.span(0)
        element = etree.Element(self.tag)
        element.text = m.group(2)
        return element, start, end
420
421
class SubstituteTagPattern(SimpleTagPattern):  # pragma: no cover
    """ Legacy pattern which replaces a match with a childless element of type `tag`. """
    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """ Build an empty [`Element`][xml.etree.ElementTree.Element] of type `tag`; the match content is unused. """
        empty = etree.Element(self.tag)
        return empty
427
428
class SubstituteTagInlineProcessor(SimpleTagInlineProcessor):
    """ Inline processor which replaces a match with a childless element of type `tag`. """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """ Build an empty [`Element`][xml.etree.ElementTree.Element] of type `tag` spanning the whole match. """
        start, end = m.span(0)
        return etree.Element(self.tag), start, end
434
435
class BacktickInlineProcessor(InlineProcessor):
    """ Return a `<code>` element containing the escaped matching text. """
    def __init__(self, pattern: str):
        """
        Create an instance of the backtick processor.

        Arguments:
            pattern: A regular expression that matches a pattern.

        """
        InlineProcessor.__init__(self, pattern)
        # Placeholder for a backslash: its code point wrapped in `util.STX` and `util.ETX`.
        self.ESCAPED_BSLASH = ''.join((util.STX, str(ord('\\')), util.ETX))
        self.tag = 'code'
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str, int, int]:
        """
        When `group(3)` of the match is non-empty, build a `code`
        [`Element`][xml.etree.ElementTree.Element] whose text is the stripped content of `group(3)`, escaped
        with [`code_escape`][markdown.util.code_escape] and wrapped in an
        [`AtomicString`][markdown.util.AtomicString].

        Otherwise, return the text of `group(1)` with every pair of backslashes replaced by the escaped
        backslash placeholder.

        In both cases the span of the whole match is returned as well.

        """
        start, end = m.span(0)
        code = m.group(3)
        if not code:
            return m.group(1).replace('\\\\', self.ESCAPED_BSLASH), start, end

        el = etree.Element(self.tag)
        el.text = util.AtomicString(util.code_escape(code.strip()))
        return el, start, end
459
460
class DoubleTagPattern(SimpleTagPattern):  # pragma: no cover
    """Legacy pattern producing one element nested inside another.

    `tag` holds both tag names separated by a comma: the outer tag first, then the inner tag.
    Useful for strong emphasis etc.

    """
    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """
        Build an [`Element`][xml.etree.ElementTree.Element] in the following format:
        `<tag1><tag2>group(3)</tag2>group(4)</tag1>` where `group(4)` is only used when the match
        has exactly five groups.

        """
        outer_tag, inner_tag = self.tag.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.SubElement(outer, inner_tag)
        inner.text = m.group(3)
        has_trailing_text = len(m.groups()) == 5
        if has_trailing_text:
            inner.tail = m.group(4)
        return outer
480
481
class DoubleTagInlineProcessor(SimpleTagInlineProcessor):
    """Inline processor producing one element nested inside another.

    `tag` holds both tag names separated by a comma: the outer tag first, then the inner tag.
    Useful for strong emphasis etc.

    """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:  # pragma: no cover
        """
        Build an [`Element`][xml.etree.ElementTree.Element] in the following format:
        `<tag1><tag2>group(2)</tag2>group(3)</tag1>` where `group(3)` is only used when the match
        has exactly three groups. The span of the whole match is returned with the element.

        """
        start, end = m.span(0)
        outer_tag, inner_tag = self.tag.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.SubElement(outer, inner_tag)
        inner.text = m.group(2)
        has_trailing_text = len(m.groups()) == 3
        if has_trailing_text:
            inner.tail = m.group(3)
        return outer, start, end
501
502
class HtmlInlineProcessor(InlineProcessor):
    """ Store raw inline html and return a placeholder. """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]:
        """ Stash the text of `group(1)` of the match and return the placeholder that stands in for it. """
        raw = self.unescape(m.group(1))
        raw = self.backslash_unescape(raw)
        return self.md.htmlStash.store(raw), m.start(0), m.end(0)

    def unescape(self, text: str) -> str:
        """
        Return unescaped text given text with an inline placeholder.

        Every placeholder with a stashed value is replaced by that value serialized with `self.md.serializer`
        and then unescaped again. The text is returned unchanged when no `inline` tree processor is registered.
        """
        try:
            stashed = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m: re.Match[str]) -> str | None:
            node = stashed.get(m.group(1))
            if node is None:
                return None
            try:
                # Ensure we don't have a placeholder inside a placeholder
                return self.unescape(self.md.serializer(node))
            except Exception:
                return r'\%s' % node

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

    def backslash_unescape(self, text: str) -> str:
        """
        Return text with backslash escapes undone (backslashes are restored).

        Uses the `RE` of the `unescape` tree processor and replaces each match with the character whose
        code point is captured in `group(1)`. The text is returned unchanged when that tree processor is
        not registered.
        """
        try:
            escaped_re = self.md.treeprocessors['unescape'].RE
        except KeyError:  # pragma: no cover
            return text

        return escaped_re.sub(lambda found: chr(int(found.group(1))), text)
541
542
class AsteriskProcessor(InlineProcessor):
    """Emphasis processor for handling strong and em matches inside asterisks."""

    PATTERNS = [
        EmStrongItem(re.compile(regex, re.DOTALL | re.UNICODE), builder, tags)
        for regex, builder, tags in (
            (EM_STRONG_RE, 'double', 'strong,em'),
            (STRONG_EM_RE, 'double', 'em,strong'),
            (STRONG_EM3_RE, 'double2', 'strong,em'),
            (STRONG_RE, 'single', 'strong'),
            (EMPHASIS_RE, 'single', 'em'),
        )
    ]
    """ The various strong and emphasis patterns handled by this processor. """

    def build_single(self, m: re.Match[str], tag: str, idx: int) -> etree.Element:
        """Return a single `tag` element whose content is `group(2)` parsed for sub patterns."""
        element = etree.Element(tag)
        self.parse_sub_patterns(m.group(2), element, None, idx)
        return element

    def build_double(self, m: re.Match[str], tags: str, idx: int) -> etree.Element:
        """
        Return double tag.

        `tags` is `outer,inner`. `group(2)` becomes the content of the inner element; when the match has
        exactly three groups, `group(3)` is added to the outer element after the inner one.
        """
        outer_tag, inner_tag = tags.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.Element(inner_tag)
        self.parse_sub_patterns(m.group(2), inner, None, idx)
        outer.append(inner)
        if len(m.groups()) == 3:
            self.parse_sub_patterns(m.group(3), outer, inner, idx)
        return outer

    def build_double2(self, m: re.Match[str], tags: str, idx: int) -> etree.Element:
        """
        Return double tags (variant 2): `<strong>text <em>text</em></strong>`.

        `tags` is `outer,inner`. `group(2)` becomes the leading content of the outer element and `group(3)`
        the content of the inner element, which is appended to the outer one.
        """
        outer_tag, inner_tag = tags.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.Element(inner_tag)
        self.parse_sub_patterns(m.group(2), outer, None, idx)
        outer.append(inner)
        self.parse_sub_patterns(m.group(3), inner, None, idx)
        return outer

    def parse_sub_patterns(
        self, data: str, parent: etree.Element, last: etree.Element | None, idx: int
    ) -> None:
        """
        Parses sub patterns.

        `data`: text to evaluate.

        `parent`: Parent to attach text and sub elements to.

        `last`: Last appended child to parent. Can also be None if parent has no children.

        `idx`: Current pattern index that was used to evaluate the parent.
        """

        def attach_text(text: str, previous: etree.Element | None) -> None:
            # Text goes on the tail of the last child if there is one,
            # otherwise it becomes the parent's own text. Empty text is ignored.
            if not text:
                return
            if previous is not None:
                previous.tail = text
            else:
                parent.text = text

        offset = 0
        pos = 0
        length = len(data)

        while pos < length:
            # Skip ahead until we find the start of potential emphasis or strong tokens
            if not self.compiled_re.match(data, pos):
                pos += 1
                continue

            matched = False
            # See if we can match an emphasis/strong pattern
            for index, item in enumerate(self.PATTERNS):
                # Only evaluate patterns that are after what was used on the parent
                if index <= idx:
                    continue
                m = item.pattern.match(data, pos)
                if not m:
                    continue
                # Flush the text preceding the match, then append the new child.
                attach_text(data[offset:m.start(0)], last)
                el = self.build_element(m, item.builder, item.tags, index)
                parent.append(el)
                last = el
                # Move our position past the matched hunk
                offset = pos = m.end(0)
                matched = True

            if not matched:
                # We matched nothing, move on to the next character
                pos += 1

        # Append any leftover text as a text node.
        attach_text(data[offset:], last)

    def build_element(self, m: re.Match[str], builder: str, tags: str, index: int) -> etree.Element:
        """Element builder: dispatch on `builder`, treating anything but `double` and `double2` as single."""

        if builder == 'double':
            return self.build_double(m, tags, index)
        if builder == 'double2':
            return self.build_double2(m, tags, index)
        return self.build_single(m, tags, index)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Parse patterns.

        Each of `PATTERNS` is tried in order at the start of the match `m`; the first one that matches
        builds the element. Returns `(None, None, None)` when none of them match.
        """

        for index, item in enumerate(self.PATTERNS):
            found = item.pattern.match(data, m.start(0))
            if found:
                el = self.build_element(found, item.builder, item.tags, index)
                return el, found.start(0), found.end(0)
        return None, None, None
675
676
class UnderscoreProcessor(AsteriskProcessor):
    """Emphasis processor for handling strong and em matches inside underscores."""

    PATTERNS = [
        EmStrongItem(re.compile(regex, re.DOTALL | re.UNICODE), builder, tags)
        for regex, builder, tags in (
            (EM_STRONG2_RE, 'double', 'strong,em'),
            (STRONG_EM2_RE, 'double', 'em,strong'),
            (SMART_STRONG_EM_RE, 'double2', 'strong,em'),
            (SMART_STRONG_RE, 'single', 'strong'),
            (SMART_EMPHASIS_RE, 'single', 'em'),
        )
    ]
    """ The various strong and emphasis patterns handled by this processor. """
688
689
class LinkInlineProcessor(InlineProcessor):
    """ Return a link element from the given match. """
    # Matches the opening `(` of the link destination and any whitespace after it, optionally followed by a
    # complete `<url>` destination (group 1), an optional quoted title (group 2) and the closing `)`.
    RE_LINK = re.compile(r'''\(\s*(?:(<[^<>]*>)\s*(?:('[^']*'|"[^"]*")\s*)?\))?''', re.DOTALL | re.UNICODE)
    # A single whitespace character; each one found in a title is replaced with a plain space.
    RE_TITLE_CLEAN = re.compile(r'\s')

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Return an `a` [`Element`][xml.etree.ElementTree.Element] or `(None, None, None)`.

        Arguments:
            m: A match object whose end marks where the link text begins.
            data: The buffer currently under analysis.

        Returns:
            el: The `a` element, or `None` if the text or the link could not be parsed.
            start: The start of the match, or `None`.
            end: The index just past the parsed link, or `None`.

        """
        # The link text starts right after the match.
        text, index, handled = self.getText(data, m.end(0))

        if not handled:
            return None, None, None

        # `index` now points just past the closing `]`.
        href, title, index, handled = self.getLink(data, index)
        if not handled:
            return None, None, None

        el = etree.Element("a")
        el.text = text

        el.set("href", href)

        if title is not None:
            el.set("title", title)

        return el, m.start(0), index

    def getLink(self, data: str, index: int) -> tuple[str, str | None, int, bool]:
        """
        Parse data between `()` of `[Text]()` allowing recursive `()`.

        Arguments:
            data: The buffer currently under analysis.
            index: Position in `data` at which the opening `(` is expected.

        Returns:
            href: The link destination (an empty string if nothing was parsed).
            title: The link title, or `None` if there is none.
            index: The position reached in `data`; unchanged if `RE_LINK` did not match at all.
            handled: Whether a complete link was parsed.

        """

        href = ''
        title: str | None = None
        handled = False

        m = self.RE_LINK.match(data, pos=index)
        if m and m.group(1):
            # Matches [Text](<link> "title")
            # Drop the surrounding angle brackets from the destination and the quotes from the title.
            href = m.group(1)[1:-1].strip()
            if m.group(2):
                title = m.group(2)[1:-1]
            index = m.end(0)
            handled = True
        elif m:
            # Only the opening `(` (and whitespace) matched: scan the rest character by character.
            # Track bracket nesting and index in string
            bracket_count = 1
            backtrack_count = 1
            start_index = m.end()
            index = start_index
            last_bracket = -1

            # Primary (first found) quote tracking.
            quote: str | None = None
            start_quote = -1
            exit_quote = -1
            ignore_matches = False

            # Secondary (second found) quote tracking.
            alt_quote = None
            start_alt_quote = -1
            exit_alt_quote = -1

            # Track last character
            last = ''

            for pos in range(index, len(data)):
                c = data[pos]
                if c == '(':
                    # Count nested (
                    # Don't increment the bracket count if we are sure we're in a title.
                    if not ignore_matches:
                        bracket_count += 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                elif c == ')':
                    # Match nested ) to (
                    # Don't decrement if we are sure we are in a title that is unclosed.
                    if ((exit_quote != -1 and quote == last) or (exit_alt_quote != -1 and alt_quote == last)):
                        bracket_count = 0
                    elif not ignore_matches:
                        bracket_count -= 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                        # We've found our backup end location if the title doesn't resolve.
                        if backtrack_count == 0:
                            last_bracket = index + 1

                elif c in ("'", '"'):
                    # Quote has started
                    if not quote:
                        # We'll assume we are now in a title.
                        # Brackets are quoted, so no need to match them (except for the final one).
                        ignore_matches = True
                        backtrack_count = bracket_count
                        bracket_count = 1
                        start_quote = index + 1
                        quote = c
                    # Secondary quote (in case the first doesn't resolve): [text](link'"title")
                    elif c != quote and not alt_quote:
                        start_alt_quote = index + 1
                        alt_quote = c
                    # Update primary quote match
                    elif c == quote:
                        exit_quote = index + 1
                    # Update secondary quote match
                    elif alt_quote and c == alt_quote:
                        exit_alt_quote = index + 1

                # `index` now points just past the character being examined.
                index += 1

                # Link is closed, so let's break out of the loop
                if bracket_count == 0:
                    # Get the title if we closed a title string right before link closed
                    if exit_quote >= 0 and quote == last:
                        href = data[start_index:start_quote - 1]
                        title = ''.join(data[start_quote:exit_quote - 1])
                    elif exit_alt_quote >= 0 and alt_quote == last:
                        href = data[start_index:start_alt_quote - 1]
                        title = ''.join(data[start_alt_quote:exit_alt_quote - 1])
                    else:
                        href = data[start_index:index - 1]
                    break

                # Remember the last character seen, ignoring spaces.
                if c != ' ':
                    last = c

            # We have a scenario: `[test](link"notitle)`
            # When we enter a string, we stop tracking bracket resolution in the main counter,
            # but we do keep a backup counter up until we discover where we might resolve all brackets
            # if the title string fails to resolve.
            if bracket_count != 0 and backtrack_count == 0:
                href = data[start_index:last_bracket - 1]
                index = last_bracket
                bracket_count = 0

            handled = bracket_count == 0

        if title is not None:
            # Unescape, strip any surrounding quotes and replace each whitespace character with a space.
            title = self.RE_TITLE_CLEAN.sub(' ', dequote(self.unescape(title.strip())))

        href = self.unescape(href).strip()

        return href, title, index, handled

    def getText(self, data: str, index: int) -> tuple[str, int, bool]:
        """Parse the content between `[]` of the start of an image or link
        resolving nested square brackets.

        Arguments:
            data: The buffer currently under analysis.
            index: Position in `data` just after the opening `[`.

        Returns:
            text: The characters collected before the closing `]` (or up to the end of `data`).
            index: The position just past the last character examined.
            handled: Whether the closing `]` was found.

        """
        # The opening `[` has already been consumed by the caller.
        bracket_count = 1
        text = []
        for pos in range(index, len(data)):
            c = data[pos]
            if c == ']':
                bracket_count -= 1
            elif c == '[':
                bracket_count += 1
            index += 1
            if bracket_count == 0:
                # The closing bracket itself is not part of the text.
                break
            text.append(c)
        return ''.join(text), index, bracket_count == 0
850
851
class ImageInlineProcessor(LinkInlineProcessor):
    """ Return a `img` element from the given match. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Return an `img` [`Element`][xml.etree.ElementTree.Element] or `(None, None, None)`.

        The bracketed text becomes the (unescaped) `alt` attribute and the parsed link the `src` attribute;
        a `title` attribute is only set when the link carries a title.
        """
        no_match = (None, None, None)

        alt, index, handled = self.getText(data, m.end(0))
        if not handled:
            return no_match

        src, title, index, handled = self.getLink(data, index)
        if not handled:
            return no_match

        image = etree.Element("img")
        image.set("src", src)
        if title is not None:
            image.set("title", title)
        image.set('alt', self.unescape(alt))

        return image, m.start(0), index
874
875
class ReferenceInlineProcessor(LinkInlineProcessor):
    """ Match to a stored reference and return link element. """
    NEWLINE_CLEANUP_RE = re.compile(r'\s+', re.MULTILINE)

    RE_LINK = re.compile(r'\s?\[([^\]]*)\]', re.DOTALL | re.UNICODE)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Return the [`Element`][xml.etree.ElementTree.Element] built by the `makeTag` method, or
        `(None, None, None)` when the text or the id cannot be parsed.

        When the id is not one of the references stored on `self.md`, `None` is returned in place of the
        element, together with the start of the match and the end of the id.

        """
        text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return None, None, None

        ref_id, end, handled = self.evalId(data, index, text)
        if not handled:
            return None, None, None

        # Clean up line breaks in id
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)

        start = m.start(0)
        if ref_id not in self.md.references:  # ignore undefined refs
            return None, start, end

        href, title = self.md.references[ref_id]
        return self.makeTag(href, title, text), start, end

    def evalId(self, data: str, index: int, text: str) -> tuple[str | None, int, bool]:
        """
        Evaluate the id portion of `[ref][id]`.

        If `[ref][]` use `[ref]`. The id is lower-cased. Returns `(None, index, False)` when no
        id follows at `index`.
        """
        m = self.RE_LINK.match(data, pos=index)
        if not m:
            return None, index, False

        # An empty id falls back on the link text.
        ref_id = m.group(1).lower() or text.lower()
        return ref_id, m.end(0), True

    def makeTag(self, href: str, title: str, text: str) -> etree.Element:
        """ Return an `a` [`Element`][xml.etree.ElementTree.Element] with `text` as its content. """
        link = etree.Element('a')
        link.set('href', href)
        if title:
            link.set('title', title)
        link.text = text
        return link
930
931
class ShortReferenceInlineProcessor(ReferenceInlineProcessor):
    """Short form of reference: `[google]`. """
    def evalId(self, data: str, index: int, text: str) -> tuple[str, int, bool]:
        """Use the lower-cased text of `[ref]` as the id; `index` is returned unchanged. """
        ref_id = text.lower()
        return ref_id, index, True
938
939
class ImageReferenceInlineProcessor(ReferenceInlineProcessor):
    """ Match to a stored reference and return `img` element. """
    def makeTag(self, href: str, title: str, text: str) -> etree.Element:
        """
        Return an `img` [`Element`][xml.etree.ElementTree.Element] with `href` as `src` and the unescaped
        `text` as `alt`. A `title` attribute is only set for a non-empty title.
        """
        image = etree.Element("img")
        image.set("src", href)
        if title:
            image.set("title", title)
        alt = self.unescape(text)
        image.set("alt", alt)
        return image
950
951
class ShortImageReferenceInlineProcessor(ImageReferenceInlineProcessor):
    """ Short form of image reference: `![ref]`. """
    def evalId(self, data: str, index: int, text: str) -> tuple[str, int, bool]:
        """Use the lower-cased text of `![ref]` as the id; `index` is returned unchanged. """
        ref_id = text.lower()
        return ref_id, index, True
958
959
class AutolinkInlineProcessor(InlineProcessor):
    """ Return a link Element given an auto-link (`<http://example/com>`). """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """
        Return an `a` [`Element`][xml.etree.ElementTree.Element] for `group(1)`: the unescaped value is
        used as `href` and the raw value, as an `AtomicString`, as the link text.
        """
        url = m.group(1)
        start, end = m.span(0)

        link = etree.Element("a")
        link.set('href', self.unescape(url))
        link.text = util.AtomicString(url)
        return link, start, end
968
969
class AutomailInlineProcessor(InlineProcessor):
    """
    Return a `mailto` link Element given an auto-mail link (`<foo@example.com>`).
    """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """
        Return an [`Element`][xml.etree.ElementTree.Element] containing a `mailto` link of `group(1)`.

        Every character of the link text is written as a named entity where one is defined and as a
        numeric entity otherwise; every character of the `href` is written as a numeric entity. The
        entities start with `util.AMP_SUBSTITUTE` in place of the ampersand.
        """
        scheme = "mailto:"

        email = self.unescape(m.group(1))
        if email.startswith(scheme):
            email = email[len(scheme):]

        def codepoint2name(code: int) -> str:
            """Return entity definition by code, or the code if not defined."""
            entity = entities.codepoint2name.get(code)
            if not entity:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
            return "{}{};".format(util.AMP_SUBSTITUTE, entity)

        el = etree.Element('a')
        el.text = util.AtomicString(''.join(codepoint2name(ord(letter)) for letter in email))

        mailto = scheme + email
        el.set('href', "".join(util.AMP_SUBSTITUTE + '#%d;' % ord(letter) for letter in mailto))
        return el, m.start(0), m.end(0)