1# Python Markdown
2
3# A Python implementation of John Gruber's Markdown.
4
5# Documentation: https://python-markdown.github.io/
6# GitHub: https://github.com/Python-Markdown/markdown/
7# PyPI: https://pypi.org/project/Markdown/
8
9# Started by Manfred Stienstra (http://www.dwerg.net/).
10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
11# Currently maintained by Waylan Limberg (https://github.com/waylan),
12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
13
14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later)
15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
16# Copyright 2004 Manfred Stienstra (the original version)
17
18# License: BSD (see LICENSE.md for details).
19
20"""
21In version 3.0, a new, more flexible inline processor was added, [`markdown.inlinepatterns.InlineProcessor`][]. The
22original inline patterns, which inherit from [`markdown.inlinepatterns.Pattern`][] or one of its children are still
23supported, though users are encouraged to migrate.
24
25The new `InlineProcessor` provides two major enhancements to `Patterns`:
26
1.  Inline Processors no longer need to match the entire block, so regular expressions no longer need to start with
    `r'^(.*?)'` and end with `r'(.*)$'`. This runs faster. The returned [`Match`][re.Match] object will only contain
    what is explicitly matched in the pattern, and extension pattern groups now start with `m.group(1)`.
30
312. The `handleMatch` method now takes an additional input called `data`, which is the entire block under analysis,
32 not just what is matched with the specified pattern. The method now returns the element *and* the indexes relative
33 to `data` that the return element is replacing (usually `m.start(0)` and `m.end(0)`). If the boundaries are
34 returned as `None`, it is assumed that the match did not take place, and nothing will be altered in `data`.
35
36 This allows handling of more complex constructs than regular expressions can handle, e.g., matching nested
37 brackets, and explicit control of the span "consumed" by the processor.
38
39"""
40
41from __future__ import annotations
42
43from . import util
44from typing import TYPE_CHECKING, Any, Collection, NamedTuple
45import re
46import xml.etree.ElementTree as etree
47from html import entities
48
49if TYPE_CHECKING: # pragma: no cover
50 from markdown import Markdown
51
52
def build_inlinepatterns(md: Markdown, **kwargs: Any) -> util.Registry[InlineProcessor]:
    """
    Assemble the registry holding Markdown's default inline processors.

    Ordering matters a great deal here - e.g. if `http://.../` links were turned into `<a>` tags
    first and inline HTML were handled _afterwards_, the result would be a mess. The processors are
    therefore arranged as follows:

    * backticks and escaped characters come before everything else so that any markdown pattern can
      be preempted by escaping it;

    * next come the various kinds of links (auto-links must be handled before inline HTML);

    * then inline HTML, which at this point is simply swapped for a placeholder while the actual
      HTML is added to a stash;

    * finally strong, emphasis, etc.

    Arguments:
        md: The `Markdown` instance handed to the processors that need it.
        **kwargs: Accepted but not used by this function.

    Returns:
        A `util.Registry` with every default processor registered under its name and priority.

    """
    # One row per processor: (instance, registered name, priority).
    defaults = (
        (BacktickInlineProcessor(BACKTICK_RE), 'backtick', 190),
        (EscapeInlineProcessor(ESCAPE_RE, md), 'escape', 180),
        (ReferenceInlineProcessor(REFERENCE_RE, md), 'reference', 170),
        (LinkInlineProcessor(LINK_RE, md), 'link', 160),
        (ImageInlineProcessor(IMAGE_LINK_RE, md), 'image_link', 150),
        (ImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'image_reference', 140),
        (ShortReferenceInlineProcessor(REFERENCE_RE, md), 'short_reference', 130),
        (ShortImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'short_image_ref', 125),
        (AutolinkInlineProcessor(AUTOLINK_RE, md), 'autolink', 120),
        (AutomailInlineProcessor(AUTOMAIL_RE, md), 'automail', 110),
        (SubstituteTagInlineProcessor(LINE_BREAK_RE, 'br'), 'linebreak', 100),
        (HtmlInlineProcessor(HTML_RE, md), 'html', 90),
        (HtmlInlineProcessor(ENTITY_RE, md), 'entity', 80),
        (SimpleTextInlineProcessor(NOT_STRONG_RE), 'not_strong', 70),
        (AsteriskProcessor(r'\*'), 'em_strong', 60),
        (UnderscoreProcessor(r'_'), 'em_strong2', 50),
    )

    registry = util.Registry()
    for processor, name, priority in defaults:
        registry.register(processor, name, priority)
    return registry
96
97
98# The actual regular expressions for patterns
99# -----------------------------------------------------------------------------
100
NOIMG = r'(?<!\!)'
""" Match not an image. Partial regular expression which matches if not preceded by `!`. """

BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\2(?!`))'
""" Match backtick quoted string (`` `e=f()` `` or ``` ``e=f("`")`` ```). """

ESCAPE_RE = r'\\(.)'
""" Match a backslash escaped character (`\\<` or `\\*`). """

EMPHASIS_RE = r'(\*)([^\*]+)\1'
""" Match emphasis with an asterisk (`*emphasis*`). """

STRONG_RE = r'(\*{2})(.+?)\1'
""" Match strong with an asterisk (`**strong**`). """

SMART_STRONG_RE = r'(?<!\w)(_{2})(?!_)(.+?)(?<!_)\1(?!\w)'
""" Match strong with underscore while ignoring middle word underscores (`__smart__strong__`). """

SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\1(?!\w)'
""" Match emphasis with underscore while ignoring middle word underscores (`_smart_emphasis_`). """

SMART_STRONG_EM_RE = r'(?<!\w)(\_)\1(?!\1)(.+?)(?<!\w)\1(?!\1)(.+?)\1{3}(?!\w)'
""" Match strong emphasis with underscores (`__strong _em__`). """

EM_STRONG_RE = r'(\*)\1{2}(.+?)\1(.*?)\1{2}'
""" Match emphasis strong with asterisk (`***strongem***` or `***em*strong**`). """

EM_STRONG2_RE = r'(_)\1{2}(.+?)\1(.*?)\1{2}'
""" Match emphasis strong with underscores (`___emstrong___` or `___em_strong__`). """

STRONG_EM_RE = r'(\*)\1{2}(.+?)\1{2}(.*?)\1'
""" Match strong emphasis with asterisk (`***strong**em*`). """

STRONG_EM2_RE = r'(_)\1{2}(.+?)\1{2}(.*?)\1'
""" Match strong emphasis with underscores (`___strong__em_`). """

STRONG_EM3_RE = r'(\*)\1(?!\1)([^*]+?)\1(?!\1)(.+?)\1{3}'
""" Match strong emphasis with asterisk (`**strong*em***`). """

LINK_RE = NOIMG + r'\['
""" Match start of in-line link (`[text](url)` or `[text](<url>)` or `[text](url "title")`). """

IMAGE_LINK_RE = r'\!\['
""" Match start of in-line image link (`![alt](url)` or `![alt](<url>)`). """

REFERENCE_RE = LINK_RE
""" Match start of reference link (`[Label][3]`). """

IMAGE_REFERENCE_RE = IMAGE_LINK_RE
""" Match start of image reference (`![alt text][2]`). """

NOT_STRONG_RE = r'((^|(?<=\s))(\*{1,3}|_{1,3})(?=\s|$))'
""" Match a stand-alone `*` or `_`. """

AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^<>]*)>'
""" Match an automatic link (`<http://www.example.com>`). """

AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>'
""" Match an automatic email link (`<me@example.com>`). """

HTML_RE = (
    r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|'               # Tag
    r'!--(?:(?!<!--|-->).)*--|'                        # Comment
    r'[?](?:(?!<[?]|[?]>).)*[?]|'                      # Processing instruction
    r'!\[CDATA\[(?:(?!<!\[CDATA\[|\]\]>).)*\]\]'       # `CDATA`
    ')>)'
)
""" Match an HTML tag (`<...>`). """

ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)'
""" Match an HTML entity (`&#38;` (decimal) or `&#x26;` (hex) or `&amp;` (named)). """

# A hard line break requires *two* literal spaces before the newline. With a single space
# every line that merely ends in one space would be turned into a `<br>`.
LINE_BREAK_RE = r'  \n'
""" Match two spaces at end of line. """
175
176
def dequote(string: str) -> str:
    """Strip one pair of matching single or double quotes wrapping `string`.

    The string is returned untouched unless it both starts and ends with the
    same quote character (`"` or `'`).
    """
    opener = string[:1]
    wrapped = opener in ('"', "'") and string.endswith(opener)
    return string[1:-1] if wrapped else string
184
185
class EmStrongItem(NamedTuple):
    """Emphasis/strong pattern item.

    One entry of a processor's `PATTERNS` list: a compiled regular expression together with
    the information needed to turn a match into elements.
    """
    # Compiled regular expression that is matched against the text.
    pattern: re.Pattern[str]
    # Name of the builder to use; `AsteriskProcessor.build_element` checks for `'double2'` and
    # `'double'` and treats anything else as a single-tag builder.
    builder: str
    # Tag name, or two comma separated tag names for the double builders (e.g. `'strong,em'`).
    tags: str
191
192
193# The pattern classes
194# -----------------------------------------------------------------------------
195
196
class Pattern:  # pragma: no cover
    """
    Base class that inline patterns subclass.

    Inline patterns are handled by means of `Pattern` subclasses, one per regular expression.
    Each pattern object uses a single regular expression and must support the following methods:
    [`getCompiledRegExp`][markdown.inlinepatterns.Pattern.getCompiledRegExp] and
    [`handleMatch`][markdown.inlinepatterns.Pattern.handleMatch].

    All the regular expressions used by `Pattern` subclasses must capture the whole block. For this
    reason, they all start with `^(.*?)` and end with `(.*)$`. When passing a regular expression on
    class initialization, the `^(.*?)` and `(.*)$` are added automatically and the regular expression
    is pre-compiled.

    It is strongly suggested that the newer style [`markdown.inlinepatterns.InlineProcessor`][] that
    use a more efficient and flexible search approach be used instead. However, the older style
    `Pattern` remains for backward compatibility with many existing third-party extensions.

    """

    ANCESTOR_EXCLUDES: Collection[str] = tuple()
    """
    A collection of elements which are undesirable ancestors. The processor will be skipped if it
    would cause the content to be a descendant of one of the listed tag names.
    """

    compiled_re: re.Pattern[str]
    md: Markdown | None

    def __init__(self, pattern: str, md: Markdown | None = None):
        """
        Create an instance of an inline pattern.

        Arguments:
            pattern: A regular expression that matches a pattern.
            md: An optional pointer to the instance of `markdown.Markdown` and is available as
                `self.md` on the class instance.

        """
        self.pattern = pattern
        # Wrap the pattern so the whole block is captured: group 1 is the text before the
        # pattern and the last group is the text after it.
        self.compiled_re = re.compile(r"^(.*?)%s(.*)$" % pattern,
                                      re.DOTALL | re.UNICODE)

        self.md = md

    def getCompiledRegExp(self) -> re.Pattern[str]:
        """ Return a compiled regular expression. """
        return self.compiled_re

    def handleMatch(self, m: re.Match[str]) -> etree.Element | str:
        """Return a ElementTree element from the given match.

        Subclasses should override this method.

        Arguments:
            m: A match object containing a match of the pattern.

        Returns: An ElementTree Element object.

        """
        pass  # pragma: no cover

    def type(self) -> str:
        """ Return class name, to define pattern type """
        return self.__class__.__name__

    def unescape(self, text: str) -> str:
        """
        Return unescaped text given text with an inline placeholder.

        Every inline placeholder in `text` is looked up in the inline tree processor's stash: a
        stashed string is substituted as is, a stashed element is replaced by its text content.
        When no `Markdown` instance is attached, or it has no `'inline'` tree processor, `text`
        is returned unchanged.

        Arguments:
            text: Text which may contain inline placeholders.

        Returns: The text with the placeholders resolved.

        """
        # `md` is optional; without it there is no stash to consult.
        if self.md is None:
            return text

        try:
            stash = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m: re.Match[str]) -> str | None:
            placeholder_id = m.group(1)
            if placeholder_id in stash:
                value = stash.get(placeholder_id)
                if isinstance(value, str):
                    return value
                else:
                    # An `etree` Element - return text content only
                    return ''.join(value.itertext())
            # NOTE(review): unknown ids fall through and return `None`; CPython's `re.sub`
            # appears to treat that as an empty replacement - confirm before relying on it.

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
281
282
class InlineProcessor(Pattern):
    """
    Base class for the newer style inline processors.

    Unlike the legacy `Pattern`, the regular expression is compiled exactly as given (it is not
    wrapped to capture the whole block), which allows a more efficient and flexible search.

    """

    def __init__(self, pattern: str, md: Markdown | None = None):
        """
        Create an instance of an inline processor.

        Arguments:
            pattern: A regular expression that matches a pattern.
            md: An optional pointer to the instance of `markdown.Markdown` and is available as
                `self.md` on the class instance.

        """
        self.md = md
        # API for Markdown to pass `safe_mode` into instance
        self.safe_mode = False

        self.pattern = pattern
        flags = re.DOTALL | re.UNICODE
        self.compiled_re = re.compile(pattern, flags)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str | None, int | None, int | None]:
        """Build the replacement for a match and report the span of text it covers.

        If `start` and/or `end` are returned as `None`, it will be
        assumed that the processor did not find a valid region of text.

        Subclasses should override this method.

        Arguments:
            m: A re match object containing a match of the pattern.
            data: The buffer currently under analysis.

        Returns:
            el: The ElementTree element, text or None.
            start: The start of the region that has been matched or None.
            end: The end of the region that has been matched or None.

        """
        return None  # pragma: no cover
329
330
class SimpleTextPattern(Pattern):  # pragma: no cover
    """ Legacy pattern whose result is the plain text captured by `group(2)`. """
    def handleMatch(self, m: re.Match[str]) -> str:
        """ Return the text captured by `group(2)` of the match. """
        captured = m.group(2)
        return captured
336
337
class SimpleTextInlineProcessor(InlineProcessor):
    """ Inline processor whose result is the plain text captured by `group(1)`. """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]:
        """ Return the text of `group(1)` together with the span of the whole match. """
        start, end = m.span(0)
        return m.group(1), start, end
343
344
class EscapeInlineProcessor(InlineProcessor):
    """ Return an escaped character. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str | None, int, int]:
        """
        Replace a backslash escaped character by a code point placeholder.

        When the character captured by `group(1)` is one of
        [`ESCAPED_CHARS`][markdown.Markdown.ESCAPED_CHARS], the result is its Unicode code point (as returned
        by [`ord`][]) wrapped in [`util.STX`][markdown.util.STX] and [`util.ETX`][markdown.util.ETX].

        For any other character the replacement is `None`. The span of the whole match is returned in
        both cases.
        """
        start, end = m.span(0)
        char = m.group(1)

        if char not in self.md.ESCAPED_CHARS:
            return None, start, end

        placeholder = '{}{}{}'.format(util.STX, ord(char), util.ETX)
        return placeholder, start, end
362
363
class SimpleTagPattern(Pattern):  # pragma: no cover
    """
    Legacy pattern producing an element of type `tag` whose text is `group(3)`
    of the match.

    """
    def __init__(self, pattern: str, tag: str):
        """
        Create an instance of a simple tag pattern.

        Arguments:
            pattern: A regular expression that matches a pattern.
            tag: Tag of element.

        """
        Pattern.__init__(self, pattern)
        self.tag = tag
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """
        Build an [`Element`][xml.etree.ElementTree.Element] of type `tag` and use the string captured by
        `group(3)` of the match as its text.
        """
        element = etree.Element(self.tag)
        element.text = m.group(3)
        return element
391
392
class SimpleTagInlineProcessor(InlineProcessor):
    """
    Inline processor producing an element of type `tag` whose text is `group(2)`
    of the match.

    """
    def __init__(self, pattern: str, tag: str):
        """
        Create an instance of a simple tag processor.

        Arguments:
            pattern: A regular expression that matches a pattern.
            tag: Tag of element.

        """
        InlineProcessor.__init__(self, pattern)
        self.tag = tag
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:  # pragma: no cover
        """
        Build an [`Element`][xml.etree.ElementTree.Element] of type `tag` with the string captured by
        `group(2)` as its text, and return it with the span of the whole match.
        """
        element = etree.Element(self.tag)
        element.text = m.group(2)
        start, end = m.span(0)
        return element, start, end
420
421
class SubstituteTagPattern(SimpleTagPattern):  # pragma: no cover
    """ Legacy pattern that replaces the match by a childless element of type `tag`. """
    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """ Return a new, empty [`Element`][xml.etree.ElementTree.Element] of type `tag`. """
        empty = etree.Element(self.tag)
        return empty
427
428
class SubstituteTagInlineProcessor(SimpleTagInlineProcessor):
    """ Inline processor that replaces the match by a childless element of type `tag`. """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """
        Return a new, empty [`Element`][xml.etree.ElementTree.Element] of type `tag` and the span of
        the whole match.
        """
        empty = etree.Element(self.tag)
        start, end = m.span(0)
        return empty, start, end
434
435
class BacktickInlineProcessor(InlineProcessor):
    """ Return a `<code>` element containing the escaped matching text. """
    def __init__(self, pattern: str):
        InlineProcessor.__init__(self, pattern)
        self.ESCAPED_BSLASH = '{}{}{}'.format(util.STX, ord('\\'), util.ETX)
        self.tag = 'code'
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str, int, int]:
        """
        Turn a backtick match into a `code` element, or escape a run of backslashes.

        When `group(3)` of the match is non-empty, a `code` [`Element`][xml.etree.ElementTree.Element] is
        returned whose text is the stripped content, escaped with
        [`code_escape`][markdown.util.code_escape] and wrapped in an
        [`AtomicString`][markdown.util.AtomicString].

        Otherwise the text of `group(1)` is returned with each pair of backslashes replaced by the
        escaped backslash placeholder.

        """
        start, end = m.span(0)
        code = m.group(3)

        if not code:
            escaped = m.group(1).replace('\\\\', self.ESCAPED_BSLASH)
            return escaped, start, end

        element = etree.Element(self.tag)
        element.text = util.AtomicString(util.code_escape(code.strip()))
        return element, start, end
459
460
class DoubleTagPattern(SimpleTagPattern):  # pragma: no cover
    """Legacy pattern that nests a `tag2` element inside a `tag1` element.

    Useful for strong emphasis etc.

    """
    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """
        Return an [`Element`][xml.etree.ElementTree.Element] in the following format:
        `<tag1><tag2>group(3)</tag2>group(4)</tag1>` where `group(4)` is optional.

        `self.tag` holds both tag names separated by a comma.

        """
        outer_tag, inner_tag = self.tag.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.SubElement(outer, inner_tag)
        inner.text = m.group(3)
        # Only a match with exactly five groups carries trailing text for the inner element.
        group_count = len(m.groups())
        if group_count == 5:
            inner.tail = m.group(4)
        return outer
480
481
class DoubleTagInlineProcessor(SimpleTagInlineProcessor):
    """Inline processor that nests a `tag2` element inside a `tag1` element.

    Useful for strong emphasis etc.

    """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:  # pragma: no cover
        """
        Return an [`Element`][xml.etree.ElementTree.Element] in the following format:
        `<tag1><tag2>group(2)</tag2>group(3)</tag1>` where `group(3)` is optional, together with the
        span of the whole match.

        `self.tag` holds both tag names separated by a comma.

        """
        outer_tag, inner_tag = self.tag.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.SubElement(outer, inner_tag)
        inner.text = m.group(2)
        # Only a match with exactly three groups carries trailing text for the inner element.
        group_count = len(m.groups())
        if group_count == 3:
            inner.tail = m.group(3)
        start, end = m.span(0)
        return outer, start, end
501
502
class HtmlInlineProcessor(InlineProcessor):
    """ Store raw inline html and return a placeholder. """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]:
        """
        Stash the raw HTML captured by `group(1)` and return the placeholder that stands in for it.

        Inline placeholders and backslash escapes inside the captured text are resolved before it
        is stored in `md.htmlStash`.
        """
        resolved = self.unescape(m.group(1))
        raw_html = self.backslash_unescape(resolved)
        placeholder = self.md.htmlStash.store(raw_html)
        start, end = m.span(0)
        return placeholder, start, end

    def unescape(self, text: str) -> str:
        """
        Return `text` with inline placeholders replaced by their serialized stashed nodes.

        If the `Markdown` instance has no `'inline'` tree processor the text is returned unchanged.
        """
        try:
            stashed_nodes = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def serialize_stashed(match: re.Match[str]) -> str:
            node = stashed_nodes.get(match.group(1))
            if node is None:
                # Nothing stashed under this id: no replacement value is produced.
                return None
            try:
                return self.md.serializer(node)
            except Exception:
                # The stashed value could not be serialized; fall back to a backslash
                # followed by its string form.
                return r'\%s' % node

        return util.INLINE_PLACEHOLDER_RE.sub(serialize_stashed, text)

    def backslash_unescape(self, text: str) -> str:
        """
        Return text with backslash escapes undone (backslashes are restored).

        If the `Markdown` instance has no `'unescape'` tree processor the text is returned unchanged.
        """
        try:
            escaped_re = self.md.treeprocessors['unescape'].RE
        except KeyError:  # pragma: no cover
            return text

        # `group(1)` holds the decimal code point of the escaped character.
        return escaped_re.sub(lambda match: chr(int(match.group(1))), text)
540
541
class AsteriskProcessor(InlineProcessor):
    """Emphasis processor for handling strong and em matches inside asterisks.

    The processor's own pattern only locates a potential start token. The actual work is done by
    trying the entries of `PATTERNS` in order at that position; nested emphasis inside a match is
    resolved recursively by `parse_sub_patterns`.
    """

    PATTERNS = [
        EmStrongItem(re.compile(EM_STRONG_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'),
        EmStrongItem(re.compile(STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'),
        EmStrongItem(re.compile(STRONG_EM3_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'),
        EmStrongItem(re.compile(STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'),
        EmStrongItem(re.compile(EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em')
    ]
    """ The various strong and emphasis patterns handled by this processor. """

    def build_single(self, m: re.Match[str], tag: str, idx: int) -> etree.Element:
        """Return single tag.

        Creates an element named `tag` and parses `group(2)` of the match into it.

        `idx` is the index in `PATTERNS` of the pattern that produced `m`.
        """
        el1 = etree.Element(tag)
        text = m.group(2)
        self.parse_sub_patterns(text, el1, None, idx)
        return el1

    def build_double(self, m: re.Match[str], tags: str, idx: int) -> etree.Element:
        """Return double tag.

        `tags` holds two comma separated tag names: the outer element, then the inner one.
        `group(2)` is parsed into the inner element; when the match has exactly three groups,
        `group(3)` is parsed into the outer element after the inner one.

        `idx` is the index in `PATTERNS` of the pattern that produced `m`.
        """

        tag1, tag2 = tags.split(",")
        el1 = etree.Element(tag1)
        el2 = etree.Element(tag2)
        text = m.group(2)
        self.parse_sub_patterns(text, el2, None, idx)
        el1.append(el2)
        if len(m.groups()) == 3:
            text = m.group(3)
            # `el2` is passed as `last` so that leading text ends up in its tail.
            self.parse_sub_patterns(text, el1, el2, idx)
        return el1

    def build_double2(self, m: re.Match[str], tags: str, idx: int) -> etree.Element:
        """Return double tags (variant 2): `<strong>text <em>text</em></strong>`.

        `group(2)` is parsed into the outer element first, then the inner element is appended
        and `group(3)` is parsed into it.

        `idx` is the index in `PATTERNS` of the pattern that produced `m`.
        """

        tag1, tag2 = tags.split(",")
        el1 = etree.Element(tag1)
        el2 = etree.Element(tag2)
        text = m.group(2)
        self.parse_sub_patterns(text, el1, None, idx)
        text = m.group(3)
        el1.append(el2)
        self.parse_sub_patterns(text, el2, None, idx)
        return el1

    def parse_sub_patterns(
        self, data: str, parent: etree.Element, last: etree.Element | None, idx: int
    ) -> None:
        """
        Parses sub patterns.

        Scans `data` for nested emphasis/strong constructs, appending the elements built for them
        to `parent` and storing the text in between as element text or tail.

        `data`: text to evaluate.

        `parent`: Parent to attach text and sub elements to.

        `last`: Last appended child to parent. Can also be None if parent has no children.

        `idx`: Current pattern index that was used to evaluate the parent.
        """

        # `offset` marks the start of the text not yet attached to the tree,
        # `pos` is the current scan position.
        offset = 0
        pos = 0

        length = len(data)
        while pos < length:
            # Find the start of potential emphasis or strong tokens
            if self.compiled_re.match(data, pos):
                matched = False
                # See if we can match an emphasis/strong pattern
                for index, item in enumerate(self.PATTERNS):
                    # Only evaluate patterns that are after what was used on the parent
                    if index <= idx:
                        continue
                    m = item.pattern.match(data, pos)
                    if m:
                        # Append child nodes to parent
                        # Text nodes should be appended to the last
                        # child if present, and if not, it should
                        # be added as the parent's text node.
                        text = data[offset:m.start(0)]
                        if text:
                            if last is not None:
                                last.tail = text
                            else:
                                parent.text = text
                        el = self.build_element(m, item.builder, item.tags, index)
                        parent.append(el)
                        last = el
                        # Move our position past the matched hunk
                        offset = pos = m.end(0)
                        matched = True
                        # NOTE: there is no `break` here; the remaining (later) patterns of
                        # this pass are tried at the updated `pos`.
                if not matched:
                    # We matched nothing, move on to the next character
                    pos += 1
            else:
                # Increment position as no potential emphasis start was found.
                pos += 1

        # Append any leftover text as a text node.
        text = data[offset:]
        if text:
            if last is not None:
                last.tail = text
            else:
                parent.text = text

    def build_element(self, m: re.Match[str], builder: str, tags: str, index: int) -> etree.Element:
        """Element builder.

        Dispatches on `builder`: `'double2'` and `'double'` select the matching double builders,
        any other value falls back to `build_single`.
        """

        if builder == 'double2':
            return self.build_double2(m, tags, index)
        elif builder == 'double':
            return self.build_double(m, tags, index)
        else:
            return self.build_single(m, tags, index)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """Parse patterns.

        Tries every entry of `PATTERNS`, in order, at the start of `m` and builds the element for
        the first one that matches.

        Returns the element with the start and end of the matched text, or `(None, None, None)`
        when none of the patterns match.
        """

        el = None
        start = None
        end = None

        for index, item in enumerate(self.PATTERNS):
            m1 = item.pattern.match(data, m.start(0))
            if m1:
                start = m1.start(0)
                end = m1.end(0)
                el = self.build_element(m1, item.builder, item.tags, index)
                break
        return el, start, end
674
675
class UnderscoreProcessor(AsteriskProcessor):
    """Emphasis processor for handling strong and em matches inside underscores."""

    # Each row is (regular expression, builder name, tag names) and becomes one `EmStrongItem`.
    PATTERNS = [
        EmStrongItem(re.compile(regex, re.DOTALL | re.UNICODE), builder, tags)
        for regex, builder, tags in (
            (EM_STRONG2_RE, 'double', 'strong,em'),
            (STRONG_EM2_RE, 'double', 'em,strong'),
            (SMART_STRONG_EM_RE, 'double2', 'strong,em'),
            (SMART_STRONG_RE, 'single', 'strong'),
            (SMART_EMPHASIS_RE, 'single', 'em'),
        )
    ]
    """ The various strong and emphasis patterns handled by this processor. """
687
688
class LinkInlineProcessor(InlineProcessor):
    """ Return a link element from the given match.

    The processor's pattern only finds the opening `[`. The link text and the `(...)` part are
    then parsed by hand (`getText` and `getLink`) so that nested brackets can be resolved.
    """
    # Matches the opening `(`, optionally followed by a complete `<link>`, an optional quoted
    # title and the closing `)`.
    RE_LINK = re.compile(r'''\(\s*(?:(<[^<>]*>)\s*(?:('[^']*'|"[^"]*")\s*)?\))?''', re.DOTALL | re.UNICODE)
    # Matches any single whitespace character; used to normalize whitespace in titles.
    RE_TITLE_CLEAN = re.compile(r'\s')

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """ Return an `a` [`Element`][xml.etree.ElementTree.Element] or `(None, None, None)`.

        `(None, None, None)` is returned when either the bracketed text or the link part cannot be
        parsed. Otherwise the element is returned with the start of the match and the index just
        past the parsed link.
        """
        text, index, handled = self.getText(data, m.end(0))

        if not handled:
            return None, None, None

        href, title, index, handled = self.getLink(data, index)
        if not handled:
            return None, None, None

        el = etree.Element("a")
        el.text = text

        el.set("href", href)

        if title is not None:
            el.set("title", title)

        return el, m.start(0), index

    def getLink(self, data: str, index: int) -> tuple[str, str | None, int, bool]:
        """Parse data between `()` of `[Text]()` allowing recursive `()`.

        Arguments:
            data: The buffer currently under analysis.
            index: Position in `data` at which the `(` is expected.

        Returns:
            href: The link target with placeholders resolved and surrounding whitespace stripped.
            title: The title, or `None` if there is none.
            index: Position just past the parsed link.
            handled: Whether a complete link was found.

        """

        href = ''
        title: str | None = None
        handled = False

        m = self.RE_LINK.match(data, pos=index)
        if m and m.group(1):
            # Matches [Text](<link> "title")
            # Strip the surrounding `<>` from the link and the quotes from the title.
            href = m.group(1)[1:-1].strip()
            if m.group(2):
                title = m.group(2)[1:-1]
            index = m.end(0)
            handled = True
        elif m:
            # Only the opening `(` (and following whitespace) matched; scan the rest by hand.
            # Track bracket nesting and index in string
            bracket_count = 1
            backtrack_count = 1
            start_index = m.end()
            index = start_index
            last_bracket = -1

            # Primary (first found) quote tracking.
            quote: str | None = None
            start_quote = -1
            exit_quote = -1
            ignore_matches = False

            # Secondary (second found) quote tracking.
            alt_quote = None
            start_alt_quote = -1
            exit_alt_quote = -1

            # Track last character
            last = ''

            # `index` advances in step with `pos`; while a character is being examined it
            # equals `pos`, so `index + 1` is the position just past that character.
            for pos in range(index, len(data)):
                c = data[pos]
                if c == '(':
                    # Count nested (
                    # Don't increment the bracket count if we are sure we're in a title.
                    if not ignore_matches:
                        bracket_count += 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                elif c == ')':
                    # Match nested ) to (
                    # Don't decrement if we are sure we are in a title that is unclosed.
                    if ((exit_quote != -1 and quote == last) or (exit_alt_quote != -1 and alt_quote == last)):
                        bracket_count = 0
                    elif not ignore_matches:
                        bracket_count -= 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                        # We've found our backup end location if the title doesn't resolve.
                        if backtrack_count == 0:
                            last_bracket = index + 1

                elif c in ("'", '"'):
                    # Quote has started
                    if not quote:
                        # We'll assume we are now in a title.
                        # Brackets are quoted, so no need to match them (except for the final one).
                        ignore_matches = True
                        backtrack_count = bracket_count
                        bracket_count = 1
                        start_quote = index + 1
                        quote = c
                    # Secondary quote (in case the first doesn't resolve): [text](link'"title")
                    elif c != quote and not alt_quote:
                        start_alt_quote = index + 1
                        alt_quote = c
                    # Update primary quote match
                    elif c == quote:
                        exit_quote = index + 1
                    # Update secondary quote match
                    elif alt_quote and c == alt_quote:
                        exit_alt_quote = index + 1

                index += 1

                # Link is closed, so let's break out of the loop
                if bracket_count == 0:
                    # Get the title if we closed a title string right before link closed
                    if exit_quote >= 0 and quote == last:
                        # The link ends just before the opening quote; the title is the text
                        # between the opening and the closing quote.
                        href = data[start_index:start_quote - 1]
                        title = ''.join(data[start_quote:exit_quote - 1])
                    elif exit_alt_quote >= 0 and alt_quote == last:
                        href = data[start_index:start_alt_quote - 1]
                        title = ''.join(data[start_alt_quote:exit_alt_quote - 1])
                    else:
                        # No title: the link is everything up to the closing bracket.
                        href = data[start_index:index - 1]
                    break

                # Remember the last character seen that is not a space.
                if c != ' ':
                    last = c

            # We have a scenario: `[test](link"notitle)`
            # When we enter a string, we stop tracking bracket resolution in the main counter,
            # but we do keep a backup counter up until we discover where we might resolve all brackets
            # if the title string fails to resolve.
            if bracket_count != 0 and backtrack_count == 0:
                href = data[start_index:last_bracket - 1]
                index = last_bracket
                bracket_count = 0

            handled = bracket_count == 0

        if title is not None:
            # Resolve placeholders, remove a wrapping pair of quotes and turn every whitespace
            # character into a plain space.
            title = self.RE_TITLE_CLEAN.sub(' ', dequote(self.unescape(title.strip())))

        href = self.unescape(href).strip()

        return href, title, index, handled

    def getText(self, data: str, index: int) -> tuple[str, int, bool]:
        """Parse the content between `[]` of the start of an image or link
        resolving nested square brackets.

        Arguments:
            data: The buffer currently under analysis.
            index: Position in `data` just past the opening `[`.

        Returns:
            text: The characters collected before the closing `]` (or up to the end of `data`).
            index: Position just past the last character examined.
            handled: Whether the matching closing `]` was found.

        """
        # Starts at 1 because the opening `[` has already been consumed by the caller.
        bracket_count = 1
        text = []
        for pos in range(index, len(data)):
            c = data[pos]
            if c == ']':
                bracket_count -= 1
            elif c == '[':
                bracket_count += 1
            index += 1
            if bracket_count == 0:
                # The closing `]` itself is not part of the text.
                break
            text.append(c)
        return ''.join(text), index, bracket_count == 0
849
850
class ImageInlineProcessor(LinkInlineProcessor):
    """ Return a `img` element from the given match. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Return an `img` [`Element`][xml.etree.ElementTree.Element] or `(None, None, None)`.

        The bracketed text becomes the `alt` attribute (with placeholders resolved), the link
        becomes `src` and an optional title becomes `title`. `(None, None, None)` is returned when
        either the text or the link part cannot be parsed.
        """
        unhandled = (None, None, None)

        alt_text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return unhandled

        src, title, index, handled = self.getLink(data, index)
        if not handled:
            return unhandled

        image = etree.Element("img")
        image.set("src", src)
        if title is not None:
            image.set("title", title)
        image.set('alt', self.unescape(alt_text))

        return image, m.start(0), index
873
874
class ReferenceInlineProcessor(LinkInlineProcessor):
    """ Match to a stored reference and return link element. """
    NEWLINE_CLEANUP_RE = re.compile(r'\s+', re.MULTILINE)

    RE_LINK = re.compile(r'\s?\[([^\]]*)\]', re.DOTALL | re.UNICODE)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Return the [`Element`][xml.etree.ElementTree.Element] built by `makeTag`, or a `None` element.

        `(None, None, None)` is returned when the bracketed text or the id cannot be parsed. When the
        id is not found in `md.references`, the element is `None` but the matched span is still
        reported.

        """
        start = m.start(0)

        text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return None, None, None

        ref_id, end, handled = self.evalId(data, index, text)
        if not handled:
            return None, None, None

        # Clean up line breaks in id
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)

        references = self.md.references
        if ref_id in references:
            href, title = references[ref_id]
            return self.makeTag(href, title, text), start, end

        # ignore undefined refs
        return None, start, end

    def evalId(self, data: str, index: int, text: str) -> tuple[str | None, int, bool]:
        """
        Evaluate the id portion of `[ref][id]`.

        An empty id (`[ref][]`) falls back to the link text. The id is lower-cased in either case.
        Returns the id, the end of the id portion and whether an id portion was found at all.
        """
        match = self.RE_LINK.match(data, pos=index)
        if match is None:
            return None, index, False

        ref_id = match.group(1).lower() or text.lower()
        return ref_id, match.end(0), True

    def makeTag(self, href: str, title: str, text: str) -> etree.Element:
        """
        Return an `a` [`Element`][xml.etree.ElementTree.Element] with `href` set, `title` set when it
        is not empty, and `text` as its text.
        """
        link = etree.Element('a')
        link.set('href', href)
        if title:
            link.set('title', title)
        link.text = text
        return link
929
930
class ShortReferenceInlineProcessor(ReferenceInlineProcessor):
    """Short form of reference: `[google]`. """
    def evalId(self, data: str, index: int, text: str) -> tuple[str, int, bool]:
        """Use the lower-cased link text of `[ref]` as the id; `index` is returned unchanged. """
        ref_id = text.lower()
        return ref_id, index, True
937
938
class ImageReferenceInlineProcessor(ReferenceInlineProcessor):
    """ Match to a stored reference and return `img` element. """
    def makeTag(self, href: str, title: str, text: str) -> etree.Element:
        """
        Return an `img` [`Element`][xml.etree.ElementTree.Element] with `src` set to `href`, `title`
        set when it is not empty, and `alt` set to `text` with placeholders resolved.
        """
        image = etree.Element("img")
        image.set("src", href)
        if title:
            image.set("title", title)
        alt_text = self.unescape(text)
        image.set("alt", alt_text)
        return image
949
950
class ShortImageReferenceInlineProcessor(ImageReferenceInlineProcessor):
    """ Short form of image reference: `![ref]`. """
    def evalId(self, data: str, index: int, text: str) -> tuple[str, int, bool]:
        """Use the lower-cased text of `[ref]` as the id; `index` is returned unchanged. """
        ref_id = text.lower()
        return ref_id, index, True
957
958
class AutolinkInlineProcessor(InlineProcessor):
    """ Return a link Element given an auto-link (`<http://example/com>`). """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """
        Return an `a` [`Element`][xml.etree.ElementTree.Element] of `group(1)`.

        The captured URL is used as `href` (with placeholders resolved) and, wrapped in an
        `AtomicString`, as the link text.
        """
        url = m.group(1)
        link = etree.Element("a")
        link.set('href', self.unescape(url))
        link.text = util.AtomicString(url)
        start, end = m.span(0)
        return link, start, end
967
968
class AutomailInlineProcessor(InlineProcessor):
    """
    Return a `mailto` link Element given an auto-mail link (`<foo@example.com>`).
    """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """
        Return an [`Element`][xml.etree.ElementTree.Element] containing a `mailto` link of `group(1)`.

        Every character of the visible address is written as a named entity where one is defined
        and as a numeric entity otherwise; every character of the `href` is written as a numeric
        entity.
        """
        address = self.unescape(m.group(1))
        prefix = "mailto:"
        if address.startswith(prefix):
            address = address[len(prefix):]

        def codepoint2name(code: int) -> str:
            """Return entity definition by code, or the code if not defined."""
            entity = entities.codepoint2name.get(code)
            if not entity:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
            return "{}{};".format(util.AMP_SUBSTITUTE, entity)

        link = etree.Element('a')

        visible = ''.join(codepoint2name(ord(letter)) for letter in address)
        link.text = util.AtomicString(visible)

        target = "mailto:" + address
        encoded_target = "".join(util.AMP_SUBSTITUTE + '#%d;' % ord(letter) for letter in target)
        link.set('href', encoded_target)

        start, end = m.span(0)
        return link, start, end