1# Python Markdown
2
3# A Python implementation of John Gruber's Markdown.
4
5# Documentation: https://python-markdown.github.io/
6# GitHub: https://github.com/Python-Markdown/markdown/
7# PyPI: https://pypi.org/project/Markdown/
8
9# Started by Manfred Stienstra (http://www.dwerg.net/).
10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
11# Currently maintained by Waylan Limberg (https://github.com/waylan),
12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
13
14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later)
15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
16# Copyright 2004 Manfred Stienstra (the original version)
17
18# License: BSD (see LICENSE.md for details).
19
20"""
21In version 3.0, a new, more flexible inline processor was added, [`markdown.inlinepatterns.InlineProcessor`][]. The
22original inline patterns, which inherit from [`markdown.inlinepatterns.Pattern`][] or one of its children are still
23supported, though users are encouraged to migrate.
24
25The new `InlineProcessor` provides two major enhancements to `Patterns`:
26
1.  Inline Processors no longer need to match the entire block, so regular expressions no longer need to start with
    `r'^(.*?)'` and end with `r'(.*?)$'`. This runs faster. The returned [`Match`][re.Match] object will only contain
    what is explicitly matched in the pattern, and extension pattern groups now start with `m.group(1)`.
30
312. The `handleMatch` method now takes an additional input called `data`, which is the entire block under analysis,
32 not just what is matched with the specified pattern. The method now returns the element *and* the indexes relative
33 to `data` that the return element is replacing (usually `m.start(0)` and `m.end(0)`). If the boundaries are
34 returned as `None`, it is assumed that the match did not take place, and nothing will be altered in `data`.
35
36 This allows handling of more complex constructs than regular expressions can handle, e.g., matching nested
37 brackets, and explicit control of the span "consumed" by the processor.
38
39"""
40
41from __future__ import annotations
42
43from . import util
44from typing import TYPE_CHECKING, Any, Collection, NamedTuple
45import re
46import xml.etree.ElementTree as etree
47from html import entities
48
49if TYPE_CHECKING: # pragma: no cover
50 from markdown import Markdown
51
52
def build_inlinepatterns(md: Markdown, **kwargs: Any) -> util.Registry[InlineProcessor]:
    """
    Assemble the registry holding the default inline processors for Markdown.

    Each processor is registered under a name together with a numeric priority (190 down to 50 here).
    The order in which processors are applied is very important - e.g. replacing `http://.../` links
    with `<a>` tags _before_ dealing with inline HTML would leave a mess. The defaults are therefore
    arranged like this:

    * backticks and escaped characters come before everything else, so that escaping can preempt any
      other markdown pattern;

    * the various types of links follow (auto-links must be handled before inline HTML);

    * inline HTML and entities are next; each raw string is replaced with a placeholder and the actual
      HTML is added to a stash;

    * strong, emphasis, etc. are applied last.

    Arguments:
        md: The `Markdown` instance handed to every processor that takes one.
        **kwargs: Accepted for signature compatibility; not used by this function.

    Returns: The populated registry.

    """
    defaults = (
        (BacktickInlineProcessor(BACKTICK_RE), 'backtick', 190),
        (EscapeInlineProcessor(ESCAPE_RE, md), 'escape', 180),
        (ReferenceInlineProcessor(REFERENCE_RE, md), 'reference', 170),
        (LinkInlineProcessor(LINK_RE, md), 'link', 160),
        (ImageInlineProcessor(IMAGE_LINK_RE, md), 'image_link', 150),
        (ImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'image_reference', 140),
        (ShortReferenceInlineProcessor(REFERENCE_RE, md), 'short_reference', 130),
        (ShortImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'short_image_ref', 125),
        (AutolinkInlineProcessor(AUTOLINK_RE, md), 'autolink', 120),
        (AutomailInlineProcessor(AUTOMAIL_RE, md), 'automail', 110),
        (SubstituteTagInlineProcessor(LINE_BREAK_RE, 'br'), 'linebreak', 100),
        (HtmlInlineProcessor(HTML_RE, md), 'html', 90),
        (HtmlInlineProcessor(ENTITY_RE, md), 'entity', 80),
        (SimpleTextInlineProcessor(NOT_STRONG_RE), 'not_strong', 70),
        (AsteriskProcessor(r'\*'), 'em_strong', 60),
        (UnderscoreProcessor(r'_'), 'em_strong2', 50),
    )

    registry = util.Registry()
    for processor, name, priority in defaults:
        registry.register(processor, name, priority)
    return registry
96
97
98# The actual regular expressions for patterns
99# -----------------------------------------------------------------------------
100
NOIMG = r'(?<!\!)'
""" Match not an image. Partial regular expression which matches if not preceded by `!`. """

BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\2(?!`))'
""" Match backtick quoted string (`` `e=f()` `` or ``` ``e=f("`")`` ```). """

ESCAPE_RE = r'\\(.)'
""" Match a backslash escaped character (`\\<` or `\\*`). """

EMPHASIS_RE = r'(\*)([^\*]+)\1'
""" Match emphasis with an asterisk (`*emphasis*`). """

STRONG_RE = r'(\*{2})(.+?)\1'
""" Match strong with an asterisk (`**strong**`). """

SMART_STRONG_RE = r'(?<!\w)(_{2})(?!_)(.+?)(?<!_)\1(?!\w)'
""" Match strong with underscore while ignoring middle word underscores (`__smart__strong__`). """

SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\1(?!\w)'
""" Match emphasis with underscore while ignoring middle word underscores (`_smart_emphasis_`). """

SMART_STRONG_EM_RE = r'(?<!\w)(\_)\1(?!\1)(.+?)(?<!\w)\1(?!\1)(.+?)\1{3}(?!\w)'
""" Match strong emphasis with underscores (`__strong _em___`). """

EM_STRONG_RE = r'(\*)\1{2}(.+?)\1(.*?)\1{2}'
""" Match emphasis strong with asterisk (`***strongem***` or `***em*strong**`). """

EM_STRONG2_RE = r'(_)\1{2}(.+?)\1(.*?)\1{2}'
""" Match emphasis strong with underscores (`___emstrong___` or `___em_strong__`). """

STRONG_EM_RE = r'(\*)\1{2}(.+?)\1{2}(.*?)\1'
""" Match strong emphasis with asterisk (`***strong**em*`). """

STRONG_EM2_RE = r'(_)\1{2}(.+?)\1{2}(.*?)\1'
""" Match strong emphasis with underscores (`___strong__em_`). """

STRONG_EM3_RE = r'(\*)\1(?!\1)([^*]+?)\1(?!\1)(.+?)\1{3}'
""" Match strong emphasis with asterisk (`**strong*em***`). """

LINK_RE = NOIMG + r'\['
""" Match start of in-line link (`[text](url)` or `[text](<url>)` or `[text](url "title")`). """

IMAGE_LINK_RE = r'\!\['
""" Match start of in-line image link (`![alt](url)` or `![alt](<url>)`). """

REFERENCE_RE = LINK_RE
""" Match start of reference link (`[Label][3]`). """

IMAGE_REFERENCE_RE = IMAGE_LINK_RE
""" Match start of image reference (`![alt text][2]`). """

NOT_STRONG_RE = r'((^|(?<=\s))(\*{1,3}|_{1,3})(?=\s|$))'
""" Match a stand-alone `*` or `_`. """

AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^<>]*)>'
""" Match an automatic link (`<http://www.example.com>`). """

AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>'
""" Match an automatic email link (`<me@example.com>`). """

HTML_RE = r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|!--(?:(?!<!--|-->).)*--)>)'
""" Match an HTML tag (`<...>`). """

ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)'
""" Match an HTML entity (`&#38;` (decimal) or `&#x26;` (hex) or `&amp;` (named)). """

# The space count is spelled with an explicit quantifier so that the "two spaces" requirement
# cannot be lost to whitespace normalization; a single trailing space must not yield a line break.
LINE_BREAK_RE = r' {2}\n'
""" Match two spaces at end of line. """
169
170
def dequote(string: str) -> str:
    """
    Remove one pair of matching quotes from around a string.

    The string is only changed when it is at least two characters long and both its first and
    last characters are the same quote character (`"` or `'`). A lone quote character is not a
    quoted string and is returned untouched, as is anything else.

    Arguments:
        string: The text to strip the surrounding quotes from.

    Returns: `string` without its enclosing quotes, or `string` itself when it is not quoted.

    """
    # The length check keeps a single `"` or `'` from counting as both the opening and the closing quote.
    if len(string) >= 2 and string[0] == string[-1] and string[0] in ('"', "'"):
        return string[1:-1]
    return string
178
179
class EmStrongItem(NamedTuple):
    """One entry of an emphasis/strong processor's `PATTERNS` list."""

    # Compiled regular expression recognizing the construct.
    pattern: re.Pattern[str]
    # Name of the builder used for a match: `'single'`, `'double'` or `'double2'`.
    builder: str
    # A tag name, or two comma separated tag names, of the element(s) to build.
    tags: str
185
186
187# The pattern classes
188# -----------------------------------------------------------------------------
189
190
class Pattern:  # pragma: no cover
    """
    Base class that inline patterns subclass.

    Inline patterns are handled by means of `Pattern` subclasses, one per regular expression.
    Each pattern object uses a single regular expression and must support the following methods:
    [`getCompiledRegExp`][markdown.inlinepatterns.Pattern.getCompiledRegExp] and
    [`handleMatch`][markdown.inlinepatterns.Pattern.handleMatch].

    All the regular expressions used by `Pattern` subclasses must capture the whole block. For this
    reason, they all start with `^(.*?)` and end with `(.*)$`. When passing a regular expression on
    class initialization, the `^(.*?)` and `(.*)$` are added automatically and the regular expression
    is pre-compiled. Because of the added leading group, the groups of the regular expression passed
    in start at `m.group(2)`.

    It is strongly suggested that the newer style [`markdown.inlinepatterns.InlineProcessor`][] that
    use a more efficient and flexible search approach be used instead. However, the older style
    `Pattern` remains for backward compatibility with many existing third-party extensions.

    """

    ANCESTOR_EXCLUDES: Collection[str] = tuple()
    """
    A collection of elements which are undesirable ancestors. The processor will be skipped if it
    would cause the content to be a descendant of one of the listed tag names.
    """

    compiled_re: re.Pattern[str]
    md: Markdown | None

    def __init__(self, pattern: str, md: Markdown | None = None):
        """
        Create an instance of an inline pattern.

        Arguments:
            pattern: A regular expression that matches a pattern.
            md: An optional pointer to the instance of `markdown.Markdown` and is available as
                `self.md` on the class instance.

        """
        self.pattern = pattern
        # Wrap the pattern so that a match always spans the whole block.
        self.compiled_re = re.compile(r"^(.*?)%s(.*)$" % pattern,
                                      re.DOTALL | re.UNICODE)

        self.md = md

    def getCompiledRegExp(self) -> re.Pattern[str]:
        """ Return a compiled regular expression. """
        return self.compiled_re

    def handleMatch(self, m: re.Match[str]) -> etree.Element | str:
        """Return a ElementTree element from the given match.

        Subclasses should override this method.

        Arguments:
            m: A match object containing a match of the pattern.

        Returns: An ElementTree Element object.

        """
        pass  # pragma: no cover

    def type(self) -> str:
        """ Return class name, to define pattern type """
        return self.__class__.__name__

    def unescape(self, text: str) -> str:
        """
        Return unescaped text given text with an inline placeholder.

        Each placeholder found in `text` is looked up in the nodes stashed by the `inline` tree
        processor. A stashed string is inserted as is, a stashed element contributes its text
        content only, and a placeholder with no stashed node is dropped.

        Arguments:
            text: Text which may contain inline placeholders.

        Returns: The text with its placeholders resolved, or `text` unchanged when there is no
            `Markdown` instance or no `inline` tree processor to take a stash from.

        """
        if self.md is None:
            # `md` is optional; without it there is no stash to resolve placeholders against.
            return text

        try:
            stash = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m: re.Match[str]) -> str:
            key = m.group(1)
            if key not in stash:
                # Nothing is stashed under this id: the placeholder is dropped from the result.
                return ''
            value = stash[key]
            if isinstance(value, str):
                return value
            # An `etree` Element - return text content only
            return ''.join(value.itertext())

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
275
276
class InlineProcessor(Pattern):
    """
    Base class for the newer style of inline processors.

    In contrast to `Pattern`, the regular expression is compiled exactly as given, so it only has
    to match the construct itself rather than the whole block. This is the more efficient and more
    flexible search approach.

    """

    def __init__(self, pattern: str, md: Markdown | None = None):
        """
        Create an instance of an inline processor.

        Arguments:
            pattern: A regular expression that matches a pattern.
            md: An optional pointer to the instance of `markdown.Markdown` and is available as
                `self.md` on the class instance.

        """
        self.md = md
        # API for Markdown to pass `safe_mode` into instance
        self.safe_mode = False

        self.pattern = pattern
        self.compiled_re = re.compile(pattern, flags=re.DOTALL | re.UNICODE)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str | None, int | None, int | None]:
        """Build the replacement for a match and report the span of text it replaces.

        Returning `None` for `start` and/or `end` signals that the processor
        did not find a valid region of text after all.

        This base implementation does nothing; subclasses should override it.

        Arguments:
            m: A re match object containing a match of the pattern.
            data: The buffer currently under analysis.

        Returns:
            el: The ElementTree element, text or None.
            start: The start of the region that has been matched or None.
            end: The end of the region that has been matched or None.

        """
        pass  # pragma: no cover
323
324
class SimpleTextPattern(Pattern):  # pragma: no cover
    """ Old style pattern whose result is the plain text of `group(2)`. """
    def handleMatch(self, m: re.Match[str]) -> str:
        """ Return the string captured by `group(2)` of the match. """
        captured = m.group(2)
        return captured
330
331
class SimpleTextInlineProcessor(InlineProcessor):
    """ Inline processor whose result is the plain text of `group(1)`. """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]:
        """ Return the string captured by `group(1)` along with the span of the whole match. """
        start, end = m.span(0)
        return m.group(1), start, end
337
338
class EscapeInlineProcessor(InlineProcessor):
    """ Return an escaped character. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str | None, int, int]:
        """
        Replace a backslash escaped character with its protected representation.

        When the character captured by `group(1)` is one of
        [`ESCAPED_CHARS`][markdown.Markdown.ESCAPED_CHARS], the result is the character's Unicode code point
        (as returned by [`ord`][]) wrapped in [`util.STX`][markdown.util.STX] and
        [`util.ETX`][markdown.util.ETX].

        For any other character the result is `None`. The span of the whole match is returned in both cases.
        """

        start, end = m.span(0)
        escaped = m.group(1)
        if escaped not in self.md.ESCAPED_CHARS:
            return None, start, end
        return f'{util.STX}{ord(escaped)}{util.ETX}', start, end
356
357
class SimpleTagPattern(Pattern):  # pragma: no cover
    """
    Old style pattern producing an element of type `tag` whose text is `group(3)`
    of the match.

    """
    def __init__(self, pattern: str, tag: str):
        """
        Create an instance of a simple tag pattern.

        Arguments:
            pattern: A regular expression that matches a pattern.
            tag: Tag of element.

        """
        Pattern.__init__(self, pattern)
        self.tag = tag
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """
        Return a new [`Element`][xml.etree.ElementTree.Element] of type `tag` whose text is the string
        captured by `group(3)` of the match.
        """
        element = etree.Element(self.tag)
        element.text = m.group(3)
        return element
385
386
class SimpleTagInlineProcessor(InlineProcessor):
    """
    Inline processor producing an element of type `tag` whose text is `group(2)`
    of the match.

    """
    def __init__(self, pattern: str, tag: str):
        """
        Create an instance of a simple tag processor.

        Arguments:
            pattern: A regular expression that matches a pattern.
            tag: Tag of element.

        """
        InlineProcessor.__init__(self, pattern)
        self.tag = tag
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:  # pragma: no cover
        """
        Return a new [`Element`][xml.etree.ElementTree.Element] of type `tag` whose text is the string
        captured by `group(2)`, along with the span of the whole match.
        """
        element = etree.Element(self.tag)
        element.text = m.group(2)
        start, end = m.span(0)
        return element, start, end
414
415
class SubstituteTagPattern(SimpleTagPattern):  # pragma: no cover
    """ Old style pattern replacing the match with a childless element of type `tag`. """
    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """ Return a new, empty [`Element`][xml.etree.ElementTree.Element] of type `tag`. """
        replacement = etree.Element(self.tag)
        return replacement
421
422
class SubstituteTagInlineProcessor(SimpleTagInlineProcessor):
    """ Inline processor replacing the match with a childless element of type `tag`. """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """
        Return a new, empty [`Element`][xml.etree.ElementTree.Element] of type `tag` along with the span
        of the whole match.
        """
        start, end = m.span(0)
        return etree.Element(self.tag), start, end
428
429
class BacktickInlineProcessor(InlineProcessor):
    """ Return a `<code>` element containing the escaped matching text. """
    def __init__(self, pattern: str):
        """
        Create an instance of the backtick processor.

        Arguments:
            pattern: A regular expression that matches a pattern.

        """
        InlineProcessor.__init__(self, pattern)
        # Protected form of a backslash: its code point wrapped in `util.STX` and `util.ETX`.
        self.ESCAPED_BSLASH = util.STX + str(ord('\\')) + util.ETX
        self.tag = 'code'
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str, int, int]:
        """
        Turn a match into either a `code` element or protected backslashes.

        When `group(3)` of the match has content, the result is a `code`
        [`Element`][xml.etree.ElementTree.Element] whose text is that content, stripped, HTML escaped (with
        [`code_escape`][markdown.util.code_escape]) and wrapped in an [`AtomicString`][markdown.util.AtomicString].

        Otherwise the result is the text of `group(1)` with every pair of backslashes replaced by the
        protected backslash. The span of the whole match is returned in both cases.

        """
        start, end = m.span(0)
        code = m.group(3)
        if not code:
            return m.group(1).replace('\\\\', self.ESCAPED_BSLASH), start, end

        element = etree.Element(self.tag)
        element.text = util.AtomicString(util.code_escape(code.strip()))
        return element, start, end
453
454
class DoubleTagPattern(SimpleTagPattern):  # pragma: no cover
    """Old style pattern producing an element of `tag2` nested in an element of `tag1`.

    `self.tag` holds both tag names separated by a comma. Useful for strong emphasis etc.

    """
    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """
        Return an [`Element`][xml.etree.ElementTree.Element] in the following format:
        `<tag1><tag2>group(3)</tag2>group(4)</tag1>`, where `group(4)` is only used when the match has
        exactly five groups.

        """
        outer_tag, inner_tag = self.tag.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.SubElement(outer, inner_tag)
        inner.text = m.group(3)
        has_trailing_text = len(m.groups()) == 5
        if has_trailing_text:
            inner.tail = m.group(4)
        return outer
474
475
class DoubleTagInlineProcessor(SimpleTagInlineProcessor):
    """Inline processor producing an element of `tag2` nested in an element of `tag1`.

    `self.tag` holds both tag names separated by a comma. Useful for strong emphasis etc.

    """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:  # pragma: no cover
        """
        Return an [`Element`][xml.etree.ElementTree.Element] in the following format:
        `<tag1><tag2>group(2)</tag2>group(3)</tag1>`, where `group(3)` is only used when the match has
        exactly three groups. The span of the whole match is returned with it.

        """
        outer_tag, inner_tag = self.tag.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.SubElement(outer, inner_tag)
        inner.text = m.group(2)
        has_trailing_text = len(m.groups()) == 3
        if has_trailing_text:
            inner.tail = m.group(3)
        start, end = m.span(0)
        return outer, start, end
495
496
class HtmlInlineProcessor(InlineProcessor):
    """ Store raw inline html and return a placeholder. """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]:
        """
        Stash the text of `group(1)` and return the placeholder that stands in for it.

        Inline placeholders and backslash escapes inside the text are undone before it is stored in
        `self.md.htmlStash`. The span of the whole match is returned with the placeholder.
        """
        restored = self.unescape(m.group(1))
        rawhtml = self.backslash_unescape(restored)
        start, end = m.span(0)
        return self.md.htmlStash.store(rawhtml), start, end

    def unescape(self, text: str) -> str:
        """
        Return unescaped text given text with an inline placeholder.

        Each stashed node is run through `self.md.serializer`; if that raises, the node is inserted
        formatted with `%s` and prefixed by a backslash. Without an `inline` tree processor the text
        is returned unchanged.
        """
        try:
            stashed = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m: re.Match[str]) -> str | None:
            node = stashed.get(m.group(1))
            if node is None:
                # Nothing stashed under this id: yield `None`, just as falling off the end would.
                return None
            try:
                return self.md.serializer(node)
            except Exception:
                return r'\%s' % node

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

    def backslash_unescape(self, text: str) -> str:
        """
        Return text with backslash escapes undone (backslashes are restored).

        Each match of the `unescape` tree processor's `RE` is replaced by the character whose code
        point is captured in `group(1)`. Without that tree processor the text is returned unchanged.
        """
        try:
            escaped_re = self.md.treeprocessors['unescape'].RE
        except KeyError:  # pragma: no cover
            return text

        return escaped_re.sub(lambda found: chr(int(found.group(1))), text)
534
535
class AsteriskProcessor(InlineProcessor):
    """Emphasis processor for handling strong and em matches inside asterisks."""

    PATTERNS = [
        EmStrongItem(re.compile(regex, re.DOTALL | re.UNICODE), builder, tags)
        for regex, builder, tags in (
            (EM_STRONG_RE, 'double', 'strong,em'),
            (STRONG_EM_RE, 'double', 'em,strong'),
            (STRONG_EM3_RE, 'double2', 'strong,em'),
            (STRONG_RE, 'single', 'strong'),
            (EMPHASIS_RE, 'single', 'em'),
        )
    ]
    """ The various strong and emphasis patterns handled by this processor. """

    def build_single(self, m: re.Match[str], tag: str, idx: int) -> etree.Element:
        """Return a single `tag` element whose content is `group(2)`, parsed for sub patterns."""
        element = etree.Element(tag)
        self.parse_sub_patterns(m.group(2), element, None, idx)
        return element

    def build_double(self, m: re.Match[str], tags: str, idx: int) -> etree.Element:
        """
        Return double tags: `<tag1><tag2>group(2)</tag2>group(3)</tag1>`.

        `tags` holds both tag names separated by a comma. `group(3)` is only processed when the match
        has exactly three groups.
        """

        outer_tag, inner_tag = tags.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.Element(inner_tag)
        self.parse_sub_patterns(m.group(2), inner, None, idx)
        outer.append(inner)
        if len(m.groups()) == 3:
            # Whatever follows the inner element is attached after it, inside the outer element.
            self.parse_sub_patterns(m.group(3), outer, inner, idx)
        return outer

    def build_double2(self, m: re.Match[str], tags: str, idx: int) -> etree.Element:
        """Return double tags (variant 2): `<strong>text <em>text</em></strong>`."""

        outer_tag, inner_tag = tags.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.Element(inner_tag)
        # `group(2)` is the leading content of the outer element, `group(3)` the content of the inner one.
        self.parse_sub_patterns(m.group(2), outer, None, idx)
        outer.append(inner)
        self.parse_sub_patterns(m.group(3), inner, None, idx)
        return outer

    def parse_sub_patterns(
        self, data: str, parent: etree.Element, last: etree.Element | None, idx: int
    ) -> None:
        """
        Parses sub patterns.

        Scans `data` for nested emphasis/strong constructs, appends an element to `parent` for each
        one found, and stores the text in between on the surrounding nodes.

        `data`: text to evaluate.

        `parent`: Parent to attach text and sub elements to.

        `last`: Last appended child to parent. Can also be None if parent has no children.

        `idx`: Current pattern index that was used to evaluate the parent.
        """

        def attach(text: str, previous: etree.Element | None) -> None:
            # Text belongs to the tail of the previous child if there is one,
            # and otherwise becomes the parent's own text. Empty text is ignored.
            if not text:
                return
            if previous is not None:
                previous.tail = text
            else:
                parent.text = text

        offset = pos = 0
        length = len(data)

        while pos < length:
            if not self.compiled_re.match(data, pos):
                # No potential emphasis or strong token starts here.
                pos += 1
                continue

            matched = False
            for index, item in enumerate(self.PATTERNS):
                # Only evaluate patterns that are after what was used on the parent
                if index <= idx:
                    continue
                m = item.pattern.match(data, pos)
                if not m:
                    continue
                attach(data[offset:m.start(0)], last)
                el = self.build_element(m, item.builder, item.tags, index)
                parent.append(el)
                last = el
                # Continue right after the matched hunk
                offset = pos = m.end(0)
                matched = True

            if not matched:
                # Nothing matched at this token, move on to the next character
                pos += 1

        # Whatever is left over becomes a text node.
        attach(data[offset:], last)

    def build_element(self, m: re.Match[str], builder: str, tags: str, index: int) -> etree.Element:
        """Build the element for a match with the named builder; unknown names use the single builder."""

        builders = {
            'double2': self.build_double2,
            'double': self.build_double,
        }
        build = builders.get(builder, self.build_single)
        return build(m, tags, index)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Parse patterns.

        Each item of `PATTERNS` is tried in order at the position where `m` starts. The first one that
        matches determines the returned element and span; `(None, None, None)` is returned when none does.
        """

        begin = m.start(0)
        for index, item in enumerate(self.PATTERNS):
            found = item.pattern.match(data, begin)
            if found:
                start, end = found.span(0)
                return self.build_element(found, item.builder, item.tags, index), start, end
        return None, None, None
668
669
class UnderscoreProcessor(AsteriskProcessor):
    """Emphasis processor for handling strong and em matches inside underscores."""

    PATTERNS = [
        EmStrongItem(re.compile(regex, re.DOTALL | re.UNICODE), builder, tags)
        for regex, builder, tags in (
            (EM_STRONG2_RE, 'double', 'strong,em'),
            (STRONG_EM2_RE, 'double', 'em,strong'),
            (SMART_STRONG_EM_RE, 'double2', 'strong,em'),
            (SMART_STRONG_RE, 'single', 'strong'),
            (SMART_EMPHASIS_RE, 'single', 'em'),
        )
    ]
    """ The various strong and emphasis patterns handled by this processor. """
681
682
class LinkInlineProcessor(InlineProcessor):
    """
    Return a link element from the given match.

    The processor's own pattern only finds the opening `[`. The link text and the parenthesized
    destination are then parsed by hand in `getText` and `getLink`, so that nested brackets and
    parentheses can be handled.
    """
    # Matches the opening `(` and any whitespace after it. Optionally continues with a complete
    # `<url>` (group 1), an optional quoted title (group 2) and the closing `)`.
    RE_LINK = re.compile(r'''\(\s*(?:(<[^<>]*>)\s*(?:('[^']*'|"[^"]*")\s*)?\))?''', re.DOTALL | re.UNICODE)
    # Matches a single whitespace character; each one found in a title is replaced with a space.
    RE_TITLE_CLEAN = re.compile(r'\s')

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Return an `a` [`Element`][xml.etree.ElementTree.Element] or `(None, None, None)`.

        Arguments:
            m: A re match object whose end marks where the link text begins.
            data: The buffer currently under analysis.

        Returns: The element together with the start of the match and the index at which the
            parsed link ends, or `(None, None, None)` when either the text or the destination
            could not be parsed.
        """
        text, index, handled = self.getText(data, m.end(0))

        if not handled:
            return None, None, None

        href, title, index, handled = self.getLink(data, index)
        if not handled:
            return None, None, None

        el = etree.Element("a")
        el.text = text

        el.set("href", href)

        if title is not None:
            el.set("title", title)

        return el, m.start(0), index

    def getLink(self, data: str, index: int) -> tuple[str, str | None, int, bool]:
        """
        Parse data between `()` of `[Text]()` allowing recursive `()`.

        Arguments:
            data: The buffer currently under analysis.
            index: The position in `data` at which the opening `(` is expected.

        Returns:
            href: The link destination, with inline placeholders resolved and surrounding whitespace stripped.
            title: The link title, or `None` if no title was found.
            index: The position in `data` at which parsing stopped.
            handled: Whether a complete, closed `(...)` section was found.
        """

        href = ''
        title: str | None = None
        handled = False

        m = self.RE_LINK.match(data, pos=index)
        if m and m.group(1):
            # Matches [Text](<link> "title")
            # The angle brackets around the link and the quotes around the title are sliced off.
            href = m.group(1)[1:-1].strip()
            if m.group(2):
                title = m.group(2)[1:-1]
            index = m.end(0)
            handled = True
        elif m:
            # Only the opening `(` was matched; scan the rest character by character.
            # Track bracket nesting and index in string
            bracket_count = 1
            backtrack_count = 1
            start_index = m.end()
            index = start_index
            last_bracket = -1

            # Primary (first found) quote tracking.
            quote: str | None = None
            start_quote = -1
            exit_quote = -1
            ignore_matches = False

            # Secondary (second found) quote tracking.
            alt_quote = None
            start_alt_quote = -1
            exit_alt_quote = -1

            # Track last character
            last = ''

            # `index` is advanced in step with `pos`. Inside the `if`/`elif` chain below it has not
            # been incremented for the current character yet, so `index + 1` is the position just past it.
            for pos in range(index, len(data)):
                c = data[pos]
                if c == '(':
                    # Count nested (
                    # Don't increment the bracket count if we are sure we're in a title.
                    if not ignore_matches:
                        bracket_count += 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                elif c == ')':
                    # Match nested ) to (
                    # Don't decrement if we are sure we are in a title that is unclosed.
                    if ((exit_quote != -1 and quote == last) or (exit_alt_quote != -1 and alt_quote == last)):
                        bracket_count = 0
                    elif not ignore_matches:
                        bracket_count -= 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                        # We've found our backup end location if the title doesn't resolve.
                        if backtrack_count == 0:
                            last_bracket = index + 1

                elif c in ("'", '"'):
                    # Quote has started
                    if not quote:
                        # We'll assume we are now in a title.
                        # Brackets are quoted, so no need to match them (except for the final one).
                        ignore_matches = True
                        backtrack_count = bracket_count
                        bracket_count = 1
                        start_quote = index + 1
                        quote = c
                    # Secondary quote (in case the first doesn't resolve): [text](link'"title")
                    elif c != quote and not alt_quote:
                        start_alt_quote = index + 1
                        alt_quote = c
                    # Update primary quote match
                    elif c == quote:
                        exit_quote = index + 1
                    # Update secondary quote match
                    elif alt_quote and c == alt_quote:
                        exit_alt_quote = index + 1

                index += 1

                # Link is closed, so let's break out of the loop
                if bracket_count == 0:
                    # Get the title if we closed a title string right before link closed
                    if exit_quote >= 0 and quote == last:
                        href = data[start_index:start_quote - 1]
                        title = ''.join(data[start_quote:exit_quote - 1])
                    elif exit_alt_quote >= 0 and alt_quote == last:
                        href = data[start_index:start_alt_quote - 1]
                        title = ''.join(data[start_alt_quote:exit_alt_quote - 1])
                    else:
                        href = data[start_index:index - 1]
                    break

                # Remember the most recent character that is not a space.
                if c != ' ':
                    last = c

            # We have a scenario: `[test](link"notitle)`
            # When we enter a string, we stop tracking bracket resolution in the main counter,
            # but we do keep a backup counter up until we discover where we might resolve all brackets
            # if the title string fails to resolve.
            if bracket_count != 0 and backtrack_count == 0:
                href = data[start_index:last_bracket - 1]
                index = last_bracket
                bracket_count = 0

            handled = bracket_count == 0

        if title is not None:
            # Strip, resolve placeholders, remove enclosing quotes and turn each whitespace character into a space.
            title = self.RE_TITLE_CLEAN.sub(' ', dequote(self.unescape(title.strip())))

        href = self.unescape(href).strip()

        return href, title, index, handled

    def getText(self, data: str, index: int) -> tuple[str, int, bool]:
        """Parse the content between `[]` of the start of an image or link
        resolving nested square brackets.

        Arguments:
            data: The buffer currently under analysis.
            index: The position in `data` right after the opening `[`.

        Returns:
            text: The characters collected before the closing `]` (or up to the end of `data`).
            index: The position right after the last character examined.
            handled: Whether the closing `]` was found.

        """
        # The opening `[` has already been consumed by the caller, hence the count starts at one.
        bracket_count = 1
        text = []
        for pos in range(index, len(data)):
            c = data[pos]
            if c == ']':
                bracket_count -= 1
            elif c == '[':
                bracket_count += 1
            index += 1
            if bracket_count == 0:
                # The `]` closing the text is not part of the text itself.
                break
            text.append(c)
        return ''.join(text), index, bracket_count == 0
843
844
class ImageInlineProcessor(LinkInlineProcessor):
    """ Return a `img` element from the given match. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Return an `img` [`Element`][xml.etree.ElementTree.Element] or `(None, None, None)`.

        The text between the square brackets becomes the `alt` attribute (with inline placeholders
        resolved) and the parsed destination becomes `src`. A `title` attribute is only set when a
        title was found. `(None, None, None)` is returned when either part cannot be parsed.
        """
        alt, index, handled = self.getText(data, m.end(0))
        if handled:
            src, title, index, handled = self.getLink(data, index)
        if not handled:
            return None, None, None

        image = etree.Element("img")
        image.set("src", src)
        if title is not None:
            image.set("title", title)
        image.set('alt', self.unescape(alt))

        return image, m.start(0), index
867
868
class ReferenceInlineProcessor(LinkInlineProcessor):
    """ Match to a stored reference and return link element. """
    NEWLINE_CLEANUP_RE = re.compile(r'\s+', re.MULTILINE)

    RE_LINK = re.compile(r'\s?\[([^\]]*)\]', re.DOTALL | re.UNICODE)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Return the [`Element`][xml.etree.ElementTree.Element] built by the `makeTag` method.

        `(None, None, None)` is returned when the text or the id cannot be parsed. When the id is not
        among `self.md.references`, `None` is returned together with the span that was examined.

        """
        text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return None, None, None

        ref_id, end, handled = self.evalId(data, index, text)
        if not handled:
            return None, None, None

        # Collapse each run of whitespace (line breaks included) in the id into one space
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)
        start = m.start(0)
        known = self.md.references
        if ref_id not in known:  # ignore undefined refs
            return None, start, end

        href, title = known[ref_id]
        return self.makeTag(href, title, text), start, end

    def evalId(self, data: str, index: int, text: str) -> tuple[str | None, int, bool]:
        """
        Evaluate the id portion of `[ref][id]`.

        The id is lower cased. An empty id, as in `[ref][]`, falls back to the lower cased `[ref]` text.
        When no `[id]` follows at `index`, `(None, index, False)` is returned.
        """
        found = self.RE_LINK.match(data, pos=index)
        if not found:
            return None, index, False

        ref_id = found.group(1).lower() or text.lower()
        return ref_id, found.end(0), True

    def makeTag(self, href: str, title: str, text: str) -> etree.Element:
        """
        Return an `a` [`Element`][xml.etree.ElementTree.Element].

        The element gets `href`, a `title` attribute only when `title` is truthy, and `text` as its text.
        """
        anchor = etree.Element('a')
        anchor.set('href', href)
        if title:
            anchor.set('title', title)
        anchor.text = text
        return anchor
923
924
class ShortReferenceInlineProcessor(ReferenceInlineProcessor):
    """Short form of reference: `[google]`. """
    def evalId(self, data: str, index: int, text: str) -> tuple[str, int, bool]:
        """Use the lower cased text of `[ref]` as the id; `index` is passed through unchanged. """

        ref_id = text.lower()
        return ref_id, index, True
931
932
class ImageReferenceInlineProcessor(ReferenceInlineProcessor):
    """ Match to a stored reference and return `img` element. """
    def makeTag(self, href: str, title: str, text: str) -> etree.Element:
        """
        Return an `img` [`Element`][xml.etree.ElementTree.Element].

        `href` becomes `src`, `title` is only set when truthy, and `text` (with inline placeholders
        resolved) becomes `alt`.
        """
        image = etree.Element("img")
        image.set("src", href)
        if title:
            image.set("title", title)
        alt = self.unescape(text)
        image.set("alt", alt)
        return image
943
944
class ShortImageReferenceInlineProcessor(ImageReferenceInlineProcessor):
    """ Short form of image reference: `![ref]`. """
    def evalId(self, data: str, index: int, text: str) -> tuple[str, int, bool]:
        """Use the lower cased text of `![ref]` as the id; `index` is passed through unchanged. """

        ref_id = text.lower()
        return ref_id, index, True
951
952
class AutolinkInlineProcessor(InlineProcessor):
    """ Return a link Element given an auto-link (`<http://example/com>`). """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """
        Return an `a` [`Element`][xml.etree.ElementTree.Element] of `group(1)`.

        The `href` is `group(1)` with inline placeholders resolved; the link text is `group(1)` itself,
        wrapped in an [`AtomicString`][markdown.util.AtomicString].
        """
        url = m.group(1)
        anchor = etree.Element("a")
        anchor.set('href', self.unescape(url))
        anchor.text = util.AtomicString(url)
        start, end = m.span(0)
        return anchor, start, end
961
962
class AutomailInlineProcessor(InlineProcessor):
    """
    Return a `mailto` link Element given an auto-mail link (`<foo@example.com>`).
    """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """
        Return an [`Element`][xml.etree.ElementTree.Element] containing a `mailto` link of `group(1)`.

        Every character of the address is written as a character reference that starts with
        `util.AMP_SUBSTITUTE`: named where `html.entities` knows a name (link text only), numeric otherwise.
        """
        scheme = "mailto:"
        email = self.unescape(m.group(1))
        if email.startswith(scheme):
            email = email[len(scheme):]

        def as_reference(code: int) -> str:
            """Return entity definition by code, or the code if not defined."""
            name = entities.codepoint2name.get(code)
            if name:
                return f"{util.AMP_SUBSTITUTE}{name};"
            return f"{util.AMP_SUBSTITUTE}#{code};"

        anchor = etree.Element('a')
        anchor.text = util.AtomicString(''.join(as_reference(ord(char)) for char in email))

        # The `href` uses numeric references only.
        href = ''.join(f"{util.AMP_SUBSTITUTE}#{ord(char)};" for char in scheme + email)
        anchor.set('href', href)

        start, end = m.span(0)
        return anchor, start, end