1"""Tokenizes paragraph content."""
2
3from __future__ import annotations
4
5from collections.abc import Callable
6import functools
7import re
8from typing import TYPE_CHECKING
9
10from . import rules_inline
11from .ruler import Ruler
12from .rules_inline.state_inline import StateInline
13from .token import Token
14from .utils import EnvType
15
16if TYPE_CHECKING:
17 from markdown_it import MarkdownIt
18
19
# Characters at which the ``text`` rule stops by default, so that the other
# inline rules get a chance to fire.
# '{}$%@~+=:' reserved for extensions.
# Note: this is not the same thing as the "Markdown ASCII Punctuation" chars.
# http://spec.commonmark.org/0.15/#ascii-punctuation-character
_DEFAULT_TERMINATORS: frozenset[str] = frozenset("\n!#$%&*+-:<=>@[\\]^_`{}~")
51
52
# The regex for the default terminator set is built lazily. Because of
# @functools.cache it is compiled on the first call only (ParserInline.__init__
# makes that call) and the same Pattern object is returned to every later caller.
@functools.cache
def _default_terminator_re() -> re.Pattern[str]:
    """Return the compiled character class matching any default terminator."""
    escaped_chars = re.escape("".join(_DEFAULT_TERMINATORS))
    return re.compile(f"[{escaped_chars}]")
59
60
# Parser rules
RuleFuncInlineType = Callable[[StateInline, bool], bool]
"""(state: StateInline, silent: bool) -> matched: bool

`silent` disables token generation, useful for lookahead.
"""
# (name, rule) pairs; ``ParserInline.__init__`` pushes them onto its ``ruler``
# in the order listed here.
_rules: list[tuple[str, RuleFuncInlineType]] = [
    ("text", rules_inline.text),
    ("linkify", rules_inline.linkify),
    ("newline", rules_inline.newline),
    ("escape", rules_inline.escape),
    ("backticks", rules_inline.backtick),
    ("strikethrough", rules_inline.strikethrough.tokenize),
    ("emphasis", rules_inline.emphasis.tokenize),
    ("link", rules_inline.link),
    ("image", rules_inline.image),
    ("autolink", rules_inline.autolink),
    ("html_inline", rules_inline.html_inline),
    ("entity", rules_inline.entity),
]
81
# Note: the `ruler2` ruleset below was created specifically for
# emphasis/strikethrough post-processing and may be changed in the future.
#
# Don't use this for anything except pairs (plugins working with `balance_pairs`).
#
# A post-processing rule receives the inline state only and returns nothing.
RuleFuncInline2Type = Callable[[StateInline], None]
# (name, rule) pairs; ``ParserInline.__init__`` pushes them onto its ``ruler2``
# in the order listed here.
_rules2: list[tuple[str, RuleFuncInline2Type]] = [
    ("balance_pairs", rules_inline.link_pairs),
    ("strikethrough", rules_inline.strikethrough.postProcess),
    ("emphasis", rules_inline.emphasis.postProcess),
    # The pair rules split markers such as '**' into their own text tokens, some
    # of which may end up unused; the rule below merges those unused segments
    # back into the surrounding text.
    ("fragments_join", rules_inline.fragments_join),
]
96
97
class ParserInline:
    """Tokenizer for inline (paragraph-level) content.

    Holds two rule chains: ``ruler`` with the tokenizing rules tried at each
    input position, and ``ruler2`` with the post-processing rules that
    :meth:`parse` runs once tokenizing is finished. ``terminator_re`` is the
    compiled character class of the characters that stop the ``text`` rule.
    """

    def __init__(self) -> None:
        self.ruler = Ruler[RuleFuncInlineType]()
        for name, rule in _rules:
            self.ruler.push(name, rule)
        # Second ruler used for post-processing (e.g. in emphasis-like rules)
        self.ruler2 = Ruler[RuleFuncInline2Type]()
        for name, rule2 in _rules2:
            self.ruler2.push(name, rule2)
        # Characters that stop the text rule, allowing other inline rules to fire.
        # This set only holds characters registered through add_terminator_char()
        # that are not already in _DEFAULT_TERMINATORS; it starts out empty.
        self._extra_terminator_chars: set[str] = set()
        # Until an extra char is registered, the instance reuses the cached,
        # pre-compiled default pattern instead of compiling its own.
        self.terminator_re: re.Pattern[str] = _default_terminator_re()

    def add_terminator_char(self, ch: str) -> None:
        """Register a character that stops the ``text`` rule, allowing inline rules to fire.

        This lets plugins declare which characters their inline rules react to,
        mirroring the ``MARKER`` mechanism in the Rust markdown-it implementation.

        Registering a default terminator, or a character that was already
        registered on this instance, is a no-op. Otherwise ``terminator_re`` is
        recompiled for this instance only.

        :param ch: A single character to add to the terminator set
            (the value is not validated here).
        """
        if ch in _DEFAULT_TERMINATORS or ch in self._extra_terminator_chars:
            return
        self._extra_terminator_chars.add(ch)
        self.terminator_re = re.compile(
            "["
            + re.escape("".join(_DEFAULT_TERMINATORS | self._extra_terminator_chars))
            + "]"
        )

    def skipToken(self, state: StateInline) -> None:
        """Skip a single token by running all rules in validation (silent) mode.

        Nothing is returned: the outcome is the new ``state.pos``. The end
        position reached from a given start position is memoised in
        ``state.cache`` and reused on later calls.
        """
        ok = False
        pos = state.pos
        rules = self.ruler.getRules("")
        maxNesting = state.md.options["maxNesting"]
        cache = state.cache

        if pos in cache:
            state.pos = cache[pos]
            return

        if state.level < maxNesting:
            for rule in rules:
                # Increment state.level and decrement it later to limit recursion.
                # It's harmless to do here, because no tokens are created.
                # But ideally, we'd need a separate private state variable for this purpose.
                state.level += 1
                ok = rule(state, True)
                state.level -= 1
                if ok:
                    break
        else:
            # Too much nesting, just skip until the end of the paragraph.
            #
            # NOTE: this will cause links to behave incorrectly in the following case,
            #       when an amount of `[` is exactly equal to `maxNesting + 1`:
            #
            #       [[[[[[[[[[[[[[[[[[[[[foo]()
            #
            # TODO: remove this workaround when CM standard will allow nested links
            #       (we can replace it by preventing links from being parsed in
            #       validation mode)
            #
            state.pos = state.posMax

        if not ok:
            # No rule matched (or nesting was too deep): step over one character.
            state.pos += 1
        cache[pos] = state.pos

    def tokenize(self, state: StateInline) -> None:
        """Generate tokens for input range.

        Walks from ``state.pos`` to ``state.posMax``. At each position the rules
        are tried in order; when none matches (or the nesting limit is reached)
        the current character is appended to ``state.pending``.
        """
        rules = self.ruler.getRules("")
        end = state.posMax
        maxNesting = state.md.options["maxNesting"]

        while state.pos < end:
            # Try all possible rules.
            # On success, rule should:
            #
            # - update `state.pos`
            # - update `state.tokens`
            # - return true

            # Reset on every iteration: when the rule loop below is skipped
            # because of the nesting limit, a success left over from the previous
            # position must not be mistaken for a match here (that would
            # `continue` forever without advancing `state.pos`).
            ok = False

            if state.level < maxNesting:
                for rule in rules:
                    ok = rule(state, False)
                    if ok:
                        break

            if ok:
                if state.pos >= end:
                    break
                continue

            state.pending += state.src[state.pos]
            state.pos += 1

        if state.pending:
            state.pushPending()

    def parse(
        self, src: str, md: MarkdownIt, env: EnvType, tokens: list[Token]
    ) -> list[Token]:
        """Process input string and push inline tokens into `tokens`.

        Tokenizes ``src`` with the main rule chain, then applies every
        post-processing rule from ``ruler2`` to the same state.

        :return: ``state.tokens`` of the inline state built for this call.
        """
        state = StateInline(src, md, env, tokens)
        self.tokenize(state)
        rules2 = self.ruler2.getRules("")
        for rule in rules2:
            rule(state)
        return state.tokens