Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown_it/parser_inline.py: 95%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

82 statements  

1"""Tokenizes paragraph content.""" 

2 

3from __future__ import annotations 

4 

5from collections.abc import Callable 

6import functools 

7import re 

8from typing import TYPE_CHECKING 

9 

10from . import rules_inline 

11from .ruler import Ruler 

12from .rules_inline.state_inline import StateInline 

13from .token import Token 

14from .utils import EnvType 

15 

16if TYPE_CHECKING: 

17 from markdown_it import MarkdownIt 

18 

19 

# Characters that end a plain-text run by default, giving the inline rules a
# chance to match at that position.
# '{}$%@~+=:' are reserved for extensions.
# Note: this is not the same set as the "Markdown ASCII Punctuation" characters.
# http://spec.commonmark.org/0.15/#ascii-punctuation-character
_DEFAULT_TERMINATORS: frozenset[str] = frozenset("\n!#$%&*+-:<=>@[\\]^_`{}~")

51 

52 

# The regex for the default terminator set is built on demand. Because of
# @functools.cache the compilation happens at most once; every later call
# (ParserInline.__init__ calls this) gets the same compiled pattern object back.
@functools.cache
def _default_terminator_re() -> re.Pattern[str]:
    """Return the compiled character class matching any default terminator character."""
    escaped_chars = re.escape("".join(_DEFAULT_TERMINATORS))
    return re.compile(f"[{escaped_chars}]")

59 

60 

# Parser rules
RuleFuncInlineType = Callable[[StateInline, bool], bool]
"""(state: StateInline, silent: bool) -> matched: bool

`silent` disables token generation, useful for lookahead.
"""
# Ordered (name, rule) pairs; ParserInline.__init__ pushes them onto its
# primary ruler in exactly this order.
_rules: list[tuple[str, RuleFuncInlineType]] = [
    ("text", rules_inline.text),
    ("linkify", rules_inline.linkify),
    ("newline", rules_inline.newline),
    ("escape", rules_inline.escape),
    ("backticks", rules_inline.backtick),
    ("strikethrough", rules_inline.strikethrough.tokenize),
    ("emphasis", rules_inline.emphasis.tokenize),
    ("link", rules_inline.link),
    ("image", rules_inline.image),
    ("autolink", rules_inline.autolink),
    ("html_inline", rules_inline.html_inline),
    ("entity", rules_inline.entity),
]

81 

# The `rule2` ruleset exists specifically for emphasis/strikethrough
# post-processing and may change in the future.
#
# Do not use it for anything except pairs (plugins working with `balance_pairs`).
#
RuleFuncInline2Type = Callable[[StateInline], None]
# Ordered (name, rule) pairs; ParserInline.__init__ pushes them onto its
# second (post-processing) ruler in exactly this order.
_rules2: list[tuple[str, RuleFuncInline2Type]] = [
    ("balance_pairs", rules_inline.link_pairs),
    ("strikethrough", rules_inline.strikethrough.postProcess),
    ("emphasis", rules_inline.emphasis.postProcess),
    # The pair rules split '**' into separate text tokens, some of which may end
    # up unused; the rule below merges those leftovers back into the text.
    ("fragments_join", rules_inline.fragments_join),
]

96 

97 

class ParserInline:
    """Inline tokenizer.

    Holds two rulers: ``ruler`` contains the tokenizing rules that are tried at
    each position of the input, and ``ruler2`` contains post-processing rules
    that run once over the state after tokenization.
    """

    def __init__(self) -> None:
        self.ruler = Ruler[RuleFuncInlineType]()
        for name, rule in _rules:
            self.ruler.push(name, rule)
        # Second ruler used for post-processing (e.g. in emphasis-like rules)
        self.ruler2 = Ruler[RuleFuncInline2Type]()
        for name, rule2 in _rules2:
            self.ruler2.push(name, rule2)
        # Characters registered through add_terminator_char() that are not part
        # of the default terminator set. Starts out empty.
        self._extra_terminator_chars: set[str] = set()
        # Regex matching any terminator character. Until an extra char is added
        # this is the cached pattern returned by _default_terminator_re().
        self.terminator_re: re.Pattern[str] = _default_terminator_re()

    def add_terminator_char(self, ch: str) -> None:
        """Register a character that stops the ``text`` rule, allowing inline rules to fire.

        This lets plugins declare which characters their inline rules react to,
        mirroring the ``MARKER`` mechanism in the Rust markdown-it implementation.

        Calling this with a default terminator, or with a character that was
        already registered, is a no-op. Otherwise ``terminator_re`` is replaced
        by a freshly compiled pattern covering the defaults plus all extras.

        :param ch: A single character to add to the terminator set.
        """
        if ch not in _DEFAULT_TERMINATORS and ch not in self._extra_terminator_chars:
            self._extra_terminator_chars.add(ch)
            self.terminator_re = re.compile(
                "["
                + re.escape(
                    "".join(_DEFAULT_TERMINATORS | self._extra_terminator_chars)
                )
                + "]"
            )

    def skipToken(self, state: StateInline) -> None:
        """Skip a single token by running all rules in validation (silent) mode.

        The method returns nothing; its effect is to advance ``state.pos``.
        If no rule reports success the position is advanced by one. The
        resulting position is stored in ``state.cache`` keyed by the starting
        position, so a later call from the same position is answered from the
        cache without running any rule.
        """
        ok = False
        pos = state.pos
        rules = self.ruler.getRules("")
        maxNesting = state.md.options["maxNesting"]
        cache = state.cache

        if pos in cache:
            state.pos = cache[pos]
            return

        if state.level < maxNesting:
            for rule in rules:
                # Increment state.level and decrement it later to limit recursion.
                # It's harmless to do here, because no tokens are created.
                # But ideally, we'd need a separate private state variable for this purpose.
                state.level += 1
                ok = rule(state, True)
                state.level -= 1
                if ok:
                    break
        else:
            # Too much nesting, just skip until the end of the paragraph.
            #
            # NOTE: this will cause links to behave incorrectly in the following case,
            #       when an amount of `[` is exactly equal to `maxNesting + 1`:
            #
            #       [[[[[[[[[[[[[[[[[[[[[foo]()
            #
            # TODO: remove this workaround when CM standard will allow nested links
            #       (we can replace it by preventing links from being parsed in
            #       validation mode)
            #
            state.pos = state.posMax

        # `ok` is still False when the nesting limit branch above was taken,
        # so the increment below applies in that case as well.
        if not ok:
            state.pos += 1
        cache[pos] = state.pos

    def tokenize(self, state: StateInline) -> None:
        """Generate tokens for input range.

        Walks from ``state.pos`` to ``state.posMax``. At each position the rules
        are tried in order (unless the nesting limit is reached); when none
        matches, the current character is appended to ``state.pending``. Any
        remaining pending text is flushed with ``state.pushPending()`` at the end.
        """
        rules = self.ruler.getRules("")
        end = state.posMax
        maxNesting = state.md.options["maxNesting"]

        while state.pos < end:
            # Try all possible rules.
            # On success, rule should:
            #
            # - update `state.pos`
            # - update `state.tokens`
            # - return true

            # Reset on every iteration: when the nesting limit prevents the rules
            # from running, a success left over from an earlier position must not
            # be reused, otherwise the loop would `continue` without advancing.
            ok = False

            if state.level < maxNesting:
                for rule in rules:
                    ok = rule(state, False)
                    if ok:
                        break

            if ok:
                if state.pos >= end:
                    break
                continue

            state.pending += state.src[state.pos]
            state.pos += 1

        if state.pending:
            state.pushPending()

    def parse(
        self, src: str, md: MarkdownIt, env: EnvType, tokens: list[Token]
    ) -> list[Token]:
        """Process input string and push inline tokens into `tokens`.

        Builds a ``StateInline`` for the input, tokenizes it, then runs every
        rule of ``ruler2`` over the state. Returns ``state.tokens``.
        """
        state = StateInline(src, md, env, tokens)
        self.tokenize(state)
        rules2 = self.ruler2.getRules("")
        for rule in rules2:
            rule(state)
        return state.tokens