Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/mistune/inline_parser.py: 99% (237 statements)

import re
from typing import (
    Any,
    Dict,
    List,
    Match,
    MutableMapping,
    Optional,
)

from .core import InlineState, Parser
from .helpers import (
    HTML_ATTRIBUTES,
    HTML_TAGNAME,
    PREVENT_BACKSLASH,
    PUNCTUATION,
    parse_link,
    parse_link_label,
    parse_link_text,
    unescape_char,
)
from .util import escape_url, unikey

PAREN_END_RE = re.compile(r"\s*\)")

AUTO_EMAIL = (
    r"""<[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9]"""
    r"(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?"
    r"(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*>"
)

INLINE_HTML = (
    r"<" + HTML_TAGNAME + HTML_ATTRIBUTES + r"\s*/?>|"  # open tag
    r"</" + HTML_TAGNAME + r"\s*>|"  # close tag
    r"<!--(?!>|->)(?:(?!--)[\s\S])+?(?<!-)-->|"  # comment
    r"<\?[\s\S]+?\?>|"  # script like <?php?>
    r"<![A-Z][\s\S]+?>|"  # doctype
    r"<!\[CDATA[\s\S]+?\]\]>"  # cdata
)

EMPHASIS_END_RE = {
    "*": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\\*|[^\s*])\*(?!\*)"),
    "_": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\_|[^\s_])_(?!_)\b"),
    "**": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\\*|[^\s*])\*\*(?!\*)"),
    "__": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\_|[^\s_])__(?!_)\b"),
    "***": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\\*|[^\s*])\*\*\*(?!\*)"),
    "___": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\_|[^\s_])___(?!_)\b"),
}


class InlineParser(Parser[InlineState]):
    sc_flag = 0
    state_cls = InlineState

    #: linebreak leaves two spaces at the end of line
    STD_LINEBREAK = r"(?:\\| {2,})\n\s*"

    #: every new line becomes <br>
    HARD_LINEBREAK = r" *\n\s*"

    # we only need to find the start pattern of an inline token
    SPECIFICATION = {
        # e.g. \`, \$
        "escape": r"(?:\\" + PUNCTUATION + ")+",
        # `code, ```code
        "codespan": r"`{1,}",
        # *w, **w, _w, __w
        "emphasis": r"\*{1,3}(?=[^\s*])|\b_{1,3}(?=[^\s_])",
        # [link], ![img]
        "link": r"!?\[",
        # <https://example.com>. regex copied from commonmark.js
        "auto_link": r"<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>",
        "auto_email": AUTO_EMAIL,
        "inline_html": INLINE_HTML,
        "linebreak": STD_LINEBREAK,
        "softbreak": HARD_LINEBREAK,
        "prec_auto_link": r"<[A-Za-z][A-Za-z\d.+-]{1,31}:",
        "prec_inline_html": r"</?" + HTML_TAGNAME + r"|<!|<\?",
    }
    DEFAULT_RULES = (
        "escape",
        "codespan",
        "emphasis",
        "link",
        "auto_link",
        "auto_email",
        "inline_html",
        "linebreak",
    )
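    # Illustrative note (not in the original source): SPECIFICATION lists only
    # the *start* pattern of each inline construct; finding the end is left to
    # the corresponding parse_* method.  compile_sc() is inherited from the
    # Parser base class in .core and, as a rough sketch, is assumed to join the
    # active rules into one alternation of named groups, e.g. for two rules:
    #
    #     (?P<codespan>`{1,})|(?P<link>!?\[)
    #
    # so that a single regex search finds the nearest candidate start and
    # match.lastgroup names the rule to dispatch to (see parse() below).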

    def __init__(self, hard_wrap: bool = False) -> None:
        super(InlineParser, self).__init__()

        self.hard_wrap = hard_wrap
        # lazy add linebreak
        if hard_wrap:
            self.specification["linebreak"] = self.HARD_LINEBREAK
        else:
            self.rules.append("softbreak")

        self._methods = {name: getattr(self, "parse_" + name) for name in self.rules}
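    # Illustrative example (not in the original source), assuming the default
    # specification above: with hard_wrap=False, "foo\nbar" yields
    # [text("foo"), softbreak, text("bar")], because a bare newline only
    # matches the "softbreak" rule, while "foo  \nbar" (two trailing spaces)
    # yields a real linebreak token.  With hard_wrap=True the "linebreak"
    # pattern is swapped for HARD_LINEBREAK, so every newline becomes a
    # linebreak token.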

    def parse_escape(self, m: Match[str], state: InlineState) -> int:
        text = m.group(0)
        text = unescape_char(text)
        state.append_token(
            {
                "type": "text",
                "raw": text,
            }
        )
        return m.end()
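    # Illustrative example (not in the original source): for the input
    # r"\*not emphasis\*" the "escape" rule matches each "\*" and, after
    # unescape_char() drops the backslash, appends {"type": "text", "raw": "*"},
    # so the asterisks come out as literal text rather than starting an
    # emphasis token.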

    def parse_link(self, m: Match[str], state: InlineState) -> Optional[int]:
        pos = m.end()

        marker = m.group(0)
        is_image = marker[0] == "!"
        if is_image and state.in_image:
            state.append_token({"type": "text", "raw": marker})
            return pos
        elif not is_image and state.in_link:
            state.append_token({"type": "text", "raw": marker})
            return pos

        text = None
        label, end_pos = parse_link_label(state.src, pos)
        if label is None:
            text, end_pos = parse_link_text(state.src, pos)
            if text is None:
                return None

        assert end_pos is not None

        if text is None:
            text = label

        assert text is not None

        if end_pos >= len(state.src) and label is None:
            return None

        rules = ["codespan", "prec_auto_link", "prec_inline_html"]
        prec_pos = self.precedence_scan(m, state, end_pos, rules)
        if prec_pos:
            return prec_pos

        if end_pos < len(state.src):
            c = state.src[end_pos]
            if c == "(":
                # standard link [text](<url> "title")
                attrs, pos2 = parse_link(state.src, end_pos + 1)
                if pos2:
                    token = self.__parse_link_token(is_image, text, attrs, state)
                    state.append_token(token)
                    return pos2

            elif c == "[":
                # standard ref link [text][label]
                label2, pos2 = parse_link_label(state.src, end_pos + 1)
                if pos2:
                    end_pos = pos2
                    if label2:
                        label = label2

        if label is None:
            return None

        ref_links = state.env.get("ref_links")
        if not ref_links:
            return None

        key = unikey(label)
        env = ref_links.get(key)
        if env:
            attrs = {"url": env["url"], "title": env.get("title")}
            token = self.__parse_link_token(is_image, text, attrs, state)
            token["ref"] = key
            token["label"] = label
            state.append_token(token)
            return end_pos
        return None
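    # Illustrative example (not in the original source): for
    # '[text](https://example.com "title")' the parse_link() helper from
    # .helpers is expected to return the url/title attrs, giving roughly
    #
    #     {"type": "link",
    #      "children": [{"type": "text", "raw": "text"}],
    #      "attrs": {"url": "https://example.com", "title": "title"}}
    #
    # A reference link like '[text][label]' only resolves when
    # state.env["ref_links"] holds the unikey()-normalised label collected by
    # the block parser; that token additionally carries "ref" and "label",
    # as set above.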

    def __parse_link_token(
        self,
        is_image: bool,
        text: str,
        attrs: Optional[Dict[str, Any]],
        state: InlineState,
    ) -> Dict[str, Any]:
        new_state = state.copy()
        new_state.src = text
        if is_image:
            new_state.in_image = True
            token = {
                "type": "image",
                "children": self.render(new_state),
                "attrs": attrs,
            }
        else:
            new_state.in_link = True
            token = {
                "type": "link",
                "children": self.render(new_state),
                "attrs": attrs,
            }
        return token
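    # Note (not in the original source): setting in_link / in_image on the
    # copied state feeds the early-return guards at the top of parse_link(),
    # so a '[' inside link text (or a '![' inside image alt text) is emitted
    # as literal text instead of opening a nested link (or image) token.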

    def parse_auto_link(self, m: Match[str], state: InlineState) -> int:
        text = m.group(0)
        pos = m.end()
        if state.in_link:
            self.process_text(text, state)
            return pos

        text = text[1:-1]
        self._add_auto_link(text, text, state)
        return pos

    def parse_auto_email(self, m: Match[str], state: InlineState) -> int:
        text = m.group(0)
        pos = m.end()
        if state.in_link:
            self.process_text(text, state)
            return pos

        text = text[1:-1]
        url = "mailto:" + text
        self._add_auto_link(url, text, state)
        return pos

    def _add_auto_link(self, url: str, text: str, state: InlineState) -> None:
        state.append_token(
            {
                "type": "link",
                "children": [{"type": "text", "raw": text}],
                "attrs": {"url": escape_url(url)},
            }
        )
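    # Illustrative examples (not in the original source):
    #   '<https://example.com>' -> {"type": "link",
    #                               "children": [{"type": "text", "raw": "https://example.com"}],
    #                               "attrs": {"url": "https://example.com"}}
    #   '<user@example.com>'    -> same shape, but the url becomes
    #                              "mailto:user@example.com"
    # (escape_url() from .util is applied to the url in both cases.)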

    def parse_emphasis(self, m: Match[str], state: InlineState) -> int:
        pos = m.end()

        marker = m.group(0)
        mlen = len(marker)
        if mlen == 1 and state.in_emphasis:
            state.append_token({"type": "text", "raw": marker})
            return pos
        elif mlen == 2 and state.in_strong:
            state.append_token({"type": "text", "raw": marker})
            return pos

        _end_re = EMPHASIS_END_RE[marker]
        m1 = _end_re.search(state.src, pos)
        if not m1:
            state.append_token({"type": "text", "raw": marker})
            return pos

        end_pos = m1.end()
        text = state.src[pos : end_pos - mlen]

        prec_pos = self.precedence_scan(m, state, end_pos)
        if prec_pos:
            return prec_pos

        new_state = state.copy()
        new_state.src = text
        if mlen == 1:
            new_state.in_emphasis = True
            children = self.render(new_state)
            state.append_token({"type": "emphasis", "children": children})
        elif mlen == 2:
            new_state.in_strong = True
            children = self.render(new_state)
            state.append_token({"type": "strong", "children": children})
        else:
            new_state.in_emphasis = True
            new_state.in_strong = True

            children = [{"type": "strong", "children": self.render(new_state)}]
            state.append_token(
                {
                    "type": "emphasis",
                    "children": children,
                }
            )
        return end_pos
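    # Illustrative examples (not in the original source):
    #   '*foo*'     -> {"type": "emphasis", "children": [{"type": "text", "raw": "foo"}]}
    #   '**foo**'   -> {"type": "strong",   "children": [{"type": "text", "raw": "foo"}]}
    #   '***foo***' -> an emphasis token whose single child is a strong token
    #                  (the mlen == 3 branch above).
    # A marker found while the copied state already has in_emphasis / in_strong
    # set is emitted as literal text by the guards at the top of the method.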

    def parse_codespan(self, m: Match[str], state: InlineState) -> int:
        marker = m.group(0)
        # require same marker with same length at end

        pattern = re.compile(r"(.*?[^`])" + marker + r"(?!`)", re.S)

        pos = m.end()
        m2 = pattern.match(state.src, pos)
        if m2:
            end_pos = m2.end()
            code = m2.group(1)
            # Line endings are treated like spaces
            code = code.replace("\n", " ")
            if len(code.strip()):
                if code.startswith(" ") and code.endswith(" "):
                    code = code[1:-1]
            state.append_token({"type": "codespan", "raw": code})
            return end_pos
        else:
            state.append_token({"type": "text", "raw": marker})
            return pos
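    # Illustrative examples (not in the original source):
    #   '`code`'          -> {"type": "codespan", "raw": "code"}
    #   '`` `literal ``'  -> raw "`literal": one leading and trailing space is
    #                        stripped, but only when the content is not all spaces
    #   '`a\nb`'          -> raw "a b"; line endings are treated as spaces
    # If no matching run of backticks follows, the opening marker itself is
    # emitted as plain text.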

    def parse_linebreak(self, m: Match[str], state: InlineState) -> int:
        state.append_token({"type": "linebreak"})
        return m.end()

    def parse_softbreak(self, m: Match[str], state: InlineState) -> int:
        state.append_token({"type": "softbreak"})
        return m.end()

    def parse_inline_html(self, m: Match[str], state: InlineState) -> int:
        end_pos = m.end()
        html = m.group(0)
        state.append_token({"type": "inline_html", "raw": html})
        if html.startswith(("<a ", "<a>", "<A ", "<A>")):
            state.in_link = True
        elif html.startswith(("</a ", "</a>", "</A ", "</A>")):
            state.in_link = False
        return end_pos
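    # Note (not in the original source): raw '<a ...>' / '</a>' tags toggle
    # state.in_link above, so an autolink or markdown link that appears inside
    # a raw HTML anchor is emitted as plain text instead of producing a second,
    # nested link token (see the in_link guards in parse_link / parse_auto_link).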

    def process_text(self, text: str, state: InlineState) -> None:
        state.append_token({"type": "text", "raw": text})

    def parse(self, state: InlineState) -> List[Dict[str, Any]]:
        pos = 0
        sc = self.compile_sc()
        while pos < len(state.src):
            m = sc.search(state.src, pos)
            if not m:
                break

            end_pos = m.start()
            if end_pos > pos:
                hole = state.src[pos:end_pos]
                self.process_text(hole, state)

            new_pos = self.parse_method(m, state)
            if not new_pos:
                # move cursor 1 character forward
                pos = end_pos + 1
                hole = state.src[end_pos:pos]
                self.process_text(hole, state)
            else:
                pos = new_pos

        if pos == 0:
            # special case, just pure text
            self.process_text(state.src, state)
        elif pos < len(state.src):
            self.process_text(state.src[pos:], state)
        return state.tokens
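    # Note (not in the original source): the loop above is the inline
    # tokenizer's core.  compile_sc() and parse_method() come from the Parser
    # base class in .core: one combined regex locates the next candidate token
    # start, the gap before it is flushed as a text token, and the matched
    # rule's parse_* method either consumes the construct (returning the new
    # cursor position) or returns a falsy value, in which case the scanner
    # emits a single character of text and moves on.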

    def precedence_scan(
        self,
        m: Match[str],
        state: InlineState,
        end_pos: int,
        rules: Optional[List[str]] = None,
    ) -> Optional[int]:
        if rules is None:
            rules = ["codespan", "link", "prec_auto_link", "prec_inline_html"]

        mark_pos = m.end()
        sc = self.compile_sc(rules)
        m1 = sc.search(state.src, mark_pos, end_pos)
        if not m1:
            return None

        lastgroup = m1.lastgroup
        if not lastgroup:
            return None
        rule_name = lastgroup.replace("prec_", "")
        sc = self.compile_sc([rule_name])
        m2 = sc.match(state.src, m1.start())
        if not m2:
            return None

        func = self._methods[rule_name]
        new_state = state.copy()
        new_state.src = state.src
        m2_pos = func(m2, new_state)
        if not m2_pos or m2_pos < end_pos:
            return None

        raw_text = state.src[m.start() : m2.start()]
        state.append_token({"type": "text", "raw": raw_text})
        for token in new_state.tokens:
            state.append_token(token)
        return m2_pos
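    # Illustrative example (not in the original source): precedence_scan()
    # covers the CommonMark rule that code spans, autolinks and raw HTML bind
    # tighter than emphasis and link text.  For '*foo `bar* baz`' the emphasis
    # rule finds a closing '*' inside what is really a code span; the scan
    # spots the backtick between the two markers, reparses it as a codespan
    # that ends past the would-be emphasis, and the result is roughly
    # [text("*foo "), codespan("bar* baz")].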

    def render(self, state: InlineState) -> List[Dict[str, Any]]:
        self.parse(state)
        return state.tokens

    def __call__(self, s: str, env: MutableMapping[str, Any]) -> List[Dict[str, Any]]:
        state = self.state_cls(env)
        state.src = s
        return self.render(state)
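# Illustrative usage (not part of the module): in normal use this parser is
# wired up by mistune's Markdown / create_markdown() together with the block
# parser, but it can be driven directly.  A rough sketch, assuming InlineState
# accepts the env mapping as shown in __call__ above:
#
#     parser = InlineParser()
#     tokens = parser("a `b` and *c*", {})
#     # -> [{'type': 'text', 'raw': 'a '},
#     #     {'type': 'codespan', 'raw': 'b'},
#     #     {'type': 'text', 'raw': ' and '},
#     #     {'type': 'emphasis', 'children': [{'type': 'text', 'raw': 'c'}]}]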