Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/mistune/block_parser.py: 100%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

271 statements  

1import re 

2from typing import Optional, List, Tuple, Match, Pattern 

3import string 

4from .util import ( 

5 unikey, 

6 escape_url, 

7 expand_tab, 

8 expand_leading_tab, 

9) 

10from .core import Parser, BlockState 

11from .helpers import ( 

12 LINK_LABEL, 

13 HTML_TAGNAME, 

14 HTML_ATTRIBUTES, 

15 BLOCK_TAGS, 

16 PRE_TAGS, 

17 unescape_char, 

18 parse_link_href, 

19 parse_link_title, 

20) 

21from .list_parser import parse_list, LIST_PATTERN 

22 

# One to four leading spaces at the start of every line (re.M); removed from
# the matched text of an indented code block.
_INDENT_CODE_TRIM = re.compile(r"^ {1,4}", flags=re.M)
# Trailing run of "#" (with the whitespace in front of it, or at the very
# start of the text) that closes an ATX heading.
_ATX_HEADING_TRIM = re.compile(r"(\s+|^)#+\s*$")
# A single optional leading space on every line (re.M).
_BLOCK_QUOTE_TRIM = re.compile(r"^ ?", flags=re.M)
# Leading spaces plus the ">" marker at the start of every line (re.M).
_BLOCK_QUOTE_LEADING = re.compile(r"^ *>", flags=re.M)

# Text whose tail is a newline, a line holding only spaces/tabs, and a newline.
_LINE_BLANK_END = re.compile(r"\n[ \t]*\n$")
# Optional spaces/tabs followed by a newline, i.e. "nothing else on this line".
_BLANK_TO_LINE = re.compile(r"[ \t]*\n")

# Alternation of every tag name in BLOCK_TAGS and PRE_TAGS, as one group.
_BLOCK_TAGS_PATTERN = "(" + "|".join(BLOCK_TAGS) + "|" + "|".join(PRE_TAGS) + ")"
# Remainder of an opening tag: attributes, ">", then only blanks to end of line.
_OPEN_TAG_END = re.compile(HTML_ATTRIBUTES + r"[ \t]*>[ \t]*(?:\n|$)")
# Remainder of a closing tag: ">", then only blanks to end of line.
_CLOSE_TAG_END = re.compile(r"[ \t]*>[ \t]*(?:\n|$)")
# One or more consecutive lines that each start with up to three spaces and ">".
_STRICT_BLOCK_QUOTE = re.compile(r"( {0,3}>[^\n]*(?:\n|$))+")

35 

36 

class BlockParser(Parser[BlockState]):
    """Parser for block-level Markdown syntax.

    ``SPECIFICATION`` maps each rule name to its regex source, and every rule
    name has a matching ``parse_<name>`` method on this class (they are
    collected into ``self._methods`` by ``__init__``).
    """

    state_cls = BlockState

    # One or more consecutive lines made only of space, tab, \v or \f.
    BLANK_LINE = re.compile(r"(^[ \t\v\f]*\n)+", re.M)

    # Opening of any raw HTML construct, indented by at most three spaces.
    RAW_HTML = (
        r"^ {0,3}("
        r"</?" + HTML_TAGNAME + r"|"
        r"<!--|"  # comment
        r"<\?|"  # "<?" construct, closed by "?>" (see parse_raw_html)
        r"<![A-Z]|"
        r"<!\[CDATA\[)"
    )

    # Like RAW_HTML, but tags are limited to the names in BLOCK_TAGS / PRE_TAGS
    # and must be followed by blanks, a newline, or the end of the text.
    BLOCK_HTML = (
        r"^ {0,3}(?:"
        r"(?:</?" + _BLOCK_TAGS_PATTERN + r"(?:[ \t]+|\n|$))"
        r"|<!--"  # comment
        r"|<\?"  # "<?" construct, closed by "?>" (see parse_raw_html)
        r"|<![A-Z]"
        r"|<!\[CDATA\[)"
    )

    # Rule name -> regex source. The key is also the suffix of the handler
    # method name, so "setex_heading" (sic) pairs with parse_setex_heading.
    SPECIFICATION = {
        "blank_line": r"(^[ \t\v\f]*\n)+",
        "atx_heading": r"^ {0,3}(?P<atx_1>#{1,6})(?!#+)(?P<atx_2>[ \t]*|[ \t]+.*?)$",
        "setex_heading": r"^ {0,3}(?P<setext_1>=|-){1,}[ \t]*$",
        "fenced_code": (
            r"^(?P<fenced_1> {0,3})(?P<fenced_2>`{3,}|~{3,})"
            r"[ \t]*(?P<fenced_3>.*?)$"
        ),
        "indent_code": (
            r"^(?: {4}| *\t)[^\n]+(?:\n+|$)"
            r"((?:(?: {4}| *\t)[^\n]+(?:\n+|$))|\s)*"
        ),
        "thematic_break": r"^ {0,3}((?:-[ \t]*){3,}|(?:_[ \t]*){3,}|(?:\*[ \t]*){3,})$",
        "ref_link": r"^ {0,3}\[(?P<reflink_1>" + LINK_LABEL + r")\]:",
        "block_quote": r"^ {0,3}>(?P<quote_1>.*?)$",
        "list": LIST_PATTERN,
        "block_html": BLOCK_HTML,
        "raw_html": RAW_HTML,
    }

    # Rule names copied into ``block_quote_rules`` and ``list_rules`` when the
    # constructor is not given explicit lists. "block_html" is not listed here;
    # within this class it is only used by extract_block_quote.
    DEFAULT_RULES = (
        "fenced_code",
        "indent_code",
        "atx_heading",
        "setex_heading",
        "thematic_break",
        "block_quote",
        "list",
        "ref_link",
        "raw_html",
        "blank_line",
    )

92 

93 def __init__( 

94 self, 

95 block_quote_rules: Optional[List[str]] = None, 

96 list_rules: Optional[List[str]] = None, 

97 max_nested_level: int = 6, 

98 ): 

99 super(BlockParser, self).__init__() 

100 

101 if block_quote_rules is None: 

102 block_quote_rules = list(self.DEFAULT_RULES) 

103 

104 if list_rules is None: 

105 list_rules = list(self.DEFAULT_RULES) 

106 

107 self.block_quote_rules = block_quote_rules 

108 self.list_rules = list_rules 

109 self.max_nested_level = max_nested_level 

110 # register default parse methods 

111 self._methods = {name: getattr(self, "parse_" + name) for name in self.SPECIFICATION} 

112 

113 def parse_blank_line(self, m: Match[str], state: BlockState) -> int: 

114 """Parse token for blank lines.""" 

115 state.append_token({"type": "blank_line"}) 

116 return m.end() 

117 

118 def parse_thematic_break(self, m: Match[str], state: BlockState) -> int: 

119 """Parse token for thematic break, e.g. ``<hr>`` tag in HTML.""" 

120 state.append_token({"type": "thematic_break"}) 

121 # $ does not count '\n' 

122 return m.end() + 1 

123 

124 def parse_indent_code(self, m: Match[str], state: BlockState) -> int: 

125 """Parse token for code block which is indented by 4 spaces.""" 

126 # it is a part of the paragraph 

127 end_pos = state.append_paragraph() 

128 if end_pos: 

129 return end_pos 

130 

131 code = m.group(0) 

132 code = expand_leading_tab(code) 

133 code = _INDENT_CODE_TRIM.sub("", code) 

134 code = code.strip("\n") 

135 state.append_token({"type": "block_code", "raw": code, "style": "indent"}) 

136 return m.end() 

137 

138 def parse_fenced_code(self, m: Match[str], state: BlockState) -> Optional[int]: 

139 """Parse token for fenced code block. A fenced code block is started with 

140 3 or more backtick(`) or tilde(~). 

141 

142 An example of a fenced code block: 

143 

144 .. code-block:: markdown 

145 

146 ```python 

147 def markdown(text): 

148 return mistune.html(text) 

149 ``` 

150 """ 

151 spaces = m.group("fenced_1") 

152 marker = m.group("fenced_2") 

153 info = m.group("fenced_3") 

154 

155 c = marker[0] 

156 if info and c == "`": 

157 # CommonMark Example 145 

158 # Info strings for backtick code blocks cannot contain backticks 

159 if info.find(c) != -1: 

160 return None 

161 

162 _end = re.compile(r"^ {0,3}" + c + "{" + str(len(marker)) + r",}[ \t]*(?:\n|$)", re.M) 

163 cursor_start = m.end() + 1 

164 

165 m2 = _end.search(state.src, cursor_start) 

166 if m2: 

167 code = state.src[cursor_start : m2.start()] 

168 end_pos = m2.end() 

169 else: 

170 code = state.src[cursor_start:] 

171 end_pos = state.cursor_max 

172 

173 if spaces and code: 

174 _trim_pattern = re.compile("^ {0," + str(len(spaces)) + "}", re.M) 

175 code = _trim_pattern.sub("", code) 

176 

177 token = {"type": "block_code", "raw": code, "style": "fenced", "marker": marker} 

178 if info: 

179 info = unescape_char(info) 

180 token["attrs"] = {"info": info.strip()} 

181 

182 state.append_token(token) 

183 return end_pos 

184 

185 def parse_atx_heading(self, m: Match[str], state: BlockState) -> int: 

186 """Parse token for ATX heading. An ATX heading is started with 1 to 6 

187 symbol of ``#``.""" 

188 level = len(m.group("atx_1")) 

189 text = m.group("atx_2").strip(string.whitespace) 

190 # remove last # 

191 if text: 

192 text = _ATX_HEADING_TRIM.sub("", text) 

193 

194 token = {"type": "heading", "text": text, "attrs": {"level": level}, "style": "atx"} 

195 state.append_token(token) 

196 return m.end() + 1 

197 

198 def parse_setex_heading(self, m: Match[str], state: BlockState) -> Optional[int]: 

199 """Parse token for setex style heading. A setex heading syntax looks like: 

200 

201 .. code-block:: markdown 

202 

203 H1 title 

204 ======== 

205 """ 

206 last_token = state.last_token() 

207 if last_token and last_token["type"] == "paragraph": 

208 level = 1 if m.group("setext_1") == "=" else 2 

209 last_token["type"] = "heading" 

210 last_token["style"] = "setext" 

211 last_token["attrs"] = {"level": level} 

212 return m.end() + 1 

213 

214 sc = self.compile_sc(["thematic_break", "list"]) 

215 m2 = sc.match(state.src, state.cursor) 

216 if m2: 

217 return self.parse_method(m2, state) 

218 return None 

219 

    def parse_ref_link(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse link references and save the link information into ``state.env``.

        Here is an example of a link reference:

        .. code-block:: markdown

            a [link][example]

            [example]: https://example.com "Optional title"

        This method will save the link reference into ``state.env`` as::

            state.env['ref_links']['example'] = {
                'url': 'https://example.com',
                'label': 'example',
                'title': "Optional title",
            }

        ``'title'`` is only present when a title was parsed. An existing entry
        with the same key is never overwritten.

        :returns: the position after the definition, the position reported by
            ``state.append_paragraph()`` when that is truthy, or ``None`` when
            the text is not a valid link reference definition.
        """
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        label = m.group("reflink_1")
        # ``unikey`` produces the lookup key for the label; an empty key means
        # the label is unusable.
        key = unikey(label)
        if not key:
            return None

        href, href_pos = parse_link_href(state.src, m.end(), block=True)
        if href is None:
            return None

        assert href_pos is not None

        # The title search is bounded by the next blank line, or by the end of
        # the source when there is none.
        _blank = self.BLANK_LINE.search(state.src, href_pos)
        if _blank:
            max_pos = _blank.start()
        else:
            max_pos = state.cursor_max

        title, title_pos = parse_link_title(state.src, href_pos, max_pos)
        if title_pos:
            # Only blanks and a newline may follow the title; otherwise the
            # title is discarded.
            m2 = _BLANK_TO_LINE.match(state.src, title_pos)
            if m2:
                title_pos = m2.end()
            else:
                title_pos = None
                title = None

        if title_pos is None:
            # Without a usable title, the destination itself must be followed
            # by only blanks and a newline; otherwise the definition is invalid.
            m3 = _BLANK_TO_LINE.match(state.src, href_pos)
            if m3:
                href_pos = m3.end()
            else:
                href_pos = None
                href = None

        end_pos = title_pos or href_pos
        if not end_pos:
            return None

        # The first definition of a key wins; later ones are consumed but not stored.
        if key not in state.env["ref_links"]:
            assert href is not None
            href = unescape_char(href)
            data = {"url": escape_url(href), "label": label}
            if title:
                data["title"] = title
            state.env["ref_links"][key] = data
        return end_pos

288 

    def extract_block_quote(self, m: Match[str], state: BlockState) -> Tuple[str, Optional[int]]:
        """Extract text and cursor end position of a block quote.

        Starting from the first quoted line (``m``), consumes the following
        lines that belong to the quote and advances ``state.cursor`` past them.

        :returns: ``(text, end_pos)``. ``text`` is the quote content with the
            ``>`` markers removed and tabs expanded. ``end_pos`` is the truthy
            position returned by the handler of a block that interrupted the
            quote (that block has then already been parsed into ``state``),
            otherwise ``None``.
        """

        # cleanup at first to detect if it is code block
        text = m.group("quote_1") + "\n"
        text = expand_leading_tab(text, 3)
        text = _BLOCK_QUOTE_TRIM.sub("", text)

        # If the first quoted line is blank or opens a code block, only lines
        # that carry their own ">" marker are taken (no lazy continuation).
        sc = self.compile_sc(["blank_line", "indent_code", "fenced_code"])
        require_marker = bool(sc.match(text))

        # skip the newline that ``$`` in the block_quote rule left unconsumed
        state.cursor = m.end() + 1

        end_pos: Optional[int] = None
        if require_marker:
            m2 = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
            if m2:
                quote = m2.group(0)
                quote = _BLOCK_QUOTE_LEADING.sub("", quote)
                quote = expand_leading_tab(quote, 3)
                quote = _BLOCK_QUOTE_TRIM.sub("", quote)
                text += quote
                state.cursor = m2.end()
        else:
            prev_blank_line = False
            # Rules tried on an unmarked line before it is accepted as a lazy
            # continuation.
            break_sc = self.compile_sc(
                [
                    "blank_line",
                    "thematic_break",
                    "fenced_code",
                    "list",
                    "block_html",
                ]
            )
            while state.cursor < state.cursor_max:
                # First take every consecutive line that has a ">" marker.
                m3 = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
                if m3:
                    quote = m3.group(0)
                    quote = _BLOCK_QUOTE_LEADING.sub("", quote)
                    quote = expand_leading_tab(quote, 3)
                    quote = _BLOCK_QUOTE_TRIM.sub("", quote)
                    text += quote
                    state.cursor = m3.end()
                    # Remember whether the quoted run ended on a blank line.
                    if not quote.strip():
                        prev_blank_line = True
                    else:
                        prev_blank_line = bool(_LINE_BLANK_END.search(quote))
                    continue

                if prev_blank_line:
                    # CommonMark Example 249
                    # because of laziness, a blank line is needed between
                    # a block quote and a following paragraph
                    break

                m4 = break_sc.match(state.src, state.cursor)
                if m4:
                    # The handler runs against the parent ``state``; a truthy
                    # result means the quote ends here. A falsy result falls
                    # through and the line is taken as a lazy continuation.
                    end_pos = self.parse_method(m4, state)
                    if end_pos:
                        break

                # lazy continuation line
                pos = state.find_line_end()
                line = state.get_text(pos)
                line = expand_leading_tab(line, 3)
                text += line
                state.cursor = pos

        # according to CommonMark Example 6, the second tab should be
        # treated as 4 spaces
        return expand_tab(text), end_pos

360 

361 def parse_block_quote(self, m: Match[str], state: BlockState) -> int: 

362 """Parse token for block quote. Here is an example of the syntax: 

363 

364 .. code-block:: markdown 

365 

366 > a block quote starts 

367 > with right arrows 

368 """ 

369 text, end_pos = self.extract_block_quote(m, state) 

370 # scan children state 

371 child = state.child_state(text) 

372 if state.depth() >= self.max_nested_level - 1: 

373 rules = list(self.block_quote_rules) 

374 rules.remove("block_quote") 

375 else: 

376 rules = self.block_quote_rules 

377 

378 self.parse(child, rules) 

379 token = {"type": "block_quote", "children": child.tokens} 

380 if end_pos: 

381 state.prepend_token(token) 

382 return end_pos 

383 state.append_token(token) 

384 return state.cursor 

385 

386 def parse_list(self, m: Match[str], state: BlockState) -> int: 

387 """Parse tokens for ordered and unordered list.""" 

388 return parse_list(self, m, state) 

389 

390 def parse_block_html(self, m: Match[str], state: BlockState) -> Optional[int]: 

391 return self.parse_raw_html(m, state) 

392 

393 def parse_raw_html(self, m: Match[str], state: BlockState) -> Optional[int]: 

394 marker = m.group(0).strip() 

395 

396 # rule 2 

397 if marker == "<!--": 

398 return _parse_html_to_end(state, "-->", m.end()) 

399 

400 # rule 3 

401 if marker == "<?": 

402 return _parse_html_to_end(state, "?>", m.end()) 

403 

404 # rule 5 

405 if marker == "<![CDATA[": 

406 return _parse_html_to_end(state, "]]>", m.end()) 

407 

408 # rule 4 

409 if marker.startswith("<!"): 

410 return _parse_html_to_end(state, ">", m.end()) 

411 

412 close_tag = None 

413 open_tag = None 

414 if marker.startswith("</"): 

415 close_tag = marker[2:].lower() 

416 # rule 6 

417 if close_tag in BLOCK_TAGS: 

418 return _parse_html_to_newline(state, self.BLANK_LINE) 

419 else: 

420 open_tag = marker[1:].lower() 

421 # rule 1 

422 if open_tag in PRE_TAGS: 

423 end_tag = "</" + open_tag + ">" 

424 return _parse_html_to_end(state, end_tag, m.end()) 

425 # rule 6 

426 if open_tag in BLOCK_TAGS: 

427 return _parse_html_to_newline(state, self.BLANK_LINE) 

428 

429 # Blocks of type 7 may not interrupt a paragraph. 

430 end_pos = state.append_paragraph() 

431 if end_pos: 

432 return end_pos 

433 

434 # rule 7 

435 start_pos = m.end() 

436 end_pos = state.find_line_end() 

437 if (open_tag and _OPEN_TAG_END.match(state.src, start_pos, end_pos)) or ( 

438 close_tag and _CLOSE_TAG_END.match(state.src, start_pos, end_pos) 

439 ): 

440 return _parse_html_to_newline(state, self.BLANK_LINE) 

441 

442 return None 

443 

444 def parse(self, state: BlockState, rules: Optional[List[str]] = None) -> None: 

445 sc = self.compile_sc(rules) 

446 

447 while state.cursor < state.cursor_max: 

448 m = sc.search(state.src, state.cursor) 

449 if not m: 

450 break 

451 

452 end_pos = m.start() 

453 if end_pos > state.cursor: 

454 text = state.get_text(end_pos) 

455 state.add_paragraph(text) 

456 state.cursor = end_pos 

457 

458 end_pos2 = self.parse_method(m, state) 

459 if end_pos2: 

460 state.cursor = end_pos2 

461 else: 

462 end_pos3 = state.find_line_end() 

463 text = state.get_text(end_pos3) 

464 state.add_paragraph(text) 

465 state.cursor = end_pos3 

466 

467 if state.cursor < state.cursor_max: 

468 text = state.src[state.cursor :] 

469 state.add_paragraph(text) 

470 state.cursor = state.cursor_max 

471 

472 

def _parse_html_to_end(state: BlockState, end_marker: str, start_pos: int) -> int:
    """Append a ``block_html`` token that runs up to ``end_marker``.

    The marker is searched for from ``start_pos``. When found, the token
    covers the text from ``state.cursor`` to the end of the line containing
    the marker, and ``state.cursor`` is left at the marker position. When not
    found, the token covers everything up to the end of the source.

    :returns: the position where the block ends.
    """
    found_at = state.src.find(end_marker, start_pos)
    if found_at != -1:
        head = state.get_text(found_at)
        # find_line_end() works from the cursor, so move it to the marker
        state.cursor = found_at
        end_pos = state.find_line_end()
        raw = head + state.get_text(end_pos)
    else:
        raw = state.src[state.cursor :]
        end_pos = state.cursor_max

    state.append_token({"type": "block_html", "raw": raw})
    return end_pos

486 

487 

def _parse_html_to_newline(state: BlockState, newline: Pattern[str]) -> int:
    """Append a ``block_html`` token that runs up to the next ``newline`` match.

    ``newline`` is searched for from ``state.cursor``; the token covers the
    text in front of the match, or the rest of the source when there is none.

    :returns: the start of the match, or ``state.cursor_max`` without one.
    """
    blank = newline.search(state.src, state.cursor)
    if blank is None:
        end_pos = state.cursor_max
        raw = state.src[state.cursor :]
    else:
        end_pos = blank.start()
        raw = state.get_text(end_pos)

    state.append_token({"type": "block_html", "raw": raw})
    return end_pos