Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/mistune/block_parser.py: 100%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

270 statements  

1import re 

2from typing import Optional, List, Tuple, Match, Pattern 

3from .util import ( 

4 unikey, 

5 escape_url, 

6 expand_tab, 

7 expand_leading_tab, 

8) 

9from .core import Parser, BlockState 

10from .helpers import ( 

11 LINK_LABEL, 

12 HTML_TAGNAME, 

13 HTML_ATTRIBUTES, 

14 BLOCK_TAGS, 

15 PRE_TAGS, 

16 unescape_char, 

17 parse_link_href, 

18 parse_link_title, 

19) 

20from .list_parser import parse_list, LIST_PATTERN 

21 

# Removes one to four leading spaces from every line of an indented code block.
_INDENT_CODE_TRIM = re.compile(r"^ {1,4}", re.MULTILINE)
# Removes the optional closing run of ``#`` (and surrounding whitespace) at the
# end of an ATX heading's text.
_ATX_HEADING_TRIM = re.compile(r"(\s+|^)#+\s*$")
# Removes at most one leading space from every line of block quote content.
_BLOCK_QUOTE_TRIM = re.compile(r"^ ?", re.MULTILINE)
# Removes the leading spaces and the ``>`` marker from every quoted line.
_BLOCK_QUOTE_LEADING = re.compile(r"^ *>", re.MULTILINE)

# Matches text that ends with a whitespace-only line.
_LINE_BLANK_END = re.compile(r"\n[ \t]*\n$")
# Matches optional spaces/tabs followed by a newline, i.e. "nothing else on
# this line".
_BLANK_TO_LINE = re.compile(r"[ \t]*\n")

# Alternation of every block-level and preformatted tag name, in one group.
_BLOCK_TAGS_PATTERN = f"({'|'.join(BLOCK_TAGS)}|{'|'.join(PRE_TAGS)})"
# Remainder of an opening tag: attributes, ``>``, then only whitespace up to
# the end of the line.
_OPEN_TAG_END = re.compile(HTML_ATTRIBUTES + r"[ \t]*>[ \t]*(?:\n|$)")
# Remainder of a closing tag: ``>``, then only whitespace up to the end of line.
_CLOSE_TAG_END = re.compile(r"[ \t]*>[ \t]*(?:\n|$)")
# One or more consecutive lines that each carry an explicit ``>`` marker after
# at most three spaces of indentation.
_STRICT_BLOCK_QUOTE = re.compile(r"( {0,3}>[^\n]*(?:\n|$))+")

34 

35 

class BlockParser(Parser[BlockState]):
    """Block-level parser driven by a table of named regular-expression rules.

    Every key of ``SPECIFICATION`` names a rule, and ``__init__`` binds each
    rule to the ``parse_<name>`` method of this class. A parse method receives
    the regex match and the current ``BlockState``, may append tokens to the
    state, and returns the position parsing should continue from. A falsy
    return value makes :meth:`parse` treat the current line as paragraph text.
    """

    state_cls = BlockState

    # One or more consecutive lines that contain only whitespace.
    BLANK_LINE = re.compile(r"(^[ \t\v\f]*\n)+", re.M)

    # Opening sequences of any raw HTML construct (at most 3 spaces of indent).
    RAW_HTML = (
        r"^ {0,3}("
        r"</?" + HTML_TAGNAME + r"|"
        r"<!--|"  # comment
        r"<\?|"  # processing instruction
        r"<![A-Z]|"  # declaration
        r"<!\[CDATA\[)"  # CDATA section
    )

    # Same openers, but tags are restricted to the known block/pre tag names.
    BLOCK_HTML = (
        r"^ {0,3}(?:"
        r"(?:</?" + _BLOCK_TAGS_PATTERN + r"(?:[ \t]+|\n|$))"
        r"|<!--"  # comment
        r"|<\?"  # processing instruction
        r"|<![A-Z]"  # declaration
        r"|<!\[CDATA\[)"  # CDATA section
    )

    SPECIFICATION = {
        # Reuse the compiled pattern's source so the two can never drift apart.
        "blank_line": BLANK_LINE.pattern,
        "atx_heading": r"^ {0,3}(?P<atx_1>#{1,6})(?!#+)(?P<atx_2>[ \t]*|[ \t]+.*?)$",
        # The underline must be a run of one single character (all ``=`` or all
        # ``-``); the back-reference rejects mixed lines such as ``=-=``.
        "setex_heading": r"^ {0,3}(?P<setext_1>=|-)(?P=setext_1)*[ \t]*$",
        "fenced_code": (
            r"^(?P<fenced_1> {0,3})(?P<fenced_2>`{3,}|~{3,})"
            r"[ \t]*(?P<fenced_3>.*?)$"
        ),
        "indent_code": (
            r"^(?: {4}| *\t)[^\n]+(?:\n+|$)"
            r"((?:(?: {4}| *\t)[^\n]+(?:\n+|$))|\s)*"
        ),
        "thematic_break": r"^ {0,3}((?:-[ \t]*){3,}|(?:_[ \t]*){3,}|(?:\*[ \t]*){3,})$",
        "ref_link": r"^ {0,3}\[(?P<reflink_1>" + LINK_LABEL + r")\]:",
        "block_quote": r"^ {0,3}>(?P<quote_1>.*?)$",
        "list": LIST_PATTERN,
        "block_html": BLOCK_HTML,
        "raw_html": RAW_HTML,
    }

    DEFAULT_RULES = (
        "fenced_code",
        "indent_code",
        "atx_heading",
        "setex_heading",
        "thematic_break",
        "block_quote",
        "list",
        "ref_link",
        "raw_html",
        "blank_line",
    )

    def __init__(
        self,
        block_quote_rules: Optional[List[str]] = None,
        list_rules: Optional[List[str]] = None,
        max_nested_level: int = 6,
    ):
        """Create a block parser.

        :param block_quote_rules: rule names used to parse the content of a
            block quote; defaults to a copy of ``DEFAULT_RULES``.
        :param list_rules: rule names stored as ``self.list_rules``; defaults
            to a copy of ``DEFAULT_RULES`` (presumably consumed by the list
            parser -- confirm in ``list_parser``).
        :param max_nested_level: depth at which block quotes stop nesting.
        """
        super().__init__()

        if block_quote_rules is None:
            block_quote_rules = list(self.DEFAULT_RULES)

        if list_rules is None:
            list_rules = list(self.DEFAULT_RULES)

        self.block_quote_rules = block_quote_rules
        self.list_rules = list_rules
        self.max_nested_level = max_nested_level
        # register default parse methods
        self._methods = {name: getattr(self, "parse_" + name) for name in self.SPECIFICATION}

    def parse_blank_line(self, m: Match[str], state: BlockState) -> int:
        """Parse token for blank lines."""
        state.append_token({"type": "blank_line"})
        return m.end()

    def parse_thematic_break(self, m: Match[str], state: BlockState) -> int:
        """Parse token for thematic break, e.g. ``<hr>`` tag in HTML."""
        state.append_token({"type": "thematic_break"})
        # $ does not count '\n'
        return m.end() + 1

    def parse_indent_code(self, m: Match[str], state: BlockState) -> int:
        """Parse token for code block which is indented by 4 spaces."""
        # When ``append_paragraph`` reports a position, the matched text was
        # taken as part of the paragraph rather than as a code block.
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        code = m.group(0)
        code = expand_leading_tab(code)
        code = _INDENT_CODE_TRIM.sub("", code)
        code = code.strip("\n")
        state.append_token({"type": "block_code", "raw": code, "style": "indent"})
        return m.end()

    def parse_fenced_code(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse token for fenced code block. A fenced code block is started with
        3 or more backtick(`) or tilde(~).

        An example of a fenced code block:

        .. code-block:: markdown

            ```python
            def markdown(text):
                return mistune.html(text)
            ```

        Returns the position after the closing fence (or the end of the
        source when the fence is never closed), or ``None`` when the opening
        line is not a valid fence.
        """
        spaces = m.group("fenced_1")
        marker = m.group("fenced_2")
        info = m.group("fenced_3")

        c = marker[0]
        # CommonMark Example 145
        # Info strings for backtick code blocks cannot contain backticks
        if info and c == "`" and c in info:
            return None

        # The closing fence uses the same character and is at least as long
        # as the opening one.
        _end = re.compile(r"^ {0,3}" + c + "{" + str(len(marker)) + r",}[ \t]*(?:\n|$)", re.M)
        # ``$`` stops before the newline, so the code starts one character on.
        cursor_start = m.end() + 1

        m2 = _end.search(state.src, cursor_start)
        if m2:
            code = state.src[cursor_start : m2.start()]
            end_pos = m2.end()
        else:
            # Unclosed fence: everything up to the end of the source is code.
            code = state.src[cursor_start:]
            end_pos = state.cursor_max

        if spaces and code:
            # Strip from each code line up to as many spaces as the opening
            # fence was indented by.
            _trim_pattern = re.compile("^ {0," + str(len(spaces)) + "}", re.M)
            code = _trim_pattern.sub("", code)

        token = {"type": "block_code", "raw": code, "style": "fenced", "marker": marker}
        if info:
            info = unescape_char(info)
            token["attrs"] = {"info": info.strip()}

        state.append_token(token)
        return end_pos

    def parse_atx_heading(self, m: Match[str], state: BlockState) -> int:
        """Parse token for ATX heading. An ATX heading is started with 1 to 6
        symbol of ``#``."""
        level = len(m.group("atx_1"))
        text = m.group("atx_2").strip()
        # remove last #
        if text:
            text = _ATX_HEADING_TRIM.sub("", text)

        token = {"type": "heading", "text": text, "attrs": {"level": level}, "style": "atx"}
        state.append_token(token)
        # $ does not count '\n'
        return m.end() + 1

    def parse_setex_heading(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse token for setex style heading. A setex heading syntax looks like:

        .. code-block:: markdown

            H1 title
            ========

        When the previous token is a paragraph it is converted in place into a
        heading (level 1 for ``=``, level 2 for ``-``). Otherwise the line is
        re-tried as a thematic break or list; ``None`` is returned when
        neither matches.
        """
        last_token = state.last_token()
        if last_token and last_token["type"] == "paragraph":
            level = 1 if m.group("setext_1") == "=" else 2
            last_token["type"] = "heading"
            last_token["style"] = "setext"
            last_token["attrs"] = {"level": level}
            return m.end() + 1

        # Not an underline: a line of dashes may still be a thematic break or
        # the start of a list.
        sc = self.compile_sc(["thematic_break", "list"])
        m2 = sc.match(state.src, state.cursor)
        if m2:
            return self.parse_method(m2, state)
        return None

    def parse_ref_link(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse link references and save the link information into ``state.env``.

        Here is an example of a link reference:

        .. code-block:: markdown

            a [link][example]

            [example]: https://example.com "Optional title"

        This method will save the link reference into ``state.env`` as::

            state.env['ref_links']['example'] = {
                'url': 'https://example.com',
                'title': "Optional title",
            }

        Returns the position after the definition, or ``None`` when the text
        is not a valid link reference definition.
        """
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        label = m.group("reflink_1")
        key = unikey(label)
        if not key:
            return None

        href, href_pos = parse_link_href(state.src, m.end(), block=True)
        if href is None:
            return None

        assert href_pos is not None

        # A title may not extend past the next blank line.
        _blank = self.BLANK_LINE.search(state.src, href_pos)
        if _blank:
            max_pos = _blank.start()
        else:
            max_pos = state.cursor_max

        title, title_pos = parse_link_title(state.src, href_pos, max_pos)
        if title_pos:
            # Only whitespace may follow the title on its line; otherwise the
            # title is discarded and the href alone is considered below.
            m2 = _BLANK_TO_LINE.match(state.src, title_pos)
            if m2:
                title_pos = m2.end()
            else:
                title_pos = None
                title = None

        if title_pos is None:
            # Without a title, only whitespace may follow the destination.
            m3 = _BLANK_TO_LINE.match(state.src, href_pos)
            if m3:
                href_pos = m3.end()
            else:
                href_pos = None
                href = None

        end_pos = title_pos or href_pos
        if not end_pos:
            return None

        # The first definition of a label wins; later ones are consumed but
        # not stored.
        if key not in state.env["ref_links"]:
            assert href is not None
            href = unescape_char(href)
            data = {"url": escape_url(href), "label": label}
            if title:
                data["title"] = title
            state.env["ref_links"][key] = data
        return end_pos

    @staticmethod
    def _strip_quote_markers(quote: str) -> str:
        """Remove the ``>`` marker and one following space from each quoted line."""
        quote = _BLOCK_QUOTE_LEADING.sub("", quote)
        quote = expand_leading_tab(quote, 3)
        return _BLOCK_QUOTE_TRIM.sub("", quote)

    def extract_block_quote(self, m: Match[str], state: BlockState) -> Tuple[str, Optional[int]]:
        """Extract text and cursor end position of a block quote.

        ``state.cursor`` is advanced past the consumed lines. The second item
        of the result is the position returned by the block rule that
        interrupted the quote, or ``None`` when the quote was not interrupted
        by one.
        """

        # cleanup at first to detect if it is code block
        text = m.group("quote_1") + "\n"
        text = expand_leading_tab(text, 3)
        text = _BLOCK_QUOTE_TRIM.sub("", text)

        # When the quote opens with a blank line or a code block, only lines
        # carrying an explicit ``>`` marker belong to it.
        sc = self.compile_sc(["blank_line", "indent_code", "fenced_code"])
        require_marker = bool(sc.match(text))

        state.cursor = m.end() + 1

        # Collect the pieces and join once at the end instead of growing a
        # string piece by piece.
        parts = [text]
        end_pos: Optional[int] = None
        if require_marker:
            m2 = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
            if m2:
                parts.append(self._strip_quote_markers(m2.group(0)))
                state.cursor = m2.end()
        else:
            prev_blank_line = False
            break_sc = self.compile_sc(
                [
                    "blank_line",
                    "thematic_break",
                    "fenced_code",
                    "list",
                    "block_html",
                ]
            )
            while state.cursor < state.cursor_max:
                m3 = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
                if m3:
                    quote = self._strip_quote_markers(m3.group(0))
                    parts.append(quote)
                    state.cursor = m3.end()
                    if not quote.strip():
                        prev_blank_line = True
                    else:
                        prev_blank_line = bool(_LINE_BLANK_END.search(quote))
                    continue

                if prev_blank_line:
                    # CommonMark Example 249
                    # because of laziness, a blank line is needed between
                    # a block quote and a following paragraph
                    break

                m4 = break_sc.match(state.src, state.cursor)
                if m4:
                    end_pos = self.parse_method(m4, state)
                    if end_pos:
                        break

                # lazy continuation line
                pos = state.find_line_end()
                line = state.get_text(pos)
                line = expand_leading_tab(line, 3)
                parts.append(line)
                state.cursor = pos

        # according to CommonMark Example 6, the second tab should be
        # treated as 4 spaces
        return expand_tab("".join(parts)), end_pos

    def parse_block_quote(self, m: Match[str], state: BlockState) -> int:
        """Parse token for block quote. Here is an example of the syntax:

        .. code-block:: markdown

            > a block quote starts
            > with right arrows
        """
        text, end_pos = self.extract_block_quote(m, state)
        # scan children state
        child = state.child_state(text)
        if state.depth() >= self.max_nested_level - 1:
            # Nesting limit reached: parse the content without the block
            # quote rule. Filtering (rather than ``list.remove``) also works
            # when a custom rule list lacks or repeats that entry.
            rules = [name for name in self.block_quote_rules if name != "block_quote"]
        else:
            rules = self.block_quote_rules

        self.parse(child, rules)
        token = {"type": "block_quote", "children": child.tokens}
        if end_pos:
            # The interrupting block was already tokenised during extraction;
            # ``prepend_token`` presumably places the quote before it.
            state.prepend_token(token)
            return end_pos
        state.append_token(token)
        return state.cursor

    def parse_list(self, m: Match[str], state: BlockState) -> int:
        """Parse tokens for ordered and unordered list."""
        return parse_list(self, m, state)

    def parse_block_html(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse an HTML block; handled exactly like :meth:`parse_raw_html`."""
        return self.parse_raw_html(m, state)

    def parse_raw_html(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse a raw HTML block starting at the matched opener.

        The "rule N" comments number the kinds of HTML block distinguished
        here. Returns the position after the block, or ``None`` when the
        opener does not start an HTML block.
        """
        marker = m.group(0).strip()

        # rule 2
        if marker == "<!--":
            return _parse_html_to_end(state, "-->", m.end())

        # rule 3
        if marker == "<?":
            return _parse_html_to_end(state, "?>", m.end())

        # rule 5
        if marker == "<![CDATA[":
            return _parse_html_to_end(state, "]]>", m.end())

        # rule 4
        if marker.startswith("<!"):
            return _parse_html_to_end(state, ">", m.end())

        close_tag = None
        open_tag = None
        if marker.startswith("</"):
            close_tag = marker[2:].lower()
            # rule 6
            if close_tag in BLOCK_TAGS:
                return _parse_html_to_newline(state, self.BLANK_LINE)
        else:
            open_tag = marker[1:].lower()
            # rule 1
            if open_tag in PRE_TAGS:
                end_tag = "</" + open_tag + ">"
                return _parse_html_to_end(state, end_tag, m.end())
            # rule 6
            if open_tag in BLOCK_TAGS:
                return _parse_html_to_newline(state, self.BLANK_LINE)

        # Blocks of type 7 may not interrupt a paragraph.
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        # rule 7
        start_pos = m.end()
        end_pos = state.find_line_end()
        if (open_tag and _OPEN_TAG_END.match(state.src, start_pos, end_pos)) or (
            close_tag and _CLOSE_TAG_END.match(state.src, start_pos, end_pos)
        ):
            return _parse_html_to_newline(state, self.BLANK_LINE)

        return None

    def parse(self, state: BlockState, rules: Optional[List[str]] = None) -> None:
        """Tokenise ``state.src`` from ``state.cursor`` to ``state.cursor_max``.

        :param state: block state that receives the tokens; its cursor is
            moved to ``cursor_max``.
        :param rules: rule names handed to ``compile_sc`` to build the
            scanner.
        """
        sc = self.compile_sc(rules)

        while state.cursor < state.cursor_max:
            m = sc.search(state.src, state.cursor)
            if not m:
                break

            # Text skipped before the match belongs to a paragraph.
            end_pos = m.start()
            if end_pos > state.cursor:
                text = state.get_text(end_pos)
                state.add_paragraph(text)
                state.cursor = end_pos

            end_pos2 = self.parse_method(m, state)
            if end_pos2:
                state.cursor = end_pos2
            else:
                # The rule declined the match: keep the line as paragraph text.
                end_pos3 = state.find_line_end()
                text = state.get_text(end_pos3)
                state.add_paragraph(text)
                state.cursor = end_pos3

        # Whatever is left after the last match is paragraph text as well.
        if state.cursor < state.cursor_max:
            text = state.src[state.cursor :]
            state.add_paragraph(text)
            state.cursor = state.cursor_max

470 

471 

def _parse_html_to_end(state: BlockState, end_marker: str, start_pos: int) -> int:
    """Append a ``block_html`` token that runs through the line holding ``end_marker``.

    :param state: block state to read from; its cursor is moved to the
        marker position when the marker is found.
    :param end_marker: literal text that terminates the HTML block, such as
        ``-->`` or ``</pre>``. It is matched without regard to ASCII letter
        case, because CommonMark defines the closing ``</pre>``,
        ``</script>``, ``</style>`` and ``</textarea>`` tags as
        case-insensitive.
    :param start_pos: position in ``state.src`` where the search starts.
    :returns: position after the block: the end of the marker's line, or
        ``state.cursor_max`` when the marker never occurs.
    """
    # re.ASCII keeps the case folding to A-Z so that non-ASCII look-alikes
    # (e.g. U+017F for "s") can never terminate the block.
    closing = re.compile(re.escape(end_marker), re.IGNORECASE | re.ASCII)
    found = closing.search(state.src, start_pos)
    if found is None:
        # Unterminated block: the rest of the source is raw HTML.
        text = state.src[state.cursor :]
        end_pos = state.cursor_max
    else:
        marker_pos = found.start()
        text = state.get_text(marker_pos)
        # Move to the marker so the rest of its line is included as well.
        state.cursor = marker_pos
        end_pos = state.find_line_end()
        text += state.get_text(end_pos)

    state.append_token({"type": "block_html", "raw": text})
    return end_pos

485 

486 

def _parse_html_to_newline(state: BlockState, newline: Pattern[str]) -> int:
    """Append a ``block_html`` token that ends where ``newline`` first matches.

    :param state: block state to read from; searching starts at its cursor.
    :param newline: compiled pattern marking the end of the HTML block.
    :returns: start position of the pattern's first match, or
        ``state.cursor_max`` when it never matches (the rest of the source
        then becomes the token's raw text).
    """
    terminator = newline.search(state.src, state.cursor)
    if terminator is None:
        end_pos = state.cursor_max
        raw = state.src[state.cursor :]
    else:
        end_pos = terminator.start()
        raw = state.get_text(end_pos)

    state.append_token({"type": "block_html", "raw": raw})
    return end_pos