Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/mistune/block_parser.py: 100%

265 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-01 06:54 +0000

1import re 

2from typing import Optional, List, Tuple, Match 

3from .util import ( 

4 unikey, 

5 escape_url, 

6 expand_tab, 

7 expand_leading_tab, 

8) 

9from .core import Parser, BlockState 

10from .helpers import ( 

11 LINK_LABEL, 

12 HTML_TAGNAME, 

13 HTML_ATTRIBUTES, 

14 BLOCK_TAGS, 

15 PRE_TAGS, 

16 unescape_char, 

17 parse_link_href, 

18 parse_link_title, 

19) 

20from .list_parser import parse_list, LIST_PATTERN 

21 

# One to four leading spaces at the start of every line; removed by
# ``parse_indent_code`` to dedent an indented code block.
_INDENT_CODE_TRIM = re.compile(r'^ {1,4}', re.MULTILINE)
# A closing run of ``#`` (at the very start or preceded by whitespace) with
# optional trailing whitespace; removed by ``parse_axt_heading``.
_AXT_HEADING_TRIM = re.compile(r'(\s+|^)#+\s*$')
# At most one space at the start of every line of block quote content.
_BLOCK_QUOTE_TRIM = re.compile(r'^ ?', re.MULTILINE)
# Optional spaces followed by the ``>`` marker at the start of every line.
_BLOCK_QUOTE_LEADING = re.compile(r'^ *>', re.MULTILINE)

# Text whose last line is blank (newline, optional spaces/tabs, newline).
_LINE_BLANK_END = re.compile(r'\n[ \t]*\n$')
# Optional spaces/tabs followed by a newline, i.e. the rest of a line is blank.
_BLANK_TO_LINE = re.compile(r'[ \t]*\n')

# Alternation of every block-level and "pre"-style HTML tag name.
_BLOCK_TAGS_PATTERN = '|'.join(['|'.join(BLOCK_TAGS), '|'.join(PRE_TAGS)])
# Remainder of a line holding only an opening tag: attributes, ``>``, blanks.
_OPEN_TAG_END = re.compile(HTML_ATTRIBUTES + r'[ \t]*>[ \t]*(?:\n|$)')
# Remainder of a line holding only a closing tag: ``>`` and blanks.
_CLOSE_TAG_END = re.compile(r'[ \t]*>[ \t]*(?:\n|$)')
# One or more consecutive lines that each start with up to 3 spaces and ``>``.
_STRICT_BLOCK_QUOTE = re.compile(r'( {0,3}>[^\n]*(?:\n|$))+')

34 

35 

class BlockParser(Parser):
    """Block-level Markdown parser.

    Every rule named in ``SPECIFICATION`` is paired with a ``parse_<name>``
    method of this class. A parse method receives the regex match and the
    current :class:`BlockState`, appends the tokens it recognises to the
    state and returns the position at which parsing should resume. A falsy
    return value tells :meth:`parse` that the match was not that construct
    after all, in which case the line is added as paragraph text.
    """

    BLANK_LINE = re.compile(r'(^[ \t\v\f]*\n)+', re.M)

    RAW_HTML = (
        r'^ {0,3}('
        r'</?' + HTML_TAGNAME + r'|'
        r'<!--|'  # comment
        r'<\?|'  # script
        r'<![A-Z]|'
        r'<!\[CDATA\[)'
    )

    BLOCK_HTML = (
        r'^ {0,3}(?:'
        r'(?:</?' + _BLOCK_TAGS_PATTERN + r'(?:[ \t]+|\n|$))'
        r'|<!--'  # comment
        r'|<\?'  # script
        r'|<![A-Z]'
        r'|<!\[CDATA\[)'
    )

    # rule name -> regular expression source that starts the block
    SPECIFICATION = {
        'blank_line': r'(^[ \t\v\f]*\n)+',
        'axt_heading': r'^ {0,3}(?P<axt_1>#{1,6})(?!#+)(?P<axt_2>[ \t]*|[ \t]+.*?)$',
        'setex_heading': r'^ {0,3}(?P<setext_1>=|-){1,}[ \t]*$',
        'fenced_code': (
            r'^(?P<fenced_1> {0,3})(?P<fenced_2>`{3,}|~{3,})'
            r'[ \t]*(?P<fenced_3>.*?)$'
        ),
        'indent_code': (
            r'^(?: {4}| *\t)[^\n]+(?:\n+|$)'
            r'((?:(?: {4}| *\t)[^\n]+(?:\n+|$))|\s)*'
        ),
        'thematic_break': r'^ {0,3}((?:-[ \t]*){3,}|(?:_[ \t]*){3,}|(?:\*[ \t]*){3,})$',
        'ref_link': r'^ {0,3}\[(?P<reflink_1>' + LINK_LABEL + r')\]:',
        'block_quote': r'^ {0,3}>(?P<quote_1>.*?)$',
        'list': LIST_PATTERN,
        'block_html': BLOCK_HTML,
        'raw_html': RAW_HTML,
    }

    # rules enabled by default, in the order they are listed here
    DEFAULT_RULES = (
        'fenced_code',
        'indent_code',
        'axt_heading',
        'setex_heading',
        'thematic_break',
        'block_quote',
        'list',
        'ref_link',
        'raw_html',
        'blank_line',
    )

    def __init__(
        self,
        block_quote_rules: Optional[List[str]] = None,
        list_rules: Optional[List[str]] = None,
        max_nested_level: int = 6
    ):
        """Create a block parser.

        :param block_quote_rules: rule names used to parse the content of a
            block quote; defaults to a copy of ``DEFAULT_RULES``.
        :param list_rules: rule names stored as ``self.list_rules``
            (presumably consumed by the list parser); defaults to a copy of
            ``DEFAULT_RULES``.
        :param max_nested_level: nesting depth from which block quotes stop
            recognising further nested block quotes.
        """
        super().__init__()

        if block_quote_rules is None:
            block_quote_rules = list(self.DEFAULT_RULES)

        if list_rules is None:
            list_rules = list(self.DEFAULT_RULES)

        self.block_quote_rules = block_quote_rules
        self.list_rules = list_rules
        self.max_nested_level = max_nested_level
        # register default parse methods
        self._methods = {
            name: getattr(self, 'parse_' + name) for name in self.SPECIFICATION
        }

    def parse_blank_line(self, m: Match, state: BlockState) -> int:
        """Parse token for blank lines."""
        state.append_token({'type': 'blank_line'})
        return m.end()

    def parse_thematic_break(self, m: Match, state: BlockState) -> int:
        """Parse token for thematic break, e.g. ``<hr>`` tag in HTML."""
        state.append_token({'type': 'thematic_break'})
        # $ does not count '\n'
        return m.end() + 1

    def parse_indent_code(self, m: Match, state: BlockState) -> int:
        """Parse token for code block which is indented by 4 spaces.

        When ``state.append_paragraph()`` reports a position, the matched
        text is treated as part of the paragraph and that position is
        returned without emitting a code token.
        """
        # it is a part of the paragraph
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        code = m.group(0)
        code = expand_leading_tab(code)
        code = _INDENT_CODE_TRIM.sub('', code)
        code = code.strip('\n')
        state.append_token({'type': 'block_code', 'raw': code, 'style': 'indent'})
        return m.end()

    def parse_fenced_code(self, m: Match, state: BlockState) -> Optional[int]:
        """Parse token for fenced code block. A fenced code block is started with
        3 or more backtick(`) or tilde(~).

        An example of a fenced code block:

        .. code-block:: markdown

            ```python
            def markdown(text):
                return mistune.html(text)
            ```

        :returns: the position after the closing fence, the end of the source
            when the fence is never closed, or ``None`` when a backtick fence
            has a backtick in its info string.
        """
        spaces = m.group('fenced_1')
        marker = m.group('fenced_2')
        info = m.group('fenced_3')

        c = marker[0]
        if info and c == '`':
            # CommonMark Example 145
            # Info strings for backtick code blocks cannot contain backticks
            if c in info:
                return None

        # the closing fence uses the same character and is at least as long
        _end = re.compile(
            r'^ {0,3}' + c + '{' + str(len(marker)) + r',}[ \t]*(?:\n|$)', re.M)
        # skip the newline that ends the opening fence line
        cursor_start = m.end() + 1

        m2 = _end.search(state.src, cursor_start)
        if m2:
            code = state.src[cursor_start:m2.start()]
            end_pos = m2.end()
        else:
            # an unclosed fence runs to the end of the source
            code = state.src[cursor_start:]
            end_pos = state.cursor_max

        if spaces and code:
            # remove the indentation of the opening fence from every code line
            _trim_pattern = re.compile('^ {0,' + str(len(spaces)) + '}', re.M)
            code = _trim_pattern.sub('', code)

        token = {'type': 'block_code', 'raw': code, 'style': 'fenced', 'marker': marker}
        if info:
            info = unescape_char(info)
            token['attrs'] = {'info': info.strip()}

        state.append_token(token)
        return end_pos

    def parse_axt_heading(self, m: Match, state: BlockState) -> int:
        """Parse token for AXT heading. An AXT heading is started with 1 to 6
        symbol of ``#``."""
        level = len(m.group('axt_1'))
        text = m.group('axt_2').strip()
        # remove last #
        if text:
            text = _AXT_HEADING_TRIM.sub('', text)

        token = {'type': 'heading', 'text': text, 'attrs': {'level': level}, 'style': 'axt'}
        state.append_token(token)
        # $ does not count '\n'
        return m.end() + 1

    def parse_setex_heading(self, m: Match, state: BlockState) -> Optional[int]:
        """Parse token for setex style heading. A setex heading syntax looks like:

        .. code-block:: markdown

            H1 title
            ========

        The underline turns the preceding paragraph token into a heading.
        Without a preceding paragraph the line is retried as a thematic break
        or a list; ``None`` is returned when neither matches.
        """
        last_token = state.last_token()
        if last_token and last_token['type'] == 'paragraph':
            level = 1 if m.group('setext_1') == '=' else 2
            last_token['type'] = 'heading'
            last_token['style'] = 'setext'
            last_token['attrs'] = {'level': level}
            return m.end() + 1

        sc = self.compile_sc(['thematic_break', 'list'])
        fallback = sc.match(state.src, state.cursor)
        if fallback:
            return self.parse_method(fallback, state)
        return None

    def parse_ref_link(self, m: Match, state: BlockState) -> Optional[int]:
        """Parse link references and save the link information into ``state.env``.

        Here is an example of a link reference:

        .. code-block:: markdown

            a [link][example]

            [example]: https://example.com "Optional title"

        This method will save the link reference into ``state.env`` as::

            state.env['ref_links']['example'] = {
                'url': 'https://example.com',
                'title': "Optional title",
            }

        The stored entry also carries the original ``label``. An existing
        entry for the same key is kept, so the first definition wins.

        :returns: the position after the definition, or ``None`` when the
            text is not a valid link reference definition.
        """
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        label = m.group('reflink_1')
        key = unikey(label)
        if not key:
            return None

        href, href_pos = parse_link_href(state.src, m.end(), block=True)
        if href is None:
            return None

        # the title is only looked for up to the next blank line
        _blank = self.BLANK_LINE.search(state.src, href_pos)
        if _blank:
            max_pos = _blank.start()
        else:
            max_pos = state.cursor_max

        title, title_pos = parse_link_title(state.src, href_pos, max_pos)
        if title_pos:
            # only blanks may follow the title on its line
            tail = _BLANK_TO_LINE.match(state.src, title_pos)
            if tail:
                title_pos = tail.end()
            else:
                title_pos = None
                title = None

        if title_pos is None:
            # no usable title: only blanks may follow the destination
            tail = _BLANK_TO_LINE.match(state.src, href_pos)
            if tail:
                href_pos = tail.end()
            else:
                href_pos = None
                href = None

        end_pos = title_pos or href_pos
        if not end_pos:
            return None

        if key not in state.env['ref_links']:
            href = unescape_char(href)
            data = {'url': escape_url(href), 'label': label}
            if title:
                data['title'] = title
            state.env['ref_links'][key] = data
        return end_pos

    def extract_block_quote(self, m: Match, state: BlockState) -> Tuple[str, Optional[int]]:
        """Extract text and cursor end position of a block quote.

        ``state.cursor`` is advanced past the lines collected for the quote.

        :returns: ``(text, end_pos)`` where ``text`` is the quote content with
            the ``>`` markers removed and tabs expanded. ``end_pos`` is the
            position returned by the block rule that interrupted the quote,
            or ``None`` when no such rule produced a position.
        """

        # cleanup at first to detect if it is code block
        text = m.group('quote_1') + '\n'
        text = expand_leading_tab(text, 3)
        text = _BLOCK_QUOTE_TRIM.sub('', text)

        # after a blank line or the start of a code block, only lines that
        # carry their own ``>`` marker are collected
        sc = self.compile_sc(['blank_line', 'indent_code', 'fenced_code'])
        require_marker = bool(sc.match(text))

        state.cursor = m.end() + 1

        end_pos = None
        if require_marker:
            m = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
            if m:
                quote = m.group(0)
                quote = _BLOCK_QUOTE_LEADING.sub('', quote)
                quote = expand_leading_tab(quote, 3)
                quote = _BLOCK_QUOTE_TRIM.sub('', quote)
                text += quote
                state.cursor = m.end()
        else:
            prev_blank_line = False
            break_sc = self.compile_sc([
                'blank_line', 'thematic_break', 'fenced_code',
                'list', 'block_html',
            ])
            while state.cursor < state.cursor_max:
                m = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
                if m:
                    quote = m.group(0)
                    quote = _BLOCK_QUOTE_LEADING.sub('', quote)
                    quote = expand_leading_tab(quote, 3)
                    quote = _BLOCK_QUOTE_TRIM.sub('', quote)
                    text += quote
                    state.cursor = m.end()
                    if not quote.strip():
                        prev_blank_line = True
                    else:
                        prev_blank_line = bool(_LINE_BLANK_END.search(quote))
                    continue

                if prev_blank_line:
                    # CommonMark Example 249
                    # because of laziness, a blank line is needed between
                    # a block quote and a following paragraph
                    break

                m = break_sc.match(state.src, state.cursor)
                if m:
                    end_pos = self.parse_method(m, state)
                    if end_pos:
                        break

                # lazy continuation line
                pos = state.find_line_end()
                line = state.get_text(pos)
                line = expand_leading_tab(line, 3)
                text += line
                state.cursor = pos

        # according to CommonMark Example 6, the second tab should be
        # treated as 4 spaces
        return expand_tab(text), end_pos

    def parse_block_quote(self, m: Match, state: BlockState) -> int:
        """Parse token for block quote. Here is an example of the syntax:

        .. code-block:: markdown

            > a block quote starts
            > with right arrows

        The quote content is parsed in a child state with
        ``self.block_quote_rules``. Once ``state.depth()`` reaches
        ``max_nested_level - 1`` the ``block_quote`` rule is left out, so no
        deeper quote is recognised.
        """
        text, end_pos = self.extract_block_quote(m, state)
        # scan children state
        child = state.child_state(text)
        if state.depth() >= self.max_nested_level - 1:
            # Filter instead of ``list.remove``: custom ``block_quote_rules``
            # need not contain 'block_quote', and ``remove`` would raise
            # ``ValueError`` in that case.
            rules = [name for name in self.block_quote_rules if name != 'block_quote']
        else:
            rules = self.block_quote_rules

        self.parse(child, rules)
        token = {'type': 'block_quote', 'children': child.tokens}
        if end_pos:
            # The interrupting rule has already added its token, so the quote
            # token goes through ``prepend_token`` (presumably placing it
            # before that token -- confirm against ``BlockState``).
            state.prepend_token(token)
            return end_pos
        state.append_token(token)
        return state.cursor

    def parse_list(self, m: Match, state: BlockState) -> int:
        """Parse tokens for ordered and unordered list."""
        return parse_list(self, m, state)

    def parse_block_html(self, m: Match, state: BlockState) -> Optional[int]:
        """Parse a block HTML match; handled exactly like ``parse_raw_html``."""
        return self.parse_raw_html(m, state)

    def parse_raw_html(self, m: Match, state: BlockState) -> Optional[int]:
        """Parse token for an HTML block.

        The "rule N" comments below follow the numbering of the HTML block
        start conditions in the CommonMark specification.

        :returns: the position after the HTML block, or ``None`` when the
            matched tag does not start an HTML block.
        """
        marker = m.group(0).strip()

        # rule 2
        if marker == '<!--':
            return _parse_html_to_end(state, '-->', m.end())

        # rule 3
        if marker == '<?':
            return _parse_html_to_end(state, '?>', m.end())

        # rule 5
        if marker == '<![CDATA[':
            return _parse_html_to_end(state, ']]>', m.end())

        # rule 4
        if marker.startswith('<!'):
            return _parse_html_to_end(state, '>', m.end())

        close_tag = None
        open_tag = None
        if marker.startswith('</'):
            close_tag = marker[2:].lower()
            # rule 6
            if close_tag in BLOCK_TAGS:
                return _parse_html_to_newline(state, self.BLANK_LINE)
        else:
            open_tag = marker[1:].lower()
            # rule 1
            if open_tag in PRE_TAGS:
                end_tag = '</' + open_tag + '>'
                # ``open_tag`` was lower-cased above, but the closing tag in
                # the source may use any case (``<PRE>`` ... ``</PRE>``).
                # Locate it ignoring ASCII case, then search for it exactly as
                # written so the block ends at that closing tag.
                found = re.compile(re.escape(end_tag), re.I | re.A).search(
                    state.src, m.end())
                if found:
                    end_tag = found.group(0)
                return _parse_html_to_end(state, end_tag, m.end())
            # rule 6
            if open_tag in BLOCK_TAGS:
                return _parse_html_to_newline(state, self.BLANK_LINE)

        # Blocks of type 7 may not interrupt a paragraph.
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        # rule 7
        start_pos = m.end()
        end_pos = state.find_line_end()
        if (open_tag and _OPEN_TAG_END.match(state.src, start_pos, end_pos)) or \
           (close_tag and _CLOSE_TAG_END.match(state.src, start_pos, end_pos)):
            return _parse_html_to_newline(state, self.BLANK_LINE)
        return None

    def parse(self, state: BlockState, rules: Optional[List[str]] = None) -> None:
        """Tokenize ``state.src`` from ``state.cursor`` to ``state.cursor_max``.

        :param state: block state that receives the tokens; its cursor is
            moved to ``cursor_max``.
        :param rules: rule names passed to ``compile_sc``.
        """
        sc = self.compile_sc(rules)

        while state.cursor < state.cursor_max:
            m = sc.search(state.src, state.cursor)
            if not m:
                break

            # text before the next block start is paragraph text
            end_pos = m.start()
            if end_pos > state.cursor:
                text = state.get_text(end_pos)
                state.add_paragraph(text)
                state.cursor = end_pos

            end_pos = self.parse_method(m, state)
            if end_pos:
                state.cursor = end_pos
            else:
                # the rule rejected the match: keep the line as paragraph text
                end_pos = state.find_line_end()
                text = state.get_text(end_pos)
                state.add_paragraph(text)
                state.cursor = end_pos

        # whatever is left after the last block is paragraph text
        if state.cursor < state.cursor_max:
            text = state.src[state.cursor:]
            state.add_paragraph(text)
            state.cursor = state.cursor_max

459 

460 

def _parse_html_to_end(state, end_marker, start_pos):
    """Append a ``block_html`` token that runs up to ``end_marker``.

    ``end_marker`` is searched in ``state.src`` from ``start_pos``. When it is
    found, the token covers the text from ``state.cursor`` to the end of the
    line holding the marker, and ``state.cursor`` is moved to the marker.
    When it is missing, the token covers everything from ``state.cursor`` to
    the end of the source.

    :returns: the position at which the HTML block ends.
    """
    found_at = state.src.find(end_marker, start_pos)
    if found_at < 0:
        # unterminated block: take the remainder of the source
        raw = state.src[state.cursor:]
        stop = state.cursor_max
    else:
        before_marker = state.get_text(found_at)
        state.cursor = found_at
        # include the rest of the line on which the marker sits
        stop = state.find_line_end()
        raw = before_marker + state.get_text(stop)

    state.append_token({'type': 'block_html', 'raw': raw})
    return stop

474 

475 

def _parse_html_to_newline(state, newline):
    """Append a ``block_html`` token that runs up to the next ``newline`` match.

    ``newline`` is a compiled pattern searched in ``state.src`` from
    ``state.cursor``. The token covers the text from the cursor to the start
    of the match, or to the end of the source when nothing matches.

    :returns: the position at which the HTML block ends.
    """
    hit = newline.search(state.src, state.cursor)
    if not hit:
        # no terminator: the block extends to the end of the source
        raw = state.src[state.cursor:]
        stop = state.cursor_max
    else:
        stop = hit.start()
        raw = state.get_text(stop)

    state.append_token({'type': 'block_html', 'raw': raw})
    return stop