1import re
2from typing import Optional, List, Tuple, Match, Pattern
3from .util import (
4 unikey,
5 escape_url,
6 expand_tab,
7 expand_leading_tab,
8)
9from .core import Parser, BlockState
10from .helpers import (
11 LINK_LABEL,
12 HTML_TAGNAME,
13 HTML_ATTRIBUTES,
14 BLOCK_TAGS,
15 PRE_TAGS,
16 unescape_char,
17 parse_link_href,
18 parse_link_title,
19)
20from .list_parser import parse_list, LIST_PATTERN
21
# Matches one to four leading spaces at the start of every line; used to
# strip the indentation of an indented code block.
_INDENT_CODE_TRIM = re.compile(r"^ {1,4}", flags=re.M)
# Matches a trailing run of "#" (with surrounding whitespace) at the end of
# ATX heading text, i.e. the optional closing sequence.
_ATX_HEADING_TRIM = re.compile(r"(\s+|^)#+\s*$")
# Matches a single optional space at the start of every line; used to drop
# the space that may follow a ">" block quote marker.
_BLOCK_QUOTE_TRIM = re.compile(r"^ ?", flags=re.M)
# Matches the leading spaces and the ">" marker at the start of every line.
_BLOCK_QUOTE_LEADING = re.compile(r"^ *>", flags=re.M)

# Matches text that ends with a blank (whitespace-only) line.
_LINE_BLANK_END = re.compile(r"\n[ \t]*\n$")
# Matches optional spaces/tabs followed by a newline, i.e. the rest of a line
# that contains nothing else.
_BLANK_TO_LINE = re.compile(r"[ \t]*\n")

# Alternation of every block-level and "pre"-like HTML tag name.
_BLOCK_TAGS_PATTERN = "(" + "|".join(BLOCK_TAGS) + "|" + "|".join(PRE_TAGS) + ")"
# Remainder of an opening tag: attributes, ">" and nothing else on the line.
_OPEN_TAG_END = re.compile(HTML_ATTRIBUTES + r"[ \t]*>[ \t]*(?:\n|$)")
# Remainder of a closing tag: ">" and nothing else on the line.
_CLOSE_TAG_END = re.compile(r"[ \t]*>[ \t]*(?:\n|$)")
# One or more consecutive lines that each start with a ">" marker
# (indented by at most three spaces).
_STRICT_BLOCK_QUOTE = re.compile(r"( {0,3}>[^\n]*(?:\n|$))+")
34
35
class BlockParser(Parser[BlockState]):
    """Block-level Markdown parser.

    Every rule name in :attr:`SPECIFICATION` maps to a regular expression and
    is handled by the ``parse_<name>`` method registered in ``__init__``.
    A handler receives the regex match and the current :class:`BlockState`
    and returns the position where parsing should resume. Handlers typed as
    ``Optional[int]`` may return ``None``; :meth:`parse` then treats the
    current line as paragraph text instead.
    """

    state_cls = BlockState

    # One or more consecutive blank lines.
    BLANK_LINE = re.compile(r"(^[ \t\v\f]*\n)+", re.M)

    # Start of any raw HTML construct (any tag name is accepted).
    RAW_HTML = (
        r"^ {0,3}("
        r"</?" + HTML_TAGNAME + r"|"
        r"<!--|"  # comment
        r"<\?|"  # processing instruction
        r"<![A-Z]|"  # declaration
        r"<!\[CDATA\[)"  # CDATA section
    )

    # Start of an HTML construct that uses one of the known block/pre tags.
    BLOCK_HTML = (
        r"^ {0,3}(?:"
        r"(?:</?" + _BLOCK_TAGS_PATTERN + r"(?:[ \t]+|\n|$))"
        r"|<!--"  # comment
        r"|<\?"  # processing instruction
        r"|<![A-Z]"  # declaration
        r"|<!\[CDATA\[)"  # CDATA section
    )

    # Rule name -> regular expression source. The rule names double as the
    # suffix of the ``parse_<name>`` handler methods, so they must not change.
    SPECIFICATION = {
        "blank_line": r"(^[ \t\v\f]*\n)+",
        "atx_heading": r"^ {0,3}(?P<atx_1>#{1,6})(?!#+)(?P<atx_2>[ \t]*|[ \t]+.*?)$",
        "setex_heading": r"^ {0,3}(?P<setext_1>=|-){1,}[ \t]*$",
        "fenced_code": (
            r"^(?P<fenced_1> {0,3})(?P<fenced_2>`{3,}|~{3,})"
            r"[ \t]*(?P<fenced_3>.*?)$"
        ),
        "indent_code": (
            r"^(?: {4}| *\t)[^\n]+(?:\n+|$)"
            r"((?:(?: {4}| *\t)[^\n]+(?:\n+|$))|\s)*"
        ),
        "thematic_break": r"^ {0,3}((?:-[ \t]*){3,}|(?:_[ \t]*){3,}|(?:\*[ \t]*){3,})$",
        "ref_link": r"^ {0,3}\[(?P<reflink_1>" + LINK_LABEL + r")\]:",
        "block_quote": r"^ {0,3}>(?P<quote_1>.*?)$",
        "list": LIST_PATTERN,
        "block_html": BLOCK_HTML,
        "raw_html": RAW_HTML,
    }

    # Rules that are active by default, in the order they are listed.
    DEFAULT_RULES = (
        "fenced_code",
        "indent_code",
        "atx_heading",
        "setex_heading",
        "thematic_break",
        "block_quote",
        "list",
        "ref_link",
        "raw_html",
        "blank_line",
    )

    def __init__(
        self,
        block_quote_rules: Optional[List[str]] = None,
        list_rules: Optional[List[str]] = None,
        max_nested_level: int = 6,
    ):
        """Create a block parser.

        :param block_quote_rules: rule names used for the content of a block
            quote; defaults to a copy of :attr:`DEFAULT_RULES`.
        :param list_rules: rule names stored for list content; defaults to a
            copy of :attr:`DEFAULT_RULES`.
        :param max_nested_level: nesting depth at which block quotes stop
            being parsed inside other block quotes.
        """
        super().__init__()

        if block_quote_rules is None:
            block_quote_rules = list(self.DEFAULT_RULES)

        if list_rules is None:
            list_rules = list(self.DEFAULT_RULES)

        self.block_quote_rules = block_quote_rules
        self.list_rules = list_rules
        self.max_nested_level = max_nested_level
        # register default parse methods
        self._methods = {name: getattr(self, "parse_" + name) for name in self.SPECIFICATION}

    def parse_blank_line(self, m: Match[str], state: BlockState) -> int:
        """Parse token for blank lines."""
        state.append_token({"type": "blank_line"})
        return m.end()

    def parse_thematic_break(self, m: Match[str], state: BlockState) -> int:
        """Parse token for thematic break, e.g. ``<hr>`` tag in HTML."""
        state.append_token({"type": "thematic_break"})
        # ``$`` does not consume the newline, so step over it
        return m.end() + 1

    def parse_indent_code(self, m: Match[str], state: BlockState) -> int:
        """Parse token for code block which is indented by 4 spaces."""
        # An indented line directly after paragraph text is part of that
        # paragraph; ``append_paragraph`` returns a position in that case.
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        code = m.group(0)
        code = expand_leading_tab(code)
        code = _INDENT_CODE_TRIM.sub("", code)
        code = code.strip("\n")
        state.append_token({"type": "block_code", "raw": code, "style": "indent"})
        return m.end()

    def parse_fenced_code(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse token for fenced code block. A fenced code block is started with
        3 or more backtick(`) or tilde(~).

        An example of a fenced code block:

        .. code-block:: markdown

            ```python
            def markdown(text):
                return mistune.html(text)
            ```

        Returns the position after the closing fence (or the end of the
        source when the fence is never closed), or ``None`` when the match is
        not a valid opening fence.
        """
        spaces = m.group("fenced_1")
        marker = m.group("fenced_2")
        info = m.group("fenced_3")

        c = marker[0]
        # CommonMark Example 145
        # Info strings for backtick code blocks cannot contain backticks
        if info and c == "`" and c in info:
            return None

        # The closing fence uses the same character and is at least as long
        # as the opening fence.
        _end = re.compile(r"^ {0,3}" + c + "{" + str(len(marker)) + r",}[ \t]*(?:\n|$)", re.M)
        # skip the newline that ends the opening fence line
        cursor_start = m.end() + 1

        m2 = _end.search(state.src, cursor_start)
        if m2:
            code = state.src[cursor_start : m2.start()]
            end_pos = m2.end()
        else:
            # unclosed fence: everything up to the end of the source is code
            code = state.src[cursor_start:]
            end_pos = state.cursor_max

        if spaces and code:
            # remove up to as many leading spaces as the opening fence had
            _trim_pattern = re.compile("^ {0," + str(len(spaces)) + "}", re.M)
            code = _trim_pattern.sub("", code)

        token = {"type": "block_code", "raw": code, "style": "fenced", "marker": marker}
        if info:
            info = unescape_char(info)
            token["attrs"] = {"info": info.strip()}

        state.append_token(token)
        return end_pos

    def parse_atx_heading(self, m: Match[str], state: BlockState) -> int:
        """Parse token for ATX heading. An ATX heading starts with 1 to 6
        ``#`` symbols."""
        level = len(m.group("atx_1"))
        text = m.group("atx_2").strip()
        # remove the optional closing sequence of #
        if text:
            text = _ATX_HEADING_TRIM.sub("", text)

        token = {"type": "heading", "text": text, "attrs": {"level": level}, "style": "atx"}
        state.append_token(token)
        return m.end() + 1

    def parse_setex_heading(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse token for setext style heading. A setext heading looks like:

        .. code-block:: markdown

            H1 title
            ========

        The underline only forms a heading when the previous token is a
        paragraph; that token is then converted in place. Otherwise the line
        is retried as a thematic break or a list, and ``None`` is returned if
        neither matches.
        """
        last_token = state.last_token()
        if last_token and last_token["type"] == "paragraph":
            level = 1 if m.group("setext_1") == "=" else 2
            last_token["type"] = "heading"
            last_token["style"] = "setext"
            last_token["attrs"] = {"level": level}
            return m.end() + 1

        sc = self.compile_sc(["thematic_break", "list"])
        m2 = sc.match(state.src, state.cursor)
        if m2:
            return self.parse_method(m2, state)
        return None

    def parse_ref_link(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse link references and save the link information into ``state.env``.

        Here is an example of a link reference:

        .. code-block:: markdown

            a [link][example]

            [example]: https://example.com "Optional title"

        This method will save the link reference into ``state.env`` as::

            state.env['ref_links']['example'] = {
                'url': 'https://example.com',
                'label': 'example',
                'title': "Optional title",
            }

        The first definition of a key wins; later definitions of the same key
        are consumed but not stored. Returns ``None`` when the text is not a
        valid link reference definition.
        """
        # A definition cannot interrupt a paragraph.
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        label = m.group("reflink_1")
        key = unikey(label)
        if not key:
            return None

        href, href_pos = parse_link_href(state.src, m.end(), block=True)
        if href is None:
            return None

        assert href_pos is not None

        # The title must appear before the next blank line.
        _blank = self.BLANK_LINE.search(state.src, href_pos)
        if _blank:
            max_pos = _blank.start()
        else:
            max_pos = state.cursor_max

        title, title_pos = parse_link_title(state.src, href_pos, max_pos)
        if title_pos:
            # Only whitespace may follow the title on its line; otherwise
            # the title is discarded.
            m2 = _BLANK_TO_LINE.match(state.src, title_pos)
            if m2:
                title_pos = m2.end()
            else:
                title_pos = None
                title = None

        if title_pos is None:
            # Without a usable title, the destination itself must be
            # followed by nothing but whitespace on its line.
            m3 = _BLANK_TO_LINE.match(state.src, href_pos)
            if m3:
                href_pos = m3.end()
            else:
                href_pos = None
                href = None

        end_pos = title_pos or href_pos
        if not end_pos:
            return None

        if key not in state.env["ref_links"]:
            assert href is not None
            href = unescape_char(href)
            data = {"url": escape_url(href), "label": label}
            if title:
                data["title"] = title
            state.env["ref_links"][key] = data
        return end_pos

    @staticmethod
    def _strip_quote_markers(quote: str) -> str:
        """Remove the ``>`` marker and one following space from each line."""
        quote = _BLOCK_QUOTE_LEADING.sub("", quote)
        quote = expand_leading_tab(quote, 3)
        return _BLOCK_QUOTE_TRIM.sub("", quote)

    def extract_block_quote(self, m: Match[str], state: BlockState) -> Tuple[str, Optional[int]]:
        """Extract text and cursor end position of a block quote.

        ``state.cursor`` is advanced past the lines that belong to the quote.

        :returns: ``(text, end_pos)`` where ``text`` is the quote content
            with the ``>`` markers removed. ``end_pos`` is the position
            returned by the handler of a block that interrupted the quote,
            or ``None`` when no such block was parsed.
        """

        # cleanup at first to detect if it is code block
        text = m.group("quote_1") + "\n"
        text = expand_leading_tab(text, 3)
        text = _BLOCK_QUOTE_TRIM.sub("", text)

        # When the quote starts with a blank line or a code block, following
        # lines only belong to the quote if they carry a ">" marker.
        sc = self.compile_sc(["blank_line", "indent_code", "fenced_code"])
        require_marker = bool(sc.match(text))

        state.cursor = m.end() + 1

        end_pos: Optional[int] = None
        if require_marker:
            m2 = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
            if m2:
                text += self._strip_quote_markers(m2.group(0))
                state.cursor = m2.end()
        else:
            prev_blank_line = False
            # blocks that end the quote instead of continuing it lazily
            break_sc = self.compile_sc(
                [
                    "blank_line",
                    "thematic_break",
                    "fenced_code",
                    "list",
                    "block_html",
                ]
            )
            while state.cursor < state.cursor_max:
                m3 = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
                if m3:
                    quote = self._strip_quote_markers(m3.group(0))
                    text += quote
                    state.cursor = m3.end()
                    if not quote.strip():
                        prev_blank_line = True
                    else:
                        prev_blank_line = bool(_LINE_BLANK_END.search(quote))
                    continue

                if prev_blank_line:
                    # CommonMark Example 249
                    # because of laziness, a blank line is needed between
                    # a block quote and a following paragraph
                    break

                m4 = break_sc.match(state.src, state.cursor)
                if m4:
                    end_pos = self.parse_method(m4, state)
                    if end_pos:
                        break

                # lazy continuation line
                pos = state.find_line_end()
                line = state.get_text(pos)
                line = expand_leading_tab(line, 3)
                text += line
                state.cursor = pos

        # according to CommonMark Example 6, the second tab should be
        # treated as 4 spaces
        return expand_tab(text), end_pos

    def parse_block_quote(self, m: Match[str], state: BlockState) -> int:
        """Parse token for block quote. Here is an example of the syntax:

        .. code-block:: markdown

            > a block quote starts
            > with right arrows

        The quote content is parsed in a child state with
        ``block_quote_rules``. Once the nesting depth reaches
        ``max_nested_level - 1`` the ``block_quote`` rule is left out, so
        deeper ``>`` markers are no longer parsed as quotes.
        """
        text, end_pos = self.extract_block_quote(m, state)
        # scan children state
        child = state.child_state(text)
        if state.depth() >= self.max_nested_level - 1:
            # Filter instead of ``list.remove`` so that custom rule lists
            # without "block_quote" do not raise ValueError.
            rules = [name for name in self.block_quote_rules if name != "block_quote"]
        else:
            rules = self.block_quote_rules

        self.parse(child, rules)
        token = {"type": "block_quote", "children": child.tokens}
        if end_pos:
            # An interrupting block was already handled while extracting the
            # quote, so the quote token is added with ``prepend_token``
            # (presumably placing it before that block's token).
            state.prepend_token(token)
            return end_pos
        state.append_token(token)
        return state.cursor

    def parse_list(self, m: Match[str], state: BlockState) -> int:
        """Parse tokens for ordered and unordered list."""
        return parse_list(self, m, state)

    def parse_block_html(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse a block-level HTML construct; same handling as raw HTML."""
        return self.parse_raw_html(m, state)

    def parse_raw_html(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse an HTML block.

        The "rule N" comments refer to the HTML block start conditions of
        the CommonMark specification. Returns the end position of the HTML
        block, or ``None`` when the match is not an HTML block.
        """
        marker = m.group(0).strip()

        # rule 2
        if marker == "<!--":
            return _parse_html_to_end(state, "-->", m.end())

        # rule 3
        if marker == "<?":
            return _parse_html_to_end(state, "?>", m.end())

        # rule 5
        if marker == "<![CDATA[":
            return _parse_html_to_end(state, "]]>", m.end())

        # rule 4
        if marker.startswith("<!"):
            return _parse_html_to_end(state, ">", m.end())

        close_tag = None
        open_tag = None
        if marker.startswith("</"):
            close_tag = marker[2:].lower()
            # rule 6
            if close_tag in BLOCK_TAGS:
                return _parse_html_to_newline(state, self.BLANK_LINE)
        else:
            open_tag = marker[1:].lower()
            # rule 1
            if open_tag in PRE_TAGS:
                end_tag = "</" + open_tag + ">"
                # Tag names are case-insensitive: the opening tag was
                # lower-cased above, so locate the closing tag ignoring case
                # and hand its exact spelling to the literal search below.
                found = re.compile(re.escape(end_tag), re.I).search(state.src, m.end())
                if found:
                    end_tag = found.group(0)
                return _parse_html_to_end(state, end_tag, m.end())
            # rule 6
            if open_tag in BLOCK_TAGS:
                return _parse_html_to_newline(state, self.BLANK_LINE)

        # Blocks of type 7 may not interrupt a paragraph.
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        # rule 7: a complete open or close tag alone on its line
        start_pos = m.end()
        end_pos = state.find_line_end()
        if (open_tag and _OPEN_TAG_END.match(state.src, start_pos, end_pos)) or (
            close_tag and _CLOSE_TAG_END.match(state.src, start_pos, end_pos)
        ):
            return _parse_html_to_newline(state, self.BLANK_LINE)

        return None

    def parse(self, state: BlockState, rules: Optional[List[str]] = None) -> None:
        """Tokenize ``state.src`` from ``state.cursor`` to ``state.cursor_max``.

        :param state: block state that receives the tokens.
        :param rules: rule names to use; passed on to ``compile_sc``.
        """
        sc = self.compile_sc(rules)

        while state.cursor < state.cursor_max:
            m = sc.search(state.src, state.cursor)
            if not m:
                break

            # text in front of the matched block is paragraph content
            end_pos = m.start()
            if end_pos > state.cursor:
                text = state.get_text(end_pos)
                state.add_paragraph(text)
                state.cursor = end_pos

            end_pos2 = self.parse_method(m, state)
            if end_pos2:
                state.cursor = end_pos2
            else:
                # the handler rejected the match: treat the line as text
                end_pos3 = state.find_line_end()
                text = state.get_text(end_pos3)
                state.add_paragraph(text)
                state.cursor = end_pos3

        # whatever is left after the last match is paragraph content
        if state.cursor < state.cursor_max:
            text = state.src[state.cursor :]
            state.add_paragraph(text)
            state.cursor = state.cursor_max
470
471
def _parse_html_to_end(state: BlockState, end_marker: str, start_pos: int) -> int:
    """Append a ``block_html`` token that runs up to ``end_marker``.

    ``end_marker`` is searched literally in ``state.src`` from ``start_pos``.
    When found, the token covers the text from the current cursor through
    the end of the line that contains the marker (``state.cursor`` is moved
    to the marker as a side effect). When it is missing, the token covers
    everything from the cursor to the end of the source.

    :returns: the position at which the HTML block ends.
    """
    found_at = state.src.find(end_marker, start_pos)

    if found_at == -1:
        # unterminated construct: take the rest of the source
        raw = state.src[state.cursor :]
        stop = state.cursor_max
        state.append_token({"type": "block_html", "raw": raw})
        return stop

    # text in front of the marker, then the rest of the marker's line
    head = state.get_text(found_at)
    state.cursor = found_at
    stop = state.find_line_end()
    raw = head + state.get_text(stop)

    state.append_token({"type": "block_html", "raw": raw})
    return stop
485
486
def _parse_html_to_newline(state: BlockState, newline: Pattern[str]) -> int:
    """Append a ``block_html`` token that runs up to the next ``newline`` match.

    ``newline`` is searched in ``state.src`` from the current cursor. The
    token covers the text before the match, or the rest of the source when
    the pattern does not match.

    :returns: the position at which the HTML block ends.
    """
    boundary = newline.search(state.src, state.cursor)

    if boundary is None:
        raw = state.src[state.cursor :]
        stop = state.cursor_max
    else:
        stop = boundary.start()
        raw = state.get_text(stop)

    state.append_token({"type": "block_html", "raw": raw})
    return stop