1import re
2from typing import Optional, List, Tuple, Match, Pattern
3import string
4from .util import (
5 unikey,
6 escape_url,
7 expand_tab,
8 expand_leading_tab,
9)
10from .core import Parser, BlockState
11from .helpers import (
12 LINK_LABEL,
13 HTML_TAGNAME,
14 HTML_ATTRIBUTES,
15 BLOCK_TAGS,
16 PRE_TAGS,
17 unescape_char,
18 parse_link_href,
19 parse_link_title,
20)
21from .list_parser import parse_list, LIST_PATTERN
22
# Up to four leading spaces on every line; removed from indented code blocks
# after leading tabs have been expanded.
_INDENT_CODE_TRIM = re.compile(r"^ {1,4}", re.MULTILINE)
# Trailing run of ``#`` (with the whitespace around it) at the end of an ATX
# heading's text.
_ATX_HEADING_TRIM = re.compile(r"(\s+|^)#+\s*$")
# A single optional leading space on every line of block quote content.
_BLOCK_QUOTE_TRIM = re.compile(r"^ ?", re.MULTILINE)
# Leading spaces plus the ``>`` marker at the start of every quoted line.
_BLOCK_QUOTE_LEADING = re.compile(r"^ *>", re.MULTILINE)

# Text whose last line is blank (newline, optional spaces/tabs, newline).
_LINE_BLANK_END = re.compile(r"\n[ \t]*\n$")
# Optional spaces/tabs followed by a newline, i.e. "nothing else on this line".
_BLANK_TO_LINE = re.compile(r"[ \t]*\n")

# One capturing alternation over every block-level and "pre"-like tag name.
_BLOCK_TAGS_PATTERN = "({}|{})".format("|".join(BLOCK_TAGS), "|".join(PRE_TAGS))
# Remainder of an opening tag: attributes, then ``>`` as the last thing on the line.
_OPEN_TAG_END = re.compile(HTML_ATTRIBUTES + r"[ \t]*>[ \t]*(?:\n|$)")
# Remainder of a closing tag: ``>`` as the last thing on the line.
_CLOSE_TAG_END = re.compile(r"[ \t]*>[ \t]*(?:\n|$)")
# One or more consecutive lines that each start with an explicit ``>`` marker
# (indented by at most three spaces).
_STRICT_BLOCK_QUOTE = re.compile(r"( {0,3}>[^\n]*(?:\n|$))+")
35
36
class BlockParser(Parser[BlockState]):
    """Block-level Markdown parser.

    Each entry of ``SPECIFICATION`` maps a rule name to the regular expression
    that detects the start of that block; the method named ``parse_<rule>`` is
    registered as its handler. A handler receives the regex match and the
    current ``BlockState`` and returns the position at which parsing should
    resume. Handlers annotated ``Optional[int]`` may return ``None`` to signal
    that the match is not a real block; ``parse`` then treats the current line
    as paragraph text.
    """

    state_cls = BlockState

    BLANK_LINE = re.compile(r"(^[ \t\v\f]*\n)+", re.M)

    RAW_HTML = (
        r"^ {0,3}("
        r"</?" + HTML_TAGNAME + r"|"
        r"<!--|"  # comment
        r"<\?|"  # processing instruction
        r"<![A-Z]|"  # declaration
        r"<!\[CDATA\[)"  # CDATA section
    )

    BLOCK_HTML = (
        r"^ {0,3}(?:"
        r"(?:</?" + _BLOCK_TAGS_PATTERN + r"(?:[ \t]+|\n|$))"
        r"|<!--"  # comment
        r"|<\?"  # processing instruction
        r"|<![A-Z]"  # declaration
        r"|<!\[CDATA\[)"  # CDATA section
    )

    SPECIFICATION = {
        "blank_line": r"(^[ \t\v\f]*\n)+",
        "atx_heading": r"^ {0,3}(?P<atx_1>#{1,6})(?!#+)(?P<atx_2>[ \t]*|[ \t]+.*?)$",
        "setex_heading": r"^ {0,3}(?P<setext_1>=|-){1,}[ \t]*$",
        "fenced_code": (
            r"^(?P<fenced_1> {0,3})(?P<fenced_2>`{3,}|~{3,})"
            r"[ \t]*(?P<fenced_3>.*?)$"
        ),
        "indent_code": (
            r"^(?: {4}| *\t)[^\n]+(?:\n+|$)"
            r"((?:(?: {4}| *\t)[^\n]+(?:\n+|$))|\s)*"
        ),
        "thematic_break": r"^ {0,3}((?:-[ \t]*){3,}|(?:_[ \t]*){3,}|(?:\*[ \t]*){3,})$",
        "ref_link": r"^ {0,3}\[(?P<reflink_1>" + LINK_LABEL + r")\]:",
        "block_quote": r"^ {0,3}>(?P<quote_1>.*?)$",
        "list": LIST_PATTERN,
        "block_html": BLOCK_HTML,
        "raw_html": RAW_HTML,
    }

    DEFAULT_RULES = (
        "fenced_code",
        "indent_code",
        "atx_heading",
        "setex_heading",
        "thematic_break",
        "block_quote",
        "list",
        "ref_link",
        "raw_html",
        "blank_line",
    )

    def __init__(
        self,
        block_quote_rules: Optional[List[str]] = None,
        list_rules: Optional[List[str]] = None,
        max_nested_level: int = 6,
    ):
        """Create a block parser.

        ``block_quote_rules`` and ``list_rules`` are the rule names stored for
        parsing the content of block quotes and lists; each defaults to a copy
        of ``DEFAULT_RULES``. ``max_nested_level`` is the depth at which
        ``parse_block_quote`` stops allowing further nested block quotes.
        """
        super().__init__()

        if block_quote_rules is None:
            block_quote_rules = list(self.DEFAULT_RULES)

        if list_rules is None:
            list_rules = list(self.DEFAULT_RULES)

        self.block_quote_rules = block_quote_rules
        self.list_rules = list_rules
        self.max_nested_level = max_nested_level
        # register default parse methods
        self._methods = {name: getattr(self, "parse_" + name) for name in self.SPECIFICATION}

    def parse_blank_line(self, m: Match[str], state: BlockState) -> int:
        """Parse token for blank lines."""
        state.append_token({"type": "blank_line"})
        return m.end()

    def parse_thematic_break(self, m: Match[str], state: BlockState) -> int:
        """Parse token for thematic break, e.g. ``<hr>`` tag in HTML."""
        state.append_token({"type": "thematic_break"})
        # $ does not count '\n'
        return m.end() + 1

    def parse_indent_code(self, m: Match[str], state: BlockState) -> int:
        """Parse token for code block which is indented by 4 spaces.

        When ``state.append_paragraph()`` returns a position, the matched text
        is a part of the paragraph and that position is returned; otherwise a
        ``block_code`` token with style ``indent`` is appended.
        """
        # it is a part of the paragraph
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        code = m.group(0)
        code = expand_leading_tab(code)
        code = _INDENT_CODE_TRIM.sub("", code)
        code = code.strip("\n")
        state.append_token({"type": "block_code", "raw": code, "style": "indent"})
        return m.end()

    def parse_fenced_code(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse token for fenced code block. A fenced code block is started with
        3 or more backtick(`) or tilde(~).

        An example of a fenced code block:

        .. code-block:: markdown

            ```python
            def markdown(text):
                return mistune.html(text)
            ```

        Returns the position after the closing fence, or the end of the source
        when the fence is never closed. Returns ``None`` when a backtick fence
        has a backtick in its info string, in which case it is not a fence.
        """
        spaces = m.group("fenced_1")
        marker = m.group("fenced_2")
        info = m.group("fenced_3")

        c = marker[0]
        if info and c == "`":
            # CommonMark Example 145
            # Info strings for backtick code blocks cannot contain backticks
            if c in info:
                return None

        # the closing fence uses the same character and is at least as long
        # as the opening fence
        _end = re.compile(r"^ {0,3}" + c + "{" + str(len(marker)) + r",}[ \t]*(?:\n|$)", re.M)
        # $ does not count '\n', skip it to reach the first line of code
        cursor_start = m.end() + 1

        m2 = _end.search(state.src, cursor_start)
        if m2:
            code = state.src[cursor_start : m2.start()]
            end_pos = m2.end()
        else:
            # unclosed fence: everything up to the end of the source is code
            code = state.src[cursor_start:]
            end_pos = state.cursor_max

        if spaces and code:
            # remove at most the opening fence's indentation from every line
            _trim_pattern = re.compile("^ {0," + str(len(spaces)) + "}", re.M)
            code = _trim_pattern.sub("", code)

        token = {"type": "block_code", "raw": code, "style": "fenced", "marker": marker}
        if info:
            info = unescape_char(info)
            token["attrs"] = {"info": info.strip()}

        state.append_token(token)
        return end_pos

    def parse_atx_heading(self, m: Match[str], state: BlockState) -> int:
        """Parse token for ATX heading. An ATX heading starts with 1 to 6
        ``#`` symbols; their count is the heading level."""
        level = len(m.group("atx_1"))
        text = m.group("atx_2").strip(string.whitespace)
        # remove last #
        if text:
            text = _ATX_HEADING_TRIM.sub("", text)

        token = {"type": "heading", "text": text, "attrs": {"level": level}, "style": "atx"}
        state.append_token(token)
        # $ does not count '\n'
        return m.end() + 1

    def parse_setex_heading(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse token for setex style heading. A setex heading syntax looks like:

        .. code-block:: markdown

            H1 title
            ========

        The underline turns the preceding paragraph token into a heading
        (level 1 for ``=``, level 2 for ``-``). Without a preceding paragraph
        the line is retried as a thematic break or a list; ``None`` is returned
        when neither matches.
        """
        last_token = state.last_token()
        if last_token and last_token["type"] == "paragraph":
            level = 1 if m.group("setext_1") == "=" else 2
            last_token["type"] = "heading"
            last_token["style"] = "setext"
            last_token["attrs"] = {"level": level}
            return m.end() + 1

        sc = self.compile_sc(["thematic_break", "list"])
        m2 = sc.match(state.src, state.cursor)
        if m2:
            return self.parse_method(m2, state)
        return None

    def parse_ref_link(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse link references and save the link information into ``state.env``.

        Here is an example of a link reference:

        .. code-block:: markdown

            a [link][example]

            [example]: https://example.com "Optional title"

        This method will save the link reference into ``state.env`` as::

            state.env['ref_links']['example'] = {
                'url': 'https://example.com',
                'label': 'example',
                'title': "Optional title",
            }

        The ``title`` key is only present when a title was given, and an
        existing entry for the same key is never overwritten. Returns the
        position after the definition, or ``None`` when the text is not a
        valid link reference definition.
        """
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        label = m.group("reflink_1")
        key = unikey(label)
        if not key:
            return None

        href, href_pos = parse_link_href(state.src, m.end(), block=True)
        if href is None:
            return None

        assert href_pos is not None

        # a title may not extend past the next blank line
        _blank = self.BLANK_LINE.search(state.src, href_pos)
        if _blank:
            max_pos = _blank.start()
        else:
            max_pos = state.cursor_max

        title, title_pos = parse_link_title(state.src, href_pos, max_pos)
        if title_pos:
            # the title must be the last thing on its line, otherwise drop it
            m2 = _BLANK_TO_LINE.match(state.src, title_pos)
            if m2:
                title_pos = m2.end()
            else:
                title_pos = None
                title = None

        if title_pos is None:
            # without a title, the destination must be the last thing on its line
            m3 = _BLANK_TO_LINE.match(state.src, href_pos)
            if m3:
                href_pos = m3.end()
            else:
                href_pos = None
                href = None

        end_pos = title_pos or href_pos
        if not end_pos:
            return None

        if key not in state.env["ref_links"]:
            assert href is not None
            href = unescape_char(href)
            data = {"url": escape_url(href), "label": label}
            if title:
                data["title"] = title
            state.env["ref_links"][key] = data
        return end_pos

    @staticmethod
    def _strip_quote_markers(quote: str) -> str:
        """Remove the ``>`` markers and one following space from quoted lines."""
        quote = _BLOCK_QUOTE_LEADING.sub("", quote)
        quote = expand_leading_tab(quote, 3)
        return _BLOCK_QUOTE_TRIM.sub("", quote)

    def extract_block_quote(self, m: Match[str], state: BlockState) -> Tuple[str, Optional[int]]:
        """Extract text and cursor end position of a block quote.

        ``state.cursor`` is advanced past every line that belongs to the quote.
        Returns ``(text, end_pos)`` where ``text`` is the quote content with
        the ``>`` markers removed. ``end_pos`` is ``None`` unless a block that
        interrupted the quote was parsed while scanning, in which case it is
        the position returned by that block's parse method.
        """

        # cleanup at first to detect if it is code block
        text = m.group("quote_1") + "\n"
        text = expand_leading_tab(text, 3)
        text = _BLOCK_QUOTE_TRIM.sub("", text)

        # when the first line is blank or opens a code block, following lines
        # need an explicit ``>`` marker to belong to the quote
        sc = self.compile_sc(["blank_line", "indent_code", "fenced_code"])
        require_marker = bool(sc.match(text))

        # $ does not count '\n'
        state.cursor = m.end() + 1

        end_pos: Optional[int] = None
        if require_marker:
            m2 = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
            if m2:
                text += self._strip_quote_markers(m2.group(0))
                state.cursor = m2.end()
        else:
            prev_blank_line = False
            break_sc = self.compile_sc(
                [
                    "blank_line",
                    "thematic_break",
                    "fenced_code",
                    "list",
                    "block_html",
                ]
            )
            while state.cursor < state.cursor_max:
                m3 = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
                if m3:
                    quote = self._strip_quote_markers(m3.group(0))
                    text += quote
                    state.cursor = m3.end()
                    if not quote.strip():
                        prev_blank_line = True
                    else:
                        prev_blank_line = bool(_LINE_BLANK_END.search(quote))
                    continue

                if prev_blank_line:
                    # CommonMark Example 249
                    # because of laziness, a blank line is needed between
                    # a block quote and a following paragraph
                    break

                m4 = break_sc.match(state.src, state.cursor)
                if m4:
                    end_pos = self.parse_method(m4, state)
                    if end_pos:
                        break

                # lazy continuation line
                pos = state.find_line_end()
                line = state.get_text(pos)
                line = expand_leading_tab(line, 3)
                text += line
                state.cursor = pos

        # according to CommonMark Example 6, the second tab should be
        # treated as 4 spaces
        return expand_tab(text), end_pos

    def parse_block_quote(self, m: Match[str], state: BlockState) -> int:
        """Parse token for block quote. Here is an example of the syntax:

        .. code-block:: markdown

            > a block quote starts
            > with right arrows

        The quote content is parsed in a child state using
        ``self.block_quote_rules``. Once the nesting depth reaches
        ``max_nested_level - 1`` the ``block_quote`` rule is left out, so
        deeper ``>`` markers are no longer parsed as nested quotes.
        """
        text, end_pos = self.extract_block_quote(m, state)
        # scan children state
        child = state.child_state(text)
        rules: List[str]
        if state.depth() >= self.max_nested_level - 1:
            # Filter rather than ``list.remove`` so that caller-supplied rules
            # without "block_quote" do not raise ValueError.
            rules = [name for name in self.block_quote_rules if name != "block_quote"]
        else:
            rules = self.block_quote_rules

        self.parse(child, rules)
        token = {"type": "block_quote", "children": child.tokens}
        if end_pos:
            # a block that interrupted the quote has already been added to the
            # state while extracting; place the quote with prepend_token
            state.prepend_token(token)
            return end_pos
        state.append_token(token)
        return state.cursor

    def parse_list(self, m: Match[str], state: BlockState) -> int:
        """Parse tokens for ordered and unordered list."""
        return parse_list(self, m, state)

    def parse_block_html(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse a ``block_html`` match; handled exactly like ``raw_html``."""
        return self.parse_raw_html(m, state)

    def parse_raw_html(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse token for an HTML block.

        The "rule N" comments below refer to the numbered HTML block start
        conditions of the CommonMark spec. Returns the end position of the
        HTML block, or ``None`` when the matched tag does not start one.
        """
        marker = m.group(0).strip()

        # rule 2
        if marker == "<!--":
            return _parse_html_to_end(state, "-->", m.end())

        # rule 3
        if marker == "<?":
            return _parse_html_to_end(state, "?>", m.end())

        # rule 5
        if marker == "<![CDATA[":
            return _parse_html_to_end(state, "]]>", m.end())

        # rule 4
        if marker.startswith("<!"):
            return _parse_html_to_end(state, ">", m.end())

        close_tag = None
        open_tag = None
        if marker.startswith("</"):
            close_tag = marker[2:].lower()
            # rule 6
            if close_tag in BLOCK_TAGS:
                return _parse_html_to_newline(state, self.BLANK_LINE)
        else:
            open_tag = marker[1:].lower()
            # rule 1
            if open_tag in PRE_TAGS:
                end_tag = "</" + open_tag + ">"
                # The opening tag was lower-cased, so the closing tag in the
                # source may be spelled differently (``<PRE>…</PRE>``).
                # CommonMark matches it case-insensitively; find the first such
                # occurrence and search for that exact spelling. ASCII-only
                # folding keeps e.g. U+017F from matching "s".
                closing = re.compile(re.escape(end_tag), re.I | re.A).search(state.src, m.end())
                if closing:
                    end_tag = closing.group(0)
                return _parse_html_to_end(state, end_tag, m.end())
            # rule 6
            if open_tag in BLOCK_TAGS:
                return _parse_html_to_newline(state, self.BLANK_LINE)

        # Blocks of type 7 may not interrupt a paragraph.
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        # rule 7
        start_pos = m.end()
        end_pos = state.find_line_end()
        if (open_tag and _OPEN_TAG_END.match(state.src, start_pos, end_pos)) or (
            close_tag and _CLOSE_TAG_END.match(state.src, start_pos, end_pos)
        ):
            return _parse_html_to_newline(state, self.BLANK_LINE)

        return None

    def parse(self, state: BlockState, rules: Optional[List[str]] = None) -> None:
        """Parse ``state.src`` from ``state.cursor`` to the end, adding tokens to ``state``.

        Text between two block matches, lines whose parse method rejects the
        match, and any text left after the last match are added as paragraph
        text.
        """
        sc = self.compile_sc(rules)

        while state.cursor < state.cursor_max:
            m = sc.search(state.src, state.cursor)
            if not m:
                break

            end_pos = m.start()
            if end_pos > state.cursor:
                # text before the matched block is paragraph text
                text = state.get_text(end_pos)
                state.add_paragraph(text)
                state.cursor = end_pos

            end_pos2 = self.parse_method(m, state)
            if end_pos2:
                state.cursor = end_pos2
            else:
                # the rule rejected the match: the line is paragraph text
                end_pos3 = state.find_line_end()
                text = state.get_text(end_pos3)
                state.add_paragraph(text)
                state.cursor = end_pos3

        if state.cursor < state.cursor_max:
            text = state.src[state.cursor :]
            state.add_paragraph(text)
            state.cursor = state.cursor_max
472
def _parse_html_to_end(state: BlockState, end_marker: str, start_pos: int) -> int:
    """Append a ``block_html`` token that runs until ``end_marker``.

    ``end_marker`` is searched for in ``state.src`` starting at ``start_pos``.
    When it is found, the block extends to the end of the line containing it
    (as reported by ``state.find_line_end()``), and ``state.cursor`` is moved
    to the marker along the way. When it is not found, the block is the rest
    of the source from ``state.cursor``.

    Returns the position at which the block ends.
    """
    found_at = state.src.find(end_marker, start_pos)
    if found_at < 0:
        # never terminated: take everything that is left
        raw = state.src[state.cursor :]
        stop = state.cursor_max
    else:
        # text up to the marker first, then (after moving the cursor onto the
        # marker) the remainder of the marker's line
        head = state.get_text(found_at)
        state.cursor = found_at
        stop = state.find_line_end()
        raw = head + state.get_text(stop)

    state.append_token({"type": "block_html", "raw": raw})
    return stop
486
487
def _parse_html_to_newline(state: BlockState, newline: Pattern[str]) -> int:
    """Append a ``block_html`` token that runs until ``newline`` matches.

    ``newline`` is searched for in ``state.src`` from ``state.cursor``. The
    block stops where that match starts, or spans the rest of the source when
    there is no match.

    Returns the position at which the block ends.
    """
    boundary = newline.search(state.src, state.cursor)
    if boundary is None:
        # no terminator ahead: the block spans the rest of the source
        raw = state.src[state.cursor :]
        stop = state.cursor_max
    else:
        stop = boundary.start()
        raw = state.get_text(stop)

    state.append_token({"type": "block_html", "raw": raw})
    return stop