Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/mistune/block_parser.py: 100%
265 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
1import re
2from typing import Optional, List, Tuple, Match
3from .util import (
4 unikey,
5 escape_url,
6 expand_tab,
7 expand_leading_tab,
8)
9from .core import Parser, BlockState
10from .helpers import (
11 LINK_LABEL,
12 HTML_TAGNAME,
13 HTML_ATTRIBUTES,
14 BLOCK_TAGS,
15 PRE_TAGS,
16 unescape_char,
17 parse_link_href,
18 parse_link_title,
19)
20from .list_parser import parse_list, LIST_PATTERN
# Removes one to four leading spaces at the start of every line.
_INDENT_CODE_TRIM = re.compile(r'^ {1,4}', flags=re.MULTILINE)
# Removes the optional closing run of ``#`` from a heading's text.
_AXT_HEADING_TRIM = re.compile(r'(\s+|^)#+\s*$')
# Removes at most one space at the start of every line.
_BLOCK_QUOTE_TRIM = re.compile(r'^ ?', flags=re.MULTILINE)
# Removes the ``>`` marker (and any spaces before it) from every line.
_BLOCK_QUOTE_LEADING = re.compile(r'^ *>', flags=re.MULTILINE)

# Text that ends with a line containing only spaces/tabs.
_LINE_BLANK_END = re.compile(r'\n[ \t]*\n$')
# Optional spaces/tabs followed by a newline.
_BLANK_TO_LINE = re.compile(r'[ \t]*\n')

# Alternation of every block-level and pre-formatted tag name.
_BLOCK_TAGS_PATTERN = '|'.join(('|'.join(BLOCK_TAGS), '|'.join(PRE_TAGS)))
# Remainder of an opening tag: attributes, ``>``, then nothing but blanks.
_OPEN_TAG_END = re.compile(HTML_ATTRIBUTES + r'[ \t]*>[ \t]*(?:\n|$)')
# Remainder of a closing tag: ``>``, then nothing but blanks.
_CLOSE_TAG_END = re.compile(r'[ \t]*>[ \t]*(?:\n|$)')
# One or more consecutive lines that each carry a ``>`` marker.
_STRICT_BLOCK_QUOTE = re.compile(r'( {0,3}>[^\n]*(?:\n|$))+')
class BlockParser(Parser):
    """Parser for block-level Markdown structures.

    ``SPECIFICATION`` maps each rule name to its regular expression, and
    ``__init__`` registers the ``parse_<name>`` method for every rule name.
    ``DEFAULT_RULES`` is the rule list used inside block quotes and list
    items when the caller does not supply its own.

    Every ``parse_*`` method receives the regex match of its rule together
    with the current ``BlockState`` and returns the position at which
    parsing should resume.  A falsy return value tells :meth:`parse` that
    the rule did not apply; the current line is then added as paragraph
    text instead.
    """

    # One or more consecutive lines that contain only blank characters.
    BLANK_LINE = re.compile(r'(^[ \t\v\f]*\n)+', re.M)

    # Start of any HTML-looking line (any tag name is accepted).
    RAW_HTML = (
        r'^ {0,3}('
        r'</?' + HTML_TAGNAME + r'|'
        r'<!--|'  # comment
        r'<\?|'  # processing instruction
        r'<![A-Z]|'  # declaration
        r'<!\[CDATA\[)'
    )

    # Start of an HTML block limited to the known block-level / pre tags.
    BLOCK_HTML = (
        r'^ {0,3}(?:'
        r'(?:</?' + _BLOCK_TAGS_PATTERN + r'(?:[ \t]+|\n|$))'
        r'|<!--'  # comment
        r'|<\?'  # processing instruction
        r'|<![A-Z]'  # declaration
        r'|<!\[CDATA\[)'
    )

    # Rule name -> regular expression source.
    SPECIFICATION = {
        'blank_line': r'(^[ \t\v\f]*\n)+',
        'axt_heading': r'^ {0,3}(?P<axt_1>#{1,6})(?!#+)(?P<axt_2>[ \t]*|[ \t]+.*?)$',
        'setex_heading': r'^ {0,3}(?P<setext_1>=|-){1,}[ \t]*$',
        'fenced_code': (
            r'^(?P<fenced_1> {0,3})(?P<fenced_2>`{3,}|~{3,})'
            r'[ \t]*(?P<fenced_3>.*?)$'
        ),
        'indent_code': (
            r'^(?: {4}| *\t)[^\n]+(?:\n+|$)'
            r'((?:(?: {4}| *\t)[^\n]+(?:\n+|$))|\s)*'
        ),
        'thematic_break': r'^ {0,3}((?:-[ \t]*){3,}|(?:_[ \t]*){3,}|(?:\*[ \t]*){3,})$',
        'ref_link': r'^ {0,3}\[(?P<reflink_1>' + LINK_LABEL + r')\]:',
        'block_quote': r'^ {0,3}>(?P<quote_1>.*?)$',
        'list': LIST_PATTERN,
        'block_html': BLOCK_HTML,
        'raw_html': RAW_HTML,
    }

    # Rules applied by default ('block_html' is deliberately not listed; it
    # is only used as a break rule inside ``extract_block_quote``).
    DEFAULT_RULES = (
        'fenced_code',
        'indent_code',
        'axt_heading',
        'setex_heading',
        'thematic_break',
        'block_quote',
        'list',
        'ref_link',
        'raw_html',
        'blank_line',
    )

    def __init__(
        self,
        block_quote_rules: Optional[List[str]] = None,
        list_rules: Optional[List[str]] = None,
        max_nested_level: int = 6
    ):
        """Create a block parser.

        :param block_quote_rules: rule names used for the content of block
            quotes; defaults to a copy of ``DEFAULT_RULES``.
        :param list_rules: rule names stored as ``self.list_rules``;
            defaults to a copy of ``DEFAULT_RULES``.
        :param max_nested_level: nesting depth from which block quotes are
            no longer parsed inside block quotes.
        """
        super().__init__()

        if block_quote_rules is None:
            block_quote_rules = list(self.DEFAULT_RULES)

        if list_rules is None:
            list_rules = list(self.DEFAULT_RULES)

        self.block_quote_rules = block_quote_rules
        self.list_rules = list_rules
        self.max_nested_level = max_nested_level
        # register default parse methods
        self._methods = {
            name: getattr(self, 'parse_' + name) for name in self.SPECIFICATION
        }

    def parse_blank_line(self, m: Match, state: BlockState) -> int:
        """Parse token for blank lines and return the end of the match."""
        state.append_token({'type': 'blank_line'})
        return m.end()

    def parse_thematic_break(self, m: Match, state: BlockState) -> int:
        """Parse token for thematic break, e.g. ``<hr>`` tag in HTML."""
        state.append_token({'type': 'thematic_break'})
        # $ does not count '\n', so step over the line break as well
        return m.end() + 1

    def parse_indent_code(self, m: Match, state: BlockState) -> int:
        """Parse token for code block which is indented by 4 spaces.

        If ``state.append_paragraph()`` returns a position, the line is a
        part of the paragraph and that position is returned unchanged.
        Otherwise a ``block_code`` token with style ``indent`` is appended.
        """
        # it is a part of the paragraph
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        code = m.group(0)
        code = expand_leading_tab(code)
        code = _INDENT_CODE_TRIM.sub('', code)
        code = code.strip('\n')
        state.append_token({'type': 'block_code', 'raw': code, 'style': 'indent'})
        return m.end()

    def parse_fenced_code(self, m: Match, state: BlockState) -> Optional[int]:
        """Parse token for fenced code block. A fenced code block is started with
        3 or more backtick(`) or tilde(~).

        An example of a fenced code block:

        .. code-block:: markdown

            ```python
            def markdown(text):
                return mistune.html(text)
            ```

        :returns: position after the closing fence (or ``state.cursor_max``
            when the fence is never closed); ``None`` when the opening line
            is not a valid fence.
        """
        spaces = m.group('fenced_1')
        marker = m.group('fenced_2')
        info = m.group('fenced_3')

        c = marker[0]
        # CommonMark Example 145
        # Info strings for backtick code blocks cannot contain backticks
        if info and c == '`' and c in info:
            return None

        # the closing fence uses the same character, at least as many times
        _end = re.compile(
            r'^ {0,3}' + c + '{' + str(len(marker)) + r',}[ \t]*(?:\n|$)', re.M)
        # + 1 skips the line break that ``$`` did not consume
        cursor_start = m.end() + 1

        m2 = _end.search(state.src, cursor_start)
        if m2:
            code = state.src[cursor_start:m2.start()]
            end_pos = m2.end()
        else:
            # unclosed fence: everything up to the end of the source is code
            code = state.src[cursor_start:]
            end_pos = state.cursor_max

        if spaces and code:
            # remove from each code line up to the indentation of the fence
            _trim_pattern = re.compile('^ {0,' + str(len(spaces)) + '}', re.M)
            code = _trim_pattern.sub('', code)

        token = {'type': 'block_code', 'raw': code, 'style': 'fenced', 'marker': marker}
        if info:
            info = unescape_char(info)
            token['attrs'] = {'info': info.strip()}

        state.append_token(token)
        return end_pos

    def parse_axt_heading(self, m: Match, state: BlockState) -> int:
        """Parse token for AXT heading. An AXT heading is started with 1 to 6
        symbol of ``#``."""
        level = len(m.group('axt_1'))
        text = m.group('axt_2').strip()
        # remove last #
        if text:
            text = _AXT_HEADING_TRIM.sub('', text)

        token = {'type': 'heading', 'text': text, 'attrs': {'level': level}, 'style': 'axt'}
        state.append_token(token)
        return m.end() + 1

    def parse_setex_heading(self, m: Match, state: BlockState) -> Optional[int]:
        """Parse token for setex style heading. A setex heading syntax looks like:

        .. code-block:: markdown

            H1 title
            ========

        When the last token is a paragraph it is turned, in place, into a
        heading (level 1 for ``=``, level 2 for ``-``).  Otherwise the line
        is retried against the ``thematic_break`` and ``list`` rules;
        ``None`` is returned when neither of them matches.
        """
        last_token = state.last_token()
        if last_token and last_token['type'] == 'paragraph':
            level = 1 if m.group('setext_1') == '=' else 2
            last_token['type'] = 'heading'
            last_token['style'] = 'setext'
            last_token['attrs'] = {'level': level}
            return m.end() + 1

        sc = self.compile_sc(['thematic_break', 'list'])
        m = sc.match(state.src, state.cursor)
        if m:
            return self.parse_method(m, state)
        return None

    def parse_ref_link(self, m: Match, state: BlockState) -> Optional[int]:
        """Parse link references and save the link information into ``state.env``.

        Here is an example of a link reference:

        .. code-block:: markdown

            a [link][example]

            [example]: https://example.com "Optional title"

        This method will save the link reference into ``state.env`` as::

            state.env['ref_links']['example'] = {
                'url': 'https://example.com',
                'label': 'example',
                'title': "Optional title",
            }

        An already registered key is never overwritten.

        :returns: position after the definition, or ``None`` when the text
            is not a valid link reference definition.
        """
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        label = m.group('reflink_1')
        key = unikey(label)
        if not key:
            return None

        href, href_pos = parse_link_href(state.src, m.end(), block=True)
        if href is None:
            return None

        # the title may not extend past the next blank line
        _blank = self.BLANK_LINE.search(state.src, href_pos)
        if _blank:
            max_pos = _blank.start()
        else:
            max_pos = state.cursor_max

        title, title_pos = parse_link_title(state.src, href_pos, max_pos)
        if title_pos:
            # only blanks may follow the title on its line
            m = _BLANK_TO_LINE.match(state.src, title_pos)
            if m:
                title_pos = m.end()
            else:
                title_pos = None
                title = None

        if title_pos is None:
            # no usable title: only blanks may follow the destination
            m = _BLANK_TO_LINE.match(state.src, href_pos)
            if m:
                href_pos = m.end()
            else:
                href_pos = None
                href = None

        end_pos = title_pos or href_pos
        if not end_pos:
            return None

        if key not in state.env['ref_links']:
            href = unescape_char(href)
            data = {'url': escape_url(href), 'label': label}
            if title:
                data['title'] = title
            state.env['ref_links'][key] = data
        return end_pos

    @staticmethod
    def _strip_quote_markers(raw: str) -> str:
        """Drop the ``>`` marker of every line in ``raw`` and tidy the
        whitespace that follows it."""
        quote = _BLOCK_QUOTE_LEADING.sub('', raw)
        quote = expand_leading_tab(quote, 3)
        return _BLOCK_QUOTE_TRIM.sub('', quote)

    def extract_block_quote(self, m: Match, state: BlockState) -> Tuple[str, Optional[int]]:
        """Extract text and cursor end position of a block quote.

        ``state.cursor`` is advanced past every line that belongs to the
        quote.

        :returns: ``(text, end_pos)``.  ``text`` is the quote content with
            the markers removed.  ``end_pos`` is the position returned by a
            rule that interrupted the quote, or ``None`` when no rule did.
        """
        # cleanup at first to detect if it is code block
        first = m.group('quote_1') + '\n'
        first = expand_leading_tab(first, 3)
        first = _BLOCK_QUOTE_TRIM.sub('', first)
        parts = [first]

        # when the quote starts with a blank line or code, following lines
        # only belong to it if they carry a marker (no lazy continuation)
        sc = self.compile_sc(['blank_line', 'indent_code', 'fenced_code'])
        require_marker = bool(sc.match(first))

        state.cursor = m.end() + 1

        end_pos = None
        if require_marker:
            marked = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
            if marked:
                parts.append(self._strip_quote_markers(marked.group(0)))
                state.cursor = marked.end()
        else:
            prev_blank_line = False
            break_sc = self.compile_sc([
                'blank_line', 'thematic_break', 'fenced_code',
                'list', 'block_html',
            ])
            while state.cursor < state.cursor_max:
                marked = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
                if marked:
                    quote = self._strip_quote_markers(marked.group(0))
                    parts.append(quote)
                    state.cursor = marked.end()
                    prev_blank_line = (
                        not quote.strip()
                        or bool(_LINE_BLANK_END.search(quote))
                    )
                    continue

                if prev_blank_line:
                    # CommonMark Example 249
                    # because of laziness, a blank line is needed between
                    # a block quote and a following paragraph
                    break

                interrupt = break_sc.match(state.src, state.cursor)
                if interrupt:
                    end_pos = self.parse_method(interrupt, state)
                    if end_pos:
                        break

                # lazy continuation line
                pos = state.find_line_end()
                line = state.get_text(pos)
                parts.append(expand_leading_tab(line, 3))
                state.cursor = pos

        # according to CommonMark Example 6, the second tab should be
        # treated as 4 spaces
        return expand_tab(''.join(parts)), end_pos

    def parse_block_quote(self, m: Match, state: BlockState) -> int:
        """Parse token for block quote. Here is an example of the syntax:

        .. code-block:: markdown

            > a block quote starts
            > with right arrows

        The quote content is parsed in a child state with
        ``self.block_quote_rules``; once the nesting cap is reached the
        ``block_quote`` rule is left out.
        """
        text, end_pos = self.extract_block_quote(m, state)
        # scan children state
        child = state.child_state(text)
        if state.depth() >= self.max_nested_level - 1:
            # Filter instead of ``list.remove`` so that a caller-supplied
            # rule list without 'block_quote' does not raise ValueError.
            rules = [
                name for name in self.block_quote_rules
                if name != 'block_quote'
            ]
        else:
            rules = self.block_quote_rules

        self.parse(child, rules)
        token = {'type': 'block_quote', 'children': child.tokens}
        if end_pos:
            # A rule interrupted the quote and has already added its own
            # token; presumably prepend_token places the quote before it.
            state.prepend_token(token)
            return end_pos
        state.append_token(token)
        return state.cursor

    def parse_list(self, m: Match, state: BlockState) -> int:
        """Parse tokens for ordered and unordered list."""
        return parse_list(self, m, state)

    def parse_block_html(self, m: Match, state: BlockState) -> Optional[int]:
        """Parse a ``block_html`` match; handled exactly like ``raw_html``."""
        return self.parse_raw_html(m, state)

    def parse_raw_html(self, m: Match, state: BlockState) -> Optional[int]:
        """Parse an HTML block.

        The "rule N" comments keep the numbering of the original code, which
        appears to follow the HTML block start conditions of CommonMark.

        :returns: position after the HTML block, or ``None`` when the line
            is not an HTML block.
        """
        marker = m.group(0).strip()

        # rule 2
        if marker == '<!--':
            return _parse_html_to_end(state, '-->', m.end())

        # rule 3
        if marker == '<?':
            return _parse_html_to_end(state, '?>', m.end())

        # rule 5
        if marker == '<![CDATA[':
            return _parse_html_to_end(state, ']]>', m.end())

        # rule 4
        if marker.startswith('<!'):
            return _parse_html_to_end(state, '>', m.end())

        close_tag = None
        open_tag = None
        if marker.startswith('</'):
            close_tag = marker[2:].lower()
            # rule 6
            if close_tag in BLOCK_TAGS:
                return _parse_html_to_newline(state, self.BLANK_LINE)
        else:
            open_tag = marker[1:].lower()
            # rule 1
            if open_tag in PRE_TAGS:
                end_tag = '</' + open_tag + '>'
                # The opening tag was lower-cased above, so the closing tag
                # must be found regardless of its case too (``<PRE>`` is
                # closed by ``</PRE>``).  re.ASCII keeps the folding to
                # ASCII letters only.
                closing = re.compile(
                    re.escape(end_tag), re.IGNORECASE | re.ASCII
                ).search(state.src, m.end())
                if closing:
                    end_tag = closing.group(0)
                return _parse_html_to_end(state, end_tag, m.end())
            # rule 6
            if open_tag in BLOCK_TAGS:
                return _parse_html_to_newline(state, self.BLANK_LINE)

        # Blocks of type 7 may not interrupt a paragraph.
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        # rule 7: the rest of the line must complete the tag
        start_pos = m.end()
        end_pos = state.find_line_end()
        if (open_tag and _OPEN_TAG_END.match(state.src, start_pos, end_pos)) or \
           (close_tag and _CLOSE_TAG_END.match(state.src, start_pos, end_pos)):
            return _parse_html_to_newline(state, self.BLANK_LINE)
        return None

    def parse(self, state: BlockState, rules: Optional[List[str]] = None) -> None:
        """Parse ``state.src`` from ``state.cursor`` to ``state.cursor_max``.

        Text that precedes a rule match is added as paragraph text.  When
        the parse method of the matched rule returns a falsy value, the
        current line is added as paragraph text as well.

        :param rules: rule names passed to ``compile_sc``.
        """
        sc = self.compile_sc(rules)

        while state.cursor < state.cursor_max:
            m = sc.search(state.src, state.cursor)
            if not m:
                break

            end_pos = m.start()
            if end_pos > state.cursor:
                # plain text in front of the match
                text = state.get_text(end_pos)
                state.add_paragraph(text)
                state.cursor = end_pos

            end_pos = self.parse_method(m, state)
            if end_pos:
                state.cursor = end_pos
            else:
                # the rule declined: treat the line as paragraph text
                end_pos = state.find_line_end()
                text = state.get_text(end_pos)
                state.add_paragraph(text)
                state.cursor = end_pos

        if state.cursor < state.cursor_max:
            # no further rule matches: the remainder is paragraph text
            text = state.src[state.cursor:]
            state.add_paragraph(text)
            state.cursor = state.cursor_max
def _parse_html_to_end(state, end_marker, start_pos):
    """Append a ``block_html`` token that runs up to ``end_marker``.

    ``end_marker`` is searched in ``state.src`` from ``start_pos``.  When it
    is found, the token covers the text from the cursor to the end of the
    line holding the marker, and ``state.cursor`` is left at the marker.
    When it is missing, the token covers the rest of the source.

    Returns the position at which the HTML block ends.
    """
    found_at = state.src.find(end_marker, start_pos)
    if found_at >= 0:
        before_marker = state.get_text(found_at)
        # move the cursor so that the line end is looked up from the marker
        state.cursor = found_at
        end_pos = state.find_line_end()
        raw = before_marker + state.get_text(end_pos)
    else:
        raw = state.src[state.cursor:]
        end_pos = state.cursor_max

    state.append_token({'type': 'block_html', 'raw': raw})
    return end_pos
def _parse_html_to_newline(state, newline):
    """Append a ``block_html`` token that runs up to the next ``newline`` match.

    ``newline`` is a compiled pattern searched in ``state.src`` from the
    cursor.  The token covers the text before the match, or the rest of the
    source when the pattern does not match.

    Returns the position at which the HTML block ends.
    """
    boundary = newline.search(state.src, state.cursor)
    if boundary is None:
        end_pos = state.cursor_max
        raw = state.src[state.cursor:]
    else:
        end_pos = boundary.start()
        raw = state.get_text(end_pos)

    state.append_token({'type': 'block_html', 'raw': raw})
    return end_pos