# Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/mistune/block_parser.py: 100% of 221 statements
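#
# Block-level parser for mistune: scans Markdown source into block tokens
# (headings, code blocks, quotes, lists, raw HTML, paragraphs) that are
# rendered after inline parsing.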

import re
from .scanner import ScannerParser, Matcher
from .inline_parser import ESCAPE_CHAR, LINK_LABEL
from .util import unikey
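
# Newline and indentation helpers: _NEW_LINES and _BLANK_LINES normalize
# line endings and whitespace-only lines; the _TRIM_* and _EXPAND_TAB
# patterns strip or expand leading indentation when block text is
# re-parsed, and _BLOCK_QUOTE_LEADING strips '>' markers.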
_NEW_LINES = re.compile(r'\r\n|\r')
_BLANK_LINES = re.compile(r'^ +$', re.M)

_TRIM_4 = re.compile(r'^ {1,4}')
_EXPAND_TAB = re.compile(r'^( {0,3})\t', flags=re.M)
_INDENT_CODE_TRIM = re.compile(r'^ {1,4}', flags=re.M)
_BLOCK_QUOTE_TRIM = re.compile(r'^ {0,1}', flags=re.M)
_BLOCK_QUOTE_LEADING = re.compile(r'^ *>', flags=re.M)
_BLOCK_TAGS = {
    'address', 'article', 'aside', 'base', 'basefont', 'blockquote',
    'body', 'caption', 'center', 'col', 'colgroup', 'dd', 'details',
    'dialog', 'dir', 'div', 'dl', 'dt', 'fieldset', 'figcaption',
    'figure', 'footer', 'form', 'frame', 'frameset', 'h1', 'h2', 'h3',
    'h4', 'h5', 'h6', 'head', 'header', 'hr', 'html', 'iframe',
    'legend', 'li', 'link', 'main', 'menu', 'menuitem', 'meta', 'nav',
    'noframes', 'ol', 'optgroup', 'option', 'p', 'param', 'section',
    'source', 'summary', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead',
    'title', 'tr', 'track', 'ul'
}
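# Raw HTML blocks: judging by their names, these two alternations
# correspond to CommonMark's HTML block conditions 6 (a known
# block-level tag) and 7 (any other complete open or close tag alone on
# its line).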
_BLOCK_HTML_RULE6 = (
    r'</?(?:' + '|'.join(_BLOCK_TAGS) + r')'
    r'(?: +|\n|/?>)[\s\S]*?'
    r'(?:\n{2,}|\n*$)'
)
_BLOCK_HTML_RULE7 = (
    # open tag
    r'<(?!script|pre|style)([a-z][\w-]*)(?:'
    r' +[a-zA-Z:_][\w.:-]*(?: *= *"[^"\n]*"|'
    r''' *= *'[^'\n]*'| *= *[^\s"'=<>`]+)?'''
    r')*? */?>(?=\s*\n)[\s\S]*?(?:\n{2,}|\n*$)|'
    # close tag
    r'</(?!script|pre|style)[a-z][\w-]*\s*>(?=\s*\n)[\s\S]*?(?:\n{2,}|\n*$)'
)

_PARAGRAPH_SPLIT = re.compile(r'\n{2,}')
_LIST_BULLET = re.compile(r'^ *([\*\+-]|\d+[.)])')


class BlockParser(ScannerParser):
    scanner_cls = Matcher

    NEWLINE = re.compile(r'\n+')
    DEF_LINK = re.compile(
        r' {0,3}\[(' + LINK_LABEL + r')\]:(?:[ \t]*\n)?[ \t]*'
        r'<?([^\s>]+)>?(?:[ \t]*\n)?'
        r'(?: +["(]([^\n]+)[")])? *\n+'
    )

    AXT_HEADING = re.compile(
        r' {0,3}(#{1,6})(?!#+)(?: *\n+|'
        r'\s+([^\n]*?)(?:\n+|\s+?#+\s*\n+))'
    )
    SETEX_HEADING = re.compile(r'([^\n]+)\n *(=|-){2,}[ \t]*\n+')
    THEMATIC_BREAK = re.compile(
        r' {0,3}((?:-[ \t]*){3,}|'
        r'(?:_[ \t]*){3,}|(?:\*[ \t]*){3,})\n+'
    )

    INDENT_CODE = re.compile(r'(?:\n*)(?:(?: {4}| *\t)[^\n]+\n*)+')

    FENCED_CODE = re.compile(
        r'( {0,3})(`{3,}|~{3,})([^`\n]*)\n'
        r'(?:|([\s\S]*?)\n)'
        r'(?: {0,3}\2[~`]* *\n+|$)'
    )
    BLOCK_QUOTE = re.compile(
        r'(?: {0,3}>[^\n]*\n)+'
    )
    LIST_START = re.compile(
        r'( {0,3})([\*\+-]|\d{1,9}[.)])(?:[ \t]*|[ \t][^\n]+)\n+'
    )

    BLOCK_HTML = re.compile((
        r' {0,3}(?:'
        r'<(script|pre|style)[\s>][\s\S]*?(?:</\1>[^\n]*\n+|$)|'
        r'<!--(?!-?>)[\s\S]*?-->[^\n]*\n+|'
        r'<\?[\s\S]*?\?>[^\n]*\n+|'
        r'<![A-Z][\s\S]*?>[^\n]*\n+|'
        r'<!\[CDATA\[[\s\S]*?\]\]>[^\n]*\n+'
        r'|' + _BLOCK_HTML_RULE6 + '|' + _BLOCK_HTML_RULE7 + ')'
    ), re.I)

    LIST_MAX_DEPTH = 6
    BLOCK_QUOTE_MAX_DEPTH = 6
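    # Rules are tried in this order; e.g. thematic_break precedes
    # list_start so that '* * *' scans as a break rather than a list.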
    RULE_NAMES = (
        'newline', 'thematic_break',
        'fenced_code', 'indent_code',
        'block_quote', 'block_html',
        'list_start',
        'axt_heading', 'setex_heading',
        'def_link',
    )

    def __init__(self):
        super(BlockParser, self).__init__()
        self.block_quote_rules = list(self.RULE_NAMES)
        self.list_rules = list(self.RULE_NAMES)

    def parse_newline(self, m, state):
        return {'type': 'newline', 'blank': True}

    def parse_thematic_break(self, m, state):
        return {'type': 'thematic_break', 'blank': True}

    def parse_indent_code(self, m, state):
        text = expand_leading_tab(m.group(0))
        code = _INDENT_CODE_TRIM.sub('', text)
        code = code.lstrip('\n')
        return self.tokenize_block_code(code, None, state)
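
    # FENCED_CODE groups: (1) indent before the fence, (2) the fence
    # itself, (3) the info string, (4) the code body.  An indented fence
    # has the same indent stripped from each line of its body.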
    def parse_fenced_code(self, m, state):
        info = ESCAPE_CHAR.sub(r'\1', m.group(3))
        spaces = m.group(1)
        code = m.group(4) or ''
        if spaces and code:
            _trim_pattern = re.compile('^' + spaces, re.M)
            code = _trim_pattern.sub('', code)
        return self.tokenize_block_code(code + '\n', info, state)

    def tokenize_block_code(self, code, info, state):
        token = {'type': 'block_code', 'raw': code}
        if info:
            token['params'] = (info, )
        return token

    def parse_axt_heading(self, m, state):
        level = len(m.group(1))
        text = m.group(2) or ''
        text = text.strip()
        if set(text) == {'#'}:
            text = ''
        return self.tokenize_heading(text, level, state)

    def parse_setex_heading(self, m, state):
        level = 1 if m.group(2) == '=' else 2
        text = m.group(1)
        text = text.strip()
        return self.tokenize_heading(text, level, state)

    def tokenize_heading(self, text, level, state):
        return {'type': 'heading', 'text': text, 'params': (level,)}
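
    # Container nesting is capped: past BLOCK_QUOTE_MAX_DEPTH (or
    # LIST_MAX_DEPTH) levels, the rule that would nest another container
    # is removed from the rule set handed to the recursive parse.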
    def get_block_quote_rules(self, depth):
        if depth > self.BLOCK_QUOTE_MAX_DEPTH - 1:
            rules = list(self.block_quote_rules)
            rules.remove('block_quote')
            return rules
        return self.block_quote_rules

    def parse_block_quote(self, m, state):
        depth = state.get('block_quote_depth', 0) + 1
        state['block_quote_depth'] = depth

        # normalize block quote text
        text = _BLOCK_QUOTE_LEADING.sub('', m.group(0))
        text = expand_leading_tab(text)
        text = _BLOCK_QUOTE_TRIM.sub('', text)
        text = cleanup_lines(text)

        rules = self.get_block_quote_rules(depth)
        children = self.parse(text, state, rules)
        state['block_quote_depth'] = depth - 1
        return {'type': 'block_quote', 'children': children}

    def get_list_rules(self, depth):
        if depth > self.LIST_MAX_DEPTH - 1:
            rules = list(self.list_rules)
            rules.remove('list_start')
            return rules
        return self.list_rules
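
    # list_start is a two-phase rule: the regex only locates where a list
    # begins, then _find_list_items walks the source string item by item,
    # and the method returns (token, pos) so scanning resumes after the
    # whole list.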
    def parse_list_start(self, m, state, string):
        items = []
        spaces = m.group(1)
        marker = m.group(2)
        items, pos = _find_list_items(string, m.start(), spaces, marker)
        tight = '\n\n' not in ''.join(items).strip()

        ordered = len(marker) != 1
        if ordered:
            start = int(marker[:-1])
            if start == 1:
                start = None
        else:
            start = None

        list_tights = state.get('list_tights', [])
        list_tights.append(tight)
        state['list_tights'] = list_tights

        depth = len(list_tights)
        rules = self.get_list_rules(depth)
        children = [
            self.parse_list_item(item, depth, state, rules)
            for item in items
        ]
        list_tights.pop()
        params = (ordered, depth, start)
        token = {'type': 'list', 'children': children, 'params': params}
        return token, pos

    def parse_list_item(self, text, depth, state, rules):
        text = self.normalize_list_item_text(text)
        if not text:
            children = [{'type': 'block_text', 'text': ''}]
        else:
            children = self.parse(text, state, rules)
        return {
            'type': 'list_item',
            'params': (depth,),
            'children': children,
        }
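
    # Strip the bullet/number marker, then remove the indentation that
    # lines continuation lines up under the item, so the item body can be
    # re-parsed as ordinary blocks.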
    @staticmethod
    def normalize_list_item_text(text):
        text_length = len(text)
        text = _LIST_BULLET.sub('', text)

        if not text.strip():
            return ''

        space = text_length - len(text)
        text = expand_leading_tab(text)
        if text.startswith(' '):
            text = text[1:]
            space += 1
        else:
            text_length = len(text)
            text = _TRIM_4.sub('', text)
            space += max(text_length - len(text), 1)

        # outdent
        if '\n ' in text:
            pattern = re.compile(r'\n {1,' + str(space) + r'}')
            text = pattern.sub(r'\n', text)
        return text

    def parse_block_html(self, m, state):
        html = m.group(0).rstrip()
        return {'type': 'block_html', 'raw': html}

    def parse_def_link(self, m, state):
        key = unikey(m.group(1))
        link = m.group(2)
        title = m.group(3)
        if key not in state['def_links']:
            state['def_links'][key] = (link, title)
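
    # Fallback for text no block rule claimed.  Inside a tight list the
    # text stays a bare block_text token; otherwise blank lines split it
    # into paragraph tokens.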
    def parse_text(self, text, state):
        list_tights = state.get('list_tights')
        if list_tights and list_tights[-1]:
            return {'type': 'block_text', 'text': text.strip()}

        tokens = []
        for s in _PARAGRAPH_SPLIT.split(text):
            s = s.strip()
            if s:
                tokens.append({'type': 'paragraph', 'text': s})
        return tokens

    def parse(self, s, state, rules=None):
        if rules is None:
            rules = self.rules

        return list(self._scan(s, state, rules))
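
    # Rendering dispatches each token to a renderer method: 'blank'
    # tokens take no arguments, 'children' render recursively, 'raw' text
    # passes through untouched, and plain 'text' goes through the inline
    # parser first.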
    def render(self, tokens, inline, state):
        data = self._iter_render(tokens, inline, state)
        return inline.renderer.finalize(data)

    def _iter_render(self, tokens, inline, state):
        for tok in tokens:
            method = inline.renderer._get_method(tok['type'])
            if 'blank' in tok:
                yield method()
                continue

            if 'children' in tok:
                children = self.render(tok['children'], inline, state)
            elif 'raw' in tok:
                children = tok['raw']
            else:
                children = inline(tok['text'], state)
            params = tok.get('params')
            if params:
                yield method(children, *params)
            else:
                yield method(children)


def cleanup_lines(s):
    s = _NEW_LINES.sub('\n', s)
    s = _BLANK_LINES.sub('', s)
    return s


def expand_leading_tab(text):
    return _EXPAND_TAB.sub(_expand_tab_repl, text)


def _expand_tab_repl(m):
    s = m.group(1)
    return s + ' ' * (4 - len(s))
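

# Builds the per-item regex: group(1) captures the item's leading spaces,
# and continuation lines must either be indented past the marker (s1) or
# reach that column with a tab (s2).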
def _create_list_item_pattern(spaces, marker):
    prefix = r'( {0,' + str(len(spaces) + len(marker)) + r'})'

    if len(marker) > 1:
        if marker[-1] == '.':
            prefix = prefix + r'\d{0,9}\.'
        else:
            prefix = prefix + r'\d{0,9}\)'
    else:
        prefix = prefix + re.escape(marker)

    s1 = ' {' + str(len(marker) + 1) + ',}'
    if len(marker) > 4:
        s2 = ' {' + str(len(marker) - 4) + r',}\t'
    else:
        s2 = r' *\t'
    return re.compile(
        prefix + r'(?:[ \t]*|[ \t]+[^\n]+)\n+'
        r'(?:\1(?:' + s1 + '|' + s2 + ')'
        r'[^\n]+\n+)*'
    )


def _find_list_items(string, pos, spaces, marker):
    items = []

    if marker in {'*', '-'}:
        is_hr = re.compile(
            r' *((?:-[ \t]*){3,}|(?:\*[ \t]*){3,})\n+'
        )
    else:
        is_hr = None

    pattern = _create_list_item_pattern(spaces, marker)
    while 1:
        m = pattern.match(string, pos)
        if not m:
            break

        text = m.group(0)
        if is_hr and is_hr.match(text):
            break

        new_spaces = m.group(1)
        if new_spaces != spaces:
            spaces = new_spaces
            pattern = _create_list_item_pattern(spaces, marker)

        items.append(text)
        pos = m.end()
    return items, pos
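

# A minimal usage sketch (hypothetical driver code; in mistune this module
# is normally driven by the Markdown object, which prepares the shared
# state dict and hands the tokens to a renderer):
#
#     parser = BlockParser()
#     state = {'def_links': {}}   # parse_def_link stores definitions here
#     text = cleanup_lines('# Title\n\nhello *world*\n')
#     tokens = parser.parse(text, state)
#     # roughly: [{'type': 'heading', 'text': 'Title', 'params': (1,)},
#     #           {'type': 'paragraph', 'text': 'hello *world*'}]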