Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/mistune/inline_parser.py: 99%
231 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
1import re
2from typing import Optional, List, Dict, Any, Match
3from .core import Parser, InlineState
4from .util import (
5 escape,
6 escape_url,
7 unikey,
8)
9from .helpers import (
10 PREVENT_BACKSLASH,
11 PUNCTUATION,
12 HTML_TAGNAME,
13 HTML_ATTRIBUTES,
14 unescape_char,
15 parse_link,
16 parse_link_label,
17 parse_link_text,
18)
# Optional whitespace followed by a closing parenthesis.
PAREN_END_RE = re.compile(r'\s*\)')

# Building blocks of the ``<user@host>`` autolink pattern.
#: characters allowed before the ``@``
_EMAIL_LOCAL_PART = r'''[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+'''
#: one host label: alphanumeric at both ends, at most 63 characters
_EMAIL_HOST_LABEL = r'[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?'

#: ``<local@label(.label)*>`` -- an e-mail address wrapped in angle brackets
AUTO_EMAIL = (
    r'<' + _EMAIL_LOCAL_PART + r'@' + _EMAIL_HOST_LABEL
    + r'(?:\.' + _EMAIL_HOST_LABEL + r')*>'
)
# Alternation of every raw-HTML construct recognised inline.  The pieces are
# joined with ``|`` in the order listed, so earlier alternatives are tried
# first by the regex engine.
INLINE_HTML = '|'.join((
    r'<' + HTML_TAGNAME + HTML_ATTRIBUTES + r'\s*/?>',  # open tag
    r'</' + HTML_TAGNAME + r'\s*>',  # close tag
    r'<!--(?!>|->)(?:(?!--)[\s\S])+?(?<!-)-->',  # comment
    r'<\?[\s\S]+?\?>',  # script like <?php?>
    r'<![A-Z][\s\S]+?>',  # doctype
    r'<!\[CDATA[\s\S]+?\]\]>',  # cdata
))
# Shared prefix of every closing-delimiter pattern: the character right before
# the closing run is either a backslash-escaped marker (guarded by
# PREVENT_BACKSLASH) or any character that is neither whitespace nor the
# marker itself.
_STAR_CLOSE_PREFIX = r'(?:' + PREVENT_BACKSLASH + r'\\\*|[^\s*])'
_UNDERSCORE_CLOSE_PREFIX = r'(?:' + PREVENT_BACKSLASH + r'\\_|[^\s_])'

# Maps an opening emphasis marker to the regex that locates its closing run.
# The closing run must not be followed by another marker character; the
# underscore variants must additionally end at a word boundary.
EMPHASIS_END_RE = {
    '*': re.compile(_STAR_CLOSE_PREFIX + r'\*(?!\*)'),
    '_': re.compile(_UNDERSCORE_CLOSE_PREFIX + r'_(?!_)\b'),

    '**': re.compile(_STAR_CLOSE_PREFIX + r'\*\*(?!\*)'),
    '__': re.compile(_UNDERSCORE_CLOSE_PREFIX + r'__(?!_)\b'),

    '***': re.compile(_STAR_CLOSE_PREFIX + r'\*\*\*(?!\*)'),
    '___': re.compile(_UNDERSCORE_CLOSE_PREFIX + r'___(?!_)\b'),
}
class InlineParser(Parser):
    """Tokenizer for inline-level Markdown (emphasis, links, code spans, ...).

    Every ``parse_<rule>`` method receives the regex match that located the
    *start* of a construct together with the current :class:`InlineState`.
    It appends zero or more tokens to the state and returns the position in
    ``state.src`` where scanning should resume.  A falsy return value tells
    :meth:`parse` that nothing was consumed; the character at the match
    start is then emitted as plain text.
    """

    sc_flag = 0
    state_cls = InlineState

    #: linebreak leaves two spaces at the end of line
    STD_LINEBREAK = r'(?:\\| {2,})\n\s*'

    #: every new line becomes <br>
    HARD_LINEBREAK = r' *\n\s*'

    # we only need to find the start pattern of an inline token
    SPECIFICATION = {
        # e.g. \`, \$
        'escape': r'(?:\\' + PUNCTUATION + ')+',

        # `code, ```code
        'codespan': r'`{1,}',

        # *w, **w, _w, __w
        'emphasis': r'\*{1,3}(?=[^\s*])|\b_{1,3}(?=[^\s_])',

        # [link], ![img]
        'link': r'!?\[',

        # <https://example.com>. regex copied from commonmark.js
        'auto_link': r'<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>',
        'auto_email': AUTO_EMAIL,

        'inline_html': INLINE_HTML,

        'linebreak': STD_LINEBREAK,
        'softbreak': HARD_LINEBREAK,

        # "prec_" patterns only detect the *start* of a construct; they are
        # used by precedence_scan(), which strips the prefix and re-matches
        # with the full rule.
        'prec_auto_link': r'<[A-Za-z][A-Za-z\d.+-]{1,31}:',
        'prec_inline_html': r'</?' + HTML_TAGNAME + r'|<!|<\?',
    }
    DEFAULT_RULES = (
        'escape',
        'codespan',
        'emphasis',
        'link',
        'auto_link',
        'auto_email',
        'inline_html',
        'linebreak',
    )

    def __init__(self, hard_wrap: bool = False):
        """Create the parser.

        :param hard_wrap: when true, the ``linebreak`` rule uses
            ``HARD_LINEBREAK`` so every newline yields a ``linebreak``
            token; otherwise the ``softbreak`` rule is appended to the
            active rules.
        """
        super().__init__()

        self.hard_wrap = hard_wrap
        # lazy add linebreak
        # NOTE(review): ``self.specification`` and ``self.rules`` are not
        # assigned in this class; presumably Parser.__init__ creates them
        # from SPECIFICATION / DEFAULT_RULES -- confirm against .core.
        if hard_wrap:
            self.specification['linebreak'] = self.HARD_LINEBREAK
        else:
            self.rules.append('softbreak')

        # rule name -> bound ``parse_<name>`` method, for the active rules
        self._methods = {
            name: getattr(self, 'parse_' + name) for name in self.rules
        }

    def parse_escape(self, m: Match, state: InlineState) -> int:
        """Emit a run of backslash escapes as a text token (via unescape_char)."""
        text = unescape_char(m.group(0))
        state.append_token({
            'type': 'text',
            'raw': text,
        })
        return m.end()

    def parse_link(self, m: Match, state: InlineState) -> Optional[int]:
        """Parse ``[text](url)``, ``[text][label]``, ``[label]`` and images.

        :param m: match of the opening ``[`` or ``![``.
        :param state: state being tokenized; tokens are appended to it.
        :return: position after the consumed source, or ``None`` when the
            bracket does not start a link/image.
        """
        pos = m.end()

        marker = m.group(0)
        is_image = marker[0] == '!'
        # An image marker inside an image, or a link marker inside a link,
        # is kept as literal text.
        if (is_image and state.in_image) or (not is_image and state.in_link):
            state.append_token({'type': 'text', 'raw': marker})
            return pos

        text = None
        label, end_pos = parse_link_label(state.src, pos)
        if label is None:
            text, end_pos = parse_link_text(state.src, pos)
            if text is None:
                return None

        if text is None:
            text = label

        # Without a label, something must follow the bracket for this to
        # still become a link.
        if end_pos >= len(state.src) and label is None:
            return None

        # A code span / autolink / inline HTML that starts inside the
        # brackets and extends to or past their end wins over the link.
        rules = ['codespan', 'prec_auto_link', 'prec_inline_html']
        prec_pos = self.precedence_scan(m, state, end_pos, rules)
        if prec_pos:
            return prec_pos

        if end_pos < len(state.src):
            c = state.src[end_pos]
            if c == '(':
                # standard link [text](<url> "title")
                attrs, pos2 = parse_link(state.src, end_pos + 1)
                if pos2:
                    token = self.__parse_link_token(is_image, text, attrs, state)
                    state.append_token(token)
                    return pos2

            elif c == '[':
                # standard ref link [text][label]
                label2, pos2 = parse_link_label(state.src, end_pos + 1)
                if pos2:
                    end_pos = pos2
                    if label2:
                        label = label2

        if label is None:
            return None

        ref_links = state.env.get('ref_links')
        if not ref_links:
            return None

        key = unikey(label)
        env = ref_links.get(key)
        if env:
            attrs = {'url': env['url'], 'title': env.get('title')}
            token = self.__parse_link_token(is_image, text, attrs, state)
            token['ref'] = key
            token['label'] = label
            state.append_token(token)
            return end_pos
        # The label has no definition in ``ref_links``: not a link.
        return None

    def __parse_link_token(
        self,
        is_image: bool,
        text: str,
        attrs: Optional[Dict[str, Any]],
        state: InlineState,
    ) -> Dict[str, Any]:
        """Build an ``image`` or ``link`` token whose children come from *text*.

        *text* is rendered in a copy of *state* flagged as being inside an
        image/link, which is what makes nested markers of the same kind
        fall back to literal text in :meth:`parse_link`.
        """
        new_state = state.copy()
        new_state.src = text
        if is_image:
            new_state.in_image = True
            token_type = 'image'
        else:
            new_state.in_link = True
            token_type = 'link'
        return {
            'type': token_type,
            'children': self.render(new_state),
            'attrs': attrs,
        }

    def parse_auto_link(self, m: Match, state: InlineState) -> int:
        """Turn ``<scheme:...>`` into a link; inside a link keep it as text."""
        text = m.group(0)
        pos = m.end()
        if state.in_link:
            self.process_text(text, state)
            return pos

        # strip the surrounding angle brackets
        text = text[1:-1]
        self._add_auto_link(text, text, state)
        return pos

    def parse_auto_email(self, m: Match, state: InlineState) -> int:
        """Turn ``<user@host>`` into a ``mailto:`` link; inside a link keep text."""
        text = m.group(0)
        pos = m.end()
        if state.in_link:
            self.process_text(text, state)
            return pos

        # strip the surrounding angle brackets
        text = text[1:-1]
        url = 'mailto:' + text
        self._add_auto_link(url, text, state)
        return pos

    def _add_auto_link(self, url: str, text: str, state: InlineState) -> None:
        """Append a link token showing *text* and pointing at ``escape_url(url)``."""
        state.append_token({
            'type': 'link',
            'children': [{'type': 'text', 'raw': text}],
            'attrs': {'url': escape_url(url)},
        })

    def parse_emphasis(self, m: Match, state: InlineState) -> int:
        """Parse ``*em*``, ``**strong**``, ``***both***`` and ``_`` variants.

        :param m: match of the opening marker (one to three ``*`` or ``_``).
        :param state: state being tokenized; tokens are appended to it.
        :return: position after the consumed source.
        """
        pos = m.end()

        marker = m.group(0)
        mlen = len(marker)
        # Emphasis does not nest inside emphasis, nor strong inside strong:
        # keep the marker as literal text.
        if (mlen == 1 and state.in_emphasis) or (mlen == 2 and state.in_strong):
            state.append_token({'type': 'text', 'raw': marker})
            return pos

        _end_re = EMPHASIS_END_RE[marker]
        m1 = _end_re.search(state.src, pos)
        if not m1:
            # no closing marker: the opener is plain text
            state.append_token({'type': 'text', 'raw': marker})
            return pos

        end_pos = m1.end()
        text = state.src[pos:end_pos - mlen]

        # A code span / link / autolink / inline HTML that starts inside and
        # extends to or past the closing marker wins over the emphasis.
        prec_pos = self.precedence_scan(m, state, end_pos)
        if prec_pos:
            return prec_pos

        new_state = state.copy()
        new_state.src = text
        if mlen == 1:
            new_state.in_emphasis = True
            children = self.render(new_state)
            state.append_token({'type': 'emphasis', 'children': children})
        elif mlen == 2:
            new_state.in_strong = True
            children = self.render(new_state)
            state.append_token({'type': 'strong', 'children': children})
        else:
            # three markers: emphasis wrapping strong
            new_state.in_emphasis = True
            new_state.in_strong = True

            children = [{
                'type': 'strong',
                'children': self.render(new_state)
            }]
            state.append_token({
                'type': 'emphasis',
                'children': children,
            })
        return end_pos

    def parse_codespan(self, m: Match, state: InlineState) -> int:
        """Parse a backtick code span closed by a run of the same length.

        :return: position after the closing backticks, or just after the
            opening run (emitted as text) when no closing run exists.
        """
        marker = m.group(0)
        # require same marker with same length at end
        # (the marker consists only of backticks, so it needs no escaping)
        pattern = re.compile(r'(.*?[^`])' + marker + r'(?!`)', re.S)

        pos = m.end()
        closing = pattern.match(state.src, pos)
        if not closing:
            state.append_token({'type': 'text', 'raw': marker})
            return pos

        code = closing.group(1)
        # Line endings are treated like spaces
        code = code.replace('\n', ' ')
        # Strip one space from each side, unless the span is whitespace only.
        if code.strip() and code.startswith(' ') and code.endswith(' '):
            code = code[1:-1]
        state.append_token({'type': 'codespan', 'raw': escape(code)})
        return closing.end()

    def parse_linebreak(self, m: Match, state: InlineState) -> int:
        """Append a ``linebreak`` token and skip the matched text."""
        state.append_token({'type': 'linebreak'})
        return m.end()

    def parse_softbreak(self, m: Match, state: InlineState) -> int:
        """Append a ``softbreak`` token and skip the matched text."""
        state.append_token({'type': 'softbreak'})
        return m.end()

    def parse_inline_html(self, m: Match, state: InlineState) -> int:
        """Emit raw inline HTML and track whether we are inside ``<a>``.

        An opening ``<a ...>`` sets ``state.in_link`` and a closing ``</a>``
        clears it.
        """
        end_pos = m.end()
        html = m.group(0)
        state.append_token({'type': 'inline_html', 'raw': html})
        if html.startswith(('<a ', '<a>', '<A ', '<A>')):
            state.in_link = True
        elif html.startswith(('</a ', '</a>', '</A ', '</A>')):
            state.in_link = False
        return end_pos

    def process_text(self, text: str, state: InlineState) -> None:
        """Append *text* to *state* as a plain ``text`` token."""
        state.append_token({'type': 'text', 'raw': text})

    def parse(self, state: InlineState) -> List[Dict[str, Any]]:
        """Tokenize ``state.src`` and return ``state.tokens``.

        Text between recognised constructs is emitted through
        :meth:`process_text`.
        """
        pos = 0
        sc = self.compile_sc()
        while pos < len(state.src):
            m = sc.search(state.src, pos)
            if not m:
                break

            end_pos = m.start()
            if end_pos > pos:
                # plain text between the cursor and the next construct
                hole = state.src[pos:end_pos]
                self.process_text(hole, state)

            new_pos = self.parse_method(m, state)
            if not new_pos:
                # move cursor 1 character forward
                pos = end_pos + 1
                hole = state.src[end_pos:pos]
                self.process_text(hole, state)
            else:
                pos = new_pos

        if pos == 0:
            # special case, just pure text
            self.process_text(state.src, state)
        elif pos < len(state.src):
            self.process_text(state.src[pos:], state)
        return state.tokens

    def precedence_scan(
        self,
        m: Match,
        state: InlineState,
        end_pos: int,
        rules: Optional[List[str]] = None,
    ) -> Optional[int]:
        """Check whether a higher-precedence construct overrides a span.

        Looks between ``m.end()`` and *end_pos* for the start of one of
        *rules*.  If that construct parses and ends at or beyond *end_pos*,
        the text from ``m.start()`` up to the construct is emitted as plain
        text, followed by the construct's tokens.

        :param m: match of the opening marker of the outer construct.
        :param state: state receiving the tokens on success.
        :param end_pos: position where the outer construct would end.
        :param rules: rule names to scan for; defaults to code spans,
            links, autolinks and inline HTML.
        :return: position after the inner construct, or ``None`` when the
            outer construct should be parsed normally.
        """
        if rules is None:
            rules = ['codespan', 'link', 'prec_auto_link', 'prec_inline_html']

        mark_pos = m.end()
        sc = self.compile_sc(rules)
        m1 = sc.search(state.src, mark_pos, end_pos)
        if not m1:
            return None

        # "prec_*" rules only detect a start; re-match with the full rule.
        rule_name = m1.lastgroup.replace('prec_', '')
        sc = self.compile_sc([rule_name])
        m2 = sc.match(state.src, m1.start())
        if not m2:
            return None

        func = self._methods[rule_name]
        # Parse into a scratch state so nothing leaks if we bail out.
        new_state = state.copy()
        new_state.src = state.src
        m2_pos = func(m2, new_state)
        if not m2_pos or m2_pos < end_pos:
            return None

        raw_text = state.src[m.start():m2.start()]
        state.append_token({'type': 'text', 'raw': raw_text})
        for token in new_state.tokens:
            state.append_token(token)
        return m2_pos

    def render(self, state: InlineState) -> List[Dict[str, Any]]:
        """Run :meth:`parse` on *state* and return ``state.tokens``."""
        self.parse(state)
        return state.tokens

    def __call__(self, s: str, env: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Tokenize the string *s* using a fresh state built from *env*."""
        state = self.state_cls(env)
        state.src = s
        return self.render(state)