1import re
2from typing import (
3 Any,
4 Dict,
5 List,
6 Match,
7 MutableMapping,
8 Optional,
9)
10
11from .core import InlineState, Parser
12from .helpers import (
13 HTML_ATTRIBUTES,
14 HTML_TAGNAME,
15 PREVENT_BACKSLASH,
16 PUNCTUATION,
17 parse_link,
18 parse_link_label,
19 parse_link_text,
20 unescape_char,
21)
22from .util import escape_url, unikey
23
# Optional whitespace followed by a closing parenthesis (module-level
# constant; not referenced in this chunk — presumably used by other rules).
PAREN_END_RE = re.compile(r"\s*\)")

# Email autolink: <local@domain> where the domain is one or more
# dot-separated DNS labels of at most 63 characters each (the {0,61}
# middle plus the required first/last alphanumeric character).
AUTO_EMAIL = (
    r"""<[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9]"""
    r"(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?"
    r"(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*>"
)

# Raw inline HTML alternatives; HTML_TAGNAME / HTML_ATTRIBUTES come from
# .helpers and are assumed to match a tag name and its attribute list.
INLINE_HTML = (
    r"<" + HTML_TAGNAME + HTML_ATTRIBUTES + r"\s*/?>|"  # open tag
    r"</" + HTML_TAGNAME + r"\s*>|"  # close tag
    r"<!--(?!>|->)(?:(?!--)[\s\S])+?(?<!-)-->|"  # comment
    r"<\?[\s\S]+?\?>|"  # script like <?php?>
    r"<![A-Z][\s\S]+?>|"  # doctype
    r"<!\[CDATA[\s\S]+?\]\]>"  # cdata
)

# Map an opening emphasis marker to a regex that finds its closing run.
# Each pattern requires a non-space, non-marker character (or an escaped
# marker — PREVENT_BACKSLASH presumably rejects an escaping backslash;
# defined in .helpers) immediately before the closing run, and a negative
# lookahead so the run is exactly the marker's length.  The trailing \b on
# the underscore variants enforces a word boundary after the closing `_`.
EMPHASIS_END_RE = {
    "*": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\\*|[^\s*])\*(?!\*)"),
    "_": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\_|[^\s_])_(?!_)\b"),
    "**": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\\*|[^\s*])\*\*(?!\*)"),
    "__": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\_|[^\s_])__(?!_)\b"),
    "***": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\\*|[^\s*])\*\*\*(?!\*)"),
    "___": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\_|[^\s_])___(?!_)\b"),
}
49
50
class InlineParser(Parser[InlineState]):
    """Parser for inline-level Markdown constructs.

    A combined scanner regex (built from :attr:`SPECIFICATION` by the base
    class — ``compile_sc`` is inherited, not defined here) locates the START
    of each inline token; the matching ``parse_<rule>`` method then consumes
    the full token, appends token dicts to the :class:`InlineState`, and
    returns the new cursor position (or ``None``/falsy on failure, in which
    case the scan advances one character and treats it as plain text).
    """

    # flag bits for the combined scanner regex; inline rules need none
    sc_flag = 0
    # state class instantiated per top-level parse (see __call__)
    state_cls = InlineState

    #: linebreak leaves two spaces at the end of line
    STD_LINEBREAK = r"(?:\\| {2,})\n\s*"

    #: every new line becomes <br>
    HARD_LINEBREAK = r" *\n\s*"

    # we only need to find the start pattern of an inline token
    SPECIFICATION = {
        # e.g. \`, \$
        "escape": r"(?:\\" + PUNCTUATION + ")+",
        # `code, ```code
        "codespan": r"`{1,}",
        # *w, **w, _w, __w
        "emphasis": r"\*{1,3}(?=[^\s*])|\b_{1,3}(?=[^\s_])",
        # [link], ![img]
        "link": r"!?\[",
        # <https://example.com>. regex copied from commonmark.js
        "auto_link": r"<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>",
        "auto_email": AUTO_EMAIL,
        "inline_html": INLINE_HTML,
        "linebreak": STD_LINEBREAK,
        "softbreak": HARD_LINEBREAK,
        # "prec_" rules match only the *prefix* of a token; they are used by
        # precedence_scan to detect that a higher-precedence token may begin
        # inside a candidate span without matching the whole token up front.
        "prec_auto_link": r"<[A-Za-z][A-Za-z\d.+-]{1,31}:",
        "prec_inline_html": r"</?" + HTML_TAGNAME + r"|<!|<\?",
    }
    DEFAULT_RULES = (
        "escape",
        "codespan",
        "emphasis",
        "link",
        "auto_link",
        "auto_email",
        "inline_html",
        "linebreak",
    )

    def __init__(self, hard_wrap: bool = False) -> None:
        """Create an inline parser.

        :param hard_wrap: when True, every newline is treated as a hard
            line break (``<br>``); otherwise a plain newline is a softbreak
            and a hard break requires a trailing backslash or two spaces.
        """
        super(InlineParser, self).__init__()

        self.hard_wrap = hard_wrap
        # lazy add linebreak
        if hard_wrap:
            # reuse the "linebreak" slot so any newline becomes a hard break
            self.specification["linebreak"] = self.HARD_LINEBREAK
        else:
            self.rules.append("softbreak")

        # rule name -> bound parse_<name> method, resolved once up front
        self._methods = {name: getattr(self, "parse_" + name) for name in self.rules}

    def parse_escape(self, m: Match[str], state: InlineState) -> int:
        """Turn one or more backslash escapes into a plain text token."""
        text = m.group(0)
        text = unescape_char(text)
        state.append_token(
            {
                "type": "text",
                "raw": text,
            }
        )
        return m.end()

    def parse_link(self, m: Match[str], state: InlineState) -> Optional[int]:
        """Parse ``[text](url)``, ``![alt](url)`` and reference-style links.

        Returns the new cursor position, or ``None`` if the bracket does not
        form a valid link (the caller then emits the marker as plain text).
        """
        pos = m.end()

        marker = m.group(0)
        is_image = marker[0] == "!"
        # links/images do not nest inside themselves: emit the marker as
        # literal text instead of recursing
        if is_image and state.in_image:
            state.append_token({"type": "text", "raw": marker})
            return pos
        elif not is_image and state.in_link:
            state.append_token({"type": "text", "raw": marker})
            return pos

        text = None
        # first try the stricter [label] form; fall back to arbitrary
        # bracketed link text (which may contain nested brackets)
        label, end_pos = parse_link_label(state.src, pos)
        if label is None:
            text, end_pos = parse_link_text(state.src, pos)
            if text is None:
                return None

        assert end_pos is not None

        if text is None:
            text = label

        assert text is not None

        # a bare [text] at the very end of input can't be followed by a
        # destination, and without a label it can't be a shortcut ref link
        if end_pos >= len(state.src) and label is None:
            return None

        # code spans, autolinks and raw HTML have higher precedence than
        # link text; if one starts inside [..] and extends past it, yield
        rules = ["codespan", "prec_auto_link", "prec_inline_html"]
        prec_pos = self.precedence_scan(m, state, end_pos, rules)
        if prec_pos:
            return prec_pos

        if end_pos < len(state.src):
            c = state.src[end_pos]
            if c == "(":
                # standard link [text](<url> "title")
                attrs, pos2 = parse_link(state.src, end_pos + 1)
                if pos2:
                    token = self.__parse_link_token(is_image, text, attrs, state)
                    state.append_token(token)
                    return pos2

            elif c == "[":
                # standard ref link [text][label]
                label2, pos2 = parse_link_label(state.src, end_pos + 1)
                if pos2:
                    end_pos = pos2
                    if label2:
                        label = label2

        # reference-style resolution: collapsed [label][] and shortcut
        # [label] forms both end up here with `label` set
        if label is None:
            return None

        # ref link definitions are collected during block parsing and
        # shared via the parse environment
        ref_links = state.env.get("ref_links")
        if not ref_links:
            return None

        key = unikey(label)
        env = ref_links.get(key)
        if env:
            attrs = {"url": env["url"], "title": env.get("title")}
            token = self.__parse_link_token(is_image, text, attrs, state)
            token["ref"] = key
            token["label"] = label
            state.append_token(token)
            return end_pos
        return None

    def __parse_link_token(
        self,
        is_image: bool,
        text: str,
        attrs: Optional[Dict[str, Any]],
        state: InlineState,
    ) -> Dict[str, Any]:
        """Build a link/image token, recursively parsing `text` as children.

        A copy of `state` is used so nesting flags (in_link / in_image) do
        not leak back into the caller's state.
        """
        new_state = state.copy()
        new_state.src = text
        if is_image:
            new_state.in_image = True
            token = {
                "type": "image",
                "children": self.render(new_state),
                "attrs": attrs,
            }
        else:
            new_state.in_link = True
            token = {
                "type": "link",
                "children": self.render(new_state),
                "attrs": attrs,
            }
        return token

    def parse_auto_link(self, m: Match[str], state: InlineState) -> int:
        """Parse an autolink like ``<https://example.com>``."""
        text = m.group(0)
        pos = m.end()
        # no links inside links: keep the raw text instead
        if state.in_link:
            self.process_text(text, state)
            return pos

        # strip the surrounding angle brackets
        text = text[1:-1]
        self._add_auto_link(text, text, state)
        return pos

    def parse_auto_email(self, m: Match[str], state: InlineState) -> int:
        """Parse an email autolink like ``<user@example.com>``."""
        text = m.group(0)
        pos = m.end()
        # no links inside links: keep the raw text instead
        if state.in_link:
            self.process_text(text, state)
            return pos

        # strip the surrounding angle brackets and add the mailto: scheme
        text = text[1:-1]
        url = "mailto:" + text
        self._add_auto_link(url, text, state)
        return pos

    def _add_auto_link(self, url: str, text: str, state: InlineState) -> None:
        """Append a link token whose visible text is the (unescaped) URL."""
        state.append_token(
            {
                "type": "link",
                "children": [{"type": "text", "raw": text}],
                "attrs": {"url": escape_url(url)},
            }
        )

    def parse_emphasis(self, m: Match[str], state: InlineState) -> int:
        """Parse ``*em*``, ``**strong**`` and ``***both***`` (and ``_`` forms).

        Always returns a cursor position: an unmatched opener is emitted as
        plain text rather than failing.
        """
        pos = m.end()

        marker = m.group(0)
        mlen = len(marker)
        # same-strength emphasis does not nest inside itself
        if mlen == 1 and state.in_emphasis:
            state.append_token({"type": "text", "raw": marker})
            return pos
        elif mlen == 2 and state.in_strong:
            state.append_token({"type": "text", "raw": marker})
            return pos

        _end_re = EMPHASIS_END_RE[marker]
        m1 = _end_re.search(state.src, pos)
        if not m1:
            # no closing run: the opener is literal text
            state.append_token({"type": "text", "raw": marker})
            return pos

        end_pos = m1.end()
        # inner text excludes the closing marker (the end regex also
        # consumed one preceding character, which stays in `text`)
        text = state.src[pos : end_pos - mlen]

        # code spans / links / autolinks / HTML starting inside the span
        # and extending past it take precedence over emphasis
        prec_pos = self.precedence_scan(m, state, end_pos)
        if prec_pos:
            return prec_pos

        # recursively parse the inner text with nesting flags set on a copy
        new_state = state.copy()
        new_state.src = text
        if mlen == 1:
            new_state.in_emphasis = True
            children = self.render(new_state)
            state.append_token({"type": "emphasis", "children": children})
        elif mlen == 2:
            new_state.in_strong = True
            children = self.render(new_state)
            state.append_token({"type": "strong", "children": children})
        else:
            # triple marker: emphasis wrapping strong
            new_state.in_emphasis = True
            new_state.in_strong = True

            children = [{"type": "strong", "children": self.render(new_state)}]
            state.append_token(
                {
                    "type": "emphasis",
                    "children": children,
                }
            )
        return end_pos

    def parse_codespan(self, m: Match[str], state: InlineState) -> int:
        """Parse a code span opened by a run of one or more backticks."""
        marker = m.group(0)
        # require same marker with same length at end

        pattern = re.compile(r"(.*?[^`])" + marker + r"(?!`)", re.S)

        pos = m.end()
        m2 = pattern.match(state.src, pos)
        if m2:
            end_pos = m2.end()
            code = m2.group(1)
            # Line endings are treated like spaces
            code = code.replace("\n", " ")
            # strip one leading/trailing space pair, but only if the code
            # is not entirely spaces (per CommonMark)
            if len(code.strip()):
                if code.startswith(" ") and code.endswith(" "):
                    code = code[1:-1]
            state.append_token({"type": "codespan", "raw": code})
            return end_pos
        else:
            # no matching closer: the backtick run is literal text
            state.append_token({"type": "text", "raw": marker})
            return pos

    def parse_linebreak(self, m: Match[str], state: InlineState) -> int:
        """Emit a hard line break token."""
        state.append_token({"type": "linebreak"})
        return m.end()

    def parse_softbreak(self, m: Match[str], state: InlineState) -> int:
        """Emit a soft line break token."""
        state.append_token({"type": "softbreak"})
        return m.end()

    def parse_inline_html(self, m: Match[str], state: InlineState) -> int:
        """Emit raw inline HTML, tracking open/close of <a> tags so that
        Markdown links are suppressed inside raw HTML anchors."""
        end_pos = m.end()
        html = m.group(0)
        state.append_token({"type": "inline_html", "raw": html})
        if html.startswith(("<a ", "<a>", "<A ", "<A>")):
            state.in_link = True
        elif html.startswith(("</a ", "</a>", "</A ", "</A>")):
            state.in_link = False
        return end_pos

    def process_text(self, text: str, state: InlineState) -> None:
        """Append plain text; subclasses may override to post-process it."""
        state.append_token({"type": "text", "raw": text})

    def parse(self, state: InlineState) -> List[Dict[str, Any]]:
        """Main scan loop: tokenize ``state.src`` into ``state.tokens``.

        Text between rule matches ("holes") becomes plain text tokens; a
        rule method returning a falsy position means the match was not a
        real token, so the cursor advances one character as literal text.
        """
        pos = 0
        sc = self.compile_sc()
        while pos < len(state.src):
            m = sc.search(state.src, pos)
            if not m:
                break

            end_pos = m.start()
            if end_pos > pos:
                # plain text between the previous token and this match
                hole = state.src[pos:end_pos]
                self.process_text(hole, state)

            new_pos = self.parse_method(m, state)
            if not new_pos:
                # move cursor 1 character forward
                pos = end_pos + 1
                hole = state.src[end_pos:pos]
                self.process_text(hole, state)
            else:
                pos = new_pos

        if pos == 0:
            # special case, just pure text
            self.process_text(state.src, state)
        elif pos < len(state.src):
            # trailing text after the last token
            self.process_text(state.src[pos:], state)
        return state.tokens

    def precedence_scan(
        self,
        m: Match[str],
        state: InlineState,
        end_pos: int,
        rules: Optional[List[str]] = None,
    ) -> Optional[int]:
        """Check whether a higher-precedence token overlaps the candidate.

        Scans ``state.src[m.end():end_pos]`` for the start of a token from
        `rules`.  If one is found and, when fully parsed, it extends to at
        least `end_pos`, the candidate token (emphasis/link) loses: the text
        before the winner is emitted literally, the winner's tokens are
        appended, and the winner's end position is returned.  Otherwise
        returns ``None`` and the caller proceeds with its own token.
        """
        if rules is None:
            rules = ["codespan", "link", "prec_auto_link", "prec_inline_html"]

        mark_pos = m.end()
        sc = self.compile_sc(rules)
        m1 = sc.search(state.src, mark_pos, end_pos)
        if not m1:
            return None

        lastgroup = m1.lastgroup
        if not lastgroup:
            return None
        # a "prec_" rule only matched a token prefix; re-match with the
        # full rule to parse the complete token
        rule_name = lastgroup.replace("prec_", "")
        sc = self.compile_sc([rule_name])
        m2 = sc.match(state.src, m1.start())
        if not m2:
            return None

        func = self._methods[rule_name]
        # parse into a scratch state so nothing is emitted unless it wins
        new_state = state.copy()
        new_state.src = state.src
        m2_pos = func(m2, new_state)
        # only a token reaching past the candidate's end takes precedence
        if not m2_pos or m2_pos < end_pos:
            return None

        raw_text = state.src[m.start() : m2.start()]
        state.append_token({"type": "text", "raw": raw_text})
        for token in new_state.tokens:
            state.append_token(token)
        return m2_pos

    def render(self, state: InlineState) -> List[Dict[str, Any]]:
        """Parse `state` and return its accumulated token list."""
        self.parse(state)
        return state.tokens

    def __call__(self, s: str, env: MutableMapping[str, Any]) -> List[Dict[str, Any]]:
        """Tokenize string `s` with a fresh state bound to environment `env`."""
        state = self.state_cls(env)
        state.src = s
        return self.render(state)