Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/mistune/inline_parser.py: 99%

231 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-01 06:54 +0000

1import re 

2from typing import Optional, List, Dict, Any, Match 

3from .core import Parser, InlineState 

4from .util import ( 

5 escape, 

6 escape_url, 

7 unikey, 

8) 

9from .helpers import ( 

10 PREVENT_BACKSLASH, 

11 PUNCTUATION, 

12 HTML_TAGNAME, 

13 HTML_ATTRIBUTES, 

14 unescape_char, 

15 parse_link, 

16 parse_link_label, 

17 parse_link_text, 

18) 

19 

# Optional whitespace followed by a closing parenthesis.
# NOTE(review): not referenced anywhere in the visible part of this module.
PAREN_END_RE = re.compile(r'\s*\)')

# Email autolink such as ``<user@example.com>``: a local part, ``@``, then one
# or more dot-separated domain labels. Every label starts and ends with an
# alphanumeric character and is at most 63 characters long (1 + 0..61 + 1).
AUTO_EMAIL = (
    r'''<[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9]'''
    r'(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?'
    r'(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*>'
)

# Alternation of every raw inline HTML construct that is recognised.
# The comment branch rejects text that starts with ``>`` or ``->``, contains
# ``--`` or ends with ``-``.
# NOTE(review): in the CDATA branch only the first ``[`` is matched literally;
# the second ``[`` of ``<![CDATA[`` is consumed by ``[\s\S]+?``.
INLINE_HTML = (
    r'<' + HTML_TAGNAME + HTML_ATTRIBUTES + r'\s*/?>|'  # open tag
    r'</' + HTML_TAGNAME + r'\s*>|'  # close tag
    r'<!--(?!>|->)(?:(?!--)[\s\S])+?(?<!-)-->|'  # comment
    r'<\?[\s\S]+?\?>|'  # script like <?php?>
    r'<![A-Z][\s\S]+?>|'  # doctype
    r'<!\[CDATA[\s\S]+?\]\]>'  # cdata
)

# Closing-delimiter patterns keyed by the opening emphasis marker.
# Each pattern requires the closing run to be preceded by either an escaped
# marker character (guarded by PREVENT_BACKSLASH, defined in ``.helpers``) or a
# character that is neither whitespace nor the marker itself, and not to be
# followed by one more marker character. The underscore variants additionally
# require a word boundary right after the closing run.
EMPHASIS_END_RE = {
    '*': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\\*|[^\s*])\*(?!\*)'),
    '_': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\_|[^\s_])_(?!_)\b'),

    '**': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\\*|[^\s*])\*\*(?!\*)'),
    '__': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\_|[^\s_])__(?!_)\b'),

    '***': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\\*|[^\s*])\*\*\*(?!\*)'),
    '___': re.compile(r'(?:' + PREVENT_BACKSLASH + r'\\_|[^\s_])___(?!_)\b'),
}

47 

48 

class InlineParser(Parser):
    """Tokenizer for inline-level Markdown.

    Scans a source string for escapes, code spans, emphasis, links/images,
    autolinks, inline HTML and line breaks, appending one token dict per
    construct to an ``InlineState``; text between constructs becomes
    ``text`` tokens.
    """

    # NOTE(review): presumably the regex flags used when the scanner is
    # compiled by ``Parser`` -- the consumer is not visible in this module.
    sc_flag = 0
    # State class instantiated by ``__call__`` for every parse run.
    state_cls = InlineState

    #: standard hard line break: a backslash or two-or-more spaces right
    #: before the newline; whitespace after the newline is consumed as well
    STD_LINEBREAK = r'(?:\\| {2,})\n\s*'

    #: any newline, with optional spaces before it and whitespace after it;
    #: with ``hard_wrap`` every new line becomes <br>
    HARD_LINEBREAK = r' *\n\s*'

    # we only need to find the start pattern of an inline token
    SPECIFICATION = {
        # e.g. \`, \$
        'escape': r'(?:\\' + PUNCTUATION + ')+',

        # `code, ```code
        'codespan': r'`{1,}',

        # *w, **w, _w, __w
        'emphasis': r'\*{1,3}(?=[^\s*])|\b_{1,3}(?=[^\s_])',

        # [link], ![img]
        'link': r'!?\[',

        # <https://example.com>. regex copied from commonmark.js
        'auto_link': r'<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>',
        'auto_email': AUTO_EMAIL,

        'inline_html': INLINE_HTML,

        # 'softbreak' deliberately uses HARD_LINEBREAK (any newline); it is
        # only enabled by __init__ when ``hard_wrap`` is false.
        'linebreak': STD_LINEBREAK,
        'softbreak': HARD_LINEBREAK,

        # Prefix-only patterns used by ``precedence_scan`` to spot a possible
        # autolink / HTML tag cheaply; the ``prec_`` prefix is stripped there
        # and the full rule is matched again at the same position.
        'prec_auto_link': r'<[A-Za-z][A-Za-z\d.+-]{1,31}:',
        'prec_inline_html': r'</?' + HTML_TAGNAME + r'|<!|<\?',
    }
    # Rule names enabled by default; 'softbreak' is appended by __init__
    # when ``hard_wrap`` is false.
    DEFAULT_RULES = (
        'escape',
        'codespan',
        'emphasis',
        'link',
        'auto_link',
        'auto_email',
        'inline_html',
        'linebreak',
    )

95 

def __init__(self, hard_wrap: bool = False):
    """Set up the parser and choose how bare newlines are tokenized.

    :param hard_wrap: when true, the ``linebreak`` rule is re-pointed at
        ``HARD_LINEBREAK`` so that every newline yields a ``linebreak``
        token; otherwise the additional ``softbreak`` rule is enabled.
    """
    super().__init__()
    self.hard_wrap = hard_wrap

    # lazy add linebreak
    if not hard_wrap:
        self.rules.append('softbreak')
    else:
        self.specification['linebreak'] = self.HARD_LINEBREAK

    # Map every active rule name to its ``parse_<name>`` bound method.
    handlers = {}
    for rule in self.rules:
        handlers[rule] = getattr(self, 'parse_' + rule)
    self._methods = handlers

109 

def parse_escape(self, m: Match, state: InlineState) -> int:
    """Emit a ``text`` token for a run of backslash escapes.

    The matched text is passed through ``unescape_char`` before being
    stored as the token's ``raw`` value.

    :returns: the position just after the match.
    """
    unescaped = unescape_char(m.group(0))
    state.append_token({'type': 'text', 'raw': unescaped})
    return m.end()

118 

def parse_link(self, m: Match, state: InlineState) -> Optional[int]:
    """Parse a link or image that starts at ``[`` / ``![``.

    Handles inline links ``[text](url "title")`` as well as reference links
    (``[text][label]`` and ``[label]``) resolved through
    ``state.env['ref_links']``.

    :returns: the position to resume scanning from, or ``None`` when no
        link could be built at this position.
    """
    marker = m.group(0)
    pos = m.end()
    is_image = marker[0] == '!'

    # An image inside an image, or a link inside a link, is not allowed:
    # keep the marker as literal text.
    nested = state.in_image if is_image else state.in_link
    if nested:
        state.append_token({'type': 'text', 'raw': marker})
        return pos

    label, end_pos = parse_link_label(state.src, pos)
    if label is not None:
        text = label
    else:
        text, end_pos = parse_link_text(state.src, pos)
        # Text that is not a valid label needs something after it
        # (a destination or a label) to become a link.
        if text is None or end_pos >= len(state.src):
            return None

    # Code spans, autolinks and raw HTML starting inside the brackets win
    # over the link when they extend past its closing bracket.
    prec_pos = self.precedence_scan(
        m, state, end_pos,
        ['codespan', 'prec_auto_link', 'prec_inline_html'],
    )
    if prec_pos:
        return prec_pos

    if end_pos < len(state.src):
        following = state.src[end_pos]
        if following == '(':
            # standard link [text](<url> "title")
            attrs, after = parse_link(state.src, end_pos + 1)
            if after:
                token = self.__parse_link_token(is_image, text, attrs, state)
                state.append_token(token)
                return after
        elif following == '[':
            # standard ref link [text][label]
            label2, after = parse_link_label(state.src, end_pos + 1)
            if after:
                end_pos = after
                if label2:
                    label = label2

    if label is None:
        return None

    ref_links = state.env.get('ref_links')
    if not ref_links:
        return None

    key = unikey(label)
    target = ref_links.get(key)
    if not target:
        return None

    attrs = {'url': target['url'], 'title': target.get('title')}
    token = self.__parse_link_token(is_image, text, attrs, state)
    token['ref'] = key
    token['label'] = label
    state.append_token(token)
    return end_pos

183 

def __parse_link_token(self, is_image, text, attrs, state):
    """Build an ``image`` or ``link`` token whose children come from ``text``.

    ``text`` is rendered in a copy of ``state`` flagged ``in_image`` or
    ``in_link`` respectively; ``attrs`` is stored on the token unchanged.
    """
    child = state.copy()
    child.src = text
    if is_image:
        child.in_image = True
        kind = 'image'
    else:
        child.in_link = True
        kind = 'link'
    return {
        'type': kind,
        'children': self.render(child),
        'attrs': attrs,
    }

202 

def parse_auto_link(self, m: Match, state: InlineState) -> int:
    """Handle a ``<scheme:...>`` autolink.

    Inside a link the whole match (angle brackets included) is kept as
    plain text; otherwise a link is added whose URL and text are both the
    content between the angle brackets.

    :returns: the position just after the match.
    """
    matched = m.group(0)
    resume = m.end()
    if not state.in_link:
        target = matched[1:-1]
        self._add_auto_link(target, target, state)
    else:
        self.process_text(matched, state)
    return resume

213 

def parse_auto_email(self, m: Match, state: InlineState) -> int:
    """Handle an ``<address@host>`` email autolink.

    Inside a link the whole match (angle brackets included) is kept as
    plain text; otherwise a link is added whose text is the address and
    whose URL is the address prefixed with ``mailto:``.

    :returns: the position just after the match.
    """
    matched = m.group(0)
    resume = m.end()
    if not state.in_link:
        address = matched[1:-1]
        self._add_auto_link('mailto:' + address, address, state)
    else:
        self.process_text(matched, state)
    return resume

225 

def _add_auto_link(self, url, text, state):
    """Append a ``link`` token with a single ``text`` child.

    The URL is passed through ``escape_url`` before being stored; ``text``
    is stored as given.
    """
    label = {'type': 'text', 'raw': text}
    token = {
        'type': 'link',
        'children': [label],
        'attrs': {'url': escape_url(url)},
    }
    state.append_token(token)

232 

def parse_emphasis(self, m: Match, state: InlineState) -> int:
    """Parse ``*em*``, ``**strong**`` or ``***both***`` (or the ``_`` forms).

    The marker is kept as literal text when the same kind of emphasis is
    already open (single marker inside emphasis, double marker inside
    strong) or when no closing delimiter is found.

    :returns: the position to resume scanning from.
    """
    marker = m.group(0)
    pos = m.end()
    mlen = len(marker)

    already_open = (
        (mlen == 1 and state.in_emphasis)
        or (mlen == 2 and state.in_strong)
    )
    closer = None
    if not already_open:
        closer = EMPHASIS_END_RE[marker].search(state.src, pos)
    if closer is None:
        state.append_token({'type': 'text', 'raw': marker})
        return pos

    end_pos = closer.end()
    text = state.src[pos:end_pos - mlen]

    # A code span, link, autolink or raw HTML that starts inside the
    # emphasis but ends after it takes precedence.
    prec_pos = self.precedence_scan(m, state, end_pos)
    if prec_pos:
        return prec_pos

    inner = state.copy()
    inner.src = text
    if mlen == 1:
        inner.in_emphasis = True
        token = {'type': 'emphasis', 'children': self.render(inner)}
    elif mlen == 2:
        inner.in_strong = True
        token = {'type': 'strong', 'children': self.render(inner)}
    else:
        # triple marker: strong nested inside emphasis
        inner.in_emphasis = True
        inner.in_strong = True
        strong = {'type': 'strong', 'children': self.render(inner)}
        token = {'type': 'emphasis', 'children': [strong]}
    state.append_token(token)
    return end_pos

281 

def parse_codespan(self, m: Match, state: InlineState) -> int:
    """Parse a code span delimited by equal-length backtick runs.

    Without a matching closing run the opening backticks are kept as
    literal text. Otherwise newlines in the code become spaces, one
    leading and one trailing space are dropped when both are present and
    the code is not whitespace only, and the result is stored after being
    passed through ``escape``.

    :returns: the position to resume scanning from.
    """
    marker = m.group(0)
    pos = m.end()

    # require same marker with same length at end
    closing = re.compile(r'(.*?[^`])' + marker + r'(?!`)', re.S)
    found = closing.match(state.src, pos)
    if not found:
        state.append_token({'type': 'text', 'raw': marker})
        return pos

    # Line endings are treated like spaces
    code = found.group(1).replace('\n', ' ')
    if code.strip() and code.startswith(' ') and code.endswith(' '):
        code = code[1:-1]
    state.append_token({'type': 'codespan', 'raw': escape(code)})
    return found.end()

303 

def parse_linebreak(self, m: Match, state: InlineState) -> int:
    """Append a ``linebreak`` token and resume right after the match."""
    token = {'type': 'linebreak'}
    state.append_token(token)
    resume = m.end()
    return resume

307 

def parse_softbreak(self, m: Match, state: InlineState) -> int:
    """Append a ``softbreak`` token and resume right after the match."""
    token = {'type': 'softbreak'}
    state.append_token(token)
    resume = m.end()
    return resume

311 

def parse_inline_html(self, m: Match, state: InlineState) -> int:
    """Emit an ``inline_html`` token and track raw ``<a>`` nesting.

    An opening ``<a ...>`` tag sets ``state.in_link`` and a closing
    ``</a>`` tag clears it. The tag name is matched case-insensitively
    and may be followed by ``>`` or by any whitespace character, so a tag
    whose attributes start on the next line (``<a\\nhref="...">``) or
    after a tab is recognised as well.

    :returns: the position just after the match.
    """
    end_pos = m.end()
    html = m.group(0)
    state.append_token({'type': 'inline_html', 'raw': html})

    # The name must be exactly "a": require a boundary character right
    # after it so that tags such as <abbr> or <article> are ignored.
    if html[:2] in ('<a', '<A'):
        boundary = html[2:3]
        if boundary == '>' or boundary.isspace():
            state.in_link = True
    elif html[:3] in ('</a', '</A'):
        boundary = html[3:4]
        if boundary == '>' or boundary.isspace():
            state.in_link = False
    return end_pos

321 

def process_text(self, text: str, state: InlineState):
    """Append ``text`` to ``state`` as a plain ``text`` token."""
    token = {'type': 'text', 'raw': text}
    state.append_token(token)

324 

def parse(self, state: InlineState) -> List[Dict[str, Any]]:
    """Tokenize ``state.src`` and return ``state.tokens``.

    The compiled scanner locates the next possible token start; text in
    front of it is emitted through ``process_text`` and the match is
    handed to ``parse_method``. When the handler returns a falsy
    position, the single character at the match start is emitted as text
    and scanning resumes right after it.
    """
    cursor = 0
    scanner = self.compile_sc()
    while cursor < len(state.src):
        found = scanner.search(state.src, cursor)
        if found is None:
            break

        start = found.start()
        if start > cursor:
            self.process_text(state.src[cursor:start], state)

        advanced = self.parse_method(found, state)
        if advanced:
            cursor = advanced
            continue

        # move cursor 1 character forward
        cursor = start + 1
        self.process_text(state.src[start:cursor], state)

    if cursor == 0:
        # special case, just pure text
        self.process_text(state.src, state)
    elif cursor < len(state.src):
        self.process_text(state.src[cursor:], state)
    return state.tokens

353 

def precedence_scan(self, m: Match, state: InlineState, end_pos: int, rules=None):
    """Let a higher-precedence construct override the span opened by ``m``.

    Searches between the end of ``m`` and ``end_pos`` for the start of one
    of ``rules``. When the matching handler consumes input up to or past
    ``end_pos``, the text from the start of ``m`` to that construct is
    emitted as literal text, followed by the construct's tokens.

    :param rules: rule names to look for; defaults to code spans, links,
        autolinks and inline HTML. A ``prec_`` prefix is stripped to find
        the rule that is actually matched and handled.
    :returns: the position to resume from, or ``None`` when nothing
        overrides the span.
    """
    if rules is None:
        rules = ['codespan', 'link', 'prec_auto_link', 'prec_inline_html']

    probe = self.compile_sc(rules).search(state.src, m.end(), end_pos)
    if not probe:
        return None

    rule_name = probe.lastgroup.replace('prec_', '')
    hit = self.compile_sc([rule_name]).match(state.src, probe.start())
    if not hit:
        return None

    # Run the handler on a scratch state so nothing is emitted unless the
    # construct really reaches the end of the span.
    handler = self._methods[rule_name]
    scratch = state.copy()
    scratch.src = state.src
    resume = handler(hit, scratch)
    if not resume or resume < end_pos:
        return None

    literal = state.src[m.start():hit.start()]
    state.append_token({'type': 'text', 'raw': literal})
    for token in scratch.tokens:
        state.append_token(token)
    return resume

382 

def render(self, state: InlineState):
    """Run ``parse`` on ``state`` and hand back its token list."""
    self.parse(state)
    tokens = state.tokens
    return tokens

386 

def __call__(self, s, env):
    """Tokenize the inline source ``s`` using a fresh state built from ``env``."""
    fresh = self.state_cls(env)
    fresh.src = s
    tokens = self.render(fresh)
    return tokens