Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/markdown/inlinepatterns.py: 42%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

408 statements  

1# Python Markdown 

2 

3# A Python implementation of John Gruber's Markdown. 

4 

5# Documentation: https://python-markdown.github.io/ 

6# GitHub: https://github.com/Python-Markdown/markdown/ 

7# PyPI: https://pypi.org/project/Markdown/ 

8 

9# Started by Manfred Stienstra (http://www.dwerg.net/). 

10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). 

11# Currently maintained by Waylan Limberg (https://github.com/waylan), 

12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). 

13 

14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later) 

15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) 

16# Copyright 2004 Manfred Stienstra (the original version) 

17 

18# License: BSD (see LICENSE.md for details). 

19 

20""" 

21In version 3.0, a new, more flexible inline processor was added, [`markdown.inlinepatterns.InlineProcessor`][]. The 

22original inline patterns, which inherit from [`markdown.inlinepatterns.Pattern`][] or one of its children are still 

23supported, though users are encouraged to migrate. 

24 

25The new `InlineProcessor` provides two major enhancements to `Patterns`: 

26 

271. Inline Processors no longer need to match the entire block, so regular expressions no longer need to start with 

28 `r'^(.*?)'` and end with `r'(.*)$'`. This runs faster. The returned [`Match`][re.Match] object will only contain 

29 what is explicitly matched in the pattern, and extension pattern groups now start with `m.group(1)`. 

30 

312. The `handleMatch` method now takes an additional input called `data`, which is the entire block under analysis, 

32 not just what is matched with the specified pattern. The method now returns the element *and* the indexes relative 

33 to `data` that the return element is replacing (usually `m.start(0)` and `m.end(0)`). If the boundaries are 

34 returned as `None`, it is assumed that the match did not take place, and nothing will be altered in `data`. 

35 

36 This allows handling of more complex constructs than regular expressions can handle, e.g., matching nested 

37 brackets, and explicit control of the span "consumed" by the processor. 

38 

39""" 

40 

41from __future__ import annotations 

42 

43from . import util 

44from typing import TYPE_CHECKING, Any, Collection, NamedTuple 

45import re 

46import xml.etree.ElementTree as etree 

47from html import entities 

48 

49if TYPE_CHECKING: # pragma: no cover 

50 from markdown import Markdown 

51 

52 

def build_inlinepatterns(md: Markdown, **kwargs: Any) -> util.Registry[InlineProcessor]:
    """
    Assemble the registry holding Markdown's default inline processors.

    Ordering matters a great deal: turning `http://.../` links into `<a>` tags and only _then_ looking for
    inline HTML would produce a mess. The priorities below therefore arrange the processors as follows:

    * backticks and backslash escapes come first so they can preempt every other markdown pattern;

    * next come the various kinds of links (auto-links have to precede inline HTML);

    * then inline HTML, which is swapped for a placeholder while the real HTML is put in a stash;

    * strong, emphasis, etc. are applied last.

    """
    # (processor instance, registry name, priority) - listed from highest to lowest priority.
    defaults = [
        (BacktickInlineProcessor(BACKTICK_RE), 'backtick', 190),
        (EscapeInlineProcessor(ESCAPE_RE, md), 'escape', 180),
        (ReferenceInlineProcessor(REFERENCE_RE, md), 'reference', 170),
        (LinkInlineProcessor(LINK_RE, md), 'link', 160),
        (ImageInlineProcessor(IMAGE_LINK_RE, md), 'image_link', 150),
        (ImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'image_reference', 140),
        (ShortReferenceInlineProcessor(REFERENCE_RE, md), 'short_reference', 130),
        (ShortImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'short_image_ref', 125),
        (AutolinkInlineProcessor(AUTOLINK_RE, md), 'autolink', 120),
        (AutomailInlineProcessor(AUTOMAIL_RE, md), 'automail', 110),
        (SubstituteTagInlineProcessor(LINE_BREAK_RE, 'br'), 'linebreak', 100),
        (HtmlInlineProcessor(HTML_RE, md), 'html', 90),
        (HtmlInlineProcessor(ENTITY_RE, md), 'entity', 80),
        (SimpleTextInlineProcessor(NOT_STRONG_RE), 'not_strong', 70),
        (AsteriskProcessor(r'\*'), 'em_strong', 60),
        (UnderscoreProcessor(r'_'), 'em_strong2', 50),
    ]

    registry = util.Registry()
    for processor, name, priority in defaults:
        registry.register(processor, name, priority)
    return registry

96 

97 

98# The actual regular expressions for patterns 

99# ----------------------------------------------------------------------------- 

100 

# Zero-width negative look-behind: the previous character must not be `!`.
NOIMG = r'(?<!\!)'
""" Match not an image. Partial regular expression which matches if not preceded by `!`. """

# First alternative (group 1): one or more pairs of backslashes directly before a backtick run.
# Second alternative: group 2 is the opening backtick run, group 3 the content, and `\2`
# requires the closing run to be the same length as the opening one.
BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\2(?!`))'
""" Match backtick quoted string (`` `e=f()` `` or ``` ``e=f("`")`` ```). """

# Group 1 is the single character following the backslash.
ESCAPE_RE = r'\\(.)'
""" Match a backslash escaped character (`\\<` or `\\*`). """

EMPHASIS_RE = r'(\*)([^\*]+)\1'
""" Match emphasis with an asterisk (`*emphasis*`). """

STRONG_RE = r'(\*{2})(.+?)\1'
""" Match strong with an asterisk (`**strong**`). """

# The `(?<!\w)` / `(?!\w)` assertions keep the delimiters from matching inside a word.
SMART_STRONG_RE = r'(?<!\w)(_{2})(?!_)(.+?)(?<!_)\1(?!\w)'
""" Match strong with underscore while ignoring middle word underscores (`__smart__strong__`). """

SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\1(?!\w)'
""" Match emphasis with underscore while ignoring middle word underscores (`_smart_emphasis_`). """

SMART_STRONG_EM_RE = r'(?<!\w)(\_)\1(?!\1)(.+?)(?<!\w)\1(?!\1)(.+?)\1{3}(?!\w)'
""" Match strong emphasis with underscores (`__strong _em__`). """

# In the combined patterns below group 1 is a single delimiter character; `\1{n}` repeats it.
EM_STRONG_RE = r'(\*)\1{2}(.+?)\1(.*?)\1{2}'
""" Match emphasis strong with asterisk (`***strongem***` or `***em*strong**`). """

EM_STRONG2_RE = r'(_)\1{2}(.+?)\1(.*?)\1{2}'
""" Match emphasis strong with underscores (`___emstrong___` or `___em_strong__`). """

STRONG_EM_RE = r'(\*)\1{2}(.+?)\1{2}(.*?)\1'
""" Match strong emphasis with asterisk (`***strong**em*`). """

STRONG_EM2_RE = r'(_)\1{2}(.+?)\1{2}(.*?)\1'
""" Match strong emphasis with underscores (`___strong__em_`). """

STRONG_EM3_RE = r'(\*)\1(?!\1)([^*]+?)\1(?!\1)(.+?)\1{3}'
""" Match strong emphasis with asterisk (`**strong*em***`). """

# Only the opening `[` (not preceded by `!`) is matched by the expression itself.
LINK_RE = NOIMG + r'\['
""" Match start of in-line link (`[text](url)` or `[text](<url>)` or `[text](url "title")`). """

# Only the opening `![` is matched by the expression itself.
IMAGE_LINK_RE = r'\!\['
""" Match start of in-line image link (`![alttxt](url)` or `![alttxt](<url>)`). """

# Same expression as `LINK_RE`.
REFERENCE_RE = LINK_RE
""" Match start of reference link (`[Label][3]`). """

# Same expression as `IMAGE_LINK_RE`.
IMAGE_REFERENCE_RE = IMAGE_LINK_RE
""" Match start of image reference (`![alt text][2]`). """

# One to three `*` or `_` that sit between whitespace (or the start/end of the text).
NOT_STRONG_RE = r'((^|(?<=\s))(\*{1,3}|_{1,3})(?=\s|$))'
""" Match a stand-alone `*` or `_`. """

# Scheme is matched case-insensitively: ftp, ftps, http or https. Group 1 is the URL.
AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^<>]*)>'
""" Match an automatic link (`<http://www.example.com>`). """

# Group 1 is the address between the angle brackets.
AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>'
""" Match an automatic email link (`<me@example.com>`). """

# Matches either an opening/closing tag (optionally with attributes) or an HTML comment.
HTML_RE = r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|!--(?:(?!<!--|-->).)*--)>)'
""" Match an HTML tag (`<...>`). """

ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)'
""" Match an HTML entity (`&#38;` (decimal) or `&#x26;` (hex) or `&amp;` (named)). """

166 

# A hard line break is written as two literal spaces immediately before the newline.
# With a single space every line that merely ends in one trailing blank would be
# turned into a `<br>`, which contradicts the documented contract below.
LINE_BREAK_RE = r'  \n'
""" Match two spaces at end of line. """

169 

170 

def dequote(string: str) -> str:
    """Strip one pair of matching surrounding quotes (`"` or `'`) from `string`, if present."""
    for quote_char in ('"', "'"):
        # Both ends must carry the same kind of quote for it to be removed.
        if string.startswith(quote_char) and string.endswith(quote_char):
            return string[1:-1]
    return string

178 

179 

class EmStrongItem(NamedTuple):
    """Emphasis/strong pattern item.

    One entry of a processor's `PATTERNS` table: a compiled expression together with
    the information needed to turn a match into an element.
    """
    # Compiled regular expression that is tried with `match()` at a given position.
    pattern: re.Pattern[str]
    # Builder selector passed to `build_element` (`'single'`, `'double'` or `'double2'`).
    builder: str
    # Tag name, or two comma separated tag names for the double builders (e.g. `'strong,em'`).
    tags: str

185 

186 

187# The pattern classes 

188# ----------------------------------------------------------------------------- 

189 

190 

class Pattern:  # pragma: no cover
    """
    Base class that inline patterns subclass.

    Inline patterns are handled by means of `Pattern` subclasses, one per regular expression.
    Each pattern object uses a single regular expression and must support the following methods:
    [`getCompiledRegExp`][markdown.inlinepatterns.Pattern.getCompiledRegExp] and
    [`handleMatch`][markdown.inlinepatterns.Pattern.handleMatch].

    All the regular expressions used by `Pattern` subclasses must capture the whole block. For this
    reason, they all start with `^(.*?)` and end with `(.*)$`. When passing a regular expression on
    class initialization, the `^(.*?)` and `(.*)$` are added automatically and the regular expression
    is pre-compiled.

    It is strongly suggested that the newer style [`markdown.inlinepatterns.InlineProcessor`][] that
    use a more efficient and flexible search approach be used instead. However, the older style
    `Pattern` remains for backward compatibility with many existing third-party extensions.

    """

    ANCESTOR_EXCLUDES: Collection[str] = tuple()
    """
    A collection of elements which are undesirable ancestors. The processor will be skipped if it
    would cause the content to be a descendant of one of the listed tag names.
    """

    compiled_re: re.Pattern[str]
    md: Markdown | None

    def __init__(self, pattern: str, md: Markdown | None = None):
        """
        Create an instance of an inline pattern.

        Arguments:
            pattern: A regular expression that matches a pattern.
            md: An optional pointer to the instance of `markdown.Markdown` and is available as
                `self.md` on the class instance.

        """
        self.pattern = pattern
        # Wrap the expression so that group 1 is the text before the match and the
        # last group is the text after it; the caller's own groups start at 2.
        self.compiled_re = re.compile(r"^(.*?)%s(.*)$" % pattern,
                                      re.DOTALL | re.UNICODE)

        self.md = md

    def getCompiledRegExp(self) -> re.Pattern:
        """ Return a compiled regular expression. """
        return self.compiled_re

    def handleMatch(self, m: re.Match[str]) -> etree.Element | str:
        """Return a ElementTree element from the given match.

        Subclasses should override this method.

        Arguments:
            m: A match object containing a match of the pattern.

        Returns: An ElementTree Element object.

        """
        pass  # pragma: no cover

    def type(self) -> str:
        """ Return class name, to define pattern type """
        return self.__class__.__name__

    def unescape(self, text: str) -> str:
        """
        Return unescaped text given text with an inline placeholder.

        Arguments:
            text: Text which may contain inline placeholders.

        Returns: `text` with each placeholder replaced by the stashed string, or by the text
            content of the stashed element. `text` is returned untouched when no `Markdown`
            instance is attached or when no `'inline'` tree processor is registered.

        """
        if self.md is None:
            # `md` is optional (and defaults to `None`); without it there is no stash
            # to consult, so there is nothing to substitute.
            return text
        try:
            stash = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m: re.Match[str]) -> str | None:
            placeholder_id = m.group(1)
            if placeholder_id in stash:
                value = stash.get(placeholder_id)
                if isinstance(value, str):
                    return value
                else:
                    # An `etree` Element - return text content only
                    return ''.join(value.itertext())
            # Unknown ids fall through and return `None`.
            # NOTE(review): `re.sub` appears to treat a `None` result as an empty
            # replacement, i.e. the placeholder is dropped - confirm before relying on it.
            return None

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

275 

276 

class InlineProcessor(Pattern):
    """
    Base class that inline processors subclass.

    This is the newer style inline processor that uses a more
    efficient and flexible search approach.

    """

    def __init__(self, pattern: str, md: Markdown | None = None):
        """
        Create an instance of an inline processor.

        Arguments:
            pattern: A regular expression that matches a pattern.
            md: An optional pointer to the instance of `markdown.Markdown` and is available as
                `self.md` on the class instance.

        """
        self.pattern = pattern
        # Unlike `Pattern`, the expression is compiled as given, without any wrapping groups.
        self.compiled_re = re.compile(pattern, re.DOTALL | re.UNICODE)

        # API for Markdown to pass `safe_mode` into instance
        self.safe_mode = False
        self.md = md

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str | None, int | None, int | None]:
        """Return a ElementTree element from the given match and the
        start and end index of the matched text.

        If `start` and/or `end` are returned as `None`, it will be
        assumed that the processor did not find a valid region of text.

        Subclasses should override this method.

        Arguments:
            m: A re match object containing a match of the pattern.
            data: The buffer currently under analysis.

        Returns:
            el: The ElementTree element, text or None.
            start: The start of the region that has been matched or None.
            end: The end of the region that has been matched or None.

        """
        pass  # pragma: no cover

323 

324 

class SimpleTextPattern(Pattern):  # pragma: no cover
    """ Return a simple text of `group(2)` of a Pattern. """
    def handleMatch(self, m: re.Match[str]) -> str:
        """ Return string content of `group(2)` of a matching pattern. """
        # Group 1 is the text preceding the match (added by `Pattern`), so the
        # first user-defined group is number 2.
        return m.group(2)

330 

331 

class SimpleTextInlineProcessor(InlineProcessor):
    """ Emit the text captured by `group(1)` of the pattern unchanged. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]:
        """ Return the text of `group(1)` together with the span of the whole match. """
        begin, finish = m.span(0)
        captured = m.group(1)
        return captured, begin, finish

337 

338 

class EscapeInlineProcessor(InlineProcessor):
    """ Handle a backslash escaped character. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str | None, int, int]:
        """
        Encode the escaped character of `group(1)` when it is listed in
        [`ESCAPED_CHARS`][markdown.Markdown.ESCAPED_CHARS].

        The encoded form is the character's Unicode code point (see [`ord`][]) enclosed in
        [`util.STX`][markdown.util.STX] and [`util.ETX`][markdown.util.ETX]. For any other
        character `None` is returned in place of the text. The span of the whole match is
        returned in both cases.
        """
        begin, finish = m.span(0)
        escaped = m.group(1)
        if escaped not in self.md.ESCAPED_CHARS:
            # Not an escapable character.
            return None, begin, finish
        encoded = '{}{}{}'.format(util.STX, ord(escaped), util.ETX)
        return encoded, begin, finish

356 

357 

class SimpleTagPattern(Pattern):  # pragma: no cover
    """
    Return element of type `tag` with a text attribute of `group(3)`
    of a Pattern.

    """
    def __init__(self, pattern: str, tag: str):
        """
        Create an instance of a simple tag pattern.

        Arguments:
            pattern: A regular expression that matches a pattern.
            tag: Tag of element.

        """
        # No `md` is forwarded, so `self.md` is `None` on instances of this class.
        Pattern.__init__(self, pattern)
        self.tag = tag
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """
        Return [`Element`][xml.etree.ElementTree.Element] of type `tag` with the string in `group(3)` of a
        matching pattern as the Element's text.
        """
        el = etree.Element(self.tag)
        el.text = m.group(3)
        return el

385 

386 

class SimpleTagInlineProcessor(InlineProcessor):
    """
    Wrap the text of `group(2)` of a pattern in an element of type `tag`.

    """
    def __init__(self, pattern: str, tag: str):
        """
        Create an instance of a simple tag processor.

        Arguments:
            pattern: A regular expression that matches a pattern.
            tag: Tag of element.

        """
        InlineProcessor.__init__(self, pattern)
        self.tag = tag
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:  # pragma: no cover
        """
        Build an [`Element`][xml.etree.ElementTree.Element] of type `tag` whose text is `group(2)` of the
        match, and return it with the span of the whole match.
        """
        begin, finish = m.span(0)
        node = etree.Element(self.tag)
        node.text = m.group(2)
        return node, begin, finish

414 

415 

class SubstituteTagPattern(SimpleTagPattern):  # pragma: no cover
    """ Return an element of type `tag` with no children. """
    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """ Return empty [`Element`][xml.etree.ElementTree.Element] of type `tag`. """
        # The match content is not used; the matched text is simply replaced by the element.
        return etree.Element(self.tag)

421 

422 

class SubstituteTagInlineProcessor(SimpleTagInlineProcessor):
    """ Replace the matched text with a childless element of type `tag`. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """ Return an empty [`Element`][xml.etree.ElementTree.Element] of type `tag` and the matched span. """
        placeholder = etree.Element(self.tag)
        begin, finish = m.span(0)
        return placeholder, begin, finish

428 

429 

class BacktickInlineProcessor(InlineProcessor):
    """ Turn backtick quoted text into a `<code>` element holding the escaped text. """

    def __init__(self, pattern: str):
        InlineProcessor.__init__(self, pattern)
        # Encoded stand-in for a literal backslash: its code point between STX and ETX.
        self.ESCAPED_BSLASH = '{}{}{}'.format(util.STX, ord('\\'), util.ETX)
        self.tag = 'code'
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str, int, int]:
        """
        When `group(3)` of the pattern matched, return a `code`
        [`Element`][xml.etree.ElementTree.Element] whose text is the stripped content, HTML escaped with
        [`code_escape`][markdown.util.code_escape] and wrapped in an
        [`AtomicString`][markdown.util.AtomicString].

        Otherwise return the text of `group(1)` with every escaped backslash pair replaced by its
        encoded form.

        """
        begin, finish = m.span(0)
        content = m.group(3)
        if not content:
            # Only a run of escaped backslashes in front of backticks was matched.
            return m.group(1).replace('\\\\', self.ESCAPED_BSLASH), begin, finish

        code = etree.Element(self.tag)
        code.text = util.AtomicString(util.code_escape(content.strip()))
        return code, begin, finish

453 

454 

class DoubleTagPattern(SimpleTagPattern):  # pragma: no cover
    """Build an element of type tag2 nested inside an element of type tag1.

    Useful for strong emphasis etc.

    """
    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """
        Return an [`Element`][xml.etree.ElementTree.Element] in the following format:
        `<tag1><tag2>group(3)</tag2>group(4)</tag1>` where `group(4)` is optional.

        """
        # `self.tag` holds both tag names separated by a comma: "outer,inner".
        outer_tag, inner_tag = self.tag.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.SubElement(outer, inner_tag)
        inner.text = m.group(3)
        # Five groups means the pattern captured trailing text for the outer element.
        if len(m.groups()) == 5:
            inner.tail = m.group(4)
        return outer

474 

475 

class DoubleTagInlineProcessor(SimpleTagInlineProcessor):
    """Build an element of type tag2 nested inside an element of type tag1.

    Useful for strong emphasis etc.

    """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:  # pragma: no cover
        """
        Return an [`Element`][xml.etree.ElementTree.Element] in the following format:
        `<tag1><tag2>group(2)</tag2>group(3)</tag1>` where `group(3)` is optional,
        along with the span of the whole match.

        """
        # `self.tag` holds both tag names separated by a comma: "outer,inner".
        outer_tag, inner_tag = self.tag.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.SubElement(outer, inner_tag)
        inner.text = m.group(2)
        # Three groups means the pattern captured trailing text for the outer element.
        if len(m.groups()) == 3:
            inner.tail = m.group(3)
        begin, finish = m.span(0)
        return outer, begin, finish

495 

496 

class HtmlInlineProcessor(InlineProcessor):
    """ Stash raw inline HTML and leave a placeholder in its place. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]:
        """ Stash the text of `group(1)` of a pattern and return the placeholder with the matched span. """
        restored = self.unescape(m.group(1))
        raw_html = self.backslash_unescape(restored)
        token = self.md.htmlStash.store(raw_html)
        begin, finish = m.span(0)
        return token, begin, finish

    def unescape(self, text: str) -> str:
        """ Return `text` with inline placeholders replaced by the serialized stashed nodes. """
        try:
            nodes = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m: re.Match[str]) -> str | None:
            node = nodes.get(m.group(1))
            if node is None:
                # Nothing stashed under this id.
                return None
            try:
                return self.md.serializer(node)
            except Exception:
                # Could not be serialized: fall back to its string form after a backslash.
                return r'\%s' % node

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

    def backslash_unescape(self, text: str) -> str:
        """ Return `text` with encoded backslash escapes turned back into their characters. """
        try:
            escape_re = self.md.treeprocessors['unescape'].RE
        except KeyError:  # pragma: no cover
            return text

        # Group 1 of the expression is the decimal code point of the escaped character.
        return escape_re.sub(lambda found: chr(int(found.group(1))), text)

534 

535 

class AsteriskProcessor(InlineProcessor):
    """Emphasis processor for handling strong and em matches inside asterisks."""

    # The order is significant: `handleMatch` uses the first pattern that matches, and
    # `parse_sub_patterns` only tries patterns listed after the one used for the parent.
    PATTERNS = [
        EmStrongItem(re.compile(EM_STRONG_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'),
        EmStrongItem(re.compile(STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'),
        EmStrongItem(re.compile(STRONG_EM3_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'),
        EmStrongItem(re.compile(STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'),
        EmStrongItem(re.compile(EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em')
    ]
    """ The various strong and emphasis patterns handled by this processor. """

    def build_single(self, m: re.Match[str], tag: str, idx: int) -> etree.Element:
        """Return a single `tag` element whose content is `group(2)`, parsed for nested patterns."""
        el1 = etree.Element(tag)
        text = m.group(2)
        self.parse_sub_patterns(text, el1, None, idx)
        return el1

    def build_double(self, m: re.Match[str], tags: str, idx: int) -> etree.Element:
        """Return double tag: `<tag1><tag2>group(2)</tag2>group(3)</tag1>`.

        `tags` is the comma separated pair `tag1,tag2`. `group(3)` is only processed
        when the match has exactly three groups.
        """

        tag1, tag2 = tags.split(",")
        el1 = etree.Element(tag1)
        el2 = etree.Element(tag2)
        text = m.group(2)
        self.parse_sub_patterns(text, el2, None, idx)
        el1.append(el2)
        if len(m.groups()) == 3:
            text = m.group(3)
            # `el2` is passed as `last` so plain text lands in its tail, i.e. after the inner element.
            self.parse_sub_patterns(text, el1, el2, idx)
        return el1

    def build_double2(self, m: re.Match[str], tags: str, idx: int) -> etree.Element:
        """Return double tags (variant 2): `<strong>text <em>text</em></strong>`."""

        tag1, tag2 = tags.split(",")
        el1 = etree.Element(tag1)
        el2 = etree.Element(tag2)
        # `group(2)` becomes the leading content of the outer element ...
        text = m.group(2)
        self.parse_sub_patterns(text, el1, None, idx)
        # ... and `group(3)` the content of the inner element appended after it.
        text = m.group(3)
        el1.append(el2)
        self.parse_sub_patterns(text, el2, None, idx)
        return el1

    def parse_sub_patterns(
        self, data: str, parent: etree.Element, last: etree.Element | None, idx: int
    ) -> None:
        """
        Parses sub patterns.

        `data`: text to evaluate.

        `parent`: Parent to attach text and sub elements to.

        `last`: Last appended child to parent. Can also be None if parent has no children.

        `idx`: Current pattern index that was used to evaluate the parent.
        """

        # `offset` marks the start of text not yet attached; `pos` is the scan position.
        offset = 0
        pos = 0

        length = len(data)
        while pos < length:
            # Find the start of potential emphasis or strong tokens
            if self.compiled_re.match(data, pos):
                matched = False
                # See if we can match an emphasis/strong pattern
                for index, item in enumerate(self.PATTERNS):
                    # Only evaluate patterns that are after what was used on the parent
                    if index <= idx:
                        continue
                    m = item.pattern.match(data, pos)
                    if m:
                        # Append child nodes to parent
                        # Text nodes should be appended to the last
                        # child if present, and if not, it should
                        # be added as the parent's text node.
                        text = data[offset:m.start(0)]
                        if text:
                            if last is not None:
                                last.tail = text
                            else:
                                parent.text = text
                        el = self.build_element(m, item.builder, item.tags, index)
                        parent.append(el)
                        last = el
                        # Move our position past the matched hunk
                        offset = pos = m.end(0)
                        matched = True
                        # Note: there is no `break`; the remaining patterns are still
                        # tried, now at the updated position.
                if not matched:
                    # We matched nothing, move on to the next character
                    pos += 1
            else:
                # Increment position as no potential emphasis start was found.
                pos += 1

        # Append any leftover text as a text node.
        text = data[offset:]
        if text:
            if last is not None:
                last.tail = text
            else:
                parent.text = text

    def build_element(self, m: re.Match[str], builder: str, tags: str, index: int) -> etree.Element:
        """Element builder.

        Dispatches on `builder`: `'double2'` and `'double'` use the matching builder;
        any other value falls back to `build_single`.
        """

        if builder == 'double2':
            return self.build_double2(m, tags, index)
        elif builder == 'double':
            return self.build_double(m, tags, index)
        else:
            return self.build_single(m, tags, index)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """Parse patterns.

        Tries each entry of `PATTERNS`, in order, at the start of match `m` and builds
        an element from the first one that matches.

        Returns the element with the start and end of that match, or
        `(None, None, None)` when no pattern matches.
        """

        el = None
        start = None
        end = None

        for index, item in enumerate(self.PATTERNS):
            # `m` only located a delimiter; the real match is re-attempted from its start.
            m1 = item.pattern.match(data, m.start(0))
            if m1:
                start = m1.start(0)
                end = m1.end(0)
                el = self.build_element(m1, item.builder, item.tags, index)
                break
        return el, start, end

668 

669 

class UnderscoreProcessor(AsteriskProcessor):
    """Emphasis processor for handling strong and em matches inside underscores."""

    # Same layout as `AsteriskProcessor.PATTERNS` (builder and tags per position), using the
    # underscore expressions. All behavior is inherited; only this table differs.
    PATTERNS = [
        EmStrongItem(re.compile(EM_STRONG2_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'),
        EmStrongItem(re.compile(STRONG_EM2_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'),
        EmStrongItem(re.compile(SMART_STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'),
        EmStrongItem(re.compile(SMART_STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'),
        EmStrongItem(re.compile(SMART_EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em')
    ]
    """ The various strong and emphasis patterns handled by this processor. """

681 

682 

class LinkInlineProcessor(InlineProcessor):
    """ Return a link element from the given match. """
    # Matches the opening `(` and, optionally, a complete `<url> "title")` tail.
    # Group 1 is the angle-bracketed URL, group 2 the quoted title.
    RE_LINK = re.compile(r'''\(\s*(?:(<[^<>]*>)\s*(?:('[^']*'|"[^"]*")\s*)?\))?''', re.DOTALL | re.UNICODE)
    # Any single whitespace character; used to normalize whitespace in titles.
    RE_TITLE_CLEAN = re.compile(r'\s')

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """ Return an `a` [`Element`][xml.etree.ElementTree.Element] or `(None, None, None)`. """
        # `m` only matched the opening `[`; the text and the link are parsed by hand from there.
        text, index, handled = self.getText(data, m.end(0))

        if not handled:
            return None, None, None

        href, title, index, handled = self.getLink(data, index)
        if not handled:
            return None, None, None

        el = etree.Element("a")
        el.text = text

        el.set("href", href)

        if title is not None:
            el.set("title", title)

        # The consumed region runs from the opening `[` to the index just past the link.
        return el, m.start(0), index

    def getLink(self, data: str, index: int) -> tuple[str, str | None, int, bool]:
        """Parse data between `()` of `[Text]()` allowing recursive `()`.

        Arguments:
            data: The buffer under analysis.
            index: Position in `data` at which the opening `(` is expected.

        Returns:
            href: The link target, unescaped and stripped.
            title: The title with whitespace normalized, or `None` when there is none.
            index: Position just past the parsed link.
            handled: Whether a complete link was found.

        """

        href = ''
        title: str | None = None
        handled = False

        m = self.RE_LINK.match(data, pos=index)
        if m and m.group(1):
            # Matches [Text](<link> "title")
            href = m.group(1)[1:-1].strip()
            if m.group(2):
                title = m.group(2)[1:-1]
            index = m.end(0)
            handled = True
        elif m:
            # Only the opening `(` was matched: scan the remainder character by character.
            # Track bracket nesting and index in string
            bracket_count = 1
            backtrack_count = 1
            start_index = m.end()
            index = start_index
            last_bracket = -1

            # Primary (first found) quote tracking.
            quote: str | None = None
            start_quote = -1
            exit_quote = -1
            ignore_matches = False

            # Secondary (second found) quote tracking.
            alt_quote = None
            start_alt_quote = -1
            exit_alt_quote = -1

            # Track last character
            last = ''

            # `index` is advanced by hand in step with `pos`; inside the branches below it
            # still equals `pos`, so `index + 1` is the position just past the current character.
            for pos in range(index, len(data)):
                c = data[pos]
                if c == '(':
                    # Count nested (
                    # Don't increment the bracket count if we are sure we're in a title.
                    if not ignore_matches:
                        bracket_count += 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                elif c == ')':
                    # Match nested ) to (
                    # Don't decrement if we are sure we are in a title that is unclosed.
                    if ((exit_quote != -1 and quote == last) or (exit_alt_quote != -1 and alt_quote == last)):
                        bracket_count = 0
                    elif not ignore_matches:
                        bracket_count -= 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                        # We've found our backup end location if the title doesn't resolve.
                        if backtrack_count == 0:
                            last_bracket = index + 1

                elif c in ("'", '"'):
                    # Quote has started
                    if not quote:
                        # We'll assume we are now in a title.
                        # Brackets are quoted, so no need to match them (except for the final one).
                        ignore_matches = True
                        backtrack_count = bracket_count
                        bracket_count = 1
                        start_quote = index + 1
                        quote = c
                    # Secondary quote (in case the first doesn't resolve): [text](link'"title")
                    elif c != quote and not alt_quote:
                        start_alt_quote = index + 1
                        alt_quote = c
                    # Update primary quote match
                    elif c == quote:
                        exit_quote = index + 1
                    # Update secondary quote match
                    elif alt_quote and c == alt_quote:
                        exit_alt_quote = index + 1

                index += 1

                # Link is closed, so let's break out of the loop
                if bracket_count == 0:
                    # Get the title if we closed a title string right before link closed
                    if exit_quote >= 0 and quote == last:
                        href = data[start_index:start_quote - 1]
                        title = ''.join(data[start_quote:exit_quote - 1])
                    elif exit_alt_quote >= 0 and alt_quote == last:
                        href = data[start_index:start_alt_quote - 1]
                        title = ''.join(data[start_alt_quote:exit_alt_quote - 1])
                    else:
                        href = data[start_index:index - 1]
                    break

                # Spaces are skipped so `last` is the last non-space character seen.
                if c != ' ':
                    last = c

            # We have a scenario: `[test](link"notitle)`
            # When we enter a string, we stop tracking bracket resolution in the main counter,
            # but we do keep a backup counter up until we discover where we might resolve all brackets
            # if the title string fails to resolve.
            if bracket_count != 0 and backtrack_count == 0:
                href = data[start_index:last_bracket - 1]
                index = last_bracket
                bracket_count = 0

            handled = bracket_count == 0

        if title is not None:
            # Strip, unescape, remove surrounding quotes, then turn every whitespace character into a space.
            title = self.RE_TITLE_CLEAN.sub(' ', dequote(self.unescape(title.strip())))

        href = self.unescape(href).strip()

        return href, title, index, handled

    def getText(self, data: str, index: int) -> tuple[str, int, bool]:
        """Parse the content between `[]` of the start of an image or link
        resolving nested square brackets.

        Arguments:
            data: The buffer under analysis.
            index: Position in `data` just past the opening `[`.

        Returns:
            text: The characters collected before the closing `]` (or up to the end of `data`).
            index: Position just past the last character examined.
            handled: Whether the closing `]` was found.

        """
        # The opening `[` has already been consumed by the caller.
        bracket_count = 1
        text = []
        for pos in range(index, len(data)):
            c = data[pos]
            if c == ']':
                bracket_count -= 1
            elif c == '[':
                bracket_count += 1
            index += 1
            if bracket_count == 0:
                # The closing bracket itself is not part of the text.
                break
            text.append(c)
        return ''.join(text), index, bracket_count == 0

843 

844 

class ImageInlineProcessor(LinkInlineProcessor):
    """ Build an `img` element from the given match. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Return an `img` [`Element`][xml.etree.ElementTree.Element] or `(None, None, None)`.

        The bracketed text following the match is parsed with `getText` and the link portion with `getLink`.
        If either parser reports that it could not handle the input, `(None, None, None)` is returned.
        Otherwise the result is the element, the start of the match and the index reported by `getLink`.

        """
        alt_text, index, handled = self.getText(data, m.end(0))
        if handled:
            src, title, index, handled = self.getLink(data, index)
        if not handled:
            return None, None, None

        # Attribute order is `src`, optional `title`, then `alt`.
        attributes = {"src": src}
        if title is not None:
            attributes["title"] = title
        attributes['alt'] = self.unescape(alt_text)

        return etree.Element("img", attributes), m.start(0), index

867 

868 

class ReferenceInlineProcessor(LinkInlineProcessor):
    """ Look up a stored reference and build a link element from it. """
    NEWLINE_CLEANUP_RE = re.compile(r'\s+', re.MULTILINE)

    RE_LINK = re.compile(r'\s?\[([^\]]*)\]', re.DOTALL | re.UNICODE)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Return the [`Element`][xml.etree.ElementTree.Element] built by `makeTag`, or `(None, None, None)`.

        `(None, None, None)` is returned when either the bracketed text or the id cannot be parsed. When the
        id parses but is not present in `self.md.references`, the element is `None` while the start and end
        indexes are still returned.

        """
        text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return None, None, None

        ref_id, end, handled = self.evalId(data, index, text)
        if not handled:
            return None, None, None

        # Collapse every run of whitespace (line breaks included) in the id to a single space.
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)
        start = m.start(0)
        if ref_id not in self.md.references:
            # Undefined reference: no element is produced.
            return None, start, end

        href, title = self.md.references[ref_id]
        return self.makeTag(href, title, text), start, end

    def evalId(self, data: str, index: int, text: str) -> tuple[str | None, int, bool]:
        """
        Evaluate the id portion of `[ref][id]`.

        An empty id, as in `[ref][]`, falls back to the text of `[ref]`. The id is lowercased either way.

        Returns:
            A tuple of the id, the index where the id portion ends and `True`; or `(None, index, False)`
            when `RE_LINK` does not match at `index`.
        """
        found = self.RE_LINK.match(data, pos=index)
        if found is None:
            return None, index, False

        ref_id = found.group(1).lower() or text.lower()
        return ref_id, found.end(0), True

    def makeTag(self, href: str, title: str, text: str) -> etree.Element:
        """ Return an `a` [`Element`][xml.etree.ElementTree.Element] with `href`, optional `title` and `text`. """
        anchor = etree.Element('a')
        anchor.set('href', href)
        # A falsy title (such as an empty string) adds no `title` attribute.
        if title:
            anchor.set('title', title)
        anchor.text = text
        return anchor

923 

924 

class ShortReferenceInlineProcessor(ReferenceInlineProcessor):
    """ Short form of a reference, where the text doubles as the id: `[google]`. """
    def evalId(self, data: str, index: int, text: str) -> tuple[str, int, bool]:
        """ Use the lowercased text of `[ref]` as the id; `index` is returned unchanged. """
        ref_id = text.lower()
        return ref_id, index, True

931 

932 

class ImageReferenceInlineProcessor(ReferenceInlineProcessor):
    """ Look up a stored reference and build an `img` element from it. """
    def makeTag(self, href: str, title: str, text: str) -> etree.Element:
        """
        Return an `img` [`Element`][xml.etree.ElementTree.Element].

        `href` becomes the `src` attribute and the unescaped `text` becomes the `alt` attribute. A `title`
        attribute is only added when `title` is truthy.
        """
        attributes = {"src": href}
        if title:
            attributes["title"] = title
        attributes["alt"] = self.unescape(text)
        return etree.Element("img", attributes)

943 

944 

class ShortImageReferenceInlineProcessor(ImageReferenceInlineProcessor):
    """ Short form of an image reference, where the text doubles as the id: `![ref]`. """
    def evalId(self, data: str, index: int, text: str) -> tuple[str, int, bool]:
        """ Use the lowercased text of `[ref]` as the id; `index` is returned unchanged. """
        ref_id = text.lower()
        return ref_id, index, True

951 

952 

class AutolinkInlineProcessor(InlineProcessor):
    """ Build a link Element from an auto-link (`<http://example/com>`). """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """
        Return an `a` [`Element`][xml.etree.ElementTree.Element] of `group(1)`.

        The unescaped `group(1)` is used as the `href`, while the raw `group(1)` becomes the link text,
        wrapped in `util.AtomicString`. The returned indexes span the whole match.
        """
        url = m.group(1)
        anchor = etree.Element("a")
        anchor.set('href', self.unescape(url))
        # NOTE(review): `AtomicString` presumably shields the text from further inline processing - confirm.
        anchor.text = util.AtomicString(url)
        return anchor, m.start(0), m.end(0)

961 

962 

class AutomailInlineProcessor(InlineProcessor):
    """
    Build a `mailto` link Element from an auto-mail link (`<foo@example.com>`).
    """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """
        Return an [`Element`][xml.etree.ElementTree.Element] containing a `mailto` link of `group(1)`.

        Every character of the address is written as an entity reference, both in the link text and in
        the `href`. The returned indexes span the whole match.
        """
        address = self.unescape(m.group(1))
        # Drop a leading scheme so it is not duplicated when the `href` is assembled below.
        if address.startswith("mailto:"):
            address = address[len("mailto:"):]

        def encode_char(char: str) -> str:
            """Return the named entity for `char`, or a numeric one if no name is defined."""
            code = ord(char)
            name = entities.codepoint2name.get(code)
            if not name:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
            return "{}{};".format(util.AMP_SUBSTITUTE, name)

        link = etree.Element('a')
        # The visible text prefers named entities.
        link.text = util.AtomicString(''.join(encode_char(char) for char in address))

        # The `href` uses numeric entities only, and covers the `mailto:` scheme as well.
        target = "mailto:" + address
        encoded_target = "".join(util.AMP_SUBSTITUTE + '#%d;' %
                                 ord(char) for char in target)
        link.set('href', encoded_target)
        return link, m.start(0), m.end(0)