Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/inlinepatterns.py: 99%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

409 statements  

1# Python Markdown 

2 

3# A Python implementation of John Gruber's Markdown. 

4 

5# Documentation: https://python-markdown.github.io/ 

6# GitHub: https://github.com/Python-Markdown/markdown/ 

7# PyPI: https://pypi.org/project/Markdown/ 

8 

9# Started by Manfred Stienstra (http://www.dwerg.net/). 

10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). 

11# Currently maintained by Waylan Limberg (https://github.com/waylan), 

12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). 

13 

14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later) 

15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) 

16# Copyright 2004 Manfred Stienstra (the original version) 

17 

18# License: BSD (see LICENSE.md for details). 

19 

20""" 

21In version 3.0, a new, more flexible inline processor was added, [`markdown.inlinepatterns.InlineProcessor`][]. The 

22original inline patterns, which inherit from [`markdown.inlinepatterns.Pattern`][] or one of its children are still 

23supported, though users are encouraged to migrate. 

24 

25The new `InlineProcessor` provides two major enhancements to `Patterns`: 

26 

271. Inline Processors no longer need to match the entire block, so regular expressions no longer need to start with 

28 `r'^(.*?)'` and end with `r'(.*)$'`. This runs faster. The returned [`Match`][re.Match] object will only contain 

29 what is explicitly matched in the pattern, and extension pattern groups now start with `m.group(1)`. 

30 

312. The `handleMatch` method now takes an additional input called `data`, which is the entire block under analysis, 

32 not just what is matched with the specified pattern. The method now returns the element *and* the indexes relative 

33 to `data` that the return element is replacing (usually `m.start(0)` and `m.end(0)`). If the boundaries are 

34 returned as `None`, it is assumed that the match did not take place, and nothing will be altered in `data`. 

35 

36 This allows handling of more complex constructs than regular expressions can handle, e.g., matching nested 

37 brackets, and explicit control of the span "consumed" by the processor. 

38 

39""" 

40 

41from __future__ import annotations 

42 

43from . import util 

44from typing import TYPE_CHECKING, Any, Collection, NamedTuple 

45import re 

46import xml.etree.ElementTree as etree 

47from html import entities 

48 

49if TYPE_CHECKING: # pragma: no cover 

50 from markdown import Markdown 

51 

52 

def build_inlinepatterns(md: Markdown, **kwargs: Any) -> util.Registry[InlineProcessor]:
    """
    Build the default registry of inline processors for Markdown.

    Processors are registered with descending priorities, and that ordering matters. For example, replacing
    `http://.../` links with `<a>` tags _before_ handling inline HTML would leave a mess. The default
    order is therefore:

    * backticks and escaped characters come first so that they can preempt any other markdown pattern by
      escaping it;

    * then the various types of links (auto-links must be handled before inline HTML);

    * then inline HTML, which is swapped for a placeholder while the actual HTML is added to a stash;

    * finally strong, emphasis, etc.

    Arguments:
        md: The `Markdown` instance handed to the processors that need it.
        **kwargs: Accepted but not used by this function.

    Returns:
        A registry holding the default inline processors.

    """
    # (processor, registered name, priority) in the order they are registered.
    defaults = (
        (BacktickInlineProcessor(BACKTICK_RE), 'backtick', 190),
        (EscapeInlineProcessor(ESCAPE_RE, md), 'escape', 180),
        (ReferenceInlineProcessor(REFERENCE_RE, md), 'reference', 170),
        (LinkInlineProcessor(LINK_RE, md), 'link', 160),
        (ImageInlineProcessor(IMAGE_LINK_RE, md), 'image_link', 150),
        (ImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'image_reference', 140),
        (ShortReferenceInlineProcessor(REFERENCE_RE, md), 'short_reference', 130),
        (ShortImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'short_image_ref', 125),
        (AutolinkInlineProcessor(AUTOLINK_RE, md), 'autolink', 120),
        (AutomailInlineProcessor(AUTOMAIL_RE, md), 'automail', 110),
        (SubstituteTagInlineProcessor(LINE_BREAK_RE, 'br'), 'linebreak', 100),
        (HtmlInlineProcessor(HTML_RE, md), 'html', 90),
        (HtmlInlineProcessor(ENTITY_RE, md), 'entity', 80),
        (SimpleTextInlineProcessor(NOT_STRONG_RE), 'not_strong', 70),
        (AsteriskProcessor(r'\*'), 'em_strong', 60),
        (UnderscoreProcessor(r'_'), 'em_strong2', 50),
    )

    registry = util.Registry()
    for processor, name, priority in defaults:
        registry.register(processor, name, priority)
    return registry

96 

97 

98# The actual regular expressions for patterns 

99# ----------------------------------------------------------------------------- 

100 

# Negative look-behind: the next token must not directly follow a literal `!`.
NOIMG = r'(?<!\!)'
""" Match not an image. Partial regular expression which matches if not preceded by `!`. """

# Two alternatives:
#   1. group(1): a run of doubled backslashes (not preceded by another backslash) sitting
#      directly in front of one or more backticks;
#   2. group(2): the opening backtick run, group(3): the code text, closed by the same
#      backtick run (`\2`) that is neither preceded nor followed by another backtick.
BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\2(?!`))'
""" Match backtick quoted string (`` `e=f()` `` or ``` ``e=f("`")`` ```). """

# group(1) is the single character following the backslash.
ESCAPE_RE = r'\\(.)'
""" Match a backslash escaped character (`\\<` or `\\*`). """

# group(1) is the delimiter, group(2) the emphasized text (which may not contain `*`).
EMPHASIS_RE = r'(\*)([^\*]+)\1'
""" Match emphasis with an asterisk (`*emphasis*`). """

# group(1) is the `**` delimiter, group(2) the (lazily matched) strong text.
STRONG_RE = r'(\*{2})(.+?)\1'
""" Match strong with an asterisk (`**strong**`). """

# The `(?<!\w)` / `(?!\w)` guards keep underscores inside words from being treated as delimiters.
SMART_STRONG_RE = r'(?<!\w)(_{2})(?!_)(.+?)(?<!_)\1(?!\w)'
""" Match strong with underscore while ignoring middle word underscores (`__smart__strong__`). """

SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\1(?!\w)'
""" Match emphasis with underscore while ignoring middle word underscores (`_smart_emphasis_`). """

# group(2) is the strong-only text, group(3) the text that is both strong and emphasized.
SMART_STRONG_EM_RE = r'(?<!\w)(\_)\1(?!\1)(.+?)(?<!\w)\1(?!\1)(.+?)\1{3}(?!\w)'
""" Match strong emphasis with underscores (`__strong _em__`). """

# For the four patterns below group(1) is the single delimiter character, group(2) the text
# enclosed by both tags and group(3) the (possibly empty) text enclosed by the outer tag only.
EM_STRONG_RE = r'(\*)\1{2}(.+?)\1(.*?)\1{2}'
""" Match emphasis strong with asterisk (`***strongem***` or `***em*strong**`). """

EM_STRONG2_RE = r'(_)\1{2}(.+?)\1(.*?)\1{2}'
""" Match emphasis strong with underscores (`___emstrong___` or `___em_strong__`). """

STRONG_EM_RE = r'(\*)\1{2}(.+?)\1{2}(.*?)\1'
""" Match strong emphasis with asterisk (`***strong**em*`). """

STRONG_EM2_RE = r'(_)\1{2}(.+?)\1{2}(.*?)\1'
""" Match strong emphasis with underscores (`___strong__em_`). """

# group(2) is the strong-only text (no `*` allowed), group(3) the strong + emphasized text.
STRONG_EM3_RE = r'(\*)\1(?!\1)([^*]+?)\1(?!\1)(.+?)\1{3}'
""" Match strong emphasis with asterisk (`**strong*em***`). """

# Only the opening `[` is matched here; the remainder of the link is parsed by the processor.
LINK_RE = NOIMG + r'\['
""" Match start of in-line link (`[text](url)` or `[text](<url>)` or `[text](url "title")`). """

# Only the opening `![` is matched here; the remainder is parsed by the processor.
IMAGE_LINK_RE = r'\!\['
""" Match start of in-line image link (`![alttxt](url)` or `![alttxt](<url>)`). """

# Same start token as an in-line link.
REFERENCE_RE = LINK_RE
""" Match start of reference link (`[Label][3]`). """

# Same start token as an in-line image link.
IMAGE_REFERENCE_RE = IMAGE_LINK_RE
""" Match start of image reference (`![alt text][2]`). """

# One to three `*` or `_` with whitespace (or the start/end of the text) on both sides.
NOT_STRONG_RE = r'((^|(?<=\s))(\*{1,3}|_{1,3})(?=\s|$))'
""" Match a stand-alone `*` or `_`. """

# group(1) is the URL; the scheme (ftp, ftps, http or https) is matched case-insensitively.
AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^<>]*)>'
""" Match an automatic link (`<http://www.example.com>`). """

# group(1) is the address between the angle brackets.
AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>'
""" Match an automatic email link (`<me@example.com>`). """

# group(1) is the complete construct including the surrounding `<` and `>`.
HTML_RE = (
    r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|'               # Tag
    r'!--(?:(?!<!--|-->).)*--|'                        # Comment
    r'[?](?:(?!<[?]|[?]>).)*[?]|'                      # Processing instruction
    r'!\[CDATA\[(?:(?!<!\[CDATA\[|\]\]>).)*\]\]'       # `CDATA`
    ')>)'
)
""" Match an HTML tag (`<...>`). """

# group(1) is the complete entity including the leading `&` and trailing `;`.
ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)'
""" Match an HTML entity (`&#38;` (decimal) or `&#x26;` (hex) or `&amp;` (named)). """

172 

# A hard line break needs exactly two literal spaces before the newline. A single
# space would turn nearly every wrapped line into a `<br>`.
LINE_BREAK_RE = r'  \n'
""" Match two spaces at end of line. """

175 

176 

def dequote(string: str) -> str:
    """Remove one pair of matching quotes from around a string.

    The string is returned without its first and last character when both are the same
    quote character (`"` or `'`). Anything else is returned unchanged, including the
    empty string and a lone quote character (which has nothing to enclose).

    Arguments:
        string: The possibly quoted text.

    Returns:
        The text with one layer of surrounding quotes removed, if there was one.
    """
    # At least two characters are required: a single `"` both starts and ends with a quote,
    # but it is not a quoted string and must not be reduced to ''.
    if len(string) >= 2 and string[0] == string[-1] and string[0] in ('"', "'"):
        return string[1:-1]
    return string

184 

185 

class EmStrongItem(NamedTuple):
    """One entry of an emphasis/strong processor's `PATTERNS` table."""

    # Compiled regular expression that recognises this emphasis/strong form.
    pattern: re.Pattern[str]
    # Name of the builder to use for a match: 'single', 'double' or 'double2'.
    builder: str
    # Tag name for 'single', or two comma separated tag names for the double builders.
    tags: str

191 

192 

193# The pattern classes 

194# ----------------------------------------------------------------------------- 

195 

196 

class Pattern:  # pragma: no cover
    """
    Base class that inline patterns subclass.

    Inline patterns are handled by means of `Pattern` subclasses, one per regular expression.
    Each pattern object uses a single regular expression and must support the following methods:
    [`getCompiledRegExp`][markdown.inlinepatterns.Pattern.getCompiledRegExp] and
    [`handleMatch`][markdown.inlinepatterns.Pattern.handleMatch].

    All the regular expressions used by `Pattern` subclasses must capture the whole block. For this
    reason, they all start with `^(.*?)` and end with `(.*)$`. When passing a regular expression on
    class initialization, the `^(.*?)` and `(.*)$` are added automatically and the regular expression
    is pre-compiled. As a consequence the groups of the supplied expression start at `group(2)`.

    It is strongly suggested that the newer style [`markdown.inlinepatterns.InlineProcessor`][] that
    use a more efficient and flexible search approach be used instead. However, the older style
    `Pattern` remains for backward compatibility with many existing third-party extensions.

    """

    ANCESTOR_EXCLUDES: Collection[str] = tuple()
    """
    A collection of elements which are undesirable ancestors. The processor will be skipped if it
    would cause the content to be a descendant of one of the listed tag names.
    """

    compiled_re: re.Pattern[str]
    md: Markdown | None

    def __init__(self, pattern: str, md: Markdown | None = None):
        """
        Create an instance of an inline pattern.

        Arguments:
            pattern: A regular expression that matches a pattern.
            md: An optional pointer to the instance of `markdown.Markdown` and is available as
                `self.md` on the class instance.

        """
        self.md = md
        self.pattern = pattern
        # Wrap the expression so that the text before and after the match is captured as well.
        whole_block = rf"^(.*?){pattern}(.*)$"
        self.compiled_re = re.compile(whole_block, re.DOTALL | re.UNICODE)

    def getCompiledRegExp(self) -> re.Pattern:
        """ Return the compiled (whole block) regular expression. """
        return self.compiled_re

    def handleMatch(self, m: re.Match[str]) -> etree.Element | str:
        """Return a ElementTree element from the given match.

        Subclasses should override this method; the base implementation does nothing.

        Arguments:
            m: A match object containing a match of the pattern.

        Returns: An ElementTree Element object.

        """
        pass  # pragma: no cover

    def type(self) -> str:
        """ Return class name, to define pattern type """
        return type(self).__name__

    def unescape(self, text: str) -> str:
        """ Return unescaped text given text with an inline placeholder.

        Each placeholder found in the stash of the `inline` tree processor is replaced by the
        stashed string, or by the text content of the stashed element.
        """
        try:
            stash = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m):
            key = m.group(1)
            if key not in stash:
                # Unknown placeholder: no replacement value is supplied.
                return None
            node = stash.get(key)
            if isinstance(node, str):
                return node
            # An `etree` Element - return text content only
            return ''.join(node.itertext())

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

281 

282 

class InlineProcessor(Pattern):
    """
    Base class that inline processors subclass.

    This is the newer style inline processor that uses a more
    efficient and flexible search approach: the expression is compiled
    as given, without being wrapped to capture the whole block.

    """

    def __init__(self, pattern: str, md: Markdown | None = None):
        """
        Create an instance of an inline processor.

        Arguments:
            pattern: A regular expression that matches a pattern.
            md: An optional pointer to the instance of `markdown.Markdown` and is available as
                `self.md` on the class instance.

        """
        self.md = md
        # API for Markdown to pass `safe_mode` into instance
        self.safe_mode = False

        self.pattern = pattern
        self.compiled_re = re.compile(pattern, flags=re.DOTALL | re.UNICODE)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str | None, int | None, int | None]:
        """Return a ElementTree element from the given match and the
        start and end index of the matched text.

        If `start` and/or `end` are returned as `None`, it will be
        assumed that the processor did not find a valid region of text.

        Subclasses should override this method; the base implementation does nothing.

        Arguments:
            m: A re match object containing a match of the pattern.
            data: The buffer currently under analysis.

        Returns:
            el: The ElementTree element, text or None.
            start: The start of the region that has been matched or None.
            end: The end of the region that has been matched or None.

        """
        pass  # pragma: no cover

329 

330 

class SimpleTextPattern(Pattern):  # pragma: no cover
    """ Old style pattern which yields the plain text captured in `group(2)`. """

    def handleMatch(self, m: re.Match[str]) -> str:
        """ Return the text captured by `group(2)` of the match. """
        captured = m.group(2)
        return captured

336 

337 

class SimpleTextInlineProcessor(InlineProcessor):
    """ Inline processor which yields the plain text captured in `group(1)`. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]:
        """ Return the text captured by `group(1)` together with the span of the whole match. """
        start, end = m.span(0)
        return m.group(1), start, end

343 

344 

class EscapeInlineProcessor(InlineProcessor):
    """ Return an escaped character. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str | None, int, int]:
        """
        If the character matched by `group(1)` of a pattern is in [`ESCAPED_CHARS`][markdown.Markdown.ESCAPED_CHARS]
        then return the integer representing the character's Unicode code point (as returned by [`ord`][]) wrapped
        in [`util.STX`][markdown.util.STX] and [`util.ETX`][markdown.util.ETX].

        If the matched character is not in [`ESCAPED_CHARS`][markdown.Markdown.ESCAPED_CHARS], then return `None`.

        In both cases the span of the whole match (backslash included) is returned as well.
        """
        escaped = m.group(1)
        start, end = m.span(0)

        if escaped not in self.md.ESCAPED_CHARS:
            return None, start, end

        return '{}{}{}'.format(util.STX, ord(escaped), util.ETX), start, end

362 

363 

class SimpleTagPattern(Pattern):  # pragma: no cover
    """
    Old style pattern which wraps the text of `group(3)` in an
    element of type `tag`.

    """
    def __init__(self, pattern: str, tag: str):
        """
        Create an instance of a simple tag pattern.

        Arguments:
            pattern: A regular expression that matches a pattern.
            tag: Tag of element.

        """
        Pattern.__init__(self, pattern)
        self.tag = tag
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """
        Return [`Element`][xml.etree.ElementTree.Element] of type `tag` with the string in `group(3)` of a
        matching pattern as the Element's text.
        """
        element = etree.Element(self.tag)
        element.text = m.group(3)
        return element

391 

392 

class SimpleTagInlineProcessor(InlineProcessor):
    """
    Inline processor which wraps the text of `group(2)` in an
    element of type `tag`.

    """
    def __init__(self, pattern: str, tag: str):
        """
        Create an instance of a simple tag processor.

        Arguments:
            pattern: A regular expression that matches a pattern.
            tag: Tag of element.

        """
        InlineProcessor.__init__(self, pattern)
        self.tag = tag
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:  # pragma: no cover
        """
        Return [`Element`][xml.etree.ElementTree.Element] of type `tag` with the string in `group(2)` of a
        matching pattern as the Element's text, plus the span of the whole match.
        """
        element = etree.Element(self.tag)
        element.text = m.group(2)
        start, end = m.span(0)
        return element, start, end

420 

421 

class SubstituteTagPattern(SimpleTagPattern):  # pragma: no cover
    """ Old style pattern which replaces the match with an empty element of type `tag`. """

    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """ Return empty [`Element`][xml.etree.ElementTree.Element] of type `tag`. """
        empty = etree.Element(self.tag)
        return empty

427 

428 

class SubstituteTagInlineProcessor(SimpleTagInlineProcessor):
    """ Inline processor which replaces the match with an empty element of type `tag`. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """ Return empty [`Element`][xml.etree.ElementTree.Element] of type `tag` and the span of the match. """
        empty = etree.Element(self.tag)
        start, end = m.span(0)
        return empty, start, end

434 

435 

class BacktickInlineProcessor(InlineProcessor):
    """ Return a `<code>` element containing the escaped matching text. """

    def __init__(self, pattern: str):
        """
        Create an instance of a backtick processor.

        Arguments:
            pattern: A regular expression that matches a pattern.

        """
        InlineProcessor.__init__(self, pattern)
        self.tag = 'code'
        """ The tag of the rendered element. """
        # Placeholder which stands in for one escaped backslash.
        self.ESCAPED_BSLASH = '{}{}{}'.format(util.STX, ord('\\'), util.ETX)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str, int, int]:
        """
        If the match contains `group(3)` of a pattern, then return a `code`
        [`Element`][xml.etree.ElementTree.Element] which contains HTML escaped text (with
        [`code_escape`][markdown.util.code_escape]) as an [`AtomicString`][markdown.util.AtomicString].

        If the match does not contain `group(3)` then return the text of `group(1)` backslash escaped.

        """
        start, end = m.span(0)
        code = m.group(3)

        if not code:
            # Only a run of doubled backslashes matched: swap each pair for the placeholder.
            escaped = m.group(1).replace('\\\\', self.ESCAPED_BSLASH)
            return escaped, start, end

        element = etree.Element(self.tag)
        element.text = util.AtomicString(util.code_escape(code.strip()))
        return element, start, end

459 

460 

class DoubleTagPattern(SimpleTagPattern):  # pragma: no cover
    """Return a ElementTree element nested in tag2 nested in tag1.

    `self.tag` holds both tag names separated by a comma (`tag1,tag2`).

    Useful for strong emphasis etc.

    """
    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """
        Return [`Element`][xml.etree.ElementTree.Element] in following format:
        `<tag1><tag2>group(3)</tag2>group(4)</tag1>` where `group(4)` is optional.

        """
        outer_tag, inner_tag = self.tag.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.SubElement(outer, inner_tag)
        inner.text = m.group(3)
        # Five groups means the expression captured trailing text for the outer tag.
        has_tail = len(m.groups()) == 5
        if has_tail:
            inner.tail = m.group(4)
        return outer

480 

481 

class DoubleTagInlineProcessor(SimpleTagInlineProcessor):
    """Return a ElementTree element nested in tag2 nested in tag1.

    `self.tag` holds both tag names separated by a comma (`tag1,tag2`).

    Useful for strong emphasis etc.

    """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:  # pragma: no cover
        """
        Return [`Element`][xml.etree.ElementTree.Element] in following format:
        `<tag1><tag2>group(2)</tag2>group(3)</tag1>` where `group(3)` is optional.

        The span of the whole match is returned alongside the element.

        """
        outer_tag, inner_tag = self.tag.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.SubElement(outer, inner_tag)
        inner.text = m.group(2)
        # Three groups means the expression captured trailing text for the outer tag.
        has_tail = len(m.groups()) == 3
        if has_tail:
            inner.tail = m.group(3)
        start, end = m.span(0)
        return outer, start, end

501 

502 

class HtmlInlineProcessor(InlineProcessor):
    """ Store raw inline html and return a placeholder. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]:
        """ Store the text of `group(1)` of a pattern and return a placeholder string. """
        # Undo inline placeholders first, then the backslash escapes, before stashing.
        raw_html = self.unescape(m.group(1))
        raw_html = self.backslash_unescape(raw_html)
        placeholder = self.md.htmlStash.store(raw_html)
        return placeholder, m.start(0), m.end(0)

    def unescape(self, text: str) -> str:
        """ Return unescaped text given text with an inline placeholder.

        Stashed nodes are passed through the serializer of the `Markdown` instance. If that
        raises, the value is inserted as text prefixed with a backslash.
        """
        try:
            stash = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m: re.Match[str]) -> str:
            node = stash.get(m.group(1))
            if node is None:
                # Unknown placeholder: no replacement value is supplied.
                return None
            try:
                return self.md.serializer(node)
            except Exception:
                return r'\%s' % node

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

    def backslash_unescape(self, text: str) -> str:
        """ Return text with backslash escapes undone (backslashes are restored). """
        try:
            escaped_re = self.md.treeprocessors['unescape'].RE
        except KeyError:  # pragma: no cover
            return text

        # group(1) holds the code point of the escaped character.
        return escaped_re.sub(lambda m: chr(int(m.group(1))), text)

540 

541 

class AsteriskProcessor(InlineProcessor):
    """Emphasis processor for handling strong and em matches inside asterisks."""

    PATTERNS = [
        EmStrongItem(re.compile(expression, re.DOTALL | re.UNICODE), builder, tags)
        for expression, builder, tags in (
            (EM_STRONG_RE, 'double', 'strong,em'),
            (STRONG_EM_RE, 'double', 'em,strong'),
            (STRONG_EM3_RE, 'double2', 'strong,em'),
            (STRONG_RE, 'single', 'strong'),
            (EMPHASIS_RE, 'single', 'em'),
        )
    ]
    """ The various strong and emphasis patterns handled by this processor. """

    @staticmethod
    def _attach_text(parent: etree.Element, last: etree.Element | None, text: str) -> None:
        """Attach non-empty `text` as the tail of `last`, or as the text of `parent` if `last` is None."""
        if not text:
            return
        if last is None:
            parent.text = text
        else:
            last.tail = text

    def build_single(self, m: re.Match[str], tag: str, idx: int) -> etree.Element:
        """Return single tag."""
        element = etree.Element(tag)
        self.parse_sub_patterns(m.group(2), element, None, idx)
        return element

    def build_double(self, m: re.Match[str], tags: str, idx: int) -> etree.Element:
        """Return double tag."""

        outer_tag, inner_tag = tags.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.Element(inner_tag)
        self.parse_sub_patterns(m.group(2), inner, None, idx)
        outer.append(inner)
        if len(m.groups()) == 3:
            # Text after the inner element still belongs to the outer element.
            self.parse_sub_patterns(m.group(3), outer, inner, idx)
        return outer

    def build_double2(self, m: re.Match[str], tags: str, idx: int) -> etree.Element:
        """Return double tags (variant 2): `<strong>text <em>text</em></strong>`."""

        outer_tag, inner_tag = tags.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.Element(inner_tag)
        # The leading text is parsed into the outer element before the inner one is appended.
        self.parse_sub_patterns(m.group(2), outer, None, idx)
        trailing = m.group(3)
        outer.append(inner)
        self.parse_sub_patterns(trailing, inner, None, idx)
        return outer

    def parse_sub_patterns(
        self, data: str, parent: etree.Element, last: etree.Element | None, idx: int
    ) -> None:
        """
        Parses sub patterns.

        `data`: text to evaluate.

        `parent`: Parent to attach text and sub elements to.

        `last`: Last appended child to parent. Can also be None if parent has no children.

        `idx`: Current pattern index that was used to evaluate the parent.
        """

        consumed = 0  # Start of the text which has not been attached yet.
        pos = 0
        length = len(data)

        while pos < length:
            # Find the start of potential emphasis or strong tokens
            if not self.compiled_re.match(data, pos):
                # No potential emphasis start here, try the next character.
                pos += 1
                continue

            matched = False
            # See if we can match an emphasis/strong pattern
            for index, item in enumerate(self.PATTERNS):
                # Only evaluate patterns that are after what was used on the parent
                if index <= idx:
                    continue
                m = item.pattern.match(data, pos)
                if not m:
                    continue
                # Text in front of the match goes to the last child if present,
                # otherwise it becomes the parent's text node.
                self._attach_text(parent, last, data[consumed:m.start(0)])
                el = self.build_element(m, item.builder, item.tags, index)
                parent.append(el)
                last = el
                # Move our position past the matched hunk. The remaining patterns are
                # still tried at the new position (there is deliberately no `break`).
                consumed = pos = m.end(0)
                matched = True

            if not matched:
                # We matched nothing, move on to the next character
                pos += 1

        # Append any leftover text as a text node.
        self._attach_text(parent, last, data[consumed:])

    def build_element(self, m: re.Match[str], builder: str, tags: str, index: int) -> etree.Element:
        """Element builder."""

        # Any builder name other than the two double variants falls back to a single tag.
        builders = {
            'double2': self.build_double2,
            'double': self.build_double,
        }
        build = builders.get(builder, self.build_single)
        return build(m, tags, index)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """Parse patterns.

        The first entry of `PATTERNS` which matches `data` at the start of `m` wins. When none
        matches, `(None, None, None)` is returned.
        """

        anchor = m.start(0)
        for index, item in enumerate(self.PATTERNS):
            found = item.pattern.match(data, anchor)
            if found:
                start, end = found.span(0)
                el = self.build_element(found, item.builder, item.tags, index)
                return el, start, end
        return None, None, None

674 

675 

class UnderscoreProcessor(AsteriskProcessor):
    """Emphasis processor for handling strong and em matches inside underscores."""

    PATTERNS = [
        EmStrongItem(re.compile(expression, re.DOTALL | re.UNICODE), builder, tags)
        for expression, builder, tags in (
            (EM_STRONG2_RE, 'double', 'strong,em'),
            (STRONG_EM2_RE, 'double', 'em,strong'),
            (SMART_STRONG_EM_RE, 'double2', 'strong,em'),
            (SMART_STRONG_RE, 'single', 'strong'),
            (SMART_EMPHASIS_RE, 'single', 'em'),
        )
    ]
    """ The various strong and emphasis patterns handled by this processor. """

687 

688 

class LinkInlineProcessor(InlineProcessor):
    """ Return a link element from the given match.

    The processor's own pattern only finds the opening `[`. The link text and the
    `(...)` part are then parsed by `getText` and `getLink`, which count nested brackets.
    """
    # Matches the opening `(` and, optionally, the complete `<url> "title")` form.
    # Because that second part is optional the expression matches whenever a `(` is
    # present; `group(1)` is only set for the angle bracket form.
    RE_LINK = re.compile(r'''\(\s*(?:(<[^<>]*>)\s*(?:('[^']*'|"[^"]*")\s*)?\))?''', re.DOTALL | re.UNICODE)
    # Each whitespace character of a title is replaced by a single space.
    RE_TITLE_CLEAN = re.compile(r'\s')

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """ Return an `a` [`Element`][xml.etree.ElementTree.Element] or `(None, None, None)`.

        Arguments:
            m: Match of the opening `[`.
            data: The buffer currently under analysis.

        Returns:
            The element, the start of `m` and the index just past the parsed link, or
            `(None, None, None)` if either the text or the link part could not be parsed.
        """
        text, index, handled = self.getText(data, m.end(0))

        if not handled:
            return None, None, None

        href, title, index, handled = self.getLink(data, index)
        if not handled:
            return None, None, None

        el = etree.Element("a")
        el.text = text

        el.set("href", href)

        if title is not None:
            el.set("title", title)

        return el, m.start(0), index

    def getLink(self, data: str, index: int) -> tuple[str, str | None, int, bool]:
        """Parse data between `()` of `[Text]()` allowing recursive `()`.

        Arguments:
            data: The buffer currently under analysis.
            index: Position in `data` at which the opening `(` is expected.

        Returns:
            href: The link target (unescaped and stripped); empty if nothing was parsed.
            title: The title (unescaped, dequoted, whitespace normalised) or `None`.
            index: Position just past the closing `)` when the link was handled.
            handled: Whether a complete `(...)` was found.
        """

        href = ''
        title: str | None = None
        handled = False

        m = self.RE_LINK.match(data, pos=index)
        if m and m.group(1):
            # Matches [Text](<link> "title")
            href = m.group(1)[1:-1].strip()
            if m.group(2):
                title = m.group(2)[1:-1]
            index = m.end(0)
            handled = True
        elif m:
            # Track bracket nesting and index in string
            bracket_count = 1
            # Backup bracket counter which keeps running while the main counter is
            # frozen inside a presumed title (see the scenario described after the loop).
            backtrack_count = 1
            start_index = m.end()
            index = start_index
            last_bracket = -1

            # Primary (first found) quote tracking.
            quote: str | None = None
            start_quote = -1
            exit_quote = -1
            ignore_matches = False

            # Secondary (second found) quote tracking.
            alt_quote = None
            start_alt_quote = -1
            exit_alt_quote = -1

            # Track last character
            last = ''

            # NOTE: `index` equals `pos` at the top of each iteration and is incremented
            # by hand below, so `index + 1` is the position just past the current character.
            for pos in range(index, len(data)):
                c = data[pos]
                if c == '(':
                    # Count nested (
                    # Don't increment the bracket count if we are sure we're in a title.
                    if not ignore_matches:
                        bracket_count += 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                elif c == ')':
                    # Match nested ) to (
                    # Don't decrement if we are sure we are in a title that is unclosed.
                    if ((exit_quote != -1 and quote == last) or (exit_alt_quote != -1 and alt_quote == last)):
                        bracket_count = 0
                    elif not ignore_matches:
                        bracket_count -= 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                        # We've found our backup end location if the title doesn't resolve.
                        if backtrack_count == 0:
                            last_bracket = index + 1

                elif c in ("'", '"'):
                    # Quote has started
                    if not quote:
                        # We'll assume we are now in a title.
                        # Brackets are quoted, so no need to match them (except for the final one).
                        ignore_matches = True
                        backtrack_count = bracket_count
                        bracket_count = 1
                        start_quote = index + 1
                        quote = c
                    # Secondary quote (in case the first doesn't resolve): [text](link'"title")
                    elif c != quote and not alt_quote:
                        start_alt_quote = index + 1
                        alt_quote = c
                    # Update primary quote match
                    elif c == quote:
                        exit_quote = index + 1
                    # Update secondary quote match
                    elif alt_quote and c == alt_quote:
                        exit_alt_quote = index + 1

                index += 1

                # Link is closed, so let's break out of the loop
                if bracket_count == 0:
                    # Get the title if we closed a title string right before link closed
                    if exit_quote >= 0 and quote == last:
                        href = data[start_index:start_quote - 1]
                        title = ''.join(data[start_quote:exit_quote - 1])
                    elif exit_alt_quote >= 0 and alt_quote == last:
                        href = data[start_index:start_alt_quote - 1]
                        title = ''.join(data[start_alt_quote:exit_alt_quote - 1])
                    else:
                        href = data[start_index:index - 1]
                    break

                # Remember the last character which is not a space, so that spaces
                # between a closing quote and the `)` are ignored.
                if c != ' ':
                    last = c

            # We have a scenario: `[test](link"notitle)`
            # When we enter a string, we stop tracking bracket resolution in the main counter,
            # but we do keep a backup counter up until we discover where we might resolve all brackets
            # if the title string fails to resolve.
            if bracket_count != 0 and backtrack_count == 0:
                href = data[start_index:last_bracket - 1]
                index = last_bracket
                bracket_count = 0

            handled = bracket_count == 0

        if title is not None:
            title = self.RE_TITLE_CLEAN.sub(' ', dequote(self.unescape(title.strip())))

        href = self.unescape(href).strip()

        return href, title, index, handled

    def getText(self, data: str, index: int) -> tuple[str, int, bool]:
        """Parse the content between `[]` of the start of an image or link
        resolving nested square brackets.

        Arguments:
            data: The buffer currently under analysis.
            index: Position in `data` just past the opening `[`.

        Returns:
            text: The characters collected before the closing `]` (or up to the end of `data`).
            index: Position just past the last character examined.
            handled: Whether the matching closing `]` was found.

        """
        # The opening `[` has already been consumed by the caller.
        bracket_count = 1
        text = []
        for pos in range(index, len(data)):
            c = data[pos]
            if c == ']':
                bracket_count -= 1
            elif c == '[':
                bracket_count += 1
            index += 1
            # The closing bracket itself is not part of the text.
            if bracket_count == 0:
                break
            text.append(c)
        return ''.join(text), index, bracket_count == 0

849 

850 

class ImageInlineProcessor(LinkInlineProcessor):
    """ Return a `img` element from the given match. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Return an `img` [`Element`][xml.etree.ElementTree.Element] or `(None, None, None)`.

        The bracketed text is read with `getText` starting at the end of the match, and the
        link portion is read with `getLink` from where the text ended. If either step reports
        that it was not handled, `(None, None, None)` is returned. Otherwise the element is
        returned together with the start of the match and the index at which `getLink` stopped.
        """
        alt_text, cursor, ok = self.getText(data, m.end(0))
        if ok:
            src, title, cursor, ok = self.getLink(data, cursor)
        if not ok:
            return None, None, None

        image = etree.Element("img")
        image.set("src", src)
        # A title is only emitted when `getLink` actually produced one.
        if title is not None:
            image.set("title", title)
        image.set('alt', self.unescape(alt_text))

        return image, m.start(0), cursor

873 

874 

class ReferenceInlineProcessor(LinkInlineProcessor):
    """ Match to a stored reference and return link element. """
    # Collapses every run of whitespace (including line breaks) in an id to one space.
    NEWLINE_CLEANUP_RE = re.compile(r'\s+', re.MULTILINE)

    # The `[id]` part of `[ref][id]`, optionally preceded by one whitespace character.
    RE_LINK = re.compile(r'\s?\[([^\]]*)\]', re.DOTALL | re.UNICODE)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Return [`Element`][xml.etree.ElementTree.Element] returned by `makeTag` method or `(None, None, None)`.

        `(None, None, None)` is returned when either the bracketed text or the id cannot be
        parsed. When the id parses but is not present in `self.md.references`, the element is
        `None` while the start and end indexes are still returned.
        """
        text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return None, None, None

        ref_id, end, handled = self.evalId(data, index, text)
        if not handled:
            return None, None, None

        # Clean up line breaks in the id before looking it up.
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)
        begin = m.start(0)
        if ref_id in self.md.references:
            href, title = self.md.references[ref_id]
            return self.makeTag(href, title, text), begin, end

        # Undefined references are ignored.
        return None, begin, end

    def evalId(self, data: str, index: int, text: str) -> tuple[str | None, int, bool]:
        """
        Evaluate the id portion of `[ref][id]`.

        If `[ref][]` use `[ref]`.

        Returns `(id, end, handled)`. The id is lower-cased. When `RE_LINK` does not match at
        `index`, the result is `(None, index, False)`.
        """
        found = self.RE_LINK.match(data, pos=index)
        if found is None:
            return None, index, False

        # An empty `[]` falls back to the link text as the id.
        ref_id = found.group(1).lower() or text.lower()
        return ref_id, found.end(0), True

    def makeTag(self, href: str, title: str, text: str) -> etree.Element:
        """ Return an `a` [`Element`][xml.etree.ElementTree.Element]. """
        link = etree.Element('a', {'href': href})
        # Falsy titles (empty string, `None`) produce no `title` attribute.
        if title:
            link.set('title', title)

        link.text = text
        return link

929 

930 

class ShortReferenceInlineProcessor(ReferenceInlineProcessor):
    """Short form of reference: `[google]`. """

    def evalId(self, data: str, index: int, text: str) -> tuple[str, int, bool]:
        """Evaluate the id of `[ref]`.

        The lower-cased link text is used as the id. `index` is returned unchanged
        and the result is always reported as handled.
        """
        ref_id = text.lower()
        return ref_id, index, True

937 

938 

class ImageReferenceInlineProcessor(ReferenceInlineProcessor):
    """ Match to a stored reference and return `img` element. """

    def makeTag(self, href: str, title: str, text: str) -> etree.Element:
        """ Return an `img` [`Element`][xml.etree.ElementTree.Element].

        `href` becomes the `src` attribute, a truthy `title` becomes the `title`
        attribute, and the unescaped `text` becomes the `alt` attribute.
        """
        # Insertion order matches the attribute order: src, title, alt.
        attributes = {"src": href}
        if title:
            attributes["title"] = title
        attributes["alt"] = self.unescape(text)
        return etree.Element("img", attributes)

949 

950 

class ShortImageReferenceInlineProcessor(ImageReferenceInlineProcessor):
    """ Short form of image reference: `![ref]`. """

    def evalId(self, data: str, index: int, text: str) -> tuple[str, int, bool]:
        """Evaluate the id of `[ref]`.

        The lower-cased image text is used as the id. `index` is returned unchanged
        and the result is always reported as handled.
        """
        ref_id = text.lower()
        return ref_id, index, True

957 

958 

class AutolinkInlineProcessor(InlineProcessor):
    """ Return a link Element given an auto-link (`<http://example/com>`). """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """ Return an `a` [`Element`][xml.etree.ElementTree.Element] of `group(1)`.

        The unescaped `group(1)` is used as the `href`, while the raw `group(1)` is
        wrapped in `util.AtomicString` and used as the link text.
        """
        url = m.group(1)
        anchor = etree.Element("a")
        anchor.set('href', self.unescape(url))
        anchor.text = util.AtomicString(url)
        return anchor, m.start(0), m.end(0)

967 

968 

class AutomailInlineProcessor(InlineProcessor):
    """
    Return a `mailto` link Element given an auto-mail link (`<foo@example.com>`).
    """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """ Return an [`Element`][xml.etree.ElementTree.Element] containing a `mailto` link of `group(1)`.

        Every character of the visible address is written as an entity reference
        (named when `entities.codepoint2name` knows the code point, numeric otherwise),
        and every character of the `href` is written as a numeric reference. Each
        reference is prefixed with `util.AMP_SUBSTITUTE`.
        """
        address = self.unescape(m.group(1))
        # Drop a leading scheme so it is not duplicated when the `href` is rebuilt below.
        if address.startswith("mailto:"):
            address = address[len("mailto:"):]

        def as_entity(char: str) -> str:
            """Return the entity reference for `char`, named if one is defined, numeric if not."""
            code = ord(char)
            name = entities.codepoint2name.get(code)
            if not name:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
            return "{}{};".format(util.AMP_SUBSTITUTE, name)

        anchor = etree.Element('a')
        anchor.text = util.AtomicString(''.join(as_entity(char) for char in address))

        target = "mailto:" + address
        anchor.set('href', "".join(util.AMP_SUBSTITUTE + '#%d;' % ord(char) for char in target))
        return anchor, m.start(0), m.end(0)