Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/inlinepatterns.py: 99%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

409 statements  

1# Python Markdown 

2 

3# A Python implementation of John Gruber's Markdown. 

4 

5# Documentation: https://python-markdown.github.io/ 

6# GitHub: https://github.com/Python-Markdown/markdown/ 

7# PyPI: https://pypi.org/project/Markdown/ 

8 

9# Started by Manfred Stienstra (http://www.dwerg.net/). 

10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). 

11# Currently maintained by Waylan Limberg (https://github.com/waylan), 

12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). 

13 

14# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later) 

15# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) 

16# Copyright 2004 Manfred Stienstra (the original version) 

17 

18# License: BSD (see LICENSE.md for details). 

19 

20""" 

21In version 3.0, a new, more flexible inline processor was added, [`markdown.inlinepatterns.InlineProcessor`][]. The 

22original inline patterns, which inherit from [`markdown.inlinepatterns.Pattern`][] or one of its children are still 

23supported, though users are encouraged to migrate. 

24 

25The new `InlineProcessor` provides two major enhancements to `Patterns`: 

26 

271. Inline Processors no longer need to match the entire block, so regular expressions no longer need to start with 

28 `r'^(.*?)'` and end with `r'(.*)$'`. This runs faster. The returned [`Match`][re.Match] object will only contain 

29 what is explicitly matched in the pattern, and extension pattern groups now start with `m.group(1)`. 

30 

312. The `handleMatch` method now takes an additional input called `data`, which is the entire block under analysis, 

32 not just what is matched with the specified pattern. The method now returns the element *and* the indexes relative 

33 to `data` that the return element is replacing (usually `m.start(0)` and `m.end(0)`). If the boundaries are 

34 returned as `None`, it is assumed that the match did not take place, and nothing will be altered in `data`. 

35 

36 This allows handling of more complex constructs than regular expressions can handle, e.g., matching nested 

37 brackets, and explicit control of the span "consumed" by the processor. 

38 

39""" 

40 

41from __future__ import annotations 

42 

43from . import util 

44from typing import TYPE_CHECKING, Any, Collection, NamedTuple 

45import re 

46import xml.etree.ElementTree as etree 

47from html import entities 

48 

49if TYPE_CHECKING: # pragma: no cover 

50 from markdown import Markdown 

51 

52 

def build_inlinepatterns(md: Markdown, **kwargs: Any) -> util.Registry[InlineProcessor]:
    """
    Assemble the registry holding Markdown's default inline processors.

    Ordering matters a great deal: if `http://.../` links were turned into `<a>` tags first and inline HTML
    were handled afterwards, the output would be a mess. The processors are therefore registered with
    descending priority numbers in this sequence:

    * backticks and backslash escapes come before everything else, so that they can preempt any other
      markdown pattern by escaping it;

    * next come the various kinds of links (auto-links must precede inline HTML);

    * next comes inline HTML, which at this stage is simply swapped for a placeholder while the real HTML
      is put in a stash;

    * strong, emphasis, etc. come last.

    Arguments:
        md: The `Markdown` instance handed to the processors that need it.
        **kwargs: Accepted but not used by this function.

    Returns:
        The populated registry.

    """
    registry = util.Registry()
    register = registry.register

    # Escaping constructs.
    register(BacktickInlineProcessor(BACKTICK_RE), 'backtick', 190)
    register(EscapeInlineProcessor(ESCAPE_RE, md), 'escape', 180)

    # Links, images and their reference forms.
    register(ReferenceInlineProcessor(REFERENCE_RE, md), 'reference', 170)
    register(LinkInlineProcessor(LINK_RE, md), 'link', 160)
    register(ImageInlineProcessor(IMAGE_LINK_RE, md), 'image_link', 150)
    register(ImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'image_reference', 140)
    register(ShortReferenceInlineProcessor(REFERENCE_RE, md), 'short_reference', 130)
    register(ShortImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'short_image_ref', 125)
    register(AutolinkInlineProcessor(AUTOLINK_RE, md), 'autolink', 120)
    register(AutomailInlineProcessor(AUTOMAIL_RE, md), 'automail', 110)

    # Line breaks, raw HTML and entities.
    register(SubstituteTagInlineProcessor(LINE_BREAK_RE, 'br'), 'linebreak', 100)
    register(HtmlInlineProcessor(HTML_RE, md), 'html', 90)
    register(HtmlInlineProcessor(ENTITY_RE, md), 'entity', 80)

    # Emphasis and strong.
    register(SimpleTextInlineProcessor(NOT_STRONG_RE), 'not_strong', 70)
    register(AsteriskProcessor(r'\*'), 'em_strong', 60)
    register(UnderscoreProcessor(r'_'), 'em_strong2', 50)
    return registry

96 

97 

98# The actual regular expressions for patterns 

99# ----------------------------------------------------------------------------- 

100 

# Each constant below is a regular expression *source string*; compilation happens in the
# processor classes that receive them.
NOIMG = r'(?<!\!)'
""" Match not an image. Partial regular expression which matches if not preceded by `!`. """

BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\2(?!`))'
""" Match backtick quoted string (`` `e=f()` `` or ``` ``e=f("`")`` ```). """

ESCAPE_RE = r'\\(.)'
""" Match a backslash escaped character (`\\<` or `\\*`). """

# Emphasis/strong patterns. In each of them group 1 captures the delimiter and `\1` refers back to it.
# The asterisk variants are compiled into `AsteriskProcessor.PATTERNS`, the underscore variants into
# `UnderscoreProcessor.PATTERNS`.
EMPHASIS_RE = r'(\*)([^\*]+)\1'
""" Match emphasis with an asterisk (`*emphasis*`). """

STRONG_RE = r'(\*{2})(.+?)\1'
""" Match strong with an asterisk (`**strong**`). """

SMART_STRONG_RE = r'(?<!\w)(_{2})(?!_)(.+?)(?<!_)\1(?!\w)'
""" Match strong with underscore while ignoring middle word underscores (`__smart__strong__`). """

SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\1(?!\w)'
""" Match emphasis with underscore while ignoring middle word underscores (`_smart_emphasis_`). """

SMART_STRONG_EM_RE = r'(?<!\w)(\_)\1(?!\1)(.+?)(?<!\w)\1(?!\1)(.+?)\1{3}(?!\w)'
""" Match strong emphasis with underscores (`__strong _em__`). """

EM_STRONG_RE = r'(\*)\1{2}(.+?)\1(.*?)\1{2}'
""" Match emphasis strong with asterisk (`***strongem***` or `***em*strong**`). """

EM_STRONG2_RE = r'(_)\1{2}(.+?)\1(.*?)\1{2}'
""" Match emphasis strong with underscores (`___emstrong___` or `___em_strong__`). """

STRONG_EM_RE = r'(\*)\1{2}(.+?)\1{2}(.*?)\1'
""" Match strong emphasis with asterisk (`***strong**em*`). """

STRONG_EM2_RE = r'(_)\1{2}(.+?)\1{2}(.*?)\1'
""" Match strong emphasis with underscores (`___strong__em_`). """

STRONG_EM3_RE = r'(\*)\1(?!\1)([^*]+?)\1(?!\1)(.+?)\1{3}'
""" Match strong emphasis with asterisk (`**strong*em***`). """

# The link/image patterns only match the opening bracket; the bracketed text and the destination are
# parsed in code by `LinkInlineProcessor.getText` and `LinkInlineProcessor.getLink`.
LINK_RE = NOIMG + r'\['
""" Match start of in-line link (`[text](url)` or `[text](<url>)` or `[text](url "title")`). """

IMAGE_LINK_RE = r'\!\['
""" Match start of in-line image link (`![alttxt](url)` or `![alttxt](<url>)`). """

# Reference forms start exactly like their inline counterparts, so the same patterns are reused.
REFERENCE_RE = LINK_RE
""" Match start of reference link (`[Label][3]`). """

IMAGE_REFERENCE_RE = IMAGE_LINK_RE
""" Match start of image reference (`![alt text][2]`). """

NOT_STRONG_RE = r'((^|(?<=\s))(\*{1,3}|_{1,3})(?=\s|$))'
""" Match a stand-alone `*` or `_`. """

AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^<>]*)>'
""" Match an automatic link (`<http://www.example.com>`). """

AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>'
""" Match an automatic email link (`<me@example.com>`). """

# Four alternatives between `<` and `>`: an opening/closing tag, a comment, a processing instruction,
# or a `CDATA` section. The whole construct, angle brackets included, is group 1.
HTML_RE = (
    r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|'  # Tag
    r'!--(?:(?!<!--|-->).)*--|'  # Comment
    r'[?](?:(?!<[?]|[?]>).)*[?]|'  # Processing instruction
    r'!\[CDATA\[(?:(?!<!\[CDATA\[|\]\]>).)*\]\]'  # `CDATA`
    ')>)'
)
""" Match an HTML tag (`<...>`). """

ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)'
""" Match an HTML entity (`&#38;` (decimal) or `&#x26;` (hex) or `&amp;` (named)). """

# NOTE(review): the literal as shown here contains a single space before `\n`, while the description
# below says two spaces - whitespace may have been collapsed in this listing; confirm against the
# real source file before relying on it.
LINE_BREAK_RE = r' \n'
""" Match two spaces at end of line. """

175 

176 

def dequote(string: str) -> str:
    """Remove one pair of matching quotes from around a string.

    The string is only changed when it is at least two characters long and both its first and last
    character are the same quote character (`"` or `'`). A lone quote character is returned as is:
    it is a single character, not an opening and a closing quote, so there is nothing to strip.

    Arguments:
        string: The text to dequote.

    Returns:
        The text without its surrounding quotes, or the unchanged text if it is not quoted.

    """
    # The length check stops a one character string such as `"` from counting as both the
    # opening and the closing quote, which would turn it into an empty string.
    if len(string) >= 2 and string[0] == string[-1] and string[0] in ('"', "'"):
        return string[1:-1]
    return string

184 

185 

class EmStrongItem(NamedTuple):
    """Emphasis/strong pattern item.

    One entry of the `PATTERNS` table of `AsteriskProcessor` and its subclasses.
    """
    # Compiled regular expression that is tried with `match` at a given position.
    pattern: re.Pattern[str]
    # Name of the builder to use; `build_element` recognizes 'double2' and 'double' and treats
    # any other value as a single tag builder.
    builder: str
    # Tag name, or two tag names separated by a comma for the double builders (outer tag first).
    tags: str

191 

192 

193# The pattern classes 

194# ----------------------------------------------------------------------------- 

195 

196 

class Pattern:  # pragma: no cover
    """
    Legacy base class for inline patterns.

    Each `Pattern` subclass handles one regular expression and is expected to provide
    [`getCompiledRegExp`][markdown.inlinepatterns.Pattern.getCompiledRegExp] and
    [`handleMatch`][markdown.inlinepatterns.Pattern.handleMatch].

    A `Pattern` regular expression has to capture the whole block, which is why every one of them
    starts with `^(.*?)` and ends with `(.*)$`. Both parts are added automatically around the
    expression given at initialization, and the result is pre-compiled.

    New code should prefer [`markdown.inlinepatterns.InlineProcessor`][], which searches in a more
    efficient and flexible way. `Pattern` is kept so that the many existing third-party extensions
    built on it continue to work.

    """

    ANCESTOR_EXCLUDES: Collection[str] = tuple()
    """
    A collection of elements which are undesirable ancestors. The processor will be skipped if it
    would cause the content to be a descendant of one of the listed tag names.
    """

    compiled_re: re.Pattern[str]
    md: Markdown | None

    def __init__(self, pattern: str, md: Markdown | None = None):
        """
        Create an instance of an inline pattern.

        Arguments:
            pattern: A regular expression that matches a pattern.
            md: An optional pointer to the instance of `markdown.Markdown`; it is stored as
                `self.md` on the class instance.

        """
        self.pattern = pattern
        # Wrap the expression so that group 1 is the text before the match and the last group
        # is the text after it.
        wrapped = r"^(.*?)%s(.*)$" % pattern
        self.compiled_re = re.compile(wrapped, re.DOTALL | re.UNICODE)
        self.md = md

    def getCompiledRegExp(self) -> re.Pattern:
        """ Return the regular expression compiled at initialization. """
        return self.compiled_re

    def handleMatch(self, m: re.Match[str]) -> etree.Element | str:
        """Build the result for a match of the pattern.

        The base implementation does nothing; subclasses should override this method.

        Arguments:
            m: A match object containing a match of the pattern.

        Returns: An ElementTree Element object.

        """
        pass  # pragma: no cover

    def type(self) -> str:
        """ Return the name of the instance's class, which serves as the pattern type. """
        cls = self.__class__
        return cls.__name__

    def unescape(self, text: str) -> str:
        """ Return `text` with its inline placeholders replaced by the text they stand for. """
        try:
            stash = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            # No 'inline' tree processor is registered, so there is nothing to look up.
            return text

        def get_stash(m):
            key = m.group(1)
            if key not in stash:
                # Unknown placeholder: nothing is returned for it.
                return None
            node = stash.get(key)
            if isinstance(node, str):
                return node
            # An `etree` Element - return text content only
            return ''.join(node.itertext())

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

281 

282 

class InlineProcessor(Pattern):
    """
    Base class for the newer style inline processors.

    Unlike `Pattern`, the regular expression is compiled exactly as given, which allows a more
    efficient and flexible search.

    """

    def __init__(self, pattern: str, md: Markdown | None = None):
        """
        Create an instance of an inline processor.

        Arguments:
            pattern: A regular expression that matches a pattern.
            md: An optional pointer to the instance of `markdown.Markdown`; it is stored as
                `self.md` on the class instance.

        """
        # `Pattern.__init__` is not called here: it would wrap the expression so that it has to
        # match the whole block, while this class compiles the expression unchanged.
        flags = re.DOTALL | re.UNICODE
        self.pattern = pattern
        self.compiled_re = re.compile(pattern, flags)

        # API for Markdown to pass `safe_mode` into instance
        self.safe_mode = False
        self.md = md

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str | None, int | None, int | None]:
        """Build the result for a match, together with the span of text that it replaces.

        When `start` and/or `end` come back as `None`, it is assumed that the processor found no
        valid region of text.

        The base implementation does nothing; subclasses should override this method.

        Arguments:
            m: A re match object containing a match of the pattern.
            data: The buffer currently under analysis.

        Returns:
            el: The ElementTree element, text or None.
            start: The start of the region that has been matched or None.
            end: The end of the region that has been matched or None.

        """
        pass  # pragma: no cover

329 

330 

class SimpleTextPattern(Pattern):  # pragma: no cover
    """ Legacy pattern whose result is the plain text of `group(2)`. """
    def handleMatch(self, m: re.Match[str]) -> str:
        """ Return the text captured by `group(2)` of the match. """
        captured = m.group(2)
        return captured

336 

337 

class SimpleTextInlineProcessor(InlineProcessor):
    """ Processor whose result is the plain text of `group(1)`. """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]:
        """ Return the text captured by `group(1)` and the span of the whole match. """
        start, end = m.span(0)
        return m.group(1), start, end

343 

344 

class EscapeInlineProcessor(InlineProcessor):
    """ Handle a backslash escaped character. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str | None, int, int]:
        """
        Encode the character captured by `group(1)` when it is one of
        [`ESCAPED_CHARS`][markdown.Markdown.ESCAPED_CHARS]: the result is the character's Unicode code point
        (as returned by [`ord`][]) wrapped in [`util.STX`][markdown.util.STX] and
        [`util.ETX`][markdown.util.ETX].

        For any other character the result is `None`. In both cases the span of the whole match is returned
        along with the result.
        """

        start, end = m.span(0)
        char = m.group(1)
        if char not in self.md.ESCAPED_CHARS:
            return None, start, end
        encoded = '{}{}{}'.format(util.STX, ord(char), util.ETX)
        return encoded, start, end

362 

363 

class SimpleTagPattern(Pattern):  # pragma: no cover
    """
    Legacy pattern producing an element of type `tag` whose text is
    `group(3)` of the match.

    """
    def __init__(self, pattern: str, tag: str):
        """
        Create an instance of a simple tag pattern.

        Arguments:
            pattern: A regular expression that matches a pattern.
            tag: Tag of element.

        """
        Pattern.__init__(self, pattern)
        self.tag = tag
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """
        Create an [`Element`][xml.etree.ElementTree.Element] of type `tag` and use the string in
        `group(3)` of the match as its text.
        """
        node = etree.Element(self.tag)
        node.text = m.group(3)
        return node

391 

392 

class SimpleTagInlineProcessor(InlineProcessor):
    """
    Processor producing an element of type `tag` whose text is
    `group(2)` of the match.

    """
    def __init__(self, pattern: str, tag: str):
        """
        Create an instance of a simple tag processor.

        Arguments:
            pattern: A regular expression that matches a pattern.
            tag: Tag of element.

        """
        InlineProcessor.__init__(self, pattern)
        self.tag = tag
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:  # pragma: no cover
        """
        Create an [`Element`][xml.etree.ElementTree.Element] of type `tag` with the string in `group(2)`
        of the match as its text, and return it with the span of the whole match.
        """
        node = etree.Element(self.tag)
        node.text = m.group(2)
        start, end = m.span(0)
        return node, start, end

420 

421 

class SubstituteTagPattern(SimpleTagPattern):  # pragma: no cover
    """ Legacy pattern producing a childless element of type `tag`. """
    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """ Create an empty [`Element`][xml.etree.ElementTree.Element] of type `tag`. """
        empty = etree.Element(self.tag)
        return empty

427 

428 

class SubstituteTagInlineProcessor(SimpleTagInlineProcessor):
    """ Processor producing a childless element of type `tag`. """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """ Create an empty [`Element`][xml.etree.ElementTree.Element] of type `tag` for the whole match. """
        empty = etree.Element(self.tag)
        start, end = m.span(0)
        return empty, start, end

434 

435 

class BacktickInlineProcessor(InlineProcessor):
    """ Turn backtick quoted text into a `<code>` element holding the escaped text. """
    def __init__(self, pattern: str):
        InlineProcessor.__init__(self, pattern)
        # Placeholder for a backslash: its code point wrapped in `util.STX` and `util.ETX`.
        self.ESCAPED_BSLASH = '{}{}{}'.format(util.STX, ord('\\'), util.ETX)
        self.tag = 'code'
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str, int, int]:
        """
        When `group(3)` of the match has content, build a `code`
        [`Element`][xml.etree.ElementTree.Element] from it: the text is stripped, HTML escaped (with
        [`code_escape`][markdown.util.code_escape]) and stored as an
        [`AtomicString`][markdown.util.AtomicString].

        Otherwise return the text of `group(1)` with every pair of backslashes replaced by the
        escaped backslash placeholder.

        In both cases the span of the whole match is returned as well.

        """
        start, end = m.span(0)
        code = m.group(3)
        if not code:
            escaped = m.group(1).replace('\\\\', self.ESCAPED_BSLASH)
            return escaped, start, end

        node = etree.Element(self.tag)
        node.text = util.AtomicString(util.code_escape(code.strip()))
        return node, start, end

459 

460 

class DoubleTagPattern(SimpleTagPattern):  # pragma: no cover
    """Legacy pattern producing a tag2 element nested inside a tag1 element.

    Useful for strong emphasis etc. `tag` holds both tag names, separated by a comma.

    """
    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """
        Build an [`Element`][xml.etree.ElementTree.Element] of the following form:
        `<tag1><tag2>group(3)</tag2>group(4)</tag1>` where `group(4)` is optional.

        """
        outer_tag, inner_tag = self.tag.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.SubElement(outer, inner_tag)
        inner.text = m.group(3)
        # Five groups mean that the pattern captures trailing text (the first and the last group
        # are the ones added by `Pattern` around the expression).
        if len(m.groups()) == 5:
            inner.tail = m.group(4)
        return outer

480 

481 

class DoubleTagInlineProcessor(SimpleTagInlineProcessor):
    """Processor producing a tag2 element nested inside a tag1 element.

    Useful for strong emphasis etc. `tag` holds both tag names, separated by a comma.

    """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:  # pragma: no cover
        """
        Build an [`Element`][xml.etree.ElementTree.Element] of the following form:
        `<tag1><tag2>group(2)</tag2>group(3)</tag1>` where `group(3)` is optional, and return it with
        the span of the whole match.

        """
        outer_tag, inner_tag = self.tag.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.SubElement(outer, inner_tag)
        inner.text = m.group(2)
        # Three groups mean that the pattern captures trailing text as well.
        if len(m.groups()) == 3:
            inner.tail = m.group(3)
        start, end = m.span(0)
        return outer, start, end

501 

502 

class HtmlInlineProcessor(InlineProcessor):
    """ Put raw inline html in the stash and leave a placeholder in its place. """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]:
        """ Stash the text of `group(1)` and return the placeholder with the span of the whole match. """
        unstashed = self.unescape(m.group(1))
        rawhtml = self.backslash_unescape(unstashed)
        place_holder = self.md.htmlStash.store(rawhtml)
        start, end = m.span(0)
        return place_holder, start, end

    def unescape(self, text: str) -> str:
        """ Return `text` with its inline placeholders replaced by the serialized stashed content. """
        try:
            stash = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            # No 'inline' tree processor is registered, so there is nothing to look up.
            return text

        def get_stash(m: re.Match[str]) -> str:
            node = stash.get(m.group(1))
            if node is None:
                # Unknown placeholder: nothing is returned for it.
                return None
            try:
                # Ensure we don't have a placeholder inside a placeholder
                return self.unescape(self.md.serializer(node))
            except Exception:
                # The stashed value could not be serialized; use it directly, behind a backslash.
                return r'\%s' % node

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

    def backslash_unescape(self, text: str) -> str:
        """ Return text with backslash escapes undone (backslashes are restored). """
        try:
            RE = self.md.treeprocessors['unescape'].RE
        except KeyError:  # pragma: no cover
            return text

        # Group 1 of a match is a code point written in decimal; turn it back into the character.
        return RE.sub(lambda found: chr(int(found.group(1))), text)

541 

542 

class AsteriskProcessor(InlineProcessor):
    """Emphasis processor for handling strong and em matches inside asterisks.

    The regular expression given at initialization only locates a possible delimiter. The actual
    matching is done with the entries of `PATTERNS`, which are tried in order at that position.
    """

    PATTERNS = [
        EmStrongItem(re.compile(EM_STRONG_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'),
        EmStrongItem(re.compile(STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'),
        EmStrongItem(re.compile(STRONG_EM3_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'),
        EmStrongItem(re.compile(STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'),
        EmStrongItem(re.compile(EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em')
    ]
    """ The various strong and emphasis patterns handled by this processor. """

    def build_single(self, m: re.Match[str], tag: str, idx: int) -> etree.Element:
        """Return single tag.

        The element is of type `tag`; its content is `group(2)` of the match, parsed for nested
        patterns that come after index `idx` in `PATTERNS`.
        """
        el1 = etree.Element(tag)
        text = m.group(2)
        self.parse_sub_patterns(text, el1, None, idx)
        return el1

    def build_double(self, m: re.Match[str], tags: str, idx: int) -> etree.Element:
        """Return double tag.

        `tags` holds two comma separated tag names, the outer one first. `group(2)` of the match
        becomes the content of the inner element. If the pattern has three groups, `group(3)` is
        added to the outer element after the inner one.
        """

        tag1, tag2 = tags.split(",")
        el1 = etree.Element(tag1)
        el2 = etree.Element(tag2)
        text = m.group(2)
        self.parse_sub_patterns(text, el2, None, idx)
        el1.append(el2)
        if len(m.groups()) == 3:
            text = m.group(3)
            # `el2` is passed as the last child, so leading text ends up in its tail.
            self.parse_sub_patterns(text, el1, el2, idx)
        return el1

    def build_double2(self, m: re.Match[str], tags: str, idx: int) -> etree.Element:
        """Return double tags (variant 2): `<strong>text <em>text</em></strong>`.

        `group(2)` of the match becomes the leading content of the outer element and `group(3)`
        the content of the inner element, which is appended to the outer one.
        """

        tag1, tag2 = tags.split(",")
        el1 = etree.Element(tag1)
        el2 = etree.Element(tag2)
        text = m.group(2)
        self.parse_sub_patterns(text, el1, None, idx)
        text = m.group(3)
        el1.append(el2)
        self.parse_sub_patterns(text, el2, None, idx)
        return el1

    def parse_sub_patterns(
        self, data: str, parent: etree.Element, last: etree.Element | None, idx: int
    ) -> None:
        """
        Parses sub patterns.

        `data`: text to evaluate.

        `parent`: Parent to attach text and sub elements to.

        `last`: Last appended child to parent. Can also be None if parent has no children.

        `idx`: Current pattern index that was used to evaluate the parent.

        Nothing is returned; `parent` (and `last`, if given) are modified in place.
        """

        # `offset` is the start of the text that has not been attached to the tree yet,
        # `pos` is the position that is currently examined.
        offset = 0
        pos = 0

        length = len(data)
        while pos < length:
            # Find the start of potential emphasis or strong tokens
            if self.compiled_re.match(data, pos):
                matched = False
                # See if the we can match an emphasis/strong pattern
                # NOTE(review): the loop does not stop at the first match; the remaining patterns
                # are still tried, at the position following the matched hunk.
                for index, item in enumerate(self.PATTERNS):
                    # Only evaluate patterns that are after what was used on the parent
                    if index <= idx:
                        continue
                    m = item.pattern.match(data, pos)
                    if m:
                        # Append child nodes to parent
                        # Text nodes should be appended to the last
                        # child if present, and if not, it should
                        # be added as the parent's text node.
                        text = data[offset:m.start(0)]
                        if text:
                            if last is not None:
                                last.tail = text
                            else:
                                parent.text = text
                        el = self.build_element(m, item.builder, item.tags, index)
                        parent.append(el)
                        last = el
                        # Move our position past the matched hunk
                        offset = pos = m.end(0)
                        matched = True
                if not matched:
                    # We matched nothing, move on to the next character
                    pos += 1
            else:
                # Increment position as no potential emphasis start was found.
                pos += 1

        # Append any leftover text as a text node.
        text = data[offset:]
        if text:
            if last is not None:
                last.tail = text
            else:
                parent.text = text

    def build_element(self, m: re.Match[str], builder: str, tags: str, index: int) -> etree.Element:
        """Element builder.

        Calls the build method selected by `builder`: 'double2' or 'double'; any other value
        selects the single tag builder.
        """

        if builder == 'double2':
            return self.build_double2(m, tags, index)
        elif builder == 'double':
            return self.build_double(m, tags, index)
        else:
            return self.build_single(m, tags, index)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """Parse patterns.

        Each entry of `PATTERNS` is tried, in order, at the start of the match `m`. The first one
        that matches is used to build the element.

        Returns the element with the start and end of the matched region, or `(None, None, None)`
        if no pattern matches.
        """

        el = None
        start = None
        end = None

        for index, item in enumerate(self.PATTERNS):
            m1 = item.pattern.match(data, m.start(0))
            if m1:
                start = m1.start(0)
                end = m1.end(0)
                el = self.build_element(m1, item.builder, item.tags, index)
                break
        return el, start, end

675 

676 

class UnderscoreProcessor(AsteriskProcessor):
    """Emphasis processor for handling strong and em matches inside underscores.

    All the behavior is inherited from `AsteriskProcessor`; only the pattern table differs.
    """

    PATTERNS = [
        EmStrongItem(re.compile(expression, re.DOTALL | re.UNICODE), builder, tags)
        for expression, builder, tags in (
            (EM_STRONG2_RE, 'double', 'strong,em'),
            (STRONG_EM2_RE, 'double', 'em,strong'),
            (SMART_STRONG_EM_RE, 'double2', 'strong,em'),
            (SMART_STRONG_RE, 'single', 'strong'),
            (SMART_EMPHASIS_RE, 'single', 'em'),
        )
    ]
    """ The various strong and emphasis patterns handled by this processor. """

688 

689 

class LinkInlineProcessor(InlineProcessor):
    """ Return a link element from the given match.

    The regular expression of the processor only finds the start of a link. The bracketed text
    and the parenthesized destination are parsed by `getText` and `getLink`.
    """
    # Matches the opening parenthesis and any whitespace after it, optionally followed by a complete
    # `<link>` destination with an optional quoted title and the closing parenthesis.
    RE_LINK = re.compile(r'''\(\s*(?:(<[^<>]*>)\s*(?:('[^']*'|"[^"]*")\s*)?\))?''', re.DOTALL | re.UNICODE)
    # Used to replace every whitespace character of a title with a plain space.
    RE_TITLE_CLEAN = re.compile(r'\s')

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """ Return an `a` [`Element`][xml.etree.ElementTree.Element] or `(None, None, None)`.

        `(None, None, None)` is returned when either the link text or the link destination cannot
        be parsed. Otherwise the element is returned with the start of the match and the index at
        which the parsing of the destination ended.
        """
        text, index, handled = self.getText(data, m.end(0))

        if not handled:
            return None, None, None

        href, title, index, handled = self.getLink(data, index)
        if not handled:
            return None, None, None

        el = etree.Element("a")
        el.text = text

        el.set("href", href)

        if title is not None:
            el.set("title", title)

        return el, m.start(0), index

    def getLink(self, data: str, index: int) -> tuple[str, str | None, int, bool]:
        """Parse data between `()` of `[Text]()` allowing recursive `()`.

        Arguments:
            data: The buffer currently under analysis.
            index: The position in `data` at which the opening parenthesis is expected.

        Returns:
            href: The link destination, with placeholders resolved and whitespace stripped.
            title: The title, or `None` if there is none.
            index: The position at which parsing ended.
            handled: Whether a complete link destination was found.

        """

        href = ''
        title: str | None = None
        handled = False

        m = self.RE_LINK.match(data, pos=index)
        if m and m.group(1):
            # Matches [Text](<link> "title")
            href = m.group(1)[1:-1].strip()
            if m.group(2):
                title = m.group(2)[1:-1]
            index = m.end(0)
            handled = True
        elif m:
            # Track bracket nesting and index in string
            bracket_count = 1
            backtrack_count = 1
            start_index = m.end()
            index = start_index
            last_bracket = -1

            # Primary (first found) quote tracking.
            quote: str | None = None
            start_quote = -1
            exit_quote = -1
            ignore_matches = False

            # Secondary (second found) quote tracking.
            alt_quote = None
            start_alt_quote = -1
            exit_alt_quote = -1

            # Track last character
            last = ''

            # `pos` is only used to read the current character. `index` is advanced by hand
            # further down, so inside the branches below it still equals `pos`.
            for pos in range(index, len(data)):
                c = data[pos]
                if c == '(':
                    # Count nested (
                    # Don't increment the bracket count if we are sure we're in a title.
                    if not ignore_matches:
                        bracket_count += 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                elif c == ')':
                    # Match nested ) to (
                    # Don't decrement if we are sure we are in a title that is unclosed.
                    if ((exit_quote != -1 and quote == last) or (exit_alt_quote != -1 and alt_quote == last)):
                        bracket_count = 0
                    elif not ignore_matches:
                        bracket_count -= 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                        # We've found our backup end location if the title doesn't resolve.
                        if backtrack_count == 0:
                            last_bracket = index + 1

                elif c in ("'", '"'):
                    # Quote has started
                    if not quote:
                        # We'll assume we are now in a title.
                        # Brackets are quoted, so no need to match them (except for the final one).
                        ignore_matches = True
                        backtrack_count = bracket_count
                        bracket_count = 1
                        start_quote = index + 1
                        quote = c
                    # Secondary quote (in case the first doesn't resolve): [text](link'"title")
                    elif c != quote and not alt_quote:
                        start_alt_quote = index + 1
                        alt_quote = c
                    # Update primary quote match
                    elif c == quote:
                        exit_quote = index + 1
                    # Update secondary quote match
                    elif alt_quote and c == alt_quote:
                        exit_alt_quote = index + 1

                index += 1

                # Link is closed, so let's break out of the loop
                if bracket_count == 0:
                    # Get the title if we closed a title string right before link closed
                    if exit_quote >= 0 and quote == last:
                        href = data[start_index:start_quote - 1]
                        title = ''.join(data[start_quote:exit_quote - 1])
                    elif exit_alt_quote >= 0 and alt_quote == last:
                        href = data[start_index:start_alt_quote - 1]
                        title = ''.join(data[start_alt_quote:exit_alt_quote - 1])
                    else:
                        href = data[start_index:index - 1]
                    break

                # Spaces are not recorded, so a closing quote followed by spaces still counts as
                # the last character when the closing parenthesis is reached.
                if c != ' ':
                    last = c

            # We have a scenario: `[test](link"notitle)`
            # When we enter a string, we stop tracking bracket resolution in the main counter,
            # but we do keep a backup counter up until we discover where we might resolve all brackets
            # if the title string fails to resolve.
            if bracket_count != 0 and backtrack_count == 0:
                href = data[start_index:last_bracket - 1]
                index = last_bracket
                bracket_count = 0

            handled = bracket_count == 0

        if title is not None:
            title = self.RE_TITLE_CLEAN.sub(' ', dequote(self.unescape(title.strip())))

        href = self.unescape(href).strip()

        return href, title, index, handled

    def getText(self, data: str, index: int) -> tuple[str, int, bool]:
        """Parse the content between `[]` of the start of an image or link
        resolving nested square brackets.

        Arguments:
            data: The buffer currently under analysis.
            index: The position just after the opening bracket.

        Returns:
            text: The characters collected before parsing stopped.
            index: The position just after the closing bracket, or the end of `data` if the
                bracket is never closed.
            handled: Whether the closing bracket was found.

        """
        # The opening bracket has been consumed already, so one bracket is open.
        bracket_count = 1
        text = []
        for pos in range(index, len(data)):
            c = data[pos]
            if c == ']':
                bracket_count -= 1
            elif c == '[':
                bracket_count += 1
            index += 1
            if bracket_count == 0:
                # The closing bracket itself is not part of the text.
                break
            text.append(c)
        return ''.join(text), index, bracket_count == 0

850 

851 

class ImageInlineProcessor(LinkInlineProcessor):
    """ Build an `img` element from an inline image match. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Return an `img` [`Element`][xml.etree.ElementTree.Element] together with the start and end
        positions of the consumed text, or `(None, None, None)` when either the bracketed text or
        the link part cannot be parsed.

        """
        # Parse the bracketed text first, then the link part that follows it.
        text, index, handled = self.getText(data, m.end(0))
        if handled:
            src, title, index, handled = self.getLink(data, index)
        if not handled:
            return None, None, None

        # Attribute order is kept as `src`, optional `title`, then `alt`.
        attributes = {"src": src}
        if title is not None:
            attributes["title"] = title
        attributes['alt'] = self.unescape(text)

        return etree.Element("img", attributes), m.start(0), index

874 

875 

class ReferenceInlineProcessor(LinkInlineProcessor):
    """ Resolve a reference against the stored references and return a link element. """
    NEWLINE_CLEANUP_RE = re.compile(r'\s+', re.MULTILINE)

    RE_LINK = re.compile(r'\s?\[([^\]]*)\]', re.DOTALL | re.UNICODE)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Return the [`Element`][xml.etree.ElementTree.Element] built by `makeTag` along with the start
        and end positions of the consumed text.

        `(None, None, None)` is returned when the bracketed text or the id cannot be parsed. When the
        id parses but is not present in `self.md.references`, the element is `None` while the start
        and end positions are still returned.

        """
        text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return None, None, None

        ref_id, end, handled = self.evalId(data, index, text)
        if not handled:
            return None, None, None

        # Collapse each run of whitespace (including line breaks) in the id to one space.
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)
        start = m.start(0)
        if ref_id in self.md.references:
            href, title = self.md.references[ref_id]
            return self.makeTag(href, title, text), start, end

        # Undefined references produce no element.
        return None, start, end

    def evalId(self, data: str, index: int, text: str) -> tuple[str | None, int, bool]:
        """
        Work out the id part of `[ref][id]`, starting at `index`.

        An empty id, as in `[ref][]`, falls back to the lower-cased `text`.

        Returns `(id, end, handled)`; when no `[id]` follows, that is `(None, index, False)`.
        """
        found = self.RE_LINK.match(data, pos=index)
        if found is None:
            return None, index, False

        # An empty captured id is falsy, so `or` substitutes the link text.
        ref_id = found.group(1).lower() or text.lower()
        return ref_id, found.end(0), True

    def makeTag(self, href: str, title: str, text: str) -> etree.Element:
        """ Return an `a` [`Element`][xml.etree.ElementTree.Element] with `text` as its content. """
        anchor = etree.Element('a', {'href': href})
        # A falsy title (empty or `None`) is left off entirely.
        if title:
            anchor.set('title', title)
        anchor.text = text
        return anchor

930 

931 

class ShortReferenceInlineProcessor(ReferenceInlineProcessor):
    """Short form of reference: `[google]`. """
    def evalId(self, data: str, index: int, text: str) -> tuple[str, int, bool]:
        """Use the lower-cased link text as the id of `[ref]`.

        Nothing further is consumed from `data`, so `index` is returned
        unchanged and the result is always marked as handled.
        """
        ref_id = text.lower()
        return ref_id, index, True

938 

939 

class ImageReferenceInlineProcessor(ReferenceInlineProcessor):
    """ Resolve a reference against the stored references and return an `img` element. """
    def makeTag(self, href: str, title: str, text: str) -> etree.Element:
        """
        Return an `img` [`Element`][xml.etree.ElementTree.Element] with `href` as its `src` and the
        unescaped `text` as its `alt`.

        """
        image = etree.Element("img", {"src": href})
        # A falsy title (empty or `None`) is left off; `alt` is always set last.
        if title:
            image.set("title", title)
        alt_text = self.unescape(text)
        image.set("alt", alt_text)
        return image

950 

951 

class ShortImageReferenceInlineProcessor(ImageReferenceInlineProcessor):
    """ Short form of image reference: `![ref]`. """
    def evalId(self, data: str, index: int, text: str) -> tuple[str, int, bool]:
        """Use the lower-cased bracketed text as the id of `![ref]`.

        Nothing further is consumed from `data`, so `index` is returned
        unchanged and the result is always marked as handled.
        """
        ref_id = text.lower()
        return ref_id, index, True

958 

959 

class AutolinkInlineProcessor(InlineProcessor):
    """ Turn an auto-link (`<http://example/com>`) into a link Element. """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """
        Return an `a` [`Element`][xml.etree.ElementTree.Element] for `group(1)` together with the
        start and end of the whole match.

        """
        target = m.group(1)
        # The `href` gets the unescaped target; the visible text keeps the raw
        # match wrapped in `util.AtomicString`.
        link = etree.Element("a", {'href': self.unescape(target)})
        link.text = util.AtomicString(target)
        return link, m.start(0), m.end(0)

968 

969 

class AutomailInlineProcessor(InlineProcessor):
    """
    Turn an auto-mail link (`<foo@example.com>`) into a `mailto` link Element.
    """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """
        Return an `a` [`Element`][xml.etree.ElementTree.Element] holding a `mailto` link for
        `group(1)`, together with the start and end of the whole match.

        Every character of both the visible text and the `href` is written as an entity reference
        introduced by `util.AMP_SUBSTITUTE`.

        """
        address = self.unescape(m.group(1))
        # Drop a leading scheme so it is not shown in the text or doubled in the `href`.
        if address.startswith("mailto:"):
            address = address[len("mailto:"):]

        # Visible text: use the named entity where one exists, otherwise the numeric code point.
        encoded_text = []
        for letter in address:
            code = ord(letter)
            entity = entities.codepoint2name.get(code)
            if entity:
                encoded_text.append("{}{};".format(util.AMP_SUBSTITUTE, entity))
            else:
                encoded_text.append("%s#%d;" % (util.AMP_SUBSTITUTE, code))

        el = etree.Element('a')
        el.text = util.AtomicString(''.join(encoded_text))

        # The `href` always uses numeric references, including for the `mailto:` scheme itself.
        encoded_href = []
        for letter in "mailto:" + address:
            encoded_href.append(util.AMP_SUBSTITUTE + '#%d;' % ord(letter))
        el.set('href', "".join(encoded_href))

        return el, m.start(0), m.end(0)

996 return el, m.start(0), m.end(0)