Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown/inlinepatterns.py: 99%

1# Python Markdown

3# A Python implementation of John Gruber's Markdown.

5# Documentation: https://python-markdown.github.io/

6# GitHub: https://github.com/Python-Markdown/markdown/

7# PyPI: https://pypi.org/project/Markdown/

9# Started by Manfred Stienstra (http://www.dwerg.net/).

10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).

11# Currently maintained by Waylan Limberg (https://github.com/waylan),

12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

18# License: BSD (see LICENSE.md for details).

20"""

21In version 3.0, a new, more flexible inline processor was added, [`markdown.inlinepatterns.InlineProcessor`][]. The

22original inline patterns, which inherit from [`markdown.inlinepatterns.Pattern`][] or one of its children are still

23supported, though users are encouraged to migrate.

25The new `InlineProcessor` provides two major enhancements to `Patterns`:

271. Inline Processors no longer need to match the entire block, so regular expressions no longer need to start with

28 `r'^(.*?)'` and end with `r'(.*?)$'`. This runs faster. The returned [`Match`][re.Match] object will only contain

29 what is explicitly matched in the pattern, and extension pattern groups now start with `m.group(1)`.

312. The `handleMatch` method now takes an additional input called `data`, which is the entire block under analysis,

32 not just what is matched with the specified pattern. The method now returns the element *and* the indexes relative

33 to `data` that the return element is replacing (usually `m.start(0)` and `m.end(0)`). If the boundaries are

34 returned as `None`, it is assumed that the match did not take place, and nothing will be altered in `data`.

36 This allows handling of more complex constructs than regular expressions can handle, e.g., matching nested

37 brackets, and explicit control of the span "consumed" by the processor.

39"""

41from __future__ import annotations

43from . import util

44from typing import TYPE_CHECKING, Any, Collection, NamedTuple

45import re

46import xml.etree.ElementTree as etree

47from html import entities

49if TYPE_CHECKING: # pragma: no cover

50 from markdown import Markdown

def build_inlinepatterns(md: Markdown, **kwargs: Any) -> util.Registry[InlineProcessor]:
    """
    Create the registry holding Markdown's default inline processors.

    Ordering matters a great deal here - e.g. replacing `http://.../` links with `<a>` tags _before_ dealing
    with inline HTML would leave a mess behind. The priorities assigned below therefore arrange the
    processors as follows:

    * backticks and escaped characters come first so that they can preempt every other markdown pattern by
      escaping it;

    * next come the different kinds of links (auto-links have to precede inline HTML);

    * next comes inline HTML, which is swapped for a placeholder while the real HTML is put in a stash;

    * strong, emphasis, etc. come last.

    Arguments:
        md: The `Markdown` instance handed to every processor that needs one.
        **kwargs: Accepted but not used by this function.

    Returns:
        A registry populated with the default inline processors.

    """
    # (processor, registry name, priority) - listed in registration order.
    defaults = (
        (BacktickInlineProcessor(BACKTICK_RE), 'backtick', 190),
        (EscapeInlineProcessor(ESCAPE_RE, md), 'escape', 180),
        (ReferenceInlineProcessor(REFERENCE_RE, md), 'reference', 170),
        (LinkInlineProcessor(LINK_RE, md), 'link', 160),
        (ImageInlineProcessor(IMAGE_LINK_RE, md), 'image_link', 150),
        (ImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'image_reference', 140),
        (ShortReferenceInlineProcessor(REFERENCE_RE, md), 'short_reference', 130),
        (ShortImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'short_image_ref', 125),
        (AutolinkInlineProcessor(AUTOLINK_RE, md), 'autolink', 120),
        (AutomailInlineProcessor(AUTOMAIL_RE, md), 'automail', 110),
        (SubstituteTagInlineProcessor(LINE_BREAK_RE, 'br'), 'linebreak', 100),
        (HtmlInlineProcessor(HTML_RE, md), 'html', 90),
        (HtmlInlineProcessor(ENTITY_RE, md), 'entity', 80),
        (SimpleTextInlineProcessor(NOT_STRONG_RE), 'not_strong', 70),
        (AsteriskProcessor(r'\*'), 'em_strong', 60),
        (UnderscoreProcessor(r'_'), 'em_strong2', 50),
    )

    registry = util.Registry()
    for processor, name, priority in defaults:
        registry.register(processor, name, priority)
    return registry

98# The actual regular expressions for patterns

99# -----------------------------------------------------------------------------

100

NOIMG = r'(?<!\!)'
""" Match not an image. Partial regular expression which matches if not preceded by `!`. """

BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\2(?!`))'
""" Match backtick quoted string (`` `e=f()` `` or ``` ``e=f("`")`` ```). """

ESCAPE_RE = r'\\(.)'
""" Match a backslash escaped character (`\\<` or `\\*`). """

EMPHASIS_RE = r'(\*)([^\*]+)\1'
""" Match emphasis with an asterisk (`*emphasis*`). """

STRONG_RE = r'(\*{2})(.+?)\1'
""" Match strong with an asterisk (`**strong**`). """

SMART_STRONG_RE = r'(?<!\w)(_{2})(?!_)(.+?)(?<!_)\1(?!\w)'
""" Match strong with underscore while ignoring middle word underscores (`__smart__strong__`). """

SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\1(?!\w)'
""" Match emphasis with underscore while ignoring middle word underscores (`_smart_emphasis_`). """

SMART_STRONG_EM_RE = r'(?<!\w)(\_)\1(?!\1)(.+?)(?<!\w)\1(?!\1)(.+?)\1{3}(?!\w)'
""" Match strong emphasis with underscores (`__strong _em__`). """

EM_STRONG_RE = r'(\*)\1{2}(.+?)\1(.*?)\1{2}'
""" Match emphasis strong with asterisk (`***strongem***` or `***em*strong**`). """

EM_STRONG2_RE = r'(_)\1{2}(.+?)\1(.*?)\1{2}'
""" Match emphasis strong with underscores (`___emstrong___` or `___em_strong__`). """

STRONG_EM_RE = r'(\*)\1{2}(.+?)\1{2}(.*?)\1'
""" Match strong emphasis with asterisk (`***strong**em*`). """

STRONG_EM2_RE = r'(_)\1{2}(.+?)\1{2}(.*?)\1'
""" Match strong emphasis with underscores (`___strong__em_`). """

STRONG_EM3_RE = r'(\*)\1(?!\1)([^*]+?)\1(?!\1)(.+?)\1{3}'
""" Match strong emphasis with asterisk (`**strong*em***`). """

LINK_RE = NOIMG + r'\['
""" Match start of in-line link (`[text](url)` or `[text](<url>)` or `[text](url "title")`). """

IMAGE_LINK_RE = r'\!\['
""" Match start of in-line image link (`![alttxt](url)` or `![alttxt](<url>)`). """

REFERENCE_RE = LINK_RE
""" Match start of reference link (`[Label][3]`). """

IMAGE_REFERENCE_RE = IMAGE_LINK_RE
""" Match start of image reference (`![alt text][2]`). """

NOT_STRONG_RE = r'((^|(?<=\s))(\*{1,3}|_{1,3})(?=\s|$))'
""" Match a stand-alone `*` or `_`. """

AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^<>]*)>'
""" Match an automatic link (`<http://www.example.com>`). """

AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>'
""" Match an automatic email link (`<me@example.com>`). """

# Each alternative below is wrapped in `<...>` by the opening and closing parts of the expression.
# The comment, processing instruction and `CDATA` alternatives consume their content one character
# at a time, guarded by a negative lookahead listing the delimiters that may not appear inside.
# The lookahead must not be empty: an empty `(?!)` can never succeed and would stop the
# alternative from matching any content at all.
HTML_RE = (
    r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|'          # Tag
    r'!--(?:(?!<!--|-->).)*--|'                   # Comment
    r'[?](?:(?!<[?]|[?]>).)*[?]|'                 # Processing instruction
    r'!\[CDATA\[(?:(?!<!\[CDATA\[|\]\]>).)*\]\]'  # `CDATA`
    ')>)'
)
""" Match an HTML tag (`<...>`). """

ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)'
""" Match an HTML entity (`&#38;` (decimal) or `&#x26;` (hex) or `&amp;` (named)). """

# Exactly two spaces followed by the newline; a single trailing space is not a line break.
LINE_BREAK_RE = r'  \n'
""" Match two spaces at end of line. """

175

176

def dequote(string: str) -> str:
    """Strip one pair of matching quotes (`"` or `'`) from the ends of a string.

    The string is returned untouched unless it both starts and ends with the same quote character.
    """
    for mark in ('"', "'"):
        if string.startswith(mark) and string.endswith(mark):
            return string[1:-1]
    return string

184

185

class EmStrongItem(NamedTuple):
    """A single entry of an emphasis/strong pattern table.

    Each entry ties a compiled expression to the builder that turns a match into elements and to the
    tag name(s) the builder should use.
    """
    # Compiled regular expression tried against the text.
    pattern: re.Pattern[str]
    # Name selecting the builder routine (`'single'`, `'double'` or `'double2'` in this module).
    builder: str
    # Tag name, or two comma separated tag names, passed on to the builder.
    tags: str

191

192

193# The pattern classes

194# -----------------------------------------------------------------------------

195

196

class Pattern:  # pragma: no cover
    """
    Legacy base class for inline patterns.

    Every inline pattern is a `Pattern` subclass built around one regular expression. A pattern object
    has to provide [`getCompiledRegExp`][markdown.inlinepatterns.Pattern.getCompiledRegExp] and
    [`handleMatch`][markdown.inlinepatterns.Pattern.handleMatch].

    The expression of a `Pattern` always has to capture the whole block, which is why it starts with
    `^(.*?)` and ends with `(.*)$`. Both parts are added automatically around the expression passed on
    initialization, and the result is pre-compiled. As a consequence, the groups of the expression that
    was passed in start at `group(2)`.

    New code is strongly encouraged to use [`markdown.inlinepatterns.InlineProcessor`][] instead, which
    searches more efficiently and more flexibly. The older `Pattern` style is kept for backward
    compatibility with the many third-party extensions that still rely on it.

    """

    ANCESTOR_EXCLUDES: Collection[str] = tuple()
    """
    Tag names that are undesirable as ancestors. The processor is skipped whenever using it would make
    the content a descendant of one of the listed tags.
    """

    compiled_re: re.Pattern[str]
    md: Markdown | None

    def __init__(self, pattern: str, md: Markdown | None = None):
        """
        Create an instance of an inline pattern.

        Arguments:
            pattern: A regular expression that matches a pattern.
            md: An optional pointer to the instance of `markdown.Markdown`; it is made available as
                `self.md` on the class instance.

        """
        self.pattern = pattern
        # Wrap the expression so that the text before and after the match is captured as well.
        whole_block = r"^(.*?)%s(.*)$" % pattern
        self.compiled_re = re.compile(whole_block, re.DOTALL | re.UNICODE)

        self.md = md

    def getCompiledRegExp(self) -> re.Pattern:
        """ Return the pre-compiled regular expression. """
        return self.compiled_re

    def handleMatch(self, m: re.Match[str]) -> etree.Element | str:
        """Return an ElementTree element built from the given match.

        This base implementation does nothing; subclasses should override this method.

        Arguments:
            m: A match object containing a match of the pattern.

        Returns: An ElementTree Element object.

        """
        pass  # pragma: no cover

    def type(self) -> str:
        """ Return the class name, which defines the pattern type. """
        return self.__class__.__name__

    def unescape(self, text: str) -> str:
        """ Return `text` with inline placeholders replaced by the text content they stand for. """
        try:
            stash = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            # No `inline` tree processor is registered, so there is nothing to look up.
            return text

        def get_stash(m):
            key = m.group(1)
            if key not in stash:
                # Unknown placeholder: no replacement text is supplied.
                return None
            node = stash.get(key)
            if isinstance(node, str):
                return node
            # An `etree` Element - return text content only
            return ''.join(node.itertext())

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

281

282

class InlineProcessor(Pattern):
    """
    Base class for the newer style of inline processors.

    Unlike `Pattern`, the expression is compiled exactly as given, which allows a more efficient and
    flexible search.

    """

    def __init__(self, pattern: str, md: Markdown | None = None):
        """
        Create an instance of an inline processor.

        Arguments:
            pattern: A regular expression that matches a pattern.
            md: An optional pointer to the instance of `markdown.Markdown`; it is made available as
                `self.md` on the class instance.

        """
        flags = re.DOTALL | re.UNICODE
        self.pattern = pattern
        # The expression is used as-is; nothing is wrapped around it.
        self.compiled_re = re.compile(pattern, flags)

        self.md = md
        # API for Markdown to pass `safe_mode` into instance
        self.safe_mode = False

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str | None, int | None, int | None]:
        """Return an ElementTree element built from the given match together with the
        start and end index of the text it replaces.

        Returning `None` for `start` and/or `end` signals that the processor did not find a
        valid region of text.

        This base implementation does nothing; subclasses should override this method.

        Arguments:
            m: A re match object containing a match of the pattern.
            data: The buffer currently under analysis.

        Returns:
            el: The ElementTree element, text or None.
            start: The start of the region that has been matched or None.
            end: The end of the region that has been matched or None.

        """
        pass  # pragma: no cover

329

330

class SimpleTextPattern(Pattern):  # pragma: no cover
    """ Legacy pattern that yields the plain text captured in `group(2)`. """

    def handleMatch(self, m: re.Match[str]) -> str:
        """ Return the text captured by `group(2)` of the match. """
        captured = m.group(2)
        return captured

336

337

class SimpleTextInlineProcessor(InlineProcessor):
    """ Inline processor that yields the plain text captured in `group(1)`. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]:
        """ Return the text captured by `group(1)` and the span of the whole match. """
        captured = m.group(1)
        begin, finish = m.start(0), m.end(0)
        return captured, begin, finish

343

344

class EscapeInlineProcessor(InlineProcessor):
    """ Handle a backslash escaped character. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str | None, int, int]:
        """
        Encode the character captured by `group(1)` when it is one of
        [`ESCAPED_CHARS`][markdown.Markdown.ESCAPED_CHARS].

        The encoded form is the integer Unicode code point of the character (as returned by [`ord`][])
        wrapped in [`util.STX`][markdown.util.STX] and [`util.ETX`][markdown.util.ETX].

        For any character not in [`ESCAPED_CHARS`][markdown.Markdown.ESCAPED_CHARS], `None` is returned
        in place of the text. The span of the match is returned in both cases.
        """
        escaped = m.group(1)
        if escaped not in self.md.ESCAPED_CHARS:
            return None, m.start(0), m.end(0)

        encoded = '{}{}{}'.format(util.STX, ord(escaped), util.ETX)
        return encoded, m.start(0), m.end(0)

362

363

class SimpleTagPattern(Pattern):  # pragma: no cover
    """
    Legacy pattern producing an element of type `tag` whose text is `group(3)`
    of the match.

    """
    def __init__(self, pattern: str, tag: str):
        """
        Create an instance of a simple tag pattern.

        Arguments:
            pattern: A regular expression that matches a pattern.
            tag: Tag of element.

        """
        Pattern.__init__(self, pattern)
        self.tag = tag
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """
        Build an [`Element`][xml.etree.ElementTree.Element] of type `tag` and use the string in
        `group(3)` of the match as its text.
        """
        element = etree.Element(self.tag)
        element.text = m.group(3)
        return element

391

392

class SimpleTagInlineProcessor(InlineProcessor):
    """
    Inline processor producing an element of type `tag` whose text is `group(2)`
    of the match.

    """
    def __init__(self, pattern: str, tag: str):
        """
        Create an instance of a simple tag processor.

        Arguments:
            pattern: A regular expression that matches a pattern.
            tag: Tag of element.

        """
        InlineProcessor.__init__(self, pattern)
        self.tag = tag
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:  # pragma: no cover
        """
        Build an [`Element`][xml.etree.ElementTree.Element] of type `tag`, use the string in `group(2)`
        of the match as its text, and return it with the span of the whole match.
        """
        element = etree.Element(self.tag)
        element.text = m.group(2)
        begin, finish = m.start(0), m.end(0)
        return element, begin, finish

420

421

class SubstituteTagPattern(SimpleTagPattern):  # pragma: no cover
    """ Legacy pattern that replaces the match with a childless element of type `tag`. """

    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """ Return a new, empty [`Element`][xml.etree.ElementTree.Element] of type `tag`. """
        empty = etree.Element(self.tag)
        return empty

427

428

class SubstituteTagInlineProcessor(SimpleTagInlineProcessor):
    """ Inline processor that replaces the match with a childless element of type `tag`. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """
        Return a new, empty [`Element`][xml.etree.ElementTree.Element] of type `tag` along with the
        span of the whole match.
        """
        empty = etree.Element(self.tag)
        return empty, m.start(0), m.end(0)

434

435

class BacktickInlineProcessor(InlineProcessor):
    """ Turn backtick quoted text into a `<code>` element holding the escaped text. """

    def __init__(self, pattern: str):
        """
        Create an instance of a backtick processor.

        Arguments:
            pattern: A regular expression that matches a pattern.

        """
        InlineProcessor.__init__(self, pattern)
        # Encoded stand-in for a backslash: its code point wrapped in `util.STX` and `util.ETX`.
        self.ESCAPED_BSLASH = '{}{}{}'.format(util.STX, ord('\\'), util.ETX)
        self.tag = 'code'
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str, int, int]:
        """
        Handle one match of the backtick expression.

        When `group(3)` of the match holds text, a `code`
        [`Element`][xml.etree.ElementTree.Element] is returned whose text is that group, stripped and
        HTML escaped with [`code_escape`][markdown.util.code_escape], stored as an
        [`AtomicString`][markdown.util.AtomicString].

        Otherwise the text of `group(1)` is returned with each pair of backslashes replaced by an
        encoded backslash.

        """
        begin, finish = m.start(0), m.end(0)
        code = m.group(3)
        if not code:
            # Only a run of escaped backslashes in front of backticks was matched.
            return m.group(1).replace('\\\\', self.ESCAPED_BSLASH), begin, finish

        element = etree.Element(self.tag)
        element.text = util.AtomicString(util.code_escape(code.strip()))
        return element, begin, finish

459

460

class DoubleTagPattern(SimpleTagPattern):  # pragma: no cover
    """Legacy pattern returning an element of type tag2 nested inside an element of type tag1.

    Useful for strong emphasis etc. `self.tag` holds both tag names separated by a comma.

    """
    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """
        Return an [`Element`][xml.etree.ElementTree.Element] in the following format:
        `<tag1><tag2>group(3)</tag2>group(4)</tag1>` where `group(4)` is optional.

        """
        outer_tag, inner_tag = self.tag.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.SubElement(outer, inner_tag)
        inner.text = m.group(3)
        # The trailing text is only attached when the match has exactly five groups.
        if len(m.groups()) == 5:
            inner.tail = m.group(4)
        return outer

480

481

class DoubleTagInlineProcessor(SimpleTagInlineProcessor):
    """Inline processor returning an element of type tag2 nested inside an element of type tag1.

    Useful for strong emphasis etc. `self.tag` holds both tag names separated by a comma.

    """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:  # pragma: no cover
        """
        Return an [`Element`][xml.etree.ElementTree.Element] in the following format:
        `<tag1><tag2>group(2)</tag2>group(3)</tag1>` where `group(3)` is optional, together with the
        span of the whole match.

        """
        outer_tag, inner_tag = self.tag.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.SubElement(outer, inner_tag)
        inner.text = m.group(2)
        # The trailing text is only attached when the match has exactly three groups.
        if len(m.groups()) == 3:
            inner.tail = m.group(3)
        return outer, m.start(0), m.end(0)

501

502

class HtmlInlineProcessor(InlineProcessor):
    """ Stash raw inline HTML and put a placeholder in its place. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]:
        """
        Stash the text of `group(1)` and return the placeholder for it.

        Inline placeholders inside the text are resolved first and backslash escapes are undone
        afterwards, so that the stashed HTML is the raw text.
        """
        raw_html = self.unescape(m.group(1))
        raw_html = self.backslash_unescape(raw_html)
        marker = self.md.htmlStash.store(raw_html)
        return marker, m.start(0), m.end(0)

    def unescape(self, text: str) -> str:
        """ Return `text` with inline placeholders replaced by the serialized nodes they stand for. """
        try:
            stash = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            # No `inline` tree processor is registered, so there is nothing to look up.
            return text

        def get_stash(m: re.Match[str]) -> str:
            stashed = stash.get(m.group(1))
            if stashed is None:
                # Unknown placeholder: no replacement text is supplied.
                return None
            try:
                # Ensure we don't have a placeholder inside a placeholder
                return self.unescape(self.md.serializer(stashed))
            except Exception:
                # Serializing failed; fall back to the value itself, prefixed with a backslash.
                return r'\%s' % stashed

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

    def backslash_unescape(self, text: str) -> str:
        """ Return text with backslash escapes undone (backslashes are restored). """
        try:
            encoded_char_re = self.md.treeprocessors['unescape'].RE
        except KeyError:  # pragma: no cover
            # No `unescape` tree processor is registered; leave the text alone.
            return text

        # `group(1)` of each match is a code point written as an integer; turn it back into the character.
        return encoded_char_re.sub(lambda found: chr(int(found.group(1))), text)

541

542

class AsteriskProcessor(InlineProcessor):
    """Emphasis processor for handling strong and em matches inside asterisks."""

    PATTERNS = [
        EmStrongItem(re.compile(EM_STRONG_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'),
        EmStrongItem(re.compile(STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'),
        EmStrongItem(re.compile(STRONG_EM3_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'),
        EmStrongItem(re.compile(STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'),
        EmStrongItem(re.compile(EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em')
    ]
    """ The various strong and emphasis patterns handled by this processor. """

    def build_single(self, m: re.Match[str], tag: str, idx: int) -> etree.Element:
        """Return a single element of type `tag` holding the parsed content of `group(2)`."""
        element = etree.Element(tag)
        self.parse_sub_patterns(m.group(2), element, None, idx)
        return element

    def build_double(self, m: re.Match[str], tags: str, idx: int) -> etree.Element:
        """Return nested tags: the second tag wraps `group(2)` and sits first inside the first tag.

        When the match has exactly three groups, `group(3)` is parsed into the outer element after
        the inner one.
        """
        outer_tag, inner_tag = tags.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.Element(inner_tag)
        self.parse_sub_patterns(m.group(2), inner, None, idx)
        outer.append(inner)
        if len(m.groups()) == 3:
            # The inner element is the last child so far, so leading text becomes its tail.
            self.parse_sub_patterns(m.group(3), outer, inner, idx)
        return outer

    def build_double2(self, m: re.Match[str], tags: str, idx: int) -> etree.Element:
        """Return double tags (variant 2): `<strong>text <em>text</em></strong>`."""
        outer_tag, inner_tag = tags.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.Element(inner_tag)
        # `group(2)` goes straight into the outer element ...
        self.parse_sub_patterns(m.group(2), outer, None, idx)
        # ... and `group(3)` into the inner element, which is appended behind it.
        outer.append(inner)
        self.parse_sub_patterns(m.group(3), inner, None, idx)
        return outer

    def parse_sub_patterns(
        self, data: str, parent: etree.Element, last: etree.Element | None, idx: int
    ) -> None:
        """
        Parse `data` for nested emphasis/strong patterns and attach the result to `parent`.

        `data`: text to evaluate.

        `parent`: Parent to attach text and sub elements to.

        `last`: Last appended child to parent. Can also be None if parent has no children.

        `idx`: Current pattern index that was used to evaluate the parent.
        """

        def attach_text(chunk: str, sibling: etree.Element | None) -> None:
            # Text goes into the tail of the most recent child if there is one,
            # otherwise it becomes the text of the parent. Empty text is ignored.
            if not chunk:
                return
            if sibling is not None:
                sibling.tail = chunk
            else:
                parent.text = chunk

        consumed = 0   # Everything before this index has already been attached.
        cursor = 0     # Index currently being examined.
        total = len(data)

        while cursor < total:
            advanced = False
            # Only bother with the patterns where a potential emphasis or strong token starts.
            if self.compiled_re.match(data, cursor):
                for rank, item in enumerate(self.PATTERNS):
                    # Only evaluate patterns that are after what was used on the parent
                    if rank <= idx:
                        continue
                    found = item.pattern.match(data, cursor)
                    if found is None:
                        continue
                    # Flush the plain text in front of the match, then add the new element.
                    attach_text(data[consumed:found.start(0)], last)
                    child = self.build_element(found, item.builder, item.tags, rank)
                    parent.append(child)
                    last = child
                    # Move our position past the matched hunk; the remaining patterns are
                    # still tried, now from the new position.
                    consumed = cursor = found.end(0)
                    advanced = True
            if not advanced:
                # Nothing matched at this position, move on to the next character.
                cursor += 1

        # Append any leftover text as a text node.
        attach_text(data[consumed:], last)

    def build_element(self, m: re.Match[str], builder: str, tags: str, index: int) -> etree.Element:
        """Build the element for a match using the builder named by `builder`.

        `'double2'` and `'double'` select the matching builders; any other name selects the single
        tag builder.
        """
        builders = {
            'double2': self.build_double2,
            'double': self.build_double,
        }
        build = builders.get(builder, self.build_single)
        return build(m, tags, index)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """Try each pattern at the start of the match and build an element from the first one that fits.

        Returns `(None, None, None)` when none of the patterns match.
        """
        origin = m.start(0)
        for index, item in enumerate(self.PATTERNS):
            found = item.pattern.match(data, origin)
            if found:
                element = self.build_element(found, item.builder, item.tags, index)
                return element, found.start(0), found.end(0)
        return None, None, None

675

676

class UnderscoreProcessor(AsteriskProcessor):
    """Emphasis processor for handling strong and em matches inside underscores.

    All behavior is inherited from `AsteriskProcessor`; only the pattern table differs.
    """

    PATTERNS = [
        EmStrongItem(
            re.compile(EM_STRONG2_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'
        ),
        EmStrongItem(
            re.compile(STRONG_EM2_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'
        ),
        EmStrongItem(
            re.compile(SMART_STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'
        ),
        EmStrongItem(
            re.compile(SMART_STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'
        ),
        EmStrongItem(
            re.compile(SMART_EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em'
        ),
    ]
    """ The various strong and emphasis patterns handled by this processor. """

688

689

class LinkInlineProcessor(InlineProcessor):
    """ Return a link element from the given match. """

    # Matches the opening `(` of the link destination and, optionally, a complete
    # `<url> "title")` remainder. The parentheses are literal and therefore escaped.
    RE_LINK = re.compile(r'''\(\s*(?:(<[^<>]*>)\s*(?:('[^']*'|"[^"]*")\s*)?\))?''', re.DOTALL | re.UNICODE)
    # Every whitespace character inside a title is replaced with a plain space.
    RE_TITLE_CLEAN = re.compile(r'\s')

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Return an `a` [`Element`][xml.etree.ElementTree.Element] or `(None, None, None)`.

        Arguments:
            m: A match of the start of the link (the opening `[`).
            data: The buffer currently under analysis.

        Returns:
            el: The `a` element, or `None` if the text or the link could not be parsed.
            start: Start of the match, or `None`.
            end: Index just past the parsed link, or `None`.

        """
        text, index, handled = self.getText(data, m.end(0))

        if not handled:
            return None, None, None

        href, title, index, handled = self.getLink(data, index)
        if not handled:
            return None, None, None

        el = etree.Element("a")
        el.text = text

        el.set("href", href)

        if title is not None:
            el.set("title", title)

        return el, m.start(0), index

    def getLink(self, data: str, index: int) -> tuple[str, str | None, int, bool]:
        """
        Parse data between `()` of `[Text]()` allowing recursive `()`.

        Arguments:
            data: The buffer currently under analysis.
            index: Position in `data` at which the opening `(` is expected.

        Returns:
            href: The link destination, unescaped and stripped (empty when nothing was parsed).
            title: The cleaned title, or `None` when no title was found.
            index: Position just past the parsed link, or the given `index` if `RE_LINK` did not match.
            handled: Whether a complete, closed link was found.

        """

        href = ''
        title: str | None = None
        handled = False

        m = self.RE_LINK.match(data, pos=index)
        if m and m.group(1):
            # Matches [Text](<link> "title")
            href = m.group(1)[1:-1].strip()
            if m.group(2):
                title = m.group(2)[1:-1]
            index = m.end(0)
            handled = True
        elif m:
            # Only the opening `(` matched; scan the rest character by character.
            # Track bracket nesting and index in string
            bracket_count = 1
            backtrack_count = 1
            start_index = m.end()
            index = start_index
            last_bracket = -1

            # Primary (first found) quote tracking.
            quote: str | None = None
            start_quote = -1
            exit_quote = -1
            ignore_matches = False

            # Secondary (second found) quote tracking.
            alt_quote = None
            start_alt_quote = -1
            exit_alt_quote = -1

            # Track last character
            last = ''

            for pos in range(index, len(data)):
                c = data[pos]
                if c == '(':
                    # Count nested (
                    # Don't increment the bracket count if we are sure we're in a title.
                    if not ignore_matches:
                        bracket_count += 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                elif c == ')':
                    # Match nested ) to (
                    # Don't decrement if we are sure we are in a title that is unclosed.
                    if ((exit_quote != -1 and quote == last) or (exit_alt_quote != -1 and alt_quote == last)):
                        bracket_count = 0
                    elif not ignore_matches:
                        bracket_count -= 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                        # We've found our backup end location if the title doesn't resolve.
                        if backtrack_count == 0:
                            last_bracket = index + 1

                elif c in ("'", '"'):
                    # Quote has started
                    if not quote:
                        # We'll assume we are now in a title.
                        # Brackets are quoted, so no need to match them (except for the final one).
                        ignore_matches = True
                        backtrack_count = bracket_count
                        bracket_count = 1
                        start_quote = index + 1
                        quote = c
                    # Secondary quote (in case the first doesn't resolve): [text](link'"title")
                    elif c != quote and not alt_quote:
                        start_alt_quote = index + 1
                        alt_quote = c
                    # Update primary quote match
                    elif c == quote:
                        exit_quote = index + 1
                    # Update secondary quote match
                    elif alt_quote and c == alt_quote:
                        exit_alt_quote = index + 1

                # `index` now points just past the character that was examined.
                index += 1

                # Link is closed, so let's break out of the loop
                if bracket_count == 0:
                    # Get the title if we closed a title string right before link closed
                    if exit_quote >= 0 and quote == last:
                        href = data[start_index:start_quote - 1]
                        title = data[start_quote:exit_quote - 1]
                    elif exit_alt_quote >= 0 and alt_quote == last:
                        href = data[start_index:start_alt_quote - 1]
                        title = data[start_alt_quote:exit_alt_quote - 1]
                    else:
                        href = data[start_index:index - 1]
                    break

                # Spaces are skipped so that `last` is the last non-space character seen.
                if c != ' ':
                    last = c

            # We have a scenario: `[test](link"notitle)`
            # When we enter a string, we stop tracking bracket resolution in the main counter,
            # but we do keep a backup counter up until we discover where we might resolve all brackets
            # if the title string fails to resolve.
            if bracket_count != 0 and backtrack_count == 0:
                href = data[start_index:last_bracket - 1]
                index = last_bracket
                bracket_count = 0

            handled = bracket_count == 0

        if title is not None:
            title = self.RE_TITLE_CLEAN.sub(' ', dequote(self.unescape(title.strip())))

        href = self.unescape(href).strip()

        return href, title, index, handled

    def getText(self, data: str, index: int) -> tuple[str, int, bool]:
        """
        Parse the content between `[]` of the start of an image or link
        resolving nested square brackets.

        Arguments:
            data: The buffer currently under analysis.
            index: Position in `data` just past the opening `[`.

        Returns:
            text: The characters collected before the closing `]` (or up to the end of `data`).
            index: Position just past the last character examined.
            handled: Whether the closing `]` was found.

        """
        bracket_count = 1
        text = []
        for pos in range(index, len(data)):
            c = data[pos]
            if c == ']':
                bracket_count -= 1
            elif c == '[':
                bracket_count += 1
            index += 1
            if bracket_count == 0:
                # The closing bracket itself is not part of the text.
                break
            text.append(c)
        return ''.join(text), index, bracket_count == 0

850

851

class ImageInlineProcessor(LinkInlineProcessor):
    """ Build an `img` element from the given match. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Return an `img` [`Element`][xml.etree.ElementTree.Element] or `(None, None, None)`.

        The bracketed text becomes the `alt` attribute, the link becomes `src` and a title, when
        present, becomes `title`. Nothing is returned if either the text or the link fails to parse.
        """
        nothing = (None, None, None)

        alt_text, cursor, ok = self.getText(data, m.end(0))
        if not ok:
            return nothing

        src, title, cursor, ok = self.getLink(data, cursor)
        if not ok:
            return nothing

        image = etree.Element("img")
        image.set("src", src)
        if title is not None:
            image.set("title", title)
        image.set('alt', self.unescape(alt_text))

        return image, m.start(0), cursor

874

875

class ReferenceInlineProcessor(LinkInlineProcessor):
    """ Look a match up in the stored references and return a link element. """

    # Collapses every run of whitespace in an id into a single space.
    NEWLINE_CLEANUP_RE = re.compile(r'\s+', re.MULTILINE)

    # Matches the `[id]` part, optionally preceded by one whitespace character.
    RE_LINK = re.compile(r'\s?\[([^\]]*)\]', re.DOTALL | re.UNICODE)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Return the [`Element`][xml.etree.ElementTree.Element] produced by the `makeTag` method, or
        `(None, None, None)` if the text or the id cannot be parsed.

        For an id that is not among the stored references, `None` is returned in place of the element
        while the span is still reported.

        """
        text, cursor, ok = self.getText(data, m.end(0))
        if not ok:
            return None, None, None

        ref_id, finish, ok = self.evalId(data, cursor, text)
        if not ok:
            return None, None, None

        begin = m.start(0)
        # Clean up line breaks in id
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)
        references = self.md.references
        if ref_id not in references:  # ignore undefined refs
            return None, begin, finish

        href, title = references[ref_id]
        return self.makeTag(href, title, text), begin, finish

    def evalId(self, data: str, index: int, text: str) -> tuple[str | None, int, bool]:
        """
        Evaluate the id portion of `[ref][id]`.

        An empty id, as in `[ref][]`, falls back to `[ref]`. The id is lowercased either way.

        Returns:
            id: The lowercased id, or `None` if no id portion follows.
            end: Position just past the id portion, or the given `index` if there is none.
            handled: Whether an id portion was found.
        """
        found = self.RE_LINK.match(data, pos=index)
        if not found:
            return None, index, False

        ref_id = found.group(1).lower() or text.lower()
        return ref_id, found.end(0), True

    def makeTag(self, href: str, title: str, text: str) -> etree.Element:
        """ Return an `a` [`Element`][xml.etree.ElementTree.Element] for the given link, title and text. """
        anchor = etree.Element('a')
        anchor.set('href', href)
        # An empty title is left out altogether.
        if title:
            anchor.set('title', title)
        anchor.text = text
        return anchor

930

931

class ShortReferenceInlineProcessor(ReferenceInlineProcessor):
    """Short form of reference: `[google]`. """
    def evalId(self, data: str, index: int, text: str) -> tuple[str, int, bool]:
        """
        Evaluate the id of `[ref]`.

        The id is the lower-cased `text`; `index` is returned unchanged and
        the result is always reported as handled.
        """
        ref_id = text.lower()
        return ref_id, index, True

938

939

class ImageReferenceInlineProcessor(ReferenceInlineProcessor):
    """ Match to a stored reference and return `img` element. """
    def makeTag(self, href: str, title: str, text: str) -> etree.Element:
        """
        Return an `img` [`Element`][xml.etree.ElementTree.Element].

        `href` becomes the `src` attribute, `title` is set only when it is
        truthy, and the unescaped `text` becomes the `alt` attribute.
        """
        image = etree.Element("img")
        image.set("src", href)
        if title:
            image.set("title", title)
        alt_text = self.unescape(text)
        image.set("alt", alt_text)
        return image

950

951

class ShortImageReferenceInlineProcessor(ImageReferenceInlineProcessor):
    """ Short form of image reference: `![ref]`. """
    def evalId(self, data: str, index: int, text: str) -> tuple[str, int, bool]:
        """
        Evaluate the id of `[ref]`.

        The id is the lower-cased `text`; `index` is returned unchanged and
        the result is always reported as handled.
        """
        ref_id = text.lower()
        return ref_id, index, True

958

959

class AutolinkInlineProcessor(InlineProcessor):
    """ Return a link Element given an auto-link (`<http://example/com>`). """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """
        Return an `a` [`Element`][xml.etree.ElementTree.Element] of `group(1)`.

        The unescaped `group(1)` is used as the `href`, while the element
        text is `group(1)` wrapped in `util.AtomicString`. The returned span
        covers the whole match.
        """
        target = m.group(1)

        link = etree.Element("a")
        link.set('href', self.unescape(target))
        link.text = util.AtomicString(target)

        span_start, span_end = m.start(0), m.end(0)
        return link, span_start, span_end

968

969

class AutomailInlineProcessor(InlineProcessor):
    """
    Return a `mailto` link Element given an auto-mail link (`<foo@example.com>`).
    """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """
        Return an [`Element`][xml.etree.ElementTree.Element] containing a `mailto` link of `group(1)`.

        A leading `mailto:` is stripped from the unescaped address. Every
        character of the address is then written into the element text as an
        entity, and every character of the `mailto:` URL is written into the
        `href` as a numeric reference, each prefixed with
        `util.AMP_SUBSTITUTE`. The returned span covers the whole match.
        """
        link = etree.Element('a')

        address = self.unescape(m.group(1))
        if address.startswith("mailto:"):
            address = address[len("mailto:"):]

        def as_entity(code: int) -> str:
            """Return the named entity for `code`, or a numeric one if no name is defined."""
            name = entities.codepoint2name.get(code)
            if not name:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
            return "{}{};".format(util.AMP_SUBSTITUTE, name)

        encoded_text = ''.join(as_entity(ord(char)) for char in address)
        link.text = util.AtomicString(encoded_text)

        # The `href` uses numeric references only, including for the
        # `mailto:` scheme itself.
        target = "mailto:" + address
        encoded_target = "".join(
            util.AMP_SUBSTITUTE + '#%d;' % ord(char) for char in target
        )
        link.set('href', encoded_target)

        return link, m.start(0), m.end(0)

996 return el, m.start(0), m.end(0)