Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/markdown/inlinepatterns.py: 42%

1# Python Markdown

3# A Python implementation of John Gruber's Markdown.

5# Documentation: https://python-markdown.github.io/

6# GitHub: https://github.com/Python-Markdown/markdown/

7# PyPI: https://pypi.org/project/Markdown/

9# Started by Manfred Stienstra (http://www.dwerg.net/).

10# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).

11# Currently maintained by Waylan Limberg (https://github.com/waylan),

12# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

18# License: BSD (see LICENSE.md for details).

20"""

21In version 3.0, a new, more flexible inline processor was added, [`markdown.inlinepatterns.InlineProcessor`][]. The

22original inline patterns, which inherit from [`markdown.inlinepatterns.Pattern`][] or one of its children are still

23supported, though users are encouraged to migrate.

25The new `InlineProcessor` provides two major enhancements to `Patterns`:

271. Inline Processors no longer need to match the entire block, so regular expressions no longer need to start with

28 `r'^(.*?)'` and end with `r'(.*?)$'`. This runs faster. The returned [`Match`][re.Match] object will only contain

29 what is explicitly matched in the pattern, and extension pattern groups now start with `m.group(1)`.

312. The `handleMatch` method now takes an additional input called `data`, which is the entire block under analysis,

32 not just what is matched with the specified pattern. The method now returns the element *and* the indexes relative

33 to `data` that the return element is replacing (usually `m.start(0)` and `m.end(0)`). If the boundaries are

34 returned as `None`, it is assumed that the match did not take place, and nothing will be altered in `data`.

36 This allows handling of more complex constructs than regular expressions can handle, e.g., matching nested

37 brackets, and explicit control of the span "consumed" by the processor.

39"""

41from __future__ import annotations

43from . import util

44from typing import TYPE_CHECKING, Any, Collection, NamedTuple

45import re

46import xml.etree.ElementTree as etree

47from html import entities

49if TYPE_CHECKING: # pragma: no cover

50 from markdown import Markdown

def build_inlinepatterns(md: Markdown, **kwargs: Any) -> util.Registry[InlineProcessor]:
    """
    Assemble the registry holding Markdown's default inline processors.

    Ordering is significant. Replacing `http://.../` links with `<a>` tags *before* inline HTML is
    handled, for instance, would leave a mess behind. The entries below are therefore listed with
    descending priority numbers, in this sequence:

    * backticks and escaped characters come first so that they can preempt every other markdown
      pattern by escaping it;

    * next come the different link flavours (auto-links have to precede inline HTML);

    * then inline HTML, which at this stage is merely swapped for a placeholder while the real HTML
      goes into a stash;

    * strong, emphasis and friends are applied last.

    Arguments:
        md: The `Markdown` instance handed to every processor that keeps a reference to it.
        **kwargs: Accepted but not used by this function.

    Returns: A `util.Registry` populated with the default inline processors.

    """
    registry = util.Registry()

    # (processor, registered name, priority) -- listed in the order they are registered.
    defaults = (
        (BacktickInlineProcessor(BACKTICK_RE), 'backtick', 190),
        (EscapeInlineProcessor(ESCAPE_RE, md), 'escape', 180),
        (ReferenceInlineProcessor(REFERENCE_RE, md), 'reference', 170),
        (LinkInlineProcessor(LINK_RE, md), 'link', 160),
        (ImageInlineProcessor(IMAGE_LINK_RE, md), 'image_link', 150),
        (ImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'image_reference', 140),
        (ShortReferenceInlineProcessor(REFERENCE_RE, md), 'short_reference', 130),
        (ShortImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'short_image_ref', 125),
        (AutolinkInlineProcessor(AUTOLINK_RE, md), 'autolink', 120),
        (AutomailInlineProcessor(AUTOMAIL_RE, md), 'automail', 110),
        (SubstituteTagInlineProcessor(LINE_BREAK_RE, 'br'), 'linebreak', 100),
        (HtmlInlineProcessor(HTML_RE, md), 'html', 90),
        (HtmlInlineProcessor(ENTITY_RE, md), 'entity', 80),
        (SimpleTextInlineProcessor(NOT_STRONG_RE), 'not_strong', 70),
        (AsteriskProcessor(r'\*'), 'em_strong', 60),
        (UnderscoreProcessor(r'_'), 'em_strong2', 50),
    )
    for processor, name, priority in defaults:
        registry.register(processor, name, priority)
    return registry

98# The actual regular expressions for patterns

99# -----------------------------------------------------------------------------

100

# NOTE: every constant below is a regular expression *source string*; the processors compile them.

NOIMG = r'(?<!\!)'
""" Match not an image. Partial regular expression which matches if not preceded by `!`. """

BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\2(?!`))'
""" Match backtick quoted string (`` `e=f()` `` or ``` ``e=f("`")`` ```). """

ESCAPE_RE = r'\\(.)'
""" Match a backslash escaped character (`\\<` or `\\*`). """

EMPHASIS_RE = r'(\*)([^\*]+)\1'
""" Match emphasis with an asterisk (`*emphasis*`). """

STRONG_RE = r'(\*{2})(.+?)\1'
""" Match strong with an asterisk (`**strong**`). """

SMART_STRONG_RE = r'(?<!\w)(_{2})(?!_)(.+?)(?<!_)\1(?!\w)'
""" Match strong with underscore while ignoring middle word underscores (`__smart__strong__`). """

SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\1(?!\w)'
""" Match emphasis with underscore while ignoring middle word underscores (`_smart_emphasis_`). """

SMART_STRONG_EM_RE = r'(?<!\w)(\_)\1(?!\1)(.+?)(?<!\w)\1(?!\1)(.+?)\1{3}(?!\w)'
""" Match strong emphasis with underscores (`__strong _em__`). """

EM_STRONG_RE = r'(\*)\1{2}(.+?)\1(.*?)\1{2}'
""" Match emphasis strong with asterisk (`***strongem***` or `***em*strong**`). """

EM_STRONG2_RE = r'(_)\1{2}(.+?)\1(.*?)\1{2}'
""" Match emphasis strong with underscores (`___emstrong___` or `___em_strong__`). """

STRONG_EM_RE = r'(\*)\1{2}(.+?)\1{2}(.*?)\1'
""" Match strong emphasis with asterisk (`***strong**em*`). """

STRONG_EM2_RE = r'(_)\1{2}(.+?)\1{2}(.*?)\1'
""" Match strong emphasis with underscores (`___strong__em_`). """

STRONG_EM3_RE = r'(\*)\1(?!\1)([^*]+?)\1(?!\1)(.+?)\1{3}'
""" Match strong emphasis with asterisk (`**strong*em***`). """

LINK_RE = NOIMG + r'\['
""" Match start of in-line link (`[text](url)` or `[text](<url>)` or `[text](url "title")`). """

IMAGE_LINK_RE = r'\!\['
""" Match start of in-line image link (`![alttxt](url)` or `![alttxt](<url>)`). """

REFERENCE_RE = LINK_RE
""" Match start of reference link (`[Label][3]`). """

IMAGE_REFERENCE_RE = IMAGE_LINK_RE
""" Match start of image reference (`![alt text][2]`). """

NOT_STRONG_RE = r'((^|(?<=\s))(\*{1,3}|_{1,3})(?=\s|$))'
""" Match a stand-alone `*` or `_`. """

AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^<>]*)>'
""" Match an automatic link (`<http://www.example.com>`). """

AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>'
""" Match an automatic email link (`<me@example.com>`). """

# The comment alternative walks the comment body one character at a time and refuses to step onto
# the start of another `<!--` or onto the closing `-->`. An empty lookahead `(?!)` can never
# succeed, which would restrict comment matching to the literal `<!---->`.
HTML_RE = r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|!--(?:(?!<!--|-->).)*--)>)'
""" Match an HTML tag (`<...>`). """

ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)'
""" Match an HTML entity (`&#38;` (decimal) or `&#x26;` (hex) or `&amp;` (named)). """

# Exactly two spaces followed by the newline -- a single trailing space is not a hard line break.
LINE_BREAK_RE = r'  \n'
""" Match two spaces at end of line. """

169

170

def dequote(string: str) -> str:
    """Strip one pair of matching quotes (`"` or `'`) wrapped around `string`, if there is one."""
    for mark in ('"', "'"):
        # Both ends must carry the *same* quote character for it to count as quoting.
        if string.startswith(mark) and string.endswith(mark):
            return string[1:-1]
    return string

178

179

class EmStrongItem(NamedTuple):
    """One entry of an emphasis/strong processor's `PATTERNS` list."""

    # Compiled expression tried at a candidate position.
    pattern: re.Pattern[str]
    # Name of the builder to use for a match (`'single'`, `'double'` or `'double2'`).
    builder: str
    # Tag name for the builder; two comma separated names for the double builders.
    tags: str

185

186

187# The pattern classes

188# -----------------------------------------------------------------------------

189

190

class Pattern:  # pragma: no cover
    """
    Legacy base class for inline patterns.

    Every `Pattern` subclass wraps a single regular expression and is expected to provide
    [`getCompiledRegExp`][markdown.inlinepatterns.Pattern.getCompiledRegExp] and
    [`handleMatch`][markdown.inlinepatterns.Pattern.handleMatch].

    The expression handed to the constructor is embedded between `^(.*?)` and `(.*)$` before it is
    compiled, so a successful match always spans the whole block: `group(1)` holds the text in front
    of the pattern, the final group holds the text behind it, and the pattern's own groups start at
    `group(2)`.

    The newer [`markdown.inlinepatterns.InlineProcessor`][] searches more efficiently and flexibly and
    should be preferred. `Pattern` is retained for backward compatibility with the many third-party
    extensions built on it.

    """

    ANCESTOR_EXCLUDES: Collection[str] = tuple()
    """
    Tag names that must not become ancestors of this pattern's output. The processor is skipped
    whenever applying it would place the content below one of these tags.
    """

    compiled_re: re.Pattern[str]
    md: Markdown | None

    def __init__(self, pattern: str, md: Markdown | None = None):
        """
        Create an instance of an inline pattern.

        Arguments:
            pattern: A regular expression that matches a pattern.
            md: An optional pointer to the instance of `markdown.Markdown`; it is stored as
                `self.md` on the class instance.

        """
        self.pattern = pattern
        # Wrap the caller's expression so that the match covers the entire block.
        whole_block = r"^(.*?)%s(.*)$" % pattern
        self.compiled_re = re.compile(whole_block, re.DOTALL | re.UNICODE)

        self.md = md

    def getCompiledRegExp(self) -> re.Pattern:
        """ Return the regular expression compiled by the constructor. """
        return self.compiled_re

    def handleMatch(self, m: re.Match[str]) -> etree.Element | str:
        """Build the replacement for a match.

        This base implementation does nothing; subclasses should override it.

        Arguments:
            m: A match object containing a match of the pattern.

        Returns: An ElementTree Element object (or a string).

        """
        pass  # pragma: no cover

    def type(self) -> str:
        """ Return the name of the instance's class, which identifies the pattern type. """
        return type(self).__name__

    def unescape(self, text: str) -> str:
        """ Return `text` with every inline placeholder swapped for the plain text it stands for. """
        try:
            stashed = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m):
            key = m.group(1)
            if key not in stashed:
                # Unknown placeholder: nothing to substitute.
                return None
            node = stashed.get(key)
            # A stashed string is used verbatim; for an `etree` Element only its text content is kept.
            return node if isinstance(node, str) else ''.join(node.itertext())

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

275

276

class InlineProcessor(Pattern):
    """
    Base class for the newer style of inline processors.

    In contrast to `Pattern`, the expression is compiled exactly as given (it is not wrapped to span
    the whole block), which makes for a more efficient and flexible search.

    """

    def __init__(self, pattern: str, md: Markdown | None = None):
        """
        Create an instance of an inline processor.

        Arguments:
            pattern: A regular expression that matches a pattern.
            md: An optional pointer to the instance of `markdown.Markdown`; it is stored as
                `self.md` on the class instance.

        """
        self.pattern = pattern
        self.compiled_re = re.compile(pattern, re.DOTALL | re.UNICODE)

        # API for Markdown to pass `safe_mode` into instance
        self.safe_mode = False
        self.md = md

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str | None, int | None, int | None]:
        """Build the replacement for a match and report which span of `data` it replaces.

        Returning `None` for `start` and/or `end` signals that the processor found no valid region
        of text after all.

        This base implementation does nothing; subclasses should override it.

        Arguments:
            m: A re match object containing a match of the pattern.
            data: The buffer currently under analysis.

        Returns:
            el: The ElementTree element, text or None.
            start: The start of the region that has been matched or None.
            end: The end of the region that has been matched or None.

        """
        pass  # pragma: no cover

323

324

class SimpleTextPattern(Pattern):  # pragma: no cover
    """ Replace a match with the plain text captured in `group(2)`. """

    def handleMatch(self, m: re.Match[str]) -> str:
        """ Return the string captured by `group(2)` of the match. """
        captured = m.group(2)
        return captured

330

331

class SimpleTextInlineProcessor(InlineProcessor):
    """ Replace a match with the plain text captured in `group(1)`. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]:
        """ Return the string captured by `group(1)` together with the span of the whole match. """
        captured = m.group(1)
        start, end = m.start(0), m.end(0)
        return captured, start, end

337

338

class EscapeInlineProcessor(InlineProcessor):
    """ Return an escaped character. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str | None, int, int]:
        """
        Encode a backslash-escaped character.

        When the character captured by `group(1)` is listed in
        [`ESCAPED_CHARS`][markdown.Markdown.ESCAPED_CHARS], the result is the character's Unicode code
        point (as returned by [`ord`][]) wrapped in [`util.STX`][markdown.util.STX] and
        [`util.ETX`][markdown.util.ETX].

        For any other character the returned text is `None`. The span of the match is returned in
        both cases.
        """
        escaped = m.group(1)
        encoded = (
            '{}{}{}'.format(util.STX, ord(escaped), util.ETX)
            if escaped in self.md.ESCAPED_CHARS
            else None
        )
        return encoded, m.start(0), m.end(0)

356

357

class SimpleTagPattern(Pattern):  # pragma: no cover
    """
    Wrap the text captured in `group(3)` of a Pattern in an element of type `tag`.

    """

    def __init__(self, pattern: str, tag: str):
        """
        Create an instance of a simple tag pattern.

        Arguments:
            pattern: A regular expression that matches a pattern.
            tag: Tag of element.

        """
        Pattern.__init__(self, pattern)
        self.tag = tag
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """
        Return an [`Element`][xml.etree.ElementTree.Element] of type `tag` whose text is the string
        captured by `group(3)` of the match.
        """
        element = etree.Element(self.tag)
        element.text = m.group(3)
        return element

385

386

class SimpleTagInlineProcessor(InlineProcessor):
    """
    Wrap the text captured in `group(2)` of a Pattern in an element of type `tag`.

    """

    def __init__(self, pattern: str, tag: str):
        """
        Create an instance of a simple tag processor.

        Arguments:
            pattern: A regular expression that matches a pattern.
            tag: Tag of element.

        """
        InlineProcessor.__init__(self, pattern)
        self.tag = tag
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:  # pragma: no cover
        """
        Return an [`Element`][xml.etree.ElementTree.Element] of type `tag` whose text is the string
        captured by `group(2)`, together with the span of the whole match.
        """
        element = etree.Element(self.tag)
        element.text = m.group(2)
        span_start, span_end = m.start(0), m.end(0)
        return element, span_start, span_end

414

415

class SubstituteTagPattern(SimpleTagPattern):  # pragma: no cover
    """ Replace a match with a childless element of type `tag`. """

    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """ Return a fresh, empty [`Element`][xml.etree.ElementTree.Element] of type `tag`. """
        placeholder = etree.Element(self.tag)
        return placeholder

421

422

class SubstituteTagInlineProcessor(SimpleTagInlineProcessor):
    """ Replace a match with a childless element of type `tag`. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """
        Return a fresh, empty [`Element`][xml.etree.ElementTree.Element] of type `tag` together with
        the span of the whole match.
        """
        placeholder = etree.Element(self.tag)
        return placeholder, m.start(0), m.end(0)

428

429

class BacktickInlineProcessor(InlineProcessor):
    """ Turn backtick-quoted text into a `<code>` element holding the escaped text. """

    def __init__(self, pattern: str):
        InlineProcessor.__init__(self, pattern)
        # Encoded stand-in for a literal backslash: its code point wrapped in STX/ETX.
        self.ESCAPED_BSLASH = '{}{}{}'.format(util.STX, ord('\\'), util.ETX)
        self.tag = 'code'
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str, int, int]:
        """
        Handle either alternative of the backtick pattern.

        With a non-empty `group(3)`, return a `code` [`Element`][xml.etree.ElementTree.Element] whose
        text is the stripped content, escaped with [`code_escape`][markdown.util.code_escape] and wrapped
        in an [`AtomicString`][markdown.util.AtomicString].

        Otherwise return the text of `group(1)` with each pair of backslashes replaced by the encoded
        backslash.

        The span of the whole match is returned in both cases.

        """
        span = (m.start(0), m.end(0))
        code_text = m.group(3)
        if not code_text:
            escaped_run = m.group(1).replace('\\\\', self.ESCAPED_BSLASH)
            return escaped_run, span[0], span[1]

        code_el = etree.Element(self.tag)
        code_el.text = util.AtomicString(util.code_escape(code_text.strip()))
        return code_el, span[0], span[1]

453

454

class DoubleTagPattern(SimpleTagPattern):  # pragma: no cover
    """Build an element of type `tag2` nested inside an element of type `tag1`.

    `self.tag` holds both names separated by a comma. Useful for strong emphasis etc.

    """

    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """
        Return an [`Element`][xml.etree.ElementTree.Element] of the following shape:
        `<tag1><tag2>group(3)</tag2>group(4)</tag1>` where `group(4)` is optional.

        """
        outer_tag, inner_tag = self.tag.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.SubElement(outer, inner_tag)
        inner.text = m.group(3)
        # Five groups means the expression captured trailing text that follows the inner element.
        has_trailing_text = len(m.groups()) == 5
        if has_trailing_text:
            inner.tail = m.group(4)
        return outer

474

475

class DoubleTagInlineProcessor(SimpleTagInlineProcessor):
    """Build an element of type `tag2` nested inside an element of type `tag1`.

    `self.tag` holds both names separated by a comma. Useful for strong emphasis etc.

    """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:  # pragma: no cover
        """
        Return an [`Element`][xml.etree.ElementTree.Element] of the following shape:
        `<tag1><tag2>group(2)</tag2>group(3)</tag1>` where `group(3)` is optional, together with the
        span of the whole match.

        """
        outer_tag, inner_tag = self.tag.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.SubElement(outer, inner_tag)
        inner.text = m.group(2)
        # Three groups means the expression captured trailing text that follows the inner element.
        has_trailing_text = len(m.groups()) == 3
        if has_trailing_text:
            inner.tail = m.group(3)
        return outer, m.start(0), m.end(0)

495

496

class HtmlInlineProcessor(InlineProcessor):
    """ Stash raw inline HTML and put a placeholder in its place. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]:
        """ Stash the text of `group(1)` and return the placeholder with the span of the match. """
        # Placeholders are expanded first, then backslash escapes are restored.
        expanded = self.unescape(m.group(1))
        rawhtml = self.backslash_unescape(expanded)
        placeholder = self.md.htmlStash.store(rawhtml)
        return placeholder, m.start(0), m.end(0)

    def unescape(self, text: str) -> str:
        """ Return `text` with every inline placeholder swapped for its serialized stashed node. """
        try:
            stashed = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m: re.Match[str]) -> str:
            node = stashed.get(m.group(1))
            if node is None:
                # Unknown placeholder: nothing to substitute.
                return None
            try:
                return self.md.serializer(node)
            except Exception:
                # The serializer rejected the value; fall back to its string form behind a backslash.
                return r'\%s' % node

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

    def backslash_unescape(self, text: str) -> str:
        """ Return text with backslash escapes undone (backslashes are restored). """
        try:
            escape_re = self.md.treeprocessors['unescape'].RE
        except KeyError:  # pragma: no cover
            return text

        # `group(1)` carries the decimal code point of the escaped character.
        return escape_re.sub(lambda found: chr(int(found.group(1))), text)

534

535

class AsteriskProcessor(InlineProcessor):
    """Emphasis processor for handling strong and em matches inside asterisks."""

    PATTERNS = [
        EmStrongItem(re.compile(expression, re.DOTALL | re.UNICODE), builder, tags)
        for expression, builder, tags in (
            (EM_STRONG_RE, 'double', 'strong,em'),
            (STRONG_EM_RE, 'double', 'em,strong'),
            (STRONG_EM3_RE, 'double2', 'strong,em'),
            (STRONG_RE, 'single', 'strong'),
            (EMPHASIS_RE, 'single', 'em'),
        )
    ]
    """ The various strong and emphasis patterns handled by this processor. """

    def build_single(self, m: re.Match[str], tag: str, idx: int) -> etree.Element:
        """Return a single `tag` element built from `group(2)`, with nested patterns resolved."""
        element = etree.Element(tag)
        self.parse_sub_patterns(m.group(2), element, None, idx)
        return element

    def build_double(self, m: re.Match[str], tags: str, idx: int) -> etree.Element:
        """Return `<tag1><tag2>group(2)</tag2>group(3)</tag1>`; `tags` is `'tag1,tag2'`."""

        outer_tag, inner_tag = tags.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.Element(inner_tag)
        self.parse_sub_patterns(m.group(2), inner, None, idx)
        outer.append(inner)
        if len(m.groups()) == 3:
            # Trailing text belongs to the outer element and follows the inner one.
            self.parse_sub_patterns(m.group(3), outer, inner, idx)
        return outer

    def build_double2(self, m: re.Match[str], tags: str, idx: int) -> etree.Element:
        """Return double tags (variant 2): `<strong>text <em>text</em></strong>`."""

        outer_tag, inner_tag = tags.split(",")
        outer = etree.Element(outer_tag)
        inner = etree.Element(inner_tag)
        # `group(2)` is the outer element's leading content ...
        self.parse_sub_patterns(m.group(2), outer, None, idx)
        # ... and `group(3)` fills the inner element appended after it.
        outer.append(inner)
        self.parse_sub_patterns(m.group(3), inner, None, idx)
        return outer

    def parse_sub_patterns(
        self, data: str, parent: etree.Element, last: etree.Element | None, idx: int
    ) -> None:
        """
        Resolve emphasis/strong patterns nested inside `data` and attach the result to `parent`.

        `data`: text to evaluate.

        `parent`: Parent to attach text and sub elements to.

        `last`: Last appended child to parent. Can also be None if parent has no children.

        `idx`: Current pattern index that was used to evaluate the parent.
        """

        def attach_text(chunk: str, sibling: etree.Element | None) -> None:
            # Plain text trails the most recent child when there is one; otherwise it is the
            # parent's own text. Empty chunks are ignored.
            if not chunk:
                return
            if sibling is not None:
                sibling.tail = chunk
            else:
                parent.text = chunk

        consumed = 0  # everything before this offset has already been attached
        cursor = 0
        total = len(data)

        while cursor < total:
            # Only positions where the processor's own start token matches are worth a closer look.
            if not self.compiled_re.match(data, cursor):
                cursor += 1
                continue

            found = False
            for rank, item in enumerate(self.PATTERNS):
                # Only patterns listed after the one used on the parent are evaluated.
                if rank <= idx:
                    continue
                hit = item.pattern.match(data, cursor)
                if not hit:
                    continue
                attach_text(data[consumed:hit.start(0)], last)
                child = self.build_element(hit, item.builder, item.tags, rank)
                parent.append(child)
                last = child
                # Continue right behind the hunk that was just consumed.
                consumed = cursor = hit.end(0)
                found = True

            if not found:
                # Nothing matched here; try the next character.
                cursor += 1

        # Whatever is left over is plain text.
        attach_text(data[consumed:], last)

    def build_element(self, m: re.Match[str], builder: str, tags: str, index: int) -> etree.Element:
        """Dispatch to the builder named by `builder`; unknown names use the single builder."""

        dispatch = {
            'double2': self.build_double2,
            'double': self.build_double,
        }
        build = dispatch.get(builder, self.build_single)
        return build(m, tags, index)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """Try each pattern at the start of `m`; return the first match's element and span.

        Returns `(None, None, None)` when none of the patterns match there.
        """

        anchor = m.start(0)
        for index, item in enumerate(self.PATTERNS):
            found = item.pattern.match(data, anchor)
            if found:
                start, end = found.start(0), found.end(0)
                element = self.build_element(found, item.builder, item.tags, index)
                return element, start, end
        return None, None, None

668

669

class UnderscoreProcessor(AsteriskProcessor):
    """Emphasis processor for handling strong and em matches inside underscores."""

    PATTERNS = [
        EmStrongItem(re.compile(expression, re.DOTALL | re.UNICODE), builder, tags)
        for expression, builder, tags in (
            (EM_STRONG2_RE, 'double', 'strong,em'),
            (STRONG_EM2_RE, 'double', 'em,strong'),
            (SMART_STRONG_EM_RE, 'double2', 'strong,em'),
            (SMART_STRONG_RE, 'single', 'strong'),
            (SMART_EMPHASIS_RE, 'single', 'em'),
        )
    ]
    """ The various strong and emphasis patterns handled by this processor. """

681

682

class LinkInlineProcessor(InlineProcessor):
    """ Return a link element from the given match. """

    # Matches the opening `(` of the link destination and, optionally, a complete
    # `<url> "title")` tail. The parentheses must be escaped literals: `getLink` relies on
    # `m.end()` sitting right behind the `(` (or behind the closing `)` when group 1 matched).
    RE_LINK = re.compile(r'''\(\s*(?:(<[^<>]*>)\s*(?:('[^']*'|"[^"]*")\s*)?\))?''', re.DOTALL | re.UNICODE)
    # Every whitespace character inside a title is normalized to a plain space.
    RE_TITLE_CLEAN = re.compile(r'\s')

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """ Return an `a` [`Element`][xml.etree.ElementTree.Element] or `(None, None, None)`. """
        # `m` only matched the opening `[`; the text and the destination are parsed by hand.
        text, index, handled = self.getText(data, m.end(0))

        if not handled:
            return None, None, None

        href, title, index, handled = self.getLink(data, index)
        if not handled:
            return None, None, None

        el = etree.Element("a")
        el.text = text

        el.set("href", href)

        if title is not None:
            el.set("title", title)

        # The consumed span runs from the `[` to the end of the parsed destination.
        return el, m.start(0), index

    def getLink(self, data: str, index: int) -> tuple[str, str | None, int, bool]:
        """Parse data between `()` of `[Text]()` allowing recursive `()`.

        Arguments:
            data: The buffer currently under analysis.
            index: Position in `data` directly behind the closing `]` of the link text.

        Returns:
            href: The link destination (placeholders expanded, surrounding whitespace stripped).
            title: The link title, or `None` when no title was found.
            index: Position in `data` behind the parsed link.
            handled: `True` when a complete, closed destination was found.

        """

        href = ''
        title: str | None = None
        handled = False

        m = self.RE_LINK.match(data, pos=index)
        if m and m.group(1):
            # Matches [Text](<link> "title")
            href = m.group(1)[1:-1].strip()
            if m.group(2):
                title = m.group(2)[1:-1]
            index = m.end(0)
            handled = True
        elif m:
            # Track bracket nesting and index in string
            bracket_count = 1
            backtrack_count = 1
            start_index = m.end()
            index = start_index
            last_bracket = -1

            # Primary (first found) quote tracking.
            quote: str | None = None
            start_quote = -1
            exit_quote = -1
            ignore_matches = False

            # Secondary (second found) quote tracking.
            alt_quote = None
            start_alt_quote = -1
            exit_alt_quote = -1

            # Track last character
            last = ''

            # NOTE: `index` equals `pos` at the top of each iteration and `pos + 1` once it has
            # been incremented below, so `index + 1` recorded before the increment is one past
            # the current character.
            for pos in range(index, len(data)):
                c = data[pos]
                if c == '(':
                    # Count nested (
                    # Don't increment the bracket count if we are sure we're in a title.
                    if not ignore_matches:
                        bracket_count += 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                elif c == ')':
                    # Match nested ) to (
                    # Don't decrement if we are sure we are in a title that is unclosed.
                    if ((exit_quote != -1 and quote == last) or (exit_alt_quote != -1 and alt_quote == last)):
                        bracket_count = 0
                    elif not ignore_matches:
                        bracket_count -= 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                        # We've found our backup end location if the title doesn't resolve.
                        if backtrack_count == 0:
                            last_bracket = index + 1

                elif c in ("'", '"'):
                    # Quote has started
                    if not quote:
                        # We'll assume we are now in a title.
                        # Brackets are quoted, so no need to match them (except for the final one).
                        ignore_matches = True
                        backtrack_count = bracket_count
                        bracket_count = 1
                        start_quote = index + 1
                        quote = c
                    # Secondary quote (in case the first doesn't resolve): [text](link'"title")
                    elif c != quote and not alt_quote:
                        start_alt_quote = index + 1
                        alt_quote = c
                    # Update primary quote match
                    elif c == quote:
                        exit_quote = index + 1
                    # Update secondary quote match
                    elif alt_quote and c == alt_quote:
                        exit_alt_quote = index + 1

                index += 1

                # Link is closed, so let's break out of the loop
                if bracket_count == 0:
                    # Get the title if we closed a title string right before link closed
                    if exit_quote >= 0 and quote == last:
                        href = data[start_index:start_quote - 1]
                        title = data[start_quote:exit_quote - 1]
                    elif exit_alt_quote >= 0 and alt_quote == last:
                        href = data[start_index:start_alt_quote - 1]
                        title = data[start_alt_quote:exit_alt_quote - 1]
                    else:
                        href = data[start_index:index - 1]
                    break

                # Spaces are skipped so that `last` is the last non-space character seen.
                if c != ' ':
                    last = c

            # We have a scenario: `[test](link"notitle)`
            # When we enter a string, we stop tracking bracket resolution in the main counter,
            # but we do keep a backup counter up until we discover where we might resolve all brackets
            # if the title string fails to resolve.
            if bracket_count != 0 and backtrack_count == 0:
                href = data[start_index:last_bracket - 1]
                index = last_bracket
                bracket_count = 0

            handled = bracket_count == 0

        if title is not None:
            title = self.RE_TITLE_CLEAN.sub(' ', dequote(self.unescape(title.strip())))

        href = self.unescape(href).strip()

        return href, title, index, handled

    def getText(self, data: str, index: int) -> tuple[str, int, bool]:
        """Parse the content between `[]` of the start of an image or link
        resolving nested square brackets.

        Arguments:
            data: The buffer currently under analysis.
            index: Position in `data` directly behind the opening `[`.

        Returns:
            text: The characters collected before the matching `]` (or up to the end of `data`).
            index: Position in `data` behind the last character that was examined.
            handled: `True` when the matching `]` was found.

        """
        bracket_count = 1
        text = []
        for pos in range(index, len(data)):
            c = data[pos]
            if c == ']':
                bracket_count -= 1
            elif c == '[':
                bracket_count += 1
            index += 1
            if bracket_count == 0:
                # The closing bracket itself is not part of the text.
                break
            text.append(c)
        return ''.join(text), index, bracket_count == 0

843

844

class ImageInlineProcessor(LinkInlineProcessor):
    """ Return a `img` element from the given match. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """ Return an `img` [`Element`][xml.etree.ElementTree.Element] or `(None, None, None)`. """
        nothing = (None, None, None)

        # The bracketed part supplies the alternative text ...
        alt_text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return nothing

        # ... and the parenthesized part supplies the source and the optional title.
        src, title, index, handled = self.getLink(data, index)
        if not handled:
            return nothing

        image = etree.Element("img")
        image.set("src", src)
        if title is not None:
            image.set("title", title)
        image.set('alt', self.unescape(alt_text))

        return image, m.start(0), index

867

868

class ReferenceInlineProcessor(LinkInlineProcessor):
    """ Match to a stored reference and return link element. """
    NEWLINE_CLEANUP_RE = re.compile(r'\s+', re.MULTILINE)

    RE_LINK = re.compile(r'\s?\[([^\]]*)\]', re.DOTALL | re.UNICODE)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Return the [`Element`][xml.etree.ElementTree.Element] produced by `makeTag`, or
        `(None, None, None)` when the text or the id cannot be parsed.

        A well-formed reference whose id is not defined yields `None` as the element but still
        reports its span.

        """
        text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return None, None, None

        ref_id, end, handled = self.evalId(data, index, text)
        if not handled:
            return None, None, None

        # Runs of whitespace (line breaks included) inside the id collapse to one space.
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)
        references = self.md.references
        if ref_id not in references:  # ignore undefined refs
            return None, m.start(0), end

        href, title = references[ref_id]
        return self.makeTag(href, title, text), m.start(0), end

    def evalId(self, data: str, index: int, text: str) -> tuple[str | None, int, bool]:
        """
        Evaluate the id portion of `[ref][id]`.

        An empty id, as in `[ref][]`, falls back to the link text `[ref]`. The id is lower-cased.
        """
        found = self.RE_LINK.match(data, pos=index)
        if not found:
            return None, index, False

        ref_id = found.group(1).lower() or text.lower()
        return ref_id, found.end(0), True

    def makeTag(self, href: str, title: str, text: str) -> etree.Element:
        """ Return an `a` [`Element`][xml.etree.ElementTree.Element]. """
        anchor = etree.Element('a')
        anchor.set('href', href)
        if title:
            anchor.set('title', title)
        anchor.text = text
        return anchor

923

924

class ShortReferenceInlineProcessor(ReferenceInlineProcessor):
    """ Short form of a reference link: `[google]`. """
    def evalId(self, data: str, index: int, text: str) -> tuple[str, int, bool]:
        """
        Evaluate the id of `[ref]`.

        The id is the lower-cased link text itself; `index` is returned
        unchanged and the result is always flagged as handled.
        """
        ref_id = text.lower()
        return ref_id, index, True

931

932

class ImageReferenceInlineProcessor(ReferenceInlineProcessor):
    """ Resolve a reference-style image against the stored references and return an `img` element. """
    def makeTag(self, href: str, title: str, text: str) -> etree.Element:
        """
        Return an `img` [`Element`][xml.etree.ElementTree.Element].

        `href` becomes `src`, a non-empty `title` becomes the `title`
        attribute, and the unescaped `text` becomes `alt`.
        """
        image = etree.Element("img")
        image.set("src", href)
        if title:
            image.set("title", title)
        alt_text = self.unescape(text)
        image.set("alt", alt_text)
        return image

943

944

class ShortImageReferenceInlineProcessor(ImageReferenceInlineProcessor):
    """ Short form of an image reference: `![ref]`. """
    def evalId(self, data: str, index: int, text: str) -> tuple[str, int, bool]:
        """
        Evaluate the id of `![ref]`.

        The id is the lower-cased bracketed text itself; `index` is returned
        unchanged and the result is always flagged as handled.
        """
        ref_id = text.lower()
        return ref_id, index, True

951

952

class AutolinkInlineProcessor(InlineProcessor):
    """ Turn an auto-link (`<http://example/com>`) into a link Element. """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """
        Return an `a` [`Element`][xml.etree.ElementTree.Element] of `group(1)`.

        The unescaped `group(1)` is used as `href`, while the raw `group(1)`
        wrapped in `util.AtomicString` is used as the link text. The whole
        match is the span being replaced.
        """
        target = m.group(1)
        anchor = etree.Element("a")
        anchor.set('href', self.unescape(target))
        anchor.text = util.AtomicString(target)
        return anchor, m.start(0), m.end(0)

961

962

class AutomailInlineProcessor(InlineProcessor):
    """
    Turn an auto-mail link (`<foo@example.com>`) into a `mailto` link Element.
    """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """
        Return an [`Element`][xml.etree.ElementTree.Element] containing a `mailto` link of `group(1)`.

        Every character of the address is written as an entity reference
        prefixed with `util.AMP_SUBSTITUTE` (presumably a placeholder that is
        turned into `&` later -- confirm in `util`). The link text uses named
        entities where one is defined; the `href` uses numeric references only.
        """
        scheme = "mailto:"
        address = self.unescape(m.group(1))
        if address.startswith(scheme):
            # Drop a leading scheme so it is not duplicated in the `href`.
            address = address[len(scheme):]

        def codepoint2name(code: int) -> str:
            """Return entity definition by code, or the code if not defined."""
            name = entities.codepoint2name.get(code)
            if not name:
                return f"{util.AMP_SUBSTITUTE}#{code};"
            return f"{util.AMP_SUBSTITUTE}{name};"

        anchor = etree.Element('a')
        encoded_text = ''.join(codepoint2name(ord(char)) for char in address)
        anchor.text = util.AtomicString(encoded_text)

        encoded_href = ''.join(
            f"{util.AMP_SUBSTITUTE}#{ord(char)};" for char in scheme + address
        )
        anchor.set('href', encoded_href)
        return anchor, m.start(0), m.end(0)