# bleach/linkifier.py
import re

from urllib.parse import quote

from bleach import callbacks as linkify_callbacks
from bleach import html5lib_shim


#: List of default callbacks
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]


TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
       xn xxx ye yt yu za zm zw""".split()

# Make sure that .com doesn't get matched by .co first
TLDS.reverse()


def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
    """Builds the url regex used by linkifier

    If you want a different set of tlds or allowed protocols, pass those in
    and stomp on the existing ``url_re``::

        from bleach import linkifier

        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

        linker = LinkifyFilter(url_re=my_url_re)

    """
    return re.compile(
        r"""\(*  # Match any opening parentheses.
        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b  # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
            # /path/zz (excluding "unsafe" chars from RFC 1738,
            # except for # and ~, which happen in practice)
        """.format(
            "|".join(sorted(protocols)), "|".join(sorted(tlds))
        ),
        re.IGNORECASE | re.VERBOSE | re.UNICODE,
    )
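
# Illustrative sketch (not part of the module): build_url_re can be used to
# narrow what gets linkified. The TLD and protocol values below are made up
# for the example.
#
#     my_url_re = build_url_re(tlds=["com", "org"], protocols=["http", "https"])
#     linker = Linker(url_re=my_url_re)
#     linker.linkify("docs live at example.com")  # only .com/.org hosts match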


URL_RE = build_url_re()

PROTO_RE = re.compile(r"^[\w-]+:/{0,3}", re.IGNORECASE)


def build_email_re(tlds=TLDS):
    """Builds the email regex used by linkifier

    If you want a different set of tlds, pass those in and stomp on the existing ``email_re``::

        from bleach import linkifier

        my_email_re = linkifier.build_email_re(my_tlds_list)

        linker = LinkifyFilter(email_re=my_email_re)

    """
    # open and closing braces doubled below for format string
    return re.compile(
        r"""(?<!//)
        (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
            (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)*  # dot-atom
        |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
            |\\[\001-\011\013\014\016-\177])*"  # quoted-string
        )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0}))  # domain
        """.format(
            "|".join(tlds)
        ),
        re.IGNORECASE | re.MULTILINE | re.VERBOSE,
    )
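
# Illustrative sketch (not part of the module): the same pattern for email
# matching--pass a restricted tld list and enable parse_email on the Linker.
# The names here are hypothetical.
#
#     my_email_re = build_email_re(tlds=["com", "net"])
#     linker = Linker(parse_email=True, email_re=my_email_re)
#     linker.linkify("mail me at jane@example.com")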


EMAIL_RE = build_email_re()


class Linker:
    """Convert URL-like strings in an HTML fragment to links

    This converts strings that look like URLs, domain names, and email
    addresses in text that may be an HTML fragment into links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify takes a best-effort approach and tries to recover from bad
    situations due to crazy text.

    """

    def __init__(
        self,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
        recognized_tags=html5lib_shim.HTML_TAGS,
    ):
        """Creates a Linker instance

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg set skip_tags: set of tags that you don't want to linkify the
            contents of; for example, you could set this to ``{'pre'}`` to skip
            linkifying contents of ``pre`` tags; ``None`` means you don't
            want linkify to skip any tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg url_re: url matching regex

        :arg email_re: email matching regex

        :arg set recognized_tags: the set of tags that linkify knows about;
            everything else gets escaped

        :returns: linkified text as unicode

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # Create a parser/tokenizer that allows all HTML tags and escapes
        # anything not in that list.
        self.parser = html5lib_shim.BleachHTMLParser(
            tags=frozenset(recognized_tags),
            strip=False,
            consume_entities=False,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker("etree")
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values="always",
            omit_optional_tags=False,
            # We want to leave entities as they are without escaping or
            # resolving or expanding
            resolve_entities=False,
            # linkify does not sanitize
            sanitize=False,
            # linkify preserves attr order
            alphabetical_attributes=False,
        )

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, str):
            raise TypeError("argument must be of text type")

        if not text:
            return ""

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)
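
# Illustrative sketch (not part of the module): typical Linker usage. With the
# default callbacks, new and existing links get rel="nofollow" added.
#
#     linker = Linker()
#     linker.linkify("see http://example.com for details")
#     # the url is wrapped in <a href="http://example.com" rel="nofollow">...</a>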


class LinkifyFilter(html5lib_shim.Filter):
    """html5lib filter that linkifies text

    This will do the following:

    * convert email addresses into links
    * convert urls into links
    * edit existing links by running them through callbacks--the default is to
      add a ``rel="nofollow"``

    This filter can be used anywhere html5lib filters can be used.

    """

    def __init__(
        self,
        source,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
    ):
        """Creates a LinkifyFilter instance

        :arg source: stream as an html5lib TreeWalker

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg set skip_tags: set of tags that you don't want to linkify the
            contents of; for example, you could set this to ``{'pre'}`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg url_re: url matching regex

        :arg email_re: email matching regex

        """
        super().__init__(source)

        self.callbacks = callbacks or []
        self.skip_tags = skip_tags or {}
        self.parse_email = parse_email

        self.url_re = url_re
        self.email_re = email_re

    def apply_callbacks(self, attrs, is_new):
        """Given an attrs dict and an is_new bool, runs through callbacks

        Callbacks can return an adjusted attrs dict or ``None``. In the case of
        ``None``, we stop going through callbacks and return that and the link
        gets dropped.

        :arg dict attrs: map of ``(namespace, name)`` -> ``value``

        :arg bool is_new: whether or not this link was added by linkify

        :returns: adjusted attrs dict or ``None``

        """
        for cb in self.callbacks:
            attrs = cb(attrs, is_new)
            if attrs is None:
                return None
        return attrs
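
    # Illustrative sketch (not part of the module): the callback contract that
    # apply_callbacks expects. A callback receives (attrs, is_new) and returns
    # an adjusted attrs dict, or None to drop the link. "target_blank" below is
    # a hypothetical callback written for this example.
    #
    #     def target_blank(attrs, is_new):
    #         attrs[(None, "target")] = "_blank"
    #         return attrs
    #
    #     Linker(callbacks=[*DEFAULT_CALLBACKS, target_blank])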

    def extract_character_data(self, token_list):
        """Extracts and squashes character sequences in a token stream"""
        # FIXME(willkg): This is a terrible idea. What it does is drop all the
        # tags from the token list and merge the Characters and SpaceCharacters
        # tokens into a single text.
        #
        # So something like this::
        #
        #     "<span>" "<b>" "some text" "</b>" "</span>"
        #
        # gets converted to "some text".
        #
        # This gets used to figure out the ``_text`` fauxttribute value for
        # linkify callables.
        #
        # I'm not really sure how else to support that ``_text`` fauxttribute and
        # maintain some modicum of backwards compatibility with previous versions
        # of Bleach.

        out = []
        for token in token_list:
            token_type = token["type"]
            if token_type in ["Characters", "SpaceCharacters"]:
                out.append(token["data"])

        return "".join(out)
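
    # Illustrative sketch (not part of the module): what extract_character_data
    # does with a buffered token list--tags are dropped, character data is joined.
    #
    #     tokens = [
    #         {"type": "StartTag", "name": "b", "data": {}},
    #         {"type": "Characters", "data": "some text"},
    #         {"type": "EndTag", "name": "b"},
    #     ]
    #     self.extract_character_data(tokens)  # -> "some text"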

    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {"type": "Characters", "data": text[end : match.start()]}
                        )

                    # URL-encode the "local-part" according to RFC6068
                    parts = match.group(0).split("@")
                    parts[0] = quote(parts[0])
                    address = "@".join(parts)

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, "href"): "mailto:%s" % address,
                        "_text": match.group(0),
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {"type": "Characters", "data": match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop("_text", "")
                        new_tokens.extend(
                            [
                                {"type": "StartTag", "name": "a", "data": attrs},
                                {"type": "Characters", "data": str(_text)},
                                {"type": "EndTag", "name": "a"},
                            ]
                        )
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({"type": "Characters", "data": text[end:]})

                    yield from new_tokens

                    continue

            yield token

    def strip_non_url_bits(self, fragment):
        """Strips non-url bits from the url

        This accounts for over-eager matching by the regex.

        """
        prefix = suffix = ""

        while fragment:
            # Try removing ( from the beginning and, if it's balanced, from the
            # end, too
            if fragment.startswith("("):
                prefix = prefix + "("
                fragment = fragment[1:]

                if fragment.endswith(")"):
                    suffix = ")" + suffix
                    fragment = fragment[:-1]
                continue

            # Now try extraneous things from the end. For example, sometimes we
            # pick up ) at the end of a url, but the url is in a parenthesized
            # phrase like:
            #
            #     "i looked at the site (at http://example.com)"

            if fragment.endswith(")") and "(" not in fragment:
                fragment = fragment[:-1]
                suffix = ")" + suffix
                continue

            # Handle commas
            if fragment.endswith(","):
                fragment = fragment[:-1]
                suffix = "," + suffix
                continue

            # Handle periods
            if fragment.endswith("."):
                fragment = fragment[:-1]
                suffix = "." + suffix
                continue

            # Nothing matched, so we're done
            break

        return fragment, prefix, suffix
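
    # Illustrative sketch (not part of the module): how strip_non_url_bits
    # splits an over-matched url into (fragment, prefix, suffix).
    #
    #     self.strip_non_url_bits("(http://example.com),")
    #     # -> ("http://example.com", "(", "),")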

    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        in_a = False  # True only if parse_email=True and an email was already linkified
        for token in src_iter:
            if in_a:
                if token["type"] == "EndTag" and token["name"] == "a":
                    in_a = False
                yield token
                continue
            elif token["type"] == "StartTag" and token["name"] == "a":
                in_a = True
                yield token
                continue
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {"type": "Characters", "data": text[end : match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ""

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = "http://%s" % url

                    attrs = {(None, "href"): href, "_text": url}
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {"type": "Characters", "data": prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append({"type": "Characters", "data": prefix})

                        _text = attrs.pop("_text", "")
                        new_tokens.extend(
                            [
                                {"type": "StartTag", "name": "a", "data": attrs},
                                {"type": "Characters", "data": str(_text)},
                                {"type": "EndTag", "name": "a"},
                            ]
                        )

                        if suffix:
                            new_tokens.append({"type": "Characters", "data": suffix})

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({"type": "Characters", "data": text[end:]})

                    yield from new_tokens

                    continue

            yield token

    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        """
        a_token = token_buffer[0]
        if a_token["data"]:
            attrs = a_token["data"]
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs["_text"] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {"type": "Characters", "data": text}

        else:
            new_text = attrs.pop("_text", "")
            a_token["data"] = attrs

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                yield from token_buffer[1:]

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # them with the new text
                yield a_token
                yield {"type": "Characters", "data": str(new_text)}
                yield token_buffer[-1]

    def extract_entities(self, token):
        """Handles Characters tokens with entities

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: generator of tokens

        """
        data = token.get("data", "")

        # If there isn't a & in the data, we can return now
        if "&" not in data:
            yield token
            return

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith("&"):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == "amp":
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with &amp; since that shows up in
                        # querystrings all the time. This special-cases &amp;
                        # and converts it to a & and sticks it in as a
                        # Characters token. It'll get merged with surrounding
                        # tokens in the BleachSanitizerFilter.__iter__ and
                        # escaped in the serializer.
                        new_tokens.append({"type": "Characters", "data": "&"})
                    else:
                        new_tokens.append({"type": "Entity", "name": entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_tokens.append({"type": "Characters", "data": remainder})
                    continue

            new_tokens.append({"type": "Characters", "data": part})

        yield from new_tokens

    def __iter__(self):
        in_a = False
        in_skip_tag = None

        token_buffer = []

        for token in super().__iter__():
            if in_a:
                # Handle the case where we're in an "a" tag--we want to buffer tokens
                # until we hit an end "a" tag.
                if token["type"] == "EndTag" and token["name"] == "a":
                    # Add the end tag to the token buffer and then handle them
                    # and yield anything returned
                    token_buffer.append(token)
                    yield from self.handle_a_tag(token_buffer)

                    # Clear "a" related state and continue since we've yielded all
                    # the tokens we're going to yield
                    in_a = False
                    token_buffer = []
                else:
                    token_buffer.append(token)
                continue

            if token["type"] in ["StartTag", "EmptyTag"]:
                if token["name"] in self.skip_tags:
                    # Skip tags start a "special mode" where we don't linkify
                    # anything until the end tag.
                    in_skip_tag = token["name"]

                elif token["name"] == "a":
                    # The "a" tag is special--we switch to a slurp mode and
                    # slurp all the tokens until the end "a" tag and then
                    # figure out what to do with them there.
                    in_a = True
                    token_buffer.append(token)

                    # We buffer the start tag, so we don't want to yield it,
                    # yet
                    continue

            elif in_skip_tag and self.skip_tags:
                # NOTE(willkg): We put this clause here since in_a and
                # switching in and out of in_a takes precedence.
                if token["type"] == "EndTag" and token["name"] == in_skip_tag:
                    in_skip_tag = None

            elif not in_a and not in_skip_tag and token["type"] == "Characters":
                new_stream = iter([token])
                if self.parse_email:
                    new_stream = self.handle_email_addresses(new_stream)

                new_stream = self.handle_links(new_stream)

                for new_token in new_stream:
                    yield from self.extract_entities(new_token)

                # We've already yielded this token, so continue
                continue

            yield token
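

# Illustrative sketch (not part of the module): using LinkifyFilter directly in
# an html5lib-style pipeline, mirroring how Linker.linkify wires it up above.
#
#     parser = html5lib_shim.BleachHTMLParser(
#         tags=frozenset(html5lib_shim.HTML_TAGS),
#         strip=False,
#         consume_entities=False,
#         namespaceHTMLElements=False,
#     )
#     walker = html5lib_shim.getTreeWalker("etree")
#     serializer = html5lib_shim.BleachHTMLSerializer(
#         quote_attr_values="always",
#         omit_optional_tags=False,
#         resolve_entities=False,
#         sanitize=False,
#         alphabetical_attributes=False,
#     )
#     dom = parser.parseFragment("ping http://example.com")
#     print(serializer.render(LinkifyFilter(source=walker(dom))))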