Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bleach/linkifier.py: 13%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

215 statements  

1import re 

2 

3from urllib.parse import quote 

4 

5from bleach import callbacks as linkify_callbacks 

6from bleach import html5lib_shim 

7 

8 

#: List of default callbacks. ``Linker`` and ``LinkifyFilter`` use this list
#: when the caller does not pass ``callbacks``; out of the box it contains only
#: ``linkify_callbacks.nofollow``.
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]

11 

12 

# Known top-level domains. The literal below is written in alphabetical order
# and then reversed, so that a longer TLD such as ``com`` is listed before a
# shorter one that is its prefix such as ``co``.
TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
       xn xxx ye yt yu za zm zw""".split()[::-1]

29 

30 

def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
    """Build the URL-matching regex used by the linkifier

    Call this with your own TLDs and/or protocols when the defaults do not
    fit, then hand the result to the filter::

        from bleach import linkifier

        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

        linker = LinkifyFilter(url_re=my_url_re)

    :arg tlds: iterable of top-level domains to recognize; defaults to ``TLDS``

    :arg protocols: iterable of URL schemes to recognize; defaults to
        ``html5lib_shim.allowed_protocols``

    :returns: compiled, case-insensitive, verbose regex

    """
    # Both alternations are sorted so the generated pattern does not depend on
    # the iteration order of the arguments.
    protocol_alternation = "|".join(sorted(protocols))
    tld_alternation = "|".join(sorted(tlds))

    # Literal braces are doubled because the template goes through str.format.
    template = r"""\(*  # Match any opening parentheses.
        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b   # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^`<>"]*)?
            # /path/zz (excluding "unsafe" chars from RFC 3986,
            # except for # and ~, which happen in practice)
        """
    flags = re.IGNORECASE | re.VERBOSE | re.UNICODE

    return re.compile(template.format(protocol_alternation, tld_alternation), flags)

54 

55 

#: Default URL-matching regex, built with the default TLDs and protocols.
URL_RE = build_url_re()


#: Matches a leading scheme at the start of a string: word characters or
#: hyphens, a colon, then zero to three slashes (e.g. ``http://``). Used by
#: ``LinkifyFilter.handle_links`` to decide whether a matched URL already has a
#: protocol.
PROTO_RE = re.compile(r"^[\w-]+:/{0,3}", re.IGNORECASE)

60 

61 

def build_email_re(tlds=TLDS):
    """Build the email-matching regex used by the linkifier

    Call this with your own TLDs when the defaults do not fit, then hand the
    result to the filter::

        from bleach import linkifier

        my_email_re = linkifier.build_email_re(my_tlds_list)

        linker = LinkifyFilter(email_re=my_email_re)

    :arg tlds: iterable of top-level domains to recognize; defaults to
        ``TLDS``. The order given is the order used in the pattern.

    :returns: compiled, case-insensitive, multiline, verbose regex

    """
    tld_alternation = "|".join(tlds)

    # Literal braces are doubled because the template goes through str.format.
    template = r"""(?<!//)
        (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
            (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)*  # dot-atom
        |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
            |\\[\001-\011\013\014\016-\177])*"  # quoted-string
        )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0}))  # domain
        """
    flags = re.IGNORECASE | re.MULTILINE | re.VERBOSE

    return re.compile(template.format(tld_alternation), flags)

85 

86 

#: Default email-matching regex, built with the default TLDs.
EMAIL_RE = build_email_re()

88 

89 

class Linker:
    """Convert URL-like strings in an HTML fragment to links

    Instances turn strings that look like URLs, domain names and email
    addresses, found in text that may be an HTML fragment, into links while
    preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify does a best-effort approach and tries to recover from bad
    situations due to crazy text.

    """

    def __init__(
        self,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
        recognized_tags=html5lib_shim.HTML_TAGS,
    ):
        """Creates a Linker instance

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg set skip_tags: set of tags that you don't want to linkify the
            contents of; for example, you could set this to ``{'pre'}`` to skip
            linkifying contents of ``pre`` tags; ``None`` means you don't
            want linkify to skip any tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg url_re: url matching regex

        :arg email_re: email matching regex

        :arg set recognized_tags: the set of tags that linkify knows about;
            everything else gets escaped

        """
        # Settings forwarded unchanged to every LinkifyFilter built by linkify()
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # The parser/tokenizer accepts every recognized tag and escapes
        # anything that is not in that set.
        parser_options = {
            "tags": frozenset(recognized_tags),
            "strip": False,
            "consume_entities": False,
            "namespaceHTMLElements": False,
        }
        self.parser = html5lib_shim.BleachHTMLParser(**parser_options)

        self.walker = html5lib_shim.getTreeWalker("etree")

        serializer_options = {
            "quote_attr_values": "always",
            "omit_optional_tags": False,
            # Entities are left exactly as they are: no escaping, resolving
            # or expanding
            "resolve_entities": False,
            # linkify does not sanitize
            "sanitize": False,
            # linkify preserves attr order
            "alphabetical_attributes": False,
        }
        self.serializer = html5lib_shim.BleachHTMLSerializer(**serializer_options)

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, str):
            raise TypeError("argument must be of text type")

        if not text:
            return ""

        # Parse, walk the resulting tree through the linkifying filter, then
        # serialize the filtered token stream back to text.
        token_stream = LinkifyFilter(
            source=self.walker(self.parser.parseFragment(text)),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(token_stream)

189 

190 

class LinkifyFilter(html5lib_shim.Filter):
    """html5lib filter that linkifies text

    This will do the following:

    * convert email addresses into links
    * convert urls into links
    * edit existing links by running them through callbacks--the default is to
      add a ``rel="nofollow"``

    This filter can be used anywhere html5lib filters can be used.

    """

    def __init__(
        self,
        source,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
    ):
        """Creates a LinkifyFilter instance

        :arg source: stream as an html5lib TreeWalker

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg set skip_tags: set of tags that you don't want to linkify the
            contents of; for example, you could set this to ``{'pre'}`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg url_re: url matching regex

        :arg email_re: email matching regex

        """
        super().__init__(source)

        # Falsy values (``None``, empty containers) are replaced with empty
        # containers so the loops and membership tests below never fail.
        self.callbacks = callbacks or []
        self.skip_tags = skip_tags or {}
        self.parse_email = parse_email

        self.url_re = url_re
        self.email_re = email_re

    def apply_callbacks(self, attrs, is_new):
        """Given an attrs dict and an is_new bool, runs through callbacks

        Callbacks can return an adjusted attrs dict or ``None``. In the case of
        ``None``, we stop going through callbacks and return that and the link
        gets dropped.

        :arg dict attrs: map of ``(namespace, name)`` -> ``value``

        :arg bool is_new: whether or not this link was added by linkify

        :returns: adjusted attrs dict or ``None``

        """
        for cb in self.callbacks:
            attrs = cb(attrs, is_new)
            if attrs is None:
                return None
        return attrs

    def extract_character_data(self, token_list):
        """Extracts and squashes character sequences in a token stream

        :arg token_list: iterable of token dicts

        :returns: the ``data`` of every ``Characters`` and ``SpaceCharacters``
            token, concatenated in order

        """
        # FIXME(willkg): This is a terrible idea. What it does is drop all the
        # tags from the token list and merge the Characters and SpaceCharacters
        # tokens into a single text.
        #
        # So something like this::
        #
        #     "<span>" "<b>" "some text" "</b>" "</span>"
        #
        # gets converted to "some text".
        #
        # This gets used to figure out the ``_text`` fauxttribute value for
        # linkify callables.
        #
        # I'm not really sure how else to support that ``_text`` fauxttribute and
        # maintain some modicum of backwards compatibility with previous versions
        # of Bleach.
        return "".join(
            token["data"]
            for token in token_list
            if token["type"] in ("Characters", "SpaceCharacters")
        )

    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens

        Every ``Characters`` token is scanned with ``self.email_re``. Each
        match is offered to the callbacks as a new ``mailto:`` link; if they
        return ``None`` the matched text is emitted as plain characters,
        otherwise it is wrapped in an ``a`` start/end tag pair. All other
        tokens pass through untouched.

        :arg src_iter: iterable of token dicts

        :returns: generator of tokens

        """
        for token in src_iter:
            if token["type"] != "Characters":
                yield token
                continue

            text = token["data"]
            new_tokens = []
            end = 0

            # For each email address we find in the text
            for match in self.email_re.finditer(text):
                if match.start() > end:
                    new_tokens.append(
                        {"type": "Characters", "data": text[end : match.start()]}
                    )

                # URL-encode the "local-part" according to RFC6068.
                #
                # Split on the LAST "@" only: a quoted-string local part may
                # itself contain "@" (e.g. ``"a@b"@example.com``), and the
                # whole local part must be percent-encoded while the domain
                # is left alone.
                parts = match.group(0).rsplit("@", 1)
                parts[0] = quote(parts[0])
                address = "@".join(parts)

                # Run attributes through the callbacks to see what we
                # should do with this match
                attrs = {
                    (None, "href"): "mailto:%s" % address,
                    "_text": match.group(0),
                }
                attrs = self.apply_callbacks(attrs, True)

                if attrs is None:
                    # Just add the text--but not as a link
                    new_tokens.append({"type": "Characters", "data": match.group(0)})

                else:
                    # Add an "a" tag for the new link
                    _text = attrs.pop("_text", "")
                    new_tokens.extend(
                        [
                            {"type": "StartTag", "name": "a", "data": attrs},
                            {"type": "Characters", "data": str(_text)},
                            {"type": "EndTag", "name": "a"},
                        ]
                    )
                end = match.end()

            if not new_tokens:
                # Nothing matched, so the token goes out unchanged
                yield token
                continue

            # Yield the adjusted set of tokens, including any text that
            # follows the last match
            if end < len(text):
                new_tokens.append({"type": "Characters", "data": text[end:]})

            yield from new_tokens

    def strip_non_url_bits(self, fragment):
        """Strips non-url bits from the url

        This accounts for over-eager matching by the regex.

        :arg str fragment: the text matched by the url regex

        :returns: ``(fragment, prefix, suffix)`` where ``prefix`` and
            ``suffix`` are the characters removed from the front and back of
            the match, so that ``prefix + fragment + suffix`` is the original
            text

        """
        prefix = suffix = ""

        while fragment:
            # Try removing ( from the beginning and, if it's balanced, from the
            # end, too
            if fragment.startswith("("):
                prefix = prefix + "("
                fragment = fragment[1:]

                if fragment.endswith(")"):
                    suffix = ")" + suffix
                    fragment = fragment[:-1]
                continue

            # Now try extraneous things from the end. For example, sometimes we
            # pick up ) at the end of a url, but the url is in a parenthesized
            # phrase like:
            #
            #     "i looked at the site (at http://example.com)"

            if fragment.endswith(")") and "(" not in fragment:
                fragment = fragment[:-1]
                suffix = ")" + suffix
                continue

            # Handle trailing commas and periods
            if fragment.endswith((",", ".")):
                suffix = fragment[-1] + suffix
                fragment = fragment[:-1]
                continue

            # Nothing matched, so we're done
            break

        return fragment, prefix, suffix

    def handle_links(self, src_iter):
        """Handle links in character tokens

        Every ``Characters`` token that is not inside an ``a`` element is
        scanned with ``self.url_re``. Each match is offered to the callbacks
        as a new link; if they return ``None`` the matched text is emitted as
        plain characters, otherwise it is wrapped in an ``a`` start/end tag
        pair. All other tokens pass through untouched.

        :arg src_iter: iterable of token dicts

        :returns: generator of tokens

        """
        in_a = False  # happens, if parse_email=True and if a mail was found
        for token in src_iter:
            if in_a:
                if token["type"] == "EndTag" and token["name"] == "a":
                    in_a = False
                yield token
                continue
            elif token["type"] == "StartTag" and token["name"] == "a":
                in_a = True
                yield token
                continue

            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {"type": "Characters", "data": text[end : match.start()]}
                        )

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(match.group(0))

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = "http://%s" % url

                    attrs = {(None, "href"): href, "_text": url}
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {"type": "Characters", "data": prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append({"type": "Characters", "data": prefix})

                        _text = attrs.pop("_text", "")
                        new_tokens.extend(
                            [
                                {"type": "StartTag", "name": "a", "data": attrs},
                                {"type": "Characters", "data": str(_text)},
                                {"type": "EndTag", "name": "a"},
                            ]
                        )

                        if suffix:
                            new_tokens.append({"type": "Characters", "data": suffix})

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({"type": "Characters", "data": text[end:]})

                    yield from new_tokens

                    continue

            yield token

    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        :arg list token_buffer: the buffered tokens of one link; the first
            item is the ``a`` start tag and the last item is its end tag

        """
        a_token = token_buffer[0]
        attrs = a_token["data"] or {}
        text = self.extract_character_data(token_buffer)
        attrs["_text"] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {"type": "Characters", "data": text}
            return

        new_text = attrs.pop("_text", "")
        a_token["data"] = attrs

        yield a_token
        if text == new_text:
            # The callbacks didn't change the text, so after the new "a"
            # token we yield whatever else was there, including the end "a"
            # token
            yield from token_buffer[1:]

        else:
            # If the callbacks changed the text, then we're going to drop
            # all the tokens between the start and end "a" tags and replace
            # it with the new text
            yield {"type": "Characters", "data": str(new_text)}
            yield token_buffer[-1]

    def extract_entities(self, token):
        """Handles Characters tokens with entities

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: generator of tokens

        """
        data = token.get("data", "")

        # If there isn't a & in the data, we can return now
        if "&" not in data:
            yield token
            return

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith("&"):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == "amp":
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with &amp; since that shows up in
                        # querystrings all the time. This special-cases &amp;
                        # and converts it to a & and sticks it in as a
                        # Characters token. It'll get merged with surrounding
                        # tokens in the BleachSanitizerfilter.__iter__ and
                        # escaped in the serializer.
                        new_tokens.append({"type": "Characters", "data": "&"})
                    else:
                        new_tokens.append({"type": "Entity", "name": entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_tokens.append({"type": "Characters", "data": remainder})
                    continue

            new_tokens.append({"type": "Characters", "data": part})

        yield from new_tokens

    def __iter__(self):
        """Yield the linkified token stream

        Tokens of an existing ``a`` element are buffered until its end tag
        and then handed to ``handle_a_tag``. ``Characters`` tokens outside of
        links and skip tags are run through the email handler (when
        ``parse_email`` is set), the link handler and the entity extractor.
        Everything else is yielded unchanged.

        """
        in_a = False
        in_skip_tag = None

        token_buffer = []

        for token in super().__iter__():
            if in_a:
                # Handle the case where we're in an "a" tag--we want to buffer tokens
                # until we hit an end "a" tag.
                if token["type"] == "EndTag" and token["name"] == "a":
                    # Add the end tag to the token buffer and then handle them
                    # and yield anything returned
                    token_buffer.append(token)
                    yield from self.handle_a_tag(token_buffer)

                    # Clear "a" related state and continue since we've yielded all
                    # the tokens we're going to yield
                    in_a = False
                    token_buffer = []
                else:
                    token_buffer.extend(self.extract_entities(token))
                continue

            if token["type"] in ("StartTag", "EmptyTag"):
                if token["name"] in self.skip_tags:
                    # Skip tags start a "special mode" where we don't linkify
                    # anything until the end tag.
                    #
                    # NOTE(review): only one tag name is tracked, so a skip
                    # tag opened inside another skip tag replaces it and its
                    # end tag leaves skip mode early -- confirm whether that
                    # is intended.
                    in_skip_tag = token["name"]

                elif token["name"] == "a":
                    # The "a" tag is special--we switch to a slurp mode and
                    # slurp all the tokens until the end "a" tag and then
                    # figure out what to do with them there.
                    in_a = True
                    token_buffer.append(token)

                    # We buffer the start tag, so we don't want to yield it,
                    # yet
                    continue

            elif in_skip_tag and self.skip_tags:
                # NOTE(willkg): We put this clause here since in_a and
                # switching in and out of in_a takes precedence.
                if token["type"] == "EndTag" and token["name"] == in_skip_tag:
                    in_skip_tag = None

            elif not in_a and not in_skip_tag and token["type"] == "Characters":
                new_stream = iter([token])
                if self.parse_email:
                    new_stream = self.handle_email_addresses(new_stream)

                new_stream = self.handle_links(new_stream)

                for new_token in new_stream:
                    yield from self.extract_entities(new_token)

                # We've already yielded this token, so continue
                continue

            yield token