Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bleach/linkifier.py: 13% (215 statements)
coverage.py v7.2.7, created at 2023-06-03 06:10 +0000

import re

from urllib.parse import quote

from bleach import callbacks as linkify_callbacks
from bleach import html5lib_shim


#: List of default callbacks
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]


TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
       xn xxx ye yt yu za zm zw""".split()

# Make sure that .com doesn't get matched by .co first
TLDS.reverse()


def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
    """Builds the url regex used by linkifier

    If you want a different set of tlds or allowed protocols, pass those in
    and stomp on the existing ``url_re``::

        from bleach import linkifier

        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

        linker = LinkifyFilter(url_re=my_url_re)

    """
    return re.compile(
        r"""\(*  # Match any opening parentheses.
        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b  # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
            # /path/zz (excluding "unsafe" chars from RFC 1738,
            # except for # and ~, which happen in practice)
        """.format(
            "|".join(sorted(protocols)), "|".join(sorted(tlds))
        ),
        re.IGNORECASE | re.VERBOSE | re.UNICODE,
    )


URL_RE = build_url_re()
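# A hedged usage sketch (not part of upstream bleach): build_url_re() can be
# called with a narrower TLD/protocol set and the result handed to Linker or
# LinkifyFilter via ``url_re``. The values below are arbitrary examples.
#
#     narrow_url_re = build_url_re(tlds=["com", "net"], protocols=["https"])
#     Linker(url_re=narrow_url_re).linkify("see https://example.com")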

PROTO_RE = re.compile(r"^[\w-]+:/{0,3}", re.IGNORECASE)


def build_email_re(tlds=TLDS):
    """Builds the email regex used by linkifier

    If you want a different set of tlds, pass those in and stomp on the existing ``email_re``::

        from bleach import linkifier

        my_email_re = linkifier.build_email_re(my_tlds_list)

        linker = LinkifyFilter(email_re=my_email_re)

    """
    # open and closing braces doubled below for format string
    return re.compile(
        r"""(?<!//)
        (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
            (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)*  # dot-atom
        |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
            |\\[\001-\011\013\014\016-\177])*"  # quoted-string
        )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0}))  # domain
        """.format(
            "|".join(tlds)
        ),
        re.IGNORECASE | re.MULTILINE | re.VERBOSE,
    )


EMAIL_RE = build_email_re()
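# A hedged usage sketch (not part of upstream bleach): email linkification is
# opt-in via ``parse_email=True``; a custom regex from build_email_re() is
# passed the same way as ``url_re``.
#
#     Linker(parse_email=True).linkify("mail jane.doe@example.com")
#     # -> the address is wrapped in <a href="mailto:jane.doe@example.com">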

class Linker:
    """Convert URL-like strings in an HTML fragment to links

    This function converts strings that look like URLs, domain names and email
    addresses in text that may be an HTML fragment to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify does a best-effort approach and tries to recover from bad
    situations due to crazy text.

    """

    def __init__(
        self,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
        recognized_tags=html5lib_shim.HTML_TAGS,
    ):
        """Creates a Linker instance

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg set skip_tags: set of tags that you don't want to linkify the
            contents of; for example, you could set this to ``{'pre'}`` to skip
            linkifying contents of ``pre`` tags; ``None`` means you don't
            want linkify to skip any tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg url_re: url matching regex

        :arg email_re: email matching regex

        :arg set recognized_tags: the set of tags that linkify knows about;
            everything else gets escaped

        :returns: linkified text as unicode

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # Create a parser/tokenizer that allows all HTML tags and escapes
        # anything not in that list.
        self.parser = html5lib_shim.BleachHTMLParser(
            tags=frozenset(recognized_tags),
            strip=False,
            consume_entities=False,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker("etree")
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values="always",
            omit_optional_tags=False,
            # We want to leave entities as they are without escaping or
            # resolving or expanding
            resolve_entities=False,
            # linkify does not sanitize
            sanitize=False,
            # linkify preserves attr order
            alphabetical_attributes=False,
        )

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, str):
            raise TypeError("argument must be of text type")

        if not text:
            return ""

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)
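# A hedged usage sketch (not part of upstream bleach) of the Linker knobs
# documented above; the skip_tags value is an arbitrary example.
#
#     linker = Linker(skip_tags={"pre"}, parse_email=True)
#     linker.linkify("docs at example.com <pre>not.linkified.example.com</pre>")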

class LinkifyFilter(html5lib_shim.Filter):
    """html5lib filter that linkifies text

    This will do the following:

    * convert email addresses into links
    * convert urls into links
    * edit existing links by running them through callbacks--the default is to
      add a ``rel="nofollow"``

    This filter can be used anywhere html5lib filters can be used.

    """
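    # A hedged sketch (not part of upstream bleach) of standalone use: the
    # filter slots into an html5lib-style parse/walk/serialize pipeline, which
    # is what Linker.linkify() wires up above. ``parser``, ``walker`` and
    # ``serializer`` are assumed to be configured as in Linker.__init__.
    #
    #     dom = parser.parseFragment(text)
    #     filtered = LinkifyFilter(source=walker(dom), parse_email=True)
    #     result = serializer.render(filtered)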

    def __init__(
        self,
        source,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
    ):
        """Creates a LinkifyFilter instance

        :arg source: stream as an html5lib TreeWalker

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg set skip_tags: set of tags that you don't want to linkify the
            contents of; for example, you could set this to ``{'pre'}`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg url_re: url matching regex

        :arg email_re: email matching regex

        """
        super().__init__(source)

        self.callbacks = callbacks or []
        self.skip_tags = skip_tags or {}
        self.parse_email = parse_email

        self.url_re = url_re
        self.email_re = email_re

    def apply_callbacks(self, attrs, is_new):
        """Given an attrs dict and an is_new bool, runs through callbacks

        Callbacks can return an adjusted attrs dict or ``None``. In the case of
        ``None``, we stop going through callbacks and return that and the link
        gets dropped.

        :arg dict attrs: map of ``(namespace, name)`` -> ``value``

        :arg bool is_new: whether or not this link was added by linkify

        :returns: adjusted attrs dict or ``None``

        """
        for cb in self.callbacks:
            attrs = cb(attrs, is_new)
            if attrs is None:
                return None
        return attrs
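    # A hedged sketch (not part of upstream bleach) of the callback protocol
    # enforced above: each callback gets the attrs dict (keys are
    # ``(namespace, name)`` tuples plus the ``_text`` fauxttribute) and the
    # is_new flag, and returns an adjusted dict, or None to drop the link.
    #
    #     def drop_new_links(attrs, new=False):
    #         return None if new else attrs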

    def extract_character_data(self, token_list):
        """Extracts and squashes character sequences in a token stream"""
        # FIXME(willkg): This is a terrible idea. What it does is drop all the
        # tags from the token list and merge the Characters and SpaceCharacters
        # tokens into a single text.
        #
        # So something like this::
        #
        #     "<span>" "<b>" "some text" "</b>" "</span>"
        #
        # gets converted to "some text".
        #
        # This gets used to figure out the ``_text`` fauxttribute value for
        # linkify callables.
        #
        # I'm not really sure how else to support that ``_text`` fauxttribute and
        # maintain some modicum of backwards compatibility with previous versions
        # of Bleach.

        out = []
        for token in token_list:
            token_type = token["type"]
            if token_type in ["Characters", "SpaceCharacters"]:
                out.append(token["data"])

        return "".join(out)

    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {"type": "Characters", "data": text[end : match.start()]}
                        )

                    # URL-encode the "local-part" according to RFC6068
                    parts = match.group(0).split("@")
                    parts[0] = quote(parts[0])
                    address = "@".join(parts)

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, "href"): "mailto:%s" % address,
                        "_text": match.group(0),
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {"type": "Characters", "data": match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop("_text", "")
                        new_tokens.extend(
                            [
                                {"type": "StartTag", "name": "a", "data": attrs},
                                {"type": "Characters", "data": str(_text)},
                                {"type": "EndTag", "name": "a"},
                            ]
                        )
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({"type": "Characters", "data": text[end:]})

                    yield from new_tokens

                    continue

            yield token

    def strip_non_url_bits(self, fragment):
        """Strips non-url bits from the url

        This accounts for over-eager matching by the regex.

        """
        prefix = suffix = ""

        while fragment:
            # Try removing ( from the beginning and, if it's balanced, from the
            # end, too
            if fragment.startswith("("):
                prefix = prefix + "("
                fragment = fragment[1:]

                if fragment.endswith(")"):
                    suffix = ")" + suffix
                    fragment = fragment[:-1]
                continue

            # Now try extraneous things from the end. For example, sometimes we
            # pick up ) at the end of a url, but the url is in a parenthesized
            # phrase like:
            #
            #     "i looked at the site (at http://example.com)"

            if fragment.endswith(")") and "(" not in fragment:
                fragment = fragment[:-1]
                suffix = ")" + suffix
                continue

            # Handle commas
            if fragment.endswith(","):
                fragment = fragment[:-1]
                suffix = "," + suffix
                continue

            # Handle periods
            if fragment.endswith("."):
                fragment = fragment[:-1]
                suffix = "." + suffix
                continue

            # Nothing matched, so we're done
            break

        return fragment, prefix, suffix
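    # Worked example (traced against the loop above, not an upstream doctest):
    #
    #     self.strip_non_url_bits("(http://example.com),")
    #     # -> ("http://example.com", "(", "),")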

    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        in_a = False  # happens, if parse_email=True and if a mail was found
        for token in src_iter:
            if in_a:
                if token["type"] == "EndTag" and token["name"] == "a":
                    in_a = False
                yield token
                continue
            elif token["type"] == "StartTag" and token["name"] == "a":
                in_a = True
                yield token
                continue
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {"type": "Characters", "data": text[end : match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ""

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = "http://%s" % url

                    attrs = {(None, "href"): href, "_text": url}
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {"type": "Characters", "data": prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append({"type": "Characters", "data": prefix})

                        _text = attrs.pop("_text", "")
                        new_tokens.extend(
                            [
                                {"type": "StartTag", "name": "a", "data": attrs},
                                {"type": "Characters", "data": str(_text)},
                                {"type": "EndTag", "name": "a"},
                            ]
                        )

                        if suffix:
                            new_tokens.append({"type": "Characters", "data": suffix})

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({"type": "Characters", "data": text[end:]})

                    yield from new_tokens

                    continue

            yield token

    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        """
        a_token = token_buffer[0]
        if a_token["data"]:
            attrs = a_token["data"]
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs["_text"] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {"type": "Characters", "data": text}

        else:
            new_text = attrs.pop("_text", "")
            a_token["data"] = attrs

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                yield from token_buffer[1:]

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # it with the new text
                yield a_token
                yield {"type": "Characters", "data": str(new_text)}
                yield token_buffer[-1]

    def extract_entities(self, token):
        """Handles Characters tokens with entities

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: generator of tokens

        """
        data = token.get("data", "")

        # If there isn't a & in the data, we can return now
        if "&" not in data:
            yield token
            return

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith("&"):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == "amp":
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with &amp; since that shows up in
                        # querystrings all the time. This special-cases &amp;
                        # and converts it to a & and sticks it in as a
                        # Characters token. It'll get merged with surrounding
                        # tokens in the BleachSanitizerfilter.__iter__ and
                        # escaped in the serializer.
                        new_tokens.append({"type": "Characters", "data": "&"})
                    else:
                        new_tokens.append({"type": "Entity", "name": entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_tokens.append({"type": "Characters", "data": remainder})
                    continue

            new_tokens.append({"type": "Characters", "data": part})

        yield from new_tokens

    def __iter__(self):
        in_a = False
        in_skip_tag = None

        token_buffer = []

        for token in super().__iter__():
            if in_a:
                # Handle the case where we're in an "a" tag--we want to buffer tokens
                # until we hit an end "a" tag.
                if token["type"] == "EndTag" and token["name"] == "a":
                    # Add the end tag to the token buffer and then handle them
                    # and yield anything returned
                    token_buffer.append(token)
                    yield from self.handle_a_tag(token_buffer)

                    # Clear "a" related state and continue since we've yielded all
                    # the tokens we're going to yield
                    in_a = False
                    token_buffer = []
                else:
                    token_buffer.append(token)
                continue

            if token["type"] in ["StartTag", "EmptyTag"]:
                if token["name"] in self.skip_tags:
                    # Skip tags start a "special mode" where we don't linkify
                    # anything until the end tag.
                    in_skip_tag = token["name"]

                elif token["name"] == "a":
                    # The "a" tag is special--we switch to a slurp mode and
                    # slurp all the tokens until the end "a" tag and then
                    # figure out what to do with them there.
                    in_a = True
                    token_buffer.append(token)

                    # We buffer the start tag, so we don't want to yield it,
                    # yet
                    continue

            elif in_skip_tag and self.skip_tags:
                # NOTE(willkg): We put this clause here since in_a and
                # switching in and out of in_a takes precedence.
                if token["type"] == "EndTag" and token["name"] == in_skip_tag:
                    in_skip_tag = None

            elif not in_a and not in_skip_tag and token["type"] == "Characters":
                new_stream = iter([token])
                if self.parse_email:
                    new_stream = self.handle_email_addresses(new_stream)

                new_stream = self.handle_links(new_stream)

                for new_token in new_stream:
                    yield from self.extract_entities(new_token)

                # We've already yielded this token, so continue
                continue

            yield token
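# Minimal runnable sketch (not part of upstream bleach): exercises the public
# pieces defined in this module. Input strings are arbitrary examples.
if __name__ == "__main__":
    linker = Linker(parse_email=True)
    print(linker.linkify("Docs live at example.com (see http://example.com/path)."))
    print(linker.linkify("Questions? Mail admin@example.com."))

    # A narrower URL matcher: only https links with listed TLDs get linkified.
    narrow_url_re = build_url_re(tlds=["com", "net"], protocols=["https"])
    print(Linker(url_re=narrow_url_re).linkify("https://example.com vs http://example.org"))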