# bleach/linkifier.py
import re

from urllib.parse import quote

from bleach import callbacks as linkify_callbacks
from bleach import html5lib_shim


#: List of default callbacks
DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]


TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
       xn xxx ye yt yu za zm zw""".split()

# Make sure that .com doesn't get matched by .co first
TLDS.reverse()


def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
    """Builds the url regex used by linkifier

    If you want a different set of tlds or allowed protocols, pass those in
    and stomp on the existing ``url_re``::

        from bleach import linkifier

        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)

        linker = LinkifyFilter(url_re=my_url_re)

    """
    return re.compile(
        r"""\(*  # Match any opening parentheses.
        \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
        ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b  # xx.yy.tld(:##)?
        (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
            # /path/zz (excluding "unsafe" chars from RFC 1738,
            # except for # and ~, which happen in practice)
        """.format(
            "|".join(sorted(protocols)), "|".join(sorted(tlds))
        ),
        re.IGNORECASE | re.VERBOSE | re.UNICODE,
    )
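
# Illustrative sketch (not part of the module): build_url_re can be used to
# narrow what gets linkified. The TLD and protocol values below are made up
# for the example.
#
#     my_url_re = build_url_re(tlds=["com", "org"], protocols=["http", "https"])
#     linker = Linker(url_re=my_url_re)
#     linker.linkify("docs live at example.com")  # only .com/.org hosts match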


URL_RE = build_url_re()

PROTO_RE = re.compile(r"^[\w-]+:/{0,3}", re.IGNORECASE)


def build_email_re(tlds=TLDS):
    """Builds the email regex used by linkifier

    If you want a different set of tlds, pass those in and stomp on the existing ``email_re``::

        from bleach import linkifier

        my_email_re = linkifier.build_email_re(my_tlds_list)

        linker = LinkifyFilter(email_re=my_email_re)

    """
    # open and closing braces doubled below for format string
    return re.compile(
        r"""(?<!//)
        (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
            (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)*  # dot-atom
        |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
            |\\[\001-\011\013\014\016-\177])*"  # quoted-string
        )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0}))  # domain
        """.format(
            "|".join(tlds)
        ),
        re.IGNORECASE | re.MULTILINE | re.VERBOSE,
    )
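
# Illustrative sketch (not part of the module): the same pattern for email
# matching--pass a restricted tld list and enable parse_email on the Linker.
# The names here are hypothetical.
#
#     my_email_re = build_email_re(tlds=["com", "net"])
#     linker = Linker(parse_email=True, email_re=my_email_re)
#     linker.linkify("mail me at jane@example.com")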


EMAIL_RE = build_email_re()


class Linker:
    """Convert URL-like strings in an HTML fragment to links

    This converts strings that look like URLs, domain names, and email
    addresses in text that may be an HTML fragment into links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify takes a best-effort approach and tries to recover from bad
    situations due to crazy text.

    """

    def __init__(
        self,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
        recognized_tags=html5lib_shim.HTML_TAGS,
    ):
        """Creates a Linker instance

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg set skip_tags: set of tags that you don't want to linkify the
            contents of; for example, you could set this to ``{'pre'}`` to skip
            linkifying contents of ``pre`` tags; ``None`` means you don't
            want linkify to skip any tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg url_re: url matching regex

        :arg email_re: email matching regex

        :arg set recognized_tags: the set of tags that linkify knows about;
            everything else gets escaped

        :returns: linkified text as unicode

        """
        self.callbacks = callbacks
        self.skip_tags = skip_tags
        self.parse_email = parse_email
        self.url_re = url_re
        self.email_re = email_re

        # Create a parser/tokenizer that allows all HTML tags and escapes
        # anything not in that list.
        self.parser = html5lib_shim.BleachHTMLParser(
            tags=frozenset(recognized_tags),
            strip=False,
            consume_entities=False,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker("etree")
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values="always",
            omit_optional_tags=False,
            # We want to leave entities as they are without escaping or
            # resolving or expanding
            resolve_entities=False,
            # linkify does not sanitize
            sanitize=False,
            # linkify preserves attr order
            alphabetical_attributes=False,
        )

    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, str):
            raise TypeError("argument must be of text type")

        if not text:
            return ""

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)
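
# Illustrative sketch (not part of the module): typical Linker usage. With the
# default callbacks, new and existing links get rel="nofollow" added.
#
#     linker = Linker()
#     linker.linkify("see http://example.com for details")
#     # the url is wrapped in <a href="http://example.com" rel="nofollow">...</a>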


class LinkifyFilter(html5lib_shim.Filter):
    """html5lib filter that linkifies text

    This will do the following:

    * convert email addresses into links
    * convert urls into links
    * edit existing links by running them through callbacks--the default is to
      add a ``rel="nofollow"``

    This filter can be used anywhere html5lib filters can be used.

    """

    def __init__(
        self,
        source,
        callbacks=DEFAULT_CALLBACKS,
        skip_tags=None,
        parse_email=False,
        url_re=URL_RE,
        email_re=EMAIL_RE,
    ):
        """Creates a LinkifyFilter instance

        :arg source: stream as an html5lib TreeWalker

        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

        :arg set skip_tags: set of tags that you don't want to linkify the
            contents of; for example, you could set this to ``{'pre'}`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses

        :arg url_re: url matching regex

        :arg email_re: email matching regex

        """
        super().__init__(source)

        self.callbacks = callbacks or []
        self.skip_tags = skip_tags or {}
        self.parse_email = parse_email

        self.url_re = url_re
        self.email_re = email_re

    def apply_callbacks(self, attrs, is_new):
        """Given an attrs dict and an is_new bool, runs through callbacks

        Callbacks can return an adjusted attrs dict or ``None``. In the case of
        ``None``, we stop going through callbacks and return that and the link
        gets dropped.

        :arg dict attrs: map of ``(namespace, name)`` -> ``value``

        :arg bool is_new: whether or not this link was added by linkify

        :returns: adjusted attrs dict or ``None``

        """
        for cb in self.callbacks:
            attrs = cb(attrs, is_new)
            if attrs is None:
                return None
        return attrs
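
    # Illustrative sketch (not part of the module): the callback contract that
    # apply_callbacks expects. A callback receives (attrs, is_new) and returns
    # an adjusted attrs dict, or None to drop the link. "target_blank" below is
    # a hypothetical callback written for this example.
    #
    #     def target_blank(attrs, is_new):
    #         attrs[(None, "target")] = "_blank"
    #         return attrs
    #
    #     Linker(callbacks=[*DEFAULT_CALLBACKS, target_blank])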

    def extract_character_data(self, token_list):
        """Extracts and squashes character sequences in a token stream"""
        # FIXME(willkg): This is a terrible idea. What it does is drop all the
        # tags from the token list and merge the Characters and SpaceCharacters
        # tokens into a single text.
        #
        # So something like this::
        #
        #     "<span>" "<b>" "some text" "</b>" "</span>"
        #
        # gets converted to "some text".
        #
        # This gets used to figure out the ``_text`` fauxttribute value for
        # linkify callables.
        #
        # I'm not really sure how else to support that ``_text`` fauxttribute and
        # maintain some modicum of backwards compatibility with previous versions
        # of Bleach.

        out = []
        for token in token_list:
            token_type = token["type"]
            if token_type in ["Characters", "SpaceCharacters"]:
                out.append(token["data"])

        return "".join(out)
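
    # Illustrative sketch (not part of the module): what extract_character_data
    # does with a buffered token list--tags are dropped, character data is joined.
    #
    #     tokens = [
    #         {"type": "StartTag", "name": "b", "data": {}},
    #         {"type": "Characters", "data": "some text"},
    #         {"type": "EndTag", "name": "b"},
    #     ]
    #     self.extract_character_data(tokens)  # -> "some text"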

    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {"type": "Characters", "data": text[end : match.start()]}
                        )

                    # URL-encode the "local-part" according to RFC6068
                    parts = match.group(0).split("@")
                    parts[0] = quote(parts[0])
                    address = "@".join(parts)

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, "href"): "mailto:%s" % address,
                        "_text": match.group(0),
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {"type": "Characters", "data": match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop("_text", "")
                        new_tokens.extend(
                            [
                                {"type": "StartTag", "name": "a", "data": attrs},
                                {"type": "Characters", "data": str(_text)},
                                {"type": "EndTag", "name": "a"},
                            ]
                        )
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({"type": "Characters", "data": text[end:]})

                    yield from new_tokens

                    continue

            yield token

    def strip_non_url_bits(self, fragment):
        """Strips non-url bits from the url

        This accounts for over-eager matching by the regex.

        """
        prefix = suffix = ""

        while fragment:
            # Try removing ( from the beginning and, if it's balanced, from the
            # end, too
            if fragment.startswith("("):
                prefix = prefix + "("
                fragment = fragment[1:]

                if fragment.endswith(")"):
                    suffix = ")" + suffix
                    fragment = fragment[:-1]
                continue

            # Now try extraneous things from the end. For example, sometimes we
            # pick up ) at the end of a url, but the url is in a parenthesized
            # phrase like:
            #
            #     "i looked at the site (at http://example.com)"

            if fragment.endswith(")") and "(" not in fragment:
                fragment = fragment[:-1]
                suffix = ")" + suffix
                continue

            # Handle commas
            if fragment.endswith(","):
                fragment = fragment[:-1]
                suffix = "," + suffix
                continue

            # Handle periods
            if fragment.endswith("."):
                fragment = fragment[:-1]
                suffix = "." + suffix
                continue

            # Nothing matched, so we're done
            break

        return fragment, prefix, suffix
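
    # Illustrative sketch (not part of the module): how strip_non_url_bits
    # splits an over-matched url into (fragment, prefix, suffix).
    #
    #     self.strip_non_url_bits("(http://example.com),")
    #     # -> ("http://example.com", "(", "),")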

    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        in_a = False  # True only if parse_email=True and an email was already linkified
        for token in src_iter:
            if in_a:
                if token["type"] == "EndTag" and token["name"] == "a":
                    in_a = False
                yield token
                continue
            elif token["type"] == "StartTag" and token["name"] == "a":
                in_a = True
                yield token
                continue
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {"type": "Characters", "data": text[end : match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ""

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = "http://%s" % url

                    attrs = {(None, "href"): href, "_text": url}
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {"type": "Characters", "data": prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append({"type": "Characters", "data": prefix})

                        _text = attrs.pop("_text", "")
                        new_tokens.extend(
                            [
                                {"type": "StartTag", "name": "a", "data": attrs},
                                {"type": "Characters", "data": str(_text)},
                                {"type": "EndTag", "name": "a"},
                            ]
                        )

                        if suffix:
                            new_tokens.append({"type": "Characters", "data": suffix})

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({"type": "Characters", "data": text[end:]})

                    yield from new_tokens

                    continue

            yield token

    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        """
        a_token = token_buffer[0]
        if a_token["data"]:
            attrs = a_token["data"]
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs["_text"] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {"type": "Characters", "data": text}

        else:
            new_text = attrs.pop("_text", "")
            a_token["data"] = attrs

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                yield from token_buffer[1:]

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # them with the new text
                yield a_token
                yield {"type": "Characters", "data": str(new_text)}
                yield token_buffer[-1]

    def extract_entities(self, token):
        """Handles Characters tokens with entities

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: generator of tokens

        """
        data = token.get("data", "")

        # If there isn't a & in the data, we can return now
        if "&" not in data:
            yield token
            return

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith("&"):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == "amp":
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with &amp; since that shows up in
                        # querystrings all the time. This special-cases &amp;
                        # and converts it to a & and sticks it in as a
                        # Characters token. It'll get merged with surrounding
                        # tokens in the BleachSanitizerFilter.__iter__ and
                        # escaped in the serializer.
                        new_tokens.append({"type": "Characters", "data": "&"})
                    else:
                        new_tokens.append({"type": "Entity", "name": entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_tokens.append({"type": "Characters", "data": remainder})
                    continue

            new_tokens.append({"type": "Characters", "data": part})

        yield from new_tokens

    def __iter__(self):
        in_a = False
        in_skip_tag = None

        token_buffer = []

        for token in super().__iter__():
            if in_a:
                # Handle the case where we're in an "a" tag--we want to buffer tokens
                # until we hit an end "a" tag.
                if token["type"] == "EndTag" and token["name"] == "a":
                    # Add the end tag to the token buffer and then handle them
                    # and yield anything returned
                    token_buffer.append(token)
                    yield from self.handle_a_tag(token_buffer)

                    # Clear "a" related state and continue since we've yielded all
                    # the tokens we're going to yield
                    in_a = False
                    token_buffer = []
                else:
                    token_buffer.append(token)
                continue

            if token["type"] in ["StartTag", "EmptyTag"]:
                if token["name"] in self.skip_tags:
                    # Skip tags start a "special mode" where we don't linkify
                    # anything until the end tag.
                    in_skip_tag = token["name"]

                elif token["name"] == "a":
                    # The "a" tag is special--we switch to a slurp mode and
                    # slurp all the tokens until the end "a" tag and then
                    # figure out what to do with them there.
                    in_a = True
                    token_buffer.append(token)

                    # We buffer the start tag, so we don't want to yield it,
                    # yet
                    continue

            elif in_skip_tag and self.skip_tags:
                # NOTE(willkg): We put this clause here since in_a and
                # switching in and out of in_a takes precedence.
                if token["type"] == "EndTag" and token["name"] == in_skip_tag:
                    in_skip_tag = None

            elif not in_a and not in_skip_tag and token["type"] == "Characters":
                new_stream = iter([token])
                if self.parse_email:
                    new_stream = self.handle_email_addresses(new_stream)

                new_stream = self.handle_links(new_stream)

                for new_token in new_stream:
                    yield from self.extract_entities(new_token)

                # We've already yielded this token, so continue
                continue

            yield token
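

# Illustrative sketch (not part of the module): using LinkifyFilter directly in
# an html5lib-style pipeline, mirroring how Linker.linkify wires it up above.
#
#     parser = html5lib_shim.BleachHTMLParser(
#         tags=frozenset(html5lib_shim.HTML_TAGS),
#         strip=False,
#         consume_entities=False,
#         namespaceHTMLElements=False,
#     )
#     walker = html5lib_shim.getTreeWalker("etree")
#     serializer = html5lib_shim.BleachHTMLSerializer(
#         quote_attr_values="always",
#         omit_optional_tags=False,
#         resolve_entities=False,
#         sanitize=False,
#         alphabetical_attributes=False,
#     )
#     dom = parser.parseFragment("ping http://example.com")
#     print(serializer.render(LinkifyFilter(source=walker(dom))))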