from itertools import chain
import re
import warnings

from xml.sax.saxutils import unescape

from bleach import html5lib_shim
from bleach import parse_shim


#: Set of allowed tags
ALLOWED_TAGS = frozenset(
    (
        "a",
        "abbr",
        "acronym",
        "b",
        "blockquote",
        "code",
        "em",
        "i",
        "li",
        "ol",
        "strong",
        "ul",
    )
)


#: Map of allowed attributes by tag
ALLOWED_ATTRIBUTES = {
    "a": ["href", "title"],
    "abbr": ["title"],
    "acronym": ["title"],
}

#: Set of allowed protocols
ALLOWED_PROTOCOLS = frozenset(("http", "https", "mailto"))

#: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
INVISIBLE_CHARACTERS = "".join(
    [chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))]
)

#: Regexp for characters that are invisible
INVISIBLE_CHARACTERS_RE = re.compile("[" + INVISIBLE_CHARACTERS + "]", re.UNICODE)

#: String to replace invisible characters with. This can be a character, a
#: string, or even a function that takes a Python re matchobj
INVISIBLE_REPLACEMENT_CHAR = "?"
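
# Since the replacement is passed to ``re.sub``, it can also be a function of
# the match object. A minimal sketch (purely illustrative, not a bleach
# default):
#
#     def _show_codepoint(match):
#         # Render each invisible character as its escaped hex codepoint
#         return f"\\x{ord(match.group(0)):02x}"
#
#     INVISIBLE_REPLACEMENT_CHAR = _show_codepoint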


class NoCssSanitizerWarning(UserWarning):
    pass


class Cleaner:
    """Cleaner for cleaning HTML fragments of malicious content

    This cleaner is a security-focused function whose sole purpose is to remove
    malicious content from a string such that it can be displayed as content in
    a web page.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    .. Note::

       This cleaner is not designed to transform content for use in
       non-web-page contexts.

    .. Warning::

       This cleaner is not thread-safe--the html parser has internal state.
       Create a separate cleaner per thread!

    """
    def __init__(
        self,
        tags=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        protocols=ALLOWED_PROTOCOLS,
        strip=False,
        strip_comments=True,
        filters=None,
        css_sanitizer=None,
    ):
        """Initializes a Cleaner

        :arg set tags: set of allowed tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed content through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Using filters changes the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output is secure.

        :arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
            sanitizing style attribute values and style text; defaults to None
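
        The ``attributes`` value can take a few shapes. An illustrative
        sketch (the tag/attribute choices here are examples, not defaults)::

            # list: these attribute names are allowed on any allowed tag
            attributes = ["href", "title"]

            # dict: per-tag allow-lists; "*" applies to every tag
            attributes = {"a": ["href", "title"], "*": ["class"]}

            # callable: receives tag, attribute name, and value
            def attributes(tag, name, value):
                return name == "title"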
123 """
124 self.tags = tags
125 self.attributes = attributes
126 self.protocols = protocols
127 self.strip = strip
128 self.strip_comments = strip_comments
129 self.filters = filters or []
130 self.css_sanitizer = css_sanitizer
132 self.parser = html5lib_shim.BleachHTMLParser(
133 tags=self.tags,
134 strip=self.strip,
135 consume_entities=False,
136 namespaceHTMLElements=False,
137 )
138 self.walker = html5lib_shim.getTreeWalker("etree")
139 self.serializer = html5lib_shim.BleachHTMLSerializer(
140 quote_attr_values="always",
141 omit_optional_tags=False,
142 escape_lt_in_attrs=True,
143 # We want to leave entities as they are without escaping or
144 # resolving or expanding
145 resolve_entities=False,
146 # Bleach has its own sanitizer, so don't use the html5lib one
147 sanitize=False,
148 # clean preserves attr order
149 alphabetical_attributes=False,
150 )
152 if css_sanitizer is None:
153 # FIXME(willkg): this doesn't handle when attributes or an
154 # attributes value is a callable
155 attributes_values = []
156 if isinstance(attributes, list):
157 attributes_values = attributes
159 elif isinstance(attributes, dict):
160 attributes_values = []
161 for values in attributes.values():
162 if isinstance(values, (list, tuple)):
163 attributes_values.extend(values)
165 if "style" in attributes_values:
166 warnings.warn(
167 "'style' attribute specified, but css_sanitizer not set.",
168 category=NoCssSanitizerWarning,
169 )
171 def clean(self, text):
172 """Cleans text and returns sanitized result as unicode
174 :arg str text: text to be cleaned
176 :returns: sanitized text as unicode
178 :raises TypeError: if ``text`` is not a text type
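
        A rough example with the defaults (disallowed tags are escaped rather
        than stripped, since ``strip`` defaults to False)::

            cleaner = Cleaner()
            cleaner.clean("<script>boo</script>")
            # -> "&lt;script&gt;boo&lt;/script&gt;"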
180 """
181 if not isinstance(text, str):
182 message = (
183 f"argument cannot be of {text.__class__.__name__!r} type, "
184 + "must be of text type"
185 )
186 raise TypeError(message)
188 if not text:
189 return ""
191 dom = self.parser.parseFragment(text)
192 filtered = BleachSanitizerFilter(
193 source=self.walker(dom),
194 allowed_tags=self.tags,
195 attributes=self.attributes,
196 strip_disallowed_tags=self.strip,
197 strip_html_comments=self.strip_comments,
198 css_sanitizer=self.css_sanitizer,
199 allowed_protocols=self.protocols,
200 )
202 # Apply any filters after the BleachSanitizerFilter
203 for filter_class in self.filters:
204 filtered = filter_class(source=filtered)
206 return self.serializer.render(filtered)
209def attribute_filter_factory(attributes):
210 """Generates attribute filter function for the given attributes value
212 The attributes value can take one of several shapes. This returns a filter
213 function appropriate to the attributes value. One nice thing about this is
214 that there's less if/then shenanigans in the ``allow_token`` method.
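
    An illustrative sketch; all three shapes yield a filter with the
    signature ``_attr_filter(tag, attr, value) -> bool``::

        attr_filter = attribute_filter_factory(["href", "title"])
        attr_filter = attribute_filter_factory({"a": ["href"], "*": ["title"]})
        attr_filter = attribute_filter_factory(
            lambda tag, attr, value: attr == "title"
        )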
216 """
217 if callable(attributes):
218 return attributes
220 if isinstance(attributes, dict):
222 def _attr_filter(tag, attr, value):
223 if tag in attributes:
224 attr_val = attributes[tag]
225 if callable(attr_val):
226 return attr_val(tag, attr, value)
228 if attr in attr_val:
229 return True
231 if "*" in attributes:
232 attr_val = attributes["*"]
233 if callable(attr_val):
234 return attr_val(tag, attr, value)
236 return attr in attr_val
238 return False
240 return _attr_filter
242 if isinstance(attributes, list):
244 def _attr_filter(tag, attr, value):
245 return attr in attributes
247 return _attr_filter
249 raise ValueError("attributes needs to be a callable, a list or a dict")
252class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
253 """html5lib Filter that sanitizes text
255 This filter can be used anywhere html5lib filters can be used.
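
    A minimal wiring sketch, mirroring what ``Cleaner.clean`` above does
    (the parser/walker setup is illustrative, not the only way to build a
    source)::

        parser = html5lib_shim.BleachHTMLParser(
            tags=ALLOWED_TAGS,
            strip=False,
            consume_entities=False,
            namespaceHTMLElements=False,
        )
        walker = html5lib_shim.getTreeWalker("etree")
        dom = parser.parseFragment("<a href='/'>hi</a>")
        filtered = BleachSanitizerFilter(source=walker(dom))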
257 """
259 def __init__(
260 self,
261 source,
262 allowed_tags=ALLOWED_TAGS,
263 attributes=ALLOWED_ATTRIBUTES,
264 allowed_protocols=ALLOWED_PROTOCOLS,
265 attr_val_is_uri=html5lib_shim.attr_val_is_uri,
266 svg_attr_val_allows_ref=html5lib_shim.svg_attr_val_allows_ref,
267 svg_allow_local_href=html5lib_shim.svg_allow_local_href,
268 strip_disallowed_tags=False,
269 strip_html_comments=True,
270 css_sanitizer=None,
271 ):
272 """Creates a BleachSanitizerFilter instance
274 :arg source: html5lib TreeWalker stream as an html5lib TreeWalker
276 :arg set allowed_tags: set of allowed tags; defaults to
277 ``bleach.sanitizer.ALLOWED_TAGS``
279 :arg dict attributes: allowed attributes; can be a callable, list or dict;
280 defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
282 :arg list allowed_protocols: allowed list of protocols for links; defaults
283 to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
285 :arg attr_val_is_uri: set of attributes that have URI values
287 :arg svg_attr_val_allows_ref: set of SVG attributes that can have
288 references
290 :arg svg_allow_local_href: set of SVG elements that can have local
291 hrefs
293 :arg bool strip_disallowed_tags: whether or not to strip disallowed
294 tags
296 :arg bool strip_html_comments: whether or not to strip HTML comments
298 :arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
299 sanitizing style attribute values and style text; defaults to None
301 """
302 # NOTE(willkg): This is the superclass of
303 # html5lib.filters.sanitizer.Filter. We call this directly skipping the
304 # __init__ for html5lib.filters.sanitizer.Filter because that does
305 # things we don't need to do and kicks up the deprecation warning for
306 # using Sanitizer.
307 html5lib_shim.Filter.__init__(self, source)
309 self.allowed_tags = frozenset(allowed_tags)
310 self.allowed_protocols = frozenset(allowed_protocols)
312 self.attr_filter = attribute_filter_factory(attributes)
313 self.strip_disallowed_tags = strip_disallowed_tags
314 self.strip_html_comments = strip_html_comments
316 self.attr_val_is_uri = attr_val_is_uri
317 self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
318 self.css_sanitizer = css_sanitizer
319 self.svg_allow_local_href = svg_allow_local_href
321 def sanitize_stream(self, token_iterator):
322 for token in token_iterator:
323 ret = self.sanitize_token(token)
325 if not ret:
326 continue
328 if isinstance(ret, list):
329 yield from ret
330 else:
331 yield ret
333 def merge_characters(self, token_iterator):
334 """Merge consecutive Characters tokens in a stream"""
335 characters_buffer = []
337 for token in token_iterator:
338 if characters_buffer:
339 if token["type"] == "Characters":
340 characters_buffer.append(token)
341 continue
342 else:
343 # Merge all the characters tokens together into one and then
344 # operate on it.
345 new_token = {
346 "data": "".join(
347 [char_token["data"] for char_token in characters_buffer]
348 ),
349 "type": "Characters",
350 }
351 characters_buffer = []
352 yield new_token
354 elif token["type"] == "Characters":
355 characters_buffer.append(token)
356 continue
358 yield token
360 new_token = {
361 "data": "".join([char_token["data"] for char_token in characters_buffer]),
362 "type": "Characters",
363 }
364 yield new_token
366 def __iter__(self):
367 return self.merge_characters(
368 self.sanitize_stream(html5lib_shim.Filter.__iter__(self))
369 )
371 def sanitize_token(self, token):
372 """Sanitize a token either by HTML-encoding or dropping.
374 Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
375 ['attribute', 'pairs'], 'tag': callable}.
377 Here callable is a function with two arguments of attribute name and
378 value. It should return true of false.
380 Also gives the option to strip tags instead of encoding.
382 :arg dict token: token to sanitize
384 :returns: token or list of tokens
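
        For example, a StartTag token for ``<a href="/">`` looks roughly like
        this (attribute keys are (namespace, name) tuples)::

            {"type": "StartTag", "name": "a", "data": {(None, "href"): "/"}}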
386 """
387 token_type = token["type"]
388 if token_type in ["StartTag", "EndTag", "EmptyTag"]:
389 if token["name"] in self.allowed_tags:
390 return self.allow_token(token)
392 elif self.strip_disallowed_tags:
393 return None
395 else:
396 return self.disallowed_token(token)
398 elif token_type == "Comment":
399 if not self.strip_html_comments:
400 # call lxml.sax.saxutils to escape &, <, and > in addition to " and '
401 token["data"] = html5lib_shim.escape(
402 token["data"], entities={'"': """, "'": "'"}
403 )
404 return token
405 else:
406 return None
408 elif token_type == "Characters":
409 return self.sanitize_characters(token)
411 else:
412 return token
414 def sanitize_characters(self, token):
415 """Handles Characters tokens
417 Our overridden tokenizer doesn't do anything with entities. However,
418 that means that the serializer will convert all ``&`` in Characters
419 tokens to ``&``.
421 Since we don't want that, we extract entities here and convert them to
422 Entity tokens so the serializer will let them be.
424 :arg token: the Characters token to work on
426 :returns: a list of tokens
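
        For example, ``"a &lt; b"`` would come back roughly as::

            [
                {"type": "Characters", "data": "a "},
                {"type": "Entity", "name": "lt"},
                {"type": "Characters", "data": " b"},
            ]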
428 """
429 data = token.get("data", "")
431 if not data:
432 return token
434 data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
435 token["data"] = data
437 # If there isn't a & in the data, we can return now
438 if "&" not in data:
439 return token
441 new_tokens = []
443 # For each possible entity that starts with a "&", we try to extract an
444 # actual entity and re-tokenize accordingly
445 for part in html5lib_shim.next_possible_entity(data):
446 if not part:
447 continue
449 if part.startswith("&"):
450 entity = html5lib_shim.match_entity(part)
451 if entity is not None:
452 if entity == "amp":
453 # LinkifyFilter can't match urls across token boundaries
454 # which is problematic with & since that shows up in
455 # querystrings all the time. This special-cases &
456 # and converts it to a & and sticks it in as a
457 # Characters token. It'll get merged with surrounding
458 # tokens in the BleachSanitizerfilter.__iter__ and
459 # escaped in the serializer.
460 new_tokens.append({"type": "Characters", "data": "&"})
461 else:
462 new_tokens.append({"type": "Entity", "name": entity})
464 # Length of the entity plus 2--one for & at the beginning
465 # and one for ; at the end
466 remainder = part[len(entity) + 2 :]
467 if remainder:
468 new_tokens.append({"type": "Characters", "data": remainder})
469 continue
471 new_tokens.append({"type": "Characters", "data": part})
473 return new_tokens
475 def sanitize_uri_value(self, value, allowed_protocols):
476 """Checks a uri value to see if it's allowed
478 :arg value: the uri value to sanitize
479 :arg allowed_protocols: list of allowed protocols
481 :returns: allowed value or None
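
        A rough sketch of the rules implemented below::

            sanitize_uri_value("javascript:alert(1)", {"http", "https"})
            # -> None (disallowed scheme)

            sanitize_uri_value("#section-1", {"http", "https"})
            # -> "#section-1" (bare anchors are allowed)

            sanitize_uri_value("example.com/page", {"http", "https"})
            # -> "example.com/page" (no scheme, so http/https is assumed)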
483 """
484 # NOTE(willkg): This transforms the value into a normalized one that's
485 # easier to match and verify, but shouldn't get returned since it's
486 # vastly different than the original value.
488 # Convert all character entities in the value
489 normalized_uri = html5lib_shim.convert_entities(value)
491 # Nix backtick, space characters, and control characters
492 normalized_uri = re.sub(r"[`\000-\040\177-\240\s]+", "", normalized_uri)
494 # Remove REPLACEMENT characters
495 normalized_uri = normalized_uri.replace("\ufffd", "")
497 # Lowercase it--this breaks the value, but makes it easier to match
498 # against
499 normalized_uri = normalized_uri.lower()
501 try:
502 # Drop attributes with uri values that have protocols that aren't
503 # allowed
504 parsed = parse_shim.urlparse(normalized_uri)
505 except ValueError:
506 # URI is impossible to parse, therefore it's not allowed
507 return None
509 if parsed.scheme:
510 # If urlparse found a scheme, check that
511 if parsed.scheme in allowed_protocols:
512 return value
514 else:
515 # Allow uris that are just an anchor
516 if normalized_uri.startswith("#"):
517 return value
519 # Handle protocols that urlparse doesn't recognize like "myprotocol"
520 if (
521 ":" in normalized_uri
522 and normalized_uri.split(":")[0] in allowed_protocols
523 ):
524 return value
526 # If there's no protocol/scheme specified, then assume it's "http" or
527 # "https" and see if that's allowed
528 if "http" in allowed_protocols or "https" in allowed_protocols:
529 return value
531 return None
533 def allow_token(self, token):
534 """Handles the case where we're allowing the tag"""
535 if "data" in token:
536 # Loop through all the attributes and drop the ones that are not
537 # allowed, are unsafe or break other rules. Additionally, fix
538 # attribute values that need fixing.
539 #
540 # At the end of this loop, we have the final set of attributes
541 # we're keeping.
542 attrs = {}
543 for namespaced_name, val in token["data"].items():
544 namespace, name = namespaced_name
546 # Drop attributes that are not explicitly allowed
547 #
548 # NOTE(willkg): We pass in the attribute name--not a namespaced
549 # name.
550 if not self.attr_filter(token["name"], name, val):
551 continue
553 # Drop attributes with uri values that use a disallowed protocol
554 # Sanitize attributes with uri values
555 if namespaced_name in self.attr_val_is_uri:
556 new_value = self.sanitize_uri_value(val, self.allowed_protocols)
557 if new_value is None:
558 continue
559 val = new_value
561 # Drop values in svg attrs with non-local IRIs
562 if namespaced_name in self.svg_attr_val_allows_ref:
563 new_val = re.sub(r"url\s*\(\s*[^#\s][^)]+?\)", " ", unescape(val))
564 new_val = new_val.strip()
565 if not new_val:
566 continue
568 else:
569 # Replace the val with the unescaped version because
570 # it's a iri
571 val = new_val
573 # Drop href and xlink:href attr for svg elements with non-local IRIs
574 if (None, token["name"]) in self.svg_allow_local_href:
575 if namespaced_name in [
576 (None, "href"),
577 (html5lib_shim.namespaces["xlink"], "href"),
578 ]:
579 if re.search(r"^\s*[^#\s]", val):
580 continue
582 # If it's a style attribute, sanitize it
583 if namespaced_name == (None, "style"):
584 if self.css_sanitizer:
585 val = self.css_sanitizer.sanitize_css(val)
586 else:
587 # FIXME(willkg): if style is allowed, but no
588 # css_sanitizer was set up, then this is probably a
589 # mistake and we should raise an error here
590 #
591 # For now, we're going to set the value to "" because
592 # there was no sanitizer set
593 val = ""
595 # At this point, we want to keep the attribute, so add it in
596 attrs[namespaced_name] = val
598 token["data"] = attrs
600 return token
602 def disallowed_token(self, token):
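        """Escape a disallowed token so it renders as literal text

        Serializes the tag (and, for start tags, its attributes) back into
        markup and returns it as a Characters token, so the serializer emits
        it as escaped text rather than HTML.
        """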
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = f"</{token['name']}>"

        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # If we end up with a namespace, but no name, switch them so we
                # have a valid name to use.
                if ns and not name:
                    ns, name = name, ns

                # Figure out namespaced name if the namespace is appropriate
                # and exists; if the ns isn't in prefixes, then drop it.
                if ns is None or ns not in html5lib_shim.prefixes:
                    namespaced_name = name
                else:
                    namespaced_name = f"{html5lib_shim.prefixes[ns]}:{name}"

                # NOTE(willkg): HTMLSerializer escapes attribute values
                # already, so if we do it here (like HTMLSerializer does),
                # then we end up double-escaping.
                attrs.append(f' {namespaced_name}="{v}"')

            token["data"] = f"<{token['name']}{''.join(attrs)}>"

        else:
            token["data"] = f"<{token['name']}>"

        if token.get("selfClosing"):
            token["data"] = f"{token['data'][:-1]}/>"

        token["type"] = "Characters"

        del token["name"]
        return token