# flake8: noqa
"""
Shim module between Bleach and html5lib. This makes it easier to upgrade the
html5lib library without having to change a lot of code.
"""

import re
import string
import warnings

# ignore html5lib deprecation warnings to use bleach; we are bleach
# apply before we import submodules that import html5lib
warnings.filterwarnings(
    "ignore",
    message="html5lib's sanitizer is deprecated",
    category=DeprecationWarning,
    module="bleach._vendor.html5lib",
)

from bleach._vendor.html5lib import (  # noqa: E402 module level import not at top of file
    HTMLParser,
    getTreeWalker,
)
from bleach._vendor.html5lib import (
    constants,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.constants import (  # noqa: E402 module level import not at top of file
    namespaces,
    prefixes,
)
from bleach._vendor.html5lib.constants import (
    _ReparseException as ReparseException,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.base import (
    Filter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    allowed_protocols,
    allowed_css_properties,
    allowed_svg_properties,
    attr_val_is_uri,
    svg_attr_val_allows_ref,
    svg_allow_local_href,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    Filter as SanitizerFilter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._inputstream import (
    HTMLInputStream,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.serializer import (
    escape,
    HTMLSerializer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._tokenizer import (
    attributeMap,
    HTMLTokenizer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._trie import (
    Trie,
)  # noqa: E402 module level import not at top of file


#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)
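
# Illustrative lookups (a hedged sketch; html5lib's entity table includes both
# the ";"-terminated and the legacy semicolon-less form of "amp"):
#
#     ENTITIES["amp"]                           # -> "&"
#     ENTITIES_TRIE.has_keys_with_prefix("am")  # -> True ("amp", "amp;", ...)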

#: Token type constants--these never change
TAG_TOKEN_TYPES = {
    constants.tokenTypes["StartTag"],
    constants.tokenTypes["EndTag"],
    constants.tokenTypes["EmptyTag"],
}
TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"]
TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"]
TAG_TOKEN_TYPE_CHARACTERS = constants.tokenTypes["Characters"]
TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"]


#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = frozenset(
    (
        "a",
        "abbr",
        "address",
        "area",
        "article",
        "aside",
        "audio",
        "b",
        "base",
        "bdi",
        "bdo",
        "blockquote",
        "body",
        "br",
        "button",
        "canvas",
        "caption",
        "cite",
        "code",
        "col",
        "colgroup",
        "data",
        "datalist",
        "dd",
        "del",
        "details",
        "dfn",
        "dialog",
        "div",
        "dl",
        "dt",
        "em",
        "embed",
        "fieldset",
        "figcaption",
        "figure",
        "footer",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "head",
        "header",
        "hgroup",
        "hr",
        "html",
        "i",
        "iframe",
        "img",
        "input",
        "ins",
        "kbd",
        "keygen",
        "label",
        "legend",
        "li",
        "link",
        "map",
        "mark",
        "menu",
        "meta",
        "meter",
        "nav",
        "noscript",
        "object",
        "ol",
        "optgroup",
        "option",
        "output",
        "p",
        "param",
        "picture",
        "pre",
        "progress",
        "q",
        "rp",
        "rt",
        "ruby",
        "s",
        "samp",
        "script",
        "section",
        "select",
        "slot",
        "small",
        "source",
        "span",
        "strong",
        "style",
        "sub",
        "summary",
        "sup",
        "table",
        "tbody",
        "td",
        "template",
        "textarea",
        "tfoot",
        "th",
        "thead",
        "time",
        "title",
        "tr",
        "track",
        "u",
        "ul",
        "var",
        "video",
        "wbr",
    )
)


#: List of block level HTML tags, as per https://github.com/mozilla/bleach/issues/369
#: from mozilla on 2019.07.11
#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
HTML_TAGS_BLOCK_LEVEL = frozenset(
    (
        "address",
        "article",
        "aside",
        "blockquote",
        "details",
        "dialog",
        "dd",
        "div",
        "dl",
        "dt",
        "fieldset",
        "figcaption",
        "figure",
        "footer",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "header",
        "hgroup",
        "hr",
        "li",
        "main",
        "nav",
        "ol",
        "p",
        "pre",
        "section",
        "table",
        "ul",
    )
)


class InputStreamWithMemory:
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    """

    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        self._buffer = []

    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(list(chars))
        return chars

    def unget(self, char):
        if self._buffer:
            self._buffer.pop(-1)
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        return "".join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ["<"]
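

# Illustrative use of InputStreamWithMemory (a sketch, not executed at import
# time; mirrors what the tokenizer does with it):
#
#     stream = InputStreamWithMemory(HTMLInputStream("<span x=1>"))
#     stream.char()           # tokenizer consumes the "<" ...
#     stream.start_tag()      # ... and tagOpenState() resets the buffer to ["<"]
#     stream.charsUntil(">")  # "span x=1"
#     stream.char()           # ">"
#     stream.get_tag()        # -> "<span x=1>"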


class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""

    def __init__(self, consume_entities=False, **kwargs):
        super().__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

        # Remember the last token emitted; needed for block element spacing
        self.emitted_last_token = None

    def __iter__(self):
        last_error_token = None

        for token in super().__iter__():
            if last_error_token is not None:
                if (
                    last_error_token["data"] == "invalid-character-in-attribute-name"
                    and token["type"] in TAG_TOKEN_TYPES
                    and token.get("data")
                ):
                    # token["data"] is an html5lib attributeMap
                    # (OrderedDict 3.7+ and dict otherwise)
                    # of attr name to attr value
                    #
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token["data"] = attributeMap(
                        (attr_name, attr_value)
                        for attr_name, attr_value in token["data"].items()
                        if (
                            '"' not in attr_name
                            and "'" not in attr_name
                            and "<" not in attr_name
                        )
                    )
                    last_error_token = None
                    yield token

                elif (
                    last_error_token["data"] == "expected-closing-tag-but-got-char"
                    and self.parser.tags is not None
                    and token["data"].lower().strip() not in self.parser.tags
                ):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token["data"] = self.stream.get_tag()
                    token["type"] = TAG_TOKEN_TYPE_CHARACTERS

                    last_error_token = None
                    yield token

                elif token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                last_error_token = token
                continue

            yield token

        if last_error_token:
            if last_error_token["data"] == "eof-in-tag-name":
                # Handle the case where the text being parsed ends with <
                # followed by a series of characters. It's treated as a tag
                # name that abruptly ends, but we should treat that like
                # character data
                yield {
                    "type": TAG_TOKEN_TYPE_CHARACTERS,
                    "data": "<" + self.currentToken["name"],
                }
            else:
                yield last_error_token
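
    # Illustrative effect of the error-token repair in __iter__ (assumed
    # behavior inferred from the comments above, not a verified transcript):
    # for input like "</ foo>" where "foo" is not an allowed tag, the
    # expected-closing-tag-but-got-char error plus the token that follows it
    # are rewritten into a single Characters token holding the literal
    # "</ foo>", so the sanitizer escapes it instead of dropping it as a
    # malformed comment.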

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super().consumeEntity(allowedChar, fromAttribute)

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we put that back in
        # the stream.
        if fromAttribute:
            self.currentToken["data"][-1][1] += "&"

        else:
            self.tokenQueue.append({"type": TAG_TOKEN_TYPE_CHARACTERS, "data": "&"})
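
    # For example (assumed behavior): with consume_entities=False, text like
    # "AT&T" tokenizes as the characters "AT", "&" (re-emitted above), "T"
    # rather than the tokenizer trying to consume "&T" as an entity; the
    # serializer is then responsible for escaping the bare "&".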

    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super().tagOpenState()

    def emitCurrentToken(self):
        token = self.currentToken

        if (
            self.parser.tags is not None
            and token["type"] in TAG_TOKEN_TYPES
            and token["name"].lower() not in self.parser.tags
        ):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                if (
                    self.emitted_last_token
                    and token["type"] == TAG_TOKEN_TYPE_START
                    and token["name"].lower() in HTML_TAGS_BLOCK_LEVEL
                ):
                    # If this is a block level tag we're stripping, we drop it
                    # for a newline because that's what a browser would parse
                    # it as
                    new_data = "\n"
                else:
                    # For all other things being stripped, we throw in an empty
                    # string token
                    new_data = ""

            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": new_data}

            self.currentToken = self.emitted_last_token = new_token
            self.tokenQueue.append(new_token)
            self.state = self.dataState
            return

        self.emitted_last_token = self.currentToken
        super().emitCurrentToken()
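
    # For example (assumed behavior, per the branches above): when stripping,
    # a disallowed block-level start tag is replaced with a "\n" Characters
    # token, so "a<p>b" cleans to "a\nb"; when escaping, the original text
    # recovered via stream.get_tag() is emitted, so a disallowed "<x>" is
    # kept as the literal (later-escaped) string "<x>".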


class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""

    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: set of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        self.tags = (
            frozenset((tag.lower() for tag in tags)) if tags is not None else None
        )
        self.strip = strip
        self.consume_entities = consume_entities
        super().__init__(**kwargs)

    def _parse(
        self, stream, innerHTML=False, container="div", scripting=True, **kwargs
    ):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            self.reset()
            self.mainLoop()
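

# Illustrative construction (a sketch mirroring how bleach's Cleaner wires up
# this parser; these argument values are assumptions for the example, not the
# only valid ones):
#
#     parser = BleachHTMLParser(
#         tags={"b", "i", "p"},
#         strip=False,
#         consume_entities=False,
#         namespaceHTMLElements=False,
#     )
#     dom = parser.parseFragment("some <b>html</b>")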


def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if value[0] == "#":
        if len(value) < 2:
            return None

        if value[1] in ("x", "X"):
            # hex-encoded code point
            int_as_string, base = value[2:], 16
        else:
            # decimal code point
            int_as_string, base = value[1:], 10

        if int_as_string == "":
            return None

        code_point = int(int_as_string, base)
        if 0 < code_point < 0x110000:
            return chr(code_point)
        else:
            return None

    return ENTITIES.get(value, None)
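

# Illustrative values (a sketch; U+0041 is "A", and "amp" maps to "&" in the
# WHATWG entity table mirrored by constants.entities):
#
#     convert_entity("#x41")   # -> "A"   (hex numeric entity)
#     convert_entity("#65")    # -> "A"   (decimal numeric entity)
#     convert_entity("amp")    # -> "&"   (named entity)
#     convert_entity("bogus")  # -> None  (ambiguous ampersand)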


def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    if "&" not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith("&"):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return "".join(new_text)
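

# For example (assumed behavior): a full entity converts, while an ambiguous
# ampersand is left as-is:
#
#     convert_entities("AT&amp;T")  # -> "AT&T"
#     convert_entities("AT&T")      # -> "AT&T" ("&T" is not an entity)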


def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with a
    ";". This ignores ambiguous character entities that have no ";" at the end.

    :arg stream: the character stream

    :returns: the entity string without "&" or ";" if it's a valid character
        entity; ``None`` otherwise

    """
    # Nix the & at the beginning
    if stream[0] != "&":
        raise ValueError('Stream should begin with "&"')

    stream = stream[1:]

    stream = list(stream)
    possible_entity = ""
    end_characters = "<&=;" + string.whitespace

    # Handle number entities
    if stream and stream[0] == "#":
        possible_entity = "#"
        stream.pop(0)

        if stream and stream[0] in ("x", "X"):
            allowed = "0123456789abcdefABCDEF"
            possible_entity += stream.pop(0)
        else:
            allowed = "0123456789"

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while stream and stream[0] not in end_characters:
            c = stream.pop(0)
            if c not in allowed:
                break
            possible_entity += c

        if possible_entity and stream and stream[0] == ";":
            return possible_entity
        return None

    # Handle character entities
    while stream and stream[0] not in end_characters:
        c = stream.pop(0)
        possible_entity += c
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            # If it's not a prefix, then it's not an entity and we're
            # out
            return None

    if possible_entity and stream and stream[0] == ";":
        return possible_entity

    return None
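

# Illustrative calls (assumed behavior; note the required trailing ";"):
#
#     match_entity("&amp; x")   # -> "amp"
#     match_entity("&#x41; x")  # -> "#x41"
#     match_entity("&amp x")    # -> None (no ";" terminator)
#     match_entity("&xx; x")    # -> None (no entity starts with "xx")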


AMP_SPLIT_RE = re.compile("(&)")


def next_possible_entity(text):
    """Takes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    for i, part in enumerate(AMP_SPLIT_RE.split(text)):
        if i == 0:
            yield part
        elif i % 2 == 0:
            yield "&" + part
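

# For example (assumed behavior; each "&" delimiter from the split is
# re-attached to the text that follows it):
#
#     list(next_possible_entity("a &amp; b & c"))
#     # -> ["a ", "&amp; b ", "& c"]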


class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    # Whether to escape characters that need to be
    # escaped within normal elements within rcdata elements such as
    # style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values"""
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expected the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        stoken = stoken.replace("&amp;", "&")

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith("&"):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield f"&{entity};"

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2 :]
                    if part:
                        yield part
                    continue

            yield part.replace("&", "&amp;")
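
    # For example (assumed behavior; ``serializer`` is an assumed
    # BleachHTMLSerializer instance): a pre-escaped entity and a bare
    # ampersand normalize to the same output:
    #
    #     "".join(serializer.escape_base_amp("a &amp; b"))      # -> "a &amp; b"
    #     "".join(serializer.escape_base_amp("a &amp;amp; b"))  # -> "a &amp; b"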

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        in_tag = False
        after_equals = False

        for stoken in super().serialize(treewalker, encoding):
            if in_tag:
                if stoken == ">":
                    in_tag = False

                elif after_equals:
                    if stoken != '"':
                        yield from self.escape_base_amp(stoken)

                        after_equals = False
                        continue

                elif stoken == "=":
                    after_equals = True

                yield stoken
            else:
                if stoken.startswith("<"):
                    in_tag = True
                yield stoken
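

# End-to-end sketch of how these pieces fit together (a minimal illustration
# of the parse -> walk -> serialize pipeline; bleach.clean() inserts a
# sanitizing filter between the walker and the serializer, omitted here):
#
#     parser = BleachHTMLParser(
#         tags={"a"}, strip=True, consume_entities=False,
#         namespaceHTMLElements=False,
#     )
#     dom = parser.parseFragment('<a href="/x?a=1&b=2">hi</a>')
#     walker = getTreeWalker("etree")
#     serializer = BleachHTMLSerializer(quote_attr_values="always")
#     serializer.render(walker(dom))
#     # -> '<a href="/x?a=1&amp;b=2">hi</a>'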