Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bleach/html5lib_shim.py: 96%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# flake8: noqa
2"""
3Shim module between Bleach and html5lib. This makes it easier to upgrade the
4html5lib library without having to change a lot of code.
5"""
7import re
8import string
9import warnings
# Ignore html5lib's "sanitizer is deprecated" DeprecationWarning: Bleach is
# the project that uses that sanitizer. The filter must be registered before
# the imports below pull in the submodules that import html5lib.
warnings.filterwarnings(
    "ignore",
    message="html5lib's sanitizer is deprecated",
    category=DeprecationWarning,
    module="bleach._vendor.html5lib",
)
20from bleach._vendor.html5lib import ( # noqa: E402 module level import not at top of file
21 HTMLParser,
22 getTreeWalker,
23)
24from bleach._vendor.html5lib import (
25 constants,
26) # noqa: E402 module level import not at top of file
27from bleach._vendor.html5lib.constants import ( # noqa: E402 module level import not at top of file
28 namespaces,
29 prefixes,
30)
31from bleach._vendor.html5lib.constants import (
32 _ReparseException as ReparseException,
33) # noqa: E402 module level import not at top of file
34from bleach._vendor.html5lib.filters.base import (
35 Filter,
36) # noqa: E402 module level import not at top of file
37from bleach._vendor.html5lib.filters.sanitizer import (
38 allowed_protocols,
39 allowed_css_properties,
40 allowed_svg_properties,
41 attr_val_is_uri,
42 svg_attr_val_allows_ref,
43 svg_allow_local_href,
44) # noqa: E402 module level import not at top of file
45from bleach._vendor.html5lib.filters.sanitizer import (
46 Filter as SanitizerFilter,
47) # noqa: E402 module level import not at top of file
48from bleach._vendor.html5lib._inputstream import (
49 HTMLInputStream,
50) # noqa: E402 module level import not at top of file
51from bleach._vendor.html5lib.serializer import (
52 escape,
53 HTMLSerializer,
54) # noqa: E402 module level import not at top of file
55from bleach._vendor.html5lib._tokenizer import (
56 attributeMap,
57 HTMLTokenizer,
58) # noqa: E402 module level import not at top of file
59from bleach._vendor.html5lib._trie import (
60 Trie,
61) # noqa: E402 module level import not at top of file
#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)

#: Token type constants--these never change
#: Set of the token types that represent tags (start, end and empty tags)
TAG_TOKEN_TYPES = {
    constants.tokenTypes["StartTag"],
    constants.tokenTypes["EndTag"],
    constants.tokenTypes["EmptyTag"],
}
#: Shorthand aliases for the individual token types used in this module
TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"]
TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"]
TAG_TOKEN_TYPE_CHARACTERS = constants.tokenTypes["Characters"]
TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"]
#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
#:
#: "main" is part of that index and is also listed in HTML_TAGS_BLOCK_LEVEL,
#: so it is included here to keep the two sets consistent.
HTML_TAGS = frozenset(
    (
        "a",
        "abbr",
        "address",
        "area",
        "article",
        "aside",
        "audio",
        "b",
        "base",
        "bdi",
        "bdo",
        "blockquote",
        "body",
        "br",
        "button",
        "canvas",
        "caption",
        "cite",
        "code",
        "col",
        "colgroup",
        "data",
        "datalist",
        "dd",
        "del",
        "details",
        "dfn",
        "dialog",
        "div",
        "dl",
        "dt",
        "em",
        "embed",
        "fieldset",
        "figcaption",
        "figure",
        "footer",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "head",
        "header",
        "hgroup",
        "hr",
        "html",
        "i",
        "iframe",
        "img",
        "input",
        "ins",
        "kbd",
        "keygen",
        "label",
        "legend",
        "li",
        "link",
        "main",
        "map",
        "mark",
        "menu",
        "meta",
        "meter",
        "nav",
        "noscript",
        "object",
        "ol",
        "optgroup",
        "option",
        "output",
        "p",
        "param",
        "picture",
        "pre",
        "progress",
        "q",
        "rp",
        "rt",
        "ruby",
        "s",
        "samp",
        "script",
        "section",
        "select",
        "slot",
        "small",
        "source",
        "span",
        "strong",
        "style",
        "sub",
        "summary",
        "sup",
        "table",
        "tbody",
        "td",
        "template",
        "textarea",
        "tfoot",
        "th",
        "thead",
        "time",
        "title",
        "tr",
        "track",
        "u",
        "ul",
        "var",
        "video",
        "wbr",
    )
)
#: Block level HTML tags, per https://github.com/mozilla/bleach/issues/369
#: Taken from the MDN list on 2019.07.11:
#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
HTML_TAGS_BLOCK_LEVEL = frozenset(
    {
        # sectioning and grouping content
        "address", "article", "aside", "blockquote", "details", "dialog",
        "div", "fieldset", "figcaption", "figure", "footer", "form",
        "header", "hgroup", "hr", "main", "nav", "p", "pre", "section",
        "table",
        # headings
        "h1", "h2", "h3", "h4", "h5", "h6",
        # lists
        "dd", "dl", "dt", "li", "ol", "ul",
    }
)
class InputStreamWithMemory:
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    """

    def __init__(self, inner_stream):
        """
        :arg inner_stream: the stream object to wrap; it must provide
            ``reset``, ``position``, ``errors``, ``charEncoding``,
            ``changeEncoding``, ``char``, ``charsUntil`` and ``unget``
        """
        self._inner_stream = inner_stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        # Characters consumed since the last call to start_tag()
        self._buffer = []

    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        """Read one character from the wrapped stream and remember it"""
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        """Read a run of characters from the wrapped stream and remember it"""
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(chars)
        return chars

    def unget(self, char):
        """Push ``char`` back onto the wrapped stream and forget it

        char() never records a falsy value (e.g. None at EOF) in the history,
        so ungetting such a value must not remove a real character from it.
        """
        if char and self._buffer:
            self._buffer.pop()
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        return "".join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ["<"]
class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""

    def __init__(self, consume_entities=False, **kwargs):
        """
        :arg consume_entities: whether consumeEntity() defers to the
            superclass (True) or leaves entities untouched (False)
        :arg kwargs: passed through to the superclass constructor
        """
        super().__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

        # Remember the last token emitted; needed for block element spacing
        self.emitted_last_token = None

    def __iter__(self):
        """Yield tokens from the superclass, repairing some parse errors

        A ParseError token is held back until the next token is seen so that
        the pair can be inspected and, for a few known error kinds, the
        following token can be fixed up before both are passed along.
        """
        last_error_token = None

        for token in super().__iter__():
            if last_error_token is not None:
                if (
                    last_error_token["data"] == "invalid-character-in-attribute-name"
                    and token["type"] in TAG_TOKEN_TYPES
                    and token.get("data")
                ):
                    # token["data"] is an html5lib attributeMap
                    # (OrderedDict 3.7+ and dict otherwise)
                    # of attr name to attr value
                    #
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token["data"] = attributeMap(
                        (attr_name, attr_value)
                        for attr_name, attr_value in token["data"].items()
                        if (
                            '"' not in attr_name
                            and "'" not in attr_name
                            and "<" not in attr_name
                        )
                    )
                    # The held error token is dropped here, not yielded
                    last_error_token = None
                    yield token

                elif (
                    last_error_token["data"]
                    in (
                        "invalid-character-in-attribute-name",
                        "invalid-character-after-attribute-name",
                    )
                    and token["type"] == TAG_TOKEN_TYPE_CHARACTERS
                    and token.get("data")
                    and " " in token["data"]
                ):
                    # token["data"] has something that starts with a left angle
                    # bracket, then has some characters followed by a space
                    # followed by another left angle bracket and ending with
                    # a right angle bracket. That part could be a real tag, so
                    # we don't want it to get treated as Characters. For
                    # example, something in this shape: <nottag <...>
                    # If so, we want to take off the first bit that is
                    # definitely not a tag and reparse the rest.
                    head, rest = token["data"].split(" ", 1)
                    if rest.strip().startswith("<"):
                        # yield the not-a-tag plus the space we split on
                        token["data"] = head + " "
                        yield token

                        # shove the rest back in the stream for the parser to
                        # look at; unget() takes one character at a time, so
                        # push them in reverse order
                        for c in reversed(rest):
                            self.stream.unget(c)
                    else:
                        yield token

                elif (
                    last_error_token["data"] == "expected-closing-tag-but-got-char"
                    and self.parser.tags is not None
                    and token["data"].lower().strip() not in self.parser.tags
                ):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token["data"] = self.stream.get_tag()
                    token["type"] = TAG_TOKEN_TYPE_CHARACTERS

                    last_error_token = None
                    yield token

                elif token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    # Nothing to repair: pass both tokens along in order
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                last_error_token = token
                continue

            yield token

        # The token stream ended while a ParseError was still being held
        if last_error_token:
            if last_error_token["data"] == "eof-in-tag-name":
                # Handle the case where the text being parsed ends with <
                # followed by a series of characters. It's treated as a tag
                # name that abruptly ends, but we should treat that like
                # character data
                yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}

            elif last_error_token["data"] in (
                "duplicate-attribute",
                "eof-in-attribute-name",
                "eof-in-attribute-value-no-quotes",
                "expected-end-of-tag-but-got-eof",
            ):
                # Handle the case where the text being parsed ends with <
                # followed by characters and then space and then:
                #
                # * more characters
                # * more characters repeated with a space between (e.g. "abc abc")
                # * more characters and then a space and then an EOF (e.g. "abc def ")
                #
                # These cases are treated as a tag name followed by an
                # attribute that abruptly ends, but we should treat that like
                # character data instead.
                yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}

            else:
                yield last_error_token

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        """Consume an entity only if this tokenizer is configured to

        When ``consume_entities`` is false, the ``&`` is put back either on
        the current attribute value or as a Characters token.
        """
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super().consumeEntity(allowedChar, fromAttribute)

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we put that back in
        # the stream.
        if fromAttribute:
            self.currentToken["data"][-1][1] += "&"

        else:
            self.tokenQueue.append({"type": TAG_TOKEN_TYPE_CHARACTERS, "data": "&"})

    def tagOpenState(self):
        """Reset the stream history, then run the superclass state"""
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super().tagOpenState()

    def emitCurrentToken(self):
        """Emit the current token, converting disallowed tags to Characters"""
        token = self.currentToken

        if (
            self.parser.tags is not None
            and token["type"] in TAG_TOKEN_TYPES
            and token["name"].lower() not in self.parser.tags
        ):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                if (
                    self.emitted_last_token
                    and token["type"] == TAG_TOKEN_TYPE_START
                    and token["name"].lower() in HTML_TAGS_BLOCK_LEVEL
                ):
                    # If this is a block level tag we're stripping, we drop it
                    # for a newline because that's what a browser would parse
                    # it as
                    new_data = "\n"
                else:
                    # For all other things being stripped, we throw in an empty
                    # string token
                    new_data = ""

            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": new_data}

            # Queue the replacement directly and return to the data state
            # instead of calling the superclass emitCurrentToken()
            self.currentToken = self.emitted_last_token = new_token
            self.tokenQueue.append(new_token)
            self.state = self.dataState
            return

        self.emitted_last_token = self.currentToken
        super().emitCurrentToken()
class BleachHTMLParser(HTMLParser):
    """HTMLParser variant that tokenizes with BleachHTMLTokenizer"""

    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: set of allowed tags--anything else gets stripped or
            escaped; pass None to skip tag checking entirely
        :arg strip: True to strip disallowed tags, False to escape them;
            irrelevant when tags=None
        :arg consume_entities: True to consume entities (stock html5lib
            behavior), False to leave them untouched while tokenizing
            (behavior added by BleachHTMLTokenizer)

        """
        if tags is None:
            allowed_tags = None
        else:
            allowed_tags = frozenset(name.lower() for name in tags)

        self.tags = allowed_tags
        self.strip = strip
        self.consume_entities = consume_entities
        super().__init__(**kwargs)

    def _parse(
        self, stream, innerHTML=False, container="div", scripting=True, **kwargs
    ):
        # scripting defaults to True so <noscript> is parsed the way a
        # JS-enabled browser would parse it:
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # This overrides HTMLParser._parse for the sole purpose of plugging in
        # our own tokenizer.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting

        bleach_tokenizer = BleachHTMLTokenizer(
            stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs
        )
        self.tokenizer = bleach_tokenizer
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            # Start over once from a clean state when a reparse is requested
            self.reset()
            self.mainLoop()
def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if not value:
        return None

    if value[0] == "#":
        if value[1:2] in ("x", "X"):
            # hex-encoded code point
            int_as_string, base, valid_digits = value[2:], 16, string.hexdigits
        else:
            # decimal code point
            int_as_string, base, valid_digits = value[1:], 10, string.digits

        # int() also accepts signs, underscores and surrounding whitespace,
        # none of which are valid in a numeric character reference, so check
        # the digits explicitly.
        if not int_as_string or any(c not in valid_digits for c in int_as_string):
            return None

        try:
            code_point = int(int_as_string, base)
        except ValueError:
            # Python can refuse to convert extremely long decimal strings;
            # such a value is far outside the valid code point range anyway.
            return None

        if 0 < code_point < 0x110000:
            return chr(code_point)
        return None

    return ENTITIES.get(value, None)
def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    # Fast path: without an ampersand there cannot be any entity
    if "&" not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith("&"):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    # Skip the entity plus 2 characters--one for the & at the
                    # beginning and one for the ; at the end
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return "".join(new_text)
def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with a
    ";". This ignores ambiguous character entities that have no ";" at the end.

    :arg stream: the character stream

    :returns: the entity string without "&" or ";" if it's a valid character
        entity; ``None`` otherwise

    :raises ValueError: if the stream does not begin with "&"

    """
    if not stream or stream[0] != "&":
        raise ValueError('Stream should begin with "&"')

    end_characters = "<&=;" + string.whitespace
    length = len(stream)

    # Scan with an index rather than copying the stream and popping from the
    # front; position 0 is the & so the candidate starts at 1.
    pos = 1

    # Handle number entities
    if pos < length and stream[pos] == "#":
        pos += 1

        if pos < length and stream[pos] in ("x", "X"):
            allowed = "0123456789abcdefABCDEF"
            pos += 1
        else:
            allowed = "0123456789"

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        #
        # Only advance over allowed digits. The character that stops the scan
        # is left in place so that it--and not whatever follows it--has to be
        # the terminating ";". Otherwise something like "&#12a;" would match
        # as "#12" and callers would silently drop the "a".
        while pos < length and stream[pos] in allowed:
            pos += 1

        if pos < length and stream[pos] == ";":
            return "".join(stream[1:pos])
        return None

    # Handle character entities
    possible_entity = ""
    while pos < length and stream[pos] not in end_characters:
        possible_entity += stream[pos]
        pos += 1
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            # If it's not a prefix, then it's not an entity and we're
            # out
            return None

    if possible_entity and pos < length and stream[pos] == ";":
        return possible_entity

    return None
#: Splits text on ampersands, keeping each "&" as its own list item
AMP_SPLIT_RE = re.compile("(&)")


def next_possible_entity(text):
    """Break a text into chunks that may each begin with an entity

    :arg text: the text to look at

    :returns: generator of chunks; the first is whatever precedes the first
        "&" and every later chunk starts with an "&"

    """
    pieces = AMP_SPLIT_RE.split(text)

    # Item 0 is the leading text. After that the list alternates between the
    # captured "&" (odd positions) and the text following it (even positions).
    yield pieces[0]
    for following in pieces[2::2]:
        yield "&" + following
class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    # Whether to escape characters that need to be
    # escaped within normal elements within rcdata elements such as
    # style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values

        :arg stoken: the serialized attribute value text

        :returns: generator of text pieces in which unambiguous character
            entities are kept as is and every other ``&`` is written as
            ``&amp;``

        """
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expected the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        stoken = stoken.replace("&amp;", "&")

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith("&"):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield f"&{entity};"

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2 :]
                    if part:
                        yield part
                    continue

            yield part.replace("&", "&amp;")

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        # Track whether we are inside a tag and whether the previous piece
        # was the "=" that introduces an attribute value.
        in_tag = False
        after_equals = False

        for stoken in super().serialize(treewalker, encoding):
            if in_tag:
                if stoken == ">":
                    in_tag = False

                elif after_equals:
                    # Skip the opening quote; the next piece is the value
                    if stoken != '"':
                        yield from self.escape_base_amp(stoken)

                        after_equals = False
                        continue

                elif stoken == "=":
                    after_equals = True

                yield stoken
            else:
                if stoken.startswith("<"):
                    in_tag = True
                yield stoken