# flake8: noqa
"""
Shim module between Bleach and html5lib. This makes it easier to upgrade the
html5lib library without having to change a lot of code.
"""

import re
import string
import warnings

# ignore html5lib deprecation warnings to use bleach; we are bleach
# apply before we import submodules that import html5lib
warnings.filterwarnings(
    "ignore",
    message="html5lib's sanitizer is deprecated",
    category=DeprecationWarning,
    module="bleach._vendor.html5lib",
)

from bleach._vendor.html5lib import (  # noqa: E402 module level import not at top of file
    HTMLParser,
    getTreeWalker,
)
from bleach._vendor.html5lib import (
    constants,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.constants import (  # noqa: E402 module level import not at top of file
    namespaces,
    prefixes,
)
from bleach._vendor.html5lib.constants import (
    _ReparseException as ReparseException,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.base import (
    Filter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    allowed_protocols,
    allowed_css_properties,
    allowed_svg_properties,
    attr_val_is_uri,
    svg_attr_val_allows_ref,
    svg_allow_local_href,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    Filter as SanitizerFilter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._inputstream import (
    HTMLInputStream,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.serializer import (
    escape,
    HTMLSerializer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._tokenizer import (
    attributeMap,
    HTMLTokenizer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._trie import (
    Trie,
)  # noqa: E402 module level import not at top of file


#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)
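
# For illustration (a sketch of expected values, not an exhaustive check):
# ENTITIES maps names (sans "&" and ";") to their expansions, and the trie
# supports the prefix queries used by match_entity() below.
#
#     ENTITIES["amp"]  # "&"
#     ENTITIES_TRIE.has_keys_with_prefix("am")  # True -- "amp" and friends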

#: Token type constants--these never change
TAG_TOKEN_TYPES = {
    constants.tokenTypes["StartTag"],
    constants.tokenTypes["EndTag"],
    constants.tokenTypes["EmptyTag"],
}
TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"]
TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"]
TAG_TOKEN_TYPE_CHARACTERS = constants.tokenTypes["Characters"]
TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"]


#: Set of valid HTML tags, from the WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = frozenset(
    (
        "a",
        "abbr",
        "address",
        "area",
        "article",
        "aside",
        "audio",
        "b",
        "base",
        "bdi",
        "bdo",
        "blockquote",
        "body",
        "br",
        "button",
        "canvas",
        "caption",
        "cite",
        "code",
        "col",
        "colgroup",
        "data",
        "datalist",
        "dd",
        "del",
        "details",
        "dfn",
        "dialog",
        "div",
        "dl",
        "dt",
        "em",
        "embed",
        "fieldset",
        "figcaption",
        "figure",
        "footer",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "head",
        "header",
        "hgroup",
        "hr",
        "html",
        "i",
        "iframe",
        "img",
        "input",
        "ins",
        "kbd",
        "keygen",
        "label",
        "legend",
        "li",
        "link",
        "map",
        "mark",
        "menu",
        "meta",
        "meter",
        "nav",
        "noscript",
        "object",
        "ol",
        "optgroup",
        "option",
        "output",
        "p",
        "param",
        "picture",
        "pre",
        "progress",
        "q",
        "rp",
        "rt",
        "ruby",
        "s",
        "samp",
        "script",
        "section",
        "select",
        "slot",
        "small",
        "source",
        "span",
        "strong",
        "style",
        "sub",
        "summary",
        "sup",
        "table",
        "tbody",
        "td",
        "template",
        "textarea",
        "tfoot",
        "th",
        "thead",
        "time",
        "title",
        "tr",
        "track",
        "u",
        "ul",
        "var",
        "video",
        "wbr",
    )
)


#: Set of block level HTML tags, as per https://github.com/mozilla/bleach/issues/369
#: from MDN on 2019-07-11
#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
HTML_TAGS_BLOCK_LEVEL = frozenset(
    (
        "address",
        "article",
        "aside",
        "blockquote",
        "details",
        "dialog",
        "dd",
        "div",
        "dl",
        "dt",
        "fieldset",
        "figcaption",
        "figure",
        "footer",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "header",
        "hgroup",
        "hr",
        "li",
        "main",
        "nav",
        "ol",
        "p",
        "pre",
        "section",
        "table",
        "ul",
    )
)


class InputStreamWithMemory:
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    """

    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        self._buffer = []

    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(list(chars))
        return chars

    def unget(self, char):
        if self._buffer:
            self._buffer.pop(-1)
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        return "".join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ["<"]

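# For example (a sketch): feeding "<span x=1" through a tokenizer wrapped this
# way leaves the buffer holding '<', 's', 'p', ... so get_tag() returns
# "<span x=1" -- the verbatim source of the tag being tokenized.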

class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""

    def __init__(self, consume_entities=False, **kwargs):
        super().__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

        # Remember the last token emitted; needed for block element spacing
        self.emitted_last_token = None

    def __iter__(self):
        last_error_token = None

        for token in super().__iter__():
            if last_error_token is not None:
                if (
                    last_error_token["data"] == "invalid-character-in-attribute-name"
                    and token["type"] in TAG_TOKEN_TYPES
                    and token.get("data")
                ):
                    # token["data"] is an html5lib attributeMap
                    # (a dict on Python 3.7+, an OrderedDict on older versions)
                    # of attr name to attr value
                    #
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token["data"] = attributeMap(
                        (attr_name, attr_value)
                        for attr_name, attr_value in token["data"].items()
                        if (
                            '"' not in attr_name
                            and "'" not in attr_name
                            and "<" not in attr_name
                        )
                    )
                    last_error_token = None
                    yield token

                elif (
                    last_error_token["data"] == "expected-closing-tag-but-got-char"
                    and self.parser.tags is not None
                    and token["data"].lower().strip() not in self.parser.tags
                ):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop, so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token["data"] = self.stream.get_tag()
                    token["type"] = TAG_TOKEN_TYPE_CHARACTERS

                    last_error_token = None
                    yield token

                elif token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                last_error_token = token
                continue

            yield token

        if last_error_token:
            if last_error_token["data"] == "eof-in-tag-name":
                # Handle the case where the text being parsed ends with <
                # followed by a series of characters. It's treated as a tag
                # name that abruptly ends, but we should treat that like
                # character data
                yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}

            elif last_error_token["data"] in (
                "duplicate-attribute",
                "eof-in-attribute-name",
                "eof-in-attribute-value-no-quotes",
                "expected-end-of-tag-but-got-eof",
            ):
                # Handle the case where the text being parsed ends with <
                # followed by characters and then space and then:
                #
                # * more characters
                # * more characters repeated with a space between (e.g. "abc abc")
                # * more characters and then a space and then an EOF (e.g. "abc def ")
                #
                # These cases are treated as a tag name followed by an
                # attribute that abruptly ends, but we should treat that like
                # character data instead.
                yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}

            else:
                yield last_error_token

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super().consumeEntity(allowedChar, fromAttribute)

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we put that back in
        # the stream.
        if fromAttribute:
            self.currentToken["data"][-1][1] += "&"

        else:
            self.tokenQueue.append({"type": TAG_TOKEN_TYPE_CHARACTERS, "data": "&"})

    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super().tagOpenState()

    def emitCurrentToken(self):
        token = self.currentToken

        if (
            self.parser.tags is not None
            and token["type"] in TAG_TOKEN_TYPES
            and token["name"].lower() not in self.parser.tags
        ):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                if (
                    self.emitted_last_token
                    and token["type"] == TAG_TOKEN_TYPE_START
                    and token["name"].lower() in HTML_TAGS_BLOCK_LEVEL
                ):
                    # If this is a block level tag we're stripping, we drop it
                    # for a newline because that's what a browser would parse
                    # it as
                    new_data = "\n"
                else:
                    # For all other things being stripped, we throw in an empty
                    # string token
                    new_data = ""

            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": new_data}

            self.currentToken = self.emitted_last_token = new_token
            self.tokenQueue.append(new_token)
            self.state = self.dataState
            return

        self.emitted_last_token = self.currentToken
        super().emitCurrentToken()

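# For example (a sketch, with parser.tags={"b"} and a disallowed <blink>
# start tag): strip=True replaces it with a Characters token of "" ("\n" for
# a block-level tag such as <div>), while strip=False replaces it with the
# verbatim "<blink>" text, which the sanitizer then escapes.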

class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""

    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: set of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        self.tags = (
            frozenset((tag.lower() for tag in tags)) if tags is not None else None
        )
        self.strip = strip
        self.consume_entities = consume_entities
        super().__init__(**kwargs)

    def _parse(
        self, stream, innerHTML=False, container="div", scripting=True, **kwargs
    ):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            self.reset()
            self.mainLoop()

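# A minimal usage sketch (the tag set here is just an example; extra kwargs
# are forwarded to html5lib's HTMLParser):
#
#     parser = BleachHTMLParser(
#         tags={"b", "i"},
#         strip=False,
#         consume_entities=False,
#         namespaceHTMLElements=False,
#     )
#     dom = parser.parseFragment("<b>safe</b> <blink>not safe</blink>")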

def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

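    Examples (a sketch of expected behavior)::

        convert_entity("#x26")   # "&" -- hex code point
        convert_entity("#38")    # "&" -- decimal code point
        convert_entity("amp")    # "&" -- named entity
        convert_entity("bogus")  # None -- not a known entity
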
    """
    if value[0] == "#":
        if len(value) < 2:
            return None

        if value[1] in ("x", "X"):
            # hex-encoded code point
            int_as_string, base = value[2:], 16
        else:
            # decimal code point
            int_as_string, base = value[1:], 10

        if int_as_string == "":
            return None

        code_point = int(int_as_string, base)
        if 0 < code_point < 0x110000:
            return chr(code_point)
        else:
            return None

    return ENTITIES.get(value, None)


def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

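    Example (a sketch)::

        convert_entities("tag &amp; stuff")  # "tag & stuff"
        convert_entities("&xx;")             # "&xx;" -- not an entity, left as is
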
    """
    if "&" not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith("&"):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return "".join(new_text)


def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with a
    ";". This ignores ambiguous character entities that have no ";" at the end.

    :arg stream: the character stream

    :returns: the entity string without "&" or ";" if it's a valid character
        entity; ``None`` otherwise

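    Example (a sketch)::

        match_entity("&amp; more")  # "amp"
        match_entity("&nope")       # None -- no terminating ";"
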
    """
    # Nix the & at the beginning
    if stream[0] != "&":
        raise ValueError('Stream should begin with "&"')

    stream = stream[1:]

    stream = list(stream)
    possible_entity = ""
    end_characters = "<&=;" + string.whitespace

    # Handle number entities
    if stream and stream[0] == "#":
        possible_entity = "#"
        stream.pop(0)

        if stream and stream[0] in ("x", "X"):
            allowed = "0123456789abcdefABCDEF"
            possible_entity += stream.pop(0)
        else:
            allowed = "0123456789"

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while stream and stream[0] not in end_characters:
            c = stream.pop(0)
            if c not in allowed:
                break
            possible_entity += c

        if possible_entity and stream and stream[0] == ";":
            return possible_entity
        return None

    # Handle character entities
    while stream and stream[0] not in end_characters:
        c = stream.pop(0)
        possible_entity += c
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            # If it's not a prefix, then it's not an entity and we're
            # out
            return None

    if possible_entity and stream and stream[0] == ";":
        return possible_entity

    return None


AMP_SPLIT_RE = re.compile("(&)")


def next_possible_entity(text):
    """Takes a text and generates the pieces that could each begin an entity

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

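    Example (a sketch)::

        list(next_possible_entity("a &amp; b"))  # ["a ", "&amp; b"]
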
    """
    for i, part in enumerate(AMP_SPLIT_RE.split(text)):
        if i == 0:
            yield part
        elif i % 2 == 0:
            yield "&" + part


class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes &amp; -> & in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    # Whether to escape characters that need to be
    # escaped within normal elements within rcdata elements such as
    # style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values"""
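        # For example (a sketch): given the attribute value text
        # 'x=1&y=2&amp;', this yields pieces that join to
        # 'x=1&amp;y=2&amp;' -- already-escaped entities survive and bare
        # ampersands get escaped.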
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expected the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        stoken = stoken.replace("&amp;", "&")

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith("&"):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield f"&{entity};"

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2 :]
                    if part:
                        yield part
                    continue

            yield part.replace("&", "&amp;")

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

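        For example (a sketch): serializing a tree for
        '<a href="http://example.com?key=value&key2=value">tag</a>'
        escapes the bare & in the href value to &amp;.
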
        """
        in_tag = False
        after_equals = False

        for stoken in super().serialize(treewalker, encoding):
            if in_tag:
                if stoken == ">":
                    in_tag = False

                elif after_equals:
                    if stoken != '"':
                        yield from self.escape_base_amp(stoken)

                        after_equals = False
                        continue

                elif stoken == "=":
                    after_equals = True

                yield stoken
            else:
                if stoken.startswith("<"):
                    in_tag = True
                yield stoken