Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bleach/html5lib_shim.py: 96%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

247 statements  

1# flake8: noqa 

2""" 

3Shim module between Bleach and html5lib. This makes it easier to upgrade the 

4html5lib library without having to change a lot of code. 

5""" 

6 

7import re 

8import string 

9import warnings 

10 

11# ignore html5lib deprecation warnings to use bleach; we are bleach 

12# apply before we import submodules that import html5lib 

13warnings.filterwarnings( 

14 "ignore", 

15 message="html5lib's sanitizer is deprecated", 

16 category=DeprecationWarning, 

17 module="bleach._vendor.html5lib", 

18) 

19 

20from bleach._vendor.html5lib import ( # noqa: E402 module level import not at top of file 

21 HTMLParser, 

22 getTreeWalker, 

23) 

24from bleach._vendor.html5lib import ( 

25 constants, 

26) # noqa: E402 module level import not at top of file 

27from bleach._vendor.html5lib.constants import ( # noqa: E402 module level import not at top of file 

28 namespaces, 

29 prefixes, 

30) 

31from bleach._vendor.html5lib.constants import ( 

32 _ReparseException as ReparseException, 

33) # noqa: E402 module level import not at top of file 

34from bleach._vendor.html5lib.filters.base import ( 

35 Filter, 

36) # noqa: E402 module level import not at top of file 

37from bleach._vendor.html5lib.filters.sanitizer import ( 

38 allowed_protocols, 

39 allowed_css_properties, 

40 allowed_svg_properties, 

41 attr_val_is_uri, 

42 svg_attr_val_allows_ref, 

43 svg_allow_local_href, 

44) # noqa: E402 module level import not at top of file 

45from bleach._vendor.html5lib.filters.sanitizer import ( 

46 Filter as SanitizerFilter, 

47) # noqa: E402 module level import not at top of file 

48from bleach._vendor.html5lib._inputstream import ( 

49 HTMLInputStream, 

50) # noqa: E402 module level import not at top of file 

51from bleach._vendor.html5lib.serializer import ( 

52 escape, 

53 HTMLSerializer, 

54) # noqa: E402 module level import not at top of file 

55from bleach._vendor.html5lib._tokenizer import ( 

56 attributeMap, 

57 HTMLTokenizer, 

58) # noqa: E402 module level import not at top of file 

59from bleach._vendor.html5lib._trie import ( 

60 Trie, 

61) # noqa: E402 module level import not at top of file 

62 

63 

#: Mapping of entity name -> expanded entity text
ENTITIES = constants.entities

#: Trie built from ENTITIES, used for prefix lookups on entity names
ENTITIES_TRIE = Trie(ENTITIES)

#: Individual token type constants--these never change
TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"]
TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"]
TAG_TOKEN_TYPE_CHARACTERS = constants.tokenTypes["Characters"]
TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"]

#: The token types that represent a tag (start, end, or empty/self-closing)
TAG_TOKEN_TYPES = {
    TAG_TOKEN_TYPE_START,
    TAG_TOKEN_TYPE_END,
    constants.tokenTypes["EmptyTag"],
}

80 

81 

#: Set of valid HTML tag names, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = frozenset(
    """
    a abbr address area article aside audio
    b base bdi bdo blockquote body br button
    canvas caption cite code col colgroup
    data datalist dd del details dfn dialog div dl dt
    em embed
    fieldset figcaption figure footer form
    h1 h2 h3 h4 h5 h6 head header hgroup hr html
    i iframe img input ins
    kbd keygen
    label legend li link
    map mark menu meta meter
    nav noscript
    object ol optgroup option output
    p param picture pre progress
    q
    rp rt ruby
    s samp script section select slot small source span strong style sub
    summary sup
    table tbody td template textarea tfoot th thead time title tr track
    u ul
    var video
    wbr
    """.split()
)

200 

201 

#: Set of block level HTML tag names, as per
#: https://github.com/mozilla/bleach/issues/369
#: from mozilla on 2019.07.11
#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
HTML_TAGS_BLOCK_LEVEL = frozenset(
    """
    address article aside blockquote
    details dialog dd div dl dt
    fieldset figcaption figure footer form
    h1 h2 h3 h4 h5 h6 header hgroup hr
    li main nav ol p pre section table ul
    """.split()
)

242 

243 

class InputStreamWithMemory:
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    Every character handed out through ``char()`` / ``charsUntil()`` is
    appended to an internal history buffer; ``unget()`` removes the most recent
    one again. ``start_tag()`` resets the history and ``get_tag()`` returns it
    as a string.

    """

    def __init__(self, inner_stream):
        """
        :arg inner_stream: the stream object to wrap; its ``reset`` and
            ``position`` attributes are re-exported unchanged on this wrapper

        """
        self._inner_stream = inner_stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        # Characters consumed since the last start_tag() call
        self._buffer = []

    @property
    def errors(self):
        """The wrapped stream's ``errors`` attribute"""
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        """The wrapped stream's ``charEncoding`` attribute"""
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        """The wrapped stream's ``changeEncoding`` attribute"""
        return self._inner_stream.changeEncoding

    def char(self):
        """Read one character from the wrapped stream and record it

        :returns: whatever the wrapped stream's ``char()`` returned

        """
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        """Read a run of characters from the wrapped stream and record them

        Arguments are passed straight through to the wrapped stream's
        ``charsUntil()``.

        :returns: the characters the wrapped stream returned

        """
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(chars)
        return chars

    def unget(self, char):
        """Push ``char`` back onto the wrapped stream and forget it

        :arg char: the character to push back; may be the falsy EOF marker
            that ``char()`` returned

        :returns: whatever the wrapped stream's ``unget()`` returned

        """
        # char() never records a falsy value (EOF), so pushing one back must
        # not remove a real character from the history.
        if char and self._buffer:
            self._buffer.pop()
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        return "".join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ["<"]

305 

306 

class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities

    On top of the base tokenizer this class:

    * wraps the input stream in ``InputStreamWithMemory`` so the raw text of
      the tag currently being tokenized can be recovered,
    * post-processes the token stream in ``__iter__`` to repair or re-type
      tokens that follow certain parse errors,
    * turns tags that are not in ``self.parser.tags`` into Characters tokens
      in ``emitCurrentToken``.

    """

    def __init__(self, consume_entities=False, **kwargs):
        """
        :arg consume_entities: if true, defer to the base class's
            ``consumeEntity``; otherwise entities are left unconverted
        :arg kwargs: passed through to the base tokenizer's ``__init__``

        """
        super().__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

        # Remember the last token emitted; needed for block element spacing
        self.emitted_last_token = None

    def __iter__(self):
        """Yield tokens from the base tokenizer, fixing up some parse errors

        A ParseError token is held back (not yielded immediately) so that it
        can be examined together with the token that follows it.

        """
        # The most recent ParseError token that has been held back, if any
        last_error_token = None

        for token in super().__iter__():
            if last_error_token is not None:
                if (
                    last_error_token["data"] == "invalid-character-in-attribute-name"
                    and token["type"] in TAG_TOKEN_TYPES
                    and token.get("data")
                ):
                    # token["data"] is an html5lib attributeMap
                    # (OrderedDict 3.7+ and dict otherwise)
                    # of attr name to attr value
                    #
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token["data"] = attributeMap(
                        (attr_name, attr_value)
                        for attr_name, attr_value in token["data"].items()
                        if (
                            '"' not in attr_name
                            and "'" not in attr_name
                            and "<" not in attr_name
                        )
                    )
                    # The held ParseError is dropped here; only the cleaned
                    # tag token is yielded.
                    last_error_token = None
                    yield token

                elif (
                    last_error_token["data"]
                    in (
                        "invalid-character-in-attribute-name",
                        "invalid-character-after-attribute-name",
                    )
                    and token["type"] == TAG_TOKEN_TYPE_CHARACTERS
                    and token.get("data")
                    and " " in token["data"]
                ):
                    # token["data"] has something that starts with a left angle
                    # bracket, then has some characters followed by a space
                    # followed by another left angle bracket and ending with
                    # a right angle bracket. That part could be a real tag, so
                    # we don't want it to get treated as Characters. For
                    # example, something in this shape: <nottag <...>
                    # If so, we want to take off the first bit that is
                    # definitely not a tag and reparse the rest.
                    #
                    # NOTE(review): last_error_token is not cleared in this
                    # branch, so the same held ParseError is compared against
                    # the next token as well -- confirm this is intended.
                    head, rest = token["data"].split(" ", 1)
                    if rest.strip().startswith("<"):
                        # yield the not-a-tag plus the space we split on
                        token["data"] = head + " "
                        yield token

                        # shove the rest back in the stream for the parser to
                        # look at; reversed so the characters come back out in
                        # their original order
                        for c in reversed(rest):
                            self.stream.unget(c)
                    else:
                        yield token

                elif (
                    last_error_token["data"] == "expected-closing-tag-but-got-char"
                    and self.parser.tags is not None
                    and token["data"].lower().strip() not in self.parser.tags
                ):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token["data"] = self.stream.get_tag()
                    token["type"] = TAG_TOKEN_TYPE_CHARACTERS

                    last_error_token = None
                    yield token

                elif token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    # Nothing to fix: release the held error, then the token
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                last_error_token = token
                continue

            yield token

        # The base tokenizer is exhausted; deal with a ParseError that is
        # still being held.
        if last_error_token:
            if last_error_token["data"] == "eof-in-tag-name":
                # Handle the case where the text being parsed ends with <
                # followed by a series of characters. It's treated as a tag
                # name that abruptly ends, but we should treat that like
                # character data
                yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}

            elif last_error_token["data"] in (
                "duplicate-attribute",
                "eof-in-attribute-name",
                "eof-in-attribute-value-no-quotes",
                "expected-end-of-tag-but-got-eof",
            ):
                # Handle the case where the text being parsed ends with <
                # followed by characters and then space and then:
                #
                # * more characters
                # * more characters repeated with a space between (e.g. "abc abc")
                # * more characters and then a space and then an EOF (e.g. "abc def ")
                #
                # These cases are treated as a tag name followed by an
                # attribute that abruptly ends, but we should treat that like
                # character data instead.
                yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}

            else:
                yield last_error_token

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        """Leave entities unconverted unless ``consume_entities`` is set

        :arg allowedChar: passed through to the base class when entities are
            being consumed; unused otherwise
        :arg fromAttribute: whether the ``&`` was seen inside an attribute
            value (True) or in character data (False)

        """
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super().consumeEntity(allowedChar, fromAttribute)

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we re-emit
        # that & as literal text.
        if fromAttribute:
            # Append to the value of the last attribute on the current token
            self.currentToken["data"][-1][1] += "&"

        else:
            self.tokenQueue.append({"type": TAG_TOKEN_TYPE_CHARACTERS, "data": "&"})

    def tagOpenState(self):
        """Reset the stream history, then run the base ``tagOpenState``"""
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super().tagOpenState()

    def emitCurrentToken(self):
        """Emit the current token, converting disallowed tags to Characters

        If ``self.parser.tags`` is not None and the current token is a tag
        whose lowercased name is not in it, a Characters token is queued
        instead and the base implementation is not called.

        """
        token = self.currentToken

        if (
            self.parser.tags is not None
            and token["type"] in TAG_TOKEN_TYPES
            and token["name"].lower() not in self.parser.tags
        ):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                if (
                    self.emitted_last_token
                    and token["type"] == TAG_TOKEN_TYPE_START
                    and token["name"].lower() in HTML_TAGS_BLOCK_LEVEL
                ):
                    # If this is a block level tag we're stripping, we drop it
                    # for a newline because that's what a browser would parse
                    # it as
                    new_data = "\n"
                else:
                    # For all other things being stripped, we throw in an empty
                    # string token
                    new_data = ""

            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": new_data}

            self.currentToken = self.emitted_last_token = new_token
            self.tokenQueue.append(new_token)
            # Continue tokenizing in the data state after the replaced tag
            self.state = self.dataState
            return

        self.emitted_last_token = self.currentToken
        super().emitCurrentToken()

521 

522 

class BleachHTMLParser(HTMLParser):
    """HTMLParser variant that tokenizes with BleachHTMLTokenizer"""

    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: iterable of allowed tag names; anything else is stripped
            or escaped. Pass None to skip tag checks entirely.
        :arg strip: True to strip disallowed tags, False to escape them;
            irrelevant when ``tags`` is None
        :arg consume_entities: True to consume entities while tokenizing (the
            default html5lib behavior), False to leave them as is (the
            behavior BleachHTMLTokenizer adds)

        """
        # Tag names are compared case-insensitively, so store them lowercased
        if tags is None:
            self.tags = None
        else:
            self.tags = frozenset(tag.lower() for tag in tags)
        self.strip = strip
        self.consume_entities = consume_entities
        super().__init__(**kwargs)

    def _parse(
        self, stream, innerHTML=False, container="div", scripting=True, **kwargs
    ):
        # Overrides HTMLParser._parse so the tokenizer can be swapped out for
        # BleachHTMLTokenizer.
        #
        # scripting defaults to True so <noscript> is parsed as though JS is
        # enabled, matching the expected context in browsers:
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting

        tokenizer_args = dict(
            stream=stream, consume_entities=self.consume_entities, parser=self
        )
        self.tokenizer = BleachHTMLTokenizer(**tokenizer_args, **kwargs)
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            # Start over from a clean state and run the main loop once more
            self.reset()
            self.mainLoop()

565 

566 

def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    Numeric entities must consist solely of ASCII digits (hex digits after a
    leading ``x``/``X``); anything else is treated as not an entity rather than
    raising.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if not value:
        return None

    if value[0] == "#":
        if value[1:2] in ("x", "X"):
            # hex-encoded code point
            int_as_string, base, allowed = value[2:], 16, string.hexdigits
        else:
            # decimal code point
            int_as_string, base, allowed = value[1:], 10, string.digits

        if not int_as_string:
            return None

        # int() also accepts signs, underscores, surrounding whitespace and
        # non-ASCII digits, none of which are valid in a character reference.
        if any(ch not in allowed for ch in int_as_string):
            return None

        # The largest code point (0x10FFFF) has 7 significant digits in either
        # base, so anything longer is out of range. Checking first also avoids
        # the ValueError int() raises for very long decimal strings.
        if len(int_as_string.lstrip("0")) > 7:
            return None

        code_point = int(int_as_string, base)
        if 0 < code_point < 0x110000:
            return chr(code_point)
        return None

    return ENTITIES.get(value, None)

599 

600 

def convert_entities(text):
    """Converts all found entities in the text

    Entities that cannot be matched or converted are left in the text as is.

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    if "&" not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith("&"):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return "".join(new_text)

634 

635 

def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with a
    ";". This ignores ambiguous character entities that have no ";" at the end.

    For numeric entities, the digits must run right up to the ";"; something
    like "&#12a;" is not an entity. The digits themselves are not validated
    further, so "&#;" yields "#" and "&#x;" yields "#x".

    For named entities, the name only has to be a prefix of some known entity
    name; callers are expected to check the result with ``convert_entity``.

    :arg stream: the character stream

    :returns: the entity string without "&" or ";" if it's a valid character
        entity; ``None`` otherwise

    :raises ValueError: if ``stream`` does not begin with "&"

    """
    if not stream or stream[0] != "&":
        raise ValueError('Stream should begin with "&"')

    end_characters = "<&=;" + string.whitespace
    length = len(stream)

    # Index of the character being examined; start just past the "&"
    pos = 1

    # Handle number entities
    if pos < length and stream[pos] == "#":
        pos += 1

        if pos < length and stream[pos] in ("x", "X"):
            allowed = "0123456789abcdefABCDEF"
            pos += 1
        else:
            allowed = "0123456789"

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        #
        # Stop at (without consuming) the first character that is not a digit,
        # so that a stray character before the ";" prevents a match.
        while pos < length and stream[pos] in allowed:
            pos += 1

        if pos < length and stream[pos] == ";":
            return "".join(stream[1:pos])
        return None

    # Handle character entities
    possible_entity = ""
    while pos < length and stream[pos] not in end_characters:
        possible_entity += stream[pos]
        pos += 1
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            # If it's not a prefix, then it's not an entity and we're
            # out
            return None

    if possible_entity and pos < length and stream[pos] == ";":
        return possible_entity

    return None

694 

695 

# Splits on "&" while keeping each "&" as its own item in the result
AMP_SPLIT_RE = re.compile("(&)")


def next_possible_entity(text):
    """Split a text into pieces that each begin at an ampersand

    :arg text: the text to look at

    :returns: generator yielding the text before the first "&" (possibly
        empty), followed by one piece per "&" in the text, each starting
        with that "&"

    """
    # The capture group makes split() return
    # [before, "&", chunk, "&", chunk, ...], so the chunks that follow an
    # ampersand sit at the even indexes from 2 onwards.
    pieces = AMP_SPLIT_RE.split(text)
    yield pieces[0]
    for chunk in pieces[2::2]:
        yield "&" + chunk

713 

714 

class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    # Whether to escape characters that need to be
    # escaped within normal elements within rcdata elements such as
    # style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values"""
        # html5lib's HTMLSerializer expects the tokenizer to have consumed all
        # character entities and converted them to their respective
        # characters, but BleachHTMLTokenizer doesn't do that. So start by
        # undoing the escaping of &; for example, this turns &amp;entity;
        # back into &entity; .
        unescaped = stoken.replace("&amp;", "&")

        # Then go through piece by piece and re-escape every & that does not
        # begin an unambiguous character entity.
        for chunk in next_possible_entity(unescaped):
            if not chunk:
                continue

            if chunk.startswith("&"):
                name = match_entity(chunk)
                # Ambiguous entities don't count; their ampersand is escaped
                # below along with the rest of the chunk.
                if name is not None and convert_entity(name) is not None:
                    yield f"&{name};"

                    # Skip past the entity: its name plus the leading & and
                    # the trailing ;
                    tail = chunk[len(name) + 2 :]
                    if tail:
                        yield tail
                    continue

            yield chunk.replace("&", "&amp;")

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        # Whether we're between a piece starting with "<" and the ">" piece
        inside_tag = False
        # Whether an "=" has been seen and the attribute value is still to come
        expecting_value = False

        for piece in super().serialize(treewalker, encoding):
            if not inside_tag:
                if piece.startswith("<"):
                    inside_tag = True
                yield piece
                continue

            if piece == ">":
                inside_tag = False

            elif expecting_value:
                # A '"' piece is passed through untouched and we keep waiting;
                # anything else is treated as the value and gets its
                # ampersands fixed up.
                if piece != '"':
                    yield from self.escape_base_amp(piece)
                    expecting_value = False
                    continue

            elif piece == "=":
                expecting_value = True

            yield piece
788 yield stoken