
# flake8: noqa
"""
Shim module between Bleach and html5lib. This makes it easier to upgrade the
html5lib library without having to change a lot of code.
"""

import re
import string
import warnings

# ignore html5lib deprecation warnings to use bleach; we are bleach
# apply before we import submodules that import html5lib
warnings.filterwarnings(
    "ignore",
    message="html5lib's sanitizer is deprecated",
    category=DeprecationWarning,
    module="bleach._vendor.html5lib",
)

from bleach._vendor.html5lib import (  # noqa: E402 module level import not at top of file
    HTMLParser,
    getTreeWalker,
)
from bleach._vendor.html5lib import (
    constants,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.constants import (  # noqa: E402 module level import not at top of file
    namespaces,
    prefixes,
)
from bleach._vendor.html5lib.constants import (
    _ReparseException as ReparseException,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.base import (
    Filter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    allowed_protocols,
    allowed_css_properties,
    allowed_svg_properties,
    attr_val_is_uri,
    svg_attr_val_allows_ref,
    svg_allow_local_href,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
    Filter as SanitizerFilter,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._inputstream import (
    HTMLInputStream,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.serializer import (
    escape,
    HTMLSerializer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._tokenizer import (
    attributeMap,
    HTMLTokenizer,
)  # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib._trie import (
    Trie,
)  # noqa: E402 module level import not at top of file


#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)
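
# Illustrative examples (not part of the original module) showing the shape of
# these structures; has_keys_with_prefix() is the html5lib Trie method that
# match_entity() below relies on:
#
#   >>> ENTITIES["amp;"]
#   '&'
#   >>> ENTITIES_TRIE.has_keys_with_prefix("am")
#   True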

#: Token type constants--these never change
TAG_TOKEN_TYPES = {
    constants.tokenTypes["StartTag"],
    constants.tokenTypes["EndTag"],
    constants.tokenTypes["EmptyTag"],
}
TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"]
TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"]
TAG_TOKEN_TYPE_CHARACTERS = constants.tokenTypes["Characters"]
TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"]


#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = frozenset(
    (
        "a",
        "abbr",
        "address",
        "area",
        "article",
        "aside",
        "audio",
        "b",
        "base",
        "bdi",
        "bdo",
        "blockquote",
        "body",
        "br",
        "button",
        "canvas",
        "caption",
        "cite",
        "code",
        "col",
        "colgroup",
        "data",
        "datalist",
        "dd",
        "del",
        "details",
        "dfn",
        "dialog",
        "div",
        "dl",
        "dt",
        "em",
        "embed",
        "fieldset",
        "figcaption",
        "figure",
        "footer",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "head",
        "header",
        "hgroup",
        "hr",
        "html",
        "i",
        "iframe",
        "img",
        "input",
        "ins",
        "kbd",
        "keygen",
        "label",
        "legend",
        "li",
        "link",
        "map",
        "mark",
        "menu",
        "meta",
        "meter",
        "nav",
        "noscript",
        "object",
        "ol",
        "optgroup",
        "option",
        "output",
        "p",
        "param",
        "picture",
        "pre",
        "progress",
        "q",
        "rp",
        "rt",
        "ruby",
        "s",
        "samp",
        "script",
        "section",
        "select",
        "slot",
        "small",
        "source",
        "span",
        "strong",
        "style",
        "sub",
        "summary",
        "sup",
        "table",
        "tbody",
        "td",
        "template",
        "textarea",
        "tfoot",
        "th",
        "thead",
        "time",
        "title",
        "tr",
        "track",
        "u",
        "ul",
        "var",
        "video",
        "wbr",
    )
)


#: List of block level HTML tags, as per https://github.com/mozilla/bleach/issues/369
#: from mozilla on 2019.07.11
#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
HTML_TAGS_BLOCK_LEVEL = frozenset(
    (
        "address",
        "article",
        "aside",
        "blockquote",
        "details",
        "dialog",
        "dd",
        "div",
        "dl",
        "dt",
        "fieldset",
        "figcaption",
        "figure",
        "footer",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "header",
        "hgroup",
        "hr",
        "li",
        "main",
        "nav",
        "ol",
        "p",
        "pre",
        "section",
        "table",
        "ul",
    )
)


class InputStreamWithMemory:
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    """

    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        self._buffer = []

    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(list(chars))
        return chars

    def unget(self, char):
        if self._buffer:
            self._buffer.pop(-1)
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        return "".join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ["<"]
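
# A minimal sketch (not part of the original module) of how the memory buffer
# behaves, assuming a wrapped HTMLInputStream fed "<em>":
#
#   stream = InputStreamWithMemory(HTMLInputStream("<em>"))
#   stream.char()          # consumes "<"
#   stream.start_tag()     # tagOpenState() resets the buffer to ["<"]
#   stream.charsUntil(">")
#   stream.get_tag()       # "<em" -- everything seen since the last "<"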


class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""

    def __init__(self, consume_entities=False, **kwargs):
        super().__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

        # Remember the last token emitted; needed for block element spacing
        self.emitted_last_token = None

    def __iter__(self):
        last_error_token = None

        for token in super().__iter__():
            if last_error_token is not None:
                if (
                    last_error_token["data"] == "invalid-character-in-attribute-name"
                    and token["type"] in TAG_TOKEN_TYPES
                    and token.get("data")
                ):
                    # token["data"] is an html5lib attributeMap
                    # (OrderedDict 3.7+ and dict otherwise)
                    # of attr name to attr value
                    #
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token["data"] = attributeMap(
                        (attr_name, attr_value)
                        for attr_name, attr_value in token["data"].items()
                        if (
                            '"' not in attr_name
                            and "'" not in attr_name
                            and "<" not in attr_name
                        )
                    )
                    last_error_token = None
                    yield token

                elif (
                    last_error_token["data"] == "expected-closing-tag-but-got-char"
                    and self.parser.tags is not None
                    and token["data"].lower().strip() not in self.parser.tags
                ):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token["data"] = self.stream.get_tag()
                    token["type"] = TAG_TOKEN_TYPE_CHARACTERS

                    last_error_token = None
                    yield token

                elif token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                last_error_token = token
                continue

            yield token

        if last_error_token:
            if last_error_token["data"] == "eof-in-tag-name":
                # Handle the case where the text being parsed ends with <
                # followed by a series of characters. It's treated as a tag
                # name that abruptly ends, but we should treat that like
                # character data
                yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}

            elif last_error_token["data"] in (
                "duplicate-attribute",
                "eof-in-attribute-name",
                "eof-in-attribute-value-no-quotes",
                "expected-end-of-tag-but-got-eof",
            ):
                # Handle the case where the text being parsed ends with <
                # followed by characters and then space and then:
                #
                # * more characters
                # * more characters repeated with a space between (e.g. "abc abc")
                # * more characters and then a space and then an EOF (e.g. "abc def ")
                #
                # These cases are treated as a tag name followed by an
                # attribute that abruptly ends, but we should treat that like
                # character data instead.
                yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}

            else:
                yield last_error_token

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super().consumeEntity(allowedChar, fromAttribute)

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we put that back in
        # the stream.
        if fromAttribute:
            self.currentToken["data"][-1][1] += "&"

        else:
            self.tokenQueue.append({"type": TAG_TOKEN_TYPE_CHARACTERS, "data": "&"})

    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super().tagOpenState()

    def emitCurrentToken(self):
        token = self.currentToken

        if (
            self.parser.tags is not None
            and token["type"] in TAG_TOKEN_TYPES
            and token["name"].lower() not in self.parser.tags
        ):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                if (
                    self.emitted_last_token
                    and token["type"] == TAG_TOKEN_TYPE_START
                    and token["name"].lower() in HTML_TAGS_BLOCK_LEVEL
                ):
                    # If this is a block level tag we're stripping, we drop it
                    # for a newline because that's what a browser would parse
                    # it as
                    new_data = "\n"
                else:
                    # For all other things being stripped, we throw in an empty
                    # string token
                    new_data = ""

            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": new_data}

            self.currentToken = self.emitted_last_token = new_token
            self.tokenQueue.append(new_token)
            self.state = self.dataState
            return

        self.emitted_last_token = self.currentToken
        super().emitCurrentToken()
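
# The net effect of the tokenizer tweaks above (an illustrative note, not part
# of the original module): through bleach.clean(), a stray "<" that never
# becomes a valid tag is emitted as character data and escaped rather than
# swallowed as a bogus comment, e.g. clean("x<y") produces "x&lt;y".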


class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""

    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: set of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        self.tags = (
            frozenset((tag.lower() for tag in tags)) if tags is not None else None
        )
        self.strip = strip
        self.consume_entities = consume_entities
        super().__init__(**kwargs)

    def _parse(
        self, stream, innerHTML=False, container="div", scripting=True, **kwargs
    ):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            self.reset()
            self.mainLoop()
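
# A minimal usage sketch (not part of the original module), roughly how
# bleach's Cleaner wires the parser up; the argument values here are
# assumptions for illustration:
#
#   parser = BleachHTMLParser(
#       tags={"p", "a"},           # allowed tags
#       strip=False,               # escape disallowed tags instead of stripping
#       consume_entities=False,    # leave "&amp;" and friends as-is
#       namespaceHTMLElements=False,
#   )
#   dom = parser.parseFragment("<p>an <script>evil()</script> example</p>")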


def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if value[0] == "#":
        if len(value) < 2:
            return None

        if value[1] in ("x", "X"):
            # hex-encoded code point
            int_as_string, base = value[2:], 16
        else:
            # decimal code point
            int_as_string, base = value[1:], 10

        if int_as_string == "":
            return None

        code_point = int(int_as_string, base)
        if 0 < code_point < 0x110000:
            return chr(code_point)
        else:
            return None

    return ENTITIES.get(value, None)
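
# Illustrative examples (not part of the original module):
#
#   >>> convert_entity("#65")      # decimal code point
#   'A'
#   >>> convert_entity("#x41")     # hex code point
#   'A'
#   >>> convert_entity("amp")      # named entity
#   '&'
#   >>> convert_entity("notanentity") is None
#   True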


def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    if "&" not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith("&"):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    # Skip past the entity: len(entity) plus 2 for the "&"
                    # and ";" that delimit it
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return "".join(new_text)
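
# Illustrative example (not part of the original module): known entities are
# converted, ambiguous ampersands are left alone:
#
#   >>> convert_entities("&amp; &#65; &unknown;")
#   '& A &unknown;'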


def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with a
    ";". This ignores ambiguous character entities that have no ";" at the end.

    :arg stream: the character stream

    :returns: the entity string without "&" or ";" if it's a valid character
        entity; ``None`` otherwise

    """
    # Nix the & at the beginning
    if stream[0] != "&":
        raise ValueError('Stream should begin with "&"')

    stream = stream[1:]

    stream = list(stream)
    possible_entity = ""
    end_characters = "<&=;" + string.whitespace

    # Handle number entities
    if stream and stream[0] == "#":
        possible_entity = "#"
        stream.pop(0)

        if stream and stream[0] in ("x", "X"):
            allowed = "0123456789abcdefABCDEF"
            possible_entity += stream.pop(0)
        else:
            allowed = "0123456789"

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while stream and stream[0] not in end_characters:
            c = stream.pop(0)
            if c not in allowed:
                break
            possible_entity += c

        if possible_entity and stream and stream[0] == ";":
            return possible_entity
        return None

    # Handle character entities
    while stream and stream[0] not in end_characters:
        c = stream.pop(0)
        possible_entity += c
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            # If it's not a prefix, then it's not an entity and we're out
            return None

    if possible_entity and stream and stream[0] == ";":
        return possible_entity

    return None
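
# Illustrative examples (not part of the original module):
#
#   >>> match_entity("&amp; and more")
#   'amp'
#   >>> match_entity("&#x41;")
#   '#x41'
#   >>> match_entity("&ampere") is None    # no terminating ";"
#   True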


AMP_SPLIT_RE = re.compile("(&)")


def next_possible_entity(text):
    """Takes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    for i, part in enumerate(AMP_SPLIT_RE.split(text)):
        if i == 0:
            yield part
        elif i % 2 == 0:
            yield "&" + part
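
# Illustrative example (not part of the original module): splitting on "&"
# keeps the text between ampersands intact, so each yielded part after the
# first begins with "&":
#
#   >>> list(next_possible_entity("this &amp; that &copy;"))
#   ['this ', '&amp; that ', '&copy;']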


class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    # Whether to escape characters that need to be
    # escaped within normal elements within rcdata elements such as
    # style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values"""
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expected the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        stoken = stoken.replace("&amp;", "&")

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith("&"):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield f"&{entity};"

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2 :]
                    if part:
                        yield part
                    continue

            yield part.replace("&", "&amp;")

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        in_tag = False
        after_equals = False

        for stoken in super().serialize(treewalker, encoding):
            if in_tag:
                if stoken == ">":
                    in_tag = False

                elif after_equals:
                    if stoken != '"':
                        yield from self.escape_base_amp(stoken)

                        after_equals = False
                        continue

                elif stoken == "=":
                    after_equals = True

                yield stoken
            else:
                if stoken.startswith("<"):
                    in_tag = True
                yield stoken
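
# A minimal end-to-end sketch (not part of the original module), assuming
# wiring similar to bleach's own: parse a fragment, walk it, and serialize.
# The serializer arguments shown are illustrative, not exhaustive:
#
#   parser = BleachHTMLParser(
#       tags=None, strip=False, consume_entities=False, namespaceHTMLElements=False
#   )
#   dom = parser.parseFragment('<a href="/?a=1&b=2">link & text</a>')
#   walker = getTreeWalker("etree")
#   serializer = BleachHTMLSerializer(
#       quote_attr_values="always",
#       omit_optional_tags=False,
#       resolve_entities=False,
#   )
#   serializer.render(walker(dom))
#   # -> '<a href="/?a=1&amp;b=2">link &amp; text</a>'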