Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bleach/html5lib

1# flake8: noqa

2"""

3Shim module between Bleach and html5lib. This makes it easier to upgrade the

4html5lib library without having to change a lot of code.

5"""

7import re

8import string

9import warnings

11# ignore html5lib deprecation warnings to use bleach; we are bleach

12# apply before we import submodules that import html5lib

13warnings.filterwarnings(

14 "ignore",

15 message="html5lib's sanitizer is deprecated",

16 category=DeprecationWarning,

17 module="bleach._vendor.html5lib",

18)

20from bleach._vendor.html5lib import ( # noqa: E402 module level import not at top of file

21 HTMLParser,

22 getTreeWalker,

23)

24from bleach._vendor.html5lib import (

25 constants,

26) # noqa: E402 module level import not at top of file

27from bleach._vendor.html5lib.constants import ( # noqa: E402 module level import not at top of file

28 namespaces,

29 prefixes,

30)

31from bleach._vendor.html5lib.constants import (

32 _ReparseException as ReparseException,

33) # noqa: E402 module level import not at top of file

34from bleach._vendor.html5lib.filters.base import (

35 Filter,

36) # noqa: E402 module level import not at top of file

37from bleach._vendor.html5lib.filters.sanitizer import (

38 allowed_protocols,

39 allowed_css_properties,

40 allowed_svg_properties,

41 attr_val_is_uri,

42 svg_attr_val_allows_ref,

43 svg_allow_local_href,

44) # noqa: E402 module level import not at top of file

45from bleach._vendor.html5lib.filters.sanitizer import (

46 Filter as SanitizerFilter,

47) # noqa: E402 module level import not at top of file

48from bleach._vendor.html5lib._inputstream import (

49 HTMLInputStream,

50) # noqa: E402 module level import not at top of file

51from bleach._vendor.html5lib.serializer import (

52 escape,

53 HTMLSerializer,

54) # noqa: E402 module level import not at top of file

55from bleach._vendor.html5lib._tokenizer import (

56 attributeMap,

57 HTMLTokenizer,

58) # noqa: E402 module level import not at top of file

59from bleach._vendor.html5lib._trie import (

60 Trie,

61) # noqa: E402 module level import not at top of file

#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)

#: Token type constants--these never change
TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"]
TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"]
TAG_TOKEN_TYPE_CHARACTERS = constants.tokenTypes["Characters"]
TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"]

#: The token types that denote a tag: start, end and empty tags
TAG_TOKEN_TYPES = {
    TAG_TOKEN_TYPE_START,
    TAG_TOKEN_TYPE_END,
    constants.tokenTypes["EmptyTag"],
}

#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
#:
#: "main" is part of that index (and of HTML_TAGS_BLOCK_LEVEL), so it is
#: listed here as well. "keygen" is kept for backwards compatibility.
HTML_TAGS = frozenset(
    (
        "a",
        "abbr",
        "address",
        "area",
        "article",
        "aside",
        "audio",
        "b",
        "base",
        "bdi",
        "bdo",
        "blockquote",
        "body",
        "br",
        "button",
        "canvas",
        "caption",
        "cite",
        "code",
        "col",
        "colgroup",
        "data",
        "datalist",
        "dd",
        "del",
        "details",
        "dfn",
        "dialog",
        "div",
        "dl",
        "dt",
        "em",
        "embed",
        "fieldset",
        "figcaption",
        "figure",
        "footer",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "head",
        "header",
        "hgroup",
        "hr",
        "html",
        "i",
        "iframe",
        "img",
        "input",
        "ins",
        "kbd",
        "keygen",
        "label",
        "legend",
        "li",
        "link",
        "main",
        "map",
        "mark",
        "menu",
        "meta",
        "meter",
        "nav",
        "noscript",
        "object",
        "ol",
        "optgroup",
        "option",
        "output",
        "p",
        "param",
        "picture",
        "pre",
        "progress",
        "q",
        "rp",
        "rt",
        "ruby",
        "s",
        "samp",
        "script",
        "section",
        "select",
        "slot",
        "small",
        "source",
        "span",
        "strong",
        "style",
        "sub",
        "summary",
        "sup",
        "table",
        "tbody",
        "td",
        "template",
        "textarea",
        "tfoot",
        "th",
        "thead",
        "time",
        "title",
        "tr",
        "track",
        "u",
        "ul",
        "var",
        "video",
        "wbr",
    )
)

200

201

#: List of block level HTML tags, as per https://github.com/mozilla/bleach/issues/369
#: from mozilla on 2019.07.11
#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
HTML_TAGS_BLOCK_LEVEL = frozenset(
    {
        # sectioning and grouping content
        "address", "article", "aside", "blockquote", "details", "dialog",
        "dd", "div", "dl", "dt",
        "fieldset", "figcaption", "figure", "footer", "form",
        # headings
        "h1", "h2", "h3", "h4", "h5", "h6",
        "header", "hgroup", "hr",
        # lists, paragraphs and the rest
        "li", "main", "nav", "ol", "p", "pre", "section", "table", "ul",
    }
)

242

243

class InputStreamWithMemory:
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    :arg inner_stream: the stream object being wrapped; reads, ungets and the
        ``errors`` / ``charEncoding`` / ``changeEncoding`` attributes are all
        delegated to it

    """

    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        # Re-export these two attributes of the wrapped stream unchanged
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        # Characters read since the last call to start_tag()
        self._buffer = []

    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        """Read one character from the wrapped stream and record it

        :returns: whatever the wrapped stream's ``char()`` returned

        """
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        """Delegate to the wrapped stream's ``charsUntil`` and record the result

        :returns: the characters returned by the wrapped stream

        """
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(chars)
        return chars

    def unget(self, char):
        """Push ``char`` back onto the wrapped stream and forget it

        A falsy ``char`` (e.g. ``None`` for EOF) was never recorded by
        ``char()``, so ungetting it must not drop a real character from the
        history.

        """
        if char and self._buffer:
            self._buffer.pop()
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        return "".join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ["<"]

305

306

class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""

    def __init__(self, consume_entities=False, **kwargs):
        super().__init__(**kwargs)

        self.consume_entities = consume_entities

        # Replace the stream with a wrapper that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

        # Last token emitted; needed for block element spacing
        self.emitted_last_token = None

    def __iter__(self):
        # A ParseError token is held back for one iteration so that the token
        # following it can be inspected and, where needed, repaired.
        held_error = None

        for token in super().__iter__():
            if held_error is None:
                if token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                    # Hold on to the error so we can look at the next token
                    # and potentially fix it.
                    held_error = token
                else:
                    yield token
                continue

            error_code = held_error["data"]

            if (
                error_code == "invalid-character-in-attribute-name"
                and token["type"] in TAG_TOKEN_TYPES
                and token.get("data")
            ):
                # token["data"] is an html5lib attributeMap
                # (OrderedDict 3.7+ and dict otherwise)
                # of attr name to attr value
                #
                # Remove attribute names that have ', " or < in them
                # because those characters are invalid for attribute names.
                token["data"] = attributeMap(
                    (name, value)
                    for name, value in token["data"].items()
                    if not ('"' in name or "'" in name or "<" in name)
                )
                held_error = None
                yield token

            elif (
                error_code
                in (
                    "invalid-character-in-attribute-name",
                    "invalid-character-after-attribute-name",
                )
                and token["type"] == TAG_TOKEN_TYPE_CHARACTERS
                and token.get("data")
                and " " in token["data"]
            ):
                # token["data"] has something that starts with a left angle
                # bracket, then has some characters followed by a space
                # followed by another left angle bracket and ending with
                # a right angle bracket. That part could be a real tag, so
                # we don't want it to get treated as Characters. For
                # example, something in this shape: <nottag <...>
                # If so, we want to take off the first bit that is
                # definitely not a tag and reparse the rest.
                #
                # The held parse error is not cleared in this branch, so it
                # is still pending when the next token arrives.
                head, rest = token["data"].split(" ", 1)
                if rest.strip().startswith("<"):
                    # yield the not-a-tag plus the space we split on
                    token["data"] = head + " "
                    yield token

                    # shove the rest back in the stream for the parser to
                    # look at
                    for c in reversed(rest):
                        self.stream.unget(c)
                else:
                    yield token

            elif (
                error_code == "expected-closing-tag-but-got-char"
                and self.parser.tags is not None
                and token["data"].lower().strip() not in self.parser.tags
            ):
                # We've got either a malformed tag or a pseudo-tag or
                # something that html5lib wants to turn into a malformed
                # comment which Bleach clean() will drop so we interfere
                # with the token stream to handle it more correctly.
                #
                # If this is an allowed tag, it's malformed and we just let
                # the html5lib parser deal with it--we don't enter into this
                # block.
                #
                # If this is not an allowed tag, then we convert it to
                # characters and it'll get escaped in the sanitizer.
                token["data"] = self.stream.get_tag()
                token["type"] = TAG_TOKEN_TYPE_CHARACTERS

                held_error = None
                yield token

            elif token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                # If the token is a parse error, then let the held error
                # go, and hold the new one instead
                yield held_error
                held_error = token

            else:
                yield held_error
                yield token
                held_error = None

        if held_error:
            if held_error["data"] in (
                # The text being parsed ends with < followed by a series of
                # characters. It's treated as a tag name that abruptly ends,
                # but we should treat that like character data.
                "eof-in-tag-name",
                # The text being parsed ends with < followed by characters
                # and then space and then:
                #
                # * more characters
                # * more characters repeated with a space between (e.g. "abc abc")
                # * more characters and then a space and then an EOF (e.g. "abc def ")
                #
                # These cases are treated as a tag name followed by an
                # attribute that abruptly ends, but we should treat that like
                # character data instead.
                "duplicate-attribute",
                "eof-in-attribute-name",
                "eof-in-attribute-value-no-quotes",
                "expected-end-of-tag-but-got-eof",
            ):
                yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}

            else:
                yield held_error

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super().consumeEntity(allowedChar, fromAttribute)

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we put that back in
        # the stream.
        if fromAttribute:
            self.currentToken["data"][-1][1] += "&"
        else:
            self.tokenQueue.append({"type": TAG_TOKEN_TYPE_CHARACTERS, "data": "&"})

    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super().tagOpenState()

    def emitCurrentToken(self):
        token = self.currentToken

        is_disallowed_tag = (
            self.parser.tags is not None
            and token["type"] in TAG_TOKEN_TYPES
            and token["name"].lower() not in self.parser.tags
        )
        if not is_disallowed_tag:
            self.emitted_last_token = self.currentToken
            super().emitCurrentToken()
            return

        # This is a start/end/empty tag for a tag that's not in our allowed
        # list, so it gets stripped or escaped. In both of these cases it gets
        # converted to a Characters token.
        if not self.parser.strip:
            # If we're escaping the token, we want to escape the exact
            # original string. Since tokenizing also normalizes data
            # and this is a tag-like thing, we've lost some information.
            # So we go back through the stream to get the original
            # string and use that.
            new_data = self.stream.get_tag()
        elif (
            self.emitted_last_token
            and token["type"] == TAG_TOKEN_TYPE_START
            and token["name"].lower() in HTML_TAGS_BLOCK_LEVEL
        ):
            # If this is a block level tag we're stripping, we drop it
            # for a newline because that's what a browser would parse
            # it as
            new_data = "\n"
        else:
            # For all other things being stripped, we throw in an empty
            # string token
            new_data = ""

        new_token = {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": new_data}

        self.currentToken = self.emitted_last_token = new_token
        self.tokenQueue.append(new_token)
        self.state = self.dataState

521

522

class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""

    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: set of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        if tags is None:
            allowed_tags = None
        else:
            # Tag names are compared lowercased by the tokenizer
            allowed_tags = frozenset(tag.lower() for tag in tags)

        self.tags = allowed_tags
        self.strip = strip
        self.consume_entities = consume_entities
        super().__init__(**kwargs)

    def _parse(
        self, stream, innerHTML=False, container="div", scripting=True, **kwargs
    ):
        # scripting defaults to True so <noscript> is parsed as though JS is
        # enabled, to match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # This overrides HTMLParser so the tokenizer can be swapped out for
        # BleachHTMLTokenizer.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream,
            consume_entities=self.consume_entities,
            parser=self,
            **kwargs,
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            # Start over once from a clean state
            self.reset()
            self.mainLoop()

565

566

def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity; an empty ``value`` and numeric
        references with no digits or with non-digit characters also yield None

    """
    if not value:
        return None

    if value[0] != "#":
        return ENTITIES.get(value, None)

    if value[1:2] in ("x", "X"):
        # hex-encoded code point
        digits, base, allowed = value[2:], 16, "0123456789abcdefABCDEF"
    else:
        # decimal code point
        digits, base, allowed = value[1:], 10, "0123456789"

    # Check the digits ourselves: int() would raise on garbage and would
    # accept signs, underscores and surrounding whitespace, none of which
    # belong in a character reference.
    if not digits or any(c not in allowed for c in digits):
        return None

    code_point = int(digits, base)
    if 0 < code_point < 0x110000:
        return chr(code_point)
    return None

599

600

def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    if "&" not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith("&"):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return "".join(new_text)

634

635

def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with a
    ";". This ignores ambiguous character entities that have no ";" at the end.

    :arg stream: the character stream

    :returns: the entity string without "&" or ";" if it's a valid character
        entity; ``None`` otherwise

    :raises ValueError: if the stream does not begin with "&"

    """
    if stream[:1] != "&":
        raise ValueError('Stream should begin with "&"')

    end_characters = "<&=;" + string.whitespace
    length = len(stream)

    # Position 0 is the "&"; scan by index so the text is never copied
    pos = 1

    # Handle number entities
    if pos < length and stream[pos] == "#":
        pos += 1

        if pos < length and stream[pos] in ("x", "X"):
            allowed = "0123456789abcdefABCDEF"
            pos += 1
        else:
            allowed = "0123456789"

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while pos < length and stream[pos] in allowed:
            pos += 1

        # The digits must be followed directly by ";". Any other character,
        # including one that is not a valid digit, means this is not an
        # entity.
        if pos < length and stream[pos] == ";":
            return stream[1:pos]
        return None

    # Handle character entities
    while pos < length and stream[pos] not in end_characters:
        pos += 1
        if not ENTITIES_TRIE.has_keys_with_prefix(stream[1:pos]):
            # If it's not a prefix, then it's not an entity and we're
            # out
            return None

    if pos > 1 and pos < length and stream[pos] == ";":
        return stream[1:pos]

    return None

694

695

AMP_SPLIT_RE = re.compile("(&)")


def next_possible_entity(text):
    """Takes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    # Splitting on a captured "&" gives [text, "&", text, "&", text, ...]:
    # the separators sit at the odd indexes and the text chunks at the even
    # ones.
    pieces = AMP_SPLIT_RE.split(text)

    yield pieces[0]
    for chunk in pieces[2::2]:
        yield "&" + chunk

713

714

class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    #     Whether to escape characters that need to be
    #     escaped within normal elements within rcdata elements such as
    #     style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values

        :arg stoken: one serialized attribute-value string

        :returns: generator of string pieces in which unambiguous character
            entities are kept and every other ``&`` is written as ``&amp;``

        """
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expected the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        stoken = stoken.replace("&amp;", "&")

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith("&"):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield f"&{entity};"

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2 :]
                    if part:
                        yield part
                    continue

            yield part.replace("&", "&amp;")

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        # Track where we are in the serialized output: inside a tag, and
        # whether the previous piece was the "=" that precedes a value
        in_tag = False
        after_equals = False

        for stoken in super().serialize(treewalker, encoding):
            if in_tag:
                if stoken == ">":
                    in_tag = False

                elif after_equals:
                    # A lone '"' is passed through unchanged and leaves
                    # after_equals set, so the piece after it is the one
                    # that gets escaped
                    if stoken != '"':
                        yield from self.escape_base_amp(stoken)

                        after_equals = False
                        continue

                elif stoken == "=":
                    after_equals = True

                yield stoken
            else:
                if stoken.startswith("<"):
                    in_tag = True
                yield stoken

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bleach/html5lib_shim.py: 96%

247 statements