Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bleach/html5lib

1# flake8: noqa

2"""

3Shim module between Bleach and html5lib. This makes it easier to upgrade the

4html5lib library without having to change a lot of code.

5"""

7import re

8import string

9import warnings

# Ignore html5lib's sanitizer deprecation warning when using bleach; we are
# bleach, so the warning is not actionable for our users.
#
# The filter has to be registered before the imports below, because those
# submodules are the ones that import html5lib.
warnings.filterwarnings(
    "ignore",
    message="html5lib's sanitizer is deprecated",
    category=DeprecationWarning,
    module="bleach._vendor.html5lib",
)

20from bleach._vendor.html5lib import ( # noqa: E402 module level import not at top of file

21 HTMLParser,

22 getTreeWalker,

23)

24from bleach._vendor.html5lib import (

25 constants,

26) # noqa: E402 module level import not at top of file

27from bleach._vendor.html5lib.constants import ( # noqa: E402 module level import not at top of file

28 namespaces,

29 prefixes,

30)

31from bleach._vendor.html5lib.constants import (

32 _ReparseException as ReparseException,

33) # noqa: E402 module level import not at top of file

34from bleach._vendor.html5lib.filters.base import (

35 Filter,

36) # noqa: E402 module level import not at top of file

37from bleach._vendor.html5lib.filters.sanitizer import (

38 allowed_protocols,

39 allowed_css_properties,

40 allowed_svg_properties,

41 attr_val_is_uri,

42 svg_attr_val_allows_ref,

43 svg_allow_local_href,

44) # noqa: E402 module level import not at top of file

45from bleach._vendor.html5lib.filters.sanitizer import (

46 Filter as SanitizerFilter,

47) # noqa: E402 module level import not at top of file

48from bleach._vendor.html5lib._inputstream import (

49 HTMLInputStream,

50) # noqa: E402 module level import not at top of file

51from bleach._vendor.html5lib.serializer import (

52 escape,

53 HTMLSerializer,

54) # noqa: E402 module level import not at top of file

55from bleach._vendor.html5lib._tokenizer import (

56 attributeMap,

57 HTMLTokenizer,

58) # noqa: E402 module level import not at top of file

59from bleach._vendor.html5lib._trie import (

60 Trie,

61) # noqa: E402 module level import not at top of file

#: Mapping of entity name to the text the entity expands to
ENTITIES = constants.entities

#: Trie built from ENTITIES, used to test entity-name prefixes
ENTITIES_TRIE = Trie(ENTITIES)

#: Individual token type constants--these never change
TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"]
TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"]
TAG_TOKEN_TYPE_CHARACTERS = constants.tokenTypes["Characters"]
TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"]

#: The token types that denote a tag: StartTag, EndTag and EmptyTag
TAG_TOKEN_TYPES = {
    TAG_TOKEN_TYPE_START,
    TAG_TOKEN_TYPE_END,
    constants.tokenTypes["EmptyTag"],
}

#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
#:
#: "main" was missing from the original list even though it is part of the
#: element index above and is listed in HTML_TAGS_BLOCK_LEVEL; it is included
#: here so the two sets agree.
HTML_TAGS = frozenset(
    (
        "a", "abbr", "address", "area", "article", "aside", "audio",
        "b", "base", "bdi", "bdo", "blockquote", "body", "br", "button",
        "canvas", "caption", "cite", "code", "col", "colgroup",
        "data", "datalist", "dd", "del", "details", "dfn", "dialog", "div",
        "dl", "dt",
        "em", "embed",
        "fieldset", "figcaption", "figure", "footer", "form",
        "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr",
        "html",
        "i", "iframe", "img", "input", "ins",
        "kbd", "keygen",
        "label", "legend", "li", "link",
        "main", "map", "mark", "menu", "meta", "meter",
        "nav", "noscript",
        "object", "ol", "optgroup", "option", "output",
        "p", "param", "picture", "pre", "progress",
        "q",
        "rp", "rt", "ruby",
        "s", "samp", "script", "section", "select", "slot", "small", "source",
        "span", "strong", "style", "sub", "summary", "sup",
        "table", "tbody", "td", "template", "textarea", "tfoot", "th", "thead",
        "time", "title", "tr", "track",
        "u", "ul",
        "var", "video",
        "wbr",
    )
)

200

201

#: Block level HTML tags (see https://github.com/mozilla/bleach/issues/369),
#: taken from the Mozilla documentation on 2019.07.11:
#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
HTML_TAGS_BLOCK_LEVEL = frozenset(
    {
        "address", "article", "aside",
        "blockquote",
        "details", "dialog", "dd", "div", "dl", "dt",
        "fieldset", "figcaption", "figure", "footer", "form",
        "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr",
        "li",
        "main",
        "nav",
        "ol",
        "p", "pre",
        "section",
        "table",
        "ul",
    }
)

242

243

class InputStreamWithMemory:
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    Reading (``char``/``charsUntil``) and ``unget`` are forwarded to the
    wrapped stream while a private buffer mirrors what has been consumed, so
    ``get_tag()`` can return the raw text of the tag being tokenized.

    """

    def __init__(self, inner_stream):
        """
        :arg inner_stream: the stream object to wrap; ``reset`` and
            ``position`` are re-exported from it unchanged

        """
        self._inner_stream = inner_stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        # Characters consumed since the last start_tag() call
        self._buffer = []

    @property
    def errors(self):
        """Errors collected by the wrapped stream"""
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        """Character encoding reported by the wrapped stream"""
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        """The wrapped stream's ``changeEncoding`` callable"""
        return self._inner_stream.changeEncoding

    def char(self):
        """Read one character from the wrapped stream and remember it

        :returns: the character, or whatever falsy value (e.g. ``None``) the
            wrapped stream returns at EOF

        """
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        """Forward to the wrapped stream's ``charsUntil`` and remember the result"""
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(chars)
        return chars

    def unget(self, char):
        """Push ``char`` back onto the wrapped stream and forget it

        char() never records the EOF marker, so ungetting it must not drop a
        real character from the history either.

        """
        if char and self._buffer:
            self._buffer.pop()
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        return "".join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ["<"]

305

306

class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""

    def __init__(self, consume_entities=False, **kwargs):
        """
        :arg consume_entities: when true, entities are handled by the parent
            tokenizer; when false, they are left in place
        :arg kwargs: passed through to the parent tokenizer

        """
        super().__init__(**kwargs)

        self.consume_entities = consume_entities

        # Swap the stream for a wrapper that remembers what was read since
        # the last "<"
        self.stream = InputStreamWithMemory(self.stream)

        # Most recently emitted token; used for block element spacing
        self.emitted_last_token = None

    def __iter__(self):
        """Yield tokens from the parent tokenizer, patching up some parse errors

        A ParseError token is held back until the following token is seen, so
        that the pair can be rewritten or passed through together.

        """
        pending_error = None

        for token in super().__iter__():
            if pending_error is None:
                # Nothing held back: hold a ParseError so it can be examined
                # alongside the next token, pass everything else through.
                if token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                    pending_error = token
                else:
                    yield token
                continue

            error_code = pending_error["data"]

            if (
                error_code == "invalid-character-in-attribute-name"
                and token["type"] in TAG_TOKEN_TYPES
                and token.get("data")
            ):
                # token["data"] is an html5lib attributeMap of attr name to
                # attr value.
                #
                # Attribute names containing ', " or < are dropped because
                # those characters are invalid in attribute names. The held
                # error token is discarded.
                token["data"] = attributeMap(
                    (attr_name, attr_value)
                    for attr_name, attr_value in token["data"].items()
                    if not any(bad in attr_name for bad in ('"', "'", "<"))
                )
                pending_error = None
                yield token

            elif (
                error_code == "expected-closing-tag-but-got-char"
                and self.parser.tags is not None
                and token["data"].lower().strip() not in self.parser.tags
            ):
                # This is a malformed tag, a pseudo-tag, or something html5lib
                # wants to turn into a malformed comment (which Bleach clean()
                # would drop), so the token stream is adjusted here.
                #
                # Allowed tags never reach this branch; html5lib handles
                # those. Anything else becomes characters and is escaped by
                # the sanitizer.
                token["data"] = self.stream.get_tag()
                token["type"] = TAG_TOKEN_TYPE_CHARACTERS

                pending_error = None
                yield token

            elif token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                # Another parse error: release the held one and hold this one
                yield pending_error
                pending_error = token

            else:
                yield pending_error
                yield token
                pending_error = None

        if pending_error:
            if pending_error["data"] in (
                "eof-in-tag-name",
                "eof-in-attribute-name",
                "eof-in-attribute-value-no-quotes",
            ):
                # The text ended with "<" followed by characters (optionally a
                # space and more characters). html5lib treats that as a tag
                # name / attribute that abruptly ends, but it should be
                # treated as character data.
                yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}
            else:
                yield pending_error

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        """Consume an entity, or put the "&" back when entities are left alone"""
        if self.consume_entities:
            # Entities are to be consumed, so the superclass does its thing.
            return super().consumeEntity(allowedChar, fromAttribute)

        # Entities are not to be consumed or converted, so the html5lib
        # behavior is replaced with a no-op. By the time this is called an "&"
        # has already been consumed, so it is put back.
        if fromAttribute:
            self.currentToken["data"][-1][1] += "&"
        else:
            self.tokenQueue.append({"type": TAG_TOKEN_TYPE_CHARACTERS, "data": "&"})

    def tagOpenState(self):
        """Reset the stream history at "<" and defer to the parent state"""
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In every case the history collected so far is
        # dropped by calling start_tag() on the input stream wrapper.
        self.stream.start_tag()
        return super().tagOpenState()

    def emitCurrentToken(self):
        """Emit the current token, replacing disallowed tags with characters"""
        token = self.currentToken

        is_disallowed_tag = (
            self.parser.tags is not None
            and token["type"] in TAG_TOKEN_TYPES
            and token["name"].lower() not in self.parser.tags
        )
        if not is_disallowed_tag:
            self.emitted_last_token = self.currentToken
            super().emitCurrentToken()
            return

        # A start/end/empty tag that is not in the allowed list is either
        # stripped or escaped; both turn it into a Characters token.
        if not self.parser.strip:
            # Escaping: tokenizing normalizes data, so the exact original
            # string is recovered from the stream history.
            replacement = self.stream.get_tag()
        elif (
            self.emitted_last_token
            and token["type"] == TAG_TOKEN_TYPE_START
            and token["name"].lower() in HTML_TAGS_BLOCK_LEVEL
        ):
            # A stripped block level start tag becomes a newline because
            # that's what a browser would parse it as.
            replacement = "\n"
        else:
            # Everything else being stripped becomes an empty string token.
            replacement = ""

        new_token = {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": replacement}

        self.currentToken = self.emitted_last_token = new_token
        self.tokenQueue.append(new_token)
        self.state = self.dataState

481

482

class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""

    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: set of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        # Tag names are stored lowercased; None is kept as None.
        if tags is None:
            self.tags = None
        else:
            self.tags = frozenset(tag.lower() for tag in tags)
        self.strip = strip
        self.consume_entities = consume_entities
        super().__init__(**kwargs)

    def _parse(
        self, stream, innerHTML=False, container="div", scripting=True, **kwargs
    ):
        """Parse ``stream`` using a BleachHTMLTokenizer

        This overrides HTMLParser so the tokenizer can be swapped for our own.
        scripting defaults to True so <noscript> is parsed as though JS is
        enabled, matching the expected context in browsers:

        https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element

        """
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting

        tokenizer = BleachHTMLTokenizer(
            stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs
        )
        self.tokenizer = tokenizer
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            # Start over from a clean state and run the loop once more
            self.reset()
            self.mainLoop()

525

526

def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    Malformed input never raises: an empty string, a numeric entity with no
    digits, or one containing anything other than ASCII digits (hex digits
    after ``#x``/``#X``) is treated as an ambiguous ampersand.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if not value:
        return None

    if value[0] == "#":
        if value[1:2] in ("x", "X"):
            # hex-encoded code point
            digits, base, allowed = value[2:], 16, "0123456789abcdefABCDEF"
        else:
            # decimal code point
            digits, base, allowed = value[1:], 10, "0123456789"

        # int() also accepts signs, underscores and surrounding whitespace,
        # none of which are valid in a character reference, so check first.
        if not digits or any(c not in allowed for c in digits):
            return None

        try:
            code_point = int(digits, base)
        except ValueError:
            # Newer CPython versions refuse to convert very long decimal
            # strings; such a value could not be a valid code point anyway.
            return None

        if 0 < code_point < 0x110000:
            return chr(code_point)
        return None

    return ENTITIES.get(value, None)

559

560

def convert_entities(text):
    """Converts all found entities in the text

    Entities that cannot be resolved (ambiguous ampersands) are left in the
    text unchanged.

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    # Fast path: nothing that could be an entity
    if "&" not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith("&"):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    # Skip the entity plus 2--one for & at the beginning and
                    # one for ; at the end
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return "".join(new_text)

594

595

def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with a
    ";". This ignores ambiguous character entities that have no ";" at the end.

    For numeric entities only the shape is checked: the result can be ``"#"``
    or ``"#x"`` with no digits, which callers reject via ``convert_entity``.
    A numeric entity whose digits are followed by any character other than
    ";" is not an entity.

    :arg stream: the character stream

    :returns: the entity string without "&" or ";" if it's a valid character
        entity; ``None`` otherwise

    :raises ValueError: if ``stream`` is empty or doesn't begin with "&"

    """
    if not stream or stream[0] != "&":
        raise ValueError('Stream should begin with "&"')

    end_characters = "<&=;" + string.whitespace
    length = len(stream)

    # Scan by index, starting just past the leading "&"
    pos = 1

    # Handle number entities
    if pos < length and stream[pos] == "#":
        pos += 1

        if pos < length and stream[pos] in ("x", "X"):
            allowed = "0123456789abcdefABCDEF"
            pos += 1
        else:
            allowed = "0123456789"

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while pos < length and stream[pos] in allowed:
            pos += 1

        # The character that stopped the scan must be the terminating ";".
        # An invalid character (e.g. the "a" in "&#12a;") means this is not
        # an entity; it must not be skipped over.
        if pos < length and stream[pos] == ";":
            return stream[1:pos]
        return None

    # Handle character entities
    start = pos
    while pos < length and stream[pos] not in end_characters:
        pos += 1
        if not ENTITIES_TRIE.has_keys_with_prefix(stream[start:pos]):
            # If it's not a prefix, then it's not an entity and we're
            # out
            return None

    if pos > start and pos < length and stream[pos] == ";":
        return stream[start:pos]

    return None

654

655

AMP_SPLIT_RE = re.compile("(&)")


def next_possible_entity(text):
    """Takes a text and generates a list of possible entities

    The text is split on "&"; every piece after the first gets its "&" put
    back in front.

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    # Because the pattern captures the separator, the split result alternates
    # text, "&", text, "&", ... -- the text pieces sit at the even indexes.
    pieces = AMP_SPLIT_RE.split(text)
    yield pieces[0]
    for tail in pieces[2::2]:
        yield "&" + tail

673

674

class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    #     Whether to escape characters that need to be
    #     escaped within normal elements within rcdata elements such as
    #     style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values

        :arg stoken: a serialized attribute value

        :returns: generator of string pieces; unambiguous character entities
            are kept as they are and every other ``&`` becomes ``&amp;``

        """
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expected the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        stoken = stoken.replace("&amp;", "&")

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith("&"):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield f"&{entity};"

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2 :]
                    if part:
                        yield part
                    continue

            yield part.replace("&", "&amp;")

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        # Whether we are between a "<..." piece and the closing ">"
        in_tag = False
        # Whether the previous piece inside the tag was "="
        after_equals = False

        for stoken in super().serialize(treewalker, encoding):
            if in_tag:
                if stoken == ">":
                    in_tag = False

                elif after_equals:
                    # The opening quote passes through untouched; the piece
                    # after it is the attribute value, which gets re-escaped.
                    if stoken != '"':
                        yield from self.escape_base_amp(stoken)

                        after_equals = False
                        continue

                elif stoken == "=":
                    after_equals = True

                yield stoken
            else:
                if stoken.startswith("<"):
                    in_tag = True
                yield stoken

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bleach/html5lib_shim.py: 96%

238 statements