from itertools import chain
import re
import warnings

from xml.sax.saxutils import unescape

from bleach import html5lib_shim
from bleach import parse_shim


#: Set of allowed tags
ALLOWED_TAGS = frozenset(
    (
        "a",
        "abbr",
        "acronym",
        "b",
        "blockquote",
        "code",
        "em",
        "i",
        "li",
        "ol",
        "strong",
        "ul",
    )
)


#: Map of allowed attributes by tag
ALLOWED_ATTRIBUTES = {
    "a": ["href", "title"],
    "abbr": ["title"],
    "acronym": ["title"],
}

#: Set of allowed protocols
ALLOWED_PROTOCOLS = frozenset(("http", "https", "mailto"))

#: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
INVISIBLE_CHARACTERS = "".join(
    [chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))]
)

#: Regexp for characters that are invisible
INVISIBLE_CHARACTERS_RE = re.compile("[" + INVISIBLE_CHARACTERS + "]", re.UNICODE)

#: String to replace invisible characters with. This can be a character, a
#: string, or even a function that takes a Python re matchobj
INVISIBLE_REPLACEMENT_CHAR = "?"
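
# Since the replacement is passed to ``re.sub``, it can also be a function of
# the match object. A minimal sketch (purely illustrative, not a bleach
# default):
#
#     def _show_codepoint(match):
#         # Render each invisible character as its escaped hex codepoint
#         return f"\\x{ord(match.group(0)):02x}"
#
#     INVISIBLE_REPLACEMENT_CHAR = _show_codepoint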


class NoCssSanitizerWarning(UserWarning):
    pass


class Cleaner:
    """Cleaner for cleaning HTML fragments of malicious content

    This cleaner is a security-focused function whose sole purpose is to remove
    malicious content from a string such that it can be displayed as content in
    a web page.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    .. Note::

       This cleaner is not designed to transform content for use in
       non-web-page contexts.

    .. Warning::

       This cleaner is not thread-safe--the html parser has internal state.
       Create a separate cleaner per thread!

    """
    def __init__(
        self,
        tags=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        protocols=ALLOWED_PROTOCOLS,
        strip=False,
        strip_comments=True,
        filters=None,
        css_sanitizer=None,
    ):
        """Initializes a Cleaner

        :arg set tags: set of allowed tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed content through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Using filters changes the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output is secure.

        :arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
            sanitizing style attribute values and style text; defaults to None
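
        The ``attributes`` value can take a few shapes. An illustrative
        sketch (the tag/attribute choices here are examples, not defaults)::

            # list: these attribute names are allowed on any allowed tag
            attributes = ["href", "title"]

            # dict: per-tag allow-lists; "*" applies to every tag
            attributes = {"a": ["href", "title"], "*": ["class"]}

            # callable: receives tag, attribute name, and value
            def attributes(tag, name, value):
                return name == "title"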
123 """
124 self.tags = tags
125 self.attributes = attributes
126 self.protocols = protocols
127 self.strip = strip
128 self.strip_comments = strip_comments
129 self.filters = filters or []
130 self.css_sanitizer = css_sanitizer
132 self.parser = html5lib_shim.BleachHTMLParser(
133 tags=self.tags,
134 strip=self.strip,
135 consume_entities=False,
136 namespaceHTMLElements=False,
137 )
138 self.walker = html5lib_shim.getTreeWalker("etree")
139 self.serializer = html5lib_shim.BleachHTMLSerializer(
140 quote_attr_values="always",
141 omit_optional_tags=False,
142 escape_lt_in_attrs=True,
143 # We want to leave entities as they are without escaping or
144 # resolving or expanding
145 resolve_entities=False,
146 # Bleach has its own sanitizer, so don't use the html5lib one
147 sanitize=False,
148 # clean preserves attr order
149 alphabetical_attributes=False,
150 )
152 if css_sanitizer is None:
153 # FIXME(willkg): this doesn't handle when attributes or an
154 # attributes value is a callable
155 attributes_values = []
156 if isinstance(attributes, list):
157 attributes_values = attributes
159 elif isinstance(attributes, dict):
160 attributes_values = []
161 for values in attributes.values():
162 if isinstance(values, (list, tuple)):
163 attributes_values.extend(values)
165 if "style" in attributes_values:
166 warnings.warn(
167 "'style' attribute specified, but css_sanitizer not set.",
168 category=NoCssSanitizerWarning,
169 )
171 def clean(self, text):
172 """Cleans text and returns sanitized result as unicode
174 :arg str text: text to be cleaned
176 :returns: sanitized text as unicode
178 :raises TypeError: if ``text`` is not a text type
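
        A rough example with the defaults (disallowed tags are escaped rather
        than stripped, since ``strip`` defaults to False)::

            cleaner = Cleaner()
            cleaner.clean("<script>boo</script>")
            # -> "&lt;script&gt;boo&lt;/script&gt;"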
180 """
181 if not isinstance(text, str):
182 message = (
183 f"argument cannot be of {text.__class__.__name__!r} type, "
184 + "must be of text type"
185 )
186 raise TypeError(message)
188 if not text:
189 return ""
191 dom = self.parser.parseFragment(text)
192 filtered = BleachSanitizerFilter(
193 source=self.walker(dom),
194 allowed_tags=self.tags,
195 attributes=self.attributes,
196 strip_disallowed_tags=self.strip,
197 strip_html_comments=self.strip_comments,
198 css_sanitizer=self.css_sanitizer,
199 allowed_protocols=self.protocols,
200 )
202 # Apply any filters after the BleachSanitizerFilter
203 for filter_class in self.filters:
204 filtered = filter_class(source=filtered)
206 return self.serializer.render(filtered)
209def attribute_filter_factory(attributes):
210 """Generates attribute filter function for the given attributes value
212 The attributes value can take one of several shapes. This returns a filter
213 function appropriate to the attributes value. One nice thing about this is
214 that there's less if/then shenanigans in the ``allow_token`` method.
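
    An illustrative sketch; all three shapes yield a filter with the
    signature ``_attr_filter(tag, attr, value) -> bool``::

        attr_filter = attribute_filter_factory(["href", "title"])
        attr_filter = attribute_filter_factory({"a": ["href"], "*": ["title"]})
        attr_filter = attribute_filter_factory(
            lambda tag, attr, value: attr == "title"
        )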
216 """
217 if callable(attributes):
218 return attributes
220 if isinstance(attributes, dict):
222 def _attr_filter(tag, attr, value):
223 if tag in attributes:
224 attr_val = attributes[tag]
225 if callable(attr_val):
226 return attr_val(tag, attr, value)
228 if attr in attr_val:
229 return True
231 if "*" in attributes:
232 attr_val = attributes["*"]
233 if callable(attr_val):
234 return attr_val(tag, attr, value)
236 return attr in attr_val
238 return False
240 return _attr_filter
242 if isinstance(attributes, list):
244 def _attr_filter(tag, attr, value):
245 return attr in attributes
247 return _attr_filter
249 raise ValueError("attributes needs to be a callable, a list or a dict")
252class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
253 """html5lib Filter that sanitizes text
255 This filter can be used anywhere html5lib filters can be used.
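
    A minimal wiring sketch, mirroring what ``Cleaner.clean`` above does
    (the parser/walker setup is illustrative, not the only way to build a
    source)::

        parser = html5lib_shim.BleachHTMLParser(
            tags=ALLOWED_TAGS,
            strip=False,
            consume_entities=False,
            namespaceHTMLElements=False,
        )
        walker = html5lib_shim.getTreeWalker("etree")
        dom = parser.parseFragment("<a href='/'>hi</a>")
        filtered = BleachSanitizerFilter(source=walker(dom))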
257 """
259 def __init__(
260 self,
261 source,
262 allowed_tags=ALLOWED_TAGS,
263 attributes=ALLOWED_ATTRIBUTES,
264 allowed_protocols=ALLOWED_PROTOCOLS,
265 attr_val_is_uri=html5lib_shim.attr_val_is_uri,
266 svg_attr_val_allows_ref=html5lib_shim.svg_attr_val_allows_ref,
267 svg_allow_local_href=html5lib_shim.svg_allow_local_href,
268 strip_disallowed_tags=False,
269 strip_html_comments=True,
270 css_sanitizer=None,
271 ):
272 """Creates a BleachSanitizerFilter instance
274 :arg source: html5lib TreeWalker stream as an html5lib TreeWalker
276 :arg set allowed_tags: set of allowed tags; defaults to
277 ``bleach.sanitizer.ALLOWED_TAGS``
279 :arg dict attributes: allowed attributes; can be a callable, list or dict;
280 defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
282 :arg list allowed_protocols: allowed list of protocols for links; defaults
283 to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
285 :arg attr_val_is_uri: set of attributes that have URI values
287 :arg svg_attr_val_allows_ref: set of SVG attributes that can have
288 references
290 :arg svg_allow_local_href: set of SVG elements that can have local
291 hrefs
293 :arg bool strip_disallowed_tags: whether or not to strip disallowed
294 tags
296 :arg bool strip_html_comments: whether or not to strip HTML comments
298 :arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
299 sanitizing style attribute values and style text; defaults to None
301 """
302 # NOTE(willkg): This is the superclass of
303 # html5lib.filters.sanitizer.Filter. We call this directly skipping the
304 # __init__ for html5lib.filters.sanitizer.Filter because that does
305 # things we don't need to do and kicks up the deprecation warning for
306 # using Sanitizer.
307 html5lib_shim.Filter.__init__(self, source)
309 self.allowed_tags = frozenset(allowed_tags)
310 self.allowed_protocols = frozenset(allowed_protocols)
312 self.attr_filter = attribute_filter_factory(attributes)
313 self.strip_disallowed_tags = strip_disallowed_tags
314 self.strip_html_comments = strip_html_comments
316 self.attr_val_is_uri = attr_val_is_uri
317 self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
318 self.css_sanitizer = css_sanitizer
319 self.svg_allow_local_href = svg_allow_local_href
321 def sanitize_stream(self, token_iterator):
322 for token in token_iterator:
323 ret = self.sanitize_token(token)
325 if not ret:
326 continue
328 if isinstance(ret, list):
329 yield from ret
330 else:
331 yield ret
333 def merge_characters(self, token_iterator):
334 """Merge consecutive Characters tokens in a stream"""
335 characters_buffer = []
337 for token in token_iterator:
338 if characters_buffer:
339 if token["type"] == "Characters":
340 characters_buffer.append(token)
341 continue
342 else:
343 # Merge all the characters tokens together into one and then
344 # operate on it.
345 new_token = {
346 "data": "".join(
347 [char_token["data"] for char_token in characters_buffer]
348 ),
349 "type": "Characters",
350 }
351 characters_buffer = []
352 yield new_token
354 elif token["type"] == "Characters":
355 characters_buffer.append(token)
356 continue
358 yield token
360 new_token = {
361 "data": "".join([char_token["data"] for char_token in characters_buffer]),
362 "type": "Characters",
363 }
364 yield new_token
366 def __iter__(self):
367 return self.merge_characters(
368 self.sanitize_stream(html5lib_shim.Filter.__iter__(self))
369 )
371 def sanitize_token(self, token):
372 """Sanitize a token either by HTML-encoding or dropping.
374 Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
375 ['attribute', 'pairs'], 'tag': callable}.
377 Here callable is a function with two arguments of attribute name and
378 value. It should return true of false.
380 Also gives the option to strip tags instead of encoding.
382 :arg dict token: token to sanitize
384 :returns: token or list of tokens
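
        For example, a StartTag token for ``<a href="/">`` looks roughly like
        this (attribute keys are (namespace, name) tuples)::

            {"type": "StartTag", "name": "a", "data": {(None, "href"): "/"}}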
386 """
387 token_type = token["type"]
388 if token_type in ["StartTag", "EndTag", "EmptyTag"]:
389 if token["name"] in self.allowed_tags:
390 return self.allow_token(token)
392 elif self.strip_disallowed_tags:
393 return None
395 else:
396 return self.disallowed_token(token)
398 elif token_type == "Comment":
399 if not self.strip_html_comments:
400 # call lxml.sax.saxutils to escape &, <, and > in addition to " and '
401 token["data"] = html5lib_shim.escape(
402 token["data"], entities={'"': """, "'": "'"}
403 )
404 return token
405 else:
406 return None
408 elif token_type == "Characters":
409 return self.sanitize_characters(token)
411 else:
412 return token
414 def sanitize_characters(self, token):
415 """Handles Characters tokens
417 Our overridden tokenizer doesn't do anything with entities. However,
418 that means that the serializer will convert all ``&`` in Characters
419 tokens to ``&``.
421 Since we don't want that, we extract entities here and convert them to
422 Entity tokens so the serializer will let them be.
424 :arg token: the Characters token to work on
426 :returns: a list of tokens
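
        For example, ``"a &lt; b"`` would come back roughly as::

            [
                {"type": "Characters", "data": "a "},
                {"type": "Entity", "name": "lt"},
                {"type": "Characters", "data": " b"},
            ]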
428 """
429 data = token.get("data", "")
431 if not data:
432 return token
434 data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
435 token["data"] = data
437 # If there isn't a & in the data, we can return now
438 if "&" not in data:
439 return token
441 new_tokens = []
443 # For each possible entity that starts with a "&", we try to extract an
444 # actual entity and re-tokenize accordingly
445 for part in html5lib_shim.next_possible_entity(data):
446 if not part:
447 continue
449 if part.startswith("&"):
450 entity = html5lib_shim.match_entity(part)
451 if entity is not None:
452 if entity == "amp":
453 # LinkifyFilter can't match urls across token boundaries
454 # which is problematic with & since that shows up in
455 # querystrings all the time. This special-cases &
456 # and converts it to a & and sticks it in as a
457 # Characters token. It'll get merged with surrounding
458 # tokens in the BleachSanitizerfilter.__iter__ and
459 # escaped in the serializer.
460 new_tokens.append({"type": "Characters", "data": "&"})
461 else:
462 new_tokens.append({"type": "Entity", "name": entity})
464 # Length of the entity plus 2--one for & at the beginning
465 # and one for ; at the end
466 remainder = part[len(entity) + 2 :]
467 if remainder:
468 new_tokens.append({"type": "Characters", "data": remainder})
469 continue
471 new_tokens.append({"type": "Characters", "data": part})
473 return new_tokens
475 def sanitize_uri_value(self, value, allowed_protocols):
476 """Checks a uri value to see if it's allowed
478 :arg value: the uri value to sanitize
479 :arg allowed_protocols: list of allowed protocols
481 :returns: allowed value or None
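
        A rough sketch of the rules implemented below::

            sanitize_uri_value("javascript:alert(1)", {"http", "https"})
            # -> None (disallowed scheme)

            sanitize_uri_value("#section-1", {"http", "https"})
            # -> "#section-1" (bare anchors are allowed)

            sanitize_uri_value("example.com/page", {"http", "https"})
            # -> "example.com/page" (no scheme, so http/https is assumed)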
483 """
484 # NOTE(willkg): This transforms the value into a normalized one that's
485 # easier to match and verify, but shouldn't get returned since it's
486 # vastly different than the original value.
488 # Convert all character entities in the value
489 normalized_uri = html5lib_shim.convert_entities(value)
491 # Nix backtick, space characters, and control characters
492 normalized_uri = re.sub(r"[`\000-\040\177-\240\s]+", "", normalized_uri)
494 # Remove REPLACEMENT characters
495 normalized_uri = normalized_uri.replace("\ufffd", "")
497 # Lowercase it--this breaks the value, but makes it easier to match
498 # against
499 normalized_uri = normalized_uri.lower()
501 try:
502 # Drop attributes with uri values that have protocols that aren't
503 # allowed
504 parsed = parse_shim.urlparse(normalized_uri)
505 except ValueError:
506 # URI is impossible to parse, therefore it's not allowed
507 return None
509 if parsed.scheme:
510 # If urlparse found a scheme, check that
511 if parsed.scheme in allowed_protocols:
512 return value
514 else:
515 # Allow uris that are just an anchor
516 if normalized_uri.startswith("#"):
517 return value
519 # Handle protocols that urlparse doesn't recognize like "myprotocol"
520 if (
521 ":" in normalized_uri
522 and normalized_uri.split(":")[0] in allowed_protocols
523 ):
524 return value
526 # If there's no protocol/scheme specified, then assume it's "http" or
527 # "https" and see if that's allowed
528 if "http" in allowed_protocols or "https" in allowed_protocols:
529 return value
531 return None
533 def allow_token(self, token):
534 """Handles the case where we're allowing the tag"""
535 if "data" in token:
536 # Loop through all the attributes and drop the ones that are not
537 # allowed, are unsafe or break other rules. Additionally, fix
538 # attribute values that need fixing.
539 #
540 # At the end of this loop, we have the final set of attributes
541 # we're keeping.
542 attrs = {}
543 for namespaced_name, val in token["data"].items():
544 namespace, name = namespaced_name
546 # Drop attributes that are not explicitly allowed
547 #
548 # NOTE(willkg): We pass in the attribute name--not a namespaced
549 # name.
550 if not self.attr_filter(token["name"], name, val):
551 continue
553 # Drop attributes with uri values that use a disallowed protocol
554 # Sanitize attributes with uri values
555 if namespaced_name in self.attr_val_is_uri:
556 new_value = self.sanitize_uri_value(val, self.allowed_protocols)
557 if new_value is None:
558 continue
559 val = new_value
561 # Drop values in svg attrs with non-local IRIs
562 if namespaced_name in self.svg_attr_val_allows_ref:
563 new_val = re.sub(r"url\s*\(\s*[^#\s][^)]+?\)", " ", unescape(val))
564 new_val = new_val.strip()
565 if not new_val:
566 continue
568 else:
569 # Replace the val with the unescaped version because
570 # it's a iri
571 val = new_val
573 # Drop href and xlink:href attr for svg elements with non-local IRIs
574 if (None, token["name"]) in self.svg_allow_local_href:
575 if namespaced_name in [
576 (None, "href"),
577 (html5lib_shim.namespaces["xlink"], "href"),
578 ]:
579 if re.search(r"^\s*[^#\s]", val):
580 continue
582 # If it's a style attribute, sanitize it
583 if namespaced_name == (None, "style"):
584 if self.css_sanitizer:
585 val = self.css_sanitizer.sanitize_css(val)
586 else:
587 # FIXME(willkg): if style is allowed, but no
588 # css_sanitizer was set up, then this is probably a
589 # mistake and we should raise an error here
590 #
591 # For now, we're going to set the value to "" because
592 # there was no sanitizer set
593 val = ""
595 # At this point, we want to keep the attribute, so add it in
596 attrs[namespaced_name] = val
598 token["data"] = attrs
600 return token
602 def disallowed_token(self, token):
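        """Escape a disallowed token so it renders as literal text

        Serializes the tag (and, for start tags, its attributes) back into
        markup and returns it as a Characters token, so the serializer emits
        it as escaped text rather than HTML.
        """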
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = f"</{token['name']}>"

        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # If we end up with a namespace, but no name, switch them so we
                # have a valid name to use.
                if ns and not name:
                    ns, name = name, ns

                # Figure out namespaced name if the namespace is appropriate
                # and exists; if the ns isn't in prefixes, then drop it.
                if ns is None or ns not in html5lib_shim.prefixes:
                    namespaced_name = name
                else:
                    namespaced_name = f"{html5lib_shim.prefixes[ns]}:{name}"

                # NOTE(willkg): HTMLSerializer escapes attribute values
                # already, so if we do it here (like HTMLSerializer does),
                # then we end up double-escaping.
                attrs.append(f' {namespaced_name}="{v}"')

            token["data"] = f"<{token['name']}{''.join(attrs)}>"

        else:
            token["data"] = f"<{token['name']}>"

        if token.get("selfClosing"):
            token["data"] = f"{token['data'][:-1]}/>"

        token["type"] = "Characters"

        del token["name"]
        return token