
from itertools import chain
import re
import warnings

from xml.sax.saxutils import unescape

from bleach import html5lib_shim
from bleach import parse_shim


#: Set of allowed tags
ALLOWED_TAGS = frozenset(
    (
        "a",
        "abbr",
        "acronym",
        "b",
        "blockquote",
        "code",
        "em",
        "i",
        "li",
        "ol",
        "strong",
        "ul",
    )
)


#: Map of allowed attributes by tag
ALLOWED_ATTRIBUTES = {
    "a": ["href", "title"],
    "abbr": ["title"],
    "acronym": ["title"],
}

#: List of allowed protocols
ALLOWED_PROTOCOLS = frozenset(("http", "https", "mailto"))

#: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
INVISIBLE_CHARACTERS = "".join(
    [chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))]
)

#: Regexp for characters that are invisible
INVISIBLE_CHARACTERS_RE = re.compile("[" + INVISIBLE_CHARACTERS + "]", re.UNICODE)
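# e.g. INVISIBLE_CHARACTERS_RE.sub("?", "a\x00b") yields "a?b" (illustrative)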


#: String to replace invisible characters with. This can be a character, a
#: string, or even a function that takes a Python re matchobj
INVISIBLE_REPLACEMENT_CHAR = "?"


class NoCssSanitizerWarning(UserWarning):
    pass


class Cleaner:
    """Cleaner for cleaning HTML fragments of malicious content

    This cleaner is a security-focused function whose sole purpose is to remove
    malicious content from a string such that it can be displayed as content in
    a web page.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    .. Note::


        This cleaner is not designed to transform content for use in
        non-web-page contexts.


    .. Warning::

        This cleaner is not thread-safe--the html parser has internal state.
        Create a separate cleaner per thread!


    """

    def __init__(
        self,
        tags=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        protocols=ALLOWED_PROTOCOLS,
        strip=False,
        strip_comments=True,
        filters=None,
        css_sanitizer=None,
    ):
        """Initializes a Cleaner

        :arg set tags: set of allowed tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed content through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::


                Using filters changes the output of ``bleach.Cleaner.clean``.
                Make sure the way the filters change the output is secure.


        :arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
            sanitizing style attribute values and style text; defaults to None

        """
        self.tags = tags
        self.attributes = attributes
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        self.filters = filters or []
        self.css_sanitizer = css_sanitizer

        self.parser = html5lib_shim.BleachHTMLParser(
            tags=self.tags,
            strip=self.strip,
            consume_entities=False,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker("etree")
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values="always",
            omit_optional_tags=False,
            escape_lt_in_attrs=True,
            # We want to leave entities as they are without escaping or
            # resolving or expanding
            resolve_entities=False,
            # Bleach has its own sanitizer, so don't use the html5lib one
            sanitize=False,
            # clean preserves attr order
            alphabetical_attributes=False,
        )

        if css_sanitizer is None:
            # FIXME(willkg): this doesn't handle when attributes or an
            # attributes value is a callable
            attributes_values = []
            if isinstance(attributes, list):
                attributes_values = attributes

            elif isinstance(attributes, dict):
                attributes_values = []
                for values in attributes.values():
                    if isinstance(values, (list, tuple)):
                        attributes_values.extend(values)

            if "style" in attributes_values:
                warnings.warn(
                    "'style' attribute specified, but css_sanitizer not set.",
                    category=NoCssSanitizerWarning,
                )


    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, str):
            message = (
                f"argument cannot be of {text.__class__.__name__!r} type, "
                + "must be of text type"
            )
            raise TypeError(message)

        if not text:
            return ""

        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),
            allowed_tags=self.tags,
            attributes=self.attributes,
            strip_disallowed_tags=self.strip,
            strip_html_comments=self.strip_comments,
            css_sanitizer=self.css_sanitizer,
            allowed_protocols=self.protocols,
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)


def attribute_filter_factory(attributes):
    """Generates attribute filter function for the given attributes value

    The attributes value can take one of several shapes. This returns a filter
    function appropriate to the attributes value. One nice thing about this is
    that there's less if/then shenanigans in the ``allow_token`` method.

    """
    if callable(attributes):
        return attributes

    if isinstance(attributes, dict):

        def _attr_filter(tag, attr, value):
            if tag in attributes:
                attr_val = attributes[tag]
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                if attr in attr_val:
                    return True

            if "*" in attributes:
                attr_val = attributes["*"]
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                return attr in attr_val

            return False

        return _attr_filter

    if isinstance(attributes, list):

        def _attr_filter(tag, attr, value):
            return attr in attributes

        return _attr_filter

    raise ValueError("attributes needs to be a callable, a list or a dict")

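# Illustrative sketch (not part of the library API) of the shapes
# attribute_filter_factory accepts, based on the branches above:
#
#     attribute_filter_factory(["href", "title"])
#         -> same allowed attributes for every tag
#     attribute_filter_factory({"a": ["href"], "*": ["title"]})
#         -> per-tag lists; "*" applies to any tag
#     attribute_filter_factory(lambda tag, attr, value: attr.startswith("data-"))
#         -> a callable is returned as-is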


class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
    """html5lib Filter that sanitizes text

    This filter can be used anywhere html5lib filters can be used.

    """

    def __init__(
        self,
        source,
        allowed_tags=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        allowed_protocols=ALLOWED_PROTOCOLS,
        attr_val_is_uri=html5lib_shim.attr_val_is_uri,
        svg_attr_val_allows_ref=html5lib_shim.svg_attr_val_allows_ref,
        svg_allow_local_href=html5lib_shim.svg_allow_local_href,
        strip_disallowed_tags=False,
        strip_html_comments=True,
        css_sanitizer=None,
    ):
        """Creates a BleachSanitizerFilter instance

        :arg source: html5lib TreeWalker stream to filter

        :arg set allowed_tags: set of allowed tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list allowed_protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg attr_val_is_uri: set of attributes that have URI values

        :arg svg_attr_val_allows_ref: set of SVG attributes that can have
            references

        :arg svg_allow_local_href: set of SVG elements that can have local
            hrefs

        :arg bool strip_disallowed_tags: whether or not to strip disallowed
            tags

        :arg bool strip_html_comments: whether or not to strip HTML comments

        :arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
            sanitizing style attribute values and style text; defaults to None

        """
        # NOTE(willkg): This is the superclass of
        # html5lib.filters.sanitizer.Filter. We call this directly skipping the
        # __init__ for html5lib.filters.sanitizer.Filter because that does
        # things we don't need to do and kicks up the deprecation warning for
        # using Sanitizer.
        html5lib_shim.Filter.__init__(self, source)

        self.allowed_tags = frozenset(allowed_tags)
        self.allowed_protocols = frozenset(allowed_protocols)

        self.attr_filter = attribute_filter_factory(attributes)
        self.strip_disallowed_tags = strip_disallowed_tags
        self.strip_html_comments = strip_html_comments

        self.attr_val_is_uri = attr_val_is_uri
        self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
        self.css_sanitizer = css_sanitizer
        self.svg_allow_local_href = svg_allow_local_href

    def sanitize_stream(self, token_iterator):
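        # sanitize_token may return None (drop the token), a single token, or
        # a list of tokens (e.g. sanitize_characters can split one Characters
        # token into several)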

        for token in token_iterator:
            ret = self.sanitize_token(token)

            if not ret:
                continue

            if isinstance(ret, list):
                yield from ret
            else:
                yield ret

    def merge_characters(self, token_iterator):
        """Merge consecutive Characters tokens in a stream"""

        characters_buffer = []

        for token in token_iterator:
            if characters_buffer:
                if token["type"] == "Characters":
                    characters_buffer.append(token)
                    continue
                else:
                    # Merge all the characters tokens together into one and then
                    # operate on it.
                    new_token = {
                        "data": "".join(
                            [char_token["data"] for char_token in characters_buffer]
                        ),
                        "type": "Characters",
                    }
                    characters_buffer = []
                    yield new_token

            elif token["type"] == "Characters":
                characters_buffer.append(token)
                continue

            yield token

        new_token = {
            "data": "".join([char_token["data"] for char_token in characters_buffer]),
            "type": "Characters",
        }
        yield new_token

    def __iter__(self):
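        # Sanitize the token stream first, then merge adjacent Characters
        # tokens (sanitization can split a Characters token into several)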

        return self.merge_characters(
            self.sanitize_stream(html5lib_shim.Filter.__iter__(self))
        )

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
        ['attribute', 'pairs'], 'tag': callable}.

        Here, a callable is a function that takes the tag, attribute name, and
        attribute value and returns True or False.


        Also gives the option to strip tags instead of encoding.

        :arg dict token: token to sanitize

        :returns: token or list of tokens

        """
        token_type = token["type"]
        if token_type in ["StartTag", "EndTag", "EmptyTag"]:
            if token["name"] in self.allowed_tags:
                return self.allow_token(token)

            elif self.strip_disallowed_tags:
                return None

            else:
                return self.disallowed_token(token)

        elif token_type == "Comment":
            if not self.strip_html_comments:
                # call xml.sax.saxutils to escape &, <, and > in addition to " and '
                token["data"] = html5lib_shim.escape(
                    token["data"], entities={'"': "&quot;", "'": "&#x27;"}
                )
                return token
            else:
                return None

        elif token_type == "Characters":
            return self.sanitize_characters(token)

        else:
            return token

    def sanitize_characters(self, token):
        """Handles Characters tokens

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: a list of tokens

        """

        data = token.get("data", "")

        if not data:
            return token

        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
        token["data"] = data

        # If there isn't a & in the data, we can return now
        if "&" not in data:
            return token

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith("&"):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == "amp":
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with &amp; since that shows up in
                        # querystrings all the time. This special-cases &amp;
                        # and converts it to a & and sticks it in as a
                        # Characters token. It'll get merged with surrounding
                        # tokens in the BleachSanitizerFilter.__iter__ and
                        # escaped in the serializer.
                        new_tokens.append({"type": "Characters", "data": "&"})
                    else:
                        new_tokens.append({"type": "Entity", "name": entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2 :]
                    if remainder:
                        new_tokens.append({"type": "Characters", "data": remainder})
                    continue

            new_tokens.append({"type": "Characters", "data": part})

        return new_tokens

    def sanitize_uri_value(self, value, allowed_protocols):
        """Checks a uri value to see if it's allowed

        :arg value: the uri value to sanitize
        :arg allowed_protocols: list of allowed protocols

        :returns: allowed value or None

        """
        # NOTE(willkg): This transforms the value into a normalized one that's
        # easier to match and verify, but shouldn't get returned since it's
        # vastly different than the original value.

        # Convert all character entities in the value
        normalized_uri = html5lib_shim.convert_entities(value)

        # Nix backtick, space characters, and control characters
        normalized_uri = re.sub(r"[`\000-\040\177-\240\s]+", "", normalized_uri)

        # Remove REPLACEMENT characters
        normalized_uri = normalized_uri.replace("\ufffd", "")

        # Lowercase it--this breaks the value, but makes it easier to match
        # against
        normalized_uri = normalized_uri.lower()
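
        # e.g. "jav&#x09;ascript:alert(1)" normalizes to "javascript:alert(1)"
        # at this point, so the scheme checks below can't be fooled by
        # entities or embedded control characters (illustrative)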


        try:
            # Drop attributes with uri values that have protocols that aren't
            # allowed
            parsed = parse_shim.urlparse(normalized_uri)
        except ValueError:
            # URI is impossible to parse, therefore it's not allowed
            return None

        if parsed.scheme:
            # If urlparse found a scheme, check that
            if parsed.scheme in allowed_protocols:
                return value

        else:
            # Allow uris that are just an anchor
            if normalized_uri.startswith("#"):
                return value

            # Handle protocols that urlparse doesn't recognize like "myprotocol"
            if (
                ":" in normalized_uri
                and normalized_uri.split(":")[0] in allowed_protocols
            ):
                return value

            # If there's no protocol/scheme specified, then assume it's "http" or
            # "https" and see if that's allowed
            if "http" in allowed_protocols or "https" in allowed_protocols:
                return value

        return None

    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if "data" in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token["data"].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token["name"], name, val):
                    continue

                # Drop attributes with uri values that use a disallowed protocol
                # Sanitize attributes with uri values
                if namespaced_name in self.attr_val_is_uri:
                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
                    if new_value is None:
                        continue
                    val = new_value

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r"url\s*\(\s*[^#\s][^)]+?\)", " ", unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    else:
                        # Replace the val with the unescaped version because
                        # it's an IRI
                        val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token["name"]) in self.svg_allow_local_href:
                    if namespaced_name in [
                        (None, "href"),
                        (html5lib_shim.namespaces["xlink"], "href"),
                    ]:
                        if re.search(r"^\s*[^#\s]", val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, "style"):
                    if self.css_sanitizer:
                        val = self.css_sanitizer.sanitize_css(val)
                    else:
                        # FIXME(willkg): if style is allowed, but no
                        # css_sanitizer was set up, then this is probably a
                        # mistake and we should raise an error here
                        #
                        # For now, we're going to set the value to "" because
                        # there was no sanitizer set
                        val = ""

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token["data"] = attrs

        return token

    def disallowed_token(self, token):
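        # Turn the disallowed tag back into its literal source text as a
        # Characters token; the serializer will escape it so it renders as
        # text instead of markup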

        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = f"</{token['name']}>"

        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # If we end up with a namespace, but no name, switch them so we
                # have a valid name to use.
                if ns and not name:
                    ns, name = name, ns

                # Figure out namespaced name if the namespace is appropriate
                # and exists; if the ns isn't in prefixes, then drop it.
                if ns is None or ns not in html5lib_shim.prefixes:
                    namespaced_name = name
                else:
                    namespaced_name = f"{html5lib_shim.prefixes[ns]}:{name}"

                # NOTE(willkg): HTMLSerializer escapes attribute values
                # already, so if we do it here (like HTMLSerializer does),
                # then we end up double-escaping.
                attrs.append(f' {namespaced_name}="{v}"')
            token["data"] = f"<{token['name']}{''.join(attrs)}>"

        else:
            token["data"] = f"<{token['name']}>"

        if token.get("selfClosing"):
            token["data"] = f"{token['data'][:-1]}/>"

        token["type"] = "Characters"

        del token["name"]
        return token
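

# A quick smoke test (illustrative only; not part of the module):
#
#     >>> from bleach.sanitizer import Cleaner
#     >>> Cleaner().clean("<script>evil()</script><b>hi</b>")
#     '&lt;script&gt;evil()&lt;/script&gt;<b>hi</b>'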