Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bleach/_vendor/html5lib/filters/sanitizer.py: 23%
121 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-03 06:10 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-03 06:10 +0000
1"""Deprecated from html5lib 1.1.
3See `here <https://github.com/html5lib/html5lib-python/issues/443>`_ for
4information about its deprecation; `Bleach <https://github.com/mozilla/bleach>`_
5is recommended as a replacement. Please let us know in the aforementioned issue
6if Bleach is unsuitable for your needs.
8"""
9from __future__ import absolute_import, division, unicode_literals
11import re
12import warnings
13from xml.sax.saxutils import escape, unescape
15from six.moves import urllib_parse as urlparse
17from . import base
18from ..constants import namespaces, prefixes
20__all__ = ["Filter"]
# Text of the DeprecationWarning raised both when this module is imported
# and whenever a Filter is instantiated.
_deprecation_msg = (
    "html5lib's sanitizer is deprecated; see "
    "https://github.com/html5lib/html5lib-python/issues/443 and please let "
    "us know if Bleach is unsuitable for your needs"
)

# Warn once at import time so that merely importing the sanitizer is flagged.
warnings.warn(_deprecation_msg, DeprecationWarning)
# Default element allow-list as (namespace URI, local name) pairs.  Names are
# grouped by the ``namespaces`` key they belong to and expanded into pairs by
# the generator expression.
allowed_elements = frozenset(
    (namespaces[prefix], tag)
    for prefix, tags in (
        ('html', (
            'a', 'abbr', 'acronym', 'address', 'area', 'article', 'aside',
            'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas',
            'caption', 'center', 'cite', 'code', 'col', 'colgroup',
            'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
            'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source',
            'fieldset', 'figcaption', 'figure', 'footer', 'font', 'form',
            'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img',
            'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm',
            'map', 'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol',
            'output', 'optgroup', 'option', 'p', 'pre', 'progress', 'q', 's',
            'samp', 'section', 'select', 'small', 'sound', 'source',
            'spacer', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
            'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead', 'tr',
            'tt', 'u', 'ul', 'var', 'video',
        )),
        ('mathml', (
            'maction', 'math', 'merror', 'mfrac', 'mi', 'mmultiscripts',
            'mn', 'mo', 'mover', 'mpadded', 'mphantom', 'mprescripts',
            'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub', 'msubsup',
            'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover',
            'none',
        )),
        ('svg', (
            'a', 'animate', 'animateColor', 'animateMotion',
            'animateTransform', 'clipPath', 'circle', 'defs', 'desc',
            'ellipse', 'font-face', 'font-face-name', 'font-face-src', 'g',
            'glyph', 'hkern', 'linearGradient', 'line', 'marker', 'metadata',
            'missing-glyph', 'mpath', 'path', 'polygon', 'polyline',
            'radialGradient', 'rect', 'set', 'stop', 'svg', 'switch', 'text',
            'title', 'tspan', 'use',
        )),
    )
    for tag in tags
)
# Default attribute allow-list as (namespace URI or None, local name) pairs.
# Names are grouped by vocabulary and namespace; the generator expression
# expands each group into pairs.  Names shared between vocabularies are listed
# in each group on purpose -- the frozenset collapses them.
allowed_attributes = frozenset(
    (ns, attr)
    for ns, attrs in (
        # HTML attributes
        (None, (
            'abbr', 'accept', 'accept-charset', 'accesskey', 'action',
            'align', 'alt', 'autocomplete', 'autofocus', 'axis',
            'background', 'balance', 'bgcolor', 'bgproperties', 'border',
            'bordercolor', 'bordercolordark', 'bordercolorlight',
            'bottompadding', 'cellpadding', 'cellspacing', 'ch', 'challenge',
            'char', 'charoff', 'choff', 'charset', 'checked', 'cite',
            'class', 'clear', 'color', 'cols', 'colspan', 'compact',
            'contenteditable', 'controls', 'coords', 'data', 'datafld',
            'datapagesize', 'datasrc', 'datetime', 'default', 'delay', 'dir',
            'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face',
            'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
            'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang',
            'hspace', 'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label',
            'leftspacing', 'lang', 'list', 'longdesc', 'loop', 'loopcount',
            'loopend', 'loopstart', 'low', 'lowsrc', 'max', 'maxlength',
            'media', 'method', 'min', 'multiple', 'name', 'nohref',
            'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping',
            'point-size', 'poster', 'pqg', 'preload', 'prompt', 'radiogroup',
            'readonly', 'rel', 'repeat-max', 'repeat-min', 'replace',
            'required', 'rev', 'rightspacing', 'rows', 'rowspan', 'rules',
            'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
            'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
            'template', 'title', 'toppadding', 'type', 'unselectable',
            'usemap', 'urn', 'valign', 'value', 'variable', 'volume',
            'vspace', 'vrml', 'width', 'wrap',
        )),
        (namespaces['xml'], ('lang',)),
        # MathML attributes
        (None, (
            'actiontype', 'align', 'columnalign', 'columnlines',
            'columnspacing', 'columnspan', 'depth', 'display',
            'displaystyle', 'equalcolumns', 'equalrows', 'fence',
            'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
            'lspace', 'mathbackground', 'mathcolor', 'mathvariant',
            'maxsize', 'minsize', 'other', 'rowalign', 'rowlines',
            'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
            'separator', 'stretchy', 'width',
        )),
        (namespaces['xlink'], ('href', 'show', 'type')),
        # SVG attributes
        (None, (
            'accent-height', 'accumulate', 'additive', 'alphabetic',
            'arabic-form', 'ascent', 'attributeName', 'attributeType',
            'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
            'class', 'clip-path', 'color', 'color-rendering', 'content',
            'cx', 'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end',
            'fill', 'fill-opacity', 'fill-rule', 'font-family', 'font-size',
            'font-stretch', 'font-style', 'font-variant', 'font-weight',
            'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits',
            'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x', 'id',
            'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
            'lang', 'marker-end', 'marker-mid', 'marker-start',
            'markerHeight', 'markerUnits', 'markerWidth', 'mathematical',
            'max', 'min', 'name', 'offset', 'opacity', 'orient', 'origin',
            'overline-position', 'overline-thickness', 'panose-1', 'path',
            'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX',
            'refY', 'repeatCount', 'repeatDur', 'requiredExtensions',
            'requiredFeatures', 'restart', 'rotate', 'rx', 'ry', 'slope',
            'stemh', 'stemv', 'stop-color', 'stop-opacity',
            'strikethrough-position', 'strikethrough-thickness', 'stroke',
            'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
            'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
            'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
            'transform', 'type', 'u1', 'u2', 'underline-position',
            'underline-thickness', 'unicode', 'unicode-range',
            'units-per-em', 'values', 'version', 'viewBox', 'visibility',
            'width', 'widths', 'x', 'x-height', 'x1', 'x2',
            'y', 'y1', 'y2', 'zoomAndPan',
        )),
        (namespaces['xlink'], (
            'actuate', 'arcrole', 'href', 'role', 'show', 'title', 'type',
        )),
        (namespaces['xml'], ('base', 'lang', 'space')),
    )
    for attr in attrs
)
# Attributes whose value is a URI; Filter.allowed_token checks the scheme of
# these values against the allowed protocols.
attr_val_is_uri = frozenset(
    (ns, attr)
    for ns, attrs in (
        (None, (
            'href', 'src', 'cite', 'action', 'longdesc', 'poster',
            'background', 'datasrc', 'dynsrc', 'lowsrc', 'ping',
        )),
        (namespaces['xlink'], ('href',)),
        (namespaces['xml'], ('base',)),
    )
    for attr in attrs
)
# Un-namespaced SVG attributes whose value may contain a ``url(...)``
# reference; Filter.allowed_token rewrites the references in these values.
svg_attr_val_allows_ref = frozenset(
    (None, attr)
    for attr in (
        'clip-path', 'color-profile', 'cursor', 'fill', 'filter', 'marker',
        'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke',
    )
)
# SVG elements, stored as (None, local name) pairs, that are consulted when
# deciding whether an ``xlink:href`` attribute must be dropped.
svg_allow_local_href = frozenset(
    (None, tag)
    for tag in (
        'altGlyph', 'animate', 'animateColor', 'animateMotion',
        'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient',
        'pattern', 'radialGradient', 'textpath', 'tref', 'set', 'use',
    )
)
# CSS property names that Filter.sanitize_css lets through unchanged.
allowed_css_properties = frozenset([
    'azimuth', 'background-color', 'border-bottom-color', 'border-collapse',
    'border-color', 'border-left-color', 'border-right-color',
    'border-top-color', 'clear', 'color', 'cursor', 'direction', 'display',
    'elevation', 'float', 'font', 'font-family', 'font-size', 'font-style',
    'font-variant', 'font-weight', 'height', 'letter-spacing', 'line-height',
    'overflow', 'pause', 'pause-after', 'pause-before', 'pitch',
    'pitch-range', 'richness', 'speak', 'speak-header', 'speak-numeral',
    'speak-punctuation', 'speech-rate', 'stress', 'text-align',
    'text-decoration', 'text-indent', 'unicode-bidi', 'vertical-align',
    'voice-family', 'volume', 'white-space', 'width',
])
# Keywords accepted as values of the background/border/margin/padding
# shorthand families in Filter.sanitize_css.
allowed_css_keywords = frozenset([
    'auto', 'aqua', 'black', 'block', 'blue', 'bold', 'both', 'bottom',
    'brown', 'center', 'collapse', 'dashed', 'dotted', 'fuchsia', 'gray',
    'green', '!important', 'italic', 'left', 'lime', 'maroon', 'medium',
    'none', 'navy', 'normal', 'nowrap', 'olive', 'pointer', 'purple', 'red',
    'right', 'solid', 'silver', 'teal', 'top', 'transparent', 'underline',
    'white', 'yellow',
])
# SVG presentation properties that Filter.sanitize_css lets through.
allowed_svg_properties = frozenset([
    'fill', 'fill-opacity', 'fill-rule',
    'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
    'stroke-opacity',
])
# URI schemes accepted in attributes listed in ``attr_val_is_uri``.
allowed_protocols = frozenset([
    'ed2k', 'ftp', 'http', 'https', 'irc', 'mailto', 'news', 'gopher',
    'nntp', 'telnet', 'webcal', 'xmpp', 'callto', 'feed', 'urn', 'aim',
    'rsync', 'tag', 'ssh', 'sftp', 'rtsp', 'afs', 'data',
])
# Media types accepted for ``data:`` URIs.
allowed_content_types = frozenset([
    'image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp',
    'text/plain',
])
# Matches the path part of a ``data:`` URI (everything after "data:") and
# captures its media type in the ``content_type`` group.  Written in verbose
# mode, so whitespace and ``#`` comments inside the pattern are ignored.
data_content_type = re.compile(
    r'''
    ^
    # Match a content type <application>/<type>
    (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
    # Match any character set and encoding
    (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
      |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
    # Assume the rest is data
    ,.*
    $
    ''',
    flags=re.VERBOSE,
)
class Filter(base.Filter):
    """Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes

    Tokens for elements that are not allowed are converted into
    ``Characters`` tokens carrying the tag's markup as text, comment tokens
    are dropped, and the attributes of allowed elements are filtered by name,
    URI scheme, SVG reference and inline CSS content.
    """

    def __init__(self,
                 source,
                 allowed_elements=allowed_elements,
                 allowed_attributes=allowed_attributes,
                 allowed_css_properties=allowed_css_properties,
                 allowed_css_keywords=allowed_css_keywords,
                 allowed_svg_properties=allowed_svg_properties,
                 allowed_protocols=allowed_protocols,
                 allowed_content_types=allowed_content_types,
                 attr_val_is_uri=attr_val_is_uri,
                 svg_attr_val_allows_ref=svg_attr_val_allows_ref,
                 svg_allow_local_href=svg_allow_local_href):
        """Creates a Filter

        Emits a ``DeprecationWarning`` on every instantiation.

        :arg source: the token source, handed to ``base.Filter``

        :arg allowed_elements: set of elements to allow--everything else will
            be escaped

        :arg allowed_attributes: set of attributes to allow in
            elements--everything else will be stripped

        :arg allowed_css_properties: set of CSS properties to allow--everything
            else will be stripped

        :arg allowed_css_keywords: set of CSS keywords to allow--everything
            else will be stripped

        :arg allowed_svg_properties: set of SVG properties to allow--everything
            else will be removed

        :arg allowed_protocols: set of allowed protocols for URIs

        :arg allowed_content_types: set of allowed content types for ``data`` URIs.

        :arg attr_val_is_uri: set of attributes that have URI values--values
            that have a scheme not listed in ``allowed_protocols`` are removed

        :arg svg_attr_val_allows_ref: set of SVG attributes that can have
            references

        :arg svg_allow_local_href: set of SVG elements that may only carry
            local (``#fragment``) ``xlink:href`` values--any other
            ``xlink:href`` value on these elements is removed

        """
        super(Filter, self).__init__(source)

        warnings.warn(_deprecation_msg, DeprecationWarning)

        self.allowed_elements = allowed_elements
        self.allowed_attributes = allowed_attributes
        self.allowed_css_properties = allowed_css_properties
        self.allowed_css_keywords = allowed_css_keywords
        self.allowed_svg_properties = allowed_svg_properties
        self.allowed_protocols = allowed_protocols
        self.allowed_content_types = allowed_content_types
        self.attr_val_is_uri = attr_val_is_uri
        self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
        self.svg_allow_local_href = svg_allow_local_href

    def __iter__(self):
        """Yield the sanitized tokens, skipping the ones that were dropped."""
        for token in base.Filter.__iter__(self):
            token = self.sanitize_token(token)
            if token:
                yield token

    def sanitize_token(self, token):
        """Sanitize a single token.

        Start, end and empty tags of allowed elements have their attributes
        filtered; tags of any other element are turned into text, so that a
        serializer will emit them escaped.  Attributes not in
        ``allowed_attributes`` are stripped, style attributes are reduced to
        the allowed CSS properties and keywords, and URI attributes keep their
        value only when the scheme is in ``allowed_protocols``.

        Examples (shown as the serialized result)::

            <script> do_nasty_stuff() </script>
              => &lt;script&gt; do_nasty_stuff() &lt;/script&gt;
            <a href="javascript: sucker();">Click here for $100</a>
              => <a>Click here for $100</a>

        :arg token: a token dict with at least a ``"type"`` key

        :returns: the sanitized token, or ``None`` for comment tokens (which
            ``__iter__`` then drops)

        """
        # accommodate filters which use token_type differently
        token_type = token["type"]
        if token_type in ("StartTag", "EndTag", "EmptyTag"):
            name = token["name"]
            namespace = token["namespace"]
            # A tag without a namespace is looked up as an HTML element.
            if ((namespace, name) in self.allowed_elements or
                    (namespace is None and
                     (namespaces["html"], name) in self.allowed_elements)):
                return self.allowed_token(token)
            return self.disallowed_token(token)
        if token_type == "Comment":
            return None
        return token

    def _uri_value_is_allowed(self, value):
        """Return True if the URI attribute value ``value`` may be kept."""
        # The origin of this character class is undocumented upstream.  It
        # strips backticks, C0 controls and space, DEL through NBSP and any
        # whitespace before the scheme is inspected, which can only make the
        # check reject *more* than it otherwise would.
        val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
                               unescape(value)).lower()
        # remove replacement characters from unescaped characters
        val_unescaped = val_unescaped.replace("\ufffd", "")
        try:
            uri = urlparse.urlparse(val_unescaped)
        except ValueError:
            return False
        if not uri.scheme:
            # Scheme-less (relative) references are always kept.
            return True
        if uri.scheme not in self.allowed_protocols:
            return False
        if uri.scheme == 'data':
            # data: URIs must additionally declare an allowed content type.
            m = data_content_type.match(uri.path)
            if not m or m.group('content_type') not in self.allowed_content_types:
                return False
        return True

    def _element_allows_only_local_href(self, token):
        """Return True if ``token``'s element is listed in ``svg_allow_local_href``.

        The default set stores ``(None, name)`` pairs; entries keyed by the
        token's namespace and bare element names are accepted as well so that
        caller-supplied sets in either form work.
        """
        name = token["name"]
        listed = self.svg_allow_local_href
        return ((None, name) in listed or
                (token.get("namespace"), name) in listed or
                name in listed)

    def allowed_token(self, token):
        """Filter the attributes of a token whose element is allowed.

        The token's ``"data"`` mapping (keyed by ``(namespace, name)``) is
        modified in place: attributes not in ``allowed_attributes`` and URI
        attributes with a rejected value are deleted, non-local ``url(...)``
        references in SVG attributes are blanked, non-local ``xlink:href``
        values on ``svg_allow_local_href`` elements are deleted, and the
        ``style`` attribute is passed through ``sanitize_css``.

        :arg token: the token dict; returned unchanged if it has no ``"data"``

        :returns: the same token dict

        """
        if "data" not in token:
            return token

        attrs = token["data"]
        attr_names = set(attrs.keys())

        # Remove forbidden attributes
        for to_remove in (attr_names - self.allowed_attributes):
            del attrs[to_remove]
            attr_names.remove(to_remove)

        # Remove attributes with disallowed URL values.  Each attribute is
        # deleted at most once, whichever check rejects it.
        for attr in (attr_names & self.attr_val_is_uri):
            if not self._uri_value_is_allowed(attrs[attr]):
                del attrs[attr]

        # Blank out url(...) references that do not start with "#".
        for attr in self.svg_attr_val_allows_ref:
            if attr in attrs:
                attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(attrs[attr]))

        # Drop xlink:href values that are not local fragment references.
        xlink_href = (namespaces['xlink'], 'href')
        if (xlink_href in attrs and
                self._element_allows_only_local_href(token) and
                re.search(r'^\s*[^#\s].*', attrs[xlink_href])):
            del attrs[xlink_href]

        style_attr = (None, 'style')
        if style_attr in attrs:
            attrs[style_attr] = self.sanitize_css(attrs[style_attr])
        token["data"] = attrs
        return token

    def disallowed_token(self, token):
        """Turn a tag token of a disallowed element into a ``Characters`` token.

        The token is modified in place: its ``"data"`` becomes the tag's
        markup as a string, its ``"type"`` becomes ``"Characters"`` and its
        ``"name"`` key is removed.

        :arg token: a ``StartTag``, ``EndTag`` or ``EmptyTag`` token dict

        :returns: the same token dict

        """
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]
        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # Namespaced attributes are written with their prefix.
                qname = name if ns is None else "%s:%s" % (prefixes[ns], name)
                attrs.append(' %s="%s"' % (qname, escape(v)))
            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
        else:
            token["data"] = "<%s>" % token["name"]
        if token.get("selfClosing"):
            # Replace the closing ">" with "/>".
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token

    def sanitize_css(self, style):
        """Return ``style`` reduced to the allowed CSS declarations.

        :arg style: the raw value of a ``style`` attribute

        :returns: the kept declarations, each rendered as ``"prop: value;"``
            and joined with single spaces; ``''`` if the style fails either
            of the two whole-string sanity checks

        """
        # disallow urls
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet
        if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
                                                'padding']:
                # Shorthand families: keep the declaration only if every
                # word is an allowed keyword, a colour or a short length.
                for keyword in value.split():
                    if keyword not in self.allowed_css_keywords and \
                            not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):  # noqa
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)