1"""Deprecated from html5lib 1.1.
2
3See `here <https://github.com/html5lib/html5lib-python/issues/443>`_ for
4information about its deprecation; `Bleach <https://github.com/mozilla/bleach>`_
5is recommended as a replacement. Please let us know in the aforementioned issue
6if Bleach is unsuitable for your needs.
7
8"""
9from __future__ import absolute_import, division, unicode_literals
10
11import re
12import warnings
13from xml.sax.saxutils import escape, unescape
14
15from bleach.six_shim import urllib_parse as urlparse
16
17from . import base
18from ..constants import namespaces, prefixes
19
# Only the Filter class is exported by ``from <module> import *``.
__all__ = ["Filter"]
21
22
# Text shared by the import-time warning below and the warning raised each
# time a Filter is constructed.
_deprecation_msg = (
    "html5lib's sanitizer is deprecated; see "
    "https://github.com/html5lib/html5lib-python/issues/443 and please let "
    "us know if Bleach is unsuitable for your needs"
)

# Warn as soon as the module is imported.
warnings.warn(_deprecation_msg, DeprecationWarning)
30
# (namespace URI, local name) pairs of the elements the sanitizer lets
# through by default.  The names are grouped per namespace prefix and the
# prefix is resolved through ``namespaces`` while the set is built.
allowed_elements = frozenset(
    (namespaces[prefix], tag)
    for prefix, tags in (
        ('html', (
            'a', 'abbr', 'acronym', 'address', 'area', 'article', 'aside',
            'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas',
            'caption', 'center', 'cite', 'code', 'col', 'colgroup',
            'command', 'datagrid', 'datalist', 'dd', 'del', 'details',
            'dfn', 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source',
            'fieldset', 'figcaption', 'figure', 'footer', 'font', 'form',
            'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img',
            'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm',
            'map', 'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol',
            'output', 'optgroup', 'option', 'p', 'pre', 'progress', 'q',
            's', 'samp', 'section', 'select', 'small', 'sound', 'source',
            'spacer', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
            'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead', 'tr',
            'tt', 'u', 'ul', 'var', 'video',
        )),
        ('mathml', (
            'maction', 'math', 'merror', 'mfrac', 'mi', 'mmultiscripts',
            'mn', 'mo', 'mover', 'mpadded', 'mphantom', 'mprescripts',
            'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub', 'msubsup',
            'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover',
            'none',
        )),
        ('svg', (
            'a', 'animate', 'animateColor', 'animateMotion',
            'animateTransform', 'clipPath', 'circle', 'defs', 'desc',
            'ellipse', 'font-face', 'font-face-name', 'font-face-src', 'g',
            'glyph', 'hkern', 'linearGradient', 'line', 'marker', 'metadata',
            'missing-glyph', 'mpath', 'path', 'polygon', 'polyline',
            'radialGradient', 'rect', 'set', 'stop', 'svg', 'switch', 'text',
            'title', 'tspan', 'use',
        )),
    )
    for tag in tags
)
194
# (namespace URI or None, local name) pairs of the attributes the sanitizer
# keeps by default.  Un-namespaced attributes use ``None``; prefixed ones are
# resolved through ``namespaces`` while the set is built.  A name may appear
# in more than one vocabulary group; the frozenset keeps a single copy.
allowed_attributes = frozenset(
    (None if prefix is None else namespaces[prefix], attr_name)
    for prefix, attr_names in (
        # HTML attributes
        (None, (
            'abbr', 'accept', 'accept-charset', 'accesskey', 'action',
            'align', 'alt', 'autocomplete', 'autofocus', 'axis',
            'background', 'balance', 'bgcolor', 'bgproperties', 'border',
            'bordercolor', 'bordercolordark', 'bordercolorlight',
            'bottompadding', 'cellpadding', 'cellspacing', 'ch', 'challenge',
            'char', 'charoff', 'choff', 'charset', 'checked', 'cite',
            'class', 'clear', 'color', 'cols', 'colspan', 'compact',
            'contenteditable', 'controls', 'coords', 'data', 'datafld',
            'datapagesize', 'datasrc', 'datetime', 'default', 'delay', 'dir',
            'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face',
            'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
            'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang',
            'hspace', 'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label',
            'leftspacing', 'lang', 'list', 'longdesc', 'loop', 'loopcount',
            'loopend', 'loopstart', 'low', 'lowsrc', 'max', 'maxlength',
            'media', 'method', 'min', 'multiple', 'name', 'nohref',
            'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping',
            'point-size', 'poster', 'pqg', 'preload', 'prompt', 'radiogroup',
            'readonly', 'rel', 'repeat-max', 'repeat-min', 'replace',
            'required', 'rev', 'rightspacing', 'rows', 'rowspan', 'rules',
            'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
            'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
            'template', 'title', 'toppadding', 'type', 'unselectable',
            'usemap', 'urn', 'valign', 'value', 'variable', 'volume',
            'vspace', 'vrml', 'width', 'wrap',
        )),
        ('xml', ('lang',)),
        # MathML attributes
        (None, (
            'actiontype', 'align', 'columnalign', 'columnlines',
            'columnspacing', 'columnspan', 'depth', 'display',
            'displaystyle', 'equalcolumns', 'equalrows', 'fence',
            'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
            'lspace', 'mathbackground', 'mathcolor', 'mathvariant',
            'maxsize', 'minsize', 'other', 'rowalign', 'rowlines',
            'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
            'separator', 'stretchy', 'width',
        )),
        ('xlink', ('href', 'show', 'type')),
        # SVG attributes
        (None, (
            'accent-height', 'accumulate', 'additive', 'alphabetic',
            'arabic-form', 'ascent', 'attributeName', 'attributeType',
            'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
            'class', 'clip-path', 'color', 'color-rendering', 'content',
            'cx', 'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end',
            'fill', 'fill-opacity', 'fill-rule', 'font-family', 'font-size',
            'font-stretch', 'font-style', 'font-variant', 'font-weight',
            'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits',
            'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x', 'id',
            'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
            'lang', 'marker-end', 'marker-mid', 'marker-start',
            'markerHeight', 'markerUnits', 'markerWidth', 'mathematical',
            'max', 'min', 'name', 'offset', 'opacity', 'orient', 'origin',
            'overline-position', 'overline-thickness', 'panose-1', 'path',
            'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX',
            'refY', 'repeatCount', 'repeatDur', 'requiredExtensions',
            'requiredFeatures', 'restart', 'rotate', 'rx', 'ry', 'slope',
            'stemh', 'stemv', 'stop-color', 'stop-opacity',
            'strikethrough-position', 'strikethrough-thickness', 'stroke',
            'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
            'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
            'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
            'transform', 'type', 'u1', 'u2', 'underline-position',
            'underline-thickness', 'unicode', 'unicode-range',
            'units-per-em', 'values', 'version', 'viewBox', 'visibility',
            'width', 'widths', 'x', 'x-height', 'x1', 'x2',
            'y', 'y1', 'y2', 'zoomAndPan',
        )),
        ('xlink', (
            'actuate', 'arcrole', 'href', 'role', 'show', 'title', 'type',
        )),
        ('xml', ('base', 'lang', 'space')),
    )
    for attr_name in attr_names
)
523
# Attributes whose value is treated as a URI; Filter.allowed_token checks the
# scheme of each of these against the allowed protocols.
attr_val_is_uri = frozenset(
    [(None, uri_attr) for uri_attr in (
        'href', 'src', 'cite', 'action', 'longdesc', 'poster',
        'background', 'datasrc', 'dynsrc', 'lowsrc', 'ping',
    )] +
    [
        (namespaces['xlink'], 'href'),
        (namespaces['xml'], 'base'),
    ]
)
539
# Un-namespaced SVG attributes whose value may contain a ``url(...)``
# reference; Filter.allowed_token blanks out the non-``#`` ones.
svg_attr_val_allows_ref = frozenset(
    (None, ref_attr)
    for ref_attr in (
        'clip-path', 'color-profile', 'cursor', 'fill', 'filter', 'marker',
        'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke',
    )
)
553
# SVG element names, stored as (None, name) pairs, that Filter.allowed_token
# consults when deciding whether to drop an ``xlink:href`` attribute.
svg_allow_local_href = frozenset(
    (None, element_name)
    for element_name in (
        'altGlyph', 'animate', 'animateColor', 'animateMotion',
        'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient',
        'pattern', 'radialGradient', 'textpath', 'tref', 'set', 'use',
    )
)
571
# CSS property names that Filter.sanitize_css keeps as-is in ``style``
# attributes.
allowed_css_properties = frozenset([
    'azimuth', 'background-color', 'border-bottom-color', 'border-collapse',
    'border-color', 'border-left-color', 'border-right-color',
    'border-top-color', 'clear', 'color', 'cursor', 'direction', 'display',
    'elevation', 'float', 'font', 'font-family', 'font-size', 'font-style',
    'font-variant', 'font-weight', 'height', 'letter-spacing', 'line-height',
    'overflow', 'pause', 'pause-after', 'pause-before', 'pitch',
    'pitch-range', 'richness', 'speak', 'speak-header', 'speak-numeral',
    'speak-punctuation', 'speech-rate', 'stress', 'text-align',
    'text-decoration', 'text-indent', 'unicode-bidi', 'vertical-align',
    'voice-family', 'volume', 'white-space', 'width',
])
620
# Keywords accepted by Filter.sanitize_css inside background/border/margin/
# padding values.
allowed_css_keywords = frozenset([
    'auto', 'aqua', 'black', 'block', 'blue', 'bold', 'both', 'bottom',
    'brown', 'center', 'collapse', 'dashed', 'dotted', 'fuchsia', 'gray',
    'green', '!important', 'italic', 'left', 'lime', 'maroon', 'medium',
    'none', 'navy', 'normal', 'nowrap', 'olive', 'pointer', 'purple', 'red',
    'right', 'solid', 'silver', 'teal', 'top', 'transparent', 'underline',
    'white', 'yellow',
])
662
# SVG presentation properties that Filter.sanitize_css keeps in ``style``
# attributes.
allowed_svg_properties = frozenset([
    'fill', 'fill-opacity', 'fill-rule',
    'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
    'stroke-opacity',
])
673
# URI schemes accepted in URI-valued attributes; a value with any other
# scheme causes the attribute to be removed.
allowed_protocols = frozenset([
    'ed2k', 'ftp', 'http', 'https', 'irc', 'mailto', 'news', 'gopher',
    'nntp', 'telnet', 'webcal', 'xmpp', 'callto', 'feed', 'urn', 'aim',
    'rsync', 'tag', 'ssh', 'sftp', 'rtsp', 'afs', 'data',
])
699
# Media types accepted for ``data:`` URIs.
allowed_content_types = frozenset([
    'image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp',
    'text/plain',
])
708
709
# Pattern applied to the path part of a ``data:`` URI (everything after the
# scheme).  The named group ``content_type`` captures the leading
# ``<type>/<subtype>``; an optional ``;charset=...`` and/or ``;base64`` may
# follow, in either order, before the mandatory comma that starts the payload.
# Compiled with re.VERBOSE, so whitespace and ``#`` comments inside the
# pattern are ignored by the regex engine.
data_content_type = re.compile(r'''
                                ^
                                # Match a content type <application>/<type>
                                (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
                                # Match any character set and encoding
                                (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
                                  |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
                                # Assume the rest is data
                                ,.*
                                $
                                ''',
                               re.VERBOSE)
722
723
class Filter(base.Filter):
    """Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes"""

    def __init__(self,
                 source,
                 allowed_elements=allowed_elements,
                 allowed_attributes=allowed_attributes,
                 allowed_css_properties=allowed_css_properties,
                 allowed_css_keywords=allowed_css_keywords,
                 allowed_svg_properties=allowed_svg_properties,
                 allowed_protocols=allowed_protocols,
                 allowed_content_types=allowed_content_types,
                 attr_val_is_uri=attr_val_is_uri,
                 svg_attr_val_allows_ref=svg_attr_val_allows_ref,
                 svg_allow_local_href=svg_allow_local_href):
        """Creates a Filter

        :arg source: the token stream to sanitize; passed to the base filter

        :arg allowed_elements: set of elements to allow--everything else will
            be escaped

        :arg allowed_attributes: set of attributes to allow in
            elements--everything else will be stripped

        :arg allowed_css_properties: set of CSS properties to allow--everything
            else will be stripped

        :arg allowed_css_keywords: set of CSS keywords to allow--everything
            else will be stripped

        :arg allowed_svg_properties: set of SVG properties to allow--everything
            else will be removed

        :arg allowed_protocols: set of allowed protocols for URIs

        :arg allowed_content_types: set of allowed content types for ``data`` URIs.

        :arg attr_val_is_uri: set of attributes that have URI values--values
            that have a scheme not listed in ``allowed_protocols`` are removed

        :arg svg_attr_val_allows_ref: set of SVG attributes that can have
            references

        :arg svg_allow_local_href: set of SVG elements that may only carry a
            local (``#fragment``) ``xlink:href``--any other ``xlink:href``
            value on these elements is removed.  Entries are ``(None, name)``
            pairs as in the module default; bare element names are accepted
            as well.

        """
        super(Filter, self).__init__(source)

        warnings.warn(_deprecation_msg, DeprecationWarning)

        self.allowed_elements = allowed_elements
        self.allowed_attributes = allowed_attributes
        self.allowed_css_properties = allowed_css_properties
        self.allowed_css_keywords = allowed_css_keywords
        self.allowed_svg_properties = allowed_svg_properties
        self.allowed_protocols = allowed_protocols
        self.allowed_content_types = allowed_content_types
        self.attr_val_is_uri = attr_val_is_uri
        self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
        self.svg_allow_local_href = svg_allow_local_href

    def __iter__(self):
        """Yield the sanitized tokens, skipping those sanitize_token drops."""
        for token in base.Filter.__iter__(self):
            token = self.sanitize_token(token)
            if token:
                yield token

    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
    # are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
    # ALLOWED_CSS_KEYWORDS, are allowed through. attributes in ATTR_VAL_IS_URI
    # are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are
    # allowed.
    #
    #   sanitize_html('<script> do_nasty_stuff() </script>')
    #    => the <script> and </script> tags become Characters tokens, i.e.
    #       they come out as text rather than as an element
    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #    => <a>Click here for $100</a>
    def sanitize_token(self, token):
        """Sanitize a single token.

        Tag tokens are passed to allowed_token or disallowed_token depending
        on whether the element is in ``allowed_elements`` (an element without
        a namespace is looked up as an HTML element).  Comment tokens are
        dropped by returning ``None``.  Every other token is returned
        unchanged.
        """
        # accommodate filters which use token_type differently
        token_type = token["type"]
        if token_type in ("StartTag", "EndTag", "EmptyTag"):
            name = token["name"]
            namespace = token["namespace"]
            if ((namespace, name) in self.allowed_elements or
                    (namespace is None and
                     (namespaces["html"], name) in self.allowed_elements)):
                return self.allowed_token(token)
            return self.disallowed_token(token)
        if token_type == "Comment":
            # Comments are removed from the stream (__iter__ skips None).
            return None
        return token

    def allowed_token(self, token):
        """Strip unsafe attributes from the token of an allowed element.

        Works on ``token["data"]`` in place and returns the same token:
        attributes outside ``allowed_attributes`` are removed, URI-valued
        attributes with an unacceptable value are removed, non-local
        ``url(...)`` references in SVG attributes are blanked, non-local
        ``xlink:href`` values on the ``svg_allow_local_href`` elements are
        removed, and ``style`` is passed through sanitize_css.
        """
        if "data" in token:
            attrs = token["data"]
            attr_names = set(attrs.keys())

            # Remove forbidden attributes
            for to_remove in (attr_names - self.allowed_attributes):
                del token["data"][to_remove]
                attr_names.remove(to_remove)

            # Remove attributes with disallowed URL values.  Each attribute is
            # deleted at most once, whatever combination of checks it fails.
            for attr in (attr_names & self.attr_val_is_uri):
                if not self._uri_value_allowed(attrs[attr]):
                    del attrs[attr]

            # Blank out url(...) references that do not start with '#'.
            for attr in self.svg_attr_val_allows_ref:
                if attr in attrs:
                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                         ' ',
                                         unescape(attrs[attr]))

            # svg_allow_local_href holds (None, name) pairs, so the element
            # name has to be looked up in that form; a bare-name lookup is
            # kept for callers that pass plain strings.
            xlink_href = (namespaces['xlink'], 'href')
            element_name = token["name"]
            if (((None, element_name) in self.svg_allow_local_href or
                 element_name in self.svg_allow_local_href) and
                    xlink_href in attrs and
                    re.search(r'^\s*[^#\s].*', attrs[xlink_href])):
                del attrs[xlink_href]

            if (None, 'style') in attrs:
                attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
            token["data"] = attrs
        return token

    def _uri_value_allowed(self, value):
        """Return True if the URI attribute value ``value`` may be kept.

        The value is rejected when it cannot be parsed, when it has a scheme
        that is not in ``allowed_protocols``, or when it is a ``data`` URI
        whose content type is malformed or not in ``allowed_content_types``.
        Values without a scheme are accepted.
        """
        # I don't have a clue where this regexp comes from or why it matches those
        # characters, nor why we call unescape. I just know it's always been here.
        # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
        # this will do is remove *more* than it otherwise would.
        val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
                               unescape(value)).lower()
        # remove replacement characters from unescaped characters
        val_unescaped = val_unescaped.replace("\ufffd", "")
        try:
            uri = urlparse.urlparse(val_unescaped)
        except ValueError:
            return False
        if not uri.scheme:
            return True
        if uri.scheme not in self.allowed_protocols:
            return False
        if uri.scheme == 'data':
            m = data_content_type.match(uri.path)
            if not m:
                return False
            if m.group('content_type') not in self.allowed_content_types:
                return False
        return True

    def disallowed_token(self, token):
        """Turn the tag token of a disallowed element into a Characters token.

        The tag is rebuilt as text (``<name attr="value">``, ``</name>`` or a
        self-closing form) and stored in ``token["data"]``; the token type is
        set to ``"Characters"`` and its ``"name"`` key is removed.  The token
        is modified in place and returned.
        """
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]
        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # Namespaced attributes are written with their prefix.
                qualified = name if ns is None else "%s:%s" % (prefixes[ns], name)
                attrs.append(' %s="%s"' % (qualified, escape(v)))
            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
        else:
            token["data"] = "<%s>" % token["name"]
        if token.get("selfClosing"):
            # Replace the trailing '>' with '/>'.
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token

    def sanitize_css(self, style):
        """Return ``style`` reduced to the permitted declarations.

        ``url(...)`` values are blanked first.  If the remaining text fails
        either of the two "gauntlet" patterns the result is the empty string.
        Otherwise a declaration is kept when its property is in
        ``allowed_css_properties`` or ``allowed_svg_properties``, or when it
        is a background/border/margin/padding property whose value consists
        only of allowed keywords, colours and simple lengths.
        """
        # disallow urls
        style = re.sub(r'url\s*\(\s*[^\s)]+?\s*\)\s*', ' ', style)

        # gauntlet
        if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
                                                'padding']:
                for keyword in value.split():
                    if keyword not in self.allowed_css_keywords and \
                            not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):  # noqa
                        break
                else:
                    # Every keyword passed, so the declaration is safe.
                    clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)