1"""Deprecated from html5lib 1.1.
2
3See `here <https://github.com/html5lib/html5lib-python/issues/443>`_ for
4information about its deprecation; `Bleach <https://github.com/mozilla/bleach>`_
5is recommended as a replacement. Please let us know in the aforementioned issue
6if Bleach is unsuitable for your needs.
7
8"""
9from __future__ import absolute_import, division, unicode_literals
10
11import re
12import warnings
13from xml.sax.saxutils import escape, unescape
14
15from bleach.six_shim import urllib_parse as urlparse
16
17from . import base
18from ..constants import namespaces, prefixes
19
# Names exported by ``from <this module> import *``.
__all__ = ["Filter"]


# Text shared by the import-time warning below and the one raised from
# ``Filter.__init__``.
_deprecation_msg = (
    "html5lib's sanitizer is deprecated; see "
    "https://github.com/html5lib/html5lib-python/issues/443 and please let "
    "us know if Bleach is unsuitable for your needs"
)

# Importing this module is itself enough to trigger the deprecation warning.
warnings.warn(_deprecation_msg, category=DeprecationWarning)
30
# Default whitelist of elements, stored as (namespace URI, local name) pairs.
# ``Filter.sanitize_token`` passes tags found here to ``allowed_token`` and
# sends everything else to ``disallowed_token``.
allowed_elements = (
    # HTML elements
    frozenset((namespaces['html'], tag) for tag in (
        'a', 'abbr', 'acronym', 'address', 'area', 'article', 'aside',
        'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas',
        'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'command',
        'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog',
        'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
        'figcaption', 'figure', 'footer', 'font', 'form', 'header',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
        'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map',
        'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output',
        'optgroup', 'option', 'p', 'pre', 'progress', 'q', 's', 'samp',
        'section', 'select', 'small', 'sound', 'source', 'spacer', 'span',
        'strike', 'strong', 'sub', 'sup', 'table', 'tbody', 'td',
        'textarea', 'time', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', 'ul',
        'var', 'video', 'wbr',
    ))
    # MathML elements
    | frozenset((namespaces['mathml'], tag) for tag in (
        'maction', 'math', 'merror', 'mfrac', 'mi', 'mmultiscripts', 'mn',
        'mo', 'mover', 'mpadded', 'mphantom', 'mprescripts', 'mroot',
        'mrow', 'mspace', 'msqrt', 'mstyle', 'msub', 'msubsup', 'msup',
        'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'none',
    ))
    # SVG elements
    | frozenset((namespaces['svg'], tag) for tag in (
        'a', 'animate', 'animateColor', 'animateMotion',
        'animateTransform', 'clipPath', 'circle', 'defs', 'desc',
        'ellipse', 'font-face', 'font-face-name', 'font-face-src', 'g',
        'glyph', 'hkern', 'linearGradient', 'line', 'marker', 'metadata',
        'missing-glyph', 'mpath', 'path', 'polygon', 'polyline',
        'radialGradient', 'rect', 'set', 'stop', 'svg', 'switch', 'text',
        'title', 'tspan', 'use',
    ))
)
195
# Default whitelist of attributes, stored as (namespace URI or None, local
# name) pairs. ``Filter.allowed_token`` deletes every attribute of an allowed
# element that is not in this set. Names shared by several vocabularies are
# listed under each of them; the frozenset collapses the repeats.
allowed_attributes = (
    # HTML attributes
    frozenset((None, attr) for attr in (
        'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
        'alt', 'autocomplete', 'autofocus', 'axis', 'background', 'balance',
        'bgcolor', 'bgproperties', 'border', 'bordercolor',
        'bordercolordark', 'bordercolorlight', 'bottompadding',
        'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
        'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
        'cols', 'colspan', 'compact', 'contenteditable', 'controls',
        'coords', 'data', 'datafld', 'datapagesize', 'datasrc', 'datetime',
        'default', 'delay', 'dir', 'disabled', 'draggable', 'dynsrc',
        'enctype', 'end', 'face', 'for', 'form', 'frame', 'galleryimg',
        'gutter', 'headers', 'height', 'hidefocus', 'hidden', 'high',
        'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode', 'ismap',
        'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
        'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
        'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
        'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping',
        'point-size', 'poster', 'pqg', 'preload', 'prompt', 'radiogroup',
        'readonly', 'rel', 'repeat-max', 'repeat-min', 'replace',
        'required', 'rev', 'rightspacing', 'rows', 'rowspan', 'rules',
        'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
        'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
        'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
        'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
        'width', 'wrap',
    ))
    # MathML attributes
    | frozenset((None, attr) for attr in (
        'actiontype', 'align', 'columnalign', 'columnlines',
        'columnspacing', 'columnspan', 'depth', 'display', 'displaystyle',
        'equalcolumns', 'equalrows', 'fence', 'fontstyle', 'fontweight',
        'frame', 'height', 'linethickness', 'lspace', 'mathbackground',
        'mathcolor', 'mathvariant', 'maxsize', 'minsize', 'other',
        'rowalign', 'rowlines', 'rowspacing', 'rowspan', 'rspace',
        'scriptlevel', 'selection', 'separator', 'stretchy', 'width',
    ))
    # SVG attributes
    | frozenset((None, attr) for attr in (
        'accent-height', 'accumulate', 'additive', 'alphabetic',
        'arabic-form', 'ascent', 'attributeName', 'attributeType',
        'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
        'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
        'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
        'fill-opacity', 'fill-rule', 'font-family', 'font-size',
        'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
        'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
        'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
        'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
        'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
        'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
        'opacity', 'orient', 'origin', 'overline-position',
        'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
        'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
        'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
        'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
        'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
        'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
        'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
        'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
        'transform', 'type', 'u1', 'u2', 'underline-position',
        'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
        'values', 'version', 'viewBox', 'visibility', 'width', 'widths',
        'x', 'x-height', 'x1', 'x2', 'y', 'y1', 'y2', 'zoomAndPan',
    ))
    # Namespaced attributes (xlink:* and xml:*)
    | frozenset((namespaces['xlink'], attr) for attr in (
        'actuate', 'arcrole', 'href', 'role', 'show', 'title', 'type',
    ))
    | frozenset((namespaces['xml'], attr) for attr in (
        'base', 'lang', 'space',
    ))
)
524
# Attributes whose value is treated as a URI: ``Filter.allowed_token`` parses
# each of these and drops the attribute when its scheme is not permitted.
attr_val_is_uri = (
    frozenset((None, attr) for attr in (
        'href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background',
        'datasrc', 'dynsrc', 'lowsrc', 'ping', 'formaction',
    ))
    | frozenset((
        (namespaces['xlink'], 'href'),
        (namespaces['xml'], 'base'),
    ))
)
541
# SVG attributes whose value may contain a ``url(...)`` reference.
# ``Filter.allowed_token`` blanks out every such reference that does not
# start with ``#``.
svg_attr_val_allows_ref = frozenset((None, attr) for attr in (
    'clip-path', 'color-profile', 'cursor', 'fill', 'filter', 'marker',
    'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke',
))
555
# SVG element names, stored as (None, name) pairs, that
# ``Filter.allowed_token`` consults before deciding whether to drop an
# ``xlink:href`` attribute.
svg_allow_local_href = frozenset((None, element) for element in (
    'altGlyph', 'animate', 'animateColor', 'animateMotion',
    'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient',
    'pattern', 'radialGradient', 'textpath', 'tref', 'set', 'use',
))
573
# CSS property names that ``Filter.sanitize_css`` keeps when cleaning a
# ``style`` attribute (compared against the lower-cased property name).
allowed_css_properties = frozenset([
    'azimuth', 'background-color', 'border-bottom-color', 'border-collapse',
    'border-color', 'border-left-color', 'border-right-color',
    'border-top-color', 'clear', 'color', 'cursor', 'direction', 'display',
    'elevation', 'float', 'font', 'font-family', 'font-size', 'font-style',
    'font-variant', 'font-weight', 'height', 'letter-spacing', 'line-height',
    'overflow', 'pause', 'pause-after', 'pause-before', 'pitch',
    'pitch-range', 'richness', 'speak', 'speak-header', 'speak-numeral',
    'speak-punctuation', 'speech-rate', 'stress', 'text-align',
    'text-decoration', 'text-indent', 'unicode-bidi', 'vertical-align',
    'voice-family', 'volume', 'white-space', 'width',
])
622
# Keywords that ``Filter.sanitize_css`` accepts as values of the
# background/border/margin/padding shorthand families.
allowed_css_keywords = frozenset([
    'auto', 'aqua', 'black', 'block', 'blue', 'bold', 'both', 'bottom',
    'brown', 'center', 'collapse', 'dashed', 'dotted', 'fuchsia', 'gray',
    'green', '!important', 'italic', 'left', 'lime', 'maroon', 'medium',
    'none', 'navy', 'normal', 'nowrap', 'olive', 'pointer', 'purple', 'red',
    'right', 'solid', 'silver', 'teal', 'top', 'transparent', 'underline',
    'white', 'yellow',
])
664
# SVG presentation properties that ``Filter.sanitize_css`` keeps in a
# ``style`` attribute.
allowed_svg_properties = frozenset([
    'fill', 'fill-opacity', 'fill-rule',
    'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
    'stroke-opacity',
])
675
# URI schemes that ``Filter.allowed_token`` lets through in URI-valued
# attributes; an attribute with any other scheme is removed.
allowed_protocols = frozenset([
    'ed2k', 'ftp', 'http', 'https', 'irc', 'mailto', 'news', 'gopher',
    'nntp', 'telnet', 'webcal', 'xmpp', 'callto', 'feed', 'urn', 'aim',
    'rsync', 'tag', 'ssh', 'sftp', 'rtsp', 'afs', 'data',
])
701
# Media types that ``Filter.allowed_token`` accepts inside ``data:`` URIs.
allowed_content_types = frozenset([
    'image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp',
    'text/plain',
])
710
711
# Pattern applied by ``Filter.allowed_token`` to the path part of a ``data:``
# URI. It captures the media type in the named group ``content_type``, then
# accepts optional ``;charset=...`` and ``;base64`` parameters in either
# order, and finally requires a comma followed by the payload.
# NOTE: the in-pattern comment reads "<application>/<type>", but the group
# matches any "<type>/<subtype>" pair such as ``image/png``.
data_content_type = re.compile(r'''
 ^
 # Match a content type <application>/<type>
 (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
 # Match any character set and encoding
 (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
 |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
 # Assume the rest is data
 ,.*
 $
 ''',
 re.VERBOSE)
724
725
class Filter(base.Filter):
    """Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes"""

    def __init__(self,
                 source,
                 allowed_elements=allowed_elements,
                 allowed_attributes=allowed_attributes,
                 allowed_css_properties=allowed_css_properties,
                 allowed_css_keywords=allowed_css_keywords,
                 allowed_svg_properties=allowed_svg_properties,
                 allowed_protocols=allowed_protocols,
                 allowed_content_types=allowed_content_types,
                 attr_val_is_uri=attr_val_is_uri,
                 svg_attr_val_allows_ref=svg_attr_val_allows_ref,
                 svg_allow_local_href=svg_allow_local_href):
        """Creates a Filter

        Emits a ``DeprecationWarning`` every time a filter is created.

        :arg source: token stream to filter; handed unchanged to
            ``base.Filter.__init__``

        :arg allowed_elements: set of elements to allow--everything else will
            be escaped

        :arg allowed_attributes: set of attributes to allow in
            elements--everything else will be stripped

        :arg allowed_css_properties: set of CSS properties to allow--everything
            else will be stripped

        :arg allowed_css_keywords: set of CSS keywords to allow--everything
            else will be stripped

        :arg allowed_svg_properties: set of SVG properties to allow--everything
            else will be removed

        :arg allowed_protocols: set of allowed protocols for URIs

        :arg allowed_content_types: set of allowed content types for ``data`` URIs.

        :arg attr_val_is_uri: set of attributes that have URI values--values
            that have a scheme not listed in ``allowed_protocols`` are removed

        :arg svg_attr_val_allows_ref: set of SVG attributes that can have
            references

        :arg svg_allow_local_href: set of SVG elements that may only carry a
            local ``xlink:href`` (one starting with ``#``)--any other
            ``xlink:href`` on these elements is removed. Entries may be plain
            element names or ``(None, name)`` tuples (the form used by the
            module-level default).

        """
        super(Filter, self).__init__(source)

        warnings.warn(_deprecation_msg, DeprecationWarning)

        self.allowed_elements = allowed_elements
        self.allowed_attributes = allowed_attributes
        self.allowed_css_properties = allowed_css_properties
        self.allowed_css_keywords = allowed_css_keywords
        self.allowed_svg_properties = allowed_svg_properties
        self.allowed_protocols = allowed_protocols
        self.allowed_content_types = allowed_content_types
        self.attr_val_is_uri = attr_val_is_uri
        self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
        self.svg_allow_local_href = svg_allow_local_href

    def __iter__(self):
        """Yield each upstream token after sanitizing it.

        Tokens for which ``sanitize_token`` returns a falsy value (comments)
        are skipped.
        """
        for token in base.Filter.__iter__(self):
            token = self.sanitize_token(token)
            if token:
                yield token

    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
    # are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
    # ALLOWED_CSS_KEYWORDS, are allowed through. attributes in ATTR_VAL_IS_URI
    # are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are
    # allowed.
    #
    #   sanitize_html('<script> do_nasty_stuff() </script>')
    #    => the <script> tags are emitted as literal text, not as elements
    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #    => <a>Click here for $100</a>
    def sanitize_token(self, token):
        """Sanitize one token.

        :arg token: token dict; tag tokens must carry ``"name"`` and
            ``"namespace"`` keys

        :returns: the processed token, or ``None`` for a comment token, which
            ``__iter__`` then drops from the stream
        """
        # accommodate filters which use token_type differently
        token_type = token["type"]
        if token_type in ("StartTag", "EndTag", "EmptyTag"):
            name = token["name"]
            namespace = token["namespace"]
            # A tag without a namespace is looked up as if it were HTML.
            if ((namespace, name) in self.allowed_elements or
                (namespace is None and
                 (namespaces["html"], name) in self.allowed_elements)):
                return self.allowed_token(token)
            return self.disallowed_token(token)
        if token_type == "Comment":
            # Comments are removed from the output.
            return None
        return token

    def allowed_token(self, token):
        """Strip unsafe attributes from the token of a whitelisted element.

        The token's attribute mapping is modified in place: attributes outside
        ``allowed_attributes`` are deleted, URI attributes with a disallowed
        scheme (or a ``data:`` URI with a disallowed content type) are
        deleted, non-local ``url(...)`` references in SVG attributes are
        blanked, non-local ``xlink:href`` values on the elements listed in
        ``svg_allow_local_href`` are deleted, and ``style`` is passed through
        ``sanitize_css``.

        :arg token: tag token; attributes, if any, live under ``"data"``
            keyed by ``(namespace, name)`` tuples

        :returns: the same token object
        """
        if "data" in token:
            attrs = token["data"]
            attr_names = set(attrs.keys())

            # Remove forbidden attributes
            for to_remove in (attr_names - self.allowed_attributes):
                del token["data"][to_remove]
                attr_names.remove(to_remove)

            # Remove attributes with disallowed URL values
            for attr in (attr_names & self.attr_val_is_uri):
                assert attr in attrs
                # I don't have a clue where this regexp comes from or why it matches those
                # characters, nor why we call unescape. I just know it's always been here.
                # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
                # this will do is remove *more* than it otherwise would.
                val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
                                       unescape(attrs[attr])).lower()
                # remove replacement characters from unescaped characters
                val_unescaped = val_unescaped.replace("\ufffd", "")
                try:
                    uri = urlparse.urlparse(val_unescaped)
                except ValueError:
                    # Unparseable value: drop the attribute outright.
                    del attrs[attr]
                    continue
                if not uri.scheme:
                    # Scheme-less (relative) references are kept.
                    continue
                # The two checks are mutually exclusive so the attribute is
                # deleted at most once; previously a "data" URI whose scheme
                # was also missing from allowed_protocols could be deleted
                # twice, raising KeyError.
                if uri.scheme not in self.allowed_protocols:
                    del attrs[attr]
                elif uri.scheme == 'data':
                    m = data_content_type.match(uri.path)
                    if (not m or
                            m.group('content_type') not in self.allowed_content_types):
                        del attrs[attr]

            # Blank out url(...) references that do not start with "#".
            for attr in self.svg_attr_val_allows_ref:
                if attr in attrs:
                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                         ' ',
                                         unescape(attrs[attr]))

            # svg_allow_local_href defaults to (None, name) tuples while
            # token["name"] is a plain string, so test both spellings;
            # comparing the bare name alone never matched the default set.
            name = token["name"]
            xlink_href = (namespaces['xlink'], 'href')
            local_href_only = (name in self.svg_allow_local_href or
                               (None, name) in self.svg_allow_local_href)
            if (local_href_only and xlink_href in attrs and
                    re.search(r'^\s*[^#\s].*', attrs[xlink_href])):
                del attrs[xlink_href]

            if (None, 'style') in attrs:
                attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
            token["data"] = attrs
        return token

    def disallowed_token(self, token):
        """Turn the token of a non-whitelisted tag into a ``Characters`` token.

        The token's ``"data"`` becomes the tag rendered as text (attribute
        values are passed through ``escape``), its ``"type"`` becomes
        ``"Characters"`` and its ``"name"`` key is removed.

        :arg token: ``StartTag``, ``EndTag`` or ``EmptyTag`` token

        :returns: the same token object, rewritten in place
        """
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]
        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # Namespaced attributes are written as "prefix:name".
                qualified = name if ns is None else "%s:%s" % (prefixes[ns], name)
                attrs.append(' %s="%s"' % (qualified, escape(v)))
            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
        else:
            token["data"] = "<%s>" % token["name"]
        if token.get("selfClosing"):
            # Replace the trailing ">" with "/>".
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token

    def sanitize_css(self, style):
        """Return ``style`` reduced to its whitelisted declarations.

        ``url(...)`` values are blanked first. The whole string must then
        pass two structural checks, otherwise ``''`` is returned. Of the
        remaining declarations, those in ``allowed_css_properties`` or
        ``allowed_svg_properties`` are kept as they are, and
        background/border/margin/padding declarations are kept only when
        every word of the value is an allowed keyword or matches the
        colour/length pattern.

        :arg style: raw value of a ``style`` attribute

        :returns: the kept declarations, each as ``"prop: value;"``, joined
            by single spaces
        """
        # disallow urls
        style = re.sub(r'url\s*\(\s*[^\s)]+?\s*\)\s*', ' ', style)

        # gauntlet
        if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ('background', 'border', 'margin',
                                                'padding'):
                for keyword in value.split():
                    if keyword not in self.allowed_css_keywords and \
                            not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):  # noqa
                        break
                else:
                    # Reached only when no word of the value was rejected.
                    clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)