1"""Deprecated from html5lib 1.1.
2
3See `here <https://github.com/html5lib/html5lib-python/issues/443>`_ for
4information about its deprecation; `Bleach <https://github.com/mozilla/bleach>`_
5is recommended as a replacement. Please let us know in the aforementioned issue
6if Bleach is unsuitable for your needs.
7
8"""
9from __future__ import absolute_import, division, unicode_literals
10
11import re
12import warnings
13from xml.sax.saxutils import escape, unescape
14
15from bleach.six_shim import urllib_parse as urlparse
16
17from . import base
18from ..constants import namespaces, prefixes
19
# Only ``Filter`` is exported by a star-import of this module; the whitelist
# constants defined below stay reachable as ordinary module attributes.
__all__ = ["Filter"]
21
22
# Single source of truth for the deprecation text; it is emitted once when the
# module is imported (below) and again each time a Filter is constructed.
_deprecation_msg = "".join([
    "html5lib's sanitizer is deprecated; see ",
    "https://github.com/html5lib/html5lib-python/issues/443 and please let ",
    "us know if Bleach is unsuitable for your needs",
])

# Import-time notice.
warnings.warn(_deprecation_msg, category=DeprecationWarning)
30
# Default element whitelist used by ``Filter``: a frozenset of
# (namespace URI, local name) pairs, grouped here by namespace prefix.
allowed_elements = frozenset(
    (namespaces[prefix], local_name)
    for prefix, local_names in (
        ('html', (
            'a', 'abbr', 'acronym', 'address', 'area', 'article', 'aside',
            'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas',
            'caption', 'center', 'cite', 'code', 'col', 'colgroup',
            'command', 'datagrid', 'datalist', 'dd', 'del', 'details',
            'dfn', 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source',
            'fieldset', 'figcaption', 'figure', 'footer', 'font', 'form',
            'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img',
            'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm',
            'map', 'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol',
            'output', 'optgroup', 'option', 'p', 'pre', 'progress', 'q',
            's', 'samp', 'section', 'select', 'small', 'sound', 'source',
            'spacer', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
            'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead', 'tr',
            'tt', 'u', 'ul', 'var', 'video', 'wbr',
        )),
        ('mathml', (
            'maction', 'math', 'merror', 'mfrac', 'mi', 'mmultiscripts',
            'mn', 'mo', 'mover', 'mpadded', 'mphantom', 'mprescripts',
            'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub', 'msubsup',
            'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover',
            'none',
        )),
        ('svg', (
            'a', 'animate', 'animateColor', 'animateMotion',
            'animateTransform', 'clipPath', 'circle', 'defs', 'desc',
            'ellipse', 'font-face', 'font-face-name', 'font-face-src', 'g',
            'glyph', 'hkern', 'linearGradient', 'line', 'marker',
            'metadata', 'missing-glyph', 'mpath', 'path', 'polygon',
            'polyline', 'radialGradient', 'rect', 'set', 'stop', 'svg',
            'switch', 'text', 'title', 'tspan', 'use',
        )),
    )
    for local_name in local_names
)
195
# Default attribute whitelist used by ``Filter``: a frozenset of
# (namespace URI or None, local name) pairs. Names are grouped by the
# vocabulary they come from; a name may appear in more than one group, which
# is harmless because the result is a set.
allowed_attributes = frozenset(
    (None if prefix is None else namespaces[prefix], attr_name)
    for prefix, attr_names in (
        # HTML attributes
        (None, (
            'abbr', 'accept', 'accept-charset', 'accesskey', 'action',
            'align', 'alt', 'autocomplete', 'autofocus', 'axis',
            'background', 'balance', 'bgcolor', 'bgproperties', 'border',
            'bordercolor', 'bordercolordark', 'bordercolorlight',
            'bottompadding', 'cellpadding', 'cellspacing', 'ch',
            'challenge', 'char', 'charoff', 'choff', 'charset', 'checked',
            'cite', 'class', 'clear', 'color', 'cols', 'colspan', 'compact',
            'contenteditable', 'controls', 'coords', 'data', 'datafld',
            'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
            'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
            'face', 'for', 'form', 'frame', 'galleryimg', 'gutter',
            'headers', 'height', 'hidefocus', 'hidden', 'high', 'href',
            'hreflang', 'hspace', 'icon', 'id', 'inputmode', 'ismap',
            'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
            'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc',
            'max', 'maxlength', 'media', 'method', 'min', 'multiple',
            'name', 'nohref', 'noshade', 'nowrap', 'open', 'optimum',
            'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload',
            'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max',
            'repeat-min', 'replace', 'required', 'rev', 'rightspacing',
            'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
            'size', 'span', 'src', 'start', 'step', 'style', 'summary',
            'suppress', 'tabindex', 'target', 'template', 'title',
            'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign',
            'value', 'variable', 'volume', 'vspace', 'vrml', 'width',
            'wrap',
        )),
        ('xml', ('lang',)),
        # MathML attributes
        (None, (
            'actiontype', 'align', 'columnalign', 'columnlines',
            'columnspacing', 'columnspan', 'depth', 'display',
            'displaystyle', 'equalcolumns', 'equalrows', 'fence',
            'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
            'lspace', 'mathbackground', 'mathcolor', 'mathvariant',
            'maxsize', 'minsize', 'other', 'rowalign', 'rowlines',
            'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
            'separator', 'stretchy', 'width',
        )),
        ('xlink', ('href', 'show', 'type')),
        # SVG attributes
        (None, (
            'accent-height', 'accumulate', 'additive', 'alphabetic',
            'arabic-form', 'ascent', 'attributeName', 'attributeType',
            'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
            'class', 'clip-path', 'color', 'color-rendering', 'content',
            'cx', 'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end',
            'fill', 'fill-opacity', 'fill-rule', 'font-family', 'font-size',
            'font-stretch', 'font-style', 'font-variant', 'font-weight',
            'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits',
            'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x', 'id',
            'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
            'lang', 'marker-end', 'marker-mid', 'marker-start',
            'markerHeight', 'markerUnits', 'markerWidth', 'mathematical',
            'max', 'min', 'name', 'offset', 'opacity', 'orient', 'origin',
            'overline-position', 'overline-thickness', 'panose-1', 'path',
            'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX',
            'refY', 'repeatCount', 'repeatDur', 'requiredExtensions',
            'requiredFeatures', 'restart', 'rotate', 'rx', 'ry', 'slope',
            'stemh', 'stemv', 'stop-color', 'stop-opacity',
            'strikethrough-position', 'strikethrough-thickness', 'stroke',
            'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
            'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
            'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
            'transform', 'type', 'u1', 'u2', 'underline-position',
            'underline-thickness', 'unicode', 'unicode-range',
            'units-per-em', 'values', 'version', 'viewBox', 'visibility',
            'width', 'widths', 'x', 'x-height', 'x1', 'x2',
            'y', 'y1', 'y2', 'zoomAndPan',
        )),
        ('xlink', (
            'actuate', 'arcrole', 'href', 'role', 'show', 'title', 'type',
        )),
        ('xml', ('base', 'lang', 'space')),
    )
    for attr_name in attr_names
)
524
# Attributes whose value is a URI; ``Filter`` checks the scheme of these
# values against its allowed protocols.
attr_val_is_uri = frozenset(
    (None, local_name)
    for local_name in (
        'href', 'src', 'cite', 'action', 'longdesc', 'poster',
        'background', 'datasrc', 'dynsrc', 'lowsrc', 'ping',
    )
) | frozenset([
    (namespaces['xlink'], 'href'),
    (namespaces['xml'], 'base'),
])
540
# Un-namespaced SVG attributes whose value may contain a ``url(...)``
# reference; ``Filter`` blanks out references that do not start with ``#``.
svg_attr_val_allows_ref = frozenset(
    (None, local_name)
    for local_name in (
        'clip-path', 'color-profile', 'cursor', 'fill', 'filter',
        'marker', 'marker-start', 'marker-mid', 'marker-end',
        'mask', 'stroke',
    )
)
554
# SVG elements, stored as (None, element name) pairs, that ``Filter``
# consults when deciding whether to drop an ``xlink:href`` attribute.
svg_allow_local_href = frozenset(
    (None, element_name)
    for element_name in (
        'altGlyph', 'animate', 'animateColor', 'animateMotion',
        'animateTransform', 'cursor', 'feImage', 'filter',
        'linearGradient', 'pattern', 'radialGradient', 'textpath',
        'tref', 'set', 'use',
    )
)
572
# CSS property names that ``Filter.sanitize_css`` keeps in ``style``
# attributes by default.
allowed_css_properties = frozenset([
    'azimuth', 'background-color', 'border-bottom-color', 'border-collapse',
    'border-color', 'border-left-color', 'border-right-color',
    'border-top-color', 'clear', 'color', 'cursor', 'direction', 'display',
    'elevation', 'float', 'font', 'font-family', 'font-size', 'font-style',
    'font-variant', 'font-weight', 'height', 'letter-spacing', 'line-height',
    'overflow', 'pause', 'pause-after', 'pause-before', 'pitch',
    'pitch-range', 'richness', 'speak', 'speak-header', 'speak-numeral',
    'speak-punctuation', 'speech-rate', 'stress', 'text-align',
    'text-decoration', 'text-indent', 'unicode-bidi', 'vertical-align',
    'voice-family', 'volume', 'white-space', 'width',
])
621
# Keyword values that ``Filter.sanitize_css`` accepts for the
# background/border/margin/padding property families by default.
allowed_css_keywords = frozenset([
    'auto', 'aqua', 'black', 'block', 'blue', 'bold', 'both', 'bottom',
    'brown', 'center', 'collapse', 'dashed', 'dotted', 'fuchsia', 'gray',
    'green', '!important', 'italic', 'left', 'lime', 'maroon', 'medium',
    'none', 'navy', 'normal', 'nowrap', 'olive', 'pointer', 'purple', 'red',
    'right', 'solid', 'silver', 'teal', 'top', 'transparent', 'underline',
    'white', 'yellow',
])
663
# SVG presentation properties that ``Filter.sanitize_css`` keeps in
# ``style`` attributes by default.
allowed_svg_properties = frozenset([
    'fill', 'fill-opacity', 'fill-rule',
    'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
    'stroke-opacity',
])
674
# URI schemes that ``Filter`` lets through in URI-valued attributes by
# default.
allowed_protocols = frozenset([
    'ed2k', 'ftp', 'http', 'https', 'irc', 'mailto', 'news', 'gopher',
    'nntp', 'telnet', 'webcal', 'xmpp', 'callto', 'feed', 'urn', 'aim',
    'rsync', 'tag', 'ssh', 'sftp', 'rtsp', 'afs', 'data',
])
700
# Content types that ``Filter`` accepts inside ``data:`` URIs by default.
allowed_content_types = frozenset([
    'image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp',
    'text/plain',
])
709
710
# Pattern applied by ``Filter.allowed_token`` to the path component of a
# ``data:`` URI. It requires a ``<type>/<subtype>`` content type (captured in
# the ``content_type`` group), then optional ``;charset=...`` and/or
# ``;base64`` parameters in either order, then a comma followed by the data.
# Compiled with ``re.VERBOSE``, so whitespace and ``#`` comments inside the
# pattern are ignored by the regex engine.
data_content_type = re.compile(r'''
 ^
 # Match a content type <application>/<type>
 (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
 # Match any character set and encoding
 (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
 |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
 # Assume the rest is data
 ,.*
 $
 ''',
 re.VERBOSE)
723
724
class Filter(base.Filter):
    """Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes"""

    def __init__(self,
                 source,
                 allowed_elements=allowed_elements,
                 allowed_attributes=allowed_attributes,
                 allowed_css_properties=allowed_css_properties,
                 allowed_css_keywords=allowed_css_keywords,
                 allowed_svg_properties=allowed_svg_properties,
                 allowed_protocols=allowed_protocols,
                 allowed_content_types=allowed_content_types,
                 attr_val_is_uri=attr_val_is_uri,
                 svg_attr_val_allows_ref=svg_attr_val_allows_ref,
                 svg_allow_local_href=svg_allow_local_href):
        """Creates a Filter

        :arg source: the token source handed to ``base.Filter``

        :arg allowed_elements: set of elements to allow--everything else will
            be escaped

        :arg allowed_attributes: set of attributes to allow in
            elements--everything else will be stripped

        :arg allowed_css_properties: set of CSS properties to allow--everything
            else will be stripped

        :arg allowed_css_keywords: set of CSS keywords to allow--everything
            else will be stripped

        :arg allowed_svg_properties: set of SVG properties to allow--everything
            else will be removed

        :arg allowed_protocols: set of allowed protocols for URIs

        :arg allowed_content_types: set of allowed content types for ``data`` URIs.

        :arg attr_val_is_uri: set of attributes that have URI values--values
            that have a scheme not listed in ``allowed_protocols`` are removed

        :arg svg_attr_val_allows_ref: set of SVG attributes that can have
            references

        :arg svg_allow_local_href: set of SVG elements whose ``xlink:href``
            may only be a local (``#fragment``) reference--any other value is
            removed. Entries are ``(None, name)`` pairs; bare element names
            are accepted as well.

        """
        super(Filter, self).__init__(source)

        # Emitted on every construction, in addition to the import-time warning.
        warnings.warn(_deprecation_msg, DeprecationWarning)

        self.allowed_elements = allowed_elements
        self.allowed_attributes = allowed_attributes
        self.allowed_css_properties = allowed_css_properties
        self.allowed_css_keywords = allowed_css_keywords
        self.allowed_svg_properties = allowed_svg_properties
        self.allowed_protocols = allowed_protocols
        self.allowed_content_types = allowed_content_types
        self.attr_val_is_uri = attr_val_is_uri
        self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
        self.svg_allow_local_href = svg_allow_local_href

    def __iter__(self):
        """Yield each sanitized token, skipping tokens that sanitize to a falsy value."""
        for token in base.Filter.__iter__(self):
            token = self.sanitize_token(token)
            if token:
                yield token

    def sanitize_token(self, token):
        """Sanitize a single token.

        Tag tokens whose element is in ``allowed_elements`` have their
        attributes cleaned by :meth:`allowed_token`; all other tag tokens are
        turned into text by :meth:`disallowed_token`. Comment tokens are
        dropped (``None`` is returned) and every other token type is returned
        unchanged.

        For example, a ``<script>`` element is not allowed by default, so its
        tags are re-emitted as literal character data instead of markup, and
        ``<a href="javascript: sucker();">`` loses its ``href`` because the
        scheme is not in ``allowed_protocols``.

        :arg token: the token dict to sanitize

        :returns: the sanitized token, or ``None`` for comments

        """
        # accommodate filters which use token_type differently
        token_type = token["type"]
        if token_type in ("StartTag", "EndTag", "EmptyTag"):
            name = token["name"]
            namespace = token["namespace"]
            # Elements without a namespace are looked up as HTML elements.
            if ((namespace, name) in self.allowed_elements or
                (namespace is None and
                 (namespaces["html"], name) in self.allowed_elements)):
                return self.allowed_token(token)
            else:
                return self.disallowed_token(token)
        elif token_type == "Comment":
            # Comments are dropped: falling through returns None.
            pass
        else:
            return token

    def allowed_token(self, token):
        """Strip unsafe attributes from the token of an allowed element.

        The token is modified in place and returned. Tokens without a
        ``"data"`` entry are returned untouched.

        :arg token: tag token whose element is in ``allowed_elements``

        :returns: the same token with sanitized attributes

        """
        if "data" in token:
            attrs = token["data"]
            attr_names = set(attrs.keys())

            # Remove forbidden attributes
            for to_remove in (attr_names - self.allowed_attributes):
                del token["data"][to_remove]
                attr_names.remove(to_remove)

            # Remove attributes with disallowed URL values
            for attr in (attr_names & self.attr_val_is_uri):
                assert attr in attrs
                # I don't have a clue where this regexp comes from or why it matches those
                # characters, nor why we call unescape. I just know it's always been here.
                # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
                # this will do is remove *more* than it otherwise would.
                val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
                                       unescape(attrs[attr])).lower()
                # remove replacement characters from unescaped characters
                val_unescaped = val_unescaped.replace("\ufffd", "")
                try:
                    uri = urlparse.urlparse(val_unescaped)
                except ValueError:
                    # Unparseable values are dropped outright.
                    uri = None
                    del attrs[attr]
                if uri and uri.scheme:
                    if uri.scheme not in self.allowed_protocols:
                        del attrs[attr]
                    # ``elif``: an attribute already dropped for its scheme
                    # must not be deleted a second time, which raised KeyError
                    # when ``data`` was not an allowed protocol and the
                    # content-type check below failed as well.
                    elif uri.scheme == 'data':
                        match = data_content_type.match(uri.path)
                        if not match:
                            del attrs[attr]
                        elif match.group('content_type') not in self.allowed_content_types:
                            del attrs[attr]

            # Blank out url(...) references that do not start with '#'.
            for attr in self.svg_attr_val_allows_ref:
                if attr in attrs:
                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                         ' ',
                                         unescape(attrs[attr]))

            # Drop xlink:href values that are not local (#fragment) references
            # on the listed SVG elements. The default set stores
            # (None, name) pairs, so a bare-name membership test alone never
            # matched; both spellings are checked so custom sets of plain
            # names keep working.
            element_name = token["name"]
            xlink_href = (namespaces['xlink'], 'href')
            if ((element_name in self.svg_allow_local_href or
                 (None, element_name) in self.svg_allow_local_href) and
                    xlink_href in attrs and
                    re.search(r'^\s*[^#\s].*', attrs[xlink_href])):
                del attrs[xlink_href]

            if (None, 'style') in attrs:
                attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
            token["data"] = attrs
        return token

    def disallowed_token(self, token):
        """Convert the tag token of a disallowed element into character data.

        The token is rewritten in place into a ``"Characters"`` token whose
        data is the literal markup of the tag (attribute values passed
        through ``escape``), and its ``"name"`` entry is removed.

        :arg token: tag token whose element is not in ``allowed_elements``

        :returns: the same token, now of type ``"Characters"``

        """
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]
        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
        else:
            token["data"] = "<%s>" % token["name"]
        if token.get("selfClosing"):
            # Replace the trailing ">" with "/>".
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token

    def sanitize_css(self, style):
        """Return ``style`` reduced to the allowed CSS declarations.

        ``url(...)`` values are removed first. If the remaining text fails
        either of the two "gauntlet" patterns, the whole style is rejected and
        ``''`` is returned. Otherwise each ``property: value`` pair is kept
        when the property is in ``allowed_css_properties`` or
        ``allowed_svg_properties``, or when it belongs to the
        background/border/margin/padding families and every word of its value
        is an allowed keyword or matches the colour/length pattern.

        :arg style: the raw value of a ``style`` attribute

        :returns: the kept declarations joined by single spaces

        """
        # disallow urls
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet
        if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
                                                'padding']:
                for keyword in value.split():
                    if keyword not in self.allowed_css_keywords and \
                            not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):  # noqa
                        break
                else:
                    # Reached only when no word of the value was rejected.
                    clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)