1"""
2NBConvert Preprocessor for sanitizing HTML rendering of notebooks.
3"""
4
5import warnings
6
7from bleach import ALLOWED_ATTRIBUTES, ALLOWED_TAGS, clean
8from traitlets import Any, Bool, List, Set, Unicode
9
10from .base import Preprocessor
11
# Feature flags recording which bleach CSS-sanitizing API could be imported.
# The probing below sets at most one of them to True; both stay False when
# no CSS sanitization support is available.
_USE_BLEACH_CSS_SANITIZER = False
_USE_BLEACH_STYLES = False


try:
    # bleach[css] >=5.0
    from bleach.css_sanitizer import ALLOWED_CSS_PROPERTIES as ALLOWED_STYLES
    from bleach.css_sanitizer import CSSSanitizer

    _USE_BLEACH_CSS_SANITIZER = True
    _USE_BLEACH_STYLES = False
except ImportError:
    try:
        # bleach <5
        from bleach import ALLOWED_STYLES  # type:ignore[attr-defined, no-redef]

        _USE_BLEACH_CSS_SANITIZER = False
        _USE_BLEACH_STYLES = True
        warnings.warn(
            "Support for bleach <5 will be removed in a future version of nbconvert",
            DeprecationWarning,
            stacklevel=2,
        )

    except ImportError:
        # Neither API is importable. ALLOWED_STYLES is still referenced as a
        # trait default further down in this module, so it must be bound to
        # something; an empty allow-list keeps the module importable without
        # permitting any CSS property.
        ALLOWED_STYLES = []  # type:ignore[assignment, misc]
        warnings.warn(
            "The installed bleach/tinycss2 do not provide CSS sanitization, "
            "please upgrade to bleach >=5",
            UserWarning,
            stacklevel=2,
        )
43
44
45__all__ = ["SanitizeHTML"]
46
47
class SanitizeHTML(Preprocessor):
    """A preprocessor to sanitize html.

    Raw and markdown cell sources, as well as selected code-cell output
    mimetypes, are passed through ``bleach.clean`` using the configurable
    allow-lists declared below. Output mimetypes that are neither listed as
    safe nor as sanitizable are removed from the output.
    """

    # Bleach config.
    attributes = Any(
        config=True,
        default_value=ALLOWED_ATTRIBUTES,
        help="Allowed HTML tag attributes",
    )
    tags = List(
        Unicode(),
        config=True,
        default_value=ALLOWED_TAGS,  # type:ignore[arg-type]
        help="List of HTML tags to allow",
    )
    styles = List(
        Unicode(),
        config=True,
        default_value=ALLOWED_STYLES,  # type:ignore[arg-type]
        help="Allowed CSS styles if <style> tag is allowed",
    )
    strip = Bool(
        config=True,
        default_value=False,
        help="If True, remove unsafe markup entirely instead of escaping",
    )
    strip_comments = Bool(
        config=True,
        default_value=True,
        help="If True, strip comments from escaped HTML",
    )

    # Display data config.
    safe_output_keys = Set(
        config=True,
        default_value={
            "metadata",  # Not a mimetype per-se, but expected and safe.
            "text/plain",
            "text/latex",
            "application/json",
            "image/png",
            "image/jpeg",
        },
        help="Cell output mimetypes to render without modification",
    )
    sanitized_output_types = Set(
        config=True,
        default_value={
            "text/html",
            "text/markdown",
        },
        help="Cell output types to display after escaping with Bleach.",
    )

    def preprocess_cell(self, cell, resources, cell_index):
        """
        Sanitize potentially-dangerous contents of the cell.

        Cell Types:
          raw:
            Sanitize literal HTML
          markdown:
            Sanitize literal HTML
          code:
            Sanitize outputs that could result in code execution

        Cells of any other type are passed through untouched.

        Returns
        -------
        tuple
            The ``(cell, resources)`` pair; ``cell`` is modified in place.
        """
        if cell.cell_type in ("raw", "markdown"):
            # Raw cells are sanitized unconditionally: only the ones with the
            # text/html mimetype should be emitted, but this errs on the side
            # of safety.
            cell.source = self.sanitize_html_tags(cell.source)
        elif cell.cell_type == "code":
            cell.outputs = self.sanitize_code_outputs(cell.outputs)
        # Always hand back a pair (never None) so that a caller unpacking the
        # result keeps working for cell types this preprocessor does not know.
        return cell, resources

    def sanitize_code_outputs(self, outputs):
        """
        Sanitize code cell outputs.

        Outputs of type 'stream' and 'error' are left as they are. For every
        other output, each key of its ``data`` mapping is handled as follows:

        * keys in `safe_output_keys` are kept unmodified,
        * keys in `sanitized_output_types` (e.g. 'text/html') are run through
          `sanitize_html_tags`,
        * all remaining keys (e.g. 'text/javascript') are removed.

        The outputs are modified in place and the same list is returned.
        """
        for output in outputs:
            # Stream and error outputs are not filtered by mimetype; only
            # outputs exposing a `data` mapping are processed below.
            if output["output_type"] in ("stream", "error"):
                continue
            data = output.data
            to_remove = []
            for key in data:
                if key in self.safe_output_keys:
                    continue
                if key in self.sanitized_output_types:
                    self.log.info("Sanitizing %s", key)
                    # Re-assigning an existing key does not resize the dict,
                    # so this is safe while iterating over it.
                    data[key] = self.sanitize_html_tags(data[key])
                else:
                    # Mark key for removal. (Python doesn't allow deletion of
                    # keys from a dict during iteration)
                    to_remove.append(key)
            for key in to_remove:
                self.log.info("Removing %s", key)
                del data[key]
        return outputs

    def sanitize_html_tags(self, html_str):
        """
        Sanitize a string containing raw HTML tags.

        The string is cleaned with ``bleach.clean`` using the configured
        `tags`, `attributes`, `strip` and `strip_comments` values. The
        `styles` allow-list is forwarded through whichever CSS API the
        installed bleach provides; if none is available it is not passed
        at all.
        """
        kwargs = {
            "tags": self.tags,
            "attributes": self.attributes,
            "strip": self.strip,
            "strip_comments": self.strip_comments,
        }

        if _USE_BLEACH_CSS_SANITIZER:
            # bleach >=5: CSS filtering is delegated to a CSSSanitizer object.
            css_sanitizer = CSSSanitizer(allowed_css_properties=self.styles)
            kwargs.update(css_sanitizer=css_sanitizer)
        elif _USE_BLEACH_STYLES:
            # bleach <5: the allow-list is passed directly as `styles`.
            kwargs.update(styles=self.styles)

        return clean(html_str, **kwargs)
174
175
def _get_default_css_sanitizer():
    """Build a CSSSanitizer for the default allowed styles, if supported.

    Returns ``None`` when the bleach CSS sanitizer API is not in use.
    """
    if not _USE_BLEACH_CSS_SANITIZER:
        return None
    return CSSSanitizer(allowed_css_properties=ALLOWED_STYLES)