1"""String filters.
2
3Contains a collection of useful string manipulation filters for use in Jinja
4templates.
5"""
6
7# Copyright (c) IPython Development Team.
8# Distributed under the terms of the Modified BSD License.
9
10import base64
11import os
12import re
13import textwrap
14import warnings
15from urllib.parse import quote
16from xml.etree.ElementTree import Element
17
18import bleach
19
20# defusedxml does safe(r) parsing of untrusted XML data
21from defusedxml import ElementTree # type:ignore[import-untyped]
22
23from nbconvert.preprocessors.sanitize import _get_default_css_sanitizer
24
# Public Jinja filter API exported by this module (consumed via
# ``from nbconvert.filters.strings import *`` and registered as template filters).
__all__ = [
    "wrap_text",
    "html2text",
    "clean_html",
    "add_anchor",
    "strip_dollars",
    "strip_files_prefix",
    "comment_lines",
    "get_lines",
    "ipython2python",
    "posix_path",
    "path2url",
    "add_prompts",
    "ascii_only",
    "prevent_list_blocks",
    "strip_trailing_newline",
    "text_base64",
]
43
44
def wrap_text(text, width=100):
    """
    Intelligently wrap text.
    Wrap text without breaking words if possible.

    Parameters
    ----------
    text : str
        Text to wrap.
    width : int, optional
        Number of characters to wrap to, default 100.
    """
    # Wrap each original line independently so existing line breaks survive.
    return "\n".join(
        "\n".join(textwrap.wrap(line, width)) for line in text.split("\n")
    )
62
63
def html2text(element):
    """extract inner text from html

    Analog of jQuery's $(element).text()
    """
    if isinstance(element, str):
        try:
            element = ElementTree.fromstring(element)
        except Exception:
            # failed to parse, just return it unmodified
            return element

    # Collect this element's leading text, each child's full text
    # (recursively), and finally any text trailing this element.
    pieces = [element.text or ""]
    pieces.extend(html2text(child) for child in element)
    pieces.append(element.tail or "")
    return "".join(pieces)
81
82
def clean_html(element):
    """Clean an html element.

    Sanitizes the fragment with bleach, allowing a small set of extra
    tags beyond bleach's defaults plus ``class``/``id`` attributes on
    any element. CSS sanitization is applied when available.
    """
    if isinstance(element, bytes):
        element = element.decode()
    else:
        element = str(element)

    extra_kwargs = {}
    # Only pass a css_sanitizer when one could be constructed
    # (the helper may return None when tinycss2 is unavailable).
    sanitizer = _get_default_css_sanitizer()
    if sanitizer:
        extra_kwargs["css_sanitizer"] = sanitizer

    allowed_tags = [*bleach.ALLOWED_TAGS, "div", "pre", "code", "span", "table", "tr", "td"]
    allowed_attributes = {
        **bleach.ALLOWED_ATTRIBUTES,
        "*": ["class", "id"],
    }
    return bleach.clean(element, tags=allowed_tags, attributes=allowed_attributes, **extra_kwargs)
99
100
101def _convert_header_id(header_contents):
102 """Convert header contents to valid id value. Takes string as input, returns string.
103
104 Note: this may be subject to change in the case of changes to how we wish to generate ids.
105
106 For use on markdown headings.
107 """
108 # Valid IDs need to be non-empty and contain no space characters, but are otherwise arbitrary.
109 # However, these IDs are also used in URL fragments, which are more restrictive, so we URL
110 # encode any characters that are not valid in URL fragments.
111 return quote(header_contents.replace(" ", "-"), safe="?/:@!$&'()*+,;=")
112
113
def add_anchor(html, anchor_link_text="¶"):
    """Add an id and an anchor-link to an html header

    For use on markdown headings
    """
    try:
        heading = ElementTree.fromstring(html)
    except Exception:
        # failed to parse, just return it unmodified
        return html

    # Derive the fragment id from the heading's visible text.
    anchor_id = _convert_header_id(html2text(heading))
    heading.set("id", anchor_id)

    anchor = Element("a", {"class": "anchor-link", "href": "#" + anchor_id})
    try:
        # Test if the anchor link text is HTML (e.g. an image)
        anchor.append(ElementTree.fromstring(anchor_link_text))
    except Exception:
        # If we fail to parse, assume we've just got regular text
        anchor.text = anchor_link_text
    heading.append(anchor)

    return ElementTree.tostring(heading).decode(encoding="utf-8")
136
137
def add_prompts(code, first=">>> ", cont="... "):
    """Add prompts to code snippets

    The first line gets *first* prepended; every following line gets *cont*.
    """
    lines = code.split("\n")
    prompted = [first + lines[0]]
    prompted.extend(cont + line for line in lines[1:])
    return "\n".join(prompted)
146
147
def strip_dollars(text):
    """Remove leading and trailing dollar symbols from text.

    Parameters
    ----------
    text : str
        Text to remove dollars from
    """
    # str.strip with a char set removes any run of '$' at either end only.
    return text.strip("$")
159
160
# src/href attributes pointing into the legacy notebook ``files/`` prefix.
files_url_pattern = re.compile(r'(src|href)\=([\'"]?)/?files/')
# Markdown links/images targeting the same ``files/`` prefix.
markdown_url_pattern = re.compile(r"(!?)\[(?P<caption>.*?)\]\(/?files/(?P<location>.*?)\)")


def strip_files_prefix(text):
    """
    Fix all fake URLs that start with ``files/``, stripping out the ``files/`` prefix.
    Applies to both urls (for html) and relative paths (for markdown paths).

    Parameters
    ----------
    text : str
        Text in which to replace 'src="files/real...' with 'src="real...'
    """
    without_html_refs = files_url_pattern.sub(r"\1=\2", text)
    return markdown_url_pattern.sub(r"\1[\2](\3)", without_html_refs)
178
179
def comment_lines(text, prefix="# "):
    """
    Build a Python comment line from input text.

    Parameters
    ----------
    text : str
        Text to comment out.
    prefix : str
        Character to append to the start of each line.
    """
    # Prefix every line, including the first, then rejoin.
    return "\n".join(prefix + line for line in text.split("\n"))
196
197
def get_lines(text, start=None, end=None):
    """
    Split the input text into separate lines and then return the
    lines that the caller is interested in.

    Parameters
    ----------
    text : str
        Text to parse lines from.
    start : int, optional
        First line to grab from.
    end : int, optional
        Last line to grab from.
    """
    # Slice semantics handle None bounds and out-of-range values gracefully.
    return "\n".join(text.split("\n")[start:end])
218
219
def ipython2python(code):
    """Transform IPython syntax to pure Python syntax

    Parameters
    ----------
    code : str
        IPython code, to be transformed to pure Python
    """
    try:
        from IPython.core.inputtransformer2 import TransformerManager
    except ImportError:
        # Best effort: without IPython we cannot transform, so warn and
        # hand the code back untouched.
        warnings.warn(
            "IPython is needed to transform IPython syntax to pure Python."
            " Install ipython if you need this functionality.",
            stacklevel=2,
        )
        return code

    manager = TransformerManager()
    return manager.transform_cell(code)
240
241
def posix_path(path):
    """Turn a path into posix-style path/to/etc

    Mainly for use in latex on Windows,
    where native Windows paths are not allowed.
    """
    sep = os.path.sep
    if sep == "/":
        # Already a posix platform; nothing to rewrite.
        return path
    return path.replace(sep, "/")
251
252
def path2url(path):
    """Turn a file path into a URL"""
    # Percent-encode each native path component, then join with '/'.
    return "/".join(map(quote, path.split(os.path.sep)))
257
258
def ascii_only(s):
    """ensure a string is ascii

    Non-ASCII characters are replaced with '?' (codec 'replace' behavior).
    """
    replaced = s.encode("ascii", errors="replace")
    return replaced.decode("ascii")
262
263
def prevent_list_blocks(s):
    """
    Prevent presence of enumerate or itemize blocks in latex headings cells

    Escapes a leading '.', '-', '+', or '*' (after optional whitespace /
    digits) so LaTeX does not interpret the heading as a list item.
    """
    out = s
    for pattern, replacement in (
        (r"(^\s*\d*)\.", r"\1\."),
        (r"(^\s*)\-", r"\1\-"),
        (r"(^\s*)\+", r"\1\+"),
        (r"(^\s*)\*", r"\1\*"),
    ):
        out = re.sub(pattern, replacement, out)
    return out
273
274
def strip_trailing_newline(text):
    """
    Strips a newline from the end of text.

    Only a single trailing '\\n' is removed; other text is untouched.
    """
    return text[:-1] if text.endswith("\n") else text
282
283
def text_base64(text):
    """
    Encode base64 text

    The input string is UTF-8 encoded before base64 encoding; the result
    is returned as a str.
    """
    encoded_bytes = base64.b64encode(text.encode())
    return encoded_bytes.decode()