1"""String filters.
2
3Contains a collection of useful string manipulation filters for use in Jinja
4templates.
5"""
6
7# Copyright (c) IPython Development Team.
8# Distributed under the terms of the Modified BSD License.
9
10import base64
11import os
12import re
13import textwrap
14import warnings
15from urllib.parse import quote
16from xml.etree.ElementTree import Element
17
18import bleach
19
20# defusedxml does safe(r) parsing of untrusted XML data
21from defusedxml import ElementTree # type:ignore[import-untyped]
22
23from nbconvert.preprocessors.sanitize import _get_default_css_sanitizer
24
# Public Jinja filter API exported by this module (consumed via
# ``from nbconvert.filters.strings import *`` and registered as template filters).
__all__ = [
    "wrap_text",
    "html2text",
    "clean_html",
    "add_anchor",
    "strip_dollars",
    "strip_files_prefix",
    "comment_lines",
    "get_lines",
    "ipython2python",
    "posix_path",
    "path2url",
    "add_prompts",
    "ascii_only",
    "prevent_list_blocks",
    "strip_trailing_newline",
    "text_base64",
]
43
44
def wrap_text(text, width=100):
    """
    Intelligently wrap text.
    Wrap text without breaking words if possible.

    Parameters
    ----------
    text : str
        Text to wrap.
    width : int, optional
        Number of characters to wrap to, default 100.
    """
    # Wrap each original line independently so existing line breaks survive.
    return "\n".join(
        "\n".join(textwrap.wrap(line, width)) for line in text.split("\n")
    )
62
63
def html2text(element):
    """extract inner text from html

    Analog of jQuery's $(element).text()
    """
    if isinstance(element, str):
        try:
            element = ElementTree.fromstring(element)
        except Exception:
            # failed to parse, just return it unmodified
            return element

    # Collect this element's leading text, each child's full text
    # (recursively), and finally any text trailing this element.
    pieces = [element.text or ""]
    pieces.extend(html2text(child) for child in element)
    pieces.append(element.tail or "")
    return "".join(pieces)
81
82
def clean_html(element):
    """Clean an html element.

    Sanitizes the fragment with bleach, allowing a small set of extra
    tags beyond bleach's defaults plus ``class``/``id`` attributes on
    any element. CSS sanitization is applied when available.
    """
    if isinstance(element, bytes):
        element = element.decode()
    else:
        element = str(element)

    extra_kwargs = {}
    # Only pass a css_sanitizer when one could be constructed
    # (the helper may return None when tinycss2 is unavailable).
    sanitizer = _get_default_css_sanitizer()
    if sanitizer:
        extra_kwargs["css_sanitizer"] = sanitizer

    allowed_tags = [*bleach.ALLOWED_TAGS, "div", "pre", "code", "span", "table", "tr", "td"]
    allowed_attributes = {
        **bleach.ALLOWED_ATTRIBUTES,
        "*": ["class", "id"],
    }
    return bleach.clean(element, tags=allowed_tags, attributes=allowed_attributes, **extra_kwargs)
99
100
101def _convert_header_id(header_contents):
102 """Convert header contents to valid id value. Takes string as input, returns string.
103
104 Note: this may be subject to change in the case of changes to how we wish to generate ids.
105
106 For use on markdown headings.
107 """
108 # Valid IDs need to be non-empty and contain no space characters, but are otherwise arbitrary.
109 # However, these IDs are also used in URL fragments, which are more restrictive, so we URL
110 # encode any characters that are not valid in URL fragments.
111 return quote(header_contents.replace(" ", "-"), safe="?/:@!$&'()*+,;=")
112
113
def add_anchor(html, anchor_link_text="¶"):
    """Add an id and an anchor-link to an html header

    For use on markdown headings
    """
    try:
        heading = ElementTree.fromstring(html)
    except Exception:
        # failed to parse, just return it unmodified
        return html

    # Derive the fragment id from the heading's visible text.
    anchor_id = _convert_header_id(html2text(heading))
    heading.set("id", anchor_id)

    anchor = Element("a", {"class": "anchor-link", "href": "#" + anchor_id})
    try:
        # Test if the anchor link text is HTML (e.g. an image)
        anchor.append(ElementTree.fromstring(anchor_link_text))
    except Exception:
        # If we fail to parse, assume we've just got regular text
        anchor.text = anchor_link_text
    heading.append(anchor)

    return ElementTree.tostring(heading).decode(encoding="utf-8")
136
137
def add_prompts(code, first=">>> ", cont="... "):
    """Add prompts to code snippets

    The first line gets *first* prepended; every following line gets *cont*.
    """
    lines = code.split("\n")
    prompted = [first + lines[0]]
    prompted.extend(cont + line for line in lines[1:])
    return "\n".join(prompted)
146
147
def strip_dollars(text):
    """Remove leading and trailing dollar symbols from text.

    Parameters
    ----------
    text : str
        Text to remove dollars from
    """
    # str.strip with a char set removes any run of '$' at either end only.
    return text.strip("$")
159
160
# src/href attributes pointing into the legacy notebook ``files/`` prefix.
files_url_pattern = re.compile(r'(src|href)\=([\'"]?)/?files/')
# Markdown links/images targeting the same ``files/`` prefix.
markdown_url_pattern = re.compile(r"(!?)\[(?P<caption>.*?)\]\(/?files/(?P<location>.*?)\)")


def strip_files_prefix(text):
    """
    Fix all fake URLs that start with ``files/``, stripping out the ``files/`` prefix.
    Applies to both urls (for html) and relative paths (for markdown paths).

    Parameters
    ----------
    text : str
        Text in which to replace 'src="files/real...' with 'src="real...'
    """
    without_html_refs = files_url_pattern.sub(r"\1=\2", text)
    return markdown_url_pattern.sub(r"\1[\2](\3)", without_html_refs)
178
179
def comment_lines(text, prefix="# "):
    """
    Build a Python comment line from input text.

    Parameters
    ----------
    text : str
        Text to comment out.
    prefix : str
        Character to append to the start of each line.
    """
    # Prefix every line, including the first, then rejoin.
    return "\n".join(prefix + line for line in text.split("\n"))
196
197
def get_lines(text, start=None, end=None):
    """
    Split the input text into separate lines and then return the
    lines that the caller is interested in.

    Parameters
    ----------
    text : str
        Text to parse lines from.
    start : int, optional
        First line to grab from.
    end : int, optional
        Last line to grab from.
    """
    # Slice semantics handle None bounds and out-of-range values gracefully.
    return "\n".join(text.split("\n")[start:end])
218
219
def ipython2python(code):
    """Transform IPython syntax to pure Python syntax

    Parameters
    ----------
    code : str
        IPython code, to be transformed to pure Python
    """
    try:
        from IPython.core.inputtransformer2 import TransformerManager
    except ImportError:
        # Best effort: without IPython we cannot transform, so warn and
        # hand the code back untouched.
        warnings.warn(
            "IPython is needed to transform IPython syntax to pure Python."
            " Install ipython if you need this functionality.",
            stacklevel=2,
        )
        return code

    manager = TransformerManager()
    return manager.transform_cell(code)
240
241
def posix_path(path):
    """Turn a path into posix-style path/to/etc

    Mainly for use in latex on Windows,
    where native Windows paths are not allowed.
    """
    sep = os.path.sep
    if sep == "/":
        # Already a posix platform; nothing to rewrite.
        return path
    return path.replace(sep, "/")
251
252
def path2url(path):
    """Turn a file path into a URL"""
    # Percent-encode each native path component, then join with '/'.
    return "/".join(map(quote, path.split(os.path.sep)))
257
258
def ascii_only(s):
    """ensure a string is ascii

    Non-ASCII characters are replaced with '?' (codec 'replace' behavior).
    """
    replaced = s.encode("ascii", errors="replace")
    return replaced.decode("ascii")
262
263
def prevent_list_blocks(s):
    """
    Prevent presence of enumerate or itemize blocks in latex headings cells

    Escapes a leading '.', '-', '+', or '*' (after optional whitespace /
    digits) so LaTeX does not interpret the heading as a list item.
    """
    out = s
    for pattern, replacement in (
        (r"(^\s*\d*)\.", r"\1\."),
        (r"(^\s*)\-", r"\1\-"),
        (r"(^\s*)\+", r"\1\+"),
        (r"(^\s*)\*", r"\1\*"),
    ):
        out = re.sub(pattern, replacement, out)
    return out
273
274
def strip_trailing_newline(text):
    """
    Strips a newline from the end of text.

    Only a single trailing '\\n' is removed; other text is untouched.
    """
    return text[:-1] if text.endswith("\n") else text
282
283
def text_base64(text):
    """
    Encode base64 text

    The input string is UTF-8 encoded before base64 encoding; the result
    is returned as a str.
    """
    encoded_bytes = base64.b64encode(text.encode())
    return encoded_bytes.decode()