Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/nbconvert/filters/markdown_mistune.py: 73%

153 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-03 06:10 +0000

1"""Markdown filters with mistune 

2 

3Used from markdown.py 

4""" 

5# Copyright (c) IPython Development Team. 

6# Distributed under the terms of the Modified BSD License. 

7 

8 

9import base64 

10import mimetypes 

11import os 

12import re 

13from functools import partial 

14from html import escape 

15 

16import bs4 

17from mistune import PLUGINS, BlockParser, HTMLRenderer, InlineParser, Markdown # type:ignore 

18from pygments import highlight 

19from pygments.formatters import HtmlFormatter 

20from pygments.lexers import get_lexer_by_name 

21from pygments.util import ClassNotFound 

22 

23from nbconvert.filters.strings import add_anchor 

24 

25html_escape = partial(escape, quote=False) 

26 

27 

class InvalidNotebook(Exception):  # noqa
    """Error raised when a notebook model is invalid."""

32 

33 

class MathBlockParser(BlockParser):
    """Block parser that passes math sections through to the inline parser.

    A dedicated ``multiline_math`` rule is registered ahead of the stock
    block rules so that those rules do not split a math section apart.
    """

    # Matches ``$$...$$``, ``\\[...\\]`` and ``\begin{env}...\end{env}``;
    # DOTALL lets one section span several lines.
    MULTILINE_MATH = re.compile(
        r"(?<!\\)[$]{2}.*?(?<!\\)[$]{2}|"
        r"\\\\\[.*?\\\\\]|"
        r"\\begin\{([a-z]*\*?)\}.*?\\end\{\1\}",
        flags=re.DOTALL,
    )

    RULE_NAMES = ("multiline_math",) + tuple(BlockParser.RULE_NAMES)

    # Heading pattern that does not require a space after the leading '#'.
    AXT_HEADING = re.compile(r" {0,3}(#{1,6})(?!#+)(?: *\n+|([^\n]*?)(?:\n+|\s+?#+\s*\n+))")

    def parse_multiline_math(self, m, state):
        """Wrap the whole matched math section in a ``multiline_math`` token.

        :param m: regex match produced by ``MULTILINE_MATH``.
        :param state: parser state (unused here).
        :return: a token dict carrying the full matched text, delimiters included.
        """
        token = {"type": "multiline_math", "text": m.group(0)}
        return token

54 

55 

56def _dotall(pattern): 

57 """Make the '.' special character match any character inside the pattern, including a newline. 

58 

59 This is implemented with the inline flag `(?s:...)` and is equivalent to using `re.DOTALL` when 

60 it is the only pattern used. It is necessary since `mistune>=2.0.0`, where the pattern is passed 

61 to the undocumented `re.Scanner`. 

62 """ 

63 return f"(?s:{pattern})" 

64 

65 

class MathInlineParser(InlineParser):
    r"""Inline parser that recognises LaTeX-style math.

    The supported forms are ``$$...$$``, ``\\[...\\]``, ``\\(...\\)``,
    ``$...$`` and ``\begin{foo}...\end{foo}``. The delimiters are stripped
    from every form; for environments the environment name (``foo`` in the
    example) is extracted as well.
    """

    BLOCK_MATH_TEX = _dotall(r"(?<!\\)\$\$(.*?)(?<!\\)\$\$")
    BLOCK_MATH_LATEX = _dotall(r"(?<!\\)\\\\\[(.*?)(?<!\\)\\\\\]")
    INLINE_MATH_TEX = _dotall(r"(?<![$\\])\$(.+?)(?<![$\\])\$")
    INLINE_MATH_LATEX = _dotall(r"(?<!\\)\\\\\((.*?)(?<!\\)\\\\\)")
    LATEX_ENVIRONMENT = _dotall(r"\\begin\{([a-z]*\*?)\}(.*?)\\end\{\1\}")

    # Order matters: the math rules come first, with the block forms listed
    # ahead of the inline ones, followed by the stock inline rules.
    RULE_NAMES = (
        "block_math_tex",
        "block_math_latex",
        "inline_math_tex",
        "inline_math_latex",
        "latex_environment",
    ) + tuple(InlineParser.RULE_NAMES)

    def parse_block_math_tex(self, m, state):
        """Return a ``block_math`` token for a ``$$...$$`` match."""
        # The scanner sometimes keeps the closing '$$' in the captured group,
        # so slice both markers off the full match instead of using group(1).
        whole = m.group(0)
        return "block_math", whole[2:-2]

    def parse_block_math_latex(self, m, state):
        """Return a ``block_math`` token for a ``\\\\[...\\\\]`` match."""
        return "block_math", m.group(1)

    def parse_inline_math_tex(self, m, state):
        """Return an ``inline_math`` token for a ``$...$`` match."""
        return "inline_math", m.group(1)

    def parse_inline_math_latex(self, m, state):
        """Return an ``inline_math`` token for a ``\\\\(...\\\\)`` match."""
        return "inline_math", m.group(1)

    def parse_latex_environment(self, m, state):
        """Return a ``latex_environment`` token with the environment name and body."""
        env_name = m.group(1)
        body = m.group(2)
        return "latex_environment", env_name, body

116 

117 

class MarkdownWithMath(Markdown):
    """Markdown parser with the math-aware block and inline parsers enabled."""

    def __init__(self, renderer, block=None, inline=None, plugins=None):
        """Initialize the parser.

        :param renderer: renderer handed to the inline parser and the base class.
        :param block: block parser; a ``MathBlockParser`` is created when ``None``.
        :param inline: inline parser; a ``MathInlineParser`` with
            ``hard_wrap=False`` is created when ``None``.
        :param plugins: iterable of plugin objects and/or plugin names; names
            are looked up in mistune's ``PLUGINS`` mapping. ``None`` selects
            the default set below.
        """
        block = MathBlockParser() if block is None else block
        inline = MathInlineParser(renderer, hard_wrap=False) if inline is None else inline
        if plugins is None:
            # "abbr" and "footnotes" are deliberately left out of the defaults.
            plugins = ["strikethrough", "table", "url", "task_lists", "def_list"]

        # Resolve plugin names to plugin objects; pass anything else through.
        resolved = [PLUGINS[entry] if isinstance(entry, str) else entry for entry in plugins]
        super().__init__(renderer, block, inline, resolved)

    def render(self, s):
        """Parse *s* and return the result (compatibility shim for ``mistune==0.8.4``)."""
        return self.parse(s)

149 

150 

class IPythonRenderer(HTMLRenderer):
    """An IPython HTML renderer.

    Extends mistune's ``HTMLRenderer`` with pygments highlighting of code
    blocks, pass-through rendering of math, anchor links on headings,
    notebook attachments, and optional embedding of local images as base64
    data URLs.
    """

    def __init__(  # noqa
        self,
        escape=True,
        allow_harmful_protocols=True,
        embed_images=False,
        exclude_anchor_links=False,
        anchor_link_text="¶",
        path="",
        attachments=None,
    ):
        """Initialize the renderer.

        :param escape: forwarded to ``HTMLRenderer``.
        :param allow_harmful_protocols: forwarded to ``HTMLRenderer``.
        :param embed_images: when true, image sources that resolve to local
            files are replaced by base64 data URLs.
        :param exclude_anchor_links: when true, headings get no anchor link.
        :param anchor_link_text: text handed to ``add_anchor`` for headings.
        :param path: directory that image sources are joined onto.
        :param attachments: mapping of attachment name to a mapping of
            mime type to base64 data; an empty dict is used when ``None``.
        """
        super().__init__(escape, allow_harmful_protocols)
        self.embed_images = embed_images
        self.exclude_anchor_links = exclude_anchor_links
        self.anchor_link_text = anchor_link_text
        self.path = path
        self.attachments = attachments if attachments is not None else {}

    def block_code(self, code, info=None):
        """Render a code block, highlighted with pygments when possible.

        :param code: the code text.
        :param info: the fence info string; its first word names the language.
        :return: highlighted HTML when a lexer exists for the language,
            otherwise the base renderer's plain code block.
        """
        # An info string made only of whitespace is truthy but contains no
        # words; treat it as "no language" instead of indexing an empty list.
        words = info.strip().split(None, 1) if info else []
        if not words:
            return super().block_code(code)

        lang = words[0]
        try:
            lexer = get_lexer_by_name(lang, stripall=True)
        except ClassNotFound:
            # Unknown language: keep its name as the first line of a plain block.
            return super().block_code(lang + "\n" + code)

        return highlight(code, lexer, HtmlFormatter())

    def block_html(self, html):
        """Handle block html, embedding images first when enabled."""
        if self.embed_images:
            html = self._html_embed_images(html)

        return super().block_html(html)

    def inline_html(self, html):
        """Handle inline html, embedding images first when enabled."""
        if self.embed_images:
            html = self._html_embed_images(html)

        return super().inline_html(html)

    def heading(self, text, level):
        """Handle a heading, adding an anchor link unless they are excluded."""
        html = super().heading(text, level)
        if self.exclude_anchor_links:
            return html
        return add_anchor(html, anchor_link_text=self.anchor_link_text)

    def escape_html(self, text):
        """Escape html content (quotes are left untouched)."""
        return html_escape(text)

    def multiline_math(self, text):
        """Handle multiline math: the text is returned unchanged."""
        return text

    def block_math(self, text):
        """Handle block math: escape the body and restore the ``$$`` delimiters."""
        return f"$${self.escape_html(text)}$$"

    def latex_environment(self, name, text):
        """Handle a latex environment: escape name and body, restore begin/end."""
        name, text = self.escape_html(name), self.escape_html(text)
        return f"\\begin{{{name}}}{text}\\end{{{name}}}"

    def inline_math(self, text):
        """Handle inline math: escape the body and restore the ``$`` delimiters."""
        return f"${self.escape_html(text)}$"

    def image(self, src, text, title):
        """Render an image with title and text.

        :param src: source link of the image.
        :param text: alt text of the image.
        :param title: title text of the image.
        :raises InvalidNotebook: if ``src`` names an attachment that is
            missing or that carries no data.
        """
        attachment_prefix = "attachment:"

        if src.startswith(attachment_prefix):
            name = src[len(attachment_prefix) :]

            if name not in self.attachments:
                msg = f"missing attachment: {name}"
                raise InvalidNotebook(msg)

            attachment = self.attachments[name]
            if not attachment:
                # Nothing to pick a mime type from; report it as a notebook
                # error rather than failing with a bare IndexError.
                msg = f"empty attachment: {name}"
                raise InvalidNotebook(msg)

            # we choose vector over raster, and lossless over lossy;
            # otherwise we choose the first mimetype we can find
            preferred_mime_types = ("image/svg+xml", "image/png", "image/jpeg")
            mime_type = next(
                (candidate for candidate in preferred_mime_types if candidate in attachment),
                None,
            )
            if mime_type is None:
                mime_type = next(iter(attachment))
            data = attachment[mime_type]
            src = "data:" + mime_type + ";base64," + data

        elif self.embed_images:
            base64_url = self._src_to_base64(src)

            if base64_url is not None:
                src = base64_url

        return super().image(src, text, title)

    def _src_to_base64(self, src):
        """Turn the source file into a base64 url.

        :param src: source link of the file.
        :return: the base64 url or None if ``src`` does not resolve to a
            regular file.
        """
        src_path = os.path.join(self.path, src)

        # isfile rather than exists: a directory (e.g. an empty ``src`` joined
        # onto ``self.path``) exists but cannot be opened for reading.
        if not os.path.isfile(src_path):
            return None

        # guess_type yields None for unknown extensions; use a generic type
        # so the URL never contains the literal text "None".
        mime_type = mimetypes.guess_type(src_path)[0] or "application/octet-stream"

        with open(src_path, "rb") as fobj:
            # b64encode output contains no newlines, so no stripping is needed.
            base64_str = base64.b64encode(fobj.read()).decode("ascii")

        return f"data:{mime_type};base64,{base64_str}"

    def _html_embed_images(self, html):
        """Replace the ``src`` of local ``<img>`` tags in *html* by base64 data URLs.

        :param html: HTML fragment to process.
        :return: the re-serialised HTML; images that cannot be resolved keep
            their original ``src``.
        """
        parsed_html = bs4.BeautifulSoup(html, features="html.parser")
        imgs = parsed_html.find_all("img")

        # Replace img tags's sources by base64 dataurls
        for img in imgs:
            if "src" not in img.attrs:
                continue

            base64_url = self._src_to_base64(img.attrs["src"])

            if base64_url is not None:
                img.attrs["src"] = base64_url

        return str(parsed_html)

305 

306 

def markdown2html_mistune(source):
    """Convert a markdown string to HTML using mistune.

    :param source: markdown text.
    :return: the rendering produced by ``MarkdownWithMath`` with an
        ``IPythonRenderer`` created with ``escape=False``.
    """
    renderer = IPythonRenderer(escape=False)
    converter = MarkdownWithMath(renderer=renderer)
    return converter.render(source)