Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/nbconvert/filters/markdown_mistune.py: 61%

204 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-01 06:54 +0000

1"""Markdown filters with mistune 

2 

3Used from markdown.py 

4""" 

5# Copyright (c) IPython Development Team. 

6# Distributed under the terms of the Modified BSD License. 

7 

8 

9import base64 

10import mimetypes 

11import os 

12from html import escape 

13from typing import Any, Callable, Dict, Iterable, Match, Optional, Tuple 

14 

15import bs4 

16from pygments import highlight 

17from pygments.formatters import HtmlFormatter 

18from pygments.lexer import Lexer 

19from pygments.lexers import get_lexer_by_name 

20from pygments.util import ClassNotFound 

21 

22from nbconvert.filters.strings import add_anchor 

23 

24try: # for Mistune >= 3.0 

25 from mistune import ( 

26 BlockParser, 

27 BlockState, 

28 HTMLRenderer, 

29 InlineParser, 

30 InlineState, 

31 Markdown, 

32 import_plugin, 

33 ) 

34 

35 MISTUNE_V3 = True 

36 

37except ImportError: # for Mistune >= 2.0 

38 import re 

39 

40 from mistune import ( # type: ignore[attr-defined] 

41 PLUGINS, 

42 BlockParser, 

43 HTMLRenderer, 

44 InlineParser, 

45 Markdown, 

46 ) 

47 

48 MISTUNE_V3 = False 

49 

def import_plugin(name: str) -> 'MarkdownPlugin':  # type: ignore[misc]
    """Return the Mistune V2 plugin registered under *name*.

    Minimal stand-in for Mistune V3's ``import_plugin``: the plugin is
    looked up in the ``PLUGINS`` table that Mistune V2 exposes.
    """
    plugin = PLUGINS[name]
    return plugin  # type: ignore[no-any-return]

53 

54 

class InvalidNotebook(Exception):  # noqa
    """Error raised for an invalid notebook model."""

59 

60 

61def _dotall(pattern: str) -> str: 

62 """Makes the '.' special character match any character inside the pattern, including a newline. 

63 

64 This is implemented with the inline flag `(?s:...)` and is equivalent to using `re.DOTALL`. 

65 It is useful for LaTeX environments, where line breaks may be present. 

66 """ 

67 return f"(?s:{pattern})" 

68 

69 

70if MISTUNE_V3: # Parsers for Mistune >= 3.0.0 

71 

class MathBlockParser(BlockParser):
    """Block-level pass-through that leaves math for the MathInlineParser.

    Without it, other block-level rules would split math sections apart.
    Every multiline math environment is matched as one single paragraph, so
    no other rule gets to treat its sections as separate paragraphs. Inline
    math is ignored at this level.
    """

    # Heading pattern that does not require whitespace after the '#' run
    AXT_HEADING_WITHOUT_LEADING_SPACES = (
        r"^ {0,3}(?P<axt_1>#{1,6})(?!#+)(?P<axt_2>[ \t]*(.*?)?)$"
    )

    MULTILINE_MATH = _dotall(
        "|".join(
            (
                # Display math mode, old TeX delimiter: $$ \sqrt{2} $$
                r"(?<!\\)[$]{2}.*?(?<!\\)[$]{2}",
                # Display math mode, new LaTeX delimiter: \[ \sqrt{2} \]
                r"\\\\\[.*?\\\\\]",
                # LaTeX environment: \begin{equation} \sqrt{2} \end{equation}
                r"\\begin\{(?P<math_env_name>[a-z]*\*?)\}.*?\\end\{(?P=math_env_name)\}",
            )
        )
    )

    # Stock block patterns, with the heading rule replaced and math added
    SPECIFICATION = dict(
        BlockParser.SPECIFICATION,
        axt_heading=AXT_HEADING_WITHOUT_LEADING_SPACES,
        multiline_math=MULTILINE_MATH,
    )

    # Multiline math has to be tried before any other rule
    DEFAULT_RULES: Tuple[str, ...] = ("multiline_math",) + tuple(BlockParser.DEFAULT_RULES)  # type: ignore[assignment]

    def parse_multiline_math(self, m: Match[str], state: BlockState) -> int:
        """Forward multiline math to the MathInlineParser as one paragraph.

        Returns the end position of the match.
        """
        state.add_paragraph(m.group(0))
        return m.end()

110 

class MathInlineParser(InlineParser):
    r"""Inline parser that understands LaTeX style math objects.

    It recognises the ``$$...$$``, ``\\[...\\]``, ``\\(...\\)``, ``$...$`` and
    ``\begin{foo}...\end{foo}`` ways of declaring mathematics. The delimiters
    are stripped from every variety, and for the last one the environment
    name (``foo`` in the example) is extracted as well.
    """

    # Display math mode, using older TeX delimiter: $$ \pi $$
    BLOCK_MATH_TEX = _dotall(r"(?<!\\)\$\$(?P<math_block_tex>.*?)(?<!\\)\$\$")
    # Display math mode, using newer LaTeX delimiter: \[ \pi \]
    BLOCK_MATH_LATEX = _dotall(r"(?<!\\)\\\\\[(?P<math_block_latex>.*?)(?<!\\)\\\\\]")
    # Inline math mode, using older TeX delimiter: $ \pi $ (cannot be empty!)
    INLINE_MATH_TEX = _dotall(r"(?<![$\\])\$(?P<math_inline_tex>.+?)(?<![$\\])\$")
    # Inline math mode, using newer LaTeX delimiter: \( \pi \)
    INLINE_MATH_LATEX = _dotall(r"(?<!\\)\\\\\((?P<math_inline_latex>.*?)(?<!\\)\\\\\)")
    # LaTeX math environment: \begin{equation} \pi \end{equation}
    LATEX_ENVIRONMENT = _dotall(
        "".join(
            (
                r"\\begin\{(?P<math_env_name>[a-z]*\*?)\}",
                r"(?P<math_env_body>.*?)",
                r"\\end\{(?P=math_env_name)\}",
            )
        )
    )

    # Stock inline patterns plus one entry per math rule
    SPECIFICATION = dict(
        InlineParser.SPECIFICATION,
        block_math_tex=BLOCK_MATH_TEX,
        block_math_latex=BLOCK_MATH_LATEX,
        inline_math_tex=INLINE_MATH_TEX,
        inline_math_latex=INLINE_MATH_LATEX,
        latex_environment=LATEX_ENVIRONMENT,
    )

    # Block math must be matched first, and all math must come before text
    DEFAULT_RULES: Tuple[str, ...] = (
        "block_math_tex",
        "block_math_latex",
        "inline_math_tex",
        "inline_math_latex",
        "latex_environment",
    ) + tuple(InlineParser.DEFAULT_RULES)  # type: ignore[assignment]

    def parse_block_math_tex(self, m: Match[str], state: InlineState) -> int:
        """Emit a ``block_math`` token for older TeX-style display math."""
        state.append_token({"type": "block_math", "raw": m.group("math_block_tex")})
        return m.end()

    def parse_block_math_latex(self, m: Match[str], state: InlineState) -> int:
        """Emit a ``block_math`` token for newer LaTeX-style display math."""
        state.append_token({"type": "block_math", "raw": m.group("math_block_latex")})
        return m.end()

    def parse_inline_math_tex(self, m: Match[str], state: InlineState) -> int:
        """Emit an ``inline_math`` token for older TeX-style inline math."""
        state.append_token({"type": "inline_math", "raw": m.group("math_inline_tex")})
        return m.end()

    def parse_inline_math_latex(self, m: Match[str], state: InlineState) -> int:
        """Emit an ``inline_math`` token for newer LaTeX-style inline math."""
        state.append_token({"type": "inline_math", "raw": m.group("math_inline_latex")})
        return m.end()

    def parse_latex_environment(self, m: Match[str], state: InlineState) -> int:
        """Emit a ``latex_environment`` token carrying the name and the body."""
        env_name, env_body = m.group("math_env_name", "math_env_body")
        state.append_token(
            {
                "type": "latex_environment",
                "attrs": {"name": env_name, "body": env_body},
            }
        )
        return m.end()

183 

184else: # Parsers for Mistune >= 2.0.0 < 3.0.0 

185 

class MathBlockParser(BlockParser):  # type: ignore[no-redef]
    """Block-level pass-through that leaves math for the MathInlineParser.

    Without it, other block-level rules would split math sections apart.
    """

    MULTILINE_MATH = re.compile(
        "|".join(
            (
                # Display math mode, old TeX delimiter: $$ \sqrt{2} $$
                r"(?<!\\)[$]{2}.*?(?<!\\)[$]{2}",
                # Display math mode, new LaTeX delimiter: \[ \sqrt{2} \]
                r"\\\\\[.*?\\\\\]",
                # LaTeX environment: \begin{equation} \sqrt{2} \end{equation}
                r"\\begin\{([a-z]*\*?)\}.*?\\end\{\1\}",
            )
        ),
        re.DOTALL,
    )

    # Regex for header that doesn't require space after '#'
    AXT_HEADING = re.compile(r" {0,3}(#{1,6})(?!#+)(?: *\n+|([^\n]*?)(?:\n+|\s+?#+\s*\n+))")

    # Multiline math has to be tried before any other rule
    RULE_NAMES = ("multiline_math",) + tuple(BlockParser.RULE_NAMES)  # type: ignore

    def parse_multiline_math(self, m: Match[str], state: Any) -> Dict[str, str]:
        """Wrap the whole multiline math match in a pass-through token."""
        matched_text = m.group(0)
        return {"type": "multiline_math", "text": matched_text}

210 

class MathInlineParser(InlineParser):  # type: ignore[no-redef]
    r"""Inline parser that understands LaTeX style math objects.

    It recognises the ``$$...$$``, ``\\[...\\]``, ``\\(...\\)``, ``$...$`` and
    ``\begin{foo}...\end{foo}`` ways of declaring mathematics. The delimiters
    are stripped from every variety, and for the last one the environment
    name (``foo`` in the example) is extracted as well.
    """

    # Display math mode, using older TeX delimiter: $$ \pi $$
    BLOCK_MATH_TEX = _dotall(r"(?<!\\)\$\$(.*?)(?<!\\)\$\$")
    # Display math mode, using newer LaTeX delimiter: \[ \pi \]
    BLOCK_MATH_LATEX = _dotall(r"(?<!\\)\\\\\[(.*?)(?<!\\)\\\\\]")
    # Inline math mode, using older TeX delimiter: $ \pi $ (cannot be empty!)
    INLINE_MATH_TEX = _dotall(r"(?<![$\\])\$(.+?)(?<![$\\])\$")
    # Inline math mode, using newer LaTeX delimiter: \( \pi \)
    INLINE_MATH_LATEX = _dotall(r"(?<!\\)\\\\\((.*?)(?<!\\)\\\\\)")
    # LaTeX math environment: \begin{equation} \pi \end{equation}
    LATEX_ENVIRONMENT = _dotall(r"\\begin\{([a-z]*\*?)\}(.*?)\\end\{\1\}")

    # Math rules are listed ahead of the stock inline rules
    RULE_NAMES = (
        "block_math_tex",
        "block_math_latex",
        "inline_math_tex",
        "inline_math_latex",
        "latex_environment",
    ) + tuple(InlineParser.RULE_NAMES)  # type: ignore

    def parse_block_math_tex(self, m: Match[str], state: Any) -> Tuple[str, str]:
        """Parse TeX-style display math."""
        # sometimes the Scanner keeps the final '$$', so we use the
        # full matched string and remove the math markers
        whole_match = m.group(0)
        return "block_math", whole_match[2:-2]

    def parse_block_math_latex(self, m: Match[str], state: Any) -> Tuple[str, str]:
        """Parse LaTeX-style display math."""
        return "block_math", m.group(1)

    def parse_inline_math_tex(self, m: Match[str], state: Any) -> Tuple[str, str]:
        """Parse TeX-style inline math."""
        return "inline_math", m.group(1)

    def parse_inline_math_latex(self, m: Match[str], state: Any) -> Tuple[str, str]:
        """Parse LaTeX-style inline math."""
        return "inline_math", m.group(1)

    def parse_latex_environment(self, m: Match[str], state: Any) -> Tuple[str, str, str]:
        """Parse a latex environment into its name and its body."""
        env_name, env_body = m.group(1, 2)
        return "latex_environment", env_name, env_body

266 

267 

class IPythonRenderer(HTMLRenderer):
    """An ipython html renderer.

    On top of the base HTML renderer it highlights fenced code with Pygments,
    wraps ``mermaid`` blocks for Mermaid, adds anchor links to headings,
    passes math through with its delimiters, and can replace image sources
    by base64 data URLs (notebook attachments or local files).
    """

    def __init__(  # noqa
        self,
        escape: bool = True,
        allow_harmful_protocols: bool = True,
        embed_images: bool = False,
        exclude_anchor_links: bool = False,
        anchor_link_text: str = "¶",
        path: str = "",
        attachments: Optional[Dict[str, Dict[str, str]]] = None,
    ):
        """Initialize the renderer.

        :param escape: forwarded to the base renderer.
        :param allow_harmful_protocols: forwarded to the base renderer.
        :param embed_images: replace local image sources by base64 data URLs.
        :param exclude_anchor_links: do not add anchor links to headings.
        :param anchor_link_text: text used for the heading anchor links.
        :param path: directory that relative image sources are joined onto.
        :param attachments: attachment name -> {mime type -> base64 data}.
        """
        super().__init__(escape, allow_harmful_protocols)
        self.embed_images = embed_images
        self.exclude_anchor_links = exclude_anchor_links
        self.anchor_link_text = anchor_link_text
        self.path = path
        self.attachments = attachments if attachments is not None else {}

    def block_code(self, code: str, info: Optional[str] = None) -> str:
        """Handle block code.

        ``mermaid`` blocks go to :meth:`block_mermaidjs`. Otherwise the first
        word of ``info`` is used as the Pygments lexer name; when there is no
        usable language the base renderer's plain code block is returned.
        """
        lang: Optional[str] = ""
        lexer: Optional[Lexer] = None

        if info:
            if info.startswith("mermaid"):
                return self.block_mermaidjs(code)

            # ``split()`` without a separator ignores surrounding whitespace
            # and yields an empty list for a blank info string; indexing
            # ``[0]`` unconditionally would raise IndexError in that case.
            words = info.split(maxsplit=1)
            if words:
                lang = words[0]
                try:
                    lexer = get_lexer_by_name(lang, stripall=True)
                except ClassNotFound:
                    # Unknown language: keep its name as the first code line
                    code = f"{lang}\n{code}"
                    lang = None

        if not lang:
            return super().block_code(code, info=info)

        formatter = HtmlFormatter()
        return highlight(code, lexer, formatter)

    def block_mermaidjs(self, code: str) -> str:
        """Handle mermaid syntax."""
        return (
            """<div class="jp-Mermaid"><pre class="mermaid">\n"""
            f"""{code.strip()}"""
            """\n</pre></div>"""
        )

    def block_html(self, html: str) -> str:
        """Handle block html, embedding its images first when enabled."""
        if self.embed_images:
            html = self._html_embed_images(html)

        return super().block_html(html)

    def inline_html(self, html: str) -> str:
        """Handle inline html, embedding its images first when enabled."""
        if self.embed_images:
            html = self._html_embed_images(html)

        return super().inline_html(html)

    def heading(self, text: str, level: int, **attrs: Dict[str, Any]) -> str:
        """Handle a heading, adding an anchor link unless they are excluded."""
        html = super().heading(text, level, **attrs)
        if self.exclude_anchor_links:
            return html
        return str(add_anchor(html, anchor_link_text=self.anchor_link_text))

    def escape_html(self, text: str) -> str:
        """Escape html content (quotes are left untouched)."""
        return escape(text, quote=False)

    def block_math(self, body: str) -> str:
        """Handle block math."""
        return f"$${self.escape_html(body)}$$"

    def multiline_math(self, text: str) -> str:
        """Handle multiline math for older mistune versions."""
        return text

    def latex_environment(self, name: str, body: str) -> str:
        """Handle a latex environment."""
        name, body = self.escape_html(name), self.escape_html(body)
        return f"\\begin{{{name}}}{body}\\end{{{name}}}"

    def inline_math(self, body: str) -> str:
        """Handle inline math."""
        return f"${self.escape_html(body)}$"

    def image(self, text: str, url: str, title: Optional[str] = None) -> str:
        """Rendering a image with title and text.

        :param text: alt text of the image.
        :param url: source link of the image.
        :param title: title text of the image.

        :note: The parameters `text` and `url` are swapped in older versions
            of mistune.
        """
        if MISTUNE_V3:
            url = self._embed_image_or_attachment(url)
        else:  # for mistune v2, the first argument is the URL
            text = self._embed_image_or_attachment(text)

        return super().image(text, url, title)

    def _embed_image_or_attachment(self, src: str) -> str:
        """Embed an image or attachment, depending on the configuration.
        If neither is possible, returns the original URL.

        :param src: source link of the image.
        :raises InvalidNotebook: if ``src`` names an attachment that is
            missing or that carries no data at all.
        """
        attachment_prefix = "attachment:"
        if src.startswith(attachment_prefix):
            name = src[len(attachment_prefix) :]

            if name not in self.attachments:
                msg = f"missing attachment: {name}"
                raise InvalidNotebook(msg)

            attachment = self.attachments[name]
            # we choose vector over raster, and lossless over lossy
            preferred_mime_types = ("image/svg+xml", "image/png", "image/jpeg")
            for mime_type in preferred_mime_types:
                if mime_type in attachment:
                    return f"data:{mime_type};base64,{attachment[mime_type]}"

            # otherwise we choose the first mimetype we can find; an
            # attachment without any entry is reported as an invalid
            # notebook rather than failing with a bare IndexError
            default_mime_type = next(iter(attachment), None)
            if default_mime_type is None:
                msg = f"attachment has no content: {name}"
                raise InvalidNotebook(msg)
            return f"data:{default_mime_type};base64,{attachment[default_mime_type]}"

        if self.embed_images:
            base64_url = self._src_to_base64(src)
            if base64_url is not None:
                return base64_url

        return src

    def _src_to_base64(self, src: str) -> Optional[str]:
        """Turn the source file into a base64 url.

        :param src: source link of the file.
        :return: the base64 url or None if the source is not an existing
            regular file.
        """
        src_path = os.path.join(self.path, src)

        # ``isfile`` rather than ``exists``: a directory (for instance an
        # empty ``src`` joined onto ``self.path``) exists but cannot be opened.
        if not os.path.isfile(src_path):
            return None

        mime_type, _ = mimetypes.guess_type(src_path)
        if mime_type is None:
            # Unknown extension: avoid emitting the literal "data:None;..."
            mime_type = "application/octet-stream"

        with open(src_path, "rb") as fobj:
            # b64encode never inserts newlines, so no clean-up is needed
            base64_str = base64.b64encode(fobj.read()).decode("ascii")

        return f"data:{mime_type};base64,{base64_str}"

    def _html_embed_images(self, html: str) -> str:
        """Replace the ``src`` of embeddable ``<img>`` tags by base64 data URLs.

        :param html: the html fragment to process.
        :return: the fragment as re-serialized by BeautifulSoup.
        """
        parsed_html = bs4.BeautifulSoup(html, features="html.parser")
        imgs: bs4.ResultSet[bs4.Tag] = parsed_html.find_all("img")

        # Replace img tags's sources by base64 dataurls
        for img in imgs:
            src = img.attrs.get("src")
            if src is None:
                continue

            base64_url = self._src_to_base64(src)
            if base64_url is not None:
                img.attrs["src"] = base64_url

        return str(parsed_html)

445 

446 

# Represents an already imported plugin for Mistune: a callable that takes
# the Markdown instance and returns nothing.
MarkdownPlugin = Callable[[Markdown], None]

449 

450 

class MarkdownWithMath(Markdown):
    """Mistune ``Markdown`` parser that uses the math-aware parsers by default."""

    # Plugins that are loaded when the caller does not pass any
    DEFAULT_PLUGINS = (
        # "abbr", (see https://github.com/jupyter/nbconvert/pull/1853)
        # "footnotes",
        "strikethrough",
        "table",
        "url",
        "task_lists",
        "def_list",
    )

    def __init__(
        self,
        renderer: HTMLRenderer,
        block: Optional[BlockParser] = None,
        inline: Optional[InlineParser] = None,
        plugins: Optional[Iterable[MarkdownPlugin]] = None,
    ):
        """Set up the parser, filling in math-aware defaults where needed.

        :param renderer: renderer that produces the output.
        :param block: block parser; a ``MathBlockParser`` when omitted.
        :param inline: inline parser; a ``MathInlineParser`` when omitted.
        :param plugins: plugins to apply; ``DEFAULT_PLUGINS`` when omitted.
        """
        if block is None:
            block = MathBlockParser()
        if inline is None:
            # Mistune V2 inline parsers take the renderer as first argument
            inline = (
                MathInlineParser(hard_wrap=False)
                if MISTUNE_V3
                else MathInlineParser(renderer, hard_wrap=False)  # type: ignore
            )
        if plugins is None:
            plugins = map(import_plugin, self.DEFAULT_PLUGINS)

        super().__init__(renderer, block, inline, plugins)

    def render(self, source: str) -> str:
        """Return the rendered output for the Markdown *source* as a string."""
        rendered = super().__call__(source)
        return str(rendered)

487 

488 

def markdown2html_mistune(source: str) -> str:
    """Convert a markdown string to HTML using mistune.

    The source is rendered by a ``MarkdownWithMath`` parser driven by an
    ``IPythonRenderer`` created with ``escape=False``.
    """
    renderer = IPythonRenderer(escape=False)
    converter = MarkdownWithMath(renderer=renderer)
    return converter.render(source)