Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/nbconvert/filters/markdown_mistune.py: 61%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

216 statements  

1"""Markdown filters with mistune 

2 

3Used from markdown.py 

4""" 

5# Copyright (c) IPython Development Team. 

6# Distributed under the terms of the Modified BSD License. 

7 

8import base64 

9import mimetypes 

10import os 

11from collections.abc import Iterable 

12from html import escape 

13from re import Match 

14from typing import TYPE_CHECKING, Any, ClassVar, Optional, Protocol 

15 

16import bs4 

17from pygments import highlight 

18from pygments.formatters import HtmlFormatter 

19from pygments.lexer import Lexer 

20from pygments.lexers import get_lexer_by_name 

21from pygments.util import ClassNotFound 

22 

23from nbconvert.filters.strings import add_anchor 

24 

if TYPE_CHECKING:
    # Only evaluated by static type checkers: use mistune's own ``Plugin``
    # type when it can be imported, otherwise declare a structural stand-in.
    try:
        from mistune.plugins import Plugin
    except ImportError:

        class Plugin(Protocol):  # type: ignore[no-redef]
            """Structural type describing a mistune plugin callable."""

            def __call__(self, markdown: "Markdown") -> None:
                """Apply the plugin to the given markdown instance."""
                ...

36 

37 

try:
    # for Mistune >= 3.0
    from mistune import (  # type:ignore[attr-defined]
        BlockParser,
        BlockState,
        HTMLRenderer,
        InlineParser,
        InlineState,
        Markdown,
        import_plugin,
    )
except ImportError:
    # for Mistune >= 2.0
    import re

    from mistune import (  # type: ignore[attr-defined]
        PLUGINS,
        BlockParser,
        HTMLRenderer,
        InlineParser,
        Markdown,
    )

    MISTUNE_V3 = False
    MISTUNE_V3_ATX = False

    def import_plugin(name: str) -> "Plugin":  # type: ignore[misc]
        """Look up a plugin by name in ``PLUGINS``.

        Stand-in for the ``import_plugin`` helper that is imported from
        mistune in the branch above.
        """
        return PLUGINS[name]  # type: ignore[no-any-return]

else:
    MISTUNE_V3 = True
    # The heading rule is registered either as "atx_heading" or under the
    # "axt_heading" spelling; record which key this mistune build uses.
    MISTUNE_V3_ATX = "atx_heading" in BlockParser.SPECIFICATION

69 

70 

class InvalidNotebook(Exception):
    """Raised when a notebook model is not valid."""

73 

74 

75def _dotall(pattern: str) -> str: 

76 """Makes the '.' special character match any character inside the pattern, including a newline. 

77 

78 This is implemented with the inline flag `(?s:...)` and is equivalent to using `re.DOTALL`. 

79 It is useful for LaTeX environments, where line breaks may be present. 

80 """ 

81 return f"(?s:{pattern})" 

82 

83 

84if MISTUNE_V3: # Parsers for Mistune >= 3.0.0 

85 

class MathBlockParser(BlockParser):
    """Block parser that hands multiline math to the inline parser untouched.

    Each multiline math environment is matched as one paragraph so that the
    other block-level rules cannot split it into several paragraphs. Inline
    math is not handled at this level.
    """

    # Heading rule that does not require a space after the leading '#'s. The
    # group names follow whichever spelling this mistune build uses.
    ATX_HEADING_WITHOUT_LEADING_SPACES = (
        r"^ {0,3}(?P<atx_1>#{1,6})(?!#+)(?P<atx_2>[ \t]*(.*?)?)$"
        if MISTUNE_V3_ATX
        else r"^ {0,3}(?P<axt_1>#{1,6})(?!#+)(?P<axt_2>[ \t]*(.*?)?)$"
    )

    MULTILINE_MATH = _dotall(
        "|".join(
            (
                # Display math mode, old TeX delimiter: $$ \sqrt{2} $$
                r"(?<!\\)[$]{2}.*?(?<!\\)[$]{2}",
                # Display math mode, new LaTeX delimiter: \[ \sqrt{2} \]
                r"\\\\\[.*?\\\\\]",
                # LaTeX environment: \begin{equation} \sqrt{2} \end{equation}
                r"\\begin\{(?P<math_env_name>[a-z]*\*?)\}.*?\\end\{(?P=math_env_name)\}",
            )
        )
    )

    SPECIFICATION = {
        **BlockParser.SPECIFICATION,
        (
            "atx_heading" if MISTUNE_V3_ATX else "axt_heading"
        ): ATX_HEADING_WITHOUT_LEADING_SPACES,
        "multiline_math": MULTILINE_MATH,
    }

    # "multiline_math" is listed first so it is tried before the inherited rules.
    DEFAULT_RULES: ClassVar[Iterable[str]] = ("multiline_math", *BlockParser.DEFAULT_RULES)  # type: ignore[assignment]

    def parse_multiline_math(self, m: Match[str], state: BlockState) -> int:
        """Add the whole multiline math match to *state* as one paragraph.

        Returns the end offset of the match.
        """
        state.add_paragraph(m.group(0))
        return m.end()

128 

class MathInlineParser(InlineParser):
    r"""Inline parser that recognises LaTeX-style math.

    Handles the ``$$...$$``, ``\\[...\\]``, ``\\(...\\)``, ``$...$`` and
    ``\begin{foo}...\end{foo}`` forms. The delimiters are left out of the
    emitted tokens; for an environment the name (``foo`` here) is captured
    separately from its body.
    """

    # Display math mode, using older TeX delimiter: $$ \pi $$
    BLOCK_MATH_TEX = _dotall(r"(?<!\\)\$\$(?P<math_block_tex>.*?)(?<!\\)\$\$")
    # Display math mode, using newer LaTeX delimiter: \[ \pi \]
    BLOCK_MATH_LATEX = _dotall(r"(?<!\\)\\\\\[(?P<math_block_latex>.*?)(?<!\\)\\\\\]")
    # Inline math mode, using older TeX delimiter: $ \pi $ (cannot be empty!)
    INLINE_MATH_TEX = _dotall(r"(?<![$\\])\$(?P<math_inline_tex>.+?)(?<![$\\])\$")
    # Inline math mode, using newer LaTeX delimiter: \( \pi \)
    INLINE_MATH_LATEX = _dotall(r"(?<!\\)\\\\\((?P<math_inline_latex>.*?)(?<!\\)\\\\\)")
    # LaTeX math environment: \begin{equation} \pi \end{equation}
    LATEX_ENVIRONMENT = _dotall(
        r"\\begin\{(?P<math_env_name>[a-z]*\*?)\}"
        r"(?P<math_env_body>.*?)"
        r"\\end\{(?P=math_env_name)\}"
    )

    SPECIFICATION = {
        **InlineParser.SPECIFICATION,
        "block_math_tex": BLOCK_MATH_TEX,
        "block_math_latex": BLOCK_MATH_LATEX,
        "inline_math_tex": INLINE_MATH_TEX,
        "inline_math_latex": INLINE_MATH_LATEX,
        "latex_environment": LATEX_ENVIRONMENT,
    }

    # Display-math rules come first, and every math rule precedes the
    # inherited inline rules.
    DEFAULT_RULES: ClassVar[Iterable[str]] = (
        "block_math_tex",
        "block_math_latex",
        "inline_math_tex",
        "inline_math_latex",
        "latex_environment",
        *InlineParser.DEFAULT_RULES,
    )  # type: ignore[assignment]

    def parse_block_math_tex(self, m: Match[str], state: InlineState) -> int:
        """Emit a ``block_math`` token for ``$$...$$`` display math."""
        token = {"type": "block_math", "raw": m["math_block_tex"]}
        state.append_token(token)
        return m.end()

    def parse_block_math_latex(self, m: Match[str], state: InlineState) -> int:
        """Emit a ``block_math`` token for the LaTeX bracket display form."""
        token = {"type": "block_math", "raw": m["math_block_latex"]}
        state.append_token(token)
        return m.end()

    def parse_inline_math_tex(self, m: Match[str], state: InlineState) -> int:
        """Emit an ``inline_math`` token for ``$...$`` inline math."""
        token = {"type": "inline_math", "raw": m["math_inline_tex"]}
        state.append_token(token)
        return m.end()

    def parse_inline_math_latex(self, m: Match[str], state: InlineState) -> int:
        """Emit an ``inline_math`` token for the LaTeX parenthesis inline form."""
        token = {"type": "inline_math", "raw": m["math_inline_latex"]}
        state.append_token(token)
        return m.end()

    def parse_latex_environment(self, m: Match[str], state: InlineState) -> int:
        """Emit a ``latex_environment`` token carrying the name and body."""
        env_name = m["math_env_name"]
        env_body = m["math_env_body"]
        state.append_token(
            {"type": "latex_environment", "attrs": {"name": env_name, "body": env_body}}
        )
        return m.end()

201 

202else: # Parsers for Mistune >= 2.0.0 < 3.0.0 

203 

class MathBlockParser(BlockParser):  # type: ignore[no-redef]
    """Block parser that hands multiline math to the inline parser untouched.

    It exists so that the other block-level rules do not split a math
    section apart.
    """

    MULTILINE_MATH = re.compile(
        # Display math mode, old TeX delimiter: $$ \sqrt{2} $$
        r"(?<!\\)[$]{2}.*?(?<!\\)[$]{2}|"
        # Display math mode, new LaTeX delimiter: \[ \sqrt{2} \]
        r"\\\\\[.*?\\\\\]|"
        # LaTeX environment: \begin{equation} \sqrt{2} \end{equation}
        r"\\begin\{([a-z]*\*?)\}.*?\\end\{\1\}",
        flags=re.DOTALL,
    )

    # Heading pattern that does not require a space after the '#'
    AXT_HEADING = re.compile(r" {0,3}(#{1,6})(?!#+)(?: *\n+|([^\n]*?)(?:\n+|\s+?#+\s*\n+))")

    # "multiline_math" is listed first so it is tried before the inherited rules.
    RULE_NAMES = ("multiline_math", *BlockParser.RULE_NAMES)  # type: ignore[attr-defined]

    def parse_multiline_math(self, m: Match[str], state: Any) -> dict[str, str]:
        """Return a ``multiline_math`` token holding the full matched text."""
        whole_match = m.group(0)
        return {"type": "multiline_math", "text": whole_match}

228 

class MathInlineParser(InlineParser):  # type: ignore[no-redef]
    r"""Inline parser that recognises LaTeX-style math.

    Handles the ``$$...$$``, ``\\[...\\]``, ``\\(...\\)``, ``$...$`` and
    ``\begin{foo}...\end{foo}`` forms. The delimiters are left out of the
    returned values; for an environment the name (``foo`` here) is returned
    separately from its body.
    """

    # Display math mode, using older TeX delimiter: $$ \pi $$
    BLOCK_MATH_TEX = _dotall(r"(?<!\\)\$\$(.*?)(?<!\\)\$\$")
    # Display math mode, using newer LaTeX delimiter: \[ \pi \]
    BLOCK_MATH_LATEX = _dotall(r"(?<!\\)\\\\\[(.*?)(?<!\\)\\\\\]")
    # Inline math mode, using older TeX delimiter: $ \pi $ (cannot be empty!)
    INLINE_MATH_TEX = _dotall(r"(?<![$\\])\$(.+?)(?<![$\\])\$")
    # Inline math mode, using newer LaTeX delimiter: \( \pi \)
    INLINE_MATH_LATEX = _dotall(r"(?<!\\)\\\\\((.*?)(?<!\\)\\\\\)")
    # LaTeX math environment: \begin{equation} \pi \end{equation}
    LATEX_ENVIRONMENT = _dotall(r"\\begin\{([a-z]*\*?)\}(.*?)\\end\{\1\}")

    # Math rules are listed ahead of the inherited inline rules.
    RULE_NAMES = (
        "block_math_tex",
        "block_math_latex",
        "inline_math_tex",
        "inline_math_latex",
        "latex_environment",
        *InlineParser.RULE_NAMES,  # type: ignore[attr-defined]
    )

    def parse_block_math_tex(self, m: Match[str], state: Any) -> tuple[str, str]:
        """Return ``("block_math", body)`` for ``$$...$$`` display math."""
        # sometimes the Scanner keeps the final '$$', so the body is taken
        # from the full matched string with the two-character markers cut off
        whole_match = m.group(0)
        return "block_math", whole_match[2:-2]

    def parse_block_math_latex(self, m: Match[str], state: Any) -> tuple[str, str]:
        """Return ``("block_math", body)`` for the LaTeX bracket display form."""
        return "block_math", m[1]

    def parse_inline_math_tex(self, m: Match[str], state: Any) -> tuple[str, str]:
        """Return ``("inline_math", body)`` for ``$...$`` inline math."""
        return "inline_math", m[1]

    def parse_inline_math_latex(self, m: Match[str], state: Any) -> tuple[str, str]:
        """Return ``("inline_math", body)`` for the LaTeX parenthesis inline form."""
        return "inline_math", m[1]

    def parse_latex_environment(self, m: Match[str], state: Any) -> tuple[str, str, str]:
        """Return ``("latex_environment", name, body)`` for a LaTeX environment."""
        return "latex_environment", m[1], m[2]

284 

285 

class IPythonRenderer(HTMLRenderer):
    """An ipython html renderer.

    On top of the base renderer this class highlights fenced code with
    Pygments, renders ``mermaid`` fences as Mermaid containers, adds anchor
    links to headings, emits math with HTML-escaped bodies, and can replace
    image sources (notebook attachments or files on disk) by ``data:`` URLs.
    """

    def __init__(
        self,
        escape: bool = True,
        allow_harmful_protocols: bool = True,
        embed_images: bool = False,
        exclude_anchor_links: bool = False,
        anchor_link_text: str = "¶",
        path: str = "",
        attachments: Optional[dict[str, dict[str, str]]] = None,
        **lexer_options,
    ):
        """Initialize the renderer.

        :param escape: forwarded to the base renderer.
        :param allow_harmful_protocols: forwarded to the base renderer.
        :param embed_images: when true, image sources that resolve to a file
            are replaced by base64 ``data:`` URLs.
        :param exclude_anchor_links: when true, headings are returned without
            an anchor link.
        :param anchor_link_text: text handed to ``add_anchor`` for headings.
        :param path: directory that image sources are joined onto.
        :param attachments: mapping of attachment name to a mapping of
            MIME type to base64 data; an empty mapping is used when omitted.
        :param lexer_options: extra keyword arguments passed to
            ``get_lexer_by_name``.
        """
        super().__init__(escape, allow_harmful_protocols)
        self.embed_images = embed_images
        self.exclude_anchor_links = exclude_anchor_links
        self.anchor_link_text = anchor_link_text
        self.path = path
        self.lexer_options = lexer_options
        # Identity check (not truthiness) so a caller-supplied empty dict is kept.
        self.attachments = attachments if attachments is not None else {}

    def block_code(self, code: str, info: Optional[str] = None) -> str:
        """Handle block code.

        :param code: body of the fenced block.
        :param info: the fence info string, if any.
        :return: Mermaid markup when *info* starts with ``mermaid``, Pygments
            output when the first word of *info* names a known lexer,
            otherwise the base renderer's output.
        """
        lang: Optional[str] = ""
        lexer: Optional[Lexer] = None

        if info:
            if info.startswith("mermaid"):
                return self.block_mermaidjs(code)

            # str.split() without a separator already ignores surrounding
            # whitespace, so the info string only needs to be split once.
            words = info.split(maxsplit=1)
            if words:
                lang = words[0]
                try:
                    lexer = get_lexer_by_name(lang, **self.lexer_options)
                except ClassNotFound:
                    # Unknown language: keep the name visible as the first
                    # line of the code and fall back to the base renderer.
                    code = f"{lang}\n{code}"
                    lang = None

        if not lang:
            return super().block_code(code, info=info)

        formatter = HtmlFormatter()
        return highlight(code, lexer, formatter)

    def block_mermaidjs(self, code: str) -> str:
        """Handle mermaid syntax by wrapping the stripped code in a container."""
        return (
            """<div class="jp-Mermaid"><pre class="mermaid">\n"""
            f"""{code.strip()}"""
            """\n</pre></div>"""
        )

    def block_html(self, html: str) -> str:
        """Handle block html, embedding images first when enabled."""
        if self.embed_images:
            html = self._html_embed_images(html)

        return super().block_html(html)

    def inline_html(self, html: str) -> str:
        """Handle inline html, embedding images first when enabled."""
        if self.embed_images:
            html = self._html_embed_images(html)

        return super().inline_html(html)

    def heading(self, text: str, level: int, **attrs: dict[str, Any]) -> str:
        """Handle a heading, adding an anchor link unless they are excluded."""
        html = super().heading(text, level, **attrs)
        if self.exclude_anchor_links:
            return html
        return str(add_anchor(html, anchor_link_text=self.anchor_link_text))

    def escape_html(self, text: str) -> str:
        """Escape html content, leaving quote characters untouched."""
        return escape(text, quote=False)

    def block_math(self, body: str) -> str:
        """Handle block math: the escaped body wrapped in ``$$``."""
        return f"$${self.escape_html(body)}$$"

    def multiline_math(self, text: str) -> str:
        """Handle multiline math for older mistune versions (returned as is)."""
        return text

    def latex_environment(self, name: str, body: str) -> str:
        """Handle a latex environment, escaping both its name and body."""
        name, body = self.escape_html(name), self.escape_html(body)
        return f"\\begin{{{name}}}{body}\\end{{{name}}}"

    def inline_math(self, body: str) -> str:
        """Handle inline math: the escaped body wrapped in ``$``."""
        return f"${self.escape_html(body)}$"

    def image(self, text: str, url: str, title: Optional[str] = None) -> str:
        """Rendering a image with title and text.

        :param text: alt text of the image.
        :param url: source link of the image.
        :param title: title text of the image.

        :note: The parameters `text` and `url` are swapped in older versions
            of mistune.
        """
        if MISTUNE_V3:
            url = self._embed_image_or_attachment(url)
        else:  # for mistune v2, the first argument is the URL
            text = self._embed_image_or_attachment(text)

        return super().image(text, url, title)

    def _embed_image_or_attachment(self, src: str) -> str:
        """Embed an image or attachment, depending on the configuration.

        If neither is possible, returns the original URL.

        :param src: image source; an ``attachment:<name>`` source is looked
            up in ``self.attachments``.
        :raises InvalidNotebook: if the named attachment is missing or holds
            no data at all.
        """
        attachment_prefix = "attachment:"
        if src.startswith(attachment_prefix):
            name = src[len(attachment_prefix) :]

            if name not in self.attachments:
                msg = f"missing attachment: {name}"
                raise InvalidNotebook(msg)

            attachment = self.attachments[name]
            # we choose vector over raster, and lossless over lossy
            preferred_mime_types = ("image/svg+xml", "image/png", "image/jpeg")
            for mime_type in preferred_mime_types:
                if mime_type in attachment:
                    return f"data:{mime_type};base64,{attachment[mime_type]}"

            # otherwise we choose the first mimetype we can find; an
            # attachment without any entry is reported as an invalid notebook
            # instead of leaking a bare StopIteration to the caller
            default_mime_type = next(iter(attachment), None)
            if default_mime_type is None:
                msg = f"empty attachment: {name}"
                raise InvalidNotebook(msg)
            return f"data:{default_mime_type};base64,{attachment[default_mime_type]}"

        if self.embed_images:
            base64_url = self._src_to_base64(src)
            if base64_url is not None:
                return base64_url

        return src

    def _src_to_base64(self, src: str) -> Optional[str]:
        """Turn the source file into a base64 url.

        :param src: source link of the file.
        :return: the base64 url or None if the source is not a regular file.
        """
        src_path = os.path.join(self.path, src)

        # isfile() rather than exists(): a directory (for example an empty
        # ``src`` joined onto ``self.path``) cannot be opened for reading.
        if not os.path.isfile(src_path):
            return None

        mime_type, _ = mimetypes.guess_type(src_path)
        if mime_type is None:
            # Avoid emitting the literal text "None" as the media type.
            mime_type = "application/octet-stream"

        with open(src_path, "rb") as fobj:
            # b64encode() output contains no newlines, so none need removing.
            base64_str = base64.b64encode(fobj.read()).decode("ascii")

        return f"data:{mime_type};base64,{base64_str}"

    def _html_embed_images(self, html: str) -> str:
        """Replace ``<img>`` sources in *html* by base64 data URLs.

        Sources that cannot be converted are left as they are.

        :param html: the html fragment to process.
        :return: the fragment serialised again after processing.
        """
        parsed_html = bs4.BeautifulSoup(html, features="html.parser")
        imgs: bs4.ResultSet[bs4.Tag] = parsed_html.find_all("img")

        # Replace img tags's sources by base64 dataurls
        for img in imgs:
            src = img.attrs.get("src")
            if src is None:
                continue

            base64_url = self._src_to_base64(src)
            if base64_url is not None:
                img.attrs["src"] = base64_url

        return str(parsed_html)

466 

467 

class MarkdownWithMath(Markdown):
    """Markdown parser wired up with the math-aware block and inline parsers."""

    DEFAULT_PLUGINS = (
        # "abbr", (see https://github.com/jupyter/nbconvert/pull/1853)
        # "footnotes",
        "strikethrough",
        "table",
        "url",
        "task_lists",
        "def_list",
    )

    def __init__(
        self,
        renderer: HTMLRenderer,
        block: Optional[BlockParser] = None,
        inline: Optional[InlineParser] = None,
        plugins: Optional[Iterable["Plugin"]] = None,
    ):
        """Set up the parser, filling in math-aware defaults.

        :param renderer: renderer handed to the base class.
        :param block: block parser; a ``MathBlockParser`` when omitted.
        :param inline: inline parser; a ``MathInlineParser`` with
            ``hard_wrap=False`` when omitted.
        :param plugins: plugins to apply; the ``DEFAULT_PLUGINS`` names are
            resolved through ``import_plugin`` when omitted.
        """
        block = MathBlockParser() if block is None else block

        if inline is None:
            # The older inline parser additionally takes the renderer.
            inline = (
                MathInlineParser(hard_wrap=False)
                if MISTUNE_V3
                else MathInlineParser(renderer, hard_wrap=False)  # type: ignore[arg-type,misc]
            )

        if plugins is None:
            plugins = (import_plugin(plugin_name) for plugin_name in self.DEFAULT_PLUGINS)

        super().__init__(renderer, block, inline, plugins)

    def render(self, source: str) -> str:
        """Return the rendered output for a Markdown source, as a string."""
        rendered = super().__call__(source)
        return str(rendered)

504 

505 

def markdown2html_mistune(source: str) -> str:
    """Convert a markdown string to HTML using mistune.

    The source is rendered by a ``MarkdownWithMath`` instance whose renderer
    is an ``IPythonRenderer`` created with ``escape=False``.
    """
    renderer = IPythonRenderer(escape=False)
    converter = MarkdownWithMath(renderer=renderer)
    return converter.render(source)
508 return MarkdownWithMath(renderer=IPythonRenderer(escape=False)).render(source)