1"""Markdown filters with mistune
2
3Used from markdown.py
4"""
5# Copyright (c) IPython Development Team.
6# Distributed under the terms of the Modified BSD License.
7
8import base64
9import mimetypes
10import os
11from collections.abc import Iterable
12from html import escape
13from re import Match
14from typing import TYPE_CHECKING, Any, ClassVar, Optional, Protocol
15
16import bs4
17from pygments import highlight
18from pygments.formatters import HtmlFormatter
19from pygments.lexer import Lexer
20from pygments.lexers import get_lexer_by_name
21from pygments.util import ClassNotFound
22
23from nbconvert.filters.strings import add_anchor
24
if TYPE_CHECKING:
    # This branch is only evaluated by static type checkers; the rest of the
    # module refers to ``Plugin`` through string annotations only.
    try:
        from mistune.plugins import Plugin
    except ImportError:
        # ``mistune.plugins`` does not provide ``Plugin``: declare a structural
        # stand-in so that the annotations still resolve.

        class Plugin(Protocol):  # type: ignore[no-redef]
            """Structural type of a Mistune plugin: a callable applied to the parser."""

            def __call__(self, markdown: "Markdown") -> None:
                """Install the plugin on the given ``Markdown`` instance."""
36
37
try:  # for Mistune >= 3.0
    from mistune import (  # type:ignore[attr-defined]
        BlockParser,
        BlockState,
        HTMLRenderer,
        InlineParser,
        InlineState,
        Markdown,
        import_plugin,
    )
except ImportError:  # for Mistune >= 2.0
    import re

    from mistune import (  # type: ignore[attr-defined]
        PLUGINS,
        BlockParser,
        HTMLRenderer,
        InlineParser,
        Markdown,
    )

    # Neither flag applies to the Mistune 2 code path.
    MISTUNE_V3 = False
    MISTUNE_V3_ATX = False

    def import_plugin(name: str) -> "Plugin":  # type: ignore[misc]
        """Return the plugin registered under ``name``.

        Minimal stand-in for Mistune V3's ``import_plugin`` on V2: the plugin
        is looked up in Mistune's ``PLUGINS`` mapping, so an unknown name
        raises ``KeyError``.
        """
        plugin = PLUGINS[name]
        return plugin  # type: ignore[no-any-return]

else:
    MISTUNE_V3 = True
    # True when this Mistune 3 release registers its heading rule under the
    # name "atx_heading"; the math block parser picks its rule name from this.
    MISTUNE_V3_ATX = "atx_heading" in BlockParser.SPECIFICATION
69
70
class InvalidNotebook(Exception):
    """Raised when the notebook model is invalid, e.g. a referenced attachment is missing."""
73
74
75def _dotall(pattern: str) -> str:
76 """Makes the '.' special character match any character inside the pattern, including a newline.
77
78 This is implemented with the inline flag `(?s:...)` and is equivalent to using `re.DOTALL`.
79 It is useful for LaTeX environments, where line breaks may be present.
80 """
81 return f"(?s:{pattern})"
82
83
if MISTUNE_V3:  # Parsers for Mistune >= 3.0.0

    class MathBlockParser(BlockParser):
        """Block-level pass-through towards ``MathInlineParser``.

        Without it, other block-level rules could split a math section apart.
        Each multiline math environment is matched as one single paragraph, so
        the remaining rules never see its parts as separate paragraphs. Inline
        math is ignored at this level.
        """

        # Heading pattern in which whitespace after the '#' run is optional.
        # Group names use the "atx" or "axt" spelling depending on MISTUNE_V3_ATX.
        ATX_HEADING_WITHOUT_LEADING_SPACES = (
            r"^ {0,3}(?P<atx_1>#{1,6})(?!#+)(?P<atx_2>[ \t]*(.*?)?)$"
            if MISTUNE_V3_ATX
            else r"^ {0,3}(?P<axt_1>#{1,6})(?!#+)(?P<axt_2>[ \t]*(.*?)?)$"
        )

        MULTILINE_MATH = _dotall(
            "|".join(
                (
                    # Display math mode, old TeX delimiter: $$ \sqrt{2} $$
                    r"(?<!\\)[$]{2}.*?(?<!\\)[$]{2}",
                    # Display math mode, new LaTeX delimiter: \[ \sqrt{2} \]
                    r"\\\\\[.*?\\\\\]",
                    # LaTeX environment: \begin{equation} \sqrt{2} \end{equation}
                    r"\\begin\{(?P<math_env_name>[a-z]*\*?)\}.*?\\end\{(?P=math_env_name)\}",
                )
            )
        )

        # Stock block rules, with the heading rule replaced and math added.
        SPECIFICATION = {
            **BlockParser.SPECIFICATION,
            (
                "atx_heading" if MISTUNE_V3_ATX else "axt_heading"
            ): ATX_HEADING_WITHOUT_LEADING_SPACES,
            "multiline_math": MULTILINE_MATH,
        }

        # Multiline math has to be tried before every other block rule.
        DEFAULT_RULES: ClassVar[Iterable[str]] = ("multiline_math",) + tuple(  # type: ignore[assignment]
            BlockParser.DEFAULT_RULES
        )

        def parse_multiline_math(self, m: Match[str], state: BlockState) -> int:
            """Hand the whole multiline math match over as a single paragraph.

            Returns the end position of the match.
            """
            state.add_paragraph(m.group(0))
            return m.end()

    class MathInlineParser(InlineParser):
        r"""Inline parser that understands LaTeX style math.

        It recognises the ``$$...$$``, ``\\[...\\]``, ``\\(...\\)``, ``$...$`` and
        ``\begin{foo}...\end{foo}`` notations. The delimiters are stripped in
        every case, and for the last notation the environment name (``foo`` in
        this example) is extracted as well.
        """

        # $$ \pi $$ -- display math, older TeX delimiter
        BLOCK_MATH_TEX = _dotall(r"(?<!\\)\$\$(?P<math_block_tex>.*?)(?<!\\)\$\$")
        # \[ \pi \] -- display math, newer LaTeX delimiter
        BLOCK_MATH_LATEX = _dotall(r"(?<!\\)\\\\\[(?P<math_block_latex>.*?)(?<!\\)\\\\\]")
        # $ \pi $ -- inline math, older TeX delimiter (must not be empty)
        INLINE_MATH_TEX = _dotall(r"(?<![$\\])\$(?P<math_inline_tex>.+?)(?<![$\\])\$")
        # \( \pi \) -- inline math, newer LaTeX delimiter
        INLINE_MATH_LATEX = _dotall(r"(?<!\\)\\\\\((?P<math_inline_latex>.*?)(?<!\\)\\\\\)")
        # \begin{equation} \pi \end{equation} -- LaTeX math environment
        LATEX_ENVIRONMENT = _dotall(
            r"\\begin\{(?P<math_env_name>[a-z]*\*?)\}"
            r"(?P<math_env_body>.*?)"
            r"\\end\{(?P=math_env_name)\}"
        )

        SPECIFICATION = {
            **InlineParser.SPECIFICATION,
            "block_math_tex": BLOCK_MATH_TEX,
            "block_math_latex": BLOCK_MATH_LATEX,
            "inline_math_tex": INLINE_MATH_TEX,
            "inline_math_latex": INLINE_MATH_LATEX,
            "latex_environment": LATEX_ENVIRONMENT,
        }

        # Display math is tried before inline math, and all math before plain text.
        DEFAULT_RULES: ClassVar[Iterable[str]] = (
            "block_math_tex",
            "block_math_latex",
            "inline_math_tex",
            "inline_math_latex",
            "latex_environment",
            *InlineParser.DEFAULT_RULES,
        )  # type: ignore[assignment]

        def parse_block_math_tex(self, m: Match[str], state: InlineState) -> int:
            """Emit a ``block_math`` token for ``$$...$$`` display math."""
            token = {"type": "block_math", "raw": m.group("math_block_tex")}
            state.append_token(token)
            return m.end()

        def parse_block_math_latex(self, m: Match[str], state: InlineState) -> int:
            """Emit a ``block_math`` token for the newer LaTeX display delimiters."""
            token = {"type": "block_math", "raw": m.group("math_block_latex")}
            state.append_token(token)
            return m.end()

        def parse_inline_math_tex(self, m: Match[str], state: InlineState) -> int:
            """Emit an ``inline_math`` token for ``$...$`` inline math."""
            token = {"type": "inline_math", "raw": m.group("math_inline_tex")}
            state.append_token(token)
            return m.end()

        def parse_inline_math_latex(self, m: Match[str], state: InlineState) -> int:
            """Emit an ``inline_math`` token for the newer LaTeX inline delimiters."""
            token = {"type": "inline_math", "raw": m.group("math_inline_latex")}
            state.append_token(token)
            return m.end()

        def parse_latex_environment(self, m: Match[str], state: InlineState) -> int:
            """Emit a ``latex_environment`` token carrying the name and the body."""
            env_name = m.group("math_env_name")
            env_body = m.group("math_env_body")
            state.append_token(
                {"type": "latex_environment", "attrs": {"name": env_name, "body": env_body}}
            )
            return m.end()

else:  # Parsers for Mistune >= 2.0.0 < 3.0.0

    class MathBlockParser(BlockParser):  # type: ignore[no-redef]
        """Block-level pass-through towards ``MathInlineParser``.

        Without it, other block-level rules could split a math section apart.
        """

        MULTILINE_MATH = re.compile(
            "|".join(
                (
                    # Display math mode, old TeX delimiter: $$ \sqrt{2} $$
                    r"(?<!\\)[$]{2}.*?(?<!\\)[$]{2}",
                    # Display math mode, new LaTeX delimiter: \[ \sqrt{2} \]
                    r"\\\\\[.*?\\\\\]",
                    # LaTeX environment: \begin{equation} \sqrt{2} \end{equation}
                    r"\\begin\{([a-z]*\*?)\}.*?\\end\{\1\}",
                )
            ),
            re.DOTALL,
        )

        # Heading pattern that does not require a space after the '#' run.
        AXT_HEADING = re.compile(r" {0,3}(#{1,6})(?!#+)(?: *\n+|([^\n]*?)(?:\n+|\s+?#+\s*\n+))")

        # Multiline math has to be tried before every other block rule.
        RULE_NAMES = ("multiline_math",) + tuple(BlockParser.RULE_NAMES)  # type: ignore[attr-defined]

        def parse_multiline_math(self, m: Match[str], state: Any) -> dict[str, str]:
            """Wrap the complete multiline math match in a ``multiline_math`` token."""
            whole_match = m.group(0)
            return {"type": "multiline_math", "text": whole_match}

    class MathInlineParser(InlineParser):  # type: ignore[no-redef]
        r"""Inline parser that understands LaTeX style math.

        It recognises the ``$$...$$``, ``\\[...\\]``, ``\\(...\\)``, ``$...$`` and
        ``\begin{foo}...\end{foo}`` notations. The delimiters are stripped in
        every case, and for the last notation the environment name (``foo`` in
        this example) is extracted as well.
        """

        # $$ \pi $$ -- display math, older TeX delimiter
        BLOCK_MATH_TEX = _dotall(r"(?<!\\)\$\$(.*?)(?<!\\)\$\$")
        # \[ \pi \] -- display math, newer LaTeX delimiter
        BLOCK_MATH_LATEX = _dotall(r"(?<!\\)\\\\\[(.*?)(?<!\\)\\\\\]")
        # $ \pi $ -- inline math, older TeX delimiter (must not be empty)
        INLINE_MATH_TEX = _dotall(r"(?<![$\\])\$(.+?)(?<![$\\])\$")
        # \( \pi \) -- inline math, newer LaTeX delimiter
        INLINE_MATH_LATEX = _dotall(r"(?<!\\)\\\\\((.*?)(?<!\\)\\\\\)")
        # \begin{equation} \pi \end{equation} -- LaTeX math environment
        LATEX_ENVIRONMENT = _dotall(r"\\begin\{([a-z]*\*?)\}(.*?)\\end\{\1\}")

        # Math rules are listed ahead of the stock inline rules.
        RULE_NAMES = (
            "block_math_tex",
            "block_math_latex",
            "inline_math_tex",
            "inline_math_latex",
            "latex_environment",
            *InlineParser.RULE_NAMES,  # type: ignore[attr-defined]
        )

        def parse_block_math_tex(self, m: Match[str], state: Any) -> tuple[str, str]:
            """Return a ``block_math`` token for ``$$...$$`` display math."""
            # The Scanner sometimes keeps the closing '$$', so the delimiters are
            # cut off the full match rather than read from the capture group.
            whole_match = m.group(0)
            return "block_math", whole_match[2:-2]

        def parse_block_math_latex(self, m: Match[str], state: Any) -> tuple[str, str]:
            """Return a ``block_math`` token for the newer LaTeX display delimiters."""
            return "block_math", m.group(1)

        def parse_inline_math_tex(self, m: Match[str], state: Any) -> tuple[str, str]:
            """Return an ``inline_math`` token for ``$...$`` inline math."""
            return "inline_math", m.group(1)

        def parse_inline_math_latex(self, m: Match[str], state: Any) -> tuple[str, str]:
            """Return an ``inline_math`` token for the newer LaTeX inline delimiters."""
            return "inline_math", m.group(1)

        def parse_latex_environment(self, m: Match[str], state: Any) -> tuple[str, str, str]:
            """Return a ``latex_environment`` token with the environment name and body."""
            env_name = m.group(1)
            env_body = m.group(2)
            return "latex_environment", env_name, env_body
284
285
class IPythonRenderer(HTMLRenderer):
    """HTML renderer for notebook Markdown.

    Extends Mistune's ``HTMLRenderer`` with Pygments highlighting of fenced
    code, Mermaid blocks, anchor links on headings, math that is emitted with
    its delimiters (only HTML-escaped), notebook attachments, and optional
    embedding of local images as base64 data URLs.
    """

    def __init__(
        self,
        escape: bool = True,
        allow_harmful_protocols: bool = True,
        embed_images: bool = False,
        exclude_anchor_links: bool = False,
        anchor_link_text: str = "¶",
        path: str = "",
        attachments: Optional[dict[str, dict[str, str]]] = None,
        **lexer_options,
    ):
        """Initialize the renderer.

        :param escape: forwarded to ``HTMLRenderer``.
        :param allow_harmful_protocols: forwarded to ``HTMLRenderer``.
        :param embed_images: when true, images found on disk are replaced by
            base64 data URLs, both for Markdown images and ``<img>`` tags in
            raw HTML.
        :param exclude_anchor_links: when true, headings get no anchor link.
        :param anchor_link_text: text handed to ``add_anchor`` for headings.
        :param path: directory that image sources are joined to before they
            are looked up on disk.
        :param attachments: mapping of attachment name to a mapping of MIME
            type to data. The data is inserted verbatim after ``base64,`` in
            the data URL, so it is expected to be base64 text already.
        :param lexer_options: extra keyword arguments passed to Pygments'
            ``get_lexer_by_name``.
        """
        super().__init__(escape, allow_harmful_protocols)
        self.embed_images = embed_images
        self.exclude_anchor_links = exclude_anchor_links
        self.anchor_link_text = anchor_link_text
        self.path = path
        self.lexer_options = lexer_options
        self.attachments = attachments if attachments is not None else {}

    def block_code(self, code: str, info: Optional[str] = None) -> str:
        """Render a code block, highlighted with Pygments when possible.

        :param code: content of the code block.
        :param info: info string of the fence; its first word is taken as the
            language name.
        :return: the highlighted HTML, a Mermaid block when ``info`` starts
            with ``mermaid``, or the parent renderer's output when no lexer
            could be selected.
        """
        lang: Optional[str] = ""
        lexer: Optional[Lexer] = None

        if info:
            if info.startswith("mermaid"):
                return self.block_mermaidjs(code)

            words = info.strip().split(maxsplit=1)
            if words:
                lang = words[0]
                try:
                    lexer = get_lexer_by_name(lang, **self.lexer_options)
                except ClassNotFound:
                    # Unknown language: keep its name as the first line of the
                    # code and fall back to the unhighlighted rendering.
                    code = f"{lang}\n{code}"
                    lang = None

        if not lang or lexer is None:
            return super().block_code(code, info=info)

        return highlight(code, lexer, HtmlFormatter())

    def block_mermaidjs(self, code: str) -> str:
        """Wrap Mermaid source in the ``jp-Mermaid`` container markup."""
        # NOTE(review): ``code`` is inserted as-is, without HTML escaping.
        return (
            """<div class="jp-Mermaid"><pre class="mermaid">\n"""
            f"""{code.strip()}"""
            """\n</pre></div>"""
        )

    def block_html(self, html: str) -> str:
        """Render block-level raw HTML, embedding its images when enabled."""
        if self.embed_images:
            html = self._html_embed_images(html)

        return super().block_html(html)

    def inline_html(self, html: str) -> str:
        """Render inline raw HTML, embedding its images when enabled."""
        if self.embed_images:
            html = self._html_embed_images(html)

        return super().inline_html(html)

    def heading(self, text: str, level: int, **attrs: dict[str, Any]) -> str:
        """Render a heading, with an anchor link unless those are excluded."""
        html = super().heading(text, level, **attrs)
        if self.exclude_anchor_links:
            return html
        return str(add_anchor(html, anchor_link_text=self.anchor_link_text))

    def escape_html(self, text: str) -> str:
        """Escape ``&``, ``<`` and ``>`` in ``text``; quotes are left untouched."""
        return escape(text, quote=False)

    def block_math(self, body: str) -> str:
        """Render display math as the escaped body between ``$$`` delimiters."""
        return f"$${self.escape_html(body)}$$"

    def multiline_math(self, text: str) -> str:
        """Return multiline math unchanged (token used with older mistune versions)."""
        return text

    def latex_environment(self, name: str, body: str) -> str:
        """Render a LaTeX environment with its name and body HTML-escaped."""
        name, body = self.escape_html(name), self.escape_html(body)
        return f"\\begin{{{name}}}{body}\\end{{{name}}}"

    def inline_math(self, body: str) -> str:
        """Render inline math as the escaped body between ``$`` delimiters."""
        return f"${self.escape_html(body)}$"

    def image(self, text: str, url: str, title: Optional[str] = None) -> str:
        """Render an image with title and text.

        :param text: alt text of the image.
        :param url: source link of the image.
        :param title: title text of the image.

        :note: The parameters `text` and `url` are swapped in older versions
            of mistune.
        """
        if MISTUNE_V3:
            url = self._embed_image_or_attachment(url)
        else:  # for mistune v2, the first argument is the URL
            text = self._embed_image_or_attachment(text)

        return super().image(text, url, title)

    def _embed_image_or_attachment(self, src: str) -> str:
        """Embed an image or attachment, depending on the configuration.

        :param src: image source; an ``attachment:<name>`` reference or a path.
        :return: a data URL when the source could be embedded, else ``src``.
        :raises InvalidNotebook: when the referenced attachment is missing or
            has no representation at all.
        """
        attachment_prefix = "attachment:"
        if src.startswith(attachment_prefix):
            name = src[len(attachment_prefix) :]

            if name not in self.attachments:
                msg = f"missing attachment: {name}"
                raise InvalidNotebook(msg)

            attachment = self.attachments[name]
            if not attachment:
                # Without this guard the fallback lookup below would leak a
                # bare StopIteration to the caller.
                msg = f"empty attachment: {name}"
                raise InvalidNotebook(msg)

            # we choose vector over raster, and lossless over lossy
            preferred_mime_types = ("image/svg+xml", "image/png", "image/jpeg")
            for mime_type in preferred_mime_types:
                if mime_type in attachment:
                    return f"data:{mime_type};base64,{attachment[mime_type]}"
            # otherwise we choose the first mimetype we can find
            default_mime_type = next(iter(attachment))
            return f"data:{default_mime_type};base64,{attachment[default_mime_type]}"

        if self.embed_images:
            base64_url = self._src_to_base64(src)
            if base64_url is not None:
                return base64_url

        return src

    def _src_to_base64(self, src: str) -> Optional[str]:
        """Turn the source file into a base64 url.

        :param src: source link of the file, joined to ``self.path``.
        :return: the base64 url, or None if ``src`` is not an existing
            regular file.
        """
        src_path = os.path.join(self.path, src)

        # isfile() rather than exists(): a directory (e.g. an empty ``src``
        # joined to a non-empty ``path``) would make open() fail.
        if not os.path.isfile(src_path):
            return None

        mime_type, _ = mimetypes.guess_type(src_path)
        if mime_type is None:
            # Unknown extension: use a generic type instead of emitting the
            # literal text "None" as the media type.
            mime_type = "application/octet-stream"

        with open(src_path, "rb") as fobj:
            base64_str = base64.b64encode(fobj.read()).decode("ascii")

        return f"data:{mime_type};base64,{base64_str}"

    def _html_embed_images(self, html: str) -> str:
        """Replace the ``src`` of ``<img>`` tags in ``html`` by base64 data URLs.

        Tags without ``src``, or whose source cannot be read from disk, are
        left unchanged. The document is re-serialized by BeautifulSoup.
        """
        parsed_html = bs4.BeautifulSoup(html, features="html.parser")
        imgs: bs4.ResultSet[bs4.Tag] = parsed_html.find_all("img")

        for img in imgs:
            src = img.attrs.get("src")
            if src is None:
                continue

            base64_url = self._src_to_base64(src)
            if base64_url is not None:
                img.attrs["src"] = base64_url

        return str(parsed_html)
466
467
class MarkdownWithMath(Markdown):
    """Mistune ``Markdown`` parser set up with the math-aware block and inline parsers."""

    # Plugins enabled when the caller does not pass any.
    DEFAULT_PLUGINS = (
        # "abbr", (see https://github.com/jupyter/nbconvert/pull/1853)
        # "footnotes",
        "strikethrough",
        "table",
        "url",
        "task_lists",
        "def_list",
    )

    def __init__(
        self,
        renderer: HTMLRenderer,
        block: Optional[BlockParser] = None,
        inline: Optional[InlineParser] = None,
        plugins: Optional[Iterable["Plugin"]] = None,
    ):
        """Set up the parser, filling in math-aware defaults for omitted arguments.

        :param renderer: renderer producing the output.
        :param block: block parser; defaults to a new ``MathBlockParser``.
        :param inline: inline parser; defaults to a new ``MathInlineParser``
            created with ``hard_wrap=False``.
        :param plugins: plugins to apply; defaults to ``DEFAULT_PLUGINS``.
        """
        if block is None:
            block = MathBlockParser()

        if inline is None:
            # Only the Mistune 2 inline parser takes the renderer as an argument.
            inline = (
                MathInlineParser(hard_wrap=False)
                if MISTUNE_V3
                else MathInlineParser(renderer, hard_wrap=False)  # type: ignore[arg-type,misc]
            )

        if plugins is None:
            # Lazy iterator: each plugin is resolved when the base class consumes it.
            plugins = map(import_plugin, self.DEFAULT_PLUGINS)

        super().__init__(renderer, block, inline, plugins)

    def render(self, source: str) -> str:
        """Return the rendered output for the Markdown ``source`` as a string."""
        rendered = super().__call__(source)
        return str(rendered)
504
505
def markdown2html_mistune(source: str) -> str:
    """Render a Markdown string to HTML with the math-aware mistune parser.

    The renderer is created with ``escape=False``.
    """
    renderer = IPythonRenderer(escape=False)
    converter = MarkdownWithMath(renderer=renderer)
    return converter.render(source)