Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/nbconvert/filters/markdown_mistune.py: 61%
204 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
1"""Markdown filters with mistune
3Used from markdown.py
4"""
5# Copyright (c) IPython Development Team.
6# Distributed under the terms of the Modified BSD License.
9import base64
10import mimetypes
11import os
12from html import escape
13from typing import Any, Callable, Dict, Iterable, Match, Optional, Tuple
15import bs4
16from pygments import highlight
17from pygments.formatters import HtmlFormatter
18from pygments.lexer import Lexer
19from pygments.lexers import get_lexer_by_name
20from pygments.util import ClassNotFound
22from nbconvert.filters.strings import add_anchor
# Mistune 3 and Mistune 2 expose different public names. Try the v3 import
# first and fall back to the v2 names; MISTUNE_V3 records which API is in use
# so the rest of the module can branch on it.
try:  # for Mistune >= 3.0
    from mistune import (
        BlockParser,
        BlockState,
        HTMLRenderer,
        InlineParser,
        InlineState,
        Markdown,
        import_plugin,
    )

    MISTUNE_V3 = True

except ImportError:  # for Mistune >= 2.0
    # ``re`` is only needed by the v2 parsers, which pre-compile their patterns.
    import re

    from mistune import (  # type: ignore[attr-defined]
        PLUGINS,
        BlockParser,
        HTMLRenderer,
        InlineParser,
        Markdown,
    )

    MISTUNE_V3 = False

    def import_plugin(name: str) -> 'MarkdownPlugin':  # type: ignore[misc]
        """Simple implementation of Mistune V3's import_plugin for V2.

        :param name: key of the plugin in mistune's ``PLUGINS`` mapping.
        :return: the plugin registered under ``name``.
        :raises KeyError: if no plugin is registered under ``name``.
        """
        return PLUGINS[name]  # type: ignore[no-any-return]
class InvalidNotebook(Exception):  # noqa
    """An invalid notebook model.

    Raised by the renderer when a markdown cell references data the notebook
    does not provide (for example an attachment that is missing).
    """
def _dotall(pattern: str) -> str:
    """Make the '.' special character match any character inside the pattern, including a newline.

    This is implemented with the inline flag `(?s:...)` and is equivalent to using `re.DOTALL`.
    It is useful for LaTeX environments, where line breaks may be present.

    :param pattern: regular expression source to wrap.
    :return: ``pattern`` wrapped in a ``(?s:...)`` group; the flag is scoped to
        that group only, so the result can be combined with other patterns.
    """
    return f"(?s:{pattern})"
if MISTUNE_V3:  # Parsers for Mistune >= 3.0.0

    class MathBlockParser(BlockParser):
        """This acts as a pass-through to the MathInlineParser. It is needed in
        order to avoid other block level rules splitting math sections apart.

        It works by matching each multiline math environment as a single paragraph,
        so that other rules don't think each section is its own paragraph. Inline
        is ignored here.
        """

        # Replacement for the stock "axt_heading" rule (see SPECIFICATION below).
        AXT_HEADING_WITHOUT_LEADING_SPACES = (
            r"^ {0,3}(?P<axt_1>#{1,6})(?!#+)(?P<axt_2>[ \t]*(.*?)?)$"
        )

        MULTILINE_MATH = _dotall(
            # Display math mode, old TeX delimiter: $$ \sqrt{2} $$
            r"(?<!\\)[$]{2}.*?(?<!\\)[$]{2}"
            "|"
            # Display math mode, new LaTeX delimiter: \[ \sqrt{2} \]
            r"\\\\\[.*?\\\\\]"
            "|"
            # LaTeX environment: \begin{equation} \sqrt{2} \end{equation}
            r"\\begin\{(?P<math_env_name>[a-z]*\*?)\}.*?\\end\{(?P=math_env_name)\}"
        )

        SPECIFICATION = {
            **BlockParser.SPECIFICATION,
            "axt_heading": AXT_HEADING_WITHOUT_LEADING_SPACES,
            "multiline_math": MULTILINE_MATH,
        }

        # Multiline math must be searched before other rules
        DEFAULT_RULES: Tuple[str, ...] = ("multiline_math", *BlockParser.DEFAULT_RULES)  # type: ignore[assignment]

        def parse_multiline_math(self, m: Match[str], state: BlockState) -> int:
            """Send multiline math as a single paragraph to MathInlineParser.

            :param m: match of the ``multiline_math`` rule.
            :param state: block state the paragraph is added to.
            :return: end position of the match.
            """
            matched_text = m[0]
            state.add_paragraph(matched_text)
            return m.end()

    class MathInlineParser(InlineParser):
        r"""This interprets the content of LaTeX style math objects.

        In particular this grabs ``$$...$$``, ``\\[...\\]``, ``\\(...\\)``, ``$...$``,
        and ``\begin{foo}...\end{foo}`` styles for declaring mathematics. It strips
        delimiters from all these varieties, and extracts the type of environment
        in the last case (``foo`` in this example).
        """

        # Display math mode, using older TeX delimiter: $$ \pi $$
        BLOCK_MATH_TEX = _dotall(r"(?<!\\)\$\$(?P<math_block_tex>.*?)(?<!\\)\$\$")
        # Display math mode, using newer LaTeX delimiter: \[ \pi \]
        BLOCK_MATH_LATEX = _dotall(r"(?<!\\)\\\\\[(?P<math_block_latex>.*?)(?<!\\)\\\\\]")
        # Inline math mode, using older TeX delimiter: $ \pi $ (cannot be empty!)
        INLINE_MATH_TEX = _dotall(r"(?<![$\\])\$(?P<math_inline_tex>.+?)(?<![$\\])\$")
        # Inline math mode, using newer LaTeX delimiter: \( \pi \)
        INLINE_MATH_LATEX = _dotall(r"(?<!\\)\\\\\((?P<math_inline_latex>.*?)(?<!\\)\\\\\)")
        # LaTeX math environment: \begin{equation} \pi \end{equation}
        LATEX_ENVIRONMENT = _dotall(
            r"\\begin\{(?P<math_env_name>[a-z]*\*?)\}"
            r"(?P<math_env_body>.*?)"
            r"\\end\{(?P=math_env_name)\}"
        )

        SPECIFICATION = {
            **InlineParser.SPECIFICATION,
            "block_math_tex": BLOCK_MATH_TEX,
            "block_math_latex": BLOCK_MATH_LATEX,
            "inline_math_tex": INLINE_MATH_TEX,
            "inline_math_latex": INLINE_MATH_LATEX,
            "latex_environment": LATEX_ENVIRONMENT,
        }

        # Block math must be matched first, and all math must come before text
        DEFAULT_RULES: Tuple[str, ...] = (
            "block_math_tex",
            "block_math_latex",
            "inline_math_tex",
            "inline_math_latex",
            "latex_environment",
            *InlineParser.DEFAULT_RULES,
        )  # type: ignore[assignment]

        def parse_block_math_tex(self, m: Match[str], state: InlineState) -> int:
            """Parse older TeX-style display math.

            Appends a ``block_math`` token holding the text between the
            delimiters and returns the end position of the match.
            """
            body = m.group("math_block_tex")
            state.append_token({"type": "block_math", "raw": body})
            return m.end()

        def parse_block_math_latex(self, m: Match[str], state: InlineState) -> int:
            """Parse newer LaTeX-style display math.

            Appends a ``block_math`` token holding the text between the
            delimiters and returns the end position of the match.
            """
            body = m.group("math_block_latex")
            state.append_token({"type": "block_math", "raw": body})
            return m.end()

        def parse_inline_math_tex(self, m: Match[str], state: InlineState) -> int:
            """Parse older TeX-style inline math.

            Appends an ``inline_math`` token holding the text between the
            delimiters and returns the end position of the match.
            """
            body = m.group("math_inline_tex")
            state.append_token({"type": "inline_math", "raw": body})
            return m.end()

        def parse_inline_math_latex(self, m: Match[str], state: InlineState) -> int:
            """Parse newer LaTeX-style inline math.

            Appends an ``inline_math`` token holding the text between the
            delimiters and returns the end position of the match.
            """
            body = m.group("math_inline_latex")
            state.append_token({"type": "inline_math", "raw": body})
            return m.end()

        def parse_latex_environment(self, m: Match[str], state: InlineState) -> int:
            """Parse a latex environment.

            Appends a ``latex_environment`` token whose attrs carry the
            environment ``name`` and ``body``; returns the end of the match.
            """
            attrs = {"name": m.group("math_env_name"), "body": m.group("math_env_body")}
            state.append_token({"type": "latex_environment", "attrs": attrs})
            return m.end()

else:  # Parsers for Mistune >= 2.0.0 < 3.0.0

    class MathBlockParser(BlockParser):  # type: ignore[no-redef]
        """This acts as a pass-through to the MathInlineParser. It is needed in
        order to avoid other block level rules splitting math sections apart.
        """

        MULTILINE_MATH = re.compile(
            # Display math mode, old TeX delimiter: $$ \sqrt{2} $$
            r"(?<!\\)[$]{2}.*?(?<!\\)[$]{2}|"
            # Display math mode, new LaTeX delimiter: \[ \sqrt{2} \]
            r"\\\\\[.*?\\\\\]|"
            # LaTeX environment: \begin{equation} \sqrt{2} \end{equation}
            r"\\begin\{([a-z]*\*?)\}.*?\\end\{\1\}",
            re.DOTALL,
        )

        # Regex for header that doesn't require space after '#'
        AXT_HEADING = re.compile(r" {0,3}(#{1,6})(?!#+)(?: *\n+|([^\n]*?)(?:\n+|\s+?#+\s*\n+))")

        # Multiline math must be searched before other rules
        RULE_NAMES = ("multiline_math", *BlockParser.RULE_NAMES)  # type: ignore

        def parse_multiline_math(self, m: Match[str], state: Any) -> Dict[str, str]:
            """Pass token through multiline math.

            :param m: match of the ``MULTILINE_MATH`` pattern.
            :param state: parser state (unused here).
            :return: a ``multiline_math`` token carrying the whole matched text.
            """
            return {"type": "multiline_math", "text": m.group(0)}

    class MathInlineParser(InlineParser):  # type: ignore[no-redef]
        r"""This interprets the content of LaTeX style math objects.

        In particular this grabs ``$$...$$``, ``\\[...\\]``, ``\\(...\\)``, ``$...$``,
        and ``\begin{foo}...\end{foo}`` styles for declaring mathematics. It strips
        delimiters from all these varieties, and extracts the type of environment
        in the last case (``foo`` in this example).
        """

        # Display math mode, using older TeX delimiter: $$ \pi $$
        BLOCK_MATH_TEX = _dotall(r"(?<!\\)\$\$(.*?)(?<!\\)\$\$")
        # Display math mode, using newer LaTeX delimiter: \[ \pi \]
        BLOCK_MATH_LATEX = _dotall(r"(?<!\\)\\\\\[(.*?)(?<!\\)\\\\\]")
        # Inline math mode, using older TeX delimiter: $ \pi $ (cannot be empty!)
        INLINE_MATH_TEX = _dotall(r"(?<![$\\])\$(.+?)(?<![$\\])\$")
        # Inline math mode, using newer LaTeX delimiter: \( \pi \)
        INLINE_MATH_LATEX = _dotall(r"(?<!\\)\\\\\((.*?)(?<!\\)\\\\\)")
        # LaTeX math environment: \begin{equation} \pi \end{equation}
        LATEX_ENVIRONMENT = _dotall(r"\\begin\{([a-z]*\*?)\}(.*?)\\end\{\1\}")

        # All math rules are listed ahead of the stock inline rules.
        RULE_NAMES = (
            "block_math_tex",
            "block_math_latex",
            "inline_math_tex",
            "inline_math_latex",
            "latex_environment",
            *InlineParser.RULE_NAMES,  # type: ignore
        )

        def parse_block_math_tex(self, m: Match[str], state: Any) -> Tuple[str, str]:
            """Parse block text math.

            :return: ``("block_math", text)`` with the ``$$`` delimiters removed.
            """
            # sometimes the Scanner keeps the final '$$', so we use the
            # full matched string and remove the math markers
            text = m.group(0)[2:-2]
            return "block_math", text

        def parse_block_math_latex(self, m: Match[str], state: Any) -> Tuple[str, str]:
            """Parse block latex math.

            :return: ``("block_math", text)`` where text is the first capture group.
            """
            text = m.group(1)
            return "block_math", text

        def parse_inline_math_tex(self, m: Match[str], state: Any) -> Tuple[str, str]:
            """Parse inline tex math.

            :return: ``("inline_math", text)`` where text is the first capture group.
            """
            text = m.group(1)
            return "inline_math", text

        def parse_inline_math_latex(self, m: Match[str], state: Any) -> Tuple[str, str]:
            """Parse inline latex math.

            :return: ``("inline_math", text)`` where text is the first capture group.
            """
            text = m.group(1)
            return "inline_math", text

        def parse_latex_environment(self, m: Match[str], state: Any) -> Tuple[str, str, str]:
            """Parse a latex environment.

            :return: ``("latex_environment", name, text)`` taken from the two
                capture groups of ``LATEX_ENVIRONMENT``.
            """
            name, text = m.group(1), m.group(2)
            return "latex_environment", name, text
class IPythonRenderer(HTMLRenderer):
    """An ipython html renderer.

    On top of the base HTML renderer this class highlights fenced code with
    Pygments, emits Mermaid blocks, adds anchor links to headings, passes math
    through with HTML escaping only, and can replace image sources by base64
    data URLs (notebook attachments or files on disk).
    """

    def __init__(  # noqa
        self,
        escape: bool = True,
        allow_harmful_protocols: bool = True,
        embed_images: bool = False,
        exclude_anchor_links: bool = False,
        anchor_link_text: str = "¶",
        path: str = "",
        attachments: Optional[Dict[str, Dict[str, str]]] = None,
    ):
        """Initialize the renderer.

        :param escape: forwarded to the base renderer.
        :param allow_harmful_protocols: forwarded to the base renderer.
        :param embed_images: when true, image files found on disk are inlined
            as base64 data URLs.
        :param exclude_anchor_links: when true, headings get no anchor link.
        :param anchor_link_text: text of the anchor link added to headings.
        :param path: directory that relative image sources are joined to.
        :param attachments: mapping of attachment name to a
            ``{mime_type: base64_data}`` bundle; defaults to an empty mapping.
        """
        super().__init__(escape, allow_harmful_protocols)
        self.embed_images = embed_images
        self.exclude_anchor_links = exclude_anchor_links
        self.anchor_link_text = anchor_link_text
        self.path = path
        self.attachments = attachments if attachments is not None else {}

    def block_code(self, code: str, info: Optional[str] = None) -> str:
        """Handle block code.

        :param code: content of the code block.
        :param info: info string of the fence; its first word is used as the
            language name.
        :return: Mermaid markup when ``info`` starts with ``mermaid``,
            Pygments-highlighted HTML when a lexer exists for the language,
            otherwise the base renderer's output.
        """
        lang: Optional[str] = ""
        lexer: Optional[Lexer] = None

        if info:
            if info.startswith("mermaid"):
                return self.block_mermaidjs(code)

            # A whitespace-only info string has no language token; indexing
            # the empty split result would raise IndexError.
            words = info.strip().split(maxsplit=1)
            if words:
                lang = words[0]
                try:
                    lexer = get_lexer_by_name(lang, stripall=True)
                except ClassNotFound:
                    # Unknown language: keep its name as the first line of
                    # the code and fall back to the plain rendering below.
                    code = f"{lang}\n{code}"
                    lang = None

        if not lang:
            return super().block_code(code, info=info)

        formatter = HtmlFormatter()
        return highlight(code, lexer, formatter)

    def block_mermaidjs(self, code: str) -> str:
        """Handle mermaid syntax.

        :param code: Mermaid source; it is stripped and inserted verbatim
            (no HTML escaping is applied here).
        """
        return (
            """<div class="jp-Mermaid"><pre class="mermaid">\n"""
            f"""{code.strip()}"""
            """\n</pre></div>"""
        )

    def block_html(self, html: str) -> str:
        """Handle block html, embedding its images first if enabled."""
        if self.embed_images:
            html = self._html_embed_images(html)

        return super().block_html(html)

    def inline_html(self, html: str) -> str:
        """Handle inline html, embedding its images first if enabled."""
        if self.embed_images:
            html = self._html_embed_images(html)

        return super().inline_html(html)

    def heading(self, text: str, level: int, **attrs: Dict[str, Any]) -> str:
        """Handle a heading.

        :return: the base renderer's heading, with an anchor link added
            unless ``exclude_anchor_links`` is set.
        """
        html = super().heading(text, level, **attrs)
        if self.exclude_anchor_links:
            return html
        return str(add_anchor(html, anchor_link_text=self.anchor_link_text))

    def escape_html(self, text: str) -> str:
        """Escape html content (``&``, ``<`` and ``>``; quotes are left as is)."""
        return escape(text, quote=False)

    def block_math(self, body: str) -> str:
        """Handle block math: escaped body wrapped in ``$$`` again."""
        return f"$${self.escape_html(body)}$$"

    def multiline_math(self, text: str) -> str:
        """Handle multiline math for older mistune versions (returned unchanged)."""
        return text

    def latex_environment(self, name: str, body: str) -> str:
        """Handle a latex environment: escaped name and body, re-wrapped."""
        name, body = self.escape_html(name), self.escape_html(body)
        return f"\\begin{{{name}}}{body}\\end{{{name}}}"

    def inline_math(self, body: str) -> str:
        """Handle inline math: escaped body wrapped in ``$`` again."""
        return f"${self.escape_html(body)}$"

    def image(self, text: str, url: str, title: Optional[str] = None) -> str:
        """Rendering a image with title and text.

        :param text: alt text of the image.
        :param url: source link of the image.
        :param title: title text of the image.

        :note: The parameters `text` and `url` are swapped in older versions
            of mistune.
        """
        if MISTUNE_V3:
            url = self._embed_image_or_attachment(url)
        else:  # for mistune v2, the first argument is the URL
            text = self._embed_image_or_attachment(text)

        return super().image(text, url, title)

    def _embed_image_or_attachment(self, src: str) -> str:
        """Embed an image or attachment, depending on the configuration.
        If neither is possible, returns the original URL.

        :param src: image source; ``attachment:<name>`` refers to an entry of
            ``self.attachments``.
        :return: a base64 data URL, or ``src`` itself.
        :raises InvalidNotebook: if the referenced attachment is missing or
            holds no data at all.
        """

        attachment_prefix = "attachment:"
        if src.startswith(attachment_prefix):
            name = src[len(attachment_prefix) :]

            if name not in self.attachments:
                msg = f"missing attachment: {name}"
                raise InvalidNotebook(msg)

            attachment = self.attachments[name]
            # we choose vector over raster, and lossless over lossy
            preferred_mime_types = ("image/svg+xml", "image/png", "image/jpeg")
            for mime_type in preferred_mime_types:
                if mime_type in attachment:
                    return f"data:{mime_type};base64,{attachment[mime_type]}"
            # otherwise we choose the first mimetype we can find
            default_mime_type = next(iter(attachment), None)
            if default_mime_type is None:
                # An empty bundle has nothing to embed; report it like a
                # missing attachment instead of failing with IndexError.
                msg = f"empty attachment: {name}"
                raise InvalidNotebook(msg)
            return f"data:{default_mime_type};base64,{attachment[default_mime_type]}"

        elif self.embed_images:
            base64_url = self._src_to_base64(src)
            if base64_url is not None:
                return base64_url

        return src

    def _src_to_base64(self, src: str) -> Optional[str]:
        """Turn the source file into a base64 url.

        :param src: source link of the file.
        :return: the base64 url or None if the file was not found.
        """
        src_path = os.path.join(self.path, src)

        # isfile rather than exists: a directory exists but cannot be read,
        # so it is treated like a missing file.
        if not os.path.isfile(src_path):
            return None

        mime_type, _ = mimetypes.guess_type(src_path)
        if mime_type is None:
            # Unknown extension: use the generic binary type instead of
            # writing the literal text "None" into the data URL.
            mime_type = "application/octet-stream"

        with open(src_path, "rb") as fobj:
            base64_str = base64.b64encode(fobj.read()).decode("ascii")

        return f"data:{mime_type};base64,{base64_str}"

    def _html_embed_images(self, html: str) -> str:
        """Replace the ``src`` of every ``<img>`` in ``html`` that points to a
        readable file by a base64 data URL and return the re-serialized HTML.
        """
        parsed_html = bs4.BeautifulSoup(html, features="html.parser")
        imgs: bs4.ResultSet[bs4.Tag] = parsed_html.find_all("img")

        # Replace img tags's sources by base64 dataurls
        for img in imgs:
            src = img.attrs.get("src")
            if src is None:
                continue

            base64_url = self._src_to_base64(src)
            if base64_url is not None:
                img.attrs["src"] = base64_url

        return str(parsed_html)
# Represents an already imported plugin for Mistune:
# a callable that receives the Markdown instance and returns nothing.
MarkdownPlugin = Callable[[Markdown], None]
class MarkdownWithMath(Markdown):
    """Markdown text with math enabled."""

    # Names of the mistune plugins loaded when the caller passes none.
    DEFAULT_PLUGINS = (
        # "abbr", (see https://github.com/jupyter/nbconvert/pull/1853)
        # "footnotes",
        "strikethrough",
        "table",
        "url",
        "task_lists",
        "def_list",
    )

    def __init__(
        self,
        renderer: HTMLRenderer,
        block: Optional[BlockParser] = None,
        inline: Optional[InlineParser] = None,
        plugins: Optional[Iterable[MarkdownPlugin]] = None,
    ):
        """Initialize the parser.

        :param renderer: renderer producing the output.
        :param block: block parser; defaults to a new ``MathBlockParser``.
        :param inline: inline parser; defaults to a new ``MathInlineParser``
            with hard wrapping disabled.
        :param plugins: already imported plugins; defaults to the plugins
            named in ``DEFAULT_PLUGINS``.
        """
        if block is None:
            block = MathBlockParser()
        if inline is None:
            if MISTUNE_V3:
                inline = MathInlineParser(hard_wrap=False)
            else:
                # The v2 inline parser takes the renderer as first argument.
                inline = MathInlineParser(renderer, hard_wrap=False)  # type: ignore
        if plugins is None:
            # A list, unlike a one-shot generator, can be iterated or
            # inspected more than once.
            plugins = [import_plugin(p) for p in self.DEFAULT_PLUGINS]

        super().__init__(renderer, block, inline, plugins)

    def render(self, source: str) -> str:
        """Render the HTML output for a Markdown source.

        :param source: markdown text.
        :return: the result of calling the parser on ``source``, as ``str``.
        """
        return str(super().__call__(source))
def markdown2html_mistune(source: str) -> str:
    """Convert a markdown string to HTML using mistune.

    :param source: markdown text.
    :return: output of ``MarkdownWithMath`` driven by an ``IPythonRenderer``
        created with ``escape=False``.
    """
    return MarkdownWithMath(renderer=IPythonRenderer(escape=False)).render(source)