Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/nbconvert/filters/markdown_mistune.py: 73%
153 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-03 06:10 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-03 06:10 +0000
"""Markdown filters with mistune

Used from markdown.py
"""
# Copyright (c) IPython Development Team.
# Distributed under the terms of the Modified BSD License.
9import base64
10import mimetypes
11import os
12import re
13from functools import partial
14from html import escape
16import bs4
17from mistune import PLUGINS, BlockParser, HTMLRenderer, InlineParser, Markdown # type:ignore
18from pygments import highlight
19from pygments.formatters import HtmlFormatter
20from pygments.lexers import get_lexer_by_name
21from pygments.util import ClassNotFound
23from nbconvert.filters.strings import add_anchor
# Escape only ``&``, ``<`` and ``>``; with ``quote=False`` quotation marks are
# left untouched.
html_escape = partial(escape, quote=False)
class InvalidNotebook(Exception):  # noqa
    """An invalid notebook model."""
class MathBlockParser(BlockParser):
    """This acts as a pass-through to the MathInlineParser. It is needed in
    order to avoid other block level rules splitting math sections apart.
    """

    # One alternative per supported delimiter pair: ``$$...$$`` (not preceded
    # by a backslash), ``\\[...\\]`` and ``\begin{env}...\end{env}`` with the
    # same environment name on both ends.  ``re.DOTALL`` lets ``.`` span lines.
    MULTILINE_MATH = re.compile(
        r"(?<!\\)[$]{2}.*?(?<!\\)[$]{2}|"
        r"\\\\\[.*?\\\\\]|"
        r"\\begin\{([a-z]*\*?)\}.*?\\end\{\1\}",
        re.DOTALL,
    )

    # The math rule is listed ahead of the rules inherited from BlockParser.
    RULE_NAMES = ("multiline_math", *BlockParser.RULE_NAMES)

    # Regex for header that doesn't require space after '#'
    AXT_HEADING = re.compile(r" {0,3}(#{1,6})(?!#+)(?: *\n+|([^\n]*?)(?:\n+|\s+?#+\s*\n+))")

    def parse_multiline_math(self, m, state):
        """Pass a multiline math match through as a single token.

        :param m: match object produced by ``MULTILINE_MATH``.
        :param state: parser state (unused here).
        :return: a ``multiline_math`` token carrying the whole matched text,
            delimiters included.
        """
        return {"type": "multiline_math", "text": m.group(0)}
def _dotall(pattern):
    """Make the '.' special character match any character inside the pattern, including a newline.

    This is implemented with the inline flag `(?s:...)` and is equivalent to using `re.DOTALL` when
    it is the only pattern used. It is necessary since `mistune>=2.0.0`, where the pattern is passed
    to the undocumented `re.Scanner`.

    :param pattern: regular expression source, as a string.
    :return: the same pattern wrapped in a ``(?s:...)`` group.
    """
    return f"(?s:{pattern})"
class MathInlineParser(InlineParser):
    r"""This interprets the content of LaTeX style math objects.

    In particular this grabs ``$$...$$``, ``\\[...\\]``, ``\\(...\\)``, ``$...$``,
    and ``\begin{foo}...\end{foo}`` styles for declaring mathematics. It strips
    delimiters from all these varieties, and extracts the type of environment
    in the last case (``foo`` in this example).
    """

    # Every pattern goes through ``_dotall`` so that ``.`` also matches newlines.
    BLOCK_MATH_TEX = _dotall(r"(?<!\\)\$\$(.*?)(?<!\\)\$\$")
    BLOCK_MATH_LATEX = _dotall(r"(?<!\\)\\\\\[(.*?)(?<!\\)\\\\\]")
    INLINE_MATH_TEX = _dotall(r"(?<![$\\])\$(.+?)(?<![$\\])\$")
    INLINE_MATH_LATEX = _dotall(r"(?<!\\)\\\\\((.*?)(?<!\\)\\\\\)")
    LATEX_ENVIRONMENT = _dotall(r"\\begin\{([a-z]*\*?)\}(.*?)\\end\{\1\}")

    # The order is important here
    RULE_NAMES = (
        "block_math_tex",
        "block_math_latex",
        "inline_math_tex",
        "inline_math_latex",
        "latex_environment",
        *InlineParser.RULE_NAMES,
    )

    def parse_block_math_tex(self, m, state):
        """Parse ``$$...$$`` block math.

        :return: ``("block_math", text)`` with the delimiters removed.
        """
        # sometimes the Scanner keeps the final '$$', so we use the
        # full matched string and remove the math markers
        text = m.group(0)[2:-2]
        return "block_math", text

    def parse_block_math_latex(self, m, state):
        r"""Parse ``\\[...\\]`` block math.

        :return: ``("block_math", text)`` where text is the first capture group.
        """
        text = m.group(1)
        return "block_math", text

    def parse_inline_math_tex(self, m, state):
        """Parse ``$...$`` inline math.

        :return: ``("inline_math", text)`` where text is the first capture group.
        """
        text = m.group(1)
        return "inline_math", text

    def parse_inline_math_latex(self, m, state):
        r"""Parse ``\\(...\\)`` inline math.

        :return: ``("inline_math", text)`` where text is the first capture group.
        """
        text = m.group(1)
        return "inline_math", text

    def parse_latex_environment(self, m, state):
        r"""Parse a ``\begin{name}...\end{name}`` environment.

        :return: ``("latex_environment", name, text)`` taken from the first
            and second capture groups.
        """
        name, text = m.group(1), m.group(2)
        return "latex_environment", name, text
class MarkdownWithMath(Markdown):
    """Markdown text with math enabled."""

    def __init__(self, renderer, block=None, inline=None, plugins=None):
        """Initialize the parser.

        :param renderer: renderer handed to the inline parser and to ``Markdown``.
        :param block: block parser; a ``MathBlockParser`` is created when omitted.
        :param inline: inline parser; a ``MathInlineParser`` (``hard_wrap=False``)
            is created when omitted.
        :param plugins: iterable of plugins, each either a name looked up in
            mistune's ``PLUGINS`` mapping or an already resolved plugin object.
            A default set is used when omitted.
        """
        if block is None:
            block = MathBlockParser()
        if inline is None:
            inline = MathInlineParser(renderer, hard_wrap=False)
        if plugins is None:
            # "abbr" and "footnotes" are left disabled.
            plugins = [
                "strikethrough",
                "table",
                "url",
                "task_lists",
                "def_list",
            ]
        # Names are resolved through the PLUGINS registry; anything that is
        # not a string is passed through unchanged.
        plugins = [PLUGINS[p] if isinstance(p, str) else p for p in plugins]
        super().__init__(renderer, block, inline, plugins)

    def render(self, s):
        """Compatibility method with `mistune==0.8.4`."""
        return self.parse(s)
class IPythonRenderer(HTMLRenderer):
    """An ipython html renderer.

    Extends mistune's ``HTMLRenderer`` with pygments-highlighted code blocks,
    anchor links on headings, pass-through rendering of math, notebook
    ``attachment:`` images and optional embedding of local images as base64
    data URLs.
    """

    def __init__(  # noqa
        self,
        escape=True,
        allow_harmful_protocols=True,
        embed_images=False,
        exclude_anchor_links=False,
        anchor_link_text="¶",
        path="",
        attachments=None,
    ):
        """Initialize the renderer.

        :param escape: forwarded to ``HTMLRenderer``.
        :param allow_harmful_protocols: forwarded to ``HTMLRenderer``.
        :param embed_images: when true, image sources that resolve to a local
            file are replaced by base64 data URLs.
        :param exclude_anchor_links: when true, headings get no anchor link.
        :param anchor_link_text: text passed to ``add_anchor`` for headings.
        :param path: directory that image sources are joined onto.
        :param attachments: mapping of attachment name to a mapping of
            mime type to data; an empty mapping is used when omitted.
        """
        super().__init__(escape, allow_harmful_protocols)
        self.embed_images = embed_images
        self.exclude_anchor_links = exclude_anchor_links
        self.anchor_link_text = anchor_link_text
        self.path = path
        self.attachments = attachments if attachments is not None else {}

    def block_code(self, code, info=None):
        """Handle block code.

        The first word of ``info`` is taken as the language. When pygments
        knows a lexer for it, the code is highlighted; otherwise the block is
        rendered by ``HTMLRenderer.block_code``.

        :param code: content of the code block.
        :param info: info string of the code block, if any.
        :return: the rendered HTML.
        """
        lang = ""
        lexer = None
        # A whitespace-only info string yields no words and is treated like a
        # missing one instead of raising IndexError.
        words = info.strip().split(None, 1) if info else []
        if words:
            lang = words[0]
            try:
                lexer = get_lexer_by_name(lang, stripall=True)
            except ClassNotFound:
                # Unknown language: keep the word as the first line of the
                # plain code block.
                code = lang + "\n" + code
                lang = None  # type:ignore

        if not lang:
            return super().block_code(code)

        formatter = HtmlFormatter()
        return highlight(code, lexer, formatter)

    def block_html(self, html):
        """Handle block html, embedding its images first when enabled."""
        if self.embed_images:
            html = self._html_embed_images(html)

        return super().block_html(html)

    def inline_html(self, html):
        """Handle inline html, embedding its images first when enabled."""
        if self.embed_images:
            html = self._html_embed_images(html)

        return super().inline_html(html)

    def heading(self, text, level):
        """Handle a heading, adding an anchor link unless they are excluded."""
        html = super().heading(text, level)
        if self.exclude_anchor_links:
            return html
        return add_anchor(html, anchor_link_text=self.anchor_link_text)

    def escape_html(self, text):
        """Escape html content."""
        return html_escape(text)

    def multiline_math(self, text):
        """Handle multiline math: the text is returned unchanged."""
        return text

    def block_math(self, text):
        """Handle block math: escaped text wrapped in ``$$``."""
        return f"$${self.escape_html(text)}$$"

    def latex_environment(self, name, text):
        """Handle a latex environment: escaped name and text, re-wrapped."""
        name, text = self.escape_html(name), self.escape_html(text)
        return f"\\begin{{{name}}}{text}\\end{{{name}}}"

    def inline_math(self, text):
        """Handle inline math: escaped text wrapped in ``$``."""
        return f"${self.escape_html(text)}$"

    def image(self, src, text, title):
        """Rendering a image with title and text.

        :param src: source link of the image.
        :param text: alt text of the image.
        :param title: title text of the image.
        :raises InvalidNotebook: if ``src`` names an attachment that is
            missing or that holds no data.
        """
        attachment_prefix = "attachment:"

        if src.startswith(attachment_prefix):
            name = src[len(attachment_prefix) :]

            if name not in self.attachments:
                msg = f"missing attachment: {name}"
                raise InvalidNotebook(msg)

            attachment = self.attachments[name]
            if not attachment:
                # Without this check, picking the first mime type below would
                # fail with a bare IndexError/StopIteration.
                msg = f"empty attachment: {name}"
                raise InvalidNotebook(msg)

            # we choose vector over raster, and lossless over lossy;
            # otherwise we choose the first mimetype we can find
            preferred_mime_types = ("image/svg+xml", "image/png", "image/jpeg")
            mime_type = next(
                (candidate for candidate in preferred_mime_types if candidate in attachment),
                next(iter(attachment)),
            )
            data = attachment[mime_type]
            src = "data:" + mime_type + ";base64," + data

        elif self.embed_images:
            base64_url = self._src_to_base64(src)

            if base64_url is not None:
                src = base64_url

        return super().image(src, text, title)

    def _src_to_base64(self, src):
        """Turn the source file into a base64 url.

        :param src: source link of the file.
        :return: the base64 url or None if the file was not found.
        """
        # NOTE(review): src comes from the rendered document and is not
        # restricted to stay below self.path -- confirm this is acceptable.
        src_path = os.path.join(self.path, src)

        # Only regular files can be embedded; a directory passes an
        # ``exists`` check but makes ``open`` fail.
        if not os.path.isfile(src_path):
            return None

        # guess_type returns None for unknown extensions; fall back to a
        # generic type so the URL never contains the text "None".
        mime_type = mimetypes.guess_type(src_path)[0] or "application/octet-stream"

        with open(src_path, "rb") as fobj:
            base64_str = base64.b64encode(fobj.read()).decode("ascii")

        return f"data:{mime_type};base64,{base64_str}"

    def _html_embed_images(self, html):
        """Replace the ``src`` of ``<img>`` tags in ``html`` by base64 data URLs.

        :param html: HTML fragment to process.
        :return: the fragment re-serialized by BeautifulSoup; sources that
            cannot be converted are left as they are.
        """
        parsed_html = bs4.BeautifulSoup(html, features="html.parser")

        # Replace img tags's sources by base64 dataurls
        for img in parsed_html.find_all("img"):
            if "src" not in img.attrs:
                continue

            base64_url = self._src_to_base64(img.attrs["src"])

            if base64_url is not None:
                img.attrs["src"] = base64_url

        return str(parsed_html)
def markdown2html_mistune(source):
    """Convert a markdown string to HTML using mistune.

    :param source: markdown text.
    :return: the result of rendering ``source`` with ``MarkdownWithMath`` and
        an ``IPythonRenderer`` created with ``escape=False``.
    """
    renderer = IPythonRenderer(escape=False)
    return MarkdownWithMath(renderer=renderer).render(source)