Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/nbconvert/filters/markdown

"""Markdown filters with mistune

Used from markdown.py
"""
# Copyright (c) IPython Development Team.
# Distributed under the terms of the Modified BSD License.

9import base64

10import mimetypes

11import os

12from html import escape

13from typing import Any, Callable, Dict, Iterable, Match, Optional, Tuple

15import bs4

16from pygments import highlight

17from pygments.formatters import HtmlFormatter

18from pygments.lexer import Lexer

19from pygments.lexers import get_lexer_by_name

20from pygments.util import ClassNotFound

22from nbconvert.filters.strings import add_anchor

# Mistune changed its public API between 2.x and 3.x.  Try the 3.x names
# first; if they are missing, fall back to the 2.x names and provide a small
# shim so the rest of the module can call ``import_plugin`` either way.
try:  # for Mistune >= 3.0
    from mistune import (
        BlockParser,
        BlockState,
        HTMLRenderer,
        InlineParser,
        InlineState,
        Markdown,
        import_plugin,
    )

    MISTUNE_V3 = True

except ImportError:  # for Mistune >= 2.0
    # ``re`` is only needed by the Mistune 2 parsers, which pre-compile
    # their patterns.
    import re

    from mistune import (  # type: ignore[attr-defined]
        PLUGINS,
        BlockParser,
        HTMLRenderer,
        InlineParser,
        Markdown,
    )

    MISTUNE_V3 = False

    def import_plugin(name: str) -> "MarkdownPlugin":  # type: ignore[misc]
        """Simple implementation of Mistune V3's import_plugin for V2.

        :param name: key of the plugin in Mistune 2's ``PLUGINS`` mapping.
        :return: the plugin registered under ``name``.
        :raises KeyError: if no plugin is registered under ``name``.
        """
        return PLUGINS[name]  # type: ignore[no-any-return]

class InvalidNotebook(Exception):  # noqa
    """An invalid notebook model."""

def _dotall(pattern: str) -> str:
    """Make the '.' special character match any character inside the pattern, including a newline.

    This is implemented with the inline flag `(?s:...)` and is equivalent to using `re.DOTALL`.
    It is useful for LaTeX environments, where line breaks may be present.

    :param pattern: regular expression source to wrap (not compiled here).
    :return: ``pattern`` enclosed in a ``(?s:...)`` group.
    """
    return f"(?s:{pattern})"

if MISTUNE_V3:  # Parsers for Mistune >= 3.0.0

    class MathBlockParser(BlockParser):
        """This acts as a pass-through to the MathInlineParser. It is needed in
        order to avoid other block level rules splitting math sections apart.

        It works by matching each multiline math environment as a single paragraph,
        so that other rules don't think each section is its own paragraph. Inline
        is ignored here.
        """

        AXT_HEADING_WITHOUT_LEADING_SPACES = (
            r"^ {0,3}(?P<axt_1>#{1,6})(?!#+)(?P<axt_2>[ \t]*(.*?)?)$"
        )

        MULTILINE_MATH = _dotall(
            # Display math mode, old TeX delimiter: $$ \sqrt{2} $$
            r"(?<!\\)[$]{2}.*?(?<!\\)[$]{2}"
            "|"
            # Display math mode, new LaTeX delimiter: \[ \sqrt{2} \]
            r"\\\\\[.*?\\\\\]"
            "|"
            # LaTeX environment: \begin{equation} \sqrt{2} \end{equation}
            r"\\begin\{(?P<math_env_name>[a-z]*\*?)\}.*?\\end\{(?P=math_env_name)\}"
        )

        SPECIFICATION = {
            **BlockParser.SPECIFICATION,
            "axt_heading": AXT_HEADING_WITHOUT_LEADING_SPACES,
            "multiline_math": MULTILINE_MATH,
        }

        # Multiline math must be searched before other rules
        DEFAULT_RULES: Tuple[str, ...] = ("multiline_math", *BlockParser.DEFAULT_RULES)  # type: ignore[assignment]

        def parse_multiline_math(self, m: Match[str], state: BlockState) -> int:
            """Send multiline math as a single paragraph to MathInlineParser.

            :param m: match of the ``multiline_math`` rule.
            :param state: block state that receives the paragraph.
            :return: end offset of the match.
            """
            matched_text = m[0]
            state.add_paragraph(matched_text)
            return m.end()

    class MathInlineParser(InlineParser):
        r"""This interprets the content of LaTeX style math objects.

        In particular this grabs ``$$...$$``, ``\\[...\\]``, ``\\(...\\)``, ``$...$``,
        and ``\begin{foo}...\end{foo}`` styles for declaring mathematics. It strips
        delimiters from all these varieties, and extracts the type of environment
        in the last case (``foo`` in this example).
        """

        # Display math mode, using older TeX delimiter: $$ \pi $$
        BLOCK_MATH_TEX = _dotall(r"(?<!\\)\$\$(?P<math_block_tex>.*?)(?<!\\)\$\$")
        # Display math mode, using newer LaTeX delimiter: \[ \pi \]
        BLOCK_MATH_LATEX = _dotall(r"(?<!\\)\\\\\[(?P<math_block_latex>.*?)(?<!\\)\\\\\]")
        # Inline math mode, using older TeX delimiter: $ \pi $ (cannot be empty!)
        INLINE_MATH_TEX = _dotall(r"(?<![$\\])\$(?P<math_inline_tex>.+?)(?<![$\\])\$")
        # Inline math mode, using newer LaTeX delimiter: \( \pi \)
        # (same shape as BLOCK_MATH_LATEX, with parentheses instead of brackets)
        INLINE_MATH_LATEX = _dotall(r"(?<!\\)\\\\\((?P<math_inline_latex>.*?)(?<!\\)\\\\\)")
        # LaTeX math environment: \begin{equation} \pi \end{equation}
        LATEX_ENVIRONMENT = _dotall(
            r"\\begin\{(?P<math_env_name>[a-z]*\*?)\}"
            r"(?P<math_env_body>.*?)"
            r"\\end\{(?P=math_env_name)\}"
        )

        SPECIFICATION = {
            **InlineParser.SPECIFICATION,
            "block_math_tex": BLOCK_MATH_TEX,
            "block_math_latex": BLOCK_MATH_LATEX,
            "inline_math_tex": INLINE_MATH_TEX,
            "inline_math_latex": INLINE_MATH_LATEX,
            "latex_environment": LATEX_ENVIRONMENT,
        }

        # Block math must be matched first, and all math must come before text
        DEFAULT_RULES: Tuple[str, ...] = (
            "block_math_tex",
            "block_math_latex",
            "inline_math_tex",
            "inline_math_latex",
            "latex_environment",
            *InlineParser.DEFAULT_RULES,
        )  # type: ignore[assignment]

        def parse_block_math_tex(self, m: Match[str], state: InlineState) -> int:
            """Parse older TeX-style display math."""
            body = m.group("math_block_tex")
            state.append_token({"type": "block_math", "raw": body})
            return m.end()

        def parse_block_math_latex(self, m: Match[str], state: InlineState) -> int:
            """Parse newer LaTeX-style display math."""
            body = m.group("math_block_latex")
            state.append_token({"type": "block_math", "raw": body})
            return m.end()

        def parse_inline_math_tex(self, m: Match[str], state: InlineState) -> int:
            """Parse older TeX-style inline math."""
            body = m.group("math_inline_tex")
            state.append_token({"type": "inline_math", "raw": body})
            return m.end()

        def parse_inline_math_latex(self, m: Match[str], state: InlineState) -> int:
            """Parse newer LaTeX-style inline math."""
            body = m.group("math_inline_latex")
            state.append_token({"type": "inline_math", "raw": body})
            return m.end()

        def parse_latex_environment(self, m: Match[str], state: InlineState) -> int:
            """Parse a latex environment."""
            attrs = {"name": m.group("math_env_name"), "body": m.group("math_env_body")}
            state.append_token({"type": "latex_environment", "attrs": attrs})
            return m.end()

else:  # Parsers for Mistune >= 2.0.0 < 3.0.0

    class MathBlockParser(BlockParser):  # type: ignore[no-redef]
        """This acts as a pass-through to the MathInlineParser. It is needed in
        order to avoid other block level rules splitting math sections apart.
        """

        MULTILINE_MATH = re.compile(
            # Display math mode, old TeX delimiter: $$ \sqrt{2} $$
            r"(?<!\\)[$]{2}.*?(?<!\\)[$]{2}|"
            # Display math mode, new LaTeX delimiter: \[ \sqrt{2} \]
            r"\\\\\[.*?\\\\\]|"
            # LaTeX environment: \begin{equation} \sqrt{2} \end{equation}
            r"\\begin\{([a-z]*\*?)\}.*?\\end\{\1\}",
            re.DOTALL,
        )

        # Regex for header that doesn't require space after '#'
        AXT_HEADING = re.compile(r" {0,3}(#{1,6})(?!#+)(?: *\n+|([^\n]*?)(?:\n+|\s+?#+\s*\n+))")

        # Multiline math must be searched before other rules
        RULE_NAMES = ("multiline_math", *BlockParser.RULE_NAMES)  # type: ignore

        def parse_multiline_math(self, m: Match[str], state: Any) -> Dict[str, str]:
            """Pass token through multiline math."""
            return {"type": "multiline_math", "text": m.group(0)}

    class MathInlineParser(InlineParser):  # type: ignore[no-redef]
        r"""This interprets the content of LaTeX style math objects.

        In particular this grabs ``$$...$$``, ``\\[...\\]``, ``\\(...\\)``, ``$...$``,
        and ``\begin{foo}...\end{foo}`` styles for declaring mathematics. It strips
        delimiters from all these varieties, and extracts the type of environment
        in the last case (``foo`` in this example).
        """

        # Display math mode, using older TeX delimiter: $$ \pi $$
        BLOCK_MATH_TEX = _dotall(r"(?<!\\)\$\$(.*?)(?<!\\)\$\$")
        # Display math mode, using newer LaTeX delimiter: \[ \pi \]
        BLOCK_MATH_LATEX = _dotall(r"(?<!\\)\\\\\[(.*?)(?<!\\)\\\\\]")
        # Inline math mode, using older TeX delimiter: $ \pi $ (cannot be empty!)
        INLINE_MATH_TEX = _dotall(r"(?<![$\\])\$(.+?)(?<![$\\])\$")
        # Inline math mode, using newer LaTeX delimiter: \( \pi \)
        # (same shape as BLOCK_MATH_LATEX, with parentheses instead of brackets)
        INLINE_MATH_LATEX = _dotall(r"(?<!\\)\\\\\((.*?)(?<!\\)\\\\\)")
        # LaTeX math environment: \begin{equation} \pi \end{equation}
        LATEX_ENVIRONMENT = _dotall(r"\\begin\{([a-z]*\*?)\}(.*?)\\end\{\1\}")

        RULE_NAMES = (
            "block_math_tex",
            "block_math_latex",
            "inline_math_tex",
            "inline_math_latex",
            "latex_environment",
            *InlineParser.RULE_NAMES,  # type: ignore
        )

        def parse_block_math_tex(self, m: Match[str], state: Any) -> Tuple[str, str]:
            """Parse block text math."""
            # sometimes the Scanner keeps the final '$$', so we use the
            # full matched string and remove the math markers
            text = m.group(0)[2:-2]
            return "block_math", text

        def parse_block_math_latex(self, m: Match[str], state: Any) -> Tuple[str, str]:
            """Parse block latex math."""
            text = m.group(1)
            return "block_math", text

        def parse_inline_math_tex(self, m: Match[str], state: Any) -> Tuple[str, str]:
            """Parse inline tex math."""
            text = m.group(1)
            return "inline_math", text

        def parse_inline_math_latex(self, m: Match[str], state: Any) -> Tuple[str, str]:
            """Parse inline latex math."""
            text = m.group(1)
            return "inline_math", text

        def parse_latex_environment(self, m: Match[str], state: Any) -> Tuple[str, str, str]:
            """Parse a latex environment."""
            name, text = m.group(1), m.group(2)
            return "latex_environment", name, text

266

267

class IPythonRenderer(HTMLRenderer):
    """An ipython html renderer."""

    def __init__(  # noqa
        self,
        escape: bool = True,
        allow_harmful_protocols: bool = True,
        embed_images: bool = False,
        exclude_anchor_links: bool = False,
        anchor_link_text: str = "¶",
        path: str = "",
        attachments: Optional[Dict[str, Dict[str, str]]] = None,
    ):
        """Initialize the renderer.

        :param escape: forwarded to ``HTMLRenderer``.
        :param allow_harmful_protocols: forwarded to ``HTMLRenderer``.
        :param embed_images: when true, image sources that resolve to local
            files are replaced by base64 data URLs.
        :param exclude_anchor_links: when true, headings are returned without
            an anchor link.
        :param anchor_link_text: text passed to ``add_anchor`` for headings.
        :param path: directory that image sources are joined onto.
        :param attachments: mapping of attachment name to a mapping of
            MIME type to base64 payload; defaults to an empty mapping.
        """
        super().__init__(escape, allow_harmful_protocols)
        self.embed_images = embed_images
        self.exclude_anchor_links = exclude_anchor_links
        self.anchor_link_text = anchor_link_text
        self.path = path
        self.attachments = attachments if attachments is not None else {}

    def block_code(self, code: str, info: Optional[str] = None) -> str:
        """Handle block code.

        :param code: content of the fenced/indented block.
        :param info: info string of the fence, if any; its first token is
            used as the language name.
        :return: highlighted HTML when a Pygments lexer exists for the
            language, a mermaid container for ``mermaid`` blocks, otherwise
            the parent renderer's output.
        """
        lang: Optional[str] = ""
        lexer: Optional[Lexer] = None

        if info:
            if info.startswith("mermaid"):
                return self.block_mermaidjs(code)

            # ``split()`` without a separator ignores surrounding whitespace
            # and yields an empty list for a whitespace-only info string, so
            # guard before indexing instead of raising IndexError.
            tokens = info.split(maxsplit=1)
            if tokens:
                lang = tokens[0]
                try:
                    lexer = get_lexer_by_name(lang, stripall=True)
                except ClassNotFound:
                    # Unknown language: keep its name visible as the first
                    # line of the code and fall back to the plain renderer.
                    code = f"{lang}\n{code}"
                    lang = None

        if not lang:
            return super().block_code(code, info=info)

        formatter = HtmlFormatter()
        return highlight(code, lexer, formatter)

    def block_mermaidjs(self, code: str) -> str:
        """Handle mermaid syntax.

        :param code: mermaid diagram source; surrounding whitespace is stripped.
        :return: the source wrapped in the ``jp-Mermaid`` container markup.
        """
        return (
            """<div class="jp-Mermaid"><pre class="mermaid">\n"""
            f"""{code.strip()}"""
            """\n</pre></div>"""
        )

    def block_html(self, html: str) -> str:
        """Handle block html, embedding images first when enabled."""
        if self.embed_images:
            html = self._html_embed_images(html)

        return super().block_html(html)

    def inline_html(self, html: str) -> str:
        """Handle inline html, embedding images first when enabled."""
        if self.embed_images:
            html = self._html_embed_images(html)

        return super().inline_html(html)

    def heading(self, text: str, level: int, **attrs: Dict[str, Any]) -> str:
        """Handle a heading, adding an anchor link unless excluded."""
        html = super().heading(text, level, **attrs)
        if self.exclude_anchor_links:
            return html
        return str(add_anchor(html, anchor_link_text=self.anchor_link_text))

    def escape_html(self, text: str) -> str:
        """Escape html content (``&``, ``<`` and ``>``; quotes are left alone)."""
        return escape(text, quote=False)

    def block_math(self, body: str) -> str:
        """Handle block math: escape the body and re-wrap it in ``$$``."""
        return f"$${self.escape_html(body)}$$"

    def multiline_math(self, text: str) -> str:
        """Handle multiline math for older mistune versions (passed through unchanged)."""
        return text

    def latex_environment(self, name: str, body: str) -> str:
        """Handle a latex environment: escape name and body and re-wrap them."""
        name, body = self.escape_html(name), self.escape_html(body)
        return f"\\begin{{{name}}}{body}\\end{{{name}}}"

    def inline_math(self, body: str) -> str:
        """Handle inline math: escape the body and re-wrap it in ``$``."""
        return f"${self.escape_html(body)}$"

    def image(self, text: str, url: str, title: Optional[str] = None) -> str:
        """Rendering a image with title and text.

        :param text: alt text of the image.
        :param url: source link of the image.
        :param title: title text of the image.

        :note: The parameters `text` and `url` are swapped in older versions
            of mistune.
        """
        if MISTUNE_V3:
            url = self._embed_image_or_attachment(url)
        else:  # for mistune v2, the first argument is the URL
            text = self._embed_image_or_attachment(text)

        return super().image(text, url, title)

    def _embed_image_or_attachment(self, src: str) -> str:
        """Embed an image or attachment, depending on the configuration.
        If neither is possible, returns the original URL.

        :param src: image source; an ``attachment:<name>`` source is looked
            up in ``self.attachments``.
        :return: a data URL, or ``src`` unchanged.
        :raises InvalidNotebook: if the named attachment is missing or holds
            no data.
        """
        attachment_prefix = "attachment:"
        if src.startswith(attachment_prefix):
            name = src[len(attachment_prefix) :]

            if name not in self.attachments:
                msg = f"missing attachment: {name}"
                raise InvalidNotebook(msg)

            attachment = self.attachments[name]
            # we choose vector over raster, and lossless over lossy
            preferred_mime_types = ("image/svg+xml", "image/png", "image/jpeg")
            for mime_type in preferred_mime_types:
                if mime_type in attachment:
                    return f"data:{mime_type};base64,{attachment[mime_type]}"

            # otherwise we choose the first mimetype we can find; an empty
            # attachment is reported as an invalid notebook rather than
            # surfacing as an IndexError
            default_mime_type = next(iter(attachment), None)
            if default_mime_type is None:
                msg = f"empty attachment: {name}"
                raise InvalidNotebook(msg)
            return f"data:{default_mime_type};base64,{attachment[default_mime_type]}"

        if self.embed_images:
            base64_url = self._src_to_base64(src)
            if base64_url is not None:
                return base64_url

        return src

    def _src_to_base64(self, src: str) -> Optional[str]:
        """Turn the source file into a base64 url.

        :param src: source link of the file.
        :return: the base64 url or None if the file was not found.
        """
        src_path = os.path.join(self.path, src)

        # ``isfile`` also rejects directories, which ``open`` cannot read.
        if not os.path.isfile(src_path):
            return None

        mime_type, _ = mimetypes.guess_type(src_path)
        if mime_type is None:
            # guess_type returns None for unknown extensions; use the generic
            # binary type so the data URL stays well-formed.
            mime_type = "application/octet-stream"

        with open(src_path, "rb") as fobj:
            base64_str = base64.b64encode(fobj.read()).decode("ascii")

        return f"data:{mime_type};base64,{base64_str}"

    def _html_embed_images(self, html: str) -> str:
        """Replace the ``src`` of every embeddable ``<img>`` in ``html`` by a data URL.

        :param html: HTML fragment to process.
        :return: the fragment re-serialized by BeautifulSoup.
        """
        parsed_html = bs4.BeautifulSoup(html, features="html.parser")

        # Replace img tags's sources by base64 dataurls
        for img in parsed_html.find_all("img"):
            src = img.attrs.get("src")
            if src is None:
                continue

            base64_url = self._src_to_base64(src)
            if base64_url is not None:
                img.attrs["src"] = base64_url

        return str(parsed_html)

445

446

# Represents an already imported plugin for Mistune
MarkdownPlugin = Callable[[Markdown], None]


class MarkdownWithMath(Markdown):
    """Markdown text with math enabled."""

    DEFAULT_PLUGINS = (
        # "abbr", (see https://github.com/jupyter/nbconvert/pull/1853)
        # "footnotes",
        "strikethrough",
        "table",
        "url",
        "task_lists",
        "def_list",
    )

    def __init__(
        self,
        renderer: HTMLRenderer,
        block: Optional[BlockParser] = None,
        inline: Optional[InlineParser] = None,
        plugins: Optional[Iterable[MarkdownPlugin]] = None,
    ):
        """Initialize the parser.

        :param renderer: renderer that produces the output.
        :param block: block parser; defaults to a new ``MathBlockParser``.
        :param inline: inline parser; defaults to a new ``MathInlineParser``
            with ``hard_wrap=False``.
        :param plugins: already imported plugins; defaults to the plugins
            named in ``DEFAULT_PLUGINS``.
        """
        if block is None:
            block = MathBlockParser()
        if inline is None:
            if MISTUNE_V3:
                inline = MathInlineParser(hard_wrap=False)
            else:
                # The Mistune 2 inline parser takes the renderer as its
                # first argument.
                inline = MathInlineParser(renderer, hard_wrap=False)  # type: ignore
        if plugins is None:
            plugins = (import_plugin(p) for p in self.DEFAULT_PLUGINS)

        super().__init__(renderer, block, inline, plugins)

    def render(self, source: str) -> str:
        """Render the HTML output for a Markdown source."""
        return str(super().__call__(source))

487

488

def markdown2html_mistune(source: str) -> str:
    """Convert a markdown string to HTML using mistune.

    :param source: Markdown text.
    :return: HTML rendered by ``MarkdownWithMath`` with an
        ``IPythonRenderer`` created with ``escape=False``.
    """
    return MarkdownWithMath(renderer=IPythonRenderer(escape=False)).render(source)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/nbconvert/filters/markdown_mistune.py: 61%

204 statements