Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/nbconvert/filters/markdown

1"""Markdown filters with mistune

3Used from markdown.py

4"""

5# Copyright (c) IPython Development Team.

6# Distributed under the terms of the Modified BSD License.

8import base64

9import mimetypes

10import os

11from collections.abc import Iterable

12from html import escape

13from re import Match

14from typing import TYPE_CHECKING, Any, ClassVar, Optional, Protocol

16import bs4

17from pygments import highlight

18from pygments.formatters import HtmlFormatter

19from pygments.lexer import Lexer

20from pygments.lexers import get_lexer_by_name

21from pygments.util import ClassNotFound

23from nbconvert.filters.strings import add_anchor

if TYPE_CHECKING:
    # This branch is only seen by static type checkers (TYPE_CHECKING is
    # False at runtime), so nothing here is imported or defined on import.
    try:
        from mistune.plugins import Plugin
    except ImportError:
        # ``mistune.plugins`` does not provide ``Plugin``: declare a
        # structurally equivalent protocol so annotations still resolve.

        class Plugin(Protocol):  # type: ignore[no-redef]
            """Mistune plugin interface."""

            def __call__(self, markdown: "Markdown") -> None:
                """Apply the plugin on the markdown document."""
                ...

# Pick the Mistune API at import time.  The first import only succeeds when
# every listed name exists (Mistune 3); otherwise the Mistune 2 names are
# imported and a small ``import_plugin`` shim is defined.
try:  # for Mistune >= 3.0
    from mistune import (  # type:ignore[attr-defined]
        BlockParser,
        BlockState,
        HTMLRenderer,
        InlineParser,
        InlineState,
        Markdown,
        import_plugin,
    )

    MISTUNE_V3 = True
    # True when the block parser's heading rule is registered as
    # "atx_heading"; when False the "axt_heading" spelling is used below.
    MISTUNE_V3_ATX = "atx_heading" in BlockParser.SPECIFICATION

except ImportError:  # for Mistune >= 2.0
    # ``re`` is used below by the Mistune 2 block parser, which pre-compiles
    # its patterns.
    import re

    from mistune import (  # type: ignore[attr-defined]
        PLUGINS,
        BlockParser,
        HTMLRenderer,
        InlineParser,
        Markdown,
    )

    MISTUNE_V3 = False
    MISTUNE_V3_ATX = False

    def import_plugin(name: str) -> "Plugin":  # type: ignore[misc]
        """Simple implementation of Mistune V3's import_plugin for V2.

        :param name: key of the plugin in Mistune's ``PLUGINS`` mapping.
        :return: the plugin registered under ``name``.
        :raises KeyError: if no plugin is registered under ``name``.
        """
        return PLUGINS[name]  # type: ignore[no-any-return]

class InvalidNotebook(Exception):
    """Raised when the notebook model being converted is invalid."""

75def _dotall(pattern: str) -> str:

76 """Makes the '.' special character match any character inside the pattern, including a newline.

78 This is implemented with the inline flag `(?s:...)` and is equivalent to using `re.DOTALL`.

79 It is useful for LaTeX environments, where line breaks may be present.

80 """

81 return f"(?s:{pattern})"

84if MISTUNE_V3: # Parsers for Mistune >= 3.0.0

class MathBlockParser(BlockParser):
    """Block-level pass-through that keeps multiline math in one piece.

    Every multiline math environment is matched as a whole and emitted as a
    single paragraph, so no other block rule can split it into separate
    paragraphs.  The paragraph is then left to MathInlineParser; inline math
    is not handled at this level.
    """

    # Heading pattern that tolerates a missing space after the '#' run.  The
    # group names follow the rule-name spelling ("atx" or "axt") selected by
    # MISTUNE_V3_ATX.
    if MISTUNE_V3_ATX:
        ATX_HEADING_WITHOUT_LEADING_SPACES = (
            r"^ {0,3}(?P<atx_1>#{1,6})(?!#+)(?P<atx_2>[ \t]*(.*?)?)$"
        )
    else:
        ATX_HEADING_WITHOUT_LEADING_SPACES = (
            r"^ {0,3}(?P<axt_1>#{1,6})(?!#+)(?P<axt_2>[ \t]*(.*?)?)$"
        )

    MULTILINE_MATH = _dotall(
        "|".join(
            (
                # Display math mode, old TeX delimiter: $$ \sqrt{2} $$
                r"(?<!\\)[$]{2}.*?(?<!\\)[$]{2}",
                # Display math mode, new LaTeX delimiter: \[ \sqrt{2} \]
                r"\\\\\[.*?\\\\\]",
                # LaTeX environment: \begin{equation} \sqrt{2} \end{equation}
                r"\\begin\{(?P<math_env_name>[a-z]*\*?)\}.*?\\end\{(?P=math_env_name)\}",
            )
        )
    )

    # Start from the stock rules, then override the heading rule and
    # register the math rule.
    SPECIFICATION = dict(BlockParser.SPECIFICATION)
    SPECIFICATION[
        "atx_heading" if MISTUNE_V3_ATX else "axt_heading"
    ] = ATX_HEADING_WITHOUT_LEADING_SPACES
    SPECIFICATION["multiline_math"] = MULTILINE_MATH

    # Multiline math must be searched before other rules
    DEFAULT_RULES: ClassVar[Iterable[str]] = ("multiline_math",) + tuple(BlockParser.DEFAULT_RULES)  # type: ignore[assignment]

    def parse_multiline_math(self, m: Match[str], state: BlockState) -> int:
        """Hand the whole multiline math match to the state as one paragraph.

        :param m: match produced by the ``multiline_math`` rule.
        :param state: block state receiving the paragraph.
        :return: the end offset of the match.
        """
        state.add_paragraph(m.group(0))
        return m.end()

128

class MathInlineParser(InlineParser):
    r"""This interprets the content of LaTeX style math objects.

    In particular this grabs ``$$...$$``, ``\\[...\\]``, ``\\(...\\)``, ``$...$``,
    and ``\begin{foo}...\end{foo}`` styles for declaring mathematics. It strips
    delimiters from all these varieties, and extracts the type of environment
    in the last case (``foo`` in this example).
    """

    # Every pattern is wrapped with ``_dotall`` so that math may span lines.
    # A delimiter preceded by a backslash is treated as escaped and ignored.

    # Display math mode, using older TeX delimiter: $$ \pi $$
    BLOCK_MATH_TEX = _dotall(r"(?<!\\)\$\$(?P<math_block_tex>.*?)(?<!\\)\$\$")
    # Display math mode, using newer LaTeX delimiter: \[ \pi \]
    BLOCK_MATH_LATEX = _dotall(r"(?<!\\)\\\\\[(?P<math_block_latex>.*?)(?<!\\)\\\\\]")
    # Inline math mode, using older TeX delimiter: $ \pi $ (cannot be empty!)
    INLINE_MATH_TEX = _dotall(r"(?<![$\\])\$(?P<math_inline_tex>.+?)(?<![$\\])\$")
    # Inline math mode, using newer LaTeX delimiter: \( \pi \)
    # Mirrors BLOCK_MATH_LATEX with parentheses instead of square brackets.
    INLINE_MATH_LATEX = _dotall(r"(?<!\\)\\\\\((?P<math_inline_latex>.*?)(?<!\\)\\\\\)")
    # LaTeX math environment: \begin{equation} \pi \end{equation}
    LATEX_ENVIRONMENT = _dotall(
        r"\\begin\{(?P<math_env_name>[a-z]*\*?)\}"
        r"(?P<math_env_body>.*?)"
        r"\\end\{(?P=math_env_name)\}"
    )

    SPECIFICATION = {
        **InlineParser.SPECIFICATION,
        "block_math_tex": BLOCK_MATH_TEX,
        "block_math_latex": BLOCK_MATH_LATEX,
        "inline_math_tex": INLINE_MATH_TEX,
        "inline_math_latex": INLINE_MATH_LATEX,
        "latex_environment": LATEX_ENVIRONMENT,
    }

    # Block math must be matched first, and all math must come before text
    DEFAULT_RULES: ClassVar[Iterable[str]] = (
        "block_math_tex",
        "block_math_latex",
        "inline_math_tex",
        "inline_math_latex",
        "latex_environment",
        *InlineParser.DEFAULT_RULES,
    )  # type: ignore[assignment]

    def parse_block_math_tex(self, m: Match[str], state: InlineState) -> int:
        """Parse older TeX-style display math.

        Appends a ``block_math`` token holding the text between the ``$$``
        delimiters and returns the end offset of the match.
        """
        body = m.group("math_block_tex")
        state.append_token({"type": "block_math", "raw": body})
        return m.end()

    def parse_block_math_latex(self, m: Match[str], state: InlineState) -> int:
        """Parse newer LaTeX-style display math.

        Appends a ``block_math`` token holding the text between the
        bracket delimiters and returns the end offset of the match.
        """
        body = m.group("math_block_latex")
        state.append_token({"type": "block_math", "raw": body})
        return m.end()

    def parse_inline_math_tex(self, m: Match[str], state: InlineState) -> int:
        """Parse older TeX-style inline math.

        Appends an ``inline_math`` token holding the text between the ``$``
        delimiters and returns the end offset of the match.
        """
        body = m.group("math_inline_tex")
        state.append_token({"type": "inline_math", "raw": body})
        return m.end()

    def parse_inline_math_latex(self, m: Match[str], state: InlineState) -> int:
        """Parse newer LaTeX-style inline math.

        Appends an ``inline_math`` token holding the text between the
        parenthesis delimiters and returns the end offset of the match.
        """
        body = m.group("math_inline_latex")
        state.append_token({"type": "inline_math", "raw": body})
        return m.end()

    def parse_latex_environment(self, m: Match[str], state: InlineState) -> int:
        """Parse a latex environment.

        Appends a ``latex_environment`` token whose attributes carry the
        environment name and body, and returns the end offset of the match.
        """
        attrs = {"name": m.group("math_env_name"), "body": m.group("math_env_body")}
        state.append_token({"type": "latex_environment", "attrs": attrs})
        return m.end()

201

202else: # Parsers for Mistune >= 2.0.0 < 3.0.0

203

class MathBlockParser(BlockParser):  # type: ignore[no-redef]
    """Block-level pass-through for math, feeding MathInlineParser.

    It exists so that other block level rules cannot split a math section
    apart.
    """

    MULTILINE_MATH = re.compile(
        "|".join(
            (
                # Display math mode, old TeX delimiter: $$ \sqrt{2} $$
                r"(?<!\\)[$]{2}.*?(?<!\\)[$]{2}",
                # Display math mode, new LaTeX delimiter: \[ \sqrt{2} \]
                r"\\\\\[.*?\\\\\]",
                # LaTeX environment: \begin{equation} \sqrt{2} \end{equation}
                r"\\begin\{([a-z]*\*?)\}.*?\\end\{\1\}",
            )
        ),
        re.DOTALL,
    )

    # Regex for header that doesn't require space after '#'
    AXT_HEADING = re.compile(r" {0,3}(#{1,6})(?!#+)(?: *\n+|([^\n]*?)(?:\n+|\s+?#+\s*\n+))")

    # Multiline math must be searched before other rules
    RULE_NAMES = ("multiline_math",) + tuple(BlockParser.RULE_NAMES)  # type: ignore[attr-defined]

    def parse_multiline_math(self, m: Match[str], state: Any) -> dict[str, str]:
        """Wrap the whole multiline math match in a ``multiline_math`` token.

        :param m: match produced by the ``multiline_math`` rule.
        :param state: parser state (not used here).
        :return: token dict carrying the matched text unchanged.
        """
        return dict(type="multiline_math", text=m[0])

228

class MathInlineParser(InlineParser):  # type: ignore[no-redef]
    r"""This interprets the content of LaTeX style math objects.

    In particular this grabs ``$$...$$``, ``\\[...\\]``, ``\\(...\\)``, ``$...$``,
    and ``\begin{foo}...\end{foo}`` styles for declaring mathematics. It strips
    delimiters from all these varieties, and extracts the type of environment
    in the last case (``foo`` in this example).
    """

    # Every pattern is wrapped with ``_dotall`` so that math may span lines.
    # A delimiter preceded by a backslash is treated as escaped and ignored.

    # Display math mode, using older TeX delimiter: $$ \pi $$
    BLOCK_MATH_TEX = _dotall(r"(?<!\\)\$\$(.*?)(?<!\\)\$\$")
    # Display math mode, using newer LaTeX delimiter: \[ \pi \]
    BLOCK_MATH_LATEX = _dotall(r"(?<!\\)\\\\\[(.*?)(?<!\\)\\\\\]")
    # Inline math mode, using older TeX delimiter: $ \pi $ (cannot be empty!)
    INLINE_MATH_TEX = _dotall(r"(?<![$\\])\$(.+?)(?<![$\\])\$")
    # Inline math mode, using newer LaTeX delimiter: \( \pi \)
    # Mirrors BLOCK_MATH_LATEX with parentheses instead of square brackets.
    INLINE_MATH_LATEX = _dotall(r"(?<!\\)\\\\\((.*?)(?<!\\)\\\\\)")
    # LaTeX math environment: \begin{equation} \pi \end{equation}
    LATEX_ENVIRONMENT = _dotall(r"\\begin\{([a-z]*\*?)\}(.*?)\\end\{\1\}")

    # Math rules are listed ahead of the stock inline rules.
    RULE_NAMES = (
        "block_math_tex",
        "block_math_latex",
        "inline_math_tex",
        "inline_math_latex",
        "latex_environment",
        *InlineParser.RULE_NAMES,  # type: ignore[attr-defined]
    )

    def parse_block_math_tex(self, m: Match[str], state: Any) -> tuple[str, str]:
        """Parse block text math.

        :return: ``("block_math", text)`` with the ``$$`` markers removed.
        """
        # sometimes the Scanner keeps the final '$$', so we use the
        # full matched string and remove the math markers
        text = m.group(0)[2:-2]
        return "block_math", text

    def parse_block_math_latex(self, m: Match[str], state: Any) -> tuple[str, str]:
        """Parse block latex math.

        :return: ``("block_math", text)`` where ``text`` is the first group.
        """
        text = m.group(1)
        return "block_math", text

    def parse_inline_math_tex(self, m: Match[str], state: Any) -> tuple[str, str]:
        """Parse inline tex math.

        :return: ``("inline_math", text)`` where ``text`` is the first group.
        """
        text = m.group(1)
        return "inline_math", text

    def parse_inline_math_latex(self, m: Match[str], state: Any) -> tuple[str, str]:
        """Parse inline latex math.

        :return: ``("inline_math", text)`` where ``text`` is the first group.
        """
        text = m.group(1)
        return "inline_math", text

    def parse_latex_environment(self, m: Match[str], state: Any) -> tuple[str, str, str]:
        """Parse a latex environment.

        :return: ``("latex_environment", name, body)`` taken from the two groups.
        """
        name, text = m.group(1), m.group(2)
        return "latex_environment", name, text

284

285

class IPythonRenderer(HTMLRenderer):
    """An ipython html renderer.

    On top of the base renderer it highlights fenced code with Pygments,
    renders Mermaid blocks, adds anchor links to headings, re-emits math with
    its delimiters (HTML-escaped), and can replace image sources with
    ``data:`` URLs built from notebook attachments or local files.
    """

    def __init__(
        self,
        escape: bool = True,
        allow_harmful_protocols: bool = True,
        embed_images: bool = False,
        exclude_anchor_links: bool = False,
        anchor_link_text: str = "¶",
        path: str = "",
        attachments: Optional[dict[str, dict[str, str]]] = None,
        **lexer_options,
    ):
        """Initialize the renderer.

        :param escape: forwarded to the base renderer.
        :param allow_harmful_protocols: forwarded to the base renderer.
        :param embed_images: replace image sources that point to existing
            local files with base64 ``data:`` URLs.
        :param exclude_anchor_links: return headings without an anchor link.
        :param anchor_link_text: text passed to ``add_anchor`` for headings.
        :param path: directory that relative image sources are joined to.
        :param attachments: mapping of attachment name to a mapping of
            mime type to data (presumably base64-encoded; it is inserted
            verbatim after ``base64,`` in the generated URL).
        :param lexer_options: extra keyword arguments passed to
            ``get_lexer_by_name``.
        """
        super().__init__(escape, allow_harmful_protocols)
        self.embed_images = embed_images
        self.exclude_anchor_links = exclude_anchor_links
        self.anchor_link_text = anchor_link_text
        self.path = path
        self.lexer_options = lexer_options
        self.attachments = attachments if attachments is not None else {}

    def block_code(self, code: str, info: Optional[str] = None) -> str:
        """Handle block code.

        :param code: content of the code block.
        :param info: info string of the fence; its first word is the language.
        :return: highlighted HTML when a Pygments lexer exists for the
            language, a Mermaid container when ``info`` starts with
            ``mermaid``, otherwise the base renderer's output.
        """
        lang: Optional[str] = ""
        lexer: Optional[Lexer] = None

        if info:
            if info.startswith("mermaid"):
                return self.block_mermaidjs(code)

            # The first whitespace-separated word names the language; an
            # all-whitespace info string yields no words and no language.
            words = info.split(maxsplit=1)
            if words:
                lang = words[0]
                try:
                    lexer = get_lexer_by_name(lang, **self.lexer_options)
                except ClassNotFound:
                    # Unknown language: put the word back in front of the
                    # code and fall through to the plain rendering.
                    code = f"{lang}\n{code}"
                    lang = None

        if not lang:
            return super().block_code(code, info=info)

        formatter = HtmlFormatter()
        return highlight(code, lexer, formatter)

    def block_mermaidjs(self, code: str) -> str:
        """Handle mermaid syntax.

        The stripped code is placed verbatim (not escaped) inside a
        ``<pre class="mermaid">`` element.
        """
        return (
            """<div class="jp-Mermaid"><pre class="mermaid">\n"""
            f"""{code.strip()}"""
            """\n</pre></div>"""
        )

    def block_html(self, html: str) -> str:
        """Handle block html, embedding its images first when enabled."""
        if self.embed_images:
            html = self._html_embed_images(html)

        return super().block_html(html)

    def inline_html(self, html: str) -> str:
        """Handle inline html, embedding its images first when enabled."""
        if self.embed_images:
            html = self._html_embed_images(html)

        return super().inline_html(html)

    def heading(self, text: str, level: int, **attrs: dict[str, Any]) -> str:
        """Handle a heading, adding an anchor link unless they are excluded."""
        html = super().heading(text, level, **attrs)
        if self.exclude_anchor_links:
            return html
        return str(add_anchor(html, anchor_link_text=self.anchor_link_text))

    def escape_html(self, text: str) -> str:
        """Escape html content (``&``, ``<`` and ``>``; quotes are kept)."""
        return escape(text, quote=False)

    def block_math(self, body: str) -> str:
        """Handle block math: escaped body wrapped in ``$$`` delimiters."""
        return f"$${self.escape_html(body)}$$"

    def multiline_math(self, text: str) -> str:
        """Handle multiline math for older mistune versions (returned as is)."""
        return text

    def latex_environment(self, name: str, body: str) -> str:
        """Handle a latex environment: escaped name and body, re-wrapped."""
        name, body = self.escape_html(name), self.escape_html(body)
        return f"\\begin{{{name}}}{body}\\end{{{name}}}"

    def inline_math(self, body: str) -> str:
        """Handle inline math: escaped body wrapped in ``$`` delimiters."""
        return f"${self.escape_html(body)}$"

    def image(self, text: str, url: str, title: Optional[str] = None) -> str:
        """Rendering a image with title and text.

        :param text: alt text of the image.
        :param url: source link of the image.
        :param title: title text of the image.

        :note: The parameters `text` and `url` are swapped in older versions
            of mistune.
        """
        if MISTUNE_V3:
            url = self._embed_image_or_attachment(url)
        else:  # for mistune v2, the first argument is the URL
            text = self._embed_image_or_attachment(text)

        return super().image(text, url, title)

    def _embed_image_or_attachment(self, src: str) -> str:
        """Embed an image or attachment, depending on the configuration.
        If neither is possible, returns the original URL.

        :param src: image source; ``attachment:<name>`` refers to an entry
            of ``self.attachments``.
        :return: a ``data:`` URL, or ``src`` unchanged.
        :raises InvalidNotebook: if the referenced attachment is missing or
            holds no data at all.
        """

        attachment_prefix = "attachment:"
        if src.startswith(attachment_prefix):
            name = src[len(attachment_prefix) :]

            if name not in self.attachments:
                msg = f"missing attachment: {name}"
                raise InvalidNotebook(msg)

            attachment = self.attachments[name]
            if not attachment:
                # Without this guard the fallback below would leak a bare
                # StopIteration out of ``next``.
                msg = f"empty attachment: {name}"
                raise InvalidNotebook(msg)

            # we choose vector over raster, and lossless over lossy
            preferred_mime_types = ("image/svg+xml", "image/png", "image/jpeg")
            for mime_type in preferred_mime_types:
                if mime_type in attachment:
                    return f"data:{mime_type};base64,{attachment[mime_type]}"
            # otherwise we choose the first mimetype we can find
            default_mime_type = next(iter(attachment))
            return f"data:{default_mime_type};base64,{attachment[default_mime_type]}"

        if self.embed_images:
            base64_url = self._src_to_base64(src)
            if base64_url is not None:
                return base64_url

        return src

    def _src_to_base64(self, src: str) -> Optional[str]:
        """Turn the source file into a base64 url.

        :param src: source link of the file.
        :return: the base64 url or None if the file was not found.
        """
        src_path = os.path.join(self.path, src)

        # Only regular files can be embedded; a directory also "exists" but
        # cannot be opened for reading.
        if not os.path.isfile(src_path):
            return None

        mime_type, _ = mimetypes.guess_type(src_path)
        if mime_type is None:
            # Unknown extension: use the generic binary type instead of
            # writing the literal text "None" into the URL.
            mime_type = "application/octet-stream"

        with open(src_path, "rb") as fobj:
            base64_str = base64.b64encode(fobj.read()).decode("ascii")

        return f"data:{mime_type};base64,{base64_str}"

    def _html_embed_images(self, html: str) -> str:
        """Replace the ``src`` of ``<img>`` tags in ``html`` by base64 data URLs.

        Tags without a ``src`` and sources that cannot be read as a local
        file are left untouched.

        :param html: HTML fragment to process.
        :return: the fragment re-serialised by BeautifulSoup.
        """
        parsed_html = bs4.BeautifulSoup(html, features="html.parser")
        imgs: bs4.ResultSet[bs4.Tag] = parsed_html.find_all("img")

        # Replace img tags's sources by base64 dataurls
        for img in imgs:
            src = img.attrs.get("src")
            if src is None:
                continue

            base64_url = self._src_to_base64(src)
            if base64_url is not None:
                img.attrs["src"] = base64_url

        return str(parsed_html)

466

467

class MarkdownWithMath(Markdown):
    """Mistune ``Markdown`` preconfigured with the math-aware parsers."""

    # Names of the plugins loaded when the caller does not pass any.
    DEFAULT_PLUGINS = (
        # "abbr", (see https://github.com/jupyter/nbconvert/pull/1853)
        # "footnotes",
        "strikethrough",
        "table",
        "url",
        "task_lists",
        "def_list",
    )

    def __init__(
        self,
        renderer: HTMLRenderer,
        block: Optional[BlockParser] = None,
        inline: Optional[InlineParser] = None,
        plugins: Optional[Iterable["Plugin"]] = None,
    ):
        """Set up the parser, filling in math-aware defaults.

        :param renderer: renderer used to produce the output.
        :param block: block parser; a ``MathBlockParser`` when omitted.
        :param inline: inline parser; a ``MathInlineParser`` when omitted.
        :param plugins: plugins to apply; ``DEFAULT_PLUGINS`` when omitted.
        """
        if block is None:
            block = MathBlockParser()
        if inline is None:
            # The Mistune 2 inline parser takes the renderer as first argument.
            inline = (
                MathInlineParser(hard_wrap=False)
                if MISTUNE_V3
                else MathInlineParser(renderer, hard_wrap=False)  # type: ignore[arg-type,misc]
            )
        if plugins is None:
            plugins = map(import_plugin, self.DEFAULT_PLUGINS)

        super().__init__(renderer, block, inline, plugins)

    def render(self, source: str) -> str:
        """Convert the Markdown ``source`` and return the result as a string."""
        rendered = super().__call__(source)
        return str(rendered)

504

505

def markdown2html_mistune(source: str) -> str:
    """Convert a markdown string to HTML using mistune.

    :param source: Markdown text.
    :return: the rendered HTML.
    """
    # Escaping is turned off on the renderer for this conversion.
    renderer = IPythonRenderer(escape=False)
    converter = MarkdownWithMath(renderer=renderer)
    return converter.render(source)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/nbconvert/filters/markdown_mistune.py: 61%

216 statements