1"""Export to PDF via a headless browser"""
2
3# Copyright (c) IPython Development Team.
4# Distributed under the terms of the Modified BSD License.
5
6import asyncio
7import concurrent.futures
8import os
9import subprocess
10import sys
11import tempfile
12from importlib import util as importlib_util
13
14from traitlets import Bool, Int, List, Unicode, default
15
16from .html import HTMLExporter
17
# True when the optional ``playwright`` package can be located on the import
# path. ``find_spec`` only looks the package up; this line does not import it.
PLAYWRIGHT_INSTALLED = importlib_util.find_spec("playwright") is not None
# True when ``os.name`` reports "nt".
IS_WINDOWS = os.name == "nt"
20
21
class WebPDFExporter(HTMLExporter):
    """Writer designed to write to PDF files.

    This inherits from :class:`HTMLExporter`. It creates the HTML using the
    template machinery, and then runs playwright to create a PDF.
    """

    export_from_notebook = "PDF via HTML"

    allow_chromium_download = Bool(
        False,
        help="Whether to allow downloading Chromium if no suitable version is found on the system.",
    ).tag(config=True)

    paginate = Bool(
        True,
        help="""
        Split generated notebook into multiple pages.

        If False, a PDF with one long page will be generated.

        Set to True to match behavior of LaTeX based PDF generator
        """,
    ).tag(config=True)

    page_render_timeout = Int(
        100,
        help="""
        Time to wait for the page to render before converting to PDF, in milliseconds.
        Increase this value if your notebook has a lot of complex JavaScript
        output that needs more time to load.
        """,
    ).tag(config=True)

    @default("file_extension")
    def _file_extension_default(self):
        """Return the extension used for the exported file."""
        return ".pdf"

    @default("template_extension")
    def _template_extension_default(self):
        """Return the extension of the template files to look up."""
        # NOTE: we use .html.j2 so that the HTMLExporter can find the template
        return ".html.j2"

    @default("template_name")
    def _template_name_default(self):
        """Return the name of the default template."""
        return "webpdf"

    disable_sandbox = Bool(
        False,
        help="""
        Disable chromium security sandbox when converting to PDF.

        WARNING: This could cause arbitrary code execution in specific circumstances,
        where JS in your notebook can execute serverside code! Please use with
        caution.

        ``https://github.com/puppeteer/puppeteer/blob/main@%7B2020-12-14T17:22:24Z%7D/docs/troubleshooting.md#setting-up-chrome-linux-sandbox``
        has more information.

        This is required for webpdf to work inside most container environments.
        """,
    ).tag(config=True)

    browser_args = List(
        Unicode(),
        help="""
        Additional arguments to pass to the browser rendering to PDF.

        These arguments will be passed directly to the browser launch method
        and can be used to customize browser behavior beyond the default settings.
        """,
    ).tag(config=True)

    def run_playwright(self, html):
        """Render an HTML document to PDF with headless Chromium via playwright.

        The HTML is written to a temporary ``.html`` file, loaded in Chromium
        with print media emulation, and printed to PDF. The temporary file is
        removed afterwards, whether or not rendering succeeded.

        Parameters
        ----------
        html : str
            The HTML document to render. It is encoded as UTF-8 before being
            written to disk.

        Returns
        -------
        The value returned by playwright's ``page.pdf()`` (the PDF contents).

        Raises
        ------
        RuntimeError
            If playwright is not installed, or if Chromium could not be
            launched.
        subprocess.CalledProcessError
            If ``allow_chromium_download`` is set and
            ``playwright install chromium`` exits with a non-zero status.
        """

        async def main(temp_file):
            """Run main playwright script."""

            try:
                from playwright.async_api import (  # type: ignore[import-not-found]  # noqa: PLC0415
                    async_playwright,
                )
            except ModuleNotFoundError as e:
                msg = (
                    "Playwright is not installed to support Web PDF conversion. "
                    "Please install `nbconvert[webpdf]` to enable."
                )
                raise RuntimeError(msg) from e

            if self.allow_chromium_download:
                cmd = [sys.executable, "-m", "playwright", "install", "chromium"]
                subprocess.check_call(cmd)  # noqa: S603

            # Work on a copy: appending to ``self.browser_args`` directly would
            # permanently add ``--no-sandbox`` to the configured trait value,
            # once more on every conversion.
            args = list(self.browser_args)
            if self.disable_sandbox:
                args.append("--no-sandbox")

            playwright = await async_playwright().start()
            try:
                try:
                    # Leave SIGINT/SIGTERM/SIGHUP handling to the host process.
                    browser = await playwright.chromium.launch(
                        handle_sigint=False,
                        handle_sigterm=False,
                        handle_sighup=False,
                        args=args,
                    )
                except Exception as e:
                    msg = (
                        "No suitable chromium executable found on the system. "
                        "Please use '--allow-chromium-download' to allow downloading one, "
                        "or install it using `playwright install chromium`."
                    )
                    raise RuntimeError(msg) from e

                try:
                    page = await browser.new_page()
                    await page.emulate_media(media="print")
                    # NOTE(review): fixed 100 ms pause before navigation; its
                    # purpose is not evident from this file.
                    await page.wait_for_timeout(100)
                    await page.goto(f"file://{temp_file.name}", wait_until="networkidle")
                    # Give JavaScript output time to finish rendering.
                    await page.wait_for_timeout(self.page_render_timeout)

                    pdf_params = {"print_background": True}
                    if not self.paginate:
                        # Pad each dimension by one pixel: floating point
                        # precision errors can otherwise make the printed PDF
                        # spill over onto a new page by a pixel fraction.
                        dimensions = await page.evaluate(
                            """() => {
                                const rect = document.body.getBoundingClientRect();
                                return {
                                    width: Math.ceil(rect.width) + 1,
                                    height: Math.ceil(rect.height) + 1,
                                }
                            }"""
                        )
                        width = dimensions["width"]
                        height = dimensions["height"]
                        # 200 inches is the maximum size for Adobe Acrobat Reader.
                        pdf_params.update(
                            {
                                "width": min(width, 200 * 72),
                                "height": min(height, 200 * 72),
                            }
                        )
                    return await page.pdf(**pdf_params)
                finally:
                    # Close the browser even when rendering fails.
                    await browser.close()
            finally:
                # Always stop the playwright driver, on success or failure.
                await playwright.stop()

        # Create a temporary file to pass the HTML code to Chromium:
        # Unfortunately, tempfile on Windows does not allow for an already open
        # file to be opened by a separate process. So we must close it first
        # before calling Chromium. We also specify delete=False to ensure the
        # file is not deleted after closing (the default behavior).
        temp_file = tempfile.NamedTemporaryFile(  # noqa: SIM115
            suffix=".html", delete=False
        )
        try:
            with temp_file:
                temp_file.write(html.encode("utf-8"))
            # ``asyncio.run`` cannot be used in a thread that already has a
            # running event loop, so the coroutine is executed in a worker
            # thread. The ``with`` block shuts the executor down afterwards.
            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
                pdf_data = pool.submit(asyncio.run, main(temp_file)).result()
        finally:
            # Ensure the file is deleted even if writing it or playwright
            # raises an exception.
            os.unlink(temp_file.name)
        return pdf_data

    def from_notebook_node(self, nb, resources=None, **kw):
        """Convert from a notebook node.

        The notebook is first converted to HTML by the parent exporter, and
        the resulting HTML is then rendered to PDF with :meth:`run_playwright`.

        Returns
        -------
        tuple
            ``(pdf_data, resources)``: the rendered PDF and the resources
            returned by the parent exporter.
        """
        html, resources = super().from_notebook_node(nb, resources=resources, **kw)

        self.log.info("Building PDF")
        pdf_data = self.run_playwright(html)
        self.log.info("PDF successfully created")

        return pdf_data, resources