/src/mupdf/source/html/html-doc.c
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (C) 2004-2022 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "mupdf/fitz.h" |
24 | | #include "html-imp.h" |
25 | | |
26 | | #include <string.h> |
27 | | #include <math.h> |
28 | | |
/*
 * Indices into a four-entry margin array (used with page_margin in
 * htdoc_bound_page). Presumably top, right, bottom, left.
 */
enum
{
	T = 0,
	R = 1,
	B = 2,
	L = 3
};

/* Input flavours understood by htdoc_open_document_with_buffer. */
enum
{
	FORMAT_FB2 = 0,
	FORMAT_XHTML = 1,
	FORMAT_HTML5 = 2,
	FORMAT_MOBI = 3
};
32 | | |
33 | | typedef struct |
34 | | { |
35 | | fz_document super; |
36 | | fz_archive *zip; |
37 | | fz_html_font_set *set; |
38 | | fz_html *html; |
39 | | fz_outline *outline; |
40 | | } html_document; |
41 | | |
42 | | typedef struct |
43 | | { |
44 | | fz_page super; |
45 | | html_document *doc; |
46 | | int number; |
47 | | } html_page; |
48 | | |
49 | | static void |
50 | | htdoc_drop_document(fz_context *ctx, fz_document *doc_) |
51 | 0 | { |
52 | 0 | html_document *doc = (html_document*)doc_; |
53 | 0 | fz_drop_archive(ctx, doc->zip); |
54 | 0 | fz_drop_html(ctx, doc->html); |
55 | 0 | fz_drop_html_font_set(ctx, doc->set); |
56 | 0 | fz_drop_outline(ctx, doc->outline); |
57 | 0 | } |
58 | | |
59 | | static fz_link_dest |
60 | | htdoc_resolve_link(fz_context *ctx, fz_document *doc_, const char *dest) |
61 | 0 | { |
62 | 0 | html_document *doc = (html_document*)doc_; |
63 | 0 | const char *s = strchr(dest, '#'); |
64 | 0 | if (s && s[1] != 0) |
65 | 0 | { |
66 | 0 | float y = fz_find_html_target(ctx, doc->html, s+1); |
67 | 0 | if (y >= 0) |
68 | 0 | { |
69 | 0 | int page = y / doc->html->page_h; |
70 | 0 | return fz_make_link_dest_xyz(0, page, 0, y - page * doc->html->page_h, 0); |
71 | 0 | } |
72 | 0 | } |
73 | | |
74 | 0 | return fz_make_link_dest_none(); |
75 | 0 | } |
76 | | |
77 | | static int |
78 | | htdoc_count_pages(fz_context *ctx, fz_document *doc_, int chapter) |
79 | 0 | { |
80 | 0 | html_document *doc = (html_document*)doc_; |
81 | 0 | if (doc->html->tree.root->s.layout.b > 0) |
82 | 0 | return ceilf(doc->html->tree.root->s.layout.b / doc->html->page_h); |
83 | 0 | return 1; |
84 | 0 | } |
85 | | |
86 | | static void |
87 | | htdoc_update_outline(fz_context *ctx, fz_document *doc, fz_outline *node) |
88 | 0 | { |
89 | 0 | while (node) |
90 | 0 | { |
91 | 0 | fz_link_dest dest = htdoc_resolve_link(ctx, doc, node->uri); |
92 | 0 | node->page = dest.loc; |
93 | 0 | node->x = dest.x; |
94 | 0 | node->y = dest.y; |
95 | 0 | htdoc_update_outline(ctx, doc, node->down); |
96 | 0 | node = node->next; |
97 | 0 | } |
98 | 0 | } |
99 | | |
100 | | static void |
101 | | htdoc_layout(fz_context *ctx, fz_document *doc_, float w, float h, float em) |
102 | 0 | { |
103 | 0 | html_document *doc = (html_document*)doc_; |
104 | |
|
105 | 0 | fz_layout_html(ctx, doc->html, w, h, em); |
106 | |
|
107 | 0 | htdoc_update_outline(ctx, doc_, doc->outline); |
108 | 0 | } |
109 | | |
110 | | static void |
111 | | htdoc_drop_page(fz_context *ctx, fz_page *page_) |
112 | 0 | { |
113 | 0 | } |
114 | | |
115 | | static fz_rect |
116 | | htdoc_bound_page(fz_context *ctx, fz_page *page_) |
117 | 0 | { |
118 | 0 | html_page *page = (html_page*)page_; |
119 | 0 | html_document *doc = page->doc; |
120 | 0 | fz_rect bbox; |
121 | 0 | bbox.x0 = 0; |
122 | 0 | bbox.y0 = 0; |
123 | 0 | bbox.x1 = doc->html->page_w + doc->html->page_margin[L] + doc->html->page_margin[R]; |
124 | 0 | bbox.y1 = doc->html->page_h + doc->html->page_margin[T] + doc->html->page_margin[B]; |
125 | 0 | return bbox; |
126 | 0 | } |
127 | | |
128 | | static void |
129 | | htdoc_run_page(fz_context *ctx, fz_page *page_, fz_device *dev, fz_matrix ctm, fz_cookie *cookie) |
130 | 0 | { |
131 | 0 | html_page *page = (html_page*)page_; |
132 | 0 | html_document *doc = page->doc; |
133 | 0 | fz_draw_html(ctx, dev, ctm, doc->html, page->number); |
134 | 0 | } |
135 | | |
136 | | static fz_link * |
137 | | htdoc_load_links(fz_context *ctx, fz_page *page_) |
138 | 0 | { |
139 | 0 | html_page *page = (html_page*)page_; |
140 | 0 | html_document *doc = page->doc; |
141 | 0 | return fz_load_html_links(ctx, doc->html, page->number, ""); |
142 | 0 | } |
143 | | |
144 | | static fz_bookmark |
145 | | htdoc_make_bookmark(fz_context *ctx, fz_document *doc_, fz_location loc) |
146 | 0 | { |
147 | 0 | html_document *doc = (html_document*)doc_; |
148 | 0 | return fz_make_html_bookmark(ctx, doc->html, loc.page); |
149 | 0 | } |
150 | | |
151 | | static fz_location |
152 | | htdoc_lookup_bookmark(fz_context *ctx, fz_document *doc_, fz_bookmark mark) |
153 | 0 | { |
154 | 0 | html_document *doc = (html_document*)doc_; |
155 | 0 | return fz_make_location(0, fz_lookup_html_bookmark(ctx, doc->html, mark)); |
156 | 0 | } |
157 | | |
158 | | static fz_page * |
159 | | htdoc_load_page(fz_context *ctx, fz_document *doc_, int chapter, int number) |
160 | 0 | { |
161 | 0 | html_document *doc = (html_document*)doc_; |
162 | 0 | html_page *page = fz_new_derived_page(ctx, html_page, doc_); |
163 | 0 | page->super.bound_page = htdoc_bound_page; |
164 | 0 | page->super.run_page_contents = htdoc_run_page; |
165 | 0 | page->super.load_links = htdoc_load_links; |
166 | 0 | page->super.drop_page = htdoc_drop_page; |
167 | 0 | page->doc = doc; |
168 | 0 | page->number = number; |
169 | 0 | return (fz_page*)page; |
170 | 0 | } |
171 | | |
172 | | static fz_outline * |
173 | | htdoc_load_outline(fz_context *ctx, fz_document *doc_) |
174 | 0 | { |
175 | 0 | html_document *doc = (html_document*)doc_; |
176 | 0 | return fz_keep_outline(ctx, doc->outline); |
177 | 0 | } |
178 | | |
179 | | static int |
180 | | fb2doc_lookup_metadata(fz_context *ctx, fz_document *doc_, const char *key, char *buf, int size) |
181 | 0 | { |
182 | 0 | html_document *doc = (html_document*)doc_; |
183 | 0 | if (!strcmp(key, FZ_META_FORMAT)) |
184 | 0 | return 1 + (int)fz_strlcpy(buf, "FictionBook2", size); |
185 | 0 | if (!strcmp(key, FZ_META_INFO_TITLE) && doc->html->title) |
186 | 0 | return 1 + (int)fz_strlcpy(buf, doc->html->title, size); |
187 | 0 | return -1; |
188 | 0 | } |
189 | | |
190 | | static int |
191 | | htdoc_lookup_metadata(fz_context *ctx, fz_document *doc_, const char *key, char *buf, int size) |
192 | 0 | { |
193 | 0 | html_document *doc = (html_document*)doc_; |
194 | 0 | if (!strcmp(key, FZ_META_FORMAT)) |
195 | 0 | return (int)fz_strlcpy(buf, "HTML5", size); |
196 | 0 | if (!strcmp(key, FZ_META_INFO_TITLE) && doc->html->title) |
197 | 0 | return 1 + (int)fz_strlcpy(buf, doc->html->title, size); |
198 | 0 | return -1; |
199 | 0 | } |
200 | | |
201 | | static int |
202 | | xhtdoc_lookup_metadata(fz_context *ctx, fz_document *doc_, const char *key, char *buf, int size) |
203 | 0 | { |
204 | 0 | html_document *doc = (html_document*)doc_; |
205 | 0 | if (!strcmp(key, FZ_META_FORMAT)) |
206 | 0 | return (int)fz_strlcpy(buf, "XHTML", size); |
207 | 0 | if (!strcmp(key, FZ_META_INFO_TITLE) && doc->html->title) |
208 | 0 | return 1 + (int)fz_strlcpy(buf, doc->html->title, size); |
209 | 0 | return -1; |
210 | 0 | } |
211 | | |
212 | | static int |
213 | | mobi_lookup_metadata(fz_context *ctx, fz_document *doc_, const char *key, char *buf, int size) |
214 | 0 | { |
215 | 0 | html_document *doc = (html_document*)doc_; |
216 | 0 | if (!strcmp(key, FZ_META_FORMAT)) |
217 | 0 | return (int)fz_strlcpy(buf, "MOBI", size); |
218 | 0 | if (!strcmp(key, FZ_META_INFO_TITLE) && doc->html->title) |
219 | 0 | return 1 + (int)fz_strlcpy(buf, doc->html->title, size); |
220 | 0 | return -1; |
221 | 0 | } |
222 | | |
223 | | static fz_document * |
224 | | htdoc_open_document_with_buffer(fz_context *ctx, fz_archive *zip, fz_buffer *buf, int format) |
225 | 0 | { |
226 | 0 | html_document *doc = fz_new_derived_document(ctx, html_document); |
227 | 0 | doc->super.drop_document = htdoc_drop_document; |
228 | 0 | doc->super.layout = htdoc_layout; |
229 | 0 | doc->super.load_outline = htdoc_load_outline; |
230 | 0 | doc->super.resolve_link_dest = htdoc_resolve_link; |
231 | 0 | doc->super.make_bookmark = htdoc_make_bookmark; |
232 | 0 | doc->super.lookup_bookmark = htdoc_lookup_bookmark; |
233 | 0 | doc->super.count_pages = htdoc_count_pages; |
234 | 0 | doc->super.load_page = htdoc_load_page; |
235 | 0 | switch (format) |
236 | 0 | { |
237 | 0 | case FORMAT_FB2: doc->super.lookup_metadata = fb2doc_lookup_metadata; break; |
238 | 0 | case FORMAT_HTML5: doc->super.lookup_metadata = htdoc_lookup_metadata; break; |
239 | 0 | case FORMAT_XHTML: doc->super.lookup_metadata = xhtdoc_lookup_metadata; break; |
240 | 0 | case FORMAT_MOBI: doc->super.lookup_metadata = mobi_lookup_metadata; break; |
241 | 0 | } |
242 | 0 | doc->super.is_reflowable = 1; |
243 | |
|
244 | 0 | fz_try(ctx) |
245 | 0 | { |
246 | 0 | doc->zip = zip; |
247 | 0 | doc->set = fz_new_html_font_set(ctx); |
248 | 0 | switch (format) |
249 | 0 | { |
250 | 0 | case FORMAT_FB2: doc->html = fz_parse_fb2(ctx, doc->set, doc->zip, ".", buf, fz_user_css(ctx)); break; |
251 | 0 | case FORMAT_HTML5: doc->html = fz_parse_html5(ctx, doc->set, doc->zip, ".", buf, fz_user_css(ctx)); break; |
252 | 0 | case FORMAT_XHTML: doc->html = fz_parse_xhtml(ctx, doc->set, doc->zip, ".", buf, fz_user_css(ctx)); break; |
253 | 0 | case FORMAT_MOBI: doc->html = fz_parse_mobi(ctx, doc->set, doc->zip, ".", buf, fz_user_css(ctx)); break; |
254 | 0 | } |
255 | 0 | doc->outline = fz_load_html_outline(ctx, doc->html); |
256 | 0 | } |
257 | 0 | fz_always(ctx) |
258 | 0 | fz_drop_buffer(ctx, buf); |
259 | 0 | fz_catch(ctx) |
260 | 0 | { |
261 | 0 | fz_drop_document(ctx, &doc->super); |
262 | 0 | fz_rethrow(ctx); |
263 | 0 | } |
264 | | |
265 | 0 | return (fz_document*)doc; |
266 | 0 | } |
267 | | |
268 | | static fz_document * |
269 | | htdoc_open_document_with_stream(fz_context *ctx, fz_stream *file) |
270 | 0 | { |
271 | 0 | return htdoc_open_document_with_buffer(ctx, fz_open_directory(ctx, "."), fz_read_all(ctx, file, 0), FORMAT_HTML5); |
272 | 0 | } |
273 | | |
274 | | static fz_document * |
275 | | htdoc_open_document(fz_context *ctx, const char *filename) |
276 | 0 | { |
277 | 0 | char dirname[2048]; |
278 | 0 | fz_dirname(dirname, filename, sizeof dirname); |
279 | 0 | return htdoc_open_document_with_buffer(ctx, fz_open_directory(ctx, dirname), fz_read_file(ctx, filename), FORMAT_HTML5); |
280 | 0 | } |
281 | | |
282 | | static const char *htdoc_extensions[] = |
283 | | { |
284 | | "htm", |
285 | | "html", |
286 | | NULL |
287 | | }; |
288 | | |
289 | | static const char *htdoc_mimetypes[] = |
290 | | { |
291 | | "text/html", |
292 | | NULL |
293 | | }; |
294 | | |
295 | | fz_document_handler html_document_handler = |
296 | | { |
297 | | NULL, |
298 | | htdoc_open_document, |
299 | | htdoc_open_document_with_stream, |
300 | | htdoc_extensions, |
301 | | htdoc_mimetypes, |
302 | | NULL, |
303 | | NULL, |
304 | | }; |
305 | | |
306 | | static fz_document * |
307 | | xhtdoc_open_document_with_stream(fz_context *ctx, fz_stream *file) |
308 | 0 | { |
309 | 0 | return htdoc_open_document_with_buffer(ctx, fz_open_directory(ctx, "."), fz_read_all(ctx, file, 0), FORMAT_XHTML); |
310 | 0 | } |
311 | | |
312 | | static fz_document * |
313 | | xhtdoc_open_document(fz_context *ctx, const char *filename) |
314 | 0 | { |
315 | 0 | char dirname[2048]; |
316 | 0 | fz_dirname(dirname, filename, sizeof dirname); |
317 | 0 | return htdoc_open_document_with_buffer(ctx, fz_open_directory(ctx, dirname), fz_read_file(ctx, filename), FORMAT_XHTML); |
318 | 0 | } |
319 | | |
320 | | static const char *xhtdoc_extensions[] = |
321 | | { |
322 | | "xhtml", |
323 | | NULL |
324 | | }; |
325 | | |
326 | | static const char *xhtdoc_mimetypes[] = |
327 | | { |
328 | | "application/xhtml+xml", |
329 | | NULL |
330 | | }; |
331 | | |
332 | | fz_document_handler xhtml_document_handler = |
333 | | { |
334 | | NULL, |
335 | | xhtdoc_open_document, |
336 | | xhtdoc_open_document_with_stream, |
337 | | xhtdoc_extensions, |
338 | | xhtdoc_mimetypes |
339 | | }; |
340 | | |
341 | | static fz_document * |
342 | | fb2doc_open_document_with_stream(fz_context *ctx, fz_stream *file) |
343 | 0 | { |
344 | 0 | return htdoc_open_document_with_buffer(ctx, NULL, fz_read_all(ctx, file, 0), FORMAT_FB2); |
345 | 0 | } |
346 | | |
347 | | static fz_document * |
348 | | fb2doc_open_document(fz_context *ctx, const char *filename) |
349 | 0 | { |
350 | 0 | return htdoc_open_document_with_buffer(ctx, NULL, fz_read_file(ctx, filename), FORMAT_FB2); |
351 | 0 | } |
352 | | |
353 | | static const char *fb2doc_extensions[] = |
354 | | { |
355 | | "fb2", |
356 | | "xml", |
357 | | NULL |
358 | | }; |
359 | | |
360 | | static const char *fb2doc_mimetypes[] = |
361 | | { |
362 | | "application/x-fictionbook", |
363 | | "application/xml", |
364 | | "text/xml", |
365 | | NULL |
366 | | }; |
367 | | |
368 | | fz_document_handler fb2_document_handler = |
369 | | { |
370 | | NULL, |
371 | | fb2doc_open_document, |
372 | | fb2doc_open_document_with_stream, |
373 | | fb2doc_extensions, |
374 | | fb2doc_mimetypes |
375 | | }; |
376 | | |
377 | | static fz_document * |
378 | | mobi_open_document_with_buffer(fz_context *ctx, fz_buffer *mobi) |
379 | 0 | { |
380 | 0 | fz_archive *zip = NULL; |
381 | 0 | fz_buffer *html; |
382 | 0 | fz_var(zip); |
383 | 0 | fz_try(ctx) |
384 | 0 | { |
385 | 0 | zip = fz_extract_html_from_mobi(ctx, mobi); |
386 | 0 | html = fz_read_archive_entry(ctx, zip, "index.html"); |
387 | 0 | } |
388 | 0 | fz_always(ctx) |
389 | 0 | { |
390 | 0 | fz_drop_buffer(ctx, mobi); |
391 | 0 | } |
392 | 0 | fz_catch(ctx) |
393 | 0 | { |
394 | 0 | fz_drop_archive(ctx, zip); |
395 | 0 | fz_rethrow(ctx); |
396 | 0 | } |
397 | 0 | return htdoc_open_document_with_buffer(ctx, zip, html, FORMAT_MOBI); |
398 | 0 | } |
399 | | |
400 | | static fz_document * |
401 | | mobi_open_document_with_stream(fz_context *ctx, fz_stream *file) |
402 | 0 | { |
403 | 0 | return mobi_open_document_with_buffer(ctx, fz_read_all(ctx, file, 0)); |
404 | 0 | } |
405 | | |
406 | | static fz_document * |
407 | | mobi_open_document(fz_context *ctx, const char *filename) |
408 | 0 | { |
409 | 0 | return mobi_open_document_with_buffer(ctx, fz_read_file(ctx, filename)); |
410 | 0 | } |
411 | | |
412 | | static const char *mobi_extensions[] = |
413 | | { |
414 | | "mobi", |
415 | | "prc", |
416 | | "pdb", |
417 | | NULL |
418 | | }; |
419 | | |
420 | | static const char *mobi_mimetypes[] = |
421 | | { |
422 | | "application/x-mobipocket-ebook", |
423 | | NULL |
424 | | }; |
425 | | |
426 | | fz_document_handler mobi_document_handler = |
427 | | { |
428 | | NULL, |
429 | | mobi_open_document, |
430 | | mobi_open_document_with_stream, |
431 | | mobi_extensions, |
432 | | mobi_mimetypes |
433 | | }; |