/src/mupdf/source/html/html-doc.c
Line | Count | Source |
1 | | // Copyright (C) 2004-2026 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "mupdf/fitz.h" |
24 | | #include "html-imp.h" |
25 | | |
26 | | #include <string.h> |
27 | | #include <math.h> |
28 | | |
/* Indices into the four-entry page_margin[] array of fz_html.
 * Presumably top, right, bottom, left: htdoc_bound_page adds L and R to the
 * page width and T and B to the page height. */
enum { T, R, B, L };
30 | | |
/* Document object shared by every HTML-derived format opened in this file
 * (HTML5, XHTML, FB2, MOBI). */
typedef struct
{
	fz_document super; /* base object; pointers are cast between fz_document and html_document */
	fz_archive *zip; /* kept reference to the archive handed to the parser */
	fz_html_font_set *set; /* font set created when the document is opened */
	fz_html *html; /* parsed content; (re)laid out by htdoc_layout */
	fz_outline *outline; /* outline loaded from the parsed HTML at open time */
	const fz_htdoc_format_t *format; /* format descriptor; never dropped by this file */
} html_document;
40 | | |
/* One page of a laid-out HTML document. */
typedef struct
{
	fz_page super; /* base page object */
	html_document *doc; /* owning document; assigned without taking a reference in htdoc_load_page */
	int number; /* page number passed to the draw and link-loading helpers */
} html_page;
47 | | |
48 | | static void |
49 | | htdoc_drop_document(fz_context *ctx, fz_document *doc_) |
50 | 0 | { |
51 | 0 | html_document *doc = (html_document*)doc_; |
52 | 0 | fz_drop_archive(ctx, doc->zip); |
53 | 0 | fz_drop_html(ctx, doc->html); |
54 | 0 | fz_drop_html_font_set(ctx, doc->set); |
55 | 0 | fz_drop_outline(ctx, doc->outline); |
56 | 0 | } |
57 | | |
58 | | static fz_link_dest |
59 | | htdoc_resolve_link(fz_context *ctx, fz_document *doc_, const char *dest) |
60 | 0 | { |
61 | 0 | html_document *doc = (html_document*)doc_; |
62 | 0 | const char *s = strchr(dest, '#'); |
63 | 0 | if (s && s[1] != 0) |
64 | 0 | { |
65 | 0 | float y = fz_find_html_target(ctx, doc->html, s+1); |
66 | 0 | if (y >= 0) |
67 | 0 | { |
68 | 0 | int page = y / doc->html->page_h; |
69 | 0 | return fz_make_link_dest_xyz(0, page, 0, y - page * doc->html->page_h, 0); |
70 | 0 | } |
71 | 0 | } |
72 | | |
73 | 0 | return fz_make_link_dest_none(); |
74 | 0 | } |
75 | | |
76 | | static int |
77 | | htdoc_count_pages(fz_context *ctx, fz_document *doc_, int chapter) |
78 | 0 | { |
79 | 0 | html_document *doc = (html_document*)doc_; |
80 | 0 | if (doc->html->tree.root->s.layout.b > 0) |
81 | 0 | return ceilf(doc->html->tree.root->s.layout.b / doc->html->page_h); |
82 | 0 | return 1; |
83 | 0 | } |
84 | | |
85 | | static void |
86 | | htdoc_update_outline(fz_context *ctx, fz_document *doc, fz_outline *node) |
87 | 0 | { |
88 | 0 | while (node) |
89 | 0 | { |
90 | 0 | fz_link_dest dest = htdoc_resolve_link(ctx, doc, node->uri); |
91 | 0 | node->page = dest.loc; |
92 | 0 | node->x = dest.x; |
93 | 0 | node->y = dest.y; |
94 | 0 | htdoc_update_outline(ctx, doc, node->down); |
95 | 0 | node = node->next; |
96 | 0 | } |
97 | 0 | } |
98 | | |
99 | | static void |
100 | | htdoc_layout(fz_context *ctx, fz_document *doc_, float w, float h, float em) |
101 | 0 | { |
102 | 0 | html_document *doc = (html_document*)doc_; |
103 | |
|
104 | 0 | fz_layout_html(ctx, doc->html, w, h, em); |
105 | |
|
106 | 0 | htdoc_update_outline(ctx, doc_, doc->outline); |
107 | 0 | } |
108 | | |
109 | | static void |
110 | | htdoc_drop_page(fz_context *ctx, fz_page *page_) |
111 | 0 | { |
112 | 0 | } |
113 | | |
114 | | static fz_rect |
115 | | htdoc_bound_page(fz_context *ctx, fz_page *page_, fz_box_type box) |
116 | 0 | { |
117 | 0 | html_page *page = (html_page*)page_; |
118 | 0 | html_document *doc = page->doc; |
119 | 0 | fz_rect bbox; |
120 | 0 | bbox.x0 = 0; |
121 | 0 | bbox.y0 = 0; |
122 | 0 | bbox.x1 = doc->html->page_w + doc->html->page_margin[L] + doc->html->page_margin[R]; |
123 | 0 | bbox.y1 = doc->html->page_h + doc->html->page_margin[T] + doc->html->page_margin[B]; |
124 | 0 | return bbox; |
125 | 0 | } |
126 | | |
127 | | static void |
128 | | htdoc_run_page(fz_context *ctx, fz_page *page_, fz_device *dev, fz_matrix ctm, fz_cookie *cookie) |
129 | 0 | { |
130 | 0 | html_page *page = (html_page*)page_; |
131 | 0 | html_document *doc = page->doc; |
132 | 0 | fz_draw_html(ctx, dev, ctm, doc->html, page->number); |
133 | 0 | } |
134 | | |
135 | | static fz_link * |
136 | | htdoc_load_links(fz_context *ctx, fz_page *page_) |
137 | 0 | { |
138 | 0 | html_page *page = (html_page*)page_; |
139 | 0 | html_document *doc = page->doc; |
140 | 0 | return fz_load_html_links(ctx, doc->html, page->number, ""); |
141 | 0 | } |
142 | | |
143 | | static fz_bookmark |
144 | | htdoc_make_bookmark(fz_context *ctx, fz_document *doc_, fz_location loc) |
145 | 0 | { |
146 | 0 | html_document *doc = (html_document*)doc_; |
147 | 0 | return fz_make_html_bookmark(ctx, doc->html, loc.page); |
148 | 0 | } |
149 | | |
150 | | static fz_location |
151 | | htdoc_lookup_bookmark(fz_context *ctx, fz_document *doc_, fz_bookmark mark) |
152 | 0 | { |
153 | 0 | html_document *doc = (html_document*)doc_; |
154 | 0 | return fz_make_location(0, fz_lookup_html_bookmark(ctx, doc->html, mark)); |
155 | 0 | } |
156 | | |
157 | | static fz_page * |
158 | | htdoc_load_page(fz_context *ctx, fz_document *doc_, int chapter, int number) |
159 | 0 | { |
160 | 0 | html_document *doc = (html_document*)doc_; |
161 | 0 | html_page *page = fz_new_derived_page(ctx, html_page, doc_); |
162 | 0 | page->super.bound_page = htdoc_bound_page; |
163 | 0 | page->super.run_page_contents = htdoc_run_page; |
164 | 0 | page->super.load_links = htdoc_load_links; |
165 | 0 | page->super.drop_page = htdoc_drop_page; |
166 | 0 | page->doc = doc; |
167 | 0 | page->number = number; |
168 | 0 | return (fz_page*)page; |
169 | 0 | } |
170 | | |
171 | | static fz_outline * |
172 | | htdoc_load_outline(fz_context *ctx, fz_document *doc_) |
173 | 0 | { |
174 | 0 | html_document *doc = (html_document*)doc_; |
175 | 0 | return fz_keep_outline(ctx, doc->outline); |
176 | 0 | } |
177 | | |
178 | | static int |
179 | | htdoc_lookup_metadata(fz_context *ctx, fz_document *doc_, const char *key, char *buf, size_t size) |
180 | 0 | { |
181 | 0 | html_document *doc = (html_document *)doc_; |
182 | 0 | if (!strcmp(key, FZ_META_FORMAT)) |
183 | 0 | return 1 + (int)fz_strlcpy(buf, doc->format->format_name, size); |
184 | 0 | if (!strcmp(key, FZ_META_INFO_TITLE) && doc->html->title) |
185 | 0 | return 1 + (int)fz_strlcpy(buf, doc->html->title, size); |
186 | 0 | return -1; |
187 | 0 | } |
188 | | |
189 | | static fz_html * |
190 | | generic_parse(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buffer_in, const char *user_css, const fz_htdoc_format_t *format) |
191 | 0 | { |
192 | 0 | fz_buffer *buffer_html = NULL; |
193 | 0 | fz_html *html = NULL; |
194 | |
|
195 | 0 | fz_var(buffer_html); |
196 | |
|
197 | 0 | fz_try(ctx) |
198 | 0 | { |
199 | 0 | if (format->convert_to_html) |
200 | 0 | buffer_html = format->convert_to_html(ctx, set, buffer_in, zip, user_css); |
201 | 0 | else |
202 | 0 | buffer_html = fz_keep_buffer(ctx, buffer_in); |
203 | 0 | html = fz_parse_html(ctx, set, zip, base_uri, buffer_html, user_css, format->try_xml, format->try_html5, format->patch_mobi); |
204 | 0 | } |
205 | 0 | fz_always(ctx) |
206 | 0 | { |
207 | 0 | fz_drop_buffer(ctx, buffer_html); |
208 | 0 | } |
209 | 0 | fz_catch(ctx) |
210 | 0 | { |
211 | 0 | fz_drop_html(ctx, html); |
212 | 0 | fz_rethrow(ctx); |
213 | 0 | } |
214 | 0 | return html; |
215 | 0 | } |
216 | | |
217 | | fz_document * |
218 | | fz_htdoc_open_document_with_buffer(fz_context *ctx, fz_archive *dir, fz_buffer *buf, const fz_htdoc_format_t *format) |
219 | 0 | { |
220 | 0 | html_document *doc = NULL; |
221 | |
|
222 | 0 | fz_var(doc); |
223 | 0 | fz_var(dir); |
224 | |
|
225 | 0 | fz_try(ctx) |
226 | 0 | { |
227 | 0 | doc = fz_new_derived_document(ctx, html_document); |
228 | 0 | doc->super.drop_document = htdoc_drop_document; |
229 | 0 | doc->super.layout = htdoc_layout; |
230 | 0 | doc->super.load_outline = htdoc_load_outline; |
231 | 0 | doc->super.resolve_link_dest = htdoc_resolve_link; |
232 | 0 | doc->super.make_bookmark = htdoc_make_bookmark; |
233 | 0 | doc->super.lookup_bookmark = htdoc_lookup_bookmark; |
234 | 0 | doc->super.count_pages = htdoc_count_pages; |
235 | 0 | doc->super.load_page = htdoc_load_page; |
236 | 0 | doc->super.lookup_metadata = htdoc_lookup_metadata; |
237 | 0 | doc->super.is_reflowable = 1; |
238 | |
|
239 | 0 | doc->zip = fz_keep_archive(ctx, dir); |
240 | 0 | doc->format = format; |
241 | 0 | doc->set = fz_new_html_font_set(ctx); |
242 | 0 | doc->html = generic_parse(ctx, doc->set, doc->zip, ".", buf, fz_user_css(ctx), format); |
243 | 0 | doc->outline = fz_load_html_outline(ctx, doc->html); |
244 | 0 | } |
245 | 0 | fz_always(ctx) |
246 | 0 | fz_drop_buffer(ctx, buf); |
247 | 0 | fz_catch(ctx) |
248 | 0 | { |
249 | 0 | fz_drop_document(ctx, &doc->super); |
250 | 0 | fz_rethrow(ctx); |
251 | 0 | } |
252 | | |
253 | 0 | return (fz_document*)doc; |
254 | 0 | } |
255 | | |
256 | | fz_document * |
257 | | fz_htdoc_open_document_with_stream_and_dir(fz_context *ctx, fz_stream *stm, fz_archive *dir, const fz_htdoc_format_t *format) |
258 | 0 | { |
259 | 0 | fz_buffer *buf = NULL; |
260 | |
|
261 | 0 | if (stm) |
262 | 0 | buf = fz_read_all(ctx, stm, 0); |
263 | |
|
264 | 0 | return fz_htdoc_open_document_with_buffer(ctx, dir, buf, format); |
265 | 0 | } |
266 | | |
267 | | /* Variant specific functions */ |
268 | | |
269 | | /* Generic HTML document handler */ |
270 | | |
/* Return 1 if c is a space, tab, line feed, form feed or carriage return,
 * and 0 otherwise. */
static int isws(int c)
{
	switch (c)
	{
	case 9:  /* tab */
	case 10: /* line feed */
	case 12: /* form feed */
	case 13: /* carriage return */
	case 32: /* space */
		return 1;
	default:
		return 0;
	}
}
275 | | |
/* Shared content sniffer for the HTML and XHTML handlers.
 *
 * Reads up to 4096 bytes from stream, rewinds the stream to offset 0, and
 * runs a small state machine over the data looking for "<!doctype html" or
 * "<html", skipping whitespace, "<?...>" sections and "<!-- ... -->"
 * comments that come first.
 *
 * xhtml selects which handler is asking: non-zero for the XHTML handler,
 * zero for the HTML one.
 *
 * Returns 0 when the content is not recognised (including a NULL or empty
 * stream), otherwise 75 for the handler that suits the content best and 25
 * for the other one. *hstate and *free_state, when supplied, are always set
 * to NULL. handler and dir are not used. */
static int recognize_html_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **hstate, fz_document_recognize_state_free_fn **free_state, int xhtml)
{
	uint8_t buffer[4096];
	/* i: read position in buffer; n: number of valid bytes;
	 * m: per-state counter (characters matched so far, or a run length). */
	size_t i, n, m;
	enum {
		state_top,                      /* outside any tag */
		state_open,                     /* just seen '<' */
		state_pling,                    /* seen "<!" */
		state_query,                    /* inside "<? ... >" */
		state_maybe_doctype,            /* matching "octype" after "<!d" */
		state_maybe_doctype_ws,         /* whitespace after "<!doctype" */
		state_maybe_doctype_html,       /* matching "tml" after "<!doctype h" */
		state_maybe_doctype_html_xhtml, /* rest of the doctype: look for "xhtml" or '>' */
		state_maybe_comment,            /* seen "<!-" */
		state_maybe_html,               /* matching "tml" after "<h" */
		state_maybe_html_xhtml,         /* rest of the html tag: look for "xhtml" or '>' */
		state_comment                   /* inside "<!-- ... -->" */
	};
	int state = state_top;
	/* Input encoding: 0 = bytes taken one at a time, 1 = UTF-16 big endian,
	 * 2 = UTF-16 little endian. */
	int type = 0;

	if (hstate)
		*hstate = NULL;
	if (free_state)
		*free_state = NULL;

	if (stream == NULL)
		return 0;

	/* Simple state machine. Search for "<!doctype html" or "<html" in the first
	 * 4K of the file, allowing for comments and whitespace and case insensitivity. */

	n = fz_read(ctx, stream, buffer, sizeof(buffer));
	fz_seek(ctx, stream, 0, SEEK_SET);
	if (n == 0)
		return 0;

	i = 0;
	if (n >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF)
	{
		/* UTF-8 encoded BOM. Just skip it. */
		i = 3;
	}
	else if (n >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF)
	{
		/* UTF-16, big endian. */
		type = 1;
		i = 2;
		/* Round n down to an even count so the loop never reads half a unit. */
		n &= ~1;
	}
	else if (n >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE)
	{
		/* UTF-16, little endian. */
		i = 2;
		type = 2;
		/* Round n down to an even count so the loop never reads half a unit. */
		n &= ~1;
	}

	while (i < n)
	{
		int c;

		/* Fetch the next character according to the detected encoding. */
		switch (type)
		{
		case 0: /* UTF-8 */
			c = buffer[i++];
			break;
		case 1: /* UTF-16 - big endian */
			c = buffer[i++] << 8;
			c |= buffer[i++];
			break;
		case 2: /* UTF-16 - little endian */
			c = buffer[i++];
			c |= buffer[i++] << 8;
			break;
		}

		switch (state)
		{
		case state_top:
			if (isws(c))
				continue; /* whitespace */
			if (c == '<')
				state = state_open;
			else
				return 0; /* Non whitespace found at the top level prior to a known tag. Fail. */
			break;
		case state_open:
			if (isws(c))
				continue; /* whitespace */
			if (c == '!')
				state = state_pling;
			else if (c == '?')
				state = state_query;
			else if (c == 'h' || c == 'H')
				state = state_maybe_html;
			else
				return 0; /* Not an acceptable opening tag. */
			/* Every state that reads m is entered through here, so this is
			 * where m first gets a value. */
			m = 0;
			break;
		case state_query:
			/* Skip everything up to the first '>'. */
			if (c == '>')
				state = state_top;
			break;
		case state_pling:
			if (isws(c))
				continue; /* whitespace */
			else if (c == '-')
				state = state_maybe_comment;
			else if (c == 'd' || c == 'D')
				state = state_maybe_doctype;
			else
				return 0; /* Not an acceptable opening tag. */
			break;
		case state_maybe_comment:
			if (c == '-')
				state = state_comment;
			else
				return 0; /* Not an acceptable opening tag. */
			break;
		case state_comment:
			/* m counts consecutive '-' characters; a '>' after at least two
			 * of them closes the comment. */
			if (c == '-')
			{
				m++;
			}
			else if (c == '>' && m >= 2)
			{
				state = state_top;
			}
			else
				m = 0;
			break;
		case state_maybe_doctype:
			/* Case-insensitive match of the six letters following "<!d". */
			if (c == "octype"[m] || c == "OCTYPE"[m])
			{
				m++;
				if (m == 6)
				{
					state = state_maybe_doctype_ws;
					m = 0;
				}
			}
			else
				return 0; /* Not an acceptable opening tag. */
			break;
		case state_maybe_doctype_ws:
			/* Require at least one whitespace character (counted in m)
			 * before the 'h' of "html". */
			if (isws(c))
				m++;
			else if (m > 0 && (c == 'h' || c == 'H'))
			{
				state = state_maybe_doctype_html;
				m = 0;
			}
			else
				return 0; /* Not an acceptable opening tag. */
			break;
		case state_maybe_doctype_html:
			if (c == "tml"[m] || c == "TML"[m])
			{
				m++;
				if (m == 3)
				{
					state = state_maybe_doctype_html_xhtml;
					m = 0;
				}
			}
			else
				return 0; /* Not an acceptable opening tag. */
			break;
		case state_maybe_doctype_html_xhtml:
			if (c == '>')
			{
				/* Not xhtml - the xhtml agent can handle this at a pinch (so 25),
				 * but we'd rather the html one did (75). */
				return xhtml ? 25 : 75;
			}
			/* Fold ASCII upper case to lower case before matching "xhtml". */
			if (c >= 'A' && c <= 'Z')
				c += 'a'-'A';
			if (c == "xhtml"[m])
			{
				m++;
				if (m == 5)
				{
					/* xhtml - the xhtml agent would be better (75) than the html
					 * agent (25). */
					return xhtml ? 75 : 25;
				}
			}
			else
				m = 0;
			break;
		case state_maybe_html:
			if (c == "tml"[m] || c == "TML"[m])
			{
				m++;
				if (m == 3)
				{
					state = state_maybe_html_xhtml;
					m = 0;
				}
			}
			else
				return 0; /* Not an acceptable opening tag. */
			break;
		case state_maybe_html_xhtml:
			if (c == '>')
			{
				/* Not xhtml - the xhtml agent can handle this at a pinch (so 25),
				 * but we'd rather the html one did (75). */
				return xhtml ? 25 : 75;
			}
			/* Fold ASCII upper case to lower case before matching "xhtml". */
			if (c >= 'A' && c <= 'Z')
				c += 'a'-'A';
			if (c == "xhtml"[m])
			{
				m++;
				if (m == 5)
				{
					/* xhtml - the xhtml agent would be better (75) than the html
					 * agent (25). */
					return xhtml ? 75 : 25;
				}
			}
			else
				m = 0;
			break;
		}
	}

	/* Ran out of data without reaching a decision. */
	return 0;
}
507 | | |
508 | | int htdoc_recognize_html_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **hstate, fz_document_recognize_state_free_fn **free_state) |
509 | 12 | { |
510 | 12 | return recognize_html_content(ctx, handler, stream, dir, hstate, free_state, 0); |
511 | 12 | } |
512 | | |
/* Format descriptor for plain HTML.
 * NOTE(review): the initialisers are presumably format_name,
 * convert_to_html, try_xml, try_html5, patch_mobi in that order (those are
 * the members this file reads) - confirm against the fz_htdoc_format_t
 * declaration. */
static const fz_htdoc_format_t fz_htdoc_html5 =
{
	"HTML5",
	NULL,
	0, 1, 0
};
519 | | |
520 | | static fz_document * |
521 | | htdoc_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state) |
522 | 0 | { |
523 | 0 | return fz_htdoc_open_document_with_stream_and_dir(ctx, file, dir, &fz_htdoc_html5); |
524 | 0 | } |
525 | | |
/* File extensions claimed by the HTML handler (NULL-terminated list). */
static const char *htdoc_extensions[] =
{
	"htm",
	"html",
	NULL
};

/* MIME types claimed by the HTML handler (NULL-terminated list). */
static const char *htdoc_mimetypes[] =
{
	"text/html",
	NULL
};

/* Handler record for HTML documents.
 * NOTE(review): positional initialiser; the meaning of the leading NULL and
 * the trailing 1 depends on the fz_document_handler declaration, which is
 * not visible here - confirm. */
fz_document_handler html_document_handler =
{
	NULL,
	htdoc_open_document,
	htdoc_extensions,
	htdoc_mimetypes,
	htdoc_recognize_html_content,
	1
};
548 | | |
549 | | /* XHTML document handler */ |
550 | | |
/* Format descriptor for XHTML. Differs from the HTML5 descriptor only in
 * the first of the three integer flags.
 * NOTE(review): presumably that flag is try_xml - confirm against the
 * fz_htdoc_format_t declaration. */
static const fz_htdoc_format_t fz_htdoc_xhtml =
{
	"XHTML",
	NULL,
	1, 1, 0
};
557 | | |
558 | | static fz_document * |
559 | | xhtdoc_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state) |
560 | 0 | { |
561 | 0 | return fz_htdoc_open_document_with_stream_and_dir(ctx, file, dir, &fz_htdoc_xhtml); |
562 | 0 | } |
563 | | |
564 | | int xhtdoc_recognize_xhtml_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **hstate, fz_document_recognize_state_free_fn **free_state) |
565 | 12 | { |
566 | 12 | return recognize_html_content(ctx, handler, stream, dir, hstate, free_state, 1); |
567 | 12 | } |
568 | | |
/* File extensions claimed by the XHTML handler (NULL-terminated list). */
static const char *xhtdoc_extensions[] =
{
	"xhtml",
	NULL
};

/* MIME types claimed by the XHTML handler (NULL-terminated list). */
static const char *xhtdoc_mimetypes[] =
{
	"application/xhtml+xml",
	NULL
};

/* Handler record for XHTML documents.
 * NOTE(review): positional initialiser; the meaning of the leading NULL and
 * the trailing 1 depends on the fz_document_handler declaration, which is
 * not visible here - confirm. */
fz_document_handler xhtml_document_handler =
{
	NULL,
	xhtdoc_open_document,
	xhtdoc_extensions,
	xhtdoc_mimetypes,
	xhtdoc_recognize_xhtml_content,
	1
};
590 | | |
591 | | /* FB2 document handler */ |
592 | | |
/* Format descriptor for FictionBook2.
 * NOTE(review): the three integer flags are presumably try_xml, try_html5,
 * patch_mobi in that order - confirm against the fz_htdoc_format_t
 * declaration. */
static const fz_htdoc_format_t fz_htdoc_fb2 =
{
	"FictionBook2",
	NULL,
	1, 0, 0
};
599 | | |
600 | | static fz_document * |
601 | | fb2doc_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state) |
602 | 0 | { |
603 | 0 | return fz_htdoc_open_document_with_stream_and_dir(ctx, file, dir, &fz_htdoc_fb2); |
604 | 0 | } |
605 | | |
606 | | static int |
607 | | fb2doc_recognize_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **state, fz_document_recognize_state_free_fn **free_state) |
608 | 12 | { |
609 | 12 | const char *match = "<FictionBook"; |
610 | 12 | int pos = 0; |
611 | 12 | int n = 4096; |
612 | 12 | int c; |
613 | | |
614 | 12 | if (state) |
615 | 12 | *state = NULL; |
616 | 12 | if (free_state) |
617 | 12 | *free_state = NULL; |
618 | | |
619 | 12 | if (stream == NULL) |
620 | 0 | return 0; |
621 | | |
622 | 12 | do |
623 | 49.1k | { |
624 | 49.1k | c = fz_read_byte(ctx, stream); |
625 | 49.1k | if (c == EOF) |
626 | 0 | return 0; |
627 | 49.1k | if (c == match[pos]) |
628 | 206 | { |
629 | 206 | pos++; |
630 | 206 | if (pos == 12) |
631 | 0 | return 100; |
632 | 206 | } |
633 | 48.9k | else |
634 | 48.9k | { |
635 | | /* Restart matching, but recheck c against the start. */ |
636 | 48.9k | pos = (c == match[0]); |
637 | 48.9k | } |
638 | 49.1k | } |
639 | 49.1k | while (--n > 0); |
640 | | |
641 | 12 | return 0; |
642 | 12 | } |
643 | | |
/* File extensions claimed by the FB2 handler (NULL-terminated list). */
static const char *fb2doc_extensions[] =
{
	"fb2",
	"xml",
	NULL
};

/* MIME types claimed by the FB2 handler (NULL-terminated list). */
static const char *fb2doc_mimetypes[] =
{
	"application/x-fictionbook",
	"application/xml",
	"text/xml",
	NULL
};

/* Handler record for FB2 documents. Only five members are initialised, so
 * any remaining members are zero.
 * NOTE(review): positional initialiser - confirm member order against the
 * fz_document_handler declaration. */
fz_document_handler fb2_document_handler =
{
	NULL,
	fb2doc_open_document,
	fb2doc_extensions,
	fb2doc_mimetypes,
	fb2doc_recognize_content
};
667 | | |
668 | | /* Mobi document handler */ |
669 | | |
/* Format descriptor for MOBI; the only descriptor in this file with all
 * three integer flags set.
 * NOTE(review): the flags are presumably try_xml, try_html5, patch_mobi in
 * that order - confirm against the fz_htdoc_format_t declaration. */
static const fz_htdoc_format_t fz_htdoc_mobi =
{
	"MOBI",
	NULL,
	1, 1, 1
};
676 | | |
677 | | static fz_document * |
678 | | mobi_open_document_with_buffer(fz_context *ctx, fz_buffer *mobi) |
679 | 0 | { |
680 | 0 | fz_archive *dir = NULL; |
681 | 0 | fz_buffer *html; |
682 | 0 | fz_document *doc; |
683 | 0 | fz_var(dir); |
684 | 0 | fz_try(ctx) |
685 | 0 | { |
686 | 0 | dir = fz_extract_html_from_mobi(ctx, mobi); |
687 | 0 | html = fz_read_archive_entry(ctx, dir, "index.html"); |
688 | 0 | doc = fz_htdoc_open_document_with_buffer(ctx, dir, html, &fz_htdoc_mobi); |
689 | 0 | } |
690 | 0 | fz_always(ctx) |
691 | 0 | { |
692 | 0 | fz_drop_buffer(ctx, mobi); |
693 | 0 | fz_drop_archive(ctx, dir); |
694 | 0 | } |
695 | 0 | fz_catch(ctx) |
696 | 0 | { |
697 | 0 | fz_rethrow(ctx); |
698 | 0 | } |
699 | 0 | return doc; |
700 | 0 | } |
701 | | |
702 | | static int |
703 | | mobi_recognize_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **state, fz_document_recognize_state_free_fn **free_state) |
704 | 12 | { |
705 | 12 | char text[8]; |
706 | | |
707 | 12 | if (state) |
708 | 12 | *state = NULL; |
709 | 12 | if (free_state) |
710 | 12 | *free_state = NULL; |
711 | | |
712 | 12 | if (stream == NULL) |
713 | 0 | return 0; |
714 | | |
715 | 12 | fz_seek(ctx, stream, 32 + 28, SEEK_SET); |
716 | 12 | if (fz_read(ctx, stream, (unsigned char *)text, 8) != 8) |
717 | 0 | return 0; |
718 | 12 | if (memcmp(text, "BOOKMOBI", 8) == 0) |
719 | 0 | return 100; |
720 | 12 | if (memcmp(text, "TEXtREAd", 8) == 0) |
721 | 0 | return 100; |
722 | | |
723 | 12 | return 0; |
724 | 12 | } |
725 | | |
726 | | static fz_document * |
727 | | mobi_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state) |
728 | 0 | { |
729 | 0 | return mobi_open_document_with_buffer(ctx, fz_read_all(ctx, file, 0)); |
730 | 0 | } |
731 | | |
/* File extensions claimed by the MOBI handler (NULL-terminated list). */
static const char *mobi_extensions[] =
{
	"mobi",
	"prc",
	"pdb",
	NULL
};

/* MIME types claimed by the MOBI handler (NULL-terminated list). */
static const char *mobi_mimetypes[] =
{
	"application/x-mobipocket-ebook",
	NULL
};

/* Handler record for MOBI documents. Only five members are initialised, so
 * any remaining members are zero.
 * NOTE(review): positional initialiser - confirm member order against the
 * fz_document_handler declaration. */
fz_document_handler mobi_document_handler =
{
	NULL,
	mobi_open_document,
	mobi_extensions,
	mobi_mimetypes,
	mobi_recognize_content
};