/src/mupdf/source/html/mobi.c
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (C) 2004-2022 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "mupdf/fitz.h" |
24 | | #include "html-imp.h" |
25 | | |
26 | | #include <string.h> |
27 | | |
/* Payload flavour: "BOOKMOBI" databases are treated as HTML, "TEXtREAd" as plain text. */
enum
{
	FORMAT_HTML = 1,
	FORMAT_TEXT = 2
};

/* Values of the compression field in the PalmDOC header.
 * Only NONE and PALMDOC are accepted by the reader in this file. */
enum
{
	COMPRESSION_NONE = 1,
	COMPRESSION_PALMDOC = 2,
	COMPRESSION_HUFF_CDIC = 17480
};

/* Text encoding identifiers read from the optional MOBI header.
 * Latin-1 is assumed when no MOBI header is present. */
enum
{
	TEXT_ENCODING_LATIN_1 = 0,
	TEXT_ENCODING_1252 = 1252,
	TEXT_ENCODING_UTF8 = 65001
};
38 | | |
39 | | static void |
40 | | skip_bytes(fz_context *ctx, fz_stream *stm, size_t len) |
41 | 0 | { |
42 | 0 | size_t skipped = fz_skip(ctx, stm, len); |
43 | 0 | if (skipped < len) |
44 | 0 | fz_throw(ctx, FZ_ERROR_FORMAT, "premature end in data"); |
45 | 0 | } |
46 | | |
47 | | static void |
48 | | mobi_read_text_none(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t size) |
49 | 0 | { |
50 | 0 | unsigned char buf[4096]; |
51 | 0 | size_t n; |
52 | 0 | if (size > 4096) |
53 | 0 | fz_throw(ctx, FZ_ERROR_FORMAT, "text block too large"); |
54 | 0 | n = fz_read(ctx, stm, buf, size); |
55 | 0 | if (n < size) |
56 | 0 | fz_warn(ctx, "premature end in mobi uncompressed text data"); |
57 | 0 | fz_append_data(ctx, out, buf, n); |
58 | 0 | } |
59 | | |
60 | | static void |
61 | | mobi_read_text_palmdoc(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t size) |
62 | 0 | { |
63 | | // https://wiki.mobileread.com/wiki/PalmDOC |
64 | 0 | size_t end = out->len + size; |
65 | 0 | while (out->len < end) |
66 | 0 | { |
67 | 0 | int c = fz_read_byte(ctx, stm); |
68 | 0 | if (c == EOF) |
69 | 0 | break; |
70 | 0 | if (c >= 0x01 && c <= 0x08) |
71 | 0 | { |
72 | 0 | unsigned char buf[8]; |
73 | 0 | size_t n = fz_read(ctx, stm, buf, c); |
74 | 0 | fz_append_data(ctx, out, buf, n); |
75 | 0 | if (n < (size_t) c) |
76 | 0 | break; |
77 | 0 | } |
78 | 0 | else if (c <= 0x7f) |
79 | 0 | { |
80 | 0 | fz_append_byte(ctx, out, c); |
81 | 0 | } |
82 | 0 | else if (c >= 0x80 && c <= 0xbf) |
83 | 0 | { |
84 | 0 | int cc, x, distance, length; |
85 | 0 | cc = fz_read_byte(ctx, stm); |
86 | 0 | if (cc == EOF) |
87 | 0 | break; |
88 | 0 | x = (c << 8) | cc; |
89 | 0 | distance = (x >> 3) & 0x7ff; |
90 | 0 | length = (x & 7) + 3; |
91 | 0 | if (distance > 0 && (size_t)distance <= out->len) |
92 | 0 | { |
93 | 0 | int i; |
94 | 0 | int p = (int)(out->len - distance); |
95 | 0 | for (i = 0; i < length; ++i) |
96 | 0 | fz_append_byte(ctx, out, out->data[p + i]); |
97 | 0 | } |
98 | 0 | } |
99 | 0 | else if (c >= 0xc0 && c <= 0xff) |
100 | 0 | { |
101 | 0 | fz_append_byte(ctx, out, ' '); |
102 | 0 | fz_append_byte(ctx, out, c ^ 0x80); |
103 | 0 | } |
104 | 0 | } |
105 | |
|
106 | 0 | if (out->len < end) |
107 | 0 | fz_warn(ctx, "premature end in mobi palmdoc data"); |
108 | 0 | } |
109 | | |
110 | | static uint32_t |
111 | | mobi_read_data(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t *offset, uint32_t total_count, int format) |
112 | 0 | { |
113 | | // https://wiki.mobileread.com/wiki/MOBI |
114 | 0 | uint32_t compression, text_length, record_count, text_encoding, i; |
115 | 0 | unsigned char buf[4]; |
116 | 0 | fz_range range = { 0 }; |
117 | 0 | fz_stream *rec = NULL; |
118 | 0 | size_t n; |
119 | |
|
120 | 0 | fz_var(rec); |
121 | |
|
122 | 0 | fz_try(ctx) |
123 | 0 | { |
124 | 0 | range.offset = offset[0]; |
125 | 0 | range.length = offset[1] - offset[0]; |
126 | 0 | rec = fz_open_range_filter(ctx, stm, &range, 1); |
127 | | |
128 | | // PalmDOC header |
129 | 0 | compression = fz_read_uint16(ctx, rec); |
130 | 0 | skip_bytes(ctx, rec, 2); |
131 | 0 | text_length = fz_read_uint32(ctx, rec); |
132 | 0 | record_count = fz_read_uint16(ctx, rec); |
133 | 0 | skip_bytes(ctx, rec, 2); |
134 | 0 | skip_bytes(ctx, rec, 2); // encryption |
135 | 0 | skip_bytes(ctx, rec, 2); |
136 | | |
137 | | // Optional MOBI header |
138 | 0 | text_encoding = TEXT_ENCODING_LATIN_1; |
139 | 0 | n = fz_read(ctx, rec, buf, 4); |
140 | 0 | if (n == 4 && !memcmp(buf, "MOBI", 4)) |
141 | 0 | { |
142 | 0 | skip_bytes(ctx, rec, 4); |
143 | 0 | skip_bytes(ctx, rec, 4); |
144 | 0 | text_encoding = fz_read_uint32(ctx, rec); |
145 | 0 | } |
146 | 0 | } |
147 | 0 | fz_always(ctx) |
148 | 0 | fz_drop_stream(ctx, rec); |
149 | 0 | fz_catch(ctx) |
150 | 0 | fz_rethrow(ctx); |
151 | | |
152 | 0 | if (compression != COMPRESSION_NONE && compression != COMPRESSION_PALMDOC) |
153 | 0 | fz_throw(ctx, FZ_ERROR_FORMAT, "unknown compression method"); |
154 | 0 | if (text_encoding != TEXT_ENCODING_LATIN_1 && |
155 | 0 | text_encoding != TEXT_ENCODING_1252 && |
156 | 0 | text_encoding != TEXT_ENCODING_UTF8) |
157 | 0 | fz_throw(ctx, FZ_ERROR_FORMAT, "unknown text encoding"); |
158 | | |
159 | 0 | for (i = 1; i <= record_count && i < total_count; ++i) |
160 | 0 | { |
161 | 0 | uint32_t remain = text_length - (uint32_t)out->len; |
162 | 0 | uint32_t size = remain < 4096 ? remain : 4096; |
163 | |
|
164 | 0 | fz_try(ctx) |
165 | 0 | { |
166 | 0 | range.offset = offset[i]; |
167 | 0 | range.length = offset[i + 1] - offset[i]; |
168 | 0 | rec = fz_open_range_filter(ctx, stm, &range, 1); |
169 | |
|
170 | 0 | if (compression == COMPRESSION_NONE) |
171 | 0 | mobi_read_text_none(ctx, out, rec, size); |
172 | 0 | else |
173 | 0 | mobi_read_text_palmdoc(ctx, out, rec, size); |
174 | 0 | } |
175 | 0 | fz_always(ctx) |
176 | 0 | fz_drop_stream(ctx, rec); |
177 | 0 | fz_catch(ctx) |
178 | 0 | fz_rethrow(ctx); |
179 | 0 | } |
180 | | |
181 | 0 | if (format == FORMAT_TEXT && out->len > 6) |
182 | 0 | { |
183 | 0 | if (!memcmp(out->data, "<html>", 6) || !memcmp(out->data, "<HTML>", 6)) |
184 | 0 | format = FORMAT_HTML; |
185 | 0 | } |
186 | |
|
187 | 0 | if (text_encoding != TEXT_ENCODING_UTF8 || format == FORMAT_TEXT) |
188 | 0 | { |
189 | 0 | unsigned char *p; |
190 | 0 | size_t i, n = fz_buffer_extract(ctx, out, &p); |
191 | 0 | fz_resize_buffer(ctx, out, 0); |
192 | 0 | if (format == FORMAT_TEXT) |
193 | 0 | fz_append_string(ctx, out, "<html><head><style>body{white-space:pre-wrap}</style></head><body>"); |
194 | 0 | for (i = 0; i < n; ++i) |
195 | 0 | { |
196 | 0 | int c = p[i]; |
197 | 0 | if (format == FORMAT_TEXT && (c == '<' || c == '>' || c == '&')) |
198 | 0 | { |
199 | 0 | if (c == '<') |
200 | 0 | fz_append_string(ctx, out, "<"); |
201 | 0 | else if (c == '>') |
202 | 0 | fz_append_string(ctx, out, ">"); |
203 | 0 | else if (c == '&') |
204 | 0 | fz_append_string(ctx, out, "&"); |
205 | 0 | } |
206 | 0 | else |
207 | 0 | { |
208 | 0 | switch (text_encoding) |
209 | 0 | { |
210 | 0 | case TEXT_ENCODING_UTF8: |
211 | 0 | fz_append_byte(ctx, out, c); |
212 | 0 | break; |
213 | 0 | case TEXT_ENCODING_LATIN_1: |
214 | 0 | fz_append_rune(ctx, out, c); |
215 | 0 | break; |
216 | 0 | case TEXT_ENCODING_1252: |
217 | 0 | fz_append_rune(ctx, out, fz_unicode_from_windows_1252[c]); |
218 | 0 | break; |
219 | 0 | } |
220 | 0 | } |
221 | 0 | } |
222 | 0 | if (format == FORMAT_TEXT) |
223 | 0 | fz_append_string(ctx, out, "</body></html>"); |
224 | 0 | fz_free(ctx, p); |
225 | 0 | } |
226 | | |
227 | 0 | return record_count; |
228 | 0 | } |
229 | | |
230 | | static void drop_tree_entry(fz_context *ctx, void *ent) |
231 | 0 | { |
232 | 0 | fz_drop_buffer(ctx, ent); |
233 | 0 | } |
234 | | |
235 | | fz_archive * |
236 | | fz_extract_html_from_mobi(fz_context *ctx, fz_buffer *mobi) |
237 | 0 | { |
238 | 0 | fz_stream *stm = NULL; |
239 | 0 | fz_buffer *buffer = NULL; |
240 | 0 | fz_tree *tree = NULL; |
241 | 0 | uint32_t *offsets = NULL; |
242 | 0 | char buf[32]; |
243 | 0 | uint32_t i, k, extra; |
244 | 0 | uint32_t recindex; |
245 | 0 | uint32_t minoffset, maxoffset; |
246 | 0 | int format = FORMAT_TEXT; |
247 | 0 | size_t n; |
248 | | |
249 | | // https://wiki.mobileread.com/wiki/PalmDOC |
250 | |
|
251 | 0 | fz_var(stm); |
252 | 0 | fz_var(buffer); |
253 | 0 | fz_var(offsets); |
254 | 0 | fz_var(tree); |
255 | |
|
256 | 0 | fz_try(ctx) |
257 | 0 | { |
258 | 0 | stm = fz_open_buffer(ctx, mobi); |
259 | |
|
260 | 0 | skip_bytes(ctx, stm, 32); // database name |
261 | 0 | skip_bytes(ctx, stm, 28); // database attributes, version, dates, etc |
262 | |
|
263 | 0 | n = fz_read(ctx, stm, (unsigned char *)buf, 8); // database type and creator |
264 | 0 | buf[8] = 0; |
265 | |
|
266 | 0 | if (n == 8 && !memcmp(buf, "BOOKMOBI", 8)) |
267 | 0 | format = FORMAT_HTML; |
268 | 0 | else if (n == 8 && !memcmp(buf, "TEXtREAd", 8)) |
269 | 0 | format = FORMAT_TEXT; |
270 | 0 | else if (n != 8) |
271 | 0 | fz_warn(ctx, "premature end in data"); |
272 | 0 | else |
273 | 0 | fz_warn(ctx, "Unknown MOBI/PRC format: %s.", buf); |
274 | |
|
275 | 0 | skip_bytes(ctx, stm, 8); // database internal fields |
276 | | |
277 | | // record info list count |
278 | 0 | n = fz_read_uint16(ctx, stm); |
279 | |
|
280 | 0 | minoffset = (uint32_t)fz_tell(ctx, stm) + n * 2 * sizeof (uint32_t) - 1; |
281 | 0 | maxoffset = (uint32_t)mobi->len; |
282 | | |
283 | | // record info list |
284 | 0 | offsets = fz_malloc_array(ctx, n + 1, uint32_t); |
285 | 0 | for (i = 0, k = 0; i < n; ++i) |
286 | 0 | { |
287 | 0 | uint32_t offset = fz_read_uint32(ctx, stm); |
288 | 0 | if (offset <= minoffset) |
289 | 0 | continue; |
290 | 0 | if (offset >= maxoffset) |
291 | 0 | continue; |
292 | 0 | offsets[k++] = offset; |
293 | 0 | skip_bytes(ctx, stm, 4); |
294 | 0 | minoffset = fz_mini(minoffset, offsets[i]); |
295 | 0 | } |
296 | 0 | offsets[k] = (uint32_t)mobi->len; |
297 | | |
298 | | // adjust n in case some out of bound offsets were skipped |
299 | 0 | n = k; |
300 | 0 | if (n == 0) |
301 | 0 | fz_throw(ctx, FZ_ERROR_FORMAT, "no mobi records to read"); |
302 | | |
303 | | // decompress text data |
304 | 0 | buffer = fz_new_buffer(ctx, 128 << 10); |
305 | 0 | extra = mobi_read_data(ctx, buffer, stm, offsets, n, format); |
306 | 0 | fz_terminate_buffer(ctx, buffer); |
307 | |
|
308 | 0 | #ifndef NDEBUG |
309 | 0 | if (fz_atoi(getenv("FZ_DEBUG_MOBI"))) |
310 | 0 | fz_save_buffer(ctx, buffer, "mobi.xhtml"); |
311 | 0 | #endif |
312 | |
|
313 | 0 | tree = fz_tree_insert(ctx, tree, "index.html", buffer); |
314 | 0 | buffer = NULL; |
315 | | |
316 | | // copy image data records into tree |
317 | 0 | recindex = 1; |
318 | 0 | for (i = extra; i < n; ++i) |
319 | 0 | { |
320 | 0 | uint32_t size = offsets[i+1] - offsets[i]; |
321 | 0 | if (size > 8) |
322 | 0 | { |
323 | 0 | unsigned char *data = mobi->data + offsets[i]; |
324 | 0 | if (fz_recognize_image_format(ctx, data)) |
325 | 0 | { |
326 | 0 | buffer = fz_new_buffer_from_copied_data(ctx, data, size); |
327 | 0 | fz_snprintf(buf, sizeof buf, "%05d", recindex); |
328 | 0 | tree = fz_tree_insert(ctx, tree, buf, buffer); |
329 | 0 | buffer = NULL; |
330 | 0 | recindex++; |
331 | 0 | } |
332 | 0 | } |
333 | 0 | } |
334 | 0 | } |
335 | 0 | fz_always(ctx) |
336 | 0 | { |
337 | 0 | fz_drop_stream(ctx, stm); |
338 | 0 | fz_free(ctx, offsets); |
339 | 0 | } |
340 | 0 | fz_catch(ctx) |
341 | 0 | { |
342 | 0 | fz_drop_buffer(ctx, buffer); |
343 | 0 | fz_drop_tree(ctx, tree, drop_tree_entry); |
344 | 0 | fz_rethrow(ctx); |
345 | 0 | } |
346 | | |
347 | 0 | return fz_new_tree_archive(ctx, tree); |
348 | 0 | } |