Coverage Report

Created: 2024-05-20 06:23

/src/mupdf/source/html/mobi.c
Line
Count
Source (jump to first uncovered line)
1
// Copyright (C) 2004-2022 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "mupdf/fitz.h"
24
#include "html-imp.h"
25
26
#include <string.h>
27
28
0
// How the decoded text is treated: FORMAT_HTML text is passed through as
// markup, FORMAT_TEXT is escaped and wrapped in a minimal HTML document.
enum
{
	FORMAT_HTML = 1,
	FORMAT_TEXT = 2
};

// Values of the compression field in the PalmDOC header.
// Only NONE and PALMDOC are decoded in this file; HUFF_CDIC is listed for
// reference and is rejected as an unknown compression method.
enum
{
	COMPRESSION_NONE = 1,
	COMPRESSION_PALMDOC = 2,
	COMPRESSION_HUFF_CDIC = 17480
};

// Values of the text encoding field in the optional MOBI header.
// Latin-1 is what the reader falls back to when no MOBI header is present.
enum
{
	TEXT_ENCODING_LATIN_1 = 0,
	TEXT_ENCODING_1252 = 1252,
	TEXT_ENCODING_UTF8 = 65001
};
38
39
static void
40
skip_bytes(fz_context *ctx, fz_stream *stm, size_t len)
41
0
{
42
0
  size_t skipped = fz_skip(ctx, stm, len);
43
0
  if (skipped < len)
44
0
    fz_throw(ctx, FZ_ERROR_FORMAT, "premature end in data");
45
0
}
46
47
static void
48
mobi_read_text_none(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t size)
49
0
{
50
0
  unsigned char buf[4096];
51
0
  size_t n;
52
0
  if (size > 4096)
53
0
    fz_throw(ctx, FZ_ERROR_FORMAT, "text block too large");
54
0
  n = fz_read(ctx, stm, buf, size);
55
0
  if (n < size)
56
0
    fz_warn(ctx, "premature end in mobi uncompressed text data");
57
0
  fz_append_data(ctx, out, buf, n);
58
0
}
59
60
static void
61
mobi_read_text_palmdoc(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t size)
62
0
{
63
  // https://wiki.mobileread.com/wiki/PalmDOC
64
0
  size_t end = out->len + size;
65
0
  while (out->len < end)
66
0
  {
67
0
    int c = fz_read_byte(ctx, stm);
68
0
    if (c == EOF)
69
0
      break;
70
0
    if (c >= 0x01 && c <= 0x08)
71
0
    {
72
0
      unsigned char buf[8];
73
0
      size_t n = fz_read(ctx, stm, buf, c);
74
0
      fz_append_data(ctx, out, buf, n);
75
0
      if (n < (size_t) c)
76
0
        break;
77
0
    }
78
0
    else if (c <= 0x7f)
79
0
    {
80
0
      fz_append_byte(ctx, out, c);
81
0
    }
82
0
    else if (c >= 0x80 && c <= 0xbf)
83
0
    {
84
0
      int cc, x, distance, length;
85
0
      cc = fz_read_byte(ctx, stm);
86
0
      if (cc == EOF)
87
0
        break;
88
0
      x = (c << 8) | cc;
89
0
      distance = (x >> 3) & 0x7ff;
90
0
      length = (x & 7) + 3;
91
0
      if (distance > 0 && (size_t)distance <= out->len)
92
0
      {
93
0
        int i;
94
0
        int p = (int)(out->len - distance);
95
0
        for (i = 0; i < length; ++i)
96
0
          fz_append_byte(ctx, out, out->data[p + i]);
97
0
      }
98
0
    }
99
0
    else if (c >= 0xc0 && c <= 0xff)
100
0
    {
101
0
      fz_append_byte(ctx, out, ' ');
102
0
      fz_append_byte(ctx, out, c ^ 0x80);
103
0
    }
104
0
  }
105
106
0
  if (out->len < end)
107
0
    fz_warn(ctx, "premature end in mobi palmdoc data");
108
0
}
109
110
static uint32_t
111
mobi_read_data(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t *offset, uint32_t total_count, int format)
112
0
{
113
  // https://wiki.mobileread.com/wiki/MOBI
114
0
  uint32_t compression, text_length, record_count, text_encoding, i;
115
0
  unsigned char buf[4];
116
0
  fz_range range = { 0 };
117
0
  fz_stream *rec = NULL;
118
0
  size_t n;
119
120
0
  fz_var(rec);
121
122
0
  fz_try(ctx)
123
0
  {
124
0
    range.offset = offset[0];
125
0
    range.length = offset[1] - offset[0];
126
0
    rec = fz_open_range_filter(ctx, stm, &range, 1);
127
128
    // PalmDOC header
129
0
    compression = fz_read_uint16(ctx, rec);
130
0
    skip_bytes(ctx, rec, 2);
131
0
    text_length = fz_read_uint32(ctx, rec);
132
0
    record_count = fz_read_uint16(ctx, rec);
133
0
    skip_bytes(ctx, rec, 2);
134
0
    skip_bytes(ctx, rec, 2); // encryption
135
0
    skip_bytes(ctx, rec, 2);
136
137
    // Optional MOBI header
138
0
    text_encoding = TEXT_ENCODING_LATIN_1;
139
0
    n = fz_read(ctx, rec, buf, 4);
140
0
    if (n == 4 && !memcmp(buf, "MOBI", 4))
141
0
    {
142
0
      skip_bytes(ctx, rec, 4);
143
0
      skip_bytes(ctx, rec, 4);
144
0
      text_encoding = fz_read_uint32(ctx, rec);
145
0
    }
146
0
  }
147
0
  fz_always(ctx)
148
0
    fz_drop_stream(ctx, rec);
149
0
  fz_catch(ctx)
150
0
    fz_rethrow(ctx);
151
152
0
  if (compression != COMPRESSION_NONE && compression != COMPRESSION_PALMDOC)
153
0
    fz_throw(ctx, FZ_ERROR_FORMAT, "unknown compression method");
154
0
  if (text_encoding != TEXT_ENCODING_LATIN_1 &&
155
0
    text_encoding != TEXT_ENCODING_1252 &&
156
0
    text_encoding != TEXT_ENCODING_UTF8)
157
0
    fz_throw(ctx, FZ_ERROR_FORMAT, "unknown text encoding");
158
159
0
  for (i = 1; i <= record_count && i < total_count; ++i)
160
0
  {
161
0
    uint32_t remain = text_length - (uint32_t)out->len;
162
0
    uint32_t size = remain < 4096 ? remain : 4096;
163
164
0
    fz_try(ctx)
165
0
    {
166
0
      range.offset = offset[i];
167
0
      range.length = offset[i + 1] - offset[i];
168
0
      rec = fz_open_range_filter(ctx, stm, &range, 1);
169
170
0
      if (compression == COMPRESSION_NONE)
171
0
        mobi_read_text_none(ctx, out, rec, size);
172
0
      else
173
0
        mobi_read_text_palmdoc(ctx, out, rec, size);
174
0
    }
175
0
    fz_always(ctx)
176
0
      fz_drop_stream(ctx, rec);
177
0
    fz_catch(ctx)
178
0
      fz_rethrow(ctx);
179
0
  }
180
181
0
  if (format == FORMAT_TEXT && out->len > 6)
182
0
  {
183
0
    if (!memcmp(out->data, "<html>", 6) || !memcmp(out->data, "<HTML>", 6))
184
0
      format = FORMAT_HTML;
185
0
  }
186
187
0
  if (text_encoding != TEXT_ENCODING_UTF8 || format == FORMAT_TEXT)
188
0
  {
189
0
    unsigned char *p;
190
0
    size_t i, n = fz_buffer_extract(ctx, out, &p);
191
0
    fz_resize_buffer(ctx, out, 0);
192
0
    if (format == FORMAT_TEXT)
193
0
      fz_append_string(ctx, out, "<html><head><style>body{white-space:pre-wrap}</style></head><body>");
194
0
    for (i = 0; i < n; ++i)
195
0
    {
196
0
      int c = p[i];
197
0
      if (format == FORMAT_TEXT && (c == '<' || c == '>' || c == '&'))
198
0
      {
199
0
        if (c == '<')
200
0
          fz_append_string(ctx, out, "&lt;");
201
0
        else if (c == '>')
202
0
          fz_append_string(ctx, out, "&gt;");
203
0
        else if (c == '&')
204
0
          fz_append_string(ctx, out, "&amp;");
205
0
      }
206
0
      else
207
0
      {
208
0
        switch (text_encoding)
209
0
        {
210
0
        case TEXT_ENCODING_UTF8:
211
0
          fz_append_byte(ctx, out, c);
212
0
          break;
213
0
        case TEXT_ENCODING_LATIN_1:
214
0
          fz_append_rune(ctx, out, c);
215
0
          break;
216
0
        case TEXT_ENCODING_1252:
217
0
          fz_append_rune(ctx, out, fz_unicode_from_windows_1252[c]);
218
0
          break;
219
0
        }
220
0
      }
221
0
    }
222
0
    if (format == FORMAT_TEXT)
223
0
      fz_append_string(ctx, out, "</body></html>");
224
0
    fz_free(ctx, p);
225
0
  }
226
227
0
  return record_count;
228
0
}
229
230
// fz_drop_tree callback: the values stored in the tree are buffers, so each
// entry is released with fz_drop_buffer.
static void
drop_tree_entry(fz_context *ctx, void *ent)
{
	fz_buffer *entry = ent;
	fz_drop_buffer(ctx, entry);
}
234
235
fz_archive *
236
fz_extract_html_from_mobi(fz_context *ctx, fz_buffer *mobi)
237
0
{
238
0
  fz_stream *stm = NULL;
239
0
  fz_buffer *buffer = NULL;
240
0
  fz_tree *tree = NULL;
241
0
  uint32_t *offsets = NULL;
242
0
  char buf[32];
243
0
  uint32_t i, k, extra;
244
0
  uint32_t recindex;
245
0
  uint32_t minoffset, maxoffset;
246
0
  int format = FORMAT_TEXT;
247
0
  size_t n;
248
249
  // https://wiki.mobileread.com/wiki/PalmDOC
250
251
0
  fz_var(stm);
252
0
  fz_var(buffer);
253
0
  fz_var(offsets);
254
0
  fz_var(tree);
255
256
0
  fz_try(ctx)
257
0
  {
258
0
    stm = fz_open_buffer(ctx, mobi);
259
260
0
    skip_bytes(ctx, stm, 32); // database name
261
0
    skip_bytes(ctx, stm, 28); // database attributes, version, dates, etc
262
263
0
    n = fz_read(ctx, stm, (unsigned char *)buf, 8); // database type and creator
264
0
    buf[8] = 0;
265
266
0
    if (n == 8 && !memcmp(buf, "BOOKMOBI", 8))
267
0
      format = FORMAT_HTML;
268
0
    else if (n == 8 && !memcmp(buf, "TEXtREAd", 8))
269
0
      format = FORMAT_TEXT;
270
0
    else if (n != 8)
271
0
      fz_warn(ctx, "premature end in data");
272
0
    else
273
0
      fz_warn(ctx, "Unknown MOBI/PRC format: %s.", buf);
274
275
0
    skip_bytes(ctx, stm, 8); // database internal fields
276
277
    // record info list count
278
0
    n = fz_read_uint16(ctx, stm);
279
280
0
    minoffset = (uint32_t)fz_tell(ctx, stm) + n * 2 * sizeof (uint32_t) - 1;
281
0
    maxoffset = (uint32_t)mobi->len;
282
283
    // record info list
284
0
    offsets = fz_malloc_array(ctx, n + 1, uint32_t);
285
0
    for (i = 0, k = 0; i < n; ++i)
286
0
    {
287
0
      uint32_t offset = fz_read_uint32(ctx, stm);
288
0
      if (offset <= minoffset)
289
0
        continue;
290
0
      if (offset >= maxoffset)
291
0
        continue;
292
0
      offsets[k++] = offset;
293
0
      skip_bytes(ctx, stm, 4);
294
0
      minoffset = fz_mini(minoffset, offsets[i]);
295
0
    }
296
0
    offsets[k] = (uint32_t)mobi->len;
297
298
    // adjust n in case some out of bound offsets were skipped
299
0
    n = k;
300
0
    if (n == 0)
301
0
      fz_throw(ctx, FZ_ERROR_FORMAT, "no mobi records to read");
302
303
    // decompress text data
304
0
    buffer = fz_new_buffer(ctx, 128 << 10);
305
0
    extra = mobi_read_data(ctx, buffer, stm, offsets, n, format);
306
0
    fz_terminate_buffer(ctx, buffer);
307
308
0
#ifndef NDEBUG
309
0
    if (fz_atoi(getenv("FZ_DEBUG_MOBI")))
310
0
      fz_save_buffer(ctx, buffer, "mobi.xhtml");
311
0
#endif
312
313
0
    tree = fz_tree_insert(ctx, tree, "index.html", buffer);
314
0
    buffer = NULL;
315
316
    // copy image data records into tree
317
0
    recindex = 1;
318
0
    for (i = extra; i < n; ++i)
319
0
    {
320
0
      uint32_t size = offsets[i+1] - offsets[i];
321
0
      if (size > 8)
322
0
      {
323
0
        unsigned char *data = mobi->data + offsets[i];
324
0
        if (fz_recognize_image_format(ctx, data))
325
0
        {
326
0
          buffer = fz_new_buffer_from_copied_data(ctx, data, size);
327
0
          fz_snprintf(buf, sizeof buf, "%05d", recindex);
328
0
          tree = fz_tree_insert(ctx, tree, buf, buffer);
329
0
          buffer = NULL;
330
0
          recindex++;
331
0
        }
332
0
      }
333
0
    }
334
0
  }
335
0
  fz_always(ctx)
336
0
  {
337
0
    fz_drop_stream(ctx, stm);
338
0
    fz_free(ctx, offsets);
339
0
  }
340
0
  fz_catch(ctx)
341
0
  {
342
0
    fz_drop_buffer(ctx, buffer);
343
0
    fz_drop_tree(ctx, tree, drop_tree_entry);
344
0
    fz_rethrow(ctx);
345
0
  }
346
347
0
  return fz_new_tree_archive(ctx, tree);
348
0
}