Coverage Report

Created: 2024-07-05 06:13

/src/mupdf/source/html/epub-doc.c
Line
Count
Source (jump to first uncovered line)
1
// Copyright (C) 2004-2024 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "mupdf/fitz.h"
24
#include "html-imp.h"
25
26
#include <string.h>
27
#include <math.h>
28
29
#include <zlib.h> /* for crc32 */
30
31
/* Indices into a four-entry margin array; this file uses them with
 * html->page_margin (L and R are added to the page width, T and B to the
 * page height), i.e. presumably top, right, bottom, left. */
enum { T, R, B, L };
32
33
typedef struct epub_chapter epub_chapter;
34
typedef struct epub_page epub_page;
35
36
/* Cache of per-chapter page counts, tagged with the layout and CSS
 * settings the counts were computed for. */
typedef struct
{
  /* Allocated length of pages_in_chapter. */
  int max_chapters;
  /* Number of leading entries of pages_in_chapter that are in use. */
  int num_chapters;
  /* Layout parameters the cached counts belong to. */
  float layout_w;
  float layout_h;
  float layout_em;
  /* CRC of the user CSS the cached counts belong to. */
  uint32_t css_sum;
  /* Value of fz_use_document_css() the cached counts belong to. */
  int use_doc_css;
  /* Page count for each chapter; -1 marks an entry as unknown. */
  int *pages_in_chapter;
} epub_accelerator;
47
48
typedef struct
49
{
50
  fz_document super;
51
  fz_archive *zip;
52
  fz_html_font_set *set;
53
  int count;
54
  epub_chapter *spine;
55
  fz_outline *outline;
56
  char *dc_title, *dc_creator;
57
  float layout_w, layout_h, layout_em;
58
  epub_accelerator *accel;
59
  uint32_t css_sum;
60
61
  /* A common pattern of use is for us to open a document,
62
   * load a page, draw it, drop it, load the next page,
63
   * draw it, drop it etc. This means that the HTML for
64
   * a chapter might get thrown away between the drop and
65
   * the the next load (if the chapter is large, and the
66
   * store size is low). Accordingly, we store a handle
67
   * to the most recently used html block here, thus
68
   * ensuring that the stored copy won't be evicted. */
69
  fz_html *most_recent_html;
70
} epub_document;
71
72
struct epub_chapter
73
{
74
  epub_document *doc;
75
  char *path;
76
  int number;
77
  epub_chapter *next;
78
};
79
80
struct epub_page
81
{
82
  fz_page super;
83
  epub_chapter *ch;
84
  int number;
85
  fz_html *html;
86
};
87
88
static uint32_t
89
user_css_sum(fz_context *ctx)
90
0
{
91
0
  uint32_t sum = 0;
92
0
  const char *css = fz_user_css(ctx);
93
0
  sum = crc32(0, NULL, 0);
94
0
  if (css)
95
0
    sum = crc32(sum, (Byte*)css, (int)strlen(css));
96
0
  return sum;
97
0
}
98
99
/* Placeholder whose address is stored as the value for every key in the
 * encryption info tree; lookups only test for a non-NULL result. */
static int dummy = 1;
100
101
struct encrypted {
102
  fz_archive super;
103
  fz_archive *chain;
104
  fz_tree *info;
105
};
106
107
/* Report whether the wrapped archive contains 'name'. The encryption
 * info is not consulted here. */
static int has_encrypted_entry(fz_context *ctx, fz_archive *arch_, const char *name)
{
  fz_archive *inner = ((struct encrypted *)arch_)->chain;

  return fz_has_archive_entry(ctx, inner, name);
}
112
113
/* Open 'name' from the wrapped archive as a stream, or return NULL if
 * the entry is listed as encrypted. */
static fz_stream *open_encrypted_entry(fz_context *ctx, fz_archive *arch_, const char *name)
{
  struct encrypted *self = (struct encrypted *)arch_;
  int is_encrypted = fz_tree_lookup(ctx, self->info, name) != NULL;

  return is_encrypted ? NULL : fz_open_archive_entry(ctx, self->chain, name);
}
120
121
/* Read 'name' from the wrapped archive into a buffer, or return NULL if
 * the entry is listed as encrypted. */
static fz_buffer *read_encrypted_entry(fz_context *ctx, fz_archive *arch_, const char *name)
{
  struct encrypted *self = (struct encrypted *)arch_;
  int is_encrypted = fz_tree_lookup(ctx, self->info, name) != NULL;

  return is_encrypted ? NULL : fz_read_archive_entry(ctx, self->chain, name);
}
128
129
/* Release what the wrapper owns: the encryption info tree and the
 * reference to the wrapped archive. */
static void drop_encrypted_archive(fz_context *ctx, fz_archive *arch_)
{
  struct encrypted *self = (struct encrypted *)arch_;

  fz_drop_tree(ctx, self->info, NULL);
  fz_drop_archive(ctx, self->chain);
}
135
136
/* Create an "encrypted" archive wrapping 'chain'. The wrapper stores
 * 'chain' and 'info' as given and releases both when it is dropped. */
static fz_archive *new_encrypted_archive(fz_context *ctx, fz_archive *chain, fz_tree *info)
{
  struct encrypted *wrapper = fz_new_derived_archive(ctx, NULL, struct encrypted);

  wrapper->chain = chain;
  wrapper->info = info;

  wrapper->super.format = "encrypted";
  wrapper->super.has_entry = has_encrypted_entry;
  wrapper->super.open_entry = open_encrypted_entry;
  wrapper->super.read_entry = read_encrypted_entry;
  wrapper->super.drop_archive = drop_encrypted_archive;

  return &wrapper->super;
}
151
152
static void
153
epub_parse_encryption(fz_context *ctx, epub_document *doc, fz_xml *root)
154
0
{
155
0
  fz_tree *info = NULL;
156
0
  fz_xml *edata;
157
158
0
  for (edata = fz_xml_find_down(root, "EncryptedData"); edata; edata = fz_xml_find_next(edata, "EncryptedData"))
159
0
  {
160
0
    fz_xml *cdata = fz_xml_find_down(edata, "CipherData");
161
0
    fz_xml *cref = fz_xml_find_down(cdata, "CipherReference");
162
0
    char *uri = fz_xml_att(cref, "URI");
163
0
    if (uri)
164
0
    {
165
      // TODO: Support reading EncryptedKey and EncryptionMethod to decrypt content.
166
0
      info = fz_tree_insert(ctx, info, uri, &dummy);
167
0
    }
168
0
  }
169
170
0
  if (info)
171
0
  {
172
0
    doc->zip = new_encrypted_archive(ctx, doc->zip, info);
173
0
  }
174
0
}
175
176
static fz_html *epub_get_laid_out_html(fz_context *ctx, epub_document *doc, epub_chapter *ch);
177
178
/* Number of pages needed for a laid-out chapter: the bottom of the root
 * box divided by the page height, rounded up. Always at least 1 when
 * the root box has no positive extent. */
static int count_laid_out_pages(fz_html *html)
{
  if (!(html->tree.root->s.layout.b > 0))
    return 1;

  return ceilf(html->tree.root->s.layout.b / html->page_h);
}
184
185
static void
186
invalidate_accelerator(fz_context *ctx, epub_accelerator *acc)
187
0
{
188
0
  int i;
189
190
0
  for (i = 0; i < acc->max_chapters; i++)
191
0
    acc->pages_in_chapter[i] = -1;
192
0
}
193
194
/* Return the number of pages in chapter 'ch'. A cached count from the
 * accelerator is used when available; otherwise the chapter is laid
 * out, which records the count as a side effect. */
static int count_chapter_pages(fz_context *ctx, epub_document *doc, epub_chapter *ch)
{
  epub_accelerator *accel = doc->accel;
  int doc_css_enabled = fz_use_document_css(ctx);
  int stale = (doc_css_enabled != accel->use_doc_css) || (doc->css_sum != accel->css_sum);

  /* Cached counts were computed under different CSS settings. */
  if (stale)
  {
    accel->use_doc_css = doc_css_enabled;
    accel->css_sum = doc->css_sum;
    invalidate_accelerator(ctx, accel);
  }

  /* No usable cached value: lay the chapter out to fill in the entry. */
  if (ch->number >= accel->num_chapters || accel->pages_in_chapter[ch->number] == -1)
    fz_drop_html(ctx, epub_get_laid_out_html(ctx, doc, ch));

  return accel->pages_in_chapter[ch->number];
}
212
213
static fz_link_dest
214
epub_resolve_link(fz_context *ctx, fz_document *doc_, const char *dest)
215
0
{
216
0
  epub_document *doc = (epub_document*)doc_;
217
0
  epub_chapter *ch;
218
0
  int i;
219
220
0
  const char *s = strchr(dest, '#');
221
0
  size_t n = s ? (size_t)(s - dest) : strlen(dest);
222
0
  if (s && s[1] == 0)
223
0
    s = NULL;
224
225
0
  for (i = 0, ch = doc->spine; ch; ++i, ch = ch->next)
226
0
  {
227
0
    if (!strncmp(ch->path, dest, n) && ch->path[n] == 0)
228
0
    {
229
0
      if (s)
230
0
      {
231
0
        float y;
232
0
        fz_html *html = epub_get_laid_out_html(ctx, doc, ch);
233
0
        int ph = html->page_h;
234
235
        /* Search for a matching fragment */
236
0
        y = fz_find_html_target(ctx, html, s+1);
237
0
        fz_drop_html(ctx, html);
238
0
        if (y >= 0)
239
0
        {
240
0
          int page = y / ph;
241
0
          return fz_make_link_dest_xyz(i, page, 0, y - page * ph, 0);
242
0
        }
243
0
        return fz_make_link_dest_none();
244
0
      }
245
0
      return fz_make_link_dest_xyz(i, 0, 0, 0, 0);
246
0
    }
247
0
  }
248
249
0
  return fz_make_link_dest_none();
250
0
}
251
252
static void
253
epub_layout(fz_context *ctx, fz_document *doc_, float w, float h, float em)
254
0
{
255
0
  epub_document *doc = (epub_document*)doc_;
256
0
  uint32_t css_sum = user_css_sum(ctx);
257
0
  int use_doc_css = fz_use_document_css(ctx);
258
259
0
  if (doc->layout_w == w && doc->layout_h == h && doc->layout_em == em && doc->css_sum == css_sum)
260
0
    return;
261
0
  doc->layout_w = w;
262
0
  doc->layout_h = h;
263
0
  doc->layout_em = em;
264
265
0
  if (doc->accel == NULL)
266
0
    return;
267
268
  /* When we load the saved accelerator, doc->accel
269
   * can be populated with different values than doc.
270
   * This is really useful as doc starts out with the
271
   * values being 0. If we've got the right values
272
   * already, then don't bin the data! */
273
0
  if (doc->accel->layout_w == w &&
274
0
    doc->accel->layout_h == h &&
275
0
    doc->accel->layout_em == em &&
276
0
    doc->accel->use_doc_css == use_doc_css &&
277
0
    doc->accel->css_sum == css_sum)
278
0
    return;
279
280
0
  doc->accel->layout_w = w;
281
0
  doc->accel->layout_h = h;
282
0
  doc->accel->layout_em = em;
283
0
  doc->accel->use_doc_css = use_doc_css;
284
0
  doc->accel->css_sum = css_sum;
285
0
  invalidate_accelerator(ctx, doc->accel);
286
0
}
287
288
static int
289
epub_count_chapters(fz_context *ctx, fz_document *doc_)
290
0
{
291
0
  epub_document *doc = (epub_document*)doc_;
292
0
  epub_chapter *ch;
293
0
  int count = 0;
294
0
  for (ch = doc->spine; ch; ch = ch->next)
295
0
    ++count;
296
0
  return count;
297
0
}
298
299
static int
300
epub_count_pages(fz_context *ctx, fz_document *doc_, int chapter)
301
0
{
302
0
  epub_document *doc = (epub_document*)doc_;
303
0
  epub_chapter *ch;
304
0
  int i;
305
0
  for (i = 0, ch = doc->spine; ch; ++i, ch = ch->next)
306
0
  {
307
0
    if (i == chapter)
308
0
    {
309
0
      return count_chapter_pages(ctx, doc, ch);
310
0
    }
311
0
  }
312
0
  return 0;
313
0
}
314
315
0
/* An accelerator file starts with three little-endian 32-bit words:
 * MAGIC_ACCELERATOR, MAGIC_ACCEL_EPUB (the bytes 'e','P','u','b' when
 * written little-endian) and ACCEL_VERSION. A file that does not match
 * all three is discarded by epub_load_accelerator. */
#define MAGIC_ACCELERATOR 0xacce1e7a
316
0
#define MAGIC_ACCEL_EPUB  0x62755065
317
0
#define ACCEL_VERSION     0x00010001
318
319
/* Populate doc->accel. If 'accel' is non-NULL, try to read saved page
 * counts from it; a file with the wrong header, a non-positive chapter
 * count or a read error is ignored (system errors are rethrown). When
 * nothing usable was read, an empty accelerator is created instead. */
static void epub_load_accelerator(fz_context *ctx, epub_document *doc, fz_stream *accel)
{
  epub_accelerator *acc = NULL;
  int make_new = (accel == NULL);

  fz_var(acc);

  if (accel)
  {
    /* Try to read the accelerator data. If we fail silently give up. */
    fz_try(ctx)
    {
      float w, h, em;
      uint32_t css_sum;
      int use_doc_css;
      int num_chapters;
      int i;

      /* Header: two magic words followed by the format version. */
      if (fz_read_int32_le(ctx, accel) != (int32_t)MAGIC_ACCELERATOR ||
        fz_read_int32_le(ctx, accel) != MAGIC_ACCEL_EPUB ||
        fz_read_int32_le(ctx, accel) != ACCEL_VERSION)
      {
        make_new = 1;
        break;
      }

      w = fz_read_float_le(ctx, accel);
      h = fz_read_float_le(ctx, accel);
      em = fz_read_float_le(ctx, accel);
      css_sum = fz_read_uint32_le(ctx, accel);
      use_doc_css = fz_read_int32_le(ctx, accel);

      num_chapters = fz_read_int32_le(ctx, accel);
      if (num_chapters <= 0)
      {
        make_new = 1;
        break;
      }

      acc = fz_malloc_struct(ctx, epub_accelerator);
      acc->pages_in_chapter = Memento_label(fz_malloc_array(ctx, num_chapters, int), "accel_pages_in_chapter");
      acc->num_chapters = num_chapters;
      acc->max_chapters = num_chapters;
      acc->layout_w = w;
      acc->layout_h = h;
      acc->layout_em = em;
      acc->css_sum = css_sum;
      acc->use_doc_css = use_doc_css;

      for (i = 0; i < num_chapters; i++)
        acc->pages_in_chapter[i] = fz_read_int32_le(ctx, accel);
    }
    fz_catch(ctx)
    {
      if (acc)
        fz_free(ctx, acc->pages_in_chapter);
      fz_free(ctx, acc);
      /* Swallow the error and run unaccelerated */
      fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
      fz_report_error(ctx);
      make_new = 1;
    }
  }

  /* Without a usable saved accelerator, start from an empty one that
   * gets filled in as chapters are laid out. */
  if (make_new)
  {
    acc = fz_malloc_struct(ctx, epub_accelerator);
    acc->css_sum = doc->css_sum;
    acc->use_doc_css = fz_use_document_css(ctx);
  }

  doc->accel = acc;
}
405
406
static void
407
accelerate_chapter(fz_context *ctx, epub_document *doc, epub_chapter *ch, fz_html *html)
408
0
{
409
0
  epub_accelerator *acc = doc->accel;
410
0
  int p = count_laid_out_pages(html);
411
412
0
  if (ch->number < acc->num_chapters)
413
0
  {
414
0
    if (acc->pages_in_chapter[ch->number] != p && acc->pages_in_chapter[ch->number] != -1)
415
0
    {
416
0
      fz_warn(ctx, "Invalidating stale accelerator data.");
417
0
      invalidate_accelerator(ctx, doc->accel);
418
0
    }
419
0
    acc->pages_in_chapter[ch->number] = p;
420
0
    return;
421
0
  }
422
423
0
  if (ch->number >= acc->max_chapters)
424
0
  {
425
0
    int n = acc->max_chapters;
426
0
    int i;
427
0
    if (n == 0)
428
0
      n = 4;
429
0
    while (n <= ch->number)
430
0
      n *= 2;
431
432
0
    acc->pages_in_chapter = fz_realloc_array(ctx, acc->pages_in_chapter, n, int);
433
0
    for (i = acc->max_chapters; i < n; i++)
434
0
      acc->pages_in_chapter[i] = -1;
435
0
    acc->max_chapters = n;
436
0
  }
437
0
  acc->pages_in_chapter[ch->number] = p;
438
0
  if (acc->num_chapters < ch->number+1)
439
0
    acc->num_chapters = ch->number+1;
440
0
}
441
442
static void
443
epub_drop_page(fz_context *ctx, fz_page *page_)
444
0
{
445
0
  epub_page *page = (epub_page *)page_;
446
0
  fz_drop_html(ctx, page->html);
447
0
}
448
449
static epub_chapter *
450
epub_load_chapter(fz_context *ctx, epub_document *doc, const char *path, int i)
451
0
{
452
0
  epub_chapter *ch;
453
454
0
  ch = fz_malloc_struct(ctx, epub_chapter);
455
0
  fz_try(ctx)
456
0
  {
457
0
    ch->path = Memento_label(fz_strdup(ctx, path), "chapter_path");
458
0
    ch->number = i;
459
0
  }
460
0
  fz_catch(ctx)
461
0
  {
462
0
    fz_free(ctx, ch);
463
0
    fz_rethrow(ctx);
464
0
  }
465
466
0
  return ch;
467
0
}
468
469
static fz_html *
470
epub_parse_chapter(fz_context *ctx, epub_document *doc, epub_chapter *ch)
471
0
{
472
0
  fz_archive *zip = doc->zip;
473
0
  fz_buffer *buf;
474
0
  char base_uri[2048];
475
0
  fz_html *html;
476
477
  /* Look for one we made earlier */
478
0
  html = fz_find_html(ctx, doc, ch->number);
479
0
  if (html)
480
0
    return html;
481
482
0
  fz_dirname(base_uri, ch->path, sizeof base_uri);
483
484
0
  buf = fz_read_archive_entry(ctx, zip, ch->path);
485
0
  fz_try(ctx)
486
0
    html = fz_parse_html(ctx, doc->set, zip, base_uri, buf, fz_user_css(ctx), 1, 1, 0);
487
0
  fz_always(ctx)
488
0
    fz_drop_buffer(ctx, buf);
489
0
  fz_catch(ctx)
490
0
    fz_rethrow(ctx);
491
492
0
  return fz_store_html(ctx, html, doc, ch->number);
493
0
}
494
495
static fz_html *
496
epub_get_laid_out_html(fz_context *ctx, epub_document *doc, epub_chapter *ch)
497
0
{
498
0
  fz_html *html = epub_parse_chapter(ctx, doc, ch);
499
0
  fz_try(ctx)
500
0
  {
501
0
    fz_layout_html(ctx, html, doc->layout_w, doc->layout_h, doc->layout_em);
502
0
    accelerate_chapter(ctx, doc, ch, html);
503
0
  }
504
0
  fz_catch(ctx)
505
0
  {
506
0
    fz_drop_html(ctx, html);
507
0
    fz_rethrow(ctx);
508
0
  }
509
510
0
  fz_drop_html(ctx, doc->most_recent_html);
511
0
  doc->most_recent_html = fz_keep_html(ctx, html);
512
513
0
  return html;
514
0
}
515
516
static fz_rect
517
epub_bound_page(fz_context *ctx, fz_page *page_, fz_box_type box)
518
0
{
519
0
  epub_document *doc = (epub_document*)page_->doc;
520
0
  epub_page *page = (epub_page*)page_;
521
0
  epub_chapter *ch = page->ch;
522
0
  fz_rect bbox;
523
0
  fz_html *html = epub_get_laid_out_html(ctx, doc, ch);
524
525
0
  bbox.x0 = 0;
526
0
  bbox.y0 = 0;
527
0
  bbox.x1 = html->page_w + html->page_margin[L] + html->page_margin[R];
528
0
  bbox.y1 = html->page_h + html->page_margin[T] + html->page_margin[B];
529
0
  fz_drop_html(ctx, html);
530
0
  return bbox;
531
0
}
532
533
static void
534
epub_run_page(fz_context *ctx, fz_page *page_, fz_device *dev, fz_matrix ctm, fz_cookie *cookie)
535
0
{
536
0
  epub_page *page = (epub_page*)page_;
537
538
0
  fz_draw_html(ctx, dev, ctm, page->html, page->number);
539
0
}
540
541
static fz_link *
542
epub_load_links(fz_context *ctx, fz_page *page_)
543
0
{
544
0
  epub_page *page = (epub_page*)page_;
545
0
  epub_chapter *ch = page->ch;
546
547
0
  return fz_load_html_links(ctx, page->html, page->number, ch->path);
548
0
}
549
550
static fz_bookmark
551
epub_make_bookmark(fz_context *ctx, fz_document *doc_, fz_location loc)
552
0
{
553
0
  epub_document *doc = (epub_document*)doc_;
554
0
  epub_chapter *ch;
555
0
  int i;
556
557
0
  for (i = 0, ch = doc->spine; ch; ++i, ch = ch->next)
558
0
  {
559
0
    if (i == loc.chapter)
560
0
    {
561
0
      fz_html *html = epub_get_laid_out_html(ctx, doc, ch);
562
0
      fz_bookmark mark = fz_make_html_bookmark(ctx, html, loc.page);
563
0
      fz_drop_html(ctx, html);
564
0
      return mark;
565
0
    }
566
0
  }
567
568
0
  return 0;
569
0
}
570
571
static fz_location
572
epub_lookup_bookmark(fz_context *ctx, fz_document *doc_, fz_bookmark mark)
573
0
{
574
0
  epub_document *doc = (epub_document*)doc_;
575
0
  epub_chapter *ch;
576
0
  int i;
577
578
0
  for (i = 0, ch = doc->spine; ch; ++i, ch = ch->next)
579
0
  {
580
0
    fz_html *html = epub_get_laid_out_html(ctx, doc, ch);
581
0
    int p = fz_lookup_html_bookmark(ctx, html, mark);
582
0
    fz_drop_html(ctx, html);
583
0
    if (p != -1)
584
0
      return fz_make_location(i, p);
585
0
  }
586
0
  return fz_make_location(-1, -1);
587
0
}
588
589
static fz_page *
590
epub_load_page(fz_context *ctx, fz_document *doc_, int chapter, int number)
591
0
{
592
0
  epub_document *doc = (epub_document*)doc_;
593
0
  epub_chapter *ch;
594
0
  int i;
595
0
  for (i = 0, ch = doc->spine; ch; ++i, ch = ch->next)
596
0
  {
597
0
    if (i == chapter)
598
0
    {
599
0
      epub_page *page = fz_new_derived_page(ctx, epub_page, doc_);
600
0
      page->super.bound_page = epub_bound_page;
601
0
      page->super.run_page_contents = epub_run_page;
602
0
      page->super.load_links = epub_load_links;
603
0
      page->super.drop_page = epub_drop_page;
604
0
      page->ch = ch;
605
0
      page->number = number;
606
0
      page->html = epub_get_laid_out_html(ctx, doc, ch);
607
0
      return (fz_page*)page;
608
0
    }
609
0
  }
610
0
  return NULL;
611
0
}
612
613
static void
614
epub_page_label(fz_context *ctx, fz_document *doc_, int chapter, int number, char *buf, size_t size)
615
0
{
616
0
  fz_snprintf(buf, size, "ch. %d, p. %d", chapter+1, number+1);
617
0
}
618
619
static void
620
epub_drop_accelerator(fz_context *ctx, epub_accelerator *acc)
621
0
{
622
0
  if (acc == NULL)
623
0
    return;
624
625
0
  fz_free(ctx, acc->pages_in_chapter);
626
0
  fz_free(ctx, acc);
627
0
}
628
629
static void
630
epub_drop_document(fz_context *ctx, fz_document *doc_)
631
0
{
632
0
  epub_document *doc = (epub_document*)doc_;
633
0
  epub_chapter *ch, *next;
634
0
  ch = doc->spine;
635
0
  while (ch)
636
0
  {
637
0
    next = ch->next;
638
0
    fz_free(ctx, ch->path);
639
0
    fz_free(ctx, ch);
640
0
    ch = next;
641
0
  }
642
0
  epub_drop_accelerator(ctx, doc->accel);
643
0
  fz_drop_archive(ctx, doc->zip);
644
0
  fz_drop_html_font_set(ctx, doc->set);
645
0
  fz_drop_outline(ctx, doc->outline);
646
0
  fz_free(ctx, doc->dc_title);
647
0
  fz_free(ctx, doc->dc_creator);
648
0
  fz_drop_html(ctx, doc->most_recent_html);
649
0
  fz_purge_stored_html(ctx, doc);
650
0
}
651
652
static const char *
653
rel_path_from_idref(fz_xml *manifest, const char *idref)
654
0
{
655
0
  fz_xml *item;
656
0
  if (!idref)
657
0
    return NULL;
658
0
  item = fz_xml_find_down(manifest, "item");
659
0
  while (item)
660
0
  {
661
0
    const char *id = fz_xml_att(item, "id");
662
0
    if (id && !strcmp(id, idref))
663
0
      return fz_xml_att(item, "href");
664
0
    item = fz_xml_find_next(item, "item");
665
0
  }
666
0
  return NULL;
667
0
}
668
669
static const char *
670
path_from_idref(char *path, fz_xml *manifest, const char *base_uri, const char *idref, int n)
671
0
{
672
0
  const char *rel_path = rel_path_from_idref(manifest, idref);
673
0
  if (!rel_path)
674
0
  {
675
0
    path[0] = 0;
676
0
    return NULL;
677
0
  }
678
0
  fz_strlcpy(path, base_uri, n);
679
0
  fz_strlcat(path, "/", n);
680
0
  fz_strlcat(path, rel_path, n);
681
0
  return fz_cleanname(fz_urldecode(path));
682
0
}
683
684
static fz_outline *
685
epub_parse_ncx_imp(fz_context *ctx, epub_document *doc, fz_xml *node, char *base_uri)
686
0
{
687
0
  char path[2048];
688
0
  fz_outline *outline, *head, **tailp;
689
690
0
  head = NULL;
691
0
  tailp = &head;
692
693
0
  node = fz_xml_find_down(node, "navPoint");
694
0
  while (node)
695
0
  {
696
0
    char *text = fz_xml_text(fz_xml_down(fz_xml_find_down(fz_xml_find_down(node, "navLabel"), "text")));
697
0
    char *content = fz_xml_att(fz_xml_find_down(node, "content"), "src");
698
0
    if (text && content)
699
0
    {
700
0
      fz_strlcpy(path, base_uri, sizeof path);
701
0
      fz_strlcat(path, "/", sizeof path);
702
0
      fz_strlcat(path, content, sizeof path);
703
0
      fz_urldecode(path);
704
0
      fz_cleanname(path);
705
706
0
      fz_try(ctx)
707
0
      {
708
0
        *tailp = outline = fz_new_outline(ctx);
709
0
        tailp = &(*tailp)->next;
710
0
        outline->title = Memento_label(fz_strdup(ctx, text), "outline_title");
711
0
        outline->uri = Memento_label(fz_strdup(ctx, path), "outline_uri");
712
0
        outline->page = fz_make_location(-1, -1);
713
0
        outline->down = epub_parse_ncx_imp(ctx, doc, node, base_uri);
714
0
        outline->is_open = 1;
715
0
      }
716
0
      fz_catch(ctx)
717
0
      {
718
0
        fz_drop_outline(ctx, head);
719
0
        fz_rethrow(ctx);
720
0
      }
721
0
    }
722
0
    node = fz_xml_find_next(node, "navPoint");
723
0
  }
724
725
0
  return head;
726
0
}
727
728
static void
729
epub_parse_ncx(fz_context *ctx, epub_document *doc, const char *path)
730
0
{
731
0
  fz_archive *zip = doc->zip;
732
0
  fz_buffer *buf = NULL;
733
0
  fz_xml_doc *ncx = NULL;
734
0
  char base_uri[2048];
735
736
0
  fz_var(buf);
737
0
  fz_var(ncx);
738
739
0
  fz_try(ctx)
740
0
  {
741
0
    fz_dirname(base_uri, path, sizeof base_uri);
742
0
    buf = fz_read_archive_entry(ctx, zip, path);
743
0
    ncx = fz_parse_xml(ctx, buf, 0);
744
0
    doc->outline = epub_parse_ncx_imp(ctx, doc, fz_xml_find_down(fz_xml_root(ncx), "navMap"), base_uri);
745
0
  }
746
0
  fz_always(ctx)
747
0
  {
748
0
    fz_drop_buffer(ctx, buf);
749
0
    fz_drop_xml(ctx, ncx);
750
0
  }
751
0
  fz_catch(ctx)
752
0
    fz_rethrow(ctx);
753
0
}
754
755
static char *
756
find_metadata(fz_context *ctx, fz_xml *metadata, char *key)
757
0
{
758
0
  char *text = fz_xml_text(fz_xml_down(fz_xml_find_down(metadata, key)));
759
0
  if (text)
760
0
    return fz_strdup(ctx, text);
761
0
  return NULL;
762
0
}
763
764
static fz_buffer *
765
read_container_and_prefix(fz_context *ctx, fz_archive *zip, char *prefix, size_t prefix_len)
766
0
{
767
0
  int n = fz_count_archive_entries(ctx, zip);
768
0
  int i;
769
770
0
  prefix[0] = 0;
771
772
  /* First off, look for the container.xml at the top level. */
773
0
  for (i = 0; i < n; i++)
774
0
  {
775
0
    const char *p = fz_list_archive_entry(ctx, zip, i);
776
777
0
    if (!strcmp(p, "META-INF/container.xml"))
778
0
      return fz_read_archive_entry(ctx, zip, "META-INF/container.xml");
779
0
  }
780
781
  /* If that failed, look for the first such file in a subdirectory. */
782
0
  for (i = 0; i < n; i++)
783
0
  {
784
0
    const char *p = fz_list_archive_entry(ctx, zip, i);
785
0
    size_t z = strlen(p);
786
0
    size_t z0 = sizeof("META-INF/container.xml")-1;
787
788
0
    if (z < z0)
789
0
      continue;
790
0
    if (!strcmp(p + z - z0, "META-INF/container.xml"))
791
0
    {
792
0
      if (z - z0 >= prefix_len)
793
0
      {
794
0
        fz_warn(ctx, "Ignoring %s as path too long.", p);
795
0
        continue;
796
0
      }
797
0
      memcpy(prefix, p, z-z0);
798
0
      prefix[z-z0] = 0;
799
0
      return fz_read_archive_entry(ctx, zip, p);
800
0
    }
801
0
  }
802
803
0
  return fz_read_archive_entry(ctx, zip, "META-INF/container.xml");
804
0
}
805
806
static void
807
epub_parse_header(fz_context *ctx, epub_document *doc)
808
0
{
809
0
  fz_archive *zip = doc->zip;
810
0
  fz_buffer *buf = NULL;
811
0
  fz_xml_doc *encryption_xml = NULL;
812
0
  fz_xml_doc *container_xml = NULL;
813
0
  fz_xml_doc *content_opf = NULL;
814
0
  fz_xml *container, *rootfiles, *rootfile;
815
0
  fz_xml *package, *manifest, *spine, *itemref, *metadata;
816
0
  char base_uri[2048];
817
0
  const char *full_path;
818
0
  const char *version;
819
0
  char ncx[2048], s[2048];
820
0
  char *prefixed_full_path = NULL;
821
0
  size_t prefix_len;
822
0
  epub_chapter **tailp;
823
0
  int i;
824
825
0
  fz_var(buf);
826
0
  fz_var(encryption_xml);
827
0
  fz_var(container_xml);
828
0
  fz_var(content_opf);
829
0
  fz_var(prefixed_full_path);
830
831
0
  fz_try(ctx)
832
0
  {
833
    /* parse META-INF/encryption.xml to figure out which entries are encrypted */
834
835
    /* parse META-INF/container.xml to find OPF */
836
    /* Reuse base_uri to read the prefix. */
837
0
    buf = read_container_and_prefix(ctx, zip, base_uri, sizeof(base_uri));
838
0
    container_xml = fz_parse_xml(ctx, buf, 0);
839
0
    fz_drop_buffer(ctx, buf);
840
0
    buf = NULL;
841
842
    /* Some epub files can be prefixed by a directory name. This (normally
843
     * empty!) will be in base_uri. */
844
0
    prefix_len = strlen(base_uri);
845
0
    {
846
      /* Further abuse base_uri to hold a temporary name. */
847
0
      const size_t z0 = sizeof("META-INF/encryption.xml")-1;
848
0
      if (sizeof(base_uri) <= prefix_len + z0)
849
0
        fz_throw(ctx, FZ_ERROR_FORMAT, "Prefix too long in epub");
850
0
      strcpy(base_uri + prefix_len, "META-INF/encryption.xml");
851
0
      if (fz_has_archive_entry(ctx, zip, base_uri))
852
0
      {
853
0
        fz_warn(ctx, "EPUB may be locked by DRM");
854
855
0
        buf = fz_read_archive_entry(ctx, zip, base_uri);
856
0
        encryption_xml = fz_parse_xml(ctx, buf, 0);
857
0
        fz_drop_buffer(ctx, buf);
858
0
        buf = NULL;
859
860
0
        epub_parse_encryption(ctx, doc, fz_xml_find(fz_xml_root(encryption_xml), "encryption"));
861
0
        zip = doc->zip;
862
0
      }
863
0
    }
864
865
0
    container = fz_xml_find(fz_xml_root(container_xml), "container");
866
0
    rootfiles = fz_xml_find_down(container, "rootfiles");
867
0
    rootfile = fz_xml_find_down(rootfiles, "rootfile");
868
0
    full_path = fz_xml_att(rootfile, "full-path");
869
0
    if (!full_path)
870
0
      fz_throw(ctx, FZ_ERROR_FORMAT, "cannot find root file in EPUB");
871
872
0
    fz_dirname(base_uri+prefix_len, full_path, sizeof(base_uri) - prefix_len);
873
874
0
    prefixed_full_path = fz_malloc(ctx, strlen(full_path) + prefix_len + 1);
875
0
    memcpy(prefixed_full_path, base_uri, prefix_len);
876
0
    strcpy(prefixed_full_path + prefix_len, full_path);
877
878
    /* parse OPF to find NCX and spine */
879
880
0
    buf = fz_read_archive_entry(ctx, zip, prefixed_full_path);
881
0
    content_opf = fz_parse_xml(ctx, buf, 0);
882
0
    fz_drop_buffer(ctx, buf);
883
0
    buf = NULL;
884
885
0
    package = fz_xml_find(fz_xml_root(content_opf), "package");
886
0
    version = fz_xml_att(package, "version");
887
0
    if (!version || strcmp(version, "2.0"))
888
0
      fz_warn(ctx, "unknown epub version: %s", version ? version : "<none>");
889
890
0
    metadata = fz_xml_find_down(package, "metadata");
891
0
    if (metadata)
892
0
    {
893
0
      doc->dc_title = Memento_label(find_metadata(ctx, metadata, "title"), "epub_title");
894
0
      doc->dc_creator = Memento_label(find_metadata(ctx, metadata, "creator"), "epub_creator");
895
0
    }
896
897
0
    manifest = fz_xml_find_down(package, "manifest");
898
0
    spine = fz_xml_find_down(package, "spine");
899
900
0
    if (path_from_idref(ncx, manifest, base_uri, fz_xml_att(spine, "toc"), sizeof ncx))
901
0
    {
902
0
      epub_parse_ncx(ctx, doc, ncx);
903
0
    }
904
905
0
    doc->spine = NULL;
906
0
    tailp = &doc->spine;
907
0
    itemref = fz_xml_find_down(spine, "itemref");
908
0
    i = 0;
909
0
    while (itemref)
910
0
    {
911
0
      if (path_from_idref(s, manifest, base_uri, fz_xml_att(itemref, "idref"), sizeof s))
912
0
      {
913
0
        fz_try(ctx)
914
0
        {
915
0
          *tailp = epub_load_chapter(ctx, doc, s, i);
916
0
          tailp = &(*tailp)->next;
917
0
          i++;
918
0
        }
919
0
        fz_catch(ctx)
920
0
        {
921
0
          fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
922
0
          fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
923
0
          fz_report_error(ctx);
924
0
          fz_warn(ctx, "ignoring chapter %s", s);
925
0
        }
926
0
      }
927
0
      itemref = fz_xml_find_next(itemref, "itemref");
928
0
    }
929
0
  }
930
0
  fz_always(ctx)
931
0
  {
932
0
    fz_drop_xml(ctx, content_opf);
933
0
    fz_drop_xml(ctx, container_xml);
934
0
    fz_drop_xml(ctx, encryption_xml);
935
0
    fz_drop_buffer(ctx, buf);
936
0
    fz_free(ctx, prefixed_full_path);
937
0
  }
938
0
  fz_catch(ctx)
939
0
    fz_rethrow(ctx);
940
0
}
941
942
static fz_outline *
943
epub_load_outline(fz_context *ctx, fz_document *doc_)
944
0
{
945
0
  epub_document *doc = (epub_document*)doc_;
946
0
  return fz_keep_outline(ctx, doc->outline);
947
0
}
948
949
static int
950
epub_lookup_metadata(fz_context *ctx, fz_document *doc_, const char *key, char *buf, size_t size)
951
0
{
952
0
  epub_document *doc = (epub_document*)doc_;
953
0
  if (!strcmp(key, FZ_META_FORMAT))
954
0
    return 1 + (int)fz_strlcpy(buf, "EPUB", size);
955
0
  if (!strcmp(key, FZ_META_INFO_TITLE) && doc->dc_title)
956
0
    return 1 + (int)fz_strlcpy(buf, doc->dc_title, size);
957
0
  if (!strcmp(key, FZ_META_INFO_AUTHOR) && doc->dc_creator)
958
0
    return 1 + (int)fz_strlcpy(buf, doc->dc_creator, size);
959
0
  return -1;
960
0
}
961
962
static void
963
epub_output_accelerator(fz_context *ctx, fz_document *doc_, fz_output *out)
964
0
{
965
0
  epub_document *doc = (epub_document*)doc_;
966
0
  int i;
967
968
0
  fz_try(ctx)
969
0
  {
970
0
    if (doc->accel == NULL)
971
0
      fz_throw(ctx, FZ_ERROR_ARGUMENT, "No accelerator data to write");
972
973
0
    fz_write_int32_le(ctx, out, MAGIC_ACCELERATOR);
974
0
    fz_write_int32_le(ctx, out, MAGIC_ACCEL_EPUB);
975
0
    fz_write_int32_le(ctx, out, ACCEL_VERSION);
976
0
    fz_write_float_le(ctx, out, doc->accel->layout_w);
977
0
    fz_write_float_le(ctx, out, doc->accel->layout_h);
978
0
    fz_write_float_le(ctx, out, doc->accel->layout_em);
979
0
    fz_write_uint32_le(ctx, out, doc->accel->css_sum);
980
0
    fz_write_int32_le(ctx, out, doc->accel->use_doc_css);
981
0
    fz_write_int32_le(ctx, out, doc->accel->num_chapters);
982
0
    for (i = 0; i < doc->accel->num_chapters; i++)
983
0
      fz_write_int32_le(ctx, out, doc->accel->pages_in_chapter[i]);
984
985
0
    fz_close_output(ctx, out);
986
0
  }
987
0
  fz_always(ctx)
988
0
    fz_drop_output(ctx, out);
989
0
  fz_catch(ctx)
990
0
    fz_rethrow(ctx);
991
0
}
992
993
/* Takes ownership of zip. Will always eventually drop it.
994
 * Never takes ownership of accel. */
995
static fz_document *
996
epub_init(fz_context *ctx, fz_archive *zip, fz_stream *accel)
997
0
{
998
0
  epub_document *doc = NULL;
999
1000
0
  fz_var(doc);
1001
0
  fz_var(zip);
1002
1003
0
  fz_try(ctx)
1004
0
  {
1005
0
    doc = fz_new_derived_document(ctx, epub_document);
1006
0
    doc->zip = zip;
1007
0
    zip = NULL;
1008
1009
0
    doc->super.drop_document = epub_drop_document;
1010
0
    doc->super.layout = epub_layout;
1011
0
    doc->super.load_outline = epub_load_outline;
1012
0
    doc->super.resolve_link_dest = epub_resolve_link;
1013
0
    doc->super.make_bookmark = epub_make_bookmark;
1014
0
    doc->super.lookup_bookmark = epub_lookup_bookmark;
1015
0
    doc->super.count_chapters = epub_count_chapters;
1016
0
    doc->super.count_pages = epub_count_pages;
1017
0
    doc->super.load_page = epub_load_page;
1018
0
    doc->super.page_label = epub_page_label;
1019
0
    doc->super.lookup_metadata = epub_lookup_metadata;
1020
0
    doc->super.output_accelerator = epub_output_accelerator;
1021
0
    doc->super.is_reflowable = 1;
1022
1023
0
    doc->set = fz_new_html_font_set(ctx);
1024
0
    doc->css_sum = user_css_sum(ctx);
1025
0
    epub_load_accelerator(ctx, doc, accel);
1026
0
    epub_parse_header(ctx, doc);
1027
0
  }
1028
0
  fz_catch(ctx)
1029
0
  {
1030
0
    fz_drop_archive(ctx, zip);
1031
0
    fz_drop_document(ctx, &doc->super);
1032
0
    fz_rethrow(ctx);
1033
0
  }
1034
1035
0
  return (fz_document*)doc;
1036
0
}
1037
1038
static fz_document *
1039
epub_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir)
1040
0
{
1041
0
  fz_stream *file2 = NULL;
1042
0
  fz_document *doc;
1043
0
  fz_archive *zip = NULL;
1044
1045
0
  if (file == NULL)
1046
0
  {
1047
    /* Directory case: file == NULL and dir == the directory. */
1048
0
    if (fz_has_archive_entry(ctx, dir, "META-INF/container.xml"))
1049
0
      file2 = file = fz_open_archive_entry(ctx, dir, "META-INF/container.xml");
1050
0
    else
1051
0
      file2 = file = fz_open_archive_entry(ctx, dir, "META-INF\\container.xml");
1052
0
    if (file == NULL)
1053
0
      fz_throw(ctx, FZ_ERROR_FORMAT, "Not an epub file");
1054
0
    zip = fz_keep_archive(ctx, dir);
1055
0
  }
1056
0
  else
1057
0
  {
1058
    /* File case: file != NULL and dir can be ignored. */
1059
0
    zip = fz_open_archive_with_stream(ctx, file);
1060
0
  }
1061
1062
1063
0
  fz_try(ctx)
1064
0
    doc = epub_init(ctx, zip, file);
1065
0
  fz_always(ctx)
1066
0
    fz_drop_stream(ctx, file2);
1067
0
  fz_catch(ctx)
1068
0
    fz_rethrow(ctx);
1069
1070
0
  return doc;
1071
0
}
1072
1073
static int
1074
epub_recognize(fz_context *doc, const fz_document_handler *handler, const char *magic)
1075
2.11k
{
1076
2.11k
  if (strstr(magic, "META-INF/container.xml") || strstr(magic, "META-INF\\container.xml"))
1077
0
    return 200;
1078
2.11k
  return 0;
1079
2.11k
}
1080
1081
static int
1082
epub_recognize_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir)
1083
8.83k
{
1084
8.83k
  fz_archive *arch = NULL;
1085
8.83k
  int ret = 0;
1086
1087
8.83k
  fz_var(arch);
1088
8.83k
  fz_var(ret);
1089
1090
17.6k
  fz_try(ctx)
1091
17.6k
  {
1092
8.83k
    if (stream == NULL)
1093
0
      arch = fz_keep_archive(ctx, dir);
1094
8.83k
    else
1095
8.83k
    {
1096
8.83k
      arch = fz_try_open_archive_with_stream(ctx, stream);
1097
8.83k
      if (arch == NULL)
1098
8.68k
        break;
1099
8.83k
    }
1100
1101
152
    if (fz_has_archive_entry(ctx, arch, "META-INF/container.xml") ||
1102
152
      fz_has_archive_entry(ctx, arch, "META-INF\\container.xml"))
1103
0
      ret = 100;
1104
152
  }
1105
17.6k
  fz_always(ctx)
1106
8.83k
    fz_drop_archive(ctx, arch);
1107
8.83k
  fz_catch(ctx)
1108
101
    fz_rethrow(ctx);
1109
1110
8.73k
  return ret;
1111
8.83k
}
1112
1113
/* NULL-terminated list of file extensions listed in epub_document_handler. */
static const char *epub_extensions[] = { "epub", NULL };
1118
1119
/* NULL-terminated list of MIME types listed in epub_document_handler. */
static const char *epub_mimetypes[] = { "application/epub+zip", NULL };
1124
1125
fz_document_handler epub_document_handler =
1126
{
1127
  epub_recognize,
1128
  epub_open_document,
1129
  epub_extensions,
1130
  epub_mimetypes,
1131
  epub_recognize_content
1132
};