Coverage Report

Created: 2025-11-07 06:58

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/mupdf/source/pdf/pdf-repair.c
Line
Count
Source
1
// Copyright (C) 2004-2025 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "mupdf/fitz.h"
24
#include "pdf-imp.h"
25
26
#include <string.h>
27
28
/* Scan file for objects and reconstruct xref table */
29
30
/* One '<num> <gen> obj' header discovered while scanning the file. */
struct entry
{
  int num;         /* object number */
  int gen;         /* generation number */
  int64_t ofs;     /* file offset stored into the rebuilt xref entry */
  int64_t stm_ofs; /* stream offset reported by pdf_repair_obj (0 if none) */
  int64_t stm_len; /* stream length reported by pdf_repair_obj (-1 if not measured) */
};
38
39
typedef struct
40
{
41
  int max;
42
  int len;
43
  pdf_obj **roots;
44
} pdf_root_list;
45
46
static void
47
add_root(fz_context *ctx, pdf_root_list *roots, pdf_obj *obj)
48
3
{
49
3
  if (roots->max == roots->len)
50
3
  {
51
3
    int new_max_roots = roots->max * 2;
52
3
    if (new_max_roots == 0)
53
3
      new_max_roots = 4;
54
3
    roots->roots = fz_realloc(ctx, roots->roots, new_max_roots * sizeof(roots->roots[0]));
55
3
    roots->max = new_max_roots;
56
3
  }
57
3
  roots->roots[roots->len] = pdf_keep_obj(ctx, obj);
58
3
  roots->len++;
59
3
}
60
61
static pdf_root_list *
62
fz_new_root_list(fz_context *ctx)
63
16
{
64
16
  return fz_malloc_struct(ctx, pdf_root_list);
65
16
}
66
67
static void
68
pdf_drop_root_list(fz_context *ctx, pdf_root_list *roots)
69
23
{
70
23
  int i, n;
71
72
23
  if (roots == NULL)
73
7
    return;
74
75
16
  n = roots->len;
76
19
  for (i = 0; i < n; i++)
77
3
    pdf_drop_obj(ctx, roots->roots[i]);
78
16
  fz_free(ctx, roots->roots);
79
16
  fz_free(ctx, roots);
80
16
}
81
82
int
83
pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, int64_t *stmofsp, int64_t *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, int64_t *tmpofs, pdf_obj **root)
84
824
{
85
824
  fz_stream *file = doc->file;
86
824
  pdf_token tok;
87
824
  int64_t stm_len;
88
824
  int64_t local_ofs;
89
90
824
  if (tmpofs == NULL)
91
0
    tmpofs = &local_ofs;
92
824
  if (stmofsp == NULL)
93
0
    stmofsp = &local_ofs;
94
95
824
  *stmofsp = 0;
96
824
  if (stmlenp)
97
824
    *stmlenp = -1;
98
99
824
  stm_len = 0;
100
101
824
  *tmpofs = fz_tell(ctx, file);
102
824
  if (*tmpofs < 0)
103
0
    fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
104
105
  /* On entry to this function, we know that we've just seen
106
   * '<int> <int> obj'. We expect the next thing we see to be a
107
   * pdf object. Regardless of the type of thing we meet next
108
   * we only need to fully parse it if it is a dictionary. */
109
824
  tok = pdf_lex(ctx, file, buf);
110
111
  /* Don't let a truncated object at EOF overwrite a good one */
112
824
  if (tok == PDF_TOK_EOF)
113
0
    fz_throw(ctx, FZ_ERROR_SYNTAX, "truncated object");
114
115
824
  if (tok == PDF_TOK_OPEN_DICT)
116
800
  {
117
800
    pdf_obj *obj, *dict = NULL;
118
119
1.60k
    fz_try(ctx)
120
1.60k
    {
121
800
      dict = pdf_parse_dict(ctx, doc, file, buf);
122
800
    }
123
1.60k
    fz_catch(ctx)
124
7
    {
125
7
      fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
126
7
      fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
127
      /* Don't let a broken object at EOF overwrite a good one */
128
7
      if (file->eof)
129
1
        fz_rethrow(ctx);
130
      /* Silently swallow the error */
131
6
      fz_report_error(ctx);
132
6
      dict = pdf_new_dict(ctx, doc, 2);
133
6
    }
134
135
    /* We must be careful not to try to resolve any indirections
136
     * here. We have just read dict, so we know it to be a non
137
     * indirected dictionary. Before we look at any values that
138
     * we get back from looking up in it, we need to check they
139
     * aren't indirected. */
140
141
799
    if (encrypt || id || root)
142
799
    {
143
799
      obj = pdf_dict_get(ctx, dict, PDF_NAME(Type));
144
799
      if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME(XRef)))
145
3
      {
146
3
        if (encrypt)
147
3
        {
148
3
          obj = pdf_dict_get(ctx, dict, PDF_NAME(Encrypt));
149
3
          if (obj)
150
0
          {
151
0
            pdf_drop_obj(ctx, *encrypt);
152
0
            *encrypt = pdf_keep_obj(ctx, obj);
153
0
          }
154
3
        }
155
156
3
        if (id)
157
3
        {
158
3
          obj = pdf_dict_get(ctx, dict, PDF_NAME(ID));
159
3
          if (obj)
160
3
          {
161
3
            pdf_drop_obj(ctx, *id);
162
3
            *id = pdf_keep_obj(ctx, obj);
163
3
          }
164
3
        }
165
166
3
        if (root)
167
3
          *root = pdf_keep_obj(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Root)));
168
3
      }
169
799
    }
170
171
799
    obj = pdf_dict_get(ctx, dict, PDF_NAME(Length));
172
799
    if (!pdf_is_indirect(ctx, obj) && pdf_is_int(ctx, obj))
173
321
      stm_len = pdf_to_int64(ctx, obj);
174
175
799
    if (doc->file_reading_linearly && page)
176
0
    {
177
0
      obj = pdf_dict_get(ctx, dict, PDF_NAME(Type));
178
0
      if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME(Page)))
179
0
      {
180
0
        pdf_drop_obj(ctx, *page);
181
0
        *page = pdf_keep_obj(ctx, dict);
182
0
      }
183
0
    }
184
185
799
    pdf_drop_obj(ctx, dict);
186
799
  }
187
188
1.64k
  while ( tok != PDF_TOK_STREAM &&
189
1.32k
    tok != PDF_TOK_ENDOBJ &&
190
853
    tok != PDF_TOK_ERROR &&
191
847
    tok != PDF_TOK_EOF &&
192
847
    tok != PDF_TOK_INT )
193
823
  {
194
823
    *tmpofs = fz_tell(ctx, file);
195
823
    if (*tmpofs < 0)
196
0
      fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
197
823
    tok = pdf_lex(ctx, file, buf);
198
823
  }
199
200
823
  if (tok == PDF_TOK_STREAM)
201
322
  {
202
322
    int c = fz_read_byte(ctx, file);
203
322
    if (c == '\r') {
204
40
      c = fz_peek_byte(ctx, file);
205
40
      if (c == '\n')
206
40
        fz_read_byte(ctx, file);
207
40
    }
208
209
322
    *stmofsp = fz_tell(ctx, file);
210
322
    if (*stmofsp < 0)
211
0
      fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
212
213
322
    if (stm_len > 0)
214
321
    {
215
321
      fz_seek(ctx, file, *stmofsp + stm_len, 0);
216
642
      fz_try(ctx)
217
642
      {
218
321
        tok = pdf_lex(ctx, file, buf);
219
321
      }
220
642
      fz_catch(ctx)
221
0
      {
222
0
        fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
223
0
        fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
224
0
        fz_report_error(ctx);
225
0
        fz_warn(ctx, "cannot find endstream token, falling back to scanning");
226
0
      }
227
321
      if (tok == PDF_TOK_ENDSTREAM)
228
292
        goto atobjend;
229
29
      fz_seek(ctx, file, *stmofsp, 0);
230
29
    }
231
232
30
    (void)fz_read(ctx, file, (unsigned char *) buf->scratch, 9);
233
234
8.49M
    while (memcmp(buf->scratch, "endstream", 9) != 0)
235
8.49M
    {
236
8.49M
      c = fz_read_byte(ctx, file);
237
8.49M
      if (c == EOF)
238
10
        break;
239
8.49M
      memmove(&buf->scratch[0], &buf->scratch[1], 8);
240
8.49M
      buf->scratch[8] = c;
241
8.49M
    }
242
243
30
    if (stmlenp)
244
30
      *stmlenp = fz_tell(ctx, file) - *stmofsp - 9;
245
246
322
atobjend:
247
322
    *tmpofs = fz_tell(ctx, file);
248
322
    if (*tmpofs < 0)
249
0
      fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
250
322
    tok = pdf_lex(ctx, file, buf);
251
322
    if (tok != PDF_TOK_ENDOBJ)
252
10
      fz_warn(ctx, "object missing 'endobj' token");
253
312
    else
254
312
    {
255
      /* Read another token as we always return the next one */
256
312
      *tmpofs = fz_tell(ctx, file);
257
312
      if (*tmpofs < 0)
258
0
        fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
259
312
      tok = pdf_lex(ctx, file, buf);
260
312
    }
261
322
  }
262
823
  return tok;
263
823
}
264
265
static int64_t
266
entry_offset(fz_context *ctx, pdf_document *doc, int num)
267
0
{
268
0
  pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, num);
269
270
0
  if (entry->type == 0 || entry->type == 'f')
271
0
    return 0;
272
0
  if (entry->type == 'n')
273
0
    return entry->ofs;
274
0
  assert(entry->type == 'o');
275
276
  /* It must be in a stream. Return the entry of that stream. */
277
0
  entry = pdf_get_populating_xref_entry(ctx, doc, entry->ofs);
278
  /* If it's NOT in a stream, then we'll invalidate this entry in a moment.
279
   * For now, just return an illegal offset. */
280
0
  if (entry->type != 'n')
281
0
    return -1;
282
283
0
  return entry->ofs;
284
0
}
285
286
static void
287
pdf_repair_obj_stm(fz_context *ctx, pdf_document *doc, int stm_num)
288
0
{
289
0
  pdf_obj *obj;
290
0
  fz_stream *stm = NULL;
291
0
  pdf_token tok;
292
0
  int i, n, count;
293
0
  pdf_lexbuf buf;
294
295
0
  fz_var(stm);
296
297
0
  pdf_lexbuf_init(ctx, &buf, PDF_LEXBUF_SMALL);
298
299
0
  fz_try(ctx)
300
0
  {
301
0
    obj = pdf_load_object(ctx, doc, stm_num);
302
303
0
    count = pdf_dict_get_int(ctx, obj, PDF_NAME(N));
304
305
0
    pdf_drop_obj(ctx, obj);
306
307
0
    stm = pdf_open_stream_number(ctx, doc, stm_num);
308
309
0
    for (i = 0; i < count; i++)
310
0
    {
311
0
      pdf_xref_entry *entry;
312
0
      int replace;
313
314
0
      tok = pdf_lex(ctx, stm, &buf);
315
0
      if (tok != PDF_TOK_INT)
316
0
        fz_throw(ctx, FZ_ERROR_FORMAT, "corrupt object stream (%d 0 R)", stm_num);
317
318
0
      n = buf.i;
319
0
      if (n < 0)
320
0
      {
321
0
        fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i);
322
0
        continue;
323
0
      }
324
0
      else if (n >= PDF_MAX_OBJECT_NUMBER)
325
0
      {
326
0
        fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i);
327
0
        continue;
328
0
      }
329
330
0
      entry = pdf_get_populating_xref_entry(ctx, doc, n);
331
332
      /* Bug 708286: Do not allow an object from an ObjStm to override an object
333
       * that isn't in an ObjStm that we've already read, that occurs after it
334
       * in the file. */
335
0
      replace = 1;
336
0
      if (entry->type != 0 && entry->type != 'f')
337
0
      {
338
0
        int64_t existing_entry_offset = entry_offset(ctx, doc, n);
339
340
0
        if (existing_entry_offset < 0)
341
0
        {
342
          /* The existing entry is invalid. Anything must be better than that! */
343
0
        }
344
0
        else
345
0
        {
346
0
          int64_t this_entry_offset = entry_offset(ctx, doc, stm_num);
347
348
0
          if (existing_entry_offset > this_entry_offset)
349
0
            replace = 0;
350
0
        }
351
0
      }
352
353
0
      if (replace)
354
0
      {
355
0
        entry->ofs = stm_num;
356
0
        entry->gen = i;
357
0
        entry->num = n;
358
0
        entry->stm_ofs = 0;
359
0
        pdf_drop_obj(ctx, entry->obj);
360
0
        entry->obj = NULL;
361
0
        entry->type = 'o';
362
0
      }
363
364
0
      tok = pdf_lex(ctx, stm, &buf);
365
0
      if (tok != PDF_TOK_INT)
366
0
        fz_throw(ctx, FZ_ERROR_FORMAT, "corrupt object stream (%d 0 R)", stm_num);
367
0
    }
368
0
  }
369
0
  fz_always(ctx)
370
0
  {
371
0
    fz_drop_stream(ctx, stm);
372
0
    pdf_lexbuf_fin(ctx, &buf);
373
0
  }
374
0
  fz_catch(ctx)
375
0
  {
376
0
    fz_rethrow(ctx);
377
0
  }
378
0
}
379
380
static void
381
orphan_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
382
30
{
383
30
  if (doc->orphans_count == doc->orphans_max)
384
10
  {
385
10
    int new_max = (doc->orphans_max ? doc->orphans_max*2 : 32);
386
387
20
    fz_try(ctx)
388
20
    {
389
10
      doc->orphans = fz_realloc_array(ctx, doc->orphans, new_max, pdf_obj*);
390
10
      doc->orphans_max = new_max;
391
10
    }
392
20
    fz_catch(ctx)
393
0
    {
394
0
      pdf_drop_obj(ctx, obj);
395
0
      fz_rethrow(ctx);
396
0
    }
397
10
  }
398
30
  doc->orphans[doc->orphans_count++] = obj;
399
30
}
400
401
/* Return 1 if c is NUL, tab, line feed, form feed, carriage return or
 * space; 0 for anything else. */
static int is_white(int c)
{
  switch (c)
  {
  case '\x00':
  case '\x09':
  case '\x0a':
  case '\x0c':
  case '\x0d':
  case '\x20':
    return 1;
  default:
    return 0;
  }
}
405
406
static pdf_root_list *
407
pdf_repair_xref_base(fz_context *ctx, pdf_document *doc)
408
16
{
409
16
  pdf_obj *dict, *obj = NULL;
410
16
  pdf_obj *length;
411
412
16
  pdf_obj *encrypt = NULL;
413
16
  pdf_obj *id = NULL;
414
16
  pdf_obj *info = NULL;
415
16
  pdf_root_list *roots = NULL;
416
417
16
  struct entry *list = NULL;
418
16
  int listlen;
419
16
  int listcap;
420
16
  int maxnum = 0;
421
422
16
  int num = 0;
423
16
  int gen = 0;
424
16
  int64_t tmpofs, stm_ofs, numofs = 0, genofs = 0;
425
16
  int64_t stm_len;
426
16
  pdf_token tok;
427
16
  int next;
428
16
  int i;
429
16
  size_t j, n;
430
16
  int c;
431
16
  pdf_lexbuf *buf = &doc->lexbuf.base;
432
433
16
  fz_var(encrypt);
434
16
  fz_var(id);
435
16
  fz_var(info);
436
16
  fz_var(list);
437
16
  fz_var(obj);
438
16
  fz_var(roots);
439
440
16
  if (!doc->is_fdf)
441
16
    fz_warn(ctx, "repairing PDF document");
442
443
16
  if (doc->repair_attempted)
444
0
    fz_throw(ctx, FZ_ERROR_FORMAT, "Repair failed already - not trying again");
445
446
16
  doc->bias = 0; // reset bias!
447
448
16
  doc->repair_attempted = 1;
449
16
  doc->repair_in_progress = 1;
450
451
16
  pdf_drop_page_tree_internal(ctx, doc);
452
16
  doc->use_page_tree_map = 1;
453
16
  pdf_forget_xref(ctx, doc);
454
455
16
  fz_seek(ctx, doc->file, 0, 0);
456
457
32
  fz_try(ctx)
458
32
  {
459
16
    pdf_xref_entry *entry;
460
16
    listlen = 0;
461
16
    listcap = 1024;
462
16
    list = fz_malloc_array(ctx, listcap, struct entry);
463
464
16
    roots = fz_new_root_list(ctx);
465
466
    /* look for '%PDF' version marker within first kilobyte of file */
467
16
    n = fz_read(ctx, doc->file, (unsigned char *)buf->scratch, fz_minz(buf->size, 1024));
468
469
16
    fz_seek(ctx, doc->file, 0, 0);
470
16
    if (n >= 5)
471
16
    {
472
5.13k
      for (j = 0; j < n - 5; j++)
473
5.12k
      {
474
5.12k
        if (memcmp(&buf->scratch[j], "%PDF-", 5) == 0 || memcmp(&buf->scratch[j], "%FDF-", 5) == 0)
475
11
        {
476
11
          fz_seek(ctx, doc->file, (int64_t)(j + 8), 0); /* skip "%PDF-X.Y" */
477
11
          break;
478
11
        }
479
5.12k
      }
480
16
    }
481
482
    /* skip comment line after version marker since some generators
483
     * forget to terminate the comment with a newline */
484
16
    c = fz_read_byte(ctx, doc->file);
485
16
    while (c >= 0 && (c == ' ' || c == '%'))
486
0
      c = fz_read_byte(ctx, doc->file);
487
16
    if (c != EOF)
488
16
      fz_unread_byte(ctx, doc->file);
489
490
465k
    while (1)
491
465k
    {
492
465k
      tmpofs = fz_tell(ctx, doc->file);
493
465k
      if (tmpofs < 0)
494
0
        fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
495
496
930k
      fz_try(ctx)
497
930k
        tok = pdf_lex_no_string(ctx, doc->file, buf);
498
930k
      fz_catch(ctx)
499
0
      {
500
0
        fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
501
0
        fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
502
0
        fz_report_error(ctx);
503
0
        fz_warn(ctx, "skipping ahead to next token");
504
0
        do
505
0
          c = fz_read_byte(ctx, doc->file);
506
0
        while (c != EOF && !is_white(c));
507
0
        if (c == EOF)
508
0
          tok = PDF_TOK_EOF;
509
0
        else
510
0
          continue;
511
0
      }
512
513
      /* If we have the next token already, then we'll jump
514
       * back here, rather than going through the top of
515
       * the loop. */
516
465k
    have_next_token:
517
518
465k
      if (tok == PDF_TOK_INT)
519
53.4k
      {
520
53.4k
        if (buf->i < 0)
521
4
        {
522
4
          num = 0;
523
4
          gen = 0;
524
4
          continue;
525
4
        }
526
53.4k
        numofs = genofs;
527
53.4k
        num = gen;
528
53.4k
        genofs = tmpofs;
529
53.4k
        gen = buf->i;
530
53.4k
      }
531
532
412k
      else if (tok == PDF_TOK_OBJ)
533
824
      {
534
824
        pdf_obj *root = NULL;
535
536
1.64k
        fz_try(ctx)
537
1.64k
        {
538
824
          stm_len = 0;
539
824
          stm_ofs = 0;
540
824
          tok = pdf_repair_obj(ctx, doc, buf, &stm_ofs, &stm_len, &encrypt, &id, NULL, &tmpofs, &root);
541
824
          if (root)
542
3
            add_root(ctx, roots, root);
543
824
        }
544
1.64k
        fz_always(ctx)
545
824
        {
546
824
          pdf_drop_obj(ctx, root);
547
824
        }
548
824
        fz_catch(ctx)
549
1
        {
550
1
          int errcode = fz_caught(ctx);
551
          /* If we haven't seen a root yet, there is nothing
552
           * we can do, but give up. Otherwise, we'll make
553
           * do. */
554
1
          if (roots->len == 0 ||
555
0
            errcode == FZ_ERROR_TRYLATER ||
556
0
            errcode == FZ_ERROR_SYSTEM)
557
1
          {
558
1
            pdf_drop_root_list(ctx, roots);
559
1
            roots = NULL;
560
1
            fz_rethrow(ctx);
561
1
          }
562
0
          fz_report_error(ctx);
563
0
          fz_warn(ctx, "cannot parse object (%d %d R) - ignoring rest of file", num, gen);
564
0
          break;
565
1
        }
566
567
823
        if (num <= 0 || num > PDF_MAX_OBJECT_NUMBER)
568
0
        {
569
0
          fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", num, gen);
570
0
          goto have_next_token;
571
0
        }
572
573
823
        gen = fz_clampi(gen, 0, 65535);
574
575
823
        if (listlen + 1 == listcap)
576
0
        {
577
0
          listcap = (listcap * 3) / 2;
578
0
          list = fz_realloc_array(ctx, list, listcap, struct entry);
579
0
        }
580
581
823
        list[listlen].num = num;
582
823
        list[listlen].gen = gen;
583
823
        list[listlen].ofs = numofs;
584
823
        list[listlen].stm_ofs = stm_ofs;
585
823
        list[listlen].stm_len = stm_len;
586
823
        listlen ++;
587
588
823
        if (num > maxnum)
589
712
          maxnum = num;
590
591
823
        goto have_next_token;
592
823
      }
593
594
      /* If we find a dictionary it is probably the trailer,
595
       * but could be a stream (or bogus) dictionary caused
596
       * by a corrupt file. */
597
411k
      else if (tok == PDF_TOK_OPEN_DICT)
598
34
      {
599
34
        pdf_obj *dictobj;
600
601
68
        fz_try(ctx)
602
68
        {
603
34
          dict = pdf_parse_dict(ctx, doc, doc->file, buf);
604
34
        }
605
68
        fz_catch(ctx)
606
32
        {
607
32
          fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
608
32
          fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
609
          /* If this was the real trailer dict
610
           * it was broken, in which case we are
611
           * in trouble. Keep going though in
612
           * case this was just a bogus dict. */
613
32
          fz_report_error(ctx);
614
32
          continue;
615
32
        }
616
617
4
        fz_try(ctx)
618
4
        {
619
2
          dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Encrypt));
620
2
          if (dictobj)
621
0
          {
622
0
            pdf_drop_obj(ctx, encrypt);
623
0
            encrypt = pdf_keep_obj(ctx, dictobj);
624
0
          }
625
626
2
          dictobj = pdf_dict_get(ctx, dict, PDF_NAME(ID));
627
2
          if (dictobj && (!id || !encrypt || pdf_dict_get(ctx, dict, PDF_NAME(Encrypt))))
628
0
          {
629
0
            pdf_drop_obj(ctx, id);
630
0
            id = pdf_keep_obj(ctx, dictobj);
631
0
          }
632
633
2
          dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Root));
634
2
          if (dictobj)
635
0
            add_root(ctx, roots, dictobj);
636
637
2
          dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Info));
638
2
          if (dictobj)
639
0
          {
640
0
            pdf_drop_obj(ctx, info);
641
0
            info = pdf_keep_obj(ctx, dictobj);
642
0
          }
643
2
        }
644
4
        fz_always(ctx)
645
2
          pdf_drop_obj(ctx, dict);
646
2
        fz_catch(ctx)
647
0
          fz_rethrow(ctx);
648
2
      }
649
650
411k
      else if (tok == PDF_TOK_EOF)
651
15
      {
652
15
        break;
653
15
      }
654
655
411k
      else
656
411k
      {
657
411k
        num = 0;
658
411k
        gen = 0;
659
411k
      }
660
465k
    }
661
662
15
    if (listlen == 0)
663
5
      fz_throw(ctx, FZ_ERROR_FORMAT, "no objects found");
664
665
    /* make xref reasonable */
666
667
    /*
668
      Dummy access to entry to assure sufficient space in the xref table
669
      and avoid repeated reallocs in the loop
670
    */
671
    /* Ensure that the first xref table is a 'solid' one from
672
     * 0 to maxnum. */
673
10
    pdf_ensure_solid_xref(ctx, doc, maxnum);
674
675
10.8k
    for (i = 1; i < maxnum; i++)
676
10.8k
    {
677
10.8k
      entry = pdf_get_populating_xref_entry(ctx, doc, i);
678
10.8k
      if (entry->obj != NULL)
679
0
        continue;
680
10.8k
      entry->type = 'f';
681
10.8k
      entry->ofs = 0;
682
10.8k
      entry->gen = 0;
683
10.8k
      entry->num = 0;
684
685
10.8k
      entry->stm_ofs = 0;
686
10.8k
    }
687
688
355
    for (i = 0; i < listlen; i++)
689
345
    {
690
345
      entry = pdf_get_populating_xref_entry(ctx, doc, list[i].num);
691
345
      entry->type = 'n';
692
345
      entry->ofs = list[i].ofs;
693
345
      entry->gen = list[i].gen;
694
345
      entry->num = list[i].num;
695
696
345
      entry->stm_ofs = list[i].stm_ofs;
697
698
      /* correct stream length for unencrypted documents */
699
345
      if (!encrypt && list[i].stm_len >= 0)
700
30
      {
701
30
        pdf_obj *old_obj = NULL;
702
30
        dict = pdf_load_object(ctx, doc, list[i].num);
703
704
60
        fz_try(ctx)
705
60
        {
706
30
          length = pdf_new_int(ctx, list[i].stm_len);
707
30
          pdf_dict_get_put_drop(ctx, dict, PDF_NAME(Length), length, &old_obj);
708
30
          if (old_obj)
709
30
            orphan_object(ctx, doc, old_obj);
710
30
        }
711
60
        fz_always(ctx)
712
30
          pdf_drop_obj(ctx, dict);
713
30
        fz_catch(ctx)
714
0
          fz_rethrow(ctx);
715
30
      }
716
345
    }
717
718
10
    entry = pdf_get_populating_xref_entry(ctx, doc, 0);
719
10
    entry->type = 'f';
720
10
    entry->ofs = 0;
721
10
    entry->gen = 65535;
722
10
    entry->num = 0;
723
10
    entry->stm_ofs = 0;
724
725
10
    next = 0;
726
10.8k
    for (i = pdf_xref_len(ctx, doc) - 1; i >= 0; i--)
727
10.8k
    {
728
10.8k
      entry = pdf_get_populating_xref_entry(ctx, doc, i);
729
10.8k
      if (entry->type == 'f')
730
10.5k
      {
731
10.5k
        entry->ofs = next;
732
10.5k
        if (entry->gen < 65535)
733
10.5k
          entry->gen ++;
734
10.5k
        next = i;
735
10.5k
      }
736
10.8k
    }
737
738
    /* create a repaired trailer, Root will be added later */
739
740
10
    obj = pdf_new_dict(ctx, doc, 5);
741
    /* During repair there is only a single xref section */
742
10
    pdf_set_populating_xref_trailer(ctx, doc, obj);
743
10
    pdf_drop_obj(ctx, obj);
744
10
    obj = NULL;
745
746
10
    pdf_dict_put_int(ctx, pdf_trailer(ctx, doc), PDF_NAME(Size), maxnum + 1);
747
748
10
    if (info)
749
0
    {
750
0
      pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), info);
751
0
      pdf_drop_obj(ctx, info);
752
0
      info = NULL;
753
0
    }
754
755
10
    if (encrypt)
756
0
    {
757
0
      if (pdf_is_indirect(ctx, encrypt))
758
0
      {
759
        /* create new reference with non-NULL xref pointer */
760
0
        obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, encrypt), pdf_to_gen(ctx, encrypt));
761
0
        pdf_drop_obj(ctx, encrypt);
762
0
        encrypt = obj;
763
0
        obj = NULL;
764
0
      }
765
0
      pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt), encrypt);
766
0
      pdf_drop_obj(ctx, encrypt);
767
0
      encrypt = NULL;
768
0
    }
769
770
10
    if (id)
771
3
    {
772
3
      if (pdf_is_indirect(ctx, id))
773
0
      {
774
        /* create new reference with non-NULL xref pointer */
775
0
        obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, id), pdf_to_gen(ctx, id));
776
0
        pdf_drop_obj(ctx, id);
777
0
        id = obj;
778
0
        obj = NULL;
779
0
      }
780
3
      pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID), id);
781
3
      pdf_drop_obj(ctx, id);
782
3
      id = NULL;
783
3
    }
784
10
  }
785
32
  fz_always(ctx)
786
16
  {
787
16
    fz_free(ctx, list);
788
16
    doc->repair_in_progress = 0;
789
16
  }
790
16
  fz_catch(ctx)
791
6
  {
792
6
    pdf_drop_root_list(ctx, roots);
793
6
    pdf_drop_obj(ctx, encrypt);
794
6
    pdf_drop_obj(ctx, id);
795
6
    pdf_drop_obj(ctx, obj);
796
6
    pdf_drop_obj(ctx, info);
797
6
    if (ctx->throw_on_repair)
798
0
      fz_throw(ctx, FZ_ERROR_REPAIRED, "Error during repair attempt");
799
6
    fz_rethrow(ctx);
800
6
  }
801
802
4
  if (ctx->throw_on_repair)
803
0
  {
804
0
    pdf_drop_root_list(ctx, roots);
805
0
    fz_throw(ctx, FZ_ERROR_REPAIRED, "File repaired");
806
0
  }
807
808
4
  return roots;
809
4
}
810
811
static void
812
pdf_repair_obj_stms(fz_context *ctx, pdf_document *doc)
813
10
{
814
10
  pdf_obj *dict;
815
10
  int i;
816
10
  int xref_len = pdf_xref_len(ctx, doc);
817
818
10.8k
  for (i = 0; i < xref_len; i++)
819
10.8k
  {
820
10.8k
    pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i);
821
822
10.8k
    if (entry->stm_ofs)
823
174
    {
824
174
      dict = pdf_load_object(ctx, doc, i);
825
348
      fz_try(ctx)
826
348
      {
827
174
        if (pdf_name_eq(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Type)), PDF_NAME(ObjStm)))
828
0
          pdf_repair_obj_stm(ctx, doc, i);
829
174
      }
830
348
      fz_always(ctx)
831
174
        pdf_drop_obj(ctx, dict);
832
174
      fz_catch(ctx)
833
0
      {
834
0
        fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
835
0
        fz_report_error(ctx);
836
0
        fz_warn(ctx, "ignoring broken object stream (%d 0 R)", i);
837
0
      }
838
174
    }
839
10.8k
  }
840
841
  /* Ensure that streamed objects reside inside a known non-streamed object */
842
10.8k
  for (i = 0; i < xref_len; i++)
843
10.8k
  {
844
10.8k
    pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i);
845
846
10.8k
    if (entry->type == 'o' && pdf_get_populating_xref_entry(ctx, doc, entry->ofs)->type != 'n')
847
0
    {
848
0
      fz_warn(ctx, "invalid reference to non-object-stream: %d, assuming %d 0 R is a freed object", (int)entry->ofs, i);
849
0
      entry->type = 'f';
850
0
    }
851
10.8k
  }
852
10
}
853
854
static void
855
pdf_repair_roots(fz_context *ctx, pdf_document *doc, pdf_root_list *roots)
856
10
{
857
10
  int i;
858
859
10
  for (i = roots->len-1; i >= 0; i--)
860
3
  {
861
3
    if (pdf_is_indirect(ctx, roots->roots[i]) && pdf_is_dict(ctx, roots->roots[i]))
862
3
    {
863
3
      pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), roots->roots[i]);
864
3
      break;
865
3
    }
866
3
  }
867
10
}
868
869
static void
870
pdf_repair_trailer(fz_context *ctx, pdf_document *doc)
871
10
{
872
10
  int hasroot, hasinfo;
873
10
  pdf_obj *obj, *nobj;
874
10
  pdf_obj *dict = NULL;
875
10
  int i;
876
877
10
  int xref_len = pdf_xref_len(ctx, doc);
878
879
10
  hasroot = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root)) != NULL);
880
10
  hasinfo = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info)) != NULL);
881
882
10
  fz_var(dict);
883
884
20
  fz_try(ctx)
885
20
  {
886
    /* Scan from the end so we have a better chance of finding
887
     * newer objects if there are multiple instances of Info and
888
     * Root objects.
889
     */
890
10.8k
    for (i = xref_len - 1; i > 0 && (!hasinfo || !hasroot); --i)
891
10.8k
    {
892
10.8k
      pdf_xref_entry *entry = pdf_get_xref_entry_no_null(ctx, doc, i);
893
10.8k
      if (entry->type == 0 || entry->type == 'f')
894
10.5k
        continue;
895
896
690
      fz_try(ctx)
897
690
      {
898
345
        dict = pdf_load_object(ctx, doc, i);
899
345
      }
900
690
      fz_catch(ctx)
901
6
      {
902
6
        fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
903
6
        fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
904
6
        fz_report_error(ctx);
905
6
        fz_warn(ctx, "ignoring broken object (%d 0 R)", i);
906
6
        continue;
907
6
      }
908
909
339
      if (!hasroot)
910
303
      {
911
303
        obj = pdf_dict_get(ctx, dict, PDF_NAME(Type));
912
303
        if (obj == PDF_NAME(Catalog))
913
3
        {
914
3
          nobj = pdf_new_indirect(ctx, doc, i, 0);
915
3
          pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), nobj);
916
3
          hasroot = 1;
917
3
        }
918
303
      }
919
920
339
      if (!hasinfo)
921
339
      {
922
339
        if (pdf_dict_get(ctx, dict, PDF_NAME(Creator)) || pdf_dict_get(ctx, dict, PDF_NAME(Producer)))
923
1
        {
924
1
          nobj = pdf_new_indirect(ctx, doc, i, 0);
925
1
          pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), nobj);
926
1
          hasinfo = 1;
927
1
        }
928
339
      }
929
930
339
      pdf_drop_obj(ctx, dict);
931
339
      dict = NULL;
932
339
    }
933
10
  }
934
20
  fz_always(ctx)
935
10
  {
936
    /* ensure that strings are not used in their repaired, non-decrypted form */
937
10
    if (doc->crypt)
938
0
    {
939
0
      pdf_crypt *tmp;
940
0
      pdf_clear_xref(ctx, doc);
941
942
      /* ensure that Encryption dictionary and ID are cached without decryption,
943
         otherwise a decrypted Encryption dictionary and ID may be used when saving
944
         the PDF causing it to be inconsistent (since strings/streams are encrypted
945
         with the actual encryption key, not the decrypted encryption key). */
946
0
      tmp = doc->crypt;
947
0
      doc->crypt = NULL;
948
0
      fz_try(ctx)
949
0
      {
950
0
        (void) pdf_resolve_indirect(ctx, pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt)));
951
0
        (void) pdf_resolve_indirect(ctx, pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID)));
952
0
      }
953
0
      fz_always(ctx)
954
0
        doc->crypt = tmp;
955
0
      fz_catch(ctx)
956
0
      {
957
0
        fz_rethrow(ctx);
958
0
      }
959
0
    }
960
10
  }
961
10
  fz_catch(ctx)
962
0
  {
963
0
    pdf_drop_obj(ctx, dict);
964
0
    fz_rethrow(ctx);
965
0
  }
966
10
}
967
968
/*
 * Run the complete repair sequence on doc.
 *
 * Steps, in order: rebuild the xref by scanning the file, call the
 * optional mid hook, register objects held in object streams, choose a
 * Root from the candidates found during the scan, then fill in any
 * trailer entries still missing. The candidate list is released whether
 * or not a step throws; errors are propagated to the caller.
 *
 * mid: optional callback invoked right after the base xref has been
 *      rebuilt; may be NULL.
 */
void pdf_repair_xref_aux(fz_context *ctx, pdf_document *doc, void (*mid)(fz_context *ctx, pdf_document *doc))
{
  pdf_root_list *candidates = NULL;

  fz_var(candidates);

  fz_try(ctx)
  {
    candidates = pdf_repair_xref_base(ctx, doc);
    if (mid != NULL)
      mid(ctx, doc);
    pdf_repair_obj_stms(ctx, doc);
    pdf_repair_roots(ctx, doc, candidates);
    pdf_repair_trailer(ctx, doc);
  }
  fz_always(ctx)
  {
    pdf_drop_root_list(ctx, candidates);
  }
  fz_catch(ctx)
  {
    fz_rethrow(ctx);
  }
}