Coverage Report

Created: 2024-05-20 06:23

/src/mupdf/source/pdf/pdf-repair.c
Line
Count
Source (jump to first uncovered line)
1
// Copyright (C) 2004-2021 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "mupdf/fitz.h"
24
#include "mupdf/pdf.h"
25
26
#include <string.h>
27
28
/* Scan file for objects and reconstruct xref table */
29
30
/* One object found while scanning the file; collected into a list
 * from which the repaired xref table is rebuilt. */
struct entry
{
  int num;          /* object number taken from '<num> <gen> obj' */
  int gen;          /* generation number taken from '<num> <gen> obj' */
  int64_t ofs;      /* file offset of the object number token */
  int64_t stm_ofs;  /* file offset of the stream data, 0 if no stream was seen */
  int64_t stm_len;  /* stream length found by scanning for 'endstream', negative if not measured */
};
38
39
/*
 * Append a new reference to obj to the growable array of candidate
 * document roots.
 *
 * roots, num_roots and max_roots describe the array (storage, used
 * slots, allocated slots) and are updated in place. The array starts
 * with room for 4 entries and doubles whenever it is full. The stored
 * pointer is a kept reference; the caller's own reference to obj is
 * left untouched.
 */
static void add_root(fz_context *ctx, pdf_obj *obj, pdf_obj ***roots, int *num_roots, int *max_roots)
{
  if (*num_roots == *max_roots)
  {
    int grown = (*max_roots == 0) ? 4 : *max_roots * 2;

    /* Only commit the new capacity once the reallocation succeeded. */
    *roots = fz_realloc_array(ctx, *roots, grown, pdf_obj*);
    *max_roots = grown;
  }

  (*roots)[*num_roots] = pdf_keep_obj(ctx, obj);
  *num_roots += 1;
}
51
52
int
53
pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, int64_t *stmofsp, int64_t *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, int64_t *tmpofs, pdf_obj **root)
54
197k
{
55
197k
  fz_stream *file = doc->file;
56
197k
  pdf_token tok;
57
197k
  int64_t stm_len;
58
197k
  int64_t local_ofs;
59
60
197k
  if (tmpofs == NULL)
61
0
    tmpofs = &local_ofs;
62
197k
  if (stmofsp == NULL)
63
0
    stmofsp = &local_ofs;
64
65
197k
  *stmofsp = 0;
66
197k
  if (stmlenp)
67
197k
    *stmlenp = -1;
68
69
197k
  stm_len = 0;
70
71
197k
  *tmpofs = fz_tell(ctx, file);
72
197k
  if (*tmpofs < 0)
73
0
    fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
74
75
  /* On entry to this function, we know that we've just seen
76
   * '<int> <int> obj'. We expect the next thing we see to be a
77
   * pdf object. Regardless of the type of thing we meet next
78
   * we only need to fully parse it if it is a dictionary. */
79
197k
  tok = pdf_lex(ctx, file, buf);
80
81
  /* Don't let a truncated object at EOF overwrite a good one */
82
197k
  if (tok == PDF_TOK_EOF)
83
0
    fz_throw(ctx, FZ_ERROR_SYNTAX, "truncated object");
84
85
197k
  if (tok == PDF_TOK_OPEN_DICT)
86
173k
  {
87
173k
    pdf_obj *obj, *dict = NULL;
88
89
347k
    fz_try(ctx)
90
347k
    {
91
173k
      dict = pdf_parse_dict(ctx, doc, file, buf);
92
173k
    }
93
347k
    fz_catch(ctx)
94
12.9k
    {
95
12.9k
      fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
96
12.9k
      fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
97
      /* Don't let a broken object at EOF overwrite a good one */
98
12.9k
      if (file->eof)
99
182
        fz_rethrow(ctx);
100
      /* Silently swallow the error */
101
12.7k
      fz_report_error(ctx);
102
12.7k
      dict = pdf_new_dict(ctx, doc, 2);
103
12.7k
    }
104
105
    /* We must be careful not to try to resolve any indirections
106
     * here. We have just read dict, so we know it to be a non
107
     * indirected dictionary. Before we look at any values that
108
     * we get back from looking up in it, we need to check they
109
     * aren't indirected. */
110
111
173k
    if (encrypt || id || root)
112
173k
    {
113
173k
      obj = pdf_dict_get(ctx, dict, PDF_NAME(Type));
114
173k
      if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME(XRef)))
115
1.07k
      {
116
1.07k
        if (encrypt)
117
1.07k
        {
118
1.07k
          obj = pdf_dict_get(ctx, dict, PDF_NAME(Encrypt));
119
1.07k
          if (obj)
120
149
          {
121
149
            pdf_drop_obj(ctx, *encrypt);
122
149
            *encrypt = pdf_keep_obj(ctx, obj);
123
149
          }
124
1.07k
        }
125
126
1.07k
        if (id)
127
1.07k
        {
128
1.07k
          obj = pdf_dict_get(ctx, dict, PDF_NAME(ID));
129
1.07k
          if (obj)
130
1.03k
          {
131
1.03k
            pdf_drop_obj(ctx, *id);
132
1.03k
            *id = pdf_keep_obj(ctx, obj);
133
1.03k
          }
134
1.07k
        }
135
136
1.07k
        if (root)
137
1.07k
          *root = pdf_keep_obj(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Root)));
138
1.07k
      }
139
173k
    }
140
141
173k
    obj = pdf_dict_get(ctx, dict, PDF_NAME(Length));
142
173k
    if (!pdf_is_indirect(ctx, obj) && pdf_is_int(ctx, obj))
143
53.9k
      stm_len = pdf_to_int64(ctx, obj);
144
145
173k
    if (doc->file_reading_linearly && page)
146
0
    {
147
0
      obj = pdf_dict_get(ctx, dict, PDF_NAME(Type));
148
0
      if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME(Page)))
149
0
      {
150
0
        pdf_drop_obj(ctx, *page);
151
0
        *page = pdf_keep_obj(ctx, dict);
152
0
      }
153
0
    }
154
155
173k
    pdf_drop_obj(ctx, dict);
156
173k
  }
157
158
432k
  while ( tok != PDF_TOK_STREAM &&
159
432k
    tok != PDF_TOK_ENDOBJ &&
160
432k
    tok != PDF_TOK_ERROR &&
161
432k
    tok != PDF_TOK_EOF &&
162
432k
    tok != PDF_TOK_INT )
163
234k
  {
164
234k
    *tmpofs = fz_tell(ctx, file);
165
234k
    if (*tmpofs < 0)
166
0
      fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
167
234k
    tok = pdf_lex(ctx, file, buf);
168
234k
  }
169
170
197k
  if (tok == PDF_TOK_STREAM)
171
68.0k
  {
172
68.0k
    int c = fz_read_byte(ctx, file);
173
68.0k
    if (c == '\r') {
174
29.8k
      c = fz_peek_byte(ctx, file);
175
29.8k
      if (c == '\n')
176
29.4k
        fz_read_byte(ctx, file);
177
29.8k
    }
178
179
68.0k
    *stmofsp = fz_tell(ctx, file);
180
68.0k
    if (*stmofsp < 0)
181
0
      fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
182
183
68.0k
    if (stm_len > 0)
184
53.1k
    {
185
53.1k
      fz_seek(ctx, file, *stmofsp + stm_len, 0);
186
106k
      fz_try(ctx)
187
106k
      {
188
53.1k
        tok = pdf_lex(ctx, file, buf);
189
53.1k
      }
190
106k
      fz_catch(ctx)
191
0
      {
192
0
        fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
193
0
        fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
194
0
        fz_report_error(ctx);
195
0
        fz_warn(ctx, "cannot find endstream token, falling back to scanning");
196
0
      }
197
53.1k
      if (tok == PDF_TOK_ENDSTREAM)
198
39.5k
        goto atobjend;
199
13.6k
      fz_seek(ctx, file, *stmofsp, 0);
200
13.6k
    }
201
202
28.5k
    (void)fz_read(ctx, file, (unsigned char *) buf->scratch, 9);
203
204
142M
    while (memcmp(buf->scratch, "endstream", 9) != 0)
205
142M
    {
206
142M
      c = fz_read_byte(ctx, file);
207
142M
      if (c == EOF)
208
7.01k
        break;
209
142M
      memmove(&buf->scratch[0], &buf->scratch[1], 8);
210
142M
      buf->scratch[8] = c;
211
142M
    }
212
213
28.5k
    if (stmlenp)
214
28.5k
      *stmlenp = fz_tell(ctx, file) - *stmofsp - 9;
215
216
68.0k
atobjend:
217
68.0k
    *tmpofs = fz_tell(ctx, file);
218
68.0k
    if (*tmpofs < 0)
219
0
      fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
220
68.0k
    tok = pdf_lex(ctx, file, buf);
221
68.0k
    if (tok != PDF_TOK_ENDOBJ)
222
9.08k
      fz_warn(ctx, "object missing 'endobj' token");
223
58.9k
    else
224
58.9k
    {
225
      /* Read another token as we always return the next one */
226
58.9k
      *tmpofs = fz_tell(ctx, file);
227
58.9k
      if (*tmpofs < 0)
228
0
        fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
229
58.9k
      tok = pdf_lex(ctx, file, buf);
230
58.9k
    }
231
68.0k
  }
232
197k
  return tok;
233
197k
}
234
235
static void
236
pdf_repair_obj_stm(fz_context *ctx, pdf_document *doc, int stm_num)
237
4.87k
{
238
4.87k
  pdf_obj *obj;
239
4.87k
  fz_stream *stm = NULL;
240
4.87k
  pdf_token tok;
241
4.87k
  int i, n, count;
242
4.87k
  pdf_lexbuf buf;
243
244
4.87k
  fz_var(stm);
245
246
4.87k
  pdf_lexbuf_init(ctx, &buf, PDF_LEXBUF_SMALL);
247
248
9.75k
  fz_try(ctx)
249
9.75k
  {
250
4.87k
    obj = pdf_load_object(ctx, doc, stm_num);
251
252
4.87k
    count = pdf_dict_get_int(ctx, obj, PDF_NAME(N));
253
254
4.87k
    pdf_drop_obj(ctx, obj);
255
256
4.87k
    stm = pdf_open_stream_number(ctx, doc, stm_num);
257
258
70.8k
    for (i = 0; i < count; i++)
259
67.0k
    {
260
67.0k
      pdf_xref_entry *entry;
261
262
67.0k
      tok = pdf_lex(ctx, stm, &buf);
263
67.0k
      if (tok != PDF_TOK_INT)
264
981
        fz_throw(ctx, FZ_ERROR_FORMAT, "corrupt object stream (%d 0 R)", stm_num);
265
266
66.0k
      n = buf.i;
267
66.0k
      if (n < 0)
268
55
      {
269
55
        fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i);
270
55
        continue;
271
55
      }
272
66.0k
      else if (n >= pdf_xref_len(ctx, doc))
273
3.46k
      {
274
3.46k
        fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i);
275
3.46k
        continue;
276
3.46k
      }
277
278
62.5k
      entry = pdf_get_populating_xref_entry(ctx, doc, n);
279
62.5k
      entry->ofs = stm_num;
280
62.5k
      entry->gen = i;
281
62.5k
      entry->num = n;
282
62.5k
      entry->stm_ofs = 0;
283
62.5k
      pdf_drop_obj(ctx, entry->obj);
284
62.5k
      entry->obj = NULL;
285
62.5k
      entry->type = 'o';
286
287
62.5k
      tok = pdf_lex(ctx, stm, &buf);
288
62.5k
      if (tok != PDF_TOK_INT)
289
98
        fz_throw(ctx, FZ_ERROR_FORMAT, "corrupt object stream (%d 0 R)", stm_num);
290
62.5k
    }
291
4.87k
  }
292
9.75k
  fz_always(ctx)
293
4.87k
  {
294
4.87k
    fz_drop_stream(ctx, stm);
295
4.87k
    pdf_lexbuf_fin(ctx, &buf);
296
4.87k
  }
297
4.87k
  fz_catch(ctx)
298
1.08k
  {
299
1.08k
    fz_rethrow(ctx);
300
1.08k
  }
301
3.79k
}
302
303
static void
304
orphan_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
305
22.6k
{
306
22.6k
  if (doc->orphans_count == doc->orphans_max)
307
7.81k
  {
308
7.81k
    int new_max = (doc->orphans_max ? doc->orphans_max*2 : 32);
309
310
15.6k
    fz_try(ctx)
311
15.6k
    {
312
7.81k
      doc->orphans = fz_realloc_array(ctx, doc->orphans, new_max, pdf_obj*);
313
7.81k
      doc->orphans_max = new_max;
314
7.81k
    }
315
15.6k
    fz_catch(ctx)
316
0
    {
317
0
      pdf_drop_obj(ctx, obj);
318
0
      fz_rethrow(ctx);
319
0
    }
320
7.81k
  }
321
22.6k
  doc->orphans[doc->orphans_count++] = obj;
322
22.6k
}
323
324
/* Return 1 if c is one of the six bytes treated as whitespace here
 * (NUL, TAB, LF, FF, CR, SPACE), 0 otherwise. */
static int is_white(int c)
{
  switch (c)
  {
  case '\x00':
  case '\x09':
  case '\x0a':
  case '\x0c':
  case '\x0d':
  case '\x20':
    return 1;
  default:
    return 0;
  }
}
328
329
void
330
pdf_repair_xref(fz_context *ctx, pdf_document *doc)
331
11.4k
{
332
11.4k
  pdf_obj *dict, *obj = NULL;
333
11.4k
  pdf_obj *length;
334
335
11.4k
  pdf_obj *encrypt = NULL;
336
11.4k
  pdf_obj *id = NULL;
337
11.4k
  pdf_obj **roots = NULL;
338
11.4k
  pdf_obj *info = NULL;
339
340
11.4k
  struct entry *list = NULL;
341
11.4k
  int listlen;
342
11.4k
  int listcap;
343
11.4k
  int maxnum = 0;
344
345
11.4k
  int num = 0;
346
11.4k
  int gen = 0;
347
11.4k
  int64_t tmpofs, stm_ofs, numofs = 0, genofs = 0;
348
11.4k
  int64_t stm_len;
349
11.4k
  pdf_token tok;
350
11.4k
  int next;
351
11.4k
  int i;
352
11.4k
  size_t j, n;
353
11.4k
  int c;
354
11.4k
  pdf_lexbuf *buf = &doc->lexbuf.base;
355
11.4k
  int num_roots = 0;
356
11.4k
  int max_roots = 0;
357
358
11.4k
  fz_var(encrypt);
359
11.4k
  fz_var(id);
360
11.4k
  fz_var(roots);
361
11.4k
  fz_var(num_roots);
362
11.4k
  fz_var(max_roots);
363
11.4k
  fz_var(info);
364
11.4k
  fz_var(list);
365
11.4k
  fz_var(obj);
366
367
11.4k
  if (!doc->is_fdf)
368
11.4k
    fz_warn(ctx, "repairing PDF document");
369
370
11.4k
  if (doc->repair_attempted)
371
121
    fz_throw(ctx, FZ_ERROR_FORMAT, "Repair failed already - not trying again");
372
373
11.3k
  doc->repair_attempted = 1;
374
11.3k
  doc->repair_in_progress = 1;
375
376
11.3k
  pdf_drop_page_tree_internal(ctx, doc);
377
11.3k
  doc->page_tree_broken = 0;
378
11.3k
  pdf_forget_xref(ctx, doc);
379
380
11.3k
  fz_seek(ctx, doc->file, 0, 0);
381
382
22.7k
  fz_try(ctx)
383
22.7k
  {
384
11.3k
    pdf_xref_entry *entry;
385
11.3k
    listlen = 0;
386
11.3k
    listcap = 1024;
387
11.3k
    list = fz_malloc_array(ctx, listcap, struct entry);
388
389
    /* look for '%PDF' version marker within first kilobyte of file */
390
11.3k
    n = fz_read(ctx, doc->file, (unsigned char *)buf->scratch, fz_minz(buf->size, 1024));
391
392
11.3k
    fz_seek(ctx, doc->file, 0, 0);
393
11.3k
    if (n >= 4)
394
11.3k
    {
395
4.25M
      for (j = 0; j < n - 4; j++)
396
4.25M
      {
397
4.25M
        if (memcmp(&buf->scratch[j], "%PDF", 4) == 0 || memcmp(&buf->scratch[j], "%FDF", 4) == 0)
398
7.42k
        {
399
7.42k
          fz_seek(ctx, doc->file, (int64_t)(j + 8), 0); /* skip "%PDF-X.Y" */
400
7.42k
          break;
401
7.42k
        }
402
4.25M
      }
403
11.3k
    }
404
405
    /* skip comment line after version marker since some generators
406
     * forget to terminate the comment with a newline */
407
11.3k
    c = fz_read_byte(ctx, doc->file);
408
18.9k
    while (c >= 0 && (c == ' ' || c == '%'))
409
7.62k
      c = fz_read_byte(ctx, doc->file);
410
11.3k
    if (c != EOF)
411
11.3k
      fz_unread_byte(ctx, doc->file);
412
413
2.69M
    while (1)
414
2.69M
    {
415
2.69M
      tmpofs = fz_tell(ctx, doc->file);
416
2.69M
      if (tmpofs < 0)
417
0
        fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
418
419
5.38M
      fz_try(ctx)
420
5.38M
        tok = pdf_lex_no_string(ctx, doc->file, buf);
421
5.38M
      fz_catch(ctx)
422
0
      {
423
0
        fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
424
0
        fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
425
0
        fz_report_error(ctx);
426
0
        fz_warn(ctx, "skipping ahead to next token");
427
0
        do
428
0
          c = fz_read_byte(ctx, doc->file);
429
0
        while (c != EOF && !is_white(c));
430
0
        if (c == EOF)
431
0
          tok = PDF_TOK_EOF;
432
0
        else
433
0
          continue;
434
0
      }
435
436
      /* If we have the next token already, then we'll jump
437
       * back here, rather than going through the top of
438
       * the loop. */
439
2.88M
    have_next_token:
440
441
2.88M
      if (tok == PDF_TOK_INT)
442
1.03M
      {
443
1.03M
        if (buf->i < 0)
444
4.90k
        {
445
4.90k
          num = 0;
446
4.90k
          gen = 0;
447
4.90k
          continue;
448
4.90k
        }
449
1.03M
        numofs = genofs;
450
1.03M
        num = gen;
451
1.03M
        genofs = tmpofs;
452
1.03M
        gen = buf->i;
453
1.03M
      }
454
455
1.84M
      else if (tok == PDF_TOK_OBJ)
456
197k
      {
457
197k
        pdf_obj *root = NULL;
458
459
395k
        fz_try(ctx)
460
395k
        {
461
197k
          stm_len = 0;
462
197k
          stm_ofs = 0;
463
197k
          tok = pdf_repair_obj(ctx, doc, buf, &stm_ofs, &stm_len, &encrypt, &id, NULL, &tmpofs, &root);
464
197k
          if (root)
465
965
            add_root(ctx, root, &roots, &num_roots, &max_roots);
466
197k
        }
467
395k
        fz_always(ctx)
468
197k
        {
469
197k
          pdf_drop_obj(ctx, root);
470
197k
        }
471
197k
        fz_catch(ctx)
472
182
        {
473
182
          fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
474
182
          fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
475
          /* If we haven't seen a root yet, there is nothing
476
           * we can do, but give up. Otherwise, we'll make
477
           * do. */
478
182
          if (!roots)
479
76
            fz_rethrow(ctx);
480
106
          fz_report_error(ctx);
481
106
          fz_warn(ctx, "cannot parse object (%d %d R) - ignoring rest of file", num, gen);
482
106
          break;
483
182
        }
484
485
197k
        if (num <= 0 || num > PDF_MAX_OBJECT_NUMBER)
486
3.98k
        {
487
3.98k
          fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", num, gen);
488
3.98k
          goto have_next_token;
489
3.98k
        }
490
491
193k
        gen = fz_clampi(gen, 0, 65535);
492
493
193k
        if (listlen + 1 == listcap)
494
6
        {
495
6
          listcap = (listcap * 3) / 2;
496
6
          list = fz_realloc_array(ctx, list, listcap, struct entry);
497
6
        }
498
499
193k
        list[listlen].num = num;
500
193k
        list[listlen].gen = gen;
501
193k
        list[listlen].ofs = numofs;
502
193k
        list[listlen].stm_ofs = stm_ofs;
503
193k
        list[listlen].stm_len = stm_len;
504
193k
        listlen ++;
505
506
193k
        if (num > maxnum)
507
111k
          maxnum = num;
508
509
193k
        goto have_next_token;
510
197k
      }
511
512
      /* If we find a dictionary it is probably the trailer,
513
       * but could be a stream (or bogus) dictionary caused
514
       * by a corrupt file. */
515
1.65M
      else if (tok == PDF_TOK_OPEN_DICT)
516
9.33k
      {
517
9.33k
        pdf_obj *dictobj;
518
519
18.6k
        fz_try(ctx)
520
18.6k
        {
521
9.33k
          dict = pdf_parse_dict(ctx, doc, doc->file, buf);
522
9.33k
        }
523
18.6k
        fz_catch(ctx)
524
3.73k
        {
525
3.73k
          fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
526
3.73k
          fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
527
          /* If this was the real trailer dict
528
           * it was broken, in which case we are
529
           * in trouble. Keep going though in
530
           * case this was just a bogus dict. */
531
3.73k
          fz_report_error(ctx);
532
3.73k
          continue;
533
3.73k
        }
534
535
11.1k
        fz_try(ctx)
536
11.1k
        {
537
5.59k
          dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Encrypt));
538
5.59k
          if (dictobj)
539
838
          {
540
838
            pdf_drop_obj(ctx, encrypt);
541
838
            encrypt = pdf_keep_obj(ctx, dictobj);
542
838
          }
543
544
5.59k
          dictobj = pdf_dict_get(ctx, dict, PDF_NAME(ID));
545
5.59k
          if (dictobj && (!id || !encrypt || pdf_dict_get(ctx, dict, PDF_NAME(Encrypt))))
546
714
          {
547
714
            pdf_drop_obj(ctx, id);
548
714
            id = pdf_keep_obj(ctx, dictobj);
549
714
          }
550
551
5.59k
          dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Root));
552
5.59k
          if (dictobj)
553
1.46k
            add_root(ctx, dictobj, &roots, &num_roots, &max_roots);
554
555
5.59k
          dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Info));
556
5.59k
          if (dictobj)
557
703
          {
558
703
            pdf_drop_obj(ctx, info);
559
703
            info = pdf_keep_obj(ctx, dictobj);
560
703
          }
561
5.59k
        }
562
11.1k
        fz_always(ctx)
563
5.59k
          pdf_drop_obj(ctx, dict);
564
5.59k
        fz_catch(ctx)
565
0
          fz_rethrow(ctx);
566
5.59k
      }
567
568
1.64M
      else if (tok == PDF_TOK_EOF)
569
11.1k
      {
570
11.1k
        break;
571
11.1k
      }
572
573
1.63M
      else
574
1.63M
      {
575
1.63M
        num = 0;
576
1.63M
        gen = 0;
577
1.63M
      }
578
2.88M
    }
579
580
11.2k
    if (listlen == 0)
581
280
      fz_throw(ctx, FZ_ERROR_FORMAT, "no objects found");
582
583
    /* make xref reasonable */
584
585
    /*
586
      Dummy access to entry to assure sufficient space in the xref table
587
      and avoid repeated reallocs in the loop
588
    */
589
    /* Ensure that the first xref table is a 'solid' one from
590
     * 0 to maxnum. */
591
10.9k
    pdf_ensure_solid_xref(ctx, doc, maxnum);
592
593
33.5M
    for (i = 1; i < maxnum; i++)
594
33.4M
    {
595
33.4M
      entry = pdf_get_populating_xref_entry(ctx, doc, i);
596
33.4M
      if (entry->obj != NULL)
597
0
        continue;
598
33.4M
      entry->type = 'f';
599
33.4M
      entry->ofs = 0;
600
33.4M
      entry->gen = 0;
601
33.4M
      entry->num = 0;
602
603
33.4M
      entry->stm_ofs = 0;
604
33.4M
    }
605
606
203k
    for (i = 0; i < listlen; i++)
607
192k
    {
608
192k
      entry = pdf_get_populating_xref_entry(ctx, doc, list[i].num);
609
192k
      entry->type = 'n';
610
192k
      entry->ofs = list[i].ofs;
611
192k
      entry->gen = list[i].gen;
612
192k
      entry->num = list[i].num;
613
614
192k
      entry->stm_ofs = list[i].stm_ofs;
615
616
      /* correct stream length for unencrypted documents */
617
192k
      if (!encrypt && list[i].stm_len >= 0)
618
27.7k
      {
619
27.7k
        pdf_obj *old_obj = NULL;
620
27.7k
        dict = pdf_load_object(ctx, doc, list[i].num);
621
622
55.4k
        fz_try(ctx)
623
55.4k
        {
624
27.7k
          length = pdf_new_int(ctx, list[i].stm_len);
625
27.7k
          pdf_dict_get_put_drop(ctx, dict, PDF_NAME(Length), length, &old_obj);
626
27.7k
          if (old_obj)
627
22.6k
            orphan_object(ctx, doc, old_obj);
628
27.7k
        }
629
55.4k
        fz_always(ctx)
630
27.7k
          pdf_drop_obj(ctx, dict);
631
27.7k
        fz_catch(ctx)
632
1
          fz_rethrow(ctx);
633
27.7k
      }
634
192k
    }
635
636
10.9k
    entry = pdf_get_populating_xref_entry(ctx, doc, 0);
637
10.9k
    entry->type = 'f';
638
10.9k
    entry->ofs = 0;
639
10.9k
    entry->gen = 65535;
640
10.9k
    entry->num = 0;
641
10.9k
    entry->stm_ofs = 0;
642
643
10.9k
    next = 0;
644
33.4M
    for (i = pdf_xref_len(ctx, doc) - 1; i >= 0; i--)
645
33.4M
    {
646
33.4M
      entry = pdf_get_populating_xref_entry(ctx, doc, i);
647
33.4M
      if (entry->type == 'f')
648
33.2M
      {
649
33.2M
        entry->ofs = next;
650
33.2M
        if (entry->gen < 65535)
651
33.2M
          entry->gen ++;
652
33.2M
        next = i;
653
33.2M
      }
654
33.4M
    }
655
656
    /* create a repaired trailer, Root will be added later */
657
658
10.9k
    obj = pdf_new_dict(ctx, doc, 5);
659
    /* During repair there is only a single xref section */
660
10.9k
    pdf_set_populating_xref_trailer(ctx, doc, obj);
661
10.9k
    pdf_drop_obj(ctx, obj);
662
10.9k
    obj = NULL;
663
664
10.9k
    pdf_dict_put_int(ctx, pdf_trailer(ctx, doc), PDF_NAME(Size), maxnum + 1);
665
666
10.9k
    if (roots)
667
1.74k
    {
668
1.94k
      for (i = num_roots-1; i > 0; i--)
669
536
      {
670
536
        if (pdf_is_dict(ctx, roots[i]))
671
338
          break;
672
536
      }
673
1.74k
      if (i >= 0)
674
1.74k
      {
675
1.74k
        pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), roots[i]);
676
1.74k
      }
677
1.74k
    }
678
10.9k
    if (info)
679
589
    {
680
589
      pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), info);
681
589
      pdf_drop_obj(ctx, info);
682
589
      info = NULL;
683
589
    }
684
685
10.9k
    if (encrypt)
686
249
    {
687
249
      if (pdf_is_indirect(ctx, encrypt))
688
191
      {
689
        /* create new reference with non-NULL xref pointer */
690
191
        obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, encrypt), pdf_to_gen(ctx, encrypt));
691
191
        pdf_drop_obj(ctx, encrypt);
692
191
        encrypt = obj;
693
191
        obj = NULL;
694
191
      }
695
249
      pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt), encrypt);
696
249
      pdf_drop_obj(ctx, encrypt);
697
249
      encrypt = NULL;
698
249
    }
699
700
10.9k
    if (id)
701
1.14k
    {
702
1.14k
      if (pdf_is_indirect(ctx, id))
703
1
      {
704
        /* create new reference with non-NULL xref pointer */
705
1
        obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, id), pdf_to_gen(ctx, id));
706
1
        pdf_drop_obj(ctx, id);
707
1
        id = obj;
708
1
        obj = NULL;
709
1
      }
710
1.14k
      pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID), id);
711
1.14k
      pdf_drop_obj(ctx, id);
712
1.14k
      id = NULL;
713
1.14k
    }
714
10.9k
  }
715
22.7k
  fz_always(ctx)
716
11.3k
  {
717
13.7k
    for (i = 0; i < num_roots; i++)
718
2.42k
      pdf_drop_obj(ctx, roots[i]);
719
11.3k
    fz_free(ctx, roots);
720
11.3k
    fz_free(ctx, list);
721
11.3k
    doc->repair_in_progress = 0;
722
11.3k
  }
723
11.3k
  fz_catch(ctx)
724
371
  {
725
371
    pdf_drop_obj(ctx, encrypt);
726
371
    pdf_drop_obj(ctx, id);
727
371
    pdf_drop_obj(ctx, obj);
728
371
    pdf_drop_obj(ctx, info);
729
371
    if (ctx->throw_on_repair)
730
1
      fz_throw(ctx, FZ_ERROR_REPAIRED, "Error during repair attempt");
731
370
    fz_rethrow(ctx);
732
371
  }
733
734
10.6k
  if (ctx->throw_on_repair)
735
6
    fz_throw(ctx, FZ_ERROR_REPAIRED, "File repaired");
736
10.6k
}
737
738
void
739
pdf_repair_obj_stms(fz_context *ctx, pdf_document *doc)
740
11.0k
{
741
11.0k
  pdf_obj *dict;
742
11.0k
  int i;
743
11.0k
  int xref_len = pdf_xref_len(ctx, doc);
744
745
33.4M
  for (i = 0; i < xref_len; i++)
746
33.4M
  {
747
33.4M
    pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i);
748
749
33.4M
    if (entry->stm_ofs)
750
62.2k
    {
751
62.2k
      dict = pdf_load_object(ctx, doc, i);
752
124k
      fz_try(ctx)
753
124k
      {
754
62.1k
        if (pdf_name_eq(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Type)), PDF_NAME(ObjStm)))
755
4.87k
          pdf_repair_obj_stm(ctx, doc, i);
756
62.1k
      }
757
124k
      fz_always(ctx)
758
62.1k
        pdf_drop_obj(ctx, dict);
759
62.1k
      fz_catch(ctx)
760
1.08k
      {
761
1.08k
        fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
762
1.08k
        fz_report_error(ctx);
763
1.08k
        fz_warn(ctx, "ignoring broken object stream (%d 0 R)", i);
764
1.08k
      }
765
62.2k
    }
766
33.4M
  }
767
768
  /* Ensure that streamed objects reside inside a known non-streamed object */
769
33.4M
  for (i = 0; i < xref_len; i++)
770
33.4M
  {
771
33.4M
    pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i);
772
773
33.4M
    if (entry->type == 'o' && pdf_get_populating_xref_entry(ctx, doc, entry->ofs)->type != 'n')
774
0
      fz_throw(ctx, FZ_ERROR_FORMAT, "invalid reference to non-object-stream: %d (%d 0 R)", (int)entry->ofs, i);
775
33.4M
  }
776
11.0k
}