/src/mupdf/source/pdf/pdf-repair.c
Line | Count | Source |
1 | | // Copyright (C) 2004-2025 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "mupdf/fitz.h" |
24 | | #include "pdf-imp.h" |
25 | | |
26 | | #include <string.h> |
27 | | |
28 | | /* Scan file for objects and reconstruct xref table */ |
29 | | |
/* One object found while scanning the file; collected in a list and later
 * copied into the rebuilt xref table. */
struct entry
{
	int num;		/* object number from the '<num> <gen> obj' header */
	int gen;		/* generation number from the same header */
	int64_t ofs;		/* file offset recorded for the object number token */
	int64_t stm_ofs;	/* offset of the stream data as reported by pdf_repair_obj; 0 when no stream was seen */
	int64_t stm_len;	/* stream length found by scanning for 'endstream'; -1 when no rescan took place */
};
38 | | |
/* Growable array of candidate Root objects gathered during the scan. */
typedef struct
{
	int max;		/* number of slots allocated in 'roots' */
	int len;		/* number of slots in use */
	pdf_obj **roots;	/* kept references; released by pdf_drop_root_list */
} pdf_root_list;
45 | | |
46 | | static void |
47 | | add_root(fz_context *ctx, pdf_root_list *roots, pdf_obj *obj) |
48 | 3 | { |
49 | 3 | if (roots->max == roots->len) |
50 | 3 | { |
51 | 3 | int new_max_roots = roots->max * 2; |
52 | 3 | if (new_max_roots == 0) |
53 | 3 | new_max_roots = 4; |
54 | 3 | roots->roots = fz_realloc(ctx, roots->roots, new_max_roots * sizeof(roots->roots[0])); |
55 | 3 | roots->max = new_max_roots; |
56 | 3 | } |
57 | 3 | roots->roots[roots->len] = pdf_keep_obj(ctx, obj); |
58 | 3 | roots->len++; |
59 | 3 | } |
60 | | |
61 | | static pdf_root_list * |
62 | | fz_new_root_list(fz_context *ctx) |
63 | 16 | { |
64 | 16 | return fz_malloc_struct(ctx, pdf_root_list); |
65 | 16 | } |
66 | | |
67 | | static void |
68 | | pdf_drop_root_list(fz_context *ctx, pdf_root_list *roots) |
69 | 23 | { |
70 | 23 | int i, n; |
71 | | |
72 | 23 | if (roots == NULL) |
73 | 7 | return; |
74 | | |
75 | 16 | n = roots->len; |
76 | 19 | for (i = 0; i < n; i++) |
77 | 3 | pdf_drop_obj(ctx, roots->roots[i]); |
78 | 16 | fz_free(ctx, roots->roots); |
79 | 16 | fz_free(ctx, roots); |
80 | 16 | } |
81 | | |
/*
	Scan over the body of one object, starting just after its
	'<int> <int> obj' header, and collect what the repair code needs
	to know about it.

	buf: lexer buffer; its scratch area is also used for the
		'endstream' search.
	stmofsp: receives the offset of the stream data, or 0 if the
		object has no stream. May be NULL.
	stmlenp: set to -1 on entry; receives the measured stream length
		only when the stream end had to be found by scanning for
		'endstream'. May be NULL.
	encrypt, id: if non-NULL and the object is a dictionary with
		/Type /XRef, the old *encrypt / *id is dropped and replaced
		by a kept reference to its Encrypt / ID value when present.
	root: if non-NULL and the object is a /Type /XRef dictionary,
		*root is overwritten with a kept reference to its Root value
		(NULL if absent). The previous *root is not dropped.
	page: if non-NULL and the document is being read linearly,
		receives a kept reference to the dictionary when it has
		/Type /Page.
	tmpofs: receives the file offset at which the returned token
		started. May be NULL.

	Returns the last token lexed: the token that ended the scan of a
	non-stream object, or, for a stream object, the token after
	'endobj' (or the unexpected token found in its place).

	Throws FZ_ERROR_SYNTAX if the object is truncated at EOF, and
	FZ_ERROR_SYSTEM if the file position cannot be determined.
*/
int
pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, int64_t *stmofsp, int64_t *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, int64_t *tmpofs, pdf_obj **root)
{
	fz_stream *file = doc->file;
	pdf_token tok;
	int64_t stm_len;
	int64_t local_ofs;

	/* Let the rest of the function write through these pointers
	 * unconditionally, even if the caller is not interested. */
	if (tmpofs == NULL)
		tmpofs = &local_ofs;
	if (stmofsp == NULL)
		stmofsp = &local_ofs;

	*stmofsp = 0;
	if (stmlenp)
		*stmlenp = -1;

	/* Length declared by the dictionary; stays 0 if none is found. */
	stm_len = 0;

	*tmpofs = fz_tell(ctx, file);
	if (*tmpofs < 0)
		fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");

	/* On entry to this function, we know that we've just seen
	 * '<int> <int> obj'. We expect the next thing we see to be a
	 * pdf object. Regardless of the type of thing we meet next
	 * we only need to fully parse it if it is a dictionary. */
	tok = pdf_lex(ctx, file, buf);

	/* Don't let a truncated object at EOF overwrite a good one */
	if (tok == PDF_TOK_EOF)
		fz_throw(ctx, FZ_ERROR_SYNTAX, "truncated object");

	if (tok == PDF_TOK_OPEN_DICT)
	{
		pdf_obj *obj, *dict = NULL;

		fz_try(ctx)
		{
			dict = pdf_parse_dict(ctx, doc, file, buf);
		}
		fz_catch(ctx)
		{
			fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
			fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
			/* Don't let a broken object at EOF overwrite a good one */
			if (file->eof)
				fz_rethrow(ctx);
			/* Silently swallow the error and carry on with an
			 * empty dictionary in place of the broken one. */
			fz_report_error(ctx);
			dict = pdf_new_dict(ctx, doc, 2);
		}

		/* We must be careful not to try to resolve any indirections
		 * here. We have just read dict, so we know it to be a non
		 * indirected dictionary. Before we look at any values that
		 * we get back from looking up in it, we need to check they
		 * aren't indirected. */

		if (encrypt || id || root)
		{
			/* Only a cross reference stream dictionary (/Type /XRef)
			 * is consulted for Encrypt, ID and Root here. */
			obj = pdf_dict_get(ctx, dict, PDF_NAME(Type));
			if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME(XRef)))
			{
				if (encrypt)
				{
					obj = pdf_dict_get(ctx, dict, PDF_NAME(Encrypt));
					if (obj)
					{
						pdf_drop_obj(ctx, *encrypt);
						*encrypt = pdf_keep_obj(ctx, obj);
					}
				}

				if (id)
				{
					obj = pdf_dict_get(ctx, dict, PDF_NAME(ID));
					if (obj)
					{
						pdf_drop_obj(ctx, *id);
						*id = pdf_keep_obj(ctx, obj);
					}
				}

				/* NOTE(review): unlike encrypt/id, the previous *root is
				 * not dropped and is overwritten even when Root is
				 * absent; callers are expected to pass *root == NULL. */
				if (root)
					*root = pdf_keep_obj(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Root)));
			}
		}

		/* Use the declared Length only if it is a direct integer. */
		obj = pdf_dict_get(ctx, dict, PDF_NAME(Length));
		if (!pdf_is_indirect(ctx, obj) && pdf_is_int(ctx, obj))
			stm_len = pdf_to_int64(ctx, obj);

		if (doc->file_reading_linearly && page)
		{
			obj = pdf_dict_get(ctx, dict, PDF_NAME(Type));
			if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME(Page)))
			{
				pdf_drop_obj(ctx, *page);
				*page = pdf_keep_obj(ctx, dict);
			}
		}

		pdf_drop_obj(ctx, dict);
	}

	/* Skip tokens until something that can end this object (or begin
	 * the next one) turns up, recording where each token started. */
	while ( tok != PDF_TOK_STREAM &&
		tok != PDF_TOK_ENDOBJ &&
		tok != PDF_TOK_ERROR &&
		tok != PDF_TOK_EOF &&
		tok != PDF_TOK_INT )
	{
		*tmpofs = fz_tell(ctx, file);
		if (*tmpofs < 0)
			fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
		tok = pdf_lex(ctx, file, buf);
	}

	if (tok == PDF_TOK_STREAM)
	{
		/* Consume the byte after the 'stream' keyword, and the '\n'
		 * of a "\r\n" pair, so the offset points at the data. */
		int c = fz_read_byte(ctx, file);
		if (c == '\r') {
			c = fz_peek_byte(ctx, file);
			if (c == '\n')
				fz_read_byte(ctx, file);
		}

		*stmofsp = fz_tell(ctx, file);
		if (*stmofsp < 0)
			fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");

		if (stm_len > 0)
		{
			/* Trust the declared length first: jump past the data
			 * and check that 'endstream' really follows. */
			fz_seek(ctx, file, *stmofsp + stm_len, 0);
			fz_try(ctx)
			{
				tok = pdf_lex(ctx, file, buf);
			}
			fz_catch(ctx)
			{
				fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
				fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
				fz_report_error(ctx);
				fz_warn(ctx, "cannot find endstream token, falling back to scanning");
			}
			if (tok == PDF_TOK_ENDSTREAM)
				goto atobjend;
			/* The declared length was wrong; rewind to the start of
			 * the data and search for the keyword instead. */
			fz_seek(ctx, file, *stmofsp, 0);
		}

		/* Slide a 9 byte window over the data one byte at a time
		 * until it holds "endstream" or the file runs out. */
		(void)fz_read(ctx, file, (unsigned char *) buf->scratch, 9);

		while (memcmp(buf->scratch, "endstream", 9) != 0)
		{
			c = fz_read_byte(ctx, file);
			if (c == EOF)
				break;
			memmove(&buf->scratch[0], &buf->scratch[1], 8);
			buf->scratch[8] = c;
		}

		/* The measured length excludes the 9 bytes of the keyword. */
		if (stmlenp)
			*stmlenp = fz_tell(ctx, file) - *stmofsp - 9;

atobjend:
		*tmpofs = fz_tell(ctx, file);
		if (*tmpofs < 0)
			fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
		tok = pdf_lex(ctx, file, buf);
		if (tok != PDF_TOK_ENDOBJ)
			fz_warn(ctx, "object missing 'endobj' token");
		else
		{
			/* Read another token as we always return the next one */
			*tmpofs = fz_tell(ctx, file);
			if (*tmpofs < 0)
				fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file");
			tok = pdf_lex(ctx, file, buf);
		}
	}
	return tok;
}
264 | | |
265 | | static int64_t |
266 | | entry_offset(fz_context *ctx, pdf_document *doc, int num) |
267 | 0 | { |
268 | 0 | pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, num); |
269 | |
|
270 | 0 | if (entry->type == 0 || entry->type == 'f') |
271 | 0 | return 0; |
272 | 0 | if (entry->type == 'n') |
273 | 0 | return entry->ofs; |
274 | 0 | assert(entry->type == 'o'); |
275 | | |
276 | | /* It must be in a stream. Return the entry of that stream. */ |
277 | 0 | entry = pdf_get_populating_xref_entry(ctx, doc, entry->ofs); |
278 | | /* If it's NOT in a stream, then we'll invalidate this entry in a moment. |
279 | | * For now, just return an illegal offset. */ |
280 | 0 | if (entry->type != 'n') |
281 | 0 | return -1; |
282 | | |
283 | 0 | return entry->ofs; |
284 | 0 | } |
285 | | |
286 | | static void |
287 | | pdf_repair_obj_stm(fz_context *ctx, pdf_document *doc, int stm_num) |
288 | 0 | { |
289 | 0 | pdf_obj *obj; |
290 | 0 | fz_stream *stm = NULL; |
291 | 0 | pdf_token tok; |
292 | 0 | int i, n, count; |
293 | 0 | pdf_lexbuf buf; |
294 | |
|
295 | 0 | fz_var(stm); |
296 | |
|
297 | 0 | pdf_lexbuf_init(ctx, &buf, PDF_LEXBUF_SMALL); |
298 | |
|
299 | 0 | fz_try(ctx) |
300 | 0 | { |
301 | 0 | obj = pdf_load_object(ctx, doc, stm_num); |
302 | |
|
303 | 0 | count = pdf_dict_get_int(ctx, obj, PDF_NAME(N)); |
304 | |
|
305 | 0 | pdf_drop_obj(ctx, obj); |
306 | |
|
307 | 0 | stm = pdf_open_stream_number(ctx, doc, stm_num); |
308 | |
|
309 | 0 | for (i = 0; i < count; i++) |
310 | 0 | { |
311 | 0 | pdf_xref_entry *entry; |
312 | 0 | int replace; |
313 | |
|
314 | 0 | tok = pdf_lex(ctx, stm, &buf); |
315 | 0 | if (tok != PDF_TOK_INT) |
316 | 0 | fz_throw(ctx, FZ_ERROR_FORMAT, "corrupt object stream (%d 0 R)", stm_num); |
317 | | |
318 | 0 | n = buf.i; |
319 | 0 | if (n < 0) |
320 | 0 | { |
321 | 0 | fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i); |
322 | 0 | continue; |
323 | 0 | } |
324 | 0 | else if (n >= PDF_MAX_OBJECT_NUMBER) |
325 | 0 | { |
326 | 0 | fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i); |
327 | 0 | continue; |
328 | 0 | } |
329 | | |
330 | 0 | entry = pdf_get_populating_xref_entry(ctx, doc, n); |
331 | | |
332 | | /* Bug 708286: Do not allow an object from an ObjStm to override an object |
333 | | * that isn't in an ObjStm that we've already read, that occurs after it |
334 | | * in the file. */ |
335 | 0 | replace = 1; |
336 | 0 | if (entry->type != 0 && entry->type != 'f') |
337 | 0 | { |
338 | 0 | int64_t existing_entry_offset = entry_offset(ctx, doc, n); |
339 | |
|
340 | 0 | if (existing_entry_offset < 0) |
341 | 0 | { |
342 | | /* The existing entry is invalid. Anything must be better than that! */ |
343 | 0 | } |
344 | 0 | else |
345 | 0 | { |
346 | 0 | int64_t this_entry_offset = entry_offset(ctx, doc, stm_num); |
347 | |
|
348 | 0 | if (existing_entry_offset > this_entry_offset) |
349 | 0 | replace = 0; |
350 | 0 | } |
351 | 0 | } |
352 | |
|
353 | 0 | if (replace) |
354 | 0 | { |
355 | 0 | entry->ofs = stm_num; |
356 | 0 | entry->gen = i; |
357 | 0 | entry->num = n; |
358 | 0 | entry->stm_ofs = 0; |
359 | 0 | pdf_drop_obj(ctx, entry->obj); |
360 | 0 | entry->obj = NULL; |
361 | 0 | entry->type = 'o'; |
362 | 0 | } |
363 | |
|
364 | 0 | tok = pdf_lex(ctx, stm, &buf); |
365 | 0 | if (tok != PDF_TOK_INT) |
366 | 0 | fz_throw(ctx, FZ_ERROR_FORMAT, "corrupt object stream (%d 0 R)", stm_num); |
367 | 0 | } |
368 | 0 | } |
369 | 0 | fz_always(ctx) |
370 | 0 | { |
371 | 0 | fz_drop_stream(ctx, stm); |
372 | 0 | pdf_lexbuf_fin(ctx, &buf); |
373 | 0 | } |
374 | 0 | fz_catch(ctx) |
375 | 0 | { |
376 | 0 | fz_rethrow(ctx); |
377 | 0 | } |
378 | 0 | } |
379 | | |
380 | | static void |
381 | | orphan_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj) |
382 | 30 | { |
383 | 30 | if (doc->orphans_count == doc->orphans_max) |
384 | 10 | { |
385 | 10 | int new_max = (doc->orphans_max ? doc->orphans_max*2 : 32); |
386 | | |
387 | 20 | fz_try(ctx) |
388 | 20 | { |
389 | 10 | doc->orphans = fz_realloc_array(ctx, doc->orphans, new_max, pdf_obj*); |
390 | 10 | doc->orphans_max = new_max; |
391 | 10 | } |
392 | 20 | fz_catch(ctx) |
393 | 0 | { |
394 | 0 | pdf_drop_obj(ctx, obj); |
395 | 0 | fz_rethrow(ctx); |
396 | 0 | } |
397 | 10 | } |
398 | 30 | doc->orphans[doc->orphans_count++] = obj; |
399 | 30 | } |
400 | | |
/* Return 1 if c is one of the bytes NUL, TAB, LF, FF, CR or SPACE,
 * and 0 for anything else (including EOF). */
static int is_white(int c)
{
	switch (c)
	{
	case '\x00':
	case '\x09':
	case '\x0a':
	case '\x0c':
	case '\x0d':
	case '\x20':
		return 1;
	default:
		return 0;
	}
}
405 | | |
406 | | static pdf_root_list * |
407 | | pdf_repair_xref_base(fz_context *ctx, pdf_document *doc) |
408 | 16 | { |
409 | 16 | pdf_obj *dict, *obj = NULL; |
410 | 16 | pdf_obj *length; |
411 | | |
412 | 16 | pdf_obj *encrypt = NULL; |
413 | 16 | pdf_obj *id = NULL; |
414 | 16 | pdf_obj *info = NULL; |
415 | 16 | pdf_root_list *roots = NULL; |
416 | | |
417 | 16 | struct entry *list = NULL; |
418 | 16 | int listlen; |
419 | 16 | int listcap; |
420 | 16 | int maxnum = 0; |
421 | | |
422 | 16 | int num = 0; |
423 | 16 | int gen = 0; |
424 | 16 | int64_t tmpofs, stm_ofs, numofs = 0, genofs = 0; |
425 | 16 | int64_t stm_len; |
426 | 16 | pdf_token tok; |
427 | 16 | int next; |
428 | 16 | int i; |
429 | 16 | size_t j, n; |
430 | 16 | int c; |
431 | 16 | pdf_lexbuf *buf = &doc->lexbuf.base; |
432 | | |
433 | 16 | fz_var(encrypt); |
434 | 16 | fz_var(id); |
435 | 16 | fz_var(info); |
436 | 16 | fz_var(list); |
437 | 16 | fz_var(obj); |
438 | 16 | fz_var(roots); |
439 | | |
440 | 16 | if (!doc->is_fdf) |
441 | 16 | fz_warn(ctx, "repairing PDF document"); |
442 | | |
443 | 16 | if (doc->repair_attempted) |
444 | 0 | fz_throw(ctx, FZ_ERROR_FORMAT, "Repair failed already - not trying again"); |
445 | | |
446 | 16 | doc->bias = 0; // reset bias! |
447 | | |
448 | 16 | doc->repair_attempted = 1; |
449 | 16 | doc->repair_in_progress = 1; |
450 | | |
451 | 16 | pdf_drop_page_tree_internal(ctx, doc); |
452 | 16 | doc->use_page_tree_map = 1; |
453 | 16 | pdf_forget_xref(ctx, doc); |
454 | | |
455 | 16 | fz_seek(ctx, doc->file, 0, 0); |
456 | | |
457 | 32 | fz_try(ctx) |
458 | 32 | { |
459 | 16 | pdf_xref_entry *entry; |
460 | 16 | listlen = 0; |
461 | 16 | listcap = 1024; |
462 | 16 | list = fz_malloc_array(ctx, listcap, struct entry); |
463 | | |
464 | 16 | roots = fz_new_root_list(ctx); |
465 | | |
466 | | /* look for '%PDF' version marker within first kilobyte of file */ |
467 | 16 | n = fz_read(ctx, doc->file, (unsigned char *)buf->scratch, fz_minz(buf->size, 1024)); |
468 | | |
469 | 16 | fz_seek(ctx, doc->file, 0, 0); |
470 | 16 | if (n >= 5) |
471 | 16 | { |
472 | 5.13k | for (j = 0; j < n - 5; j++) |
473 | 5.12k | { |
474 | 5.12k | if (memcmp(&buf->scratch[j], "%PDF-", 5) == 0 || memcmp(&buf->scratch[j], "%FDF-", 5) == 0) |
475 | 11 | { |
476 | 11 | fz_seek(ctx, doc->file, (int64_t)(j + 8), 0); /* skip "%PDF-X.Y" */ |
477 | 11 | break; |
478 | 11 | } |
479 | 5.12k | } |
480 | 16 | } |
481 | | |
482 | | /* skip comment line after version marker since some generators |
483 | | * forget to terminate the comment with a newline */ |
484 | 16 | c = fz_read_byte(ctx, doc->file); |
485 | 16 | while (c >= 0 && (c == ' ' || c == '%')) |
486 | 0 | c = fz_read_byte(ctx, doc->file); |
487 | 16 | if (c != EOF) |
488 | 16 | fz_unread_byte(ctx, doc->file); |
489 | | |
490 | 465k | while (1) |
491 | 465k | { |
492 | 465k | tmpofs = fz_tell(ctx, doc->file); |
493 | 465k | if (tmpofs < 0) |
494 | 0 | fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file"); |
495 | | |
496 | 930k | fz_try(ctx) |
497 | 930k | tok = pdf_lex_no_string(ctx, doc->file, buf); |
498 | 930k | fz_catch(ctx) |
499 | 0 | { |
500 | 0 | fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); |
501 | 0 | fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); |
502 | 0 | fz_report_error(ctx); |
503 | 0 | fz_warn(ctx, "skipping ahead to next token"); |
504 | 0 | do |
505 | 0 | c = fz_read_byte(ctx, doc->file); |
506 | 0 | while (c != EOF && !is_white(c)); |
507 | 0 | if (c == EOF) |
508 | 0 | tok = PDF_TOK_EOF; |
509 | 0 | else |
510 | 0 | continue; |
511 | 0 | } |
512 | | |
513 | | /* If we have the next token already, then we'll jump |
514 | | * back here, rather than going through the top of |
515 | | * the loop. */ |
516 | 465k | have_next_token: |
517 | | |
518 | 465k | if (tok == PDF_TOK_INT) |
519 | 53.4k | { |
520 | 53.4k | if (buf->i < 0) |
521 | 4 | { |
522 | 4 | num = 0; |
523 | 4 | gen = 0; |
524 | 4 | continue; |
525 | 4 | } |
526 | 53.4k | numofs = genofs; |
527 | 53.4k | num = gen; |
528 | 53.4k | genofs = tmpofs; |
529 | 53.4k | gen = buf->i; |
530 | 53.4k | } |
531 | | |
532 | 412k | else if (tok == PDF_TOK_OBJ) |
533 | 824 | { |
534 | 824 | pdf_obj *root = NULL; |
535 | | |
536 | 1.64k | fz_try(ctx) |
537 | 1.64k | { |
538 | 824 | stm_len = 0; |
539 | 824 | stm_ofs = 0; |
540 | 824 | tok = pdf_repair_obj(ctx, doc, buf, &stm_ofs, &stm_len, &encrypt, &id, NULL, &tmpofs, &root); |
541 | 824 | if (root) |
542 | 3 | add_root(ctx, roots, root); |
543 | 824 | } |
544 | 1.64k | fz_always(ctx) |
545 | 824 | { |
546 | 824 | pdf_drop_obj(ctx, root); |
547 | 824 | } |
548 | 824 | fz_catch(ctx) |
549 | 1 | { |
550 | 1 | int errcode = fz_caught(ctx); |
551 | | /* If we haven't seen a root yet, there is nothing |
552 | | * we can do, but give up. Otherwise, we'll make |
553 | | * do. */ |
554 | 1 | if (roots->len == 0 || |
555 | 0 | errcode == FZ_ERROR_TRYLATER || |
556 | 0 | errcode == FZ_ERROR_SYSTEM) |
557 | 1 | { |
558 | 1 | pdf_drop_root_list(ctx, roots); |
559 | 1 | roots = NULL; |
560 | 1 | fz_rethrow(ctx); |
561 | 1 | } |
562 | 0 | fz_report_error(ctx); |
563 | 0 | fz_warn(ctx, "cannot parse object (%d %d R) - ignoring rest of file", num, gen); |
564 | 0 | break; |
565 | 1 | } |
566 | | |
567 | 823 | if (num <= 0 || num > PDF_MAX_OBJECT_NUMBER) |
568 | 0 | { |
569 | 0 | fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", num, gen); |
570 | 0 | goto have_next_token; |
571 | 0 | } |
572 | | |
573 | 823 | gen = fz_clampi(gen, 0, 65535); |
574 | | |
575 | 823 | if (listlen + 1 == listcap) |
576 | 0 | { |
577 | 0 | listcap = (listcap * 3) / 2; |
578 | 0 | list = fz_realloc_array(ctx, list, listcap, struct entry); |
579 | 0 | } |
580 | | |
581 | 823 | list[listlen].num = num; |
582 | 823 | list[listlen].gen = gen; |
583 | 823 | list[listlen].ofs = numofs; |
584 | 823 | list[listlen].stm_ofs = stm_ofs; |
585 | 823 | list[listlen].stm_len = stm_len; |
586 | 823 | listlen ++; |
587 | | |
588 | 823 | if (num > maxnum) |
589 | 712 | maxnum = num; |
590 | | |
591 | 823 | goto have_next_token; |
592 | 823 | } |
593 | | |
594 | | /* If we find a dictionary it is probably the trailer, |
595 | | * but could be a stream (or bogus) dictionary caused |
596 | | * by a corrupt file. */ |
597 | 411k | else if (tok == PDF_TOK_OPEN_DICT) |
598 | 34 | { |
599 | 34 | pdf_obj *dictobj; |
600 | | |
601 | 68 | fz_try(ctx) |
602 | 68 | { |
603 | 34 | dict = pdf_parse_dict(ctx, doc, doc->file, buf); |
604 | 34 | } |
605 | 68 | fz_catch(ctx) |
606 | 32 | { |
607 | 32 | fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); |
608 | 32 | fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); |
609 | | /* If this was the real trailer dict |
610 | | * it was broken, in which case we are |
611 | | * in trouble. Keep going though in |
612 | | * case this was just a bogus dict. */ |
613 | 32 | fz_report_error(ctx); |
614 | 32 | continue; |
615 | 32 | } |
616 | | |
617 | 4 | fz_try(ctx) |
618 | 4 | { |
619 | 2 | dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Encrypt)); |
620 | 2 | if (dictobj) |
621 | 0 | { |
622 | 0 | pdf_drop_obj(ctx, encrypt); |
623 | 0 | encrypt = pdf_keep_obj(ctx, dictobj); |
624 | 0 | } |
625 | | |
626 | 2 | dictobj = pdf_dict_get(ctx, dict, PDF_NAME(ID)); |
627 | 2 | if (dictobj && (!id || !encrypt || pdf_dict_get(ctx, dict, PDF_NAME(Encrypt)))) |
628 | 0 | { |
629 | 0 | pdf_drop_obj(ctx, id); |
630 | 0 | id = pdf_keep_obj(ctx, dictobj); |
631 | 0 | } |
632 | | |
633 | 2 | dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Root)); |
634 | 2 | if (dictobj) |
635 | 0 | add_root(ctx, roots, dictobj); |
636 | | |
637 | 2 | dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Info)); |
638 | 2 | if (dictobj) |
639 | 0 | { |
640 | 0 | pdf_drop_obj(ctx, info); |
641 | 0 | info = pdf_keep_obj(ctx, dictobj); |
642 | 0 | } |
643 | 2 | } |
644 | 4 | fz_always(ctx) |
645 | 2 | pdf_drop_obj(ctx, dict); |
646 | 2 | fz_catch(ctx) |
647 | 0 | fz_rethrow(ctx); |
648 | 2 | } |
649 | | |
650 | 411k | else if (tok == PDF_TOK_EOF) |
651 | 15 | { |
652 | 15 | break; |
653 | 15 | } |
654 | | |
655 | 411k | else |
656 | 411k | { |
657 | 411k | num = 0; |
658 | 411k | gen = 0; |
659 | 411k | } |
660 | 465k | } |
661 | | |
662 | 15 | if (listlen == 0) |
663 | 5 | fz_throw(ctx, FZ_ERROR_FORMAT, "no objects found"); |
664 | | |
665 | | /* make xref reasonable */ |
666 | | |
667 | | /* |
668 | | Dummy access to entry to assure sufficient space in the xref table |
669 | | and avoid repeated reallocs in the loop |
670 | | */ |
671 | | /* Ensure that the first xref table is a 'solid' one from |
672 | | * 0 to maxnum. */ |
673 | 10 | pdf_ensure_solid_xref(ctx, doc, maxnum); |
674 | | |
675 | 10.8k | for (i = 1; i < maxnum; i++) |
676 | 10.8k | { |
677 | 10.8k | entry = pdf_get_populating_xref_entry(ctx, doc, i); |
678 | 10.8k | if (entry->obj != NULL) |
679 | 0 | continue; |
680 | 10.8k | entry->type = 'f'; |
681 | 10.8k | entry->ofs = 0; |
682 | 10.8k | entry->gen = 0; |
683 | 10.8k | entry->num = 0; |
684 | | |
685 | 10.8k | entry->stm_ofs = 0; |
686 | 10.8k | } |
687 | | |
688 | 355 | for (i = 0; i < listlen; i++) |
689 | 345 | { |
690 | 345 | entry = pdf_get_populating_xref_entry(ctx, doc, list[i].num); |
691 | 345 | entry->type = 'n'; |
692 | 345 | entry->ofs = list[i].ofs; |
693 | 345 | entry->gen = list[i].gen; |
694 | 345 | entry->num = list[i].num; |
695 | | |
696 | 345 | entry->stm_ofs = list[i].stm_ofs; |
697 | | |
698 | | /* correct stream length for unencrypted documents */ |
699 | 345 | if (!encrypt && list[i].stm_len >= 0) |
700 | 30 | { |
701 | 30 | pdf_obj *old_obj = NULL; |
702 | 30 | dict = pdf_load_object(ctx, doc, list[i].num); |
703 | | |
704 | 60 | fz_try(ctx) |
705 | 60 | { |
706 | 30 | length = pdf_new_int(ctx, list[i].stm_len); |
707 | 30 | pdf_dict_get_put_drop(ctx, dict, PDF_NAME(Length), length, &old_obj); |
708 | 30 | if (old_obj) |
709 | 30 | orphan_object(ctx, doc, old_obj); |
710 | 30 | } |
711 | 60 | fz_always(ctx) |
712 | 30 | pdf_drop_obj(ctx, dict); |
713 | 30 | fz_catch(ctx) |
714 | 0 | fz_rethrow(ctx); |
715 | 30 | } |
716 | 345 | } |
717 | | |
718 | 10 | entry = pdf_get_populating_xref_entry(ctx, doc, 0); |
719 | 10 | entry->type = 'f'; |
720 | 10 | entry->ofs = 0; |
721 | 10 | entry->gen = 65535; |
722 | 10 | entry->num = 0; |
723 | 10 | entry->stm_ofs = 0; |
724 | | |
725 | 10 | next = 0; |
726 | 10.8k | for (i = pdf_xref_len(ctx, doc) - 1; i >= 0; i--) |
727 | 10.8k | { |
728 | 10.8k | entry = pdf_get_populating_xref_entry(ctx, doc, i); |
729 | 10.8k | if (entry->type == 'f') |
730 | 10.5k | { |
731 | 10.5k | entry->ofs = next; |
732 | 10.5k | if (entry->gen < 65535) |
733 | 10.5k | entry->gen ++; |
734 | 10.5k | next = i; |
735 | 10.5k | } |
736 | 10.8k | } |
737 | | |
738 | | /* create a repaired trailer, Root will be added later */ |
739 | | |
740 | 10 | obj = pdf_new_dict(ctx, doc, 5); |
741 | | /* During repair there is only a single xref section */ |
742 | 10 | pdf_set_populating_xref_trailer(ctx, doc, obj); |
743 | 10 | pdf_drop_obj(ctx, obj); |
744 | 10 | obj = NULL; |
745 | | |
746 | 10 | pdf_dict_put_int(ctx, pdf_trailer(ctx, doc), PDF_NAME(Size), maxnum + 1); |
747 | | |
748 | 10 | if (info) |
749 | 0 | { |
750 | 0 | pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), info); |
751 | 0 | pdf_drop_obj(ctx, info); |
752 | 0 | info = NULL; |
753 | 0 | } |
754 | | |
755 | 10 | if (encrypt) |
756 | 0 | { |
757 | 0 | if (pdf_is_indirect(ctx, encrypt)) |
758 | 0 | { |
759 | | /* create new reference with non-NULL xref pointer */ |
760 | 0 | obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, encrypt), pdf_to_gen(ctx, encrypt)); |
761 | 0 | pdf_drop_obj(ctx, encrypt); |
762 | 0 | encrypt = obj; |
763 | 0 | obj = NULL; |
764 | 0 | } |
765 | 0 | pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt), encrypt); |
766 | 0 | pdf_drop_obj(ctx, encrypt); |
767 | 0 | encrypt = NULL; |
768 | 0 | } |
769 | | |
770 | 10 | if (id) |
771 | 3 | { |
772 | 3 | if (pdf_is_indirect(ctx, id)) |
773 | 0 | { |
774 | | /* create new reference with non-NULL xref pointer */ |
775 | 0 | obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, id), pdf_to_gen(ctx, id)); |
776 | 0 | pdf_drop_obj(ctx, id); |
777 | 0 | id = obj; |
778 | 0 | obj = NULL; |
779 | 0 | } |
780 | 3 | pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID), id); |
781 | 3 | pdf_drop_obj(ctx, id); |
782 | 3 | id = NULL; |
783 | 3 | } |
784 | 10 | } |
785 | 32 | fz_always(ctx) |
786 | 16 | { |
787 | 16 | fz_free(ctx, list); |
788 | 16 | doc->repair_in_progress = 0; |
789 | 16 | } |
790 | 16 | fz_catch(ctx) |
791 | 6 | { |
792 | 6 | pdf_drop_root_list(ctx, roots); |
793 | 6 | pdf_drop_obj(ctx, encrypt); |
794 | 6 | pdf_drop_obj(ctx, id); |
795 | 6 | pdf_drop_obj(ctx, obj); |
796 | 6 | pdf_drop_obj(ctx, info); |
797 | 6 | if (ctx->throw_on_repair) |
798 | 0 | fz_throw(ctx, FZ_ERROR_REPAIRED, "Error during repair attempt"); |
799 | 6 | fz_rethrow(ctx); |
800 | 6 | } |
801 | | |
802 | 4 | if (ctx->throw_on_repair) |
803 | 0 | { |
804 | 0 | pdf_drop_root_list(ctx, roots); |
805 | 0 | fz_throw(ctx, FZ_ERROR_REPAIRED, "File repaired"); |
806 | 0 | } |
807 | | |
808 | 4 | return roots; |
809 | 4 | } |
810 | | |
811 | | static void |
812 | | pdf_repair_obj_stms(fz_context *ctx, pdf_document *doc) |
813 | 10 | { |
814 | 10 | pdf_obj *dict; |
815 | 10 | int i; |
816 | 10 | int xref_len = pdf_xref_len(ctx, doc); |
817 | | |
818 | 10.8k | for (i = 0; i < xref_len; i++) |
819 | 10.8k | { |
820 | 10.8k | pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i); |
821 | | |
822 | 10.8k | if (entry->stm_ofs) |
823 | 174 | { |
824 | 174 | dict = pdf_load_object(ctx, doc, i); |
825 | 348 | fz_try(ctx) |
826 | 348 | { |
827 | 174 | if (pdf_name_eq(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Type)), PDF_NAME(ObjStm))) |
828 | 0 | pdf_repair_obj_stm(ctx, doc, i); |
829 | 174 | } |
830 | 348 | fz_always(ctx) |
831 | 174 | pdf_drop_obj(ctx, dict); |
832 | 174 | fz_catch(ctx) |
833 | 0 | { |
834 | 0 | fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); |
835 | 0 | fz_report_error(ctx); |
836 | 0 | fz_warn(ctx, "ignoring broken object stream (%d 0 R)", i); |
837 | 0 | } |
838 | 174 | } |
839 | 10.8k | } |
840 | | |
841 | | /* Ensure that streamed objects reside inside a known non-streamed object */ |
842 | 10.8k | for (i = 0; i < xref_len; i++) |
843 | 10.8k | { |
844 | 10.8k | pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i); |
845 | | |
846 | 10.8k | if (entry->type == 'o' && pdf_get_populating_xref_entry(ctx, doc, entry->ofs)->type != 'n') |
847 | 0 | { |
848 | 0 | fz_warn(ctx, "invalid reference to non-object-stream: %d, assuming %d 0 R is a freed object", (int)entry->ofs, i); |
849 | 0 | entry->type = 'f'; |
850 | 0 | } |
851 | 10.8k | } |
852 | 10 | } |
853 | | |
854 | | static void |
855 | | pdf_repair_roots(fz_context *ctx, pdf_document *doc, pdf_root_list *roots) |
856 | 10 | { |
857 | 10 | int i; |
858 | | |
859 | 10 | for (i = roots->len-1; i >= 0; i--) |
860 | 3 | { |
861 | 3 | if (pdf_is_indirect(ctx, roots->roots[i]) && pdf_is_dict(ctx, roots->roots[i])) |
862 | 3 | { |
863 | 3 | pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), roots->roots[i]); |
864 | 3 | break; |
865 | 3 | } |
866 | 3 | } |
867 | 10 | } |
868 | | |
869 | | static void |
870 | | pdf_repair_trailer(fz_context *ctx, pdf_document *doc) |
871 | 10 | { |
872 | 10 | int hasroot, hasinfo; |
873 | 10 | pdf_obj *obj, *nobj; |
874 | 10 | pdf_obj *dict = NULL; |
875 | 10 | int i; |
876 | | |
877 | 10 | int xref_len = pdf_xref_len(ctx, doc); |
878 | | |
879 | 10 | hasroot = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root)) != NULL); |
880 | 10 | hasinfo = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info)) != NULL); |
881 | | |
882 | 10 | fz_var(dict); |
883 | | |
884 | 20 | fz_try(ctx) |
885 | 20 | { |
886 | | /* Scan from the end so we have a better chance of finding |
887 | | * newer objects if there are multiple instances of Info and |
888 | | * Root objects. |
889 | | */ |
890 | 10.8k | for (i = xref_len - 1; i > 0 && (!hasinfo || !hasroot); --i) |
891 | 10.8k | { |
892 | 10.8k | pdf_xref_entry *entry = pdf_get_xref_entry_no_null(ctx, doc, i); |
893 | 10.8k | if (entry->type == 0 || entry->type == 'f') |
894 | 10.5k | continue; |
895 | | |
896 | 690 | fz_try(ctx) |
897 | 690 | { |
898 | 345 | dict = pdf_load_object(ctx, doc, i); |
899 | 345 | } |
900 | 690 | fz_catch(ctx) |
901 | 6 | { |
902 | 6 | fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); |
903 | 6 | fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); |
904 | 6 | fz_report_error(ctx); |
905 | 6 | fz_warn(ctx, "ignoring broken object (%d 0 R)", i); |
906 | 6 | continue; |
907 | 6 | } |
908 | | |
909 | 339 | if (!hasroot) |
910 | 303 | { |
911 | 303 | obj = pdf_dict_get(ctx, dict, PDF_NAME(Type)); |
912 | 303 | if (obj == PDF_NAME(Catalog)) |
913 | 3 | { |
914 | 3 | nobj = pdf_new_indirect(ctx, doc, i, 0); |
915 | 3 | pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), nobj); |
916 | 3 | hasroot = 1; |
917 | 3 | } |
918 | 303 | } |
919 | | |
920 | 339 | if (!hasinfo) |
921 | 339 | { |
922 | 339 | if (pdf_dict_get(ctx, dict, PDF_NAME(Creator)) || pdf_dict_get(ctx, dict, PDF_NAME(Producer))) |
923 | 1 | { |
924 | 1 | nobj = pdf_new_indirect(ctx, doc, i, 0); |
925 | 1 | pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), nobj); |
926 | 1 | hasinfo = 1; |
927 | 1 | } |
928 | 339 | } |
929 | | |
930 | 339 | pdf_drop_obj(ctx, dict); |
931 | 339 | dict = NULL; |
932 | 339 | } |
933 | 10 | } |
934 | 20 | fz_always(ctx) |
935 | 10 | { |
936 | | /* ensure that strings are not used in their repaired, non-decrypted form */ |
937 | 10 | if (doc->crypt) |
938 | 0 | { |
939 | 0 | pdf_crypt *tmp; |
940 | 0 | pdf_clear_xref(ctx, doc); |
941 | | |
942 | | /* ensure that Encryption dictionary and ID are cached without decryption, |
943 | | otherwise a decrypted Encryption dictionary and ID may be used when saving |
944 | | the PDF causing it to be inconsistent (since strings/streams are encrypted |
945 | | with the actual encryption key, not the decrypted encryption key). */ |
946 | 0 | tmp = doc->crypt; |
947 | 0 | doc->crypt = NULL; |
948 | 0 | fz_try(ctx) |
949 | 0 | { |
950 | 0 | (void) pdf_resolve_indirect(ctx, pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt))); |
951 | 0 | (void) pdf_resolve_indirect(ctx, pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID))); |
952 | 0 | } |
953 | 0 | fz_always(ctx) |
954 | 0 | doc->crypt = tmp; |
955 | 0 | fz_catch(ctx) |
956 | 0 | { |
957 | 0 | fz_rethrow(ctx); |
958 | 0 | } |
959 | 0 | } |
960 | 10 | } |
961 | 10 | fz_catch(ctx) |
962 | 0 | { |
963 | 0 | pdf_drop_obj(ctx, dict); |
964 | 0 | fz_rethrow(ctx); |
965 | 0 | } |
966 | 10 | } |
967 | | |
968 | | void pdf_repair_xref_aux(fz_context *ctx, pdf_document *doc, void (*mid)(fz_context *ctx, pdf_document *doc)) |
969 | 16 | { |
970 | 16 | pdf_root_list *roots = NULL; |
971 | | |
972 | 16 | fz_var(roots); |
973 | | |
974 | 32 | fz_try(ctx) |
975 | 32 | { |
976 | 16 | roots = pdf_repair_xref_base(ctx, doc); |
977 | 16 | if (mid) |
978 | 10 | mid(ctx, doc); |
979 | 16 | pdf_repair_obj_stms(ctx, doc); |
980 | 16 | pdf_repair_roots(ctx, doc, roots); |
981 | 16 | pdf_repair_trailer(ctx, doc); |
982 | 16 | } |
983 | 32 | fz_always(ctx) |
984 | 16 | pdf_drop_root_list(ctx, roots); |
985 | 16 | fz_catch(ctx) |
986 | 6 | fz_rethrow(ctx); |
987 | 16 | } |