/src/mupdf/source/pdf/pdf-repair.c
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (C) 2004-2021 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "mupdf/fitz.h" |
24 | | #include "mupdf/pdf.h" |
25 | | |
26 | | #include <string.h> |
27 | | |
28 | | /* Scan file for objects and reconstruct xref table */ |
29 | | |
/*
 * One '<num> <gen> obj' header found while scanning the file during repair.
 * The repair pass collects these in a growable array and later copies them
 * into the rebuilt xref table.
 */
struct entry
{
	int num;		/* object number */
	int gen;		/* generation number */
	int64_t ofs;		/* file offset recorded for the object */
	int64_t stm_ofs;	/* file offset of the stream data; 0 if none was seen */
	int64_t stm_len;	/* stream length found by scanning; negative means "not measured" */
};
38 | | |
39 | | static void add_root(fz_context *ctx, pdf_obj *obj, pdf_obj ***roots, int *num_roots, int *max_roots) |
40 | 2.42k | { |
41 | 2.42k | if (*num_roots == *max_roots) |
42 | 1.78k | { |
43 | 1.78k | int new_max_roots = *max_roots * 2; |
44 | 1.78k | if (new_max_roots == 0) |
45 | 1.74k | new_max_roots = 4; |
46 | 1.78k | *roots = fz_realloc_array(ctx, *roots, new_max_roots, pdf_obj*); |
47 | 1.78k | *max_roots = new_max_roots; |
48 | 1.78k | } |
49 | 2.42k | (*roots)[(*num_roots)++] = pdf_keep_obj(ctx, obj); |
50 | 2.42k | } |
51 | | |
52 | | int |
53 | | pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, int64_t *stmofsp, int64_t *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, int64_t *tmpofs, pdf_obj **root) |
54 | 197k | { |
55 | 197k | fz_stream *file = doc->file; |
56 | 197k | pdf_token tok; |
57 | 197k | int64_t stm_len; |
58 | 197k | int64_t local_ofs; |
59 | | |
60 | 197k | if (tmpofs == NULL) |
61 | 0 | tmpofs = &local_ofs; |
62 | 197k | if (stmofsp == NULL) |
63 | 0 | stmofsp = &local_ofs; |
64 | | |
65 | 197k | *stmofsp = 0; |
66 | 197k | if (stmlenp) |
67 | 197k | *stmlenp = -1; |
68 | | |
69 | 197k | stm_len = 0; |
70 | | |
71 | 197k | *tmpofs = fz_tell(ctx, file); |
72 | 197k | if (*tmpofs < 0) |
73 | 0 | fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file"); |
74 | | |
75 | | /* On entry to this function, we know that we've just seen |
76 | | * '<int> <int> obj'. We expect the next thing we see to be a |
77 | | * pdf object. Regardless of the type of thing we meet next |
78 | | * we only need to fully parse it if it is a dictionary. */ |
79 | 197k | tok = pdf_lex(ctx, file, buf); |
80 | | |
81 | | /* Don't let a truncated object at EOF overwrite a good one */ |
82 | 197k | if (tok == PDF_TOK_EOF) |
83 | 0 | fz_throw(ctx, FZ_ERROR_SYNTAX, "truncated object"); |
84 | | |
85 | 197k | if (tok == PDF_TOK_OPEN_DICT) |
86 | 173k | { |
87 | 173k | pdf_obj *obj, *dict = NULL; |
88 | | |
89 | 347k | fz_try(ctx) |
90 | 347k | { |
91 | 173k | dict = pdf_parse_dict(ctx, doc, file, buf); |
92 | 173k | } |
93 | 347k | fz_catch(ctx) |
94 | 12.9k | { |
95 | 12.9k | fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); |
96 | 12.9k | fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); |
97 | | /* Don't let a broken object at EOF overwrite a good one */ |
98 | 12.9k | if (file->eof) |
99 | 182 | fz_rethrow(ctx); |
100 | | /* Silently swallow the error */ |
101 | 12.7k | fz_report_error(ctx); |
102 | 12.7k | dict = pdf_new_dict(ctx, doc, 2); |
103 | 12.7k | } |
104 | | |
105 | | /* We must be careful not to try to resolve any indirections |
106 | | * here. We have just read dict, so we know it to be a non |
107 | | * indirected dictionary. Before we look at any values that |
108 | | * we get back from looking up in it, we need to check they |
109 | | * aren't indirected. */ |
110 | | |
111 | 173k | if (encrypt || id || root) |
112 | 173k | { |
113 | 173k | obj = pdf_dict_get(ctx, dict, PDF_NAME(Type)); |
114 | 173k | if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME(XRef))) |
115 | 1.07k | { |
116 | 1.07k | if (encrypt) |
117 | 1.07k | { |
118 | 1.07k | obj = pdf_dict_get(ctx, dict, PDF_NAME(Encrypt)); |
119 | 1.07k | if (obj) |
120 | 149 | { |
121 | 149 | pdf_drop_obj(ctx, *encrypt); |
122 | 149 | *encrypt = pdf_keep_obj(ctx, obj); |
123 | 149 | } |
124 | 1.07k | } |
125 | | |
126 | 1.07k | if (id) |
127 | 1.07k | { |
128 | 1.07k | obj = pdf_dict_get(ctx, dict, PDF_NAME(ID)); |
129 | 1.07k | if (obj) |
130 | 1.03k | { |
131 | 1.03k | pdf_drop_obj(ctx, *id); |
132 | 1.03k | *id = pdf_keep_obj(ctx, obj); |
133 | 1.03k | } |
134 | 1.07k | } |
135 | | |
136 | 1.07k | if (root) |
137 | 1.07k | *root = pdf_keep_obj(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Root))); |
138 | 1.07k | } |
139 | 173k | } |
140 | | |
141 | 173k | obj = pdf_dict_get(ctx, dict, PDF_NAME(Length)); |
142 | 173k | if (!pdf_is_indirect(ctx, obj) && pdf_is_int(ctx, obj)) |
143 | 53.9k | stm_len = pdf_to_int64(ctx, obj); |
144 | | |
145 | 173k | if (doc->file_reading_linearly && page) |
146 | 0 | { |
147 | 0 | obj = pdf_dict_get(ctx, dict, PDF_NAME(Type)); |
148 | 0 | if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME(Page))) |
149 | 0 | { |
150 | 0 | pdf_drop_obj(ctx, *page); |
151 | 0 | *page = pdf_keep_obj(ctx, dict); |
152 | 0 | } |
153 | 0 | } |
154 | | |
155 | 173k | pdf_drop_obj(ctx, dict); |
156 | 173k | } |
157 | | |
158 | 432k | while ( tok != PDF_TOK_STREAM && |
159 | 432k | tok != PDF_TOK_ENDOBJ && |
160 | 432k | tok != PDF_TOK_ERROR && |
161 | 432k | tok != PDF_TOK_EOF && |
162 | 432k | tok != PDF_TOK_INT ) |
163 | 234k | { |
164 | 234k | *tmpofs = fz_tell(ctx, file); |
165 | 234k | if (*tmpofs < 0) |
166 | 0 | fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file"); |
167 | 234k | tok = pdf_lex(ctx, file, buf); |
168 | 234k | } |
169 | | |
170 | 197k | if (tok == PDF_TOK_STREAM) |
171 | 68.0k | { |
172 | 68.0k | int c = fz_read_byte(ctx, file); |
173 | 68.0k | if (c == '\r') { |
174 | 29.8k | c = fz_peek_byte(ctx, file); |
175 | 29.8k | if (c == '\n') |
176 | 29.4k | fz_read_byte(ctx, file); |
177 | 29.8k | } |
178 | | |
179 | 68.0k | *stmofsp = fz_tell(ctx, file); |
180 | 68.0k | if (*stmofsp < 0) |
181 | 0 | fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file"); |
182 | | |
183 | 68.0k | if (stm_len > 0) |
184 | 53.1k | { |
185 | 53.1k | fz_seek(ctx, file, *stmofsp + stm_len, 0); |
186 | 106k | fz_try(ctx) |
187 | 106k | { |
188 | 53.1k | tok = pdf_lex(ctx, file, buf); |
189 | 53.1k | } |
190 | 106k | fz_catch(ctx) |
191 | 0 | { |
192 | 0 | fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); |
193 | 0 | fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); |
194 | 0 | fz_report_error(ctx); |
195 | 0 | fz_warn(ctx, "cannot find endstream token, falling back to scanning"); |
196 | 0 | } |
197 | 53.1k | if (tok == PDF_TOK_ENDSTREAM) |
198 | 39.5k | goto atobjend; |
199 | 13.6k | fz_seek(ctx, file, *stmofsp, 0); |
200 | 13.6k | } |
201 | | |
202 | 28.5k | (void)fz_read(ctx, file, (unsigned char *) buf->scratch, 9); |
203 | | |
204 | 142M | while (memcmp(buf->scratch, "endstream", 9) != 0) |
205 | 142M | { |
206 | 142M | c = fz_read_byte(ctx, file); |
207 | 142M | if (c == EOF) |
208 | 7.01k | break; |
209 | 142M | memmove(&buf->scratch[0], &buf->scratch[1], 8); |
210 | 142M | buf->scratch[8] = c; |
211 | 142M | } |
212 | | |
213 | 28.5k | if (stmlenp) |
214 | 28.5k | *stmlenp = fz_tell(ctx, file) - *stmofsp - 9; |
215 | | |
216 | 68.0k | atobjend: |
217 | 68.0k | *tmpofs = fz_tell(ctx, file); |
218 | 68.0k | if (*tmpofs < 0) |
219 | 0 | fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file"); |
220 | 68.0k | tok = pdf_lex(ctx, file, buf); |
221 | 68.0k | if (tok != PDF_TOK_ENDOBJ) |
222 | 9.08k | fz_warn(ctx, "object missing 'endobj' token"); |
223 | 58.9k | else |
224 | 58.9k | { |
225 | | /* Read another token as we always return the next one */ |
226 | 58.9k | *tmpofs = fz_tell(ctx, file); |
227 | 58.9k | if (*tmpofs < 0) |
228 | 0 | fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file"); |
229 | 58.9k | tok = pdf_lex(ctx, file, buf); |
230 | 58.9k | } |
231 | 68.0k | } |
232 | 197k | return tok; |
233 | 197k | } |
234 | | |
235 | | static void |
236 | | pdf_repair_obj_stm(fz_context *ctx, pdf_document *doc, int stm_num) |
237 | 4.87k | { |
238 | 4.87k | pdf_obj *obj; |
239 | 4.87k | fz_stream *stm = NULL; |
240 | 4.87k | pdf_token tok; |
241 | 4.87k | int i, n, count; |
242 | 4.87k | pdf_lexbuf buf; |
243 | | |
244 | 4.87k | fz_var(stm); |
245 | | |
246 | 4.87k | pdf_lexbuf_init(ctx, &buf, PDF_LEXBUF_SMALL); |
247 | | |
248 | 9.75k | fz_try(ctx) |
249 | 9.75k | { |
250 | 4.87k | obj = pdf_load_object(ctx, doc, stm_num); |
251 | | |
252 | 4.87k | count = pdf_dict_get_int(ctx, obj, PDF_NAME(N)); |
253 | | |
254 | 4.87k | pdf_drop_obj(ctx, obj); |
255 | | |
256 | 4.87k | stm = pdf_open_stream_number(ctx, doc, stm_num); |
257 | | |
258 | 70.8k | for (i = 0; i < count; i++) |
259 | 67.0k | { |
260 | 67.0k | pdf_xref_entry *entry; |
261 | | |
262 | 67.0k | tok = pdf_lex(ctx, stm, &buf); |
263 | 67.0k | if (tok != PDF_TOK_INT) |
264 | 981 | fz_throw(ctx, FZ_ERROR_FORMAT, "corrupt object stream (%d 0 R)", stm_num); |
265 | | |
266 | 66.0k | n = buf.i; |
267 | 66.0k | if (n < 0) |
268 | 55 | { |
269 | 55 | fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i); |
270 | 55 | continue; |
271 | 55 | } |
272 | 66.0k | else if (n >= pdf_xref_len(ctx, doc)) |
273 | 3.46k | { |
274 | 3.46k | fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i); |
275 | 3.46k | continue; |
276 | 3.46k | } |
277 | | |
278 | 62.5k | entry = pdf_get_populating_xref_entry(ctx, doc, n); |
279 | 62.5k | entry->ofs = stm_num; |
280 | 62.5k | entry->gen = i; |
281 | 62.5k | entry->num = n; |
282 | 62.5k | entry->stm_ofs = 0; |
283 | 62.5k | pdf_drop_obj(ctx, entry->obj); |
284 | 62.5k | entry->obj = NULL; |
285 | 62.5k | entry->type = 'o'; |
286 | | |
287 | 62.5k | tok = pdf_lex(ctx, stm, &buf); |
288 | 62.5k | if (tok != PDF_TOK_INT) |
289 | 98 | fz_throw(ctx, FZ_ERROR_FORMAT, "corrupt object stream (%d 0 R)", stm_num); |
290 | 62.5k | } |
291 | 4.87k | } |
292 | 9.75k | fz_always(ctx) |
293 | 4.87k | { |
294 | 4.87k | fz_drop_stream(ctx, stm); |
295 | 4.87k | pdf_lexbuf_fin(ctx, &buf); |
296 | 4.87k | } |
297 | 4.87k | fz_catch(ctx) |
298 | 1.08k | { |
299 | 1.08k | fz_rethrow(ctx); |
300 | 1.08k | } |
301 | 3.79k | } |
302 | | |
303 | | static void |
304 | | orphan_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj) |
305 | 22.6k | { |
306 | 22.6k | if (doc->orphans_count == doc->orphans_max) |
307 | 7.81k | { |
308 | 7.81k | int new_max = (doc->orphans_max ? doc->orphans_max*2 : 32); |
309 | | |
310 | 15.6k | fz_try(ctx) |
311 | 15.6k | { |
312 | 7.81k | doc->orphans = fz_realloc_array(ctx, doc->orphans, new_max, pdf_obj*); |
313 | 7.81k | doc->orphans_max = new_max; |
314 | 7.81k | } |
315 | 15.6k | fz_catch(ctx) |
316 | 0 | { |
317 | 0 | pdf_drop_obj(ctx, obj); |
318 | 0 | fz_rethrow(ctx); |
319 | 0 | } |
320 | 7.81k | } |
321 | 22.6k | doc->orphans[doc->orphans_count++] = obj; |
322 | 22.6k | } |
323 | | |
/*
 * Return 1 if c is one of the six bytes treated as whitespace here
 * (NUL, TAB, LF, FF, CR, SPACE), otherwise 0.
 */
static int is_white(int c)
{
	switch (c)
	{
	case '\x00':
	case '\x09':
	case '\x0a':
	case '\x0c':
	case '\x0d':
	case '\x20':
		return 1;
	default:
		return 0;
	}
}
328 | | |
/*
 * Rebuild the document's xref table and trailer by scanning the whole file.
 *
 * Outline:
 *  - Refuse (FZ_ERROR_FORMAT) if a repair was already attempted.
 *  - Drop the page tree and the old xref, then lex the file from the start.
 *    The last two integers seen are remembered as a candidate object number
 *    and generation; an 'obj' token turns them into an entry in 'list'.
 *    Bare dictionaries are treated as possible trailers and mined for
 *    Encrypt, ID, Root and Info.
 *  - Build one solid xref section from 'list', thread the free entries into
 *    a linked list, and create a new trailer.
 *
 * If ctx->throw_on_repair is set, FZ_ERROR_REPAIRED is thrown both when the
 * repair fails and when it succeeds; otherwise a failure is rethrown as is.
 */
329 | | void |
330 | | pdf_repair_xref(fz_context *ctx, pdf_document *doc) |
331 | 11.4k | { |
332 | 11.4k | pdf_obj *dict, *obj = NULL; |
333 | 11.4k | pdf_obj *length; |
334 | | |
335 | 11.4k | pdf_obj *encrypt = NULL; |
336 | 11.4k | pdf_obj *id = NULL; |
337 | 11.4k | pdf_obj **roots = NULL; |
338 | 11.4k | pdf_obj *info = NULL; |
339 | | |
340 | 11.4k | struct entry *list = NULL; |
341 | 11.4k | int listlen; |
342 | 11.4k | int listcap; |
343 | 11.4k | int maxnum = 0; |
344 | | |
345 | 11.4k | int num = 0; |
346 | 11.4k | int gen = 0; |
347 | 11.4k | int64_t tmpofs, stm_ofs, numofs = 0, genofs = 0; |
348 | 11.4k | int64_t stm_len; |
349 | 11.4k | pdf_token tok; |
350 | 11.4k | int next; |
351 | 11.4k | int i; |
352 | 11.4k | size_t j, n; |
353 | 11.4k | int c; |
354 | 11.4k | pdf_lexbuf *buf = &doc->lexbuf.base; |
355 | 11.4k | int num_roots = 0; |
356 | 11.4k | int max_roots = 0; |
357 | | |
/* These locals are assigned inside fz_try and read again in fz_always/fz_catch. */
358 | 11.4k | fz_var(encrypt); |
359 | 11.4k | fz_var(id); |
360 | 11.4k | fz_var(roots); |
361 | 11.4k | fz_var(num_roots); |
362 | 11.4k | fz_var(max_roots); |
363 | 11.4k | fz_var(info); |
364 | 11.4k | fz_var(list); |
365 | 11.4k | fz_var(obj); |
366 | | |
367 | 11.4k | if (!doc->is_fdf) |
368 | 11.4k | fz_warn(ctx, "repairing PDF document"); |
369 | | |
370 | 11.4k | if (doc->repair_attempted) |
371 | 121 | fz_throw(ctx, FZ_ERROR_FORMAT, "Repair failed already - not trying again"); |
372 | | |
373 | 11.3k | doc->repair_attempted = 1; |
374 | 11.3k | doc->repair_in_progress = 1; |
375 | | |
376 | 11.3k | pdf_drop_page_tree_internal(ctx, doc); |
377 | 11.3k | doc->page_tree_broken = 0; |
378 | 11.3k | pdf_forget_xref(ctx, doc); |
379 | | |
380 | 11.3k | fz_seek(ctx, doc->file, 0, 0); |
381 | | |
382 | 22.7k | fz_try(ctx) |
383 | 22.7k | { |
384 | 11.3k | pdf_xref_entry *entry; |
385 | 11.3k | listlen = 0; |
386 | 11.3k | listcap = 1024; |
387 | 11.3k | list = fz_malloc_array(ctx, listcap, struct entry); |
388 | | |
389 | | /* look for '%PDF' version marker within first kilobyte of file */ |
390 | 11.3k | n = fz_read(ctx, doc->file, (unsigned char *)buf->scratch, fz_minz(buf->size, 1024)); |
391 | | |
392 | 11.3k | fz_seek(ctx, doc->file, 0, 0); |
393 | 11.3k | if (n >= 4) |
394 | 11.3k | { |
/* NOTE(review): 'j < n - 4' never tests the final 4-byte window (j == n - 4); confirm whether 'j <= n - 4' was intended. */
395 | 4.25M | for (j = 0; j < n - 4; j++) |
396 | 4.25M | { |
397 | 4.25M | if (memcmp(&buf->scratch[j], "%PDF", 4) == 0 || memcmp(&buf->scratch[j], "%FDF", 4) == 0) |
398 | 7.42k | { |
399 | 7.42k | fz_seek(ctx, doc->file, (int64_t)(j + 8), 0); /* skip "%PDF-X.Y" */ |
400 | 7.42k | break; |
401 | 7.42k | } |
402 | 4.25M | } |
403 | 11.3k | } |
404 | | |
405 | | /* skip comment line after version marker since some generators |
406 | | * forget to terminate the comment with a newline */ |
407 | 11.3k | c = fz_read_byte(ctx, doc->file); |
408 | 18.9k | while (c >= 0 && (c == ' ' || c == '%')) |
409 | 7.62k | c = fz_read_byte(ctx, doc->file); |
410 | 11.3k | if (c != EOF) |
411 | 11.3k | fz_unread_byte(ctx, doc->file); |
412 | | |
/* Main scan: one token per iteration, except where a branch jumps to have_next_token with a token already in hand. */
413 | 2.69M | while (1) |
414 | 2.69M | { |
415 | 2.69M | tmpofs = fz_tell(ctx, doc->file); |
416 | 2.69M | if (tmpofs < 0) |
417 | 0 | fz_throw(ctx, FZ_ERROR_SYSTEM, "cannot tell in file"); |
418 | | |
419 | 5.38M | fz_try(ctx) |
420 | 5.38M | tok = pdf_lex_no_string(ctx, doc->file, buf); |
421 | 5.38M | fz_catch(ctx) |
422 | 0 | { |
423 | 0 | fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); |
424 | 0 | fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); |
425 | 0 | fz_report_error(ctx); |
426 | 0 | fz_warn(ctx, "skipping ahead to next token"); |
427 | 0 | do |
428 | 0 | c = fz_read_byte(ctx, doc->file); |
429 | 0 | while (c != EOF && !is_white(c)); |
430 | 0 | if (c == EOF) |
431 | 0 | tok = PDF_TOK_EOF; |
432 | 0 | else |
433 | 0 | continue; |
434 | 0 | } |
435 | | |
436 | | /* If we have the next token already, then we'll jump |
437 | | * back here, rather than going through the top of |
438 | | * the loop. */ |
439 | 2.88M | have_next_token: |
440 | | |
441 | 2.88M | if (tok == PDF_TOK_INT) |
442 | 1.03M | { |
443 | 1.03M | if (buf->i < 0) |
444 | 4.90k | { |
445 | 4.90k | num = 0; |
446 | 4.90k | gen = 0; |
447 | 4.90k | continue; |
448 | 4.90k | } |
/* Shift the window: the previous integer becomes the object number, this one the generation. */
449 | 1.03M | numofs = genofs; |
450 | 1.03M | num = gen; |
451 | 1.03M | genofs = tmpofs; |
452 | 1.03M | gen = buf->i; |
453 | 1.03M | } |
454 | | |
455 | 1.84M | else if (tok == PDF_TOK_OBJ) |
456 | 197k | { |
457 | 197k | pdf_obj *root = NULL; |
458 | | |
459 | 395k | fz_try(ctx) |
460 | 395k | { |
461 | 197k | stm_len = 0; |
462 | 197k | stm_ofs = 0; |
463 | 197k | tok = pdf_repair_obj(ctx, doc, buf, &stm_ofs, &stm_len, &encrypt, &id, NULL, &tmpofs, &root); |
464 | 197k | if (root) |
465 | 965 | add_root(ctx, root, &roots, &num_roots, &max_roots); |
466 | 197k | } |
467 | 395k | fz_always(ctx) |
468 | 197k | { |
469 | 197k | pdf_drop_obj(ctx, root); |
470 | 197k | } |
471 | 197k | fz_catch(ctx) |
472 | 182 | { |
473 | 182 | fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); |
474 | 182 | fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); |
475 | | /* If we haven't seen a root yet, there is nothing |
476 | | * we can do, but give up. Otherwise, we'll make |
477 | | * do. */ |
478 | 182 | if (!roots) |
479 | 76 | fz_rethrow(ctx); |
480 | 106 | fz_report_error(ctx); |
481 | 106 | fz_warn(ctx, "cannot parse object (%d %d R) - ignoring rest of file", num, gen); |
482 | 106 | break; |
483 | 182 | } |
484 | | |
485 | 197k | if (num <= 0 || num > PDF_MAX_OBJECT_NUMBER) |
486 | 3.98k | { |
487 | 3.98k | fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", num, gen); |
488 | 3.98k | goto have_next_token; |
489 | 3.98k | } |
490 | | |
491 | 193k | gen = fz_clampi(gen, 0, 65535); |
492 | | |
493 | 193k | if (listlen + 1 == listcap) |
494 | 6 | { |
495 | 6 | listcap = (listcap * 3) / 2; |
496 | 6 | list = fz_realloc_array(ctx, list, listcap, struct entry); |
497 | 6 | } |
498 | | |
499 | 193k | list[listlen].num = num; |
500 | 193k | list[listlen].gen = gen; |
501 | 193k | list[listlen].ofs = numofs; |
502 | 193k | list[listlen].stm_ofs = stm_ofs; |
503 | 193k | list[listlen].stm_len = stm_len; |
504 | 193k | listlen ++; |
505 | | |
506 | 193k | if (num > maxnum) |
507 | 111k | maxnum = num; |
508 | | |
/* pdf_repair_obj already returned the token after the object, so process it without lexing again. */
509 | 193k | goto have_next_token; |
510 | 197k | } |
511 | | |
512 | | /* If we find a dictionary it is probably the trailer, |
513 | | * but could be a stream (or bogus) dictionary caused |
514 | | * by a corrupt file. */ |
515 | 1.65M | else if (tok == PDF_TOK_OPEN_DICT) |
516 | 9.33k | { |
517 | 9.33k | pdf_obj *dictobj; |
518 | | |
519 | 18.6k | fz_try(ctx) |
520 | 18.6k | { |
521 | 9.33k | dict = pdf_parse_dict(ctx, doc, doc->file, buf); |
522 | 9.33k | } |
523 | 18.6k | fz_catch(ctx) |
524 | 3.73k | { |
525 | 3.73k | fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); |
526 | 3.73k | fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); |
527 | | /* If this was the real trailer dict |
528 | | * it was broken, in which case we are |
529 | | * in trouble. Keep going though in |
530 | | * case this was just a bogus dict. */ |
531 | 3.73k | fz_report_error(ctx); |
532 | 3.73k | continue; |
533 | 3.73k | } |
534 | | |
535 | 11.1k | fz_try(ctx) |
536 | 11.1k | { |
537 | 5.59k | dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Encrypt)); |
538 | 5.59k | if (dictobj) |
539 | 838 | { |
540 | 838 | pdf_drop_obj(ctx, encrypt); |
541 | 838 | encrypt = pdf_keep_obj(ctx, dictobj); |
542 | 838 | } |
543 | | |
/* Replace a known ID only if no Encrypt has been seen yet or this dictionary carries its own Encrypt. */
544 | 5.59k | dictobj = pdf_dict_get(ctx, dict, PDF_NAME(ID)); |
545 | 5.59k | if (dictobj && (!id || !encrypt || pdf_dict_get(ctx, dict, PDF_NAME(Encrypt)))) |
546 | 714 | { |
547 | 714 | pdf_drop_obj(ctx, id); |
548 | 714 | id = pdf_keep_obj(ctx, dictobj); |
549 | 714 | } |
550 | | |
551 | 5.59k | dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Root)); |
552 | 5.59k | if (dictobj) |
553 | 1.46k | add_root(ctx, dictobj, &roots, &num_roots, &max_roots); |
554 | | |
555 | 5.59k | dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Info)); |
556 | 5.59k | if (dictobj) |
557 | 703 | { |
558 | 703 | pdf_drop_obj(ctx, info); |
559 | 703 | info = pdf_keep_obj(ctx, dictobj); |
560 | 703 | } |
561 | 5.59k | } |
562 | 11.1k | fz_always(ctx) |
563 | 5.59k | pdf_drop_obj(ctx, dict); |
564 | 5.59k | fz_catch(ctx) |
565 | 0 | fz_rethrow(ctx); |
566 | 5.59k | } |
567 | | |
568 | 1.64M | else if (tok == PDF_TOK_EOF) |
569 | 11.1k | { |
570 | 11.1k | break; |
571 | 11.1k | } |
572 | | |
573 | 1.63M | else |
574 | 1.63M | { |
/* Any other token breaks an '<int> <int> obj' sequence, so forget the candidate numbers. */
575 | 1.63M | num = 0; |
576 | 1.63M | gen = 0; |
577 | 1.63M | } |
578 | 2.88M | } |
579 | | |
580 | 11.2k | if (listlen == 0) |
581 | 280 | fz_throw(ctx, FZ_ERROR_FORMAT, "no objects found"); |
582 | | |
583 | | /* make xref reasonable */ |
584 | | |
585 | | /* |
586 | | Dummy access to entry to assure sufficient space in the xref table |
587 | | and avoid repeated reallocs in the loop |
588 | | */ |
589 | | /* Ensure that the first xref table is a 'solid' one from |
590 | | * 0 to maxnum. */ |
591 | 10.9k | pdf_ensure_solid_xref(ctx, doc, maxnum); |
592 | | |
/* Mark every slot below maxnum that holds no cached object as free; slots for scanned objects are overwritten by the next loop. */
593 | 33.5M | for (i = 1; i < maxnum; i++) |
594 | 33.4M | { |
595 | 33.4M | entry = pdf_get_populating_xref_entry(ctx, doc, i); |
596 | 33.4M | if (entry->obj != NULL) |
597 | 0 | continue; |
598 | 33.4M | entry->type = 'f'; |
599 | 33.4M | entry->ofs = 0; |
600 | 33.4M | entry->gen = 0; |
601 | 33.4M | entry->num = 0; |
602 | | |
603 | 33.4M | entry->stm_ofs = 0; |
604 | 33.4M | } |
605 | | |
/* Entries are applied in file order, so a later definition of the same object number replaces an earlier one. */
606 | 203k | for (i = 0; i < listlen; i++) |
607 | 192k | { |
608 | 192k | entry = pdf_get_populating_xref_entry(ctx, doc, list[i].num); |
609 | 192k | entry->type = 'n'; |
610 | 192k | entry->ofs = list[i].ofs; |
611 | 192k | entry->gen = list[i].gen; |
612 | 192k | entry->num = list[i].num; |
613 | | |
614 | 192k | entry->stm_ofs = list[i].stm_ofs; |
615 | | |
616 | | /* correct stream length for unencrypted documents */ |
617 | 192k | if (!encrypt && list[i].stm_len >= 0) |
618 | 27.7k | { |
619 | 27.7k | pdf_obj *old_obj = NULL; |
620 | 27.7k | dict = pdf_load_object(ctx, doc, list[i].num); |
621 | | |
622 | 55.4k | fz_try(ctx) |
623 | 55.4k | { |
624 | 27.7k | length = pdf_new_int(ctx, list[i].stm_len); |
625 | 27.7k | pdf_dict_get_put_drop(ctx, dict, PDF_NAME(Length), length, &old_obj); |
/* The replaced Length value is handed to the document's orphan list rather than dropped here. */
626 | 27.7k | if (old_obj) |
627 | 22.6k | orphan_object(ctx, doc, old_obj); |
628 | 27.7k | } |
629 | 55.4k | fz_always(ctx) |
630 | 27.7k | pdf_drop_obj(ctx, dict); |
631 | 27.7k | fz_catch(ctx) |
632 | 1 | fz_rethrow(ctx); |
633 | 27.7k | } |
634 | 192k | } |
635 | | |
636 | 10.9k | entry = pdf_get_populating_xref_entry(ctx, doc, 0); |
637 | 10.9k | entry->type = 'f'; |
638 | 10.9k | entry->ofs = 0; |
639 | 10.9k | entry->gen = 65535; |
640 | 10.9k | entry->num = 0; |
641 | 10.9k | entry->stm_ofs = 0; |
642 | | |
/* Walk backwards so each free entry's ofs names the next higher free entry (0 for the last one). */
643 | 10.9k | next = 0; |
644 | 33.4M | for (i = pdf_xref_len(ctx, doc) - 1; i >= 0; i--) |
645 | 33.4M | { |
646 | 33.4M | entry = pdf_get_populating_xref_entry(ctx, doc, i); |
647 | 33.4M | if (entry->type == 'f') |
648 | 33.2M | { |
649 | 33.2M | entry->ofs = next; |
650 | 33.2M | if (entry->gen < 65535) |
651 | 33.2M | entry->gen ++; |
652 | 33.2M | next = i; |
653 | 33.2M | } |
654 | 33.4M | } |
655 | | |
656 | | /* create a repaired trailer, Root will be added later */ |
657 | | |
658 | 10.9k | obj = pdf_new_dict(ctx, doc, 5); |
659 | | /* During repair there is only a single xref section */ |
660 | 10.9k | pdf_set_populating_xref_trailer(ctx, doc, obj); |
661 | 10.9k | pdf_drop_obj(ctx, obj); |
662 | 10.9k | obj = NULL; |
663 | | |
664 | 10.9k | pdf_dict_put_int(ctx, pdf_trailer(ctx, doc), PDF_NAME(Size), maxnum + 1); |
665 | | |
666 | 10.9k | if (roots) |
667 | 1.74k | { |
/* Prefer the most recently seen Root candidate that is a dictionary; the loop stops at index 0, so roots[0] is the fallback. */
668 | 1.94k | for (i = num_roots-1; i > 0; i--) |
669 | 536 | { |
670 | 536 | if (pdf_is_dict(ctx, roots[i])) |
671 | 338 | break; |
672 | 536 | } |
673 | 1.74k | if (i >= 0) |
674 | 1.74k | { |
675 | 1.74k | pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), roots[i]); |
676 | 1.74k | } |
677 | 1.74k | } |
678 | 10.9k | if (info) |
679 | 589 | { |
680 | 589 | pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), info); |
681 | 589 | pdf_drop_obj(ctx, info); |
682 | 589 | info = NULL; |
683 | 589 | } |
684 | | |
685 | 10.9k | if (encrypt) |
686 | 249 | { |
687 | 249 | if (pdf_is_indirect(ctx, encrypt)) |
688 | 191 | { |
689 | | /* create new reference with non-NULL xref pointer */ |
690 | 191 | obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, encrypt), pdf_to_gen(ctx, encrypt)); |
691 | 191 | pdf_drop_obj(ctx, encrypt); |
692 | 191 | encrypt = obj; |
693 | 191 | obj = NULL; |
694 | 191 | } |
695 | 249 | pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt), encrypt); |
696 | 249 | pdf_drop_obj(ctx, encrypt); |
697 | 249 | encrypt = NULL; |
698 | 249 | } |
699 | | |
700 | 10.9k | if (id) |
701 | 1.14k | { |
702 | 1.14k | if (pdf_is_indirect(ctx, id)) |
703 | 1 | { |
704 | | /* create new reference with non-NULL xref pointer */ |
705 | 1 | obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, id), pdf_to_gen(ctx, id)); |
706 | 1 | pdf_drop_obj(ctx, id); |
707 | 1 | id = obj; |
708 | 1 | obj = NULL; |
709 | 1 | } |
710 | 1.14k | pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID), id); |
711 | 1.14k | pdf_drop_obj(ctx, id); |
712 | 1.14k | id = NULL; |
713 | 1.14k | } |
714 | 10.9k | } |
715 | 22.7k | fz_always(ctx) |
716 | 11.3k | { |
717 | 13.7k | for (i = 0; i < num_roots; i++) |
718 | 2.42k | pdf_drop_obj(ctx, roots[i]); |
719 | 11.3k | fz_free(ctx, roots); |
720 | 11.3k | fz_free(ctx, list); |
721 | 11.3k | doc->repair_in_progress = 0; |
722 | 11.3k | } |
723 | 11.3k | fz_catch(ctx) |
724 | 371 | { |
/* On the success path these were dropped and set to NULL once stored in the trailer, so the drops below only release what a failure left behind. */
725 | 371 | pdf_drop_obj(ctx, encrypt); |
726 | 371 | pdf_drop_obj(ctx, id); |
727 | 371 | pdf_drop_obj(ctx, obj); |
728 | 371 | pdf_drop_obj(ctx, info); |
729 | 371 | if (ctx->throw_on_repair) |
730 | 1 | fz_throw(ctx, FZ_ERROR_REPAIRED, "Error during repair attempt"); |
731 | 370 | fz_rethrow(ctx); |
732 | 371 | } |
733 | | |
734 | 10.6k | if (ctx->throw_on_repair) |
735 | 6 | fz_throw(ctx, FZ_ERROR_REPAIRED, "File repaired"); |
736 | 10.6k | } |
737 | | |
738 | | void |
739 | | pdf_repair_obj_stms(fz_context *ctx, pdf_document *doc) |
740 | 11.0k | { |
741 | 11.0k | pdf_obj *dict; |
742 | 11.0k | int i; |
743 | 11.0k | int xref_len = pdf_xref_len(ctx, doc); |
744 | | |
745 | 33.4M | for (i = 0; i < xref_len; i++) |
746 | 33.4M | { |
747 | 33.4M | pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i); |
748 | | |
749 | 33.4M | if (entry->stm_ofs) |
750 | 62.2k | { |
751 | 62.2k | dict = pdf_load_object(ctx, doc, i); |
752 | 124k | fz_try(ctx) |
753 | 124k | { |
754 | 62.1k | if (pdf_name_eq(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Type)), PDF_NAME(ObjStm))) |
755 | 4.87k | pdf_repair_obj_stm(ctx, doc, i); |
756 | 62.1k | } |
757 | 124k | fz_always(ctx) |
758 | 62.1k | pdf_drop_obj(ctx, dict); |
759 | 62.1k | fz_catch(ctx) |
760 | 1.08k | { |
761 | 1.08k | fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); |
762 | 1.08k | fz_report_error(ctx); |
763 | 1.08k | fz_warn(ctx, "ignoring broken object stream (%d 0 R)", i); |
764 | 1.08k | } |
765 | 62.2k | } |
766 | 33.4M | } |
767 | | |
768 | | /* Ensure that streamed objects reside inside a known non-streamed object */ |
769 | 33.4M | for (i = 0; i < xref_len; i++) |
770 | 33.4M | { |
771 | 33.4M | pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i); |
772 | | |
773 | 33.4M | if (entry->type == 'o' && pdf_get_populating_xref_entry(ctx, doc, entry->ofs)->type != 'n') |
774 | 0 | fz_throw(ctx, FZ_ERROR_FORMAT, "invalid reference to non-object-stream: %d (%d 0 R)", (int)entry->ofs, i); |
775 | 33.4M | } |
776 | 11.0k | } |