/src/clamav/libclamav/pdf.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (C) 2013-2024 Cisco Systems, Inc. and/or its affiliates. All rights reserved. |
3 | | * Copyright (C) 2007-2013 Sourcefire, Inc. |
4 | | * |
5 | | * Authors: Nigel Horne, Török Edvin |
6 | | * |
7 | | * Also based on Matt Olney's pdf parser in snort-nrt. |
8 | | * |
9 | | * This program is free software; you can redistribute it and/or modify |
10 | | * it under the terms of the GNU General Public License version 2 as |
11 | | * published by the Free Software Foundation. |
12 | | * |
13 | | * This program is distributed in the hope that it will be useful, |
14 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | | * GNU General Public License for more details. |
17 | | * |
18 | | * You should have received a copy of the GNU General Public License |
19 | | * along with this program; if not, write to the Free Software |
20 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, |
21 | | * MA 02110-1301, USA. |
22 | | * |
23 | | * TODO: Embedded fonts |
24 | | * TODO: Predictor image handling |
25 | | */ |
26 | | |
27 | | #if HAVE_CONFIG_H |
28 | | #include "clamav-config.h" |
29 | | #endif |
30 | | |
31 | | #include <stdio.h> |
32 | | #include <sys/types.h> |
33 | | #include <sys/stat.h> |
34 | | #include <ctype.h> |
35 | | #include <string.h> |
36 | | #include <fcntl.h> |
37 | | #include <stdlib.h> |
38 | | #include <errno.h> |
39 | | #ifdef HAVE_LIMITS_H |
40 | | #include <limits.h> |
41 | | #endif |
42 | | #ifdef HAVE_UNISTD_H |
43 | | #include <unistd.h> |
44 | | #endif |
45 | | #include <zlib.h> |
46 | | |
47 | | #if HAVE_ICONV |
48 | | #include <iconv.h> |
49 | | #endif |
50 | | |
51 | | #ifdef _WIN32 |
52 | | #include <stdint.h> |
53 | | #endif |
54 | | |
55 | | #include "clamav.h" |
56 | | #include "others.h" |
57 | | #include "pdf.h" |
58 | | #include "pdfdecode.h" |
59 | | #include "scanners.h" |
60 | | #include "fmap.h" |
61 | | #include "str.h" |
62 | | #include "entconv.h" |
63 | | #include "bytecode.h" |
64 | | #include "bytecode_api.h" |
65 | | #include "arc4.h" |
66 | | #include "rijndael.h" |
67 | | #include "textnorm.h" |
68 | | #include "conv.h" |
69 | | #include "json_api.h" |
70 | | |
71 | | #ifdef CL_DEBUG |
72 | | /*#define SAVE_TMP |
73 | | *Save the file being worked on in tmp */ |
74 | | #endif |
75 | | |
76 | 2.33M | #define MAX_PDF_OBJECTS (64 * 1024) |
77 | | |
78 | | struct pdf_struct; |
79 | | |
80 | | static const char *pdf_nextlinestart(const char *ptr, size_t len); |
81 | | static const char *pdf_nextobject(const char *ptr, size_t len); |
82 | | |
83 | | /* PDF statistics callbacks and related */ |
84 | | struct pdfname_action; |
85 | | |
86 | | static void pdf_export_json(struct pdf_struct *); |
87 | | |
88 | | static void ASCIIHexDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
89 | | static void ASCII85Decode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
90 | | static void EmbeddedFile_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
91 | | static void FlateDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
92 | | static void Image_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
93 | | static void LZWDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
94 | | static void RunLengthDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
95 | | static void CCITTFaxDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
96 | | static void JBIG2Decode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
97 | | static void DCTDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
98 | | static void JPXDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
99 | | static void Crypt_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
100 | | static void Standard_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
101 | | static void Sig_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
102 | | static void JavaScript_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
103 | | static void OpenAction_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
104 | | static void Launch_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
105 | | static void Page_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
106 | | static void Author_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
107 | | static void Creator_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
108 | | static void Producer_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
109 | | static void CreationDate_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
110 | | static void ModificationDate_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
111 | | static void Title_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
112 | | static void Subject_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
113 | | static void Keywords_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
114 | | static void Pages_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *); |
115 | | static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act); |
116 | | static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act); |
117 | | static void AcroForm_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act); |
118 | | static void XFA_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act); |
119 | | |
120 | | /* End PDF statistics callbacks and related */ |
121 | | |
122 | | static int pdf_readint(const char *q0, int len, const char *key); |
123 | | static const char *pdf_getdict(const char *q0, int *len, const char *key); |
124 | | static char *pdf_readval(const char *q, int len, const char *key); |
125 | | static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *slen, const char **qend, bool noescape); |
126 | | |
/**
 * @brief Check whether a cross-reference marker can be found in [xref, eof).
 *
 * Leading spaces and line breaks are skipped first. The check passes when
 * the remaining data begins with "xref", or when "/XRef" occurs anywhere
 * later in the range.
 *
 * @param xref  Start of the data to inspect.
 * @param eof   One past the last byte that may be inspected.
 *
 * @return 0 if "xref" or "/XRef" was found.
 * @return -1 if neither was found, or if too little data is left.
 */
static int xrefCheck(const char *xref, const char *eof)
{
    static const char table_tag[]  = "xref";
    static const char stream_tag[] = "/XRef";
    const size_t table_tag_len     = sizeof(table_tag) - 1;
    const size_t stream_tag_len    = sizeof(stream_tag) - 1;
    const char *cursor             = xref;
    const char *scan;

    /* Step over blanks and line breaks in front of the marker. */
    while (cursor < eof) {
        if (*cursor != ' ' && *cursor != '\n' && *cursor != '\r')
            break;
        cursor++;
    }

    /* Not enough data left to hold the marker. */
    if (cursor + table_tag_len >= eof)
        return -1;

    if (0 == memcmp(cursor, table_tag, table_tag_len)) {
        cli_dbgmsg("cli_pdf: found xref\n");
        return 0;
    }

    /* could be xref stream */
    scan = cursor;
    while (scan + stream_tag_len < eof) {
        if (0 == memcmp(scan, stream_tag, stream_tag_len)) {
            cli_dbgmsg("cli_pdf: found /XRef\n");
            return 0;
        }
        scan++;
    }

    return -1;
}
152 | | |
153 | | /* define this to be noisy about things that we can't parse properly */ |
154 | | #undef NOISY |
155 | | |
156 | | #ifdef NOISY |
157 | | #define noisy_msg(pdf, ...) cli_infomsg(pdf->ctx, __VA_ARGS__) |
158 | | #define noisy_warnmsg(...) cli_warnmsg(__VA_ARGS__) |
159 | | #else |
160 | | #define noisy_msg(pdf, ...) |
161 | | #define noisy_warnmsg(...) |
162 | | #endif |
163 | | |
/**
 * @brief Walk BACKwards from q until a byte that is not whitespace is found.
 *
 * The bytes treated as whitespace are NUL (0x00), TAB (0x09), LF (0x0a),
 * FF (0x0c), CR (0x0d) and SPACE (0x20).
 *
 * @param q     Position to start from (the high end of the search space).
 * @param start Beginning of the search space. The byte at start is never
 *              inspected; the walk stops once q is no longer above it.
 *
 * @return const char* Address of the first non-whitespace byte met while
 *                     walking back, or q unchanged/start if none was met.
 */
static const char *findNextNonWSBack(const char *q, const char *start)
{
    for (; q > start; q--) {
        switch (*q) {
            case 0x00:
            case 0x09:
            case 0x0a:
            case 0x0c:
            case 0x0d:
            case 0x20:
                /* whitespace: keep walking back */
                continue;
            default:
                return q;
        }
    }

    return q;
}
179 | | |
/**
 * @brief Walk FORwards from q until a byte that is not whitespace is found.
 *
 * The bytes treated as whitespace are NUL (0x00), TAB (0x09), LF (0x0a),
 * FF (0x0c), CR (0x0d) and SPACE (0x20).
 *
 * @param q   Position to start from (the low end of the search space).
 * @param end End of the search space (exclusive); the byte at end is never
 *            inspected.
 *
 * @return const char* Address of the first non-whitespace byte at or after q,
 *                     or end if only whitespace was found. If q is already at
 *                     or past end, q is returned unchanged.
 */
static const char *findNextNonWS(const char *q, const char *end)
{
    for (; q < end; q++) {
        switch (*q) {
            case 0x00:
            case 0x09:
            case 0x0a:
            case 0x0c:
            case 0x0d:
            case 0x20:
                /* whitespace: keep walking forward */
                continue;
            default:
                return q;
        }
    }

    return q;
}
195 | | |
196 | | /** |
197 | | * @brief Find bounds of stream. |
198 | | * |
199 | | * PDF streams are prefixed with "stream" and suffixed with "endstream". |
200 | | * Return value indicates success or failure. |
201 | | * |
202 | | * @param start start address of search space. |
203 | | * @param size size of search space |
204 | | * @param[out] stream output param, address of start of stream data |
205 | | * @param[out] stream_size output param, size of stream data |
206 | | * @param newline_hack hack to support newlines that are \r\n, and not just \n or just \r. |
207 | | * |
208 | | * @return cl_error_t CL_SUCCESS if stream bounds were found. |
209 | | * @return cl_error_t CL_BREAK if stream bounds could not be found. |
210 | | * @return cl_error_t CL_EFORMAT if stream start was found, but not end. (truncated) |
211 | | * @return cl_error_t CL_EARG if invalid args were provided. |
212 | | */ |
213 | | static cl_error_t find_stream_bounds( |
214 | | const char *start, |
215 | | size_t size, |
216 | | const char **stream, |
217 | | size_t *stream_size, |
218 | | int newline_hack) |
219 | 1.35M | { |
220 | 1.35M | cl_error_t status = CL_BREAK; |
221 | | |
222 | 1.35M | const char *idx; |
223 | 1.35M | const char *stream_begin; |
224 | 1.35M | const char *endstream_begin; |
225 | 1.35M | size_t bytesleft = size; |
226 | | |
227 | 1.35M | if ((NULL == start) || (0 == bytesleft) || (NULL == stream) || (NULL == stream_size)) { |
228 | 0 | status = CL_EARG; |
229 | 0 | return status; |
230 | 0 | } |
231 | | |
232 | 1.35M | *stream = NULL; |
233 | 1.35M | *stream_size = 0; |
234 | | |
235 | | /* Begin by finding the "stream" string that prefixes stream data. */ |
236 | 1.35M | if ((stream_begin = cli_memstr(start, bytesleft, "stream", strlen("stream")))) { |
237 | 713k | idx = stream_begin + strlen("stream"); |
238 | 713k | if ((size_t)(idx - start) >= bytesleft) |
239 | 1.21k | goto done; |
240 | 712k | bytesleft -= idx - start; |
241 | | |
242 | | /* Skip any new line characters. */ |
243 | 712k | if (bytesleft >= 2 && idx[0] == '\xd' && idx[1] == '\xa') { |
244 | 296k | idx += 2; |
245 | 296k | bytesleft -= 2; |
246 | 296k | if (newline_hack && (bytesleft > 2) && idx[0] == '\xa') { |
247 | 592 | idx++; |
248 | 592 | bytesleft--; |
249 | 592 | } |
250 | 415k | } else if (bytesleft && idx[0] == '\xa') { |
251 | 68.8k | idx++; |
252 | 68.8k | bytesleft--; |
253 | 68.8k | } |
254 | | |
255 | | /* Pass back start of the stream data. */ |
256 | 712k | *stream = idx; |
257 | | |
258 | | /* Now find the "endstream" string that suffixes stream data. */ |
259 | 712k | endstream_begin = cli_memstr(idx, bytesleft, "endstream", strlen("endstream")); |
260 | 712k | if (!endstream_begin) { |
261 | | /* Couldn't find "endstream", but that's ok -- |
262 | | * -- we'll just count the rest of the provided buffer. */ |
263 | 520k | cli_dbgmsg("find_stream_bounds: Truncated stream found!\n"); |
264 | 520k | endstream_begin = start + size; |
265 | 520k | status = CL_EFORMAT; |
266 | 520k | } |
267 | | |
268 | | /* Pass back end of the stream data, as offset from start. */ |
269 | 712k | *stream_size = endstream_begin - *stream; |
270 | | |
271 | 712k | if (CL_EFORMAT != status) |
272 | 191k | status = CL_SUCCESS; |
273 | 712k | } |
274 | | |
275 | 1.35M | done: |
276 | | |
277 | 1.35M | return status; |
278 | 1.35M | } |
279 | | |
280 | | /** |
281 | | * @brief Find the next *indirect* object in an object stream, adds it to our list of |
282 | | * objects, and increments nobj. |
283 | | * |
284 | | * Indirect objects in a stream DON'T begin with "obj" and end with "endobj". |
285 | | * Instead, they have an objid and an offset from the first object to point you |
286 | | * right at them. |
287 | | * |
288 | | * If found, objstm->current will be updated to the next objid. |
289 | | * |
290 | | * All objects in an object stream are indirect and thus do not begin or start |
291 | | * with "obj" or "endobj". Instead, the object stream takes the following |
292 | | * format. |
293 | | * |
294 | | * <dictionary describing stream> objstm content endobjstm |
295 | | * |
296 | | * where content looks something like the following: |
297 | | * |
298 | | * 15 0 16 3 17 46 (ab)<</IDS 8 0 R/JavaScript 27 0 R/URLS 9 0 R>><</Names[(Test)28 0 R]>> |
299 | | * |
300 | | * In the above example, the literal string (ab) is indirect object # 15, and |
301 | | * begins at offset 0 of the set of objects. The next object, # 16 begis at |
302 | | * offset 3 is a dictionary. The final object is also a dictionary, beginning |
303 | | * at offset 46. |
304 | | * |
305 | | * @param pdf Pdf struct that keeps track of all information found in the PDF. |
306 | | * @param objstm |
307 | | * |
308 | | * @return CL_SUCCESS if success |
309 | | * @return CL_EPARSE if parsing error |
310 | | * @return CL_EMEM if error allocating memory |
311 | | * @return CL_EARG if invalid arguments |
312 | | */ |
313 | | int pdf_findobj_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm, struct pdf_obj **obj_found) |
314 | 254k | { |
315 | 254k | cl_error_t status = CL_EPARSE; |
316 | 254k | struct pdf_obj *obj = NULL; |
317 | 254k | unsigned long objid = 0, objoff = 0; |
318 | 254k | long temp_long = 0; |
319 | 254k | const char *index = NULL; |
320 | 254k | size_t bytes_remaining = 0; |
321 | | |
322 | 254k | if (NULL == pdf || NULL == objstm) { |
323 | 0 | cli_warnmsg("pdf_findobj_in_objstm: invalid arguments\n"); |
324 | 0 | return CL_EARG; |
325 | 0 | } |
326 | | |
327 | 254k | if (pdf->nobjs >= MAX_PDF_OBJECTS) { |
328 | 0 | pdf->flags |= 1 << BAD_PDF_TOOMANYOBJS; |
329 | |
|
330 | 0 | cli_dbgmsg("pdf_findobj_in_objstm: reached object maximum\n"); |
331 | 0 | status = CL_BREAK; |
332 | 0 | goto done; |
333 | 0 | } |
334 | | |
335 | 254k | *obj_found = NULL; |
336 | | |
337 | 254k | index = objstm->streambuf + objstm->current_pair; |
338 | 254k | bytes_remaining = objstm->streambuf_len - objstm->current_pair; |
339 | | |
340 | 254k | obj = calloc(sizeof(struct pdf_obj), 1); |
341 | 254k | if (!obj) { |
342 | 0 | cli_warnmsg("pdf_findobj_in_objstm: out of memory finding objects in stream\n"); |
343 | 0 | status = CL_EMEM; |
344 | 0 | goto done; |
345 | 0 | } |
346 | | |
347 | | /* This object is in a stream, not in the regular map buffer. */ |
348 | 254k | obj->objstm = objstm; |
349 | | |
350 | | /* objstm->current_pair points directly to the objid */ |
351 | 254k | if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) { |
352 | | /* Failed to find objid */ |
353 | 4.23k | cli_dbgmsg("pdf_findobj_in_objstm: Failed to find objid for obj in object stream\n"); |
354 | 4.23k | status = CL_EPARSE; |
355 | 4.23k | goto done; |
356 | 250k | } else if (temp_long < 0) { |
357 | 122 | cli_dbgmsg("pdf_findobj_in_objstm: Encountered invalid negative objid (%ld).\n", temp_long); |
358 | 122 | status = CL_EPARSE; |
359 | 122 | goto done; |
360 | 122 | } |
361 | 249k | objid = (unsigned long)temp_long; |
362 | | |
363 | | /* Find the obj offset that appears just after the objid*/ |
364 | 726k | while ((index < objstm->streambuf + objstm->streambuf_len) && isdigit(*index)) { |
365 | 476k | index++; |
366 | 476k | bytes_remaining--; |
367 | 476k | } |
368 | 249k | index = findNextNonWS(index, objstm->streambuf + objstm->first); |
369 | 249k | bytes_remaining = objstm->streambuf + objstm->streambuf_len - index; |
370 | | |
371 | 249k | if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) { |
372 | | /* Failed to find obj offset */ |
373 | 495 | cli_dbgmsg("pdf_findobj_in_objstm: Failed to find obj offset for obj in object stream\n"); |
374 | 495 | status = CL_EPARSE; |
375 | 495 | goto done; |
376 | 249k | } else if (temp_long < 0) { |
377 | 646 | cli_dbgmsg("pdf_findobj_in_objstm: Encountered invalid negative obj offset (%ld).\n", temp_long); |
378 | 646 | status = CL_EPARSE; |
379 | 646 | goto done; |
380 | 646 | } |
381 | 248k | objoff = (unsigned long)temp_long; |
382 | | |
383 | 248k | if ((size_t)objstm->first + (size_t)objoff > objstm->streambuf_len) { |
384 | | /* Alleged obj location is further than the length of the stream */ |
385 | 971 | cli_dbgmsg("pdf_findobj_in_objstm: obj offset found is greater than the length of the stream.\n"); |
386 | 971 | status = CL_EPARSE; |
387 | 971 | goto done; |
388 | 971 | } |
389 | | |
390 | 247k | objstm->current = objstm->first + objoff; |
391 | | |
392 | 247k | obj->id = (objid << 8) | (0 & 0xff); |
393 | 247k | obj->start = objstm->current; |
394 | 247k | obj->flags = 0; |
395 | | |
396 | 247k | objstm->nobjs_found++; |
397 | | |
398 | 1.08M | while ((index < objstm->streambuf + objstm->streambuf_len) && isdigit(*index)) { |
399 | 832k | index++; |
400 | 832k | bytes_remaining--; |
401 | 832k | } |
402 | 247k | objstm->current_pair = (uint32_t)(findNextNonWS(index, objstm->streambuf + objstm->first) - objstm->streambuf); |
403 | | |
404 | | /* Update current_pair, if there are more */ |
405 | 247k | if ((objstm->nobjs_found < objstm->n) && |
406 | 247k | (index < objstm->streambuf + objstm->streambuf_len)) { |
407 | 221k | unsigned long next_objoff = 0; |
408 | | |
409 | | /* |
410 | | * While we're at it, |
411 | | * lets record the size as running up to the next object offset. |
412 | | * |
413 | | * To do so, we will need to parse the next obj pair. |
414 | | */ |
415 | | /* objstm->current_pair points directly to the objid */ |
416 | 221k | index = objstm->streambuf + objstm->current_pair; |
417 | 221k | bytes_remaining = objstm->streambuf + objstm->streambuf_len - index; |
418 | | |
419 | | /* We don't actually care about the object id at this point, so reading the object id is commented out. |
420 | | I didn't delete it entirely in case the object id is needed in the future. */ |
421 | | // if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) { |
422 | | // /* Failed to find objid for next obj */ |
423 | | // cli_dbgmsg("pdf_findobj_in_objstm: Failed to find next objid for obj in object stream though there should be {%u} more.\n", objstm->n - objstm->nobjs_found); |
424 | | // status = CL_EPARSE; |
425 | | // goto done; |
426 | | // } else if (temp_long < 0) { |
427 | | // cli_dbgmsg("pdf_findobj_in_objstm: Encountered invalid negative objid (%ld).\n", temp_long); |
428 | | // status = CL_EPARSE; |
429 | | // goto done; |
430 | | // } |
431 | | // next_objid = (unsigned long)temp_long; |
432 | | |
433 | | /* Find the obj offset that appears just after the objid*/ |
434 | 669k | while ((index < objstm->streambuf + objstm->streambuf_len) && isdigit(*index)) { |
435 | 448k | index++; |
436 | 448k | bytes_remaining--; |
437 | 448k | } |
438 | 221k | index = findNextNonWS(index, objstm->streambuf + objstm->first); |
439 | 221k | bytes_remaining = objstm->streambuf + objstm->streambuf_len - index; |
440 | | |
441 | 221k | if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) { |
442 | | /* Failed to find obj offset for next obj */ |
443 | 4.10k | cli_dbgmsg("pdf_findobj_in_objstm: Failed to find next obj offset for obj in object stream though there should be {%u} more.\n", objstm->n - objstm->nobjs_found); |
444 | 4.10k | status = CL_EPARSE; |
445 | 4.10k | goto done; |
446 | 217k | } else if (temp_long < 0) { |
447 | 399 | cli_dbgmsg("pdf_findobj_in_objstm: Encountered invalid negative obj offset (%ld).\n", temp_long); |
448 | 399 | status = CL_EPARSE; |
449 | 399 | goto done; |
450 | 399 | } |
451 | 216k | next_objoff = (unsigned long)temp_long; |
452 | | |
453 | 216k | if (next_objoff <= objoff) { |
454 | | /* Failed to find obj offset for next obj */ |
455 | 938 | cli_dbgmsg("pdf_findobj_in_objstm: Found next obj offset for obj in object stream but it's less than or equal to the current one!\n"); |
456 | 938 | status = CL_EPARSE; |
457 | 938 | goto done; |
458 | 215k | } else if (objstm->first + next_objoff > objstm->streambuf_len) { |
459 | | /* Failed to find obj offset for next obj */ |
460 | 2.88k | cli_dbgmsg("pdf_findobj_in_objstm: Found next obj offset for obj in object stream but it's further out than the size of the stream!\n"); |
461 | 2.88k | status = CL_EPARSE; |
462 | 2.88k | goto done; |
463 | 2.88k | } |
464 | | |
465 | 212k | obj->size = next_objoff - objoff; |
466 | 212k | } else { |
467 | | /* |
468 | | * Should be no more objects. We should verify. |
469 | | * |
470 | | * Either way... |
471 | | * obj->size should be the rest of the buffer. |
472 | | */ |
473 | 26.6k | if (objstm->nobjs_found < objstm->n) { |
474 | 225 | cli_warnmsg("pdf_findobj_in_objstm: Fewer objects found in object stream than expected!\n"); |
475 | 225 | } |
476 | | |
477 | 26.6k | obj->size = objstm->streambuf_len - obj->start; |
478 | 26.6k | } |
479 | | |
480 | | /* Success! Add the object to the list of all objects found. */ |
481 | 239k | pdf->nobjs++; |
482 | 239k | CLI_MAX_REALLOC_OR_GOTO_DONE(pdf->objs, sizeof(struct pdf_obj *) * pdf->nobjs, |
483 | 239k | cli_warnmsg("pdf_findobj_in_objstm: out of memory finding objects in stream\n"), |
484 | 239k | status = CL_EMEM); |
485 | 239k | pdf->objs[pdf->nobjs - 1] = obj; |
486 | | |
487 | 239k | *obj_found = obj; |
488 | | |
489 | 239k | status = CL_SUCCESS; |
490 | | |
491 | 254k | done: |
492 | 254k | if (CL_SUCCESS != status) { |
493 | 14.7k | if (NULL != obj) { |
494 | 14.7k | free(obj); |
495 | 14.7k | } |
496 | 14.7k | } |
497 | 254k | return status; |
498 | 239k | } |
499 | | |
500 | | /** |
501 | | * @brief Find the next *indirect* object. |
502 | | * |
503 | | * Indirect objects located outside of an object stream are prefaced with: |
504 | | * <objid> <genid> obj |
505 | | * |
506 | | * Each of the above are separated by whitespace of some sort. |
507 | | * |
508 | | * Indirect objects are postfaced with: |
509 | | * endobj |
510 | | * |
511 | | * The specification does not say if whitespace is required before or after "endobj". |
512 | | * |
513 | | * Identify truncated objects. |
514 | | * |
515 | | * If found, pdf->offset will be updated to just after the "endobj". |
516 | | * If truncated, pdf->offset will == pdf->size. |
517 | | * If not found, pdf->offset will not be updated. |
518 | | * |
519 | | * @param pdf Pdf context struct that keeps track of all information found in the PDF. |
520 | | * |
521 | | * @return CL_SUCCESS if success |
522 | | * @return CL_BREAK if no more objects |
523 | | * @return CL_EPARSE if parsing error |
524 | | * @return CL_EMEM if error allocating memory |
525 | | */ |
526 | | cl_error_t pdf_findobj(struct pdf_struct *pdf) |
527 | 2.08M | { |
528 | 2.08M | cl_error_t status = CL_EPARSE; |
529 | 2.08M | const char *start, *idx, *genid_search_index, *objid_search_index; |
530 | | |
531 | 2.08M | const char *obj_begin = NULL, *obj_end = NULL; |
532 | 2.08M | const char *endobj_begin = NULL, *endobj_end = NULL; |
533 | | |
534 | 2.08M | struct pdf_obj *obj = NULL; |
535 | 2.08M | size_t bytesleft; |
536 | 2.08M | unsigned long genid, objid; |
537 | 2.08M | long temp_long; |
538 | | |
539 | 2.08M | if (pdf->nobjs >= MAX_PDF_OBJECTS) { |
540 | 0 | pdf->flags |= 1 << BAD_PDF_TOOMANYOBJS; |
541 | |
|
542 | 0 | cli_dbgmsg("pdf_findobj: reached object maximum\n"); |
543 | 0 | status = CL_BREAK; |
544 | 0 | goto done; |
545 | 0 | } |
546 | 2.08M | pdf->nobjs++; |
547 | 2.08M | CLI_MAX_REALLOC_OR_GOTO_DONE(pdf->objs, sizeof(struct pdf_obj *) * pdf->nobjs, status = CL_EMEM); |
548 | | |
549 | 2.08M | obj = malloc(sizeof(struct pdf_obj)); |
550 | 2.08M | if (!obj) { |
551 | 0 | status = CL_EMEM; |
552 | 0 | goto done; |
553 | 0 | } |
554 | 2.08M | pdf->objs[pdf->nobjs - 1] = obj; |
555 | | |
556 | 2.08M | memset(obj, 0, sizeof(*obj)); |
557 | | |
558 | 2.08M | start = pdf->map + pdf->offset; |
559 | 2.08M | bytesleft = pdf->size - pdf->offset; |
560 | | |
561 | | /* |
562 | | * Start by searching for "obj" |
563 | | */ |
564 | 2.08M | idx = start + 1; |
565 | 2.27M | while (bytesleft > 1 + strlen("obj")) { |
566 | | /* `- 1` accounts for size of white space before obj */ |
567 | 1.91M | idx = cli_memstr(idx, bytesleft - 1, "obj", strlen("obj")); |
568 | 1.91M | if (NULL == idx) { |
569 | 70.7k | status = CL_BREAK; |
570 | 70.7k | goto done; /* No more objs. */ |
571 | 70.7k | } |
572 | | |
573 | | /* verify that the word has a whitespace before it, and is not the end of |
574 | | * a previous word */ |
575 | 1.84M | idx--; |
576 | 1.84M | bytesleft = (pdf->size - pdf->offset) - (size_t)(idx - start); |
577 | | |
578 | 1.84M | if (*idx != 0 && *idx != 9 && *idx != 0xa && *idx != 0xc && *idx != 0xd && *idx != 0x20) { |
579 | | /* This instance of "obj" appears to be part of a longer string. |
580 | | * Skip it, and keep searching for an object. */ |
581 | 195k | idx += 1 + strlen("obj"); |
582 | 195k | bytesleft -= 1 + strlen("obj"); |
583 | 195k | continue; |
584 | 195k | } |
585 | | |
586 | | /* Found the beginning of the word */ |
587 | 1.64M | obj_begin = idx; |
588 | 1.64M | obj_end = idx + 1 + strlen("obj"); |
589 | | |
590 | 1.64M | break; |
591 | 1.84M | } |
592 | | |
593 | 2.01M | if ((NULL == obj_begin) || (NULL == obj_end)) { |
594 | 362k | status = CL_BREAK; |
595 | 362k | goto done; /* No more objs. */ |
596 | 362k | } |
597 | | |
598 | | /* Find the generation id (genid) that appears before the "obj" */ |
599 | 1.64M | genid_search_index = findNextNonWSBack(obj_begin - 1, start); |
600 | 3.59M | while (genid_search_index > start && isdigit(*genid_search_index)) |
601 | 1.94M | genid_search_index--; |
602 | | |
603 | 1.64M | if (CL_SUCCESS != cli_strntol_wrap(genid_search_index, (size_t)((obj_begin)-genid_search_index), 0, 10, &temp_long)) { |
604 | 109k | cli_dbgmsg("pdf_findobj: Failed to parse object genid (# objects found: %u)\n", pdf->nobjs); |
605 | | /* Failed to parse, probably not a real object. Skip past the "obj" thing, and continue. */ |
606 | 109k | pdf->offset = obj_end - pdf->map; |
607 | 109k | status = CL_EPARSE; |
608 | 109k | goto done; |
609 | 1.53M | } else if (temp_long < 0) { |
610 | 26.4k | cli_dbgmsg("pdf_findobj: Encountered invalid negative obj genid (%ld).\n", temp_long); |
611 | 26.4k | pdf->offset = obj_end - pdf->map; |
612 | 26.4k | status = CL_EPARSE; |
613 | 26.4k | goto done; |
614 | 26.4k | } |
615 | 1.51M | genid = (unsigned long)temp_long; |
616 | | |
617 | | /* Find the object id (objid) that appears before the genid */ |
618 | 1.51M | objid_search_index = findNextNonWSBack(genid_search_index - 1, start); |
619 | 4.01M | while (objid_search_index > start && isdigit(*objid_search_index)) |
620 | 2.50M | objid_search_index--; |
621 | | |
622 | 1.51M | if (CL_SUCCESS != cli_strntol_wrap(objid_search_index, (size_t)((genid_search_index)-objid_search_index), 0, 10, &temp_long)) { |
623 | | /* |
624 | | * Edge case: |
625 | | * |
626 | | * PDFs with multiple revisions will have %%EOF before the end of the file, |
627 | | * followed by the next revision of the PDF, which will probably be an immediate objid. |
628 | | * |
629 | | * Example: |
630 | | * %%EOF1 1 obj <blah> endobj |
631 | | * |
632 | | * If this is the case, we can detect it and continue parsing after the %%EOF. |
633 | | */ |
634 | 146k | if (objid_search_index - strlen("%%EO") > start) { |
635 | 113k | const char *lastfile = objid_search_index - strlen("%%EO"); |
636 | 113k | if (0 != strncmp(lastfile, "%%EOF", 5)) { |
637 | | /* Nope, wasn't %%EOF */ |
638 | 112k | cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs); |
639 | | /* Skip past the "obj" thing, and continue. */ |
640 | 112k | pdf->offset = obj_end - pdf->map; |
641 | 112k | status = CL_EPARSE; |
642 | 112k | goto done; |
643 | 112k | } |
644 | | /* Yup, Looks, like the file continues after %%EOF. |
645 | | * Probably another revision. Keep parsing... */ |
646 | 1.07k | objid_search_index++; |
647 | 1.07k | cli_dbgmsg("pdf_findobj: %%%%EOF detected before end of file, at offset: %zu\n", (size_t)(objid_search_index - pdf->map)); |
648 | 33.1k | } else { |
649 | | /* Failed parsing at the very beginning */ |
650 | 33.1k | cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs); |
651 | | /* Probably not a real object. Skip past the "obj" thing, and continue. */ |
652 | 33.1k | pdf->offset = obj_end - pdf->map; |
653 | 33.1k | status = CL_EPARSE; |
654 | 33.1k | goto done; |
655 | 33.1k | } |
656 | | /* Try again, with offset slightly adjusted */ |
657 | 1.07k | if (CL_SUCCESS != cli_strntol_wrap(objid_search_index, (size_t)((genid_search_index - 1) - objid_search_index), 0, 10, &temp_long)) { |
658 | 834 | cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs); |
659 | | /* Still failed... Probably not a real object. Skip past the "obj" thing, and continue. */ |
660 | 834 | pdf->offset = obj_end - pdf->map; |
661 | 834 | status = CL_EPARSE; |
662 | 834 | goto done; |
663 | 834 | } else if (temp_long < 0) { |
664 | 0 | cli_dbgmsg("pdf_findobj: Encountered invalid negative objid (%ld).\n", temp_long); |
665 | 0 | pdf->offset = obj_end - pdf->map; |
666 | 0 | status = CL_EPARSE; |
667 | 0 | goto done; |
668 | 0 | } |
669 | | |
670 | 236 | cli_dbgmsg("pdf_findobj: There appears to be an additional revision. Continuing to parse...\n"); |
671 | 1.36M | } else if (temp_long < 0) { |
672 | 15.6k | cli_dbgmsg("pdf_findobj: Encountered invalid negative objid (%ld).\n", temp_long); |
673 | 15.6k | pdf->offset = obj_end - pdf->map; |
674 | 15.6k | status = CL_EPARSE; |
675 | 15.6k | goto done; |
676 | 15.6k | } |
677 | 1.35M | objid = (unsigned long)temp_long; |
678 | | |
679 | 1.35M | obj->id = (objid << 8) | (genid & 0xff); |
680 | 1.35M | obj->start = obj_end - pdf->map; /* obj start begins just after the "obj" string */ |
681 | 1.35M | obj->flags = 0; |
682 | | |
683 | | /* |
684 | | * We now have the objid, genid, and object start. |
685 | | * Find the object end ("endobj"). |
686 | | */ |
687 | | /* `- 1` accounts for size of white space before obj */ |
688 | 1.35M | endobj_begin = cli_memstr(obj_end, pdf->map + pdf->size - obj_end, "endobj", strlen("endobj")); |
689 | 1.35M | if (NULL == endobj_begin) { |
690 | | /* No end to object. |
691 | | * PDF appears to be malformed or truncated. |
692 | | * Will record the object size as going ot the end of the file. |
693 | | * Will record that the object is truncated. |
694 | | * Will position the pdf offset to the end of the PDF. |
695 | | * The next iteration of this function will find no more objects. */ |
696 | 352k | obj->flags |= 1 << OBJ_TRUNCATED; |
697 | 352k | obj->size = (pdf->map + pdf->size) - obj_end; |
698 | 352k | pdf->offset = pdf->size; |
699 | | |
700 | | /* Truncated "object" found! */ |
701 | 352k | status = CL_SUCCESS; |
702 | 352k | goto done; |
703 | 352k | } |
704 | 998k | endobj_end = endobj_begin + strlen("endobj"); |
705 | | |
706 | | /* Size of the object goes from "obj" <-> "endobject". */ |
707 | 998k | obj->size = endobj_begin - obj_end; |
708 | 998k | pdf->offset = endobj_end - pdf->map; |
709 | | |
710 | | /* |
711 | | * Object found! |
712 | | */ |
713 | 998k | status = CL_SUCCESS; /* truncated file, no end to obj. */ |
714 | | |
715 | 2.08M | done: |
716 | 2.08M | if (status == CL_SUCCESS) { |
717 | 1.35M | cli_dbgmsg("pdf_findobj: found %d %d obj @%lld, size: %zu bytes.\n", obj->id >> 8, obj->id & 0xff, (long long)(obj->start + pdf->startoff), obj->size); |
718 | 1.35M | } else { |
719 | | /* Remove the unused obj reference from our list of objects found */ |
720 | | /* No need to realloc pdf->objs back down. It won't leak. */ |
721 | 732k | pdf->objs[pdf->nobjs - 1] = NULL; |
722 | 732k | pdf->nobjs--; |
723 | | |
724 | | /* Free up the obj struct. */ |
725 | 732k | if (NULL != obj) |
726 | 732k | free(obj); |
727 | | |
728 | 732k | if (status == CL_BREAK) { |
729 | 433k | cli_dbgmsg("pdf_findobj: No more objects (# objects found: %u)\n", pdf->nobjs); |
730 | 433k | } else if (status == CL_EMEM) { |
731 | 0 | cli_warnmsg("pdf_findobj: Error allocating memory (# objects found: %u)\n", pdf->nobjs); |
732 | 298k | } else { |
733 | 298k | cli_dbgmsg("pdf_findobj: Unexpected status code %d.\n", status); |
734 | 298k | } |
735 | 732k | } |
736 | | |
737 | 2.08M | return status; |
738 | 998k | } |
739 | | |
740 | | static size_t filter_writen(struct pdf_struct *pdf, struct pdf_obj *obj, int fout, const char *buf, size_t len, size_t *sum) |
741 | 330k | { |
742 | 330k | UNUSEDPARAM(obj); |
743 | | |
744 | 330k | if (cli_checklimits("pdf", pdf->ctx, (uint64_t)*sum, 0, 0)) |
745 | 7.16k | return len; |
746 | | |
747 | 323k | *sum += len; |
748 | | |
749 | 323k | return cli_writen(fout, buf, len); |
750 | 330k | } |
751 | | |
752 | | void pdfobj_flag(struct pdf_struct *pdf, struct pdf_obj *obj, enum pdf_flag flag) |
753 | 1.19M | { |
754 | 1.19M | const char *s = ""; |
755 | 1.19M | pdf->flags |= 1 << flag; |
756 | 1.19M | if (!cli_debug_flag) |
757 | 1.19M | return; |
758 | | |
759 | 0 | switch (flag) { |
760 | 0 | case UNTERMINATED_OBJ_DICT: |
761 | 0 | s = "dictionary not terminated"; |
762 | 0 | break; |
763 | 0 | case ESCAPED_COMMON_PDFNAME: |
764 | | /* like /JavaScript */ |
765 | 0 | s = "escaped common pdfname"; |
766 | 0 | break; |
767 | 0 | case BAD_STREAM_FILTERS: |
768 | 0 | s = "duplicate stream filters"; |
769 | 0 | break; |
770 | 0 | case BAD_PDF_VERSION: |
771 | 0 | s = "bad pdf version"; |
772 | 0 | break; |
773 | 0 | case BAD_PDF_HEADERPOS: |
774 | 0 | s = "bad pdf header position"; |
775 | 0 | break; |
776 | 0 | case BAD_PDF_TRAILER: |
777 | 0 | s = "bad pdf trailer"; |
778 | 0 | break; |
779 | 0 | case BAD_PDF_TOOMANYOBJS: |
780 | 0 | s = "too many pdf objs"; |
781 | 0 | break; |
782 | 0 | case BAD_FLATE: |
783 | 0 | s = "bad deflate stream"; |
784 | 0 | break; |
785 | 0 | case BAD_FLATESTART: |
786 | 0 | s = "bad deflate stream start"; |
787 | 0 | break; |
788 | 0 | case BAD_STREAMSTART: |
789 | 0 | s = "bad stream start"; |
790 | 0 | break; |
791 | 0 | case UNKNOWN_FILTER: |
792 | 0 | s = "unknown filter used"; |
793 | 0 | break; |
794 | 0 | case BAD_ASCIIDECODE: |
795 | 0 | s = "bad ASCII decode"; |
796 | 0 | break; |
797 | 0 | case HEX_JAVASCRIPT: |
798 | 0 | s = "hex javascript"; |
799 | 0 | break; |
800 | 0 | case BAD_INDOBJ: |
801 | 0 | s = "referencing nonexistent obj"; |
802 | 0 | break; |
803 | 0 | case HAS_OPENACTION: |
804 | 0 | s = "has /OpenAction"; |
805 | 0 | break; |
806 | 0 | case HAS_LAUNCHACTION: |
807 | 0 | s = "has /LaunchAction"; |
808 | 0 | break; |
809 | 0 | case BAD_STREAMLEN: |
810 | 0 | s = "bad /Length, too small"; |
811 | 0 | break; |
812 | 0 | case ENCRYPTED_PDF: |
813 | 0 | s = "PDF is encrypted"; |
814 | 0 | break; |
815 | 0 | case LINEARIZED_PDF: |
816 | 0 | s = "linearized PDF"; |
817 | 0 | break; |
818 | 0 | case MANY_FILTERS: |
819 | 0 | s = "more than 2 filters per obj"; |
820 | 0 | break; |
821 | 0 | case DECRYPTABLE_PDF: |
822 | 0 | s = "decryptable PDF"; |
823 | 0 | break; |
824 | 0 | } |
825 | | |
826 | 0 | cli_dbgmsg("pdfobj_flag: %s flagged in object %u %u\n", s, obj->id >> 8, obj->id & 0xff); |
827 | 0 | } |
828 | | |
829 | | struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t objid) |
830 | 339k | { |
831 | 339k | uint32_t j; |
832 | 339k | uint32_t i; |
833 | | |
834 | | /* search starting at previous obj (if exists) */ |
835 | 67.4M | for (i = 0; i < pdf->nobjs; i++) { |
836 | 67.4M | if (pdf->objs[i] == obj) |
837 | 338k | break; |
838 | 67.4M | } |
839 | | |
840 | 67.3M | for (j = i; j < pdf->nobjs; j++) { |
841 | 67.0M | obj = pdf->objs[j]; |
842 | 67.0M | if (obj->id == objid) |
843 | 43.4k | return obj; |
844 | 67.0M | } |
845 | | |
846 | | /* restart search from beginning if not found */ |
847 | 66.1M | for (j = 0; j < i; j++) { |
848 | 65.8M | obj = pdf->objs[j]; |
849 | 65.8M | if (obj->id == objid) |
850 | 4.80k | return obj; |
851 | 65.8M | } |
852 | | |
853 | 290k | return NULL; |
854 | 295k | } |
855 | | |
856 | | /** |
857 | | * @brief Find and interpret the "/Length" dictionary key value. |
858 | | * |
859 | | * The value may be: |
860 | | * - a direct object (i.e. just a number) |
861 | | * - an indirect object, where the value is somewhere else in the document and we have to look it up. |
862 | | * indirect objects are referenced using an object id (objid), generation id (genid) genid, and the letter 'R'. |
863 | | * |
864 | | * Example dictionary with a single key "/Length" that relies direct object for the value. |
865 | | * |
866 | | * 1 0 obj |
867 | | * << /Length 534 |
868 | | * /Filter [ /ASCII85Decode /LZWDecode ] |
869 | | * >> |
870 | | * stream |
871 | | * J..)6T`?p&<!J9%_[umg"B7/Z7KNXbN'S+,*Q/&"OLT'FLIDK#!n`$"<Atdi`\Vn%b%)&'cA*VnK\CJY(sF>c!Jnl@ |
872 | | * RM]WM;jjH6Gnc75idkL5]+cPZKEBPWdR>FF(kj1_R%W_d&/jS!;iuad7h?[L-F$+]]0A3Ck*$I0KZ?;<)CJtqi65Xb |
873 | | * Vc3\n5ua:Q/=0$W<#N3U;H,MQKqfg1?:lUpR;6oN[C2E4ZNr8Udn.'p+?#X+1>0Kuk$bCDF/(3fL5]Oq)^kJZ!C2H1 |
874 | | * 'TO]Rl?Q:&'<5&iP!$Rq;BXRecDN[IJB`,)o8XJOSJ9sDS]hQ;Rj@!ND)bD_q&C\g:inYC%)&u#:u,M6Bm%IY!Kb1+ |
875 | | * ":aAa'S`ViJglLb8<W9k6Yl\\0McJQkDeLWdPN?9A'jX*al>iG1p&i;eVoK&juJHs9%;Xomop"5KatWRT"JQ#qYuL, |
876 | | * JD?M$0QP)lKn06l1apKDC@\qJ4B!!(5m+j.7F790m(Vj88l8Q:_CZ(Gm1%X\N1&u!FKHMB~> |
877 | | * endstream |
878 | | * endobj |
879 | | * |
880 | | * Example dictionary with a single key "/Length" that relies on an indirect object for the value. |
881 | | * |
882 | | * 7 0 obj |
883 | | * << /Length 8 0 R >> % An indirect reference to object 8, with generation id 0. |
884 | | * stream |
885 | | * BT |
886 | | * /F1 12 Tf |
887 | | * 72 712 Td |
888 | | * ( A stream with an indirect length ) Tj |
889 | | * ET |
890 | | * endstream |
891 | | * endobj |
892 | | * |
893 | | * 8 0 obj |
894 | | * 77 % The length of the preceding stream |
895 | | * endobj |
896 | | * |
897 | | * @param pdf Pdf context structure. |
898 | | * @param obj Pdf object context structure. |
899 | | * @param start Pointer start of the dictionary string. |
900 | | * @param len Remaining length of the dictioary string in bytes. |
901 | | * @return size_t Unsigned integer value of the "/Length" key |
902 | | */ |
903 | | static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const char *dict_start, size_t dict_len) |
904 | 674k | { |
905 | 674k | size_t length = 0; |
906 | 674k | const char *obj_start = dict_start; |
907 | 674k | size_t bytes_remaining = dict_len; |
908 | 674k | long temp_long = 0; |
909 | 674k | const char *index; |
910 | | |
911 | 674k | if (bytes_remaining < 8) { |
912 | 25.3k | return 0; |
913 | 25.3k | } |
914 | | |
915 | | /* |
916 | | * Find the "/Length" dictionary key |
917 | | */ |
918 | 648k | index = cli_memstr(obj_start, bytes_remaining, "/Length", 7); |
919 | 648k | if (!index) |
920 | 326k | return 0; |
921 | | |
922 | 322k | bytes_remaining -= index - obj_start; |
923 | | |
924 | 322k | if (bytes_remaining < 1) { |
925 | 0 | return 0; |
926 | 0 | } |
927 | | |
928 | | /* Step the index into the "/Length" string. */ |
929 | 322k | index++; |
930 | 322k | bytes_remaining--; |
931 | | |
932 | | /* Find the start of the next direct or indirect object. |
933 | | * pdf_nextobject() assumes we started searching from within a previous object */ |
934 | 322k | obj_start = pdf_nextobject(index, bytes_remaining); |
935 | 322k | if (!obj_start) |
936 | 730 | return 0; |
937 | | |
938 | 321k | if (bytes_remaining < (size_t)(obj_start - index)) { |
939 | 0 | return 0; |
940 | 0 | } |
941 | 321k | bytes_remaining -= obj_start - index; |
942 | 321k | index = obj_start; |
943 | | |
944 | | /* Read the value. This could either be the direct length value, |
945 | | or the object id of the indirect object that has the length */ |
946 | 321k | if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) { |
947 | 27.0k | cli_dbgmsg("find_length: failed to parse object length or objid\n"); |
948 | 27.0k | return 0; |
949 | 294k | } else if (temp_long < 0) { |
950 | 5.59k | cli_dbgmsg("find_length: Encountered invalid negative object length or objid (%ld).\n", temp_long); |
951 | 5.59k | return 0; |
952 | 5.59k | } |
953 | 289k | length = (size_t)temp_long; /* length or maybe object id */ |
954 | | |
955 | | /* |
956 | | * Keep parsing, skipping past the first integer that might have been what we wanted. |
957 | | * If it's an indirect object, we'll find a Generation ID followed by the letter 'R' |
958 | | * I.e. something like " 0 R" |
959 | | */ |
960 | 1.17M | while ((bytes_remaining > 0) && isdigit(*index)) { |
961 | 880k | index++; |
962 | 880k | bytes_remaining--; |
963 | 880k | } |
964 | | |
965 | 289k | if ((bytes_remaining > 0) && (*index == ' ')) { |
966 | 33.3k | unsigned long genid; |
967 | | |
968 | 33.3k | index++; |
969 | 33.3k | bytes_remaining--; |
970 | | |
971 | 33.3k | if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) { |
972 | 4.23k | cli_dbgmsg("find_length: failed to parse object genid\n"); |
973 | 4.23k | return 0; |
974 | 29.0k | } else if (temp_long < 0) { |
975 | 3.80k | cli_dbgmsg("find_length: Encountered invalid negative object genid (%ld).\n", temp_long); |
976 | 3.80k | return 0; |
977 | 3.80k | } |
978 | 25.2k | genid = (unsigned long)temp_long; |
979 | | |
980 | 125k | while ((bytes_remaining > 0) && isdigit(*index)) { |
981 | 100k | index++; |
982 | 100k | bytes_remaining--; |
983 | 100k | } |
984 | | |
985 | 25.2k | if (bytes_remaining < 2) { |
986 | 0 | return 0; |
987 | 0 | } |
988 | | |
989 | 25.2k | if (index[0] == ' ' && index[1] == 'R') { |
990 | | /* |
991 | | * Ok so we found a genid and that 'R'. Which means that first value |
992 | | * was actually the objid. |
993 | | * We can look up the indirect object using this information. |
994 | | */ |
995 | 16.6k | unsigned long objid = length; |
996 | 16.6k | const char *indirect_obj_start = NULL; |
997 | | |
998 | 16.6k | cli_dbgmsg("find_length: length is in indirect object %lu %lu\n", objid, genid); |
999 | | |
1000 | 16.6k | obj = find_obj(pdf, obj, (length << 8) | (genid & 0xff)); |
1001 | 16.6k | if (!obj) { |
1002 | 7.52k | cli_dbgmsg("find_length: indirect object not found\n"); |
1003 | 7.52k | return 0; |
1004 | 7.52k | } |
1005 | | |
1006 | 9.15k | indirect_obj_start = pdf->map + obj->start; |
1007 | 9.15k | bytes_remaining = pdf->size - obj->start; |
1008 | | |
1009 | | /* Ok so we found the indirect object, lets read the value. */ |
1010 | 9.15k | index = pdf_nextobject(indirect_obj_start, bytes_remaining); |
1011 | 9.15k | if (!index) { |
1012 | 171 | cli_dbgmsg("find_length: next object not found\n"); |
1013 | 171 | return 0; |
1014 | 171 | } |
1015 | | |
1016 | 8.98k | if (bytes_remaining < (size_t)(index - indirect_obj_start)) { |
1017 | 0 | return 0; |
1018 | 0 | } |
1019 | 8.98k | bytes_remaining -= index - indirect_obj_start; |
1020 | | |
1021 | | /* Found the value, so lets parse it as a long, but prohibit negative lengths. */ |
1022 | 8.98k | if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) { |
1023 | 4.40k | cli_dbgmsg("find_length: failed to parse object length from indirect object\n"); |
1024 | 4.40k | return 0; |
1025 | 4.58k | } else if (temp_long < 0) { |
1026 | 399 | cli_dbgmsg("find_length: Encountered invalid negative obj length (%ld).\n", temp_long); |
1027 | 399 | return 0; |
1028 | 399 | } |
1029 | 4.18k | length = (size_t)temp_long; |
1030 | 4.18k | } |
1031 | 25.2k | } |
1032 | | |
1033 | | /* limit length */ |
1034 | 268k | if ((size_t)(obj_start - pdf->map) + length + 5 > pdf->size) |
1035 | 40.3k | length = pdf->size - (obj_start - pdf->map) - 5; |
1036 | | |
1037 | 268k | return length; |
1038 | 289k | } |
1039 | | |
/* Object flag bits tested by pdf_extract_obj() when deciding whether an object gets dumped. */
#define DUMP_MASK                 \
    ((1 << OBJ_CONTENTS) |        \
     (1 << OBJ_FILTER_FLATE) |    \
     (1 << OBJ_FILTER_DCT) |      \
     (1 << OBJ_FILTER_AH) |       \
     (1 << OBJ_FILTER_A85) |      \
     (1 << OBJ_EMBEDDED_FILE) |   \
     (1 << OBJ_JAVASCRIPT) |      \
     (1 << OBJ_OPENACTION) |      \
     (1 << OBJ_LAUNCHACTION))
1041 | | |
1042 | | static int run_pdf_hooks(struct pdf_struct *pdf, enum pdf_phase phase, int fd) |
1043 | 2.25M | { |
1044 | 2.25M | int ret; |
1045 | 2.25M | struct cli_bc_ctx *bc_ctx; |
1046 | 2.25M | cli_ctx *ctx = NULL; |
1047 | 2.25M | fmap_t *map; |
1048 | | |
1049 | 2.25M | if (NULL == pdf) |
1050 | 0 | return CL_EARG; |
1051 | | |
1052 | 2.25M | ctx = pdf->ctx; |
1053 | | |
1054 | 2.25M | bc_ctx = cli_bytecode_context_alloc(); |
1055 | 2.25M | if (!bc_ctx) { |
1056 | 0 | cli_errmsg("run_pdf_hooks: can't allocate memory for bc_ctx\n"); |
1057 | 0 | return CL_EMEM; |
1058 | 0 | } |
1059 | | |
1060 | 2.25M | map = ctx->fmap; |
1061 | 2.25M | if (fd != -1) { |
1062 | 959k | map = fmap(fd, 0, 0, NULL); |
1063 | 959k | if (!map) { |
1064 | 0 | cli_dbgmsg("run_pdf_hooks: can't mmap pdf extracted obj\n"); |
1065 | 0 | map = ctx->fmap; |
1066 | 0 | fd = -1; |
1067 | 0 | } |
1068 | 959k | } |
1069 | | |
1070 | 2.25M | cli_bytecode_context_setpdf(bc_ctx, phase, pdf->nobjs, pdf->objs, &pdf->flags, pdf->size, pdf->startoff); |
1071 | 2.25M | cli_bytecode_context_setctx(bc_ctx, ctx); |
1072 | 2.25M | ret = cli_bytecode_runhook(ctx, ctx->engine, bc_ctx, BC_PDF, map); |
1073 | 2.25M | cli_bytecode_context_destroy(bc_ctx); |
1074 | | |
1075 | 2.25M | if (fd != -1) |
1076 | 959k | funmap(map); |
1077 | | |
1078 | 2.25M | return ret; |
1079 | 2.25M | } |
1080 | | |
1081 | | static void dbg_printhex(const char *msg, const char *hex, unsigned len); |
1082 | | |
/**
 * @brief Decrypt an AES-CBC encrypted buffer and strip the trailing padding.
 *
 * Despite the name, 16, 24 and 32 byte keys are all accepted.
 *
 * When `has_iv` is non-zero, the first 16 bytes of `in` are used as the IV and
 * are not part of the output; otherwise an all-zero IV is used. Whole 16-byte
 * blocks are decrypted into `q`; a trailing partial block is ignored.
 *
 * On return, *length is reduced to the number of plaintext bytes written to `q`:
 * the IV, any trailing partial block and (when the padding is valid) the padding
 * bytes are subtracted. If the padding is invalid, the padding bytes are left
 * counted in *length.
 *
 * If a parameter check fails (NULL `in`/`length`, unsupported key length, input
 * shorter than 32 bytes) or key setup fails, the function returns without
 * touching `q` or *length.
 *
 * @param in        Ciphertext (optionally prefixed with a 16-byte IV).
 * @param length    In: number of bytes in `in`. Out: number of plaintext bytes in `q`.
 * @param q         Output buffer for the plaintext.
 * @param key       AES key.
 * @param key_n     Key length in bytes (16, 24 or 32).
 * @param has_iv    Non-zero if `in` begins with an IV and ends with padding.
 */
static void aes_256cbc_decrypt(const unsigned char *in, size_t *length, unsigned char *q, char *key, unsigned key_n, int has_iv)
{
    uint32_t rk[RKLENGTH(256)];
    unsigned char iv[16];
    size_t len = 0;
    unsigned char pad;
    unsigned i;
    int nrounds;

    if (in == NULL || length == NULL) {
        cli_dbgmsg("aes_256cbc_decrypt: invalid NULL parameters!\n");
        noisy_warnmsg("aes_256cbc_decrypt: invalid NULL parameters!\n");
        return;
    }

    len = *length;

    /* key_n is unsigned, so it must be printed with %u. */
    cli_dbgmsg("aes_256cbc_decrypt: key length: %u, data length: %zu\n", key_n, *length);
    if (!(key_n == 16 || key_n == 24 || key_n == 32)) {
        cli_dbgmsg("aes_256cbc_decrypt: invalid key length: %u!\n", key_n * 8);
        noisy_warnmsg("aes_256cbc_decrypt: invalid key length: %u!\n", key_n * 8);
        return;
    }

    if (len < 32) {
        cli_dbgmsg("aes_256cbc_decrypt: len is <32: %zu\n", len);
        noisy_warnmsg("aes_256cbc_decrypt: len is <32: %zu\n", len);
        return;
    }

    if (has_iv) {
        memcpy(iv, in, 16);
        in += 16;
        len -= 16;
    } else {
        memset(iv, 0, sizeof(iv));
    }

    cli_dbgmsg("aes_256cbc_decrypt: Calling rijndaelSetupDecrypt\n");
    nrounds = rijndaelSetupDecrypt(rk, (const unsigned char *)key, key_n * 8);
    if (!nrounds) {
        cli_dbgmsg("aes_256cbc_decrypt: nrounds = 0\n");
        return;
    }
    cli_dbgmsg("aes_256cbc_decrypt: Beginning rijndaelDecrypt\n");

    /* CBC: plaintext block = D(ciphertext block) XOR previous ciphertext block (or IV). */
    while (len >= 16) {
        rijndaelDecrypt(rk, nrounds, in, q);
        for (i = 0; i < 16; i++)
            q[i] ^= iv[i];

        memcpy(iv, in, 16);

        q += 16;
        in += 16;
        len -= 16;
    }

    /* From here on `len` counts the bytes that are NOT plaintext output. */
    if (has_iv) {
        len += 16; /* the IV */
        pad = q[-1]; /* len was >= 16 after the IV, so at least one block was written */

        if (pad > 0x10) {
            cli_dbgmsg("aes_256cbc_decrypt: bad pad: %x (extra len: %zu)\n", pad, len - 16);
            noisy_warnmsg("aes_256cbc_decrypt: bad pad: %x (extra len: %zu)\n", pad, len - 16);
            *length -= len;
            return;
        }

        /* PKCS#7: the last `pad` bytes must all equal `pad`. Check every one
         * of them, starting with the first padding byte at q[0]. */
        q -= pad;
        for (i = 0; i < pad; i++) {
            if (q[i] != pad) {
                cli_dbgmsg("aes_256cbc_decrypt: bad pad: %x != %x\n", q[i], pad);
                noisy_warnmsg("aes_256cbc_decrypt: bad pad: %x != %x\n", q[i], pad);
                *length -= len;

                return;
            }
        }

        len += pad;
    }

    *length -= len;

    cli_dbgmsg("aes_256cbc_decrypt: length is %zu\n", *length);
}
1170 | | |
/**
 * @brief Encrypt a buffer with AES in CBC mode, without adding padding.
 *
 * Only whole 16-byte blocks of `in` are processed; trailing bytes are ignored.
 * If `iv` is NULL an all-zero IV is used.
 *
 * If `key_n` is larger than 16, `in_length` is smaller than 16, or key setup
 * fails, the function returns early without writing to `out` or *out_length.
 *
 * @param in            Plaintext.
 * @param in_length     Number of bytes in `in`.
 * @param out           Output buffer for the ciphertext.
 * @param out_length    Receives the number of ciphertext bytes written.
 * @param key           AES key.
 * @param key_n         Key length in bytes.
 * @param iv            16-byte initialization vector, or NULL.
 */
static void aes_128cbc_encrypt(const unsigned char *in, size_t in_length, unsigned char *out, size_t *out_length, const unsigned char *key, size_t key_n, const unsigned char *iv)
{
    uint32_t rk[RKLENGTH(128)];
    unsigned char chain[16] = {0}; /* IV first, then the previous ciphertext block */
    int nrounds;
    uint8_t k = 0;

    cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: key length: %zu, data length: %zu\n", key_n, in_length);
    if (key_n > 16) {
        cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: key length is %zu!\n", key_n * 8);
        return;
    }

    if (in_length < 16) {
        cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: in_length is <16: %zu\n", in_length);
        noisy_warnmsg("cli_pdf: aes_128cbc_encrypt: in_length is <16: %zu\n", in_length);
        return;
    }

    cli_dbgmsg("aes_128cbc_encrypt: Calling rijndaelSetupEncrypt\n");
    nrounds = rijndaelSetupEncrypt(rk, key, key_n * 8);
    if (0 == nrounds) {
        cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: nrounds = 0\n");
        return;
    }
    cli_dbgmsg("aes_128cbc_encrypt: Beginning rijndaelEncrypt\n");

    if (NULL != iv) {
        memcpy(chain, iv, sizeof(chain));
    }

    *out_length = 0;
    for (; in_length >= 16; in_length -= 16) {
        /* CBC: XOR the plaintext block into the chaining value, then encrypt in place. */
        for (k = 0; k < 16; k++) {
            chain[k] ^= in[k];
        }

        rijndaelEncrypt(rk, nrounds, chain, chain);

        /* The encrypted chaining value is this block's ciphertext. */
        for (k = 0; k < 16; k++) {
            out[k] = chain[k];
        }

        out += 16;
        *out_length += 16;
        in += 16;
    }

    cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: length is %zu\n", *out_length);
}
1219 | | |
1220 | | char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, size_t *length, enum enc_method enc_method) |
1221 | 50.1k | { |
1222 | 50.1k | unsigned char *key, *q, result[16]; |
1223 | 50.1k | unsigned n; |
1224 | 50.1k | struct arc4_state arc4; |
1225 | | |
1226 | 50.1k | if (!length || !*length || !in) { |
1227 | 77 | noisy_warnmsg("decrypt_any: decrypt failed for obj %u %u: Invalid arguments.\n", id >> 8, id & 0xff); |
1228 | 77 | return NULL; |
1229 | 77 | } |
1230 | | |
1231 | 50.0k | if (NULL == pdf->key || 0 == pdf->keylen) { |
1232 | 7.01k | noisy_warnmsg("decrypt_any: decrypt failed for obj %u %u: PDF key never identified.\n", id >> 8, id & 0xff); |
1233 | 7.01k | return NULL; |
1234 | 7.01k | } |
1235 | | |
1236 | 43.0k | n = pdf->keylen + 5; |
1237 | 43.0k | if (enc_method == ENC_AESV2) |
1238 | 1.61k | n += 4; |
1239 | | |
1240 | 43.0k | key = cli_max_malloc(n); |
1241 | 43.0k | if (!key) { |
1242 | 0 | noisy_warnmsg("decrypt_any: malloc failed\n"); |
1243 | 0 | return NULL; |
1244 | 0 | } |
1245 | | |
1246 | 43.0k | memcpy(key, pdf->key, pdf->keylen); |
1247 | 43.0k | q = key + pdf->keylen; |
1248 | 43.0k | *q++ = id >> 8; |
1249 | 43.0k | *q++ = id >> 16; |
1250 | 43.0k | *q++ = id >> 24; |
1251 | 43.0k | *q++ = id; |
1252 | 43.0k | *q++ = 0; |
1253 | 43.0k | if (enc_method == ENC_AESV2) |
1254 | 1.61k | memcpy(q, "sAlT", 4); |
1255 | | |
1256 | 43.0k | cl_hash_data("md5", key, n, result, NULL); |
1257 | 43.0k | free(key); |
1258 | | |
1259 | 43.0k | n = pdf->keylen + 5; |
1260 | 43.0k | if (n > 16) |
1261 | 41.8k | n = 16; |
1262 | | |
1263 | 43.0k | q = cli_max_calloc(*length, sizeof(char)); |
1264 | 43.0k | if (!q) { |
1265 | 0 | noisy_warnmsg("decrypt_any: malloc failed\n"); |
1266 | 0 | return NULL; |
1267 | 0 | } |
1268 | | |
1269 | 43.0k | switch (enc_method) { |
1270 | 1.08k | case ENC_V2: |
1271 | 1.08k | cli_dbgmsg("cli_pdf: enc is v2\n"); |
1272 | 1.08k | memcpy(q, in, *length); |
1273 | 1.08k | if (false == arc4_init(&arc4, result, n)) { |
1274 | 0 | noisy_warnmsg("decrypt_any: failed to init arc4\n"); |
1275 | 0 | free(q); |
1276 | 0 | return NULL; |
1277 | 0 | } |
1278 | 1.08k | arc4_apply(&arc4, q, (unsigned)*length); /* TODO: may truncate for very large lengths */ |
1279 | | |
1280 | 1.08k | noisy_msg(pdf, "decrypt_any: decrypted ARC4 data\n"); |
1281 | | |
1282 | 1.08k | break; |
1283 | 1.61k | case ENC_AESV2: |
1284 | 1.61k | cli_dbgmsg("cli_pdf: enc is aesv2\n"); |
1285 | 1.61k | aes_256cbc_decrypt((const unsigned char *)in, length, q, (char *)result, n, 1); |
1286 | | |
1287 | 1.61k | noisy_msg(pdf, "decrypt_any: decrypted AES(v2) data\n"); |
1288 | | |
1289 | 1.61k | break; |
1290 | 26.4k | case ENC_AESV3: |
1291 | 26.4k | cli_dbgmsg("decrypt_any: enc is aesv3\n"); |
1292 | | |
1293 | 26.4k | aes_256cbc_decrypt((const unsigned char *)in, length, q, pdf->key, pdf->keylen, 1); |
1294 | | |
1295 | 26.4k | noisy_msg(pdf, "decrypted AES(v3) data\n"); |
1296 | | |
1297 | 26.4k | break; |
1298 | 3.22k | case ENC_IDENTITY: |
1299 | 3.22k | cli_dbgmsg("decrypt_any: enc is identity\n"); |
1300 | 3.22k | memcpy(q, in, *length); |
1301 | | |
1302 | 3.22k | noisy_msg(pdf, "decrypt_any: identity encryption\n"); |
1303 | | |
1304 | 3.22k | break; |
1305 | 135 | case ENC_NONE: |
1306 | 135 | cli_dbgmsg("decrypt_any: enc is none\n"); |
1307 | | |
1308 | 135 | noisy_msg(pdf, "encryption is none\n"); |
1309 | | |
1310 | 135 | free(q); |
1311 | 135 | return NULL; |
1312 | 10.5k | case ENC_UNKNOWN: |
1313 | 10.5k | cli_dbgmsg("decrypt_any: enc is unknown\n"); |
1314 | 10.5k | free(q); |
1315 | | |
1316 | 10.5k | noisy_warnmsg("decrypt_any: unknown encryption method for obj %u %u\n", |
1317 | 10.5k | id >> 8, id & 0xff); |
1318 | | |
1319 | 10.5k | return NULL; |
1320 | 43.0k | } |
1321 | | |
1322 | 32.3k | return (char *)q; |
1323 | 43.0k | } |
1324 | | |
1325 | | enum enc_method get_enc_method(struct pdf_struct *pdf, struct pdf_obj *obj) |
1326 | 41.9k | { |
1327 | 41.9k | if (obj->flags & (1 << OBJ_EMBEDDED_FILE)) |
1328 | 337 | return pdf->enc_method_embeddedfile; |
1329 | | |
1330 | 41.5k | if (obj->flags & (1 << OBJ_STREAM)) |
1331 | 39.6k | return pdf->enc_method_stream; |
1332 | | |
1333 | 1.97k | return pdf->enc_method_string; |
1334 | 41.5k | } |
1335 | | |
1336 | | enum cstate { |
1337 | | CSTATE_NONE, |
1338 | | CSTATE_TJ, |
1339 | | CSTATE_TJ_PAROPEN |
1340 | | }; |
1341 | | |
1342 | | static void process(struct text_norm_state *s, enum cstate *st, const char *buf, size_t length, int fout) |
1343 | 546k | { |
1344 | 1.62G | do { |
1345 | 1.62G | switch (*st) { |
1346 | 1.56M | case CSTATE_NONE: |
1347 | 1.56M | if (*buf == '[') { |
1348 | 22.7k | *st = CSTATE_TJ; |
1349 | 1.54M | } else { |
1350 | 1.54M | const char *nl = memchr(buf, '\n', length); |
1351 | 1.54M | if (!nl) |
1352 | 317k | return; |
1353 | | |
1354 | 1.22M | if ((size_t)(nl - buf) > length) { |
1355 | 0 | length = 0; |
1356 | 1.22M | } else { |
1357 | 1.22M | length -= nl - buf; |
1358 | 1.22M | } |
1359 | 1.22M | buf = nl; |
1360 | 1.22M | } |
1361 | | |
1362 | 1.25M | break; |
1363 | 587M | case CSTATE_TJ: |
1364 | 587M | if (*buf == '(') |
1365 | 200k | *st = CSTATE_TJ_PAROPEN; |
1366 | | |
1367 | 587M | break; |
1368 | 1.03G | case CSTATE_TJ_PAROPEN: |
1369 | 1.03G | if (*buf == ')') { |
1370 | 184k | *st = CSTATE_TJ; |
1371 | 1.03G | } else { |
1372 | 1.03G | if (text_normalize_buffer(s, (const unsigned char *)buf, 1) != 1) { |
1373 | 638 | cli_writen(fout, s->out, s->out_pos); |
1374 | 638 | text_normalize_reset(s); |
1375 | 638 | } |
1376 | 1.03G | } |
1377 | | |
1378 | 1.03G | break; |
1379 | 1.62G | } |
1380 | | |
1381 | 1.62G | buf++; |
1382 | 1.62G | if (length > 0) |
1383 | 1.62G | length--; |
1384 | 1.62G | } while (length > 0); |
1385 | 546k | } |
1386 | | |
1387 | | static int pdf_scan_contents(int fd, struct pdf_struct *pdf, struct pdf_obj *obj) |
1388 | 69.5k | { |
1389 | 69.5k | struct text_norm_state s; |
1390 | 69.5k | char fullname[1024]; |
1391 | 69.5k | char outbuff[BUFSIZ]; |
1392 | 69.5k | char inbuf[BUFSIZ]; |
1393 | 69.5k | int fout; |
1394 | 69.5k | size_t n; |
1395 | 69.5k | cl_error_t rc; |
1396 | 69.5k | enum cstate st = CSTATE_NONE; |
1397 | | |
1398 | 69.5k | snprintf(fullname, sizeof(fullname), "%s" PATHSEP "pdf obj %d %d contents", pdf->dir, obj->id >> 8, obj->id & 0xff); |
1399 | 69.5k | fout = open(fullname, O_RDWR | O_CREAT | O_EXCL | O_TRUNC | O_BINARY, 0600); |
1400 | 69.5k | if (fout < 0) { |
1401 | 0 | char err[128]; |
1402 | |
|
1403 | 0 | cli_errmsg("pdf_scan_contents: can't create temporary file %s: %s\n", fullname, cli_strerror(errno, err, sizeof(err))); |
1404 | 0 | return CL_ETMPFILE; |
1405 | 0 | } |
1406 | | |
1407 | 69.5k | text_normalize_init(&s, (unsigned char *)outbuff, sizeof(outbuff)); |
1408 | 616k | while (1) { |
1409 | 616k | n = cli_readn(fd, inbuf, sizeof(inbuf)); |
1410 | 616k | if ((n == 0) || (n == (size_t)-1)) |
1411 | 69.5k | break; |
1412 | | |
1413 | 546k | process(&s, &st, inbuf, n, fout); |
1414 | 546k | } |
1415 | | |
1416 | 69.5k | cli_writen(fout, s.out, s.out_pos); |
1417 | | |
1418 | 69.5k | lseek(fout, 0, SEEK_SET); |
1419 | 69.5k | rc = cli_magic_scan_desc(fout, fullname, pdf->ctx, NULL, LAYER_ATTRIBUTES_NONE); |
1420 | 69.5k | close(fout); |
1421 | | |
1422 | 69.5k | if (!pdf->ctx->engine->keeptmp || (s.out_pos == 0)) |
1423 | 69.5k | if (cli_unlink(fullname) && rc != CL_VIRUS) |
1424 | 0 | rc = CL_EUNLINK; |
1425 | | |
1426 | 69.5k | return rc; |
1427 | 69.5k | } |
1428 | | |
1429 | | cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags) |
1430 | 1.58M | { |
1431 | 1.58M | char fullname[PATH_MAX + 1]; |
1432 | 1.58M | int fout = -1; |
1433 | 1.58M | size_t sum = 0; |
1434 | 1.58M | cl_error_t rc = CL_SUCCESS; |
1435 | 1.58M | int dump = 1; |
1436 | | |
1437 | 1.58M | cli_dbgmsg("pdf_extract_obj: obj %u %u\n", obj->id >> 8, obj->id & 0xff); |
1438 | | |
1439 | 1.58M | if (PDF_OBJECT_RECURSION_LIMIT < pdf->parse_recursion_depth) { |
1440 | 0 | cli_dbgmsg("pdf_extract_obj: Recursion limit reached.\n"); |
1441 | 0 | return CL_SUCCESS; |
1442 | 0 | } |
1443 | | |
1444 | 1.58M | if (obj->extracted) { |
1445 | | // Should not attempt to extract the same object more than once. |
1446 | 1.47k | return CL_SUCCESS; |
1447 | 1.47k | } |
1448 | | // We're not done yet, but this is enough to say we've tried. |
1449 | | // Trying again won't help any. |
1450 | 1.58M | obj->extracted = true; |
1451 | | |
1452 | 1.58M | if (obj->objstm) { |
1453 | 239k | cli_dbgmsg("pdf_extract_obj: extracting obj found in objstm.\n"); |
1454 | 239k | if (obj->objstm->streambuf == NULL) { |
1455 | 0 | cli_warnmsg("pdf_extract_obj: object in object stream has null stream buffer!\n"); |
1456 | 0 | return CL_EFORMAT; |
1457 | 0 | } |
1458 | 239k | } |
1459 | | |
1460 | | /* TODO: call bytecode hook here, allow override dumpability */ |
1461 | 1.58M | if ((!(obj->flags & (1 << OBJ_STREAM)) || (obj->flags & (1 << OBJ_HASFILTERS))) && !(obj->flags & DUMP_MASK)) { |
1462 | | /* don't dump all streams */ |
1463 | 580k | dump = 0; |
1464 | 580k | } |
1465 | | |
1466 | 1.58M | if ((obj->flags & (1 << OBJ_IMAGE)) && !(obj->flags & (1 << OBJ_FILTER_DCT))) { |
1467 | | /* don't dump / scan non-JPG images */ |
1468 | 4.48k | dump = 0; |
1469 | 4.48k | } |
1470 | | |
1471 | 1.58M | if (obj->flags & (1 << OBJ_FORCEDUMP)) { |
1472 | | /* bytecode can force dump by setting this flag */ |
1473 | 1.38k | dump = 1; |
1474 | 1.38k | } |
1475 | | |
1476 | 1.58M | if (!dump) |
1477 | 583k | return CL_CLEAN; |
1478 | | |
1479 | 1.00M | cli_dbgmsg("pdf_extract_obj: dumping obj %u %u\n", obj->id >> 8, obj->id & 0xff); |
1480 | | |
1481 | 1.00M | snprintf(fullname, sizeof(fullname), "%s" PATHSEP "pdf obj %d %d", pdf->dir, obj->id >> 8, obj->id & 0xff); |
1482 | 1.00M | fout = open(fullname, O_RDWR | O_CREAT | O_EXCL | O_TRUNC | O_BINARY, 0600); |
1483 | 1.00M | if (fout < 0) { |
1484 | 25 | char err[128]; |
1485 | 25 | cli_errmsg("pdf_extract_obj: can't create temporary file %s: %s\n", fullname, cli_strerror(errno, err, sizeof(err))); |
1486 | | |
1487 | 25 | return CL_ETMPFILE; |
1488 | 25 | } |
1489 | | |
1490 | 1.00M | if (!(flags & PDF_EXTRACT_OBJ_SCAN)) { |
1491 | 1.36k | if (NULL != obj->path) { |
1492 | 0 | obj->path = strdup(fullname); |
1493 | 0 | } |
1494 | 1.36k | } |
1495 | | |
1496 | 1.00M | if ((NULL == obj->objstm) && |
1497 | 1.00M | (obj->flags & (1 << OBJ_STREAM))) { |
1498 | | /* |
1499 | | * Object contains a stream. Parse this now. |
1500 | | */ |
1501 | 674k | cli_dbgmsg("pdf_extract_obj: parsing a stream in obj %u %u\n", obj->id >> 8, obj->id & 0xff); |
1502 | | |
1503 | 674k | const char *start = pdf->map + obj->start; |
1504 | | |
1505 | 674k | size_t length; |
1506 | 674k | size_t orig_length; |
1507 | 674k | int dict_len = obj->stream - start; /* Dictionary should end where the stream begins */ |
1508 | | |
1509 | 674k | const char *pstr; |
1510 | 674k | struct pdf_dict *dparams = NULL; |
1511 | 674k | struct objstm_struct *objstm = NULL; |
1512 | 674k | int xref = 0; |
1513 | | |
1514 | | /* Find and interpret the length dictionary value */ |
1515 | 674k | length = find_length(pdf, obj, start, dict_len); |
1516 | | |
1517 | 674k | orig_length = length; |
1518 | | |
1519 | 674k | if (length > obj->stream_size) { |
1520 | 79.5k | cli_dbgmsg("cli_pdf: Stream length exceeds object length by %zu bytes. Length truncated to %zu bytes\n", length - obj->stream_size, obj->stream_size); |
1521 | 79.5k | noisy_warnmsg("Stream length exceeds object length by %zu bytes. Length truncated to %zu bytes\n", length - obj->stream_size, obj->stream_size); |
1522 | | |
1523 | 79.5k | length = obj->stream_size; |
1524 | 79.5k | } |
1525 | | |
1526 | 674k | if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) && (length == 0)) { |
1527 | | /* |
1528 | | * If the length is unknown and this doesn't contain a FLATE encoded filter... |
1529 | | * Calculate the length using the stream size, and trimming |
1530 | | * off any newline/carriage returns from the end of the stream. |
1531 | | */ |
1532 | 374k | const char *q = start + obj->stream_size; |
1533 | 374k | length = obj->stream_size; |
1534 | 374k | q--; |
1535 | | |
1536 | 374k | if (length > 0) { |
1537 | 360k | if (*q == '\n') { |
1538 | 5.93k | q--; |
1539 | 5.93k | length--; |
1540 | | |
1541 | 5.93k | if (length > 0 && *q == '\r') |
1542 | 1.18k | length--; |
1543 | 354k | } else if (*q == '\r') { |
1544 | 27.7k | length--; |
1545 | 27.7k | } |
1546 | 360k | } |
1547 | | |
1548 | 374k | cli_dbgmsg("pdf_extract_obj: calculated length %lld\n", (long long)length); |
1549 | 374k | } else { |
1550 | 299k | if (obj->stream_size > (size_t)length + 2) { |
1551 | 168k | cli_dbgmsg("cli_pdf: calculated length %zu < %zu\n", |
1552 | 168k | (size_t)length, obj->stream_size); |
1553 | 168k | length = obj->stream_size; |
1554 | 168k | } |
1555 | 299k | } |
1556 | | |
1557 | 674k | if ((0 != orig_length) && (obj->stream_size > (size_t)orig_length + 20)) { |
1558 | 114k | cli_dbgmsg("pdf_extract_obj: orig length: %lld, length: %lld, size: %zu\n", |
1559 | 114k | (long long)orig_length, (long long)length, obj->stream_size); |
1560 | 114k | pdfobj_flag(pdf, obj, BAD_STREAMLEN); |
1561 | 114k | } |
1562 | | |
1563 | 674k | if (0 == length) { |
1564 | 53.1k | length = obj->stream_size; |
1565 | 53.1k | if (0 == length) { |
1566 | 23.1k | cli_dbgmsg("pdf_extract_obj: Alleged or calculated stream length and stream buffer size both 0\n"); |
1567 | 23.1k | goto done; /* Empty stream, nothing to scan */ |
1568 | 23.1k | } |
1569 | 53.1k | } |
1570 | | |
1571 | | /* Check if XRef is enabled */ |
1572 | 651k | if (cli_memstr(start, dict_len, "/XRef", strlen("/XRef"))) { |
1573 | 25.2k | xref = 1; |
1574 | 25.2k | } |
1575 | | |
1576 | | /* |
1577 | | * Identify the DecodeParms, if available. |
1578 | | */ |
1579 | 651k | if (NULL != (pstr = pdf_getdict(start, &dict_len, "/DecodeParms"))) { |
1580 | 66.6k | cli_dbgmsg("pdf_extract_obj: Found /DecodeParms\n"); |
1581 | 584k | } else if (NULL != (pstr = pdf_getdict(start, &dict_len, "/DP"))) { |
1582 | 41.2k | cli_dbgmsg("pdf_extract_obj: Found /DP\n"); |
1583 | 41.2k | } |
1584 | | |
1585 | 651k | if (pstr) { |
1586 | | /* shift pstr left to "<<" for pdf_parse_dict */ |
1587 | 199k | while ((*pstr == '<') && (pstr > start)) { |
1588 | 91.8k | pstr--; |
1589 | 91.8k | dict_len++; |
1590 | 91.8k | } |
1591 | | |
1592 | | /* shift pstr right to "<<" for pdf_parse_dict */ |
1593 | 1.48M | while ((*pstr != '<') && (dict_len > 0)) { |
1594 | 1.38M | pstr++; |
1595 | 1.38M | dict_len--; |
1596 | 1.38M | } |
1597 | | |
1598 | 107k | if (dict_len > 4) { |
1599 | 105k | pdf->parse_recursion_depth++; |
1600 | 105k | dparams = pdf_parse_dict(pdf, obj, obj->size, (char *)pstr, NULL); |
1601 | 105k | pdf->parse_recursion_depth--; |
1602 | 105k | } else { |
1603 | 2.29k | cli_dbgmsg("pdf_extract_obj: failed to locate DecodeParms dictionary start\n"); |
1604 | 2.29k | } |
1605 | 107k | } |
1606 | | |
1607 | | /* |
1608 | | * Go back to the start of the dictionary and check to see if the stream |
1609 | | * is an object stream. If so, collect the relevant info. |
1610 | | */ |
1611 | 651k | dict_len = obj->stream - start; |
1612 | 651k | if (NULL != (pstr = pdf_getdict(start, &dict_len, "/Type/ObjStm"))) { |
1613 | 85.5k | int32_t objstm_first = -1; |
1614 | 85.5k | int32_t objstm_length = -1; |
1615 | 85.5k | int32_t objstm_n = -1; |
1616 | | |
1617 | 85.5k | cli_dbgmsg("pdf_extract_obj: Found /Type/ObjStm\n"); |
1618 | | |
1619 | 85.5k | dict_len = obj->stream - start; |
1620 | 85.5k | if ((-1 == (objstm_first = pdf_readint(start, dict_len, "/First")))) { |
1621 | 10.3k | cli_warnmsg("pdf_extract_obj: Failed to find offset of first object in object stream\n"); |
1622 | 75.2k | } else if ((-1 == (objstm_length = pdf_readint(start, dict_len, "/Length")))) { |
1623 | 4.55k | cli_warnmsg("pdf_extract_obj: Failed to find length of object stream\n"); |
1624 | 70.6k | } else if ((-1 == (objstm_n = pdf_readint(start, dict_len, "/N")))) { |
1625 | 7.21k | cli_warnmsg("pdf_extract_obj: Failed to find num objects in object stream\n"); |
1626 | 63.4k | } else { |
1627 | | /* Add objstm to pdf struct, so it can be freed eventually */ |
1628 | 63.4k | pdf->nobjstms++; |
1629 | 63.4k | pdf->objstms = cli_max_realloc_or_free(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms); |
1630 | 63.4k | if (!pdf->objstms) { |
1631 | 0 | cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms); |
1632 | 0 | pdf_free_dict(dparams); |
1633 | 0 | return CL_EMEM; |
1634 | 0 | } |
1635 | | |
1636 | 63.4k | objstm = malloc(sizeof(struct objstm_struct)); |
1637 | 63.4k | if (!objstm) { |
1638 | 0 | cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms); |
1639 | 0 | pdf_free_dict(dparams); |
1640 | 0 | return CL_EMEM; |
1641 | 0 | } |
1642 | 63.4k | pdf->objstms[pdf->nobjstms - 1] = objstm; |
1643 | | |
1644 | 63.4k | memset(objstm, 0, sizeof(*objstm)); |
1645 | | |
1646 | 63.4k | objstm->first = (uint32_t)objstm_first; |
1647 | 63.4k | objstm->current = (uint32_t)objstm_first; |
1648 | 63.4k | objstm->current_pair = 0; |
1649 | 63.4k | objstm->length = (uint32_t)objstm_length; |
1650 | 63.4k | objstm->n = (uint32_t)objstm_n; |
1651 | | |
1652 | 63.4k | cli_dbgmsg("pdf_extract_obj: ObjStm first obj at offset %d\n", objstm->first); |
1653 | 63.4k | cli_dbgmsg("pdf_extract_obj: ObjStm length is %d bytes\n", objstm->length); |
1654 | 63.4k | cli_dbgmsg("pdf_extract_obj: ObjStm should contain %d objects\n", objstm->n); |
1655 | 63.4k | } |
1656 | 85.5k | } |
1657 | | |
1658 | 651k | sum = pdf_decodestream(pdf, obj, dparams, obj->stream, (uint32_t)length, xref, fout, &rc, objstm); |
1659 | 651k | if ((CL_SUCCESS != rc) && (CL_VIRUS != rc)) { |
1660 | 130k | cli_dbgmsg("Error decoding stream! Error code: %d\n", rc); |
1661 | | |
1662 | | /* It's ok if we couldn't decode the stream, |
1663 | | * make a best effort to keep parsing... |
1664 | | * Unless we were unable to allocate memory.*/ |
1665 | 130k | if (CL_EMEM == rc) { |
1666 | 0 | goto really_done; |
1667 | 0 | } |
1668 | 130k | if (CL_EPARSE == rc) { |
1669 | 130k | rc = CL_SUCCESS; |
1670 | 130k | } |
1671 | | |
1672 | 130k | if (NULL != objstm) { |
1673 | | /* |
1674 | | * If we were expecting an objstm and there was a failure... |
1675 | | * discard the memory for last object stream. |
1676 | | */ |
1677 | 8.92k | if (NULL != pdf->objstms) { |
1678 | 8.92k | if (NULL != pdf->objstms[pdf->nobjstms - 1]) { |
1679 | 8.92k | if (NULL != pdf->objstms[pdf->nobjstms - 1]->streambuf) { |
1680 | 0 | free(pdf->objstms[pdf->nobjstms - 1]->streambuf); |
1681 | 0 | pdf->objstms[pdf->nobjstms - 1]->streambuf = NULL; |
1682 | 0 | } |
1683 | 8.92k | free(pdf->objstms[pdf->nobjstms - 1]); |
1684 | 8.92k | pdf->objstms[pdf->nobjstms - 1] = NULL; |
1685 | 8.92k | } |
1686 | | |
1687 | | /* Pop the objstm off the end of the pdf->objstms array. */ |
1688 | 8.92k | if (pdf->nobjstms > 0) { |
1689 | 8.92k | pdf->nobjstms--; |
1690 | 8.92k | if (0 == pdf->nobjstms) { |
1691 | 2.05k | free(pdf->objstms); |
1692 | 2.05k | pdf->objstms = NULL; |
1693 | 6.86k | } else { |
1694 | 6.86k | pdf->objstms = cli_max_realloc_or_free(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms); |
1695 | | |
1696 | 6.86k | if (!pdf->objstms) { |
1697 | 0 | cli_warnmsg("pdf_extract_obj: out of memory when shrinking down objstm array\n"); |
1698 | 0 | return CL_EMEM; |
1699 | 0 | } |
1700 | 6.86k | } |
1701 | 8.92k | } else { |
1702 | | /* hm.. this shouldn't happen */ |
1703 | 0 | cli_warnmsg("pdf_extract_obj: Failure counting objstms.\n"); |
1704 | 0 | } |
1705 | 8.92k | } |
1706 | 8.92k | } |
1707 | 130k | } |
1708 | | |
1709 | 651k | if (dparams) |
1710 | 75.2k | pdf_free_dict(dparams); |
1711 | | |
1712 | 651k | if (rc == CL_VIRUS) { |
1713 | 0 | sum = 0; /* prevents post-filter scan */ |
1714 | 0 | goto done; |
1715 | 0 | } |
1716 | | |
1717 | 651k | } else if (obj->flags & (1 << OBJ_JAVASCRIPT)) { |
1718 | 15.0k | const char *q2; |
1719 | 15.0k | const char *q = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
1720 | 15.0k | : (const char *)(obj->start + pdf->map); |
1721 | | |
1722 | | /* TODO: get obj-endobj size */ |
1723 | 15.0k | off_t bytesleft = obj->size; |
1724 | | |
1725 | 15.0k | if (bytesleft < 0) { |
1726 | 0 | goto done; |
1727 | 0 | } |
1728 | | |
1729 | 32.8k | do { |
1730 | 32.8k | char *js = NULL; |
1731 | 32.8k | size_t js_len = 0; |
1732 | 32.8k | const char *q3; |
1733 | | |
1734 | 32.8k | q2 = cli_memstr(q, bytesleft, "/JavaScript", 11); |
1735 | 32.8k | if (!q2) |
1736 | 14.2k | break; |
1737 | | |
1738 | 18.6k | bytesleft -= q2 - q + 11; |
1739 | 18.6k | q = q2 + 11; |
1740 | | |
1741 | 18.6k | js = pdf_readstring(q, bytesleft, "/JS", NULL, &q2, !(pdf->flags & (1 << DECRYPTABLE_PDF))); |
1742 | 18.6k | bytesleft -= q2 - q; |
1743 | 18.6k | q = q2; |
1744 | | |
1745 | 18.6k | if (js) { |
1746 | 8.89k | char *decrypted = NULL; |
1747 | 8.89k | const char *out = js; |
1748 | 8.89k | js_len = strlen(js); |
1749 | 8.89k | if (pdf->flags & (1 << DECRYPTABLE_PDF)) { |
1750 | 2.94k | cli_dbgmsg("pdf_extract_obj: encrypted string\n"); |
1751 | 2.94k | decrypted = decrypt_any(pdf, obj->id, js, &js_len, pdf->enc_method_string); |
1752 | | |
1753 | 2.94k | if (decrypted) { |
1754 | 2.06k | noisy_msg(pdf, "pdf_extract_obj: decrypted Javascript string from obj %u %u\n", obj->id >> 8, obj->id & 0xff); |
1755 | 2.06k | out = decrypted; |
1756 | 2.06k | } |
1757 | 2.94k | } |
1758 | | |
1759 | 8.89k | if ((pdf->ctx->options->general & CL_SCAN_GENERAL_COLLECT_METADATA) && pdf->ctx->wrkproperty != NULL) { |
1760 | 8.89k | struct json_object *pdfobj, *jbig2arr; |
1761 | | |
1762 | 8.89k | if (NULL == (pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"))) { |
1763 | 0 | cli_errmsg("pdf_extract_obj: failed to get PDFStats JSON object\n"); |
1764 | 8.89k | } else if (NULL == (jbig2arr = cli_jsonarray(pdfobj, "JavascriptObjects"))) { |
1765 | 0 | cli_errmsg("pdf_extract_obj: failed to get JavascriptObjects JSON object\n"); |
1766 | 8.89k | } else { |
1767 | 8.89k | cli_jsonint_array(jbig2arr, obj->id >> 8); |
1768 | 8.89k | } |
1769 | 8.89k | } |
1770 | | |
1771 | 8.89k | pdf->stats.njs++; |
1772 | | |
1773 | 8.89k | if (filter_writen(pdf, obj, fout, out, js_len, (size_t *)&sum) != js_len) { |
1774 | 0 | rc = CL_EWRITE; |
1775 | 0 | free(js); |
1776 | 0 | break; |
1777 | 0 | } |
1778 | | |
1779 | 8.89k | free(decrypted); |
1780 | 8.89k | free(js); |
1781 | 8.89k | cli_dbgmsg("pdf_extract_obj: bytesleft: %d\n", (int)bytesleft); |
1782 | | |
1783 | 8.89k | if (bytesleft > 0) { |
1784 | 8.89k | q2 = pdf_nextobject(q, bytesleft); |
1785 | 8.89k | if (!q2) |
1786 | 4.06k | q2 = q + bytesleft - 1; |
1787 | | |
1788 | | /* non-conforming PDFs that don't escape ) properly */ |
1789 | 8.89k | q3 = memchr(q, ')', bytesleft); |
1790 | 8.89k | if (q3 && q3 < q2) |
1791 | 440 | q2 = q3; |
1792 | | |
1793 | 11.8k | while (q2 > q && q2[-1] == ' ') |
1794 | 2.98k | q2--; |
1795 | | |
1796 | 8.89k | if (q2 > q) { |
1797 | 6.22k | q--; |
1798 | 6.22k | filter_writen(pdf, obj, fout, q, q2 - q, (size_t *)&sum); |
1799 | 6.22k | q++; |
1800 | 6.22k | } |
1801 | 8.89k | } |
1802 | 8.89k | } |
1803 | | |
1804 | 18.6k | } while (bytesleft > 0); |
1805 | 315k | } else { |
1806 | 315k | off_t bytesleft = obj->size; |
1807 | | |
1808 | 315k | if (bytesleft < 0) |
1809 | 0 | rc = CL_EFORMAT; |
1810 | 315k | else { |
1811 | 315k | if (obj->objstm) { |
1812 | 8.95k | if (filter_writen(pdf, obj, fout, obj->objstm->streambuf + obj->start, bytesleft, (size_t *)&sum) != (size_t)bytesleft) |
1813 | 0 | rc = CL_EWRITE; |
1814 | 306k | } else { |
1815 | 306k | if (filter_writen(pdf, obj, fout, pdf->map + obj->start, bytesleft, (size_t *)&sum) != (size_t)bytesleft) |
1816 | 0 | rc = CL_EWRITE; |
1817 | 306k | } |
1818 | 315k | } |
1819 | 315k | } |
1820 | | |
1821 | 1.00M | done: |
1822 | | |
1823 | 1.00M | cli_dbgmsg("pdf_extract_obj: extracted %td bytes %u %u obj\n", sum, obj->id >> 8, obj->id & 0xff); |
1824 | 1.00M | cli_dbgmsg("pdf_extract_obj: ... to %s\n", fullname); |
1825 | | |
1826 | 1.00M | if (flags & PDF_EXTRACT_OBJ_SCAN && sum) { |
1827 | 962k | int rc2; |
1828 | | |
1829 | | /* TODO: invoke bytecode on this pdf obj with metainformation associated */ |
1830 | 962k | lseek(fout, 0, SEEK_SET); |
1831 | 962k | rc2 = cli_magic_scan_desc(fout, fullname, pdf->ctx, NULL, LAYER_ATTRIBUTES_NONE); |
1832 | 962k | if (rc2 != CL_SUCCESS) { |
1833 | 3.16k | rc = rc2; |
1834 | 3.16k | goto really_done; |
1835 | 3.16k | } |
1836 | | |
1837 | 959k | if ((rc == CL_CLEAN) || (rc == CL_VIRUS)) { |
1838 | 959k | rc2 = run_pdf_hooks(pdf, PDF_PHASE_POSTDUMP, fout); |
1839 | 959k | if (rc2 == CL_VIRUS) { |
1840 | 0 | rc = rc2; |
1841 | 0 | goto really_done; |
1842 | 0 | } |
1843 | 959k | } |
1844 | | |
1845 | 959k | if (((rc == CL_CLEAN) || (rc == CL_VIRUS)) && (obj->flags & (1 << OBJ_CONTENTS))) { |
1846 | 69.5k | lseek(fout, 0, SEEK_SET); |
1847 | 69.5k | cli_dbgmsg("pdf_extract_obj: dumping contents from obj %u %u\n", obj->id >> 8, obj->id & 0xff); |
1848 | | |
1849 | 69.5k | rc2 = pdf_scan_contents(fout, pdf, obj); |
1850 | 69.5k | if (rc2 != CL_SUCCESS) { |
1851 | 5 | rc = rc2; |
1852 | 5 | goto really_done; |
1853 | 5 | } |
1854 | 69.5k | } |
1855 | 959k | } |
1856 | | |
1857 | 1.00M | really_done: |
1858 | 1.00M | close(fout); |
1859 | | |
1860 | 1.00M | if (CL_EMEM != rc) { |
1861 | 1.00M | if (flags & PDF_EXTRACT_OBJ_SCAN && !pdf->ctx->engine->keeptmp) |
1862 | 1.00M | if (cli_unlink(fullname) && rc != CL_VIRUS) |
1863 | 0 | rc = CL_EUNLINK; |
1864 | 1.00M | } |
1865 | | |
1866 | 1.00M | return rc; |
1867 | 1.00M | } |
1868 | | |
/*
 * Name-parsing state for one PDF object. It is passed by pointer to
 * handle_pdfname(), which compares it with pdfname_action.from_state and
 * overwrites it with pdfname_action.to_state when a table entry applies.
 */
1869 | | enum objstate { |
1870 | | STATE_NONE, |
1871 | | STATE_S, |
1872 | | STATE_FILTER, |
1873 | | STATE_JAVASCRIPT, |
1874 | | STATE_OPENACTION, |
1875 | | STATE_LINEARIZED, |
1876 | | STATE_LAUNCHACTION, |
1877 | | STATE_CONTENTS, |
1878 | | STATE_ANY /* wildcard from_state in the actions table below: matches every current state */ |
1879 | | }; |
1880 | | |
/*
 * Bit values for pdfname_action.nameflags.
 * For entries carrying NAMEFLAG_HEURISTIC, handle_pdfname() flags the object
 * with ESCAPED_COMMON_PDFNAME when its `escapes` argument is non-zero.
 */
1881 | | #define NAMEFLAG_NONE 0x0 |
1882 | 2.44M | #define NAMEFLAG_HEURISTIC 0x1 |
1883 | | |
/*
 * One row of the pdfname_actions table consulted by handle_pdfname().
 *
 *   pdfname      - name to match with strcmp(); table entries are written
 *                  without a leading '/'.
 *   set_objflag  - bit index; (1 << set_objflag) is OR'ed into obj->flags
 *                  when the state transition applies.
 *   from_state   - state required for the transition (STATE_ANY matches all).
 *   to_state     - state stored into *state when the transition applies.
 *   nameflags    - NAMEFLAG_* bits.
 *   pdf_stats_cb - optional callback (may be NULL); handle_pdfname() calls it
 *                  for every matched name, before the from_state check.
 */
1884 | | struct pdfname_action { |
1885 | | const char *pdfname; |
1886 | | enum pdf_objflags set_objflag; /* OBJ_DICT is noop */ |
1887 | | enum objstate from_state; /* STATE_NONE is noop */ |
1888 | | enum objstate to_state; |
1889 | | uint32_t nameflags; |
1890 | | void (*pdf_stats_cb)(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act); |
1891 | | }; |
1892 | | |
/*
 * PDF names recognised by handle_pdfname(). The table is scanned from the
 * top with strcmp() and the first exact match is used, so names must be
 * unique. Columns: pdfname, set_objflag, from_state, to_state, nameflags,
 * pdf_stats_cb.
 *
 * NOTE(review): "JBIG2Decode" sets OBJ_FILTER_DCT, the same flag as
 * "DCTDecode"/"DCT" -- confirm this sharing is intentional.
 */
1893 | | static struct pdfname_action pdfname_actions[] = { |
1894 | | {"ASCIIHexDecode", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCIIHexDecode_cb}, |
1895 | | {"ASCII85Decode", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCII85Decode_cb}, |
1896 | | {"A85", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCII85Decode_cb}, |
1897 | | {"AHx", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCIIHexDecode_cb}, |
1898 | | {"EmbeddedFile", OBJ_EMBEDDED_FILE, STATE_NONE, STATE_NONE, NAMEFLAG_HEURISTIC, EmbeddedFile_cb}, |
1899 | | {"FlateDecode", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, FlateDecode_cb}, |
1900 | | {"Fl", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, FlateDecode_cb}, |
1901 | | {"Image", OBJ_IMAGE, STATE_NONE, STATE_NONE, NAMEFLAG_HEURISTIC, Image_cb}, |
1902 | | {"LZWDecode", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, LZWDecode_cb}, |
1903 | | {"LZW", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, LZWDecode_cb}, |
1904 | | {"RunLengthDecode", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, RunLengthDecode_cb}, |
1905 | | {"RL", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, RunLengthDecode_cb}, |
1906 | | {"CCITTFaxDecode", OBJ_FILTER_FAX, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, CCITTFaxDecode_cb}, |
1907 | | {"CCF", OBJ_FILTER_FAX, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, CCITTFaxDecode_cb}, |
1908 | | {"JBIG2Decode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, JBIG2Decode_cb}, |
1909 | | {"DCTDecode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, DCTDecode_cb}, |
1910 | | {"DCT", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, DCTDecode_cb}, |
1911 | | {"JPXDecode", OBJ_FILTER_JPX, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, JPXDecode_cb}, |
1912 | | {"Crypt", OBJ_FILTER_CRYPT, STATE_FILTER, STATE_NONE, NAMEFLAG_HEURISTIC, Crypt_cb}, |
1913 | | {"Standard", OBJ_FILTER_STANDARD, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, Standard_cb}, |
1914 | | {"Sig", OBJ_SIGNED, STATE_ANY, STATE_NONE, NAMEFLAG_HEURISTIC, Sig_cb}, |
1915 | | {"V", OBJ_SIGNED, STATE_ANY, STATE_NONE, NAMEFLAG_HEURISTIC, NULL}, |
1916 | | {"R", OBJ_SIGNED, STATE_ANY, STATE_NONE, NAMEFLAG_HEURISTIC, NULL}, |
1917 | | {"Linearized", OBJ_DICT, STATE_NONE, STATE_LINEARIZED, NAMEFLAG_HEURISTIC, NULL}, |
1918 | | {"Filter", OBJ_HASFILTERS, STATE_ANY, STATE_FILTER, NAMEFLAG_HEURISTIC, NULL}, |
1919 | | {"JavaScript", OBJ_JAVASCRIPT, STATE_ANY, STATE_JAVASCRIPT, NAMEFLAG_HEURISTIC, JavaScript_cb}, |
1920 | | {"Length", OBJ_DICT, STATE_FILTER, STATE_NONE, NAMEFLAG_HEURISTIC, NULL}, |
1921 | | {"S", OBJ_DICT, STATE_NONE, STATE_S, NAMEFLAG_HEURISTIC, NULL}, |
1922 | | {"Type", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_HEURISTIC, NULL}, |
1923 | | {"OpenAction", OBJ_OPENACTION, STATE_ANY, STATE_OPENACTION, NAMEFLAG_HEURISTIC, OpenAction_cb}, |
1924 | | {"Launch", OBJ_LAUNCHACTION, STATE_ANY, STATE_LAUNCHACTION, NAMEFLAG_HEURISTIC, Launch_cb}, |
1925 | | {"Page", OBJ_PAGE, STATE_NONE, STATE_NONE, NAMEFLAG_HEURISTIC, Page_cb}, |
1926 | | {"Contents", OBJ_CONTENTS, STATE_NONE, STATE_CONTENTS, NAMEFLAG_HEURISTIC, NULL}, |
1927 | | {"Author", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Author_cb}, |
1928 | | {"Producer", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Producer_cb}, |
1929 | | {"CreationDate", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, CreationDate_cb}, |
1930 | | {"ModDate", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, ModificationDate_cb}, |
1931 | | {"Creator", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Creator_cb}, |
1932 | | {"Title", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Title_cb}, |
1933 | | {"Keywords", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Keywords_cb}, |
1934 | | {"Subject", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Subject_cb}, |
1935 | | {"Pages", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Pages_cb}, |
1936 | | {"Colors", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Colors_cb}, |
1937 | | {"RichMedia", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, RichMedia_cb}, |
1938 | | {"AcroForm", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, AcroForm_cb}, |
1939 | | {"XFA", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, XFA_cb}}; |
1940 | | |
/*
 * Mask over obj->flags of the filter bits treated as "known".
 * handle_pdfname() uses it (a) to avoid setting OBJ_FILTER_UNKNOWN on an
 * object that already has one of these bits, and (b) to decide which matched
 * filters are appended to obj->filterlist. OBJ_FILTER_STANDARD is not part of
 * the mask.
 */
1941 | 2.32M | #define KNOWN_FILTERS ((1 << OBJ_FILTER_AH) | (1 << OBJ_FILTER_RL) | (1 << OBJ_FILTER_A85) | (1 << OBJ_FILTER_FLATE) | (1 << OBJ_FILTER_LZW) | (1 << OBJ_FILTER_FAX) | (1 << OBJ_FILTER_DCT) | (1 << OBJ_FILTER_JPX) | (1 << OBJ_FILTER_CRYPT)) |
1942 | | |
/*
 * Apply the pdfname_actions entry (if any) for one PDF name.
 *
 *   pdf     - PDF being parsed; forwarded to pdfobj_flag() and the stats
 *             callback.
 *   obj     - object whose flags, statsflags and filter list are updated.
 *   pdfname - NUL-terminated name, matched with strcmp() against the table.
 *   escapes - non-zero when the name was escaped; only consulted for
 *             NAMEFLAG_HEURISTIC entries.
 *   state   - in/out state; replaced by the entry's to_state when the entry's
 *             from_state equals *state or is STATE_ANY.
 */
1943 | | static void handle_pdfname(struct pdf_struct *pdf, struct pdf_obj *obj, const char *pdfname, int escapes, enum objstate *state) |
1944 | 9.07M | { |
1945 | 9.07M | struct pdfname_action *act = NULL; |
1946 | 9.07M | unsigned j; |
1947 | | |
1948 | 9.07M | obj->statsflags |= OBJ_FLAG_PDFNAME_DONE; |
1949 | | |
/* linear table lookup; the first exact match wins */
1950 | 374M | for (j = 0; j < sizeof(pdfname_actions) / sizeof(pdfname_actions[0]); j++) { |
1951 | 368M | if (!strcmp(pdfname, pdfname_actions[j].pdfname)) { |
1952 | 2.44M | act = &pdfname_actions[j]; |
1953 | 2.44M | break; |
1954 | 2.44M | } |
1955 | 368M | } |
1956 | | |
/*
 * Name not in the table: *state is left unchanged. If we are inside a filter
 * list, mark the object as having an unknown filter, unless it is already
 * flagged as signed or already carries a known filter flag.
 */
1957 | 9.07M | if (!act) { |
1958 | | /* these are digital signature objects, filter doesn't matter, |
1959 | | * we don't need them anyway */ |
1960 | 6.63M | if (*state == STATE_FILTER && !(obj->flags & (1 << OBJ_SIGNED)) && !(obj->flags & KNOWN_FILTERS)) { |
1961 | 274k | cli_dbgmsg("handle_pdfname: unknown filter %s\n", pdfname); |
1962 | 274k | obj->flags |= 1 << OBJ_FILTER_UNKNOWN; |
1963 | 274k | } |
1964 | | |
1965 | 6.63M | return; |
1966 | 6.63M | } |
1967 | | |
1968 | | /* record filter order (only known filters seen in STATE_FILTER, up to PDF_FILTERLIST_MAX entries) */ |
1969 | 2.44M | if (obj->numfilters < PDF_FILTERLIST_MAX && (*state == STATE_FILTER) && ((1 << act->set_objflag) & KNOWN_FILTERS)) |
1970 | 298k | obj->filterlist[obj->numfilters++] = act->set_objflag; |
1971 | | |
1972 | 2.44M | if ((act->nameflags & NAMEFLAG_HEURISTIC) && escapes) { |
1973 | | /* if a commonly used PDF name is escaped that is certainly |
1974 | | suspicious. */ |
1975 | 1.16k | cli_dbgmsg("handle_pdfname: pdfname %s is escaped\n", pdfname); |
1976 | 1.16k | pdfobj_flag(pdf, obj, ESCAPED_COMMON_PDFNAME); |
1977 | 1.16k | } |
1978 | | |
/* the stats callback runs for every matched name, whether or not the state transition below applies */
1979 | 2.44M | if ((act->pdf_stats_cb)) |
1980 | 1.12M | act->pdf_stats_cb(pdf, obj, act); |
1981 | | |
1982 | 2.44M | if (act->from_state == *state || act->from_state == STATE_ANY) { |
1983 | 2.15M | *state = act->to_state; |
1984 | | |
/* still in a filter list and this flag is already set: the same filter flag was named twice */
1985 | 2.15M | if (*state == STATE_FILTER && act->set_objflag != OBJ_DICT && (obj->flags & (1 << act->set_objflag))) { |
1986 | 70.6k | cli_dbgmsg("handle_pdfname: duplicate stream filter %s\n", pdfname); |
1987 | 70.6k | pdfobj_flag(pdf, obj, BAD_STREAM_FILTERS); |
1988 | 70.6k | } |
1989 | | |
1990 | 2.15M | obj->flags |= 1 << act->set_objflag; |
1991 | 2.15M | } else { |
1992 | | /* auto-reset states: the name is known but not valid in the current state, so no object flag is set */ |
1993 | 282k | switch (*state) { |
1994 | 8.02k | case STATE_S: |
1995 | 8.02k | *state = STATE_NONE; |
1996 | 8.02k | break; |
1997 | 274k | default: |
1998 | 274k | break; |
1999 | 282k | } |
2000 | 282k | } |
2001 | 2.44M | } |
2002 | | |
/*
 * Read the indirect reference that follows "/Encrypt" and remember it as the
 * id of the encryption dictionary object.
 *
 *   pdf - receives the result in pdf->enc_objid.
 *   enc - points at a "/Encrypt" match.
 *   len - number of bytes available starting at enc.
 *
 * If enc actually starts with "/EncryptMetadata", the next "/Encrypt" after
 * it is used instead. The following text is expected to be
 * "<objid> <genid> R"; on success pdf->enc_objid is set to
 * (objid << 8) | (genid & 0xff). On any parse failure the function returns
 * without modifying pdf->enc_objid.
 */
2003 | | static void pdf_parse_encrypt(struct pdf_struct *pdf, const char *enc, int len) |
2004 | 54.5k | { |
2005 | 54.5k | const char *q, *q2; |
2006 | 54.5k | unsigned long objid; |
2007 | 54.5k | unsigned long genid; |
2008 | 54.5k | long temp_long; |
2009 | | |
2010 | 54.5k | if (len >= 16 && !strncmp(enc, "/EncryptMetadata", 16)) { |
2011 | 3.43k | q = cli_memstr(enc + 16, len - 16, "/Encrypt", 8); |
2012 | 3.43k | if (!q) |
2013 | 925 | return; |
2014 | | |
2015 | 2.51k | len -= q - enc; |
2016 | 2.51k | enc = q; |
2017 | 2.51k | } |
2018 | | |
/* skip the 8 bytes of "/Encrypt" itself */
2019 | 53.6k | q = enc + 8; |
2020 | 53.6k | len -= 8; |
2021 | 53.6k | q2 = pdf_nextobject(q, len); |
/*
 * <ctype.h> functions require a value representable as unsigned char (or
 * EOF); the byte comes from the scanned file, so cast it to keep bytes
 * >= 0x80 from being passed as negative values.
 */
2022 | 53.6k | if (!q2 || !isdigit((unsigned char)*q2)) |
2023 | 6.13k | return; |
2024 | 47.5k | len -= q2 - q; |
2025 | 47.5k | q = q2; |
2026 | | |
2027 | 47.5k | if (CL_SUCCESS != cli_strntol_wrap(q2, (size_t)len, 0, 10, &temp_long)) { |
2028 | 206 | cli_dbgmsg("pdf_parse_encrypt: Found Encrypt dictionary but failed to parse objid\n"); |
2029 | 206 | return; |
2030 | 47.3k | } else if (temp_long < 0) { |
2031 | 0 | cli_dbgmsg("pdf_parse_encrypt: Encountered invalid negative objid (%ld).\n", temp_long); |
2032 | 0 | return; |
2033 | 0 | } |
2034 | 47.3k | objid = (unsigned long)temp_long; |
2035 | | |
/* object number goes in the upper bits; the low 8 bits are filled with the generation number below */
2036 | 47.3k | objid = objid << 8; |
2037 | 47.3k | q2 = pdf_nextobject(q, len); |
/* same unsigned char cast as for the objid check above */
2038 | 47.3k | if (!q2 || !isdigit((unsigned char)*q2)) |
2039 | 3.54k | return; |
2040 | 43.7k | len -= q2 - q; |
2041 | 43.7k | q = q2; |
2042 | | |
2043 | 43.7k | if (CL_SUCCESS != cli_strntol_wrap(q2, (size_t)len, 0, 10, &temp_long)) { |
2044 | 333 | cli_dbgmsg("pdf_parse_encrypt: Found Encrypt dictionary but failed to parse genid\n"); |
2045 | 333 | return; |
2046 | 43.4k | } else if (temp_long < 0) { |
2047 | 0 | cli_dbgmsg("pdf_parse_encrypt: Encountered invalid negative genid (%ld).\n", temp_long); |
2048 | 0 | return; |
2049 | 0 | } |
2050 | 43.4k | genid = (unsigned long)temp_long; |
2051 | | |
2052 | 43.4k | objid |= genid & 0xff; |
/* the reference must end with the 'R' keyword, otherwise it is not accepted */
2053 | 43.4k | q2 = pdf_nextobject(q, len); |
2054 | 43.4k | if (!q2 || *q2 != 'R') |
2055 | 3.79k | return; |
2056 | | |
2057 | 39.6k | cli_dbgmsg("pdf_parse_encrypt: Encrypt dictionary in obj %lu %lu\n", objid >> 8, objid & 0xff); |
2058 | | |
2059 | 39.6k | pdf->enc_objid = objid; |
2060 | 39.6k | } |
2061 | | |
/*
 * Look for encryption information in a block of trailer text.
 *
 *   pdf    - updated in place (flags, enc_objid via pdf_parse_encrypt(),
 *            fileID, fileIDlen).
 *   s      - start of the text to search.
 *   length - number of bytes available at s.
 *
 * Nothing happens unless "/Encrypt" occurs in the text. When it does, the
 * ENCRYPTED_PDF flag is set, the reference after "/Encrypt" is parsed, and
 * the "/ID" string (if pdf_readstring() returns one) replaces any previously
 * stored pdf->fileID.
 */
2062 | | static void pdf_parse_trailer(struct pdf_struct *pdf, const char *s, long length) |
2063 | 86.1k | { |
2064 | 86.1k | const char *enc; |
2065 | | |
2066 | 86.1k | enc = cli_memstr(s, length, "/Encrypt", 8); |
2067 | 86.1k | if (enc) { |
2068 | 54.5k | char *newID; |
2069 | 54.5k | unsigned int newIDlen = 0; |
2070 | | |
2071 | 54.5k | pdf->flags |= 1 << ENCRYPTED_PDF; |
/* third argument: bytes remaining from the "/Encrypt" match to the end of the text */
2072 | 54.5k | pdf_parse_encrypt(pdf, enc, s + length - enc); |
2073 | 54.5k | newID = pdf_readstring(s, length, "/ID", &newIDlen, NULL, false); |
2074 | | |
/* keep the old fileID when no new one was read; otherwise free it and take ownership of newID */
2075 | 54.5k | if (newID) { |
2076 | 40.5k | free(pdf->fileID); |
2077 | 40.5k | pdf->fileID = newID; |
2078 | 40.5k | pdf->fileIDlen = newIDlen; |
2079 | 40.5k | } |
2080 | 54.5k | } |
2081 | 86.1k | } |
2082 | | |
2083 | | void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj) |
2084 | 1.59M | { |
2085 | | /* enough to hold common pdf names, we don't need all the names */ |
2086 | 1.59M | char pdfname[64]; |
2087 | 1.59M | const char *q2, *q3; |
2088 | 1.59M | const char *nextobj = NULL, *nextopen = NULL, *nextclose = NULL; |
2089 | 1.59M | const char *q = NULL; |
2090 | 1.59M | const char *dict = NULL, *enddict = NULL, *start = NULL; |
2091 | 1.59M | off_t dict_length = 0, full_dict_length = 0, bytesleft = 0; |
2092 | 1.59M | size_t i = 0; |
2093 | 1.59M | unsigned filters = 0, blockopens = 0; |
2094 | 1.59M | enum objstate objstate = STATE_NONE; |
2095 | | |
2096 | 1.59M | json_object *pdfobj = NULL, *jsonobj = NULL; |
2097 | | |
2098 | 1.59M | if (NULL == pdf || NULL == obj) { |
2099 | 0 | cli_warnmsg("pdf_parseobj: invalid arguments\n"); |
2100 | 0 | return; |
2101 | 0 | } |
2102 | | |
2103 | 1.59M | cli_dbgmsg("pdf_parseobj: Parsing object %u %u\n", obj->id >> 8, obj->id & 0xff); |
2104 | | |
2105 | 1.59M | if (obj->objstm) { |
2106 | 239k | if ((size_t)obj->start > obj->objstm->streambuf_len) { |
2107 | 0 | cli_dbgmsg("pdf_parseobj: %u %u obj: obj start (%u) is greater than size of object stream (%zu).\n", |
2108 | 0 | obj->id >> 8, obj->id & 0xff, obj->start, obj->objstm->streambuf_len); |
2109 | 0 | return; |
2110 | 0 | } |
2111 | 239k | q = (const char *)(obj->start + obj->objstm->streambuf); |
2112 | 1.35M | } else { |
2113 | 1.35M | if ((size_t)obj->start > pdf->size) { |
2114 | 0 | cli_dbgmsg("pdf_parseobj: %u %u obj: obj start (%u) is greater than size of PDF (%lld).\n", |
2115 | 0 | obj->id >> 8, obj->id & 0xff, obj->start, (long long)pdf->size); |
2116 | 0 | return; |
2117 | 0 | } |
2118 | 1.35M | q = (const char *)(obj->start + pdf->map); |
2119 | 1.35M | } |
2120 | 1.59M | start = q; |
2121 | | |
2122 | 1.59M | if (obj->size <= 0) |
2123 | 2.04k | return; |
2124 | | |
2125 | 1.58M | if (obj->objstm) { |
2126 | 239k | bytesleft = MIN(obj->size, obj->objstm->streambuf_len - obj->start); |
2127 | 1.35M | } else { |
2128 | 1.35M | bytesleft = MIN(obj->size, pdf->size - obj->start); |
2129 | 1.35M | } |
2130 | | |
2131 | | /* For objects that aren't already in an object stream^, check if they contain a stream. |
2132 | | * ^Objects in object streams aren't supposed to contain streams, so we don't check them. */ |
2133 | 1.58M | if (NULL == obj->objstm) { |
2134 | | /* Check if object contains stream */ |
2135 | 1.35M | cl_error_t has_stream; |
2136 | 1.35M | const char *stream = NULL; |
2137 | 1.35M | size_t stream_size = 0; |
2138 | | |
2139 | 1.35M | has_stream = find_stream_bounds( |
2140 | 1.35M | start, |
2141 | 1.35M | obj->size, |
2142 | 1.35M | &stream, |
2143 | 1.35M | &stream_size, |
2144 | 1.35M | (pdf->enc_method_stream <= ENC_IDENTITY) && (pdf->enc_method_embeddedfile <= ENC_IDENTITY)); |
2145 | | |
2146 | 1.35M | if ((CL_SUCCESS == has_stream) || |
2147 | 1.35M | (CL_EFORMAT == has_stream)) { |
2148 | | /* Stream found. Store this fact and the stream bounds. */ |
2149 | 712k | cli_dbgmsg("pdf_parseobj: %u %u contains stream, size: %zu\n", obj->id >> 8, obj->id & 0xff, stream_size); |
2150 | 712k | obj->flags |= (1 << OBJ_STREAM); |
2151 | 712k | obj->stream = stream; |
2152 | 712k | obj->stream_size = stream_size; |
2153 | 712k | } |
2154 | 1.35M | } |
2155 | | |
2156 | | /* find start of dictionary */ |
2157 | 12.5M | do { |
2158 | 12.5M | nextobj = pdf_nextobject(q, bytesleft); |
2159 | 12.5M | bytesleft -= nextobj - q; |
2160 | | |
2161 | 12.5M | if (!nextobj || bytesleft < 0) { |
2162 | 308k | cli_dbgmsg("pdf_parseobj: %u %u obj: no dictionary\n", obj->id >> 8, obj->id & 0xff); |
2163 | | |
2164 | 308k | if (!(pdfobj) && pdf->ctx->wrkproperty != NULL) { |
2165 | 308k | pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"); |
2166 | 308k | if (!(pdfobj)) |
2167 | 0 | return; |
2168 | 308k | } |
2169 | | |
2170 | 308k | if (pdfobj) { |
2171 | 308k | if (!(jsonobj)) |
2172 | 308k | jsonobj = cli_jsonarray(pdfobj, "ObjectsWithoutDictionaries"); |
2173 | 308k | if (jsonobj) |
2174 | 308k | cli_jsonint_array(jsonobj, obj->id >> 8); |
2175 | 308k | } |
2176 | | |
2177 | 308k | return; |
2178 | 308k | } |
2179 | | |
2180 | | /* |
2181 | | * Opening `<` for object's dictionary may be back 1 character, |
2182 | | * provided q is not at the start of the buffer (it shouldn't be). |
2183 | | */ |
2184 | 12.2M | if (obj->objstm) { |
2185 | 2.73M | if (obj->objstm->streambuf == q) { |
2186 | 0 | q3 = memchr(q, '<', nextobj - q); |
2187 | 2.73M | } else { |
2188 | 2.73M | q3 = memchr(q - 1, '<', nextobj - q + 1); |
2189 | 2.73M | } |
2190 | 9.48M | } else { |
2191 | 9.48M | if (pdf->map == q) { |
2192 | 0 | q3 = memchr(q, '<', nextobj - q); |
2193 | 9.48M | } else { |
2194 | 9.48M | q3 = memchr(q - 1, '<', nextobj - q + 1); |
2195 | 9.48M | } |
2196 | 9.48M | } |
2197 | 12.2M | nextobj++; |
2198 | 12.2M | bytesleft--; |
2199 | 12.2M | q = nextobj; |
2200 | 12.2M | } while (!q3 || q3[1] != '<'); |
2201 | 1.28M | dict = q3 + 2; |
2202 | 1.28M | q = dict; |
2203 | 1.28M | blockopens++; |
2204 | 1.28M | bytesleft = obj->size - (q - start); |
2205 | 1.28M | enddict = q + bytesleft - 1; |
2206 | | |
2207 | | /* find end of dictionary block */ |
2208 | 1.28M | if (bytesleft < 0) { |
2209 | 0 | cli_dbgmsg("pdf_parseobj: %u %u obj: broken dictionary\n", obj->id >> 8, obj->id & 0xff); |
2210 | |
|
2211 | 0 | if (!(pdfobj) && pdf->ctx->wrkproperty != NULL) { |
2212 | 0 | pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"); |
2213 | 0 | if (!(pdfobj)) |
2214 | 0 | return; |
2215 | 0 | } |
2216 | | |
2217 | 0 | if (pdfobj) { |
2218 | 0 | if (!(jsonobj)) |
2219 | 0 | jsonobj = cli_jsonarray(pdfobj, "ObjectsWithBrokenDictionaries"); |
2220 | 0 | if (jsonobj) |
2221 | 0 | cli_jsonint_array(jsonobj, obj->id >> 8); |
2222 | 0 | } |
2223 | |
|
2224 | 0 | return; |
2225 | 0 | } |
2226 | | |
2227 | | /* while still looking ... */ |
2228 | 4.49M | while ((q < enddict - 1) && (blockopens > 0)) { |
2229 | | /* find next close */ |
2230 | 3.42M | nextclose = memchr(q, '>', enddict - q); |
2231 | 3.42M | if (nextclose && (nextclose[1] == '>')) { |
2232 | | /* check for nested open */ |
2233 | 4.11M | while ((nextopen = memchr(q - 1, '<', nextclose - q + 1)) != NULL) { |
2234 | 2.65M | if (nextopen[1] == '<') { |
2235 | | /* nested open */ |
2236 | 617k | blockopens++; |
2237 | 617k | q = nextopen + 2; |
2238 | 2.03M | } else { |
2239 | | /* unmatched < before next close */ |
2240 | 2.03M | q = nextopen + 2; |
2241 | 2.03M | } |
2242 | 2.65M | } |
2243 | | /* close block */ |
2244 | 1.46M | blockopens--; |
2245 | 1.46M | q = nextclose + 2; |
2246 | 1.95M | } else if (nextclose) { |
2247 | | /* found one > but not two */ |
2248 | 1.74M | q = nextclose + 2; |
2249 | 1.74M | } else { |
2250 | | /* next closing not found */ |
2251 | 211k | break; |
2252 | 211k | } |
2253 | 3.42M | } |
2254 | | |
2255 | | /* Was end of dictionary found? */ |
2256 | 1.28M | if (blockopens) { |
2257 | | /* probably truncated */ |
2258 | 273k | cli_dbgmsg("pdf_parseobj: %u %u obj broken dictionary\n", obj->id >> 8, obj->id & 0xff); |
2259 | | |
2260 | 273k | if (!(pdfobj) && pdf->ctx->wrkproperty != NULL) { |
2261 | 273k | pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"); |
2262 | 273k | if (!(pdfobj)) |
2263 | 0 | return; |
2264 | 273k | } |
2265 | | |
2266 | 273k | if (pdfobj) { |
2267 | 273k | if (!(jsonobj)) |
2268 | 273k | jsonobj = cli_jsonarray(pdfobj, "ObjectsWithBrokenDictionaries"); |
2269 | 273k | if (jsonobj) |
2270 | 273k | cli_jsonint_array(jsonobj, obj->id >> 8); |
2271 | 273k | } |
2272 | | |
2273 | 273k | return; |
2274 | 273k | } |
2275 | | |
2276 | 1.00M | enddict = nextclose; |
2277 | 1.00M | obj->flags |= 1 << OBJ_DICT; |
2278 | 1.00M | full_dict_length = dict_length = enddict - dict; |
2279 | | |
2280 | | /* This code prints the dictionary content. |
2281 | | { |
2282 | | char * dictionary = malloc(dict_length + 1); |
2283 | | if (dictionary) { |
2284 | | for (i = 0; i < dict_length; i++) { |
2285 | | if (dict[i] == '\r') |
2286 | | dictionary[i] = '\n'; |
2287 | | else if (isprint(dict[i]) || isspace(dict[i])) |
2288 | | dictionary[i] = dict[i]; |
2289 | | else |
2290 | | dictionary[i] = '*'; |
2291 | | } |
2292 | | dictionary[dict_length] = '\0'; |
2293 | | cli_dbgmsg("pdf_parseobj: dictionary is <<%s>>\n", dictionary); |
2294 | | free(dictionary); |
2295 | | } |
2296 | | } |
2297 | | */ |
2298 | | |
2299 | | /* process pdf names */ |
2300 | 10.0M | for (q = dict; dict_length > 0;) { |
2301 | 9.91M | int escapes = 0, breakout = 0; |
2302 | 9.91M | q2 = memchr(q, '/', dict_length); |
2303 | 9.91M | if (!q2) |
2304 | 840k | break; |
2305 | | |
2306 | 9.07M | dict_length -= q2 - q; |
2307 | 9.07M | q = q2; |
2308 | | /* normalize PDF names */ |
2309 | 94.8M | for (i = 0; dict_length > 0 && (i < sizeof(pdfname) - 1); i++) { |
2310 | 94.6M | q++; |
2311 | 94.6M | dict_length--; |
2312 | | |
2313 | 94.6M | if (*q == '#') { |
2314 | 280k | if (cli_hex2str_to(q + 1, pdfname + i, 2) == -1) |
2315 | 227k | break; |
2316 | | |
2317 | 52.2k | q += 2; |
2318 | 52.2k | dict_length -= 2; |
2319 | 52.2k | escapes = 1; |
2320 | 52.2k | continue; |
2321 | 280k | } |
2322 | | |
2323 | 94.3M | switch (*q) { |
2324 | 2.81M | case ' ': |
2325 | 2.90M | case '\t': |
2326 | 3.14M | case '\r': |
2327 | 3.52M | case '\n': |
2328 | 6.63M | case '/': |
2329 | 7.39M | case '>': |
2330 | 7.71M | case '[': |
2331 | 7.88M | case ']': |
2332 | 8.38M | case '<': |
2333 | 8.66M | case '(': |
2334 | 8.66M | breakout = 1; |
2335 | 94.3M | } |
2336 | | |
2337 | 94.3M | if (breakout) |
2338 | 8.66M | break; |
2339 | | |
2340 | 85.7M | pdfname[i] = *q; |
2341 | 85.7M | } |
2342 | | |
2343 | 9.07M | pdfname[i] = '\0'; |
2344 | | |
2345 | 9.07M | handle_pdfname(pdf, obj, pdfname, escapes, &objstate); |
2346 | 9.07M | if (objstate == STATE_LINEARIZED) { |
2347 | 55.6k | long trailer_end, trailer; |
2348 | | |
2349 | 55.6k | pdfobj_flag(pdf, obj, LINEARIZED_PDF); |
2350 | 55.6k | objstate = STATE_NONE; |
2351 | 55.6k | trailer_end = pdf_readint(dict, full_dict_length, "/H"); |
2352 | 55.6k | if ((trailer_end > 0) && ((size_t)trailer_end < pdf->size)) { |
2353 | 26.6k | trailer = trailer_end - 1024; |
2354 | 26.6k | if (trailer < 0) |
2355 | 25.7k | trailer = 0; |
2356 | | |
2357 | 26.6k | q2 = pdf->map + trailer; |
2358 | 26.6k | cli_dbgmsg("pdf_parseobj: looking for trailer in linearized pdf: %ld - %ld\n", trailer, trailer_end); |
2359 | 26.6k | pdf_parse_trailer(pdf, q2, trailer_end - trailer); |
2360 | 26.6k | if (pdf->fileID) |
2361 | 13.5k | cli_dbgmsg("pdf_parseobj: found fileID\n"); |
2362 | 26.6k | } |
2363 | 55.6k | } |
2364 | | |
2365 | 9.07M | if (objstate == STATE_LAUNCHACTION) |
2366 | 97.8k | pdfobj_flag(pdf, obj, HAS_LAUNCHACTION); |
2367 | 9.07M | if (dict_length > 0 && (objstate == STATE_JAVASCRIPT || objstate == STATE_OPENACTION || objstate == STATE_CONTENTS)) { |
2368 | 425k | off_t dict_remaining = dict_length; |
2369 | | |
2370 | 425k | if (objstate == STATE_OPENACTION) |
2371 | 304k | pdfobj_flag(pdf, obj, HAS_OPENACTION); |
2372 | | |
2373 | 425k | q2 = pdf_nextobject(q, dict_remaining); |
2374 | 425k | if (q2 && isdigit(*q2)) { |
2375 | 360k | const char *q2_old = NULL; |
2376 | 360k | unsigned long objid; |
2377 | 360k | unsigned long genid; |
2378 | 360k | long temp_long; |
2379 | | |
2380 | 360k | dict_remaining -= (off_t)(q2 - q); |
2381 | | |
2382 | 360k | if (CL_SUCCESS != cli_strntol_wrap(q2, (size_t)dict_remaining, 0, 10, &temp_long)) { |
2383 | 3.04k | cli_dbgmsg("pdf_parseobj: failed to parse object objid\n"); |
2384 | 3.04k | return; |
2385 | 357k | } else if (temp_long < 0) { |
2386 | 0 | cli_dbgmsg("pdf_parseobj: Encountered invalid negative genid (%ld).\n", temp_long); |
2387 | 0 | return; |
2388 | 0 | } |
2389 | 357k | objid = (unsigned long)temp_long; |
2390 | | |
2391 | 357k | objid = objid << 8; |
2392 | | |
2393 | 799k | while ((dict_remaining > 0) && isdigit(*q2)) { |
2394 | 442k | q2++; |
2395 | 442k | dict_remaining--; |
2396 | 442k | } |
2397 | | |
2398 | 357k | q2_old = q2; |
2399 | 357k | q2 = pdf_nextobject(q2, dict_remaining); |
2400 | 357k | if (q2 && isdigit(*q2)) { |
2401 | 324k | dict_remaining -= (off_t)(q2 - q2_old); |
2402 | 324k | if (CL_SUCCESS != cli_strntol_wrap(q2, (size_t)dict_remaining, 0, 10, &temp_long)) { |
2403 | 1.06k | cli_dbgmsg("pdf_parseobj: failed to parse object genid\n"); |
2404 | 1.06k | return; |
2405 | 323k | } else if (temp_long < 0) { |
2406 | 0 | cli_dbgmsg("pdf_parseobj: Encountered invalid negative genid (%ld).\n", temp_long); |
2407 | 0 | return; |
2408 | 0 | } |
2409 | 323k | genid = (unsigned long)temp_long; |
2410 | | |
2411 | 323k | objid |= genid & 0xff; |
2412 | | |
2413 | 323k | q2 = pdf_nextobject(q2, dict_remaining); |
2414 | 323k | if (q2 && *q2 == 'R') { |
2415 | 285k | struct pdf_obj *obj2; |
2416 | | |
2417 | 285k | cli_dbgmsg("pdf_parseobj: found %s stored in indirect object %lu %lu\n", pdfname, objid >> 8, objid & 0xff); |
2418 | 285k | obj2 = find_obj(pdf, obj, objid); |
2419 | 285k | if (obj2) { |
2420 | 10.6k | enum pdf_objflags flag = OBJ_STREAM; |
2421 | | |
2422 | 10.6k | switch (objstate) { |
2423 | 985 | case STATE_JAVASCRIPT: |
2424 | 985 | flag = OBJ_JAVASCRIPT; |
2425 | 985 | break; |
2426 | 1.41k | case STATE_OPENACTION: |
2427 | 1.41k | flag = OBJ_OPENACTION; |
2428 | 1.41k | break; |
2429 | 8.23k | case STATE_CONTENTS: |
2430 | 8.23k | flag = OBJ_CONTENTS; |
2431 | 8.23k | break; |
2432 | 0 | default: |
2433 | 0 | cli_dbgmsg("pdf_parseobj: Unexpected object type\n"); |
2434 | 0 | return; |
2435 | 10.6k | } |
2436 | | |
2437 | 10.6k | obj->flags &= ~(1 << flag); /* Disable flag for current object ... */ |
2438 | 10.6k | obj2->flags |= 1 << flag; /* ... and set the flag for the indirect object instead! */ |
2439 | 275k | } else { |
2440 | 275k | pdfobj_flag(pdf, obj, BAD_INDOBJ); |
2441 | 275k | } |
2442 | 285k | } |
2443 | 323k | } |
2444 | 357k | } |
2445 | | |
2446 | 421k | objstate = STATE_NONE; |
2447 | 421k | } |
2448 | 9.07M | } |
2449 | | |
2450 | 47.1M | for (i = 0; i < sizeof(pdfname_actions) / sizeof(pdfname_actions[0]); i++) { |
2451 | 46.1M | const struct pdfname_action *act = &pdfname_actions[i]; |
2452 | | |
2453 | 46.1M | if ((obj->flags & (1 << act->set_objflag)) && |
2454 | 46.1M | act->from_state == STATE_FILTER && |
2455 | 46.1M | act->to_state == STATE_FILTER && |
2456 | 46.1M | act->set_objflag != OBJ_FILTER_CRYPT && |
2457 | 46.1M | act->set_objflag != OBJ_FILTER_STANDARD) { |
2458 | 534k | filters++; |
2459 | 534k | } |
2460 | 46.1M | } |
2461 | | |
2462 | 1.00M | if (filters > 2) { |
2463 | | /* more than 2 non-crypt filters */ |
2464 | 24.9k | pdfobj_flag(pdf, obj, MANY_FILTERS); |
2465 | 24.9k | } |
2466 | | |
2467 | 1.00M | if (obj->flags & ((1 << OBJ_SIGNED) | KNOWN_FILTERS)) |
2468 | 305k | obj->flags &= ~(1 << OBJ_FILTER_UNKNOWN); |
2469 | | |
2470 | 1.00M | if (obj->flags & (1 << OBJ_FILTER_UNKNOWN)) |
2471 | 21.4k | pdfobj_flag(pdf, obj, UNKNOWN_FILTER); |
2472 | | |
2473 | 1.00M | cli_dbgmsg("pdf_parseobj: %u %u obj flags: %02x\n", obj->id >> 8, obj->id & 0xff, obj->flags); |
2474 | 1.00M | } |
2475 | | |
/**
 * @brief Locate a key inside a dictionary buffer and return the start of its value.
 *
 * The key is located with a plain byte search (cli_memstr), then
 * pdf_nextobject() is used to step to the object that follows it.
 *
 * @param q0            Pointer to the start of the dictionary bytes.
 * @param[in,out] len   In:  number of bytes available at q0.
 *                      Out: number of bytes from the returned value pointer
 *                           to the end of the buffer. Note that it is also
 *                           reduced (to the bytes remaining from the key)
 *                           when the key is found but no value follows it.
 * @param key           NUL-terminated key to search for.
 * @return const char*  Pointer to the key's value, or NULL if the length is
 *                      not positive, q0 is NULL, the key is absent, or no
 *                      value object follows the key.
 */
static const char *pdf_getdict(const char *q0, int *len, const char *key)
{
    const char *key_pos;
    const char *value;

    if (*len <= 0) {
        cli_dbgmsg("pdf_getdict: bad length %d\n", *len);
        return NULL;
    }

    if (NULL == q0) {
        return NULL;
    }

    /* Search for the key text within the dictionary bytes. */
    key_pos = cli_memstr(q0, *len, key, strlen(key));
    if (NULL == key_pos) {
        cli_dbgmsg("pdf_getdict: %s not found in dict\n", key);
        return NULL;
    }

    /* From here on, lengths are measured from the key. */
    *len -= key_pos - q0;

    /* Step over the key to the object that follows it. */
    value = pdf_nextobject(key_pos + 1, *len - 1);
    if (NULL == value) {
        cli_dbgmsg("pdf_getdict: %s is invalid in dict\n", key);
        return NULL;
    }

    /* Back up over any '<' or newline bytes directly in front of the value so
     * that a dictionary value keeps its opening brackets; never back up past
     * the key itself. */
    for (; value > key_pos; value--) {
        if (value[-1] != '<' && value[-1] != '\n') {
            break;
        }
    }

    *len -= value - key_pos;
    return value;
}
2522 | | |
/**
 * @brief Read the value string from a PDF dictionary key/value pair.
 *
 * Two value syntaxes are handled:
 *  - literal strings, `( ... )`, with nested parentheses and (unless
 *    noescape is set) backslash escapes decoded;
 *  - hex strings, `< ... >`, decoded with cli_hex2str_to().
 *
 * @param q0            A pointer into the PDF dictionary.
 * @param len           Number of bytes available starting at q0.
 * @param key           The key we're looking for.
 * @param [out] slen    Optional. Length of the returned string, not counting
 *                      the terminating NUL. Set to 0 on failure.
 * @param [out] qend    Optional. Initialised to q0. On success it is set to
 *                      the byte after the closing ')' for literal strings, or
 *                      to the closing '>' itself for hex strings.
 * @param noescape      'true' to copy a literal string verbatim, 'false' to
 *                      decode backslash escapes.
 * @return char*        Newly allocated, NUL-terminated buffer (this function
 *                      releases it with free() on its own error path), or
 *                      NULL if the key is missing, the value is not a string,
 *                      the value is truncated, or allocation fails.
 */
static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *slen, const char **qend, bool noescape)
{
    char *s, *s0;
    const char *start, *q, *end;

    if (slen)
        *slen = 0;

    if (qend)
        *qend = q0;

    q = pdf_getdict(q0, &len, key);
    if (!q || len <= 0)
        return NULL;

    if (*q == '(') {
        int paren = 1;

        start = ++q;
        len--;

        /* Find the matching ')'. A backslash hides the following byte from
         * the parenthesis count. */
        for (; paren > 0 && len > 0; q++, len--) {
            switch (*q) {
                case '(':
                    paren++;
                    break;
                case ')':
                    paren--;
                    break;
                case '\\':
                    q++;
                    len--;
                    break;
                default:
                    break;
            }
        }

        /* Running out of bytes (even exactly at the closing parenthesis) is
         * treated as a truncated dictionary. */
        if (len <= 0) {
            cli_errmsg("pdf_readstring: Invalid, truncated dictionary.\n");
            return NULL;
        }

        if (qend)
            *qend = q;

        /* q is one past the closing ')'; step back so that [start, q) is the
         * string content. */
        q--;
        len = q - start;
        s0 = s = cli_max_malloc(len + 1);
        if (!s) {
            cli_errmsg("pdf_readstring: Unable to allocate buffer\n");
            return NULL;
        }

        end = start + len;
        if (noescape) {
            memcpy(s0, start, len);
            s = s0 + len;
        } else {
            /* Each branch below writes at most as many bytes as it consumes,
             * so the decoded form always fits in len bytes. */
            for (q = start; q < end; q++) {
                if (*q != '\\') {
                    *s++ = *q;
                } else {
                    /* If the backslash was the last content byte, q now
                     * points at the closing ')', which is still inside the
                     * scanned range. */
                    q++;
                    switch (*q) {
                        case 'n':
                            *s++ = '\n';
                            break;
                        case 'r':
                            *s++ = '\r';
                            break;
                        case 't':
                            *s++ = '\t';
                            break;
                        case 'b':
                            *s++ = '\b';
                            break;
                        case 'f':
                            *s++ = '\f';
                            break;
                        case '(': /* fall-through */
                        case ')': /* fall-through */
                        case '\\':
                            *s++ = *q;
                            break;
                        case '\n':
                            /* escaped line break: emit nothing */
                            break;
                        case '\r':
                            /* escaped line break: emit nothing, and swallow
                             * the '\n' of a CRLF pair */
                            if (q + 1 < end && q[1] == '\n')
                                q++;
                            break;
                        case '0':
                        case '1':
                        case '2':
                        case '3':
                        case '4':
                        case '5':
                        case '6':
                        case '7':
                        case '8':
                        case '9':
                            /* three-character numeric escape; dropped when
                             * fewer than three characters remain */
                            if (q + 2 < end) {
                                *s++ = 64 * (q[0] - '0') + 8 * (q[1] - '0') + (q[2] - '0');
                                q += 2;
                            }
                            break;
                        default:
                            /* unknown escape: keep the backslash and let the
                             * next iteration copy the character itself */
                            *s++ = '\\';
                            q--;
                            break;
                    }
                }
            }
        }

        *s++ = '\0';
        if (slen)
            *slen = s - s0 - 1;

        return s0;
    }

    if ((*q == '<') && (len >= 3)) {
        start = ++q;
        len -= 1;
        // skip newlines after <
        while (len > 0 && *start == '\n') {
            start = ++q;
            len -= 1;
        }

        /* The newline skip above can use up every remaining byte. Without
         * this check `len - 1` would become a huge size_t and memchr() would
         * search past the end of the dictionary. (len == 1 already yielded
         * NULL from a zero-length search.) */
        if (len <= 1)
            return NULL;

        q = memchr(q + 1, '>', len - 1);
        if (!q)
            return NULL;

        if (qend)
            *qend = q;

        /* Two hex digits decode to one byte, plus room for the NUL. */
        s = cli_max_malloc((q - start) / 2 + 1);
        if (s == NULL) { /* oops, couldn't allocate memory */
            cli_dbgmsg("pdf_readstring: unable to allocate memory...\n");
            return NULL;
        }

        if (cli_hex2str_to(start, s, q - start)) {
            cli_dbgmsg("pdf_readstring: %s has bad hex value\n", key);
            free(s);
            return NULL;
        }

        s[(q - start) / 2] = '\0';
        if (slen)
            *slen = (q - start) / 2;

        return s;
    }

    cli_dbgmsg("pdf_readstring: %s is invalid string in dict\n", key);
    return NULL;
}
2694 | | |
/**
 * @brief Read a name value (`/Key /Value`) from a PDF dictionary.
 *
 * The value must start with '/'. Characters after that slash are copied up
 * to, but not including, the next '/', a ">>" pair, a NUL byte, or the end of
 * the available bytes. Trailing whitespace is trimmed from the result.
 *
 * @param q     A pointer into the PDF dictionary.
 * @param len   Number of bytes available starting at q.
 * @param key   The key we're looking for.
 * @return char* Newly allocated, NUL-terminated copy of the name without its
 *               leading '/', or NULL if the key is missing, the value is not
 *               a name, or allocation fails.
 */
static char *pdf_readval(const char *q, int len, const char *key)
{
    const char *end;
    char *s;
    int origlen = len;

    q = pdf_getdict(q, &len, key);
    if (!q || len <= 0)
        return NULL;

    while (len > 0 && *q && *q == ' ') {
        q++;
        len--;
    }

    /* The space skip may have used up every remaining byte; do not look at
     * *q in that case. */
    if (len <= 0 || *q != '/')
        return NULL;

    q++;
    len--;
    end = q;

    while (len > 0 && *end && !(*end == '/' || (len > 1 && end[0] == '>' && end[1] == '>'))) {
        end++;
        len--;
    }

    /* end-of-buffer whitespace trimming.
     * The cast is required: passing a negative plain char to isspace() is
     * undefined behaviour. The scan cannot move before q because q[-1] is the
     * '/' checked above, which is not whitespace. */
    while (len < origlen && isspace((unsigned char)*(end - 1))) {
        end--;
        len++;
    }

    s = cli_max_malloc(end - q + 1);
    if (!s)
        return NULL;

    memcpy(s, q, end - q);
    s[end - q] = '\0';

    return s;
}
2737 | | |
2738 | | static int pdf_readint(const char *q0, int len, const char *key) |
2739 | 361k | { |
2740 | 361k | long value = 0; |
2741 | 361k | const char *q = pdf_getdict(q0, &len, key); |
2742 | | |
2743 | 361k | if (q == NULL) { |
2744 | 49.7k | value = -1; |
2745 | 311k | } else if (CL_SUCCESS != cli_strntol_wrap(q, (size_t)len, 0, 10, &value)) { |
2746 | 23.4k | value = -1; |
2747 | 23.4k | } |
2748 | 361k | return value; |
2749 | 361k | } |
2750 | | |
/**
 * @brief Read a boolean (`true` / `false`) value from a PDF dictionary.
 *
 * @param q0       A pointer into the PDF dictionary.
 * @param len      Number of bytes available starting at q0.
 * @param key      The key we're looking for.
 * @param Default  Value to return when the key is missing, fewer than five
 *                 bytes remain at the value, or the value is neither keyword.
 * @return int     1 for "true", 0 for "false", otherwise Default.
 */
static int pdf_readbool(const char *q0, int len, const char *key, int Default)
{
    const char *val = pdf_getdict(q0, &len, key);

    if (val != NULL && len >= 5) {
        if (0 == strncmp(val, "true", 4))
            return 1;

        if (0 == strncmp(val, "false", 5))
            return 0;

        cli_dbgmsg("pdf_readbool: invalid value for %s bool\n", key);
    }

    return Default;
}
2768 | | |
/* 32-byte password padding string. check_user_password() copies it in place of
 * the (empty) password when deriving and verifying the key for R 2-4. */
static const char *key_padding =
    "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41"
    "\x64\x00\x4e\x56\xff\xfa\x01\x08"
    "\x2e\x2e\x00\xB6\xD0\x68\x3E\x80"
    "\x2F\x0C\xA9\xFE\x64\x53\x69\x7A";
2772 | | |
2773 | | static void dbg_printhex(const char *msg, const char *hex, unsigned len) |
2774 | 75.6k | { |
2775 | 75.6k | if (cli_debug_flag) { |
2776 | 0 | char *kh = cli_str2hex(hex, len); |
2777 | |
|
2778 | 0 | cli_dbgmsg("cli_pdf: %s: %s\n", msg, kh); |
2779 | |
|
2780 | 0 | free(kh); |
2781 | 0 | } |
2782 | 75.6k | } |
2783 | | |
/**
 * @brief Compute the revision 6 password hash.
 *
 * The hash starts as SHA-256(password || 8 salt bytes [|| 48 bytes of U]) and
 * is then stirred through repeated rounds of AES-128-CBC encryption followed
 * by SHA-256/384/512.
 *
 * Some details and comments for how to compute this hash comes from the PyPDF project:
 * https://github.com/py-pdf/pypdf/blob/3.17.4/pypdf/_encryption.py#L568
 *
 * @param password The password to hash. The work buffer reserves 128 bytes
 *                 for it; pwlen is not checked here.
 * @param pwlen    The length of the password.
 * @param salt     The salt; only the first 8 bytes are read.
 * @param hash     Receives the first 32 bytes of the final round key.
 * @param U        [Optional] The U string (for owner-password checks); 48
 *                 bytes are read when it is not NULL.
 */
static void compute_hash_r6(const char *password, size_t pwlen, const unsigned char salt[16], unsigned char hash[32], const char *U)
{
    unsigned char work[(128 + 64 + 48) * 64]; /* 64 copies of password + key + U */
    unsigned char K[64];                      /* current round key (klen bytes in use) */
    uint8_t digest[64];                       /* scratch output for the round hash */
    int32_t klen    = 32;
    size_t unit_len = 0, enc_len;
    size_t seed_len;
    int32_t round, n, byte_sum;

    /*
     * Initial key: SHA-256 over the password, the 8 salt bytes and, for the
     * owner-password check, the 48-byte U string.
     */
    memcpy(work, password, pwlen);
    memcpy(work + pwlen, salt, 8);
    seed_len = pwlen + 8;

    if (NULL != U) {
        memcpy(work + seed_len, U, 48);
        seed_len += 48;
    }

    cl_sha256(work, seed_len, K, NULL);

    /*
     * At least 64 rounds; afterwards keep going while the round counter is
     * below (last byte of the previous round's encrypted data + 32). The
     * second operand is only evaluated once round >= 64, by which time
     * unit_len is non-zero.
     */
    for (round = 0; round < 64 || round < (work[(unit_len * 64) - 1] + 32); round++) {
        unsigned char *copy;

        /* One unit: password || K [|| U] */
        memcpy(work, password, pwlen);
        memcpy(work + pwlen, K, klen);
        unit_len = pwlen + klen;

        if (NULL != U) {
            memcpy(work + unit_len, U, 48);
            unit_len += 48;
        }

        /* Repeat the unit until the buffer holds 64 copies. */
        copy = work + unit_len;
        for (n = 1; n < 64; n++) {
            memcpy(copy, work, unit_len);
            copy += unit_len;
        }

        /* Encrypt in place: key = K[0..15], IV = K[16..31]. */
        aes_128cbc_encrypt(work, unit_len * 64, work, &enc_len, K, 16, K + 16);

        /* The byte sum of the first 16 encrypted bytes, mod 3, picks the hash. */
        byte_sum = 0;
        for (n = 0; n < 16; n++)
            byte_sum += work[n];

        klen = 32 + (byte_sum % 3) * 16;

        if (32 == klen) {
            cl_sha256(work, unit_len * 64, digest, NULL);
            memcpy(K, digest, 32);
        } else if (48 == klen) {
            cl_sha384(work, unit_len * 64, digest, NULL);
            memcpy(K, digest, 48);
        } else if (64 == klen) {
            cl_sha512(work, unit_len * 64, digest, NULL);
            memcpy(K, digest, 64);
        }
    }

    memcpy(hash, K, 32);
}
2861 | | |
2862 | | /** |
2863 | | * @brief Check if the owner password matches an empty password. |
2864 | | * |
2865 | | * Will set the DECRYPTABLE_PDF flag if the owner password is empty. |
2866 | | * Will also set the key and keylen fields in the pdf_struct. |
2867 | | * |
2868 | | * Some details and comments for how to check the owner password comes from the PyPDF project: |
2869 | | * https://github.com/py-pdf/pypdf/blob/3.17.4/pypdf/_encryption.py#L397 |
2870 | | * |
2871 | | * @param pdf The PDF context. |
2872 | | * @param R The encryption version. |
2873 | | * @param O The /O string. |
2874 | | * @param U The /U string. |
2875 | | * @param OE The /OE string. |
2876 | | * @param OE_len The length of the /OE string. |
2877 | | */ |
2878 | | static void check_owner_password(struct pdf_struct *pdf, int R, |
2879 | | const char *O, const char *U, |
2880 | | const char *OE, size_t OE_len) |
2881 | 9.95k | { |
2882 | 9.95k | bool password_empty = false; |
2883 | | |
2884 | 9.95k | dbg_printhex("U: ", U, 32); |
2885 | 9.95k | dbg_printhex("O: ", O, 32); |
2886 | | |
2887 | 9.95k | switch (R) { |
2888 | 3.93k | case 6: { |
2889 | 3.93k | unsigned char hash[32], validationkey[32]; |
2890 | | |
2891 | 3.93k | size_t pwlen = 0; |
2892 | 3.93k | char password[] = ""; |
2893 | | |
2894 | 3.93k | if (NULL == OE) { |
2895 | 466 | cli_dbgmsg("check_owner_password: Missing OE value!\n"); |
2896 | 466 | noisy_warnmsg("check_owner_password: Missing OE value!\n"); |
2897 | 466 | goto done; |
2898 | 466 | } |
2899 | | |
2900 | 3.46k | dbg_printhex("OE: ", OE, OE_len); |
2901 | | |
2902 | | /* |
2903 | | * Test the password against the owner key by computing the SHA-256 hash of the UTF-8 password concatenated |
2904 | | * with the 8 bytes of owner validation salt, concatenated with the 48-byte U string. |
2905 | | */ |
2906 | 3.46k | compute_hash_r6( |
2907 | 3.46k | password, |
2908 | 3.46k | pwlen, |
2909 | 3.46k | (const unsigned char *)(O + 32), // owner validation salt |
2910 | 3.46k | validationkey, |
2911 | 3.46k | U); |
2912 | | |
2913 | | /* If the 32-byte result matches the first 32 bytes of the O string, this is the owner password. */ |
2914 | 3.46k | if (0 != memcmp(O, validationkey, sizeof(validationkey))) { |
2915 | 3.46k | cli_dbgmsg("check_owner_password: Owner password check did not match!\n"); |
2916 | 3.46k | break; |
2917 | 3.46k | } |
2918 | | |
2919 | | /* |
2920 | | * Compute an intermediate owner key by computing the SHA-256 hash of the UTF-8 password concatenated with |
2921 | | * the 8 bytes of owner key salt, concatenated with the 48-byte U string. |
2922 | | */ |
2923 | 4 | compute_hash_r6( |
2924 | 4 | password, |
2925 | 4 | pwlen, |
2926 | 4 | (const unsigned char *)(O + 40), // owner key salt |
2927 | 4 | hash, |
2928 | 4 | U); |
2929 | | |
2930 | 4 | if (OE_len != 32) { |
2931 | 0 | cli_dbgmsg("check_owner_password: OE length is not 32: %zu\n", OE_len); |
2932 | 0 | noisy_warnmsg("check_owner_password: OE length is not 32: %zu\n", OE_len); |
2933 | 4 | } else { |
2934 | 4 | pdf->keylen = 32; |
2935 | 4 | pdf->key = cli_max_malloc(pdf->keylen); |
2936 | 4 | if (!pdf->key) { |
2937 | 0 | cli_errmsg("check_owner_password: Cannot allocate memory for pdf->key\n"); |
2938 | 0 | goto done; |
2939 | 0 | } |
2940 | | |
2941 | 4 | aes_256cbc_decrypt((const unsigned char *)OE, &OE_len, (unsigned char *)(pdf->key), (char *)hash, 32, 0); |
2942 | 4 | dbg_printhex("check_owner_password: Candidate encryption key", pdf->key, pdf->keylen); |
2943 | | |
2944 | 4 | password_empty = true; |
2945 | 4 | } |
2946 | | |
2947 | 4 | break; |
2948 | 4 | } |
2949 | 6.01k | default: { |
2950 | 6.01k | cli_dbgmsg("check_owner_password: Unknown or unsupported encryption version. R: %d\n", R); |
2951 | 6.01k | noisy_warnmsg("check_owner_password: Unknown or unsupported encryption version. R: %d\n", R); |
2952 | 6.01k | } |
2953 | 9.95k | } |
2954 | | |
2955 | 9.48k | if (password_empty) { |
2956 | | /* The key we computed above is the key used to encrypt the streams. We could decrypt it now if we wanted to */ |
2957 | 4 | pdf->flags |= 1 << DECRYPTABLE_PDF; |
2958 | | |
2959 | 4 | cli_dbgmsg("check_owner_password: encrypted PDF found, owner password is empty, will attempt to decrypt\n"); |
2960 | 4 | noisy_msg(pdf, "check_owner_password: encrypted PDF found, owner password is empty, will attempt to decrypt\n"); |
2961 | 9.48k | } else { |
2962 | | /* The key is not valid, we would need the user or the owner password to decrypt */ |
2963 | 9.48k | cli_dbgmsg("check_owner_password: encrypted PDF found but cannot decrypt with empty owner password\n"); |
2964 | 9.48k | noisy_warnmsg("check_owner_password: encrypted PDF found but cannot decrypt with empty owner password\n"); |
2965 | 9.48k | } |
2966 | | |
2967 | 9.95k | done: |
2968 | | |
2969 | 9.95k | return; |
2970 | 9.48k | } |
2971 | | |
2972 | | static void check_user_password(struct pdf_struct *pdf, int R, const char *O, |
2973 | | const char *U, int32_t P, int EM, |
2974 | | const char *UE, size_t UE_len, |
2975 | | unsigned length) |
2976 | 9.95k | { |
2977 | 9.95k | unsigned i; |
2978 | 9.95k | uint8_t result[16]; |
2979 | 9.95k | char data[32]; |
2980 | 9.95k | struct arc4_state arc4; |
2981 | 9.95k | bool password_empty = false; |
2982 | | |
2983 | 9.95k | dbg_printhex("U: ", U, 32); |
2984 | 9.95k | dbg_printhex("O: ", O, 32); |
2985 | | |
2986 | 9.95k | switch (R) { |
2987 | 2.00k | case 2: |
2988 | 4.64k | case 3: |
2989 | 5.98k | case 4: { |
2990 | 5.98k | unsigned char *d; |
2991 | 5.98k | size_t sz = 68 + pdf->fileIDlen + (R >= 4 && !EM ? 4 : 0); |
2992 | 5.98k | d = calloc(1, sz); |
2993 | | |
2994 | 5.98k | if (!(d)) |
2995 | 0 | goto done; |
2996 | | |
2997 | 5.98k | memcpy(d, key_padding, 32); |
2998 | 5.98k | memcpy(d + 32, O, 32); |
2999 | 5.98k | P = le32_to_host(P); |
3000 | 5.98k | memcpy(d + 64, &P, 4); |
3001 | 5.98k | memcpy(d + 68, pdf->fileID, pdf->fileIDlen); |
3002 | | |
3003 | | /* 7.6.3.3 Algorithm 2 */ |
3004 | | /* empty password, password == padding */ |
3005 | 5.98k | if (R >= 4 && !EM) { |
3006 | 0 | uint32_t v = 0xFFFFFFFF; |
3007 | 0 | memcpy(d + 68 + pdf->fileIDlen, &v, 4); |
3008 | 0 | } |
3009 | | |
3010 | 5.98k | cl_hash_data("md5", d, sz, result, NULL); |
3011 | 5.98k | free(d); |
3012 | 5.98k | if (length > 128) |
3013 | 89 | length = 128; |
3014 | 5.98k | if (R >= 3) { |
3015 | | /* Yes, this really is on purpose */ |
3016 | 203k | for (i = 0; i < 50; i++) |
3017 | 199k | cl_hash_data("md5", result, length / 8, result, NULL); |
3018 | 3.98k | } |
3019 | 5.98k | if (R == 2) |
3020 | 2.00k | length = 40; |
3021 | | |
3022 | 5.98k | pdf->keylen = length / 8; |
3023 | 5.98k | pdf->key = cli_max_malloc(pdf->keylen); |
3024 | 5.98k | if (!pdf->key) |
3025 | 0 | goto done; |
3026 | | |
3027 | 5.98k | memcpy(pdf->key, result, pdf->keylen); |
3028 | 5.98k | dbg_printhex("md5", (const char *)result, 16); |
3029 | 5.98k | dbg_printhex("Candidate encryption key", pdf->key, pdf->keylen); |
3030 | | |
3031 | | /* 7.6.3.3 Algorithm 6 */ |
3032 | 5.98k | if (R == 2) { |
3033 | | /* 7.6.3.3 Algorithm 4 */ |
3034 | 2.00k | memcpy(data, key_padding, 32); |
3035 | 2.00k | if (false == arc4_init(&arc4, (const uint8_t *)(pdf->key), pdf->keylen)) { |
3036 | 0 | noisy_warnmsg("check_user_password: failed to init arc4\n"); |
3037 | 0 | goto done; |
3038 | 0 | } |
3039 | 2.00k | arc4_apply(&arc4, (uint8_t *)data, 32); |
3040 | 2.00k | dbg_printhex("computed U (R2)", data, 32); |
3041 | 2.00k | if (!memcmp(data, U, 32)) |
3042 | 470 | password_empty = true; |
3043 | 3.98k | } else { |
3044 | | // R is 3 or 4 |
3045 | 3.98k | unsigned len = pdf->keylen; |
3046 | 3.98k | unsigned char *d; |
3047 | | |
3048 | 3.98k | d = calloc(1, 32 + pdf->fileIDlen); |
3049 | 3.98k | if (!(d)) |
3050 | 0 | goto done; |
3051 | | |
3052 | | /* 7.6.3.3 Algorithm 5 */ |
3053 | 3.98k | memcpy(d, key_padding, 32); |
3054 | 3.98k | memcpy(d + 32, pdf->fileID, pdf->fileIDlen); |
3055 | 3.98k | cl_hash_data("md5", d, 32 + pdf->fileIDlen, result, NULL); |
3056 | 3.98k | memcpy(data, pdf->key, len); |
3057 | | |
3058 | 3.98k | if (false == arc4_init(&arc4, (const uint8_t *)data, len)) { |
3059 | 0 | noisy_warnmsg("check_user_password: failed to init arc4\n"); |
3060 | 0 | goto done; |
3061 | 0 | } |
3062 | 3.98k | arc4_apply(&arc4, result, 16); |
3063 | 79.7k | for (i = 1; i <= 19; i++) { |
3064 | 75.7k | unsigned j; |
3065 | | |
3066 | 775k | for (j = 0; j < len; j++) |
3067 | 700k | data[j] = pdf->key[j] ^ i; |
3068 | | |
3069 | 75.7k | if (false == arc4_init(&arc4, (const uint8_t *)data, len)) { |
3070 | 0 | noisy_warnmsg("check_user_password: failed to init arc4\n"); |
3071 | 0 | goto done; |
3072 | 0 | } |
3073 | 75.7k | arc4_apply(&arc4, result, 16); |
3074 | 75.7k | } |
3075 | | |
3076 | 3.98k | dbg_printhex("fileID", pdf->fileID, pdf->fileIDlen); |
3077 | 3.98k | dbg_printhex("computed U (R>=3)", (const char *)result, 16); |
3078 | 3.98k | if (!memcmp(result, U, 16)) |
3079 | 942 | password_empty = true; |
3080 | 3.98k | free(d); |
3081 | 3.98k | } |
3082 | | |
3083 | 5.98k | break; |
3084 | 5.98k | } |
3085 | 5.98k | case 5: { |
3086 | 31 | uint8_t result2[32]; |
3087 | | |
3088 | | /* supplement to ISO3200, 3.5.2 Algorithm 3.11 */ |
3089 | | /* user validation salt */ |
3090 | 31 | cl_sha256(U + 32, 8, result2, NULL); |
3091 | 31 | dbg_printhex("Computed U", (const char *)result2, 32); |
3092 | 31 | if (!memcmp(result2, U, 32)) { |
3093 | | /* Algorithm 3.2a could be used to recover encryption key */ |
3094 | 0 | cl_sha256(U + 40, 8, result2, NULL); |
3095 | |
|
3096 | 0 | if (UE_len != 32) { |
3097 | 0 | cli_dbgmsg("check_user_password: UE length is not 32: %zu\n", UE_len); |
3098 | 0 | noisy_warnmsg("check_user_password: UE length is not 32: %zu\n", UE_len); |
3099 | 0 | } else { |
3100 | 0 | pdf->keylen = 32; |
3101 | 0 | pdf->key = cli_max_malloc(pdf->keylen); |
3102 | 0 | if (!pdf->key) { |
3103 | 0 | cli_errmsg("check_user_password: Cannot allocate memory for pdf->key\n"); |
3104 | 0 | goto done; |
3105 | 0 | } |
3106 | | |
3107 | 0 | aes_256cbc_decrypt((const unsigned char *)UE, &UE_len, (unsigned char *)(pdf->key), (char *)result2, 32, 0); |
3108 | 0 | dbg_printhex("check_user_password: Candidate encryption key", pdf->key, pdf->keylen); |
3109 | |
|
3110 | 0 | password_empty = true; |
3111 | 0 | } |
3112 | 0 | } |
3113 | | |
3114 | 31 | break; |
3115 | 31 | } |
3116 | 3.93k | case 6: { |
3117 | 3.93k | unsigned char hash[32], validationkey[32]; |
3118 | | |
3119 | 3.93k | size_t pwlen = 0; |
3120 | 3.93k | char password[] = ""; |
3121 | | |
3122 | 3.93k | if (NULL == UE) { |
3123 | 371 | cli_dbgmsg("check_user_password: Missing UE value!\n"); |
3124 | 371 | noisy_warnmsg("check_user_password: Missing UE value!\n"); |
3125 | 371 | goto done; |
3126 | 371 | } |
3127 | | |
3128 | 3.56k | dbg_printhex("UE: ", UE, UE_len); |
3129 | | |
3130 | | /* |
3131 | | * Test the password against the user key by computing the SHA-256 hash of the UTF-8 password concatenated |
3132 | | * with the 8 bytes of user validation salt. |
3133 | | */ |
3134 | 3.56k | compute_hash_r6( |
3135 | 3.56k | password, |
3136 | 3.56k | pwlen, |
3137 | 3.56k | (const unsigned char *)(U + 32), // user validation salt |
3138 | 3.56k | validationkey, |
3139 | 3.56k | NULL); // no U string for user password check |
3140 | | |
3141 | | /* If the 32-byte result matches the first 32 bytes of the U string, this is the user password. */ |
3142 | 3.56k | if (0 != memcmp(U, validationkey, sizeof(validationkey))) { |
3143 | 776 | cli_dbgmsg("check_user_password: User password check did not match!\n"); |
3144 | 776 | break; |
3145 | 776 | } |
3146 | | |
3147 | | /* |
3148 | | * Compute an intermediate user key by computing the SHA-256 hash of the UTF-8 password concatenated with |
3149 | | * the 8 bytes of user key salt. |
3150 | | */ |
3151 | 2.78k | compute_hash_r6( |
3152 | 2.78k | password, |
3153 | 2.78k | pwlen, |
3154 | 2.78k | (const unsigned char *)(U + 40), // user key salt |
3155 | 2.78k | hash, |
3156 | 2.78k | NULL); // no U string for user password check |
3157 | | |
3158 | 2.78k | if (UE_len != 32) { |
3159 | 281 | cli_dbgmsg("check_user_password: UE length is not 32: %zu\n", UE_len); |
3160 | 281 | noisy_warnmsg("check_user_password: UE length is not 32: %zu\n", UE_len); |
3161 | 2.50k | } else { |
3162 | 2.50k | pdf->keylen = 32; |
3163 | 2.50k | pdf->key = cli_max_malloc(pdf->keylen); |
3164 | 2.50k | if (!pdf->key) { |
3165 | 0 | cli_errmsg("check_user_password: Cannot allocate memory for pdf->key\n"); |
3166 | 0 | goto done; |
3167 | 0 | } |
3168 | | |
3169 | 2.50k | aes_256cbc_decrypt((const unsigned char *)UE, &UE_len, (unsigned char *)(pdf->key), (char *)hash, 32, 0); |
3170 | 2.50k | dbg_printhex("check_user_password: Candidate encryption key", pdf->key, pdf->keylen); |
3171 | | |
3172 | 2.50k | password_empty = true; |
3173 | 2.50k | } |
3174 | | |
3175 | 2.78k | break; |
3176 | 2.78k | } |
3177 | 2.78k | default: { |
3178 | | /* Supported R is in {2,3,4,5} */ |
3179 | 0 | cli_dbgmsg("check_user_password: R value out of range\n"); |
3180 | 0 | noisy_warnmsg("check_user_password: R value out of range\n"); |
3181 | 0 | } |
3182 | 9.95k | } |
3183 | | |
3184 | 9.57k | if (password_empty) { |
3185 | 3.91k | cli_dbgmsg("check_user_password: user password is empty\n"); |
3186 | 3.91k | noisy_msg(pdf, "check_user_password: encrypted PDF found, user password is empty, will attempt to decrypt\n"); |
3187 | | /* The key we computed above is the key used to encrypt the streams. |
3188 | | * We could decrypt it now if we wanted to */ |
3189 | 3.91k | pdf->flags |= 1 << DECRYPTABLE_PDF; |
3190 | 5.66k | } else { |
3191 | | /* the key is not valid, we would need the user or the owner password to decrypt */ |
3192 | 5.66k | cli_dbgmsg("check_user_password: user/owner password would be required for decryption\n"); |
3193 | 5.66k | noisy_warnmsg("check_user_password: encrypted PDF found, user password is NOT empty, cannot decrypt!\n"); |
3194 | 5.66k | } |
3195 | | |
3196 | 9.95k | done: |
3197 | 9.95k | return; |
3198 | 9.57k | } |
3199 | | |
3200 | | enum enc_method parse_enc_method(const char *dict, unsigned len, const char *key, enum enc_method def) |
3201 | 32.0k | { |
3202 | 32.0k | const char *q; |
3203 | 32.0k | char *CFM = NULL; |
3204 | 32.0k | enum enc_method ret = ENC_UNKNOWN; |
3205 | | |
3206 | 32.0k | if (!key) |
3207 | 15.6k | return def; |
3208 | | |
3209 | 16.4k | if (!strcmp(key, "Identity")) |
3210 | 69 | return ENC_IDENTITY; |
3211 | | |
3212 | 16.3k | q = pdf_getdict(dict, (int *)(&len), key); |
3213 | 16.3k | if (!q) |
3214 | 3.22k | return def; |
3215 | | |
3216 | 13.1k | CFM = pdf_readval(q, len, "/CFM"); |
3217 | 13.1k | if (CFM) { |
3218 | 8.74k | cli_dbgmsg("parse_enc_method: %s CFM: %s\n", key, CFM); |
3219 | 8.74k | if (!strncmp(CFM, "V2", 2)) |
3220 | 41 | ret = ENC_V2; |
3221 | 8.70k | else if (!strncmp(CFM, "AESV2", 5)) |
3222 | 857 | ret = ENC_AESV2; |
3223 | 7.85k | else if (!strncmp(CFM, "AESV3", 5)) |
3224 | 5.34k | ret = ENC_AESV3; |
3225 | 2.50k | else if (!strncmp(CFM, "None", 4)) |
3226 | 403 | ret = ENC_NONE; |
3227 | | |
3228 | 8.74k | free(CFM); |
3229 | 8.74k | } |
3230 | | |
3231 | 13.1k | return ret; |
3232 | 16.3k | } |
3233 | | |
3234 | | void pdf_handle_enc(struct pdf_struct *pdf) |
3235 | 433k | { |
3236 | 433k | struct pdf_obj *obj; |
3237 | 433k | uint32_t len, n, R, P, length, EM = 1, i, oulen; |
3238 | | |
3239 | 433k | char *O = NULL; |
3240 | 433k | char *OE = NULL; |
3241 | 433k | size_t OE_len = 0; |
3242 | | |
3243 | 433k | char *U = NULL; |
3244 | 433k | char *UE = NULL; |
3245 | 433k | size_t UE_len = 0; |
3246 | | |
3247 | 433k | char *StmF = NULL; |
3248 | 433k | char *StrF = NULL; |
3249 | 433k | char *EFF = NULL; |
3250 | | |
3251 | 433k | const char *q, *q2; |
3252 | | |
3253 | 433k | if (pdf->enc_objid == ~0u) |
3254 | 395k | return; |
3255 | 38.5k | if (!pdf->fileID) { |
3256 | 6.49k | cli_dbgmsg("pdf_handle_enc: no file ID\n"); |
3257 | 6.49k | noisy_warnmsg("pdf_handle_enc: no file ID\n"); |
3258 | 6.49k | return; |
3259 | 6.49k | } |
3260 | | |
3261 | 32.0k | obj = find_obj(pdf, pdf->objs[0], pdf->enc_objid); |
3262 | 32.0k | if (!obj) { |
3263 | 6.14k | cli_dbgmsg("pdf_handle_enc: can't find encrypted object %d %d\n", pdf->enc_objid >> 8, pdf->enc_objid & 0xff); |
3264 | 6.14k | noisy_warnmsg("pdf_handle_enc: can't find encrypted object %d %d\n", pdf->enc_objid >> 8, pdf->enc_objid & 0xff); |
3265 | 6.14k | return; |
3266 | 6.14k | } |
3267 | | |
3268 | 25.9k | len = obj->size; |
3269 | | |
3270 | 25.9k | q = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
3271 | 25.9k | : (const char *)(obj->start + pdf->map); |
3272 | | |
3273 | 25.9k | O = U = UE = StmF = StrF = EFF = NULL; |
3274 | | |
3275 | 25.9k | pdf->enc_method_string = ENC_UNKNOWN; |
3276 | 25.9k | pdf->enc_method_stream = ENC_UNKNOWN; |
3277 | 25.9k | pdf->enc_method_embeddedfile = ENC_UNKNOWN; |
3278 | | |
3279 | 25.9k | q2 = cli_memstr(q, len, "/Standard", 9); |
3280 | 25.9k | if (!q2) { |
3281 | 4.60k | cli_dbgmsg("pdf_handle_enc: /Standard not found\n"); |
3282 | 4.60k | noisy_warnmsg("pdf_handle_enc: /Standard not found\n"); |
3283 | 4.60k | goto done; |
3284 | 4.60k | } |
3285 | | |
3286 | | /* we can have both of these: |
3287 | | * /AESV2/Length /Standard/Length |
3288 | | * /Length /Standard |
3289 | | * make sure we don't mistake AES's length for Standard's */ |
3290 | 21.3k | length = pdf_readint(q2, len - (q2 - q), "/Length"); |
3291 | 21.3k | if (length == ~0u) |
3292 | 11.9k | length = pdf_readint(q, len, "/Length"); |
3293 | | |
3294 | 21.3k | if (length < 40) { |
3295 | 2.51k | cli_dbgmsg("pdf_handle_enc: invalid length: %d\n", length); |
3296 | 2.51k | length = 40; |
3297 | 2.51k | } |
3298 | | |
3299 | 21.3k | R = pdf_readint(q, len, "/R"); |
3300 | 21.3k | if (R == ~0u) { |
3301 | 1.08k | cli_dbgmsg("pdf_handle_enc: invalid R\n"); |
3302 | 1.08k | noisy_warnmsg("pdf_handle_enc: invalid R\n"); |
3303 | 1.08k | goto done; |
3304 | 1.08k | } |
3305 | | |
3306 | 20.2k | if ((R > 6) || (R < 2)) { |
3307 | 589 | cli_dbgmsg("pdf_handle_enc: R value outside supported range [2..6]\n"); |
3308 | 589 | noisy_warnmsg("pdf_handle_enc: R value outside supported range [2..6]\n"); |
3309 | 589 | goto done; |
3310 | 589 | } |
3311 | | |
3312 | 19.6k | P = pdf_readint(q, len, "/P"); |
3313 | 19.6k | if (R < 6) { // P field doesn't seem to be required for R6. |
3314 | 11.6k | if (P == ~0u) { |
3315 | 1.94k | cli_dbgmsg("pdf_handle_enc: invalid P\n"); |
3316 | 1.94k | noisy_warnmsg("pdf_handle_enc: invalid P\n"); |
3317 | 1.94k | goto done; |
3318 | 1.94k | } |
3319 | 11.6k | } |
3320 | | |
3321 | 17.6k | if (R < 5) { |
3322 | 9.45k | oulen = 32; |
3323 | 9.45k | } else { |
3324 | 8.23k | oulen = 48; |
3325 | 8.23k | } |
3326 | | |
3327 | 17.6k | if (R == 2 || R == 3) { |
3328 | 7.11k | pdf->enc_method_stream = ENC_V2; |
3329 | 7.11k | pdf->enc_method_string = ENC_V2; |
3330 | 7.11k | pdf->enc_method_embeddedfile = ENC_V2; |
3331 | 10.5k | } else if (R == 4 || R == 5 || R == 6) { |
3332 | 10.5k | EM = pdf_readbool(q, len, "/EncryptMetadata", 1); |
3333 | 10.5k | StmF = pdf_readval(q, len, "/StmF"); |
3334 | 10.5k | StrF = pdf_readval(q, len, "/StrF"); |
3335 | 10.5k | EFF = pdf_readval(q, len, "/EFF"); |
3336 | 10.5k | n = len; |
3337 | 10.5k | pdf->CF = pdf_getdict(q, (int *)(&n), "/CF"); |
3338 | 10.5k | pdf->CF_n = n; |
3339 | | |
3340 | 10.5k | if (StmF) { |
3341 | 8.00k | cli_dbgmsg("pdf_handle_enc: StmF: %s\n", StmF); |
3342 | 8.00k | } |
3343 | 10.5k | if (StrF) { |
3344 | 7.76k | cli_dbgmsg("pdf_handle_enc: StrF: %s\n", StrF); |
3345 | 7.76k | } |
3346 | 10.5k | if (EFF) { |
3347 | 342 | cli_dbgmsg("pdf_handle_enc: EFF: %s\n", EFF); |
3348 | 342 | } |
3349 | | |
3350 | 10.5k | pdf->enc_method_stream = parse_enc_method(pdf->CF, n, StmF, ENC_IDENTITY); |
3351 | 10.5k | pdf->enc_method_string = parse_enc_method(pdf->CF, n, StrF, ENC_IDENTITY); |
3352 | 10.5k | pdf->enc_method_embeddedfile = parse_enc_method(pdf->CF, n, EFF, pdf->enc_method_stream); |
3353 | | |
3354 | 10.5k | cli_dbgmsg("pdf_handle_enc: EncryptMetadata: %s\n", EM ? "true" : "false"); |
3355 | | |
3356 | 10.5k | if (R == 4) { |
3357 | 2.33k | length = 128; |
3358 | 8.23k | } else { |
3359 | 8.23k | length = 256; |
3360 | | |
3361 | | /* |
3362 | | * Read the UE value (for checking user-password) |
3363 | | */ |
3364 | 8.23k | n = 0; |
3365 | 8.23k | UE = pdf_readstring(q, len, "/UE", &n, NULL, false); |
3366 | 8.23k | UE_len = n; |
3367 | | |
3368 | | /* |
3369 | | * Read the OE value (for checking owner-password) |
3370 | | */ |
3371 | 8.23k | n = 0; |
3372 | 8.23k | OE = pdf_readstring(q, len, "/OE", &n, NULL, false); |
3373 | 8.23k | OE_len = n; |
3374 | 8.23k | } |
3375 | 10.5k | } |
3376 | | |
3377 | 17.6k | if (length == ~0u) |
3378 | 5.41k | length = 40; |
3379 | | |
3380 | | /* |
3381 | | * Read the O value |
3382 | | */ |
3383 | 17.6k | n = 0; |
3384 | 17.6k | O = pdf_readstring(q, len, "/O", &n, NULL, false); |
3385 | 17.6k | if (!O || n < oulen) { |
3386 | 3.74k | cli_dbgmsg("pdf_handle_enc: invalid O: %d\n", n); |
3387 | 3.74k | noisy_warnmsg("pdf_handle_enc: invalid O: %d\n", n); |
3388 | 3.74k | if (O) { |
3389 | 1.81k | dbg_printhex("invalid O", O, n); |
3390 | 1.81k | } |
3391 | | |
3392 | 3.74k | goto done; |
3393 | 3.74k | } |
3394 | 13.9k | if (n > oulen) { |
3395 | 364k | for (i = oulen; i < n; i++) { |
3396 | 359k | if (O[i]) { |
3397 | 1.56k | dbg_printhex("pdf_handle_enc: too long O", O, n); |
3398 | 1.56k | noisy_warnmsg("pdf_handle_enc: too long O: %u", n); |
3399 | 1.56k | goto done; |
3400 | 1.56k | } |
3401 | 359k | } |
3402 | 6.67k | } |
3403 | | |
3404 | | /* |
3405 | | * Read the U value |
3406 | | */ |
3407 | 12.3k | n = 0; |
3408 | 12.3k | U = pdf_readstring(q, len, "/U", &n, NULL, false); |
3409 | 12.3k | if (!U || n < oulen) { |
3410 | 1.12k | cli_dbgmsg("pdf_handle_enc: invalid U: %u\n", n); |
3411 | 1.12k | noisy_warnmsg("pdf_handle_enc: invalid U: %u\n", n); |
3412 | 1.12k | if (U) { |
3413 | 433 | dbg_printhex("invalid U", U, n); |
3414 | 433 | } |
3415 | | |
3416 | 1.12k | goto done; |
3417 | 1.12k | } |
3418 | | |
3419 | 11.2k | if (n > oulen) { |
3420 | 274k | for (i = oulen; i < n; i++) { |
3421 | 270k | if (U[i]) { |
3422 | 486 | dbg_printhex("too long U", U, n); |
3423 | 486 | goto done; |
3424 | 486 | } |
3425 | 270k | } |
3426 | 4.22k | } |
3427 | | |
3428 | 10.7k | cli_dbgmsg("pdf_handle_enc: Encrypt R: %d, P %x, length: %u\n", R, P, length); |
3429 | 10.7k | if (length % 8) { |
3430 | 808 | cli_dbgmsg("pdf_handle_enc: wrong key length, not multiple of 8\n"); |
3431 | 808 | noisy_warnmsg("pdf_handle_enc: wrong key length, not multiple of 8\n"); |
3432 | 808 | goto done; |
3433 | 808 | } |
3434 | | |
3435 | | // Check the owner password. |
3436 | 9.95k | check_owner_password(pdf, R, O, U, OE, OE_len); |
3437 | | |
3438 | 9.95k | if (NULL == pdf->key) { |
3439 | | // Wasn't the owner password, let's try the user password. |
3440 | 9.95k | check_user_password(pdf, R, O, U, P, EM, UE, UE_len, length); |
3441 | 9.95k | } |
3442 | | |
3443 | 25.9k | done: |
3444 | 25.9k | free(O); |
3445 | 25.9k | free(OE); |
3446 | | |
3447 | 25.9k | free(U); |
3448 | 25.9k | free(UE); |
3449 | | |
3450 | 25.9k | free(StmF); |
3451 | 25.9k | free(StrF); |
3452 | 25.9k | free(EFF); |
3453 | 25.9k | } |
3454 | | |
3455 | | /** |
3456 | | * @brief Search pdf buffer for objects. Parse each. |
3457 | | * |
3458 | | * Newly found objects will be extracted after completion when the extraction for loop continues. |
3459 | | * |
3460 | | * @param pdf Pdf struct that keeps track of all information found in the PDF. |
3461 | | * @param objstm Pointer to an object stream to parse. |
3462 | | * |
3463 | | * @return cl_error_t Error code. |
3464 | | */ |
3465 | | cl_error_t pdf_find_and_parse_objs_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm) |
3466 | 54.5k | { |
3467 | 54.5k | cl_error_t status = CL_EFORMAT; |
3468 | 54.5k | cl_error_t retval = CL_EPARSE; |
3469 | 54.5k | uint32_t badobjects = 0; |
3470 | 54.5k | size_t i = 0; |
3471 | | |
3472 | 54.5k | struct pdf_obj *obj = NULL; |
3473 | | |
3474 | 54.5k | if ((NULL == objstm) || (NULL == objstm->streambuf)) { |
3475 | 0 | status = CL_EARG; |
3476 | 0 | goto done; |
3477 | 0 | } |
3478 | | |
3479 | 54.5k | if ((0 == objstm->first) || |
3480 | 54.5k | (0 == objstm->streambuf_len) || |
3481 | 54.5k | (0 == objstm->n)) { |
3482 | 3.31k | cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Empty object stream.\n"); |
3483 | 3.31k | goto done; |
3484 | 3.31k | } |
3485 | | |
3486 | 51.2k | if (objstm->first >= objstm->streambuf_len) { |
3487 | 10.0k | cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Invalid objstm values. Offset of first obj greater than stream length.\n"); |
3488 | 10.0k | goto done; |
3489 | 10.0k | } |
3490 | | |
3491 | | /* Process each object */ |
3492 | 280k | for (i = 0; i < objstm->n; i++) { |
3493 | 254k | obj = NULL; |
3494 | | |
3495 | 254k | if (cli_checktimelimit(pdf->ctx) != CL_SUCCESS) { |
3496 | 0 | cli_dbgmsg("Timeout reached in the PDF parser while parsing object stream.\n"); |
3497 | 0 | status = CL_ETIMEOUT; |
3498 | 0 | goto done; |
3499 | 0 | } |
3500 | | |
3501 | | /* Find object */ |
3502 | 254k | retval = pdf_findobj_in_objstm(pdf, objstm, &obj); |
3503 | 254k | if (retval != CL_SUCCESS) { |
3504 | 14.7k | if (retval != CL_BREAK) { |
3505 | 14.7k | cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Fewer objects in stream than expected: %u found, %u expected.\n", |
3506 | 14.7k | objstm->nobjs_found, objstm->n); |
3507 | 14.7k | badobjects++; |
3508 | 14.7k | pdf->stats.ninvalidobjs++; |
3509 | 14.7k | } |
3510 | 14.7k | break; |
3511 | 14.7k | } |
3512 | | |
3513 | 239k | cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Found object %u %u in object stream at offset: %u\n", obj->id >> 8, obj->id & 0xff, obj->start); |
3514 | | |
3515 | 239k | if (cli_checktimelimit(pdf->ctx) != CL_SUCCESS) { |
3516 | 0 | cli_dbgmsg("Timeout reached in the PDF parser while parsing object stream.\n"); |
3517 | 0 | status = CL_ETIMEOUT; |
3518 | 0 | goto done; |
3519 | 0 | } |
3520 | | |
3521 | | /* Parse object */ |
3522 | 239k | pdf_parseobj(pdf, obj); |
3523 | 239k | } |
3524 | | |
3525 | 41.2k | if (badobjects) { |
3526 | 14.7k | status = CL_EFORMAT; |
3527 | 14.7k | goto done; |
3528 | 14.7k | } |
3529 | | |
3530 | 26.4k | status = CL_SUCCESS; |
3531 | | |
3532 | 54.5k | done: |
3533 | 54.5k | return status; |
3534 | 26.4k | } |
3535 | | |
3536 | | /** |
3537 | | * @brief Search pdf buffer for objects. Parse each and then extract each. |
3538 | | * |
3539 | | * @param pdf Pdf struct that keeps track of all information found in the PDF. |
3540 | | * |
3541 | | * @return cl_error_t Error code. |
3542 | | */ |
3543 | | static cl_error_t pdf_find_and_extract_objs(struct pdf_struct *pdf) |
3544 | 433k | { |
3545 | 433k | cl_error_t status = CL_SUCCESS; |
3546 | 433k | int32_t rv = 0; |
3547 | 433k | unsigned int i = 0; |
3548 | 433k | uint32_t badobjects = 0; |
3549 | 433k | cli_ctx *ctx = NULL; |
3550 | | |
3551 | 433k | if (NULL == pdf) { |
3552 | 0 | cli_errmsg("pdf_find_and_extract_objs: Invalid arguments.\n"); |
3553 | 0 | status = CL_EARG; |
3554 | 0 | goto done; |
3555 | 0 | } |
3556 | | |
3557 | 433k | ctx = pdf->ctx; |
3558 | | |
3559 | | /* parse PDF and find obj offsets */ |
3560 | 2.08M | while (CL_BREAK != (rv = pdf_findobj(pdf))) { |
3561 | 1.64M | if (rv == CL_EMEM) { |
3562 | 0 | cli_errmsg("pdf_find_and_extract_objs: Memory allocation error.\n"); |
3563 | 0 | status = CL_EMEM; |
3564 | 0 | goto done; |
3565 | 0 | } |
3566 | 1.64M | } |
3567 | | |
3568 | | /* must parse after finding all objs, so we can flag indirect objects */ |
3569 | 1.78M | for (i = 0; i < pdf->nobjs; i++) { |
3570 | 1.35M | struct pdf_obj *obj = pdf->objs[i]; |
3571 | | |
3572 | 1.35M | if (cli_checktimelimit(pdf->ctx) != CL_SUCCESS) { |
3573 | 0 | cli_dbgmsg("pdf_find_and_extract_objs: Timeout reached in the PDF parser while parsing objects.\n"); |
3574 | |
|
3575 | 0 | status = CL_ETIMEOUT; |
3576 | 0 | goto done; |
3577 | 0 | } |
3578 | | |
3579 | 1.35M | pdf_parseobj(pdf, obj); |
3580 | 1.35M | } |
3581 | | |
3582 | 433k | pdf_handle_enc(pdf); |
3583 | 433k | if (pdf->flags & (1 << ENCRYPTED_PDF)) |
3584 | 53.3k | cli_dbgmsg("pdf_find_and_extract_objs: encrypted pdf found, %s!\n", |
3585 | 53.3k | (pdf->flags & (1 << DECRYPTABLE_PDF)) ? "decryptable" : "not decryptable, stream will probably fail to decompress"); |
3586 | | |
3587 | 433k | if (SCAN_HEURISTIC_ENCRYPTED_DOC && |
3588 | 433k | (pdf->flags & (1 << ENCRYPTED_PDF)) && |
3589 | 433k | !(pdf->flags & (1 << DECRYPTABLE_PDF))) { |
3590 | | /* It is encrypted, and a password/key needs to be supplied to decrypt. |
3591 | | * This doesn't trigger for PDFs that are encrypted but don't need |
3592 | | * a password to decrypt */ |
3593 | 49.3k | status = cli_append_potentially_unwanted(pdf->ctx, "Heuristics.Encrypted.PDF"); |
3594 | 49.3k | } |
3595 | | |
3596 | 433k | if (CL_SUCCESS == status) { |
3597 | 433k | status = run_pdf_hooks(pdf, PDF_PHASE_PARSED, -1); |
3598 | 433k | cli_dbgmsg("pdf_find_and_extract_objs: (parsed hooks) returned %d\n", status); |
3599 | 433k | } |
3600 | | |
3601 | 433k | if (CL_SUCCESS == status) { |
3602 | | /* extract PDF objs */ |
3603 | 2.02M | for (i = 0; !status && i < pdf->nobjs; i++) { |
3604 | 1.58M | struct pdf_obj *obj = pdf->objs[i]; |
3605 | | |
3606 | 1.58M | if (cli_checktimelimit(pdf->ctx) != CL_SUCCESS) { |
3607 | 0 | cli_dbgmsg("pdf_find_and_extract_objs: Timeout reached in the PDF parser while extracting objects.\n"); |
3608 | |
|
3609 | 0 | status = CL_ETIMEOUT; |
3610 | 0 | goto done; |
3611 | 0 | } |
3612 | | |
3613 | 1.58M | pdf->parse_recursion_depth++; |
3614 | 1.58M | status = pdf_extract_obj(pdf, obj, PDF_EXTRACT_OBJ_SCAN); |
3615 | 1.58M | pdf->parse_recursion_depth--; |
3616 | 1.58M | switch (status) { |
3617 | 0 | case CL_EFORMAT: |
3618 | | /* Don't halt on one bad object */ |
3619 | 0 | cli_dbgmsg("pdf_find_and_extract_objs: Format error when extracting object, skipping to the next object.\n"); |
3620 | 0 | badobjects++; |
3621 | 0 | pdf->stats.ninvalidobjs++; |
3622 | 0 | status = CL_CLEAN; |
3623 | 0 | break; |
3624 | 0 | case CL_VIRUS: |
3625 | 0 | break; |
3626 | 1.58M | default: |
3627 | 1.58M | break; |
3628 | 1.58M | } |
3629 | 1.58M | } |
3630 | 433k | } |
3631 | | |
3632 | 433k | done: |
3633 | 433k | if ((CL_SUCCESS == status) && badobjects) { |
3634 | 0 | status = CL_EFORMAT; |
3635 | 0 | } |
3636 | | |
3637 | 433k | return status; |
3638 | 433k | } |
3639 | | |
3640 | | /** |
3641 | | * @brief Primary function for parsing and scanning a PDF. |
3642 | | * |
3643 | | * @param dir Filepath for temp file. |
3644 | | * @param ctx clam scan context structure. |
3645 | | * @param offset offset of pdf in ctx->fmap |
3646 | | * |
3647 | | * @return int Returns cl_error_t status value. |
3648 | | */ |
3649 | | cl_error_t cli_pdf(const char *dir, cli_ctx *ctx, off_t offset) |
3650 | 441k | { |
3651 | 441k | cl_error_t rc = CL_SUCCESS; |
3652 | 441k | struct pdf_struct pdf; |
3653 | 441k | fmap_t *map = ctx->fmap; |
3654 | 441k | size_t size = map->len - offset; |
3655 | 441k | off_t versize = size > 1032 ? 1032 : size; |
3656 | 441k | off_t map_off, bytesleft; |
3657 | 441k | unsigned long xref; |
3658 | 441k | long temp_long; |
3659 | 441k | const char *pdfver, *tmp, *start, *eofmap, *q, *eof; |
3660 | 441k | unsigned i; |
3661 | 441k | unsigned int objs_found = 0; |
3662 | | |
3663 | 441k | json_object *pdfobj = NULL; |
3664 | 441k | char *begin, *end, *p1; |
3665 | | |
3666 | 441k | cli_dbgmsg("in cli_pdf(%s)\n", dir); |
3667 | 441k | memset(&pdf, 0, sizeof(pdf)); |
3668 | 441k | pdf.ctx = ctx; |
3669 | 441k | pdf.dir = dir; |
3670 | 441k | pdf.enc_objid = ~0u; |
3671 | | |
3672 | 441k | pdfver = start = fmap_need_off_once(map, offset, versize); |
3673 | | |
3674 | | /* Check PDF version */ |
3675 | 441k | if (!pdfver) { |
3676 | 0 | cli_errmsg("cli_pdf: mmap() failed (1)\n"); |
3677 | 0 | rc = CL_EMAP; |
3678 | 0 | goto done; |
3679 | 0 | } |
3680 | | |
3681 | 441k | if (ctx->wrkproperty) |
3682 | 441k | pdfobj = cli_jsonobj(ctx->wrkproperty, "PDFStats"); |
3683 | | |
3684 | | /* offset is 0 when coming from filetype2 */ |
3685 | 441k | tmp = cli_memstr(pdfver, versize, "%PDF-", 5); |
3686 | 441k | if (!tmp) { |
3687 | 7.32k | cli_dbgmsg("cli_pdf: no PDF- header found\n"); |
3688 | 7.32k | noisy_warnmsg("cli_pdf: no PDF- header found\n"); |
3689 | | |
3690 | 7.32k | rc = CL_SUCCESS; |
3691 | 7.32k | goto done; |
3692 | 7.32k | } |
3693 | | |
3694 | 434k | versize -= tmp - pdfver; |
3695 | 434k | pdfver = tmp; |
3696 | | |
3697 | 434k | if (versize < 8) { |
3698 | 352 | rc = CL_EFORMAT; |
3699 | 352 | goto done; |
3700 | 352 | } |
3701 | | |
3702 | | /* Check for PDF-1.[0-9]. Although 1.7 is highest now, allow for future versions */ |
3703 | 433k | if (pdfver[5] != '1' || pdfver[6] != '.' || |
3704 | 433k | pdfver[7] < '1' || pdfver[7] > '9') { |
3705 | 304k | pdf.flags |= 1 << BAD_PDF_VERSION; |
3706 | 304k | cli_dbgmsg("cli_pdf: bad pdf version: %.8s\n", pdfver); |
3707 | | |
3708 | 304k | if (pdfobj) |
3709 | 304k | cli_jsonbool(pdfobj, "BadVersion", 1); |
3710 | 304k | } else { |
3711 | 129k | if (pdfobj) { |
3712 | 129k | begin = (char *)(pdfver + 5); |
3713 | 129k | end = begin + 2; |
3714 | 129k | strtoul(end, &end, 10); |
3715 | 129k | p1 = cli_max_calloc((end - begin) + 2, 1); |
3716 | 129k | if (p1) { |
3717 | 129k | strncpy(p1, begin, end - begin); |
3718 | 129k | p1[end - begin] = '\0'; |
3719 | 129k | cli_jsonstr(pdfobj, "PDFVersion", p1); |
3720 | 129k | free(p1); |
3721 | 129k | } |
3722 | 129k | } |
3723 | 129k | } |
3724 | | |
3725 | 433k | if (pdfver != start || offset) { |
3726 | 414k | pdf.flags |= 1 << BAD_PDF_HEADERPOS; |
3727 | 414k | cli_dbgmsg("cli_pdf: PDF header is not at position 0: %lld\n", (long long)(pdfver - start + offset)); |
3728 | | |
3729 | 414k | if (pdfobj) |
3730 | 414k | cli_jsonbool(pdfobj, "BadVersionLocation", 1); |
3731 | 414k | } |
3732 | | |
3733 | 433k | offset += pdfver - start; |
3734 | | |
3735 | | /* find trailer and xref, don't fail if not found */ |
3736 | 433k | map_off = (off_t)map->len - 2048; |
3737 | 433k | if (map_off < 0) |
3738 | 284k | map_off = 0; |
3739 | | |
3740 | 433k | bytesleft = map->len - map_off; |
3741 | | |
3742 | 433k | eofmap = fmap_need_off_once(map, map_off, bytesleft); |
3743 | 433k | if (!eofmap) { |
3744 | 0 | cli_errmsg("cli_pdf: mmap() failed (2)\n"); |
3745 | |
|
3746 | 0 | rc = CL_EMAP; |
3747 | 0 | goto done; |
3748 | 0 | } |
3749 | | |
3750 | 433k | eof = eofmap + bytesleft; |
3751 | 469M | for (q = &eofmap[bytesleft - 5]; q > eofmap; q--) { |
3752 | 468M | if (memcmp(q, "%%EOF", 5) == 0) |
3753 | 90.7k | break; |
3754 | 468M | } |
3755 | | |
3756 | 433k | if (q <= eofmap) { |
3757 | 342k | pdf.flags |= 1 << BAD_PDF_TRAILER; |
3758 | 342k | cli_dbgmsg("cli_pdf: %%%%EOF not found\n"); |
3759 | | |
3760 | 342k | if (pdfobj) |
3761 | 342k | cli_jsonbool(pdfobj, "NoEOF", 1); |
3762 | 342k | } else { |
3763 | 90.7k | const char *t; |
3764 | | |
3765 | | /*size = q - eofmap + map_off;*/ |
3766 | 90.7k | q -= 9; |
3767 | 27.2M | for (; q > eofmap; q--) { |
3768 | 27.1M | if (memcmp(q, "startxref", 9) == 0) |
3769 | 59.4k | break; |
3770 | 27.1M | } |
3771 | | |
3772 | 90.7k | if (q <= eofmap) { |
3773 | 31.2k | pdf.flags |= 1 << BAD_PDF_TRAILER; |
3774 | 31.2k | cli_dbgmsg("cli_pdf: startxref not found\n"); |
3775 | | |
3776 | 31.2k | if (pdfobj) |
3777 | 31.2k | cli_jsonbool(pdfobj, "NoXREF", 1); |
3778 | 59.4k | } else { |
3779 | 50.6M | for (t = q; t > eofmap; t--) { |
3780 | 50.5M | if (memcmp(t, "trailer", 7) == 0) |
3781 | 8.38k | break; |
3782 | 50.5M | } |
3783 | | |
3784 | 59.4k | pdf_parse_trailer(&pdf, eofmap, eof - eofmap); |
3785 | 59.4k | q += 9; |
3786 | | |
3787 | 137k | while (q < eof && (*q == ' ' || *q == '\n' || *q == '\r')) { |
3788 | 78.3k | q++; |
3789 | 78.3k | } |
3790 | | |
3791 | 59.4k | if (CL_SUCCESS != cli_strntol_wrap(q, q - eofmap + map_off, 0, 10, &temp_long)) { |
3792 | 9.02k | cli_dbgmsg("cli_pdf: failed to parse PDF trailer xref\n"); |
3793 | 9.02k | pdf.flags |= 1 << BAD_PDF_TRAILER; |
3794 | 50.4k | } else if (temp_long < 0) { |
3795 | 4.26k | cli_dbgmsg("cli_pdf: Encountered invalid negative PDF trailer xref (%ld).\n", temp_long); |
3796 | 4.26k | pdf.flags |= 1 << BAD_PDF_TRAILER; |
3797 | 46.1k | } else { |
3798 | 46.1k | xref = (unsigned long)temp_long; |
3799 | 46.1k | bytesleft = map->len - offset - xref; |
3800 | 46.1k | if (bytesleft > 4096) |
3801 | 5.24k | bytesleft = 4096; |
3802 | | |
3803 | 46.1k | q = fmap_need_off_once(map, offset + xref, bytesleft); |
3804 | 46.1k | if (!q || xrefCheck(q, q + bytesleft) == -1) { |
3805 | 41.4k | cli_dbgmsg("cli_pdf: did not find valid xref\n"); |
3806 | 41.4k | pdf.flags |= 1 << BAD_PDF_TRAILER; |
3807 | 41.4k | } |
3808 | 46.1k | } |
3809 | 59.4k | } |
3810 | 90.7k | } |
3811 | | |
3812 | 433k | size -= offset; |
3813 | 433k | pdf.size = size; |
3814 | 433k | pdf.map = fmap_need_off(map, offset, size); |
3815 | 433k | if (!pdf.map) { |
3816 | 0 | cli_errmsg("cli_pdf: mmap() failed (3)\n"); |
3817 | |
|
3818 | 0 | rc = CL_EMAP; |
3819 | 0 | goto done; |
3820 | 0 | } |
3821 | | |
3822 | 433k | pdf.startoff = offset; |
3823 | | |
3824 | 433k | rc = run_pdf_hooks(&pdf, PDF_PHASE_PRE, -1); |
3825 | 433k | if (CL_SUCCESS != rc) { |
3826 | 0 | cli_dbgmsg("cli_pdf: (pre hooks) returning %d\n", rc); |
3827 | |
|
3828 | 0 | rc = rc == CL_BREAK ? CL_CLEAN : rc; |
3829 | 0 | goto done; |
3830 | 0 | } |
3831 | | |
3832 | | /* |
3833 | | * Find and extract all objects in the PDF. |
3834 | | * This methodology adds objects from object streams. |
3835 | | */ |
3836 | 433k | objs_found = pdf.nobjs; |
3837 | 433k | rc = pdf_find_and_extract_objs(&pdf); |
3838 | | |
3839 | 433k | if (CL_EMEM == rc) { |
3840 | 5 | cli_dbgmsg("cli_pdf: pdf_find_and_extract_objs had an allocation failure\n"); |
3841 | 5 | goto err; |
3842 | 433k | } else if (pdf.nobjs <= objs_found) { |
3843 | 49.6k | cli_dbgmsg("cli_pdf: pdf_find_and_extract_objs did not find any new objects!\n"); |
3844 | 384k | } else { |
3845 | 384k | cli_dbgmsg("cli_pdf: pdf_find_and_extract_objs found %d new objects.\n", pdf.nobjs - objs_found); |
3846 | 384k | } |
3847 | | |
3848 | 433k | if (pdf.flags & (1 << ENCRYPTED_PDF)) |
3849 | 53.3k | pdf.flags &= ~((1 << BAD_FLATESTART) | (1 << BAD_STREAMSTART) | (1 << BAD_ASCIIDECODE)); |
3850 | | |
3851 | 433k | if (pdf.flags && CL_SUCCESS == rc) { |
3852 | 429k | cli_dbgmsg("cli_pdf: flags 0x%02x\n", pdf.flags); |
3853 | 429k | rc = run_pdf_hooks(&pdf, PDF_PHASE_END, -1); |
3854 | | |
3855 | 429k | if (CL_SUCCESS == rc && SCAN_HEURISTICS && (ctx->dconf->other & OTHER_CONF_PDFNAMEOBJ)) { |
3856 | 429k | if (pdf.flags & (1 << ESCAPED_COMMON_PDFNAME)) { |
3857 | | /* for example /Fl#61te#44#65#63#6f#64#65 instead of /FlateDecode */ |
3858 | 1.00k | rc = cli_append_potentially_unwanted(ctx, "Heuristics.PDF.ObfuscatedNameObject"); |
3859 | 1.00k | } |
3860 | 429k | } |
3861 | | #if 0 |
3862 | | /* TODO: find both trailers, and /Encrypt settings */ |
3863 | | if (pdf.flags & (1 << LINEARIZED_PDF)) |
3864 | | pdf.flags &= ~ (1 << BAD_ASCIIDECODE); |
3865 | | if (pdf.flags & (1 << MANY_FILTERS)) |
3866 | | pdf.flags &= ~ (1 << BAD_ASCIIDECODE); |
3867 | | if (CL_SUCCESS == rc && (pdf.flags & |
3868 | | ((1 << BAD_PDF_TOOMANYOBJS) | (1 << BAD_STREAM_FILTERS) | |
3869 | | (1<<BAD_FLATE) | (1<<BAD_ASCIIDECODE)| |
3870 | | (1<<UNTERMINATED_OBJ_DICT) | (1<<UNKNOWN_FILTER)))) { |
3871 | | rc = CL_EUNPACK; |
3872 | | } |
3873 | | #endif |
3874 | 429k | } |
3875 | | |
3876 | 441k | done: |
3877 | 441k | if (CL_SUCCESS == rc && pdf.stats.ninvalidobjs > 0) { |
3878 | 9.31k | rc = CL_EFORMAT; |
3879 | 9.31k | } |
3880 | | |
3881 | 441k | err: |
3882 | | |
3883 | 441k | pdf_export_json(&pdf); |
3884 | | |
3885 | 441k | if (pdf.objstms) { |
3886 | 90.7k | for (i = 0; i < pdf.nobjstms; i++) { |
3887 | 54.5k | if (pdf.objstms[i]) { |
3888 | 54.5k | if (pdf.objstms[i]->streambuf) { |
3889 | 54.5k | free(pdf.objstms[i]->streambuf); |
3890 | 54.5k | pdf.objstms[i]->streambuf = NULL; |
3891 | 54.5k | } |
3892 | 54.5k | free(pdf.objstms[i]); |
3893 | 54.5k | pdf.objstms[i] = NULL; |
3894 | 54.5k | } |
3895 | 54.5k | } |
3896 | 36.1k | free(pdf.objstms); |
3897 | 36.1k | pdf.objstms = NULL; |
3898 | 36.1k | } |
3899 | | |
3900 | 441k | if (NULL != pdf.objs) { |
3901 | 2.02M | for (i = 0; i < pdf.nobjs; i++) { |
3902 | 1.59M | if (NULL != pdf.objs[i]) { |
3903 | 1.59M | if (NULL != pdf.objs[i]->path) { |
3904 | 0 | free(pdf.objs[i]->path); |
3905 | 0 | pdf.objs[i]->path = NULL; |
3906 | 0 | } |
3907 | 1.59M | free(pdf.objs[i]); |
3908 | 1.59M | pdf.objs[i] = NULL; |
3909 | 1.59M | } |
3910 | 1.59M | } |
3911 | 433k | free(pdf.objs); |
3912 | 433k | pdf.objs = NULL; |
3913 | 433k | } |
3914 | 441k | if (pdf.fileID) { |
3915 | 39.5k | free(pdf.fileID); |
3916 | 39.5k | pdf.fileID = NULL; |
3917 | 39.5k | } |
3918 | 441k | if (pdf.key) { |
3919 | 8.49k | free(pdf.key); |
3920 | 8.49k | pdf.key = NULL; |
3921 | 8.49k | } |
3922 | | |
3923 | | /* PDF hooks may abort, don't return CL_BREAK to caller! */ |
3924 | 441k | rc = (rc == CL_BREAK) ? CL_CLEAN : rc; |
3925 | | |
3926 | 441k | cli_dbgmsg("cli_pdf: returning %d\n", rc); |
3927 | 441k | return rc; |
3928 | 441k | } |
3929 | | |
/**
 * @brief Skip the rest of the current line, and find the start of the next line.
 *
 * A line ends at the first CR or LF; any run of consecutive CR/LF bytes is
 * then skipped. Only CR and LF count as line terminators: a NUL byte inside
 * the buffer is ordinary data. (The previous strchr("\r\n", c) test also
 * matched c == '\0', because strchr() treats the terminating NUL as part of
 * the searched string.)
 *
 * @param ptr   Current offset into buffer.
 * @param len   Remaining bytes in buffer.
 *
 * @return const char*  Address of next line, or NULL if no next line in buffer
 *                      (including when the buffer ends on a line terminator).
 */
static const char *
pdf_nextlinestart(const char *ptr, size_t len)
{
    if (!ptr || (0 == len)) {
        /* Invalid args */
        return NULL;
    }

    /* Advance to the first line terminator. */
    while ((*ptr != '\r') && (*ptr != '\n')) {
        if (--len == 0)
            return NULL;

        ptr++;
    }

    /* Skip every consecutive terminator byte. */
    while ((*ptr == '\r') || (*ptr == '\n')) {
        if (--len == 0)
            return NULL;

        ptr++;
    }

    return ptr;
}
3962 | | |
/**
 * @brief Return the start of the next PDF object.
 *
 * This assumes that we're not in a stream.
 *
 * The scan first moves past the token it starts in, then past delimiters
 * (space, tab, VT, FF, '[', '<') and whole lines introduced by CR, LF or a
 * '%' comment. It stops at the first byte that follows such a delimiter, or
 * immediately at a '/' (name) or '(' (string) wherever one is met.
 *
 * @param ptr   Current offset into buffer.
 * @param len   Remaining bytes in buffer.
 *
 * @return const char*  Address of next object in the buffer, or NULL if there is none in the buffer.
 */
static const char *
pdf_nextobject(const char *ptr, size_t len)
{
    const char *cursor     = ptr;
    size_t remaining       = len;
    int left_current_token = 0;

    while (remaining > 0) {
        const char ch = *cursor;

        /* Start of a name object, or start of JS: always an object start. */
        if (('/' == ch) || ('(' == ch)) {
            return cursor;
        }

        /* Line terminator or comment: jump to the start of the next line. */
        if (('\n' == ch) || ('\r' == ch) || ('%' == ch)) {
            const char *next_line = pdf_nextlinestart(cursor, remaining);

            if (NULL == next_line) {
                return NULL;
            }

            remaining -= (size_t)(next_line - cursor);
            cursor             = next_line;
            left_current_token = 1;
            continue;
        }

        if ((' ' == ch) || ('\t' == ch) || ('\v' == ch) || ('\f' == ch) ||
            ('[' == ch) || /* Start of an array object */
            ('<' == ch)) { /* Start of a dictionary object */
            left_current_token = 1;
        } else if (left_current_token) {
            /* TODO: parse and return object type */
            return cursor;
        }

        cursor++;
        remaining--;
    }

    return NULL;
}
4021 | | |
4022 | | /* PDF statistics */ |
4023 | | static void ASCIIHexDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4024 | 34.6k | { |
4025 | 34.6k | UNUSEDPARAM(obj); |
4026 | 34.6k | UNUSEDPARAM(act); |
4027 | | |
4028 | 34.6k | if (NULL == pdf) |
4029 | 0 | return; |
4030 | | |
4031 | 34.6k | pdf->stats.nasciihexdecode++; |
4032 | 34.6k | } |
4033 | | |
4034 | | static void ASCII85Decode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4035 | 31.7k | { |
4036 | 31.7k | UNUSEDPARAM(obj); |
4037 | 31.7k | UNUSEDPARAM(act); |
4038 | | |
4039 | 31.7k | if (NULL == pdf) |
4040 | 0 | return; |
4041 | | |
4042 | 31.7k | pdf->stats.nascii85decode++; |
4043 | 31.7k | } |
4044 | | |
4045 | | static void EmbeddedFile_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4046 | 21.0k | { |
4047 | 21.0k | UNUSEDPARAM(obj); |
4048 | 21.0k | UNUSEDPARAM(act); |
4049 | | |
4050 | 21.0k | if (NULL == pdf) |
4051 | 0 | return; |
4052 | | |
4053 | 21.0k | pdf->stats.nembeddedfile++; |
4054 | 21.0k | } |
4055 | | |
4056 | | static void FlateDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4057 | 186k | { |
4058 | 186k | UNUSEDPARAM(obj); |
4059 | 186k | UNUSEDPARAM(act); |
4060 | | |
4061 | 186k | if (NULL == pdf) |
4062 | 0 | return; |
4063 | | |
4064 | 186k | pdf->stats.nflate++; |
4065 | 186k | } |
4066 | | |
4067 | | static void Image_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4068 | 8.98k | { |
4069 | 8.98k | UNUSEDPARAM(obj); |
4070 | 8.98k | UNUSEDPARAM(act); |
4071 | | |
4072 | 8.98k | if (NULL == pdf) |
4073 | 0 | return; |
4074 | | |
4075 | 8.98k | pdf->stats.nimage++; |
4076 | 8.98k | } |
4077 | | |
4078 | | static void LZWDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4079 | 26.6k | { |
4080 | 26.6k | UNUSEDPARAM(obj); |
4081 | 26.6k | UNUSEDPARAM(act); |
4082 | | |
4083 | 26.6k | if (NULL == pdf) |
4084 | 0 | return; |
4085 | | |
4086 | 26.6k | pdf->stats.nlzw++; |
4087 | 26.6k | } |
4088 | | |
4089 | | static void RunLengthDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4090 | 27.3k | { |
4091 | 27.3k | UNUSEDPARAM(obj); |
4092 | 27.3k | UNUSEDPARAM(act); |
4093 | | |
4094 | 27.3k | if (NULL == pdf) |
4095 | 0 | return; |
4096 | | |
4097 | 27.3k | pdf->stats.nrunlengthdecode++; |
4098 | 27.3k | } |
4099 | | |
4100 | | static void CCITTFaxDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4101 | 3.20k | { |
4102 | 3.20k | UNUSEDPARAM(obj); |
4103 | 3.20k | UNUSEDPARAM(act); |
4104 | | |
4105 | 3.20k | if (NULL == pdf) |
4106 | 0 | return; |
4107 | | |
4108 | 3.20k | pdf->stats.nfaxdecode++; |
4109 | 3.20k | } |
4110 | | |
4111 | | static void JBIG2Decode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4112 | 2.41k | { |
4113 | 2.41k | cli_ctx *ctx = NULL; |
4114 | 2.41k | struct json_object *pdfobj, *jbig2arr; |
4115 | | |
4116 | 2.41k | UNUSEDPARAM(obj); |
4117 | 2.41k | UNUSEDPARAM(act); |
4118 | | |
4119 | 2.41k | if (NULL == pdf) |
4120 | 0 | return; |
4121 | | |
4122 | 2.41k | ctx = pdf->ctx; |
4123 | | |
4124 | 2.41k | if (!(SCAN_COLLECT_METADATA)) |
4125 | 0 | return; |
4126 | | |
4127 | 2.41k | if (!(pdf->ctx->wrkproperty)) |
4128 | 0 | return; |
4129 | | |
4130 | 2.41k | pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"); |
4131 | 2.41k | if (!(pdfobj)) |
4132 | 0 | return; |
4133 | | |
4134 | 2.41k | jbig2arr = cli_jsonarray(pdfobj, "JBIG2Objects"); |
4135 | 2.41k | if (!(jbig2arr)) |
4136 | 0 | return; |
4137 | | |
4138 | 2.41k | cli_jsonint_array(jbig2arr, obj->id >> 8); |
4139 | | |
4140 | 2.41k | pdf->stats.njbig2decode++; |
4141 | 2.41k | } |
4142 | | |
4143 | | static void DCTDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4144 | 16.8k | { |
4145 | 16.8k | UNUSEDPARAM(obj); |
4146 | 16.8k | UNUSEDPARAM(act); |
4147 | | |
4148 | 16.8k | if (NULL == pdf) |
4149 | 0 | return; |
4150 | | |
4151 | 16.8k | pdf->stats.ndctdecode++; |
4152 | 16.8k | } |
4153 | | |
4154 | | static void JPXDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4155 | 4.46k | { |
4156 | 4.46k | UNUSEDPARAM(obj); |
4157 | 4.46k | UNUSEDPARAM(act); |
4158 | | |
4159 | 4.46k | if (NULL == pdf) |
4160 | 0 | return; |
4161 | | |
4162 | 4.46k | pdf->stats.njpxdecode++; |
4163 | 4.46k | } |
4164 | | |
4165 | | static void Crypt_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4166 | 15.6k | { |
4167 | 15.6k | UNUSEDPARAM(obj); |
4168 | 15.6k | UNUSEDPARAM(act); |
4169 | | |
4170 | 15.6k | if (NULL == pdf) |
4171 | 0 | return; |
4172 | | |
4173 | 15.6k | pdf->stats.ncrypt++; |
4174 | 15.6k | } |
4175 | | |
4176 | | static void Standard_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4177 | 47.2k | { |
4178 | 47.2k | UNUSEDPARAM(obj); |
4179 | 47.2k | UNUSEDPARAM(act); |
4180 | | |
4181 | 47.2k | if (NULL == pdf) |
4182 | 0 | return; |
4183 | | |
4184 | 47.2k | pdf->stats.nstandard++; |
4185 | 47.2k | } |
4186 | | |
4187 | | static void Sig_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4188 | 2.68k | { |
4189 | 2.68k | UNUSEDPARAM(obj); |
4190 | 2.68k | UNUSEDPARAM(act); |
4191 | | |
4192 | 2.68k | if (NULL == pdf) |
4193 | 0 | return; |
4194 | | |
4195 | 2.68k | pdf->stats.nsigned++; |
4196 | 2.68k | } |
4197 | | |
/*
 * Intentionally a no-op.
 *
 * Neither pdf->stats nor the JSON output is touched here; the actual
 * Javascript is looked for in the object when it is extracted later. This
 * prevents false positives when objects reference an indirect object which
 * doesn't actually have any content.
 */
static void JavaScript_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);
    UNUSEDPARAM(pdf);
}
4211 | | |
4212 | | static void OpenAction_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4213 | 304k | { |
4214 | 304k | UNUSEDPARAM(obj); |
4215 | 304k | UNUSEDPARAM(act); |
4216 | | |
4217 | 304k | if (NULL == pdf) |
4218 | 0 | return; |
4219 | | |
4220 | 304k | pdf->stats.nopenaction++; |
4221 | 304k | } |
4222 | | |
4223 | | static void Launch_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4224 | 13.6k | { |
4225 | 13.6k | UNUSEDPARAM(obj); |
4226 | 13.6k | UNUSEDPARAM(act); |
4227 | | |
4228 | 13.6k | if (NULL == pdf) |
4229 | 0 | return; |
4230 | | |
4231 | 13.6k | pdf->stats.nlaunch++; |
4232 | 13.6k | } |
4233 | | |
4234 | | static void Page_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4235 | 28.7k | { |
4236 | 28.7k | UNUSEDPARAM(obj); |
4237 | 28.7k | UNUSEDPARAM(act); |
4238 | | |
4239 | 28.7k | if (NULL == pdf) |
4240 | 0 | return; |
4241 | | |
4242 | 28.7k | pdf->stats.npage++; |
4243 | 28.7k | } |
4244 | | |
4245 | | static void Author_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4246 | 14.0k | { |
4247 | 14.0k | cli_ctx *ctx = NULL; |
4248 | | |
4249 | 14.0k | UNUSEDPARAM(act); |
4250 | | |
4251 | 14.0k | if (NULL == pdf) |
4252 | 0 | return; |
4253 | | |
4254 | 14.0k | ctx = pdf->ctx; |
4255 | | |
4256 | 14.0k | if (!(SCAN_COLLECT_METADATA)) |
4257 | 0 | return; |
4258 | | |
4259 | 14.0k | if (!(pdf->stats.author)) { |
4260 | 9.29k | const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
4261 | 9.29k | : (const char *)(obj->start + pdf->map); |
4262 | | |
4263 | 9.29k | pdf->stats.author = calloc(1, sizeof(struct pdf_stats_entry)); |
4264 | 9.29k | if (!(pdf->stats.author)) |
4265 | 0 | return; |
4266 | | |
4267 | 9.29k | pdf->parse_recursion_depth++; |
4268 | 9.29k | pdf->stats.author->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Author", NULL, &(pdf->stats.author->meta)); |
4269 | 9.29k | pdf->parse_recursion_depth--; |
4270 | 9.29k | } |
4271 | 14.0k | } |
4272 | | |
4273 | | static void Creator_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4274 | 10.2k | { |
4275 | 10.2k | cli_ctx *ctx = NULL; |
4276 | | |
4277 | 10.2k | UNUSEDPARAM(act); |
4278 | | |
4279 | 10.2k | if (NULL == pdf) |
4280 | 0 | return; |
4281 | | |
4282 | 10.2k | ctx = pdf->ctx; |
4283 | | |
4284 | 10.2k | if (!(SCAN_COLLECT_METADATA)) |
4285 | 0 | return; |
4286 | | |
4287 | 10.2k | if (!(pdf->stats.creator)) { |
4288 | 7.14k | const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
4289 | 7.14k | : (const char *)(obj->start + pdf->map); |
4290 | | |
4291 | 7.14k | pdf->stats.creator = calloc(1, sizeof(struct pdf_stats_entry)); |
4292 | 7.14k | if (!(pdf->stats.creator)) |
4293 | 0 | return; |
4294 | | |
4295 | 7.14k | pdf->parse_recursion_depth++; |
4296 | 7.14k | pdf->stats.creator->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Creator", NULL, &(pdf->stats.creator->meta)); |
4297 | 7.14k | pdf->parse_recursion_depth--; |
4298 | 7.14k | } |
4299 | 10.2k | } |
4300 | | |
4301 | | static void ModificationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4302 | 18.9k | { |
4303 | 18.9k | cli_ctx *ctx = NULL; |
4304 | | |
4305 | 18.9k | UNUSEDPARAM(act); |
4306 | | |
4307 | 18.9k | if (NULL == pdf) |
4308 | 0 | return; |
4309 | | |
4310 | 18.9k | ctx = pdf->ctx; |
4311 | | |
4312 | 18.9k | if (!(SCAN_COLLECT_METADATA)) |
4313 | 0 | return; |
4314 | | |
4315 | 18.9k | if (!(pdf->stats.modificationdate)) { |
4316 | 12.5k | const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
4317 | 12.5k | : (const char *)(obj->start + pdf->map); |
4318 | | |
4319 | 12.5k | pdf->stats.modificationdate = calloc(1, sizeof(struct pdf_stats_entry)); |
4320 | 12.5k | if (!(pdf->stats.modificationdate)) |
4321 | 0 | return; |
4322 | | |
4323 | 12.5k | pdf->parse_recursion_depth++; |
4324 | 12.5k | pdf->stats.modificationdate->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/ModDate", NULL, &(pdf->stats.modificationdate->meta)); |
4325 | 12.5k | pdf->parse_recursion_depth--; |
4326 | 12.5k | } |
4327 | 18.9k | } |
4328 | | |
4329 | | static void CreationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4330 | 50.0k | { |
4331 | 50.0k | cli_ctx *ctx = NULL; |
4332 | | |
4333 | 50.0k | UNUSEDPARAM(act); |
4334 | | |
4335 | 50.0k | if (NULL == pdf) |
4336 | 0 | return; |
4337 | | |
4338 | 50.0k | ctx = pdf->ctx; |
4339 | | |
4340 | 50.0k | if (!(SCAN_COLLECT_METADATA)) |
4341 | 0 | return; |
4342 | | |
4343 | 50.0k | if (!(pdf->stats.creationdate)) { |
4344 | 23.9k | const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
4345 | 23.9k | : (const char *)(obj->start + pdf->map); |
4346 | | |
4347 | 23.9k | pdf->stats.creationdate = calloc(1, sizeof(struct pdf_stats_entry)); |
4348 | 23.9k | if (!(pdf->stats.creationdate)) |
4349 | 0 | return; |
4350 | | |
4351 | 23.9k | pdf->parse_recursion_depth++; |
4352 | 23.9k | pdf->stats.creationdate->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/CreationDate", NULL, &(pdf->stats.creationdate->meta)); |
4353 | 23.9k | pdf->parse_recursion_depth--; |
4354 | 23.9k | } |
4355 | 50.0k | } |
4356 | | |
4357 | | static void Producer_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4358 | 18.9k | { |
4359 | 18.9k | cli_ctx *ctx = NULL; |
4360 | | |
4361 | 18.9k | UNUSEDPARAM(act); |
4362 | | |
4363 | 18.9k | if (NULL == pdf) |
4364 | 0 | return; |
4365 | | |
4366 | 18.9k | ctx = pdf->ctx; |
4367 | | |
4368 | 18.9k | if (!(SCAN_COLLECT_METADATA)) |
4369 | 0 | return; |
4370 | | |
4371 | 18.9k | if (!(pdf->stats.producer)) { |
4372 | 12.3k | const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
4373 | 12.3k | : (const char *)(obj->start + pdf->map); |
4374 | | |
4375 | 12.3k | pdf->stats.producer = calloc(1, sizeof(struct pdf_stats_entry)); |
4376 | 12.3k | if (!(pdf->stats.producer)) |
4377 | 0 | return; |
4378 | | |
4379 | 12.3k | pdf->parse_recursion_depth++; |
4380 | 12.3k | pdf->stats.producer->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Producer", NULL, &(pdf->stats.producer->meta)); |
4381 | 12.3k | pdf->parse_recursion_depth--; |
4382 | 12.3k | } |
4383 | 18.9k | } |
4384 | | |
4385 | | static void Title_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4386 | 16.4k | { |
4387 | 16.4k | cli_ctx *ctx = NULL; |
4388 | | |
4389 | 16.4k | UNUSEDPARAM(act); |
4390 | | |
4391 | 16.4k | if (NULL == pdf) |
4392 | 0 | return; |
4393 | | |
4394 | 16.4k | ctx = pdf->ctx; |
4395 | | |
4396 | 16.4k | if (!(SCAN_COLLECT_METADATA)) |
4397 | 0 | return; |
4398 | | |
4399 | 16.4k | if (!(pdf->stats.title)) { |
4400 | 9.43k | const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
4401 | 9.43k | : (const char *)(obj->start + pdf->map); |
4402 | | |
4403 | 9.43k | pdf->stats.title = calloc(1, sizeof(struct pdf_stats_entry)); |
4404 | 9.43k | if (!(pdf->stats.title)) |
4405 | 0 | return; |
4406 | | |
4407 | 9.43k | pdf->parse_recursion_depth++; |
4408 | 9.43k | pdf->stats.title->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Title", NULL, &(pdf->stats.title->meta)); |
4409 | 9.43k | pdf->parse_recursion_depth--; |
4410 | 9.43k | } |
4411 | 16.4k | } |
4412 | | |
4413 | | static void Keywords_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4414 | 7.99k | { |
4415 | 7.99k | cli_ctx *ctx = NULL; |
4416 | | |
4417 | 7.99k | UNUSEDPARAM(act); |
4418 | | |
4419 | 7.99k | if (NULL == pdf) |
4420 | 0 | return; |
4421 | | |
4422 | 7.99k | ctx = pdf->ctx; |
4423 | | |
4424 | 7.99k | if (!(SCAN_COLLECT_METADATA)) |
4425 | 0 | return; |
4426 | | |
4427 | 7.99k | if (!(pdf->stats.keywords)) { |
4428 | 5.68k | const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
4429 | 5.68k | : (const char *)(obj->start + pdf->map); |
4430 | | |
4431 | 5.68k | pdf->stats.keywords = calloc(1, sizeof(struct pdf_stats_entry)); |
4432 | 5.68k | if (!(pdf->stats.keywords)) |
4433 | 0 | return; |
4434 | | |
4435 | 5.68k | pdf->parse_recursion_depth++; |
4436 | 5.68k | pdf->stats.keywords->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Keywords", NULL, &(pdf->stats.keywords->meta)); |
4437 | 5.68k | pdf->parse_recursion_depth--; |
4438 | 5.68k | } |
4439 | 7.99k | } |
4440 | | |
4441 | | static void Subject_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4442 | 17.6k | { |
4443 | 17.6k | cli_ctx *ctx = NULL; |
4444 | | |
4445 | 17.6k | UNUSEDPARAM(act); |
4446 | | |
4447 | 17.6k | if (NULL == pdf) |
4448 | 0 | return; |
4449 | | |
4450 | 17.6k | ctx = pdf->ctx; |
4451 | | |
4452 | 17.6k | if (!(SCAN_COLLECT_METADATA)) |
4453 | 0 | return; |
4454 | | |
4455 | 17.6k | if (!(pdf->stats.subject)) { |
4456 | 11.1k | const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
4457 | 11.1k | : (const char *)(obj->start + pdf->map); |
4458 | | |
4459 | 11.1k | pdf->stats.subject = calloc(1, sizeof(struct pdf_stats_entry)); |
4460 | 11.1k | if (!(pdf->stats.subject)) |
4461 | 0 | return; |
4462 | | |
4463 | 11.1k | pdf->parse_recursion_depth++; |
4464 | 11.1k | pdf->stats.subject->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Subject", NULL, &(pdf->stats.subject->meta)); |
4465 | 11.1k | pdf->parse_recursion_depth--; |
4466 | 11.1k | } |
4467 | 17.6k | } |
4468 | | |
4469 | | static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4470 | 995 | { |
4471 | 995 | UNUSEDPARAM(obj); |
4472 | 995 | UNUSEDPARAM(act); |
4473 | | |
4474 | 995 | if (NULL == pdf) |
4475 | 0 | return; |
4476 | | |
4477 | 995 | pdf->stats.nrichmedia++; |
4478 | 995 | } |
4479 | | |
4480 | | static void AcroForm_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4481 | 10.4k | { |
4482 | 10.4k | UNUSEDPARAM(obj); |
4483 | 10.4k | UNUSEDPARAM(act); |
4484 | | |
4485 | 10.4k | if (NULL == pdf) |
4486 | 0 | return; |
4487 | | |
4488 | 10.4k | pdf->stats.nacroform++; |
4489 | 10.4k | } |
4490 | | |
4491 | | static void XFA_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4492 | 627 | { |
4493 | 627 | UNUSEDPARAM(obj); |
4494 | 627 | UNUSEDPARAM(act); |
4495 | | |
4496 | 627 | if (NULL == pdf) |
4497 | 0 | return; |
4498 | | |
4499 | 627 | pdf->stats.nxfa++; |
4500 | 627 | } |
4501 | | |
4502 | | static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4503 | 151k | { |
4504 | 151k | cli_ctx *ctx = NULL; |
4505 | 151k | struct pdf_array *array; |
4506 | 151k | const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
4507 | 151k | : (const char *)(obj->start + pdf->map); |
4508 | 151k | const char *begin; |
4509 | 151k | unsigned long npages = 0, count; |
4510 | 151k | long temp_long; |
4511 | 151k | struct pdf_array_node *node; |
4512 | 151k | json_object *pdfobj; |
4513 | 151k | size_t countsize = 0; |
4514 | | |
4515 | 151k | UNUSEDPARAM(act); |
4516 | | |
4517 | 151k | if (!(pdf) || !(pdf->ctx->wrkproperty)) |
4518 | 0 | return; |
4519 | | |
4520 | 151k | ctx = pdf->ctx; |
4521 | | |
4522 | 151k | if (!(SCAN_COLLECT_METADATA)) |
4523 | 0 | return; |
4524 | | |
4525 | 151k | pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"); |
4526 | 151k | if (!(pdfobj)) |
4527 | 0 | return; |
4528 | | |
4529 | 151k | begin = cli_memstr(objstart, obj->size, "/Kids", 5); |
4530 | 151k | if (!(begin)) |
4531 | 43.9k | return; |
4532 | | |
4533 | 107k | begin += 5; |
4534 | | |
4535 | 107k | pdf->parse_recursion_depth++; |
4536 | 107k | array = pdf_parse_array(pdf, obj, obj->size, (char *)begin, NULL); |
4537 | 107k | pdf->parse_recursion_depth--; |
4538 | | |
4539 | 107k | if (!(array)) { |
4540 | 46.4k | cli_jsonbool(pdfobj, "IncorrectPagesCount", 1); |
4541 | 46.4k | return; |
4542 | 46.4k | } |
4543 | | |
4544 | 494k | for (node = array->nodes; node != NULL; node = node->next) |
4545 | 434k | if (node->datasz) |
4546 | 414k | if (strchr((char *)(node->data), 'R')) |
4547 | 33.2k | npages++; |
4548 | | |
4549 | 60.5k | begin = cli_memstr(objstart, obj->size, "/Count", 6); |
4550 | 60.5k | if (!(begin)) { |
4551 | 42.2k | cli_jsonbool(pdfobj, "IncorrectPagesCount", 1); |
4552 | 42.2k | goto cleanup; |
4553 | 42.2k | } |
4554 | | |
4555 | 18.3k | begin += 6; |
4556 | 44.0k | while (((size_t)(begin - objstart) < obj->size) && isspace(begin[0])) |
4557 | 25.6k | begin++; |
4558 | | |
4559 | 18.3k | if ((size_t)(begin - objstart) >= obj->size) { |
4560 | 288 | goto cleanup; |
4561 | 288 | } |
4562 | | |
4563 | 18.0k | countsize = (obj->objstm) ? (size_t)(obj->start + obj->objstm->streambuf + obj->size - begin) |
4564 | 18.0k | : (size_t)(obj->start + pdf->map + obj->size - begin); |
4565 | | |
4566 | 18.0k | if (CL_SUCCESS != cli_strntol_wrap(begin, countsize, 0, 10, &temp_long)) { |
4567 | 5.07k | cli_jsonbool(pdfobj, "IncorrectPagesCount", 1); |
4568 | 12.9k | } else if (temp_long < 0) { |
4569 | 138 | cli_jsonbool(pdfobj, "IncorrectPagesCount", 1); |
4570 | 12.8k | } else { |
4571 | 12.8k | count = (unsigned long)temp_long; |
4572 | 12.8k | if (count != npages) { |
4573 | 8.88k | cli_jsonbool(pdfobj, "IncorrectPagesCount", 1); |
4574 | 8.88k | } |
4575 | 12.8k | } |
4576 | | |
4577 | 60.5k | cleanup: |
4578 | 60.5k | pdf_free_array(array); |
4579 | 60.5k | } |
4580 | | |
4581 | | static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act) |
4582 | 11.6k | { |
4583 | 11.6k | cli_ctx *ctx = NULL; |
4584 | 11.6k | json_object *colorsobj, *pdfobj; |
4585 | 11.6k | unsigned long ncolors; |
4586 | 11.6k | long temp_long; |
4587 | 11.6k | char *p1; |
4588 | 11.6k | const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf) |
4589 | 11.6k | : (const char *)(obj->start + pdf->map); |
4590 | | |
4591 | 11.6k | UNUSEDPARAM(act); |
4592 | | |
4593 | 11.6k | if (!(pdf) || !(pdf->ctx) || !(pdf->ctx->wrkproperty)) |
4594 | 0 | return; |
4595 | | |
4596 | 11.6k | ctx = pdf->ctx; |
4597 | | |
4598 | 11.6k | if (!(SCAN_COLLECT_METADATA)) |
4599 | 0 | return; |
4600 | | |
4601 | 11.6k | p1 = (char *)cli_memstr(objstart, obj->size, "/Colors", 7); |
4602 | 11.6k | if (!(p1)) |
4603 | 0 | return; |
4604 | | |
4605 | 11.6k | p1 += 7; |
4606 | | |
4607 | | /* Ensure that we have at least one whitespace character plus at least one number */ |
4608 | 11.6k | if (obj->size - (size_t)(p1 - objstart) < 2) |
4609 | 0 | return; |
4610 | | |
4611 | 15.1k | while (((size_t)(p1 - objstart) < obj->size) && isspace(p1[0])) |
4612 | 3.43k | p1++; |
4613 | | |
4614 | 11.6k | if ((size_t)(p1 - objstart) == obj->size) |
4615 | 0 | return; |
4616 | | |
4617 | 11.6k | if (CL_SUCCESS != cli_strntol_wrap(p1, (size_t)((p1 - objstart) - obj->size), 0, 10, &temp_long)) { |
4618 | 11.6k | return; |
4619 | 11.6k | } else if (temp_long < 0) { |
4620 | 0 | return; |
4621 | 0 | } |
4622 | 0 | ncolors = (unsigned long)temp_long; |
4623 | | |
4624 | | /* We only care if the number of colors > 2**24 */ |
4625 | 0 | if (ncolors < 1 << 24) |
4626 | 0 | return; |
4627 | | |
4628 | 0 | pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"); |
4629 | 0 | if (!(pdfobj)) |
4630 | 0 | return; |
4631 | | |
4632 | 0 | colorsobj = cli_jsonarray(pdfobj, "BigColors"); |
4633 | 0 | if (!(colorsobj)) |
4634 | 0 | return; |
4635 | | |
4636 | 0 | cli_jsonint_array(colorsobj, obj->id >> 8); |
4637 | 0 | } |
4638 | | |
4639 | | static void pdf_free_stats(struct pdf_struct *pdf) |
4640 | 441k | { |
4641 | | |
4642 | 441k | if (!pdf) { |
4643 | 0 | return; |
4644 | 0 | } |
4645 | | |
4646 | 441k | if ((pdf->stats.author)) { |
4647 | 9.29k | if (pdf->stats.author->data) |
4648 | 7.37k | free(pdf->stats.author->data); |
4649 | 9.29k | free(pdf->stats.author); |
4650 | 9.29k | pdf->stats.author = NULL; |
4651 | 9.29k | } |
4652 | | |
4653 | 441k | if (pdf->stats.creator) { |
4654 | 7.14k | if (pdf->stats.creator->data) |
4655 | 3.95k | free(pdf->stats.creator->data); |
4656 | 7.14k | free(pdf->stats.creator); |
4657 | 7.14k | pdf->stats.creator = NULL; |
4658 | 7.14k | } |
4659 | | |
4660 | 441k | if (pdf->stats.producer) { |
4661 | 12.3k | if (pdf->stats.producer->data) |
4662 | 8.99k | free(pdf->stats.producer->data); |
4663 | 12.3k | free(pdf->stats.producer); |
4664 | 12.3k | pdf->stats.producer = NULL; |
4665 | 12.3k | } |
4666 | | |
4667 | 441k | if (pdf->stats.modificationdate) { |
4668 | 12.5k | if (pdf->stats.modificationdate->data) |
4669 | 9.66k | free(pdf->stats.modificationdate->data); |
4670 | 12.5k | free(pdf->stats.modificationdate); |
4671 | 12.5k | pdf->stats.modificationdate = NULL; |
4672 | 12.5k | } |
4673 | | |
4674 | 441k | if (pdf->stats.creationdate) { |
4675 | 23.9k | if (pdf->stats.creationdate->data) |
4676 | 6.31k | free(pdf->stats.creationdate->data); |
4677 | 23.9k | free(pdf->stats.creationdate); |
4678 | 23.9k | pdf->stats.creationdate = NULL; |
4679 | 23.9k | } |
4680 | | |
4681 | 441k | if (pdf->stats.title) { |
4682 | 9.43k | if (pdf->stats.title->data) |
4683 | 6.31k | free(pdf->stats.title->data); |
4684 | 9.43k | free(pdf->stats.title); |
4685 | 9.43k | pdf->stats.title = NULL; |
4686 | 9.43k | } |
4687 | | |
4688 | 441k | if (pdf->stats.subject) { |
4689 | 11.1k | if (pdf->stats.subject->data) |
4690 | 8.11k | free(pdf->stats.subject->data); |
4691 | 11.1k | free(pdf->stats.subject); |
4692 | 11.1k | pdf->stats.subject = NULL; |
4693 | 11.1k | } |
4694 | | |
4695 | 441k | if (pdf->stats.keywords) { |
4696 | 5.68k | if (pdf->stats.keywords->data) |
4697 | 4.69k | free(pdf->stats.keywords->data); |
4698 | 5.68k | free(pdf->stats.keywords); |
4699 | 5.68k | pdf->stats.keywords = NULL; |
4700 | 5.68k | } |
4701 | 441k | } |
4702 | | |
4703 | | static void pdf_export_json(struct pdf_struct *pdf) |
4704 | 441k | { |
4705 | 441k | cli_ctx *ctx = NULL; |
4706 | 441k | json_object *pdfobj; |
4707 | 441k | unsigned long i; |
4708 | | |
4709 | 441k | if (NULL == pdf) |
4710 | 0 | return; |
4711 | | |
4712 | 441k | if (!(pdf->ctx)) { |
4713 | 0 | goto cleanup; |
4714 | 0 | } |
4715 | | |
4716 | 441k | ctx = pdf->ctx; |
4717 | | |
4718 | 441k | if (!(SCAN_COLLECT_METADATA) || !(pdf->ctx->wrkproperty)) { |
4719 | 0 | goto cleanup; |
4720 | 0 | } |
4721 | | |
4722 | 441k | pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"); |
4723 | 441k | if (!(pdfobj)) { |
4724 | 0 | goto cleanup; |
4725 | 0 | } |
4726 | | |
4727 | 441k | if (pdf->stats.author) { |
4728 | 9.29k | if (!pdf->stats.author->meta.success) { |
4729 | 4.25k | char *out = pdf_finalize_string(pdf, pdf->stats.author->meta.obj, pdf->stats.author->data, pdf->stats.author->meta.length); |
4730 | 4.25k | if (out) { |
4731 | 140 | free(pdf->stats.author->data); |
4732 | 140 | pdf->stats.author->data = out; |
4733 | 140 | pdf->stats.author->meta.length = strlen(out); |
4734 | 140 | pdf->stats.author->meta.success = 1; |
4735 | 140 | } |
4736 | 4.25k | } |
4737 | | |
4738 | 9.29k | if (pdf->stats.author->meta.success && cli_isutf8(pdf->stats.author->data, pdf->stats.author->meta.length)) { |
4739 | 5.18k | cli_jsonstr(pdfobj, "Author", pdf->stats.author->data); |
4740 | 5.18k | } else if (pdf->stats.author->data && pdf->stats.author->meta.length) { |
4741 | 922 | char *b64 = cl_base64_encode(pdf->stats.author->data, pdf->stats.author->meta.length); |
4742 | 922 | cli_jsonstr(pdfobj, "Author", b64); |
4743 | 922 | cli_jsonbool(pdfobj, "Author_base64", 1); |
4744 | 922 | free(b64); |
4745 | 3.19k | } else { |
4746 | 3.19k | cli_jsonstr(pdfobj, "Author", ""); |
4747 | 3.19k | } |
4748 | 9.29k | } |
4749 | 441k | if (pdf->stats.creator) { |
4750 | 7.14k | if (!pdf->stats.creator->meta.success) { |
4751 | 4.34k | char *out = pdf_finalize_string(pdf, pdf->stats.creator->meta.obj, pdf->stats.creator->data, pdf->stats.creator->meta.length); |
4752 | 4.34k | if (out) { |
4753 | 73 | free(pdf->stats.creator->data); |
4754 | 73 | pdf->stats.creator->data = out; |
4755 | 73 | pdf->stats.creator->meta.length = strlen(out); |
4756 | 73 | pdf->stats.creator->meta.success = 1; |
4757 | 73 | } |
4758 | 4.34k | } |
4759 | | |
4760 | 7.14k | if (pdf->stats.creator->meta.success && cli_isutf8(pdf->stats.creator->data, pdf->stats.creator->meta.length)) { |
4761 | 2.86k | cli_jsonstr(pdfobj, "Creator", pdf->stats.creator->data); |
4762 | 4.27k | } else if (pdf->stats.creator->data && pdf->stats.creator->meta.length) { |
4763 | 709 | char *b64 = cl_base64_encode(pdf->stats.creator->data, pdf->stats.creator->meta.length); |
4764 | 709 | cli_jsonstr(pdfobj, "Creator", b64); |
4765 | 709 | cli_jsonbool(pdfobj, "Creator_base64", 1); |
4766 | 709 | free(b64); |
4767 | 3.56k | } else { |
4768 | 3.56k | cli_jsonstr(pdfobj, "Creator", ""); |
4769 | 3.56k | } |
4770 | 7.14k | } |
4771 | 441k | if (pdf->stats.producer) { |
4772 | 12.3k | if (!pdf->stats.producer->meta.success) { |
4773 | 6.22k | char *out = pdf_finalize_string(pdf, pdf->stats.producer->meta.obj, pdf->stats.producer->data, pdf->stats.producer->meta.length); |
4774 | 6.22k | if (out) { |
4775 | 123 | free(pdf->stats.producer->data); |
4776 | 123 | pdf->stats.producer->data = out; |
4777 | 123 | pdf->stats.producer->meta.length = strlen(out); |
4778 | 123 | pdf->stats.producer->meta.success = 1; |
4779 | 123 | } |
4780 | 6.22k | } |
4781 | | |
4782 | 12.3k | if (pdf->stats.producer->meta.success && cli_isutf8(pdf->stats.producer->data, pdf->stats.producer->meta.length)) { |
4783 | 6.21k | cli_jsonstr(pdfobj, "Producer", pdf->stats.producer->data); |
4784 | 6.21k | } else if (pdf->stats.producer->data && pdf->stats.producer->meta.length) { |
4785 | 2.26k | char *b64 = cl_base64_encode(pdf->stats.producer->data, pdf->stats.producer->meta.length); |
4786 | 2.26k | cli_jsonstr(pdfobj, "Producer", b64); |
4787 | 2.26k | cli_jsonbool(pdfobj, "Producer_base64", 1); |
4788 | 2.26k | free(b64); |
4789 | 3.83k | } else { |
4790 | 3.83k | cli_jsonstr(pdfobj, "Producer", ""); |
4791 | 3.83k | } |
4792 | 12.3k | } |
4793 | 441k | if (pdf->stats.modificationdate) { |
4794 | 12.5k | if (!pdf->stats.modificationdate->meta.success) { |
4795 | 7.40k | char *out = pdf_finalize_string(pdf, pdf->stats.modificationdate->meta.obj, pdf->stats.modificationdate->data, pdf->stats.modificationdate->meta.length); |
4796 | 7.40k | if (out) { |
4797 | 59 | free(pdf->stats.modificationdate->data); |
4798 | 59 | pdf->stats.modificationdate->data = out; |
4799 | 59 | pdf->stats.modificationdate->meta.length = strlen(out); |
4800 | 59 | pdf->stats.modificationdate->meta.success = 1; |
4801 | 59 | } |
4802 | 7.40k | } |
4803 | | |
4804 | 12.5k | if (pdf->stats.modificationdate->meta.success && cli_isutf8(pdf->stats.modificationdate->data, pdf->stats.modificationdate->meta.length)) { |
4805 | 5.23k | cli_jsonstr(pdfobj, "ModificationDate", pdf->stats.modificationdate->data); |
4806 | 7.34k | } else if (pdf->stats.modificationdate->data && pdf->stats.modificationdate->meta.length) { |
4807 | 4.38k | char *b64 = cl_base64_encode(pdf->stats.modificationdate->data, pdf->stats.modificationdate->meta.length); |
4808 | 4.38k | cli_jsonstr(pdfobj, "ModificationDate", b64); |
4809 | 4.38k | cli_jsonbool(pdfobj, "ModificationDate_base64", 1); |
4810 | 4.38k | free(b64); |
4811 | 4.38k | } else { |
4812 | 2.96k | cli_jsonstr(pdfobj, "ModificationDate", ""); |
4813 | 2.96k | } |
4814 | 12.5k | } |
4815 | 441k | if (pdf->stats.creationdate) { |
4816 | 23.9k | if (!pdf->stats.creationdate->meta.success) { |
4817 | 19.6k | char *out = pdf_finalize_string(pdf, pdf->stats.creationdate->meta.obj, pdf->stats.creationdate->data, pdf->stats.creationdate->meta.length); |
4818 | 19.6k | if (out) { |
4819 | 138 | free(pdf->stats.creationdate->data); |
4820 | 138 | pdf->stats.creationdate->data = out; |
4821 | 138 | pdf->stats.creationdate->meta.length = strlen(out); |
4822 | 138 | pdf->stats.creationdate->meta.success = 1; |
4823 | 138 | } |
4824 | 19.6k | } |
4825 | | |
4826 | 23.9k | if (pdf->stats.creationdate->meta.success && cli_isutf8(pdf->stats.creationdate->data, pdf->stats.creationdate->meta.length)) { |
4827 | 4.41k | cli_jsonstr(pdfobj, "CreationDate", pdf->stats.creationdate->data); |
4828 | 19.5k | } else if (pdf->stats.creationdate->data && pdf->stats.creationdate->meta.length) { |
4829 | 1.89k | char *b64 = cl_base64_encode(pdf->stats.creationdate->data, pdf->stats.creationdate->meta.length); |
4830 | 1.89k | cli_jsonstr(pdfobj, "CreationDate", b64); |
4831 | 1.89k | cli_jsonbool(pdfobj, "CreationDate_base64", 1); |
4832 | 1.89k | free(b64); |
4833 | 17.6k | } else { |
4834 | 17.6k | cli_jsonstr(pdfobj, "CreationDate", ""); |
4835 | 17.6k | } |
4836 | 23.9k | } |
4837 | 441k | if (pdf->stats.title) { |
4838 | 9.43k | if (!pdf->stats.title->meta.success) { |
4839 | 6.09k | char *out = pdf_finalize_string(pdf, pdf->stats.title->meta.obj, pdf->stats.title->data, pdf->stats.title->meta.length); |
4840 | 6.09k | if (out) { |
4841 | 235 | free(pdf->stats.title->data); |
4842 | 235 | pdf->stats.title->data = out; |
4843 | 235 | pdf->stats.title->meta.length = strlen(out); |
4844 | 235 | pdf->stats.title->meta.success = 1; |
4845 | 235 | } |
4846 | 6.09k | } |
4847 | | |
4848 | 9.43k | if (pdf->stats.title->meta.success && cli_isutf8(pdf->stats.title->data, pdf->stats.title->meta.length)) { |
4849 | 3.57k | cli_jsonstr(pdfobj, "Title", pdf->stats.title->data); |
4850 | 5.86k | } else if (pdf->stats.title->data && pdf->stats.title->meta.length) { |
4851 | 2.73k | char *b64 = cl_base64_encode(pdf->stats.title->data, pdf->stats.title->meta.length); |
4852 | 2.73k | cli_jsonstr(pdfobj, "Title", b64); |
4853 | 2.73k | cli_jsonbool(pdfobj, "Title_base64", 1); |
4854 | 2.73k | free(b64); |
4855 | 3.12k | } else { |
4856 | 3.12k | cli_jsonstr(pdfobj, "Title", ""); |
4857 | 3.12k | } |
4858 | 9.43k | } |
4859 | 441k | if (pdf->stats.subject) { |
4860 | 11.1k | if (!pdf->stats.subject->meta.success) { |
4861 | 4.58k | char *out = pdf_finalize_string(pdf, pdf->stats.subject->meta.obj, pdf->stats.subject->data, pdf->stats.subject->meta.length); |
4862 | 4.58k | if (out) { |
4863 | 335 | free(pdf->stats.subject->data); |
4864 | 335 | pdf->stats.subject->data = out; |
4865 | 335 | pdf->stats.subject->meta.length = strlen(out); |
4866 | 335 | pdf->stats.subject->meta.success = 1; |
4867 | 335 | } |
4868 | 4.58k | } |
4869 | | |
4870 | 11.1k | if (pdf->stats.subject->meta.success && cli_isutf8(pdf->stats.subject->data, pdf->stats.subject->meta.length)) { |
4871 | 6.87k | cli_jsonstr(pdfobj, "Subject", pdf->stats.subject->data); |
4872 | 6.87k | } else if (pdf->stats.subject->data && pdf->stats.subject->meta.length) { |
4873 | 1.23k | char *b64 = cl_base64_encode(pdf->stats.subject->data, pdf->stats.subject->meta.length); |
4874 | 1.23k | cli_jsonstr(pdfobj, "Subject", b64); |
4875 | 1.23k | cli_jsonbool(pdfobj, "Subject_base64", 1); |
4876 | 1.23k | free(b64); |
4877 | 3.01k | } else { |
4878 | 3.01k | cli_jsonstr(pdfobj, "Subject", ""); |
4879 | 3.01k | } |
4880 | 11.1k | } |
4881 | 441k | if (pdf->stats.keywords) { |
4882 | 5.68k | if (!pdf->stats.keywords->meta.success) { |
4883 | 2.31k | char *out = pdf_finalize_string(pdf, pdf->stats.keywords->meta.obj, pdf->stats.keywords->data, pdf->stats.keywords->meta.length); |
4884 | 2.31k | if (out) { |
4885 | 14 | free(pdf->stats.keywords->data); |
4886 | 14 | pdf->stats.keywords->data = out; |
4887 | 14 | pdf->stats.keywords->meta.length = strlen(out); |
4888 | 14 | pdf->stats.keywords->meta.success = 1; |
4889 | 14 | } |
4890 | 2.31k | } |
4891 | | |
4892 | 5.68k | if (pdf->stats.keywords->meta.success && cli_isutf8(pdf->stats.keywords->data, pdf->stats.keywords->meta.length)) { |
4893 | 3.38k | cli_jsonstr(pdfobj, "Keywords", pdf->stats.keywords->data); |
4894 | 3.38k | } else if (pdf->stats.keywords->data && pdf->stats.keywords->meta.length) { |
4895 | 710 | char *b64 = cl_base64_encode(pdf->stats.keywords->data, pdf->stats.keywords->meta.length); |
4896 | 710 | cli_jsonstr(pdfobj, "Keywords", b64); |
4897 | 710 | cli_jsonbool(pdfobj, "Keywords_base64", 1); |
4898 | 710 | free(b64); |
4899 | 1.59k | } else { |
4900 | 1.59k | cli_jsonstr(pdfobj, "Keywords", ""); |
4901 | 1.59k | } |
4902 | 5.68k | } |
4903 | 441k | if (pdf->stats.ninvalidobjs) |
4904 | 9.35k | cli_jsonint(pdfobj, "InvalidObjectCount", pdf->stats.ninvalidobjs); |
4905 | 441k | if (pdf->stats.njs) |
4906 | 4.33k | cli_jsonint(pdfobj, "JavaScriptObjectCount", pdf->stats.njs); |
4907 | 441k | if (pdf->stats.nflate) |
4908 | 73.1k | cli_jsonint(pdfobj, "DeflateObjectCount", pdf->stats.nflate); |
4909 | 441k | if (pdf->stats.nactivex) |
4910 | 0 | cli_jsonint(pdfobj, "ActiveXObjectCount", pdf->stats.nactivex); |
4911 | 441k | if (pdf->stats.nflash) |
4912 | 0 | cli_jsonint(pdfobj, "FlashObjectCount", pdf->stats.nflash); |
4913 | 441k | if (pdf->stats.ncolors) |
4914 | 0 | cli_jsonint(pdfobj, "ColorCount", pdf->stats.ncolors); |
4915 | 441k | if (pdf->stats.nasciihexdecode) |
4916 | 9.96k | cli_jsonint(pdfobj, "AsciiHexDecodeObjectCount", pdf->stats.nasciihexdecode); |
4917 | 441k | if (pdf->stats.nascii85decode) |
4918 | 14.7k | cli_jsonint(pdfobj, "Ascii85DecodeObjectCount", pdf->stats.nascii85decode); |
4919 | 441k | if (pdf->stats.nembeddedfile) |
4920 | 13.9k | cli_jsonint(pdfobj, "EmbeddedFileCount", pdf->stats.nembeddedfile); |
4921 | 441k | if (pdf->stats.nimage) |
4922 | 4.34k | cli_jsonint(pdfobj, "ImageCount", pdf->stats.nimage); |
4923 | 441k | if (pdf->stats.nlzw) |
4924 | 12.1k | cli_jsonint(pdfobj, "LZWCount", pdf->stats.nlzw); |
4925 | 441k | if (pdf->stats.nrunlengthdecode) |
4926 | 12.8k | cli_jsonint(pdfobj, "RunLengthDecodeCount", pdf->stats.nrunlengthdecode); |
4927 | 441k | if (pdf->stats.nfaxdecode) |
4928 | 1.58k | cli_jsonint(pdfobj, "FaxDecodeCount", pdf->stats.nfaxdecode); |
4929 | 441k | if (pdf->stats.njbig2decode) |
4930 | 1.80k | cli_jsonint(pdfobj, "JBIG2DecodeCount", pdf->stats.njbig2decode); |
4931 | 441k | if (pdf->stats.ndctdecode) |
4932 | 8.44k | cli_jsonint(pdfobj, "DCTDecodeCount", pdf->stats.ndctdecode); |
4933 | 441k | if (pdf->stats.njpxdecode) |
4934 | 1.38k | cli_jsonint(pdfobj, "JPXDecodeCount", pdf->stats.njpxdecode); |
4935 | 441k | if (pdf->stats.ncrypt) |
4936 | 9.12k | cli_jsonint(pdfobj, "CryptCount", pdf->stats.ncrypt); |
4937 | 441k | if (pdf->stats.nstandard) |
4938 | 27.9k | cli_jsonint(pdfobj, "StandardCount", pdf->stats.nstandard); |
4939 | 441k | if (pdf->stats.nsigned) |
4940 | 1.92k | cli_jsonint(pdfobj, "SignedCount", pdf->stats.nsigned); |
4941 | 441k | if (pdf->stats.nopenaction) |
4942 | 29.1k | cli_jsonint(pdfobj, "OpenActionCount", pdf->stats.nopenaction); |
4943 | 441k | if (pdf->stats.nlaunch) |
4944 | 7.72k | cli_jsonint(pdfobj, "LaunchCount", pdf->stats.nlaunch); |
4945 | 441k | if (pdf->stats.npage) |
4946 | 15.2k | cli_jsonint(pdfobj, "PageCount", pdf->stats.npage); |
4947 | 441k | if (pdf->stats.nrichmedia) |
4948 | 984 | cli_jsonint(pdfobj, "RichMediaCount", pdf->stats.nrichmedia); |
4949 | 441k | if (pdf->stats.nacroform) |
4950 | 6.54k | cli_jsonint(pdfobj, "AcroFormCount", pdf->stats.nacroform); |
4951 | 441k | if (pdf->stats.nxfa) |
4952 | 619 | cli_jsonint(pdfobj, "XFACount", pdf->stats.nxfa); |
4953 | 441k | if (pdf->flags & (1 << BAD_PDF_VERSION)) |
4954 | 304k | cli_jsonbool(pdfobj, "BadVersion", 1); |
4955 | 441k | if (pdf->flags & (1 << BAD_PDF_HEADERPOS)) |
4956 | 414k | cli_jsonbool(pdfobj, "BadHeaderPosition", 1); |
4957 | 441k | if (pdf->flags & (1 << BAD_PDF_TRAILER)) |
4958 | 428k | cli_jsonbool(pdfobj, "BadTrailer", 1); |
4959 | 441k | if (pdf->flags & (1 << BAD_PDF_TOOMANYOBJS)) |
4960 | 0 | cli_jsonbool(pdfobj, "TooManyObjects", 1); |
4961 | 441k | if (pdf->flags & (1 << ENCRYPTED_PDF)) { |
4962 | 53.3k | cli_jsonbool(pdfobj, "Encrypted", 1); |
4963 | 53.3k | if (pdf->flags & (1 << DECRYPTABLE_PDF)) |
4964 | 3.91k | cli_jsonbool(pdfobj, "Decryptable", 1); |
4965 | 49.3k | else |
4966 | 49.3k | cli_jsonbool(pdfobj, "Decryptable", 0); |
4967 | 53.3k | } |
4968 | | |
4969 | 2.03M | for (i = 0; i < pdf->nobjs; i++) { |
4970 | 1.59M | if (pdf->objs[i]->flags & (1 << OBJ_TRUNCATED)) { |
4971 | 352k | json_object *truncobj; |
4972 | | |
4973 | 352k | truncobj = cli_jsonarray(pdfobj, "TruncatedObjects"); |
4974 | 352k | if (!(truncobj)) |
4975 | 0 | continue; |
4976 | | |
4977 | 352k | cli_jsonint_array(truncobj, pdf->objs[i]->id >> 8); |
4978 | 352k | } |
4979 | 1.59M | } |
4980 | | |
4981 | 441k | cleanup: |
4982 | 441k | pdf_free_stats(pdf); |
4983 | 441k | } |