Coverage Report

Created: 2024-05-20 06:31

/src/clamav/libclamav/pdf.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 *  Copyright (C) 2013-2024 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
3
 *  Copyright (C) 2007-2013 Sourcefire, Inc.
4
 *
5
 *  Authors: Nigel Horne, Török Edvin
6
 *
7
 *  Also based on Matt Olney's pdf parser in snort-nrt.
8
 *
9
 *  This program is free software; you can redistribute it and/or modify
10
 *  it under the terms of the GNU General Public License version 2 as
11
 *  published by the Free Software Foundation.
12
 *
13
 *  This program is distributed in the hope that it will be useful,
14
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
 *  GNU General Public License for more details.
17
 *
18
 *  You should have received a copy of the GNU General Public License
19
 *  along with this program; if not, write to the Free Software
20
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
21
 *  MA 02110-1301, USA.
22
 *
23
 * TODO: Embedded fonts
24
 * TODO: Predictor image handling
25
 */
26
27
#if HAVE_CONFIG_H
28
#include "clamav-config.h"
29
#endif
30
31
#include <stdio.h>
32
#include <sys/types.h>
33
#include <sys/stat.h>
34
#include <ctype.h>
35
#include <string.h>
36
#include <fcntl.h>
37
#include <stdlib.h>
38
#include <errno.h>
39
#ifdef HAVE_LIMITS_H
40
#include <limits.h>
41
#endif
42
#ifdef HAVE_UNISTD_H
43
#include <unistd.h>
44
#endif
45
#include <zlib.h>
46
47
#if HAVE_ICONV
48
#include <iconv.h>
49
#endif
50
51
#ifdef _WIN32
52
#include <stdint.h>
53
#endif
54
55
#include "clamav.h"
56
#include "others.h"
57
#include "pdf.h"
58
#include "pdfdecode.h"
59
#include "scanners.h"
60
#include "fmap.h"
61
#include "str.h"
62
#include "entconv.h"
63
#include "bytecode.h"
64
#include "bytecode_api.h"
65
#include "arc4.h"
66
#include "rijndael.h"
67
#include "textnorm.h"
68
#include "conv.h"
69
#include "json_api.h"
70
71
#ifdef CL_DEBUG
72
/*#define SAVE_TMP
73
 *Save the file being worked on in tmp */
74
#endif
75
76
2.33M
#define MAX_PDF_OBJECTS (64 * 1024)
77
78
struct pdf_struct;
79
80
static const char *pdf_nextlinestart(const char *ptr, size_t len);
81
static const char *pdf_nextobject(const char *ptr, size_t len);
82
83
/* PDF statistics callbacks and related */
84
struct pdfname_action;
85
86
static void pdf_export_json(struct pdf_struct *);
87
88
static void ASCIIHexDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
89
static void ASCII85Decode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
90
static void EmbeddedFile_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
91
static void FlateDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
92
static void Image_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
93
static void LZWDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
94
static void RunLengthDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
95
static void CCITTFaxDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
96
static void JBIG2Decode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
97
static void DCTDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
98
static void JPXDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
99
static void Crypt_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
100
static void Standard_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
101
static void Sig_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
102
static void JavaScript_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
103
static void OpenAction_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
104
static void Launch_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
105
static void Page_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
106
static void Author_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
107
static void Creator_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
108
static void Producer_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
109
static void CreationDate_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
110
static void ModificationDate_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
111
static void Title_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
112
static void Subject_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
113
static void Keywords_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
114
static void Pages_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
115
static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
116
static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
117
static void AcroForm_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
118
static void XFA_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
119
120
/* End PDF statistics callbacks and related */
121
122
static int pdf_readint(const char *q0, int len, const char *key);
123
static const char *pdf_getdict(const char *q0, int *len, const char *key);
124
static char *pdf_readval(const char *q, int len, const char *key);
125
static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *slen, const char **qend, bool noescape);
126
127
/**
 * @brief   Check whether a buffer looks like a cross-reference table or stream.
 *
 * Leading spaces, line feeds and carriage returns are skipped first. The check
 * then succeeds if the remaining data begins with "xref", or if "/XRef"
 * occurs anywhere in it. In both cases at least one more byte must follow the
 * keyword inside the buffer.
 *
 * @param xref  Start of the data to inspect.
 * @param eof   One past the last byte of the data.
 *
 * @return 0 if "xref" or "/XRef" was found, -1 otherwise.
 */
static int xrefCheck(const char *xref, const char *eof)
{
    const size_t table_kw_len  = strlen("xref");
    const size_t stream_kw_len = strlen("/XRef");
    const char *q;

    while (xref < eof && (*xref == ' ' || *xref == '\n' || *xref == '\r'))
        xref++;

    /* Compare the remaining length instead of forming `xref + 4`, which could
     * point more than one past the end of the buffer (undefined behavior). */
    if (xref >= eof || (size_t)(eof - xref) <= table_kw_len)
        return -1;

    if (!memcmp(xref, "xref", table_kw_len)) {
        cli_dbgmsg("cli_pdf: found xref\n");
        return 0;
    }

    /* could be xref stream */
    for (q = xref; (size_t)(eof - q) > stream_kw_len; q++) {
        if (!memcmp(q, "/XRef", stream_kw_len)) {
            cli_dbgmsg("cli_pdf: found /XRef\n");
            return 0;
        }
    }

    return -1;
}
152
153
/* define this to be noisy about things that we can't parse properly */
154
#undef NOISY
155
156
#ifdef NOISY
157
#define noisy_msg(pdf, ...) cli_infomsg(pdf->ctx, __VA_ARGS__)
158
#define noisy_warnmsg(...) cli_warnmsg(__VA_ARGS__)
159
#else
160
#define noisy_msg(pdf, ...)
161
#define noisy_warnmsg(...)
162
#endif
163
164
/**
 * @brief   Scan BACKwards for the nearest byte that is not whitespace.
 *
 * NUL, tab, line feed, form feed, carriage return and space all count as
 * whitespace.
 *
 * @param q         Position to start scanning from (the high end of the search space).
 * @param start     Lowest address of the search space; the scan never moves below it.
 *
 * @return const char*  Address of the first non-whitespace byte met while moving
 *                      backwards, or `start` if the scan reached it. If `q` is not
 *                      above `start` to begin with, `q` is returned unchanged.
 */
static const char *findNextNonWSBack(const char *q, const char *start)
{
    for (; q > start; q--) {
        switch (*q) {
            case '\0':
            case '\t':
            case '\n':
            case '\f':
            case '\r':
            case ' ':
                /* whitespace: keep walking backwards */
                continue;
            default:
                return q;
        }
    }

    return q;
}
179
180
/**
 * @brief   Scan FORwards for the nearest byte that is not whitespace.
 *
 * NUL, tab, line feed, form feed, carriage return and space all count as
 * whitespace.
 *
 * @param q         Position to start scanning from (the low end of the search space).
 * @param end       End of the search space; the byte at `end` is never examined.
 *
 * @return const char*  Address of the first non-whitespace byte at or after `q`,
 *                      or `end` if only whitespace was seen. If `q` is not below
 *                      `end` to begin with, `q` is returned unchanged.
 */
static const char *findNextNonWS(const char *q, const char *end)
{
    for (; q < end; q++) {
        switch (*q) {
            case '\0':
            case '\t':
            case '\n':
            case '\f':
            case '\r':
            case ' ':
                /* whitespace: keep walking forwards */
                continue;
            default:
                return q;
        }
    }

    return q;
}
195
196
/**
197
 * @brief   Find bounds of stream.
198
 *
199
 * PDF streams are prefixed with "stream" and suffixed with "endstream".
200
 * Return value indicates success or failure.
201
 *
202
 * @param start             start address of search space.
203
 * @param size              size of search space
204
 * @param[out] stream       output param, address of start of stream data
205
 * @param[out] stream_size  output param, size of stream data
206
 * @param newline_hack      hack to support newlines that are \r\n, and not just \n or just \r.
207
 *
208
 * @return cl_error_t       CL_SUCCESS if stream bounds were found.
209
 * @return cl_error_t       CL_BREAK if stream bounds could not be found.
210
 * @return cl_error_t       CL_EFORMAT if stream start was found, but not end. (truncated)
211
 * @return cl_error_t       CL_EARG if invalid args were provided.
212
 */
213
static cl_error_t find_stream_bounds(
214
    const char *start,
215
    size_t size,
216
    const char **stream,
217
    size_t *stream_size,
218
    int newline_hack)
219
1.35M
{
220
1.35M
    cl_error_t status = CL_BREAK;
221
222
1.35M
    const char *idx;
223
1.35M
    const char *stream_begin;
224
1.35M
    const char *endstream_begin;
225
1.35M
    size_t bytesleft = size;
226
227
1.35M
    if ((NULL == start) || (0 == bytesleft) || (NULL == stream) || (NULL == stream_size)) {
228
0
        status = CL_EARG;
229
0
        return status;
230
0
    }
231
232
1.35M
    *stream      = NULL;
233
1.35M
    *stream_size = 0;
234
235
    /* Begin by finding the "stream" string that prefixes stream data. */
236
1.35M
    if ((stream_begin = cli_memstr(start, bytesleft, "stream", strlen("stream")))) {
237
713k
        idx = stream_begin + strlen("stream");
238
713k
        if ((size_t)(idx - start) >= bytesleft)
239
1.21k
            goto done;
240
712k
        bytesleft -= idx - start;
241
242
        /* Skip any new line characters. */
243
712k
        if (bytesleft >= 2 && idx[0] == '\xd' && idx[1] == '\xa') {
244
296k
            idx += 2;
245
296k
            bytesleft -= 2;
246
296k
            if (newline_hack && (bytesleft > 2) && idx[0] == '\xa') {
247
592
                idx++;
248
592
                bytesleft--;
249
592
            }
250
415k
        } else if (bytesleft && idx[0] == '\xa') {
251
68.8k
            idx++;
252
68.8k
            bytesleft--;
253
68.8k
        }
254
255
        /* Pass back start of the stream data. */
256
712k
        *stream = idx;
257
258
        /* Now find the "endstream" string that suffixes stream data. */
259
712k
        endstream_begin = cli_memstr(idx, bytesleft, "endstream", strlen("endstream"));
260
712k
        if (!endstream_begin) {
261
            /* Couldn't find "endstream", but that's ok --
262
             * -- we'll just count the rest of the provided buffer. */
263
520k
            cli_dbgmsg("find_stream_bounds: Truncated stream found!\n");
264
520k
            endstream_begin = start + size;
265
520k
            status          = CL_EFORMAT;
266
520k
        }
267
268
        /* Pass back end of the stream data, as offset from start. */
269
712k
        *stream_size = endstream_begin - *stream;
270
271
712k
        if (CL_EFORMAT != status)
272
191k
            status = CL_SUCCESS;
273
712k
    }
274
275
1.35M
done:
276
277
1.35M
    return status;
278
1.35M
}
279
280
/**
281
 * @brief Find the next *indirect* object in an object stream, adds it to our list of
282
 *        objects, and increments nobj.
283
 *
284
 * Indirect objects in a stream DON'T begin with "obj" and end with "endobj".
285
 * Instead, they have an objid and an offset from the first object to point you
286
 * right at them.
287
 *
288
 * If found, objstm->current will be updated to the next objid.
289
 *
290
 * All objects in an object stream are indirect and thus do not begin or start
291
 * with "obj" or "endobj".  Instead, the object stream takes the following
292
 * format.
293
 *
294
 *      <dictionary describing stream> objstm content endobjstm
295
 *
296
 * where content looks something like the following:
297
 *
298
 *      15 0 16 3 17 46 (ab)<</IDS 8 0 R/JavaScript 27 0 R/URLS 9 0 R>><</Names[(Test)28 0 R]>>
299
 *
300
 * In the above example, the literal string (ab) is indirect object # 15, and
301
 * begins at offset 0 of the set of objects.  The next object, # 16 begis at
302
 * offset 3 is a dictionary.  The final object is also a dictionary, beginning
303
 * at offset 46.
304
 *
305
 * @param pdf   Pdf struct that keeps track of all information found in the PDF.
306
 * @param objstm
307
 *
308
 * @return CL_SUCCESS  if success
309
 * @return CL_EPARSE   if parsing error
310
 * @return CL_EMEM     if error allocating memory
311
 * @return CL_EARG     if invalid arguments
312
 */
313
int pdf_findobj_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm, struct pdf_obj **obj_found)
314
254k
{
315
254k
    cl_error_t status   = CL_EPARSE;
316
254k
    struct pdf_obj *obj = NULL;
317
254k
    unsigned long objid = 0, objoff = 0;
318
254k
    long temp_long         = 0;
319
254k
    const char *index      = NULL;
320
254k
    size_t bytes_remaining = 0;
321
322
254k
    if (NULL == pdf || NULL == objstm) {
323
0
        cli_warnmsg("pdf_findobj_in_objstm: invalid arguments\n");
324
0
        return CL_EARG;
325
0
    }
326
327
254k
    if (pdf->nobjs >= MAX_PDF_OBJECTS) {
328
0
        pdf->flags |= 1 << BAD_PDF_TOOMANYOBJS;
329
330
0
        cli_dbgmsg("pdf_findobj_in_objstm: reached object maximum\n");
331
0
        status = CL_BREAK;
332
0
        goto done;
333
0
    }
334
335
254k
    *obj_found = NULL;
336
337
254k
    index           = objstm->streambuf + objstm->current_pair;
338
254k
    bytes_remaining = objstm->streambuf_len - objstm->current_pair;
339
340
254k
    obj = calloc(sizeof(struct pdf_obj), 1);
341
254k
    if (!obj) {
342
0
        cli_warnmsg("pdf_findobj_in_objstm: out of memory finding objects in stream\n");
343
0
        status = CL_EMEM;
344
0
        goto done;
345
0
    }
346
347
    /* This object is in a stream, not in the regular map buffer. */
348
254k
    obj->objstm = objstm;
349
350
    /* objstm->current_pair points directly to the objid */
351
254k
    if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
352
        /* Failed to find objid */
353
4.23k
        cli_dbgmsg("pdf_findobj_in_objstm: Failed to find objid for obj in object stream\n");
354
4.23k
        status = CL_EPARSE;
355
4.23k
        goto done;
356
250k
    } else if (temp_long < 0) {
357
122
        cli_dbgmsg("pdf_findobj_in_objstm: Encountered invalid negative objid (%ld).\n", temp_long);
358
122
        status = CL_EPARSE;
359
122
        goto done;
360
122
    }
361
249k
    objid = (unsigned long)temp_long;
362
363
    /* Find the obj offset that appears just after the objid*/
364
726k
    while ((index < objstm->streambuf + objstm->streambuf_len) && isdigit(*index)) {
365
476k
        index++;
366
476k
        bytes_remaining--;
367
476k
    }
368
249k
    index           = findNextNonWS(index, objstm->streambuf + objstm->first);
369
249k
    bytes_remaining = objstm->streambuf + objstm->streambuf_len - index;
370
371
249k
    if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
372
        /* Failed to find obj offset */
373
495
        cli_dbgmsg("pdf_findobj_in_objstm: Failed to find obj offset for obj in object stream\n");
374
495
        status = CL_EPARSE;
375
495
        goto done;
376
249k
    } else if (temp_long < 0) {
377
646
        cli_dbgmsg("pdf_findobj_in_objstm: Encountered invalid negative obj offset (%ld).\n", temp_long);
378
646
        status = CL_EPARSE;
379
646
        goto done;
380
646
    }
381
248k
    objoff = (unsigned long)temp_long;
382
383
248k
    if ((size_t)objstm->first + (size_t)objoff > objstm->streambuf_len) {
384
        /* Alleged obj location is further than the length of the stream */
385
971
        cli_dbgmsg("pdf_findobj_in_objstm: obj offset found is greater than the length of the stream.\n");
386
971
        status = CL_EPARSE;
387
971
        goto done;
388
971
    }
389
390
247k
    objstm->current = objstm->first + objoff;
391
392
247k
    obj->id    = (objid << 8) | (0 & 0xff);
393
247k
    obj->start = objstm->current;
394
247k
    obj->flags = 0;
395
396
247k
    objstm->nobjs_found++;
397
398
1.08M
    while ((index < objstm->streambuf + objstm->streambuf_len) && isdigit(*index)) {
399
832k
        index++;
400
832k
        bytes_remaining--;
401
832k
    }
402
247k
    objstm->current_pair = (uint32_t)(findNextNonWS(index, objstm->streambuf + objstm->first) - objstm->streambuf);
403
404
    /* Update current_pair, if there are more */
405
247k
    if ((objstm->nobjs_found < objstm->n) &&
406
247k
        (index < objstm->streambuf + objstm->streambuf_len)) {
407
221k
        unsigned long next_objoff = 0;
408
409
        /*
410
         * While we're at it,
411
         *   lets record the size as running up to the next object offset.
412
         *
413
         * To do so, we will need to parse the next obj pair.
414
         */
415
        /* objstm->current_pair points directly to the objid */
416
221k
        index           = objstm->streambuf + objstm->current_pair;
417
221k
        bytes_remaining = objstm->streambuf + objstm->streambuf_len - index;
418
419
        /* We don't actually care about the object id at this point, so reading the object id is commented out.
420
           I didn't delete it entirely in case the object id is needed in the future. */
421
        // if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
422
        //     /* Failed to find objid for next obj */
423
        //     cli_dbgmsg("pdf_findobj_in_objstm: Failed to find next objid for obj in object stream though there should be {%u} more.\n", objstm->n - objstm->nobjs_found);
424
        //     status = CL_EPARSE;
425
        //     goto done;
426
        // } else if (temp_long < 0) {
427
        //     cli_dbgmsg("pdf_findobj_in_objstm: Encountered invalid negative objid (%ld).\n", temp_long);
428
        //     status = CL_EPARSE;
429
        //     goto done;
430
        // }
431
        // next_objid = (unsigned long)temp_long;
432
433
        /* Find the obj offset that appears just after the objid*/
434
669k
        while ((index < objstm->streambuf + objstm->streambuf_len) && isdigit(*index)) {
435
448k
            index++;
436
448k
            bytes_remaining--;
437
448k
        }
438
221k
        index           = findNextNonWS(index, objstm->streambuf + objstm->first);
439
221k
        bytes_remaining = objstm->streambuf + objstm->streambuf_len - index;
440
441
221k
        if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
442
            /* Failed to find obj offset for next obj */
443
4.10k
            cli_dbgmsg("pdf_findobj_in_objstm: Failed to find next obj offset for obj in object stream though there should be {%u} more.\n", objstm->n - objstm->nobjs_found);
444
4.10k
            status = CL_EPARSE;
445
4.10k
            goto done;
446
217k
        } else if (temp_long < 0) {
447
399
            cli_dbgmsg("pdf_findobj_in_objstm: Encountered invalid negative obj offset (%ld).\n", temp_long);
448
399
            status = CL_EPARSE;
449
399
            goto done;
450
399
        }
451
216k
        next_objoff = (unsigned long)temp_long;
452
453
216k
        if (next_objoff <= objoff) {
454
            /* Failed to find obj offset for next obj */
455
938
            cli_dbgmsg("pdf_findobj_in_objstm: Found next obj offset for obj in object stream but it's less than or equal to the current one!\n");
456
938
            status = CL_EPARSE;
457
938
            goto done;
458
215k
        } else if (objstm->first + next_objoff > objstm->streambuf_len) {
459
            /* Failed to find obj offset for next obj */
460
2.88k
            cli_dbgmsg("pdf_findobj_in_objstm: Found next obj offset for obj in object stream but it's further out than the size of the stream!\n");
461
2.88k
            status = CL_EPARSE;
462
2.88k
            goto done;
463
2.88k
        }
464
465
212k
        obj->size = next_objoff - objoff;
466
212k
    } else {
467
        /*
468
         * Should be no more objects. We should verify.
469
         *
470
         * Either way...
471
         *   obj->size should be the rest of the buffer.
472
         */
473
26.6k
        if (objstm->nobjs_found < objstm->n) {
474
225
            cli_warnmsg("pdf_findobj_in_objstm: Fewer objects found in object stream than expected!\n");
475
225
        }
476
477
26.6k
        obj->size = objstm->streambuf_len - obj->start;
478
26.6k
    }
479
480
    /* Success! Add the object to the list of all objects found. */
481
239k
    pdf->nobjs++;
482
239k
    CLI_MAX_REALLOC_OR_GOTO_DONE(pdf->objs, sizeof(struct pdf_obj *) * pdf->nobjs,
483
239k
                                 cli_warnmsg("pdf_findobj_in_objstm: out of memory finding objects in stream\n"),
484
239k
                                 status = CL_EMEM);
485
239k
    pdf->objs[pdf->nobjs - 1] = obj;
486
487
239k
    *obj_found = obj;
488
489
239k
    status = CL_SUCCESS;
490
491
254k
done:
492
254k
    if (CL_SUCCESS != status) {
493
14.7k
        if (NULL != obj) {
494
14.7k
            free(obj);
495
14.7k
        }
496
14.7k
    }
497
254k
    return status;
498
239k
}
499
500
/**
501
 * @brief Find the next *indirect* object.
502
 *
503
 * Indirect objects located outside of an object stream are prefaced with:
504
 *      <objid> <genid> obj
505
 *
506
 * Each of the above are separated by whitespace of some sort.
507
 *
508
 * Indirect objects are postfaced with:
509
 *      endobj
510
 *
511
 * The specification does not say if whitespace is required before or after "endobj".
512
 *
513
 * Identify truncated objects.
514
 *
515
 * If found, pdf->offset will be updated to just after the "endobj".
516
 * If truncated, pdf->offset will == pdf->size.
517
 * If not found, pdf->offset will not be updated.
518
 *
519
 * @param pdf   Pdf context struct that keeps track of all information found in the PDF.
520
 *
521
 * @return CL_SUCCESS  if success
522
 * @return CL_BREAK    if no more objects
523
 * @return CL_EPARSE   if parsing error
524
 * @return CL_EMEM     if error allocating memory
525
 */
526
cl_error_t pdf_findobj(struct pdf_struct *pdf)
527
2.08M
{
528
2.08M
    cl_error_t status = CL_EPARSE;
529
2.08M
    const char *start, *idx, *genid_search_index, *objid_search_index;
530
531
2.08M
    const char *obj_begin = NULL, *obj_end = NULL;
532
2.08M
    const char *endobj_begin = NULL, *endobj_end = NULL;
533
534
2.08M
    struct pdf_obj *obj = NULL;
535
2.08M
    size_t bytesleft;
536
2.08M
    unsigned long genid, objid;
537
2.08M
    long temp_long;
538
539
2.08M
    if (pdf->nobjs >= MAX_PDF_OBJECTS) {
540
0
        pdf->flags |= 1 << BAD_PDF_TOOMANYOBJS;
541
542
0
        cli_dbgmsg("pdf_findobj: reached object maximum\n");
543
0
        status = CL_BREAK;
544
0
        goto done;
545
0
    }
546
2.08M
    pdf->nobjs++;
547
2.08M
    CLI_MAX_REALLOC_OR_GOTO_DONE(pdf->objs, sizeof(struct pdf_obj *) * pdf->nobjs, status = CL_EMEM);
548
549
2.08M
    obj = malloc(sizeof(struct pdf_obj));
550
2.08M
    if (!obj) {
551
0
        status = CL_EMEM;
552
0
        goto done;
553
0
    }
554
2.08M
    pdf->objs[pdf->nobjs - 1] = obj;
555
556
2.08M
    memset(obj, 0, sizeof(*obj));
557
558
2.08M
    start     = pdf->map + pdf->offset;
559
2.08M
    bytesleft = pdf->size - pdf->offset;
560
561
    /*
562
     * Start by searching for "obj"
563
     */
564
2.08M
    idx = start + 1;
565
2.27M
    while (bytesleft > 1 + strlen("obj")) {
566
        /* `- 1` accounts for size of white space before obj */
567
1.91M
        idx = cli_memstr(idx, bytesleft - 1, "obj", strlen("obj"));
568
1.91M
        if (NULL == idx) {
569
70.7k
            status = CL_BREAK;
570
70.7k
            goto done; /* No more objs. */
571
70.7k
        }
572
573
        /* verify that the word has a whitespace before it, and is not the end of
574
         * a previous word */
575
1.84M
        idx--;
576
1.84M
        bytesleft = (pdf->size - pdf->offset) - (size_t)(idx - start);
577
578
1.84M
        if (*idx != 0 && *idx != 9 && *idx != 0xa && *idx != 0xc && *idx != 0xd && *idx != 0x20) {
579
            /* This instance of "obj" appears to be part of a longer string.
580
             * Skip it, and keep searching for an object. */
581
195k
            idx += 1 + strlen("obj");
582
195k
            bytesleft -= 1 + strlen("obj");
583
195k
            continue;
584
195k
        }
585
586
        /* Found the beginning of the word */
587
1.64M
        obj_begin = idx;
588
1.64M
        obj_end   = idx + 1 + strlen("obj");
589
590
1.64M
        break;
591
1.84M
    }
592
593
2.01M
    if ((NULL == obj_begin) || (NULL == obj_end)) {
594
362k
        status = CL_BREAK;
595
362k
        goto done; /* No more objs. */
596
362k
    }
597
598
    /* Find the generation id (genid) that appears before the "obj" */
599
1.64M
    genid_search_index = findNextNonWSBack(obj_begin - 1, start);
600
3.59M
    while (genid_search_index > start && isdigit(*genid_search_index))
601
1.94M
        genid_search_index--;
602
603
1.64M
    if (CL_SUCCESS != cli_strntol_wrap(genid_search_index, (size_t)((obj_begin)-genid_search_index), 0, 10, &temp_long)) {
604
109k
        cli_dbgmsg("pdf_findobj: Failed to parse object genid (# objects found: %u)\n", pdf->nobjs);
605
        /* Failed to parse, probably not a real object.  Skip past the "obj" thing, and continue. */
606
109k
        pdf->offset = obj_end - pdf->map;
607
109k
        status      = CL_EPARSE;
608
109k
        goto done;
609
1.53M
    } else if (temp_long < 0) {
610
26.4k
        cli_dbgmsg("pdf_findobj: Encountered invalid negative obj genid (%ld).\n", temp_long);
611
26.4k
        pdf->offset = obj_end - pdf->map;
612
26.4k
        status      = CL_EPARSE;
613
26.4k
        goto done;
614
26.4k
    }
615
1.51M
    genid = (unsigned long)temp_long;
616
617
    /* Find the object id (objid) that appears before the genid */
618
1.51M
    objid_search_index = findNextNonWSBack(genid_search_index - 1, start);
619
4.01M
    while (objid_search_index > start && isdigit(*objid_search_index))
620
2.50M
        objid_search_index--;
621
622
1.51M
    if (CL_SUCCESS != cli_strntol_wrap(objid_search_index, (size_t)((genid_search_index)-objid_search_index), 0, 10, &temp_long)) {
623
        /*
624
         * Edge case:
625
         *
626
         * PDFs with multiple revisions will have %%EOF before the end of the file,
627
         * followed by the next revision of the PDF, which will probably be an immediate objid.
628
         *
629
         * Example:
630
         *   %%EOF1 1 obj <blah> endobj
631
         *
632
         * If this is the case, we can detect it and continue parsing after the %%EOF.
633
         */
634
146k
        if (objid_search_index - strlen("%%EO") > start) {
635
113k
            const char *lastfile = objid_search_index - strlen("%%EO");
636
113k
            if (0 != strncmp(lastfile, "%%EOF", 5)) {
637
                /* Nope, wasn't %%EOF */
638
112k
                cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs);
639
                /* Skip past the "obj" thing, and continue. */
640
112k
                pdf->offset = obj_end - pdf->map;
641
112k
                status      = CL_EPARSE;
642
112k
                goto done;
643
112k
            }
644
            /* Yup, Looks, like the file continues after %%EOF.
645
             * Probably another revision.  Keep parsing... */
646
1.07k
            objid_search_index++;
647
1.07k
            cli_dbgmsg("pdf_findobj: %%%%EOF detected before end of file, at offset: %zu\n", (size_t)(objid_search_index - pdf->map));
648
33.1k
        } else {
649
            /* Failed parsing at the very beginning */
650
33.1k
            cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs);
651
            /* Probably not a real object.  Skip past the "obj" thing, and continue. */
652
33.1k
            pdf->offset = obj_end - pdf->map;
653
33.1k
            status      = CL_EPARSE;
654
33.1k
            goto done;
655
33.1k
        }
656
        /* Try again, with offset slightly adjusted */
657
1.07k
        if (CL_SUCCESS != cli_strntol_wrap(objid_search_index, (size_t)((genid_search_index - 1) - objid_search_index), 0, 10, &temp_long)) {
658
834
            cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs);
659
            /* Still failed... Probably not a real object.  Skip past the "obj" thing, and continue. */
660
834
            pdf->offset = obj_end - pdf->map;
661
834
            status      = CL_EPARSE;
662
834
            goto done;
663
834
        } else if (temp_long < 0) {
664
0
            cli_dbgmsg("pdf_findobj: Encountered invalid negative objid (%ld).\n", temp_long);
665
0
            pdf->offset = obj_end - pdf->map;
666
0
            status      = CL_EPARSE;
667
0
            goto done;
668
0
        }
669
670
236
        cli_dbgmsg("pdf_findobj: There appears to be an additional revision. Continuing to parse...\n");
671
1.36M
    } else if (temp_long < 0) {
672
15.6k
        cli_dbgmsg("pdf_findobj: Encountered invalid negative objid (%ld).\n", temp_long);
673
15.6k
        pdf->offset = obj_end - pdf->map;
674
15.6k
        status      = CL_EPARSE;
675
15.6k
        goto done;
676
15.6k
    }
677
1.35M
    objid = (unsigned long)temp_long;
678
679
1.35M
    obj->id    = (objid << 8) | (genid & 0xff);
680
1.35M
    obj->start = obj_end - pdf->map; /* obj start begins just after the "obj" string */
681
1.35M
    obj->flags = 0;
682
683
    /*
684
     * We now have the objid, genid, and object start.
685
     * Find the object end ("endobj").
686
     */
687
    /* `- 1` accounts for size of white space before obj */
688
1.35M
    endobj_begin = cli_memstr(obj_end, pdf->map + pdf->size - obj_end, "endobj", strlen("endobj"));
689
1.35M
    if (NULL == endobj_begin) {
690
        /* No end to object.
691
         * PDF appears to be malformed or truncated.
692
         * Will record the object size as going ot the end of the file.
693
         * Will record that the object is truncated.
694
         * Will position the pdf offset to the end of the PDF.
695
         * The next iteration of this function will find no more objects. */
696
352k
        obj->flags |= 1 << OBJ_TRUNCATED;
697
352k
        obj->size   = (pdf->map + pdf->size) - obj_end;
698
352k
        pdf->offset = pdf->size;
699
700
        /* Truncated "object" found! */
701
352k
        status = CL_SUCCESS;
702
352k
        goto done;
703
352k
    }
704
998k
    endobj_end = endobj_begin + strlen("endobj");
705
706
    /* Size of the object goes from "obj" <-> "endobject". */
707
998k
    obj->size   = endobj_begin - obj_end;
708
998k
    pdf->offset = endobj_end - pdf->map;
709
710
    /*
711
     * Object found!
712
     */
713
998k
    status = CL_SUCCESS; /* truncated file, no end to obj. */
714
715
2.08M
done:
716
2.08M
    if (status == CL_SUCCESS) {
717
1.35M
        cli_dbgmsg("pdf_findobj: found %d %d obj @%lld, size: %zu bytes.\n", obj->id >> 8, obj->id & 0xff, (long long)(obj->start + pdf->startoff), obj->size);
718
1.35M
    } else {
719
        /* Remove the unused obj reference from our list of objects found */
720
        /* No need to realloc pdf->objs back down.  It won't leak. */
721
732k
        pdf->objs[pdf->nobjs - 1] = NULL;
722
732k
        pdf->nobjs--;
723
724
        /* Free up the obj struct. */
725
732k
        if (NULL != obj)
726
732k
            free(obj);
727
728
732k
        if (status == CL_BREAK) {
729
433k
            cli_dbgmsg("pdf_findobj: No more objects (# objects found: %u)\n", pdf->nobjs);
730
433k
        } else if (status == CL_EMEM) {
731
0
            cli_warnmsg("pdf_findobj: Error allocating memory (# objects found: %u)\n", pdf->nobjs);
732
298k
        } else {
733
298k
            cli_dbgmsg("pdf_findobj: Unexpected status code %d.\n", status);
734
298k
        }
735
732k
    }
736
737
2.08M
    return status;
738
998k
}
739
740
static size_t filter_writen(struct pdf_struct *pdf, struct pdf_obj *obj, int fout, const char *buf, size_t len, size_t *sum)
741
330k
{
742
330k
    UNUSEDPARAM(obj);
743
744
330k
    if (cli_checklimits("pdf", pdf->ctx, (uint64_t)*sum, 0, 0))
745
7.16k
        return len;
746
747
323k
    *sum += len;
748
749
323k
    return cli_writen(fout, buf, len);
750
330k
}
751
752
void pdfobj_flag(struct pdf_struct *pdf, struct pdf_obj *obj, enum pdf_flag flag)
753
1.19M
{
754
1.19M
    const char *s = "";
755
1.19M
    pdf->flags |= 1 << flag;
756
1.19M
    if (!cli_debug_flag)
757
1.19M
        return;
758
759
0
    switch (flag) {
760
0
        case UNTERMINATED_OBJ_DICT:
761
0
            s = "dictionary not terminated";
762
0
            break;
763
0
        case ESCAPED_COMMON_PDFNAME:
764
            /* like /JavaScript */
765
0
            s = "escaped common pdfname";
766
0
            break;
767
0
        case BAD_STREAM_FILTERS:
768
0
            s = "duplicate stream filters";
769
0
            break;
770
0
        case BAD_PDF_VERSION:
771
0
            s = "bad pdf version";
772
0
            break;
773
0
        case BAD_PDF_HEADERPOS:
774
0
            s = "bad pdf header position";
775
0
            break;
776
0
        case BAD_PDF_TRAILER:
777
0
            s = "bad pdf trailer";
778
0
            break;
779
0
        case BAD_PDF_TOOMANYOBJS:
780
0
            s = "too many pdf objs";
781
0
            break;
782
0
        case BAD_FLATE:
783
0
            s = "bad deflate stream";
784
0
            break;
785
0
        case BAD_FLATESTART:
786
0
            s = "bad deflate stream start";
787
0
            break;
788
0
        case BAD_STREAMSTART:
789
0
            s = "bad stream start";
790
0
            break;
791
0
        case UNKNOWN_FILTER:
792
0
            s = "unknown filter used";
793
0
            break;
794
0
        case BAD_ASCIIDECODE:
795
0
            s = "bad ASCII decode";
796
0
            break;
797
0
        case HEX_JAVASCRIPT:
798
0
            s = "hex javascript";
799
0
            break;
800
0
        case BAD_INDOBJ:
801
0
            s = "referencing nonexistent obj";
802
0
            break;
803
0
        case HAS_OPENACTION:
804
0
            s = "has /OpenAction";
805
0
            break;
806
0
        case HAS_LAUNCHACTION:
807
0
            s = "has /LaunchAction";
808
0
            break;
809
0
        case BAD_STREAMLEN:
810
0
            s = "bad /Length, too small";
811
0
            break;
812
0
        case ENCRYPTED_PDF:
813
0
            s = "PDF is encrypted";
814
0
            break;
815
0
        case LINEARIZED_PDF:
816
0
            s = "linearized PDF";
817
0
            break;
818
0
        case MANY_FILTERS:
819
0
            s = "more than 2 filters per obj";
820
0
            break;
821
0
        case DECRYPTABLE_PDF:
822
0
            s = "decryptable PDF";
823
0
            break;
824
0
    }
825
826
0
    cli_dbgmsg("pdfobj_flag: %s flagged in object %u %u\n", s, obj->id >> 8, obj->id & 0xff);
827
0
}
828
829
struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t objid)
830
339k
{
831
339k
    uint32_t j;
832
339k
    uint32_t i;
833
834
    /* search starting at previous obj (if exists) */
835
67.4M
    for (i = 0; i < pdf->nobjs; i++) {
836
67.4M
        if (pdf->objs[i] == obj)
837
338k
            break;
838
67.4M
    }
839
840
67.3M
    for (j = i; j < pdf->nobjs; j++) {
841
67.0M
        obj = pdf->objs[j];
842
67.0M
        if (obj->id == objid)
843
43.4k
            return obj;
844
67.0M
    }
845
846
    /* restart search from beginning if not found */
847
66.1M
    for (j = 0; j < i; j++) {
848
65.8M
        obj = pdf->objs[j];
849
65.8M
        if (obj->id == objid)
850
4.80k
            return obj;
851
65.8M
    }
852
853
290k
    return NULL;
854
295k
}
855
856
/**
857
 * @brief   Find and interpret the "/Length" dictionary key value.
858
 *
859
 * The value may be:
860
 *  - a direct object (i.e. just a number)
861
 *  - an indirect object, where the value is somewhere else in the document and we have to look it up.
862
 *    Indirect objects are referenced using an object id (objid), a generation id (genid), and the letter 'R'.
863
 *
864
 * Example dictionary with a single key "/Length" that relies on a direct object for the value.
865
 *
866
 *      1 0 obj
867
 *          << /Length 534
868
 *              /Filter [ /ASCII85Decode /LZWDecode ]
869
 *          >>
870
 *          stream
871
 *              J..)6T`?p&<!J9%_[umg"B7/Z7KNXbN'S+,*Q/&"OLT'FLIDK#!n`$"<Atdi`\Vn%b%)&'cA*VnK\CJY(sF>c!Jnl@
872
 *              RM]WM;jjH6Gnc75idkL5]+cPZKEBPWdR>FF(kj1_R%W_d&/jS!;iuad7h?[L-F$+]]0A3Ck*$I0KZ?;<)CJtqi65Xb
873
 *              Vc3\n5ua:Q/=0$W<#N3U;H,MQKqfg1?:lUpR;6oN[C2E4ZNr8Udn.'p+?#X+1>0Kuk$bCDF/(3fL5]Oq)^kJZ!C2H1
874
 *              'TO]Rl?Q:&'<5&iP!$Rq;BXRecDN[IJB`,)o8XJOSJ9sDS]hQ;Rj@!ND)bD_q&C\g:inYC%)&u#:u,M6Bm%IY!Kb1+
875
 *              ":aAa'S`ViJglLb8<W9k6Yl\\0McJQkDeLWdPN?9A'jX*al>iG1p&i;eVoK&juJHs9%;Xomop"5KatWRT"JQ#qYuL,
876
 *              JD?M$0QP)lKn06l1apKDC@\qJ4B!!(5m+j.7F790m(Vj88l8Q:_CZ(Gm1%X\N1&u!FKHMB~>
877
 *          endstream
878
 *      endobj
879
 *
880
 * Example dictionary with a single key "/Length" that relies on an indirect object for the value.
881
 *
882
 *      7 0 obj
883
 *          << /Length 8 0 R >> % An indirect reference to object 8, with generation id 0.
884
 *          stream
885
 *              BT
886
 *                  /F1 12 Tf
887
 *                   72 712 Td
888
 *                  ( A stream with an indirect length ) Tj
889
 *              ET
890
 *          endstream
891
 *      endobj
892
 *
893
 *      8 0 obj
894
 *          77 % The length of the preceding stream
895
 *      endobj
896
 *
897
 * @param pdf       Pdf context structure.
898
 * @param obj       Pdf object context structure.
899
 * @param dict_start    Pointer to the start of the dictionary string.
900
 * @param dict_len      Remaining length of the dictionary string in bytes.
901
 * @return size_t   Unsigned integer value of the "/Length" key
902
 */
903
static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const char *dict_start, size_t dict_len)
904
674k
{
905
674k
    size_t length          = 0;
906
674k
    const char *obj_start  = dict_start;
907
674k
    size_t bytes_remaining = dict_len;
908
674k
    long temp_long         = 0;
909
674k
    const char *index;
910
911
674k
    if (bytes_remaining < 8) {
912
25.3k
        return 0;
913
25.3k
    }
914
915
    /*
916
     * Find the "/Length" dictionary key
917
     */
918
648k
    index = cli_memstr(obj_start, bytes_remaining, "/Length", 7);
919
648k
    if (!index)
920
326k
        return 0;
921
922
322k
    bytes_remaining -= index - obj_start;
923
924
322k
    if (bytes_remaining < 1) {
925
0
        return 0;
926
0
    }
927
928
    /* Step the index into the "/Length" string. */
929
322k
    index++;
930
322k
    bytes_remaining--;
931
932
    /* Find the start of the next direct or indirect object.
933
     * pdf_nextobject() assumes we started searching from within a previous object */
934
322k
    obj_start = pdf_nextobject(index, bytes_remaining);
935
322k
    if (!obj_start)
936
730
        return 0;
937
938
321k
    if (bytes_remaining < (size_t)(obj_start - index)) {
939
0
        return 0;
940
0
    }
941
321k
    bytes_remaining -= obj_start - index;
942
321k
    index = obj_start;
943
944
    /* Read the value.  This could either be the direct length value,
945
       or the object id of the indirect object that has the length */
946
321k
    if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
947
27.0k
        cli_dbgmsg("find_length: failed to parse object length or objid\n");
948
27.0k
        return 0;
949
294k
    } else if (temp_long < 0) {
950
5.59k
        cli_dbgmsg("find_length: Encountered invalid negative object length or objid (%ld).\n", temp_long);
951
5.59k
        return 0;
952
5.59k
    }
953
289k
    length = (size_t)temp_long; /* length or maybe object id */
954
955
    /*
956
     * Keep parsing, skipping past the first integer that might have been what we wanted.
957
     * If it's an indirect object, we'll find a Generation ID followed by the letter 'R'
958
     * I.e. something like " 0 R"
959
     */
960
1.17M
    while ((bytes_remaining > 0) && isdigit(*index)) {
961
880k
        index++;
962
880k
        bytes_remaining--;
963
880k
    }
964
965
289k
    if ((bytes_remaining > 0) && (*index == ' ')) {
966
33.3k
        unsigned long genid;
967
968
33.3k
        index++;
969
33.3k
        bytes_remaining--;
970
971
33.3k
        if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
972
4.23k
            cli_dbgmsg("find_length: failed to parse object genid\n");
973
4.23k
            return 0;
974
29.0k
        } else if (temp_long < 0) {
975
3.80k
            cli_dbgmsg("find_length: Encountered invalid negative object genid (%ld).\n", temp_long);
976
3.80k
            return 0;
977
3.80k
        }
978
25.2k
        genid = (unsigned long)temp_long;
979
980
125k
        while ((bytes_remaining > 0) && isdigit(*index)) {
981
100k
            index++;
982
100k
            bytes_remaining--;
983
100k
        }
984
985
25.2k
        if (bytes_remaining < 2) {
986
0
            return 0;
987
0
        }
988
989
25.2k
        if (index[0] == ' ' && index[1] == 'R') {
990
            /*
991
             * Ok so we found a genid and that 'R'.  Which means that first value
992
             * was actually the objid.
993
             * We can look up the indirect object using this information.
994
             */
995
16.6k
            unsigned long objid            = length;
996
16.6k
            const char *indirect_obj_start = NULL;
997
998
16.6k
            cli_dbgmsg("find_length: length is in indirect object %lu %lu\n", objid, genid);
999
1000
16.6k
            obj = find_obj(pdf, obj, (length << 8) | (genid & 0xff));
1001
16.6k
            if (!obj) {
1002
7.52k
                cli_dbgmsg("find_length: indirect object not found\n");
1003
7.52k
                return 0;
1004
7.52k
            }
1005
1006
9.15k
            indirect_obj_start = pdf->map + obj->start;
1007
9.15k
            bytes_remaining    = pdf->size - obj->start;
1008
1009
            /* Ok so we found the indirect object, lets read the value. */
1010
9.15k
            index = pdf_nextobject(indirect_obj_start, bytes_remaining);
1011
9.15k
            if (!index) {
1012
171
                cli_dbgmsg("find_length: next object not found\n");
1013
171
                return 0;
1014
171
            }
1015
1016
8.98k
            if (bytes_remaining < (size_t)(index - indirect_obj_start)) {
1017
0
                return 0;
1018
0
            }
1019
8.98k
            bytes_remaining -= index - indirect_obj_start;
1020
1021
            /* Found the value, so lets parse it as a long, but prohibit negative lengths. */
1022
8.98k
            if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
1023
4.40k
                cli_dbgmsg("find_length: failed to parse object length from indirect object\n");
1024
4.40k
                return 0;
1025
4.58k
            } else if (temp_long < 0) {
1026
399
                cli_dbgmsg("find_length: Encountered invalid negative obj length (%ld).\n", temp_long);
1027
399
                return 0;
1028
399
            }
1029
4.18k
            length = (size_t)temp_long;
1030
4.18k
        }
1031
25.2k
    }
1032
1033
    /* limit length */
1034
268k
    if ((size_t)(obj_start - pdf->map) + length + 5 > pdf->size)
1035
40.3k
        length = pdf->size - (obj_start - pdf->map) - 5;
1036
1037
268k
    return length;
1038
289k
}
1039
1040
1.14M
/* Object flag bits that are tested together as one mask; one bit per line for readability. */
#define DUMP_MASK ((1 << OBJ_CONTENTS) |      \
                   (1 << OBJ_FILTER_FLATE) |  \
                   (1 << OBJ_FILTER_DCT) |    \
                   (1 << OBJ_FILTER_AH) |     \
                   (1 << OBJ_FILTER_A85) |    \
                   (1 << OBJ_EMBEDDED_FILE) | \
                   (1 << OBJ_JAVASCRIPT) |    \
                   (1 << OBJ_OPENACTION) |    \
                   (1 << OBJ_LAUNCHACTION))
1041
1042
static int run_pdf_hooks(struct pdf_struct *pdf, enum pdf_phase phase, int fd)
1043
2.25M
{
1044
2.25M
    int ret;
1045
2.25M
    struct cli_bc_ctx *bc_ctx;
1046
2.25M
    cli_ctx *ctx = NULL;
1047
2.25M
    fmap_t *map;
1048
1049
2.25M
    if (NULL == pdf)
1050
0
        return CL_EARG;
1051
1052
2.25M
    ctx = pdf->ctx;
1053
1054
2.25M
    bc_ctx = cli_bytecode_context_alloc();
1055
2.25M
    if (!bc_ctx) {
1056
0
        cli_errmsg("run_pdf_hooks: can't allocate memory for bc_ctx\n");
1057
0
        return CL_EMEM;
1058
0
    }
1059
1060
2.25M
    map = ctx->fmap;
1061
2.25M
    if (fd != -1) {
1062
959k
        map = fmap(fd, 0, 0, NULL);
1063
959k
        if (!map) {
1064
0
            cli_dbgmsg("run_pdf_hooks: can't mmap pdf extracted obj\n");
1065
0
            map = ctx->fmap;
1066
0
            fd  = -1;
1067
0
        }
1068
959k
    }
1069
1070
2.25M
    cli_bytecode_context_setpdf(bc_ctx, phase, pdf->nobjs, pdf->objs, &pdf->flags, pdf->size, pdf->startoff);
1071
2.25M
    cli_bytecode_context_setctx(bc_ctx, ctx);
1072
2.25M
    ret = cli_bytecode_runhook(ctx, ctx->engine, bc_ctx, BC_PDF, map);
1073
2.25M
    cli_bytecode_context_destroy(bc_ctx);
1074
1075
2.25M
    if (fd != -1)
1076
959k
        funmap(map);
1077
1078
2.25M
    return ret;
1079
2.25M
}
1080
1081
/* Forward declaration of a file-local helper so it can be used before its
 * definition.  NOTE(review): presumably prints `len` bytes of `hex` prefixed
 * by `msg` when debugging - confirm against the definition. */
static void dbg_printhex(const char *msg, const char *hex, unsigned len);
1082
1083
/**
 * @brief   Decrypt a buffer with AES in CBC mode.
 *
 * Despite the name, key lengths of 16, 24 and 32 bytes are accepted.
 *
 * When `has_iv` is non-zero the first 16 bytes of `in` are used as the IV,
 * otherwise an all-zero IV is used.  Only whole 16-byte blocks are decrypted.
 *
 * On return *length is reduced by the bytes that are not plaintext:
 *  - the IV (when has_iv is set),
 *  - any trailing bytes that did not fill a whole block,
 *  - the padding, when has_iv is set and the padding bytes checked by the
 *    loop below are consistent.
 * If the padding looks wrong, the padding bytes are left in place and
 * counted in *length.
 *
 * If the arguments are rejected (NULL pointer, unsupported key length,
 * fewer than 32 bytes of input, key setup failure), the function returns
 * without touching `q` or *length.
 *
 * @param in            Encrypted input.
 * @param[in,out] length  In: number of bytes in `in`. Out: number of plaintext bytes in `q`.
 * @param[out] q        Output buffer; written one 16-byte block at a time.
 * @param key           AES key.
 * @param key_n         Key length in bytes (16, 24 or 32).
 * @param has_iv        Non-zero if `in` starts with a 16-byte IV.
 */
static void aes_256cbc_decrypt(const unsigned char *in, size_t *length, unsigned char *q, char *key, unsigned key_n, int has_iv)
{
    uint32_t rk[RKLENGTH(256)];
    unsigned char iv[16];
    size_t len = 0;
    unsigned char pad;
    unsigned i;
    int nrounds;

    /* q and key are dereferenced below just like in and length, so guard them too. */
    if (in == NULL || length == NULL || q == NULL || key == NULL) {
        cli_dbgmsg("aes_256cbc_decrypt: invalid NULL parameters!\n");
        noisy_warnmsg("aes_256cbc_decrypt: invalid NULL parameters!\n");
        return;
    }

    len = *length;

    cli_dbgmsg("aes_256cbc_decrypt: key length: %u, data length: %zu\n", key_n, *length);
    if (!(key_n == 16 || key_n == 24 || key_n == 32)) {
        cli_dbgmsg("aes_256cbc_decrypt: invalid key length: %u!\n", key_n * 8);
        noisy_warnmsg("aes_256cbc_decrypt: invalid key length: %u!\n", key_n * 8);
        return;
    }

    /* Requiring 32 bytes guarantees at least one full block remains after the IV is removed. */
    if (len < 32) {
        cli_dbgmsg("aes_256cbc_decrypt: len is <32: %zu\n", len);
        noisy_warnmsg("aes_256cbc_decrypt: len is <32: %zu\n", len);
        return;
    }

    if (has_iv) {
        memcpy(iv, in, 16);
        in += 16;
        len -= 16;
    } else {
        memset(iv, 0, sizeof(iv));
    }

    cli_dbgmsg("aes_256cbc_decrypt: Calling rijndaelSetupDecrypt\n");
    nrounds = rijndaelSetupDecrypt(rk, (const unsigned char *)key, key_n * 8);
    if (!nrounds) {
        cli_dbgmsg("aes_256cbc_decrypt: nrounds = 0\n");
        return;
    }
    cli_dbgmsg("aes_256cbc_decrypt: Beginning rijndaelDecrypt\n");

    /* CBC: plaintext block = decrypt(ciphertext block) XOR previous ciphertext block (or IV). */
    while (len >= 16) {
        rijndaelDecrypt(rk, nrounds, in, q);
        for (i = 0; i < 16; i++)
            q[i] ^= iv[i];

        memcpy(iv, in, 16);

        q += 16;
        in += 16;
        len -= 16;
    }

    /* From here on len counts the bytes to subtract from *length. */
    if (has_iv) {
        len += 16; /* the IV itself */
        pad = q[-1];

        if (pad > 0x10) {
            cli_dbgmsg("aes_256cbc_decrypt: bad pad: %x (extra len: %zu)\n", pad, len - 16);
            noisy_warnmsg("aes_256cbc_decrypt: bad pad: %x (extra len: %zu)\n", pad, len - 16);
            *length -= len;
            return;
        }

        /* Step back to the first padding byte and verify the rest of the padding. */
        q -= pad;
        for (i = 1; i < pad; i++) {
            if (q[i] != pad) {
                cli_dbgmsg("aes_256cbc_decrypt: bad pad: %x != %x\n", q[i], pad);
                noisy_warnmsg("aes_256cbc_decrypt: bad pad: %x != %x\n", q[i], pad);
                *length -= len;

                return;
            }
        }

        len += pad;
    }

    *length -= len;

    cli_dbgmsg("aes_256cbc_decrypt: length is %zu\n", *length);
}
1170
1171
/**
 * @brief   Encrypt a buffer with AES in CBC mode using a key of at most 16 bytes.
 *
 * Only whole 16-byte blocks of `in` are processed; trailing bytes are ignored.
 * Each input block is folded into an internal chaining buffer before the
 * matching output block is written, so `out` may be the same buffer as `in`.
 *
 * *out_length is always set when out_length is not NULL: it is 0 if the
 * arguments are rejected or key setup fails, otherwise the number of bytes
 * written to `out`.
 *
 * @param in              Plaintext input.
 * @param in_length       Number of bytes in `in`; must be at least 16.
 * @param[out] out        Output buffer, at least in_length rounded down to a multiple of 16 bytes.
 * @param[out] out_length Number of bytes written to `out`.
 * @param key             AES key.
 * @param key_n           Key length in bytes; must not exceed 16.
 * @param iv              16-byte IV, or NULL for an all-zero IV.
 */
static void aes_128cbc_encrypt(const unsigned char *in, size_t in_length, unsigned char *out, size_t *out_length, const unsigned char *key, size_t key_n, const unsigned char *iv)
{
    uint32_t rk[RKLENGTH(128)];
    unsigned char real_iv[16] = {0};
    int nrounds;
    size_t i;

    if (NULL == out_length) {
        cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: invalid NULL parameters!\n");
        return;
    }

    /* Make sure the caller never reads an indeterminate length on the early-return paths. */
    *out_length = 0;

    if (NULL == in || NULL == out || NULL == key) {
        cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: invalid NULL parameters!\n");
        return;
    }

    cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: key length: %zu, data length: %zu\n", key_n, in_length);
    if (key_n > 16) {
        cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: key length is %zu!\n", key_n * 8);
        return;
    }

    if (in_length < 16) {
        cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: in_length is <16: %zu\n", in_length);
        noisy_warnmsg("cli_pdf: aes_128cbc_encrypt: in_length is <16: %zu\n", in_length);
        return;
    }

    cli_dbgmsg("aes_128cbc_encrypt: Calling rijndaelSetupEncrypt\n");
    nrounds = rijndaelSetupEncrypt(rk, key, key_n * 8);
    if (!nrounds) {
        cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: nrounds = 0\n");
        return;
    }
    cli_dbgmsg("aes_128cbc_encrypt: Beginning rijndaelEncrypt\n");

    if (iv)
        memcpy(real_iv, iv, sizeof(real_iv));

    /* CBC: real_iv holds the previous ciphertext block (initially the IV). */
    while (in_length >= 16) {
        for (i = 0; i < 16; i++)
            real_iv[i] ^= in[i];

        rijndaelEncrypt(rk, nrounds, real_iv, real_iv);

        for (i = 0; i < 16; i++)
            out[i] = real_iv[i];

        out += 16;
        *out_length += 16;
        in += 16;
        in_length -= 16;
    }

    cli_dbgmsg("cli_pdf: aes_128cbc_encrypt: length is %zu\n", *out_length);
}
1219
1220
char *decrypt_any(struct pdf_struct *pdf, uint32_t id, const char *in, size_t *length, enum enc_method enc_method)
1221
50.1k
{
1222
50.1k
    unsigned char *key, *q, result[16];
1223
50.1k
    unsigned n;
1224
50.1k
    struct arc4_state arc4;
1225
1226
50.1k
    if (!length || !*length || !in) {
1227
77
        noisy_warnmsg("decrypt_any: decrypt failed for obj %u %u:  Invalid arguments.\n", id >> 8, id & 0xff);
1228
77
        return NULL;
1229
77
    }
1230
1231
50.0k
    if (NULL == pdf->key || 0 == pdf->keylen) {
1232
7.01k
        noisy_warnmsg("decrypt_any: decrypt failed for obj %u %u:  PDF key never identified.\n", id >> 8, id & 0xff);
1233
7.01k
        return NULL;
1234
7.01k
    }
1235
1236
43.0k
    n = pdf->keylen + 5;
1237
43.0k
    if (enc_method == ENC_AESV2)
1238
1.61k
        n += 4;
1239
1240
43.0k
    key = cli_max_malloc(n);
1241
43.0k
    if (!key) {
1242
0
        noisy_warnmsg("decrypt_any: malloc failed\n");
1243
0
        return NULL;
1244
0
    }
1245
1246
43.0k
    memcpy(key, pdf->key, pdf->keylen);
1247
43.0k
    q    = key + pdf->keylen;
1248
43.0k
    *q++ = id >> 8;
1249
43.0k
    *q++ = id >> 16;
1250
43.0k
    *q++ = id >> 24;
1251
43.0k
    *q++ = id;
1252
43.0k
    *q++ = 0;
1253
43.0k
    if (enc_method == ENC_AESV2)
1254
1.61k
        memcpy(q, "sAlT", 4);
1255
1256
43.0k
    cl_hash_data("md5", key, n, result, NULL);
1257
43.0k
    free(key);
1258
1259
43.0k
    n = pdf->keylen + 5;
1260
43.0k
    if (n > 16)
1261
41.8k
        n = 16;
1262
1263
43.0k
    q = cli_max_calloc(*length, sizeof(char));
1264
43.0k
    if (!q) {
1265
0
        noisy_warnmsg("decrypt_any: malloc failed\n");
1266
0
        return NULL;
1267
0
    }
1268
1269
43.0k
    switch (enc_method) {
1270
1.08k
        case ENC_V2:
1271
1.08k
            cli_dbgmsg("cli_pdf: enc is v2\n");
1272
1.08k
            memcpy(q, in, *length);
1273
1.08k
            if (false == arc4_init(&arc4, result, n)) {
1274
0
                noisy_warnmsg("decrypt_any: failed to init arc4\n");
1275
0
                free(q);
1276
0
                return NULL;
1277
0
            }
1278
1.08k
            arc4_apply(&arc4, q, (unsigned)*length); /* TODO: may truncate for very large lengths */
1279
1280
1.08k
            noisy_msg(pdf, "decrypt_any: decrypted ARC4 data\n");
1281
1282
1.08k
            break;
1283
1.61k
        case ENC_AESV2:
1284
1.61k
            cli_dbgmsg("cli_pdf: enc is aesv2\n");
1285
1.61k
            aes_256cbc_decrypt((const unsigned char *)in, length, q, (char *)result, n, 1);
1286
1287
1.61k
            noisy_msg(pdf, "decrypt_any: decrypted AES(v2) data\n");
1288
1289
1.61k
            break;
1290
26.4k
        case ENC_AESV3:
1291
26.4k
            cli_dbgmsg("decrypt_any: enc is aesv3\n");
1292
1293
26.4k
            aes_256cbc_decrypt((const unsigned char *)in, length, q, pdf->key, pdf->keylen, 1);
1294
1295
26.4k
            noisy_msg(pdf, "decrypted AES(v3) data\n");
1296
1297
26.4k
            break;
1298
3.22k
        case ENC_IDENTITY:
1299
3.22k
            cli_dbgmsg("decrypt_any: enc is identity\n");
1300
3.22k
            memcpy(q, in, *length);
1301
1302
3.22k
            noisy_msg(pdf, "decrypt_any: identity encryption\n");
1303
1304
3.22k
            break;
1305
135
        case ENC_NONE:
1306
135
            cli_dbgmsg("decrypt_any: enc is none\n");
1307
1308
135
            noisy_msg(pdf, "encryption is none\n");
1309
1310
135
            free(q);
1311
135
            return NULL;
1312
10.5k
        case ENC_UNKNOWN:
1313
10.5k
            cli_dbgmsg("decrypt_any: enc is unknown\n");
1314
10.5k
            free(q);
1315
1316
10.5k
            noisy_warnmsg("decrypt_any: unknown encryption method for obj %u %u\n",
1317
10.5k
                          id >> 8, id & 0xff);
1318
1319
10.5k
            return NULL;
1320
43.0k
    }
1321
1322
32.3k
    return (char *)q;
1323
43.0k
}
1324
1325
enum enc_method get_enc_method(struct pdf_struct *pdf, struct pdf_obj *obj)
1326
41.9k
{
1327
41.9k
    if (obj->flags & (1 << OBJ_EMBEDDED_FILE))
1328
337
        return pdf->enc_method_embeddedfile;
1329
1330
41.5k
    if (obj->flags & (1 << OBJ_STREAM))
1331
39.6k
        return pdf->enc_method_stream;
1332
1333
1.97k
    return pdf->enc_method_string;
1334
41.5k
}
1335
1336
/* Parser states for the text extraction done by process(). */
enum cstate {
    CSTATE_NONE       = 0, /* outside of a '[' ... array */
    CSTATE_TJ         = 1, /* seen '[', waiting for a '(' */
    CSTATE_TJ_PAROPEN = 2  /* inside '(' ... ')', bytes are normalized */
};
1341
1342
static void process(struct text_norm_state *s, enum cstate *st, const char *buf, size_t length, int fout)
1343
546k
{
1344
1.62G
    do {
1345
1.62G
        switch (*st) {
1346
1.56M
            case CSTATE_NONE:
1347
1.56M
                if (*buf == '[') {
1348
22.7k
                    *st = CSTATE_TJ;
1349
1.54M
                } else {
1350
1.54M
                    const char *nl = memchr(buf, '\n', length);
1351
1.54M
                    if (!nl)
1352
317k
                        return;
1353
1354
1.22M
                    if ((size_t)(nl - buf) > length) {
1355
0
                        length = 0;
1356
1.22M
                    } else {
1357
1.22M
                        length -= nl - buf;
1358
1.22M
                    }
1359
1.22M
                    buf = nl;
1360
1.22M
                }
1361
1362
1.25M
                break;
1363
587M
            case CSTATE_TJ:
1364
587M
                if (*buf == '(')
1365
200k
                    *st = CSTATE_TJ_PAROPEN;
1366
1367
587M
                break;
1368
1.03G
            case CSTATE_TJ_PAROPEN:
1369
1.03G
                if (*buf == ')') {
1370
184k
                    *st = CSTATE_TJ;
1371
1.03G
                } else {
1372
1.03G
                    if (text_normalize_buffer(s, (const unsigned char *)buf, 1) != 1) {
1373
638
                        cli_writen(fout, s->out, s->out_pos);
1374
638
                        text_normalize_reset(s);
1375
638
                    }
1376
1.03G
                }
1377
1378
1.03G
                break;
1379
1.62G
        }
1380
1381
1.62G
        buf++;
1382
1.62G
        if (length > 0)
1383
1.62G
            length--;
1384
1.62G
    } while (length > 0);
1385
546k
}
1386
1387
static int pdf_scan_contents(int fd, struct pdf_struct *pdf, struct pdf_obj *obj)
1388
69.5k
{
1389
69.5k
    struct text_norm_state s;
1390
69.5k
    char fullname[1024];
1391
69.5k
    char outbuff[BUFSIZ];
1392
69.5k
    char inbuf[BUFSIZ];
1393
69.5k
    int fout;
1394
69.5k
    size_t n;
1395
69.5k
    cl_error_t rc;
1396
69.5k
    enum cstate st = CSTATE_NONE;
1397
1398
69.5k
    snprintf(fullname, sizeof(fullname), "%s" PATHSEP "pdf obj %d %d contents", pdf->dir, obj->id >> 8, obj->id & 0xff);
1399
69.5k
    fout = open(fullname, O_RDWR | O_CREAT | O_EXCL | O_TRUNC | O_BINARY, 0600);
1400
69.5k
    if (fout < 0) {
1401
0
        char err[128];
1402
1403
0
        cli_errmsg("pdf_scan_contents: can't create temporary file %s: %s\n", fullname, cli_strerror(errno, err, sizeof(err)));
1404
0
        return CL_ETMPFILE;
1405
0
    }
1406
1407
69.5k
    text_normalize_init(&s, (unsigned char *)outbuff, sizeof(outbuff));
1408
616k
    while (1) {
1409
616k
        n = cli_readn(fd, inbuf, sizeof(inbuf));
1410
616k
        if ((n == 0) || (n == (size_t)-1))
1411
69.5k
            break;
1412
1413
546k
        process(&s, &st, inbuf, n, fout);
1414
546k
    }
1415
1416
69.5k
    cli_writen(fout, s.out, s.out_pos);
1417
1418
69.5k
    lseek(fout, 0, SEEK_SET);
1419
69.5k
    rc = cli_magic_scan_desc(fout, fullname, pdf->ctx, NULL, LAYER_ATTRIBUTES_NONE);
1420
69.5k
    close(fout);
1421
1422
69.5k
    if (!pdf->ctx->engine->keeptmp || (s.out_pos == 0))
1423
69.5k
        if (cli_unlink(fullname) && rc != CL_VIRUS)
1424
0
            rc = CL_EUNLINK;
1425
1426
69.5k
    return rc;
1427
69.5k
}
1428
1429
cl_error_t pdf_extract_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t flags)
1430
1.58M
{
1431
1.58M
    char fullname[PATH_MAX + 1];
1432
1.58M
    int fout      = -1;
1433
1.58M
    size_t sum    = 0;
1434
1.58M
    cl_error_t rc = CL_SUCCESS;
1435
1.58M
    int dump      = 1;
1436
1437
1.58M
    cli_dbgmsg("pdf_extract_obj: obj %u %u\n", obj->id >> 8, obj->id & 0xff);
1438
1439
1.58M
    if (PDF_OBJECT_RECURSION_LIMIT < pdf->parse_recursion_depth) {
1440
0
        cli_dbgmsg("pdf_extract_obj: Recursion limit reached.\n");
1441
0
        return CL_SUCCESS;
1442
0
    }
1443
1444
1.58M
    if (obj->extracted) {
1445
        // Should not attempt to extract the same object more than once.
1446
1.47k
        return CL_SUCCESS;
1447
1.47k
    }
1448
    // We're not done yet, but this is enough to say we've tried.
1449
    // Trying again won't help any.
1450
1.58M
    obj->extracted = true;
1451
1452
1.58M
    if (obj->objstm) {
1453
239k
        cli_dbgmsg("pdf_extract_obj: extracting obj found in objstm.\n");
1454
239k
        if (obj->objstm->streambuf == NULL) {
1455
0
            cli_warnmsg("pdf_extract_obj: object in object stream has null stream buffer!\n");
1456
0
            return CL_EFORMAT;
1457
0
        }
1458
239k
    }
1459
1460
    /* TODO: call bytecode hook here, allow override dumpability */
1461
1.58M
    if ((!(obj->flags & (1 << OBJ_STREAM)) || (obj->flags & (1 << OBJ_HASFILTERS))) && !(obj->flags & DUMP_MASK)) {
1462
        /* don't dump all streams */
1463
580k
        dump = 0;
1464
580k
    }
1465
1466
1.58M
    if ((obj->flags & (1 << OBJ_IMAGE)) && !(obj->flags & (1 << OBJ_FILTER_DCT))) {
1467
        /* don't dump / scan non-JPG images */
1468
4.48k
        dump = 0;
1469
4.48k
    }
1470
1471
1.58M
    if (obj->flags & (1 << OBJ_FORCEDUMP)) {
1472
        /* bytecode can force dump by setting this flag */
1473
1.38k
        dump = 1;
1474
1.38k
    }
1475
1476
1.58M
    if (!dump)
1477
583k
        return CL_CLEAN;
1478
1479
1.00M
    cli_dbgmsg("pdf_extract_obj: dumping obj %u %u\n", obj->id >> 8, obj->id & 0xff);
1480
1481
1.00M
    snprintf(fullname, sizeof(fullname), "%s" PATHSEP "pdf obj %d %d", pdf->dir, obj->id >> 8, obj->id & 0xff);
1482
1.00M
    fout = open(fullname, O_RDWR | O_CREAT | O_EXCL | O_TRUNC | O_BINARY, 0600);
1483
1.00M
    if (fout < 0) {
1484
25
        char err[128];
1485
25
        cli_errmsg("pdf_extract_obj: can't create temporary file %s: %s\n", fullname, cli_strerror(errno, err, sizeof(err)));
1486
1487
25
        return CL_ETMPFILE;
1488
25
    }
1489
1490
1.00M
    if (!(flags & PDF_EXTRACT_OBJ_SCAN)) {
1491
1.36k
        if (NULL != obj->path) {
1492
0
            obj->path = strdup(fullname);
1493
0
        }
1494
1.36k
    }
1495
1496
1.00M
    if ((NULL == obj->objstm) &&
1497
1.00M
        (obj->flags & (1 << OBJ_STREAM))) {
1498
        /*
1499
         * Object contains a stream. Parse this now.
1500
         */
1501
674k
        cli_dbgmsg("pdf_extract_obj: parsing a stream in obj %u %u\n", obj->id >> 8, obj->id & 0xff);
1502
1503
674k
        const char *start = pdf->map + obj->start;
1504
1505
674k
        size_t length;
1506
674k
        size_t orig_length;
1507
674k
        int dict_len = obj->stream - start; /* Dictionary should end where the stream begins */
1508
1509
674k
        const char *pstr;
1510
674k
        struct pdf_dict *dparams     = NULL;
1511
674k
        struct objstm_struct *objstm = NULL;
1512
674k
        int xref                     = 0;
1513
1514
        /* Find and interpret the length dictionary value */
1515
674k
        length = find_length(pdf, obj, start, dict_len);
1516
1517
674k
        orig_length = length;
1518
1519
674k
        if (length > obj->stream_size) {
1520
79.5k
            cli_dbgmsg("cli_pdf: Stream length exceeds object length by %zu bytes. Length truncated to %zu bytes\n", length - obj->stream_size, obj->stream_size);
1521
79.5k
            noisy_warnmsg("Stream length exceeds object length by %zu bytes. Length truncated to %zu bytes\n", length - obj->stream_size, obj->stream_size);
1522
1523
79.5k
            length = obj->stream_size;
1524
79.5k
        }
1525
1526
674k
        if (!(obj->flags & (1 << OBJ_FILTER_FLATE)) && (length == 0)) {
1527
            /*
1528
             * If the length is unknown and this doesn't contain a FLATE encoded filter...
1529
             * Calculate the length using the stream size, and trimming
1530
             * off any newline/carriage returns from the end of the stream.
1531
             */
1532
374k
            const char *q = start + obj->stream_size;
1533
374k
            length        = obj->stream_size;
1534
374k
            q--;
1535
1536
374k
            if (length > 0) {
1537
360k
                if (*q == '\n') {
1538
5.93k
                    q--;
1539
5.93k
                    length--;
1540
1541
5.93k
                    if (length > 0 && *q == '\r')
1542
1.18k
                        length--;
1543
354k
                } else if (*q == '\r') {
1544
27.7k
                    length--;
1545
27.7k
                }
1546
360k
            }
1547
1548
374k
            cli_dbgmsg("pdf_extract_obj: calculated length %lld\n", (long long)length);
1549
374k
        } else {
1550
299k
            if (obj->stream_size > (size_t)length + 2) {
1551
168k
                cli_dbgmsg("cli_pdf: calculated length %zu < %zu\n",
1552
168k
                           (size_t)length, obj->stream_size);
1553
168k
                length = obj->stream_size;
1554
168k
            }
1555
299k
        }
1556
1557
674k
        if ((0 != orig_length) && (obj->stream_size > (size_t)orig_length + 20)) {
1558
114k
            cli_dbgmsg("pdf_extract_obj: orig length: %lld, length: %lld, size: %zu\n",
1559
114k
                       (long long)orig_length, (long long)length, obj->stream_size);
1560
114k
            pdfobj_flag(pdf, obj, BAD_STREAMLEN);
1561
114k
        }
1562
1563
674k
        if (0 == length) {
1564
53.1k
            length = obj->stream_size;
1565
53.1k
            if (0 == length) {
1566
23.1k
                cli_dbgmsg("pdf_extract_obj: Alleged or calculated stream length and stream buffer size both 0\n");
1567
23.1k
                goto done; /* Empty stream, nothing to scan */
1568
23.1k
            }
1569
53.1k
        }
1570
1571
        /* Check if XRef is enabled */
1572
651k
        if (cli_memstr(start, dict_len, "/XRef", strlen("/XRef"))) {
1573
25.2k
            xref = 1;
1574
25.2k
        }
1575
1576
        /*
1577
         * Identify the DecodeParms, if available.
1578
         */
1579
651k
        if (NULL != (pstr = pdf_getdict(start, &dict_len, "/DecodeParms"))) {
1580
66.6k
            cli_dbgmsg("pdf_extract_obj: Found /DecodeParms\n");
1581
584k
        } else if (NULL != (pstr = pdf_getdict(start, &dict_len, "/DP"))) {
1582
41.2k
            cli_dbgmsg("pdf_extract_obj: Found /DP\n");
1583
41.2k
        }
1584
1585
651k
        if (pstr) {
1586
            /* shift pstr left to "<<" for pdf_parse_dict */
1587
199k
            while ((*pstr == '<') && (pstr > start)) {
1588
91.8k
                pstr--;
1589
91.8k
                dict_len++;
1590
91.8k
            }
1591
1592
            /* shift pstr right to "<<" for pdf_parse_dict */
1593
1.48M
            while ((*pstr != '<') && (dict_len > 0)) {
1594
1.38M
                pstr++;
1595
1.38M
                dict_len--;
1596
1.38M
            }
1597
1598
107k
            if (dict_len > 4) {
1599
105k
                pdf->parse_recursion_depth++;
1600
105k
                dparams = pdf_parse_dict(pdf, obj, obj->size, (char *)pstr, NULL);
1601
105k
                pdf->parse_recursion_depth--;
1602
105k
            } else {
1603
2.29k
                cli_dbgmsg("pdf_extract_obj: failed to locate DecodeParms dictionary start\n");
1604
2.29k
            }
1605
107k
        }
1606
1607
        /*
1608
         * Go back to the start of the dictionary and check to see if the stream
1609
         * is an object stream. If so, collect the relevant info.
1610
         */
1611
651k
        dict_len = obj->stream - start;
1612
651k
        if (NULL != (pstr = pdf_getdict(start, &dict_len, "/Type/ObjStm"))) {
1613
85.5k
            int32_t objstm_first  = -1;
1614
85.5k
            int32_t objstm_length = -1;
1615
85.5k
            int32_t objstm_n      = -1;
1616
1617
85.5k
            cli_dbgmsg("pdf_extract_obj: Found /Type/ObjStm\n");
1618
1619
85.5k
            dict_len = obj->stream - start;
1620
85.5k
            if ((-1 == (objstm_first = pdf_readint(start, dict_len, "/First")))) {
1621
10.3k
                cli_warnmsg("pdf_extract_obj: Failed to find offset of first object in object stream\n");
1622
75.2k
            } else if ((-1 == (objstm_length = pdf_readint(start, dict_len, "/Length")))) {
1623
4.55k
                cli_warnmsg("pdf_extract_obj: Failed to find length of object stream\n");
1624
70.6k
            } else if ((-1 == (objstm_n = pdf_readint(start, dict_len, "/N")))) {
1625
7.21k
                cli_warnmsg("pdf_extract_obj: Failed to find num objects in object stream\n");
1626
63.4k
            } else {
1627
                /* Add objstm to pdf struct, so it can be freed eventually */
1628
63.4k
                pdf->nobjstms++;
1629
63.4k
                pdf->objstms = cli_max_realloc_or_free(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms);
1630
63.4k
                if (!pdf->objstms) {
1631
0
                    cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
1632
0
                    pdf_free_dict(dparams);
1633
0
                    return CL_EMEM;
1634
0
                }
1635
1636
63.4k
                objstm = malloc(sizeof(struct objstm_struct));
1637
63.4k
                if (!objstm) {
1638
0
                    cli_warnmsg("pdf_extract_obj: out of memory parsing object stream (%u)\n", pdf->nobjstms);
1639
0
                    pdf_free_dict(dparams);
1640
0
                    return CL_EMEM;
1641
0
                }
1642
63.4k
                pdf->objstms[pdf->nobjstms - 1] = objstm;
1643
1644
63.4k
                memset(objstm, 0, sizeof(*objstm));
1645
1646
63.4k
                objstm->first        = (uint32_t)objstm_first;
1647
63.4k
                objstm->current      = (uint32_t)objstm_first;
1648
63.4k
                objstm->current_pair = 0;
1649
63.4k
                objstm->length       = (uint32_t)objstm_length;
1650
63.4k
                objstm->n            = (uint32_t)objstm_n;
1651
1652
63.4k
                cli_dbgmsg("pdf_extract_obj: ObjStm first obj at offset %d\n", objstm->first);
1653
63.4k
                cli_dbgmsg("pdf_extract_obj: ObjStm length is %d bytes\n", objstm->length);
1654
63.4k
                cli_dbgmsg("pdf_extract_obj: ObjStm should contain %d objects\n", objstm->n);
1655
63.4k
            }
1656
85.5k
        }
1657
1658
651k
        sum = pdf_decodestream(pdf, obj, dparams, obj->stream, (uint32_t)length, xref, fout, &rc, objstm);
1659
651k
        if ((CL_SUCCESS != rc) && (CL_VIRUS != rc)) {
1660
130k
            cli_dbgmsg("Error decoding stream! Error code: %d\n", rc);
1661
1662
            /* It's ok if we couldn't decode the stream,
1663
             *   make a best effort to keep parsing...
1664
             *   Unless we were unable to allocate memory.*/
1665
130k
            if (CL_EMEM == rc) {
1666
0
                goto really_done;
1667
0
            }
1668
130k
            if (CL_EPARSE == rc) {
1669
130k
                rc = CL_SUCCESS;
1670
130k
            }
1671
1672
130k
            if (NULL != objstm) {
1673
                /*
1674
                 * If we were expecting an objstm and there was a failure...
1675
                 *   discard the memory for last object stream.
1676
                 */
1677
8.92k
                if (NULL != pdf->objstms) {
1678
8.92k
                    if (NULL != pdf->objstms[pdf->nobjstms - 1]) {
1679
8.92k
                        if (NULL != pdf->objstms[pdf->nobjstms - 1]->streambuf) {
1680
0
                            free(pdf->objstms[pdf->nobjstms - 1]->streambuf);
1681
0
                            pdf->objstms[pdf->nobjstms - 1]->streambuf = NULL;
1682
0
                        }
1683
8.92k
                        free(pdf->objstms[pdf->nobjstms - 1]);
1684
8.92k
                        pdf->objstms[pdf->nobjstms - 1] = NULL;
1685
8.92k
                    }
1686
1687
                    /* Pop the objstm off the end of the pdf->objstms array. */
1688
8.92k
                    if (pdf->nobjstms > 0) {
1689
8.92k
                        pdf->nobjstms--;
1690
8.92k
                        if (0 == pdf->nobjstms) {
1691
2.05k
                            free(pdf->objstms);
1692
2.05k
                            pdf->objstms = NULL;
1693
6.86k
                        } else {
1694
6.86k
                            pdf->objstms = cli_max_realloc_or_free(pdf->objstms, sizeof(struct objstm_struct *) * pdf->nobjstms);
1695
1696
6.86k
                            if (!pdf->objstms) {
1697
0
                                cli_warnmsg("pdf_extract_obj: out of memory when shrinking down objstm array\n");
1698
0
                                return CL_EMEM;
1699
0
                            }
1700
6.86k
                        }
1701
8.92k
                    } else {
1702
                        /* hm.. this shouldn't happen */
1703
0
                        cli_warnmsg("pdf_extract_obj: Failure counting objstms.\n");
1704
0
                    }
1705
8.92k
                }
1706
8.92k
            }
1707
130k
        }
1708
1709
651k
        if (dparams)
1710
75.2k
            pdf_free_dict(dparams);
1711
1712
651k
        if (rc == CL_VIRUS) {
1713
0
            sum = 0; /* prevents post-filter scan */
1714
0
            goto done;
1715
0
        }
1716
1717
651k
    } else if (obj->flags & (1 << OBJ_JAVASCRIPT)) {
1718
15.0k
        const char *q2;
1719
15.0k
        const char *q = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
1720
15.0k
                                      : (const char *)(obj->start + pdf->map);
1721
1722
        /* TODO: get obj-endobj size */
1723
15.0k
        off_t bytesleft = obj->size;
1724
1725
15.0k
        if (bytesleft < 0) {
1726
0
            goto done;
1727
0
        }
1728
1729
32.8k
        do {
1730
32.8k
            char *js      = NULL;
1731
32.8k
            size_t js_len = 0;
1732
32.8k
            const char *q3;
1733
1734
32.8k
            q2 = cli_memstr(q, bytesleft, "/JavaScript", 11);
1735
32.8k
            if (!q2)
1736
14.2k
                break;
1737
1738
18.6k
            bytesleft -= q2 - q + 11;
1739
18.6k
            q = q2 + 11;
1740
1741
18.6k
            js = pdf_readstring(q, bytesleft, "/JS", NULL, &q2, !(pdf->flags & (1 << DECRYPTABLE_PDF)));
1742
18.6k
            bytesleft -= q2 - q;
1743
18.6k
            q = q2;
1744
1745
18.6k
            if (js) {
1746
8.89k
                char *decrypted = NULL;
1747
8.89k
                const char *out = js;
1748
8.89k
                js_len          = strlen(js);
1749
8.89k
                if (pdf->flags & (1 << DECRYPTABLE_PDF)) {
1750
2.94k
                    cli_dbgmsg("pdf_extract_obj: encrypted string\n");
1751
2.94k
                    decrypted = decrypt_any(pdf, obj->id, js, &js_len, pdf->enc_method_string);
1752
1753
2.94k
                    if (decrypted) {
1754
2.06k
                        noisy_msg(pdf, "pdf_extract_obj: decrypted Javascript string from obj %u %u\n", obj->id >> 8, obj->id & 0xff);
1755
2.06k
                        out = decrypted;
1756
2.06k
                    }
1757
2.94k
                }
1758
1759
8.89k
                if ((pdf->ctx->options->general & CL_SCAN_GENERAL_COLLECT_METADATA) && pdf->ctx->wrkproperty != NULL) {
1760
8.89k
                    struct json_object *pdfobj, *jbig2arr;
1761
1762
8.89k
                    if (NULL == (pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats"))) {
1763
0
                        cli_errmsg("pdf_extract_obj: failed to get PDFStats JSON object\n");
1764
8.89k
                    } else if (NULL == (jbig2arr = cli_jsonarray(pdfobj, "JavascriptObjects"))) {
1765
0
                        cli_errmsg("pdf_extract_obj: failed to get JavascriptObjects JSON object\n");
1766
8.89k
                    } else {
1767
8.89k
                        cli_jsonint_array(jbig2arr, obj->id >> 8);
1768
8.89k
                    }
1769
8.89k
                }
1770
1771
8.89k
                pdf->stats.njs++;
1772
1773
8.89k
                if (filter_writen(pdf, obj, fout, out, js_len, (size_t *)&sum) != js_len) {
1774
0
                    rc = CL_EWRITE;
1775
0
                    free(js);
1776
0
                    break;
1777
0
                }
1778
1779
8.89k
                free(decrypted);
1780
8.89k
                free(js);
1781
8.89k
                cli_dbgmsg("pdf_extract_obj: bytesleft: %d\n", (int)bytesleft);
1782
1783
8.89k
                if (bytesleft > 0) {
1784
8.89k
                    q2 = pdf_nextobject(q, bytesleft);
1785
8.89k
                    if (!q2)
1786
4.06k
                        q2 = q + bytesleft - 1;
1787
1788
                    /* non-conforming PDFs that don't escape ) properly */
1789
8.89k
                    q3 = memchr(q, ')', bytesleft);
1790
8.89k
                    if (q3 && q3 < q2)
1791
440
                        q2 = q3;
1792
1793
11.8k
                    while (q2 > q && q2[-1] == ' ')
1794
2.98k
                        q2--;
1795
1796
8.89k
                    if (q2 > q) {
1797
6.22k
                        q--;
1798
6.22k
                        filter_writen(pdf, obj, fout, q, q2 - q, (size_t *)&sum);
1799
6.22k
                        q++;
1800
6.22k
                    }
1801
8.89k
                }
1802
8.89k
            }
1803
1804
18.6k
        } while (bytesleft > 0);
1805
315k
    } else {
1806
315k
        off_t bytesleft = obj->size;
1807
1808
315k
        if (bytesleft < 0)
1809
0
            rc = CL_EFORMAT;
1810
315k
        else {
1811
315k
            if (obj->objstm) {
1812
8.95k
                if (filter_writen(pdf, obj, fout, obj->objstm->streambuf + obj->start, bytesleft, (size_t *)&sum) != (size_t)bytesleft)
1813
0
                    rc = CL_EWRITE;
1814
306k
            } else {
1815
306k
                if (filter_writen(pdf, obj, fout, pdf->map + obj->start, bytesleft, (size_t *)&sum) != (size_t)bytesleft)
1816
0
                    rc = CL_EWRITE;
1817
306k
            }
1818
315k
        }
1819
315k
    }
1820
1821
1.00M
done:
1822
1823
1.00M
    cli_dbgmsg("pdf_extract_obj: extracted %td bytes %u %u obj\n", sum, obj->id >> 8, obj->id & 0xff);
1824
1.00M
    cli_dbgmsg("pdf_extract_obj:         ... to %s\n", fullname);
1825
1826
1.00M
    if (flags & PDF_EXTRACT_OBJ_SCAN && sum) {
1827
962k
        int rc2;
1828
1829
        /* TODO: invoke bytecode on this pdf obj with metainformation associated */
1830
962k
        lseek(fout, 0, SEEK_SET);
1831
962k
        rc2 = cli_magic_scan_desc(fout, fullname, pdf->ctx, NULL, LAYER_ATTRIBUTES_NONE);
1832
962k
        if (rc2 != CL_SUCCESS) {
1833
3.16k
            rc = rc2;
1834
3.16k
            goto really_done;
1835
3.16k
        }
1836
1837
959k
        if ((rc == CL_CLEAN) || (rc == CL_VIRUS)) {
1838
959k
            rc2 = run_pdf_hooks(pdf, PDF_PHASE_POSTDUMP, fout);
1839
959k
            if (rc2 == CL_VIRUS) {
1840
0
                rc = rc2;
1841
0
                goto really_done;
1842
0
            }
1843
959k
        }
1844
1845
959k
        if (((rc == CL_CLEAN) || (rc == CL_VIRUS)) && (obj->flags & (1 << OBJ_CONTENTS))) {
1846
69.5k
            lseek(fout, 0, SEEK_SET);
1847
69.5k
            cli_dbgmsg("pdf_extract_obj: dumping contents from obj %u %u\n", obj->id >> 8, obj->id & 0xff);
1848
1849
69.5k
            rc2 = pdf_scan_contents(fout, pdf, obj);
1850
69.5k
            if (rc2 != CL_SUCCESS) {
1851
5
                rc = rc2;
1852
5
                goto really_done;
1853
5
            }
1854
69.5k
        }
1855
959k
    }
1856
1857
1.00M
really_done:
1858
1.00M
    close(fout);
1859
1860
1.00M
    if (CL_EMEM != rc) {
1861
1.00M
        if (flags & PDF_EXTRACT_OBJ_SCAN && !pdf->ctx->engine->keeptmp)
1862
1.00M
            if (cli_unlink(fullname) && rc != CL_VIRUS)
1863
0
                rc = CL_EUNLINK;
1864
1.00M
    }
1865
1866
1.00M
    return rc;
1867
1.00M
}
1868
1869
/*
 * Parsing context tracked while the names of a PDF object are processed.
 * handle_pdfname() moves between these values according to the from_state /
 * to_state columns of pdfname_actions[].
 */
enum objstate {
    STATE_NONE = 0,     /* default context; also the "no-op" from_state in the table */
    STATE_S,            /* entered after the "S" name; auto-reset to STATE_NONE by handle_pdfname() */
    STATE_FILTER,       /* entered after "Filter"; filter names keep the parser in this state */
    STATE_JAVASCRIPT,   /* entered after "JavaScript" */
    STATE_OPENACTION,   /* entered after "OpenAction" */
    STATE_LINEARIZED,   /* entered after "Linearized" */
    STATE_LAUNCHACTION, /* entered after "Launch" */
    STATE_CONTENTS,     /* entered after "Contents" */
    STATE_ANY           /* wildcard: only meaningful as a from_state in the actions table */
};
1880
1881
/* Bit values for the nameflags member of struct pdfname_action. */

/* No special treatment for this name. */
#define NAMEFLAG_NONE 0x0
/* handle_pdfname() flags the object (ESCAPED_COMMON_PDFNAME) when a name
 * carrying this bit was written with escapes. */
#define NAMEFLAG_HEURISTIC 0x1
1883
1884
struct pdfname_action {
1885
    const char *pdfname;
1886
    enum pdf_objflags set_objflag; /* OBJ_DICT is noop */
1887
    enum objstate from_state;      /* STATE_NONE is noop */
1888
    enum objstate to_state;
1889
    uint32_t nameflags;
1890
    void (*pdf_stats_cb)(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
1891
};
1892
1893
static struct pdfname_action pdfname_actions[] = {
1894
    {"ASCIIHexDecode", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCIIHexDecode_cb},
1895
    {"ASCII85Decode", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCII85Decode_cb},
1896
    {"A85", OBJ_FILTER_A85, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCII85Decode_cb},
1897
    {"AHx", OBJ_FILTER_AH, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, ASCIIHexDecode_cb},
1898
    {"EmbeddedFile", OBJ_EMBEDDED_FILE, STATE_NONE, STATE_NONE, NAMEFLAG_HEURISTIC, EmbeddedFile_cb},
1899
    {"FlateDecode", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, FlateDecode_cb},
1900
    {"Fl", OBJ_FILTER_FLATE, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, FlateDecode_cb},
1901
    {"Image", OBJ_IMAGE, STATE_NONE, STATE_NONE, NAMEFLAG_HEURISTIC, Image_cb},
1902
    {"LZWDecode", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, LZWDecode_cb},
1903
    {"LZW", OBJ_FILTER_LZW, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, LZWDecode_cb},
1904
    {"RunLengthDecode", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, RunLengthDecode_cb},
1905
    {"RL", OBJ_FILTER_RL, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, RunLengthDecode_cb},
1906
    {"CCITTFaxDecode", OBJ_FILTER_FAX, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, CCITTFaxDecode_cb},
1907
    {"CCF", OBJ_FILTER_FAX, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, CCITTFaxDecode_cb},
1908
    {"JBIG2Decode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, JBIG2Decode_cb},
1909
    {"DCTDecode", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, DCTDecode_cb},
1910
    {"DCT", OBJ_FILTER_DCT, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, DCTDecode_cb},
1911
    {"JPXDecode", OBJ_FILTER_JPX, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, JPXDecode_cb},
1912
    {"Crypt", OBJ_FILTER_CRYPT, STATE_FILTER, STATE_NONE, NAMEFLAG_HEURISTIC, Crypt_cb},
1913
    {"Standard", OBJ_FILTER_STANDARD, STATE_FILTER, STATE_FILTER, NAMEFLAG_HEURISTIC, Standard_cb},
1914
    {"Sig", OBJ_SIGNED, STATE_ANY, STATE_NONE, NAMEFLAG_HEURISTIC, Sig_cb},
1915
    {"V", OBJ_SIGNED, STATE_ANY, STATE_NONE, NAMEFLAG_HEURISTIC, NULL},
1916
    {"R", OBJ_SIGNED, STATE_ANY, STATE_NONE, NAMEFLAG_HEURISTIC, NULL},
1917
    {"Linearized", OBJ_DICT, STATE_NONE, STATE_LINEARIZED, NAMEFLAG_HEURISTIC, NULL},
1918
    {"Filter", OBJ_HASFILTERS, STATE_ANY, STATE_FILTER, NAMEFLAG_HEURISTIC, NULL},
1919
    {"JavaScript", OBJ_JAVASCRIPT, STATE_ANY, STATE_JAVASCRIPT, NAMEFLAG_HEURISTIC, JavaScript_cb},
1920
    {"Length", OBJ_DICT, STATE_FILTER, STATE_NONE, NAMEFLAG_HEURISTIC, NULL},
1921
    {"S", OBJ_DICT, STATE_NONE, STATE_S, NAMEFLAG_HEURISTIC, NULL},
1922
    {"Type", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_HEURISTIC, NULL},
1923
    {"OpenAction", OBJ_OPENACTION, STATE_ANY, STATE_OPENACTION, NAMEFLAG_HEURISTIC, OpenAction_cb},
1924
    {"Launch", OBJ_LAUNCHACTION, STATE_ANY, STATE_LAUNCHACTION, NAMEFLAG_HEURISTIC, Launch_cb},
1925
    {"Page", OBJ_PAGE, STATE_NONE, STATE_NONE, NAMEFLAG_HEURISTIC, Page_cb},
1926
    {"Contents", OBJ_CONTENTS, STATE_NONE, STATE_CONTENTS, NAMEFLAG_HEURISTIC, NULL},
1927
    {"Author", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Author_cb},
1928
    {"Producer", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Producer_cb},
1929
    {"CreationDate", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, CreationDate_cb},
1930
    {"ModDate", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, ModificationDate_cb},
1931
    {"Creator", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Creator_cb},
1932
    {"Title", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Title_cb},
1933
    {"Keywords", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Keywords_cb},
1934
    {"Subject", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Subject_cb},
1935
    {"Pages", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Pages_cb},
1936
    {"Colors", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, Colors_cb},
1937
    {"RichMedia", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, RichMedia_cb},
1938
    {"AcroForm", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, AcroForm_cb},
1939
    {"XFA", OBJ_DICT, STATE_NONE, STATE_NONE, NAMEFLAG_NONE, XFA_cb}};
1940
1941
2.32M
/* Bitmask of the object flags that handle_pdfname() treats as recognised stream filters. */
#define KNOWN_FILTERS             \
    ((1 << OBJ_FILTER_AH) |       \
     (1 << OBJ_FILTER_RL) |       \
     (1 << OBJ_FILTER_A85) |      \
     (1 << OBJ_FILTER_FLATE) |    \
     (1 << OBJ_FILTER_LZW) |      \
     (1 << OBJ_FILTER_FAX) |      \
     (1 << OBJ_FILTER_DCT) |      \
     (1 << OBJ_FILTER_JPX) |      \
     (1 << OBJ_FILTER_CRYPT))
1942
1943
static void handle_pdfname(struct pdf_struct *pdf, struct pdf_obj *obj, const char *pdfname, int escapes, enum objstate *state)
1944
9.07M
{
1945
9.07M
    struct pdfname_action *act = NULL;
1946
9.07M
    unsigned j;
1947
1948
9.07M
    obj->statsflags |= OBJ_FLAG_PDFNAME_DONE;
1949
1950
374M
    for (j = 0; j < sizeof(pdfname_actions) / sizeof(pdfname_actions[0]); j++) {
1951
368M
        if (!strcmp(pdfname, pdfname_actions[j].pdfname)) {
1952
2.44M
            act = &pdfname_actions[j];
1953
2.44M
            break;
1954
2.44M
        }
1955
368M
    }
1956
1957
9.07M
    if (!act) {
1958
        /* these are digital signature objects, filter doesn't matter,
1959
         * we don't need them anyway */
1960
6.63M
        if (*state == STATE_FILTER && !(obj->flags & (1 << OBJ_SIGNED)) && !(obj->flags & KNOWN_FILTERS)) {
1961
274k
            cli_dbgmsg("handle_pdfname: unknown filter %s\n", pdfname);
1962
274k
            obj->flags |= 1 << OBJ_FILTER_UNKNOWN;
1963
274k
        }
1964
1965
6.63M
        return;
1966
6.63M
    }
1967
1968
    /* record filter order */
1969
2.44M
    if (obj->numfilters < PDF_FILTERLIST_MAX && (*state == STATE_FILTER) && ((1 << act->set_objflag) & KNOWN_FILTERS))
1970
298k
        obj->filterlist[obj->numfilters++] = act->set_objflag;
1971
1972
2.44M
    if ((act->nameflags & NAMEFLAG_HEURISTIC) && escapes) {
1973
        /* if a commonly used PDF name is escaped that is certainly
1974
           suspicious. */
1975
1.16k
        cli_dbgmsg("handle_pdfname: pdfname %s is escaped\n", pdfname);
1976
1.16k
        pdfobj_flag(pdf, obj, ESCAPED_COMMON_PDFNAME);
1977
1.16k
    }
1978
1979
2.44M
    if ((act->pdf_stats_cb))
1980
1.12M
        act->pdf_stats_cb(pdf, obj, act);
1981
1982
2.44M
    if (act->from_state == *state || act->from_state == STATE_ANY) {
1983
2.15M
        *state = act->to_state;
1984
1985
2.15M
        if (*state == STATE_FILTER && act->set_objflag != OBJ_DICT && (obj->flags & (1 << act->set_objflag))) {
1986
70.6k
            cli_dbgmsg("handle_pdfname: duplicate stream filter %s\n", pdfname);
1987
70.6k
            pdfobj_flag(pdf, obj, BAD_STREAM_FILTERS);
1988
70.6k
        }
1989
1990
2.15M
        obj->flags |= 1 << act->set_objflag;
1991
2.15M
    } else {
1992
        /* auto-reset states */
1993
282k
        switch (*state) {
1994
8.02k
            case STATE_S:
1995
8.02k
                *state = STATE_NONE;
1996
8.02k
                break;
1997
274k
            default:
1998
274k
                break;
1999
282k
        }
2000
282k
    }
2001
2.44M
}
2002
2003
static void pdf_parse_encrypt(struct pdf_struct *pdf, const char *enc, int len)
2004
54.5k
{
2005
54.5k
    const char *q, *q2;
2006
54.5k
    unsigned long objid;
2007
54.5k
    unsigned long genid;
2008
54.5k
    long temp_long;
2009
2010
54.5k
    if (len >= 16 && !strncmp(enc, "/EncryptMetadata", 16)) {
2011
3.43k
        q = cli_memstr(enc + 16, len - 16, "/Encrypt", 8);
2012
3.43k
        if (!q)
2013
925
            return;
2014
2015
2.51k
        len -= q - enc;
2016
2.51k
        enc = q;
2017
2.51k
    }
2018
2019
53.6k
    q = enc + 8;
2020
53.6k
    len -= 8;
2021
53.6k
    q2 = pdf_nextobject(q, len);
2022
53.6k
    if (!q2 || !isdigit(*q2))
2023
6.13k
        return;
2024
47.5k
    len -= q2 - q;
2025
47.5k
    q = q2;
2026
2027
47.5k
    if (CL_SUCCESS != cli_strntol_wrap(q2, (size_t)len, 0, 10, &temp_long)) {
2028
206
        cli_dbgmsg("pdf_parse_encrypt: Found Encrypt dictionary but failed to parse objid\n");
2029
206
        return;
2030
47.3k
    } else if (temp_long < 0) {
2031
0
        cli_dbgmsg("pdf_parse_encrypt: Encountered invalid negative objid (%ld).\n", temp_long);
2032
0
        return;
2033
0
    }
2034
47.3k
    objid = (unsigned long)temp_long;
2035
2036
47.3k
    objid = objid << 8;
2037
47.3k
    q2    = pdf_nextobject(q, len);
2038
47.3k
    if (!q2 || !isdigit(*q2))
2039
3.54k
        return;
2040
43.7k
    len -= q2 - q;
2041
43.7k
    q = q2;
2042
2043
43.7k
    if (CL_SUCCESS != cli_strntol_wrap(q2, (size_t)len, 0, 10, &temp_long)) {
2044
333
        cli_dbgmsg("pdf_parse_encrypt: Found Encrypt dictionary but failed to parse genid\n");
2045
333
        return;
2046
43.4k
    } else if (temp_long < 0) {
2047
0
        cli_dbgmsg("pdf_parse_encrypt: Encountered invalid negative genid (%ld).\n", temp_long);
2048
0
        return;
2049
0
    }
2050
43.4k
    genid = (unsigned long)temp_long;
2051
2052
43.4k
    objid |= genid & 0xff;
2053
43.4k
    q2 = pdf_nextobject(q, len);
2054
43.4k
    if (!q2 || *q2 != 'R')
2055
3.79k
        return;
2056
2057
39.6k
    cli_dbgmsg("pdf_parse_encrypt: Encrypt dictionary in obj %lu %lu\n", objid >> 8, objid & 0xff);
2058
2059
39.6k
    pdf->enc_objid = objid;
2060
39.6k
}
2061
2062
static void pdf_parse_trailer(struct pdf_struct *pdf, const char *s, long length)
2063
86.1k
{
2064
86.1k
    const char *enc;
2065
2066
86.1k
    enc = cli_memstr(s, length, "/Encrypt", 8);
2067
86.1k
    if (enc) {
2068
54.5k
        char *newID;
2069
54.5k
        unsigned int newIDlen = 0;
2070
2071
54.5k
        pdf->flags |= 1 << ENCRYPTED_PDF;
2072
54.5k
        pdf_parse_encrypt(pdf, enc, s + length - enc);
2073
54.5k
        newID = pdf_readstring(s, length, "/ID", &newIDlen, NULL, false);
2074
2075
54.5k
        if (newID) {
2076
40.5k
            free(pdf->fileID);
2077
40.5k
            pdf->fileID    = newID;
2078
40.5k
            pdf->fileIDlen = newIDlen;
2079
40.5k
        }
2080
54.5k
    }
2081
86.1k
}
2082
2083
void pdf_parseobj(struct pdf_struct *pdf, struct pdf_obj *obj)
2084
1.59M
{
2085
    /* enough to hold common pdf names, we don't need all the names */
2086
1.59M
    char pdfname[64];
2087
1.59M
    const char *q2, *q3;
2088
1.59M
    const char *nextobj = NULL, *nextopen = NULL, *nextclose = NULL;
2089
1.59M
    const char *q    = NULL;
2090
1.59M
    const char *dict = NULL, *enddict = NULL, *start = NULL;
2091
1.59M
    off_t dict_length = 0, full_dict_length = 0, bytesleft = 0;
2092
1.59M
    size_t i         = 0;
2093
1.59M
    unsigned filters = 0, blockopens = 0;
2094
1.59M
    enum objstate objstate = STATE_NONE;
2095
2096
1.59M
    json_object *pdfobj = NULL, *jsonobj = NULL;
2097
2098
1.59M
    if (NULL == pdf || NULL == obj) {
2099
0
        cli_warnmsg("pdf_parseobj: invalid arguments\n");
2100
0
        return;
2101
0
    }
2102
2103
1.59M
    cli_dbgmsg("pdf_parseobj: Parsing object %u %u\n", obj->id >> 8, obj->id & 0xff);
2104
2105
1.59M
    if (obj->objstm) {
2106
239k
        if ((size_t)obj->start > obj->objstm->streambuf_len) {
2107
0
            cli_dbgmsg("pdf_parseobj: %u %u obj: obj start (%u) is greater than size of object stream (%zu).\n",
2108
0
                       obj->id >> 8, obj->id & 0xff, obj->start, obj->objstm->streambuf_len);
2109
0
            return;
2110
0
        }
2111
239k
        q = (const char *)(obj->start + obj->objstm->streambuf);
2112
1.35M
    } else {
2113
1.35M
        if ((size_t)obj->start > pdf->size) {
2114
0
            cli_dbgmsg("pdf_parseobj: %u %u obj: obj start (%u) is greater than size of PDF (%lld).\n",
2115
0
                       obj->id >> 8, obj->id & 0xff, obj->start, (long long)pdf->size);
2116
0
            return;
2117
0
        }
2118
1.35M
        q = (const char *)(obj->start + pdf->map);
2119
1.35M
    }
2120
1.59M
    start = q;
2121
2122
1.59M
    if (obj->size <= 0)
2123
2.04k
        return;
2124
2125
1.58M
    if (obj->objstm) {
2126
239k
        bytesleft = MIN(obj->size, obj->objstm->streambuf_len - obj->start);
2127
1.35M
    } else {
2128
1.35M
        bytesleft = MIN(obj->size, pdf->size - obj->start);
2129
1.35M
    }
2130
2131
    /* For objects that aren't already in an object stream^, check if they contain a stream.
2132
     * ^Objects in object streams aren't supposed to contain streams, so we don't check them. */
2133
1.58M
    if (NULL == obj->objstm) {
2134
        /* Check if object contains stream */
2135
1.35M
        cl_error_t has_stream;
2136
1.35M
        const char *stream = NULL;
2137
1.35M
        size_t stream_size = 0;
2138
2139
1.35M
        has_stream = find_stream_bounds(
2140
1.35M
            start,
2141
1.35M
            obj->size,
2142
1.35M
            &stream,
2143
1.35M
            &stream_size,
2144
1.35M
            (pdf->enc_method_stream <= ENC_IDENTITY) && (pdf->enc_method_embeddedfile <= ENC_IDENTITY));
2145
2146
1.35M
        if ((CL_SUCCESS == has_stream) ||
2147
1.35M
            (CL_EFORMAT == has_stream)) {
2148
            /* Stream found. Store this fact and the stream bounds. */
2149
712k
            cli_dbgmsg("pdf_parseobj: %u %u contains stream, size: %zu\n", obj->id >> 8, obj->id & 0xff, stream_size);
2150
712k
            obj->flags |= (1 << OBJ_STREAM);
2151
712k
            obj->stream      = stream;
2152
712k
            obj->stream_size = stream_size;
2153
712k
        }
2154
1.35M
    }
2155
2156
    /* find start of dictionary */
2157
12.5M
    do {
2158
12.5M
        nextobj = pdf_nextobject(q, bytesleft);
2159
12.5M
        bytesleft -= nextobj - q;
2160
2161
12.5M
        if (!nextobj || bytesleft < 0) {
2162
308k
            cli_dbgmsg("pdf_parseobj: %u %u obj: no dictionary\n", obj->id >> 8, obj->id & 0xff);
2163
2164
308k
            if (!(pdfobj) && pdf->ctx->wrkproperty != NULL) {
2165
308k
                pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats");
2166
308k
                if (!(pdfobj))
2167
0
                    return;
2168
308k
            }
2169
2170
308k
            if (pdfobj) {
2171
308k
                if (!(jsonobj))
2172
308k
                    jsonobj = cli_jsonarray(pdfobj, "ObjectsWithoutDictionaries");
2173
308k
                if (jsonobj)
2174
308k
                    cli_jsonint_array(jsonobj, obj->id >> 8);
2175
308k
            }
2176
2177
308k
            return;
2178
308k
        }
2179
2180
        /*
2181
         * Opening `<` for object's dictionary may be back 1 character,
2182
         * provided q is not at the start of the buffer (it shouldn't be).
2183
         */
2184
12.2M
        if (obj->objstm) {
2185
2.73M
            if (obj->objstm->streambuf == q) {
2186
0
                q3 = memchr(q, '<', nextobj - q);
2187
2.73M
            } else {
2188
2.73M
                q3 = memchr(q - 1, '<', nextobj - q + 1);
2189
2.73M
            }
2190
9.48M
        } else {
2191
9.48M
            if (pdf->map == q) {
2192
0
                q3 = memchr(q, '<', nextobj - q);
2193
9.48M
            } else {
2194
9.48M
                q3 = memchr(q - 1, '<', nextobj - q + 1);
2195
9.48M
            }
2196
9.48M
        }
2197
12.2M
        nextobj++;
2198
12.2M
        bytesleft--;
2199
12.2M
        q = nextobj;
2200
12.2M
    } while (!q3 || q3[1] != '<');
2201
1.28M
    dict = q3 + 2;
2202
1.28M
    q    = dict;
2203
1.28M
    blockopens++;
2204
1.28M
    bytesleft = obj->size - (q - start);
2205
1.28M
    enddict   = q + bytesleft - 1;
2206
2207
    /* find end of dictionary block */
2208
1.28M
    if (bytesleft < 0) {
2209
0
        cli_dbgmsg("pdf_parseobj: %u %u obj: broken dictionary\n", obj->id >> 8, obj->id & 0xff);
2210
2211
0
        if (!(pdfobj) && pdf->ctx->wrkproperty != NULL) {
2212
0
            pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats");
2213
0
            if (!(pdfobj))
2214
0
                return;
2215
0
        }
2216
2217
0
        if (pdfobj) {
2218
0
            if (!(jsonobj))
2219
0
                jsonobj = cli_jsonarray(pdfobj, "ObjectsWithBrokenDictionaries");
2220
0
            if (jsonobj)
2221
0
                cli_jsonint_array(jsonobj, obj->id >> 8);
2222
0
        }
2223
2224
0
        return;
2225
0
    }
2226
2227
    /* while still looking ... */
2228
4.49M
    while ((q < enddict - 1) && (blockopens > 0)) {
2229
        /* find next close */
2230
3.42M
        nextclose = memchr(q, '>', enddict - q);
2231
3.42M
        if (nextclose && (nextclose[1] == '>')) {
2232
            /* check for nested open */
2233
4.11M
            while ((nextopen = memchr(q - 1, '<', nextclose - q + 1)) != NULL) {
2234
2.65M
                if (nextopen[1] == '<') {
2235
                    /* nested open */
2236
617k
                    blockopens++;
2237
617k
                    q = nextopen + 2;
2238
2.03M
                } else {
2239
                    /* unmatched < before next close */
2240
2.03M
                    q = nextopen + 2;
2241
2.03M
                }
2242
2.65M
            }
2243
            /* close block */
2244
1.46M
            blockopens--;
2245
1.46M
            q = nextclose + 2;
2246
1.95M
        } else if (nextclose) {
2247
            /* found one > but not two */
2248
1.74M
            q = nextclose + 2;
2249
1.74M
        } else {
2250
            /* next closing not found */
2251
211k
            break;
2252
211k
        }
2253
3.42M
    }
2254
2255
    /* Was end of dictionary found? */
2256
1.28M
    if (blockopens) {
2257
        /* probably truncated */
2258
273k
        cli_dbgmsg("pdf_parseobj: %u %u obj broken dictionary\n", obj->id >> 8, obj->id & 0xff);
2259
2260
273k
        if (!(pdfobj) && pdf->ctx->wrkproperty != NULL) {
2261
273k
            pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats");
2262
273k
            if (!(pdfobj))
2263
0
                return;
2264
273k
        }
2265
2266
273k
        if (pdfobj) {
2267
273k
            if (!(jsonobj))
2268
273k
                jsonobj = cli_jsonarray(pdfobj, "ObjectsWithBrokenDictionaries");
2269
273k
            if (jsonobj)
2270
273k
                cli_jsonint_array(jsonobj, obj->id >> 8);
2271
273k
        }
2272
2273
273k
        return;
2274
273k
    }
2275
2276
1.00M
    enddict = nextclose;
2277
1.00M
    obj->flags |= 1 << OBJ_DICT;
2278
1.00M
    full_dict_length = dict_length = enddict - dict;
2279
2280
    /* This code prints the dictionary content.
2281
    {
2282
        char * dictionary = malloc(dict_length + 1);
2283
        if (dictionary) {
2284
            for (i = 0; i < dict_length; i++) {
2285
                if (dict[i] == '\r')
2286
                    dictionary[i] = '\n';
2287
                else if (isprint(dict[i]) || isspace(dict[i]))
2288
                    dictionary[i] = dict[i];
2289
                else
2290
                    dictionary[i] = '*';
2291
            }
2292
            dictionary[dict_length] = '\0';
2293
            cli_dbgmsg("pdf_parseobj: dictionary is <<%s>>\n", dictionary);
2294
            free(dictionary);
2295
        }
2296
    }
2297
    */
2298
2299
    /*  process pdf names */
2300
10.0M
    for (q = dict; dict_length > 0;) {
2301
9.91M
        int escapes = 0, breakout = 0;
2302
9.91M
        q2 = memchr(q, '/', dict_length);
2303
9.91M
        if (!q2)
2304
840k
            break;
2305
2306
9.07M
        dict_length -= q2 - q;
2307
9.07M
        q = q2;
2308
        /* normalize PDF names */
2309
94.8M
        for (i = 0; dict_length > 0 && (i < sizeof(pdfname) - 1); i++) {
2310
94.6M
            q++;
2311
94.6M
            dict_length--;
2312
2313
94.6M
            if (*q == '#') {
2314
280k
                if (cli_hex2str_to(q + 1, pdfname + i, 2) == -1)
2315
227k
                    break;
2316
2317
52.2k
                q += 2;
2318
52.2k
                dict_length -= 2;
2319
52.2k
                escapes = 1;
2320
52.2k
                continue;
2321
280k
            }
2322
2323
94.3M
            switch (*q) {
2324
2.81M
                case ' ':
2325
2.90M
                case '\t':
2326
3.14M
                case '\r':
2327
3.52M
                case '\n':
2328
6.63M
                case '/':
2329
7.39M
                case '>':
2330
7.71M
                case '[':
2331
7.88M
                case ']':
2332
8.38M
                case '<':
2333
8.66M
                case '(':
2334
8.66M
                    breakout = 1;
2335
94.3M
            }
2336
2337
94.3M
            if (breakout)
2338
8.66M
                break;
2339
2340
85.7M
            pdfname[i] = *q;
2341
85.7M
        }
2342
2343
9.07M
        pdfname[i] = '\0';
2344
2345
9.07M
        handle_pdfname(pdf, obj, pdfname, escapes, &objstate);
2346
9.07M
        if (objstate == STATE_LINEARIZED) {
2347
55.6k
            long trailer_end, trailer;
2348
2349
55.6k
            pdfobj_flag(pdf, obj, LINEARIZED_PDF);
2350
55.6k
            objstate    = STATE_NONE;
2351
55.6k
            trailer_end = pdf_readint(dict, full_dict_length, "/H");
2352
55.6k
            if ((trailer_end > 0) && ((size_t)trailer_end < pdf->size)) {
2353
26.6k
                trailer = trailer_end - 1024;
2354
26.6k
                if (trailer < 0)
2355
25.7k
                    trailer = 0;
2356
2357
26.6k
                q2 = pdf->map + trailer;
2358
26.6k
                cli_dbgmsg("pdf_parseobj: looking for trailer in linearized pdf: %ld - %ld\n", trailer, trailer_end);
2359
26.6k
                pdf_parse_trailer(pdf, q2, trailer_end - trailer);
2360
26.6k
                if (pdf->fileID)
2361
13.5k
                    cli_dbgmsg("pdf_parseobj: found fileID\n");
2362
26.6k
            }
2363
55.6k
        }
2364
2365
9.07M
        if (objstate == STATE_LAUNCHACTION)
2366
97.8k
            pdfobj_flag(pdf, obj, HAS_LAUNCHACTION);
2367
9.07M
        if (dict_length > 0 && (objstate == STATE_JAVASCRIPT || objstate == STATE_OPENACTION || objstate == STATE_CONTENTS)) {
2368
425k
            off_t dict_remaining = dict_length;
2369
2370
425k
            if (objstate == STATE_OPENACTION)
2371
304k
                pdfobj_flag(pdf, obj, HAS_OPENACTION);
2372
2373
425k
            q2 = pdf_nextobject(q, dict_remaining);
2374
425k
            if (q2 && isdigit(*q2)) {
2375
360k
                const char *q2_old = NULL;
2376
360k
                unsigned long objid;
2377
360k
                unsigned long genid;
2378
360k
                long temp_long;
2379
2380
360k
                dict_remaining -= (off_t)(q2 - q);
2381
2382
360k
                if (CL_SUCCESS != cli_strntol_wrap(q2, (size_t)dict_remaining, 0, 10, &temp_long)) {
2383
3.04k
                    cli_dbgmsg("pdf_parseobj: failed to parse object objid\n");
2384
3.04k
                    return;
2385
357k
                } else if (temp_long < 0) {
2386
0
                    cli_dbgmsg("pdf_parseobj: Encountered invalid negative genid (%ld).\n", temp_long);
2387
0
                    return;
2388
0
                }
2389
357k
                objid = (unsigned long)temp_long;
2390
2391
357k
                objid = objid << 8;
2392
2393
799k
                while ((dict_remaining > 0) && isdigit(*q2)) {
2394
442k
                    q2++;
2395
442k
                    dict_remaining--;
2396
442k
                }
2397
2398
357k
                q2_old = q2;
2399
357k
                q2     = pdf_nextobject(q2, dict_remaining);
2400
357k
                if (q2 && isdigit(*q2)) {
2401
324k
                    dict_remaining -= (off_t)(q2 - q2_old);
2402
324k
                    if (CL_SUCCESS != cli_strntol_wrap(q2, (size_t)dict_remaining, 0, 10, &temp_long)) {
2403
1.06k
                        cli_dbgmsg("pdf_parseobj: failed to parse object genid\n");
2404
1.06k
                        return;
2405
323k
                    } else if (temp_long < 0) {
2406
0
                        cli_dbgmsg("pdf_parseobj: Encountered invalid negative genid (%ld).\n", temp_long);
2407
0
                        return;
2408
0
                    }
2409
323k
                    genid = (unsigned long)temp_long;
2410
2411
323k
                    objid |= genid & 0xff;
2412
2413
323k
                    q2 = pdf_nextobject(q2, dict_remaining);
2414
323k
                    if (q2 && *q2 == 'R') {
2415
285k
                        struct pdf_obj *obj2;
2416
2417
285k
                        cli_dbgmsg("pdf_parseobj: found %s stored in indirect object %lu %lu\n", pdfname, objid >> 8, objid & 0xff);
2418
285k
                        obj2 = find_obj(pdf, obj, objid);
2419
285k
                        if (obj2) {
2420
10.6k
                            enum pdf_objflags flag = OBJ_STREAM;
2421
2422
10.6k
                            switch (objstate) {
2423
985
                                case STATE_JAVASCRIPT:
2424
985
                                    flag = OBJ_JAVASCRIPT;
2425
985
                                    break;
2426
1.41k
                                case STATE_OPENACTION:
2427
1.41k
                                    flag = OBJ_OPENACTION;
2428
1.41k
                                    break;
2429
8.23k
                                case STATE_CONTENTS:
2430
8.23k
                                    flag = OBJ_CONTENTS;
2431
8.23k
                                    break;
2432
0
                                default:
2433
0
                                    cli_dbgmsg("pdf_parseobj: Unexpected object type\n");
2434
0
                                    return;
2435
10.6k
                            }
2436
2437
10.6k
                            obj->flags &= ~(1 << flag); /* Disable flag for current object ...                   */
2438
10.6k
                            obj2->flags |= 1 << flag;   /* ... and set the flag for the indirect object instead! */
2439
275k
                        } else {
2440
275k
                            pdfobj_flag(pdf, obj, BAD_INDOBJ);
2441
275k
                        }
2442
285k
                    }
2443
323k
                }
2444
357k
            }
2445
2446
421k
            objstate = STATE_NONE;
2447
421k
        }
2448
9.07M
    }
2449
2450
47.1M
    for (i = 0; i < sizeof(pdfname_actions) / sizeof(pdfname_actions[0]); i++) {
2451
46.1M
        const struct pdfname_action *act = &pdfname_actions[i];
2452
2453
46.1M
        if ((obj->flags & (1 << act->set_objflag)) &&
2454
46.1M
            act->from_state == STATE_FILTER &&
2455
46.1M
            act->to_state == STATE_FILTER &&
2456
46.1M
            act->set_objflag != OBJ_FILTER_CRYPT &&
2457
46.1M
            act->set_objflag != OBJ_FILTER_STANDARD) {
2458
534k
            filters++;
2459
534k
        }
2460
46.1M
    }
2461
2462
1.00M
    if (filters > 2) {
2463
        /* more than 2 non-crypt filters */
2464
24.9k
        pdfobj_flag(pdf, obj, MANY_FILTERS);
2465
24.9k
    }
2466
2467
1.00M
    if (obj->flags & ((1 << OBJ_SIGNED) | KNOWN_FILTERS))
2468
305k
        obj->flags &= ~(1 << OBJ_FILTER_UNKNOWN);
2469
2470
1.00M
    if (obj->flags & (1 << OBJ_FILTER_UNKNOWN))
2471
21.4k
        pdfobj_flag(pdf, obj, UNKNOWN_FILTER);
2472
2473
1.00M
    cli_dbgmsg("pdf_parseobj: %u %u obj flags: %02x\n", obj->id >> 8, obj->id & 0xff, obj->flags);
2474
1.00M
}
2475
2476
/**
 * @brief   Locate a key inside a dictionary buffer and return a pointer to its value.
 *
 * The key is searched for as a raw byte sequence. The value is taken to start at
 * the next object after the key, backed up over any directly preceding '<' or
 * '\n' characters so that a dictionary value keeps its opening brackets.
 *
 * @param q0            Pointer to the start of the dictionary bytes.
 * @param[in,out] len   In: The number of bytes available at q0.
 *                      Out: The number of bytes from the returned value
 *                           pointer to the end of that buffer. Note that it is
 *                           already reduced by the key's offset if the key is
 *                           found but no value follows it.
 * @param key           Null terminated 'key' to search for.
 * @return const char*  Address of the key's 'value', or NULL if len is not
 *                      positive, q0 is NULL, the key is absent, or no value
 *                      object follows the key.
 */
static const char *pdf_getdict(const char *q0, int *len, const char *key)
{
    const char *key_pos = NULL;
    const char *value   = NULL;

    if (*len <= 0) {
        cli_dbgmsg("pdf_getdict: bad length %d\n", *len);
        return NULL;
    }

    if (NULL == q0) {
        return NULL;
    }

    /* Step 1: where is the key? */
    key_pos = cli_memstr(q0, *len, key, strlen(key));
    if (NULL == key_pos) {
        cli_dbgmsg("pdf_getdict: %s not found in dict\n", key);
        return NULL;
    }

    *len -= key_pos - q0;

    /* Step 2: where does the object following the key begin? */
    value = pdf_nextobject(key_pos + 1, *len - 1);
    if (NULL == value) {
        cli_dbgmsg("pdf_getdict: %s is invalid in dict\n", key);
        return NULL;
    }

    /* Step 3: if the value is a dictionary object, include the < > brackets. */
    for (; value > key_pos; value--) {
        if (value[-1] != '<' && value[-1] != '\n') {
            break;
        }
    }

    *len -= value - key_pos;
    return value;
}
2522
2523
/**
 * @brief Read the value string from a PDF dictionary key/value pair.
 *
 * Two value forms are understood:
 *  - a literal string "( ... )", with balanced nested parentheses and, unless
 *    noescape is set, backslash escapes decoded;
 *  - a hexadecimal string "< ... >", decoded with cli_hex2str_to().
 *
 * @param q0            A pointer into the PDF dictionary.
 * @param len           The number of bytes available at q0.
 * @param key           The key we're looking for.
 * @param [out] slen    [Optional] The length of the output string, not counting
 *                      the terminating NUL. Set to 0 on failure.
 * @param [out] qend    [Optional] For a literal string, the position just past
 *                      the closing ')'; for a hex string, the position of the
 *                      closing '>'. Set to q0 on failure.
 * @param noescape      Select 'true' to ignore escape characters, 'false' to process them.
 * @return char*        A newly allocated, NUL-terminated copy of the value that
 *                      the caller must free(), or NULL if the key is missing or
 *                      the value is not a well-formed string.
 */
static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *slen, const char **qend, bool noescape)
{
    char *s, *s0;
    const char *start, *q, *end;

    if (slen)
        *slen = 0;

    if (qend)
        *qend = q0;

    q = pdf_getdict(q0, &len, key);
    if (!q || len <= 0)
        return NULL;

    if (*q == '(') {
        int paren = 1;
        start     = ++q;
        len--;

        /* Find the matching ')', honouring nested parentheses and skipping escaped characters. */
        for (; paren > 0 && len > 0; q++, len--) {
            switch (*q) {
                case '(':
                    paren++;
                    break;
                case ')':
                    paren--;
                    break;
                case '\\':
                    q++;
                    len--;
                    break;
                default:
                    break;
            }
        }

        if (len <= 0) {
            cli_errmsg("pdf_readstring: Invalid, truncated dictionary.\n");
            return NULL;
        }

        if (qend)
            *qend = q;

        q--; /* back onto the closing ')' */
        len = q - start;

        /* Decoding never produces more bytes than it consumes, so len + 1 is enough. */
        s0 = s = cli_max_malloc(len + 1);
        if (!s) {
            cli_errmsg("pdf_readstring: Unable to allocate buffer\n");
            return NULL;
        }

        end = start + len;
        if (noescape) {
            memcpy(s0, start, len);
            s = s0 + len;
        } else {
            for (q = start; q < end; q++) {
                if (*q != '\\') {
                    *s++ = *q;
                } else {
                    q++;
                    switch (*q) {
                        case 'n':
                            *s++ = '\n';
                            break;
                        case 'r':
                            *s++ = '\r';
                            break;
                        case 't':
                            *s++ = '\t';
                            break;
                        case 'b':
                            *s++ = '\b';
                            break;
                        case 'f':
                            *s++ = '\f';
                            break;
                        case '(': /* fall-through */
                        case ')': /* fall-through */
                        case '\\':
                            *s++ = *q;
                            break;
                        case '\n':
                            /* line continuation: ignore */
                            break;
                        case '\r':
                            /* line continuation: ignore, together with a following '\n' */
                            if (q + 1 < end && q[1] == '\n')
                                q++;
                            break;
                        case '0':
                        case '1':
                        case '2':
                        case '3':
                        case '4':
                        case '5':
                        case '6':
                        case '7': {
                            /*
                             * Octal escape: one, two or three octal digits.
                             * Only characters that really are octal digits are
                             * consumed, and high-order overflow is discarded.
                             */
                            unsigned int value = 0;
                            int ndigits        = 0;

                            while (ndigits < 3 && q < end && *q >= '0' && *q <= '7') {
                                value = (value << 3) + (unsigned int)(*q - '0');
                                q++;
                                ndigits++;
                            }

                            /* The loop increment will step onto the first unconsumed character. */
                            q--;
                            *s++ = (char)(value & 0xff);
                            break;
                        }
                        default:
                            /* Unrecognized escape: keep the backslash and let the
                             * next iteration copy the character that followed it. */
                            *s++ = '\\';
                            q--;
                            break;
                    }
                }
            }
        }

        *s++ = '\0';
        if (slen)
            *slen = s - s0 - 1;

        return s0;
    }

    if ((*q == '<') && (len >= 3)) {
        start = ++q;
        len -= 1;
        // skip newlines after <
        while (len > 0 && *start == '\n') {
            start = ++q;
            len -= 1;
        }

        /* Skipping newlines may have used up the buffer; without this check
         * (len - 1) would wrap to a huge size_t and memchr would over-read. */
        if (len <= 1)
            return NULL;

        q = memchr(q + 1, '>', len - 1);
        if (!q)
            return NULL;

        if (qend)
            *qend = q;

        s = cli_max_malloc((q - start) / 2 + 1);
        if (s == NULL) { /* oops, couldn't allocate memory */
            cli_dbgmsg("pdf_readstring: unable to allocate memory...\n");
            return NULL;
        }

        if (cli_hex2str_to(start, s, q - start)) {
            cli_dbgmsg("pdf_readstring: %s has bad hex value\n", key);
            free(s);
            return NULL;
        }

        s[(q - start) / 2] = '\0';
        if (slen)
            *slen = (q - start) / 2;

        return s;
    }

    cli_dbgmsg("pdf_readstring: %s is invalid string in dict\n", key);
    return NULL;
}
2694
2695
/**
 * @brief Read a name value ("/Something") stored under a dictionary key.
 *
 * The returned text is the name without its leading '/', ending before the
 * next '/', before a ">>", at a NUL byte, or at the end of the buffer, with
 * trailing whitespace removed.
 *
 * @param q     Pointer to the dictionary bytes.
 * @param len   Number of bytes available at q.
 * @param key   Null terminated key to search for.
 * @return char*  A newly allocated, NUL-terminated copy of the name that the
 *                caller must free(), or NULL if the key is missing, its value
 *                is not a name, or allocation fails.
 */
static char *pdf_readval(const char *q, int len, const char *key)
{
    const char *end;
    char *s;

    q = pdf_getdict(q, &len, key);
    if (!q || len <= 0)
        return NULL;

    while (len > 0 && *q && *q == ' ') {
        q++;
        len--;
    }

    /* Do not look at *q if skipping spaces consumed the whole buffer. */
    if (len <= 0 || *q != '/')
        return NULL;

    q++;
    len--;
    end = q;

    while (len > 0 && *end && !(*end == '/' || (len > 1 && end[0] == '>' && end[1] == '>'))) {
        end++;
        len--;
    }

    /* end-of-buffer whitespace trimming; the cast keeps isspace() defined for bytes >= 0x80 */
    while (end > q && isspace((unsigned char)end[-1])) {
        end--;
    }

    s = cli_max_malloc(end - q + 1);
    if (!s)
        return NULL;

    memcpy(s, q, end - q);
    s[end - q] = '\0';

    return s;
}
2737
2738
static int pdf_readint(const char *q0, int len, const char *key)
2739
361k
{
2740
361k
    long value    = 0;
2741
361k
    const char *q = pdf_getdict(q0, &len, key);
2742
2743
361k
    if (q == NULL) {
2744
49.7k
        value = -1;
2745
311k
    } else if (CL_SUCCESS != cli_strntol_wrap(q, (size_t)len, 0, 10, &value)) {
2746
23.4k
        value = -1;
2747
23.4k
    }
2748
361k
    return value;
2749
361k
}
2750
2751
/**
 * @brief Read a boolean ("true" / "false") value stored under a dictionary key.
 *
 * @param q0       Pointer to the dictionary bytes.
 * @param len      Number of bytes available at q0.
 * @param key      Null terminated key to search for.
 * @param Default  Value to return when the key is missing or its value is
 *                 neither "true" nor "false".
 * @return int     1 for "true", 0 for "false", otherwise Default.
 */
static int pdf_readbool(const char *q0, int len, const char *key, int Default)
{
    const char *q = pdf_getdict(q0, &len, key);

    /* "true" is the shortest valid value: anything shorter cannot match. */
    if (!q || len < 4)
        return Default;

    if (!strncmp(q, "true", 4))
        return 1;

    /* "false" needs one more byte than "true"; check the length separately so
     * a "true" that ends exactly at the end of the buffer is still accepted. */
    if (len >= 5 && !strncmp(q, "false", 5))
        return 0;

    cli_dbgmsg("pdf_readbool: invalid value for %s bool\n", key);

    return Default;
}
2768
2769
/*
 * 32-byte padding string. The bytes match the password padding string given
 * for the standard security handler in the PDF specification.
 * It contains embedded 0x00 bytes, so it must be used with an explicit length
 * and never with strlen()/strcpy().
 */
static const char *key_padding =
    "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41"
    "\x64\x00\x4e\x56\xff\xfa\x01\x08"
    "\x2e\x2e\x00\xB6\xD0\x68\x3E\x80"
    "\x2F\x0C\xA9\xFE\x64\x53\x69\x7A";
2772
2773
static void dbg_printhex(const char *msg, const char *hex, unsigned len)
2774
75.6k
{
2775
75.6k
    if (cli_debug_flag) {
2776
0
        char *kh = cli_str2hex(hex, len);
2777
2778
0
        cli_dbgmsg("cli_pdf: %s: %s\n", msg, kh);
2779
2780
0
        free(kh);
2781
0
    }
2782
75.6k
}
2783
2784
/**
 * @brief Compute the hash of the password concatenated with the validation salt and (for owner-password checks) the U string.
 *
 * Some details and comments for how to compute this hash come from the PyPDF project:
 * https://github.com/py-pdf/pypdf/blob/3.17.4/pypdf/_encryption.py#L568
 *
 * @param password  The password to hash.
 * @param pwlen     The length of the password. Lengths above 127 bytes are
 *                  truncated to 127, which also keeps every write inside the
 *                  fixed-size work buffer.
 * @param salt      The validation salt. Only the first 8 bytes are read.
 * @param hash      The resulting hash (32 bytes are written).
 * @param U         [Optional] The U string (for owner-password checks); 48 bytes are read.
 */
static void compute_hash_r6(const char *password, size_t pwlen, const unsigned char salt[16], unsigned char hash[32], const char *U)
{
    /* Room for 64 copies of (password + block + U): at most 127 + 64 + 48 bytes each. */
    unsigned char data[(128 + 64 + 48) * 64];
    unsigned char block[64];
    int32_t block_size = 32;
    size_t in_data_len = 0, out_data_len;
    int32_t i, j, sum;
    uint8_t sha256[32], sha384[48], sha512[64];

    /* PDF 2.0 passwords are limited to 127 bytes; a longer input would overrun data[]. */
    if (pwlen > 127)
        pwlen = 127;

    /*
     * Compute a SHA-256 hash of the UTF-8 password concatenated with the 8 bytes of the owner or user validation salt.
     */
    memcpy(data, password, pwlen);
    memcpy(data + pwlen, salt, 8);

    if (NULL != U) {
        // If it's for the owner password check, we also concatenate the 48-byte U string.
        memcpy(data + pwlen + 8, U, 48);

        cl_sha256(data, pwlen + 8 + 48, block, NULL);
    } else {
        cl_sha256(data, pwlen + 8, block, NULL);
    }

    /*
     * Run at least 64 rounds. After that, keep going while the last byte of the
     * previous round's encrypted data, plus 32, is still greater than the round
     * count. The second operand is only evaluated once i >= 64, by which time
     * in_data_len has been set.
     */
    for (i = 0; i < 64 || i < (data[(in_data_len * 64) - 1] + 32); i++) {
        memcpy(data, password, pwlen);
        memcpy(data + pwlen, block, block_size);

        in_data_len = pwlen + block_size;

        if (NULL != U) {
            // If it's for the owner password check, we also concatenate the 48-byte U string.
            memcpy(data + pwlen + block_size, U, 48);
            in_data_len += 48;
        }

        /* Repeat the first sequence so that data holds 64 copies of it. */
        for (j = 1; j < 64; j++)
            memcpy(data + j * in_data_len, data, in_data_len);

        /* Encrypt in place: first 16 bytes of block are the key, the next 16 the IV. */
        aes_128cbc_encrypt(data, in_data_len * 64, data, &out_data_len, block, 16, block + 16);

        /* The first 16 encrypted bytes, summed modulo 3, select the next hash function. */
        for (j = 0, sum = 0; j < 16; j++)
            sum += data[j];

        block_size = 32 + (sum % 3) * 16;
        switch (block_size) {
            case 32:
                cl_sha256(data, in_data_len * 64, sha256, NULL);
                memcpy(block, sha256, 32);
                break;

            case 48:
                cl_sha384(data, in_data_len * 64, sha384, NULL);
                memcpy(block, sha384, 48);
                break;

            case 64:
                cl_sha512(data, in_data_len * 64, sha512, NULL);
                memcpy(block, sha512, 64);
                break;

            default:
                /* Not reachable: block_size is always 32, 48 or 64. */
                break;
        }
    }

    memcpy(hash, block, 32);
}
2861
2862
/**
2863
 * @brief Check if the owner password matches an empty password.
2864
 *
2865
 * Will set the DECRYPTABLE_PDF flag if the owner password is empty.
2866
 * Will also set the key and keylen fields in the pdf_struct.
2867
 *
2868
 * Some details and comments for how to check the owner password come from the PyPDF project:
2869
 * https://github.com/py-pdf/pypdf/blob/3.17.4/pypdf/_encryption.py#L397
2870
 *
2871
 * @param pdf       The PDF context.
2872
 * @param R         The encryption version.
2873
 * @param O         The /O string.
2874
 * @param U         The /U string.
2875
 * @param OE        The /OE string.
2876
 * @param OE_len    The length of the /OE string.
2877
 */
2878
static void check_owner_password(struct pdf_struct *pdf, int R,
2879
                                 const char *O, const char *U,
2880
                                 const char *OE, size_t OE_len)
2881
9.95k
{
2882
9.95k
    bool password_empty = false;
2883
2884
9.95k
    dbg_printhex("U: ", U, 32);
2885
9.95k
    dbg_printhex("O: ", O, 32);
2886
2887
9.95k
    switch (R) {
2888
3.93k
        case 6: {
2889
3.93k
            unsigned char hash[32], validationkey[32];
2890
2891
3.93k
            size_t pwlen    = 0;
2892
3.93k
            char password[] = "";
2893
2894
3.93k
            if (NULL == OE) {
2895
466
                cli_dbgmsg("check_owner_password: Missing OE value!\n");
2896
466
                noisy_warnmsg("check_owner_password: Missing OE value!\n");
2897
466
                goto done;
2898
466
            }
2899
2900
3.46k
            dbg_printhex("OE: ", OE, OE_len);
2901
2902
            /*
2903
             * Test the password against the owner key by computing the SHA-256 hash of the UTF-8 password concatenated
2904
             * with the 8 bytes of owner validation salt, concatenated with the 48-byte U string.
2905
             */
2906
3.46k
            compute_hash_r6(
2907
3.46k
                password,
2908
3.46k
                pwlen,
2909
3.46k
                (const unsigned char *)(O + 32), // owner validation salt
2910
3.46k
                validationkey,
2911
3.46k
                U);
2912
2913
            /* If the 32-byte result matches the first 32 bytes of the O string, this is the owner password. */
2914
3.46k
            if (0 != memcmp(O, validationkey, sizeof(validationkey))) {
2915
3.46k
                cli_dbgmsg("check_owner_password: Owner password check did not match!\n");
2916
3.46k
                break;
2917
3.46k
            }
2918
2919
            /*
2920
             * Compute an intermediate owner key by computing the SHA-256 hash of the UTF-8 password concatenated with
2921
             * the 8 bytes of owner key salt, concatenated with the 48-byte U string.
2922
             */
2923
4
            compute_hash_r6(
2924
4
                password,
2925
4
                pwlen,
2926
4
                (const unsigned char *)(O + 40), // owner key salt
2927
4
                hash,
2928
4
                U);
2929
2930
4
            if (OE_len != 32) {
2931
0
                cli_dbgmsg("check_owner_password: OE length is not 32: %zu\n", OE_len);
2932
0
                noisy_warnmsg("check_owner_password: OE length is not 32: %zu\n", OE_len);
2933
4
            } else {
2934
4
                pdf->keylen = 32;
2935
4
                pdf->key    = cli_max_malloc(pdf->keylen);
2936
4
                if (!pdf->key) {
2937
0
                    cli_errmsg("check_owner_password: Cannot allocate memory for pdf->key\n");
2938
0
                    goto done;
2939
0
                }
2940
2941
4
                aes_256cbc_decrypt((const unsigned char *)OE, &OE_len, (unsigned char *)(pdf->key), (char *)hash, 32, 0);
2942
4
                dbg_printhex("check_owner_password: Candidate encryption key", pdf->key, pdf->keylen);
2943
2944
4
                password_empty = true;
2945
4
            }
2946
2947
4
            break;
2948
4
        }
2949
6.01k
        default: {
2950
6.01k
            cli_dbgmsg("check_owner_password: Unknown or unsupported encryption version. R: %d\n", R);
2951
6.01k
            noisy_warnmsg("check_owner_password: Unknown or unsupported encryption version. R: %d\n", R);
2952
6.01k
        }
2953
9.95k
    }
2954
2955
9.48k
    if (password_empty) {
2956
        /* The key we computed above is the key used to encrypt the streams. We could decrypt it now if we wanted to */
2957
4
        pdf->flags |= 1 << DECRYPTABLE_PDF;
2958
2959
4
        cli_dbgmsg("check_owner_password: encrypted PDF found, owner password is empty, will attempt to decrypt\n");
2960
4
        noisy_msg(pdf, "check_owner_password: encrypted PDF found, owner password is empty, will attempt to decrypt\n");
2961
9.48k
    } else {
2962
        /* The key is not valid, we would need the user or the owner password to decrypt */
2963
9.48k
        cli_dbgmsg("check_owner_password: encrypted PDF found but cannot decrypt with empty owner password\n");
2964
9.48k
        noisy_warnmsg("check_owner_password: encrypted PDF found but cannot decrypt with empty owner password\n");
2965
9.48k
    }
2966
2967
9.95k
done:
2968
2969
9.95k
    return;
2970
9.48k
}
2971
2972
static void check_user_password(struct pdf_struct *pdf, int R, const char *O,
2973
                                const char *U, int32_t P, int EM,
2974
                                const char *UE, size_t UE_len,
2975
                                unsigned length)
2976
9.95k
{
2977
9.95k
    unsigned i;
2978
9.95k
    uint8_t result[16];
2979
9.95k
    char data[32];
2980
9.95k
    struct arc4_state arc4;
2981
9.95k
    bool password_empty = false;
2982
2983
9.95k
    dbg_printhex("U: ", U, 32);
2984
9.95k
    dbg_printhex("O: ", O, 32);
2985
2986
9.95k
    switch (R) {
2987
2.00k
        case 2:
2988
4.64k
        case 3:
2989
5.98k
        case 4: {
2990
5.98k
            unsigned char *d;
2991
5.98k
            size_t sz = 68 + pdf->fileIDlen + (R >= 4 && !EM ? 4 : 0);
2992
5.98k
            d         = calloc(1, sz);
2993
2994
5.98k
            if (!(d))
2995
0
                goto done;
2996
2997
5.98k
            memcpy(d, key_padding, 32);
2998
5.98k
            memcpy(d + 32, O, 32);
2999
5.98k
            P = le32_to_host(P);
3000
5.98k
            memcpy(d + 64, &P, 4);
3001
5.98k
            memcpy(d + 68, pdf->fileID, pdf->fileIDlen);
3002
3003
            /* 7.6.3.3 Algorithm 2 */
3004
            /* empty password, password == padding */
3005
5.98k
            if (R >= 4 && !EM) {
3006
0
                uint32_t v = 0xFFFFFFFF;
3007
0
                memcpy(d + 68 + pdf->fileIDlen, &v, 4);
3008
0
            }
3009
3010
5.98k
            cl_hash_data("md5", d, sz, result, NULL);
3011
5.98k
            free(d);
3012
5.98k
            if (length > 128)
3013
89
                length = 128;
3014
5.98k
            if (R >= 3) {
3015
                /* Yes, this really is on purpose */
3016
203k
                for (i = 0; i < 50; i++)
3017
199k
                    cl_hash_data("md5", result, length / 8, result, NULL);
3018
3.98k
            }
3019
5.98k
            if (R == 2)
3020
2.00k
                length = 40;
3021
3022
5.98k
            pdf->keylen = length / 8;
3023
5.98k
            pdf->key    = cli_max_malloc(pdf->keylen);
3024
5.98k
            if (!pdf->key)
3025
0
                goto done;
3026
3027
5.98k
            memcpy(pdf->key, result, pdf->keylen);
3028
5.98k
            dbg_printhex("md5", (const char *)result, 16);
3029
5.98k
            dbg_printhex("Candidate encryption key", pdf->key, pdf->keylen);
3030
3031
            /* 7.6.3.3 Algorithm 6 */
3032
5.98k
            if (R == 2) {
3033
                /* 7.6.3.3 Algorithm 4 */
3034
2.00k
                memcpy(data, key_padding, 32);
3035
2.00k
                if (false == arc4_init(&arc4, (const uint8_t *)(pdf->key), pdf->keylen)) {
3036
0
                    noisy_warnmsg("check_user_password: failed to init arc4\n");
3037
0
                    goto done;
3038
0
                }
3039
2.00k
                arc4_apply(&arc4, (uint8_t *)data, 32);
3040
2.00k
                dbg_printhex("computed U (R2)", data, 32);
3041
2.00k
                if (!memcmp(data, U, 32))
3042
470
                    password_empty = true;
3043
3.98k
            } else {
3044
                // R is 3 or 4
3045
3.98k
                unsigned len = pdf->keylen;
3046
3.98k
                unsigned char *d;
3047
3048
3.98k
                d = calloc(1, 32 + pdf->fileIDlen);
3049
3.98k
                if (!(d))
3050
0
                    goto done;
3051
3052
                /* 7.6.3.3 Algorithm 5 */
3053
3.98k
                memcpy(d, key_padding, 32);
3054
3.98k
                memcpy(d + 32, pdf->fileID, pdf->fileIDlen);
3055
3.98k
                cl_hash_data("md5", d, 32 + pdf->fileIDlen, result, NULL);
3056
3.98k
                memcpy(data, pdf->key, len);
3057
3058
3.98k
                if (false == arc4_init(&arc4, (const uint8_t *)data, len)) {
3059
0
                    noisy_warnmsg("check_user_password: failed to init arc4\n");
3060
0
                    goto done;
3061
0
                }
3062
3.98k
                arc4_apply(&arc4, result, 16);
3063
79.7k
                for (i = 1; i <= 19; i++) {
3064
75.7k
                    unsigned j;
3065
3066
775k
                    for (j = 0; j < len; j++)
3067
700k
                        data[j] = pdf->key[j] ^ i;
3068
3069
75.7k
                    if (false == arc4_init(&arc4, (const uint8_t *)data, len)) {
3070
0
                        noisy_warnmsg("check_user_password: failed to init arc4\n");
3071
0
                        goto done;
3072
0
                    }
3073
75.7k
                    arc4_apply(&arc4, result, 16);
3074
75.7k
                }
3075
3076
3.98k
                dbg_printhex("fileID", pdf->fileID, pdf->fileIDlen);
3077
3.98k
                dbg_printhex("computed U (R>=3)", (const char *)result, 16);
3078
3.98k
                if (!memcmp(result, U, 16))
3079
942
                    password_empty = true;
3080
3.98k
                free(d);
3081
3.98k
            }
3082
3083
5.98k
            break;
3084
5.98k
        }
3085
5.98k
        case 5: {
3086
31
            uint8_t result2[32];
3087
3088
            /* supplement to ISO3200, 3.5.2 Algorithm 3.11 */
3089
            /* user validation salt */
3090
31
            cl_sha256(U + 32, 8, result2, NULL);
3091
31
            dbg_printhex("Computed U", (const char *)result2, 32);
3092
31
            if (!memcmp(result2, U, 32)) {
3093
                /* Algorithm 3.2a could be used to recover encryption key */
3094
0
                cl_sha256(U + 40, 8, result2, NULL);
3095
3096
0
                if (UE_len != 32) {
3097
0
                    cli_dbgmsg("check_user_password: UE length is not 32: %zu\n", UE_len);
3098
0
                    noisy_warnmsg("check_user_password: UE length is not 32: %zu\n", UE_len);
3099
0
                } else {
3100
0
                    pdf->keylen = 32;
3101
0
                    pdf->key    = cli_max_malloc(pdf->keylen);
3102
0
                    if (!pdf->key) {
3103
0
                        cli_errmsg("check_user_password: Cannot allocate memory for pdf->key\n");
3104
0
                        goto done;
3105
0
                    }
3106
3107
0
                    aes_256cbc_decrypt((const unsigned char *)UE, &UE_len, (unsigned char *)(pdf->key), (char *)result2, 32, 0);
3108
0
                    dbg_printhex("check_user_password: Candidate encryption key", pdf->key, pdf->keylen);
3109
3110
0
                    password_empty = true;
3111
0
                }
3112
0
            }
3113
3114
31
            break;
3115
31
        }
3116
3.93k
        case 6: {
3117
3.93k
            unsigned char hash[32], validationkey[32];
3118
3119
3.93k
            size_t pwlen    = 0;
3120
3.93k
            char password[] = "";
3121
3122
3.93k
            if (NULL == UE) {
3123
371
                cli_dbgmsg("check_user_password: Missing UE value!\n");
3124
371
                noisy_warnmsg("check_user_password: Missing UE value!\n");
3125
371
                goto done;
3126
371
            }
3127
3128
3.56k
            dbg_printhex("UE: ", UE, UE_len);
3129
3130
            /*
3131
             * Test the password against the user key by computing the SHA-256 hash of the UTF-8 password concatenated
3132
             * with the 8 bytes of user validation salt.
3133
             */
3134
3.56k
            compute_hash_r6(
3135
3.56k
                password,
3136
3.56k
                pwlen,
3137
3.56k
                (const unsigned char *)(U + 32), // user validation salt
3138
3.56k
                validationkey,
3139
3.56k
                NULL); // no U string for user password check
3140
3141
            /* If the 32-byte result matches the first 32 bytes of the U string, this is the user password. */
3142
3.56k
            if (0 != memcmp(U, validationkey, sizeof(validationkey))) {
3143
776
                cli_dbgmsg("check_user_password: User password check did not match!\n");
3144
776
                break;
3145
776
            }
3146
3147
            /*
3148
             * Compute an intermediate user key by computing the SHA-256 hash of the UTF-8 password concatenated with
3149
             * the 8 bytes of user key salt.
3150
             */
3151
2.78k
            compute_hash_r6(
3152
2.78k
                password,
3153
2.78k
                pwlen,
3154
2.78k
                (const unsigned char *)(U + 40), // user key salt
3155
2.78k
                hash,
3156
2.78k
                NULL); // no U string for user password check
3157
3158
2.78k
            if (UE_len != 32) {
3159
281
                cli_dbgmsg("check_user_password: UE length is not 32: %zu\n", UE_len);
3160
281
                noisy_warnmsg("check_user_password: UE length is not 32: %zu\n", UE_len);
3161
2.50k
            } else {
3162
2.50k
                pdf->keylen = 32;
3163
2.50k
                pdf->key    = cli_max_malloc(pdf->keylen);
3164
2.50k
                if (!pdf->key) {
3165
0
                    cli_errmsg("check_user_password: Cannot allocate memory for pdf->key\n");
3166
0
                    goto done;
3167
0
                }
3168
3169
2.50k
                aes_256cbc_decrypt((const unsigned char *)UE, &UE_len, (unsigned char *)(pdf->key), (char *)hash, 32, 0);
3170
2.50k
                dbg_printhex("check_user_password: Candidate encryption key", pdf->key, pdf->keylen);
3171
3172
2.50k
                password_empty = true;
3173
2.50k
            }
3174
3175
2.78k
            break;
3176
2.78k
        }
3177
2.78k
        default: {
3178
            /* Supported R is in {2,3,4,5} */
3179
0
            cli_dbgmsg("check_user_password: R value out of range\n");
3180
0
            noisy_warnmsg("check_user_password: R value out of range\n");
3181
0
        }
3182
9.95k
    }
3183
3184
9.57k
    if (password_empty) {
3185
3.91k
        cli_dbgmsg("check_user_password: user password is empty\n");
3186
3.91k
        noisy_msg(pdf, "check_user_password: encrypted PDF found, user password is empty, will attempt to decrypt\n");
3187
        /* The key we computed above is the key used to encrypt the streams.
3188
         * We could decrypt it now if we wanted to */
3189
3.91k
        pdf->flags |= 1 << DECRYPTABLE_PDF;
3190
5.66k
    } else {
3191
        /* the key is not valid, we would need the user or the owner password to decrypt */
3192
5.66k
        cli_dbgmsg("check_user_password: user/owner password would be required for decryption\n");
3193
5.66k
        noisy_warnmsg("check_user_password: encrypted PDF found, user password is NOT empty, cannot decrypt!\n");
3194
5.66k
    }
3195
3196
9.95k
done:
3197
9.95k
    return;
3198
9.57k
}
3199
3200
enum enc_method parse_enc_method(const char *dict, unsigned len, const char *key, enum enc_method def)
3201
32.0k
{
3202
32.0k
    const char *q;
3203
32.0k
    char *CFM           = NULL;
3204
32.0k
    enum enc_method ret = ENC_UNKNOWN;
3205
3206
32.0k
    if (!key)
3207
15.6k
        return def;
3208
3209
16.4k
    if (!strcmp(key, "Identity"))
3210
69
        return ENC_IDENTITY;
3211
3212
16.3k
    q = pdf_getdict(dict, (int *)(&len), key);
3213
16.3k
    if (!q)
3214
3.22k
        return def;
3215
3216
13.1k
    CFM = pdf_readval(q, len, "/CFM");
3217
13.1k
    if (CFM) {
3218
8.74k
        cli_dbgmsg("parse_enc_method: %s CFM: %s\n", key, CFM);
3219
8.74k
        if (!strncmp(CFM, "V2", 2))
3220
41
            ret = ENC_V2;
3221
8.70k
        else if (!strncmp(CFM, "AESV2", 5))
3222
857
            ret = ENC_AESV2;
3223
7.85k
        else if (!strncmp(CFM, "AESV3", 5))
3224
5.34k
            ret = ENC_AESV3;
3225
2.50k
        else if (!strncmp(CFM, "None", 4))
3226
403
            ret = ENC_NONE;
3227
3228
8.74k
        free(CFM);
3229
8.74k
    }
3230
3231
13.1k
    return ret;
3232
16.3k
}
3233
3234
void pdf_handle_enc(struct pdf_struct *pdf)
3235
433k
{
3236
433k
    struct pdf_obj *obj;
3237
433k
    uint32_t len, n, R, P, length, EM = 1, i, oulen;
3238
3239
433k
    char *O       = NULL;
3240
433k
    char *OE      = NULL;
3241
433k
    size_t OE_len = 0;
3242
3243
433k
    char *U       = NULL;
3244
433k
    char *UE      = NULL;
3245
433k
    size_t UE_len = 0;
3246
3247
433k
    char *StmF = NULL;
3248
433k
    char *StrF = NULL;
3249
433k
    char *EFF  = NULL;
3250
3251
433k
    const char *q, *q2;
3252
3253
433k
    if (pdf->enc_objid == ~0u)
3254
395k
        return;
3255
38.5k
    if (!pdf->fileID) {
3256
6.49k
        cli_dbgmsg("pdf_handle_enc: no file ID\n");
3257
6.49k
        noisy_warnmsg("pdf_handle_enc: no file ID\n");
3258
6.49k
        return;
3259
6.49k
    }
3260
3261
32.0k
    obj = find_obj(pdf, pdf->objs[0], pdf->enc_objid);
3262
32.0k
    if (!obj) {
3263
6.14k
        cli_dbgmsg("pdf_handle_enc: can't find encrypted object %d %d\n", pdf->enc_objid >> 8, pdf->enc_objid & 0xff);
3264
6.14k
        noisy_warnmsg("pdf_handle_enc: can't find encrypted object %d %d\n", pdf->enc_objid >> 8, pdf->enc_objid & 0xff);
3265
6.14k
        return;
3266
6.14k
    }
3267
3268
25.9k
    len = obj->size;
3269
3270
25.9k
    q = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
3271
25.9k
                      : (const char *)(obj->start + pdf->map);
3272
3273
25.9k
    O = U = UE = StmF = StrF = EFF = NULL;
3274
3275
25.9k
    pdf->enc_method_string       = ENC_UNKNOWN;
3276
25.9k
    pdf->enc_method_stream       = ENC_UNKNOWN;
3277
25.9k
    pdf->enc_method_embeddedfile = ENC_UNKNOWN;
3278
3279
25.9k
    q2 = cli_memstr(q, len, "/Standard", 9);
3280
25.9k
    if (!q2) {
3281
4.60k
        cli_dbgmsg("pdf_handle_enc: /Standard not found\n");
3282
4.60k
        noisy_warnmsg("pdf_handle_enc: /Standard not found\n");
3283
4.60k
        goto done;
3284
4.60k
    }
3285
3286
    /* we can have both of these:
3287
     * /AESV2/Length /Standard/Length
3288
     * /Length /Standard
3289
     * make sure we don't mistake AES's length for Standard's */
3290
21.3k
    length = pdf_readint(q2, len - (q2 - q), "/Length");
3291
21.3k
    if (length == ~0u)
3292
11.9k
        length = pdf_readint(q, len, "/Length");
3293
3294
21.3k
    if (length < 40) {
3295
2.51k
        cli_dbgmsg("pdf_handle_enc: invalid length: %d\n", length);
3296
2.51k
        length = 40;
3297
2.51k
    }
3298
3299
21.3k
    R = pdf_readint(q, len, "/R");
3300
21.3k
    if (R == ~0u) {
3301
1.08k
        cli_dbgmsg("pdf_handle_enc: invalid R\n");
3302
1.08k
        noisy_warnmsg("pdf_handle_enc: invalid R\n");
3303
1.08k
        goto done;
3304
1.08k
    }
3305
3306
20.2k
    if ((R > 6) || (R < 2)) {
3307
589
        cli_dbgmsg("pdf_handle_enc: R value outside supported range [2..6]\n");
3308
589
        noisy_warnmsg("pdf_handle_enc: R value outside supported range [2..6]\n");
3309
589
        goto done;
3310
589
    }
3311
3312
19.6k
    P = pdf_readint(q, len, "/P");
3313
19.6k
    if (R < 6) { // P field doesn't seem to be required for R6.
3314
11.6k
        if (P == ~0u) {
3315
1.94k
            cli_dbgmsg("pdf_handle_enc: invalid P\n");
3316
1.94k
            noisy_warnmsg("pdf_handle_enc: invalid P\n");
3317
1.94k
            goto done;
3318
1.94k
        }
3319
11.6k
    }
3320
3321
17.6k
    if (R < 5) {
3322
9.45k
        oulen = 32;
3323
9.45k
    } else {
3324
8.23k
        oulen = 48;
3325
8.23k
    }
3326
3327
17.6k
    if (R == 2 || R == 3) {
3328
7.11k
        pdf->enc_method_stream       = ENC_V2;
3329
7.11k
        pdf->enc_method_string       = ENC_V2;
3330
7.11k
        pdf->enc_method_embeddedfile = ENC_V2;
3331
10.5k
    } else if (R == 4 || R == 5 || R == 6) {
3332
10.5k
        EM        = pdf_readbool(q, len, "/EncryptMetadata", 1);
3333
10.5k
        StmF      = pdf_readval(q, len, "/StmF");
3334
10.5k
        StrF      = pdf_readval(q, len, "/StrF");
3335
10.5k
        EFF       = pdf_readval(q, len, "/EFF");
3336
10.5k
        n         = len;
3337
10.5k
        pdf->CF   = pdf_getdict(q, (int *)(&n), "/CF");
3338
10.5k
        pdf->CF_n = n;
3339
3340
10.5k
        if (StmF) {
3341
8.00k
            cli_dbgmsg("pdf_handle_enc: StmF: %s\n", StmF);
3342
8.00k
        }
3343
10.5k
        if (StrF) {
3344
7.76k
            cli_dbgmsg("pdf_handle_enc: StrF: %s\n", StrF);
3345
7.76k
        }
3346
10.5k
        if (EFF) {
3347
342
            cli_dbgmsg("pdf_handle_enc: EFF: %s\n", EFF);
3348
342
        }
3349
3350
10.5k
        pdf->enc_method_stream       = parse_enc_method(pdf->CF, n, StmF, ENC_IDENTITY);
3351
10.5k
        pdf->enc_method_string       = parse_enc_method(pdf->CF, n, StrF, ENC_IDENTITY);
3352
10.5k
        pdf->enc_method_embeddedfile = parse_enc_method(pdf->CF, n, EFF, pdf->enc_method_stream);
3353
3354
10.5k
        cli_dbgmsg("pdf_handle_enc: EncryptMetadata: %s\n", EM ? "true" : "false");
3355
3356
10.5k
        if (R == 4) {
3357
2.33k
            length = 128;
3358
8.23k
        } else {
3359
8.23k
            length = 256;
3360
3361
            /*
3362
             * Read the UE value (for checking user-password)
3363
             */
3364
8.23k
            n      = 0;
3365
8.23k
            UE     = pdf_readstring(q, len, "/UE", &n, NULL, false);
3366
8.23k
            UE_len = n;
3367
3368
            /*
3369
             * Read the OE value (for checking owner-password)
3370
             */
3371
8.23k
            n      = 0;
3372
8.23k
            OE     = pdf_readstring(q, len, "/OE", &n, NULL, false);
3373
8.23k
            OE_len = n;
3374
8.23k
        }
3375
10.5k
    }
3376
3377
17.6k
    if (length == ~0u)
3378
5.41k
        length = 40;
3379
3380
    /*
3381
     * Read the O value
3382
     */
3383
17.6k
    n = 0;
3384
17.6k
    O = pdf_readstring(q, len, "/O", &n, NULL, false);
3385
17.6k
    if (!O || n < oulen) {
3386
3.74k
        cli_dbgmsg("pdf_handle_enc: invalid O: %d\n", n);
3387
3.74k
        noisy_warnmsg("pdf_handle_enc: invalid O: %d\n", n);
3388
3.74k
        if (O) {
3389
1.81k
            dbg_printhex("invalid O", O, n);
3390
1.81k
        }
3391
3392
3.74k
        goto done;
3393
3.74k
    }
3394
13.9k
    if (n > oulen) {
3395
364k
        for (i = oulen; i < n; i++) {
3396
359k
            if (O[i]) {
3397
1.56k
                dbg_printhex("pdf_handle_enc: too long O", O, n);
3398
1.56k
                noisy_warnmsg("pdf_handle_enc: too long O: %u", n);
3399
1.56k
                goto done;
3400
1.56k
            }
3401
359k
        }
3402
6.67k
    }
3403
3404
    /*
3405
     * Read the U value
3406
     */
3407
12.3k
    n = 0;
3408
12.3k
    U = pdf_readstring(q, len, "/U", &n, NULL, false);
3409
12.3k
    if (!U || n < oulen) {
3410
1.12k
        cli_dbgmsg("pdf_handle_enc: invalid U: %u\n", n);
3411
1.12k
        noisy_warnmsg("pdf_handle_enc: invalid U: %u\n", n);
3412
1.12k
        if (U) {
3413
433
            dbg_printhex("invalid U", U, n);
3414
433
        }
3415
3416
1.12k
        goto done;
3417
1.12k
    }
3418
3419
11.2k
    if (n > oulen) {
3420
274k
        for (i = oulen; i < n; i++) {
3421
270k
            if (U[i]) {
3422
486
                dbg_printhex("too long U", U, n);
3423
486
                goto done;
3424
486
            }
3425
270k
        }
3426
4.22k
    }
3427
3428
10.7k
    cli_dbgmsg("pdf_handle_enc: Encrypt R: %d, P %x, length: %u\n", R, P, length);
3429
10.7k
    if (length % 8) {
3430
808
        cli_dbgmsg("pdf_handle_enc: wrong key length, not multiple of 8\n");
3431
808
        noisy_warnmsg("pdf_handle_enc: wrong key length, not multiple of 8\n");
3432
808
        goto done;
3433
808
    }
3434
3435
    // Check the owner password.
3436
9.95k
    check_owner_password(pdf, R, O, U, OE, OE_len);
3437
3438
9.95k
    if (NULL == pdf->key) {
3439
        // Wasn't the owner password, let's try the user password.
3440
9.95k
        check_user_password(pdf, R, O, U, P, EM, UE, UE_len, length);
3441
9.95k
    }
3442
3443
25.9k
done:
3444
25.9k
    free(O);
3445
25.9k
    free(OE);
3446
3447
25.9k
    free(U);
3448
25.9k
    free(UE);
3449
3450
25.9k
    free(StmF);
3451
25.9k
    free(StrF);
3452
25.9k
    free(EFF);
3453
25.9k
}
3454
3455
/**
3456
 * @brief Search pdf buffer for objects.  Parse each.
3457
 *
3458
 * Newly found objects will be extracted after completion when the extraction for loop continues.
3459
 *
3460
 * @param pdf           Pdf struct that keeps track of all information found in the PDF.
3461
 * @param objstm        Pointer to an object stream to parse.
3462
 *
3463
 * @return cl_error_t   Error code.
3464
 */
3465
cl_error_t pdf_find_and_parse_objs_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm)
3466
54.5k
{
3467
54.5k
    cl_error_t status   = CL_EFORMAT;
3468
54.5k
    cl_error_t retval   = CL_EPARSE;
3469
54.5k
    uint32_t badobjects = 0;
3470
54.5k
    size_t i            = 0;
3471
3472
54.5k
    struct pdf_obj *obj = NULL;
3473
3474
54.5k
    if ((NULL == objstm) || (NULL == objstm->streambuf)) {
3475
0
        status = CL_EARG;
3476
0
        goto done;
3477
0
    }
3478
3479
54.5k
    if ((0 == objstm->first) ||
3480
54.5k
        (0 == objstm->streambuf_len) ||
3481
54.5k
        (0 == objstm->n)) {
3482
3.31k
        cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Empty object stream.\n");
3483
3.31k
        goto done;
3484
3.31k
    }
3485
3486
51.2k
    if (objstm->first >= objstm->streambuf_len) {
3487
10.0k
        cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Invalid objstm values. Offset of first obj greater than stream length.\n");
3488
10.0k
        goto done;
3489
10.0k
    }
3490
3491
    /* Process each object */
3492
280k
    for (i = 0; i < objstm->n; i++) {
3493
254k
        obj = NULL;
3494
3495
254k
        if (cli_checktimelimit(pdf->ctx) != CL_SUCCESS) {
3496
0
            cli_dbgmsg("Timeout reached in the PDF parser while parsing object stream.\n");
3497
0
            status = CL_ETIMEOUT;
3498
0
            goto done;
3499
0
        }
3500
3501
        /* Find object */
3502
254k
        retval = pdf_findobj_in_objstm(pdf, objstm, &obj);
3503
254k
        if (retval != CL_SUCCESS) {
3504
14.7k
            if (retval != CL_BREAK) {
3505
14.7k
                cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Fewer objects in stream than expected: %u found, %u expected.\n",
3506
14.7k
                           objstm->nobjs_found, objstm->n);
3507
14.7k
                badobjects++;
3508
14.7k
                pdf->stats.ninvalidobjs++;
3509
14.7k
            }
3510
14.7k
            break;
3511
14.7k
        }
3512
3513
239k
        cli_dbgmsg("pdf_find_and_parse_objs_in_objstm: Found object %u %u in object stream at offset: %u\n", obj->id >> 8, obj->id & 0xff, obj->start);
3514
3515
239k
        if (cli_checktimelimit(pdf->ctx) != CL_SUCCESS) {
3516
0
            cli_dbgmsg("Timeout reached in the PDF parser while parsing object stream.\n");
3517
0
            status = CL_ETIMEOUT;
3518
0
            goto done;
3519
0
        }
3520
3521
        /* Parse object */
3522
239k
        pdf_parseobj(pdf, obj);
3523
239k
    }
3524
3525
41.2k
    if (badobjects) {
3526
14.7k
        status = CL_EFORMAT;
3527
14.7k
        goto done;
3528
14.7k
    }
3529
3530
26.4k
    status = CL_SUCCESS;
3531
3532
54.5k
done:
3533
54.5k
    return status;
3534
26.4k
}
3535
3536
/**
3537
 * @brief Search pdf buffer for objects.  Parse each and then extract each.
3538
 *
3539
 * @param pdf               Pdf struct that keeps track of all information found in the PDF.
3540
 *
3541
 * @return cl_error_t       Error code.
3542
 */
3543
static cl_error_t pdf_find_and_extract_objs(struct pdf_struct *pdf)
3544
433k
{
3545
433k
    cl_error_t status   = CL_SUCCESS;
3546
433k
    int32_t rv          = 0;
3547
433k
    unsigned int i      = 0;
3548
433k
    uint32_t badobjects = 0;
3549
433k
    cli_ctx *ctx        = NULL;
3550
3551
433k
    if (NULL == pdf) {
3552
0
        cli_errmsg("pdf_find_and_extract_objs: Invalid arguments.\n");
3553
0
        status = CL_EARG;
3554
0
        goto done;
3555
0
    }
3556
3557
433k
    ctx = pdf->ctx;
3558
3559
    /* parse PDF and find obj offsets */
3560
2.08M
    while (CL_BREAK != (rv = pdf_findobj(pdf))) {
3561
1.64M
        if (rv == CL_EMEM) {
3562
0
            cli_errmsg("pdf_find_and_extract_objs: Memory allocation error.\n");
3563
0
            status = CL_EMEM;
3564
0
            goto done;
3565
0
        }
3566
1.64M
    }
3567
3568
    /* must parse after finding all objs, so we can flag indirect objects */
3569
1.78M
    for (i = 0; i < pdf->nobjs; i++) {
3570
1.35M
        struct pdf_obj *obj = pdf->objs[i];
3571
3572
1.35M
        if (cli_checktimelimit(pdf->ctx) != CL_SUCCESS) {
3573
0
            cli_dbgmsg("pdf_find_and_extract_objs: Timeout reached in the PDF parser while parsing objects.\n");
3574
3575
0
            status = CL_ETIMEOUT;
3576
0
            goto done;
3577
0
        }
3578
3579
1.35M
        pdf_parseobj(pdf, obj);
3580
1.35M
    }
3581
3582
433k
    pdf_handle_enc(pdf);
3583
433k
    if (pdf->flags & (1 << ENCRYPTED_PDF))
3584
53.3k
        cli_dbgmsg("pdf_find_and_extract_objs: encrypted pdf found, %s!\n",
3585
53.3k
                   (pdf->flags & (1 << DECRYPTABLE_PDF)) ? "decryptable" : "not decryptable, stream will probably fail to decompress");
3586
3587
433k
    if (SCAN_HEURISTIC_ENCRYPTED_DOC &&
3588
433k
        (pdf->flags & (1 << ENCRYPTED_PDF)) &&
3589
433k
        !(pdf->flags & (1 << DECRYPTABLE_PDF))) {
3590
        /* It is encrypted, and a password/key needs to be supplied to decrypt.
3591
         * This doesn't trigger for PDFs that are encrypted but don't need
3592
         * a password to decrypt */
3593
49.3k
        status = cli_append_potentially_unwanted(pdf->ctx, "Heuristics.Encrypted.PDF");
3594
49.3k
    }
3595
3596
433k
    if (CL_SUCCESS == status) {
3597
433k
        status = run_pdf_hooks(pdf, PDF_PHASE_PARSED, -1);
3598
433k
        cli_dbgmsg("pdf_find_and_extract_objs: (parsed hooks) returned %d\n", status);
3599
433k
    }
3600
3601
433k
    if (CL_SUCCESS == status) {
3602
        /* extract PDF objs */
3603
2.02M
        for (i = 0; !status && i < pdf->nobjs; i++) {
3604
1.58M
            struct pdf_obj *obj = pdf->objs[i];
3605
3606
1.58M
            if (cli_checktimelimit(pdf->ctx) != CL_SUCCESS) {
3607
0
                cli_dbgmsg("pdf_find_and_extract_objs: Timeout reached in the PDF parser while extracting objects.\n");
3608
3609
0
                status = CL_ETIMEOUT;
3610
0
                goto done;
3611
0
            }
3612
3613
1.58M
            pdf->parse_recursion_depth++;
3614
1.58M
            status = pdf_extract_obj(pdf, obj, PDF_EXTRACT_OBJ_SCAN);
3615
1.58M
            pdf->parse_recursion_depth--;
3616
1.58M
            switch (status) {
3617
0
                case CL_EFORMAT:
3618
                    /* Don't halt on one bad object */
3619
0
                    cli_dbgmsg("pdf_find_and_extract_objs: Format error when extracting object, skipping to the next object.\n");
3620
0
                    badobjects++;
3621
0
                    pdf->stats.ninvalidobjs++;
3622
0
                    status = CL_CLEAN;
3623
0
                    break;
3624
0
                case CL_VIRUS:
3625
0
                    break;
3626
1.58M
                default:
3627
1.58M
                    break;
3628
1.58M
            }
3629
1.58M
        }
3630
433k
    }
3631
3632
433k
done:
3633
433k
    if ((CL_SUCCESS == status) && badobjects) {
3634
0
        status = CL_EFORMAT;
3635
0
    }
3636
3637
433k
    return status;
3638
433k
}
3639
3640
/**
3641
 * @brief Primary function for parsing and scanning a PDF.
3642
 *
3643
 * @param dir       Filepath for temp file.
3644
 * @param ctx       clam scan context structure.
3645
 * @param offset    offset of pdf in ctx->fmap
3646
 *
3647
 * @return int      Returns cl_error_t status value.
3648
 */
3649
cl_error_t cli_pdf(const char *dir, cli_ctx *ctx, off_t offset)
3650
441k
{
3651
441k
    cl_error_t rc = CL_SUCCESS;
3652
441k
    struct pdf_struct pdf;
3653
441k
    fmap_t *map   = ctx->fmap;
3654
441k
    size_t size   = map->len - offset;
3655
441k
    off_t versize = size > 1032 ? 1032 : size;
3656
441k
    off_t map_off, bytesleft;
3657
441k
    unsigned long xref;
3658
441k
    long temp_long;
3659
441k
    const char *pdfver, *tmp, *start, *eofmap, *q, *eof;
3660
441k
    unsigned i;
3661
441k
    unsigned int objs_found = 0;
3662
3663
441k
    json_object *pdfobj = NULL;
3664
441k
    char *begin, *end, *p1;
3665
3666
441k
    cli_dbgmsg("in cli_pdf(%s)\n", dir);
3667
441k
    memset(&pdf, 0, sizeof(pdf));
3668
441k
    pdf.ctx       = ctx;
3669
441k
    pdf.dir       = dir;
3670
441k
    pdf.enc_objid = ~0u;
3671
3672
441k
    pdfver = start = fmap_need_off_once(map, offset, versize);
3673
3674
    /* Check PDF version */
3675
441k
    if (!pdfver) {
3676
0
        cli_errmsg("cli_pdf: mmap() failed (1)\n");
3677
0
        rc = CL_EMAP;
3678
0
        goto done;
3679
0
    }
3680
3681
441k
    if (ctx->wrkproperty)
3682
441k
        pdfobj = cli_jsonobj(ctx->wrkproperty, "PDFStats");
3683
3684
    /* offset is 0 when coming from filetype2 */
3685
441k
    tmp = cli_memstr(pdfver, versize, "%PDF-", 5);
3686
441k
    if (!tmp) {
3687
7.32k
        cli_dbgmsg("cli_pdf: no PDF- header found\n");
3688
7.32k
        noisy_warnmsg("cli_pdf: no PDF- header found\n");
3689
3690
7.32k
        rc = CL_SUCCESS;
3691
7.32k
        goto done;
3692
7.32k
    }
3693
3694
434k
    versize -= tmp - pdfver;
3695
434k
    pdfver = tmp;
3696
3697
434k
    if (versize < 8) {
3698
352
        rc = CL_EFORMAT;
3699
352
        goto done;
3700
352
    }
3701
3702
    /* Check for PDF-1.[0-9]. Although 1.7 is highest now, allow for future versions */
3703
433k
    if (pdfver[5] != '1' || pdfver[6] != '.' ||
3704
433k
        pdfver[7] < '1' || pdfver[7] > '9') {
3705
304k
        pdf.flags |= 1 << BAD_PDF_VERSION;
3706
304k
        cli_dbgmsg("cli_pdf: bad pdf version: %.8s\n", pdfver);
3707
3708
304k
        if (pdfobj)
3709
304k
            cli_jsonbool(pdfobj, "BadVersion", 1);
3710
304k
    } else {
3711
129k
        if (pdfobj) {
3712
129k
            begin = (char *)(pdfver + 5);
3713
129k
            end   = begin + 2;
3714
129k
            strtoul(end, &end, 10);
3715
129k
            p1 = cli_max_calloc((end - begin) + 2, 1);
3716
129k
            if (p1) {
3717
129k
                strncpy(p1, begin, end - begin);
3718
129k
                p1[end - begin] = '\0';
3719
129k
                cli_jsonstr(pdfobj, "PDFVersion", p1);
3720
129k
                free(p1);
3721
129k
            }
3722
129k
        }
3723
129k
    }
3724
3725
433k
    if (pdfver != start || offset) {
3726
414k
        pdf.flags |= 1 << BAD_PDF_HEADERPOS;
3727
414k
        cli_dbgmsg("cli_pdf: PDF header is not at position 0: %lld\n", (long long)(pdfver - start + offset));
3728
3729
414k
        if (pdfobj)
3730
414k
            cli_jsonbool(pdfobj, "BadVersionLocation", 1);
3731
414k
    }
3732
3733
433k
    offset += pdfver - start;
3734
3735
    /* find trailer and xref, don't fail if not found */
3736
433k
    map_off = (off_t)map->len - 2048;
3737
433k
    if (map_off < 0)
3738
284k
        map_off = 0;
3739
3740
433k
    bytesleft = map->len - map_off;
3741
3742
433k
    eofmap = fmap_need_off_once(map, map_off, bytesleft);
3743
433k
    if (!eofmap) {
3744
0
        cli_errmsg("cli_pdf: mmap() failed (2)\n");
3745
3746
0
        rc = CL_EMAP;
3747
0
        goto done;
3748
0
    }
3749
3750
433k
    eof = eofmap + bytesleft;
3751
469M
    for (q = &eofmap[bytesleft - 5]; q > eofmap; q--) {
3752
468M
        if (memcmp(q, "%%EOF", 5) == 0)
3753
90.7k
            break;
3754
468M
    }
3755
3756
433k
    if (q <= eofmap) {
3757
342k
        pdf.flags |= 1 << BAD_PDF_TRAILER;
3758
342k
        cli_dbgmsg("cli_pdf: %%%%EOF not found\n");
3759
3760
342k
        if (pdfobj)
3761
342k
            cli_jsonbool(pdfobj, "NoEOF", 1);
3762
342k
    } else {
3763
90.7k
        const char *t;
3764
3765
        /*size = q - eofmap + map_off;*/
3766
90.7k
        q -= 9;
3767
27.2M
        for (; q > eofmap; q--) {
3768
27.1M
            if (memcmp(q, "startxref", 9) == 0)
3769
59.4k
                break;
3770
27.1M
        }
3771
3772
90.7k
        if (q <= eofmap) {
3773
31.2k
            pdf.flags |= 1 << BAD_PDF_TRAILER;
3774
31.2k
            cli_dbgmsg("cli_pdf: startxref not found\n");
3775
3776
31.2k
            if (pdfobj)
3777
31.2k
                cli_jsonbool(pdfobj, "NoXREF", 1);
3778
59.4k
        } else {
3779
50.6M
            for (t = q; t > eofmap; t--) {
3780
50.5M
                if (memcmp(t, "trailer", 7) == 0)
3781
8.38k
                    break;
3782
50.5M
            }
3783
3784
59.4k
            pdf_parse_trailer(&pdf, eofmap, eof - eofmap);
3785
59.4k
            q += 9;
3786
3787
137k
            while (q < eof && (*q == ' ' || *q == '\n' || *q == '\r')) {
3788
78.3k
                q++;
3789
78.3k
            }
3790
3791
59.4k
            if (CL_SUCCESS != cli_strntol_wrap(q, q - eofmap + map_off, 0, 10, &temp_long)) {
3792
9.02k
                cli_dbgmsg("cli_pdf: failed to parse PDF trailer xref\n");
3793
9.02k
                pdf.flags |= 1 << BAD_PDF_TRAILER;
3794
50.4k
            } else if (temp_long < 0) {
3795
4.26k
                cli_dbgmsg("cli_pdf: Encountered invalid negative PDF trailer xref (%ld).\n", temp_long);
3796
4.26k
                pdf.flags |= 1 << BAD_PDF_TRAILER;
3797
46.1k
            } else {
3798
46.1k
                xref      = (unsigned long)temp_long;
3799
46.1k
                bytesleft = map->len - offset - xref;
3800
46.1k
                if (bytesleft > 4096)
3801
5.24k
                    bytesleft = 4096;
3802
3803
46.1k
                q = fmap_need_off_once(map, offset + xref, bytesleft);
3804
46.1k
                if (!q || xrefCheck(q, q + bytesleft) == -1) {
3805
41.4k
                    cli_dbgmsg("cli_pdf: did not find valid xref\n");
3806
41.4k
                    pdf.flags |= 1 << BAD_PDF_TRAILER;
3807
41.4k
                }
3808
46.1k
            }
3809
59.4k
        }
3810
90.7k
    }
3811
3812
433k
    size -= offset;
3813
433k
    pdf.size = size;
3814
433k
    pdf.map  = fmap_need_off(map, offset, size);
3815
433k
    if (!pdf.map) {
3816
0
        cli_errmsg("cli_pdf: mmap() failed (3)\n");
3817
3818
0
        rc = CL_EMAP;
3819
0
        goto done;
3820
0
    }
3821
3822
433k
    pdf.startoff = offset;
3823
3824
433k
    rc = run_pdf_hooks(&pdf, PDF_PHASE_PRE, -1);
3825
433k
    if (CL_SUCCESS != rc) {
3826
0
        cli_dbgmsg("cli_pdf: (pre hooks) returning %d\n", rc);
3827
3828
0
        rc = rc == CL_BREAK ? CL_CLEAN : rc;
3829
0
        goto done;
3830
0
    }
3831
3832
    /*
3833
     * Find and extract all objects in the PDF.
3834
     * This methodology adds objects from object streams.
3835
     */
3836
433k
    objs_found = pdf.nobjs;
3837
433k
    rc         = pdf_find_and_extract_objs(&pdf);
3838
3839
433k
    if (CL_EMEM == rc) {
3840
5
        cli_dbgmsg("cli_pdf: pdf_find_and_extract_objs had an allocation failure\n");
3841
5
        goto err;
3842
433k
    } else if (pdf.nobjs <= objs_found) {
3843
49.6k
        cli_dbgmsg("cli_pdf: pdf_find_and_extract_objs did not find any new objects!\n");
3844
384k
    } else {
3845
384k
        cli_dbgmsg("cli_pdf: pdf_find_and_extract_objs found %d new objects.\n", pdf.nobjs - objs_found);
3846
384k
    }
3847
3848
433k
    if (pdf.flags & (1 << ENCRYPTED_PDF))
3849
53.3k
        pdf.flags &= ~((1 << BAD_FLATESTART) | (1 << BAD_STREAMSTART) | (1 << BAD_ASCIIDECODE));
3850
3851
433k
    if (pdf.flags && CL_SUCCESS == rc) {
3852
429k
        cli_dbgmsg("cli_pdf: flags 0x%02x\n", pdf.flags);
3853
429k
        rc = run_pdf_hooks(&pdf, PDF_PHASE_END, -1);
3854
3855
429k
        if (CL_SUCCESS == rc && SCAN_HEURISTICS && (ctx->dconf->other & OTHER_CONF_PDFNAMEOBJ)) {
3856
429k
            if (pdf.flags & (1 << ESCAPED_COMMON_PDFNAME)) {
3857
                /* for example /Fl#61te#44#65#63#6f#64#65 instead of /FlateDecode */
3858
1.00k
                rc = cli_append_potentially_unwanted(ctx, "Heuristics.PDF.ObfuscatedNameObject");
3859
1.00k
            }
3860
429k
        }
3861
#if 0
3862
    /* TODO: find both trailers, and /Encrypt settings */
3863
    if (pdf.flags & (1 << LINEARIZED_PDF))
3864
        pdf.flags &= ~ (1 << BAD_ASCIIDECODE);
3865
    if (pdf.flags & (1 << MANY_FILTERS))
3866
        pdf.flags &= ~ (1 << BAD_ASCIIDECODE);
3867
    if (CL_SUCCESS == rc && (pdf.flags &
3868
        ((1 << BAD_PDF_TOOMANYOBJS) | (1 << BAD_STREAM_FILTERS) |
3869
         (1<<BAD_FLATE) | (1<<BAD_ASCIIDECODE)|
3870
             (1<<UNTERMINATED_OBJ_DICT) | (1<<UNKNOWN_FILTER)))) {
3871
        rc = CL_EUNPACK;
3872
    }
3873
#endif
3874
429k
    }
3875
3876
441k
done:
3877
441k
    if (CL_SUCCESS == rc && pdf.stats.ninvalidobjs > 0) {
3878
9.31k
        rc = CL_EFORMAT;
3879
9.31k
    }
3880
3881
441k
err:
3882
3883
441k
    pdf_export_json(&pdf);
3884
3885
441k
    if (pdf.objstms) {
3886
90.7k
        for (i = 0; i < pdf.nobjstms; i++) {
3887
54.5k
            if (pdf.objstms[i]) {
3888
54.5k
                if (pdf.objstms[i]->streambuf) {
3889
54.5k
                    free(pdf.objstms[i]->streambuf);
3890
54.5k
                    pdf.objstms[i]->streambuf = NULL;
3891
54.5k
                }
3892
54.5k
                free(pdf.objstms[i]);
3893
54.5k
                pdf.objstms[i] = NULL;
3894
54.5k
            }
3895
54.5k
        }
3896
36.1k
        free(pdf.objstms);
3897
36.1k
        pdf.objstms = NULL;
3898
36.1k
    }
3899
3900
441k
    if (NULL != pdf.objs) {
3901
2.02M
        for (i = 0; i < pdf.nobjs; i++) {
3902
1.59M
            if (NULL != pdf.objs[i]) {
3903
1.59M
                if (NULL != pdf.objs[i]->path) {
3904
0
                    free(pdf.objs[i]->path);
3905
0
                    pdf.objs[i]->path = NULL;
3906
0
                }
3907
1.59M
                free(pdf.objs[i]);
3908
1.59M
                pdf.objs[i] = NULL;
3909
1.59M
            }
3910
1.59M
        }
3911
433k
        free(pdf.objs);
3912
433k
        pdf.objs = NULL;
3913
433k
    }
3914
441k
    if (pdf.fileID) {
3915
39.5k
        free(pdf.fileID);
3916
39.5k
        pdf.fileID = NULL;
3917
39.5k
    }
3918
441k
    if (pdf.key) {
3919
8.49k
        free(pdf.key);
3920
8.49k
        pdf.key = NULL;
3921
8.49k
    }
3922
3923
    /* PDF hooks may abort, don't return CL_BREAK to caller! */
3924
441k
    rc = (rc == CL_BREAK) ? CL_CLEAN : rc;
3925
3926
441k
    cli_dbgmsg("cli_pdf: returning %d\n", rc);
3927
441k
    return rc;
3928
441k
}
3929
3930
/**
 * @brief   Skip the rest of the current line, and find the start of the next line.
 *
 * A line ends at the first CR or LF. A NUL byte is treated exactly like a
 * line terminator as well, both when looking for the end of the line and
 * when skipping the terminator run that follows.
 *
 * @param ptr   Current offset into buffer.
 * @param len   Remaining bytes in buffer.
 *
 * @return const char*  Address of next line, or NULL if no next line in buffer.
 */
static const char *
pdf_nextlinestart(const char *ptr, size_t len)
{
    const char *cur  = ptr;
    size_t remaining = len;

    if ((NULL == cur) || (0 == remaining)) {
        /* Invalid args */
        return NULL;
    }

    /* Advance to the first terminator byte of the current line. */
    for (;;) {
        const char c = *cur;

        if (c == '\r' || c == '\n' || c == '\0') {
            break;
        }
        if (--remaining == 0) {
            return NULL;
        }
        cur++;
    }

    /* Swallow the whole run of terminator bytes. */
    for (;;) {
        const char c = *cur;

        if (!(c == '\r' || c == '\n' || c == '\0')) {
            break;
        }
        if (--remaining == 0) {
            return NULL;
        }
        cur++;
    }

    return cur;
}
3962
3963
/**
 * @brief   Return the start of the next PDF object.
 *
 * This assumes that we're not in a stream.
 *
 * The scan starts "inside" a token: ordinary bytes are skipped until a
 * separator is seen, after which the first ordinary byte is returned.
 * A '/' or '(' is returned immediately wherever it is found. Line breaks
 * and '%' comments are skipped through to the start of the following line.
 *
 * @param ptr   Current offset into buffer.
 * @param len   Remaining bytes in buffer.
 *
 * @return const char*  Address of next object in the buffer, or NULL if there is none in the buffer.
 */
static const char *
pdf_nextobject(const char *ptr, size_t len)
{
    const char *cur  = ptr;
    size_t remaining = len;
    int in_token     = 1;

    while (remaining != 0) {
        const char c = *cur;

        /* Start of a name object, or of a literal string (JS). */
        if (c == '/' || c == '(') {
            return cur;
        }

        /* End of line or a comment: resume at the next line. */
        if (c == '\n' || c == '\r' || c == '%') {
            const char *next_line = pdf_nextlinestart(cur, remaining);

            if (NULL == next_line) {
                return NULL;
            }

            remaining -= (size_t)(next_line - cur);
            cur      = next_line;
            in_token = 0;
            continue;
        }

        if (c == ' ' || c == '\t' || c == '\v' || c == '\f' ||
            c == '[' /* array */ || c == '<' /* dictionary */) {
            /* Separator: whatever follows begins a new token. */
            in_token = 0;
        } else if (!in_token) {
            /* TODO: parse and return object type */
            return cur;
        }

        cur++;
        remaining--;
    }

    return NULL;
}
4021
4022
/* PDF statistics */
4023
static void ASCIIHexDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4024
34.6k
{
4025
34.6k
    UNUSEDPARAM(obj);
4026
34.6k
    UNUSEDPARAM(act);
4027
4028
34.6k
    if (NULL == pdf)
4029
0
        return;
4030
4031
34.6k
    pdf->stats.nasciihexdecode++;
4032
34.6k
}
4033
4034
static void ASCII85Decode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4035
31.7k
{
4036
31.7k
    UNUSEDPARAM(obj);
4037
31.7k
    UNUSEDPARAM(act);
4038
4039
31.7k
    if (NULL == pdf)
4040
0
        return;
4041
4042
31.7k
    pdf->stats.nascii85decode++;
4043
31.7k
}
4044
4045
static void EmbeddedFile_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4046
21.0k
{
4047
21.0k
    UNUSEDPARAM(obj);
4048
21.0k
    UNUSEDPARAM(act);
4049
4050
21.0k
    if (NULL == pdf)
4051
0
        return;
4052
4053
21.0k
    pdf->stats.nembeddedfile++;
4054
21.0k
}
4055
4056
static void FlateDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4057
186k
{
4058
186k
    UNUSEDPARAM(obj);
4059
186k
    UNUSEDPARAM(act);
4060
4061
186k
    if (NULL == pdf)
4062
0
        return;
4063
4064
186k
    pdf->stats.nflate++;
4065
186k
}
4066
4067
static void Image_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4068
8.98k
{
4069
8.98k
    UNUSEDPARAM(obj);
4070
8.98k
    UNUSEDPARAM(act);
4071
4072
8.98k
    if (NULL == pdf)
4073
0
        return;
4074
4075
8.98k
    pdf->stats.nimage++;
4076
8.98k
}
4077
4078
static void LZWDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4079
26.6k
{
4080
26.6k
    UNUSEDPARAM(obj);
4081
26.6k
    UNUSEDPARAM(act);
4082
4083
26.6k
    if (NULL == pdf)
4084
0
        return;
4085
4086
26.6k
    pdf->stats.nlzw++;
4087
26.6k
}
4088
4089
static void RunLengthDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4090
27.3k
{
4091
27.3k
    UNUSEDPARAM(obj);
4092
27.3k
    UNUSEDPARAM(act);
4093
4094
27.3k
    if (NULL == pdf)
4095
0
        return;
4096
4097
27.3k
    pdf->stats.nrunlengthdecode++;
4098
27.3k
}
4099
4100
static void CCITTFaxDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4101
3.20k
{
4102
3.20k
    UNUSEDPARAM(obj);
4103
3.20k
    UNUSEDPARAM(act);
4104
4105
3.20k
    if (NULL == pdf)
4106
0
        return;
4107
4108
3.20k
    pdf->stats.nfaxdecode++;
4109
3.20k
}
4110
4111
static void JBIG2Decode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4112
2.41k
{
4113
2.41k
    cli_ctx *ctx = NULL;
4114
2.41k
    struct json_object *pdfobj, *jbig2arr;
4115
4116
2.41k
    UNUSEDPARAM(obj);
4117
2.41k
    UNUSEDPARAM(act);
4118
4119
2.41k
    if (NULL == pdf)
4120
0
        return;
4121
4122
2.41k
    ctx = pdf->ctx;
4123
4124
2.41k
    if (!(SCAN_COLLECT_METADATA))
4125
0
        return;
4126
4127
2.41k
    if (!(pdf->ctx->wrkproperty))
4128
0
        return;
4129
4130
2.41k
    pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats");
4131
2.41k
    if (!(pdfobj))
4132
0
        return;
4133
4134
2.41k
    jbig2arr = cli_jsonarray(pdfobj, "JBIG2Objects");
4135
2.41k
    if (!(jbig2arr))
4136
0
        return;
4137
4138
2.41k
    cli_jsonint_array(jbig2arr, obj->id >> 8);
4139
4140
2.41k
    pdf->stats.njbig2decode++;
4141
2.41k
}
4142
4143
static void DCTDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4144
16.8k
{
4145
16.8k
    UNUSEDPARAM(obj);
4146
16.8k
    UNUSEDPARAM(act);
4147
4148
16.8k
    if (NULL == pdf)
4149
0
        return;
4150
4151
16.8k
    pdf->stats.ndctdecode++;
4152
16.8k
}
4153
4154
static void JPXDecode_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4155
4.46k
{
4156
4.46k
    UNUSEDPARAM(obj);
4157
4.46k
    UNUSEDPARAM(act);
4158
4159
4.46k
    if (NULL == pdf)
4160
0
        return;
4161
4162
4.46k
    pdf->stats.njpxdecode++;
4163
4.46k
}
4164
4165
static void Crypt_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4166
15.6k
{
4167
15.6k
    UNUSEDPARAM(obj);
4168
15.6k
    UNUSEDPARAM(act);
4169
4170
15.6k
    if (NULL == pdf)
4171
0
        return;
4172
4173
15.6k
    pdf->stats.ncrypt++;
4174
15.6k
}
4175
4176
static void Standard_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4177
47.2k
{
4178
47.2k
    UNUSEDPARAM(obj);
4179
47.2k
    UNUSEDPARAM(act);
4180
4181
47.2k
    if (NULL == pdf)
4182
0
        return;
4183
4184
47.2k
    pdf->stats.nstandard++;
4185
47.2k
}
4186
4187
static void Sig_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4188
2.68k
{
4189
2.68k
    UNUSEDPARAM(obj);
4190
2.68k
    UNUSEDPARAM(act);
4191
4192
2.68k
    if (NULL == pdf)
4193
0
        return;
4194
4195
2.68k
    pdf->stats.nsigned++;
4196
2.68k
}
4197
4198
/**
 * @brief Intentionally empty callback for JavaScript names.
 *
 * Nothing is counted or written to JSON at this point. The actual
 * Javascript is looked for later, when the object is extracted, so that an
 * object which merely references an indirect object with no content does
 * not produce a false positive.
 *
 * @param pdf   Unused.
 * @param obj   Unused.
 * @param act   Unused.
 */
static void JavaScript_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
{
    UNUSEDPARAM(act);
    UNUSEDPARAM(obj);
    UNUSEDPARAM(pdf);
}
4211
4212
static void OpenAction_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4213
304k
{
4214
304k
    UNUSEDPARAM(obj);
4215
304k
    UNUSEDPARAM(act);
4216
4217
304k
    if (NULL == pdf)
4218
0
        return;
4219
4220
304k
    pdf->stats.nopenaction++;
4221
304k
}
4222
4223
static void Launch_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4224
13.6k
{
4225
13.6k
    UNUSEDPARAM(obj);
4226
13.6k
    UNUSEDPARAM(act);
4227
4228
13.6k
    if (NULL == pdf)
4229
0
        return;
4230
4231
13.6k
    pdf->stats.nlaunch++;
4232
13.6k
}
4233
4234
static void Page_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4235
28.7k
{
4236
28.7k
    UNUSEDPARAM(obj);
4237
28.7k
    UNUSEDPARAM(act);
4238
4239
28.7k
    if (NULL == pdf)
4240
0
        return;
4241
4242
28.7k
    pdf->stats.npage++;
4243
28.7k
}
4244
4245
static void Author_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4246
14.0k
{
4247
14.0k
    cli_ctx *ctx = NULL;
4248
4249
14.0k
    UNUSEDPARAM(act);
4250
4251
14.0k
    if (NULL == pdf)
4252
0
        return;
4253
4254
14.0k
    ctx = pdf->ctx;
4255
4256
14.0k
    if (!(SCAN_COLLECT_METADATA))
4257
0
        return;
4258
4259
14.0k
    if (!(pdf->stats.author)) {
4260
9.29k
        const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
4261
9.29k
                                             : (const char *)(obj->start + pdf->map);
4262
4263
9.29k
        pdf->stats.author = calloc(1, sizeof(struct pdf_stats_entry));
4264
9.29k
        if (!(pdf->stats.author))
4265
0
            return;
4266
4267
9.29k
        pdf->parse_recursion_depth++;
4268
9.29k
        pdf->stats.author->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Author", NULL, &(pdf->stats.author->meta));
4269
9.29k
        pdf->parse_recursion_depth--;
4270
9.29k
    }
4271
14.0k
}
4272
4273
static void Creator_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4274
10.2k
{
4275
10.2k
    cli_ctx *ctx = NULL;
4276
4277
10.2k
    UNUSEDPARAM(act);
4278
4279
10.2k
    if (NULL == pdf)
4280
0
        return;
4281
4282
10.2k
    ctx = pdf->ctx;
4283
4284
10.2k
    if (!(SCAN_COLLECT_METADATA))
4285
0
        return;
4286
4287
10.2k
    if (!(pdf->stats.creator)) {
4288
7.14k
        const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
4289
7.14k
                                             : (const char *)(obj->start + pdf->map);
4290
4291
7.14k
        pdf->stats.creator = calloc(1, sizeof(struct pdf_stats_entry));
4292
7.14k
        if (!(pdf->stats.creator))
4293
0
            return;
4294
4295
7.14k
        pdf->parse_recursion_depth++;
4296
7.14k
        pdf->stats.creator->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Creator", NULL, &(pdf->stats.creator->meta));
4297
7.14k
        pdf->parse_recursion_depth--;
4298
7.14k
    }
4299
10.2k
}
4300
4301
static void ModificationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4302
18.9k
{
4303
18.9k
    cli_ctx *ctx = NULL;
4304
4305
18.9k
    UNUSEDPARAM(act);
4306
4307
18.9k
    if (NULL == pdf)
4308
0
        return;
4309
4310
18.9k
    ctx = pdf->ctx;
4311
4312
18.9k
    if (!(SCAN_COLLECT_METADATA))
4313
0
        return;
4314
4315
18.9k
    if (!(pdf->stats.modificationdate)) {
4316
12.5k
        const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
4317
12.5k
                                             : (const char *)(obj->start + pdf->map);
4318
4319
12.5k
        pdf->stats.modificationdate = calloc(1, sizeof(struct pdf_stats_entry));
4320
12.5k
        if (!(pdf->stats.modificationdate))
4321
0
            return;
4322
4323
12.5k
        pdf->parse_recursion_depth++;
4324
12.5k
        pdf->stats.modificationdate->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/ModDate", NULL, &(pdf->stats.modificationdate->meta));
4325
12.5k
        pdf->parse_recursion_depth--;
4326
12.5k
    }
4327
18.9k
}
4328
4329
static void CreationDate_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4330
50.0k
{
4331
50.0k
    cli_ctx *ctx = NULL;
4332
4333
50.0k
    UNUSEDPARAM(act);
4334
4335
50.0k
    if (NULL == pdf)
4336
0
        return;
4337
4338
50.0k
    ctx = pdf->ctx;
4339
4340
50.0k
    if (!(SCAN_COLLECT_METADATA))
4341
0
        return;
4342
4343
50.0k
    if (!(pdf->stats.creationdate)) {
4344
23.9k
        const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
4345
23.9k
                                             : (const char *)(obj->start + pdf->map);
4346
4347
23.9k
        pdf->stats.creationdate = calloc(1, sizeof(struct pdf_stats_entry));
4348
23.9k
        if (!(pdf->stats.creationdate))
4349
0
            return;
4350
4351
23.9k
        pdf->parse_recursion_depth++;
4352
23.9k
        pdf->stats.creationdate->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/CreationDate", NULL, &(pdf->stats.creationdate->meta));
4353
23.9k
        pdf->parse_recursion_depth--;
4354
23.9k
    }
4355
50.0k
}
4356
4357
static void Producer_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4358
18.9k
{
4359
18.9k
    cli_ctx *ctx = NULL;
4360
4361
18.9k
    UNUSEDPARAM(act);
4362
4363
18.9k
    if (NULL == pdf)
4364
0
        return;
4365
4366
18.9k
    ctx = pdf->ctx;
4367
4368
18.9k
    if (!(SCAN_COLLECT_METADATA))
4369
0
        return;
4370
4371
18.9k
    if (!(pdf->stats.producer)) {
4372
12.3k
        const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
4373
12.3k
                                             : (const char *)(obj->start + pdf->map);
4374
4375
12.3k
        pdf->stats.producer = calloc(1, sizeof(struct pdf_stats_entry));
4376
12.3k
        if (!(pdf->stats.producer))
4377
0
            return;
4378
4379
12.3k
        pdf->parse_recursion_depth++;
4380
12.3k
        pdf->stats.producer->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Producer", NULL, &(pdf->stats.producer->meta));
4381
12.3k
        pdf->parse_recursion_depth--;
4382
12.3k
    }
4383
18.9k
}
4384
4385
static void Title_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4386
16.4k
{
4387
16.4k
    cli_ctx *ctx = NULL;
4388
4389
16.4k
    UNUSEDPARAM(act);
4390
4391
16.4k
    if (NULL == pdf)
4392
0
        return;
4393
4394
16.4k
    ctx = pdf->ctx;
4395
4396
16.4k
    if (!(SCAN_COLLECT_METADATA))
4397
0
        return;
4398
4399
16.4k
    if (!(pdf->stats.title)) {
4400
9.43k
        const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
4401
9.43k
                                             : (const char *)(obj->start + pdf->map);
4402
4403
9.43k
        pdf->stats.title = calloc(1, sizeof(struct pdf_stats_entry));
4404
9.43k
        if (!(pdf->stats.title))
4405
0
            return;
4406
4407
9.43k
        pdf->parse_recursion_depth++;
4408
9.43k
        pdf->stats.title->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Title", NULL, &(pdf->stats.title->meta));
4409
9.43k
        pdf->parse_recursion_depth--;
4410
9.43k
    }
4411
16.4k
}
4412
4413
static void Keywords_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4414
7.99k
{
4415
7.99k
    cli_ctx *ctx = NULL;
4416
4417
7.99k
    UNUSEDPARAM(act);
4418
4419
7.99k
    if (NULL == pdf)
4420
0
        return;
4421
4422
7.99k
    ctx = pdf->ctx;
4423
4424
7.99k
    if (!(SCAN_COLLECT_METADATA))
4425
0
        return;
4426
4427
7.99k
    if (!(pdf->stats.keywords)) {
4428
5.68k
        const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
4429
5.68k
                                             : (const char *)(obj->start + pdf->map);
4430
4431
5.68k
        pdf->stats.keywords = calloc(1, sizeof(struct pdf_stats_entry));
4432
5.68k
        if (!(pdf->stats.keywords))
4433
0
            return;
4434
4435
5.68k
        pdf->parse_recursion_depth++;
4436
5.68k
        pdf->stats.keywords->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Keywords", NULL, &(pdf->stats.keywords->meta));
4437
5.68k
        pdf->parse_recursion_depth--;
4438
5.68k
    }
4439
7.99k
}
4440
4441
static void Subject_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4442
17.6k
{
4443
17.6k
    cli_ctx *ctx = NULL;
4444
4445
17.6k
    UNUSEDPARAM(act);
4446
4447
17.6k
    if (NULL == pdf)
4448
0
        return;
4449
4450
17.6k
    ctx = pdf->ctx;
4451
4452
17.6k
    if (!(SCAN_COLLECT_METADATA))
4453
0
        return;
4454
4455
17.6k
    if (!(pdf->stats.subject)) {
4456
11.1k
        const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
4457
11.1k
                                             : (const char *)(obj->start + pdf->map);
4458
4459
11.1k
        pdf->stats.subject = calloc(1, sizeof(struct pdf_stats_entry));
4460
11.1k
        if (!(pdf->stats.subject))
4461
0
            return;
4462
4463
11.1k
        pdf->parse_recursion_depth++;
4464
11.1k
        pdf->stats.subject->data = pdf_parse_string(pdf, obj, objstart, obj->size, "/Subject", NULL, &(pdf->stats.subject->meta));
4465
11.1k
        pdf->parse_recursion_depth--;
4466
11.1k
    }
4467
17.6k
}
4468
4469
static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4470
995
{
4471
995
    UNUSEDPARAM(obj);
4472
995
    UNUSEDPARAM(act);
4473
4474
995
    if (NULL == pdf)
4475
0
        return;
4476
4477
995
    pdf->stats.nrichmedia++;
4478
995
}
4479
4480
static void AcroForm_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4481
10.4k
{
4482
10.4k
    UNUSEDPARAM(obj);
4483
10.4k
    UNUSEDPARAM(act);
4484
4485
10.4k
    if (NULL == pdf)
4486
0
        return;
4487
4488
10.4k
    pdf->stats.nacroform++;
4489
10.4k
}
4490
4491
static void XFA_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4492
627
{
4493
627
    UNUSEDPARAM(obj);
4494
627
    UNUSEDPARAM(act);
4495
4496
627
    if (NULL == pdf)
4497
0
        return;
4498
4499
627
    pdf->stats.nxfa++;
4500
627
}
4501
4502
static void Pages_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4503
151k
{
4504
151k
    cli_ctx *ctx = NULL;
4505
151k
    struct pdf_array *array;
4506
151k
    const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
4507
151k
                                         : (const char *)(obj->start + pdf->map);
4508
151k
    const char *begin;
4509
151k
    unsigned long npages = 0, count;
4510
151k
    long temp_long;
4511
151k
    struct pdf_array_node *node;
4512
151k
    json_object *pdfobj;
4513
151k
    size_t countsize = 0;
4514
4515
151k
    UNUSEDPARAM(act);
4516
4517
151k
    if (!(pdf) || !(pdf->ctx->wrkproperty))
4518
0
        return;
4519
4520
151k
    ctx = pdf->ctx;
4521
4522
151k
    if (!(SCAN_COLLECT_METADATA))
4523
0
        return;
4524
4525
151k
    pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats");
4526
151k
    if (!(pdfobj))
4527
0
        return;
4528
4529
151k
    begin = cli_memstr(objstart, obj->size, "/Kids", 5);
4530
151k
    if (!(begin))
4531
43.9k
        return;
4532
4533
107k
    begin += 5;
4534
4535
107k
    pdf->parse_recursion_depth++;
4536
107k
    array = pdf_parse_array(pdf, obj, obj->size, (char *)begin, NULL);
4537
107k
    pdf->parse_recursion_depth--;
4538
4539
107k
    if (!(array)) {
4540
46.4k
        cli_jsonbool(pdfobj, "IncorrectPagesCount", 1);
4541
46.4k
        return;
4542
46.4k
    }
4543
4544
494k
    for (node = array->nodes; node != NULL; node = node->next)
4545
434k
        if (node->datasz)
4546
414k
            if (strchr((char *)(node->data), 'R'))
4547
33.2k
                npages++;
4548
4549
60.5k
    begin = cli_memstr(objstart, obj->size, "/Count", 6);
4550
60.5k
    if (!(begin)) {
4551
42.2k
        cli_jsonbool(pdfobj, "IncorrectPagesCount", 1);
4552
42.2k
        goto cleanup;
4553
42.2k
    }
4554
4555
18.3k
    begin += 6;
4556
44.0k
    while (((size_t)(begin - objstart) < obj->size) && isspace(begin[0]))
4557
25.6k
        begin++;
4558
4559
18.3k
    if ((size_t)(begin - objstart) >= obj->size) {
4560
288
        goto cleanup;
4561
288
    }
4562
4563
18.0k
    countsize = (obj->objstm) ? (size_t)(obj->start + obj->objstm->streambuf + obj->size - begin)
4564
18.0k
                              : (size_t)(obj->start + pdf->map + obj->size - begin);
4565
4566
18.0k
    if (CL_SUCCESS != cli_strntol_wrap(begin, countsize, 0, 10, &temp_long)) {
4567
5.07k
        cli_jsonbool(pdfobj, "IncorrectPagesCount", 1);
4568
12.9k
    } else if (temp_long < 0) {
4569
138
        cli_jsonbool(pdfobj, "IncorrectPagesCount", 1);
4570
12.8k
    } else {
4571
12.8k
        count = (unsigned long)temp_long;
4572
12.8k
        if (count != npages) {
4573
8.88k
            cli_jsonbool(pdfobj, "IncorrectPagesCount", 1);
4574
8.88k
        }
4575
12.8k
    }
4576
4577
60.5k
cleanup:
4578
60.5k
    pdf_free_array(array);
4579
60.5k
}
4580
4581
static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act)
4582
11.6k
{
4583
11.6k
    cli_ctx *ctx = NULL;
4584
11.6k
    json_object *colorsobj, *pdfobj;
4585
11.6k
    unsigned long ncolors;
4586
11.6k
    long temp_long;
4587
11.6k
    char *p1;
4588
11.6k
    const char *objstart = (obj->objstm) ? (const char *)(obj->start + obj->objstm->streambuf)
4589
11.6k
                                         : (const char *)(obj->start + pdf->map);
4590
4591
11.6k
    UNUSEDPARAM(act);
4592
4593
11.6k
    if (!(pdf) || !(pdf->ctx) || !(pdf->ctx->wrkproperty))
4594
0
        return;
4595
4596
11.6k
    ctx = pdf->ctx;
4597
4598
11.6k
    if (!(SCAN_COLLECT_METADATA))
4599
0
        return;
4600
4601
11.6k
    p1 = (char *)cli_memstr(objstart, obj->size, "/Colors", 7);
4602
11.6k
    if (!(p1))
4603
0
        return;
4604
4605
11.6k
    p1 += 7;
4606
4607
    /* Ensure that we have at least one whitespace character plus at least one number */
4608
11.6k
    if (obj->size - (size_t)(p1 - objstart) < 2)
4609
0
        return;
4610
4611
15.1k
    while (((size_t)(p1 - objstart) < obj->size) && isspace(p1[0]))
4612
3.43k
        p1++;
4613
4614
11.6k
    if ((size_t)(p1 - objstart) == obj->size)
4615
0
        return;
4616
4617
11.6k
    if (CL_SUCCESS != cli_strntol_wrap(p1, (size_t)((p1 - objstart) - obj->size), 0, 10, &temp_long)) {
4618
11.6k
        return;
4619
11.6k
    } else if (temp_long < 0) {
4620
0
        return;
4621
0
    }
4622
0
    ncolors = (unsigned long)temp_long;
4623
4624
    /* We only care if the number of colors > 2**24 */
4625
0
    if (ncolors < 1 << 24)
4626
0
        return;
4627
4628
0
    pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats");
4629
0
    if (!(pdfobj))
4630
0
        return;
4631
4632
0
    colorsobj = cli_jsonarray(pdfobj, "BigColors");
4633
0
    if (!(colorsobj))
4634
0
        return;
4635
4636
0
    cli_jsonint_array(colorsobj, obj->id >> 8);
4637
0
}
4638
4639
static void pdf_free_stats(struct pdf_struct *pdf)
4640
441k
{
4641
4642
441k
    if (!pdf) {
4643
0
        return;
4644
0
    }
4645
4646
441k
    if ((pdf->stats.author)) {
4647
9.29k
        if (pdf->stats.author->data)
4648
7.37k
            free(pdf->stats.author->data);
4649
9.29k
        free(pdf->stats.author);
4650
9.29k
        pdf->stats.author = NULL;
4651
9.29k
    }
4652
4653
441k
    if (pdf->stats.creator) {
4654
7.14k
        if (pdf->stats.creator->data)
4655
3.95k
            free(pdf->stats.creator->data);
4656
7.14k
        free(pdf->stats.creator);
4657
7.14k
        pdf->stats.creator = NULL;
4658
7.14k
    }
4659
4660
441k
    if (pdf->stats.producer) {
4661
12.3k
        if (pdf->stats.producer->data)
4662
8.99k
            free(pdf->stats.producer->data);
4663
12.3k
        free(pdf->stats.producer);
4664
12.3k
        pdf->stats.producer = NULL;
4665
12.3k
    }
4666
4667
441k
    if (pdf->stats.modificationdate) {
4668
12.5k
        if (pdf->stats.modificationdate->data)
4669
9.66k
            free(pdf->stats.modificationdate->data);
4670
12.5k
        free(pdf->stats.modificationdate);
4671
12.5k
        pdf->stats.modificationdate = NULL;
4672
12.5k
    }
4673
4674
441k
    if (pdf->stats.creationdate) {
4675
23.9k
        if (pdf->stats.creationdate->data)
4676
6.31k
            free(pdf->stats.creationdate->data);
4677
23.9k
        free(pdf->stats.creationdate);
4678
23.9k
        pdf->stats.creationdate = NULL;
4679
23.9k
    }
4680
4681
441k
    if (pdf->stats.title) {
4682
9.43k
        if (pdf->stats.title->data)
4683
6.31k
            free(pdf->stats.title->data);
4684
9.43k
        free(pdf->stats.title);
4685
9.43k
        pdf->stats.title = NULL;
4686
9.43k
    }
4687
4688
441k
    if (pdf->stats.subject) {
4689
11.1k
        if (pdf->stats.subject->data)
4690
8.11k
            free(pdf->stats.subject->data);
4691
11.1k
        free(pdf->stats.subject);
4692
11.1k
        pdf->stats.subject = NULL;
4693
11.1k
    }
4694
4695
441k
    if (pdf->stats.keywords) {
4696
5.68k
        if (pdf->stats.keywords->data)
4697
4.69k
            free(pdf->stats.keywords->data);
4698
5.68k
        free(pdf->stats.keywords);
4699
5.68k
        pdf->stats.keywords = NULL;
4700
5.68k
    }
4701
441k
}
4702
4703
static void pdf_export_json(struct pdf_struct *pdf)
4704
441k
{
4705
441k
    cli_ctx *ctx = NULL;
4706
441k
    json_object *pdfobj;
4707
441k
    unsigned long i;
4708
4709
441k
    if (NULL == pdf)
4710
0
        return;
4711
4712
441k
    if (!(pdf->ctx)) {
4713
0
        goto cleanup;
4714
0
    }
4715
4716
441k
    ctx = pdf->ctx;
4717
4718
441k
    if (!(SCAN_COLLECT_METADATA) || !(pdf->ctx->wrkproperty)) {
4719
0
        goto cleanup;
4720
0
    }
4721
4722
441k
    pdfobj = cli_jsonobj(pdf->ctx->wrkproperty, "PDFStats");
4723
441k
    if (!(pdfobj)) {
4724
0
        goto cleanup;
4725
0
    }
4726
4727
441k
    if (pdf->stats.author) {
4728
9.29k
        if (!pdf->stats.author->meta.success) {
4729
4.25k
            char *out = pdf_finalize_string(pdf, pdf->stats.author->meta.obj, pdf->stats.author->data, pdf->stats.author->meta.length);
4730
4.25k
            if (out) {
4731
140
                free(pdf->stats.author->data);
4732
140
                pdf->stats.author->data         = out;
4733
140
                pdf->stats.author->meta.length  = strlen(out);
4734
140
                pdf->stats.author->meta.success = 1;
4735
140
            }
4736
4.25k
        }
4737
4738
9.29k
        if (pdf->stats.author->meta.success && cli_isutf8(pdf->stats.author->data, pdf->stats.author->meta.length)) {
4739
5.18k
            cli_jsonstr(pdfobj, "Author", pdf->stats.author->data);
4740
5.18k
        } else if (pdf->stats.author->data && pdf->stats.author->meta.length) {
4741
922
            char *b64 = cl_base64_encode(pdf->stats.author->data, pdf->stats.author->meta.length);
4742
922
            cli_jsonstr(pdfobj, "Author", b64);
4743
922
            cli_jsonbool(pdfobj, "Author_base64", 1);
4744
922
            free(b64);
4745
3.19k
        } else {
4746
3.19k
            cli_jsonstr(pdfobj, "Author", "");
4747
3.19k
        }
4748
9.29k
    }
4749
441k
    if (pdf->stats.creator) {
4750
7.14k
        if (!pdf->stats.creator->meta.success) {
4751
4.34k
            char *out = pdf_finalize_string(pdf, pdf->stats.creator->meta.obj, pdf->stats.creator->data, pdf->stats.creator->meta.length);
4752
4.34k
            if (out) {
4753
73
                free(pdf->stats.creator->data);
4754
73
                pdf->stats.creator->data         = out;
4755
73
                pdf->stats.creator->meta.length  = strlen(out);
4756
73
                pdf->stats.creator->meta.success = 1;
4757
73
            }
4758
4.34k
        }
4759
4760
7.14k
        if (pdf->stats.creator->meta.success && cli_isutf8(pdf->stats.creator->data, pdf->stats.creator->meta.length)) {
4761
2.86k
            cli_jsonstr(pdfobj, "Creator", pdf->stats.creator->data);
4762
4.27k
        } else if (pdf->stats.creator->data && pdf->stats.creator->meta.length) {
4763
709
            char *b64 = cl_base64_encode(pdf->stats.creator->data, pdf->stats.creator->meta.length);
4764
709
            cli_jsonstr(pdfobj, "Creator", b64);
4765
709
            cli_jsonbool(pdfobj, "Creator_base64", 1);
4766
709
            free(b64);
4767
3.56k
        } else {
4768
3.56k
            cli_jsonstr(pdfobj, "Creator", "");
4769
3.56k
        }
4770
7.14k
    }
4771
441k
    if (pdf->stats.producer) {
4772
12.3k
        if (!pdf->stats.producer->meta.success) {
4773
6.22k
            char *out = pdf_finalize_string(pdf, pdf->stats.producer->meta.obj, pdf->stats.producer->data, pdf->stats.producer->meta.length);
4774
6.22k
            if (out) {
4775
123
                free(pdf->stats.producer->data);
4776
123
                pdf->stats.producer->data         = out;
4777
123
                pdf->stats.producer->meta.length  = strlen(out);
4778
123
                pdf->stats.producer->meta.success = 1;
4779
123
            }
4780
6.22k
        }
4781
4782
12.3k
        if (pdf->stats.producer->meta.success && cli_isutf8(pdf->stats.producer->data, pdf->stats.producer->meta.length)) {
4783
6.21k
            cli_jsonstr(pdfobj, "Producer", pdf->stats.producer->data);
4784
6.21k
        } else if (pdf->stats.producer->data && pdf->stats.producer->meta.length) {
4785
2.26k
            char *b64 = cl_base64_encode(pdf->stats.producer->data, pdf->stats.producer->meta.length);
4786
2.26k
            cli_jsonstr(pdfobj, "Producer", b64);
4787
2.26k
            cli_jsonbool(pdfobj, "Producer_base64", 1);
4788
2.26k
            free(b64);
4789
3.83k
        } else {
4790
3.83k
            cli_jsonstr(pdfobj, "Producer", "");
4791
3.83k
        }
4792
12.3k
    }
4793
441k
    if (pdf->stats.modificationdate) {
4794
12.5k
        if (!pdf->stats.modificationdate->meta.success) {
4795
7.40k
            char *out = pdf_finalize_string(pdf, pdf->stats.modificationdate->meta.obj, pdf->stats.modificationdate->data, pdf->stats.modificationdate->meta.length);
4796
7.40k
            if (out) {
4797
59
                free(pdf->stats.modificationdate->data);
4798
59
                pdf->stats.modificationdate->data         = out;
4799
59
                pdf->stats.modificationdate->meta.length  = strlen(out);
4800
59
                pdf->stats.modificationdate->meta.success = 1;
4801
59
            }
4802
7.40k
        }
4803
4804
12.5k
        if (pdf->stats.modificationdate->meta.success && cli_isutf8(pdf->stats.modificationdate->data, pdf->stats.modificationdate->meta.length)) {
4805
5.23k
            cli_jsonstr(pdfobj, "ModificationDate", pdf->stats.modificationdate->data);
4806
7.34k
        } else if (pdf->stats.modificationdate->data && pdf->stats.modificationdate->meta.length) {
4807
4.38k
            char *b64 = cl_base64_encode(pdf->stats.modificationdate->data, pdf->stats.modificationdate->meta.length);
4808
4.38k
            cli_jsonstr(pdfobj, "ModificationDate", b64);
4809
4.38k
            cli_jsonbool(pdfobj, "ModificationDate_base64", 1);
4810
4.38k
            free(b64);
4811
4.38k
        } else {
4812
2.96k
            cli_jsonstr(pdfobj, "ModificationDate", "");
4813
2.96k
        }
4814
12.5k
    }
4815
441k
    if (pdf->stats.creationdate) {
4816
23.9k
        if (!pdf->stats.creationdate->meta.success) {
4817
19.6k
            char *out = pdf_finalize_string(pdf, pdf->stats.creationdate->meta.obj, pdf->stats.creationdate->data, pdf->stats.creationdate->meta.length);
4818
19.6k
            if (out) {
4819
138
                free(pdf->stats.creationdate->data);
4820
138
                pdf->stats.creationdate->data         = out;
4821
138
                pdf->stats.creationdate->meta.length  = strlen(out);
4822
138
                pdf->stats.creationdate->meta.success = 1;
4823
138
            }
4824
19.6k
        }
4825
4826
23.9k
        if (pdf->stats.creationdate->meta.success && cli_isutf8(pdf->stats.creationdate->data, pdf->stats.creationdate->meta.length)) {
4827
4.41k
            cli_jsonstr(pdfobj, "CreationDate", pdf->stats.creationdate->data);
4828
19.5k
        } else if (pdf->stats.creationdate->data && pdf->stats.creationdate->meta.length) {
4829
1.89k
            char *b64 = cl_base64_encode(pdf->stats.creationdate->data, pdf->stats.creationdate->meta.length);
4830
1.89k
            cli_jsonstr(pdfobj, "CreationDate", b64);
4831
1.89k
            cli_jsonbool(pdfobj, "CreationDate_base64", 1);
4832
1.89k
            free(b64);
4833
17.6k
        } else {
4834
17.6k
            cli_jsonstr(pdfobj, "CreationDate", "");
4835
17.6k
        }
4836
23.9k
    }
4837
441k
    if (pdf->stats.title) {
4838
9.43k
        if (!pdf->stats.title->meta.success) {
4839
6.09k
            char *out = pdf_finalize_string(pdf, pdf->stats.title->meta.obj, pdf->stats.title->data, pdf->stats.title->meta.length);
4840
6.09k
            if (out) {
4841
235
                free(pdf->stats.title->data);
4842
235
                pdf->stats.title->data         = out;
4843
235
                pdf->stats.title->meta.length  = strlen(out);
4844
235
                pdf->stats.title->meta.success = 1;
4845
235
            }
4846
6.09k
        }
4847
4848
9.43k
        if (pdf->stats.title->meta.success && cli_isutf8(pdf->stats.title->data, pdf->stats.title->meta.length)) {
4849
3.57k
            cli_jsonstr(pdfobj, "Title", pdf->stats.title->data);
4850
5.86k
        } else if (pdf->stats.title->data && pdf->stats.title->meta.length) {
4851
2.73k
            char *b64 = cl_base64_encode(pdf->stats.title->data, pdf->stats.title->meta.length);
4852
2.73k
            cli_jsonstr(pdfobj, "Title", b64);
4853
2.73k
            cli_jsonbool(pdfobj, "Title_base64", 1);
4854
2.73k
            free(b64);
4855
3.12k
        } else {
4856
3.12k
            cli_jsonstr(pdfobj, "Title", "");
4857
3.12k
        }
4858
9.43k
    }
4859
441k
    if (pdf->stats.subject) {
4860
11.1k
        if (!pdf->stats.subject->meta.success) {
4861
4.58k
            char *out = pdf_finalize_string(pdf, pdf->stats.subject->meta.obj, pdf->stats.subject->data, pdf->stats.subject->meta.length);
4862
4.58k
            if (out) {
4863
335
                free(pdf->stats.subject->data);
4864
335
                pdf->stats.subject->data         = out;
4865
335
                pdf->stats.subject->meta.length  = strlen(out);
4866
335
                pdf->stats.subject->meta.success = 1;
4867
335
            }
4868
4.58k
        }
4869
4870
11.1k
        if (pdf->stats.subject->meta.success && cli_isutf8(pdf->stats.subject->data, pdf->stats.subject->meta.length)) {
4871
6.87k
            cli_jsonstr(pdfobj, "Subject", pdf->stats.subject->data);
4872
6.87k
        } else if (pdf->stats.subject->data && pdf->stats.subject->meta.length) {
4873
1.23k
            char *b64 = cl_base64_encode(pdf->stats.subject->data, pdf->stats.subject->meta.length);
4874
1.23k
            cli_jsonstr(pdfobj, "Subject", b64);
4875
1.23k
            cli_jsonbool(pdfobj, "Subject_base64", 1);
4876
1.23k
            free(b64);
4877
3.01k
        } else {
4878
3.01k
            cli_jsonstr(pdfobj, "Subject", "");
4879
3.01k
        }
4880
11.1k
    }
4881
441k
    if (pdf->stats.keywords) {
4882
5.68k
        if (!pdf->stats.keywords->meta.success) {
4883
2.31k
            char *out = pdf_finalize_string(pdf, pdf->stats.keywords->meta.obj, pdf->stats.keywords->data, pdf->stats.keywords->meta.length);
4884
2.31k
            if (out) {
4885
14
                free(pdf->stats.keywords->data);
4886
14
                pdf->stats.keywords->data         = out;
4887
14
                pdf->stats.keywords->meta.length  = strlen(out);
4888
14
                pdf->stats.keywords->meta.success = 1;
4889
14
            }
4890
2.31k
        }
4891
4892
5.68k
        if (pdf->stats.keywords->meta.success && cli_isutf8(pdf->stats.keywords->data, pdf->stats.keywords->meta.length)) {
4893
3.38k
            cli_jsonstr(pdfobj, "Keywords", pdf->stats.keywords->data);
4894
3.38k
        } else if (pdf->stats.keywords->data && pdf->stats.keywords->meta.length) {
4895
710
            char *b64 = cl_base64_encode(pdf->stats.keywords->data, pdf->stats.keywords->meta.length);
4896
710
            cli_jsonstr(pdfobj, "Keywords", b64);
4897
710
            cli_jsonbool(pdfobj, "Keywords_base64", 1);
4898
710
            free(b64);
4899
1.59k
        } else {
4900
1.59k
            cli_jsonstr(pdfobj, "Keywords", "");
4901
1.59k
        }
4902
5.68k
    }
4903
441k
    if (pdf->stats.ninvalidobjs)
4904
9.35k
        cli_jsonint(pdfobj, "InvalidObjectCount", pdf->stats.ninvalidobjs);
4905
441k
    if (pdf->stats.njs)
4906
4.33k
        cli_jsonint(pdfobj, "JavaScriptObjectCount", pdf->stats.njs);
4907
441k
    if (pdf->stats.nflate)
4908
73.1k
        cli_jsonint(pdfobj, "DeflateObjectCount", pdf->stats.nflate);
4909
441k
    if (pdf->stats.nactivex)
4910
0
        cli_jsonint(pdfobj, "ActiveXObjectCount", pdf->stats.nactivex);
4911
441k
    if (pdf->stats.nflash)
4912
0
        cli_jsonint(pdfobj, "FlashObjectCount", pdf->stats.nflash);
4913
441k
    if (pdf->stats.ncolors)
4914
0
        cli_jsonint(pdfobj, "ColorCount", pdf->stats.ncolors);
4915
441k
    if (pdf->stats.nasciihexdecode)
4916
9.96k
        cli_jsonint(pdfobj, "AsciiHexDecodeObjectCount", pdf->stats.nasciihexdecode);
4917
441k
    if (pdf->stats.nascii85decode)
4918
14.7k
        cli_jsonint(pdfobj, "Ascii85DecodeObjectCount", pdf->stats.nascii85decode);
4919
441k
    if (pdf->stats.nembeddedfile)
4920
13.9k
        cli_jsonint(pdfobj, "EmbeddedFileCount", pdf->stats.nembeddedfile);
4921
441k
    if (pdf->stats.nimage)
4922
4.34k
        cli_jsonint(pdfobj, "ImageCount", pdf->stats.nimage);
4923
441k
    if (pdf->stats.nlzw)
4924
12.1k
        cli_jsonint(pdfobj, "LZWCount", pdf->stats.nlzw);
4925
441k
    if (pdf->stats.nrunlengthdecode)
4926
12.8k
        cli_jsonint(pdfobj, "RunLengthDecodeCount", pdf->stats.nrunlengthdecode);
4927
441k
    if (pdf->stats.nfaxdecode)
4928
1.58k
        cli_jsonint(pdfobj, "FaxDecodeCount", pdf->stats.nfaxdecode);
4929
441k
    if (pdf->stats.njbig2decode)
4930
1.80k
        cli_jsonint(pdfobj, "JBIG2DecodeCount", pdf->stats.njbig2decode);
4931
441k
    if (pdf->stats.ndctdecode)
4932
8.44k
        cli_jsonint(pdfobj, "DCTDecodeCount", pdf->stats.ndctdecode);
4933
441k
    if (pdf->stats.njpxdecode)
4934
1.38k
        cli_jsonint(pdfobj, "JPXDecodeCount", pdf->stats.njpxdecode);
4935
441k
    if (pdf->stats.ncrypt)
4936
9.12k
        cli_jsonint(pdfobj, "CryptCount", pdf->stats.ncrypt);
4937
441k
    if (pdf->stats.nstandard)
4938
27.9k
        cli_jsonint(pdfobj, "StandardCount", pdf->stats.nstandard);
4939
441k
    if (pdf->stats.nsigned)
4940
1.92k
        cli_jsonint(pdfobj, "SignedCount", pdf->stats.nsigned);
4941
441k
    if (pdf->stats.nopenaction)
4942
29.1k
        cli_jsonint(pdfobj, "OpenActionCount", pdf->stats.nopenaction);
4943
441k
    if (pdf->stats.nlaunch)
4944
7.72k
        cli_jsonint(pdfobj, "LaunchCount", pdf->stats.nlaunch);
4945
441k
    if (pdf->stats.npage)
4946
15.2k
        cli_jsonint(pdfobj, "PageCount", pdf->stats.npage);
4947
441k
    if (pdf->stats.nrichmedia)
4948
984
        cli_jsonint(pdfobj, "RichMediaCount", pdf->stats.nrichmedia);
4949
441k
    if (pdf->stats.nacroform)
4950
6.54k
        cli_jsonint(pdfobj, "AcroFormCount", pdf->stats.nacroform);
4951
441k
    if (pdf->stats.nxfa)
4952
619
        cli_jsonint(pdfobj, "XFACount", pdf->stats.nxfa);
4953
441k
    if (pdf->flags & (1 << BAD_PDF_VERSION))
4954
304k
        cli_jsonbool(pdfobj, "BadVersion", 1);
4955
441k
    if (pdf->flags & (1 << BAD_PDF_HEADERPOS))
4956
414k
        cli_jsonbool(pdfobj, "BadHeaderPosition", 1);
4957
441k
    if (pdf->flags & (1 << BAD_PDF_TRAILER))
4958
428k
        cli_jsonbool(pdfobj, "BadTrailer", 1);
4959
441k
    if (pdf->flags & (1 << BAD_PDF_TOOMANYOBJS))
4960
0
        cli_jsonbool(pdfobj, "TooManyObjects", 1);
4961
441k
    if (pdf->flags & (1 << ENCRYPTED_PDF)) {
4962
53.3k
        cli_jsonbool(pdfobj, "Encrypted", 1);
4963
53.3k
        if (pdf->flags & (1 << DECRYPTABLE_PDF))
4964
3.91k
            cli_jsonbool(pdfobj, "Decryptable", 1);
4965
49.3k
        else
4966
49.3k
            cli_jsonbool(pdfobj, "Decryptable", 0);
4967
53.3k
    }
4968
4969
2.03M
    for (i = 0; i < pdf->nobjs; i++) {
4970
1.59M
        if (pdf->objs[i]->flags & (1 << OBJ_TRUNCATED)) {
4971
352k
            json_object *truncobj;
4972
4973
352k
            truncobj = cli_jsonarray(pdfobj, "TruncatedObjects");
4974
352k
            if (!(truncobj))
4975
0
                continue;
4976
4977
352k
            cli_jsonint_array(truncobj, pdf->objs[i]->id >> 8);
4978
352k
        }
4979
1.59M
    }
4980
4981
441k
cleanup:
4982
441k
    pdf_free_stats(pdf);
4983
441k
}