Coverage Report

Created: 2026-06-09 06:59

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/qpdf/libqpdf/QPDFTokenizer.cc
Line
Count
Source
1
#include <qpdf/QPDFTokenizer_private.hh>
2
3
// DO NOT USE ctype -- it is locale dependent for some things, and it's not worth the risk of
4
// including it in case it may accidentally be used.
5
6
#include <qpdf/InputSource_private.hh>
7
#include <qpdf/QIntC.hh>
8
#include <qpdf/QPDFExc.hh>
9
#include <qpdf/QPDFObjectHandle.hh>
10
#include <qpdf/QTC.hh>
11
#include <qpdf/QUtil.hh>
12
#include <qpdf/Util.hh>
13
14
#include <cstdlib>
15
#include <cstring>
16
#include <stdexcept>
17
18
using namespace qpdf;
19
20
using Token = QPDFTokenizer::Token;
21
using tt = QPDFTokenizer::token_type_e;
22
23
// Return true if ch terminates a name, number or word token: space, tab, LF, CR, VT, FF, NUL, or
// one of the characters / ( ) { } < > [ ] %.
//
// The comparison is done against character literals only, so the result never depends on the
// current locale (ctype is deliberately not used in this file). The function is constexpr and
// noexcept so it can also be evaluated at compile time.
static constexpr bool
is_delimiter(char ch) noexcept
{
    switch (ch) {
    case ' ':
    case '\n':
    case '\t':
    case '\r':
    case '\v':
    case '\f':
    case '\0':
    case '/':
    case '(':
    case ')':
    case '{':
    case '}':
    case '<':
    case '>':
    case '[':
    case ']':
    case '%':
        return true;
    default:
        return false;
    }
}
31
32
namespace
33
{
34
    class QPDFWordTokenFinder: public InputSource::Finder
35
    {
36
      public:
37
        QPDFWordTokenFinder(InputSource& is, std::string const& str) :
38
10.1k
            is(is),
39
10.1k
            str(str)
40
10.1k
        {
41
10.1k
        }
42
10.1k
        ~QPDFWordTokenFinder() override = default;
43
        bool check() override;
44
45
      private:
46
        InputSource& is;
47
        std::string str;
48
    };
49
} // namespace
50
51
bool
52
QPDFWordTokenFinder::check()
53
31.6k
{
54
    // Find a word token matching the given string, preceded by a delimiter, and followed by a
55
    // delimiter or EOF.
56
31.6k
    Tokenizer tokenizer;
57
31.6k
    tokenizer.nextToken(is, "finder", str.size() + 2);
58
31.6k
    qpdf_offset_t pos = is.tell();
59
31.6k
    if (tokenizer.getType() != tt::tt_word || tokenizer.getValue() != str) {
60
22.1k
        QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
61
22.1k
        return false;
62
22.1k
    }
63
9.48k
    qpdf_offset_t token_start = is.getLastOffset();
64
9.48k
    char next;
65
9.48k
    bool next_okay = false;
66
9.48k
    if (is.read(&next, 1) == 0) {
67
13
        QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
68
13
        next_okay = true;
69
9.47k
    } else {
70
9.47k
        next_okay = is_delimiter(next);
71
9.47k
    }
72
9.48k
    is.seek(pos, SEEK_SET);
73
9.48k
    if (!next_okay) {
74
0
        return false;
75
0
    }
76
9.48k
    if (token_start == 0) {
77
        // Can't actually happen...we never start the search at the beginning of the input.
78
0
        return false;
79
0
    }
80
9.48k
    return true;
81
9.48k
}
82
83
void
84
Tokenizer::reset()
85
23.2M
{
86
23.2M
    state = st_before_token;
87
23.2M
    type = tt::tt_bad;
88
23.2M
    val.clear();
89
23.2M
    raw_val.clear();
90
23.2M
    error_message = "";
91
23.2M
    before_token = true;
92
23.2M
    in_token = false;
93
23.2M
    char_to_unread = '\0';
94
23.2M
    inline_image_bytes = 0;
95
23.2M
    string_depth = 0;
96
23.2M
    bad = false;
97
23.2M
}
98
99
// Build a token from a type and a value. The raw value defaults to the value itself, except for
// strings and names, where it is the unparsed form produced by QPDFObjectHandle.
QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) :
    type(type),
    value(value),
    raw_value(value)
{
    switch (type) {
    case tt_string:
        raw_value = QPDFObjectHandle::newString(value).unparse();
        break;

    case tt_name:
        raw_value = QPDFObjectHandle::newName(value).unparse();
        break;

    default:
        // raw_value already equals value
        break;
    }
}
110
111
// The public tokenizer is a thin facade: all state lives in a heap-allocated qpdf::Tokenizer
// held in `m`.
QPDFTokenizer::QPDFTokenizer() :
    m(std::make_unique<qpdf::Tokenizer>())
{
}

// Defined out of line here, after the qpdf::Tokenizer header has been included.
QPDFTokenizer::~QPDFTokenizer() = default;
117
118
// A new tokenizer starts out in the same condition that reset() establishes.
Tokenizer::Tokenizer()
{
    reset();
}
122
123
void
124
QPDFTokenizer::allowEOF()
125
2.65k
{
126
2.65k
    m->allowEOF();
127
2.65k
}
128
129
void
130
Tokenizer::allowEOF()
131
23.9k
{
132
23.9k
    allow_eof = true;
133
23.9k
}
134
135
void
136
QPDFTokenizer::includeIgnorable()
137
2.65k
{
138
2.65k
    m->includeIgnorable();
139
2.65k
}
140
141
void
142
Tokenizer::includeIgnorable()
143
2.65k
{
144
2.65k
    include_ignorable = true;
145
2.65k
}
146
147
bool
148
Tokenizer::isSpace(char ch)
149
42.0M
{
150
42.0M
    return (ch == '\0' || util::is_space(ch));
151
42.0M
}
152
153
bool
154
Tokenizer::isDelimiter(char ch)
155
91.2M
{
156
91.2M
    return is_delimiter(ch);
157
91.2M
}
158
159
void
160
QPDFTokenizer::presentCharacter(char ch)
161
0
{
162
0
    m->presentCharacter(ch);
163
0
}
164
165
void
166
Tokenizer::presentCharacter(char ch)
167
16.9k
{
168
16.9k
    handleCharacter(ch);
169
170
16.9k
    if (in_token) {
171
0
        raw_val += ch;
172
0
    }
173
16.9k
}
174
175
void
176
Tokenizer::handleCharacter(char ch)
177
234M
{
178
    // In some cases, functions called below may call a second handler. This happens whenever you
179
    // have to use a character from the next token to detect the end of the current token.
180
181
234M
    switch (state) {
182
0
    case st_top:
183
0
        inTop(ch);
184
0
        return;
185
186
4.63M
    case st_in_space:
187
4.63M
        inSpace(ch);
188
4.63M
        return;
189
190
6.93M
    case st_in_comment:
191
6.93M
        inComment(ch);
192
6.93M
        return;
193
194
320k
    case st_lt:
195
320k
        inLt(ch);
196
320k
        return;
197
198
238k
    case st_gt:
199
238k
        inGt(ch);
200
238k
        return;
201
202
81.4M
    case st_in_string:
203
81.4M
        inString(ch);
204
81.4M
        return;
205
206
61.5M
    case st_name:
207
61.5M
        inName(ch);
208
61.5M
        return;
209
210
13.2M
    case st_number:
211
13.2M
        inNumber(ch);
212
13.2M
        return;
213
214
195k
    case st_real:
215
195k
        inReal(ch);
216
195k
        return;
217
218
139k
    case st_string_after_cr:
219
139k
        inStringAfterCR(ch);
220
139k
        return;
221
222
122k
    case st_string_escape:
223
122k
        inStringEscape(ch);
224
122k
        return;
225
226
7.40k
    case st_char_code:
227
7.40k
        inCharCode(ch);
228
7.40k
        return;
229
230
25.1M
    case st_literal:
231
25.1M
        inLiteral(ch);
232
25.1M
        return;
233
234
2.66M
    case st_inline_image:
235
2.66M
        inInlineImage(ch);
236
2.66M
        return;
237
238
101k
    case st_in_hexstring:
239
101k
        inHexstring(ch);
240
101k
        return;
241
242
84.4k
    case st_in_hexstring_2nd:
243
84.4k
        inHexstring2nd(ch);
244
84.4k
        return;
245
246
145k
    case st_name_hex1:
247
145k
        inNameHex1(ch);
248
145k
        return;
249
250
13.3k
    case st_name_hex2:
251
13.3k
        inNameHex2(ch);
252
13.3k
        return;
253
254
39.5k
    case st_sign:
255
39.5k
        inSign(ch);
256
39.5k
        return;
257
258
33.9k
    case st_decimal:
259
33.9k
        inDecimal(ch);
260
33.9k
        return;
261
262
37.2M
    case (st_before_token):
263
37.2M
        inBeforeToken(ch);
264
37.2M
        return;
265
266
0
    case (st_token_ready):
267
0
        inTokenReady(ch);
268
0
        return;
269
270
0
    default:
271
0
        throw std::logic_error("INTERNAL ERROR: invalid state while reading token");
272
234M
    }
273
234M
}
274
275
void
276
Tokenizer::inTokenReady(char ch)
277
0
{
278
0
    throw std::logic_error(
279
0
        "INTERNAL ERROR: QPDF tokenizer presented character while token is waiting");
280
0
}
281
282
void
283
Tokenizer::inBeforeToken(char ch)
284
37.2M
{
285
    // Note: we specifically do not use ctype here.  It is locale-dependent.
286
37.2M
    if (isSpace(ch)) {
287
22.4M
        before_token = !include_ignorable;
288
22.4M
        in_token = include_ignorable;
289
22.4M
        if (include_ignorable) {
290
1.84M
            state = st_in_space;
291
1.84M
        }
292
22.4M
    } else if (ch == '%') {
293
30.1k
        before_token = !include_ignorable;
294
30.1k
        in_token = include_ignorable;
295
30.1k
        state = st_in_comment;
296
14.8M
    } else {
297
14.8M
        before_token = false;
298
14.8M
        in_token = true;
299
14.8M
        inTop(ch);
300
14.8M
    }
301
37.2M
}
302
303
void
304
Tokenizer::inTop(char ch)
305
14.8M
{
306
14.8M
    switch (ch) {
307
77.2k
    case '(':
308
77.2k
        string_depth = 1;
309
77.2k
        state = st_in_string;
310
77.2k
        return;
311
312
320k
    case '<':
313
320k
        state = st_lt;
314
320k
        return;
315
316
239k
    case '>':
317
239k
        state = st_gt;
318
239k
        return;
319
320
242k
    case (')'):
321
242k
        type = tt::tt_bad;
322
242k
        QTC::TC("qpdf", "QPDFTokenizer bad )");
323
242k
        error_message = "unexpected )";
324
242k
        state = st_token_ready;
325
242k
        return;
326
327
235k
    case '[':
328
235k
        type = tt::tt_array_open;
329
235k
        state = st_token_ready;
330
235k
        return;
331
332
131k
    case ']':
333
131k
        type = tt::tt_array_close;
334
131k
        state = st_token_ready;
335
131k
        return;
336
337
28.8k
    case '{':
338
28.8k
        type = tt::tt_brace_open;
339
28.8k
        state = st_token_ready;
340
28.8k
        return;
341
342
35.7k
    case '}':
343
35.7k
        type = tt::tt_brace_close;
344
35.7k
        state = st_token_ready;
345
35.7k
        return;
346
347
5.80M
    case '/':
348
5.80M
        state = st_name;
349
5.80M
        val += ch;
350
5.80M
        return;
351
352
779k
    case '0':
353
2.23M
    case '1':
354
2.54M
    case '2':
355
2.83M
    case '3':
356
3.90M
    case '4':
357
4.04M
    case '5':
358
4.24M
    case '6':
359
4.30M
    case '7':
360
4.38M
    case '8':
361
4.44M
    case '9':
362
4.44M
        state = st_number;
363
4.44M
        return;
364
365
13.9k
    case '+':
366
39.5k
    case '-':
367
39.5k
        state = st_sign;
368
39.5k
        return;
369
370
33.5k
    case '.':
371
33.5k
        state = st_decimal;
372
33.5k
        return;
373
374
3.20M
    default:
375
3.20M
        state = st_literal;
376
3.20M
        return;
377
14.8M
    }
378
14.8M
}
379
380
void
381
Tokenizer::inSpace(char ch)
382
4.63M
{
383
    // We only enter this state if include_ignorable is true.
384
4.63M
    if (!isSpace(ch)) {
385
1.84M
        type = tt::tt_space;
386
1.84M
        in_token = false;
387
1.84M
        char_to_unread = ch;
388
1.84M
        state = st_token_ready;
389
1.84M
    }
390
4.63M
}
391
392
void
393
Tokenizer::inComment(char ch)
394
6.93M
{
395
6.93M
    if ((ch == '\r') || (ch == '\n')) {
396
29.0k
        if (include_ignorable) {
397
5.13k
            type = tt::tt_comment;
398
5.13k
            in_token = false;
399
5.13k
            char_to_unread = ch;
400
5.13k
            state = st_token_ready;
401
23.9k
        } else {
402
23.9k
            state = st_before_token;
403
23.9k
        }
404
29.0k
    }
405
6.93M
}
406
407
void
408
Tokenizer::inString(char ch)
409
81.6M
{
410
81.6M
    switch (ch) {
411
122k
    case '\\':
412
122k
        state = st_string_escape;
413
122k
        return;
414
415
118k
    case '(':
416
118k
        val += ch;
417
118k
        ++string_depth;
418
118k
        return;
419
420
150k
    case ')':
421
150k
        if (--string_depth == 0) {
422
71.2k
            type = tt::tt_string;
423
71.2k
            state = st_token_ready;
424
71.2k
            return;
425
71.2k
        }
426
427
79.6k
        val += ch;
428
79.6k
        return;
429
430
139k
    case '\r':
431
        // CR by itself is converted to LF
432
139k
        val += '\n';
433
139k
        state = st_string_after_cr;
434
139k
        return;
435
436
354k
    case '\n':
437
354k
        val += ch;
438
354k
        return;
439
440
80.7M
    default:
441
80.7M
        val += ch;
442
80.7M
        return;
443
81.6M
    }
444
81.6M
}
445
446
void
447
Tokenizer::inName(char ch)
448
61.6M
{
449
61.6M
    if (isDelimiter(ch)) {
450
        // A C-locale whitespace character or delimiter terminates token.  It is important to unread
451
        // the whitespace character even though it is ignored since it may be the newline after a
452
        // stream keyword.  Removing it here could make the stream-reading code break on some files,
453
        // though not on any files in the test suite as of this
454
        // writing.
455
456
5.78M
        type = bad ? tt::tt_bad : tt::tt_name;
457
5.78M
        in_token = false;
458
5.78M
        char_to_unread = ch;
459
5.78M
        state = st_token_ready;
460
55.8M
    } else if (ch == '#') {
461
145k
        char_code = 0;
462
145k
        state = st_name_hex1;
463
55.7M
    } else {
464
55.7M
        val += ch;
465
55.7M
    }
466
61.6M
}
467
468
void
469
Tokenizer::inNameHex1(char ch)
470
145k
{
471
145k
    hex_char = ch;
472
473
145k
    if (char hval = util::hex_decode_char(ch); hval < '\20') {
474
13.3k
        char_code = int(hval) << 4;
475
13.3k
        state = st_name_hex2;
476
131k
    } else {
477
131k
        QTC::TC("qpdf", "QPDFTokenizer bad name 1");
478
131k
        error_message = "name with stray # will not work with PDF >= 1.2";
479
        // Use null to encode a bad # -- this is reversed in QPDF_Name::normalizeName.
480
131k
        val += '\0';
481
131k
        state = st_name;
482
131k
        inName(ch);
483
131k
    }
484
145k
}
485
486
void
487
Tokenizer::inNameHex2(char ch)
488
13.3k
{
489
13.3k
    if (char hval = util::hex_decode_char(ch); hval < '\20') {
490
5.78k
        char_code |= int(hval);
491
7.56k
    } else {
492
7.56k
        QTC::TC("qpdf", "QPDFTokenizer bad name 2");
493
7.56k
        error_message = "name with stray # will not work with PDF >= 1.2";
494
        // Use null to encode a bad # -- this is reversed in QPDF_Name::normalizeName.
495
7.56k
        val += '\0';
496
7.56k
        val += hex_char;
497
7.56k
        state = st_name;
498
7.56k
        inName(ch);
499
7.56k
        return;
500
7.56k
    }
501
5.78k
    if (char_code == 0) {
502
717
        QTC::TC("qpdf", "QPDFTokenizer null in name");
503
717
        error_message = "null character not allowed in name token";
504
717
        val += "#00";
505
717
        state = st_name;
506
717
        bad = true;
507
5.06k
    } else {
508
5.06k
        val += char(char_code);
509
5.06k
        state = st_name;
510
5.06k
    }
511
5.78k
}
512
513
void
514
Tokenizer::inSign(char ch)
515
39.5k
{
516
39.5k
    if (util::is_digit(ch)) {
517
17.6k
        state = st_number;
518
21.8k
    } else if (ch == '.') {
519
344
        state = st_decimal;
520
21.4k
    } else {
521
21.4k
        state = st_literal;
522
21.4k
        inLiteral(ch);
523
21.4k
    }
524
39.5k
}
525
526
void
527
Tokenizer::inDecimal(char ch)
528
33.9k
{
529
33.9k
    if (util::is_digit(ch)) {
530
15.7k
        state = st_real;
531
18.1k
    } else {
532
18.1k
        state = st_literal;
533
18.1k
        inLiteral(ch);
534
18.1k
    }
535
33.9k
}
536
537
void
538
Tokenizer::inNumber(char ch)
539
13.2M
{
540
13.2M
    if (util::is_digit(ch)) {
541
8.78M
    } else if (ch == '.') {
542
54.3k
        state = st_real;
543
4.38M
    } else if (isDelimiter(ch)) {
544
4.21M
        type = tt::tt_integer;
545
4.21M
        state = st_token_ready;
546
4.21M
        in_token = false;
547
4.21M
        char_to_unread = ch;
548
4.21M
    } else {
549
172k
        state = st_literal;
550
172k
    }
551
13.2M
}
552
553
void
554
Tokenizer::inReal(char ch)
555
195k
{
556
195k
    if (util::is_digit(ch)) {
557
125k
    } else if (isDelimiter(ch)) {
558
66.1k
        type = tt::tt_real;
559
66.1k
        state = st_token_ready;
560
66.1k
        in_token = false;
561
66.1k
        char_to_unread = ch;
562
66.1k
    } else {
563
3.77k
        state = st_literal;
564
3.77k
    }
565
195k
}
566
void
567
Tokenizer::inStringEscape(char ch)
568
122k
{
569
122k
    state = st_in_string;
570
122k
    switch (ch) {
571
926
    case '0':
572
1.60k
    case '1':
573
2.65k
    case '2':
574
3.27k
    case '3':
575
4.21k
    case '4':
576
4.78k
    case '5':
577
5.11k
    case '6':
578
6.04k
    case '7':
579
6.04k
        state = st_char_code;
580
6.04k
        char_code = 0;
581
6.04k
        digit_count = 0;
582
6.04k
        inCharCode(ch);
583
6.04k
        return;
584
585
2.80k
    case 'n':
586
2.80k
        val += '\n';
587
2.80k
        return;
588
589
8.18k
    case 'r':
590
8.18k
        val += '\r';
591
8.18k
        return;
592
593
19.1k
    case 't':
594
19.1k
        val += '\t';
595
19.1k
        return;
596
597
456
    case 'b':
598
456
        val += '\b';
599
456
        return;
600
601
17.1k
    case 'f':
602
17.1k
        val += '\f';
603
17.1k
        return;
604
605
333
    case '\n':
606
333
        return;
607
608
347
    case '\r':
609
347
        state = st_string_after_cr;
610
347
        return;
611
612
68.0k
    default:
613
        // PDF spec says backslash is ignored before anything else
614
68.0k
        val += ch;
615
68.0k
        return;
616
122k
    }
617
122k
}
618
619
void
620
Tokenizer::inStringAfterCR(char ch)
621
139k
{
622
139k
    state = st_in_string;
623
139k
    if (ch != '\n') {
624
112k
        inString(ch);
625
112k
    }
626
139k
}
627
628
void
629
Tokenizer::inLt(char ch)
630
320k
{
631
320k
    if (ch == '<') {
632
276k
        type = tt::tt_dict_open;
633
276k
        state = st_token_ready;
634
276k
        return;
635
276k
    }
636
637
44.1k
    state = st_in_hexstring;
638
44.1k
    inHexstring(ch);
639
44.1k
}
640
641
void
642
Tokenizer::inGt(char ch)
643
238k
{
644
238k
    if (ch == '>') {
645
205k
        type = tt::tt_dict_close;
646
205k
        state = st_token_ready;
647
205k
    } else {
648
32.8k
        type = tt::tt_bad;
649
32.8k
        QTC::TC("qpdf", "QPDFTokenizer bad >");
650
32.8k
        error_message = "unexpected >";
651
32.8k
        in_token = false;
652
32.8k
        char_to_unread = ch;
653
32.8k
        state = st_token_ready;
654
32.8k
    }
655
238k
}
656
657
void
658
Tokenizer::inLiteral(char ch)
659
25.1M
{
660
25.1M
    if (isDelimiter(ch)) {
661
        // A C-locale whitespace character or delimiter terminates token.  It is important to unread
662
        // the whitespace character even though it is ignored since it may be the newline after a
663
        // stream keyword.  Removing it here could make the stream-reading code break on some files,
664
        // though not on any files in the test suite as of this writing.
665
666
3.27M
        in_token = false;
667
3.27M
        char_to_unread = ch;
668
3.27M
        state = st_token_ready;
669
3.27M
        type = (raw_val == "true") || (raw_val == "false")
670
3.27M
            ? tt::tt_bool
671
3.27M
            : (raw_val == "null" ? tt::tt_null : tt::tt_word);
672
3.27M
    }
673
25.1M
}
674
675
void
676
Tokenizer::inHexstring(char ch)
677
145k
{
678
145k
    if (char hval = util::hex_decode_char(ch); hval < '\20') {
679
78.9k
        char_code = int(hval) << 4;
680
78.9k
        state = st_in_hexstring_2nd;
681
682
78.9k
    } else if (ch == '>') {
683
7.76k
        type = tt::tt_string;
684
7.76k
        state = st_token_ready;
685
686
59.2k
    } else if (isSpace(ch)) {
687
        // ignore
688
689
32.0k
    } else {
690
27.2k
        type = tt::tt_bad;
691
27.2k
        QTC::TC("qpdf", "QPDFTokenizer bad hexstring character");
692
27.2k
        error_message = std::string("invalid character (") + ch + ") in hexstring";
693
27.2k
        state = st_token_ready;
694
27.2k
    }
695
145k
}
696
697
void
698
Tokenizer::inHexstring2nd(char ch)
699
84.4k
{
700
84.4k
    if (char hval = util::hex_decode_char(ch); hval < '\20') {
701
69.9k
        val += char(char_code) | hval;
702
69.9k
        state = st_in_hexstring;
703
704
69.9k
    } else if (ch == '>') {
705
        // PDF spec says odd hexstrings have implicit trailing 0.
706
3.54k
        val += char(char_code);
707
3.54k
        type = tt::tt_string;
708
3.54k
        state = st_token_ready;
709
710
10.9k
    } else if (isSpace(ch)) {
711
        // ignore
712
713
5.80k
    } else {
714
5.14k
        type = tt::tt_bad;
715
5.14k
        QTC::TC("qpdf", "QPDFTokenizer bad hexstring 2nd character");
716
5.14k
        error_message = std::string("invalid character (") + ch + ") in hexstring";
717
5.14k
        state = st_token_ready;
718
5.14k
    }
719
84.4k
}
720
721
void
722
Tokenizer::inCharCode(char ch)
723
13.4k
{
724
13.4k
    bool handled = false;
725
13.4k
    if (('0' <= ch) && (ch <= '7')) {
726
8.36k
        char_code = 8 * char_code + (int(ch) - int('0'));
727
8.36k
        if (++(digit_count) < 3) {
728
7.52k
            return;
729
7.52k
        }
730
845
        handled = true;
731
845
    }
732
    // We've accumulated \ddd or we have \d or \dd followed by other than an octal digit. The PDF
733
    // Spec says to ignore high-order overflow.
734
5.92k
    val += char(char_code % 256);
735
5.92k
    state = st_in_string;
736
5.92k
    if (!handled) {
737
5.08k
        inString(ch);
738
5.08k
    }
739
5.92k
}
740
741
void
742
Tokenizer::inInlineImage(char ch)
743
2.66M
{
744
2.66M
    if ((raw_val.length() + 1) == inline_image_bytes) {
745
1.67k
        QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
746
1.67k
        type = tt::tt_inline_image;
747
1.67k
        inline_image_bytes = 0;
748
1.67k
        state = st_token_ready;
749
1.67k
    }
750
2.66M
}
751
752
void
753
QPDFTokenizer::presentEOF()
754
0
{
755
0
    m->presentEOF();
756
0
}
757
758
void
759
Tokenizer::presentEOF()
760
40.8k
{
761
40.8k
    switch (state) {
762
2.46k
    case st_name:
763
2.62k
    case st_name_hex1:
764
2.76k
    case st_name_hex2:
765
9.36k
    case st_number:
766
9.56k
    case st_real:
767
9.74k
    case st_sign:
768
9.91k
    case st_decimal:
769
16.9k
    case st_literal:
770
16.9k
        QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token");
771
        // Push any delimiter to the state machine to finish off the final token.
772
16.9k
        presentCharacter('\f');
773
16.9k
        in_token = true;
774
16.9k
        break;
775
776
0
    case st_top:
777
17.9k
    case st_before_token:
778
17.9k
        type = tt::tt_eof;
779
17.9k
        break;
780
781
1.10k
    case st_in_space:
782
1.10k
        type = include_ignorable ? tt::tt_space : tt::tt_eof;
783
1.10k
        break;
784
785
1.11k
    case st_in_comment:
786
1.11k
        type = include_ignorable ? tt::tt_comment : tt::tt_bad;
787
1.11k
        break;
788
789
0
    case st_token_ready:
790
0
        break;
791
792
3.70k
    default:
793
3.70k
        QTC::TC("qpdf", "QPDFTokenizer EOF reading token");
794
3.70k
        type = tt::tt_bad;
795
3.70k
        error_message = "EOF while reading token";
796
40.8k
    }
797
40.8k
    state = st_token_ready;
798
40.8k
}
799
800
void
801
QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
802
0
{
803
0
    m->expectInlineImage(*input);
804
0
}
805
806
void
807
QPDFTokenizer::expectInlineImage(InputSource& input)
808
1.73k
{
809
1.73k
    m->expectInlineImage(input);
810
1.73k
}
811
812
void
813
Tokenizer::expectInlineImage(InputSource& input)
814
1.73k
{
815
1.73k
    if (state == st_token_ready) {
816
0
        reset();
817
1.73k
    } else if (state != st_before_token) {
818
0
        throw std::logic_error(
819
0
            "QPDFTokenizer::expectInlineImage called when tokenizer is in improper state");
820
0
    }
821
1.73k
    findEI(input);
822
1.73k
    before_token = false;
823
1.73k
    in_token = true;
824
1.73k
    state = st_inline_image;
825
1.73k
}
826
827
void
828
Tokenizer::findEI(InputSource& input)
829
1.73k
{
830
1.73k
    qpdf_offset_t last_offset = input.getLastOffset();
831
1.73k
    qpdf_offset_t pos = input.tell();
832
833
    // Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several
834
    // tokens or up to EOF. If we find any suspicious-looking or tokens, this is probably still part
835
    // of the image data, so keep looking for EI. Stop at the first EI that passes. If we get to the
836
    // end without finding one, return the last EI we found. Store the number of bytes expected in
837
    // the inline image including the EI and use that to break out of inline image, falling back to
838
    // the old method if needed.
839
840
1.73k
    bool okay = false;
841
1.73k
    bool first_try = true;
842
11.2k
    while (!okay) {
843
10.1k
        QPDFWordTokenFinder f(input, "EI");
844
10.1k
        if (!input.findFirst("EI", input.tell(), 0, f)) {
845
709
            break;
846
709
        }
847
9.48k
        inline_image_bytes = QIntC::to_size(input.tell() - pos - 2);
848
849
9.48k
        Tokenizer check;
850
9.48k
        bool found_bad = false;
851
        // Look at the next 10 tokens or up to EOF. The next inline image's image data would look
852
        // like bad tokens, but there will always be at least 10 tokens between one inline image's
853
        // EI and the next valid one's ID since width, height, bits per pixel, and color space are
854
        // all required as well as a BI and ID. If we get 10 good tokens in a row or hit EOF, we can
855
        // be pretty sure we've found the actual EI.
856
31.9k
        for (int i = 0; i < 10; ++i) {
857
30.8k
            check.nextToken(input, "checker");
858
30.8k
            auto typ = check.getType();
859
30.8k
            if (typ == tt::tt_eof) {
860
0
                okay = true;
861
30.8k
            } else if (typ == tt::tt_bad) {
862
2.85k
                found_bad = true;
863
28.0k
            } else if (typ == tt::tt_word) {
864
                // The qpdf tokenizer lumps alphabetic and otherwise uncategorized characters into
865
                // "words". We recognize strings of alphabetic characters as potential valid
866
                // operators for purposes of telling whether we're in valid content or not. It's not
867
                // perfect, but it should work more reliably than what we used to do, which was
868
                // already good enough for the vast majority of files.
869
17.6k
                bool found_alpha = false;
870
17.6k
                bool found_non_printable = false;
871
17.6k
                bool found_other = false;
872
50.3k
                for (char ch: check.getValue()) {
873
50.3k
                    if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch == '*')) {
874
                        // Treat '*' as alpha since there are valid PDF operators that contain *
875
                        // along with alphabetic characters.
876
34.4k
                        found_alpha = true;
877
34.4k
                    } else if (static_cast<signed char>(ch) < 32 && !isSpace(ch)) {
878
                        // Compare ch as a signed char so characters outside of 7-bit will be < 0.
879
1.84k
                        found_non_printable = true;
880
1.84k
                        break;
881
14.0k
                    } else {
882
14.0k
                        found_other = true;
883
14.0k
                    }
884
50.3k
                }
885
17.6k
                if (found_non_printable || (found_alpha && found_other)) {
886
5.60k
                    found_bad = true;
887
5.60k
                }
888
17.6k
            }
889
30.8k
            if (okay || found_bad) {
890
8.46k
                break;
891
8.46k
            }
892
30.8k
        }
893
9.48k
        if (!found_bad) {
894
1.02k
            okay = true;
895
1.02k
        }
896
9.48k
        if (!okay) {
897
8.46k
            first_try = false;
898
8.46k
        }
899
9.48k
    }
900
1.73k
    if (okay && (!first_try)) {
901
115
        QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try");
902
115
    }
903
904
1.73k
    input.seek(pos, SEEK_SET);
905
1.73k
    input.setLastOffset(last_offset);
906
1.73k
}
907
908
bool
909
QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
910
0
{
911
0
    return m->getToken(token, unread_char, ch);
912
0
}
913
914
bool
915
Tokenizer::getToken(Token& token, bool& unread_char, char& ch)
916
6.44M
{
917
6.44M
    bool ready = (state == st_token_ready);
918
6.44M
    unread_char = !in_token && !before_token;
919
6.44M
    ch = char_to_unread;
920
6.44M
    if (ready) {
921
6.44M
        token = (!(type == tt::tt_name || type == tt::tt_string))
922
6.44M
            ? Token(type, raw_val, raw_val, error_message)
923
6.44M
            : Token(type, val, raw_val, error_message);
924
925
6.44M
        reset();
926
6.44M
    }
927
6.44M
    return ready;
928
6.44M
}
929
930
bool
931
QPDFTokenizer::betweenTokens()
932
0
{
933
0
    return m->betweenTokens();
934
0
}
935
936
bool
937
Tokenizer::betweenTokens()
938
0
{
939
0
    return before_token;
940
0
}
941
942
// Overload taking a reference; forwards to the private implementation's readToken().
QPDFTokenizer::Token
QPDFTokenizer::readToken(
    InputSource& input, std::string const& context, bool allow_bad, size_t max_len)
{
    return m->readToken(input, context, allow_bad, max_len);
}
948
949
// Overload taking a shared pointer; forwards the pointed-to input to the private
// implementation. The pointer is dereferenced without a null check.
QPDFTokenizer::Token
QPDFTokenizer::readToken(
    std::shared_ptr<InputSource> input, std::string const& context, bool allow_bad, size_t max_len)
{
    return m->readToken(*input, context, allow_bad, max_len);
}
955
956
// Read the next token from `input` and return it.
//
// max_len is passed on to nextToken(). A bad token is returned as is when allow_bad is true;
// otherwise a QPDFExc (qpdf_e_damaged_pdf) is thrown that carries the token's error message and
// is attributed to `context`, or to the token's offset if `context` is empty.
QPDFTokenizer::Token
Tokenizer::readToken(InputSource& input, std::string const& context, bool allow_bad, size_t max_len)
{
    nextToken(input, context, max_len);

    Token token;
    // getToken() insists on reporting these; nextToken() has already dealt with unreading.
    bool unread_ignored;
    char ch_ignored;
    getToken(token, unread_ignored, ch_ignored);

    if (token.getType() != tt::tt_bad) {
        return token;
    }

    if (!allow_bad) {
        throw QPDFExc(
            qpdf_e_damaged_pdf,
            input.getName(),
            context.empty() ? "offset " + std::to_string(input.getLastOffset()) : context,
            input.getLastOffset(),
            token.getErrorMessage());
    }
    QTC::TC("qpdf", "QPDFTokenizer allowing bad token");
    return token;
}
980
981
bool
982
Tokenizer::nextToken(InputSource& input, std::string const& context, size_t max_len)
983
16.7M
{
984
16.7M
    if (state != st_inline_image) {
985
16.7M
        reset();
986
16.7M
    }
987
16.7M
    qpdf_offset_t offset = input.fastTell();
988
989
251M
    while (state != st_token_ready) {
990
234M
        char ch;
991
234M
        if (!input.fastRead(ch)) {
992
40.8k
            presentEOF();
993
994
40.8k
            if ((type == tt::tt_eof) && (!allow_eof)) {
995
                // Nothing in the qpdf library calls readToken without allowEOF anymore, so this
996
                // case is not exercised.
997
185
                type = tt::tt_bad;
998
185
                error_message = "unexpected EOF";
999
185
                offset = input.getLastOffset();
1000
185
            }
1001
234M
        } else {
1002
234M
            handleCharacter(ch);
1003
234M
            if (before_token) {
1004
23.0M
                ++offset;
1005
23.0M
            }
1006
234M
            if (in_token) {
1007
196M
                raw_val += ch;
1008
196M
            }
1009
234M
            if (max_len && (raw_val.length() >= max_len) && (state != st_token_ready)) {
1010
                // terminate this token now
1011
190k
                QTC::TC("qpdf", "QPDFTokenizer block long token");
1012
190k
                type = tt::tt_bad;
1013
190k
                state = st_token_ready;
1014
190k
                error_message = "exceeded allowable length while reading token";
1015
190k
            }
1016
234M
        }
1017
234M
    }
1018
1019
16.7M
    input.fastUnread(!in_token && !before_token);
1020
1021
16.7M
    if (type != tt::tt_eof) {
1022
16.6M
        input.setLastOffset(offset);
1023
16.6M
    }
1024
1025
16.7M
    return error_message.empty();
1026
16.7M
}