Coverage Report

Created: 2024-09-08 06:06

/src/qpdf/libqpdf/QPDFTokenizer.cc
Line
Count
Source (jump to first uncovered line)
1
#include <qpdf/QPDFTokenizer.hh>
2
3
// DO NOT USE ctype -- it is locale dependent for some things, and it's not worth the risk of
4
// including it in case it may accidentally be used.
5
6
#include <qpdf/QIntC.hh>
7
#include <qpdf/QPDFExc.hh>
8
#include <qpdf/QPDFObjectHandle.hh>
9
#include <qpdf/QTC.hh>
10
#include <qpdf/QUtil.hh>
11
12
#include <cstdlib>
13
#include <cstring>
14
#include <stdexcept>
15
16
// Return true if ch is one of the characters that ends a name, number or word token here: NUL,
// a whitespace character (space, tab, LF, VT, FF, CR), or one of / ( ) { } < > [ ] %.
static inline bool
is_delimiter(char ch)
{
    switch (ch) {
    case '\0':
    case '\t':
    case '\n':
    case '\v':
    case '\f':
    case '\r':
    case ' ':
    case '%':
    case '(':
    case ')':
    case '/':
    case '<':
    case '>':
    case '[':
    case ']':
    case '{':
    case '}':
        return true;
    default:
        return false;
    }
}
24
25
namespace
26
{
27
    class QPDFWordTokenFinder: public InputSource::Finder
28
    {
29
      public:
30
        QPDFWordTokenFinder(InputSource& is, std::string const& str) :
31
            is(is),
32
            str(str)
33
0
        {
34
0
        }
35
0
        ~QPDFWordTokenFinder() override = default;
36
        bool check() override;
37
38
      private:
39
        InputSource& is;
40
        std::string str;
41
    };
42
} // namespace
43
44
bool
45
QPDFWordTokenFinder::check()
46
0
{
47
    // Find a word token matching the given string, preceded by a delimiter, and followed by a
48
    // delimiter or EOF.
49
0
    QPDFTokenizer tokenizer;
50
0
    QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
51
0
    qpdf_offset_t pos = is.tell();
52
0
    if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) {
53
0
        QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
54
0
        return false;
55
0
    }
56
0
    qpdf_offset_t token_start = is.getLastOffset();
57
0
    char next;
58
0
    bool next_okay = false;
59
0
    if (is.read(&next, 1) == 0) {
60
0
        QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
61
0
        next_okay = true;
62
0
    } else {
63
0
        next_okay = is_delimiter(next);
64
0
    }
65
0
    is.seek(pos, SEEK_SET);
66
0
    if (!next_okay) {
67
0
        return false;
68
0
    }
69
0
    if (token_start == 0) {
70
        // Can't actually happen...we never start the search at the beginning of the input.
71
0
        return false;
72
0
    }
73
0
    return true;
74
0
}
75
76
void
77
QPDFTokenizer::reset()
78
17.8M
{
79
17.8M
    state = st_before_token;
80
17.8M
    type = tt_bad;
81
17.8M
    val.clear();
82
17.8M
    raw_val.clear();
83
17.8M
    error_message = "";
84
17.8M
    before_token = true;
85
17.8M
    in_token = false;
86
17.8M
    char_to_unread = '\0';
87
17.8M
    inline_image_bytes = 0;
88
17.8M
    string_depth = 0;
89
17.8M
    bad = false;
90
17.8M
}
91
92
// Build a token from a type and value. The raw value starts out equal to the value; for strings
// and names it is replaced with the unparsed form of the corresponding QPDFObjectHandle.
QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) :
    type(type),
    value(value),
    raw_value(value)
{
    switch (type) {
    case tt_string:
        raw_value = QPDFObjectHandle::newString(value).unparse();
        break;

    case tt_name:
        raw_value = QPDFObjectHandle::newName(value).unparse();
        break;

    default:
        // Every other token type keeps raw_value == value.
        break;
    }
}
103
104
// A new tokenizer starts with EOF not allowed and ignorable tokens not reported; everything else
// is initialized by reset().
QPDFTokenizer::QPDFTokenizer() :
    allow_eof(false),
    include_ignorable(false)
{
    this->reset();
}
110
111
void
112
QPDFTokenizer::allowEOF()
113
6.27k
{
114
6.27k
    this->allow_eof = true;
115
6.27k
}
116
117
void
118
QPDFTokenizer::includeIgnorable()
119
0
{
120
0
    this->include_ignorable = true;
121
0
}
122
123
bool
124
QPDFTokenizer::isSpace(char ch)
125
49.8M
{
126
49.8M
    return ((ch == '\0') || QUtil::is_space(ch));
127
49.8M
}
128
129
bool
130
QPDFTokenizer::isDelimiter(char ch)
131
105M
{
132
105M
    return is_delimiter(ch);
133
105M
}
134
135
void
136
QPDFTokenizer::presentCharacter(char ch)
137
7.02k
{
138
7.02k
    handleCharacter(ch);
139
140
7.02k
    if (this->in_token) {
141
0
        this->raw_val += ch;
142
0
    }
143
7.02k
}
144
145
void
146
QPDFTokenizer::handleCharacter(char ch)
147
238M
{
148
    // In some cases, functions called below may call a second handler. This happens whenever you
149
    // have to use a character from the next token to detect the end of the current token.
150
151
238M
    switch (this->state) {
152
0
    case st_top:
153
0
        inTop(ch);
154
0
        return;
155
156
0
    case st_in_space:
157
0
        inSpace(ch);
158
0
        return;
159
160
2.13M
    case st_in_comment:
161
2.13M
        inComment(ch);
162
2.13M
        return;
163
164
704k
    case st_lt:
165
704k
        inLt(ch);
166
704k
        return;
167
168
459k
    case st_gt:
169
459k
        inGt(ch);
170
459k
        return;
171
172
49.5M
    case st_in_string:
173
49.5M
        inString(ch);
174
49.5M
        return;
175
176
35.8M
    case st_name:
177
35.8M
        inName(ch);
178
35.8M
        return;
179
180
31.0M
    case st_number:
181
31.0M
        inNumber(ch);
182
31.0M
        return;
183
184
771k
    case st_real:
185
771k
        inReal(ch);
186
771k
        return;
187
188
251k
    case st_string_after_cr:
189
251k
        inStringAfterCR(ch);
190
251k
        return;
191
192
104k
    case st_string_escape:
193
104k
        inStringEscape(ch);
194
104k
        return;
195
196
8.22k
    case st_char_code:
197
8.22k
        inCharCode(ch);
198
8.22k
        return;
199
200
66.1M
    case st_literal:
201
66.1M
        inLiteral(ch);
202
66.1M
        return;
203
204
0
    case st_inline_image:
205
0
        inInlineImage(ch);
206
0
        return;
207
208
1.21M
    case st_in_hexstring:
209
1.21M
        inHexstring(ch);
210
1.21M
        return;
211
212
1.07M
    case st_in_hexstring_2nd:
213
1.07M
        inHexstring2nd(ch);
214
1.07M
        return;
215
216
8.17k
    case st_name_hex1:
217
8.17k
        inNameHex1(ch);
218
8.17k
        return;
219
220
782
    case st_name_hex2:
221
782
        inNameHex2(ch);
222
782
        return;
223
224
17.2k
    case st_sign:
225
17.2k
        inSign(ch);
226
17.2k
        return;
227
228
5.59k
    case st_decimal:
229
5.59k
        inDecimal(ch);
230
5.59k
        return;
231
232
49.5M
    case (st_before_token):
233
49.5M
        inBeforeToken(ch);
234
49.5M
        return;
235
236
0
    case (st_token_ready):
237
0
        inTokenReady(ch);
238
0
        return;
239
240
0
    default:
241
0
        throw std::logic_error("INTERNAL ERROR: invalid state while reading token");
242
238M
    }
243
238M
}
244
245
void
246
QPDFTokenizer::inTokenReady(char ch)
247
0
{
248
0
    throw std::logic_error(
249
0
        "INTERNAL ERROR: QPDF tokenizer presented character while token is waiting");
250
0
}
251
252
void
253
QPDFTokenizer::inBeforeToken(char ch)
254
49.5M
{
255
    // Note: we specifically do not use ctype here.  It is locale-dependent.
256
49.5M
    if (isSpace(ch)) {
257
37.6M
        this->before_token = !this->include_ignorable;
258
37.6M
        this->in_token = this->include_ignorable;
259
37.6M
        if (this->include_ignorable) {
260
0
            this->state = st_in_space;
261
0
        }
262
37.6M
    } else if (ch == '%') {
263
83.1k
        this->before_token = !this->include_ignorable;
264
83.1k
        this->in_token = this->include_ignorable;
265
83.1k
        this->state = st_in_comment;
266
11.8M
    } else {
267
11.8M
        this->before_token = false;
268
11.8M
        this->in_token = true;
269
11.8M
        inTop(ch);
270
11.8M
    }
271
49.5M
}
272
273
void
274
QPDFTokenizer::inTop(char ch)
275
11.8M
{
276
11.8M
    switch (ch) {
277
42.6k
    case '(':
278
42.6k
        this->string_depth = 1;
279
42.6k
        this->state = st_in_string;
280
42.6k
        return;
281
282
704k
    case '<':
283
704k
        this->state = st_lt;
284
704k
        return;
285
286
459k
    case '>':
287
459k
        this->state = st_gt;
288
459k
        return;
289
290
5.61k
    case (')'):
291
5.61k
        this->type = tt_bad;
292
5.61k
        QTC::TC("qpdf", "QPDFTokenizer bad )");
293
5.61k
        this->error_message = "unexpected )";
294
5.61k
        this->state = st_token_ready;
295
5.61k
        return;
296
297
213k
    case '[':
298
213k
        this->type = tt_array_open;
299
213k
        this->state = st_token_ready;
300
213k
        return;
301
302
232k
    case ']':
303
232k
        this->type = tt_array_close;
304
232k
        this->state = st_token_ready;
305
232k
        return;
306
307
2.46k
    case '{':
308
2.46k
        this->type = tt_brace_open;
309
2.46k
        this->state = st_token_ready;
310
2.46k
        return;
311
312
2.70k
    case '}':
313
2.70k
        this->type = tt_brace_close;
314
2.70k
        this->state = st_token_ready;
315
2.70k
        return;
316
317
2.64M
    case '/':
318
2.64M
        this->state = st_name;
319
2.64M
        this->val += ch;
320
2.64M
        return;
321
322
1.57M
    case '0':
323
2.28M
    case '1':
324
2.58M
    case '2':
325
2.79M
    case '3':
326
2.95M
    case '4':
327
3.26M
    case '5':
328
3.57M
    case '6':
329
3.81M
    case '7':
330
3.99M
    case '8':
331
4.10M
    case '9':
332
4.10M
        this->state = st_number;
333
4.10M
        return;
334
335
2.01k
    case '+':
336
17.2k
    case '-':
337
17.2k
        this->state = st_sign;
338
17.2k
        return;
339
340
5.53k
    case '.':
341
5.53k
        this->state = st_decimal;
342
5.53k
        return;
343
344
3.40M
    default:
345
3.40M
        this->state = st_literal;
346
3.40M
        return;
347
11.8M
    }
348
11.8M
}
349
350
void
351
QPDFTokenizer::inSpace(char ch)
352
0
{
353
    // We only enter this state if include_ignorable is true.
354
0
    if (!isSpace(ch)) {
355
0
        this->type = tt_space;
356
0
        this->in_token = false;
357
0
        this->char_to_unread = ch;
358
0
        this->state = st_token_ready;
359
0
    }
360
0
}
361
362
void
363
QPDFTokenizer::inComment(char ch)
364
2.13M
{
365
2.13M
    if ((ch == '\r') || (ch == '\n')) {
366
82.4k
        if (this->include_ignorable) {
367
0
            this->type = tt_comment;
368
0
            this->in_token = false;
369
0
            this->char_to_unread = ch;
370
0
            this->state = st_token_ready;
371
82.4k
        } else {
372
82.4k
            this->state = st_before_token;
373
82.4k
        }
374
82.4k
    }
375
2.13M
}
376
377
void
378
QPDFTokenizer::inString(char ch)
379
49.8M
{
380
49.8M
    switch (ch) {
381
104k
    case '\\':
382
104k
        this->state = st_string_escape;
383
104k
        return;
384
385
133k
    case '(':
386
133k
        this->val += ch;
387
133k
        ++this->string_depth;
388
133k
        return;
389
390
139k
    case ')':
391
139k
        if (--this->string_depth == 0) {
392
35.4k
            this->type = tt_string;
393
35.4k
            this->state = st_token_ready;
394
35.4k
            return;
395
35.4k
        }
396
397
103k
        this->val += ch;
398
103k
        return;
399
400
249k
    case '\r':
401
        // CR by itself is converted to LF
402
249k
        this->val += '\n';
403
249k
        this->state = st_string_after_cr;
404
249k
        return;
405
406
557k
    case '\n':
407
557k
        this->val += ch;
408
557k
        return;
409
410
48.6M
    default:
411
48.6M
        this->val += ch;
412
48.6M
        return;
413
49.8M
    }
414
49.8M
}
415
416
void
417
QPDFTokenizer::inName(char ch)
418
35.8M
{
419
35.8M
    if (isDelimiter(ch)) {
420
        // A C-locale whitespace character or delimiter terminates token.  It is important to unread
421
        // the whitespace character even though it is ignored since it may be the newline after a
422
        // stream keyword.  Removing it here could make the stream-reading code break on some files,
423
        // though not on any files in the test suite as of this
424
        // writing.
425
426
2.55M
        this->type = this->bad ? tt_bad : tt_name;
427
2.55M
        this->in_token = false;
428
2.55M
        this->char_to_unread = ch;
429
2.55M
        this->state = st_token_ready;
430
33.3M
    } else if (ch == '#') {
431
8.21k
        this->char_code = 0;
432
8.21k
        this->state = st_name_hex1;
433
33.3M
    } else {
434
33.3M
        this->val += ch;
435
33.3M
    }
436
35.8M
}
437
438
void
439
QPDFTokenizer::inNameHex1(char ch)
440
8.17k
{
441
8.17k
    this->hex_char = ch;
442
443
8.17k
    if (char hval = QUtil::hex_decode_char(ch); hval < '\20') {
444
782
        this->char_code = int(hval) << 4;
445
782
        this->state = st_name_hex2;
446
7.39k
    } else {
447
7.39k
        QTC::TC("qpdf", "QPDFTokenizer bad name 1");
448
7.39k
        this->error_message = "name with stray # will not work with PDF >= 1.2";
449
        // Use null to encode a bad # -- this is reversed in QPDF_Name::normalizeName.
450
7.39k
        this->val += '\0';
451
7.39k
        this->state = st_name;
452
7.39k
        inName(ch);
453
7.39k
    }
454
8.17k
}
455
456
void
457
QPDFTokenizer::inNameHex2(char ch)
458
782
{
459
782
    if (char hval = QUtil::hex_decode_char(ch); hval < '\20') {
460
529
        this->char_code |= int(hval);
461
529
    } else {
462
253
        QTC::TC("qpdf", "QPDFTokenizer bad name 2");
463
253
        this->error_message = "name with stray # will not work with PDF >= 1.2";
464
        // Use null to encode a bad # -- this is reversed in QPDF_Name::normalizeName.
465
253
        this->val += '\0';
466
253
        this->val += this->hex_char;
467
253
        this->state = st_name;
468
253
        inName(ch);
469
253
        return;
470
253
    }
471
529
    if (this->char_code == 0) {
472
90
        QTC::TC("qpdf", "QPDFTokenizer null in name");
473
90
        this->error_message = "null character not allowed in name token";
474
90
        this->val += "#00";
475
90
        this->state = st_name;
476
90
        this->bad = true;
477
439
    } else {
478
439
        this->val += char(this->char_code);
479
439
        this->state = st_name;
480
439
    }
481
529
}
482
483
void
484
QPDFTokenizer::inSign(char ch)
485
17.2k
{
486
17.2k
    if (QUtil::is_digit(ch)) {
487
11.0k
        this->state = st_number;
488
11.0k
    } else if (ch == '.') {
489
66
        this->state = st_decimal;
490
6.08k
    } else {
491
6.08k
        this->state = st_literal;
492
6.08k
        inLiteral(ch);
493
6.08k
    }
494
17.2k
}
495
496
void
497
QPDFTokenizer::inDecimal(char ch)
498
5.59k
{
499
5.59k
    if (QUtil::is_digit(ch)) {
500
949
        this->state = st_real;
501
4.65k
    } else {
502
4.65k
        this->state = st_literal;
503
4.65k
        inLiteral(ch);
504
4.65k
    }
505
5.59k
}
506
507
void
508
QPDFTokenizer::inNumber(char ch)
509
31.0M
{
510
31.0M
    if (QUtil::is_digit(ch)) {
511
27.1M
    } else if (ch == '.') {
512
189k
        this->state = st_real;
513
3.71M
    } else if (isDelimiter(ch)) {
514
3.66M
        this->type = tt_integer;
515
3.66M
        this->state = st_token_ready;
516
3.66M
        this->in_token = false;
517
3.66M
        this->char_to_unread = ch;
518
3.66M
    } else {
519
57.7k
        this->state = st_literal;
520
57.7k
    }
521
31.0M
}
522
523
void
524
QPDFTokenizer::inReal(char ch)
525
771k
{
526
771k
    if (QUtil::is_digit(ch)) {
527
581k
    } else if (isDelimiter(ch)) {
528
188k
        this->type = tt_real;
529
188k
        this->state = st_token_ready;
530
188k
        this->in_token = false;
531
188k
        this->char_to_unread = ch;
532
188k
    } else {
533
1.22k
        this->state = st_literal;
534
1.22k
    }
535
771k
}
536
void
537
QPDFTokenizer::inStringEscape(char ch)
538
104k
{
539
104k
    this->state = st_in_string;
540
104k
    switch (ch) {
541
566
    case '0':
542
1.61k
    case '1':
543
2.15k
    case '2':
544
3.33k
    case '3':
545
4.23k
    case '4':
546
4.88k
    case '5':
547
5.53k
    case '6':
548
6.83k
    case '7':
549
6.83k
        this->state = st_char_code;
550
6.83k
        this->char_code = 0;
551
6.83k
        this->digit_count = 0;
552
6.83k
        inCharCode(ch);
553
6.83k
        return;
554
555
723
    case 'n':
556
723
        this->val += '\n';
557
723
        return;
558
559
1.67k
    case 'r':
560
1.67k
        this->val += '\r';
561
1.67k
        return;
562
563
647
    case 't':
564
647
        this->val += '\t';
565
647
        return;
566
567
2.83k
    case 'b':
568
2.83k
        this->val += '\b';
569
2.83k
        return;
570
571
928
    case 'f':
572
928
        this->val += '\f';
573
928
        return;
574
575
673
    case '\n':
576
673
        return;
577
578
1.48k
    case '\r':
579
1.48k
        this->state = st_string_after_cr;
580
1.48k
        return;
581
582
89.0k
    default:
583
        // PDF spec says backslash is ignored before anything else
584
89.0k
        this->val += ch;
585
89.0k
        return;
586
104k
    }
587
104k
}
588
589
void
590
QPDFTokenizer::inStringAfterCR(char ch)
591
251k
{
592
251k
    this->state = st_in_string;
593
251k
    if (ch != '\n') {
594
218k
        inString(ch);
595
218k
    }
596
251k
}
597
598
void
599
QPDFTokenizer::inLt(char ch)
600
704k
{
601
704k
    if (ch == '<') {
602
514k
        this->type = tt_dict_open;
603
514k
        this->state = st_token_ready;
604
514k
        return;
605
514k
    }
606
607
189k
    this->state = st_in_hexstring;
608
189k
    inHexstring(ch);
609
189k
}
610
611
void
612
QPDFTokenizer::inGt(char ch)
613
459k
{
614
459k
    if (ch == '>') {
615
446k
        this->type = tt_dict_close;
616
446k
        this->state = st_token_ready;
617
446k
    } else {
618
12.8k
        this->type = tt_bad;
619
12.8k
        QTC::TC("qpdf", "QPDFTokenizer bad >");
620
12.8k
        this->error_message = "unexpected >";
621
12.8k
        this->in_token = false;
622
12.8k
        this->char_to_unread = ch;
623
12.8k
        this->state = st_token_ready;
624
12.8k
    }
625
459k
}
626
627
void
628
QPDFTokenizer::inLiteral(char ch)
629
66.1M
{
630
66.1M
    if (isDelimiter(ch)) {
631
        // A C-locale whitespace character or delimiter terminates token.  It is important to unread
632
        // the whitespace character even though it is ignored since it may be the newline after a
633
        // stream keyword.  Removing it here could make the stream-reading code break on some files,
634
        // though not on any files in the test suite as of this writing.
635
636
3.22M
        this->in_token = false;
637
3.22M
        this->char_to_unread = ch;
638
3.22M
        this->state = st_token_ready;
639
3.22M
        this->type = (this->raw_val == "true") || (this->raw_val == "false")
640
3.22M
            ? tt_bool
641
3.22M
            : (this->raw_val == "null" ? tt_null : tt_word);
642
3.22M
    }
643
66.1M
}
644
645
void
646
QPDFTokenizer::inHexstring(char ch)
647
1.40M
{
648
1.40M
    if (char hval = QUtil::hex_decode_char(ch); hval < '\20') {
649
1.05M
        this->char_code = int(hval) << 4;
650
1.05M
        this->state = st_in_hexstring_2nd;
651
652
1.05M
    } else if (ch == '>') {
653
70.2k
        this->type = tt_string;
654
70.2k
        this->state = st_token_ready;
655
656
284k
    } else if (isSpace(ch)) {
657
        // ignore
658
659
178k
    } else {
660
105k
        this->type = tt_bad;
661
105k
        QTC::TC("qpdf", "QPDFTokenizer bad hexstring character");
662
105k
        this->error_message = std::string("invalid character (") + ch + ") in hexstring";
663
105k
        this->state = st_token_ready;
664
105k
    }
665
1.40M
}
666
667
void
668
QPDFTokenizer::inHexstring2nd(char ch)
669
1.07M
{
670
1.07M
    if (char hval = QUtil::hex_decode_char(ch); hval < '\20') {
671
1.03M
        this->val += char(this->char_code) | hval;
672
1.03M
        this->state = st_in_hexstring;
673
674
1.03M
    } else if (ch == '>') {
675
        // PDF spec says odd hexstrings have implicit trailing 0.
676
1.05k
        this->val += char(this->char_code);
677
1.05k
        this->type = tt_string;
678
1.05k
        this->state = st_token_ready;
679
680
33.1k
    } else if (isSpace(ch)) {
681
        // ignore
682
683
31.1k
    } else {
684
2.05k
        this->type = tt_bad;
685
2.05k
        QTC::TC("qpdf", "QPDFTokenizer bad hexstring 2nd character");
686
2.05k
        this->error_message = std::string("invalid character (") + ch + ") in hexstring";
687
2.05k
        this->state = st_token_ready;
688
2.05k
    }
689
1.07M
}
690
691
void
692
QPDFTokenizer::inCharCode(char ch)
693
15.0k
{
694
15.0k
    bool handled = false;
695
15.0k
    if (('0' <= ch) && (ch <= '7')) {
696
8.31k
        this->char_code = 8 * this->char_code + (int(ch) - int('0'));
697
8.31k
        if (++(this->digit_count) < 3) {
698
8.22k
            return;
699
8.22k
        }
700
89
        handled = true;
701
89
    }
702
    // We've accumulated \ddd or we have \d or \dd followed by other than an octal digit. The PDF
703
    // Spec says to ignore high-order overflow.
704
6.83k
    this->val += char(this->char_code % 256);
705
6.83k
    this->state = st_in_string;
706
6.83k
    if (!handled) {
707
6.74k
        inString(ch);
708
6.74k
    }
709
6.83k
}
710
711
void
712
QPDFTokenizer::inInlineImage(char ch)
713
0
{
714
0
    if ((this->raw_val.length() + 1) == this->inline_image_bytes) {
715
0
        QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
716
0
        this->type = tt_inline_image;
717
0
        this->inline_image_bytes = 0;
718
0
        this->state = st_token_ready;
719
0
    }
720
0
}
721
722
void
723
QPDFTokenizer::presentEOF()
724
24.4k
{
725
24.4k
    switch (this->state) {
726
4.42k
    case st_name:
727
4.44k
    case st_name_hex1:
728
4.44k
    case st_name_hex2:
729
5.38k
    case st_number:
730
5.43k
    case st_real:
731
5.45k
    case st_sign:
732
5.47k
    case st_decimal:
733
7.02k
    case st_literal:
734
7.02k
        QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token");
735
        // Push any delimiter to the state machine to finish off the final token.
736
7.02k
        presentCharacter('\f');
737
7.02k
        this->in_token = true;
738
7.02k
        break;
739
740
0
    case st_top:
741
14.7k
    case st_before_token:
742
14.7k
        this->type = tt_eof;
743
14.7k
        break;
744
745
0
    case st_in_space:
746
0
        this->type = this->include_ignorable ? tt_space : tt_eof;
747
0
        break;
748
749
703
    case st_in_comment:
750
703
        this->type = this->include_ignorable ? tt_comment : tt_bad;
751
703
        break;
752
753
0
    case st_token_ready:
754
0
        break;
755
756
1.95k
    default:
757
1.95k
        QTC::TC("qpdf", "QPDFTokenizer EOF reading token");
758
1.95k
        this->type = tt_bad;
759
1.95k
        this->error_message = "EOF while reading token";
760
24.4k
    }
761
24.4k
    this->state = st_token_ready;
762
24.4k
}
763
764
void
765
QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input)
766
0
{
767
0
    expectInlineImage(*input);
768
0
}
769
770
void
771
QPDFTokenizer::expectInlineImage(InputSource& input)
772
0
{
773
0
    if (this->state == st_token_ready) {
774
0
        reset();
775
0
    } else if (this->state != st_before_token) {
776
0
        throw std::logic_error(
777
0
            "QPDFTokenizer::expectInlineImage called when tokenizer is in improper state");
778
0
    }
779
0
    findEI(input);
780
0
    this->before_token = false;
781
0
    this->in_token = true;
782
0
    this->state = st_inline_image;
783
0
}
784
785
void
786
QPDFTokenizer::findEI(InputSource& input)
787
0
{
788
0
    qpdf_offset_t last_offset = input.getLastOffset();
789
0
    qpdf_offset_t pos = input.tell();
790
791
    // Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several
792
    // tokens or up to EOF. If we find any suspicious-looking or tokens, this is probably still part
793
    // of the image data, so keep looking for EI. Stop at the first EI that passes. If we get to the
794
    // end without finding one, return the last EI we found. Store the number of bytes expected in
795
    // the inline image including the EI and use that to break out of inline image, falling back to
796
    // the old method if needed.
797
798
0
    bool okay = false;
799
0
    bool first_try = true;
800
0
    while (!okay) {
801
0
        QPDFWordTokenFinder f(input, "EI");
802
0
        if (!input.findFirst("EI", input.tell(), 0, f)) {
803
0
            break;
804
0
        }
805
0
        inline_image_bytes = QIntC::to_size(input.tell() - pos - 2);
806
807
0
        QPDFTokenizer check;
808
0
        bool found_bad = false;
809
        // Look at the next 10 tokens or up to EOF. The next inline image's image data would look
810
        // like bad tokens, but there will always be at least 10 tokens between one inline image's
811
        // EI and the next valid one's ID since width, height, bits per pixel, and color space are
812
        // all required as well as a BI and ID. If we get 10 good tokens in a row or hit EOF, we can
813
        // be pretty sure we've found the actual EI.
814
0
        for (int i = 0; i < 10; ++i) {
815
0
            QPDFTokenizer::Token t = check.readToken(input, "checker", true);
816
0
            token_type_e type = t.getType();
817
0
            if (type == tt_eof) {
818
0
                okay = true;
819
0
            } else if (type == tt_bad) {
820
0
                found_bad = true;
821
0
            } else if (t.isWord()) {
822
                // The qpdf tokenizer lumps alphabetic and otherwise uncategorized characters into
823
                // "words". We recognize strings of alphabetic characters as potential valid
824
                // operators for purposes of telling whether we're in valid content or not. It's not
825
                // perfect, but it should work more reliably than what we used to do, which was
826
                // already good enough for the vast majority of files.
827
0
                bool found_alpha = false;
828
0
                bool found_non_printable = false;
829
0
                bool found_other = false;
830
0
                for (char ch: t.getValue()) {
831
0
                    if (((ch >= 'a') && (ch <= 'z')) || ((ch >= 'A') && (ch <= 'Z')) ||
832
0
                        (ch == '*')) {
833
                        // Treat '*' as alpha since there are valid PDF operators that contain *
834
                        // along with alphabetic characters.
835
0
                        found_alpha = true;
836
0
                    } else if ((static_cast<signed char>(ch) < 32) && (!isSpace(ch))) {
837
                        // Compare ch as a signed char so characters outside of 7-bit will be < 0.
838
0
                        found_non_printable = true;
839
0
                        break;
840
0
                    } else {
841
0
                        found_other = true;
842
0
                    }
843
0
                }
844
0
                if (found_non_printable || (found_alpha && found_other)) {
845
0
                    found_bad = true;
846
0
                }
847
0
            }
848
0
            if (okay || found_bad) {
849
0
                break;
850
0
            }
851
0
        }
852
0
        if (!found_bad) {
853
0
            okay = true;
854
0
        }
855
0
        if (!okay) {
856
0
            first_try = false;
857
0
        }
858
0
    }
859
0
    if (okay && (!first_try)) {
860
0
        QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try");
861
0
    }
862
863
0
    input.seek(pos, SEEK_SET);
864
0
    input.setLastOffset(last_offset);
865
0
}
866
867
bool
868
QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
869
5.99M
{
870
5.99M
    bool ready = (this->state == st_token_ready);
871
5.99M
    unread_char = !this->in_token && !this->before_token;
872
5.99M
    ch = this->char_to_unread;
873
5.99M
    if (ready) {
874
5.99M
        token = (!(this->type == tt_name || this->type == tt_string))
875
5.99M
            ? Token(this->type, this->raw_val, this->raw_val, this->error_message)
876
5.99M
            : Token(this->type, this->val, this->raw_val, this->error_message);
877
878
5.99M
        this->reset();
879
5.99M
    }
880
5.99M
    return ready;
881
5.99M
}
882
883
bool
884
QPDFTokenizer::betweenTokens()
885
0
{
886
0
    return this->before_token;
887
0
}
888
889
// Read the next token from input and return it. A bad token is returned as it is when allow_bad
// is true; otherwise a QPDFExc carrying the token's error message is thrown.
QPDFTokenizer::Token
QPDFTokenizer::readToken(
    InputSource& input, std::string const& context, bool allow_bad, size_t max_len)
{
    nextToken(input, context, max_len);

    // getToken() also reports unread information, which this function does not use.
    Token token;
    bool unread_flag;
    char unread_ch;
    getToken(token, unread_flag, unread_ch);

    if (token.getType() != tt_bad) {
        return token;
    }
    if (allow_bad) {
        QTC::TC("qpdf", "QPDFTokenizer allowing bad token");
        return token;
    }
    throw QPDFExc(
        qpdf_e_damaged_pdf,
        input.getName(),
        context.empty() ? "offset " + std::to_string(input.getLastOffset()) : context,
        input.getLastOffset(),
        token.getErrorMessage());
}
914
915
// Overload taking a shared pointer; forwards to the InputSource& overload.
QPDFTokenizer::Token
QPDFTokenizer::readToken(
    std::shared_ptr<InputSource> input, std::string const& context, bool allow_bad, size_t max_len)
{
    InputSource& source = *input;
    return readToken(source, context, allow_bad, max_len);
}
921
922
bool
923
QPDFTokenizer::nextToken(InputSource& input, std::string const& context, size_t max_len)
924
11.8M
{
925
11.8M
    if (this->state != st_inline_image) {
926
11.8M
        reset();
927
11.8M
    }
928
11.8M
    qpdf_offset_t offset = input.fastTell();
929
930
250M
    while (this->state != st_token_ready) {
931
238M
        char ch;
932
238M
        if (!input.fastRead(ch)) {
933
24.4k
            presentEOF();
934
935
24.4k
            if ((this->type == tt_eof) && (!this->allow_eof)) {
936
                // Nothing in the qpdf library calls readToken without allowEOF anymore, so this
937
                // case is not exercised.
938
0
                this->type = tt_bad;
939
0
                this->error_message = "unexpected EOF";
940
0
                offset = input.getLastOffset();
941
0
            }
942
238M
        } else {
943
238M
            handleCharacter(ch);
944
238M
            if (this->before_token) {
945
39.8M
                ++offset;
946
39.8M
            }
947
238M
            if (this->in_token) {
948
189M
                this->raw_val += ch;
949
189M
            }
950
238M
            if (max_len && (this->raw_val.length() >= max_len) && (this->state != st_token_ready)) {
951
                // terminate this token now
952
562k
                QTC::TC("qpdf", "QPDFTokenizer block long token");
953
562k
                this->type = tt_bad;
954
562k
                this->state = st_token_ready;
955
562k
                this->error_message = "exceeded allowable length while reading token";
956
562k
            }
957
238M
        }
958
238M
    }
959
960
11.8M
    input.fastUnread(!this->in_token && !this->before_token);
961
962
11.8M
    if (this->type != tt_eof) {
963
11.8M
        input.setLastOffset(offset);
964
11.8M
    }
965
966
11.8M
    return this->error_message.empty();
967
11.8M
}