/src/qpdf/libqpdf/QPDFTokenizer.cc
Line | Count | Source (jump to first uncovered line) |
1 | | #include <qpdf/QPDFTokenizer_private.hh> |
2 | | |
3 | | // DO NOT USE ctype -- it is locale dependent for some things, and it's not worth the risk of |
4 | | // including it in case it may accidentally be used. |
5 | | |
6 | | #include <qpdf/InputSource_private.hh> |
7 | | #include <qpdf/QIntC.hh> |
8 | | #include <qpdf/QPDFExc.hh> |
9 | | #include <qpdf/QPDFObjectHandle.hh> |
10 | | #include <qpdf/QTC.hh> |
11 | | #include <qpdf/QUtil.hh> |
12 | | #include <qpdf/Util.hh> |
13 | | |
14 | | #include <cstdlib> |
15 | | #include <cstring> |
16 | | #include <stdexcept> |
17 | | |
18 | | using namespace qpdf; |
19 | | |
20 | | using Token = QPDFTokenizer::Token; |
21 | | using tt = QPDFTokenizer::token_type_e; |
22 | | |
23 | | static inline bool |
24 | | is_delimiter(char ch) |
25 | 1.73G | { |
26 | 1.73G | return ( |
27 | 1.73G | ch == ' ' || ch == '\n' || ch == '/' || ch == '(' || ch == ')' || ch == '{' || ch == '}' || |
28 | 1.73G | ch == '<' || ch == '>' || ch == '[' || ch == ']' || ch == '%' || ch == '\t' || ch == '\r' || |
29 | 1.73G | ch == '\v' || ch == '\f' || ch == 0); |
30 | 1.73G | } |
31 | | |
32 | | namespace |
33 | | { |
34 | | class QPDFWordTokenFinder: public InputSource::Finder |
35 | | { |
36 | | public: |
37 | | QPDFWordTokenFinder(InputSource& is, std::string const& str) : |
38 | 92.2k | is(is), |
39 | 92.2k | str(str) |
40 | 92.2k | { |
41 | 92.2k | } |
42 | 92.2k | ~QPDFWordTokenFinder() override = default; |
43 | | bool check() override; |
44 | | |
45 | | private: |
46 | | InputSource& is; |
47 | | std::string str; |
48 | | }; |
49 | | } // namespace |
50 | | |
51 | | bool |
52 | | QPDFWordTokenFinder::check() |
53 | 140k | { |
54 | | // Find a word token matching the given string, preceded by a delimiter, and followed by a |
55 | | // delimiter or EOF. |
56 | 140k | Tokenizer tokenizer; |
57 | 140k | tokenizer.nextToken(is, "finder", str.size() + 2); |
58 | 140k | qpdf_offset_t pos = is.tell(); |
59 | 140k | if (tokenizer.getType() != tt::tt_word || tokenizer.getValue() != str) { |
60 | 51.8k | QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); |
61 | 51.8k | return false; |
62 | 51.8k | } |
63 | 88.9k | qpdf_offset_t token_start = is.getLastOffset(); |
64 | 88.9k | char next; |
65 | 88.9k | bool next_okay = false; |
66 | 88.9k | if (is.read(&next, 1) == 0) { |
67 | 14 | QTC::TC("qpdf", "QPDFTokenizer inline image at EOF"); |
68 | 14 | next_okay = true; |
69 | 88.9k | } else { |
70 | 88.9k | next_okay = is_delimiter(next); |
71 | 88.9k | } |
72 | 88.9k | is.seek(pos, SEEK_SET); |
73 | 88.9k | if (!next_okay) { |
74 | 0 | return false; |
75 | 0 | } |
76 | 88.9k | if (token_start == 0) { |
77 | | // Can't actually happen...we never start the search at the beginning of the input. |
78 | 0 | return false; |
79 | 0 | } |
80 | 88.9k | return true; |
81 | 88.9k | } |
82 | | |
83 | | void |
84 | | Tokenizer::reset() |
85 | 445M | { |
86 | 445M | state = st_before_token; |
87 | 445M | type = tt::tt_bad; |
88 | 445M | val.clear(); |
89 | 445M | raw_val.clear(); |
90 | 445M | error_message = ""; |
91 | 445M | before_token = true; |
92 | 445M | in_token = false; |
93 | 445M | char_to_unread = '\0'; |
94 | 445M | inline_image_bytes = 0; |
95 | 445M | string_depth = 0; |
96 | 445M | bad = false; |
97 | 445M | } |
98 | | |
99 | | QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) : |
100 | 3.48k | type(type), |
101 | 3.48k | value(value), |
102 | 3.48k | raw_value(value) |
103 | 3.48k | { |
104 | 3.48k | if (type == tt_string) { |
105 | 0 | raw_value = QPDFObjectHandle::newString(value).unparse(); |
106 | 3.48k | } else if (type == tt_name) { |
107 | 0 | raw_value = QPDFObjectHandle::newName(value).unparse(); |
108 | 0 | } |
109 | 3.48k | } |
110 | | |
111 | | QPDFTokenizer::QPDFTokenizer() : |
112 | 148k | m(std::make_unique<qpdf::Tokenizer>()) |
113 | 148k | { |
114 | 148k | } |
115 | | |
116 | 148k | QPDFTokenizer::~QPDFTokenizer() = default; |
117 | | |
118 | | Tokenizer::Tokenizer() |
119 | 895k | { |
120 | 895k | reset(); |
121 | 895k | } |
122 | | |
123 | | void |
124 | | QPDFTokenizer::allowEOF() |
125 | 148k | { |
126 | 148k | m->allowEOF(); |
127 | 148k | } |
128 | | |
129 | | void |
130 | | Tokenizer::allowEOF() |
131 | 438k | { |
132 | 438k | allow_eof = true; |
133 | 438k | } |
134 | | |
135 | | void |
136 | | QPDFTokenizer::includeIgnorable() |
137 | 148k | { |
138 | 148k | m->includeIgnorable(); |
139 | 148k | } |
140 | | |
141 | | void |
142 | | Tokenizer::includeIgnorable() |
143 | 148k | { |
144 | 148k | include_ignorable = true; |
145 | 148k | } |
146 | | |
147 | | bool |
148 | | Tokenizer::isSpace(char ch) |
149 | 568M | { |
150 | 568M | return (ch == '\0' || util::is_space(ch)); |
151 | 568M | } |
152 | | |
153 | | bool |
154 | | Tokenizer::isDelimiter(char ch) |
155 | 1.73G | { |
156 | 1.73G | return is_delimiter(ch); |
157 | 1.73G | } |
158 | | |
159 | | void |
160 | | QPDFTokenizer::presentCharacter(char ch) |
161 | 0 | { |
162 | 0 | m->presentCharacter(ch); |
163 | 0 | } |
164 | | |
165 | | void |
166 | | Tokenizer::presentCharacter(char ch) |
167 | 411k | { |
168 | 411k | handleCharacter(ch); |
169 | | |
170 | 411k | if (in_token) { |
171 | 0 | raw_val += ch; |
172 | 0 | } |
173 | 411k | } |
174 | | |
175 | | void |
176 | | Tokenizer::handleCharacter(char ch) |
177 | 4.75G | { |
178 | | // In some cases, functions called below may call a second handler. This happens whenever you |
179 | | // have to use a character from the next token to detect the end of the current token. |
180 | | |
181 | 4.75G | switch (state) { |
182 | 0 | case st_top: |
183 | 0 | inTop(ch); |
184 | 0 | return; |
185 | | |
186 | 126M | case st_in_space: |
187 | 126M | inSpace(ch); |
188 | 126M | return; |
189 | | |
190 | 282M | case st_in_comment: |
191 | 282M | inComment(ch); |
192 | 282M | return; |
193 | | |
194 | 6.77M | case st_lt: |
195 | 6.77M | inLt(ch); |
196 | 6.77M | return; |
197 | | |
198 | 5.51M | case st_gt: |
199 | 5.51M | inGt(ch); |
200 | 5.51M | return; |
201 | | |
202 | 1.79G | case st_in_string: |
203 | 1.79G | inString(ch); |
204 | 1.79G | return; |
205 | | |
206 | 761M | case st_name: |
207 | 761M | inName(ch); |
208 | 761M | return; |
209 | | |
210 | 103M | case st_number: |
211 | 103M | inNumber(ch); |
212 | 103M | return; |
213 | | |
214 | 9.37M | case st_real: |
215 | 9.37M | inReal(ch); |
216 | 9.37M | return; |
217 | | |
218 | 6.72M | case st_string_after_cr: |
219 | 6.72M | inStringAfterCR(ch); |
220 | 6.72M | return; |
221 | | |
222 | 7.10M | case st_string_escape: |
223 | 7.10M | inStringEscape(ch); |
224 | 7.10M | return; |
225 | | |
226 | 1.03M | case st_char_code: |
227 | 1.03M | inCharCode(ch); |
228 | 1.03M | return; |
229 | | |
230 | 920M | case st_literal: |
231 | 920M | inLiteral(ch); |
232 | 920M | return; |
233 | | |
234 | 265M | case st_inline_image: |
235 | 265M | inInlineImage(ch); |
236 | 265M | return; |
237 | | |
238 | 8.89M | case st_in_hexstring: |
239 | 8.89M | inHexstring(ch); |
240 | 8.89M | return; |
241 | | |
242 | 8.60M | case st_in_hexstring_2nd: |
243 | 8.60M | inHexstring2nd(ch); |
244 | 8.60M | return; |
245 | | |
246 | 991k | case st_name_hex1: |
247 | 991k | inNameHex1(ch); |
248 | 991k | return; |
249 | | |
250 | 550k | case st_name_hex2: |
251 | 550k | inNameHex2(ch); |
252 | 550k | return; |
253 | | |
254 | 1.44M | case st_sign: |
255 | 1.44M | inSign(ch); |
256 | 1.44M | return; |
257 | | |
258 | 502k | case st_decimal: |
259 | 502k | inDecimal(ch); |
260 | 502k | return; |
261 | | |
262 | 439M | case (st_before_token): |
263 | 439M | inBeforeToken(ch); |
264 | 439M | return; |
265 | | |
266 | 0 | case (st_token_ready): |
267 | 0 | inTokenReady(ch); |
268 | 0 | return; |
269 | | |
270 | 0 | default: |
271 | 0 | throw std::logic_error("INTERNAL ERROR: invalid state while reading token"); |
272 | 4.75G | } |
273 | 4.75G | } |
274 | | |
275 | | void |
276 | | Tokenizer::inTokenReady(char ch) |
277 | 0 | { |
278 | 0 | throw std::logic_error( |
279 | 0 | "INTERNAL ERROR: QPDF tokenizer presented character while token is waiting"); |
280 | 0 | } |
281 | | |
282 | | void |
283 | | Tokenizer::inBeforeToken(char ch) |
284 | 439M | { |
285 | | // Note: we specifically do not use ctype here. It is locale-dependent. |
286 | 439M | if (isSpace(ch)) { |
287 | 218M | before_token = !include_ignorable; |
288 | 218M | in_token = include_ignorable; |
289 | 218M | if (include_ignorable) { |
290 | 54.5M | state = st_in_space; |
291 | 54.5M | } |
292 | 220M | } else if (ch == '%') { |
293 | 754k | before_token = !include_ignorable; |
294 | 754k | in_token = include_ignorable; |
295 | 754k | state = st_in_comment; |
296 | 219M | } else { |
297 | 219M | before_token = false; |
298 | 219M | in_token = true; |
299 | 219M | inTop(ch); |
300 | 219M | } |
301 | 439M | } |
302 | | |
303 | | void |
304 | | Tokenizer::inTop(char ch) |
305 | 219M | { |
306 | 219M | switch (ch) { |
307 | 1.38M | case '(': |
308 | 1.38M | string_depth = 1; |
309 | 1.38M | state = st_in_string; |
310 | 1.38M | return; |
311 | | |
312 | 6.80M | case '<': |
313 | 6.80M | state = st_lt; |
314 | 6.80M | return; |
315 | | |
316 | 5.52M | case '>': |
317 | 5.52M | state = st_gt; |
318 | 5.52M | return; |
319 | | |
320 | 3.16M | case (')'): |
321 | 3.16M | type = tt::tt_bad; |
322 | 3.16M | QTC::TC("qpdf", "QPDFTokenizer bad )"); |
323 | 3.16M | error_message = "unexpected )"; |
324 | 3.16M | state = st_token_ready; |
325 | 3.16M | return; |
326 | | |
327 | 3.16M | case '[': |
328 | 3.16M | type = tt::tt_array_open; |
329 | 3.16M | state = st_token_ready; |
330 | 3.16M | return; |
331 | | |
332 | 5.74M | case ']': |
333 | 5.74M | type = tt::tt_array_close; |
334 | 5.74M | state = st_token_ready; |
335 | 5.74M | return; |
336 | | |
337 | 843k | case '{': |
338 | 843k | type = tt::tt_brace_open; |
339 | 843k | state = st_token_ready; |
340 | 843k | return; |
341 | | |
342 | 1.39M | case '}': |
343 | 1.39M | type = tt::tt_brace_close; |
344 | 1.39M | state = st_token_ready; |
345 | 1.39M | return; |
346 | | |
347 | 59.4M | case '/': |
348 | 59.4M | state = st_name; |
349 | 59.4M | val += ch; |
350 | 59.4M | return; |
351 | | |
352 | 16.9M | case '0': |
353 | 25.0M | case '1': |
354 | 29.8M | case '2': |
355 | 32.3M | case '3': |
356 | 35.3M | case '4': |
357 | 39.1M | case '5': |
358 | 41.4M | case '6': |
359 | 46.0M | case '7': |
360 | 47.5M | case '8': |
361 | 49.5M | case '9': |
362 | 49.5M | state = st_number; |
363 | 49.5M | return; |
364 | | |
365 | 542k | case '+': |
366 | 1.48M | case '-': |
367 | 1.48M | state = st_sign; |
368 | 1.48M | return; |
369 | | |
370 | 497k | case '.': |
371 | 497k | state = st_decimal; |
372 | 497k | return; |
373 | | |
374 | 80.7M | default: |
375 | 80.7M | state = st_literal; |
376 | 80.7M | return; |
377 | 219M | } |
378 | 219M | } |
379 | | |
380 | | void |
381 | | Tokenizer::inSpace(char ch) |
382 | 126M | { |
383 | | // We only enter this state if include_ignorable is true. |
384 | 126M | if (!isSpace(ch)) { |
385 | 54.5M | type = tt::tt_space; |
386 | 54.5M | in_token = false; |
387 | 54.5M | char_to_unread = ch; |
388 | 54.5M | state = st_token_ready; |
389 | 54.5M | } |
390 | 126M | } |
391 | | |
392 | | void |
393 | | Tokenizer::inComment(char ch) |
394 | 282M | { |
395 | 282M | if ((ch == '\r') || (ch == '\n')) { |
396 | 737k | if (include_ignorable) { |
397 | 178k | type = tt::tt_comment; |
398 | 178k | in_token = false; |
399 | 178k | char_to_unread = ch; |
400 | 178k | state = st_token_ready; |
401 | 559k | } else { |
402 | 559k | state = st_before_token; |
403 | 559k | } |
404 | 737k | } |
405 | 282M | } |
406 | | |
407 | | void |
408 | | Tokenizer::inString(char ch) |
409 | 1.80G | { |
410 | 1.80G | switch (ch) { |
411 | 7.10M | case '\\': |
412 | 7.10M | state = st_string_escape; |
413 | 7.10M | return; |
414 | | |
415 | 7.12M | case '(': |
416 | 7.12M | val += ch; |
417 | 7.12M | ++string_depth; |
418 | 7.12M | return; |
419 | | |
420 | 3.48M | case ')': |
421 | 3.48M | if (--string_depth == 0) { |
422 | 1.16M | type = tt::tt_string; |
423 | 1.16M | state = st_token_ready; |
424 | 1.16M | return; |
425 | 1.16M | } |
426 | | |
427 | 2.31M | val += ch; |
428 | 2.31M | return; |
429 | | |
430 | 6.72M | case '\r': |
431 | | // CR by itself is converted to LF |
432 | 6.72M | val += '\n'; |
433 | 6.72M | state = st_string_after_cr; |
434 | 6.72M | return; |
435 | | |
436 | 32.9M | case '\n': |
437 | 32.9M | val += ch; |
438 | 32.9M | return; |
439 | | |
440 | 1.74G | default: |
441 | 1.74G | val += ch; |
442 | 1.74G | return; |
443 | 1.80G | } |
444 | 1.80G | } |
445 | | |
446 | | void |
447 | | Tokenizer::inName(char ch) |
448 | 762M | { |
449 | 762M | if (isDelimiter(ch)) { |
450 | | // A C-locale whitespace character or delimiter terminates token. It is important to unread |
451 | | // the whitespace character even though it is ignored since it may be the newline after a |
452 | | // stream keyword. Removing it here could make the stream-reading code break on some files, |
453 | | // though not on any files in the test suite as of this |
454 | | // writing. |
455 | | |
456 | 58.3M | type = bad ? tt::tt_bad : tt::tt_name; |
457 | 58.3M | in_token = false; |
458 | 58.3M | char_to_unread = ch; |
459 | 58.3M | state = st_token_ready; |
460 | 704M | } else if (ch == '#') { |
461 | 992k | char_code = 0; |
462 | 992k | state = st_name_hex1; |
463 | 703M | } else { |
464 | 703M | val += ch; |
465 | 703M | } |
466 | 762M | } |
467 | | |
468 | | void |
469 | | Tokenizer::inNameHex1(char ch) |
470 | 991k | { |
471 | 991k | hex_char = ch; |
472 | | |
473 | 991k | if (char hval = util::hex_decode_char(ch); hval < '\20') { |
474 | 550k | char_code = int(hval) << 4; |
475 | 550k | state = st_name_hex2; |
476 | 550k | } else { |
477 | 441k | QTC::TC("qpdf", "QPDFTokenizer bad name 1"); |
478 | 441k | error_message = "name with stray # will not work with PDF >= 1.2"; |
479 | | // Use null to encode a bad # -- this is reversed in QPDF_Name::normalizeName. |
480 | 441k | val += '\0'; |
481 | 441k | state = st_name; |
482 | 441k | inName(ch); |
483 | 441k | } |
484 | 991k | } |
485 | | |
486 | | void |
487 | | Tokenizer::inNameHex2(char ch) |
488 | 550k | { |
489 | 550k | if (char hval = util::hex_decode_char(ch); hval < '\20') { |
490 | 437k | char_code |= int(hval); |
491 | 437k | } else { |
492 | 112k | QTC::TC("qpdf", "QPDFTokenizer bad name 2"); |
493 | 112k | error_message = "name with stray # will not work with PDF >= 1.2"; |
494 | | // Use null to encode a bad # -- this is reversed in QPDF_Name::normalizeName. |
495 | 112k | val += '\0'; |
496 | 112k | val += hex_char; |
497 | 112k | state = st_name; |
498 | 112k | inName(ch); |
499 | 112k | return; |
500 | 112k | } |
501 | 437k | if (char_code == 0) { |
502 | 328k | QTC::TC("qpdf", "QPDFTokenizer null in name"); |
503 | 328k | error_message = "null character not allowed in name token"; |
504 | 328k | val += "#00"; |
505 | 328k | state = st_name; |
506 | 328k | bad = true; |
507 | 328k | } else { |
508 | 108k | val += char(char_code); |
509 | 108k | state = st_name; |
510 | 108k | } |
511 | 437k | } |
512 | | |
513 | | void |
514 | | Tokenizer::inSign(char ch) |
515 | 1.44M | { |
516 | 1.44M | if (util::is_digit(ch)) { |
517 | 639k | state = st_number; |
518 | 805k | } else if (ch == '.') { |
519 | 10.0k | state = st_decimal; |
520 | 795k | } else { |
521 | 795k | state = st_literal; |
522 | 795k | inLiteral(ch); |
523 | 795k | } |
524 | 1.44M | } |
525 | | |
526 | | void |
527 | | Tokenizer::inDecimal(char ch) |
528 | 502k | { |
529 | 502k | if (util::is_digit(ch)) { |
530 | 110k | state = st_real; |
531 | 391k | } else { |
532 | 391k | state = st_literal; |
533 | 391k | inLiteral(ch); |
534 | 391k | } |
535 | 502k | } |
536 | | |
537 | | void |
538 | | Tokenizer::inNumber(char ch) |
539 | 103M | { |
540 | 103M | if (util::is_digit(ch)) { |
541 | 56.4M | } else if (ch == '.') { |
542 | 3.03M | state = st_real; |
543 | 44.2M | } else if (isDelimiter(ch)) { |
544 | 40.3M | type = tt::tt_integer; |
545 | 40.3M | state = st_token_ready; |
546 | 40.3M | in_token = false; |
547 | 40.3M | char_to_unread = ch; |
548 | 40.3M | } else { |
549 | 3.90M | state = st_literal; |
550 | 3.90M | } |
551 | 103M | } |
552 | | |
553 | | void |
554 | | Tokenizer::inReal(char ch) |
555 | 9.37M | { |
556 | 9.37M | if (util::is_digit(ch)) { |
557 | 6.23M | } else if (isDelimiter(ch)) { |
558 | 2.69M | type = tt::tt_real; |
559 | 2.69M | state = st_token_ready; |
560 | 2.69M | in_token = false; |
561 | 2.69M | char_to_unread = ch; |
562 | 2.69M | } else { |
563 | 447k | state = st_literal; |
564 | 447k | } |
565 | 9.37M | } |
566 | | void |
567 | | Tokenizer::inStringEscape(char ch) |
568 | 7.10M | { |
569 | 7.10M | state = st_in_string; |
570 | 7.10M | switch (ch) { |
571 | 147k | case '0': |
572 | 261k | case '1': |
573 | 424k | case '2': |
574 | 547k | case '3': |
575 | 590k | case '4': |
576 | 609k | case '5': |
577 | 631k | case '6': |
578 | 656k | case '7': |
579 | 656k | state = st_char_code; |
580 | 656k | char_code = 0; |
581 | 656k | digit_count = 0; |
582 | 656k | inCharCode(ch); |
583 | 656k | return; |
584 | | |
585 | 153k | case 'n': |
586 | 153k | val += '\n'; |
587 | 153k | return; |
588 | | |
589 | 139k | case 'r': |
590 | 139k | val += '\r'; |
591 | 139k | return; |
592 | | |
593 | 193k | case 't': |
594 | 193k | val += '\t'; |
595 | 193k | return; |
596 | | |
597 | 146k | case 'b': |
598 | 146k | val += '\b'; |
599 | 146k | return; |
600 | | |
601 | 273k | case 'f': |
602 | 273k | val += '\f'; |
603 | 273k | return; |
604 | | |
605 | 9.67k | case '\n': |
606 | 9.67k | return; |
607 | | |
608 | 5.95k | case '\r': |
609 | 5.95k | state = st_string_after_cr; |
610 | 5.95k | return; |
611 | | |
612 | 5.52M | default: |
613 | | // PDF spec says backslash is ignored before anything else |
614 | 5.52M | val += ch; |
615 | 5.52M | return; |
616 | 7.10M | } |
617 | 7.10M | } |
618 | | |
619 | | void |
620 | | Tokenizer::inStringAfterCR(char ch) |
621 | 6.72M | { |
622 | 6.72M | state = st_in_string; |
623 | 6.72M | if (ch != '\n') { |
624 | 6.28M | inString(ch); |
625 | 6.28M | } |
626 | 6.72M | } |
627 | | |
628 | | void |
629 | | Tokenizer::inLt(char ch) |
630 | 6.77M | { |
631 | 6.77M | if (ch == '<') { |
632 | 4.88M | type = tt::tt_dict_open; |
633 | 4.88M | state = st_token_ready; |
634 | 4.88M | return; |
635 | 4.88M | } |
636 | | |
637 | 1.89M | state = st_in_hexstring; |
638 | 1.89M | inHexstring(ch); |
639 | 1.89M | } |
640 | | |
641 | | void |
642 | | Tokenizer::inGt(char ch) |
643 | 5.51M | { |
644 | 5.51M | if (ch == '>') { |
645 | 3.65M | type = tt::tt_dict_close; |
646 | 3.65M | state = st_token_ready; |
647 | 3.65M | } else { |
648 | 1.85M | type = tt::tt_bad; |
649 | 1.85M | QTC::TC("qpdf", "QPDFTokenizer bad >"); |
650 | 1.85M | error_message = "unexpected >"; |
651 | 1.85M | in_token = false; |
652 | 1.85M | char_to_unread = ch; |
653 | 1.85M | state = st_token_ready; |
654 | 1.85M | } |
655 | 5.51M | } |
656 | | |
657 | | void |
658 | | Tokenizer::inLiteral(char ch) |
659 | 921M | { |
660 | 921M | if (isDelimiter(ch)) { |
661 | | // A C-locale whitespace character or delimiter terminates token. It is important to unread |
662 | | // the whitespace character even though it is ignored since it may be the newline after a |
663 | | // stream keyword. Removing it here could make the stream-reading code break on some files, |
664 | | // though not on any files in the test suite as of this writing. |
665 | | |
666 | 81.5M | in_token = false; |
667 | 81.5M | char_to_unread = ch; |
668 | 81.5M | state = st_token_ready; |
669 | 81.5M | type = (raw_val == "true") || (raw_val == "false") |
670 | 81.5M | ? tt::tt_bool |
671 | 81.5M | : (raw_val == "null" ? tt::tt_null : tt::tt_word); |
672 | 81.5M | } |
673 | 921M | } |
674 | | |
675 | | void |
676 | | Tokenizer::inHexstring(char ch) |
677 | 10.7M | { |
678 | 10.7M | if (char hval = util::hex_decode_char(ch); hval < '\20') { |
679 | 8.37M | char_code = int(hval) << 4; |
680 | 8.37M | state = st_in_hexstring_2nd; |
681 | | |
682 | 8.37M | } else if (ch == '>') { |
683 | 714k | type = tt::tt_string; |
684 | 714k | state = st_token_ready; |
685 | | |
686 | 1.69M | } else if (isSpace(ch)) { |
687 | | // ignore |
688 | | |
689 | 915k | } else { |
690 | 915k | type = tt::tt_bad; |
691 | 915k | QTC::TC("qpdf", "QPDFTokenizer bad hexstring character"); |
692 | 915k | error_message = std::string("invalid character (") + ch + ") in hexstring"; |
693 | 915k | state = st_token_ready; |
694 | 915k | } |
695 | 10.7M | } |
696 | | |
697 | | void |
698 | | Tokenizer::inHexstring2nd(char ch) |
699 | 8.60M | { |
700 | 8.60M | if (char hval = util::hex_decode_char(ch); hval < '\20') { |
701 | 8.11M | val += char(char_code) | hval; |
702 | 8.11M | state = st_in_hexstring; |
703 | | |
704 | 8.11M | } else if (ch == '>') { |
705 | | // PDF spec says odd hexstrings have implicit trailing 0. |
706 | 67.6k | val += char(char_code); |
707 | 67.6k | type = tt::tt_string; |
708 | 67.6k | state = st_token_ready; |
709 | | |
710 | 420k | } else if (isSpace(ch)) { |
711 | | // ignore |
712 | | |
713 | 263k | } else { |
714 | 157k | type = tt::tt_bad; |
715 | 157k | QTC::TC("qpdf", "QPDFTokenizer bad hexstring 2nd character"); |
716 | 157k | error_message = std::string("invalid character (") + ch + ") in hexstring"; |
717 | 157k | state = st_token_ready; |
718 | 157k | } |
719 | 8.60M | } |
720 | | |
721 | | void |
722 | | Tokenizer::inCharCode(char ch) |
723 | 1.68M | { |
724 | 1.68M | bool handled = false; |
725 | 1.68M | if (('0' <= ch) && (ch <= '7')) { |
726 | 1.37M | char_code = 8 * char_code + (int(ch) - int('0')); |
727 | 1.37M | if (++(digit_count) < 3) { |
728 | 1.05M | return; |
729 | 1.05M | } |
730 | 319k | handled = true; |
731 | 319k | } |
732 | | // We've accumulated \ddd or we have \d or \dd followed by other than an octal digit. The PDF |
733 | | // Spec says to ignore high-order overflow. |
734 | 633k | val += char(char_code % 256); |
735 | 633k | state = st_in_string; |
736 | 633k | if (!handled) { |
737 | 314k | inString(ch); |
738 | 314k | } |
739 | 633k | } |
740 | | |
741 | | void |
742 | | Tokenizer::inInlineImage(char ch) |
743 | 265M | { |
744 | 265M | if ((raw_val.length() + 1) == inline_image_bytes) { |
745 | 6.94k | QTC::TC("qpdf", "QPDFTokenizer found EI by byte count"); |
746 | 6.94k | type = tt::tt_inline_image; |
747 | 6.94k | inline_image_bytes = 0; |
748 | 6.94k | state = st_token_ready; |
749 | 6.94k | } |
750 | 265M | } |
751 | | |
752 | | void |
753 | | QPDFTokenizer::presentEOF() |
754 | 0 | { |
755 | 0 | m->presentEOF(); |
756 | 0 | } |
757 | | |
758 | | void |
759 | | Tokenizer::presentEOF() |
760 | 879k | { |
761 | 879k | switch (state) { |
762 | 176k | case st_name: |
763 | 177k | case st_name_hex1: |
764 | 178k | case st_name_hex2: |
765 | 239k | case st_number: |
766 | 243k | case st_real: |
767 | 244k | case st_sign: |
768 | 246k | case st_decimal: |
769 | 411k | case st_literal: |
770 | 411k | QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token"); |
771 | | // Push any delimiter to the state machine to finish off the final token. |
772 | 411k | presentCharacter('\f'); |
773 | 411k | in_token = true; |
774 | 411k | break; |
775 | | |
776 | 0 | case st_top: |
777 | 395k | case st_before_token: |
778 | 395k | type = tt::tt_eof; |
779 | 395k | break; |
780 | | |
781 | 7.33k | case st_in_space: |
782 | 7.33k | type = include_ignorable ? tt::tt_space : tt::tt_eof; |
783 | 7.33k | break; |
784 | | |
785 | 17.0k | case st_in_comment: |
786 | 17.0k | type = include_ignorable ? tt::tt_comment : tt::tt_bad; |
787 | 17.0k | break; |
788 | | |
789 | 0 | case st_token_ready: |
790 | 0 | break; |
791 | | |
792 | 48.1k | default: |
793 | 48.1k | QTC::TC("qpdf", "QPDFTokenizer EOF reading token"); |
794 | 48.1k | type = tt::tt_bad; |
795 | 48.1k | error_message = "EOF while reading token"; |
796 | 879k | } |
797 | 879k | state = st_token_ready; |
798 | 879k | } |
799 | | |
800 | | void |
801 | | QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input) |
802 | 0 | { |
803 | 0 | m->expectInlineImage(*input); |
804 | 0 | } |
805 | | |
806 | | void |
807 | | QPDFTokenizer::expectInlineImage(InputSource& input) |
808 | 3.48k | { |
809 | 3.48k | m->expectInlineImage(input); |
810 | 3.48k | } |
811 | | |
812 | | void |
813 | | Tokenizer::expectInlineImage(InputSource& input) |
814 | 7.67k | { |
815 | 7.67k | if (state == st_token_ready) { |
816 | 4.18k | reset(); |
817 | 4.18k | } else if (state != st_before_token) { |
818 | 0 | throw std::logic_error( |
819 | 0 | "QPDFTokenizer::expectInlineImage called when tokenizer is in improper state"); |
820 | 0 | } |
821 | 7.67k | findEI(input); |
822 | 7.67k | before_token = false; |
823 | 7.67k | in_token = true; |
824 | 7.67k | state = st_inline_image; |
825 | 7.67k | } |
826 | | |
827 | | void |
828 | | Tokenizer::findEI(InputSource& input) |
829 | 7.67k | { |
830 | 7.67k | qpdf_offset_t last_offset = input.getLastOffset(); |
831 | 7.67k | qpdf_offset_t pos = input.tell(); |
832 | | |
833 | | // Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several |
834 | | // tokens or up to EOF. If we find any suspicious-looking or bad tokens, this is probably still part |
835 | | // of the image data, so keep looking for EI. Stop at the first EI that passes. If we get to the |
836 | | // end without finding one, return the last EI we found. Store the number of bytes expected in |
837 | | // the inline image including the EI and use that to break out of inline image, falling back to |
838 | | // the old method if needed. |
839 | | |
840 | 7.67k | bool okay = false; |
841 | 7.67k | bool first_try = true; |
842 | 96.6k | while (!okay) { |
843 | 92.2k | QPDFWordTokenFinder f(input, "EI"); |
844 | 92.2k | if (!input.findFirst("EI", input.tell(), 0, f)) { |
845 | 3.27k | break; |
846 | 3.27k | } |
847 | 88.9k | inline_image_bytes = QIntC::to_size(input.tell() - pos - 2); |
848 | | |
849 | 88.9k | Tokenizer check; |
850 | 88.9k | bool found_bad = false; |
851 | | // Look at the next 10 tokens or up to EOF. The next inline image's image data would look |
852 | | // like bad tokens, but there will always be at least 10 tokens between one inline image's |
853 | | // EI and the next valid one's ID since width, height, bits per pixel, and color space are |
854 | | // all required as well as a BI and ID. If we get 10 good tokens in a row or hit EOF, we can |
855 | | // be pretty sure we've found the actual EI. |
856 | 222k | for (int i = 0; i < 10; ++i) { |
857 | 218k | check.nextToken(input, "checker"); |
858 | 218k | auto typ = check.getType(); |
859 | 218k | if (typ == tt::tt_eof) { |
860 | 0 | okay = true; |
861 | 218k | } else if (typ == tt::tt_bad) { |
862 | 25.1k | found_bad = true; |
863 | 192k | } else if (typ == tt::tt_word) { |
864 | | // The qpdf tokenizer lumps alphabetic and otherwise uncategorized characters into |
865 | | // "words". We recognize strings of alphabetic characters as potential valid |
866 | | // operators for purposes of telling whether we're in valid content or not. It's not |
867 | | // perfect, but it should work more reliably than what we used to do, which was |
868 | | // already good enough for the vast majority of files. |
869 | 126k | bool found_alpha = false; |
870 | 126k | bool found_non_printable = false; |
871 | 126k | bool found_other = false; |
872 | 405k | for (char ch: check.getValue()) { |
873 | 405k | if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch == '*')) { |
874 | | // Treat '*' as alpha since there are valid PDF operators that contain * |
875 | | // along with alphabetic characters. |
876 | 205k | found_alpha = true; |
877 | 205k | } else if (static_cast<signed char>(ch) < 32 && !isSpace(ch)) { |
878 | | // Compare ch as a signed char so characters outside of 7-bit will be < 0. |
879 | 29.6k | found_non_printable = true; |
880 | 29.6k | break; |
881 | 170k | } else { |
882 | 170k | found_other = true; |
883 | 170k | } |
884 | 405k | } |
885 | 126k | if (found_non_printable || (found_alpha && found_other)) { |
886 | 59.3k | found_bad = true; |
887 | 59.3k | } |
888 | 126k | } |
889 | 218k | if (okay || found_bad) { |
890 | 84.5k | break; |
891 | 84.5k | } |
892 | 218k | } |
893 | 88.9k | if (!found_bad) { |
894 | 4.39k | okay = true; |
895 | 4.39k | } |
896 | 88.9k | if (!okay) { |
897 | 84.5k | first_try = false; |
898 | 84.5k | } |
899 | 88.9k | } |
900 | 7.67k | if (okay && (!first_try)) { |
901 | 787 | QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try"); |
902 | 787 | } |
903 | | |
904 | 7.67k | input.seek(pos, SEEK_SET); |
905 | 7.67k | input.setLastOffset(last_offset); |
906 | 7.67k | } |
907 | | |
908 | | bool |
909 | | QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch) |
910 | 0 | { |
911 | 0 | return m->getToken(token, unread_char, ch); |
912 | 0 | } |
913 | | |
914 | | bool |
915 | | Tokenizer::getToken(Token& token, bool& unread_char, char& ch) |
916 | 169M | { |
917 | 169M | bool ready = (state == st_token_ready); |
918 | 169M | unread_char = !in_token && !before_token; |
919 | 169M | ch = char_to_unread; |
920 | 169M | if (ready) { |
921 | 169M | token = (!(type == tt::tt_name || type == tt::tt_string)) |
922 | 169M | ? Token(type, raw_val, raw_val, error_message) |
923 | 169M | : Token(type, val, raw_val, error_message); |
924 | | |
925 | 169M | reset(); |
926 | 169M | } |
927 | 169M | return ready; |
928 | 169M | } |
929 | | |
930 | | bool |
931 | | QPDFTokenizer::betweenTokens() |
932 | 0 | { |
933 | 0 | return m->betweenTokens(); |
934 | 0 | } |
935 | | |
936 | | bool |
937 | | Tokenizer::betweenTokens() |
938 | 0 | { |
939 | 0 | return before_token; |
940 | 0 | } |
941 | | |
942 | | QPDFTokenizer::Token |
943 | | QPDFTokenizer::readToken( |
944 | | InputSource& input, std::string const& context, bool allow_bad, size_t max_len) |
945 | 125M | { |
946 | 125M | return m->readToken(input, context, allow_bad, max_len); |
947 | 125M | } |
948 | | |
949 | | QPDFTokenizer::Token |
950 | | QPDFTokenizer::readToken( |
951 | | std::shared_ptr<InputSource> input, std::string const& context, bool allow_bad, size_t max_len) |
952 | 0 | { |
953 | 0 | return m->readToken(*input, context, allow_bad, max_len); |
954 | 0 | } |
955 | | |
956 | | QPDFTokenizer::Token |
957 | | Tokenizer::readToken(InputSource& input, std::string const& context, bool allow_bad, size_t max_len) |
958 | 169M | { |
959 | 169M | nextToken(input, context, max_len); |
960 | | |
961 | 169M | Token token; |
962 | 169M | bool unread_char; |
963 | 169M | char char_to_unread; |
964 | 169M | getToken(token, unread_char, char_to_unread); |
965 | | |
966 | 169M | if (token.getType() == tt::tt_bad) { |
967 | 9.02M | if (allow_bad) { |
968 | 9.02M | QTC::TC("qpdf", "QPDFTokenizer allowing bad token"); |
969 | 9.02M | } else { |
970 | 0 | throw QPDFExc( |
971 | 0 | qpdf_e_damaged_pdf, |
972 | 0 | input.getName(), |
973 | 0 | context.empty() ? "offset " + std::to_string(input.getLastOffset()) : context, |
974 | 0 | input.getLastOffset(), |
975 | 0 | token.getErrorMessage()); |
976 | 0 | } |
977 | 9.02M | } |
978 | 169M | return token; |
979 | 169M | } |
980 | | |
981 | | bool |
982 | | Tokenizer::nextToken(InputSource& input, std::string const& context, size_t max_len) |
983 | 274M | { |
984 | 274M | if (state != st_inline_image) { |
985 | 274M | reset(); |
986 | 274M | } |
987 | 274M | qpdf_offset_t offset = input.fastTell(); |
988 | | |
989 | 5.03G | while (state != st_token_ready) { |
990 | 4.75G | char ch; |
991 | 4.75G | if (!input.fastRead(ch)) { |
992 | 879k | presentEOF(); |
993 | | |
994 | 879k | if ((type == tt::tt_eof) && (!allow_eof)) { |
995 | | // Nothing in the qpdf library calls readToken without allowEOF anymore, but nextToken callers that |
996 | | // never call allowEOF (presumably the local finder/checker tokenizers) still reach this case. |
997 | 541 | type = tt::tt_bad; |
998 | 541 | error_message = "unexpected EOF"; |
999 | 541 | offset = input.getLastOffset(); |
1000 | 541 | } |
1001 | 4.75G | } else { |
1002 | 4.75G | handleCharacter(ch); |
1003 | 4.75G | if (before_token) { |
1004 | 199M | ++offset; |
1005 | 199M | } |
1006 | 4.75G | if (in_token) { |
1007 | 4.31G | raw_val += ch; |
1008 | 4.31G | } |
1009 | 4.75G | if (max_len && (raw_val.length() >= max_len) && (state != st_token_ready)) { |
1010 | | // terminate this token now |
1011 | 9.20M | QTC::TC("qpdf", "QPDFTokenizer block long token"); |
1012 | 9.20M | type = tt::tt_bad; |
1013 | 9.20M | state = st_token_ready; |
1014 | 9.20M | error_message = "exceeded allowable length while reading token"; |
1015 | 9.20M | } |
1016 | 4.75G | } |
1017 | 4.75G | } |
1018 | | |
1019 | 274M | input.fastUnread(!in_token && !before_token); |
1020 | | |
1021 | 274M | if (type != tt::tt_eof) { |
1022 | 274M | input.setLastOffset(offset); |
1023 | 274M | } |
1024 | | |
1025 | 274M | return error_message.empty(); |
1026 | 274M | } |