/src/qpdf/libqpdf/QPDFTokenizer.cc
Line | Count | Source
1 | | #include <qpdf/QPDFTokenizer_private.hh> |
2 | | |
3 | | // DO NOT USE ctype -- it is locale dependent for some things, and it's not worth the risk of |
4 | | // including it in case it accidentally gets used.
5 | | |
6 | | #include <qpdf/InputSource_private.hh> |
7 | | #include <qpdf/QIntC.hh> |
8 | | #include <qpdf/QPDFExc.hh> |
9 | | #include <qpdf/QPDFObjectHandle.hh> |
10 | | #include <qpdf/QTC.hh> |
11 | | #include <qpdf/QUtil.hh> |
12 | | #include <qpdf/Util.hh> |
13 | | |
14 | | #include <cstdlib> |
15 | | #include <cstring> |
16 | | #include <stdexcept> |
17 | | |
18 | | using namespace qpdf; |
19 | | |
20 | | using Token = QPDFTokenizer::Token; |
21 | | using tt = QPDFTokenizer::token_type_e; |
22 | | |
23 | | static inline bool |
24 | | is_delimiter(char ch) |
25 | 0 | { |
26 | 0 | return ( |
27 | 0 | ch == ' ' || ch == '\n' || ch == '/' || ch == '(' || ch == ')' || ch == '{' || ch == '}' || |
28 | 0 | ch == '<' || ch == '>' || ch == '[' || ch == ']' || ch == '%' || ch == '\t' || ch == '\r' || |
29 | 0 | ch == '\v' || ch == '\f' || ch == 0); |
30 | 0 | } |
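
The set above is exactly the PDF whitespace and delimiter characters (plus NUL); <ctype> is deliberately avoided because its classification functions are locale dependent. Purely as an illustration of the same rule -- not a drop-in replacement -- the check can also be written as a table lookup (hypothetical helper; this file already includes <cstring>):

static inline bool
looks_like_delimiter(char ch)
{
    // NUL is special-cased because strchr() would otherwise match the terminator.
    static char const* pdf_delimiters = " \t\r\n\v\f/(){}<>[]%";
    return ch == '\0' || std::strchr(pdf_delimiters, ch) != nullptr;
}
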
31 | | |
32 | | namespace |
33 | | { |
34 | | class QPDFWordTokenFinder: public InputSource::Finder |
35 | | { |
36 | | public: |
37 | | QPDFWordTokenFinder(InputSource& is, std::string const& str) : |
38 | 0 | is(is), |
39 | 0 | str(str) |
40 | 0 | { |
41 | 0 | } |
42 | 0 | ~QPDFWordTokenFinder() override = default; |
43 | | bool check() override; |
44 | | |
45 | | private: |
46 | | InputSource& is; |
47 | | std::string str; |
48 | | }; |
49 | | } // namespace |
50 | | |
51 | | bool |
52 | | QPDFWordTokenFinder::check() |
53 | 0 | { |
54 | | // Find a word token matching the given string, preceded by a delimiter, and followed by a |
55 | | // delimiter or EOF. |
56 | 0 | Tokenizer tokenizer; |
57 | 0 | tokenizer.nextToken(is, "finder", str.size() + 2); |
58 | 0 | qpdf_offset_t pos = is.tell(); |
59 | 0 | if (tokenizer.getType() != tt::tt_word || tokenizer.getValue() != str) { |
60 | 0 | QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); |
61 | 0 | return false; |
62 | 0 | } |
63 | 0 | qpdf_offset_t token_start = is.getLastOffset(); |
64 | 0 | char next; |
65 | 0 | bool next_okay = false; |
66 | 0 | if (is.read(&next, 1) == 0) { |
67 | 0 | QTC::TC("qpdf", "QPDFTokenizer inline image at EOF"); |
68 | 0 | next_okay = true; |
69 | 0 | } else { |
70 | 0 | next_okay = is_delimiter(next); |
71 | 0 | } |
72 | 0 | is.seek(pos, SEEK_SET); |
73 | 0 | if (!next_okay) { |
74 | 0 | return false; |
75 | 0 | } |
76 | 0 | if (token_start == 0) { |
77 | | // Can't actually happen...we never start the search at the beginning of the input. |
78 | 0 | return false; |
79 | 0 | } |
80 | 0 | return true; |
81 | 0 | } |
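
check() is the callback side of InputSource::findFirst(): findFirst() scans for the literal bytes and calls check() at each candidate so that only a standalone word token, bounded by delimiters, is accepted. A minimal sketch of the driving side, mirroring the call made in findEI() later in this file:

static bool
locate_word(InputSource& is, std::string const& word)
{
    // Sketch only. On success the input is left just after the confirmed word,
    // at the delimiter that terminated it.
    QPDFWordTokenFinder finder(is, word);
    return is.findFirst(word.c_str(), is.tell(), 0, finder);
}
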
82 | | |
83 | | void |
84 | | Tokenizer::reset() |
85 | 0 | { |
86 | 0 | state = st_before_token; |
87 | 0 | type = tt::tt_bad; |
88 | 0 | val.clear(); |
89 | 0 | raw_val.clear(); |
90 | 0 | error_message = ""; |
91 | 0 | before_token = true; |
92 | 0 | in_token = false; |
93 | 0 | char_to_unread = '\0'; |
94 | 0 | inline_image_bytes = 0; |
95 | 0 | string_depth = 0; |
96 | 0 | bad = false; |
97 | 0 | } |
98 | | |
99 | | QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) : |
100 | 0 | type(type), |
101 | 0 | value(value), |
102 | 0 | raw_value(value) |
103 | 0 | { |
104 | 0 | if (type == tt_string) { |
105 | 0 | raw_value = QPDFObjectHandle::newString(value).unparse(); |
106 | 0 | } else if (type == tt_name) { |
107 | 0 | raw_value = QPDFObjectHandle::newName(value).unparse(); |
108 | 0 | } |
109 | 0 | } |
110 | | |
111 | | QPDFTokenizer::QPDFTokenizer() : |
112 | 0 | m(std::make_unique<qpdf::Tokenizer>()) |
113 | 0 | { |
114 | 0 | } |
115 | | |
116 | 0 | QPDFTokenizer::~QPDFTokenizer() = default; |
117 | | |
118 | | Tokenizer::Tokenizer() |
119 | 0 | { |
120 | 0 | reset(); |
121 | 0 | } |
122 | | |
123 | | void |
124 | | QPDFTokenizer::allowEOF() |
125 | 0 | { |
126 | 0 | m->allowEOF(); |
127 | 0 | } |
128 | | |
129 | | void |
130 | | Tokenizer::allowEOF() |
131 | 0 | { |
132 | 0 | allow_eof = true; |
133 | 0 | } |
134 | | |
135 | | void |
136 | | QPDFTokenizer::includeIgnorable() |
137 | 0 | { |
138 | 0 | m->includeIgnorable(); |
139 | 0 | } |
140 | | |
141 | | void |
142 | | Tokenizer::includeIgnorable() |
143 | 0 | { |
144 | 0 | include_ignorable = true; |
145 | 0 | } |
146 | | |
147 | | bool |
148 | | Tokenizer::isSpace(char ch) |
149 | 0 | { |
150 | 0 | return (ch == '\0' || util::is_space(ch)); |
151 | 0 | } |
152 | | |
153 | | bool |
154 | | Tokenizer::isDelimiter(char ch) |
155 | 0 | { |
156 | 0 | return is_delimiter(ch); |
157 | 0 | } |
158 | | |
159 | | void |
160 | | QPDFTokenizer::presentCharacter(char ch) |
161 | 0 | { |
162 | 0 | m->presentCharacter(ch); |
163 | 0 | } |
164 | | |
165 | | void |
166 | | Tokenizer::presentCharacter(char ch) |
167 | 0 | { |
168 | 0 | handleCharacter(ch); |
169 | |
170 | 0 | if (in_token) { |
171 | 0 | raw_val += ch; |
172 | 0 | } |
173 | 0 | } |
174 | | |
175 | | void |
176 | | Tokenizer::handleCharacter(char ch) |
177 | 0 | { |
178 | | // In some cases, functions called below may call a second handler. This happens whenever you |
179 | | // have to use a character from the next token to detect the end of the current token. |
180 | |
181 | 0 | switch (state) { |
182 | 0 | case st_top: |
183 | 0 | inTop(ch); |
184 | 0 | return; |
185 | | |
186 | 0 | case st_in_space: |
187 | 0 | inSpace(ch); |
188 | 0 | return; |
189 | | |
190 | 0 | case st_in_comment: |
191 | 0 | inComment(ch); |
192 | 0 | return; |
193 | | |
194 | 0 | case st_lt: |
195 | 0 | inLt(ch); |
196 | 0 | return; |
197 | | |
198 | 0 | case st_gt: |
199 | 0 | inGt(ch); |
200 | 0 | return; |
201 | | |
202 | 0 | case st_in_string: |
203 | 0 | inString(ch); |
204 | 0 | return; |
205 | | |
206 | 0 | case st_name: |
207 | 0 | inName(ch); |
208 | 0 | return; |
209 | | |
210 | 0 | case st_number: |
211 | 0 | inNumber(ch); |
212 | 0 | return; |
213 | | |
214 | 0 | case st_real: |
215 | 0 | inReal(ch); |
216 | 0 | return; |
217 | | |
218 | 0 | case st_string_after_cr: |
219 | 0 | inStringAfterCR(ch); |
220 | 0 | return; |
221 | | |
222 | 0 | case st_string_escape: |
223 | 0 | inStringEscape(ch); |
224 | 0 | return; |
225 | | |
226 | 0 | case st_char_code: |
227 | 0 | inCharCode(ch); |
228 | 0 | return; |
229 | | |
230 | 0 | case st_literal: |
231 | 0 | inLiteral(ch); |
232 | 0 | return; |
233 | | |
234 | 0 | case st_inline_image: |
235 | 0 | inInlineImage(ch); |
236 | 0 | return; |
237 | | |
238 | 0 | case st_in_hexstring: |
239 | 0 | inHexstring(ch); |
240 | 0 | return; |
241 | | |
242 | 0 | case st_in_hexstring_2nd: |
243 | 0 | inHexstring2nd(ch); |
244 | 0 | return; |
245 | | |
246 | 0 | case st_name_hex1: |
247 | 0 | inNameHex1(ch); |
248 | 0 | return; |
249 | | |
250 | 0 | case st_name_hex2: |
251 | 0 | inNameHex2(ch); |
252 | 0 | return; |
253 | | |
254 | 0 | case st_sign: |
255 | 0 | inSign(ch); |
256 | 0 | return; |
257 | | |
258 | 0 | case st_decimal: |
259 | 0 | inDecimal(ch); |
260 | 0 | return; |
261 | | |
262 | 0 | case (st_before_token): |
263 | 0 | inBeforeToken(ch); |
264 | 0 | return; |
265 | | |
266 | 0 | case (st_token_ready): |
267 | 0 | inTokenReady(ch); |
268 | 0 | return; |
269 | | |
270 | 0 | default: |
271 | 0 | throw std::logic_error("INTERNAL ERROR: invalid state while reading token"); |
272 | 0 | } |
273 | 0 | } |
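
handleCharacter() is a pure dispatcher: each st_* handler consumes one character and may hand a terminating character straight to the next state's handler. Driven through the public push-style interface declared in QPDFTokenizer.hh, the whole machine behaves roughly like the sketch below; the drain() helper is ours, not part of qpdf.

#include <qpdf/QPDFTokenizer.hh>

#include <iostream>
#include <string>

// Emit any token that is ready; returns false once tt_eof has been seen.
static bool
drain(QPDFTokenizer& tokenizer)
{
    QPDFTokenizer::Token token;
    bool unread = false;
    char ch = '\0';
    while (tokenizer.getToken(token, unread, ch)) {
        if (token.getType() == QPDFTokenizer::tt_eof) {
            return false;
        }
        std::cout << token.getValue() << "\n";
        if (unread) {
            // The delimiter that ended this token also starts the next one.
            tokenizer.presentCharacter(ch);
        }
    }
    return true;
}

int
main()
{
    std::string input = "<< /Length 42 >>";
    QPDFTokenizer tokenizer;
    for (char c: input) {
        tokenizer.presentCharacter(c);
        drain(tokenizer);
    }
    tokenizer.presentEOF();
    drain(tokenizer);
    // Prints: <<, /Length, 42, >> (one per line)
    return 0;
}
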
274 | | |
275 | | void |
276 | | Tokenizer::inTokenReady(char ch) |
277 | 0 | { |
278 | 0 | throw std::logic_error( |
279 | 0 | "INTERNAL ERROR: QPDF tokenizer presented character while token is waiting"); |
280 | 0 | } |
281 | | |
282 | | void |
283 | | Tokenizer::inBeforeToken(char ch) |
284 | 0 | { |
285 | | // Note: we specifically do not use ctype here. It is locale-dependent. |
286 | 0 | if (isSpace(ch)) { |
287 | 0 | before_token = !include_ignorable; |
288 | 0 | in_token = include_ignorable; |
289 | 0 | if (include_ignorable) { |
290 | 0 | state = st_in_space; |
291 | 0 | } |
292 | 0 | } else if (ch == '%') { |
293 | 0 | before_token = !include_ignorable; |
294 | 0 | in_token = include_ignorable; |
295 | 0 | state = st_in_comment; |
296 | 0 | } else { |
297 | 0 | before_token = false; |
298 | 0 | in_token = true; |
299 | 0 | inTop(ch); |
300 | 0 | } |
301 | 0 | } |
302 | | |
303 | | void |
304 | | Tokenizer::inTop(char ch) |
305 | 0 | { |
306 | 0 | switch (ch) { |
307 | 0 | case '(': |
308 | 0 | string_depth = 1; |
309 | 0 | state = st_in_string; |
310 | 0 | return; |
311 | | |
312 | 0 | case '<': |
313 | 0 | state = st_lt; |
314 | 0 | return; |
315 | | |
316 | 0 | case '>': |
317 | 0 | state = st_gt; |
318 | 0 | return; |
319 | | |
320 | 0 | case (')'): |
321 | 0 | type = tt::tt_bad; |
322 | 0 | QTC::TC("qpdf", "QPDFTokenizer bad )"); |
323 | 0 | error_message = "unexpected )"; |
324 | 0 | state = st_token_ready; |
325 | 0 | return; |
326 | | |
327 | 0 | case '[': |
328 | 0 | type = tt::tt_array_open; |
329 | 0 | state = st_token_ready; |
330 | 0 | return; |
331 | | |
332 | 0 | case ']': |
333 | 0 | type = tt::tt_array_close; |
334 | 0 | state = st_token_ready; |
335 | 0 | return; |
336 | | |
337 | 0 | case '{': |
338 | 0 | type = tt::tt_brace_open; |
339 | 0 | state = st_token_ready; |
340 | 0 | return; |
341 | | |
342 | 0 | case '}': |
343 | 0 | type = tt::tt_brace_close; |
344 | 0 | state = st_token_ready; |
345 | 0 | return; |
346 | | |
347 | 0 | case '/': |
348 | 0 | state = st_name; |
349 | 0 | val += ch; |
350 | 0 | return; |
351 | | |
352 | 0 | case '0': |
353 | 0 | case '1': |
354 | 0 | case '2': |
355 | 0 | case '3': |
356 | 0 | case '4': |
357 | 0 | case '5': |
358 | 0 | case '6': |
359 | 0 | case '7': |
360 | 0 | case '8': |
361 | 0 | case '9': |
362 | 0 | state = st_number; |
363 | 0 | return; |
364 | | |
365 | 0 | case '+': |
366 | 0 | case '-': |
367 | 0 | state = st_sign; |
368 | 0 | return; |
369 | | |
370 | 0 | case '.': |
371 | 0 | state = st_decimal; |
372 | 0 | return; |
373 | | |
374 | 0 | default: |
375 | 0 | state = st_literal; |
376 | 0 | return; |
377 | 0 | } |
378 | 0 | } |
379 | | |
380 | | void |
381 | | Tokenizer::inSpace(char ch) |
382 | 0 | { |
383 | | // We only enter this state if include_ignorable is true. |
384 | 0 | if (!isSpace(ch)) { |
385 | 0 | type = tt::tt_space; |
386 | 0 | in_token = false; |
387 | 0 | char_to_unread = ch; |
388 | 0 | state = st_token_ready; |
389 | 0 | } |
390 | 0 | } |
391 | | |
392 | | void |
393 | | Tokenizer::inComment(char ch) |
394 | 0 | { |
395 | 0 | if ((ch == '\r') || (ch == '\n')) { |
396 | 0 | if (include_ignorable) { |
397 | 0 | type = tt::tt_comment; |
398 | 0 | in_token = false; |
399 | 0 | char_to_unread = ch; |
400 | 0 | state = st_token_ready; |
401 | 0 | } else { |
402 | 0 | state = st_before_token; |
403 | 0 | } |
404 | 0 | } |
405 | 0 | } |
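
Whitespace and comments normally disappear between tokens; after includeIgnorable() they are reported as tt_space and tt_comment tokens instead, which is what the two branches above implement. A small sketch using the pull-style interface (this assumes BufferInputSource's (description, contents) constructor):

#include <qpdf/BufferInputSource.hh>
#include <qpdf/QPDFTokenizer.hh>

#include <iostream>

int
main()
{
    BufferInputSource input("ignorable example", "1 % a comment\n2");
    QPDFTokenizer tokenizer;
    tokenizer.allowEOF();
    tokenizer.includeIgnorable();
    while (true) {
        auto token = tokenizer.readToken(input, "ignorable example", false, 0);
        if (token.getType() == QPDFTokenizer::tt_eof) {
            break;
        }
        // Expected sequence: integer "1", space, comment "% a comment",
        // space (the newline), integer "2".
        std::cout << token.getRawValue() << "\n";
    }
    return 0;
}
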
406 | | |
407 | | void |
408 | | Tokenizer::inString(char ch) |
409 | 0 | { |
410 | 0 | switch (ch) { |
411 | 0 | case '\\': |
412 | 0 | state = st_string_escape; |
413 | 0 | return; |
414 | | |
415 | 0 | case '(': |
416 | 0 | val += ch; |
417 | 0 | ++string_depth; |
418 | 0 | return; |
419 | | |
420 | 0 | case ')': |
421 | 0 | if (--string_depth == 0) { |
422 | 0 | type = tt::tt_string; |
423 | 0 | state = st_token_ready; |
424 | 0 | return; |
425 | 0 | } |
426 | | |
427 | 0 | val += ch; |
428 | 0 | return; |
429 | | |
430 | 0 | case '\r': |
431 | | // CR by itself is converted to LF |
432 | 0 | val += '\n'; |
433 | 0 | state = st_string_after_cr; |
434 | 0 | return; |
435 | | |
436 | 0 | case '\n': |
437 | 0 | val += ch; |
438 | 0 | return; |
439 | | |
440 | 0 | default: |
441 | 0 | val += ch; |
442 | 0 | return; |
443 | 0 | } |
444 | 0 | } |
445 | | |
446 | | void |
447 | | Tokenizer::inName(char ch) |
448 | 0 | { |
449 | 0 | if (isDelimiter(ch)) { |
450 | | // A C-locale whitespace character or delimiter terminates token. It is important to unread |
451 | | // the whitespace character even though it is ignored since it may be the newline after a |
452 | | // stream keyword. Removing it here could make the stream-reading code break on some files, |
453 | | // though not on any files in the test suite as of this |
454 | | // writing. |
455 | |
456 | 0 | type = bad ? tt::tt_bad : tt::tt_name; |
457 | 0 | in_token = false; |
458 | 0 | char_to_unread = ch; |
459 | 0 | state = st_token_ready; |
460 | 0 | } else if (ch == '#') { |
461 | 0 | char_code = 0; |
462 | 0 | state = st_name_hex1; |
463 | 0 | } else { |
464 | 0 | val += ch; |
465 | 0 | } |
466 | 0 | } |
467 | | |
468 | | void |
469 | | Tokenizer::inNameHex1(char ch) |
470 | 0 | { |
471 | 0 | hex_char = ch; |
472 | |
473 | 0 | if (char hval = util::hex_decode_char(ch); hval < '\20') { |
474 | 0 | char_code = int(hval) << 4; |
475 | 0 | state = st_name_hex2; |
476 | 0 | } else { |
477 | 0 | QTC::TC("qpdf", "QPDFTokenizer bad name 1"); |
478 | 0 | error_message = "name with stray # will not work with PDF >= 1.2"; |
479 | | // Use null to encode a bad # -- this is reversed in QPDF_Name::normalizeName. |
480 | 0 | val += '\0'; |
481 | 0 | state = st_name; |
482 | 0 | inName(ch); |
483 | 0 | } |
484 | 0 | } |
485 | | |
486 | | void |
487 | | Tokenizer::inNameHex2(char ch) |
488 | 0 | { |
489 | 0 | if (char hval = util::hex_decode_char(ch); hval < '\20') { |
490 | 0 | char_code |= int(hval); |
491 | 0 | } else { |
492 | 0 | QTC::TC("qpdf", "QPDFTokenizer bad name 2"); |
493 | 0 | error_message = "name with stray # will not work with PDF >= 1.2"; |
494 | | // Use null to encode a bad # -- this is reversed in QPDF_Name::normalizeName. |
495 | 0 | val += '\0'; |
496 | 0 | val += hex_char; |
497 | 0 | state = st_name; |
498 | 0 | inName(ch); |
499 | 0 | return; |
500 | 0 | } |
501 | 0 | if (char_code == 0) { |
502 | 0 | QTC::TC("qpdf", "QPDFTokenizer null in name"); |
503 | 0 | error_message = "null character not allowed in name token"; |
504 | 0 | val += "#00"; |
505 | 0 | state = st_name; |
506 | 0 | bad = true; |
507 | 0 | } else { |
508 | 0 | val += char(char_code); |
509 | 0 | state = st_name; |
510 | 0 | } |
511 | 0 | } |
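
The two name_hex states decode #xx escapes in name tokens, so the token's value carries the decoded bytes while the raw value keeps the spelling from the file. For example (same BufferInputSource assumption as above):

#include <qpdf/BufferInputSource.hh>
#include <qpdf/QPDFTokenizer.hh>

#include <iostream>

int
main()
{
    BufferInputSource input("name example", "/Adobe#20Green");
    QPDFTokenizer tokenizer;
    auto token = tokenizer.readToken(input, "name example", false, 0);
    std::cout << token.getValue() << "\n";    // /Adobe Green  (#20 decoded)
    std::cout << token.getRawValue() << "\n"; // /Adobe#20Green
    return 0;
}
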
512 | | |
513 | | void |
514 | | Tokenizer::inSign(char ch) |
515 | 0 | { |
516 | 0 | if (util::is_digit(ch)) { |
517 | 0 | state = st_number; |
518 | 0 | } else if (ch == '.') { |
519 | 0 | state = st_decimal; |
520 | 0 | } else { |
521 | 0 | state = st_literal; |
522 | 0 | inLiteral(ch); |
523 | 0 | } |
524 | 0 | } |
525 | | |
526 | | void |
527 | | Tokenizer::inDecimal(char ch) |
528 | 0 | { |
529 | 0 | if (util::is_digit(ch)) { |
530 | 0 | state = st_real; |
531 | 0 | } else { |
532 | 0 | state = st_literal; |
533 | 0 | inLiteral(ch); |
534 | 0 | } |
535 | 0 | } |
536 | | |
537 | | void |
538 | | Tokenizer::inNumber(char ch) |
539 | 0 | { |
540 | 0 | if (util::is_digit(ch)) { |
541 | 0 | } else if (ch == '.') { |
542 | 0 | state = st_real; |
543 | 0 | } else if (isDelimiter(ch)) { |
544 | 0 | type = tt::tt_integer; |
545 | 0 | state = st_token_ready; |
546 | 0 | in_token = false; |
547 | 0 | char_to_unread = ch; |
548 | 0 | } else { |
549 | 0 | state = st_literal; |
550 | 0 | } |
551 | 0 | } |
552 | | |
553 | | void |
554 | | Tokenizer::inReal(char ch) |
555 | 0 | { |
556 | 0 | if (util::is_digit(ch)) { |
557 | 0 | } else if (isDelimiter(ch)) { |
558 | 0 | type = tt::tt_real; |
559 | 0 | state = st_token_ready; |
560 | 0 | in_token = false; |
561 | 0 | char_to_unread = ch; |
562 | 0 | } else { |
563 | 0 | state = st_literal; |
564 | 0 | } |
565 | 0 | } |
566 | | void |
567 | | Tokenizer::inStringEscape(char ch) |
568 | 0 | { |
569 | 0 | state = st_in_string; |
570 | 0 | switch (ch) { |
571 | 0 | case '0': |
572 | 0 | case '1': |
573 | 0 | case '2': |
574 | 0 | case '3': |
575 | 0 | case '4': |
576 | 0 | case '5': |
577 | 0 | case '6': |
578 | 0 | case '7': |
579 | 0 | state = st_char_code; |
580 | 0 | char_code = 0; |
581 | 0 | digit_count = 0; |
582 | 0 | inCharCode(ch); |
583 | 0 | return; |
584 | | |
585 | 0 | case 'n': |
586 | 0 | val += '\n'; |
587 | 0 | return; |
588 | | |
589 | 0 | case 'r': |
590 | 0 | val += '\r'; |
591 | 0 | return; |
592 | | |
593 | 0 | case 't': |
594 | 0 | val += '\t'; |
595 | 0 | return; |
596 | | |
597 | 0 | case 'b': |
598 | 0 | val += '\b'; |
599 | 0 | return; |
600 | | |
601 | 0 | case 'f': |
602 | 0 | val += '\f'; |
603 | 0 | return; |
604 | | |
605 | 0 | case '\n': |
606 | 0 | return; |
607 | | |
608 | 0 | case '\r': |
609 | 0 | state = st_string_after_cr; |
610 | 0 | return; |
611 | | |
612 | 0 | default: |
613 | | // PDF spec says backslash is ignored before anything else |
614 | 0 | val += ch; |
615 | 0 | return; |
616 | 0 | } |
617 | 0 | } |
618 | | |
619 | | void |
620 | | Tokenizer::inStringAfterCR(char ch) |
621 | 0 | { |
622 | 0 | state = st_in_string; |
623 | 0 | if (ch != '\n') { |
624 | 0 | inString(ch); |
625 | 0 | } |
626 | 0 | } |
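
Together, inString, inStringEscape, inStringAfterCR, and inCharCode (further down) decode literal strings: balanced parentheses nest without escaping, the named escapes and octal \ddd codes are translated, and a bare carriage return is normalized to a line feed. A short sketch of the resulting value:

#include <qpdf/BufferInputSource.hh>
#include <qpdf/QPDFTokenizer.hh>

#include <iostream>

int
main()
{
    // The C++ escaping below produces the PDF bytes:  (ab (cd) \101\n)
    BufferInputSource input("string example", "(ab (cd) \\101\\n)");
    QPDFTokenizer tokenizer;
    auto token = tokenizer.readToken(input, "string example", false, 0);
    // getValue() holds the decoded bytes: "ab (cd) A\n"
    std::cout << token.getValue() << "\n";
    return 0;
}
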
627 | | |
628 | | void |
629 | | Tokenizer::inLt(char ch) |
630 | 0 | { |
631 | 0 | if (ch == '<') { |
632 | 0 | type = tt::tt_dict_open; |
633 | 0 | state = st_token_ready; |
634 | 0 | return; |
635 | 0 | } |
636 | | |
637 | 0 | state = st_in_hexstring; |
638 | 0 | inHexstring(ch); |
639 | 0 | } |
640 | | |
641 | | void |
642 | | Tokenizer::inGt(char ch) |
643 | 0 | { |
644 | 0 | if (ch == '>') { |
645 | 0 | type = tt::tt_dict_close; |
646 | 0 | state = st_token_ready; |
647 | 0 | } else { |
648 | 0 | type = tt::tt_bad; |
649 | 0 | QTC::TC("qpdf", "QPDFTokenizer bad >"); |
650 | 0 | error_message = "unexpected >"; |
651 | 0 | in_token = false; |
652 | 0 | char_to_unread = ch; |
653 | 0 | state = st_token_ready; |
654 | 0 | } |
655 | 0 | } |
656 | | |
657 | | void |
658 | | Tokenizer::inLiteral(char ch) |
659 | 0 | { |
660 | 0 | if (isDelimiter(ch)) { |
661 | | // A C-locale whitespace character or delimiter terminates token. It is important to unread |
662 | | // the whitespace character even though it is ignored since it may be the newline after a |
663 | | // stream keyword. Removing it here could make the stream-reading code break on some files, |
664 | | // though not on any files in the test suite as of this writing. |
665 | |
666 | 0 | in_token = false; |
667 | 0 | char_to_unread = ch; |
668 | 0 | state = st_token_ready; |
669 | 0 | type = (raw_val == "true") || (raw_val == "false") |
670 | 0 | ? tt::tt_bool |
671 | 0 | : (raw_val == "null" ? tt::tt_null : tt::tt_word); |
672 | 0 | } |
673 | 0 | } |
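
Anything that reaches st_literal and is then terminated by a delimiter is classified here: the keywords true/false and null get their own types, and everything else becomes a tt_word (operators, obj/endobj, and so on). For instance:

#include <qpdf/BufferInputSource.hh>
#include <qpdf/QPDFTokenizer.hh>

#include <iostream>

int
main()
{
    // Expected types, in order: tt_integer, tt_real, tt_bool, tt_null, tt_word.
    BufferInputSource input("literal example", "42 -3.14 true null endobj");
    QPDFTokenizer tokenizer;
    tokenizer.allowEOF();
    while (true) {
        auto token = tokenizer.readToken(input, "literal example", false, 0);
        if (token.getType() == QPDFTokenizer::tt_eof) {
            break;
        }
        std::cout << token.getValue() << "\n";
    }
    return 0;
}
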
674 | | |
675 | | void |
676 | | Tokenizer::inHexstring(char ch) |
677 | 0 | { |
678 | 0 | if (char hval = util::hex_decode_char(ch); hval < '\20') { |
679 | 0 | char_code = int(hval) << 4; |
680 | 0 | state = st_in_hexstring_2nd; |
681 | |
682 | 0 | } else if (ch == '>') { |
683 | 0 | type = tt::tt_string; |
684 | 0 | state = st_token_ready; |
685 | |
686 | 0 | } else if (isSpace(ch)) { |
687 | | // ignore |
688 | |
689 | 0 | } else { |
690 | 0 | type = tt::tt_bad; |
691 | 0 | QTC::TC("qpdf", "QPDFTokenizer bad hexstring character"); |
692 | 0 | error_message = std::string("invalid character (") + ch + ") in hexstring"; |
693 | 0 | state = st_token_ready; |
694 | 0 | } |
695 | 0 | } |
696 | | |
697 | | void |
698 | | Tokenizer::inHexstring2nd(char ch) |
699 | 0 | { |
700 | 0 | if (char hval = util::hex_decode_char(ch); hval < '\20') { |
701 | 0 | val += char(char_code) | hval; |
702 | 0 | state = st_in_hexstring; |
703 | |
704 | 0 | } else if (ch == '>') { |
705 | | // PDF spec says odd hexstrings have implicit trailing 0. |
706 | 0 | val += char(char_code); |
707 | 0 | type = tt::tt_string; |
708 | 0 | state = st_token_ready; |
709 | |
710 | 0 | } else if (isSpace(ch)) { |
711 | | // ignore |
712 | |
713 | 0 | } else { |
714 | 0 | type = tt::tt_bad; |
715 | 0 | QTC::TC("qpdf", "QPDFTokenizer bad hexstring 2nd character"); |
716 | 0 | error_message = std::string("invalid character (") + ch + ") in hexstring"; |
717 | 0 | state = st_token_ready; |
718 | 0 | } |
719 | 0 | } |
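
Hex strings are also reported as tt_string tokens: whitespace between digits is skipped, and an odd number of digits gets an implicit trailing zero, as the branch above handles. For example:

#include <qpdf/BufferInputSource.hh>
#include <qpdf/QPDFTokenizer.hh>

#include <iostream>

int
main()
{
    // <48 65 6C 6C 6F 4> decodes to "Hello@" -- the lone final digit 4 is
    // treated as 0x40 because of the implicit trailing zero.
    BufferInputSource input("hexstring example", "<48 65 6C 6C 6F 4>");
    QPDFTokenizer tokenizer;
    auto token = tokenizer.readToken(input, "hexstring example", false, 0);
    std::cout << token.getValue() << "\n";
    return 0;
}
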
720 | | |
721 | | void |
722 | | Tokenizer::inCharCode(char ch) |
723 | 0 | { |
724 | 0 | bool handled = false; |
725 | 0 | if (('0' <= ch) && (ch <= '7')) { |
726 | 0 | char_code = 8 * char_code + (int(ch) - int('0')); |
727 | 0 | if (++(digit_count) < 3) { |
728 | 0 | return; |
729 | 0 | } |
730 | 0 | handled = true; |
731 | 0 | } |
732 | | // We've accumulated \ddd or we have \d or \dd followed by other than an octal digit. The PDF |
733 | | // Spec says to ignore high-order overflow. |
734 | 0 | val += char(char_code % 256); |
735 | 0 | state = st_in_string; |
736 | 0 | if (!handled) { |
737 | 0 | inString(ch); |
738 | 0 | } |
739 | 0 | } |
740 | | |
741 | | void |
742 | | Tokenizer::inInlineImage(char ch) |
743 | 0 | { |
744 | 0 | if ((raw_val.length() + 1) == inline_image_bytes) { |
745 | 0 | QTC::TC("qpdf", "QPDFTokenizer found EI by byte count"); |
746 | 0 | type = tt::tt_inline_image; |
747 | 0 | inline_image_bytes = 0; |
748 | 0 | state = st_token_ready; |
749 | 0 | } |
750 | 0 | } |
751 | | |
752 | | void |
753 | | QPDFTokenizer::presentEOF() |
754 | 0 | { |
755 | 0 | m->presentEOF(); |
756 | 0 | } |
757 | | |
758 | | void |
759 | | Tokenizer::presentEOF() |
760 | 0 | { |
761 | 0 | switch (state) { |
762 | 0 | case st_name: |
763 | 0 | case st_name_hex1: |
764 | 0 | case st_name_hex2: |
765 | 0 | case st_number: |
766 | 0 | case st_real: |
767 | 0 | case st_sign: |
768 | 0 | case st_decimal: |
769 | 0 | case st_literal: |
770 | 0 | QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token"); |
771 | | // Push any delimiter to the state machine to finish off the final token. |
772 | 0 | presentCharacter('\f'); |
773 | 0 | in_token = true; |
774 | 0 | break; |
775 | | |
776 | 0 | case st_top: |
777 | 0 | case st_before_token: |
778 | 0 | type = tt::tt_eof; |
779 | 0 | break; |
780 | | |
781 | 0 | case st_in_space: |
782 | 0 | type = include_ignorable ? tt::tt_space : tt::tt_eof; |
783 | 0 | break; |
784 | | |
785 | 0 | case st_in_comment: |
786 | 0 | type = include_ignorable ? tt::tt_comment : tt::tt_bad; |
787 | 0 | break; |
788 | | |
789 | 0 | case st_token_ready: |
790 | 0 | break; |
791 | | |
792 | 0 | default: |
793 | 0 | QTC::TC("qpdf", "QPDFTokenizer EOF reading token"); |
794 | 0 | type = tt::tt_bad; |
795 | 0 | error_message = "EOF while reading token"; |
796 | 0 | } |
797 | 0 | state = st_token_ready; |
798 | 0 | } |
799 | | |
800 | | void |
801 | | QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input) |
802 | 0 | { |
803 | 0 | m->expectInlineImage(*input); |
804 | 0 | } |
805 | | |
806 | | void |
807 | | QPDFTokenizer::expectInlineImage(InputSource& input) |
808 | 0 | { |
809 | 0 | m->expectInlineImage(input); |
810 | 0 | } |
811 | | |
812 | | void |
813 | | Tokenizer::expectInlineImage(InputSource& input) |
814 | 0 | { |
815 | 0 | if (state == st_token_ready) { |
816 | 0 | reset(); |
817 | 0 | } else if (state != st_before_token) { |
818 | 0 | throw std::logic_error( |
819 | 0 | "QPDFTokenizer::expectInlineImage called when tokenizer is in improper state"); |
820 | 0 | } |
821 | 0 | findEI(input); |
822 | 0 | before_token = false; |
823 | 0 | in_token = true; |
824 | 0 | state = st_inline_image; |
825 | 0 | } |
826 | | |
827 | | void |
828 | | Tokenizer::findEI(InputSource& input) |
829 | 0 | { |
830 | 0 | qpdf_offset_t last_offset = input.getLastOffset(); |
831 | 0 | qpdf_offset_t pos = input.tell(); |
832 | | |
833 | | // Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several |
834 | | // tokens or up to EOF. If we find any suspicious-looking or bad tokens, this is probably still part
835 | | // of the image data, so keep looking for EI. Stop at the first EI that passes. If we get to the |
836 | | // end without finding one, return the last EI we found. Store the number of bytes expected in |
837 | | // the inline image including the EI and use that to break out of inline image, falling back to |
838 | | // the old method if needed. |
839 | |
840 | 0 | bool okay = false; |
841 | 0 | bool first_try = true; |
842 | 0 | while (!okay) { |
843 | 0 | QPDFWordTokenFinder f(input, "EI"); |
844 | 0 | if (!input.findFirst("EI", input.tell(), 0, f)) { |
845 | 0 | break; |
846 | 0 | } |
847 | 0 | inline_image_bytes = QIntC::to_size(input.tell() - pos - 2); |
848 | |
849 | 0 | Tokenizer check; |
850 | 0 | bool found_bad = false; |
851 | | // Look at the next 10 tokens or up to EOF. The next inline image's image data would look |
852 | | // like bad tokens, but there will always be at least 10 tokens between one inline image's |
853 | | // EI and the next valid one's ID since width, height, bits per pixel, and color space are |
854 | | // all required as well as a BI and ID. If we get 10 good tokens in a row or hit EOF, we can |
855 | | // be pretty sure we've found the actual EI. |
856 | 0 | for (int i = 0; i < 10; ++i) { |
857 | 0 | check.nextToken(input, "checker"); |
858 | 0 | auto typ = check.getType(); |
859 | 0 | if (typ == tt::tt_eof) { |
860 | 0 | okay = true; |
861 | 0 | } else if (typ == tt::tt_bad) { |
862 | 0 | found_bad = true; |
863 | 0 | } else if (typ == tt::tt_word) { |
864 | | // The qpdf tokenizer lumps alphabetic and otherwise uncategorized characters into |
865 | | // "words". We recognize strings of alphabetic characters as potential valid |
866 | | // operators for purposes of telling whether we're in valid content or not. It's not |
867 | | // perfect, but it should work more reliably than what we used to do, which was |
868 | | // already good enough for the vast majority of files. |
869 | 0 | bool found_alpha = false; |
870 | 0 | bool found_non_printable = false; |
871 | 0 | bool found_other = false; |
872 | 0 | for (char ch: check.getValue()) { |
873 | 0 | if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch == '*')) { |
874 | | // Treat '*' as alpha since there are valid PDF operators that contain * |
875 | | // along with alphabetic characters. |
876 | 0 | found_alpha = true; |
877 | 0 | } else if (static_cast<signed char>(ch) < 32 && !isSpace(ch)) { |
878 | | // Compare ch as a signed char so characters outside of 7-bit will be < 0. |
879 | 0 | found_non_printable = true; |
880 | 0 | break; |
881 | 0 | } else { |
882 | 0 | found_other = true; |
883 | 0 | } |
884 | 0 | } |
885 | 0 | if (found_non_printable || (found_alpha && found_other)) { |
886 | 0 | found_bad = true; |
887 | 0 | } |
888 | 0 | } |
889 | 0 | if (okay || found_bad) { |
890 | 0 | break; |
891 | 0 | } |
892 | 0 | } |
893 | 0 | if (!found_bad) { |
894 | 0 | okay = true; |
895 | 0 | } |
896 | 0 | if (!okay) { |
897 | 0 | first_try = false; |
898 | 0 | } |
899 | 0 | } |
900 | 0 | if (okay && (!first_try)) { |
901 | 0 | QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try"); |
902 | 0 | } |
903 | |
904 | 0 | input.seek(pos, SEEK_SET); |
905 | 0 | input.setLastOffset(last_offset); |
906 | 0 | } |
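
The tokenizer never detects inline images on its own: after the caller sees the ID operator it calls expectInlineImage(), findEI() above estimates where the image data ends, and the next token comes back as tt_inline_image carrying the raw bytes. A rough sketch of that hand-off with a made-up content stream (the single byte after ID -- normally a space -- is consumed before handing off, mirroring how qpdf's content-stream parsing uses this API):

#include <qpdf/BufferInputSource.hh>
#include <qpdf/QPDFTokenizer.hh>

#include <iostream>
#include <string>

int
main()
{
    // Made-up content stream: inline image dictionary, ID, raw data, EI.
    std::string content = "BI /W 1 /H 1 /BPC 8 /CS /G ID \x01\x02\x03 EI Q";
    BufferInputSource input("content", content);
    QPDFTokenizer tokenizer;
    tokenizer.allowEOF();
    while (true) {
        auto token = tokenizer.readToken(input, "content", false, 0);
        if (token.getType() == QPDFTokenizer::tt_eof) {
            break;
        }
        if (token.getType() == QPDFTokenizer::tt_word && token.getValue() == "ID") {
            char ch;
            input.read(&ch, 1); // discard the whitespace byte that ended "ID"
            tokenizer.expectInlineImage(input);
            auto image = tokenizer.readToken(input, "content", false, 0);
            // image.getValue() holds the bytes up to (but not including) "EI".
            std::cout << "inline image: " << image.getValue().size() << " bytes\n";
        }
    }
    return 0;
}
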
907 | | |
908 | | bool |
909 | | QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch) |
910 | 0 | { |
911 | 0 | return m->getToken(token, unread_char, ch); |
912 | 0 | } |
913 | | |
914 | | bool |
915 | | Tokenizer::getToken(Token& token, bool& unread_char, char& ch) |
916 | 0 | { |
917 | 0 | bool ready = (state == st_token_ready); |
918 | 0 | unread_char = !in_token && !before_token; |
919 | 0 | ch = char_to_unread; |
920 | 0 | if (ready) { |
921 | 0 | token = (!(type == tt::tt_name || type == tt::tt_string)) |
922 | 0 | ? Token(type, raw_val, raw_val, error_message) |
923 | 0 | : Token(type, val, raw_val, error_message); |
924 | |
925 | 0 | reset(); |
926 | 0 | } |
927 | 0 | return ready; |
928 | 0 | } |
929 | | |
930 | | bool |
931 | | QPDFTokenizer::betweenTokens() |
932 | 0 | { |
933 | 0 | return m->betweenTokens(); |
934 | 0 | } |
935 | | |
936 | | bool |
937 | | Tokenizer::betweenTokens() |
938 | 0 | { |
939 | 0 | return before_token; |
940 | 0 | } |
941 | | |
942 | | QPDFTokenizer::Token |
943 | | QPDFTokenizer::readToken( |
944 | | InputSource& input, std::string const& context, bool allow_bad, size_t max_len) |
945 | 0 | { |
946 | 0 | return m->readToken(input, context, allow_bad, max_len); |
947 | 0 | } |
948 | | |
949 | | QPDFTokenizer::Token |
950 | | QPDFTokenizer::readToken( |
951 | | std::shared_ptr<InputSource> input, std::string const& context, bool allow_bad, size_t max_len) |
952 | 0 | { |
953 | 0 | return m->readToken(*input, context, allow_bad, max_len); |
954 | 0 | } |
955 | | |
956 | | QPDFTokenizer::Token |
957 | | Tokenizer::readToken(InputSource& input, std::string const& context, bool allow_bad, size_t max_len) |
958 | 0 | { |
959 | 0 | nextToken(input, context, max_len); |
960 | |
961 | 0 | Token token; |
962 | 0 | bool unread_char; |
963 | 0 | char char_to_unread; |
964 | 0 | getToken(token, unread_char, char_to_unread); |
965 | |
966 | 0 | if (token.getType() == tt::tt_bad) { |
967 | 0 | if (allow_bad) { |
968 | 0 | QTC::TC("qpdf", "QPDFTokenizer allowing bad token"); |
969 | 0 | } else { |
970 | 0 | throw QPDFExc( |
971 | 0 | qpdf_e_damaged_pdf, |
972 | 0 | input.getName(), |
973 | 0 | context.empty() ? "offset " + std::to_string(input.getLastOffset()) : context, |
974 | 0 | input.getLastOffset(), |
975 | 0 | token.getErrorMessage()); |
976 | 0 | } |
977 | 0 | } |
978 | 0 | return token; |
979 | 0 | } |
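
readToken() wraps nextToken() with error handling: a tt_bad token either comes back to the caller (allow_bad) or is converted into a QPDFExc against the input's name and offset. Roughly:

#include <qpdf/BufferInputSource.hh>
#include <qpdf/QPDFExc.hh>
#include <qpdf/QPDFTokenizer.hh>

#include <iostream>

int
main()
{
    QPDFTokenizer tokenizer;

    // A lone ')' is not a valid token (see inTop above); by default this throws.
    BufferInputSource strict("bad token example", ") 1 2");
    try {
        tokenizer.readToken(strict, "bad token example", false, 0);
    } catch (QPDFExc const& e) {
        std::cout << "threw: " << e.what() << "\n";
    }

    // With allow_bad the same input yields a tt_bad token to inspect instead.
    BufferInputSource lenient("bad token example", ") 1 2");
    auto token = tokenizer.readToken(lenient, "bad token example", true, 0);
    std::cout << token.getErrorMessage() << "\n"; // unexpected )
    return 0;
}
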
980 | | |
981 | | bool |
982 | | Tokenizer::nextToken(InputSource& input, std::string const& context, size_t max_len) |
983 | 0 | { |
984 | 0 | if (state != st_inline_image) { |
985 | 0 | reset(); |
986 | 0 | } |
987 | 0 | qpdf_offset_t offset = input.fastTell(); |
988 | |
989 | 0 | while (state != st_token_ready) { |
990 | 0 | char ch; |
991 | 0 | if (!input.fastRead(ch)) { |
992 | 0 | presentEOF(); |
993 | |
994 | 0 | if ((type == tt::tt_eof) && (!allow_eof)) { |
995 | | // Nothing in the qpdf library calls readToken without allowEOF anymore, so this |
996 | | // case is not exercised. |
997 | 0 | type = tt::tt_bad; |
998 | 0 | error_message = "unexpected EOF"; |
999 | 0 | offset = input.getLastOffset(); |
1000 | 0 | } |
1001 | 0 | } else { |
1002 | 0 | handleCharacter(ch); |
1003 | 0 | if (before_token) { |
1004 | 0 | ++offset; |
1005 | 0 | } |
1006 | 0 | if (in_token) { |
1007 | 0 | raw_val += ch; |
1008 | 0 | } |
1009 | 0 | if (max_len && (raw_val.length() >= max_len) && (state != st_token_ready)) { |
1010 | | // terminate this token now |
1011 | 0 | QTC::TC("qpdf", "QPDFTokenizer block long token"); |
1012 | 0 | type = tt::tt_bad; |
1013 | 0 | state = st_token_ready; |
1014 | 0 | error_message = "exceeded allowable length while reading token"; |
1015 | 0 | } |
1016 | 0 | } |
1017 | 0 | } |
1018 | |
1019 | 0 | input.fastUnread(!in_token && !before_token); |
1020 | |
1021 | 0 | if (type != tt::tt_eof) { |
1022 | 0 | input.setLastOffset(offset); |
1023 | 0 | } |
1024 | |
1025 | 0 | return error_message.empty(); |
1026 | 0 | } |
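
nextToken() is the pull-style core behind readToken(); the max_len guard near the end is what keeps a damaged file from growing a single token without bound. Through the public wrapper, that limit behaves roughly like this:

#include <qpdf/BufferInputSource.hh>
#include <qpdf/QPDFTokenizer.hh>

#include <iostream>
#include <string>

int
main()
{
    // A 64-byte cap: the 100-character name below is cut off and reported as bad.
    BufferInputSource input("max_len example", "/" + std::string(100, 'A'));
    QPDFTokenizer tokenizer;
    auto token = tokenizer.readToken(input, "max_len example", true, 64);
    std::cout << token.getErrorMessage() << "\n";
    // Prints: exceeded allowable length while reading token
    return 0;
}
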