/src/qpdf/libqpdf/QPDFTokenizer.cc
Line | Count | Source (jump to first uncovered line) |
1 | | #include <qpdf/QPDFTokenizer_private.hh> |
2 | | |
3 | | // DO NOT USE ctype -- it is locale dependent for some things, and it's not worth the risk of |
4 | | // including it in case it may accidentally be used. |
5 | | |
6 | | #include <qpdf/InputSource_private.hh> |
7 | | #include <qpdf/QIntC.hh> |
8 | | #include <qpdf/QPDFExc.hh> |
9 | | #include <qpdf/QPDFObjectHandle.hh> |
10 | | #include <qpdf/QTC.hh> |
11 | | #include <qpdf/QUtil.hh> |
12 | | #include <qpdf/Util.hh> |
13 | | |
14 | | #include <cstdlib> |
15 | | #include <cstring> |
16 | | #include <stdexcept> |
17 | | |
18 | | using namespace qpdf; |
19 | | |
20 | | using Token = QPDFTokenizer::Token; |
21 | | using tt = QPDFTokenizer::token_type_e; |
22 | | |
// Returns true when ch ends a token: NUL, the six whitespace characters tested here
// (space, \t, \n, \v, \f, \r), or one of the punctuation characters ( ) < > [ ] { } / %.
// The test is spelled out character by character rather than using ctype, which is locale
// dependent.
static inline bool
is_delimiter(char ch)
{
    switch (ch) {
    // NUL and whitespace
    case 0:
    case ' ':
    case '\t':
    case '\n':
    case '\v':
    case '\f':
    case '\r':
    // punctuation delimiters
    case '(':
    case ')':
    case '<':
    case '>':
    case '[':
    case ']':
    case '{':
    case '}':
    case '/':
    case '%':
        return true;

    default:
        return false;
    }
}
31 | | |
32 | | namespace |
33 | | { |
34 | | class QPDFWordTokenFinder: public InputSource::Finder |
35 | | { |
36 | | public: |
37 | | QPDFWordTokenFinder(InputSource& is, std::string const& str) : |
38 | 75.9k | is(is), |
39 | 75.9k | str(str) |
40 | 75.9k | { |
41 | 75.9k | } |
42 | 75.9k | ~QPDFWordTokenFinder() override = default; |
43 | | bool check() override; |
44 | | |
45 | | private: |
46 | | InputSource& is; |
47 | | std::string str; |
48 | | }; |
49 | | } // namespace |
50 | | |
51 | | bool |
52 | | QPDFWordTokenFinder::check() |
53 | 138k | { |
54 | | // Find a word token matching the given string, preceded by a delimiter, and followed by a |
55 | | // delimiter or EOF. |
56 | 138k | Tokenizer tokenizer; |
57 | 138k | tokenizer.nextToken(is, "finder", str.size() + 2); |
58 | 138k | qpdf_offset_t pos = is.tell(); |
59 | 138k | if (tokenizer.getType() != tt::tt_word || tokenizer.getValue() != str) { |
60 | 65.8k | QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); |
61 | 65.8k | return false; |
62 | 65.8k | } |
63 | 72.3k | qpdf_offset_t token_start = is.getLastOffset(); |
64 | 72.3k | char next; |
65 | 72.3k | bool next_okay = false; |
66 | 72.3k | if (is.read(&next, 1) == 0) { |
67 | 12 | QTC::TC("qpdf", "QPDFTokenizer inline image at EOF"); |
68 | 12 | next_okay = true; |
69 | 72.3k | } else { |
70 | 72.3k | next_okay = is_delimiter(next); |
71 | 72.3k | } |
72 | 72.3k | is.seek(pos, SEEK_SET); |
73 | 72.3k | if (!next_okay) { |
74 | 0 | return false; |
75 | 0 | } |
76 | 72.3k | if (token_start == 0) { |
77 | | // Can't actually happen...we never start the search at the beginning of the input. |
78 | 0 | return false; |
79 | 0 | } |
80 | 72.3k | return true; |
81 | 72.3k | } |
82 | | |
83 | | void |
84 | | Tokenizer::reset() |
85 | 401M | { |
86 | 401M | state = st_before_token; |
87 | 401M | type = tt::tt_bad; |
88 | 401M | val.clear(); |
89 | 401M | raw_val.clear(); |
90 | 401M | error_message = ""; |
91 | 401M | before_token = true; |
92 | 401M | in_token = false; |
93 | 401M | char_to_unread = '\0'; |
94 | 401M | inline_image_bytes = 0; |
95 | 401M | string_depth = 0; |
96 | 401M | bad = false; |
97 | 401M | } |
98 | | |
99 | | QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) : |
100 | 3.61k | type(type), |
101 | 3.61k | value(value), |
102 | 3.61k | raw_value(value) |
103 | 3.61k | { |
104 | 3.61k | if (type == tt_string) { |
105 | 0 | raw_value = QPDFObjectHandle::newString(value).unparse(); |
106 | 3.61k | } else if (type == tt_name) { |
107 | 0 | raw_value = QPDFObjectHandle::newName(value).unparse(); |
108 | 0 | } |
109 | 3.61k | } |
110 | | |
111 | | QPDFTokenizer::QPDFTokenizer() : |
112 | 15.1k | m(std::make_unique<qpdf::Tokenizer>()) |
113 | 15.1k | { |
114 | 15.1k | } |
115 | | |
116 | 15.1k | QPDFTokenizer::~QPDFTokenizer() = default; |
117 | | |
118 | | Tokenizer::Tokenizer() |
119 | 760k | { |
120 | 760k | reset(); |
121 | 760k | } |
122 | | |
123 | | void |
124 | | QPDFTokenizer::allowEOF() |
125 | 15.1k | { |
126 | 15.1k | m->allowEOF(); |
127 | 15.1k | } |
128 | | |
129 | | void |
130 | | Tokenizer::allowEOF() |
131 | 331k | { |
132 | 331k | allow_eof = true; |
133 | 331k | } |
134 | | |
135 | | void |
136 | | QPDFTokenizer::includeIgnorable() |
137 | 15.1k | { |
138 | 15.1k | m->includeIgnorable(); |
139 | 15.1k | } |
140 | | |
141 | | void |
142 | | Tokenizer::includeIgnorable() |
143 | 15.1k | { |
144 | 15.1k | include_ignorable = true; |
145 | 15.1k | } |
146 | | |
147 | | bool |
148 | | Tokenizer::isSpace(char ch) |
149 | 477M | { |
150 | 477M | return (ch == '\0' || util::is_space(ch)); |
151 | 477M | } |
152 | | |
153 | | bool |
154 | | Tokenizer::isDelimiter(char ch) |
155 | 1.56G | { |
156 | 1.56G | return is_delimiter(ch); |
157 | 1.56G | } |
158 | | |
159 | | void |
160 | | QPDFTokenizer::presentCharacter(char ch) |
161 | 0 | { |
162 | 0 | m->presentCharacter(ch); |
163 | 0 | } |
164 | | |
165 | | void |
166 | | Tokenizer::presentCharacter(char ch) |
167 | 382k | { |
168 | 382k | handleCharacter(ch); |
169 | | |
170 | 382k | if (in_token) { |
171 | 0 | raw_val += ch; |
172 | 0 | } |
173 | 382k | } |
174 | | |
175 | | void |
176 | | Tokenizer::handleCharacter(char ch) |
177 | 4.33G | { |
178 | | // In some cases, functions called below may call a second handler. This happens whenever you |
179 | | // have to use a character from the next token to detect the end of the current token. |
180 | | |
181 | 4.33G | switch (state) { |
182 | 0 | case st_top: |
183 | 0 | inTop(ch); |
184 | 0 | return; |
185 | | |
186 | 89.1M | case st_in_space: |
187 | 89.1M | inSpace(ch); |
188 | 89.1M | return; |
189 | | |
190 | 268M | case st_in_comment: |
191 | 268M | inComment(ch); |
192 | 268M | return; |
193 | | |
194 | 6.64M | case st_lt: |
195 | 6.64M | inLt(ch); |
196 | 6.64M | return; |
197 | | |
198 | 4.92M | case st_gt: |
199 | 4.92M | inGt(ch); |
200 | 4.92M | return; |
201 | | |
202 | 1.76G | case st_in_string: |
203 | 1.76G | inString(ch); |
204 | 1.76G | return; |
205 | | |
206 | 815M | case st_name: |
207 | 815M | inName(ch); |
208 | 815M | return; |
209 | | |
210 | 91.3M | case st_number: |
211 | 91.3M | inNumber(ch); |
212 | 91.3M | return; |
213 | | |
214 | 5.91M | case st_real: |
215 | 5.91M | inReal(ch); |
216 | 5.91M | return; |
217 | | |
218 | 7.25M | case st_string_after_cr: |
219 | 7.25M | inStringAfterCR(ch); |
220 | 7.25M | return; |
221 | | |
222 | 8.25M | case st_string_escape: |
223 | 8.25M | inStringEscape(ch); |
224 | 8.25M | return; |
225 | | |
226 | 831k | case st_char_code: |
227 | 831k | inCharCode(ch); |
228 | 831k | return; |
229 | | |
230 | 702M | case st_literal: |
231 | 702M | inLiteral(ch); |
232 | 702M | return; |
233 | | |
234 | 164M | case st_inline_image: |
235 | 164M | inInlineImage(ch); |
236 | 164M | return; |
237 | | |
238 | 7.76M | case st_in_hexstring: |
239 | 7.76M | inHexstring(ch); |
240 | 7.76M | return; |
241 | | |
242 | 7.68M | case st_in_hexstring_2nd: |
243 | 7.68M | inHexstring2nd(ch); |
244 | 7.68M | return; |
245 | | |
246 | 1.49M | case st_name_hex1: |
247 | 1.49M | inNameHex1(ch); |
248 | 1.49M | return; |
249 | | |
250 | 498k | case st_name_hex2: |
251 | 498k | inNameHex2(ch); |
252 | 498k | return; |
253 | | |
254 | 1.25M | case st_sign: |
255 | 1.25M | inSign(ch); |
256 | 1.25M | return; |
257 | | |
258 | 458k | case st_decimal: |
259 | 458k | inDecimal(ch); |
260 | 458k | return; |
261 | | |
262 | 386M | case (st_before_token): |
263 | 386M | inBeforeToken(ch); |
264 | 386M | return; |
265 | | |
266 | 0 | case (st_token_ready): |
267 | 0 | inTokenReady(ch); |
268 | 0 | return; |
269 | | |
270 | 0 | default: |
271 | 0 | throw std::logic_error("INTERNAL ERROR: invalid state while reading token"); |
272 | 4.33G | } |
273 | 4.33G | } |
274 | | |
275 | | void |
276 | | Tokenizer::inTokenReady(char ch) |
277 | 0 | { |
278 | 0 | throw std::logic_error( |
279 | 0 | "INTERNAL ERROR: QPDF tokenizer presented character while token is waiting"); |
280 | 0 | } |
281 | | |
282 | | void |
283 | | Tokenizer::inBeforeToken(char ch) |
284 | 386M | { |
285 | | // Note: we specifically do not use ctype here. It is locale-dependent. |
286 | 386M | if (isSpace(ch)) { |
287 | 190M | before_token = !include_ignorable; |
288 | 190M | in_token = include_ignorable; |
289 | 190M | if (include_ignorable) { |
290 | 50.6M | state = st_in_space; |
291 | 50.6M | } |
292 | 196M | } else if (ch == '%') { |
293 | 619k | before_token = !include_ignorable; |
294 | 619k | in_token = include_ignorable; |
295 | 619k | state = st_in_comment; |
296 | 195M | } else { |
297 | 195M | before_token = false; |
298 | 195M | in_token = true; |
299 | 195M | inTop(ch); |
300 | 195M | } |
301 | 386M | } |
302 | | |
303 | | void |
304 | | Tokenizer::inTop(char ch) |
305 | 195M | { |
306 | 195M | switch (ch) { |
307 | 1.62M | case '(': |
308 | 1.62M | string_depth = 1; |
309 | 1.62M | state = st_in_string; |
310 | 1.62M | return; |
311 | | |
312 | 6.65M | case '<': |
313 | 6.65M | state = st_lt; |
314 | 6.65M | return; |
315 | | |
316 | 4.93M | case '>': |
317 | 4.93M | state = st_gt; |
318 | 4.93M | return; |
319 | | |
320 | 1.79M | case (')'): |
321 | 1.79M | type = tt::tt_bad; |
322 | 1.79M | QTC::TC("qpdf", "QPDFTokenizer bad )"); |
323 | 1.79M | error_message = "unexpected )"; |
324 | 1.79M | state = st_token_ready; |
325 | 1.79M | return; |
326 | | |
327 | 3.03M | case '[': |
328 | 3.03M | type = tt::tt_array_open; |
329 | 3.03M | state = st_token_ready; |
330 | 3.03M | return; |
331 | | |
332 | 3.49M | case ']': |
333 | 3.49M | type = tt::tt_array_close; |
334 | 3.49M | state = st_token_ready; |
335 | 3.49M | return; |
336 | | |
337 | 2.80M | case '{': |
338 | 2.80M | type = tt::tt_brace_open; |
339 | 2.80M | state = st_token_ready; |
340 | 2.80M | return; |
341 | | |
342 | 496k | case '}': |
343 | 496k | type = tt::tt_brace_close; |
344 | 496k | state = st_token_ready; |
345 | 496k | return; |
346 | | |
347 | 52.8M | case '/': |
348 | 52.8M | state = st_name; |
349 | 52.8M | val += ch; |
350 | 52.8M | return; |
351 | | |
352 | 15.8M | case '0': |
353 | 23.3M | case '1': |
354 | 26.8M | case '2': |
355 | 29.1M | case '3': |
356 | 31.1M | case '4': |
357 | 33.6M | case '5': |
358 | 35.9M | case '6': |
359 | 39.3M | case '7': |
360 | 40.9M | case '8': |
361 | 42.0M | case '9': |
362 | 42.0M | state = st_number; |
363 | 42.0M | return; |
364 | | |
365 | 288k | case '+': |
366 | 1.30M | case '-': |
367 | 1.30M | state = st_sign; |
368 | 1.30M | return; |
369 | | |
370 | 454k | case '.': |
371 | 454k | state = st_decimal; |
372 | 454k | return; |
373 | | |
374 | 73.9M | default: |
375 | 73.9M | state = st_literal; |
376 | 73.9M | return; |
377 | 195M | } |
378 | 195M | } |
379 | | |
380 | | void |
381 | | Tokenizer::inSpace(char ch) |
382 | 89.1M | { |
383 | | // We only enter this state if include_ignorable is true. |
384 | 89.1M | if (!isSpace(ch)) { |
385 | 50.6M | type = tt::tt_space; |
386 | 50.6M | in_token = false; |
387 | 50.6M | char_to_unread = ch; |
388 | 50.6M | state = st_token_ready; |
389 | 50.6M | } |
390 | 89.1M | } |
391 | | |
392 | | void |
393 | | Tokenizer::inComment(char ch) |
394 | 268M | { |
395 | 268M | if ((ch == '\r') || (ch == '\n')) { |
396 | 603k | if (include_ignorable) { |
397 | 137k | type = tt::tt_comment; |
398 | 137k | in_token = false; |
399 | 137k | char_to_unread = ch; |
400 | 137k | state = st_token_ready; |
401 | 466k | } else { |
402 | 466k | state = st_before_token; |
403 | 466k | } |
404 | 603k | } |
405 | 268M | } |
406 | | |
407 | | void |
408 | | Tokenizer::inString(char ch) |
409 | 1.77G | { |
410 | 1.77G | switch (ch) { |
411 | 8.25M | case '\\': |
412 | 8.25M | state = st_string_escape; |
413 | 8.25M | return; |
414 | | |
415 | 7.64M | case '(': |
416 | 7.64M | val += ch; |
417 | 7.64M | ++string_depth; |
418 | 7.64M | return; |
419 | | |
420 | 4.10M | case ')': |
421 | 4.10M | if (--string_depth == 0) { |
422 | 1.47M | type = tt::tt_string; |
423 | 1.47M | state = st_token_ready; |
424 | 1.47M | return; |
425 | 1.47M | } |
426 | | |
427 | 2.62M | val += ch; |
428 | 2.62M | return; |
429 | | |
430 | 7.25M | case '\r': |
431 | | // CR by itself is converted to LF |
432 | 7.25M | val += '\n'; |
433 | 7.25M | state = st_string_after_cr; |
434 | 7.25M | return; |
435 | | |
436 | 12.5M | case '\n': |
437 | 12.5M | val += ch; |
438 | 12.5M | return; |
439 | | |
440 | 1.73G | default: |
441 | 1.73G | val += ch; |
442 | 1.73G | return; |
443 | 1.77G | } |
444 | 1.77G | } |
445 | | |
446 | | void |
447 | | Tokenizer::inName(char ch) |
448 | 816M | { |
449 | 816M | if (isDelimiter(ch)) { |
450 | | // A C-locale whitespace character or delimiter terminates token. It is important to unread |
451 | | // the whitespace character even though it is ignored since it may be the newline after a |
452 | | // stream keyword. Removing it here could make the stream-reading code break on some files, |
453 | | // though not on any files in the test suite as of this |
454 | | // writing. |
455 | | |
456 | 52.3M | type = bad ? tt::tt_bad : tt::tt_name; |
457 | 52.3M | in_token = false; |
458 | 52.3M | char_to_unread = ch; |
459 | 52.3M | state = st_token_ready; |
460 | 763M | } else if (ch == '#') { |
461 | 1.49M | char_code = 0; |
462 | 1.49M | state = st_name_hex1; |
463 | 762M | } else { |
464 | 762M | val += ch; |
465 | 762M | } |
466 | 816M | } |
467 | | |
468 | | void |
469 | | Tokenizer::inNameHex1(char ch) |
470 | 1.49M | { |
471 | 1.49M | hex_char = ch; |
472 | | |
473 | 1.49M | if (char hval = util::hex_decode_char(ch); hval < '\20') { |
474 | 498k | char_code = int(hval) << 4; |
475 | 498k | state = st_name_hex2; |
476 | 995k | } else { |
477 | 995k | QTC::TC("qpdf", "QPDFTokenizer bad name 1"); |
478 | 995k | error_message = "name with stray # will not work with PDF >= 1.2"; |
479 | | // Use null to encode a bad # -- this is reversed in QPDF_Name::normalizeName. |
480 | 995k | val += '\0'; |
481 | 995k | state = st_name; |
482 | 995k | inName(ch); |
483 | 995k | } |
484 | 1.49M | } |
485 | | |
486 | | void |
487 | | Tokenizer::inNameHex2(char ch) |
488 | 498k | { |
489 | 498k | if (char hval = util::hex_decode_char(ch); hval < '\20') { |
490 | 365k | char_code |= int(hval); |
491 | 365k | } else { |
492 | 132k | QTC::TC("qpdf", "QPDFTokenizer bad name 2"); |
493 | 132k | error_message = "name with stray # will not work with PDF >= 1.2"; |
494 | | // Use null to encode a bad # -- this is reversed in QPDF_Name::normalizeName. |
495 | 132k | val += '\0'; |
496 | 132k | val += hex_char; |
497 | 132k | state = st_name; |
498 | 132k | inName(ch); |
499 | 132k | return; |
500 | 132k | } |
501 | 365k | if (char_code == 0) { |
502 | 138k | QTC::TC("qpdf", "QPDFTokenizer null in name"); |
503 | 138k | error_message = "null character not allowed in name token"; |
504 | 138k | val += "#00"; |
505 | 138k | state = st_name; |
506 | 138k | bad = true; |
507 | 227k | } else { |
508 | 227k | val += char(char_code); |
509 | 227k | state = st_name; |
510 | 227k | } |
511 | 365k | } |
512 | | |
513 | | void |
514 | | Tokenizer::inSign(char ch) |
515 | 1.25M | { |
516 | 1.25M | if (util::is_digit(ch)) { |
517 | 766k | state = st_number; |
518 | 766k | } else if (ch == '.') { |
519 | 10.7k | state = st_decimal; |
520 | 483k | } else { |
521 | 483k | state = st_literal; |
522 | 483k | inLiteral(ch); |
523 | 483k | } |
524 | 1.25M | } |
525 | | |
526 | | void |
527 | | Tokenizer::inDecimal(char ch) |
528 | 458k | { |
529 | 458k | if (util::is_digit(ch)) { |
530 | 111k | state = st_real; |
531 | 347k | } else { |
532 | 347k | state = st_literal; |
533 | 347k | inLiteral(ch); |
534 | 347k | } |
535 | 458k | } |
536 | | |
537 | | void |
538 | | Tokenizer::inNumber(char ch) |
539 | 91.3M | { |
540 | 91.3M | if (util::is_digit(ch)) { |
541 | 49.5M | } else if (ch == '.') { |
542 | 1.74M | state = st_real; |
543 | 40.0M | } else if (isDelimiter(ch)) { |
544 | 35.4M | type = tt::tt_integer; |
545 | 35.4M | state = st_token_ready; |
546 | 35.4M | in_token = false; |
547 | 35.4M | char_to_unread = ch; |
548 | 35.4M | } else { |
549 | 4.62M | state = st_literal; |
550 | 4.62M | } |
551 | 91.3M | } |
552 | | |
553 | | void |
554 | | Tokenizer::inReal(char ch) |
555 | 5.91M | { |
556 | 5.91M | if (util::is_digit(ch)) { |
557 | 4.05M | } else if (isDelimiter(ch)) { |
558 | 1.75M | type = tt::tt_real; |
559 | 1.75M | state = st_token_ready; |
560 | 1.75M | in_token = false; |
561 | 1.75M | char_to_unread = ch; |
562 | 1.75M | } else { |
563 | 99.7k | state = st_literal; |
564 | 99.7k | } |
565 | 5.91M | } |
566 | | void |
567 | | Tokenizer::inStringEscape(char ch) |
568 | 8.25M | { |
569 | 8.25M | state = st_in_string; |
570 | 8.25M | switch (ch) { |
571 | 101k | case '0': |
572 | 146k | case '1': |
573 | 299k | case '2': |
574 | 427k | case '3': |
575 | 448k | case '4': |
576 | 464k | case '5': |
577 | 487k | case '6': |
578 | 500k | case '7': |
579 | 500k | state = st_char_code; |
580 | 500k | char_code = 0; |
581 | 500k | digit_count = 0; |
582 | 500k | inCharCode(ch); |
583 | 500k | return; |
584 | | |
585 | 236k | case 'n': |
586 | 236k | val += '\n'; |
587 | 236k | return; |
588 | | |
589 | 819k | case 'r': |
590 | 819k | val += '\r'; |
591 | 819k | return; |
592 | | |
593 | 495k | case 't': |
594 | 495k | val += '\t'; |
595 | 495k | return; |
596 | | |
597 | 171k | case 'b': |
598 | 171k | val += '\b'; |
599 | 171k | return; |
600 | | |
601 | 941k | case 'f': |
602 | 941k | val += '\f'; |
603 | 941k | return; |
604 | | |
605 | 8.91k | case '\n': |
606 | 8.91k | return; |
607 | | |
608 | 4.89k | case '\r': |
609 | 4.89k | state = st_string_after_cr; |
610 | 4.89k | return; |
611 | | |
612 | 5.07M | default: |
613 | | // PDF spec says backslash is ignored before anything else |
614 | 5.07M | val += ch; |
615 | 5.07M | return; |
616 | 8.25M | } |
617 | 8.25M | } |
618 | | |
619 | | void |
620 | | Tokenizer::inStringAfterCR(char ch) |
621 | 7.25M | { |
622 | 7.25M | state = st_in_string; |
623 | 7.25M | if (ch != '\n') { |
624 | 6.78M | inString(ch); |
625 | 6.78M | } |
626 | 7.25M | } |
627 | | |
628 | | void |
629 | | Tokenizer::inLt(char ch) |
630 | 6.64M | { |
631 | 6.64M | if (ch == '<') { |
632 | 4.67M | type = tt::tt_dict_open; |
633 | 4.67M | state = st_token_ready; |
634 | 4.67M | return; |
635 | 4.67M | } |
636 | | |
637 | 1.96M | state = st_in_hexstring; |
638 | 1.96M | inHexstring(ch); |
639 | 1.96M | } |
640 | | |
641 | | void |
642 | | Tokenizer::inGt(char ch) |
643 | 4.92M | { |
644 | 4.92M | if (ch == '>') { |
645 | 3.40M | type = tt::tt_dict_close; |
646 | 3.40M | state = st_token_ready; |
647 | 3.40M | } else { |
648 | 1.51M | type = tt::tt_bad; |
649 | 1.51M | QTC::TC("qpdf", "QPDFTokenizer bad >"); |
650 | 1.51M | error_message = "unexpected >"; |
651 | 1.51M | in_token = false; |
652 | 1.51M | char_to_unread = ch; |
653 | 1.51M | state = st_token_ready; |
654 | 1.51M | } |
655 | 4.92M | } |
656 | | |
657 | | void |
658 | | Tokenizer::inLiteral(char ch) |
659 | 703M | { |
660 | 703M | if (isDelimiter(ch)) { |
661 | | // A C-locale whitespace character or delimiter terminates token. It is important to unread |
662 | | // the whitespace character even though it is ignored since it may be the newline after a |
663 | | // stream keyword. Removing it here could make the stream-reading code break on some files, |
664 | | // though not on any files in the test suite as of this writing. |
665 | | |
666 | 76.4M | in_token = false; |
667 | 76.4M | char_to_unread = ch; |
668 | 76.4M | state = st_token_ready; |
669 | 76.4M | type = (raw_val == "true") || (raw_val == "false") |
670 | 76.4M | ? tt::tt_bool |
671 | 76.4M | : (raw_val == "null" ? tt::tt_null : tt::tt_word); |
672 | 76.4M | } |
673 | 703M | } |
674 | | |
675 | | void |
676 | | Tokenizer::inHexstring(char ch) |
677 | 9.72M | { |
678 | 9.72M | if (char hval = util::hex_decode_char(ch); hval < '\20') { |
679 | 7.55M | char_code = int(hval) << 4; |
680 | 7.55M | state = st_in_hexstring_2nd; |
681 | | |
682 | 7.55M | } else if (ch == '>') { |
683 | 975k | type = tt::tt_string; |
684 | 975k | state = st_token_ready; |
685 | | |
686 | 1.19M | } else if (isSpace(ch)) { |
687 | | // ignore |
688 | | |
689 | 749k | } else { |
690 | 749k | type = tt::tt_bad; |
691 | 749k | QTC::TC("qpdf", "QPDFTokenizer bad hexstring character"); |
692 | 749k | error_message = std::string("invalid character (") + ch + ") in hexstring"; |
693 | 749k | state = st_token_ready; |
694 | 749k | } |
695 | 9.72M | } |
696 | | |
697 | | void |
698 | | Tokenizer::inHexstring2nd(char ch) |
699 | 7.68M | { |
700 | 7.68M | if (char hval = util::hex_decode_char(ch); hval < '\20') { |
701 | 7.32M | val += char(char_code) | hval; |
702 | 7.32M | state = st_in_hexstring; |
703 | | |
704 | 7.32M | } else if (ch == '>') { |
705 | | // PDF spec says odd hexstrings have implicit trailing 0. |
706 | 65.0k | val += char(char_code); |
707 | 65.0k | type = tt::tt_string; |
708 | 65.0k | state = st_token_ready; |
709 | | |
710 | 292k | } else if (isSpace(ch)) { |
711 | | // ignore |
712 | | |
713 | 157k | } else { |
714 | 134k | type = tt::tt_bad; |
715 | 134k | QTC::TC("qpdf", "QPDFTokenizer bad hexstring 2nd character"); |
716 | 134k | error_message = std::string("invalid character (") + ch + ") in hexstring"; |
717 | 134k | state = st_token_ready; |
718 | 134k | } |
719 | 7.68M | } |
720 | | |
721 | | void |
722 | | Tokenizer::inCharCode(char ch) |
723 | 1.33M | { |
724 | 1.33M | bool handled = false; |
725 | 1.33M | if (('0' <= ch) && (ch <= '7')) { |
726 | 1.16M | char_code = 8 * char_code + (int(ch) - int('0')); |
727 | 1.16M | if (++(digit_count) < 3) { |
728 | 848k | return; |
729 | 848k | } |
730 | 318k | handled = true; |
731 | 318k | } |
732 | | // We've accumulated \ddd or we have \d or \dd followed by other than an octal digit. The PDF |
733 | | // Spec says to ignore high-order overflow. |
734 | 483k | val += char(char_code % 256); |
735 | 483k | state = st_in_string; |
736 | 483k | if (!handled) { |
737 | 164k | inString(ch); |
738 | 164k | } |
739 | 483k | } |
740 | | |
741 | | void |
742 | | Tokenizer::inInlineImage(char ch) |
743 | 164M | { |
744 | 164M | if ((raw_val.length() + 1) == inline_image_bytes) { |
745 | 8.22k | QTC::TC("qpdf", "QPDFTokenizer found EI by byte count"); |
746 | 8.22k | type = tt::tt_inline_image; |
747 | 8.22k | inline_image_bytes = 0; |
748 | 8.22k | state = st_token_ready; |
749 | 8.22k | } |
750 | 164M | } |
751 | | |
752 | | void |
753 | | QPDFTokenizer::presentEOF() |
754 | 0 | { |
755 | 0 | m->presentEOF(); |
756 | 0 | } |
757 | | |
758 | | void |
759 | | Tokenizer::presentEOF() |
760 | 724k | { |
761 | 724k | switch (state) { |
762 | 196k | case st_name: |
763 | 197k | case st_name_hex1: |
764 | 198k | case st_name_hex2: |
765 | 270k | case st_number: |
766 | 273k | case st_real: |
767 | 275k | case st_sign: |
768 | 277k | case st_decimal: |
769 | 382k | case st_literal: |
770 | 382k | QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token"); |
771 | | // Push any delimiter to the state machine to finish off the final token. |
772 | 382k | presentCharacter('\f'); |
773 | 382k | in_token = true; |
774 | 382k | break; |
775 | | |
776 | 0 | case st_top: |
777 | 275k | case st_before_token: |
778 | 275k | type = tt::tt_eof; |
779 | 275k | break; |
780 | | |
781 | 2.80k | case st_in_space: |
782 | 2.80k | type = include_ignorable ? tt::tt_space : tt::tt_eof; |
783 | 2.80k | break; |
784 | | |
785 | 15.2k | case st_in_comment: |
786 | 15.2k | type = include_ignorable ? tt::tt_comment : tt::tt_bad; |
787 | 15.2k | break; |
788 | | |
789 | 0 | case st_token_ready: |
790 | 0 | break; |
791 | | |
792 | 47.9k | default: |
793 | 47.9k | QTC::TC("qpdf", "QPDFTokenizer EOF reading token"); |
794 | 47.9k | type = tt::tt_bad; |
795 | 47.9k | error_message = "EOF while reading token"; |
796 | 724k | } |
797 | 724k | state = st_token_ready; |
798 | 724k | } |
799 | | |
800 | | void |
801 | | QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input) |
802 | 0 | { |
803 | 0 | m->expectInlineImage(*input); |
804 | 0 | } |
805 | | |
806 | | void |
807 | | QPDFTokenizer::expectInlineImage(InputSource& input) |
808 | 3.61k | { |
809 | 3.61k | m->expectInlineImage(input); |
810 | 3.61k | } |
811 | | |
812 | | void |
813 | | Tokenizer::expectInlineImage(InputSource& input) |
814 | 8.54k | { |
815 | 8.54k | if (state == st_token_ready) { |
816 | 4.92k | reset(); |
817 | 4.92k | } else if (state != st_before_token) { |
818 | 0 | throw std::logic_error( |
819 | 0 | "QPDFTokenizer::expectInlineImage called when tokenizer is in improper state"); |
820 | 0 | } |
821 | 8.54k | findEI(input); |
822 | 8.54k | before_token = false; |
823 | 8.54k | in_token = true; |
824 | 8.54k | state = st_inline_image; |
825 | 8.54k | } |
826 | | |
827 | | void |
828 | | Tokenizer::findEI(InputSource& input) |
829 | 8.54k | { |
830 | 8.54k | qpdf_offset_t last_offset = input.getLastOffset(); |
831 | 8.54k | qpdf_offset_t pos = input.tell(); |
832 | | |
833 | | // Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several |
834 | | // tokens or up to EOF. If we find any suspicious-looking or tokens, this is probably still part |
835 | | // of the image data, so keep looking for EI. Stop at the first EI that passes. If we get to the |
836 | | // end without finding one, return the last EI we found. Store the number of bytes expected in |
837 | | // the inline image including the EI and use that to break out of inline image, falling back to |
838 | | // the old method if needed. |
839 | | |
840 | 8.54k | bool okay = false; |
841 | 8.54k | bool first_try = true; |
842 | 80.8k | while (!okay) { |
843 | 75.9k | QPDFWordTokenFinder f(input, "EI"); |
844 | 75.9k | if (!input.findFirst("EI", input.tell(), 0, f)) { |
845 | 3.63k | break; |
846 | 3.63k | } |
847 | 72.3k | inline_image_bytes = QIntC::to_size(input.tell() - pos - 2); |
848 | | |
849 | 72.3k | Tokenizer check; |
850 | 72.3k | bool found_bad = false; |
851 | | // Look at the next 10 tokens or up to EOF. The next inline image's image data would look |
852 | | // like bad tokens, but there will always be at least 10 tokens between one inline image's |
853 | | // EI and the next valid one's ID since width, height, bits per pixel, and color space are |
854 | | // all required as well as a BI and ID. If we get 10 good tokens in a row or hit EOF, we can |
855 | | // be pretty sure we've found the actual EI. |
856 | 210k | for (int i = 0; i < 10; ++i) { |
857 | 205k | check.nextToken(input, "checker"); |
858 | 205k | auto typ = check.getType(); |
859 | 205k | if (typ == tt::tt_eof) { |
860 | 0 | okay = true; |
861 | 205k | } else if (typ == tt::tt_bad) { |
862 | 10.4k | found_bad = true; |
863 | 194k | } else if (typ == tt::tt_word) { |
864 | | // The qpdf tokenizer lumps alphabetic and otherwise uncategorized characters into |
865 | | // "words". We recognize strings of alphabetic characters as potential valid |
866 | | // operators for purposes of telling whether we're in valid content or not. It's not |
867 | | // perfect, but it should work more reliably than what we used to do, which was |
868 | | // already good enough for the vast majority of files. |
869 | 120k | bool found_alpha = false; |
870 | 120k | bool found_non_printable = false; |
871 | 120k | bool found_other = false; |
872 | 381k | for (char ch: check.getValue()) { |
873 | 381k | if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch == '*')) { |
874 | | // Treat '*' as alpha since there are valid PDF operators that contain * |
875 | | // along with alphabetic characters. |
876 | 171k | found_alpha = true; |
877 | 209k | } else if (static_cast<signed char>(ch) < 32 && !isSpace(ch)) { |
878 | | // Compare ch as a signed char so characters outside of 7-bit will be < 0. |
879 | 26.2k | found_non_printable = true; |
880 | 26.2k | break; |
881 | 183k | } else { |
882 | 183k | found_other = true; |
883 | 183k | } |
884 | 381k | } |
885 | 120k | if (found_non_printable || (found_alpha && found_other)) { |
886 | 56.9k | found_bad = true; |
887 | 56.9k | } |
888 | 120k | } |
889 | 205k | if (okay || found_bad) { |
890 | 67.4k | break; |
891 | 67.4k | } |
892 | 205k | } |
893 | 72.3k | if (!found_bad) { |
894 | 4.90k | okay = true; |
895 | 4.90k | } |
896 | 72.3k | if (!okay) { |
897 | 67.4k | first_try = false; |
898 | 67.4k | } |
899 | 72.3k | } |
900 | 8.54k | if (okay && (!first_try)) { |
901 | 807 | QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try"); |
902 | 807 | } |
903 | | |
904 | 8.54k | input.seek(pos, SEEK_SET); |
905 | 8.54k | input.setLastOffset(last_offset); |
906 | 8.54k | } |
907 | | |
908 | | bool |
909 | | QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch) |
910 | 0 | { |
911 | 0 | return m->getToken(token, unread_char, ch); |
912 | 0 | } |
913 | | |
914 | | bool |
915 | | Tokenizer::getToken(Token& token, bool& unread_char, char& ch) |
916 | 154M | { |
917 | 154M | bool ready = (state == st_token_ready); |
918 | 154M | unread_char = !in_token && !before_token; |
919 | 154M | ch = char_to_unread; |
920 | 154M | if (ready) { |
921 | 154M | token = (!(type == tt::tt_name || type == tt::tt_string)) |
922 | 154M | ? Token(type, raw_val, raw_val, error_message) |
923 | 154M | : Token(type, val, raw_val, error_message); |
924 | | |
925 | 154M | reset(); |
926 | 154M | } |
927 | 154M | return ready; |
928 | 154M | } |
929 | | |
930 | | bool |
931 | | QPDFTokenizer::betweenTokens() |
932 | 0 | { |
933 | 0 | return m->betweenTokens(); |
934 | 0 | } |
935 | | |
936 | | bool |
937 | | Tokenizer::betweenTokens() |
938 | 0 | { |
939 | 0 | return before_token; |
940 | 0 | } |
941 | | |
942 | | QPDFTokenizer::Token |
943 | | QPDFTokenizer::readToken( |
944 | | InputSource& input, std::string const& context, bool allow_bad, size_t max_len) |
945 | 113M | { |
946 | 113M | return m->readToken(input, context, allow_bad, max_len); |
947 | 113M | } |
948 | | |
949 | | QPDFTokenizer::Token |
950 | | QPDFTokenizer::readToken( |
951 | | std::shared_ptr<InputSource> input, std::string const& context, bool allow_bad, size_t max_len) |
952 | 0 | { |
953 | 0 | return m->readToken(*input, context, allow_bad, max_len); |
954 | 0 | } |
955 | | |
956 | | QPDFTokenizer::Token |
957 | | Tokenizer::readToken(InputSource& input, std::string const& context, bool allow_bad, size_t max_len) |
958 | 154M | { |
959 | 154M | nextToken(input, context, max_len); |
960 | | |
961 | 154M | Token token; |
962 | 154M | bool unread_char; |
963 | 154M | char char_to_unread; |
964 | 154M | getToken(token, unread_char, char_to_unread); |
965 | | |
966 | 154M | if (token.getType() == tt::tt_bad) { |
967 | 6.67M | if (allow_bad) { |
968 | 6.67M | QTC::TC("qpdf", "QPDFTokenizer allowing bad token"); |
969 | 6.67M | } else { |
970 | 0 | throw QPDFExc( |
971 | 0 | qpdf_e_damaged_pdf, |
972 | 0 | input.getName(), |
973 | 0 | context.empty() ? "offset " + std::to_string(input.getLastOffset()) : context, |
974 | 0 | input.getLastOffset(), |
975 | 0 | token.getErrorMessage()); |
976 | 0 | } |
977 | 6.67M | } |
978 | 154M | return token; |
979 | 154M | } |
980 | | |
981 | | bool |
982 | | Tokenizer::nextToken(InputSource& input, std::string const& context, size_t max_len) |
983 | 246M | { |
984 | 246M | if (state != st_inline_image) { |
985 | 246M | reset(); |
986 | 246M | } |
987 | 246M | qpdf_offset_t offset = input.fastTell(); |
988 | | |
989 | 4.58G | while (state != st_token_ready) { |
990 | 4.33G | char ch; |
991 | 4.33G | if (!input.fastRead(ch)) { |
992 | 724k | presentEOF(); |
993 | | |
994 | 724k | if ((type == tt::tt_eof) && (!allow_eof)) { |
995 | | // Nothing in the qpdf library calls readToken without allowEOF anymore, so this |
996 | | // case is not exercised. |
997 | 1.04k | type = tt::tt_bad; |
998 | 1.04k | error_message = "unexpected EOF"; |
999 | 1.04k | offset = input.getLastOffset(); |
1000 | 1.04k | } |
1001 | 4.33G | } else { |
1002 | 4.33G | handleCharacter(ch); |
1003 | 4.33G | if (before_token) { |
1004 | 182M | ++offset; |
1005 | 182M | } |
1006 | 4.33G | if (in_token) { |
1007 | 3.93G | raw_val += ch; |
1008 | 3.93G | } |
1009 | 4.33G | if (max_len && (raw_val.length() >= max_len) && (state != st_token_ready)) { |
1010 | | // terminate this token now |
1011 | 4.84M | QTC::TC("qpdf", "QPDFTokenizer block long token"); |
1012 | 4.84M | type = tt::tt_bad; |
1013 | 4.84M | state = st_token_ready; |
1014 | 4.84M | error_message = "exceeded allowable length while reading token"; |
1015 | 4.84M | } |
1016 | 4.33G | } |
1017 | 4.33G | } |
1018 | | |
1019 | 246M | input.fastUnread(!in_token && !before_token); |
1020 | | |
1021 | 246M | if (type != tt::tt_eof) { |
1022 | 246M | input.setLastOffset(offset); |
1023 | 246M | } |
1024 | | |
1025 | 246M | return error_message.empty(); |
1026 | 246M | } |