/src/qpdf/libqpdf/QPDFTokenizer.cc
Line | Count | Source (jump to first uncovered line) |
1 | | #include <qpdf/QPDFTokenizer.hh> |
2 | | |
3 | | // DO NOT USE ctype -- it is locale dependent for some things, and it's not worth the risk of |
4 | | // including it in case it may accidentally be used. |
5 | | |
6 | | #include <qpdf/QIntC.hh> |
7 | | #include <qpdf/QPDFExc.hh> |
8 | | #include <qpdf/QPDFObjectHandle.hh> |
9 | | #include <qpdf/QTC.hh> |
10 | | #include <qpdf/QUtil.hh> |
11 | | |
12 | | #include <cstdlib> |
13 | | #include <cstring> |
14 | | #include <stdexcept> |
15 | | |
// Return true when ch terminates a token: one of the delimiter characters / ( ) { } < > [ ] %,
// one of the whitespace characters (space, tab, LF, VT, FF, CR), or NUL. Deliberately avoids
// ctype, which is locale dependent.
static inline bool
is_delimiter(char ch)
{
    switch (ch) {
    case '\0':
    case '\t':
    case '\n':
    case '\v':
    case '\f':
    case '\r':
    case ' ':
    case '%':
    case '(':
    case ')':
    case '/':
    case '<':
    case '>':
    case '[':
    case ']':
    case '{':
    case '}':
        return true;

    default:
        return false;
    }
}
24 | | |
25 | | namespace |
26 | | { |
27 | | class QPDFWordTokenFinder: public InputSource::Finder |
28 | | { |
29 | | public: |
30 | | QPDFWordTokenFinder(InputSource& is, std::string const& str) : |
31 | | is(is), |
32 | | str(str) |
33 | 0 | { |
34 | 0 | } |
35 | 0 | ~QPDFWordTokenFinder() override = default; |
36 | | bool check() override; |
37 | | |
38 | | private: |
39 | | InputSource& is; |
40 | | std::string str; |
41 | | }; |
42 | | } // namespace |
43 | | |
44 | | bool |
45 | | QPDFWordTokenFinder::check() |
46 | 0 | { |
47 | | // Find a word token matching the given string, preceded by a delimiter, and followed by a |
48 | | // delimiter or EOF. |
49 | 0 | QPDFTokenizer tokenizer; |
50 | 0 | QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true); |
51 | 0 | qpdf_offset_t pos = is.tell(); |
52 | 0 | if (!(t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str))) { |
53 | 0 | QTC::TC("qpdf", "QPDFTokenizer finder found wrong word"); |
54 | 0 | return false; |
55 | 0 | } |
56 | 0 | qpdf_offset_t token_start = is.getLastOffset(); |
57 | 0 | char next; |
58 | 0 | bool next_okay = false; |
59 | 0 | if (is.read(&next, 1) == 0) { |
60 | 0 | QTC::TC("qpdf", "QPDFTokenizer inline image at EOF"); |
61 | 0 | next_okay = true; |
62 | 0 | } else { |
63 | 0 | next_okay = is_delimiter(next); |
64 | 0 | } |
65 | 0 | is.seek(pos, SEEK_SET); |
66 | 0 | if (!next_okay) { |
67 | 0 | return false; |
68 | 0 | } |
69 | 0 | if (token_start == 0) { |
70 | | // Can't actually happen...we never start the search at the beginning of the input. |
71 | 0 | return false; |
72 | 0 | } |
73 | 0 | return true; |
74 | 0 | } |
75 | | |
76 | | void |
77 | | QPDFTokenizer::reset() |
78 | 17.8M | { |
79 | 17.8M | state = st_before_token; |
80 | 17.8M | type = tt_bad; |
81 | 17.8M | val.clear(); |
82 | 17.8M | raw_val.clear(); |
83 | 17.8M | error_message = ""; |
84 | 17.8M | before_token = true; |
85 | 17.8M | in_token = false; |
86 | 17.8M | char_to_unread = '\0'; |
87 | 17.8M | inline_image_bytes = 0; |
88 | 17.8M | string_depth = 0; |
89 | 17.8M | bad = false; |
90 | 17.8M | } |
91 | | |
92 | | QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) : |
93 | | type(type), |
94 | | value(value), |
95 | | raw_value(value) |
96 | 0 | { |
97 | 0 | if (type == tt_string) { |
98 | 0 | raw_value = QPDFObjectHandle::newString(value).unparse(); |
99 | 0 | } else if (type == tt_name) { |
100 | 0 | raw_value = QPDFObjectHandle::newName(value).unparse(); |
101 | 0 | } |
102 | 0 | } |
103 | | |
104 | | QPDFTokenizer::QPDFTokenizer() : |
105 | | allow_eof(false), |
106 | | include_ignorable(false) |
107 | 15.2k | { |
108 | 15.2k | reset(); |
109 | 15.2k | } |
110 | | |
111 | | void |
112 | | QPDFTokenizer::allowEOF() |
113 | 6.27k | { |
114 | 6.27k | this->allow_eof = true; |
115 | 6.27k | } |
116 | | |
117 | | void |
118 | | QPDFTokenizer::includeIgnorable() |
119 | 0 | { |
120 | 0 | this->include_ignorable = true; |
121 | 0 | } |
122 | | |
123 | | bool |
124 | | QPDFTokenizer::isSpace(char ch) |
125 | 49.8M | { |
126 | 49.8M | return ((ch == '\0') || QUtil::is_space(ch)); |
127 | 49.8M | } |
128 | | |
129 | | bool |
130 | | QPDFTokenizer::isDelimiter(char ch) |
131 | 105M | { |
132 | 105M | return is_delimiter(ch); |
133 | 105M | } |
134 | | |
135 | | void |
136 | | QPDFTokenizer::presentCharacter(char ch) |
137 | 7.02k | { |
138 | 7.02k | handleCharacter(ch); |
139 | | |
140 | 7.02k | if (this->in_token) { |
141 | 0 | this->raw_val += ch; |
142 | 0 | } |
143 | 7.02k | } |
144 | | |
145 | | void |
146 | | QPDFTokenizer::handleCharacter(char ch) |
147 | 238M | { |
148 | | // In some cases, functions called below may call a second handler. This happens whenever you |
149 | | // have to use a character from the next token to detect the end of the current token. |
150 | | |
151 | 238M | switch (this->state) { |
152 | 0 | case st_top: |
153 | 0 | inTop(ch); |
154 | 0 | return; |
155 | | |
156 | 0 | case st_in_space: |
157 | 0 | inSpace(ch); |
158 | 0 | return; |
159 | | |
160 | 2.13M | case st_in_comment: |
161 | 2.13M | inComment(ch); |
162 | 2.13M | return; |
163 | | |
164 | 704k | case st_lt: |
165 | 704k | inLt(ch); |
166 | 704k | return; |
167 | | |
168 | 459k | case st_gt: |
169 | 459k | inGt(ch); |
170 | 459k | return; |
171 | | |
172 | 49.5M | case st_in_string: |
173 | 49.5M | inString(ch); |
174 | 49.5M | return; |
175 | | |
176 | 35.8M | case st_name: |
177 | 35.8M | inName(ch); |
178 | 35.8M | return; |
179 | | |
180 | 31.0M | case st_number: |
181 | 31.0M | inNumber(ch); |
182 | 31.0M | return; |
183 | | |
184 | 771k | case st_real: |
185 | 771k | inReal(ch); |
186 | 771k | return; |
187 | | |
188 | 251k | case st_string_after_cr: |
189 | 251k | inStringAfterCR(ch); |
190 | 251k | return; |
191 | | |
192 | 104k | case st_string_escape: |
193 | 104k | inStringEscape(ch); |
194 | 104k | return; |
195 | | |
196 | 8.22k | case st_char_code: |
197 | 8.22k | inCharCode(ch); |
198 | 8.22k | return; |
199 | | |
200 | 66.1M | case st_literal: |
201 | 66.1M | inLiteral(ch); |
202 | 66.1M | return; |
203 | | |
204 | 0 | case st_inline_image: |
205 | 0 | inInlineImage(ch); |
206 | 0 | return; |
207 | | |
208 | 1.21M | case st_in_hexstring: |
209 | 1.21M | inHexstring(ch); |
210 | 1.21M | return; |
211 | | |
212 | 1.07M | case st_in_hexstring_2nd: |
213 | 1.07M | inHexstring2nd(ch); |
214 | 1.07M | return; |
215 | | |
216 | 8.17k | case st_name_hex1: |
217 | 8.17k | inNameHex1(ch); |
218 | 8.17k | return; |
219 | | |
220 | 782 | case st_name_hex2: |
221 | 782 | inNameHex2(ch); |
222 | 782 | return; |
223 | | |
224 | 17.2k | case st_sign: |
225 | 17.2k | inSign(ch); |
226 | 17.2k | return; |
227 | | |
228 | 5.59k | case st_decimal: |
229 | 5.59k | inDecimal(ch); |
230 | 5.59k | return; |
231 | | |
232 | 49.5M | case (st_before_token): |
233 | 49.5M | inBeforeToken(ch); |
234 | 49.5M | return; |
235 | | |
236 | 0 | case (st_token_ready): |
237 | 0 | inTokenReady(ch); |
238 | 0 | return; |
239 | | |
240 | 0 | default: |
241 | 0 | throw std::logic_error("INTERNAL ERROR: invalid state while reading token"); |
242 | 238M | } |
243 | 238M | } |
244 | | |
245 | | void |
246 | | QPDFTokenizer::inTokenReady(char ch) |
247 | 0 | { |
248 | 0 | throw std::logic_error( |
249 | 0 | "INTERNAL ERROR: QPDF tokenizer presented character while token is waiting"); |
250 | 0 | } |
251 | | |
252 | | void |
253 | | QPDFTokenizer::inBeforeToken(char ch) |
254 | 49.5M | { |
255 | | // Note: we specifically do not use ctype here. It is locale-dependent. |
256 | 49.5M | if (isSpace(ch)) { |
257 | 37.6M | this->before_token = !this->include_ignorable; |
258 | 37.6M | this->in_token = this->include_ignorable; |
259 | 37.6M | if (this->include_ignorable) { |
260 | 0 | this->state = st_in_space; |
261 | 0 | } |
262 | 37.6M | } else if (ch == '%') { |
263 | 83.1k | this->before_token = !this->include_ignorable; |
264 | 83.1k | this->in_token = this->include_ignorable; |
265 | 83.1k | this->state = st_in_comment; |
266 | 11.8M | } else { |
267 | 11.8M | this->before_token = false; |
268 | 11.8M | this->in_token = true; |
269 | 11.8M | inTop(ch); |
270 | 11.8M | } |
271 | 49.5M | } |
272 | | |
273 | | void |
274 | | QPDFTokenizer::inTop(char ch) |
275 | 11.8M | { |
276 | 11.8M | switch (ch) { |
277 | 42.6k | case '(': |
278 | 42.6k | this->string_depth = 1; |
279 | 42.6k | this->state = st_in_string; |
280 | 42.6k | return; |
281 | | |
282 | 704k | case '<': |
283 | 704k | this->state = st_lt; |
284 | 704k | return; |
285 | | |
286 | 459k | case '>': |
287 | 459k | this->state = st_gt; |
288 | 459k | return; |
289 | | |
290 | 5.61k | case (')'): |
291 | 5.61k | this->type = tt_bad; |
292 | 5.61k | QTC::TC("qpdf", "QPDFTokenizer bad )"); |
293 | 5.61k | this->error_message = "unexpected )"; |
294 | 5.61k | this->state = st_token_ready; |
295 | 5.61k | return; |
296 | | |
297 | 213k | case '[': |
298 | 213k | this->type = tt_array_open; |
299 | 213k | this->state = st_token_ready; |
300 | 213k | return; |
301 | | |
302 | 232k | case ']': |
303 | 232k | this->type = tt_array_close; |
304 | 232k | this->state = st_token_ready; |
305 | 232k | return; |
306 | | |
307 | 2.46k | case '{': |
308 | 2.46k | this->type = tt_brace_open; |
309 | 2.46k | this->state = st_token_ready; |
310 | 2.46k | return; |
311 | | |
312 | 2.70k | case '}': |
313 | 2.70k | this->type = tt_brace_close; |
314 | 2.70k | this->state = st_token_ready; |
315 | 2.70k | return; |
316 | | |
317 | 2.64M | case '/': |
318 | 2.64M | this->state = st_name; |
319 | 2.64M | this->val += ch; |
320 | 2.64M | return; |
321 | | |
322 | 1.57M | case '0': |
323 | 2.28M | case '1': |
324 | 2.58M | case '2': |
325 | 2.79M | case '3': |
326 | 2.95M | case '4': |
327 | 3.26M | case '5': |
328 | 3.57M | case '6': |
329 | 3.81M | case '7': |
330 | 3.99M | case '8': |
331 | 4.10M | case '9': |
332 | 4.10M | this->state = st_number; |
333 | 4.10M | return; |
334 | | |
335 | 2.01k | case '+': |
336 | 17.2k | case '-': |
337 | 17.2k | this->state = st_sign; |
338 | 17.2k | return; |
339 | | |
340 | 5.53k | case '.': |
341 | 5.53k | this->state = st_decimal; |
342 | 5.53k | return; |
343 | | |
344 | 3.40M | default: |
345 | 3.40M | this->state = st_literal; |
346 | 3.40M | return; |
347 | 11.8M | } |
348 | 11.8M | } |
349 | | |
350 | | void |
351 | | QPDFTokenizer::inSpace(char ch) |
352 | 0 | { |
353 | | // We only enter this state if include_ignorable is true. |
354 | 0 | if (!isSpace(ch)) { |
355 | 0 | this->type = tt_space; |
356 | 0 | this->in_token = false; |
357 | 0 | this->char_to_unread = ch; |
358 | 0 | this->state = st_token_ready; |
359 | 0 | } |
360 | 0 | } |
361 | | |
362 | | void |
363 | | QPDFTokenizer::inComment(char ch) |
364 | 2.13M | { |
365 | 2.13M | if ((ch == '\r') || (ch == '\n')) { |
366 | 82.4k | if (this->include_ignorable) { |
367 | 0 | this->type = tt_comment; |
368 | 0 | this->in_token = false; |
369 | 0 | this->char_to_unread = ch; |
370 | 0 | this->state = st_token_ready; |
371 | 82.4k | } else { |
372 | 82.4k | this->state = st_before_token; |
373 | 82.4k | } |
374 | 82.4k | } |
375 | 2.13M | } |
376 | | |
377 | | void |
378 | | QPDFTokenizer::inString(char ch) |
379 | 49.8M | { |
380 | 49.8M | switch (ch) { |
381 | 104k | case '\\': |
382 | 104k | this->state = st_string_escape; |
383 | 104k | return; |
384 | | |
385 | 133k | case '(': |
386 | 133k | this->val += ch; |
387 | 133k | ++this->string_depth; |
388 | 133k | return; |
389 | | |
390 | 139k | case ')': |
391 | 139k | if (--this->string_depth == 0) { |
392 | 35.4k | this->type = tt_string; |
393 | 35.4k | this->state = st_token_ready; |
394 | 35.4k | return; |
395 | 35.4k | } |
396 | | |
397 | 103k | this->val += ch; |
398 | 103k | return; |
399 | | |
400 | 249k | case '\r': |
401 | | // CR by itself is converted to LF |
402 | 249k | this->val += '\n'; |
403 | 249k | this->state = st_string_after_cr; |
404 | 249k | return; |
405 | | |
406 | 557k | case '\n': |
407 | 557k | this->val += ch; |
408 | 557k | return; |
409 | | |
410 | 48.6M | default: |
411 | 48.6M | this->val += ch; |
412 | 48.6M | return; |
413 | 49.8M | } |
414 | 49.8M | } |
415 | | |
416 | | void |
417 | | QPDFTokenizer::inName(char ch) |
418 | 35.8M | { |
419 | 35.8M | if (isDelimiter(ch)) { |
420 | | // A C-locale whitespace character or delimiter terminates token. It is important to unread |
421 | | // the whitespace character even though it is ignored since it may be the newline after a |
422 | | // stream keyword. Removing it here could make the stream-reading code break on some files, |
423 | | // though not on any files in the test suite as of this |
424 | | // writing. |
425 | | |
426 | 2.55M | this->type = this->bad ? tt_bad : tt_name; |
427 | 2.55M | this->in_token = false; |
428 | 2.55M | this->char_to_unread = ch; |
429 | 2.55M | this->state = st_token_ready; |
430 | 33.3M | } else if (ch == '#') { |
431 | 8.21k | this->char_code = 0; |
432 | 8.21k | this->state = st_name_hex1; |
433 | 33.3M | } else { |
434 | 33.3M | this->val += ch; |
435 | 33.3M | } |
436 | 35.8M | } |
437 | | |
438 | | void |
439 | | QPDFTokenizer::inNameHex1(char ch) |
440 | 8.17k | { |
441 | 8.17k | this->hex_char = ch; |
442 | | |
443 | 8.17k | if (char hval = QUtil::hex_decode_char(ch); hval < '\20') { |
444 | 782 | this->char_code = int(hval) << 4; |
445 | 782 | this->state = st_name_hex2; |
446 | 7.39k | } else { |
447 | 7.39k | QTC::TC("qpdf", "QPDFTokenizer bad name 1"); |
448 | 7.39k | this->error_message = "name with stray # will not work with PDF >= 1.2"; |
449 | | // Use null to encode a bad # -- this is reversed in QPDF_Name::normalizeName. |
450 | 7.39k | this->val += '\0'; |
451 | 7.39k | this->state = st_name; |
452 | 7.39k | inName(ch); |
453 | 7.39k | } |
454 | 8.17k | } |
455 | | |
456 | | void |
457 | | QPDFTokenizer::inNameHex2(char ch) |
458 | 782 | { |
459 | 782 | if (char hval = QUtil::hex_decode_char(ch); hval < '\20') { |
460 | 529 | this->char_code |= int(hval); |
461 | 529 | } else { |
462 | 253 | QTC::TC("qpdf", "QPDFTokenizer bad name 2"); |
463 | 253 | this->error_message = "name with stray # will not work with PDF >= 1.2"; |
464 | | // Use null to encode a bad # -- this is reversed in QPDF_Name::normalizeName. |
465 | 253 | this->val += '\0'; |
466 | 253 | this->val += this->hex_char; |
467 | 253 | this->state = st_name; |
468 | 253 | inName(ch); |
469 | 253 | return; |
470 | 253 | } |
471 | 529 | if (this->char_code == 0) { |
472 | 90 | QTC::TC("qpdf", "QPDFTokenizer null in name"); |
473 | 90 | this->error_message = "null character not allowed in name token"; |
474 | 90 | this->val += "#00"; |
475 | 90 | this->state = st_name; |
476 | 90 | this->bad = true; |
477 | 439 | } else { |
478 | 439 | this->val += char(this->char_code); |
479 | 439 | this->state = st_name; |
480 | 439 | } |
481 | 529 | } |
482 | | |
483 | | void |
484 | | QPDFTokenizer::inSign(char ch) |
485 | 17.2k | { |
486 | 17.2k | if (QUtil::is_digit(ch)) { |
487 | 11.0k | this->state = st_number; |
488 | 11.0k | } else if (ch == '.') { |
489 | 66 | this->state = st_decimal; |
490 | 6.08k | } else { |
491 | 6.08k | this->state = st_literal; |
492 | 6.08k | inLiteral(ch); |
493 | 6.08k | } |
494 | 17.2k | } |
495 | | |
496 | | void |
497 | | QPDFTokenizer::inDecimal(char ch) |
498 | 5.59k | { |
499 | 5.59k | if (QUtil::is_digit(ch)) { |
500 | 949 | this->state = st_real; |
501 | 4.65k | } else { |
502 | 4.65k | this->state = st_literal; |
503 | 4.65k | inLiteral(ch); |
504 | 4.65k | } |
505 | 5.59k | } |
506 | | |
507 | | void |
508 | | QPDFTokenizer::inNumber(char ch) |
509 | 31.0M | { |
510 | 31.0M | if (QUtil::is_digit(ch)) { |
511 | 27.1M | } else if (ch == '.') { |
512 | 189k | this->state = st_real; |
513 | 3.71M | } else if (isDelimiter(ch)) { |
514 | 3.66M | this->type = tt_integer; |
515 | 3.66M | this->state = st_token_ready; |
516 | 3.66M | this->in_token = false; |
517 | 3.66M | this->char_to_unread = ch; |
518 | 3.66M | } else { |
519 | 57.7k | this->state = st_literal; |
520 | 57.7k | } |
521 | 31.0M | } |
522 | | |
523 | | void |
524 | | QPDFTokenizer::inReal(char ch) |
525 | 771k | { |
526 | 771k | if (QUtil::is_digit(ch)) { |
527 | 581k | } else if (isDelimiter(ch)) { |
528 | 188k | this->type = tt_real; |
529 | 188k | this->state = st_token_ready; |
530 | 188k | this->in_token = false; |
531 | 188k | this->char_to_unread = ch; |
532 | 188k | } else { |
533 | 1.22k | this->state = st_literal; |
534 | 1.22k | } |
535 | 771k | } |
536 | | void |
537 | | QPDFTokenizer::inStringEscape(char ch) |
538 | 104k | { |
539 | 104k | this->state = st_in_string; |
540 | 104k | switch (ch) { |
541 | 566 | case '0': |
542 | 1.61k | case '1': |
543 | 2.15k | case '2': |
544 | 3.33k | case '3': |
545 | 4.23k | case '4': |
546 | 4.88k | case '5': |
547 | 5.53k | case '6': |
548 | 6.83k | case '7': |
549 | 6.83k | this->state = st_char_code; |
550 | 6.83k | this->char_code = 0; |
551 | 6.83k | this->digit_count = 0; |
552 | 6.83k | inCharCode(ch); |
553 | 6.83k | return; |
554 | | |
555 | 723 | case 'n': |
556 | 723 | this->val += '\n'; |
557 | 723 | return; |
558 | | |
559 | 1.67k | case 'r': |
560 | 1.67k | this->val += '\r'; |
561 | 1.67k | return; |
562 | | |
563 | 647 | case 't': |
564 | 647 | this->val += '\t'; |
565 | 647 | return; |
566 | | |
567 | 2.83k | case 'b': |
568 | 2.83k | this->val += '\b'; |
569 | 2.83k | return; |
570 | | |
571 | 928 | case 'f': |
572 | 928 | this->val += '\f'; |
573 | 928 | return; |
574 | | |
575 | 673 | case '\n': |
576 | 673 | return; |
577 | | |
578 | 1.48k | case '\r': |
579 | 1.48k | this->state = st_string_after_cr; |
580 | 1.48k | return; |
581 | | |
582 | 89.0k | default: |
583 | | // PDF spec says backslash is ignored before anything else |
584 | 89.0k | this->val += ch; |
585 | 89.0k | return; |
586 | 104k | } |
587 | 104k | } |
588 | | |
589 | | void |
590 | | QPDFTokenizer::inStringAfterCR(char ch) |
591 | 251k | { |
592 | 251k | this->state = st_in_string; |
593 | 251k | if (ch != '\n') { |
594 | 218k | inString(ch); |
595 | 218k | } |
596 | 251k | } |
597 | | |
598 | | void |
599 | | QPDFTokenizer::inLt(char ch) |
600 | 704k | { |
601 | 704k | if (ch == '<') { |
602 | 514k | this->type = tt_dict_open; |
603 | 514k | this->state = st_token_ready; |
604 | 514k | return; |
605 | 514k | } |
606 | | |
607 | 189k | this->state = st_in_hexstring; |
608 | 189k | inHexstring(ch); |
609 | 189k | } |
610 | | |
611 | | void |
612 | | QPDFTokenizer::inGt(char ch) |
613 | 459k | { |
614 | 459k | if (ch == '>') { |
615 | 446k | this->type = tt_dict_close; |
616 | 446k | this->state = st_token_ready; |
617 | 446k | } else { |
618 | 12.8k | this->type = tt_bad; |
619 | 12.8k | QTC::TC("qpdf", "QPDFTokenizer bad >"); |
620 | 12.8k | this->error_message = "unexpected >"; |
621 | 12.8k | this->in_token = false; |
622 | 12.8k | this->char_to_unread = ch; |
623 | 12.8k | this->state = st_token_ready; |
624 | 12.8k | } |
625 | 459k | } |
626 | | |
627 | | void |
628 | | QPDFTokenizer::inLiteral(char ch) |
629 | 66.1M | { |
630 | 66.1M | if (isDelimiter(ch)) { |
631 | | // A C-locale whitespace character or delimiter terminates token. It is important to unread |
632 | | // the whitespace character even though it is ignored since it may be the newline after a |
633 | | // stream keyword. Removing it here could make the stream-reading code break on some files, |
634 | | // though not on any files in the test suite as of this writing. |
635 | | |
636 | 3.22M | this->in_token = false; |
637 | 3.22M | this->char_to_unread = ch; |
638 | 3.22M | this->state = st_token_ready; |
639 | 3.22M | this->type = (this->raw_val == "true") || (this->raw_val == "false") |
640 | 3.22M | ? tt_bool |
641 | 3.22M | : (this->raw_val == "null" ? tt_null : tt_word); |
642 | 3.22M | } |
643 | 66.1M | } |
644 | | |
645 | | void |
646 | | QPDFTokenizer::inHexstring(char ch) |
647 | 1.40M | { |
648 | 1.40M | if (char hval = QUtil::hex_decode_char(ch); hval < '\20') { |
649 | 1.05M | this->char_code = int(hval) << 4; |
650 | 1.05M | this->state = st_in_hexstring_2nd; |
651 | | |
652 | 1.05M | } else if (ch == '>') { |
653 | 70.2k | this->type = tt_string; |
654 | 70.2k | this->state = st_token_ready; |
655 | | |
656 | 284k | } else if (isSpace(ch)) { |
657 | | // ignore |
658 | | |
659 | 178k | } else { |
660 | 105k | this->type = tt_bad; |
661 | 105k | QTC::TC("qpdf", "QPDFTokenizer bad hexstring character"); |
662 | 105k | this->error_message = std::string("invalid character (") + ch + ") in hexstring"; |
663 | 105k | this->state = st_token_ready; |
664 | 105k | } |
665 | 1.40M | } |
666 | | |
667 | | void |
668 | | QPDFTokenizer::inHexstring2nd(char ch) |
669 | 1.07M | { |
670 | 1.07M | if (char hval = QUtil::hex_decode_char(ch); hval < '\20') { |
671 | 1.03M | this->val += char(this->char_code) | hval; |
672 | 1.03M | this->state = st_in_hexstring; |
673 | | |
674 | 1.03M | } else if (ch == '>') { |
675 | | // PDF spec says odd hexstrings have implicit trailing 0. |
676 | 1.05k | this->val += char(this->char_code); |
677 | 1.05k | this->type = tt_string; |
678 | 1.05k | this->state = st_token_ready; |
679 | | |
680 | 33.1k | } else if (isSpace(ch)) { |
681 | | // ignore |
682 | | |
683 | 31.1k | } else { |
684 | 2.05k | this->type = tt_bad; |
685 | 2.05k | QTC::TC("qpdf", "QPDFTokenizer bad hexstring 2nd character"); |
686 | 2.05k | this->error_message = std::string("invalid character (") + ch + ") in hexstring"; |
687 | 2.05k | this->state = st_token_ready; |
688 | 2.05k | } |
689 | 1.07M | } |
690 | | |
691 | | void |
692 | | QPDFTokenizer::inCharCode(char ch) |
693 | 15.0k | { |
694 | 15.0k | bool handled = false; |
695 | 15.0k | if (('0' <= ch) && (ch <= '7')) { |
696 | 8.31k | this->char_code = 8 * this->char_code + (int(ch) - int('0')); |
697 | 8.31k | if (++(this->digit_count) < 3) { |
698 | 8.22k | return; |
699 | 8.22k | } |
700 | 89 | handled = true; |
701 | 89 | } |
702 | | // We've accumulated \ddd or we have \d or \dd followed by other than an octal digit. The PDF |
703 | | // Spec says to ignore high-order overflow. |
704 | 6.83k | this->val += char(this->char_code % 256); |
705 | 6.83k | this->state = st_in_string; |
706 | 6.83k | if (!handled) { |
707 | 6.74k | inString(ch); |
708 | 6.74k | } |
709 | 6.83k | } |
710 | | |
711 | | void |
712 | | QPDFTokenizer::inInlineImage(char ch) |
713 | 0 | { |
714 | 0 | if ((this->raw_val.length() + 1) == this->inline_image_bytes) { |
715 | 0 | QTC::TC("qpdf", "QPDFTokenizer found EI by byte count"); |
716 | 0 | this->type = tt_inline_image; |
717 | 0 | this->inline_image_bytes = 0; |
718 | 0 | this->state = st_token_ready; |
719 | 0 | } |
720 | 0 | } |
721 | | |
722 | | void |
723 | | QPDFTokenizer::presentEOF() |
724 | 24.4k | { |
725 | 24.4k | switch (this->state) { |
726 | 4.42k | case st_name: |
727 | 4.44k | case st_name_hex1: |
728 | 4.44k | case st_name_hex2: |
729 | 5.38k | case st_number: |
730 | 5.43k | case st_real: |
731 | 5.45k | case st_sign: |
732 | 5.47k | case st_decimal: |
733 | 7.02k | case st_literal: |
734 | 7.02k | QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token"); |
735 | | // Push any delimiter to the state machine to finish off the final token. |
736 | 7.02k | presentCharacter('\f'); |
737 | 7.02k | this->in_token = true; |
738 | 7.02k | break; |
739 | | |
740 | 0 | case st_top: |
741 | 14.7k | case st_before_token: |
742 | 14.7k | this->type = tt_eof; |
743 | 14.7k | break; |
744 | | |
745 | 0 | case st_in_space: |
746 | 0 | this->type = this->include_ignorable ? tt_space : tt_eof; |
747 | 0 | break; |
748 | | |
749 | 703 | case st_in_comment: |
750 | 703 | this->type = this->include_ignorable ? tt_comment : tt_bad; |
751 | 703 | break; |
752 | | |
753 | 0 | case st_token_ready: |
754 | 0 | break; |
755 | | |
756 | 1.95k | default: |
757 | 1.95k | QTC::TC("qpdf", "QPDFTokenizer EOF reading token"); |
758 | 1.95k | this->type = tt_bad; |
759 | 1.95k | this->error_message = "EOF while reading token"; |
760 | 24.4k | } |
761 | 24.4k | this->state = st_token_ready; |
762 | 24.4k | } |
763 | | |
764 | | void |
765 | | QPDFTokenizer::expectInlineImage(std::shared_ptr<InputSource> input) |
766 | 0 | { |
767 | 0 | expectInlineImage(*input); |
768 | 0 | } |
769 | | |
770 | | void |
771 | | QPDFTokenizer::expectInlineImage(InputSource& input) |
772 | 0 | { |
773 | 0 | if (this->state == st_token_ready) { |
774 | 0 | reset(); |
775 | 0 | } else if (this->state != st_before_token) { |
776 | 0 | throw std::logic_error( |
777 | 0 | "QPDFTokenizer::expectInlineImage called when tokenizer is in improper state"); |
778 | 0 | } |
779 | 0 | findEI(input); |
780 | 0 | this->before_token = false; |
781 | 0 | this->in_token = true; |
782 | 0 | this->state = st_inline_image; |
783 | 0 | } |
784 | | |
785 | | void |
786 | | QPDFTokenizer::findEI(InputSource& input) |
787 | 0 | { |
788 | 0 | qpdf_offset_t last_offset = input.getLastOffset(); |
789 | 0 | qpdf_offset_t pos = input.tell(); |
790 | | |
791 | | // Use QPDFWordTokenFinder to find EI surrounded by delimiters. Then read the next several |
792 | | // tokens or up to EOF. If we find any suspicious-looking or tokens, this is probably still part |
793 | | // of the image data, so keep looking for EI. Stop at the first EI that passes. If we get to the |
794 | | // end without finding one, return the last EI we found. Store the number of bytes expected in |
795 | | // the inline image including the EI and use that to break out of inline image, falling back to |
796 | | // the old method if needed. |
797 | |
|
798 | 0 | bool okay = false; |
799 | 0 | bool first_try = true; |
800 | 0 | while (!okay) { |
801 | 0 | QPDFWordTokenFinder f(input, "EI"); |
802 | 0 | if (!input.findFirst("EI", input.tell(), 0, f)) { |
803 | 0 | break; |
804 | 0 | } |
805 | 0 | inline_image_bytes = QIntC::to_size(input.tell() - pos - 2); |
806 | |
|
807 | 0 | QPDFTokenizer check; |
808 | 0 | bool found_bad = false; |
809 | | // Look at the next 10 tokens or up to EOF. The next inline image's image data would look |
810 | | // like bad tokens, but there will always be at least 10 tokens between one inline image's |
811 | | // EI and the next valid one's ID since width, height, bits per pixel, and color space are |
812 | | // all required as well as a BI and ID. If we get 10 good tokens in a row or hit EOF, we can |
813 | | // be pretty sure we've found the actual EI. |
814 | 0 | for (int i = 0; i < 10; ++i) { |
815 | 0 | QPDFTokenizer::Token t = check.readToken(input, "checker", true); |
816 | 0 | token_type_e type = t.getType(); |
817 | 0 | if (type == tt_eof) { |
818 | 0 | okay = true; |
819 | 0 | } else if (type == tt_bad) { |
820 | 0 | found_bad = true; |
821 | 0 | } else if (t.isWord()) { |
822 | | // The qpdf tokenizer lumps alphabetic and otherwise uncategorized characters into |
823 | | // "words". We recognize strings of alphabetic characters as potential valid |
824 | | // operators for purposes of telling whether we're in valid content or not. It's not |
825 | | // perfect, but it should work more reliably than what we used to do, which was |
826 | | // already good enough for the vast majority of files. |
827 | 0 | bool found_alpha = false; |
828 | 0 | bool found_non_printable = false; |
829 | 0 | bool found_other = false; |
830 | 0 | for (char ch: t.getValue()) { |
831 | 0 | if (((ch >= 'a') && (ch <= 'z')) || ((ch >= 'A') && (ch <= 'Z')) || |
832 | 0 | (ch == '*')) { |
833 | | // Treat '*' as alpha since there are valid PDF operators that contain * |
834 | | // along with alphabetic characters. |
835 | 0 | found_alpha = true; |
836 | 0 | } else if ((static_cast<signed char>(ch) < 32) && (!isSpace(ch))) { |
837 | | // Compare ch as a signed char so characters outside of 7-bit will be < 0. |
838 | 0 | found_non_printable = true; |
839 | 0 | break; |
840 | 0 | } else { |
841 | 0 | found_other = true; |
842 | 0 | } |
843 | 0 | } |
844 | 0 | if (found_non_printable || (found_alpha && found_other)) { |
845 | 0 | found_bad = true; |
846 | 0 | } |
847 | 0 | } |
848 | 0 | if (okay || found_bad) { |
849 | 0 | break; |
850 | 0 | } |
851 | 0 | } |
852 | 0 | if (!found_bad) { |
853 | 0 | okay = true; |
854 | 0 | } |
855 | 0 | if (!okay) { |
856 | 0 | first_try = false; |
857 | 0 | } |
858 | 0 | } |
859 | 0 | if (okay && (!first_try)) { |
860 | 0 | QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try"); |
861 | 0 | } |
862 | |
|
863 | 0 | input.seek(pos, SEEK_SET); |
864 | 0 | input.setLastOffset(last_offset); |
865 | 0 | } |
866 | | |
867 | | bool |
868 | | QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch) |
869 | 5.99M | { |
870 | 5.99M | bool ready = (this->state == st_token_ready); |
871 | 5.99M | unread_char = !this->in_token && !this->before_token; |
872 | 5.99M | ch = this->char_to_unread; |
873 | 5.99M | if (ready) { |
874 | 5.99M | token = (!(this->type == tt_name || this->type == tt_string)) |
875 | 5.99M | ? Token(this->type, this->raw_val, this->raw_val, this->error_message) |
876 | 5.99M | : Token(this->type, this->val, this->raw_val, this->error_message); |
877 | | |
878 | 5.99M | this->reset(); |
879 | 5.99M | } |
880 | 5.99M | return ready; |
881 | 5.99M | } |
882 | | |
883 | | bool |
884 | | QPDFTokenizer::betweenTokens() |
885 | 0 | { |
886 | 0 | return this->before_token; |
887 | 0 | } |
888 | | |
889 | | QPDFTokenizer::Token |
890 | | QPDFTokenizer::readToken( |
891 | | InputSource& input, std::string const& context, bool allow_bad, size_t max_len) |
892 | 5.99M | { |
893 | 5.99M | nextToken(input, context, max_len); |
894 | | |
895 | 5.99M | Token token; |
896 | 5.99M | bool unread_char; |
897 | 5.99M | char char_to_unread; |
898 | 5.99M | getToken(token, unread_char, char_to_unread); |
899 | | |
900 | 5.99M | if (token.getType() == tt_bad) { |
901 | 668k | if (allow_bad) { |
902 | 668k | QTC::TC("qpdf", "QPDFTokenizer allowing bad token"); |
903 | 668k | } else { |
904 | 0 | throw QPDFExc( |
905 | 0 | qpdf_e_damaged_pdf, |
906 | 0 | input.getName(), |
907 | 0 | context.empty() ? "offset " + std::to_string(input.getLastOffset()) : context, |
908 | 0 | input.getLastOffset(), |
909 | 0 | token.getErrorMessage()); |
910 | 0 | } |
911 | 668k | } |
912 | 5.99M | return token; |
913 | 5.99M | } |
914 | | |
915 | | QPDFTokenizer::Token |
916 | | QPDFTokenizer::readToken( |
917 | | std::shared_ptr<InputSource> input, std::string const& context, bool allow_bad, size_t max_len) |
918 | 5.99M | { |
919 | 5.99M | return readToken(*input, context, allow_bad, max_len); |
920 | 5.99M | } |
921 | | |
922 | | bool |
923 | | QPDFTokenizer::nextToken(InputSource& input, std::string const& context, size_t max_len) |
924 | 11.8M | { |
925 | 11.8M | if (this->state != st_inline_image) { |
926 | 11.8M | reset(); |
927 | 11.8M | } |
928 | 11.8M | qpdf_offset_t offset = input.fastTell(); |
929 | | |
930 | 250M | while (this->state != st_token_ready) { |
931 | 238M | char ch; |
932 | 238M | if (!input.fastRead(ch)) { |
933 | 24.4k | presentEOF(); |
934 | | |
935 | 24.4k | if ((this->type == tt_eof) && (!this->allow_eof)) { |
936 | | // Nothing in the qpdf library calls readToken without allowEOF anymore, so this |
937 | | // case is not exercised. |
938 | 0 | this->type = tt_bad; |
939 | 0 | this->error_message = "unexpected EOF"; |
940 | 0 | offset = input.getLastOffset(); |
941 | 0 | } |
942 | 238M | } else { |
943 | 238M | handleCharacter(ch); |
944 | 238M | if (this->before_token) { |
945 | 39.8M | ++offset; |
946 | 39.8M | } |
947 | 238M | if (this->in_token) { |
948 | 189M | this->raw_val += ch; |
949 | 189M | } |
950 | 238M | if (max_len && (this->raw_val.length() >= max_len) && (this->state != st_token_ready)) { |
951 | | // terminate this token now |
952 | 562k | QTC::TC("qpdf", "QPDFTokenizer block long token"); |
953 | 562k | this->type = tt_bad; |
954 | 562k | this->state = st_token_ready; |
955 | 562k | this->error_message = "exceeded allowable length while reading token"; |
956 | 562k | } |
957 | 238M | } |
958 | 238M | } |
959 | | |
960 | 11.8M | input.fastUnread(!this->in_token && !this->before_token); |
961 | | |
962 | 11.8M | if (this->type != tt_eof) { |
963 | 11.8M | input.setLastOffset(offset); |
964 | 11.8M | } |
965 | | |
966 | 11.8M | return this->error_message.empty(); |
967 | 11.8M | } |