/src/mupdf/source/pdf/pdf-lex.c
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (C) 2004-2021 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "mupdf/fitz.h" |
24 | | #include "mupdf/pdf.h" |
25 | | |
26 | | #include <string.h> |
27 | | |
/*
 * Case-label lists used to classify one input byte inside a switch.
 *
 * Each macro expands to "X: case Y: case Z" (no leading `case`, no
 * trailing colon), so it is written directly after the `case` keyword:
 *
 *	case IS_WHITE:
 *
 * The RANGE_* building blocks come first; the composite classes are
 * spelled in terms of them so each digit/letter list exists only once.
 */
#define RANGE_0_7 \
	'0':case'1':case'2':case'3':case'4':case'5':case'6':case'7'
#define RANGE_0_9 \
	RANGE_0_7:case'8':case'9'
#define RANGE_a_f \
	'a':case'b':case'c':case'd':case'e':case'f'
#define RANGE_A_F \
	'A':case'B':case'C':case'D':case'E':case'F'

/* Bytes that can start a number: sign, decimal point or digit. */
#define IS_NUMBER \
	'+':case'-':case'.':case RANGE_0_9
/* NUL, TAB, LF, FF, CR and SPACE (same set as iswhite()). */
#define IS_WHITE \
	'\000':case'\011':case'\012':case'\014':case'\015':case'\040'
/* Hexadecimal digits in either case. */
#define IS_HEX \
	RANGE_0_9:case RANGE_A_F:case RANGE_a_f
/* Bytes that terminate a name, keyword or number. */
#define IS_DELIM \
	'(':case')':case'<':case'>':case'[':case']':case'{':\
	case'}':case'/':case'%'
51 | | |
52 | | /* #define DUMP_LEXER_STREAM */ |
#ifndef DUMP_LEXER_STREAM
/* Normal build: reading a byte for the lexer is a plain fz_read_byte(). */
#define lex_byte(C,S) fz_read_byte(C,S)
#else
/*
 * Debug build: read one byte from `stm` and echo it to fz_stdout().
 * EOF is shown as "<EOF>", bytes in [32,128) are echoed verbatim and
 * every other byte is shown as "<xx>" in hex.  Returns the byte read
 * (or EOF), exactly like fz_read_byte().
 */
static inline int lex_byte(fz_context *ctx, fz_stream *stm)
{
	int c = fz_read_byte(ctx, stm);

	if (c == EOF)
	{
		fz_write_printf(ctx, fz_stdout(ctx), "<EOF>");
		return c;
	}

	if (c < 32 || c >= 128)
		fz_write_printf(ctx, fz_stdout(ctx), "<%02x>", c);
	else
		fz_write_printf(ctx, fz_stdout(ctx), "%c", c);

	return c;
}
#endif
69 | | |
/*
 * Return 1 if `ch` is one of the six white-space bytes recognised by
 * the lexer (NUL, TAB, LF, FF, CR, SPACE), otherwise 0.
 */
static inline int iswhite(int ch)
{
	switch (ch)
	{
	case '\000':
	case '\011':
	case '\012':
	case '\014':
	case '\015':
	case '\040':
		return 1;
	default:
		return 0;
	}
}
80 | | |
/* Return 1 if `ch` lies in the range ' ' .. '~' inclusive, otherwise 0. */
static inline int fz_isprint(int ch)
{
	if (ch < ' ')
		return 0;
	return ch <= '~';
}
85 | | |
/*
 * Convert one hexadecimal digit (either case) to its value 0..15.
 * Any other input, including EOF, yields 0.
 */
static inline int unhex(int ch)
{
	int value = 0;

	if (ch >= 'a' && ch <= 'f')
		value = ch - 'a' + 0xA;
	else if (ch >= 'A' && ch <= 'F')
		value = ch - 'A' + 0xA;
	else if (ch >= '0' && ch <= '9')
		value = ch - '0';

	return value;
}
93 | | |
94 | | static void |
95 | | lex_white(fz_context *ctx, fz_stream *f) |
96 | 77.8M | { |
97 | 77.8M | int c; |
98 | 93.9M | do { |
99 | 93.9M | c = lex_byte(ctx, f); |
100 | 93.9M | } while ((c <= 32) && (iswhite(c))); |
101 | 77.8M | if (c != EOF) |
102 | 77.8M | fz_unread_byte(ctx, f); |
103 | 77.8M | } |
104 | | |
105 | | static void |
106 | | lex_comment(fz_context *ctx, fz_stream *f) |
107 | 306k | { |
108 | 306k | int c; |
109 | 152M | do { |
110 | 152M | c = lex_byte(ctx, f); |
111 | 152M | } while ((c != '\012') && (c != '\015') && (c != EOF)); |
112 | 306k | } |
113 | | |
/*
 * Fast(ish) but inaccurate strtof, with Adobe overflow handling.
 *
 * Grammar accepted: any number of '-' (at least one makes the result
 * negative), then any number of '+', then decimal digits, optionally
 * followed by '.' and more decimal digits.  Conversion stops at the
 * first byte that does not fit; an empty digit run counts as 0.
 *
 * Overflow of the integer part is deliberately not detected: the value
 * simply wraps.  The original code notes that tests show Acrobat handling
 * overflows in the same way.  The accumulation is done in unsigned
 * arithmetic so that the wrap-around is well defined (modulo
 * UINT_MAX + 1) rather than signed-overflow undefined behaviour.
 */
static float acrobat_compatible_atof(char *s)
{
	int neg = 0;
	unsigned int u = 0;
	int i;

	while (*s == '-')
	{
		neg = 1;
		++s;
	}
	while (*s == '+')
	{
		++s;
	}

	while (*s >= '0' && *s <= '9')
	{
		/* We deliberately ignore overflow here (see above). */
		u = u * 10u + (unsigned int)(*s - '0');
		++s;
	}

	/* Reinterpret the wrapped value as a signed int. */
	i = (int)u;

	if (*s == '.')
	{
		float v = (float)i;
		float n = 0;
		float d = 1;
		++s;
		while (*s >= '0' && *s <= '9')
		{
			n = 10 * n + (*s - '0');
			d = 10 * d;
			++s;
		}
		v += n / d;
		return neg ? -v : v;
	}

	/* Negate in unsigned space so that INT_MIN cannot overflow. */
	if (neg)
		i = (int)(0u - u);
	return (float)i;
}
160 | | |
/*
 * Fast but inaccurate atoi.
 *
 * Accepts any number of '-' (at least one makes the result negative),
 * then any number of '+', then decimal digits; conversion stops at the
 * first other byte.  An empty digit run yields 0.
 *
 * Overflow is deliberately ignored: the value wraps.  The digits are
 * accumulated (and negated) in unsigned arithmetic so the wrap-around
 * is well defined instead of relying on signed-overflow behaviour.
 */
static int fast_atoi(char *s)
{
	int neg = 0;
	unsigned int u = 0;

	while (*s == '-')
	{
		neg = 1;
		++s;
	}
	while (*s == '+')
	{
		++s;
	}

	while (*s >= '0' && *s <= '9')
	{
		/* We deliberately ignore overflow here. */
		u = u * 10u + (unsigned int)(*s - '0');
		++s;
	}

	/* 0u - u is the two's-complement negation, safe even for INT_MIN. */
	if (neg)
		u = 0u - u;

	return (int)u;
}
186 | | |
187 | | static int |
188 | | lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c) |
189 | 46.8M | { |
190 | 46.8M | char *s = buf->scratch; |
191 | 46.8M | char *e = buf->scratch + buf->size - 1; /* leave space for zero terminator */ |
192 | 46.8M | char *isreal = (c == '.' ? s : NULL); |
193 | 46.8M | int neg = (c == '-'); |
194 | 46.8M | int isbad = 0; |
195 | | |
196 | 46.8M | *s++ = c; |
197 | | |
198 | 46.8M | c = lex_byte(ctx, f); |
199 | | |
200 | | /* skip extra '-' signs at start of number */ |
201 | 46.8M | if (neg) |
202 | 2.96M | { |
203 | 2.98M | while (c == '-') |
204 | 17.0k | c = lex_byte(ctx, f); |
205 | 2.96M | } |
206 | | |
207 | 180M | while (s < e) |
208 | 180M | { |
209 | 180M | switch (c) |
210 | 180M | { |
211 | 48.5M | case IS_WHITE: |
212 | 48.5M | case IS_DELIM: |
213 | 46.8M | fz_unread_byte(ctx, f); |
214 | 46.8M | goto end; |
215 | 6.98k | case EOF: |
216 | 6.98k | goto end; |
217 | 13.7M | case '.': |
218 | 13.7M | if (isreal) |
219 | 256k | isbad = 1; |
220 | 13.7M | isreal = s; |
221 | 13.7M | *s++ = c; |
222 | 13.7M | break; |
223 | 97.9k | case '-': |
224 | | /* Bug 703248: Some PDFs (particularly those |
225 | | * generated by google docs) apparently have |
226 | | * numbers like 0.000000000000-5684342 in them. |
227 | | * We'll stop our interpretation at the -, but |
228 | | * keep reading to skip over the trailing |
229 | | * digits so they aren't parsed later. */ |
230 | 97.9k | *s++ = '\0'; |
231 | 97.9k | break; |
232 | 116M | case RANGE_0_9: |
233 | 116M | *s++ = c; |
234 | 116M | break; |
235 | 2.60M | default: |
236 | 2.60M | isbad = 1; |
237 | 2.60M | *s++ = c; |
238 | 2.60M | break; |
239 | 180M | } |
240 | 133M | c = lex_byte(ctx, f); |
241 | 133M | } |
242 | | |
243 | 46.8M | end: |
244 | 46.8M | *s = '\0'; |
245 | 46.8M | if (isbad) |
246 | 772k | return PDF_TOK_KEYWORD; |
247 | 46.0M | if (isreal) |
248 | 13.4M | { |
249 | | /* We'd like to use the fastest possible atof |
250 | | * routine, but we'd rather match acrobats |
251 | | * handling of broken numbers. As such, we |
252 | | * spot common broken cases and call an |
253 | | * acrobat compatible routine where required. */ |
254 | 13.4M | if (neg > 1 || isreal - buf->scratch >= 10) |
255 | 11.8k | buf->f = acrobat_compatible_atof(buf->scratch); |
256 | 13.4M | else |
257 | 13.4M | buf->f = fz_atof(buf->scratch); |
258 | 13.4M | return PDF_TOK_REAL; |
259 | 13.4M | } |
260 | 32.5M | else |
261 | 32.5M | { |
262 | 32.5M | buf->i = fast_atoi(buf->scratch); |
263 | 32.5M | return PDF_TOK_INT; |
264 | 32.5M | } |
265 | 46.0M | } |
266 | | |
267 | | static void |
268 | | lex_name(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb) |
269 | 38.8M | { |
270 | 38.8M | char *s = lb->scratch; |
271 | 38.8M | char *e = s + fz_minz(127, lb->size); |
272 | 38.8M | int c; |
273 | | |
274 | 294M | while (1) |
275 | 294M | { |
276 | 294M | if (s == e) |
277 | 11.5k | { |
278 | 11.5k | if (e - lb->scratch < 127) |
279 | 0 | { |
280 | 0 | s += pdf_lexbuf_grow(ctx, lb); |
281 | 0 | e = lb->scratch + fz_minz(127, lb->size); |
282 | 0 | } |
283 | 11.5k | else |
284 | 11.5k | { |
285 | | /* truncate names that are too long */ |
286 | 11.5k | fz_warn(ctx, "name is too long"); |
287 | 11.5k | *s = 0; |
288 | 11.5k | lb->len = s - lb->scratch; |
289 | 11.5k | s = NULL; |
290 | 11.5k | } |
291 | 11.5k | } |
292 | 294M | c = lex_byte(ctx, f); |
293 | 294M | switch (c) |
294 | 294M | { |
295 | 78.9M | case IS_WHITE: |
296 | 78.9M | case IS_DELIM: |
297 | 38.8M | fz_unread_byte(ctx, f); |
298 | 38.8M | goto end; |
299 | 12.1k | case EOF: |
300 | 12.1k | goto end; |
301 | 112k | case '#': |
302 | 112k | { |
303 | 112k | int hex[2]; |
304 | 112k | int i; |
305 | 183k | for (i = 0; i < 2; i++) |
306 | 153k | { |
307 | 153k | c = fz_peek_byte(ctx, f); |
308 | 153k | switch (c) |
309 | 153k | { |
310 | 325k | case RANGE_0_9: |
311 | 325k | if (i == 1 && c == '0' && hex[0] == 0) |
312 | 40 | goto illegal; |
313 | 44.3k | hex[i] = lex_byte(ctx, f) - '0'; |
314 | 44.3k | break; |
315 | 18.0k | case RANGE_a_f: |
316 | 18.0k | hex[i] = lex_byte(ctx, f) - 'a' + 10; |
317 | 18.0k | break; |
318 | 8.89k | case RANGE_A_F: |
319 | 8.89k | hex[i] = lex_byte(ctx, f) - 'A' + 10; |
320 | 8.89k | break; |
321 | 82.4k | default: |
322 | 82.5k | case EOF: |
323 | 82.5k | goto illegal; |
324 | 153k | } |
325 | 153k | } |
326 | 29.8k | if (s) *s++ = (hex[0] << 4) + hex[1]; |
327 | 29.8k | break; |
328 | 82.5k | illegal: |
329 | 82.5k | if (i == 1) |
330 | 11.6k | fz_unread_byte(ctx, f); |
331 | 82.5k | if (s) *s++ = '#'; |
332 | 82.5k | continue; |
333 | 112k | } |
334 | 255M | default: |
335 | 255M | if (s) *s++ = c; |
336 | 255M | break; |
337 | 294M | } |
338 | 294M | } |
339 | 38.8M | end: |
340 | 38.8M | if (s) |
341 | 38.8M | { |
342 | 38.8M | *s = '\0'; |
343 | 38.8M | lb->len = s - lb->scratch; |
344 | 38.8M | } |
345 | 38.8M | } |
346 | | |
347 | | static int |
348 | | lex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb) |
349 | 1.75M | { |
350 | 1.75M | char *s = lb->scratch; |
351 | 1.75M | char *e = s + lb->size; |
352 | 1.75M | int bal = 1; |
353 | 1.75M | int oct; |
354 | 1.75M | int c; |
355 | | |
356 | 359M | while (1) |
357 | 359M | { |
358 | 359M | if (s == e) |
359 | 5.74k | { |
360 | 5.74k | s += pdf_lexbuf_grow(ctx, lb); |
361 | 5.74k | e = lb->scratch + lb->size; |
362 | 5.74k | } |
363 | 359M | c = lex_byte(ctx, f); |
364 | 359M | switch (c) |
365 | 359M | { |
366 | 5.25k | case EOF: |
367 | 5.25k | return PDF_TOK_ERROR; |
368 | 488k | case '(': |
369 | 488k | bal++; |
370 | 488k | *s++ = c; |
371 | 488k | break; |
372 | 2.18M | case ')': |
373 | 2.18M | bal --; |
374 | 2.18M | if (bal == 0) |
375 | 1.74M | goto end; |
376 | 436k | *s++ = c; |
377 | 436k | break; |
378 | 597k | case '\\': |
379 | 597k | c = lex_byte(ctx, f); |
380 | 597k | switch (c) |
381 | 597k | { |
382 | 8 | case EOF: |
383 | 8 | return PDF_TOK_ERROR; |
384 | 5.54k | case 'n': |
385 | 5.54k | *s++ = '\n'; |
386 | 5.54k | break; |
387 | 7.51k | case 'r': |
388 | 7.51k | *s++ = '\r'; |
389 | 7.51k | break; |
390 | 4.70k | case 't': |
391 | 4.70k | *s++ = '\t'; |
392 | 4.70k | break; |
393 | 2.12k | case 'b': |
394 | 2.12k | *s++ = '\b'; |
395 | 2.12k | break; |
396 | 2.22k | case 'f': |
397 | 2.22k | *s++ = '\f'; |
398 | 2.22k | break; |
399 | 21.8k | case '(': |
400 | 21.8k | *s++ = '('; |
401 | 21.8k | break; |
402 | 21.6k | case ')': |
403 | 21.6k | *s++ = ')'; |
404 | 21.6k | break; |
405 | 72.3k | case '\\': |
406 | 72.3k | *s++ = '\\'; |
407 | 72.3k | break; |
408 | 139k | case RANGE_0_7: |
409 | 139k | oct = c - '0'; |
410 | 139k | c = lex_byte(ctx, f); |
411 | 139k | if (c >= '0' && c <= '7') |
412 | 134k | { |
413 | 134k | oct = oct * 8 + (c - '0'); |
414 | 134k | c = lex_byte(ctx, f); |
415 | 134k | if (c >= '0' && c <= '7') |
416 | 133k | oct = oct * 8 + (c - '0'); |
417 | 1.12k | else if (c != EOF) |
418 | 1.12k | fz_unread_byte(ctx, f); |
419 | 134k | } |
420 | 5.76k | else if (c != EOF) |
421 | 5.71k | fz_unread_byte(ctx, f); |
422 | 139k | *s++ = oct; |
423 | 139k | break; |
424 | 384 | case '\n': |
425 | 384 | break; |
426 | 2.53k | case '\r': |
427 | 2.53k | c = lex_byte(ctx, f); |
428 | 2.53k | if ((c != '\n') && (c != EOF)) |
429 | 2.45k | fz_unread_byte(ctx, f); |
430 | 2.53k | break; |
431 | 317k | default: |
432 | 317k | *s++ = c; |
433 | 597k | } |
434 | 597k | break; |
435 | 356M | default: |
436 | 356M | *s++ = c; |
437 | 356M | break; |
438 | 359M | } |
439 | 359M | } |
440 | 1.74M | end: |
441 | 1.74M | lb->len = s - lb->scratch; |
442 | 1.74M | return PDF_TOK_STRING; |
443 | 1.75M | } |
444 | | |
445 | | static int |
446 | | lex_hex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb) |
447 | 1.28M | { |
448 | 1.28M | char *s = lb->scratch; |
449 | 1.28M | char *e = s + lb->size; |
450 | 1.28M | int a = 0, x = 0; |
451 | 1.28M | int c; |
452 | | |
453 | 51.4M | while (1) |
454 | 51.4M | { |
455 | 51.4M | if (s == e) |
456 | 1.69k | { |
457 | 1.69k | s += pdf_lexbuf_grow(ctx, lb); |
458 | 1.69k | e = lb->scratch + lb->size; |
459 | 1.69k | } |
460 | 51.4M | c = lex_byte(ctx, f); |
461 | 51.4M | switch (c) |
462 | 51.4M | { |
463 | 4.77M | case IS_WHITE: |
464 | 4.77M | break; |
465 | 31.1M | default: |
466 | 31.1M | fz_warn(ctx, "invalid character in hex string"); |
467 | | /* fall through */ |
468 | 45.3M | case IS_HEX: |
469 | 45.3M | if (x) |
470 | 22.6M | { |
471 | 22.6M | *s++ = a * 16 + unhex(c); |
472 | 22.6M | x = !x; |
473 | 22.6M | } |
474 | 22.7M | else |
475 | 22.7M | { |
476 | 22.7M | a = unhex(c); |
477 | 22.7M | x = !x; |
478 | 22.7M | } |
479 | 45.3M | break; |
480 | 1.28M | case '>': |
481 | 1.28M | if (x) |
482 | 91.6k | { |
483 | 91.6k | *s++ = a * 16; /* pad truncated string with '0' */ |
484 | 91.6k | } |
485 | 1.28M | goto end; |
486 | 4.06k | case EOF: |
487 | 4.06k | return PDF_TOK_ERROR; |
488 | 51.4M | } |
489 | 51.4M | } |
490 | 1.28M | end: |
491 | 1.28M | lb->len = s - lb->scratch; |
492 | 1.28M | return PDF_TOK_STRING; |
493 | 1.28M | } |
494 | | |
495 | | static pdf_token |
496 | | pdf_token_from_keyword(char *key) |
497 | 25.4M | { |
498 | 25.4M | switch (*key) |
499 | 25.4M | { |
500 | 6.15M | case 'R': |
501 | 6.15M | if (!strcmp(key, "R")) return PDF_TOK_R; |
502 | 45.1k | break; |
503 | 208k | case 't': |
504 | 208k | if (!strcmp(key, "true")) return PDF_TOK_TRUE; |
505 | 65.4k | if (!strcmp(key, "trailer")) return PDF_TOK_TRAILER; |
506 | 60.5k | break; |
507 | 338k | case 'f': |
508 | 338k | if (!strcmp(key, "false")) return PDF_TOK_FALSE; |
509 | 215k | break; |
510 | 506k | case 'n': |
511 | 506k | if (!strcmp(key, "null")) return PDF_TOK_NULL; |
512 | 387k | if (!strcmp(key, "newobj")) return PDF_TOK_NEWOBJ; |
513 | 387k | break; |
514 | 635k | case 'o': |
515 | 635k | if (!strcmp(key, "obj")) return PDF_TOK_OBJ; |
516 | 13.7k | break; |
517 | 518k | case 'e': |
518 | 518k | if (!strcmp(key, "endobj")) return PDF_TOK_ENDOBJ; |
519 | 136k | if (!strcmp(key, "endstream")) return PDF_TOK_ENDSTREAM; |
520 | 76.1k | break; |
521 | 375k | case 's': |
522 | 375k | if (!strcmp(key, "stream")) return PDF_TOK_STREAM; |
523 | 152k | if (!strcmp(key, "startxref")) return PDF_TOK_STARTXREF; |
524 | 147k | break; |
525 | 147k | case 'x': |
526 | 14.2k | if (!strcmp(key, "xref")) return PDF_TOK_XREF; |
527 | 10.6k | break; |
528 | 25.4M | } |
529 | | |
530 | 35.8M | while (*key) |
531 | 25.1M | { |
532 | 25.1M | if (!fz_isprint(*key)) |
533 | 6.90M | return PDF_TOK_ERROR; |
534 | 18.2M | ++key; |
535 | 18.2M | } |
536 | | |
537 | 10.7M | return PDF_TOK_KEYWORD; |
538 | 17.6M | } |
539 | | |
540 | | void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size) |
541 | 78.0k | { |
542 | 78.0k | lb->size = lb->base_size = size; |
543 | 78.0k | lb->len = 0; |
544 | 78.0k | lb->scratch = &lb->buffer[0]; |
545 | 78.0k | } |
546 | | |
547 | | void pdf_lexbuf_fin(fz_context *ctx, pdf_lexbuf *lb) |
548 | 78.0k | { |
549 | 78.0k | if (lb && lb->size != lb->base_size) |
550 | 2.33k | fz_free(ctx, lb->scratch); |
551 | 78.0k | } |
552 | | |
553 | | ptrdiff_t pdf_lexbuf_grow(fz_context *ctx, pdf_lexbuf *lb) |
554 | 7.44k | { |
555 | 7.44k | char *old = lb->scratch; |
556 | 7.44k | size_t newsize = lb->size * 2; |
557 | 7.44k | if (lb->size == lb->base_size) |
558 | 2.33k | { |
559 | 2.33k | lb->scratch = Memento_label(fz_malloc(ctx, newsize), "pdf_lexbuf"); |
560 | 2.33k | memcpy(lb->scratch, lb->buffer, lb->size); |
561 | 2.33k | } |
562 | 5.10k | else |
563 | 5.10k | { |
564 | 5.10k | lb->scratch = fz_realloc(ctx, lb->scratch, newsize); |
565 | 5.10k | } |
566 | 7.44k | lb->size = newsize; |
567 | 7.44k | return lb->scratch - old; |
568 | 7.44k | } |
569 | | |
570 | | pdf_token |
571 | | pdf_lex(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf) |
572 | 91.8M | { |
573 | 167M | while (1) |
574 | 167M | { |
575 | 167M | int c = lex_byte(ctx, f); |
576 | 167M | switch (c) |
577 | 167M | { |
578 | 95.7k | case EOF: |
579 | 95.7k | return PDF_TOK_EOF; |
580 | 75.3M | case IS_WHITE: |
581 | 75.3M | lex_white(ctx, f); |
582 | 75.3M | break; |
583 | 254k | case '%': |
584 | 254k | lex_comment(ctx, f); |
585 | 254k | break; |
586 | 13.2M | case '/': |
587 | 13.2M | lex_name(ctx, f, buf); |
588 | 13.2M | return PDF_TOK_NAME; |
589 | 1.75M | case '(': |
590 | 1.75M | return lex_string(ctx, f, buf); |
591 | 47.1k | case ')': |
592 | 47.1k | return PDF_TOK_ERROR; |
593 | 2.80M | case '<': |
594 | 2.80M | c = lex_byte(ctx, f); |
595 | 2.80M | if (c == '<') |
596 | 1.51M | return PDF_TOK_OPEN_DICT; |
597 | 1.28M | if (c != EOF) |
598 | 1.28M | fz_unread_byte(ctx, f); |
599 | 1.28M | return lex_hex_string(ctx, f, buf); |
600 | 1.52M | case '>': |
601 | 1.52M | c = lex_byte(ctx, f); |
602 | 1.52M | if (c == '>') |
603 | 1.38M | return PDF_TOK_CLOSE_DICT; |
604 | 141k | if (c != EOF) |
605 | 140k | fz_unread_byte(ctx, f); |
606 | 141k | return PDF_TOK_ERROR; |
607 | 1.42M | case '[': |
608 | 1.42M | return PDF_TOK_OPEN_ARRAY; |
609 | 1.35M | case ']': |
610 | 1.35M | return PDF_TOK_CLOSE_ARRAY; |
611 | 39.6k | case '{': |
612 | 39.6k | return PDF_TOK_OPEN_BRACE; |
613 | 56.1k | case '}': |
614 | 56.1k | return PDF_TOK_CLOSE_BRACE; |
615 | 45.4M | case IS_NUMBER: |
616 | 45.4M | return lex_number(ctx, f, buf, c); |
617 | 24.0M | default: /* isregular: !isdelim && !iswhite && c != EOF */ |
618 | 24.0M | fz_unread_byte(ctx, f); |
619 | 24.0M | lex_name(ctx, f, buf); |
620 | 24.0M | return pdf_token_from_keyword(buf->scratch); |
621 | 167M | } |
622 | 167M | } |
623 | 91.8M | } |
624 | | |
625 | | pdf_token |
626 | | pdf_lex_no_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf) |
627 | 3.35M | { |
628 | 5.91M | while (1) |
629 | 5.91M | { |
630 | 5.91M | int c = lex_byte(ctx, f); |
631 | 5.91M | switch (c) |
632 | 5.91M | { |
633 | 3.58k | case EOF: |
634 | 3.58k | return PDF_TOK_EOF; |
635 | 2.49M | case IS_WHITE: |
636 | 2.49M | lex_white(ctx, f); |
637 | 2.49M | break; |
638 | 52.0k | case '%': |
639 | 52.0k | lex_comment(ctx, f); |
640 | 52.0k | break; |
641 | 95.6k | case '/': |
642 | 95.6k | lex_name(ctx, f, buf); |
643 | 95.6k | return PDF_TOK_NAME; |
644 | 40.8k | case '(': |
645 | 40.8k | return PDF_TOK_ERROR; /* no strings allowed */ |
646 | 41.2k | case ')': |
647 | 41.2k | return PDF_TOK_ERROR; /* no strings allowed */ |
648 | 59.3k | case '<': |
649 | 59.3k | c = lex_byte(ctx, f); |
650 | 59.3k | if (c == '<') |
651 | 8.19k | return PDF_TOK_OPEN_DICT; |
652 | 51.1k | if (c != EOF) |
653 | 51.1k | fz_unread_byte(ctx, f); |
654 | 51.1k | return PDF_TOK_ERROR; /* no strings allowed */ |
655 | 93.7k | case '>': |
656 | 93.7k | c = lex_byte(ctx, f); |
657 | 93.7k | if (c == '>') |
658 | 10.5k | return PDF_TOK_CLOSE_DICT; |
659 | 83.2k | if (c != EOF) |
660 | 83.2k | fz_unread_byte(ctx, f); |
661 | 83.2k | return PDF_TOK_ERROR; |
662 | 74.1k | case '[': |
663 | 74.1k | return PDF_TOK_OPEN_ARRAY; |
664 | 68.8k | case ']': |
665 | 68.8k | return PDF_TOK_CLOSE_ARRAY; |
666 | 40.0k | case '{': |
667 | 40.0k | return PDF_TOK_OPEN_BRACE; |
668 | 36.6k | case '}': |
669 | 36.6k | return PDF_TOK_CLOSE_BRACE; |
670 | 1.38M | case IS_NUMBER: |
671 | 1.38M | return lex_number(ctx, f, buf, c); |
672 | 1.41M | default: /* isregular: !isdelim && !iswhite && c != EOF */ |
673 | 1.41M | fz_unread_byte(ctx, f); |
674 | 1.41M | lex_name(ctx, f, buf); |
675 | 1.41M | return pdf_token_from_keyword(buf->scratch); |
676 | 5.91M | } |
677 | 5.91M | } |
678 | 3.35M | } |
679 | | |
680 | | void pdf_append_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf) |
681 | 0 | { |
682 | 0 | switch (tok) |
683 | 0 | { |
684 | 0 | case PDF_TOK_NAME: |
685 | 0 | fz_append_printf(ctx, fzbuf, "/%s", buf->scratch); |
686 | 0 | break; |
687 | 0 | case PDF_TOK_STRING: |
688 | 0 | if (buf->len >= buf->size) |
689 | 0 | pdf_lexbuf_grow(ctx, buf); |
690 | 0 | buf->scratch[buf->len] = 0; |
691 | 0 | fz_append_pdf_string(ctx, fzbuf, buf->scratch); |
692 | 0 | break; |
693 | 0 | case PDF_TOK_OPEN_DICT: |
694 | 0 | fz_append_string(ctx, fzbuf, "<<"); |
695 | 0 | break; |
696 | 0 | case PDF_TOK_CLOSE_DICT: |
697 | 0 | fz_append_string(ctx, fzbuf, ">>"); |
698 | 0 | break; |
699 | 0 | case PDF_TOK_OPEN_ARRAY: |
700 | 0 | fz_append_byte(ctx, fzbuf, '['); |
701 | 0 | break; |
702 | 0 | case PDF_TOK_CLOSE_ARRAY: |
703 | 0 | fz_append_byte(ctx, fzbuf, ']'); |
704 | 0 | break; |
705 | 0 | case PDF_TOK_OPEN_BRACE: |
706 | 0 | fz_append_byte(ctx, fzbuf, '{'); |
707 | 0 | break; |
708 | 0 | case PDF_TOK_CLOSE_BRACE: |
709 | 0 | fz_append_byte(ctx, fzbuf, '}'); |
710 | 0 | break; |
711 | 0 | case PDF_TOK_INT: |
712 | 0 | fz_append_printf(ctx, fzbuf, "%ld", buf->i); |
713 | 0 | break; |
714 | 0 | case PDF_TOK_REAL: |
715 | 0 | fz_append_printf(ctx, fzbuf, "%g", buf->f); |
716 | 0 | break; |
717 | 0 | default: |
718 | 0 | fz_append_data(ctx, fzbuf, buf->scratch, buf->len); |
719 | 0 | break; |
720 | 0 | } |
721 | 0 | } |