/src/mupdf/source/pdf/pdf-lex.c
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (C) 2004-2024 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "mupdf/fitz.h" |
24 | | #include "mupdf/pdf.h" |
25 | | |
26 | | #include <string.h> |
27 | | |
/*
 * Character classes expressed as case-label lists.
 *
 * Each macro expands to "first':case'second':case ... 'last" so that it can
 * be dropped straight into a switch statement:
 *
 *	switch (c)
 *	{
 *	case IS_WHITE:
 *		...
 *	}
 *
 * The digit/letter ranges are defined first; the broader classes are then
 * assembled from them.
 */
#define RANGE_0_7 \
	'0':case'1':case'2':case'3':case'4':case'5':case'6':case'7'
#define RANGE_0_9 \
	RANGE_0_7:case'8':case'9'
#define RANGE_a_f \
	'a':case'b':case'c':case'd':case'e':case'f'
#define RANGE_A_F \
	'A':case'B':case'C':case'D':case'E':case'F'

/* Any byte that can start a number: sign, decimal point or digit. */
#define IS_NUMBER \
	'+':case'-':case'.':case RANGE_0_9
/* NUL, tab, line feed, form feed, carriage return and space. */
#define IS_WHITE \
	'\0':case'\t':case'\n':case'\f':case'\r':case' '
/* Hexadecimal digits in either letter case. */
#define IS_HEX \
	RANGE_0_9:case RANGE_A_F:case RANGE_a_f
/* Bytes that terminate a name, number or keyword. */
#define IS_DELIM \
	'(':case')':case'<':case'>':case'[':case']':\
	case'{':case'}':case'/':case'%'
51 | | |
/* #define DUMP_LEXER_STREAM */
#ifndef DUMP_LEXER_STREAM
/* Normal build: reading a lexer byte is just a stream read. */
#define lex_byte(C,S) fz_read_byte(C,S)
#else
/*
 * Debug build: read one byte from the stream and echo it to stdout.
 * Bytes in the range 32..127 are echoed verbatim, EOF is shown as "<EOF>"
 * and everything else as a two digit hex code in angle brackets.
 */
static inline int lex_byte(fz_context *ctx, fz_stream *stm)
{
	int c = fz_read_byte(ctx, stm);

	if (c == EOF)
	{
		fz_write_printf(ctx, fz_stdout(ctx), "<EOF>");
		return c;
	}

	if (c < 32 || c >= 128)
		fz_write_printf(ctx, fz_stdout(ctx), "<%02x>", c);
	else
		fz_write_printf(ctx, fz_stdout(ctx), "%c", c);

	return c;
}
#endif
69 | | |
/*
 * Return non-zero if ch is one of the six whitespace bytes:
 * NUL, tab, line feed, form feed, carriage return or space.
 */
static inline int iswhite(int ch)
{
	switch (ch)
	{
	case 0x00:
	case 0x09:
	case 0x0a:
	case 0x0c:
	case 0x0d:
	case 0x20:
		return 1;
	default:
		return 0;
	}
}
80 | | |
/* Return non-zero if ch lies in the printable ASCII range ' ' .. '~'. */
static inline int fz_isprint(int ch)
{
	if (ch < ' ' || ch > '~')
		return 0;
	return 1;
}
85 | | |
/*
 * Convert one hexadecimal digit (either letter case) to its value 0..15.
 * Any other input yields 0.
 */
static inline int unhex(int ch)
{
	int value = 0;

	if (ch >= 'a' && ch <= 'f')
		value = 10 + (ch - 'a');
	else if (ch >= 'A' && ch <= 'F')
		value = 10 + (ch - 'A');
	else if (ch >= '0' && ch <= '9')
		value = ch - '0';

	return value;
}
93 | | |
94 | | static void |
95 | | lex_white(fz_context *ctx, fz_stream *f) |
96 | 441k | { |
97 | 441k | int c; |
98 | 838k | do { |
99 | 838k | c = lex_byte(ctx, f); |
100 | 838k | } while ((c <= 32) && (iswhite(c))); |
101 | 441k | if (c != EOF) |
102 | 441k | fz_unread_byte(ctx, f); |
103 | 441k | } |
104 | | |
105 | | static void |
106 | | lex_comment(fz_context *ctx, fz_stream *f) |
107 | 4.67k | { |
108 | 4.67k | int c; |
109 | 1.10M | do { |
110 | 1.10M | c = lex_byte(ctx, f); |
111 | 1.10M | } while ((c != '\012') && (c != '\015') && (c != EOF)); |
112 | 4.67k | } |
113 | | |
/*
 * Fast(ish) but inaccurate strtof, with Adobe overflow handling.
 *
 * Accepted syntax: any number of '-' signs, then any number of '+' signs,
 * then decimal digits, optionally followed by '.' and more decimal digits.
 * Parsing stops at the first byte that does not fit; there is no exponent
 * support.
 *
 * s: zero terminated string to convert.
 * Returns the parsed value; an input with no digits yields 0.
 *
 * The integer part deliberately wraps on overflow instead of saturating
 * (this is the "Adobe overflow handling"). The wrap is performed in
 * unsigned arithmetic, where it is well defined, rather than relying on
 * signed overflow, which is undefined behaviour in C. The final conversion
 * back to int is implementation-defined for values above INT_MAX; common
 * compilers reduce it modulo 2^N, which matches a wrapping signed
 * accumulator.
 */
static float acrobat_compatible_atof(char *s)
{
	/*
	 * A float can hold values up to roughly 3.4e38, so at most 38
	 * fractional digits can be accumulated before both the numerator and
	 * the denominator overflow to infinity and the quotient becomes NaN.
	 * Digits beyond that are far below float precision; they are skipped.
	 */
	enum { MAX_FRACTION_DIGITS = 38 };
	unsigned int whole = 0;
	int neg = 0;

	while (*s == '-')
	{
		neg = 1;
		++s;
	}
	while (*s == '+')
	{
		++s;
	}

	while (*s >= '0' && *s <= '9')
	{
		/* We deliberately ignore overflow here; the value wraps. */
		whole = whole * 10u + (unsigned int)(*s - '0');
		++s;
	}

	if (*s == '.')
	{
		float v = (float)(int)whole;
		float n = 0;
		float d = 1;
		int digits = 0;

		++s;
		while (*s >= '0' && *s <= '9')
		{
			if (digits < MAX_FRACTION_DIGITS)
			{
				n = 10 * n + (*s - '0');
				d = 10 * d;
				++digits;
			}
			++s;
		}
		v += n / d;
		return neg ? -v : v;
	}

	/* Negate while still unsigned so that INT_MIN cannot overflow. */
	if (neg)
		whole = 0u - whole;
	return (float)(int)whole;
}
160 | | |
/*
 * Fast but inaccurate atoi.
 *
 * Accepted syntax: any number of '-' signs, then any number of '+' signs,
 * then decimal digits. Parsing stops at the first other byte.
 *
 * s: zero terminated string to convert.
 * Returns the parsed value; an input with no digits yields 0.
 *
 * Overflow is deliberately not reported: the value wraps. The accumulation
 * and the negation are done in uint64_t, where wrapping is well defined,
 * instead of int64_t, where overflow is undefined behaviour. Converting a
 * result above INT64_MAX back to int64_t is implementation-defined; common
 * compilers reduce it modulo 2^64.
 */
static int64_t fast_atoi(char *s)
{
	uint64_t acc = 0;
	int neg = 0;

	while (*s == '-')
	{
		neg = 1;
		++s;
	}
	while (*s == '+')
	{
		++s;
	}

	while (*s >= '0' && *s <= '9')
	{
		/* We deliberately ignore overflow here. */
		acc = acc * 10u + (uint64_t)(*s - '0');
		++s;
	}

	/* Negating as unsigned keeps "-9223372036854775808" well defined. */
	if (neg)
		acc = 0u - acc;

	return (int64_t)acc;
}
186 | | |
187 | | static int |
188 | | lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c) |
189 | 160k | { |
190 | 160k | char *s = buf->scratch; |
191 | 160k | char *e = buf->scratch + buf->size - 1; /* leave space for zero terminator */ |
192 | 160k | char *isreal = (c == '.' ? s : NULL); |
193 | 160k | int neg = (c == '-'); |
194 | 160k | int isbad = 0; |
195 | | |
196 | 160k | *s++ = c; |
197 | | |
198 | 160k | c = lex_byte(ctx, f); |
199 | | |
200 | | /* skip extra '-' signs at start of number */ |
201 | 160k | if (neg) |
202 | 802 | { |
203 | 804 | while (c == '-') |
204 | 2 | c = lex_byte(ctx, f); |
205 | 802 | } |
206 | | |
207 | 417k | while (s < e) |
208 | 417k | { |
209 | 417k | switch (c) |
210 | 417k | { |
211 | 291k | case IS_WHITE: |
212 | 291k | case IS_DELIM: |
213 | 160k | fz_unread_byte(ctx, f); |
214 | 160k | goto end; |
215 | 0 | case EOF: |
216 | 0 | goto end; |
217 | 2.38k | case '.': |
218 | 2.38k | if (isreal) |
219 | 89 | isbad = 1; |
220 | 2.38k | isreal = s; |
221 | 2.38k | *s++ = c; |
222 | 2.38k | break; |
223 | 236 | case '-': |
224 | | /* Bug 703248: Some PDFs (particularly those |
225 | | * generated by google docs) apparently have |
226 | | * numbers like 0.000000000000-5684342 in them. |
227 | | * We'll stop our interpretation at the -, but |
228 | | * keep reading to skip over the trailing |
229 | | * digits so they aren't parsed later. */ |
230 | 236 | *s++ = '\0'; |
231 | 236 | break; |
232 | 204k | case RANGE_0_9: |
233 | 204k | *s++ = c; |
234 | 204k | break; |
235 | 50.6k | default: |
236 | 50.6k | isbad = 1; |
237 | 50.6k | *s++ = c; |
238 | 50.6k | break; |
239 | 417k | } |
240 | 257k | c = lex_byte(ctx, f); |
241 | 257k | } |
242 | | |
243 | 160k | end: |
244 | 160k | *s = '\0'; |
245 | 160k | if (isbad) |
246 | 3.51k | return PDF_TOK_KEYWORD; |
247 | 156k | if (isreal) |
248 | 6.43k | { |
249 | | /* We'd like to use the fastest possible atof |
250 | | * routine, but we'd rather match acrobats |
251 | | * handling of broken numbers. As such, we |
252 | | * spot common broken cases and call an |
253 | | * acrobat compatible routine where required. */ |
254 | 6.43k | if (neg > 1 || isreal - buf->scratch >= 10) |
255 | 0 | buf->f = acrobat_compatible_atof(buf->scratch); |
256 | 6.43k | else |
257 | 6.43k | buf->f = fz_atof(buf->scratch); |
258 | 6.43k | return PDF_TOK_REAL; |
259 | 6.43k | } |
260 | 150k | else |
261 | 150k | { |
262 | 150k | buf->i = fast_atoi(buf->scratch); |
263 | 150k | return PDF_TOK_INT; |
264 | 150k | } |
265 | 156k | } |
266 | | |
/*
 * Lex the body of a name (or, when called from the keyword path, a bare
 * word) into lb->scratch.
 *
 * Bytes are collected until whitespace or a delimiter (which is pushed
 * back) or EOF. A '#' followed by two hex digits is decoded into the byte
 * it encodes; a '#' that is not followed by two hex digits, and the
 * sequence "#00", are kept literally.
 *
 * At most 127 bytes are stored. Longer names are truncated with a warning:
 * the remaining bytes are still consumed from the stream but discarded.
 *
 * On return lb->scratch is zero terminated and lb->len holds the number of
 * bytes stored.
 */
static void
lex_name(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
{
	char *s = lb->scratch;
	char *e = s + fz_minz(127, lb->size);
	int c;

	while (1)
	{
		if (s == e)
		{
			if (e - lb->scratch < 127)
			{
				/* The buffer is smaller than the name limit: enlarge it. */
				s += pdf_lexbuf_grow(ctx, lb);
				e = lb->scratch + fz_minz(127, lb->size);
			}
			else
			{
				/* truncate names that are too long */
				fz_warn(ctx, "name is too long");
				*s = 0;
				lb->len = s - lb->scratch;
				/* From here on s == NULL means "consume but do not store". */
				s = NULL;
			}
		}
		c = lex_byte(ctx, f);
		switch (c)
		{
		case IS_WHITE:
		case IS_DELIM:
			fz_unread_byte(ctx, f);
			goto end;
		case EOF:
			goto end;
		case '#':
		{
			int hex[2];
			int i;
			/* Peek before consuming so that a non-hex byte stays in the stream. */
			for (i = 0; i < 2; i++)
			{
				c = fz_peek_byte(ctx, f);
				switch (c)
				{
				case RANGE_0_9:
					/* "#00" would encode a NUL byte; treat it as illegal. */
					if (i == 1 && c == '0' && hex[0] == 0)
						goto illegal;
					hex[i] = lex_byte(ctx, f) - '0';
					break;
				case RANGE_a_f:
					hex[i] = lex_byte(ctx, f) - 'a' + 10;
					break;
				case RANGE_A_F:
					hex[i] = lex_byte(ctx, f) - 'A' + 10;
					break;
				default:
					goto illegal;
				case EOF:
					goto illegal_eof;
				}
			}
			if (s) *s++ = (hex[0] << 4) + hex[1];
			break;
illegal:
			/* The first digit was already consumed; push it back so it is
			 * lexed as an ordinary byte after the literal '#'. */
			if (i == 1)
				fz_unread_byte(ctx, f);
illegal_eof:
			if (s) *s++ = '#';
			continue;
		}
		default:
			if (s) *s++ = c;
			break;
		}
	}
end:
	/* When the name was truncated the terminator and length are already set. */
	if (s)
	{
		*s = '\0';
		lb->len = s - lb->scratch;
	}
}
348 | | |
349 | | static int |
350 | | lex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb) |
351 | 289 | { |
352 | 289 | char *s = lb->scratch; |
353 | 289 | char *e = s + lb->size; |
354 | 289 | int bal = 1; |
355 | 289 | int oct; |
356 | 289 | int c; |
357 | | |
358 | 3.81k | while (1) |
359 | 3.81k | { |
360 | 3.81k | if (s == e) |
361 | 4 | { |
362 | 4 | s += pdf_lexbuf_grow(ctx, lb); |
363 | 4 | e = lb->scratch + lb->size; |
364 | 4 | } |
365 | 3.81k | c = lex_byte(ctx, f); |
366 | 3.81k | switch (c) |
367 | 3.81k | { |
368 | 2 | case EOF: |
369 | 2 | return PDF_TOK_ERROR; |
370 | 94 | case '(': |
371 | 94 | bal++; |
372 | 94 | *s++ = c; |
373 | 94 | break; |
374 | 339 | case ')': |
375 | 339 | bal --; |
376 | 339 | if (bal == 0) |
377 | 287 | goto end; |
378 | 52 | *s++ = c; |
379 | 52 | break; |
380 | 2 | case '\\': |
381 | 2 | c = lex_byte(ctx, f); |
382 | 2 | switch (c) |
383 | 2 | { |
384 | 0 | case EOF: |
385 | 0 | return PDF_TOK_ERROR; |
386 | 0 | case 'n': |
387 | 0 | *s++ = '\n'; |
388 | 0 | break; |
389 | 0 | case 'r': |
390 | 0 | *s++ = '\r'; |
391 | 0 | break; |
392 | 0 | case 't': |
393 | 0 | *s++ = '\t'; |
394 | 0 | break; |
395 | 0 | case 'b': |
396 | 0 | *s++ = '\b'; |
397 | 0 | break; |
398 | 0 | case 'f': |
399 | 0 | *s++ = '\f'; |
400 | 0 | break; |
401 | 1 | case '(': |
402 | 1 | *s++ = '('; |
403 | 1 | break; |
404 | 1 | case ')': |
405 | 1 | *s++ = ')'; |
406 | 1 | break; |
407 | 0 | case '\\': |
408 | 0 | *s++ = '\\'; |
409 | 0 | break; |
410 | 0 | case RANGE_0_7: |
411 | 0 | oct = c - '0'; |
412 | 0 | c = lex_byte(ctx, f); |
413 | 0 | if (c >= '0' && c <= '7') |
414 | 0 | { |
415 | 0 | oct = oct * 8 + (c - '0'); |
416 | 0 | c = lex_byte(ctx, f); |
417 | 0 | if (c >= '0' && c <= '7') |
418 | 0 | oct = oct * 8 + (c - '0'); |
419 | 0 | else if (c != EOF) |
420 | 0 | fz_unread_byte(ctx, f); |
421 | 0 | } |
422 | 0 | else if (c != EOF) |
423 | 0 | fz_unread_byte(ctx, f); |
424 | 0 | *s++ = oct; |
425 | 0 | break; |
426 | 0 | case '\n': |
427 | 0 | break; |
428 | 0 | case '\r': |
429 | 0 | c = lex_byte(ctx, f); |
430 | 0 | if ((c != '\n') && (c != EOF)) |
431 | 0 | fz_unread_byte(ctx, f); |
432 | 0 | break; |
433 | 0 | default: |
434 | 0 | *s++ = c; |
435 | 2 | } |
436 | 2 | break; |
437 | | /* Bug 708256: PDF 32000-1 says that any occurence of \n, \r, or \r\n in a |
438 | | * (unless escaped with a '\') should be interpreted as a single 0x0a byte. */ |
439 | 13 | case '\n': |
440 | 13 | *s++ = 0x0a; |
441 | 13 | break; |
442 | 77 | case '\r': |
443 | 77 | *s++ = 0x0a; |
444 | 77 | c = lex_byte(ctx, f); |
445 | 77 | if ((c != '\n') && (c != EOF)) |
446 | 3 | fz_unread_byte(ctx, f); |
447 | 77 | break; |
448 | 3.28k | default: |
449 | 3.28k | *s++ = c; |
450 | 3.28k | break; |
451 | 3.81k | } |
452 | 3.81k | } |
453 | 287 | end: |
454 | 287 | lb->len = s - lb->scratch; |
455 | 287 | return PDF_TOK_STRING; |
456 | 289 | } |
457 | | |
458 | | static int |
459 | | lex_hex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb) |
460 | 66.8k | { |
461 | 66.8k | char *s = lb->scratch; |
462 | 66.8k | char *e = s + lb->size; |
463 | 66.8k | int a = 0, x = 0; |
464 | 66.8k | int c; |
465 | | |
466 | 360k | while (1) |
467 | 360k | { |
468 | 360k | if (s == e) |
469 | 0 | { |
470 | 0 | s += pdf_lexbuf_grow(ctx, lb); |
471 | 0 | e = lb->scratch + lb->size; |
472 | 0 | } |
473 | 360k | c = lex_byte(ctx, f); |
474 | 360k | switch (c) |
475 | 360k | { |
476 | 0 | case IS_WHITE: |
477 | 0 | break; |
478 | 0 | default: |
479 | 0 | fz_warn(ctx, "invalid character in hex string"); |
480 | | /* fall through */ |
481 | 293k | case IS_HEX: |
482 | 293k | if (x) |
483 | 146k | { |
484 | 146k | *s++ = a * 16 + unhex(c); |
485 | 146k | x = !x; |
486 | 146k | } |
487 | 146k | else |
488 | 146k | { |
489 | 146k | a = unhex(c); |
490 | 146k | x = !x; |
491 | 146k | } |
492 | 293k | break; |
493 | 66.8k | case '>': |
494 | 66.8k | if (x) |
495 | 1 | { |
496 | 1 | *s++ = a * 16; /* pad truncated string with '0' */ |
497 | 1 | } |
498 | 66.8k | goto end; |
499 | 0 | case EOF: |
500 | 0 | return PDF_TOK_ERROR; |
501 | 360k | } |
502 | 360k | } |
503 | 66.8k | end: |
504 | 66.8k | lb->len = s - lb->scratch; |
505 | 66.8k | return PDF_TOK_STRING; |
506 | 66.8k | } |
507 | | |
508 | | static pdf_token |
509 | | pdf_token_from_keyword(char *key) |
510 | 218k | { |
511 | 218k | switch (*key) |
512 | 218k | { |
513 | 22.1k | case 'R': |
514 | 22.1k | if (!strcmp(key, "R")) return PDF_TOK_R; |
515 | 265 | break; |
516 | 6.83k | case 't': |
517 | 6.83k | if (!strcmp(key, "true")) return PDF_TOK_TRUE; |
518 | 6.80k | if (!strcmp(key, "trailer")) return PDF_TOK_TRAILER; |
519 | 6.80k | break; |
520 | 6.80k | case 'f': |
521 | 1.35k | if (!strcmp(key, "false")) return PDF_TOK_FALSE; |
522 | 946 | break; |
523 | 4.66k | case 'n': |
524 | 4.66k | if (!strcmp(key, "null")) return PDF_TOK_NULL; |
525 | 4.66k | if (!strcmp(key, "newobj")) return PDF_TOK_NEWOBJ; |
526 | 4.66k | break; |
527 | 7.30k | case 'o': |
528 | 7.30k | if (!strcmp(key, "obj")) return PDF_TOK_OBJ; |
529 | 5.97k | break; |
530 | 8.77k | case 'e': |
531 | 8.77k | if (!strcmp(key, "endobj")) return PDF_TOK_ENDOBJ; |
532 | 7.80k | if (!strcmp(key, "endstream")) return PDF_TOK_ENDSTREAM; |
533 | 7.51k | break; |
534 | 8.58k | case 's': |
535 | 8.58k | if (!strcmp(key, "stream")) return PDF_TOK_STREAM; |
536 | 8.10k | if (!strcmp(key, "startxref")) return PDF_TOK_STARTXREF; |
537 | 8.10k | break; |
538 | 8.10k | case 'x': |
539 | 579 | if (!strcmp(key, "xref")) return PDF_TOK_XREF; |
540 | 579 | break; |
541 | 218k | } |
542 | | |
543 | 397k | while (*key) |
544 | 283k | { |
545 | 283k | if (!fz_isprint(*key)) |
546 | 79.2k | return PDF_TOK_ERROR; |
547 | 204k | ++key; |
548 | 204k | } |
549 | | |
550 | 114k | return PDF_TOK_KEYWORD; |
551 | 193k | } |
552 | | |
553 | | void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size) |
554 | 23 | { |
555 | 23 | lb->size = lb->base_size = size; |
556 | 23 | lb->len = 0; |
557 | 23 | lb->scratch = &lb->buffer[0]; |
558 | 23 | } |
559 | | |
560 | | void pdf_lexbuf_fin(fz_context *ctx, pdf_lexbuf *lb) |
561 | 23 | { |
562 | 23 | if (lb && lb->size != lb->base_size) |
563 | 2 | fz_free(ctx, lb->scratch); |
564 | 23 | } |
565 | | |
566 | | ptrdiff_t pdf_lexbuf_grow(fz_context *ctx, pdf_lexbuf *lb) |
567 | 4 | { |
568 | 4 | char *old = lb->scratch; |
569 | 4 | size_t newsize = lb->size * 2; |
570 | 4 | if (lb->size == lb->base_size) |
571 | 2 | { |
572 | 2 | lb->scratch = Memento_label(fz_malloc(ctx, newsize), "pdf_lexbuf"); |
573 | 2 | memcpy(lb->scratch, lb->buffer, lb->size); |
574 | 2 | } |
575 | 2 | else |
576 | 2 | { |
577 | 2 | lb->scratch = fz_realloc(ctx, lb->scratch, newsize); |
578 | 2 | } |
579 | 4 | lb->size = newsize; |
580 | 4 | return lb->scratch - old; |
581 | 4 | } |
582 | | |
583 | | pdf_token |
584 | | pdf_lex(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf) |
585 | 240k | { |
586 | 464k | while (1) |
587 | 464k | { |
588 | 464k | int c = lex_byte(ctx, f); |
589 | 464k | switch (c) |
590 | 464k | { |
591 | 27 | case EOF: |
592 | 27 | return PDF_TOK_EOF; |
593 | 223k | case IS_WHITE: |
594 | 223k | lex_white(ctx, f); |
595 | 223k | break; |
596 | 26 | case '%': |
597 | 26 | lex_comment(ctx, f); |
598 | 26 | break; |
599 | 39.4k | case '/': |
600 | 39.4k | lex_name(ctx, f, buf); |
601 | 39.4k | return PDF_TOK_NAME; |
602 | 289 | case '(': |
603 | 289 | return lex_string(ctx, f, buf); |
604 | 213 | case ')': |
605 | 213 | return PDF_TOK_ERROR; |
606 | 69.8k | case '<': |
607 | 69.8k | c = lex_byte(ctx, f); |
608 | 69.8k | if (c == '<') |
609 | 2.96k | return PDF_TOK_OPEN_DICT; |
610 | 66.8k | if (c != EOF) |
611 | 66.8k | fz_unread_byte(ctx, f); |
612 | 66.8k | return lex_hex_string(ctx, f, buf); |
613 | 2.59k | case '>': |
614 | 2.59k | c = lex_byte(ctx, f); |
615 | 2.59k | if (c == '>') |
616 | 2.59k | return PDF_TOK_CLOSE_DICT; |
617 | 0 | if (c != EOF) |
618 | 0 | fz_unread_byte(ctx, f); |
619 | 0 | return PDF_TOK_ERROR; |
620 | 1.75k | case '[': |
621 | 1.75k | return PDF_TOK_OPEN_ARRAY; |
622 | 1.71k | case ']': |
623 | 1.71k | return PDF_TOK_CLOSE_ARRAY; |
624 | 0 | case '{': |
625 | 0 | return PDF_TOK_OPEN_BRACE; |
626 | 0 | case '}': |
627 | 0 | return PDF_TOK_CLOSE_BRACE; |
628 | 99.3k | case IS_NUMBER: |
629 | 99.3k | return lex_number(ctx, f, buf, c); |
630 | 25.5k | default: /* isregular: !isdelim && !iswhite && c != EOF */ |
631 | 25.5k | fz_unread_byte(ctx, f); |
632 | 25.5k | lex_name(ctx, f, buf); |
633 | 25.5k | return pdf_token_from_keyword(buf->scratch); |
634 | 464k | } |
635 | 464k | } |
636 | 240k | } |
637 | | |
638 | | pdf_token |
639 | | pdf_lex_no_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf) |
640 | 305k | { |
641 | 527k | while (1) |
642 | 527k | { |
643 | 527k | int c = lex_byte(ctx, f); |
644 | 527k | switch (c) |
645 | 527k | { |
646 | 3 | case EOF: |
647 | 3 | return PDF_TOK_EOF; |
648 | 217k | case IS_WHITE: |
649 | 217k | lex_white(ctx, f); |
650 | 217k | break; |
651 | 4.64k | case '%': |
652 | 4.64k | lex_comment(ctx, f); |
653 | 4.64k | break; |
654 | 7.20k | case '/': |
655 | 7.20k | lex_name(ctx, f, buf); |
656 | 7.20k | return PDF_TOK_NAME; |
657 | 4.61k | case '(': |
658 | 4.61k | return PDF_TOK_ERROR; /* no strings allowed */ |
659 | 4.64k | case ')': |
660 | 4.64k | return PDF_TOK_ERROR; /* no strings allowed */ |
661 | 7.33k | case '<': |
662 | 7.33k | c = lex_byte(ctx, f); |
663 | 7.33k | if (c == '<') |
664 | 16 | return PDF_TOK_OPEN_DICT; |
665 | 7.31k | if (c != EOF) |
666 | 7.31k | fz_unread_byte(ctx, f); |
667 | 7.31k | return PDF_TOK_ERROR; /* no strings allowed */ |
668 | 9.05k | case '>': |
669 | 9.05k | c = lex_byte(ctx, f); |
670 | 9.05k | if (c == '>') |
671 | 50 | return PDF_TOK_CLOSE_DICT; |
672 | 9.00k | if (c != EOF) |
673 | 9.00k | fz_unread_byte(ctx, f); |
674 | 9.00k | return PDF_TOK_ERROR; |
675 | 4.55k | case '[': |
676 | 4.55k | return PDF_TOK_OPEN_ARRAY; |
677 | 4.67k | case ']': |
678 | 4.67k | return PDF_TOK_CLOSE_ARRAY; |
679 | 4.56k | case '{': |
680 | 4.56k | return PDF_TOK_OPEN_BRACE; |
681 | 4.60k | case '}': |
682 | 4.60k | return PDF_TOK_CLOSE_BRACE; |
683 | 60.7k | case IS_NUMBER: |
684 | 60.7k | return lex_number(ctx, f, buf, c); |
685 | 193k | default: /* isregular: !isdelim && !iswhite && c != EOF */ |
686 | 193k | fz_unread_byte(ctx, f); |
687 | 193k | lex_name(ctx, f, buf); |
688 | 193k | return pdf_token_from_keyword(buf->scratch); |
689 | 527k | } |
690 | 527k | } |
691 | 305k | } |
692 | | |
693 | | void pdf_append_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf) |
694 | 0 | { |
695 | 0 | switch (tok) |
696 | 0 | { |
697 | 0 | case PDF_TOK_NAME: |
698 | 0 | fz_append_printf(ctx, fzbuf, "/%s", buf->scratch); |
699 | 0 | break; |
700 | 0 | case PDF_TOK_STRING: |
701 | 0 | if (buf->len >= buf->size) |
702 | 0 | pdf_lexbuf_grow(ctx, buf); |
703 | 0 | buf->scratch[buf->len] = 0; |
704 | 0 | fz_append_pdf_string(ctx, fzbuf, buf->scratch); |
705 | 0 | break; |
706 | 0 | case PDF_TOK_OPEN_DICT: |
707 | 0 | fz_append_string(ctx, fzbuf, "<<"); |
708 | 0 | break; |
709 | 0 | case PDF_TOK_CLOSE_DICT: |
710 | 0 | fz_append_string(ctx, fzbuf, ">>"); |
711 | 0 | break; |
712 | 0 | case PDF_TOK_OPEN_ARRAY: |
713 | 0 | fz_append_byte(ctx, fzbuf, '['); |
714 | 0 | break; |
715 | 0 | case PDF_TOK_CLOSE_ARRAY: |
716 | 0 | fz_append_byte(ctx, fzbuf, ']'); |
717 | 0 | break; |
718 | 0 | case PDF_TOK_OPEN_BRACE: |
719 | 0 | fz_append_byte(ctx, fzbuf, '{'); |
720 | 0 | break; |
721 | 0 | case PDF_TOK_CLOSE_BRACE: |
722 | 0 | fz_append_byte(ctx, fzbuf, '}'); |
723 | 0 | break; |
724 | 0 | case PDF_TOK_INT: |
725 | 0 | fz_append_printf(ctx, fzbuf, "%ld", buf->i); |
726 | 0 | break; |
727 | 0 | case PDF_TOK_REAL: |
728 | 0 | fz_append_printf(ctx, fzbuf, "%g", buf->f); |
729 | 0 | break; |
730 | 0 | default: |
731 | 0 | fz_append_data(ctx, fzbuf, buf->scratch, buf->len); |
732 | 0 | break; |
733 | 0 | } |
734 | 0 | } |