/src/mupdf/source/pdf/pdf-cmap-parse.c
Line | Count | Source |
1 | | // Copyright (C) 2004-2021 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "mupdf/fitz.h" |
24 | | #include "mupdf/pdf.h" |
25 | | |
26 | | #include <string.h> |
27 | | |
28 | | /* |
29 | | * CMap parser |
30 | | */ |
31 | | |
32 | | static int |
33 | | is_keyword(pdf_token tok, pdf_lexbuf *buf, const char *word) |
34 | 36.0k | { |
35 | | /* Ignore trailing garbage when matching keywords */ |
36 | 36.0k | return (tok == PDF_TOK_KEYWORD && !strncmp(buf->scratch, word, strlen(word))); |
37 | 36.0k | } |
38 | | |
39 | | static void |
40 | | skip_to_keyword(fz_context *ctx, fz_stream *file, pdf_lexbuf *buf, const char *end, const char *warn) |
41 | 8 | { |
42 | 8 | fz_warn(ctx, "%s", warn); |
43 | 8 | for (;;) |
44 | 1.50k | { |
45 | 1.50k | pdf_token tok = pdf_lex(ctx, file, buf); |
46 | 1.50k | if (is_keyword(tok, buf, end)) |
47 | 8 | return; |
48 | 1.49k | if (tok == PDF_TOK_ERROR) |
49 | 0 | return; |
50 | 1.49k | if (tok == PDF_TOK_EOF) |
51 | 0 | return; |
52 | 1.49k | } |
53 | 8 | } |
54 | | |
55 | | static void |
56 | | skip_to_token(fz_context *ctx, fz_stream *file, pdf_lexbuf *buf, pdf_token end, const char *warn) |
57 | 0 | { |
58 | 0 | fz_warn(ctx, "%s", warn); |
59 | 0 | for (;;) |
60 | 0 | { |
61 | 0 | pdf_token tok = pdf_lex(ctx, file, buf); |
62 | 0 | if (tok == end) |
63 | 0 | return; |
64 | 0 | if (tok == PDF_TOK_ERROR) |
65 | 0 | return; |
66 | 0 | if (tok == PDF_TOK_EOF) |
67 | 0 | return; |
68 | 0 | } |
69 | 0 | } |
70 | | |
/*
 * Pack the first 'len' bytes of 'buf' into an integer, most significant
 * byte first (the two bytes 0x01 0x02 yield 0x0102).
 *
 * buf: bytes to pack; read only, never written through.
 * len: number of bytes to consume; 0 yields 0 and does not touch buf.
 *
 * Bytes are read as unsigned char so values >= 0x80 are not sign-extended.
 * The accumulator is unsigned: when more bytes are supplied than fit in an
 * unsigned int, the leading bytes are shifted out and only the trailing
 * ones remain in the result.
 */
static int
pdf_code_from_string(const char *buf, size_t len)
{
	const unsigned char *bytes = (const unsigned char *)buf;
	unsigned int code = 0;
	size_t i;

	for (i = 0; i < len; i++)
		code = (code << 8) | bytes[i];

	return code;
}
79 | | |
80 | | static void |
81 | | pdf_parse_cmap_name(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf) |
82 | 2 | { |
83 | 2 | pdf_token tok; |
84 | | |
85 | 2 | tok = pdf_lex(ctx, file, buf); |
86 | | |
87 | 2 | if (tok == PDF_TOK_NAME) |
88 | 2 | fz_strlcpy(cmap->cmap_name, buf->scratch, sizeof(cmap->cmap_name)); |
89 | 0 | else |
90 | 0 | fz_warn(ctx, "expected name after CMapName in cmap"); |
91 | 2 | } |
92 | | |
93 | | static void |
94 | | pdf_parse_wmode(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf) |
95 | 0 | { |
96 | 0 | pdf_token tok; |
97 | |
|
98 | 0 | tok = pdf_lex(ctx, file, buf); |
99 | |
|
100 | 0 | if (tok == PDF_TOK_INT) |
101 | 0 | pdf_set_cmap_wmode(ctx, cmap, buf->i); |
102 | 0 | else |
103 | 0 | fz_warn(ctx, "expected integer after WMode in cmap"); |
104 | 0 | } |
105 | | |
106 | | static void |
107 | | pdf_parse_codespace_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf) |
108 | 2 | { |
109 | 2 | pdf_token tok; |
110 | 2 | int lo, hi; |
111 | | |
112 | 4 | while (1) |
113 | 4 | { |
114 | 4 | tok = pdf_lex(ctx, file, buf); |
115 | | |
116 | 4 | if (is_keyword(tok, buf, "endcodespacerange")) |
117 | 2 | return; |
118 | | |
119 | 2 | else if (tok == PDF_TOK_STRING) |
120 | 2 | { |
121 | 2 | lo = pdf_code_from_string(buf->scratch, buf->len); |
122 | 2 | tok = pdf_lex(ctx, file, buf); |
123 | 2 | if (tok == PDF_TOK_STRING) |
124 | 2 | { |
125 | 2 | hi = pdf_code_from_string(buf->scratch, buf->len); |
126 | 2 | pdf_add_codespace(ctx, cmap, lo, hi, buf->len); |
127 | 2 | } |
128 | 0 | else |
129 | 0 | { |
130 | 0 | skip_to_keyword(ctx, file, buf, "endcodespacerange", "expected string or endcodespacerange"); |
131 | 0 | return; |
132 | 0 | } |
133 | 2 | } |
134 | 0 | else |
135 | 0 | { |
136 | 0 | skip_to_keyword(ctx, file, buf, "endcodespacerange", "expected string or endcodespacerange"); |
137 | 0 | return; |
138 | 0 | } |
139 | 4 | } |
140 | 2 | } |
141 | | |
142 | | static void |
143 | | pdf_parse_cid_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf) |
144 | 0 | { |
145 | 0 | pdf_token tok; |
146 | 0 | int lo, hi, dst; |
147 | |
|
148 | 0 | while (1) |
149 | 0 | { |
150 | 0 | tok = pdf_lex(ctx, file, buf); |
151 | |
|
152 | 0 | if (is_keyword(tok, buf, "endcidrange")) |
153 | 0 | return; |
154 | | |
155 | 0 | else if (tok != PDF_TOK_STRING) |
156 | 0 | { |
157 | 0 | skip_to_keyword(ctx, file, buf, "endcidrange", "expected string or endcidrange"); |
158 | 0 | return; |
159 | 0 | } |
160 | | |
161 | 0 | lo = pdf_code_from_string(buf->scratch, buf->len); |
162 | |
|
163 | 0 | tok = pdf_lex(ctx, file, buf); |
164 | 0 | if (tok != PDF_TOK_STRING) |
165 | 0 | { |
166 | 0 | skip_to_keyword(ctx, file, buf, "endcidrange", "expected string"); |
167 | 0 | return; |
168 | 0 | } |
169 | | |
170 | 0 | hi = pdf_code_from_string(buf->scratch, buf->len); |
171 | |
|
172 | 0 | tok = pdf_lex(ctx, file, buf); |
173 | 0 | if (tok != PDF_TOK_INT) |
174 | 0 | { |
175 | 0 | skip_to_keyword(ctx, file, buf, "endcidrange", "expected integer"); |
176 | 0 | return; |
177 | 0 | } |
178 | | |
179 | 0 | dst = buf->i; |
180 | |
|
181 | 0 | pdf_map_range_to_range(ctx, cmap, lo, hi, dst); |
182 | 0 | } |
183 | 0 | } |
184 | | |
185 | | static void |
186 | | pdf_parse_cid_char(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf) |
187 | 0 | { |
188 | 0 | pdf_token tok; |
189 | 0 | int src, dst; |
190 | |
|
191 | 0 | while (1) |
192 | 0 | { |
193 | 0 | tok = pdf_lex(ctx, file, buf); |
194 | |
|
195 | 0 | if (is_keyword(tok, buf, "endcidchar")) |
196 | 0 | return; |
197 | | |
198 | 0 | else if (tok != PDF_TOK_STRING) |
199 | 0 | { |
200 | 0 | skip_to_keyword(ctx, file, buf, "endcidchar", "expected string or endcidchar"); |
201 | 0 | return; |
202 | 0 | } |
203 | | |
204 | 0 | src = pdf_code_from_string(buf->scratch, buf->len); |
205 | |
|
206 | 0 | tok = pdf_lex(ctx, file, buf); |
207 | 0 | if (tok != PDF_TOK_INT) |
208 | 0 | { |
209 | 0 | skip_to_keyword(ctx, file, buf, "endcidchar", "expected integer"); |
210 | 0 | return; |
211 | 0 | } |
212 | | |
213 | 0 | dst = buf->i; |
214 | |
|
215 | 0 | pdf_map_range_to_range(ctx, cmap, src, src, dst); |
216 | 0 | } |
217 | 0 | } |
218 | | |
219 | | static void |
220 | | pdf_parse_bf_range_array(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf, int lo, int hi) |
221 | 0 | { |
222 | 0 | pdf_token tok; |
223 | 0 | int dst[PDF_MRANGE_CAP]; |
224 | |
|
225 | 0 | while (1) |
226 | 0 | { |
227 | 0 | tok = pdf_lex(ctx, file, buf); |
228 | |
|
229 | 0 | if (tok == PDF_TOK_CLOSE_ARRAY) |
230 | 0 | return; |
231 | | |
232 | | /* Note: does not handle [ /Name /Name ... ] */ |
233 | 0 | else if (tok != PDF_TOK_STRING) |
234 | 0 | { |
235 | 0 | skip_to_token(ctx, file, buf, PDF_TOK_CLOSE_ARRAY, "expected string or ]"); |
236 | 0 | return; |
237 | 0 | } |
238 | | |
239 | 0 | if (buf->len / 2) |
240 | 0 | { |
241 | 0 | size_t i; |
242 | 0 | size_t len = fz_minz(buf->len / 2, nelem(dst)); |
243 | 0 | for (i = 0; i < len; i++) |
244 | 0 | dst[i] = pdf_code_from_string(&buf->scratch[i * 2], 2); |
245 | |
|
246 | 0 | pdf_map_one_to_many(ctx, cmap, lo, dst, i); |
247 | 0 | } |
248 | |
|
249 | 0 | lo ++; |
250 | 0 | } |
251 | 0 | } |
252 | | |
253 | | static void |
254 | | pdf_parse_bf_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf) |
255 | 0 | { |
256 | 0 | pdf_token tok; |
257 | 0 | int lo, hi, dst; |
258 | |
|
259 | 0 | while (1) |
260 | 0 | { |
261 | 0 | tok = pdf_lex(ctx, file, buf); |
262 | |
|
263 | 0 | if (is_keyword(tok, buf, "endbfrange")) |
264 | 0 | return; |
265 | | |
266 | 0 | else if (tok != PDF_TOK_STRING) |
267 | 0 | { |
268 | 0 | skip_to_keyword(ctx, file, buf, "endbfrange", "expected string or endbfrange"); |
269 | 0 | return; |
270 | 0 | } |
271 | | |
272 | 0 | lo = pdf_code_from_string(buf->scratch, buf->len); |
273 | |
|
274 | 0 | tok = pdf_lex(ctx, file, buf); |
275 | 0 | if (tok != PDF_TOK_STRING) |
276 | 0 | { |
277 | 0 | skip_to_keyword(ctx, file, buf, "endbfrange", "expected string"); |
278 | 0 | return; |
279 | 0 | } |
280 | | |
281 | 0 | hi = pdf_code_from_string(buf->scratch, buf->len); |
282 | 0 | if (lo < 0 || lo > 65535 || hi < 0 || hi > 65535 || lo > hi) |
283 | 0 | { |
284 | 0 | skip_to_keyword(ctx, file, buf, "endbfrange", "bfrange limits out of range"); |
285 | 0 | return; |
286 | 0 | } |
287 | | |
288 | 0 | tok = pdf_lex(ctx, file, buf); |
289 | |
|
290 | 0 | if (tok == PDF_TOK_STRING) |
291 | 0 | { |
292 | 0 | if (buf->len == 2) |
293 | 0 | { |
294 | 0 | dst = pdf_code_from_string(buf->scratch, buf->len); |
295 | 0 | pdf_map_range_to_range(ctx, cmap, lo, hi, dst); |
296 | 0 | } |
297 | 0 | else |
298 | 0 | { |
299 | 0 | int dststr[PDF_MRANGE_CAP]; |
300 | 0 | size_t i; |
301 | |
|
302 | 0 | if (buf->len / 2) |
303 | 0 | { |
304 | 0 | size_t len = fz_minz(buf->len / 2, nelem(dststr)); |
305 | 0 | for (i = 0; i < len; i++) |
306 | 0 | dststr[i] = pdf_code_from_string(&buf->scratch[i * 2], 2); |
307 | |
|
308 | 0 | while (lo <= hi) |
309 | 0 | { |
310 | 0 | pdf_map_one_to_many(ctx, cmap, lo, dststr, i); |
311 | 0 | dststr[i-1] ++; |
312 | 0 | lo ++; |
313 | 0 | } |
314 | 0 | } |
315 | 0 | } |
316 | 0 | } |
317 | | |
318 | 0 | else if (tok == PDF_TOK_OPEN_ARRAY) |
319 | 0 | { |
320 | 0 | pdf_parse_bf_range_array(ctx, cmap, file, buf, lo, hi); |
321 | 0 | } |
322 | | |
323 | 0 | else |
324 | 0 | { |
325 | 0 | skip_to_keyword(ctx, file, buf, "endbfrange", "expected string or array or endbfrange"); |
326 | 0 | return; |
327 | 0 | } |
328 | 0 | } |
329 | 0 | } |
330 | | |
331 | | static void |
332 | | pdf_parse_bf_char(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf) |
333 | 330 | { |
334 | 330 | pdf_token tok; |
335 | 330 | int dst[PDF_MRANGE_CAP]; |
336 | 330 | int src; |
337 | | |
338 | 33.0k | while (1) |
339 | 33.0k | { |
340 | 33.0k | tok = pdf_lex(ctx, file, buf); |
341 | | |
342 | 33.0k | if (is_keyword(tok, buf, "endbfchar")) |
343 | 322 | return; |
344 | | |
345 | 32.7k | else if (tok != PDF_TOK_STRING) |
346 | 1 | { |
347 | 1 | skip_to_keyword(ctx, file, buf, "endbfchar", "expected string or endbfchar"); |
348 | 1 | return; |
349 | 1 | } |
350 | | |
351 | 32.7k | src = pdf_code_from_string(buf->scratch, buf->len); |
352 | | |
353 | 32.7k | tok = pdf_lex(ctx, file, buf); |
354 | | /* Note: does not handle /dstName */ |
355 | 32.7k | if (tok != PDF_TOK_STRING) |
356 | 7 | { |
357 | 7 | skip_to_keyword(ctx, file, buf, "endbfchar", "expected string"); |
358 | 7 | return; |
359 | 7 | } |
360 | | |
361 | 32.6k | if (buf->len / 2) |
362 | 32.6k | { |
363 | 32.6k | size_t i; |
364 | 32.6k | size_t len = fz_minz(buf->len / 2, nelem(dst)); |
365 | 71.6k | for (i = 0; i < len; i++) |
366 | 38.9k | dst[i] = pdf_code_from_string(&buf->scratch[i * 2], 2); |
367 | 32.6k | pdf_map_one_to_many(ctx, cmap, src, dst, i); |
368 | 32.6k | } |
369 | 32.6k | } |
370 | 330 | } |
371 | | |
372 | | pdf_cmap * |
373 | | pdf_load_cmap(fz_context *ctx, fz_stream *file) |
374 | 2 | { |
375 | 2 | pdf_cmap *cmap; |
376 | 2 | char key[64]; |
377 | 2 | pdf_lexbuf buf; |
378 | 2 | pdf_token tok; |
379 | | |
380 | 2 | pdf_lexbuf_init(ctx, &buf, PDF_LEXBUF_SMALL); |
381 | 2 | cmap = pdf_new_cmap(ctx); |
382 | | |
383 | 2 | strcpy(key, ".notdef"); |
384 | | |
385 | 4 | fz_try(ctx) |
386 | 4 | { |
387 | 724 | while (1) |
388 | 724 | { |
389 | 724 | tok = pdf_lex(ctx, file, &buf); |
390 | | |
391 | 724 | if (tok == PDF_TOK_EOF) |
392 | 0 | break; |
393 | | |
394 | 724 | else if (tok == PDF_TOK_NAME) |
395 | 16 | { |
396 | 16 | if (!strcmp(buf.scratch, "CMapName")) |
397 | 2 | pdf_parse_cmap_name(ctx, cmap, file, &buf); |
398 | 14 | else if (!strcmp(buf.scratch, "WMode")) |
399 | 0 | pdf_parse_wmode(ctx, cmap, file, &buf); |
400 | 14 | else |
401 | 14 | fz_strlcpy(key, buf.scratch, sizeof key); |
402 | 16 | } |
403 | | |
404 | 708 | else if (tok == PDF_TOK_KEYWORD) |
405 | 364 | { |
406 | 364 | if (is_keyword(tok, &buf, "endcmap")) |
407 | 2 | break; |
408 | | |
409 | 362 | else if (is_keyword(tok, &buf, "usecmap")) |
410 | 0 | fz_strlcpy(cmap->usecmap_name, key, sizeof(cmap->usecmap_name)); |
411 | | |
412 | 362 | else if (is_keyword(tok, &buf, "begincodespacerange")) |
413 | 2 | pdf_parse_codespace_range(ctx, cmap, file, &buf); |
414 | | |
415 | 360 | else if (is_keyword(tok, &buf, "beginbfchar")) |
416 | 330 | pdf_parse_bf_char(ctx, cmap, file, &buf); |
417 | | |
418 | 30 | else if (is_keyword(tok, &buf, "begincidchar")) |
419 | 0 | pdf_parse_cid_char(ctx, cmap, file, &buf); |
420 | | |
421 | 30 | else if (is_keyword(tok, &buf, "beginbfrange")) |
422 | 0 | pdf_parse_bf_range(ctx, cmap, file, &buf); |
423 | | |
424 | 30 | else if (is_keyword(tok, &buf, "begincidrange")) |
425 | 0 | pdf_parse_cid_range(ctx, cmap, file, &buf); |
426 | 364 | } |
427 | | |
428 | | /* ignore everything else */ |
429 | 724 | } |
430 | | |
431 | 2 | pdf_sort_cmap(ctx, cmap); |
432 | 2 | } |
433 | 4 | fz_always(ctx) |
434 | 2 | { |
435 | 2 | pdf_lexbuf_fin(ctx, &buf); |
436 | 2 | } |
437 | 2 | fz_catch(ctx) |
438 | 0 | { |
439 | 0 | pdf_drop_cmap(ctx, cmap); |
440 | 0 | fz_rethrow(ctx); |
441 | 0 | } |
442 | | |
443 | 2 | return cmap; |
444 | 2 | } |