Coverage Report

Created: 2026-06-30 07:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/mupdf/source/pdf/pdf-cmap-parse.c
Line
Count
Source
1
// Copyright (C) 2004-2021 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "mupdf/fitz.h"
24
#include "mupdf/pdf.h"
25
26
#include <string.h>
27
28
/*
29
 * CMap parser
30
 */
31
32
static int
33
is_keyword(pdf_token tok, pdf_lexbuf *buf, const char *word)
34
45.8k
{
35
  /* Ignore trailing garbage when matching keywords */
36
45.8k
  return (tok == PDF_TOK_KEYWORD && !strncmp(buf->scratch, word, strlen(word)));
37
45.8k
}
38
39
static void
40
skip_to_keyword(fz_context *ctx, fz_stream *file, pdf_lexbuf *buf, const char *end, const char *warn)
41
1
{
42
1
  fz_warn(ctx, "%s", warn);
43
1
  for (;;)
44
2
  {
45
2
    pdf_token tok = pdf_lex(ctx, file, buf);
46
2
    if (is_keyword(tok, buf, end))
47
1
      return;
48
1
    if (tok == PDF_TOK_ERROR)
49
0
      return;
50
1
    if (tok == PDF_TOK_EOF)
51
0
      return;
52
1
  }
53
1
}
54
55
static void
56
skip_to_token(fz_context *ctx, fz_stream *file, pdf_lexbuf *buf, pdf_token end, const char *warn)
57
0
{
58
0
  fz_warn(ctx, "%s", warn);
59
0
  for (;;)
60
0
  {
61
0
    pdf_token tok = pdf_lex(ctx, file, buf);
62
0
    if (tok == end)
63
0
      return;
64
0
    if (tok == PDF_TOK_ERROR)
65
0
      return;
66
0
    if (tok == PDF_TOK_EOF)
67
0
      return;
68
0
  }
69
0
}
70
71
/*
 * Interpret 'len' bytes at 'buf' as a big-endian unsigned number.
 *
 * The value is accumulated in an unsigned int, so when more bytes are
 * supplied than fit, the earliest (most significant) ones are shifted out.
 * The accumulated value is returned converted to int.
 */
static int
pdf_code_from_string(char *buf, size_t len)
{
	const unsigned char *bytes = (const unsigned char *)buf;
	unsigned int code = 0;
	size_t k;

	for (k = 0; k < len; k++)
		code = (code << 8) | bytes[k];

	return code;
}
79
80
static void
81
pdf_parse_cmap_name(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
82
206
{
83
206
  pdf_token tok;
84
85
206
  tok = pdf_lex(ctx, file, buf);
86
87
206
  if (tok == PDF_TOK_NAME)
88
206
    fz_strlcpy(cmap->cmap_name, buf->scratch, sizeof(cmap->cmap_name));
89
0
  else
90
0
    fz_warn(ctx, "expected name after CMapName in cmap");
91
206
}
92
93
static void
94
pdf_parse_wmode(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
95
1
{
96
1
  pdf_token tok;
97
98
1
  tok = pdf_lex(ctx, file, buf);
99
100
1
  if (tok == PDF_TOK_INT)
101
1
    pdf_set_cmap_wmode(ctx, cmap, buf->i);
102
0
  else
103
0
    fz_warn(ctx, "expected integer after WMode in cmap");
104
1
}
105
106
static void
107
pdf_parse_codespace_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
108
206
{
109
206
  pdf_token tok;
110
206
  int lo, hi;
111
112
412
  while (1)
113
412
  {
114
412
    tok = pdf_lex(ctx, file, buf);
115
116
412
    if (is_keyword(tok, buf, "endcodespacerange"))
117
206
      return;
118
119
206
    else if (tok == PDF_TOK_STRING)
120
206
    {
121
206
      lo = pdf_code_from_string(buf->scratch, buf->len);
122
206
      tok = pdf_lex(ctx, file, buf);
123
206
      if (tok == PDF_TOK_STRING)
124
206
      {
125
206
        hi = pdf_code_from_string(buf->scratch, buf->len);
126
206
        pdf_add_codespace(ctx, cmap, lo, hi, buf->len);
127
206
      }
128
0
      else
129
0
      {
130
0
        skip_to_keyword(ctx, file, buf, "endcodespacerange", "expected string or endcodespacerange");
131
0
        return;
132
0
      }
133
206
    }
134
0
    else
135
0
    {
136
0
      skip_to_keyword(ctx, file, buf, "endcodespacerange", "expected string or endcodespacerange");
137
0
      return;
138
0
    }
139
412
  }
140
206
}
141
142
static void
143
pdf_parse_cid_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
144
0
{
145
0
  pdf_token tok;
146
0
  int lo, hi, dst;
147
148
0
  while (1)
149
0
  {
150
0
    tok = pdf_lex(ctx, file, buf);
151
152
0
    if (is_keyword(tok, buf, "endcidrange"))
153
0
      return;
154
155
0
    else if (tok != PDF_TOK_STRING)
156
0
    {
157
0
      skip_to_keyword(ctx, file, buf, "endcidrange", "expected string or endcidrange");
158
0
      return;
159
0
    }
160
161
0
    lo = pdf_code_from_string(buf->scratch, buf->len);
162
163
0
    tok = pdf_lex(ctx, file, buf);
164
0
    if (tok != PDF_TOK_STRING)
165
0
    {
166
0
      skip_to_keyword(ctx, file, buf, "endcidrange", "expected string");
167
0
      return;
168
0
    }
169
170
0
    hi = pdf_code_from_string(buf->scratch, buf->len);
171
172
0
    tok = pdf_lex(ctx, file, buf);
173
0
    if (tok != PDF_TOK_INT)
174
0
    {
175
0
      skip_to_keyword(ctx, file, buf, "endcidrange", "expected integer");
176
0
      return;
177
0
    }
178
179
0
    dst = buf->i;
180
181
0
    pdf_map_range_to_range(ctx, cmap, lo, hi, dst);
182
0
  }
183
0
}
184
185
static void
186
pdf_parse_cid_char(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
187
0
{
188
0
  pdf_token tok;
189
0
  int src, dst;
190
191
0
  while (1)
192
0
  {
193
0
    tok = pdf_lex(ctx, file, buf);
194
195
0
    if (is_keyword(tok, buf, "endcidchar"))
196
0
      return;
197
198
0
    else if (tok != PDF_TOK_STRING)
199
0
    {
200
0
      skip_to_keyword(ctx, file, buf, "endcidchar", "expected string or endcidchar");
201
0
      return;
202
0
    }
203
204
0
    src = pdf_code_from_string(buf->scratch, buf->len);
205
206
0
    tok = pdf_lex(ctx, file, buf);
207
0
    if (tok != PDF_TOK_INT)
208
0
    {
209
0
      skip_to_keyword(ctx, file, buf, "endcidchar", "expected integer");
210
0
      return;
211
0
    }
212
213
0
    dst = buf->i;
214
215
0
    pdf_map_range_to_range(ctx, cmap, src, src, dst);
216
0
  }
217
0
}
218
219
static void
220
pdf_parse_bf_range_array(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf, int lo, int hi)
221
0
{
222
0
  pdf_token tok;
223
0
  int dst[PDF_MRANGE_CAP];
224
225
0
  while (1)
226
0
  {
227
0
    tok = pdf_lex(ctx, file, buf);
228
229
0
    if (tok == PDF_TOK_CLOSE_ARRAY)
230
0
      return;
231
232
    /* Note: does not handle [ /Name /Name ... ] */
233
0
    else if (tok != PDF_TOK_STRING)
234
0
    {
235
0
      skip_to_token(ctx, file, buf, PDF_TOK_CLOSE_ARRAY, "expected string or ]");
236
0
      return;
237
0
    }
238
239
0
    if (buf->len / 2)
240
0
    {
241
0
      size_t i;
242
0
      size_t len = fz_minz(buf->len / 2, nelem(dst));
243
0
      for (i = 0; i < len; i++)
244
0
        dst[i] = pdf_code_from_string(&buf->scratch[i * 2], 2);
245
246
0
      pdf_map_one_to_many(ctx, cmap, lo, dst, i);
247
0
    }
248
249
0
    lo ++;
250
0
  }
251
0
}
252
253
static void
254
pdf_parse_bf_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
255
78
{
256
78
  pdf_token tok;
257
78
  int lo, hi, dst;
258
259
1.67k
  while (1)
260
1.67k
  {
261
1.67k
    tok = pdf_lex(ctx, file, buf);
262
263
1.67k
    if (is_keyword(tok, buf, "endbfrange"))
264
77
      return;
265
266
1.60k
    else if (tok != PDF_TOK_STRING)
267
0
    {
268
0
      skip_to_keyword(ctx, file, buf, "endbfrange", "expected string or endbfrange");
269
0
      return;
270
0
    }
271
272
1.60k
    lo = pdf_code_from_string(buf->scratch, buf->len);
273
274
1.60k
    tok = pdf_lex(ctx, file, buf);
275
1.60k
    if (tok != PDF_TOK_STRING)
276
0
    {
277
0
      skip_to_keyword(ctx, file, buf, "endbfrange", "expected string");
278
0
      return;
279
0
    }
280
281
1.60k
    hi = pdf_code_from_string(buf->scratch, buf->len);
282
1.60k
    if (lo < 0 || lo > 65535 || hi < 0 || hi > 65535 || lo > hi)
283
1
    {
284
1
      skip_to_keyword(ctx, file, buf, "endbfrange", "bfrange limits out of range");
285
1
      return;
286
1
    }
287
288
1.60k
    tok = pdf_lex(ctx, file, buf);
289
290
1.60k
    if (tok == PDF_TOK_STRING)
291
1.60k
    {
292
1.60k
      if (buf->len == 2)
293
1.60k
      {
294
1.60k
        dst = pdf_code_from_string(buf->scratch, buf->len);
295
1.60k
        pdf_map_range_to_range(ctx, cmap, lo, hi, dst);
296
1.60k
      }
297
0
      else
298
0
      {
299
0
        int dststr[PDF_MRANGE_CAP];
300
0
        size_t i;
301
302
0
        if (buf->len / 2)
303
0
        {
304
0
          size_t len = fz_minz(buf->len / 2, nelem(dststr));
305
0
          for (i = 0; i < len; i++)
306
0
            dststr[i] = pdf_code_from_string(&buf->scratch[i * 2], 2);
307
308
0
          while (lo <= hi)
309
0
          {
310
0
            pdf_map_one_to_many(ctx, cmap, lo, dststr, i);
311
0
            dststr[i-1] ++;
312
0
            lo ++;
313
0
          }
314
0
        }
315
0
      }
316
1.60k
    }
317
318
0
    else if (tok == PDF_TOK_OPEN_ARRAY)
319
0
    {
320
0
      pdf_parse_bf_range_array(ctx, cmap, file, buf, lo, hi);
321
0
    }
322
323
0
    else
324
0
    {
325
0
      skip_to_keyword(ctx, file, buf, "endbfrange", "expected string or array or endbfrange");
326
0
      return;
327
0
    }
328
1.60k
  }
329
78
}
330
331
static void
332
pdf_parse_bf_char(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
333
341
{
334
341
  pdf_token tok;
335
341
  int dst[PDF_MRANGE_CAP];
336
341
  int src;
337
338
29.4k
  while (1)
339
29.4k
  {
340
29.4k
    tok = pdf_lex(ctx, file, buf);
341
342
29.4k
    if (is_keyword(tok, buf, "endbfchar"))
343
341
      return;
344
345
29.0k
    else if (tok != PDF_TOK_STRING)
346
0
    {
347
0
      skip_to_keyword(ctx, file, buf, "endbfchar", "expected string or endbfchar");
348
0
      return;
349
0
    }
350
351
29.0k
    src = pdf_code_from_string(buf->scratch, buf->len);
352
353
29.0k
    tok = pdf_lex(ctx, file, buf);
354
    /* Note: does not handle /dstName */
355
29.0k
    if (tok != PDF_TOK_STRING)
356
0
    {
357
0
      skip_to_keyword(ctx, file, buf, "endbfchar", "expected string");
358
0
      return;
359
0
    }
360
361
29.0k
    if (buf->len / 2)
362
29.0k
    {
363
29.0k
      size_t i;
364
29.0k
      size_t len = fz_minz(buf->len / 2, nelem(dst));
365
61.8k
      for (i = 0; i < len; i++)
366
32.7k
        dst[i] = pdf_code_from_string(&buf->scratch[i * 2], 2);
367
29.0k
      pdf_map_one_to_many(ctx, cmap, src, dst, i);
368
29.0k
    }
369
29.0k
  }
370
341
}
371
372
pdf_cmap *
373
pdf_load_cmap(fz_context *ctx, fz_stream *file)
374
206
{
375
206
  pdf_cmap *cmap;
376
206
  char key[64];
377
206
  pdf_lexbuf buf;
378
206
  pdf_token tok;
379
380
206
  pdf_lexbuf_init(ctx, &buf, PDF_LEXBUF_SMALL);
381
206
  cmap = pdf_new_cmap(ctx);
382
383
206
  strcpy(key, ".notdef");
384
385
412
  fz_try(ctx)
386
412
  {
387
6.19k
    while (1)
388
6.19k
    {
389
6.19k
      tok = pdf_lex(ctx, file, &buf);
390
391
6.19k
      if (tok == PDF_TOK_EOF)
392
0
        break;
393
394
6.19k
      else if (tok == PDF_TOK_NAME)
395
1.63k
      {
396
1.63k
        if (!strcmp(buf.scratch, "CMapName"))
397
206
          pdf_parse_cmap_name(ctx, cmap, file, &buf);
398
1.43k
        else if (!strcmp(buf.scratch, "WMode"))
399
1
          pdf_parse_wmode(ctx, cmap, file, &buf);
400
1.43k
        else
401
1.43k
          fz_strlcpy(key, buf.scratch, sizeof key);
402
1.63k
      }
403
404
4.55k
      else if (tok == PDF_TOK_KEYWORD)
405
2.49k
      {
406
2.49k
        if (is_keyword(tok, &buf, "endcmap"))
407
206
          break;
408
409
2.29k
        else if (is_keyword(tok, &buf, "usecmap"))
410
0
          fz_strlcpy(cmap->usecmap_name, key, sizeof(cmap->usecmap_name));
411
412
2.29k
        else if (is_keyword(tok, &buf, "begincodespacerange"))
413
206
          pdf_parse_codespace_range(ctx, cmap, file, &buf);
414
415
2.08k
        else if (is_keyword(tok, &buf, "beginbfchar"))
416
341
          pdf_parse_bf_char(ctx, cmap, file, &buf);
417
418
1.74k
        else if (is_keyword(tok, &buf, "begincidchar"))
419
0
          pdf_parse_cid_char(ctx, cmap, file, &buf);
420
421
1.74k
        else if (is_keyword(tok, &buf, "beginbfrange"))
422
78
          pdf_parse_bf_range(ctx, cmap, file, &buf);
423
424
1.66k
        else if (is_keyword(tok, &buf, "begincidrange"))
425
0
          pdf_parse_cid_range(ctx, cmap, file, &buf);
426
2.49k
      }
427
428
      /* ignore everything else */
429
6.19k
    }
430
431
206
    pdf_sort_cmap(ctx, cmap);
432
206
  }
433
412
  fz_always(ctx)
434
206
  {
435
206
    pdf_lexbuf_fin(ctx, &buf);
436
206
  }
437
206
  fz_catch(ctx)
438
0
  {
439
0
    pdf_drop_cmap(ctx, cmap);
440
0
    fz_rethrow(ctx);
441
0
  }
442
443
206
  return cmap;
444
206
}