Coverage Report

Created: 2026-06-08 06:46

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/mupdf/source/pdf/pdf-cmap-parse.c
Line
Count
Source
1
// Copyright (C) 2004-2021 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "mupdf/fitz.h"
24
#include "mupdf/pdf.h"
25
26
#include <string.h>
27
28
/*
29
 * CMap parser
30
 */
31
32
static int
33
is_keyword(pdf_token tok, pdf_lexbuf *buf, const char *word)
34
49.6k
{
35
  /* Ignore trailing garbage when matching keywords */
36
49.6k
  return (tok == PDF_TOK_KEYWORD && !strncmp(buf->scratch, word, strlen(word)));
37
49.6k
}
38
39
static void
40
skip_to_keyword(fz_context *ctx, fz_stream *file, pdf_lexbuf *buf, const char *end, const char *warn)
41
1
{
42
1
  fz_warn(ctx, "%s", warn);
43
1
  for (;;)
44
2
  {
45
2
    pdf_token tok = pdf_lex(ctx, file, buf);
46
2
    if (is_keyword(tok, buf, end))
47
1
      return;
48
1
    if (tok == PDF_TOK_ERROR)
49
0
      return;
50
1
    if (tok == PDF_TOK_EOF)
51
0
      return;
52
1
  }
53
1
}
54
55
static void
56
skip_to_token(fz_context *ctx, fz_stream *file, pdf_lexbuf *buf, pdf_token end, const char *warn)
57
0
{
58
0
  fz_warn(ctx, "%s", warn);
59
0
  for (;;)
60
0
  {
61
0
    pdf_token tok = pdf_lex(ctx, file, buf);
62
0
    if (tok == end)
63
0
      return;
64
0
    if (tok == PDF_TOK_ERROR)
65
0
      return;
66
0
    if (tok == PDF_TOK_EOF)
67
0
      return;
68
0
  }
69
0
}
70
71
/*
 * Fold 'len' bytes starting at 'buf' into an integer, most significant
 * byte first. Each byte is read as an unsigned value. An empty string
 * yields 0. The accumulator is an unsigned int, so bytes shifted past
 * its width are discarded.
 */
static int
pdf_code_from_string(char *buf, size_t len)
{
  const unsigned char *bytes = (const unsigned char *)buf;
  unsigned int code = 0;
  size_t k;

  for (k = 0; k < len; k++)
    code = (code << 8) | bytes[k];

  return code;
}
79
80
static void
81
pdf_parse_cmap_name(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
82
224
{
83
224
  pdf_token tok;
84
85
224
  tok = pdf_lex(ctx, file, buf);
86
87
224
  if (tok == PDF_TOK_NAME)
88
224
    fz_strlcpy(cmap->cmap_name, buf->scratch, sizeof(cmap->cmap_name));
89
0
  else
90
0
    fz_warn(ctx, "expected name after CMapName in cmap");
91
224
}
92
93
static void
94
pdf_parse_wmode(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
95
1
{
96
1
  pdf_token tok;
97
98
1
  tok = pdf_lex(ctx, file, buf);
99
100
1
  if (tok == PDF_TOK_INT)
101
1
    pdf_set_cmap_wmode(ctx, cmap, buf->i);
102
0
  else
103
0
    fz_warn(ctx, "expected integer after WMode in cmap");
104
1
}
105
106
static void
107
pdf_parse_codespace_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
108
224
{
109
224
  pdf_token tok;
110
224
  int lo, hi;
111
112
448
  while (1)
113
448
  {
114
448
    tok = pdf_lex(ctx, file, buf);
115
116
448
    if (is_keyword(tok, buf, "endcodespacerange"))
117
224
      return;
118
119
224
    else if (tok == PDF_TOK_STRING)
120
224
    {
121
224
      lo = pdf_code_from_string(buf->scratch, buf->len);
122
224
      tok = pdf_lex(ctx, file, buf);
123
224
      if (tok == PDF_TOK_STRING)
124
224
      {
125
224
        hi = pdf_code_from_string(buf->scratch, buf->len);
126
224
        pdf_add_codespace(ctx, cmap, lo, hi, buf->len);
127
224
      }
128
0
      else
129
0
      {
130
0
        skip_to_keyword(ctx, file, buf, "endcodespacerange", "expected string or endcodespacerange");
131
0
        return;
132
0
      }
133
224
    }
134
0
    else
135
0
    {
136
0
      skip_to_keyword(ctx, file, buf, "endcodespacerange", "expected string or endcodespacerange");
137
0
      return;
138
0
    }
139
448
  }
140
224
}
141
142
static void
143
pdf_parse_cid_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
144
0
{
145
0
  pdf_token tok;
146
0
  int lo, hi, dst;
147
148
0
  while (1)
149
0
  {
150
0
    tok = pdf_lex(ctx, file, buf);
151
152
0
    if (is_keyword(tok, buf, "endcidrange"))
153
0
      return;
154
155
0
    else if (tok != PDF_TOK_STRING)
156
0
    {
157
0
      skip_to_keyword(ctx, file, buf, "endcidrange", "expected string or endcidrange");
158
0
      return;
159
0
    }
160
161
0
    lo = pdf_code_from_string(buf->scratch, buf->len);
162
163
0
    tok = pdf_lex(ctx, file, buf);
164
0
    if (tok != PDF_TOK_STRING)
165
0
    {
166
0
      skip_to_keyword(ctx, file, buf, "endcidrange", "expected string");
167
0
      return;
168
0
    }
169
170
0
    hi = pdf_code_from_string(buf->scratch, buf->len);
171
172
0
    tok = pdf_lex(ctx, file, buf);
173
0
    if (tok != PDF_TOK_INT)
174
0
    {
175
0
      skip_to_keyword(ctx, file, buf, "endcidrange", "expected integer");
176
0
      return;
177
0
    }
178
179
0
    dst = buf->i;
180
181
0
    pdf_map_range_to_range(ctx, cmap, lo, hi, dst);
182
0
  }
183
0
}
184
185
static void
186
pdf_parse_cid_char(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
187
0
{
188
0
  pdf_token tok;
189
0
  int src, dst;
190
191
0
  while (1)
192
0
  {
193
0
    tok = pdf_lex(ctx, file, buf);
194
195
0
    if (is_keyword(tok, buf, "endcidchar"))
196
0
      return;
197
198
0
    else if (tok != PDF_TOK_STRING)
199
0
    {
200
0
      skip_to_keyword(ctx, file, buf, "endcidchar", "expected string or endcidchar");
201
0
      return;
202
0
    }
203
204
0
    src = pdf_code_from_string(buf->scratch, buf->len);
205
206
0
    tok = pdf_lex(ctx, file, buf);
207
0
    if (tok != PDF_TOK_INT)
208
0
    {
209
0
      skip_to_keyword(ctx, file, buf, "endcidchar", "expected integer");
210
0
      return;
211
0
    }
212
213
0
    dst = buf->i;
214
215
0
    pdf_map_range_to_range(ctx, cmap, src, src, dst);
216
0
  }
217
0
}
218
219
static void
220
pdf_parse_bf_range_array(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf, int lo, int hi)
221
0
{
222
0
  pdf_token tok;
223
0
  int dst[PDF_MRANGE_CAP];
224
225
0
  while (1)
226
0
  {
227
0
    tok = pdf_lex(ctx, file, buf);
228
229
0
    if (tok == PDF_TOK_CLOSE_ARRAY)
230
0
      return;
231
232
    /* Note: does not handle [ /Name /Name ... ] */
233
0
    else if (tok != PDF_TOK_STRING)
234
0
    {
235
0
      skip_to_token(ctx, file, buf, PDF_TOK_CLOSE_ARRAY, "expected string or ]");
236
0
      return;
237
0
    }
238
239
0
    if (buf->len / 2)
240
0
    {
241
0
      size_t i;
242
0
      size_t len = fz_minz(buf->len / 2, nelem(dst));
243
0
      for (i = 0; i < len; i++)
244
0
        dst[i] = pdf_code_from_string(&buf->scratch[i * 2], 2);
245
246
0
      pdf_map_one_to_many(ctx, cmap, lo, dst, i);
247
0
    }
248
249
0
    lo ++;
250
0
  }
251
0
}
252
253
static void
254
pdf_parse_bf_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
255
78
{
256
78
  pdf_token tok;
257
78
  int lo, hi, dst;
258
259
1.65k
  while (1)
260
1.65k
  {
261
1.65k
    tok = pdf_lex(ctx, file, buf);
262
263
1.65k
    if (is_keyword(tok, buf, "endbfrange"))
264
77
      return;
265
266
1.58k
    else if (tok != PDF_TOK_STRING)
267
0
    {
268
0
      skip_to_keyword(ctx, file, buf, "endbfrange", "expected string or endbfrange");
269
0
      return;
270
0
    }
271
272
1.58k
    lo = pdf_code_from_string(buf->scratch, buf->len);
273
274
1.58k
    tok = pdf_lex(ctx, file, buf);
275
1.58k
    if (tok != PDF_TOK_STRING)
276
0
    {
277
0
      skip_to_keyword(ctx, file, buf, "endbfrange", "expected string");
278
0
      return;
279
0
    }
280
281
1.58k
    hi = pdf_code_from_string(buf->scratch, buf->len);
282
1.58k
    if (lo < 0 || lo > 65535 || hi < 0 || hi > 65535 || lo > hi)
283
1
    {
284
1
      skip_to_keyword(ctx, file, buf, "endbfrange", "bfrange limits out of range");
285
1
      return;
286
1
    }
287
288
1.58k
    tok = pdf_lex(ctx, file, buf);
289
290
1.58k
    if (tok == PDF_TOK_STRING)
291
1.58k
    {
292
1.58k
      if (buf->len == 2)
293
1.58k
      {
294
1.58k
        dst = pdf_code_from_string(buf->scratch, buf->len);
295
1.58k
        pdf_map_range_to_range(ctx, cmap, lo, hi, dst);
296
1.58k
      }
297
0
      else
298
0
      {
299
0
        int dststr[PDF_MRANGE_CAP];
300
0
        size_t i;
301
302
0
        if (buf->len / 2)
303
0
        {
304
0
          size_t len = fz_minz(buf->len / 2, nelem(dststr));
305
0
          for (i = 0; i < len; i++)
306
0
            dststr[i] = pdf_code_from_string(&buf->scratch[i * 2], 2);
307
308
0
          while (lo <= hi)
309
0
          {
310
0
            pdf_map_one_to_many(ctx, cmap, lo, dststr, i);
311
0
            dststr[i-1] ++;
312
0
            lo ++;
313
0
          }
314
0
        }
315
0
      }
316
1.58k
    }
317
318
0
    else if (tok == PDF_TOK_OPEN_ARRAY)
319
0
    {
320
0
      pdf_parse_bf_range_array(ctx, cmap, file, buf, lo, hi);
321
0
    }
322
323
0
    else
324
0
    {
325
0
      skip_to_keyword(ctx, file, buf, "endbfrange", "expected string or array or endbfrange");
326
0
      return;
327
0
    }
328
1.58k
  }
329
78
}
330
331
static void
332
pdf_parse_bf_char(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
333
379
{
334
379
  pdf_token tok;
335
379
  int dst[PDF_MRANGE_CAP];
336
379
  int src;
337
338
31.9k
  while (1)
339
31.9k
  {
340
31.9k
    tok = pdf_lex(ctx, file, buf);
341
342
31.9k
    if (is_keyword(tok, buf, "endbfchar"))
343
379
      return;
344
345
31.5k
    else if (tok != PDF_TOK_STRING)
346
0
    {
347
0
      skip_to_keyword(ctx, file, buf, "endbfchar", "expected string or endbfchar");
348
0
      return;
349
0
    }
350
351
31.5k
    src = pdf_code_from_string(buf->scratch, buf->len);
352
353
31.5k
    tok = pdf_lex(ctx, file, buf);
354
    /* Note: does not handle /dstName */
355
31.5k
    if (tok != PDF_TOK_STRING)
356
0
    {
357
0
      skip_to_keyword(ctx, file, buf, "endbfchar", "expected string");
358
0
      return;
359
0
    }
360
361
31.5k
    if (buf->len / 2)
362
31.5k
    {
363
31.5k
      size_t i;
364
31.5k
      size_t len = fz_minz(buf->len / 2, nelem(dst));
365
66.8k
      for (i = 0; i < len; i++)
366
35.2k
        dst[i] = pdf_code_from_string(&buf->scratch[i * 2], 2);
367
31.5k
      pdf_map_one_to_many(ctx, cmap, src, dst, i);
368
31.5k
    }
369
31.5k
  }
370
379
}
371
372
pdf_cmap *
373
pdf_load_cmap(fz_context *ctx, fz_stream *file)
374
224
{
375
224
  pdf_cmap *cmap;
376
224
  char key[64];
377
224
  pdf_lexbuf buf;
378
224
  pdf_token tok;
379
380
224
  pdf_lexbuf_init(ctx, &buf, PDF_LEXBUF_SMALL);
381
224
  cmap = pdf_new_cmap(ctx);
382
383
224
  strcpy(key, ".notdef");
384
385
448
  fz_try(ctx)
386
448
  {
387
6.73k
    while (1)
388
6.73k
    {
389
6.73k
      tok = pdf_lex(ctx, file, &buf);
390
391
6.73k
      if (tok == PDF_TOK_EOF)
392
0
        break;
393
394
6.73k
      else if (tok == PDF_TOK_NAME)
395
1.78k
      {
396
1.78k
        if (!strcmp(buf.scratch, "CMapName"))
397
224
          pdf_parse_cmap_name(ctx, cmap, file, &buf);
398
1.55k
        else if (!strcmp(buf.scratch, "WMode"))
399
1
          pdf_parse_wmode(ctx, cmap, file, &buf);
400
1.55k
        else
401
1.55k
          fz_strlcpy(key, buf.scratch, sizeof key);
402
1.78k
      }
403
404
4.95k
      else if (tok == PDF_TOK_KEYWORD)
405
2.71k
      {
406
2.71k
        if (is_keyword(tok, &buf, "endcmap"))
407
224
          break;
408
409
2.49k
        else if (is_keyword(tok, &buf, "usecmap"))
410
0
          fz_strlcpy(cmap->usecmap_name, key, sizeof(cmap->usecmap_name));
411
412
2.49k
        else if (is_keyword(tok, &buf, "begincodespacerange"))
413
224
          pdf_parse_codespace_range(ctx, cmap, file, &buf);
414
415
2.26k
        else if (is_keyword(tok, &buf, "beginbfchar"))
416
379
          pdf_parse_bf_char(ctx, cmap, file, &buf);
417
418
1.88k
        else if (is_keyword(tok, &buf, "begincidchar"))
419
0
          pdf_parse_cid_char(ctx, cmap, file, &buf);
420
421
1.88k
        else if (is_keyword(tok, &buf, "beginbfrange"))
422
78
          pdf_parse_bf_range(ctx, cmap, file, &buf);
423
424
1.81k
        else if (is_keyword(tok, &buf, "begincidrange"))
425
0
          pdf_parse_cid_range(ctx, cmap, file, &buf);
426
2.71k
      }
427
428
      /* ignore everything else */
429
6.73k
    }
430
431
224
    pdf_sort_cmap(ctx, cmap);
432
224
  }
433
448
  fz_always(ctx)
434
224
  {
435
224
    pdf_lexbuf_fin(ctx, &buf);
436
224
  }
437
224
  fz_catch(ctx)
438
0
  {
439
0
    pdf_drop_cmap(ctx, cmap);
440
0
    fz_rethrow(ctx);
441
0
  }
442
443
224
  return cmap;
444
224
}