Coverage Report

Created: 2025-12-03 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/mupdf/source/pdf/pdf-cmap-parse.c
Line
Count
Source
1
// Copyright (C) 2004-2021 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "mupdf/fitz.h"
24
#include "mupdf/pdf.h"
25
26
#include <string.h>
27
28
/*
29
 * CMap parser
30
 */
31
32
static int
33
is_keyword(pdf_token tok, pdf_lexbuf *buf, const char *word)
34
36.0k
{
35
  /* Ignore trailing garbage when matching keywords */
36
36.0k
  return (tok == PDF_TOK_KEYWORD && !strncmp(buf->scratch, word, strlen(word)));
37
36.0k
}
38
39
static void
40
skip_to_keyword(fz_context *ctx, fz_stream *file, pdf_lexbuf *buf, const char *end, const char *warn)
41
8
{
42
8
  fz_warn(ctx, "%s", warn);
43
8
  for (;;)
44
1.50k
  {
45
1.50k
    pdf_token tok = pdf_lex(ctx, file, buf);
46
1.50k
    if (is_keyword(tok, buf, end))
47
8
      return;
48
1.49k
    if (tok == PDF_TOK_ERROR)
49
0
      return;
50
1.49k
    if (tok == PDF_TOK_EOF)
51
0
      return;
52
1.49k
  }
53
8
}
54
55
static void
56
skip_to_token(fz_context *ctx, fz_stream *file, pdf_lexbuf *buf, pdf_token end, const char *warn)
57
0
{
58
0
  fz_warn(ctx, "%s", warn);
59
0
  for (;;)
60
0
  {
61
0
    pdf_token tok = pdf_lex(ctx, file, buf);
62
0
    if (tok == end)
63
0
      return;
64
0
    if (tok == PDF_TOK_ERROR)
65
0
      return;
66
0
    if (tok == PDF_TOK_EOF)
67
0
      return;
68
0
  }
69
0
}
70
71
/*
 * Fold `len` bytes starting at `buf` into one integer, most significant
 * byte first: each step shifts the accumulator left by 8 bits and ORs in
 * the next byte, read as an unsigned char.
 *
 * The accumulator is an unsigned int, so bytes shifted past its width are
 * discarded. A zero-length input yields 0.
 */
static int
pdf_code_from_string(char *buf, size_t len)
{
  const unsigned char *bytes = (const unsigned char *)buf;
  unsigned int code = 0;
  size_t k;

  for (k = 0; k < len; k++)
    code = (code << 8) | bytes[k];

  return code;
}
79
80
static void
81
pdf_parse_cmap_name(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
82
2
{
83
2
  pdf_token tok;
84
85
2
  tok = pdf_lex(ctx, file, buf);
86
87
2
  if (tok == PDF_TOK_NAME)
88
2
    fz_strlcpy(cmap->cmap_name, buf->scratch, sizeof(cmap->cmap_name));
89
0
  else
90
0
    fz_warn(ctx, "expected name after CMapName in cmap");
91
2
}
92
93
static void
94
pdf_parse_wmode(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
95
0
{
96
0
  pdf_token tok;
97
98
0
  tok = pdf_lex(ctx, file, buf);
99
100
0
  if (tok == PDF_TOK_INT)
101
0
    pdf_set_cmap_wmode(ctx, cmap, buf->i);
102
0
  else
103
0
    fz_warn(ctx, "expected integer after WMode in cmap");
104
0
}
105
106
static void
107
pdf_parse_codespace_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
108
2
{
109
2
  pdf_token tok;
110
2
  int lo, hi;
111
112
4
  while (1)
113
4
  {
114
4
    tok = pdf_lex(ctx, file, buf);
115
116
4
    if (is_keyword(tok, buf, "endcodespacerange"))
117
2
      return;
118
119
2
    else if (tok == PDF_TOK_STRING)
120
2
    {
121
2
      lo = pdf_code_from_string(buf->scratch, buf->len);
122
2
      tok = pdf_lex(ctx, file, buf);
123
2
      if (tok == PDF_TOK_STRING)
124
2
      {
125
2
        hi = pdf_code_from_string(buf->scratch, buf->len);
126
2
        pdf_add_codespace(ctx, cmap, lo, hi, buf->len);
127
2
      }
128
0
      else
129
0
      {
130
0
        skip_to_keyword(ctx, file, buf, "endcodespacerange", "expected string or endcodespacerange");
131
0
        return;
132
0
      }
133
2
    }
134
0
    else
135
0
    {
136
0
      skip_to_keyword(ctx, file, buf, "endcodespacerange", "expected string or endcodespacerange");
137
0
      return;
138
0
    }
139
4
  }
140
2
}
141
142
static void
143
pdf_parse_cid_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
144
0
{
145
0
  pdf_token tok;
146
0
  int lo, hi, dst;
147
148
0
  while (1)
149
0
  {
150
0
    tok = pdf_lex(ctx, file, buf);
151
152
0
    if (is_keyword(tok, buf, "endcidrange"))
153
0
      return;
154
155
0
    else if (tok != PDF_TOK_STRING)
156
0
    {
157
0
      skip_to_keyword(ctx, file, buf, "endcidrange", "expected string or endcidrange");
158
0
      return;
159
0
    }
160
161
0
    lo = pdf_code_from_string(buf->scratch, buf->len);
162
163
0
    tok = pdf_lex(ctx, file, buf);
164
0
    if (tok != PDF_TOK_STRING)
165
0
    {
166
0
      skip_to_keyword(ctx, file, buf, "endcidrange", "expected string");
167
0
      return;
168
0
    }
169
170
0
    hi = pdf_code_from_string(buf->scratch, buf->len);
171
172
0
    tok = pdf_lex(ctx, file, buf);
173
0
    if (tok != PDF_TOK_INT)
174
0
    {
175
0
      skip_to_keyword(ctx, file, buf, "endcidrange", "expected integer");
176
0
      return;
177
0
    }
178
179
0
    dst = buf->i;
180
181
0
    pdf_map_range_to_range(ctx, cmap, lo, hi, dst);
182
0
  }
183
0
}
184
185
static void
186
pdf_parse_cid_char(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
187
0
{
188
0
  pdf_token tok;
189
0
  int src, dst;
190
191
0
  while (1)
192
0
  {
193
0
    tok = pdf_lex(ctx, file, buf);
194
195
0
    if (is_keyword(tok, buf, "endcidchar"))
196
0
      return;
197
198
0
    else if (tok != PDF_TOK_STRING)
199
0
    {
200
0
      skip_to_keyword(ctx, file, buf, "endcidchar", "expected string or endcidchar");
201
0
      return;
202
0
    }
203
204
0
    src = pdf_code_from_string(buf->scratch, buf->len);
205
206
0
    tok = pdf_lex(ctx, file, buf);
207
0
    if (tok != PDF_TOK_INT)
208
0
    {
209
0
      skip_to_keyword(ctx, file, buf, "endcidchar", "expected integer");
210
0
      return;
211
0
    }
212
213
0
    dst = buf->i;
214
215
0
    pdf_map_range_to_range(ctx, cmap, src, src, dst);
216
0
  }
217
0
}
218
219
static void
220
pdf_parse_bf_range_array(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf, int lo, int hi)
221
0
{
222
0
  pdf_token tok;
223
0
  int dst[PDF_MRANGE_CAP];
224
225
0
  while (1)
226
0
  {
227
0
    tok = pdf_lex(ctx, file, buf);
228
229
0
    if (tok == PDF_TOK_CLOSE_ARRAY)
230
0
      return;
231
232
    /* Note: does not handle [ /Name /Name ... ] */
233
0
    else if (tok != PDF_TOK_STRING)
234
0
    {
235
0
      skip_to_token(ctx, file, buf, PDF_TOK_CLOSE_ARRAY, "expected string or ]");
236
0
      return;
237
0
    }
238
239
0
    if (buf->len / 2)
240
0
    {
241
0
      size_t i;
242
0
      size_t len = fz_minz(buf->len / 2, nelem(dst));
243
0
      for (i = 0; i < len; i++)
244
0
        dst[i] = pdf_code_from_string(&buf->scratch[i * 2], 2);
245
246
0
      pdf_map_one_to_many(ctx, cmap, lo, dst, i);
247
0
    }
248
249
0
    lo ++;
250
0
  }
251
0
}
252
253
static void
254
pdf_parse_bf_range(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
255
0
{
256
0
  pdf_token tok;
257
0
  int lo, hi, dst;
258
259
0
  while (1)
260
0
  {
261
0
    tok = pdf_lex(ctx, file, buf);
262
263
0
    if (is_keyword(tok, buf, "endbfrange"))
264
0
      return;
265
266
0
    else if (tok != PDF_TOK_STRING)
267
0
    {
268
0
      skip_to_keyword(ctx, file, buf, "endbfrange", "expected string or endbfrange");
269
0
      return;
270
0
    }
271
272
0
    lo = pdf_code_from_string(buf->scratch, buf->len);
273
274
0
    tok = pdf_lex(ctx, file, buf);
275
0
    if (tok != PDF_TOK_STRING)
276
0
    {
277
0
      skip_to_keyword(ctx, file, buf, "endbfrange", "expected string");
278
0
      return;
279
0
    }
280
281
0
    hi = pdf_code_from_string(buf->scratch, buf->len);
282
0
    if (lo < 0 || lo > 65535 || hi < 0 || hi > 65535 || lo > hi)
283
0
    {
284
0
      skip_to_keyword(ctx, file, buf, "endbfrange", "bfrange limits out of range");
285
0
      return;
286
0
    }
287
288
0
    tok = pdf_lex(ctx, file, buf);
289
290
0
    if (tok == PDF_TOK_STRING)
291
0
    {
292
0
      if (buf->len == 2)
293
0
      {
294
0
        dst = pdf_code_from_string(buf->scratch, buf->len);
295
0
        pdf_map_range_to_range(ctx, cmap, lo, hi, dst);
296
0
      }
297
0
      else
298
0
      {
299
0
        int dststr[PDF_MRANGE_CAP];
300
0
        size_t i;
301
302
0
        if (buf->len / 2)
303
0
        {
304
0
          size_t len = fz_minz(buf->len / 2, nelem(dststr));
305
0
          for (i = 0; i < len; i++)
306
0
            dststr[i] = pdf_code_from_string(&buf->scratch[i * 2], 2);
307
308
0
          while (lo <= hi)
309
0
          {
310
0
            pdf_map_one_to_many(ctx, cmap, lo, dststr, i);
311
0
            dststr[i-1] ++;
312
0
            lo ++;
313
0
          }
314
0
        }
315
0
      }
316
0
    }
317
318
0
    else if (tok == PDF_TOK_OPEN_ARRAY)
319
0
    {
320
0
      pdf_parse_bf_range_array(ctx, cmap, file, buf, lo, hi);
321
0
    }
322
323
0
    else
324
0
    {
325
0
      skip_to_keyword(ctx, file, buf, "endbfrange", "expected string or array or endbfrange");
326
0
      return;
327
0
    }
328
0
  }
329
0
}
330
331
static void
332
pdf_parse_bf_char(fz_context *ctx, pdf_cmap *cmap, fz_stream *file, pdf_lexbuf *buf)
333
330
{
334
330
  pdf_token tok;
335
330
  int dst[PDF_MRANGE_CAP];
336
330
  int src;
337
338
33.0k
  while (1)
339
33.0k
  {
340
33.0k
    tok = pdf_lex(ctx, file, buf);
341
342
33.0k
    if (is_keyword(tok, buf, "endbfchar"))
343
322
      return;
344
345
32.7k
    else if (tok != PDF_TOK_STRING)
346
1
    {
347
1
      skip_to_keyword(ctx, file, buf, "endbfchar", "expected string or endbfchar");
348
1
      return;
349
1
    }
350
351
32.7k
    src = pdf_code_from_string(buf->scratch, buf->len);
352
353
32.7k
    tok = pdf_lex(ctx, file, buf);
354
    /* Note: does not handle /dstName */
355
32.7k
    if (tok != PDF_TOK_STRING)
356
7
    {
357
7
      skip_to_keyword(ctx, file, buf, "endbfchar", "expected string");
358
7
      return;
359
7
    }
360
361
32.6k
    if (buf->len / 2)
362
32.6k
    {
363
32.6k
      size_t i;
364
32.6k
      size_t len = fz_minz(buf->len / 2, nelem(dst));
365
71.6k
      for (i = 0; i < len; i++)
366
38.9k
        dst[i] = pdf_code_from_string(&buf->scratch[i * 2], 2);
367
32.6k
      pdf_map_one_to_many(ctx, cmap, src, dst, i);
368
32.6k
    }
369
32.6k
  }
370
330
}
371
372
pdf_cmap *
373
pdf_load_cmap(fz_context *ctx, fz_stream *file)
374
2
{
375
2
  pdf_cmap *cmap;
376
2
  char key[64];
377
2
  pdf_lexbuf buf;
378
2
  pdf_token tok;
379
380
2
  pdf_lexbuf_init(ctx, &buf, PDF_LEXBUF_SMALL);
381
2
  cmap = pdf_new_cmap(ctx);
382
383
2
  strcpy(key, ".notdef");
384
385
4
  fz_try(ctx)
386
4
  {
387
724
    while (1)
388
724
    {
389
724
      tok = pdf_lex(ctx, file, &buf);
390
391
724
      if (tok == PDF_TOK_EOF)
392
0
        break;
393
394
724
      else if (tok == PDF_TOK_NAME)
395
16
      {
396
16
        if (!strcmp(buf.scratch, "CMapName"))
397
2
          pdf_parse_cmap_name(ctx, cmap, file, &buf);
398
14
        else if (!strcmp(buf.scratch, "WMode"))
399
0
          pdf_parse_wmode(ctx, cmap, file, &buf);
400
14
        else
401
14
          fz_strlcpy(key, buf.scratch, sizeof key);
402
16
      }
403
404
708
      else if (tok == PDF_TOK_KEYWORD)
405
364
      {
406
364
        if (is_keyword(tok, &buf, "endcmap"))
407
2
          break;
408
409
362
        else if (is_keyword(tok, &buf, "usecmap"))
410
0
          fz_strlcpy(cmap->usecmap_name, key, sizeof(cmap->usecmap_name));
411
412
362
        else if (is_keyword(tok, &buf, "begincodespacerange"))
413
2
          pdf_parse_codespace_range(ctx, cmap, file, &buf);
414
415
360
        else if (is_keyword(tok, &buf, "beginbfchar"))
416
330
          pdf_parse_bf_char(ctx, cmap, file, &buf);
417
418
30
        else if (is_keyword(tok, &buf, "begincidchar"))
419
0
          pdf_parse_cid_char(ctx, cmap, file, &buf);
420
421
30
        else if (is_keyword(tok, &buf, "beginbfrange"))
422
0
          pdf_parse_bf_range(ctx, cmap, file, &buf);
423
424
30
        else if (is_keyword(tok, &buf, "begincidrange"))
425
0
          pdf_parse_cid_range(ctx, cmap, file, &buf);
426
364
      }
427
428
      /* ignore everything else */
429
724
    }
430
431
2
    pdf_sort_cmap(ctx, cmap);
432
2
  }
433
4
  fz_always(ctx)
434
2
  {
435
2
    pdf_lexbuf_fin(ctx, &buf);
436
2
  }
437
2
  fz_catch(ctx)
438
0
  {
439
0
    pdf_drop_cmap(ctx, cmap);
440
0
    fz_rethrow(ctx);
441
0
  }
442
443
2
  return cmap;
444
2
}