Coverage Report

Created: 2025-01-11 06:55

/src/mupdf/source/fitz/text-decoder.c
Line
Count
Source (jump to first uncovered line)
1
// Copyright (C) 2024 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "mupdf/fitz.h"
24
#include "mupdf/pdf.h"
25
26
// Quick upper estimate of the output buffer size for a single-byte encoding.
// Budgets four output bytes for each of the n input bytes, plus one byte for
// the terminating zero that simple_text_decode appends. Neither the decoder
// state nor the input bytes are inspected.
static int simple_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n)
{
  int bound = 1; // terminating zero
  bound += n * 4;
  return bound;
}
30
31
// Size of the output that simple_text_decode produces for the n bytes at s,
// including the terminating zero. Each input byte indexes the 16-bit table
// stored in dec->table1, and the looked-up value is measured with fz_runelen.
static int simple_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n)
{
  const unsigned short *unicode = dec->table1;
  int total = 1; // terminating zero
  int i;
  for (i = 0; i < n; ++i)
    total += fz_runelen(unicode[s[i]]);
  return total;
}
40
41
// Convert the n bytes at s using the 16-bit lookup table in dec->table1.
// Every input byte selects one table entry, which fz_runetochar writes to the
// output at p. A zero byte is stored after the last character written. The
// caller supplies the output buffer; no bounds are checked here.
static void simple_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n)
{
  const unsigned short *unicode = dec->table1;
  char *out = p;
  int i;
  for (i = 0; i < n; ++i)
    out += fz_runetochar(out, unicode[s[i]]);
  *out = 0;
}
49
50
// Quick upper estimate of the output buffer size for big-endian UTF-16 input:
// two output bytes per input byte, plus one for the terminating zero.
// The decoder state and the input bytes are not examined.
static int utf16be_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n)
{
  int bound = 1; // terminating zero
  bound += n * 2;
  return bound;
}
54
55
// Quick upper estimate of the output buffer size for little-endian UTF-16
// input: two output bytes per input byte, plus one for the terminating zero.
// The decoder state and the input bytes are not examined.
static int utf16le_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n)
{
  int bound = 1; // terminating zero
  bound += n * 2;
  return bound;
}
59
60
static int utf16be_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n)
61
0
{
62
0
  unsigned char *e = s + n;
63
0
  int len = 1;
64
0
  while (s + 1 < e) {
65
0
    len += fz_runelen(s[0] << 8 | s[1]);
66
0
    s += 2;
67
0
  }
68
0
  return len;
69
0
}
70
71
static int utf16le_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n)
72
0
{
73
0
  unsigned char *e = s + n;
74
0
  int len = 1;
75
0
  while (s + 1 < e) {
76
0
    len += fz_runelen(s[0] | s[1] << 8);
77
0
    s += 2;
78
0
  }
79
0
  return len;
80
0
}
81
82
static void utf16be_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n)
83
0
{
84
0
  unsigned char *e = s + n;
85
0
  while (s + 1 < e) {
86
0
    p += fz_runetochar(p, s[0] << 8 | s[1]);
87
0
    s += 2;
88
0
  }
89
0
  *p = 0;
90
0
}
91
92
static void utf16le_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n)
93
0
{
94
0
  unsigned char *e = s + n;
95
0
  while (s + 1 < e) {
96
0
    p += fz_runetochar(p, s[0] | s[1] << 8);
97
0
    s += 2;
98
0
  }
99
0
  *p = 0;
100
0
}
101
102
// Quick upper estimate of the output buffer size for CMap-based encodings:
// four output bytes per input byte, plus one for the terminating zero that
// cjk_text_decode appends. The CMaps and the input bytes are not consulted.
static int cjk_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n)
{
  int bound = 1; // terminating zero
  bound += n * 4;
  return bound;
}
106
107
// Output size (including the terminating zero) that cjk_text_decode produces
// for the n bytes at s.
//
// dec->table1 is the CMap used both to split the input into character codes
// and to map each code to a CID; dec->table2 maps the CID to a Unicode value.
// Codes with no Unicode mapping are counted as themselves when below 32 and
// as FZ_REPLACEMENT_CHARACTER otherwise.
static int cjk_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n)
{
  pdf_cmap *code_to_cid = dec->table1;
  pdf_cmap *cid_to_unicode = dec->table2;
  unsigned char *end = s + n;
  int total = 1; // terminating zero

  while (s < end) {
    unsigned int code;
    int rune;

    s += pdf_decode_cmap(code_to_cid, s, end, &code);
    rune = pdf_lookup_cmap(cid_to_unicode, pdf_lookup_cmap(code_to_cid, code));
    if (rune < 0) {
      // ASCII control characters are missing in the CMaps
      rune = FZ_REPLACEMENT_CHARACTER;
      if (code < 32)
        rune = code;
    }
    total += fz_runelen(rune);
  }

  return total;
}
130
131
// Convert the n bytes at s into the zero-terminated output at p using the two
// CMaps held by the decoder.
//
// dec->table1 splits the input into character codes and maps each code to a
// CID; dec->table2 maps the CID to a Unicode value, which fz_runetochar
// writes out. Codes with no Unicode mapping are passed through unchanged when
// below 32 and replaced by FZ_REPLACEMENT_CHARACTER otherwise.
// The caller supplies the output buffer; no bounds are checked here.
static void cjk_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n)
{
  pdf_cmap *code_to_cid = dec->table1;
  pdf_cmap *cid_to_unicode = dec->table2;
  unsigned char *end = s + n;
  char *out = p;

  while (s < end) {
    unsigned int code;
    int rune;

    s += pdf_decode_cmap(code_to_cid, s, end, &code);
    rune = pdf_lookup_cmap(cid_to_unicode, pdf_lookup_cmap(code_to_cid, code));
    if (rune < 0) {
      // ASCII control characters are missing in the CMaps
      rune = FZ_REPLACEMENT_CHARACTER;
      if (code < 32)
        rune = code;
    }
    out += fz_runetochar(out, rune);
  }

  *out = 0;
}
153
154
// Configure dec for a single-byte encoding described by a 16-bit lookup
// table. The table pointer is stored in dec->table1 (const is cast away for
// storage; the simple_* routines read it back as const). ctx is not used.
static void fz_init_simple_text_decoder(fz_context *ctx, fz_text_decoder *dec, const unsigned short *table)
{
  dec->table1 = (void*)table;
  dec->decode = simple_text_decode;
  dec->decode_size = simple_text_decode_size;
  dec->decode_bound = simple_text_decode_bound;
}
161
162
// Configure dec for big-endian UTF-16 by installing the three utf16be_*
// callbacks. The table fields of dec are left untouched; ctx is not used.
static void fz_init_utf16be_text_decoder(fz_context *ctx, fz_text_decoder *dec)
{
  dec->decode = utf16be_text_decode;
  dec->decode_size = utf16be_text_decode_size;
  dec->decode_bound = utf16be_text_decode_bound;
}
168
169
// Configure dec for little-endian UTF-16 by installing the three utf16le_*
// callbacks. The table fields of dec are left untouched; ctx is not used.
static void fz_init_utf16le_text_decoder(fz_context *ctx, fz_text_decoder *dec)
{
  dec->decode = utf16le_text_decode;
  dec->decode_size = utf16le_text_decode_size;
  dec->decode_bound = utf16le_text_decode_bound;
}
175
176
// Configure dec for a multi-byte encoding driven by two built-in CMaps.
//
// to_cid names the CMap stored in dec->table1 (character code -> CID) and
// to_uni names the CMap stored in dec->table2 (CID -> Unicode).
// Throws FZ_ERROR_UNSUPPORTED if either name is not a known built-in CMap.
// The callbacks are installed before the CMaps are looked up, so dec may be
// left partially initialised when this throws.
static void fz_init_cjk_text_decoder(fz_context *ctx, fz_text_decoder *dec, const char *to_cid, const char *to_uni)
{
  dec->decode_bound = cjk_text_decode_bound;
  dec->decode_size = cjk_text_decode_size;
  dec->decode = cjk_text_decode;

  dec->table1 = pdf_load_builtin_cmap(ctx, to_cid);
  if (dec->table1 == NULL) {
    fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unknown CMap: %s", to_cid);
  }

  dec->table2 = pdf_load_builtin_cmap(ctx, to_uni);
  if (dec->table2 == NULL) {
    fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unknown CMap: %s", to_uni);
  }
}
188
189
// Initialise dec to convert text in the encoding named by enc.
//
// enc is matched case-insensitively against the names listed below. On a
// match, dec's callbacks (and, where needed, its tables) are set up by the
// corresponding fz_init_*_text_decoder helper.
//
// Throws FZ_ERROR_UNSUPPORTED if enc is not one of the recognised names, or
// if a CJK encoding's built-in CMap cannot be found.
// enc must not be NULL; it is passed straight to fz_strcasecmp.
void fz_init_text_decoder(fz_context *ctx, fz_text_decoder *dec, const char *enc)
{
  // Recognize IANA character set identifiers (case insensitive).
  // https://www.iana.org/assignments/character-sets/character-sets.xhtml

  // Plain "utf-16" is handled by the little-endian decoder.
  if (!fz_strcasecmp(enc, "utf-16"))
    fz_init_utf16le_text_decoder(ctx, dec);
  else if (!fz_strcasecmp(enc, "utf-16be"))
    fz_init_utf16be_text_decoder(ctx, dec);
  else if (!fz_strcasecmp(enc, "utf-16le"))
    fz_init_utf16le_text_decoder(ctx, dec);

  else if (!fz_strcasecmp(enc, "euc-jp"))
    fz_init_cjk_text_decoder(ctx, dec, "EUC-H", "Adobe-Japan1-UCS2");
  else if (!fz_strcasecmp(enc, "shift_jis") || !fz_strcasecmp(enc, "sjis"))
    fz_init_cjk_text_decoder(ctx, dec, "90msp-H", "Adobe-Japan1-UCS2");

  else if (!fz_strcasecmp(enc, "euc-kr"))
    fz_init_cjk_text_decoder(ctx, dec, "KSCms-UHC-H", "Adobe-Korea1-UCS2");

  else if (!fz_strcasecmp(enc, "euc-cn"))
    fz_init_cjk_text_decoder(ctx, dec, "GB-EUC-H", "Adobe-GB1-UCS2");
  else if (!fz_strcasecmp(enc, "gbk") || !fz_strcasecmp(enc, "gb2312") || !fz_strcasecmp(enc, "gb18030"))
    fz_init_cjk_text_decoder(ctx, dec, "GBK2K-H", "Adobe-GB1-UCS2");

  else if (!fz_strcasecmp(enc, "euc-tw"))
    fz_init_cjk_text_decoder(ctx, dec, "CNS-EUC-H", "Adobe-CNS1-UCS2");
  else if (!fz_strcasecmp(enc, "big5"))
    fz_init_cjk_text_decoder(ctx, dec, "ETen-B5-H", "Adobe-CNS1-UCS2");
  else if (!fz_strcasecmp(enc, "big5-hkscs"))
    fz_init_cjk_text_decoder(ctx, dec, "HKscs-B5-H", "Adobe-CNS1-UCS2");

  else if (!fz_strcasecmp(enc, "iso-8859-1"))
    fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_iso8859_1);
  else if (!fz_strcasecmp(enc, "iso-8859-7"))
    fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_iso8859_7);
  // The KOI8 table used here is the KOI8-U one, so accept that name as well
  // as "koi8-r".
  else if (!fz_strcasecmp(enc, "koi8-r") || !fz_strcasecmp(enc, "koi8-u"))
    fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_koi8u);
  else if (!fz_strcasecmp(enc, "windows-1250"))
    fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_windows_1250);
  else if (!fz_strcasecmp(enc, "windows-1251"))
    fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_windows_1251);
  else if (!fz_strcasecmp(enc, "windows-1252"))
    fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_windows_1252);

  else
    fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unknown text encoding: %s", enc);
}