Coverage Report

Created: 2024-05-20 06:23

/src/mupdf/source/fitz/text-decoder.c
Line
Count
Source (jump to first uncovered line)
1
#include "mupdf/fitz.h"
2
#include "mupdf/pdf.h"
3
4
/*
 * Worst-case output buffer size for decoding n bytes with a single-byte
 * table decoder: four output bytes per input byte, plus one extra byte
 * (the matching decode function writes a terminating zero).
 * Neither dec nor the input bytes are examined.
 */
static int simple_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n)
{
  int worst_case = 4 * n;
  return worst_case + 1;
}
8
9
/*
 * Exact output size for decoding the n bytes at s with a single-byte
 * table decoder. Each input byte indexes dec->table1 to get a 16-bit
 * value whose encoded length is given by fz_runelen; the total starts
 * at one to leave room for the terminating zero.
 */
static int simple_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n)
{
  const unsigned short *to_unicode = dec->table1;
  int total = 1;
  int i;

  for (i = 0; i < n; ++i)
    total += fz_runelen(to_unicode[s[i]]);

  return total;
}
18
19
/*
 * Decode the n bytes at s into p using the single-byte lookup table in
 * dec->table1. Every input byte is mapped through the table and the
 * result is appended with fz_runetochar; a zero byte terminates the
 * output. The caller must supply a large enough buffer (presumably sized
 * with the decoder's size or bound callback).
 */
static void simple_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n)
{
  const unsigned short *to_unicode = dec->table1;
  char *out = p;
  int i;

  for (i = 0; i < n; ++i) {
    int rune = to_unicode[s[i]];
    out += fz_runetochar(out, rune);
  }

  *out = 0;
}
27
28
/*
 * Worst-case output buffer size for decoding n bytes of big-endian
 * UTF-16: two output bytes per input byte, plus one extra byte (the
 * matching decode function writes a terminating zero).
 * Neither dec nor the input bytes are examined.
 */
static int utf16be_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n)
{
  int worst_case = 2 * n;
  return worst_case + 1;
}
32
33
/*
 * Worst-case output buffer size for decoding n bytes of little-endian
 * UTF-16: two output bytes per input byte, plus one extra byte (the
 * matching decode function writes a terminating zero).
 * Neither dec nor the input bytes are examined.
 */
static int utf16le_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n)
{
  int worst_case = 2 * n;
  return worst_case + 1;
}
37
38
/*
 * Read one code point from the UTF-16 data at s. The caller guarantees that
 * at least two bytes are available before e.
 *
 * A high surrogate (0xD800..0xDBFF) immediately followed by a low surrogate
 * (0xDC00..0xDFFF) is combined into a single supplementary code point and
 * four bytes are consumed. Any other 16-bit unit, including an unpaired
 * surrogate, is returned unchanged and two bytes are consumed.
 *
 * big_endian selects the byte order of each 16-bit unit.
 * Returns the number of input bytes consumed (2 or 4) and stores the code
 * point in *rune.
 */
static int utf16_text_decode_rune(const unsigned char *s, const unsigned char *e, int big_endian, int *rune)
{
  int hi = big_endian ? (s[0] << 8 | s[1]) : (s[0] | s[1] << 8);

  if (hi >= 0xD800 && hi <= 0xDBFF && e - s >= 4) {
    int lo = big_endian ? (s[2] << 8 | s[3]) : (s[2] | s[3] << 8);
    if (lo >= 0xDC00 && lo <= 0xDFFF) {
      *rune = 0x10000 + ((hi - 0xD800) << 10) + (lo - 0xDC00);
      return 4;
    }
  }

  *rune = hi;
  return 2;
}

/*
 * Shared size pass for both byte orders: sum of fz_runelen over every code
 * point in the n bytes at s, plus one for the terminating zero. A trailing
 * odd byte is ignored. Must walk the input exactly like utf16_text_decode_any
 * so that the reported size matches what the decode pass writes.
 */
static int utf16_text_decode_size_any(unsigned char *s, int n, int big_endian)
{
  unsigned char *e = s + n;
  int len = 1;
  int rune;

  while (e - s >= 2) {
    s += utf16_text_decode_rune(s, e, big_endian, &rune);
    len += fz_runelen(rune);
  }
  return len;
}

/*
 * Shared decode pass for both byte orders: append every code point in the
 * n bytes at s to p with fz_runetochar, then write a terminating zero.
 * A trailing odd byte is ignored.
 */
static void utf16_text_decode_any(char *p, unsigned char *s, int n, int big_endian)
{
  unsigned char *e = s + n;
  int rune;

  while (e - s >= 2) {
    s += utf16_text_decode_rune(s, e, big_endian, &rune);
    p += fz_runetochar(p, rune);
  }
  *p = 0;
}

/*
 * Exact output size (including the terminating zero) for decoding the
 * n bytes of big-endian UTF-16 at s. Surrogate pairs count as one code
 * point. dec is not used.
 */
static int utf16be_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n)
{
  return utf16_text_decode_size_any(s, n, 1);
}

/*
 * Exact output size (including the terminating zero) for decoding the
 * n bytes of little-endian UTF-16 at s. Surrogate pairs count as one code
 * point. dec is not used.
 */
static int utf16le_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n)
{
  return utf16_text_decode_size_any(s, n, 0);
}

/*
 * Decode the n bytes of big-endian UTF-16 at s into the zero-terminated
 * buffer p. Surrogate pairs are combined into one code point before being
 * encoded, instead of each half being encoded on its own. dec is not used.
 */
static void utf16be_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n)
{
  utf16_text_decode_any(p, s, n, 1);
}

/*
 * Decode the n bytes of little-endian UTF-16 at s into the zero-terminated
 * buffer p. Surrogate pairs are combined into one code point before being
 * encoded, instead of each half being encoded on its own. dec is not used.
 */
static void utf16le_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n)
{
  utf16_text_decode_any(p, s, n, 0);
}
79
80
/*
 * Worst-case output buffer size for decoding n bytes with a CMap based
 * decoder: four output bytes per input byte, plus one extra byte (the
 * matching decode function writes a terminating zero).
 * Neither dec nor the input bytes are examined.
 */
static int cjk_text_decode_bound(fz_text_decoder *dec, unsigned char *s, int n)
{
  int worst_case = 4 * n;
  return worst_case + 1;
}
84
85
/*
 * Exact output size (including the terminating zero) for decoding the
 * n bytes at s with a CMap based decoder.
 *
 * dec->table1 is the CMap used to split the input into codes and map them
 * to CIDs; dec->table2 maps CIDs to Unicode. Codes without a Unicode
 * mapping are counted as themselves when below 32 and as
 * FZ_REPLACEMENT_CHARACTER otherwise.
 */
static int cjk_text_decode_size(fz_text_decoder *dec, unsigned char *s, int n)
{
  pdf_cmap *code_to_cid = dec->table1;
  pdf_cmap *cid_to_unicode = dec->table2;
  unsigned char *end = s + n;
  int total = 1;

  while (s < end) {
    unsigned int code;
    int cid, rune;

    s += pdf_decode_cmap(code_to_cid, s, end, &code);
    cid = pdf_lookup_cmap(code_to_cid, code);
    rune = pdf_lookup_cmap(cid_to_unicode, cid);

    // ASCII control characters are missing in the CMaps
    if (rune < 0)
      rune = (code < 32) ? (int)code : FZ_REPLACEMENT_CHARACTER;

    total += fz_runelen(rune);
  }

  return total;
}
108
109
/*
 * Decode the n bytes at s into the zero-terminated buffer p using a CMap
 * based decoder.
 *
 * dec->table1 is the CMap used to split the input into codes and map them
 * to CIDs; dec->table2 maps CIDs to Unicode. Codes without a Unicode
 * mapping are emitted as themselves when below 32 and as
 * FZ_REPLACEMENT_CHARACTER otherwise.
 */
static void cjk_text_decode(fz_text_decoder *dec, char *p, unsigned char *s, int n)
{
  pdf_cmap *code_to_cid = dec->table1;
  pdf_cmap *cid_to_unicode = dec->table2;
  unsigned char *end = s + n;
  char *out = p;

  while (s < end) {
    unsigned int code;
    int cid, rune;

    s += pdf_decode_cmap(code_to_cid, s, end, &code);
    cid = pdf_lookup_cmap(code_to_cid, code);
    rune = pdf_lookup_cmap(cid_to_unicode, cid);

    // ASCII control characters are missing in the CMaps
    if (rune < 0)
      rune = (code < 32) ? (int)code : FZ_REPLACEMENT_CHARACTER;

    out += fz_runetochar(out, rune);
  }

  *out = 0;
}
131
132
/*
 * Set up dec as a single-byte decoder driven by the given lookup table.
 * The table pointer is stored in dec->table1 (not copied) and the three
 * callbacks are pointed at the simple_text_decode family. ctx is not used.
 */
static void fz_init_simple_text_decoder(fz_context *ctx, fz_text_decoder *dec, const unsigned short *table)
{
  /* The table is only ever read, so dropping const for storage is safe. */
  dec->table1 = (void*)table;

  dec->decode = simple_text_decode;
  dec->decode_size = simple_text_decode_size;
  dec->decode_bound = simple_text_decode_bound;
}
139
140
/*
 * Set up dec as a big-endian UTF-16 decoder by installing the three
 * utf16be callbacks. The table fields of dec are left untouched.
 * ctx is not used.
 */
static void fz_init_utf16be_text_decoder(fz_context *ctx, fz_text_decoder *dec)
{
  dec->decode = utf16be_text_decode;
  dec->decode_size = utf16be_text_decode_size;
  dec->decode_bound = utf16be_text_decode_bound;
}
146
147
/*
 * Set up dec as a little-endian UTF-16 decoder by installing the three
 * utf16le callbacks. The table fields of dec are left untouched.
 * ctx is not used.
 */
static void fz_init_utf16le_text_decoder(fz_context *ctx, fz_text_decoder *dec)
{
  dec->decode = utf16le_text_decode;
  dec->decode_size = utf16le_text_decode_size;
  dec->decode_bound = utf16le_text_decode_bound;
}
153
154
/*
 * Set up dec as a CMap based decoder.
 *
 * to_cid names the builtin CMap that maps input codes to CIDs (stored in
 * dec->table1); to_uni names the builtin CMap that maps CIDs to Unicode
 * (stored in dec->table2).
 *
 * Throws FZ_ERROR_UNSUPPORTED if either CMap cannot be loaded. The
 * callbacks, and any table loaded so far, are already stored in dec when
 * that happens.
 */
static void fz_init_cjk_text_decoder(fz_context *ctx, fz_text_decoder *dec, const char *to_cid, const char *to_uni)
{
  dec->decode = cjk_text_decode;
  dec->decode_size = cjk_text_decode_size;
  dec->decode_bound = cjk_text_decode_bound;

  dec->table1 = pdf_load_builtin_cmap(ctx, to_cid);
  if (dec->table1 == NULL)
    fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unknown CMap: %s", to_cid);

  dec->table2 = pdf_load_builtin_cmap(ctx, to_uni);
  if (dec->table2 == NULL)
    fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unknown CMap: %s", to_uni);
}
166
167
/*
 * Initialise dec to decode text in the encoding named by enc.
 *
 * enc is matched case-insensitively against the names below:
 *   utf-16, utf-16le (little-endian), utf-16be
 *   euc-jp, shift_jis, sjis
 *   euc-kr
 *   euc-cn, gbk, gb2312, gb18030
 *   euc-tw, big5, big5-hkscs
 *   iso-8859-1, iso-8859-7, koi8-r, koi8-u,
 *   windows-1250, windows-1251, windows-1252
 *
 * Both koi8-r and koi8-u use the fz_unicode_from_koi8u table.
 *
 * Throws FZ_ERROR_UNSUPPORTED if enc is NULL or not one of the names above,
 * or if a CMap needed by a CJK encoding cannot be loaded.
 */
void fz_init_text_decoder(fz_context *ctx, fz_text_decoder *dec, const char *enc)
{
  // Recognize IANA character set identifiers (case insensitive).
  // https://www.iana.org/assignments/character-sets/character-sets.xhtml

  // A missing name can be neither compared nor printed with %s.
  if (!enc)
    fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unknown text encoding: (null)");

  // "utf-16" without an explicit byte order is treated as little-endian.
  if (!fz_strcasecmp(enc, "utf-16"))
    fz_init_utf16le_text_decoder(ctx, dec);
  else if (!fz_strcasecmp(enc, "utf-16be"))
    fz_init_utf16be_text_decoder(ctx, dec);
  else if (!fz_strcasecmp(enc, "utf-16le"))
    fz_init_utf16le_text_decoder(ctx, dec);

  else if (!fz_strcasecmp(enc, "euc-jp"))
    fz_init_cjk_text_decoder(ctx, dec, "EUC-H", "Adobe-Japan1-UCS2");
  else if (!fz_strcasecmp(enc, "shift_jis") || !fz_strcasecmp(enc, "sjis"))
    fz_init_cjk_text_decoder(ctx, dec, "90msp-H", "Adobe-Japan1-UCS2");

  else if (!fz_strcasecmp(enc, "euc-kr"))
    fz_init_cjk_text_decoder(ctx, dec, "KSCms-UHC-H", "Adobe-Korea1-UCS2");

  else if (!fz_strcasecmp(enc, "euc-cn"))
    fz_init_cjk_text_decoder(ctx, dec, "GB-EUC-H", "Adobe-GB1-UCS2");
  else if (!fz_strcasecmp(enc, "gbk") || !fz_strcasecmp(enc, "gb2312") || !fz_strcasecmp(enc, "gb18030"))
    fz_init_cjk_text_decoder(ctx, dec, "GBK2K-H", "Adobe-GB1-UCS2");

  else if (!fz_strcasecmp(enc, "euc-tw"))
    fz_init_cjk_text_decoder(ctx, dec, "CNS-EUC-H", "Adobe-CNS1-UCS2");
  else if (!fz_strcasecmp(enc, "big5"))
    fz_init_cjk_text_decoder(ctx, dec, "ETen-B5-H", "Adobe-CNS1-UCS2");
  else if (!fz_strcasecmp(enc, "big5-hkscs"))
    fz_init_cjk_text_decoder(ctx, dec, "HKscs-B5-H", "Adobe-CNS1-UCS2");

  else if (!fz_strcasecmp(enc, "iso-8859-1"))
    fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_iso8859_1);
  else if (!fz_strcasecmp(enc, "iso-8859-7"))
    fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_iso8859_7);
  // The only KOI8 table available is the KOI8-U one; accept its own name too.
  else if (!fz_strcasecmp(enc, "koi8-r") || !fz_strcasecmp(enc, "koi8-u"))
    fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_koi8u);
  else if (!fz_strcasecmp(enc, "windows-1250"))
    fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_windows_1250);
  else if (!fz_strcasecmp(enc, "windows-1251"))
    fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_windows_1251);
  else if (!fz_strcasecmp(enc, "windows-1252"))
    fz_init_simple_text_decoder(ctx, dec, fz_unicode_from_windows_1252);

  else
    fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "unknown text encoding: %s", enc);
}