Coverage Report

Created: 2024-11-12 06:38

/src/w3m/libwc/gb18030.c
Line
Count
Source (jump to first uncovered line)
1
2
#include "wc.h"
3
#include "gb18030.h"
4
#include "search.h"
5
#include "wtf.h"
6
#ifdef USE_UNICODE
7
#include "ucs.h"
8
#endif
9
#include "map/gb18030_ucs.map"
10
11
#define C0 WC_GB18030_MAP_C0
12
#define GL WC_GB18030_MAP_GL
13
377k
#define C1 WC_GB18030_MAP_C1
14
1.12M
#define LB WC_GB18030_MAP_LB
15
1.21M
#define UB WC_GB18030_MAP_UB
16
229k
#define L4 WC_GB18030_MAP_L4
17
18
wc_uint8 WC_GB18030_MAP[ 0x100 ] = {
19
    C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0,
20
    C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0,
21
    GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL,
22
    L4, L4, L4, L4, L4, L4, L4, L4, L4, L4, GL, GL, GL, GL, GL, GL,
23
    LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB,
24
    LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB,
25
    LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB,
26
    LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, C0,
27
28
    LB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
29
    UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
30
    UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
31
    UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
32
    UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
33
    UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
34
    UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB,
35
    UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, C1,
36
};
37
38
wc_wchar_t
39
wc_gbk_ext_to_cs128w(wc_wchar_t cc)
40
469k
{
41
469k
    cc.code = WC_GBK_N(cc.code);
42
469k
    if (cc.code < 0x4000)
43
441k
  cc.ccs = WC_CCS_GBK_EXT_1;
44
27.6k
    else {
45
27.6k
  cc.ccs = WC_CCS_GBK_EXT_2;
46
27.6k
  cc.code -= 0x4000;
47
27.6k
    }
48
469k
    cc.code = WC_N_CS128W(cc.code);
49
469k
    return cc;
50
469k
}
51
52
wc_wchar_t
53
wc_cs128w_to_gbk_ext(wc_wchar_t cc)
54
445k
{
55
445k
    cc.code = WC_CS128W_N(cc.code);
56
445k
    if (cc.ccs == WC_CCS_GBK_EXT_2)
57
5.54k
  cc.code += 0x4000;
58
445k
    cc.ccs = WC_CCS_GBK_EXT;
59
445k
    cc.code = WC_N_GBK(cc.code);
60
445k
    return cc;
61
445k
}
62
63
static wc_ccs
64
981k
wc_gbk_or_gbk_ext(wc_uint16 code) {
65
981k
    return wc_map3_range_search(code,
66
981k
        gbk_ext_ucs_map, N_gbk_ext_ucs_map)
67
981k
        ? WC_CCS_GBK_EXT : WC_CCS_GBK;
68
981k
}
69
70
#ifdef USE_UNICODE
71
wc_uint32
72
wc_gb18030_to_ucs(wc_wchar_t cc)
73
175k
{
74
175k
    wc_map3 *map;
75
76
175k
    switch (WC_CCS_SET(cc.ccs)) {
77
0
    case WC_CCS_GBK_EXT_1:
78
0
    case WC_CCS_GBK_EXT_2:
79
0
  cc = wc_cs128w_to_gbk_ext(cc);
80
171k
    case WC_CCS_GBK_EXT:
81
171k
  map = wc_map3_range_search((wc_uint16)cc.code,
82
171k
    gbk_ext_ucs_map, N_gbk_ext_ucs_map);
83
171k
  if (map)
84
171k
      return map->code3 + WC_GBK_N(cc.code) - WC_GBK_N(map->code2);
85
356
  return WC_C_UCS4_ERROR;
86
3.84k
    case WC_CCS_GB18030:
87
3.84k
  break;
88
0
    default:
89
0
  return wc_any_to_ucs(cc);
90
175k
    }
91
3.84k
    if (cc.code >= WC_C_GB18030_UCS2 && cc.code <= WC_C_GB18030_UCS2_END) {
92
1.41k
  int i, min = 0, max = N_ucs_gb18030_map - 1;
93
94
1.41k
  cc.code = WC_GB18030_N(cc.code) - WC_GB18030_N(WC_C_GB18030_UCS2);
95
1.41k
  if (cc.code >= ucs_gb18030_map[max].code3)
96
690
      i = max;
97
727
  else {
98
5.41k
      while(1) {
99
5.41k
    i = (min + max) / 2;
100
5.41k
    if (min == max)
101
349
        break;
102
5.06k
    if (cc.code < ucs_gb18030_map[i].code3)
103
2.66k
        max = i - 1;
104
2.40k
    else if (cc.code >= ucs_gb18030_map[i+1].code3)
105
2.02k
        min = i + 1;
106
378
    else
107
378
        break;
108
5.06k
      }
109
727
  }
110
1.41k
  return ucs_gb18030_map[i].code + cc.code - ucs_gb18030_map[i].code3;
111
1.41k
    }
112
2.42k
    if (cc.code >= WC_C_GB18030_UCS4 && cc.code <= WC_C_GB18030_UCS4_END)
113
943
  return WC_GB18030_N(cc.code) - WC_GB18030_N(WC_C_GB18030_UCS4)
114
943
    + 0x10000;
115
1.48k
    return WC_C_UCS4_ERROR;
116
2.42k
}
117
118
wc_wchar_t
119
wc_ucs_to_gb18030(wc_uint32 ucs)
120
14.5M
{
121
14.5M
    wc_wchar_t cc;
122
14.5M
    wc_map3 *map;
123
124
14.5M
    if (ucs <= WC_C_UCS2_END) {
125
14.5M
  map = wc_map3_range_search((wc_uint16)ucs,
126
14.5M
    ucs_gbk_ext_map, N_ucs_gbk_ext_map);
127
14.5M
  if (map) {
128
360
      cc.code = WC_GBK_N(map->code3) + ucs - map->code;
129
360
      cc.code = WC_N_GBK(cc.code);
130
360
      cc.ccs = WC_CCS_GBK_EXT;
131
360
      return cc;
132
360
  }
133
14.5M
  map = wc_map3_range_search((wc_uint16)ucs,
134
14.5M
    ucs_gb18030_map, N_ucs_gb18030_map);
135
14.5M
  if (map) {
136
14.5M
      cc.code = map->code3 + ucs - map->code + WC_GB18030_N(WC_C_GB18030_UCS2);
137
14.5M
      cc.code = WC_N_GB18030(cc.code);
138
14.5M
      if (WcOption.gb18030_as_ucs)
139
0
    cc.ccs = WC_CCS_GB18030 | (wc_ucs_to_ccs(ucs) & ~WC_CCS_A_SET);
140
14.5M
      else
141
14.5M
    cc.ccs = WC_CCS_GB18030_W;
142
14.5M
      return cc;
143
14.5M
  }
144
14.5M
    } else if (ucs <= WC_C_UNICODE_END) {
145
1.17k
  cc.code = ucs - 0x10000 + WC_GB18030_N(WC_C_GB18030_UCS4);
146
1.17k
  cc.code = WC_N_GB18030(cc.code);
147
1.17k
  if (WcOption.gb18030_as_ucs)
148
0
      cc.ccs = WC_CCS_GB18030 | (wc_ucs_to_ccs(ucs) & ~WC_CCS_A_SET);
149
1.17k
  else
150
1.17k
      cc.ccs = WC_CCS_GB18030_W;
151
1.17k
  return cc;
152
1.17k
    }
153
5.00k
    cc.ccs = WC_CCS_UNKNOWN;
154
5.00k
    cc.code = 0;
155
5.00k
    return cc;
156
14.5M
}
157
#endif
158
159
Str
160
wc_conv_from_gb18030(Str is, wc_ces ces)
161
1.52k
{
162
1.52k
    Str os;
163
1.52k
    wc_uchar *sp = (wc_uchar *)is->ptr;
164
1.52k
    wc_uchar *ep = sp + is->length;
165
1.52k
    wc_uchar *p;
166
1.52k
    int state = WC_GB18030_NOSTATE;
167
1.52k
    wc_uint32 gbk;
168
1.52k
    wc_wchar_t cc;
169
1.52k
#ifdef USE_UNICODE
170
1.52k
    wc_uint32 ucs;
171
1.52k
#endif
172
173
4.15k
    for (p = sp; p < ep && *p < 0x80; p++) 
174
2.62k
  ;
175
1.52k
    if (p == ep)
176
11
  return is;
177
1.51k
    os = Strnew_size(is->length);
178
1.51k
    if (p > sp)
179
317
  Strcat_charp_n(os, (char *)is->ptr, (int)(p - sp));
180
181
3.28M
    for (; p < ep; p++) {
182
3.28M
  switch (state) {
183
1.97M
  case WC_GB18030_NOSTATE:
184
1.97M
      switch (WC_GB18030_MAP[*p]) {
185
1.12M
      case UB:
186
1.12M
    state = WC_GB18030_MBYTE1;
187
1.12M
    break;
188
377k
      case C1:
189
377k
    wtf_push_unknown(os, p, 1);
190
377k
    break;
191
476k
      default:
192
476k
    Strcat_char(os, (char)*p);
193
476k
    break;
194
1.97M
      }
195
1.97M
      break;
196
1.97M
  case WC_GB18030_MBYTE1:
197
1.12M
      if (WC_GB18030_MAP[*p] & LB) {
198
981k
    gbk = ((wc_uint32)*(p-1) << 8) | *p;
199
981k
    if (wc_gbk_or_gbk_ext(gbk) == WC_CCS_GBK_EXT)
200
469k
        wtf_push(os, WC_CCS_GBK_EXT, gbk);
201
512k
    else if (*(p-1) >= 0xA1 && *p >= 0xA1)
202
417k
        wtf_push(os, wc_gb2312_or_gbk(gbk), gbk);
203
95.0k
    else
204
95.0k
        wtf_push(os, WC_CCS_GBK, gbk);
205
981k
      } else if (WC_GB18030_MAP[*p] == L4) {
206
93.9k
    state = WC_GB18030_MBYTE2;
207
93.9k
    break;
208
93.9k
      } else
209
45.2k
    wtf_push_unknown(os, p-1, 2);
210
1.02M
      state = WC_GB18030_NOSTATE;
211
1.02M
      break;
212
93.9k
  case WC_GB18030_MBYTE2:
213
93.9k
      if (WC_GB18030_MAP[*p] == UB) {
214
89.9k
    state = WC_GB18030_MBYTE3;
215
89.9k
    break;
216
89.9k
      } else
217
3.93k
    wtf_push_unknown(os, p-2, 3);
218
3.93k
      state = WC_GB18030_NOSTATE;
219
3.93k
      break;
220
89.9k
  case WC_GB18030_MBYTE3:
221
89.9k
      if (WC_GB18030_MAP[*p] == L4) {
222
86.5k
    cc.ccs = WC_CCS_GB18030_W;
223
86.5k
    cc.code = ((wc_uint32)*(p-3) << 24)
224
86.5k
            | ((wc_uint32)*(p-2) << 16)
225
86.5k
            | ((wc_uint32)*(p-1) << 8)
226
86.5k
            | *p;
227
86.5k
#ifdef USE_UNICODE
228
86.5k
    if (WcOption.gb18030_as_ucs &&
229
86.5k
        (ucs = wc_gb18030_to_ucs(cc)) != WC_C_UCS4_ERROR)
230
0
        wtf_push(os, WC_CCS_GB18030 | (wc_ucs_to_ccs(ucs) & ~WC_CCS_A_SET), cc.code);
231
86.5k
    else
232
86.5k
#endif
233
86.5k
        wtf_push(os, cc.ccs, cc.code);
234
86.5k
      } else
235
3.45k
    wtf_push_unknown(os, p-3, 4);
236
89.9k
      state = WC_GB18030_NOSTATE;
237
89.9k
      break;
238
3.28M
  }
239
3.28M
    }
240
1.51k
    switch (state) {
241
515
    case WC_GB18030_MBYTE1:
242
515
  wtf_push_unknown(os, p-1, 1);
243
515
  break;
244
14
    case WC_GB18030_MBYTE2:
245
14
  wtf_push_unknown(os, p-2, 2);
246
14
  break;
247
13
    case WC_GB18030_MBYTE3:
248
13
  wtf_push_unknown(os, p-3, 3);
249
13
  break;
250
1.51k
    }
251
1.51k
    return os;
252
1.51k
}
253
254
void
255
wc_push_to_gb18030(Str os, wc_wchar_t cc, wc_status *st)
256
17.2M
{
257
33.6M
  while (1) {
258
33.6M
    switch (WC_CCS_SET(cc.ccs)) {
259
1.36M
    case WC_CCS_US_ASCII:
260
1.36M
  Strcat_char(os, (char)cc.code);
261
1.36M
  return;
262
650k
    case WC_CCS_GB_2312:
263
650k
  Strcat_char(os, (char)((cc.code >> 8) | 0x80));
264
650k
  Strcat_char(os, (char)((cc.code & 0xff) | 0x80));
265
650k
  return;
266
13.6k
    case WC_CCS_GBK_1:
267
14.8k
    case WC_CCS_GBK_2:
268
14.8k
  cc = wc_cs128w_to_gbk(cc);
269
40.3k
    case WC_CCS_GBK:
270
40.3k
  Strcat_char(os, (char)(cc.code >> 8));
271
40.3k
  Strcat_char(os, (char)(cc.code & 0xff));
272
40.3k
  return;
273
425
    case WC_CCS_GBK_EXT_1:
274
836
    case WC_CCS_GBK_EXT_2:
275
836
  cc = wc_cs128w_to_gbk(cc);
276
274k
    case WC_CCS_GBK_EXT:
277
274k
  Strcat_char(os, (char)(cc.code >> 8));
278
274k
  Strcat_char(os, (char)(cc.code & 0xff));
279
274k
  return;
280
14.5M
    case WC_CCS_GB18030:
281
14.5M
  Strcat_char(os, (char)((cc.code >> 24) & 0xff));
282
14.5M
  Strcat_char(os, (char)((cc.code >> 16) & 0xff));
283
14.5M
  Strcat_char(os, (char)((cc.code >> 8)  & 0xff));
284
14.5M
  Strcat_char(os, (char)(cc.code & 0xff));
285
14.5M
  return;
286
9.76k
    case WC_CCS_UNKNOWN_W:
287
9.76k
  if (!WcOption.no_replace)
288
9.76k
      Strcat_charp(os, WC_REPLACE_W);
289
9.76k
  return;
290
361k
    case WC_CCS_UNKNOWN:
291
361k
  if (!WcOption.no_replace)
292
361k
      Strcat_charp(os, WC_REPLACE);
293
361k
  return;
294
16.3M
    default:
295
16.3M
#ifdef USE_UNICODE
296
16.3M
  if (WcOption.ucs_conv)
297
16.3M
      cc = wc_any_to_any_ces(cc, st);
298
0
  else
299
0
#endif
300
0
      cc.ccs = WC_CCS_IS_WIDE(cc.ccs) ? WC_CCS_UNKNOWN_W : WC_CCS_UNKNOWN;
301
16.3M
  continue;
302
33.6M
    }
303
33.6M
  }
304
17.2M
}
305
306
Str
307
wc_char_conv_from_gb18030(wc_uchar c, wc_status *st)
308
0
{
309
0
    static Str os;
310
0
    static wc_uchar gb[4];
311
0
    wc_uint32 gbk;
312
0
    wc_wchar_t cc;
313
0
#ifdef USE_UNICODE
314
0
    wc_uint32 ucs;
315
0
#endif
316
317
0
    if (st->state == -1) {
318
0
  st->state = WC_GB18030_NOSTATE;
319
0
  os = Strnew_size(8);
320
0
    }
321
322
0
    switch (st->state) {
323
0
    case WC_GB18030_NOSTATE:
324
0
  switch (WC_GB18030_MAP[c]) {
325
0
  case UB:
326
0
      gb[0] = c;
327
0
      st->state = WC_GB18030_MBYTE1;
328
0
      return NULL;
329
0
  case C1:
330
0
      break;
331
0
  default:
332
0
      Strcat_char(os, (char)c);
333
0
      break;
334
0
  }
335
0
  break;
336
0
    case WC_GB18030_MBYTE1:
337
0
  if (WC_GB18030_MAP[c] & LB) {
338
0
      gbk = ((wc_uint32)gb[0] << 8) | c;
339
0
      if (wc_gbk_or_gbk_ext(gbk) == WC_CCS_GBK_EXT)
340
0
    wtf_push(os, WC_CCS_GBK_EXT, gbk);
341
0
      else if (gb[0] >= 0xA1 && c >= 0xA1)
342
0
    wtf_push(os, wc_gb2312_or_gbk(gbk), gbk);
343
0
      else
344
0
    wtf_push(os, WC_CCS_GBK, gbk);
345
0
  } else if (WC_GB18030_MAP[c] == L4) {
346
0
      gb[1] = c;
347
0
      st->state = WC_GB18030_MBYTE2;
348
0
      return NULL;
349
0
  }
350
0
  break;
351
0
    case WC_GB18030_MBYTE2:
352
0
  if (WC_GB18030_MAP[c] == UB) {
353
0
      gb[2] = c;
354
0
      st->state = WC_GB18030_MBYTE3;
355
0
      return NULL;
356
0
  }
357
0
  break;
358
0
    case WC_GB18030_MBYTE3:
359
0
  if (WC_GB18030_MAP[c] == L4) {
360
0
      cc.ccs = WC_CCS_GB18030_W;
361
0
      cc.code = ((wc_uint32)gb[0] << 24)
362
0
        | ((wc_uint32)gb[1] << 16)
363
0
        | ((wc_uint32)gb[2] << 8)
364
0
        | c;
365
0
#ifdef USE_UNICODE
366
0
      if (WcOption.gb18030_as_ucs &&
367
0
    (ucs = wc_gb18030_to_ucs(cc)) != WC_C_UCS4_ERROR)
368
0
    wtf_push(os, WC_CCS_GB18030 | (wc_ucs_to_ccs(ucs) & ~WC_CCS_A_SET), cc.code);
369
0
      else
370
0
#endif
371
0
          wtf_push(os, cc.ccs, cc.code);
372
0
  }
373
0
  break;
374
0
    }
375
0
    st->state = -1;
376
0
    return os;
377
0
}