Coverage Report

Created: 2026-04-28 06:11

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/w3m/libwc/charset.c
Line
Count
Source
1
2
#include <stdlib.h>
3
#include <ctype.h>
4
#include "../alloc.h"
5
6
#include "wc.h"
7
8
#ifdef HAVE_LANGINFO_CODESET
9
#include <langinfo.h>
10
#endif
11
12
wc_locale WcLocale = 0;
13
14
static struct {
15
  char *lang;
16
  wc_ces ces;
17
} lang_ces_table[] = {
18
  { "cs", WC_CES_ISO_8859_2 },  /* cs_CZ */
19
  { "el", WC_CES_ISO_8859_7 },  /* el_GR */
20
  { "iw", WC_CES_ISO_8859_8 },  /* iw_IL */
21
  { "ja", WC_CES_EUC_JP },  /* ja_JP */
22
  { "ko", WC_CES_EUC_KR },  /* ko_KR */
23
  { "hu", WC_CES_ISO_8859_2 },  /* hu_HU */
24
  { "pl", WC_CES_ISO_8859_2 },  /* pl_PL */
25
  { "ro", WC_CES_ISO_8859_2 },  /* ro_RO */
26
  { "ru", WC_CES_ISO_8859_5 },  /* ru_SU */
27
  { "sk", WC_CES_ISO_8859_2 },  /* sk_SK */
28
  { "sl", WC_CES_ISO_8859_2 },  /* sl_CS */
29
  { "tr", WC_CES_ISO_8859_9 },  /* tr_TR */
30
  { "zh", WC_CES_EUC_CN },  /* zh_CN */
31
  { NULL, 0 }
32
};
33
34
static wc_ces
35
wc_codepage(int n)
36
430
{
37
430
  switch (n) {
38
4
  case 437: return WC_CES_CP437;
39
3
  case 737: return WC_CES_CP737;
40
3
  case 775: return WC_CES_CP775;
41
5
  case 850: return WC_CES_CP850;
42
3
  case 852: return WC_CES_CP852;
43
8
  case 855: return WC_CES_CP855;
44
3
  case 856: return WC_CES_CP856;
45
3
  case 857: return WC_CES_CP857;
46
5
  case 860: return WC_CES_CP860;
47
3
  case 861: return WC_CES_CP861;
48
3
  case 862: return WC_CES_CP862;
49
5
  case 863: return WC_CES_CP863;
50
13
  case 864: return WC_CES_CP864;
51
4
  case 865: return WC_CES_CP865;
52
4
  case 866: return WC_CES_CP866;
53
4
  case 869: return WC_CES_CP869;
54
17
  case 874: return WC_CES_CP874;
55
3
  case 932: return WC_CES_CP932;   /* CP932 = Shift_JIS */
56
4
  case 936: return WC_CES_CP936;   /* CP936 = GBK > EUC_CN */
57
3
  case 943: return WC_CES_CP943;   /* CP943 = Shift_JIS */
58
3
  case 949: return WC_CES_CP949;   /* CP949 = UHC > EUC_KR */
59
1
  case 950: return WC_CES_CP950;   /* CP950 = Big5 */
60
3
  case 1006: return WC_CES_CP1006;
61
3
  case 1250: return WC_CES_CP1250;
62
5
  case 1251: return WC_CES_CP1251;
63
4
  case 1252: return WC_CES_CP1252;
64
3
  case 1253: return WC_CES_CP1253;
65
5
  case 1254: return WC_CES_CP1254;
66
13
  case 1255: return WC_CES_CP1255;
67
13
  case 1256: return WC_CES_CP1256;
68
3
  case 1257: return WC_CES_CP1257;
69
123
  case 1258: return WC_CES_CP1258;
70
430
  }
71
151
  return 0;
72
430
}
73
74
wc_ces
75
wc_guess_charset(char *charset, wc_ces orig)
76
0
{
77
0
    wc_ces guess;
78
79
0
    if (charset == NULL || *charset == '\0')
80
0
  return orig;
81
0
    guess = wc_charset_to_ces(charset);
82
0
    return guess ? guess : orig;
83
0
}
84
85
wc_ces
86
wc_guess_charset_short(char *charset, wc_ces orig)
87
23.3k
{
88
23.3k
    wc_ces guess;
89
90
23.3k
    if (charset == NULL || *charset == '\0')
91
6.95k
  return orig;
92
16.4k
    guess = wc_charset_short_to_ces(charset);
93
16.4k
    return guess ? guess : orig;
94
23.3k
}
95
96
wc_ces
97
wc_guess_locale_charset(char *locale, wc_ces orig)
98
0
{
99
0
    wc_ces guess;
100
101
0
    if (locale == NULL || *locale == '\0')
102
0
  return orig;
103
0
    guess = wc_locale_to_ces(locale);
104
0
    return guess ? guess : orig;
105
0
}
106
107
wc_ces
108
wc_charset_to_ces(char *charset)
109
16.4k
{
110
16.4k
    char *p = charset;
111
16.4k
    char buf[16];
112
16.4k
    int n;
113
114
16.4k
    if ((*p == 'x' || *p == 'X') && *(p+1) == '-')
115
6
  p += 2;
116
66.0k
    for (n = 0; *p && n < 15; p++) {
117
49.6k
  if ((unsigned char)*p > 0x20 && *p != '_' && *p != '-')
118
47.3k
      buf[n++] = *p | 32; /* tolower(*p); */
119
49.6k
    }
120
16.4k
    buf[n] = 0;
121
16.4k
    p = buf;
122
16.4k
    switch (*p) {
123
1.56k
    case 'e':
124
1.56k
  if (! strncmp(p, "euc", 3)) {
125
17
      p += 3;
126
17
      switch (*p) {
127
3
      case 'j': return WC_CES_EUC_JP;
128
4
      case 'c': return WC_CES_EUC_CN;
129
3
      case 't': return WC_CES_EUC_TW;
130
3
      case 'k': return WC_CES_EUC_KR;
131
17
      }
132
4
      switch (WcLocale) {
133
0
      case WC_LOCALE_JA_JP: return WC_CES_EUC_JP;
134
0
      case WC_LOCALE_ZH_CN: return WC_CES_EUC_CN;
135
0
      case WC_LOCALE_ZH_TW: return WC_CES_EUC_TW;
136
0
      case WC_LOCALE_ZH_HK: return WC_CES_EUC_CN;
137
0
      case WC_LOCALE_KO_KR: return WC_CES_EUC_KR;
138
4
      }
139
4
      return WC_CES_EUC_JP;
140
4
        }
141
1.54k
  break;
142
1.54k
    case 'i':
143
207
  if (! strncmp(p, "iso2022", 7)) {
144
30
      p += 7;
145
30
      switch (*p) {
146
21
      case 'j':
147
21
    if (! strncmp(p, "jp2", 3))
148
2
        return WC_CES_ISO_2022_JP_2;
149
19
    if (! strncmp(p, "jp3", 3))
150
3
        return WC_CES_ISO_2022_JP_3;
151
16
    return WC_CES_ISO_2022_JP;
152
3
      case 'c': return WC_CES_ISO_2022_CN;
153
3
      case 'k': return WC_CES_ISO_2022_KR;
154
30
      }
155
3
      return WC_CES_ISO_2022_JP;
156
177
  } else if (! strncmp(p, "iso8859", 7)) {
157
28
      n = atoi(p + 7);
158
28
      if (n >= 1 && n <= 16 && n != 12)
159
5
    return (WC_CES_E_ISO_8859 | n);
160
23
      return WC_CES_ISO_8859_1;
161
149
  } else if (! strncmp(p, "ibm", 3)) {
162
67
      p += 3;
163
67
      if (*p >= '1' && *p <= '9')
164
6
        return wc_codepage(atoi(p));
165
61
      return wc_charset_to_ces(p);
166
67
  }
167
82
  break;
168
1.34k
    case 'j':
169
1.34k
  if (! strncmp(p, "johab", 5))
170
5
      return WC_CES_JOHAB;
171
1.33k
  if (! strncmp(p, "jis", 3))
172
5
      return WC_CES_ISO_2022_JP;
173
1.33k
  break;
174
1.79k
    case 's':
175
1.79k
  if (! strncmp(p, "shiftjisx0213", 13) ||
176
1.77k
      ! strncmp(p, "sjisx0213", 9))
177
306
      return WC_CES_SHIFT_JISX0213;
178
1.49k
  if (! strncmp(p, "shiftjis", 8) ||
179
1.44k
      ! strncmp(p, "sjis", 4))
180
159
      return WC_CES_SHIFT_JIS;
181
1.33k
  break;
182
1.33k
    case 'p':
183
22
  if (! strncmp(p, "pck", 3))
184
3
      return WC_CES_SHIFT_JIS;
185
19
  break;
186
3.05k
    case 'g':
187
3.05k
  if (! strncmp(p, "gb18030", 7) ||
188
3.04k
      ! strncmp(p, "gbk2k", 5))
189
2.01k
      return WC_CES_GB18030;
190
1.03k
  if (! strncmp(p, "gbk", 3))
191
512
      return WC_CES_GBK;
192
526
  if (! strncmp(p, "gb2312", 6))
193
4
      return WC_CES_EUC_CN;
194
522
  break;
195
522
    case 'b':
196
466
  if (! strncmp(p, "big5hkscs", 9))
197
3
      return WC_CES_HKSCS;
198
463
  if (! strncmp(p, "big5", 4))
199
33
      return WC_CES_BIG5;
200
430
  break;
201
1.26k
    case 'h':
202
1.26k
  if (! strncmp(p, "hz", 2))
203
5
      return WC_CES_HZ_GB_2312;
204
1.25k
  if (! strncmp(p, "hkscs", 5))
205
8
      return WC_CES_HKSCS;
206
1.25k
  break;
207
1.25k
    case 'k':
208
515
  if (! strncmp(p, "koi8r", 5))
209
3
      return WC_CES_KOI8_R;
210
512
  if (! strncmp(p, "koi8u", 5))
211
4
      return WC_CES_KOI8_U;
212
508
  if (! strncmp(p, "ksx1001", 7))
213
4
      return WC_CES_EUC_KR;
214
504
  if (! strncmp(p, "ksc5601", 7))
215
3
      return WC_CES_EUC_KR;
216
501
  break;
217
712
    case 't':
218
712
  if (! strncmp(p, "tis620", 6))
219
4
      return WC_CES_TIS_620;
220
708
  if (! strncmp(p, "tcvn", 4))
221
44
      return WC_CES_TCVN_5712;
222
664
  break;
223
664
    case 'n':
224
292
  if (! strncmp(p, "next", 4))
225
3
      return WC_CES_NEXTSTEP;
226
289
  break;
227
507
    case 'v':
228
507
  if (! strncmp(p, "viet", 4)) {
229
22
      p += 4;
230
22
      if (! strncmp(p, "tcvn", 4))
231
3
    return WC_CES_TCVN_5712;
232
22
  }
233
504
  if (! strncmp(p, "viscii", 6))
234
3
      return WC_CES_VISCII_11;
235
501
  if (! strncmp(p, "vps", 3))
236
4
      return WC_CES_VPS;
237
497
  break;
238
3.06k
    case 'u':
239
3.06k
#ifdef USE_UNICODE
240
3.06k
  if (! strncmp(p, "utf8", 4))
241
3
      return WC_CES_UTF_8;
242
3.06k
  if (! strncmp(p, "utf7", 4))
243
116
      return WC_CES_UTF_7;
244
2.95k
#endif
245
2.95k
  if (! strncmp(p, "uhc", 3))
246
447
      return WC_CES_UHC;
247
2.50k
  if (! strncmp(p, "ujis", 4))
248
8
      return WC_CES_EUC_JP;
249
2.49k
  if (! strncmp(p, "usascii", 7))
250
4
      return WC_CES_US_ASCII;
251
2.49k
  break;
252
2.49k
    case 'a':
253
93
  if (! strncmp(p, "ascii", 5))
254
3
      return WC_CES_US_ASCII;
255
90
  break;
256
461
    case 'c':
257
461
  if (! strncmp(p, "cngb", 4))
258
3
      return WC_CES_EUC_CN;
259
458
  if (*(p+1) != 'p')
260
439
      break;
261
19
  p += 2;
262
19
  if (*p >= '1' &&  *p <= '9')
263
6
      return wc_codepage(atoi(p));
264
13
  break;
265
468
    case 'w':
266
468
  if (strncmp(p, "windows", 7))
267
434
      break;
268
34
  p += 7;
269
34
  if (! strncmp(p, "31j", 3))
270
3
      return WC_CES_CP932;
271
31
  if (*p >= '1' &&  *p <= '9')
272
12
      return wc_codepage(atoi(p));
273
19
  break;
274
16.4k
    }
275
12.6k
    return 0;
276
16.4k
}
277
278
wc_ces
279
wc_charset_short_to_ces(char *charset)
280
16.4k
{
281
16.4k
    char *p = charset;
282
16.4k
    char buf[16];
283
16.4k
    wc_ces ces;
284
16.4k
    int n;
285
286
16.4k
    ces = wc_charset_to_ces(charset);
287
16.4k
    if (ces)
288
3.79k
  return ces;
289
290
40.1k
    for (n = 0; *p && n < 15; p++) {
291
27.5k
  if ((unsigned char)*p > 0x20 && *p != '_' && *p != '-')
292
25.7k
      buf[n++] = *p | 32; /* tolower(*p); */
293
27.5k
    }
294
12.6k
    buf[n] = 0;
295
12.6k
    p = buf;
296
12.6k
    switch (*p) {
297
1.54k
    case 'e':
298
1.54k
  switch (*(p+1)) {
299
1
  case 'j': return WC_CES_EUC_JP;
300
3
  case 'c': return WC_CES_EUC_CN;
301
441
  case 't': return WC_CES_EUC_TW;
302
15
  case 'k': return WC_CES_EUC_KR;
303
1.54k
  }
304
1.08k
  return WC_CES_EUC_JP;
305
1.33k
    case 'j':
306
1.33k
  p++;
307
1.33k
  if (*p == 'o')
308
739
      return WC_CES_JOHAB;
309
594
  if (*p == 'p')
310
3
     p++;
311
594
  if (*p == '2')
312
134
     return WC_CES_ISO_2022_JP_2;
313
460
  if (*p == '3')
314
7
     return WC_CES_ISO_2022_JP_3;
315
453
  return WC_CES_ISO_2022_JP;
316
1.33k
    case 's':
317
1.33k
  return WC_CES_SHIFT_JIS;
318
522
    case 'g':
319
522
  return WC_CES_EUC_CN;
320
429
    case 'b':
321
429
  return WC_CES_BIG5;
322
1.25k
    case 'h':
323
1.25k
  if (*(p+1) == 'k')
324
437
      return WC_CES_HKSCS;
325
814
  return WC_CES_HZ_GB_2312;
326
500
    case 'k':
327
500
  if (*(p+1) == 'o')
328
30
      return WC_CES_KOI8_R;
329
470
  return WC_CES_ISO_2022_KR;
330
314
    case 'l':
331
314
  n = atoi(p + 1);
332
314
  if (n >= 1 && n <= 16 && n != 12)
333
30
      return (WC_CES_E_ISO_8859 | n);
334
284
  return WC_CES_ISO_8859_1;
335
664
    case 't':
336
664
  if (*(p+1) == 'c')
337
153
      return WC_CES_TCVN_5712;
338
511
  return WC_CES_TIS_620;
339
289
    case 'n':
340
289
  return WC_CES_NEXTSTEP;
341
497
    case 'v':
342
497
  if (*(p+1) == 'p')
343
42
      return WC_CES_VPS;
344
455
  return WC_CES_VISCII_11;
345
0
#ifdef USE_UNICODE
346
2.49k
    case 'u':
347
2.49k
  if (*(p+1) == '7')
348
841
      return WC_CES_UTF_7;
349
1.64k
  return WC_CES_UTF_8;
350
0
#endif
351
90
    case 'a':
352
90
  return WC_CES_US_ASCII;
353
457
    case 'c':
354
457
  return WC_CES_ISO_2022_CN;
355
465
    case 'w':
356
465
  p++;
357
465
  if (*p >= '1' &&  *p <= '9')
358
406
      return wc_codepage(atoi(p));
359
59
  break;
360
130
    case 'r':
361
130
  return WC_CES_RAW;
362
12.6k
    }
363
387
    return 0;
364
12.6k
}
365
366
wc_ces
367
wc_locale_to_ces(char *locale)
368
0
{
369
0
    char *p = locale;
370
0
    char buf[8];
371
0
    int n;
372
373
0
    if (*p == 'C' && *(p+1) == '\0')
374
0
  return WC_CES_US_ASCII;
375
0
#ifdef HAVE_LANGINFO_CODESET
376
0
    {
377
0
  char *cs = nl_langinfo(CODESET);
378
0
  if (cs && strcmp(cs, "US-ASCII"))
379
0
      return wc_charset_to_ces(cs);
380
0
    }
381
0
#endif
382
0
    for (n = 0; *p && *p != '.' && n < 7; p++) {
383
0
  if ((unsigned char)*p > 0x20)
384
0
      buf[n++] = *p | 32; /* tolower(*p); */
385
0
    }
386
0
    buf[n] = 0;
387
0
    if (*p == '.') {
388
0
  p++;
389
0
  if (! strcasecmp(p, "euc")) {
390
0
      switch (buf[0]) {
391
0
      case 'j':
392
0
    WcLocale = WC_LOCALE_JA_JP;
393
0
    break;
394
0
      case 'k':
395
0
    WcLocale = WC_LOCALE_KO_KR;
396
0
    break;
397
0
      case 'z':
398
0
          if (!strcmp(buf, "zh_tw"))
399
0
        WcLocale = WC_LOCALE_ZH_TW;
400
0
          else if (!strcmp(buf, "zh_hk"))
401
0
        WcLocale = WC_LOCALE_ZH_HK;
402
0
    else
403
0
        WcLocale = WC_LOCALE_ZH_CN;
404
0
    break;
405
0
      default:
406
0
    WcLocale = 0;
407
0
    break;
408
0
      }
409
0
  }
410
0
  return wc_charset_to_ces(p);
411
0
    }
412
413
0
    if (!strcmp(buf, "japanese"))
414
0
  return WC_CES_SHIFT_JIS;
415
0
    if (!strcmp(buf, "zh_tw") ||
416
0
  !strcmp(buf, "zh_hk"))
417
0
  return WC_CES_BIG5;
418
0
    for (n = 0; lang_ces_table[n].lang; n++) {
419
0
  if (!strncmp(buf, lang_ces_table[n].lang, 2))
420
0
      return lang_ces_table[n].ces;
421
0
    }
422
0
    return WC_CES_ISO_8859_1;
423
0
}
424
425
char *
426
wc_ces_to_charset(wc_ces ces)
427
0
{
428
0
    if (ces == WC_CES_WTF)
429
0
  return "WTF";
430
0
    return WcCesInfo[WC_CES_INDEX(ces)].name;
431
0
}
432
433
char *
434
wc_ces_to_charset_desc(wc_ces ces)
435
0
{
436
0
    if (ces == WC_CES_WTF)
437
0
  return "W3M Transfer Format";
438
0
    return WcCesInfo[WC_CES_INDEX(ces)].desc;
439
0
}
440
441
wc_ces
442
wc_guess_8bit_charset(wc_ces orig)
443
0
{
444
0
    switch (orig) {
445
0
    case WC_CES_ISO_2022_JP:
446
0
    case WC_CES_ISO_2022_JP_2:
447
0
    case WC_CES_ISO_2022_JP_3:
448
0
  return WC_CES_EUC_JP;
449
0
    case WC_CES_ISO_2022_KR:
450
0
  return WC_CES_EUC_KR;
451
0
    case WC_CES_ISO_2022_CN:
452
0
    case WC_CES_HZ_GB_2312:
453
0
  return WC_CES_EUC_CN;
454
0
    case WC_CES_US_ASCII:
455
0
  return WC_CES_ISO_8859_1;
456
0
    }
457
0
    return orig;
458
0
}
459
460
wc_bool
461
wc_check_ces(wc_ces ces)
462
2
{
463
2
    size_t i = WC_CES_INDEX(ces);
464
465
2
    return (i <= WC_CES_END && WcCesInfo[i].id == ces);
466
2
}
467
468
static int
469
wc_ces_list_cmp(const void *a, const void *b)
470
0
{
471
0
    return strcasecmp(((wc_ces_list *)a)->desc, ((wc_ces_list *)b)->desc);
472
0
}
473
474
static wc_ces_list *list = NULL;
475
476
wc_ces_list *
477
wc_get_ces_list(void)
478
0
{
479
0
    wc_ces_info *info;
480
0
    size_t n;
481
482
0
    if (list)
483
0
  return list;
484
0
    for (info = WcCesInfo, n = 0; info->id; info++) {
485
0
  if (info->name != NULL)
486
0
      n++;
487
0
    }
488
0
    list = New_N(wc_ces_list, n + 1);
489
0
    for (info = WcCesInfo, n = 0; info->id; info++) {
490
0
  if (info->name != NULL) {
491
0
      list[n].id = info->id;
492
0
      list[n].name = info->name;
493
0
      list[n].desc = info->desc;
494
0
      n++;
495
0
  }
496
0
    }
497
0
    list[n].id = 0;
498
0
    list[n].name = NULL;
499
    list[n].desc = NULL;
500
0
    qsort(list, n, sizeof(wc_ces_list), wc_ces_list_cmp);
501
0
    return list;
502
0
}