Coverage Report

Created: 2025-10-14 06:38

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/w3m/libwc/charset.c
Line
Count
Source
1
2
#include <stdlib.h>
3
#include <ctype.h>
4
#include "../alloc.h"
5
6
#include "wc.h"
7
8
#ifdef HAVE_LANGINFO_CODESET
9
#include <langinfo.h>
10
#endif
11
12
/*
 * Locale hint shared across this file: wc_locale_to_ces() stores a
 * WC_LOCALE_* value here when the locale's codeset suffix is exactly "euc",
 * and wc_charset_to_ces() consults it to resolve a bare "euc" charset name.
 * Zero means no hint is set.
 */
wc_locale WcLocale = 0;
13
14
static struct {
15
  char *lang;
16
  wc_ces ces;
17
} lang_ces_table[] = {
18
  { "cs", WC_CES_ISO_8859_2 },  /* cs_CZ */
19
  { "el", WC_CES_ISO_8859_7 },  /* el_GR */
20
  { "iw", WC_CES_ISO_8859_8 },  /* iw_IL */
21
  { "ja", WC_CES_EUC_JP },  /* ja_JP */
22
  { "ko", WC_CES_EUC_KR },  /* ko_KR */
23
  { "hu", WC_CES_ISO_8859_2 },  /* hu_HU */
24
  { "pl", WC_CES_ISO_8859_2 },  /* pl_PL */
25
  { "ro", WC_CES_ISO_8859_2 },  /* ro_RO */
26
  { "ru", WC_CES_ISO_8859_5 },  /* ru_SU */
27
  { "sk", WC_CES_ISO_8859_2 },  /* sk_SK */
28
  { "sl", WC_CES_ISO_8859_2 },  /* sl_CS */
29
  { "tr", WC_CES_ISO_8859_9 },  /* tr_TR */
30
  { "zh", WC_CES_EUC_CN },  /* zh_CN */
31
  { NULL, 0 }
32
};
33
34
static wc_ces
35
wc_codepage(int n)
36
413
{
37
413
  switch (n) {
38
6
  case 437: return WC_CES_CP437;
39
3
  case 737: return WC_CES_CP737;
40
4
  case 775: return WC_CES_CP775;
41
3
  case 850: return WC_CES_CP850;
42
4
  case 852: return WC_CES_CP852;
43
3
  case 855: return WC_CES_CP855;
44
5
  case 856: return WC_CES_CP856;
45
3
  case 857: return WC_CES_CP857;
46
3
  case 860: return WC_CES_CP860;
47
3
  case 861: return WC_CES_CP861;
48
4
  case 862: return WC_CES_CP862;
49
4
  case 863: return WC_CES_CP863;
50
13
  case 864: return WC_CES_CP864;
51
4
  case 865: return WC_CES_CP865;
52
5
  case 866: return WC_CES_CP866;
53
3
  case 869: return WC_CES_CP869;
54
15
  case 874: return WC_CES_CP874;
55
3
  case 932: return WC_CES_CP932;   /* CP932 = Shift_JIS */
56
4
  case 936: return WC_CES_CP936;   /* CP936 = GBK > EUC_CN */
57
3
  case 943: return WC_CES_CP943;   /* CP943 = Shift_JIS */
58
3
  case 949: return WC_CES_CP949;   /* CP949 = UHC > EUC_KR */
59
3
  case 950: return WC_CES_CP950;   /* CP950 = Big5 */
60
3
  case 1006: return WC_CES_CP1006;
61
4
  case 1250: return WC_CES_CP1250;
62
4
  case 1251: return WC_CES_CP1251;
63
4
  case 1252: return WC_CES_CP1252;
64
4
  case 1253: return WC_CES_CP1253;
65
5
  case 1254: return WC_CES_CP1254;
66
14
  case 1255: return WC_CES_CP1255;
67
12
  case 1256: return WC_CES_CP1256;
68
4
  case 1257: return WC_CES_CP1257;
69
114
  case 1258: return WC_CES_CP1258;
70
413
  }
71
144
  return 0;
72
413
}
73
74
wc_ces
75
wc_guess_charset(char *charset, wc_ces orig)
76
0
{
77
0
    wc_ces guess;
78
79
0
    if (charset == NULL || *charset == '\0')
80
0
  return orig;
81
0
    guess = wc_charset_to_ces(charset);
82
0
    return guess ? guess : orig;
83
0
}
84
85
wc_ces
86
wc_guess_charset_short(char *charset, wc_ces orig)
87
24.0k
{
88
24.0k
    wc_ces guess;
89
90
24.0k
    if (charset == NULL || *charset == '\0')
91
7.24k
  return orig;
92
16.7k
    guess = wc_charset_short_to_ces(charset);
93
16.7k
    return guess ? guess : orig;
94
24.0k
}
95
96
wc_ces
97
wc_guess_locale_charset(char *locale, wc_ces orig)
98
0
{
99
0
    wc_ces guess;
100
101
0
    if (locale == NULL || *locale == '\0')
102
0
  return orig;
103
0
    guess = wc_locale_to_ces(locale);
104
0
    return guess ? guess : orig;
105
0
}
106
107
wc_ces
108
wc_charset_to_ces(char *charset)
109
16.8k
{
110
16.8k
    char *p = charset;
111
16.8k
    char buf[16];
112
16.8k
    int n;
113
114
16.8k
    if ((*p == 'x' || *p == 'X') && *(p+1) == '-')
115
6
  p += 2;
116
66.2k
    for (n = 0; *p && n < 15; p++) {
117
49.3k
  if ((unsigned char)*p > 0x20 && *p != '_' && *p != '-')
118
47.3k
      buf[n++] = *p | 32; /* tolower(*p); */
119
49.3k
    }
120
16.8k
    buf[n] = 0;
121
16.8k
    p = buf;
122
16.8k
    switch (*p) {
123
1.47k
    case 'e':
124
1.47k
  if (! strncmp(p, "euc", 3)) {
125
17
      p += 3;
126
17
      switch (*p) {
127
3
      case 'j': return WC_CES_EUC_JP;
128
3
      case 'c': return WC_CES_EUC_CN;
129
3
      case 't': return WC_CES_EUC_TW;
130
3
      case 'k': return WC_CES_EUC_KR;
131
17
      }
132
5
      switch (WcLocale) {
133
0
      case WC_LOCALE_JA_JP: return WC_CES_EUC_JP;
134
0
      case WC_LOCALE_ZH_CN: return WC_CES_EUC_CN;
135
0
      case WC_LOCALE_ZH_TW: return WC_CES_EUC_TW;
136
0
      case WC_LOCALE_ZH_HK: return WC_CES_EUC_CN;
137
0
      case WC_LOCALE_KO_KR: return WC_CES_EUC_KR;
138
5
      }
139
5
      return WC_CES_EUC_JP;
140
5
        }
141
1.46k
  break;
142
1.46k
    case 'i':
143
217
  if (! strncmp(p, "iso2022", 7)) {
144
33
      p += 7;
145
33
      switch (*p) {
146
24
      case 'j':
147
24
    if (! strncmp(p, "jp2", 3))
148
3
        return WC_CES_ISO_2022_JP_2;
149
21
    if (! strncmp(p, "jp3", 3))
150
3
        return WC_CES_ISO_2022_JP_3;
151
18
    return WC_CES_ISO_2022_JP;
152
3
      case 'c': return WC_CES_ISO_2022_CN;
153
3
      case 'k': return WC_CES_ISO_2022_KR;
154
33
      }
155
3
      return WC_CES_ISO_2022_JP;
156
184
  } else if (! strncmp(p, "iso8859", 7)) {
157
32
      n = atoi(p + 7);
158
32
      if (n >= 1 && n <= 16 && n != 12)
159
6
    return (WC_CES_E_ISO_8859 | n);
160
26
      return WC_CES_ISO_8859_1;
161
152
  } else if (! strncmp(p, "ibm", 3)) {
162
63
      p += 3;
163
63
      if (*p >= '1' && *p <= '9')
164
6
        return wc_codepage(atoi(p));
165
57
      return wc_charset_to_ces(p);
166
63
  }
167
89
  break;
168
1.41k
    case 'j':
169
1.41k
  if (! strncmp(p, "johab", 5))
170
3
      return WC_CES_JOHAB;
171
1.41k
  if (! strncmp(p, "jis", 3))
172
11
      return WC_CES_ISO_2022_JP;
173
1.40k
  break;
174
1.90k
    case 's':
175
1.90k
  if (! strncmp(p, "shiftjisx0213", 13) ||
176
1.88k
      ! strncmp(p, "sjisx0213", 9))
177
299
      return WC_CES_SHIFT_JISX0213;
178
1.60k
  if (! strncmp(p, "shiftjis", 8) ||
179
1.55k
      ! strncmp(p, "sjis", 4))
180
165
      return WC_CES_SHIFT_JIS;
181
1.44k
  break;
182
1.44k
    case 'p':
183
26
  if (! strncmp(p, "pck", 3))
184
3
      return WC_CES_SHIFT_JIS;
185
23
  break;
186
3.00k
    case 'g':
187
3.00k
  if (! strncmp(p, "gb18030", 7) ||
188
2.99k
      ! strncmp(p, "gbk2k", 5))
189
2.02k
      return WC_CES_GB18030;
190
981
  if (! strncmp(p, "gbk", 3))
191
492
      return WC_CES_GBK;
192
489
  if (! strncmp(p, "gb2312", 6))
193
3
      return WC_CES_EUC_CN;
194
486
  break;
195
486
    case 'b':
196
470
  if (! strncmp(p, "big5hkscs", 9))
197
5
      return WC_CES_HKSCS;
198
465
  if (! strncmp(p, "big5", 4))
199
33
      return WC_CES_BIG5;
200
432
  break;
201
1.30k
    case 'h':
202
1.30k
  if (! strncmp(p, "hz", 2))
203
3
      return WC_CES_HZ_GB_2312;
204
1.30k
  if (! strncmp(p, "hkscs", 5))
205
6
      return WC_CES_HKSCS;
206
1.30k
  break;
207
1.30k
    case 'k':
208
559
  if (! strncmp(p, "koi8r", 5))
209
3
      return WC_CES_KOI8_R;
210
556
  if (! strncmp(p, "koi8u", 5))
211
3
      return WC_CES_KOI8_U;
212
553
  if (! strncmp(p, "ksx1001", 7))
213
3
      return WC_CES_EUC_KR;
214
550
  if (! strncmp(p, "ksc5601", 7))
215
3
      return WC_CES_EUC_KR;
216
547
  break;
217
676
    case 't':
218
676
  if (! strncmp(p, "tis620", 6))
219
3
      return WC_CES_TIS_620;
220
673
  if (! strncmp(p, "tcvn", 4))
221
27
      return WC_CES_TCVN_5712;
222
646
  break;
223
646
    case 'n':
224
302
  if (! strncmp(p, "next", 4))
225
3
      return WC_CES_NEXTSTEP;
226
299
  break;
227
550
    case 'v':
228
550
  if (! strncmp(p, "viet", 4)) {
229
22
      p += 4;
230
22
      if (! strncmp(p, "tcvn", 4))
231
3
    return WC_CES_TCVN_5712;
232
22
  }
233
547
  if (! strncmp(p, "viscii", 6))
234
4
      return WC_CES_VISCII_11;
235
543
  if (! strncmp(p, "vps", 3))
236
3
      return WC_CES_VPS;
237
540
  break;
238
3.24k
    case 'u':
239
3.24k
#ifdef USE_UNICODE
240
3.24k
  if (! strncmp(p, "utf8", 4))
241
13
      return WC_CES_UTF_8;
242
3.22k
  if (! strncmp(p, "utf7", 4))
243
184
      return WC_CES_UTF_7;
244
3.04k
#endif
245
3.04k
  if (! strncmp(p, "uhc", 3))
246
459
      return WC_CES_UHC;
247
2.58k
  if (! strncmp(p, "ujis", 4))
248
7
      return WC_CES_EUC_JP;
249
2.57k
  if (! strncmp(p, "usascii", 7))
250
4
      return WC_CES_US_ASCII;
251
2.57k
  break;
252
2.57k
    case 'a':
253
96
  if (! strncmp(p, "ascii", 5))
254
3
      return WC_CES_US_ASCII;
255
93
  break;
256
486
    case 'c':
257
486
  if (! strncmp(p, "cngb", 4))
258
3
      return WC_CES_EUC_CN;
259
483
  if (*(p+1) != 'p')
260
468
      break;
261
15
  p += 2;
262
15
  if (*p >= '1' &&  *p <= '9')
263
4
      return wc_codepage(atoi(p));
264
11
  break;
265
453
    case 'w':
266
453
  if (strncmp(p, "windows", 7))
267
418
      break;
268
35
  p += 7;
269
35
  if (! strncmp(p, "31j", 3))
270
3
      return WC_CES_CP932;
271
32
  if (*p >= '1' &&  *p <= '9')
272
17
      return wc_codepage(atoi(p));
273
15
  break;
274
16.8k
    }
275
12.9k
    return 0;
276
16.8k
}
277
278
wc_ces
279
wc_charset_short_to_ces(char *charset)
280
16.7k
{
281
16.7k
    char *p = charset;
282
16.7k
    char buf[16];
283
16.7k
    wc_ces ces;
284
16.7k
    int n;
285
286
16.7k
    ces = wc_charset_to_ces(charset);
287
16.7k
    if (ces)
288
3.85k
  return ces;
289
290
40.5k
    for (n = 0; *p && n < 15; p++) {
291
27.5k
  if ((unsigned char)*p > 0x20 && *p != '_' && *p != '-')
292
26.0k
      buf[n++] = *p | 32; /* tolower(*p); */
293
27.5k
    }
294
12.9k
    buf[n] = 0;
295
12.9k
    p = buf;
296
12.9k
    switch (*p) {
297
1.46k
    case 'e':
298
1.46k
  switch (*(p+1)) {
299
1
  case 'j': return WC_CES_EUC_JP;
300
3
  case 'c': return WC_CES_EUC_CN;
301
434
  case 't': return WC_CES_EUC_TW;
302
14
  case 'k': return WC_CES_EUC_KR;
303
1.46k
  }
304
1.00k
  return WC_CES_EUC_JP;
305
1.40k
    case 'j':
306
1.40k
  p++;
307
1.40k
  if (*p == 'o')
308
739
      return WC_CES_JOHAB;
309
665
  if (*p == 'p')
310
3
     p++;
311
665
  if (*p == '2')
312
117
     return WC_CES_ISO_2022_JP_2;
313
548
  if (*p == '3')
314
7
     return WC_CES_ISO_2022_JP_3;
315
541
  return WC_CES_ISO_2022_JP;
316
1.44k
    case 's':
317
1.44k
  return WC_CES_SHIFT_JIS;
318
486
    case 'g':
319
486
  return WC_CES_EUC_CN;
320
430
    case 'b':
321
430
  return WC_CES_BIG5;
322
1.30k
    case 'h':
323
1.30k
  if (*(p+1) == 'k')
324
433
      return WC_CES_HKSCS;
325
867
  return WC_CES_HZ_GB_2312;
326
547
    case 'k':
327
547
  if (*(p+1) == 'o')
328
32
      return WC_CES_KOI8_R;
329
515
  return WC_CES_ISO_2022_KR;
330
303
    case 'l':
331
303
  n = atoi(p + 1);
332
303
  if (n >= 1 && n <= 16 && n != 12)
333
28
      return (WC_CES_E_ISO_8859 | n);
334
275
  return WC_CES_ISO_8859_1;
335
646
    case 't':
336
646
  if (*(p+1) == 'c')
337
148
      return WC_CES_TCVN_5712;
338
498
  return WC_CES_TIS_620;
339
299
    case 'n':
340
299
  return WC_CES_NEXTSTEP;
341
540
    case 'v':
342
540
  if (*(p+1) == 'p')
343
47
      return WC_CES_VPS;
344
493
  return WC_CES_VISCII_11;
345
0
#ifdef USE_UNICODE
346
2.57k
    case 'u':
347
2.57k
  if (*(p+1) == '7')
348
808
      return WC_CES_UTF_7;
349
1.76k
  return WC_CES_UTF_8;
350
0
#endif
351
92
    case 'a':
352
92
  return WC_CES_US_ASCII;
353
482
    case 'c':
354
482
  return WC_CES_ISO_2022_CN;
355
450
    case 'w':
356
450
  p++;
357
450
  if (*p >= '1' &&  *p <= '9')
358
386
      return wc_codepage(atoi(p));
359
64
  break;
360
142
    case 'r':
361
142
  return WC_CES_RAW;
362
12.9k
    }
363
408
    return 0;
364
12.9k
}
365
366
wc_ces
367
wc_locale_to_ces(char *locale)
368
0
{
369
0
    char *p = locale;
370
0
    char buf[8];
371
0
    int n;
372
373
0
    if (*p == 'C' && *(p+1) == '\0')
374
0
  return WC_CES_US_ASCII;
375
0
#ifdef HAVE_LANGINFO_CODESET
376
0
    {
377
0
  char *cs = nl_langinfo(CODESET);
378
0
  if (cs && strcmp(cs, "US-ASCII"))
379
0
      return wc_charset_to_ces(cs);
380
0
    }
381
0
#endif
382
0
    for (n = 0; *p && *p != '.' && n < 7; p++) {
383
0
  if ((unsigned char)*p > 0x20)
384
0
      buf[n++] = *p | 32; /* tolower(*p); */
385
0
    }
386
0
    buf[n] = 0;
387
0
    if (*p == '.') {
388
0
  p++;
389
0
  if (! strcasecmp(p, "euc")) {
390
0
      switch (buf[0]) {
391
0
      case 'j':
392
0
    WcLocale = WC_LOCALE_JA_JP;
393
0
    break;
394
0
      case 'k':
395
0
    WcLocale = WC_LOCALE_KO_KR;
396
0
    break;
397
0
      case 'z':
398
0
          if (!strcmp(buf, "zh_tw"))
399
0
        WcLocale = WC_LOCALE_ZH_TW;
400
0
          else if (!strcmp(buf, "zh_hk"))
401
0
        WcLocale = WC_LOCALE_ZH_HK;
402
0
    else
403
0
        WcLocale = WC_LOCALE_ZH_CN;
404
0
    break;
405
0
      default:
406
0
    WcLocale = 0;
407
0
    break;
408
0
      }
409
0
  }
410
0
  return wc_charset_to_ces(p);
411
0
    }
412
413
0
    if (!strcmp(buf, "japanese"))
414
0
  return WC_CES_SHIFT_JIS;
415
0
    if (!strcmp(buf, "zh_tw") ||
416
0
  !strcmp(buf, "zh_hk"))
417
0
  return WC_CES_BIG5;
418
0
    for (n = 0; lang_ces_table[n].lang; n++) {
419
0
  if (!strncmp(buf, lang_ces_table[n].lang, 2))
420
0
      return lang_ces_table[n].ces;
421
0
    }
422
0
    return WC_CES_ISO_8859_1;
423
0
}
424
425
char *
426
wc_ces_to_charset(wc_ces ces)
427
0
{
428
0
    if (ces == WC_CES_WTF)
429
0
  return "WTF";
430
0
    return WcCesInfo[WC_CES_INDEX(ces)].name;
431
0
}
432
433
char *
434
wc_ces_to_charset_desc(wc_ces ces)
435
0
{
436
0
    if (ces == WC_CES_WTF)
437
0
  return "W3M Transfer Format";
438
0
    return WcCesInfo[WC_CES_INDEX(ces)].desc;
439
0
}
440
441
wc_ces
442
wc_guess_8bit_charset(wc_ces orig)
443
0
{
444
0
    switch (orig) {
445
0
    case WC_CES_ISO_2022_JP:
446
0
    case WC_CES_ISO_2022_JP_2:
447
0
    case WC_CES_ISO_2022_JP_3:
448
0
  return WC_CES_EUC_JP;
449
0
    case WC_CES_ISO_2022_KR:
450
0
  return WC_CES_EUC_KR;
451
0
    case WC_CES_ISO_2022_CN:
452
0
    case WC_CES_HZ_GB_2312:
453
0
  return WC_CES_EUC_CN;
454
0
    case WC_CES_US_ASCII:
455
0
  return WC_CES_ISO_8859_1;
456
0
    }
457
0
    return orig;
458
0
}
459
460
wc_bool
461
wc_check_ces(wc_ces ces)
462
2
{
463
2
    size_t i = WC_CES_INDEX(ces);
464
465
2
    return (i <= WC_CES_END && WcCesInfo[i].id == ces);
466
2
}
467
468
static int
469
wc_ces_list_cmp(const void *a, const void *b)
470
0
{
471
0
    return strcasecmp(((wc_ces_list *)a)->desc, ((wc_ces_list *)b)->desc);
472
0
}
473
474
/* Cached result of wc_get_ces_list(); NULL until the first call builds it. */
static wc_ces_list *list = NULL;
475
476
wc_ces_list *
477
wc_get_ces_list(void)
478
0
{
479
0
    wc_ces_info *info;
480
0
    size_t n;
481
482
0
    if (list)
483
0
  return list;
484
0
    for (info = WcCesInfo, n = 0; info->id; info++) {
485
0
  if (info->name != NULL)
486
0
      n++;
487
0
    }
488
0
    list = New_N(wc_ces_list, n + 1);
489
0
    for (info = WcCesInfo, n = 0; info->id; info++) {
490
0
  if (info->name != NULL) {
491
0
      list[n].id = info->id;
492
0
      list[n].name = info->name;
493
0
      list[n].desc = info->desc;
494
0
      n++;
495
0
  }
496
0
    }
497
0
    list[n].id = 0;
498
0
    list[n].name = NULL;
499
    list[n].desc = NULL;
500
0
    qsort(list, n, sizeof(wc_ces_list), wc_ces_list_cmp);
501
0
    return list;
502
0
}