Coverage Report

Created: 2025-12-13 06:16

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/w3m/libwc/charset.c
Line
Count
Source
1
2
#include <stdlib.h>
3
#include <ctype.h>
4
#include "../alloc.h"
5
6
#include "wc.h"
7
8
#ifdef HAVE_LANGINFO_CODESET
9
#include <langinfo.h>
10
#endif
11
12
/* Locale hint: assigned by wc_locale_to_ces() when the locale's codeset is
 * "euc", and read by wc_charset_to_ces() to pick the EUC variant for a bare
 * "euc" charset name.  Zero means no hint. */
wc_locale WcLocale = 0;
13
14
static struct {
15
  char *lang;
16
  wc_ces ces;
17
} lang_ces_table[] = {
18
  { "cs", WC_CES_ISO_8859_2 },  /* cs_CZ */
19
  { "el", WC_CES_ISO_8859_7 },  /* el_GR */
20
  { "iw", WC_CES_ISO_8859_8 },  /* iw_IL */
21
  { "ja", WC_CES_EUC_JP },  /* ja_JP */
22
  { "ko", WC_CES_EUC_KR },  /* ko_KR */
23
  { "hu", WC_CES_ISO_8859_2 },  /* hu_HU */
24
  { "pl", WC_CES_ISO_8859_2 },  /* pl_PL */
25
  { "ro", WC_CES_ISO_8859_2 },  /* ro_RO */
26
  { "ru", WC_CES_ISO_8859_5 },  /* ru_SU */
27
  { "sk", WC_CES_ISO_8859_2 },  /* sk_SK */
28
  { "sl", WC_CES_ISO_8859_2 },  /* sl_CS */
29
  { "tr", WC_CES_ISO_8859_9 },  /* tr_TR */
30
  { "zh", WC_CES_EUC_CN },  /* zh_CN */
31
  { NULL, 0 }
32
};
33
34
static wc_ces
35
wc_codepage(int n)
36
418
{
37
418
  switch (n) {
38
4
  case 437: return WC_CES_CP437;
39
4
  case 737: return WC_CES_CP737;
40
3
  case 775: return WC_CES_CP775;
41
3
  case 850: return WC_CES_CP850;
42
3
  case 852: return WC_CES_CP852;
43
3
  case 855: return WC_CES_CP855;
44
4
  case 856: return WC_CES_CP856;
45
3
  case 857: return WC_CES_CP857;
46
4
  case 860: return WC_CES_CP860;
47
4
  case 861: return WC_CES_CP861;
48
4
  case 862: return WC_CES_CP862;
49
5
  case 863: return WC_CES_CP863;
50
13
  case 864: return WC_CES_CP864;
51
4
  case 865: return WC_CES_CP865;
52
4
  case 866: return WC_CES_CP866;
53
5
  case 869: return WC_CES_CP869;
54
18
  case 874: return WC_CES_CP874;
55
3
  case 932: return WC_CES_CP932;   /* CP932 = Shift_JIS */
56
4
  case 936: return WC_CES_CP936;   /* CP936 = GBK > EUC_CN */
57
3
  case 943: return WC_CES_CP943;   /* CP943 = Shift_JIS */
58
3
  case 949: return WC_CES_CP949;   /* CP949 = UHC > EUC_KR */
59
3
  case 950: return WC_CES_CP950;   /* CP950 = Big5 */
60
3
  case 1006: return WC_CES_CP1006;
61
4
  case 1250: return WC_CES_CP1250;
62
4
  case 1251: return WC_CES_CP1251;
63
5
  case 1252: return WC_CES_CP1252;
64
3
  case 1253: return WC_CES_CP1253;
65
5
  case 1254: return WC_CES_CP1254;
66
13
  case 1255: return WC_CES_CP1255;
67
12
  case 1256: return WC_CES_CP1256;
68
3
  case 1257: return WC_CES_CP1257;
69
109
  case 1258: return WC_CES_CP1258;
70
418
  }
71
153
  return 0;
72
418
}
73
74
wc_ces
75
wc_guess_charset(char *charset, wc_ces orig)
76
0
{
77
0
    wc_ces guess;
78
79
0
    if (charset == NULL || *charset == '\0')
80
0
  return orig;
81
0
    guess = wc_charset_to_ces(charset);
82
0
    return guess ? guess : orig;
83
0
}
84
85
wc_ces
86
wc_guess_charset_short(char *charset, wc_ces orig)
87
24.9k
{
88
24.9k
    wc_ces guess;
89
90
24.9k
    if (charset == NULL || *charset == '\0')
91
7.64k
  return orig;
92
17.3k
    guess = wc_charset_short_to_ces(charset);
93
17.3k
    return guess ? guess : orig;
94
24.9k
}
95
96
wc_ces
97
wc_guess_locale_charset(char *locale, wc_ces orig)
98
0
{
99
0
    wc_ces guess;
100
101
0
    if (locale == NULL || *locale == '\0')
102
0
  return orig;
103
0
    guess = wc_locale_to_ces(locale);
104
0
    return guess ? guess : orig;
105
0
}
106
107
wc_ces
108
wc_charset_to_ces(char *charset)
109
17.3k
{
110
17.3k
    char *p = charset;
111
17.3k
    char buf[16];
112
17.3k
    int n;
113
114
17.3k
    if ((*p == 'x' || *p == 'X') && *(p+1) == '-')
115
6
  p += 2;
116
67.2k
    for (n = 0; *p && n < 15; p++) {
117
49.8k
  if ((unsigned char)*p > 0x20 && *p != '_' && *p != '-')
118
47.5k
      buf[n++] = *p | 32; /* tolower(*p); */
119
49.8k
    }
120
17.3k
    buf[n] = 0;
121
17.3k
    p = buf;
122
17.3k
    switch (*p) {
123
1.56k
    case 'e':
124
1.56k
  if (! strncmp(p, "euc", 3)) {
125
21
      p += 3;
126
21
      switch (*p) {
127
3
      case 'j': return WC_CES_EUC_JP;
128
3
      case 'c': return WC_CES_EUC_CN;
129
6
      case 't': return WC_CES_EUC_TW;
130
3
      case 'k': return WC_CES_EUC_KR;
131
21
      }
132
6
      switch (WcLocale) {
133
0
      case WC_LOCALE_JA_JP: return WC_CES_EUC_JP;
134
0
      case WC_LOCALE_ZH_CN: return WC_CES_EUC_CN;
135
0
      case WC_LOCALE_ZH_TW: return WC_CES_EUC_TW;
136
0
      case WC_LOCALE_ZH_HK: return WC_CES_EUC_CN;
137
0
      case WC_LOCALE_KO_KR: return WC_CES_EUC_KR;
138
6
      }
139
6
      return WC_CES_EUC_JP;
140
6
        }
141
1.54k
  break;
142
1.54k
    case 'i':
143
219
  if (! strncmp(p, "iso2022", 7)) {
144
33
      p += 7;
145
33
      switch (*p) {
146
24
      case 'j':
147
24
    if (! strncmp(p, "jp2", 3))
148
3
        return WC_CES_ISO_2022_JP_2;
149
21
    if (! strncmp(p, "jp3", 3))
150
3
        return WC_CES_ISO_2022_JP_3;
151
18
    return WC_CES_ISO_2022_JP;
152
3
      case 'c': return WC_CES_ISO_2022_CN;
153
3
      case 'k': return WC_CES_ISO_2022_KR;
154
33
      }
155
3
      return WC_CES_ISO_2022_JP;
156
186
  } else if (! strncmp(p, "iso8859", 7)) {
157
33
      n = atoi(p + 7);
158
33
      if (n >= 1 && n <= 16 && n != 12)
159
6
    return (WC_CES_E_ISO_8859 | n);
160
27
      return WC_CES_ISO_8859_1;
161
153
  } else if (! strncmp(p, "ibm", 3)) {
162
65
      p += 3;
163
65
      if (*p >= '1' && *p <= '9')
164
7
        return wc_codepage(atoi(p));
165
58
      return wc_charset_to_ces(p);
166
65
  }
167
88
  break;
168
1.37k
    case 'j':
169
1.37k
  if (! strncmp(p, "johab", 5))
170
4
      return WC_CES_JOHAB;
171
1.36k
  if (! strncmp(p, "jis", 3))
172
6
      return WC_CES_ISO_2022_JP;
173
1.36k
  break;
174
1.93k
    case 's':
175
1.93k
  if (! strncmp(p, "shiftjisx0213", 13) ||
176
1.92k
      ! strncmp(p, "sjisx0213", 9))
177
305
      return WC_CES_SHIFT_JISX0213;
178
1.63k
  if (! strncmp(p, "shiftjis", 8) ||
179
1.58k
      ! strncmp(p, "sjis", 4))
180
169
      return WC_CES_SHIFT_JIS;
181
1.46k
  break;
182
1.46k
    case 'p':
183
24
  if (! strncmp(p, "pck", 3))
184
3
      return WC_CES_SHIFT_JIS;
185
21
  break;
186
3.10k
    case 'g':
187
3.10k
  if (! strncmp(p, "gb18030", 7) ||
188
3.08k
      ! strncmp(p, "gbk2k", 5))
189
2.01k
      return WC_CES_GB18030;
190
1.09k
  if (! strncmp(p, "gbk", 3))
191
523
      return WC_CES_GBK;
192
567
  if (! strncmp(p, "gb2312", 6))
193
3
      return WC_CES_EUC_CN;
194
564
  break;
195
564
    case 'b':
196
484
  if (! strncmp(p, "big5hkscs", 9))
197
3
      return WC_CES_HKSCS;
198
481
  if (! strncmp(p, "big5", 4))
199
34
      return WC_CES_BIG5;
200
447
  break;
201
1.37k
    case 'h':
202
1.37k
  if (! strncmp(p, "hz", 2))
203
4
      return WC_CES_HZ_GB_2312;
204
1.37k
  if (! strncmp(p, "hkscs", 5))
205
6
      return WC_CES_HKSCS;
206
1.36k
  break;
207
1.36k
    case 'k':
208
587
  if (! strncmp(p, "koi8r", 5))
209
3
      return WC_CES_KOI8_R;
210
584
  if (! strncmp(p, "koi8u", 5))
211
5
      return WC_CES_KOI8_U;
212
579
  if (! strncmp(p, "ksx1001", 7))
213
3
      return WC_CES_EUC_KR;
214
576
  if (! strncmp(p, "ksc5601", 7))
215
3
      return WC_CES_EUC_KR;
216
573
  break;
217
725
    case 't':
218
725
  if (! strncmp(p, "tis620", 6))
219
3
      return WC_CES_TIS_620;
220
722
  if (! strncmp(p, "tcvn", 4))
221
41
      return WC_CES_TCVN_5712;
222
681
  break;
223
681
    case 'n':
224
295
  if (! strncmp(p, "next", 4))
225
3
      return WC_CES_NEXTSTEP;
226
292
  break;
227
558
    case 'v':
228
558
  if (! strncmp(p, "viet", 4)) {
229
21
      p += 4;
230
21
      if (! strncmp(p, "tcvn", 4))
231
3
    return WC_CES_TCVN_5712;
232
21
  }
233
555
  if (! strncmp(p, "viscii", 6))
234
3
      return WC_CES_VISCII_11;
235
552
  if (! strncmp(p, "vps", 3))
236
3
      return WC_CES_VPS;
237
549
  break;
238
3.39k
    case 'u':
239
3.39k
#ifdef USE_UNICODE
240
3.39k
  if (! strncmp(p, "utf8", 4))
241
9
      return WC_CES_UTF_8;
242
3.39k
  if (! strncmp(p, "utf7", 4))
243
162
      return WC_CES_UTF_7;
244
3.22k
#endif
245
3.22k
  if (! strncmp(p, "uhc", 3))
246
484
      return WC_CES_UHC;
247
2.74k
  if (! strncmp(p, "ujis", 4))
248
5
      return WC_CES_EUC_JP;
249
2.73k
  if (! strncmp(p, "usascii", 7))
250
3
      return WC_CES_US_ASCII;
251
2.73k
  break;
252
2.73k
    case 'a':
253
96
  if (! strncmp(p, "ascii", 5))
254
3
      return WC_CES_US_ASCII;
255
93
  break;
256
506
    case 'c':
257
506
  if (! strncmp(p, "cngb", 4))
258
3
      return WC_CES_EUC_CN;
259
503
  if (*(p+1) != 'p')
260
484
      break;
261
19
  p += 2;
262
19
  if (*p >= '1' &&  *p <= '9')
263
4
      return wc_codepage(atoi(p));
264
15
  break;
265
456
    case 'w':
266
456
  if (strncmp(p, "windows", 7))
267
422
      break;
268
34
  p += 7;
269
34
  if (! strncmp(p, "31j", 3))
270
3
      return WC_CES_CP932;
271
31
  if (*p >= '1' &&  *p <= '9')
272
16
      return wc_codepage(atoi(p));
273
15
  break;
274
17.3k
    }
275
13.3k
    return 0;
276
17.3k
}
277
278
wc_ces
279
wc_charset_short_to_ces(char *charset)
280
17.3k
{
281
17.3k
    char *p = charset;
282
17.3k
    char buf[16];
283
17.3k
    wc_ces ces;
284
17.3k
    int n;
285
286
17.3k
    ces = wc_charset_to_ces(charset);
287
17.3k
    if (ces)
288
3.90k
  return ces;
289
290
41.3k
    for (n = 0; *p && n < 15; p++) {
291
27.9k
  if ((unsigned char)*p > 0x20 && *p != '_' && *p != '-')
292
26.1k
      buf[n++] = *p | 32; /* tolower(*p); */
293
27.9k
    }
294
13.4k
    buf[n] = 0;
295
13.4k
    p = buf;
296
13.4k
    switch (*p) {
297
1.54k
    case 'e':
298
1.54k
  switch (*(p+1)) {
299
0
  case 'j': return WC_CES_EUC_JP;
300
3
  case 'c': return WC_CES_EUC_CN;
301
456
  case 't': return WC_CES_EUC_TW;
302
17
  case 'k': return WC_CES_EUC_KR;
303
1.54k
  }
304
1.06k
  return WC_CES_EUC_JP;
305
1.36k
    case 'j':
306
1.36k
  p++;
307
1.36k
  if (*p == 'o')
308
747
      return WC_CES_JOHAB;
309
615
  if (*p == 'p')
310
4
     p++;
311
615
  if (*p == '2')
312
131
     return WC_CES_ISO_2022_JP_2;
313
484
  if (*p == '3')
314
5
     return WC_CES_ISO_2022_JP_3;
315
479
  return WC_CES_ISO_2022_JP;
316
1.46k
    case 's':
317
1.46k
  return WC_CES_SHIFT_JIS;
318
564
    case 'g':
319
564
  return WC_CES_EUC_CN;
320
446
    case 'b':
321
446
  return WC_CES_BIG5;
322
1.36k
    case 'h':
323
1.36k
  if (*(p+1) == 'k')
324
456
      return WC_CES_HKSCS;
325
912
  return WC_CES_HZ_GB_2312;
326
573
    case 'k':
327
573
  if (*(p+1) == 'o')
328
35
      return WC_CES_KOI8_R;
329
538
  return WC_CES_ISO_2022_KR;
330
342
    case 'l':
331
342
  n = atoi(p + 1);
332
342
  if (n >= 1 && n <= 16 && n != 12)
333
35
      return (WC_CES_E_ISO_8859 | n);
334
307
  return WC_CES_ISO_8859_1;
335
681
    case 't':
336
681
  if (*(p+1) == 'c')
337
142
      return WC_CES_TCVN_5712;
338
539
  return WC_CES_TIS_620;
339
292
    case 'n':
340
292
  return WC_CES_NEXTSTEP;
341
549
    case 'v':
342
549
  if (*(p+1) == 'p')
343
44
      return WC_CES_VPS;
344
505
  return WC_CES_VISCII_11;
345
0
#ifdef USE_UNICODE
346
2.73k
    case 'u':
347
2.73k
  if (*(p+1) == '7')
348
875
      return WC_CES_UTF_7;
349
1.85k
  return WC_CES_UTF_8;
350
0
#endif
351
93
    case 'a':
352
93
  return WC_CES_US_ASCII;
353
503
    case 'c':
354
503
  return WC_CES_ISO_2022_CN;
355
453
    case 'w':
356
453
  p++;
357
453
  if (*p >= '1' &&  *p <= '9')
358
391
      return wc_codepage(atoi(p));
359
62
  break;
360
149
    case 'r':
361
149
  return WC_CES_RAW;
362
13.4k
    }
363
355
    return 0;
364
13.4k
}
365
366
wc_ces
367
wc_locale_to_ces(char *locale)
368
0
{
369
0
    char *p = locale;
370
0
    char buf[8];
371
0
    int n;
372
373
0
    if (*p == 'C' && *(p+1) == '\0')
374
0
  return WC_CES_US_ASCII;
375
0
#ifdef HAVE_LANGINFO_CODESET
376
0
    {
377
0
  char *cs = nl_langinfo(CODESET);
378
0
  if (cs && strcmp(cs, "US-ASCII"))
379
0
      return wc_charset_to_ces(cs);
380
0
    }
381
0
#endif
382
0
    for (n = 0; *p && *p != '.' && n < 7; p++) {
383
0
  if ((unsigned char)*p > 0x20)
384
0
      buf[n++] = *p | 32; /* tolower(*p); */
385
0
    }
386
0
    buf[n] = 0;
387
0
    if (*p == '.') {
388
0
  p++;
389
0
  if (! strcasecmp(p, "euc")) {
390
0
      switch (buf[0]) {
391
0
      case 'j':
392
0
    WcLocale = WC_LOCALE_JA_JP;
393
0
    break;
394
0
      case 'k':
395
0
    WcLocale = WC_LOCALE_KO_KR;
396
0
    break;
397
0
      case 'z':
398
0
          if (!strcmp(buf, "zh_tw"))
399
0
        WcLocale = WC_LOCALE_ZH_TW;
400
0
          else if (!strcmp(buf, "zh_hk"))
401
0
        WcLocale = WC_LOCALE_ZH_HK;
402
0
    else
403
0
        WcLocale = WC_LOCALE_ZH_CN;
404
0
    break;
405
0
      default:
406
0
    WcLocale = 0;
407
0
    break;
408
0
      }
409
0
  }
410
0
  return wc_charset_to_ces(p);
411
0
    }
412
413
0
    if (!strcmp(buf, "japanese"))
414
0
  return WC_CES_SHIFT_JIS;
415
0
    if (!strcmp(buf, "zh_tw") ||
416
0
  !strcmp(buf, "zh_hk"))
417
0
  return WC_CES_BIG5;
418
0
    for (n = 0; lang_ces_table[n].lang; n++) {
419
0
  if (!strncmp(buf, lang_ces_table[n].lang, 2))
420
0
      return lang_ces_table[n].ces;
421
0
    }
422
0
    return WC_CES_ISO_8859_1;
423
0
}
424
425
char *
426
wc_ces_to_charset(wc_ces ces)
427
0
{
428
0
    if (ces == WC_CES_WTF)
429
0
  return "WTF";
430
0
    return WcCesInfo[WC_CES_INDEX(ces)].name;
431
0
}
432
433
char *
434
wc_ces_to_charset_desc(wc_ces ces)
435
0
{
436
0
    if (ces == WC_CES_WTF)
437
0
  return "W3M Transfer Format";
438
0
    return WcCesInfo[WC_CES_INDEX(ces)].desc;
439
0
}
440
441
wc_ces
442
wc_guess_8bit_charset(wc_ces orig)
443
0
{
444
0
    switch (orig) {
445
0
    case WC_CES_ISO_2022_JP:
446
0
    case WC_CES_ISO_2022_JP_2:
447
0
    case WC_CES_ISO_2022_JP_3:
448
0
  return WC_CES_EUC_JP;
449
0
    case WC_CES_ISO_2022_KR:
450
0
  return WC_CES_EUC_KR;
451
0
    case WC_CES_ISO_2022_CN:
452
0
    case WC_CES_HZ_GB_2312:
453
0
  return WC_CES_EUC_CN;
454
0
    case WC_CES_US_ASCII:
455
0
  return WC_CES_ISO_8859_1;
456
0
    }
457
0
    return orig;
458
0
}
459
460
wc_bool
461
wc_check_ces(wc_ces ces)
462
2
{
463
2
    size_t i = WC_CES_INDEX(ces);
464
465
2
    return (i <= WC_CES_END && WcCesInfo[i].id == ces);
466
2
}
467
468
static int
469
wc_ces_list_cmp(const void *a, const void *b)
470
0
{
471
0
    return strcasecmp(((wc_ces_list *)a)->desc, ((wc_ces_list *)b)->desc);
472
0
}
473
474
static wc_ces_list *list = NULL;
475
476
wc_ces_list *
477
wc_get_ces_list(void)
478
0
{
479
0
    wc_ces_info *info;
480
0
    size_t n;
481
482
0
    if (list)
483
0
  return list;
484
0
    for (info = WcCesInfo, n = 0; info->id; info++) {
485
0
  if (info->name != NULL)
486
0
      n++;
487
0
    }
488
0
    list = New_N(wc_ces_list, n + 1);
489
0
    for (info = WcCesInfo, n = 0; info->id; info++) {
490
0
  if (info->name != NULL) {
491
0
      list[n].id = info->id;
492
0
      list[n].name = info->name;
493
0
      list[n].desc = info->desc;
494
0
      n++;
495
0
  }
496
0
    }
497
0
    list[n].id = 0;
498
0
    list[n].name = NULL;
499
    list[n].desc = NULL;
500
0
    qsort(list, n, sizeof(wc_ces_list), wc_ces_list_cmp);
501
0
    return list;
502
0
}