Coverage Report

Created: 2024-10-12 16:05

/src/w3m/libwc/charset.c
Line
Count
Source (jump to first uncovered line)
1
2
#include <stdlib.h>
3
#include <ctype.h>
4
#include "../alloc.h"
5
6
#include "wc.h"
7
8
#ifdef HAVE_LANGINFO_CODESET
9
#include <langinfo.h>
10
#endif
11
12
/*
 * Locale hint used to disambiguate a bare "euc" charset name.
 * Within this file it is written only by wc_locale_to_ces(), when the
 * codeset part of a locale is exactly "euc", and read by
 * wc_charset_to_ces() when "euc" is not followed by j/c/t/k.
 */
wc_locale WcLocale = 0;
13
14
static struct {
15
  char *lang;
16
  wc_ces ces;
17
} lang_ces_table[] = {
18
  { "cs", WC_CES_ISO_8859_2 },  /* cs_CZ */
19
  { "el", WC_CES_ISO_8859_7 },  /* el_GR */
20
  { "iw", WC_CES_ISO_8859_8 },  /* iw_IL */
21
  { "ja", WC_CES_EUC_JP },  /* ja_JP */
22
  { "ko", WC_CES_EUC_KR },  /* ko_KR */
23
  { "hu", WC_CES_ISO_8859_2 },  /* hu_HU */
24
  { "pl", WC_CES_ISO_8859_2 },  /* pl_PL */
25
  { "ro", WC_CES_ISO_8859_2 },  /* ro_RO */
26
  { "ru", WC_CES_ISO_8859_5 },  /* ru_SU */
27
  { "sk", WC_CES_ISO_8859_2 },  /* sk_SK */
28
  { "sl", WC_CES_ISO_8859_2 },  /* sl_CS */
29
  { "tr", WC_CES_ISO_8859_9 },  /* tr_TR */
30
  { "zh", WC_CES_EUC_CN },  /* zh_CN */
31
  { NULL, 0 }
32
};
33
34
static wc_ces
35
wc_codepage(int n)
36
372
{
37
372
  switch (n) {
38
5
  case 437: return WC_CES_CP437;
39
3
  case 737: return WC_CES_CP737;
40
4
  case 775: return WC_CES_CP775;
41
3
  case 850: return WC_CES_CP850;
42
4
  case 852: return WC_CES_CP852;
43
3
  case 855: return WC_CES_CP855;
44
4
  case 856: return WC_CES_CP856;
45
3
  case 857: return WC_CES_CP857;
46
4
  case 860: return WC_CES_CP860;
47
3
  case 861: return WC_CES_CP861;
48
5
  case 862: return WC_CES_CP862;
49
4
  case 863: return WC_CES_CP863;
50
12
  case 864: return WC_CES_CP864;
51
4
  case 865: return WC_CES_CP865;
52
4
  case 866: return WC_CES_CP866;
53
3
  case 869: return WC_CES_CP869;
54
17
  case 874: return WC_CES_CP874;
55
3
  case 932: return WC_CES_CP932;   /* CP932 = Shift_JIS */
56
3
  case 936: return WC_CES_CP936;   /* CP936 = GBK > EUC_CN */
57
4
  case 943: return WC_CES_CP943;   /* CP943 = Shift_JIS */
58
3
  case 949: return WC_CES_CP949;   /* CP949 = UHC > EUC_KR */
59
3
  case 950: return WC_CES_CP950;   /* CP950 = Big5 */
60
3
  case 1006: return WC_CES_CP1006;
61
3
  case 1250: return WC_CES_CP1250;
62
5
  case 1251: return WC_CES_CP1251;
63
6
  case 1252: return WC_CES_CP1252;
64
4
  case 1253: return WC_CES_CP1253;
65
4
  case 1254: return WC_CES_CP1254;
66
14
  case 1255: return WC_CES_CP1255;
67
14
  case 1256: return WC_CES_CP1256;
68
4
  case 1257: return WC_CES_CP1257;
69
81
  case 1258: return WC_CES_CP1258;
70
372
  }
71
133
  return 0;
72
372
}
73
74
wc_ces
75
wc_guess_charset(char *charset, wc_ces orig)
76
0
{
77
0
    wc_ces guess;
78
79
0
    if (charset == NULL || *charset == '\0')
80
0
  return orig;
81
0
    guess = wc_charset_to_ces(charset);
82
0
    return guess ? guess : orig;
83
0
}
84
85
wc_ces
86
wc_guess_charset_short(char *charset, wc_ces orig)
87
21.3k
{
88
21.3k
    wc_ces guess;
89
90
21.3k
    if (charset == NULL || *charset == '\0')
91
5.86k
  return orig;
92
15.4k
    guess = wc_charset_short_to_ces(charset);
93
15.4k
    return guess ? guess : orig;
94
21.3k
}
95
96
wc_ces
97
wc_guess_locale_charset(char *locale, wc_ces orig)
98
0
{
99
0
    wc_ces guess;
100
101
0
    if (locale == NULL || *locale == '\0')
102
0
  return orig;
103
0
    guess = wc_locale_to_ces(locale);
104
0
    return guess ? guess : orig;
105
0
}
106
107
wc_ces
108
wc_charset_to_ces(char *charset)
109
15.4k
{
110
15.4k
    char *p = charset;
111
15.4k
    char buf[16];
112
15.4k
    int n;
113
114
15.4k
    if ((*p == 'x' || *p == 'X') && *(p+1) == '-')
115
4
  p += 2;
116
67.1k
    for (n = 0; *p && n < 15; p++) {
117
51.6k
  if ((unsigned char)*p > 0x20 && *p != '_' && *p != '-')
118
48.9k
      buf[n++] = *p | 32; /* tolower(*p); */
119
51.6k
    }
120
15.4k
    buf[n] = 0;
121
15.4k
    p = buf;
122
15.4k
    switch (*p) {
123
1.22k
    case 'e':
124
1.22k
  if (! strncmp(p, "euc", 3)) {
125
16
      p += 3;
126
16
      switch (*p) {
127
3
      case 'j': return WC_CES_EUC_JP;
128
3
      case 'c': return WC_CES_EUC_CN;
129
4
      case 't': return WC_CES_EUC_TW;
130
3
      case 'k': return WC_CES_EUC_KR;
131
16
      }
132
3
      switch (WcLocale) {
133
0
      case WC_LOCALE_JA_JP: return WC_CES_EUC_JP;
134
0
      case WC_LOCALE_ZH_CN: return WC_CES_EUC_CN;
135
0
      case WC_LOCALE_ZH_TW: return WC_CES_EUC_TW;
136
0
      case WC_LOCALE_ZH_HK: return WC_CES_EUC_CN;
137
0
      case WC_LOCALE_KO_KR: return WC_CES_EUC_KR;
138
3
      }
139
3
      return WC_CES_EUC_JP;
140
3
        }
141
1.21k
  break;
142
1.21k
    case 'i':
143
235
  if (! strncmp(p, "iso2022", 7)) {
144
52
      p += 7;
145
52
      switch (*p) {
146
41
      case 'j':
147
41
    if (! strncmp(p, "jp2", 3))
148
3
        return WC_CES_ISO_2022_JP_2;
149
38
    if (! strncmp(p, "jp3", 3))
150
2
        return WC_CES_ISO_2022_JP_3;
151
36
    return WC_CES_ISO_2022_JP;
152
3
      case 'c': return WC_CES_ISO_2022_CN;
153
3
      case 'k': return WC_CES_ISO_2022_KR;
154
52
      }
155
5
      return WC_CES_ISO_2022_JP;
156
183
  } else if (! strncmp(p, "iso8859", 7)) {
157
31
      n = atoi(p + 7);
158
31
      if (n >= 1 && n <= 16 && n != 12)
159
6
    return (WC_CES_E_ISO_8859 | n);
160
25
      return WC_CES_ISO_8859_1;
161
152
  } else if (! strncmp(p, "ibm", 3)) {
162
51
      p += 3;
163
51
      if (*p >= '1' && *p <= '9')
164
4
        return wc_codepage(atoi(p));
165
47
      return wc_charset_to_ces(p);
166
51
  }
167
101
  break;
168
1.36k
    case 'j':
169
1.36k
  if (! strncmp(p, "johab", 5))
170
7
      return WC_CES_JOHAB;
171
1.36k
  if (! strncmp(p, "jis", 3))
172
3
      return WC_CES_ISO_2022_JP;
173
1.35k
  break;
174
1.60k
    case 's':
175
1.60k
  if (! strncmp(p, "shiftjisx0213", 13) ||
176
1.60k
      ! strncmp(p, "sjisx0213", 9))
177
315
      return WC_CES_SHIFT_JISX0213;
178
1.29k
  if (! strncmp(p, "shiftjis", 8) ||
179
1.29k
      ! strncmp(p, "sjis", 4))
180
159
      return WC_CES_SHIFT_JIS;
181
1.13k
  break;
182
1.13k
    case 'p':
183
28
  if (! strncmp(p, "pck", 3))
184
3
      return WC_CES_SHIFT_JIS;
185
25
  break;
186
2.94k
    case 'g':
187
2.94k
  if (! strncmp(p, "gb18030", 7) ||
188
2.94k
      ! strncmp(p, "gbk2k", 5))
189
1.95k
      return WC_CES_GB18030;
190
990
  if (! strncmp(p, "gbk", 3))
191
523
      return WC_CES_GBK;
192
467
  if (! strncmp(p, "gb2312", 6))
193
7
      return WC_CES_EUC_CN;
194
460
  break;
195
460
    case 'b':
196
457
  if (! strncmp(p, "big5hkscs", 9))
197
5
      return WC_CES_HKSCS;
198
452
  if (! strncmp(p, "big5", 4))
199
38
      return WC_CES_BIG5;
200
414
  break;
201
1.17k
    case 'h':
202
1.17k
  if (! strncmp(p, "hz", 2))
203
6
      return WC_CES_HZ_GB_2312;
204
1.16k
  if (! strncmp(p, "hkscs", 5))
205
8
      return WC_CES_HKSCS;
206
1.15k
  break;
207
1.15k
    case 'k':
208
538
  if (! strncmp(p, "koi8r", 5))
209
3
      return WC_CES_KOI8_R;
210
535
  if (! strncmp(p, "koi8u", 5))
211
3
      return WC_CES_KOI8_U;
212
532
  if (! strncmp(p, "ksx1001", 7))
213
2
      return WC_CES_EUC_KR;
214
530
  if (! strncmp(p, "ksc5601", 7))
215
3
      return WC_CES_EUC_KR;
216
527
  break;
217
613
    case 't':
218
613
  if (! strncmp(p, "tis620", 6))
219
4
      return WC_CES_TIS_620;
220
609
  if (! strncmp(p, "tcvn", 4))
221
16
      return WC_CES_TCVN_5712;
222
593
  break;
223
593
    case 'n':
224
198
  if (! strncmp(p, "next", 4))
225
3
      return WC_CES_NEXTSTEP;
226
195
  break;
227
475
    case 'v':
228
475
  if (! strncmp(p, "viet", 4)) {
229
41
      p += 4;
230
41
      if (! strncmp(p, "tcvn", 4))
231
4
    return WC_CES_TCVN_5712;
232
41
  }
233
471
  if (! strncmp(p, "viscii", 6))
234
4
      return WC_CES_VISCII_11;
235
467
  if (! strncmp(p, "vps", 3))
236
7
      return WC_CES_VPS;
237
460
  break;
238
2.95k
    case 'u':
239
2.95k
#ifdef USE_UNICODE
240
2.95k
  if (! strncmp(p, "utf8", 4))
241
16
      return WC_CES_UTF_8;
242
2.93k
  if (! strncmp(p, "utf7", 4))
243
190
      return WC_CES_UTF_7;
244
2.74k
#endif
245
2.74k
  if (! strncmp(p, "uhc", 3))
246
411
      return WC_CES_UHC;
247
2.33k
  if (! strncmp(p, "ujis", 4))
248
4
      return WC_CES_EUC_JP;
249
2.32k
  if (! strncmp(p, "usascii", 7))
250
4
      return WC_CES_US_ASCII;
251
2.32k
  break;
252
2.32k
    case 'a':
253
109
  if (! strncmp(p, "ascii", 5))
254
3
      return WC_CES_US_ASCII;
255
106
  break;
256
369
    case 'c':
257
369
  if (! strncmp(p, "cngb", 4))
258
3
      return WC_CES_EUC_CN;
259
366
  if (*(p+1) != 'p')
260
344
      break;
261
22
  p += 2;
262
22
  if (*p >= '1' &&  *p <= '9')
263
9
      return wc_codepage(atoi(p));
264
13
  break;
265
446
    case 'w':
266
446
  if (strncmp(p, "windows", 7))
267
406
      break;
268
40
  p += 7;
269
40
  if (! strncmp(p, "31j", 3))
270
3
      return WC_CES_CP932;
271
37
  if (*p >= '1' &&  *p <= '9')
272
18
      return wc_codepage(atoi(p));
273
19
  break;
274
15.4k
    }
275
11.6k
    return 0;
276
15.4k
}
277
278
wc_ces
279
wc_charset_short_to_ces(char *charset)
280
15.4k
{
281
15.4k
    char *p = charset;
282
15.4k
    char buf[16];
283
15.4k
    wc_ces ces;
284
15.4k
    int n;
285
286
15.4k
    ces = wc_charset_to_ces(charset);
287
15.4k
    if (ces)
288
3.81k
  return ces;
289
290
40.1k
    for (n = 0; *p && n < 15; p++) {
291
28.4k
  if ((unsigned char)*p > 0x20 && *p != '_' && *p != '-')
292
26.4k
      buf[n++] = *p | 32; /* tolower(*p); */
293
28.4k
    }
294
11.6k
    buf[n] = 0;
295
11.6k
    p = buf;
296
11.6k
    switch (*p) {
297
1.21k
    case 'e':
298
1.21k
  switch (*(p+1)) {
299
0
  case 'j': return WC_CES_EUC_JP;
300
6
  case 'c': return WC_CES_EUC_CN;
301
466
  case 't': return WC_CES_EUC_TW;
302
17
  case 'k': return WC_CES_EUC_KR;
303
1.21k
  }
304
723
  return WC_CES_EUC_JP;
305
1.35k
    case 'j':
306
1.35k
  p++;
307
1.35k
  if (*p == 'o')
308
685
      return WC_CES_JOHAB;
309
672
  if (*p == 'p')
310
6
     p++;
311
672
  if (*p == '2')
312
95
     return WC_CES_ISO_2022_JP_2;
313
577
  if (*p == '3')
314
7
     return WC_CES_ISO_2022_JP_3;
315
570
  return WC_CES_ISO_2022_JP;
316
1.13k
    case 's':
317
1.13k
  return WC_CES_SHIFT_JIS;
318
460
    case 'g':
319
460
  return WC_CES_EUC_CN;
320
413
    case 'b':
321
413
  return WC_CES_BIG5;
322
1.15k
    case 'h':
323
1.15k
  if (*(p+1) == 'k')
324
387
      return WC_CES_HKSCS;
325
772
  return WC_CES_HZ_GB_2312;
326
527
    case 'k':
327
527
  if (*(p+1) == 'o')
328
40
      return WC_CES_KOI8_R;
329
487
  return WC_CES_ISO_2022_KR;
330
308
    case 'l':
331
308
  n = atoi(p + 1);
332
308
  if (n >= 1 && n <= 16 && n != 12)
333
33
      return (WC_CES_E_ISO_8859 | n);
334
275
  return WC_CES_ISO_8859_1;
335
593
    case 't':
336
593
  if (*(p+1) == 'c')
337
130
      return WC_CES_TCVN_5712;
338
463
  return WC_CES_TIS_620;
339
195
    case 'n':
340
195
  return WC_CES_NEXTSTEP;
341
460
    case 'v':
342
460
  if (*(p+1) == 'p')
343
41
      return WC_CES_VPS;
344
419
  return WC_CES_VISCII_11;
345
0
#ifdef USE_UNICODE
346
2.32k
    case 'u':
347
2.32k
  if (*(p+1) == '7')
348
701
      return WC_CES_UTF_7;
349
1.62k
  return WC_CES_UTF_8;
350
0
#endif
351
106
    case 'a':
352
106
  return WC_CES_US_ASCII;
353
365
    case 'c':
354
365
  return WC_CES_ISO_2022_CN;
355
443
    case 'w':
356
443
  p++;
357
443
  if (*p >= '1' &&  *p <= '9')
358
341
      return wc_codepage(atoi(p));
359
102
  break;
360
126
    case 'r':
361
126
  return WC_CES_RAW;
362
11.6k
    }
363
563
    return 0;
364
11.6k
}
365
366
wc_ces
367
wc_locale_to_ces(char *locale)
368
0
{
369
0
    char *p = locale;
370
0
    char buf[8];
371
0
    int n;
372
373
0
    if (*p == 'C' && *(p+1) == '\0')
374
0
  return WC_CES_US_ASCII;
375
0
#ifdef HAVE_LANGINFO_CODESET
376
0
    {
377
0
  char *cs = nl_langinfo(CODESET);
378
0
  if (cs && strcmp(cs, "US-ASCII"))
379
0
      return wc_charset_to_ces(cs);
380
0
    }
381
0
#endif
382
0
    for (n = 0; *p && *p != '.' && n < 7; p++) {
383
0
  if ((unsigned char)*p > 0x20)
384
0
      buf[n++] = *p | 32; /* tolower(*p); */
385
0
    }
386
0
    buf[n] = 0;
387
0
    if (*p == '.') {
388
0
  p++;
389
0
  if (! strcasecmp(p, "euc")) {
390
0
      switch (buf[0]) {
391
0
      case 'j':
392
0
    WcLocale = WC_LOCALE_JA_JP;
393
0
    break;
394
0
      case 'k':
395
0
    WcLocale = WC_LOCALE_KO_KR;
396
0
    break;
397
0
      case 'z':
398
0
          if (!strcmp(buf, "zh_tw"))
399
0
        WcLocale = WC_LOCALE_ZH_TW;
400
0
          else if (!strcmp(buf, "zh_hk"))
401
0
        WcLocale = WC_LOCALE_ZH_HK;
402
0
    else
403
0
        WcLocale = WC_LOCALE_ZH_CN;
404
0
    break;
405
0
      default:
406
0
    WcLocale = 0;
407
0
    break;
408
0
      }
409
0
  }
410
0
  return wc_charset_to_ces(p);
411
0
    }
412
413
0
    if (!strcmp(buf, "japanese"))
414
0
  return WC_CES_SHIFT_JIS;
415
0
    if (!strcmp(buf, "zh_tw") ||
416
0
  !strcmp(buf, "zh_hk"))
417
0
  return WC_CES_BIG5;
418
0
    for (n = 0; lang_ces_table[n].lang; n++) {
419
0
  if (!strncmp(buf, lang_ces_table[n].lang, 2))
420
0
      return lang_ces_table[n].ces;
421
0
    }
422
0
    return WC_CES_ISO_8859_1;
423
0
}
424
425
char *
426
wc_ces_to_charset(wc_ces ces)
427
0
{
428
0
    if (ces == WC_CES_WTF)
429
0
  return "WTF";
430
0
    return WcCesInfo[WC_CES_INDEX(ces)].name;
431
0
}
432
433
char *
434
wc_ces_to_charset_desc(wc_ces ces)
435
0
{
436
0
    if (ces == WC_CES_WTF)
437
0
  return "W3M Transfer Format";
438
0
    return WcCesInfo[WC_CES_INDEX(ces)].desc;
439
0
}
440
441
wc_ces
442
wc_guess_8bit_charset(wc_ces orig)
443
0
{
444
0
    switch (orig) {
445
0
    case WC_CES_ISO_2022_JP:
446
0
    case WC_CES_ISO_2022_JP_2:
447
0
    case WC_CES_ISO_2022_JP_3:
448
0
  return WC_CES_EUC_JP;
449
0
    case WC_CES_ISO_2022_KR:
450
0
  return WC_CES_EUC_KR;
451
0
    case WC_CES_ISO_2022_CN:
452
0
    case WC_CES_HZ_GB_2312:
453
0
  return WC_CES_EUC_CN;
454
0
    case WC_CES_US_ASCII:
455
0
  return WC_CES_ISO_8859_1;
456
0
    }
457
0
    return orig;
458
0
}
459
460
wc_bool
461
wc_check_ces(wc_ces ces)
462
2
{
463
2
    size_t i = WC_CES_INDEX(ces);
464
465
2
    return (i <= WC_CES_END && WcCesInfo[i].id == ces);
466
2
}
467
468
static int
469
wc_ces_list_cmp(const void *a, const void *b)
470
0
{
471
0
    return strcasecmp(((wc_ces_list *)a)->desc, ((wc_ces_list *)b)->desc);
472
0
}
473
474
static wc_ces_list *list = NULL;
475
476
wc_ces_list *
477
wc_get_ces_list(void)
478
0
{
479
0
    wc_ces_info *info;
480
0
    size_t n;
481
482
0
    if (list)
483
0
  return list;
484
0
    for (info = WcCesInfo, n = 0; info->id; info++) {
485
0
  if (info->name != NULL)
486
0
      n++;
487
0
    }
488
0
    list = New_N(wc_ces_list, n + 1);
489
0
    for (info = WcCesInfo, n = 0; info->id; info++) {
490
0
  if (info->name != NULL) {
491
0
      list[n].id = info->id;
492
0
      list[n].name = info->name;
493
0
      list[n].desc = info->desc;
494
0
      n++;
495
0
  }
496
0
    }
497
0
    list[n].id = 0;
498
0
    list[n].name = NULL;
499
0
    list[n].desc = NULL;
500
0
    qsort(list, n, sizeof(wc_ces_list), wc_ces_list_cmp);
501
0
    return list;
502
0
}