Coverage Report

Created: 2025-08-25 07:10

/src/libredwg/src/codepages.c
Line
Count
Source (jump to first uncovered line)
1
/*****************************************************************************/
2
/*  LibreDWG - free implementation of the DWG file format                    */
3
/*                                                                           */
4
/*  Copyright (C) 2023-2025 Free Software Foundation, Inc.                   */
5
/*                                                                           */
6
/*  This library is free software, licensed under the terms of the GNU       */
7
/*  General Public License as published by the Free Software Foundation,     */
8
/*  either version 3 of the License, or (at your option) any later version.  */
9
/*  You should have received a copy of the GNU General Public License        */
10
/*  along with this program.  If not, see <http://www.gnu.org/licenses/>.    */
11
/*****************************************************************************/
12
13
/*
14
 * codepages.c: preR2007 codepages support via iconv
15
 * written by Reini Urban
16
 *
17
 * See also the src mappings from https://www.unicode.org/Public/MAPPINGS/
18
 * or the libdxfrw/src/intern/drw_textcodec.cpp mappings.
19
 */
20
21
#include "config.h"
22
#include <string.h>
23
#include <stdint.h>
24
#include <stdlib.h>
25
#include <errno.h>
26
#include <ctype.h>
27
#include <assert.h>
28
#if defined HAVE_ICONV && defined HAVE_ICONV_H
29
#  include <iconv.h>
30
#endif
31
#if defined HAVE_WCTYPE_H
32
#  include <wctype.h>
33
#endif
34
// #define CODEPAGES_C
35
#include "common.h"
36
#include "codepages.h"
37
38
#include "codepages/ISO-8859-2.h"
39
#include "codepages/ISO-8859-3.h"
40
#include "codepages/ISO-8859-4.h"
41
#include "codepages/ISO-8859-5.h"
42
#include "codepages/ISO-8859-6.h"
43
#include "codepages/ISO-8859-7.h"
44
#include "codepages/ISO-8859-8.h"
45
#include "codepages/ISO-8859-9.h"
46
#include "codepages/CP437.h"
47
#include "codepages/CP850.h"
48
#include "codepages/CP852.h"
49
#include "codepages/CP855.h"
50
#include "codepages/CP857.h"
51
#include "codepages/CP860.h"
52
#include "codepages/CP861.h"
53
#include "codepages/CP863.h"
54
#include "codepages/CP864.h"
55
#include "codepages/CP865.h"
56
#include "codepages/CP869.h"
57
#include "codepages/CP932.h"
58
#include "codepages/MACINTOSH.h"
59
#include "codepages/BIG5.h"
60
#include "codepages/CP949.h"
61
#include "codepages/JOHAB.h"
62
#include "codepages/CP866.h"
63
#include "codepages/WINDOWS-1250.h"
64
#include "codepages/WINDOWS-1251.h"
65
#include "codepages/WINDOWS-1252.h"
66
#include "codepages/GB2312.h"
67
#include "codepages/WINDOWS-1253.h"
68
#include "codepages/WINDOWS-1254.h"
69
#include "codepages/WINDOWS-1255.h"
70
#include "codepages/WINDOWS-1256.h"
71
#include "codepages/WINDOWS-1257.h"
72
#include "codepages/WINDOWS-874.h"
73
#include "codepages/WINDOWS-932.h"
74
#include "codepages/WINDOWS-936.h"
75
#include "codepages/WINDOWS-949.h"
76
#include "codepages/WINDOWS-950.h"
77
#include "codepages/WINDOWS-1361.h"
78
#include "codepages/WINDOWS-1258.h"
79
80
static const uint16_t *cp_fntbl[] = { NULL, // UTF8
81
                                      NULL, // US-ASCII
82
                                      NULL, // ISO-8859-1
83
                                      cptbl_iso_8859_2,
84
                                      cptbl_iso_8859_3,
85
                                      cptbl_iso_8859_4,
86
                                      cptbl_iso_8859_5,
87
                                      cptbl_iso_8859_6,
88
                                      cptbl_iso_8859_7,
89
                                      cptbl_iso_8859_8,
90
                                      cptbl_iso_8859_9,
91
                                      cptbl_cp437,
92
                                      cptbl_cp850,
93
                                      cptbl_cp852,
94
                                      cptbl_cp855,
95
                                      cptbl_cp857,
96
                                      cptbl_cp860,
97
                                      cptbl_cp861,
98
                                      cptbl_cp863,
99
                                      cptbl_cp864,
100
                                      cptbl_cp865,
101
                                      cptbl_cp869,
102
                                      cptbl_cp932, /* original shiftjis */
103
                                      cptbl_macintosh,
104
                                      cptbl_big5,
105
                                      cptbl_cp949, /* 25 */
106
                                      cptbl_johab, /* 26 */
107
                                      cptbl_cp866,
108
                                      cptbl_windows_1250,
109
                                      cptbl_windows_1251, /* 29 */
110
                                      cptbl_windows_1252, /* 30 */
111
                                      cptbl_gb2312,
112
                                      cptbl_windows_1253,
113
                                      cptbl_windows_1254,
114
                                      cptbl_windows_1255,
115
                                      cptbl_windows_1256,
116
                                      cptbl_windows_1257,
117
                                      cptbl_windows_874,
118
                                      cptbl_windows_932, /* windows-31j */
119
                                      cptbl_windows_936,
120
                                      cptbl_windows_949,
121
                                      cptbl_windows_950,
122
                                      cptbl_windows_1361, /* 42 */
123
                                      NULL,               /* 43 UTF16 */
124
                                      cptbl_windows_1258,
125
                                      NULL };
126
127
static const uint8_t *cp_alnumtbl[]
128
    = { NULL, // UTF8
129
        NULL, // US-ASCII
130
        NULL, // ISO-8859-1
131
        cptbl_alnum_iso_8859_2,
132
        cptbl_alnum_iso_8859_3,
133
        cptbl_alnum_iso_8859_4,
134
        cptbl_alnum_iso_8859_5,
135
        cptbl_alnum_iso_8859_6,
136
        cptbl_alnum_iso_8859_7,
137
        cptbl_alnum_iso_8859_8,
138
        cptbl_alnum_iso_8859_9,
139
        cptbl_alnum_cp437,
140
        cptbl_alnum_cp850,
141
        cptbl_alnum_cp852,
142
        cptbl_alnum_cp855,
143
        cptbl_alnum_cp857,
144
        cptbl_alnum_cp860,
145
        cptbl_alnum_cp861,
146
        cptbl_alnum_cp863,
147
        cptbl_alnum_cp864,
148
        cptbl_alnum_cp865,
149
        cptbl_alnum_cp869,
150
        NULL, // cptbl_alnum_cp932, /* original shiftjis */
151
        cptbl_alnum_macintosh,
152
        NULL, // cptbl_alnum_big5,
153
        NULL, // cptbl_alnum_cp949, /* 25 */
154
        NULL, // cptbl_alnum_johab, /* 26 */
155
        cptbl_alnum_cp866,
156
        cptbl_alnum_windows_1250,
157
        cptbl_alnum_windows_1251, /* 29 */
158
        cptbl_alnum_windows_1252, /* 30 */
159
        NULL,                     // cptbl_alnum_gb2312,
160
        cptbl_alnum_windows_1253,
161
        cptbl_alnum_windows_1254,
162
        cptbl_alnum_windows_1255,
163
        cptbl_alnum_windows_1256,
164
        cptbl_alnum_windows_1257,
165
        cptbl_alnum_windows_874,
166
        NULL, // cptbl_alnum_windows_932, /* windows-31j */
167
        NULL, // cptbl_alnum_windows_936,
168
        NULL, // cptbl_alnum_windows_949,
169
        NULL, // cptbl_alnum_windows_950,
170
        NULL, // cptbl_alnum_windows_1361, /* 42 */
171
        NULL, /* 43 UTF16 */
172
        cptbl_alnum_windows_1258,
173
        NULL };
174
175
static const uint16_t *cp_alnum16tbl[]
176
    = { NULL,              // UTF8
177
        NULL,              // US-ASCII
178
        NULL,              // ISO-8859-1
179
        NULL,              // cptbl_alnum_iso_8859_2,
180
        NULL,              // cptbl_alnum_iso_8859_3,
181
        NULL,              // cptbl_alnum_iso_8859_4,
182
        NULL,              // cptbl_alnum_iso_8859_5,
183
        NULL,              // cptbl_alnum_iso_8859_6,
184
        NULL,              // cptbl_alnum_iso_8859_7,
185
        NULL,              // cptbl_alnum_iso_8859_8,
186
        NULL,              // cptbl_alnum_iso_8859_9,
187
        NULL,              // cptbl_alnum_cp437,
188
        NULL,              // cptbl_alnum_cp850,
189
        NULL,              // cptbl_alnum_cp852,
190
        NULL,              // cptbl_alnum_cp855,
191
        NULL,              // cptbl_alnum_cp857,
192
        NULL,              // cptbl_alnum_cp860,
193
        NULL,              // cptbl_alnum_cp861,
194
        NULL,              // cptbl_alnum_cp863,
195
        NULL,              // cptbl_alnum_cp864,
196
        NULL,              // cptbl_alnum_cp865,
197
        NULL,              // cptbl_alnum_cp869,
198
        cptbl_alnum_cp932, /* original shiftjis */
199
        NULL,              // cptbl_alnum_macintosh,
200
        cptbl_alnum_big5,
201
        cptbl_alnum_cp949, /* 25 */
202
        cptbl_alnum_johab, /* 26 */
203
        NULL,              // cptbl_alnum_cp866,
204
        NULL,              // cptbl_alnum_windows_1250,
205
        NULL,              // cptbl_alnum_windows_1251, /* 29 */
206
        NULL,              // cptbl_alnum_windows_1252, /* 30 */
207
        cptbl_alnum_gb2312,
208
        NULL,                    // cptbl_alnum_windows_1253,
209
        NULL,                    // cptbl_alnum_windows_1254,
210
        NULL,                    // cptbl_alnum_windows_1255,
211
        NULL,                    // cptbl_alnum_windows_1256,
212
        NULL,                    // cptbl_alnum_windows_1257,
213
        NULL,                    // cptbl_alnum_windows_874,
214
        cptbl_alnum_windows_932, /* windows-31j */
215
        cptbl_alnum_windows_936,
216
        cptbl_alnum_windows_949,
217
        cptbl_alnum_windows_950,
218
        cptbl_alnum_windows_1361, /* 42 */
219
        NULL,                     /* 43 UTF16 */
220
        NULL,                     // cptbl_alnum_windows_1258,
221
        NULL };
222
223
// synced with typedef enum _dwg_codepage in codepages.h
224
#ifdef HAVE_ICONV
225
226
const char *
227
dwg_codepage_iconvstr (Dwg_Codepage cp)
228
65.8k
{
229
  // for iconv
230
65.8k
  const char *_codepage_iconvstr[] = { "UTF8",         "US-ASCII",
231
65.8k
                                       "ISO-8859-1",   "ISO-8859-2",
232
65.8k
                                       "ISO-8859-3",   "ISO-8859-4",
233
65.8k
                                       "ISO-8859-5",   "ISO-8859-6",
234
65.8k
                                       "ISO-8859-7",   "ISO-8859-8",
235
65.8k
                                       "ISO-8859-9",   "CP437",
236
65.8k
                                       "CP850",        "CP852",
237
65.8k
                                       "CP855",        "CP857",
238
65.8k
                                       "CP860",        "CP861",
239
65.8k
                                       "CP863",        "CP864",
240
65.8k
                                       "CP865",        "CP869",
241
65.8k
                                       "CP932",        "MACINTOSH",
242
65.8k
                                       "BIG5",         "CP949", /* 25 */
243
65.8k
                                       "JOHAB",        "CP866",
244
65.8k
                                       "WINDOWS-1250", "WINDOWS-1251", /* 29 */
245
65.8k
                                       "WINDOWS-1252",                 /* 30 */
246
65.8k
                                       "GB2312",       "WINDOWS-1253",
247
65.8k
                                       "WINDOWS-1254", "WINDOWS-1255",
248
65.8k
                                       "WINDOWS-1256", "WINDOWS-1257",
249
65.8k
                                       "WINDOWS-874",  "WINDOWS-932",
250
65.8k
                                       "WINDOWS-936",  "WINDOWS-949",
251
65.8k
                                       "WINDOWS-950",  "WINDOWS-1361",
252
65.8k
                                       "UTF16", /* 43 */
253
65.8k
                                       "WINDOWS-1258", NULL };
254
65.8k
  if (cp <= CP_ANSI_1258)
255
65.8k
    return _codepage_iconvstr[cp];
256
0
  else
257
0
    return NULL;
258
65.8k
}
259
#endif
260
261
/* Codepage names, indexed by Dwg_Codepage value and terminated by NULL.
   dwg_codepage_dxfstr() and dwg_codepage_int() map between index and name.
   Note the underscore in "US_ASCII" and the ANSI_ prefix for the
   Windows codepages.  */
const char *_codepage_dxfstr[] = {
  "UTF8",       /*  0 */
  "US_ASCII",   /*  1 */
  "ISO-8859-1", /*  2 */
  "ISO-8859-2", /*  3 */
  "ISO-8859-3", /*  4 */
  "ISO-8859-4", /*  5 */
  "ISO-8859-5", /*  6 */
  "ISO-8859-6", /*  7 */
  "ISO-8859-7", /*  8 */
  "ISO-8859-8", /*  9 */
  "ISO-8859-9", /* 10 */
  "CP437",      /* 11 */
  "CP850",      /* 12 */
  "CP852",      /* 13 */
  "CP855",      /* 14 */
  "CP857",      /* 15 */
  "CP860",      /* 16 */
  "CP861",      /* 17 */
  "CP863",      /* 18 */
  "CP864",      /* 19 */
  "CP865",      /* 20 */
  "CP869",      /* 21 */
  "CP932",      /* 22 */
  "MACINTOSH",  /* 23 */
  "BIG5",       /* 24 */
  "CP949",      /* 25 */
  "JOHAB",      /* 26 */
  "CP866",      /* 27 */
  "ANSI_1250",  /* 28 */
  "ANSI_1251",  /* 29 */
  "ANSI_1252",  /* 30 WesternEurope Windows */
  "GB2312",     /* 31 */
  "ANSI_1253",  /* 32 */
  "ANSI_1254",  /* 33 */
  "ANSI_1255",  /* 34 */
  "ANSI_1256",  /* 35 */
  "ANSI_1257",  /* 36 */
  "ANSI_874",   /* 37 */
  "ANSI_932",   /* 38 */
  "ANSI_936",   /* 39 */
  "ANSI_949",   /* 40 */
  "ANSI_950",   /* 41 */
  "ANSI_1361",  /* 42 */
  "UTF16",      /* 43 */
  "ANSI_1258",  /* 44 */
  NULL
};
274
275
const char *
276
dwg_codepage_dxfstr (Dwg_Codepage cp)
277
0
{
278
0
  if (cp <= CP_ANSI_1258)
279
0
    return _codepage_dxfstr[cp];
280
0
  else if (cp == CP_UNDEFINED)
281
0
    return "undefined";
282
0
  else
283
0
    return NULL;
284
0
}
285
286
Dwg_Codepage
287
dwg_codepage_int (const char *s)
288
259
{
289
8.02k
  for (int i = 0; i <= (int)CP_ANSI_1258; i++)
290
8.02k
    {
291
8.02k
      if (strEQ (s, _codepage_dxfstr[i]))
292
259
        return (Dwg_Codepage)i;
293
7.77k
      if (islower (*s) && 0 == strcasecmp (s, _codepage_dxfstr[i]))
294
0
        return (Dwg_Codepage)i;
295
7.77k
    }
296
0
  return CP_UNDEFINED;
297
259
}
298
299
/* helper to check if a codepoint exists in the codepage,
300
   and convert it to/from unicode.
301
   dir = 1: from unicode wc to charset
302
   asian = 1: 2-byte CJK charset, else 1-byte (0-255)
303
*/
304
static wchar_t
305
codepage_helper (const Dwg_Codepage codepage, const wchar_t wc, const int dir,
306
                 const int asian)
307
106k
{
308
106k
  const uint16_t *fntbl;
309
106k
  uint16_t maxc;
310
106k
  assert (codepage != CP_UTF8 && codepage != CP_UTF16
311
106k
          && codepage != CP_US_ASCII && codepage != CP_ISO_8859_1);
312
106k
  fntbl = cp_fntbl[codepage];
313
106k
  maxc = fntbl[0];
314
106k
  assert (maxc);
315
106k
  if (dir) // from unicode to charset.
316
0
    {      // reverse lookup. unsorted rhs values so we cannot bsearch.
317
0
      for (uint16_t i = 0x80; i < maxc; i++)
318
0
        {
319
0
          if (wc == fntbl[i])
320
0
            return i;
321
0
        }
322
0
      return 0;
323
0
    }
324
106k
  else
325
106k
    {
326
106k
      if (wc < maxc)
327
84.3k
        return fntbl[wc];
328
21.9k
      else
329
21.9k
        return 0;
330
106k
    }
331
106k
}
332
333
// returns the matching unicode codepoint,
334
// or 0 if the codepage does not contain the character
335
wchar_t
336
dwg_codepage_uc (Dwg_Codepage cp, unsigned char c)
337
0
{
338
0
  if (c < 128)
339
0
    return (wchar_t)c;
340
0
  else if (cp == CP_US_ASCII)
341
0
    return 0;
342
0
  if (cp == CP_ISO_8859_1 || cp == CP_UTF8 || cp == CP_UTF16)
343
0
    return (wchar_t)c;
344
0
  return codepage_helper (cp, (wchar_t)c, 0, 0);
345
0
}
346
// for wide asian chars
347
wchar_t
348
dwg_codepage_uwc (Dwg_Codepage cp, uint16_t c)
349
414k
{
350
414k
  if (cp == CP_CP864 && c == 0x25)
351
0
    return 0x066a;
352
414k
  else if (cp == CP_CP932 && c == 0x5c)
353
0
    return 0x00A5;
354
414k
  else if (cp == CP_CP932 && c == 0x7e)
355
0
    return 0x203E;
356
414k
  else if (cp == CP_JOHAB && c == 0x5c)
357
0
    return 0x20A9;
358
414k
  else if (c < 128 || cp == CP_UTF8 || cp == CP_UTF16)
359
308k
    return (wchar_t)c;
360
106k
  return codepage_helper (cp, (wchar_t)c, 0, 1);
361
414k
}
362
// returns the matching codepoint,
363
// or 0 if the codepage does not contain the wide character
364
unsigned char
365
dwg_codepage_c (Dwg_Codepage cp, wchar_t wc)
366
0
{
367
0
  if (wc < 128)
368
0
    {
369
0
      if (cp == CP_US_ASCII || cp == CP_UTF8 || cp == CP_UTF16)
370
0
        return wc & 0xff;
371
0
    }
372
0
  else if (cp == CP_US_ASCII)
373
0
    return 0;
374
0
  if (cp == CP_ISO_8859_1 || cp == CP_UTF8)
375
0
    return wc < 256 ? wc : 0;
376
0
  return (unsigned char)codepage_helper (cp, wc, 1, 0);
377
0
}
378
// for wide asian chars
379
uint16_t
380
dwg_codepage_wc (Dwg_Codepage cp, wchar_t wc)
381
0
{
382
0
  if (wc < 128 || cp == CP_UTF8 || cp == CP_UTF16)
383
0
    return wc & 0xffff;
384
0
  return (uint16_t)codepage_helper (cp, wc, 1, 1);
385
0
}
386
387
/* for possible wide asian chars:
388
   932 is single-byte for most chars, but 0x8*, 0x9*, 0xE* and 0xF* lead bytes
389
   CP949, JOHAB, ANSI_949, 936, 950 for all > 0x8* lead bytes
390
   1361 for all but 0x8[0123], 0xD[4567F], 0xF[A-F] lead bytes
391
   BIG5, GB2312 are two-byte only.
392
393
   none have valid 0x00 bytes, so strlen works as before in the TV case.
394
*/
395
bool
396
dwg_codepage_isasian (const Dwg_Codepage cp)
397
856k
{
398
856k
  if (cp >= CP_BIG5 && cp <= CP_JOHAB)
399
348k
    return true;
400
508k
  else if (cp >= CP_ANSI_932 && cp <= CP_ANSI_1258)
401
203k
    return true;
402
305k
  else if (cp == CP_GB2312)
403
0
    return true;
404
305k
  else
405
305k
    return false;
406
856k
}
407
408
/* bsearch comparator for uint8_t elements.
   Returns -1, 0 or 1 when *a is less than, equal to or greater than *b.
   The arguments are read through const pointers; the const qualifier of
   the comparator interface is not cast away.  */
static int
b8_cmp (const void *a, const void *b)
{
  const uint8_t lhs = *(const uint8_t *)a;
  const uint8_t rhs = *(const uint8_t *)b;

  return (lhs > rhs) - (lhs < rhs);
}
415
416
/* bsearch comparator for uint16_t elements.
   Returns -1, 0 or 1 when *a is less than, equal to or greater than *b.
   The arguments are read through const pointers; the const qualifier of
   the comparator interface is not cast away.  */
static int
b16_cmp (const void *a, const void *b)
{
  const uint16_t lhs = *(const uint16_t *)a;
  const uint16_t rhs = *(const uint16_t *)b;

  return (lhs > rhs) - (lhs < rhs);
}
423
424
bool
425
dwg_codepage_isalnum (const Dwg_Codepage cp, const wchar_t c)
426
0
{
427
0
  if (c < 128)
428
0
    return isalnum ((int)c);
429
0
  switch (cp)
430
0
    {
431
0
    case CP_US_ASCII:
432
0
      return false;
433
0
    case CP_ISO_8859_1:
434
0
      return (c >= 0xC0 && c <= 0xD6) || (c >= 0xD8 && c <= 0xFF);
435
0
    case CP_UTF8:
436
0
    case CP_UTF16: // fallthru
437
0
#if defined HAVE_WCTYPE_H
438
0
      return iswalnum ((int)c);
439
#else
440
                   // TODO panic?
441
      return false;
442
#endif
443
0
    default:
444
0
      {
445
0
        const uint8_t *fntbl = cp_alnumtbl[cp];
446
0
        assert (cp != CP_UTF8 && cp != CP_UTF16 && cp != CP_US_ASCII
447
0
                && cp != CP_ISO_8859_1);
448
        // 8 or 16bit?
449
0
        if (fntbl)
450
0
          {
451
0
            const uint8_t key = c & 0xff;
452
0
            const uint8_t sz8 = fntbl[0];
453
0
            const size_t sz = (size_t)sz8;
454
0
            uint8_t *found
455
0
                = (uint8_t *)bsearch (&key, &fntbl[1], sz, 1, b8_cmp);
456
0
            if (!found || found == &fntbl[0])
457
0
              return false;
458
0
            else
459
0
              return true;
460
0
          }
461
0
        else
462
0
          {
463
0
            const uint16_t key = c & 0xffff;
464
0
            const uint16_t *fntbl16 = cp_alnum16tbl[cp];
465
0
            const uint16_t sz16 = fntbl16[0];
466
0
            const size_t sz = (size_t)sz16;
467
0
            uint16_t *found
468
0
                = (uint16_t *)bsearch (&key, &fntbl16[1], sz, 2, b16_cmp);
469
0
            if (!found || found == &fntbl16[0])
470
0
              return false;
471
0
            else
472
0
              return true;
473
0
          }
474
0
        return false;
475
0
      }
476
0
    }
477
0
}
478
479
bool
480
dwg_codepage_is_twobyte (const Dwg_Codepage cp, const unsigned char c)
481
414k
{
482
414k
  if (cp == CP_CP932 || cp == CP_ANSI_932)
483
0
    return (c >= 0x80 && c <= 0x9F) || (c >= 0xE0);
484
414k
  else if (cp == CP_CP949 || cp == CP_ANSI_949 || cp == CP_ANSI_936
485
414k
           || cp == CP_ANSI_950)
486
106k
    return c & 0x80;
487
308k
  else if (cp == CP_ANSI_1361)
488
308k
    return (c >= 0x80 && c <= 0x83) || (c >= 0xD4 && c <= 0xD7) || (c == 0xDF)
489
308k
           || (c >= 0xFA);
490
0
  else if (cp == CP_GB2312 || cp == CP_BIG5)
491
0
    return true;
492
0
  else
493
0
    return false;
494
414k
}