Coverage Report

Created: 2025-09-27 06:52

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/postgres/src/common/encnames.c
Line
Count
Source
1
/*-------------------------------------------------------------------------
2
 *
3
 * encnames.c
4
 *    Encoding names and routines for working with them.
5
 *
6
 * Portions Copyright (c) 2001-2025, PostgreSQL Global Development Group
7
 *
8
 * IDENTIFICATION
9
 *    src/common/encnames.c
10
 *
11
 *-------------------------------------------------------------------------
12
 */
13
#include "c.h"
14
15
#include <ctype.h>
16
#include <unistd.h>
17
18
#include "mb/pg_wchar.h"
19
20
21
/* ----------
22
 * All encoding names, sorted:     *** A L P H A B E T I C ***
23
 *
24
 * All names must be without irrelevant chars, search routines use
25
 * isalnum() chars only. It means ISO-8859-1, iso_8859-1 and Iso8859_1
26
 * are always converted to 'iso88591'. All must be lower case.
27
 *
28
 * The table doesn't contain 'cs' aliases (like csISOLatin1). Are they needed?
29
 *
30
 * Karel Zak, Aug 2001
31
 * ----------
32
 */
33
/* One row of the name lookup table: an encoding name or alias and its id. */
typedef struct pg_encname
34
{
35
  const char *name;     /* cleaned name: lower case, alphanumerics only */
36
  pg_enc    encoding;   /* encoding id that this name maps to */
37
} pg_encname;
38
39
static const pg_encname pg_encname_tbl[] =
40
{
41
  {
42
    "abc", PG_WIN1258
43
  },              /* alias for WIN1258 */
44
  {
45
    "alt", PG_WIN866
46
  },              /* IBM866 */
47
  {
48
    "big5", PG_BIG5
49
  },              /* Big5; Chinese for Taiwan multibyte set */
50
  {
51
    "euccn", PG_EUC_CN
52
  },              /* EUC-CN; Extended Unix Code for simplified
53
                 * Chinese */
54
  {
55
    "eucjis2004", PG_EUC_JIS_2004
56
  },              /* EUC-JIS-2004; Extended UNIX Code fixed
57
                 * Width for Japanese, standard JIS X 0213 */
58
  {
59
    "eucjp", PG_EUC_JP
60
  },              /* EUC-JP; Extended UNIX Code fixed Width for
61
                 * Japanese, standard OSF */
62
  {
63
    "euckr", PG_EUC_KR
64
  },              /* EUC-KR; Extended Unix Code for Korean, KS
65
                 * X 1001 standard */
66
  {
67
    "euctw", PG_EUC_TW
68
  },              /* EUC-TW; Extended Unix Code for
69
                 *
70
                 * traditional Chinese */
71
  {
72
    "gb18030", PG_GB18030
73
  },              /* GB18030;GB18030 */
74
  {
75
    "gbk", PG_GBK
76
  },              /* GBK; Chinese Windows CodePage 936
77
                 * simplified Chinese */
78
  {
79
    "iso88591", PG_LATIN1
80
  },              /* ISO-8859-1; RFC1345,KXS2 */
81
  {
82
    "iso885910", PG_LATIN6
83
  },              /* ISO-8859-10; RFC1345,KXS2 */
84
  {
85
    "iso885913", PG_LATIN7
86
  },              /* ISO-8859-13; RFC1345,KXS2 */
87
  {
88
    "iso885914", PG_LATIN8
89
  },              /* ISO-8859-14; RFC1345,KXS2 */
90
  {
91
    "iso885915", PG_LATIN9
92
  },              /* ISO-8859-15; RFC1345,KXS2 */
93
  {
94
    "iso885916", PG_LATIN10
95
  },              /* ISO-8859-16; RFC1345,KXS2 */
96
  {
97
    "iso88592", PG_LATIN2
98
  },              /* ISO-8859-2; RFC1345,KXS2 */
99
  {
100
    "iso88593", PG_LATIN3
101
  },              /* ISO-8859-3; RFC1345,KXS2 */
102
  {
103
    "iso88594", PG_LATIN4
104
  },              /* ISO-8859-4; RFC1345,KXS2 */
105
  {
106
    "iso88595", PG_ISO_8859_5
107
  },              /* ISO-8859-5; RFC1345,KXS2 */
108
  {
109
    "iso88596", PG_ISO_8859_6
110
  },              /* ISO-8859-6; RFC1345,KXS2 */
111
  {
112
    "iso88597", PG_ISO_8859_7
113
  },              /* ISO-8859-7; RFC1345,KXS2 */
114
  {
115
    "iso88598", PG_ISO_8859_8
116
  },              /* ISO-8859-8; RFC1345,KXS2 */
117
  {
118
    "iso88599", PG_LATIN5
119
  },              /* ISO-8859-9; RFC1345,KXS2 */
120
  {
121
    "johab", PG_JOHAB
122
  },              /* JOHAB; Korean (Hangul) encoding, KS X 1001
123
                 * annex 3 */
124
  {
125
    "koi8", PG_KOI8R
126
  },              /* _dirty_ alias for KOI8-R (backward
127
                 * compatibility) */
128
  {
129
    "koi8r", PG_KOI8R
130
  },              /* KOI8-R; RFC1489 */
131
  {
132
    "koi8u", PG_KOI8U
133
  },              /* KOI8-U; RFC2319 */
134
  {
135
    "latin1", PG_LATIN1
136
  },              /* alias for ISO-8859-1 */
137
  {
138
    "latin10", PG_LATIN10
139
  },              /* alias for ISO-8859-16 */
140
  {
141
    "latin2", PG_LATIN2
142
  },              /* alias for ISO-8859-2 */
143
  {
144
    "latin3", PG_LATIN3
145
  },              /* alias for ISO-8859-3 */
146
  {
147
    "latin4", PG_LATIN4
148
  },              /* alias for ISO-8859-4 */
149
  {
150
    "latin5", PG_LATIN5
151
  },              /* alias for ISO-8859-9 */
152
  {
153
    "latin6", PG_LATIN6
154
  },              /* alias for ISO-8859-10 */
155
  {
156
    "latin7", PG_LATIN7
157
  },              /* alias for ISO-8859-13 */
158
  {
159
    "latin8", PG_LATIN8
160
  },              /* alias for ISO-8859-14 */
161
  {
162
    "latin9", PG_LATIN9
163
  },              /* alias for ISO-8859-15 */
164
  {
165
    "mskanji", PG_SJIS
166
  },              /* alias for Shift_JIS */
167
  {
168
    "muleinternal", PG_MULE_INTERNAL
169
  },
170
  {
171
    "shiftjis", PG_SJIS
172
  },              /* Shift_JIS; JIS X 0202-1991 */
173
174
  {
175
    "shiftjis2004", PG_SHIFT_JIS_2004
176
  },              /* SHIFT-JIS-2004; Shift JIS for Japanese,
177
                 * standard JIS X 0213 */
178
  {
179
    "sjis", PG_SJIS
180
  },              /* alias for Shift_JIS */
181
  {
182
    "sqlascii", PG_SQL_ASCII
183
  },
184
  {
185
    "tcvn", PG_WIN1258
186
  },              /* alias for WIN1258 */
187
  {
188
    "tcvn5712", PG_WIN1258
189
  },              /* alias for WIN1258 */
190
  {
191
    "uhc", PG_UHC
192
  },              /* UHC; Korean Windows CodePage 949 */
193
  {
194
    "unicode", PG_UTF8
195
  },              /* alias for UTF8 */
196
  {
197
    "utf8", PG_UTF8
198
  },              /* alias for UTF8 */
199
  {
200
    "vscii", PG_WIN1258
201
  },              /* alias for WIN1258 */
202
  {
203
    "win", PG_WIN1251
204
  },              /* _dirty_ alias for windows-1251 (backward
205
                 * compatibility) */
206
  {
207
    "win1250", PG_WIN1250
208
  },              /* alias for Windows-1250 */
209
  {
210
    "win1251", PG_WIN1251
211
  },              /* alias for Windows-1251 */
212
  {
213
    "win1252", PG_WIN1252
214
  },              /* alias for Windows-1252 */
215
  {
216
    "win1253", PG_WIN1253
217
  },              /* alias for Windows-1253 */
218
  {
219
    "win1254", PG_WIN1254
220
  },              /* alias for Windows-1254 */
221
  {
222
    "win1255", PG_WIN1255
223
  },              /* alias for Windows-1255 */
224
  {
225
    "win1256", PG_WIN1256
226
  },              /* alias for Windows-1256 */
227
  {
228
    "win1257", PG_WIN1257
229
  },              /* alias for Windows-1257 */
230
  {
231
    "win1258", PG_WIN1258
232
  },              /* alias for Windows-1258 */
233
  {
234
    "win866", PG_WIN866
235
  },              /* IBM866 */
236
  {
237
    "win874", PG_WIN874
238
  },              /* alias for Windows-874 */
239
  {
240
    "win932", PG_SJIS
241
  },              /* alias for Shift_JIS */
242
  {
243
    "win936", PG_GBK
244
  },              /* alias for GBK */
245
  {
246
    "win949", PG_UHC
247
  },              /* alias for UHC */
248
  {
249
    "win950", PG_BIG5
250
  },              /* alias for BIG5 */
251
  {
252
    "windows1250", PG_WIN1250
253
  },              /* Windows-1250; Microsoft */
254
  {
255
    "windows1251", PG_WIN1251
256
  },              /* Windows-1251; Microsoft */
257
  {
258
    "windows1252", PG_WIN1252
259
  },              /* Windows-1252; Microsoft */
260
  {
261
    "windows1253", PG_WIN1253
262
  },              /* Windows-1253; Microsoft */
263
  {
264
    "windows1254", PG_WIN1254
265
  },              /* Windows-1254; Microsoft */
266
  {
267
    "windows1255", PG_WIN1255
268
  },              /* Windows-1255; Microsoft */
269
  {
270
    "windows1256", PG_WIN1256
271
  },              /* Windows-1256; Microsoft */
272
  {
273
    "windows1257", PG_WIN1257
274
  },              /* Windows-1257; Microsoft */
275
  {
276
    "windows1258", PG_WIN1258
277
  },              /* Windows-1258; Microsoft */
278
  {
279
    "windows866", PG_WIN866
280
  },              /* IBM866 */
281
  {
282
    "windows874", PG_WIN874
283
  },              /* Windows-874; Microsoft */
284
  {
285
    "windows932", PG_SJIS
286
  },              /* alias for Shift_JIS */
287
  {
288
    "windows936", PG_GBK
289
  },              /* alias for GBK */
290
  {
291
    "windows949", PG_UHC
292
  },              /* alias for UHC */
293
  {
294
    "windows950", PG_BIG5
295
  }             /* alias for BIG5 */
296
};
297
298
/* ----------
299
 * These are "official" encoding names.
300
 * ----------
301
 */
302
#ifndef WIN32
303
#define DEF_ENC2NAME(name, codepage) { #name, PG_##name }
304
#else
305
#define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
306
#endif
307
308
const pg_enc2name pg_enc2name_tbl[] =
309
{
310
  [PG_SQL_ASCII] = DEF_ENC2NAME(SQL_ASCII, 0),
311
  [PG_EUC_JP] = DEF_ENC2NAME(EUC_JP, 20932),
312
  [PG_EUC_CN] = DEF_ENC2NAME(EUC_CN, 20936),
313
  [PG_EUC_KR] = DEF_ENC2NAME(EUC_KR, 51949),
314
  [PG_EUC_TW] = DEF_ENC2NAME(EUC_TW, 0),
315
  [PG_EUC_JIS_2004] = DEF_ENC2NAME(EUC_JIS_2004, 20932),
316
  [PG_UTF8] = DEF_ENC2NAME(UTF8, 65001),
317
  [PG_MULE_INTERNAL] = DEF_ENC2NAME(MULE_INTERNAL, 0),
318
  [PG_LATIN1] = DEF_ENC2NAME(LATIN1, 28591),
319
  [PG_LATIN2] = DEF_ENC2NAME(LATIN2, 28592),
320
  [PG_LATIN3] = DEF_ENC2NAME(LATIN3, 28593),
321
  [PG_LATIN4] = DEF_ENC2NAME(LATIN4, 28594),
322
  [PG_LATIN5] = DEF_ENC2NAME(LATIN5, 28599),
323
  [PG_LATIN6] = DEF_ENC2NAME(LATIN6, 0),
324
  [PG_LATIN7] = DEF_ENC2NAME(LATIN7, 0),
325
  [PG_LATIN8] = DEF_ENC2NAME(LATIN8, 0),
326
  [PG_LATIN9] = DEF_ENC2NAME(LATIN9, 28605),
327
  [PG_LATIN10] = DEF_ENC2NAME(LATIN10, 0),
328
  [PG_WIN1256] = DEF_ENC2NAME(WIN1256, 1256),
329
  [PG_WIN1258] = DEF_ENC2NAME(WIN1258, 1258),
330
  [PG_WIN866] = DEF_ENC2NAME(WIN866, 866),
331
  [PG_WIN874] = DEF_ENC2NAME(WIN874, 874),
332
  [PG_KOI8R] = DEF_ENC2NAME(KOI8R, 20866),
333
  [PG_WIN1251] = DEF_ENC2NAME(WIN1251, 1251),
334
  [PG_WIN1252] = DEF_ENC2NAME(WIN1252, 1252),
335
  [PG_ISO_8859_5] = DEF_ENC2NAME(ISO_8859_5, 28595),
336
  [PG_ISO_8859_6] = DEF_ENC2NAME(ISO_8859_6, 28596),
337
  [PG_ISO_8859_7] = DEF_ENC2NAME(ISO_8859_7, 28597),
338
  [PG_ISO_8859_8] = DEF_ENC2NAME(ISO_8859_8, 28598),
339
  [PG_WIN1250] = DEF_ENC2NAME(WIN1250, 1250),
340
  [PG_WIN1253] = DEF_ENC2NAME(WIN1253, 1253),
341
  [PG_WIN1254] = DEF_ENC2NAME(WIN1254, 1254),
342
  [PG_WIN1255] = DEF_ENC2NAME(WIN1255, 1255),
343
  [PG_WIN1257] = DEF_ENC2NAME(WIN1257, 1257),
344
  [PG_KOI8U] = DEF_ENC2NAME(KOI8U, 21866),
345
  [PG_SJIS] = DEF_ENC2NAME(SJIS, 932),
346
  [PG_BIG5] = DEF_ENC2NAME(BIG5, 950),
347
  [PG_GBK] = DEF_ENC2NAME(GBK, 936),
348
  [PG_UHC] = DEF_ENC2NAME(UHC, 949),
349
  [PG_GB18030] = DEF_ENC2NAME(GB18030, 54936),
350
  [PG_JOHAB] = DEF_ENC2NAME(JOHAB, 0),
351
  [PG_SHIFT_JIS_2004] = DEF_ENC2NAME(SHIFT_JIS_2004, 932),
352
};
353
354
/* ----------
355
 * These are encoding names for gettext.
356
 *
357
 * This covers all encodings except MULE_INTERNAL, which is alien to gettext.
358
 * ----------
359
 */
360
const char *pg_enc2gettext_tbl[] =
361
{
362
  [PG_SQL_ASCII] = "US-ASCII",
363
  [PG_UTF8] = "UTF-8",
364
  [PG_MULE_INTERNAL] = NULL,
365
  [PG_LATIN1] = "LATIN1",
366
  [PG_LATIN2] = "LATIN2",
367
  [PG_LATIN3] = "LATIN3",
368
  [PG_LATIN4] = "LATIN4",
369
  [PG_ISO_8859_5] = "ISO-8859-5",
370
  [PG_ISO_8859_6] = "ISO_8859-6",
371
  [PG_ISO_8859_7] = "ISO-8859-7",
372
  [PG_ISO_8859_8] = "ISO-8859-8",
373
  [PG_LATIN5] = "LATIN5",
374
  [PG_LATIN6] = "LATIN6",
375
  [PG_LATIN7] = "LATIN7",
376
  [PG_LATIN8] = "LATIN8",
377
  [PG_LATIN9] = "LATIN-9",
378
  [PG_LATIN10] = "LATIN10",
379
  [PG_KOI8R] = "KOI8-R",
380
  [PG_KOI8U] = "KOI8-U",
381
  [PG_WIN1250] = "CP1250",
382
  [PG_WIN1251] = "CP1251",
383
  [PG_WIN1252] = "CP1252",
384
  [PG_WIN1253] = "CP1253",
385
  [PG_WIN1254] = "CP1254",
386
  [PG_WIN1255] = "CP1255",
387
  [PG_WIN1256] = "CP1256",
388
  [PG_WIN1257] = "CP1257",
389
  [PG_WIN1258] = "CP1258",
390
  [PG_WIN866] = "CP866",
391
  [PG_WIN874] = "CP874",
392
  [PG_EUC_CN] = "EUC-CN",
393
  [PG_EUC_JP] = "EUC-JP",
394
  [PG_EUC_KR] = "EUC-KR",
395
  [PG_EUC_TW] = "EUC-TW",
396
  [PG_EUC_JIS_2004] = "EUC-JP",
397
  [PG_SJIS] = "SHIFT-JIS",
398
  [PG_BIG5] = "BIG5",
399
  [PG_GBK] = "GBK",
400
  [PG_UHC] = "UHC",
401
  [PG_GB18030] = "GB18030",
402
  [PG_JOHAB] = "JOHAB",
403
  [PG_SHIFT_JIS_2004] = "SHIFT_JISX0213",
404
};
405
406
407
/*
408
 * Table of encoding names for ICU (currently covers backend encodings only)
409
 *
410
 * Reference: <https://ssl.icu-project.org/icu-bin/convexp>
411
 *
412
 * NULL entries are not supported by ICU, or their mapping is unclear.
413
 */
414
static const char *const pg_enc2icu_tbl[] =
415
{
416
  [PG_SQL_ASCII] = NULL,
417
  [PG_EUC_JP] = "EUC-JP",
418
  [PG_EUC_CN] = "EUC-CN",
419
  [PG_EUC_KR] = "EUC-KR",
420
  [PG_EUC_TW] = "EUC-TW",
421
  [PG_EUC_JIS_2004] = NULL,
422
  [PG_UTF8] = "UTF-8",
423
  [PG_MULE_INTERNAL] = NULL,
424
  [PG_LATIN1] = "ISO-8859-1",
425
  [PG_LATIN2] = "ISO-8859-2",
426
  [PG_LATIN3] = "ISO-8859-3",
427
  [PG_LATIN4] = "ISO-8859-4",
428
  [PG_LATIN5] = "ISO-8859-9",
429
  [PG_LATIN6] = "ISO-8859-10",
430
  [PG_LATIN7] = "ISO-8859-13",
431
  [PG_LATIN8] = "ISO-8859-14",
432
  [PG_LATIN9] = "ISO-8859-15",
433
  [PG_LATIN10] = NULL,
434
  [PG_WIN1256] = "CP1256",
435
  [PG_WIN1258] = "CP1258",
436
  [PG_WIN866] = "CP866",
437
  [PG_WIN874] = NULL,
438
  [PG_KOI8R] = "KOI8-R",
439
  [PG_WIN1251] = "CP1251",
440
  [PG_WIN1252] = "CP1252",
441
  [PG_ISO_8859_5] = "ISO-8859-5",
442
  [PG_ISO_8859_6] = "ISO-8859-6",
443
  [PG_ISO_8859_7] = "ISO-8859-7",
444
  [PG_ISO_8859_8] = "ISO-8859-8",
445
  [PG_WIN1250] = "CP1250",
446
  [PG_WIN1253] = "CP1253",
447
  [PG_WIN1254] = "CP1254",
448
  [PG_WIN1255] = "CP1255",
449
  [PG_WIN1257] = "CP1257",
450
  [PG_KOI8U] = "KOI8-U",
451
};
452
453
StaticAssertDecl(lengthof(pg_enc2icu_tbl) == PG_ENCODING_BE_LAST + 1,
454
         "pg_enc2icu_tbl incomplete");
455
456
457
/*
458
 * Is this encoding supported by ICU?
459
 */
460
bool
461
is_encoding_supported_by_icu(int encoding)
462
0
{
463
0
  if (!PG_VALID_BE_ENCODING(encoding))
464
0
    return false;
465
0
  return (pg_enc2icu_tbl[encoding] != NULL);
466
0
}
467
468
/*
469
 * Returns ICU's name for encoding, or NULL if not supported
470
 */
471
const char *
472
get_encoding_name_for_icu(int encoding)
473
0
{
474
0
  if (!PG_VALID_BE_ENCODING(encoding))
475
0
    return NULL;
476
0
  return pg_enc2icu_tbl[encoding];
477
0
}
478
479
480
/* ----------
481
 * Encoding checks: return the encoding id, or -1 on error
482
 * ----------
483
 */
484
/*
 * Resolve "name" with pg_char_to_encoding() and accept the result only if it
 * also passes PG_VALID_FE_ENCODING().
 *
 * Returns the encoding id on success, -1 otherwise.
 */
int
pg_valid_client_encoding(const char *name)
{
  int     id = pg_char_to_encoding(name);

  /* unknown name, or an id that fails the frontend validity check */
  if (id < 0 || !PG_VALID_FE_ENCODING(id))
    return -1;

  return id;
}
497
498
/*
 * Resolve "name" with pg_char_to_encoding() and accept the result only if it
 * also passes PG_VALID_BE_ENCODING().
 *
 * Returns the encoding id on success, -1 otherwise.
 */
int
pg_valid_server_encoding(const char *name)
{
  int     id = pg_char_to_encoding(name);

  /* unknown name, or an id that fails the backend validity check */
  if (id < 0 || !PG_VALID_BE_ENCODING(id))
    return -1;

  return id;
}
511
512
/*
 * Apply PG_VALID_BE_ENCODING() to an encoding id and hand back its value as
 * an int.
 */
int
pg_valid_server_encoding_id(int encoding)
{
  int     verdict = PG_VALID_BE_ENCODING(encoding);

  return verdict;
}
517
518
/*
 * Build the lookup form of an encoding name.
 *
 * Copies "key" into "newkey", keeping only characters accepted by isalnum()
 * and folding 'A'..'Z' to lower case; the result is NUL-terminated.
 *
 * The caller must supply a buffer large enough for the result (at most
 * strlen(key) + 1 bytes are written).  Returns newkey.
 */
static char *
clean_encoding_name(const char *key, char *newkey)
{
  char     *out = newkey;

  while (*key != '\0')
  {
    char    c = *key++;

    /* separators and punctuation are simply dropped */
    if (!isalnum((unsigned char) c))
      continue;

    /* explicit range test: only plain ASCII capitals are folded */
    if (c >= 'A' && c <= 'Z')
      c += 'a' - 'A';

    *out++ = c;
  }
  *out = '\0';

  return newkey;
}
542
543
/*
544
 * Search encoding by encoding name
545
 *
546
 * Returns encoding ID, or -1 if not recognized
547
 */
548
int
549
pg_char_to_encoding(const char *name)
550
2
{
551
2
  unsigned int nel = lengthof(pg_encname_tbl);
552
2
  const pg_encname *base = pg_encname_tbl,
553
2
         *last = base + nel - 1,
554
2
         *position;
555
2
  int     result;
556
2
  char    buff[NAMEDATALEN],
557
2
         *key;
558
559
2
  if (name == NULL || *name == '\0')
560
0
    return -1;
561
562
2
  if (strlen(name) >= NAMEDATALEN)
563
0
    return -1;       /* it's certainly not in the table */
564
565
2
  key = clean_encoding_name(name, buff);
566
567
12
  while (last >= base)
568
12
  {
569
12
    position = base + ((last - base) >> 1);
570
12
    result = key[0] - position->name[0];
571
572
12
    if (result == 0)
573
6
    {
574
6
      result = strcmp(key, position->name);
575
6
      if (result == 0)
576
2
        return position->encoding;
577
6
    }
578
10
    if (result < 0)
579
6
      last = position - 1;
580
4
    else
581
4
      base = position + 1;
582
10
  }
583
0
  return -1;
584
2
}
585
586
const char *
587
pg_encoding_to_char(int encoding)
588
2
{
589
2
  if (PG_VALID_ENCODING(encoding))
590
2
  {
591
2
    const pg_enc2name *p = &pg_enc2name_tbl[encoding];
592
593
2
    Assert(encoding == p->encoding);
594
2
    return p->name;
595
2
  }
596
0
  return "";
597
2
}