Coverage Report

Created: 2025-08-12 06:43

/src/postgres/src/common/wchar.c
Line
Count
Source (jump to first uncovered line)
1
/*-------------------------------------------------------------------------
2
 *
3
 * wchar.c
4
 *    Functions for working with multibyte characters in various encodings.
5
 *
6
 * Portions Copyright (c) 1998-2025, PostgreSQL Global Development Group
7
 *
8
 * IDENTIFICATION
9
 *    src/common/wchar.c
10
 *
11
 *-------------------------------------------------------------------------
12
 */
13
#include "c.h"
14
15
#include <limits.h>
16
17
#include "mb/pg_wchar.h"
18
#include "utils/ascii.h"
19
20
21
/*
22
 * In today's multibyte encodings other than UTF8, this two-byte sequence
23
 * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
24
 *
25
 * For historical reasons, several verifychar implementations opt to reject
26
 * this pair specifically.  Byte pair range constraints, in encoding
27
 * originator documentation, always excluded this pair.  No core conversion
28
 * could translate it.  However, longstanding verifychar implementations
29
 * accepted any non-NUL byte.  big5_to_euc_tw and big5_to_mic even translate
30
 * pairs not valid per encoding originator documentation.  To avoid tightening
31
 * core or non-core conversions in a security patch, we sought this one pair.
32
 *
33
 * PQescapeString() historically used spaces for BYTE1; many other values
34
 * could suffice for BYTE1.
35
 */
36
0
#define NONUTF8_INVALID_BYTE0 (0x8d)
37
0
#define NONUTF8_INVALID_BYTE1 (' ')
38
39
40
/*
41
 * Operations on multi-byte encodings are driven by a table of helper
42
 * functions.
43
 *
44
 * To add an encoding support, define mblen(), dsplen(), verifychar() and
45
 * verifystr() for the encoding.  For server-encodings, also define mb2wchar()
46
 * and wchar2mb() conversion functions.
47
 *
48
 * These functions generally assume that their input is validly formed.
49
 * The "verifier" functions, further down in the file, have to be more
50
 * paranoid.
51
 *
52
 * We expect that mblen() does not need to examine more than the first byte
53
 * of the character to discover the correct length.  GB18030 is an exception
54
 * to that rule, though, as it also looks at second byte.  But even that
55
 * behaves in a predictable way, if you only pass the first byte: it will
56
 * treat 4-byte encoded characters as two 2-byte encoded characters, which is
57
 * good enough for all current uses.
58
 *
59
 * Note: for the display output of psql to work properly, the return values
60
 * of the dsplen functions must conform to the Unicode standard. In particular
61
 * the NUL character is zero width and control characters are generally
62
 * width -1. It is recommended that non-ASCII encodings refer their ASCII
63
 * subset to the ASCII routines to ensure consistency.
64
 */
65
66
/*
67
 * SQL/ASCII
68
 */
69
static int
70
pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
71
0
{
72
0
  int     cnt = 0;
73
74
0
  while (len > 0 && *from)
75
0
  {
76
0
    *to++ = *from++;
77
0
    len--;
78
0
    cnt++;
79
0
  }
80
0
  *to = 0;
81
0
  return cnt;
82
0
}
83
84
static int
85
pg_ascii_mblen(const unsigned char *s)
86
27
{
87
27
  return 1;
88
27
}
89
90
static int
91
pg_ascii_dsplen(const unsigned char *s)
92
0
{
93
0
  if (*s == '\0')
94
0
    return 0;
95
0
  if (*s < 0x20 || *s == 0x7f)
96
0
    return -1;
97
98
0
  return 1;
99
0
}
100
101
/*
102
 * EUC
103
 */
104
static int
105
pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
106
0
{
107
0
  int     cnt = 0;
108
109
0
  while (len > 0 && *from)
110
0
  {
111
0
    if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
112
                     * KANA") */
113
0
    {
114
0
      from++;
115
0
      *to = (SS2 << 8) | *from++;
116
0
      len -= 2;
117
0
    }
118
0
    else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
119
0
    {
120
0
      from++;
121
0
      *to = (SS3 << 16) | (*from++ << 8);
122
0
      *to |= *from++;
123
0
      len -= 3;
124
0
    }
125
0
    else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
126
0
    {
127
0
      *to = *from++ << 8;
128
0
      *to |= *from++;
129
0
      len -= 2;
130
0
    }
131
0
    else          /* must be ASCII */
132
0
    {
133
0
      *to = *from++;
134
0
      len--;
135
0
    }
136
0
    to++;
137
0
    cnt++;
138
0
  }
139
0
  *to = 0;
140
0
  return cnt;
141
0
}
142
143
static inline int
144
pg_euc_mblen(const unsigned char *s)
145
0
{
146
0
  int     len;
147
148
0
  if (*s == SS2)
149
0
    len = 2;
150
0
  else if (*s == SS3)
151
0
    len = 3;
152
0
  else if (IS_HIGHBIT_SET(*s))
153
0
    len = 2;
154
0
  else
155
0
    len = 1;
156
0
  return len;
157
0
}
158
159
static inline int
160
pg_euc_dsplen(const unsigned char *s)
161
0
{
162
0
  int     len;
163
164
0
  if (*s == SS2)
165
0
    len = 2;
166
0
  else if (*s == SS3)
167
0
    len = 2;
168
0
  else if (IS_HIGHBIT_SET(*s))
169
0
    len = 2;
170
0
  else
171
0
    len = pg_ascii_dsplen(s);
172
0
  return len;
173
0
}
174
175
/*
176
 * EUC_JP
177
 */
178
static int
179
pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
180
0
{
181
0
  return pg_euc2wchar_with_len(from, to, len);
182
0
}
183
184
static int
185
pg_eucjp_mblen(const unsigned char *s)
186
0
{
187
0
  return pg_euc_mblen(s);
188
0
}
189
190
static int
191
pg_eucjp_dsplen(const unsigned char *s)
192
0
{
193
0
  int     len;
194
195
0
  if (*s == SS2)
196
0
    len = 1;
197
0
  else if (*s == SS3)
198
0
    len = 2;
199
0
  else if (IS_HIGHBIT_SET(*s))
200
0
    len = 2;
201
0
  else
202
0
    len = pg_ascii_dsplen(s);
203
0
  return len;
204
0
}
205
206
/*
207
 * EUC_KR
208
 */
209
static int
210
pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
211
0
{
212
0
  return pg_euc2wchar_with_len(from, to, len);
213
0
}
214
215
static int
216
pg_euckr_mblen(const unsigned char *s)
217
0
{
218
0
  return pg_euc_mblen(s);
219
0
}
220
221
static int
222
pg_euckr_dsplen(const unsigned char *s)
223
0
{
224
0
  return pg_euc_dsplen(s);
225
0
}
226
227
/*
228
 * EUC_CN
229
 *
230
 */
231
static int
232
pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
233
0
{
234
0
  int     cnt = 0;
235
236
0
  while (len > 0 && *from)
237
0
  {
238
0
    if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
239
0
    {
240
0
      from++;
241
0
      *to = (SS2 << 16) | (*from++ << 8);
242
0
      *to |= *from++;
243
0
      len -= 3;
244
0
    }
245
0
    else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
246
0
    {
247
0
      from++;
248
0
      *to = (SS3 << 16) | (*from++ << 8);
249
0
      *to |= *from++;
250
0
      len -= 3;
251
0
    }
252
0
    else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
253
0
    {
254
0
      *to = *from++ << 8;
255
0
      *to |= *from++;
256
0
      len -= 2;
257
0
    }
258
0
    else
259
0
    {
260
0
      *to = *from++;
261
0
      len--;
262
0
    }
263
0
    to++;
264
0
    cnt++;
265
0
  }
266
0
  *to = 0;
267
0
  return cnt;
268
0
}
269
270
static int
271
pg_euccn_mblen(const unsigned char *s)
272
0
{
273
0
  int     len;
274
275
0
  if (IS_HIGHBIT_SET(*s))
276
0
    len = 2;
277
0
  else
278
0
    len = 1;
279
0
  return len;
280
0
}
281
282
static int
283
pg_euccn_dsplen(const unsigned char *s)
284
0
{
285
0
  int     len;
286
287
0
  if (IS_HIGHBIT_SET(*s))
288
0
    len = 2;
289
0
  else
290
0
    len = pg_ascii_dsplen(s);
291
0
  return len;
292
0
}
293
294
/*
295
 * EUC_TW
296
 *
297
 */
298
static int
299
pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
300
0
{
301
0
  int     cnt = 0;
302
303
0
  while (len > 0 && *from)
304
0
  {
305
0
    if (*from == SS2 && len >= 4) /* code set 2 */
306
0
    {
307
0
      from++;
308
0
      *to = (((uint32) SS2) << 24) | (*from++ << 16);
309
0
      *to |= *from++ << 8;
310
0
      *to |= *from++;
311
0
      len -= 4;
312
0
    }
313
0
    else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
314
0
    {
315
0
      from++;
316
0
      *to = (SS3 << 16) | (*from++ << 8);
317
0
      *to |= *from++;
318
0
      len -= 3;
319
0
    }
320
0
    else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
321
0
    {
322
0
      *to = *from++ << 8;
323
0
      *to |= *from++;
324
0
      len -= 2;
325
0
    }
326
0
    else
327
0
    {
328
0
      *to = *from++;
329
0
      len--;
330
0
    }
331
0
    to++;
332
0
    cnt++;
333
0
  }
334
0
  *to = 0;
335
0
  return cnt;
336
0
}
337
338
static int
339
pg_euctw_mblen(const unsigned char *s)
340
0
{
341
0
  int     len;
342
343
0
  if (*s == SS2)
344
0
    len = 4;
345
0
  else if (*s == SS3)
346
0
    len = 3;
347
0
  else if (IS_HIGHBIT_SET(*s))
348
0
    len = 2;
349
0
  else
350
0
    len = 1;
351
0
  return len;
352
0
}
353
354
static int
355
pg_euctw_dsplen(const unsigned char *s)
356
0
{
357
0
  int     len;
358
359
0
  if (*s == SS2)
360
0
    len = 2;
361
0
  else if (*s == SS3)
362
0
    len = 2;
363
0
  else if (IS_HIGHBIT_SET(*s))
364
0
    len = 2;
365
0
  else
366
0
    len = pg_ascii_dsplen(s);
367
0
  return len;
368
0
}
369
370
/*
371
 * Convert pg_wchar to EUC_* encoding.
372
 * caller must allocate enough space for "to", including a trailing zero!
373
 * len: length of from.
374
 * "from" not necessarily null terminated.
375
 */
376
static int
377
pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
378
0
{
379
0
  int     cnt = 0;
380
381
0
  while (len > 0 && *from)
382
0
  {
383
0
    unsigned char c;
384
385
0
    if ((c = (*from >> 24)))
386
0
    {
387
0
      *to++ = c;
388
0
      *to++ = (*from >> 16) & 0xff;
389
0
      *to++ = (*from >> 8) & 0xff;
390
0
      *to++ = *from & 0xff;
391
0
      cnt += 4;
392
0
    }
393
0
    else if ((c = (*from >> 16)))
394
0
    {
395
0
      *to++ = c;
396
0
      *to++ = (*from >> 8) & 0xff;
397
0
      *to++ = *from & 0xff;
398
0
      cnt += 3;
399
0
    }
400
0
    else if ((c = (*from >> 8)))
401
0
    {
402
0
      *to++ = c;
403
0
      *to++ = *from & 0xff;
404
0
      cnt += 2;
405
0
    }
406
0
    else
407
0
    {
408
0
      *to++ = *from;
409
0
      cnt++;
410
0
    }
411
0
    from++;
412
0
    len--;
413
0
  }
414
0
  *to = 0;
415
0
  return cnt;
416
0
}
417
418
419
/*
420
 * JOHAB
421
 */
422
static int
423
pg_johab_mblen(const unsigned char *s)
424
0
{
425
0
  return pg_euc_mblen(s);
426
0
}
427
428
static int
429
pg_johab_dsplen(const unsigned char *s)
430
0
{
431
0
  return pg_euc_dsplen(s);
432
0
}
433
434
/*
435
 * convert UTF8 string to pg_wchar (UCS-4)
436
 * caller must allocate enough space for "to", including a trailing zero!
437
 * len: length of from.
438
 * "from" not necessarily null terminated.
439
 */
440
static int
441
pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
442
0
{
443
0
  int     cnt = 0;
444
0
  uint32    c1,
445
0
        c2,
446
0
        c3,
447
0
        c4;
448
449
0
  while (len > 0 && *from)
450
0
  {
451
0
    if ((*from & 0x80) == 0)
452
0
    {
453
0
      *to = *from++;
454
0
      len--;
455
0
    }
456
0
    else if ((*from & 0xe0) == 0xc0)
457
0
    {
458
0
      if (len < 2)
459
0
        break;     /* drop trailing incomplete char */
460
0
      c1 = *from++ & 0x1f;
461
0
      c2 = *from++ & 0x3f;
462
0
      *to = (c1 << 6) | c2;
463
0
      len -= 2;
464
0
    }
465
0
    else if ((*from & 0xf0) == 0xe0)
466
0
    {
467
0
      if (len < 3)
468
0
        break;     /* drop trailing incomplete char */
469
0
      c1 = *from++ & 0x0f;
470
0
      c2 = *from++ & 0x3f;
471
0
      c3 = *from++ & 0x3f;
472
0
      *to = (c1 << 12) | (c2 << 6) | c3;
473
0
      len -= 3;
474
0
    }
475
0
    else if ((*from & 0xf8) == 0xf0)
476
0
    {
477
0
      if (len < 4)
478
0
        break;     /* drop trailing incomplete char */
479
0
      c1 = *from++ & 0x07;
480
0
      c2 = *from++ & 0x3f;
481
0
      c3 = *from++ & 0x3f;
482
0
      c4 = *from++ & 0x3f;
483
0
      *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
484
0
      len -= 4;
485
0
    }
486
0
    else
487
0
    {
488
      /* treat a bogus char as length 1; not ours to raise error */
489
0
      *to = *from++;
490
0
      len--;
491
0
    }
492
0
    to++;
493
0
    cnt++;
494
0
  }
495
0
  *to = 0;
496
0
  return cnt;
497
0
}
498
499
500
/*
501
 * Trivial conversion from pg_wchar to UTF-8.
502
 * caller should allocate enough space for "to"
503
 * len: length of from.
504
 * "from" not necessarily null terminated.
505
 */
506
static int
507
pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
508
0
{
509
0
  int     cnt = 0;
510
511
0
  while (len > 0 && *from)
512
0
  {
513
0
    int     char_len;
514
515
0
    unicode_to_utf8(*from, to);
516
0
    char_len = pg_utf_mblen(to);
517
0
    cnt += char_len;
518
0
    to += char_len;
519
0
    from++;
520
0
    len--;
521
0
  }
522
0
  *to = 0;
523
0
  return cnt;
524
0
}
525
526
/*
527
 * Return the byte length of a UTF8 character pointed to by s
528
 *
529
 * Note: in the current implementation we do not support UTF8 sequences
530
 * of more than 4 bytes; hence do NOT return a value larger than 4.
531
 * We return "1" for any leading byte that is either flat-out illegal or
532
 * indicates a length larger than we support.
533
 *
534
 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
535
 * other places would need to be fixed to change this.
536
 */
537
int
538
pg_utf_mblen(const unsigned char *s)
539
486
{
540
486
  int     len;
541
542
486
  if ((*s & 0x80) == 0)
543
402
    len = 1;
544
84
  else if ((*s & 0xe0) == 0xc0)
545
5
    len = 2;
546
79
  else if ((*s & 0xf0) == 0xe0)
547
8
    len = 3;
548
71
  else if ((*s & 0xf8) == 0xf0)
549
29
    len = 4;
550
#ifdef NOT_USED
551
  else if ((*s & 0xfc) == 0xf8)
552
    len = 5;
553
  else if ((*s & 0xfe) == 0xfc)
554
    len = 6;
555
#endif
556
42
  else
557
42
    len = 1;
558
486
  return len;
559
486
}
560
561
/*
562
 * This is an implementation of wcwidth() and wcswidth() as defined in
563
 * "The Single UNIX Specification, Version 2, The Open Group, 1997"
564
 * <http://www.unix.org/online.html>
565
 *
566
 * Markus Kuhn -- 2001-09-08 -- public domain
567
 *
568
 * customised for PostgreSQL
569
 *
570
 * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
571
 */
572
573
struct mbinterval
574
{
575
  unsigned int first;
576
  unsigned int last;
577
};
578
579
/* auxiliary function for binary search in interval table */
580
static int
581
mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
582
0
{
583
0
  int     min = 0;
584
0
  int     mid;
585
586
0
  if (ucs < table[0].first || ucs > table[max].last)
587
0
    return 0;
588
0
  while (max >= min)
589
0
  {
590
0
    mid = (min + max) / 2;
591
0
    if (ucs > table[mid].last)
592
0
      min = mid + 1;
593
0
    else if (ucs < table[mid].first)
594
0
      max = mid - 1;
595
0
    else
596
0
      return 1;
597
0
  }
598
599
0
  return 0;
600
0
}
601
602
603
/* The following functions define the column width of an ISO 10646
604
 * character as follows:
605
 *
606
 *    - The null character (U+0000) has a column width of 0.
607
 *
608
 *    - Other C0/C1 control characters and DEL will lead to a return
609
 *    value of -1.
610
 *
611
 *    - Non-spacing and enclosing combining characters (general
612
 *    category code Mn, Me or Cf in the Unicode database) have a
613
 *    column width of 0.
614
 *
615
 *    - Spacing characters in the East Asian Wide (W) or East Asian
616
 *    FullWidth (F) category as defined in Unicode Technical
617
 *    Report #11 have a column width of 2.
618
 *
619
 *    - All remaining characters (including all printable
620
 *    ISO 8859-1 and WGL4 characters, Unicode control characters,
621
 *    etc.) have a column width of 1.
622
 *
623
 * This implementation assumes that wchar_t characters are encoded
624
 * in ISO 10646.
625
 */
626
627
static int
628
ucs_wcwidth(pg_wchar ucs)
629
0
{
630
0
#include "common/unicode_nonspacing_table.h"
631
0
#include "common/unicode_east_asian_fw_table.h"
632
633
  /* test for 8-bit control characters */
634
0
  if (ucs == 0)
635
0
    return 0;
636
637
0
  if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
638
0
    return -1;
639
640
  /*
641
   * binary search in table of non-spacing characters
642
   *
643
   * XXX: In the official Unicode sources, it is possible for a character to
644
   * be described as both non-spacing and wide at the same time. As of
645
   * Unicode 13.0, treating the non-spacing property as the determining
646
   * factor for display width leads to the correct behavior, so do that
647
   * search first.
648
   */
649
0
  if (mbbisearch(ucs, nonspacing,
650
0
           sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
651
0
    return 0;
652
653
  /* binary search in table of wide characters */
654
0
  if (mbbisearch(ucs, east_asian_fw,
655
0
           sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
656
0
    return 2;
657
658
0
  return 1;
659
0
}
660
661
static int
662
pg_utf_dsplen(const unsigned char *s)
663
0
{
664
0
  return ucs_wcwidth(utf8_to_unicode(s));
665
0
}
666
667
/*
668
 * convert mule internal code to pg_wchar
669
 * caller should allocate enough space for "to"
670
 * len: length of from.
671
 * "from" not necessarily null terminated.
672
 */
673
static int
674
pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
675
0
{
676
0
  int     cnt = 0;
677
678
0
  while (len > 0 && *from)
679
0
  {
680
0
    if (IS_LC1(*from) && len >= 2)
681
0
    {
682
0
      *to = *from++ << 16;
683
0
      *to |= *from++;
684
0
      len -= 2;
685
0
    }
686
0
    else if (IS_LCPRV1(*from) && len >= 3)
687
0
    {
688
0
      from++;
689
0
      *to = *from++ << 16;
690
0
      *to |= *from++;
691
0
      len -= 3;
692
0
    }
693
0
    else if (IS_LC2(*from) && len >= 3)
694
0
    {
695
0
      *to = *from++ << 16;
696
0
      *to |= *from++ << 8;
697
0
      *to |= *from++;
698
0
      len -= 3;
699
0
    }
700
0
    else if (IS_LCPRV2(*from) && len >= 4)
701
0
    {
702
0
      from++;
703
0
      *to = *from++ << 16;
704
0
      *to |= *from++ << 8;
705
0
      *to |= *from++;
706
0
      len -= 4;
707
0
    }
708
0
    else
709
0
    {           /* assume ASCII */
710
0
      *to = (unsigned char) *from++;
711
0
      len--;
712
0
    }
713
0
    to++;
714
0
    cnt++;
715
0
  }
716
0
  *to = 0;
717
0
  return cnt;
718
0
}
719
720
/*
721
 * convert pg_wchar to mule internal code
722
 * caller should allocate enough space for "to"
723
 * len: length of from.
724
 * "from" not necessarily null terminated.
725
 */
726
static int
727
pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
728
0
{
729
0
  int     cnt = 0;
730
731
0
  while (len > 0 && *from)
732
0
  {
733
0
    unsigned char lb;
734
735
0
    lb = (*from >> 16) & 0xff;
736
0
    if (IS_LC1(lb))
737
0
    {
738
0
      *to++ = lb;
739
0
      *to++ = *from & 0xff;
740
0
      cnt += 2;
741
0
    }
742
0
    else if (IS_LC2(lb))
743
0
    {
744
0
      *to++ = lb;
745
0
      *to++ = (*from >> 8) & 0xff;
746
0
      *to++ = *from & 0xff;
747
0
      cnt += 3;
748
0
    }
749
0
    else if (IS_LCPRV1_A_RANGE(lb))
750
0
    {
751
0
      *to++ = LCPRV1_A;
752
0
      *to++ = lb;
753
0
      *to++ = *from & 0xff;
754
0
      cnt += 3;
755
0
    }
756
0
    else if (IS_LCPRV1_B_RANGE(lb))
757
0
    {
758
0
      *to++ = LCPRV1_B;
759
0
      *to++ = lb;
760
0
      *to++ = *from & 0xff;
761
0
      cnt += 3;
762
0
    }
763
0
    else if (IS_LCPRV2_A_RANGE(lb))
764
0
    {
765
0
      *to++ = LCPRV2_A;
766
0
      *to++ = lb;
767
0
      *to++ = (*from >> 8) & 0xff;
768
0
      *to++ = *from & 0xff;
769
0
      cnt += 4;
770
0
    }
771
0
    else if (IS_LCPRV2_B_RANGE(lb))
772
0
    {
773
0
      *to++ = LCPRV2_B;
774
0
      *to++ = lb;
775
0
      *to++ = (*from >> 8) & 0xff;
776
0
      *to++ = *from & 0xff;
777
0
      cnt += 4;
778
0
    }
779
0
    else
780
0
    {
781
0
      *to++ = *from & 0xff;
782
0
      cnt += 1;
783
0
    }
784
0
    from++;
785
0
    len--;
786
0
  }
787
0
  *to = 0;
788
0
  return cnt;
789
0
}
790
791
/* exported for direct use by conv.c */
792
int
793
pg_mule_mblen(const unsigned char *s)
794
0
{
795
0
  int     len;
796
797
0
  if (IS_LC1(*s))
798
0
    len = 2;
799
0
  else if (IS_LCPRV1(*s))
800
0
    len = 3;
801
0
  else if (IS_LC2(*s))
802
0
    len = 3;
803
0
  else if (IS_LCPRV2(*s))
804
0
    len = 4;
805
0
  else
806
0
    len = 1;       /* assume ASCII */
807
0
  return len;
808
0
}
809
810
static int
811
pg_mule_dsplen(const unsigned char *s)
812
0
{
813
0
  int     len;
814
815
  /*
816
   * Note: it's not really appropriate to assume that all multibyte charsets
817
   * are double-wide on screen.  But this seems an okay approximation for
818
   * the MULE charsets we currently support.
819
   */
820
821
0
  if (IS_LC1(*s))
822
0
    len = 1;
823
0
  else if (IS_LCPRV1(*s))
824
0
    len = 1;
825
0
  else if (IS_LC2(*s))
826
0
    len = 2;
827
0
  else if (IS_LCPRV2(*s))
828
0
    len = 2;
829
0
  else
830
0
    len = 1;       /* assume ASCII */
831
832
0
  return len;
833
0
}
834
835
/*
836
 * ISO8859-1
837
 */
838
static int
839
pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
840
0
{
841
0
  int     cnt = 0;
842
843
0
  while (len > 0 && *from)
844
0
  {
845
0
    *to++ = *from++;
846
0
    len--;
847
0
    cnt++;
848
0
  }
849
0
  *to = 0;
850
0
  return cnt;
851
0
}
852
853
/*
854
 * Trivial conversion from pg_wchar to single byte encoding. Just ignores
855
 * high bits.
856
 * caller should allocate enough space for "to"
857
 * len: length of from.
858
 * "from" not necessarily null terminated.
859
 */
860
static int
861
pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
862
0
{
863
0
  int     cnt = 0;
864
865
0
  while (len > 0 && *from)
866
0
  {
867
0
    *to++ = *from++;
868
0
    len--;
869
0
    cnt++;
870
0
  }
871
0
  *to = 0;
872
0
  return cnt;
873
0
}
874
875
static int
876
pg_latin1_mblen(const unsigned char *s)
877
0
{
878
0
  return 1;
879
0
}
880
881
static int
882
pg_latin1_dsplen(const unsigned char *s)
883
0
{
884
0
  return pg_ascii_dsplen(s);
885
0
}
886
887
/*
888
 * SJIS
889
 */
890
static int
891
pg_sjis_mblen(const unsigned char *s)
892
0
{
893
0
  int     len;
894
895
0
  if (*s >= 0xa1 && *s <= 0xdf)
896
0
    len = 1;       /* 1 byte kana? */
897
0
  else if (IS_HIGHBIT_SET(*s))
898
0
    len = 2;       /* kanji? */
899
0
  else
900
0
    len = 1;       /* should be ASCII */
901
0
  return len;
902
0
}
903
904
static int
905
pg_sjis_dsplen(const unsigned char *s)
906
0
{
907
0
  int     len;
908
909
0
  if (*s >= 0xa1 && *s <= 0xdf)
910
0
    len = 1;       /* 1 byte kana? */
911
0
  else if (IS_HIGHBIT_SET(*s))
912
0
    len = 2;       /* kanji? */
913
0
  else
914
0
    len = pg_ascii_dsplen(s); /* should be ASCII */
915
0
  return len;
916
0
}
917
918
/*
919
 * Big5
920
 */
921
static int
922
pg_big5_mblen(const unsigned char *s)
923
0
{
924
0
  int     len;
925
926
0
  if (IS_HIGHBIT_SET(*s))
927
0
    len = 2;       /* kanji? */
928
0
  else
929
0
    len = 1;       /* should be ASCII */
930
0
  return len;
931
0
}
932
933
static int
934
pg_big5_dsplen(const unsigned char *s)
935
0
{
936
0
  int     len;
937
938
0
  if (IS_HIGHBIT_SET(*s))
939
0
    len = 2;       /* kanji? */
940
0
  else
941
0
    len = pg_ascii_dsplen(s); /* should be ASCII */
942
0
  return len;
943
0
}
944
945
/*
946
 * GBK
947
 */
948
static int
949
pg_gbk_mblen(const unsigned char *s)
950
0
{
951
0
  int     len;
952
953
0
  if (IS_HIGHBIT_SET(*s))
954
0
    len = 2;       /* kanji? */
955
0
  else
956
0
    len = 1;       /* should be ASCII */
957
0
  return len;
958
0
}
959
960
static int
961
pg_gbk_dsplen(const unsigned char *s)
962
0
{
963
0
  int     len;
964
965
0
  if (IS_HIGHBIT_SET(*s))
966
0
    len = 2;       /* kanji? */
967
0
  else
968
0
    len = pg_ascii_dsplen(s); /* should be ASCII */
969
0
  return len;
970
0
}
971
972
/*
973
 * UHC
974
 */
975
static int
976
pg_uhc_mblen(const unsigned char *s)
977
0
{
978
0
  int     len;
979
980
0
  if (IS_HIGHBIT_SET(*s))
981
0
    len = 2;       /* 2byte? */
982
0
  else
983
0
    len = 1;       /* should be ASCII */
984
0
  return len;
985
0
}
986
987
static int
988
pg_uhc_dsplen(const unsigned char *s)
989
0
{
990
0
  int     len;
991
992
0
  if (IS_HIGHBIT_SET(*s))
993
0
    len = 2;       /* 2byte? */
994
0
  else
995
0
    len = pg_ascii_dsplen(s); /* should be ASCII */
996
0
  return len;
997
0
}
998
999
/*
1000
 * GB18030
1001
 *  Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
1002
 */
1003
1004
/*
1005
 * Unlike all other mblen() functions, this also looks at the second byte of
1006
 * the input.  However, if you only pass the first byte of a multi-byte
1007
 * string, and \0 as the second byte, this still works in a predictable way:
1008
 * a 4-byte character will be reported as two 2-byte characters.  That's
1009
 * enough for all current uses, as a client-only encoding.  It works that
1010
 * way, because in any valid 4-byte GB18030-encoded character, the third and
1011
 * fourth byte look like a 2-byte encoded character, when looked at
1012
 * separately.
1013
 */
1014
static int
1015
pg_gb18030_mblen(const unsigned char *s)
1016
0
{
1017
0
  int     len;
1018
1019
0
  if (!IS_HIGHBIT_SET(*s))
1020
0
    len = 1;       /* ASCII */
1021
0
  else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
1022
0
    len = 4;
1023
0
  else
1024
0
    len = 2;
1025
0
  return len;
1026
0
}
1027
1028
static int
1029
pg_gb18030_dsplen(const unsigned char *s)
1030
0
{
1031
0
  int     len;
1032
1033
0
  if (IS_HIGHBIT_SET(*s))
1034
0
    len = 2;
1035
0
  else
1036
0
    len = pg_ascii_dsplen(s); /* ASCII */
1037
0
  return len;
1038
0
}
1039
1040
/*
1041
 *-------------------------------------------------------------------
1042
 * multibyte sequence validators
1043
 *
1044
 * The verifychar functions accept "s", a pointer to the first byte of a
1045
 * string, and "len", the remaining length of the string.  If there is a
1046
 * validly encoded character beginning at *s, return its length in bytes;
1047
 * else return -1.
1048
 *
1049
 * The verifystr functions also accept "s", a pointer to a string and "len",
1050
 * the length of the string.  They verify the whole string, and return the
1051
 * number of input bytes (<= len) that are valid.  In other words, if the
1052
 * whole string is valid, verifystr returns "len", otherwise it returns the
1053
 * byte offset of the first invalid character.  The verifystr functions must
1054
 * test for and reject zeroes in the input.
1055
 *
1056
 * The verifychar functions can assume that len > 0 and that *s != '\0', but
1057
 * they must test for and reject zeroes in any additional bytes of a
1058
 * multibyte character.  Note that this definition allows the function for a
1059
 * single-byte encoding to be just "return 1".
1060
 *-------------------------------------------------------------------
1061
 */
1062
static int
1063
pg_ascii_verifychar(const unsigned char *s, int len)
1064
0
{
1065
0
  return 1;
1066
0
}
1067
1068
static int
1069
pg_ascii_verifystr(const unsigned char *s, int len)
1070
239
{
1071
239
  const unsigned char *nullpos = memchr(s, 0, len);
1072
1073
239
  if (nullpos == NULL)
1074
212
    return len;
1075
27
  else
1076
27
    return nullpos - s;
1077
239
}
1078
1079
0
#define IS_EUC_RANGE_VALID(c) ((c) >= 0xa1 && (c) <= 0xfe)
1080
1081
static int
1082
pg_eucjp_verifychar(const unsigned char *s, int len)
1083
0
{
1084
0
  int     l;
1085
0
  unsigned char c1,
1086
0
        c2;
1087
1088
0
  c1 = *s++;
1089
1090
0
  switch (c1)
1091
0
  {
1092
0
    case SS2:       /* JIS X 0201 */
1093
0
      l = 2;
1094
0
      if (l > len)
1095
0
        return -1;
1096
0
      c2 = *s++;
1097
0
      if (c2 < 0xa1 || c2 > 0xdf)
1098
0
        return -1;
1099
0
      break;
1100
1101
0
    case SS3:       /* JIS X 0212 */
1102
0
      l = 3;
1103
0
      if (l > len)
1104
0
        return -1;
1105
0
      c2 = *s++;
1106
0
      if (!IS_EUC_RANGE_VALID(c2))
1107
0
        return -1;
1108
0
      c2 = *s++;
1109
0
      if (!IS_EUC_RANGE_VALID(c2))
1110
0
        return -1;
1111
0
      break;
1112
1113
0
    default:
1114
0
      if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1115
0
      {
1116
0
        l = 2;
1117
0
        if (l > len)
1118
0
          return -1;
1119
0
        if (!IS_EUC_RANGE_VALID(c1))
1120
0
          return -1;
1121
0
        c2 = *s++;
1122
0
        if (!IS_EUC_RANGE_VALID(c2))
1123
0
          return -1;
1124
0
      }
1125
0
      else
1126
        /* must be ASCII */
1127
0
      {
1128
0
        l = 1;
1129
0
      }
1130
0
      break;
1131
0
  }
1132
1133
0
  return l;
1134
0
}
1135
1136
static int
1137
pg_eucjp_verifystr(const unsigned char *s, int len)
1138
0
{
1139
0
  const unsigned char *start = s;
1140
1141
0
  while (len > 0)
1142
0
  {
1143
0
    int     l;
1144
1145
    /* fast path for ASCII-subset characters */
1146
0
    if (!IS_HIGHBIT_SET(*s))
1147
0
    {
1148
0
      if (*s == '\0')
1149
0
        break;
1150
0
      l = 1;
1151
0
    }
1152
0
    else
1153
0
    {
1154
0
      l = pg_eucjp_verifychar(s, len);
1155
0
      if (l == -1)
1156
0
        break;
1157
0
    }
1158
0
    s += l;
1159
0
    len -= l;
1160
0
  }
1161
1162
0
  return s - start;
1163
0
}
1164
1165
static int
1166
pg_euckr_verifychar(const unsigned char *s, int len)
1167
0
{
1168
0
  int     l;
1169
0
  unsigned char c1,
1170
0
        c2;
1171
1172
0
  c1 = *s++;
1173
1174
0
  if (IS_HIGHBIT_SET(c1))
1175
0
  {
1176
0
    l = 2;
1177
0
    if (l > len)
1178
0
      return -1;
1179
0
    if (!IS_EUC_RANGE_VALID(c1))
1180
0
      return -1;
1181
0
    c2 = *s++;
1182
0
    if (!IS_EUC_RANGE_VALID(c2))
1183
0
      return -1;
1184
0
  }
1185
0
  else
1186
    /* must be ASCII */
1187
0
  {
1188
0
    l = 1;
1189
0
  }
1190
1191
0
  return l;
1192
0
}
1193
1194
static int
1195
pg_euckr_verifystr(const unsigned char *s, int len)
1196
0
{
1197
0
  const unsigned char *start = s;
1198
1199
0
  while (len > 0)
1200
0
  {
1201
0
    int     l;
1202
1203
    /* fast path for ASCII-subset characters */
1204
0
    if (!IS_HIGHBIT_SET(*s))
1205
0
    {
1206
0
      if (*s == '\0')
1207
0
        break;
1208
0
      l = 1;
1209
0
    }
1210
0
    else
1211
0
    {
1212
0
      l = pg_euckr_verifychar(s, len);
1213
0
      if (l == -1)
1214
0
        break;
1215
0
    }
1216
0
    s += l;
1217
0
    len -= l;
1218
0
  }
1219
1220
0
  return s - start;
1221
0
}
1222
1223
/* EUC-CN byte sequences are exactly same as EUC-KR */
1224
#define pg_euccn_verifychar pg_euckr_verifychar
1225
#define pg_euccn_verifystr  pg_euckr_verifystr
1226
1227
static int
1228
pg_euctw_verifychar(const unsigned char *s, int len)
1229
0
{
1230
0
  int     l;
1231
0
  unsigned char c1,
1232
0
        c2;
1233
1234
0
  c1 = *s++;
1235
1236
0
  switch (c1)
1237
0
  {
1238
0
    case SS2:       /* CNS 11643 Plane 1-7 */
1239
0
      l = 4;
1240
0
      if (l > len)
1241
0
        return -1;
1242
0
      c2 = *s++;
1243
0
      if (c2 < 0xa1 || c2 > 0xa7)
1244
0
        return -1;
1245
0
      c2 = *s++;
1246
0
      if (!IS_EUC_RANGE_VALID(c2))
1247
0
        return -1;
1248
0
      c2 = *s++;
1249
0
      if (!IS_EUC_RANGE_VALID(c2))
1250
0
        return -1;
1251
0
      break;
1252
1253
0
    case SS3:       /* unused */
1254
0
      return -1;
1255
1256
0
    default:
1257
0
      if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1258
0
      {
1259
0
        l = 2;
1260
0
        if (l > len)
1261
0
          return -1;
1262
        /* no further range check on c1? */
1263
0
        c2 = *s++;
1264
0
        if (!IS_EUC_RANGE_VALID(c2))
1265
0
          return -1;
1266
0
      }
1267
0
      else
1268
        /* must be ASCII */
1269
0
      {
1270
0
        l = 1;
1271
0
      }
1272
0
      break;
1273
0
  }
1274
0
  return l;
1275
0
}
1276
1277
static int
1278
pg_euctw_verifystr(const unsigned char *s, int len)
1279
0
{
1280
0
  const unsigned char *start = s;
1281
1282
0
  while (len > 0)
1283
0
  {
1284
0
    int     l;
1285
1286
    /* fast path for ASCII-subset characters */
1287
0
    if (!IS_HIGHBIT_SET(*s))
1288
0
    {
1289
0
      if (*s == '\0')
1290
0
        break;
1291
0
      l = 1;
1292
0
    }
1293
0
    else
1294
0
    {
1295
0
      l = pg_euctw_verifychar(s, len);
1296
0
      if (l == -1)
1297
0
        break;
1298
0
    }
1299
0
    s += l;
1300
0
    len -= l;
1301
0
  }
1302
1303
0
  return s - start;
1304
0
}
1305
1306
static int
1307
pg_johab_verifychar(const unsigned char *s, int len)
1308
0
{
1309
0
  int     l,
1310
0
        mbl;
1311
0
  unsigned char c;
1312
1313
0
  l = mbl = pg_johab_mblen(s);
1314
1315
0
  if (len < l)
1316
0
    return -1;
1317
1318
0
  if (!IS_HIGHBIT_SET(*s))
1319
0
    return mbl;
1320
1321
0
  while (--l > 0)
1322
0
  {
1323
0
    c = *++s;
1324
0
    if (!IS_EUC_RANGE_VALID(c))
1325
0
      return -1;
1326
0
  }
1327
0
  return mbl;
1328
0
}
1329
1330
static int
1331
pg_johab_verifystr(const unsigned char *s, int len)
1332
0
{
1333
0
  const unsigned char *start = s;
1334
1335
0
  while (len > 0)
1336
0
  {
1337
0
    int     l;
1338
1339
    /* fast path for ASCII-subset characters */
1340
0
    if (!IS_HIGHBIT_SET(*s))
1341
0
    {
1342
0
      if (*s == '\0')
1343
0
        break;
1344
0
      l = 1;
1345
0
    }
1346
0
    else
1347
0
    {
1348
0
      l = pg_johab_verifychar(s, len);
1349
0
      if (l == -1)
1350
0
        break;
1351
0
    }
1352
0
    s += l;
1353
0
    len -= l;
1354
0
  }
1355
1356
0
  return s - start;
1357
0
}
1358
1359
static int
1360
pg_mule_verifychar(const unsigned char *s, int len)
1361
0
{
1362
0
  int     l,
1363
0
        mbl;
1364
0
  unsigned char c;
1365
1366
0
  l = mbl = pg_mule_mblen(s);
1367
1368
0
  if (len < l)
1369
0
    return -1;
1370
1371
0
  while (--l > 0)
1372
0
  {
1373
0
    c = *++s;
1374
0
    if (!IS_HIGHBIT_SET(c))
1375
0
      return -1;
1376
0
  }
1377
0
  return mbl;
1378
0
}
1379
1380
static int
1381
pg_mule_verifystr(const unsigned char *s, int len)
1382
0
{
1383
0
  const unsigned char *start = s;
1384
1385
0
  while (len > 0)
1386
0
  {
1387
0
    int     l;
1388
1389
    /* fast path for ASCII-subset characters */
1390
0
    if (!IS_HIGHBIT_SET(*s))
1391
0
    {
1392
0
      if (*s == '\0')
1393
0
        break;
1394
0
      l = 1;
1395
0
    }
1396
0
    else
1397
0
    {
1398
0
      l = pg_mule_verifychar(s, len);
1399
0
      if (l == -1)
1400
0
        break;
1401
0
    }
1402
0
    s += l;
1403
0
    len -= l;
1404
0
  }
1405
1406
0
  return s - start;
1407
0
}
1408
1409
static int
1410
pg_latin1_verifychar(const unsigned char *s, int len)
1411
0
{
1412
0
  return 1;
1413
0
}
1414
1415
static int
1416
pg_latin1_verifystr(const unsigned char *s, int len)
1417
0
{
1418
0
  const unsigned char *nullpos = memchr(s, 0, len);
1419
1420
0
  if (nullpos == NULL)
1421
0
    return len;
1422
0
  else
1423
0
    return nullpos - s;
1424
0
}
1425
1426
static int
1427
pg_sjis_verifychar(const unsigned char *s, int len)
1428
0
{
1429
0
  int     l,
1430
0
        mbl;
1431
0
  unsigned char c1,
1432
0
        c2;
1433
1434
0
  l = mbl = pg_sjis_mblen(s);
1435
1436
0
  if (len < l)
1437
0
    return -1;
1438
1439
0
  if (l == 1)         /* pg_sjis_mblen already verified it */
1440
0
    return mbl;
1441
1442
0
  c1 = *s++;
1443
0
  c2 = *s;
1444
0
  if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
1445
0
    return -1;
1446
0
  return mbl;
1447
0
}
1448
1449
static int
1450
pg_sjis_verifystr(const unsigned char *s, int len)
1451
0
{
1452
0
  const unsigned char *start = s;
1453
1454
0
  while (len > 0)
1455
0
  {
1456
0
    int     l;
1457
1458
    /* fast path for ASCII-subset characters */
1459
0
    if (!IS_HIGHBIT_SET(*s))
1460
0
    {
1461
0
      if (*s == '\0')
1462
0
        break;
1463
0
      l = 1;
1464
0
    }
1465
0
    else
1466
0
    {
1467
0
      l = pg_sjis_verifychar(s, len);
1468
0
      if (l == -1)
1469
0
        break;
1470
0
    }
1471
0
    s += l;
1472
0
    len -= l;
1473
0
  }
1474
1475
0
  return s - start;
1476
0
}
1477
1478
static int
1479
pg_big5_verifychar(const unsigned char *s, int len)
1480
0
{
1481
0
  int     l,
1482
0
        mbl;
1483
1484
0
  l = mbl = pg_big5_mblen(s);
1485
1486
0
  if (len < l)
1487
0
    return -1;
1488
1489
0
  if (l == 2 &&
1490
0
    s[0] == NONUTF8_INVALID_BYTE0 &&
1491
0
    s[1] == NONUTF8_INVALID_BYTE1)
1492
0
    return -1;
1493
1494
0
  while (--l > 0)
1495
0
  {
1496
0
    if (*++s == '\0')
1497
0
      return -1;
1498
0
  }
1499
1500
0
  return mbl;
1501
0
}
1502
1503
static int
1504
pg_big5_verifystr(const unsigned char *s, int len)
1505
0
{
1506
0
  const unsigned char *start = s;
1507
1508
0
  while (len > 0)
1509
0
  {
1510
0
    int     l;
1511
1512
    /* fast path for ASCII-subset characters */
1513
0
    if (!IS_HIGHBIT_SET(*s))
1514
0
    {
1515
0
      if (*s == '\0')
1516
0
        break;
1517
0
      l = 1;
1518
0
    }
1519
0
    else
1520
0
    {
1521
0
      l = pg_big5_verifychar(s, len);
1522
0
      if (l == -1)
1523
0
        break;
1524
0
    }
1525
0
    s += l;
1526
0
    len -= l;
1527
0
  }
1528
1529
0
  return s - start;
1530
0
}
1531
1532
static int
1533
pg_gbk_verifychar(const unsigned char *s, int len)
1534
0
{
1535
0
  int     l,
1536
0
        mbl;
1537
1538
0
  l = mbl = pg_gbk_mblen(s);
1539
1540
0
  if (len < l)
1541
0
    return -1;
1542
1543
0
  if (l == 2 &&
1544
0
    s[0] == NONUTF8_INVALID_BYTE0 &&
1545
0
    s[1] == NONUTF8_INVALID_BYTE1)
1546
0
    return -1;
1547
1548
0
  while (--l > 0)
1549
0
  {
1550
0
    if (*++s == '\0')
1551
0
      return -1;
1552
0
  }
1553
1554
0
  return mbl;
1555
0
}
1556
1557
static int
1558
pg_gbk_verifystr(const unsigned char *s, int len)
1559
0
{
1560
0
  const unsigned char *start = s;
1561
1562
0
  while (len > 0)
1563
0
  {
1564
0
    int     l;
1565
1566
    /* fast path for ASCII-subset characters */
1567
0
    if (!IS_HIGHBIT_SET(*s))
1568
0
    {
1569
0
      if (*s == '\0')
1570
0
        break;
1571
0
      l = 1;
1572
0
    }
1573
0
    else
1574
0
    {
1575
0
      l = pg_gbk_verifychar(s, len);
1576
0
      if (l == -1)
1577
0
        break;
1578
0
    }
1579
0
    s += l;
1580
0
    len -= l;
1581
0
  }
1582
1583
0
  return s - start;
1584
0
}
1585
1586
static int
1587
pg_uhc_verifychar(const unsigned char *s, int len)
1588
0
{
1589
0
  int     l,
1590
0
        mbl;
1591
1592
0
  l = mbl = pg_uhc_mblen(s);
1593
1594
0
  if (len < l)
1595
0
    return -1;
1596
1597
0
  if (l == 2 &&
1598
0
    s[0] == NONUTF8_INVALID_BYTE0 &&
1599
0
    s[1] == NONUTF8_INVALID_BYTE1)
1600
0
    return -1;
1601
1602
0
  while (--l > 0)
1603
0
  {
1604
0
    if (*++s == '\0')
1605
0
      return -1;
1606
0
  }
1607
1608
0
  return mbl;
1609
0
}
1610
1611
static int
1612
pg_uhc_verifystr(const unsigned char *s, int len)
1613
0
{
1614
0
  const unsigned char *start = s;
1615
1616
0
  while (len > 0)
1617
0
  {
1618
0
    int     l;
1619
1620
    /* fast path for ASCII-subset characters */
1621
0
    if (!IS_HIGHBIT_SET(*s))
1622
0
    {
1623
0
      if (*s == '\0')
1624
0
        break;
1625
0
      l = 1;
1626
0
    }
1627
0
    else
1628
0
    {
1629
0
      l = pg_uhc_verifychar(s, len);
1630
0
      if (l == -1)
1631
0
        break;
1632
0
    }
1633
0
    s += l;
1634
0
    len -= l;
1635
0
  }
1636
1637
0
  return s - start;
1638
0
}
1639
1640
static int
1641
pg_gb18030_verifychar(const unsigned char *s, int len)
1642
0
{
1643
0
  int     l;
1644
1645
0
  if (!IS_HIGHBIT_SET(*s))
1646
0
    l = 1;         /* ASCII */
1647
0
  else if (len >= 4 && *(s + 1) >= 0x30 && *(s + 1) <= 0x39)
1648
0
  {
1649
    /* Should be 4-byte, validate remaining bytes */
1650
0
    if (*s >= 0x81 && *s <= 0xfe &&
1651
0
      *(s + 2) >= 0x81 && *(s + 2) <= 0xfe &&
1652
0
      *(s + 3) >= 0x30 && *(s + 3) <= 0x39)
1653
0
      l = 4;
1654
0
    else
1655
0
      l = -1;
1656
0
  }
1657
0
  else if (len >= 2 && *s >= 0x81 && *s <= 0xfe)
1658
0
  {
1659
    /* Should be 2-byte, validate */
1660
0
    if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) ||
1661
0
      (*(s + 1) >= 0x80 && *(s + 1) <= 0xfe))
1662
0
      l = 2;
1663
0
    else
1664
0
      l = -1;
1665
0
  }
1666
0
  else
1667
0
    l = -1;
1668
0
  return l;
1669
0
}
1670
1671
static int
1672
pg_gb18030_verifystr(const unsigned char *s, int len)
1673
0
{
1674
0
  const unsigned char *start = s;
1675
1676
0
  while (len > 0)
1677
0
  {
1678
0
    int     l;
1679
1680
    /* fast path for ASCII-subset characters */
1681
0
    if (!IS_HIGHBIT_SET(*s))
1682
0
    {
1683
0
      if (*s == '\0')
1684
0
        break;
1685
0
      l = 1;
1686
0
    }
1687
0
    else
1688
0
    {
1689
0
      l = pg_gb18030_verifychar(s, len);
1690
0
      if (l == -1)
1691
0
        break;
1692
0
    }
1693
0
    s += l;
1694
0
    len -= l;
1695
0
  }
1696
1697
0
  return s - start;
1698
0
}
1699
1700
static int
1701
pg_utf8_verifychar(const unsigned char *s, int len)
1702
0
{
1703
0
  int     l;
1704
1705
0
  if ((*s & 0x80) == 0)
1706
0
  {
1707
0
    if (*s == '\0')
1708
0
      return -1;
1709
0
    return 1;
1710
0
  }
1711
0
  else if ((*s & 0xe0) == 0xc0)
1712
0
    l = 2;
1713
0
  else if ((*s & 0xf0) == 0xe0)
1714
0
    l = 3;
1715
0
  else if ((*s & 0xf8) == 0xf0)
1716
0
    l = 4;
1717
0
  else
1718
0
    l = 1;
1719
1720
0
  if (l > len)
1721
0
    return -1;
1722
1723
0
  if (!pg_utf8_islegal(s, l))
1724
0
    return -1;
1725
1726
0
  return l;
1727
0
}
1728
1729
/*
1730
 * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1731
 * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1732
 * input byte and current state are used to compute an index into an array of
1733
 * state transitions. Since the address of the next transition is dependent
1734
 * on this computation, there is latency in executing the load instruction,
1735
 * and the CPU is not kept busy.
1736
 *
1737
 * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1738
 *
1739
 * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1740
 *
1741
 * In a shift-based DFA, the input byte is an index into array of integers
1742
 * whose bit pattern encodes the state transitions. To compute the next
1743
 * state, we simply right-shift the integer by the current state and apply a
1744
 * mask. In this scheme, the address of the transition only depends on the
1745
 * input byte, so there is better pipelining.
1746
 *
1747
 * The naming convention for states and transitions was adopted from a UTF-8
1748
 * to UTF-16/32 transcoder, whose table is reproduced below:
1749
 *
1750
 * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1751
 *
1752
 * ILL  ASC  CR1  CR2  CR3  L2A  L3A  L3B  L3C  L4A  L4B  L4C CLASS / STATE
1753
 * ==========================================================================
1754
 * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B,      | BGN/END
1755
 * err, err, err, err, err, err, err, err, err, err, err, err,      | ERR
1756
 *                                                                  |
1757
 * err, err, END, END, END, err, err, err, err, err, err, err,      | CS1
1758
 * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err,      | CS2
1759
 * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err,      | CS3
1760
 *                                                                  |
1761
 * err, err, err, err, CS1, err, err, err, err, err, err, err,      | P3A
1762
 * err, err, CS1, CS1, err, err, err, err, err, err, err, err,      | P3B
1763
 *                                                                  |
1764
 * err, err, err, CS2, CS2, err, err, err, err, err, err, err,      | P4A
1765
 * err, err, CS2, err, err, err, err, err, err, err, err, err,      | P4B
1766
 *
1767
 * In the most straightforward implementation, a shift-based DFA for UTF-8
1768
 * requires 64-bit integers to encode the transitions, but with an SMT solver
1769
 * it's possible to find state numbers such that the transitions fit within
1770
 * 32-bit integers, as Dougall Johnson demonstrated:
1771
 *
1772
 * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1773
 *
1774
 * This packed representation is the reason for the seemingly odd choice of
1775
 * state values below.
1776
 */
1777
1778
/* Error */
1779
0
#define ERR  0
1780
/* Begin */
1781
0
#define BGN 11
1782
/* Continuation states, expect 1/2/3 continuation bytes */
1783
#define CS1 16
1784
#define CS2  1
1785
#define CS3  5
1786
/* Partial states, where the first continuation byte has a restricted range */
1787
#define P3A  6          /* Lead was E0, check for 3-byte overlong */
1788
#define P3B 20          /* Lead was ED, check for surrogate */
1789
#define P4A 25          /* Lead was F0, check for 4-byte overlong */
1790
#define P4B 30          /* Lead was F4, check for too-large */
1791
/* Begin and End are the same state */
1792
0
#define END BGN
1793
1794
/* the encoded state transitions for the lookup table */
1795
1796
/* ASCII */
1797
#define ASC (END << BGN)
1798
/* 2-byte lead */
1799
#define L2A (CS1 << BGN)
1800
/* 3-byte lead */
1801
#define L3A (P3A << BGN)
1802
#define L3B (CS2 << BGN)
1803
#define L3C (P3B << BGN)
1804
/* 4-byte lead */
1805
#define L4A (P4A << BGN)
1806
#define L4B (CS3 << BGN)
1807
#define L4C (P4B << BGN)
1808
/* continuation byte */
1809
#define CR1 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B)
1810
#define CR2 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A)
1811
#define CR3 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A)
1812
/* invalid byte */
1813
#define ILL ERR
1814
1815
static const uint32 Utf8Transition[256] =
1816
{
1817
  /* ASCII */
1818
1819
  ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1820
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1821
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1822
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1823
1824
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1825
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1826
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1827
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1828
1829
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1830
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1831
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1832
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1833
1834
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1835
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1836
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1837
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1838
1839
  /* continuation bytes */
1840
1841
  /* 80..8F */
1842
  CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1843
  CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1844
1845
  /* 90..9F */
1846
  CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1847
  CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1848
1849
  /* A0..BF */
1850
  CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1851
  CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1852
  CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1853
  CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1854
1855
  /* leading bytes */
1856
1857
  /* C0..DF */
1858
  ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
1859
  L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1860
  L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1861
  L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1862
1863
  /* E0..EF */
1864
  L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
1865
  L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
1866
1867
  /* F0..FF */
1868
  L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
1869
  ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
1870
};
1871
1872
static void
1873
utf8_advance(const unsigned char *s, uint32 *state, int len)
1874
0
{
1875
  /* Note: We deliberately don't check the state's value here. */
1876
0
  while (len > 0)
1877
0
  {
1878
    /*
1879
     * It's important that the mask value is 31: In most instruction sets,
1880
     * a shift by a 32-bit operand is understood to be a shift by its mod
1881
     * 32, so the compiler should elide the mask operation.
1882
     */
1883
0
    *state = Utf8Transition[*s++] >> (*state & 31);
1884
0
    len--;
1885
0
  }
1886
1887
0
  *state &= 31;
1888
0
}
1889
1890
static int
1891
pg_utf8_verifystr(const unsigned char *s, int len)
1892
0
{
1893
0
  const unsigned char *start = s;
1894
0
  const int orig_len = len;
1895
0
  uint32    state = BGN;
1896
1897
/*
1898
 * With a stride of two vector widths, gcc will unroll the loop. Even if
1899
 * the compiler can unroll a longer loop, it's not worth it because we
1900
 * must fall back to the byte-wise algorithm if we find any non-ASCII.
1901
 */
1902
0
#define STRIDE_LENGTH (2 * sizeof(Vector8))
1903
1904
0
  if (len >= STRIDE_LENGTH)
1905
0
  {
1906
0
    while (len >= STRIDE_LENGTH)
1907
0
    {
1908
      /*
1909
       * If the chunk is all ASCII, we can skip the full UTF-8 check,
1910
       * but we must first check for a non-END state, which means the
1911
       * previous chunk ended in the middle of a multibyte sequence.
1912
       */
1913
0
      if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
1914
0
        utf8_advance(s, &state, STRIDE_LENGTH);
1915
1916
0
      s += STRIDE_LENGTH;
1917
0
      len -= STRIDE_LENGTH;
1918
0
    }
1919
1920
    /* The error state persists, so we only need to check for it here. */
1921
0
    if (state == ERR)
1922
0
    {
1923
      /*
1924
       * Start over from the beginning with the slow path so we can
1925
       * count the valid bytes.
1926
       */
1927
0
      len = orig_len;
1928
0
      s = start;
1929
0
    }
1930
0
    else if (state != END)
1931
0
    {
1932
      /*
1933
       * The fast path exited in the middle of a multibyte sequence.
1934
       * Walk backwards to find the leading byte so that the slow path
1935
       * can resume checking from there. We must always backtrack at
1936
       * least one byte, since the current byte could be e.g. an ASCII
1937
       * byte after a 2-byte lead, which is invalid.
1938
       */
1939
0
      do
1940
0
      {
1941
0
        Assert(s > start);
1942
0
        s--;
1943
0
        len++;
1944
0
        Assert(IS_HIGHBIT_SET(*s));
1945
0
      } while (pg_utf_mblen(s) <= 1);
1946
0
    }
1947
0
  }
1948
1949
  /* check remaining bytes */
1950
0
  while (len > 0)
1951
0
  {
1952
0
    int     l;
1953
1954
    /* fast path for ASCII-subset characters */
1955
0
    if (!IS_HIGHBIT_SET(*s))
1956
0
    {
1957
0
      if (*s == '\0')
1958
0
        break;
1959
0
      l = 1;
1960
0
    }
1961
0
    else
1962
0
    {
1963
0
      l = pg_utf8_verifychar(s, len);
1964
0
      if (l == -1)
1965
0
        break;
1966
0
    }
1967
0
    s += l;
1968
0
    len -= l;
1969
0
  }
1970
1971
0
  return s - start;
1972
0
}
1973
1974
/*
1975
 * Check for validity of a single UTF-8 encoded character
1976
 *
1977
 * This directly implements the rules in RFC3629.  The bizarre-looking
1978
 * restrictions on the second byte are meant to ensure that there isn't
1979
 * more than one encoding of a given Unicode character point; that is,
1980
 * you may not use a longer-than-necessary byte sequence with high order
1981
 * zero bits to represent a character that would fit in fewer bytes.
1982
 * To do otherwise is to create security hazards (eg, create an apparent
1983
 * non-ASCII character that decodes to plain ASCII).
1984
 *
1985
 * length is assumed to have been obtained by pg_utf_mblen(), and the
1986
 * caller must have checked that that many bytes are present in the buffer.
1987
 */
1988
bool
1989
pg_utf8_islegal(const unsigned char *source, int length)
1990
0
{
1991
0
  unsigned char a;
1992
1993
0
  switch (length)
1994
0
  {
1995
0
    default:
1996
      /* reject lengths 5 and 6 for now */
1997
0
      return false;
1998
0
    case 4:
1999
0
      a = source[3];
2000
0
      if (a < 0x80 || a > 0xBF)
2001
0
        return false;
2002
      /* FALL THRU */
2003
0
    case 3:
2004
0
      a = source[2];
2005
0
      if (a < 0x80 || a > 0xBF)
2006
0
        return false;
2007
      /* FALL THRU */
2008
0
    case 2:
2009
0
      a = source[1];
2010
0
      switch (*source)
2011
0
      {
2012
0
        case 0xE0:
2013
0
          if (a < 0xA0 || a > 0xBF)
2014
0
            return false;
2015
0
          break;
2016
0
        case 0xED:
2017
0
          if (a < 0x80 || a > 0x9F)
2018
0
            return false;
2019
0
          break;
2020
0
        case 0xF0:
2021
0
          if (a < 0x90 || a > 0xBF)
2022
0
            return false;
2023
0
          break;
2024
0
        case 0xF4:
2025
0
          if (a < 0x80 || a > 0x8F)
2026
0
            return false;
2027
0
          break;
2028
0
        default:
2029
0
          if (a < 0x80 || a > 0xBF)
2030
0
            return false;
2031
0
          break;
2032
0
      }
2033
      /* FALL THRU */
2034
0
    case 1:
2035
0
      a = *source;
2036
0
      if (a >= 0x80 && a < 0xC2)
2037
0
        return false;
2038
0
      if (a > 0xF4)
2039
0
        return false;
2040
0
      break;
2041
0
  }
2042
0
  return true;
2043
0
}
2044
2045
2046
/*
2047
 * Fills the provided buffer with two bytes such that:
2048
 *   pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
2049
 */
2050
void
2051
pg_encoding_set_invalid(int encoding, char *dst)
2052
0
{
2053
0
  Assert(pg_encoding_max_length(encoding) > 1);
2054
2055
0
  dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
2056
0
  dst[1] = NONUTF8_INVALID_BYTE1;
2057
0
}
2058
2059
/*
2060
 *-------------------------------------------------------------------
2061
 * encoding info table
2062
 *-------------------------------------------------------------------
2063
 */
2064
const pg_wchar_tbl pg_wchar_table[] = {
2065
  [PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
2066
  [PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
2067
  [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},
2068
  [PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
2069
  [PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
2070
  [PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
2071
  [PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
2072
  [PG_MULE_INTERNAL] = {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},
2073
  [PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2074
  [PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2075
  [PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2076
  [PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2077
  [PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2078
  [PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2079
  [PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2080
  [PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2081
  [PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2082
  [PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2083
  [PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2084
  [PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2085
  [PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2086
  [PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2087
  [PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2088
  [PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2089
  [PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2090
  [PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2091
  [PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2092
  [PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2093
  [PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2094
  [PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2095
  [PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2096
  [PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2097
  [PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2098
  [PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2099
  [PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2100
  [PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
2101
  [PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
2102
  [PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
2103
  [PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
2104
  [PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
2105
  [PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
2106
  [PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
2107
};
2108
2109
/*
2110
 * Returns the byte length of a multibyte character.
2111
 *
2112
 * Choose "mblen" functions based on the input string characteristics.
2113
 * pg_encoding_mblen() can be used when ANY of these conditions are met:
2114
 *
2115
 * - The input string is zero-terminated
2116
 *
2117
 * - The input string is known to be valid in the encoding (e.g., string
2118
 *   converted from database encoding)
2119
 *
2120
 * - The encoding is not GB18030 (e.g., when only database encodings are
2121
 *   passed to 'encoding' parameter)
2122
 *
2123
 * encoding==GB18030 requires examining up to two bytes to determine character
2124
 * length.  Therefore, callers satisfying none of those conditions must use
2125
 * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be
2126
 * guaranteed to be within allocation bounds.
2127
 *
2128
 * When dealing with text that is not certainly valid in the specified
2129
 * encoding, the result may exceed the actual remaining string length.
2130
 * Callers that are not prepared to deal with that should use Min(remaining,
2131
 * pg_encoding_mblen_or_incomplete()).  For zero-terminated strings, that and
2132
 * pg_encoding_mblen_bounded() are interchangeable.
2133
 */
2134
int
2135
pg_encoding_mblen(int encoding, const char *mbstr)
2136
513
{
2137
513
  return (PG_VALID_ENCODING(encoding) ?
2138
513
      pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
2139
513
      pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
2140
513
}
2141
2142
/*
2143
 * Returns the byte length of a multibyte character (possibly not
2144
 * zero-terminated), or INT_MAX if too few bytes remain to determine a length.
2145
 */
2146
int
2147
pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr,
2148
                size_t remaining)
2149
513
{
2150
  /*
2151
   * Define zero remaining as too few, even for single-byte encodings.
2152
   * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read
2153
   * zero; others read one.
2154
   */
2155
513
  if (remaining < 1 ||
2156
513
    (encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2))
2157
0
    return INT_MAX;
2158
513
  return pg_encoding_mblen(encoding, mbstr);
2159
513
}
2160
2161
/*
2162
 * Returns the byte length of a multibyte character; but not more than the
2163
 * distance to the terminating zero byte.  For input that might lack a
2164
 * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()).
2165
 */
2166
int
2167
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
2168
0
{
2169
0
  return strnlen(mbstr, pg_encoding_mblen(encoding, mbstr));
2170
0
}
2171
2172
/*
2173
 * Returns the display length of a multibyte character.
2174
 */
2175
int
2176
pg_encoding_dsplen(int encoding, const char *mbstr)
2177
0
{
2178
0
  return (PG_VALID_ENCODING(encoding) ?
2179
0
      pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
2180
0
      pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
2181
0
}
2182
2183
/*
2184
 * Verify the first multibyte character of the given string.
2185
 * Return its byte length if good, -1 if bad.  (See comments above for
2186
 * full details of the mbverifychar API.)
2187
 */
2188
int
2189
pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
2190
0
{
2191
0
  return (PG_VALID_ENCODING(encoding) ?
2192
0
      pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
2193
0
      pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
2194
0
}
2195
2196
/*
2197
 * Verify that a string is valid for the given encoding.
2198
 * Returns the number of input bytes (<= len) that form a valid string.
2199
 * (See comments above for full details of the mbverifystr API.)
2200
 */
2201
int
2202
pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
2203
0
{
2204
0
  return (PG_VALID_ENCODING(encoding) ?
2205
0
      pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
2206
0
      pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
2207
0
}
2208
2209
/*
2210
 * fetch maximum length of a given encoding
2211
 */
2212
int
2213
pg_encoding_max_length(int encoding)
2214
3.29k
{
2215
3.29k
  Assert(PG_VALID_ENCODING(encoding));
2216
2217
  /*
2218
   * Check for the encoding despite the assert, due to some mingw versions
2219
   * otherwise issuing bogus warnings.
2220
   */
2221
3.29k
  return PG_VALID_ENCODING(encoding) ?
2222
3.29k
    pg_wchar_table[encoding].maxmblen :
2223
3.29k
    pg_wchar_table[PG_SQL_ASCII].maxmblen;
2224
3.29k
}