Coverage Report

Created: 2025-06-15 06:31

/src/postgres/src/backend/utils/mb/conv.c
Line
Count
Source (jump to first uncovered line)
1
/*-------------------------------------------------------------------------
2
 *
3
 *    Utility functions for conversion procs.
4
 *
5
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
6
 * Portions Copyright (c) 1994, Regents of the University of California
7
 *
8
 * IDENTIFICATION
9
 *    src/backend/utils/mb/conv.c
10
 *
11
 *-------------------------------------------------------------------------
12
 */
13
#include "postgres.h"
14
#include "mb/pg_wchar.h"
15
16
17
/*
18
 * local2local: a generic single byte charset encoding
19
 * conversion between two ASCII-superset encodings.
20
 *
21
 * l points to the source string of length len
22
 * p is the output area (must be large enough!)
23
 * src_encoding is the PG identifier for the source encoding
24
 * dest_encoding is the PG identifier for the target encoding
25
 * tab holds conversion entries for the source charset
26
 * starting from 128 (0x80). each entry in the table holds the corresponding
27
 * code point for the target charset, or 0 if there is no equivalent code.
28
 *
29
 * Returns the number of input bytes consumed.  If noError is true, this can
30
 * be less than 'len'.
31
 */
32
int
33
local2local(const unsigned char *l,
34
      unsigned char *p,
35
      int len,
36
      int src_encoding,
37
      int dest_encoding,
38
      const unsigned char *tab,
39
      bool noError)
40
0
{
41
0
  const unsigned char *start = l;
42
0
  unsigned char c1,
43
0
        c2;
44
45
0
  while (len > 0)
46
0
  {
47
0
    c1 = *l;
48
0
    if (c1 == 0)
49
0
    {
50
0
      if (noError)
51
0
        break;
52
0
      report_invalid_encoding(src_encoding, (const char *) l, len);
53
0
    }
54
0
    if (!IS_HIGHBIT_SET(c1))
55
0
      *p++ = c1;
56
0
    else
57
0
    {
58
0
      c2 = tab[c1 - HIGHBIT];
59
0
      if (c2)
60
0
        *p++ = c2;
61
0
      else
62
0
      {
63
0
        if (noError)
64
0
          break;
65
0
        report_untranslatable_char(src_encoding, dest_encoding,
66
0
                       (const char *) l, len);
67
0
      }
68
0
    }
69
0
    l++;
70
0
    len--;
71
0
  }
72
0
  *p = '\0';
73
74
0
  return l - start;
75
0
}
76
77
/*
 * LATINn ---> MIC when the charset's local codes map directly to MIC
 *
 * l         source string, 'len' bytes long
 * p         output area (the caller must make it large enough)
 * lc        mule character set id for the local encoding; emitted as a
 *           prefix byte before every high-bit source byte
 * encoding  PG identifier for the local encoding (used in error reports)
 * noError   if true, stop at the first bad byte instead of reporting it
 *
 * The output is always terminated with a zero byte.
 *
 * Returns the number of input bytes consumed.  If noError is true, this can
 * be less than 'len'.
 */
int
latin2mic(const unsigned char *l, unsigned char *p, int len,
          int lc, int encoding, bool noError)
{
    const unsigned char *const begin = l;

    for (; len > 0; l++, len--)
    {
        int         ch = *l;

        if (ch == 0)
        {
            /* an embedded zero byte is treated as invalid input */
            if (noError)
                break;
            report_invalid_encoding(encoding, (const char *) l, len);
        }

        /* non-ASCII bytes get the charset id in front of them */
        if (IS_HIGHBIT_SET(ch))
            *p++ = lc;
        *p++ = ch;
    }
    *p = '\0';

    return l - begin;
}
114
115
/*
116
 * MIC ---> LATINn when the charset's local codes map directly to MIC
117
 *
118
 * mic points to the source string of length len
119
 * p is the output area (must be large enough!)
120
 * lc is the mule character set id for the local encoding
121
 * encoding is the PG identifier for the local encoding
122
 *
123
 * Returns the number of input bytes consumed.  If noError is true, this can
124
 * be less than 'len'.
125
 */
126
int
127
mic2latin(const unsigned char *mic, unsigned char *p, int len,
128
      int lc, int encoding, bool noError)
129
0
{
130
0
  const unsigned char *start = mic;
131
0
  int     c1;
132
133
0
  while (len > 0)
134
0
  {
135
0
    c1 = *mic;
136
0
    if (c1 == 0)
137
0
    {
138
0
      if (noError)
139
0
        break;
140
0
      report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
141
0
    }
142
0
    if (!IS_HIGHBIT_SET(c1))
143
0
    {
144
      /* easy for ASCII */
145
0
      *p++ = c1;
146
0
      mic++;
147
0
      len--;
148
0
    }
149
0
    else
150
0
    {
151
0
      int     l = pg_mule_mblen(mic);
152
153
0
      if (len < l)
154
0
      {
155
0
        if (noError)
156
0
          break;
157
0
        report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
158
0
                    len);
159
0
      }
160
0
      if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
161
0
      {
162
0
        if (noError)
163
0
          break;
164
0
        report_untranslatable_char(PG_MULE_INTERNAL, encoding,
165
0
                       (const char *) mic, len);
166
0
      }
167
0
      *p++ = mic[1];
168
0
      mic += 2;
169
0
      len -= 2;
170
0
    }
171
0
  }
172
0
  *p = '\0';
173
174
0
  return mic - start;
175
0
}
176
177
178
/*
179
 * latin2mic_with_table: a generic single byte charset encoding
180
 * conversion from a local charset to the mule internal code.
181
 *
182
 * l points to the source string of length len
183
 * p is the output area (must be large enough!)
184
 * lc is the mule character set id for the local encoding
185
 * encoding is the PG identifier for the local encoding
186
 * tab holds conversion entries for the local charset
187
 * starting from 128 (0x80). each entry in the table holds the corresponding
188
 * code point for the mule encoding, or 0 if there is no equivalent code.
189
 *
190
 * Returns the number of input bytes consumed.  If noError is true, this can
191
 * be less than 'len'.
192
 */
193
int
194
latin2mic_with_table(const unsigned char *l,
195
           unsigned char *p,
196
           int len,
197
           int lc,
198
           int encoding,
199
           const unsigned char *tab,
200
           bool noError)
201
0
{
202
0
  const unsigned char *start = l;
203
0
  unsigned char c1,
204
0
        c2;
205
206
0
  while (len > 0)
207
0
  {
208
0
    c1 = *l;
209
0
    if (c1 == 0)
210
0
    {
211
0
      if (noError)
212
0
        break;
213
0
      report_invalid_encoding(encoding, (const char *) l, len);
214
0
    }
215
0
    if (!IS_HIGHBIT_SET(c1))
216
0
      *p++ = c1;
217
0
    else
218
0
    {
219
0
      c2 = tab[c1 - HIGHBIT];
220
0
      if (c2)
221
0
      {
222
0
        *p++ = lc;
223
0
        *p++ = c2;
224
0
      }
225
0
      else
226
0
      {
227
0
        if (noError)
228
0
          break;
229
0
        report_untranslatable_char(encoding, PG_MULE_INTERNAL,
230
0
                       (const char *) l, len);
231
0
      }
232
0
    }
233
0
    l++;
234
0
    len--;
235
0
  }
236
0
  *p = '\0';
237
238
0
  return l - start;
239
0
}
240
241
/*
242
 * mic2latin_with_table: a generic single byte charset encoding
243
 * conversion from the mule internal code to a local charset.
244
 *
245
 * mic points to the source string of length len
246
 * p is the output area (must be large enough!)
247
 * lc is the mule character set id for the local encoding
248
 * encoding is the PG identifier for the local encoding
249
 * tab holds conversion entries for the mule internal code's second byte,
250
 * starting from 128 (0x80). each entry in the table holds the corresponding
251
 * code point for the local charset, or 0 if there is no equivalent code.
252
 *
253
 * Returns the number of input bytes consumed.  If noError is true, this can
254
 * be less than 'len'.
255
 */
256
int
257
mic2latin_with_table(const unsigned char *mic,
258
           unsigned char *p,
259
           int len,
260
           int lc,
261
           int encoding,
262
           const unsigned char *tab,
263
           bool noError)
264
0
{
265
0
  const unsigned char *start = mic;
266
0
  unsigned char c1,
267
0
        c2;
268
269
0
  while (len > 0)
270
0
  {
271
0
    c1 = *mic;
272
0
    if (c1 == 0)
273
0
    {
274
0
      if (noError)
275
0
        break;
276
0
      report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
277
0
    }
278
0
    if (!IS_HIGHBIT_SET(c1))
279
0
    {
280
      /* easy for ASCII */
281
0
      *p++ = c1;
282
0
      mic++;
283
0
      len--;
284
0
    }
285
0
    else
286
0
    {
287
0
      int     l = pg_mule_mblen(mic);
288
289
0
      if (len < l)
290
0
      {
291
0
        if (noError)
292
0
          break;
293
0
        report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
294
0
                    len);
295
0
      }
296
0
      if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
297
0
        (c2 = tab[mic[1] - HIGHBIT]) == 0)
298
0
      {
299
0
        if (noError)
300
0
          break;
301
0
        report_untranslatable_char(PG_MULE_INTERNAL, encoding,
302
0
                       (const char *) mic, len);
303
0
        break;     /* keep compiler quiet */
304
0
      }
305
0
      *p++ = c2;
306
0
      mic += 2;
307
0
      len -= 2;
308
0
    }
309
0
  }
310
0
  *p = '\0';
311
312
0
  return mic - start;
313
0
}
314
315
/*
316
 * comparison routine for bsearch()
317
 * this routine is intended for combined UTF8 -> local code
318
 */
319
static int
320
compare3(const void *p1, const void *p2)
321
0
{
322
0
  uint32    s1,
323
0
        s2,
324
0
        d1,
325
0
        d2;
326
327
0
  s1 = *(const uint32 *) p1;
328
0
  s2 = *((const uint32 *) p1 + 1);
329
0
  d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
330
0
  d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
331
0
  return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
332
0
}
333
334
/*
335
 * comparison routine for bsearch()
336
 * this routine is intended for local code -> combined UTF8
337
 */
338
static int
339
compare4(const void *p1, const void *p2)
340
0
{
341
0
  uint32    v1,
342
0
        v2;
343
344
0
  v1 = *(const uint32 *) p1;
345
0
  v2 = ((const pg_local_to_utf_combined *) p2)->code;
346
0
  return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
347
0
}
348
349
/*
350
 * store 32bit character representation into multibyte stream
351
 */
352
static inline unsigned char *
353
store_coded_char(unsigned char *dest, uint32 code)
354
0
{
355
0
  if (code & 0xff000000)
356
0
    *dest++ = code >> 24;
357
0
  if (code & 0x00ff0000)
358
0
    *dest++ = code >> 16;
359
0
  if (code & 0x0000ff00)
360
0
    *dest++ = code >> 8;
361
0
  if (code & 0x000000ff)
362
0
    *dest++ = code;
363
0
  return dest;
364
0
}
365
366
/*
367
 * Convert a character using a conversion radix tree.
368
 *
369
 * 'l' is the length of the input character in bytes, and b1-b4 are
370
 * the input character's bytes.
371
 */
372
static inline uint32
373
pg_mb_radix_conv(const pg_mb_radix_tree *rt,
374
         int l,
375
         unsigned char b1,
376
         unsigned char b2,
377
         unsigned char b3,
378
         unsigned char b4)
379
0
{
380
0
  if (l == 4)
381
0
  {
382
    /* 4-byte code */
383
384
    /* check code validity */
385
0
    if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
386
0
      b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
387
0
      b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
388
0
      b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
389
0
      return 0;
390
391
    /* perform lookup */
392
0
    if (rt->chars32)
393
0
    {
394
0
      uint32    idx = rt->b4root;
395
396
0
      idx = rt->chars32[b1 + idx - rt->b4_1_lower];
397
0
      idx = rt->chars32[b2 + idx - rt->b4_2_lower];
398
0
      idx = rt->chars32[b3 + idx - rt->b4_3_lower];
399
0
      return rt->chars32[b4 + idx - rt->b4_4_lower];
400
0
    }
401
0
    else
402
0
    {
403
0
      uint16    idx = rt->b4root;
404
405
0
      idx = rt->chars16[b1 + idx - rt->b4_1_lower];
406
0
      idx = rt->chars16[b2 + idx - rt->b4_2_lower];
407
0
      idx = rt->chars16[b3 + idx - rt->b4_3_lower];
408
0
      return rt->chars16[b4 + idx - rt->b4_4_lower];
409
0
    }
410
0
  }
411
0
  else if (l == 3)
412
0
  {
413
    /* 3-byte code */
414
415
    /* check code validity */
416
0
    if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
417
0
      b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
418
0
      b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
419
0
      return 0;
420
421
    /* perform lookup */
422
0
    if (rt->chars32)
423
0
    {
424
0
      uint32    idx = rt->b3root;
425
426
0
      idx = rt->chars32[b2 + idx - rt->b3_1_lower];
427
0
      idx = rt->chars32[b3 + idx - rt->b3_2_lower];
428
0
      return rt->chars32[b4 + idx - rt->b3_3_lower];
429
0
    }
430
0
    else
431
0
    {
432
0
      uint16    idx = rt->b3root;
433
434
0
      idx = rt->chars16[b2 + idx - rt->b3_1_lower];
435
0
      idx = rt->chars16[b3 + idx - rt->b3_2_lower];
436
0
      return rt->chars16[b4 + idx - rt->b3_3_lower];
437
0
    }
438
0
  }
439
0
  else if (l == 2)
440
0
  {
441
    /* 2-byte code */
442
443
    /* check code validity - first byte */
444
0
    if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
445
0
      b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
446
0
      return 0;
447
448
    /* perform lookup */
449
0
    if (rt->chars32)
450
0
    {
451
0
      uint32    idx = rt->b2root;
452
453
0
      idx = rt->chars32[b3 + idx - rt->b2_1_lower];
454
0
      return rt->chars32[b4 + idx - rt->b2_2_lower];
455
0
    }
456
0
    else
457
0
    {
458
0
      uint16    idx = rt->b2root;
459
460
0
      idx = rt->chars16[b3 + idx - rt->b2_1_lower];
461
0
      return rt->chars16[b4 + idx - rt->b2_2_lower];
462
0
    }
463
0
  }
464
0
  else if (l == 1)
465
0
  {
466
    /* 1-byte code */
467
468
    /* check code validity - first byte */
469
0
    if (b4 < rt->b1_lower || b4 > rt->b1_upper)
470
0
      return 0;
471
472
    /* perform lookup */
473
0
    if (rt->chars32)
474
0
      return rt->chars32[b4 + rt->b1root - rt->b1_lower];
475
0
    else
476
0
      return rt->chars16[b4 + rt->b1root - rt->b1_lower];
477
0
  }
478
0
  return 0;         /* shouldn't happen */
479
0
}
480
481
/*
482
 * UTF8 ---> local code
483
 *
484
 * utf: input string in UTF8 encoding (need not be null-terminated)
485
 * len: length of input string (in bytes)
486
 * iso: pointer to the output area (must be large enough!)
487
      (output string will be null-terminated)
488
 * map: conversion map for single characters
489
 * cmap: conversion map for combined characters
490
 *      (optional, pass NULL if none)
491
 * cmapsize: number of entries in the conversion map for combined characters
492
 *      (optional, pass 0 if none)
493
 * conv_func: algorithmic encoding conversion function
494
 *      (optional, pass NULL if none)
495
 * encoding: PG identifier for the local encoding
496
 *
497
 * For each character, the cmap (if provided) is consulted first; if no match,
498
 * the map is consulted next; if still no match, the conv_func (if provided)
499
 * is applied.  An error is raised if no match is found.
500
 *
501
 * See pg_wchar.h for more details about the data structures used here.
502
 *
503
 * Returns the number of input bytes consumed.  If noError is true, this can
504
 * be less than 'len'.
505
 */
506
int
507
UtfToLocal(const unsigned char *utf, int len,
508
       unsigned char *iso,
509
       const pg_mb_radix_tree *map,
510
       const pg_utf_to_local_combined *cmap, int cmapsize,
511
       utf_local_conversion_func conv_func,
512
       int encoding, bool noError)
513
0
{
514
0
  uint32    iutf;
515
0
  int     l;
516
0
  const pg_utf_to_local_combined *cp;
517
0
  const unsigned char *start = utf;
518
519
0
  if (!PG_VALID_ENCODING(encoding))
520
0
    ereport(ERROR,
521
0
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
522
0
         errmsg("invalid encoding number: %d", encoding)));
523
524
0
  for (; len > 0; len -= l)
525
0
  {
526
0
    unsigned char b1 = 0;
527
0
    unsigned char b2 = 0;
528
0
    unsigned char b3 = 0;
529
0
    unsigned char b4 = 0;
530
531
    /* "break" cases all represent errors */
532
0
    if (*utf == '\0')
533
0
      break;
534
535
0
    l = pg_utf_mblen(utf);
536
0
    if (len < l)
537
0
      break;
538
539
0
    if (!pg_utf8_islegal(utf, l))
540
0
      break;
541
542
0
    if (l == 1)
543
0
    {
544
      /* ASCII case is easy, assume it's one-to-one conversion */
545
0
      *iso++ = *utf++;
546
0
      continue;
547
0
    }
548
549
    /* collect coded char of length l */
550
0
    if (l == 2)
551
0
    {
552
0
      b3 = *utf++;
553
0
      b4 = *utf++;
554
0
    }
555
0
    else if (l == 3)
556
0
    {
557
0
      b2 = *utf++;
558
0
      b3 = *utf++;
559
0
      b4 = *utf++;
560
0
    }
561
0
    else if (l == 4)
562
0
    {
563
0
      b1 = *utf++;
564
0
      b2 = *utf++;
565
0
      b3 = *utf++;
566
0
      b4 = *utf++;
567
0
    }
568
0
    else
569
0
    {
570
0
      elog(ERROR, "unsupported character length %d", l);
571
0
      iutf = 0;     /* keep compiler quiet */
572
0
    }
573
0
    iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
574
575
    /* First, try with combined map if possible */
576
0
    if (cmap && len > l)
577
0
    {
578
0
      const unsigned char *utf_save = utf;
579
0
      int     len_save = len;
580
0
      int     l_save = l;
581
582
      /* collect next character, same as above */
583
0
      len -= l;
584
585
0
      l = pg_utf_mblen(utf);
586
0
      if (len < l)
587
0
      {
588
        /* need more data to decide if this is a combined char */
589
0
        utf -= l_save;
590
0
        break;
591
0
      }
592
593
0
      if (!pg_utf8_islegal(utf, l))
594
0
      {
595
0
        if (!noError)
596
0
          report_invalid_encoding(PG_UTF8, (const char *) utf, len);
597
0
        utf -= l_save;
598
0
        break;
599
0
      }
600
601
      /* We assume ASCII character cannot be in combined map */
602
0
      if (l > 1)
603
0
      {
604
0
        uint32    iutf2;
605
0
        uint32    cutf[2];
606
607
0
        if (l == 2)
608
0
        {
609
0
          iutf2 = *utf++ << 8;
610
0
          iutf2 |= *utf++;
611
0
        }
612
0
        else if (l == 3)
613
0
        {
614
0
          iutf2 = *utf++ << 16;
615
0
          iutf2 |= *utf++ << 8;
616
0
          iutf2 |= *utf++;
617
0
        }
618
0
        else if (l == 4)
619
0
        {
620
0
          iutf2 = *utf++ << 24;
621
0
          iutf2 |= *utf++ << 16;
622
0
          iutf2 |= *utf++ << 8;
623
0
          iutf2 |= *utf++;
624
0
        }
625
0
        else
626
0
        {
627
0
          elog(ERROR, "unsupported character length %d", l);
628
0
          iutf2 = 0;  /* keep compiler quiet */
629
0
        }
630
631
0
        cutf[0] = iutf;
632
0
        cutf[1] = iutf2;
633
634
0
        cp = bsearch(cutf, cmap, cmapsize,
635
0
               sizeof(pg_utf_to_local_combined), compare3);
636
637
0
        if (cp)
638
0
        {
639
0
          iso = store_coded_char(iso, cp->code);
640
0
          continue;
641
0
        }
642
0
      }
643
644
      /* fail, so back up to reprocess second character next time */
645
0
      utf = utf_save;
646
0
      len = len_save;
647
0
      l = l_save;
648
0
    }
649
650
    /* Now check ordinary map */
651
0
    if (map)
652
0
    {
653
0
      uint32    converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
654
655
0
      if (converted)
656
0
      {
657
0
        iso = store_coded_char(iso, converted);
658
0
        continue;
659
0
      }
660
0
    }
661
662
    /* if there's a conversion function, try that */
663
0
    if (conv_func)
664
0
    {
665
0
      uint32    converted = (*conv_func) (iutf);
666
667
0
      if (converted)
668
0
      {
669
0
        iso = store_coded_char(iso, converted);
670
0
        continue;
671
0
      }
672
0
    }
673
674
    /* failed to translate this character */
675
0
    utf -= l;
676
0
    if (noError)
677
0
      break;
678
0
    report_untranslatable_char(PG_UTF8, encoding,
679
0
                   (const char *) utf, len);
680
0
  }
681
682
  /* if we broke out of loop early, must be invalid input */
683
0
  if (len > 0 && !noError)
684
0
    report_invalid_encoding(PG_UTF8, (const char *) utf, len);
685
686
0
  *iso = '\0';
687
688
0
  return utf - start;
689
0
}
690
691
/*
692
 * local code ---> UTF8
693
 *
694
 * iso: input string in local encoding (need not be null-terminated)
695
 * len: length of input string (in bytes)
696
 * utf: pointer to the output area (must be large enough!)
697
      (output string will be null-terminated)
698
 * map: conversion map for single characters
699
 * cmap: conversion map for combined characters
700
 *      (optional, pass NULL if none)
701
 * cmapsize: number of entries in the conversion map for combined characters
702
 *      (optional, pass 0 if none)
703
 * conv_func: algorithmic encoding conversion function
704
 *      (optional, pass NULL if none)
705
 * encoding: PG identifier for the local encoding
706
 *
707
 * For each character, the map is consulted first; if no match, the cmap
708
 * (if provided) is consulted next; if still no match, the conv_func
709
 * (if provided) is applied.  An error is raised if no match is found.
710
 *
711
 * See pg_wchar.h for more details about the data structures used here.
712
 *
713
 * Returns the number of input bytes consumed.  If noError is true, this can
714
 * be less than 'len'.
715
 */
716
int
717
LocalToUtf(const unsigned char *iso, int len,
718
       unsigned char *utf,
719
       const pg_mb_radix_tree *map,
720
       const pg_local_to_utf_combined *cmap, int cmapsize,
721
       utf_local_conversion_func conv_func,
722
       int encoding,
723
       bool noError)
724
0
{
725
0
  uint32    iiso;
726
0
  int     l;
727
0
  const pg_local_to_utf_combined *cp;
728
0
  const unsigned char *start = iso;
729
730
0
  if (!PG_VALID_ENCODING(encoding))
731
0
    ereport(ERROR,
732
0
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
733
0
         errmsg("invalid encoding number: %d", encoding)));
734
735
0
  for (; len > 0; len -= l)
736
0
  {
737
0
    unsigned char b1 = 0;
738
0
    unsigned char b2 = 0;
739
0
    unsigned char b3 = 0;
740
0
    unsigned char b4 = 0;
741
742
    /* "break" cases all represent errors */
743
0
    if (*iso == '\0')
744
0
      break;
745
746
0
    if (!IS_HIGHBIT_SET(*iso))
747
0
    {
748
      /* ASCII case is easy, assume it's one-to-one conversion */
749
0
      *utf++ = *iso++;
750
0
      l = 1;
751
0
      continue;
752
0
    }
753
754
0
    l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
755
0
    if (l < 0)
756
0
      break;
757
758
    /* collect coded char of length l */
759
0
    if (l == 1)
760
0
      b4 = *iso++;
761
0
    else if (l == 2)
762
0
    {
763
0
      b3 = *iso++;
764
0
      b4 = *iso++;
765
0
    }
766
0
    else if (l == 3)
767
0
    {
768
0
      b2 = *iso++;
769
0
      b3 = *iso++;
770
0
      b4 = *iso++;
771
0
    }
772
0
    else if (l == 4)
773
0
    {
774
0
      b1 = *iso++;
775
0
      b2 = *iso++;
776
0
      b3 = *iso++;
777
0
      b4 = *iso++;
778
0
    }
779
0
    else
780
0
    {
781
0
      elog(ERROR, "unsupported character length %d", l);
782
0
      iiso = 0;     /* keep compiler quiet */
783
0
    }
784
0
    iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
785
786
0
    if (map)
787
0
    {
788
0
      uint32    converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
789
790
0
      if (converted)
791
0
      {
792
0
        utf = store_coded_char(utf, converted);
793
0
        continue;
794
0
      }
795
796
      /* If there's a combined character map, try that */
797
0
      if (cmap)
798
0
      {
799
0
        cp = bsearch(&iiso, cmap, cmapsize,
800
0
               sizeof(pg_local_to_utf_combined), compare4);
801
802
0
        if (cp)
803
0
        {
804
0
          utf = store_coded_char(utf, cp->utf1);
805
0
          utf = store_coded_char(utf, cp->utf2);
806
0
          continue;
807
0
        }
808
0
      }
809
0
    }
810
811
    /* if there's a conversion function, try that */
812
0
    if (conv_func)
813
0
    {
814
0
      uint32    converted = (*conv_func) (iiso);
815
816
0
      if (converted)
817
0
      {
818
0
        utf = store_coded_char(utf, converted);
819
0
        continue;
820
0
      }
821
0
    }
822
823
    /* failed to translate this character */
824
0
    iso -= l;
825
0
    if (noError)
826
0
      break;
827
0
    report_untranslatable_char(encoding, PG_UTF8,
828
0
                   (const char *) iso, len);
829
0
  }
830
831
  /* if we broke out of loop early, must be invalid input */
832
0
  if (len > 0 && !noError)
833
0
    report_invalid_encoding(encoding, (const char *) iso, len);
834
835
0
  *utf = '\0';
836
837
0
  return iso - start;
838
0
}