/src/postgres/src/backend/utils/mb/conv.c

Source (jump to first uncovered line)
/*-------------------------------------------------------------------------
 *
 *    Utility functions for conversion procs.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/utils/mb/conv.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"
#include "mb/pg_wchar.h"


/*
 * local2local: table-driven conversion between two single-byte encodings
 * that both contain ASCII as a subset.
 *
 * l             source string, 'len' bytes long
 * p             output area; receives at most one byte per input byte plus
 *               a terminating NUL, so it must be large enough for that
 * src_encoding  PG identifier of the source encoding (used in error reports)
 * dest_encoding PG identifier of the target encoding (used in error reports)
 * tab           translation table indexed by (source byte - HIGHBIT); an
 *               entry of 0 means the byte has no equivalent in the target
 * noError       when true, stop at the first offending byte instead of
 *               calling the error-reporting routines
 *
 * ASCII bytes are copied through unchanged.  A zero byte in the input is
 * treated as invalid.
 *
 * Returns the number of input bytes consumed.  If noError is true, this can
 * be less than 'len'.
 */
int
local2local(const unsigned char *l,
      unsigned char *p,
      int len,
      int src_encoding,
      int dest_encoding,
      const unsigned char *tab,
      bool noError)
{
  const unsigned char *src;

  for (src = l; len > 0; src++, len--)
  {
    unsigned char in = *src;
    unsigned char out;

    if (in == 0)
    {
      /* an embedded NUL is never acceptable input */
      if (noError)
        break;
      report_invalid_encoding(src_encoding, (const char *) src, len);
    }

    if (!IS_HIGHBIT_SET(in))
    {
      /* plain ASCII maps to itself */
      *p++ = in;
    }
    else if ((out = tab[in - HIGHBIT]) != 0)
    {
      *p++ = out;
    }
    else
    {
      /* table says there is no counterpart for this byte */
      if (noError)
        break;
      report_untranslatable_char(src_encoding, dest_encoding,
                     (const char *) src, len);
    }
  }
  *p = '\0';

  return src - l;
}

/*
 * LATINn ---> MIC for charsets whose local codes are used as-is in MIC
 *
 * l        source string, 'len' bytes long
 * p        output area (must be large enough: a high-bit input byte
 *          produces two output bytes, and a NUL terminator is appended)
 * lc       mule character set id, emitted in front of every high-bit byte
 * encoding PG identifier of the local encoding (used in error reports)
 * noError  when true, stop at a zero byte instead of reporting it
 *
 * Returns the number of input bytes consumed.  If noError is true, this can
 * be less than 'len'.
 */
int
latin2mic(const unsigned char *l, unsigned char *p, int len,
      int lc, int encoding, bool noError)
{
  const unsigned char *src;

  for (src = l; len > 0; src++, len--)
  {
    int     ch = *src;

    if (ch == 0)
    {
      /* zero bytes are rejected as invalid input */
      if (noError)
        break;
      report_invalid_encoding(encoding, (const char *) src, len);
    }

    /* non-ASCII bytes get the charset id as a leading byte */
    if (IS_HIGHBIT_SET(ch))
      *p++ = lc;
    *p++ = ch;
  }
  *p = '\0';

  return src - l;
}

/*
 * MIC ---> LATINn for charsets whose local codes are used as-is in MIC
 *
 * mic      source string, 'len' bytes long
 * p        output area (must be large enough; a NUL terminator is appended)
 * lc       mule character set id expected as the leading byte of every
 *          non-ASCII character
 * encoding PG identifier of the local encoding (used in error reports)
 * noError  when true, stop at the first offending character instead of
 *          calling the error-reporting routines
 *
 * ASCII bytes are copied unchanged.  A non-ASCII character is accepted only
 * if it is exactly two bytes long, starts with 'lc', and has the high bit
 * set in its second byte; that second byte is the output.
 *
 * Returns the number of input bytes consumed.  If noError is true, this can
 * be less than 'len'.
 */
int
mic2latin(const unsigned char *mic, unsigned char *p, int len,
      int lc, int encoding, bool noError)
{
  const unsigned char *src = mic;

  while (len > 0)
  {
    int     lead = *src;
    int     mblen;

    if (lead == 0)
    {
      if (noError)
        break;
      report_invalid_encoding(PG_MULE_INTERNAL, (const char *) src, len);
    }

    if (!IS_HIGHBIT_SET(lead))
    {
      /* ASCII: copy and move on */
      *p++ = lead;
      src++;
      len--;
      continue;
    }

    /* multibyte character: make sure all of it is present */
    mblen = pg_mule_mblen(src);
    if (mblen > len)
    {
      if (noError)
        break;
      report_invalid_encoding(PG_MULE_INTERNAL, (const char *) src,
                  len);
    }

    /* src[1] is only inspected once the length is known to be 2 */
    if (!(mblen == 2 && lead == lc && IS_HIGHBIT_SET(src[1])))
    {
      if (noError)
        break;
      report_untranslatable_char(PG_MULE_INTERNAL, encoding,
                     (const char *) src, len);
    }

    *p++ = src[1];
    src += 2;
    len -= 2;
  }
  *p = '\0';

  return src - mic;
}


/*
 * latin2mic_with_table: table-driven conversion from a single-byte local
 * charset to the mule internal code.
 *
 * l        source string, 'len' bytes long
 * p        output area (must be large enough: a translated high-bit byte
 *          produces two output bytes, and a NUL terminator is appended)
 * lc       mule character set id, emitted in front of every translated byte
 * encoding PG identifier of the local encoding (used in error reports)
 * tab      translation table indexed by (source byte - HIGHBIT); an entry
 *          of 0 means the byte has no mule equivalent
 * noError  when true, stop at the first offending byte instead of calling
 *          the error-reporting routines
 *
 * Returns the number of input bytes consumed.  If noError is true, this can
 * be less than 'len'.
 */
int
latin2mic_with_table(const unsigned char *l,
           unsigned char *p,
           int len,
           int lc,
           int encoding,
           const unsigned char *tab,
           bool noError)
{
  const unsigned char *src;

  for (src = l; len > 0; src++, len--)
  {
    unsigned char in = *src;
    unsigned char out;

    if (in == 0)
    {
      /* zero bytes are rejected as invalid input */
      if (noError)
        break;
      report_invalid_encoding(encoding, (const char *) src, len);
    }

    if (!IS_HIGHBIT_SET(in))
    {
      /* ASCII passes straight through */
      *p++ = in;
      continue;
    }

    out = tab[in - HIGHBIT];
    if (out == 0)
    {
      /* no mule code point for this byte */
      if (noError)
        break;
      report_untranslatable_char(encoding, PG_MULE_INTERNAL,
                     (const char *) src, len);
    }
    else
    {
      *p++ = lc;
      *p++ = out;
    }
  }
  *p = '\0';

  return src - l;
}

/*
 * mic2latin_with_table: table-driven conversion from the mule internal code
 * to a single-byte local charset.
 *
 * mic      source string, 'len' bytes long
 * p        output area (must be large enough; a NUL terminator is appended)
 * lc       mule character set id expected as the leading byte of every
 *          non-ASCII character
 * encoding PG identifier of the local encoding (used in error reports)
 * tab      translation table indexed by (second byte - HIGHBIT); an entry
 *          of 0 means there is no equivalent in the local charset
 * noError  when true, stop at the first offending character instead of
 *          calling the error-reporting routines
 *
 * ASCII bytes are copied unchanged.  A non-ASCII character is translatable
 * only if it is exactly two bytes long, starts with 'lc', has the high bit
 * set in its second byte, and has a nonzero table entry.
 *
 * Returns the number of input bytes consumed.  If noError is true, this can
 * be less than 'len'.
 */
int
mic2latin_with_table(const unsigned char *mic,
           unsigned char *p,
           int len,
           int lc,
           int encoding,
           const unsigned char *tab,
           bool noError)
{
  const unsigned char *src = mic;

  while (len > 0)
  {
    unsigned char lead = *src;
    unsigned char out;
    int     mblen;

    if (lead == 0)
    {
      if (noError)
        break;
      report_invalid_encoding(PG_MULE_INTERNAL, (const char *) src, len);
    }

    if (!IS_HIGHBIT_SET(lead))
    {
      /* ASCII: copy and move on */
      *p++ = lead;
      src++;
      len--;
      continue;
    }

    /* multibyte character: make sure all of it is present */
    mblen = pg_mule_mblen(src);
    if (mblen > len)
    {
      if (noError)
        break;
      report_invalid_encoding(PG_MULE_INTERNAL, (const char *) src,
                  len);
    }

    /*
     * Look the second byte up only when the character has the expected
     * shape; 'out' stays 0 for anything we cannot translate.
     */
    out = 0;
    if (mblen == 2 && lead == lc && IS_HIGHBIT_SET(src[1]))
      out = tab[src[1] - HIGHBIT];

    if (out == 0)
    {
      if (noError)
        break;
      report_untranslatable_char(PG_MULE_INTERNAL, encoding,
                     (const char *) src, len);
      break;     /* stop here should the report call ever return */
    }

    *p++ = out;
    src += 2;
    len -= 2;
  }
  *p = '\0';

  return src - mic;
}

/*
 * comparison routine for bsearch()
 * this routine is intended for combined UTF8 -> local code
 */
static int
compare3(const void *p1, const void *p2)
{
  uint32    s1,
        s2,
        d1,
        d2;

  s1 = *(const uint32 *) p1;
  s2 = *((const uint32 *) p1 + 1);
  d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
  d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
  return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
}

/*
 * bsearch() comparator for the local code -> combined UTF8 map.
 *
 * p1 points at the search key, a single uint32 local code; p2 points at a
 * pg_local_to_utf_combined entry, which is ordered by its 'code' field.
 *
 * Returns 1, 0 or -1 when the key sorts after, equal to, or before the entry.
 */
static int
compare4(const void *p1, const void *p2)
{
  uint32    key = *(const uint32 *) p1;
  uint32    entry_code = ((const pg_local_to_utf_combined *) p2)->code;

  if (key == entry_code)
    return 0;
  return (key > entry_code) ? 1 : -1;
}

/*
 * Append the 32-bit character representation 'code' to the multibyte stream
 * at 'dest', most significant byte first.
 *
 * Each of the four bytes is written only if it is nonzero, so zero bytes are
 * skipped wherever they occur in 'code'.
 *
 * Returns the position just past the last byte written.
 */
static inline unsigned char *
store_coded_char(unsigned char *dest, uint32 code)
{
  int     shift;

  for (shift = 24; shift >= 0; shift -= 8)
  {
    unsigned char octet = (unsigned char) (code >> shift);

    if (octet != 0)
      *dest++ = octet;
  }
  return dest;
}

/*
 * Convert a character using a conversion radix tree.
 *
 * 'l' is the length of the input character in bytes, and b1-b4 are
 * the input character's bytes.
 */
static inline uint32
pg_mb_radix_conv(const pg_mb_radix_tree *rt,
         int l,
         unsigned char b1,
         unsigned char b2,
         unsigned char b3,
         unsigned char b4)
{
  if (l == 4)
  {
    /* 4-byte code */

    /* check code validity */
    if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
      b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
      b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
      b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
      return 0;

    /* perform lookup */
    if (rt->chars32)
    {
      uint32    idx = rt->b4root;

      idx = rt->chars32[b1 + idx - rt->b4_1_lower];
      idx = rt->chars32[b2 + idx - rt->b4_2_lower];
      idx = rt->chars32[b3 + idx - rt->b4_3_lower];
      return rt->chars32[b4 + idx - rt->b4_4_lower];
    }
    else
    {
      uint16    idx = rt->b4root;

      idx = rt->chars16[b1 + idx - rt->b4_1_lower];
      idx = rt->chars16[b2 + idx - rt->b4_2_lower];
      idx = rt->chars16[b3 + idx - rt->b4_3_lower];
      return rt->chars16[b4 + idx - rt->b4_4_lower];
    }
  }
  else if (l == 3)
  {
    /* 3-byte code */

    /* check code validity */
    if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
      b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
      b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
      return 0;

    /* perform lookup */
    if (rt->chars32)
    {
      uint32    idx = rt->b3root;

      idx = rt->chars32[b2 + idx - rt->b3_1_lower];
      idx = rt->chars32[b3 + idx - rt->b3_2_lower];
      return rt->chars32[b4 + idx - rt->b3_3_lower];
    }
    else
    {
      uint16    idx = rt->b3root;

      idx = rt->chars16[b2 + idx - rt->b3_1_lower];
      idx = rt->chars16[b3 + idx - rt->b3_2_lower];
      return rt->chars16[b4 + idx - rt->b3_3_lower];
    }
  }
  else if (l == 2)
  {
    /* 2-byte code */

    /* check code validity - first byte */
    if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
      b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
      return 0;

    /* perform lookup */
    if (rt->chars32)
    {
      uint32    idx = rt->b2root;

      idx = rt->chars32[b3 + idx - rt->b2_1_lower];
      return rt->chars32[b4 + idx - rt->b2_2_lower];
    }
    else
    {
      uint16    idx = rt->b2root;

      idx = rt->chars16[b3 + idx - rt->b2_1_lower];
      return rt->chars16[b4 + idx - rt->b2_2_lower];
    }
  }
  else if (l == 1)
  {
    /* 1-byte code */

    /* check code validity - first byte */
    if (b4 < rt->b1_lower || b4 > rt->b1_upper)
      return 0;

    /* perform lookup */
    if (rt->chars32)
      return rt->chars32[b4 + rt->b1root - rt->b1_lower];
    else
      return rt->chars16[b4 + rt->b1root - rt->b1_lower];
  }
  return 0;         /* shouldn't happen */
}

/*
 * UTF8 ---> local code
 *
 * utf: input string in UTF8 encoding (need not be null-terminated)
 * len: length of input string (in bytes)
 * iso: pointer to the output area (must be large enough!)
 *      (output string will be null-terminated)
 * map: conversion map for single characters
 *      (skipped if NULL)
 * cmap: conversion map for combined characters
 *      (optional, pass NULL if none)
 * cmapsize: number of entries in the conversion map for combined characters
 *      (optional, pass 0 if none)
 * conv_func: algorithmic encoding conversion function
 *      (optional, pass NULL if none)
 * encoding: PG identifier for the local encoding
 * noError: if true, stop at the first invalid or untranslatable character
 *      instead of reporting an error
 *
 * ASCII bytes are copied through unchanged.  For each other character, the
 * cmap (if provided, and if another character follows) is consulted first
 * with the pair formed by this character and the next one; if no match,
 * the map is consulted next; if still no match, the conv_func (if provided)
 * is applied.  An error is raised if no match is found.
 *
 * A zero byte, a character that runs past the end of the input, or a
 * sequence rejected by pg_utf8_islegal() is treated as invalid input.
 *
 * See pg_wchar.h for more details about the data structures used here.
 *
 * Returns the number of input bytes consumed.  If noError is true, this can
 * be less than 'len'.
 */
int
UtfToLocal(const unsigned char *utf, int len,
       unsigned char *iso,
       const pg_mb_radix_tree *map,
       const pg_utf_to_local_combined *cmap, int cmapsize,
       utf_local_conversion_func conv_func,
       int encoding, bool noError)
{
  uint32    iutf;
  int     l;
  const pg_utf_to_local_combined *cp;
  const unsigned char *start = utf;

  if (!PG_VALID_ENCODING(encoding))
    ereport(ERROR,
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
         errmsg("invalid encoding number: %d", encoding)));

  for (; len > 0; len -= l)
  {
    unsigned char b1 = 0;
    unsigned char b2 = 0;
    unsigned char b3 = 0;
    unsigned char b4 = 0;

    /* "break" cases all represent errors */
    if (*utf == '\0')
      break;

    l = pg_utf_mblen(utf);
    if (len < l)
      break;

    if (!pg_utf8_islegal(utf, l))
      break;

    if (l == 1)
    {
      /* ASCII case is easy, assume it's one-to-one conversion */
      *iso++ = *utf++;
      continue;
    }

    /* collect coded char of length l, right-aligned in b1..b4 */
    if (l == 2)
    {
      b3 = *utf++;
      b4 = *utf++;
    }
    else if (l == 3)
    {
      b2 = *utf++;
      b3 = *utf++;
      b4 = *utf++;
    }
    else if (l == 4)
    {
      b1 = *utf++;
      b2 = *utf++;
      b3 = *utf++;
      b4 = *utf++;
    }
    else
      elog(ERROR, "unsupported character length %d", l);

    /*
     * Widen each byte before shifting.  An unsigned char promotes to int,
     * and shifting a value >= 0x80 left by 24 does not fit in an int.
     */
    iutf = ((uint32) b1 << 24) | ((uint32) b2 << 16) |
      ((uint32) b3 << 8) | (uint32) b4;

    /* First, try with combined map if possible */
    if (cmap && len > l)
    {
      const unsigned char *utf_save = utf;
      int     len_save = len;
      int     l_save = l;

      /* collect next character, same as above */
      len -= l;

      l = pg_utf_mblen(utf);
      if (len < l || !pg_utf8_islegal(utf, l))
      {
        /*
         * The following character is either cut off by the end of the
         * input (so we cannot yet tell whether this is a combined
         * char) or is not legal UTF8.  When errors are enabled, report
         * it here, while utf and len still describe the offending
         * bytes.  Otherwise back up over the first character so the
         * caller sees it as not yet consumed.
         */
        if (!noError)
          report_invalid_encoding(PG_UTF8, (const char *) utf, len);
        utf -= l_save;
        break;
      }

      /* We assume ASCII character cannot be in combined map */
      if (l > 1)
      {
        uint32    iutf2;
        uint32    cutf[2];

        if (l == 2)
        {
          iutf2 = (uint32) *utf++ << 8;
          iutf2 |= (uint32) *utf++;
        }
        else if (l == 3)
        {
          iutf2 = (uint32) *utf++ << 16;
          iutf2 |= (uint32) *utf++ << 8;
          iutf2 |= (uint32) *utf++;
        }
        else if (l == 4)
        {
          iutf2 = (uint32) *utf++ << 24;
          iutf2 |= (uint32) *utf++ << 16;
          iutf2 |= (uint32) *utf++ << 8;
          iutf2 |= (uint32) *utf++;
        }
        else
        {
          elog(ERROR, "unsupported character length %d", l);
          iutf2 = 0;  /* keep compiler quiet */
        }

        cutf[0] = iutf;
        cutf[1] = iutf2;

        cp = bsearch(cutf, cmap, cmapsize,
               sizeof(pg_utf_to_local_combined), compare3);

        if (cp)
        {
          /*
           * Both characters are consumed: len was already reduced by
           * the first one, and the loop step subtracts the second.
           */
          iso = store_coded_char(iso, cp->code);
          continue;
        }
      }

      /* fail, so back up to reprocess second character next time */
      utf = utf_save;
      len = len_save;
      l = l_save;
    }

    /* Now check ordinary map */
    if (map)
    {
      uint32    converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);

      if (converted)
      {
        iso = store_coded_char(iso, converted);
        continue;
      }
    }

    /* if there's a conversion function, try that */
    if (conv_func)
    {
      uint32    converted = (*conv_func) (iutf);

      if (converted)
      {
        iso = store_coded_char(iso, converted);
        continue;
      }
    }

    /* failed to translate this character; point back at its first byte */
    utf -= l;
    if (noError)
      break;
    report_untranslatable_char(PG_UTF8, encoding,
                   (const char *) utf, len);
  }

  /* if we broke out of loop early, must be invalid input */
  if (len > 0 && !noError)
    report_invalid_encoding(PG_UTF8, (const char *) utf, len);

  *iso = '\0';

  return utf - start;
}

/*
 * local code ---> UTF8
 *
 * iso: input string in local encoding (need not be null-terminated)
 * len: length of input string (in bytes)
 * utf: pointer to the output area (must be large enough!)
 *      (output string will be null-terminated)
 * map: conversion map for single characters
 * cmap: conversion map for combined characters
 *      (optional, pass NULL if none)
 * cmapsize: number of entries in the conversion map for combined characters
 *      (optional, pass 0 if none)
 * conv_func: algorithmic encoding conversion function
 *      (optional, pass NULL if none)
 * encoding: PG identifier for the local encoding
 * noError: if true, stop at the first invalid or untranslatable character
 *      instead of reporting an error
 *
 * ASCII bytes are copied through unchanged.  For each other character, the
 * map is consulted first; if no match, the cmap (if provided) is consulted
 * next; if still no match, the conv_func (if provided) is applied.  An
 * error is raised if no match is found.  Note that the cmap is only
 * consulted when a map was supplied.
 *
 * A zero byte, or a character rejected by pg_encoding_verifymbchar(), is
 * treated as invalid input.
 *
 * See pg_wchar.h for more details about the data structures used here.
 *
 * Returns the number of input bytes consumed.  If noError is true, this can
 * be less than 'len'.
 */
int
LocalToUtf(const unsigned char *iso, int len,
       unsigned char *utf,
       const pg_mb_radix_tree *map,
       const pg_local_to_utf_combined *cmap, int cmapsize,
       utf_local_conversion_func conv_func,
       int encoding,
       bool noError)
{
  uint32    iiso;
  int     l;
  const pg_local_to_utf_combined *cp;
  const unsigned char *start = iso;

  if (!PG_VALID_ENCODING(encoding))
    ereport(ERROR,
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
         errmsg("invalid encoding number: %d", encoding)));

  for (; len > 0; len -= l)
  {
    unsigned char b1 = 0;
    unsigned char b2 = 0;
    unsigned char b3 = 0;
    unsigned char b4 = 0;

    /* "break" cases all represent errors */
    if (*iso == '\0')
      break;

    if (!IS_HIGHBIT_SET(*iso))
    {
      /* ASCII case is easy, assume it's one-to-one conversion */
      *utf++ = *iso++;
      l = 1;
      continue;
    }

    l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
    if (l < 0)
      break;

    /* collect coded char of length l, right-aligned in b1..b4 */
    if (l == 1)
      b4 = *iso++;
    else if (l == 2)
    {
      b3 = *iso++;
      b4 = *iso++;
    }
    else if (l == 3)
    {
      b2 = *iso++;
      b3 = *iso++;
      b4 = *iso++;
    }
    else if (l == 4)
    {
      b1 = *iso++;
      b2 = *iso++;
      b3 = *iso++;
      b4 = *iso++;
    }
    else
      elog(ERROR, "unsupported character length %d", l);

    /*
     * Widen each byte before shifting.  An unsigned char promotes to int,
     * and shifting a value >= 0x80 left by 24 does not fit in an int.
     */
    iiso = ((uint32) b1 << 24) | ((uint32) b2 << 16) |
      ((uint32) b3 << 8) | (uint32) b4;

    if (map)
    {
      uint32    converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);

      if (converted)
      {
        utf = store_coded_char(utf, converted);
        continue;
      }

      /* If there's a combined character map, try that */
      if (cmap)
      {
        cp = bsearch(&iiso, cmap, cmapsize,
               sizeof(pg_local_to_utf_combined), compare4);

        if (cp)
        {
          /* one local character expands to two UTF8 characters */
          utf = store_coded_char(utf, cp->utf1);
          utf = store_coded_char(utf, cp->utf2);
          continue;
        }
      }
    }

    /* if there's a conversion function, try that */
    if (conv_func)
    {
      uint32    converted = (*conv_func) (iiso);

      if (converted)
      {
        utf = store_coded_char(utf, converted);
        continue;
      }
    }

    /* failed to translate this character; point back at its first byte */
    iso -= l;
    if (noError)
      break;
    report_untranslatable_char(encoding, PG_UTF8,
                   (const char *) iso, len);
  }

  /* if we broke out of loop early, must be invalid input */
  if (len > 0 && !noError)
    report_invalid_encoding(encoding, (const char *) iso, len);

  *utf = '\0';

  return iso - start;
}

Coverage Report

Created: 2025-06-15 06:31

Line	Count	Source (jump to first uncovered line)
1		/*-------------------------------------------------------------------------
2		*
3		* Utility functions for conversion procs.
4		*
5		* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
6		* Portions Copyright (c) 1994, Regents of the University of California
7		*
8		* IDENTIFICATION
9		* src/backend/utils/mb/conv.c
10		*
11		*-------------------------------------------------------------------------
12		*/
13		#include "postgres.h"
14		#include "mb/pg_wchar.h"
15
16
17		/*
18		* local2local: a generic single byte charset encoding
19		* conversion between two ASCII-superset encodings.
20		*
21		* l points to the source string of length len
22		* p is the output area (must be large enough!)
23		* src_encoding is the PG identifier for the source encoding
24		* dest_encoding is the PG identifier for the target encoding
25		* tab holds conversion entries for the source charset
26		* starting from 128 (0x80). each entry in the table holds the corresponding
27		* code point for the target charset, or 0 if there is no equivalent code.
28		*
29		* Returns the number of input bytes consumed. If noError is true, this can
30		* be less than 'len'.
31		*/
32		int
33		local2local(const unsigned char *l,
34		unsigned char *p,
35		int len,
36		int src_encoding,
37		int dest_encoding,
38		const unsigned char *tab,
39		bool noError)
40	0	{
41	0	const unsigned char *start = l;
42	0	unsigned char c1,
43	0	c2;
44
45	0	while (len > 0)
46	0	{
47	0	c1 = *l;
48	0	if (c1 == 0)
49	0	{
50	0	if (noError)
51	0	break;
52	0	report_invalid_encoding(src_encoding, (const char *) l, len);
53	0	}
54	0	if (!IS_HIGHBIT_SET(c1))
55	0	*p++ = c1;
56	0	else
57	0	{
58	0	c2 = tab[c1 - HIGHBIT];
59	0	if (c2)
60	0	*p++ = c2;
61	0	else
62	0	{
63	0	if (noError)
64	0	break;
65	0	report_untranslatable_char(src_encoding, dest_encoding,
66	0	(const char *) l, len);
67	0	}
68	0	}
69	0	l++;
70	0	len--;
71	0	}
72	0	*p = '\0';
73
74	0	return l - start;
75	0	}
76
77		/*
78		* LATINn ---> MIC when the charset's local codes map directly to MIC
79		*
80		* l points to the source string of length len
81		* p is the output area (must be large enough!)
82		* lc is the mule character set id for the local encoding
83		* encoding is the PG identifier for the local encoding
84		*
85		* Returns the number of input bytes consumed. If noError is true, this can
86		* be less than 'len'.
87		*/
88		int
89		latin2mic(const unsigned char *l, unsigned char *p, int len,
90		int lc, int encoding, bool noError)
91	0	{
92	0	const unsigned char *start = l;
93	0	int c1;
94
95	0	while (len > 0)
96	0	{
97	0	c1 = *l;
98	0	if (c1 == 0)
99	0	{
100	0	if (noError)
101	0	break;
102	0	report_invalid_encoding(encoding, (const char *) l, len);
103	0	}
104	0	if (IS_HIGHBIT_SET(c1))
105	0	*p++ = lc;
106	0	*p++ = c1;
107	0	l++;
108	0	len--;
109	0	}
110	0	*p = '\0';
111
112	0	return l - start;
113	0	}
114
115		/*
116		* MIC ---> LATINn when the charset's local codes map directly to MIC
117		*
118		* mic points to the source string of length len
119		* p is the output area (must be large enough!)
120		* lc is the mule character set id for the local encoding
121		* encoding is the PG identifier for the local encoding
122		*
123		* Returns the number of input bytes consumed. If noError is true, this can
124		* be less than 'len'.
125		*/
126		int
127		mic2latin(const unsigned char *mic, unsigned char *p, int len,
128		int lc, int encoding, bool noError)
129	0	{
130	0	const unsigned char *start = mic;
131	0	int c1;
132
133	0	while (len > 0)
134	0	{
135	0	c1 = *mic;
136	0	if (c1 == 0)
137	0	{
138	0	if (noError)
139	0	break;
140	0	report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
141	0	}
142	0	if (!IS_HIGHBIT_SET(c1))
143	0	{
144		/* easy for ASCII */
145	0	*p++ = c1;
146	0	mic++;
147	0	len--;
148	0	}
149	0	else
150	0	{
151	0	int l = pg_mule_mblen(mic);
152
153	0	if (len < l)
154	0	{
155	0	if (noError)
156	0	break;
157	0	report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
158	0	len);
159	0	}
160	0	if (l != 2 \|\| c1 != lc \|\| !IS_HIGHBIT_SET(mic[1]))
161	0	{
162	0	if (noError)
163	0	break;
164	0	report_untranslatable_char(PG_MULE_INTERNAL, encoding,
165	0	(const char *) mic, len);
166	0	}
167	0	*p++ = mic[1];
168	0	mic += 2;
169	0	len -= 2;
170	0	}
171	0	}
172	0	*p = '\0';
173
174	0	return mic - start;
175	0	}
176
177
178		/*
179		* latin2mic_with_table: a generic single byte charset encoding
180		* conversion from a local charset to the mule internal code.
181		*
182		* l points to the source string of length len
183		* p is the output area (must be large enough!)
184		* lc is the mule character set id for the local encoding
185		* encoding is the PG identifier for the local encoding
186		* tab holds conversion entries for the local charset
187		* starting from 128 (0x80). each entry in the table holds the corresponding
188		* code point for the mule encoding, or 0 if there is no equivalent code.
189		*
190		* Returns the number of input bytes consumed. If noError is true, this can
191		* be less than 'len'.
192		*/
193		int
194		latin2mic_with_table(const unsigned char *l,
195		unsigned char *p,
196		int len,
197		int lc,
198		int encoding,
199		const unsigned char *tab,
200		bool noError)
201	0	{
202	0	const unsigned char *start = l;
203	0	unsigned char c1,
204	0	c2;
205
206	0	while (len > 0)
207	0	{
208	0	c1 = *l;
209	0	if (c1 == 0)
210	0	{
211	0	if (noError)
212	0	break;
213	0	report_invalid_encoding(encoding, (const char *) l, len);
214	0	}
215	0	if (!IS_HIGHBIT_SET(c1))
216	0	*p++ = c1;
217	0	else
218	0	{
219	0	c2 = tab[c1 - HIGHBIT];
220	0	if (c2)
221	0	{
222	0	*p++ = lc;
223	0	*p++ = c2;
224	0	}
225	0	else
226	0	{
227	0	if (noError)
228	0	break;
229	0	report_untranslatable_char(encoding, PG_MULE_INTERNAL,
230	0	(const char *) l, len);
231	0	}
232	0	}
233	0	l++;
234	0	len--;
235	0	}
236	0	*p = '\0';
237
238	0	return l - start;
239	0	}
240
241		/*
242		* mic2latin_with_table: a generic single byte charset encoding
243		* conversion from the mule internal code to a local charset.
244		*
245		* mic points to the source string of length len
246		* p is the output area (must be large enough!)
247		* lc is the mule character set id for the local encoding
248		* encoding is the PG identifier for the local encoding
249		* tab holds conversion entries for the mule internal code's second byte,
250		* starting from 128 (0x80). each entry in the table holds the corresponding
251		* code point for the local charset, or 0 if there is no equivalent code.
252		*
253		* Returns the number of input bytes consumed. If noError is true, this can
254		* be less than 'len'.
255		*/
256		int
257		mic2latin_with_table(const unsigned char *mic,
258		unsigned char *p,
259		int len,
260		int lc,
261		int encoding,
262		const unsigned char *tab,
263		bool noError)
264	0	{
265	0	const unsigned char *start = mic;
266	0	unsigned char c1,
267	0	c2;
268
269	0	while (len > 0)
270	0	{
271	0	c1 = *mic;
272	0	if (c1 == 0)
273	0	{
274	0	if (noError)
275	0	break;
276	0	report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
277	0	}
278	0	if (!IS_HIGHBIT_SET(c1))
279	0	{
280		/* easy for ASCII */
281	0	*p++ = c1;
282	0	mic++;
283	0	len--;
284	0	}
285	0	else
286	0	{
287	0	int l = pg_mule_mblen(mic);
288
289	0	if (len < l)
290	0	{
291	0	if (noError)
292	0	break;
293	0	report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
294	0	len);
295	0	}
296	0	if (l != 2 \|\| c1 != lc \|\| !IS_HIGHBIT_SET(mic[1]) \|\|
297	0	(c2 = tab[mic[1] - HIGHBIT]) == 0)
298	0	{
299	0	if (noError)
300	0	break;
301	0	report_untranslatable_char(PG_MULE_INTERNAL, encoding,
302	0	(const char *) mic, len);
303	0	break; /* keep compiler quiet */
304	0	}
305	0	*p++ = c2;
306	0	mic += 2;
307	0	len -= 2;
308	0	}
309	0	}
310	0	*p = '\0';
311
312	0	return mic - start;
313	0	}
314
315		/*
316		* comparison routine for bsearch()
317		* this routine is intended for combined UTF8 -> local code
318		*/
319		static int
320		compare3(const void *p1, const void *p2)
321	0	{
322	0	uint32 s1,
323	0	s2,
324	0	d1,
325	0	d2;
326
327	0	s1 = *(const uint32 *) p1;
328	0	s2 = *((const uint32 *) p1 + 1);
329	0	d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
330	0	d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
331	0	return (s1 > d1 \|\| (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
332	0	}
333
334		/*
335		* comparison routine for bsearch()
336		* this routine is intended for local code -> combined UTF8
337		*/
338		static int
339		compare4(const void *p1, const void *p2)
340	0	{
341	0	uint32 v1,
342	0	v2;
343
344	0	v1 = *(const uint32 *) p1;
345	0	v2 = ((const pg_local_to_utf_combined *) p2)->code;
346	0	return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
347	0	}
348
349		/*
350		* store 32bit character representation into multibyte stream
351		*/
352		static inline unsigned char *
353		store_coded_char(unsigned char *dest, uint32 code)
354	0	{
355	0	if (code & 0xff000000)
356	0	*dest++ = code >> 24;
357	0	if (code & 0x00ff0000)
358	0	*dest++ = code >> 16;
359	0	if (code & 0x0000ff00)
360	0	*dest++ = code >> 8;
361	0	if (code & 0x000000ff)
362	0	*dest++ = code;
363	0	return dest;
364	0	}
365
366		/*
367		* Convert a character using a conversion radix tree.
368		*
369		* 'l' is the length of the input character in bytes, and b1-b4 are
370		* the input character's bytes.
371		*/
372		static inline uint32
373		pg_mb_radix_conv(const pg_mb_radix_tree *rt,
374		int l,
375		unsigned char b1,
376		unsigned char b2,
377		unsigned char b3,
378		unsigned char b4)
379	0	{
380	0	if (l == 4)
381	0	{
382		/* 4-byte code */
383
384		/* check code validity */
385	0	if (b1 < rt->b4_1_lower \|\| b1 > rt->b4_1_upper \|\|
386	0	b2 < rt->b4_2_lower \|\| b2 > rt->b4_2_upper \|\|
387	0	b3 < rt->b4_3_lower \|\| b3 > rt->b4_3_upper \|\|
388	0	b4 < rt->b4_4_lower \|\| b4 > rt->b4_4_upper)
389	0	return 0;
390
391		/* perform lookup */
392	0	if (rt->chars32)
393	0	{
394	0	uint32 idx = rt->b4root;
395
396	0	idx = rt->chars32[b1 + idx - rt->b4_1_lower];
397	0	idx = rt->chars32[b2 + idx - rt->b4_2_lower];
398	0	idx = rt->chars32[b3 + idx - rt->b4_3_lower];
399	0	return rt->chars32[b4 + idx - rt->b4_4_lower];
400	0	}
401	0	else
402	0	{
403	0	uint16 idx = rt->b4root;
404
405	0	idx = rt->chars16[b1 + idx - rt->b4_1_lower];
406	0	idx = rt->chars16[b2 + idx - rt->b4_2_lower];
407	0	idx = rt->chars16[b3 + idx - rt->b4_3_lower];
408	0	return rt->chars16[b4 + idx - rt->b4_4_lower];
409	0	}
410	0	}
411	0	else if (l == 3)
412	0	{
413		/* 3-byte code */
414
415		/* check code validity */
416	0	if (b2 < rt->b3_1_lower \|\| b2 > rt->b3_1_upper \|\|
417	0	b3 < rt->b3_2_lower \|\| b3 > rt->b3_2_upper \|\|
418	0	b4 < rt->b3_3_lower \|\| b4 > rt->b3_3_upper)
419	0	return 0;
420
421		/* perform lookup */
422	0	if (rt->chars32)
423	0	{
424	0	uint32 idx = rt->b3root;
425
426	0	idx = rt->chars32[b2 + idx - rt->b3_1_lower];
427	0	idx = rt->chars32[b3 + idx - rt->b3_2_lower];
428	0	return rt->chars32[b4 + idx - rt->b3_3_lower];
429	0	}
430	0	else
431	0	{
432	0	uint16 idx = rt->b3root;
433
434	0	idx = rt->chars16[b2 + idx - rt->b3_1_lower];
435	0	idx = rt->chars16[b3 + idx - rt->b3_2_lower];
436	0	return rt->chars16[b4 + idx - rt->b3_3_lower];
437	0	}
438	0	}
439	0	else if (l == 2)
440	0	{
441		/* 2-byte code */
442
443		/* check code validity - first byte */
444	0	if (b3 < rt->b2_1_lower \|\| b3 > rt->b2_1_upper \|\|
445	0	b4 < rt->b2_2_lower \|\| b4 > rt->b2_2_upper)
446	0	return 0;
447
448		/* perform lookup */
449	0	if (rt->chars32)
450	0	{
451	0	uint32 idx = rt->b2root;
452
453	0	idx = rt->chars32[b3 + idx - rt->b2_1_lower];
454	0	return rt->chars32[b4 + idx - rt->b2_2_lower];
455	0	}
456	0	else
457	0	{
458	0	uint16 idx = rt->b2root;
459
460	0	idx = rt->chars16[b3 + idx - rt->b2_1_lower];
461	0	return rt->chars16[b4 + idx - rt->b2_2_lower];
462	0	}
463	0	}
464	0	else if (l == 1)
465	0	{
466		/* 1-byte code */
467
468		/* check code validity - first byte */
469	0	if (b4 < rt->b1_lower \|\| b4 > rt->b1_upper)
470	0	return 0;
471
472		/* perform lookup */
473	0	if (rt->chars32)
474	0	return rt->chars32[b4 + rt->b1root - rt->b1_lower];
475	0	else
476	0	return rt->chars16[b4 + rt->b1root - rt->b1_lower];
477	0	}
478	0	return 0; /* shouldn't happen */
479	0	}
480
481		/*
482		* UTF8 ---> local code
483		*
484		* utf: input string in UTF8 encoding (need not be null-terminated)
485		* len: length of input string (in bytes)
486		* iso: pointer to the output area (must be large enough!)
487		(output string will be null-terminated)
488		* map: conversion map for single characters
489		* cmap: conversion map for combined characters
490		* (optional, pass NULL if none)
491		* cmapsize: number of entries in the conversion map for combined characters
492		* (optional, pass 0 if none)
493		* conv_func: algorithmic encoding conversion function
494		* (optional, pass NULL if none)
495		* encoding: PG identifier for the local encoding
496		*
497		* For each character, the cmap (if provided) is consulted first; if no match,
498		* the map is consulted next; if still no match, the conv_func (if provided)
499		* is applied. An error is raised if no match is found.
500		*
501		* See pg_wchar.h for more details about the data structures used here.
502		*
503		* Returns the number of input bytes consumed. If noError is true, this can
504		* be less than 'len'.
505		*/
506		int
507		UtfToLocal(const unsigned char *utf, int len,
508		unsigned char *iso,
509		const pg_mb_radix_tree *map,
510		const pg_utf_to_local_combined *cmap, int cmapsize,
511		utf_local_conversion_func conv_func,
512		int encoding, bool noError)
513	0	{
514	0	uint32 iutf;
515	0	int l;
516	0	const pg_utf_to_local_combined *cp;
517	0	const unsigned char *start = utf;
518
519	0	if (!PG_VALID_ENCODING(encoding))
520	0	ereport(ERROR,
521	0	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
522	0	errmsg("invalid encoding number: %d", encoding)));
523
524	0	for (; len > 0; len -= l)
525	0	{
526	0	unsigned char b1 = 0;
527	0	unsigned char b2 = 0;
528	0	unsigned char b3 = 0;
529	0	unsigned char b4 = 0;
530
531		/* "break" cases all represent errors */
532	0	if (*utf == '\0')
533	0	break;
534
535	0	l = pg_utf_mblen(utf);
536	0	if (len < l)
537	0	break;
538
539	0	if (!pg_utf8_islegal(utf, l))
540	0	break;
541
542	0	if (l == 1)
543	0	{
544		/* ASCII case is easy, assume it's one-to-one conversion */
545	0	iso++ = utf++;
546	0	continue;
547	0	}
548
549		/* collect coded char of length l */
550	0	if (l == 2)
551	0	{
552	0	b3 = *utf++;
553	0	b4 = *utf++;
554	0	}
555	0	else if (l == 3)
556	0	{
557	0	b2 = *utf++;
558	0	b3 = *utf++;
559	0	b4 = *utf++;
560	0	}
561	0	else if (l == 4)
562	0	{
563	0	b1 = *utf++;
564	0	b2 = *utf++;
565	0	b3 = *utf++;
566	0	b4 = *utf++;
567	0	}
568	0	else
569	0	{
570	0	elog(ERROR, "unsupported character length %d", l);
571	0	iutf = 0; /* keep compiler quiet */
572	0	}
573	0	iutf = (b1 << 24 \| b2 << 16 \| b3 << 8 \| b4);
574
575		/* First, try with combined map if possible */
576	0	if (cmap && len > l)
577	0	{
578	0	const unsigned char *utf_save = utf;
579	0	int len_save = len;
580	0	int l_save = l;
581
582		/* collect next character, same as above */
583	0	len -= l;
584
585	0	l = pg_utf_mblen(utf);
586	0	if (len < l)
587	0	{
588		/* need more data to decide if this is a combined char */
589	0	utf -= l_save;
590	0	break;
591	0	}
592
593	0	if (!pg_utf8_islegal(utf, l))
594	0	{
595	0	if (!noError)
596	0	report_invalid_encoding(PG_UTF8, (const char *) utf, len);
597	0	utf -= l_save;
598	0	break;
599	0	}
600
601		/* We assume ASCII character cannot be in combined map */
602	0	if (l > 1)
603	0	{
604	0	uint32 iutf2;
605	0	uint32 cutf[2];
606
607	0	if (l == 2)
608	0	{
609	0	iutf2 = *utf++ << 8;
610	0	iutf2 \|= *utf++;
611	0	}
612	0	else if (l == 3)
613	0	{
614	0	iutf2 = *utf++ << 16;
615	0	iutf2 \|= *utf++ << 8;
616	0	iutf2 \|= *utf++;
617	0	}
618	0	else if (l == 4)
619	0	{
620	0	iutf2 = *utf++ << 24;
621	0	iutf2 \|= *utf++ << 16;
622	0	iutf2 \|= *utf++ << 8;
623	0	iutf2 \|= *utf++;
624	0	}
625	0	else
626	0	{
627	0	elog(ERROR, "unsupported character length %d", l);
628	0	iutf2 = 0; /* keep compiler quiet */
629	0	}
630
631	0	cutf[0] = iutf;
632	0	cutf[1] = iutf2;
633
634	0	cp = bsearch(cutf, cmap, cmapsize,
635	0	sizeof(pg_utf_to_local_combined), compare3);
636
637	0	if (cp)
638	0	{
639	0	iso = store_coded_char(iso, cp->code);
640	0	continue;
641	0	}
642	0	}
643
644		/* fail, so back up to reprocess second character next time */
645	0	utf = utf_save;
646	0	len = len_save;
647	0	l = l_save;
648	0	}
649
650		/* Now check ordinary map */
651	0	if (map)
652	0	{
653	0	uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
654
655	0	if (converted)
656	0	{
657	0	iso = store_coded_char(iso, converted);
658	0	continue;
659	0	}
660	0	}
661
662		/* if there's a conversion function, try that */
663	0	if (conv_func)
664	0	{
665	0	uint32 converted = (*conv_func) (iutf);
666
667	0	if (converted)
668	0	{
669	0	iso = store_coded_char(iso, converted);
670	0	continue;
671	0	}
672	0	}
673
674		/* failed to translate this character */
675	0	utf -= l;
676	0	if (noError)
677	0	break;
678	0	report_untranslatable_char(PG_UTF8, encoding,
679	0	(const char *) utf, len);
680	0	}
681
682		/* if we broke out of loop early, must be invalid input */
683	0	if (len > 0 && !noError)
684	0	report_invalid_encoding(PG_UTF8, (const char *) utf, len);
685
686	0	*iso = '\0';
687
688	0	return utf - start;
689	0	}
690
691		/*
692		* local code ---> UTF8
693		*
694		* iso: input string in local encoding (need not be null-terminated)
695		* len: length of input string (in bytes)
696		* utf: pointer to the output area (must be large enough!)
697		(output string will be null-terminated)
698		* map: conversion map for single characters
699		* cmap: conversion map for combined characters
700		* (optional, pass NULL if none)
701		* cmapsize: number of entries in the conversion map for combined characters
702		* (optional, pass 0 if none)
703		* conv_func: algorithmic encoding conversion function
704		* (optional, pass NULL if none)
705		* encoding: PG identifier for the local encoding
706		*
707		* For each character, the map is consulted first; if no match, the cmap
708		* (if provided) is consulted next; if still no match, the conv_func
709		* (if provided) is applied. An error is raised if no match is found.
710		*
711		* See pg_wchar.h for more details about the data structures used here.
712		*
713		* Returns the number of input bytes consumed. If noError is true, this can
714		* be less than 'len'.
715		*/
716		int
717		LocalToUtf(const unsigned char *iso, int len,
718		unsigned char *utf,
719		const pg_mb_radix_tree *map,
720		const pg_local_to_utf_combined *cmap, int cmapsize,
721		utf_local_conversion_func conv_func,
722		int encoding,
723		bool noError)
724	0	{
725	0	uint32 iiso;
726	0	int l;
727	0	const pg_local_to_utf_combined *cp;
728	0	const unsigned char *start = iso;
729
730	0	if (!PG_VALID_ENCODING(encoding))
731	0	ereport(ERROR,
732	0	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
733	0	errmsg("invalid encoding number: %d", encoding)));
734
735	0	for (; len > 0; len -= l)
736	0	{
737	0	unsigned char b1 = 0;
738	0	unsigned char b2 = 0;
739	0	unsigned char b3 = 0;
740	0	unsigned char b4 = 0;
741
742		/* "break" cases all represent errors */
743	0	if (*iso == '\0')
744	0	break;
745
746	0	if (!IS_HIGHBIT_SET(*iso))
747	0	{
748		/* ASCII case is easy, assume it's one-to-one conversion */
749	0	utf++ = iso++;
750	0	l = 1;
751	0	continue;
752	0	}
753
754	0	l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
755	0	if (l < 0)
756	0	break;
757
758		/* collect coded char of length l */
759	0	if (l == 1)
760	0	b4 = *iso++;
761	0	else if (l == 2)
762	0	{
763	0	b3 = *iso++;
764	0	b4 = *iso++;
765	0	}
766	0	else if (l == 3)
767	0	{
768	0	b2 = *iso++;
769	0	b3 = *iso++;
770	0	b4 = *iso++;
771	0	}
772	0	else if (l == 4)
773	0	{
774	0	b1 = *iso++;
775	0	b2 = *iso++;
776	0	b3 = *iso++;
777	0	b4 = *iso++;
778	0	}
779	0	else
780	0	{
781	0	elog(ERROR, "unsupported character length %d", l);
782	0	iiso = 0; /* keep compiler quiet */
783	0	}
784	0	iiso = (b1 << 24 \| b2 << 16 \| b3 << 8 \| b4);
785
786	0	if (map)
787	0	{
788	0	uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
789
790	0	if (converted)
791	0	{
792	0	utf = store_coded_char(utf, converted);
793	0	continue;
794	0	}
795
796		/* If there's a combined character map, try that */
797	0	if (cmap)
798	0	{
799	0	cp = bsearch(&iiso, cmap, cmapsize,
800	0	sizeof(pg_local_to_utf_combined), compare4);
801
802	0	if (cp)
803	0	{
804	0	utf = store_coded_char(utf, cp->utf1);
805	0	utf = store_coded_char(utf, cp->utf2);
806	0	continue;
807	0	}
808	0	}
809	0	}
810
811		/* if there's a conversion function, try that */
812	0	if (conv_func)
813	0	{
814	0	uint32 converted = (*conv_func) (iiso);
815
816	0	if (converted)
817	0	{
818	0	utf = store_coded_char(utf, converted);
819	0	continue;
820	0	}
821	0	}
822
823		/* failed to translate this character */
824	0	iso -= l;
825	0	if (noError)
826	0	break;
827	0	report_untranslatable_char(encoding, PG_UTF8,
828	0	(const char *) iso, len);
829	0	}
830
831		/* if we broke out of loop early, must be invalid input */
832	0	if (len > 0 && !noError)
833	0	report_invalid_encoding(encoding, (const char *) iso, len);
834
835	0	*utf = '\0';
836
837	0	return iso - start;
838	0	}