/src/FreeRDP/winpr/libwinpr/crt/unicode_builtin.c

Source (jump to first uncovered line)
/*
 * Copyright 2001-2004 Unicode, Inc.
 *
 * Disclaimer
 *
 * This source code is provided as is by Unicode, Inc. No claims are
 * made as to fitness for any particular purpose. No warranties of any
 * kind are expressed or implied. The recipient agrees to determine
 * applicability of information provided. If this file has been
 * purchased on magnetic or optical media from Unicode, Inc., the
 * sole remedy for any claim will be exchange of defective media
 * within 90 days of receipt.
 *
 * Limitations on Rights to Redistribute This Code
 *
 * Unicode, Inc. hereby grants the right to freely use the information
 * supplied in this file in the creation of products supporting the
 * Unicode Standard, and to make copies of this file in any form
 * for internal or external distribution as long as this notice
 * remains attached.
 */

/* ---------------------------------------------------------------------

Conversions between UTF32, UTF-16, and UTF-8. Source code file.
Author: Mark E. Davis, 1994.
Rev History: Rick McGowan, fixes & updates May 2001.
Sept 2001: fixed const & error conditions per
mods suggested by S. Parent & A. Lillich.
June 2002: Tim Dodd added detection and handling of incomplete
source sequences, enhanced error detection, added casts
to eliminate compiler warnings.
July 2003: slight mods to back out aggressive FFFE detection.
Jan 2004: updated switches in from-UTF8 conversions.
Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.

See the header file "utf.h" for complete documentation.

------------------------------------------------------------------------ */

#include <winpr/wtypes.h>
#include <winpr/string.h>
#include <winpr/assert.h>

#include "unicode.h"

#include "../log.h"
#define TAG WINPR_TAG("unicode")

/*
 * Character Types:
 *
 * UTF8:    uint8_t   8 bits
 * UTF16: uint16_t  16 bits
 * UTF32: uint32_t  32 bits
 */

/*
 * Some fundamental constants.
 * Every expansion is wrapped in parentheses so the cast stays a single
 * operand wherever the macro is used (e.g. after sizeof).
 */
#define UNI_REPLACEMENT_CHAR ((uint32_t)0x0000FFFD) /* substituted for unconvertible input */
#define UNI_MAX_BMP ((uint32_t)0x0000FFFF)          /* largest value stored in one UTF-16 unit */
#define UNI_MAX_UTF16 ((uint32_t)0x0010FFFF)        /* largest value expressible as a surrogate pair */
#define UNI_MAX_UTF32 ((uint32_t)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32 ((uint32_t)0x0010FFFF)

/* Outcome reported by the internal buffer converters in this file. */
typedef enum
{
  conversionOK,    /* conversion successful */
  sourceExhausted, /* source ends in the middle of a character */
  targetExhausted, /* insufficient room in target for conversion */
  sourceIllegal    /* source sequence is illegal/malformed */
} ConversionResult;

/*
 * Selects how the converters react to surrogate code points and
 * out-of-range values: with strictConversion they stop and report
 * sourceIllegal, with any other value the conversion carries on.
 */
typedef enum
{
  strictConversion = 0,
  lenientConversion
} ConversionFlags;

/* Each UTF-16 surrogate carries 10 payload bits of the code point. */
static const int halfShift = 10; /* used for shifting by 10 bits */

/* Added when joining a surrogate pair, subtracted before splitting one. */
static const uint32_t halfBase = 0x0010000UL;
/* Keeps the low 10 bits that go into the low surrogate. */
static const uint32_t halfMask = 0x3FFUL;

/*
 * UTF-16 surrogate ranges: high surrogates are 0xD800..0xDBFF, low
 * surrogates 0xDC00..0xDFFF. The expansions are parenthesized so each
 * macro is a single operand wherever it is used.
 */
#define UNI_SUR_HIGH_START ((uint32_t)0xD800)
#define UNI_SUR_HIGH_END ((uint32_t)0xDBFF)
#define UNI_SUR_LOW_START ((uint32_t)0xDC00)
#define UNI_SUR_LOW_END ((uint32_t)0xDFFF)

/* --------------------------------------------------------------------- */

/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 sequences never have 4 or 5 trailing bytes. The
 * table is left as-is for anyone who may want to do such conversion, which
 * was allowed in earlier algorithms.
 * Bytes 0x80..0xBF map to 0 here as well; isLegalUTF8() is what rejects
 * them when they appear as the first byte of a sequence.
 */
static const char trailingBytesForUTF8[256] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};

/*
 * Magic values subtracted from a buffer value during UTF8 conversion.
 * This table contains as many values as there might be trailing bytes
 * in a UTF-8 sequence, and is indexed by that trailing-byte count.
 * Each entry is the sum of the lead-byte and continuation-byte marker
 * bits as they end up positioned after the bytes have been accumulated
 * with 6-bit shifts (e.g. index 1: (0xC0 << 6) + 0x80 = 0x3080), so a
 * single subtraction strips all markers at once.
 */
static const uint32_t offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
                                           0x03C82080UL, 0xFA082080UL, 0x82082080UL };

/*
 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
 * into the first byte, depending on how many bytes follow.  There are
 * as many entries in this table as there are UTF-8 sequence types.
 * (I.e., one byte sequence, two byte... etc.). Remember that sequences
 * for *legal* UTF-8 will be 4 or fewer bytes total.
 * The table is indexed by the total sequence length; index 0 is unused.
 */
static const uint8_t firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

/* --------------------------------------------------------------------- */

/* The interface converts a whole buffer per call to avoid function-call
 * overhead. Constants have been gathered. The original Unicode, Inc. code
 * removed loops & conditionals as much as possible for efficiency, in favor
 * of drop-through switches; the "Note A" it refers to for the equivalent
 * loop-based code is not included in this file.
 * If your compiler supports it, the "isLegalUTF8" call can be turned
 * into an inline function.
 */

/* --------------------------------------------------------------------- */

/*
 * Convert a run of UTF-16 code units to UTF-8.
 *
 * sourceStart  in: first code unit to convert; out: first code unit that
 *              was not consumed.
 * sourceEnd    one past the last input code unit.
 * targetStart  in: where output begins; out: one past the last byte
 *              produced (or counted).
 * targetEnd    one past the end of the output buffer. When it is NULL
 *              nothing is written and *targetStart is only advanced by the
 *              number of bytes the conversion would produce.
 * flags        with strictConversion an unpaired surrogate stops the
 *              conversion with sourceIllegal; otherwise it is encoded
 *              like any other 16-bit value.
 *
 * Returns conversionOK when the whole input was consumed, otherwise the
 * reason for stopping. On every early stop *sourceStart is left on the
 * code unit that begins the character which could not be converted.
 */
static ConversionResult winpr_ConvertUTF16toUTF8_Internal(const uint16_t** sourceStart,
                                                          const uint16_t* sourceEnd,
                                                          uint8_t** targetStart, uint8_t* targetEnd,
                                                          ConversionFlags flags)
{
  const bool computeLength = (!targetEnd) ? true : false;
  const uint32_t byteMask = 0xBF;
  const uint32_t byteMark = 0x80;
  const uint16_t* source = *sourceStart;
  uint8_t* target = *targetStart;
  ConversionResult result = conversionOK;

  while (source < sourceEnd)
  {
    /* Start of the current character, restored whenever we have to bail out. */
    const uint16_t* oldSource = source;
    unsigned short bytesToWrite = 0;
    uint32_t ch = *source++;

    /* If we have a surrogate pair, convert to UTF32 first. */
    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
    {
      if (source < sourceEnd)
      {
        const uint32_t ch2 = *source;

        if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
        {
          ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) +
               halfBase;
          ++source;
        }
        else if (flags == strictConversion)
        {
          /* unpaired high surrogate: report it, pointing at the surrogate itself */
          source = oldSource;
          result = sourceIllegal;
          break;
        }
      }
      else
      {
        /* The input ends right after the high surrogate. */
        source = oldSource;
        result = sourceExhausted;
        break;
      }
    }
    else if (flags == strictConversion)
    {
      /* A low surrogate without a preceding high surrogate is illegal. */
      if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
      {
        source = oldSource;
        result = sourceIllegal;
        break;
      }
    }

    /* Figure out how many bytes the result will require */
    if (ch < (uint32_t)0x80)
    {
      bytesToWrite = 1;
    }
    else if (ch < (uint32_t)0x800)
    {
      bytesToWrite = 2;
    }
    else if (ch < (uint32_t)0x10000)
    {
      bytesToWrite = 3;
    }
    else if (ch < (uint32_t)0x110000)
    {
      bytesToWrite = 4;
    }
    else
    {
      bytesToWrite = 3;
      ch = UNI_REPLACEMENT_CHAR;
    }

    if (!computeLength)
    {
      uint8_t* out = target;

      /*
       * Compare against the space that is left instead of advancing the
       * pointer first: target + bytesToWrite could otherwise point more
       * than one element past the end of the buffer, which is undefined.
       */
      if ((targetEnd - target) < bytesToWrite)
      {
        source = oldSource; /* Back up source pointer! */
        result = targetExhausted;
        break;
      }

      /* Fill the sequence back to front: continuation bytes first ... */
      out += bytesToWrite;
      for (unsigned short pending = bytesToWrite; pending > 1; pending--)
      {
        *--out = (uint8_t)((ch | byteMark) & byteMask);
        ch >>= 6;
      }

      /* ... then the lead byte carrying the length marker. */
      *--out = (uint8_t)(ch | firstByteMark[bytesToWrite]);
    }

    /* In length-only mode this just counts the bytes. */
    target += bytesToWrite;
  }

  *sourceStart = source;
  *targetStart = target;
  return result;
}

/* --------------------------------------------------------------------- */

/*
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 *
 * source  first byte of the sequence; exactly `length` bytes are read.
 * length  total sequence length as determined by the first byte. If not
 *         calling this from ConvertUTF8to*, it can be obtained with
 *           length = trailingBytesForUTF8[*source] + 1;
 *         and the sequence is illegal right away if there aren't that
 *         many bytes available.
 *
 * Returns false for any length outside 1..4 (the Unicode definition of
 * UTF-8 stops at 4-byte sequences), for continuation bytes outside
 * 0x80..0xBF, for overlong encodings, for encoded surrogates and for
 * values above U+10FFFF.
 */
static bool isLegalUTF8(const uint8_t* source, int length)
{
  uint8_t lead = 0;

  if ((length < 1) || (length > 4))
    return false;

  lead = source[0];

  /* Third and fourth byte, when present, are plain continuation bytes. */
  for (int idx = length - 1; idx >= 2; idx--)
  {
    const uint8_t trail = source[idx];

    if ((trail < 0x80) || (trail > 0xBF))
      return false;
  }

  if (length >= 2)
  {
    /*
     * The second byte is a continuation byte as well, but some lead bytes
     * narrow its range further. The 0x80 lower bound applies to every lead
     * byte, including 0xED and 0xF4 which only tighten the upper bound.
     */
    const uint8_t second = source[1];
    uint8_t lowest = 0x80;
    uint8_t highest = 0xBF;

    switch (lead)
    {
      case 0xE0:
        lowest = 0xA0; /* below this the 3-byte form is overlong */
        break;

      case 0xED:
        highest = 0x9F; /* above this the value is a UTF-16 surrogate */
        break;

      case 0xF0:
        lowest = 0x90; /* below this the 4-byte form is overlong */
        break;

      case 0xF4:
        highest = 0x8F; /* above this the value exceeds 0x10FFFF */
        break;

      default:
        break;
    }

    if ((second < lowest) || (second > highest))
      return false;
  }

  /* 0x80..0xBF are continuation bytes, 0xC0 and 0xC1 only start overlong forms. */
  if ((lead >= 0x80) && (lead < 0xC2))
    return false;

  /* Lead bytes above 0xF4 would encode values beyond 0x10FFFF. */
  if (lead > 0xF4)
    return false;

  return true;
}

/* --------------------------------------------------------------------- */

/*
 * Convert a run of UTF-8 bytes to UTF-16 code units.
 *
 * sourceStart  in: first byte to convert; out: first byte that was not
 *              consumed.
 * sourceEnd    one past the last input byte.
 * targetStart  in: where output begins; out: one past the last code unit
 *              produced (or counted).
 * targetEnd    one past the end of the output buffer. When it is NULL
 *              nothing is written and *targetStart is only advanced by the
 *              number of code units the conversion would produce.
 * flags        with strictConversion a decoded surrogate or a value above
 *              UNI_MAX_UTF16 stops the conversion with sourceIllegal;
 *              otherwise UNI_REPLACEMENT_CHAR is emitted in its place.
 *
 * Returns conversionOK when the whole input was consumed, otherwise the
 * reason for stopping; *sourceStart is then left on the first byte of the
 * sequence that could not be converted.
 */
static ConversionResult winpr_ConvertUTF8toUTF16_Internal(const uint8_t** sourceStart,
                                                          const uint8_t* sourceEnd,
                                                          uint16_t** targetStart,
                                                          uint16_t* targetEnd,
                                                          ConversionFlags flags)
{
  const bool computeLength = (!targetEnd) ? true : false;
  ConversionResult result = conversionOK;
  const uint8_t* source = *sourceStart;
  uint16_t* target = *targetStart;

  while (source < sourceEnd)
  {
    /* First byte of the current sequence, restored whenever we bail out. */
    const uint8_t* seqStart = source;
    const unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
    uint32_t ch = 0;

    /*
     * The complete sequence has to lie inside the input. Counts are
     * compared instead of pointers because source + extraBytesToRead may
     * lie more than one element past the end, which must not be formed.
     */
    if (extraBytesToRead >= (sourceEnd - source))
    {
      result = sourceExhausted;
      break;
    }

    /* Do this check whether lenient or strict */
    if (!isLegalUTF8(source, extraBytesToRead + 1))
    {
      result = sourceIllegal;
      break;
    }

    /*
     * Accumulate the lead byte and all trailing bytes, 6 bits per step.
     * The marker bits are still included and are removed in one go by
     * the offsetsFromUTF8 subtraction below.
     */
    for (unsigned short i = 0; i < extraBytesToRead; i++)
    {
      ch += *source++;
      ch <<= 6;
    }

    ch += *source++;
    ch -= offsetsFromUTF8[extraBytesToRead];

    if ((!computeLength) && (target >= targetEnd))
    {
      source = seqStart; /* Back up source pointer! */
      result = targetExhausted;
      break;
    }

    if (ch <= UNI_MAX_BMP)
    {
      /* Target is a character <= 0xFFFF */
      if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
      {
        /* Surrogate values must not appear as decoded characters. */
        if (flags == strictConversion)
        {
          source = seqStart; /* return to the illegal value itself */
          result = sourceIllegal;
          break;
        }

        ch = UNI_REPLACEMENT_CHAR;
      }

      if (!computeLength)
        *target = (uint16_t)ch;

      target++;
    }
    else if (ch > UNI_MAX_UTF16)
    {
      if (flags == strictConversion)
      {
        result = sourceIllegal;
        source = seqStart; /* return to the start */
        break;             /* Bail out; shouldn't continue */
      }

      if (!computeLength)
        *target = (uint16_t)UNI_REPLACEMENT_CHAR;

      target++;
    }
    else
    {
      /* Character above 0xFFFF: needs a surrogate pair, i.e. two units. */
      if ((!computeLength) && ((targetEnd - target) < 2))
      {
        source = seqStart; /* Back up source pointer! */
        result = targetExhausted;
        break;
      }

      ch -= halfBase;

      if (!computeLength)
      {
        target[0] = (uint16_t)((ch >> halfShift) + UNI_SUR_HIGH_START);
        target[1] = (uint16_t)((ch & halfMask) + UNI_SUR_LOW_START);
      }

      target += 2;
    }
  }

  *sourceStart = source;
  *targetStart = target;
  return result;
}

/**
 * WinPR built-in Unicode API
 */

/*
 * Strict UTF-8 to UTF-16 conversion with MultiByteToWideChar-like sizes.
 *
 * src     UTF-8 input.
 * cchSrc  number of input bytes, or -1 to take the length from strlen()
 *         and convert the terminating NUL as well.
 * dst     output buffer, only used when cchDst is not 0.
 * cchDst  capacity of dst in UTF-16 units; 0 requests the required size
 *         only.
 *
 * Returns the number of UTF-16 units written (or required when cchDst is
 * 0), or 0 when the conversion did not complete. If the output buffer is
 * too small the last error is set to ERROR_INSUFFICIENT_BUFFER.
 */
static int winpr_ConvertUTF8toUTF16(const uint8_t* src, int cchSrc, uint16_t* dst, int cchDst)
{
  /* With cchDst == 0 both output pointers stay NULL: length-only mode. */
  uint16_t* const outStart = (cchDst == 0) ? NULL : dst;
  uint16_t* const outEnd = (cchDst == 0) ? NULL : &dst[cchDst];
  uint16_t* out = outStart;
  const uint8_t* in = NULL;
  const uint8_t* inEnd = NULL;
  ConversionResult status = sourceIllegal;
  size_t produced = 0;

  if (cchSrc == -1)
    cchSrc = strlen((char*)src) + 1;

  in = src;
  inEnd = &src[cchSrc];

  status = winpr_ConvertUTF8toUTF16_Internal(&in, inEnd, &out, outEnd, strictConversion);

  /* Distance the output cursor travelled == number of units produced. */
  produced = out - outStart;

  switch (status)
  {
    case conversionOK:
      return (int)produced;

    case targetExhausted:
      SetLastError(ERROR_INSUFFICIENT_BUFFER);
      return 0;

    default:
      return 0;
  }
}

/*
 * Strict UTF-16 to UTF-8 conversion with WideCharToMultiByte-like sizes.
 *
 * src     UTF-16 input.
 * cchSrc  number of input code units, or -1 to take the length from
 *         _wcslen() and convert the terminating NUL as well.
 * dst     output buffer, only used when cchDst is not 0.
 * cchDst  capacity of dst in bytes; 0 requests the required size only.
 *
 * Returns the number of bytes written (or required when cchDst is 0), or
 * 0 when the conversion did not complete or the size does not fit into
 * the int return type. If the output buffer is too small the last error
 * is set to ERROR_INSUFFICIENT_BUFFER.
 */
static int winpr_ConvertUTF16toUTF8(const uint16_t* src, int cchSrc, uint8_t* dst, int cchDst)
{
  size_t length = 0;
  uint8_t* dstBeg = NULL;
  uint8_t* dstEnd = NULL;
  const uint16_t* srcBeg = NULL;
  const uint16_t* srcEnd = NULL;
  ConversionResult result = sourceIllegal;

  if (cchSrc == -1)
    cchSrc = _wcslen((uint16_t*)src) + 1;

  srcBeg = src;
  srcEnd = &src[cchSrc];

  if (cchDst == 0)
  {
    /* Length-only mode: both target pointers stay NULL. */
    result =
        winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);

    length = dstBeg - ((uint8_t*)NULL);
  }
  else
  {
    dstBeg = dst;
    dstEnd = &dst[cchDst];

    result =
        winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);

    length = dstBeg - dst;
  }

  if (result == targetExhausted)
  {
    SetLastError(ERROR_INSUFFICIENT_BUFFER);
    return 0;
  }

  if (result != conversionOK)
    return 0;

  /*
   * One UTF-16 unit can expand to three bytes, so in length-only mode the
   * total may exceed what an int can hold. Fail instead of returning a
   * truncated (possibly negative) size.
   */
  if (length > INT32_MAX)
    return 0;

  return (int)length;
}

/* --------------------------------------------------------------------- */

/*
 * Built-in MultiByteToWideChar: converts UTF-8 to UTF-16.
 *
 * CodePage        only CP_ACP and CP_UTF8 are accepted; anything else is
 *                 logged and makes the call return 0.
 * dwFlags         ignored.
 * lpMultiByteStr  input string, must not be NULL.
 * cbMultiByte     input size in bytes, or -1 for a NUL-terminated string
 *                 (the terminator is converted too). 0 and values below
 *                 -1 make the call return 0.
 * lpWideCharStr   output buffer, only used when cchWideChar is not 0.
 * cchWideChar     output capacity in UTF-16 units, 0 to query the
 *                 required size. Negative values make the call return -1.
 *
 * Returns whatever winpr_ConvertUTF8toUTF16() reports for the validated
 * arguments.
 */
int int_MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, int cbMultiByte,
                            LPWSTR lpWideCharStr, int cchWideChar)
{
  size_t cbCharLen = 0;

  WINPR_UNUSED(dwFlags);

  /* If cbMultiByte is 0, the function fails */
  if ((cbMultiByte == 0) || (cbMultiByte < -1))
    return 0;

  if (cchWideChar < 0)
    return -1;

  /* Check the pointer before strlen() below dereferences it. */
  WINPR_ASSERT(lpMultiByteStr);

  /* If cbMultiByte is -1, the string is null-terminated */
  if (cbMultiByte < 0)
  {
    const size_t len = strlen(lpMultiByteStr);
    if (len >= INT32_MAX)
      return 0;
    cbCharLen = (int)len + 1;
  }
  else
    cbCharLen = (size_t)cbMultiByte;

  switch (CodePage)
  {
    case CP_ACP:
    case CP_UTF8:
      break;

    default:
      WLog_ERR(TAG, "Unsupported encoding %u", CodePage);
      return 0;
  }

  return winpr_ConvertUTF8toUTF16((const uint8_t*)lpMultiByteStr, cbCharLen,
                                  (uint16_t*)lpWideCharStr, cchWideChar);
}

/*
 * Built-in WideCharToMultiByte: converts UTF-16 to UTF-8.
 *
 * dwFlags        ignored.
 * lpWideCharStr  input string, must not be NULL.
 * cchWideChar    input size in UTF-16 units, or -1 for a NUL-terminated
 *                string (the terminator is converted too). 0 and values
 *                below -1 make the call return 0.
 * lpMultiByteStr output buffer, only used when cbMultiByte is not 0.
 * cbMultiByte    output capacity in bytes, 0 to query the required size.
 *                Negative values make the call return -1.
 *
 * CodePage, lpDefaultChar and lpUsedDefaultChar are not referenced by
 * this implementation.
 *
 * Returns whatever winpr_ConvertUTF16toUTF8() reports for the validated
 * arguments.
 */
int int_WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar,
                            LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar,
                            LPBOOL lpUsedDefaultChar)
{
  size_t srcUnits = (size_t)cchWideChar;

  WINPR_UNUSED(dwFlags);

  /* An empty or nonsensical input length is a failure. */
  if ((cchWideChar < -1) || (cchWideChar == 0))
    return 0;

  if (cbMultiByte < 0)
    return -1;

  WINPR_ASSERT(lpWideCharStr);

  if (cchWideChar == -1)
  {
    /* NUL-terminated input: measure it and include the terminator. */
    const size_t measured = _wcslen(lpWideCharStr);

    if (measured >= INT32_MAX)
      return 0;

    srcUnits = (int)measured + 1;
  }

  /*
   * With cbMultiByte == 0 the callee only reports the number of bytes
   * required and does not touch lpMultiByteStr.
   */
  return winpr_ConvertUTF16toUTF8((const uint16_t*)lpWideCharStr, srcUnits,
                                  (uint8_t*)lpMultiByteStr, cbMultiByte);
}

Coverage Report

Created: 2024-09-08 06:20

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright 2001-2004 Unicode, Inc.
3		*
4		* Disclaimer
5		*
6		* This source code is provided as is by Unicode, Inc. No claims are
7		* made as to fitness for any particular purpose. No warranties of any
8		* kind are expressed or implied. The recipient agrees to determine
9		* applicability of information provided. If this file has been
10		* purchased on magnetic or optical media from Unicode, Inc., the
11		* sole remedy for any claim will be exchange of defective media
12		* within 90 days of receipt.
13		*
14		* Limitations on Rights to Redistribute This Code
15		*
16		* Unicode, Inc. hereby grants the right to freely use the information
17		* supplied in this file in the creation of products supporting the
18		* Unicode Standard, and to make copies of this file in any form
19		* for internal or external distribution as long as this notice
20		* remains attached.
21		*/
22
23		/* ---------------------------------------------------------------------
24
25		Conversions between UTF32, UTF-16, and UTF-8. Source code file.
26		Author: Mark E. Davis, 1994.
27		Rev History: Rick McGowan, fixes & updates May 2001.
28		Sept 2001: fixed const & error conditions per
29		mods suggested by S. Parent & A. Lillich.
30		June 2002: Tim Dodd added detection and handling of incomplete
31		source sequences, enhanced error detection, added casts
32		to eliminate compiler warnings.
33		July 2003: slight mods to back out aggressive FFFE detection.
34		Jan 2004: updated switches in from-UTF8 conversions.
35		Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
36
37		See the header file "utf.h" for complete documentation.
38
39		------------------------------------------------------------------------ */
40
41		#include <winpr/wtypes.h>
42		#include <winpr/string.h>
43		#include <winpr/assert.h>
44
45		#include "unicode.h"
46
47		#include "../log.h"
48		#define TAG WINPR_TAG("unicode")
49
50		/*
51		* Character Types:
52		*
53		* UTF8: uint8_t 8 bits
54		* UTF16: uint16_t 16 bits
55		* UTF32: uint32_t 32 bits
56		*/
57
58		/* Some fundamental constants */
59	0	#define UNI_REPLACEMENT_CHAR (uint32_t)0x0000FFFD
60	12.5M	#define UNI_MAX_BMP (uint32_t)0x0000FFFF
61	790	#define UNI_MAX_UTF16 (uint32_t)0x0010FFFF
62		#define UNI_MAX_UTF32 (uint32_t)0x7FFFFFFF
63		#define UNI_MAX_LEGAL_UTF32 (uint32_t)0x0010FFFF
64
65		typedef enum
66		{
67		conversionOK, /* conversion successful */
68		sourceExhausted, /* partial character in source, but hit end */
69		targetExhausted, /* insuff. room in target for conversion */
70		sourceIllegal /* source sequence is illegal/malformed */
71		} ConversionResult;
72
73		typedef enum
74		{
75		strictConversion = 0,
76		lenientConversion
77		} ConversionFlags;
78
79		static const int halfShift = 10; /* used for shifting by 10 bits */
80
81		static const uint32_t halfBase = 0x0010000UL;
82		static const uint32_t halfMask = 0x3FFUL;
83
84	38.5M	#define UNI_SUR_HIGH_START (uint32_t)0xD800
85	148k	#define UNI_SUR_HIGH_END (uint32_t)0xDBFF
86	13.5M	#define UNI_SUR_LOW_START (uint32_t)0xDC00
87	147k	#define UNI_SUR_LOW_END (uint32_t)0xDFFF
88
89		/* --------------------------------------------------------------------- */
90
91		/*
92		* Index into the table below with the first byte of a UTF-8 sequence to
93		* get the number of trailing bytes that are supposed to follow it.
94		* Note that legal UTF-8 values can't have 4 or 5-bytes. The table is
95		* left as-is for anyone who may want to do such conversion, which was
96		* allowed in earlier algorithms.
97		*/
98		static const char trailingBytesForUTF8[256] = {
99		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
100		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
102		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
104		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
105		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
106		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
107		};
108
109		/*
110		* Magic values subtracted from a buffer value during UTF8 conversion.
111		* This table contains as many values as there might be trailing bytes
112		* in a UTF-8 sequence.
113		*/
114		static const uint32_t offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
115		0x03C82080UL, 0xFA082080UL, 0x82082080UL };
116
117		/*
118		* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
119		* into the first byte, depending on how many bytes follow. There are
120		* as many entries in this table as there are UTF-8 sequence types.
121		* (I.e., one byte sequence, two byte... etc.). Remember that sequencs
122		* for legal UTF-8 will be 4 or fewer bytes total.
123		*/
124		static const uint8_t firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
125
126		/* --------------------------------------------------------------------- */
127
128		/* The interface converts a whole buffer to avoid function-call overhead.
129		* Constants have been gathered. Loops & conditionals have been removed as
130		* much as possible for efficiency, in favor of drop-through switches.
131		* (See "Note A" at the bottom of the file for equivalent code.)
132		* If your compiler supports it, the "isLegalUTF8" call can be turned
133		* into an inline function.
134		*/
135
136		/* --------------------------------------------------------------------- */
137
138		static ConversionResult winpr_ConvertUTF16toUTF8_Internal(const uint16_t** sourceStart,
139		const uint16_t* sourceEnd,
140		uint8_t** targetStart, uint8_t* targetEnd,
141		ConversionFlags flags)
142	362k	{
143	362k	bool computeLength = (!targetEnd) ? true : false;
144	362k	const uint16_t* source = *sourceStart;
145	362k	uint8_t* target = *targetStart;
146	362k	ConversionResult result = conversionOK;
147
148	7.11M	while (source < sourceEnd)
149	6.77M	{
150	6.77M	uint32_t ch = 0;
151	6.77M	unsigned short bytesToWrite = 0;
152	6.77M	const uint32_t byteMask = 0xBF;
153	6.77M	const uint32_t byteMark = 0x80;
154	6.77M	const uint16_t* oldSource =
155	6.77M	source; /* In case we have to back up because of target overflow. */
156
157	6.77M	ch = *source++;
158
159		/* If we have a surrogate pair, convert to UTF32 first. */
160	6.77M	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
161	1.67k	{
162		/* If the 16 bits following the high surrogate are in the source buffer... */
163	1.67k	if (source < sourceEnd)
164	1.64k	{
165	1.64k	uint32_t ch2 = *source;
166
167		/* If it's a low surrogate, convert to UTF32. */
168	1.64k	if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
169	859	{
170	859	ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) +
171	859	halfBase;
172	859	++source;
173	859	}
174	790	else if (flags == strictConversion)
175	790	{
176		/* it's an unpaired high surrogate */
177	790	--source; /* return to the illegal value itself */
178	790	result = sourceIllegal;
179	790	break;
180	790	}
181	1.64k	}
182	26	else
183	26	{
184		/* We don't have the 16 bits following the high surrogate. */
185	26	--source; /* return to the high surrogate */
186	26	result = sourceExhausted;
187	26	break;
188	26	}
189	1.67k	}
190	6.77M	else if (flags == strictConversion)
191	6.77M	{
192		/* UTF-16 surrogate values are illegal in UTF-32 */
193	6.77M	if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
194	2.93k	{
195	2.93k	--source; /* return to the illegal value itself */
196	2.93k	result = sourceIllegal;
197	2.93k	break;
198	2.93k	}
199	6.77M	}
200
201		/* Figure out how many bytes the result will require */
202	6.76M	if (ch < (uint32_t)0x80)
203	5.58M	{
204	5.58M	bytesToWrite = 1;
205	5.58M	}
206	1.18M	else if (ch < (uint32_t)0x800)
207	104k	{
208	104k	bytesToWrite = 2;
209	104k	}
210	1.08M	else if (ch < (uint32_t)0x10000)
211	1.08M	{
212	1.08M	bytesToWrite = 3;
213	1.08M	}
214	859	else if (ch < (uint32_t)0x110000)
215	859	{
216	859	bytesToWrite = 4;
217	859	}
218	0	else
219	0	{
220	0	bytesToWrite = 3;
221	0	ch = UNI_REPLACEMENT_CHAR;
222	0	}
223
224	6.76M	target += bytesToWrite;
225
226	6.76M	if ((target > targetEnd) && (!computeLength))
227	16.9k	{
228	16.9k	source = oldSource; /* Back up source pointer! */
229	16.9k	target -= bytesToWrite;
230	16.9k	result = targetExhausted;
231	16.9k	break;
232	16.9k	}
233
234	6.75M	if (!computeLength)
235	6.21M	{
236	6.21M	switch (bytesToWrite)
237	6.21M	{
238		/* note: everything falls through. */
239	428	case 4:
240	428	*--target = (uint8_t)((ch \| byteMark) & byteMask);
241	428	ch >>= 6;
242		/* fallthrough */
243	428	WINPR_FALLTHROUGH
244	717k	case 3:
245	717k	*--target = (uint8_t)((ch \| byteMark) & byteMask);
246	717k	ch >>= 6;
247		/* fallthrough */
248	717k	WINPR_FALLTHROUGH
249
250	804k	case 2:
251	804k	*--target = (uint8_t)((ch \| byteMark) & byteMask);
252	804k	ch >>= 6;
253		/* fallthrough */
254	804k	WINPR_FALLTHROUGH
255
256	6.21M	case 1:
257	6.21M	*--target = (uint8_t)(ch \| firstByteMark[bytesToWrite]);
258	6.21M	}
259	6.21M	}
260	536k	else
261	536k	{
262	536k	switch (bytesToWrite)
263	536k	{
264		/* note: everything falls through. */
265	431	case 4:
266	431	--target;
267		/* fallthrough */
268	431	WINPR_FALLTHROUGH
269
270	350k	case 3:
271	350k	--target;
272		/* fallthrough */
273	350k	WINPR_FALLTHROUGH
274
275	369k	case 2:
276	369k	--target;
277		/* fallthrough */
278	369k	WINPR_FALLTHROUGH
279
280	536k	case 1:
281	536k	--target;
282	536k	}
283	536k	}
284
285	6.75M	target += bytesToWrite;
286	6.75M	}
287
288	362k	*sourceStart = source;
289	362k	*targetStart = target;
290	362k	return result;
291	362k	}
292
293		/* --------------------------------------------------------------------- */
294
295		/*
296		* Utility routine to tell whether a sequence of bytes is legal UTF-8.
297		* This must be called with the length pre-determined by the first byte.
298		* If not calling this from ConvertUTF8to*, then the length can be set by:
299		* length = trailingBytesForUTF8[*source]+1;
300		* and the sequence is illegal right away if there aren't that many bytes
301		* available.
302		* If presented with a length > 4, this returns false. The Unicode
303		* definition of UTF-8 goes up to 4-byte sequences.
304		*/
305
306		static bool isLegalUTF8(const uint8_t* source, int length)
307	12.5M	{
308	12.5M	uint8_t a = 0;
309	12.5M	const uint8_t* srcptr = source + length;
310
311	12.5M	switch (length)
312	12.5M	{
313	1	default:
314	1	return false;
315
316		/* Everything else falls through when "true"... */
317	812	case 4:
318	812	if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF)
319	10	return false;
320		/* fallthrough */
321	802	WINPR_FALLTHROUGH
322
323	1.48k	case 3:
324	1.48k	if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF)
325	14	return false;
326		/* fallthrough */
327	1.46k	WINPR_FALLTHROUGH
328
329	1.68k	case 2:
330	1.68k	if ((a = (*--srcptr)) > 0xBF)
331	2	return false;
332
333	1.68k	switch (*source)
334	1.68k	{
335		/* no fall-through in this inner switch */
336	208	case 0xE0:
337	208	if (a < 0xA0)
338	10	return false;
339
340	198	break;
341
342	227	case 0xED:
343	227	if (a > 0x9F)
344	3	return false;
345
346	224	break;
347
348	224	case 0xF0:
349	211	if (a < 0x90)
350	9	return false;
351
352	202	break;
353
354	584	case 0xF4:
355	584	if (a > 0x8F)
356	2	return false;
357
358	582	break;
359
360	582	default:
361	452	if (a < 0x80)
362	6	return false;
363	446	break;
364	1.68k	}
365		/* fallthrough */
366	1.65k	WINPR_FALLTHROUGH
367
368	12.5M	case 1:
369	12.5M	if (source >= 0x80 && source < 0xC2)
370	23	return false;
371	12.5M	}
372
373	12.5M	if (*source > 0xF4)
374	1	return false;
375
376	12.5M	return true;
377	12.5M	}
378
379		/* --------------------------------------------------------------------- */
380
381		static ConversionResult winpr_ConvertUTF8toUTF16_Internal(const uint8_t** sourceStart,
382		const uint8_t* sourceEnd,
383		uint16_t** targetStart,
384		uint16_t* targetEnd,
385		ConversionFlags flags)
386	320k	{
387	320k	bool computeLength = (!targetEnd) ? true : false;
388	320k	ConversionResult result = conversionOK;
389	320k	const uint8_t* source = *sourceStart;
390	320k	uint16_t* target = *targetStart;
391
392	12.8M	while (source < sourceEnd)
393	12.5M	{
394	12.5M	uint32_t ch = 0;
395	12.5M	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
396
397	12.5M	if ((source + extraBytesToRead) >= sourceEnd)
398	1	{
399	1	result = sourceExhausted;
400	1	break;
401	1	}
402
403		/* Do this check whether lenient or strict */
404	12.5M	if (!isLegalUTF8(source, extraBytesToRead + 1))
405	81	{
406	81	result = sourceIllegal;
407	81	break;
408	81	}
409
410		/*
411		* The cases all fall through. See "Note A" below.
412		*/
413	12.5M	switch (extraBytesToRead)
414	12.5M	{
415	0	case 5:
416	0	ch += *source++;
417	0	ch <<= 6; /* remember, illegal UTF-8 */
418		/* fallthrough */
419	0	WINPR_FALLTHROUGH
420
421	0	case 4:
422	0	ch += *source++;
423	0	ch <<= 6; /* remember, illegal UTF-8 */
424		/* fallthrough */
425	0	WINPR_FALLTHROUGH
426
427	790	case 3:
428	790	ch += *source++;
429	790	ch <<= 6;
430		/* fallthrough */
431	790	WINPR_FALLTHROUGH
432
433	1.44k	case 2:
434	1.44k	ch += *source++;
435	1.44k	ch <<= 6;
436		/* fallthrough */
437	1.44k	WINPR_FALLTHROUGH
438
439	1.64k	case 1:
440	1.64k	ch += *source++;
441	1.64k	ch <<= 6;
442		/* fallthrough */
443	1.64k	WINPR_FALLTHROUGH
444
445	12.5M	case 0:
446	12.5M	ch += *source++;
447	12.5M	}
448
449	12.5M	ch -= offsetsFromUTF8[extraBytesToRead];
450
451	12.5M	if ((target >= targetEnd) && (!computeLength))
452	0	{
453	0	source -= (extraBytesToRead + 1); /* Back up source pointer! */
454	0	result = targetExhausted;
455	0	break;
456	0	}
457
458	12.5M	if (ch <= UNI_MAX_BMP)
459	12.5M	{
460		/* Target is a character <= 0xFFFF */
461		/* UTF-16 surrogate values are illegal in UTF-32 */
462	12.5M	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
463	0	{
464	0	if (flags == strictConversion)
465	0	{
466	0	source -= (extraBytesToRead + 1); /* return to the illegal value itself */
467	0	result = sourceIllegal;
468	0	break;
469	0	}
470	0	else
471	0	{
472	0	if (!computeLength)
473	0	*target++ = UNI_REPLACEMENT_CHAR;
474	0	else
475	0	target++;
476	0	}
477	0	}
478	12.5M	else
479	12.5M	{
480	12.5M	if (!computeLength)
481	9.20M	target++ = (uint16_t)ch; / normal case */
482	3.30M	else
483	3.30M	target++;
484	12.5M	}
485	12.5M	}
486	790	else if (ch > UNI_MAX_UTF16)
487	0	{
488	0	if (flags == strictConversion)
489	0	{
490	0	result = sourceIllegal;
491	0	source -= (extraBytesToRead + 1); /* return to the start */
492	0	break; /* Bail out; shouldn't continue */
493	0	}
494	0	else
495	0	{
496	0	if (!computeLength)
497	0	*target++ = UNI_REPLACEMENT_CHAR;
498	0	else
499	0	target++;
500	0	}
501	0	}
502	790	else
503	790	{
504		/* target is a character in range 0xFFFF - 0x10FFFF. */
505	790	if ((target + 1 >= targetEnd) && (!computeLength))
506	0	{
507	0	source -= (extraBytesToRead + 1); /* Back up source pointer! */
508	0	result = targetExhausted;
509	0	break;
510	0	}
511
512	790	ch -= halfBase;
513
514	790	if (!computeLength)
515	388	{
516	388	*target++ = (uint16_t)((ch >> halfShift) + UNI_SUR_HIGH_START);
517	388	*target++ = (uint16_t)((ch & halfMask) + UNI_SUR_LOW_START);
518	388	}
519	402	else
520	402	{
521	402	target++;
522	402	target++;
523	402	}
524	790	}
525	12.5M	}
526
527	320k	*sourceStart = source;
528	320k	*targetStart = target;
529	320k	return result;
530	320k	}
531
532		/**
533		* WinPR built-in Unicode API
534		*/
535
536		static int winpr_ConvertUTF8toUTF16(const uint8_t* src, int cchSrc, uint16_t* dst, int cchDst)
537	320k	{
538	320k	size_t length = 0;
539	320k	uint16_t* dstBeg = NULL;
540	320k	uint16_t* dstEnd = NULL;
541	320k	const uint8_t* srcBeg = NULL;
542	320k	const uint8_t* srcEnd = NULL;
543	320k	ConversionResult result = sourceIllegal;
544
545	320k	if (cchSrc == -1)
546	0	cchSrc = strlen((char*)src) + 1;
547
548	320k	srcBeg = src;
549	320k	srcEnd = &src[cchSrc];
550
551	320k	if (cchDst == 0)
552	1.04k	{
553	1.04k	result =
554	1.04k	winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
555
556	1.04k	length = dstBeg - (uint16_t*)NULL;
557	1.04k	}
558	319k	else
559	319k	{
560	319k	dstBeg = dst;
561	319k	dstEnd = &dst[cchDst];
562
563	319k	result =
564	319k	winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
565
566	319k	length = dstBeg - dst;
567	319k	}
568
569	320k	if (result == targetExhausted)
570	0	{
571	0	SetLastError(ERROR_INSUFFICIENT_BUFFER);
572	0	return 0;
573	0	}
574
575	320k	return (result == conversionOK) ? length : 0;
576	320k	}
577
578		static int winpr_ConvertUTF16toUTF8(const uint16_t* src, int cchSrc, uint8_t* dst, int cchDst)
579	362k	{
580	362k	size_t length = 0;
581	362k	uint8_t* dstBeg = NULL;
582	362k	uint8_t* dstEnd = NULL;
583	362k	const uint16_t* srcBeg = NULL;
584	362k	const uint16_t* srcEnd = NULL;
585	362k	ConversionResult result = sourceIllegal;
586
587	362k	if (cchSrc == -1)
588	0	cchSrc = _wcslen((uint16_t*)src) + 1;
589
590	362k	srcBeg = src;
591	362k	srcEnd = &src[cchSrc];
592
593	362k	if (cchDst == 0)
594	40.9k	{
595	40.9k	result =
596	40.9k	winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
597
598	40.9k	length = dstBeg - ((uint8_t*)NULL);
599	40.9k	}
600	321k	else
601	321k	{
602	321k	dstBeg = dst;
603	321k	dstEnd = &dst[cchDst];
604
605	321k	result =
606	321k	winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
607
608	321k	length = dstBeg - dst;
609	321k	}
610
611	362k	if (result == targetExhausted)
612	16.9k	{
613	16.9k	SetLastError(ERROR_INSUFFICIENT_BUFFER);
614	16.9k	return 0;
615	16.9k	}
616
617	345k	return (result == conversionOK) ? length : 0;
618	362k	}
619
620		/* --------------------------------------------------------------------- */
621
622		int int_MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, int cbMultiByte,
623		LPWSTR lpWideCharStr, int cchWideChar)
624	320k	{
625	320k	size_t cbCharLen = (size_t)cbMultiByte;
626
627	320k	WINPR_UNUSED(dwFlags);
628
629		/* If cbMultiByte is 0, the function fails */
630	320k	if ((cbMultiByte == 0) \|\| (cbMultiByte < -1))
631	0	return 0;
632
633	320k	if (cchWideChar < 0)
634	0	return -1;
635
636	320k	if (cbMultiByte < 0)
637	0	{
638	0	const size_t len = strlen(lpMultiByteStr);
639	0	if (len >= INT32_MAX)
640	0	return 0;
641	0	cbCharLen = (int)len + 1;
642	0	}
643	320k	else
644	320k	cbCharLen = cbMultiByte;
645
646	320k	WINPR_ASSERT(lpMultiByteStr);
647	320k	switch (CodePage)
648	320k	{
649	0	case CP_ACP:
650	320k	case CP_UTF8:
651	320k	break;
652
653	0	default:
654	0	WLog_ERR(TAG, "Unsupported encoding %u", CodePage);
655	0	return 0;
656	320k	}
657
658	320k	return winpr_ConvertUTF8toUTF16((const uint8_t*)lpMultiByteStr, cbCharLen,
659	320k	(uint16_t*)lpWideCharStr, cchWideChar);
660	320k	}
661
662		int int_WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar,
663		LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar,
664		LPBOOL lpUsedDefaultChar)
665	362k	{
666	362k	size_t cbCharLen = (size_t)cchWideChar;
667
668	362k	WINPR_UNUSED(dwFlags);
669		/* If cchWideChar is 0, the function fails */
670	362k	if ((cchWideChar == 0) \|\| (cchWideChar < -1))
671	0	return 0;
672
673	362k	if (cbMultiByte < 0)
674	0	return -1;
675
676	362k	WINPR_ASSERT(lpWideCharStr);
677		/* If cchWideChar is -1, the string is null-terminated */
678	362k	if (cchWideChar == -1)
679	0	{
680	0	const size_t len = _wcslen(lpWideCharStr);
681	0	if (len >= INT32_MAX)
682	0	return 0;
683	0	cbCharLen = (int)len + 1;
684	0	}
685	362k	else
686	362k	cbCharLen = cchWideChar;
687
688		/*
689		* if cbMultiByte is 0, the function returns the required buffer size
690		* in bytes for lpMultiByteStr and makes no use of the output parameter itself.
691		*/
692
693	362k	return winpr_ConvertUTF16toUTF8((const uint16_t*)lpWideCharStr, cbCharLen,
694	362k	(uint8_t*)lpMultiByteStr, cbMultiByte);
695	362k	}