/src/FreeRDP/winpr/libwinpr/crt/unicode_builtin.c

Source (jump to first uncovered line)
/*
 * Copyright 2001-2004 Unicode, Inc.
 *
 * Disclaimer
 *
 * This source code is provided as is by Unicode, Inc. No claims are
 * made as to fitness for any particular purpose. No warranties of any
 * kind are expressed or implied. The recipient agrees to determine
 * applicability of information provided. If this file has been
 * purchased on magnetic or optical media from Unicode, Inc., the
 * sole remedy for any claim will be exchange of defective media
 * within 90 days of receipt.
 *
 * Limitations on Rights to Redistribute This Code
 *
 * Unicode, Inc. hereby grants the right to freely use the information
 * supplied in this file in the creation of products supporting the
 * Unicode Standard, and to make copies of this file in any form
 * for internal or external distribution as long as this notice
 * remains attached.
 */

/* ---------------------------------------------------------------------

Conversions between UTF32, UTF-16, and UTF-8. Source code file.
Author: Mark E. Davis, 1994.
Rev History: Rick McGowan, fixes & updates May 2001.
Sept 2001: fixed const & error conditions per
mods suggested by S. Parent & A. Lillich.
June 2002: Tim Dodd added detection and handling of incomplete
source sequences, enhanced error detection, added casts
to eliminate compiler warnings.
July 2003: slight mods to back out aggressive FFFE detection.
Jan 2004: updated switches in from-UTF8 conversions.
Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.

See the header file "utf.h" for complete documentation.

------------------------------------------------------------------------ */

#include <winpr/wtypes.h>
#include <winpr/string.h>
#include <winpr/assert.h>
#include <winpr/cast.h>

#include "unicode.h"

#include "../log.h"
#define TAG WINPR_TAG("unicode")

/*
 * Character Types:
 *
 * UTF8:    uint8_t   8 bits
 * UTF16: uint16_t  16 bits
 * UTF32: uint32_t  32 bits
 */

/*
 * Some fundamental constants.
 *
 * Every expansion is fully parenthesized so the macros behave as a single
 * primary expression wherever they are used (e.g. as a sizeof operand or
 * next to a postfix operator).
 */
#define UNI_REPLACEMENT_CHAR ((uint32_t)0x0000FFFD) /* U+FFFD REPLACEMENT CHARACTER */
#define UNI_MAX_BMP ((uint32_t)0x0000FFFF)          /* last code point that fits one UTF-16 unit */
#define UNI_MAX_UTF16 ((uint32_t)0x0010FFFF)        /* last code point reachable via surrogates */
#define UNI_MAX_UTF32 ((uint32_t)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32 ((uint32_t)0x0010FFFF)

/* Status reported by the internal buffer conversion routines. */
typedef enum
{
  conversionOK = 0,    /* the whole source range was converted */
  sourceExhausted = 1, /* source ended in the middle of a character */
  targetExhausted = 2, /* target buffer too small for the next character */
  sourceIllegal = 3    /* malformed or disallowed source sequence */
} ConversionResult;

/* Selects how the converters treat sequences they consider illegal. */
typedef enum
{
  strictConversion = 0, /* stop and report sourceIllegal */
  lenientConversion = 1 /* keep going instead of failing where the converter allows it */
} ConversionFlags;

/* A surrogate pair carries 20 bits, split into two halves of 10 bits each. */
static const int halfShift = 10; /* used for shifting by 10 bits */

static const uint32_t halfBase = 0x0010000UL; /* first code point encoded with surrogates */
static const uint32_t halfMask = 0x3FFUL;     /* selects the low 10 bits */

/*
 * UTF-16 surrogate ranges. Expansions are parenthesized so each macro is a
 * single primary expression in every context.
 */
#define UNI_SUR_HIGH_START ((uint32_t)0xD800)
#define UNI_SUR_HIGH_END ((uint32_t)0xDBFF)
#define UNI_SUR_LOW_START ((uint32_t)0xDC00)
#define UNI_SUR_LOW_END ((uint32_t)0xDFFF)

/* --------------------------------------------------------------------- */

/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it:
 *
 *   0x00-0xBF -> 0    0xC0-0xDF -> 1    0xE0-0xEF -> 2
 *   0xF0-0xF7 -> 3    0xF8-0xFB -> 4    0xFC-0xFF -> 5
 *
 * Note that *legal* UTF-8 sequences never have 4 or 5 trailing bytes. Those
 * entries are left as-is for anyone who may want to do such conversion, which
 * was allowed in earlier algorithms.
 */
static const char trailingBytesForUTF8[256] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};

/*
 * Magic values subtracted from a buffer value during UTF8 conversion.
 * This table contains as many values as there might be trailing bytes
 * in a UTF-8 sequence.
 *
 * The table is indexed by the number of trailing bytes. The UTF-8 decoder
 * in this file sums the raw sequence bytes with 6-bit shifts, so the lead
 * byte prefix and the 0x80 marker of each continuation byte end up in the
 * sum; subtracting the matching entry removes them again. For example,
 * entry 1 is (0xC0 << 6) + 0x80 = 0x3080 and entry 2 is
 * (0xE0 << 12) + (0x80 << 6) + 0x80 = 0xE2080.
 */
static const uint32_t offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
                                           0x03C82080UL, 0xFA082080UL, 0x82082080UL };

/*
 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
 * into the first byte, depending on how many bytes follow.  There are
 * as many entries in this table as there are UTF-8 sequence types.
 * (I.e., one byte sequence, two byte... etc.). Remember that sequence
 * for *legal* UTF-8 will be 4 or fewer bytes total.
 *
 * The UTF-16 to UTF-8 encoder in this file indexes the table with the total
 * number of bytes in the sequence being written (1 to 4).
 */
static const uint8_t firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

/* --------------------------------------------------------------------- */

/* The interface converts a whole buffer to avoid function-call overhead.
 * Constants have been gathered. Loops & conditionals have been removed as
 * much as possible for efficiency, in favor of drop-through switches.
 * (See "Note A" at the bottom of the file for equivalent code.)
 * If your compiler supports it, the "isLegalUTF8" call can be turned
 * into an inline function.
 */

/* --------------------------------------------------------------------- */

/*
 * Convert the UTF-16 code units in [*sourceStart, sourceEnd) to UTF-8.
 *
 * sourceStart  in: first unit to convert; out: first unit NOT consumed
 *              (on failure it points at the offending character).
 * sourceEnd    one past the last source unit.
 * targetStart  in: where to start writing; out: one past the last byte
 *              produced.
 * targetEnd    one past the end of the output buffer. Passing NULL selects
 *              "compute length" mode: nothing is written and *targetStart is
 *              only advanced by the number of bytes that would be produced.
 * flags        with strictConversion an unpaired surrogate stops the
 *              conversion with sourceIllegal; otherwise it is encoded as-is.
 *
 * Returns conversionOK, sourceExhausted (input ends after a high surrogate),
 * targetExhausted (next character does not fit) or sourceIllegal.
 */
static ConversionResult winpr_ConvertUTF16toUTF8_Internal(const uint16_t** sourceStart,
                                                          const uint16_t* sourceEnd,
                                                          uint8_t** targetStart, uint8_t* targetEnd,
                                                          ConversionFlags flags)
{
  const bool computeLength = (targetEnd == NULL);
  const uint32_t byteMask = 0xBF;
  const uint32_t byteMark = 0x80;
  const uint16_t* source = *sourceStart;
  uint8_t* target = *targetStart;
  ConversionResult result = conversionOK;

  while (source < sourceEnd)
  {
    /* Start of the current character, restored whenever we have to bail out. */
    const uint16_t* oldSource = source;
    unsigned short bytesToWrite = 0;
    uint32_t ch = *source++;

    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
    {
      uint32_t ch2 = 0;

      if (source >= sourceEnd)
      {
        /* The unit that should follow the high surrogate is missing. */
        source = oldSource;
        result = sourceExhausted;
        break;
      }

      ch2 = *source;

      if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
      {
        /* Valid pair: combine both halves into one code point. */
        ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) +
             halfBase;
        ++source;
      }
      else if (flags == strictConversion)
      {
        /* Unpaired high surrogate. */
        source = oldSource;
        result = sourceIllegal;
        break;
      }
    }
    else if ((flags == strictConversion) && (ch >= UNI_SUR_LOW_START) &&
             (ch <= UNI_SUR_LOW_END))
    {
      /* Low surrogate without a preceding high surrogate. */
      source = oldSource;
      result = sourceIllegal;
      break;
    }

    /* Figure out how many bytes the result will require */
    if (ch < (uint32_t)0x80)
    {
      bytesToWrite = 1;
    }
    else if (ch < (uint32_t)0x800)
    {
      bytesToWrite = 2;
    }
    else if (ch < (uint32_t)0x10000)
    {
      bytesToWrite = 3;
    }
    else if (ch < (uint32_t)0x110000)
    {
      bytesToWrite = 4;
    }
    else
    {
      bytesToWrite = 3;
      ch = UNI_REPLACEMENT_CHAR;
    }

    if (!computeLength)
    {
      uint8_t* out = NULL;
      unsigned short remaining = bytesToWrite;

      /*
       * Check the remaining capacity BEFORE advancing: forming a pointer
       * beyond one-past-the-end of the buffer is undefined behaviour.
       */
      if ((size_t)(targetEnd - target) < bytesToWrite)
      {
        source = oldSource; /* Back up source pointer! */
        result = targetExhausted;
        break;
      }

      /* Emit the continuation bytes back to front, 6 bits at a time... */
      out = target + bytesToWrite;

      while (remaining > 1)
      {
        *--out = (uint8_t)((ch | byteMark) & byteMask);
        ch >>= 6;
        remaining--;
      }

      /* ...then the lead byte, tagged with the sequence-length prefix. */
      *--out = (uint8_t)(ch | firstByteMark[bytesToWrite]);
    }

    /* In compute-length mode this is the only effect of the iteration. */
    target += bytesToWrite;
  }

  *sourceStart = source;
  *targetStart = target;
  return result;
}

/* --------------------------------------------------------------------- */

/*
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 * This must be called with the length pre-determined by the first byte.
 * If not calling this from ConvertUTF8to*, then the length can be set by:
 *  length = trailingBytesForUTF8[*source]+1;
 * and the sequence is illegal right away if there aren't that many bytes
 * available.
 * If presented with a length > 4 (or < 1), this returns false.  The Unicode
 * definition of UTF-8 goes up to 4-byte sequences.
 *
 * source  first byte of the sequence; `length` bytes must be readable.
 * length  total number of bytes in the sequence (lead byte included).
 *
 * Returns true when the sequence is well formed: every trailing byte is a
 * continuation byte (0x80..0xBF), the second byte additionally respects the
 * narrower range required after lead bytes 0xE0, 0xED, 0xF0 and 0xF4, and
 * the lead byte itself is neither a stray continuation byte / overlong lead
 * (0x80..0xC1) nor above 0xF4.
 */

static bool isLegalUTF8(const uint8_t* source, int length)
{
  uint8_t lead = 0;
  uint8_t secondMin = 0x80;
  uint8_t secondMax = 0xBF;
  int idx = 0;

  if ((length < 1) || (length > 4))
    return false;

  lead = source[0];

  /* Third and fourth byte: plain continuation bytes. */
  for (idx = length - 1; idx >= 2; idx--)
  {
    if ((source[idx] < 0x80) || (source[idx] > 0xBF))
      return false;
  }

  if (length >= 2)
  {
    /*
     * The second byte must always be a continuation byte; some lead bytes
     * narrow the range further. Both bounds are enforced for every lead
     * byte so that e.g. ED 41 80 or F4 7F 80 80 cannot slip through.
     */
    switch (lead)
    {
      case 0xE0:
        secondMin = 0xA0; /* reject overlong 3-byte forms */
        break;

      case 0xED:
        secondMax = 0x9F; /* reject encoded surrogates U+D800..U+DFFF */
        break;

      case 0xF0:
        secondMin = 0x90; /* reject overlong 4-byte forms */
        break;

      case 0xF4:
        secondMax = 0x8F; /* reject code points above U+10FFFF */
        break;

      default:
        break;
    }

    if ((source[1] < secondMin) || (source[1] > secondMax))
      return false;
  }

  /* Continuation bytes and the overlong leads 0xC0/0xC1 cannot start a sequence. */
  if ((lead >= 0x80) && (lead < 0xC2))
    return false;

  if (lead > 0xF4)
    return false;

  return true;
}

/* --------------------------------------------------------------------- */

/*
 * Convert the UTF-8 bytes in [*sourceStart, sourceEnd) to UTF-16.
 *
 * sourceStart  in: first byte to convert; out: first byte NOT consumed
 *              (on failure it points at the start of the offending sequence).
 * sourceEnd    one past the last source byte.
 * targetStart  in: where to start writing; out: one past the last unit
 *              produced.
 * targetEnd    one past the end of the output buffer. Passing NULL selects
 *              "compute length" mode: nothing is written and *targetStart is
 *              only advanced by the number of units that would be produced.
 * flags        with strictConversion, decoded surrogate values and values
 *              above UNI_MAX_UTF16 stop the conversion with sourceIllegal;
 *              otherwise they are replaced by UNI_REPLACEMENT_CHAR.
 *
 * Every sequence is validated with isLegalUTF8() regardless of flags.
 *
 * Returns conversionOK, sourceExhausted (truncated sequence at the end of
 * the input), targetExhausted or sourceIllegal.
 */
static ConversionResult winpr_ConvertUTF8toUTF16_Internal(const uint8_t** sourceStart,
                                                          const uint8_t* sourceEnd,
                                                          uint16_t** targetStart,
                                                          uint16_t* targetEnd,
                                                          ConversionFlags flags)
{
  const bool computeLength = (targetEnd == NULL);
  ConversionResult result = conversionOK;
  const uint8_t* source = *sourceStart;
  uint16_t* target = *targetStart;

  while (source < sourceEnd)
  {
    /* Start of the current sequence, restored whenever we have to bail out. */
    const uint8_t* seqStart = source;
    uint32_t ch = 0;
    unsigned short remaining = 0;
    unsigned short extraBytesToRead =
        WINPR_ASSERTING_INT_CAST(unsigned short, trailingBytesForUTF8[*source]);

    /*
     * Compare against the number of bytes left instead of computing
     * source + extraBytesToRead, which could point beyond the buffer.
     */
    if ((size_t)(sourceEnd - source) <= extraBytesToRead)
    {
      result = sourceExhausted;
      break;
    }

    /* Do this check whether lenient or strict */
    if (!isLegalUTF8(source, extraBytesToRead + 1))
    {
      result = sourceIllegal;
      break;
    }

    /* Accumulate all bytes, shifting 6 bits between consecutive bytes. */
    for (remaining = extraBytesToRead; remaining > 0; remaining--)
    {
      ch += *source++;
      ch <<= 6;
    }

    ch += *source++;

    /* Strip the lead/continuation marker bits that were summed in. */
    ch -= offsetsFromUTF8[extraBytesToRead];

    if (!computeLength && (target >= targetEnd))
    {
      source = seqStart; /* Back up source pointer! */
      result = targetExhausted;
      break;
    }

    if (ch <= UNI_MAX_BMP)
    {
      /* Target is a character <= 0xFFFF */
      if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
      {
        /* Surrogate values are not valid scalar values. */
        if (flags == strictConversion)
        {
          source = seqStart; /* return to the illegal value itself */
          result = sourceIllegal;
          break;
        }

        if (!computeLength)
          *target = (uint16_t)UNI_REPLACEMENT_CHAR;
      }
      else if (!computeLength)
      {
        *target = (uint16_t)ch; /* normal case */
      }

      target++;
    }
    else if (ch > UNI_MAX_UTF16)
    {
      if (flags == strictConversion)
      {
        source = seqStart; /* return to the start */
        result = sourceIllegal;
        break; /* Bail out; shouldn't continue */
      }

      if (!computeLength)
        *target = (uint16_t)UNI_REPLACEMENT_CHAR;

      target++;
    }
    else
    {
      /* Character above 0xFFFF and up to 0x10FFFF: needs a surrogate pair. */
      if (!computeLength && ((targetEnd - target) < 2))
      {
        source = seqStart; /* Back up source pointer! */
        result = targetExhausted;
        break;
      }

      ch -= halfBase;

      if (!computeLength)
      {
        target[0] = (uint16_t)((ch >> halfShift) + UNI_SUR_HIGH_START);
        target[1] = (uint16_t)((ch & halfMask) + UNI_SUR_LOW_START);
      }

      target += 2;
    }
  }

  *sourceStart = source;
  *targetStart = target;
  return result;
}

/**
 * WinPR built-in Unicode API
 */

/*
 * Strict UTF-8 -> UTF-16 conversion used by int_MultiByteToWideChar().
 *
 * src     UTF-8 input.
 * cchSrc  number of input bytes, or -1 for a NUL-terminated string (the
 *         terminator is then converted as well).
 * dst     output buffer, ignored when cchDst is 0.
 * cchDst  capacity of dst in UTF-16 units; 0 only measures the output.
 *
 * Returns the number of UTF-16 units produced (or required when cchDst is 0),
 * 0 on any failure. ERROR_INSUFFICIENT_BUFFER is set as last error when the
 * output buffer is too small.
 */
static int winpr_ConvertUTF8toUTF16(const uint8_t* src, int cchSrc, uint16_t* dst, int cchDst)
{
  uint16_t* outBase = NULL;
  uint16_t* outCur = NULL;
  uint16_t* outEnd = NULL;
  const uint8_t* inCur = NULL;
  const uint8_t* inEnd = NULL;
  size_t produced = 0;
  ConversionResult rc = sourceIllegal;

  if (cchSrc == -1)
    cchSrc = (int)strnlen((const char*)src, INT32_MAX - 1) + 1;

  inCur = src;
  inEnd = &src[cchSrc];

  /* With cchDst == 0 all output pointers stay NULL: measure-only mode. */
  if (cchDst != 0)
  {
    outBase = dst;
    outEnd = &dst[cchDst];
  }

  outCur = outBase;
  rc = winpr_ConvertUTF8toUTF16_Internal(&inCur, inEnd, &outCur, outEnd, strictConversion);
  produced = outCur - outBase;

  switch (rc)
  {
    case conversionOK:
      return WINPR_ASSERTING_INT_CAST(int, produced);

    case targetExhausted:
      SetLastError(ERROR_INSUFFICIENT_BUFFER);
      return 0;

    default:
      return 0;
  }
}

/*
 * Strict UTF-16 -> UTF-8 conversion used by int_WideCharToMultiByte().
 *
 * src     UTF-16 input.
 * cchSrc  number of input units, or -1 for a NUL-terminated string (the
 *         terminator is then converted as well).
 * dst     output buffer, ignored when cchDst is 0.
 * cchDst  capacity of dst in bytes; 0 only measures the output.
 *
 * Returns the number of bytes produced (or required when cchDst is 0),
 * 0 on any failure. ERROR_INSUFFICIENT_BUFFER is set as last error when the
 * output buffer is too small.
 */
static int winpr_ConvertUTF16toUTF8(const uint16_t* src, int cchSrc, uint8_t* dst, int cchDst)
{
  uint8_t* outBase = NULL;
  uint8_t* outCur = NULL;
  uint8_t* outEnd = NULL;
  const uint16_t* inCur = NULL;
  const uint16_t* inEnd = NULL;
  size_t produced = 0;
  ConversionResult rc = sourceIllegal;

  if (cchSrc == -1)
    cchSrc = (int)_wcsnlen((const WCHAR*)src, INT32_MAX - 1) + 1;

  inCur = src;
  inEnd = &src[cchSrc];

  /* With cchDst == 0 all output pointers stay NULL: measure-only mode. */
  if (cchDst != 0)
  {
    outBase = dst;
    outEnd = &dst[cchDst];
  }

  outCur = outBase;
  rc = winpr_ConvertUTF16toUTF8_Internal(&inCur, inEnd, &outCur, outEnd, strictConversion);
  produced = outCur - outBase;

  switch (rc)
  {
    case conversionOK:
      return WINPR_ASSERTING_INT_CAST(int, produced);

    case targetExhausted:
      SetLastError(ERROR_INSUFFICIENT_BUFFER);
      return 0;

    default:
      return 0;
  }
}

/* --------------------------------------------------------------------- */

/*
 * Built-in MultiByteToWideChar replacement (UTF-8 -> UTF-16 only).
 *
 * CodePage        must be CP_ACP or CP_UTF8; anything else is logged and
 *                 rejected.
 * dwFlags         ignored.
 * lpMultiByteStr  input string, must not be NULL.
 * cbMultiByte     input size in bytes, or -1 for a NUL-terminated string
 *                 (the terminator is then included in the conversion).
 * lpWideCharStr   output buffer, unused when cchWideChar is 0.
 * cchWideChar     output capacity in WCHARs; 0 requests the required size.
 *
 * Returns the number of WCHARs written/required, 0 on failure, or -1 when
 * cchWideChar is negative.
 */
int int_MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, int cbMultiByte,
                            LPWSTR lpWideCharStr, int cchWideChar)
{
  size_t cbCharLen = 0;

  WINPR_UNUSED(dwFlags);

  /* If cbMultiByte is 0, the function fails */
  if ((cbMultiByte == 0) || (cbMultiByte < -1))
    return 0;

  if (cchWideChar < 0)
    return -1;

  /* Validate the pointer before strlen() below dereferences it. */
  WINPR_ASSERT(lpMultiByteStr);

  if (cbMultiByte < 0)
  {
    /* NUL-terminated input: convert the terminator as well. */
    const size_t len = strlen(lpMultiByteStr);
    if (len >= INT32_MAX)
      return 0;
    cbCharLen = len + 1;
  }
  else
    cbCharLen = (size_t)cbMultiByte;

  switch (CodePage)
  {
    case CP_ACP:
    case CP_UTF8:
      break;

    default:
      WLog_ERR(TAG, "Unsupported encoding %u", CodePage);
      return 0;
  }

  return winpr_ConvertUTF8toUTF16((const uint8_t*)lpMultiByteStr,
                                  WINPR_ASSERTING_INT_CAST(int, cbCharLen),
                                  (uint16_t*)lpWideCharStr, cchWideChar);
}

/*
 * Built-in WideCharToMultiByte replacement (UTF-16 -> UTF-8).
 *
 * CodePage, dwFlags, lpDefaultChar and lpUsedDefaultChar are not consulted.
 *
 * lpWideCharStr   input string, must not be NULL.
 * cchWideChar     input size in WCHARs, or -1 for a NUL-terminated string
 *                 (the terminator is then included in the conversion).
 * lpMultiByteStr  output buffer, unused when cbMultiByte is 0.
 * cbMultiByte     output capacity in bytes; 0 requests the required size.
 *
 * Returns the number of bytes written/required, 0 on failure, or -1 when
 * cbMultiByte is negative.
 */
int int_WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar,
                            LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar,
                            LPBOOL lpUsedDefaultChar)
{
  size_t cchInput = 0;

  WINPR_UNUSED(dwFlags);

  /* Only -1 and strictly positive lengths are accepted. */
  if ((cchWideChar != -1) && (cchWideChar <= 0))
    return 0;

  if (cbMultiByte < 0)
    return -1;

  WINPR_ASSERT(lpWideCharStr);

  if (cchWideChar != -1)
  {
    cchInput = (size_t)cchWideChar;
  }
  else
  {
    /* NUL-terminated input: convert the terminator as well. */
    const size_t len = _wcslen(lpWideCharStr);
    if (len >= INT32_MAX)
      return 0;
    cchInput = len + 1;
  }

  /*
   * With cbMultiByte == 0 the converter only reports the number of bytes
   * needed and never touches lpMultiByteStr.
   */
  return winpr_ConvertUTF16toUTF8((const uint16_t*)lpWideCharStr,
                                  WINPR_ASSERTING_INT_CAST(int, cchInput),
                                  (uint8_t*)lpMultiByteStr, cbMultiByte);
}

Coverage Report

Created: 2025-08-03 07:10

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright 2001-2004 Unicode, Inc.
3		*
4		* Disclaimer
5		*
6		* This source code is provided as is by Unicode, Inc. No claims are
7		* made as to fitness for any particular purpose. No warranties of any
8		* kind are expressed or implied. The recipient agrees to determine
9		* applicability of information provided. If this file has been
10		* purchased on magnetic or optical media from Unicode, Inc., the
11		* sole remedy for any claim will be exchange of defective media
12		* within 90 days of receipt.
13		*
14		* Limitations on Rights to Redistribute This Code
15		*
16		* Unicode, Inc. hereby grants the right to freely use the information
17		* supplied in this file in the creation of products supporting the
18		* Unicode Standard, and to make copies of this file in any form
19		* for internal or external distribution as long as this notice
20		* remains attached.
21		*/
22
23		/* ---------------------------------------------------------------------
24
25		Conversions between UTF32, UTF-16, and UTF-8. Source code file.
26		Author: Mark E. Davis, 1994.
27		Rev History: Rick McGowan, fixes & updates May 2001.
28		Sept 2001: fixed const & error conditions per
29		mods suggested by S. Parent & A. Lillich.
30		June 2002: Tim Dodd added detection and handling of incomplete
31		source sequences, enhanced error detection, added casts
32		to eliminate compiler warnings.
33		July 2003: slight mods to back out aggressive FFFE detection.
34		Jan 2004: updated switches in from-UTF8 conversions.
35		Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
36
37		See the header file "utf.h" for complete documentation.
38
39		------------------------------------------------------------------------ */
40
41		#include <winpr/wtypes.h>
42		#include <winpr/string.h>
43		#include <winpr/assert.h>
44		#include <winpr/cast.h>
45
46		#include "unicode.h"
47
48		#include "../log.h"
49		#define TAG WINPR_TAG("unicode")
50
51		/*
52		* Character Types:
53		*
54		* UTF8: uint8_t 8 bits
55		* UTF16: uint16_t 16 bits
56		* UTF32: uint32_t 32 bits
57		*/
58
59		/* Some fundamental constants */
60	0	#define UNI_REPLACEMENT_CHAR (uint32_t)0x0000FFFD
61	15.1M	#define UNI_MAX_BMP (uint32_t)0x0000FFFF
62	792	#define UNI_MAX_UTF16 (uint32_t)0x0010FFFF
63		#define UNI_MAX_UTF32 (uint32_t)0x7FFFFFFF
64		#define UNI_MAX_LEGAL_UTF32 (uint32_t)0x0010FFFF
65
66		typedef enum
67		{
68		conversionOK, /* conversion successful */
69		sourceExhausted, /* partial character in source, but hit end */
70		targetExhausted, /* insuff. room in target for conversion */
71		sourceIllegal /* source sequence is illegal/malformed */
72		} ConversionResult;
73
74		typedef enum
75		{
76		strictConversion = 0,
77		lenientConversion
78		} ConversionFlags;
79
80		static const int halfShift = 10; /* used for shifting by 10 bits */
81
82		static const uint32_t halfBase = 0x0010000UL;
83		static const uint32_t halfMask = 0x3FFUL;
84
85	46.6M	#define UNI_SUR_HIGH_START (uint32_t)0xD800
86	118k	#define UNI_SUR_HIGH_END (uint32_t)0xDBFF
87	16.4M	#define UNI_SUR_LOW_START (uint32_t)0xDC00
88	118k	#define UNI_SUR_LOW_END (uint32_t)0xDFFF
89
90		/* --------------------------------------------------------------------- */
91
92		/*
93		* Index into the table below with the first byte of a UTF-8 sequence to
94		* get the number of trailing bytes that are supposed to follow it.
95		* Note that legal UTF-8 values can't have 4 or 5-bytes. The table is
96		* left as-is for anyone who may want to do such conversion, which was
97		* allowed in earlier algorithms.
98		*/
99		static const char trailingBytesForUTF8[256] = {
100		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
102		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
104		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
105		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
106		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
107		2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
108		};
109
110		/*
111		* Magic values subtracted from a buffer value during UTF8 conversion.
112		* This table contains as many values as there might be trailing bytes
113		* in a UTF-8 sequence.
114		*/
115		static const uint32_t offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
116		0x03C82080UL, 0xFA082080UL, 0x82082080UL };
117
118		/*
119		* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
120		* into the first byte, depending on how many bytes follow. There are
121		* as many entries in this table as there are UTF-8 sequence types.
122		* (I.e., one byte sequence, two byte... etc.). Remember that sequence
123		* for legal UTF-8 will be 4 or fewer bytes total.
124		*/
125		static const uint8_t firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
126
127		/* --------------------------------------------------------------------- */
128
129		/* The interface converts a whole buffer to avoid function-call overhead.
130		* Constants have been gathered. Loops & conditionals have been removed as
131		* much as possible for efficiency, in favor of drop-through switches.
132		* (See "Note A" at the bottom of the file for equivalent code.)
133		* If your compiler supports it, the "isLegalUTF8" call can be turned
134		* into an inline function.
135		*/
136
137		/* --------------------------------------------------------------------- */
138
139		static ConversionResult winpr_ConvertUTF16toUTF8_Internal(const uint16_t** sourceStart,
140		const uint16_t* sourceEnd,
141		uint8_t** targetStart, uint8_t* targetEnd,
142		ConversionFlags flags)
143	479k	{
144	479k	bool computeLength = (!targetEnd) ? true : false;
145	479k	const uint16_t* source = *sourceStart;
146	479k	uint8_t* target = *targetStart;
147	479k	ConversionResult result = conversionOK;
148
149	8.70M	while (source < sourceEnd)
150	8.23M	{
151	8.23M	uint32_t ch = 0;
152	8.23M	unsigned short bytesToWrite = 0;
153	8.23M	const uint32_t byteMask = 0xBF;
154	8.23M	const uint32_t byteMark = 0x80;
155	8.23M	const uint16_t* oldSource =
156	8.23M	source; /* In case we have to back up because of target overflow. */
157
158	8.23M	ch = *source++;
159
160		/* If we have a surrogate pair, convert to UTF32 first. */
161	8.23M	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
162	1.48k	{
163		/* If the 16 bits following the high surrogate are in the source buffer... */
164	1.48k	if (source < sourceEnd)
165	1.46k	{
166	1.46k	uint32_t ch2 = *source;
167
168		/* If it's a low surrogate, convert to UTF32. */
169	1.46k	if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
170	997	{
171	997	ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) +
172	997	halfBase;
173	997	++source;
174	997	}
175	467	else if (flags == strictConversion)
176	467	{
177		/* it's an unpaired high surrogate */
178	467	--source; /* return to the illegal value itself */
179	467	result = sourceIllegal;
180	467	break;
181	467	}
182	1.46k	}
183	16	else
184	16	{
185		/* We don't have the 16 bits following the high surrogate. */
186	16	--source; /* return to the high surrogate */
187	16	result = sourceExhausted;
188	16	break;
189	16	}
190	1.48k	}
191	8.23M	else if (flags == strictConversion)
192	8.23M	{
193		/* UTF-16 surrogate values are illegal in UTF-32 */
194	8.23M	if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
195	246	{
196	246	--source; /* return to the illegal value itself */
197	246	result = sourceIllegal;
198	246	break;
199	246	}
200	8.23M	}
201
202		/* Figure out how many bytes the result will require */
203	8.23M	if (ch < (uint32_t)0x80)
204	7.22M	{
205	7.22M	bytesToWrite = 1;
206	7.22M	}
207	1.00M	else if (ch < (uint32_t)0x800)
208	86.0k	{
209	86.0k	bytesToWrite = 2;
210	86.0k	}
211	918k	else if (ch < (uint32_t)0x10000)
212	917k	{
213	917k	bytesToWrite = 3;
214	917k	}
215	997	else if (ch < (uint32_t)0x110000)
216	997	{
217	997	bytesToWrite = 4;
218	997	}
219	0	else
220	0	{
221	0	bytesToWrite = 3;
222	0	ch = UNI_REPLACEMENT_CHAR;
223	0	}
224
225	8.23M	target += bytesToWrite;
226
227	8.23M	if ((target > targetEnd) && (!computeLength))
228	6.38k	{
229	6.38k	source = oldSource; /* Back up source pointer! */
230	6.38k	target -= bytesToWrite;
231	6.38k	result = targetExhausted;
232	6.38k	break;
233	6.38k	}
234
235	8.22M	if (!computeLength)
236	7.65M	{
237	7.65M	switch (bytesToWrite)
238	7.65M	{
239		/* note: everything falls through. */
240	486	case 4:
241	486	*--target = (uint8_t)((ch \| byteMark) & byteMask);
242	486	ch >>= 6;
243		/* fallthrough */
244	486	WINPR_FALLTHROUGH
245	599k	case 3:
246	599k	*--target = (uint8_t)((ch \| byteMark) & byteMask);
247	599k	ch >>= 6;
248		/* fallthrough */
249	599k	WINPR_FALLTHROUGH
250
251	658k	case 2:
252	658k	*--target = (uint8_t)((ch \| byteMark) & byteMask);
253	658k	ch >>= 6;
254		/* fallthrough */
255	658k	WINPR_FALLTHROUGH
256
257	7.65M	case 1:
258	7.65M	*--target = (uint8_t)(ch \| firstByteMark[bytesToWrite]);
259	7.65M	}
260	7.65M	}
261	573k	else
262	573k	{
263	573k	switch (bytesToWrite)
264	573k	{
265		/* note: everything falls through. */
266	511	case 4:
267	511	--target;
268		/* fallthrough */
269	511	WINPR_FALLTHROUGH
270
271	312k	case 3:
272	312k	--target;
273		/* fallthrough */
274	312k	WINPR_FALLTHROUGH
275
276	339k	case 2:
277	339k	--target;
278		/* fallthrough */
279	339k	WINPR_FALLTHROUGH
280
281	573k	case 1:
282	573k	--target;
283	573k	}
284	573k	}
285
286	8.22M	target += bytesToWrite;
287	8.22M	}
288
289	479k	*sourceStart = source;
290	479k	*targetStart = target;
291	479k	return result;
292	479k	}
293
294		/* --------------------------------------------------------------------- */
295
296		/*
297		* Utility routine to tell whether a sequence of bytes is legal UTF-8.
298		* This must be called with the length pre-determined by the first byte.
299		* If not calling this from ConvertUTF8to*, then the length can be set by:
300		* length = trailingBytesForUTF8[*source]+1;
301		* and the sequence is illegal right away if there aren't that many bytes
302		* available.
303		* If presented with a length > 4, this returns false. The Unicode
304		* definition of UTF-8 goes up to 4-byte sequences.
305		*/
306
307		static bool isLegalUTF8(const uint8_t* source, int length)
308	15.1M	{
309	15.1M	uint8_t a = 0;
310	15.1M	const uint8_t* srcptr = source + length;
311
312	15.1M	switch (length)
313	15.1M	{
314	4	default:
315	4	return false;
316
317		/* Everything else falls through when "true"... */
318	815	case 4:
319	815	if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF)
320	12	return false;
321		/* fallthrough */
322	803	WINPR_FALLTHROUGH
323
324	1.63k	case 3:
325	1.63k	if ((a = (*--srcptr)) < 0x80 \|\| a > 0xBF)
326	12	return false;
327		/* fallthrough */
328	1.62k	WINPR_FALLTHROUGH
329
330	1.93k	case 2:
331	1.93k	if ((a = (*--srcptr)) > 0xBF)
332	3	return false;
333
334	1.93k	switch (*source)
335	1.93k	{
336		/* no fall-through in this inner switch */
337	348	case 0xE0:
338	348	if (a < 0xA0)
339	12	return false;
340
341	336	break;
342
343	336	case 0xED:
344	230	if (a > 0x9F)
345	3	return false;
346
347	227	break;
348
349	276	case 0xF0:
350	276	if (a < 0x90)
351	8	return false;
352
353	268	break;
354
355	517	case 0xF4:
356	517	if (a > 0x8F)
357	1	return false;
358
359	516	break;
360
361	560	default:
362	560	if (a < 0x80)
363	6	return false;
364	554	break;
365	1.93k	}
366		/* fallthrough */
367	1.90k	WINPR_FALLTHROUGH
368
369	15.1M	case 1:
370	15.1M	if (source >= 0x80 && source < 0xC2)
371	20	return false;
372	15.1M	}
373
374	15.1M	if (*source > 0xF4)
375	1	return false;
376
377	15.1M	return true;
378	15.1M	}
379
380		/* --------------------------------------------------------------------- */
381
382		static ConversionResult winpr_ConvertUTF8toUTF16_Internal(const uint8_t** sourceStart,
383		const uint8_t* sourceEnd,
384		uint16_t** targetStart,
385		uint16_t* targetEnd,
386		ConversionFlags flags)
387	453k	{
388	453k	bool computeLength = (!targetEnd) ? true : false;
389	453k	ConversionResult result = conversionOK;
390	453k	const uint8_t* source = *sourceStart;
391	453k	uint16_t* target = *targetStart;
392
393	15.5M	while (source < sourceEnd)
394	15.1M	{
395	15.1M	uint32_t ch = 0;
396	15.1M	unsigned short extraBytesToRead =
397	30.2M	WINPR_ASSERTING_INT_CAST(unsigned short, trailingBytesForUTF8[*source]);
398
399	15.1M	if ((source + extraBytesToRead) >= sourceEnd)
400	1	{
401	1	result = sourceExhausted;
402	1	break;
403	1	}
404
405		/* Do this check whether lenient or strict */
406	15.1M	if (!isLegalUTF8(source, extraBytesToRead + 1))
407	82	{
408	82	result = sourceIllegal;
409	82	break;
410	82	}
411
412		/*
413		* The cases all fall through. See "Note A" below.
414		*/
415	15.1M	switch (extraBytesToRead)
416	15.1M	{
417	0	case 5:
418	0	ch += *source++;
419	0	ch <<= 6; /* remember, illegal UTF-8 */
420		/* fallthrough */
421	0	WINPR_FALLTHROUGH
422
423	0	case 4:
424	0	ch += *source++;
425	0	ch <<= 6; /* remember, illegal UTF-8 */
426		/* fallthrough */
427	0	WINPR_FALLTHROUGH
428
429	792	case 3:
430	792	ch += *source++;
431	792	ch <<= 6;
432		/* fallthrough */
433	792	WINPR_FALLTHROUGH
434
435	1.59k	case 2:
436	1.59k	ch += *source++;
437	1.59k	ch <<= 6;
438		/* fallthrough */
439	1.59k	WINPR_FALLTHROUGH
440
441	1.89k	case 1:
442	1.89k	ch += *source++;
443	1.89k	ch <<= 6;
444		/* fallthrough */
445	1.89k	WINPR_FALLTHROUGH
446
447	15.1M	case 0:
448	15.1M	ch += *source++;
449	15.1M	}
450
451	15.1M	ch -= offsetsFromUTF8[extraBytesToRead];
452
453	15.1M	if ((target >= targetEnd) && (!computeLength))
454	0	{
455	0	source -= (extraBytesToRead + 1); /* Back up source pointer! */
456	0	result = targetExhausted;
457	0	break;
458	0	}
459
460	15.1M	if (ch <= UNI_MAX_BMP)
461	15.1M	{
462		/* Target is a character <= 0xFFFF */
463		/* UTF-16 surrogate values are illegal in UTF-32 */
464	15.1M	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
465	0	{
466	0	if (flags == strictConversion)
467	0	{
468	0	source -= (extraBytesToRead + 1); /* return to the illegal value itself */
469	0	result = sourceIllegal;
470	0	break;
471	0	}
472	0	else
473	0	{
474	0	if (!computeLength)
475	0	*target++ = UNI_REPLACEMENT_CHAR;
476	0	else
477	0	target++;
478	0	}
479	0	}
480	15.1M	else
481	15.1M	{
482	15.1M	if (!computeLength)
483	11.8M	target++ = (uint16_t)ch; / normal case */
484	3.29M	else
485	3.29M	target++;
486	15.1M	}
487	15.1M	}
488	792	else if (ch > UNI_MAX_UTF16)
489	0	{
490	0	if (flags == strictConversion)
491	0	{
492	0	result = sourceIllegal;
493	0	source -= (extraBytesToRead + 1); /* return to the start */
494	0	break; /* Bail out; shouldn't continue */
495	0	}
496	0	else
497	0	{
498	0	if (!computeLength)
499	0	*target++ = UNI_REPLACEMENT_CHAR;
500	0	else
501	0	target++;
502	0	}
503	0	}
504	792	else
505	792	{
506		/* target is a character in range 0xFFFF - 0x10FFFF. */
507	792	if ((target + 1 >= targetEnd) && (!computeLength))
508	0	{
509	0	source -= (extraBytesToRead + 1); /* Back up source pointer! */
510	0	result = targetExhausted;
511	0	break;
512	0	}
513
514	792	ch -= halfBase;
515
516	792	if (!computeLength)
517	391	{
518	391	*target++ = (uint16_t)((ch >> halfShift) + UNI_SUR_HIGH_START);
519	391	*target++ = (uint16_t)((ch & halfMask) + UNI_SUR_LOW_START);
520	391	}
521	401	else
522	401	{
523	401	target++;
524	401	target++;
525	401	}
526	792	}
527	15.1M	}
528
529	453k	*sourceStart = source;
530	453k	*targetStart = target;
531	453k	return result;
532	453k	}
533
534		/**
535		* WinPR built-in Unicode API
536		*/
537
538		static int winpr_ConvertUTF8toUTF16(const uint8_t* src, int cchSrc, uint16_t* dst, int cchDst)
539	453k	{
540	453k	size_t length = 0;
541	453k	uint16_t* dstBeg = NULL;
542	453k	uint16_t* dstEnd = NULL;
543	453k	const uint8_t* srcBeg = NULL;
544	453k	const uint8_t* srcEnd = NULL;
545	453k	ConversionResult result = sourceIllegal;
546
547	453k	if (cchSrc == -1)
548	0	cchSrc = (int)strnlen((const char*)src, INT32_MAX - 1) + 1;
549
550	453k	srcBeg = src;
551	453k	srcEnd = &src[cchSrc];
552
553	453k	if (cchDst == 0)
554	1.03k	{
555	1.03k	result =
556	1.03k	winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
557
558	1.03k	length = dstBeg - (uint16_t*)NULL;
559	1.03k	}
560	452k	else
561	452k	{
562	452k	dstBeg = dst;
563	452k	dstEnd = &dst[cchDst];
564
565	452k	result =
566	452k	winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
567
568	452k	length = dstBeg - dst;
569	452k	}
570
571	453k	if (result == targetExhausted)
572	0	{
573	0	SetLastError(ERROR_INSUFFICIENT_BUFFER);
574	0	return 0;
575	0	}
576
577	453k	return (result == conversionOK) ? WINPR_ASSERTING_INT_CAST(int, length) : 0;
578	453k	}
579
580		static int winpr_ConvertUTF16toUTF8(const uint16_t* src, int cchSrc, uint8_t* dst, int cchDst)
581	479k	{
582	479k	size_t length = 0;
583	479k	uint8_t* dstBeg = NULL;
584	479k	uint8_t* dstEnd = NULL;
585	479k	const uint16_t* srcBeg = NULL;
586	479k	const uint16_t* srcEnd = NULL;
587	479k	ConversionResult result = sourceIllegal;
588
589	479k	if (cchSrc == -1)
590	0	cchSrc = (int)_wcsnlen((const WCHAR*)src, INT32_MAX - 1) + 1;
591
592	479k	srcBeg = src;
593	479k	srcEnd = &src[cchSrc];
594
595	479k	if (cchDst == 0)
596	57.7k	{
597	57.7k	result =
598	57.7k	winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
599
600	57.7k	length = dstBeg - ((uint8_t*)NULL);
601	57.7k	}
602	422k	else
603	422k	{
604	422k	dstBeg = dst;
605	422k	dstEnd = &dst[cchDst];
606
607	422k	result =
608	422k	winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
609
610	422k	length = dstBeg - dst;
611	422k	}
612
613	479k	if (result == targetExhausted)
614	6.38k	{
615	6.38k	SetLastError(ERROR_INSUFFICIENT_BUFFER);
616	6.38k	return 0;
617	6.38k	}
618
619	473k	return (result == conversionOK) ? WINPR_ASSERTING_INT_CAST(int, length) : 0;
620	473k	}
621
622		/* --------------------------------------------------------------------- */
623
624		int int_MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, int cbMultiByte,
625		LPWSTR lpWideCharStr, int cchWideChar)
626	453k	{
627	453k	size_t cbCharLen = (size_t)cbMultiByte;
628
629	453k	WINPR_UNUSED(dwFlags);
630
631		/* If cbMultiByte is 0, the function fails */
632	453k	if ((cbMultiByte == 0) \|\| (cbMultiByte < -1))
633	0	return 0;
634
635	453k	if (cchWideChar < 0)
636	0	return -1;
637
638	453k	if (cbMultiByte < 0)
639	0	{
640	0	const size_t len = strlen(lpMultiByteStr);
641	0	if (len >= INT32_MAX)
642	0	return 0;
643	0	cbCharLen = (int)len + 1;
644	0	}
645	453k	else
646	453k	cbCharLen = cbMultiByte;
647
648	453k	WINPR_ASSERT(lpMultiByteStr);
649	453k	switch (CodePage)
650	453k	{
651	0	case CP_ACP:
652	453k	case CP_UTF8:
653	453k	break;
654
655	0	default:
656	0	WLog_ERR(TAG, "Unsupported encoding %u", CodePage);
657	0	return 0;
658	453k	}
659
660	453k	return winpr_ConvertUTF8toUTF16((const uint8_t*)lpMultiByteStr,
661	453k	WINPR_ASSERTING_INT_CAST(int, cbCharLen),
662	0	(uint16_t*)lpWideCharStr, cchWideChar);
663	453k	}
664
665		int int_WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar,
666		LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar,
667		LPBOOL lpUsedDefaultChar)
668	479k	{
669	479k	size_t cbCharLen = (size_t)cchWideChar;
670
671	479k	WINPR_UNUSED(dwFlags);
672		/* If cchWideChar is 0, the function fails */
673	479k	if ((cchWideChar == 0) \|\| (cchWideChar < -1))
674	0	return 0;
675
676	479k	if (cbMultiByte < 0)
677	0	return -1;
678
679	479k	WINPR_ASSERT(lpWideCharStr);
680		/* If cchWideChar is -1, the string is null-terminated */
681	479k	if (cchWideChar == -1)
682	0	{
683	0	const size_t len = _wcslen(lpWideCharStr);
684	0	if (len >= INT32_MAX)
685	0	return 0;
686	0	cbCharLen = (int)len + 1;
687	0	}
688	479k	else
689	479k	cbCharLen = cchWideChar;
690
691		/*
692		* if cbMultiByte is 0, the function returns the required buffer size
693		* in bytes for lpMultiByteStr and makes no use of the output parameter itself.
694		*/
695
696	479k	return winpr_ConvertUTF16toUTF8((const uint16_t*)lpWideCharStr,
697	479k	WINPR_ASSERTING_INT_CAST(int, cbCharLen),
698	0	(uint8_t*)lpMultiByteStr, cbMultiByte);
699	479k	}