/src/FreeRDP/winpr/libwinpr/crt/unicode.c

Source (jump to first uncovered line)
/**
 * WinPR: Windows Portable Runtime
 * Unicode Conversion (CRT)
 *
 * Copyright 2012 Marc-Andre Moreau <marcandre.moreau@gmail.com>
 * Copyright 2022 Armin Novak <anovak@thincast.com>
 * Copyright 2022 Thincast Technologies GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <winpr/config.h>
#include <winpr/assert.h>

#include <errno.h>
#include <wctype.h>

#include <winpr/crt.h>
#include <winpr/error.h>
#include <winpr/print.h>

#ifndef MIN
/* Smaller of two values.
 *
 * The comparison and the whole expansion are parenthesized so the macro keeps
 * its meaning when embedded in a larger expression or placed under a cast,
 * e.g. (int)MIN(x, y) or 2 * MIN(x, y).
 *
 * Each argument may be evaluated twice: do not pass expressions with side
 * effects.
 */
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#endif

#ifndef _WIN32

#include "unicode.h"

#include "../log.h"
#define TAG WINPR_TAG("unicode")

/**
 * Notes on cross-platform Unicode portability:
 *
 * Unicode has many possible Unicode Transformation Format (UTF) encodings,
 * where some of the most commonly used are UTF-8, UTF-16 and sometimes UTF-32.
 *
 * The number in the UTF encoding name (8, 16, 32) refers to the number of bits
 * per code unit. A code unit is the minimal bit combination that can represent
 * a unit of encoded text in the given encoding. For instance, UTF-8 encodes
 * the English alphabet using 8 bits (or one byte) each, just like in ASCII.
 *
 * However, the total number of code points (values in the Unicode codespace)
 * only fits completely within 32 bits. This means that for UTF-8 and UTF-16,
 * more than one code unit may be required to fully encode a specific value.
 * UTF-8 and UTF-16 are variable-width encodings, while UTF-32 is fixed-width.
 *
 * UTF-8 has the advantage of being backwards compatible with ASCII, and is
 * one of the most commonly used Unicode encodings.
 *
 * UTF-16 is used everywhere in the Windows API. The strategy employed by
 * Microsoft to provide backwards compatibility in their API was to create
 * an ANSI and a Unicode version of the same function, ending with A (ANSI)
 * and W (Wide character, or UTF-16 Unicode). In headers, the original
 * function name is replaced by a macro that defines to either the ANSI
 * or Unicode version based on the definition of the _UNICODE macro.
 *
 * UTF-32 has the advantage of being fixed width, but wastes a lot of space
 * for English text (4x more than UTF-8, 2x more than UTF-16).
 *
 * In C, wide character strings are often defined with the wchar_t type.
 * Many functions are provided to deal with those wide character strings,
 * such as wcslen (strlen equivalent) or wprintf (printf equivalent).
 *
 * This may lead to some confusion, since many of these functions exist
 * on both Windows and Linux, but they are *not* the same!
 *
 * This sample hello world is a good example:
 *
 * #include <wchar.h>
 *
 * wchar_t hello[] = L"Hello, World!\n";
 *
 * int main(int argc, char** argv)
 * {
 *  wprintf(hello);
 *  wprintf(L"sizeof(wchar_t): %zu\n", sizeof(wchar_t));
 *  return 0;
 * }
 *
 * There is a reason why the sample prints the size of the wchar_t type:
 * On Windows, wchar_t is two bytes (UTF-16), while on most other systems
 * it is 4 bytes (UTF-32). This means that if you write code on Windows,
 * use L"" to define a string which is meant to be UTF-16 and not UTF-32,
 * you will have a little surprise when trying to port your code to Linux.
 *
 * Since the Windows API uses UTF-16, not UTF-32, WinPR defines the WCHAR
 * type to always be 2-bytes long and uses it instead of wchar_t. Do not
 * ever use wchar_t with WinPR unless you know what you are doing.
 *
 * As for L"", it is unfortunately unusable in a portable way, unless a
 * special option is passed to GCC to define wchar_t as being two bytes.
 * For string constants that must be UTF-16, it is a pain, but they can
 * be defined in a portable way like this:
 *
 * WCHAR hello[] = { 'H','e','l','l','o','\0' };
 *
 * Such strings cannot be passed to native functions like wcslen(), which
 * may expect a different wchar_t size. For this reason, WinPR provides
 * _wcslen, which expects UTF-16 WCHAR strings on all platforms.
 *
 */

/** \deprecated We no longer export this function, see ConvertUtf8ToWChar family of functions for a
 * replacement
 *
 * Conversion to Unicode (UTF-16)
 * MultiByteToWideChar: http://msdn.microsoft.com/en-us/library/windows/desktop/dd319072/
 *
 * cbMultiByte is an input size in bytes (BYTE)
 * cchWideChar is an output size in wide characters (WCHAR)
 *
 * Null-terminated UTF-8 strings:
 *
 * cchWideChar *cannot* be assumed to be cbMultiByte since UTF-8 is variable-width!
 *
 * Instead, obtain the required cchWideChar output size like this:
 * cchWideChar = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR) lpMultiByteStr, -1, NULL, 0);
 *
 * A value of -1 for cbMultiByte indicates that the input string is null-terminated,
 * and the null terminator *will* be processed. The size returned by MultiByteToWideChar
 * will therefore include the null terminator. Equivalent behavior can be obtained by
 * computing the length in bytes of the input buffer, including the null terminator:
 *
 * cbMultiByte = strlen((char*) lpMultiByteStr) + 1;
 *
 * An output buffer of the proper size can then be allocated:
 *
 * lpWideCharStr = (LPWSTR) malloc(cchWideChar * sizeof(WCHAR));
 *
 * Since cchWideChar is an output size in wide characters, the actual buffer size is:
 * (cchWideChar * sizeof(WCHAR)) or (cchWideChar * 2)
 *
 * Finally, perform the conversion:
 *
 * cchWideChar = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR) lpMultiByteStr, -1, lpWideCharStr,
 * cchWideChar);
 *
 * The value returned by MultiByteToWideChar corresponds to the number of wide characters written
 * to the output buffer, and should match the value obtained on the first call to
 * MultiByteToWideChar.
 *
 */

#if !defined(WITH_WINPR_DEPRECATED)
static
#endif
    int
    MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, int cbMultiByte,
                        LPWSTR lpWideCharStr, int cchWideChar)
{
  /* Pure forwarder: every argument is handed to int_MultiByteToWideChar
   * unchanged and its result is returned as-is. The function has internal
   * linkage unless WITH_WINPR_DEPRECATED is defined. */
  const int converted = int_MultiByteToWideChar(CodePage, dwFlags, lpMultiByteStr, cbMultiByte,
                                                lpWideCharStr, cchWideChar);
  return converted;
}

/** \deprecated We no longer export this function, see ConvertWCharToUtf8 family of functions for a
 * replacement
 *
 * Conversion from Unicode (UTF-16)
 * WideCharToMultiByte: http://msdn.microsoft.com/en-us/library/windows/desktop/dd374130/
 *
 * cchWideChar is an input size in wide characters (WCHAR)
 * cbMultiByte is an output size in bytes (BYTE)
 *
 * Null-terminated UTF-16 strings:
 *
 * cbMultiByte *cannot* be assumed to be cchWideChar since UTF-8 is variable-width!
 *
 * Instead, obtain the required cbMultiByte output size like this:
 * cbMultiByte = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) lpWideCharStr, -1, NULL, 0, NULL, NULL);
 *
 * A value of -1 for cchWideChar indicates that the input string is null-terminated,
 * and the null terminator *will* be processed. The size returned by WideCharToMultiByte
 * will therefore include the null terminator. Equivalent behavior can be obtained by
 * computing the length in wide characters of the input buffer, including the null terminator:
 *
 * cchWideChar = _wcslen((WCHAR*) lpWideCharStr) + 1;
 *
 * An output buffer of the proper size can then be allocated:
 * lpMultiByteStr = (LPSTR) malloc(cbMultiByte);
 *
 * Since cbMultiByte is an output size in bytes, it is the same as the buffer size
 *
 * Finally, perform the conversion:
 *
 * cbMultiByte = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) lpWideCharStr, -1, lpMultiByteStr,
 * cbMultiByte, NULL, NULL);
 *
 * The value returned by WideCharToMultiByte corresponds to the number of bytes written
 * to the output buffer, and should match the value obtained on the first call to
 * WideCharToMultiByte.
 *
 */

#if !defined(WITH_WINPR_DEPRECATED)
static
#endif
    int
    WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar,
                        LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar,
                        LPBOOL lpUsedDefaultChar)
{
  /* Pure forwarder: every argument is handed to int_WideCharToMultiByte
   * unchanged and its result is returned as-is. The function has internal
   * linkage unless WITH_WINPR_DEPRECATED is defined. */
  const int produced =
      int_WideCharToMultiByte(CodePage, dwFlags, lpWideCharStr, cchWideChar, lpMultiByteStr,
                              cbMultiByte, lpDefaultChar, lpUsedDefaultChar);
  return produced;
}

#endif

/**
 * ConvertToUnicode is a convenience wrapper for MultiByteToWideChar:
 *
 * If the lpWideCharStr parameter for the converted string points to NULL
 * or if the cchWideChar parameter is set to 0 this function will automatically
 * allocate the required memory which is guaranteed to be null-terminated
 * after the conversion, even if the source c string isn't.
 *
 * If the cbMultiByte parameter is set to -1 the passed lpMultiByteStr must
 * be null-terminated and the required length for the converted string will be
 * calculated accordingly.
 */
#if defined(WITH_WINPR_DEPRECATED)
/* Returns the value reported by MultiByteToWideChar for the final conversion.
 * Returns 0 when either pointer argument is NULL, when the (queried or given)
 * output size is below 1, when the allocation fails, or when this function
 * allocated the buffer and the conversion did not produce exactly cchWideChar
 * units (the buffer is then freed and *lpWideCharStr reset to NULL).
 * A buffer allocated here is owned by the caller and must be released with
 * free(). */
int ConvertToUnicode(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, int cbMultiByte,
                     LPWSTR* lpWideCharStr, int cchWideChar)
{
  int status = 0;
  BOOL allocate = FALSE;

  if (!lpMultiByteStr)
    return 0;

  if (!lpWideCharStr)
    return 0;

  if (cbMultiByte == -1)
  {
    /* Bounded scan so that length + terminator always fits into an int. */
    size_t len = strnlen(lpMultiByteStr, INT_MAX);
    if (len >= INT_MAX)
      return 0;
    cbMultiByte = (int)(len + 1);
  }

  if (cchWideChar == 0)
  {
    /* No size given: ask the converter how many units are required. */
    cchWideChar = MultiByteToWideChar(CodePage, dwFlags, lpMultiByteStr, cbMultiByte, NULL, 0);
    allocate = TRUE;
  }
  else if (!(*lpWideCharStr))
    allocate = TRUE;

  if (cchWideChar < 1)
    return 0;

  if (allocate)
  {
    /* One extra zeroed unit keeps the result null-terminated even when the
     * input is not. The sum is computed in size_t: cchWideChar + 1 as int
     * would overflow for cchWideChar == INT_MAX. */
    *lpWideCharStr = (LPWSTR)calloc((size_t)cchWideChar + 1, sizeof(WCHAR));

    if (!(*lpWideCharStr))
      return 0;
  }

  status = MultiByteToWideChar(CodePage, dwFlags, lpMultiByteStr, cbMultiByte, *lpWideCharStr,
                               cchWideChar);

  if (status != cchWideChar)
  {
    /* Only a buffer allocated here is discarded; a caller supplied buffer is
     * left alone and the raw status is returned. */
    if (allocate)
    {
      free(*lpWideCharStr);
      *lpWideCharStr = NULL;
      status = 0;
    }
  }

  return status;
}
#endif

/**
 * ConvertFromUnicode is a convenience wrapper for WideCharToMultiByte:
 *
 * If the lpMultiByteStr parameter for the converted string points to NULL
 * or if the cbMultiByte parameter is set to 0 this function will automatically
 * allocate the required memory which is guaranteed to be null-terminated
 * after the conversion, even if the source unicode string isn't.
 *
 * If the cchWideChar parameter is set to -1 the passed lpWideCharStr must
 * be null-terminated and the required length for the converted string will be
 * calculated accordingly.
 */
#if defined(WITH_WINPR_DEPRECATED)
/* Returns the value reported by WideCharToMultiByte for the final conversion.
 * Returns 0 when either pointer argument is NULL, when a null-terminated
 * input is too long to be described by an int, when the (queried or given)
 * output size is below 1, when the allocation fails, or when this function
 * allocated the buffer and the conversion did not produce exactly cbMultiByte
 * bytes (the buffer is then freed and *lpMultiByteStr reset to NULL).
 * A buffer allocated here is owned by the caller and must be released with
 * free(). */
int ConvertFromUnicode(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar,
                       LPSTR* lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar,
                       LPBOOL lpUsedDefaultChar)
{
  int status = 0;
  BOOL allocate = FALSE;

  if (!lpWideCharStr)
    return 0;

  if (!lpMultiByteStr)
    return 0;

  if (cchWideChar == -1)
  {
    /* Bounded scan so that length + terminator always fits into an int,
     * mirroring the handling of cbMultiByte == -1 in ConvertToUnicode. */
    const size_t len = _wcsnlen(lpWideCharStr, INT_MAX);
    if (len >= INT_MAX)
      return 0;
    cchWideChar = (int)(len + 1);
  }

  if (cbMultiByte == 0)
  {
    /* No size given: ask the converter how many bytes are required. */
    cbMultiByte =
        WideCharToMultiByte(CodePage, dwFlags, lpWideCharStr, cchWideChar, NULL, 0, NULL, NULL);
    allocate = TRUE;
  }
  else if (!(*lpMultiByteStr))
    allocate = TRUE;

  if (cbMultiByte < 1)
    return 0;

  if (allocate)
  {
    /* One extra zeroed byte keeps the result null-terminated even when the
     * input is not. The sum is computed in size_t: cbMultiByte + 1 as int
     * would overflow for cbMultiByte == INT_MAX. */
    *lpMultiByteStr = (LPSTR)calloc(1, (size_t)cbMultiByte + 1);

    if (!(*lpMultiByteStr))
      return 0;
  }

  status = WideCharToMultiByte(CodePage, dwFlags, lpWideCharStr, cchWideChar, *lpMultiByteStr,
                               cbMultiByte, lpDefaultChar, lpUsedDefaultChar);

  /* A size mismatch counts as a failure only for buffers allocated here. */
  if ((status != cbMultiByte) && allocate)
  {
    status = 0;
  }

  if ((status <= 0) && allocate)
  {
    free(*lpMultiByteStr);
    *lpMultiByteStr = NULL;
  }

  return status;
}
#endif

/**
 * Swap Unicode byte order (UTF16LE <-> UTF16BE)
 */

const WCHAR* ByteSwapUnicode(WCHAR* wstr, size_t length)
{
  /* A NULL buffer is only acceptable together with a zero length. */
  WINPR_ASSERT(wstr || (length == 0));

  /* Swap the two bytes of each of the first `length` units in place. */
  size_t pos = 0;
  while (pos < length)
  {
    wstr[pos] = _byteswap_ushort(wstr[pos]);
    pos++;
  }

  /* The same buffer is handed back to the caller. */
  return wstr;
}

SSIZE_T ConvertWCharToUtf8(const WCHAR* wstr, char* str, size_t len)
{
  if (wstr)
  {
    /* Hand the terminator along with the text: length + 1 units. */
    return ConvertWCharNToUtf8(wstr, _wcslen(wstr) + 1, str, len);
  }

  /* No input string: emit an empty string if a destination with room for at
   * least one byte was supplied, and report zero length. */
  if (str && (len > 0))
    str[0] = 0;
  return 0;
}

SSIZE_T ConvertWCharNToUtf8(const WCHAR* wstr, size_t wlen, char* str, size_t len)
{
  /* Nothing to convert. */
  if (wlen == 0)
    return 0;

  WINPR_ASSERT(wstr);
  size_t units = _wcsnlen(wstr, wlen);

  /* The converter takes int sizes; refuse inputs that cannot be described. */
  if (wlen > INT32_MAX)
  {
    SetLastError(ERROR_INVALID_PARAMETER);
    return -1;
  }

  /* A terminator found inside the first wlen units is converted too. */
  const BOOL terminated = (units < wlen) ? TRUE : FALSE;
  if (terminated)
    units++;

  /* Clamp the output capacity to what fits into an int. */
  const int capacity = (len > INT32_MAX) ? INT32_MAX : (int)len;
  const int rc = WideCharToMultiByte(CP_UTF8, 0, wstr, (int)units, str, capacity, NULL, NULL);

  if (rc <= 0)
    return -1;
  if ((len > 0) && ((size_t)rc > len))
    return -1;

  if (!terminated)
  {
    /* Input carried no terminator: append one if the buffer has room. */
    if (str && ((size_t)rc < len))
      str[rc] = '\0';
    return rc;
  }

  /* Buffer filled completely but its last byte is not a terminator:
   * report the full count. */
  if (((size_t)rc == len) && str && (str[rc - 1] != '\0'))
    return rc;

  /* Otherwise the converted terminator is not part of the reported length. */
  return rc - 1;
}

SSIZE_T ConvertMszWCharNToUtf8(const WCHAR* wstr, size_t wlen, char* str, size_t len)
{
  /* Nothing to convert. */
  if (wlen == 0)
    return 0;

  WINPR_ASSERT(wstr);

  /* The converter takes int sizes; refuse inputs that cannot be described. */
  if (wlen > INT32_MAX)
  {
    SetLastError(ERROR_INVALID_PARAMETER);
    return -1;
  }

  /* Clamp the output capacity to what fits into an int. */
  const int capacity = (len > INT32_MAX) ? INT32_MAX : (int)len;

  /* All wlen units are converted, embedded terminators included. */
  const int rc = WideCharToMultiByte(CP_UTF8, 0, wstr, (int)wlen, str, capacity, NULL, NULL);

  if (rc <= 0)
    return -1;
  if ((len > 0) && (rc > capacity))
    return -1;

  return rc;
}

SSIZE_T ConvertUtf8ToWChar(const char* str, WCHAR* wstr, size_t wlen)
{
  if (str)
  {
    /* Hand the terminator along with the text: length + 1 bytes. */
    return ConvertUtf8NToWChar(str, strlen(str) + 1, wstr, wlen);
  }

  /* No input string: emit an empty string if a destination with room for at
   * least one unit was supplied, and report zero length. */
  if (wstr && (wlen > 0))
    wstr[0] = 0;
  return 0;
}

/* Convert at most len bytes of UTF-8 from str into the WCHAR buffer wstr,
 * which has room for wlen units.
 *
 * Returns 0 when len is 0 and -1 on failure (ERROR_INVALID_PARAMETER is set
 * when len exceeds INT32_MAX). Otherwise returns the number of units
 * produced; a terminator that was converted along with the text is not
 * counted, except when the output buffer was filled completely and its last
 * unit is not a terminator.
 */
SSIZE_T ConvertUtf8NToWChar(const char* str, size_t len, WCHAR* wstr, size_t wlen)
{
  BOOL isNullTerminated = FALSE;
  if (len == 0)
    return 0;

  WINPR_ASSERT(str);

  if (len > INT32_MAX)
  {
    SetLastError(ERROR_INVALID_PARAMETER);
    return -1;
  }

  /* Scan the input only after the arguments have been validated, so a NULL
   * str is never passed to strnlen. */
  size_t ilen = strnlen(str, len);
  if (ilen < len)
  {
    /* A terminator lies inside the first len bytes: convert it as well. */
    isNullTerminated = TRUE;
    ilen++;
  }

  /* Clamp the output capacity to what fits into an int. */
  const int iwlen = MIN(INT32_MAX, wlen);
  const int rc = MultiByteToWideChar(CP_UTF8, 0, str, (int)ilen, wstr, (int)iwlen);
  if ((rc <= 0) || ((wlen > 0) && (rc > iwlen)))
    return -1;
  if (!isNullTerminated)
  {
    /* Input carried no terminator: append one if the buffer has room. */
    if (wstr && (rc < iwlen))
      wstr[rc] = '\0';
    return rc;
  }
  else if (rc == iwlen)
  {
    /* Buffer filled completely but its last unit is not a terminator:
     * report the full count. */
    if (wstr && (wstr[rc - 1] != '\0'))
      return rc;
  }
  /* The converted terminator is not part of the reported length. */
  return rc - 1;
}

SSIZE_T ConvertMszUtf8NToWChar(const char* str, size_t len, WCHAR* wstr, size_t wlen)
{
  /* Nothing to convert. */
  if (len == 0)
    return 0;

  WINPR_ASSERT(str);

  /* The converter takes int sizes; refuse inputs that cannot be described. */
  if (len > INT32_MAX)
  {
    SetLastError(ERROR_INVALID_PARAMETER);
    return -1;
  }

  /* Clamp the output capacity to what fits into an int. */
  const int capacity = (wlen > INT32_MAX) ? INT32_MAX : (int)wlen;

  /* All len bytes are converted, embedded terminators included. */
  const int rc = MultiByteToWideChar(CP_UTF8, 0, str, (int)len, wstr, capacity);

  if (rc <= 0)
    return -1;
  if ((wlen > 0) && (rc > capacity))
    return -1;

  return rc;
}

char* ConvertWCharToUtf8Alloc(const WCHAR* wstr, size_t* pUtfCharLength)
{
  /* Pass 1: query the required length without an output buffer. */
  const SSIZE_T required = ConvertWCharToUtf8(wstr, NULL, 0);

  /* The optional length output reads 0 on every failure path. */
  if (pUtfCharLength)
    *pUtfCharLength = 0;
  if (required < 0)
    return NULL;

  /* One extra zeroed byte for the terminator. */
  const size_t capacity = (size_t)required + 1ull;
  char* utf8 = calloc(capacity, sizeof(char));
  if (!utf8)
    return NULL;

  /* Pass 2: convert into the freshly allocated buffer. */
  const SSIZE_T written = ConvertWCharToUtf8(wstr, utf8, capacity);
  if (written >= 0)
  {
    WINPR_ASSERT(required == written);
    if (pUtfCharLength)
      *pUtfCharLength = (size_t)written;
    return utf8; /* caller releases with free() */
  }

  free(utf8);
  return NULL;
}

char* ConvertWCharNToUtf8Alloc(const WCHAR* wstr, size_t wlen, size_t* pUtfCharLength)
{
  /* Pass 1: query the required length without an output buffer. */
  const SSIZE_T required = ConvertWCharNToUtf8(wstr, wlen, NULL, 0);

  /* The optional length output reads 0 on every failure path. */
  if (pUtfCharLength)
    *pUtfCharLength = 0;
  if (required < 0)
    return NULL;

  /* One extra zeroed byte for the terminator. */
  const size_t capacity = (size_t)required + 1ull;
  char* utf8 = calloc(capacity, sizeof(char));
  if (!utf8)
    return NULL;

  /* Pass 2: convert into the freshly allocated buffer. */
  const SSIZE_T written = ConvertWCharNToUtf8(wstr, wlen, utf8, capacity);
  if (written >= 0)
  {
    WINPR_ASSERT(required == written);
    if (pUtfCharLength)
      *pUtfCharLength = (size_t)written;
    return utf8; /* caller releases with free() */
  }

  free(utf8);
  return NULL;
}

char* ConvertMszWCharNToUtf8Alloc(const WCHAR* wstr, size_t wlen, size_t* pUtfCharLength)
{
  /* Pass 1: query the required length without an output buffer. */
  const SSIZE_T required = ConvertMszWCharNToUtf8(wstr, wlen, NULL, 0);

  /* The optional length output reads 0 on every failure path. */
  if (pUtfCharLength)
    *pUtfCharLength = 0;
  if (required < 0)
    return NULL;

  /* One extra zeroed byte beyond the converted data. */
  const size_t capacity = (size_t)required + 1ull;
  char* utf8 = calloc(capacity, sizeof(char));
  if (!utf8)
    return NULL;

  /* Pass 2: convert into the freshly allocated buffer. */
  const SSIZE_T written = ConvertMszWCharNToUtf8(wstr, wlen, utf8, capacity);
  if (written >= 0)
  {
    WINPR_ASSERT(required == written);
    if (pUtfCharLength)
      *pUtfCharLength = (size_t)written;
    return utf8; /* caller releases with free() */
  }

  free(utf8);
  return NULL;
}

WCHAR* ConvertUtf8ToWCharAlloc(const char* str, size_t* pSize)
{
  /* Pass 1: query the required length without an output buffer. */
  const SSIZE_T required = ConvertUtf8ToWChar(str, NULL, 0);

  /* The optional size output reads 0 on every failure path. */
  if (pSize)
    *pSize = 0;
  if (required < 0)
    return NULL;

  /* One extra zeroed unit for the terminator. */
  const size_t capacity = (size_t)required + 1ull;
  WCHAR* wide = calloc(capacity, sizeof(WCHAR));
  if (!wide)
    return NULL;

  /* Pass 2: convert into the freshly allocated buffer. */
  const SSIZE_T written = ConvertUtf8ToWChar(str, wide, capacity);
  if (written >= 0)
  {
    WINPR_ASSERT(required == written);
    if (pSize)
      *pSize = (size_t)written;
    return wide; /* caller releases with free() */
  }

  free(wide);
  return NULL;
}

WCHAR* ConvertUtf8NToWCharAlloc(const char* str, size_t len, size_t* pSize)
{
  /* Pass 1: query the required length without an output buffer. */
  const SSIZE_T required = ConvertUtf8NToWChar(str, len, NULL, 0);

  /* The optional size output reads 0 on every failure path. */
  if (pSize)
    *pSize = 0;
  if (required < 0)
    return NULL;

  /* One extra zeroed unit for the terminator. */
  const size_t capacity = (size_t)required + 1ull;
  WCHAR* wide = calloc(capacity, sizeof(WCHAR));
  if (!wide)
    return NULL;

  /* Pass 2: convert into the freshly allocated buffer. */
  const SSIZE_T written = ConvertUtf8NToWChar(str, len, wide, capacity);
  if (written >= 0)
  {
    WINPR_ASSERT(required == written);
    if (pSize)
      *pSize = (size_t)written;
    return wide; /* caller releases with free() */
  }

  free(wide);
  return NULL;
}

WCHAR* ConvertMszUtf8NToWCharAlloc(const char* str, size_t len, size_t* pSize)
{
  /* Pass 1: query the required length without an output buffer. */
  const SSIZE_T required = ConvertMszUtf8NToWChar(str, len, NULL, 0);

  /* The optional size output reads 0 on every failure path. */
  if (pSize)
    *pSize = 0;
  if (required < 0)
    return NULL;

  /* One extra zeroed unit beyond the converted data. */
  const size_t capacity = (size_t)required + 1ull;
  WCHAR* wide = calloc(capacity, sizeof(WCHAR));
  if (!wide)
    return NULL;

  /* Pass 2: convert into the freshly allocated buffer. */
  const SSIZE_T written = ConvertMszUtf8NToWChar(str, len, wide, capacity);
  if (written >= 0)
  {
    WINPR_ASSERT(required == written);
    if (pSize)
      *pSize = (size_t)written;
    return wide; /* caller releases with free() */
  }

  free(wide);
  return NULL;
}

Coverage Report

Created: 2024-05-20 06:11

Line	Count	Source (jump to first uncovered line)
1		/**
2		* WinPR: Windows Portable Runtime
3		* Unicode Conversion (CRT)
4		*
5		* Copyright 2012 Marc-Andre Moreau <marcandre.moreau@gmail.com>
6		* Copyright 2022 Armin Novak <anovak@thincast.com>
7		* Copyright 2022 Thincast Technologies GmbH
8		*
9		* Licensed under the Apache License, Version 2.0 (the "License");
10		* you may not use this file except in compliance with the License.
11		* You may obtain a copy of the License at
12		*
13		* http://www.apache.org/licenses/LICENSE-2.0
14		*
15		* Unless required by applicable law or agreed to in writing, software
16		* distributed under the License is distributed on an "AS IS" BASIS,
17		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18		* See the License for the specific language governing permissions and
19		* limitations under the License.
20		*/
21
22		#include <winpr/config.h>
23		#include <winpr/assert.h>
24
25		#include <errno.h>
26		#include <wctype.h>
27
28		#include <winpr/crt.h>
29		#include <winpr/error.h>
30		#include <winpr/print.h>
31
32		#ifndef MIN
33	376k	#define MIN(a, b) (a) < (b) ? (a) : (b)
34		#endif
35
36		#ifndef _WIN32
37
38		#include "unicode.h"
39
40		#include "../log.h"
41		#define TAG WINPR_TAG("unicode")
42
43		/**
44		* Notes on cross-platform Unicode portability:
45		*
46		* Unicode has many possible Unicode Transformation Format (UTF) encodings,
47		* where some of the most commonly used are UTF-8, UTF-16 and sometimes UTF-32.
48		*
49		* The number in the UTF encoding name (8, 16, 32) refers to the number of bits
50		* per code unit. A code unit is the minimal bit combination that can represent
51		* a unit of encoded text in the given encoding. For instance, UTF-8 encodes
52		* the English alphabet using 8 bits (or one byte) each, just like in ASCII.
53		*
54		* However, the total number of code points (values in the Unicode codespace)
55		* only fits completely within 32 bits. This means that for UTF-8 and UTF-16,
56		* more than one code unit may be required to fully encode a specific value.
57		* UTF-8 and UTF-16 are variable-width encodings, while UTF-32 is fixed-width.
58		*
59		* UTF-8 has the advantage of being backwards compatible with ASCII, and is
60		* one of the most commonly used Unicode encoding.
61		*
62		* UTF-16 is used everywhere in the Windows API. The strategy employed by
63		* Microsoft to provide backwards compatibility in their API was to create
64		* an ANSI and a Unicode version of the same function, ending with A (ANSI)
65		* and W (Wide character, or UTF-16 Unicode). In headers, the original
66		* function name is replaced by a macro that defines to either the ANSI
67		* or Unicode version based on the definition of the _UNICODE macro.
68		*
69		* UTF-32 has the advantage of being fixed width, but wastes a lot of space
70		* for English text (4x more than UTF-8, 2x more than UTF-16).
71		*
72		* In C, wide character strings are often defined with the wchar_t type.
73		* Many functions are provided to deal with those wide character strings,
74		* such as wcslen (strlen equivalent) or wprintf (printf equivalent).
75		*
76		* This may lead to some confusion, since many of these functions exist
77		* on both Windows and Linux, but they are not the same!
78		*
79		* This sample hello world is a good example:
80		*
81		* #include <wchar.h>
82		*
83		* wchar_t hello[] = L"Hello, World!\n";
84		*
85		* int main(int argc, char** argv)
86		* {
87		* wprintf(hello);
88		* wprintf(L"sizeof(wchar_t): %d\n", sizeof(wchar_t));
89		* return 0;
90		* }
91		*
92		* There is a reason why the sample prints the size of the wchar_t type:
93		* On Windows, wchar_t is two bytes (UTF-16), while on most other systems
94		* it is 4 bytes (UTF-32). This means that if you write code on Windows,
95		* use L"" to define a string which is meant to be UTF-16 and not UTF-32,
96		* you will have a little surprise when trying to port your code to Linux.
97		*
98		* Since the Windows API uses UTF-16, not UTF-32, WinPR defines the WCHAR
99		* type to always be 2-bytes long and uses it instead of wchar_t. Do not
100		* ever use wchar_t with WinPR unless you know what you are doing.
101		*
102		* As for L"", it is unfortunately unusable in a portable way, unless a
103		* special option is passed to GCC to define wchar_t as being two bytes.
104		* For string constants that must be UTF-16, it is a pain, but they can
105		* be defined in a portable way like this:
106		*
107		* WCHAR hello[] = { 'H','e','l','l','o','\0' };
108		*
109		* Such strings cannot be passed to native functions like wcslen(), which
110		* may expect a different wchar_t size. For this reason, WinPR provides
111		* _wcslen, which expects UTF-16 WCHAR strings on all platforms.
112		*
113		*/
114
115		/** \deprecated We no longer export this function, see ConvertUtf8ToWChar family of functions for a
116		* replacement
117		*
118		* Conversion to Unicode (UTF-16)
119		* MultiByteToWideChar: http://msdn.microsoft.com/en-us/library/windows/desktop/dd319072/
120		*
121		* cbMultiByte is an input size in bytes (BYTE)
122		* cchWideChar is an output size in wide characters (WCHAR)
123		*
124		* Null-terminated UTF-8 strings:
125		*
126		* cchWideChar cannot be assumed to be cbMultiByte since UTF-8 is variable-width!
127		*
128		* Instead, obtain the required cchWideChar output size like this:
129		* cchWideChar = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR) lpMultiByteStr, -1, NULL, 0);
130		*
131		* A value of -1 for cbMultiByte indicates that the input string is null-terminated,
132		* and the null terminator will be processed. The size returned by MultiByteToWideChar
133		* will therefore include the null terminator. Equivalent behavior can be obtained by
134		* computing the length in bytes of the input buffer, including the null terminator:
135		*
136		* cbMultiByte = strlen((char*) lpMultiByteStr) + 1;
137		*
138		* An output buffer of the proper size can then be allocated:
139		*
140		* lpWideCharStr = (LPWSTR) malloc(cchWideChar * sizeof(WCHAR));
141		*
142		* Since cchWideChar is an output size in wide characters, the actual buffer size is:
143		* (cchWideChar * sizeof(WCHAR)) or (cchWideChar * 2)
144		*
145		* Finally, perform the conversion:
146		*
147		* cchWideChar = MultiByteToWideChar(CP_UTF8, 0, (LPCSTR) lpMultiByteStr, -1, lpWideCharStr,
148		* cchWideChar);
149		*
150		* The value returned by MultiByteToWideChar corresponds to the number of wide characters written
151		* to the output buffer, and should match the value obtained on the first call to
152		* MultiByteToWideChar.
153		*
154		*/
155
156		#if !defined(WITH_WINPR_DEPRECATED)
157		static
158		#endif
159		int
160		MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, int cbMultiByte,
161		LPWSTR lpWideCharStr, int cchWideChar)
162	271k	{
163	271k	return int_MultiByteToWideChar(CodePage, dwFlags, lpMultiByteStr, cbMultiByte, lpWideCharStr,
164	271k	cchWideChar);
165	271k	}
166
167		/** \deprecated We no longer export this function, see ConvertWCharToUtf8 family of functions for a
168		* replacement
169		*
170		* Conversion from Unicode (UTF-16)
171		* WideCharToMultiByte: http://msdn.microsoft.com/en-us/library/windows/desktop/dd374130/
172		*
173		* cchWideChar is an input size in wide characters (WCHAR)
174		* cbMultiByte is an output size in bytes (BYTE)
175		*
176		* Null-terminated UTF-16 strings:
177		*
178		* cbMultiByte cannot be assumed to be cchWideChar since UTF-8 is variable-width!
179		*
180		* Instead, obtain the required cbMultiByte output size like this:
181		* cbMultiByte = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) lpWideCharStr, -1, NULL, 0, NULL, NULL);
182		*
183		* A value of -1 for cbMultiByte indicates that the input string is null-terminated,
184		* and the null terminator will be processed. The size returned by WideCharToMultiByte
185		* will therefore include the null terminator. Equivalent behavior can be obtained by
186		* computing the length in bytes of the input buffer, including the null terminator:
187		*
188		* cchWideChar = _wcslen((WCHAR*) lpWideCharStr) + 1;
189		*
190		* An output buffer of the proper size can then be allocated:
191		* lpMultiByteStr = (LPSTR) malloc(cbMultiByte);
192		*
193		* Since cbMultiByte is an output size in bytes, it is the same as the buffer size
194		*
195		* Finally, perform the conversion:
196		*
197		* cbMultiByte = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) lpWideCharStr, -1, lpMultiByteStr,
198		* cbMultiByte, NULL, NULL);
199		*
200		* The value returned by WideCharToMultiByte corresponds to the number of bytes written
201		* to the output buffer, and should match the value obtained on the first call to
202		* WideCharToMultiByte.
203		*
204		*/
205
206		#if !defined(WITH_WINPR_DEPRECATED)
207		static
208		#endif
209		int
210		WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar,
211		LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar,
212		LPBOOL lpUsedDefaultChar)
213	104k	{
214	104k	return int_WideCharToMultiByte(CodePage, dwFlags, lpWideCharStr, cchWideChar, lpMultiByteStr,
215	104k	cbMultiByte, lpDefaultChar, lpUsedDefaultChar);
216	104k	}
217
218		#endif
219
220		/**
221		* ConvertToUnicode is a convenience wrapper for MultiByteToWideChar:
222		*
223		* If the lpWideCharStr parameter for the converted string points to NULL
224		* or if the cchWideChar parameter is set to 0 this function will automatically
225		* allocate the required memory which is guaranteed to be null-terminated
226		* after the conversion, even if the source c string isn't.
227		*
228		* If the cbMultiByte parameter is set to -1 the passed lpMultiByteStr must
229		* be null-terminated and the required length for the converted string will be
230		* calculated accordingly.
231		*/
232		#if defined(WITH_WINPR_DEPRECATED)
233		int ConvertToUnicode(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, int cbMultiByte,
234		LPWSTR* lpWideCharStr, int cchWideChar)
235		{
236		int status = 0;
237		BOOL allocate = FALSE;
238
239		if (!lpMultiByteStr)
240		return 0;
241
242		if (!lpWideCharStr)
243		return 0;
244
245		if (cbMultiByte == -1)
246		{
247		size_t len = strnlen(lpMultiByteStr, INT_MAX);
248		if (len >= INT_MAX)
249		return 0;
250		cbMultiByte = (int)(len + 1);
251		}
252
253		if (cchWideChar == 0)
254		{
255		cchWideChar = MultiByteToWideChar(CodePage, dwFlags, lpMultiByteStr, cbMultiByte, NULL, 0);
256		allocate = TRUE;
257		}
258		else if (!(*lpWideCharStr))
259		allocate = TRUE;
260
261		if (cchWideChar < 1)
262		return 0;
263
264		if (allocate)
265		{
266		*lpWideCharStr = (LPWSTR)calloc(cchWideChar + 1, sizeof(WCHAR));
267
268		if (!(*lpWideCharStr))
269		{
270		// SetLastError(ERROR_INSUFFICIENT_BUFFER);
271		return 0;
272		}
273		}
274
275		status = MultiByteToWideChar(CodePage, dwFlags, lpMultiByteStr, cbMultiByte, *lpWideCharStr,
276		cchWideChar);
277
278		if (status != cchWideChar)
279		{
280		if (allocate)
281		{
282		free(*lpWideCharStr);
283		*lpWideCharStr = NULL;
284		status = 0;
285		}
286		}
287
288		return status;
289		}
290		#endif
291
292		/**
293		* ConvertFromUnicode is a convenience wrapper for WideCharToMultiByte:
294		*
295		* If the lpMultiByteStr parameter for the converted string points to NULL
296		* or if the cbMultiByte parameter is set to 0 this function will automatically
297		* allocate the required memory which is guaranteed to be null-terminated
298		* after the conversion, even if the source unicode string isn't.
299		*
300		* If the cchWideChar parameter is set to -1 the passed lpWideCharStr must
301		* be null-terminated and the required length for the converted string will be
302		* calculated accordingly.
303		*/
304		#if defined(WITH_WINPR_DEPRECATED)
305		int ConvertFromUnicode(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar,
306		LPSTR* lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar,
307		LPBOOL lpUsedDefaultChar)
308		{
309		int status = 0;
310		BOOL allocate = FALSE;
311
312		if (!lpWideCharStr)
313		return 0;
314
315		if (!lpMultiByteStr)
316		return 0;
317
318		if (cchWideChar == -1)
319		cchWideChar = (int)(_wcslen(lpWideCharStr) + 1);
320
321		if (cbMultiByte == 0)
322		{
323		cbMultiByte =
324		WideCharToMultiByte(CodePage, dwFlags, lpWideCharStr, cchWideChar, NULL, 0, NULL, NULL);
325		allocate = TRUE;
326		}
327		else if (!(*lpMultiByteStr))
328		allocate = TRUE;
329
330		if (cbMultiByte < 1)
331		return 0;
332
333		if (allocate)
334		{
335		*lpMultiByteStr = (LPSTR)calloc(1, cbMultiByte + 1);
336
337		if (!(*lpMultiByteStr))
338		{
339		// SetLastError(ERROR_INSUFFICIENT_BUFFER);
340		return 0;
341		}
342		}
343
344		status = WideCharToMultiByte(CodePage, dwFlags, lpWideCharStr, cchWideChar, *lpMultiByteStr,
345		cbMultiByte, lpDefaultChar, lpUsedDefaultChar);
346
347		if ((status != cbMultiByte) && allocate)
348		{
349		status = 0;
350		}
351
352		if ((status <= 0) && allocate)
353		{
354		free(*lpMultiByteStr);
355		*lpMultiByteStr = NULL;
356		}
357
358		return status;
359		}
360		#endif
361
362		/**
363		* Swap Unicode byte order (UTF16LE <-> UTF16BE)
364		*/
365
366		const WCHAR* ByteSwapUnicode(WCHAR* wstr, size_t length)
367	0	{
368	0	WINPR_ASSERT(wstr \|\| (length == 0));
369
370	0	for (size_t x = 0; x < length; x++)
371	0	wstr[x] = _byteswap_ushort(wstr[x]);
372	0	return wstr;
373	0	}
374
375		SSIZE_T ConvertWCharToUtf8(const WCHAR* wstr, char* str, size_t len)
376	0	{
377	0	if (!wstr)
378	0	{
379	0	if (str && len)
380	0	str[0] = 0;
381	0	return 0;
382	0	}
383
384	0	const size_t wlen = _wcslen(wstr);
385	0	return ConvertWCharNToUtf8(wstr, wlen + 1, str, len);
386	0	}
387
388		SSIZE_T ConvertWCharNToUtf8(const WCHAR* wstr, size_t wlen, char* str, size_t len)
389	104k	{
390	104k	BOOL isNullTerminated = FALSE;
391	104k	if (wlen == 0)
392	20	return 0;
393
394	104k	WINPR_ASSERT(wstr);
395	104k	size_t iwlen = _wcsnlen(wstr, wlen);
396
397	104k	if (wlen > INT32_MAX)
398	0	{
399	0	SetLastError(ERROR_INVALID_PARAMETER);
400	0	return -1;
401	0	}
402
403	104k	if (iwlen < wlen)
404	101k	{
405	101k	isNullTerminated = TRUE;
406	101k	iwlen++;
407	101k	}
408	104k	const int rc = WideCharToMultiByte(CP_UTF8, 0, wstr, (int)iwlen, str, (int)MIN(INT32_MAX, len),
409	104k	NULL, NULL);
410	104k	if ((rc <= 0) \|\| ((len > 0) && ((size_t)rc > len)))
411	6.69k	return -1;
412	98.0k	else if (!isNullTerminated)
413	3.05k	{
414	3.05k	if (str && ((size_t)rc < len))
415	2.66k	str[rc] = '\0';
416	3.05k	return rc;
417	3.05k	}
418	94.9k	else if ((size_t)rc == len)
419	45.2k	{
420	45.2k	if (str && (str[rc - 1] != '\0'))
421	0	return rc;
422	45.2k	}
423	94.9k	return rc - 1;
424	104k	}
425
426		SSIZE_T ConvertMszWCharNToUtf8(const WCHAR* wstr, size_t wlen, char* str, size_t len)
427	0	{
428	0	if (wlen == 0)
429	0	return 0;
430
431	0	WINPR_ASSERT(wstr);
432
433	0	if (wlen > INT32_MAX)
434	0	{
435	0	SetLastError(ERROR_INVALID_PARAMETER);
436	0	return -1;
437	0	}
438
439	0	const int iwlen = MIN(INT32_MAX, len);
440	0	const int rc = WideCharToMultiByte(CP_UTF8, 0, wstr, (int)wlen, str, (int)iwlen, NULL, NULL);
441	0	if ((rc <= 0) \|\| ((len > 0) && (rc > iwlen)))
442	0	return -1;
443
444	0	return rc;
445	0	}
446
447		SSIZE_T ConvertUtf8ToWChar(const char* str, WCHAR* wstr, size_t wlen)
448	271k	{
449	271k	if (!str)
450	102	{
451	102	if (wstr && wlen)
452	51	wstr[0] = 0;
453	102	return 0;
454	102	}
455
456	271k	const size_t len = strlen(str);
457	271k	return ConvertUtf8NToWChar(str, len + 1, wstr, wlen);
458	271k	}
459
460		SSIZE_T ConvertUtf8NToWChar(const char* str, size_t len, WCHAR* wstr, size_t wlen)
461	271k	{
462	271k	size_t ilen = strnlen(str, len);
463	271k	BOOL isNullTerminated = FALSE;
464	271k	if (len == 0)
465	0	return 0;
466
467	271k	WINPR_ASSERT(str);
468
469	271k	if (len > INT32_MAX)
470	0	{
471	0	SetLastError(ERROR_INVALID_PARAMETER);
472	0	return -1;
473	0	}
474	271k	if (ilen < len)
475	271k	{
476	271k	isNullTerminated = TRUE;
477	271k	ilen++;
478	271k	}
479
480	271k	const int iwlen = MIN(INT32_MAX, wlen);
481	271k	const int rc = MultiByteToWideChar(CP_UTF8, 0, str, (int)ilen, wstr, (int)iwlen);
482	271k	if ((rc <= 0) \|\| ((wlen > 0) && (rc > iwlen)))
483	72	return -1;
484	271k	if (!isNullTerminated)
485	0	{
486	0	if (wstr && (rc < iwlen))
487	0	wstr[rc] = '\0';
488	0	return rc;
489	0	}
490	271k	else if (rc == iwlen)
491	972	{
492	972	if (wstr && (wstr[rc - 1] != '\0'))
493	0	return rc;
494	972	}
495	271k	return rc - 1;
496	271k	}
497
498		SSIZE_T ConvertMszUtf8NToWChar(const char* str, size_t len, WCHAR* wstr, size_t wlen)
499	0	{
500	0	if (len == 0)
501	0	return 0;
502
503	0	WINPR_ASSERT(str);
504
505	0	if (len > INT32_MAX)
506	0	{
507	0	SetLastError(ERROR_INVALID_PARAMETER);
508	0	return -1;
509	0	}
510
511	0	const int iwlen = MIN(INT32_MAX, wlen);
512	0	const int rc = MultiByteToWideChar(CP_UTF8, 0, str, (int)len, wstr, (int)iwlen);
513	0	if ((rc <= 0) \|\| ((wlen > 0) && (rc > iwlen)))
514	0	return -1;
515
516	0	return rc;
517	0	}
518
519		char* ConvertWCharToUtf8Alloc(const WCHAR* wstr, size_t* pUtfCharLength)
520	0	{
521	0	char* tmp = NULL;
522	0	const SSIZE_T rc = ConvertWCharToUtf8(wstr, NULL, 0);
523	0	if (pUtfCharLength)
524	0	*pUtfCharLength = 0;
525	0	if (rc < 0)
526	0	return NULL;
527	0	tmp = calloc((size_t)rc + 1ull, sizeof(char));
528	0	if (!tmp)
529	0	return NULL;
530	0	const SSIZE_T rc2 = ConvertWCharToUtf8(wstr, tmp, (size_t)rc + 1ull);
531	0	if (rc2 < 0)
532	0	{
533	0	free(tmp);
534	0	return NULL;
535	0	}
536	0	WINPR_ASSERT(rc == rc2);
537	0	if (pUtfCharLength)
538	0	*pUtfCharLength = (size_t)rc2;
539	0	return tmp;
540	0	}
541
542		char* ConvertWCharNToUtf8Alloc(const WCHAR* wstr, size_t wlen, size_t* pUtfCharLength)
543	45.8k	{
544	45.8k	char* tmp = NULL;
545	45.8k	const SSIZE_T rc = ConvertWCharNToUtf8(wstr, wlen, NULL, 0);
546
547	45.8k	if (pUtfCharLength)
548	45.5k	*pUtfCharLength = 0;
549	45.8k	if (rc < 0)
550	199	return NULL;
551	45.6k	tmp = calloc((size_t)rc + 1ull, sizeof(char));
552	45.6k	if (!tmp)
553	0	return NULL;
554	45.6k	const SSIZE_T rc2 = ConvertWCharNToUtf8(wstr, wlen, tmp, (size_t)rc + 1ull);
555	45.6k	if (rc2 < 0)
556	0	{
557	0	free(tmp);
558	0	return NULL;
559	0	}
560	45.6k	WINPR_ASSERT(rc == rc2);
561	45.6k	if (pUtfCharLength)
562	45.4k	*pUtfCharLength = (size_t)rc2;
563	45.6k	return tmp;
564	45.6k	}
565
566		char* ConvertMszWCharNToUtf8Alloc(const WCHAR* wstr, size_t wlen, size_t* pUtfCharLength)
567	0	{
568	0	char* tmp = NULL;
569	0	const SSIZE_T rc = ConvertMszWCharNToUtf8(wstr, wlen, NULL, 0);
570
571	0	if (pUtfCharLength)
572	0	*pUtfCharLength = 0;
573	0	if (rc < 0)
574	0	return NULL;
575	0	tmp = calloc((size_t)rc + 1ull, sizeof(char));
576	0	if (!tmp)
577	0	return NULL;
578	0	const SSIZE_T rc2 = ConvertMszWCharNToUtf8(wstr, wlen, tmp, (size_t)rc + 1ull);
579	0	if (rc2 < 0)
580	0	{
581	0	free(tmp);
582	0	return NULL;
583	0	}
584	0	WINPR_ASSERT(rc == rc2);
585	0	if (pUtfCharLength)
586	0	*pUtfCharLength = (size_t)rc2;
587	0	return tmp;
588	0	}
589
590		WCHAR* ConvertUtf8ToWCharAlloc(const char* str, size_t* pSize)
591	1.09k	{
592	1.09k	WCHAR* tmp = NULL;
593	1.09k	const SSIZE_T rc = ConvertUtf8ToWChar(str, NULL, 0);
594	1.09k	if (pSize)
595	1.09k	*pSize = 0;
596	1.09k	if (rc < 0)
597	72	return NULL;
598	1.02k	tmp = calloc((size_t)rc + 1ull, sizeof(WCHAR));
599	1.02k	if (!tmp)
600	0	return NULL;
601	1.02k	const SSIZE_T rc2 = ConvertUtf8ToWChar(str, tmp, (size_t)rc + 1ull);
602	1.02k	if (rc2 < 0)
603	0	{
604	0	free(tmp);
605	0	return NULL;
606	0	}
607	1.02k	WINPR_ASSERT(rc == rc2);
608	1.02k	if (pSize)
609	1.02k	*pSize = (size_t)rc2;
610	1.02k	return tmp;
611	1.02k	}
612
613		WCHAR* ConvertUtf8NToWCharAlloc(const char* str, size_t len, size_t* pSize)
614	0	{
615	0	WCHAR* tmp = NULL;
616	0	const SSIZE_T rc = ConvertUtf8NToWChar(str, len, NULL, 0);
617	0	if (pSize)
618	0	*pSize = 0;
619	0	if (rc < 0)
620	0	return NULL;
621	0	tmp = calloc((size_t)rc + 1ull, sizeof(WCHAR));
622	0	if (!tmp)
623	0	return NULL;
624	0	const SSIZE_T rc2 = ConvertUtf8NToWChar(str, len, tmp, (size_t)rc + 1ull);
625	0	if (rc2 < 0)
626	0	{
627	0	free(tmp);
628	0	return NULL;
629	0	}
630	0	WINPR_ASSERT(rc == rc2);
631	0	if (pSize)
632	0	*pSize = (size_t)rc2;
633	0	return tmp;
634	0	}
635
636		WCHAR* ConvertMszUtf8NToWCharAlloc(const char* str, size_t len, size_t* pSize)
637	0	{
638	0	WCHAR* tmp = NULL;
639	0	const SSIZE_T rc = ConvertMszUtf8NToWChar(str, len, NULL, 0);
640	0	if (pSize)
641	0	*pSize = 0;
642	0	if (rc < 0)
643	0	return NULL;
644	0	tmp = calloc((size_t)rc + 1ull, sizeof(WCHAR));
645	0	if (!tmp)
646	0	return NULL;
647	0	const SSIZE_T rc2 = ConvertMszUtf8NToWChar(str, len, tmp, (size_t)rc + 1ull);
648	0	if (rc2 < 0)
649	0	{
650	0	free(tmp);
651	0	return NULL;
652	0	}
653	0	WINPR_ASSERT(rc == rc2);
654	0	if (pSize)
655	0	*pSize = (size_t)rc2;
656	0	return tmp;
657	0	}