/src/unrar/unicode.cpp

Source (jump to first uncovered line)
#include "rar.hpp"
#define MBFUNCTIONS

#if defined(_UNIX) && defined(MBFUNCTIONS)

// Helpers defined later in this file. They handle strings containing
// characters which the locale based conversion functions cannot convert,
// using the private use area mapping described below.
static bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success);
static void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success);

// In Unix we map high ASCII characters which cannot be converted to Unicode
// to 0xE000 - 0xE0FF private use Unicode area. Only bytes 0x80 - 0xFF
// are mapped by the helpers above, so the codes really in use are
// MapAreaStart+0x80 - MapAreaStart+0xFF.
static const uint MapAreaStart=0xE000;

// Mapped string marker. Initially we used 0xFFFF for this purpose,
// but it causes MSVC2008 swprintf to fail (it treats 0xFFFF as error marker).
// While we could work around it, it is safer to use another character.
// CharToWideMap places this code before the first mapped character
// and WideToCharMap restores mapped characters only if it is present.
static const uint MappedStringMark=0xFFFE;

#endif

// Convert the zero terminated wide string 'Src' to a multibyte string
// stored in 'Dest'. 'DestSize' is the size of 'Dest' in bytes including
// the terminating zero.
//
// The conversion method depends on the platform:
//  - Windows: WideCharToMultiByte with the ANSI code page;
//  - _APPLE: UTF-8 through WideToUtf, always reported as successful;
//  - MBFUNCTIONS: WideToCharMap for strings with mapped characters,
//    locale based wcsrtombs for everything else;
//  - otherwise: plain truncation of every wide character to char.
//
// Returns false if the conversion failed. 'Dest' can still contain
// the partially converted string in this case. If DestSize is not 0,
// 'Dest' is always zero terminated on return.
bool WideToChar(const wchar *Src,char *Dest,size_t DestSize)
{
  bool RetCode=true;

  // Set 'Dest' to zero just in case the conversion will fail.
  // Do not touch the buffer if it has no room even for a single byte.
  if (DestSize>0)
    *Dest=0;

#ifdef _WIN_ALL
  if (WideCharToMultiByte(CP_ACP,0,Src,-1,Dest,(int)DestSize,NULL,NULL)==0)
    RetCode=false;

// wcstombs is broken in Android NDK r9.
#elif defined(_APPLE)
  WideToUtf(Src,Dest,DestSize);

#elif defined(MBFUNCTIONS)
  // WideToCharMap returns false if the string does not contain mapped
  // characters, so we need to use the usual locale based conversion.
  if (!WideToCharMap(Src,Dest,DestSize,RetCode))
  {
    mbstate_t ps; // Use thread safe external state based functions.
    memset (&ps, 0, sizeof(ps));
    const wchar *SrcParam=Src; // wcsrtombs can change the pointer.

    // Some implementations of wcsrtombs can cause memory analyzing tools
    // like valgrind to report uninitialized data access. It happens because
    // internally these implementations call SSE4 based wcslen function,
    // which reads 16 bytes at once including those beyond of trailing 0.
    size_t ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);

    if (ResultingSize==(size_t)-1 && errno==EILSEQ)
    {
      // Aborted on inconvertible character not zero terminating the result.
      // EILSEQ helps to distinguish it from small output buffer abort.
      // We want to convert as much as we can, so we clean the output buffer
      // and repeat conversion.
      memset (&ps, 0, sizeof(ps));
      SrcParam=Src; // wcsrtombs can change the pointer.
      memset(Dest,0,DestSize);
      ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);
    }

    if (ResultingSize==(size_t)-1)
      RetCode=false;
    if (ResultingSize==0 && *Src!=0)
      RetCode=false;
  }
#else
  // size_t index, so comparison with DestSize is not signed/unsigned mixed.
  for (size_t I=0;I<DestSize;I++)
  {
    Dest[I]=(char)Src[I];
    if (Src[I]==0)
      break;
  }
#endif
  if (DestSize>0)
    Dest[DestSize-1]=0;

  // We tried to return the empty string if conversion is failed,
  // but it does not work well. WideCharToMultiByte returns 'failed' code
  // and partially converted string even if we wanted to convert only a part
  // of string and passed DestSize smaller than required for fully converted
  // string. Such call is the valid behavior in RAR code and we do not expect
  // the empty string in this case.

  return RetCode;
}


// Convert the zero terminated multibyte string 'Src' to a wide string
// stored in 'Dest'. 'DestSize' is the size of 'Dest' in wide characters
// including the terminating zero.
//
// The conversion method depends on the platform:
//  - Windows: MultiByteToWideChar with the ANSI code page;
//  - _APPLE: UTF-8 through UtfToWide, always reported as successful;
//  - MBFUNCTIONS: locale based mbsrtowcs. If it fails and the buffer
//    has room for more than the terminating zero, CharToWideMap is used
//    to map inconvertible bytes to the private use Unicode area;
//  - otherwise: plain widening of every char.
//
// Returns false if the conversion failed. If DestSize is not 0,
// 'Dest' is always zero terminated on return.
bool CharToWide(const char *Src,wchar *Dest,size_t DestSize)
{
  bool RetCode=true;

  // Set 'Dest' to zero just in case the conversion will fail.
  // Do not touch the buffer if it has no room even for a single character.
  if (DestSize>0)
    *Dest=0;

#ifdef _WIN_ALL
  if (MultiByteToWideChar(CP_ACP,0,Src,-1,Dest,(int)DestSize)==0)
    RetCode=false;

// mbstowcs is broken in Android NDK r9.
#elif defined(_APPLE)
  UtfToWide(Src,Dest,DestSize);

#elif defined(MBFUNCTIONS)
  mbstate_t ps;
  memset (&ps, 0, sizeof(ps));
  const char *SrcParam=Src; // mbsrtowcs can change the pointer.
  size_t ResultingSize=mbsrtowcs(Dest,&SrcParam,DestSize,&ps);
  if (ResultingSize==(size_t)-1)
    RetCode=false;
  if (ResultingSize==0 && *Src!=0)
    RetCode=false;

  // CharToWideMap sets RetCode itself depending on the mapping result.
  if (RetCode==false && DestSize>1)
    CharToWideMap(Src,Dest,DestSize,RetCode);
#else
  // size_t index, so comparison with DestSize is not signed/unsigned mixed.
  for (size_t I=0;I<DestSize;I++)
  {
    Dest[I]=(wchar_t)Src[I];
    if (Src[I]==0)
      break;
  }
#endif
  if (DestSize>0)
    Dest[DestSize-1]=0;

  // We tried to return the empty string if conversion is failed,
  // but it does not work well. MultiByteToWideChar returns 'failed' code
  // even if we wanted to convert only a part of string and passed DestSize
  // smaller than required for fully converted string. Such call is the valid
  // behavior in RAR code and we do not expect the empty string in this case.

  return RetCode;
}


#if defined(_UNIX) && defined(MBFUNCTIONS)
// Convert and restore mapped inconvertible Unicode characters.
// We use it for extended ASCII names in Unix.
//
// Returns false without modifying 'Dest' and 'Success' if 'Src' does not
// contain MappedStringMark, so the caller has to use the usual conversion.
// Otherwise converts 'Src' to 'Dest' holding 'DestSize' bytes and returns
// true. 'Success' is set to false if at least one character could not be
// converted and was replaced by '_'.
bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success)
{
  // String with inconvertible characters mapped to private use Unicode area
  // must have the mark code somewhere.
  if (wcschr(Src,(wchar)MappedStringMark)==NULL)
    return false;

  // Seems to be that wcrtomb in some memory analyzing libraries
  // can produce uninitialized output while reporting success on garbage input.
  // So we clean the destination to calm analyzers.
  memset(Dest,0,DestSize);
  
  Success=true;
  uint SrcPos=0,DestPos=0;

  // Always keep MB_CUR_MAX free bytes, so wcrtomb cannot write beyond
  // the end of 'Dest'. We test 'DestPos+MB_CUR_MAX<DestSize' instead of
  // 'DestPos<DestSize-MB_CUR_MAX', because the unsigned subtraction wraps
  // around for DestSize<MB_CUR_MAX and effectively removes the limit.
  while (Src[SrcPos]!=0 && DestPos+MB_CUR_MAX<DestSize)
  {
    // The mark itself does not produce any output.
    if (uint(Src[SrcPos])==MappedStringMark)
    {
      SrcPos++;
      continue;
    }
    // For security reasons do not restore low ASCII codes, so mapping cannot
    // be used to hide control codes like path separators.
    if (uint(Src[SrcPos])>=MapAreaStart+0x80 && uint(Src[SrcPos])<MapAreaStart+0x100)
      Dest[DestPos++]=char(uint(Src[SrcPos++])-MapAreaStart);
    else
    {
      mbstate_t ps;
      memset(&ps,0,sizeof(ps));
      if (wcrtomb(Dest+DestPos,Src[SrcPos],&ps)==(size_t)-1)
      {
        Dest[DestPos]='_';
        Success=false;
      }
      SrcPos++;

      // Detect how many bytes were stored for this character.
      // Advance by one byte at least even if mbrlen reports an error.
      memset(&ps,0,sizeof(ps));
      int Length=mbrlen(Dest+DestPos,MB_CUR_MAX,&ps);
      DestPos+=Max(Length,1);
    }
  }

  // DestSize-1 would wrap around for the empty buffer.
  if (DestSize>0)
    Dest[Min(DestPos,DestSize-1)]=0;
  return true;
}
#endif


#if defined(_UNIX) && defined(MBFUNCTIONS)
// Convert and map inconvertible Unicode characters.
// We use it for extended ASCII names in Unix.
//
// Converts the multibyte string 'Src' to 'Dest' holding 'DestSize' wide
// characters. Bytes 0x80 - 0xFF which cannot be converted are stored
// as MapAreaStart+byte. 'Success' is set to true only if the end of 'Src'
// was reached, so it is false if the output was truncated or if the
// conversion stopped at an inconvertible low ASCII byte.
void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success)
{
  // Map inconvertible characters to private use Unicode area 0xE000.
  // Mark such string by placing special non-character code before
  // first inconvertible character.
  Success=false;

  // No room even for the terminating zero. Also DestSize-1 below
  // would wrap around for the empty buffer.
  if (DestSize==0)
    return;

  bool MarkAdded=false;
  uint SrcPos=0,DestPos=0;
  while (DestPos<DestSize)
  {
    if (Src[SrcPos]==0)
    {
      Success=true;
      break;
    }
    mbstate_t ps;
    memset(&ps,0,sizeof(ps));
    size_t res=mbrtowc(Dest+DestPos,Src+SrcPos,MB_CUR_MAX,&ps);
    if (res==(size_t)-1 || res==(size_t)-2)
    {
      // For security reasons we do not want to map low ASCII characters,
      // so we do not have additional .. and path separator codes.
      if (byte(Src[SrcPos])>=0x80)
      {
        if (!MarkAdded)
        {
          Dest[DestPos++]=MappedStringMark;
          MarkAdded=true;
          if (DestPos>=DestSize)
            break;
        }
        Dest[DestPos++]=byte(Src[SrcPos++])+MapAreaStart;
      }
      else
        break;
    }
    else
    {
      // mbrtowc already returned the number of source bytes forming
      // this character, so there is no need to parse it again with mbrlen.
      // Advance by one byte at least to guarantee the progress.
      SrcPos+=res>0 ? (uint)res:1;
      DestPos++;
    }
  }
  Dest[Min(DestPos,DestSize-1)]=0;
}
#endif


// Store low 16 bits of every wide character as two bytes, low byte first.
// SrcSize is in wide characters, not in bytes. Copying stops after
// SrcSize characters or after storing the terminating zero.
byte* WideToRaw(const wchar *Src,byte *Dest,size_t SrcSize)
{
  byte *Out=Dest;
  for (size_t Left=SrcSize;Left>0;Left--)
  {
    const wchar Ch=*Src++;
    *Out++=(byte)Ch;
    *Out++=(byte)(Ch>>8);
    if (Ch==0)
      break;
  }
  return Dest;
}


// Build wide characters from byte pairs with low byte first.
// Stops after DestSize characters or after storing the zero character.
wchar* RawToWide(const byte *Src,wchar *Dest,size_t DestSize)
{
  for (size_t I=0;I<DestSize;I++)
  {
    const byte *Pair=Src+I*2;
    Dest[I]=Pair[0]+(Pair[1]<<8);
    if (Dest[I]==0)
      break;
  }
  return Dest;
}


// Convert the zero terminated wide string 'Src' to UTF-8 stored in 'Dest'.
// 'DestSize' is the size of 'Dest' in bytes including the terminating zero.
// The output is truncated on a character boundary if the buffer is too small.
// UTF-16 surrogate pairs are combined to a single 4 byte sequence.
// Characters 0x200000 and above are skipped.
void WideToUtf(const wchar *Src,char *Dest,size_t DestSize)
{
  // 'dsize' is the number of bytes still available for characters.
  long dsize=(long)DestSize;
  dsize--; // Reserve one byte for the terminating zero.

  // Every iteration reserves one byte here and multibyte sequences reserve
  // additional bytes below. Once 'dsize' becomes negative, nothing more
  // is written and the loop ends on the next check.
  while (*Src!=0 && --dsize>=0)
  {
    uint c=*(Src++);
    if (c<0x80)
      *(Dest++)=c;
    else
      if (c<0x800 && --dsize>=0)
      {
        *(Dest++)=(0xc0|(c>>6));
        *(Dest++)=(0x80|(c&0x3f));
      }
      else
      {
        // We also get here for c<0x800 if there is no room for 2 bytes.
        // 'dsize' is already negative then, so checks below fail as well.
        if (c>=0xd800 && c<=0xdbff && *Src>=0xdc00 && *Src<=0xdfff) // Surrogate pair.
        {
          c=((c-0xd800)<<10)+(*Src-0xdc00)+0x10000;
          Src++;
        }
        if (c<0x10000 && (dsize-=2)>=0)
        {
          *(Dest++)=(0xe0|(c>>12));
          *(Dest++)=(0x80|((c>>6)&0x3f));
          *(Dest++)=(0x80|(c&0x3f));
        }
        else
          if (c < 0x200000 && (dsize-=3)>=0)
          {
            *(Dest++)=(0xf0|(c>>18));
            *(Dest++)=(0x80|((c>>12)&0x3f));
            *(Dest++)=(0x80|((c>>6)&0x3f));
            *(Dest++)=(0x80|(c&0x3f));
          }
      }
  }

  // The empty buffer has no room even for the terminating zero.
  // The loop above never writes in this case, so we must not do it here.
  if (DestSize>0)
    *Dest=0;
}


// Return the number of bytes required to store 'Src' converted to UTF-8,
// including the terminating zero. Characters 0x200000 and above
// do not contribute to the result.
size_t WideToUtfSize(const wchar *Src)
{
  size_t Total=1; // Include terminating zero.
  while (*Src!=0)
  {
    if (*Src<0x80)
      Total+=1;
    else if (*Src<0x800)
      Total+=2;
    else if ((uint)*Src<0x10000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t.
    {
      const bool SurrogatePair=Src[0]>=0xd800 && Src[0]<=0xdbff && Src[1]>=0xdc00 && Src[1]<=0xdfff;
      if (SurrogatePair)
      {
        Total+=4; // 4 output bytes for Unicode surrogate pair.
        Src++;    // Skip the second half of the pair.
      }
      else
        Total+=3;
    }
    else if ((uint)*Src<0x200000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t.
      Total+=4;
    Src++;
  }
  return Total;
}


// Convert the zero terminated UTF-8 string 'Src' to a wide string stored
// in 'Dest'. 'DestSize' is the size of 'Dest' in wide characters including
// the terminating zero. Characters above 0xffff are stored as a surrogate
// pair if wide characters are 16-bit.
//
// Returns false if an invalid lead or continuation byte is found, which also
// stops the conversion, or if a code above 0x10ffff is found, which is
// skipped. Truncation because of a small buffer is not treated as an error.
bool UtfToWide(const char *Src,wchar *Dest,size_t DestSize)
{
  bool Success=true;

  // Number of wide characters still available in 'Dest',
  // one is reserved for the terminating zero.
  long dsize=(long)DestSize;
  dsize--;

  while (*Src!=0)
  {
    // 'c' is the lead byte, 'd' is the decoded code point.
    uint c=byte(*(Src++)),d;
    if (c<0x80)
      d=c;
    else if ((c>>5)==6) // 110xxxxx, 2 byte sequence.
    {
      if ((*Src&0xc0)!=0x80)
      {
        Success=false;
        break;
      }
      d=((c&0x1f)<<6)|(*Src&0x3f);
      Src++;
    }
    else if ((c>>4)==14) // 1110xxxx, 3 byte sequence.
    {
      if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80)
      {
        Success=false;
        break;
      }
      d=((c&0xf)<<12)|((Src[0]&0x3f)<<6)|(Src[1]&0x3f);
      Src+=2;
    }
    else if ((c>>3)==30) // 11110xxx, 4 byte sequence.
    {
      if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80 || (Src[2]&0xc0)!=0x80)
      {
        Success=false;
        break;
      }
      d=((c&7)<<18)|((Src[0]&0x3f)<<12)|((Src[1]&0x3f)<<6)|(Src[2]&0x3f);
      Src+=3;
    }
    else
    {
      // Neither ASCII nor a valid lead byte.
      Success=false;
      break;
    }

    if (--dsize<0)
      break;
    if (d>0xffff)
    {
      // Reserve the second output character, which the surrogate pair needs.
      if (--dsize<0)
        break;
      if (d>0x10ffff) // UTF-8 must end at 0x10ffff according to RFC 3629.
      {
        Success=false;
        continue;
      }
      if (sizeof(*Dest)==2) // Use the surrogate pair.
      {
        *(Dest++)=((d-0x10000)>>10)+0xd800;
        *(Dest++)=(d&0x3ff)+0xdc00;
      }
      else
        *(Dest++)=d;
    }
    else
      *(Dest++)=d;
  }

  // The empty buffer has no room even for the terminating zero.
  // The loop above never writes in this case, so we must not do it here.
  if (DestSize>0)
    *Dest=0;
  return Success;
}


// For zero terminated strings. Measures the string and passes it
// to the version accepting the explicit size.
bool IsTextUtf8(const byte *Src)
{
  const size_t Length=strlen((const char *)Src);
  return IsTextUtf8(Src,Length);
}


// Source data can be both with and without UTF-8 BOM.
bool IsTextUtf8(const byte *Src,size_t SrcSize)
{
  while (SrcSize-- > 0)
  {
    byte C=*(Src++);
    int HighOne=0; // Number of leftmost '1' bits.
    for (byte Mask=0x80;Mask!=0 && (C & Mask)!=0;Mask>>=1)
      HighOne++;
    if (HighOne==1 || HighOne>6)
      return false;
    while (--HighOne > 0)
      if (SrcSize-- <= 0 || (*(Src++) & 0xc0)!=0x80)
        return false;
  }
  return true;
}


int wcsicomp(const wchar *s1,const wchar *s2)
{
#ifdef _WIN_ALL
  return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,-1,s2,-1)-2;
#else
  while (true)
  {
    wchar u1 = towupper(*s1);
    wchar u2 = towupper(*s2);
    if (u1 != u2)
      return u1 < u2 ? -1 : 1;
    if (*s1==0)
      break;
    s1++;
    s2++;
  }
  return 0;
#endif
}


int wcsnicomp(const wchar *s1,const wchar *s2,size_t n)
{
#ifdef _WIN_ALL
  // If we specify 'n' exceeding the actual string length, CompareString goes
  // beyond the trailing zero and compares garbage. So we need to limit 'n'
  // to real string length.
  size_t l1=Min(wcslen(s1)+1,n);
  size_t l2=Min(wcslen(s2)+1,n);
  return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,(int)l1,s2,(int)l2)-2;
#else
  if (n==0)
    return 0;
  while (true)
  {
    wchar u1 = towupper(*s1);
    wchar u2 = towupper(*s2);
    if (u1 != u2)
      return u1 < u2 ? -1 : 1;
    if (*s1==0 || --n==0)
      break;
    s1++;
    s2++;
  }
  return 0;
#endif
}


// Case insensitive wcsstr(). Returns the pointer to first occurrence
// of 'search' in 'str' or NULL if it is not found. Nothing is found
// in the empty 'str' even if 'search' is empty too.
const wchar_t* wcscasestr(const wchar_t *str, const wchar_t *search)
{
  for (const wchar_t *start=str;*start!=0;start++)
  {
    // Advance in both strings while characters match ignoring the case.
    const wchar_t *cur=start,*pattern=search;
    while (*pattern!=0 && tolowerw(*cur)==tolowerw(*pattern))
    {
      cur++;
      pattern++;
    }
    // Entire 'search' string matched.
    if (*pattern==0)
      return start;
  }
  return NULL;
}


#ifndef SFX_MODULE
// Convert the zero terminated wide string to lower case in place.
// Returns the passed pointer.
wchar* wcslower(wchar *s)
{
#ifdef _WIN_ALL
  // _wcslwr requires setlocale and we do not want to depend on setlocale
  // in Windows. Also CharLower involves less overhead.
  CharLower(s);
#else
  wchar *cur=s;
  while (*cur!=0)
  {
    *cur=towlower(*cur);
    cur++;
  }
#endif
  return s;
}
#endif


#ifndef SFX_MODULE
// Convert the zero terminated wide string to upper case in place.
// Returns the passed pointer.
wchar* wcsupper(wchar *s)
{
#ifdef _WIN_ALL
  // _wcsupr requires setlocale and we do not want to depend on setlocale
  // in Windows. Also CharUpper involves less overhead.
  CharUpper(s);
#else
  wchar *cur=s;
  while (*cur!=0)
  {
    *cur=towupper(*cur);
    cur++;
  }
#endif
  return s;
}
#endif




// Convert a single wide character code to upper case.
int toupperw(int ch)
{
#ifdef _WIN_ALL
  // CharUpper is more reliable than towupper in Windows, which seems to be
  // C locale dependent even in Unicode version. For example, towupper failed
  // to convert lowercase Russian characters. Use 0xffff mask to prevent crash
  // if value larger than 0xffff is passed to this function.
  return (int)(INT_PTR)CharUpper((wchar *)(INT_PTR)(ch&0xffff));
#else
  const wint_t Upper=towupper((wint_t)ch);
  return (int)Upper;
#endif
}


// Convert a single wide character code to lower case.
int tolowerw(int ch)
{
#ifdef _WIN_ALL
  // CharLower is more reliable than towlower in Windows.
  // See comment for towupper above. Use 0xffff mask to prevent crash
  // if value larger than 0xffff is passed to this function.
  return (int)(INT_PTR)CharLower((wchar *)(INT_PTR)(ch&0xffff));
#else
  const wint_t Lower=towlower((wint_t)ch);
  return (int)Lower;
#endif
}


int atoiw(const wchar *s)
{
  return (int)atoilw(s);
}


// Convert the decimal wide string with optional leading '-' to int64.
// Parsing stops at the first non-digit character. Neither leading spaces
// nor '+' are recognized.
int64 atoilw(const wchar *s)
{
  // We do use signed integers here, for example, in GUI SFX.
  const bool Negative=*s=='-';
  if (Negative)
    s++;

  // Use unsigned type here, since long string can overflow the variable
  // and signed integer overflow is undefined behavior in C++.
  uint64 Value=0;
  for (;*s>='0' && *s<='9';s++)
    Value=Value*10+(*s-'0');

  // Check Signed>=0 to avoid the signed overflow with undefined behavior
  // when negating 0x8000000000000000.
  const int64 Signed=int64(Value);
  if (Negative && Signed>=0)
    return -Signed;
  return Signed;
}


#ifdef DBCS_SUPPORTED
// DBCS support object defined at file level, so its constructor
// fills the tables before the object is used by other code.
SupportDBCS gdbcs;

// Read the code page information immediately, so the object is ready
// to use after construction. Init() can be called again to refresh it.
SupportDBCS::SupportDBCS()
{
  Init();
}


// Detect if the ANSI code page uses multibyte characters
// and fill the lead byte lookup table for all IsLeadByte entries.
void SupportDBCS::Init()
{
  CPINFO CPInfo;
  // GetCPInfo returns 0 in case of failure and leaves CPInfo uninitialized,
  // so we must not read MaxCharSize then. Assume the single byte code page.
  DBCSMode=GetCPInfo(CP_ACP,&CPInfo)!=0 && CPInfo.MaxCharSize > 1;
  for (uint I=0;I<ASIZE(IsLeadByte);I++)
    IsLeadByte[I]=IsDBCSLeadByte(I)!=0;
}


// Return the pointer to the character following the character at 's'.
char* SupportDBCS::charnext(const char *s)
{
  // Zero cannot be the trail byte. So if next byte after the lead byte
  // is 0, the string is corrupt and we'll better return the pointer to 0,
  // to break string processing loops.
  const bool DoubleByte=IsLeadByte[(byte)*s] && s[1]!=0;
  return (char *)(DoubleByte ? s+2:s+1);
}
#endif



Coverage Report

Created: 2024-04-23 06:19

Line	Count	Source (jump to first uncovered line)
1		#include "rar.hpp"
2		#define MBFUNCTIONS
3
4		#if defined(_UNIX) && defined(MBFUNCTIONS)
5
6		static bool WideToCharMap(const wchar Src,char Dest,size_t DestSize,bool &Success);
7		static void CharToWideMap(const char Src,wchar Dest,size_t DestSize,bool &Success);
8
9		// In Unix we map high ASCII characters which cannot be converted to Unicode
10		// to 0xE000 - 0xE0FF private use Unicode area.
11		static const uint MapAreaStart=0xE000;
12
13		// Mapped string marker. Initially we used 0xFFFF for this purpose,
14		// but it causes MSVC2008 swprintf to fail (it treats 0xFFFF as error marker).
15		// While we could workaround it, it is safer to use another character.
16		static const uint MappedStringMark=0xFFFE;
17
18		#endif
19
20		bool WideToChar(const wchar Src,char Dest,size_t DestSize)
21	24.1k	{
22	24.1k	bool RetCode=true;
23	24.1k	*Dest=0; // Set 'Dest' to zero just in case the conversion will fail.
24
25		#ifdef _WIN_ALL
26		if (WideCharToMultiByte(CP_ACP,0,Src,-1,Dest,(int)DestSize,NULL,NULL)==0)
27		RetCode=false;
28
29		// wcstombs is broken in Android NDK r9.
30		#elif defined(_APPLE)
31		WideToUtf(Src,Dest,DestSize);
32
33		#elif defined(MBFUNCTIONS)
34	24.1k	if (!WideToCharMap(Src,Dest,DestSize,RetCode))
35	21.1k	{
36	21.1k	mbstate_t ps; // Use thread safe external state based functions.
37	21.1k	memset (&ps, 0, sizeof(ps));
38	21.1k	const wchar *SrcParam=Src; // wcsrtombs can change the pointer.
39
40		// Some implementations of wcsrtombs can cause memory analyzing tools
41		// like valgrind to report uninitialized data access. It happens because
42		// internally these implementations call SSE4 based wcslen function,
43		// which reads 16 bytes at once including those beyond of trailing 0.
44	21.1k	size_t ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);
45
46	21.1k	if (ResultingSize==(size_t)-1 && errno==EILSEQ)
47	3.65k	{
48		// Aborted on inconvertible character not zero terminating the result.
49		// EILSEQ helps to distinguish it from small output buffer abort.
50		// We want to convert as much as we can, so we clean the output buffer
51		// and repeat conversion.
52	3.65k	memset (&ps, 0, sizeof(ps));
53	3.65k	SrcParam=Src; // wcsrtombs can change the pointer.
54	3.65k	memset(Dest,0,DestSize);
55	3.65k	ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);
56	3.65k	}
57
58	21.1k	if (ResultingSize==(size_t)-1)
59	3.65k	RetCode=false;
60	21.1k	if (ResultingSize==0 && *Src!=0)
61	0	RetCode=false;
62	21.1k	}
63		#else
64		for (int I=0;I<DestSize;I++)
65		{
66		Dest[I]=(char)Src[I];
67		if (Src[I]==0)
68		break;
69		}
70		#endif
71	24.1k	if (DestSize>0)
72	24.1k	Dest[DestSize-1]=0;
73
74		// We tried to return the empty string if conversion is failed,
75		// but it does not work well. WideCharToMultiByte returns 'failed' code
76		// and partially converted string even if we wanted to convert only a part
77		// of string and passed DestSize smaller than required for fully converted
78		// string. Such call is the valid behavior in RAR code and we do not expect
79		// the empty string in this case.
80
81	24.1k	return RetCode;
82	24.1k	}
83
84
85		bool CharToWide(const char Src,wchar Dest,size_t DestSize)
86	1.67k	{
87	1.67k	bool RetCode=true;
88	1.67k	*Dest=0; // Set 'Dest' to zero just in case the conversion will fail.
89
90		#ifdef _WIN_ALL
91		if (MultiByteToWideChar(CP_ACP,0,Src,-1,Dest,(int)DestSize)==0)
92		RetCode=false;
93
94		// mbstowcs is broken in Android NDK r9.
95		#elif defined(_APPLE)
96		UtfToWide(Src,Dest,DestSize);
97
98		#elif defined(MBFUNCTIONS)
99	1.67k	mbstate_t ps;
100	1.67k	memset (&ps, 0, sizeof(ps));
101	1.67k	const char *SrcParam=Src; // mbsrtowcs can change the pointer.
102	1.67k	size_t ResultingSize=mbsrtowcs(Dest,&SrcParam,DestSize,&ps);
103	1.67k	if (ResultingSize==(size_t)-1)
104	1.00k	RetCode=false;
105	1.67k	if (ResultingSize==0 && *Src!=0)
106	0	RetCode=false;
107
108	1.67k	if (RetCode==false && DestSize>1)
109	1.00k	CharToWideMap(Src,Dest,DestSize,RetCode);
110		#else
111		for (int I=0;I<DestSize;I++)
112		{
113		Dest[I]=(wchar_t)Src[I];
114		if (Src[I]==0)
115		break;
116		}
117		#endif
118	1.67k	if (DestSize>0)
119	1.67k	Dest[DestSize-1]=0;
120
121		// We tried to return the empty string if conversion is failed,
122		// but it does not work well. MultiByteToWideChar returns 'failed' code
123		// even if we wanted to convert only a part of string and passed DestSize
124		// smaller than required for fully converted string. Such call is the valid
125		// behavior in RAR code and we do not expect the empty string in this case.
126
127	1.67k	return RetCode;
128	1.67k	}
129
130
131		#if defined(_UNIX) && defined(MBFUNCTIONS)
132		// Convert and restore mapped inconvertible Unicode characters.
133		// We use it for extended ASCII names in Unix.
134		bool WideToCharMap(const wchar Src,char Dest,size_t DestSize,bool &Success)
135	24.1k	{
136		// String with inconvertible characters mapped to private use Unicode area
137		// must have the mark code somewhere.
138	24.1k	if (wcschr(Src,(wchar)MappedStringMark)==NULL)
139	21.1k	return false;
140
141		// Seems to be that wcrtomb in some memory analyzing libraries
142		// can produce uninitilized output while reporting success on garbage input.
143		// So we clean the destination to calm analyzers.
144	2.93k	memset(Dest,0,DestSize);
145
146	2.93k	Success=true;
147	2.93k	uint SrcPos=0,DestPos=0;
148	144k	while (Src[SrcPos]!=0 && DestPos<DestSize-MB_CUR_MAX)
149	141k	{
150	141k	if (uint(Src[SrcPos])==MappedStringMark)
151	4.49k	{
152	4.49k	SrcPos++;
153	4.49k	continue;
154	4.49k	}
155		// For security reasons do not restore low ASCII codes, so mapping cannot
156		// be used to hide control codes like path separators.
157	137k	if (uint(Src[SrcPos])>=MapAreaStart+0x80 && uint(Src[SrcPos])<MapAreaStart+0x100)
158	32.7k	Dest[DestPos++]=char(uint(Src[SrcPos++])-MapAreaStart);
159	104k	else
160	104k	{
161	104k	mbstate_t ps;
162	104k	memset(&ps,0,sizeof(ps));
163	104k	if (wcrtomb(Dest+DestPos,Src[SrcPos],&ps)==(size_t)-1)
164	33.2k	{
165	33.2k	Dest[DestPos]='_';
166	33.2k	Success=false;
167	33.2k	}
168	104k	SrcPos++;
169	104k	memset(&ps,0,sizeof(ps));
170	104k	int Length=mbrlen(Dest+DestPos,MB_CUR_MAX,&ps);
171	104k	DestPos+=Max(Length,1);
172	104k	}
173	137k	}
174	2.93k	Dest[Min(DestPos,DestSize-1)]=0;
175	2.93k	return true;
176	24.1k	}
177		#endif
178
179
180		#if defined(_UNIX) && defined(MBFUNCTIONS)
181		// Convert and map inconvertible Unicode characters.
182		// We use it for extended ASCII names in Unix.
183		void CharToWideMap(const char Src,wchar Dest,size_t DestSize,bool &Success)
184	1.00k	{
185		// Map inconvertible characters to private use Unicode area 0xE000.
186		// Mark such string by placing special non-character code before
187		// first inconvertible character.
188	1.00k	Success=false;
189	1.00k	bool MarkAdded=false;
190	1.00k	uint SrcPos=0,DestPos=0;
191	322k	while (DestPos<DestSize)
192	322k	{
193	322k	if (Src[SrcPos]==0)
194	971	{
195	971	Success=true;
196	971	break;
197	971	}
198	321k	mbstate_t ps;
199	321k	memset(&ps,0,sizeof(ps));
200	321k	size_t res=mbrtowc(Dest+DestPos,Src+SrcPos,MB_CUR_MAX,&ps);
201	321k	if (res==(size_t)-1 \|\| res==(size_t)-2)
202	164k	{
203		// For security reasons we do not want to map low ASCII characters,
204		// so we do not have additional .. and path separator codes.
205	164k	if (byte(Src[SrcPos])>=0x80)
206	164k	{
207	164k	if (!MarkAdded)
208	1.00k	{
209	1.00k	Dest[DestPos++]=MappedStringMark;
210	1.00k	MarkAdded=true;
211	1.00k	if (DestPos>=DestSize)
212	0	break;
213	1.00k	}
214	164k	Dest[DestPos++]=byte(Src[SrcPos++])+MapAreaStart;
215	164k	}
216	0	else
217	0	break;
218	164k	}
219	157k	else
220	157k	{
221	157k	memset(&ps,0,sizeof(ps));
222	157k	int Length=mbrlen(Src+SrcPos,MB_CUR_MAX,&ps);
223	157k	SrcPos+=Max(Length,1);
224	157k	DestPos++;
225	157k	}
226	321k	}
227	1.00k	Dest[Min(DestPos,DestSize-1)]=0;
228	1.00k	}
229		#endif
230
231
232		// SrcSize is in wide characters, not in bytes.
233		byte* WideToRaw(const wchar Src,byte Dest,size_t SrcSize)
234	0	{
235	0	for (size_t I=0;I<SrcSize;I++,Src++)
236	0	{
237	0	Dest[I2]=(byte)Src;
238	0	Dest[I2+1]=(byte)(Src>>8);
239	0	if (*Src==0)
240	0	break;
241	0	}
242	0	return Dest;
243	0	}
244
245
246		wchar* RawToWide(const byte Src,wchar Dest,size_t DestSize)
247	0	{
248	0	for (size_t I=0;I<DestSize;I++)
249	0	if ((Dest[I]=Src[I2]+(Src[I2+1]<<8))==0)
250	0	break;
251	0	return Dest;
252	0	}
253
254
255		void WideToUtf(const wchar Src,char Dest,size_t DestSize)
256	0	{
257	0	long dsize=(long)DestSize;
258	0	dsize--;
259	0	while (*Src!=0 && --dsize>=0)
260	0	{
261	0	uint c=*(Src++);
262	0	if (c<0x80)
263	0	*(Dest++)=c;
264	0	else
265	0	if (c<0x800 && --dsize>=0)
266	0	{
267	0	*(Dest++)=(0xc0\|(c>>6));
268	0	*(Dest++)=(0x80\|(c&0x3f));
269	0	}
270	0	else
271	0	{
272	0	if (c>=0xd800 && c<=0xdbff && Src>=0xdc00 && Src<=0xdfff) // Surrogate pair.
273	0	{
274	0	c=((c-0xd800)<<10)+(*Src-0xdc00)+0x10000;
275	0	Src++;
276	0	}
277	0	if (c<0x10000 && (dsize-=2)>=0)
278	0	{
279	0	*(Dest++)=(0xe0\|(c>>12));
280	0	*(Dest++)=(0x80\|((c>>6)&0x3f));
281	0	*(Dest++)=(0x80\|(c&0x3f));
282	0	}
283	0	else
284	0	if (c < 0x200000 && (dsize-=3)>=0)
285	0	{
286	0	*(Dest++)=(0xf0\|(c>>18));
287	0	*(Dest++)=(0x80\|((c>>12)&0x3f));
288	0	*(Dest++)=(0x80\|((c>>6)&0x3f));
289	0	*(Dest++)=(0x80\|(c&0x3f));
290	0	}
291	0	}
292	0	}
293	0	*Dest=0;
294	0	}
295
296
297		size_t WideToUtfSize(const wchar *Src)
298	0	{
299	0	size_t Size=0;
300	0	for (;*Src!=0;Src++)
301	0	if (*Src<0x80)
302	0	Size++;
303	0	else
304	0	if (*Src<0x800)
305	0	Size+=2;
306	0	else
307	0	if ((uint)*Src<0x10000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t.
308	0	{
309	0	if (Src[0]>=0xd800 && Src[0]<=0xdbff && Src[1]>=0xdc00 && Src[1]<=0xdfff)
310	0	{
311	0	Size+=4; // 4 output bytes for Unicode surrogate pair.
312	0	Src++;
313	0	}
314	0	else
315	0	Size+=3;
316	0	}
317	0	else
318	0	if ((uint)*Src<0x200000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t.
319	0	Size+=4;
320	0	return Size+1; // Include terminating zero.
321	0	}
322
323
324		bool UtfToWide(const char Src,wchar Dest,size_t DestSize)
325	143k	{
326	143k	bool Success=true;
327	143k	long dsize=(long)DestSize;
328	143k	dsize--;
329	152k	while (*Src!=0)
330	9.02k	{
331	9.02k	uint c=byte(*(Src++)),d;
332	9.02k	if (c<0x80)
333	8.84k	d=c;
334	182	else
335	182	if ((c>>5)==6)
336	50	{
337	50	if ((*Src&0xc0)!=0x80)
338	45	{
339	45	Success=false;
340	45	break;
341	45	}
342	5	d=((c&0x1f)<<6)\|(*Src&0x3f);
343	5	Src++;
344	5	}
345	132	else
346	132	if ((c>>4)==14)
347	12	{
348	12	if ((Src[0]&0xc0)!=0x80 \|\| (Src[1]&0xc0)!=0x80)
349	11	{
350	11	Success=false;
351	11	break;
352	11	}
353	1	d=((c&0xf)<<12)\|((Src[0]&0x3f)<<6)\|(Src[1]&0x3f);
354	1	Src+=2;
355	1	}
356	120	else
357	120	if ((c>>3)==30)
358	5	{
359	5	if ((Src[0]&0xc0)!=0x80 \|\| (Src[1]&0xc0)!=0x80 \|\| (Src[2]&0xc0)!=0x80)
360	5	{
361	5	Success=false;
362	5	break;
363	5	}
364	0	d=((c&7)<<18)\|((Src[0]&0x3f)<<12)\|((Src[1]&0x3f)<<6)\|(Src[2]&0x3f);
365	0	Src+=3;
366	0	}
367	115	else
368	115	{
369	115	Success=false;
370	115	break;
371	115	}
372	8.84k	if (--dsize<0)
373	0	break;
374	8.84k	if (d>0xffff)
375	0	{
376	0	if (--dsize<0)
377	0	break;
378	0	if (d>0x10ffff) // UTF-8 must end at 0x10ffff according to RFC 3629.
379	0	{
380	0	Success=false;
381	0	continue;
382	0	}
383	0	if (sizeof(*Dest)==2) // Use the surrogate pair.
384	0	{
385	0	*(Dest++)=((d-0x10000)>>10)+0xd800;
386	0	*(Dest++)=(d&0x3ff)+0xdc00;
387	0	}
388	0	else
389	0	*(Dest++)=d;
390	0	}
391	8.84k	else
392	8.84k	*(Dest++)=d;
393	8.84k	}
394	143k	*Dest=0;
395	143k	return Success;
396	143k	}
397
398
399		// For zero terminated strings.
400		bool IsTextUtf8(const byte *Src)
401	0	{
402	0	return IsTextUtf8(Src,strlen((const char *)Src));
403	0	}
404
405
406		// Source data can be both with and without UTF-8 BOM.
407		bool IsTextUtf8(const byte *Src,size_t SrcSize)
408	0	{
409	0	while (SrcSize-- > 0)
410	0	{
411	0	byte C=*(Src++);
412	0	int HighOne=0; // Number of leftmost '1' bits.
413	0	for (byte Mask=0x80;Mask!=0 && (C & Mask)!=0;Mask>>=1)
414	0	HighOne++;
415	0	if (HighOne==1 \|\| HighOne>6)
416	0	return false;
417	0	while (--HighOne > 0)
418	0	if (SrcSize-- <= 0 \|\| (*(Src++) & 0xc0)!=0x80)
419	0	return false;
420	0	}
421	0	return true;
422	0	}
423
424
425		int wcsicomp(const wchar s1,const wchar s2)
426	802	{
427		#ifdef _WIN_ALL
428		return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE\|SORT_STRINGSORT,s1,-1,s2,-1)-2;
429		#else
430	1.97k	while (true)
431	1.97k	{
432	1.97k	wchar u1 = towupper(*s1);
433	1.97k	wchar u2 = towupper(*s2);
434	1.97k	if (u1 != u2)
435	734	return u1 < u2 ? -1 : 1;
436	1.23k	if (*s1==0)
437	68	break;
438	1.16k	s1++;
439	1.16k	s2++;
440	1.16k	}
441	68	return 0;
442	802	#endif
443	802	}
444
445
446		int wcsnicomp(const wchar s1,const wchar s2,size_t n)
447	0	{
448		#ifdef _WIN_ALL
449		// If we specify 'n' exceeding the actual string length, CompareString goes
450		// beyond the trailing zero and compares garbage. So we need to limit 'n'
451		// to real string length.
452		size_t l1=Min(wcslen(s1)+1,n);
453		size_t l2=Min(wcslen(s2)+1,n);
454		return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE\|SORT_STRINGSORT,s1,(int)l1,s2,(int)l2)-2;
455		#else
456	0	if (n==0)
457	0	return 0;
458	0	while (true)
459	0	{
460	0	wchar u1 = towupper(*s1);
461	0	wchar u2 = towupper(*s2);
462	0	if (u1 != u2)
463	0	return u1 < u2 ? -1 : 1;
464	0	if (*s1==0 \|\| --n==0)
465	0	break;
466	0	s1++;
467	0	s2++;
468	0	}
469	0	return 0;
470	0	#endif
471	0	}
472
473
474		// Case insensitive wcsstr().
475		const wchar_t* wcscasestr(const wchar_t str, const wchar_t search)
476	0	{
477	0	for (size_t i=0;str[i]!=0;i++)
478	0	for (size_t j=0;;j++)
479	0	{
480	0	if (search[j]==0)
481	0	return str+i;
482	0	if (tolowerw(str[i+j])!=tolowerw(search[j]))
483	0	break;
484	0	}
485	0	return NULL;
486	0	}
487
488
489		#ifndef SFX_MODULE
490		wchar* wcslower(wchar *s)
491	0	{
492		#ifdef _WIN_ALL
493		// _wcslwr requires setlocale and we do not want to depend on setlocale
494		// in Windows. Also CharLower involves less overhead.
495		CharLower(s);
496		#else
497	0	for (wchar c=s;c!=0;c++)
498	0	c=towlower(c);
499	0	#endif
500	0	return s;
501	0	}
502		#endif
503
504
505		#ifndef SFX_MODULE
506		wchar* wcsupper(wchar *s)
507	4.09k	{
508		#ifdef _WIN_ALL
509		// _wcsupr requires setlocale and we do not want to depend on setlocale
510		// in Windows. Also CharUpper involves less overhead.
511		CharUpper(s);
512		#else
513	8.19k	for (wchar c=s;c!=0;c++)
514	4.09k	c=towupper(c);
515	4.09k	#endif
516	4.09k	return s;
517	4.09k	}
518		#endif
519
520
521
522
523		int toupperw(int ch)
524	155k	{
525		#if defined(_WIN_ALL)
526		// CharUpper is more reliable than towupper in Windows, which seems to be
527		// C locale dependent even in Unicode version. For example, towupper failed
528		// to convert lowercase Russian characters. Use 0xffff mask to prevent crash
529		// if value larger than 0xffff is passed to this function.
530		return (int)(INT_PTR)CharUpper((wchar *)(INT_PTR)(ch&0xffff));
531		#else
532	155k	return towupper(ch);
533	155k	#endif
534	155k	}
535
536
537		int tolowerw(int ch)
538	0	{
539		#if defined(_WIN_ALL)
540		// CharLower is more reliable than towlower in Windows.
541		// See comment for towupper above. Use 0xffff mask to prevent crash
542		// if value larger than 0xffff is passed to this function.
543		return (int)(INT_PTR)CharLower((wchar *)(INT_PTR)(ch&0xffff));
544		#else
545	0	return towlower(ch);
546	0	#endif
547	0	}
548
549
550		int atoiw(const wchar *s)
551	14	{
552	14	return (int)atoilw(s);
553	14	}
554
555
556		int64 atoilw(const wchar *s)
557	14	{
558	14	bool sign=false;
559	14	if (*s=='-') // We do use signed integers here, for example, in GUI SFX.
560	1	{
561	1	s++;
562	1	sign=true;
563	1	}
564		// Use unsigned type here, since long string can overflow the variable
565		// and signed integer overflow is undefined behavior in C++.
566	14	uint64 n=0;
567	267	while (s>='0' && s<='9')
568	253	{
569	253	n=n10+(s-'0');
570	253	s++;
571	253	}
572		// Check int64(n)>=0 to avoid the signed overflow with undefined behavior
573		// when negating 0x8000000000000000.
574	14	return sign && int64(n)>=0 ? -int64(n) : int64(n);
575	14	}
576
577
578		#ifdef DBCS_SUPPORTED
579		SupportDBCS gdbcs;
580
581		SupportDBCS::SupportDBCS()
582		{
583		Init();
584		}
585
586
587		void SupportDBCS::Init()
588		{
589		CPINFO CPInfo;
590		GetCPInfo(CP_ACP,&CPInfo);
591		DBCSMode=CPInfo.MaxCharSize > 1;
592		for (uint I=0;I<ASIZE(IsLeadByte);I++)
593		IsLeadByte[I]=IsDBCSLeadByte(I)!=0;
594		}
595
596
597		char* SupportDBCS::charnext(const char *s)
598		{
599		// Zero cannot be the trail byte. So if next byte after the lead byte
600		// is 0, the string is corrupt and we'll better return the pointer to 0,
601		// to break string processing loops.
602		return (char )(IsLeadByte[(byte)s] && s[1]!=0 ? s+2:s+1);
603		}
604		#endif
605
606