Coverage Report

Created: 2025-04-11 06:56

/src/unrar/unicode.cpp
Line
Count
Source (jump to first uncovered line)
1
#include "rar.hpp"
2
#define MBFUNCTIONS
3
4
#if defined(_UNIX) && defined(MBFUNCTIONS)
5
6
// Forward declarations of the mapping helpers defined later in this file.
// Both report the conversion result through the 'Success' reference.
static bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success);
7
static void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success);
8
9
// In Unix we map high ASCII characters, which cannot be converted to Unicode,
10
// to the 0xE000 - 0xE0FF private use Unicode area. This is the area base.
11
static const uint MapAreaStart=0xE000;
12
13
// Mapped string marker. Initially we used 0xFFFF for this purpose,
14
// but it causes MSVC2008 swprintf to fail (it treats 0xFFFF as error marker).
15
// While we could work around it, it is safer to use another character.
16
static const uint MappedStringMark=0xFFFE;
17
18
#endif
19
20
// Convert the zero terminated wide string 'Src' to a multibyte string stored
// in the 'Dest' buffer of 'DestSize' bytes. If DestSize is not zero, 'Dest'
// is always zero terminated on return. Returns false if the conversion
// failed; 'Dest' can still hold a partially converted string in that case.
bool WideToChar(const wchar *Src,char *Dest,size_t DestSize)
{
  // An empty buffer cannot hold even the trailing zero, so we must not
  // touch 'Dest' at all. Report success only if there was nothing to convert.
  if (DestSize==0)
    return *Src==0;

  bool RetCode=true;
  *Dest=0; // Set 'Dest' to zero just in case the conversion will fail.

#ifdef _WIN_ALL
  if (WideCharToMultiByte(CP_ACP,0,Src,-1,Dest,(int)DestSize,NULL,NULL)==0)
    RetCode=false;

// wcstombs is broken in Android NDK r9.
#elif defined(_APPLE)
  WideToUtf(Src,Dest,DestSize);

#elif defined(MBFUNCTIONS)
  // Strings carrying the mapped characters mark are handled by
  // WideToCharMap, which also sets RetCode. Otherwise use the C library.
  if (!WideToCharMap(Src,Dest,DestSize,RetCode))
  {
    mbstate_t ps; // Use thread safe external state based functions.
    memset (&ps, 0, sizeof(ps));
    const wchar *SrcParam=Src; // wcsrtombs can change the pointer.

    // Some implementations of wcsrtombs can cause memory analyzing tools
    // like valgrind to report uninitialized data access. It happens because
    // internally these implementations call SSE4 based wcslen function,
    // which reads 16 bytes at once including those beyond of trailing 0.
    size_t ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);

    if (ResultingSize==(size_t)-1 && errno==EILSEQ)
    {
      // Aborted on inconvertible character not zero terminating the result.
      // EILSEQ helps to distinguish it from small output buffer abort.
      // We want to convert as much as we can, so we clean the output buffer
      // and repeat conversion.
      memset (&ps, 0, sizeof(ps));
      SrcParam=Src; // wcsrtombs can change the pointer.
      memset(Dest,0,DestSize);
      ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);
    }

    if (ResultingSize==(size_t)-1)
      RetCode=false;
    if (ResultingSize==0 && *Src!=0)
      RetCode=false;
  }
#else
  // Plain truncating copy if no multibyte functions are available.
  for (size_t I=0;I<DestSize;I++)
  {
    Dest[I]=(char)Src[I];
    if (Src[I]==0)
      break;
  }
#endif
  // DestSize is known to be non-zero here, see the check above.
  Dest[DestSize-1]=0;

  // We tried to return the empty string if conversion is failed,
  // but it does not work well. WideCharToMultiByte returns 'failed' code
  // and partially converted string even if we wanted to convert only a part
  // of string and passed DestSize smaller than required for fully converted
  // string. Such call is the valid behavior in RAR code and we do not expect
  // the empty string in this case.

  return RetCode;
}
83
84
85
// Convert the zero terminated multibyte string 'Src' to a wide string stored
// in the 'Dest' buffer of 'DestSize' wide characters. If DestSize is not
// zero, 'Dest' is always zero terminated on return. Returns false if
// the conversion failed; 'Dest' can still hold a partial result in that case.
bool CharToWide(const char *Src,wchar *Dest,size_t DestSize)
{
  // An empty buffer cannot hold even the trailing zero, so we must not
  // touch 'Dest' at all. Report success only if there was nothing to convert.
  if (DestSize==0)
    return *Src==0;

  bool RetCode=true;
  *Dest=0; // Set 'Dest' to zero just in case the conversion will fail.

#ifdef _WIN_ALL
  if (MultiByteToWideChar(CP_ACP,0,Src,-1,Dest,(int)DestSize)==0)
    RetCode=false;

// mbstowcs is broken in Android NDK r9.
#elif defined(_APPLE)
  UtfToWide(Src,Dest,DestSize);

#elif defined(MBFUNCTIONS)
  mbstate_t ps; // Use thread safe external state based functions.
  memset (&ps, 0, sizeof(ps));
  const char *SrcParam=Src; // mbsrtowcs can change the pointer.
  size_t ResultingSize=mbsrtowcs(Dest,&SrcParam,DestSize,&ps);
  if (ResultingSize==(size_t)-1)
    RetCode=false;
  if (ResultingSize==0 && *Src!=0)
    RetCode=false;

  // If the C library cannot convert the string, retry with inconvertible
  // bytes mapped to the private use area. We need a room for at least
  // one character and the trailing zero to do it.
  if (RetCode==false && DestSize>1)
    CharToWideMap(Src,Dest,DestSize,RetCode);
#else
  // Plain widening copy if no multibyte functions are available.
  for (size_t I=0;I<DestSize;I++)
  {
    // Go through the unsigned type, so bytes above 0x7f are not
    // sign extended to huge or negative wide character codes.
    Dest[I]=(wchar_t)(unsigned char)Src[I];
    if (Src[I]==0)
      break;
  }
#endif
  // DestSize is known to be non-zero here, see the check above.
  Dest[DestSize-1]=0;

  // We tried to return the empty string if conversion is failed,
  // but it does not work well. MultiByteToWideChar returns 'failed' code
  // even if we wanted to convert only a part of string and passed DestSize
  // smaller than required for fully converted string. Such call is the valid
  // behavior in RAR code and we do not expect the empty string in this case.

  return RetCode;
}
129
130
131
bool WideToChar(const std::wstring &Src,std::string &Dest)
132
373k
{
133
  // We need more than 1 char per wchar_t for DBCS and up to 4 for UTF-8.
134
373k
  std::vector<char> DestA(4*Src.size()+1); // "+1" for terminating zero.
135
373k
  bool Result=WideToChar(Src.c_str(),DestA.data(),DestA.size());
136
373k
  Dest=DestA.data();
137
373k
  return Result;
138
373k
}
139
140
141
bool CharToWide(const std::string &Src,std::wstring &Dest)
142
7.85k
{
143
  // 2 wchar_t per char in case char is converted to UTF-16 surrogate pair.
144
7.85k
  std::vector<wchar> DestW(2*Src.size()+1); // "+1" for terminating zero.
145
7.85k
  bool Result=CharToWide(Src.c_str(),DestW.data(),DestW.size());
146
7.85k
  Dest=DestW.data();
147
7.85k
  return Result;
148
7.85k
}
149
150
151
#if defined(_UNIX) && defined(MBFUNCTIONS)
152
// Convert and restore mapped inconvertible Unicode characters. 
153
// We use it for extended ASCII names in Unix.
154
bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success)
155
373k
{
156
  // String with inconvertible characters mapped to private use Unicode area
157
  // must have the mark code somewhere.
158
373k
  if (wcschr(Src,(wchar)MappedStringMark)==NULL)
159
153k
    return false;
160
161
  // Seems to be that wcrtomb in some memory analyzing libraries
162
  // can produce uninitilized output while reporting success on garbage input.
163
  // So we clean the destination to calm analyzers.
164
220k
  memset(Dest,0,DestSize);
165
  
166
220k
  Success=true;
167
220k
  uint SrcPos=0,DestPos=0;
168
317M
  while (Src[SrcPos]!=0 && DestPos<DestSize-MB_CUR_MAX)
169
317M
  {
170
317M
    if (uint(Src[SrcPos])==MappedStringMark)
171
276k
    {
172
276k
      SrcPos++;
173
276k
      continue;
174
276k
    }
175
    // For security reasons do not restore low ASCII codes, so mapping cannot
176
    // be used to hide control codes like path separators.
177
316M
    if (uint(Src[SrcPos])>=MapAreaStart+0x80 && uint(Src[SrcPos])<MapAreaStart+0x100)
178
63.5M
      Dest[DestPos++]=char(uint(Src[SrcPos++])-MapAreaStart);
179
253M
    else
180
253M
    {
181
253M
      mbstate_t ps;
182
253M
      memset(&ps,0,sizeof(ps));
183
253M
      if (wcrtomb(Dest+DestPos,Src[SrcPos],&ps)==(size_t)-1)
184
2.00M
      {
185
2.00M
        Dest[DestPos]='_';
186
2.00M
        Success=false;
187
2.00M
      }
188
253M
      SrcPos++;
189
253M
      memset(&ps,0,sizeof(ps));
190
253M
      int Length=mbrlen(Dest+DestPos,MB_CUR_MAX,&ps);
191
253M
      DestPos+=Max(Length,1);
192
253M
    }
193
316M
  }
194
220k
  Dest[Min(DestPos,DestSize-1)]=0;
195
220k
  return true;
196
373k
}
197
#endif
198
199
200
#if defined(_UNIX) && defined(MBFUNCTIONS)
201
// Convert and map inconvertible Unicode characters.
202
// We use it for extended ASCII names in Unix.
203
void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success)
204
3.23k
{
205
  // Map inconvertible characters to private use Unicode area 0xE000.
206
  // Mark such string by placing special non-character code before
207
  // first inconvertible character.
208
3.23k
  Success=false;
209
3.23k
  bool MarkAdded=false;
210
3.23k
  uint SrcPos=0,DestPos=0;
211
1.31M
  while (DestPos<DestSize)
212
1.31M
  {
213
1.31M
    if (Src[SrcPos]==0)
214
3.23k
    {
215
3.23k
      Success=true;
216
3.23k
      break;
217
3.23k
    }
218
1.30M
    mbstate_t ps;
219
1.30M
    memset(&ps,0,sizeof(ps));
220
1.30M
    size_t res=mbrtowc(Dest+DestPos,Src+SrcPos,MB_CUR_MAX,&ps);
221
1.30M
    if (res==(size_t)-1 || res==(size_t)-2)
222
456k
    {
223
      // For security reasons we do not want to map low ASCII characters,
224
      // so we do not have additional .. and path separator codes.
225
456k
      if (byte(Src[SrcPos])>=0x80)
226
456k
      {
227
456k
        if (!MarkAdded)
228
3.23k
        {
229
3.23k
          Dest[DestPos++]=MappedStringMark;
230
3.23k
          MarkAdded=true;
231
3.23k
          if (DestPos>=DestSize)
232
0
            break;
233
3.23k
        }
234
456k
        Dest[DestPos++]=byte(Src[SrcPos++])+MapAreaStart;
235
456k
      }
236
0
      else
237
0
        break;
238
456k
    }
239
852k
    else
240
852k
    {
241
852k
      memset(&ps,0,sizeof(ps));
242
852k
      int Length=mbrlen(Src+SrcPos,MB_CUR_MAX,&ps);
243
852k
      SrcPos+=Max(Length,1);
244
852k
      DestPos++;
245
852k
    }
246
1.30M
  }
247
3.23k
  Dest[Min(DestPos,DestSize-1)]=0;
248
3.23k
}
249
#endif
250
251
252
// SrcSize is source data size in wide characters, not in bytes.
253
// DestSize is the maximum allowed destination size.
254
// Store lower 16 bits of every wide character as two bytes, low byte first.
// Stops after storing the zero character, after SrcSize characters or when
// the next character does not fit to DestSize bytes. Returns 'Dest'.
byte* WideToRaw(const wchar *Src,size_t SrcSize,byte *Dest,size_t DestSize)
{
  size_t SrcPos=0,DestPos=0;
  while (SrcPos<SrcSize && DestPos+1<DestSize)
  {
    const wchar Ch=Src[SrcPos];
    Dest[DestPos]=(byte)Ch;
    Dest[DestPos+1]=(byte)(Ch>>8);
    if (Ch==0)
      break; // Trailing zero is stored, nothing else to do.
    SrcPos++;
    DestPos+=2;
  }
  return Dest;
}
265
266
267
// Store UTF-16 raw byte stream.
268
void WideToRaw(const std::wstring &Src,std::vector<byte> &Dest)
269
0
{
270
0
  for (wchar C : Src)
271
0
  {
272
0
    Dest.push_back((byte)C);
273
0
    Dest.push_back((byte)(C>>8));
274
0
  }
275
  // In STL version of this function we do not add the trailing zero.
276
  // Otherwise we would need to remove it when restoring std::wstring
277
  // from raw data.
278
279
  // Dest.push_back(0); // 2 bytes of trailing UTF-16 zero.
280
  // Dest.push_back(0);
281
0
}
282
283
284
// Build wide characters from byte pairs with low byte first. Stops after
// storing the zero character or after DestSize characters. Returns 'Dest'.
wchar* RawToWide(const byte *Src,wchar *Dest,size_t DestSize)
{
  size_t Pos=0;
  while (Pos<DestSize)
  {
    const byte Low=Src[Pos*2];
    const byte High=Src[Pos*2+1];
    Dest[Pos]=Low+(High<<8);
    if (Dest[Pos]==0)
      break;
    Pos++;
  }
  return Dest;
}
291
292
293
std::wstring RawToWide(const std::vector<byte> &Src)
294
0
{
295
0
  std::wstring Dest;
296
0
  for (size_t I=0;I+1<Src.size();I+=2)
297
0
  {
298
0
    wchar c=Src[I]+(Src[I+1]<<8);
299
0
    Dest.push_back(c);
300
0
    if (c==0)
301
0
      break;
302
0
  }
303
0
  return Dest;
304
0
}
305
306
307
void WideToUtf(const wchar *Src,char *Dest,size_t DestSize)
308
0
{
309
0
  long dsize=(long)DestSize;
310
0
  dsize--;
311
0
  while (*Src!=0 && --dsize>=0)
312
0
  {
313
0
    uint c=*(Src++);
314
0
    if (c<0x80)
315
0
      *(Dest++)=c;
316
0
    else
317
0
      if (c<0x800 && --dsize>=0)
318
0
      {
319
0
        *(Dest++)=(0xc0|(c>>6));
320
0
        *(Dest++)=(0x80|(c&0x3f));
321
0
      }
322
0
      else
323
0
      {
324
0
        if (c>=0xd800 && c<=0xdbff && *Src>=0xdc00 && *Src<=0xdfff) // Surrogate pair.
325
0
        {
326
0
          c=((c-0xd800)<<10)+(*Src-0xdc00)+0x10000;
327
0
          Src++;
328
0
        }
329
0
        if (c<0x10000 && (dsize-=2)>=0)
330
0
        {
331
0
          *(Dest++)=(0xe0|(c>>12));
332
0
          *(Dest++)=(0x80|((c>>6)&0x3f));
333
0
          *(Dest++)=(0x80|(c&0x3f));
334
0
        }
335
0
        else
336
0
          if (c < 0x200000 && (dsize-=3)>=0)
337
0
          {
338
0
            *(Dest++)=(0xf0|(c>>18));
339
0
            *(Dest++)=(0x80|((c>>12)&0x3f));
340
0
            *(Dest++)=(0x80|((c>>6)&0x3f));
341
0
            *(Dest++)=(0x80|(c&0x3f));
342
0
          }
343
0
      }
344
0
  }
345
0
  *Dest=0;
346
0
}
347
348
349
// Append UTF-8 representation of 'Src' to 'Dest'. Existing 'Dest' contents
// is preserved. Conversion stops at the first zero character in 'Src'.
// UTF-16 surrogate pairs are combined to a single code point and values
// of 0x200000 and above are skipped.
void WideToUtf(const std::wstring &Src,std::string &Dest)
{
  const size_t SrcSize=Src.size();
  size_t Pos=0;
  while (Pos<SrcSize && Src[Pos]!=0)
  {
    uint Code=Src[Pos++];

    if (Code<0x80) // Single byte.
    {
      Dest.push_back(Code);
      continue;
    }

    if (Code<0x800) // Two bytes.
    {
      Dest.push_back(0xc0|(Code>>6));
      Dest.push_back(0x80|(Code&0x3f));
      continue;
    }

    const bool HighSurrogate=Code>=0xd800 && Code<=0xdbff;
    if (HighSurrogate && Pos<SrcSize && Src[Pos]>=0xdc00 && Src[Pos]<=0xdfff)
    {
      // Merge the surrogate pair.
      Code=((Code-0xd800)<<10)+(Src[Pos]-0xdc00)+0x10000;
      Pos++;
    }

    if (Code<0x10000) // Three bytes.
    {
      Dest.push_back(0xe0|(Code>>12));
      Dest.push_back(0x80|((Code>>6)&0x3f));
      Dest.push_back(0x80|(Code&0x3f));
    }
    else
      if (Code<0x200000) // Four bytes.
      {
        Dest.push_back(0xf0|(Code>>18));
        Dest.push_back(0x80|((Code>>12)&0x3f));
        Dest.push_back(0x80|((Code>>6)&0x3f));
        Dest.push_back(0x80|(Code&0x3f));
      }
  }
}
386
387
388
389
size_t WideToUtfSize(const wchar *Src)
390
0
{
391
0
  size_t Size=0;
392
0
  for (;*Src!=0;Src++)
393
0
    if (*Src<0x80)
394
0
      Size++;
395
0
    else
396
0
      if (*Src<0x800)
397
0
        Size+=2;
398
0
      else
399
0
        if ((uint)*Src<0x10000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t.
400
0
        {
401
0
          if (Src[0]>=0xd800 && Src[0]<=0xdbff && Src[1]>=0xdc00 && Src[1]<=0xdfff)
402
0
          {
403
0
            Size+=4; // 4 output bytes for Unicode surrogate pair.
404
0
            Src++;
405
0
          }
406
0
          else
407
0
            Size+=3;
408
0
        }
409
0
        else
410
0
          if ((uint)*Src<0x200000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t.
411
0
            Size+=4;
412
0
  return Size+1; // Include terminating zero.
413
0
}
414
415
416
bool UtfToWide(const char *Src,wchar *Dest,size_t DestSize)
417
0
{
418
0
  bool Success=true;
419
0
  long dsize=(long)DestSize;
420
0
  dsize--;
421
0
  while (*Src!=0)
422
0
  {
423
0
    uint c=byte(*(Src++)),d;
424
0
    if (c<0x80)
425
0
      d=c;
426
0
    else
427
0
      if ((c>>5)==6)
428
0
      {
429
0
        if ((*Src&0xc0)!=0x80)
430
0
        {
431
0
          Success=false;
432
0
          break;
433
0
        }
434
0
        d=((c&0x1f)<<6)|(*Src&0x3f);
435
0
        Src++;
436
0
      }
437
0
      else
438
0
        if ((c>>4)==14)
439
0
        {
440
0
          if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80)
441
0
          {
442
0
            Success=false;
443
0
            break;
444
0
          }
445
0
          d=((c&0xf)<<12)|((Src[0]&0x3f)<<6)|(Src[1]&0x3f);
446
0
          Src+=2;
447
0
        }
448
0
        else
449
0
          if ((c>>3)==30)
450
0
          {
451
0
            if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80 || (Src[2]&0xc0)!=0x80)
452
0
            {
453
0
              Success=false;
454
0
              break;
455
0
            }
456
0
            d=((c&7)<<18)|((Src[0]&0x3f)<<12)|((Src[1]&0x3f)<<6)|(Src[2]&0x3f);
457
0
            Src+=3;
458
0
          }
459
0
          else
460
0
          {
461
0
            Success=false;
462
0
            break;
463
0
          }
464
0
    if (--dsize<0)
465
0
      break;
466
0
    if (d>0xffff)
467
0
    {
468
0
      if (--dsize<0)
469
0
        break;
470
0
      if (d>0x10ffff) // UTF-8 must end at 0x10ffff according to RFC 3629.
471
0
      {
472
0
        Success=false;
473
0
        continue;
474
0
      }
475
0
      if (sizeof(*Dest)==2) // Use the surrogate pair.
476
0
      {
477
0
        *(Dest++)=((d-0x10000)>>10)+0xd800;
478
0
        *(Dest++)=(d&0x3ff)+0xdc00;
479
0
      }
480
0
      else
481
0
        *(Dest++)=d;
482
0
    }
483
0
    else
484
0
      *(Dest++)=d;
485
0
  }
486
0
  *Dest=0;
487
0
  return Success;
488
0
}
489
490
491
bool UtfToWide(const char *Src,std::wstring &Dest)
492
92.6k
{
493
92.6k
  bool Success=true;
494
92.6k
  Dest.clear();
495
207k
  while (*Src!=0)
496
117k
  {
497
117k
    uint c=byte(*(Src++)),d;
498
117k
    if (c<0x80)
499
113k
      d=c;
500
3.38k
    else
501
3.38k
      if ((c>>5)==6)
502
372
      {
503
372
        if ((*Src&0xc0)!=0x80)
504
261
        {
505
261
          Success=false;
506
261
          break;
507
261
        }
508
111
        d=((c&0x1f)<<6)|(*Src&0x3f);
509
111
        Src++;
510
111
      }
511
3.01k
      else
512
3.01k
        if ((c>>4)==14)
513
415
        {
514
415
          if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80)
515
389
          {
516
389
            Success=false;
517
389
            break;
518
389
          }
519
26
          d=((c&0xf)<<12)|((Src[0]&0x3f)<<6)|(Src[1]&0x3f);
520
26
          Src+=2;
521
26
        }
522
2.59k
        else
523
2.59k
          if ((c>>3)==30)
524
675
          {
525
675
            if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80 || (Src[2]&0xc0)!=0x80)
526
150
            {
527
150
              Success=false;
528
150
              break;
529
150
            }
530
525
            d=((c&7)<<18)|((Src[0]&0x3f)<<12)|((Src[1]&0x3f)<<6)|(Src[2]&0x3f);
531
525
            Src+=3;
532
525
          }
533
1.92k
          else
534
1.92k
          {
535
1.92k
            Success=false;
536
1.92k
            break;
537
1.92k
          }
538
114k
    if (d>0xffff)
539
525
    {
540
525
      if (d>0x10ffff) // UTF-8 must end at 0x10ffff according to RFC 3629.
541
82
      {
542
82
        Success=false;
543
82
        continue;
544
82
      }
545
443
      if (sizeof(wchar_t)==2) // Use the surrogate pair.
546
0
      {
547
0
        Dest.push_back( ((d-0x10000)>>10)+0xd800 );
548
0
        Dest.push_back( (d&0x3ff)+0xdc00 );
549
0
      }
550
443
      else
551
443
        Dest.push_back( d );
552
443
    }
553
113k
    else
554
113k
      Dest.push_back( d );
555
114k
  }
556
92.6k
  return Success;
557
92.6k
}
558
559
560
/*
561
bool UtfToWide(const std::vector<char> &Src,std::wstring &Dest)
562
{
563
  bool Success=true;
564
  Dest.clear();
565
  for (size_t I=0;I<Src.size() && Src[I]!=0;) // We expect it to always stop at 0.
566
  {
567
    uint c=byte(Src[I++]),d;
568
    if (c<0x80)
569
      d=c;
570
    else
571
      if ((c>>5)==6)
572
      {
573
        if (Src.size()-I<1 || (Src[I]&0xc0)!=0x80)
574
        {
575
          Success=false;
576
          break;
577
        }
578
        d=((c&0x1f)<<6)|(Src[I]&0x3f);
579
        I++;
580
      }
581
      else
582
        if ((c>>4)==14)
583
        {
584
          if (Src.size()-I<2 || (Src[I]&0xc0)!=0x80 || (Src[I+1]&0xc0)!=0x80)
585
          {
586
            Success=false;
587
            break;
588
          }
589
          d=((c&0xf)<<12)|((Src[I]&0x3f)<<6)|(Src[I+1]&0x3f);
590
          I+=2;
591
        }
592
        else
593
          if ((c>>3)==30)
594
          {
595
            if (Src.size()-I<3 || (Src[I]&0xc0)!=0x80 || (Src[I+1]&0xc0)!=0x80 || (Src[I+2]&0xc0)!=0x80)
596
            {
597
              Success=false;
598
              break;
599
            }
600
            d=((c&7)<<18)|((Src[I]&0x3f)<<12)|((Src[I+1]&0x3f)<<6)|(Src[I+2]&0x3f);
601
            I+=3;
602
          }
603
          else
604
          {
605
            Success=false;
606
            break;
607
          }
608
    if (d>0xffff)
609
    {
610
      if (d>0x10ffff) // UTF-8 must end at 0x10ffff according to RFC 3629.
611
      {
612
        Success=false;
613
        continue;
614
      }
615
      if (sizeof(Dest[0])==2) // Use the surrogate pair.
616
      {
617
        Dest.push_back( ((d-0x10000)>>10)+0xd800 );
618
        Dest.push_back( (d&0x3ff)+0xdc00 );
619
      }
620
      else
621
        Dest.push_back( d );
622
    }
623
    else
624
      Dest.push_back( d );
625
  }
626
  return Success;
627
}
628
*/
629
630
631
// For zero terminated strings.
632
bool IsTextUtf8(const byte *Src)
633
0
{
634
0
  return IsTextUtf8(Src,strlen((const char *)Src));
635
0
}
636
637
638
// Source data can be both with and without UTF-8 BOM.
639
bool IsTextUtf8(const byte *Src,size_t SrcSize)
640
0
{
641
0
  while (SrcSize-- > 0)
642
0
  {
643
0
    byte C=*(Src++);
644
0
    int HighOne=0; // Number of leftmost '1' bits.
645
0
    for (byte Mask=0x80;Mask!=0 && (C & Mask)!=0;Mask>>=1)
646
0
      HighOne++;
647
0
    if (HighOne==1 || HighOne>6)
648
0
      return false;
649
0
    while (--HighOne > 0)
650
0
      if (SrcSize-- <= 0 || (*(Src++) & 0xc0)!=0x80)
651
0
        return false;
652
0
  }
653
0
  return true;
654
0
}
655
656
657
int wcsicomp(const wchar *s1,const wchar *s2)
658
1.99k
{
659
  // If strings are English or numeric, perform the fast comparison.
660
  // It improves speed in cases like comparing against a lot of MOTW masks.
661
1.99k
  bool FastMode=true;
662
4.56k
  while (true)
663
4.56k
  {
664
    // English uppercase, English lowercase and digit flags.
665
4.56k
    bool u1=*s1>='A' && *s1<='Z', l1=*s1>='a' && *s1<='z', d1=*s1>='0' && *s1<='9';
666
4.56k
    bool u2=*s2>='A' && *s2<='Z', l2=*s2>='a' && *s2<='z', d2=*s2>='0' && *s2<='9';
667
668
    // Fast comparison is impossible if both characters are not alphanumeric or 0.
669
4.56k
    if (!u1 && !l1 && !d1 && *s1!=0 && !u2 && !l2 && !d2 && *s2!=0)
670
425
    {
671
425
      FastMode=false;
672
425
      break;
673
425
    }
674
    // Convert lowercase to uppercase, keep numeric and not alphanumeric as is.
675
4.13k
    wchar c1 = l1 ? *s1-'a'+'A' : *s1;
676
4.13k
    wchar c2 = l2 ? *s2-'a'+'A' : *s2;
677
678
    // If characters mistmatch, to return a proper value we must compare
679
    // already converted, case insensitive characters instead of original ones.
680
    // So we place a.txt before B.txt and can perform the correct case
681
    // insensitive binary search in different string lists.
682
4.13k
    if (c1 != c2)
683
1.27k
      return c1 < c2 ? -1 : 1;
684
685
2.86k
    if (*s1==0)
686
290
      break;
687
2.57k
    s1++;
688
2.57k
    s2++;
689
2.57k
  }
690
715
  if (FastMode)
691
290
    return 0;
692
693
#ifdef _WIN_ALL
694
  return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,-1,s2,-1)-2;
695
#else
696
2.12k
  while (true)
697
2.12k
  {
698
2.12k
    wchar u1 = towupper(*s1);
699
2.12k
    wchar u2 = towupper(*s2);
700
701
    // If characters mistmatch, to return a proper value we must compare
702
    // already converted, case insensitive characters instead of original ones.
703
    // So we place a.txt before B.txt and can perform the correct case
704
    // insensitive binary search in different string lists.
705
2.12k
    if (u1 != u2)
706
0
      return u1 < u2 ? -1 : 1;
707
2.12k
    if (*s1==0)
708
425
      break;
709
1.70k
    s1++;
710
1.70k
    s2++;
711
1.70k
  }
712
425
  return 0;
713
425
#endif
714
425
}
715
716
717
int wcsnicomp(const wchar *s1,const wchar *s2,size_t n)
718
0
{
719
#ifdef _WIN_ALL
720
  // If we specify 'n' exceeding the actual string length, CompareString goes
721
  // beyond the trailing zero and compares garbage. So we need to limit 'n'
722
  // to real string length.
723
  size_t sl1=wcslen(s1); // Pre-compute to not call wcslen() in Min() twice.
724
  size_t l1=Min(sl1+1,n);
725
  size_t sl2=wcslen(s2); // Pre-compute to not call wcslen() in Min() twice.
726
  size_t l2=Min(sl2+1,n);
727
  return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,(int)l1,s2,(int)l2)-2;
728
#else
729
0
  if (n==0)
730
0
    return 0;
731
0
  while (true)
732
0
  {
733
0
    wchar u1 = towupper(*s1);
734
0
    wchar u2 = towupper(*s2);
735
0
    if (u1 != u2)
736
0
      return u1 < u2 ? -1 : 1;
737
0
    if (*s1==0 || --n==0)
738
0
      break;
739
0
    s1++;
740
0
    s2++;
741
0
  }
742
0
  return 0;
743
0
#endif
744
0
}
745
746
747
// Case insensitive wcsstr().
748
const wchar_t* wcscasestr(const wchar_t *str, const wchar_t *search)
749
0
{
750
0
  for (size_t i=0;str[i]!=0;i++)
751
0
    for (size_t j=0;;j++)
752
0
    {
753
0
      if (search[j]==0)
754
0
        return str+i;
755
0
      if (tolowerw(str[i+j])!=tolowerw(search[j]))
756
0
        break;
757
0
    }
758
0
  return nullptr;
759
0
}
760
761
762
// Case insensitive std::wstring substring search.
763
std::wstring::size_type wcscasestr(const std::wstring &str, const std::wstring &search)
764
0
{
765
0
  const wchar *Found=wcscasestr(str.c_str(),search.c_str());
766
0
  return Found==nullptr ? std::wstring::npos : Found-str.c_str();
767
0
}
768
769
770
#ifndef SFX_MODULE
771
// Convert the zero terminated wide string to lowercase in place.
// Returns 's'.
wchar* wcslower(wchar *s)
{
#ifdef _WIN_ALL
  // _wcslwr requires setlocale and we do not want to depend on setlocale
  // in Windows. Also CharLower involves less overhead.
  CharLower(s);
#else
  wchar *Cur=s;
  while (*Cur!=0)
  {
    *Cur=towlower(*Cur);
    Cur++;
  }
#endif
  return s;
}
783
784
785
void wcslower(std::wstring &s)
786
0
{
787
0
  wcslower(&s[0]);
788
0
}
789
790
791
// Convert the zero terminated wide string to uppercase in place.
// Returns 's'.
wchar* wcsupper(wchar *s)
{
#ifdef _WIN_ALL
  // _wcsupr requires setlocale and we do not want to depend on setlocale
  // in Windows. Also CharUpper involves less overhead.
  CharUpper(s);
#else
  wchar *Cur=s;
  while (*Cur!=0)
  {
    *Cur=towupper(*Cur);
    Cur++;
  }
#endif
  return s;
}
803
804
805
void wcsupper(std::wstring &s)
806
11.9k
{
807
11.9k
  wcsupper(&s[0]);
808
11.9k
}
809
#endif
810
811
812
813
814
// Return the uppercase version of wide character code 'ch'.
int toupperw(int ch)
{
#if defined(_WIN_ALL)
  // CharUpper is more reliable than towupper in Windows, which seems to be
  // C locale dependent even in Unicode version. For example, towupper failed
  // to convert lowercase Russian characters. Use 0xffff mask to prevent crash
  // if value larger than 0xffff is passed to this function.
  const INT_PTR Masked=(INT_PTR)(ch&0xffff);
  return (int)(INT_PTR)CharUpper((wchar *)Masked);
#else
  return (int)towupper((wint_t)ch);
#endif
}
826
827
828
// Return the lowercase version of wide character code 'ch'.
int tolowerw(int ch)
{
#if defined(_WIN_ALL)
  // CharLower is more reliable than towlower in Windows.
  // See comment for towupper above. Use 0xffff mask to prevent crash
  // if value larger than 0xffff is passed to this function.
  const INT_PTR Masked=(INT_PTR)(ch&0xffff);
  return (int)(INT_PTR)CharLower((wchar *)Masked);
#else
  return (int)towlower((wint_t)ch);
#endif
}
839
840
841
int atoiw(const std::wstring &s)
842
238
{
843
238
  return (int)atoilw(s);
844
238
}
845
846
847
// Parse the optional '-' followed by decimal digits in the beginning
// of string. Parsing stops at the first non-digit character. Returns 0
// if there are no digits.
int64 atoilw(const std::wstring &s)
{
  // We do use signed integers here, for example, in GUI SFX.
  // s[s.size()] is the valid zero character, so the empty string is safe.
  const bool Negative=s[0]=='-';

  // Use unsigned type here, since long string can overflow the variable
  // and signed integer overflow is undefined behavior in C++.
  uint64 Value=0;
  for (size_t Pos=Negative ? 1:0;s[Pos]>='0' && s[Pos]<='9';Pos++)
    Value=Value*10+(s[Pos]-'0');

  const int64 Result=int64(Value);

  // Check Result>=0 to avoid the signed overflow with undefined behavior
  // when negating 0x8000000000000000.
  if (Negative && Result>=0)
    return -Result;
  return Result;
}
868
869
870
#ifdef DBCS_SUPPORTED
871
SupportDBCS gdbcs;
872
873
SupportDBCS::SupportDBCS()
874
{
875
  Init();
876
}
877
878
879
void SupportDBCS::Init()
880
{
881
  CPINFO CPInfo;
882
  GetCPInfo(CP_ACP,&CPInfo);
883
  DBCSMode=CPInfo.MaxCharSize > 1;
884
  for (uint I=0;I<ASIZE(IsLeadByte);I++)
885
    IsLeadByte[I]=IsDBCSLeadByte(I)!=0;
886
}
887
888
889
char* SupportDBCS::charnext(const char *s)
890
{
891
  // Zero cannot be the trail byte. So if next byte after the lead byte
892
  // is 0, the string is corrupt and we'll better return the pointer to 0,
893
  // to break string processing loops.
894
  return (char *)(IsLeadByte[(byte)*s] && s[1]!=0 ? s+2:s+1);
895
}
896
#endif
897
898