Coverage Report

Created: 2024-04-23 06:19

/src/unrar/unicode.cpp
Line
Count
Source (jump to first uncovered line)
1
#include "rar.hpp"
2
#define MBFUNCTIONS
3
4
#if defined(_UNIX) && defined(MBFUNCTIONS)
5
6
static bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success);
7
static void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success);
8
9
// In Unix we map high ASCII characters which cannot be converted to Unicode
10
// to 0xE000 - 0xE0FF private use Unicode area.
11
static const uint MapAreaStart=0xE000;
12
13
// Mapped string marker. Initially we used 0xFFFF for this purpose,
14
// but it causes MSVC2008 swprintf to fail (it treats 0xFFFF as error marker).
15
// While we could workaround it, it is safer to use another character.
16
static const uint MappedStringMark=0xFFFE;
17
18
#endif
19
20
bool WideToChar(const wchar *Src,char *Dest,size_t DestSize)
21
24.1k
{
22
24.1k
  bool RetCode=true;
23
24.1k
  *Dest=0; // Set 'Dest' to zero just in case the conversion will fail.
24
25
#ifdef _WIN_ALL
26
  if (WideCharToMultiByte(CP_ACP,0,Src,-1,Dest,(int)DestSize,NULL,NULL)==0)
27
    RetCode=false;
28
29
// wcstombs is broken in Android NDK r9.
30
#elif defined(_APPLE)
31
  WideToUtf(Src,Dest,DestSize);
32
33
#elif defined(MBFUNCTIONS)
34
24.1k
  if (!WideToCharMap(Src,Dest,DestSize,RetCode))
35
21.1k
  {
36
21.1k
    mbstate_t ps; // Use thread safe external state based functions.
37
21.1k
    memset (&ps, 0, sizeof(ps));
38
21.1k
    const wchar *SrcParam=Src; // wcsrtombs can change the pointer.
39
40
    // Some implementations of wcsrtombs can cause memory analyzing tools
41
    // like valgrind to report uninitialized data access. It happens because
42
    // internally these implementations call SSE4 based wcslen function,
43
    // which reads 16 bytes at once including those beyond of trailing 0.
44
21.1k
    size_t ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);
45
46
21.1k
    if (ResultingSize==(size_t)-1 && errno==EILSEQ)
47
3.65k
    {
48
      // Aborted on inconvertible character not zero terminating the result.
49
      // EILSEQ helps to distinguish it from small output buffer abort.
50
      // We want to convert as much as we can, so we clean the output buffer
51
      // and repeat conversion.
52
3.65k
      memset (&ps, 0, sizeof(ps));
53
3.65k
      SrcParam=Src; // wcsrtombs can change the pointer.
54
3.65k
      memset(Dest,0,DestSize);
55
3.65k
      ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps);
56
3.65k
    }
57
58
21.1k
    if (ResultingSize==(size_t)-1)
59
3.65k
      RetCode=false;
60
21.1k
    if (ResultingSize==0 && *Src!=0)
61
0
      RetCode=false;
62
21.1k
  }
63
#else
64
  for (int I=0;I<DestSize;I++)
65
  {
66
    Dest[I]=(char)Src[I];
67
    if (Src[I]==0)
68
      break;
69
  }
70
#endif
71
24.1k
  if (DestSize>0)
72
24.1k
    Dest[DestSize-1]=0;
73
74
  // We tried to return the empty string if conversion is failed,
75
  // but it does not work well. WideCharToMultiByte returns 'failed' code
76
  // and partially converted string even if we wanted to convert only a part
77
  // of string and passed DestSize smaller than required for fully converted
78
  // string. Such call is the valid behavior in RAR code and we do not expect
79
  // the empty string in this case.
80
81
24.1k
  return RetCode;
82
24.1k
}
83
84
85
bool CharToWide(const char *Src,wchar *Dest,size_t DestSize)
86
1.67k
{
87
1.67k
  bool RetCode=true;
88
1.67k
  *Dest=0; // Set 'Dest' to zero just in case the conversion will fail.
89
90
#ifdef _WIN_ALL
91
  if (MultiByteToWideChar(CP_ACP,0,Src,-1,Dest,(int)DestSize)==0)
92
    RetCode=false;
93
94
// mbstowcs is broken in Android NDK r9.
95
#elif defined(_APPLE)
96
  UtfToWide(Src,Dest,DestSize);
97
98
#elif defined(MBFUNCTIONS)
99
1.67k
  mbstate_t ps;
100
1.67k
  memset (&ps, 0, sizeof(ps));
101
1.67k
  const char *SrcParam=Src; // mbsrtowcs can change the pointer.
102
1.67k
  size_t ResultingSize=mbsrtowcs(Dest,&SrcParam,DestSize,&ps);
103
1.67k
  if (ResultingSize==(size_t)-1)
104
1.00k
    RetCode=false;
105
1.67k
  if (ResultingSize==0 && *Src!=0)
106
0
    RetCode=false;
107
108
1.67k
  if (RetCode==false && DestSize>1)
109
1.00k
    CharToWideMap(Src,Dest,DestSize,RetCode);
110
#else
111
  for (int I=0;I<DestSize;I++)
112
  {
113
    Dest[I]=(wchar_t)Src[I];
114
    if (Src[I]==0)
115
      break;
116
  }
117
#endif
118
1.67k
  if (DestSize>0)
119
1.67k
    Dest[DestSize-1]=0;
120
121
  // We tried to return the empty string if conversion is failed,
122
  // but it does not work well. MultiByteToWideChar returns 'failed' code
123
  // even if we wanted to convert only a part of string and passed DestSize
124
  // smaller than required for fully converted string. Such call is the valid
125
  // behavior in RAR code and we do not expect the empty string in this case.
126
127
1.67k
  return RetCode;
128
1.67k
}
129
130
131
#if defined(_UNIX) && defined(MBFUNCTIONS)
132
// Convert and restore mapped inconvertible Unicode characters. 
133
// We use it for extended ASCII names in Unix.
134
bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success)
135
24.1k
{
136
  // String with inconvertible characters mapped to private use Unicode area
137
  // must have the mark code somewhere.
138
24.1k
  if (wcschr(Src,(wchar)MappedStringMark)==NULL)
139
21.1k
    return false;
140
141
  // Seems to be that wcrtomb in some memory analyzing libraries
142
  // can produce uninitilized output while reporting success on garbage input.
143
  // So we clean the destination to calm analyzers.
144
2.93k
  memset(Dest,0,DestSize);
145
  
146
2.93k
  Success=true;
147
2.93k
  uint SrcPos=0,DestPos=0;
148
144k
  while (Src[SrcPos]!=0 && DestPos<DestSize-MB_CUR_MAX)
149
141k
  {
150
141k
    if (uint(Src[SrcPos])==MappedStringMark)
151
4.49k
    {
152
4.49k
      SrcPos++;
153
4.49k
      continue;
154
4.49k
    }
155
    // For security reasons do not restore low ASCII codes, so mapping cannot
156
    // be used to hide control codes like path separators.
157
137k
    if (uint(Src[SrcPos])>=MapAreaStart+0x80 && uint(Src[SrcPos])<MapAreaStart+0x100)
158
32.7k
      Dest[DestPos++]=char(uint(Src[SrcPos++])-MapAreaStart);
159
104k
    else
160
104k
    {
161
104k
      mbstate_t ps;
162
104k
      memset(&ps,0,sizeof(ps));
163
104k
      if (wcrtomb(Dest+DestPos,Src[SrcPos],&ps)==(size_t)-1)
164
33.2k
      {
165
33.2k
        Dest[DestPos]='_';
166
33.2k
        Success=false;
167
33.2k
      }
168
104k
      SrcPos++;
169
104k
      memset(&ps,0,sizeof(ps));
170
104k
      int Length=mbrlen(Dest+DestPos,MB_CUR_MAX,&ps);
171
104k
      DestPos+=Max(Length,1);
172
104k
    }
173
137k
  }
174
2.93k
  Dest[Min(DestPos,DestSize-1)]=0;
175
2.93k
  return true;
176
24.1k
}
177
#endif
178
179
180
#if defined(_UNIX) && defined(MBFUNCTIONS)
181
// Convert and map inconvertible Unicode characters.
182
// We use it for extended ASCII names in Unix.
183
void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success)
184
1.00k
{
185
  // Map inconvertible characters to private use Unicode area 0xE000.
186
  // Mark such string by placing special non-character code before
187
  // first inconvertible character.
188
1.00k
  Success=false;
189
1.00k
  bool MarkAdded=false;
190
1.00k
  uint SrcPos=0,DestPos=0;
191
322k
  while (DestPos<DestSize)
192
322k
  {
193
322k
    if (Src[SrcPos]==0)
194
971
    {
195
971
      Success=true;
196
971
      break;
197
971
    }
198
321k
    mbstate_t ps;
199
321k
    memset(&ps,0,sizeof(ps));
200
321k
    size_t res=mbrtowc(Dest+DestPos,Src+SrcPos,MB_CUR_MAX,&ps);
201
321k
    if (res==(size_t)-1 || res==(size_t)-2)
202
164k
    {
203
      // For security reasons we do not want to map low ASCII characters,
204
      // so we do not have additional .. and path separator codes.
205
164k
      if (byte(Src[SrcPos])>=0x80)
206
164k
      {
207
164k
        if (!MarkAdded)
208
1.00k
        {
209
1.00k
          Dest[DestPos++]=MappedStringMark;
210
1.00k
          MarkAdded=true;
211
1.00k
          if (DestPos>=DestSize)
212
0
            break;
213
1.00k
        }
214
164k
        Dest[DestPos++]=byte(Src[SrcPos++])+MapAreaStart;
215
164k
      }
216
0
      else
217
0
        break;
218
164k
    }
219
157k
    else
220
157k
    {
221
157k
      memset(&ps,0,sizeof(ps));
222
157k
      int Length=mbrlen(Src+SrcPos,MB_CUR_MAX,&ps);
223
157k
      SrcPos+=Max(Length,1);
224
157k
      DestPos++;
225
157k
    }
226
321k
  }
227
1.00k
  Dest[Min(DestPos,DestSize-1)]=0;
228
1.00k
}
229
#endif
230
231
232
// SrcSize is in wide characters, not in bytes.
233
byte* WideToRaw(const wchar *Src,byte *Dest,size_t SrcSize)
234
0
{
235
0
  for (size_t I=0;I<SrcSize;I++,Src++)
236
0
  {
237
0
    Dest[I*2]=(byte)*Src;
238
0
    Dest[I*2+1]=(byte)(*Src>>8);
239
0
    if (*Src==0)
240
0
      break;
241
0
  }
242
0
  return Dest;
243
0
}
244
245
246
wchar* RawToWide(const byte *Src,wchar *Dest,size_t DestSize)
247
0
{
248
0
  for (size_t I=0;I<DestSize;I++)
249
0
    if ((Dest[I]=Src[I*2]+(Src[I*2+1]<<8))==0)
250
0
      break;
251
0
  return Dest;
252
0
}
253
254
255
void WideToUtf(const wchar *Src,char *Dest,size_t DestSize)
256
0
{
257
0
  long dsize=(long)DestSize;
258
0
  dsize--;
259
0
  while (*Src!=0 && --dsize>=0)
260
0
  {
261
0
    uint c=*(Src++);
262
0
    if (c<0x80)
263
0
      *(Dest++)=c;
264
0
    else
265
0
      if (c<0x800 && --dsize>=0)
266
0
      {
267
0
        *(Dest++)=(0xc0|(c>>6));
268
0
        *(Dest++)=(0x80|(c&0x3f));
269
0
      }
270
0
      else
271
0
      {
272
0
        if (c>=0xd800 && c<=0xdbff && *Src>=0xdc00 && *Src<=0xdfff) // Surrogate pair.
273
0
        {
274
0
          c=((c-0xd800)<<10)+(*Src-0xdc00)+0x10000;
275
0
          Src++;
276
0
        }
277
0
        if (c<0x10000 && (dsize-=2)>=0)
278
0
        {
279
0
          *(Dest++)=(0xe0|(c>>12));
280
0
          *(Dest++)=(0x80|((c>>6)&0x3f));
281
0
          *(Dest++)=(0x80|(c&0x3f));
282
0
        }
283
0
        else
284
0
          if (c < 0x200000 && (dsize-=3)>=0)
285
0
          {
286
0
            *(Dest++)=(0xf0|(c>>18));
287
0
            *(Dest++)=(0x80|((c>>12)&0x3f));
288
0
            *(Dest++)=(0x80|((c>>6)&0x3f));
289
0
            *(Dest++)=(0x80|(c&0x3f));
290
0
          }
291
0
      }
292
0
  }
293
0
  *Dest=0;
294
0
}
295
296
297
size_t WideToUtfSize(const wchar *Src)
298
0
{
299
0
  size_t Size=0;
300
0
  for (;*Src!=0;Src++)
301
0
    if (*Src<0x80)
302
0
      Size++;
303
0
    else
304
0
      if (*Src<0x800)
305
0
        Size+=2;
306
0
      else
307
0
        if ((uint)*Src<0x10000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t.
308
0
        {
309
0
          if (Src[0]>=0xd800 && Src[0]<=0xdbff && Src[1]>=0xdc00 && Src[1]<=0xdfff)
310
0
          {
311
0
            Size+=4; // 4 output bytes for Unicode surrogate pair.
312
0
            Src++;
313
0
          }
314
0
          else
315
0
            Size+=3;
316
0
        }
317
0
        else
318
0
          if ((uint)*Src<0x200000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t.
319
0
            Size+=4;
320
0
  return Size+1; // Include terminating zero.
321
0
}
322
323
324
bool UtfToWide(const char *Src,wchar *Dest,size_t DestSize)
325
143k
{
326
143k
  bool Success=true;
327
143k
  long dsize=(long)DestSize;
328
143k
  dsize--;
329
152k
  while (*Src!=0)
330
9.02k
  {
331
9.02k
    uint c=byte(*(Src++)),d;
332
9.02k
    if (c<0x80)
333
8.84k
      d=c;
334
182
    else
335
182
      if ((c>>5)==6)
336
50
      {
337
50
        if ((*Src&0xc0)!=0x80)
338
45
        {
339
45
          Success=false;
340
45
          break;
341
45
        }
342
5
        d=((c&0x1f)<<6)|(*Src&0x3f);
343
5
        Src++;
344
5
      }
345
132
      else
346
132
        if ((c>>4)==14)
347
12
        {
348
12
          if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80)
349
11
          {
350
11
            Success=false;
351
11
            break;
352
11
          }
353
1
          d=((c&0xf)<<12)|((Src[0]&0x3f)<<6)|(Src[1]&0x3f);
354
1
          Src+=2;
355
1
        }
356
120
        else
357
120
          if ((c>>3)==30)
358
5
          {
359
5
            if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80 || (Src[2]&0xc0)!=0x80)
360
5
            {
361
5
              Success=false;
362
5
              break;
363
5
            }
364
0
            d=((c&7)<<18)|((Src[0]&0x3f)<<12)|((Src[1]&0x3f)<<6)|(Src[2]&0x3f);
365
0
            Src+=3;
366
0
          }
367
115
          else
368
115
          {
369
115
            Success=false;
370
115
            break;
371
115
          }
372
8.84k
    if (--dsize<0)
373
0
      break;
374
8.84k
    if (d>0xffff)
375
0
    {
376
0
      if (--dsize<0)
377
0
        break;
378
0
      if (d>0x10ffff) // UTF-8 must end at 0x10ffff according to RFC 3629.
379
0
      {
380
0
        Success=false;
381
0
        continue;
382
0
      }
383
0
      if (sizeof(*Dest)==2) // Use the surrogate pair.
384
0
      {
385
0
        *(Dest++)=((d-0x10000)>>10)+0xd800;
386
0
        *(Dest++)=(d&0x3ff)+0xdc00;
387
0
      }
388
0
      else
389
0
        *(Dest++)=d;
390
0
    }
391
8.84k
    else
392
8.84k
      *(Dest++)=d;
393
8.84k
  }
394
143k
  *Dest=0;
395
143k
  return Success;
396
143k
}
397
398
399
// For zero terminated strings.
400
bool IsTextUtf8(const byte *Src)
401
0
{
402
0
  return IsTextUtf8(Src,strlen((const char *)Src));
403
0
}
404
405
406
// Source data can be both with and without UTF-8 BOM.
407
bool IsTextUtf8(const byte *Src,size_t SrcSize)
408
0
{
409
0
  while (SrcSize-- > 0)
410
0
  {
411
0
    byte C=*(Src++);
412
0
    int HighOne=0; // Number of leftmost '1' bits.
413
0
    for (byte Mask=0x80;Mask!=0 && (C & Mask)!=0;Mask>>=1)
414
0
      HighOne++;
415
0
    if (HighOne==1 || HighOne>6)
416
0
      return false;
417
0
    while (--HighOne > 0)
418
0
      if (SrcSize-- <= 0 || (*(Src++) & 0xc0)!=0x80)
419
0
        return false;
420
0
  }
421
0
  return true;
422
0
}
423
424
425
int wcsicomp(const wchar *s1,const wchar *s2)
426
802
{
427
#ifdef _WIN_ALL
428
  return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,-1,s2,-1)-2;
429
#else
430
1.97k
  while (true)
431
1.97k
  {
432
1.97k
    wchar u1 = towupper(*s1);
433
1.97k
    wchar u2 = towupper(*s2);
434
1.97k
    if (u1 != u2)
435
734
      return u1 < u2 ? -1 : 1;
436
1.23k
    if (*s1==0)
437
68
      break;
438
1.16k
    s1++;
439
1.16k
    s2++;
440
1.16k
  }
441
68
  return 0;
442
802
#endif
443
802
}
444
445
446
int wcsnicomp(const wchar *s1,const wchar *s2,size_t n)
447
0
{
448
#ifdef _WIN_ALL
449
  // If we specify 'n' exceeding the actual string length, CompareString goes
450
  // beyond the trailing zero and compares garbage. So we need to limit 'n'
451
  // to real string length.
452
  size_t l1=Min(wcslen(s1)+1,n);
453
  size_t l2=Min(wcslen(s2)+1,n);
454
  return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,(int)l1,s2,(int)l2)-2;
455
#else
456
0
  if (n==0)
457
0
    return 0;
458
0
  while (true)
459
0
  {
460
0
    wchar u1 = towupper(*s1);
461
0
    wchar u2 = towupper(*s2);
462
0
    if (u1 != u2)
463
0
      return u1 < u2 ? -1 : 1;
464
0
    if (*s1==0 || --n==0)
465
0
      break;
466
0
    s1++;
467
0
    s2++;
468
0
  }
469
0
  return 0;
470
0
#endif
471
0
}
472
473
474
// Case insensitive wcsstr().
475
const wchar_t* wcscasestr(const wchar_t *str, const wchar_t *search)
476
0
{
477
0
  for (size_t i=0;str[i]!=0;i++)
478
0
    for (size_t j=0;;j++)
479
0
    {
480
0
      if (search[j]==0)
481
0
        return str+i;
482
0
      if (tolowerw(str[i+j])!=tolowerw(search[j]))
483
0
        break;
484
0
    }
485
0
  return NULL;
486
0
}
487
488
489
#ifndef SFX_MODULE
490
wchar* wcslower(wchar *s)
491
0
{
492
#ifdef _WIN_ALL
493
  // _wcslwr requires setlocale and we do not want to depend on setlocale
494
  // in Windows. Also CharLower involves less overhead.
495
  CharLower(s);
496
#else
497
0
  for (wchar *c=s;*c!=0;c++)
498
0
    *c=towlower(*c);
499
0
#endif
500
0
  return s;
501
0
}
502
#endif
503
504
505
#ifndef SFX_MODULE
506
wchar* wcsupper(wchar *s)
507
4.09k
{
508
#ifdef _WIN_ALL
509
  // _wcsupr requires setlocale and we do not want to depend on setlocale
510
  // in Windows. Also CharUpper involves less overhead.
511
  CharUpper(s);
512
#else
513
8.19k
  for (wchar *c=s;*c!=0;c++)
514
4.09k
    *c=towupper(*c);
515
4.09k
#endif
516
4.09k
  return s;
517
4.09k
}
518
#endif
519
520
521
522
523
int toupperw(int ch)
524
155k
{
525
#if defined(_WIN_ALL)
526
  // CharUpper is more reliable than towupper in Windows, which seems to be
527
  // C locale dependent even in Unicode version. For example, towupper failed
528
  // to convert lowercase Russian characters. Use 0xffff mask to prevent crash
529
  // if value larger than 0xffff is passed to this function.
530
  return (int)(INT_PTR)CharUpper((wchar *)(INT_PTR)(ch&0xffff));
531
#else
532
155k
  return towupper(ch);
533
155k
#endif
534
155k
}
535
536
537
int tolowerw(int ch)
538
0
{
539
#if defined(_WIN_ALL)
540
  // CharLower is more reliable than towlower in Windows.
541
  // See comment for towupper above. Use 0xffff mask to prevent crash
542
  // if value larger than 0xffff is passed to this function.
543
  return (int)(INT_PTR)CharLower((wchar *)(INT_PTR)(ch&0xffff));
544
#else
545
0
  return towlower(ch);
546
0
#endif
547
0
}
548
549
550
int atoiw(const wchar *s)
551
14
{
552
14
  return (int)atoilw(s);
553
14
}
554
555
556
int64 atoilw(const wchar *s)
557
14
{
558
14
  bool sign=false;
559
14
  if (*s=='-') // We do use signed integers here, for example, in GUI SFX.
560
1
  {
561
1
    s++;
562
1
    sign=true;
563
1
  }
564
  // Use unsigned type here, since long string can overflow the variable
565
  // and signed integer overflow is undefined behavior in C++.
566
14
  uint64 n=0;
567
267
  while (*s>='0' && *s<='9')
568
253
  {
569
253
    n=n*10+(*s-'0');
570
253
    s++;
571
253
  }
572
  // Check int64(n)>=0 to avoid the signed overflow with undefined behavior
573
  // when negating 0x8000000000000000.
574
14
  return sign && int64(n)>=0 ? -int64(n) : int64(n);
575
14
}
576
577
578
#ifdef DBCS_SUPPORTED
579
SupportDBCS gdbcs;
580
581
SupportDBCS::SupportDBCS()
582
{
583
  Init();
584
}
585
586
587
void SupportDBCS::Init()
588
{
589
  CPINFO CPInfo;
590
  GetCPInfo(CP_ACP,&CPInfo);
591
  DBCSMode=CPInfo.MaxCharSize > 1;
592
  for (uint I=0;I<ASIZE(IsLeadByte);I++)
593
    IsLeadByte[I]=IsDBCSLeadByte(I)!=0;
594
}
595
596
597
char* SupportDBCS::charnext(const char *s)
598
{
599
  // Zero cannot be the trail byte. So if next byte after the lead byte
600
  // is 0, the string is corrupt and we'll better return the pointer to 0,
601
  // to break string processing loops.
602
  return (char *)(IsLeadByte[(byte)*s] && s[1]!=0 ? s+2:s+1);
603
}
604
#endif
605
606