Line | Count | Source (jump to first uncovered line) |
1 | | #include "rar.hpp" |
2 | | #define MBFUNCTIONS |
3 | | |
#if defined(_UNIX) && defined(MBFUNCTIONS)

// Start of 0xE000 - 0xE0FF private use Unicode area. In Unix it receives
// high ASCII characters which cannot be converted to Unicode.
static const uint MapAreaStart=0xE000;

// Code marking a string with mapped characters. 0xFFFF was used for this
// purpose initially, but MSVC2008 swprintf treats 0xFFFF as error marker
// and fails. It could be worked around, but another character is safer.
static const uint MappedStringMark=0xFFFE;

// Conversion helpers aware of mapped characters, defined later in this file.
static bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success);
static void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success);

#endif
19 | | |
20 | | bool WideToChar(const wchar *Src,char *Dest,size_t DestSize) |
21 | 373k | { |
22 | 373k | bool RetCode=true; |
23 | 373k | *Dest=0; // Set 'Dest' to zero just in case the conversion will fail. |
24 | | |
25 | | #ifdef _WIN_ALL |
26 | | if (WideCharToMultiByte(CP_ACP,0,Src,-1,Dest,(int)DestSize,NULL,NULL)==0) |
27 | | RetCode=false; |
28 | | |
29 | | // wcstombs is broken in Android NDK r9. |
30 | | #elif defined(_APPLE) |
31 | | WideToUtf(Src,Dest,DestSize); |
32 | | |
33 | | #elif defined(MBFUNCTIONS) |
34 | 373k | if (!WideToCharMap(Src,Dest,DestSize,RetCode)) |
35 | 153k | { |
36 | 153k | mbstate_t ps; // Use thread safe external state based functions. |
37 | 153k | memset (&ps, 0, sizeof(ps)); |
38 | 153k | const wchar *SrcParam=Src; // wcsrtombs can change the pointer. |
39 | | |
40 | | // Some implementations of wcsrtombs can cause memory analyzing tools |
41 | | // like valgrind to report uninitialized data access. It happens because |
42 | | // internally these implementations call SSE4 based wcslen function, |
43 | | // which reads 16 bytes at once including those beyond of trailing 0. |
44 | 153k | size_t ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps); |
45 | | |
46 | 153k | if (ResultingSize==(size_t)-1 && errno==EILSEQ) |
47 | 17.3k | { |
48 | | // Aborted on inconvertible character not zero terminating the result. |
49 | | // EILSEQ helps to distinguish it from small output buffer abort. |
50 | | // We want to convert as much as we can, so we clean the output buffer |
51 | | // and repeat conversion. |
52 | 17.3k | memset (&ps, 0, sizeof(ps)); |
53 | 17.3k | SrcParam=Src; // wcsrtombs can change the pointer. |
54 | 17.3k | memset(Dest,0,DestSize); |
55 | 17.3k | ResultingSize=wcsrtombs(Dest,&SrcParam,DestSize,&ps); |
56 | 17.3k | } |
57 | | |
58 | 153k | if (ResultingSize==(size_t)-1) |
59 | 17.3k | RetCode=false; |
60 | 153k | if (ResultingSize==0 && *Src!=0) |
61 | 358 | RetCode=false; |
62 | 153k | } |
63 | | #else |
64 | | for (int I=0;I<DestSize;I++) |
65 | | { |
66 | | Dest[I]=(char)Src[I]; |
67 | | if (Src[I]==0) |
68 | | break; |
69 | | } |
70 | | #endif |
71 | 373k | if (DestSize>0) |
72 | 373k | Dest[DestSize-1]=0; |
73 | | |
74 | | // We tried to return the empty string if conversion is failed, |
75 | | // but it does not work well. WideCharToMultiByte returns 'failed' code |
76 | | // and partially converted string even if we wanted to convert only a part |
77 | | // of string and passed DestSize smaller than required for fully converted |
78 | | // string. Such call is the valid behavior in RAR code and we do not expect |
79 | | // the empty string in this case. |
80 | | |
81 | 373k | return RetCode; |
82 | 373k | } |
83 | | |
84 | | |
85 | | bool CharToWide(const char *Src,wchar *Dest,size_t DestSize) |
86 | 7.85k | { |
87 | 7.85k | bool RetCode=true; |
88 | 7.85k | *Dest=0; // Set 'Dest' to zero just in case the conversion will fail. |
89 | | |
90 | | #ifdef _WIN_ALL |
91 | | if (MultiByteToWideChar(CP_ACP,0,Src,-1,Dest,(int)DestSize)==0) |
92 | | RetCode=false; |
93 | | |
94 | | // mbstowcs is broken in Android NDK r9. |
95 | | #elif defined(_APPLE) |
96 | | UtfToWide(Src,Dest,DestSize); |
97 | | |
98 | | #elif defined(MBFUNCTIONS) |
99 | | mbstate_t ps; |
100 | 7.85k | memset (&ps, 0, sizeof(ps)); |
101 | 7.85k | const char *SrcParam=Src; // mbsrtowcs can change the pointer. |
102 | 7.85k | size_t ResultingSize=mbsrtowcs(Dest,&SrcParam,DestSize,&ps); |
103 | 7.85k | if (ResultingSize==(size_t)-1) |
104 | 3.23k | RetCode=false; |
105 | 7.85k | if (ResultingSize==0 && *Src!=0) |
106 | 0 | RetCode=false; |
107 | | |
108 | 7.85k | if (RetCode==false && DestSize>1) |
109 | 3.23k | CharToWideMap(Src,Dest,DestSize,RetCode); |
110 | | #else |
111 | | for (int I=0;I<DestSize;I++) |
112 | | { |
113 | | Dest[I]=(wchar_t)Src[I]; |
114 | | if (Src[I]==0) |
115 | | break; |
116 | | } |
117 | | #endif |
118 | 7.85k | if (DestSize>0) |
119 | 7.85k | Dest[DestSize-1]=0; |
120 | | |
121 | | // We tried to return the empty string if conversion is failed, |
122 | | // but it does not work well. MultiByteToWideChar returns 'failed' code |
123 | | // even if we wanted to convert only a part of string and passed DestSize |
124 | | // smaller than required for fully converted string. Such call is the valid |
125 | | // behavior in RAR code and we do not expect the empty string in this case. |
126 | | |
127 | 7.85k | return RetCode; |
128 | 7.85k | } |
129 | | |
130 | | |
131 | | bool WideToChar(const std::wstring &Src,std::string &Dest) |
132 | 373k | { |
133 | | // We need more than 1 char per wchar_t for DBCS and up to 4 for UTF-8. |
134 | 373k | std::vector<char> DestA(4*Src.size()+1); // "+1" for terminating zero. |
135 | 373k | bool Result=WideToChar(Src.c_str(),DestA.data(),DestA.size()); |
136 | 373k | Dest=DestA.data(); |
137 | 373k | return Result; |
138 | 373k | } |
139 | | |
140 | | |
141 | | bool CharToWide(const std::string &Src,std::wstring &Dest) |
142 | 7.85k | { |
143 | | // 2 wchar_t per char in case char is converted to UTF-16 surrogate pair. |
144 | 7.85k | std::vector<wchar> DestW(2*Src.size()+1); // "+1" for terminating zero. |
145 | 7.85k | bool Result=CharToWide(Src.c_str(),DestW.data(),DestW.size()); |
146 | 7.85k | Dest=DestW.data(); |
147 | 7.85k | return Result; |
148 | 7.85k | } |
149 | | |
150 | | |
#if defined(_UNIX) && defined(MBFUNCTIONS)
// Convert and restore mapped inconvertible Unicode characters.
// We use it for extended ASCII names in Unix.
// Returns false and does not touch 'Dest' and 'Success' if 'Src' does not
// contain the mapped string mark, so the caller shall use another conversion.
// Otherwise returns true, stores the zero terminated result to 'Dest'
// and sets 'Success' to false if some character could not be converted.
// Such characters are replaced by '_'.
bool WideToCharMap(const wchar *Src,char *Dest,size_t DestSize,bool &Success)
{
  // String with inconvertible characters mapped to private use Unicode area
  // must have the mark code somewhere.
  if (wcschr(Src,(wchar)MappedStringMark)==NULL)
    return false;

  // No space even for trailing zero. We cannot store anything, while
  // the source is not empty, because it contains the mark at least.
  if (DestSize==0)
  {
    Success=false;
    return true;
  }

  // Seems to be that wcrtomb in some memory analyzing libraries
  // can produce uninitialized output while reporting success on garbage input.
  // So we clean the destination to calm analyzers.
  memset(Dest,0,DestSize);

  Success=true;
  size_t SrcPos=0,DestPos=0;

  // Continue while we have the space for longest possible multibyte character
  // and trailing zero. We add MB_CUR_MAX to 'DestPos' instead of subtracting
  // it from 'DestSize' to avoid the unsigned underflow, which would disable
  // this check for buffers smaller than MB_CUR_MAX.
  while (Src[SrcPos]!=0 && DestPos+MB_CUR_MAX<DestSize)
  {
    if (uint(Src[SrcPos])==MappedStringMark)
    {
      // The mark itself is not a part of resulting string.
      SrcPos++;
      continue;
    }
    // For security reasons do not restore low ASCII codes, so mapping cannot
    // be used to hide control codes like path separators.
    if (uint(Src[SrcPos])>=MapAreaStart+0x80 && uint(Src[SrcPos])<MapAreaStart+0x100)
      Dest[DestPos++]=char(uint(Src[SrcPos++])-MapAreaStart);
    else
    {
      mbstate_t ps;
      memset(&ps,0,sizeof(ps));
      if (wcrtomb(Dest+DestPos,Src[SrcPos],&ps)==(size_t)-1)
      {
        Dest[DestPos]='_';
        Success=false;
      }
      SrcPos++;

      // Measure the just stored character to advance the output position.
      memset(&ps,0,sizeof(ps));
      int Length=mbrlen(Dest+DestPos,MB_CUR_MAX,&ps);
      DestPos+=Max(Length,1);
    }
  }
  Dest[Min(DestPos,DestSize-1)]=0;
  return true;
}
#endif
198 | | |
199 | | |
#if defined(_UNIX) && defined(MBFUNCTIONS)
// Convert and map inconvertible Unicode characters.
// We use it for extended ASCII names in Unix.
// Sets 'Success' to true only if the entire source string is processed.
// It is false if destination buffer is too small or if an inconvertible
// low ASCII character is found.
void CharToWideMap(const char *Src,wchar *Dest,size_t DestSize,bool &Success)
{
  // Map inconvertible characters to private use Unicode area 0xE000.
  // Mark such string by placing special non-character code before
  // first inconvertible character.
  Success=false;

  // No space even for trailing zero, so we must not write to 'Dest'.
  if (DestSize==0)
    return;

  bool MarkAdded=false;
  size_t SrcPos=0,DestPos=0;
  while (DestPos<DestSize)
  {
    if (Src[SrcPos]==0)
    {
      Success=true;
      break;
    }
    mbstate_t ps;
    memset(&ps,0,sizeof(ps));
    size_t res=mbrtowc(Dest+DestPos,Src+SrcPos,MB_CUR_MAX,&ps);
    if (res==(size_t)-1 || res==(size_t)-2)
    {
      // For security reasons we do not want to map low ASCII characters,
      // so we do not have additional .. and path separator codes.
      if (byte(Src[SrcPos])<0x80)
        break;

      if (!MarkAdded)
      {
        Dest[DestPos++]=MappedStringMark;
        MarkAdded=true;
        if (DestPos>=DestSize)
          break;
      }
      Dest[DestPos++]=byte(Src[SrcPos++])+MapAreaStart;
    }
    else
    {
      // For valid input mbrtowc returns the number of consumed source bytes,
      // so we do not need to decode the character again to get its length.
      // Always advance by 1 byte at least to guarantee the progress.
      SrcPos+=res>0 ? res:1;
      DestPos++;
    }
  }
  Dest[Min(DestPos,DestSize-1)]=0;
}
#endif
250 | | |
251 | | |
252 | | // SrcSize is source data size in wide characters, not in bytes. |
253 | | // DestSize is the maximum allowed destination size. |
254 | | byte* WideToRaw(const wchar *Src,size_t SrcSize,byte *Dest,size_t DestSize) |
255 | 0 | { |
256 | 0 | for (size_t I=0;I<SrcSize && I*2+1<DestSize;I++,Src++) |
257 | 0 | { |
258 | 0 | Dest[I*2]=(byte)*Src; |
259 | 0 | Dest[I*2+1]=(byte)(*Src>>8); |
260 | 0 | if (*Src==0) |
261 | 0 | break; |
262 | 0 | } |
263 | 0 | return Dest; |
264 | 0 | } |
265 | | |
266 | | |
267 | | // Store UTF-16 raw byte stream. |
268 | | void WideToRaw(const std::wstring &Src,std::vector<byte> &Dest) |
269 | 0 | { |
270 | 0 | for (wchar C : Src) |
271 | 0 | { |
272 | 0 | Dest.push_back((byte)C); |
273 | 0 | Dest.push_back((byte)(C>>8)); |
274 | 0 | } |
275 | | // In STL version of this function we do not add the trailing zero. |
276 | | // Otherwise we would need to remove it when restoring std::wstring |
277 | | // from raw data. |
278 | | |
279 | | // Dest.push_back(0); // 2 bytes of trailing UTF-16 zero. |
280 | | // Dest.push_back(0); |
281 | 0 | } |
282 | | |
283 | | |
284 | | wchar* RawToWide(const byte *Src,wchar *Dest,size_t DestSize) |
285 | 0 | { |
286 | 0 | for (size_t I=0;I<DestSize;I++) |
287 | 0 | if ((Dest[I]=Src[I*2]+(Src[I*2+1]<<8))==0) |
288 | 0 | break; |
289 | 0 | return Dest; |
290 | 0 | } |
291 | | |
292 | | |
293 | | std::wstring RawToWide(const std::vector<byte> &Src) |
294 | 0 | { |
295 | 0 | std::wstring Dest; |
296 | 0 | for (size_t I=0;I+1<Src.size();I+=2) |
297 | 0 | { |
298 | 0 | wchar c=Src[I]+(Src[I+1]<<8); |
299 | 0 | Dest.push_back(c); |
300 | 0 | if (c==0) |
301 | 0 | break; |
302 | 0 | } |
303 | 0 | return Dest; |
304 | 0 | } |
305 | | |
306 | | |
307 | | void WideToUtf(const wchar *Src,char *Dest,size_t DestSize) |
308 | 0 | { |
309 | 0 | long dsize=(long)DestSize; |
310 | 0 | dsize--; |
311 | 0 | while (*Src!=0 && --dsize>=0) |
312 | 0 | { |
313 | 0 | uint c=*(Src++); |
314 | 0 | if (c<0x80) |
315 | 0 | *(Dest++)=c; |
316 | 0 | else |
317 | 0 | if (c<0x800 && --dsize>=0) |
318 | 0 | { |
319 | 0 | *(Dest++)=(0xc0|(c>>6)); |
320 | 0 | *(Dest++)=(0x80|(c&0x3f)); |
321 | 0 | } |
322 | 0 | else |
323 | 0 | { |
324 | 0 | if (c>=0xd800 && c<=0xdbff && *Src>=0xdc00 && *Src<=0xdfff) // Surrogate pair. |
325 | 0 | { |
326 | 0 | c=((c-0xd800)<<10)+(*Src-0xdc00)+0x10000; |
327 | 0 | Src++; |
328 | 0 | } |
329 | 0 | if (c<0x10000 && (dsize-=2)>=0) |
330 | 0 | { |
331 | 0 | *(Dest++)=(0xe0|(c>>12)); |
332 | 0 | *(Dest++)=(0x80|((c>>6)&0x3f)); |
333 | 0 | *(Dest++)=(0x80|(c&0x3f)); |
334 | 0 | } |
335 | 0 | else |
336 | 0 | if (c < 0x200000 && (dsize-=3)>=0) |
337 | 0 | { |
338 | 0 | *(Dest++)=(0xf0|(c>>18)); |
339 | 0 | *(Dest++)=(0x80|((c>>12)&0x3f)); |
340 | 0 | *(Dest++)=(0x80|((c>>6)&0x3f)); |
341 | 0 | *(Dest++)=(0x80|(c&0x3f)); |
342 | 0 | } |
343 | 0 | } |
344 | 0 | } |
345 | 0 | *Dest=0; |
346 | 0 | } |
347 | | |
348 | | |
// Convert the wide string to UTF-8 and append the result to 'Dest'.
// Processing stops at the first zero character in 'Src'.
void WideToUtf(const std::wstring &Src,std::string &Dest)
{
  size_t Pos=0;
  while (Pos<Src.size() && Src[Pos]!=0)
  {
    uint Code=Src[Pos++];
    if (Code<0x80)
    {
      Dest.push_back(Code);
      continue;
    }
    if (Code<0x800)
    {
      Dest.push_back(0xc0|(Code>>6));
      Dest.push_back(0x80|(Code&0x3f));
      continue;
    }

    if (Code>=0xd800 && Code<=0xdbff && Pos<Src.size() && Src[Pos]>=0xdc00 && Src[Pos]<=0xdfff) // Surrogate pair.
    {
      Code=((Code-0xd800)<<10)+(Src[Pos]-0xdc00)+0x10000;
      Pos++;
    }

    if (Code<0x10000)
    {
      Dest.push_back(0xe0|(Code>>12));
      Dest.push_back(0x80|((Code>>6)&0x3f));
      Dest.push_back(0x80|(Code&0x3f));
    }
    else
      if (Code < 0x200000)
      {
        Dest.push_back(0xf0|(Code>>18));
        Dest.push_back(0x80|((Code>>12)&0x3f));
        Dest.push_back(0x80|((Code>>6)&0x3f));
        Dest.push_back(0x80|(Code&0x3f));
      }
    // Codes 0x200000 and above produce no output.
  }
}
386 | | |
387 | | |
388 | | |
389 | | size_t WideToUtfSize(const wchar *Src) |
390 | 0 | { |
391 | 0 | size_t Size=0; |
392 | 0 | for (;*Src!=0;Src++) |
393 | 0 | if (*Src<0x80) |
394 | 0 | Size++; |
395 | 0 | else |
396 | 0 | if (*Src<0x800) |
397 | 0 | Size+=2; |
398 | 0 | else |
399 | 0 | if ((uint)*Src<0x10000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t. |
400 | 0 | { |
401 | 0 | if (Src[0]>=0xd800 && Src[0]<=0xdbff && Src[1]>=0xdc00 && Src[1]<=0xdfff) |
402 | 0 | { |
403 | 0 | Size+=4; // 4 output bytes for Unicode surrogate pair. |
404 | 0 | Src++; |
405 | 0 | } |
406 | 0 | else |
407 | 0 | Size+=3; |
408 | 0 | } |
409 | 0 | else |
410 | 0 | if ((uint)*Src<0x200000) //(uint) to avoid Clang/win "always true" warning for 16-bit wchar_t. |
411 | 0 | Size+=4; |
412 | 0 | return Size+1; // Include terminating zero. |
413 | 0 | } |
414 | | |
415 | | |
416 | | bool UtfToWide(const char *Src,wchar *Dest,size_t DestSize) |
417 | 0 | { |
418 | 0 | bool Success=true; |
419 | 0 | long dsize=(long)DestSize; |
420 | 0 | dsize--; |
421 | 0 | while (*Src!=0) |
422 | 0 | { |
423 | 0 | uint c=byte(*(Src++)),d; |
424 | 0 | if (c<0x80) |
425 | 0 | d=c; |
426 | 0 | else |
427 | 0 | if ((c>>5)==6) |
428 | 0 | { |
429 | 0 | if ((*Src&0xc0)!=0x80) |
430 | 0 | { |
431 | 0 | Success=false; |
432 | 0 | break; |
433 | 0 | } |
434 | 0 | d=((c&0x1f)<<6)|(*Src&0x3f); |
435 | 0 | Src++; |
436 | 0 | } |
437 | 0 | else |
438 | 0 | if ((c>>4)==14) |
439 | 0 | { |
440 | 0 | if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80) |
441 | 0 | { |
442 | 0 | Success=false; |
443 | 0 | break; |
444 | 0 | } |
445 | 0 | d=((c&0xf)<<12)|((Src[0]&0x3f)<<6)|(Src[1]&0x3f); |
446 | 0 | Src+=2; |
447 | 0 | } |
448 | 0 | else |
449 | 0 | if ((c>>3)==30) |
450 | 0 | { |
451 | 0 | if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80 || (Src[2]&0xc0)!=0x80) |
452 | 0 | { |
453 | 0 | Success=false; |
454 | 0 | break; |
455 | 0 | } |
456 | 0 | d=((c&7)<<18)|((Src[0]&0x3f)<<12)|((Src[1]&0x3f)<<6)|(Src[2]&0x3f); |
457 | 0 | Src+=3; |
458 | 0 | } |
459 | 0 | else |
460 | 0 | { |
461 | 0 | Success=false; |
462 | 0 | break; |
463 | 0 | } |
464 | 0 | if (--dsize<0) |
465 | 0 | break; |
466 | 0 | if (d>0xffff) |
467 | 0 | { |
468 | 0 | if (--dsize<0) |
469 | 0 | break; |
470 | 0 | if (d>0x10ffff) // UTF-8 must end at 0x10ffff according to RFC 3629. |
471 | 0 | { |
472 | 0 | Success=false; |
473 | 0 | continue; |
474 | 0 | } |
475 | 0 | if (sizeof(*Dest)==2) // Use the surrogate pair. |
476 | 0 | { |
477 | 0 | *(Dest++)=((d-0x10000)>>10)+0xd800; |
478 | 0 | *(Dest++)=(d&0x3ff)+0xdc00; |
479 | 0 | } |
480 | 0 | else |
481 | 0 | *(Dest++)=d; |
482 | 0 | } |
483 | 0 | else |
484 | 0 | *(Dest++)=d; |
485 | 0 | } |
486 | 0 | *Dest=0; |
487 | 0 | return Success; |
488 | 0 | } |
489 | | |
490 | | |
491 | | bool UtfToWide(const char *Src,std::wstring &Dest) |
492 | 92.6k | { |
493 | 92.6k | bool Success=true; |
494 | 92.6k | Dest.clear(); |
495 | 207k | while (*Src!=0) |
496 | 117k | { |
497 | 117k | uint c=byte(*(Src++)),d; |
498 | 117k | if (c<0x80) |
499 | 113k | d=c; |
500 | 3.38k | else |
501 | 3.38k | if ((c>>5)==6) |
502 | 372 | { |
503 | 372 | if ((*Src&0xc0)!=0x80) |
504 | 261 | { |
505 | 261 | Success=false; |
506 | 261 | break; |
507 | 261 | } |
508 | 111 | d=((c&0x1f)<<6)|(*Src&0x3f); |
509 | 111 | Src++; |
510 | 111 | } |
511 | 3.01k | else |
512 | 3.01k | if ((c>>4)==14) |
513 | 415 | { |
514 | 415 | if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80) |
515 | 389 | { |
516 | 389 | Success=false; |
517 | 389 | break; |
518 | 389 | } |
519 | 26 | d=((c&0xf)<<12)|((Src[0]&0x3f)<<6)|(Src[1]&0x3f); |
520 | 26 | Src+=2; |
521 | 26 | } |
522 | 2.59k | else |
523 | 2.59k | if ((c>>3)==30) |
524 | 675 | { |
525 | 675 | if ((Src[0]&0xc0)!=0x80 || (Src[1]&0xc0)!=0x80 || (Src[2]&0xc0)!=0x80) |
526 | 150 | { |
527 | 150 | Success=false; |
528 | 150 | break; |
529 | 150 | } |
530 | 525 | d=((c&7)<<18)|((Src[0]&0x3f)<<12)|((Src[1]&0x3f)<<6)|(Src[2]&0x3f); |
531 | 525 | Src+=3; |
532 | 525 | } |
533 | 1.92k | else |
534 | 1.92k | { |
535 | 1.92k | Success=false; |
536 | 1.92k | break; |
537 | 1.92k | } |
538 | 114k | if (d>0xffff) |
539 | 525 | { |
540 | 525 | if (d>0x10ffff) // UTF-8 must end at 0x10ffff according to RFC 3629. |
541 | 82 | { |
542 | 82 | Success=false; |
543 | 82 | continue; |
544 | 82 | } |
545 | 443 | if (sizeof(wchar_t)==2) // Use the surrogate pair. |
546 | 0 | { |
547 | 0 | Dest.push_back( ((d-0x10000)>>10)+0xd800 ); |
548 | 0 | Dest.push_back( (d&0x3ff)+0xdc00 ); |
549 | 0 | } |
550 | 443 | else |
551 | 443 | Dest.push_back( d ); |
552 | 443 | } |
553 | 113k | else |
554 | 113k | Dest.push_back( d ); |
555 | 114k | } |
556 | 92.6k | return Success; |
557 | 92.6k | } |
558 | | |
559 | | |
560 | | /* |
561 | | bool UtfToWide(const std::vector<char> &Src,std::wstring &Dest) |
562 | | { |
563 | | bool Success=true; |
564 | | Dest.clear(); |
565 | | for (size_t I=0;I<Src.size() && Src[I]!=0;) // We expect it to always stop at 0. |
566 | | { |
567 | | uint c=byte(Src[I++]),d; |
568 | | if (c<0x80) |
569 | | d=c; |
570 | | else |
571 | | if ((c>>5)==6) |
572 | | { |
573 | | if (Src.size()-I<1 || (Src[I]&0xc0)!=0x80) |
574 | | { |
575 | | Success=false; |
576 | | break; |
577 | | } |
578 | | d=((c&0x1f)<<6)|(Src[I]&0x3f); |
579 | | I++; |
580 | | } |
581 | | else |
582 | | if ((c>>4)==14) |
583 | | { |
584 | | if (Src.size()-I<2 || (Src[I]&0xc0)!=0x80 || (Src[I+1]&0xc0)!=0x80) |
585 | | { |
586 | | Success=false; |
587 | | break; |
588 | | } |
589 | | d=((c&0xf)<<12)|((Src[I]&0x3f)<<6)|(Src[I+1]&0x3f); |
590 | | I+=2; |
591 | | } |
592 | | else |
593 | | if ((c>>3)==30) |
594 | | { |
595 | | if (Src.size()-I<3 || (Src[I]&0xc0)!=0x80 || (Src[I+1]&0xc0)!=0x80 || (Src[I+2]&0xc0)!=0x80) |
596 | | { |
597 | | Success=false; |
598 | | break; |
599 | | } |
600 | | d=((c&7)<<18)|((Src[I]&0x3f)<<12)|((Src[I+1]&0x3f)<<6)|(Src[I+2]&0x3f); |
601 | | I+=3; |
602 | | } |
603 | | else |
604 | | { |
605 | | Success=false; |
606 | | break; |
607 | | } |
608 | | if (d>0xffff) |
609 | | { |
610 | | if (d>0x10ffff) // UTF-8 must end at 0x10ffff according to RFC 3629. |
611 | | { |
612 | | Success=false; |
613 | | continue; |
614 | | } |
615 | | if (sizeof(Dest[0])==2) // Use the surrogate pair. |
616 | | { |
617 | | Dest.push_back( ((d-0x10000)>>10)+0xd800 ); |
618 | | Dest.push_back( (d&0x3ff)+0xdc00 ); |
619 | | } |
620 | | else |
621 | | Dest.push_back( d ); |
622 | | } |
623 | | else |
624 | | Dest.push_back( d ); |
625 | | } |
626 | | return Success; |
627 | | } |
628 | | */ |
629 | | |
630 | | |
631 | | // For zero terminated strings. |
632 | | bool IsTextUtf8(const byte *Src) |
633 | 0 | { |
634 | 0 | return IsTextUtf8(Src,strlen((const char *)Src)); |
635 | 0 | } |
636 | | |
637 | | |
638 | | // Source data can be both with and without UTF-8 BOM. |
639 | | bool IsTextUtf8(const byte *Src,size_t SrcSize) |
640 | 0 | { |
641 | 0 | while (SrcSize-- > 0) |
642 | 0 | { |
643 | 0 | byte C=*(Src++); |
644 | 0 | int HighOne=0; // Number of leftmost '1' bits. |
645 | 0 | for (byte Mask=0x80;Mask!=0 && (C & Mask)!=0;Mask>>=1) |
646 | 0 | HighOne++; |
647 | 0 | if (HighOne==1 || HighOne>6) |
648 | 0 | return false; |
649 | 0 | while (--HighOne > 0) |
650 | 0 | if (SrcSize-- <= 0 || (*(Src++) & 0xc0)!=0x80) |
651 | 0 | return false; |
652 | 0 | } |
653 | 0 | return true; |
654 | 0 | } |
655 | | |
656 | | |
657 | | int wcsicomp(const wchar *s1,const wchar *s2) |
658 | 1.99k | { |
659 | | // If strings are English or numeric, perform the fast comparison. |
660 | | // It improves speed in cases like comparing against a lot of MOTW masks. |
661 | 1.99k | bool FastMode=true; |
662 | 4.56k | while (true) |
663 | 4.56k | { |
664 | | // English uppercase, English lowercase and digit flags. |
665 | 4.56k | bool u1=*s1>='A' && *s1<='Z', l1=*s1>='a' && *s1<='z', d1=*s1>='0' && *s1<='9'; |
666 | 4.56k | bool u2=*s2>='A' && *s2<='Z', l2=*s2>='a' && *s2<='z', d2=*s2>='0' && *s2<='9'; |
667 | | |
668 | | // Fast comparison is impossible if both characters are not alphanumeric or 0. |
669 | 4.56k | if (!u1 && !l1 && !d1 && *s1!=0 && !u2 && !l2 && !d2 && *s2!=0) |
670 | 425 | { |
671 | 425 | FastMode=false; |
672 | 425 | break; |
673 | 425 | } |
674 | | // Convert lowercase to uppercase, keep numeric and not alphanumeric as is. |
675 | 4.13k | wchar c1 = l1 ? *s1-'a'+'A' : *s1; |
676 | 4.13k | wchar c2 = l2 ? *s2-'a'+'A' : *s2; |
677 | | |
678 | | // If characters mistmatch, to return a proper value we must compare |
679 | | // already converted, case insensitive characters instead of original ones. |
680 | | // So we place a.txt before B.txt and can perform the correct case |
681 | | // insensitive binary search in different string lists. |
682 | 4.13k | if (c1 != c2) |
683 | 1.27k | return c1 < c2 ? -1 : 1; |
684 | | |
685 | 2.86k | if (*s1==0) |
686 | 290 | break; |
687 | 2.57k | s1++; |
688 | 2.57k | s2++; |
689 | 2.57k | } |
690 | 715 | if (FastMode) |
691 | 290 | return 0; |
692 | | |
693 | | #ifdef _WIN_ALL |
694 | | return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,-1,s2,-1)-2; |
695 | | #else |
696 | 2.12k | while (true) |
697 | 2.12k | { |
698 | 2.12k | wchar u1 = towupper(*s1); |
699 | 2.12k | wchar u2 = towupper(*s2); |
700 | | |
701 | | // If characters mistmatch, to return a proper value we must compare |
702 | | // already converted, case insensitive characters instead of original ones. |
703 | | // So we place a.txt before B.txt and can perform the correct case |
704 | | // insensitive binary search in different string lists. |
705 | 2.12k | if (u1 != u2) |
706 | 0 | return u1 < u2 ? -1 : 1; |
707 | 2.12k | if (*s1==0) |
708 | 425 | break; |
709 | 1.70k | s1++; |
710 | 1.70k | s2++; |
711 | 1.70k | } |
712 | 425 | return 0; |
713 | 425 | #endif |
714 | 425 | } |
715 | | |
716 | | |
717 | | int wcsnicomp(const wchar *s1,const wchar *s2,size_t n) |
718 | 0 | { |
719 | | #ifdef _WIN_ALL |
720 | | // If we specify 'n' exceeding the actual string length, CompareString goes |
721 | | // beyond the trailing zero and compares garbage. So we need to limit 'n' |
722 | | // to real string length. |
723 | | size_t sl1=wcslen(s1); // Pre-compute to not call wcslen() in Min() twice. |
724 | | size_t l1=Min(sl1+1,n); |
725 | | size_t sl2=wcslen(s2); // Pre-compute to not call wcslen() in Min() twice. |
726 | | size_t l2=Min(sl2+1,n); |
727 | | return CompareStringW(LOCALE_USER_DEFAULT,NORM_IGNORECASE|SORT_STRINGSORT,s1,(int)l1,s2,(int)l2)-2; |
728 | | #else |
729 | 0 | if (n==0) |
730 | 0 | return 0; |
731 | 0 | while (true) |
732 | 0 | { |
733 | 0 | wchar u1 = towupper(*s1); |
734 | 0 | wchar u2 = towupper(*s2); |
735 | 0 | if (u1 != u2) |
736 | 0 | return u1 < u2 ? -1 : 1; |
737 | 0 | if (*s1==0 || --n==0) |
738 | 0 | break; |
739 | 0 | s1++; |
740 | 0 | s2++; |
741 | 0 | } |
742 | 0 | return 0; |
743 | 0 | #endif |
744 | 0 | } |
745 | | |
746 | | |
747 | | // Case insensitive wcsstr(). |
748 | | const wchar_t* wcscasestr(const wchar_t *str, const wchar_t *search) |
749 | 0 | { |
750 | 0 | for (size_t i=0;str[i]!=0;i++) |
751 | 0 | for (size_t j=0;;j++) |
752 | 0 | { |
753 | 0 | if (search[j]==0) |
754 | 0 | return str+i; |
755 | 0 | if (tolowerw(str[i+j])!=tolowerw(search[j])) |
756 | 0 | break; |
757 | 0 | } |
758 | 0 | return nullptr; |
759 | 0 | } |
760 | | |
761 | | |
762 | | // Case insensitive std::wstring substring search. |
763 | | std::wstring::size_type wcscasestr(const std::wstring &str, const std::wstring &search) |
764 | 0 | { |
765 | 0 | const wchar *Found=wcscasestr(str.c_str(),search.c_str()); |
766 | 0 | return Found==nullptr ? std::wstring::npos : Found-str.c_str(); |
767 | 0 | } |
768 | | |
769 | | |
770 | | #ifndef SFX_MODULE |
771 | | wchar* wcslower(wchar *s) |
772 | 0 | { |
773 | | #ifdef _WIN_ALL |
774 | | // _wcslwr requires setlocale and we do not want to depend on setlocale |
775 | | // in Windows. Also CharLower involves less overhead. |
776 | | CharLower(s); |
777 | | #else |
778 | 0 | for (wchar *c=s;*c!=0;c++) |
779 | 0 | *c=towlower(*c); |
780 | 0 | #endif |
781 | 0 | return s; |
782 | 0 | } |
783 | | |
784 | | |
785 | | void wcslower(std::wstring &s) |
786 | 0 | { |
787 | 0 | wcslower(&s[0]); |
788 | 0 | } |
789 | | |
790 | | |
791 | | wchar* wcsupper(wchar *s) |
792 | 11.9k | { |
793 | | #ifdef _WIN_ALL |
794 | | // _wcsupr requires setlocale and we do not want to depend on setlocale |
795 | | // in Windows. Also CharUpper involves less overhead. |
796 | | CharUpper(s); |
797 | | #else |
798 | 23.9k | for (wchar *c=s;*c!=0;c++) |
799 | 11.9k | *c=towupper(*c); |
800 | 11.9k | #endif |
801 | 11.9k | return s; |
802 | 11.9k | } |
803 | | |
804 | | |
805 | | void wcsupper(std::wstring &s) |
806 | 11.9k | { |
807 | 11.9k | wcsupper(&s[0]); |
808 | 11.9k | } |
809 | | #endif |
810 | | |
811 | | |
812 | | |
813 | | |
// Return the uppercase version of wide character code 'ch'.
int toupperw(int ch)
{
#if defined(_WIN_ALL)
  // CharUpper is more reliable than towupper in Windows, which seems to be
  // C locale dependent even in Unicode version. For example, towupper failed
  // to convert lowercase Russian characters. Use 0xffff mask to prevent crash
  // if value larger than 0xffff is passed to this function.
  return (int)(INT_PTR)CharUpper((wchar *)(INT_PTR)(ch&0xffff));
#else
  const int Upper=towupper(ch);
  return Upper;
#endif
}
826 | | |
827 | | |
// Return the lowercase version of wide character code 'ch'.
int tolowerw(int ch)
{
#if defined(_WIN_ALL)
  // CharLower is more reliable than towlower in Windows.
  // See comment for towupper above. Use 0xffff mask to prevent crash
  // if value larger than 0xffff is passed to this function.
  return (int)(INT_PTR)CharLower((wchar *)(INT_PTR)(ch&0xffff));
#else
  const int Lower=towlower(ch);
  return Lower;
#endif
}
839 | | |
840 | | |
841 | | int atoiw(const std::wstring &s) |
842 | 238 | { |
843 | 238 | return (int)atoilw(s); |
844 | 238 | } |
845 | | |
846 | | |
847 | | int64 atoilw(const std::wstring &s) |
848 | 238 | { |
849 | 238 | bool sign=false; |
850 | 238 | size_t Pos=0; |
851 | 238 | if (s[Pos]=='-') // We do use signed integers here, for example, in GUI SFX. |
852 | 81 | { |
853 | 81 | Pos++; |
854 | 81 | sign=true; |
855 | 81 | } |
856 | | // Use unsigned type here, since long string can overflow the variable |
857 | | // and signed integer overflow is undefined behavior in C++. |
858 | 238 | uint64 n=0; |
859 | 1.75k | while (s[Pos]>='0' && s[Pos]<='9') |
860 | 1.52k | { |
861 | 1.52k | n=n*10+(s[Pos]-'0'); |
862 | 1.52k | Pos++; |
863 | 1.52k | } |
864 | | // Check int64(n)>=0 to avoid the signed overflow with undefined behavior |
865 | | // when negating 0x8000000000000000. |
866 | 238 | return sign && int64(n)>=0 ? -int64(n) : int64(n); |
867 | 238 | } |
868 | | |
869 | | |
870 | | #ifdef DBCS_SUPPORTED |
871 | | SupportDBCS gdbcs; |
872 | | |
873 | | SupportDBCS::SupportDBCS() |
874 | | { |
875 | | Init(); |
876 | | } |
877 | | |
878 | | |
879 | | void SupportDBCS::Init() |
880 | | { |
881 | | CPINFO CPInfo; |
882 | | GetCPInfo(CP_ACP,&CPInfo); |
883 | | DBCSMode=CPInfo.MaxCharSize > 1; |
884 | | for (uint I=0;I<ASIZE(IsLeadByte);I++) |
885 | | IsLeadByte[I]=IsDBCSLeadByte(I)!=0; |
886 | | } |
887 | | |
888 | | |
889 | | char* SupportDBCS::charnext(const char *s) |
890 | | { |
891 | | // Zero cannot be the trail byte. So if next byte after the lead byte |
892 | | // is 0, the string is corrupt and we'll better return the pointer to 0, |
893 | | // to break string processing loops. |
894 | | return (char *)(IsLeadByte[(byte)*s] && s[1]!=0 ? s+2:s+1); |
895 | | } |
896 | | #endif |
897 | | |
898 | | |