/src/FreeRDP/winpr/libwinpr/crt/unicode_builtin.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright 2001-2004 Unicode, Inc. |
3 | | * |
4 | | * Disclaimer |
5 | | * |
6 | | * This source code is provided as is by Unicode, Inc. No claims are |
7 | | * made as to fitness for any particular purpose. No warranties of any |
8 | | * kind are expressed or implied. The recipient agrees to determine |
9 | | * applicability of information provided. If this file has been |
10 | | * purchased on magnetic or optical media from Unicode, Inc., the |
11 | | * sole remedy for any claim will be exchange of defective media |
12 | | * within 90 days of receipt. |
13 | | * |
14 | | * Limitations on Rights to Redistribute This Code |
15 | | * |
16 | | * Unicode, Inc. hereby grants the right to freely use the information |
17 | | * supplied in this file in the creation of products supporting the |
18 | | * Unicode Standard, and to make copies of this file in any form |
19 | | * for internal or external distribution as long as this notice |
20 | | * remains attached. |
21 | | */ |
22 | | |
23 | | /* --------------------------------------------------------------------- |
24 | | |
25 | | Conversions between UTF32, UTF-16, and UTF-8. Source code file. |
26 | | Author: Mark E. Davis, 1994. |
27 | | Rev History: Rick McGowan, fixes & updates May 2001. |
28 | | Sept 2001: fixed const & error conditions per |
29 | | mods suggested by S. Parent & A. Lillich. |
30 | | June 2002: Tim Dodd added detection and handling of incomplete |
31 | | source sequences, enhanced error detection, added casts |
32 | | to eliminate compiler warnings. |
33 | | July 2003: slight mods to back out aggressive FFFE detection. |
34 | | Jan 2004: updated switches in from-UTF8 conversions. |
35 | | Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. |
36 | | |
37 | | See the header file "utf.h" for complete documentation. |
38 | | |
39 | | ------------------------------------------------------------------------ */ |
40 | | |
41 | | #include <winpr/wtypes.h> |
42 | | #include <winpr/string.h> |
43 | | #include <winpr/assert.h> |
44 | | #include <winpr/cast.h> |
45 | | |
46 | | #include "unicode.h" |
47 | | |
48 | | #include "../log.h" |
49 | | #define TAG WINPR_TAG("unicode") |
50 | | |
51 | | /* |
52 | | * Character Types: |
53 | | * |
54 | | * UTF8: uint8_t 8 bits |
55 | | * UTF16: uint16_t 16 bits |
56 | | * UTF32: uint32_t 32 bits |
57 | | */ |
58 | | |
/*
 * Some fundamental constants.
 *
 * Every macro expansion is wrapped in parentheses so that it behaves as a
 * single primary expression wherever it is used (e.g. as the operand of
 * sizeof or next to a postfix operator).
 */
#define UNI_REPLACEMENT_CHAR ((uint32_t)0x0000FFFD) /* U+FFFD, substituted for bad input */
#define UNI_MAX_BMP ((uint32_t)0x0000FFFF)          /* last code point that fits one UTF-16 unit */
#define UNI_MAX_UTF16 ((uint32_t)0x0010FFFF)        /* last code point encodable in UTF-16 */
#define UNI_MAX_UTF32 ((uint32_t)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32 ((uint32_t)0x0010FFFF)

/* Outcome of one of the buffer conversion routines in this file. */
typedef enum
{
    conversionOK,    /* conversion successful */
    sourceExhausted, /* partial character in source, but hit end */
    targetExhausted, /* insufficient room in target for conversion */
    sourceIllegal    /* source sequence is illegal/malformed */
} ConversionResult;

/*
 * How the converters react to input that decodes to a surrogate or to an
 * out-of-range value: strictConversion stops with sourceIllegal, while
 * lenientConversion keeps going.
 */
typedef enum
{
    strictConversion = 0,
    lenientConversion
} ConversionFlags;

/* Number of payload bits carried by each half of a surrogate pair. */
static const int halfShift = 10; /* used for shifting by 10 bits */

/* Value removed from / added to a supplementary code point around pair packing. */
static const uint32_t halfBase = 0x0010000UL;
/* Mask selecting the low halfShift bits of a value. */
static const uint32_t halfMask = 0x3FFUL;

/* Inclusive bounds of the UTF-16 high (leading) and low (trailing) surrogate ranges. */
#define UNI_SUR_HIGH_START ((uint32_t)0xD800)
#define UNI_SUR_HIGH_END ((uint32_t)0xDBFF)
#define UNI_SUR_LOW_START ((uint32_t)0xDC00)
#define UNI_SUR_LOW_END ((uint32_t)0xDFFF)
89 | | |
90 | | /* --------------------------------------------------------------------- */ |
91 | | |
/*
 * Indexed by the first byte of a UTF-8 sequence; yields the number of
 * trailing bytes that are supposed to follow that byte.
 * Legal UTF-8 never has 4 or 5 trailing bytes; those entries are kept for
 * anyone who wants to handle the longer forms allowed by earlier algorithms.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xE0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xF0 */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};

/*
 * Indexed by the number of trailing bytes. After all bytes of a sequence
 * have been summed up (shifting by 6 bits between bytes), subtracting the
 * matching entry removes the marker bits of the lead byte and of every
 * continuation byte in one step, leaving the code point.
 */
static const uint32_t offsetsFromUTF8[6] = {
    0x00000000UL, /* 0 trailing bytes */
    0x00003080UL, /* 1 trailing byte  */
    0x000E2080UL, /* 2 trailing bytes */
    0x03C82080UL, /* 3 trailing bytes */
    0xFA082080UL, /* 4 trailing bytes (not legal UTF-8) */
    0x82082080UL  /* 5 trailing bytes (not legal UTF-8) */
};

/*
 * Indexed by the total length of a UTF-8 sequence in bytes; yields the
 * marker bits OR-ed into the first byte of a sequence of that length.
 * Index 0 is unused. Legal UTF-8 sequences are 4 or fewer bytes in total.
 */
static const uint8_t firstByteMark[7] = {
    0x00, /* unused   */
    0x00, /* 1 byte   */
    0xC0, /* 2 bytes  */
    0xE0, /* 3 bytes  */
    0xF0, /* 4 bytes  */
    0xF8, /* 5 bytes  */
    0xFC  /* 6 bytes  */
};
126 | | |
127 | | /* We always need UTF-16LE, even on big endian systems! */ |
128 | | static WCHAR setWcharFrom(WCHAR w) |
129 | 3.83M | { |
130 | | #if defined(__BIG_ENDIAN__) |
131 | | union |
132 | | { |
133 | | WCHAR w; |
134 | | char c[2]; |
135 | | } cnv; |
136 | | |
137 | | cnv.w = w; |
138 | | const char c = cnv.c[0]; |
139 | | cnv.c[0] = cnv.c[1]; |
140 | | cnv.c[1] = c; |
141 | | return cnv.w; |
142 | | #else |
143 | 3.83M | return w; |
144 | 3.83M | #endif |
145 | 3.83M | } |
146 | | |
147 | | /* --------------------------------------------------------------------- */ |
148 | | |
149 | | /* The interface converts a whole buffer to avoid function-call overhead. |
150 | | * Constants have been gathered. Loops & conditionals have been removed as |
151 | | * much as possible for efficiency, in favor of drop-through switches. |
152 | | * (See "Note A" at the bottom of the file for equivalent code.) |
153 | | * If your compiler supports it, the "isLegalUTF8" call can be turned |
154 | | * into an inline function. |
155 | | */ |
156 | | |
157 | | /* --------------------------------------------------------------------- */ |
158 | | |
159 | | static ConversionResult winpr_ConvertUTF16toUTF8_Internal(const uint16_t** sourceStart, |
160 | | const uint16_t* sourceEnd, |
161 | | uint8_t** targetStart, |
162 | | const uint8_t* targetEnd, |
163 | | ConversionFlags flags) |
164 | 398 | { |
165 | 398 | bool computeLength = (!targetEnd) ? true : false; |
166 | 398 | const uint16_t* source = *sourceStart; |
167 | 398 | uint8_t* target = *targetStart; |
168 | 398 | ConversionResult result = conversionOK; |
169 | | |
170 | 604k | while (source < sourceEnd) |
171 | 604k | { |
172 | 604k | uint32_t ch = 0; |
173 | 604k | unsigned short bytesToWrite = 0; |
174 | 604k | const uint32_t byteMask = 0xBF; |
175 | 604k | const uint32_t byteMark = 0x80; |
176 | 604k | const uint16_t* oldSource = |
177 | 604k | source; /* In case we have to back up because of target overflow. */ |
178 | | |
179 | 604k | ch = setWcharFrom(*source++); |
180 | | |
181 | | /* If we have a surrogate pair, convert to UTF32 first. */ |
182 | 604k | if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) |
183 | 972 | { |
184 | | /* If the 16 bits following the high surrogate are in the source buffer... */ |
185 | 972 | if (source < sourceEnd) |
186 | 970 | { |
187 | 970 | uint32_t ch2 = setWcharFrom(*source); |
188 | | |
189 | | /* If it's a low surrogate, convert to UTF32. */ |
190 | 970 | if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) |
191 | 907 | { |
192 | 907 | ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) + |
193 | 907 | halfBase; |
194 | 907 | ++source; |
195 | 907 | } |
196 | 63 | else if (flags == strictConversion) |
197 | 63 | { |
198 | | /* it's an unpaired high surrogate */ |
199 | 63 | --source; /* return to the illegal value itself */ |
200 | 63 | result = sourceIllegal; |
201 | 63 | break; |
202 | 63 | } |
203 | 970 | } |
204 | 2 | else |
205 | 2 | { |
206 | | /* We don't have the 16 bits following the high surrogate. */ |
207 | 2 | --source; /* return to the high surrogate */ |
208 | 2 | result = sourceExhausted; |
209 | 2 | break; |
210 | 2 | } |
211 | 972 | } |
212 | 603k | else if (flags == strictConversion) |
213 | 603k | { |
214 | | /* UTF-16 surrogate values are illegal in UTF-32 */ |
215 | 603k | if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) |
216 | 27 | { |
217 | 27 | --source; /* return to the illegal value itself */ |
218 | 27 | result = sourceIllegal; |
219 | 27 | break; |
220 | 27 | } |
221 | 603k | } |
222 | | |
223 | | /* Figure out how many bytes the result will require */ |
224 | 604k | if (ch < (uint32_t)0x80) |
225 | 722 | { |
226 | 722 | bytesToWrite = 1; |
227 | 722 | } |
228 | 603k | else if (ch < (uint32_t)0x800) |
229 | 40.9k | { |
230 | 40.9k | bytesToWrite = 2; |
231 | 40.9k | } |
232 | 562k | else if (ch < (uint32_t)0x10000) |
233 | 561k | { |
234 | 561k | bytesToWrite = 3; |
235 | 561k | } |
236 | 907 | else if (ch < (uint32_t)0x110000) |
237 | 907 | { |
238 | 907 | bytesToWrite = 4; |
239 | 907 | } |
240 | 0 | else |
241 | 0 | { |
242 | 0 | bytesToWrite = 3; |
243 | 0 | ch = UNI_REPLACEMENT_CHAR; |
244 | 0 | } |
245 | | |
246 | 604k | target += bytesToWrite; |
247 | | |
248 | 604k | if ((target > targetEnd) && (!computeLength)) |
249 | 0 | { |
250 | 0 | source = oldSource; /* Back up source pointer! */ |
251 | 0 | target -= bytesToWrite; |
252 | 0 | result = targetExhausted; |
253 | 0 | break; |
254 | 0 | } |
255 | | |
256 | 604k | if (!computeLength) |
257 | 301k | { |
258 | 301k | switch (bytesToWrite) |
259 | 301k | { |
260 | | /* note: everything falls through. */ |
261 | 453 | case 4: |
262 | 453 | *--target = (uint8_t)((ch | byteMark) & byteMask); |
263 | 453 | ch >>= 6; |
264 | | /* fallthrough */ |
265 | 453 | WINPR_FALLTHROUGH |
266 | 280k | case 3: |
267 | 280k | *--target = (uint8_t)((ch | byteMark) & byteMask); |
268 | 280k | ch >>= 6; |
269 | | /* fallthrough */ |
270 | 280k | WINPR_FALLTHROUGH |
271 | | |
272 | 301k | case 2: |
273 | 301k | *--target = (uint8_t)((ch | byteMark) & byteMask); |
274 | 301k | ch >>= 6; |
275 | | /* fallthrough */ |
276 | 301k | WINPR_FALLTHROUGH |
277 | | |
278 | 301k | case 1: |
279 | 301k | *--target = (uint8_t)(ch | firstByteMark[bytesToWrite]); |
280 | 301k | break; |
281 | 0 | default: |
282 | 0 | return sourceIllegal; |
283 | 301k | } |
284 | 301k | } |
285 | 302k | else |
286 | 302k | { |
287 | 302k | switch (bytesToWrite) |
288 | 302k | { |
289 | | /* note: everything falls through. */ |
290 | 454 | case 4: |
291 | 454 | --target; |
292 | | /* fallthrough */ |
293 | 454 | WINPR_FALLTHROUGH |
294 | | |
295 | 281k | case 3: |
296 | 281k | --target; |
297 | | /* fallthrough */ |
298 | 281k | WINPR_FALLTHROUGH |
299 | | |
300 | 302k | case 2: |
301 | 302k | --target; |
302 | | /* fallthrough */ |
303 | 302k | WINPR_FALLTHROUGH |
304 | | |
305 | 302k | case 1: |
306 | 302k | --target; |
307 | 302k | break; |
308 | 0 | default: |
309 | 0 | return sourceIllegal; |
310 | 302k | } |
311 | 302k | } |
312 | | |
313 | 604k | target += bytesToWrite; |
314 | 604k | } |
315 | | |
316 | 398 | *sourceStart = source; |
317 | 398 | *targetStart = target; |
318 | 398 | return result; |
319 | 398 | } |
320 | | |
321 | | /* --------------------------------------------------------------------- */ |
322 | | |
/*
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 *
 * source  first byte of the candidate sequence; `length` bytes are read.
 * length  number of bytes in the sequence, pre-determined by the first
 *         byte. When not called from ConvertUTF8to*, it can be set by
 *             length = trailingBytesForUTF8[*source] + 1;
 *         and the sequence is illegal right away if there aren't that many
 *         bytes available.
 *
 * Returns true if the bytes form a well-formed sequence, false otherwise.
 * Any length outside 1..4 yields false: the Unicode definition of UTF-8
 * goes up to 4-byte sequences.
 *
 * The second byte is checked against both ends of its allowed range for
 * every lead byte. In particular a second byte below 0x80 is rejected after
 * the lead bytes 0xED and 0xF4 as well, which only have a tightened upper
 * bound.
 */
static bool isLegalUTF8(const uint8_t* source, int length)
{
    if ((length < 1) || (length > 4))
        return false;

    const uint8_t lead = source[0];

    /* Third and fourth byte (when present) are plain continuation bytes. */
    for (int idx = length - 1; idx >= 2; idx--)
    {
        const uint8_t cont = source[idx];

        if ((cont < 0x80) || (cont > 0xBF))
            return false;
    }

    if (length >= 2)
    {
        /* The allowed range of the second byte depends on the lead byte. */
        uint8_t lowest = 0x80;
        uint8_t highest = 0xBF;

        switch (lead)
        {
            case 0xE0: /* smaller values would be overlong 3-byte forms */
                lowest = 0xA0;
                break;

            case 0xED: /* larger values would encode UTF-16 surrogates */
                highest = 0x9F;
                break;

            case 0xF0: /* smaller values would be overlong 4-byte forms */
                lowest = 0x90;
                break;

            case 0xF4: /* larger values would exceed U+10FFFF */
                highest = 0x8F;
                break;

            default:
                break;
        }

        if ((source[1] < lowest) || (source[1] > highest))
            return false;
    }

    /* Continuation bytes and the overlong leads 0xC0/0xC1 cannot start a sequence. */
    if ((lead >= 0x80) && (lead < 0xC2))
        return false;

    /* Lead bytes above 0xF4 would encode values beyond U+10FFFF. */
    if (lead > 0xF4)
        return false;

    return true;
}
406 | | |
407 | | /* --------------------------------------------------------------------- */ |
408 | | |
409 | | static ConversionResult winpr_ConvertUTF8toUTF16_Internal(const uint8_t** sourceStart, |
410 | | const uint8_t* sourceEnd, |
411 | | uint16_t** targetStart, |
412 | | const uint16_t* targetEnd, |
413 | | ConversionFlags flags) |
414 | 1.97k | { |
415 | 1.97k | bool computeLength = (!targetEnd) ? true : false; |
416 | 1.97k | ConversionResult result = conversionOK; |
417 | 1.97k | const uint8_t* source = *sourceStart; |
418 | 1.97k | uint16_t* target = *targetStart; |
419 | | |
420 | 6.45M | while (source < sourceEnd) |
421 | 6.45M | { |
422 | 6.45M | uint32_t ch = 0; |
423 | 6.45M | unsigned short extraBytesToRead = |
424 | 12.9M | WINPR_ASSERTING_INT_CAST(unsigned short, trailingBytesForUTF8[*source]); |
425 | | |
426 | 6.45M | if ((source + extraBytesToRead) >= sourceEnd) |
427 | 1 | { |
428 | 1 | result = sourceExhausted; |
429 | 1 | break; |
430 | 1 | } |
431 | | |
432 | | /* Do this check whether lenient or strict */ |
433 | 6.45M | if (!isLegalUTF8(source, extraBytesToRead + 1)) |
434 | 78 | { |
435 | 78 | result = sourceIllegal; |
436 | 78 | break; |
437 | 78 | } |
438 | | |
439 | | /* |
440 | | * The cases all fall through. See "Note A" below. |
441 | | */ |
442 | 6.45M | switch (extraBytesToRead) |
443 | 6.45M | { |
444 | 0 | case 5: |
445 | 0 | ch += *source++; |
446 | 0 | ch <<= 6; /* remember, illegal UTF-8 */ |
447 | | /* fallthrough */ |
448 | 0 | WINPR_FALLTHROUGH |
449 | |
|
450 | 0 | case 4: |
451 | 0 | ch += *source++; |
452 | 0 | ch <<= 6; /* remember, illegal UTF-8 */ |
453 | | /* fallthrough */ |
454 | 0 | WINPR_FALLTHROUGH |
455 | |
|
456 | 798 | case 3: |
457 | 798 | ch += *source++; |
458 | 798 | ch <<= 6; |
459 | | /* fallthrough */ |
460 | 798 | WINPR_FALLTHROUGH |
461 | | |
462 | 1.47k | case 2: |
463 | 1.47k | ch += *source++; |
464 | 1.47k | ch <<= 6; |
465 | | /* fallthrough */ |
466 | 1.47k | WINPR_FALLTHROUGH |
467 | | |
468 | 1.75k | case 1: |
469 | 1.75k | ch += *source++; |
470 | 1.75k | ch <<= 6; |
471 | | /* fallthrough */ |
472 | 1.75k | WINPR_FALLTHROUGH |
473 | | |
474 | 6.45M | case 0: |
475 | 6.45M | ch += *source++; |
476 | 6.45M | break; |
477 | 0 | default: |
478 | 0 | return sourceIllegal; |
479 | 6.45M | } |
480 | | |
481 | 6.45M | ch -= offsetsFromUTF8[extraBytesToRead]; |
482 | | |
483 | 6.45M | if ((target >= targetEnd) && (!computeLength)) |
484 | 0 | { |
485 | 0 | source -= (extraBytesToRead + 1); /* Back up source pointer! */ |
486 | 0 | result = targetExhausted; |
487 | 0 | break; |
488 | 0 | } |
489 | | |
490 | 6.45M | if (ch <= UNI_MAX_BMP) |
491 | 6.45M | { |
492 | | /* Target is a character <= 0xFFFF */ |
493 | | /* UTF-16 surrogate values are illegal in UTF-32 */ |
494 | 6.45M | if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) |
495 | 0 | { |
496 | 0 | if (flags == strictConversion) |
497 | 0 | { |
498 | 0 | source -= (extraBytesToRead + 1); /* return to the illegal value itself */ |
499 | 0 | result = sourceIllegal; |
500 | 0 | break; |
501 | 0 | } |
502 | 0 | else |
503 | 0 | { |
504 | 0 | if (!computeLength) |
505 | 0 | *target++ = setWcharFrom(UNI_REPLACEMENT_CHAR); |
506 | 0 | else |
507 | 0 | target++; |
508 | 0 | } |
509 | 0 | } |
510 | 6.45M | else |
511 | 6.45M | { |
512 | 6.45M | if (!computeLength) |
513 | 3.22M | *target++ = setWcharFrom((WCHAR)ch); /* normal case */ |
514 | 3.22M | else |
515 | 3.22M | target++; |
516 | 6.45M | } |
517 | 6.45M | } |
518 | 798 | else if (ch > UNI_MAX_UTF16) |
519 | 0 | { |
520 | 0 | if (flags == strictConversion) |
521 | 0 | { |
522 | 0 | result = sourceIllegal; |
523 | 0 | source -= (extraBytesToRead + 1); /* return to the start */ |
524 | 0 | break; /* Bail out; shouldn't continue */ |
525 | 0 | } |
526 | 0 | else |
527 | 0 | { |
528 | 0 | if (!computeLength) |
529 | 0 | *target++ = setWcharFrom(UNI_REPLACEMENT_CHAR); |
530 | 0 | else |
531 | 0 | target++; |
532 | 0 | } |
533 | 0 | } |
534 | 798 | else |
535 | 798 | { |
536 | | /* target is a character in range 0xFFFF - 0x10FFFF. */ |
537 | 798 | if ((target + 1 >= targetEnd) && (!computeLength)) |
538 | 0 | { |
539 | 0 | source -= (extraBytesToRead + 1); /* Back up source pointer! */ |
540 | 0 | result = targetExhausted; |
541 | 0 | break; |
542 | 0 | } |
543 | | |
544 | 798 | ch -= halfBase; |
545 | | |
546 | 798 | if (!computeLength) |
547 | 394 | { |
548 | 394 | *target++ = setWcharFrom((WCHAR)((ch >> halfShift) + UNI_SUR_HIGH_START)); |
549 | 394 | *target++ = setWcharFrom((WCHAR)((ch & halfMask) + UNI_SUR_LOW_START)); |
550 | 394 | } |
551 | 404 | else |
552 | 404 | { |
553 | 404 | target++; |
554 | 404 | target++; |
555 | 404 | } |
556 | 798 | } |
557 | 6.45M | } |
558 | | |
559 | 1.97k | *sourceStart = source; |
560 | 1.97k | *targetStart = target; |
561 | 1.97k | return result; |
562 | 1.97k | } |
563 | | |
564 | | /** |
565 | | * WinPR built-in Unicode API |
566 | | */ |
567 | | |
568 | | static int winpr_ConvertUTF8toUTF16(const uint8_t* src, int cchSrc, uint16_t* dst, int cchDst) |
569 | 1.97k | { |
570 | 1.97k | size_t length = 0; |
571 | 1.97k | uint16_t* dstBeg = NULL; |
572 | 1.97k | uint16_t* dstEnd = NULL; |
573 | 1.97k | const uint8_t* srcBeg = NULL; |
574 | 1.97k | const uint8_t* srcEnd = NULL; |
575 | 1.97k | ConversionResult result = sourceIllegal; |
576 | | |
577 | 1.97k | if (cchSrc == -1) |
578 | 0 | cchSrc = (int)strnlen((const char*)src, INT32_MAX - 1) + 1; |
579 | | |
580 | 1.97k | srcBeg = src; |
581 | 1.97k | srcEnd = &src[cchSrc]; |
582 | | |
583 | 1.97k | if (cchDst == 0) |
584 | 1.02k | { |
585 | 1.02k | result = |
586 | 1.02k | winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion); |
587 | | |
588 | 1.02k | length = dstBeg - (uint16_t*)NULL; |
589 | 1.02k | } |
590 | 950 | else |
591 | 950 | { |
592 | 950 | dstBeg = dst; |
593 | 950 | dstEnd = &dst[cchDst]; |
594 | | |
595 | 950 | result = |
596 | 950 | winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion); |
597 | | |
598 | 950 | length = dstBeg - dst; |
599 | 950 | } |
600 | | |
601 | 1.97k | if (result == targetExhausted) |
602 | 0 | { |
603 | 0 | SetLastError(ERROR_INSUFFICIENT_BUFFER); |
604 | 0 | return 0; |
605 | 0 | } |
606 | | |
607 | 1.97k | return (result == conversionOK) ? WINPR_ASSERTING_INT_CAST(int, length) : 0; |
608 | 1.97k | } |
609 | | |
610 | | static int winpr_ConvertUTF16toUTF8(const uint16_t* src, int cchSrc, uint8_t* dst, int cchDst) |
611 | 398 | { |
612 | 398 | size_t length = 0; |
613 | 398 | uint8_t* dstBeg = NULL; |
614 | 398 | uint8_t* dstEnd = NULL; |
615 | 398 | const uint16_t* srcBeg = NULL; |
616 | 398 | const uint16_t* srcEnd = NULL; |
617 | 398 | ConversionResult result = sourceIllegal; |
618 | | |
619 | 398 | if (cchSrc == -1) |
620 | 0 | cchSrc = (int)_wcsnlen((const WCHAR*)src, INT32_MAX - 1) + 1; |
621 | | |
622 | 398 | srcBeg = src; |
623 | 398 | srcEnd = &src[cchSrc]; |
624 | | |
625 | 398 | if (cchDst == 0) |
626 | 245 | { |
627 | 245 | result = |
628 | 245 | winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion); |
629 | | |
630 | 245 | length = dstBeg - ((uint8_t*)NULL); |
631 | 245 | } |
632 | 153 | else |
633 | 153 | { |
634 | 153 | dstBeg = dst; |
635 | 153 | dstEnd = &dst[cchDst]; |
636 | | |
637 | 153 | result = |
638 | 153 | winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion); |
639 | | |
640 | 153 | length = dstBeg - dst; |
641 | 153 | } |
642 | | |
643 | 398 | if (result == targetExhausted) |
644 | 0 | { |
645 | 0 | SetLastError(ERROR_INSUFFICIENT_BUFFER); |
646 | 0 | return 0; |
647 | 0 | } |
648 | | |
649 | 398 | return (result == conversionOK) ? WINPR_ASSERTING_INT_CAST(int, length) : 0; |
650 | 398 | } |
651 | | |
652 | | /* --------------------------------------------------------------------- */ |
653 | | |
654 | | int int_MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, int cbMultiByte, |
655 | | LPWSTR lpWideCharStr, int cchWideChar) |
656 | 1.97k | { |
657 | 1.97k | size_t cbCharLen = (size_t)cbMultiByte; |
658 | | |
659 | 1.97k | WINPR_UNUSED(dwFlags); |
660 | | |
661 | | /* If cbMultiByte is 0, the function fails */ |
662 | 1.97k | if ((cbMultiByte == 0) || (cbMultiByte < -1)) |
663 | 0 | return 0; |
664 | | |
665 | 1.97k | if (cchWideChar < 0) |
666 | 0 | return -1; |
667 | | |
668 | 1.97k | if (cbMultiByte < 0) |
669 | 0 | { |
670 | 0 | const size_t len = strlen(lpMultiByteStr); |
671 | 0 | if (len >= INT32_MAX) |
672 | 0 | return 0; |
673 | 0 | cbCharLen = (int)len + 1; |
674 | 0 | } |
675 | 1.97k | else |
676 | 1.97k | cbCharLen = cbMultiByte; |
677 | | |
678 | 1.97k | WINPR_ASSERT(lpMultiByteStr); |
679 | 1.97k | switch (CodePage) |
680 | 1.97k | { |
681 | 0 | case CP_ACP: |
682 | 1.97k | case CP_UTF8: |
683 | 1.97k | break; |
684 | | |
685 | 0 | default: |
686 | 0 | WLog_ERR(TAG, "Unsupported encoding %u", CodePage); |
687 | 0 | return 0; |
688 | 1.97k | } |
689 | | |
690 | 1.97k | return winpr_ConvertUTF8toUTF16((const uint8_t*)lpMultiByteStr, |
691 | 1.97k | WINPR_ASSERTING_INT_CAST(int, cbCharLen), |
692 | 0 | (uint16_t*)lpWideCharStr, cchWideChar); |
693 | 1.97k | } |
694 | | |
695 | | int int_WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar, |
696 | | LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar, |
697 | | LPBOOL lpUsedDefaultChar) |
698 | 398 | { |
699 | 398 | size_t cbCharLen = (size_t)cchWideChar; |
700 | | |
701 | 398 | WINPR_UNUSED(dwFlags); |
702 | | /* If cchWideChar is 0, the function fails */ |
703 | 398 | if ((cchWideChar == 0) || (cchWideChar < -1)) |
704 | 0 | return 0; |
705 | | |
706 | 398 | if (cbMultiByte < 0) |
707 | 0 | return -1; |
708 | | |
709 | 398 | WINPR_ASSERT(lpWideCharStr); |
710 | | /* If cchWideChar is -1, the string is null-terminated */ |
711 | 398 | if (cchWideChar == -1) |
712 | 0 | { |
713 | 0 | const size_t len = _wcslen(lpWideCharStr); |
714 | 0 | if (len >= INT32_MAX) |
715 | 0 | return 0; |
716 | 0 | cbCharLen = (int)len + 1; |
717 | 0 | } |
718 | 398 | else |
719 | 398 | cbCharLen = cchWideChar; |
720 | | |
721 | | /* |
722 | | * if cbMultiByte is 0, the function returns the required buffer size |
723 | | * in bytes for lpMultiByteStr and makes no use of the output parameter itself. |
724 | | */ |
725 | | |
726 | 398 | return winpr_ConvertUTF16toUTF8((const uint16_t*)lpWideCharStr, |
727 | 398 | WINPR_ASSERTING_INT_CAST(int, cbCharLen), |
728 | 0 | (uint8_t*)lpMultiByteStr, cbMultiByte); |
729 | 398 | } |