/src/FreeRDP/winpr/libwinpr/crt/unicode_builtin.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright 2001-2004 Unicode, Inc. |
3 | | * |
4 | | * Disclaimer |
5 | | * |
6 | | * This source code is provided as is by Unicode, Inc. No claims are |
7 | | * made as to fitness for any particular purpose. No warranties of any |
8 | | * kind are expressed or implied. The recipient agrees to determine |
9 | | * applicability of information provided. If this file has been |
10 | | * purchased on magnetic or optical media from Unicode, Inc., the |
11 | | * sole remedy for any claim will be exchange of defective media |
12 | | * within 90 days of receipt. |
13 | | * |
14 | | * Limitations on Rights to Redistribute This Code |
15 | | * |
16 | | * Unicode, Inc. hereby grants the right to freely use the information |
17 | | * supplied in this file in the creation of products supporting the |
18 | | * Unicode Standard, and to make copies of this file in any form |
19 | | * for internal or external distribution as long as this notice |
20 | | * remains attached. |
21 | | */ |
22 | | |
23 | | /* --------------------------------------------------------------------- |
24 | | |
25 | | Conversions between UTF32, UTF-16, and UTF-8. Source code file. |
26 | | Author: Mark E. Davis, 1994. |
27 | | Rev History: Rick McGowan, fixes & updates May 2001. |
28 | | Sept 2001: fixed const & error conditions per |
29 | | mods suggested by S. Parent & A. Lillich. |
30 | | June 2002: Tim Dodd added detection and handling of incomplete |
31 | | source sequences, enhanced error detection, added casts |
32 | | to eliminate compiler warnings. |
33 | | July 2003: slight mods to back out aggressive FFFE detection. |
34 | | Jan 2004: updated switches in from-UTF8 conversions. |
35 | | Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. |
36 | | |
37 | | See the header file "utf.h" for complete documentation. |
38 | | |
39 | | ------------------------------------------------------------------------ */ |
40 | | |
41 | | #include <winpr/wtypes.h> |
42 | | #include <winpr/string.h> |
43 | | #include <winpr/assert.h> |
44 | | |
45 | | #include "unicode.h" |
46 | | |
47 | | #include "../log.h" |
48 | | #define TAG WINPR_TAG("unicode") |
49 | | |
50 | | /* |
51 | | * Character Types: |
52 | | * |
53 | | * UTF8: uint8_t 8 bits |
54 | | * UTF16: uint16_t 16 bits |
55 | | * UTF32: uint32_t 32 bits |
56 | | */ |
57 | | |
/*
 * Some fundamental constants.
 *
 * Every macro expansion is fully parenthesized so that it behaves as a
 * single primary expression wherever it is used (e.g. after sizeof or
 * next to a postfix operator).
 */
#define UNI_REPLACEMENT_CHAR ((uint32_t)0x0000FFFD) /* U+FFFD */
#define UNI_MAX_BMP ((uint32_t)0x0000FFFF)          /* largest value that fits one UTF-16 unit */
#define UNI_MAX_UTF16 ((uint32_t)0x0010FFFF)        /* largest value encodable as a surrogate pair */
#define UNI_MAX_UTF32 ((uint32_t)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32 ((uint32_t)0x0010FFFF)

/* Outcome of a buffer conversion. */
typedef enum
{
	conversionOK,    /* conversion successful */
	sourceExhausted, /* partial character in source, but hit end */
	targetExhausted, /* insuff. room in target for conversion */
	sourceIllegal    /* source sequence is illegal/malformed */
} ConversionResult;

/* Selects how malformed input is treated by the converters. */
typedef enum
{
	strictConversion = 0,
	lenientConversion
} ConversionFlags;

/* A surrogate half carries 10 payload bits. */
static const int halfShift = 10; /* used for shifting by 10 bits */

static const uint32_t halfBase = 0x0010000UL; /* first code point above the BMP */
static const uint32_t halfMask = 0x3FFUL;     /* low 10 bits */

/* UTF-16 surrogate ranges (inclusive bounds). */
#define UNI_SUR_HIGH_START ((uint32_t)0xD800)
#define UNI_SUR_HIGH_END ((uint32_t)0xDBFF)
#define UNI_SUR_LOW_START ((uint32_t)0xDC00)
#define UNI_SUR_LOW_END ((uint32_t)0xDFFF)
88 | | |
89 | | /* --------------------------------------------------------------------- */ |
90 | | |
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence; the entry
 * is the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 sequences never have 4 or 5 trailing bytes.
 * Those entries are left as-is for anyone who may want to do such a
 * conversion, which was allowed in earlier algorithms.
 */
static const char trailingBytesForUTF8[256] = {
	/* 0x00 - 0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 - 0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x30 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x40 - 0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x60 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x80 - 0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xA0 - 0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xB0 - 0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xC0 - 0xCF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0 - 0xDF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 - 0xEF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xF0 - 0xFF */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
108 | | |
/*
 * Magic values subtracted from the accumulated value during UTF-8
 * decoding, indexed by the number of trailing bytes. Each entry is the
 * sum of the marker bits of the lead byte and of every continuation
 * byte (0x80), shifted to the position they end up at after
 * accumulation (truncated to 32 bits). There are as many values as
 * there might be trailing bytes in a UTF-8 sequence.
 */
static const uint32_t offsetsFromUTF8[6] = {
	0x00000000UL, /* 0 trailing bytes: nothing to strip */
	0x00003080UL, /* 1: (0xC0 << 6) + 0x80 */
	0x000E2080UL, /* 2: (0xE0 << 12) + (0x80 << 6) + 0x80 */
	0x03C82080UL, /* 3: (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
	0xFA082080UL, /* 4: not legal UTF-8 */
	0x82082080UL  /* 5: not legal UTF-8 */
};
116 | | |
/*
 * Once the bits are split out into bytes of UTF-8, this is the mask
 * OR-ed into the first byte, indexed by the total number of bytes in
 * the sequence. There are as many entries in this table as there are
 * UTF-8 sequence types (i.e., one-byte sequence, two-byte, etc.).
 * Remember that sequences for *legal* UTF-8 will be 4 or fewer bytes
 * total.
 */
static const uint8_t firstByteMark[7] = {
	0x00, /* 0: unused */
	0x00, /* 1 byte  */
	0xC0, /* 2 bytes */
	0xE0, /* 3 bytes */
	0xF0, /* 4 bytes */
	0xF8, /* 5 bytes (not legal UTF-8) */
	0xFC  /* 6 bytes (not legal UTF-8) */
};
125 | | |
126 | | /* --------------------------------------------------------------------- */ |
127 | | |
128 | | /* The interface converts a whole buffer to avoid function-call overhead. |
129 | | * Constants have been gathered. Loops & conditionals have been removed as |
130 | | * much as possible for efficiency, in favor of drop-through switches. |
131 | | * (See "Note A" at the bottom of the file for equivalent code.) |
132 | | * If your compiler supports it, the "isLegalUTF8" call can be turned |
133 | | * into an inline function. |
134 | | */ |
135 | | |
136 | | /* --------------------------------------------------------------------- */ |
137 | | |
138 | | static ConversionResult winpr_ConvertUTF16toUTF8_Internal(const uint16_t** sourceStart, |
139 | | const uint16_t* sourceEnd, |
140 | | uint8_t** targetStart, uint8_t* targetEnd, |
141 | | ConversionFlags flags) |
142 | 104k | { |
143 | 104k | bool computeLength = (!targetEnd) ? true : false; |
144 | 104k | const uint16_t* source = *sourceStart; |
145 | 104k | uint8_t* target = *targetStart; |
146 | 104k | ConversionResult result = conversionOK; |
147 | | |
148 | 3.64M | while (source < sourceEnd) |
149 | 3.54M | { |
150 | 3.54M | uint32_t ch = 0; |
151 | 3.54M | unsigned short bytesToWrite = 0; |
152 | 3.54M | const uint32_t byteMask = 0xBF; |
153 | 3.54M | const uint32_t byteMark = 0x80; |
154 | 3.54M | const uint16_t* oldSource = |
155 | 3.54M | source; /* In case we have to back up because of target overflow. */ |
156 | | |
157 | 3.54M | ch = *source++; |
158 | | |
159 | | /* If we have a surrogate pair, convert to UTF32 first. */ |
160 | 3.54M | if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) |
161 | 760 | { |
162 | | /* If the 16 bits following the high surrogate are in the source buffer... */ |
163 | 760 | if (source < sourceEnd) |
164 | 751 | { |
165 | 751 | uint32_t ch2 = *source; |
166 | | |
167 | | /* If it's a low surrogate, convert to UTF32. */ |
168 | 751 | if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) |
169 | 513 | { |
170 | 513 | ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) + |
171 | 513 | halfBase; |
172 | 513 | ++source; |
173 | 513 | } |
174 | 238 | else if (flags == strictConversion) |
175 | 238 | { |
176 | | /* it's an unpaired high surrogate */ |
177 | 238 | --source; /* return to the illegal value itself */ |
178 | 238 | result = sourceIllegal; |
179 | 238 | break; |
180 | 238 | } |
181 | 751 | } |
182 | 9 | else |
183 | 9 | { |
184 | | /* We don't have the 16 bits following the high surrogate. */ |
185 | 9 | --source; /* return to the high surrogate */ |
186 | 9 | result = sourceExhausted; |
187 | 9 | break; |
188 | 9 | } |
189 | 760 | } |
190 | 3.54M | else if (flags == strictConversion) |
191 | 3.54M | { |
192 | | /* UTF-16 surrogate values are illegal in UTF-32 */ |
193 | 3.54M | if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) |
194 | 6.08k | { |
195 | 6.08k | --source; /* return to the illegal value itself */ |
196 | 6.08k | result = sourceIllegal; |
197 | 6.08k | break; |
198 | 6.08k | } |
199 | 3.54M | } |
200 | | |
201 | | /* Figure out how many bytes the result will require */ |
202 | 3.54M | if (ch < (uint32_t)0x80) |
203 | 2.52M | { |
204 | 2.52M | bytesToWrite = 1; |
205 | 2.52M | } |
206 | 1.01M | else if (ch < (uint32_t)0x800) |
207 | 59.9k | { |
208 | 59.9k | bytesToWrite = 2; |
209 | 59.9k | } |
210 | 954k | else if (ch < (uint32_t)0x10000) |
211 | 954k | { |
212 | 954k | bytesToWrite = 3; |
213 | 954k | } |
214 | 513 | else if (ch < (uint32_t)0x110000) |
215 | 513 | { |
216 | 513 | bytesToWrite = 4; |
217 | 513 | } |
218 | 0 | else |
219 | 0 | { |
220 | 0 | bytesToWrite = 3; |
221 | 0 | ch = UNI_REPLACEMENT_CHAR; |
222 | 0 | } |
223 | | |
224 | 3.54M | target += bytesToWrite; |
225 | | |
226 | 3.54M | if ((target > targetEnd) && (!computeLength)) |
227 | 361 | { |
228 | 361 | source = oldSource; /* Back up source pointer! */ |
229 | 361 | target -= bytesToWrite; |
230 | 361 | result = targetExhausted; |
231 | 361 | break; |
232 | 361 | } |
233 | | |
234 | 3.54M | if (!computeLength) |
235 | 1.83M | { |
236 | 1.83M | switch (bytesToWrite) |
237 | 1.83M | { |
238 | | /* note: everything falls through. */ |
239 | 260 | case 4: |
240 | 260 | *--target = (uint8_t)((ch | byteMark) & byteMask); |
241 | 260 | ch >>= 6; |
242 | | /* fallthrough */ |
243 | 260 | WINPR_FALLTHROUGH |
244 | 499k | case 3: |
245 | 499k | *--target = (uint8_t)((ch | byteMark) & byteMask); |
246 | 499k | ch >>= 6; |
247 | | /* fallthrough */ |
248 | 499k | WINPR_FALLTHROUGH |
249 | | |
250 | 530k | case 2: |
251 | 530k | *--target = (uint8_t)((ch | byteMark) & byteMask); |
252 | 530k | ch >>= 6; |
253 | | /* fallthrough */ |
254 | 530k | WINPR_FALLTHROUGH |
255 | | |
256 | 1.83M | case 1: |
257 | 1.83M | *--target = (uint8_t)(ch | firstByteMark[bytesToWrite]); |
258 | 1.83M | } |
259 | 1.83M | } |
260 | 1.70M | else |
261 | 1.70M | { |
262 | 1.70M | switch (bytesToWrite) |
263 | 1.70M | { |
264 | | /* note: everything falls through. */ |
265 | 253 | case 4: |
266 | 253 | --target; |
267 | | /* fallthrough */ |
268 | 253 | WINPR_FALLTHROUGH |
269 | | |
270 | 455k | case 3: |
271 | 455k | --target; |
272 | | /* fallthrough */ |
273 | 455k | WINPR_FALLTHROUGH |
274 | | |
275 | 484k | case 2: |
276 | 484k | --target; |
277 | | /* fallthrough */ |
278 | 484k | WINPR_FALLTHROUGH |
279 | | |
280 | 1.70M | case 1: |
281 | 1.70M | --target; |
282 | 1.70M | } |
283 | 1.70M | } |
284 | | |
285 | 3.54M | target += bytesToWrite; |
286 | 3.54M | } |
287 | | |
288 | 104k | *sourceStart = source; |
289 | 104k | *targetStart = target; |
290 | 104k | return result; |
291 | 104k | } |
292 | | |
293 | | /* --------------------------------------------------------------------- */ |
294 | | |
/*
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 * This must be called with the length pre-determined by the first byte.
 * If not calling this from ConvertUTF8to*, then the length can be set by:
 *  length = trailingBytesForUTF8[*source]+1;
 * and the sequence is illegal right away if there aren't that many bytes
 * available.
 * If presented with a length > 4 (or < 1), this returns false. The Unicode
 * definition of UTF-8 goes up to 4-byte sequences.
 *
 * source: first byte of the sequence; source[0 .. length-1] are read.
 * length: total number of bytes in the sequence.
 *
 * The second byte is always checked against its complete legal range,
 * including the lower bound for the lead bytes 0xED and 0xF4, so that an
 * ASCII byte can never be accepted in a continuation position.
 */
static bool isLegalUTF8(const uint8_t* source, int length)
{
	uint8_t lead = 0;
	uint8_t minSecond = 0x80;
	uint8_t maxSecond = 0xBF;

	if ((length < 1) || (length > 4))
		return false;

	/* Third and fourth byte (if present) are plain continuation bytes. */
	for (int idx = length - 1; idx >= 2; idx--)
	{
		if ((source[idx] < 0x80) || (source[idx] > 0xBF))
			return false;
	}

	lead = source[0];

	if (length >= 2)
	{
		/* Some lead bytes restrict the range of the second byte further. */
		switch (lead)
		{
			case 0xE0: /* below 0xA0 would be an overlong 3-byte form */
				minSecond = 0xA0;
				break;

			case 0xED: /* above 0x9F would encode a UTF-16 surrogate */
				maxSecond = 0x9F;
				break;

			case 0xF0: /* below 0x90 would be an overlong 4-byte form */
				minSecond = 0x90;
				break;

			case 0xF4: /* above 0x8F would exceed U+10FFFF */
				maxSecond = 0x8F;
				break;

			default:
				break;
		}

		if ((source[1] < minSecond) || (source[1] > maxSecond))
			return false;
	}

	/* 0x80..0xBF are continuation bytes, 0xC0/0xC1 only start overlong forms. */
	if ((lead >= 0x80) && (lead < 0xC2))
		return false;

	/* Lead bytes above 0xF4 would encode values beyond U+10FFFF. */
	if (lead > 0xF4)
		return false;

	return true;
}
378 | | |
379 | | /* --------------------------------------------------------------------- */ |
380 | | |
381 | | static ConversionResult winpr_ConvertUTF8toUTF16_Internal(const uint8_t** sourceStart, |
382 | | const uint8_t* sourceEnd, |
383 | | uint16_t** targetStart, |
384 | | uint16_t* targetEnd, |
385 | | ConversionFlags flags) |
386 | 271k | { |
387 | 271k | bool computeLength = (!targetEnd) ? true : false; |
388 | 271k | ConversionResult result = conversionOK; |
389 | 271k | const uint8_t* source = *sourceStart; |
390 | 271k | uint16_t* target = *targetStart; |
391 | | |
392 | 13.3M | while (source < sourceEnd) |
393 | 13.0M | { |
394 | 13.0M | uint32_t ch = 0; |
395 | 13.0M | unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; |
396 | | |
397 | 13.0M | if ((source + extraBytesToRead) >= sourceEnd) |
398 | 1 | { |
399 | 1 | result = sourceExhausted; |
400 | 1 | break; |
401 | 1 | } |
402 | | |
403 | | /* Do this check whether lenient or strict */ |
404 | 13.0M | if (!isLegalUTF8(source, extraBytesToRead + 1)) |
405 | 71 | { |
406 | 71 | result = sourceIllegal; |
407 | 71 | break; |
408 | 71 | } |
409 | | |
410 | | /* |
411 | | * The cases all fall through. See "Note A" below. |
412 | | */ |
413 | 13.0M | switch (extraBytesToRead) |
414 | 13.0M | { |
415 | 0 | case 5: |
416 | 0 | ch += *source++; |
417 | 0 | ch <<= 6; /* remember, illegal UTF-8 */ |
418 | | /* fallthrough */ |
419 | 0 | WINPR_FALLTHROUGH |
420 | |
|
421 | 0 | case 4: |
422 | 0 | ch += *source++; |
423 | 0 | ch <<= 6; /* remember, illegal UTF-8 */ |
424 | | /* fallthrough */ |
425 | 0 | WINPR_FALLTHROUGH |
426 | |
|
427 | 800 | case 3: |
428 | 800 | ch += *source++; |
429 | 800 | ch <<= 6; |
430 | | /* fallthrough */ |
431 | 800 | WINPR_FALLTHROUGH |
432 | | |
433 | 1.49k | case 2: |
434 | 1.49k | ch += *source++; |
435 | 1.49k | ch <<= 6; |
436 | | /* fallthrough */ |
437 | 1.49k | WINPR_FALLTHROUGH |
438 | | |
439 | 1.78k | case 1: |
440 | 1.78k | ch += *source++; |
441 | 1.78k | ch <<= 6; |
442 | | /* fallthrough */ |
443 | 1.78k | WINPR_FALLTHROUGH |
444 | | |
445 | 13.0M | case 0: |
446 | 13.0M | ch += *source++; |
447 | 13.0M | } |
448 | | |
449 | 13.0M | ch -= offsetsFromUTF8[extraBytesToRead]; |
450 | | |
451 | 13.0M | if ((target >= targetEnd) && (!computeLength)) |
452 | 0 | { |
453 | 0 | source -= (extraBytesToRead + 1); /* Back up source pointer! */ |
454 | 0 | result = targetExhausted; |
455 | 0 | break; |
456 | 0 | } |
457 | | |
458 | 13.0M | if (ch <= UNI_MAX_BMP) |
459 | 13.0M | { |
460 | | /* Target is a character <= 0xFFFF */ |
461 | | /* UTF-16 surrogate values are illegal in UTF-32 */ |
462 | 13.0M | if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) |
463 | 0 | { |
464 | 0 | if (flags == strictConversion) |
465 | 0 | { |
466 | 0 | source -= (extraBytesToRead + 1); /* return to the illegal value itself */ |
467 | 0 | result = sourceIllegal; |
468 | 0 | break; |
469 | 0 | } |
470 | 0 | else |
471 | 0 | { |
472 | 0 | if (!computeLength) |
473 | 0 | *target++ = UNI_REPLACEMENT_CHAR; |
474 | 0 | else |
475 | 0 | target++; |
476 | 0 | } |
477 | 0 | } |
478 | 13.0M | else |
479 | 13.0M | { |
480 | 13.0M | if (!computeLength) |
481 | 9.76M | *target++ = (uint16_t)ch; /* normal case */ |
482 | 3.29M | else |
483 | 3.29M | target++; |
484 | 13.0M | } |
485 | 13.0M | } |
486 | 800 | else if (ch > UNI_MAX_UTF16) |
487 | 0 | { |
488 | 0 | if (flags == strictConversion) |
489 | 0 | { |
490 | 0 | result = sourceIllegal; |
491 | 0 | source -= (extraBytesToRead + 1); /* return to the start */ |
492 | 0 | break; /* Bail out; shouldn't continue */ |
493 | 0 | } |
494 | 0 | else |
495 | 0 | { |
496 | 0 | if (!computeLength) |
497 | 0 | *target++ = UNI_REPLACEMENT_CHAR; |
498 | 0 | else |
499 | 0 | target++; |
500 | 0 | } |
501 | 0 | } |
502 | 800 | else |
503 | 800 | { |
504 | | /* target is a character in range 0xFFFF - 0x10FFFF. */ |
505 | 800 | if ((target + 1 >= targetEnd) && (!computeLength)) |
506 | 0 | { |
507 | 0 | source -= (extraBytesToRead + 1); /* Back up source pointer! */ |
508 | 0 | result = targetExhausted; |
509 | 0 | break; |
510 | 0 | } |
511 | | |
512 | 800 | ch -= halfBase; |
513 | | |
514 | 800 | if (!computeLength) |
515 | 390 | { |
516 | 390 | *target++ = (uint16_t)((ch >> halfShift) + UNI_SUR_HIGH_START); |
517 | 390 | *target++ = (uint16_t)((ch & halfMask) + UNI_SUR_LOW_START); |
518 | 390 | } |
519 | 410 | else |
520 | 410 | { |
521 | 410 | target++; |
522 | 410 | target++; |
523 | 410 | } |
524 | 800 | } |
525 | 13.0M | } |
526 | | |
527 | 271k | *sourceStart = source; |
528 | 271k | *targetStart = target; |
529 | 271k | return result; |
530 | 271k | } |
531 | | |
532 | | /** |
533 | | * WinPR built-in Unicode API |
534 | | */ |
535 | | |
536 | | static int winpr_ConvertUTF8toUTF16(const uint8_t* src, int cchSrc, uint16_t* dst, int cchDst) |
537 | 271k | { |
538 | 271k | size_t length = 0; |
539 | 271k | uint16_t* dstBeg = NULL; |
540 | 271k | uint16_t* dstEnd = NULL; |
541 | 271k | const uint8_t* srcBeg = NULL; |
542 | 271k | const uint8_t* srcEnd = NULL; |
543 | 271k | ConversionResult result = sourceIllegal; |
544 | | |
545 | 271k | if (cchSrc == -1) |
546 | 0 | cchSrc = strlen((char*)src) + 1; |
547 | | |
548 | 271k | srcBeg = src; |
549 | 271k | srcEnd = &src[cchSrc]; |
550 | | |
551 | 271k | if (cchDst == 0) |
552 | 1.04k | { |
553 | 1.04k | result = |
554 | 1.04k | winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion); |
555 | | |
556 | 1.04k | length = dstBeg - (uint16_t*)NULL; |
557 | 1.04k | } |
558 | 270k | else |
559 | 270k | { |
560 | 270k | dstBeg = dst; |
561 | 270k | dstEnd = &dst[cchDst]; |
562 | | |
563 | 270k | result = |
564 | 270k | winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion); |
565 | | |
566 | 270k | length = dstBeg - dst; |
567 | 270k | } |
568 | | |
569 | 271k | if (result == targetExhausted) |
570 | 0 | { |
571 | 0 | SetLastError(ERROR_INSUFFICIENT_BUFFER); |
572 | 0 | return 0; |
573 | 0 | } |
574 | | |
575 | 271k | return (result == conversionOK) ? length : 0; |
576 | 271k | } |
577 | | |
578 | | static int winpr_ConvertUTF16toUTF8(const uint16_t* src, int cchSrc, uint8_t* dst, int cchDst) |
579 | 104k | { |
580 | 104k | size_t length = 0; |
581 | 104k | uint8_t* dstBeg = NULL; |
582 | 104k | uint8_t* dstEnd = NULL; |
583 | 104k | const uint16_t* srcBeg = NULL; |
584 | 104k | const uint16_t* srcEnd = NULL; |
585 | 104k | ConversionResult result = sourceIllegal; |
586 | | |
587 | 104k | if (cchSrc == -1) |
588 | 0 | cchSrc = _wcslen((uint16_t*)src) + 1; |
589 | | |
590 | 104k | srcBeg = src; |
591 | 104k | srcEnd = &src[cchSrc]; |
592 | | |
593 | 104k | if (cchDst == 0) |
594 | 45.8k | { |
595 | 45.8k | result = |
596 | 45.8k | winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion); |
597 | | |
598 | 45.8k | length = dstBeg - ((uint8_t*)NULL); |
599 | 45.8k | } |
600 | 58.8k | else |
601 | 58.8k | { |
602 | 58.8k | dstBeg = dst; |
603 | 58.8k | dstEnd = &dst[cchDst]; |
604 | | |
605 | 58.8k | result = |
606 | 58.8k | winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion); |
607 | | |
608 | 58.8k | length = dstBeg - dst; |
609 | 58.8k | } |
610 | | |
611 | 104k | if (result == targetExhausted) |
612 | 361 | { |
613 | 361 | SetLastError(ERROR_INSUFFICIENT_BUFFER); |
614 | 361 | return 0; |
615 | 361 | } |
616 | | |
617 | 104k | return (result == conversionOK) ? length : 0; |
618 | 104k | } |
619 | | |
620 | | /* --------------------------------------------------------------------- */ |
621 | | |
622 | | int int_MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, int cbMultiByte, |
623 | | LPWSTR lpWideCharStr, int cchWideChar) |
624 | 271k | { |
625 | 271k | size_t cbCharLen = (size_t)cbMultiByte; |
626 | | |
627 | 271k | WINPR_UNUSED(dwFlags); |
628 | | |
629 | | /* If cbMultiByte is 0, the function fails */ |
630 | 271k | if ((cbMultiByte == 0) || (cbMultiByte < -1)) |
631 | 0 | return 0; |
632 | | |
633 | 271k | if (cchWideChar < 0) |
634 | 0 | return -1; |
635 | | |
636 | 271k | if (cbMultiByte < 0) |
637 | 0 | { |
638 | 0 | const size_t len = strlen(lpMultiByteStr); |
639 | 0 | if (len >= INT32_MAX) |
640 | 0 | return 0; |
641 | 0 | cbCharLen = (int)len + 1; |
642 | 0 | } |
643 | 271k | else |
644 | 271k | cbCharLen = cbMultiByte; |
645 | | |
646 | 271k | WINPR_ASSERT(lpMultiByteStr); |
647 | 271k | switch (CodePage) |
648 | 271k | { |
649 | 0 | case CP_ACP: |
650 | 271k | case CP_UTF8: |
651 | 271k | break; |
652 | | |
653 | 0 | default: |
654 | 0 | WLog_ERR(TAG, "Unsupported encoding %u", CodePage); |
655 | 0 | return 0; |
656 | 271k | } |
657 | | |
658 | 271k | return winpr_ConvertUTF8toUTF16((const uint8_t*)lpMultiByteStr, cbCharLen, |
659 | 271k | (uint16_t*)lpWideCharStr, cchWideChar); |
660 | 271k | } |
661 | | |
662 | | int int_WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar, |
663 | | LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar, |
664 | | LPBOOL lpUsedDefaultChar) |
665 | 104k | { |
666 | 104k | size_t cbCharLen = (size_t)cchWideChar; |
667 | | |
668 | 104k | WINPR_UNUSED(dwFlags); |
669 | | /* If cchWideChar is 0, the function fails */ |
670 | 104k | if ((cchWideChar == 0) || (cchWideChar < -1)) |
671 | 0 | return 0; |
672 | | |
673 | 104k | if (cbMultiByte < 0) |
674 | 0 | return -1; |
675 | | |
676 | 104k | WINPR_ASSERT(lpWideCharStr); |
677 | | /* If cchWideChar is -1, the string is null-terminated */ |
678 | 104k | if (cchWideChar == -1) |
679 | 0 | { |
680 | 0 | const size_t len = _wcslen(lpWideCharStr); |
681 | 0 | if (len >= INT32_MAX) |
682 | 0 | return 0; |
683 | 0 | cbCharLen = (int)len + 1; |
684 | 0 | } |
685 | 104k | else |
686 | 104k | cbCharLen = cchWideChar; |
687 | | |
688 | | /* |
689 | | * if cbMultiByte is 0, the function returns the required buffer size |
690 | | * in bytes for lpMultiByteStr and makes no use of the output parameter itself. |
691 | | */ |
692 | | |
693 | 104k | return winpr_ConvertUTF16toUTF8((const uint16_t*)lpWideCharStr, cbCharLen, |
694 | 104k | (uint8_t*)lpMultiByteStr, cbMultiByte); |
695 | 104k | } |