/src/icu/source/common/ustrtrns.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ****************************************************************************** |
5 | | * |
6 | | * Copyright (C) 2001-2016, International Business Machines |
7 | | * Corporation and others. All Rights Reserved. |
8 | | * |
9 | | ****************************************************************************** |
10 | | * |
11 | | * File ustrtrns.cpp |
12 | | * |
13 | | * Modification History: |
14 | | * |
15 | | * Date Name Description |
16 | | * 9/10/2001 Ram Creation. |
17 | | ****************************************************************************** |
18 | | */ |
19 | | |
20 | | /******************************************************************************* |
21 | | * |
22 | | * u_strTo* and u_strFrom* APIs |
23 | | * WCS functions moved to ustr_wcs.c for better modularization |
24 | | * |
25 | | ******************************************************************************* |
26 | | */ |
27 | | |
28 | | |
29 | | #include "unicode/putil.h" |
30 | | #include "unicode/ustring.h" |
31 | | #include "unicode/utf.h" |
32 | | #include "unicode/utf8.h" |
33 | | #include "unicode/utf16.h" |
34 | | #include "cstring.h" |
35 | | #include "cmemory.h" |
36 | | #include "ustr_imp.h" |
37 | | #include "uassert.h" |
38 | | |
39 | | U_CAPI UChar* U_EXPORT2 |
40 | | u_strFromUTF32WithSub(UChar *dest, |
41 | | int32_t destCapacity, |
42 | | int32_t *pDestLength, |
43 | | const UChar32 *src, |
44 | | int32_t srcLength, |
45 | | UChar32 subchar, int32_t *pNumSubstitutions, |
46 | 0 | UErrorCode *pErrorCode) { |
47 | 0 | const UChar32 *srcLimit; |
48 | 0 | UChar32 ch; |
49 | 0 | UChar *destLimit; |
50 | 0 | UChar *pDest; |
51 | 0 | int32_t reqLength; |
52 | 0 | int32_t numSubstitutions; |
53 | | |
54 | | /* args check */ |
55 | 0 | if(U_FAILURE(*pErrorCode)){ |
56 | 0 | return NULL; |
57 | 0 | } |
58 | 0 | if( (src==NULL && srcLength!=0) || srcLength < -1 || |
59 | 0 | (destCapacity<0) || (dest == NULL && destCapacity > 0) || |
60 | 0 | subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
61 | 0 | ) { |
62 | 0 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
63 | 0 | return NULL; |
64 | 0 | } |
65 | | |
66 | 0 | if(pNumSubstitutions != NULL) { |
67 | 0 | *pNumSubstitutions = 0; |
68 | 0 | } |
69 | |
|
70 | 0 | pDest = dest; |
71 | 0 | destLimit = (dest!=NULL)?(dest + destCapacity):NULL; |
72 | 0 | reqLength = 0; |
73 | 0 | numSubstitutions = 0; |
74 | |
|
75 | 0 | if(srcLength < 0) { |
76 | | /* simple loop for conversion of a NUL-terminated BMP string */ |
77 | 0 | while((ch=*src) != 0 && |
78 | 0 | ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) { |
79 | 0 | ++src; |
80 | 0 | if(pDest < destLimit) { |
81 | 0 | *pDest++ = (UChar)ch; |
82 | 0 | } else { |
83 | 0 | ++reqLength; |
84 | 0 | } |
85 | 0 | } |
86 | 0 | srcLimit = src; |
87 | 0 | if(ch != 0) { |
88 | | /* "complicated" case, find the end of the remaining string */ |
89 | 0 | while(*++srcLimit != 0) {} |
90 | 0 | } |
91 | 0 | } else { |
92 | 0 | srcLimit = (src!=NULL)?(src + srcLength):NULL; |
93 | 0 | } |
94 | | |
95 | | /* convert with length */ |
96 | 0 | while(src < srcLimit) { |
97 | 0 | ch = *src++; |
98 | 0 | do { |
99 | | /* usually "loops" once; twice only for writing subchar */ |
100 | 0 | if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) { |
101 | 0 | if(pDest < destLimit) { |
102 | 0 | *pDest++ = (UChar)ch; |
103 | 0 | } else { |
104 | 0 | ++reqLength; |
105 | 0 | } |
106 | 0 | break; |
107 | 0 | } else if(0x10000 <= ch && ch <= 0x10ffff) { |
108 | 0 | if(pDest!=NULL && ((pDest + 2) <= destLimit)) { |
109 | 0 | *pDest++ = U16_LEAD(ch); |
110 | 0 | *pDest++ = U16_TRAIL(ch); |
111 | 0 | } else { |
112 | 0 | reqLength += 2; |
113 | 0 | } |
114 | 0 | break; |
115 | 0 | } else if((ch = subchar) < 0) { |
116 | | /* surrogate code point, or not a Unicode code point at all */ |
117 | 0 | *pErrorCode = U_INVALID_CHAR_FOUND; |
118 | 0 | return NULL; |
119 | 0 | } else { |
120 | 0 | ++numSubstitutions; |
121 | 0 | } |
122 | 0 | } while(TRUE); |
123 | 0 | } |
124 | | |
125 | 0 | reqLength += (int32_t)(pDest - dest); |
126 | 0 | if(pDestLength) { |
127 | 0 | *pDestLength = reqLength; |
128 | 0 | } |
129 | 0 | if(pNumSubstitutions != NULL) { |
130 | 0 | *pNumSubstitutions = numSubstitutions; |
131 | 0 | } |
132 | | |
133 | | /* Terminate the buffer */ |
134 | 0 | u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); |
135 | | |
136 | 0 | return dest; |
137 | 0 | } |
138 | | |
139 | | U_CAPI UChar* U_EXPORT2 |
140 | | u_strFromUTF32(UChar *dest, |
141 | | int32_t destCapacity, |
142 | | int32_t *pDestLength, |
143 | | const UChar32 *src, |
144 | | int32_t srcLength, |
145 | 0 | UErrorCode *pErrorCode) { |
146 | 0 | return u_strFromUTF32WithSub( |
147 | 0 | dest, destCapacity, pDestLength, |
148 | 0 | src, srcLength, |
149 | 0 | U_SENTINEL, NULL, |
150 | 0 | pErrorCode); |
151 | 0 | } |
152 | | |
153 | | U_CAPI UChar32* U_EXPORT2 |
154 | | u_strToUTF32WithSub(UChar32 *dest, |
155 | | int32_t destCapacity, |
156 | | int32_t *pDestLength, |
157 | | const UChar *src, |
158 | | int32_t srcLength, |
159 | | UChar32 subchar, int32_t *pNumSubstitutions, |
160 | 0 | UErrorCode *pErrorCode) { |
161 | 0 | const UChar *srcLimit; |
162 | 0 | UChar32 ch; |
163 | 0 | UChar ch2; |
164 | 0 | UChar32 *destLimit; |
165 | 0 | UChar32 *pDest; |
166 | 0 | int32_t reqLength; |
167 | 0 | int32_t numSubstitutions; |
168 | | |
169 | | /* args check */ |
170 | 0 | if(U_FAILURE(*pErrorCode)){ |
171 | 0 | return NULL; |
172 | 0 | } |
173 | 0 | if( (src==NULL && srcLength!=0) || srcLength < -1 || |
174 | 0 | (destCapacity<0) || (dest == NULL && destCapacity > 0) || |
175 | 0 | subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
176 | 0 | ) { |
177 | 0 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
178 | 0 | return NULL; |
179 | 0 | } |
180 | | |
181 | 0 | if(pNumSubstitutions != NULL) { |
182 | 0 | *pNumSubstitutions = 0; |
183 | 0 | } |
184 | |
|
185 | 0 | pDest = dest; |
186 | 0 | destLimit = (dest!=NULL)?(dest + destCapacity):NULL; |
187 | 0 | reqLength = 0; |
188 | 0 | numSubstitutions = 0; |
189 | |
|
190 | 0 | if(srcLength < 0) { |
191 | | /* simple loop for conversion of a NUL-terminated BMP string */ |
192 | 0 | while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) { |
193 | 0 | ++src; |
194 | 0 | if(pDest < destLimit) { |
195 | 0 | *pDest++ = ch; |
196 | 0 | } else { |
197 | 0 | ++reqLength; |
198 | 0 | } |
199 | 0 | } |
200 | 0 | srcLimit = src; |
201 | 0 | if(ch != 0) { |
202 | | /* "complicated" case, find the end of the remaining string */ |
203 | 0 | while(*++srcLimit != 0) {} |
204 | 0 | } |
205 | 0 | } else { |
206 | 0 | srcLimit = (src!=NULL)?(src + srcLength):NULL; |
207 | 0 | } |
208 | | |
209 | | /* convert with length */ |
210 | 0 | while(src < srcLimit) { |
211 | 0 | ch = *src++; |
212 | 0 | if(!U16_IS_SURROGATE(ch)) { |
213 | | /* write or count ch below */ |
214 | 0 | } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) { |
215 | 0 | ++src; |
216 | 0 | ch = U16_GET_SUPPLEMENTARY(ch, ch2); |
217 | 0 | } else if((ch = subchar) < 0) { |
218 | | /* unpaired surrogate */ |
219 | 0 | *pErrorCode = U_INVALID_CHAR_FOUND; |
220 | 0 | return NULL; |
221 | 0 | } else { |
222 | 0 | ++numSubstitutions; |
223 | 0 | } |
224 | 0 | if(pDest < destLimit) { |
225 | 0 | *pDest++ = ch; |
226 | 0 | } else { |
227 | 0 | ++reqLength; |
228 | 0 | } |
229 | 0 | } |
230 | | |
231 | 0 | reqLength += (int32_t)(pDest - dest); |
232 | 0 | if(pDestLength) { |
233 | 0 | *pDestLength = reqLength; |
234 | 0 | } |
235 | 0 | if(pNumSubstitutions != NULL) { |
236 | 0 | *pNumSubstitutions = numSubstitutions; |
237 | 0 | } |
238 | | |
239 | | /* Terminate the buffer */ |
240 | 0 | u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode); |
241 | |
|
242 | 0 | return dest; |
243 | 0 | } |
244 | | |
245 | | U_CAPI UChar32* U_EXPORT2 |
246 | | u_strToUTF32(UChar32 *dest, |
247 | | int32_t destCapacity, |
248 | | int32_t *pDestLength, |
249 | | const UChar *src, |
250 | | int32_t srcLength, |
251 | 0 | UErrorCode *pErrorCode) { |
252 | 0 | return u_strToUTF32WithSub( |
253 | 0 | dest, destCapacity, pDestLength, |
254 | 0 | src, srcLength, |
255 | 0 | U_SENTINEL, NULL, |
256 | 0 | pErrorCode); |
257 | 0 | } |
258 | | |
259 | | /* for utf8_nextCharSafeBodyTerminated() */ |
260 | | static const UChar32 |
261 | | utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; |
262 | | |
263 | | /* |
264 | | * Version of utf8_nextCharSafeBody() with the following differences: |
265 | | * - checks for NUL termination instead of length |
266 | | * - works with pointers instead of indexes |
267 | | * - always strict (strict==-1) |
268 | | * |
269 | | * *ps points to after the lead byte and will be moved to after the last trail byte. |
270 | | * c is the lead byte. |
271 | | * @return the code point, or U_SENTINEL |
272 | | */ |
273 | | static UChar32 |
274 | 0 | utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) { |
275 | 0 | const uint8_t *s=*ps; |
276 | 0 | uint8_t trail, illegal=0; |
277 | 0 | uint8_t count=U8_COUNT_TRAIL_BYTES(c); |
278 | 0 | U_ASSERT(count<6); |
279 | 0 | U8_MASK_LEAD_BYTE((c), count); |
280 | | /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ |
281 | 0 | switch(count) { |
282 | | /* each branch falls through to the next one */ |
283 | 0 | case 5: |
284 | 0 | case 4: |
285 | | /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ |
286 | 0 | illegal=1; |
287 | 0 | break; |
288 | 0 | case 3: |
289 | 0 | trail=(uint8_t)(*s++ - 0x80); |
290 | 0 | c=(c<<6)|trail; |
291 | 0 | if(trail>0x3f || c>=0x110) { |
292 | | /* not a trail byte, or code point>0x10ffff (outside Unicode) */ |
293 | 0 | illegal=1; |
294 | 0 | break; |
295 | 0 | } |
296 | 0 | U_FALLTHROUGH; |
297 | 0 | case 2: |
298 | 0 | trail=(uint8_t)(*s++ - 0x80); |
299 | 0 | if(trail>0x3f) { |
300 | | /* not a trail byte */ |
301 | 0 | illegal=1; |
302 | 0 | break; |
303 | 0 | } |
304 | 0 | c=(c<<6)|trail; |
305 | 0 | U_FALLTHROUGH; |
306 | 0 | case 1: |
307 | 0 | trail=(uint8_t)(*s++ - 0x80); |
308 | 0 | if(trail>0x3f) { |
309 | | /* not a trail byte */ |
310 | 0 | illegal=1; |
311 | 0 | } |
312 | 0 | c=(c<<6)|trail; |
313 | 0 | break; |
314 | 0 | case 0: |
315 | 0 | return U_SENTINEL; |
316 | | /* no default branch to optimize switch() - all values are covered */ |
317 | 0 | } |
318 | | |
319 | | /* correct sequence - all trail bytes have (b7..b6)==(10)? */ |
320 | | /* illegal is also set if count>=4 */ |
321 | 0 | if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) { |
322 | | /* error handling */ |
323 | | /* don't go beyond this sequence */ |
324 | 0 | s=*ps; |
325 | 0 | while(count>0 && U8_IS_TRAIL(*s)) { |
326 | 0 | ++s; |
327 | 0 | --count; |
328 | 0 | } |
329 | 0 | c=U_SENTINEL; |
330 | 0 | } |
331 | 0 | *ps=s; |
332 | 0 | return c; |
333 | 0 | } |
334 | | |
335 | | /* |
336 | | * Version of utf8_nextCharSafeBody() with the following differences: |
337 | | * - works with pointers instead of indexes |
338 | | * - always strict (strict==-1) |
339 | | * |
340 | | * *ps points to after the lead byte and will be moved to after the last trail byte. |
341 | | * c is the lead byte. |
342 | | * @return the code point, or U_SENTINEL |
343 | | */ |
344 | | static UChar32 |
345 | 0 | utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) { |
346 | 0 | const uint8_t *s=*ps; |
347 | 0 | uint8_t trail, illegal=0; |
348 | 0 | uint8_t count=U8_COUNT_TRAIL_BYTES(c); |
349 | 0 | if((limit-s)>=count) { |
350 | 0 | U8_MASK_LEAD_BYTE((c), count); |
351 | | /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ |
352 | 0 | switch(count) { |
353 | | /* each branch falls through to the next one */ |
354 | 0 | case 5: |
355 | 0 | case 4: |
356 | | /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ |
357 | 0 | illegal=1; |
358 | 0 | break; |
359 | 0 | case 3: |
360 | 0 | trail=*s++; |
361 | 0 | c=(c<<6)|(trail&0x3f); |
362 | 0 | if(c<0x110) { |
363 | 0 | illegal|=(trail&0xc0)^0x80; |
364 | 0 | } else { |
365 | | /* code point>0x10ffff, outside Unicode */ |
366 | 0 | illegal=1; |
367 | 0 | break; |
368 | 0 | } |
369 | 0 | U_FALLTHROUGH; |
370 | 0 | case 2: |
371 | 0 | trail=*s++; |
372 | 0 | c=(c<<6)|(trail&0x3f); |
373 | 0 | illegal|=(trail&0xc0)^0x80; |
374 | 0 | U_FALLTHROUGH; |
375 | 0 | case 1: |
376 | 0 | trail=*s++; |
377 | 0 | c=(c<<6)|(trail&0x3f); |
378 | 0 | illegal|=(trail&0xc0)^0x80; |
379 | 0 | break; |
380 | 0 | case 0: |
381 | 0 | return U_SENTINEL; |
382 | | /* no default branch to optimize switch() - all values are covered */ |
383 | 0 | } |
384 | 0 | } else { |
385 | 0 | illegal=1; /* too few bytes left */ |
386 | 0 | } |
387 | | |
388 | | /* correct sequence - all trail bytes have (b7..b6)==(10)? */ |
389 | | /* illegal is also set if count>=4 */ |
390 | 0 | U_ASSERT(illegal || count<UPRV_LENGTHOF(utf8_minLegal)); |
391 | 0 | if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) { |
392 | | /* error handling */ |
393 | | /* don't go beyond this sequence */ |
394 | 0 | s=*ps; |
395 | 0 | while(count>0 && s<limit && U8_IS_TRAIL(*s)) { |
396 | 0 | ++s; |
397 | 0 | --count; |
398 | 0 | } |
399 | 0 | c=U_SENTINEL; |
400 | 0 | } |
401 | 0 | *ps=s; |
402 | 0 | return c; |
403 | 0 | } |
404 | | |
405 | | U_CAPI UChar* U_EXPORT2 |
406 | | u_strFromUTF8WithSub(UChar *dest, |
407 | | int32_t destCapacity, |
408 | | int32_t *pDestLength, |
409 | | const char* src, |
410 | | int32_t srcLength, |
411 | | UChar32 subchar, int32_t *pNumSubstitutions, |
412 | 0 | UErrorCode *pErrorCode){ |
413 | 0 | UChar *pDest = dest; |
414 | 0 | UChar *pDestLimit = dest+destCapacity; |
415 | 0 | UChar32 ch; |
416 | 0 | int32_t reqLength = 0; |
417 | 0 | const uint8_t* pSrc = (const uint8_t*) src; |
418 | 0 | uint8_t t1, t2; /* trail bytes */ |
419 | 0 | int32_t numSubstitutions; |
420 | | |
421 | | /* args check */ |
422 | 0 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ |
423 | 0 | return NULL; |
424 | 0 | } |
425 | | |
426 | 0 | if( (src==NULL && srcLength!=0) || srcLength < -1 || |
427 | 0 | (destCapacity<0) || (dest == NULL && destCapacity > 0) || |
428 | 0 | subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
429 | 0 | ) { |
430 | 0 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
431 | 0 | return NULL; |
432 | 0 | } |
433 | | |
434 | 0 | if(pNumSubstitutions!=NULL) { |
435 | 0 | *pNumSubstitutions=0; |
436 | 0 | } |
437 | 0 | numSubstitutions=0; |
438 | | |
439 | | /* |
440 | | * Inline processing of UTF-8 byte sequences: |
441 | | * |
442 | | * Byte sequences for the most common characters are handled inline in |
443 | | * the conversion loops. In order to reduce the path lengths for those |
444 | | * characters, the tests are arranged in a kind of binary search. |
445 | | * ASCII (<=0x7f) is checked first, followed by the dividing point |
446 | | * between 2- and 3-byte sequences (0xe0). |
447 | | * The 3-byte branch is tested first to speed up CJK text. |
448 | | * The compiler should combine the subtractions for the two tests for 0xe0. |
449 | | * Each branch then tests for the other end of its range. |
450 | | */ |
451 | |
|
452 | 0 | if(srcLength < 0){ |
453 | | /* |
454 | | * Transform a NUL-terminated string. |
455 | | * The code explicitly checks for NULs only in the lead byte position. |
456 | | * A NUL byte in the trail byte position fails the trail byte range check anyway. |
457 | | */ |
458 | 0 | while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { |
459 | 0 | if(ch <= 0x7f){ |
460 | 0 | *pDest++=(UChar)ch; |
461 | 0 | ++pSrc; |
462 | 0 | } else { |
463 | 0 | if(ch > 0xe0) { |
464 | 0 | if( /* handle U+1000..U+CFFF inline */ |
465 | 0 | ch <= 0xec && |
466 | 0 | (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && |
467 | 0 | (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f |
468 | 0 | ) { |
469 | | /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
470 | 0 | *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); |
471 | 0 | pSrc += 3; |
472 | 0 | continue; |
473 | 0 | } |
474 | 0 | } else if(ch < 0xe0) { |
475 | 0 | if( /* handle U+0080..U+07FF inline */ |
476 | 0 | ch >= 0xc2 && |
477 | 0 | (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f |
478 | 0 | ) { |
479 | 0 | *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); |
480 | 0 | pSrc += 2; |
481 | 0 | continue; |
482 | 0 | } |
483 | 0 | } |
484 | | |
485 | | /* function call for "complicated" and error cases */ |
486 | 0 | ++pSrc; /* continue after the lead byte */ |
487 | 0 | ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); |
488 | 0 | if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { |
489 | 0 | *pErrorCode = U_INVALID_CHAR_FOUND; |
490 | 0 | return NULL; |
491 | 0 | } else if(ch<=0xFFFF) { |
492 | 0 | *(pDest++)=(UChar)ch; |
493 | 0 | } else { |
494 | 0 | *(pDest++)=U16_LEAD(ch); |
495 | 0 | if(pDest<pDestLimit) { |
496 | 0 | *(pDest++)=U16_TRAIL(ch); |
497 | 0 | } else { |
498 | 0 | reqLength++; |
499 | 0 | break; |
500 | 0 | } |
501 | 0 | } |
502 | 0 | } |
503 | 0 | } |
504 | | |
505 | | /* Pre-flight the rest of the string. */ |
506 | 0 | while((ch = *pSrc) != 0) { |
507 | 0 | if(ch <= 0x7f){ |
508 | 0 | ++reqLength; |
509 | 0 | ++pSrc; |
510 | 0 | } else { |
511 | 0 | if(ch > 0xe0) { |
512 | 0 | if( /* handle U+1000..U+CFFF inline */ |
513 | 0 | ch <= 0xec && |
514 | 0 | (uint8_t)(pSrc[1] - 0x80) <= 0x3f && |
515 | 0 | (uint8_t)(pSrc[2] - 0x80) <= 0x3f |
516 | 0 | ) { |
517 | 0 | ++reqLength; |
518 | 0 | pSrc += 3; |
519 | 0 | continue; |
520 | 0 | } |
521 | 0 | } else if(ch < 0xe0) { |
522 | 0 | if( /* handle U+0080..U+07FF inline */ |
523 | 0 | ch >= 0xc2 && |
524 | 0 | (uint8_t)(pSrc[1] - 0x80) <= 0x3f |
525 | 0 | ) { |
526 | 0 | ++reqLength; |
527 | 0 | pSrc += 2; |
528 | 0 | continue; |
529 | 0 | } |
530 | 0 | } |
531 | | |
532 | | /* function call for "complicated" and error cases */ |
533 | 0 | ++pSrc; /* continue after the lead byte */ |
534 | 0 | ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); |
535 | 0 | if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { |
536 | 0 | *pErrorCode = U_INVALID_CHAR_FOUND; |
537 | 0 | return NULL; |
538 | 0 | } |
539 | 0 | reqLength += U16_LENGTH(ch); |
540 | 0 | } |
541 | 0 | } |
542 | 0 | } else /* srcLength >= 0 */ { |
543 | 0 | const uint8_t *pSrcLimit = pSrc + srcLength; |
544 | 0 | int32_t count; |
545 | | |
546 | | /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ |
547 | 0 | for(;;) { |
548 | | /* |
549 | | * Each iteration of the inner loop progresses by at most 3 UTF-8 |
550 | | * bytes and one UChar, for most characters. |
551 | | * For supplementary code points (4 & 2), which are rare, |
552 | | * there is an additional adjustment. |
553 | | */ |
554 | 0 | count = (int32_t)(pDestLimit - pDest); |
555 | 0 | srcLength = (int32_t)((pSrcLimit - pSrc) / 3); |
556 | 0 | if(count > srcLength) { |
557 | 0 | count = srcLength; /* min(remaining dest, remaining src/3) */ |
558 | 0 | } |
559 | 0 | if(count < 3) { |
560 | | /* |
561 | | * Too much overhead if we get near the end of the string, |
562 | | * continue with the next loop. |
563 | | */ |
564 | 0 | break; |
565 | 0 | } |
566 | | |
567 | 0 | do { |
568 | 0 | ch = *pSrc; |
569 | 0 | if(ch <= 0x7f){ |
570 | 0 | *pDest++=(UChar)ch; |
571 | 0 | ++pSrc; |
572 | 0 | } else { |
573 | 0 | if(ch > 0xe0) { |
574 | 0 | if( /* handle U+1000..U+CFFF inline */ |
575 | 0 | ch <= 0xec && |
576 | 0 | (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && |
577 | 0 | (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f |
578 | 0 | ) { |
579 | | /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
580 | 0 | *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); |
581 | 0 | pSrc += 3; |
582 | 0 | continue; |
583 | 0 | } |
584 | 0 | } else if(ch < 0xe0) { |
585 | 0 | if( /* handle U+0080..U+07FF inline */ |
586 | 0 | ch >= 0xc2 && |
587 | 0 | (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f |
588 | 0 | ) { |
589 | 0 | *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); |
590 | 0 | pSrc += 2; |
591 | 0 | continue; |
592 | 0 | } |
593 | 0 | } |
594 | | |
595 | 0 | if(ch >= 0xf0 || subchar > 0xffff) { |
596 | | /* |
597 | | * We may read up to six bytes and write up to two UChars, |
598 | | * which we didn't account for with computing count, |
599 | | * so we adjust it here. |
600 | | */ |
601 | 0 | if(--count == 0) { |
602 | 0 | break; |
603 | 0 | } |
604 | 0 | } |
605 | | |
606 | | /* function call for "complicated" and error cases */ |
607 | 0 | ++pSrc; /* continue after the lead byte */ |
608 | 0 | ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); |
609 | 0 | if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ |
610 | 0 | *pErrorCode = U_INVALID_CHAR_FOUND; |
611 | 0 | return NULL; |
612 | 0 | }else if(ch<=0xFFFF){ |
613 | 0 | *(pDest++)=(UChar)ch; |
614 | 0 | }else{ |
615 | 0 | *(pDest++)=U16_LEAD(ch); |
616 | 0 | *(pDest++)=U16_TRAIL(ch); |
617 | 0 | } |
618 | 0 | } |
619 | 0 | } while(--count > 0); |
620 | 0 | } |
621 | | |
622 | 0 | while((pSrc<pSrcLimit) && (pDest<pDestLimit)) { |
623 | 0 | ch = *pSrc; |
624 | 0 | if(ch <= 0x7f){ |
625 | 0 | *pDest++=(UChar)ch; |
626 | 0 | ++pSrc; |
627 | 0 | } else { |
628 | 0 | if(ch > 0xe0) { |
629 | 0 | if( /* handle U+1000..U+CFFF inline */ |
630 | 0 | ch <= 0xec && |
631 | 0 | ((pSrcLimit - pSrc) >= 3) && |
632 | 0 | (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && |
633 | 0 | (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f |
634 | 0 | ) { |
635 | | /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
636 | 0 | *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); |
637 | 0 | pSrc += 3; |
638 | 0 | continue; |
639 | 0 | } |
640 | 0 | } else if(ch < 0xe0) { |
641 | 0 | if( /* handle U+0080..U+07FF inline */ |
642 | 0 | ch >= 0xc2 && |
643 | 0 | ((pSrcLimit - pSrc) >= 2) && |
644 | 0 | (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f |
645 | 0 | ) { |
646 | 0 | *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); |
647 | 0 | pSrc += 2; |
648 | 0 | continue; |
649 | 0 | } |
650 | 0 | } |
651 | | |
652 | | /* function call for "complicated" and error cases */ |
653 | 0 | ++pSrc; /* continue after the lead byte */ |
654 | 0 | ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); |
655 | 0 | if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ |
656 | 0 | *pErrorCode = U_INVALID_CHAR_FOUND; |
657 | 0 | return NULL; |
658 | 0 | }else if(ch<=0xFFFF){ |
659 | 0 | *(pDest++)=(UChar)ch; |
660 | 0 | }else{ |
661 | 0 | *(pDest++)=U16_LEAD(ch); |
662 | 0 | if(pDest<pDestLimit){ |
663 | 0 | *(pDest++)=U16_TRAIL(ch); |
664 | 0 | }else{ |
665 | 0 | reqLength++; |
666 | 0 | break; |
667 | 0 | } |
668 | 0 | } |
669 | 0 | } |
670 | 0 | } |
671 | | /* do not fill the dest buffer just count the UChars needed */ |
672 | 0 | while(pSrc < pSrcLimit){ |
673 | 0 | ch = *pSrc; |
674 | 0 | if(ch <= 0x7f){ |
675 | 0 | reqLength++; |
676 | 0 | ++pSrc; |
677 | 0 | } else { |
678 | 0 | if(ch > 0xe0) { |
679 | 0 | if( /* handle U+1000..U+CFFF inline */ |
680 | 0 | ch <= 0xec && |
681 | 0 | ((pSrcLimit - pSrc) >= 3) && |
682 | 0 | (uint8_t)(pSrc[1] - 0x80) <= 0x3f && |
683 | 0 | (uint8_t)(pSrc[2] - 0x80) <= 0x3f |
684 | 0 | ) { |
685 | 0 | reqLength++; |
686 | 0 | pSrc += 3; |
687 | 0 | continue; |
688 | 0 | } |
689 | 0 | } else if(ch < 0xe0) { |
690 | 0 | if( /* handle U+0080..U+07FF inline */ |
691 | 0 | ch >= 0xc2 && |
692 | 0 | ((pSrcLimit - pSrc) >= 2) && |
693 | 0 | (uint8_t)(pSrc[1] - 0x80) <= 0x3f |
694 | 0 | ) { |
695 | 0 | reqLength++; |
696 | 0 | pSrc += 2; |
697 | 0 | continue; |
698 | 0 | } |
699 | 0 | } |
700 | | |
701 | | /* function call for "complicated" and error cases */ |
702 | 0 | ++pSrc; /* continue after the lead byte */ |
703 | 0 | ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); |
704 | 0 | if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ |
705 | 0 | *pErrorCode = U_INVALID_CHAR_FOUND; |
706 | 0 | return NULL; |
707 | 0 | } |
708 | 0 | reqLength+=U16_LENGTH(ch); |
709 | 0 | } |
710 | 0 | } |
711 | 0 | } |
712 | | |
713 | 0 | reqLength+=(int32_t)(pDest - dest); |
714 | |
|
715 | 0 | if(pNumSubstitutions!=NULL) { |
716 | 0 | *pNumSubstitutions=numSubstitutions; |
717 | 0 | } |
718 | |
|
719 | 0 | if(pDestLength){ |
720 | 0 | *pDestLength = reqLength; |
721 | 0 | } |
722 | | |
723 | | /* Terminate the buffer */ |
724 | 0 | u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); |
725 | |
|
726 | 0 | return dest; |
727 | 0 | } |
728 | | |
729 | | U_CAPI UChar* U_EXPORT2 |
730 | | u_strFromUTF8(UChar *dest, |
731 | | int32_t destCapacity, |
732 | | int32_t *pDestLength, |
733 | | const char* src, |
734 | | int32_t srcLength, |
735 | 0 | UErrorCode *pErrorCode){ |
736 | 0 | return u_strFromUTF8WithSub( |
737 | 0 | dest, destCapacity, pDestLength, |
738 | 0 | src, srcLength, |
739 | 0 | U_SENTINEL, NULL, |
740 | 0 | pErrorCode); |
741 | 0 | } |
742 | | |
743 | | U_CAPI UChar * U_EXPORT2 |
744 | | u_strFromUTF8Lenient(UChar *dest, |
745 | | int32_t destCapacity, |
746 | | int32_t *pDestLength, |
747 | | const char *src, |
748 | | int32_t srcLength, |
749 | 0 | UErrorCode *pErrorCode) { |
750 | 0 | UChar *pDest = dest; |
751 | 0 | UChar32 ch; |
752 | 0 | int32_t reqLength = 0; |
753 | 0 | uint8_t* pSrc = (uint8_t*) src; |
754 | | |
755 | | /* args check */ |
756 | 0 | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ |
757 | 0 | return NULL; |
758 | 0 | } |
759 | | |
760 | 0 | if( (src==NULL && srcLength!=0) || srcLength < -1 || |
761 | 0 | (destCapacity<0) || (dest == NULL && destCapacity > 0) |
762 | 0 | ) { |
763 | 0 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
764 | 0 | return NULL; |
765 | 0 | } |
766 | | |
767 | 0 | if(srcLength < 0) { |
768 | | /* Transform a NUL-terminated string. */ |
769 | 0 | UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL; |
770 | 0 | uint8_t t1, t2, t3; /* trail bytes */ |
771 | |
|
772 | 0 | while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { |
773 | 0 | if(ch < 0xc0) { |
774 | | /* |
775 | | * ASCII, or a trail byte in lead position which is treated like |
776 | | * a single-byte sequence for better character boundary |
777 | | * resynchronization after illegal sequences. |
778 | | */ |
779 | 0 | *pDest++=(UChar)ch; |
780 | 0 | ++pSrc; |
781 | 0 | continue; |
782 | 0 | } else if(ch < 0xe0) { /* U+0080..U+07FF */ |
783 | 0 | if((t1 = pSrc[1]) != 0) { |
784 | | /* 0x3080 = (0xc0 << 6) + 0x80 */ |
785 | 0 | *pDest++ = (UChar)((ch << 6) + t1 - 0x3080); |
786 | 0 | pSrc += 2; |
787 | 0 | continue; |
788 | 0 | } |
789 | 0 | } else if(ch < 0xf0) { /* U+0800..U+FFFF */ |
790 | 0 | if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) { |
791 | | /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
792 | | /* 0x2080 = (0x80 << 6) + 0x80 */ |
793 | 0 | *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080); |
794 | 0 | pSrc += 3; |
795 | 0 | continue; |
796 | 0 | } |
797 | 0 | } else /* f0..f4 */ { /* U+10000..U+10FFFF */ |
798 | 0 | if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) { |
799 | 0 | pSrc += 4; |
800 | | /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ |
801 | 0 | ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080; |
802 | 0 | *(pDest++) = U16_LEAD(ch); |
803 | 0 | if(pDest < pDestLimit) { |
804 | 0 | *(pDest++) = U16_TRAIL(ch); |
805 | 0 | } else { |
806 | 0 | reqLength = 1; |
807 | 0 | break; |
808 | 0 | } |
809 | 0 | continue; |
810 | 0 | } |
811 | 0 | } |
812 | | |
813 | | /* truncated character at the end */ |
814 | 0 | *pDest++ = 0xfffd; |
815 | 0 | while(*++pSrc != 0) {} |
816 | 0 | break; |
817 | 0 | } |
818 | | |
819 | | /* Pre-flight the rest of the string. */ |
820 | 0 | while((ch = *pSrc) != 0) { |
821 | 0 | if(ch < 0xc0) { |
822 | | /* |
823 | | * ASCII, or a trail byte in lead position which is treated like |
824 | | * a single-byte sequence for better character boundary |
825 | | * resynchronization after illegal sequences. |
826 | | */ |
827 | 0 | ++reqLength; |
828 | 0 | ++pSrc; |
829 | 0 | continue; |
830 | 0 | } else if(ch < 0xe0) { /* U+0080..U+07FF */ |
831 | 0 | if(pSrc[1] != 0) { |
832 | 0 | ++reqLength; |
833 | 0 | pSrc += 2; |
834 | 0 | continue; |
835 | 0 | } |
836 | 0 | } else if(ch < 0xf0) { /* U+0800..U+FFFF */ |
837 | 0 | if(pSrc[1] != 0 && pSrc[2] != 0) { |
838 | 0 | ++reqLength; |
839 | 0 | pSrc += 3; |
840 | 0 | continue; |
841 | 0 | } |
842 | 0 | } else /* f0..f4 */ { /* U+10000..U+10FFFF */ |
843 | 0 | if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) { |
844 | 0 | reqLength += 2; |
845 | 0 | pSrc += 4; |
846 | 0 | continue; |
847 | 0 | } |
848 | 0 | } |
849 | | |
850 | | /* truncated character at the end */ |
851 | 0 | ++reqLength; |
852 | 0 | break; |
853 | 0 | } |
854 | 0 | } else /* srcLength >= 0 */ { |
855 | 0 | const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL; |
856 | | |
857 | | /* |
858 | | * This function requires that if srcLength is given, then it must be |
859 | | * destCapatity >= srcLength so that we need not check for |
860 | | * destination buffer overflow in the loop. |
861 | | */ |
862 | 0 | if(destCapacity < srcLength) { |
863 | 0 | if(pDestLength != NULL) { |
864 | 0 | *pDestLength = srcLength; /* this likely overestimates the true destLength! */ |
865 | 0 | } |
866 | 0 | *pErrorCode = U_BUFFER_OVERFLOW_ERROR; |
867 | 0 | return NULL; |
868 | 0 | } |
869 | | |
870 | 0 | if((pSrcLimit - pSrc) >= 4) { |
871 | 0 | pSrcLimit -= 3; /* temporarily reduce pSrcLimit */ |
872 | | |
873 | | /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */ |
874 | 0 | do { |
875 | 0 | ch = *pSrc++; |
876 | 0 | if(ch < 0xc0) { |
877 | | /* |
878 | | * ASCII, or a trail byte in lead position which is treated like |
879 | | * a single-byte sequence for better character boundary |
880 | | * resynchronization after illegal sequences. |
881 | | */ |
882 | 0 | *pDest++=(UChar)ch; |
883 | 0 | } else if(ch < 0xe0) { /* U+0080..U+07FF */ |
884 | | /* 0x3080 = (0xc0 << 6) + 0x80 */ |
885 | 0 | *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); |
886 | 0 | } else if(ch < 0xf0) { /* U+0800..U+FFFF */ |
887 | | /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
888 | | /* 0x2080 = (0x80 << 6) + 0x80 */ |
889 | 0 | ch = (ch << 12) + (*pSrc++ << 6); |
890 | 0 | *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); |
891 | 0 | } else /* f0..f4 */ { /* U+10000..U+10FFFF */ |
892 | | /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ |
893 | 0 | ch = (ch << 18) + (*pSrc++ << 12); |
894 | 0 | ch += *pSrc++ << 6; |
895 | 0 | ch += *pSrc++ - 0x3c82080; |
896 | 0 | *(pDest++) = U16_LEAD(ch); |
897 | 0 | *(pDest++) = U16_TRAIL(ch); |
898 | 0 | } |
899 | 0 | } while(pSrc < pSrcLimit); |
900 | |
|
901 | 0 | pSrcLimit += 3; /* restore original pSrcLimit */ |
902 | 0 | } |
903 | |
|
904 | 0 | while(pSrc < pSrcLimit) { |
905 | 0 | ch = *pSrc++; |
906 | 0 | if(ch < 0xc0) { |
907 | | /* |
908 | | * ASCII, or a trail byte in lead position which is treated like |
909 | | * a single-byte sequence for better character boundary |
910 | | * resynchronization after illegal sequences. |
911 | | */ |
912 | 0 | *pDest++=(UChar)ch; |
913 | 0 | continue; |
914 | 0 | } else if(ch < 0xe0) { /* U+0080..U+07FF */ |
915 | 0 | if(pSrc < pSrcLimit) { |
916 | | /* 0x3080 = (0xc0 << 6) + 0x80 */ |
917 | 0 | *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); |
918 | 0 | continue; |
919 | 0 | } |
920 | 0 | } else if(ch < 0xf0) { /* U+0800..U+FFFF */ |
921 | 0 | if((pSrcLimit - pSrc) >= 2) { |
922 | | /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
923 | | /* 0x2080 = (0x80 << 6) + 0x80 */ |
924 | 0 | ch = (ch << 12) + (*pSrc++ << 6); |
925 | 0 | *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); |
926 | 0 | pSrc += 3; |
927 | 0 | continue; |
928 | 0 | } |
929 | 0 | } else /* f0..f4 */ { /* U+10000..U+10FFFF */ |
930 | 0 | if((pSrcLimit - pSrc) >= 3) { |
931 | | /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ |
932 | 0 | ch = (ch << 18) + (*pSrc++ << 12); |
933 | 0 | ch += *pSrc++ << 6; |
934 | 0 | ch += *pSrc++ - 0x3c82080; |
935 | 0 | *(pDest++) = U16_LEAD(ch); |
936 | 0 | *(pDest++) = U16_TRAIL(ch); |
937 | 0 | pSrc += 4; |
938 | 0 | continue; |
939 | 0 | } |
940 | 0 | } |
941 | | |
942 | | /* truncated character at the end */ |
943 | 0 | *pDest++ = 0xfffd; |
944 | 0 | break; |
945 | 0 | } |
946 | 0 | } |
947 | | |
948 | 0 | reqLength+=(int32_t)(pDest - dest); |
949 | |
|
950 | 0 | if(pDestLength){ |
951 | 0 | *pDestLength = reqLength; |
952 | 0 | } |
953 | | |
954 | | /* Terminate the buffer */ |
955 | 0 | u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); |
956 | |
|
957 | 0 | return dest; |
958 | 0 | } |
959 | | |
960 | | static inline uint8_t * |
961 | 96 | _appendUTF8(uint8_t *pDest, UChar32 c) { |
962 | | /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */ |
963 | 96 | if((c)<=0x7f) { |
964 | 0 | *pDest++=(uint8_t)c; |
965 | 96 | } else if(c<=0x7ff) { |
966 | 0 | *pDest++=(uint8_t)((c>>6)|0xc0); |
967 | 0 | *pDest++=(uint8_t)((c&0x3f)|0x80); |
968 | 96 | } else if(c<=0xffff) { |
969 | 0 | *pDest++=(uint8_t)((c>>12)|0xe0); |
970 | 0 | *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80); |
971 | 0 | *pDest++=(uint8_t)(((c)&0x3f)|0x80); |
972 | 96 | } else /* if((uint32_t)(c)<=0x10ffff) */ { |
973 | 96 | *pDest++=(uint8_t)(((c)>>18)|0xf0); |
974 | 96 | *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80); |
975 | 96 | *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80); |
976 | 96 | *pDest++=(uint8_t)(((c)&0x3f)|0x80); |
977 | 96 | } |
978 | 96 | return pDest; |
979 | 96 | } |
980 | | |
981 | | |
982 | | U_CAPI char* U_EXPORT2 |
983 | | u_strToUTF8WithSub(char *dest, |
984 | | int32_t destCapacity, |
985 | | int32_t *pDestLength, |
986 | | const UChar *pSrc, |
987 | | int32_t srcLength, |
988 | | UChar32 subchar, int32_t *pNumSubstitutions, |
989 | 3.57k | UErrorCode *pErrorCode){ |
990 | 3.57k | int32_t reqLength=0; |
991 | 3.57k | uint32_t ch=0,ch2=0; |
992 | 3.57k | uint8_t *pDest = (uint8_t *)dest; |
993 | 3.57k | uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL; |
994 | 3.57k | int32_t numSubstitutions; |
995 | | |
996 | | /* args check */ |
997 | 3.57k | if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ |
998 | 0 | return NULL; |
999 | 0 | } |
1000 | | |
1001 | 3.57k | if( (pSrc==NULL && srcLength!=0) || srcLength < -1 || |
1002 | 3.57k | (destCapacity<0) || (dest == NULL && destCapacity > 0) || |
1003 | 3.57k | subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
1004 | 3.57k | ) { |
1005 | 0 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
1006 | 0 | return NULL; |
1007 | 0 | } |
1008 | | |
1009 | 3.57k | if(pNumSubstitutions!=NULL) { |
1010 | 0 | *pNumSubstitutions=0; |
1011 | 0 | } |
1012 | 3.57k | numSubstitutions=0; |
1013 | | |
1014 | 3.57k | if(srcLength==-1) { |
1015 | 0 | while((ch=*pSrc)!=0) { |
1016 | 0 | ++pSrc; |
1017 | 0 | if(ch <= 0x7f) { |
1018 | 0 | if(pDest<pDestLimit) { |
1019 | 0 | *pDest++ = (uint8_t)ch; |
1020 | 0 | } else { |
1021 | 0 | reqLength = 1; |
1022 | 0 | break; |
1023 | 0 | } |
1024 | 0 | } else if(ch <= 0x7ff) { |
1025 | 0 | if((pDestLimit - pDest) >= 2) { |
1026 | 0 | *pDest++=(uint8_t)((ch>>6)|0xc0); |
1027 | 0 | *pDest++=(uint8_t)((ch&0x3f)|0x80); |
1028 | 0 | } else { |
1029 | 0 | reqLength = 2; |
1030 | 0 | break; |
1031 | 0 | } |
1032 | 0 | } else if(ch <= 0xd7ff || ch >= 0xe000) { |
1033 | 0 | if((pDestLimit - pDest) >= 3) { |
1034 | 0 | *pDest++=(uint8_t)((ch>>12)|0xe0); |
1035 | 0 | *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
1036 | 0 | *pDest++=(uint8_t)((ch&0x3f)|0x80); |
1037 | 0 | } else { |
1038 | 0 | reqLength = 3; |
1039 | 0 | break; |
1040 | 0 | } |
1041 | 0 | } else /* ch is a surrogate */ { |
1042 | 0 | int32_t length; |
1043 | | |
1044 | | /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/ |
1045 | 0 | if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { |
1046 | 0 | ++pSrc; |
1047 | 0 | ch=U16_GET_SUPPLEMENTARY(ch, ch2); |
1048 | 0 | } else if(subchar>=0) { |
1049 | 0 | ch=subchar; |
1050 | 0 | ++numSubstitutions; |
1051 | 0 | } else { |
1052 | | /* Unicode 3.2 forbids surrogate code points in UTF-8 */ |
1053 | 0 | *pErrorCode = U_INVALID_CHAR_FOUND; |
1054 | 0 | return NULL; |
1055 | 0 | } |
1056 | | |
1057 | 0 | length = U8_LENGTH(ch); |
1058 | 0 | if((pDestLimit - pDest) >= length) { |
1059 | | /* convert and append*/ |
1060 | 0 | pDest=_appendUTF8(pDest, ch); |
1061 | 0 | } else { |
1062 | 0 | reqLength = length; |
1063 | 0 | break; |
1064 | 0 | } |
1065 | 0 | } |
1066 | 0 | } |
1067 | 0 | while((ch=*pSrc++)!=0) { |
1068 | 0 | if(ch<=0x7f) { |
1069 | 0 | ++reqLength; |
1070 | 0 | } else if(ch<=0x7ff) { |
1071 | 0 | reqLength+=2; |
1072 | 0 | } else if(!U16_IS_SURROGATE(ch)) { |
1073 | 0 | reqLength+=3; |
1074 | 0 | } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { |
1075 | 0 | ++pSrc; |
1076 | 0 | reqLength+=4; |
1077 | 0 | } else if(subchar>=0) { |
1078 | 0 | reqLength+=U8_LENGTH(subchar); |
1079 | 0 | ++numSubstitutions; |
1080 | 0 | } else { |
1081 | | /* Unicode 3.2 forbids surrogate code points in UTF-8 */ |
1082 | 0 | *pErrorCode = U_INVALID_CHAR_FOUND; |
1083 | 0 | return NULL; |
1084 | 0 | } |
1085 | 0 | } |
1086 | 3.57k | } else { |
1087 | 3.57k | const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL; |
1088 | 3.57k | int32_t count; |
1089 | | |
1090 | | /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ |
1091 | 6.38k | for(;;) { |
1092 | | /* |
1093 | | * Each iteration of the inner loop progresses by at most 3 UTF-8 |
1094 | | * bytes and one UChar, for most characters. |
1095 | | * For supplementary code points (4 & 2), which are rare, |
1096 | | * there is an additional adjustment. |
1097 | | */ |
1098 | 6.38k | count = (int32_t)((pDestLimit - pDest) / 3); |
1099 | 6.38k | srcLength = (int32_t)(pSrcLimit - pSrc); |
1100 | 6.38k | if(count > srcLength) { |
1101 | 6.38k | count = srcLength; /* min(remaining dest/3, remaining src) */ |
1102 | 6.38k | } |
1103 | 6.38k | if(count < 3) { |
1104 | | /* |
1105 | | * Too much overhead if we get near the end of the string, |
1106 | | * continue with the next loop. |
1107 | | */ |
1108 | 3.57k | break; |
1109 | 3.57k | } |
1110 | 7.44M | do { |
1111 | 7.44M | ch=*pSrc++; |
1112 | 7.44M | if(ch <= 0x7f) { |
1113 | 4.36M | *pDest++ = (uint8_t)ch; |
1114 | 4.36M | } else if(ch <= 0x7ff) { |
1115 | 1.30M | *pDest++=(uint8_t)((ch>>6)|0xc0); |
1116 | 1.30M | *pDest++=(uint8_t)((ch&0x3f)|0x80); |
1117 | 1.76M | } else if(ch <= 0xd7ff || ch >= 0xe000) { |
1118 | 1.74M | *pDest++=(uint8_t)((ch>>12)|0xe0); |
1119 | 1.74M | *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
1120 | 1.74M | *pDest++=(uint8_t)((ch&0x3f)|0x80); |
1121 | 1.74M | } else /* ch is a surrogate */ { |
1122 | | /* |
1123 | | * We will read two UChars and probably output four bytes, |
1124 | | * which we didn't account for with computing count, |
1125 | | * so we adjust it here. |
1126 | | */ |
1127 | 24.6k | if(--count == 0) { |
1128 | 0 | --pSrc; /* undo ch=*pSrc++ for the lead surrogate */ |
1129 | 0 | break; /* recompute count */ |
1130 | 0 | } |
1131 | | |
1132 | 24.6k | if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { |
1133 | 24.6k | ++pSrc; |
1134 | 24.6k | ch=U16_GET_SUPPLEMENTARY(ch, ch2); |
1135 | | |
1136 | | /* writing 4 bytes per 2 UChars is ok */ |
1137 | 24.6k | *pDest++=(uint8_t)((ch>>18)|0xf0); |
1138 | 24.6k | *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80); |
1139 | 24.6k | *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
1140 | 24.6k | *pDest++=(uint8_t)((ch&0x3f)|0x80); |
1141 | 24.6k | } else { |
1142 | | /* Unicode 3.2 forbids surrogate code points in UTF-8 */ |
1143 | 0 | if(subchar>=0) { |
1144 | 0 | ch=subchar; |
1145 | 0 | ++numSubstitutions; |
1146 | 0 | } else { |
1147 | 0 | *pErrorCode = U_INVALID_CHAR_FOUND; |
1148 | 0 | return NULL; |
1149 | 0 | } |
1150 | | |
1151 | | /* convert and append*/ |
1152 | 0 | pDest=_appendUTF8(pDest, ch); |
1153 | 0 | } |
1154 | 24.6k | } |
1155 | 7.44M | } while(--count > 0); |
1156 | 2.81k | } |
1157 | | |
1158 | 4.71k | while(pSrc<pSrcLimit) { |
1159 | 1.13k | ch=*pSrc++; |
1160 | 1.13k | if(ch <= 0x7f) { |
1161 | 181 | if(pDest<pDestLimit) { |
1162 | 181 | *pDest++ = (uint8_t)ch; |
1163 | 181 | } else { |
1164 | 0 | reqLength = 1; |
1165 | 0 | break; |
1166 | 0 | } |
1167 | 957 | } else if(ch <= 0x7ff) { |
1168 | 347 | if((pDestLimit - pDest) >= 2) { |
1169 | 347 | *pDest++=(uint8_t)((ch>>6)|0xc0); |
1170 | 347 | *pDest++=(uint8_t)((ch&0x3f)|0x80); |
1171 | 347 | } else { |
1172 | 0 | reqLength = 2; |
1173 | 0 | break; |
1174 | 0 | } |
1175 | 610 | } else if(ch <= 0xd7ff || ch >= 0xe000) { |
1176 | 514 | if((pDestLimit - pDest) >= 3) { |
1177 | 514 | *pDest++=(uint8_t)((ch>>12)|0xe0); |
1178 | 514 | *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
1179 | 514 | *pDest++=(uint8_t)((ch&0x3f)|0x80); |
1180 | 514 | } else { |
1181 | 0 | reqLength = 3; |
1182 | 0 | break; |
1183 | 0 | } |
1184 | 514 | } else /* ch is a surrogate */ { |
1185 | 96 | int32_t length; |
1186 | | |
1187 | 96 | if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) { |
1188 | 96 | ++pSrc; |
1189 | 96 | ch=U16_GET_SUPPLEMENTARY(ch, ch2); |
1190 | 96 | } else if(subchar>=0) { |
1191 | 0 | ch=subchar; |
1192 | 0 | ++numSubstitutions; |
1193 | 0 | } else { |
1194 | | /* Unicode 3.2 forbids surrogate code points in UTF-8 */ |
1195 | 0 | *pErrorCode = U_INVALID_CHAR_FOUND; |
1196 | 0 | return NULL; |
1197 | 0 | } |
1198 | | |
1199 | 96 | length = U8_LENGTH(ch); |
1200 | 96 | if((pDestLimit - pDest) >= length) { |
1201 | | /* convert and append*/ |
1202 | 96 | pDest=_appendUTF8(pDest, ch); |
1203 | 96 | } else { |
1204 | 0 | reqLength = length; |
1205 | 0 | break; |
1206 | 0 | } |
1207 | 96 | } |
1208 | 1.13k | } |
1209 | 3.57k | while(pSrc<pSrcLimit) { |
1210 | 0 | ch=*pSrc++; |
1211 | 0 | if(ch<=0x7f) { |
1212 | 0 | ++reqLength; |
1213 | 0 | } else if(ch<=0x7ff) { |
1214 | 0 | reqLength+=2; |
1215 | 0 | } else if(!U16_IS_SURROGATE(ch)) { |
1216 | 0 | reqLength+=3; |
1217 | 0 | } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) { |
1218 | 0 | ++pSrc; |
1219 | 0 | reqLength+=4; |
1220 | 0 | } else if(subchar>=0) { |
1221 | 0 | reqLength+=U8_LENGTH(subchar); |
1222 | 0 | ++numSubstitutions; |
1223 | 0 | } else { |
1224 | | /* Unicode 3.2 forbids surrogate code points in UTF-8 */ |
1225 | 0 | *pErrorCode = U_INVALID_CHAR_FOUND; |
1226 | 0 | return NULL; |
1227 | 0 | } |
1228 | 0 | } |
1229 | 3.57k | } |
1230 | | |
1231 | 3.57k | reqLength+=(int32_t)(pDest - (uint8_t *)dest); |
1232 | | |
1233 | 3.57k | if(pNumSubstitutions!=NULL) { |
1234 | 0 | *pNumSubstitutions=numSubstitutions; |
1235 | 0 | } |
1236 | | |
1237 | 3.57k | if(pDestLength){ |
1238 | 0 | *pDestLength = reqLength; |
1239 | 0 | } |
1240 | | |
1241 | | /* Terminate the buffer */ |
1242 | 3.57k | u_terminateChars(dest, destCapacity, reqLength, pErrorCode); |
1243 | 3.57k | return dest; |
1244 | 3.57k | } |
1245 | | |
1246 | | U_CAPI char* U_EXPORT2 |
1247 | | u_strToUTF8(char *dest, |
1248 | | int32_t destCapacity, |
1249 | | int32_t *pDestLength, |
1250 | | const UChar *pSrc, |
1251 | | int32_t srcLength, |
1252 | 3.57k | UErrorCode *pErrorCode){ |
1253 | 3.57k | return u_strToUTF8WithSub( |
1254 | 3.57k | dest, destCapacity, pDestLength, |
1255 | 3.57k | pSrc, srcLength, |
1256 | 3.57k | U_SENTINEL, NULL, |
1257 | 3.57k | pErrorCode); |
1258 | 3.57k | } |
1259 | | |
1260 | | U_CAPI UChar* U_EXPORT2 |
1261 | | u_strFromJavaModifiedUTF8WithSub( |
1262 | | UChar *dest, |
1263 | | int32_t destCapacity, |
1264 | | int32_t *pDestLength, |
1265 | | const char *src, |
1266 | | int32_t srcLength, |
1267 | | UChar32 subchar, int32_t *pNumSubstitutions, |
1268 | 0 | UErrorCode *pErrorCode) { |
1269 | 0 | UChar *pDest = dest; |
1270 | 0 | UChar *pDestLimit = dest+destCapacity; |
1271 | 0 | UChar32 ch; |
1272 | 0 | int32_t reqLength = 0; |
1273 | 0 | const uint8_t* pSrc = (const uint8_t*) src; |
1274 | 0 | const uint8_t *pSrcLimit; |
1275 | 0 | int32_t count; |
1276 | 0 | uint8_t t1, t2; /* trail bytes */ |
1277 | 0 | int32_t numSubstitutions; |
1278 | | |
1279 | | /* args check */ |
1280 | 0 | if(U_FAILURE(*pErrorCode)){ |
1281 | 0 | return NULL; |
1282 | 0 | } |
1283 | 0 | if( (src==NULL && srcLength!=0) || srcLength < -1 || |
1284 | 0 | (dest==NULL && destCapacity!=0) || destCapacity<0 || |
1285 | 0 | subchar > 0x10ffff || U_IS_SURROGATE(subchar) |
1286 | 0 | ) { |
1287 | 0 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
1288 | 0 | return NULL; |
1289 | 0 | } |
1290 | | |
1291 | 0 | if(pNumSubstitutions!=NULL) { |
1292 | 0 | *pNumSubstitutions=0; |
1293 | 0 | } |
1294 | 0 | numSubstitutions=0; |
1295 | |
|
1296 | 0 | if(srcLength < 0) { |
1297 | | /* |
1298 | | * Transform a NUL-terminated ASCII string. |
1299 | | * Handle non-ASCII strings with slower code. |
1300 | | */ |
1301 | 0 | while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) { |
1302 | 0 | *pDest++=(UChar)ch; |
1303 | 0 | ++pSrc; |
1304 | 0 | } |
1305 | 0 | if(ch == 0) { |
1306 | 0 | reqLength=(int32_t)(pDest - dest); |
1307 | 0 | if(pDestLength) { |
1308 | 0 | *pDestLength = reqLength; |
1309 | 0 | } |
1310 | | |
1311 | | /* Terminate the buffer */ |
1312 | 0 | u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); |
1313 | 0 | return dest; |
1314 | 0 | } |
1315 | 0 | srcLength = uprv_strlen((const char *)pSrc); |
1316 | 0 | } |
1317 | | |
1318 | | /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ |
1319 | 0 | pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength; |
1320 | 0 | for(;;) { |
1321 | 0 | count = (int32_t)(pDestLimit - pDest); |
1322 | 0 | srcLength = (int32_t)(pSrcLimit - pSrc); |
1323 | 0 | if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) { |
1324 | | /* fast ASCII loop */ |
1325 | 0 | const uint8_t *prevSrc = pSrc; |
1326 | 0 | int32_t delta; |
1327 | 0 | while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) { |
1328 | 0 | *pDest++=(UChar)ch; |
1329 | 0 | ++pSrc; |
1330 | 0 | } |
1331 | 0 | delta = (int32_t)(pSrc - prevSrc); |
1332 | 0 | count -= delta; |
1333 | 0 | srcLength -= delta; |
1334 | 0 | } |
1335 | | /* |
1336 | | * Each iteration of the inner loop progresses by at most 3 UTF-8 |
1337 | | * bytes and one UChar. |
1338 | | */ |
1339 | 0 | srcLength /= 3; |
1340 | 0 | if(count > srcLength) { |
1341 | 0 | count = srcLength; /* min(remaining dest, remaining src/3) */ |
1342 | 0 | } |
1343 | 0 | if(count < 3) { |
1344 | | /* |
1345 | | * Too much overhead if we get near the end of the string, |
1346 | | * continue with the next loop. |
1347 | | */ |
1348 | 0 | break; |
1349 | 0 | } |
1350 | 0 | do { |
1351 | 0 | ch = *pSrc; |
1352 | 0 | if(ch <= 0x7f){ |
1353 | 0 | *pDest++=(UChar)ch; |
1354 | 0 | ++pSrc; |
1355 | 0 | } else { |
1356 | 0 | if(ch >= 0xe0) { |
1357 | 0 | if( /* handle U+0000..U+FFFF inline */ |
1358 | 0 | ch <= 0xef && |
1359 | 0 | (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && |
1360 | 0 | (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f |
1361 | 0 | ) { |
1362 | | /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
1363 | 0 | *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); |
1364 | 0 | pSrc += 3; |
1365 | 0 | continue; |
1366 | 0 | } |
1367 | 0 | } else { |
1368 | 0 | if( /* handle U+0000..U+07FF inline */ |
1369 | 0 | ch >= 0xc0 && |
1370 | 0 | (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f |
1371 | 0 | ) { |
1372 | 0 | *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); |
1373 | 0 | pSrc += 2; |
1374 | 0 | continue; |
1375 | 0 | } |
1376 | 0 | } |
1377 | | |
1378 | 0 | if(subchar < 0) { |
1379 | 0 | *pErrorCode = U_INVALID_CHAR_FOUND; |
1380 | 0 | return NULL; |
1381 | 0 | } else if(subchar > 0xffff && --count == 0) { |
1382 | | /* |
1383 | | * We need to write two UChars, adjusted count for that, |
1384 | | * and ran out of space. |
1385 | | */ |
1386 | 0 | break; |
1387 | 0 | } else { |
1388 | | /* function call for error cases */ |
1389 | 0 | ++pSrc; /* continue after the lead byte */ |
1390 | 0 | utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); |
1391 | 0 | ++numSubstitutions; |
1392 | 0 | if(subchar<=0xFFFF) { |
1393 | 0 | *(pDest++)=(UChar)subchar; |
1394 | 0 | } else { |
1395 | 0 | *(pDest++)=U16_LEAD(subchar); |
1396 | 0 | *(pDest++)=U16_TRAIL(subchar); |
1397 | 0 | } |
1398 | 0 | } |
1399 | 0 | } |
1400 | 0 | } while(--count > 0); |
1401 | 0 | } |
1402 | | |
1403 | 0 | while((pSrc<pSrcLimit) && (pDest<pDestLimit)) { |
1404 | 0 | ch = *pSrc; |
1405 | 0 | if(ch <= 0x7f){ |
1406 | 0 | *pDest++=(UChar)ch; |
1407 | 0 | ++pSrc; |
1408 | 0 | } else { |
1409 | 0 | if(ch >= 0xe0) { |
1410 | 0 | if( /* handle U+0000..U+FFFF inline */ |
1411 | 0 | ch <= 0xef && |
1412 | 0 | ((pSrcLimit - pSrc) >= 3) && |
1413 | 0 | (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && |
1414 | 0 | (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f |
1415 | 0 | ) { |
1416 | | /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ |
1417 | 0 | *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); |
1418 | 0 | pSrc += 3; |
1419 | 0 | continue; |
1420 | 0 | } |
1421 | 0 | } else { |
1422 | 0 | if( /* handle U+0000..U+07FF inline */ |
1423 | 0 | ch >= 0xc0 && |
1424 | 0 | ((pSrcLimit - pSrc) >= 2) && |
1425 | 0 | (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f |
1426 | 0 | ) { |
1427 | 0 | *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); |
1428 | 0 | pSrc += 2; |
1429 | 0 | continue; |
1430 | 0 | } |
1431 | 0 | } |
1432 | | |
1433 | 0 | if(subchar < 0) { |
1434 | 0 | *pErrorCode = U_INVALID_CHAR_FOUND; |
1435 | 0 | return NULL; |
1436 | 0 | } else { |
1437 | | /* function call for error cases */ |
1438 | 0 | ++pSrc; /* continue after the lead byte */ |
1439 | 0 | utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); |
1440 | 0 | ++numSubstitutions; |
1441 | 0 | if(subchar<=0xFFFF) { |
1442 | 0 | *(pDest++)=(UChar)subchar; |
1443 | 0 | } else { |
1444 | 0 | *(pDest++)=U16_LEAD(subchar); |
1445 | 0 | if(pDest<pDestLimit) { |
1446 | 0 | *(pDest++)=U16_TRAIL(subchar); |
1447 | 0 | } else { |
1448 | 0 | reqLength++; |
1449 | 0 | break; |
1450 | 0 | } |
1451 | 0 | } |
1452 | 0 | } |
1453 | 0 | } |
1454 | 0 | } |
1455 | | |
1456 | | /* do not fill the dest buffer just count the UChars needed */ |
1457 | 0 | while(pSrc < pSrcLimit){ |
1458 | 0 | ch = *pSrc; |
1459 | 0 | if(ch <= 0x7f) { |
1460 | 0 | reqLength++; |
1461 | 0 | ++pSrc; |
1462 | 0 | } else { |
1463 | 0 | if(ch >= 0xe0) { |
1464 | 0 | if( /* handle U+0000..U+FFFF inline */ |
1465 | 0 | ch <= 0xef && |
1466 | 0 | ((pSrcLimit - pSrc) >= 3) && |
1467 | 0 | (uint8_t)(pSrc[1] - 0x80) <= 0x3f && |
1468 | 0 | (uint8_t)(pSrc[2] - 0x80) <= 0x3f |
1469 | 0 | ) { |
1470 | 0 | reqLength++; |
1471 | 0 | pSrc += 3; |
1472 | 0 | continue; |
1473 | 0 | } |
1474 | 0 | } else { |
1475 | 0 | if( /* handle U+0000..U+07FF inline */ |
1476 | 0 | ch >= 0xc0 && |
1477 | 0 | ((pSrcLimit - pSrc) >= 2) && |
1478 | 0 | (uint8_t)(pSrc[1] - 0x80) <= 0x3f |
1479 | 0 | ) { |
1480 | 0 | reqLength++; |
1481 | 0 | pSrc += 2; |
1482 | 0 | continue; |
1483 | 0 | } |
1484 | 0 | } |
1485 | | |
1486 | 0 | if(subchar < 0) { |
1487 | 0 | *pErrorCode = U_INVALID_CHAR_FOUND; |
1488 | 0 | return NULL; |
1489 | 0 | } else { |
1490 | | /* function call for error cases */ |
1491 | 0 | ++pSrc; /* continue after the lead byte */ |
1492 | 0 | utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); |
1493 | 0 | ++numSubstitutions; |
1494 | 0 | reqLength+=U16_LENGTH(ch); |
1495 | 0 | } |
1496 | 0 | } |
1497 | 0 | } |
1498 | | |
1499 | 0 | if(pNumSubstitutions!=NULL) { |
1500 | 0 | *pNumSubstitutions=numSubstitutions; |
1501 | 0 | } |
1502 | |
|
1503 | 0 | reqLength+=(int32_t)(pDest - dest); |
1504 | 0 | if(pDestLength) { |
1505 | 0 | *pDestLength = reqLength; |
1506 | 0 | } |
1507 | | |
1508 | | /* Terminate the buffer */ |
1509 | 0 | u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); |
1510 | 0 | return dest; |
1511 | 0 | } |
1512 | | |
1513 | | U_CAPI char* U_EXPORT2 |
1514 | | u_strToJavaModifiedUTF8( |
1515 | | char *dest, |
1516 | | int32_t destCapacity, |
1517 | | int32_t *pDestLength, |
1518 | | const UChar *src, |
1519 | | int32_t srcLength, |
1520 | 0 | UErrorCode *pErrorCode) { |
1521 | 0 | int32_t reqLength=0; |
1522 | 0 | uint32_t ch=0; |
1523 | 0 | uint8_t *pDest = (uint8_t *)dest; |
1524 | 0 | uint8_t *pDestLimit = pDest + destCapacity; |
1525 | 0 | const UChar *pSrcLimit; |
1526 | 0 | int32_t count; |
1527 | | |
1528 | | /* args check */ |
1529 | 0 | if(U_FAILURE(*pErrorCode)){ |
1530 | 0 | return NULL; |
1531 | 0 | } |
1532 | 0 | if( (src==NULL && srcLength!=0) || srcLength < -1 || |
1533 | 0 | (dest==NULL && destCapacity!=0) || destCapacity<0 |
1534 | 0 | ) { |
1535 | 0 | *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; |
1536 | 0 | return NULL; |
1537 | 0 | } |
1538 | | |
1539 | 0 | if(srcLength==-1) { |
1540 | | /* Convert NUL-terminated ASCII, then find the string length. */ |
1541 | 0 | while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) { |
1542 | 0 | *pDest++ = (uint8_t)ch; |
1543 | 0 | ++src; |
1544 | 0 | } |
1545 | 0 | if(ch == 0) { |
1546 | 0 | reqLength=(int32_t)(pDest - (uint8_t *)dest); |
1547 | 0 | if(pDestLength) { |
1548 | 0 | *pDestLength = reqLength; |
1549 | 0 | } |
1550 | | |
1551 | | /* Terminate the buffer */ |
1552 | 0 | u_terminateChars(dest, destCapacity, reqLength, pErrorCode); |
1553 | 0 | return dest; |
1554 | 0 | } |
1555 | 0 | srcLength = u_strlen(src); |
1556 | 0 | } |
1557 | | |
1558 | | /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ |
1559 | 0 | pSrcLimit = (src!=NULL)?(src+srcLength):NULL; |
1560 | 0 | for(;;) { |
1561 | 0 | count = (int32_t)(pDestLimit - pDest); |
1562 | 0 | srcLength = (int32_t)(pSrcLimit - src); |
1563 | 0 | if(count >= srcLength && srcLength > 0 && *src <= 0x7f) { |
1564 | | /* fast ASCII loop */ |
1565 | 0 | const UChar *prevSrc = src; |
1566 | 0 | int32_t delta; |
1567 | 0 | while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) { |
1568 | 0 | *pDest++=(uint8_t)ch; |
1569 | 0 | ++src; |
1570 | 0 | } |
1571 | 0 | delta = (int32_t)(src - prevSrc); |
1572 | 0 | count -= delta; |
1573 | 0 | srcLength -= delta; |
1574 | 0 | } |
1575 | | /* |
1576 | | * Each iteration of the inner loop progresses by at most 3 UTF-8 |
1577 | | * bytes and one UChar. |
1578 | | */ |
1579 | 0 | count /= 3; |
1580 | 0 | if(count > srcLength) { |
1581 | 0 | count = srcLength; /* min(remaining dest/3, remaining src) */ |
1582 | 0 | } |
1583 | 0 | if(count < 3) { |
1584 | | /* |
1585 | | * Too much overhead if we get near the end of the string, |
1586 | | * continue with the next loop. |
1587 | | */ |
1588 | 0 | break; |
1589 | 0 | } |
1590 | 0 | do { |
1591 | 0 | ch=*src++; |
1592 | 0 | if(ch <= 0x7f && ch != 0) { |
1593 | 0 | *pDest++ = (uint8_t)ch; |
1594 | 0 | } else if(ch <= 0x7ff) { |
1595 | 0 | *pDest++=(uint8_t)((ch>>6)|0xc0); |
1596 | 0 | *pDest++=(uint8_t)((ch&0x3f)|0x80); |
1597 | 0 | } else { |
1598 | 0 | *pDest++=(uint8_t)((ch>>12)|0xe0); |
1599 | 0 | *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
1600 | 0 | *pDest++=(uint8_t)((ch&0x3f)|0x80); |
1601 | 0 | } |
1602 | 0 | } while(--count > 0); |
1603 | 0 | } |
1604 | |
|
1605 | 0 | while(src<pSrcLimit) { |
1606 | 0 | ch=*src++; |
1607 | 0 | if(ch <= 0x7f && ch != 0) { |
1608 | 0 | if(pDest<pDestLimit) { |
1609 | 0 | *pDest++ = (uint8_t)ch; |
1610 | 0 | } else { |
1611 | 0 | reqLength = 1; |
1612 | 0 | break; |
1613 | 0 | } |
1614 | 0 | } else if(ch <= 0x7ff) { |
1615 | 0 | if((pDestLimit - pDest) >= 2) { |
1616 | 0 | *pDest++=(uint8_t)((ch>>6)|0xc0); |
1617 | 0 | *pDest++=(uint8_t)((ch&0x3f)|0x80); |
1618 | 0 | } else { |
1619 | 0 | reqLength = 2; |
1620 | 0 | break; |
1621 | 0 | } |
1622 | 0 | } else { |
1623 | 0 | if((pDestLimit - pDest) >= 3) { |
1624 | 0 | *pDest++=(uint8_t)((ch>>12)|0xe0); |
1625 | 0 | *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); |
1626 | 0 | *pDest++=(uint8_t)((ch&0x3f)|0x80); |
1627 | 0 | } else { |
1628 | 0 | reqLength = 3; |
1629 | 0 | break; |
1630 | 0 | } |
1631 | 0 | } |
1632 | 0 | } |
1633 | 0 | while(src<pSrcLimit) { |
1634 | 0 | ch=*src++; |
1635 | 0 | if(ch <= 0x7f && ch != 0) { |
1636 | 0 | ++reqLength; |
1637 | 0 | } else if(ch<=0x7ff) { |
1638 | 0 | reqLength+=2; |
1639 | 0 | } else { |
1640 | 0 | reqLength+=3; |
1641 | 0 | } |
1642 | 0 | } |
1643 | |
|
1644 | 0 | reqLength+=(int32_t)(pDest - (uint8_t *)dest); |
1645 | 0 | if(pDestLength){ |
1646 | 0 | *pDestLength = reqLength; |
1647 | 0 | } |
1648 | | |
1649 | | /* Terminate the buffer */ |
1650 | 0 | u_terminateChars(dest, destCapacity, reqLength, pErrorCode); |
1651 | 0 | return dest; |
1652 | 0 | } |