Coverage Report

Created: 2023-06-07 07:17

/src/icu/source/common/ustrtrns.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
******************************************************************************
5
*
6
*   Copyright (C) 2001-2016, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
******************************************************************************
10
*
11
* File ustrtrns.cpp
12
*
13
* Modification History:
14
*
15
*   Date        Name        Description
16
*   9/10/2001    Ram    Creation.
17
******************************************************************************
18
*/
19
20
/*******************************************************************************
21
 *
22
 * u_strTo* and u_strFrom* APIs
23
 * WCS functions moved to ustr_wcs.c for better modularization
24
 *
25
 *******************************************************************************
26
 */
27
28
29
#include "unicode/putil.h"
30
#include "unicode/ustring.h"
31
#include "unicode/utf.h"
32
#include "unicode/utf8.h"
33
#include "unicode/utf16.h"
34
#include "cstring.h"
35
#include "cmemory.h"
36
#include "ustr_imp.h"
37
#include "uassert.h"
38
39
U_CAPI UChar* U_EXPORT2 
40
u_strFromUTF32WithSub(UChar *dest,
41
               int32_t destCapacity,
42
               int32_t *pDestLength,
43
               const UChar32 *src,
44
               int32_t srcLength,
45
               UChar32 subchar, int32_t *pNumSubstitutions,
46
0
               UErrorCode *pErrorCode) {
47
0
    const UChar32 *srcLimit;
48
0
    UChar32 ch;
49
0
    UChar *destLimit;
50
0
    UChar *pDest;
51
0
    int32_t reqLength;
52
0
    int32_t numSubstitutions;
53
54
    /* args check */
55
0
    if(U_FAILURE(*pErrorCode)){
56
0
        return NULL;
57
0
    }
58
0
    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
59
0
        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
60
0
        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
61
0
    ) {
62
0
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
63
0
        return NULL;
64
0
    }
65
66
0
    if(pNumSubstitutions != NULL) {
67
0
        *pNumSubstitutions = 0;
68
0
    }
69
70
0
    pDest = dest;
71
0
    destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
72
0
    reqLength = 0;
73
0
    numSubstitutions = 0;
74
75
0
    if(srcLength < 0) {
76
        /* simple loop for conversion of a NUL-terminated BMP string */
77
0
        while((ch=*src) != 0 &&
78
0
              ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
79
0
            ++src;
80
0
            if(pDest < destLimit) {
81
0
                *pDest++ = (UChar)ch;
82
0
            } else {
83
0
                ++reqLength;
84
0
            }
85
0
        }
86
0
        srcLimit = src;
87
0
        if(ch != 0) {
88
            /* "complicated" case, find the end of the remaining string */
89
0
            while(*++srcLimit != 0) {}
90
0
        }
91
0
    } else {
92
0
      srcLimit = (src!=NULL)?(src + srcLength):NULL;
93
0
    }
94
95
    /* convert with length */
96
0
    while(src < srcLimit) {
97
0
        ch = *src++;
98
0
        do {
99
            /* usually "loops" once; twice only for writing subchar */
100
0
            if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
101
0
                if(pDest < destLimit) {
102
0
                    *pDest++ = (UChar)ch;
103
0
                } else {
104
0
                    ++reqLength;
105
0
                }
106
0
                break;
107
0
            } else if(0x10000 <= ch && ch <= 0x10ffff) {
108
0
                if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
109
0
                    *pDest++ = U16_LEAD(ch);
110
0
                    *pDest++ = U16_TRAIL(ch);
111
0
                } else {
112
0
                    reqLength += 2;
113
0
                }
114
0
                break;
115
0
            } else if((ch = subchar) < 0) {
116
                /* surrogate code point, or not a Unicode code point at all */
117
0
                *pErrorCode = U_INVALID_CHAR_FOUND;
118
0
                return NULL;
119
0
            } else {
120
0
                ++numSubstitutions;
121
0
            }
122
0
        } while(TRUE);
123
0
    }
124
125
0
    reqLength += (int32_t)(pDest - dest);
126
0
    if(pDestLength) {
127
0
        *pDestLength = reqLength;
128
0
    }
129
0
    if(pNumSubstitutions != NULL) {
130
0
        *pNumSubstitutions = numSubstitutions;
131
0
    }
132
133
    /* Terminate the buffer */
134
0
    u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
135
    
136
0
    return dest;
137
0
}
138
139
U_CAPI UChar* U_EXPORT2 
140
u_strFromUTF32(UChar *dest,
141
               int32_t destCapacity, 
142
               int32_t *pDestLength,
143
               const UChar32 *src,
144
               int32_t srcLength,
145
0
               UErrorCode *pErrorCode) {
146
0
    return u_strFromUTF32WithSub(
147
0
            dest, destCapacity, pDestLength,
148
0
            src, srcLength,
149
0
            U_SENTINEL, NULL,
150
0
            pErrorCode);
151
0
}
152
153
U_CAPI UChar32* U_EXPORT2 
154
u_strToUTF32WithSub(UChar32 *dest,
155
             int32_t destCapacity,
156
             int32_t *pDestLength,
157
             const UChar *src,
158
             int32_t srcLength,
159
             UChar32 subchar, int32_t *pNumSubstitutions,
160
0
             UErrorCode *pErrorCode) {
161
0
    const UChar *srcLimit;
162
0
    UChar32 ch;
163
0
    UChar ch2;
164
0
    UChar32 *destLimit;
165
0
    UChar32 *pDest;
166
0
    int32_t reqLength;
167
0
    int32_t numSubstitutions;
168
169
    /* args check */
170
0
    if(U_FAILURE(*pErrorCode)){
171
0
        return NULL;
172
0
    }
173
0
    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
174
0
        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
175
0
        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
176
0
    ) {
177
0
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
178
0
        return NULL;
179
0
    }
180
181
0
    if(pNumSubstitutions != NULL) {
182
0
        *pNumSubstitutions = 0;
183
0
    }
184
185
0
    pDest = dest;
186
0
    destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
187
0
    reqLength = 0;
188
0
    numSubstitutions = 0;
189
190
0
    if(srcLength < 0) {
191
        /* simple loop for conversion of a NUL-terminated BMP string */
192
0
        while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
193
0
            ++src;
194
0
            if(pDest < destLimit) {
195
0
                *pDest++ = ch;
196
0
            } else {
197
0
                ++reqLength;
198
0
            }
199
0
        }
200
0
        srcLimit = src;
201
0
        if(ch != 0) {
202
            /* "complicated" case, find the end of the remaining string */
203
0
            while(*++srcLimit != 0) {}
204
0
        }
205
0
    } else {
206
0
        srcLimit = (src!=NULL)?(src + srcLength):NULL;
207
0
    }
208
209
    /* convert with length */
210
0
    while(src < srcLimit) {
211
0
        ch = *src++;
212
0
        if(!U16_IS_SURROGATE(ch)) {
213
            /* write or count ch below */
214
0
        } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
215
0
            ++src;
216
0
            ch = U16_GET_SUPPLEMENTARY(ch, ch2);
217
0
        } else if((ch = subchar) < 0) {
218
            /* unpaired surrogate */
219
0
            *pErrorCode = U_INVALID_CHAR_FOUND;
220
0
            return NULL;
221
0
        } else {
222
0
            ++numSubstitutions;
223
0
        }
224
0
        if(pDest < destLimit) {
225
0
            *pDest++ = ch;
226
0
        } else {
227
0
            ++reqLength;
228
0
        }
229
0
    }
230
231
0
    reqLength += (int32_t)(pDest - dest);
232
0
    if(pDestLength) {
233
0
        *pDestLength = reqLength;
234
0
    }
235
0
    if(pNumSubstitutions != NULL) {
236
0
        *pNumSubstitutions = numSubstitutions;
237
0
    }
238
239
    /* Terminate the buffer */
240
0
    u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
241
242
0
    return dest;
243
0
}
244
245
U_CAPI UChar32* U_EXPORT2 
246
u_strToUTF32(UChar32 *dest, 
247
             int32_t destCapacity,
248
             int32_t *pDestLength,
249
             const UChar *src, 
250
             int32_t srcLength,
251
0
             UErrorCode *pErrorCode) {
252
0
    return u_strToUTF32WithSub(
253
0
            dest, destCapacity, pDestLength,
254
0
            src, srcLength,
255
0
            U_SENTINEL, NULL,
256
0
            pErrorCode);
257
0
}
258
259
/* for utf8_nextCharSafeBodyTerminated() */
260
static const UChar32
261
utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
262
263
/*
264
 * Version of utf8_nextCharSafeBody() with the following differences:
265
 * - checks for NUL termination instead of length
266
 * - works with pointers instead of indexes
267
 * - always strict (strict==-1)
268
 *
269
 * *ps points to after the lead byte and will be moved to after the last trail byte.
270
 * c is the lead byte.
271
 * @return the code point, or U_SENTINEL
272
 */
273
static UChar32
274
0
utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
275
0
    const uint8_t *s=*ps;
276
0
    uint8_t trail, illegal=0;
277
0
    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
278
0
    U_ASSERT(count<6);
279
0
    U8_MASK_LEAD_BYTE((c), count);
280
    /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
281
0
    switch(count) {
282
    /* each branch falls through to the next one */
283
0
    case 5:
284
0
    case 4:
285
        /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
286
0
        illegal=1;
287
0
        break;
288
0
    case 3:
289
0
        trail=(uint8_t)(*s++ - 0x80);
290
0
        c=(c<<6)|trail;
291
0
        if(trail>0x3f || c>=0x110) {
292
            /* not a trail byte, or code point>0x10ffff (outside Unicode) */
293
0
            illegal=1;
294
0
            break;
295
0
        }
296
0
        U_FALLTHROUGH;
297
0
    case 2:
298
0
        trail=(uint8_t)(*s++ - 0x80);
299
0
        if(trail>0x3f) {
300
            /* not a trail byte */
301
0
            illegal=1;
302
0
            break;
303
0
        }
304
0
        c=(c<<6)|trail;
305
0
        U_FALLTHROUGH;
306
0
    case 1:
307
0
        trail=(uint8_t)(*s++ - 0x80);
308
0
        if(trail>0x3f) {
309
            /* not a trail byte */
310
0
            illegal=1;
311
0
        }
312
0
        c=(c<<6)|trail;
313
0
        break;
314
0
    case 0:
315
0
        return U_SENTINEL;
316
    /* no default branch to optimize switch()  - all values are covered */
317
0
    }
318
319
    /* correct sequence - all trail bytes have (b7..b6)==(10)? */
320
    /* illegal is also set if count>=4 */
321
0
    if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
322
        /* error handling */
323
        /* don't go beyond this sequence */
324
0
        s=*ps;
325
0
        while(count>0 && U8_IS_TRAIL(*s)) {
326
0
            ++s;
327
0
            --count;
328
0
        }
329
0
        c=U_SENTINEL;
330
0
    }
331
0
    *ps=s;
332
0
    return c;
333
0
}
334
335
/*
336
 * Version of utf8_nextCharSafeBody() with the following differences:
337
 * - works with pointers instead of indexes
338
 * - always strict (strict==-1)
339
 *
340
 * *ps points to after the lead byte and will be moved to after the last trail byte.
341
 * c is the lead byte.
342
 * @return the code point, or U_SENTINEL
343
 */
344
static UChar32
345
0
utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
346
0
    const uint8_t *s=*ps;
347
0
    uint8_t trail, illegal=0;
348
0
    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
349
0
    if((limit-s)>=count) {
350
0
        U8_MASK_LEAD_BYTE((c), count);
351
        /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
352
0
        switch(count) {
353
        /* each branch falls through to the next one */
354
0
        case 5:
355
0
        case 4:
356
            /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
357
0
            illegal=1;
358
0
            break;
359
0
        case 3:
360
0
            trail=*s++;
361
0
            c=(c<<6)|(trail&0x3f);
362
0
            if(c<0x110) {
363
0
                illegal|=(trail&0xc0)^0x80;
364
0
            } else {
365
                /* code point>0x10ffff, outside Unicode */
366
0
                illegal=1;
367
0
                break;
368
0
            }
369
0
            U_FALLTHROUGH;
370
0
        case 2:
371
0
            trail=*s++;
372
0
            c=(c<<6)|(trail&0x3f);
373
0
            illegal|=(trail&0xc0)^0x80;
374
0
            U_FALLTHROUGH;
375
0
        case 1:
376
0
            trail=*s++;
377
0
            c=(c<<6)|(trail&0x3f);
378
0
            illegal|=(trail&0xc0)^0x80;
379
0
            break;
380
0
        case 0:
381
0
            return U_SENTINEL;
382
        /* no default branch to optimize switch()  - all values are covered */
383
0
        }
384
0
    } else {
385
0
        illegal=1; /* too few bytes left */
386
0
    }
387
388
    /* correct sequence - all trail bytes have (b7..b6)==(10)? */
389
    /* illegal is also set if count>=4 */
390
0
    U_ASSERT(illegal || count<UPRV_LENGTHOF(utf8_minLegal));
391
0
    if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
392
        /* error handling */
393
        /* don't go beyond this sequence */
394
0
        s=*ps;
395
0
        while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
396
0
            ++s;
397
0
            --count;
398
0
        }
399
0
        c=U_SENTINEL;
400
0
    }
401
0
    *ps=s;
402
0
    return c;
403
0
}
404
405
U_CAPI UChar* U_EXPORT2
406
u_strFromUTF8WithSub(UChar *dest,
407
              int32_t destCapacity,
408
              int32_t *pDestLength,
409
              const char* src,
410
              int32_t srcLength,
411
              UChar32 subchar, int32_t *pNumSubstitutions,
412
0
              UErrorCode *pErrorCode){
413
0
    UChar *pDest = dest;
414
0
    UChar *pDestLimit = dest+destCapacity;
415
0
    UChar32 ch;
416
0
    int32_t reqLength = 0;
417
0
    const uint8_t* pSrc = (const uint8_t*) src;
418
0
    uint8_t t1, t2; /* trail bytes */
419
0
    int32_t numSubstitutions;
420
421
    /* args check */
422
0
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
423
0
        return NULL;
424
0
    }
425
        
426
0
    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
427
0
        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
428
0
        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
429
0
    ) {
430
0
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
431
0
        return NULL;
432
0
    }
433
434
0
    if(pNumSubstitutions!=NULL) {
435
0
        *pNumSubstitutions=0;
436
0
    }
437
0
    numSubstitutions=0;
438
439
    /*
440
     * Inline processing of UTF-8 byte sequences:
441
     *
442
     * Byte sequences for the most common characters are handled inline in
443
     * the conversion loops. In order to reduce the path lengths for those
444
     * characters, the tests are arranged in a kind of binary search.
445
     * ASCII (<=0x7f) is checked first, followed by the dividing point
446
     * between 2- and 3-byte sequences (0xe0).
447
     * The 3-byte branch is tested first to speed up CJK text.
448
     * The compiler should combine the subtractions for the two tests for 0xe0.
449
     * Each branch then tests for the other end of its range.
450
     */
451
452
0
    if(srcLength < 0){
453
        /*
454
         * Transform a NUL-terminated string.
455
         * The code explicitly checks for NULs only in the lead byte position.
456
         * A NUL byte in the trail byte position fails the trail byte range check anyway.
457
         */
458
0
        while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
459
0
            if(ch <= 0x7f){
460
0
                *pDest++=(UChar)ch;
461
0
                ++pSrc;
462
0
            } else {
463
0
                if(ch > 0xe0) {
464
0
                    if( /* handle U+1000..U+CFFF inline */
465
0
                        ch <= 0xec &&
466
0
                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
467
0
                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
468
0
                    ) {
469
                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
470
0
                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
471
0
                        pSrc += 3;
472
0
                        continue;
473
0
                    }
474
0
                } else if(ch < 0xe0) {
475
0
                    if( /* handle U+0080..U+07FF inline */
476
0
                        ch >= 0xc2 &&
477
0
                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
478
0
                    ) {
479
0
                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
480
0
                        pSrc += 2;
481
0
                        continue;
482
0
                    }
483
0
                }
484
485
                /* function call for "complicated" and error cases */
486
0
                ++pSrc; /* continue after the lead byte */
487
0
                ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
488
0
                if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
489
0
                    *pErrorCode = U_INVALID_CHAR_FOUND;
490
0
                    return NULL;
491
0
                } else if(ch<=0xFFFF) {
492
0
                    *(pDest++)=(UChar)ch;
493
0
                } else {
494
0
                    *(pDest++)=U16_LEAD(ch);
495
0
                    if(pDest<pDestLimit) {
496
0
                        *(pDest++)=U16_TRAIL(ch);
497
0
                    } else {
498
0
                        reqLength++;
499
0
                        break;
500
0
                    }
501
0
                }
502
0
            }
503
0
        }
504
505
        /* Pre-flight the rest of the string. */
506
0
        while((ch = *pSrc) != 0) {
507
0
            if(ch <= 0x7f){
508
0
                ++reqLength;
509
0
                ++pSrc;
510
0
            } else {
511
0
                if(ch > 0xe0) {
512
0
                    if( /* handle U+1000..U+CFFF inline */
513
0
                        ch <= 0xec &&
514
0
                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
515
0
                        (uint8_t)(pSrc[2] - 0x80) <= 0x3f
516
0
                    ) {
517
0
                        ++reqLength;
518
0
                        pSrc += 3;
519
0
                        continue;
520
0
                    }
521
0
                } else if(ch < 0xe0) {
522
0
                    if( /* handle U+0080..U+07FF inline */
523
0
                        ch >= 0xc2 &&
524
0
                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f
525
0
                    ) {
526
0
                        ++reqLength;
527
0
                        pSrc += 2;
528
0
                        continue;
529
0
                    }
530
0
                }
531
532
                /* function call for "complicated" and error cases */
533
0
                ++pSrc; /* continue after the lead byte */
534
0
                ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
535
0
                if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
536
0
                    *pErrorCode = U_INVALID_CHAR_FOUND;
537
0
                    return NULL;
538
0
                }
539
0
                reqLength += U16_LENGTH(ch);
540
0
            }
541
0
        }
542
0
    } else /* srcLength >= 0 */ {
543
0
        const uint8_t *pSrcLimit = pSrc + srcLength;
544
0
        int32_t count;
545
546
        /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
547
0
        for(;;) {
548
            /*
549
             * Each iteration of the inner loop progresses by at most 3 UTF-8
550
             * bytes and one UChar, for most characters.
551
             * For supplementary code points (4 & 2), which are rare,
552
             * there is an additional adjustment.
553
             */
554
0
            count = (int32_t)(pDestLimit - pDest);
555
0
            srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
556
0
            if(count > srcLength) {
557
0
                count = srcLength; /* min(remaining dest, remaining src/3) */
558
0
            }
559
0
            if(count < 3) {
560
                /*
561
                 * Too much overhead if we get near the end of the string,
562
                 * continue with the next loop.
563
                 */
564
0
                break;
565
0
            }
566
567
0
            do {
568
0
                ch = *pSrc;
569
0
                if(ch <= 0x7f){
570
0
                    *pDest++=(UChar)ch;
571
0
                    ++pSrc;
572
0
                } else {
573
0
                    if(ch > 0xe0) {
574
0
                        if( /* handle U+1000..U+CFFF inline */
575
0
                            ch <= 0xec &&
576
0
                            (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
577
0
                            (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
578
0
                        ) {
579
                            /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
580
0
                            *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
581
0
                            pSrc += 3;
582
0
                            continue;
583
0
                        }
584
0
                    } else if(ch < 0xe0) {
585
0
                        if( /* handle U+0080..U+07FF inline */
586
0
                            ch >= 0xc2 &&
587
0
                            (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
588
0
                        ) {
589
0
                            *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
590
0
                            pSrc += 2;
591
0
                            continue;
592
0
                        }
593
0
                    }
594
595
0
                    if(ch >= 0xf0 || subchar > 0xffff) {
596
                        /*
597
                         * We may read up to six bytes and write up to two UChars,
598
                         * which we didn't account for with computing count,
599
                         * so we adjust it here.
600
                         */
601
0
                        if(--count == 0) {
602
0
                            break;
603
0
                        }
604
0
                    }
605
606
                    /* function call for "complicated" and error cases */
607
0
                    ++pSrc; /* continue after the lead byte */
608
0
                    ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
609
0
                    if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
610
0
                        *pErrorCode = U_INVALID_CHAR_FOUND;
611
0
                        return NULL;
612
0
                    }else if(ch<=0xFFFF){
613
0
                        *(pDest++)=(UChar)ch;
614
0
                    }else{
615
0
                        *(pDest++)=U16_LEAD(ch);
616
0
                        *(pDest++)=U16_TRAIL(ch);
617
0
                    }
618
0
                }
619
0
            } while(--count > 0);
620
0
        }
621
622
0
        while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
623
0
            ch = *pSrc;
624
0
            if(ch <= 0x7f){
625
0
                *pDest++=(UChar)ch;
626
0
                ++pSrc;
627
0
            } else {
628
0
                if(ch > 0xe0) {
629
0
                    if( /* handle U+1000..U+CFFF inline */
630
0
                        ch <= 0xec &&
631
0
                        ((pSrcLimit - pSrc) >= 3) &&
632
0
                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
633
0
                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
634
0
                    ) {
635
                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
636
0
                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
637
0
                        pSrc += 3;
638
0
                        continue;
639
0
                    }
640
0
                } else if(ch < 0xe0) {
641
0
                    if( /* handle U+0080..U+07FF inline */
642
0
                        ch >= 0xc2 &&
643
0
                        ((pSrcLimit - pSrc) >= 2) &&
644
0
                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
645
0
                    ) {
646
0
                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
647
0
                        pSrc += 2;
648
0
                        continue;
649
0
                    }
650
0
                }
651
652
                /* function call for "complicated" and error cases */
653
0
                ++pSrc; /* continue after the lead byte */
654
0
                ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
655
0
                if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
656
0
                    *pErrorCode = U_INVALID_CHAR_FOUND;
657
0
                    return NULL;
658
0
                }else if(ch<=0xFFFF){
659
0
                    *(pDest++)=(UChar)ch;
660
0
                }else{
661
0
                    *(pDest++)=U16_LEAD(ch);
662
0
                    if(pDest<pDestLimit){
663
0
                        *(pDest++)=U16_TRAIL(ch);
664
0
                    }else{
665
0
                        reqLength++;
666
0
                        break;
667
0
                    }
668
0
                }
669
0
            }
670
0
        }
671
        /* do not fill the dest buffer just count the UChars needed */
672
0
        while(pSrc < pSrcLimit){
673
0
            ch = *pSrc;
674
0
            if(ch <= 0x7f){
675
0
                reqLength++;
676
0
                ++pSrc;
677
0
            } else {
678
0
                if(ch > 0xe0) {
679
0
                    if( /* handle U+1000..U+CFFF inline */
680
0
                        ch <= 0xec &&
681
0
                        ((pSrcLimit - pSrc) >= 3) &&
682
0
                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
683
0
                        (uint8_t)(pSrc[2] - 0x80) <= 0x3f
684
0
                    ) {
685
0
                        reqLength++;
686
0
                        pSrc += 3;
687
0
                        continue;
688
0
                    }
689
0
                } else if(ch < 0xe0) {
690
0
                    if( /* handle U+0080..U+07FF inline */
691
0
                        ch >= 0xc2 &&
692
0
                        ((pSrcLimit - pSrc) >= 2) &&
693
0
                        (uint8_t)(pSrc[1] - 0x80) <= 0x3f
694
0
                    ) {
695
0
                        reqLength++;
696
0
                        pSrc += 2;
697
0
                        continue;
698
0
                    }
699
0
                }
700
701
                /* function call for "complicated" and error cases */
702
0
                ++pSrc; /* continue after the lead byte */
703
0
                ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
704
0
                if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
705
0
                    *pErrorCode = U_INVALID_CHAR_FOUND;
706
0
                    return NULL;
707
0
                }
708
0
                reqLength+=U16_LENGTH(ch);
709
0
            }
710
0
        }
711
0
    }
712
713
0
    reqLength+=(int32_t)(pDest - dest);
714
715
0
    if(pNumSubstitutions!=NULL) {
716
0
        *pNumSubstitutions=numSubstitutions;
717
0
    }
718
719
0
    if(pDestLength){
720
0
        *pDestLength = reqLength;
721
0
    }
722
723
    /* Terminate the buffer */
724
0
    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
725
726
0
    return dest;
727
0
}
728
729
U_CAPI UChar* U_EXPORT2
730
u_strFromUTF8(UChar *dest,
731
              int32_t destCapacity,
732
              int32_t *pDestLength,
733
              const char* src,
734
              int32_t srcLength,
735
0
              UErrorCode *pErrorCode){
736
0
    return u_strFromUTF8WithSub(
737
0
            dest, destCapacity, pDestLength,
738
0
            src, srcLength,
739
0
            U_SENTINEL, NULL,
740
0
            pErrorCode);
741
0
}
742
743
U_CAPI UChar * U_EXPORT2
744
u_strFromUTF8Lenient(UChar *dest,
745
                     int32_t destCapacity,
746
                     int32_t *pDestLength,
747
                     const char *src,
748
                     int32_t srcLength,
749
0
                     UErrorCode *pErrorCode) {
750
0
    UChar *pDest = dest;
751
0
    UChar32 ch;
752
0
    int32_t reqLength = 0;
753
0
    uint8_t* pSrc = (uint8_t*) src;
754
755
    /* args check */
756
0
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
757
0
        return NULL;
758
0
    }
759
        
760
0
    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
761
0
        (destCapacity<0) || (dest == NULL && destCapacity > 0)
762
0
    ) {
763
0
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
764
0
        return NULL;
765
0
    }
766
767
0
    if(srcLength < 0) {
768
        /* Transform a NUL-terminated string. */
769
0
        UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
770
0
        uint8_t t1, t2, t3; /* trail bytes */
771
772
0
        while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
773
0
            if(ch < 0xc0) {
774
                /*
775
                 * ASCII, or a trail byte in lead position which is treated like
776
                 * a single-byte sequence for better character boundary
777
                 * resynchronization after illegal sequences.
778
                 */
779
0
                *pDest++=(UChar)ch;
780
0
                ++pSrc;
781
0
                continue;
782
0
            } else if(ch < 0xe0) { /* U+0080..U+07FF */
783
0
                if((t1 = pSrc[1]) != 0) {
784
                    /* 0x3080 = (0xc0 << 6) + 0x80 */
785
0
                    *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
786
0
                    pSrc += 2;
787
0
                    continue;
788
0
                }
789
0
            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
790
0
                if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
791
                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
792
                    /* 0x2080 = (0x80 << 6) + 0x80 */
793
0
                    *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
794
0
                    pSrc += 3;
795
0
                    continue;
796
0
                }
797
0
            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
798
0
                if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
799
0
                    pSrc += 4;
800
                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
801
0
                    ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
802
0
                    *(pDest++) = U16_LEAD(ch);
803
0
                    if(pDest < pDestLimit) {
804
0
                        *(pDest++) = U16_TRAIL(ch);
805
0
                    } else {
806
0
                        reqLength = 1;
807
0
                        break;
808
0
                    }
809
0
                    continue;
810
0
                }
811
0
            }
812
813
            /* truncated character at the end */
814
0
            *pDest++ = 0xfffd;
815
0
            while(*++pSrc != 0) {}
816
0
            break;
817
0
        }
818
819
        /* Pre-flight the rest of the string. */
820
0
        while((ch = *pSrc) != 0) {
821
0
            if(ch < 0xc0) {
822
                /*
823
                 * ASCII, or a trail byte in lead position which is treated like
824
                 * a single-byte sequence for better character boundary
825
                 * resynchronization after illegal sequences.
826
                 */
827
0
                ++reqLength;
828
0
                ++pSrc;
829
0
                continue;
830
0
            } else if(ch < 0xe0) { /* U+0080..U+07FF */
831
0
                if(pSrc[1] != 0) {
832
0
                    ++reqLength;
833
0
                    pSrc += 2;
834
0
                    continue;
835
0
                }
836
0
            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
837
0
                if(pSrc[1] != 0 && pSrc[2] != 0) {
838
0
                    ++reqLength;
839
0
                    pSrc += 3;
840
0
                    continue;
841
0
                }
842
0
            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
843
0
                if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
844
0
                    reqLength += 2;
845
0
                    pSrc += 4;
846
0
                    continue;
847
0
                }
848
0
            }
849
850
            /* truncated character at the end */
851
0
            ++reqLength;
852
0
            break;
853
0
        }
854
0
    } else /* srcLength >= 0 */ {
855
0
      const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
856
857
        /*
858
         * This function requires that if srcLength is given, then it must be
859
         * destCapatity >= srcLength so that we need not check for
860
         * destination buffer overflow in the loop.
861
         */
862
0
        if(destCapacity < srcLength) {
863
0
            if(pDestLength != NULL) {
864
0
                *pDestLength = srcLength; /* this likely overestimates the true destLength! */
865
0
            }
866
0
            *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
867
0
            return NULL;
868
0
        }
869
870
0
        if((pSrcLimit - pSrc) >= 4) {
871
0
            pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
872
873
            /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
874
0
            do {
875
0
                ch = *pSrc++;
876
0
                if(ch < 0xc0) {
877
                    /*
878
                     * ASCII, or a trail byte in lead position which is treated like
879
                     * a single-byte sequence for better character boundary
880
                     * resynchronization after illegal sequences.
881
                     */
882
0
                    *pDest++=(UChar)ch;
883
0
                } else if(ch < 0xe0) { /* U+0080..U+07FF */
884
                    /* 0x3080 = (0xc0 << 6) + 0x80 */
885
0
                    *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
886
0
                } else if(ch < 0xf0) { /* U+0800..U+FFFF */
887
                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
888
                    /* 0x2080 = (0x80 << 6) + 0x80 */
889
0
                    ch = (ch << 12) + (*pSrc++ << 6);
890
0
                    *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
891
0
                } else /* f0..f4 */ { /* U+10000..U+10FFFF */
892
                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
893
0
                    ch = (ch << 18) + (*pSrc++ << 12);
894
0
                    ch += *pSrc++ << 6;
895
0
                    ch += *pSrc++ - 0x3c82080;
896
0
                    *(pDest++) = U16_LEAD(ch);
897
0
                    *(pDest++) = U16_TRAIL(ch);
898
0
                }
899
0
            } while(pSrc < pSrcLimit);
900
901
0
            pSrcLimit += 3; /* restore original pSrcLimit */
902
0
        }
903
904
0
        while(pSrc < pSrcLimit) {
905
0
            ch = *pSrc++;
906
0
            if(ch < 0xc0) {
907
                /*
908
                 * ASCII, or a trail byte in lead position which is treated like
909
                 * a single-byte sequence for better character boundary
910
                 * resynchronization after illegal sequences.
911
                 */
912
0
                *pDest++=(UChar)ch;
913
0
                continue;
914
0
            } else if(ch < 0xe0) { /* U+0080..U+07FF */
915
0
                if(pSrc < pSrcLimit) {
916
                    /* 0x3080 = (0xc0 << 6) + 0x80 */
917
0
                    *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
918
0
                    continue;
919
0
                }
920
0
            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
921
0
                if((pSrcLimit - pSrc) >= 2) {
922
                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
923
                    /* 0x2080 = (0x80 << 6) + 0x80 */
924
0
                    ch = (ch << 12) + (*pSrc++ << 6);
925
0
                    *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
926
0
                    pSrc += 3;
927
0
                    continue;
928
0
                }
929
0
            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
930
0
                if((pSrcLimit - pSrc) >= 3) {
931
                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
932
0
                    ch = (ch << 18) + (*pSrc++ << 12);
933
0
                    ch += *pSrc++ << 6;
934
0
                    ch += *pSrc++ - 0x3c82080;
935
0
                    *(pDest++) = U16_LEAD(ch);
936
0
                    *(pDest++) = U16_TRAIL(ch);
937
0
                    pSrc += 4;
938
0
                    continue;
939
0
                }
940
0
            }
941
942
            /* truncated character at the end */
943
0
            *pDest++ = 0xfffd;
944
0
            break;
945
0
        }
946
0
    }
947
948
0
    reqLength+=(int32_t)(pDest - dest);
949
950
0
    if(pDestLength){
951
0
        *pDestLength = reqLength;
952
0
    }
953
954
    /* Terminate the buffer */
955
0
    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
956
957
0
    return dest;
958
0
}
959
960
static inline uint8_t *
961
96
_appendUTF8(uint8_t *pDest, UChar32 c) {
962
    /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
963
96
    if((c)<=0x7f) {
964
0
        *pDest++=(uint8_t)c;
965
96
    } else if(c<=0x7ff) {
966
0
        *pDest++=(uint8_t)((c>>6)|0xc0);
967
0
        *pDest++=(uint8_t)((c&0x3f)|0x80);
968
96
    } else if(c<=0xffff) {
969
0
        *pDest++=(uint8_t)((c>>12)|0xe0);
970
0
        *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
971
0
        *pDest++=(uint8_t)(((c)&0x3f)|0x80);
972
96
    } else /* if((uint32_t)(c)<=0x10ffff) */ {
973
96
        *pDest++=(uint8_t)(((c)>>18)|0xf0);
974
96
        *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
975
96
        *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
976
96
        *pDest++=(uint8_t)(((c)&0x3f)|0x80);
977
96
    }
978
96
    return pDest;
979
96
}
980
981
   
982
U_CAPI char* U_EXPORT2 
983
u_strToUTF8WithSub(char *dest,
984
            int32_t destCapacity,
985
            int32_t *pDestLength,
986
            const UChar *pSrc,
987
            int32_t srcLength,
988
            UChar32 subchar, int32_t *pNumSubstitutions,
989
3.57k
            UErrorCode *pErrorCode){
990
3.57k
    int32_t reqLength=0;
991
3.57k
    uint32_t ch=0,ch2=0;
992
3.57k
    uint8_t *pDest = (uint8_t *)dest;
993
3.57k
    uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
994
3.57k
    int32_t numSubstitutions;
995
996
    /* args check */
997
3.57k
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
998
0
        return NULL;
999
0
    }
1000
        
1001
3.57k
    if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
1002
3.57k
        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
1003
3.57k
        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1004
3.57k
    ) {
1005
0
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1006
0
        return NULL;
1007
0
    }
1008
1009
3.57k
    if(pNumSubstitutions!=NULL) {
1010
0
        *pNumSubstitutions=0;
1011
0
    }
1012
3.57k
    numSubstitutions=0;
1013
1014
3.57k
    if(srcLength==-1) {
1015
0
        while((ch=*pSrc)!=0) {
1016
0
            ++pSrc;
1017
0
            if(ch <= 0x7f) {
1018
0
                if(pDest<pDestLimit) {
1019
0
                    *pDest++ = (uint8_t)ch;
1020
0
                } else {
1021
0
                    reqLength = 1;
1022
0
                    break;
1023
0
                }
1024
0
            } else if(ch <= 0x7ff) {
1025
0
                if((pDestLimit - pDest) >= 2) {
1026
0
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
1027
0
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
1028
0
                } else {
1029
0
                    reqLength = 2;
1030
0
                    break;
1031
0
                }
1032
0
            } else if(ch <= 0xd7ff || ch >= 0xe000) {
1033
0
                if((pDestLimit - pDest) >= 3) {
1034
0
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
1035
0
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1036
0
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
1037
0
                } else {
1038
0
                    reqLength = 3;
1039
0
                    break;
1040
0
                }
1041
0
            } else /* ch is a surrogate */ {
1042
0
                int32_t length;
1043
1044
                /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
1045
0
                if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 
1046
0
                    ++pSrc;
1047
0
                    ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1048
0
                } else if(subchar>=0) {
1049
0
                    ch=subchar;
1050
0
                    ++numSubstitutions;
1051
0
                } else {
1052
                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1053
0
                    *pErrorCode = U_INVALID_CHAR_FOUND;
1054
0
                    return NULL;
1055
0
                }
1056
1057
0
                length = U8_LENGTH(ch);
1058
0
                if((pDestLimit - pDest) >= length) {
1059
                    /* convert and append*/
1060
0
                    pDest=_appendUTF8(pDest, ch);
1061
0
                } else {
1062
0
                    reqLength = length;
1063
0
                    break;
1064
0
                }
1065
0
            }
1066
0
        }
1067
0
        while((ch=*pSrc++)!=0) {
1068
0
            if(ch<=0x7f) {
1069
0
                ++reqLength;
1070
0
            } else if(ch<=0x7ff) {
1071
0
                reqLength+=2;
1072
0
            } else if(!U16_IS_SURROGATE(ch)) {
1073
0
                reqLength+=3;
1074
0
            } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1075
0
                ++pSrc;
1076
0
                reqLength+=4;
1077
0
            } else if(subchar>=0) {
1078
0
                reqLength+=U8_LENGTH(subchar);
1079
0
                ++numSubstitutions;
1080
0
            } else {
1081
                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1082
0
                *pErrorCode = U_INVALID_CHAR_FOUND;
1083
0
                return NULL;
1084
0
            }
1085
0
        }
1086
3.57k
    } else {
1087
3.57k
        const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
1088
3.57k
        int32_t count;
1089
1090
        /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1091
6.38k
        for(;;) {
1092
            /*
1093
             * Each iteration of the inner loop progresses by at most 3 UTF-8
1094
             * bytes and one UChar, for most characters.
1095
             * For supplementary code points (4 & 2), which are rare,
1096
             * there is an additional adjustment.
1097
             */
1098
6.38k
            count = (int32_t)((pDestLimit - pDest) / 3);
1099
6.38k
            srcLength = (int32_t)(pSrcLimit - pSrc);
1100
6.38k
            if(count > srcLength) {
1101
6.38k
                count = srcLength; /* min(remaining dest/3, remaining src) */
1102
6.38k
            }
1103
6.38k
            if(count < 3) {
1104
                /*
1105
                 * Too much overhead if we get near the end of the string,
1106
                 * continue with the next loop.
1107
                 */
1108
3.57k
                break;
1109
3.57k
            }
1110
7.44M
            do {
1111
7.44M
                ch=*pSrc++;
1112
7.44M
                if(ch <= 0x7f) {
1113
4.36M
                    *pDest++ = (uint8_t)ch;
1114
4.36M
                } else if(ch <= 0x7ff) {
1115
1.30M
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
1116
1.30M
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
1117
1.76M
                } else if(ch <= 0xd7ff || ch >= 0xe000) {
1118
1.74M
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
1119
1.74M
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1120
1.74M
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
1121
1.74M
                } else /* ch is a surrogate */ {
1122
                    /*
1123
                     * We will read two UChars and probably output four bytes,
1124
                     * which we didn't account for with computing count,
1125
                     * so we adjust it here.
1126
                     */
1127
24.6k
                    if(--count == 0) {
1128
0
                        --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
1129
0
                        break;  /* recompute count */
1130
0
                    }
1131
1132
24.6k
                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 
1133
24.6k
                        ++pSrc;
1134
24.6k
                        ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1135
1136
                        /* writing 4 bytes per 2 UChars is ok */
1137
24.6k
                        *pDest++=(uint8_t)((ch>>18)|0xf0);
1138
24.6k
                        *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
1139
24.6k
                        *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1140
24.6k
                        *pDest++=(uint8_t)((ch&0x3f)|0x80);
1141
24.6k
                    } else  {
1142
                        /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1143
0
                        if(subchar>=0) {
1144
0
                            ch=subchar;
1145
0
                            ++numSubstitutions;
1146
0
                        } else {
1147
0
                            *pErrorCode = U_INVALID_CHAR_FOUND;
1148
0
                            return NULL;
1149
0
                        }
1150
1151
                        /* convert and append*/
1152
0
                        pDest=_appendUTF8(pDest, ch);
1153
0
                    }
1154
24.6k
                }
1155
7.44M
            } while(--count > 0);
1156
2.81k
        }
1157
1158
4.71k
        while(pSrc<pSrcLimit) {
1159
1.13k
            ch=*pSrc++;
1160
1.13k
            if(ch <= 0x7f) {
1161
181
                if(pDest<pDestLimit) {
1162
181
                    *pDest++ = (uint8_t)ch;
1163
181
                } else {
1164
0
                    reqLength = 1;
1165
0
                    break;
1166
0
                }
1167
957
            } else if(ch <= 0x7ff) {
1168
347
                if((pDestLimit - pDest) >= 2) {
1169
347
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
1170
347
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
1171
347
                } else {
1172
0
                    reqLength = 2;
1173
0
                    break;
1174
0
                }
1175
610
            } else if(ch <= 0xd7ff || ch >= 0xe000) {
1176
514
                if((pDestLimit - pDest) >= 3) {
1177
514
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
1178
514
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1179
514
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
1180
514
                } else {
1181
0
                    reqLength = 3;
1182
0
                    break;
1183
0
                }
1184
514
            } else /* ch is a surrogate */ {
1185
96
                int32_t length;
1186
1187
96
                if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) { 
1188
96
                    ++pSrc;
1189
96
                    ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1190
96
                } else if(subchar>=0) {
1191
0
                    ch=subchar;
1192
0
                    ++numSubstitutions;
1193
0
                } else {
1194
                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1195
0
                    *pErrorCode = U_INVALID_CHAR_FOUND;
1196
0
                    return NULL;
1197
0
                }
1198
1199
96
                length = U8_LENGTH(ch);
1200
96
                if((pDestLimit - pDest) >= length) {
1201
                    /* convert and append*/
1202
96
                    pDest=_appendUTF8(pDest, ch);
1203
96
                } else {
1204
0
                    reqLength = length;
1205
0
                    break;
1206
0
                }
1207
96
            }
1208
1.13k
        }
1209
3.57k
        while(pSrc<pSrcLimit) {
1210
0
            ch=*pSrc++;
1211
0
            if(ch<=0x7f) {
1212
0
                ++reqLength;
1213
0
            } else if(ch<=0x7ff) {
1214
0
                reqLength+=2;
1215
0
            } else if(!U16_IS_SURROGATE(ch)) {
1216
0
                reqLength+=3;
1217
0
            } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1218
0
                ++pSrc;
1219
0
                reqLength+=4;
1220
0
            } else if(subchar>=0) {
1221
0
                reqLength+=U8_LENGTH(subchar);
1222
0
                ++numSubstitutions;
1223
0
            } else {
1224
                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1225
0
                *pErrorCode = U_INVALID_CHAR_FOUND;
1226
0
                return NULL;
1227
0
            }
1228
0
        }
1229
3.57k
    }
1230
1231
3.57k
    reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1232
1233
3.57k
    if(pNumSubstitutions!=NULL) {
1234
0
        *pNumSubstitutions=numSubstitutions;
1235
0
    }
1236
1237
3.57k
    if(pDestLength){
1238
0
        *pDestLength = reqLength;
1239
0
    }
1240
1241
    /* Terminate the buffer */
1242
3.57k
    u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1243
3.57k
    return dest;
1244
3.57k
}
1245
1246
U_CAPI char* U_EXPORT2 
1247
u_strToUTF8(char *dest,
1248
            int32_t destCapacity,
1249
            int32_t *pDestLength,
1250
            const UChar *pSrc,
1251
            int32_t srcLength,
1252
3.57k
            UErrorCode *pErrorCode){
1253
3.57k
    return u_strToUTF8WithSub(
1254
3.57k
            dest, destCapacity, pDestLength,
1255
3.57k
            pSrc, srcLength,
1256
3.57k
            U_SENTINEL, NULL,
1257
3.57k
            pErrorCode);
1258
3.57k
}
1259
1260
U_CAPI UChar* U_EXPORT2
1261
u_strFromJavaModifiedUTF8WithSub(
1262
        UChar *dest,
1263
        int32_t destCapacity,
1264
        int32_t *pDestLength,
1265
        const char *src,
1266
        int32_t srcLength,
1267
        UChar32 subchar, int32_t *pNumSubstitutions,
1268
0
        UErrorCode *pErrorCode) {
1269
0
    UChar *pDest = dest;
1270
0
    UChar *pDestLimit = dest+destCapacity;
1271
0
    UChar32 ch;
1272
0
    int32_t reqLength = 0;
1273
0
    const uint8_t* pSrc = (const uint8_t*) src;
1274
0
    const uint8_t *pSrcLimit;
1275
0
    int32_t count;
1276
0
    uint8_t t1, t2; /* trail bytes */
1277
0
    int32_t numSubstitutions;
1278
1279
    /* args check */
1280
0
    if(U_FAILURE(*pErrorCode)){
1281
0
        return NULL;
1282
0
    }
1283
0
    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1284
0
        (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1285
0
        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1286
0
    ) {
1287
0
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1288
0
        return NULL;
1289
0
    }
1290
1291
0
    if(pNumSubstitutions!=NULL) {
1292
0
        *pNumSubstitutions=0;
1293
0
    }
1294
0
    numSubstitutions=0;
1295
1296
0
    if(srcLength < 0) {
1297
        /*
1298
         * Transform a NUL-terminated ASCII string.
1299
         * Handle non-ASCII strings with slower code.
1300
         */
1301
0
        while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
1302
0
            *pDest++=(UChar)ch;
1303
0
            ++pSrc;
1304
0
        }
1305
0
        if(ch == 0) {
1306
0
            reqLength=(int32_t)(pDest - dest);
1307
0
            if(pDestLength) {
1308
0
                *pDestLength = reqLength;
1309
0
            }
1310
1311
            /* Terminate the buffer */
1312
0
            u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1313
0
            return dest;
1314
0
        }
1315
0
        srcLength = uprv_strlen((const char *)pSrc);
1316
0
    }
1317
1318
    /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1319
0
    pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
1320
0
    for(;;) {
1321
0
        count = (int32_t)(pDestLimit - pDest);
1322
0
        srcLength = (int32_t)(pSrcLimit - pSrc);
1323
0
        if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
1324
            /* fast ASCII loop */
1325
0
            const uint8_t *prevSrc = pSrc;
1326
0
            int32_t delta;
1327
0
            while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
1328
0
                *pDest++=(UChar)ch;
1329
0
                ++pSrc;
1330
0
            }
1331
0
            delta = (int32_t)(pSrc - prevSrc);
1332
0
            count -= delta;
1333
0
            srcLength -= delta;
1334
0
        }
1335
        /*
1336
         * Each iteration of the inner loop progresses by at most 3 UTF-8
1337
         * bytes and one UChar.
1338
         */
1339
0
        srcLength /= 3;
1340
0
        if(count > srcLength) {
1341
0
            count = srcLength; /* min(remaining dest, remaining src/3) */
1342
0
        }
1343
0
        if(count < 3) {
1344
            /*
1345
             * Too much overhead if we get near the end of the string,
1346
             * continue with the next loop.
1347
             */
1348
0
            break;
1349
0
        }
1350
0
        do {
1351
0
            ch = *pSrc;
1352
0
            if(ch <= 0x7f){
1353
0
                *pDest++=(UChar)ch;
1354
0
                ++pSrc;
1355
0
            } else {
1356
0
                if(ch >= 0xe0) {
1357
0
                    if( /* handle U+0000..U+FFFF inline */
1358
0
                        ch <= 0xef &&
1359
0
                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1360
0
                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1361
0
                    ) {
1362
                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1363
0
                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1364
0
                        pSrc += 3;
1365
0
                        continue;
1366
0
                    }
1367
0
                } else {
1368
0
                    if( /* handle U+0000..U+07FF inline */
1369
0
                        ch >= 0xc0 &&
1370
0
                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1371
0
                    ) {
1372
0
                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1373
0
                        pSrc += 2;
1374
0
                        continue;
1375
0
                    }
1376
0
                }
1377
1378
0
                if(subchar < 0) {
1379
0
                    *pErrorCode = U_INVALID_CHAR_FOUND;
1380
0
                    return NULL;
1381
0
                } else if(subchar > 0xffff && --count == 0) {
1382
                    /*
1383
                     * We need to write two UChars, adjusted count for that,
1384
                     * and ran out of space.
1385
                     */
1386
0
                    break;
1387
0
                } else {
1388
                    /* function call for error cases */
1389
0
                    ++pSrc; /* continue after the lead byte */
1390
0
                    utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1391
0
                    ++numSubstitutions;
1392
0
                    if(subchar<=0xFFFF) {
1393
0
                        *(pDest++)=(UChar)subchar;
1394
0
                    } else {
1395
0
                        *(pDest++)=U16_LEAD(subchar);
1396
0
                        *(pDest++)=U16_TRAIL(subchar);
1397
0
                    }
1398
0
                }
1399
0
            }
1400
0
        } while(--count > 0);
1401
0
    }
1402
1403
0
    while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
1404
0
        ch = *pSrc;
1405
0
        if(ch <= 0x7f){
1406
0
            *pDest++=(UChar)ch;
1407
0
            ++pSrc;
1408
0
        } else {
1409
0
            if(ch >= 0xe0) {
1410
0
                if( /* handle U+0000..U+FFFF inline */
1411
0
                    ch <= 0xef &&
1412
0
                    ((pSrcLimit - pSrc) >= 3) &&
1413
0
                    (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1414
0
                    (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1415
0
                ) {
1416
                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1417
0
                    *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1418
0
                    pSrc += 3;
1419
0
                    continue;
1420
0
                }
1421
0
            } else {
1422
0
                if( /* handle U+0000..U+07FF inline */
1423
0
                    ch >= 0xc0 &&
1424
0
                    ((pSrcLimit - pSrc) >= 2) &&
1425
0
                    (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1426
0
                ) {
1427
0
                    *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1428
0
                    pSrc += 2;
1429
0
                    continue;
1430
0
                }
1431
0
            }
1432
1433
0
            if(subchar < 0) {
1434
0
                *pErrorCode = U_INVALID_CHAR_FOUND;
1435
0
                return NULL;
1436
0
            } else {
1437
                /* function call for error cases */
1438
0
                ++pSrc; /* continue after the lead byte */
1439
0
                utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1440
0
                ++numSubstitutions;
1441
0
                if(subchar<=0xFFFF) {
1442
0
                    *(pDest++)=(UChar)subchar;
1443
0
                } else {
1444
0
                    *(pDest++)=U16_LEAD(subchar);
1445
0
                    if(pDest<pDestLimit) {
1446
0
                        *(pDest++)=U16_TRAIL(subchar);
1447
0
                    } else {
1448
0
                        reqLength++;
1449
0
                        break;
1450
0
                    }
1451
0
                }
1452
0
            }
1453
0
        }
1454
0
    }
1455
1456
    /* do not fill the dest buffer just count the UChars needed */
1457
0
    while(pSrc < pSrcLimit){
1458
0
        ch = *pSrc;
1459
0
        if(ch <= 0x7f) {
1460
0
            reqLength++;
1461
0
            ++pSrc;
1462
0
        } else {
1463
0
            if(ch >= 0xe0) {
1464
0
                if( /* handle U+0000..U+FFFF inline */
1465
0
                    ch <= 0xef &&
1466
0
                    ((pSrcLimit - pSrc) >= 3) &&
1467
0
                    (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
1468
0
                    (uint8_t)(pSrc[2] - 0x80) <= 0x3f
1469
0
                ) {
1470
0
                    reqLength++;
1471
0
                    pSrc += 3;
1472
0
                    continue;
1473
0
                }
1474
0
            } else {
1475
0
                if( /* handle U+0000..U+07FF inline */
1476
0
                    ch >= 0xc0 &&
1477
0
                    ((pSrcLimit - pSrc) >= 2) &&
1478
0
                    (uint8_t)(pSrc[1] - 0x80) <= 0x3f
1479
0
                ) {
1480
0
                    reqLength++;
1481
0
                    pSrc += 2;
1482
0
                    continue;
1483
0
                }
1484
0
            }
1485
1486
0
            if(subchar < 0) {
1487
0
                *pErrorCode = U_INVALID_CHAR_FOUND;
1488
0
                return NULL;
1489
0
            } else {
1490
                /* function call for error cases */
1491
0
                ++pSrc; /* continue after the lead byte */
1492
0
                utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1493
0
                ++numSubstitutions;
1494
0
                reqLength+=U16_LENGTH(ch);
1495
0
            }
1496
0
        }
1497
0
    }
1498
1499
0
    if(pNumSubstitutions!=NULL) {
1500
0
        *pNumSubstitutions=numSubstitutions;
1501
0
    }
1502
1503
0
    reqLength+=(int32_t)(pDest - dest);
1504
0
    if(pDestLength) {
1505
0
        *pDestLength = reqLength;
1506
0
    }
1507
1508
    /* Terminate the buffer */
1509
0
    u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1510
0
    return dest;
1511
0
}
1512
1513
U_CAPI char* U_EXPORT2 
1514
u_strToJavaModifiedUTF8(
1515
        char *dest,
1516
        int32_t destCapacity,
1517
        int32_t *pDestLength,
1518
        const UChar *src, 
1519
        int32_t srcLength,
1520
0
        UErrorCode *pErrorCode) {
1521
0
    int32_t reqLength=0;
1522
0
    uint32_t ch=0;
1523
0
    uint8_t *pDest = (uint8_t *)dest;
1524
0
    uint8_t *pDestLimit = pDest + destCapacity;
1525
0
    const UChar *pSrcLimit;
1526
0
    int32_t count;
1527
1528
    /* args check */
1529
0
    if(U_FAILURE(*pErrorCode)){
1530
0
        return NULL;
1531
0
    }
1532
0
    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1533
0
        (dest==NULL && destCapacity!=0) || destCapacity<0
1534
0
    ) {
1535
0
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1536
0
        return NULL;
1537
0
    }
1538
1539
0
    if(srcLength==-1) {
1540
        /* Convert NUL-terminated ASCII, then find the string length. */
1541
0
        while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1542
0
            *pDest++ = (uint8_t)ch;
1543
0
            ++src;
1544
0
        }
1545
0
        if(ch == 0) {
1546
0
            reqLength=(int32_t)(pDest - (uint8_t *)dest);
1547
0
            if(pDestLength) {
1548
0
                *pDestLength = reqLength;
1549
0
            }
1550
1551
            /* Terminate the buffer */
1552
0
            u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1553
0
            return dest;
1554
0
        }
1555
0
        srcLength = u_strlen(src);
1556
0
    }
1557
1558
    /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1559
0
    pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
1560
0
    for(;;) {
1561
0
        count = (int32_t)(pDestLimit - pDest);
1562
0
        srcLength = (int32_t)(pSrcLimit - src);
1563
0
        if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1564
            /* fast ASCII loop */
1565
0
            const UChar *prevSrc = src;
1566
0
            int32_t delta;
1567
0
            while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1568
0
                *pDest++=(uint8_t)ch;
1569
0
                ++src;
1570
0
            }
1571
0
            delta = (int32_t)(src - prevSrc);
1572
0
            count -= delta;
1573
0
            srcLength -= delta;
1574
0
        }
1575
        /*
1576
         * Each iteration of the inner loop progresses by at most 3 UTF-8
1577
         * bytes and one UChar.
1578
         */
1579
0
        count /= 3;
1580
0
        if(count > srcLength) {
1581
0
            count = srcLength; /* min(remaining dest/3, remaining src) */
1582
0
        }
1583
0
        if(count < 3) {
1584
            /*
1585
             * Too much overhead if we get near the end of the string,
1586
             * continue with the next loop.
1587
             */
1588
0
            break;
1589
0
        }
1590
0
        do {
1591
0
            ch=*src++;
1592
0
            if(ch <= 0x7f && ch != 0) {
1593
0
                *pDest++ = (uint8_t)ch;
1594
0
            } else if(ch <= 0x7ff) {
1595
0
                *pDest++=(uint8_t)((ch>>6)|0xc0);
1596
0
                *pDest++=(uint8_t)((ch&0x3f)|0x80);
1597
0
            } else {
1598
0
                *pDest++=(uint8_t)((ch>>12)|0xe0);
1599
0
                *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1600
0
                *pDest++=(uint8_t)((ch&0x3f)|0x80);
1601
0
            }
1602
0
        } while(--count > 0);
1603
0
    }
1604
1605
0
    while(src<pSrcLimit) {
1606
0
        ch=*src++;
1607
0
        if(ch <= 0x7f && ch != 0) {
1608
0
            if(pDest<pDestLimit) {
1609
0
                *pDest++ = (uint8_t)ch;
1610
0
            } else {
1611
0
                reqLength = 1;
1612
0
                break;
1613
0
            }
1614
0
        } else if(ch <= 0x7ff) {
1615
0
            if((pDestLimit - pDest) >= 2) {
1616
0
                *pDest++=(uint8_t)((ch>>6)|0xc0);
1617
0
                *pDest++=(uint8_t)((ch&0x3f)|0x80);
1618
0
            } else {
1619
0
                reqLength = 2;
1620
0
                break;
1621
0
            }
1622
0
        } else {
1623
0
            if((pDestLimit - pDest) >= 3) {
1624
0
                *pDest++=(uint8_t)((ch>>12)|0xe0);
1625
0
                *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1626
0
                *pDest++=(uint8_t)((ch&0x3f)|0x80);
1627
0
            } else {
1628
0
                reqLength = 3;
1629
0
                break;
1630
0
            }
1631
0
        }
1632
0
    }
1633
0
    while(src<pSrcLimit) {
1634
0
        ch=*src++;
1635
0
        if(ch <= 0x7f && ch != 0) {
1636
0
            ++reqLength;
1637
0
        } else if(ch<=0x7ff) {
1638
0
            reqLength+=2;
1639
0
        } else {
1640
0
            reqLength+=3;
1641
0
        }
1642
0
    }
1643
1644
0
    reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1645
0
    if(pDestLength){
1646
0
        *pDestLength = reqLength;
1647
0
    }
1648
1649
    /* Terminate the buffer */
1650
0
    u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1651
0
    return dest;
1652
0
}