Coverage Report

Created: 2025-06-24 06:54

/src/icu/icu4c/source/common/ustrtrns.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
******************************************************************************
5
*
6
*   Copyright (C) 2001-2016, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
******************************************************************************
10
*
11
* File ustrtrns.cpp
12
*
13
* Modification History:
14
*
15
*   Date        Name        Description
16
*   9/10/2001    Ram    Creation.
17
******************************************************************************
18
*/
19
20
/*******************************************************************************
21
 *
22
 * u_strTo* and u_strFrom* APIs
23
 * WCS functions moved to ustr_wcs.c for better modularization
24
 *
25
 *******************************************************************************
26
 */
27
28
29
#include "unicode/putil.h"
30
#include "unicode/ustring.h"
31
#include "unicode/utf.h"
32
#include "unicode/utf8.h"
33
#include "unicode/utf16.h"
34
#include "cstring.h"
35
#include "cmemory.h"
36
#include "ustr_imp.h"
37
#include "uassert.h"
38
39
/**
 * Converts a UTF-32 string to UTF-16, optionally replacing invalid code points.
 *
 * @param dest              Output buffer; may be nullptr only when destCapacity is 0.
 * @param destCapacity      Capacity of dest in char16_t units; must not be negative.
 * @param pDestLength       If not nullptr, receives the number of units the complete
 *                          output needs (units written plus units that did not fit).
 * @param src               UTF-32 input; may be nullptr only when srcLength is 0.
 * @param srcLength         Number of code points in src, or -1 if src is NUL-terminated.
 * @param subchar           Code point stored in place of each invalid input value
 *                          (a surrogate, a negative value, or a value above 0x10ffff).
 *                          If subchar is negative, the first invalid value instead
 *                          fails the call with U_INVALID_CHAR_FOUND.
 * @param pNumSubstitutions If not nullptr, set to 0 once the arguments are accepted
 *                          and to the substitution count on the normal return path.
 * @param pErrorCode        In/out status; nothing is done if it already indicates failure.
 * @return dest on the normal path; nullptr if the status was already a failure,
 *         an argument was rejected, or an unsubstitutable value was found.
 */
U_CAPI char16_t* U_EXPORT2
u_strFromUTF32WithSub(char16_t *dest,
               int32_t destCapacity,
               int32_t *pDestLength,
               const UChar32 *src,
               int32_t srcLength,
               UChar32 subchar, int32_t *pNumSubstitutions,
               UErrorCode *pErrorCode) {
    /* An earlier failure short-circuits the whole call. */
    if(U_FAILURE(*pErrorCode)) {
        return nullptr;
    }

    /* Reject inconsistent buffers and unusable substitution characters. */
    const bool badSource = (src == nullptr && srcLength != 0) || srcLength < -1;
    const bool badTarget = destCapacity < 0 || (dest == nullptr && destCapacity > 0);
    const bool badSubchar = subchar > 0x10ffff || U_IS_SURROGATE(subchar);
    if(badSource || badTarget || badSubchar) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }

    if(pNumSubstitutions != nullptr) {
        *pNumSubstitutions = 0;
    }

    /* True for values that map to exactly one non-surrogate UTF-16 unit. */
    const auto fitsOneUnit = [](UChar32 c) -> bool {
        return (uint32_t)c < 0xd800 || (0xe000 <= c && c <= 0xffff);
    };

    char16_t *out = dest;
    char16_t *const outEnd = (dest != nullptr) ? (dest + destCapacity) : nullptr;
    int32_t needed = 0;         /* counts units that did not fit; written units are added later */
    int32_t substitutions = 0;
    const UChar32 *srcEnd;
    UChar32 cp;

    if(srcLength >= 0) {
        srcEnd = (src != nullptr) ? (src + srcLength) : nullptr;
    } else {
        /* NUL-terminated input: convert the leading run of one-unit code points directly. */
        for(;;) {
            cp = *src;
            if(cp == 0 || !fitsOneUnit(cp)) {
                break;
            }
            ++src;
            if(out < outEnd) {
                *out++ = (char16_t)cp;
            } else {
                ++needed;
            }
        }
        srcEnd = src;
        if(cp != 0) {
            /* Stopped at a value needing the general loop: locate the terminator first. */
            do {
                ++srcEnd;
            } while(*srcEnd != 0);
        }
    }

    /* General loop over the (remaining) input with a known end. */
    while(src < srcEnd) {
        cp = *src++;

        const bool supplementary = (0x10000 <= cp && cp <= 0x10ffff);
        if(!fitsOneUnit(cp) && !supplementary) {
            /* Surrogate code point, or not a Unicode code point at all. */
            if(subchar < 0) {
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return nullptr;
            }
            /* subchar passed the argument check, so it is encodable below. */
            cp = subchar;
            ++substitutions;
        }

        if(fitsOneUnit(cp)) {
            if(out < outEnd) {
                *out++ = (char16_t)cp;
            } else {
                ++needed;
            }
        } else {
            /* Supplementary code point: needs room for a surrogate pair. */
            if(out != nullptr && (out + 2) <= outEnd) {
                *out++ = U16_LEAD(cp);
                *out++ = U16_TRAIL(cp);
            } else {
                needed += 2;
            }
        }
    }

    needed += (int32_t)(out - dest);
    if(pDestLength) {
        *pDestLength = needed;
    }
    if(pNumSubstitutions != nullptr) {
        *pNumSubstitutions = substitutions;
    }

    /* Buffer termination and the resulting status are delegated to u_terminateUChars(). */
    u_terminateUChars(dest, destCapacity, needed, pErrorCode);

    return dest;
}
138
139
/**
 * Converts a UTF-32 string to UTF-16 without a substitution character.
 *
 * Forwards to u_strFromUTF32WithSub() with U_SENTINEL as subchar and no
 * substitution counter. Assuming U_SENTINEL is negative (as declared in ICU's
 * public headers — not visible here), invalid input makes the call fail
 * instead of being replaced.
 *
 * Parameters and return value are those of u_strFromUTF32WithSub().
 */
U_CAPI char16_t* U_EXPORT2
u_strFromUTF32(char16_t *dest,
               int32_t destCapacity,
               int32_t *pDestLength,
               const UChar32 *src,
               int32_t srcLength,
               UErrorCode *pErrorCode) {
    const UChar32 noSubchar = U_SENTINEL;
    int32_t *const noSubstitutionCount = nullptr;

    return u_strFromUTF32WithSub(dest, destCapacity, pDestLength,
                                 src, srcLength,
                                 noSubchar, noSubstitutionCount,
                                 pErrorCode);
}
152
153
/**
 * Converts a UTF-16 string to UTF-32, optionally replacing unpaired surrogates.
 *
 * @param dest              Output buffer; may be nullptr only when destCapacity is 0.
 * @param destCapacity      Capacity of dest in UChar32 units; must not be negative.
 * @param pDestLength       If not nullptr, receives the number of code points the
 *                          complete output needs (written plus not fitting).
 * @param src               UTF-16 input; may be nullptr only when srcLength is 0.
 * @param srcLength         Number of char16_t units in src, or -1 if NUL-terminated.
 * @param subchar           Code point stored in place of each unpaired surrogate.
 *                          If negative, the first unpaired surrogate fails the call
 *                          with U_INVALID_CHAR_FOUND.
 * @param pNumSubstitutions If not nullptr, set to 0 once the arguments are accepted
 *                          and to the substitution count on the normal return path.
 * @param pErrorCode        In/out status; nothing is done if it already indicates failure.
 * @return dest on the normal path; nullptr if the status was already a failure,
 *         an argument was rejected, or an unsubstitutable surrogate was found.
 */
U_CAPI UChar32* U_EXPORT2
u_strToUTF32WithSub(UChar32 *dest,
             int32_t destCapacity,
             int32_t *pDestLength,
             const char16_t *src,
             int32_t srcLength,
             UChar32 subchar, int32_t *pNumSubstitutions,
             UErrorCode *pErrorCode) {
    /* An earlier failure short-circuits the whole call. */
    if(U_FAILURE(*pErrorCode)) {
        return nullptr;
    }

    /* Reject inconsistent buffers and unusable substitution characters. */
    const bool badSource = (src == nullptr && srcLength != 0) || srcLength < -1;
    const bool badTarget = destCapacity < 0 || (dest == nullptr && destCapacity > 0);
    const bool badSubchar = subchar > 0x10ffff || U_IS_SURROGATE(subchar);
    if(badSource || badTarget || badSubchar) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }

    if(pNumSubstitutions != nullptr) {
        *pNumSubstitutions = 0;
    }

    UChar32 *out = dest;
    UChar32 *const outEnd = (dest != nullptr) ? (dest + destCapacity) : nullptr;
    int32_t needed = 0;         /* counts code points that did not fit; written ones are added later */
    int32_t substitutions = 0;
    const char16_t *srcEnd;
    UChar32 cp;

    if(srcLength >= 0) {
        srcEnd = (src != nullptr) ? (src + srcLength) : nullptr;
    } else {
        /* NUL-terminated input: copy the leading run of non-surrogate units directly. */
        for(;;) {
            cp = *src;
            if(cp == 0 || U16_IS_SURROGATE(cp)) {
                break;
            }
            ++src;
            if(out < outEnd) {
                *out++ = cp;
            } else {
                ++needed;
            }
        }
        srcEnd = src;
        if(cp != 0) {
            /* Stopped at a surrogate: locate the terminator for the general loop. */
            do {
                ++srcEnd;
            } while(*srcEnd != 0);
        }
    }

    /* General loop over the (remaining) input with a known end. */
    while(src < srcEnd) {
        cp = *src++;

        if(U16_IS_SURROGATE(cp)) {
            if(U16_IS_SURROGATE_LEAD(cp) && src < srcEnd && U16_IS_TRAIL(*src)) {
                /* Well-formed pair: combine it and consume the trail unit. */
                cp = U16_GET_SUPPLEMENTARY(cp, *src);
                ++src;
            } else if(subchar < 0) {
                /* Unpaired surrogate and no substitution character available. */
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return nullptr;
            } else {
                cp = subchar;
                ++substitutions;
            }
        }

        /* Store the code point, or just count it once the buffer is full. */
        if(out < outEnd) {
            *out++ = cp;
        } else {
            ++needed;
        }
    }

    needed += (int32_t)(out - dest);
    if(pDestLength) {
        *pDestLength = needed;
    }
    if(pNumSubstitutions != nullptr) {
        *pNumSubstitutions = substitutions;
    }

    /* Buffer termination and the resulting status are delegated to u_terminateUChar32s(). */
    u_terminateUChar32s(dest, destCapacity, needed, pErrorCode);

    return dest;
}
244
245
/**
 * Converts a UTF-16 string to UTF-32 without a substitution character.
 *
 * Forwards to u_strToUTF32WithSub() with U_SENTINEL as subchar and no
 * substitution counter. Assuming U_SENTINEL is negative (as declared in ICU's
 * public headers — not visible here), an unpaired surrogate makes the call
 * fail instead of being replaced.
 *
 * Parameters and return value are those of u_strToUTF32WithSub().
 */
U_CAPI UChar32* U_EXPORT2
u_strToUTF32(UChar32 *dest,
             int32_t destCapacity,
             int32_t *pDestLength,
             const char16_t *src,
             int32_t srcLength,
             UErrorCode *pErrorCode) {
    const UChar32 noSubchar = U_SENTINEL;
    int32_t *const noSubstitutionCount = nullptr;

    return u_strToUTF32WithSub(dest, destCapacity, pDestLength,
                               src, srcLength,
                               noSubchar, noSubstitutionCount,
                               pErrorCode);
}
258
259
/**
 * Converts UTF-8 text to UTF-16, optionally substituting ill-formed sequences.
 *
 * @param dest              Output buffer; may be nullptr only when destCapacity is 0.
 * @param destCapacity      Capacity of dest in char16_t units; must not be negative.
 * @param pDestLength       If not nullptr, receives the number of units the complete
 *                          output needs (units written plus units that did not fit).
 * @param src               UTF-8 input; may be nullptr only when srcLength is 0.
 * @param srcLength         Input length in bytes, or -1 if src is NUL-terminated.
 * @param subchar           Code point used wherever utf8_nextCharSafeBody() returns
 *                          a negative value. If subchar is negative as well, the
 *                          call fails with U_INVALID_CHAR_FOUND.
 * @param pNumSubstitutions If not nullptr, set to 0 once the arguments are accepted
 *                          and to the substitution count on the normal return path.
 * @param pErrorCode        In/out status; nothing is done if it already indicates failure.
 * @return dest on the normal path; nullptr if the status was already a failure,
 *         an argument was rejected, or an unsubstitutable sequence was found.
 */
U_CAPI char16_t* U_EXPORT2
u_strFromUTF8WithSub(char16_t *dest,
              int32_t destCapacity,
              int32_t *pDestLength,
              const char* src,
              int32_t srcLength,
              UChar32 subchar, int32_t *pNumSubstitutions,
              UErrorCode *pErrorCode){
    /* An earlier failure short-circuits the whole call. */
    if(U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    /* Reject inconsistent buffers and unusable substitution characters. */
    if((src == nullptr && srcLength != 0) || srcLength < -1 ||
            destCapacity < 0 || (dest == nullptr && destCapacity > 0) ||
            subchar > 0x10ffff || U_IS_SURROGATE(subchar)) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }

    if(pNumSubstitutions != nullptr) {
        *pNumSubstitutions = 0;
    }
    char16_t *pOut = dest;
    char16_t *pOutLimit = dest + destCapacity;
    int32_t needed = 0;         /* counts units that did not fit; written units are added later */
    int32_t subCount = 0;

    /*
     * The conversion loops decode the most common byte sequences inline.
     * To keep those paths short the tests are ordered like a binary search:
     * single bytes first, then the 3-byte range (lead bytes 0xe0..0xef,
     * tested before the 2-byte range to favor CJK text), then the 2-byte
     * range (lead bytes 0xc2..0xdf). Everything else, including all error
     * cases, goes through utf8_nextCharSafeBody().
     */

    if(srcLength < 0) {
        /*
         * NUL-terminated input.
         * Only the lead byte position is tested for NUL explicitly; a NUL in
         * a trail byte position fails the trail byte range test anyway.
         */
        int32_t i = 0;
        UChar32 c;

        /* Phase 1: convert while there is room in the output buffer. */
        while((c = (uint8_t)src[i]) != 0 && pOut < pOutLimit) {
            // adapted from U8_NEXT()
            ++i;
            if(U8_IS_SINGLE(c)) {
                *pOut++ = (char16_t)c;
            } else {
                uint8_t t1, t2;
                if( /* U+0800..U+FFFF decoded inline */
                        0xe0 <= c && c < 0xf0 &&
                        U8_IS_VALID_LEAD3_AND_T1(c, src[i]) &&
                        (t2 = src[i + 1] - 0x80) <= 0x3f) {
                    *pOut++ = ((c & 0xf) << 12) | ((src[i] & 0x3f) << 6) | t2;
                    i += 2;
                } else if( /* U+0080..U+07FF decoded inline */
                        c < 0xe0 && c >= 0xc2 &&
                        (t1 = src[i] - 0x80) <= 0x3f) {
                    *pOut++ = ((c & 0x1f) << 6) | t1;
                    ++i;
                } else {
                    /* supplementary code points and all error cases */
                    c = utf8_nextCharSafeBody((const uint8_t *)src, &i, -1, c, -1);
                    if(c < 0) {
                        ++subCount;
                        c = subchar;
                        if(c < 0) {
                            *pErrorCode = U_INVALID_CHAR_FOUND;
                            return nullptr;
                        }
                    }
                    if(c <= 0xFFFF) {
                        *pOut++ = (char16_t)c;
                    } else {
                        *pOut++ = U16_LEAD(c);
                        if(pOut < pOutLimit) {
                            *pOut++ = U16_TRAIL(c);
                        } else {
                            /* Only the lead surrogate fit: count the trail and stop writing. */
                            needed++;
                            break;
                        }
                    }
                }
            }
        }

        /* Phase 2: only measure whatever input is left. */
        while((c = (uint8_t)src[i]) != 0) {
            // adapted from U8_NEXT()
            ++i;
            if(U8_IS_SINGLE(c)) {
                ++needed;
            } else {
                uint8_t t1, t2;
                if( /* U+0800..U+FFFF measured inline */
                        0xe0 <= c && c < 0xf0 &&
                        U8_IS_VALID_LEAD3_AND_T1(c, src[i]) &&
                        (t2 = src[i + 1] - 0x80) <= 0x3f) {
                    ++needed;
                    i += 2;
                } else if( /* U+0080..U+07FF measured inline */
                        c < 0xe0 && c >= 0xc2 &&
                        (t1 = src[i] - 0x80) <= 0x3f) {
                    ++needed;
                    ++i;
                } else {
                    /* supplementary code points and all error cases */
                    c = utf8_nextCharSafeBody((const uint8_t *)src, &i, -1, c, -1);
                    if(c < 0) {
                        ++subCount;
                        c = subchar;
                        if(c < 0) {
                            *pErrorCode = U_INVALID_CHAR_FOUND;
                            return nullptr;
                        }
                    }
                    needed += U16_LENGTH(c);
                }
            }
        }
    } else /* srcLength >= 0 */ {
        int32_t i = 0;
        UChar32 c;

        /* Phase 1: fast loop that does not re-check srcLength and pOutLimit per character. */
        for(;;) {
            /*
             * For most characters one inner iteration consumes at most 3
             * UTF-8 bytes and produces one char16_t, so
             * min(remaining output units, remaining input bytes / 3)
             * iterations are safe. The rare 4-byte/2-unit case adjusts the
             * budget inside the loop.
             */
            int32_t budget = (int32_t)(pOutLimit - pOut);
            int32_t srcBudget = (srcLength - i) / 3;
            if(budget > srcBudget) {
                budget = srcBudget;
            }
            if(budget < 3) {
                /* Close to the end: the overhead is not worth it, use the checked loop below. */
                break;
            }

            do {
                // adapted from U8_NEXT()
                c = (uint8_t)src[i++];
                if(U8_IS_SINGLE(c)) {
                    *pOut++ = (char16_t)c;
                } else {
                    uint8_t t1, t2;
                    if( /* U+0800..U+FFFF decoded inline */
                            0xe0 <= c && c < 0xf0 &&
                            (i + 1) < srcLength &&
                            U8_IS_VALID_LEAD3_AND_T1(c, src[i]) &&
                            (t2 = src[i + 1] - 0x80) <= 0x3f) {
                        *pOut++ = ((c & 0xf) << 12) | ((src[i] & 0x3f) << 6) | t2;
                        i += 2;
                    } else if( /* U+0080..U+07FF decoded inline */
                            c < 0xe0 && c >= 0xc2 &&
                            i != srcLength &&
                            (t1 = src[i] - 0x80) <= 0x3f) {
                        *pOut++ = ((c & 0x1f) << 6) | t1;
                        ++i;
                    } else {
                        if(c >= 0xf0 || subchar > 0xffff) {
                            // Up to four bytes may be read and up to two units
                            // written here, which the budget did not account
                            // for, so spend one extra iteration.
                            if(--budget == 0) {
                                --i;  // un-read byte c; it is handled after re-budgeting
                                break;
                            }
                        }

                        /* supplementary code points and all error cases */
                        c = utf8_nextCharSafeBody((const uint8_t *)src, &i, srcLength, c, -1);
                        if(c < 0) {
                            ++subCount;
                            c = subchar;
                            if(c < 0) {
                                *pErrorCode = U_INVALID_CHAR_FOUND;
                                return nullptr;
                            }
                        }
                        if(c <= 0xFFFF) {
                            *pOut++ = (char16_t)c;
                        } else {
                            *pOut++ = U16_LEAD(c);
                            *pOut++ = U16_TRAIL(c);
                        }
                    }
                }
            } while(--budget > 0);
        }

        /* Phase 2: fully checked conversion while input and output space remain. */
        while(i < srcLength && pOut < pOutLimit) {
            // adapted from U8_NEXT()
            c = (uint8_t)src[i++];
            if(U8_IS_SINGLE(c)) {
                *pOut++ = (char16_t)c;
            } else {
                uint8_t t1, t2;
                if( /* U+0800..U+FFFF decoded inline */
                        0xe0 <= c && c < 0xf0 &&
                        (i + 1) < srcLength &&
                        U8_IS_VALID_LEAD3_AND_T1(c, src[i]) &&
                        (t2 = src[i + 1] - 0x80) <= 0x3f) {
                    *pOut++ = ((c & 0xf) << 12) | ((src[i] & 0x3f) << 6) | t2;
                    i += 2;
                } else if( /* U+0080..U+07FF decoded inline */
                        c < 0xe0 && c >= 0xc2 &&
                        i != srcLength &&
                        (t1 = src[i] - 0x80) <= 0x3f) {
                    *pOut++ = ((c & 0x1f) << 6) | t1;
                    ++i;
                } else {
                    /* supplementary code points and all error cases */
                    c = utf8_nextCharSafeBody((const uint8_t *)src, &i, srcLength, c, -1);
                    if(c < 0) {
                        ++subCount;
                        c = subchar;
                        if(c < 0) {
                            *pErrorCode = U_INVALID_CHAR_FOUND;
                            return nullptr;
                        }
                    }
                    if(c <= 0xFFFF) {
                        *pOut++ = (char16_t)c;
                    } else {
                        *pOut++ = U16_LEAD(c);
                        if(pOut < pOutLimit) {
                            *pOut++ = U16_TRAIL(c);
                        } else {
                            /* Only the lead surrogate fit: count the trail and stop writing. */
                            needed++;
                            break;
                        }
                    }
                }
            }
        }

        /* Phase 3: only measure whatever input is left. */
        while(i < srcLength) {
            // adapted from U8_NEXT()
            c = (uint8_t)src[i++];
            if(U8_IS_SINGLE(c)) {
                ++needed;
            } else {
                uint8_t t1, t2;
                if( /* U+0800..U+FFFF measured inline */
                        0xe0 <= c && c < 0xf0 &&
                        (i + 1) < srcLength &&
                        U8_IS_VALID_LEAD3_AND_T1(c, src[i]) &&
                        (t2 = src[i + 1] - 0x80) <= 0x3f) {
                    ++needed;
                    i += 2;
                } else if( /* U+0080..U+07FF measured inline */
                        c < 0xe0 && c >= 0xc2 &&
                        i != srcLength &&
                        (t1 = src[i] - 0x80) <= 0x3f) {
                    ++needed;
                    ++i;
                } else {
                    /* supplementary code points and all error cases */
                    c = utf8_nextCharSafeBody((const uint8_t *)src, &i, srcLength, c, -1);
                    if(c < 0) {
                        ++subCount;
                        c = subchar;
                        if(c < 0) {
                            *pErrorCode = U_INVALID_CHAR_FOUND;
                            return nullptr;
                        }
                    }
                    needed += U16_LENGTH(c);
                }
            }
        }
    }

    needed += (int32_t)(pOut - dest);

    if(pNumSubstitutions != nullptr) {
        *pNumSubstitutions = subCount;
    }

    if(pDestLength) {
        *pDestLength = needed;
    }

    /* Buffer termination and the resulting status are delegated to u_terminateUChars(). */
    u_terminateUChars(dest, destCapacity, needed, pErrorCode);

    return dest;
}
538
539
/**
 * Converts UTF-8 text to UTF-16 without a substitution character.
 *
 * Forwards to u_strFromUTF8WithSub() with U_SENTINEL as subchar and no
 * substitution counter. Assuming U_SENTINEL is negative (as declared in ICU's
 * public headers — not visible here), an ill-formed sequence makes the call
 * fail instead of being replaced.
 *
 * Parameters and return value are those of u_strFromUTF8WithSub().
 */
U_CAPI char16_t* U_EXPORT2
u_strFromUTF8(char16_t *dest,
              int32_t destCapacity,
              int32_t *pDestLength,
              const char* src,
              int32_t srcLength,
              UErrorCode *pErrorCode){
    const UChar32 noSubchar = U_SENTINEL;
    int32_t *const noSubstitutionCount = nullptr;

    return u_strFromUTF8WithSub(dest, destCapacity, pDestLength,
                                src, srcLength,
                                noSubchar, noSubstitutionCount,
                                pErrorCode);
}
552
553
/**
 * Converts UTF-8 text to UTF-16 without validating the byte sequences.
 *
 * The lead byte alone decides how many bytes form a character. Trail bytes
 * are not range-checked: the NUL-terminated path only checks that they are
 * not NUL, and the length-given path only checks that enough bytes remain.
 * A byte below 0xc0 in lead position (ASCII or a stray trail byte) is copied
 * as a single unit. A sequence cut off by the end of the input is written as
 * U+FFFD.
 *
 * @param dest         Output buffer; may be nullptr only when destCapacity is 0.
 * @param destCapacity Capacity of dest in char16_t units; must not be negative.
 *                     When srcLength >= 0, it must be at least srcLength.
 * @param pDestLength  If not nullptr, receives the output length. For the
 *                     NUL-terminated path this includes units that did not fit.
 *                     When the capacity requirement above is violated it is set
 *                     to srcLength, which likely overestimates the true length.
 * @param src          UTF-8 input; may be nullptr only when srcLength is 0.
 * @param srcLength    Input length in bytes, or -1 if src is NUL-terminated.
 * @param pErrorCode   In/out status; nothing is done if it already indicates
 *                     failure. Set to U_ILLEGAL_ARGUMENT_ERROR for rejected
 *                     arguments and to U_BUFFER_OVERFLOW_ERROR when
 *                     0 <= destCapacity < srcLength.
 * @return dest on the normal path; nullptr if the status was already a failure,
 *         an argument was rejected, or the capacity requirement was violated.
 */
U_CAPI char16_t * U_EXPORT2
u_strFromUTF8Lenient(char16_t *dest,
                     int32_t destCapacity,
                     int32_t *pDestLength,
                     const char *src,
                     int32_t srcLength,
                     UErrorCode *pErrorCode) {
    char16_t *pDest = dest;
    UChar32 ch;
    int32_t reqLength = 0;  /* counts units that did not fit; written units are added later */
    /* The input is only read, so keep it const instead of casting const away. */
    const uint8_t *pSrc = (const uint8_t *)src;

    /* args check */
    if(U_FAILURE(*pErrorCode)){
        return nullptr;
    }

    if( (src==nullptr && srcLength!=0) || srcLength < -1 ||
        (destCapacity<0) || (dest == nullptr && destCapacity > 0)
    ) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }

    if(srcLength < 0) {
        /* Transform a NUL-terminated string. */
        char16_t *pDestLimit = (dest!=nullptr)?(dest+destCapacity):nullptr;
        uint8_t t1, t2, t3; /* trail bytes */

        while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
            if(ch < 0xc0) {
                /*
                 * ASCII, or a trail byte in lead position which is treated like
                 * a single-byte sequence for better character boundary
                 * resynchronization after illegal sequences.
                 */
                *pDest++=(char16_t)ch;
                ++pSrc;
                continue;
            } else if(ch < 0xe0) { /* U+0080..U+07FF */
                if((t1 = pSrc[1]) != 0) {
                    /* 0x3080 = (0xc0 << 6) + 0x80 */
                    *pDest++ = (char16_t)((ch << 6) + t1 - 0x3080);
                    pSrc += 2;
                    continue;
                }
            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
                if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) */
                    /* 0x2080 = (0x80 << 6) + 0x80 */
                    *pDest++ = (char16_t)((ch << 12) + (t1 << 6) + t2 - 0x2080);
                    pSrc += 3;
                    continue;
                }
            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
                if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
                    pSrc += 4;
                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
                    ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
                    *(pDest++) = U16_LEAD(ch);
                    if(pDest < pDestLimit) {
                        *(pDest++) = U16_TRAIL(ch);
                    } else {
                        /* Only the lead surrogate fit: count the trail and stop writing. */
                        reqLength = 1;
                        break;
                    }
                    continue;
                }
            }

            /* truncated character at the end: emit U+FFFD and skip to the terminator */
            *pDest++ = 0xfffd;
            while(*++pSrc != 0) {}
            break;
        }

        /* Pre-flight the rest of the string. */
        while((ch = *pSrc) != 0) {
            if(ch < 0xc0) {
                /*
                 * ASCII, or a trail byte in lead position which is treated like
                 * a single-byte sequence for better character boundary
                 * resynchronization after illegal sequences.
                 */
                ++reqLength;
                ++pSrc;
                continue;
            } else if(ch < 0xe0) { /* U+0080..U+07FF */
                if(pSrc[1] != 0) {
                    ++reqLength;
                    pSrc += 2;
                    continue;
                }
            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
                if(pSrc[1] != 0 && pSrc[2] != 0) {
                    ++reqLength;
                    pSrc += 3;
                    continue;
                }
            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
                if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
                    reqLength += 2;
                    pSrc += 4;
                    continue;
                }
            }

            /* truncated character at the end: it would be written as one U+FFFD */
            ++reqLength;
            break;
        }
    } else /* srcLength >= 0 */ {
        const uint8_t *pSrcLimit = (pSrc!=nullptr)?(pSrc + srcLength):nullptr;

        /*
         * This function requires that if srcLength is given, then it must be
         * destCapacity >= srcLength so that we need not check for
         * destination buffer overflow in the loop.
         */
        if(destCapacity < srcLength) {
            if(pDestLength != nullptr) {
                *pDestLength = srcLength; /* this likely overestimates the true destLength! */
            }
            *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
            return nullptr;
        }

        if((pSrcLimit - pSrc) >= 4) {
            pSrcLimit -= 3; /* temporarily reduce pSrcLimit */

            /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
            do {
                ch = *pSrc++;
                if(ch < 0xc0) {
                    /*
                     * ASCII, or a trail byte in lead position which is treated like
                     * a single-byte sequence for better character boundary
                     * resynchronization after illegal sequences.
                     */
                    *pDest++=(char16_t)ch;
                } else if(ch < 0xe0) { /* U+0080..U+07FF */
                    /* 0x3080 = (0xc0 << 6) + 0x80 */
                    *pDest++ = (char16_t)((ch << 6) + *pSrc++ - 0x3080);
                } else if(ch < 0xf0) { /* U+0800..U+FFFF */
                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) */
                    /* 0x2080 = (0x80 << 6) + 0x80 */
                    ch = (ch << 12) + (*pSrc++ << 6);
                    *pDest++ = (char16_t)(ch + *pSrc++ - 0x2080);
                } else /* f0..f4 */ { /* U+10000..U+10FFFF */
                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
                    ch = (ch << 18) + (*pSrc++ << 12);
                    ch += *pSrc++ << 6;
                    ch += *pSrc++ - 0x3c82080;
                    *(pDest++) = U16_LEAD(ch);
                    *(pDest++) = U16_TRAIL(ch);
                }
            } while(pSrc < pSrcLimit);

            pSrcLimit += 3; /* restore original pSrcLimit */
        }

        /* Handle the last few bytes with a length check before every multi-byte read. */
        while(pSrc < pSrcLimit) {
            ch = *pSrc++;
            if(ch < 0xc0) {
                /*
                 * ASCII, or a trail byte in lead position which is treated like
                 * a single-byte sequence for better character boundary
                 * resynchronization after illegal sequences.
                 */
                *pDest++=(char16_t)ch;
                continue;
            } else if(ch < 0xe0) { /* U+0080..U+07FF */
                if(pSrc < pSrcLimit) {
                    /* 0x3080 = (0xc0 << 6) + 0x80 */
                    *pDest++ = (char16_t)((ch << 6) + *pSrc++ - 0x3080);
                    continue;
                }
            } else if(ch < 0xf0) { /* U+0800..U+FFFF */
                if((pSrcLimit - pSrc) >= 2) {
                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) */
                    /* 0x2080 = (0x80 << 6) + 0x80 */
                    ch = (ch << 12) + (*pSrc++ << 6);
                    *pDest++ = (char16_t)(ch + *pSrc++ - 0x2080);
                    /*
                     * Both trail bytes were consumed by the *pSrc++ reads above;
                     * pSrc must not be advanced again, or it would skip input
                     * or step past pSrcLimit.
                     */
                    continue;
                }
            } else /* f0..f4 */ { /* U+10000..U+10FFFF */
                if((pSrcLimit - pSrc) >= 3) {
                    /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
                    ch = (ch << 18) + (*pSrc++ << 12);
                    ch += *pSrc++ << 6;
                    ch += *pSrc++ - 0x3c82080;
                    *(pDest++) = U16_LEAD(ch);
                    *(pDest++) = U16_TRAIL(ch);
                    /* All three trail bytes were consumed above; no further advance. */
                    continue;
                }
            }

            /* truncated character at the end */
            *pDest++ = 0xfffd;
            break;
        }
    }

    reqLength+=(int32_t)(pDest - dest);

    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Buffer termination and the resulting status are delegated to u_terminateUChars(). */
    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);

    return dest;
}
769
770
static inline uint8_t *
771
750
_appendUTF8(uint8_t *pDest, UChar32 c) {
772
    /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
773
750
    if((c)<=0x7f) {
774
0
        *pDest++ = static_cast<uint8_t>(c);
775
750
    } else if(c<=0x7ff) {
776
0
        *pDest++ = static_cast<uint8_t>((c >> 6) | 0xc0);
777
0
        *pDest++ = static_cast<uint8_t>((c & 0x3f) | 0x80);
778
750
    } else if(c<=0xffff) {
779
632
        *pDest++ = static_cast<uint8_t>((c >> 12) | 0xe0);
780
632
        *pDest++ = static_cast<uint8_t>(((c >> 6) & 0x3f) | 0x80);
781
632
        *pDest++ = static_cast<uint8_t>(((c) & 0x3f) | 0x80);
782
632
    } else /* if((uint32_t)(c)<=0x10ffff) */ {
783
118
        *pDest++ = static_cast<uint8_t>(((c) >> 18) | 0xf0);
784
118
        *pDest++ = static_cast<uint8_t>((((c) >> 12) & 0x3f) | 0x80);
785
118
        *pDest++ = static_cast<uint8_t>((((c) >> 6) & 0x3f) | 0x80);
786
118
        *pDest++ = static_cast<uint8_t>(((c) & 0x3f) | 0x80);
787
118
    }
788
750
    return pDest;
789
750
}
790
791
   
792
U_CAPI char* U_EXPORT2 
u_strToUTF8WithSub(char *dest,
            int32_t destCapacity,
            int32_t *pDestLength,
            const char16_t *pSrc,
            int32_t srcLength,
            UChar32 subchar, int32_t *pNumSubstitutions,
            UErrorCode *pErrorCode){
    /*
     * UTF-16 to UTF-8 conversion.
     *
     * srcLength==-1 means that pSrc is NUL-terminated.
     * An unpaired surrogate is replaced with subchar if subchar>=0;
     * otherwise the function sets U_INVALID_CHAR_FOUND and returns nullptr.
     * As much as fits is written to dest; the rest of the input is only
     * measured so that *pDestLength (if requested) receives the full number
     * of UTF-8 bytes that the input needs.
     * *pNumSubstitutions (if requested) is zeroed up front and receives the
     * number of substitutions on the way out.
     */

    /* nothing to do if the caller already carries a failure */
    if(U_FAILURE(*pErrorCode)){
        return nullptr;
    }

    /* validate the arguments */
    const bool badSource = (pSrc==nullptr && srcLength!=0) || srcLength < -1;
    const bool badDest = (destCapacity<0) || (dest==nullptr && destCapacity>0);
    const bool badSubchar = subchar > 0x10ffff || U_IS_SURROGATE(subchar);
    if(badSource || badDest || badSubchar) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }

    if(pNumSubstitutions!=nullptr) {
        *pNumSubstitutions=0;
    }

    uint8_t *const destStart = reinterpret_cast<uint8_t *>(dest);
    uint8_t *out = destStart;
    uint8_t *const outLimit = (destStart!=nullptr) ? (destStart + destCapacity) : nullptr;
    int32_t needed = 0;         /* bytes that did not fit; the written bytes are added at the end */
    int32_t substitutions = 0;
    uint32_t c = 0, trail = 0;

    if(srcLength==-1) {
        /* NUL-terminated input: convert while each character still fits */
        while((c=*pSrc)!=0) {
            int32_t need;

            ++pSrc;
            if(c <= 0x7f) {
                need = 1;
            } else if(c <= 0x7ff) {
                need = 2;
            } else if(c < 0xd800 || c > 0xdfff) {
                need = 3;
            } else /* c is a surrogate */ {
                /* the terminating NUL fails U16_IS_TRAIL(), no extra end check needed */
                if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(trail=*pSrc)) {
                    ++pSrc;
                    c=U16_GET_SUPPLEMENTARY(c, trail);
                } else if(subchar<0) {
                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return nullptr;
                } else {
                    c=subchar;
                    ++substitutions;
                }
                need = U8_LENGTH(c);
            }

            if((outLimit - out) < need) {
                /* remember the character that was consumed but not written */
                needed = need;
                break;
            }
            out=_appendUTF8(out, c);
        }

        /* measure whatever is left (just the NUL if everything fit) */
        for(;;) {
            c=*pSrc++;
            if(c==0) {
                break;
            }
            if(c<=0x7f) {
                ++needed;
            } else if(c<=0x7ff) {
                needed+=2;
            } else if(!U16_IS_SURROGATE(c)) {
                needed+=3;
            } else if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(trail=*pSrc)) {
                ++pSrc;
                needed+=4;
            } else if(subchar<0) {
                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return nullptr;
            } else {
                needed+=U8_LENGTH(subchar);
                ++substitutions;
            }
        }
    } else {
        const char16_t *const srcEnd = (pSrc!=nullptr) ? (pSrc + srcLength) : nullptr;

        /*
         * Bulk phase: no per-character limit checks.
         * A budget of min(remaining dest/3, remaining src) iterations is safe
         * because one iteration normally consumes one char16_t and writes at
         * most 3 bytes. Surrogates (two units read, up to four bytes written)
         * pay for a second iteration.
         */
        for(;;) {
            int32_t budget = (int32_t)((outLimit - out) / 3);
            const int32_t remaining = (int32_t)(srcEnd - pSrc);
            if(budget > remaining) {
                budget = remaining;
            }
            if(budget < 3) {
                /* close to the end of input or output: use the careful loop below */
                break;
            }

            do {
                c=*pSrc++;
                if(c <= 0x7f) {
                    *out++ = static_cast<uint8_t>(c);
                } else if(c <= 0x7ff) {
                    out[0] = static_cast<uint8_t>((c>>6)|0xc0);
                    out[1] = static_cast<uint8_t>((c&0x3f)|0x80);
                    out += 2;
                } else if(c < 0xd800 || c > 0xdfff) {
                    out[0] = static_cast<uint8_t>((c>>12)|0xe0);
                    out[1] = static_cast<uint8_t>(((c>>6)&0x3f)|0x80);
                    out[2] = static_cast<uint8_t>((c&0x3f)|0x80);
                    out += 3;
                } else /* c is a surrogate */ {
                    /* charge the extra iteration before touching the next unit */
                    if(--budget == 0) {
                        --pSrc; /* put the surrogate back */
                        break;  /* and let the outer loop recompute the budget */
                    }

                    if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(trail=*pSrc)) {
                        /* 4 bytes for 2 UChars is covered by the adjusted budget */
                        ++pSrc;
                        c=U16_GET_SUPPLEMENTARY(c, trail);
                    } else if(subchar<0) {
                        /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                        *pErrorCode = U_INVALID_CHAR_FOUND;
                        return nullptr;
                    } else {
                        c=subchar;
                        ++substitutions;
                    }
                    out=_appendUTF8(out, c);
                }
            } while(--budget > 0);
        }

        /* careful phase: check the remaining room for every character */
        while(pSrc<srcEnd) {
            int32_t need;

            c=*pSrc++;
            if(c <= 0x7f) {
                need = 1;
            } else if(c <= 0x7ff) {
                need = 2;
            } else if(c < 0xd800 || c > 0xdfff) {
                need = 3;
            } else /* c is a surrogate */ {
                if(U16_IS_SURROGATE_LEAD(c) && pSrc<srcEnd && U16_IS_TRAIL(trail=*pSrc)) {
                    ++pSrc;
                    c=U16_GET_SUPPLEMENTARY(c, trail);
                } else if(subchar<0) {
                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return nullptr;
                } else {
                    c=subchar;
                    ++substitutions;
                }
                need = U8_LENGTH(c);
            }

            if((outLimit - out) < need) {
                /* remember the character that was consumed but not written */
                needed = need;
                break;
            }
            out=_appendUTF8(out, c);
        }

        /* measure whatever did not fit */
        while(pSrc<srcEnd) {
            c=*pSrc++;
            if(c<=0x7f) {
                ++needed;
            } else if(c<=0x7ff) {
                needed+=2;
            } else if(!U16_IS_SURROGATE(c)) {
                needed+=3;
            } else if(U16_IS_SURROGATE_LEAD(c) && pSrc<srcEnd && U16_IS_TRAIL(trail=*pSrc)) {
                ++pSrc;
                needed+=4;
            } else if(subchar<0) {
                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return nullptr;
            } else {
                needed+=U8_LENGTH(subchar);
                ++substitutions;
            }
        }
    }

    /* total = bytes actually written + bytes that were only measured */
    needed+=(int32_t)(out - destStart);

    if(pNumSubstitutions!=nullptr) {
        *pNumSubstitutions=substitutions;
    }

    if(pDestLength){
        *pDestLength = needed;
    }

    /* Terminate the buffer */
    u_terminateChars(dest, destCapacity, needed, pErrorCode);
    return dest;
}
1055
1056
U_CAPI char* U_EXPORT2 
u_strToUTF8(char *dest,
            int32_t destCapacity,
            int32_t *pDestLength,
            const char16_t *pSrc,
            int32_t srcLength,
            UErrorCode *pErrorCode){
    /*
     * Same conversion as u_strToUTF8WithSub(), without a substitution
     * character and without a substitution counter.
     * u_strToUTF8WithSub() only substitutes when subchar>=0, so - assuming
     * U_SENTINEL is negative (TODO confirm against its definition) - an
     * unpaired surrogate in the input makes the call fail with
     * U_INVALID_CHAR_FOUND.
     */
    const UChar32 noSubchar = U_SENTINEL;
    int32_t *const noSubstitutionCount = nullptr;

    return u_strToUTF8WithSub(dest, destCapacity, pDestLength,
                              pSrc, srcLength,
                              noSubchar, noSubstitutionCount,
                              pErrorCode);
}
1069
1070
/*
 * Converts a "Java Modified UTF-8" byte string to UTF-16.
 *
 * What the code below accepts:
 *  - bytes for which U8_IS_SINGLE() is true, copied as one char16_t each;
 *  - two-byte sequences: lead byte 0xc0..0xdf followed by one trail byte
 *    0x80..0xbf (this includes 0xc0 0x80, which decodes to U+0000);
 *  - three-byte sequences: lead byte 0xe0..0xef followed by two trail bytes;
 *    the result is stored as a single char16_t without any surrogate check.
 * Anything else is an illegal sequence: it is skipped with
 * utf8_nextCharSafeBody() and replaced by subchar, or, if subchar<0, the
 * function sets U_INVALID_CHAR_FOUND and returns nullptr.
 *
 * @param dest              Output buffer; may be nullptr only if destCapacity==0.
 * @param destCapacity      Capacity of dest in char16_t units, must be >=0.
 * @param pDestLength       If not nullptr, receives the number of char16_t
 *                          units the whole input needs, whether or not it fit.
 * @param src               Input bytes; may be nullptr only if srcLength==0.
 * @param srcLength         Number of input bytes, or -1 if src is NUL-terminated.
 * @param subchar           Substitution code point (<=0x10ffff, not a
 *                          surrogate), or a negative value for "no substitution".
 * @param pNumSubstitutions If not nullptr, set to 0 on entry (after the
 *                          argument checks) and to the number of
 *                          substitutions on success.
 * @param pErrorCode        In/out error code; nothing is done if it already
 *                          indicates a failure.
 * @return dest, or nullptr on error.
 */
U_CAPI char16_t* U_EXPORT2
u_strFromJavaModifiedUTF8WithSub(
        char16_t *dest,
        int32_t destCapacity,
        int32_t *pDestLength,
        const char *src,
        int32_t srcLength,
        UChar32 subchar, int32_t *pNumSubstitutions,
        UErrorCode *pErrorCode) {
    /* args check */
    if(U_FAILURE(*pErrorCode)) {
        return nullptr;
    }
    if( (src==nullptr && srcLength!=0) || srcLength < -1 ||
        (dest==nullptr && destCapacity!=0) || destCapacity<0 ||
        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    ) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }

    if(pNumSubstitutions!=nullptr) {
        *pNumSubstitutions=0;
    }
    char16_t *pDest = dest;
    char16_t *pDestLimit = dest+destCapacity;
    int32_t reqLength = 0;      /* units that did not fit; written units are added at the end */
    int32_t numSubstitutions=0;

    if(srcLength < 0) {
        /*
         * Transform a NUL-terminated ASCII string.
         * Handle non-ASCII strings with slower code.
         */
        UChar32 c;
        while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) {
            *pDest++=(char16_t)c;
            ++src;
        }
        if(c == 0) {
            /* the whole string was ASCII and fit: done */
            reqLength=(int32_t)(pDest - dest);
            if(pDestLength) {
                *pDestLength = reqLength;
            }

            /* Terminate the buffer */
            u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
            return dest;
        }
        /*
         * src now points at the first byte that was not handled above;
         * from here on the remainder is processed as a counted string.
         */
        srcLength = static_cast<int32_t>(uprv_strlen(src));
    }

    /* Faster loop without ongoing checking for srcLength and pDestLimit. */
    UChar32 ch;
    uint8_t t1, t2;
    int32_t i = 0;              /* index of the next unread byte in src */
    for(;;) {
        int32_t count = (int32_t)(pDestLimit - pDest);  /* remaining output units */
        int32_t count2 = srcLength - i;                 /* remaining input bytes */
        if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) {
            /*
             * fast ASCII loop
             * count>=count2 guarantees room for one unit per remaining byte,
             * so only the source index is checked.
             */
            int32_t start = i;
            uint8_t b;
            while(i < srcLength && U8_IS_SINGLE(b = src[i])) {
                *pDest++=b;
                ++i;
            }
            int32_t delta = i - start;
            count -= delta;
            count2 -= delta;
        }
        /*
         * Each iteration of the inner loop progresses by at most 3 UTF-8
         * bytes and one char16_t.
         */
        if(subchar > 0xFFFF) {
            /*
             * A supplementary subchar needs two units per substitution, which
             * the budget below does not model; use the checked loop instead.
             */
            break;
        }
        count2 /= 3;
        if(count > count2) {
            count = count2; /* min(remaining dest, remaining src/3) */
        }
        if(count < 3) {
            /*
             * Too much overhead if we get near the end of the string,
             * continue with the next loop.
             */
            break;
        }
        do {
            ch = (uint8_t)src[i++];
            if(U8_IS_SINGLE(ch)) {
                *pDest++=(char16_t)ch;
            } else {
                if(ch >= 0xe0) {
                    if( /* handle U+0000..U+FFFF inline */
                        ch <= 0xef &&
                        (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
                        (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
                    ) {
                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) */
                        *pDest++ = (char16_t)((ch << 12) | (t1 << 6) | t2);
                        i += 2;
                        continue;
                    }
                } else {
                    if( /* handle U+0000..U+07FF inline */
                        ch >= 0xc0 &&
                        (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
                    ) {
                        *pDest++ = (char16_t)(((ch & 0x1f) << 6) | t1);
                        ++i;
                        continue;
                    }
                }

                if(subchar < 0) {
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return nullptr;
                } else if(subchar > 0xffff && --count == 0) {
                    /*
                     * We need to write two UChars, adjusted count for that,
                     * and ran out of space.
                     * (Not reachable as long as the subchar > 0xFFFF check
                     * above leaves the outer loop first.)
                     */
                    --i;  // back out byte ch
                    break;
                } else {
                    /* function call for error cases */
                    utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
                    ++numSubstitutions;
                    *(pDest++)=(char16_t)subchar;
                }
            }
        } while(--count > 0);
    }

    /* Checked loop: verify input and output limits for every sequence. */
    while(i < srcLength && (pDest < pDestLimit)) {
        ch = (uint8_t)src[i++];
        if(U8_IS_SINGLE(ch)){
            *pDest++=(char16_t)ch;
        } else {
            if(ch >= 0xe0) {
                if( /* handle U+0000..U+FFFF inline */
                    ch <= 0xef &&
                    (i+1) < srcLength &&
                    (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
                    (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
                ) {
                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) */
                    *pDest++ = (char16_t)((ch << 12) | (t1 << 6) | t2);
                    i += 2;
                    continue;
                }
            } else {
                if( /* handle U+0000..U+07FF inline */
                    ch >= 0xc0 &&
                    i < srcLength &&
                    (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
                ) {
                    *pDest++ = (char16_t)(((ch & 0x1f) << 6) | t1);
                    ++i;
                    continue;
                }
            }

            if(subchar < 0) {
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return nullptr;
            } else {
                /* function call for error cases */
                utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
                ++numSubstitutions;
                if(subchar<=0xFFFF) {
                    *(pDest++)=(char16_t)subchar;
                } else {
                    *(pDest++)=U16_LEAD(subchar);
                    if(pDest<pDestLimit) {
                        *(pDest++)=U16_TRAIL(subchar);
                    } else {
                        /* only the lead surrogate fit: count the missing trail */
                        reqLength++;
                        break;
                    }
                }
            }
        }
    }

    /* Pre-flight the rest of the string. */
    while(i < srcLength) {
        ch = (uint8_t)src[i++];
        if(U8_IS_SINGLE(ch)) {
            reqLength++;
        } else {
            if(ch >= 0xe0) {
                if( /* handle U+0000..U+FFFF inline */
                    ch <= 0xef &&
                    (i+1) < srcLength &&
                    (uint8_t)(src[i] - 0x80) <= 0x3f &&
                    (uint8_t)(src[i+1] - 0x80) <= 0x3f
                ) {
                    reqLength++;
                    i += 2;
                    continue;
                }
            } else {
                if( /* handle U+0000..U+07FF inline */
                    ch >= 0xc0 &&
                    i < srcLength &&
                    (uint8_t)(src[i] - 0x80) <= 0x3f
                ) {
                    reqLength++;
                    ++i;
                    continue;
                }
            }

            if(subchar < 0) {
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return nullptr;
            } else {
                /* function call for error cases */
                utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
                ++numSubstitutions;
                /*
                 * Count what the converting loop above writes for a
                 * substitution: one unit for a BMP subchar, two for a
                 * supplementary one. ch is not reassigned by the call above
                 * and still holds the offending lead byte, so it must not be
                 * used to size the substitution.
                 */
                reqLength+=U16_LENGTH(subchar);
            }
        }
    }

    if(pNumSubstitutions!=nullptr) {
        *pNumSubstitutions=numSubstitutions;
    }

    /* total = units actually written + units that were only counted */
    reqLength+=(int32_t)(pDest - dest);
    if(pDestLength) {
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
    return dest;
}
1311
1312
U_CAPI char* U_EXPORT2 
u_strToJavaModifiedUTF8(
        char *dest,
        int32_t destCapacity,
        int32_t *pDestLength,
        const char16_t *src,
        int32_t srcLength,
        UErrorCode *pErrorCode) {
    /*
     * UTF-16 to "Java Modified UTF-8".
     *
     * Per char16_t, as implemented below:
     *   U+0001..U+007F  -> 1 byte
     *   U+0000 and U+0080..U+07FF -> 2 bytes (U+0000 becomes 0xc0 0x80)
     *   everything else -> 3 bytes; surrogates are not paired up, each one
     *                      is encoded on its own
     * srcLength==-1 means that src is NUL-terminated.
     * As much as fits is written to dest; *pDestLength (if requested)
     * receives the number of bytes that the whole input needs.
     */

    /* nothing to do if the caller already carries a failure */
    if(U_FAILURE(*pErrorCode)){
        return nullptr;
    }

    /* validate the arguments */
    const bool badSource = (src==nullptr && srcLength!=0) || srcLength < -1;
    const bool badDest = (dest==nullptr && destCapacity!=0) || destCapacity<0;
    if(badSource || badDest) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }

    uint8_t *const destStart = reinterpret_cast<uint8_t *>(dest);
    uint8_t *out = destStart;
    uint8_t *const outEnd = destStart + destCapacity;
    int32_t needed = 0;     /* bytes that did not fit; the written bytes are added at the end */
    uint32_t unit = 0;

    if(srcLength==-1) {
        /* copy the leading non-NUL ASCII run while there is room */
        for(;;) {
            unit=*src;
            if(unit > 0x7f || unit == 0 || out >= outEnd) {
                break;
            }
            *out++ = static_cast<uint8_t>(unit);
            ++src;
        }
        if(unit == 0) {
            /* reached the terminator: the whole string was ASCII and fit */
            needed=(int32_t)(out - destStart);
            if(pDestLength) {
                *pDestLength = needed;
            }

            /* Terminate the buffer */
            u_terminateChars(dest, destCapacity, needed, pErrorCode);
            return dest;
        }
        /* otherwise continue with the rest as a counted string */
        srcLength = u_strlen(src);
    }

    const char16_t *const srcEnd = (src!=nullptr) ? (src + srcLength) : nullptr;

    /* Bulk phase: no per-unit checking of the source and destination limits. */
    for(;;) {
        int32_t room = (int32_t)(outEnd - out);
        int32_t remaining = (int32_t)(srcEnd - src);
        if(room >= remaining && remaining > 0 && *src <= 0x7f) {
            /* fast ASCII loop: one byte per unit is known to fit */
            const char16_t *const runStart = src;
            while(src < srcEnd && (unit = *src) <= 0x7f && unit != 0) {
                *out++ = static_cast<uint8_t>(unit);
                ++src;
            }
            const int32_t run = (int32_t)(src - runStart);
            room -= run;
            remaining -= run;
        }

        /*
         * One iteration of the inner loop consumes one char16_t and writes
         * at most 3 bytes, so min(remaining dest/3, remaining src)
         * iterations are safe.
         */
        room /= 3;
        if(room > remaining) {
            room = remaining;
        }
        if(room < 3) {
            /* close to the end of input or output: use the careful loop below */
            break;
        }

        do {
            unit=*src++;
            if(unit > 0x7ff) {
                out[0] = static_cast<uint8_t>((unit>>12)|0xe0);
                out[1] = static_cast<uint8_t>(((unit>>6)&0x3f)|0x80);
                out[2] = static_cast<uint8_t>((unit&0x3f)|0x80);
                out += 3;
            } else if(unit > 0x7f || unit == 0) {
                out[0] = static_cast<uint8_t>((unit>>6)|0xc0);
                out[1] = static_cast<uint8_t>((unit&0x3f)|0x80);
                out += 2;
            } else {
                *out++ = static_cast<uint8_t>(unit);
            }
        } while(--room > 0);
    }

    /* Careful phase: check the remaining room for every unit. */
    while(src<srcEnd) {
        unit=*src++;
        const int32_t need = (unit != 0 && unit <= 0x7f) ? 1 : ((unit <= 0x7ff) ? 2 : 3);
        if((outEnd - out) < need) {
            /* remember the unit that was consumed but not written */
            needed = need;
            break;
        }
        switch(need) {
        case 1:
            *out++ = static_cast<uint8_t>(unit);
            break;
        case 2:
            *out++ = static_cast<uint8_t>((unit>>6)|0xc0);
            *out++ = static_cast<uint8_t>((unit&0x3f)|0x80);
            break;
        default:
            *out++ = static_cast<uint8_t>((unit>>12)|0xe0);
            *out++ = static_cast<uint8_t>(((unit>>6)&0x3f)|0x80);
            *out++ = static_cast<uint8_t>((unit&0x3f)|0x80);
            break;
        }
    }

    /* Measure whatever did not fit. */
    for(; src<srcEnd; ++src) {
        unit=*src;
        needed += (unit != 0 && unit <= 0x7f) ? 1 : ((unit <= 0x7ff) ? 2 : 3);
    }

    /* total = bytes actually written + bytes that were only measured */
    needed+=(int32_t)(out - destStart);
    if(pDestLength){
        *pDestLength = needed;
    }

    /* Terminate the buffer */
    u_terminateChars(dest, destCapacity, needed, pErrorCode);
    return dest;
}