Coverage Report

Created: 2026-03-12 06:42

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/source/common/ucnv_u8.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*  
4
**********************************************************************
5
*   Copyright (C) 2002-2016, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
**********************************************************************
8
*   file name:  ucnv_u8.c
9
*   encoding:   UTF-8
10
*   tab size:   8 (not used)
11
*   indentation:4
12
*
13
*   created on: 2002jul01
14
*   created by: Markus W. Scherer
15
*
16
*   UTF-8 converter implementation. Used to be in ucnv_utf.c.
17
*
18
*   Also, CESU-8 implementation, see UTR 26.
19
*   The CESU-8 converter uses all the same functions as the
20
*   UTF-8 converter, with a branch for converting supplementary code points.
21
*/
22
23
#include "unicode/utypes.h"
24
25
#if !UCONFIG_NO_CONVERSION
26
27
#include "unicode/ucnv.h"
28
#include "unicode/utf.h"
29
#include "unicode/utf8.h"
30
#include "unicode/utf16.h"
31
#include "uassert.h"
32
#include "ucnv_bld.h"
33
#include "ucnv_cnv.h"
34
#include "cmemory.h"
35
#include "ustr_imp.h"
36
37
/* Prototypes --------------------------------------------------------------- */
38
39
/* Keep these here to make finicky compilers happy */
40
41
U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
42
                                           UErrorCode *err);
43
U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
44
                                                        UErrorCode *err);
45
46
47
/* UTF-8 -------------------------------------------------------------------- */
48
49
179M
#define MAXIMUM_UCS2            0x0000FFFF
50
51
static const uint32_t offsetsFromUTF8[5] = {0,
52
  (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
53
  (uint32_t) 0x03C82080
54
};
55
56
static UBool hasCESU8Data(const UConverter *cnv)
57
5.56M
{
58
#if UCONFIG_ONLY_HTML_CONVERSION
59
    return FALSE;
60
#else
61
5.56M
    return (UBool)(cnv->sharedData == &_CESU8Data);
62
5.56M
#endif
63
5.56M
}
64
U_CDECL_BEGIN
65
static void  U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
66
                                  UErrorCode * err)
67
3.68M
{
68
3.68M
    UConverter *cnv = args->converter;
69
3.68M
    const unsigned char *mySource = (unsigned char *) args->source;
70
3.68M
    UChar *myTarget = args->target;
71
3.68M
    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
72
3.68M
    const UChar *targetLimit = args->targetLimit;
73
3.68M
    unsigned char *toUBytes = cnv->toUBytes;
74
3.68M
    UBool isCESU8 = hasCESU8Data(cnv);
75
3.68M
    uint32_t ch, ch2 = 0;
76
3.68M
    int32_t i, inBytes;
77
78
    /* Restore size of current sequence */
79
3.68M
    if (cnv->toUnicodeStatus && myTarget < targetLimit)
80
4.00k
    {
81
4.00k
        inBytes = cnv->mode;            /* restore # of bytes to consume */
82
4.00k
        i = cnv->toULength;             /* restore # of bytes consumed */
83
4.00k
        cnv->toULength = 0;
84
85
4.00k
        ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
86
4.00k
        cnv->toUnicodeStatus = 0;
87
4.00k
        goto morebytes;
88
4.00k
    }
89
90
91
36.6M
    while (mySource < sourceLimit && myTarget < targetLimit)
92
35.7M
    {
93
35.7M
        ch = *(mySource++);
94
35.7M
        if (U8_IS_SINGLE(ch))        /* Simple case */
95
31.7M
        {
96
31.7M
            *(myTarget++) = (UChar) ch;
97
31.7M
        }
98
4.02M
        else
99
4.02M
        {
100
            /* store the first char */
101
4.02M
            toUBytes[0] = (char)ch;
102
4.02M
            inBytes = U8_COUNT_BYTES_NON_ASCII(ch); /* lookup current sequence length */
103
4.02M
            i = 1;
104
105
4.02M
morebytes:
106
7.76M
            while (i < inBytes)
107
4.00M
            {
108
4.00M
                if (mySource < sourceLimit)
109
3.99M
                {
110
3.99M
                    toUBytes[i] = (char) (ch2 = *mySource);
111
3.99M
                    if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) &&
112
257k
                            !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
113
257k
                    {
114
257k
                        break; /* i < inBytes */
115
257k
                    }
116
3.73M
                    ch = (ch << 6) + ch2;
117
3.73M
                    ++mySource;
118
3.73M
                    i++;
119
3.73M
                }
120
6.19k
                else
121
6.19k
                {
122
                    /* stores a partially calculated target*/
123
6.19k
                    cnv->toUnicodeStatus = ch;
124
6.19k
                    cnv->mode = inBytes;
125
6.19k
                    cnv->toULength = (int8_t) i;
126
6.19k
                    goto donefornow;
127
6.19k
                }
128
4.00M
            }
129
130
            // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
131
4.02M
            if (i == inBytes && (!isCESU8 || i <= 3))
132
1.52M
            {
133
                /* Remove the accumulated high bits */
134
1.52M
                ch -= offsetsFromUTF8[inBytes];
135
136
                /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
137
1.52M
                if (ch <= MAXIMUM_UCS2) 
138
558k
                {
139
                    /* fits in 16 bits */
140
558k
                    *(myTarget++) = (UChar) ch;
141
558k
                }
142
969k
                else
143
969k
                {
144
                    /* write out the surrogates */
145
969k
                    *(myTarget++) = U16_LEAD(ch);
146
969k
                    ch = U16_TRAIL(ch);
147
969k
                    if (myTarget < targetLimit)
148
706k
                    {
149
706k
                        *(myTarget++) = (UChar)ch;
150
706k
                    }
151
263k
                    else
152
263k
                    {
153
                        /* Put in overflow buffer (not handled here) */
154
263k
                        cnv->UCharErrorBuffer[0] = (UChar) ch;
155
263k
                        cnv->UCharErrorBufferLength = 1;
156
263k
                        *err = U_BUFFER_OVERFLOW_ERROR;
157
263k
                        break;
158
263k
                    }
159
969k
                }
160
1.52M
            }
161
2.49M
            else
162
2.49M
            {
163
2.49M
                cnv->toULength = (int8_t)i;
164
2.49M
                *err = U_ILLEGAL_CHAR_FOUND;
165
2.49M
                break;
166
2.49M
            }
167
4.02M
        }
168
35.7M
    }
169
170
3.68M
donefornow:
171
3.68M
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
172
908k
    {
173
        /* End of target buffer */
174
908k
        *err = U_BUFFER_OVERFLOW_ERROR;
175
908k
    }
176
177
3.68M
    args->target = myTarget;
178
3.68M
    args->source = (const char *) mySource;
179
3.68M
}
180
181
static void  U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
182
                                                UErrorCode * err)
183
0
{
184
0
    UConverter *cnv = args->converter;
185
0
    const unsigned char *mySource = (unsigned char *) args->source;
186
0
    UChar *myTarget = args->target;
187
0
    int32_t *myOffsets = args->offsets;
188
0
    int32_t offsetNum = 0;
189
0
    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
190
0
    const UChar *targetLimit = args->targetLimit;
191
0
    unsigned char *toUBytes = cnv->toUBytes;
192
0
    UBool isCESU8 = hasCESU8Data(cnv);
193
0
    uint32_t ch, ch2 = 0;
194
0
    int32_t i, inBytes;
195
196
    /* Restore size of current sequence */
197
0
    if (cnv->toUnicodeStatus && myTarget < targetLimit)
198
0
    {
199
0
        inBytes = cnv->mode;            /* restore # of bytes to consume */
200
0
        i = cnv->toULength;             /* restore # of bytes consumed */
201
0
        cnv->toULength = 0;
202
203
0
        ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
204
0
        cnv->toUnicodeStatus = 0;
205
0
        goto morebytes;
206
0
    }
207
208
0
    while (mySource < sourceLimit && myTarget < targetLimit)
209
0
    {
210
0
        ch = *(mySource++);
211
0
        if (U8_IS_SINGLE(ch))        /* Simple case */
212
0
        {
213
0
            *(myTarget++) = (UChar) ch;
214
0
            *(myOffsets++) = offsetNum++;
215
0
        }
216
0
        else
217
0
        {
218
0
            toUBytes[0] = (char)ch;
219
0
            inBytes = U8_COUNT_BYTES_NON_ASCII(ch);
220
0
            i = 1;
221
222
0
morebytes:
223
0
            while (i < inBytes)
224
0
            {
225
0
                if (mySource < sourceLimit)
226
0
                {
227
0
                    toUBytes[i] = (char) (ch2 = *mySource);
228
0
                    if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) &&
229
0
                            !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
230
0
                    {
231
0
                        break; /* i < inBytes */
232
0
                    }
233
0
                    ch = (ch << 6) + ch2;
234
0
                    ++mySource;
235
0
                    i++;
236
0
                }
237
0
                else
238
0
                {
239
0
                    cnv->toUnicodeStatus = ch;
240
0
                    cnv->mode = inBytes;
241
0
                    cnv->toULength = (int8_t)i;
242
0
                    goto donefornow;
243
0
                }
244
0
            }
245
246
            // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
247
0
            if (i == inBytes && (!isCESU8 || i <= 3))
248
0
            {
249
                /* Remove the accumulated high bits */
250
0
                ch -= offsetsFromUTF8[inBytes];
251
252
                /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
253
0
                if (ch <= MAXIMUM_UCS2) 
254
0
                {
255
                    /* fits in 16 bits */
256
0
                    *(myTarget++) = (UChar) ch;
257
0
                    *(myOffsets++) = offsetNum;
258
0
                }
259
0
                else
260
0
                {
261
                    /* write out the surrogates */
262
0
                    *(myTarget++) = U16_LEAD(ch);
263
0
                    *(myOffsets++) = offsetNum;
264
0
                    ch = U16_TRAIL(ch);
265
0
                    if (myTarget < targetLimit)
266
0
                    {
267
0
                        *(myTarget++) = (UChar)ch;
268
0
                        *(myOffsets++) = offsetNum;
269
0
                    }
270
0
                    else
271
0
                    {
272
0
                        cnv->UCharErrorBuffer[0] = (UChar) ch;
273
0
                        cnv->UCharErrorBufferLength = 1;
274
0
                        *err = U_BUFFER_OVERFLOW_ERROR;
275
0
                    }
276
0
                }
277
0
                offsetNum += i;
278
0
            }
279
0
            else
280
0
            {
281
0
                cnv->toULength = (int8_t)i;
282
0
                *err = U_ILLEGAL_CHAR_FOUND;
283
0
                break;
284
0
            }
285
0
        }
286
0
    }
287
288
0
donefornow:
289
0
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
290
0
    {   /* End of target buffer */
291
0
        *err = U_BUFFER_OVERFLOW_ERROR;
292
0
    }
293
294
0
    args->target = myTarget;
295
0
    args->source = (const char *) mySource;
296
0
    args->offsets = myOffsets;
297
0
}
298
U_CDECL_END
299
300
U_CFUNC void  U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
301
                                    UErrorCode * err)
302
1.87M
{
303
1.87M
    UConverter *cnv = args->converter;
304
1.87M
    const UChar *mySource = args->source;
305
1.87M
    const UChar *sourceLimit = args->sourceLimit;
306
1.87M
    uint8_t *myTarget = (uint8_t *) args->target;
307
1.87M
    const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
308
1.87M
    uint8_t *tempPtr;
309
1.87M
    UChar32 ch;
310
1.87M
    uint8_t tempBuf[4];
311
1.87M
    int32_t indexToWrite;
312
1.87M
    UBool isNotCESU8 = !hasCESU8Data(cnv);
313
314
1.87M
    if (cnv->fromUChar32 && myTarget < targetLimit)
315
266k
    {
316
266k
        ch = cnv->fromUChar32;
317
266k
        cnv->fromUChar32 = 0;
318
266k
        goto lowsurrogate;
319
266k
    }
320
321
375M
    while (mySource < sourceLimit && myTarget < targetLimit)
322
373M
    {
323
373M
        ch = *(mySource++);
324
325
373M
        if (ch < 0x80)        /* Single byte */
326
181M
        {
327
181M
            *(myTarget++) = (uint8_t) ch;
328
181M
        }
329
191M
        else if (ch < 0x800)  /* Double byte */
330
13.5M
        {
331
13.5M
            *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
332
13.5M
            if (myTarget < targetLimit)
333
13.4M
            {
334
13.4M
                *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
335
13.4M
            }
336
2.46k
            else
337
2.46k
            {
338
2.46k
                cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
339
2.46k
                cnv->charErrorBufferLength = 1;
340
2.46k
                *err = U_BUFFER_OVERFLOW_ERROR;
341
2.46k
            }
342
13.5M
        }
343
178M
        else {
344
            /* Check for surrogates */
345
178M
            if(U16_IS_SURROGATE(ch) && isNotCESU8) {
346
2.70M
lowsurrogate:
347
2.70M
                if (mySource < sourceLimit) {
348
                    /* test both code units */
349
2.43M
                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
350
                        /* convert and consume this supplementary code point */
351
2.43M
                        ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
352
2.43M
                        ++mySource;
353
                        /* exit this condition tree */
354
2.43M
                    }
355
0
                    else {
356
                        /* this is an unpaired trail or lead code unit */
357
                        /* callback(illegal) */
358
0
                        cnv->fromUChar32 = ch;
359
0
                        *err = U_ILLEGAL_CHAR_FOUND;
360
0
                        break;
361
0
                    }
362
2.43M
                }
363
266k
                else {
364
                    /* no more input */
365
266k
                    cnv->fromUChar32 = ch;
366
266k
                    break;
367
266k
                }
368
2.70M
            }
369
370
            /* Do we write the buffer directly for speed,
371
            or do we have to be careful about target buffer space? */
372
178M
            tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
373
374
178M
            if (ch <= MAXIMUM_UCS2) {
375
175M
                indexToWrite = 2;
376
175M
                tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
377
175M
            }
378
2.43M
            else {
379
2.43M
                indexToWrite = 3;
380
2.43M
                tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
381
2.43M
                tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
382
2.43M
            }
383
178M
            tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
384
178M
            tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
385
386
178M
            if (tempPtr == myTarget) {
387
                /* There was enough space to write the codepoint directly. */
388
178M
                myTarget += (indexToWrite + 1);
389
178M
            }
390
45.1k
            else {
391
                /* We might run out of room soon. Write it slowly. */
392
181k
                for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
393
135k
                    if (myTarget < targetLimit) {
394
95.9k
                        *(myTarget++) = *tempPtr;
395
95.9k
                    }
396
39.9k
                    else {
397
39.9k
                        cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
398
39.9k
                        *err = U_BUFFER_OVERFLOW_ERROR;
399
39.9k
                    }
400
135k
                }
401
45.1k
            }
402
178M
        }
403
373M
    }
404
405
1.87M
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
406
17.8k
    {
407
17.8k
        *err = U_BUFFER_OVERFLOW_ERROR;
408
17.8k
    }
409
410
1.87M
    args->target = (char *) myTarget;
411
1.87M
    args->source = mySource;
412
1.87M
}
413
414
U_CFUNC void  U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
415
                                                  UErrorCode * err)
416
0
{
417
0
    UConverter *cnv = args->converter;
418
0
    const UChar *mySource = args->source;
419
0
    int32_t *myOffsets = args->offsets;
420
0
    const UChar *sourceLimit = args->sourceLimit;
421
0
    uint8_t *myTarget = (uint8_t *) args->target;
422
0
    const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
423
0
    uint8_t *tempPtr;
424
0
    UChar32 ch;
425
0
    int32_t offsetNum, nextSourceIndex;
426
0
    int32_t indexToWrite;
427
0
    uint8_t tempBuf[4];
428
0
    UBool isNotCESU8 = !hasCESU8Data(cnv);
429
430
0
    if (cnv->fromUChar32 && myTarget < targetLimit)
431
0
    {
432
0
        ch = cnv->fromUChar32;
433
0
        cnv->fromUChar32 = 0;
434
0
        offsetNum = -1;
435
0
        nextSourceIndex = 0;
436
0
        goto lowsurrogate;
437
0
    } else {
438
0
        offsetNum = 0;
439
0
    }
440
441
0
    while (mySource < sourceLimit && myTarget < targetLimit)
442
0
    {
443
0
        ch = *(mySource++);
444
445
0
        if (ch < 0x80)        /* Single byte */
446
0
        {
447
0
            *(myOffsets++) = offsetNum++;
448
0
            *(myTarget++) = (char) ch;
449
0
        }
450
0
        else if (ch < 0x800)  /* Double byte */
451
0
        {
452
0
            *(myOffsets++) = offsetNum;
453
0
            *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
454
0
            if (myTarget < targetLimit)
455
0
            {
456
0
                *(myOffsets++) = offsetNum++;
457
0
                *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
458
0
            }
459
0
            else
460
0
            {
461
0
                cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
462
0
                cnv->charErrorBufferLength = 1;
463
0
                *err = U_BUFFER_OVERFLOW_ERROR;
464
0
            }
465
0
        }
466
0
        else
467
        /* Check for surrogates */
468
0
        {
469
0
            nextSourceIndex = offsetNum + 1;
470
471
0
            if(U16_IS_SURROGATE(ch) && isNotCESU8) {
472
0
lowsurrogate:
473
0
                if (mySource < sourceLimit) {
474
                    /* test both code units */
475
0
                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
476
                        /* convert and consume this supplementary code point */
477
0
                        ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
478
0
                        ++mySource;
479
0
                        ++nextSourceIndex;
480
                        /* exit this condition tree */
481
0
                    }
482
0
                    else {
483
                        /* this is an unpaired trail or lead code unit */
484
                        /* callback(illegal) */
485
0
                        cnv->fromUChar32 = ch;
486
0
                        *err = U_ILLEGAL_CHAR_FOUND;
487
0
                        break;
488
0
                    }
489
0
                }
490
0
                else {
491
                    /* no more input */
492
0
                    cnv->fromUChar32 = ch;
493
0
                    break;
494
0
                }
495
0
            }
496
497
            /* Do we write the buffer directly for speed,
498
            or do we have to be careful about target buffer space? */
499
0
            tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
500
501
0
            if (ch <= MAXIMUM_UCS2) {
502
0
                indexToWrite = 2;
503
0
                tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
504
0
            }
505
0
            else {
506
0
                indexToWrite = 3;
507
0
                tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
508
0
                tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
509
0
            }
510
0
            tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
511
0
            tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
512
513
0
            if (tempPtr == myTarget) {
514
                /* There was enough space to write the codepoint directly. */
515
0
                myTarget += (indexToWrite + 1);
516
0
                myOffsets[0] = offsetNum;
517
0
                myOffsets[1] = offsetNum;
518
0
                myOffsets[2] = offsetNum;
519
0
                if (indexToWrite >= 3) {
520
0
                    myOffsets[3] = offsetNum;
521
0
                }
522
0
                myOffsets += (indexToWrite + 1);
523
0
            }
524
0
            else {
525
                /* We might run out of room soon. Write it slowly. */
526
0
                for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
527
0
                    if (myTarget < targetLimit)
528
0
                    {
529
0
                        *(myOffsets++) = offsetNum;
530
0
                        *(myTarget++) = *tempPtr;
531
0
                    }
532
0
                    else
533
0
                    {
534
0
                        cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
535
0
                        *err = U_BUFFER_OVERFLOW_ERROR;
536
0
                    }
537
0
                }
538
0
            }
539
0
            offsetNum = nextSourceIndex;
540
0
        }
541
0
    }
542
543
0
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
544
0
    {
545
0
        *err = U_BUFFER_OVERFLOW_ERROR;
546
0
    }
547
548
0
    args->target = (char *) myTarget;
549
0
    args->source = mySource;
550
0
    args->offsets = myOffsets;
551
0
}
552
553
U_CDECL_BEGIN
554
static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
555
0
                                               UErrorCode *err) {
556
0
    UConverter *cnv;
557
0
    const uint8_t *sourceInitial;
558
0
    const uint8_t *source;
559
0
    uint8_t myByte;
560
0
    UChar32 ch;
561
0
    int8_t i;
562
563
    /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
564
565
0
    cnv = args->converter;
566
0
    sourceInitial = source = (const uint8_t *)args->source;
567
0
    if (source >= (const uint8_t *)args->sourceLimit)
568
0
    {
569
        /* no input */
570
0
        *err = U_INDEX_OUTOFBOUNDS_ERROR;
571
0
        return 0xffff;
572
0
    }
573
574
0
    myByte = (uint8_t)*(source++);
575
0
    if (U8_IS_SINGLE(myByte))
576
0
    {
577
0
        args->source = (const char *)source;
578
0
        return (UChar32)myByte;
579
0
    }
580
581
0
    uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte);
582
0
    if (countTrailBytes == 0) {
583
0
        cnv->toUBytes[0] = myByte;
584
0
        cnv->toULength = 1;
585
0
        *err = U_ILLEGAL_CHAR_FOUND;
586
0
        args->source = (const char *)source;
587
0
        return 0xffff;
588
0
    }
589
590
    /*The byte sequence is longer than the buffer area passed*/
591
0
    if (((const char *)source + countTrailBytes) > args->sourceLimit)
592
0
    {
593
        /* check if all of the remaining bytes are trail bytes */
594
0
        uint16_t extraBytesToWrite = countTrailBytes + 1;
595
0
        cnv->toUBytes[0] = myByte;
596
0
        i = 1;
597
0
        *err = U_TRUNCATED_CHAR_FOUND;
598
0
        while(source < (const uint8_t *)args->sourceLimit) {
599
0
            uint8_t b = *source;
600
0
            if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) {
601
0
                cnv->toUBytes[i++] = b;
602
0
                ++source;
603
0
            } else {
604
                /* error even before we run out of input */
605
0
                *err = U_ILLEGAL_CHAR_FOUND;
606
0
                break;
607
0
            }
608
0
        }
609
0
        cnv->toULength = i;
610
0
        args->source = (const char *)source;
611
0
        return 0xffff;
612
0
    }
613
614
0
    ch = myByte << 6;
615
0
    if(countTrailBytes == 2) {
616
0
        uint8_t t1 = *source, t2;
617
0
        if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) {
618
0
            args->source = (const char *)(source + 1);
619
0
            return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3];
620
0
        }
621
0
    } else if(countTrailBytes == 1) {
622
0
        uint8_t t1 = *source;
623
0
        if(U8_IS_TRAIL(t1)) {
624
0
            args->source = (const char *)(source + 1);
625
0
            return (ch + t1) - offsetsFromUTF8[2];
626
0
        }
627
0
    } else {  // countTrailBytes == 3
628
0
        uint8_t t1 = *source, t2, t3;
629
0
        if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) &&
630
0
                U8_IS_TRAIL(t3 = *++source)) {
631
0
            args->source = (const char *)(source + 1);
632
0
            return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4];
633
0
        }
634
0
    }
635
0
    args->source = (const char *)source;
636
637
0
    for(i = 0; sourceInitial < source; ++i) {
638
0
        cnv->toUBytes[i] = *sourceInitial++;
639
0
    }
640
0
    cnv->toULength = i;
641
0
    *err = U_ILLEGAL_CHAR_FOUND;
642
0
    return 0xffff;
643
0
} 
644
U_CDECL_END
645
646
/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
647
648
U_CDECL_BEGIN
649
/* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
650
static void U_CALLCONV
651
ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
652
                  UConverterToUnicodeArgs *pToUArgs,
653
1.20M
                  UErrorCode *pErrorCode) {
654
1.20M
    UConverter *utf8;
655
1.20M
    const uint8_t *source, *sourceLimit;
656
1.20M
    uint8_t *target;
657
1.20M
    int32_t targetCapacity;
658
1.20M
    int32_t count;
659
660
1.20M
    int8_t oldToULength, toULength, toULimit;
661
662
1.20M
    UChar32 c;
663
1.20M
    uint8_t b, t1, t2;
664
665
    /* set up the local pointers */
666
1.20M
    utf8=pToUArgs->converter;
667
1.20M
    source=(uint8_t *)pToUArgs->source;
668
1.20M
    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
669
1.20M
    target=(uint8_t *)pFromUArgs->target;
670
1.20M
    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
671
672
    /* get the converter state from the UTF-8 UConverter */
673
1.20M
    c=(UChar32)utf8->toUnicodeStatus;
674
1.20M
    if(c!=0) {
675
0
        toULength=oldToULength=utf8->toULength;
676
0
        toULimit=(int8_t)utf8->mode;
677
1.20M
    } else {
678
1.20M
        toULength=oldToULength=toULimit=0;
679
1.20M
    }
680
681
1.20M
    count=(int32_t)(sourceLimit-source)+oldToULength;
682
1.20M
    if(count<toULimit) {
683
        /*
684
         * Not enough input to complete the partial character.
685
         * Jump to moreBytes below - it will not output to target.
686
         */
687
1.20M
    } else if(targetCapacity<toULimit) {
688
        /*
689
         * Not enough target capacity to output the partial character.
690
         * Let the standard converter handle this.
691
         */
692
0
        *pErrorCode=U_USING_DEFAULT_WARNING;
693
0
        return;
694
1.20M
    } else {
695
        // Use a single counter for source and target, counting the minimum of
696
        // the source length and the target capacity.
697
        // Let the standard converter handle edge cases.
698
1.20M
        const uint8_t *limit=sourceLimit;
699
1.20M
        if(count>targetCapacity) {
700
561k
            limit-=(count-targetCapacity);
701
561k
            count=targetCapacity;
702
561k
        }
703
704
        // The conversion loop checks count>0 only once per 1/2/3-byte character.
705
        // If the buffer ends with a truncated 2- or 3-byte sequence,
706
        // then we reduce the count to stop before that,
707
        // and collect the remaining bytes after the conversion loop.
708
1.20M
        {
709
            // Do not go back into the bytes that will be read for finishing a partial
710
            // sequence from the previous buffer.
711
1.20M
            int32_t length=count-toULimit;
712
1.20M
            if(length>0) {
713
1.20M
                uint8_t b1=*(limit-1);
714
1.20M
                if(U8_IS_SINGLE(b1)) {
715
                    // common ASCII character
716
1.08M
                } else if(U8_IS_TRAIL(b1) && length>=2) {
717
84.8k
                    uint8_t b2=*(limit-2);
718
84.8k
                    if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
719
                        // truncated 3-byte sequence
720
7.42k
                        count-=2;
721
7.42k
                    }
722
84.8k
                } else if(0xc2<=b1 && b1<0xf0) {
723
                    // truncated 2- or 3-byte sequence
724
20.4k
                    --count;
725
20.4k
                }
726
1.20M
            }
727
1.20M
        }
728
1.20M
    }
729
730
1.20M
    if(c!=0) {
731
0
        utf8->toUnicodeStatus=0;
732
0
        utf8->toULength=0;
733
0
        goto moreBytes;
734
        /* See note in ucnv_SBCSFromUTF8() about this goto. */
735
0
    }
736
737
    /* conversion loop */
738
74.5M
    while(count>0) {
739
74.5M
        b=*source++;
740
74.5M
        if(U8_IS_SINGLE(b)) {
741
            /* convert ASCII */
742
68.3M
            *target++=b;
743
68.3M
            --count;
744
68.3M
            continue;
745
68.3M
        } else {
746
6.22M
            if(b>=0xe0) {
747
3.87M
                if( /* handle U+0800..U+FFFF inline */
748
3.87M
                    b<0xf0 &&
749
2.21M
                    U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
750
2.10M
                    U8_IS_TRAIL(t2=source[1])
751
3.87M
                ) {
752
2.10M
                    source+=2;
753
2.10M
                    *target++=b;
754
2.10M
                    *target++=t1;
755
2.10M
                    *target++=t2;
756
2.10M
                    count-=3;
757
2.10M
                    continue;
758
2.10M
                }
759
3.87M
            } else {
760
2.34M
                if( /* handle U+0080..U+07FF inline */
761
2.34M
                    b>=0xc2 &&
762
1.50M
                    U8_IS_TRAIL(t1=*source)
763
2.34M
                ) {
764
1.43M
                    ++source;
765
1.43M
                    *target++=b;
766
1.43M
                    *target++=t1;
767
1.43M
                    count-=2;
768
1.43M
                    continue;
769
1.43M
                }
770
2.34M
            }
771
772
            /* handle "complicated" and error cases, and continuing partial characters */
773
2.69M
            oldToULength=0;
774
2.69M
            toULength=1;
775
2.69M
            toULimit=U8_COUNT_BYTES_NON_ASCII(b);
776
2.69M
            if (toULimit > (const uint8_t *)pFromUArgs->targetLimit - target)
777
1.95k
            {
778
1.95k
                *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
779
1.95k
                break;
780
1.95k
            }
781
2.68M
            c=b;
782
2.68M
moreBytes:
783
7.51M
            while(toULength<toULimit) {
784
5.16M
                if(source<sourceLimit) {
785
5.16M
                    b=*source;
786
5.16M
                    if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
787
4.82M
                        ++source;
788
4.82M
                        ++toULength;
789
4.82M
                        c=(c<<6)+b;
790
4.82M
                    } else {
791
340k
                        break; /* sequence too short, stop with toULength<toULimit */
792
340k
                    }
793
5.16M
                } else {
794
                    /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
795
930
                    source-=(toULength-oldToULength);
796
2.75k
                    while(oldToULength<toULength) {
797
1.82k
                        utf8->toUBytes[oldToULength++]=*source++;
798
1.82k
                    }
799
930
                    utf8->toUnicodeStatus=c;
800
930
                    utf8->toULength=toULength;
801
930
                    utf8->mode=toULimit;
802
930
                    pToUArgs->source=(char *)source;
803
930
                    pFromUArgs->target=(char *)target;
804
930
                    return;
805
930
                }
806
5.16M
            }
807
808
2.68M
            if(toULength!=toULimit) {
809
                /* error handling: illegal UTF-8 byte sequence */
810
1.18M
                source-=(toULength-oldToULength);
811
2.69M
                while(oldToULength<toULength) {
812
1.50M
                    utf8->toUBytes[oldToULength++]=*source++;
813
1.50M
                }
814
1.18M
                utf8->toULength=toULength;
815
1.18M
                pToUArgs->source=(char *)source;
816
1.18M
                pFromUArgs->target=(char *)target;
817
1.18M
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
818
1.18M
                return;
819
1.18M
            }
820
821
            /* copy the legal byte sequence to the target */
822
1.50M
            if(count>=toULength) {
823
1.50M
                int8_t i;
824
825
1.50M
                for(i=0; i<oldToULength; ++i) {
826
0
                    *target++=utf8->toUBytes[i];
827
0
                }
828
1.50M
                source-=(toULength-oldToULength);
829
7.51M
                for(; i<toULength; ++i) {
830
6.01M
                    *target++=*source++;
831
6.01M
                }
832
1.50M
                count-=toULength;
833
1.50M
            } else {
834
                // A supplementary character that does not fit into the target.
835
                // Let the standard converter handle this.
836
0
                source-=(toULength-oldToULength);
837
0
                pToUArgs->source=(char *)source;
838
0
                pFromUArgs->target=(char *)target;
839
0
                *pErrorCode=U_USING_DEFAULT_WARNING;
840
0
                return;
841
0
            }
842
1.50M
        }
843
74.5M
    }
844
22.3k
    U_ASSERT(count>=0);
845
846
22.3k
    if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
847
2.73k
        if(target==(const uint8_t *)pFromUArgs->targetLimit) {
848
1.14k
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
849
1.58k
        } else {
850
1.58k
            b=*source;
851
1.58k
            toULimit=U8_COUNT_BYTES(b);
852
1.58k
            if(toULimit>(sourceLimit-source)) {
853
                /* collect a truncated byte sequence */
854
1.37k
                toULength=0;
855
1.37k
                c=b;
856
1.68k
                for(;;) {
857
1.68k
                    utf8->toUBytes[toULength++]=b;
858
1.68k
                    if(++source==sourceLimit) {
859
                        /* partial byte sequence at end of source */
860
1.15k
                        utf8->toUnicodeStatus=c;
861
1.15k
                        utf8->toULength=toULength;
862
1.15k
                        utf8->mode=toULimit;
863
1.15k
                        break;
864
1.15k
                    } else if(!U8_IS_TRAIL(b=*source)) {
865
                        /* lead byte in trail byte position */
866
216
                        utf8->toULength=toULength;
867
216
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
868
216
                        break;
869
216
                    }
870
314
                    c=(c<<6)+b;
871
314
                }
872
1.37k
            } else {
873
                /* partial-sequence target overflow: fall back to the pivoting implementation */
874
214
                *pErrorCode=U_USING_DEFAULT_WARNING;
875
214
            }
876
1.58k
        }
877
2.73k
    }
878
879
    /* write back the updated pointers */
880
22.3k
    pToUArgs->source=(char *)source;
881
22.3k
    pFromUArgs->target=(char *)target;
882
22.3k
}
883
884
U_CDECL_END
885
886
/* UTF-8 converter data ----------------------------------------------------- */
887
888
static const UConverterImpl _UTF8Impl={
889
    UCNV_UTF8,
890
891
    NULL,
892
    NULL,
893
894
    NULL,
895
    NULL,
896
    NULL,
897
898
    ucnv_toUnicode_UTF8,
899
    ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
900
    ucnv_fromUnicode_UTF8,
901
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
902
    ucnv_getNextUChar_UTF8,
903
904
    NULL,
905
    NULL,
906
    NULL,
907
    NULL,
908
    ucnv_getNonSurrogateUnicodeSet,
909
910
    ucnv_UTF8FromUTF8,
911
    ucnv_UTF8FromUTF8
912
};
913
914
/* The 1208 CCSID refers to any version of Unicode of UTF-8 */
915
static const UConverterStaticData _UTF8StaticData={
916
    sizeof(UConverterStaticData),
917
    "UTF-8",
918
    1208, UCNV_IBM, UCNV_UTF8,
919
    1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
920
    { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
921
    0,
922
    0,
923
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
924
};
925
926
927
const UConverterSharedData _UTF8Data=
928
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
929
930
/* CESU-8 converter data ---------------------------------------------------- */
931
932
static const UConverterImpl _CESU8Impl={
933
    UCNV_CESU8,
934
935
    NULL,
936
    NULL,
937
938
    NULL,
939
    NULL,
940
    NULL,
941
942
    ucnv_toUnicode_UTF8,
943
    ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
944
    ucnv_fromUnicode_UTF8,
945
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
946
    NULL,
947
948
    NULL,
949
    NULL,
950
    NULL,
951
    NULL,
952
    ucnv_getCompleteUnicodeSet,
953
954
    NULL,
955
    NULL
956
};
957
958
static const UConverterStaticData _CESU8StaticData={
959
    sizeof(UConverterStaticData),
960
    "CESU-8",
961
    9400, /* CCSID for CESU-8 */
962
    UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
963
    { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
964
    0,
965
    0,
966
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
967
};
968
969
970
const UConverterSharedData _CESU8Data=
971
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
972
973
#endif