Coverage Report

Created: 2024-04-24 06:23

/src/icu/source/common/ucnv_u8.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*  
4
**********************************************************************
5
*   Copyright (C) 2002-2016, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
**********************************************************************
8
*   file name:  ucnv_u8.c
9
*   encoding:   UTF-8
10
*   tab size:   8 (not used)
11
*   indentation:4
12
*
13
*   created on: 2002jul01
14
*   created by: Markus W. Scherer
15
*
16
*   UTF-8 converter implementation. Used to be in ucnv_utf.c.
17
*
18
*   Also, CESU-8 implementation, see UTR 26.
19
*   The CESU-8 converter uses all the same functions as the
20
*   UTF-8 converter, with a branch for converting supplementary code points.
21
*/
22
23
#include "unicode/utypes.h"
24
25
#if !UCONFIG_NO_CONVERSION
26
27
#include "unicode/ucnv.h"
28
#include "unicode/utf.h"
29
#include "unicode/utf8.h"
30
#include "unicode/utf16.h"
31
#include "uassert.h"
32
#include "ucnv_bld.h"
33
#include "ucnv_cnv.h"
34
#include "cmemory.h"
35
#include "ustr_imp.h"
36
37
/* Prototypes --------------------------------------------------------------- */
38
39
/* Keep these here to make finicky compilers happy */
40
41
U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
42
                                           UErrorCode *err);
43
U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
44
                                                        UErrorCode *err);
45
46
47
/* UTF-8 -------------------------------------------------------------------- */
48
49
0
#define MAXIMUM_UCS2            0x0000FFFF
50
51
/*
 * Bias removed from the raw accumulation of a UTF-8 sequence, indexed by the
 * sequence length in bytes. The decoders add up (value << 6) + nextByte without
 * masking, so the lead-byte and trail-byte marker bits pile up in the result;
 * e.g. for two bytes the bias is (0xC0 << 6) + 0x80 = 0x3080.
 * Index 0 is never a valid length and only pads the table.
 */
static const uint32_t offsetsFromUTF8[5] = {
    0,           /* [0] unused */
    0x00000000,  /* [1] single byte: nothing to remove */
    0x00003080,  /* [2] two-byte sequence */
    0x000E2080,  /* [3] three-byte sequence */
    0x03C82080   /* [4] four-byte sequence */
};
55
56
/*
 * Reports whether cnv is a CESU-8 converter, i.e. whether its shared data is
 * the _CESU8Data object. Builds with UCONFIG_ONLY_HTML_CONVERSION always
 * answer FALSE.
 */
static UBool hasCESU8Data(const UConverter *cnv)
{
#if !UCONFIG_ONLY_HTML_CONVERSION
    const UBool usesCESU8 = (UBool)(&_CESU8Data == cnv->sharedData);
    return usesCESU8;
#else
    return FALSE;
#endif
}
64
U_CDECL_BEGIN
65
/*
 * Converts UTF-8 (or CESU-8) bytes from args->source into UTF-16 code units
 * at args->target, without offsets.
 *
 * State kept on the converter between calls when the input ends inside a
 * multi-byte sequence:
 *   toUnicodeStatus - value accumulated so far (marker bits not yet removed)
 *   mode            - total length of the sequence, in bytes
 *   toULength       - number of bytes already consumed
 * The bytes of the sequence in progress are also mirrored into cnv->toUBytes.
 *
 * Sets *err to U_ILLEGAL_CHAR_FOUND (with toULength = bytes consumed) on a
 * malformed sequence and to U_BUFFER_OVERFLOW_ERROR when the target is full;
 * a trail surrogate that does not fit goes into cnv->UCharErrorBuffer.
 * On return args->source and args->target are advanced.
 */
static void  U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
                                  UErrorCode * err)
{
    UConverter *cnv = args->converter;
    const unsigned char *src = (unsigned char *) args->source;
    const unsigned char *srcEnd = (unsigned char *) args->sourceLimit;
    UChar *dst = args->target;
    const UChar *dstEnd = args->targetLimit;
    unsigned char *seqBytes = cnv->toUBytes;
    const UBool isCESU8 = hasCESU8Data(cnv);
    uint32_t cp;
    uint32_t trail = 0;
    int32_t consumed, seqLen;

    /* Resume a sequence that was cut off at the end of the previous buffer. */
    if (cnv->toULength > 0 && dst < dstEnd)
    {
        seqLen = cnv->mode;             /* total bytes expected */
        consumed = cnv->toULength;      /* bytes already seen */
        cnv->toULength = 0;

        cp = cnv->toUnicodeStatus;      /* partial value from the earlier call */
        cnv->toUnicodeStatus = 0;
        goto morebytes;
    }

    while (src < srcEnd && dst < dstEnd)
    {
        cp = *src++;
        if (U8_IS_SINGLE(cp))
        {
            /* ASCII passes straight through */
            *dst++ = (UChar) cp;
            continue;
        }

        /* remember the lead byte and look up the sequence length */
        seqBytes[0] = (char) cp;
        seqLen = U8_COUNT_BYTES_NON_ASCII(cp);
        consumed = 1;

morebytes:
        while (consumed < seqLen)
        {
            if (src >= srcEnd)
            {
                /* input exhausted mid-sequence: park the partial result */
                cnv->toUnicodeStatus = cp;
                cnv->mode = seqLen;
                cnv->toULength = (int8_t) consumed;
                goto donefornow;
            }

            trail = *src;
            seqBytes[consumed] = (char) trail;
            if (!icu::UTF8::isValidTrail(cp, static_cast<uint8_t>(trail), consumed, seqLen) &&
                    !(isCESU8 && consumed == 1 && cp == 0xed && U8_IS_TRAIL(trail)))
            {
                break; /* leaves consumed < seqLen */
            }
            cp = (cp << 6) + trail;
            ++src;
            ++consumed;
        }

        /*
         * Reject short sequences, and in CESU-8 reject 4-byte forms:
         * there only surrogates, not supplementary code points, are encoded directly.
         */
        if (consumed != seqLen || (isCESU8 && consumed > 3))
        {
            cnv->toULength = (int8_t) consumed;
            *err = U_ILLEGAL_CHAR_FOUND;
            break;
        }

        /* strip the accumulated lead/trail marker bits */
        cp -= offsetsFromUTF8[seqLen];

        if (cp <= MAXIMUM_UCS2)
        {
            /* one UTF-16 code unit is enough */
            *dst++ = (UChar) cp;
        }
        else
        {
            /* supplementary code point: emit a surrogate pair */
            *dst++ = U16_LEAD(cp);
            cp = U16_TRAIL(cp);
            if (dst >= dstEnd)
            {
                /* no room for the trail surrogate: stash it for the framework */
                cnv->UCharErrorBuffer[0] = (UChar) cp;
                cnv->UCharErrorBufferLength = 1;
                *err = U_BUFFER_OVERFLOW_ERROR;
                break;
            }
            *dst++ = (UChar) cp;
        }
    }

donefornow:
    if (U_SUCCESS(*err) && src < srcEnd && dst >= dstEnd)
    {
        /* input remains but the target is full */
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    args->target = dst;
    args->source = (const char *) src;
}
180
181
/*
 * Same conversion as ucnv_toUnicode_UTF8, but additionally writes one entry to
 * args->offsets for every UTF-16 code unit produced. The entry is the running
 * byte index (counted from 0 within this call) of the start of the sequence
 * that produced the code unit; both halves of a surrogate pair get the same
 * index.
 *
 * Uses the same converter state (toUnicodeStatus / mode / toULength /
 * toUBytes) for sequences split across buffers and the same error reporting:
 * U_ILLEGAL_CHAR_FOUND for malformed input, U_BUFFER_OVERFLOW_ERROR when the
 * target is full. On return args->source, args->target and args->offsets are
 * advanced.
 */
static void  U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
                                                UErrorCode * err)
{
    UConverter *cnv = args->converter;
    const unsigned char *src = (unsigned char *) args->source;
    const unsigned char *srcEnd = (unsigned char *) args->sourceLimit;
    UChar *dst = args->target;
    const UChar *dstEnd = args->targetLimit;
    int32_t *offsetOut = args->offsets;
    int32_t srcIndex = 0;
    unsigned char *seqBytes = cnv->toUBytes;
    const UBool isCESU8 = hasCESU8Data(cnv);
    uint32_t cp;
    uint32_t trail = 0;
    int32_t consumed, seqLen;

    /* Resume a sequence that was cut off at the end of the previous buffer. */
    if (cnv->toULength > 0 && dst < dstEnd)
    {
        seqLen = cnv->mode;             /* total bytes expected */
        consumed = cnv->toULength;      /* bytes already seen */
        cnv->toULength = 0;

        cp = cnv->toUnicodeStatus;      /* partial value from the earlier call */
        cnv->toUnicodeStatus = 0;
        goto morebytes;
    }

    while (src < srcEnd && dst < dstEnd)
    {
        cp = *src++;
        if (U8_IS_SINGLE(cp))
        {
            /* ASCII passes straight through */
            *dst++ = (UChar) cp;
            *offsetOut++ = srcIndex++;
            continue;
        }

        seqBytes[0] = (char) cp;
        seqLen = U8_COUNT_BYTES_NON_ASCII(cp);
        consumed = 1;

morebytes:
        while (consumed < seqLen)
        {
            if (src >= srcEnd)
            {
                /* input exhausted mid-sequence: park the partial result */
                cnv->toUnicodeStatus = cp;
                cnv->mode = seqLen;
                cnv->toULength = (int8_t) consumed;
                goto donefornow;
            }

            trail = *src;
            seqBytes[consumed] = (char) trail;
            if (!icu::UTF8::isValidTrail(cp, static_cast<uint8_t>(trail), consumed, seqLen) &&
                    !(isCESU8 && consumed == 1 && cp == 0xed && U8_IS_TRAIL(trail)))
            {
                break; /* leaves consumed < seqLen */
            }
            cp = (cp << 6) + trail;
            ++src;
            ++consumed;
        }

        /*
         * Reject short sequences, and in CESU-8 reject 4-byte forms:
         * there only surrogates, not supplementary code points, are encoded directly.
         */
        if (consumed != seqLen || (isCESU8 && consumed > 3))
        {
            cnv->toULength = (int8_t) consumed;
            *err = U_ILLEGAL_CHAR_FOUND;
            break;
        }

        /* strip the accumulated lead/trail marker bits */
        cp -= offsetsFromUTF8[seqLen];

        if (cp <= MAXIMUM_UCS2)
        {
            /* one UTF-16 code unit is enough */
            *dst++ = (UChar) cp;
            *offsetOut++ = srcIndex;
        }
        else
        {
            /* supplementary code point: emit a surrogate pair */
            *dst++ = U16_LEAD(cp);
            *offsetOut++ = srcIndex;
            cp = U16_TRAIL(cp);
            if (dst < dstEnd)
            {
                *dst++ = (UChar) cp;
                *offsetOut++ = srcIndex;
            }
            else
            {
                /*
                 * No room for the trail surrogate: stash it for the framework.
                 * No break here; the loop condition ends the loop because
                 * the target is now full.
                 */
                cnv->UCharErrorBuffer[0] = (UChar) cp;
                cnv->UCharErrorBufferLength = 1;
                *err = U_BUFFER_OVERFLOW_ERROR;
            }
        }
        srcIndex += consumed;
    }

donefornow:
    if (U_SUCCESS(*err) && src < srcEnd && dst >= dstEnd)
    {
        /* input remains but the target is full */
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    args->target = dst;
    args->source = (const char *) src;
    args->offsets = offsetOut;
}
298
U_CDECL_END
299
300
/*
 * Converts UTF-16 code units from args->source into UTF-8 (or CESU-8) bytes at
 * args->target, without offsets.
 *
 * For UTF-8, a lead surrogate followed by a trail surrogate is combined into
 * one supplementary code point and written as four bytes. For CESU-8 the
 * surrogate check is skipped, so each surrogate is written as its own
 * three-byte sequence.
 *
 * cnv->fromUChar32 carries a surrogate across calls: it is set when the input
 * ends right after a surrogate, and also (together with U_ILLEGAL_CHAR_FOUND)
 * when a surrogate is unpaired. Bytes that do not fit into the target go to
 * cnv->charErrorBuffer and *err becomes U_BUFFER_OVERFLOW_ERROR.
 * On return args->source and args->target are advanced.
 */
U_CFUNC void  U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
                                    UErrorCode * err)
{
    UConverter *cnv = args->converter;
    const UChar *src = args->source;
    const UChar *srcEnd = args->sourceLimit;
    uint8_t *dst = (uint8_t *) args->target;
    const uint8_t *dstEnd = (uint8_t *) args->targetLimit;
    const UBool isNotCESU8 = !hasCESU8Data(cnv);
    uint8_t scratch[4];
    uint8_t *out;
    UChar32 cp;
    int32_t lastIndex;
    int32_t k;

    /* A surrogate is pending from the previous call: continue with it. */
    if (cnv->fromUChar32 && dst < dstEnd)
    {
        cp = cnv->fromUChar32;
        cnv->fromUChar32 = 0;
        goto lowsurrogate;
    }

    while (src < srcEnd && dst < dstEnd)
    {
        cp = *src++;

        if (cp < 0x80)
        {
            /* one byte */
            *dst++ = (uint8_t) cp;
            continue;
        }

        if (cp < 0x800)
        {
            /* two bytes; the second may have to go to the overflow buffer */
            *dst++ = (uint8_t) ((cp >> 6) | 0xc0);
            if (dst >= dstEnd)
            {
                cnv->charErrorBuffer[0] = (uint8_t) ((cp & 0x3f) | 0x80);
                cnv->charErrorBufferLength = 1;
                *err = U_BUFFER_OVERFLOW_ERROR;
            }
            else
            {
                *dst++ = (uint8_t) ((cp & 0x3f) | 0x80);
            }
            continue;
        }

        /* three or four bytes; pair up surrogates first (UTF-8 only) */
        if (isNotCESU8 && U16_IS_SURROGATE(cp))
        {
lowsurrogate:
            if (src >= srcEnd)
            {
                /* no more input: keep the surrogate for the next call */
                cnv->fromUChar32 = cp;
                break;
            }
            if (!(U16_IS_SURROGATE_LEAD(cp) && U16_IS_TRAIL(*src)))
            {
                /* unpaired lead or trail code unit: callback(illegal) */
                cnv->fromUChar32 = cp;
                *err = U_ILLEGAL_CHAR_FOUND;
                break;
            }
            /* combine and consume the trail unit */
            cp = U16_GET_SUPPLEMENTARY(cp, *src);
            ++src;
        }

        /*
         * With at least four bytes of room the sequence is assembled directly
         * in the target; otherwise it is assembled in scratch and copied out
         * byte by byte with bounds checks.
         */
        out = ((dstEnd - dst) >= 4) ? dst : scratch;

        if (cp <= MAXIMUM_UCS2)
        {
            lastIndex = 2;
            out[0] = (uint8_t) ((cp >> 12) | 0xe0);
        }
        else
        {
            lastIndex = 3;
            out[0] = (uint8_t) ((cp >> 18) | 0xf0);
            out[1] = (uint8_t) (((cp >> 12) & 0x3f) | 0x80);
        }
        out[lastIndex - 1] = (uint8_t) (((cp >> 6) & 0x3f) | 0x80);
        out[lastIndex] = (uint8_t) ((cp & 0x3f) | 0x80);

        if (out == dst)
        {
            /* written in place */
            dst += (lastIndex + 1);
        }
        else
        {
            /* copy what fits, spill the rest into the overflow buffer */
            for (k = 0; k <= lastIndex; ++k)
            {
                if (dst < dstEnd)
                {
                    *dst++ = scratch[k];
                }
                else
                {
                    cnv->charErrorBuffer[cnv->charErrorBufferLength++] = scratch[k];
                    *err = U_BUFFER_OVERFLOW_ERROR;
                }
            }
        }
    }

    if (U_SUCCESS(*err) && src < srcEnd && dst >= dstEnd)
    {
        /* input remains but the target is full */
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    args->target = (char *) dst;
    args->source = src;
}
413
414
/*
 * Same conversion as ucnv_fromUnicode_UTF8, but additionally writes one entry
 * to args->offsets for every byte stored in the target. The entry is the index
 * (counted from 0 within this call) of the UTF-16 code unit that started the
 * character; bytes spilled into cnv->charErrorBuffer get no entry.
 *
 * When the call starts with a surrogate pending in cnv->fromUChar32, that
 * character is reported with offset -1 and indexing of the new input starts
 * at 0.
 *
 * Error reporting matches ucnv_fromUnicode_UTF8. On return args->source,
 * args->target and args->offsets are advanced.
 */
U_CFUNC void  U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
                                                  UErrorCode * err)
{
    UConverter *cnv = args->converter;
    const UChar *src = args->source;
    const UChar *srcEnd = args->sourceLimit;
    uint8_t *dst = (uint8_t *) args->target;
    const uint8_t *dstEnd = (uint8_t *) args->targetLimit;
    int32_t *offsetOut = args->offsets;
    const UBool isNotCESU8 = !hasCESU8Data(cnv);
    uint8_t scratch[4];
    uint8_t *out;
    UChar32 cp;
    int32_t srcIndex, nextSrcIndex;
    int32_t lastIndex;
    int32_t k;

    if (cnv->fromUChar32 && dst < dstEnd)
    {
        /* A surrogate is pending from the previous call: continue with it. */
        cp = cnv->fromUChar32;
        cnv->fromUChar32 = 0;
        srcIndex = -1;
        nextSrcIndex = 0;
        goto lowsurrogate;
    }
    srcIndex = 0;

    while (src < srcEnd && dst < dstEnd)
    {
        cp = *src++;

        if (cp < 0x80)
        {
            /* one byte */
            *offsetOut++ = srcIndex++;
            *dst++ = (char) cp;
            continue;
        }

        if (cp < 0x800)
        {
            /* two bytes; the second may have to go to the overflow buffer */
            *offsetOut++ = srcIndex;
            *dst++ = (uint8_t) ((cp >> 6) | 0xc0);
            if (dst >= dstEnd)
            {
                cnv->charErrorBuffer[0] = (uint8_t) ((cp & 0x3f) | 0x80);
                cnv->charErrorBufferLength = 1;
                *err = U_BUFFER_OVERFLOW_ERROR;
            }
            else
            {
                *offsetOut++ = srcIndex++;
                *dst++ = (uint8_t) ((cp & 0x3f) | 0x80);
            }
            continue;
        }

        /* three or four bytes; pair up surrogates first (UTF-8 only) */
        nextSrcIndex = srcIndex + 1;

        if (isNotCESU8 && U16_IS_SURROGATE(cp))
        {
lowsurrogate:
            if (src >= srcEnd)
            {
                /* no more input: keep the surrogate for the next call */
                cnv->fromUChar32 = cp;
                break;
            }
            if (!(U16_IS_SURROGATE_LEAD(cp) && U16_IS_TRAIL(*src)))
            {
                /* unpaired lead or trail code unit: callback(illegal) */
                cnv->fromUChar32 = cp;
                *err = U_ILLEGAL_CHAR_FOUND;
                break;
            }
            /* combine and consume the trail unit */
            cp = U16_GET_SUPPLEMENTARY(cp, *src);
            ++src;
            ++nextSrcIndex;
        }

        /*
         * With at least four bytes of room the sequence is assembled directly
         * in the target; otherwise it is assembled in scratch and copied out
         * byte by byte with bounds checks.
         */
        out = ((dstEnd - dst) >= 4) ? dst : scratch;

        if (cp <= MAXIMUM_UCS2)
        {
            lastIndex = 2;
            out[0] = (uint8_t) ((cp >> 12) | 0xe0);
        }
        else
        {
            lastIndex = 3;
            out[0] = (uint8_t) ((cp >> 18) | 0xf0);
            out[1] = (uint8_t) (((cp >> 12) & 0x3f) | 0x80);
        }
        out[lastIndex - 1] = (uint8_t) (((cp >> 6) & 0x3f) | 0x80);
        out[lastIndex] = (uint8_t) ((cp & 0x3f) | 0x80);

        if (out == dst)
        {
            /* written in place: one offset entry per byte */
            dst += (lastIndex + 1);
            for (k = 0; k <= lastIndex; ++k)
            {
                offsetOut[k] = srcIndex;
            }
            offsetOut += (lastIndex + 1);
        }
        else
        {
            /* copy what fits, spill the rest into the overflow buffer */
            for (k = 0; k <= lastIndex; ++k)
            {
                if (dst < dstEnd)
                {
                    *offsetOut++ = srcIndex;
                    *dst++ = scratch[k];
                }
                else
                {
                    cnv->charErrorBuffer[cnv->charErrorBufferLength++] = scratch[k];
                    *err = U_BUFFER_OVERFLOW_ERROR;
                }
            }
        }
        srcIndex = nextSrcIndex;
    }

    if (U_SUCCESS(*err) && src < srcEnd && dst >= dstEnd)
    {
        /* input remains but the target is full */
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    args->target = (char *) dst;
    args->source = src;
    args->offsets = offsetOut;
}
552
553
U_CDECL_BEGIN
554
/*
 * Decodes exactly one code point from args->source and advances args->source
 * past the bytes it consumed.
 *
 * UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs.
 *
 * Returns the code point on success. On failure returns 0xffff and sets *err:
 *   U_INDEX_OUTOFBOUNDS_ERROR - no input at all
 *   U_TRUNCATED_CHAR_FOUND    - the input ends inside an otherwise valid sequence
 *   U_ILLEGAL_CHAR_FOUND      - invalid lead byte or invalid trail byte
 * For the latter two, the offending bytes are copied to cnv->toUBytes and
 * their count is stored in cnv->toULength.
 */
static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
                                               UErrorCode *err) {
    UConverter *cnv;
    const uint8_t *sourceInitial;
    const uint8_t *source;
    const uint8_t *sourceLimit;
    uint8_t myByte;
    UChar32 ch;
    int8_t i;

    cnv = args->converter;
    sourceInitial = source = (const uint8_t *)args->source;
    sourceLimit = (const uint8_t *)args->sourceLimit;
    if (source >= sourceLimit)
    {
        /* no input */
        *err = U_INDEX_OUTOFBOUNDS_ERROR;
        return 0xffff;
    }

    myByte = (uint8_t)*(source++);
    if (U8_IS_SINGLE(myByte))
    {
        args->source = (const char *)source;
        return (UChar32)myByte;
    }

    uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte);
    if (countTrailBytes == 0) {
        /* not a lead byte: report it alone */
        cnv->toUBytes[0] = myByte;
        cnv->toULength = 1;
        *err = U_ILLEGAL_CHAR_FOUND;
        args->source = (const char *)source;
        return 0xffff;
    }

    /*
     * The byte sequence is longer than the buffer area passed.
     * Compare against the number of bytes left rather than forming
     * source + countTrailBytes: that pointer could lie beyond one past the
     * end of the buffer, which is undefined behavior.
     */
    if (countTrailBytes > (sourceLimit - source))
    {
        /* check if all of the remaining bytes are trail bytes */
        uint16_t sequenceLength = countTrailBytes + 1;  /* lead byte + trail bytes */
        cnv->toUBytes[0] = myByte;
        i = 1;
        *err = U_TRUNCATED_CHAR_FOUND;
        while(source < sourceLimit) {
            uint8_t b = *source;
            if(icu::UTF8::isValidTrail(myByte, b, i, sequenceLength)) {
                cnv->toUBytes[i++] = b;
                ++source;
            } else {
                /* error even before we run out of input */
                *err = U_ILLEGAL_CHAR_FOUND;
                break;
            }
        }
        cnv->toULength = i;
        args->source = (const char *)source;
        return 0xffff;
    }

    /*
     * The whole sequence is available. Each branch validates the trail bytes
     * and returns directly on success; on failure it falls through with
     * source pointing at the first byte that did not validate.
     */
    ch = myByte << 6;
    if(countTrailBytes == 2) {
        uint8_t t1 = *source, t2;
        if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) {
            args->source = (const char *)(source + 1);
            return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3];
        }
    } else if(countTrailBytes == 1) {
        uint8_t t1 = *source;
        if(U8_IS_TRAIL(t1)) {
            args->source = (const char *)(source + 1);
            return (ch + t1) - offsetsFromUTF8[2];
        }
    } else {  // countTrailBytes == 3
        uint8_t t1 = *source, t2, t3;
        if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) &&
                U8_IS_TRAIL(t3 = *++source)) {
            args->source = (const char *)(source + 1);
            return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4];
        }
    }
    args->source = (const char *)source;

    /* illegal sequence: hand the bytes accepted so far to the error callback */
    for(i = 0; sourceInitial < source; ++i) {
        cnv->toUBytes[i] = *sourceInitial++;
    }
    cnv->toULength = i;
    *err = U_ILLEGAL_CHAR_FOUND;
    return 0xffff;
}
644
U_CDECL_END
645
646
/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
647
648
U_CDECL_BEGIN
649
/* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
650
static void U_CALLCONV
651
ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
652
                  UConverterToUnicodeArgs *pToUArgs,
653
0
                  UErrorCode *pErrorCode) {
654
0
    UConverter *utf8;
655
0
    const uint8_t *source, *sourceLimit;
656
0
    uint8_t *target;
657
0
    int32_t targetCapacity;
658
0
    int32_t count;
659
660
0
    int8_t oldToULength, toULength, toULimit;
661
662
0
    UChar32 c;
663
0
    uint8_t b, t1, t2;
664
665
    /* set up the local pointers */
666
0
    utf8=pToUArgs->converter;
667
0
    source=(uint8_t *)pToUArgs->source;
668
0
    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
669
0
    target=(uint8_t *)pFromUArgs->target;
670
0
    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
671
672
    /* get the converter state from the UTF-8 UConverter */
673
0
    if(utf8->toULength > 0) {
674
0
        toULength=oldToULength=utf8->toULength;
675
0
        toULimit=(int8_t)utf8->mode;
676
0
        c=(UChar32)utf8->toUnicodeStatus;
677
0
    } else {
678
0
        toULength=oldToULength=toULimit=0;
679
0
        c = 0;
680
0
    }
681
682
0
    count=(int32_t)(sourceLimit-source)+oldToULength;
683
0
    if(count<toULimit) {
684
        /*
685
         * Not enough input to complete the partial character.
686
         * Jump to moreBytes below - it will not output to target.
687
         */
688
0
    } else if(targetCapacity<toULimit) {
689
        /*
690
         * Not enough target capacity to output the partial character.
691
         * Let the standard converter handle this.
692
         */
693
0
        *pErrorCode=U_USING_DEFAULT_WARNING;
694
0
        return;
695
0
    } else {
696
        // Use a single counter for source and target, counting the minimum of
697
        // the source length and the target capacity.
698
        // Let the standard converter handle edge cases.
699
0
        if(count>targetCapacity) {
700
0
            count=targetCapacity;
701
0
        }
702
703
        // The conversion loop checks count>0 only once per character.
704
        // If the buffer ends with a truncated sequence,
705
        // then we reduce the count to stop before that,
706
        // and collect the remaining bytes after the conversion loop.
707
708
        // Do not go back into the bytes that will be read for finishing a partial
709
        // sequence from the previous buffer.
710
0
        int32_t length=count-toULength;
711
0
        U8_TRUNCATE_IF_INCOMPLETE(source, 0, length);
712
0
        count=toULength+length;
713
0
    }
714
715
0
    if(c!=0) {
716
0
        utf8->toUnicodeStatus=0;
717
0
        utf8->toULength=0;
718
0
        goto moreBytes;
719
        /* See note in ucnv_SBCSFromUTF8() about this goto. */
720
0
    }
721
722
    /* conversion loop */
723
0
    while(count>0) {
724
0
        b=*source++;
725
0
        if(U8_IS_SINGLE(b)) {
726
            /* convert ASCII */
727
0
            *target++=b;
728
0
            --count;
729
0
            continue;
730
0
        } else {
731
0
            if(b>=0xe0) {
732
0
                if( /* handle U+0800..U+FFFF inline */
733
0
                    b<0xf0 &&
734
0
                    U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
735
0
                    U8_IS_TRAIL(t2=source[1])
736
0
                ) {
737
0
                    source+=2;
738
0
                    *target++=b;
739
0
                    *target++=t1;
740
0
                    *target++=t2;
741
0
                    count-=3;
742
0
                    continue;
743
0
                }
744
0
            } else {
745
0
                if( /* handle U+0080..U+07FF inline */
746
0
                    b>=0xc2 &&
747
0
                    U8_IS_TRAIL(t1=*source)
748
0
                ) {
749
0
                    ++source;
750
0
                    *target++=b;
751
0
                    *target++=t1;
752
0
                    count-=2;
753
0
                    continue;
754
0
                }
755
0
            }
756
757
            /* handle "complicated" and error cases, and continuing partial characters */
758
0
            oldToULength=0;
759
0
            toULength=1;
760
0
            toULimit=U8_COUNT_BYTES_NON_ASCII(b);
761
0
            c=b;
762
0
moreBytes:
763
0
            while(toULength<toULimit) {
764
0
                if(source<sourceLimit) {
765
0
                    b=*source;
766
0
                    if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
767
0
                        ++source;
768
0
                        ++toULength;
769
0
                        c=(c<<6)+b;
770
0
                    } else {
771
0
                        break; /* sequence too short, stop with toULength<toULimit */
772
0
                    }
773
0
                } else {
774
                    /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
775
0
                    source-=(toULength-oldToULength);
776
0
                    while(oldToULength<toULength) {
777
0
                        utf8->toUBytes[oldToULength++]=*source++;
778
0
                    }
779
0
                    utf8->toUnicodeStatus=c;
780
0
                    utf8->toULength=toULength;
781
0
                    utf8->mode=toULimit;
782
0
                    pToUArgs->source=(char *)source;
783
0
                    pFromUArgs->target=(char *)target;
784
0
                    return;
785
0
                }
786
0
            }
787
788
0
            if(toULength!=toULimit) {
789
                /* error handling: illegal UTF-8 byte sequence */
790
0
                source-=(toULength-oldToULength);
791
0
                while(oldToULength<toULength) {
792
0
                    utf8->toUBytes[oldToULength++]=*source++;
793
0
                }
794
0
                utf8->toULength=toULength;
795
0
                pToUArgs->source=(char *)source;
796
0
                pFromUArgs->target=(char *)target;
797
0
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
798
0
                return;
799
0
            }
800
801
            /* copy the legal byte sequence to the target */
802
0
            {
803
0
                int8_t i;
804
805
0
                for(i=0; i<oldToULength; ++i) {
806
0
                    *target++=utf8->toUBytes[i];
807
0
                }
808
0
                source-=(toULength-oldToULength);
809
0
                for(; i<toULength; ++i) {
810
0
                    *target++=*source++;
811
0
                }
812
0
                count-=toULength;
813
0
            }
814
0
        }
815
0
    }
816
0
    U_ASSERT(count>=0);
817
818
0
    if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
819
0
        if(target==(const uint8_t *)pFromUArgs->targetLimit) {
820
0
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
821
0
        } else {
822
0
            b=*source;
823
0
            toULimit=U8_COUNT_BYTES(b);
824
0
            if(toULimit>(sourceLimit-source)) {
825
                /* collect a truncated byte sequence */
826
0
                toULength=0;
827
0
                c=b;
828
0
                for(;;) {
829
0
                    utf8->toUBytes[toULength++]=b;
830
0
                    if(++source==sourceLimit) {
831
                        /* partial byte sequence at end of source */
832
0
                        utf8->toUnicodeStatus=c;
833
0
                        utf8->toULength=toULength;
834
0
                        utf8->mode=toULimit;
835
0
                        break;
836
0
                    } else if(!icu::UTF8::isValidTrail(c, b=*source, toULength, toULimit)) {
837
0
                        utf8->toULength=toULength;
838
0
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
839
0
                        break;
840
0
                    }
841
0
                    c=(c<<6)+b;
842
0
                }
843
0
            } else {
844
                /* partial-sequence target overflow: fall back to the pivoting implementation */
845
0
                *pErrorCode=U_USING_DEFAULT_WARNING;
846
0
            }
847
0
        }
848
0
    }
849
850
    /* write back the updated pointers */
851
0
    pToUArgs->source=(char *)source;
852
0
    pFromUArgs->target=(char *)target;
853
0
}
854
855
U_CDECL_END
856
857
/* UTF-8 converter data ----------------------------------------------------- */
858
859
/*
 * Implementation table for the UTF-8 converter.
 * Entries are positional (see the UConverterImpl declaration for the slot
 * names); NULL marks a slot this converter does not implement.
 */
static const UConverterImpl _UTF8Impl = {
    UCNV_UTF8,

    NULL, NULL,

    NULL, NULL, NULL,

    /* conversion entry points defined in this file */
    ucnv_toUnicode_UTF8,
    ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
    ucnv_fromUnicode_UTF8,
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
    ucnv_getNextUChar_UTF8,

    NULL, NULL, NULL, NULL,
    ucnv_getNonSurrogateUnicodeSet,

    /* the UTF-8-to-UTF-8 validate-and-copy routine fills both remaining slots */
    ucnv_UTF8FromUTF8,
    ucnv_UTF8FromUTF8
};
884
885
/* CCSID 1208 designates UTF-8 for any version of Unicode. */
886
/*
 * Static description of the UTF-8 converter. Entries are positional; see the
 * UConverterStaticData declaration for the field names.
 */
static const UConverterStaticData _UTF8StaticData = {
    sizeof(UConverterStaticData),
    "UTF-8",
    1208,
    UCNV_IBM,
    UCNV_UTF8,
    /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
    1, 3,
    { 0xef, 0xbf, 0xbd, 0 }, 3,
    FALSE, FALSE,
    0,
    0,
    /* reserved */
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
};
896
897
898
/* Shared data object for UTF-8, tying the static description to the implementation table. */
const UConverterSharedData _UTF8Data =
    UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
900
901
/* CESU-8 converter data ---------------------------------------------------- */
902
903
/*
 * Implementation table for the CESU-8 converter.
 * It shares the four buffer conversion functions with UTF-8; those functions
 * branch on hasCESU8Data(). Entries are positional (see the UConverterImpl
 * declaration for the slot names); NULL marks a slot this converter does not
 * implement.
 */
static const UConverterImpl _CESU8Impl = {
    UCNV_CESU8,

    NULL, NULL,

    NULL, NULL, NULL,

    ucnv_toUnicode_UTF8,
    ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
    ucnv_fromUnicode_UTF8,
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
    NULL,

    NULL, NULL, NULL, NULL,
    ucnv_getCompleteUnicodeSet,

    NULL,
    NULL
};
928
929
/*
 * Static description of the CESU-8 converter. Entries are positional; see the
 * UConverterStaticData declaration for the field names.
 */
static const UConverterStaticData _CESU8StaticData = {
    sizeof(UConverterStaticData),
    "CESU-8",
    9400,  /* CCSID for CESU-8 */
    UCNV_UNKNOWN,
    UCNV_CESU8,
    1, 3,
    { 0xef, 0xbf, 0xbd, 0 }, 3,
    FALSE, FALSE,
    0,
    0,
    /* reserved */
    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
};
939
940
941
/* Shared data object for CESU-8; hasCESU8Data() compares a converter's sharedData against its address. */
const UConverterSharedData _CESU8Data =
    UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
943
944
#endif