Coverage Report

Created: 2025-07-11 06:23

/src/icu/source/common/ucnv_u8.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*  
4
**********************************************************************
5
*   Copyright (C) 2002-2016, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
**********************************************************************
8
*   file name:  ucnv_u8.cpp
9
*   encoding:   UTF-8
10
*   tab size:   8 (not used)
11
*   indentation:4
12
*
13
*   created on: 2002jul01
14
*   created by: Markus W. Scherer
15
*
16
*   UTF-8 converter implementation. Used to be in ucnv_utf.c.
17
*
18
*   Also, CESU-8 implementation, see UTR 26.
19
*   The CESU-8 converter uses all the same functions as the
20
*   UTF-8 converter, with a branch for converting supplementary code points.
21
*/
22
23
#include "unicode/utypes.h"
24
25
#if !UCONFIG_NO_CONVERSION
26
27
#include "unicode/ucnv.h"
28
#include "unicode/utf.h"
29
#include "unicode/utf8.h"
30
#include "unicode/utf16.h"
31
#include "ucnv_bld.h"
32
#include "ucnv_cnv.h"
33
#include "cmemory.h"
34
35
/* Prototypes --------------------------------------------------------------- */
36
37
/* Keep these here to make finicky compilers happy */
38
39
U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
40
                                           UErrorCode *err);
41
U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
42
                                                        UErrorCode *err);
43
44
45
/* UTF-8 -------------------------------------------------------------------- */
46
47
/* UTF-8 Conversion DATA
48
 *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
49
 */
50
/*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
51
316k
#define MAXIMUM_UCS2            0x0000FFFF
52
1.12M
#define MAXIMUM_UTF             0x0010FFFF
53
#define MAXIMUM_UCS4            0x7FFFFFFF
54
44.1k
#define HALF_SHIFT              10
55
44.1k
#define HALF_BASE               0x0010000
56
44.1k
#define HALF_MASK               0x3FF
57
44.1k
#define SURROGATE_HIGH_START    0xD800
58
#define SURROGATE_HIGH_END      0xDBFF
59
44.1k
#define SURROGATE_LOW_START     0xDC00
60
#define SURROGATE_LOW_END       0xDFFF
61
62
/* -SURROGATE_LOW_START + HALF_BASE */
63
#define SURROGATE_LOW_BASE      9216
64
65
static const uint32_t offsetsFromUTF8[7] = {0,
66
  (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
67
  (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
68
};
69
70
/* END OF UTF-8 Conversion DATA */
71
72
static const int8_t bytesFromUTF8[256] = {
73
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
74
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
75
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
76
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
77
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
78
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
79
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
80
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
81
};
82
83
/*
84
 * Starting with Unicode 3.0.1:
85
 * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
86
 * byte sequences with more than 4 bytes are illegal in UTF-8,
87
 * which is tested with impossible values for them
88
 */
89
static const uint32_t
90
utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
91
92
static UBool hasCESU8Data(const UConverter *cnv)
93
468k
{
94
#if UCONFIG_ONLY_HTML_CONVERSION
95
    return FALSE;
96
#else
97
468k
    return (UBool)(cnv->sharedData == &_CESU8Data);
98
468k
#endif
99
468k
}
100
U_CDECL_BEGIN
101
static void  U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
102
                                  UErrorCode * err)
103
468k
{
104
468k
    UConverter *cnv = args->converter;
105
468k
    const unsigned char *mySource = (unsigned char *) args->source;
106
468k
    UChar *myTarget = args->target;
107
468k
    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
108
468k
    const UChar *targetLimit = args->targetLimit;
109
468k
    unsigned char *toUBytes = cnv->toUBytes;
110
468k
    UBool isCESU8 = hasCESU8Data(cnv);
111
468k
    uint32_t ch, ch2 = 0;
112
468k
    int32_t i, inBytes;
113
114
    /* Restore size of current sequence */
115
468k
    if (cnv->toUnicodeStatus && myTarget < targetLimit)
116
227
    {
117
227
        inBytes = cnv->mode;            /* restore # of bytes to consume */
118
227
        i = cnv->toULength;             /* restore # of bytes consumed */
119
227
        cnv->toULength = 0;
120
121
227
        ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
122
227
        cnv->toUnicodeStatus = 0;
123
227
        goto morebytes;
124
227
    }
125
126
127
1.85M
    while (mySource < sourceLimit && myTarget < targetLimit)
128
1.84M
    {
129
1.84M
        ch = *(mySource++);
130
1.84M
        if (ch < 0x80)        /* Simple case */
131
1.06M
        {
132
1.06M
            *(myTarget++) = (UChar) ch;
133
1.06M
        }
134
784k
        else
135
784k
        {
136
            /* store the first char */
137
784k
            toUBytes[0] = (char)ch;
138
784k
            inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
139
784k
            i = 1;
140
141
784k
morebytes:
142
1.28M
            while (i < inBytes)
143
610k
            {
144
610k
                if (mySource < sourceLimit)
145
610k
                {
146
610k
                    toUBytes[i] = (char) (ch2 = *mySource);
147
610k
                    if (!U8_IS_TRAIL(ch2))
148
112k
                    {
149
112k
                        break; /* i < inBytes */
150
112k
                    }
151
497k
                    ch = (ch << 6) + ch2;
152
497k
                    ++mySource;
153
497k
                    i++;
154
497k
                }
155
455
                else
156
455
                {
157
                    /* stores a partially calculated target*/
158
455
                    cnv->toUnicodeStatus = ch;
159
455
                    cnv->mode = inBytes;
160
455
                    cnv->toULength = (int8_t) i;
161
455
                    goto donefornow;
162
455
                }
163
610k
            }
164
165
            /* Remove the accumulated high bits */
166
783k
            ch -= offsetsFromUTF8[inBytes];
167
168
            /*
169
             * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
170
             * - use only trail bytes after a lead byte (checked above)
171
             * - use the right number of trail bytes for a given lead byte
172
             * - encode a code point <= U+10ffff
173
             * - use the fewest possible number of bytes for their code points
174
             * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
175
             *
176
             * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
177
             * There are no irregular sequences any more.
178
             * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
179
             */
180
783k
            if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
181
783k
                (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
182
316k
            {
183
                /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
184
316k
                if (ch <= MAXIMUM_UCS2) 
185
272k
                {
186
                    /* fits in 16 bits */
187
272k
                    *(myTarget++) = (UChar) ch;
188
272k
                }
189
44.1k
                else
190
44.1k
                {
191
                    /* write out the surrogates */
192
44.1k
                    ch -= HALF_BASE;
193
44.1k
                    *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
194
44.1k
                    ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
195
44.1k
                    if (myTarget < targetLimit)
196
44.1k
                    {
197
44.1k
                        *(myTarget++) = (UChar)ch;
198
44.1k
                    }
199
0
                    else
200
0
                    {
201
                        /* Put in overflow buffer (not handled here) */
202
0
                        cnv->UCharErrorBuffer[0] = (UChar) ch;
203
0
                        cnv->UCharErrorBufferLength = 1;
204
0
                        *err = U_BUFFER_OVERFLOW_ERROR;
205
0
                        break;
206
0
                    }
207
44.1k
                }
208
316k
            }
209
467k
            else
210
467k
            {
211
467k
                cnv->toULength = (int8_t)i;
212
467k
                *err = U_ILLEGAL_CHAR_FOUND;
213
467k
                break;
214
467k
            }
215
783k
        }
216
1.84M
    }
217
218
468k
donefornow:
219
468k
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
220
0
    {
221
        /* End of target buffer */
222
0
        *err = U_BUFFER_OVERFLOW_ERROR;
223
0
    }
224
225
468k
    args->target = myTarget;
226
468k
    args->source = (const char *) mySource;
227
468k
}
228
229
static void  U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
230
                                                UErrorCode * err)
231
0
{
232
0
    UConverter *cnv = args->converter;
233
0
    const unsigned char *mySource = (unsigned char *) args->source;
234
0
    UChar *myTarget = args->target;
235
0
    int32_t *myOffsets = args->offsets;
236
0
    int32_t offsetNum = 0;
237
0
    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
238
0
    const UChar *targetLimit = args->targetLimit;
239
0
    unsigned char *toUBytes = cnv->toUBytes;
240
0
    UBool isCESU8 = hasCESU8Data(cnv);
241
0
    uint32_t ch, ch2 = 0;
242
0
    int32_t i, inBytes;
243
244
    /* Restore size of current sequence */
245
0
    if (cnv->toUnicodeStatus && myTarget < targetLimit)
246
0
    {
247
0
        inBytes = cnv->mode;            /* restore # of bytes to consume */
248
0
        i = cnv->toULength;             /* restore # of bytes consumed */
249
0
        cnv->toULength = 0;
250
251
0
        ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
252
0
        cnv->toUnicodeStatus = 0;
253
0
        goto morebytes;
254
0
    }
255
256
0
    while (mySource < sourceLimit && myTarget < targetLimit)
257
0
    {
258
0
        ch = *(mySource++);
259
0
        if (ch < 0x80)        /* Simple case */
260
0
        {
261
0
            *(myTarget++) = (UChar) ch;
262
0
            *(myOffsets++) = offsetNum++;
263
0
        }
264
0
        else
265
0
        {
266
0
            toUBytes[0] = (char)ch;
267
0
            inBytes = bytesFromUTF8[ch];
268
0
            i = 1;
269
270
0
morebytes:
271
0
            while (i < inBytes)
272
0
            {
273
0
                if (mySource < sourceLimit)
274
0
                {
275
0
                    toUBytes[i] = (char) (ch2 = *mySource);
276
0
                    if (!U8_IS_TRAIL(ch2))
277
0
                    {
278
0
                        break; /* i < inBytes */
279
0
                    }
280
0
                    ch = (ch << 6) + ch2;
281
0
                    ++mySource;
282
0
                    i++;
283
0
                }
284
0
                else
285
0
                {
286
0
                    cnv->toUnicodeStatus = ch;
287
0
                    cnv->mode = inBytes;
288
0
                    cnv->toULength = (int8_t)i;
289
0
                    goto donefornow;
290
0
                }
291
0
            }
292
293
            /* Remove the accumulated high bits */
294
0
            ch -= offsetsFromUTF8[inBytes];
295
296
            /*
297
             * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
298
             * - use only trail bytes after a lead byte (checked above)
299
             * - use the right number of trail bytes for a given lead byte
300
             * - encode a code point <= U+10ffff
301
             * - use the fewest possible number of bytes for their code points
302
             * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
303
             *
304
             * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
305
             * There are no irregular sequences any more.
306
             * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
307
             */
308
0
            if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
309
0
                (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
310
0
            {
311
                /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
312
0
                if (ch <= MAXIMUM_UCS2) 
313
0
                {
314
                    /* fits in 16 bits */
315
0
                    *(myTarget++) = (UChar) ch;
316
0
                    *(myOffsets++) = offsetNum;
317
0
                }
318
0
                else
319
0
                {
320
                    /* write out the surrogates */
321
0
                    ch -= HALF_BASE;
322
0
                    *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
323
0
                    *(myOffsets++) = offsetNum;
324
0
                    ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
325
0
                    if (myTarget < targetLimit)
326
0
                    {
327
0
                        *(myTarget++) = (UChar)ch;
328
0
                        *(myOffsets++) = offsetNum;
329
0
                    }
330
0
                    else
331
0
                    {
332
0
                        cnv->UCharErrorBuffer[0] = (UChar) ch;
333
0
                        cnv->UCharErrorBufferLength = 1;
334
0
                        *err = U_BUFFER_OVERFLOW_ERROR;
335
0
                    }
336
0
                }
337
0
                offsetNum += i;
338
0
            }
339
0
            else
340
0
            {
341
0
                cnv->toULength = (int8_t)i;
342
0
                *err = U_ILLEGAL_CHAR_FOUND;
343
0
                break;
344
0
            }
345
0
        }
346
0
    }
347
348
0
donefornow:
349
0
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
350
0
    {   /* End of target buffer */
351
0
        *err = U_BUFFER_OVERFLOW_ERROR;
352
0
    }
353
354
0
    args->target = myTarget;
355
0
    args->source = (const char *) mySource;
356
0
    args->offsets = myOffsets;
357
0
}
358
U_CDECL_END
359
360
U_CFUNC void  U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
361
                                    UErrorCode * err)
362
0
{
363
0
    UConverter *cnv = args->converter;
364
0
    const UChar *mySource = args->source;
365
0
    const UChar *sourceLimit = args->sourceLimit;
366
0
    uint8_t *myTarget = (uint8_t *) args->target;
367
0
    const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
368
0
    uint8_t *tempPtr;
369
0
    UChar32 ch;
370
0
    uint8_t tempBuf[4];
371
0
    int32_t indexToWrite;
372
0
    UBool isNotCESU8 = !hasCESU8Data(cnv);
373
374
0
    if (cnv->fromUChar32 && myTarget < targetLimit)
375
0
    {
376
0
        ch = cnv->fromUChar32;
377
0
        cnv->fromUChar32 = 0;
378
0
        goto lowsurrogate;
379
0
    }
380
381
0
    while (mySource < sourceLimit && myTarget < targetLimit)
382
0
    {
383
0
        ch = *(mySource++);
384
385
0
        if (ch < 0x80)        /* Single byte */
386
0
        {
387
0
            *(myTarget++) = (uint8_t) ch;
388
0
        }
389
0
        else if (ch < 0x800)  /* Double byte */
390
0
        {
391
0
            *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
392
0
            if (myTarget < targetLimit)
393
0
            {
394
0
                *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
395
0
            }
396
0
            else
397
0
            {
398
0
                cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
399
0
                cnv->charErrorBufferLength = 1;
400
0
                *err = U_BUFFER_OVERFLOW_ERROR;
401
0
            }
402
0
        }
403
0
        else {
404
            /* Check for surrogates */
405
0
            if(U16_IS_SURROGATE(ch) && isNotCESU8) {
406
0
lowsurrogate:
407
0
                if (mySource < sourceLimit) {
408
                    /* test both code units */
409
0
                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
410
                        /* convert and consume this supplementary code point */
411
0
                        ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
412
0
                        ++mySource;
413
                        /* exit this condition tree */
414
0
                    }
415
0
                    else {
416
                        /* this is an unpaired trail or lead code unit */
417
                        /* callback(illegal) */
418
0
                        cnv->fromUChar32 = ch;
419
0
                        *err = U_ILLEGAL_CHAR_FOUND;
420
0
                        break;
421
0
                    }
422
0
                }
423
0
                else {
424
                    /* no more input */
425
0
                    cnv->fromUChar32 = ch;
426
0
                    break;
427
0
                }
428
0
            }
429
430
            /* Do we write the buffer directly for speed,
431
            or do we have to be careful about target buffer space? */
432
0
            tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
433
434
0
            if (ch <= MAXIMUM_UCS2) {
435
0
                indexToWrite = 2;
436
0
                tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
437
0
            }
438
0
            else {
439
0
                indexToWrite = 3;
440
0
                tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
441
0
                tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
442
0
            }
443
0
            tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
444
0
            tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
445
446
0
            if (tempPtr == myTarget) {
447
                /* There was enough space to write the codepoint directly. */
448
0
                myTarget += (indexToWrite + 1);
449
0
            }
450
0
            else {
451
                /* We might run out of room soon. Write it slowly. */
452
0
                for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
453
0
                    if (myTarget < targetLimit) {
454
0
                        *(myTarget++) = *tempPtr;
455
0
                    }
456
0
                    else {
457
0
                        cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
458
0
                        *err = U_BUFFER_OVERFLOW_ERROR;
459
0
                    }
460
0
                }
461
0
            }
462
0
        }
463
0
    }
464
465
0
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
466
0
    {
467
0
        *err = U_BUFFER_OVERFLOW_ERROR;
468
0
    }
469
470
0
    args->target = (char *) myTarget;
471
0
    args->source = mySource;
472
0
}
473
474
U_CFUNC void  U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
475
                                                  UErrorCode * err)
476
0
{
477
0
    UConverter *cnv = args->converter;
478
0
    const UChar *mySource = args->source;
479
0
    int32_t *myOffsets = args->offsets;
480
0
    const UChar *sourceLimit = args->sourceLimit;
481
0
    uint8_t *myTarget = (uint8_t *) args->target;
482
0
    const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
483
0
    uint8_t *tempPtr;
484
0
    UChar32 ch;
485
0
    int32_t offsetNum, nextSourceIndex;
486
0
    int32_t indexToWrite;
487
0
    uint8_t tempBuf[4];
488
0
    UBool isNotCESU8 = !hasCESU8Data(cnv);
489
490
0
    if (cnv->fromUChar32 && myTarget < targetLimit)
491
0
    {
492
0
        ch = cnv->fromUChar32;
493
0
        cnv->fromUChar32 = 0;
494
0
        offsetNum = -1;
495
0
        nextSourceIndex = 0;
496
0
        goto lowsurrogate;
497
0
    } else {
498
0
        offsetNum = 0;
499
0
    }
500
501
0
    while (mySource < sourceLimit && myTarget < targetLimit)
502
0
    {
503
0
        ch = *(mySource++);
504
505
0
        if (ch < 0x80)        /* Single byte */
506
0
        {
507
0
            *(myOffsets++) = offsetNum++;
508
0
            *(myTarget++) = (char) ch;
509
0
        }
510
0
        else if (ch < 0x800)  /* Double byte */
511
0
        {
512
0
            *(myOffsets++) = offsetNum;
513
0
            *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
514
0
            if (myTarget < targetLimit)
515
0
            {
516
0
                *(myOffsets++) = offsetNum++;
517
0
                *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
518
0
            }
519
0
            else
520
0
            {
521
0
                cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
522
0
                cnv->charErrorBufferLength = 1;
523
0
                *err = U_BUFFER_OVERFLOW_ERROR;
524
0
            }
525
0
        }
526
0
        else
527
        /* Check for surrogates */
528
0
        {
529
0
            nextSourceIndex = offsetNum + 1;
530
531
0
            if(U16_IS_SURROGATE(ch) && isNotCESU8) {
532
0
lowsurrogate:
533
0
                if (mySource < sourceLimit) {
534
                    /* test both code units */
535
0
                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
536
                        /* convert and consume this supplementary code point */
537
0
                        ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
538
0
                        ++mySource;
539
0
                        ++nextSourceIndex;
540
                        /* exit this condition tree */
541
0
                    }
542
0
                    else {
543
                        /* this is an unpaired trail or lead code unit */
544
                        /* callback(illegal) */
545
0
                        cnv->fromUChar32 = ch;
546
0
                        *err = U_ILLEGAL_CHAR_FOUND;
547
0
                        break;
548
0
                    }
549
0
                }
550
0
                else {
551
                    /* no more input */
552
0
                    cnv->fromUChar32 = ch;
553
0
                    break;
554
0
                }
555
0
            }
556
557
            /* Do we write the buffer directly for speed,
558
            or do we have to be careful about target buffer space? */
559
0
            tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
560
561
0
            if (ch <= MAXIMUM_UCS2) {
562
0
                indexToWrite = 2;
563
0
                tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
564
0
            }
565
0
            else {
566
0
                indexToWrite = 3;
567
0
                tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
568
0
                tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
569
0
            }
570
0
            tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
571
0
            tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
572
573
0
            if (tempPtr == myTarget) {
574
                /* There was enough space to write the codepoint directly. */
575
0
                myTarget += (indexToWrite + 1);
576
0
                myOffsets[0] = offsetNum;
577
0
                myOffsets[1] = offsetNum;
578
0
                myOffsets[2] = offsetNum;
579
0
                if (indexToWrite >= 3) {
580
0
                    myOffsets[3] = offsetNum;
581
0
                }
582
0
                myOffsets += (indexToWrite + 1);
583
0
            }
584
0
            else {
585
                /* We might run out of room soon. Write it slowly. */
586
0
                for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
587
0
                    if (myTarget < targetLimit)
588
0
                    {
589
0
                        *(myOffsets++) = offsetNum;
590
0
                        *(myTarget++) = *tempPtr;
591
0
                    }
592
0
                    else
593
0
                    {
594
0
                        cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
595
0
                        *err = U_BUFFER_OVERFLOW_ERROR;
596
0
                    }
597
0
                }
598
0
            }
599
0
            offsetNum = nextSourceIndex;
600
0
        }
601
0
    }
602
603
0
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
604
0
    {
605
0
        *err = U_BUFFER_OVERFLOW_ERROR;
606
0
    }
607
608
0
    args->target = (char *) myTarget;
609
0
    args->source = mySource;
610
0
    args->offsets = myOffsets;
611
0
}
612
613
U_CDECL_BEGIN
614
static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
615
0
                                               UErrorCode *err) {
616
0
    UConverter *cnv;
617
0
    const uint8_t *sourceInitial;
618
0
    const uint8_t *source;
619
0
    uint16_t extraBytesToWrite;
620
0
    uint8_t myByte;
621
0
    UChar32 ch;
622
0
    int8_t i, isLegalSequence;
623
624
    /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
625
626
0
    cnv = args->converter;
627
0
    sourceInitial = source = (const uint8_t *)args->source;
628
0
    if (source >= (const uint8_t *)args->sourceLimit)
629
0
    {
630
        /* no input */
631
0
        *err = U_INDEX_OUTOFBOUNDS_ERROR;
632
0
        return 0xffff;
633
0
    }
634
635
0
    myByte = (uint8_t)*(source++);
636
0
    if (myByte < 0x80)
637
0
    {
638
0
        args->source = (const char *)source;
639
0
        return (UChar32)myByte;
640
0
    }
641
642
0
    extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
643
0
    if (extraBytesToWrite == 0) {
644
0
        cnv->toUBytes[0] = myByte;
645
0
        cnv->toULength = 1;
646
0
        *err = U_ILLEGAL_CHAR_FOUND;
647
0
        args->source = (const char *)source;
648
0
        return 0xffff;
649
0
    }
650
651
    /*The byte sequence is longer than the buffer area passed*/
652
0
    if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
653
0
    {
654
        /* check if all of the remaining bytes are trail bytes */
655
0
        cnv->toUBytes[0] = myByte;
656
0
        i = 1;
657
0
        *err = U_TRUNCATED_CHAR_FOUND;
658
0
        while(source < (const uint8_t *)args->sourceLimit) {
659
0
            if(U8_IS_TRAIL(myByte = *source)) {
660
0
                cnv->toUBytes[i++] = myByte;
661
0
                ++source;
662
0
            } else {
663
                /* error even before we run out of input */
664
0
                *err = U_ILLEGAL_CHAR_FOUND;
665
0
                break;
666
0
            }
667
0
        }
668
0
        cnv->toULength = i;
669
0
        args->source = (const char *)source;
670
0
        return 0xffff;
671
0
    }
672
673
0
    isLegalSequence = 1;
674
0
    ch = myByte << 6;
675
0
    switch(extraBytesToWrite)
676
0
    {     
677
      /* note: code falls through cases! (sic)*/ 
678
0
    case 6:
679
0
        ch += (myByte = *source);
680
0
        ch <<= 6;
681
0
        if (!U8_IS_TRAIL(myByte))
682
0
        {
683
0
            isLegalSequence = 0;
684
0
            break;
685
0
        }
686
0
        ++source;
687
0
        U_FALLTHROUGH;
688
0
    case 5:
689
0
        ch += (myByte = *source);
690
0
        ch <<= 6;
691
0
        if (!U8_IS_TRAIL(myByte))
692
0
        {
693
0
            isLegalSequence = 0;
694
0
            break;
695
0
        }
696
0
        ++source;
697
0
        U_FALLTHROUGH;
698
0
    case 4:
699
0
        ch += (myByte = *source);
700
0
        ch <<= 6;
701
0
        if (!U8_IS_TRAIL(myByte))
702
0
        {
703
0
            isLegalSequence = 0;
704
0
            break;
705
0
        }
706
0
        ++source;
707
0
        U_FALLTHROUGH;
708
0
    case 3:
709
0
        ch += (myByte = *source);
710
0
        ch <<= 6;
711
0
        if (!U8_IS_TRAIL(myByte))
712
0
        {
713
0
            isLegalSequence = 0;
714
0
            break;
715
0
        }
716
0
        ++source;
717
0
        U_FALLTHROUGH;
718
0
    case 2:
719
0
        ch += (myByte = *source);
720
0
        if (!U8_IS_TRAIL(myByte))
721
0
        {
722
0
            isLegalSequence = 0;
723
0
            break;
724
0
        }
725
0
        ++source;
726
0
    };
727
0
    ch -= offsetsFromUTF8[extraBytesToWrite];
728
0
    args->source = (const char *)source;
729
730
    /*
731
     * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
732
     * - use only trail bytes after a lead byte (checked above)
733
     * - use the right number of trail bytes for a given lead byte
734
     * - encode a code point <= U+10ffff
735
     * - use the fewest possible number of bytes for their code points
736
     * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
737
     *
738
     * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
739
     * There are no irregular sequences any more.
740
     */
741
0
    if (isLegalSequence &&
742
0
        (uint32_t)ch <= MAXIMUM_UTF &&
743
0
        (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
744
0
        !U_IS_SURROGATE(ch)
745
0
    ) {
746
0
        return ch; /* return the code point */
747
0
    }
748
749
0
    for(i = 0; sourceInitial < source; ++i) {
750
0
        cnv->toUBytes[i] = *sourceInitial++;
751
0
    }
752
0
    cnv->toULength = i;
753
0
    *err = U_ILLEGAL_CHAR_FOUND;
754
0
    return 0xffff;
755
0
} 
756
U_CDECL_END
757
758
/* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
759
760
/* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
761
static const UChar32
762
utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
763
764
/* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
765
static const UChar32
766
utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
767
768
U_CDECL_BEGIN
769
/* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
770
/*
 * ucnv_UTF8FromUTF8: validate UTF-8 bytes read from pToUArgs->source and copy
 * them unchanged to pFromUArgs->target.
 *
 * pFromUArgs  supplies target/targetLimit; target is written back advanced.
 * pToUArgs    supplies source/sourceLimit and the converter whose state
 *             (toUnicodeStatus, toULength, mode, toUBytes) is read and updated;
 *             source is written back advanced.
 * pErrorCode  is set to
 *   U_ILLEGAL_CHAR_FOUND     for an ill-formed byte sequence; the offending
 *                            bytes are stored in toUBytes with their count in
 *                            toULength,
 *   U_BUFFER_OVERFLOW_ERROR  when input remains and target has reached
 *                            targetLimit,
 *   U_USING_DEFAULT_WARNING  when this function declines a case and leaves it
 *                            to the standard/pivoting conversion path.
 *
 * A sequence that is cut off by sourceLimit is saved in the converter state
 * and resumed on a later call by jumping to the moreBytes label.
 */
static void U_CALLCONV
771
ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
772
                  UConverterToUnicodeArgs *pToUArgs,
773
0
                  UErrorCode *pErrorCode) {
774
0
    UConverter *utf8;
775
0
    const uint8_t *source, *sourceLimit;
776
0
    uint8_t *target;
777
0
    int32_t targetCapacity;
778
0
    int32_t count;
779
780
0
    int8_t oldToULength, toULength, toULimit;
781
782
0
    UChar32 c;
783
0
    uint8_t b, t1, t2;
784
785
    /* set up the local pointers */
786
0
    utf8=pToUArgs->converter;
787
0
    source=(uint8_t *)pToUArgs->source;
788
0
    sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
789
0
    target=(uint8_t *)pFromUArgs->target;
790
0
    targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
791
792
    /* get the converter state from the UTF-8 UConverter */
793
0
    c=(UChar32)utf8->toUnicodeStatus;
794
0
    if(c!=0) {
795
0
        toULength=oldToULength=utf8->toULength;
796
0
        toULimit=(int8_t)utf8->mode;
797
0
    } else {
798
0
        toULength=oldToULength=toULimit=0;
799
0
    }
800
801
0
    /* bytes available for this call: remaining source plus the bytes buffered by a previous call */
    count=(int32_t)(sourceLimit-source)+oldToULength;
802
0
    if(count<toULimit) {
803
        /*
804
         * Not enough input to complete the partial character.
805
         * Jump to moreBytes below - it will not output to target.
806
         */
807
0
    } else if(targetCapacity<toULimit) {
808
        /*
809
         * Not enough target capacity to output the partial character.
810
         * Let the standard converter handle this.
811
         */
812
0
        *pErrorCode=U_USING_DEFAULT_WARNING;
813
0
        return;
814
0
    } else {
815
        /*
816
         * Use a single counter for source and target, counting the minimum of
817
         * the source length and the target capacity.
818
         * As a result, the source length is checked only once per multi-byte
819
         * character instead of twice.
820
         *
821
         * Make sure that the last byte sequence is complete, or else
822
         * stop just before it.
823
         * (The longest legal byte sequence has 3 trail bytes.)
824
         * Count oldToULength (number of source bytes from a previous buffer)
825
         * into the source length but reduce the source index by toULimit
826
         * while going back over trail bytes in order to not go back into
827
         * the bytes that will be read for finishing a partial
828
         * sequence from the previous buffer.
829
         * Let the standard converter handle edge cases.
830
         */
831
0
        int32_t i;
832
833
0
        if(count>targetCapacity) {
834
0
            count=targetCapacity;
835
0
        }
836
837
0
        i=0;
838
0
        while(i<3 && i<(count-toULimit)) {
839
0
            b=source[count-oldToULength-i-1];
840
0
            if(U8_IS_TRAIL(b)) {
841
0
                ++i;
842
0
            } else {
843
0
                if(i<U8_COUNT_TRAIL_BYTES(b)) {
844
                    /* stop converting before the lead byte if there are not enough trail bytes for it */
845
0
                    count-=i+1;
846
0
                }
847
0
                break;
848
0
            }
849
0
        }
850
0
    }
851
852
0
    if(c!=0) {
853
0
        utf8->toUnicodeStatus=0;
854
0
        utf8->toULength=0;
855
0
        goto moreBytes;
856
        /* See note in ucnv_SBCSFromUTF8() about this goto. */
857
0
    }
858
859
    /* conversion loop */
860
0
    /* NOTE(review): the inline 2- and 3-byte paths below read source[0]/source[1] without comparing against sourceLimit; this appears to rely on the count adjustment above - confirm */
    while(count>0) {
861
0
        b=*source++;
862
0
        /* a byte in 0x00..0x7f is non-negative when reinterpreted as int8_t */
        if((int8_t)b>=0) {
863
            /* convert ASCII */
864
0
            *target++=b;
865
0
            --count;
866
0
            continue;
867
0
        } else {
868
0
            if(b>0xe0) {
869
0
                if( /* handle U+1000..U+D7FF inline */
870
0
                    (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
871
0
                                               (b==0xed && (t1 <= 0x9f))) &&
872
0
                    (t2=source[1]) >= 0x80 && t2 <= 0xbf
873
0
                ) {
874
0
                    source+=2;
875
0
                    *target++=b;
876
0
                    *target++=t1;
877
0
                    *target++=t2;
878
0
                    count-=3;
879
0
                    continue;
880
0
                }
881
0
            } else if(b<0xe0) {
882
0
                if( /* handle U+0080..U+07FF inline */
883
0
                    b>=0xc2 &&
884
0
                    (t1=*source) >= 0x80 && t1 <= 0xbf
885
0
                ) {
886
0
                    ++source;
887
0
                    *target++=b;
888
0
                    *target++=t1;
889
0
                    count-=2;
890
0
                    continue;
891
0
                }
892
0
            } else if(b==0xe0) {
893
0
                if( /* handle U+0800..U+0FFF inline */
894
0
                    (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
895
0
                    (t2=source[1]) >= 0x80 && t2 <= 0xbf
896
0
                ) {
897
0
                    source+=2;
898
0
                    *target++=b;
899
0
                    *target++=t1;
900
0
                    *target++=t2;
901
0
                    count-=3;
902
0
                    continue;
903
0
                }
904
0
            }
905
906
            /* handle "complicated" and error cases, and continuing partial characters */
907
0
            oldToULength=0;
908
0
            toULength=1;
909
0
            toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
910
0
            c=b;
911
0
moreBytes:
912
0
            while(toULength<toULimit) {
913
0
                if(source<sourceLimit) {
914
0
                    b=*source;
915
0
                    if(U8_IS_TRAIL(b)) {
916
0
                        ++source;
917
0
                        ++toULength;
918
0
                        /* accumulate the raw byte; marker bits are removed later by subtracting utf8_offsets[] */
                        c=(c<<6)+b;
919
0
                    } else {
920
0
                        break; /* sequence too short, stop with toULength<toULimit */
921
0
                    }
922
0
                } else {
923
                    /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
924
0
                    source-=(toULength-oldToULength);
925
0
                    while(oldToULength<toULength) {
926
0
                        utf8->toUBytes[oldToULength++]=*source++;
927
0
                    }
928
0
                    utf8->toUnicodeStatus=c;
929
0
                    utf8->toULength=toULength;
930
0
                    utf8->mode=toULimit;
931
0
                    pToUArgs->source=(char *)source;
932
0
                    pFromUArgs->target=(char *)target;
933
0
                    return;
934
0
                }
935
0
            }
936
937
0
            /* short-circuit evaluation ensures c is reduced by utf8_offsets[] at most once across the two conditions below */
            if( toULength==toULimit &&      /* consumed all trail bytes */
938
0
                (toULength==3 || toULength==2) &&             /* BMP */
939
0
                (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
940
0
                (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
941
0
            ) {
942
                /* legal byte sequence for BMP code point */
943
0
            } else if(
944
0
                toULength==toULimit && toULength==4 &&
945
0
                (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
946
0
            ) {
947
                /* legal byte sequence for supplementary code point */
948
0
            } else {
949
                /* error handling: illegal UTF-8 byte sequence */
950
0
                source-=(toULength-oldToULength);
951
0
                while(oldToULength<toULength) {
952
0
                    utf8->toUBytes[oldToULength++]=*source++;
953
0
                }
954
0
                utf8->toULength=toULength;
955
0
                pToUArgs->source=(char *)source;
956
0
                pFromUArgs->target=(char *)target;
957
0
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
958
0
                return;
959
0
            }
960
961
            /* copy the legal byte sequence to the target */
962
0
            {
963
0
                int8_t i;
964
965
0
                /* first the bytes that were buffered in toUBytes by a previous call */
                for(i=0; i<oldToULength; ++i) {
966
0
                    *target++=utf8->toUBytes[i];
967
0
                }
968
0
                /* then rewind over the bytes consumed from this buffer and copy them */
                source-=(toULength-oldToULength);
969
0
                for(; i<toULength; ++i) {
970
0
                    *target++=*source++;
971
0
                }
972
0
                count-=toULength;
973
0
            }
974
0
        }
975
0
    }
976
977
0
    /* input is left over: decide between overflow, a truncated final sequence, or handing off */
    if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
978
0
        if(target==(const uint8_t *)pFromUArgs->targetLimit) {
979
0
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
980
0
        } else {
981
0
            b=*source;
982
0
            toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
983
0
            if(toULimit>(sourceLimit-source)) {
984
                /* collect a truncated byte sequence */
985
0
                toULength=0;
986
0
                c=b;
987
0
                for(;;) {
988
0
                    utf8->toUBytes[toULength++]=b;
989
0
                    if(++source==sourceLimit) {
990
                        /* partial byte sequence at end of source */
991
0
                        utf8->toUnicodeStatus=c;
992
0
                        utf8->toULength=toULength;
993
0
                        utf8->mode=toULimit;
994
0
                        break;
995
0
                    } else if(!U8_IS_TRAIL(b=*source)) {
996
                        /* lead byte in trail byte position */
997
0
                        utf8->toULength=toULength;
998
0
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
999
0
                        break;
1000
0
                    }
1001
0
                    c=(c<<6)+b;
1002
0
                }
1003
0
            } else {
1004
                /* partial-sequence target overflow: fall back to the pivoting implementation */
1005
0
                *pErrorCode=U_USING_DEFAULT_WARNING;
1006
0
            }
1007
0
        }
1008
0
    }
1009
1010
    /* write back the updated pointers */
1011
0
    pToUArgs->source=(char *)source;
1012
0
    pFromUArgs->target=(char *)target;
1013
0
}
1014
1015
U_CDECL_END
1016
1017
/* UTF-8 converter data ----------------------------------------------------- */
1018
1019
/*
 * Function table for the UTF-8 converter.
 * The initializer is positional: what each slot means (including the NULL
 * ones) is fixed by the UConverterImpl declaration, which is not shown in
 * this file. The non-NULL entries name the UTF-8 to-Unicode, from-Unicode and
 * getNextUChar functions and ucnv_getNonSurrogateUnicodeSet; the last two
 * slots both point at ucnv_UTF8FromUTF8().
 */
static const UConverterImpl _UTF8Impl={
1020
    UCNV_UTF8,
1021
1022
    NULL,
1023
    NULL,
1024
1025
    NULL,
1026
    NULL,
1027
    NULL,
1028
1029
    ucnv_toUnicode_UTF8,
1030
    ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1031
    ucnv_fromUnicode_UTF8,
1032
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1033
    ucnv_getNextUChar_UTF8,
1034
1035
    NULL,
1036
    NULL,
1037
    NULL,
1038
    NULL,
1039
    ucnv_getNonSurrogateUnicodeSet,
1040
1041
    ucnv_UTF8FromUTF8,
1042
    ucnv_UTF8FromUTF8
1043
};
1044
1045
/* CCSID 1208 refers to UTF-8 for any version of Unicode */
1046
/*
 * Static description of the UTF-8 converter (positional initializer; the
 * field order is defined by UConverterStaticData, declared elsewhere).
 * Visible values: name "UTF-8", CCSID 1208, and the pair 1, 3 described by
 * the inline comment below. The bytes EF BF BD are the UTF-8 encoding of
 * U+FFFD; presumably they are the substitution sequence and the following 3
 * is its length - confirm against the struct declaration.
 */
static const UConverterStaticData _UTF8StaticData={
1047
    sizeof(UConverterStaticData),
1048
    "UTF-8",
1049
    1208, UCNV_IBM, UCNV_UTF8,
1050
    1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
1051
    { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1052
    0,
1053
    0,
1054
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1055
};
1056
1057
1058
/* Shared-data object for UTF-8 (external linkage), built from _UTF8StaticData and _UTF8Impl. */
const UConverterSharedData _UTF8Data=
1059
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
1060
1061
/* CESU-8 converter data ---------------------------------------------------- */
1062
1063
/*
 * Function table for the CESU-8 converter (positional initializer; slot
 * meanings are fixed by the UConverterImpl declaration, not shown here).
 * It names the same to-Unicode and from-Unicode functions as _UTF8Impl.
 * Compared with _UTF8Impl, the slot that holds ucnv_getNextUChar_UTF8 there
 * is NULL here, the Unicode-set function is ucnv_getCompleteUnicodeSet, and
 * the last two slots are NULL instead of ucnv_UTF8FromUTF8().
 */
static const UConverterImpl _CESU8Impl={
1064
    UCNV_CESU8,
1065
1066
    NULL,
1067
    NULL,
1068
1069
    NULL,
1070
    NULL,
1071
    NULL,
1072
1073
    ucnv_toUnicode_UTF8,
1074
    ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1075
    ucnv_fromUnicode_UTF8,
1076
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1077
    NULL,
1078
1079
    NULL,
1080
    NULL,
1081
    NULL,
1082
    NULL,
1083
    ucnv_getCompleteUnicodeSet,
1084
1085
    NULL,
1086
    NULL
1087
};
1088
1089
/*
 * Static description of the CESU-8 converter (positional initializer; the
 * field order is defined by UConverterStaticData, declared elsewhere).
 * Visible values: name "CESU-8", CCSID 9400, platform UCNV_UNKNOWN, and the
 * same 1, 3 pair and EF BF BD byte sequence (UTF-8 for U+FFFD) as the UTF-8
 * static data.
 */
static const UConverterStaticData _CESU8StaticData={
1090
    sizeof(UConverterStaticData),
1091
    "CESU-8",
1092
    9400, /* CCSID for CESU-8 */
1093
    UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
1094
    { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1095
    0,
1096
    0,
1097
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1098
};
1099
1100
1101
/* Shared-data object for CESU-8 (external linkage), built from _CESU8StaticData and _CESU8Impl. */
const UConverterSharedData _CESU8Data=
1102
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
1103
1104
#endif