Coverage Report

Created: 2024-04-24 06:23

/src/icu/source/common/ucnv_u16.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*  
4
**********************************************************************
5
*   Copyright (C) 2002-2015, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
**********************************************************************
8
*   file name:  ucnv_u16.c
9
*   encoding:   UTF-8
10
*   tab size:   8 (not used)
11
*   indentation:4
12
*
13
*   created on: 2002jul01
14
*   created by: Markus W. Scherer
15
*
16
*   UTF-16 converter implementation. Used to be in ucnv_utf.c.
17
*/
18
19
#include "unicode/utypes.h"
20
21
#if !UCONFIG_NO_CONVERSION
22
23
#include "unicode/ucnv.h"
24
#include "unicode/uversion.h"
25
#include "ucnv_bld.h"
26
#include "ucnv_cnv.h"
27
#include "cmemory.h"
28
29
/*
 * Value stored in UConverter.fromUnicodeStatus to request that the next
 * fromUnicode call emits a byte order mark before any converted text.
 * The fromUnicode functions in this file test for it and clear the status
 * to 0 once the BOM has been written; _UTF16BEReset() sets it for version 1.
 */
enum {
30
    UCNV_NEED_TO_WRITE_BOM=1
31
};
32
33
U_CDECL_BEGIN
34
/*
35
 * The UTF-16 toUnicode implementation is also used for the Java-specific
36
 * "with BOM" variants of UTF-16BE and UTF-16LE.
37
 */
38
static void  U_CALLCONV
39
_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
40
                           UErrorCode *pErrorCode);
41
42
/* UTF-16BE ----------------------------------------------------------------- */
43
44
/*
 * Alias _UTF16PEFromUnicodeWithOffsets to the big-endian or little-endian
 * fromUnicode implementation, selected at compile time by U_IS_BIG_ENDIAN.
 * NOTE(review): "PE" presumably means "platform endian" and the alias is
 * presumably consumed further down the file - confirm against its users.
 */
#if U_IS_BIG_ENDIAN
45
#   define _UTF16PEFromUnicodeWithOffsets   _UTF16BEFromUnicodeWithOffsets
46
#else
47
#   define _UTF16PEFromUnicodeWithOffsets   _UTF16LEFromUnicodeWithOffsets
48
#endif
49
50
51
static void  U_CALLCONV
52
_UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
53
0
                               UErrorCode *pErrorCode) {
54
0
    UConverter *cnv;
55
0
    const UChar *source;
56
0
    char *target;
57
0
    int32_t *offsets;
58
59
0
    uint32_t targetCapacity, length, sourceIndex;
60
0
    UChar c, trail;
61
0
    char overflow[4];
62
63
0
    source=pArgs->source;
64
0
    length=(int32_t)(pArgs->sourceLimit-source);
65
0
    if(length<=0) {
66
        /* no input, nothing to do */
67
0
        return;
68
0
    }
69
70
0
    cnv=pArgs->converter;
71
72
    /* write the BOM if necessary */
73
0
    if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
74
0
        static const char bom[]={ (char)0xfeu, (char)0xffu };
75
0
        ucnv_fromUWriteBytes(cnv,
76
0
                             bom, 2,
77
0
                             &pArgs->target, pArgs->targetLimit,
78
0
                             &pArgs->offsets, -1,
79
0
                             pErrorCode);
80
0
        cnv->fromUnicodeStatus=0;
81
0
    }
82
83
0
    target=pArgs->target;
84
0
    if(target >= pArgs->targetLimit) {
85
0
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
86
0
        return;
87
0
    }
88
89
0
    targetCapacity=(uint32_t)(pArgs->targetLimit-target);
90
0
    offsets=pArgs->offsets;
91
0
    sourceIndex=0;
92
93
    /* c!=0 indicates in several places outside the main loops that a surrogate was found */
94
95
0
    if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
96
        /* the last buffer ended with a lead surrogate, output the surrogate pair */
97
0
        ++source;
98
0
        --length;
99
0
        target[0]=(uint8_t)(c>>8);
100
0
        target[1]=(uint8_t)c;
101
0
        target[2]=(uint8_t)(trail>>8);
102
0
        target[3]=(uint8_t)trail;
103
0
        target+=4;
104
0
        targetCapacity-=4;
105
0
        if(offsets!=NULL) {
106
0
            *offsets++=-1;
107
0
            *offsets++=-1;
108
0
            *offsets++=-1;
109
0
            *offsets++=-1;
110
0
        }
111
0
        sourceIndex=1;
112
0
        cnv->fromUChar32=c=0;
113
0
    }
114
115
0
    if(c==0) {
116
        /* copy an even number of bytes for complete UChars */
117
0
        uint32_t count=2*length;
118
0
        if(count>targetCapacity) {
119
0
            count=targetCapacity&~1;
120
0
        }
121
        /* count is even */
122
0
        targetCapacity-=count;
123
0
        count>>=1;
124
0
        length-=count;
125
126
0
        if(offsets==NULL) {
127
0
            while(count>0) {
128
0
                c=*source++;
129
0
                if(U16_IS_SINGLE(c)) {
130
0
                    target[0]=(uint8_t)(c>>8);
131
0
                    target[1]=(uint8_t)c;
132
0
                    target+=2;
133
0
                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
134
0
                    ++source;
135
0
                    --count;
136
0
                    target[0]=(uint8_t)(c>>8);
137
0
                    target[1]=(uint8_t)c;
138
0
                    target[2]=(uint8_t)(trail>>8);
139
0
                    target[3]=(uint8_t)trail;
140
0
                    target+=4;
141
0
                } else {
142
0
                    break;
143
0
                }
144
0
                --count;
145
0
            }
146
0
        } else {
147
0
            while(count>0) {
148
0
                c=*source++;
149
0
                if(U16_IS_SINGLE(c)) {
150
0
                    target[0]=(uint8_t)(c>>8);
151
0
                    target[1]=(uint8_t)c;
152
0
                    target+=2;
153
0
                    *offsets++=sourceIndex;
154
0
                    *offsets++=sourceIndex++;
155
0
                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
156
0
                    ++source;
157
0
                    --count;
158
0
                    target[0]=(uint8_t)(c>>8);
159
0
                    target[1]=(uint8_t)c;
160
0
                    target[2]=(uint8_t)(trail>>8);
161
0
                    target[3]=(uint8_t)trail;
162
0
                    target+=4;
163
0
                    *offsets++=sourceIndex;
164
0
                    *offsets++=sourceIndex;
165
0
                    *offsets++=sourceIndex;
166
0
                    *offsets++=sourceIndex;
167
0
                    sourceIndex+=2;
168
0
                } else {
169
0
                    break;
170
0
                }
171
0
                --count;
172
0
            }
173
0
        }
174
175
0
        if(count==0) {
176
            /* done with the loop for complete UChars */
177
0
            if(length>0 && targetCapacity>0) {
178
                /*
179
                 * there is more input and some target capacity -
180
                 * it must be targetCapacity==1 because otherwise
181
                 * the above would have copied more;
182
                 * prepare for overflow output
183
                 */
184
0
                if(U16_IS_SINGLE(c=*source++)) {
185
0
                    overflow[0]=(char)(c>>8);
186
0
                    overflow[1]=(char)c;
187
0
                    length=2; /* 2 bytes to output */
188
0
                    c=0;
189
                /* } else { keep c for surrogate handling, length will be set there */
190
0
                }
191
0
            } else {
192
0
                length=0;
193
0
                c=0;
194
0
            }
195
0
        } else {
196
            /* keep c for surrogate handling, length will be set there */
197
0
            targetCapacity+=2*count;
198
0
        }
199
0
    } else {
200
0
        length=0; /* from here on, length counts the bytes in overflow[] */
201
0
    }
202
    
203
0
    if(c!=0) {
204
        /*
205
         * c is a surrogate, and
206
         * - source or target too short
207
         * - or the surrogate is unmatched
208
         */
209
0
        length=0;
210
0
        if(U16_IS_SURROGATE_LEAD(c)) {
211
0
            if(source<pArgs->sourceLimit) {
212
0
                if(U16_IS_TRAIL(trail=*source)) {
213
                    /* output the surrogate pair, will overflow (see conditions comment above) */
214
0
                    ++source;
215
0
                    overflow[0]=(char)(c>>8);
216
0
                    overflow[1]=(char)c;
217
0
                    overflow[2]=(char)(trail>>8);
218
0
                    overflow[3]=(char)trail;
219
0
                    length=4; /* 4 bytes to output */
220
0
                    c=0;
221
0
                } else {
222
                    /* unmatched lead surrogate */
223
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
224
0
                }
225
0
            } else {
226
                /* see if the trail surrogate is in the next buffer */
227
0
            }
228
0
        } else {
229
            /* unmatched trail surrogate */
230
0
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
231
0
        }
232
0
        cnv->fromUChar32=c;
233
0
    }
234
235
0
    if(length>0) {
236
        /* output length bytes with overflow (length>targetCapacity>0) */
237
0
        ucnv_fromUWriteBytes(cnv,
238
0
                             overflow, length,
239
0
                             (char **)&target, pArgs->targetLimit,
240
0
                             &offsets, sourceIndex,
241
0
                             pErrorCode);
242
0
        targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
243
0
    }
244
245
0
    if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
246
0
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
247
0
    }
248
249
    /* write back the updated pointers */
250
0
    pArgs->source=source;
251
0
    pArgs->target=(char *)target;
252
0
    pArgs->offsets=offsets;
253
0
}
254
255
static void  U_CALLCONV
256
_UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
257
0
                             UErrorCode *pErrorCode) {
258
0
    UConverter *cnv;
259
0
    const uint8_t *source;
260
0
    UChar *target;
261
0
    int32_t *offsets;
262
263
0
    uint32_t targetCapacity, length, count, sourceIndex;
264
0
    UChar c, trail;
265
266
0
    if(pArgs->converter->mode<8) {
267
0
        _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
268
0
        return;
269
0
    }
270
271
0
    cnv=pArgs->converter;
272
0
    source=(const uint8_t *)pArgs->source;
273
0
    length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
274
0
    if(length<=0 && cnv->toUnicodeStatus==0) {
275
        /* no input, nothing to do */
276
0
        return;
277
0
    }
278
279
0
    target=pArgs->target;
280
0
    if(target >= pArgs->targetLimit) {
281
0
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
282
0
        return;
283
0
    }
284
285
0
    targetCapacity=(uint32_t)(pArgs->targetLimit-target);
286
0
    offsets=pArgs->offsets;
287
0
    sourceIndex=0;
288
0
    c=0;
289
290
    /* complete a partial UChar or pair from the last call */
291
0
    if(cnv->toUnicodeStatus!=0) {
292
        /*
293
         * special case: single byte from a previous buffer,
294
         * where the byte turned out not to belong to a trail surrogate
295
         * and the preceding, unmatched lead surrogate was put into toUBytes[]
296
         * for error handling
297
         */
298
0
        cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
299
0
        cnv->toULength=1;
300
0
        cnv->toUnicodeStatus=0;
301
0
    }
302
0
    if((count=cnv->toULength)!=0) {
303
0
        uint8_t *p=cnv->toUBytes;
304
0
        do {
305
0
            p[count++]=*source++;
306
0
            ++sourceIndex;
307
0
            --length;
308
0
            if(count==2) {
309
0
                c=((UChar)p[0]<<8)|p[1];
310
0
                if(U16_IS_SINGLE(c)) {
311
                    /* output the BMP code point */
312
0
                    *target++=c;
313
0
                    if(offsets!=NULL) {
314
0
                        *offsets++=-1;
315
0
                    }
316
0
                    --targetCapacity;
317
0
                    count=0;
318
0
                    c=0;
319
0
                    break;
320
0
                } else if(U16_IS_SURROGATE_LEAD(c)) {
321
                    /* continue collecting bytes for the trail surrogate */
322
0
                    c=0; /* avoid unnecessary surrogate handling below */
323
0
                } else {
324
                    /* fall through to error handling for an unmatched trail surrogate */
325
0
                    break;
326
0
                }
327
0
            } else if(count==4) {
328
0
                c=((UChar)p[0]<<8)|p[1];
329
0
                trail=((UChar)p[2]<<8)|p[3];
330
0
                if(U16_IS_TRAIL(trail)) {
331
                    /* output the surrogate pair */
332
0
                    *target++=c;
333
0
                    if(targetCapacity>=2) {
334
0
                        *target++=trail;
335
0
                        if(offsets!=NULL) {
336
0
                            *offsets++=-1;
337
0
                            *offsets++=-1;
338
0
                        }
339
0
                        targetCapacity-=2;
340
0
                    } else /* targetCapacity==1 */ {
341
0
                        targetCapacity=0;
342
0
                        cnv->UCharErrorBuffer[0]=trail;
343
0
                        cnv->UCharErrorBufferLength=1;
344
0
                        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
345
0
                    }
346
0
                    count=0;
347
0
                    c=0;
348
0
                    break;
349
0
                } else {
350
                    /* unmatched lead surrogate, handle here for consistent toUBytes[] */
351
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
352
353
                    /* back out reading the code unit after it */
354
0
                    if(((const uint8_t *)pArgs->source-source)>=2) {
355
0
                        source-=2;
356
0
                    } else {
357
                        /*
358
                         * if the trail unit's first byte was in a previous buffer, then
359
                         * we need to put it into a special place because toUBytes[] will be
360
                         * used for the lead unit's bytes
361
                         */
362
0
                        cnv->toUnicodeStatus=0x100|p[2];
363
0
                        --source;
364
0
                    }
365
0
                    cnv->toULength=2;
366
367
                    /* write back the updated pointers */
368
0
                    pArgs->source=(const char *)source;
369
0
                    pArgs->target=target;
370
0
                    pArgs->offsets=offsets;
371
0
                    return;
372
0
                }
373
0
            }
374
0
        } while(length>0);
375
0
        cnv->toULength=(int8_t)count;
376
0
    }
377
378
    /* copy an even number of bytes for complete UChars */
379
0
    count=2*targetCapacity;
380
0
    if(count>length) {
381
0
        count=length&~1;
382
0
    }
383
0
    if(c==0 && count>0) {
384
0
        length-=count;
385
0
        count>>=1;
386
0
        targetCapacity-=count;
387
0
        if(offsets==NULL) {
388
0
            do {
389
0
                c=((UChar)source[0]<<8)|source[1];
390
0
                source+=2;
391
0
                if(U16_IS_SINGLE(c)) {
392
0
                    *target++=c;
393
0
                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
394
0
                          U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
395
0
                ) {
396
0
                    source+=2;
397
0
                    --count;
398
0
                    *target++=c;
399
0
                    *target++=trail;
400
0
                } else {
401
0
                    break;
402
0
                }
403
0
            } while(--count>0);
404
0
        } else {
405
0
            do {
406
0
                c=((UChar)source[0]<<8)|source[1];
407
0
                source+=2;
408
0
                if(U16_IS_SINGLE(c)) {
409
0
                    *target++=c;
410
0
                    *offsets++=sourceIndex;
411
0
                    sourceIndex+=2;
412
0
                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
413
0
                          U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
414
0
                ) {
415
0
                    source+=2;
416
0
                    --count;
417
0
                    *target++=c;
418
0
                    *target++=trail;
419
0
                    *offsets++=sourceIndex;
420
0
                    *offsets++=sourceIndex;
421
0
                    sourceIndex+=4;
422
0
                } else {
423
0
                    break;
424
0
                }
425
0
            } while(--count>0);
426
0
        }
427
428
0
        if(count==0) {
429
            /* done with the loop for complete UChars */
430
0
            c=0;
431
0
        } else {
432
            /* keep c for surrogate handling, trail will be set there */
433
0
            length+=2*(count-1); /* one more byte pair was consumed than count decremented */
434
0
            targetCapacity+=count;
435
0
        }
436
0
    }
437
438
0
    if(c!=0) {
439
        /*
440
         * c is a surrogate, and
441
         * - source or target too short
442
         * - or the surrogate is unmatched
443
         */
444
0
        cnv->toUBytes[0]=(uint8_t)(c>>8);
445
0
        cnv->toUBytes[1]=(uint8_t)c;
446
0
        cnv->toULength=2;
447
448
0
        if(U16_IS_SURROGATE_LEAD(c)) {
449
0
            if(length>=2) {
450
0
                if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) {
451
                    /* output the surrogate pair, will overflow (see conditions comment above) */
452
0
                    source+=2;
453
0
                    length-=2;
454
0
                    *target++=c;
455
0
                    if(offsets!=NULL) {
456
0
                        *offsets++=sourceIndex;
457
0
                    }
458
0
                    cnv->UCharErrorBuffer[0]=trail;
459
0
                    cnv->UCharErrorBufferLength=1;
460
0
                    cnv->toULength=0;
461
0
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
462
0
                } else {
463
                    /* unmatched lead surrogate */
464
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
465
0
                }
466
0
            } else {
467
                /* see if the trail surrogate is in the next buffer */
468
0
            }
469
0
        } else {
470
            /* unmatched trail surrogate */
471
0
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
472
0
        }
473
0
    }
474
475
0
    if(U_SUCCESS(*pErrorCode)) {
476
        /* check for a remaining source byte */
477
0
        if(length>0) {
478
0
            if(targetCapacity==0) {
479
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
480
0
            } else {
481
                /* it must be length==1 because otherwise the above would have copied more */
482
0
                cnv->toUBytes[cnv->toULength++]=*source++;
483
0
            }
484
0
        }
485
0
    }
486
487
    /* write back the updated pointers */
488
0
    pArgs->source=(const char *)source;
489
0
    pArgs->target=target;
490
0
    pArgs->offsets=offsets;
491
0
}
492
493
/*
 * Read one code point from the UTF-16BE bytes at pArgs->source.
 *
 * Returns UCNV_GET_NEXT_UCHAR_USE_TO_U without consuming anything while
 * converter->mode<8. Otherwise returns the code point and advances
 * pArgs->source, or returns 0xffff with *err set to:
 * - U_INDEX_OUTOFBOUNDS_ERROR: no input at all;
 * - U_TRUNCATED_CHAR_FOUND: fewer bytes than a complete code unit or
 *   surrogate pair; the available bytes are copied to converter->toUBytes[];
 * - U_ILLEGAL_CHAR_FOUND: unmatched surrogate; its two bytes are copied to
 *   converter->toUBytes[].
 *
 * Remaining-length checks use pointer differences (sourceLimit-s) rather than
 * s+2, so that no pointer beyond one-past-the-end of the input is formed.
 */
static UChar32  U_CALLCONV
_UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
    const uint8_t *s, *sourceLimit;
    UChar32 c;

    if(pArgs->converter->mode<8) {
        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
    }

    s=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;

    if(s>=sourceLimit) {
        /* no input */
        *err=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0xffff;
    }

    if((sourceLimit-s)<2) {
        /* only one byte: truncated UChar */
        pArgs->converter->toUBytes[0]=*s++;
        pArgs->converter->toULength=1;
        pArgs->source=(const char *)s;
        *err = U_TRUNCATED_CHAR_FOUND;
        return 0xffff;
    }

    /* get one UChar */
    c=((UChar32)*s<<8)|s[1];
    s+=2;

    /* check for a surrogate pair */
    if(U_IS_SURROGATE(c)) {
        if(U16_IS_SURROGATE_LEAD(c)) {
            if((sourceLimit-s)>=2) {
                UChar trail;

                /* get a second UChar and see if it is a trail surrogate */
                trail=((UChar)*s<<8)|s[1];
                if(U16_IS_TRAIL(trail)) {
                    c=U16_GET_SUPPLEMENTARY(c, trail);
                    s+=2;
                } else {
                    /* unmatched lead surrogate */
                    c=-2;
                }
            } else {
                /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
                uint8_t *bytes=pArgs->converter->toUBytes;
                s-=2;
                pArgs->converter->toULength=(int8_t)(sourceLimit-s);
                do {
                    *bytes++=*s++;
                } while(s<sourceLimit);

                c=0xffff;
                *err=U_TRUNCATED_CHAR_FOUND;
            }
        } else {
            /* unmatched trail surrogate */
            c=-2;
        }

        /* c<0 is the internal marker for "unmatched surrogate" set above */
        if(c<0) {
            /* write the unmatched surrogate */
            uint8_t *bytes=pArgs->converter->toUBytes;
            pArgs->converter->toULength=2;
            *bytes=*(s-2);
            bytes[1]=*(s-1);

            c=0xffff;
            *err=U_ILLEGAL_CHAR_FOUND;
        }
    }

    pArgs->source=(const char *)s;
    return c;
}
571
572
static void  U_CALLCONV
573
0
_UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) {
574
0
    if(choice<=UCNV_RESET_TO_UNICODE) {
575
        /* reset toUnicode state */
576
0
        if(UCNV_GET_VERSION(cnv)==0) {
577
0
            cnv->mode=8; /* no BOM handling */
578
0
        } else {
579
0
            cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */
580
0
        }
581
0
    }
582
0
    if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
583
        /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */
584
0
        cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
585
0
    }
586
0
}
587
588
static void  U_CALLCONV
589
_UTF16BEOpen(UConverter *cnv,
590
             UConverterLoadArgs *pArgs,
591
0
             UErrorCode *pErrorCode) {
592
0
    (void)pArgs;
593
0
    if(UCNV_GET_VERSION(cnv)<=1) {
594
0
        _UTF16BEReset(cnv, UCNV_RESET_BOTH);
595
0
    } else {
596
0
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
597
0
    }
598
0
}
599
600
static const char *  U_CALLCONV
601
0
_UTF16BEGetName(const UConverter *cnv) {
602
0
    if(UCNV_GET_VERSION(cnv)==0) {
603
0
        return "UTF-16BE";
604
0
    } else {
605
0
        return "UTF-16BE,version=1";
606
0
    }
607
0
}
608
U_CDECL_END
609
610
/*
 * Function table for the UTF-16BE converter.
 * The initializer is positional; the same function is installed twice for
 * toUnicode and twice for fromUnicode, and unused slots are NULL.
 * NOTE(review): slot meanings follow the UConverterImpl declaration, which is
 * not visible here (presumably ucnv_cnv.h) - confirm the field order there
 * before adding or moving entries.
 */
static const UConverterImpl _UTF16BEImpl={
611
    UCNV_UTF16_BigEndian,
612
613
    NULL,
614
    NULL,
615
616
    _UTF16BEOpen,
617
    NULL,
618
    _UTF16BEReset,
619
620
    _UTF16BEToUnicodeWithOffsets,
621
    _UTF16BEToUnicodeWithOffsets,
622
    _UTF16BEFromUnicodeWithOffsets,
623
    _UTF16BEFromUnicodeWithOffsets,
624
    _UTF16BEGetNextUChar,
625
626
    NULL,
627
    _UTF16BEGetName,
628
    NULL,
629
    NULL,
630
    ucnv_getNonSurrogateUnicodeSet,
631
632
    NULL,
633
    NULL
634
};
635
636
/*
 * Constant descriptive data for the UTF-16BE converter (name "UTF-16BE").
 * NOTE(review): the initializer is positional and UConverterStaticData is
 * declared elsewhere. Presumably 1200 is the code page number, "2, 2" the
 * min/max bytes per character and { 0xff, 0xfd } with length 2 the
 * substitution character U+FFFD in big-endian order - confirm against the
 * struct declaration.
 */
static const UConverterStaticData _UTF16BEStaticData={
637
    sizeof(UConverterStaticData),
638
    "UTF-16BE",
639
    1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
640
    { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
641
    0,
642
    0,
643
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
644
};
645
646
647
/*
 * Shared data object for UTF-16BE with external linkage; it ties together
 * the static data and the function table defined above.
 */
const UConverterSharedData _UTF16BEData=
648
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16BEStaticData, &_UTF16BEImpl);
649
650
/* UTF-16LE ----------------------------------------------------------------- */
651
U_CDECL_BEGIN
652
static void  U_CALLCONV
653
_UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
654
0
                               UErrorCode *pErrorCode) {
655
0
    UConverter *cnv;
656
0
    const UChar *source;
657
0
    char *target;
658
0
    int32_t *offsets;
659
660
0
    uint32_t targetCapacity, length, sourceIndex;
661
0
    UChar c, trail;
662
0
    char overflow[4];
663
664
0
    source=pArgs->source;
665
0
    length=(int32_t)(pArgs->sourceLimit-source);
666
0
    if(length<=0) {
667
        /* no input, nothing to do */
668
0
        return;
669
0
    }
670
671
0
    cnv=pArgs->converter;
672
673
    /* write the BOM if necessary */
674
0
    if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
675
0
        static const char bom[]={ (char)0xffu, (char)0xfeu };
676
0
        ucnv_fromUWriteBytes(cnv,
677
0
                             bom, 2,
678
0
                             &pArgs->target, pArgs->targetLimit,
679
0
                             &pArgs->offsets, -1,
680
0
                             pErrorCode);
681
0
        cnv->fromUnicodeStatus=0;
682
0
    }
683
684
0
    target=pArgs->target;
685
0
    if(target >= pArgs->targetLimit) {
686
0
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
687
0
        return;
688
0
    }
689
690
0
    targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
691
0
    offsets=pArgs->offsets;
692
0
    sourceIndex=0;
693
694
    /* c!=0 indicates in several places outside the main loops that a surrogate was found */
695
696
0
    if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
697
        /* the last buffer ended with a lead surrogate, output the surrogate pair */
698
0
        ++source;
699
0
        --length;
700
0
        target[0]=(uint8_t)c;
701
0
        target[1]=(uint8_t)(c>>8);
702
0
        target[2]=(uint8_t)trail;
703
0
        target[3]=(uint8_t)(trail>>8);
704
0
        target+=4;
705
0
        targetCapacity-=4;
706
0
        if(offsets!=NULL) {
707
0
            *offsets++=-1;
708
0
            *offsets++=-1;
709
0
            *offsets++=-1;
710
0
            *offsets++=-1;
711
0
        }
712
0
        sourceIndex=1;
713
0
        cnv->fromUChar32=c=0;
714
0
    }
715
716
0
    if(c==0) {
717
        /* copy an even number of bytes for complete UChars */
718
0
        uint32_t count=2*length;
719
0
        if(count>targetCapacity) {
720
0
            count=targetCapacity&~1;
721
0
        }
722
        /* count is even */
723
0
        targetCapacity-=count;
724
0
        count>>=1;
725
0
        length-=count;
726
727
0
        if(offsets==NULL) {
728
0
            while(count>0) {
729
0
                c=*source++;
730
0
                if(U16_IS_SINGLE(c)) {
731
0
                    target[0]=(uint8_t)c;
732
0
                    target[1]=(uint8_t)(c>>8);
733
0
                    target+=2;
734
0
                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
735
0
                    ++source;
736
0
                    --count;
737
0
                    target[0]=(uint8_t)c;
738
0
                    target[1]=(uint8_t)(c>>8);
739
0
                    target[2]=(uint8_t)trail;
740
0
                    target[3]=(uint8_t)(trail>>8);
741
0
                    target+=4;
742
0
                } else {
743
0
                    break;
744
0
                }
745
0
                --count;
746
0
            }
747
0
        } else {
748
0
            while(count>0) {
749
0
                c=*source++;
750
0
                if(U16_IS_SINGLE(c)) {
751
0
                    target[0]=(uint8_t)c;
752
0
                    target[1]=(uint8_t)(c>>8);
753
0
                    target+=2;
754
0
                    *offsets++=sourceIndex;
755
0
                    *offsets++=sourceIndex++;
756
0
                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
757
0
                    ++source;
758
0
                    --count;
759
0
                    target[0]=(uint8_t)c;
760
0
                    target[1]=(uint8_t)(c>>8);
761
0
                    target[2]=(uint8_t)trail;
762
0
                    target[3]=(uint8_t)(trail>>8);
763
0
                    target+=4;
764
0
                    *offsets++=sourceIndex;
765
0
                    *offsets++=sourceIndex;
766
0
                    *offsets++=sourceIndex;
767
0
                    *offsets++=sourceIndex;
768
0
                    sourceIndex+=2;
769
0
                } else {
770
0
                    break;
771
0
                }
772
0
                --count;
773
0
            }
774
0
        }
775
776
0
        if(count==0) {
777
            /* done with the loop for complete UChars */
778
0
            if(length>0 && targetCapacity>0) {
779
                /*
780
                 * there is more input and some target capacity -
781
                 * it must be targetCapacity==1 because otherwise
782
                 * the above would have copied more;
783
                 * prepare for overflow output
784
                 */
785
0
                if(U16_IS_SINGLE(c=*source++)) {
786
0
                    overflow[0]=(char)c;
787
0
                    overflow[1]=(char)(c>>8);
788
0
                    length=2; /* 2 bytes to output */
789
0
                    c=0;
790
                /* } else { keep c for surrogate handling, length will be set there */
791
0
                }
792
0
            } else {
793
0
                length=0;
794
0
                c=0;
795
0
            }
796
0
        } else {
797
            /* keep c for surrogate handling, length will be set there */
798
0
            targetCapacity+=2*count;
799
0
        }
800
0
    } else {
801
0
        length=0; /* from here on, length counts the bytes in overflow[] */
802
0
    }
803
    
804
0
    if(c!=0) {
805
        /*
806
         * c is a surrogate, and
807
         * - source or target too short
808
         * - or the surrogate is unmatched
809
         */
810
0
        length=0;
811
0
        if(U16_IS_SURROGATE_LEAD(c)) {
812
0
            if(source<pArgs->sourceLimit) {
813
0
                if(U16_IS_TRAIL(trail=*source)) {
814
                    /* output the surrogate pair, will overflow (see conditions comment above) */
815
0
                    ++source;
816
0
                    overflow[0]=(char)c;
817
0
                    overflow[1]=(char)(c>>8);
818
0
                    overflow[2]=(char)trail;
819
0
                    overflow[3]=(char)(trail>>8);
820
0
                    length=4; /* 4 bytes to output */
821
0
                    c=0;
822
0
                } else {
823
                    /* unmatched lead surrogate */
824
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
825
0
                }
826
0
            } else {
827
                /* see if the trail surrogate is in the next buffer */
828
0
            }
829
0
        } else {
830
            /* unmatched trail surrogate */
831
0
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
832
0
        }
833
0
        cnv->fromUChar32=c;
834
0
    }
835
836
0
    if(length>0) {
837
        /* output length bytes with overflow (length>targetCapacity>0) */
838
0
        ucnv_fromUWriteBytes(cnv,
839
0
                             overflow, length,
840
0
                             &target, pArgs->targetLimit,
841
0
                             &offsets, sourceIndex,
842
0
                             pErrorCode);
843
0
        targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
844
0
    }
845
846
0
    if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
847
0
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
848
0
    }
849
850
    /* write back the updated pointers */
851
0
    pArgs->source=source;
852
0
    pArgs->target=target;
853
0
    pArgs->offsets=offsets;
854
0
}
855
856
static void  U_CALLCONV
857
_UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
858
0
                             UErrorCode *pErrorCode) {
859
0
    UConverter *cnv;
860
0
    const uint8_t *source;
861
0
    UChar *target;
862
0
    int32_t *offsets;
863
864
0
    uint32_t targetCapacity, length, count, sourceIndex;
865
0
    UChar c, trail;
866
867
0
    if(pArgs->converter->mode<8) {
868
0
        _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
869
0
        return;
870
0
    }
871
872
0
    cnv=pArgs->converter;
873
0
    source=(const uint8_t *)pArgs->source;
874
0
    length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
875
0
    if(length<=0 && cnv->toUnicodeStatus==0) {
876
        /* no input, nothing to do */
877
0
        return;
878
0
    }
879
880
0
    target=pArgs->target;
881
0
    if(target >= pArgs->targetLimit) {
882
0
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
883
0
        return;
884
0
    }
885
886
0
    targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
887
0
    offsets=pArgs->offsets;
888
0
    sourceIndex=0;
889
0
    c=0;
890
891
    /* complete a partial UChar or pair from the last call */
892
0
    if(cnv->toUnicodeStatus!=0) {
893
        /*
894
         * special case: single byte from a previous buffer,
895
         * where the byte turned out not to belong to a trail surrogate
896
         * and the preceding, unmatched lead surrogate was put into toUBytes[]
897
         * for error handling
898
         */
899
0
        cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
900
0
        cnv->toULength=1;
901
0
        cnv->toUnicodeStatus=0;
902
0
    }
903
0
    if((count=cnv->toULength)!=0) {
904
0
        uint8_t *p=cnv->toUBytes;
905
0
        do {
906
0
            p[count++]=*source++;
907
0
            ++sourceIndex;
908
0
            --length;
909
0
            if(count==2) {
910
0
                c=((UChar)p[1]<<8)|p[0];
911
0
                if(U16_IS_SINGLE(c)) {
912
                    /* output the BMP code point */
913
0
                    *target++=c;
914
0
                    if(offsets!=NULL) {
915
0
                        *offsets++=-1;
916
0
                    }
917
0
                    --targetCapacity;
918
0
                    count=0;
919
0
                    c=0;
920
0
                    break;
921
0
                } else if(U16_IS_SURROGATE_LEAD(c)) {
922
                    /* continue collecting bytes for the trail surrogate */
923
0
                    c=0; /* avoid unnecessary surrogate handling below */
924
0
                } else {
925
                    /* fall through to error handling for an unmatched trail surrogate */
926
0
                    break;
927
0
                }
928
0
            } else if(count==4) {
929
0
                c=((UChar)p[1]<<8)|p[0];
930
0
                trail=((UChar)p[3]<<8)|p[2];
931
0
                if(U16_IS_TRAIL(trail)) {
932
                    /* output the surrogate pair */
933
0
                    *target++=c;
934
0
                    if(targetCapacity>=2) {
935
0
                        *target++=trail;
936
0
                        if(offsets!=NULL) {
937
0
                            *offsets++=-1;
938
0
                            *offsets++=-1;
939
0
                        }
940
0
                        targetCapacity-=2;
941
0
                    } else /* targetCapacity==1 */ {
942
0
                        targetCapacity=0;
943
0
                        cnv->UCharErrorBuffer[0]=trail;
944
0
                        cnv->UCharErrorBufferLength=1;
945
0
                        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
946
0
                    }
947
0
                    count=0;
948
0
                    c=0;
949
0
                    break;
950
0
                } else {
951
                    /* unmatched lead surrogate, handle here for consistent toUBytes[] */
952
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
953
954
                    /* back out reading the code unit after it */
955
0
                    if(((const uint8_t *)pArgs->source-source)>=2) {
956
0
                        source-=2;
957
0
                    } else {
958
                        /*
959
                         * if the trail unit's first byte was in a previous buffer, then
960
                         * we need to put it into a special place because toUBytes[] will be
961
                         * used for the lead unit's bytes
962
                         */
963
0
                        cnv->toUnicodeStatus=0x100|p[2];
964
0
                        --source;
965
0
                    }
966
0
                    cnv->toULength=2;
967
968
                    /* write back the updated pointers */
969
0
                    pArgs->source=(const char *)source;
970
0
                    pArgs->target=target;
971
0
                    pArgs->offsets=offsets;
972
0
                    return;
973
0
                }
974
0
            }
975
0
        } while(length>0);
976
0
        cnv->toULength=(int8_t)count;
977
0
    }
978
979
    /* copy an even number of bytes for complete UChars */
980
0
    count=2*targetCapacity;
981
0
    if(count>length) {
982
0
        count=length&~1;
983
0
    }
984
0
    if(c==0 && count>0) {
985
0
        length-=count;
986
0
        count>>=1;
987
0
        targetCapacity-=count;
988
0
        if(offsets==NULL) {
989
0
            do {
990
0
                c=((UChar)source[1]<<8)|source[0];
991
0
                source+=2;
992
0
                if(U16_IS_SINGLE(c)) {
993
0
                    *target++=c;
994
0
                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
995
0
                          U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
996
0
                ) {
997
0
                    source+=2;
998
0
                    --count;
999
0
                    *target++=c;
1000
0
                    *target++=trail;
1001
0
                } else {
1002
0
                    break;
1003
0
                }
1004
0
            } while(--count>0);
1005
0
        } else {
1006
0
            do {
1007
0
                c=((UChar)source[1]<<8)|source[0];
1008
0
                source+=2;
1009
0
                if(U16_IS_SINGLE(c)) {
1010
0
                    *target++=c;
1011
0
                    *offsets++=sourceIndex;
1012
0
                    sourceIndex+=2;
1013
0
                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
1014
0
                          U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
1015
0
                ) {
1016
0
                    source+=2;
1017
0
                    --count;
1018
0
                    *target++=c;
1019
0
                    *target++=trail;
1020
0
                    *offsets++=sourceIndex;
1021
0
                    *offsets++=sourceIndex;
1022
0
                    sourceIndex+=4;
1023
0
                } else {
1024
0
                    break;
1025
0
                }
1026
0
            } while(--count>0);
1027
0
        }
1028
1029
0
        if(count==0) {
1030
            /* done with the loop for complete UChars */
1031
0
            c=0;
1032
0
        } else {
1033
            /* keep c for surrogate handling, trail will be set there */
1034
0
            length+=2*(count-1); /* one more byte pair was consumed than count decremented */
1035
0
            targetCapacity+=count;
1036
0
        }
1037
0
    }
1038
1039
0
    if(c!=0) {
1040
        /*
1041
         * c is a surrogate, and
1042
         * - source or target too short
1043
         * - or the surrogate is unmatched
1044
         */
1045
0
        cnv->toUBytes[0]=(uint8_t)c;
1046
0
        cnv->toUBytes[1]=(uint8_t)(c>>8);
1047
0
        cnv->toULength=2;
1048
1049
0
        if(U16_IS_SURROGATE_LEAD(c)) {
1050
0
            if(length>=2) {
1051
0
                if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) {
1052
                    /* output the surrogate pair, will overflow (see conditions comment above) */
1053
0
                    source+=2;
1054
0
                    length-=2;
1055
0
                    *target++=c;
1056
0
                    if(offsets!=NULL) {
1057
0
                        *offsets++=sourceIndex;
1058
0
                    }
1059
0
                    cnv->UCharErrorBuffer[0]=trail;
1060
0
                    cnv->UCharErrorBufferLength=1;
1061
0
                    cnv->toULength=0;
1062
0
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1063
0
                } else {
1064
                    /* unmatched lead surrogate */
1065
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1066
0
                }
1067
0
            } else {
1068
                /* see if the trail surrogate is in the next buffer */
1069
0
            }
1070
0
        } else {
1071
            /* unmatched trail surrogate */
1072
0
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1073
0
        }
1074
0
    }
1075
1076
0
    if(U_SUCCESS(*pErrorCode)) {
1077
        /* check for a remaining source byte */
1078
0
        if(length>0) {
1079
0
            if(targetCapacity==0) {
1080
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1081
0
            } else {
1082
                /* it must be length==1 because otherwise the above would have copied more */
1083
0
                cnv->toUBytes[cnv->toULength++]=*source++;
1084
0
            }
1085
0
        }
1086
0
    }
1087
1088
    /* write back the updated pointers */
1089
0
    pArgs->source=(const char *)source;
1090
0
    pArgs->target=target;
1091
0
    pArgs->offsets=offsets;
1092
0
}
1093
1094
/*
 * Read one code point from UTF-16LE input.
 *
 * Returns UCNV_GET_NEXT_UCHAR_USE_TO_U while the converter mode is below 8.
 * Otherwise returns the decoded code point and advances pArgs->source past it.
 * On failure 0xffff is returned and *err is set:
 *  - U_INDEX_OUTOFBOUNDS_ERROR: no input at all (source is not advanced)
 *  - U_TRUNCATED_CHAR_FOUND:    input ends inside a code unit or surrogate pair;
 *                               the available bytes are copied to toUBytes[]
 *  - U_ILLEGAL_CHAR_FOUND:      unmatched surrogate; its two bytes are copied
 *                               to toUBytes[]
 */
static UChar32  U_CALLCONV
_UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
    UConverter *cnv=pArgs->converter;
    const uint8_t *src, *limit;
    UChar32 cp;
    bool unmatched=false;

    if(cnv->mode<8) {
        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
    }

    src=(const uint8_t *)pArgs->source;
    limit=(const uint8_t *)pArgs->sourceLimit;

    if(src>=limit) {
        /* empty input */
        *err=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0xffff;
    }

    if(limit-src<2) {
        /* a lone byte cannot form a code unit */
        cnv->toUBytes[0]=*src++;
        cnv->toULength=1;
        pArgs->source=(const char *)src;
        *err=U_TRUNCATED_CHAR_FOUND;
        return 0xffff;
    }

    /* assemble the first code unit, low byte first */
    cp=((UChar32)src[1]<<8)|src[0];
    src+=2;

    if(!U_IS_SURROGATE(cp)) {
        /* plain BMP code point */
        pArgs->source=(const char *)src;
        return cp;
    }

    if(!U16_IS_SURROGATE_LEAD(cp)) {
        /* a trail surrogate without a preceding lead */
        unmatched=true;
    } else if(limit-src>=2) {
        /* look at the following code unit; consume it only if it is a trail */
        UChar second=((UChar)src[1]<<8)|src[0];
        if(U16_IS_TRAIL(second)) {
            cp=U16_GET_SUPPLEMENTARY(cp, second);
            src+=2;
        } else {
            unmatched=true;
        }
    } else {
        /* only 2 or 3 bytes left for a surrogate pair: save them all */
        uint8_t *saved=cnv->toUBytes;
        src-=2;
        cnv->toULength=(int8_t)(limit-src);
        while(src<limit) {
            *saved++=*src++;
        }

        cp=0xffff;
        *err=U_TRUNCATED_CHAR_FOUND;
    }

    if(unmatched) {
        /* report the two bytes of the offending surrogate just read */
        cnv->toULength=2;
        cnv->toUBytes[0]=*(src-2);
        cnv->toUBytes[1]=*(src-1);

        cp=0xffff;
        *err=U_ILLEGAL_CHAR_FOUND;
    }

    pArgs->source=(const char *)src;
    return cp;
}
1172
1173
static void  U_CALLCONV
1174
0
_UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) {
1175
0
    if(choice<=UCNV_RESET_TO_UNICODE) {
1176
        /* reset toUnicode state */
1177
0
        if(UCNV_GET_VERSION(cnv)==0) {
1178
0
            cnv->mode=8; /* no BOM handling */
1179
0
        } else {
1180
0
            cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */
1181
0
        }
1182
0
    }
1183
0
    if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
1184
        /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */
1185
0
        cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1186
0
    }
1187
0
}
1188
1189
static void  U_CALLCONV
1190
_UTF16LEOpen(UConverter *cnv,
1191
             UConverterLoadArgs *pArgs,
1192
0
             UErrorCode *pErrorCode) {
1193
0
    (void)pArgs;
1194
0
    if(UCNV_GET_VERSION(cnv)<=1) {
1195
0
        _UTF16LEReset(cnv, UCNV_RESET_BOTH);
1196
0
    } else {
1197
0
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1198
0
    }
1199
0
}
1200
1201
static const char *  U_CALLCONV
1202
0
_UTF16LEGetName(const UConverter *cnv) {
1203
0
    if(UCNV_GET_VERSION(cnv)==0) {
1204
0
        return "UTF-16LE";
1205
0
    } else {
1206
0
        return "UTF-16LE,version=1";
1207
0
    }
1208
0
}
1209
U_CDECL_END
1210
1211
/*
 * Function table for the UTF-16LE converter.
 * NOTE(review): the slot labels below assume the member order of
 * UConverterImpl as declared in ucnv_cnv.h - confirm against that header.
 */
static const UConverterImpl _UTF16LEImpl={
    UCNV_UTF16_LittleEndian,            /* converter type */

    NULL,                               /* load */
    NULL,                               /* unload */

    _UTF16LEOpen,                       /* open */
    NULL,                               /* close */
    _UTF16LEReset,                      /* reset */

    _UTF16LEToUnicodeWithOffsets,       /* toUnicode (same function with and without offsets) */
    _UTF16LEToUnicodeWithOffsets,       /* toUnicode with offsets */
    _UTF16LEFromUnicodeWithOffsets,     /* fromUnicode (same function with and without offsets) */
    _UTF16LEFromUnicodeWithOffsets,     /* fromUnicode with offsets */
    _UTF16LEGetNextUChar,               /* getNextUChar */

    NULL,                               /* getStarters */
    _UTF16LEGetName,                    /* getName */
    NULL,                               /* writeSub */
    NULL,                               /* safeClone */
    ucnv_getNonSurrogateUnicodeSet,     /* getUnicodeSet */

    NULL,                               /* toUTF8 */
    NULL                                /* fromUTF8 */
};
1236
1237
1238
static const UConverterStaticData _UTF16LEStaticData={
1239
    sizeof(UConverterStaticData),
1240
    "UTF-16LE",
1241
    1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
1242
    { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE,
1243
    0,
1244
    0,
1245
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1246
};
1247
1248
1249
const UConverterSharedData _UTF16LEData=
1250
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16LEStaticData, &_UTF16LEImpl);
1251
1252
/* UTF-16 (Detect BOM) ------------------------------------------------------ */
1253
1254
/*
1255
 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
1256
 * accordingly.
1257
 * This is a simpler version of the UTF-32 converter, with
1258
 * fewer states for shorter BOMs.
1259
 *
1260
 * State values:
1261
 * 0    initial state
1262
 * 1    saw first byte
1263
 * 2..5 -
1264
 * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
1265
 * 8    UTF-16BE mode
1266
 * 9    UTF-16LE mode
1267
 *
1268
 * During detection: state==number of initial bytes seen so far.
1269
 *
1270
 * On output, emit U+FEFF as the first code point.
1271
 *
1272
 * Variants:
1273
 * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
1274
 * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
1275
 *   UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
1276
 */
1277
U_CDECL_BEGIN
1278
static void  U_CALLCONV
1279
0
_UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
1280
0
    if(choice<=UCNV_RESET_TO_UNICODE) {
1281
        /* reset toUnicode: state=0 */
1282
0
        cnv->mode=0;
1283
0
    }
1284
0
    if(choice!=UCNV_RESET_TO_UNICODE) {
1285
        /* reset fromUnicode: prepare to output the UTF-16PE BOM */
1286
0
        cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
1287
0
    }
1288
0
}
1289
U_CDECL_END
1290
extern const UConverterSharedData _UTF16v2Data;
1291
U_CDECL_BEGIN
1292
static void U_CALLCONV
1293
_UTF16Open(UConverter *cnv,
1294
           UConverterLoadArgs *pArgs,
1295
0
           UErrorCode *pErrorCode) {
1296
0
    if(UCNV_GET_VERSION(cnv)<=2) {
1297
0
        if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {
1298
            /*
1299
             * Switch implementation, and switch the staticData that's different
1300
             * and was copied into the UConverter.
1301
             * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
1302
             * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
1303
             */
1304
0
            cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data;
1305
0
            uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
1306
0
        }
1307
0
        _UTF16Reset(cnv, UCNV_RESET_BOTH);
1308
0
    } else {
1309
0
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1310
0
    }
1311
0
}
1312
1313
static const char *  U_CALLCONV
1314
0
_UTF16GetName(const UConverter *cnv) {
1315
0
    if(UCNV_GET_VERSION(cnv)==0) {
1316
0
        return "UTF-16";
1317
0
    } else if(UCNV_GET_VERSION(cnv)==1) {
1318
0
        return "UTF-16,version=1";
1319
0
    } else {
1320
0
        return "UTF-16,version=2";
1321
0
    }
1322
0
}
1323
U_CDECL_END
1324
extern const UConverterSharedData _UTF16Data;
1325
1326
0
static inline bool IS_UTF16BE(const UConverter *cnv) {
1327
0
    return ((cnv)->sharedData == &_UTF16BEData);
1328
0
}
1329
1330
0
static inline bool IS_UTF16LE(const UConverter *cnv) {
1331
0
    return ((cnv)->sharedData == &_UTF16LEData);
1332
0
}
1333
1334
0
static inline bool IS_UTF16(const UConverter *cnv) {
1335
0
    return ((cnv)->sharedData==&_UTF16Data) || ((cnv)->sharedData == &_UTF16v2Data);
1336
0
}
1337
1338
U_CDECL_BEGIN
1339
static void U_CALLCONV
1340
_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1341
0
                           UErrorCode *pErrorCode) {
1342
0
    UConverter *cnv=pArgs->converter;
1343
0
    const char *source=pArgs->source;
1344
0
    const char *sourceLimit=pArgs->sourceLimit;
1345
0
    int32_t *offsets=pArgs->offsets;
1346
1347
0
    int32_t state, offsetDelta;
1348
0
    uint8_t b;
1349
1350
0
    state=cnv->mode;
1351
1352
    /*
1353
     * If we detect a BOM in this buffer, then we must add the BOM size to the
1354
     * offsets because the actual converter function will not see and count the BOM.
1355
     * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1356
     */
1357
0
    offsetDelta=0;
1358
1359
0
    while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
1360
0
        switch(state) {
1361
0
        case 0:
1362
0
            cnv->toUBytes[0]=(uint8_t)*source++;
1363
0
            cnv->toULength=1;
1364
0
            state=1;
1365
0
            break;
1366
0
        case 1:
1367
            /*
1368
             * Only inside this switch case can the state variable
1369
             * temporarily take two additional values:
1370
             * 6: BOM error, continue with BE
1371
             * 7: BOM error, continue with LE
1372
             */
1373
0
            b=*source;
1374
0
            if(cnv->toUBytes[0]==0xfe && b==0xff) {
1375
0
                if(IS_UTF16LE(cnv)) {
1376
0
                    state=7; /* illegal reverse BOM for Java "UnicodeLittle" */
1377
0
                } else {
1378
0
                    state=8; /* detect UTF-16BE */
1379
0
                }
1380
0
            } else if(cnv->toUBytes[0]==0xff && b==0xfe) {
1381
0
                if(IS_UTF16BE(cnv)) {
1382
0
                    state=6; /* illegal reverse BOM for Java "UnicodeBig" */
1383
0
                } else {
1384
0
                    state=9; /* detect UTF-16LE */
1385
0
                }
1386
0
            } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) {
1387
0
                state=6; /* illegal missing BOM for Java "Unicode" */
1388
0
            }
1389
0
            if(state>=8) {
1390
                /* BOM detected, consume it */
1391
0
                ++source;
1392
0
                cnv->toULength=0;
1393
0
                offsetDelta=(int32_t)(source-pArgs->source);
1394
0
            } else if(state<6) {
1395
                /* ok: no BOM, and not a reverse BOM */
1396
0
                if(source!=pArgs->source) {
1397
                    /* reset the source for a correct first offset */
1398
0
                    source=pArgs->source;
1399
0
                    cnv->toULength=0;
1400
0
                }
1401
0
                if(IS_UTF16LE(cnv)) {
1402
                    /* Make Java "UnicodeLittle" default to LE. */
1403
0
                    state=9;
1404
0
                } else {
1405
                    /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */
1406
0
                    state=8;
1407
0
                }
1408
0
            } else {
1409
                /*
1410
                 * error: missing BOM, or reverse BOM
1411
                 * UTF-16,version=1: Java-specific "Unicode" requires a BOM.
1412
                 * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
1413
                 * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
1414
                 */
1415
                /* report the non-BOM or reverse BOM as an illegal sequence */
1416
0
                cnv->toUBytes[1]=b;
1417
0
                cnv->toULength=2;
1418
0
                pArgs->source=source+1;
1419
                /* continue with conversion if the callback resets the error */
1420
                /*
1421
                 * Make Java "Unicode" default to BE like standard UTF-16.
1422
                 * Make Java "UnicodeBig" and "UnicodeLittle" default
1423
                 * to their normal endiannesses.
1424
                 */
1425
0
                cnv->mode=state+2;
1426
0
                *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
1427
0
                return;
1428
0
            }
1429
            /* convert the rest of the stream */
1430
0
            cnv->mode=state;
1431
0
            continue;
1432
0
        case 8:
1433
            /* call UTF-16BE */
1434
0
            pArgs->source=source;
1435
0
            _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1436
0
            source=pArgs->source;
1437
0
            break;
1438
0
        case 9:
1439
            /* call UTF-16LE */
1440
0
            pArgs->source=source;
1441
0
            _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1442
0
            source=pArgs->source;
1443
0
            break;
1444
0
        default:
1445
0
            break; /* does not occur */
1446
0
        }
1447
0
    }
1448
1449
    /* add BOM size to offsets - see comment at offsetDelta declaration */
1450
0
    if(offsets!=NULL && offsetDelta!=0) {
1451
0
        int32_t *offsetsLimit=pArgs->offsets;
1452
0
        while(offsets<offsetsLimit) {
1453
0
            *offsets++ += offsetDelta;
1454
0
        }
1455
0
    }
1456
1457
0
    pArgs->source=source;
1458
1459
0
    if(source==sourceLimit && pArgs->flush) {
1460
        /* handle truncated input */
1461
0
        switch(state) {
1462
0
        case 0:
1463
0
            break; /* no input at all, nothing to do */
1464
0
        case 8:
1465
0
            _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
1466
0
            break;
1467
0
        case 9:
1468
0
            _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
1469
0
            break;
1470
0
        default:
1471
            /* 0<state<8: framework will report truncation, nothing to do here */
1472
0
            break;
1473
0
        }
1474
0
    }
1475
1476
0
    cnv->mode=state;
1477
0
}
1478
1479
/*
 * getNextUChar for the BOM-detecting converter: once the byte order has been
 * decided (mode 8 = BE, mode 9 = LE) delegate to the matching function,
 * otherwise return UCNV_GET_NEXT_UCHAR_USE_TO_U.
 */
static UChar32 U_CALLCONV
_UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
                   UErrorCode *pErrorCode) {
    const int32_t mode=pArgs->converter->mode;

    if(mode==8) {
        return _UTF16BEGetNextUChar(pArgs, pErrorCode);
    }
    if(mode==9) {
        return _UTF16LEGetNextUChar(pArgs, pErrorCode);
    }
    return UCNV_GET_NEXT_UCHAR_USE_TO_U;
}
1491
U_CDECL_END
1492
1493
/*
 * Function table for the BOM-detecting UTF-16 converter; fromUnicode uses
 * the platform-endian (_UTF16PE...) function.
 * NOTE(review): the slot labels below assume the member order of
 * UConverterImpl as declared in ucnv_cnv.h - confirm against that header.
 */
static const UConverterImpl _UTF16Impl = {
    UCNV_UTF16,                         /* converter type */

    NULL,                               /* load */
    NULL,                               /* unload */

    _UTF16Open,                         /* open */
    NULL,                               /* close */
    _UTF16Reset,                        /* reset */

    _UTF16ToUnicodeWithOffsets,         /* toUnicode (same function with and without offsets) */
    _UTF16ToUnicodeWithOffsets,         /* toUnicode with offsets */
    _UTF16PEFromUnicodeWithOffsets,     /* fromUnicode (same function with and without offsets) */
    _UTF16PEFromUnicodeWithOffsets,     /* fromUnicode with offsets */
    _UTF16GetNextUChar,                 /* getNextUChar */

    NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
    _UTF16GetName,                      /* getName */
    NULL,                               /* writeSub */
    NULL,                               /* safeClone */
    ucnv_getNonSurrogateUnicodeSet,     /* getUnicodeSet */

    NULL,                               /* toUTF8 */
    NULL                                /* fromUTF8 */
};
1518
1519
static const UConverterStaticData _UTF16StaticData = {
1520
    sizeof(UConverterStaticData),
1521
    "UTF-16",
1522
    1204, /* CCSID for BOM sensitive UTF-16 */
1523
    UCNV_IBM, UCNV_UTF16, 2, 2,
1524
#if U_IS_BIG_ENDIAN
1525
    { 0xff, 0xfd, 0, 0 }, 2,
1526
#else
1527
    { 0xfd, 0xff, 0, 0 }, 2,
1528
#endif
1529
    FALSE, FALSE,
1530
    0,
1531
    0,
1532
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1533
};
1534
1535
const UConverterSharedData _UTF16Data =
1536
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16StaticData, &_UTF16Impl);
1537
1538
/*
 * Function table for UTF-16,version=2. It differs from _UTF16Impl only in
 * the fromUnicode slots, which use the big-endian function.
 * NOTE(review): the slot labels below assume the member order of
 * UConverterImpl as declared in ucnv_cnv.h - confirm against that header.
 */
static const UConverterImpl _UTF16v2Impl = {
    UCNV_UTF16,                         /* converter type */

    NULL,                               /* load */
    NULL,                               /* unload */

    _UTF16Open,                         /* open */
    NULL,                               /* close */
    _UTF16Reset,                        /* reset */

    _UTF16ToUnicodeWithOffsets,         /* toUnicode (same function with and without offsets) */
    _UTF16ToUnicodeWithOffsets,         /* toUnicode with offsets */
    _UTF16BEFromUnicodeWithOffsets,     /* fromUnicode (same function with and without offsets) */
    _UTF16BEFromUnicodeWithOffsets,     /* fromUnicode with offsets */
    _UTF16GetNextUChar,                 /* getNextUChar */

    NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
    _UTF16GetName,                      /* getName */
    NULL,                               /* writeSub */
    NULL,                               /* safeClone */
    ucnv_getNonSurrogateUnicodeSet,     /* getUnicodeSet */

    NULL,                               /* toUTF8 */
    NULL                                /* fromUTF8 */
};
1563
1564
static const UConverterStaticData _UTF16v2StaticData = {
1565
    sizeof(UConverterStaticData),
1566
    "UTF-16,version=2",
1567
    1204, /* CCSID for BOM sensitive UTF-16 */
1568
    UCNV_IBM, UCNV_UTF16, 2, 2,
1569
    { 0xff, 0xfd, 0, 0 }, 2,
1570
    FALSE, FALSE,
1571
    0,
1572
    0,
1573
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1574
};
1575
1576
const UConverterSharedData _UTF16v2Data =
1577
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16v2StaticData, &_UTF16v2Impl);
1578
1579
#endif