Coverage Report

Created: 2024-04-24 06:23

/src/icu/source/common/ucnvscsu.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
******************************************************************************
5
*
6
*   Copyright (C) 2000-2016, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
******************************************************************************
10
*   file name:  ucnvscsu.c
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2000nov18
16
*   created by: Markus W. Scherer
17
*
18
*   This is an implementation of the Standard Compression Scheme for Unicode
19
*   as defined in https://www.unicode.org/reports/tr6/ .
20
*   Reserved commands and window settings are treated as illegal sequences and
21
*   will result in callback calls.
22
*/
23
24
#include "unicode/utypes.h"
25
26
#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
27
28
#include "unicode/ucnv.h"
29
#include "unicode/ucnv_cb.h"
30
#include "unicode/utf16.h"
31
#include "ucnv_bld.h"
32
#include "ucnv_cnv.h"
33
#include "cmemory.h"
34
35
/* SCSU definitions --------------------------------------------------------- */
36
37
/* SCSU command byte values */
38
/*
 * SCSU tag (command) bytes.
 * The S* values are the tags recognized in single-byte mode,
 * the U* values are the tags recognized in Unicode mode.
 * Only the first and last tag of each contiguous range is named.
 */
enum {
    SQ0=0x01, /* Quote from window pair 0 */
    SQ7=0x08, /* Quote from window pair 7 */
    SDX=0x0B, /* Define a window as extended */
    Srs=0x0C, /* reserved */
    SQU=0x0E, /* Quote a single Unicode character */
    SCU=0x0F, /* Change to Unicode mode */
    SC0=0x10, /* Select window 0 */
    SC7=0x17, /* Select window 7 */
    SD0=0x18, /* Define and select window 0 */
    SD7=0x1F, /* Define and select window 7 */

    UC0=0xE0, /* Select window 0 */
    UC7=0xE7, /* Select window 7 */
    UD0=0xE8, /* Define and select window 0 */
    UD7=0xEF, /* Define and select window 7 */
    UQU=0xF0, /* Quote a single Unicode character */
    UDX=0xF1, /* Define a Window as extended */
    Urs=0xF2  /* reserved */
};
58
59
/* Boundaries used when decoding the one-byte window offset argument of a define-window tag. */
enum {
    /*
     * Unicode code points from 3400 to E000 are not addressable by
     * dynamic window, since in these areas no short run alphabets are
     * found. Therefore add gapOffset to all values from gapThreshold.
     */
    gapThreshold=0x68,
    gapOffset=0xAC00,

    /* values between reservedStart and fixedThreshold are reserved */
    reservedStart=0xA8,

    /* use table of predefined fixed offsets for values from fixedThreshold */
    fixedThreshold=0xF9
};
74
75
/* constant offsets for the 8 static windows */
76
/* Start code point of each static window, indexed by static window number 0..7. */
static const uint32_t staticOffsets[8]={
    0x0000, /* ASCII for quoted tags */
    0x0080, /* Latin - 1 Supplement (for access to punctuation) */
    0x0100, /* Latin Extended-A */
    0x0300, /* Combining Diacritical Marks */
    0x2000, /* General Punctuation */
    0x2080, /* Currency Symbols */
    0x2100, /* Letterlike Symbols and Number Forms */
    0x3000  /* CJK Symbols and punctuation */
};
86
87
/* initial offsets for the 8 dynamic (sliding) windows */
88
/* Start code point that each dynamic (sliding) window 0..7 has after a reset. */
static const uint32_t initialDynamicOffsets[8]={
    0x0080, /* Latin-1 */
    0x00C0, /* Latin Extended A */
    0x0400, /* Cyrillic */
    0x0600, /* Arabic */
    0x0900, /* Devanagari */
    0x3040, /* Hiragana */
    0x30A0, /* Katakana */
    0xFF00  /* Fullwidth ASCII */
};
98
99
/* Table of fixed predefined Offsets */
100
/*
 * Predefined window offsets selected by offset bytes 0xF9..0xFF.
 * Index with (offset byte - fixedThreshold).
 */
static const uint32_t fixedOffsets[]={
    /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */
    /* 0xFA */ 0x0250, /* IPA extensions */
    /* 0xFB */ 0x0370, /* Greek */
    /* 0xFC */ 0x0530, /* Armenian */
    /* 0xFD */ 0x3040, /* Hiragana */
    /* 0xFE */ 0x30A0, /* Katakana */
    /* 0xFF */ 0xFF60  /* Halfwidth Katakana */
};
109
110
/* state values */
111
/*
 * States of the toUnicode state machine.
 * readCommand means "not inside a multi-byte sequence"; every other state
 * names the argument byte that is expected next.
 */
enum {
    readCommand,
    quotePairOne,
    quotePairTwo,
    quoteOne,
    definePairOne,
    definePairTwo,
    defineOne
};
120
121
typedef struct SCSUData {
122
    /* dynamic window offsets, initialize to default values from initialDynamicOffsets */
123
    uint32_t toUDynamicOffsets[8];
124
    uint32_t fromUDynamicOffsets[8];
125
126
    /* state machine state - toUnicode */
127
    UBool toUIsSingleByteMode;
128
    uint8_t toUState;
129
    int8_t toUQuoteWindow, toUDynamicWindow;
130
    uint8_t toUByteOne;
131
    uint8_t toUPadding[3];
132
133
    /* state machine state - fromUnicode */
134
    UBool fromUIsSingleByteMode;
135
    int8_t fromUDynamicWindow;
136
137
    /*
138
     * windowUse[] keeps track of the use of the dynamic windows:
139
     * At nextWindowUseIndex there is the least recently used window,
140
     * and the following windows (in a wrapping manner) are more and more
141
     * recently used.
142
     * At nextWindowUseIndex-1 there is the most recently used window.
143
     */
144
    uint8_t locale;
145
    int8_t nextWindowUseIndex;
146
    int8_t windowUse[8];
147
} SCSUData;
148
149
/* Initial contents of SCSUData.windowUse[]: the default order, and the order used for the l_ja locale. */
static const int8_t initialWindowUse[8]={ 7, 0, 3, 2, 4, 5, 6, 1 };
static const int8_t initialWindowUse_ja[8]={ 3, 2, 4, 1, 0, 7, 5, 6 };
151
152
/* Values stored in SCSUData.locale. */
enum {
    lGeneric, l_ja
};
155
156
/* SCSU setup functions ----------------------------------------------------- */
157
U_CDECL_BEGIN
158
static void U_CALLCONV
159
0
_SCSUReset(UConverter *cnv, UConverterResetChoice choice) {
160
0
    SCSUData *scsu=(SCSUData *)cnv->extraInfo;
161
162
0
    if(choice<=UCNV_RESET_TO_UNICODE) {
163
        /* reset toUnicode */
164
0
        uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32);
165
166
0
        scsu->toUIsSingleByteMode=TRUE;
167
0
        scsu->toUState=readCommand;
168
0
        scsu->toUQuoteWindow=scsu->toUDynamicWindow=0;
169
0
        scsu->toUByteOne=0;
170
171
0
        cnv->toULength=0;
172
0
    }
173
0
    if(choice!=UCNV_RESET_TO_UNICODE) {
174
        /* reset fromUnicode */
175
0
        uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32);
176
177
0
        scsu->fromUIsSingleByteMode=TRUE;
178
0
        scsu->fromUDynamicWindow=0;
179
180
0
        scsu->nextWindowUseIndex=0;
181
0
        switch(scsu->locale) {
182
0
        case l_ja:
183
0
            uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8);
184
0
            break;
185
0
        default:
186
0
            uprv_memcpy(scsu->windowUse, initialWindowUse, 8);
187
0
            break;
188
0
        }
189
190
0
        cnv->fromUChar32=0;
191
0
    }
192
0
}
193
194
static void U_CALLCONV
195
_SCSUOpen(UConverter *cnv,
196
          UConverterLoadArgs *pArgs,
197
0
          UErrorCode *pErrorCode) {
198
0
    const char *locale=pArgs->locale;
199
0
    if(pArgs->onlyTestIsLoadable) {
200
0
        return;
201
0
    }
202
0
    cnv->extraInfo=uprv_malloc(sizeof(SCSUData));
203
0
    if(cnv->extraInfo!=NULL) {
204
0
        if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) {
205
0
            ((SCSUData *)cnv->extraInfo)->locale=l_ja;
206
0
        } else {
207
0
            ((SCSUData *)cnv->extraInfo)->locale=lGeneric;
208
0
        }
209
0
        _SCSUReset(cnv, UCNV_RESET_BOTH);
210
0
    } else {
211
0
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
212
0
    }
213
214
    /* Set the substitution character U+fffd as a Unicode string. */
215
0
    cnv->subUChars[0]=0xfffd;
216
0
    cnv->subCharLen=-1;
217
0
}
218
219
static void U_CALLCONV
220
0
_SCSUClose(UConverter *cnv) {
221
0
    if(cnv->extraInfo!=NULL) {
222
0
        if(!cnv->isExtraLocal) {
223
0
            uprv_free(cnv->extraInfo);
224
0
        }
225
0
        cnv->extraInfo=NULL;
226
0
    }
227
0
}
228
229
/* SCSU-to-Unicode conversion functions ------------------------------------- */
230
231
static void U_CALLCONV
232
_SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
233
0
                          UErrorCode *pErrorCode) {
234
0
    UConverter *cnv;
235
0
    SCSUData *scsu;
236
0
    const uint8_t *source, *sourceLimit;
237
0
    UChar *target;
238
0
    const UChar *targetLimit;
239
0
    int32_t *offsets;
240
0
    UBool isSingleByteMode;
241
0
    uint8_t state, byteOne;
242
0
    int8_t quoteWindow, dynamicWindow;
243
244
0
    int32_t sourceIndex, nextSourceIndex;
245
246
0
    uint8_t b;
247
248
    /* set up the local pointers */
249
0
    cnv=pArgs->converter;
250
0
    scsu=(SCSUData *)cnv->extraInfo;
251
252
0
    source=(const uint8_t *)pArgs->source;
253
0
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
254
0
    target=pArgs->target;
255
0
    targetLimit=pArgs->targetLimit;
256
0
    offsets=pArgs->offsets;
257
258
    /* get the state machine state */
259
0
    isSingleByteMode=scsu->toUIsSingleByteMode;
260
0
    state=scsu->toUState;
261
0
    quoteWindow=scsu->toUQuoteWindow;
262
0
    dynamicWindow=scsu->toUDynamicWindow;
263
0
    byteOne=scsu->toUByteOne;
264
265
    /* sourceIndex=-1 if the current character began in the previous buffer */
266
0
    sourceIndex=state==readCommand ? 0 : -1;
267
0
    nextSourceIndex=0;
268
269
    /*
270
     * conversion "loop"
271
     *
272
     * For performance, this is not a normal C loop.
273
     * Instead, there are two code blocks for the two SCSU modes.
274
     * The function branches to either one, and a change of the mode is done with a goto to
275
     * the other branch.
276
     *
277
     * Each branch has two conventional loops:
278
     * - a fast-path loop for the most common codes in the mode
279
     * - a loop for all other codes in the mode
280
     * When the fast-path runs into a code that it cannot handle, its loop ends and it
281
     * runs into the following loop to handle the other codes.
282
     * The end of the input or output buffer is also handled by the slower loop.
283
     * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
284
     *
285
     * The callback handling is done by returning with an error code.
286
     * The conversion framework actually calls the callback function.
287
     */
288
0
    if(isSingleByteMode) {
289
        /* fast path for single-byte mode */
290
0
        if(state==readCommand) {
291
0
fastSingle:
292
0
            while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
293
0
                ++source;
294
0
                ++nextSourceIndex;
295
0
                if(b<=0x7f) {
296
                    /* write US-ASCII graphic character or DEL */
297
0
                    *target++=(UChar)b;
298
0
                    if(offsets!=NULL) {
299
0
                        *offsets++=sourceIndex;
300
0
                    }
301
0
                } else {
302
                    /* write from dynamic window */
303
0
                    uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
304
0
                    if(c<=0xffff) {
305
0
                        *target++=(UChar)c;
306
0
                        if(offsets!=NULL) {
307
0
                            *offsets++=sourceIndex;
308
0
                        }
309
0
                    } else {
310
                        /* output surrogate pair */
311
0
                        *target++=(UChar)(0xd7c0+(c>>10));
312
0
                        if(target<targetLimit) {
313
0
                            *target++=(UChar)(0xdc00|(c&0x3ff));
314
0
                            if(offsets!=NULL) {
315
0
                                *offsets++=sourceIndex;
316
0
                                *offsets++=sourceIndex;
317
0
                            }
318
0
                        } else {
319
                            /* target overflow */
320
0
                            if(offsets!=NULL) {
321
0
                                *offsets++=sourceIndex;
322
0
                            }
323
0
                            cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
324
0
                            cnv->UCharErrorBufferLength=1;
325
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
326
0
                            goto endloop;
327
0
                        }
328
0
                    }
329
0
                }
330
0
                sourceIndex=nextSourceIndex;
331
0
            }
332
0
        }
333
334
        /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
335
0
singleByteMode:
336
0
        while(source<sourceLimit) {
337
0
            if(target>=targetLimit) {
338
                /* target is full */
339
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
340
0
                break;
341
0
            }
342
0
            b=*source++;
343
0
            ++nextSourceIndex;
344
0
            switch(state) {
345
0
            case readCommand:
346
                /* redundant conditions are commented out */
347
                /* here: b<0x20 because otherwise we would be in fastSingle */
348
0
                if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
349
                    /* CR/LF/TAB/NUL */
350
0
                    *target++=(UChar)b;
351
0
                    if(offsets!=NULL) {
352
0
                        *offsets++=sourceIndex;
353
0
                    }
354
0
                    sourceIndex=nextSourceIndex;
355
0
                    goto fastSingle;
356
0
                } else if(SC0<=b) {
357
0
                    if(b<=SC7) {
358
0
                        dynamicWindow=(int8_t)(b-SC0);
359
0
                        sourceIndex=nextSourceIndex;
360
0
                        goto fastSingle;
361
0
                    } else /* if(SD0<=b && b<=SD7) */ {
362
0
                        dynamicWindow=(int8_t)(b-SD0);
363
0
                        state=defineOne;
364
0
                    }
365
0
                } else if(/* SQ0<=b && */ b<=SQ7) {
366
0
                    quoteWindow=(int8_t)(b-SQ0);
367
0
                    state=quoteOne;
368
0
                } else if(b==SDX) {
369
0
                    state=definePairOne;
370
0
                } else if(b==SQU) {
371
0
                    state=quotePairOne;
372
0
                } else if(b==SCU) {
373
0
                    sourceIndex=nextSourceIndex;
374
0
                    isSingleByteMode=FALSE;
375
0
                    goto fastUnicode;
376
0
                } else /* Srs */ {
377
                    /* callback(illegal) */
378
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
379
0
                    cnv->toUBytes[0]=b;
380
0
                    cnv->toULength=1;
381
0
                    goto endloop;
382
0
                }
383
384
                /* store the first byte of a multibyte sequence in toUBytes[] */
385
0
                cnv->toUBytes[0]=b;
386
0
                cnv->toULength=1;
387
0
                break;
388
0
            case quotePairOne:
389
0
                byteOne=b;
390
0
                cnv->toUBytes[1]=b;
391
0
                cnv->toULength=2;
392
0
                state=quotePairTwo;
393
0
                break;
394
0
            case quotePairTwo:
395
0
                *target++=(UChar)((byteOne<<8)|b);
396
0
                if(offsets!=NULL) {
397
0
                    *offsets++=sourceIndex;
398
0
                }
399
0
                sourceIndex=nextSourceIndex;
400
0
                state=readCommand;
401
0
                goto fastSingle;
402
0
            case quoteOne:
403
0
                if(b<0x80) {
404
                    /* all static offsets are in the BMP */
405
0
                    *target++=(UChar)(staticOffsets[quoteWindow]+b);
406
0
                    if(offsets!=NULL) {
407
0
                        *offsets++=sourceIndex;
408
0
                    }
409
0
                } else {
410
                    /* write from dynamic window */
411
0
                    uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
412
0
                    if(c<=0xffff) {
413
0
                        *target++=(UChar)c;
414
0
                        if(offsets!=NULL) {
415
0
                            *offsets++=sourceIndex;
416
0
                        }
417
0
                    } else {
418
                        /* output surrogate pair */
419
0
                        *target++=(UChar)(0xd7c0+(c>>10));
420
0
                        if(target<targetLimit) {
421
0
                            *target++=(UChar)(0xdc00|(c&0x3ff));
422
0
                            if(offsets!=NULL) {
423
0
                                *offsets++=sourceIndex;
424
0
                                *offsets++=sourceIndex;
425
0
                            }
426
0
                        } else {
427
                            /* target overflow */
428
0
                            if(offsets!=NULL) {
429
0
                                *offsets++=sourceIndex;
430
0
                            }
431
0
                            cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
432
0
                            cnv->UCharErrorBufferLength=1;
433
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
434
0
                            goto endloop;
435
0
                        }
436
0
                    }
437
0
                }
438
0
                sourceIndex=nextSourceIndex;
439
0
                state=readCommand;
440
0
                goto fastSingle;
441
0
            case definePairOne:
442
0
                dynamicWindow=(int8_t)((b>>5)&7);
443
0
                byteOne=(uint8_t)(b&0x1f);
444
0
                cnv->toUBytes[1]=b;
445
0
                cnv->toULength=2;
446
0
                state=definePairTwo;
447
0
                break;
448
0
            case definePairTwo:
449
0
                scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
450
0
                sourceIndex=nextSourceIndex;
451
0
                state=readCommand;
452
0
                goto fastSingle;
453
0
            case defineOne:
454
0
                if(b==0) {
455
                    /* callback(illegal): Reserved window offset value 0 */
456
0
                    cnv->toUBytes[1]=b;
457
0
                    cnv->toULength=2;
458
0
                    goto endloop;
459
0
                } else if(b<gapThreshold) {
460
0
                    scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
461
0
                } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
462
0
                    scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
463
0
                } else if(b>=fixedThreshold) {
464
0
                    scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
465
0
                } else {
466
                    /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
467
0
                    cnv->toUBytes[1]=b;
468
0
                    cnv->toULength=2;
469
0
                    goto endloop;
470
0
                }
471
0
                sourceIndex=nextSourceIndex;
472
0
                state=readCommand;
473
0
                goto fastSingle;
474
0
            }
475
0
        }
476
0
    } else {
477
        /* fast path for Unicode mode */
478
0
        if(state==readCommand) {
479
0
fastUnicode:
480
0
            while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
481
0
                *target++=(UChar)((b<<8)|source[1]);
482
0
                if(offsets!=NULL) {
483
0
                    *offsets++=sourceIndex;
484
0
                }
485
0
                sourceIndex=nextSourceIndex;
486
0
                nextSourceIndex+=2;
487
0
                source+=2;
488
0
            }
489
0
        }
490
491
        /* normal state machine for Unicode mode */
492
/* unicodeByteMode: */
493
0
        while(source<sourceLimit) {
494
0
            if(target>=targetLimit) {
495
                /* target is full */
496
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
497
0
                break;
498
0
            }
499
0
            b=*source++;
500
0
            ++nextSourceIndex;
501
0
            switch(state) {
502
0
            case readCommand:
503
0
                if((uint8_t)(b-UC0)>(Urs-UC0)) {
504
0
                    byteOne=b;
505
0
                    cnv->toUBytes[0]=b;
506
0
                    cnv->toULength=1;
507
0
                    state=quotePairTwo;
508
0
                } else if(/* UC0<=b && */ b<=UC7) {
509
0
                    dynamicWindow=(int8_t)(b-UC0);
510
0
                    sourceIndex=nextSourceIndex;
511
0
                    isSingleByteMode=TRUE;
512
0
                    goto fastSingle;
513
0
                } else if(/* UD0<=b && */ b<=UD7) {
514
0
                    dynamicWindow=(int8_t)(b-UD0);
515
0
                    isSingleByteMode=TRUE;
516
0
                    cnv->toUBytes[0]=b;
517
0
                    cnv->toULength=1;
518
0
                    state=defineOne;
519
0
                    goto singleByteMode;
520
0
                } else if(b==UDX) {
521
0
                    isSingleByteMode=TRUE;
522
0
                    cnv->toUBytes[0]=b;
523
0
                    cnv->toULength=1;
524
0
                    state=definePairOne;
525
0
                    goto singleByteMode;
526
0
                } else if(b==UQU) {
527
0
                    cnv->toUBytes[0]=b;
528
0
                    cnv->toULength=1;
529
0
                    state=quotePairOne;
530
0
                } else /* Urs */ {
531
                    /* callback(illegal) */
532
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
533
0
                    cnv->toUBytes[0]=b;
534
0
                    cnv->toULength=1;
535
0
                    goto endloop;
536
0
                }
537
0
                break;
538
0
            case quotePairOne:
539
0
                byteOne=b;
540
0
                cnv->toUBytes[1]=b;
541
0
                cnv->toULength=2;
542
0
                state=quotePairTwo;
543
0
                break;
544
0
            case quotePairTwo:
545
0
                *target++=(UChar)((byteOne<<8)|b);
546
0
                if(offsets!=NULL) {
547
0
                    *offsets++=sourceIndex;
548
0
                }
549
0
                sourceIndex=nextSourceIndex;
550
0
                state=readCommand;
551
0
                goto fastUnicode;
552
0
            }
553
0
        }
554
0
    }
555
0
endloop:
556
557
    /* set the converter state back into UConverter */
558
0
    if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
559
        /* reset to deal with the next character */
560
0
        state=readCommand;
561
0
    } else if(state==readCommand) {
562
        /* not in a multi-byte sequence, reset toULength */
563
0
        cnv->toULength=0;
564
0
    }
565
0
    scsu->toUIsSingleByteMode=isSingleByteMode;
566
0
    scsu->toUState=state;
567
0
    scsu->toUQuoteWindow=quoteWindow;
568
0
    scsu->toUDynamicWindow=dynamicWindow;
569
0
    scsu->toUByteOne=byteOne;
570
571
    /* write back the updated pointers */
572
0
    pArgs->source=(const char *)source;
573
0
    pArgs->target=target;
574
0
    pArgs->offsets=offsets;
575
0
    return;
576
0
}
577
578
/*
579
 * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
580
 * If a change is made in the original function, then either
581
 * change this function the same way or
582
 * re-copy the original function and remove the variables
583
 * offsets, sourceIndex, and nextSourceIndex.
584
 */
585
static void U_CALLCONV
586
_SCSUToUnicode(UConverterToUnicodeArgs *pArgs,
587
0
               UErrorCode *pErrorCode) {
588
0
    UConverter *cnv;
589
0
    SCSUData *scsu;
590
0
    const uint8_t *source, *sourceLimit;
591
0
    UChar *target;
592
0
    const UChar *targetLimit;
593
0
    UBool isSingleByteMode;
594
0
    uint8_t state, byteOne;
595
0
    int8_t quoteWindow, dynamicWindow;
596
597
0
    uint8_t b;
598
599
    /* set up the local pointers */
600
0
    cnv=pArgs->converter;
601
0
    scsu=(SCSUData *)cnv->extraInfo;
602
603
0
    source=(const uint8_t *)pArgs->source;
604
0
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
605
0
    target=pArgs->target;
606
0
    targetLimit=pArgs->targetLimit;
607
608
    /* get the state machine state */
609
0
    isSingleByteMode=scsu->toUIsSingleByteMode;
610
0
    state=scsu->toUState;
611
0
    quoteWindow=scsu->toUQuoteWindow;
612
0
    dynamicWindow=scsu->toUDynamicWindow;
613
0
    byteOne=scsu->toUByteOne;
614
615
    /*
616
     * conversion "loop"
617
     *
618
     * For performance, this is not a normal C loop.
619
     * Instead, there are two code blocks for the two SCSU modes.
620
     * The function branches to either one, and a change of the mode is done with a goto to
621
     * the other branch.
622
     *
623
     * Each branch has two conventional loops:
624
     * - a fast-path loop for the most common codes in the mode
625
     * - a loop for all other codes in the mode
626
     * When the fast-path runs into a code that it cannot handle, its loop ends and it
627
     * runs into the following loop to handle the other codes.
628
     * The end of the input or output buffer is also handled by the slower loop.
629
     * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
630
     *
631
     * The callback handling is done by returning with an error code.
632
     * The conversion framework actually calls the callback function.
633
     */
634
0
    if(isSingleByteMode) {
635
        /* fast path for single-byte mode */
636
0
        if(state==readCommand) {
637
0
fastSingle:
638
0
            while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
639
0
                ++source;
640
0
                if(b<=0x7f) {
641
                    /* write US-ASCII graphic character or DEL */
642
0
                    *target++=(UChar)b;
643
0
                } else {
644
                    /* write from dynamic window */
645
0
                    uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
646
0
                    if(c<=0xffff) {
647
0
                        *target++=(UChar)c;
648
0
                    } else {
649
                        /* output surrogate pair */
650
0
                        *target++=(UChar)(0xd7c0+(c>>10));
651
0
                        if(target<targetLimit) {
652
0
                            *target++=(UChar)(0xdc00|(c&0x3ff));
653
0
                        } else {
654
                            /* target overflow */
655
0
                            cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
656
0
                            cnv->UCharErrorBufferLength=1;
657
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
658
0
                            goto endloop;
659
0
                        }
660
0
                    }
661
0
                }
662
0
            }
663
0
        }
664
665
        /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
666
0
singleByteMode:
667
0
        while(source<sourceLimit) {
668
0
            if(target>=targetLimit) {
669
                /* target is full */
670
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
671
0
                break;
672
0
            }
673
0
            b=*source++;
674
0
            switch(state) {
675
0
            case readCommand:
676
                /* redundant conditions are commented out */
677
                /* here: b<0x20 because otherwise we would be in fastSingle */
678
0
                if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
679
                    /* CR/LF/TAB/NUL */
680
0
                    *target++=(UChar)b;
681
0
                    goto fastSingle;
682
0
                } else if(SC0<=b) {
683
0
                    if(b<=SC7) {
684
0
                        dynamicWindow=(int8_t)(b-SC0);
685
0
                        goto fastSingle;
686
0
                    } else /* if(SD0<=b && b<=SD7) */ {
687
0
                        dynamicWindow=(int8_t)(b-SD0);
688
0
                        state=defineOne;
689
0
                    }
690
0
                } else if(/* SQ0<=b && */ b<=SQ7) {
691
0
                    quoteWindow=(int8_t)(b-SQ0);
692
0
                    state=quoteOne;
693
0
                } else if(b==SDX) {
694
0
                    state=definePairOne;
695
0
                } else if(b==SQU) {
696
0
                    state=quotePairOne;
697
0
                } else if(b==SCU) {
698
0
                    isSingleByteMode=FALSE;
699
0
                    goto fastUnicode;
700
0
                } else /* Srs */ {
701
                    /* callback(illegal) */
702
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
703
0
                    cnv->toUBytes[0]=b;
704
0
                    cnv->toULength=1;
705
0
                    goto endloop;
706
0
                }
707
708
                /* store the first byte of a multibyte sequence in toUBytes[] */
709
0
                cnv->toUBytes[0]=b;
710
0
                cnv->toULength=1;
711
0
                break;
712
0
            case quotePairOne:
713
0
                byteOne=b;
714
0
                cnv->toUBytes[1]=b;
715
0
                cnv->toULength=2;
716
0
                state=quotePairTwo;
717
0
                break;
718
0
            case quotePairTwo:
719
0
                *target++=(UChar)((byteOne<<8)|b);
720
0
                state=readCommand;
721
0
                goto fastSingle;
722
0
            case quoteOne:
723
0
                if(b<0x80) {
724
                    /* all static offsets are in the BMP */
725
0
                    *target++=(UChar)(staticOffsets[quoteWindow]+b);
726
0
                } else {
727
                    /* write from dynamic window */
728
0
                    uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
729
0
                    if(c<=0xffff) {
730
0
                        *target++=(UChar)c;
731
0
                    } else {
732
                        /* output surrogate pair */
733
0
                        *target++=(UChar)(0xd7c0+(c>>10));
734
0
                        if(target<targetLimit) {
735
0
                            *target++=(UChar)(0xdc00|(c&0x3ff));
736
0
                        } else {
737
                            /* target overflow */
738
0
                            cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
739
0
                            cnv->UCharErrorBufferLength=1;
740
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
741
0
                            goto endloop;
742
0
                        }
743
0
                    }
744
0
                }
745
0
                state=readCommand;
746
0
                goto fastSingle;
747
0
            case definePairOne:
748
0
                dynamicWindow=(int8_t)((b>>5)&7);
749
0
                byteOne=(uint8_t)(b&0x1f);
750
0
                cnv->toUBytes[1]=b;
751
0
                cnv->toULength=2;
752
0
                state=definePairTwo;
753
0
                break;
754
0
            case definePairTwo:
755
0
                scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
756
0
                state=readCommand;
757
0
                goto fastSingle;
758
0
            case defineOne:
759
0
                if(b==0) {
760
                    /* callback(illegal): Reserved window offset value 0 */
761
0
                    cnv->toUBytes[1]=b;
762
0
                    cnv->toULength=2;
763
0
                    goto endloop;
764
0
                } else if(b<gapThreshold) {
765
0
                    scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
766
0
                } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
767
0
                    scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
768
0
                } else if(b>=fixedThreshold) {
769
0
                    scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
770
0
                } else {
771
                    /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
772
0
                    cnv->toUBytes[1]=b;
773
0
                    cnv->toULength=2;
774
0
                    goto endloop;
775
0
                }
776
0
                state=readCommand;
777
0
                goto fastSingle;
778
0
            }
779
0
        }
780
0
    } else {
781
        /* fast path for Unicode mode */
782
0
        if(state==readCommand) {
783
0
fastUnicode:
784
0
            while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
785
0
                *target++=(UChar)((b<<8)|source[1]);
786
0
                source+=2;
787
0
            }
788
0
        }
789
790
        /* normal state machine for Unicode mode */
791
/* unicodeByteMode: */
792
0
        while(source<sourceLimit) {
793
0
            if(target>=targetLimit) {
794
                /* target is full */
795
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
796
0
                break;
797
0
            }
798
0
            b=*source++;
799
0
            switch(state) {
800
0
            case readCommand:
801
0
                if((uint8_t)(b-UC0)>(Urs-UC0)) {
802
0
                    byteOne=b;
803
0
                    cnv->toUBytes[0]=b;
804
0
                    cnv->toULength=1;
805
0
                    state=quotePairTwo;
806
0
                } else if(/* UC0<=b && */ b<=UC7) {
807
0
                    dynamicWindow=(int8_t)(b-UC0);
808
0
                    isSingleByteMode=TRUE;
809
0
                    goto fastSingle;
810
0
                } else if(/* UD0<=b && */ b<=UD7) {
811
0
                    dynamicWindow=(int8_t)(b-UD0);
812
0
                    isSingleByteMode=TRUE;
813
0
                    cnv->toUBytes[0]=b;
814
0
                    cnv->toULength=1;
815
0
                    state=defineOne;
816
0
                    goto singleByteMode;
817
0
                } else if(b==UDX) {
818
0
                    isSingleByteMode=TRUE;
819
0
                    cnv->toUBytes[0]=b;
820
0
                    cnv->toULength=1;
821
0
                    state=definePairOne;
822
0
                    goto singleByteMode;
823
0
                } else if(b==UQU) {
824
0
                    cnv->toUBytes[0]=b;
825
0
                    cnv->toULength=1;
826
0
                    state=quotePairOne;
827
0
                } else /* Urs */ {
828
                    /* callback(illegal) */
829
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
830
0
                    cnv->toUBytes[0]=b;
831
0
                    cnv->toULength=1;
832
0
                    goto endloop;
833
0
                }
834
0
                break;
835
0
            case quotePairOne:
836
0
                byteOne=b;
837
0
                cnv->toUBytes[1]=b;
838
0
                cnv->toULength=2;
839
0
                state=quotePairTwo;
840
0
                break;
841
0
            case quotePairTwo:
842
0
                *target++=(UChar)((byteOne<<8)|b);
843
0
                state=readCommand;
844
0
                goto fastUnicode;
845
0
            }
846
0
        }
847
0
    }
848
0
endloop:
849
850
    /* set the converter state back into UConverter */
851
0
    if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
852
        /* reset to deal with the next character */
853
0
        state=readCommand;
854
0
    } else if(state==readCommand) {
855
        /* not in a multi-byte sequence, reset toULength */
856
0
        cnv->toULength=0;
857
0
    }
858
0
    scsu->toUIsSingleByteMode=isSingleByteMode;
859
0
    scsu->toUState=state;
860
0
    scsu->toUQuoteWindow=quoteWindow;
861
0
    scsu->toUDynamicWindow=dynamicWindow;
862
0
    scsu->toUByteOne=byteOne;
863
864
    /* write back the updated pointers */
865
0
    pArgs->source=(const char *)source;
866
0
    pArgs->target=target;
867
0
    return;
868
0
}
869
U_CDECL_END
870
/* SCSU-from-Unicode conversion functions ----------------------------------- */
871
872
/*
873
 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
874
 * reasonable results. The lookahead is minimal.
875
 * Many cases are simple:
876
 * A character fits directly into the current mode, a dynamic or static window,
877
 * or is not compressible. These cases are tested first.
878
 * Real compression heuristics are applied to the rest, in code branches for
879
 * single/Unicode mode and BMP/supplementary code points.
880
 * The heuristics used here are extremely simple.
881
 */
882
883
/*
 * Look up which of the eight windows contains the character c.
 * A window starting at offsets[i] covers the 0x80 code points
 * offsets[i]..offsets[i]+0x7f.
 *
 * Returns the index (0..7) of the first matching window, or -1 if no
 * window contains c.
 */
static int8_t
getWindow(const uint32_t offsets[8], uint32_t c) {
    int8_t window=0;

    while(window<8) {
        /*
         * Unsigned subtraction: if c lies below the window start the
         * difference wraps around to a huge value, so a single comparison
         * checks both ends of the window.
         */
        uint32_t delta=c-offsets[window];
        if(delta<0x80) {
            return window;
        }
        ++window;
    }

    /* no window contains c */
    return -1;
}
894
895
/* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */
896
static UBool
897
0
isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) {
898
0
    return (UBool)(c<=offset+0x7f &&
899
0
          (c>=offset || (c<=0x7f &&
900
0
                        (c>=0x20 || (1UL<<c)&0x2601))));
901
                                /* binary 0010 0110 0000 0001,
902
                                   check for b==0xd || b==0xa || b==9 || b==0 */
903
0
}
904
905
/*
906
 * getNextDynamicWindow returns the next dynamic window to be redefined
907
 */
908
static int8_t
909
0
getNextDynamicWindow(SCSUData *scsu) {
910
0
    int8_t window=scsu->windowUse[scsu->nextWindowUseIndex];
911
0
    if(++scsu->nextWindowUseIndex==8) {
912
0
        scsu->nextWindowUseIndex=0;
913
0
    }
914
0
    return window;
915
0
}
916
917
/*
918
 * useDynamicWindow() adjusts
919
 * windowUse[] and nextWindowUseIndex for the algorithm to choose
920
 * the next dynamic window to be defined;
921
 * a subclass may override it and provide its own algorithm.
922
 */
923
static void
924
0
useDynamicWindow(SCSUData *scsu, int8_t window) {
925
    /*
926
     * move the existing window, which just became the most recently used one,
927
     * up in windowUse[] to nextWindowUseIndex-1
928
     */
929
930
    /* first, find the index of the window - backwards to favor the more recently used windows */
931
0
    int i, j;
932
933
0
    i=scsu->nextWindowUseIndex;
934
0
    do {
935
0
        if(--i<0) {
936
0
            i=7;
937
0
        }
938
0
    } while(scsu->windowUse[i]!=window);
939
940
    /* now copy each windowUse[i+1] to [i] */
941
0
    j=i+1;
942
0
    if(j==8) {
943
0
        j=0;
944
0
    }
945
0
    while(j!=scsu->nextWindowUseIndex) {
946
0
        scsu->windowUse[i]=scsu->windowUse[j];
947
0
        i=j;
948
0
        if(++j==8) { j=0; }
949
0
    }
950
951
    /* finally, set the window into the most recently used index */
952
0
    scsu->windowUse[i]=window;
953
0
}
954
955
/*
956
 * calculate the offset and the code for a dynamic window that contains the character
957
 * takes fixed offsets into account
958
 * the offset of the window is stored in the offset variable,
959
 * the code is returned
960
 *
961
 * return offset code: -1 none  <=0xff code for SDn/UDn  else code for SDX/UDX, subtract 0x200 to get the true code
962
 */
963
static int
964
0
getDynamicOffset(uint32_t c, uint32_t *pOffset) {
965
0
    int i;
966
967
0
    for(i=0; i<7; ++i) {
968
0
        if((uint32_t)(c-fixedOffsets[i])<=0x7f) {
969
0
            *pOffset=fixedOffsets[i];
970
0
            return 0xf9+i;
971
0
        }
972
0
    }
973
974
0
    if(c<0x80) {
975
        /* No dynamic window for US-ASCII. */
976
0
        return -1;
977
0
    } else if(c<0x3400 ||
978
0
              (uint32_t)(c-0x10000)<(0x14000-0x10000) ||
979
0
              (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000)
980
0
    ) {
981
        /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
982
0
        *pOffset=c&0x7fffff80;
983
0
        return (int)(c>>7);
984
0
    } else if(0xe000<=c && c!=0xfeff && c<0xfff0) {
985
        /* For these characters we need to take the gapOffset into account. */
986
0
        *pOffset=c&0x7fffff80;
987
0
        return (int)((c-gapOffset)>>7);
988
0
    } else {
989
0
        return -1;
990
0
    }
991
0
}
992
U_CDECL_BEGIN
993
/*
994
 * Idea for compression:
995
 *  - save SCSUData and other state before really starting work
996
 *  - at endloop, see if compression could be better with just unicode mode
997
 *  - don't do this if a callback has been called
998
 *  - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
999
 *  - different buffer handling!
1000
 *
1001
 * Drawback or need for corrective handling:
1002
 * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
1003
 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
1004
 * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
1005
 *
1006
 * How to achieve both?
1007
 *  - Only replace the result after an SDX or SCU?
1008
 */
1009
1010
/*
 * Convert UTF-16 text from pArgs->source to SCSU bytes in pArgs->target.
 * When pArgs->offsets is not NULL, one source index is written to it for
 * every output byte that goes into the target.
 *
 * The encoder state (single-byte vs. Unicode mode, current dynamic window)
 * is read from and written back to the SCSUData in cnv->extraInfo.
 * A lead surrogate whose trail has not been seen yet is carried across
 * calls in cnv->fromUChar32.
 *
 * pArgs       converter, source/target pointers and limits, offsets array;
 *             source, target and offsets are updated on return
 * pErrorCode  set to U_BUFFER_OVERFLOW_ERROR when the target is full
 *             (bytes that did not fit are stored in cnv->charErrorBuffer),
 *             or to U_ILLEGAL_CHAR_FOUND for an unpaired surrogate
 */
static void U_CALLCONV
1011
_SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1012
0
                            UErrorCode *pErrorCode) {
1013
0
    UConverter *cnv;
1014
0
    SCSUData *scsu;
1015
0
    const UChar *source, *sourceLimit;
1016
0
    uint8_t *target;
1017
0
    int32_t targetCapacity;
1018
0
    int32_t *offsets;
1019
1020
0
    UBool isSingleByteMode;
1021
0
    uint8_t dynamicWindow;
1022
0
    uint32_t currentOffset;
1023
1024
0
    uint32_t c, delta;
1025
1026
0
    int32_t sourceIndex, nextSourceIndex;
1027
1028
0
    int32_t length;
1029
1030
    /* variables for compression heuristics */
1031
0
    uint32_t offset;
1032
0
    UChar lead, trail;
1033
0
    int code;
1034
0
    int8_t window;
1035
1036
    /* set up the local pointers */
1037
0
    cnv=pArgs->converter;
1038
0
    scsu=(SCSUData *)cnv->extraInfo;
1039
1040
    /* set up the local pointers */
1041
0
    source=pArgs->source;
1042
0
    sourceLimit=pArgs->sourceLimit;
1043
0
    target=(uint8_t *)pArgs->target;
1044
0
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1045
0
    offsets=pArgs->offsets;
1046
1047
    /* get the state machine state */
1048
0
    isSingleByteMode=scsu->fromUIsSingleByteMode;
1049
0
    dynamicWindow=scsu->fromUDynamicWindow;
1050
0
    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1051
1052
0
    c=cnv->fromUChar32;
1053
1054
    /* sourceIndex=-1 if the current character began in the previous buffer */
1055
0
    sourceIndex= c==0 ? 0 : -1;
1056
0
    nextSourceIndex=0;
1057
1058
    /* similar conversion "loop" as in toUnicode */
1059
0
loop:
1060
0
    if(isSingleByteMode) {
        /* c!=0: a lead surrogate is pending in c; continue by looking for its trail */
1061
0
        if(c!=0 && targetCapacity>0) {
1062
0
            goto getTrailSingle;
1063
0
        }
1064
1065
        /* state machine for single-byte mode */
1066
/* singleByteMode: */
1067
0
        while(source<sourceLimit) {
1068
0
            if(targetCapacity<=0) {
1069
                /* target is full */
1070
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1071
0
                break;
1072
0
            }
1073
0
            c=*source++;
1074
0
            ++nextSourceIndex;
1075
1076
            /* c is unsigned, so this single comparison tests 0x20<=c<=0x7f */
0
            if((c-0x20)<=0x5f) {
1077
                /* pass US-ASCII graphic character through */
1078
0
                *target++=(uint8_t)c;
1079
0
                if(offsets!=NULL) {
1080
0
                    *offsets++=sourceIndex;
1081
0
                }
1082
0
                --targetCapacity;
1083
0
            } else if(c<0x20) {
1084
0
                if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1085
                    /* CR/LF/TAB/NUL */
1086
0
                    *target++=(uint8_t)c;
1087
0
                    if(offsets!=NULL) {
1088
0
                        *offsets++=sourceIndex;
1089
0
                    }
1090
0
                    --targetCapacity;
1091
0
                } else {
1092
                    /* quote C0 control character */
1093
0
                    c|=SQ0<<8;
1094
0
                    length=2;
1095
0
                    goto outputBytes;
1096
0
                }
1097
0
            } else if((delta=c-currentOffset)<=0x7f) {
1098
                /* use the current dynamic window */
1099
0
                *target++=(uint8_t)(delta|0x80);
1100
0
                if(offsets!=NULL) {
1101
0
                    *offsets++=sourceIndex;
1102
0
                }
1103
0
                --targetCapacity;
1104
0
            } else if(U16_IS_SURROGATE(c)) {
1105
0
                if(U16_IS_SURROGATE_LEAD(c)) {
1106
0
getTrailSingle:
1107
0
                    lead=(UChar)c;
1108
0
                    if(source<sourceLimit) {
1109
                        /* test the following code unit */
1110
0
                        trail=*source;
1111
0
                        if(U16_IS_TRAIL(trail)) {
1112
0
                            ++source;
1113
0
                            ++nextSourceIndex;
1114
0
                            c=U16_GET_SUPPLEMENTARY(c, trail);
1115
                            /* convert this surrogate code point */
1116
                            /* exit this condition tree */
1117
0
                        } else {
1118
                            /* this is an unmatched lead code unit (1st surrogate) */
1119
                            /* callback(illegal) */
1120
0
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1121
0
                            goto endloop;
1122
0
                        }
1123
0
                    } else {
1124
                        /* no more input */
                        /* c still holds the lead surrogate; endloop stores it in cnv->fromUChar32 */
1125
0
                        break;
1126
0
                    }
1127
0
                } else {
1128
                    /* this is an unmatched trail code unit (2nd surrogate) */
1129
                    /* callback(illegal) */
1130
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1131
0
                    goto endloop;
1132
0
                }
1133
1134
                /* compress supplementary character U+10000..U+10ffff */
1135
0
                if((delta=c-currentOffset)<=0x7f) {
1136
                    /* use the current dynamic window */
1137
0
                    *target++=(uint8_t)(delta|0x80);
1138
0
                    if(offsets!=NULL) {
1139
0
                        *offsets++=sourceIndex;
1140
0
                    }
1141
0
                    --targetCapacity;
1142
0
                } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1143
                    /* there is a dynamic window that contains this character, change to it */
1144
0
                    dynamicWindow=window;
1145
0
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1146
0
                    useDynamicWindow(scsu, dynamicWindow);
1147
0
                    c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1148
0
                    length=2;
1149
0
                    goto outputBytes;
1150
0
                } else if((code=getDynamicOffset(c, &offset))>=0) {
1151
                    /* might check if there are more characters in this window to come */
1152
                    /* define an extended window with this character */
                    /* getDynamicOffset() returned an SDX/UDX-style code; subtract 0x200 to get the true code */
1153
0
                    code-=0x200;
1154
0
                    dynamicWindow=getNextDynamicWindow(scsu);
1155
0
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1156
0
                    useDynamicWindow(scsu, dynamicWindow);
1157
0
                    c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1158
0
                    length=4;
1159
0
                    goto outputBytes;
1160
0
                } else {
1161
                    /* change to Unicode mode and output this (lead, trail) pair */
1162
0
                    isSingleByteMode=FALSE;
1163
0
                    *target++=(uint8_t)SCU;
1164
0
                    if(offsets!=NULL) {
1165
0
                        *offsets++=sourceIndex;
1166
0
                    }
1167
0
                    --targetCapacity;
1168
0
                    c=((uint32_t)lead<<16)|trail;
1169
0
                    length=4;
1170
0
                    goto outputBytes;
1171
0
                }
1172
0
            } else if(c<0xa0) {
1173
                /* quote C1 control character */
1174
0
                c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
1175
0
                length=2;
1176
0
                goto outputBytes;
1177
0
            } else if(c==0xfeff || c>=0xfff0) {
1178
                /* quote signature character=byte order mark and specials */
1179
0
                c|=SQU<<16;
1180
0
                length=3;
1181
0
                goto outputBytes;
1182
0
            } else {
1183
                /* compress all other BMP characters */
1184
0
                if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1185
                    /* there is a window defined that contains this character - switch to it or quote from it? */
1186
0
                    if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
1187
                        /* change to dynamic window */
1188
0
                        dynamicWindow=window;
1189
0
                        currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1190
0
                        useDynamicWindow(scsu, dynamicWindow);
1191
0
                        c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1192
0
                        length=2;
1193
0
                        goto outputBytes;
1194
0
                    } else {
1195
                        /* quote from dynamic window */
1196
0
                        c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
1197
0
                        length=2;
1198
0
                        goto outputBytes;
1199
0
                    }
1200
0
                } else if((window=getWindow(staticOffsets, c))>=0) {
1201
                    /* quote from static window */
1202
0
                    c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
1203
0
                    length=2;
1204
0
                    goto outputBytes;
1205
0
                } else if((code=getDynamicOffset(c, &offset))>=0) {
1206
                    /* define a dynamic window with this character */
1207
0
                    dynamicWindow=getNextDynamicWindow(scsu);
1208
0
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1209
0
                    useDynamicWindow(scsu, dynamicWindow);
1210
0
                    c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1211
0
                    length=3;
1212
0
                    goto outputBytes;
1213
0
                } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
1214
0
                          (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1215
0
                ) {
1216
                    /*
1217
                     * this character is not compressible (a BMP ideograph or similar);
1218
                     * switch to Unicode mode if this is the last character in the block
1219
                     * or there is at least one more ideograph following immediately
1220
                     */
1221
0
                    isSingleByteMode=FALSE;
1222
0
                    c|=SCU<<16;
1223
0
                    length=3;
1224
0
                    goto outputBytes;
1225
0
                } else {
1226
                    /* quote Unicode */
1227
0
                    c|=SQU<<16;
1228
0
                    length=3;
1229
0
                    goto outputBytes;
1230
0
                }
1231
0
            }
1232
1233
            /* normal end of conversion: prepare for a new character */
1234
0
            c=0;
1235
0
            sourceIndex=nextSourceIndex;
1236
0
        }
1237
0
    } else {
        /* c!=0: a lead surrogate is pending in c; continue by looking for its trail */
1238
0
        if(c!=0 && targetCapacity>0) {
1239
0
            goto getTrailUnicode;
1240
0
        }
1241
1242
        /* state machine for Unicode mode */
1243
/* unicodeByteMode: */
1244
0
        while(source<sourceLimit) {
1245
0
            if(targetCapacity<=0) {
1246
                /* target is full */
1247
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1248
0
                break;
1249
0
            }
1250
0
            c=*source++;
1251
0
            ++nextSourceIndex;
1252
1253
0
            if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
1254
                /* not compressible, write character directly */
1255
0
                if(targetCapacity>=2) {
1256
0
                    *target++=(uint8_t)(c>>8);
1257
0
                    *target++=(uint8_t)c;
1258
0
                    if(offsets!=NULL) {
1259
0
                        *offsets++=sourceIndex;
1260
0
                        *offsets++=sourceIndex;
1261
0
                    }
1262
0
                    targetCapacity-=2;
1263
0
                } else {
1264
0
                    length=2;
1265
0
                    goto outputBytes;
1266
0
                }
1267
0
            } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1268
                /* compress BMP character if the following one is not an uncompressible ideograph */
1269
0
                if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
1270
0
                    if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
1271
                        /* ASCII digit or letter */
1272
0
                        isSingleByteMode=TRUE;
                        /* c<0x80 here, so the result is the UCn command byte followed by c itself */
1273
0
                        c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
1274
0
                        length=2;
1275
0
                        goto outputBytes;
1276
0
                    } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1277
                        /* there is a dynamic window that contains this character, change to it */
1278
0
                        isSingleByteMode=TRUE;
1279
0
                        dynamicWindow=window;
1280
0
                        currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1281
0
                        useDynamicWindow(scsu, dynamicWindow);
1282
0
                        c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1283
0
                        length=2;
1284
0
                        goto outputBytes;
1285
0
                    } else if((code=getDynamicOffset(c, &offset))>=0) {
1286
                        /* define a dynamic window with this character */
1287
0
                        isSingleByteMode=TRUE;
1288
0
                        dynamicWindow=getNextDynamicWindow(scsu);
1289
0
                        currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1290
0
                        useDynamicWindow(scsu, dynamicWindow);
1291
0
                        c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1292
0
                        length=3;
1293
0
                        goto outputBytes;
1294
0
                    }
1295
0
                }
1296
1297
                /* don't know how to compress this character, just write it directly */
1298
0
                length=2;
1299
0
                goto outputBytes;
1300
0
            } else if(c<0xe000) {
1301
                /* c is a surrogate */
1302
0
                if(U16_IS_SURROGATE_LEAD(c)) {
1303
0
getTrailUnicode:
1304
0
                    lead=(UChar)c;
1305
0
                    if(source<sourceLimit) {
1306
                        /* test the following code unit */
1307
0
                        trail=*source;
1308
0
                        if(U16_IS_TRAIL(trail)) {
1309
0
                            ++source;
1310
0
                            ++nextSourceIndex;
1311
0
                            c=U16_GET_SUPPLEMENTARY(c, trail);
1312
                            /* convert this surrogate code point */
1313
                            /* exit this condition tree */
1314
0
                        } else {
1315
                            /* this is an unmatched lead code unit (1st surrogate) */
1316
                            /* callback(illegal) */
1317
0
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1318
0
                            goto endloop;
1319
0
                        }
1320
0
                    } else {
1321
                        /* no more input */
                        /* c still holds the lead surrogate; endloop stores it in cnv->fromUChar32 */
1322
0
                        break;
1323
0
                    }
1324
0
                } else {
1325
                    /* this is an unmatched trail code unit (2nd surrogate) */
1326
                    /* callback(illegal) */
1327
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1328
0
                    goto endloop;
1329
0
                }
1330
1331
                /* compress supplementary character */
1332
0
                if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
1333
0
                    !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1334
0
                ) {
1335
                    /*
1336
                     * there is a dynamic window that contains this character and
1337
                     * the following character is not uncompressible,
1338
                     * change to the window
1339
                     */
1340
0
                    isSingleByteMode=TRUE;
1341
0
                    dynamicWindow=window;
1342
0
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1343
0
                    useDynamicWindow(scsu, dynamicWindow);
1344
0
                    c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1345
0
                    length=2;
1346
0
                    goto outputBytes;
1347
0
                } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
1348
0
                          (code=getDynamicOffset(c, &offset))>=0
1349
0
                ) {
1350
                    /* two supplementary characters in (probably) the same window - define an extended one */
1351
0
                    isSingleByteMode=TRUE;
                    /* getDynamicOffset() returned an SDX/UDX-style code; subtract 0x200 to get the true code */
1352
0
                    code-=0x200;
1353
0
                    dynamicWindow=getNextDynamicWindow(scsu);
1354
0
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1355
0
                    useDynamicWindow(scsu, dynamicWindow);
1356
0
                    c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1357
0
                    length=4;
1358
0
                    goto outputBytes;
1359
0
                } else {
1360
                    /* don't know how to compress this character, just write it directly */
1361
0
                    c=((uint32_t)lead<<16)|trail;
1362
0
                    length=4;
1363
0
                    goto outputBytes;
1364
0
                }
1365
0
            } else /* 0xe000<=c<0xf300 */ {
1366
                /* quote to avoid SCSU tags */
1367
0
                c|=UQU<<16;
1368
0
                length=3;
1369
0
                goto outputBytes;
1370
0
            }
1371
1372
            /* normal end of conversion: prepare for a new character */
1373
0
            c=0;
1374
0
            sourceIndex=nextSourceIndex;
1375
0
        }
1376
0
    }
1377
0
endloop:
1378
1379
    /* set the converter state back into UConverter */
1380
0
    scsu->fromUIsSingleByteMode=isSingleByteMode;
1381
0
    scsu->fromUDynamicWindow=dynamicWindow;
1382
1383
0
    cnv->fromUChar32=c;
1384
1385
    /* write back the updated pointers */
1386
0
    pArgs->source=source;
1387
0
    pArgs->target=(char *)target;
1388
0
    pArgs->offsets=offsets;
1389
0
    return;
1390
1391
0
outputBytes:
1392
    /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1393
    /* from the first if in the loop we know that targetCapacity>0 */
    /* the bytes to write are the low `length` bytes of c, most significant byte first */
1394
0
    if(length<=targetCapacity) {
1395
0
        if(offsets==NULL) {
1396
0
            switch(length) {
1397
                /* each branch falls through to the next one */
1398
0
            case 4:
1399
0
                *target++=(uint8_t)(c>>24);
1400
0
                U_FALLTHROUGH;
1401
0
            case 3:
1402
0
                *target++=(uint8_t)(c>>16);
1403
0
                U_FALLTHROUGH;
1404
0
            case 2:
1405
0
                *target++=(uint8_t)(c>>8);
1406
0
                U_FALLTHROUGH;
1407
0
            case 1:
1408
0
                *target++=(uint8_t)c;
1409
0
                U_FALLTHROUGH;
1410
0
            default:
1411
                /* will never occur */
1412
0
                break;
1413
0
            }
1414
0
        } else {
1415
0
            switch(length) {
1416
                /* each branch falls through to the next one */
1417
0
            case 4:
1418
0
                *target++=(uint8_t)(c>>24);
1419
0
                *offsets++=sourceIndex;
1420
0
                U_FALLTHROUGH;
1421
0
            case 3:
1422
0
                *target++=(uint8_t)(c>>16);
1423
0
                *offsets++=sourceIndex;
1424
0
                U_FALLTHROUGH;
1425
0
            case 2:
1426
0
                *target++=(uint8_t)(c>>8);
1427
0
                *offsets++=sourceIndex;
1428
0
                U_FALLTHROUGH;
1429
0
            case 1:
1430
0
                *target++=(uint8_t)c;
1431
0
                *offsets++=sourceIndex;
1432
0
                U_FALLTHROUGH;
1433
0
            default:
1434
                /* will never occur */
1435
0
                break;
1436
0
            }
1437
0
        }
1438
0
        targetCapacity-=length;
1439
1440
        /* normal end of conversion: prepare for a new character */
1441
0
        c=0;
1442
0
        sourceIndex=nextSourceIndex;
1443
0
        goto loop;
1444
0
    } else {
1445
0
        uint8_t *p;
1446
1447
        /*
1448
         * We actually do this backwards here:
1449
         * In order to save an intermediate variable, we output
1450
         * first to the overflow buffer what does not fit into the
1451
         * regular target.
1452
         */
1453
        /* we know that 0<=targetCapacity<length<=4 */
1454
        /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
        /* from here on, length counts only the bytes that go into the overflow buffer */
1455
0
        length-=targetCapacity;
1456
0
        p=(uint8_t *)cnv->charErrorBuffer;
1457
0
        switch(length) {
1458
            /* each branch falls through to the next one */
1459
0
        case 4:
1460
0
            *p++=(uint8_t)(c>>24);
1461
0
            U_FALLTHROUGH;
1462
0
        case 3:
1463
0
            *p++=(uint8_t)(c>>16);
1464
0
            U_FALLTHROUGH;
1465
0
        case 2:
1466
0
            *p++=(uint8_t)(c>>8);
1467
0
            U_FALLTHROUGH;
1468
0
        case 1:
1469
0
            *p=(uint8_t)c;
1470
0
            U_FALLTHROUGH;
1471
0
        default:
1472
            /* will never occur */
1473
0
            break;
1474
0
        }
1475
0
        cnv->charErrorBufferLength=(int8_t)length;
1476
1477
        /* now output what fits into the regular target */
1478
0
        c>>=8*length; /* length was reduced by targetCapacity */
1479
0
        switch(targetCapacity) {
1480
            /* each branch falls through to the next one */
1481
0
        case 3:
1482
0
            *target++=(uint8_t)(c>>16);
1483
0
            if(offsets!=NULL) {
1484
0
                *offsets++=sourceIndex;
1485
0
            }
1486
0
            U_FALLTHROUGH;
1487
0
        case 2:
1488
0
            *target++=(uint8_t)(c>>8);
1489
0
            if(offsets!=NULL) {
1490
0
                *offsets++=sourceIndex;
1491
0
            }
1492
0
            U_FALLTHROUGH;
1493
0
        case 1:
1494
0
            *target++=(uint8_t)c;
1495
0
            if(offsets!=NULL) {
1496
0
                *offsets++=sourceIndex;
1497
0
            }
1498
0
            U_FALLTHROUGH;
1499
0
        default:
1500
0
            break;
1501
0
        }
1502
1503
        /* target overflow */
1504
0
        targetCapacity=0;
1505
0
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1506
0
        c=0;
1507
0
        goto endloop;
1508
0
    }
1509
0
}
1510
1511
/*
 * Convert UTF-16 text to SCSU bytes, without offset tracking.
 *
 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
 * If a change is made in the original function, then either
 * change this function the same way or
 * re-copy the original function and remove the variables
 * offsets, sourceIndex, and nextSourceIndex.
 *
 * pArgs supplies the converter and the source/target ranges; on return
 * pArgs->source and pArgs->target are advanced past what was consumed and
 * written. The encoder state (single-byte vs. Unicode mode, active dynamic
 * window) is read from and stored back into the SCSUData behind
 * cnv->extraInfo, and the current value of c is stored in cnv->fromUChar32
 * so that a lead surrogate at the end of the input can be resumed.
 *
 * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when the target is full
 * (bytes of a sequence that did not fit go to cnv->charErrorBuffer) and to
 * U_ILLEGAL_CHAR_FOUND for an unpaired surrogate.
 */
1518
static void U_CALLCONV
1519
_SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs,
1520
0
                 UErrorCode *pErrorCode) {
1521
0
    UConverter *cnv;
1522
0
    SCSUData *scsu;
1523
0
    const UChar *source, *sourceLimit;
1524
0
    uint8_t *target;
1525
0
    int32_t targetCapacity;
1526
1527
0
    UBool isSingleByteMode;
1528
0
    uint8_t dynamicWindow;
1529
0
    uint32_t currentOffset;
1530
1531
0
    uint32_t c, delta;
1532
1533
0
    int32_t length;
1534
1535
    /* variables for compression heuristics */
1536
0
    uint32_t offset;
1537
0
    UChar lead, trail;
1538
0
    int code;
1539
0
    int8_t window;
1540
1541
    /* fetch the converter and its SCSU-specific state object */
1542
0
    cnv=pArgs->converter;
1543
0
    scsu=(SCSUData *)cnv->extraInfo;
1544
1545
    /* set up the local source/target pointers and the remaining target room */
1546
0
    source=pArgs->source;
1547
0
    sourceLimit=pArgs->sourceLimit;
1548
0
    target=(uint8_t *)pArgs->target;
1549
0
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1550
1551
    /* get the state machine state */
1552
0
    isSingleByteMode=scsu->fromUIsSingleByteMode;
1553
0
    dynamicWindow=scsu->fromUDynamicWindow;
1554
0
    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1555
1556
    /* c!=0 means a code unit was carried over from the previous call */
0
    c=cnv->fromUChar32;
1557
1558
    /* similar conversion "loop" as in toUnicode */
1559
0
loop:
1560
0
    if(isSingleByteMode) {
1561
        /* resume a surrogate pair whose lead was left over from an earlier call */
0
        if(c!=0 && targetCapacity>0) {
1562
0
            goto getTrailSingle;
1563
0
        }
1564
1565
        /* state machine for single-byte mode */
1566
/* singleByteMode: */
1567
0
        while(source<sourceLimit) {
1568
0
            if(targetCapacity<=0) {
1569
                /* target is full */
1570
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1571
0
                break;
1572
0
            }
1573
0
            c=*source++;
1574
1575
            /* c is unsigned, so values below 0x20 wrap around and fail this range test */
0
            if((c-0x20)<=0x5f) {
1576
                /* pass US-ASCII graphic character through */
1577
0
                *target++=(uint8_t)c;
1578
0
                --targetCapacity;
1579
0
            } else if(c<0x20) {
1580
0
                if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1581
                    /* CR/LF/TAB/NUL */
1582
0
                    *target++=(uint8_t)c;
1583
0
                    --targetCapacity;
1584
0
                } else {
1585
                    /* quote C0 control character */
1586
0
                    c|=SQ0<<8;
1587
0
                    length=2;
1588
0
                    goto outputBytes;
1589
0
                }
1590
0
            } else if((delta=c-currentOffset)<=0x7f) {
1591
                /* use the current dynamic window */
1592
0
                *target++=(uint8_t)(delta|0x80);
1593
0
                --targetCapacity;
1594
0
            } else if(U16_IS_SURROGATE(c)) {
1595
0
                if(U16_IS_SURROGATE_LEAD(c)) {
1596
0
getTrailSingle:
1597
0
                    lead=(UChar)c;
1598
0
                    if(source<sourceLimit) {
1599
                        /* test the following code unit */
1600
0
                        trail=*source;
1601
0
                        if(U16_IS_TRAIL(trail)) {
1602
0
                            ++source;
1603
0
                            c=U16_GET_SUPPLEMENTARY(c, trail);
1604
                            /* convert this surrogate code point */
1605
                            /* exit this condition tree */
1606
0
                        } else {
1607
                            /* this is an unmatched lead code unit (1st surrogate) */
1608
                            /* callback(illegal) */
1609
0
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1610
0
                            goto endloop;
1611
0
                        }
1612
0
                    } else {
1613
                        /* no more input */
                        /* c still holds the lead; endloop saves it in cnv->fromUChar32 */
1614
0
                        break;
1615
0
                    }
1616
0
                } else {
1617
                    /* this is an unmatched trail code unit (2nd surrogate) */
1618
                    /* callback(illegal) */
1619
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1620
0
                    goto endloop;
1621
0
                }
1622
1623
                /* compress supplementary character U+10000..U+10ffff */
1624
0
                if((delta=c-currentOffset)<=0x7f) {
1625
                    /* use the current dynamic window */
1626
0
                    *target++=(uint8_t)(delta|0x80);
1627
0
                    --targetCapacity;
1628
0
                } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1629
                    /* there is a dynamic window that contains this character, change to it */
1630
0
                    dynamicWindow=window;
1631
0
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1632
0
                    useDynamicWindow(scsu, dynamicWindow);
1633
0
                    c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1634
0
                    length=2;
1635
0
                    goto outputBytes;
1636
0
                } else if((code=getDynamicOffset(c, &offset))>=0) {
1637
                    /* might check if there are more characters in this window to come */
1638
                    /* define an extended window with this character */
1639
0
                    code-=0x200;
1640
0
                    dynamicWindow=getNextDynamicWindow(scsu);
1641
0
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1642
0
                    useDynamicWindow(scsu, dynamicWindow);
1643
0
                    c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1644
0
                    length=4;
1645
0
                    goto outputBytes;
1646
0
                } else {
1647
                    /* change to Unicode mode and output this (lead, trail) pair */
                    /* SCU is written right here; outputBytes then copes with targetCapacity==0 */
1648
0
                    isSingleByteMode=FALSE;
1649
0
                    *target++=(uint8_t)SCU;
1650
0
                    --targetCapacity;
1651
0
                    c=((uint32_t)lead<<16)|trail;
1652
0
                    length=4;
1653
0
                    goto outputBytes;
1654
0
                }
1655
0
            } else if(c<0xa0) {
1656
                /* quote C1 control character */
1657
0
                c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
1658
0
                length=2;
1659
0
                goto outputBytes;
1660
0
            } else if(c==0xfeff || c>=0xfff0) {
1661
                /* quote signature character=byte order mark and specials */
1662
0
                c|=SQU<<16;
1663
0
                length=3;
1664
0
                goto outputBytes;
1665
0
            } else {
1666
                /* compress all other BMP characters */
1667
0
                if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1668
                    /* there is a window defined that contains this character - switch to it or quote from it? */
1669
0
                    if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
1670
                        /* change to dynamic window */
1671
0
                        dynamicWindow=window;
1672
0
                        currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1673
0
                        useDynamicWindow(scsu, dynamicWindow);
1674
0
                        c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1675
0
                        length=2;
1676
0
                        goto outputBytes;
1677
0
                    } else {
1678
                        /* quote from dynamic window */
1679
0
                        c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
1680
0
                        length=2;
1681
0
                        goto outputBytes;
1682
0
                    }
1683
0
                } else if((window=getWindow(staticOffsets, c))>=0) {
1684
                    /* quote from static window */
1685
0
                    c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
1686
0
                    length=2;
1687
0
                    goto outputBytes;
1688
0
                } else if((code=getDynamicOffset(c, &offset))>=0) {
1689
                    /* define a dynamic window with this character */
1690
0
                    dynamicWindow=getNextDynamicWindow(scsu);
1691
0
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1692
0
                    useDynamicWindow(scsu, dynamicWindow);
1693
0
                    c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1694
0
                    length=3;
1695
0
                    goto outputBytes;
1696
0
                } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
1697
0
                          (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1698
0
                ) {
1699
                    /*
1700
                     * this character is not compressible (a BMP ideograph or similar);
1701
                     * switch to Unicode mode if this is the last character in the block
1702
                     * or there is at least one more ideograph following immediately
1703
                     */
1704
0
                    isSingleByteMode=FALSE;
1705
0
                    c|=SCU<<16;
1706
0
                    length=3;
1707
0
                    goto outputBytes;
1708
0
                } else {
1709
                    /* quote Unicode */
1710
0
                    c|=SQU<<16;
1711
0
                    length=3;
1712
0
                    goto outputBytes;
1713
0
                }
1714
0
            }
1715
1716
            /* normal end of conversion: prepare for a new character */
1717
0
            c=0;
1718
0
        }
1719
0
    } else {
1720
        /* resume a surrogate pair whose lead was left over from an earlier call */
0
        if(c!=0 && targetCapacity>0) {
1721
0
            goto getTrailUnicode;
1722
0
        }
1723
1724
        /* state machine for Unicode mode */
1725
/* unicodeByteMode: */
1726
0
        while(source<sourceLimit) {
1727
0
            if(targetCapacity<=0) {
1728
                /* target is full */
1729
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1730
0
                break;
1731
0
            }
1732
0
            c=*source++;
1733
1734
0
            if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
1735
                /* not compressible, write character directly */
1736
0
                if(targetCapacity>=2) {
1737
0
                    *target++=(uint8_t)(c>>8);
1738
0
                    *target++=(uint8_t)c;
1739
0
                    targetCapacity-=2;
1740
0
                } else {
1741
                    /* only one byte of room: let outputBytes split the pair across target and overflow buffer */
0
                    length=2;
1742
0
                    goto outputBytes;
1743
0
                }
1744
0
            } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1745
                /* compress BMP character if the following one is not an uncompressible ideograph */
1746
0
                if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
1747
0
                    if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
1748
                        /* ASCII digit or letter */
1749
0
                        isSingleByteMode=TRUE;
1750
                        /* NOTE(review): the trailing |c is redundant with |=; the result is (UC0+window)<<8 | c */
0
                        c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
1751
0
                        length=2;
1752
0
                        goto outputBytes;
1753
0
                    } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1754
                        /* there is a dynamic window that contains this character, change to it */
1755
0
                        isSingleByteMode=TRUE;
1756
0
                        dynamicWindow=window;
1757
0
                        currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1758
0
                        useDynamicWindow(scsu, dynamicWindow);
1759
0
                        c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1760
0
                        length=2;
1761
0
                        goto outputBytes;
1762
0
                    } else if((code=getDynamicOffset(c, &offset))>=0) {
1763
                        /* define a dynamic window with this character */
1764
0
                        isSingleByteMode=TRUE;
1765
0
                        dynamicWindow=getNextDynamicWindow(scsu);
1766
0
                        currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1767
0
                        useDynamicWindow(scsu, dynamicWindow);
1768
0
                        c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1769
0
                        length=3;
1770
0
                        goto outputBytes;
1771
0
                    }
1772
0
                }
1773
1774
                /* don't know how to compress this character, just write it directly */
1775
0
                length=2;
1776
0
                goto outputBytes;
1777
0
            } else if(c<0xe000) {
1778
                /* c is a surrogate */
1779
0
                if(U16_IS_SURROGATE_LEAD(c)) {
1780
0
getTrailUnicode:
1781
0
                    lead=(UChar)c;
1782
0
                    if(source<sourceLimit) {
1783
                        /* test the following code unit */
1784
0
                        trail=*source;
1785
0
                        if(U16_IS_TRAIL(trail)) {
1786
0
                            ++source;
1787
0
                            c=U16_GET_SUPPLEMENTARY(c, trail);
1788
                            /* convert this surrogate code point */
1789
                            /* exit this condition tree */
1790
0
                        } else {
1791
                            /* this is an unmatched lead code unit (1st surrogate) */
1792
                            /* callback(illegal) */
1793
0
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1794
0
                            goto endloop;
1795
0
                        }
1796
0
                    } else {
1797
                        /* no more input */
                        /* c still holds the lead; endloop saves it in cnv->fromUChar32 */
1798
0
                        break;
1799
0
                    }
1800
0
                } else {
1801
                    /* this is an unmatched trail code unit (2nd surrogate) */
1802
                    /* callback(illegal) */
1803
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1804
0
                    goto endloop;
1805
0
                }
1806
1807
                /* compress supplementary character */
1808
0
                if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
1809
0
                    !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1810
0
                ) {
1811
                    /*
1812
                     * there is a dynamic window that contains this character and
1813
                     * the following character is not uncompressible,
1814
                     * change to the window
1815
                     */
1816
0
                    isSingleByteMode=TRUE;
1817
0
                    dynamicWindow=window;
1818
0
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1819
0
                    useDynamicWindow(scsu, dynamicWindow);
1820
0
                    c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1821
0
                    length=2;
1822
0
                    goto outputBytes;
1823
0
                } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
1824
0
                          (code=getDynamicOffset(c, &offset))>=0
1825
0
                ) {
1826
                    /* two supplementary characters in (probably) the same window - define an extended one */
1827
0
                    isSingleByteMode=TRUE;
1828
0
                    code-=0x200;
1829
0
                    dynamicWindow=getNextDynamicWindow(scsu);
1830
0
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1831
0
                    useDynamicWindow(scsu, dynamicWindow);
1832
0
                    c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1833
0
                    length=4;
1834
0
                    goto outputBytes;
1835
0
                } else {
1836
                    /* don't know how to compress this character, just write it directly */
1837
0
                    c=((uint32_t)lead<<16)|trail;
1838
0
                    length=4;
1839
0
                    goto outputBytes;
1840
0
                }
1841
0
            } else /* 0xe000<=c<0xf300 */ {
1842
                /* quote to avoid SCSU tags */
1843
0
                c|=UQU<<16;
1844
0
                length=3;
1845
0
                goto outputBytes;
1846
0
            }
1847
1848
            /* normal end of conversion: prepare for a new character */
1849
0
            c=0;
1850
0
        }
1851
0
    }
1852
0
endloop:
1853
1854
    /* set the converter state back into UConverter */
1855
0
    scsu->fromUIsSingleByteMode=isSingleByteMode;
1856
0
    scsu->fromUDynamicWindow=dynamicWindow;
1857
1858
    /* c is 0 after a completed character, otherwise the pending or offending code unit */
0
    cnv->fromUChar32=c;
1859
1860
    /* write back the updated pointers */
1861
0
    pArgs->source=source;
1862
0
    pArgs->target=(char *)target;
1863
0
    return;
1864
1865
0
outputBytes:
1866
    /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1867
    /* from the first if in the loop we know that targetCapacity>0 */
    /* c holds "length" bytes right-aligned; they are emitted most significant byte first */
1868
0
    if(length<=targetCapacity) {
1869
0
        switch(length) {
1870
            /* each branch falls through to the next one */
1871
0
        case 4:
1872
0
            *target++=(uint8_t)(c>>24);
1873
0
            U_FALLTHROUGH;
1874
0
        case 3:
1875
0
            *target++=(uint8_t)(c>>16);
1876
0
            U_FALLTHROUGH;
1877
0
        case 2:
1878
0
            *target++=(uint8_t)(c>>8);
1879
0
            U_FALLTHROUGH;
1880
0
        case 1:
1881
0
            *target++=(uint8_t)c;
1882
0
            U_FALLTHROUGH;
1883
0
        default:
1884
            /* will never occur */
1885
0
            break;
1886
0
        }
1887
0
        targetCapacity-=length;
1888
1889
        /* normal end of conversion: prepare for a new character */
1890
0
        c=0;
1891
0
        goto loop;
1892
0
    } else {
1893
0
        uint8_t *p;
1894
1895
        /*
1896
         * We actually do this backwards here:
1897
         * In order to save an intermediate variable, we output
1898
         * first to the overflow buffer what does not fit into the
1899
         * regular target.
1900
         */
1901
        /* we know that 0<=targetCapacity<length<=4 */
1902
        /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1903
0
        length-=targetCapacity;
1904
0
        p=(uint8_t *)cnv->charErrorBuffer;
1905
0
        switch(length) {
1906
            /* each branch falls through to the next one */
1907
0
        case 4:
1908
0
            *p++=(uint8_t)(c>>24);
1909
0
            U_FALLTHROUGH;
1910
0
        case 3:
1911
0
            *p++=(uint8_t)(c>>16);
1912
0
            U_FALLTHROUGH;
1913
0
        case 2:
1914
0
            *p++=(uint8_t)(c>>8);
1915
0
            U_FALLTHROUGH;
1916
0
        case 1:
1917
0
            *p=(uint8_t)c;
1918
0
            U_FALLTHROUGH;
1919
0
        default:
1920
            /* will never occur */
1921
0
            break;
1922
0
        }
1923
0
        cnv->charErrorBufferLength=(int8_t)length;
1924
1925
        /* now output what fits into the regular target */
1926
0
        c>>=8*length; /* length was reduced by targetCapacity */
1927
0
        switch(targetCapacity) {
1928
            /* each branch falls through to the next one */
1929
0
        case 3:
1930
0
            *target++=(uint8_t)(c>>16);
1931
0
            U_FALLTHROUGH;
1932
0
        case 2:
1933
0
            *target++=(uint8_t)(c>>8);
1934
0
            U_FALLTHROUGH;
1935
0
        case 1:
1936
0
            *target++=(uint8_t)c;
1937
0
            U_FALLTHROUGH;
1938
0
        default:
1939
0
            break;
1940
0
        }
1941
1942
        /* target overflow */
1943
0
        targetCapacity=0;
1944
0
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1945
0
        c=0;
1946
0
        goto endloop;
1947
0
    }
1948
0
}
1949
1950
/* miscellaneous ------------------------------------------------------------ */
1951
1952
static const char *  U_CALLCONV
1953
0
_SCSUGetName(const UConverter *cnv) {
1954
0
    SCSUData *scsu=(SCSUData *)cnv->extraInfo;
1955
1956
0
    switch(scsu->locale) {
1957
0
    case l_ja:
1958
0
        return "SCSU,locale=ja";
1959
0
    default:
1960
0
        return "SCSU";
1961
0
    }
1962
0
}
1963
1964
/*
 * Structure for SafeClone calculations: its size is what _SCSUSafeClone
 * reports as the required buffer size, and the clone buffer is accessed
 * through it. The UConverter comes first, followed by the copy of the
 * SCSU state that the clone's extraInfo is pointed at.
 */
1965
struct cloneSCSUStruct
1966
{
1967
    UConverter cnv;
1968
    SCSUData mydata;
1969
};
1970
1971
/*
 * SafeClone hook for SCSU converters.
 *
 * Does nothing and returns NULL if *status already indicates a failure.
 * If *pBufferSize is 0 this is a preflighting request: the required size
 * (sizeof(struct cloneSCSUStruct)) is stored into *pBufferSize and NULL is
 * returned. Otherwise stackBuffer is treated as a cloneSCSUStruct whose
 * UConverter part has already been filled in by the caller; the SCSU state
 * is copied next to it and hooked up as the clone's extraInfo.
 */
static UConverter *  U_CALLCONV
_SCSUSafeClone(const UConverter *cnv, 
               void *stackBuffer, 
               int32_t *pBufferSize, 
               UErrorCode *status)
{
    struct cloneSCSUStruct *clone;

    if (U_FAILURE(*status)) {
        return NULL;
    }

    if (*pBufferSize == 0) {
        /* 'preflighting' request - only report the needed size */
        *pBufferSize = (int32_t)sizeof(struct cloneSCSUStruct);
        return NULL;
    }

    /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
    clone = (struct cloneSCSUStruct *)stackBuffer;

    /* duplicate the SCSU state into the clone's own storage */
    uprv_memcpy(&clone->mydata, cnv->extraInfo, sizeof(SCSUData));

    /* the state lives inside the clone buffer rather than in a separate allocation */
    clone->cnv.isExtraLocal = TRUE;
    clone->cnv.extraInfo = &clone->mydata;

    return &clone->cnv;
}
1998
U_CDECL_END
1999
2000
/*
 * Implementation table for the SCSU converter: the converter type followed
 * by the function pointers defined in this file (open/close/reset, the
 * to/from-Unicode functions with and without offsets, get-name, safe-clone)
 * and ucnv_getCompleteUnicodeSet.
 * The initializer is positional; NULL entries are presumably slots that
 * this converter does not implement - confirm against the UConverterImpl
 * declaration before reordering or adding entries.
 */
static const UConverterImpl _SCSUImpl={
2001
    UCNV_SCSU,
2002
2003
    NULL,
2004
    NULL,
2005
2006
    _SCSUOpen,
2007
    _SCSUClose,
2008
    _SCSUReset,
2009
2010
    _SCSUToUnicode,
2011
    _SCSUToUnicodeWithOffsets,
2012
    _SCSUFromUnicode,
2013
    _SCSUFromUnicodeWithOffsets,
2014
    NULL,
2015
2016
    NULL,
2017
    _SCSUGetName,
2018
    NULL,
2019
    _SCSUSafeClone,
2020
    ucnv_getCompleteUnicodeSet,
2021
    NULL,
2022
    NULL
2023
};
2024
2025
/*
 * Static description of the SCSU converter: structure size, name, CCSID,
 * platform and converter type, the minimum and maximum number of bytes per
 * UChar, and the substitution bytes with their length.
 * The initializer is positional; the meaning of the trailing flag and zero
 * fields is not visible here - see the UConverterStaticData declaration.
 */
static const UConverterStaticData _SCSUStaticData={
2026
    sizeof(UConverterStaticData),
2027
    "SCSU",
2028
    1212, /* CCSID for SCSU */
2029
    UCNV_IBM, UCNV_SCSU,
2030
    1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
2031
    /*
2032
     * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
2033
     * substitution string.
2034
     */
2035
    { 0x0e, 0xff, 0xfd, 0 }, 3,
2036
    FALSE, FALSE,
2037
    0,
2038
    0,
2039
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
2040
};
2041
2042
/*
 * Shared-data object for SCSU: ties the static description (_SCSUStaticData)
 * to the implementation table (_SCSUImpl) via the immutable-shared-data
 * initializer macro.
 */
const UConverterSharedData _SCSUData=
2043
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_SCSUStaticData, &_SCSUImpl);
2044
2045
#endif