Coverage Report

Created: 2026-06-13 06:44

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/source/common/ucnvscsu.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
******************************************************************************
5
*
6
*   Copyright (C) 2000-2016, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
******************************************************************************
10
*   file name:  ucnvscsu.c
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2000nov18
16
*   created by: Markus W. Scherer
17
*
18
*   This is an implementation of the Standard Compression Scheme for Unicode
19
*   as defined in https://www.unicode.org/reports/tr6/ .
20
*   Reserved commands and window settings are treated as illegal sequences and
21
*   will result in callback calls.
22
*/
23
24
#include "unicode/utypes.h"
25
26
#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
27
28
#include "unicode/ucnv.h"
29
#include "unicode/ucnv_cb.h"
30
#include "unicode/utf16.h"
31
#include "ucnv_bld.h"
32
#include "ucnv_cnv.h"
33
#include "cmemory.h"
34
35
/* SCSU definitions --------------------------------------------------------- */
36
37
/* SCSU command byte values */
38
enum {
    /* ---- tags recognized in single-byte mode ---- */
    SQ0=0x01, /* first "quote from window pair" tag (pair 0) */
    SQ7=0x08, /* last "quote from window pair" tag (pair 7) */
    SDX=0x0b, /* define a window as extended */
    Srs=0x0c, /* reserved */
    SQU=0x0e, /* quote a single Unicode character */
    SCU=0x0f, /* change to Unicode mode */
    SC0=0x10, /* first "select window" tag (window 0) */
    SC7=0x17, /* last "select window" tag (window 7) */
    SD0=0x18, /* first "define and select window" tag (window 0) */
    SD7=0x1f, /* last "define and select window" tag (window 7) */

    /* ---- tags recognized in Unicode mode ---- */
    UC0=0xe0, /* first "select window" tag (window 0) */
    UC7=0xe7, /* last "select window" tag (window 7) */
    UD0=0xe8, /* first "define and select window" tag (window 0) */
    UD7=0xef, /* last "define and select window" tag (window 7) */
    UQU=0xf0, /* quote a single Unicode character */
    UDX=0xf1, /* define a window as extended */
    Urs=0xf2  /* reserved */
};
58
59
enum {
    /*
     * Window-offset bytes below gapThreshold are used as (byte<<7).
     * Unicode code points from 3400 to E000 are not addressable by a
     * dynamic window, since no short-run alphabets are found in these
     * areas. Therefore gapOffset is added to the offsets derived from
     * bytes starting at gapThreshold.
     */
    gapThreshold=0x68,
    gapOffset=0xac00,

    /* bytes from reservedStart up to (not including) fixedThreshold are reserved */
    reservedStart=0xa8,

    /* bytes from fixedThreshold on index the table of predefined fixed offsets */
    fixedThreshold=0xf9
};
74
75
/* constant offsets for the 8 static windows */
76
static const uint32_t staticOffsets[8]={
    /* window 0 */ 0x0000, /* ASCII for quoted tags */
    /* window 1 */ 0x0080, /* Latin-1 Supplement (for access to punctuation) */
    /* window 2 */ 0x0100, /* Latin Extended-A */
    /* window 3 */ 0x0300, /* Combining Diacritical Marks */
    /* window 4 */ 0x2000, /* General Punctuation */
    /* window 5 */ 0x2080, /* Currency Symbols */
    /* window 6 */ 0x2100, /* Letterlike Symbols and Number Forms */
    /* window 7 */ 0x3000  /* CJK Symbols and Punctuation */
};
86
87
/* initial offsets for the 8 dynamic (sliding) windows */
88
static const uint32_t initialDynamicOffsets[8]={
    /* window 0 */ 0x0080, /* Latin-1 */
    /* window 1 */ 0x00c0, /* Latin Extended A */
    /* window 2 */ 0x0400, /* Cyrillic */
    /* window 3 */ 0x0600, /* Arabic */
    /* window 4 */ 0x0900, /* Devanagari */
    /* window 5 */ 0x3040, /* Hiragana */
    /* window 6 */ 0x30a0, /* Katakana */
    /* window 7 */ 0xff00  /* Fullwidth ASCII */
};
98
99
/* Table of fixed predefined Offsets */
100
static const uint32_t fixedOffsets[7]={
    0x00c0, /* offset byte 0xF9: Latin-1 Letters + half of Latin Extended A */
    0x0250, /* offset byte 0xFA: IPA extensions */
    0x0370, /* offset byte 0xFB: Greek */
    0x0530, /* offset byte 0xFC: Armenian */
    0x3040, /* offset byte 0xFD: Hiragana */
    0x30a0, /* offset byte 0xFE: Katakana */
    0xff60  /* offset byte 0xFF: Halfwidth Katakana */
};
109
110
/* state values */
111
enum {
    readCommand=0,   /* not inside a multi-byte sequence; next byte is a tag or character */
    quotePairOne=1,  /* expecting the first byte of a quoted 16-bit unit */
    quotePairTwo=2,  /* expecting the second byte of a 16-bit unit */
    quoteOne=3,      /* expecting the single byte quoted from a window */
    definePairOne=4, /* expecting the first byte of an extended window definition */
    definePairTwo=5, /* expecting the second byte of an extended window definition */
    defineOne=6      /* expecting the one-byte window offset of a window definition */
};
120
121
typedef struct SCSUData {
122
    /* dynamic window offsets, initialize to default values from initialDynamicOffsets */
123
    uint32_t toUDynamicOffsets[8];
124
    uint32_t fromUDynamicOffsets[8];
125
126
    /* state machine state - toUnicode */
127
    UBool toUIsSingleByteMode;
128
    uint8_t toUState;
129
    int8_t toUQuoteWindow, toUDynamicWindow;
130
    uint8_t toUByteOne;
131
    uint8_t toUPadding[3];
132
133
    /* state machine state - fromUnicode */
134
    UBool fromUIsSingleByteMode;
135
    int8_t fromUDynamicWindow;
136
137
    /*
138
     * windowUse[] keeps track of the use of the dynamic windows:
139
     * At nextWindowUseIndex there is the least recently used window,
140
     * and the following windows (in a wrapping manner) are more and more
141
     * recently used.
142
     * At nextWindowUseIndex-1 there is the most recently used window.
143
     */
144
    uint8_t locale;
145
    int8_t nextWindowUseIndex;
146
    int8_t windowUse[8];
147
} SCSUData;
148
149
/* initial contents of SCSUData.windowUse[]: generic order, and the order used for locale l_ja */
static const int8_t initialWindowUse[8]={
    7, 0, 3, 2, 4, 5, 6, 1
};
static const int8_t initialWindowUse_ja[8]={
    3, 2, 4, 1, 0, 7, 5, 6
};
151
152
/* values for SCSUData.locale */
enum {
    lGeneric=0, /* any locale other than "ja" / "ja_..." */
    l_ja=1      /* Japanese */
};
155
156
/* SCSU setup functions ----------------------------------------------------- */
157
U_CDECL_BEGIN
158
static void U_CALLCONV
159
291
_SCSUReset(UConverter *cnv, UConverterResetChoice choice) {
160
291
    SCSUData *scsu=(SCSUData *)cnv->extraInfo;
161
162
291
    if(choice<=UCNV_RESET_TO_UNICODE) {
163
        /* reset toUnicode */
164
291
        uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32);
165
166
291
        scsu->toUIsSingleByteMode=true;
167
291
        scsu->toUState=readCommand;
168
291
        scsu->toUQuoteWindow=scsu->toUDynamicWindow=0;
169
291
        scsu->toUByteOne=0;
170
171
291
        cnv->toULength=0;
172
291
    }
173
291
    if(choice!=UCNV_RESET_TO_UNICODE) {
174
        /* reset fromUnicode */
175
291
        uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32);
176
177
291
        scsu->fromUIsSingleByteMode=true;
178
291
        scsu->fromUDynamicWindow=0;
179
180
291
        scsu->nextWindowUseIndex=0;
181
291
        switch(scsu->locale) {
182
0
        case l_ja:
183
0
            uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8);
184
0
            break;
185
291
        default:
186
291
            uprv_memcpy(scsu->windowUse, initialWindowUse, 8);
187
291
            break;
188
291
        }
189
190
291
        cnv->fromUChar32=0;
191
291
    }
192
291
}
193
194
static void U_CALLCONV
195
_SCSUOpen(UConverter *cnv,
196
          UConverterLoadArgs *pArgs,
197
291
          UErrorCode *pErrorCode) {
198
291
    const char *locale=pArgs->locale;
199
291
    if(pArgs->onlyTestIsLoadable) {
200
0
        return;
201
0
    }
202
291
    cnv->extraInfo=uprv_malloc(sizeof(SCSUData));
203
291
    if(cnv->extraInfo!=nullptr) {
204
291
        if(locale!=nullptr && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) {
205
0
            ((SCSUData *)cnv->extraInfo)->locale=l_ja;
206
291
        } else {
207
291
            ((SCSUData *)cnv->extraInfo)->locale=lGeneric;
208
291
        }
209
291
        _SCSUReset(cnv, UCNV_RESET_BOTH);
210
291
    } else {
211
0
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
212
0
    }
213
214
    /* Set the substitution character U+fffd as a Unicode string. */
215
291
    cnv->subUChars[0]=0xfffd;
216
291
    cnv->subCharLen=-1;
217
291
}
218
219
static void U_CALLCONV
220
291
_SCSUClose(UConverter *cnv) {
221
291
    if(cnv->extraInfo!=nullptr) {
222
291
        if(!cnv->isExtraLocal) {
223
291
            uprv_free(cnv->extraInfo);
224
291
        }
225
291
        cnv->extraInfo=nullptr;
226
291
    }
227
291
}
228
229
/* SCSU-to-Unicode conversion functions ------------------------------------- */
230
231
static void U_CALLCONV
232
_SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
233
0
                          UErrorCode *pErrorCode) {
234
0
    UConverter *cnv;
235
0
    SCSUData *scsu;
236
0
    const uint8_t *source, *sourceLimit;
237
0
    char16_t *target;
238
0
    const char16_t *targetLimit;
239
0
    int32_t *offsets;
240
0
    UBool isSingleByteMode;
241
0
    uint8_t state, byteOne;
242
0
    int8_t quoteWindow, dynamicWindow;
243
244
0
    int32_t sourceIndex, nextSourceIndex;
245
246
0
    uint8_t b;
247
248
    /* set up the local pointers */
249
0
    cnv=pArgs->converter;
250
0
    scsu=(SCSUData *)cnv->extraInfo;
251
252
0
    source=(const uint8_t *)pArgs->source;
253
0
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
254
0
    target=pArgs->target;
255
0
    targetLimit=pArgs->targetLimit;
256
0
    offsets=pArgs->offsets;
257
258
    /* get the state machine state */
259
0
    isSingleByteMode=scsu->toUIsSingleByteMode;
260
0
    state=scsu->toUState;
261
0
    quoteWindow=scsu->toUQuoteWindow;
262
0
    dynamicWindow=scsu->toUDynamicWindow;
263
0
    byteOne=scsu->toUByteOne;
264
265
    /* sourceIndex=-1 if the current character began in the previous buffer */
266
0
    sourceIndex=state==readCommand ? 0 : -1;
267
0
    nextSourceIndex=0;
268
269
    /*
270
     * conversion "loop"
271
     *
272
     * For performance, this is not a normal C loop.
273
     * Instead, there are two code blocks for the two SCSU modes.
274
     * The function branches to either one, and a change of the mode is done with a goto to
275
     * the other branch.
276
     *
277
     * Each branch has two conventional loops:
278
     * - a fast-path loop for the most common codes in the mode
279
     * - a loop for all other codes in the mode
280
     * When the fast-path runs into a code that it cannot handle, its loop ends and it
281
     * runs into the following loop to handle the other codes.
282
     * The end of the input or output buffer is also handled by the slower loop.
283
     * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
284
     *
285
     * The callback handling is done by returning with an error code.
286
     * The conversion framework actually calls the callback function.
287
     */
288
0
    if(isSingleByteMode) {
289
        /* fast path for single-byte mode */
290
0
        if(state==readCommand) {
291
0
fastSingle:
292
0
            while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
293
0
                ++source;
294
0
                ++nextSourceIndex;
295
0
                if(b<=0x7f) {
296
                    /* write US-ASCII graphic character or DEL */
297
0
                    *target++=(char16_t)b;
298
0
                    if(offsets!=nullptr) {
299
0
                        *offsets++=sourceIndex;
300
0
                    }
301
0
                } else {
302
                    /* write from dynamic window */
303
0
                    uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
304
0
                    if(c<=0xffff) {
305
0
                        *target++=(char16_t)c;
306
0
                        if(offsets!=nullptr) {
307
0
                            *offsets++=sourceIndex;
308
0
                        }
309
0
                    } else {
310
                        /* output surrogate pair */
311
0
                        *target++=(char16_t)(0xd7c0+(c>>10));
312
0
                        if(target<targetLimit) {
313
0
                            *target++=(char16_t)(0xdc00|(c&0x3ff));
314
0
                            if(offsets!=nullptr) {
315
0
                                *offsets++=sourceIndex;
316
0
                                *offsets++=sourceIndex;
317
0
                            }
318
0
                        } else {
319
                            /* target overflow */
320
0
                            if(offsets!=nullptr) {
321
0
                                *offsets++=sourceIndex;
322
0
                            }
323
0
                            cnv->UCharErrorBuffer[0]=(char16_t)(0xdc00|(c&0x3ff));
324
0
                            cnv->UCharErrorBufferLength=1;
325
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
326
0
                            goto endloop;
327
0
                        }
328
0
                    }
329
0
                }
330
0
                sourceIndex=nextSourceIndex;
331
0
            }
332
0
        }
333
334
        /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
335
0
singleByteMode:
336
0
        while(source<sourceLimit) {
337
0
            if(target>=targetLimit) {
338
                /* target is full */
339
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
340
0
                break;
341
0
            }
342
0
            b=*source++;
343
0
            ++nextSourceIndex;
344
0
            switch(state) {
345
0
            case readCommand:
346
                /* redundant conditions are commented out */
347
                /* here: b<0x20 because otherwise we would be in fastSingle */
348
0
                if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
349
                    /* CR/LF/TAB/NUL */
350
0
                    *target++=(char16_t)b;
351
0
                    if(offsets!=nullptr) {
352
0
                        *offsets++=sourceIndex;
353
0
                    }
354
0
                    sourceIndex=nextSourceIndex;
355
0
                    goto fastSingle;
356
0
                } else if(SC0<=b) {
357
0
                    if(b<=SC7) {
358
0
                        dynamicWindow=(int8_t)(b-SC0);
359
0
                        sourceIndex=nextSourceIndex;
360
0
                        goto fastSingle;
361
0
                    } else /* if(SD0<=b && b<=SD7) */ {
362
0
                        dynamicWindow=(int8_t)(b-SD0);
363
0
                        state=defineOne;
364
0
                    }
365
0
                } else if(/* SQ0<=b && */ b<=SQ7) {
366
0
                    quoteWindow=(int8_t)(b-SQ0);
367
0
                    state=quoteOne;
368
0
                } else if(b==SDX) {
369
0
                    state=definePairOne;
370
0
                } else if(b==SQU) {
371
0
                    state=quotePairOne;
372
0
                } else if(b==SCU) {
373
0
                    sourceIndex=nextSourceIndex;
374
0
                    isSingleByteMode=false;
375
0
                    goto fastUnicode;
376
0
                } else /* Srs */ {
377
                    /* callback(illegal) */
378
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
379
0
                    cnv->toUBytes[0]=b;
380
0
                    cnv->toULength=1;
381
0
                    goto endloop;
382
0
                }
383
384
                /* store the first byte of a multibyte sequence in toUBytes[] */
385
0
                cnv->toUBytes[0]=b;
386
0
                cnv->toULength=1;
387
0
                break;
388
0
            case quotePairOne:
389
0
                byteOne=b;
390
0
                cnv->toUBytes[1]=b;
391
0
                cnv->toULength=2;
392
0
                state=quotePairTwo;
393
0
                break;
394
0
            case quotePairTwo:
395
0
                *target++=(char16_t)((byteOne<<8)|b);
396
0
                if(offsets!=nullptr) {
397
0
                    *offsets++=sourceIndex;
398
0
                }
399
0
                sourceIndex=nextSourceIndex;
400
0
                state=readCommand;
401
0
                goto fastSingle;
402
0
            case quoteOne:
403
0
                if(b<0x80) {
404
                    /* all static offsets are in the BMP */
405
0
                    *target++=(char16_t)(staticOffsets[quoteWindow]+b);
406
0
                    if(offsets!=nullptr) {
407
0
                        *offsets++=sourceIndex;
408
0
                    }
409
0
                } else {
410
                    /* write from dynamic window */
411
0
                    uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
412
0
                    if(c<=0xffff) {
413
0
                        *target++=(char16_t)c;
414
0
                        if(offsets!=nullptr) {
415
0
                            *offsets++=sourceIndex;
416
0
                        }
417
0
                    } else {
418
                        /* output surrogate pair */
419
0
                        *target++=(char16_t)(0xd7c0+(c>>10));
420
0
                        if(target<targetLimit) {
421
0
                            *target++=(char16_t)(0xdc00|(c&0x3ff));
422
0
                            if(offsets!=nullptr) {
423
0
                                *offsets++=sourceIndex;
424
0
                                *offsets++=sourceIndex;
425
0
                            }
426
0
                        } else {
427
                            /* target overflow */
428
0
                            if(offsets!=nullptr) {
429
0
                                *offsets++=sourceIndex;
430
0
                            }
431
0
                            cnv->UCharErrorBuffer[0]=(char16_t)(0xdc00|(c&0x3ff));
432
0
                            cnv->UCharErrorBufferLength=1;
433
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
434
0
                            goto endloop;
435
0
                        }
436
0
                    }
437
0
                }
438
0
                sourceIndex=nextSourceIndex;
439
0
                state=readCommand;
440
0
                goto fastSingle;
441
0
            case definePairOne:
442
0
                dynamicWindow=(int8_t)((b>>5)&7);
443
0
                byteOne=(uint8_t)(b&0x1f);
444
0
                cnv->toUBytes[1]=b;
445
0
                cnv->toULength=2;
446
0
                state=definePairTwo;
447
0
                break;
448
0
            case definePairTwo:
449
0
                scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
450
0
                sourceIndex=nextSourceIndex;
451
0
                state=readCommand;
452
0
                goto fastSingle;
453
0
            case defineOne:
454
0
                if(b==0) {
455
                    /* callback(illegal): Reserved window offset value 0 */
456
0
                    cnv->toUBytes[1]=b;
457
0
                    cnv->toULength=2;
458
0
                    goto endloop;
459
0
                } else if(b<gapThreshold) {
460
0
                    scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
461
0
                } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
462
0
                    scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
463
0
                } else if(b>=fixedThreshold) {
464
0
                    scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
465
0
                } else {
466
                    /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
467
0
                    cnv->toUBytes[1]=b;
468
0
                    cnv->toULength=2;
469
0
                    goto endloop;
470
0
                }
471
0
                sourceIndex=nextSourceIndex;
472
0
                state=readCommand;
473
0
                goto fastSingle;
474
0
            }
475
0
        }
476
0
    } else {
477
        /* fast path for Unicode mode */
478
0
        if(state==readCommand) {
479
0
fastUnicode:
480
0
            while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
481
0
                *target++=(char16_t)((b<<8)|source[1]);
482
0
                if(offsets!=nullptr) {
483
0
                    *offsets++=sourceIndex;
484
0
                }
485
0
                sourceIndex=nextSourceIndex;
486
0
                nextSourceIndex+=2;
487
0
                source+=2;
488
0
            }
489
0
        }
490
491
        /* normal state machine for Unicode mode */
492
/* unicodeByteMode: */
493
0
        while(source<sourceLimit) {
494
0
            if(target>=targetLimit) {
495
                /* target is full */
496
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
497
0
                break;
498
0
            }
499
0
            b=*source++;
500
0
            ++nextSourceIndex;
501
0
            switch(state) {
502
0
            case readCommand:
503
0
                if((uint8_t)(b-UC0)>(Urs-UC0)) {
504
0
                    byteOne=b;
505
0
                    cnv->toUBytes[0]=b;
506
0
                    cnv->toULength=1;
507
0
                    state=quotePairTwo;
508
0
                } else if(/* UC0<=b && */ b<=UC7) {
509
0
                    dynamicWindow=(int8_t)(b-UC0);
510
0
                    sourceIndex=nextSourceIndex;
511
0
                    isSingleByteMode=true;
512
0
                    goto fastSingle;
513
0
                } else if(/* UD0<=b && */ b<=UD7) {
514
0
                    dynamicWindow=(int8_t)(b-UD0);
515
0
                    isSingleByteMode=true;
516
0
                    cnv->toUBytes[0]=b;
517
0
                    cnv->toULength=1;
518
0
                    state=defineOne;
519
0
                    goto singleByteMode;
520
0
                } else if(b==UDX) {
521
0
                    isSingleByteMode=true;
522
0
                    cnv->toUBytes[0]=b;
523
0
                    cnv->toULength=1;
524
0
                    state=definePairOne;
525
0
                    goto singleByteMode;
526
0
                } else if(b==UQU) {
527
0
                    cnv->toUBytes[0]=b;
528
0
                    cnv->toULength=1;
529
0
                    state=quotePairOne;
530
0
                } else /* Urs */ {
531
                    /* callback(illegal) */
532
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
533
0
                    cnv->toUBytes[0]=b;
534
0
                    cnv->toULength=1;
535
0
                    goto endloop;
536
0
                }
537
0
                break;
538
0
            case quotePairOne:
539
0
                byteOne=b;
540
0
                cnv->toUBytes[1]=b;
541
0
                cnv->toULength=2;
542
0
                state=quotePairTwo;
543
0
                break;
544
0
            case quotePairTwo:
545
0
                *target++=(char16_t)((byteOne<<8)|b);
546
0
                if(offsets!=nullptr) {
547
0
                    *offsets++=sourceIndex;
548
0
                }
549
0
                sourceIndex=nextSourceIndex;
550
0
                state=readCommand;
551
0
                goto fastUnicode;
552
0
            }
553
0
        }
554
0
    }
555
0
endloop:
556
557
    /* set the converter state back into UConverter */
558
0
    if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
559
        /* reset to deal with the next character */
560
0
        state=readCommand;
561
0
    } else if(state==readCommand) {
562
        /* not in a multi-byte sequence, reset toULength */
563
0
        cnv->toULength=0;
564
0
    }
565
0
    scsu->toUIsSingleByteMode=isSingleByteMode;
566
0
    scsu->toUState=state;
567
0
    scsu->toUQuoteWindow=quoteWindow;
568
0
    scsu->toUDynamicWindow=dynamicWindow;
569
0
    scsu->toUByteOne=byteOne;
570
571
    /* write back the updated pointers */
572
0
    pArgs->source=(const char *)source;
573
0
    pArgs->target=target;
574
0
    pArgs->offsets=offsets;
575
0
}
576
577
/*
578
 * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
579
 * If a change is made in the original function, then either
580
 * change this function the same way or
581
 * re-copy the original function and remove the variables
582
 * offsets, sourceIndex, and nextSourceIndex.
583
 */
584
static void U_CALLCONV
585
_SCSUToUnicode(UConverterToUnicodeArgs *pArgs,
586
0
               UErrorCode *pErrorCode) {
587
0
    UConverter *cnv;
588
0
    SCSUData *scsu;
589
0
    const uint8_t *source, *sourceLimit;
590
0
    char16_t *target;
591
0
    const char16_t *targetLimit;
592
0
    UBool isSingleByteMode;
593
0
    uint8_t state, byteOne;
594
0
    int8_t quoteWindow, dynamicWindow;
595
596
0
    uint8_t b;
597
598
    /* set up the local pointers */
599
0
    cnv=pArgs->converter;
600
0
    scsu=(SCSUData *)cnv->extraInfo;
601
602
0
    source=(const uint8_t *)pArgs->source;
603
0
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
604
0
    target=pArgs->target;
605
0
    targetLimit=pArgs->targetLimit;
606
607
    /* get the state machine state */
608
0
    isSingleByteMode=scsu->toUIsSingleByteMode;
609
0
    state=scsu->toUState;
610
0
    quoteWindow=scsu->toUQuoteWindow;
611
0
    dynamicWindow=scsu->toUDynamicWindow;
612
0
    byteOne=scsu->toUByteOne;
613
614
    /*
615
     * conversion "loop"
616
     *
617
     * For performance, this is not a normal C loop.
618
     * Instead, there are two code blocks for the two SCSU modes.
619
     * The function branches to either one, and a change of the mode is done with a goto to
620
     * the other branch.
621
     *
622
     * Each branch has two conventional loops:
623
     * - a fast-path loop for the most common codes in the mode
624
     * - a loop for all other codes in the mode
625
     * When the fast-path runs into a code that it cannot handle, its loop ends and it
626
     * runs into the following loop to handle the other codes.
627
     * The end of the input or output buffer is also handled by the slower loop.
628
     * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
629
     *
630
     * The callback handling is done by returning with an error code.
631
     * The conversion framework actually calls the callback function.
632
     */
633
0
    if(isSingleByteMode) {
634
        /* fast path for single-byte mode */
635
0
        if(state==readCommand) {
636
0
fastSingle:
637
0
            while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
638
0
                ++source;
639
0
                if(b<=0x7f) {
640
                    /* write US-ASCII graphic character or DEL */
641
0
                    *target++=(char16_t)b;
642
0
                } else {
643
                    /* write from dynamic window */
644
0
                    uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
645
0
                    if(c<=0xffff) {
646
0
                        *target++=(char16_t)c;
647
0
                    } else {
648
                        /* output surrogate pair */
649
0
                        *target++=(char16_t)(0xd7c0+(c>>10));
650
0
                        if(target<targetLimit) {
651
0
                            *target++=(char16_t)(0xdc00|(c&0x3ff));
652
0
                        } else {
653
                            /* target overflow */
654
0
                            cnv->UCharErrorBuffer[0]=(char16_t)(0xdc00|(c&0x3ff));
655
0
                            cnv->UCharErrorBufferLength=1;
656
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
657
0
                            goto endloop;
658
0
                        }
659
0
                    }
660
0
                }
661
0
            }
662
0
        }
663
664
        /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
665
0
singleByteMode:
666
0
        while(source<sourceLimit) {
667
0
            if(target>=targetLimit) {
668
                /* target is full */
669
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
670
0
                break;
671
0
            }
672
0
            b=*source++;
673
0
            switch(state) {
674
0
            case readCommand:
675
                /* redundant conditions are commented out */
676
                /* here: b<0x20 because otherwise we would be in fastSingle */
677
0
                if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
678
                    /* CR/LF/TAB/NUL */
679
0
                    *target++=(char16_t)b;
680
0
                    goto fastSingle;
681
0
                } else if(SC0<=b) {
682
0
                    if(b<=SC7) {
683
0
                        dynamicWindow=(int8_t)(b-SC0);
684
0
                        goto fastSingle;
685
0
                    } else /* if(SD0<=b && b<=SD7) */ {
686
0
                        dynamicWindow=(int8_t)(b-SD0);
687
0
                        state=defineOne;
688
0
                    }
689
0
                } else if(/* SQ0<=b && */ b<=SQ7) {
690
0
                    quoteWindow=(int8_t)(b-SQ0);
691
0
                    state=quoteOne;
692
0
                } else if(b==SDX) {
693
0
                    state=definePairOne;
694
0
                } else if(b==SQU) {
695
0
                    state=quotePairOne;
696
0
                } else if(b==SCU) {
697
0
                    isSingleByteMode=false;
698
0
                    goto fastUnicode;
699
0
                } else /* Srs */ {
700
                    /* callback(illegal) */
701
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
702
0
                    cnv->toUBytes[0]=b;
703
0
                    cnv->toULength=1;
704
0
                    goto endloop;
705
0
                }
706
707
                /* store the first byte of a multibyte sequence in toUBytes[] */
708
0
                cnv->toUBytes[0]=b;
709
0
                cnv->toULength=1;
710
0
                break;
711
0
            case quotePairOne:
712
0
                byteOne=b;
713
0
                cnv->toUBytes[1]=b;
714
0
                cnv->toULength=2;
715
0
                state=quotePairTwo;
716
0
                break;
717
0
            case quotePairTwo:
718
0
                *target++=(char16_t)((byteOne<<8)|b);
719
0
                state=readCommand;
720
0
                goto fastSingle;
721
0
            case quoteOne:
722
0
                if(b<0x80) {
723
                    /* all static offsets are in the BMP */
724
0
                    *target++=(char16_t)(staticOffsets[quoteWindow]+b);
725
0
                } else {
726
                    /* write from dynamic window */
727
0
                    uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
728
0
                    if(c<=0xffff) {
729
0
                        *target++=(char16_t)c;
730
0
                    } else {
731
                        /* output surrogate pair */
732
0
                        *target++=(char16_t)(0xd7c0+(c>>10));
733
0
                        if(target<targetLimit) {
734
0
                            *target++=(char16_t)(0xdc00|(c&0x3ff));
735
0
                        } else {
736
                            /* target overflow */
737
0
                            cnv->UCharErrorBuffer[0]=(char16_t)(0xdc00|(c&0x3ff));
738
0
                            cnv->UCharErrorBufferLength=1;
739
0
                            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
740
0
                            goto endloop;
741
0
                        }
742
0
                    }
743
0
                }
744
0
                state=readCommand;
745
0
                goto fastSingle;
746
0
            case definePairOne:
747
0
                dynamicWindow=(int8_t)((b>>5)&7);
748
0
                byteOne=(uint8_t)(b&0x1f);
749
0
                cnv->toUBytes[1]=b;
750
0
                cnv->toULength=2;
751
0
                state=definePairTwo;
752
0
                break;
753
0
            case definePairTwo:
754
0
                scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
755
0
                state=readCommand;
756
0
                goto fastSingle;
757
0
            case defineOne:
758
0
                if(b==0) {
759
                    /* callback(illegal): Reserved window offset value 0 */
760
0
                    cnv->toUBytes[1]=b;
761
0
                    cnv->toULength=2;
762
0
                    goto endloop;
763
0
                } else if(b<gapThreshold) {
764
0
                    scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
765
0
                } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
766
0
                    scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
767
0
                } else if(b>=fixedThreshold) {
768
0
                    scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
769
0
                } else {
770
                    /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
771
0
                    cnv->toUBytes[1]=b;
772
0
                    cnv->toULength=2;
773
0
                    goto endloop;
774
0
                }
775
0
                state=readCommand;
776
0
                goto fastSingle;
777
0
            }
778
0
        }
779
0
    } else {
780
        /* fast path for Unicode mode */
781
0
        if(state==readCommand) {
782
0
fastUnicode:
783
0
            while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
784
0
                *target++=(char16_t)((b<<8)|source[1]);
785
0
                source+=2;
786
0
            }
787
0
        }
788
789
        /* normal state machine for Unicode mode */
790
/* unicodeByteMode: */
791
0
        while(source<sourceLimit) {
792
0
            if(target>=targetLimit) {
793
                /* target is full */
794
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
795
0
                break;
796
0
            }
797
0
            b=*source++;
798
0
            switch(state) {
799
0
            case readCommand:
800
0
                if((uint8_t)(b-UC0)>(Urs-UC0)) {
801
0
                    byteOne=b;
802
0
                    cnv->toUBytes[0]=b;
803
0
                    cnv->toULength=1;
804
0
                    state=quotePairTwo;
805
0
                } else if(/* UC0<=b && */ b<=UC7) {
806
0
                    dynamicWindow=(int8_t)(b-UC0);
807
0
                    isSingleByteMode=true;
808
0
                    goto fastSingle;
809
0
                } else if(/* UD0<=b && */ b<=UD7) {
810
0
                    dynamicWindow=(int8_t)(b-UD0);
811
0
                    isSingleByteMode=true;
812
0
                    cnv->toUBytes[0]=b;
813
0
                    cnv->toULength=1;
814
0
                    state=defineOne;
815
0
                    goto singleByteMode;
816
0
                } else if(b==UDX) {
817
0
                    isSingleByteMode=true;
818
0
                    cnv->toUBytes[0]=b;
819
0
                    cnv->toULength=1;
820
0
                    state=definePairOne;
821
0
                    goto singleByteMode;
822
0
                } else if(b==UQU) {
823
0
                    cnv->toUBytes[0]=b;
824
0
                    cnv->toULength=1;
825
0
                    state=quotePairOne;
826
0
                } else /* Urs */ {
827
                    /* callback(illegal) */
828
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
829
0
                    cnv->toUBytes[0]=b;
830
0
                    cnv->toULength=1;
831
0
                    goto endloop;
832
0
                }
833
0
                break;
834
0
            case quotePairOne:
835
0
                byteOne=b;
836
0
                cnv->toUBytes[1]=b;
837
0
                cnv->toULength=2;
838
0
                state=quotePairTwo;
839
0
                break;
840
0
            case quotePairTwo:
841
0
                *target++=(char16_t)((byteOne<<8)|b);
842
0
                state=readCommand;
843
0
                goto fastUnicode;
844
0
            }
845
0
        }
846
0
    }
847
0
endloop:
848
849
    /* set the converter state back into UConverter */
850
0
    if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
851
        /* reset to deal with the next character */
852
0
        state=readCommand;
853
0
    } else if(state==readCommand) {
854
        /* not in a multi-byte sequence, reset toULength */
855
0
        cnv->toULength=0;
856
0
    }
857
0
    scsu->toUIsSingleByteMode=isSingleByteMode;
858
0
    scsu->toUState=state;
859
0
    scsu->toUQuoteWindow=quoteWindow;
860
0
    scsu->toUDynamicWindow=dynamicWindow;
861
0
    scsu->toUByteOne=byteOne;
862
863
    /* write back the updated pointers */
864
0
    pArgs->source=(const char *)source;
865
0
    pArgs->target=target;
866
0
}
867
U_CDECL_END
868
/* SCSU-from-Unicode conversion functions ----------------------------------- */
869
870
/*
871
 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
872
 * reasonable results. The lookahead is minimal.
873
 * Many cases are simple:
874
 * A character fits directly into the current mode, a dynamic or static window,
875
 * or is not compressible. These cases are tested first.
876
 * Real compression heuristics are applied to the rest, in code branches for
877
 * single/Unicode mode and BMP/supplementary code points.
878
 * The heuristics used here are extremely simple.
879
 */
880
881
/*
 * Get the number of the window that this character is in, or -1.
 *
 * offsets: the start offsets of 8 windows
 * c:       the code point to look up
 *
 * Returns the lowest index i (0..7) for which c lies in
 * offsets[i]..offsets[i]+0x7f, or -1 if there is no such window.
 */
882
static int8_t
883
0
getWindow(const uint32_t offsets[8], uint32_t c) {
884
0
    int i;
885
0
    for(i=0; i<8; ++i) {
886
0
        /* unsigned subtraction: a c below offsets[i] wraps around to a large value and fails the test */
        if (c - offsets[i] <= 0x7f) {
887
0
            return static_cast<int8_t>(i);
888
0
        }
889
0
    }
890
0
    return -1;
891
0
}
892
893
/*
 * Is the character in the dynamic window starting at the offset, or in the direct-encoded range?
 *
 * Returns true if c<=offset+0x7f and, in addition, either
 *  - c>=offset (c is inside the 0x80-wide window that starts at offset), or
 *  - c is 0x20..0x7f, or one of 0, 9, 0xa, 0xd.
 */
894
static UBool
895
0
isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) {
896
0
    return c<=offset+0x7f &&
897
0
          (c>=offset || (c<=0x7f &&
898
0
                        (c>=0x20 || (1UL<<c)&0x2601)));
899
                                /* binary 0010 0110 0000 0001,
900
                                   check for c==0xd || c==0xa || c==9 || c==0;
                                   the shift is only evaluated when c<0x20 */
901
0
}
902
903
/*
904
 * getNextDynamicWindow returns the next dynamic window to be redefined
905
 *
 * It returns the entry of scsu->windowUse[] at scsu->nextWindowUseIndex and then
 * advances nextWindowUseIndex by one, wrapping from 7 back to 0.
 */
906
static int8_t
907
0
getNextDynamicWindow(SCSUData *scsu) {
908
0
    int8_t window=scsu->windowUse[scsu->nextWindowUseIndex];
909
0
    /* windowUse[] is used as a ring of 8 entries */
    if(++scsu->nextWindowUseIndex==8) {
910
0
        scsu->nextWindowUseIndex=0;
911
0
    }
912
0
    return window;
913
0
}
914
915
/*
916
 * useDynamicWindow() reorders
917
 * windowUse[] (relative to nextWindowUseIndex) for the algorithm to choose
918
 * the next dynamic window to be defined;
919
 * nextWindowUseIndex itself is only read here, not modified.
920
 *
 * scsu:   converter state holding windowUse[] and nextWindowUseIndex
 * window: the window number that was just used
 *
 * The search loop below only terminates when window occurs in windowUse[].
 */
921
static void
922
0
useDynamicWindow(SCSUData *scsu, int8_t window) {
923
    /*
924
     * move the existing window, which just became the most recently used one,
925
     * up in windowUse[] to nextWindowUseIndex-1
926
     */
927
928
    /* first, find the index of the window - backwards to favor the more recently used windows */
929
0
    int i, j;
930
931
0
    i=scsu->nextWindowUseIndex;
932
0
    do {
933
0
        /* step backwards through the ring, wrapping from 0 to 7 */
        if(--i<0) {
934
0
            i=7;
935
0
        }
936
0
    } while(scsu->windowUse[i]!=window);
937
938
    /* now copy each windowUse[i+1] to [i] */
939
0
    j=i+1;
940
0
    if(j==8) {
941
0
        j=0;
942
0
    }
943
0
    /* shift the entries between the found slot and nextWindowUseIndex down by one slot */
    while(j!=scsu->nextWindowUseIndex) {
944
0
        scsu->windowUse[i]=scsu->windowUse[j];
945
0
        i=j;
946
0
        if(++j==8) { j=0; }
947
0
    }
948
949
    /* finally, set the window into the most recently used index */
950
0
    scsu->windowUse[i]=window;
951
0
}
952
953
/*
954
 * calculate the offset and the code for a dynamic window that contains the character
955
 * takes fixed offsets into account
956
 * the offset of the window is stored in the offset variable,
957
 * the code is returned
958
 *
959
 * return offset code: -1 none  <=0xff code for SDn/UDn  else code for SDX/UDX, subtract 0x200 to get the true code
960
 *
 * c:       the code point that the window must contain
 * pOffset: receives the window start offset; it is not written when -1 is returned
 *
 * For the supplementary ranges accepted below, c>>7 is at least 0x200,
 * which is why callers subtract 0x200 before using the value as an SDX/UDX code.
 */
961
static int
962
0
getDynamicOffset(uint32_t c, uint32_t *pOffset) {
963
0
    int i;
964
965
0
    /* the first 7 fixedOffsets[] entries map to the codes 0xf9..0xff */
    for(i=0; i<7; ++i) {
966
0
        if (c - fixedOffsets[i] <= 0x7f) {
967
0
            *pOffset=fixedOffsets[i];
968
0
            return 0xf9+i;
969
0
        }
970
0
    }
971
972
0
    if(c<0x80) {
973
        /* No dynamic window for US-ASCII. */
974
0
        return -1;
975
0
    /* unsigned range tests: 0x10000<=c<0x14000, and 0x1d000<=c<=0x1ffff */
    } else if(c<0x3400 ||
976
0
              c - 0x10000 < 0x14000 - 0x10000 ||
977
0
              c - 0x1d000 <= 0x1ffff - 0x1d000
978
0
    ) {
979
        /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
980
0
        /* the window starts at c rounded down to a multiple of 0x80 */
        *pOffset=c&0x7fffff80;
981
0
        return static_cast<int>(c >> 7);
982
0
    } else if(0xe000<=c && c!=0xfeff && c<0xfff0) {
983
        /* For these characters we need to take the gapOffset into account. */
984
0
        *pOffset=c&0x7fffff80;
985
0
        return static_cast<int>((c - gapOffset) >> 7);
986
0
    } else {
987
0
        return -1;
988
0
    }
989
0
}
990
U_CDECL_BEGIN
991
/*
992
 * Idea for compression:
993
 *  - save SCSUData and other state before really starting work
994
 *  - at endloop, see if compression could be better with just unicode mode
995
 *  - don't do this if a callback has been called
996
 *  - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
997
 *  - different buffer handling!
998
 *
999
 * Drawback or need for corrective handling:
1000
 * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
1001
 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
1002
 * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
1003
 *
1004
 * How to achieve both?
1005
 *  - Only replace the result after an SDX or SCU?
1006
 */
1007
1008
/*
 * Convert UTF-16 text to SCSU bytes and record, for every output byte,
 * the index of the source code unit at which its character started.
 *
 * pArgs:      supplies source/sourceLimit, target/targetLimit, offsets and the converter;
 *             source, target and offsets are written back with the updated positions.
 *             offsets may be nullptr, in which case no offsets are written.
 * pErrorCode: set to U_BUFFER_OVERFLOW_ERROR when the target is full, or to
 *             U_ILLEGAL_CHAR_FOUND for an unpaired surrogate.
 *
 * The encoder state (single-byte/Unicode mode, current dynamic window) is read from
 * and written back to the SCSUData in cnv->extraInfo. A pending code unit/code point
 * is carried across calls in cnv->fromUChar32; an offset of -1 is written for a
 * character that began in a previous buffer.
 * Output bytes that do not fit into the target are stored in cnv->charErrorBuffer.
 *
 * Multi-byte output is assembled in c, most significant byte first in the low
 * `length` bytes, and emitted at the outputBytes label.
 */
static void U_CALLCONV
1009
_SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1010
0
                            UErrorCode *pErrorCode) {
1011
0
    UConverter *cnv;
1012
0
    SCSUData *scsu;
1013
0
    const char16_t *source, *sourceLimit;
1014
0
    uint8_t *target;
1015
0
    int32_t targetCapacity;
1016
0
    int32_t *offsets;
1017
1018
0
    UBool isSingleByteMode;
1019
0
    uint8_t dynamicWindow;
1020
0
    uint32_t currentOffset;
1021
1022
0
    uint32_t c, delta;
1023
1024
0
    int32_t sourceIndex, nextSourceIndex;
1025
1026
0
    int32_t length;
1027
1028
    /* variables for compression heuristics */
1029
0
    uint32_t offset;
1030
0
    char16_t lead, trail;
1031
0
    int code;
1032
0
    int8_t window;
1033
1034
    /* set up the local pointers */
1035
0
    cnv=pArgs->converter;
1036
0
    scsu=(SCSUData *)cnv->extraInfo;
1037
1038
    /* set up the source/target pointers, the remaining target capacity, and the offsets pointer */
1039
0
    source=pArgs->source;
1040
0
    sourceLimit=pArgs->sourceLimit;
1041
0
    target=(uint8_t *)pArgs->target;
1042
0
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1043
0
    offsets=pArgs->offsets;
1044
1045
    /* get the state machine state */
1046
0
    isSingleByteMode=scsu->fromUIsSingleByteMode;
1047
0
    dynamicWindow=scsu->fromUDynamicWindow;
1048
0
    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1049
1050
0
    c=cnv->fromUChar32;
1051
1052
    /* sourceIndex=-1 if the current character began in the previous buffer */
1053
0
    sourceIndex= c==0 ? 0 : -1;
1054
0
    nextSourceIndex=0;
1055
1056
    /* similar conversion "loop" as in toUnicode */
1057
0
loop:
1058
0
    if(isSingleByteMode) {
1059
0
        if(c!=0 && targetCapacity>0) {
1060
0
            goto getTrailSingle;
1061
0
        }
1062
1063
        /* state machine for single-byte mode */
1064
/* singleByteMode: */
1065
0
        while(source<sourceLimit) {
1066
0
            if(targetCapacity<=0) {
1067
                /* target is full */
1068
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1069
0
                break;
1070
0
            }
1071
0
            c=*source++;
1072
0
            ++nextSourceIndex;
1073
1074
0
            if((c-0x20)<=0x5f) {
1075
                /* pass US-ASCII graphic character through */
1076
0
                *target++=(uint8_t)c;
1077
0
                if(offsets!=nullptr) {
1078
0
                    *offsets++=sourceIndex;
1079
0
                }
1080
0
                --targetCapacity;
1081
0
            } else if(c<0x20) {
1082
0
                if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1083
                    /* CR/LF/TAB/NUL */
1084
0
                    *target++=(uint8_t)c;
1085
0
                    if(offsets!=nullptr) {
1086
0
                        *offsets++=sourceIndex;
1087
0
                    }
1088
0
                    --targetCapacity;
1089
0
                } else {
1090
                    /* quote C0 control character */
1091
0
                    c|=SQ0<<8;
1092
0
                    length=2;
1093
0
                    goto outputBytes;
1094
0
                }
1095
0
            } else if((delta=c-currentOffset)<=0x7f) {
1096
                /* use the current dynamic window */
1097
0
                *target++=(uint8_t)(delta|0x80);
1098
0
                if(offsets!=nullptr) {
1099
0
                    *offsets++=sourceIndex;
1100
0
                }
1101
0
                --targetCapacity;
1102
0
            } else if(U16_IS_SURROGATE(c)) {
1103
0
                if(U16_IS_SURROGATE_LEAD(c)) {
1104
0
getTrailSingle:
1105
0
                    lead=(char16_t)c;
1106
0
                    if(source<sourceLimit) {
1107
                        /* test the following code unit */
1108
0
                        trail=*source;
1109
0
                        if(U16_IS_TRAIL(trail)) {
1110
0
                            ++source;
1111
0
                            ++nextSourceIndex;
1112
0
                            c=U16_GET_SUPPLEMENTARY(c, trail);
1113
                            /* convert this surrogate code point */
1114
                            /* exit this condition tree */
1115
0
                        } else {
1116
                            /* this is an unmatched lead code unit (1st surrogate) */
1117
                            /* callback(illegal) */
1118
0
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1119
0
                            goto endloop;
1120
0
                        }
1121
0
                    } else {
1122
                        /* no more input */
1123
0
                        break;
1124
0
                    }
1125
0
                } else {
1126
                    /* this is an unmatched trail code unit (2nd surrogate) */
1127
                    /* callback(illegal) */
1128
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1129
0
                    goto endloop;
1130
0
                }
1131
1132
                /* compress supplementary character U+10000..U+10ffff */
1133
0
                if((delta=c-currentOffset)<=0x7f) {
1134
                    /* use the current dynamic window */
1135
0
                    *target++=(uint8_t)(delta|0x80);
1136
0
                    if(offsets!=nullptr) {
1137
0
                        *offsets++=sourceIndex;
1138
0
                    }
1139
0
                    --targetCapacity;
1140
0
                } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1141
                    /* there is a dynamic window that contains this character, change to it */
1142
0
                    dynamicWindow=window;
1143
0
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1144
0
                    useDynamicWindow(scsu, dynamicWindow);
1145
0
                    c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1146
0
                    length=2;
1147
0
                    goto outputBytes;
1148
0
                } else if((code=getDynamicOffset(c, &offset))>=0) {
1149
                    /* might check if there are more characters in this window to come */
1150
                    /* define an extended window with this character */
1151
0
                    code-=0x200;
1152
0
                    dynamicWindow=getNextDynamicWindow(scsu);
1153
0
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1154
0
                    useDynamicWindow(scsu, dynamicWindow);
1155
0
                    c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1156
0
                    length=4;
1157
0
                    goto outputBytes;
1158
0
                } else {
1159
                    /* change to Unicode mode and output this (lead, trail) pair */
1160
0
                    isSingleByteMode=false;
1161
0
                    *target++=(uint8_t)SCU;
1162
0
                    if(offsets!=nullptr) {
1163
0
                        *offsets++=sourceIndex;
1164
0
                    }
1165
0
                    --targetCapacity;
1166
0
                    /* targetCapacity may be 0 now; outputBytes handles that via the overflow buffer */
                    c=((uint32_t)lead<<16)|trail;
1167
0
                    length=4;
1168
0
                    goto outputBytes;
1169
0
                }
1170
0
            } else if(c<0xa0) {
1171
                /* quote C1 control character */
1172
0
                c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
1173
0
                length=2;
1174
0
                goto outputBytes;
1175
0
            } else if(c==0xfeff || c>=0xfff0) {
1176
                /* quote signature character=byte order mark and specials */
1177
0
                c|=SQU<<16;
1178
0
                length=3;
1179
0
                goto outputBytes;
1180
0
            } else {
1181
                /* compress all other BMP characters */
1182
0
                if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1183
                    /* there is a window defined that contains this character - switch to it or quote from it? */
1184
0
                    if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
1185
                        /* change to dynamic window */
1186
0
                        dynamicWindow=window;
1187
0
                        currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1188
0
                        useDynamicWindow(scsu, dynamicWindow);
1189
0
                        c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1190
0
                        length=2;
1191
0
                        goto outputBytes;
1192
0
                    } else {
1193
                        /* quote from dynamic window */
1194
0
                        c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
1195
0
                        length=2;
1196
0
                        goto outputBytes;
1197
0
                    }
1198
0
                } else if((window=getWindow(staticOffsets, c))>=0) {
1199
                    /* quote from static window */
1200
0
                    c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
1201
0
                    length=2;
1202
0
                    goto outputBytes;
1203
0
                } else if((code=getDynamicOffset(c, &offset))>=0) {
1204
                    /* define a dynamic window with this character */
1205
0
                    dynamicWindow=getNextDynamicWindow(scsu);
1206
0
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1207
0
                    useDynamicWindow(scsu, dynamicWindow);
1208
0
                    c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1209
0
                    length=3;
1210
0
                    goto outputBytes;
1211
0
                } else if ((c - 0x3400) < (0xd800 - 0x3400) &&
1212
0
                           (source >= sourceLimit || (uint32_t)(*source - 0x3400) < (0xd800 - 0x3400))
1213
0
                ) {
1214
                    /*
1215
                     * this character is not compressible (a BMP ideograph or similar);
1216
                     * switch to Unicode mode if this is the last character in the block
1217
                     * or there is at least one more ideograph following immediately
1218
                     */
1219
0
                    isSingleByteMode=false;
1220
0
                    c|=SCU<<16;
1221
0
                    length=3;
1222
0
                    goto outputBytes;
1223
0
                } else {
1224
                    /* quote Unicode */
1225
0
                    c|=SQU<<16;
1226
0
                    length=3;
1227
0
                    goto outputBytes;
1228
0
                }
1229
0
            }
1230
1231
            /* normal end of conversion: prepare for a new character */
1232
0
            c=0;
1233
0
            sourceIndex=nextSourceIndex;
1234
0
        }
1235
0
    } else {
1236
0
        if(c!=0 && targetCapacity>0) {
1237
0
            goto getTrailUnicode;
1238
0
        }
1239
1240
        /* state machine for Unicode mode */
1241
/* unicodeByteMode: */
1242
0
        while(source<sourceLimit) {
1243
0
            if(targetCapacity<=0) {
1244
                /* target is full */
1245
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1246
0
                break;
1247
0
            }
1248
0
            c=*source++;
1249
0
            ++nextSourceIndex;
1250
1251
0
            if ((c - 0x3400) < (0xd800 - 0x3400)) {
1252
                /* not compressible, write character directly */
1253
0
                if(targetCapacity>=2) {
1254
0
                    *target++=(uint8_t)(c>>8);
1255
0
                    *target++=(uint8_t)c;
1256
0
                    if(offsets!=nullptr) {
1257
0
                        *offsets++=sourceIndex;
1258
0
                        *offsets++=sourceIndex;
1259
0
                    }
1260
0
                    targetCapacity-=2;
1261
0
                } else {
1262
0
                    length=2;
1263
0
                    goto outputBytes;
1264
0
                }
1265
0
            } else if (c - 0x3400 >= 0xf300 - 0x3400 /* c<0x3400 || c>=0xf300 */) {
1266
                /* compress BMP character if the following one is not an uncompressible ideograph */
1267
0
                if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
1268
0
                    if (c - 0x30 < 10 || c - 0x61 < 26 || c - 0x41 < 26) {
1269
                        /* ASCII digit or letter */
1270
0
                        isSingleByteMode=true;
1271
0
                        /* the trailing |c is redundant with the compound |= and has no effect */
                        c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
1272
0
                        length=2;
1273
0
                        goto outputBytes;
1274
0
                    } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1275
                        /* there is a dynamic window that contains this character, change to it */
1276
0
                        isSingleByteMode=true;
1277
0
                        dynamicWindow=window;
1278
0
                        currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1279
0
                        useDynamicWindow(scsu, dynamicWindow);
1280
0
                        c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1281
0
                        length=2;
1282
0
                        goto outputBytes;
1283
0
                    } else if((code=getDynamicOffset(c, &offset))>=0) {
1284
                        /* define a dynamic window with this character */
1285
0
                        isSingleByteMode=true;
1286
0
                        dynamicWindow=getNextDynamicWindow(scsu);
1287
0
                        currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1288
0
                        useDynamicWindow(scsu, dynamicWindow);
1289
0
                        c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1290
0
                        length=3;
1291
0
                        goto outputBytes;
1292
0
                    }
1293
0
                }
1294
1295
                /* don't know how to compress this character, just write it directly */
1296
0
                length=2;
1297
0
                goto outputBytes;
1298
0
            } else if(c<0xe000) {
1299
                /* c is a surrogate */
1300
0
                if(U16_IS_SURROGATE_LEAD(c)) {
1301
0
getTrailUnicode:
1302
0
                    lead=(char16_t)c;
1303
0
                    if(source<sourceLimit) {
1304
                        /* test the following code unit */
1305
0
                        trail=*source;
1306
0
                        if(U16_IS_TRAIL(trail)) {
1307
0
                            ++source;
1308
0
                            ++nextSourceIndex;
1309
0
                            c=U16_GET_SUPPLEMENTARY(c, trail);
1310
                            /* convert this surrogate code point */
1311
                            /* exit this condition tree */
1312
0
                        } else {
1313
                            /* this is an unmatched lead code unit (1st surrogate) */
1314
                            /* callback(illegal) */
1315
0
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1316
0
                            goto endloop;
1317
0
                        }
1318
0
                    } else {
1319
                        /* no more input */
1320
0
                        break;
1321
0
                    }
1322
0
                } else {
1323
                    /* this is an unmatched trail code unit (2nd surrogate) */
1324
                    /* callback(illegal) */
1325
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1326
0
                    goto endloop;
1327
0
                }
1328
1329
                /* compress supplementary character */
1330
0
                if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
1331
0
                    !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1332
0
                ) {
1333
                    /*
1334
                     * there is a dynamic window that contains this character and
1335
                     * the following character is not uncompressible,
1336
                     * change to the window
1337
                     */
1338
0
                    isSingleByteMode=true;
1339
0
                    dynamicWindow=window;
1340
0
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1341
0
                    useDynamicWindow(scsu, dynamicWindow);
1342
0
                    c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1343
0
                    length=2;
1344
0
                    goto outputBytes;
1345
0
                } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
1346
0
                          (code=getDynamicOffset(c, &offset))>=0
1347
0
                ) {
1348
                    /* two supplementary characters in (probably) the same window - define an extended one */
1349
0
                    isSingleByteMode=true;
1350
0
                    code-=0x200;
1351
0
                    dynamicWindow=getNextDynamicWindow(scsu);
1352
0
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1353
0
                    useDynamicWindow(scsu, dynamicWindow);
1354
0
                    c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1355
0
                    length=4;
1356
0
                    goto outputBytes;
1357
0
                } else {
1358
                    /* don't know how to compress this character, just write it directly */
1359
0
                    c=((uint32_t)lead<<16)|trail;
1360
0
                    length=4;
1361
0
                    goto outputBytes;
1362
0
                }
1363
0
            } else /* 0xe000<=c<0xf300 */ {
1364
                /* quote to avoid SCSU tags */
1365
0
                c|=UQU<<16;
1366
0
                length=3;
1367
0
                goto outputBytes;
1368
0
            }
1369
1370
            /* normal end of conversion: prepare for a new character */
1371
0
            c=0;
1372
0
            sourceIndex=nextSourceIndex;
1373
0
        }
1374
0
    }
1375
0
endloop:
1376
1377
    /* set the converter state back into UConverter */
1378
0
    scsu->fromUIsSingleByteMode=isSingleByteMode;
1379
0
    scsu->fromUDynamicWindow=dynamicWindow;
1380
1381
0
    cnv->fromUChar32=c;
1382
1383
    /* write back the updated pointers */
1384
0
    pArgs->source=source;
1385
0
    pArgs->target=(char *)target;
1386
0
    pArgs->offsets=offsets;
1387
0
    return;
1388
1389
0
outputBytes:
1390
    /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1391
    /* from the first if in the loop we know that targetCapacity>0 */
1392
0
    if(length<=targetCapacity) {
1393
0
        if(offsets==nullptr) {
1394
0
            switch(length) {
1395
                /* each branch falls through to the next one */
1396
0
            case 4:
1397
0
                *target++=(uint8_t)(c>>24);
1398
0
                U_FALLTHROUGH;
1399
0
            case 3:
1400
0
                *target++=(uint8_t)(c>>16);
1401
0
                U_FALLTHROUGH;
1402
0
            case 2:
1403
0
                *target++=(uint8_t)(c>>8);
1404
0
                U_FALLTHROUGH;
1405
0
            case 1:
1406
0
                *target++=(uint8_t)c;
1407
0
                U_FALLTHROUGH;
1408
0
            default:
1409
                /* will never occur */
1410
0
                break;
1411
0
            }
1412
0
        } else {
1413
0
            switch(length) {
1414
                /* each branch falls through to the next one */
1415
0
            case 4:
1416
0
                *target++=(uint8_t)(c>>24);
1417
0
                *offsets++=sourceIndex;
1418
0
                U_FALLTHROUGH;
1419
0
            case 3:
1420
0
                *target++=(uint8_t)(c>>16);
1421
0
                *offsets++=sourceIndex;
1422
0
                U_FALLTHROUGH;
1423
0
            case 2:
1424
0
                *target++=(uint8_t)(c>>8);
1425
0
                *offsets++=sourceIndex;
1426
0
                U_FALLTHROUGH;
1427
0
            case 1:
1428
0
                *target++=(uint8_t)c;
1429
0
                *offsets++=sourceIndex;
1430
0
                U_FALLTHROUGH;
1431
0
            default:
1432
                /* will never occur */
1433
0
                break;
1434
0
            }
1435
0
        }
1436
0
        targetCapacity-=length;
1437
1438
        /* normal end of conversion: prepare for a new character */
1439
0
        c=0;
1440
0
        sourceIndex=nextSourceIndex;
1441
0
        goto loop;
1442
0
    } else {
1443
0
        uint8_t *p;
1444
1445
        /*
1446
         * We actually do this backwards here:
1447
         * In order to save an intermediate variable, we output
1448
         * first to the overflow buffer what does not fit into the
1449
         * regular target.
1450
         */
1451
        /* we know that 0<=targetCapacity<length<=4 */
1452
        /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1453
0
        length-=targetCapacity;
1454
0
        p=(uint8_t *)cnv->charErrorBuffer;
1455
0
        switch(length) {
1456
            /* each branch falls through to the next one */
1457
0
        case 4:
1458
0
            *p++=(uint8_t)(c>>24);
1459
0
            U_FALLTHROUGH;
1460
0
        case 3:
1461
0
            *p++=(uint8_t)(c>>16);
1462
0
            U_FALLTHROUGH;
1463
0
        case 2:
1464
0
            *p++=(uint8_t)(c>>8);
1465
0
            U_FALLTHROUGH;
1466
0
        case 1:
1467
0
            *p=(uint8_t)c;
1468
0
            U_FALLTHROUGH;
1469
0
        default:
1470
            /* will never occur */
1471
0
            break;
1472
0
        }
1473
0
        cnv->charErrorBufferLength=(int8_t)length;
1474
1475
        /* now output what fits into the regular target */
1476
0
        /*
         * length was reduced by targetCapacity.
         * When nothing fits (targetCapacity==0, so length can still be 4) there is
         * nothing left to emit from c, and shifting a 32-bit value by 32 bits
         * would be undefined behavior - so only shift by fewer than 32 bits.
         */
        c= length<4 ? c>>(8*length) : 0;
1477
0
        switch(targetCapacity) {
1478
            /* each branch falls through to the next one */
1479
0
        case 3:
1480
0
            *target++=(uint8_t)(c>>16);
1481
0
            if(offsets!=nullptr) {
1482
0
                *offsets++=sourceIndex;
1483
0
            }
1484
0
            U_FALLTHROUGH;
1485
0
        case 2:
1486
0
            *target++=(uint8_t)(c>>8);
1487
0
            if(offsets!=nullptr) {
1488
0
                *offsets++=sourceIndex;
1489
0
            }
1490
0
            U_FALLTHROUGH;
1491
0
        case 1:
1492
0
            *target++=(uint8_t)c;
1493
0
            if(offsets!=nullptr) {
1494
0
                *offsets++=sourceIndex;
1495
0
            }
1496
0
            U_FALLTHROUGH;
1497
0
        default:
1498
0
            break;
1499
0
        }
1500
1501
        /* target overflow */
1502
0
        targetCapacity=0;
1503
0
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1504
0
        c=0;
1505
0
        goto endloop;
1506
0
    }
1507
0
}
1508
1509
/*
1510
 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
1511
 * If a change is made in the original function, then either
1512
 * change this function the same way or
1513
 * re-copy the original function and remove the variables
1514
 * offsets, sourceIndex, and nextSourceIndex.
 *
 * Encodes UTF-16 code units from pArgs->source..pArgs->sourceLimit into
 * SCSU bytes at pArgs->target..pArgs->targetLimit.
 *
 * State read on entry and written back at endloop:
 *   scsu->fromUIsSingleByteMode, scsu->fromUDynamicWindow,
 *   cnv->fromUChar32 (a pending code unit, 0 if none),
 *   pArgs->source and pArgs->target.
 *
 * Error codes set through pErrorCode:
 *   U_BUFFER_OVERFLOW_ERROR - the target is full; bytes of the current
 *       sequence that did not fit are stored in cnv->charErrorBuffer with
 *       their count in cnv->charErrorBufferLength.
 *   U_ILLEGAL_CHAR_FOUND    - an unpaired surrogate code unit was read.
 *
 * Control flow: every multi-byte sequence is packed big-endian into c with
 * its byte count in length and emitted at the outputBytes label, which then
 * jumps back to loop (so a mode change takes effect) or to endloop on overflow.
 *
 * NOTE(review): the overflow tail below special-cases length==4 before
 * shifting c, whereas the tail of the preceding offsets-tracking function
 * still uses an unguarded "c>>=8*length" - the two copies appear to have
 * drifted; confirm and re-sync.
1515
 */
1516
static void U_CALLCONV
1517
_SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs,
1518
0
                 UErrorCode *pErrorCode) {
1519
0
    UConverter *cnv;
1520
0
    SCSUData *scsu;
1521
0
    const char16_t *source, *sourceLimit;
1522
0
    uint8_t *target;
1523
0
    int32_t targetCapacity;
1524
1525
0
    UBool isSingleByteMode;
1526
0
    uint8_t dynamicWindow;
1527
0
    uint32_t currentOffset;
1528
1529
0
    uint32_t c, delta;
1530
1531
0
    int32_t length;
1532
1533
    /* variables for compression heuristics */
1534
0
    uint32_t offset;
1535
0
    char16_t lead, trail;
1536
0
    int code;
1537
0
    int8_t window;
1538
1539
    /* set up the local pointers */
1540
0
    cnv=pArgs->converter;
1541
0
    scsu=(SCSUData *)cnv->extraInfo;
1542
1543
    /* set up the local source and target pointers */
1544
0
    source=pArgs->source;
1545
0
    sourceLimit=pArgs->sourceLimit;
1546
0
    target=(uint8_t *)pArgs->target;
1547
0
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1548
1549
    /* get the state machine state */
1550
0
    isSingleByteMode=scsu->fromUIsSingleByteMode;
1551
0
    dynamicWindow=scsu->fromUDynamicWindow;
1552
0
    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1553
1554
0
    c=cnv->fromUChar32;
1555
1556
    /* similar conversion "loop" as in toUnicode */
1557
0
loop:
1558
0
    if(isSingleByteMode) {
        /*
         * c!=0 here means a code unit was carried over in cnv->fromUChar32
         * (e.g. a lead surrogate read at the end of the previous buffer);
         * jump into the loop body to try to pair it with a trail unit.
         */
1559
0
        if(c!=0 && targetCapacity>0) {
1560
0
            goto getTrailSingle;
1561
0
        }
1562
1563
        /* state machine for single-byte mode */
1564
/* singleByteMode: */
1565
0
        while(source<sourceLimit) {
1566
0
            if(targetCapacity<=0) {
1567
                /* target is full */
1568
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1569
0
                break;
1570
0
            }
1571
0
            c=*source++;
1572
1573
0
            if((c-0x20)<=0x5f) {
1574
                /* pass US-ASCII graphic character through */
1575
0
                *target++=(uint8_t)c;
1576
0
                --targetCapacity;
1577
0
            } else if(c<0x20) {
1578
0
                if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for c==0xd || c==0xa || c==9 || c==0 */) {
1579
                    /* CR/LF/TAB/NUL */
1580
0
                    *target++=(uint8_t)c;
1581
0
                    --targetCapacity;
1582
0
                } else {
1583
                    /* quote C0 control character */
1584
0
                    c|=SQ0<<8;
1585
0
                    length=2;
1586
0
                    goto outputBytes;
1587
0
                }
1588
0
            } else if((delta=c-currentOffset)<=0x7f) {
1589
                /* use the current dynamic window */
1590
0
                *target++=(uint8_t)(delta|0x80);
1591
0
                --targetCapacity;
1592
0
            } else if(U16_IS_SURROGATE(c)) {
1593
0
                if(U16_IS_SURROGATE_LEAD(c)) {
1594
0
getTrailSingle:
1595
0
                    lead=(char16_t)c;
1596
0
                    if(source<sourceLimit) {
1597
                        /* test the following code unit */
1598
0
                        trail=*source;
1599
0
                        if(U16_IS_TRAIL(trail)) {
1600
0
                            ++source;
1601
0
                            c=U16_GET_SUPPLEMENTARY(c, trail);
1602
                            /* convert this surrogate code point */
1603
                            /* exit this condition tree */
1604
0
                        } else {
1605
                            /* this is an unmatched lead code unit (1st surrogate) */
1606
                            /* callback(illegal) */
1607
0
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1608
0
                            goto endloop;
1609
0
                        }
1610
0
                    } else {
1611
                        /* no more input */
                        /* c still holds the lead unit; endloop stores it in cnv->fromUChar32 */
1612
0
                        break;
1613
0
                    }
1614
0
                } else {
1615
                    /* this is an unmatched trail code unit (2nd surrogate) */
1616
                    /* callback(illegal) */
1617
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1618
0
                    goto endloop;
1619
0
                }
1620
1621
                /* compress supplementary character U+10000..U+10ffff */
1622
0
                if((delta=c-currentOffset)<=0x7f) {
1623
                    /* use the current dynamic window */
1624
0
                    *target++=(uint8_t)(delta|0x80);
1625
0
                    --targetCapacity;
1626
0
                } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1627
                    /* there is a dynamic window that contains this character, change to it */
1628
0
                    dynamicWindow=window;
1629
0
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1630
0
                    useDynamicWindow(scsu, dynamicWindow);
1631
0
                    c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1632
0
                    length=2;
1633
0
                    goto outputBytes;
1634
0
                } else if((code=getDynamicOffset(c, &offset))>=0) {
1635
                    /* might check if there are more characters in this window to come */
1636
                    /* define an extended window with this character */
                    /* NOTE(review): presumably getDynamicOffset() biases codes for supplementary offsets by 0x200 - confirm */
1637
0
                    code-=0x200;
1638
0
                    dynamicWindow=getNextDynamicWindow(scsu);
1639
0
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1640
0
                    useDynamicWindow(scsu, dynamicWindow);
1641
0
                    c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1642
0
                    length=4;
1643
0
                    goto outputBytes;
1644
0
                } else {
1645
                    /* change to Unicode mode and output this (lead, trail) pair */
                    /* SCU is written immediately, so targetCapacity may be 0 when outputBytes is reached */
1646
0
                    isSingleByteMode=false;
1647
0
                    *target++=(uint8_t)SCU;
1648
0
                    --targetCapacity;
1649
0
                    c=((uint32_t)lead<<16)|trail;
1650
0
                    length=4;
1651
0
                    goto outputBytes;
1652
0
                }
1653
0
            } else if(c<0xa0) {
1654
                /* quote C1 control character */
1655
0
                c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
1656
0
                length=2;
1657
0
                goto outputBytes;
1658
0
            } else if(c==0xfeff || c>=0xfff0) {
1659
                /* quote signature character=byte order mark and specials */
1660
0
                c|=SQU<<16;
1661
0
                length=3;
1662
0
                goto outputBytes;
1663
0
            } else {
1664
                /* compress all other BMP characters */
1665
0
                if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1666
                    /* there is a window defined that contains this character - switch to it or quote from it? */
1667
0
                    if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
1668
                        /* change to dynamic window */
1669
0
                        dynamicWindow=window;
1670
0
                        currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1671
0
                        useDynamicWindow(scsu, dynamicWindow);
1672
0
                        c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1673
0
                        length=2;
1674
0
                        goto outputBytes;
1675
0
                    } else {
1676
                        /* quote from dynamic window */
1677
0
                        c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
1678
0
                        length=2;
1679
0
                        goto outputBytes;
1680
0
                    }
1681
0
                } else if((window=getWindow(staticOffsets, c))>=0) {
1682
                    /* quote from static window */
1683
0
                    c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
1684
0
                    length=2;
1685
0
                    goto outputBytes;
1686
0
                } else if((code=getDynamicOffset(c, &offset))>=0) {
1687
                    /* define a dynamic window with this character */
1688
0
                    dynamicWindow=getNextDynamicWindow(scsu);
1689
0
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1690
0
                    useDynamicWindow(scsu, dynamicWindow);
1691
0
                    c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1692
0
                    length=3;
1693
0
                    goto outputBytes;
1694
0
                } else if (c - 0x3400 < 0xd800 - 0x3400 &&
1695
0
                           (source >= sourceLimit || static_cast<uint32_t>(*source - 0x3400) < 0xd800 - 0x3400)
1696
0
                ) {
1697
                    /*
1698
                     * this character is not compressible (a BMP ideograph or similar);
1699
                     * switch to Unicode mode if this is the last character in the block
1700
                     * or there is at least one more ideograph following immediately
1701
                     */
1702
0
                    isSingleByteMode=false;
1703
0
                    c|=SCU<<16;
1704
0
                    length=3;
1705
0
                    goto outputBytes;
1706
0
                } else {
1707
                    /* quote Unicode */
1708
0
                    c|=SQU<<16;
1709
0
                    length=3;
1710
0
                    goto outputBytes;
1711
0
                }
1712
0
            }
1713
1714
            /* normal end of conversion: prepare for a new character */
1715
0
            c=0;
1716
0
        }
1717
0
    } else {
        /* as in single-byte mode: resume with a code unit carried over in cnv->fromUChar32 */
1718
0
        if(c!=0 && targetCapacity>0) {
1719
0
            goto getTrailUnicode;
1720
0
        }
1721
1722
        /* state machine for Unicode mode */
1723
/* unicodeByteMode: */
1724
0
        while(source<sourceLimit) {
1725
0
            if(targetCapacity<=0) {
1726
                /* target is full */
1727
0
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1728
0
                break;
1729
0
            }
1730
0
            c=*source++;
1731
1732
0
            if (c - 0x3400 < 0xd800 - 0x3400) {
1733
                /* not compressible, write character directly */
1734
0
                if(targetCapacity>=2) {
1735
0
                    *target++=(uint8_t)(c>>8);
1736
0
                    *target++=(uint8_t)c;
1737
0
                    targetCapacity-=2;
1738
0
                } else {
1739
0
                    length=2;
1740
0
                    goto outputBytes;
1741
0
                }
1742
0
            } else if (c - 0x3400 >= 0xf300 - 0x3400 /* c<0x3400 || c>=0xf300 */) {
1743
                /* compress BMP character if the following one is not an uncompressible ideograph */
1744
0
                if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
1745
0
                    if (c - 0x30 < 10 || c - 0x61 < 26 || c - 0x41 < 26) {
1746
                        /* ASCII digit or letter */
                        /* NOTE(review): the trailing "|c" is redundant because "|=" already keeps the bits of c */
1747
0
                        isSingleByteMode=true;
1748
0
                        c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
1749
0
                        length=2;
1750
0
                        goto outputBytes;
1751
0
                    } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1752
                        /* there is a dynamic window that contains this character, change to it */
1753
0
                        isSingleByteMode=true;
1754
0
                        dynamicWindow=window;
1755
0
                        currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1756
0
                        useDynamicWindow(scsu, dynamicWindow);
1757
0
                        c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1758
0
                        length=2;
1759
0
                        goto outputBytes;
1760
0
                    } else if((code=getDynamicOffset(c, &offset))>=0) {
1761
                        /* define a dynamic window with this character */
1762
0
                        isSingleByteMode=true;
1763
0
                        dynamicWindow=getNextDynamicWindow(scsu);
1764
0
                        currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1765
0
                        useDynamicWindow(scsu, dynamicWindow);
1766
0
                        c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1767
0
                        length=3;
1768
0
                        goto outputBytes;
1769
0
                    }
1770
0
                }
1771
1772
                /* don't know how to compress this character, just write it directly */
1773
0
                length=2;
1774
0
                goto outputBytes;
1775
0
            } else if(c<0xe000) {
1776
                /* c is a surrogate */
1777
0
                if(U16_IS_SURROGATE_LEAD(c)) {
1778
0
getTrailUnicode:
1779
0
                    lead=(char16_t)c;
1780
0
                    if(source<sourceLimit) {
1781
                        /* test the following code unit */
1782
0
                        trail=*source;
1783
0
                        if(U16_IS_TRAIL(trail)) {
1784
0
                            ++source;
1785
0
                            c=U16_GET_SUPPLEMENTARY(c, trail);
1786
                            /* convert this surrogate code point */
1787
                            /* exit this condition tree */
1788
0
                        } else {
1789
                            /* this is an unmatched lead code unit (1st surrogate) */
1790
                            /* callback(illegal) */
1791
0
                            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1792
0
                            goto endloop;
1793
0
                        }
1794
0
                    } else {
1795
                        /* no more input */
                        /* c still holds the lead unit; endloop stores it in cnv->fromUChar32 */
1796
0
                        break;
1797
0
                    }
1798
0
                } else {
1799
                    /* this is an unmatched trail code unit (2nd surrogate) */
1800
                    /* callback(illegal) */
1801
0
                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1802
0
                    goto endloop;
1803
0
                }
1804
1805
                /* compress supplementary character */
1806
0
                if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
1807
0
                    !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1808
0
                ) {
1809
                    /*
1810
                     * there is a dynamic window that contains this character and
1811
                     * the following character is not uncompressible,
1812
                     * change to the window
1813
                     */
1814
0
                    isSingleByteMode=true;
1815
0
                    dynamicWindow=window;
1816
0
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1817
0
                    useDynamicWindow(scsu, dynamicWindow);
1818
0
                    c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1819
0
                    length=2;
1820
0
                    goto outputBytes;
1821
0
                } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
1822
0
                          (code=getDynamicOffset(c, &offset))>=0
1823
0
                ) {
1824
                    /* two supplementary characters in (probably) the same window - define an extended one */
1825
0
                    isSingleByteMode=true;
1826
0
                    code-=0x200;
1827
0
                    dynamicWindow=getNextDynamicWindow(scsu);
1828
0
                    currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1829
0
                    useDynamicWindow(scsu, dynamicWindow);
1830
0
                    c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1831
0
                    length=4;
1832
0
                    goto outputBytes;
1833
0
                } else {
1834
                    /* don't know how to compress this character, just write it directly */
1835
0
                    c=((uint32_t)lead<<16)|trail;
1836
0
                    length=4;
1837
0
                    goto outputBytes;
1838
0
                }
1839
0
            } else /* 0xe000<=c<0xf300 */ {
1840
                /* quote to avoid SCSU tags */
1841
0
                c|=UQU<<16;
1842
0
                length=3;
1843
0
                goto outputBytes;
1844
0
            }
1845
1846
            /* normal end of conversion: prepare for a new character */
1847
0
            c=0;
1848
0
        }
1849
0
    }
1850
0
endloop:
1851
1852
    /* set the converter state back into UConverter */
1853
0
    scsu->fromUIsSingleByteMode=isSingleByteMode;
1854
0
    scsu->fromUDynamicWindow=dynamicWindow;
1855
1856
0
    cnv->fromUChar32=c;
1857
1858
    /* write back the updated pointers */
1859
0
    pArgs->source=source;
1860
0
    pArgs->target=(char *)target;
1861
0
    return;
1862
1863
0
outputBytes:
1864
    /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1865
    /* from the first if in the loop we know that targetCapacity>0 */
1866
0
    if(length<=targetCapacity) {
1867
0
        switch(length) {
1868
            /* each branch falls through to the next one */
1869
0
        case 4:
1870
0
            *target++=(uint8_t)(c>>24);
1871
0
            U_FALLTHROUGH;
1872
0
        case 3:
1873
0
            *target++=(uint8_t)(c>>16);
1874
0
            U_FALLTHROUGH;
1875
0
        case 2:
1876
0
            *target++=(uint8_t)(c>>8);
1877
0
            U_FALLTHROUGH;
1878
0
        case 1:
1879
0
            *target++=(uint8_t)c;
1880
0
            U_FALLTHROUGH;
1881
0
        default:
1882
            /* will never occur */
1883
0
            break;
1884
0
        }
1885
0
        targetCapacity-=length;
1886
1887
        /* normal end of conversion: prepare for a new character */
        /* re-enter at loop (not the inner while) so that a mode switch made above takes effect */
1888
0
        c=0;
1889
0
        goto loop;
1890
0
    } else {
1891
0
        uint8_t *p;
1892
1893
        /*
1894
         * We actually do this backwards here:
1895
         * In order to save an intermediate variable, we output
1896
         * first to the overflow buffer what does not fit into the
1897
         * regular target.
1898
         */
1899
        /* we know that 0<=targetCapacity<length<=4 */
1900
        /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1901
0
        length-=targetCapacity;
1902
0
        p=(uint8_t *)cnv->charErrorBuffer;
1903
0
        switch(length) {
1904
            /* each branch falls through to the next one */
1905
0
        case 4:
1906
0
            *p++=(uint8_t)(c>>24);
1907
0
            U_FALLTHROUGH;
1908
0
        case 3:
1909
0
            *p++=(uint8_t)(c>>16);
1910
0
            U_FALLTHROUGH;
1911
0
        case 2:
1912
0
            *p++=(uint8_t)(c>>8);
1913
0
            U_FALLTHROUGH;
1914
0
        case 1:
1915
0
            *p=(uint8_t)c;
1916
0
            U_FALLTHROUGH;
1917
0
        default:
1918
            /* will never occur */
1919
0
            break;
1920
0
        }
1921
0
        cnv->charErrorBufferLength=(int8_t)length;
1922
1923
        /* now output what fits into the regular target */
        /* length==4 is special-cased: shifting the 32-bit c by 32 bits would be undefined */
1924
0
        c = (length == 4) ? 0 : c >> 8*length; /* length was reduced by targetCapacity */
1925
0
        switch(targetCapacity) {
1926
            /* each branch falls through to the next one */
1927
0
        case 3:
1928
0
            *target++=(uint8_t)(c>>16);
1929
0
            U_FALLTHROUGH;
1930
0
        case 2:
1931
0
            *target++=(uint8_t)(c>>8);
1932
0
            U_FALLTHROUGH;
1933
0
        case 1:
1934
0
            *target++=(uint8_t)c;
1935
0
            U_FALLTHROUGH;
1936
0
        default:
1937
0
            break;
1938
0
        }
1939
1940
        /* target overflow */
1941
0
        targetCapacity=0;
1942
0
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1943
0
        c=0;
1944
0
        goto endloop;
1945
0
    }
1946
0
}
1947
1948
/* miscellaneous ------------------------------------------------------------ */
1949
1950
static const char *  U_CALLCONV
1951
0
_SCSUGetName(const UConverter *cnv) {
1952
0
    SCSUData *scsu=(SCSUData *)cnv->extraInfo;
1953
1954
0
    switch(scsu->locale) {
1955
0
    case l_ja:
1956
0
        return "SCSU,locale=ja";
1957
0
    default:
1958
0
        return "SCSU";
1959
0
    }
1960
0
}
1961
1962
/*
 * structure for SafeClone calculations:
 * layout of the caller-provided clone buffer as used by _SCSUSafeClone(),
 * which casts the buffer to this type and reports sizeof() of it
 * as the required buffer size.
 */
1963
struct cloneSCSUStruct
1964
{
1965
    UConverter cnv;     /* the cloned converter object; its extraInfo is pointed at mydata */
1966
    SCSUData mydata;    /* copy of the source converter's SCSU state */
1967
};
1968
1969
/*
 * Clone the SCSU-specific state of cnv into a caller-provided buffer.
 *
 * cnv          the converter whose SCSUData (cnv->extraInfo) is copied
 * stackBuffer  buffer that is interpreted as a struct cloneSCSUStruct
 * pBufferSize  in: 0 requests preflighting; out (preflighting only):
 *              the number of bytes needed for the clone
 * status       must not indicate a failure on entry
 *
 * Returns nullptr on a failed status or a preflighting request, otherwise a
 * pointer to the UConverter inside stackBuffer. Per the original comment,
 * ucnv_safeClone() has already copied the main UConverter; this function
 * only copies the SCSUData and hooks it up.
 */
static UConverter *  U_CALLCONV
_SCSUSafeClone(const UConverter *cnv,
               void *stackBuffer,
               int32_t *pBufferSize,
               UErrorCode *status)
{
    if (U_FAILURE(*status)) {
        return nullptr;
    }

    if (*pBufferSize == 0) {
        /* 'preflighting' request - set needed size into *pBufferSize */
        *pBufferSize = (int32_t)sizeof(struct cloneSCSUStruct);
        return nullptr;
    }

    /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
    struct cloneSCSUStruct *clone = (struct cloneSCSUStruct *)stackBuffer;
    SCSUData *stateCopy = &clone->mydata;

    uprv_memcpy(stateCopy, cnv->extraInfo, sizeof(SCSUData));
    clone->cnv.extraInfo = stateCopy;
    clone->cnv.isExtraLocal = true;

    return &clone->cnv;
}
1996
U_CDECL_END
1997
1998
/*
 * Implementation table for the SCSU converter: the converter type followed
 * by the functions defined in this file that implement it.
 * The initializer is positional; nullptr entries are slots this converter
 * does not provide.
 * NOTE(review): the slot names in the comments below are assumed from the
 * usual UConverterImpl layout, which is not visible here - confirm against
 * the declaration in ucnv_cnv.h.
 */
static const UConverterImpl _SCSUImpl={
1999
    UCNV_SCSU, /* converter type */
2000
2001
    nullptr, /* presumably load */
2002
    nullptr, /* presumably unload */
2003
2004
    _SCSUOpen,
2005
    _SCSUClose,
2006
    _SCSUReset,
2007
2008
    _SCSUToUnicode,
2009
    _SCSUToUnicodeWithOffsets,
2010
    _SCSUFromUnicode,
2011
    _SCSUFromUnicodeWithOffsets,
2012
    nullptr, /* presumably getNextUChar */
2013
2014
    nullptr, /* presumably getStarters */
2015
    _SCSUGetName,
2016
    nullptr, /* presumably writeSub */
2017
    _SCSUSafeClone,
2018
    ucnv_getCompleteUnicodeSet,
2019
    nullptr, /* presumably toUTF8 */
2020
    nullptr /* presumably fromUTF8 */
2021
};
2022
2023
/*
 * Static (immutable) description of the SCSU converter.
 * The initializer is positional.
 * NOTE(review): field names mentioned in the comments below are assumed
 * from the usual UConverterStaticData layout, which is not visible here -
 * confirm against the declaration in ucnv_bld.h.
 */
static const UConverterStaticData _SCSUStaticData={
2024
    sizeof(UConverterStaticData), /* size of this structure */
2025
    "SCSU", /* converter name */
2026
    1212, /* CCSID for SCSU */
2027
    UCNV_IBM, UCNV_SCSU, /* presumably platform and conversion type */
2028
    1, 3, /* one char16_t generates at least 1 byte and at most 3 bytes */
2029
    /*
2030
     * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
2031
     * substitution string.
2032
     */
2033
    { 0x0e, 0xff, 0xfd, 0 }, 3, /* substitution bytes (0x0e is SQU, then U+fffd) and their length */
2034
    false, false, /* presumably the toUnicode/fromUnicode fallback flags */
2035
    0,
2036
    0,
2037
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
2038
};
2039
2040
/*
 * Shared data object for the SCSU converter (external linkage);
 * ties together the static description and the implementation table above.
 */
const UConverterSharedData _SCSUData=
2041
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_SCSUStaticData, &_SCSUImpl);
2042
2043
#endif