Coverage Report

Created: 2026-06-07 06:21

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/common/ucnv2022.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
**********************************************************************
5
*   Copyright (C) 2000-2016, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
**********************************************************************
8
*   file name:  ucnv2022.cpp
9
*   encoding:   UTF-8
10
*   tab size:   8 (not used)
11
*   indentation:4
12
*
13
*   created on: 2000feb03
14
*   created by: Markus W. Scherer
15
*
16
*   Change history:
17
*
18
*   06/29/2000  helena  Major rewrite of the callback APIs.
19
*   08/08/2000  Ram     Included support for ISO-2022-JP-2
20
*                       Changed implementation of toUnicode
21
*                       function
22
*   08/21/2000  Ram     Added support for ISO-2022-KR
23
*   08/29/2000  Ram     Separated implementation of EBCDIC to
24
*                       ucnvebdc.c
25
*   09/20/2000  Ram     Added support for ISO-2022-CN
26
*                       Added implementations for getNextUChar()
27
*                       for specific 2022 country variants.
28
*   10/31/2000  Ram     Implemented offsets logic functions
29
*/
30
31
#include "unicode/utypes.h"
32
33
#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
34
35
#include "unicode/ucnv.h"
36
#include "unicode/uset.h"
37
#include "unicode/ucnv_err.h"
38
#include "unicode/ucnv_cb.h"
39
#include "unicode/utf16.h"
40
#include "ucnv_imp.h"
41
#include "ucnv_bld.h"
42
#include "ucnv_cnv.h"
43
#include "ucnvmbcs.h"
44
#include "cstring.h"
45
#include "cmemory.h"
46
#include "uassert.h"
47
48
#ifdef U_ENABLE_GENERIC_ISO_2022
49
/*
50
 * I am disabling the generic ISO-2022 converter after proposing to do so on
51
 * the icu mailing list two days ago.
52
 *
53
 * Reasons:
54
 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
55
 *    its designation sequences, single shifts with return to the previous state,
56
 *    switch-with-no-return to UTF-16BE or similar, etc.
57
 *    This is unlike the language-specific variants like ISO-2022-JP which
58
 *    require a much smaller repertoire of ISO-2022 features.
59
 *    These variants continue to be supported.
60
 * 2. I believe that no one is really using the generic ISO-2022 converter
61
 *    but rather always one of the language-specific variants.
62
 *    Note that ICU's generic ISO-2022 converter has always output one escape
63
 *    sequence followed by UTF-8 for the whole stream.
64
 * 3. Switching between subcharsets is extremely slow, because each time
65
 *    the previous converter is closed and a new one opened,
66
 *    without any kind of caching, least-recently-used list, etc.
67
 * 4. The code is currently buggy, and given the above it does not seem
68
 *    reasonable to spend the time on maintenance.
69
 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
70
 *    This means, for example, that when ISO-8859-7 is designated, the following
71
 *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
72
 *    The ICU ISO-2022 converter does not handle this - and has no information
73
 *    about which subconverter would have to be shifted vs. which is designed
74
 *    for 7-bit ISO-2022.
75
 *
76
 * Markus Scherer 2003-dec-03
77
 */
78
#endif
79
80
#if !UCONFIG_ONLY_HTML_CONVERSION
81
static const char SHIFT_IN_STR[]  = "\x0F";
82
// static const char SHIFT_OUT_STR[] = "\x0E";
83
#endif
84
85
16.5M
#define CR      0x0D
86
8.32M
#define LF      0x0A
87
#define H_TAB   0x09
88
#define V_TAB   0x0B
89
#define SPACE   0x20
90
91
enum {
92
    HWKANA_START=0xff61,
93
    HWKANA_END=0xff9f
94
};
95
96
/*
97
 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
98
 * as bytes 21..7E. (Subtract 0x80.)
99
 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
100
 * as bytes 20..7F. (Subtract 0x80.)
101
 * Do not encode C1 control codes with native bytes 80..9F
102
 * as bytes 00..1F (C0 control codes).
103
 */
104
enum {
105
    GR94_START=0xa1,
106
    GR94_END=0xfe,
107
    GR96_START=0xa0,
108
    GR96_END=0xff
109
};
110
111
/*
112
 * ISO 2022 control codes must not be converted from Unicode
113
 * because they would mess up the byte stream.
114
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
115
 * corresponding to SO, SI, and ESC.
116
 */
117
21.7M
/*
 * True if c is SO (0x0e), SI (0x0f) or ESC (0x1b).
 * The range test is done by masking rather than with (c)<0x20 so that
 * negative values are rejected as well; otherwise they would reach the
 * shift below with a negative shift count.
 * Note: c is evaluated more than once.
 */
#define IS_2022_CONTROL(c) ((((c)&~0x1f)==0) && (((uint32_t)1<<(c))&0x0800c000)!=0)
118
119
/* for ISO-2022-JP and -CN implementations */
120
typedef enum {
    /* ---- values shared by the JP and CN implementations ---- */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE = 0x10,
    SS3_STATE = 0x11,

    /* ---- ISO-2022-JP charsets ---- */
    ISO8859_1   = 1,
    ISO8859_7   = 2,
    JISX201     = 3,
    JISX208     = 4,
    JISX212     = 5,
    GB2312      = 6,
    KSC5601     = 7,
    HWKANA_7BIT = 8,    /* Halfwidth Katakana 7 bit */

    /* ---- ISO-2022-CN charsets ---- */
    /*
     * These first constants double as indexes into myConverterArray[],
     * so their values must not change.
     */
    GB2312_1   = 1,
    ISO_IR_165 = 2,
    CNS_11643  = 3,

    /*
     * The per-plane values below appear in StateEnum and ISO2022State
     * variables; when indexing myConverterArray[] use CNS_11643 instead.
     */
    CNS_11643_0 = 0x20,
    CNS_11643_1 = 0x21,
    CNS_11643_2 = 0x22,
    CNS_11643_3 = 0x23,
    CNS_11643_4 = 0x24,
    CNS_11643_5 = 0x25,
    CNS_11643_6 = 0x26,
    CNS_11643_7 = 0x27
} StateEnum;
157
158
/* is the StateEnum charset value for a DBCS charset? */
159
#if UCONFIG_ONLY_HTML_CONVERSION
160
#define IS_JP_DBCS(cs) (JISX208==(cs))
161
#else
162
393
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
163
#endif
164
165
142M
#define CSM(cs) ((uint16_t)1<<(cs))
166
167
/*
168
 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
169
 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
170
 *
171
 * Note: The converter uses some leniency:
172
 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
173
 *   all versions, not just JIS7 and JIS8.
174
 * - ICU does not distinguish between different versions of JIS X 0208.
175
 */
176
#if UCONFIG_ONLY_HTML_CONVERSION
177
enum { MAX_JA_VERSION=0 };
178
#else
179
enum { MAX_JA_VERSION=4 };
180
#endif
181
static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
182
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
183
#if !UCONFIG_ONLY_HTML_CONVERSION
184
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
185
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
186
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
187
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
188
#endif
189
};
190
191
/* Kind of converter currently selected (stored in UConverterDataISO2022::currentType). */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
199
200
/* Designation/shift state for one conversion direction. */
typedef struct ISO2022State {
    /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t cs[4];
    /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* g before single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
205
206
2.29k
#define UCNV_OPTIONS_VERSION_MASK 0xf
207
25.2k
#define UCNV_2022_MAX_CONVERTERS 10
208
209
typedef struct{
210
    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
211
    UConverter *currentConverter;
212
    Cnv2022Type currentType;
213
    ISO2022State toU2022State, fromU2022State;
214
    uint32_t key;
215
    uint32_t version;
216
#ifdef U_ENABLE_GENERIC_ISO_2022
217
    UBool isFirstBuffer;
218
#endif
219
    UBool isEmptySegment;
220
    char name[30];
221
    char locale[3];
222
}UConverterDataISO2022;
223
224
/* Protos */
225
/* ISO-2022 ----------------------------------------------------------------- */
226
227
/*Forward declaration */
228
U_CFUNC void U_CALLCONV
229
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
230
                      UErrorCode * err);
231
U_CFUNC void U_CALLCONV
232
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
233
                                    UErrorCode * err);
234
235
455k
#define ESC_2022 0x1B /*ESC*/
236
237
/* Result of looking up a (partial) escape sequence in the state tables. */
typedef enum {
    /* does not correspond to a valid ISO 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far corresponds to a valid ISO 2022 escape sequence */
    VALID_NON_TERMINAL_2022 = 0,
    /* corresponds to a valid ISO 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /*
     * so far matches one ISO 2022 escape sequence, but by adding more
     * characters might match another escape sequence
     */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
244
245
/*
246
* The way these state transition arrays work is:
247
* ex : ESC$B is the sequence for JISX208
248
*      a) First Iteration: char is ESC
249
*          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
250
*             int x = normalize_esq_chars_2022[27] which is equal to 1
251
*         ii) Search for this value in escSeqStateTable_Key_2022[]
252
*             value of x is stored at escSeqStateTable_Key_2022[0]
253
*        iii) Save this index as offset
254
*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
255
*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
256
*     b) Switch on this state and continue to next char
257
*          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
258
*             which is normalize_esq_chars_2022[36] == 4
259
*         ii) x is currently 1(from above)
260
*               x<<=5 -- x is now 32
261
*               x+=normalize_esq_chars_2022[36]
262
*               now x is 36
263
*        iii) Search for this value in escSeqStateTable_Key_2022[]
264
*             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
265
*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
266
*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
267
*     c) Switch on this state and continue to next char
268
*        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
269
*        ii) x is currently 36 (from above)
270
*            x<<=5 -- x is now 1152
271
*            x+=normalize_esq_chars_2022[66]
272
*            now x is 1161
273
*       iii) Search for this value in escSeqStateTable_Key_2022[]
274
*            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
275
*        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
276
*            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
277
*         v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
278
*/
279
280
281
/*Below are the 3 arrays depicting a state transition table*/
282
static const int8_t normalize_esq_chars_2022[256] = {
283
/*       0      1       2       3       4      5       6        7       8       9           */
284
285
         0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
286
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
287
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
288
        ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
289
        ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
290
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
291
        ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
292
        ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
293
        ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
294
        ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
295
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
296
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
297
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
298
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
299
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
300
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
301
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
302
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
303
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
304
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
305
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
306
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
307
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
308
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
309
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
310
        ,0     ,0      ,0      ,0      ,0      ,0
311
};
312
313
#ifdef U_ENABLE_GENERIC_ISO_2022
314
/*
315
 * When the generic ISO-2022 converter is completely removed, not just disabled
316
 * per #ifdef, then the following state table and the associated tables that are
317
 * dimensioned with MAX_STATES_2022 should be trimmed.
318
 *
319
 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
320
 * the associated escape sequences starting with ESC ( B should be removed.
321
 * This includes the ones with key values 1097 and all of the ones above 1000000.
322
 *
323
 * For the latter, the tables can simply be truncated.
324
 * For the former, since the tables must be kept parallel, it is probably best
325
 * to simply duplicate an adjacent table cell, parallel in all tables.
326
 *
327
 * It may make sense to restructure the tables, especially by using small search
328
 * tables for the variants instead of indexing them parallel to the table here.
329
 */
330
#endif
331
332
143k
#define MAX_STATES_2022 74
333
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
334
/*   0           1           2           3           4           5           6           7           8           9           */
335
336
     1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
337
    ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
338
    ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
339
    ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
340
    ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
341
    ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
342
    ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
343
    ,35947631   ,35947635   ,35947636   ,35947638
344
};
345
346
#ifdef U_ENABLE_GENERIC_ISO_2022
347
348
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
349
 /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */
350
351
     nullptr                   ,nullptr                   ,nullptr                   ,nullptr               ,nullptr               ,nullptr                   ,nullptr                   ,nullptr                   ,"latin1"               ,"latin1"
352
    ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
353
    ,"latin1"               ,nullptr                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,nullptr                   ,nullptr                   ,nullptr                   ,nullptr                   ,"UTF8"
354
    ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,nullptr               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
355
    ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
356
    ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
357
    ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,nullptr               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
358
    ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
359
};
360
361
#endif
362
363
static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
364
/*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
365
     VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
366
    ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
367
    ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
368
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
369
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
370
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
371
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
372
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
373
};
374
375
/* Type def for refactoring changeState_2022 code*/
376
/* Identifies the ISO-2022 variant; which constants exist depends on the build configuration. */
typedef enum {
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022 = 0,
#endif

    ISO_2022_JP = 1,

#if !UCONFIG_ONLY_HTML_CONVERSION
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
#endif
} Variant2022;
386
387
/*********** ISO 2022 Converter Protos ***********/
388
static void U_CALLCONV
389
_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
390
391
static void U_CALLCONV
392
 _ISO2022Close(UConverter *converter);
393
394
static void U_CALLCONV
395
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
396
397
U_CDECL_BEGIN
398
static const char * U_CALLCONV
399
_ISO2022getName(const UConverter* cnv);
400
U_CDECL_END
401
402
static void  U_CALLCONV
403
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
404
405
U_CDECL_BEGIN
406
static UConverter * U_CALLCONV
407
_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
408
409
U_CDECL_END
410
411
#ifdef U_ENABLE_GENERIC_ISO_2022
412
static void U_CALLCONV
413
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
414
#endif
415
416
namespace {
417
418
/*const UConverterSharedData _ISO2022Data;*/
419
extern const UConverterSharedData _ISO2022JPData;
420
421
#if !UCONFIG_ONLY_HTML_CONVERSION
422
extern const UConverterSharedData _ISO2022KRData;
423
extern const UConverterSharedData _ISO2022CNData;
424
#endif
425
426
}  // namespace
427
428
/*************** Converter implementations ******************/
429
430
/* The purpose of this function is to get around gcc compiler warnings. */
431
static inline void
432
fromUWriteUInt8(UConverter *cnv,
433
                 const char *bytes, int32_t length,
434
                 uint8_t **target, const char *targetLimit,
435
                 int32_t **offsets,
436
                 int32_t sourceIndex,
437
                 UErrorCode *pErrorCode)
438
19.8M
{
439
19.8M
    char* targetChars = reinterpret_cast<char*>(*target);
440
19.8M
    ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
441
19.8M
                         offsets, sourceIndex, pErrorCode);
442
19.8M
    *target = reinterpret_cast<uint8_t*>(targetChars);
443
444
19.8M
}
445
446
static inline void
447
800
setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
448
800
    if(myConverterData->version == 1) {
449
69
        UConverter *cnv = myConverterData->currentConverter;
450
451
69
        cnv->toUnicodeStatus=0;     /* offset */
452
69
        cnv->mode=0;                /* state */
453
69
        cnv->toULength=0;           /* byteIndex */
454
69
    }
455
800
}
456
457
static inline void
458
1.08k
setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
459
   /* in ISO-2022-KR the designator sequence appears only once
460
    * in a file so we append it only once
461
    */
462
1.08k
    if( converter->charErrorBufferLength==0){
463
464
1.08k
        converter->charErrorBufferLength = 4;
465
1.08k
        converter->charErrorBuffer[0] = 0x1b;
466
1.08k
        converter->charErrorBuffer[1] = 0x24;
467
1.08k
        converter->charErrorBuffer[2] = 0x29;
468
1.08k
        converter->charErrorBuffer[3] = 0x43;
469
1.08k
    }
470
1.08k
    if(myConverterData->version == 1) {
471
205
        UConverter *cnv = myConverterData->currentConverter;
472
473
205
        cnv->fromUChar32=0;
474
205
        cnv->fromUnicodeStatus=1;   /* prevLength */
475
205
    }
476
1.08k
}
477
478
static void U_CALLCONV
479
2.29k
_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
480
481
2.29k
    char myLocale[7]={' ',' ',' ',' ',' ',' ', '\0'};
482
483
2.29k
    cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
484
2.29k
    if(cnv->extraInfo != nullptr) {
485
2.29k
        UConverterNamePieces stackPieces;
486
2.29k
        UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
487
2.29k
        UConverterDataISO2022* myConverterData = static_cast<UConverterDataISO2022*>(cnv->extraInfo);
488
2.29k
        uint32_t version;
489
490
2.29k
        stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
491
492
2.29k
        uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
493
2.29k
        myConverterData->currentType = ASCII1;
494
2.29k
        cnv->fromUnicodeStatus =false;
495
2.29k
        if(pArgs->locale){
496
2.29k
            uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)-1);
497
2.29k
        }
498
2.29k
        version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
499
2.29k
        myConverterData->version = version;
500
2.29k
        if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
501
985
            (myLocale[2]=='_' || myLocale[2]=='\0'))
502
985
        {
503
            /* open the required converters and cache them */
504
985
            if(version>MAX_JA_VERSION) {
505
                // ICU 55 fails to open a converter for an unsupported version.
506
                // Previously, it fell back to version 0, but that would yield
507
                // unexpected behavior.
508
0
                *errorCode = U_MISSING_RESOURCE_ERROR;
509
0
                return;
510
0
            }
511
985
            if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
512
420
                myConverterData->myConverterArray[ISO8859_7] =
513
420
                    ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
514
420
            }
515
985
            myConverterData->myConverterArray[JISX208] =
516
985
                ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
517
985
            if(jpCharsetMasks[version]&CSM(JISX212)) {
518
453
                myConverterData->myConverterArray[JISX212] =
519
453
                    ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
520
453
            }
521
985
            if(jpCharsetMasks[version]&CSM(GB2312)) {
522
420
                myConverterData->myConverterArray[GB2312] =
523
420
                    ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);   /* gb_2312_80-1 */
524
420
            }
525
985
            if(jpCharsetMasks[version]&CSM(KSC5601)) {
526
420
                myConverterData->myConverterArray[KSC5601] =
527
420
                    ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
528
420
            }
529
530
            /* set the function pointers to appropriate functions */
531
985
            cnv->sharedData = const_cast<UConverterSharedData*>(&_ISO2022JPData);
532
985
            uprv_strcpy(myConverterData->locale,"ja");
533
534
985
            (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
535
985
            size_t len = uprv_strlen(myConverterData->name);
536
985
            myConverterData->name[len] = static_cast<char>(myConverterData->version + static_cast<int>('0'));
537
985
            myConverterData->name[len+1]='\0';
538
985
        }
539
1.30k
#if !UCONFIG_ONLY_HTML_CONVERSION
540
1.30k
        else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
541
540
            (myLocale[2]=='_' || myLocale[2]=='\0'))
542
540
        {
543
540
            if(version>1) {
544
                // ICU 55 fails to open a converter for an unsupported version.
545
                // Previously, it fell back to version 0, but that would yield
546
                // unexpected behavior.
547
0
                *errorCode = U_MISSING_RESOURCE_ERROR;
548
0
                return;
549
0
            }
550
540
            const char *cnvName;
551
540
            if(version==1) {
552
70
                cnvName="icu-internal-25546";
553
470
            } else {
554
470
                cnvName="ibm-949";
555
470
                myConverterData->version=version=0;
556
470
            }
557
540
            if(pArgs->onlyTestIsLoadable) {
558
2
                ucnv_canCreateConverter(cnvName, errorCode);  /* errorCode carries result */
559
2
                uprv_free(cnv->extraInfo);
560
2
                cnv->extraInfo=nullptr;
561
2
                return;
562
538
            } else {
563
538
                myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
564
538
                if (U_FAILURE(*errorCode)) {
565
0
                    _ISO2022Close(cnv);
566
0
                    return;
567
0
                }
568
569
538
                if(version==1) {
570
69
                    (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
571
69
                    uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
572
69
                    cnv->subCharLen = myConverterData->currentConverter->subCharLen;
573
469
                }else{
574
469
                    (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
575
469
                }
576
577
                /* initialize the state variables */
578
538
                setInitialStateToUnicodeKR(cnv, myConverterData);
579
538
                setInitialStateFromUnicodeKR(cnv, myConverterData);
580
581
                /* set the function pointers to appropriate functions */
582
538
                cnv->sharedData = const_cast<UConverterSharedData*>(&_ISO2022KRData);
583
538
                uprv_strcpy(myConverterData->locale,"ko");
584
538
            }
585
540
        }
586
768
        else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
587
768
            (myLocale[2]=='_' || myLocale[2]=='\0'))
588
768
        {
589
768
            if(version>2) {
590
                // ICU 55 fails to open a converter for an unsupported version.
591
                // Previously, it fell back to version 0, but that would yield
592
                // unexpected behavior.
593
0
                *errorCode = U_MISSING_RESOURCE_ERROR;
594
0
                return;
595
0
            }
596
597
            /* open the required converters and cache them */
598
768
            myConverterData->myConverterArray[GB2312_1] =
599
768
                ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
600
768
            if(version>=1) {
601
537
                myConverterData->myConverterArray[ISO_IR_165] =
602
537
                    ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
603
537
            }
604
768
            myConverterData->myConverterArray[CNS_11643] =
605
768
                ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
606
607
608
            /* set the function pointers to appropriate functions */
609
768
            cnv->sharedData = const_cast<UConverterSharedData*>(&_ISO2022CNData);
610
768
            uprv_strcpy(myConverterData->locale,"cn");
611
612
768
            if (version==0){
613
231
                myConverterData->version = 0;
614
231
                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
615
537
            }else if (version==1){
616
498
                myConverterData->version = 1;
617
498
                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
618
498
            }else {
619
39
                myConverterData->version = 2;
620
39
                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
621
39
            }
622
768
        }
623
0
#endif  // !UCONFIG_ONLY_HTML_CONVERSION
624
0
        else{
625
#ifdef U_ENABLE_GENERIC_ISO_2022
626
            myConverterData->isFirstBuffer = true;
627
628
            /* append the UTF-8 escape sequence */
629
            cnv->charErrorBufferLength = 3;
630
            cnv->charErrorBuffer[0] = 0x1b;
631
            cnv->charErrorBuffer[1] = 0x25;
632
            cnv->charErrorBuffer[2] = 0x42;
633
634
            cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
635
            /* initialize the state variables */
636
            uprv_strcpy(myConverterData->name,"ISO_2022");
637
#else
638
0
            *errorCode = U_MISSING_RESOURCE_ERROR;
639
            // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
640
            // data loading error code.
641
0
            return;
642
0
#endif
643
0
        }
644
645
2.29k
        cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
646
647
2.29k
        if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
648
8
            _ISO2022Close(cnv);
649
8
        }
650
2.29k
    } else {
651
0
        *errorCode = U_MEMORY_ALLOCATION_ERROR;
652
0
    }
653
2.29k
}
654
655
656
static void U_CALLCONV
657
2.29k
_ISO2022Close(UConverter *converter) {
658
2.29k
    UConverterDataISO2022* myData = static_cast<UConverterDataISO2022*>(converter->extraInfo);
659
2.29k
    UConverterSharedData **array = myData->myConverterArray;
660
2.29k
    int32_t i;
661
662
2.29k
    if (converter->extraInfo != nullptr) {
663
        /*close the array of converter pointers and free the memory*/
664
25.2k
        for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
665
22.9k
            if(array[i]!=nullptr) {
666
4.77k
                ucnv_unloadSharedDataIfReady(array[i]);
667
4.77k
            }
668
22.9k
        }
669
670
2.29k
        ucnv_close(myData->currentConverter);
671
672
2.29k
        if(!converter->isExtraLocal){
673
2.29k
            uprv_free (converter->extraInfo);
674
2.29k
            converter->extraInfo = nullptr;
675
2.29k
        }
676
2.29k
    }
677
2.29k
}
678
679
static void U_CALLCONV
680
3.44k
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
681
3.44k
    UConverterDataISO2022* myConverterData = static_cast<UConverterDataISO2022*>(converter->extraInfo);
682
3.44k
    if(choice<=UCNV_RESET_TO_UNICODE) {
683
1.10k
        uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
684
1.10k
        myConverterData->key = 0;
685
1.10k
        myConverterData->isEmptySegment = false;
686
1.10k
    }
687
3.44k
    if(choice!=UCNV_RESET_TO_UNICODE) {
688
2.33k
        uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
689
2.33k
    }
690
#ifdef U_ENABLE_GENERIC_ISO_2022
691
    if(myConverterData->locale[0] == 0){
692
        if(choice<=UCNV_RESET_TO_UNICODE) {
693
            myConverterData->isFirstBuffer = true;
694
            myConverterData->key = 0;
695
            if (converter->mode == UCNV_SO){
696
                ucnv_close (myConverterData->currentConverter);
697
                myConverterData->currentConverter=nullptr;
698
            }
699
            converter->mode = UCNV_SI;
700
        }
701
        if(choice!=UCNV_RESET_TO_UNICODE) {
702
            /* re-append UTF-8 escape sequence */
703
            converter->charErrorBufferLength = 3;
704
            converter->charErrorBuffer[0] = 0x1b;
705
            converter->charErrorBuffer[1] = 0x28;
706
            converter->charErrorBuffer[2] = 0x42;
707
        }
708
    }
709
    else
710
#endif
711
3.44k
    {
712
        /* reset the state variables */
713
3.44k
        if(myConverterData->locale[0] == 'k'){
714
808
            if(choice<=UCNV_RESET_TO_UNICODE) {
715
262
                setInitialStateToUnicodeKR(converter, myConverterData);
716
262
            }
717
808
            if(choice!=UCNV_RESET_TO_UNICODE) {
718
546
                setInitialStateFromUnicodeKR(converter, myConverterData);
719
546
            }
720
808
        }
721
3.44k
    }
722
3.44k
}
723
724
U_CDECL_BEGIN
725
726
static const char * U_CALLCONV
727
0
_ISO2022getName(const UConverter* cnv){
728
0
    if(cnv->extraInfo){
729
0
        UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
730
0
        return myData->name;
731
0
    }
732
0
    return nullptr;
733
0
}
734
735
U_CDECL_END
736
737
738
/*************** to unicode *******************/
739
/****************************************************************************
740
 * Recognized escape sequences are
741
 * <ESC>(B  ASCII
742
 * <ESC>.A  ISO-8859-1
743
 * <ESC>.F  ISO-8859-7
744
 * <ESC>(J  JISX-201
745
 * <ESC>(I  JISX-201
746
 * <ESC>$B  JISX-208
747
 * <ESC>$@  JISX-208
748
 * <ESC>$(D JISX-212
749
 * <ESC>$A  GB2312
750
 * <ESC>$(C KSC5601
751
 */
752
/*
 * ISO-2022-JP: state selected by a completed escape sequence.
 * changeState_2022() indexes this table with the offset reported by
 * getKey_2022(); INVALID_STATE marks sequences that JP does not support.
 */
static const int8_t nextStateToUnicodeJP[MAX_STATES_2022] = {
    /*  0.. 9 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, SS2_STATE,     INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 10..19 */ ASCII,         INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, JISX201,       HWKANA_7BIT,   JISX201,       INVALID_STATE,
    /* 20..29 */ INVALID_STATE, INVALID_STATE, JISX208,       GB2312,        JISX208,       INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 30..39 */ ISO8859_1,     ISO8859_7,     JISX208,       INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, KSC5601,       JISX212,       INVALID_STATE,
    /* 40..49 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 50..59 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 60..69 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 70..73 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE
};
763
764
#if !UCONFIG_ONLY_HTML_CONVERSION
765
/*************** to unicode *******************/
766
/*
 * ISO-2022-CN: state selected by a completed escape sequence.
 * changeState_2022() indexes this table with the offset reported by
 * getKey_2022(); INVALID_STATE marks sequences that CN does not support.
 */
static const int8_t nextStateToUnicodeCN[MAX_STATES_2022] = {
    /*  0.. 9 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, SS2_STATE,     SS3_STATE,     INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 10..19 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 20..29 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 30..39 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 40..49 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, GB2312_1,      INVALID_STATE, ISO_IR_165,
    /* 50..59 */ CNS_11643_1,   CNS_11643_2,   CNS_11643_3,   CNS_11643_4,   CNS_11643_5,   CNS_11643_6,   CNS_11643_7,   INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 60..69 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 70..73 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE
};
777
#endif
778
779
780
/*
 * Advances the escape-sequence lookup by one input byte.
 *
 * The byte is normalized through normalize_esq_chars_2022[]; a result of 0
 * means the byte cannot occur in any escape sequence.  Otherwise the new
 * search key is (*key << 5) + normalized value, and it is looked up in the
 * sorted escSeqStateTable_Key_2022[] by binary search.
 *
 * On a hit, *key receives the new key, *offset the table index, and the
 * matching entry of escSeqStateTable_Value_2022[] is returned.
 * On a miss, *key and *offset are set to 0 and INVALID_2022 is returned.
 */
static UCNV_TableStates_2022
getKey_2022(char c,int32_t* key,int32_t* offset){
    const int32_t normalized = normalize_esq_chars_2022[static_cast<uint8_t>(c)];
    if (normalized == 0) {
        /* not a valid character anywhere in an escape sequence */
        *key = 0;
        *offset = 0;
        return INVALID_2022;
    }

    const int32_t wanted = (*key << 5) + normalized;

    int32_t lower = 0;
    int32_t upper = MAX_STATES_2022;
    int32_t previousProbe = 0;

    while (upper != lower) {
        const int32_t probe = (upper + lower) >> 1;

        /* the midpoint stopped moving: the key is not in the table */
        if (probe == previousProbe) {
            break;
        }

        if (escSeqStateTable_Key_2022[probe] == wanted) {
            *key = wanted;
            *offset = probe;
            return static_cast<UCNV_TableStates_2022>(escSeqStateTable_Value_2022[probe]);
        }

        if (escSeqStateTable_Key_2022[probe] > wanted) {
            upper = probe;
        } else {
            lower = probe;
        }
        previousProbe = probe;
    }

    *key = 0;
    *offset = 0;
    return INVALID_2022;
}
822
823
/*runs through a state machine to determine the escape sequence - codepage correspondence
824
 */
825
static void
826
changeState_2022(UConverter* _this,
827
                const char** source,
828
                const char* sourceLimit,
829
                Variant2022 var,
830
54.0k
                UErrorCode* err){
831
54.0k
    UCNV_TableStates_2022 value;
832
54.0k
    UConverterDataISO2022* myData2022 = static_cast<UConverterDataISO2022*>(_this->extraInfo);
833
54.0k
    uint32_t key = myData2022->key;
834
54.0k
    int32_t offset = 0;
835
54.0k
    int8_t initialToULength = _this->toULength;
836
54.0k
    char c;
837
838
54.0k
    value = VALID_NON_TERMINAL_2022;
839
143k
    while (*source < sourceLimit) {
840
143k
        c = *(*source)++;
841
143k
        _this->toUBytes[_this->toULength++] = static_cast<uint8_t>(c);
842
143k
        value = getKey_2022(c, reinterpret_cast<int32_t*>(&key), &offset);
843
844
143k
        switch (value){
845
846
89.5k
        case VALID_NON_TERMINAL_2022 :
847
            /* continue with the loop */
848
89.5k
            break;
849
850
35.1k
        case VALID_TERMINAL_2022:
851
35.1k
            key = 0;
852
35.1k
            goto DONE;
853
854
17.9k
        case INVALID_2022:
855
17.9k
            goto DONE;
856
857
748
        case VALID_MAYBE_TERMINAL_2022:
858
#ifdef U_ENABLE_GENERIC_ISO_2022
859
            /* ESC ( B is ambiguous only for ISO_2022 itself */
860
            if(var == ISO_2022) {
861
                /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
862
                _this->toULength = 0;
863
864
                /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
865
866
                /* continue with the loop */
867
                value = VALID_NON_TERMINAL_2022;
868
                break;
869
            } else
870
#endif
871
748
            {
872
                /* not ISO_2022 itself, finish here */
873
748
                value = VALID_TERMINAL_2022;
874
748
                key = 0;
875
748
                goto DONE;
876
0
            }
877
143k
        }
878
143k
    }
879
880
54.0k
DONE:
881
54.0k
    myData2022->key = key;
882
883
54.0k
    if (value == VALID_NON_TERMINAL_2022) {
884
        /* indicate that the escape sequence is incomplete: key!=0 */
885
186
        return;
886
53.8k
    } else if (value == INVALID_2022 ) {
887
17.9k
        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
888
35.8k
    } else /* value == VALID_TERMINAL_2022 */ {
889
35.8k
        switch(var){
890
#ifdef U_ENABLE_GENERIC_ISO_2022
891
        case ISO_2022:
892
        {
893
            const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
894
            if(chosenConverterName == nullptr) {
895
                /* SS2 or SS3 */
896
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
897
                _this->toUCallbackReason = UCNV_UNASSIGNED;
898
                return;
899
            }
900
901
            _this->mode = UCNV_SI;
902
            ucnv_close(myData2022->currentConverter);
903
            myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
904
            if(U_SUCCESS(*err)) {
905
                myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
906
                _this->mode = UCNV_SO;
907
            }
908
            break;
909
        }
910
#endif
911
15.6k
        case ISO_2022_JP:
912
15.6k
            {
913
15.6k
                StateEnum tempState = static_cast<StateEnum>(nextStateToUnicodeJP[offset]);
914
15.6k
                switch(tempState) {
915
4.78k
                case INVALID_STATE:
916
4.78k
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
917
4.78k
                    break;
918
4.31k
                case SS2_STATE:
919
4.31k
                    if(myData2022->toU2022State.cs[2]!=0) {
920
0
                        if(myData2022->toU2022State.g<2) {
921
0
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
922
0
                        }
923
0
                        myData2022->toU2022State.g=2;
924
4.31k
                    } else {
925
                        /* illegal to have SS2 before a matching designator */
926
4.31k
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
927
4.31k
                    }
928
4.31k
                    break;
929
                /* case SS3_STATE: not used in ISO-2022-JP-x */
930
615
                case ISO8859_1:
931
944
                case ISO8859_7:
932
944
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
933
944
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
934
944
                    } else {
935
                        /* G2 charset for SS2 */
936
0
                        myData2022->toU2022State.cs[2] = static_cast<int8_t>(tempState);
937
0
                    }
938
944
                    break;
939
5.59k
                default:
940
5.59k
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
941
220
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
942
5.37k
                    } else {
943
                        /* G0 charset */
944
5.37k
                        myData2022->toU2022State.cs[0] = static_cast<int8_t>(tempState);
945
5.37k
                    }
946
5.59k
                    break;
947
15.6k
                }
948
15.6k
            }
949
15.6k
            break;
950
15.6k
#if !UCONFIG_ONLY_HTML_CONVERSION
951
18.5k
        case ISO_2022_CN:
952
18.5k
            {
953
18.5k
                StateEnum tempState = static_cast<StateEnum>(nextStateToUnicodeCN[offset]);
954
18.5k
                switch(tempState) {
955
4.68k
                case INVALID_STATE:
956
4.68k
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
957
4.68k
                    break;
958
5.43k
                case SS2_STATE:
959
5.43k
                    if(myData2022->toU2022State.cs[2]!=0) {
960
2.76k
                        if(myData2022->toU2022State.g<2) {
961
1.88k
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
962
1.88k
                        }
963
2.76k
                        myData2022->toU2022State.g=2;
964
2.76k
                    } else {
965
                        /* illegal to have SS2 before a matching designator */
966
2.67k
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
967
2.67k
                    }
968
5.43k
                    break;
969
2.85k
                case SS3_STATE:
970
2.85k
                    if(myData2022->toU2022State.cs[3]!=0) {
971
1.59k
                        if(myData2022->toU2022State.g<2) {
972
1.09k
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
973
1.09k
                        }
974
1.59k
                        myData2022->toU2022State.g=3;
975
1.59k
                    } else {
976
                        /* illegal to have SS3 before a matching designator */
977
1.25k
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
978
1.25k
                    }
979
2.85k
                    break;
980
652
                case ISO_IR_165:
981
652
                    if(myData2022->version==0) {
982
213
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
983
213
                        break;
984
213
                    }
985
439
                    U_FALLTHROUGH;
986
1.80k
                case GB2312_1:
987
1.80k
                    U_FALLTHROUGH;
988
2.36k
                case CNS_11643_1:
989
2.36k
                    myData2022->toU2022State.cs[1] = static_cast<int8_t>(tempState);
990
2.36k
                    break;
991
1.80k
                case CNS_11643_2:
992
1.80k
                    myData2022->toU2022State.cs[2] = static_cast<int8_t>(tempState);
993
1.80k
                    break;
994
1.16k
                default:
995
                    /* other CNS 11643 planes */
996
1.16k
                    if(myData2022->version==0) {
997
621
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
998
621
                    } else {
999
543
                        myData2022->toU2022State.cs[3] = static_cast<int8_t>(tempState);
1000
543
                    }
1001
1.16k
                    break;
1002
18.5k
                }
1003
18.5k
            }
1004
18.5k
            break;
1005
18.5k
        case ISO_2022_KR:
1006
1.72k
            if(offset==0x30){
1007
                /* nothing to be done, just accept this one escape sequence */
1008
1.32k
            } else {
1009
1.32k
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
1010
1.32k
            }
1011
1.72k
            break;
1012
0
#endif  // !UCONFIG_ONLY_HTML_CONVERSION
1013
1014
0
        default:
1015
0
            *err = U_ILLEGAL_ESCAPE_SEQUENCE;
1016
0
            break;
1017
35.8k
        }
1018
35.8k
    }
1019
53.8k
    if(U_SUCCESS(*err)) {
1020
14.8k
        _this->toULength = 0;
1021
38.9k
    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
1022
26.1k
        if(_this->toULength>1) {
1023
            /*
1024
             * Ticket 5691: consistent illegal sequences:
1025
             * - We include at least the first byte (ESC) in the illegal sequence.
1026
             * - If any of the non-initial bytes could be the start of a character,
1027
             *   we stop the illegal sequence before the first one of those.
1028
             *   In escape sequences, all following bytes are "printable", that is,
1029
             *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
1030
             *   they are valid single/lead bytes.
1031
             *   For simplicity, we always only report the initial ESC byte as the
1032
             *   illegal sequence and back out all other bytes we looked at.
1033
             */
1034
            /* Back out some bytes. */
1035
26.1k
            int8_t backOutDistance=_this->toULength-1;
1036
26.1k
            int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1037
26.1k
            if(backOutDistance<=bytesFromThisBuffer) {
1038
                /* same as initialToULength<=1 */
1039
26.1k
                *source-=backOutDistance;
1040
26.1k
            } else {
1041
                /* Back out bytes from the previous buffer: Need to replay them. */
1042
0
                _this->preToULength = static_cast<int8_t>(bytesFromThisBuffer - backOutDistance);
1043
                /* same as -(initialToULength-1) */
1044
                /* preToULength is negative! */
1045
0
                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
1046
0
                *source-=bytesFromThisBuffer;
1047
0
            }
1048
26.1k
            _this->toULength=1;
1049
26.1k
        }
1050
26.1k
    } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1051
12.7k
        _this->toUCallbackReason = UCNV_UNASSIGNED;
1052
12.7k
    }
1053
53.8k
}
1054
1055
#if !UCONFIG_ONLY_HTML_CONVERSION
1056
/*Checks the characters of the buffer against valid 2022 escape sequences
1057
*if the match we return a pointer to the initial start of the sequence otherwise
1058
*we return sourceLimit
1059
*/
1060
/*for 2022 looks ahead in the stream
1061
 *to determine the longest possible convertible
1062
 *data stream
1063
 */
1064
static inline const char*
1065
getEndOfBuffer_2022(const char** source,
1066
                   const char* sourceLimit,
1067
0
                   UBool /*flush*/){
1068
1069
0
    const char* mySource = *source;
1070
1071
#ifdef U_ENABLE_GENERIC_ISO_2022
1072
    if (*source >= sourceLimit)
1073
        return sourceLimit;
1074
1075
    do{
1076
1077
        if (*mySource == ESC_2022){
1078
            int8_t i;
1079
            int32_t key = 0;
1080
            int32_t offset;
1081
            UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1082
1083
            /* Kludge: I could not
1084
            * figure out the reason for validating an escape sequence
1085
            * twice - once here and once in changeState_2022().
1086
            * is it possible to have an ESC character in a ISO2022
1087
            * byte stream which is valid in a code page? Is it legal?
1088
            */
1089
            for (i=0;
1090
            (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1091
            i++) {
1092
                value =  getKey_2022(*(mySource+i), &key, &offset);
1093
            }
1094
            if (value > 0 || *mySource==ESC_2022)
1095
                return mySource;
1096
1097
            if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1098
                return sourceLimit;
1099
        }
1100
    }while (++mySource < sourceLimit);
1101
1102
    return sourceLimit;
1103
#else
1104
0
    while(mySource < sourceLimit && *mySource != ESC_2022) {
1105
0
        ++mySource;
1106
0
    }
1107
0
    return mySource;
1108
0
#endif
1109
0
}
1110
#endif
1111
1112
/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1113
 * any future change in _MBCSFromUChar32() function should be reflected here.
1114
 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1115
 */
1116
static inline int32_t
1117
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1118
                                         UChar32 c,
1119
                                         uint32_t* value,
1120
                                         UBool useFallback,
1121
                                         int outputType)
1122
65.1M
{
1123
65.1M
    const int32_t *cx;
1124
65.1M
    const uint16_t *table;
1125
65.1M
    uint32_t stage2Entry;
1126
65.1M
    uint32_t myValue;
1127
65.1M
    int32_t length;
1128
65.1M
    const uint8_t *p;
1129
    /*
1130
     * TODO(markus): Use and require new, faster MBCS conversion table structures.
1131
     * Use internal version of ucnv_open() that verifies that the new structures are available,
1132
     * else U_INTERNAL_PROGRAM_ERROR.
1133
     */
1134
    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1135
65.1M
    if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1136
65.1M
        table=sharedData->mbcs.fromUnicodeTable;
1137
65.1M
        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1138
        /* get the bytes and the length for the output */
1139
65.1M
        if(outputType==MBCS_OUTPUT_2){
1140
52.6M
            myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1141
52.6M
            if(myValue<=0xff) {
1142
25.2M
                length=1;
1143
27.4M
            } else {
1144
27.4M
                length=2;
1145
27.4M
            }
1146
52.6M
        } else /* outputType==MBCS_OUTPUT_3 */ {
1147
12.4M
            p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1148
12.4M
            myValue = (static_cast<uint32_t>(*p) << 16) | (static_cast<uint32_t>(p[1]) << 8) | p[2];
1149
12.4M
            if(myValue<=0xff) {
1150
6.23M
                length=1;
1151
6.23M
            } else if(myValue<=0xffff) {
1152
0
                length=2;
1153
6.23M
            } else {
1154
6.23M
                length=3;
1155
6.23M
            }
1156
12.4M
        }
1157
        /* is this code point assigned, or do we use fallbacks? */
1158
65.1M
        if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1159
            /* assigned */
1160
33.7M
            *value=myValue;
1161
33.7M
            return length;
1162
33.7M
        } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1163
            /*
1164
             * We allow a 0 byte output if the "assigned" bit is set for this entry.
1165
             * There is no way with this data structure for fallback output
1166
             * to be a zero byte.
1167
             */
1168
0
            *value=myValue;
1169
0
            return -length;
1170
0
        }
1171
65.1M
    }
1172
1173
31.4M
    cx=sharedData->mbcs.extIndexes;
1174
31.4M
    if(cx!=nullptr) {
1175
10.8M
        return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1176
10.8M
    }
1177
1178
    /* unassigned */
1179
20.6M
    return 0;
1180
31.4M
}
1181
1182
/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1183
 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1184
 * @param retval pointer to output byte
1185
 * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
1186
 */
1187
static inline int32_t
1188
MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1189
                                       UChar32 c,
1190
                                       uint32_t* retval,
1191
                                       UBool useFallback)
1192
7.45M
{
1193
7.45M
    const uint16_t *table;
1194
7.45M
    int32_t value;
1195
    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1196
7.45M
    if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1197
346
        return 0;
1198
346
    }
1199
    /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1200
7.45M
    table=sharedData->mbcs.fromUnicodeTable;
1201
    /* get the byte for the output */
1202
7.45M
    value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1203
    /* is this code point assigned, or do we use fallbacks? */
1204
7.45M
    *retval = static_cast<uint32_t>(value & 0xff);
1205
7.45M
    if(value>=0xf00) {
1206
490
        return 1;  /* roundtrip */
1207
7.45M
    } else if(useFallback ? value>=0x800 : value>=0xc00) {
1208
0
        return -1;  /* fallback taken */
1209
7.45M
    } else {
1210
7.45M
        return 0;  /* no mapping */
1211
7.45M
    }
1212
7.45M
}
1213
1214
/*
 * Maps a strict EUC DBCS value (two bytes, each in A1..FE) to the ISO 2022
 * range 21..7E by subtracting 0x80 from each byte.
 * The range test looks at the low 16 bits as a pair and at the low 8 bits as
 * the trail byte.
 * Returns 0 if the value is out of range.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint16_t pairOffset = static_cast<uint16_t>(value - 0xa1a1);
    const uint8_t trailOffset = static_cast<uint8_t>(value - 0xa1);
    const bool isGR94Pair =
        pairOffset <= (0xfefe - 0xa1a1) && trailOffset <= (0xfe - 0xa1);

    /* shift down to the 21..7e byte range, or report "not valid for ISO 2022" */
    return isGR94Pair ? value - 0x8080 : 0;
}
1230
1231
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
1232
/*
 * Reverse of _2022FromGR94DBCS(): adds 0x80 to each byte of the 2022 code
 * point.  If both resulting bytes are in A1..FE the shifted value is
 * returned; otherwise the 2022 code point is returned unchanged.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    const uint32_t shifted = value + 0x8080;
    const uint16_t pairOffset = (uint16_t)(shifted - 0xa1a1);
    const uint8_t trailOffset = (uint8_t)(shifted - 0xa1);

    if (pairOffset > (0xfefe - 0xa1a1) || trailOffset > (0xfe - 0xa1)) {
        return value;
    }
    return shifted;
}
1247
#endif
1248
1249
#ifdef U_ENABLE_GENERIC_ISO_2022
1250
1251
/**********************************************************************************
1252
*  ISO-2022 Converter
1253
*
1254
*
1255
*/
1256
1257
static void U_CALLCONV
1258
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1259
                                                           UErrorCode* err){
1260
    const char* mySourceLimit, *realSourceLimit;
1261
    const char* sourceStart;
1262
    const char16_t* myTargetStart;
1263
    UConverter* saveThis;
1264
    UConverterDataISO2022* myData;
1265
    int8_t length;
1266
1267
    saveThis = args->converter;
1268
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1269
1270
    realSourceLimit = args->sourceLimit;
1271
    while (args->source < realSourceLimit) {
1272
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1273
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1274
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1275
1276
            if(args->source < mySourceLimit) {
1277
                if(myData->currentConverter==nullptr) {
1278
                    myData->currentConverter = ucnv_open("ASCII",err);
1279
                    if(U_FAILURE(*err)){
1280
                        return;
1281
                    }
1282
1283
                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1284
                    saveThis->mode = UCNV_SO;
1285
                }
1286
1287
                /* convert to before the ESC or until the end of the buffer */
1288
                myData->isFirstBuffer=false;
1289
                sourceStart = args->source;
1290
                myTargetStart = args->target;
1291
                args->converter = myData->currentConverter;
1292
                ucnv_toUnicode(args->converter,
1293
                    &args->target,
1294
                    args->targetLimit,
1295
                    &args->source,
1296
                    mySourceLimit,
1297
                    args->offsets,
1298
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
1299
                    err);
1300
                args->converter = saveThis;
1301
1302
                if (*err == U_BUFFER_OVERFLOW_ERROR) {
1303
                    /* move the overflow buffer */
1304
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1305
                    myData->currentConverter->UCharErrorBufferLength = 0;
1306
                    if(length > 0) {
1307
                        uprv_memcpy(saveThis->UCharErrorBuffer,
1308
                                    myData->currentConverter->UCharErrorBuffer,
1309
                                    length*U_SIZEOF_UCHAR);
1310
                    }
1311
                    return;
1312
                }
1313
1314
                /*
1315
                 * At least one of:
1316
                 * -Error while converting
1317
                 * -Done with entire buffer
1318
                 * -Need to write offsets or update the current offset
1319
                 *  (leave that up to the code in ucnv.c)
1320
                 *
1321
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
1322
                 */
1323
                if (U_FAILURE(*err) ||
1324
                    (args->source == realSourceLimit) ||
1325
                    (args->offsets != nullptr && (args->target != myTargetStart || args->source != sourceStart) ||
1326
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1327
                ) {
1328
                    /* copy partial or error input for truncated detection and error handling */
1329
                    if(U_FAILURE(*err)) {
1330
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1331
                        if(length > 0) {
1332
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1333
                        }
1334
                    } else {
1335
                        length = saveThis->toULength = myData->currentConverter->toULength;
1336
                        if(length > 0) {
1337
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1338
                            if(args->source < mySourceLimit) {
1339
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1340
                            }
1341
                        }
1342
                    }
1343
                    return;
1344
                }
1345
            }
1346
        }
1347
1348
        sourceStart = args->source;
1349
        changeState_2022(args->converter,
1350
               &(args->source),
1351
               realSourceLimit,
1352
               ISO_2022,
1353
               err);
1354
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != nullptr)) {
1355
            /* let the ucnv.c code update its current offset */
1356
            return;
1357
        }
1358
    }
1359
}
1360
1361
#endif
1362
1363
/*
1364
 * To Unicode Callback helper function
1365
 */
1366
static void
1367
toUnicodeCallback(UConverter *cnv,
1368
                  const uint32_t sourceChar, const uint32_t targetUniChar,
1369
2.24M
                  UErrorCode* err){
1370
2.24M
    if(sourceChar>0xff){
1371
537k
        cnv->toUBytes[0] = static_cast<uint8_t>(sourceChar >> 8);
1372
537k
        cnv->toUBytes[1] = static_cast<uint8_t>(sourceChar);
1373
537k
        cnv->toULength = 2;
1374
537k
    }
1375
1.70M
    else{
1376
1.70M
        cnv->toUBytes[0] = static_cast<char>(sourceChar);
1377
1.70M
        cnv->toULength = 1;
1378
1.70M
    }
1379
1380
2.24M
    if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1381
35.5k
        *err = U_INVALID_CHAR_FOUND;
1382
35.5k
    }
1383
2.20M
    else{
1384
2.20M
        *err = U_ILLEGAL_CHAR_FOUND;
1385
2.20M
    }
1386
2.24M
}
1387
1388
/**************************************ISO-2022-JP*************************************************/
1389
1390
/************************************** IMPORTANT **************************************************
1391
* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1392
* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1393
* The converter iterates over each Unicode codepoint
1394
* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1395
* processed one char at a time it would make sense to reduce the extra processing a canned converter
1396
* would do as far as possible.
1397
*
1398
* If the implementation of these macros or structure of sharedData struct change in the future, make
1399
* sure that ISO-2022 is also changed.
1400
***************************************************************************************************
1401
*/
1402
1403
/***************************************************************************************************
1404
* Rules for ISO-2022-jp encoding
1405
* (i)   Escape sequences must be fully contained within a line; they should not
1406
*       span new lines or CRs
1407
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
1408
*       JIS-Roman character escape sequence should follow before the line terminates
1409
* (iii) If the first character on the line is represented by two bytes then a two
1410
*       byte character escape sequence should precede it
1411
* (iv)  If no escape sequence is encountered then the characters are ASCII
1412
* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1413
*       and invoked with SS2 (ESC N).
1414
* (vi)  If there is any G0 designation in text, there must be a switch to
1415
*       ASCII or to JIS X 0201-Roman before a space character (but not
1416
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1417
*       characters such as tab or CRLF.
1418
* (vii) Supported encodings:
1419
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1420
*
1421
*  source : RFC-1554
1422
*
1423
*          JISX201, JISX208,JISX212 : new .cnv data files created
1424
*          KSC5601 : alias to ibm-949 mapping table
1425
*          GB2312 : alias to ibm-1386 mapping table
1426
*          ISO-8859-1 : Algorithmic implemented as LATIN1 case
1427
*          ISO-8859-7 : alias to ibm-9409 mapping table
1428
*/
1429
1430
/*
 * Preference order of JP charsets.
 * The from-Unicode code walks this array front to back when it builds its
 * list of candidate charsets, appending each entry that is still allowed by
 * the version's charset mask and has not been chosen already.
 */
1431
static const StateEnum jpCharsetPref[]={
1432
    ASCII,
1433
    JISX201,
1434
    ISO8859_1,
1435
    JISX208,
1436
    ISO8859_7,
1437
    JISX212,
1438
    GB2312,
1439
    KSC5601,
1440
    HWKANA_7BIT
1441
};
1442
1443
/*
1444
 * The escape sequences must be in order of the enum constants like JISX201  = 3,
1445
 * not in order of jpCharsetPref[]!
1446
 */
1447
/* Designation escape sequences; the from-Unicode code indexes this table with the charset value. */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */
};

/* Number of bytes in each escSeqChars[] entry (same index), excluding the NUL terminator. */
static  const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};

/* The two tables are always indexed together; catch an entry added to only one of them. */
static_assert(sizeof(escSeqChars) / sizeof(escSeqChars[0]) ==
              sizeof(escSeqCharsLen) / sizeof(escSeqCharsLen[0]),
              "escSeqChars[] and escSeqCharsLen[] must have the same number of entries");
1470
1471
/*
1472
* The iteration over various code pages works this way:
1473
* i)   Get the currentState from myConverterData->currentState
1474
* ii)  Check if the character is mapped to a valid character in the currentState
1475
*      Yes ->  a) set the initIterState to currentState
1476
*       b) remain in this state until an invalid character is found
1477
*      No  ->  a) go to the next code page and find the character
1478
* iii) Before changing the state increment the current state check if the current state
1479
*      is equal to the initial iteration state (initIterState)
1480
*      Yes ->  A character that cannot be represented in any of the supported encodings
1481
*       break and return a U_INVALID_CHAR_FOUND error
1482
*      No  ->  Continue and find the character in next code page
1483
*
1484
*
1485
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1486
*/
1487
1488
/*
 * JIS X 0201 (7-bit) to Unicode.
 * Only two values are remapped: 5C -> U+00A5 and 7E -> U+203E.
 * Every other input value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1501
1502
/*
 * Unicode to JIS X 0201 (7-bit).
 * U+00A5 -> 5C and U+203E -> 7E; U+005C and U+007E themselves have no mapping.
 * Any other code point up to U+007F maps to itself.
 * Returns 0xfffe for everything that cannot be mapped.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        return 0xfffe;
    default:
        return (value <= 0x7f) ? value : 0xfffe;
    }
}
1516
1517
/*
 * Convert a valid Shift-JIS byte pair (lead byte in bits 15..8, trail byte in
 * bits 7..0) to the corresponding pair of JIS X 0208 bytes in 21..7E.
 * Returns 0 if the pair lies beyond the JIS X 0208 range (value > 0xEFFC).
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    const uint8_t trail = static_cast<uint8_t>(value);

    /* rebase the lead byte (0x81..0x9f or 0xe0..0xef), still in bits 15..8, then double it */
    uint32_t result = value & 0xff00;
    result -= (result <= 0x9f00) ? 0x7000 : 0xb000;
    result <<= 1;

    if(trail > 0x9e) {
        /* trail 0x9f..0xfc: keep the doubled lead byte */
        result |= trail - 0x7e;
        return result;
    }

    /* trail <= 0x9e: one less than the doubled lead byte; trail 0x7f is skipped */
    result -= 0x100;
    result |= trail - ((trail <= 0x7e) ? 0x1f : 0x20);
    return result;
}
1552
1553
/*
 * Convert a pair of JIS X 0208 bytes (each expected in 21..7E) to Shift-JIS,
 * writing the lead byte to bytes[0] and the trail byte to bytes[1].
 * An out-of-range input byte must not produce a valid Shift-JIS pair, so that
 * the converter catches it: where the arithmetic would not already yield an
 * invalid byte, the corresponding output byte is set to 0.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    const bool oddLead = (c1 & 1) != 0;

    /* trail byte: stays 0 (invalid) unless c2 passes the range test for this case */
    uint8_t trail = 0;
    if(oddLead) {
        if(c2 <= 0x5f) {
            trail = static_cast<uint8_t>(c2 + 0x1f);
        } else if(c2 <= 0x7e) {
            trail = static_cast<uint8_t>(c2 + 0x20);
        }
    } else if(0x21 <= c2 && c2 <= 0x7e) {
        trail = static_cast<uint8_t>(c2 + 0x7e);
    }

    /* lead byte: round an odd value up (8-bit wraparound), halve, then rebase */
    const uint8_t rounded = static_cast<uint8_t>(c1 + (oddLead ? 1 : 0));
    const uint8_t half = static_cast<uint8_t>(rounded >> 1);
    uint8_t lead = 0;  /* stays 0 (invalid) if half > 0x3f */
    if(half <= 0x2f) {
        lead = static_cast<uint8_t>(half + 0x70);
    } else if(half <= 0x3f) {
        lead = static_cast<uint8_t>(half + 0xb0);
    }

    bytes[0] = static_cast<char>(lead);
    bytes[1] = static_cast<char>(trail);
}
1589
1590
/*
1591
 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1592
 * Katakana.
1593
 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1594
 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1595
 * These were the only fallbacks in ICU's jisx-208.ucm file.
1596
 */
1597
static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1598
    0x2123,  /* U+FF61 */
1599
    0x2156,
1600
    0x2157,
1601
    0x2122,
1602
    0x2126,
1603
    0x2572,
1604
    0x2521,
1605
    0x2523,
1606
    0x2525,
1607
    0x2527,
1608
    0x2529,
1609
    0x2563,
1610
    0x2565,
1611
    0x2567,
1612
    0x2543,
1613
    0x213C,  /* U+FF70 */
1614
    0x2522,
1615
    0x2524,
1616
    0x2526,
1617
    0x2528,
1618
    0x252A,
1619
    0x252B,
1620
    0x252D,
1621
    0x252F,
1622
    0x2531,
1623
    0x2533,
1624
    0x2535,
1625
    0x2537,
1626
    0x2539,
1627
    0x253B,
1628
    0x253D,
1629
    0x253F,  /* U+FF80 */
1630
    0x2541,
1631
    0x2544,
1632
    0x2546,
1633
    0x2548,
1634
    0x254A,
1635
    0x254B,
1636
    0x254C,
1637
    0x254D,
1638
    0x254E,
1639
    0x254F,
1640
    0x2552,
1641
    0x2555,
1642
    0x2558,
1643
    0x255B,
1644
    0x255E,
1645
    0x255F,  /* U+FF90 */
1646
    0x2560,
1647
    0x2561,
1648
    0x2562,
1649
    0x2564,
1650
    0x2566,
1651
    0x2568,
1652
    0x2569,
1653
    0x256A,
1654
    0x256B,
1655
    0x256C,
1656
    0x256D,
1657
    0x256F,
1658
    0x2573,
1659
    0x212B,
1660
    0x212C   /* U+FF9F */
1661
};
1662
1663
/*
 * From-Unicode conversion for the ISO-2022-JP variants, with optional offsets.
 *
 * Reads UTF-16 from args->source up to args->sourceLimit and writes bytes to
 * args->target up to args->targetLimit, one code point per loop iteration:
 *  1. Pair up surrogates (a pending lead surrogate from a previous call is
 *     picked up from cnv->fromUChar32).
 *  2. Reject SO/SI/ESC (IS_2022_CONTROL).
 *  3. If choices[] is empty, rebuild the candidate charset list: HWKANA_7BIT
 *     first for versions 3 and 4, then the current G0 charset, then the current
 *     G2 charset (if any), then the remaining charsets allowed by
 *     jpCharsetMasks[version] in jpCharsetPref[] order.
 *  4. Try the candidates in order; stop at the first roundtrip mapping, and
 *     otherwise keep the first fallback that was found.
 *  5. Output, as needed: SI, the designation escape sequence, SO or ESC N,
 *     and then the one or two data bytes.
 *
 * Errors set in *err (the loop stops at the first one):
 *  - U_ILLEGAL_CHAR_FOUND: unpaired surrogate or SO/SI/ESC in the input
 *  - U_INVALID_CHAR_FOUND: no candidate charset maps the code point
 *  - U_BUFFER_OVERFLOW_ERROR: no room left in the target
 * For the first two, the offending code point is stored in cnv->fromUChar32.
 *
 * When args->flush is set and all input was consumed without error, the
 * output is returned to ASCII (SI and/or the ASCII designation).
 *
 * On return, args->source and args->target are updated; args->offsets is
 * advanced only through the local copy passed to the write helpers.
 */
static void U_CALLCONV
1664
408k
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1665
408k
    UConverter *cnv = args->converter;
1666
408k
    UConverterDataISO2022 *converterData;
1667
408k
    ISO2022State *pFromU2022State;
1668
408k
    uint8_t* target = reinterpret_cast<uint8_t*>(args->target);
1669
408k
    const uint8_t* targetLimit = reinterpret_cast<const uint8_t*>(args->targetLimit);
1670
408k
    const char16_t* source = args->source;
1671
408k
    const char16_t* sourceLimit = args->sourceLimit;
1672
408k
    int32_t* offsets = args->offsets;
1673
408k
    UChar32 sourceChar;
    /*
     * Per-character output staging buffer. The longest output assembled below
     * appears to be SI + a 4-byte designation + 2 data bytes = 7 bytes.
     */
1674
408k
    char buffer[8];
1675
408k
    int32_t len, outLen;
    /* candidate charsets for the current state; choiceCount==0 means "rebuild" */
1676
408k
    int8_t choices[10];
1677
408k
    int32_t choiceCount;
1678
408k
    uint32_t targetValue = 0;
1679
408k
    UBool useFallback;
1680
1681
408k
    int32_t i;
1682
408k
    int8_t cs, g;
1683
1684
    /* set up the state */
1685
408k
    converterData = static_cast<UConverterDataISO2022*>(cnv->extraInfo);
1686
408k
    pFromU2022State   = &converterData->fromU2022State;
1687
1688
408k
    choiceCount = 0;
1689
1690
    /* check if the last codepoint of previous buffer was a lead surrogate*/
1691
408k
    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1692
0
        goto getTrail;
1693
0
    }
1694
1695
8.63M
    while(source < sourceLimit) {
1696
8.63M
        if(target < targetLimit) {
1697
1698
8.62M
            sourceChar  = *(source++);
1699
            /*check if the char is a First surrogate*/
1700
8.62M
            if(U16_IS_SURROGATE(sourceChar)) {
1701
1.80k
                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
1702
1.42k
getTrail:
1703
                    /*look ahead to find the trail surrogate*/
1704
1.42k
                    if(source < sourceLimit) {
1705
                        /* test the following code unit */
1706
1.41k
                        char16_t trail = *source;
1707
1.41k
                        if(U16_IS_TRAIL(trail)) {
1708
362
                            source++;
1709
362
                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
1710
362
                            cnv->fromUChar32=0x00;
1711
                            /* convert this supplementary code point */
1712
                            /* exit this condition tree */
1713
1.05k
                        } else {
1714
                            /* this is an unmatched lead code unit (1st surrogate) */
1715
                            /* callback(illegal) */
1716
1.05k
                            *err=U_ILLEGAL_CHAR_FOUND;
1717
1.05k
                            cnv->fromUChar32=sourceChar;
1718
1.05k
                            break;
1719
1.05k
                        }
1720
1.41k
                    } else {
1721
                        /* no more input */
                        /* remember the lead surrogate; the next call resumes at getTrail */
1722
15
                        cnv->fromUChar32=sourceChar;
1723
15
                        break;
1724
15
                    }
1725
1.42k
                } else {
1726
                    /* this is an unmatched trail code unit (2nd surrogate) */
1727
                    /* callback(illegal) */
1728
375
                    *err=U_ILLEGAL_CHAR_FOUND;
1729
375
                    cnv->fromUChar32=sourceChar;
1730
375
                    break;
1731
375
                }
1732
1.80k
            }
1733
1734
            /* do not convert SO/SI/ESC */
1735
8.62M
            if(IS_2022_CONTROL(sourceChar)) {
1736
                /* callback(illegal) */
1737
196
                *err=U_ILLEGAL_CHAR_FOUND;
1738
196
                cnv->fromUChar32=sourceChar;
1739
196
                break;
1740
196
            }
1741
1742
            /* do the conversion */
1743
1744
8.62M
            if(choiceCount == 0) {
1745
7.93M
                uint16_t csm;
1746
1747
                /*
1748
                 * The csm variable keeps track of which charsets are allowed
1749
                 * and not used yet while building the choices[].
1750
                 */
1751
7.93M
                csm = jpCharsetMasks[converterData->version];
1752
7.93M
                choiceCount = 0;
1753
1754
                /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1755
7.93M
                if(converterData->version == 3 || converterData->version == 4) {
1756
252k
                    choices[choiceCount++] = static_cast<int8_t>(HWKANA_7BIT);
1757
252k
                }
1758
                /* Do not try single-byte half-width Katakana for other versions. */
1759
7.93M
                csm &= ~CSM(HWKANA_7BIT);
1760
1761
                /* try the current G0 charset */
1762
7.93M
                choices[choiceCount++] = cs = pFromU2022State->cs[0];
1763
7.93M
                csm &= ~CSM(cs);
1764
1765
                /* try the current G2 charset */
1766
7.93M
                if((cs = pFromU2022State->cs[2]) != 0) {
1767
888k
                    choices[choiceCount++] = cs;
1768
888k
                    csm &= ~CSM(cs);
1769
888k
                }
1770
1771
                /* try all the other possible charsets */
1772
79.3M
                for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
1773
71.4M
                    cs = static_cast<int8_t>(jpCharsetPref[i]);
1774
71.4M
                    if(CSM(cs) & csm) {
1775
54.6M
                        choices[choiceCount++] = cs;
1776
54.6M
                        csm &= ~CSM(cs);
1777
54.6M
                    }
1778
71.4M
                }
1779
7.93M
            }
1780
1781
            /* cs: charset chosen for this character; g: the G-set (0, 1 or 2) it is output through */
1782
8.62M
            cs = g = 0;
1783
            /*
1784
             * len==0: no mapping found yet
1785
             * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1786
             * len>0: found a roundtrip result, done
1787
             */
1788
8.62M
            len = 0;
1789
            /*
1790
             * We will turn off useFallback after finding a fallback,
1791
             * but we still get fallbacks from PUA code points as usual.
1792
             * Therefore, we will also need to check that we don't overwrite
1793
             * an early fallback with a later one.
1794
             */
1795
8.62M
            useFallback = cnv->useFallback;
1796
1797
67.7M
            for(i = 0; i < choiceCount && len <= 0; ++i) {
1798
59.1M
                uint32_t value;
1799
59.1M
                int32_t len2;
1800
59.1M
                int8_t cs0 = choices[i];
1801
59.1M
                switch(cs0) {
1802
7.93M
                case ASCII:
1803
7.93M
                    if(sourceChar <= 0x7f) {
1804
1.82k
                        targetValue = static_cast<uint32_t>(sourceChar);
1805
1.82k
                        len = 1;
1806
1.82k
                        cs = cs0;
1807
1.82k
                        g = 0;
1808
1.82k
                    }
1809
7.93M
                    break;
1810
7.93M
                case ISO8859_1:
1811
7.93M
                    if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1812
355
                        targetValue = static_cast<uint32_t>(sourceChar) - 0x80;
1813
355
                        len = 1;
1814
355
                        cs = cs0;
1815
355
                        g = 2;
1816
355
                    }
1817
7.93M
                    break;
1818
361k
                case HWKANA_7BIT:
1819
361k
                    if (static_cast<uint32_t>(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1820
1.22k
                        if(converterData->version==3) {
1821
                            /* JIS7: use G1 (SO) */
1822
                            /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1823
832
                            targetValue = static_cast<uint32_t>(sourceChar - (HWKANA_START - 0x21));
1824
832
                            len = 1;
1825
832
                            pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1826
832
                            g = 1;
1827
832
                        } else if(converterData->version==4) {
1828
                            /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1829
                            /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1830
393
                            targetValue = static_cast<uint32_t>(sourceChar - (HWKANA_START - 0xa1));
1831
393
                            len = 1;
1832
1833
393
                            cs = pFromU2022State->cs[0];
1834
393
                            if(IS_JP_DBCS(cs)) {
1835
                                /* switch from a DBCS charset to JISX201 */
1836
197
                                cs = static_cast<int8_t>(JISX201);
1837
197
                            }
1838
                            /* else stay in the current G0 charset */
1839
393
                            g = 0;
1840
393
                        }
1841
                        /* else do not use HWKANA_7BIT with other versions */
1842
1.22k
                    }
1843
361k
                    break;
1844
7.93M
                case JISX201:
1845
                    /* G0 SBCS */
1846
7.93M
                    value = jisx201FromU(sourceChar);
1847
7.93M
                    if(value <= 0x7f) {
1848
1.06k
                        targetValue = value;
1849
1.06k
                        len = 1;
1850
1.06k
                        cs = cs0;
1851
1.06k
                        g = 0;
1852
1.06k
                        useFallback = false;
1853
1.06k
                    }
1854
7.93M
                    break;
1855
7.98M
                case JISX208:
1856
                    /* G0 DBCS from Shift-JIS table */
1857
7.98M
                    len2 = MBCS_FROM_UCHAR32_ISO2022(
1858
7.98M
                                converterData->myConverterArray[cs0],
1859
7.98M
                                sourceChar, &value,
1860
7.98M
                                useFallback, MBCS_OUTPUT_2);
1861
7.98M
                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1862
7.49M
                        value = _2022FromSJIS(value);
1863
7.49M
                        if(value != 0) {
1864
530k
                            targetValue = value;
1865
530k
                            len = len2;
1866
530k
                            cs = cs0;
1867
530k
                            g = 0;
1868
530k
                            useFallback = false;
1869
530k
                        }
1870
7.49M
                    } else if(len == 0 && useFallback &&
1871
0
                              static_cast<uint32_t>(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
                        /* hardcoded half-width to full-width Katakana fallback, recorded as a fallback (len<0) */
1872
0
                        targetValue = hwkana_fb[sourceChar - HWKANA_START];
1873
0
                        len = -2;
1874
0
                        cs = cs0;
1875
0
                        g = 0;
1876
0
                        useFallback = false;
1877
0
                    }
1878
7.98M
                    break;
1879
7.45M
                case ISO8859_7:
1880
                    /* G0 SBCS forced to 7-bit output */
1881
7.45M
                    len2 = MBCS_SINGLE_FROM_UCHAR32(
1882
7.45M
                                converterData->myConverterArray[cs0],
1883
7.45M
                                sourceChar, &value,
1884
7.45M
                                useFallback);
1885
7.45M
                    if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1886
206
                        targetValue = value - 0x80;
1887
206
                        len = len2;
1888
206
                        cs = cs0;
1889
206
                        g = 2;
1890
206
                        useFallback = false;
1891
206
                    }
1892
7.45M
                    break;
1893
19.5M
                default:
1894
                    /* G0 DBCS */
1895
19.5M
                    len2 = MBCS_FROM_UCHAR32_ISO2022(
1896
19.5M
                                converterData->myConverterArray[cs0],
1897
19.5M
                                sourceChar, &value,
1898
19.5M
                                useFallback, MBCS_OUTPUT_2);
1899
19.5M
                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1900
7.69M
                        if(cs0 == KSC5601) {
1901
                            /*
1902
                             * Check for valid bytes for the encoding scheme.
1903
                             * This is necessary because the sub-converter (windows-949)
1904
                             * has a broader encoding scheme than is valid for 2022.
1905
                             */
1906
3.59M
                            value = _2022FromGR94DBCS(value);
1907
3.59M
                            if(value == 0) {
1908
872
                                break;
1909
872
                            }
1910
3.59M
                        }
1911
7.68M
                        targetValue = value;
1912
7.68M
                        len = len2;
1913
7.68M
                        cs = cs0;
1914
7.68M
                        g = 0;
1915
7.68M
                        useFallback = false;
1916
7.68M
                    }
1917
19.5M
                    break;
1918
59.1M
                }
1919
59.1M
            }
1920
1921
8.62M
            if(len != 0) {
1922
8.22M
                if(len < 0) {
1923
197
                    len = -len;  /* fallback */
1924
197
                }
1925
8.22M
                outLen = 0; /* count output bytes */
1926
1927
                /* write SI if necessary (only for JIS7) */
1928
8.22M
                if(pFromU2022State->g == 1 && g == 0) {
1929
376
                    buffer[outLen++] = UCNV_SI;
1930
376
                    pFromU2022State->g = 0;
1931
376
                }
1932
1933
                /* write the designation sequence if necessary */
1934
8.22M
                if(cs != pFromU2022State->cs[g]) {
1935
7.53M
                    int32_t escLen = escSeqCharsLen[cs];
1936
7.53M
                    uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1937
7.53M
                    outLen += escLen;
1938
7.53M
                    pFromU2022State->cs[g] = cs;
1939
1940
                    /* invalidate the choices[] */
1941
7.53M
                    choiceCount = 0;
1942
7.53M
                }
1943
1944
                /* write the shift sequence if necessary */
1945
8.22M
                if(g != pFromU2022State->g) {
1946
1.16k
                    switch(g) {
1947
                    /* case 0 handled before writing escapes */
1948
601
                    case 1:
1949
601
                        buffer[outLen++] = UCNV_SO;
1950
601
                        pFromU2022State->g = 1;
1951
601
                        break;
1952
561
                    default: /* case 2 */
                        /* ESC N (SS2); pFromU2022State->g is not changed here, so it is written again for the next G2 character */
1953
561
                        buffer[outLen++] = 0x1b;
1954
561
                        buffer[outLen++] = 0x4e;
1955
561
                        break;
1956
                    /* no case 3: no SS3 in ISO-2022-JP-x */
1957
1.16k
                    }
1958
1.16k
                }
1959
1960
                /* write the output bytes */
1961
8.22M
                if(len == 1) {
1962
4.68k
                    buffer[outLen++] = static_cast<char>(targetValue);
1963
8.22M
                } else /* len == 2 */ {
1964
8.22M
                    buffer[outLen++] = static_cast<char>(targetValue >> 8);
1965
8.22M
                    buffer[outLen++] = static_cast<char>(targetValue);
1966
8.22M
                }
1967
8.22M
            } else {
1968
                /*
1969
                 * if we cannot find the character after checking all codepages
1970
                 * then this is an error
1971
                 */
1972
402k
                *err = U_INVALID_CHAR_FOUND;
1973
402k
                cnv->fromUChar32=sourceChar;
1974
402k
                break;
1975
402k
            }
1976
1977
8.22M
            if(sourceChar == CR || sourceChar == LF) {
1978
                /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1979
397
                pFromU2022State->cs[2] = 0;
1980
397
                choiceCount = 0;
1981
397
            }
1982
1983
            /* output outLen>0 bytes in buffer[] */
            /*
             * One byte always fits because target < targetLimit was checked above;
             * two bytes are written directly only if both fit; everything else
             * is handed to fromUWriteUInt8() together with targetLimit and err.
             */
1984
8.22M
            if(outLen == 1) {
1985
2.11k
                *target++ = buffer[0];
1986
2.11k
                if(offsets) {
1987
0
                    *offsets++ = static_cast<int32_t>(source - args->source - 1); /* -1: known to be ASCII */
1988
0
                }
1989
8.22M
            } else if(outLen == 2 && (target + 2) <= targetLimit) {
1990
690k
                *target++ = buffer[0];
1991
690k
                *target++ = buffer[1];
1992
690k
                if(offsets) {
1993
0
                    int32_t sourceIndex = static_cast<int32_t>(source - args->source - U16_LENGTH(sourceChar));
1994
0
                    *offsets++ = sourceIndex;
1995
0
                    *offsets++ = sourceIndex;
1996
0
                }
1997
7.53M
            } else {
1998
7.53M
                fromUWriteUInt8(
1999
7.53M
                    cnv,
2000
7.53M
                    buffer, outLen,
2001
7.53M
                    &target, reinterpret_cast<const char*>(targetLimit),
2002
7.53M
                    &offsets, static_cast<int32_t>(source - args->source - U16_LENGTH(sourceChar)),
2003
7.53M
                    err);
2004
7.53M
                if(U_FAILURE(*err)) {
2005
3.44k
                    break;
2006
3.44k
                }
2007
7.53M
            }
2008
8.22M
        } /* end if(myTargetIndex<myTargetLength) */
2009
1.03k
        else{
2010
1.03k
            *err =U_BUFFER_OVERFLOW_ERROR;
2011
1.03k
            break;
2012
1.03k
        }
2013
2014
8.63M
    }/* end while(mySourceIndex<mySourceLength) */
2015
2016
    /*
2017
     * the end of the input stream and detection of truncated input
2018
     * are handled by the framework, but for ISO-2022-JP conversion
2019
     * we need to be in ASCII mode at the very end
2020
     *
2021
     * conditions:
2022
     *   successful
2023
     *   in SO mode or not in ASCII mode
2024
     *   end of input and no truncated input
2025
     */
2026
408k
    if( U_SUCCESS(*err) &&
2027
528
        (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
2028
211
        args->flush && source>=sourceLimit && cnv->fromUChar32==0
2029
408k
    ) {
2030
207
        int32_t sourceIndex;
2031
2032
207
        outLen = 0;
2033
2034
207
        if(pFromU2022State->g != 0) {
2035
20
            buffer[outLen++] = UCNV_SI;
2036
20
            pFromU2022State->g = 0;
2037
20
        }
2038
2039
207
        if(pFromU2022State->cs[0] != ASCII) {
2040
193
            int32_t escLen = escSeqCharsLen[ASCII];
2041
193
            uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
2042
193
            outLen += escLen;
2043
193
            pFromU2022State->cs[0] = static_cast<int8_t>(ASCII);
2044
193
        }
2045
2046
        /* get the source index of the last input character */
2047
        /*
2048
         * TODO this would be simpler and more reliable if we used a pair
2049
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2050
         * so that we could simply use the prevSourceIndex here;
2051
         * this code gives an incorrect result for the rare case of an unmatched
2052
         * trail surrogate that is alone in the last buffer of the text stream
2053
         */
2054
207
        sourceIndex = static_cast<int32_t>(source - args->source);
2055
207
        if(sourceIndex>0) {
2056
197
            --sourceIndex;
2057
197
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2058
0
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2059
197
            ) {
2060
0
                --sourceIndex;
2061
0
            }
2062
197
        } else {
            /* nothing was consumed from this buffer: use -1 as the offset */
2063
10
            sourceIndex=-1;
2064
10
        }
2065
2066
207
        fromUWriteUInt8(
2067
207
            cnv,
2068
207
            buffer, outLen,
2069
207
            &target, reinterpret_cast<const char*>(targetLimit),
2070
207
            &offsets, sourceIndex,
2071
207
            err);
2072
207
    }
2073
2074
    /*save the state and return */
2075
408k
    args->source = source;
2076
408k
    args->target = reinterpret_cast<char*>(target);
2077
408k
}
2077
2078
/*************** to unicode *******************/
2079
2080
static void U_CALLCONV
2081
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2082
1.27M
                                               UErrorCode* err){
2083
1.27M
    char tempBuf[2];
2084
1.27M
    const char* mySource = const_cast<char*>(args->source);
2085
1.27M
    char16_t *myTarget = args->target;
2086
1.27M
    const char *mySourceLimit = args->sourceLimit;
2087
1.27M
    uint32_t targetUniChar = 0x0000;
2088
1.27M
    uint32_t mySourceChar = 0x0000;
2089
1.27M
    uint32_t tmpSourceChar = 0x0000;
2090
1.27M
    UConverterDataISO2022* myData;
2091
1.27M
    ISO2022State *pToU2022State;
2092
1.27M
    StateEnum cs;
2093
2094
1.27M
    myData = static_cast<UConverterDataISO2022*>(args->converter->extraInfo);
2095
1.27M
    pToU2022State = &myData->toU2022State;
2096
2097
1.27M
    if(myData->key != 0) {
2098
        /* continue with a partial escape sequence */
2099
25
        goto escape;
2100
1.27M
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2101
        /* continue with a partial double-byte character */
2102
0
        mySourceChar = args->converter->toUBytes[0];
2103
0
        args->converter->toULength = 0;
2104
0
        cs = static_cast<StateEnum>(pToU2022State->cs[pToU2022State->g]);
2105
0
        targetUniChar = missingCharMarker;
2106
0
        goto getTrailByte;
2107
0
    }
2108
2109
3.58M
    while(mySource < mySourceLimit){
2110
2111
3.58M
        targetUniChar =missingCharMarker;
2112
2113
3.58M
        if(myTarget < args->targetLimit){
2114
2115
3.58M
            mySourceChar = static_cast<unsigned char>(*mySource++);
2116
2117
3.58M
            switch(mySourceChar) {
2118
3.48k
            case UCNV_SI:
2119
3.48k
                if(myData->version==3) {
2120
0
                    pToU2022State->g=0;
2121
0
                    continue;
2122
3.48k
                } else {
2123
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2124
3.48k
                    myData->isEmptySegment = false; /* reset this, we have a different error */
2125
3.48k
                    break;
2126
3.48k
                }
2127
2128
10.5k
            case UCNV_SO:
2129
10.5k
                if(myData->version==3) {
2130
                    /* JIS7: switch to G1 half-width Katakana */
2131
0
                    pToU2022State->cs[1] = static_cast<int8_t>(HWKANA_7BIT);
2132
0
                    pToU2022State->g=1;
2133
0
                    continue;
2134
10.5k
                } else {
2135
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2136
10.5k
                    myData->isEmptySegment = false; /* reset this, we have a different error */
2137
10.5k
                    break;
2138
10.5k
                }
2139
2140
23.6k
            case ESC_2022:
2141
23.6k
                mySource--;
2142
23.7k
escape:
2143
23.7k
                {
2144
23.7k
                    const char * mySourceBefore = mySource;
2145
23.7k
                    int8_t toULengthBefore = args->converter->toULength;
2146
2147
23.7k
                    changeState_2022(args->converter,&(mySource),
2148
23.7k
                        mySourceLimit, ISO_2022_JP,err);
2149
2150
                    /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
2151
23.7k
                    if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2152
697
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2153
697
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
2154
697
                        args->converter->toULength = static_cast<int8_t>(toULengthBefore + (mySource - mySourceBefore));
2155
697
                    }
2156
23.7k
                }
2157
2158
                /* invalid or illegal escape sequence */
2159
23.7k
                if(U_FAILURE(*err)){
2160
18.9k
                    args->target = myTarget;
2161
18.9k
                    args->source = mySource;
2162
18.9k
                    myData->isEmptySegment = false; /* Reset to avoid future spurious errors */
2163
18.9k
                    return;
2164
18.9k
                }
2165
                /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2166
4.72k
                if(myData->key==0) {
2167
4.67k
                    myData->isEmptySegment = true;
2168
4.67k
                }
2169
4.72k
                continue;
2170
2171
            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2172
2173
2.49k
            case CR:
2174
31.8k
            case LF:
2175
                /* automatically reset to single-byte mode */
2176
31.8k
                if (static_cast<StateEnum>(pToU2022State->cs[0]) != ASCII &&
2177
2.12k
                    static_cast<StateEnum>(pToU2022State->cs[0]) != JISX201) {
2178
1.48k
                    pToU2022State->cs[0] = static_cast<int8_t>(ASCII);
2179
1.48k
                }
2180
31.8k
                pToU2022State->cs[2] = 0;
2181
31.8k
                pToU2022State->g = 0;
2182
31.8k
                U_FALLTHROUGH;
2183
3.54M
            default:
2184
                /* convert one or two bytes */
2185
3.54M
                myData->isEmptySegment = false;
2186
3.54M
                cs = static_cast<StateEnum>(pToU2022State->cs[pToU2022State->g]);
2187
3.54M
                if (static_cast<uint8_t>(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version == 4 &&
2188
0
                    !IS_JP_DBCS(cs)
2189
3.54M
                ) {
2190
                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2191
0
                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2192
2193
                    /* return from a single-shift state to the previous one */
2194
0
                    if(pToU2022State->g >= 2) {
2195
0
                        pToU2022State->g=pToU2022State->prevG;
2196
0
                    }
2197
3.54M
                } else switch(cs) {
2198
2.90M
                case ASCII:
2199
2.90M
                    if(mySourceChar <= 0x7f) {
2200
2.01M
                        targetUniChar = mySourceChar;
2201
2.01M
                    }
2202
2.90M
                    break;
2203
0
                case ISO8859_1:
2204
0
                    if(mySourceChar <= 0x7f) {
2205
0
                        targetUniChar = mySourceChar + 0x80;
2206
0
                    }
2207
                    /* return from a single-shift state to the previous one */
2208
0
                    pToU2022State->g=pToU2022State->prevG;
2209
0
                    break;
2210
0
                case ISO8859_7:
2211
0
                    if(mySourceChar <= 0x7f) {
2212
                        /* convert mySourceChar+0x80 to use a normal 8-bit table */
2213
0
                        targetUniChar =
2214
0
                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2215
0
                                myData->myConverterArray[cs],
2216
0
                                mySourceChar + 0x80);
2217
0
                    }
2218
                    /* return from a single-shift state to the previous one */
2219
0
                    pToU2022State->g=pToU2022State->prevG;
2220
0
                    break;
2221
261k
                case JISX201:
2222
261k
                    if(mySourceChar <= 0x7f) {
2223
167k
                        targetUniChar = jisx201ToU(mySourceChar);
2224
167k
                    }
2225
261k
                    break;
2226
23.3k
                case HWKANA_7BIT:
2227
23.3k
                    if (static_cast<uint8_t>(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2228
                        /* 7-bit halfwidth Katakana */
2229
1.41k
                        targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2230
1.41k
                    }
2231
23.3k
                    break;
2232
355k
                default:
2233
                    /* G0 DBCS */
2234
355k
                    if(mySource < mySourceLimit) {
2235
355k
                        int leadIsOk, trailIsOk;
2236
355k
                        uint8_t trailByte;
2237
355k
getTrailByte:
2238
355k
                        trailByte = static_cast<uint8_t>(*mySource);
2239
                        /*
2240
                         * Ticket 5691: consistent illegal sequences:
2241
                         * - We include at least the first byte in the illegal sequence.
2242
                         * - If any of the non-initial bytes could be the start of a character,
2243
                         *   we stop the illegal sequence before the first one of those.
2244
                         *
2245
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2246
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2247
                         * Otherwise we convert or report the pair of bytes.
2248
                         */
2249
355k
                        leadIsOk = static_cast<uint8_t>(mySourceChar - 0x21) <= (0x7e - 0x21);
2250
355k
                        trailIsOk = static_cast<uint8_t>(trailByte - 0x21) <= (0x7e - 0x21);
2251
355k
                        if (leadIsOk && trailIsOk) {
2252
126k
                            ++mySource;
2253
126k
                            tmpSourceChar = (mySourceChar << 8) | trailByte;
2254
126k
                            if(cs == JISX208) {
2255
126k
                                _2022ToSJIS(static_cast<uint8_t>(mySourceChar), trailByte, tempBuf);
2256
126k
                                mySourceChar = tmpSourceChar;
2257
126k
                            } else {
2258
                                /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2259
0
                                mySourceChar = tmpSourceChar;
2260
0
                                if (cs == KSC5601) {
2261
0
                                    tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
2262
0
                                }
2263
0
                                tempBuf[0] = static_cast<char>(tmpSourceChar >> 8);
2264
0
                                tempBuf[1] = static_cast<char>(tmpSourceChar);
2265
0
                            }
2266
126k
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, false);
2267
229k
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2268
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2269
208k
                            ++mySource;
2270
                            /* add another bit so that the code below writes 2 bytes in case of error */
2271
208k
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2272
208k
                        }
2273
355k
                    } else {
2274
18
                        args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
2275
18
                        args->converter->toULength = 1;
2276
18
                        goto endloop;
2277
18
                    }
2278
3.54M
                }  /* End of inner switch */
2279
3.54M
                break;
2280
3.58M
            }  /* End of outer switch */
2281
3.56M
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2282
2.30M
                if(args->offsets){
2283
0
                    args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2284
0
                }
2285
2.30M
                *(myTarget++) = static_cast<char16_t>(targetUniChar);
2286
2.30M
            }
2287
1.25M
            else if(targetUniChar > missingCharMarker){
2288
                /* disassemble the surrogate pair and write to output*/
2289
0
                targetUniChar-=0x0010000;
2290
0
                *myTarget = static_cast<char16_t>(0xd800 + static_cast<char16_t>(targetUniChar >> 10));
2291
0
                if(args->offsets){
2292
0
                    args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2293
0
                }
2294
0
                ++myTarget;
2295
0
                if(myTarget< args->targetLimit){
2296
0
                    *myTarget = static_cast<char16_t>(0xdc00 + static_cast<char16_t>(targetUniChar & 0x3ff));
2297
0
                    if(args->offsets){
2298
0
                        args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2299
0
                    }
2300
0
                    ++myTarget;
2301
0
                }else{
2302
0
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2303
0
                                    static_cast<char16_t>(0xdc00 + static_cast<char16_t>(targetUniChar & 0x3ff));
2304
0
                }
2305
2306
0
            }
2307
1.25M
            else{
2308
                /* Call the callback function*/
2309
1.25M
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2310
1.25M
                break;
2311
1.25M
            }
2312
3.56M
        }
2313
0
        else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
2314
0
            *err =U_BUFFER_OVERFLOW_ERROR;
2315
0
            break;
2316
0
        }
2317
3.58M
    }
2318
1.26M
endloop:
2319
1.26M
    args->target = myTarget;
2320
1.26M
    args->source = mySource;
2321
1.26M
}
2322
2323
2324
#if !UCONFIG_ONLY_HTML_CONVERSION
2325
/***************************************************************
2326
*   Rules for ISO-2022-KR encoding
2327
*   i) The KSC5601 designator sequence should appear only once in a file,
2328
*      at the beginning of a line before any KSC5601 characters. This usually
2329
*      means that it appears by itself on the first line of the file
2330
*  ii) There are only 2 shifting sequences SO to shift into double byte mode
2331
*      and SI to shift into single byte mode
2332
*/
2333
static void U_CALLCONV
2334
1.68M
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2335
2336
1.68M
    UConverter* saveConv = args->converter;
2337
1.68M
    UConverterDataISO2022* myConverterData = static_cast<UConverterDataISO2022*>(saveConv->extraInfo);
2338
1.68M
    args->converter=myConverterData->currentConverter;
2339
2340
1.68M
    myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2341
1.68M
    ucnv_MBCSFromUnicodeWithOffsets(args,err);
2342
1.68M
    saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2343
2344
1.68M
    if(*err == U_BUFFER_OVERFLOW_ERROR) {
2345
175
        if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2346
85
            uprv_memcpy(
2347
85
                saveConv->charErrorBuffer,
2348
85
                myConverterData->currentConverter->charErrorBuffer,
2349
85
                myConverterData->currentConverter->charErrorBufferLength);
2350
85
        }
2351
175
        saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2352
175
        myConverterData->currentConverter->charErrorBufferLength = 0;
2353
175
    }
2354
1.68M
    args->converter=saveConv;
2355
1.68M
}
2356
2357
static void U_CALLCONV
2358
8.11M
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2359
2360
8.11M
    const char16_t *source = args->source;
2361
8.11M
    const char16_t *sourceLimit = args->sourceLimit;
2362
8.11M
    unsigned char *target = reinterpret_cast<unsigned char*>(args->target);
2363
8.11M
    unsigned char *targetLimit = reinterpret_cast<unsigned char*>(const_cast<char*>(args->targetLimit));
2364
8.11M
    int32_t* offsets = args->offsets;
2365
8.11M
    uint32_t targetByteUnit = 0x0000;
2366
8.11M
    UChar32 sourceChar = 0x0000;
2367
8.11M
    UBool isTargetByteDBCS;
2368
8.11M
    UBool oldIsTargetByteDBCS;
2369
8.11M
    UConverterDataISO2022 *converterData;
2370
8.11M
    UConverterSharedData* sharedData;
2371
8.11M
    UBool useFallback;
2372
8.11M
    int32_t length =0;
2373
2374
8.11M
    converterData = static_cast<UConverterDataISO2022*>(args->converter->extraInfo);
2375
    /* if the version is 1 then the user is requesting
2376
     * conversion with ibm-25546 pass the arguments to
2377
     * MBCS converter and return
2378
     */
2379
8.11M
    if(converterData->version==1){
2380
1.68M
        UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2381
1.68M
        return;
2382
1.68M
    }
2383
2384
    /* initialize data */
2385
6.43M
    sharedData = converterData->currentConverter->sharedData;
2386
6.43M
    useFallback = args->converter->useFallback;
2387
6.43M
    isTargetByteDBCS = static_cast<UBool>(args->converter->fromUnicodeStatus);
2388
6.43M
    oldIsTargetByteDBCS = isTargetByteDBCS;
2389
2390
6.43M
    isTargetByteDBCS = static_cast<UBool>(args->converter->fromUnicodeStatus);
2391
6.43M
    if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2392
0
        goto getTrail;
2393
0
    }
2394
12.5M
    while(source < sourceLimit){
2395
2396
12.5M
        targetByteUnit = missingCharMarker;
2397
2398
12.5M
        if(target < (unsigned char*) args->targetLimit){
2399
12.5M
            sourceChar = *source++;
2400
2401
            /* do not convert SO/SI/ESC */
2402
12.5M
            if(IS_2022_CONTROL(sourceChar)) {
2403
                /* callback(illegal) */
2404
403
                *err=U_ILLEGAL_CHAR_FOUND;
2405
403
                args->converter->fromUChar32=sourceChar;
2406
403
                break;
2407
403
            }
2408
2409
12.5M
            length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2410
12.5M
            if(length < 0) {
2411
0
                length = -length;  /* fallback */
2412
0
            }
2413
            /* only DBCS or SBCS characters are expected*/
2414
            /* DB characters with high bit set to 1 are expected */
2415
12.5M
            if( length > 2 || length==0 ||
2416
6.10M
                (length == 1 && targetByteUnit > 0x7f) ||
2417
6.10M
                (length == 2 &&
2418
6.06M
                    (static_cast<uint16_t>(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2419
6.04M
                    static_cast<uint8_t>(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2420
12.5M
            ) {
2421
6.43M
                targetByteUnit=missingCharMarker;
2422
6.43M
            }
2423
12.5M
            if (targetByteUnit != missingCharMarker){
2424
2425
6.08M
                oldIsTargetByteDBCS = isTargetByteDBCS;
2426
6.08M
                isTargetByteDBCS = static_cast<UBool>(targetByteUnit > 0x00FF);
2427
                  /* append the shift sequence */
2428
6.08M
                if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2429
2430
6.06M
                    if (isTargetByteDBCS)
2431
6.03M
                        *target++ = UCNV_SO;
2432
30.4k
                    else
2433
30.4k
                        *target++ = UCNV_SI;
2434
6.06M
                    if(offsets)
2435
0
                        *(offsets++) = static_cast<int32_t>(source - args->source - 1);
2436
6.06M
                }
2437
                /* write the targetUniChar  to target */
2438
6.08M
                if(targetByteUnit <= 0x00FF){
2439
38.1k
                    if( target < targetLimit){
2440
38.1k
                        *(target++) = static_cast<unsigned char>(targetByteUnit);
2441
38.1k
                        if(offsets){
2442
0
                            *(offsets++) = static_cast<int32_t>(source - args->source - 1);
2443
0
                        }
2444
2445
38.1k
                    }else{
2446
35
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = static_cast<unsigned char>(targetByteUnit);
2447
35
                        *err = U_BUFFER_OVERFLOW_ERROR;
2448
35
                    }
2449
6.04M
                }else{
2450
6.04M
                    if(target < targetLimit){
2451
6.04M
                        *(target++) = static_cast<unsigned char>((targetByteUnit >> 8) - 0x80);
2452
6.04M
                        if(offsets){
2453
0
                            *(offsets++) = static_cast<int32_t>(source - args->source - 1);
2454
0
                        }
2455
6.04M
                        if(target < targetLimit){
2456
6.04M
                            *(target++) = static_cast<unsigned char>(targetByteUnit - 0x80);
2457
6.04M
                            if(offsets){
2458
0
                                *(offsets++) = static_cast<int32_t>(source - args->source - 1);
2459
0
                            }
2460
6.04M
                        }else{
2461
106
                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = static_cast<unsigned char>(targetByteUnit - 0x80);
2462
106
                            *err = U_BUFFER_OVERFLOW_ERROR;
2463
106
                        }
2464
6.04M
                    }else{
2465
101
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = static_cast<unsigned char>((targetByteUnit >> 8) - 0x80);
2466
101
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = static_cast<unsigned char>(targetByteUnit - 0x80);
2467
101
                        *err = U_BUFFER_OVERFLOW_ERROR;
2468
101
                    }
2469
6.04M
                }
2470
2471
6.08M
            }
2472
6.43M
            else{
2473
                /* oops.. the code point is unassingned
2474
                 * set the error and reason
2475
                 */
2476
2477
                /*check if the char is a First surrogate*/
2478
6.43M
                if(U16_IS_SURROGATE(sourceChar)) {
2479
13.8k
                    if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2480
7.18k
getTrail:
2481
                        /*look ahead to find the trail surrogate*/
2482
7.18k
                        if(source <  sourceLimit) {
2483
                            /* test the following code unit */
2484
7.15k
                            char16_t trail = *source;
2485
7.15k
                            if(U16_IS_TRAIL(trail)) {
2486
401
                                source++;
2487
401
                                sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2488
401
                                *err = U_INVALID_CHAR_FOUND;
2489
                                /* convert this surrogate code point */
2490
                                /* exit this condition tree */
2491
6.75k
                            } else {
2492
                                /* this is an unmatched lead code unit (1st surrogate) */
2493
                                /* callback(illegal) */
2494
6.75k
                                *err=U_ILLEGAL_CHAR_FOUND;
2495
6.75k
                            }
2496
7.15k
                        } else {
2497
                            /* no more input */
2498
24
                            *err = U_ZERO_ERROR;
2499
24
                        }
2500
7.18k
                    } else {
2501
                        /* this is an unmatched trail code unit (2nd surrogate) */
2502
                        /* callback(illegal) */
2503
6.66k
                        *err=U_ILLEGAL_CHAR_FOUND;
2504
6.66k
                    }
2505
6.41M
                } else {
2506
                    /* callback(unassigned) for a BMP code point */
2507
6.41M
                    *err = U_INVALID_CHAR_FOUND;
2508
6.41M
                }
2509
2510
6.43M
                args->converter->fromUChar32=sourceChar;
2511
6.43M
                break;
2512
6.43M
            }
2513
12.5M
        } /* end if(myTargetIndex<myTargetLength) */
2514
330
        else{
2515
330
            *err =U_BUFFER_OVERFLOW_ERROR;
2516
330
            break;
2517
330
        }
2518
2519
12.5M
    }/* end while(mySourceIndex<mySourceLength) */
2520
2521
    /*
2522
     * the end of the input stream and detection of truncated input
2523
     * are handled by the framework, but for ISO-2022-KR conversion
2524
     * we need to be in ASCII mode at the very end
2525
     *
2526
     * conditions:
2527
     *   successful
2528
     *   not in ASCII mode
2529
     *   end of input and no truncated input
2530
     */
2531
6.43M
    if( U_SUCCESS(*err) &&
2532
229
        isTargetByteDBCS &&
2533
52
        args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2534
6.43M
    ) {
2535
41
        int32_t sourceIndex;
2536
2537
        /* we are switching to ASCII */
2538
41
        isTargetByteDBCS=false;
2539
2540
        /* get the source index of the last input character */
2541
        /*
2542
         * TODO this would be simpler and more reliable if we used a pair
2543
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2544
         * so that we could simply use the prevSourceIndex here;
2545
         * this code gives an incorrect result for the rare case of an unmatched
2546
         * trail surrogate that is alone in the last buffer of the text stream
2547
         */
2548
41
        sourceIndex = static_cast<int32_t>(source - args->source);
2549
41
        if(sourceIndex>0) {
2550
39
            --sourceIndex;
2551
39
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2552
0
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2553
39
            ) {
2554
0
                --sourceIndex;
2555
0
            }
2556
39
        } else {
2557
2
            sourceIndex=-1;
2558
2
        }
2559
2560
41
        fromUWriteUInt8(
2561
41
            args->converter,
2562
41
            SHIFT_IN_STR, 1,
2563
41
            &target, reinterpret_cast<const char*>(targetLimit),
2564
41
            &offsets, sourceIndex,
2565
41
            err);
2566
41
    }
2567
2568
    /*save the state and return */
2569
6.43M
    args->source = source;
2570
6.43M
    args->target = reinterpret_cast<char*>(target);
2571
6.43M
    args->converter->fromUnicodeStatus = static_cast<uint32_t>(isTargetByteDBCS);
2572
6.43M
}
2573
2574
/************************ To Unicode ***************************************/
2575
2576
static void U_CALLCONV
2577
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2578
0
                                                            UErrorCode* err){
2579
0
    char const* sourceStart;
2580
0
    UConverterDataISO2022* myData = static_cast<UConverterDataISO2022*>(args->converter->extraInfo);
2581
2582
0
    UConverterToUnicodeArgs subArgs;
2583
0
    int32_t minArgsSize;
2584
2585
    /* set up the subconverter arguments */
2586
0
    if(args->size<sizeof(UConverterToUnicodeArgs)) {
2587
0
        minArgsSize = args->size;
2588
0
    } else {
2589
0
        minArgsSize = static_cast<int32_t>(sizeof(UConverterToUnicodeArgs));
2590
0
    }
2591
2592
0
    uprv_memcpy(&subArgs, args, minArgsSize);
2593
0
    subArgs.size = static_cast<uint16_t>(minArgsSize);
2594
0
    subArgs.converter = myData->currentConverter;
2595
2596
    /* remember the original start of the input for offsets */
2597
0
    sourceStart = args->source;
2598
2599
0
    if(myData->key != 0) {
2600
        /* continue with a partial escape sequence */
2601
0
        goto escape;
2602
0
    }
2603
2604
0
    while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2605
        /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2606
0
        subArgs.source = args->source;
2607
0
        subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2608
0
        if(subArgs.source != subArgs.sourceLimit) {
2609
            /*
2610
             * get the current partial byte sequence
2611
             *
2612
             * it needs to be moved between the public and the subconverter
2613
             * so that the conversion framework, which only sees the public
2614
             * converter, can handle truncated and illegal input etc.
2615
             */
2616
0
            if(args->converter->toULength > 0) {
2617
0
                uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2618
0
            }
2619
0
            subArgs.converter->toULength = args->converter->toULength;
2620
2621
            /*
2622
             * Convert up to the end of the input, or to before the next escape character.
2623
             * Does not handle conversion extensions because the preToU[] state etc.
2624
             * is not copied.
2625
             */
2626
0
            ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2627
2628
0
            if(args->offsets != nullptr && sourceStart != args->source) {
2629
                /* update offsets to base them on the actual start of the input */
2630
0
                int32_t *offsets = args->offsets;
2631
0
                char16_t *target = args->target;
2632
0
                int32_t delta = static_cast<int32_t>(args->source - sourceStart);
2633
0
                while(target < subArgs.target) {
2634
0
                    if(*offsets >= 0) {
2635
0
                        *offsets += delta;
2636
0
                    }
2637
0
                    ++offsets;
2638
0
                    ++target;
2639
0
                }
2640
0
            }
2641
0
            args->source = subArgs.source;
2642
0
            args->target = subArgs.target;
2643
0
            args->offsets = subArgs.offsets;
2644
2645
            /* copy input/error/overflow buffers */
2646
0
            if(subArgs.converter->toULength > 0) {
2647
0
                uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2648
0
            }
2649
0
            args->converter->toULength = subArgs.converter->toULength;
2650
2651
0
            if(*err == U_BUFFER_OVERFLOW_ERROR) {
2652
0
                if(subArgs.converter->UCharErrorBufferLength > 0) {
2653
0
                    uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2654
0
                                subArgs.converter->UCharErrorBufferLength);
2655
0
                }
2656
0
                args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2657
0
                subArgs.converter->UCharErrorBufferLength = 0;
2658
0
            }
2659
0
        }
2660
2661
0
        if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2662
0
            return;
2663
0
        }
2664
2665
0
escape:
2666
0
        changeState_2022(args->converter,
2667
0
               &(args->source),
2668
0
               args->sourceLimit,
2669
0
               ISO_2022_KR,
2670
0
               err);
2671
0
    }
2672
0
}
2673
2674
static void U_CALLCONV
2675
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2676
109k
                                                            UErrorCode* err){
2677
109k
    char tempBuf[2];
2678
109k
    const char* mySource = const_cast<char*>(args->source);
2679
109k
    char16_t *myTarget = args->target;
2680
109k
    const char *mySourceLimit = args->sourceLimit;
2681
109k
    UChar32 targetUniChar = 0x0000;
2682
109k
    char16_t mySourceChar = 0x0000;
2683
109k
    UConverterDataISO2022* myData;
2684
109k
    UConverterSharedData* sharedData ;
2685
109k
    UBool useFallback;
2686
2687
109k
    myData = static_cast<UConverterDataISO2022*>(args->converter->extraInfo);
2688
109k
    if(myData->version==1){
2689
0
        UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2690
0
        return;
2691
0
    }
2692
2693
    /* initialize state */
2694
109k
    sharedData = myData->currentConverter->sharedData;
2695
109k
    useFallback = args->converter->useFallback;
2696
2697
109k
    if(myData->key != 0) {
2698
        /* continue with a partial escape sequence */
2699
32
        goto escape;
2700
109k
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2701
        /* continue with a partial double-byte character */
2702
0
        mySourceChar = args->converter->toUBytes[0];
2703
0
        args->converter->toULength = 0;
2704
0
        goto getTrailByte;
2705
0
    }
2706
2707
408k
    while(mySource< mySourceLimit){
2708
2709
408k
        if(myTarget < args->targetLimit){
2710
2711
408k
            mySourceChar = static_cast<unsigned char>(*mySource++);
2712
2713
408k
            if(mySourceChar==UCNV_SI){
2714
1.23k
                myData->toU2022State.g = 0;
2715
1.23k
                if (myData->isEmptySegment) {
2716
255
                    myData->isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
2717
255
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2718
255
                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
2719
255
                    args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
2720
255
                    args->converter->toULength = 1;
2721
255
                    args->target = myTarget;
2722
255
                    args->source = mySource;
2723
255
                    return;
2724
255
                }
2725
                /*consume the source */
2726
983
                continue;
2727
407k
            }else if(mySourceChar==UCNV_SO){
2728
2.07k
                myData->toU2022State.g = 1;
2729
2.07k
                myData->isEmptySegment = true;  /* Begin a new segment, empty so far */
2730
                /*consume the source */
2731
2.07k
                continue;
2732
404k
            }else if(mySourceChar==ESC_2022){
2733
3.55k
                mySource--;
2734
3.58k
escape:
2735
3.58k
                myData->isEmptySegment = false; /* Any invalid ESC sequences will be detected separately, so just reset this */
2736
3.58k
                changeState_2022(args->converter,&(mySource),
2737
3.58k
                                mySourceLimit, ISO_2022_KR, err);
2738
3.58k
                if(U_FAILURE(*err)){
2739
3.12k
                    args->target = myTarget;
2740
3.12k
                    args->source = mySource;
2741
3.12k
                    return;
2742
3.12k
                }
2743
457
                continue;
2744
3.58k
            }
2745
2746
401k
            myData->isEmptySegment = false; /* Any invalid char errors will be detected separately, so just reset this */
2747
401k
            if(myData->toU2022State.g == 1) {
2748
194k
                if(mySource < mySourceLimit) {
2749
194k
                    int leadIsOk, trailIsOk;
2750
194k
                    uint8_t trailByte;
2751
194k
getTrailByte:
2752
194k
                    targetUniChar = missingCharMarker;
2753
194k
                    trailByte = static_cast<uint8_t>(*mySource);
2754
                    /*
2755
                     * Ticket 5691: consistent illegal sequences:
2756
                     * - We include at least the first byte in the illegal sequence.
2757
                     * - If any of the non-initial bytes could be the start of a character,
2758
                     *   we stop the illegal sequence before the first one of those.
2759
                     *
2760
                     * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2761
                     * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2762
                     * Otherwise we convert or report the pair of bytes.
2763
                     */
2764
194k
                    leadIsOk = static_cast<uint8_t>(mySourceChar - 0x21) <= (0x7e - 0x21);
2765
194k
                    trailIsOk = static_cast<uint8_t>(trailByte - 0x21) <= (0x7e - 0x21);
2766
194k
                    if (leadIsOk && trailIsOk) {
2767
138k
                        ++mySource;
2768
138k
                        tempBuf[0] = static_cast<char>(mySourceChar + 0x80);
2769
138k
                        tempBuf[1] = static_cast<char>(trailByte + 0x80);
2770
138k
                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2771
138k
                        mySourceChar = (mySourceChar << 8) | trailByte;
2772
138k
                    } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2773
                        /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2774
46.4k
                        ++mySource;
2775
                        /* add another bit so that the code below writes 2 bytes in case of error */
2776
46.4k
                        mySourceChar = static_cast<char16_t>(0x10000 | (mySourceChar << 8) | trailByte);
2777
46.4k
                    }
2778
194k
                } else {
2779
14
                    args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
2780
14
                    args->converter->toULength = 1;
2781
14
                    break;
2782
14
                }
2783
194k
            }
2784
207k
            else if(mySourceChar <= 0x7f) {
2785
158k
                targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2786
158k
            } else {
2787
48.9k
                targetUniChar = 0xffff;
2788
48.9k
            }
2789
401k
            if(targetUniChar < 0xfffe){
2790
295k
                if(args->offsets) {
2791
0
                    args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2792
0
                }
2793
295k
                *(myTarget++) = static_cast<char16_t>(targetUniChar);
2794
295k
            }
2795
106k
            else {
2796
                /* Call the callback function*/
2797
106k
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2798
106k
                break;
2799
106k
            }
2800
401k
        }
2801
0
        else{
2802
0
            *err =U_BUFFER_OVERFLOW_ERROR;
2803
0
            break;
2804
0
        }
2805
408k
    }
2806
106k
    args->target = myTarget;
2807
106k
    args->source = mySource;
2808
106k
}
2809
2810
/*************************** END ISO2022-KR *********************************/
2811
2812
/*************************** ISO-2022-CN *********************************
2813
*
2814
* Rules for ISO-2022-CN Encoding:
2815
* i)   The designator sequence must appear once on a line before any instance
2816
*      of character set it designates.
2817
* ii)  If two lines contain characters from the same character set, both lines
2818
*      must include the designator sequence.
2819
* iii) Once the designator sequence is known, a shifting sequence has to be found
2820
*      to invoke the shift.
2821
* iv)  All lines start in ASCII and end in ASCII.
2822
* v)   Four shifting sequences are employed for this purpose:
2823
*
2824
*      Sequence    ASCII Eq    Charsets
2825
*      ----------  -------    ---------
2826
*      SI           <SI>        US-ASCII
2827
*      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2828
*      SS2          <ESC>N      CNS-11643-1992 Plane 2
2829
*      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
2830
*
2831
* vi)
2832
*      SOdesignator  : ESC "$" ")" finalchar_for_SO
2833
*      SS2designator : ESC "$" "*" finalchar_for_SS2
2834
*      SS3designator : ESC "$" "+" finalchar_for_SS3
2835
*
2836
*      ESC $ ) A       Indicates the bytes following SO are Chinese
2837
*       characters as defined in GB 2312-80, until
2838
*       another SOdesignation appears
2839
*
2840
*
2841
*      ESC $ ) E       Indicates the bytes following SO are as defined
2842
*       in ISO-IR-165 (for details, see section 2.1),
2843
*       until another SOdesignation appears
2844
*
2845
*      ESC $ ) G       Indicates the bytes following SO are as defined
2846
*       in CNS 11643-plane-1, until another
2847
*       SOdesignation appears
2848
*
2849
*      ESC $ * H       Indicates the two bytes immediately following
2850
*       SS2 is a Chinese character as defined in CNS
2851
*       11643-plane-2, until another SS2designation
2852
*       appears
2853
*       (Meaning <ESC>N must precede every 2 byte
2854
*        sequence.)
2855
*
2856
*      ESC $ + I       Indicates the immediate two bytes following SS3
2857
*       is a Chinese character as defined in CNS
2858
*       11643-plane-3, until another SS3designation
2859
*       appears
2860
*       (Meaning <ESC>O must precede every 2 byte
2861
*        sequence.)
2862
*
2863
*      ESC $ + J       Indicates the immediate two bytes following SS3
2864
*       is a Chinese character as defined in CNS
2865
*       11643-plane-4, until another SS3designation
2866
*       appears
2867
*       (In English: <ESC>O must precede every 2 byte
2868
*        sequence.)
2869
*
2870
*      ESC $ + K       Indicates the immediate two bytes following SS3
2871
*       is a Chinese character as defined in CNS
2872
*       11643-plane-5, until another SS3designation
2873
*       appears
2874
*
2875
*      ESC $ + L       Indicates the immediate two bytes following SS3
2876
*       is a Chinese character as defined in CNS
2877
*       11643-plane-6, until another SS3designation
2878
*       appears
2879
*
2880
*      ESC $ + M       Indicates the immediate two bytes following SS3
2881
*       is a Chinese character as defined in CNS
2882
*       11643-plane-7, until another SS3designation
2883
*       appears
2884
*
2885
*       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2886
*       has its own designation information before any Chinese characters
2887
*       appear
2888
*
2889
*/
2890
2891
/* The following are defined this way to make the strings truly readonly */
2892
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";
2893
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";
2894
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";
2895
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";
2896
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";
2897
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";
2898
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";
2899
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";
2900
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";
2901
2902
/********************** ISO2022-CN Data **************************/
2903
static const char* const escSeqCharsCN[10] ={
2904
        SHIFT_IN_STR,                   /* 0 ASCII */
2905
        GB_2312_80_STR,                 /* 1 GB2312_1 */
2906
        ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
2907
        CNS_11643_1992_Plane_1_STR,
2908
        CNS_11643_1992_Plane_2_STR,
2909
        CNS_11643_1992_Plane_3_STR,
2910
        CNS_11643_1992_Plane_4_STR,
2911
        CNS_11643_1992_Plane_5_STR,
2912
        CNS_11643_1992_Plane_6_STR,
2913
        CNS_11643_1992_Plane_7_STR
2914
};
2915
2916
static void U_CALLCONV
2917
371k
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2918
371k
    UConverter *cnv = args->converter;
2919
371k
    UConverterDataISO2022 *converterData;
2920
371k
    ISO2022State *pFromU2022State;
2921
371k
    uint8_t* target = reinterpret_cast<uint8_t*>(args->target);
2922
371k
    const uint8_t* targetLimit = reinterpret_cast<const uint8_t*>(args->targetLimit);
2923
371k
    const char16_t* source = args->source;
2924
371k
    const char16_t* sourceLimit = args->sourceLimit;
2925
371k
    int32_t* offsets = args->offsets;
2926
371k
    UChar32 sourceChar;
2927
371k
    char buffer[8];
2928
371k
    int32_t len;
2929
371k
    int8_t choices[3];
2930
371k
    int32_t choiceCount;
2931
371k
    uint32_t targetValue = 0;
2932
371k
    UBool useFallback;
2933
2934
    /* set up the state */
2935
371k
    converterData = static_cast<UConverterDataISO2022*>(cnv->extraInfo);
2936
371k
    pFromU2022State   = &converterData->fromU2022State;
2937
2938
371k
    choiceCount = 0;
2939
2940
    /* check if the last codepoint of previous buffer was a lead surrogate*/
2941
371k
    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2942
0
        goto getTrail;
2943
0
    }
2944
2945
12.8M
    while( source < sourceLimit){
2946
12.8M
        if(target < targetLimit){
2947
2948
12.8M
            sourceChar  = *(source++);
2949
            /*check if the char is a First surrogate*/
2950
12.8M
             if(U16_IS_SURROGATE(sourceChar)) {
2951
5.21k
                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2952
4.69k
getTrail:
2953
                    /*look ahead to find the trail surrogate*/
2954
4.69k
                    if(source < sourceLimit) {
2955
                        /* test the following code unit */
2956
4.67k
                        char16_t trail = *source;
2957
4.67k
                        if(U16_IS_TRAIL(trail)) {
2958
1.06k
                            source++;
2959
1.06k
                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2960
1.06k
                            cnv->fromUChar32=0x00;
2961
                            /* convert this supplementary code point */
2962
                            /* exit this condition tree */
2963
3.61k
                        } else {
2964
                            /* this is an unmatched lead code unit (1st surrogate) */
2965
                            /* callback(illegal) */
2966
3.61k
                            *err=U_ILLEGAL_CHAR_FOUND;
2967
3.61k
                            cnv->fromUChar32=sourceChar;
2968
3.61k
                            break;
2969
3.61k
                        }
2970
4.67k
                    } else {
2971
                        /* no more input */
2972
19
                        cnv->fromUChar32=sourceChar;
2973
19
                        break;
2974
19
                    }
2975
4.69k
                } else {
2976
                    /* this is an unmatched trail code unit (2nd surrogate) */
2977
                    /* callback(illegal) */
2978
527
                    *err=U_ILLEGAL_CHAR_FOUND;
2979
527
                    cnv->fromUChar32=sourceChar;
2980
527
                    break;
2981
527
                }
2982
5.21k
            }
2983
2984
            /* do the conversion */
2985
12.8M
            if(sourceChar <= 0x007f ){
2986
                /* do not convert SO/SI/ESC */
2987
59.8k
                if(IS_2022_CONTROL(sourceChar)) {
2988
                    /* callback(illegal) */
2989
2.32k
                    *err=U_ILLEGAL_CHAR_FOUND;
2990
2.32k
                    cnv->fromUChar32=sourceChar;
2991
2.32k
                    break;
2992
2.32k
                }
2993
2994
                /* US-ASCII */
2995
57.4k
                if(pFromU2022State->g == 0) {
2996
50.2k
                    buffer[0] = static_cast<char>(sourceChar);
2997
50.2k
                    len = 1;
2998
50.2k
                } else {
2999
7.28k
                    buffer[0] = UCNV_SI;
3000
7.28k
                    buffer[1] = static_cast<char>(sourceChar);
3001
7.28k
                    len = 2;
3002
7.28k
                    pFromU2022State->g = 0;
3003
7.28k
                    choiceCount = 0;
3004
7.28k
                }
3005
57.4k
                if(sourceChar == CR || sourceChar == LF) {
3006
                    /* reset the state at the end of a line */
3007
792
                    uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
3008
792
                    choiceCount = 0;
3009
792
                }
3010
57.4k
            }
3011
12.8M
            else{
3012
                /* convert U+0080..U+10ffff */
3013
12.8M
                int32_t i;
3014
12.8M
                int8_t cs, g;
3015
3016
12.8M
                if(choiceCount == 0) {
3017
                    /* try the current SO/G1 converter first */
3018
12.5M
                    choices[0] = pFromU2022State->cs[1];
3019
3020
                    /* default to GB2312_1 if none is designated yet */
3021
12.5M
                    if(choices[0] == 0) {
3022
231k
                        choices[0] = GB2312_1;
3023
231k
                    }
3024
3025
12.5M
                    if(converterData->version == 0) {
3026
                        /* ISO-2022-CN */
3027
3028
                        /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
3029
943
                        if(choices[0] == GB2312_1) {
3030
743
                            choices[1] = static_cast<int8_t>(CNS_11643_1);
3031
743
                        } else {
3032
200
                            choices[1] = static_cast<int8_t>(GB2312_1);
3033
200
                        }
3034
3035
943
                        choiceCount = 2;
3036
12.5M
                    } else if (converterData->version == 1) {
3037
                        /* ISO-2022-CN-EXT */
3038
3039
                        /* try one of the other converters */
3040
12.3M
                        switch(choices[0]) {
3041
6.16M
                        case GB2312_1:
3042
6.16M
                            choices[1] = static_cast<int8_t>(CNS_11643_1);
3043
6.16M
                            choices[2] = static_cast<int8_t>(ISO_IR_165);
3044
6.16M
                            break;
3045
9.58k
                        case ISO_IR_165:
3046
9.58k
                            choices[1] = static_cast<int8_t>(GB2312_1);
3047
9.58k
                            choices[2] = static_cast<int8_t>(CNS_11643_1);
3048
9.58k
                            break;
3049
6.16M
                        default: /* CNS_11643_x */
3050
6.16M
                            choices[1] = static_cast<int8_t>(GB2312_1);
3051
6.16M
                            choices[2] = static_cast<int8_t>(ISO_IR_165);
3052
6.16M
                            break;
3053
12.3M
                        }
3054
3055
12.3M
                        choiceCount = 3;
3056
12.3M
                    } else {
3057
224k
                        choices[0] = static_cast<int8_t>(CNS_11643_1);
3058
224k
                        choices[1] = static_cast<int8_t>(GB2312_1);
3059
224k
                    }
3060
12.5M
                }
3061
3062
12.8M
                cs = g = 0;
3063
                /*
3064
                 * len==0: no mapping found yet
3065
                 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3066
                 * len>0: found a roundtrip result, done
3067
                 */
3068
12.8M
                len = 0;
3069
                /*
3070
                 * We will turn off useFallback after finding a fallback,
3071
                 * but we still get fallbacks from PUA code points as usual.
3072
                 * Therefore, we will also need to check that we don't overwrite
3073
                 * an early fallback with a later one.
3074
                 */
3075
12.8M
                useFallback = cnv->useFallback;
3076
3077
37.9M
                for(i = 0; i < choiceCount && len <= 0; ++i) {
3078
25.1M
                    int8_t cs0 = choices[i];
3079
25.1M
                    if(cs0 > 0) {
3080
25.1M
                        uint32_t value;
3081
25.1M
                        int32_t len2;
3082
25.1M
                        if(cs0 >= CNS_11643_0) {
3083
12.4M
                            len2 = MBCS_FROM_UCHAR32_ISO2022(
3084
12.4M
                                        converterData->myConverterArray[CNS_11643],
3085
12.4M
                                        sourceChar,
3086
12.4M
                                        &value,
3087
12.4M
                                        useFallback,
3088
12.4M
                                        MBCS_OUTPUT_3);
3089
12.4M
                            if(len2 == 3 || (len2 == -3 && len == 0)) {
3090
6.23M
                                targetValue = value;
3091
6.23M
                                cs = static_cast<int8_t>(CNS_11643_0 + (value >> 16) - 0x80);
3092
6.23M
                                if(len2 >= 0) {
3093
6.23M
                                    len = 2;
3094
6.23M
                                } else {
3095
0
                                    len = -2;
3096
0
                                    useFallback = false;
3097
0
                                }
3098
6.23M
                                if(cs == CNS_11643_1) {
3099
6.10M
                                    g = 1;
3100
6.10M
                                } else if(cs == CNS_11643_2) {
3101
134k
                                    g = 2;
3102
134k
                                } else /* plane 3..7 */ if(converterData->version == 1) {
3103
0
                                    g = 3;
3104
0
                                } else {
3105
                                    /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3106
0
                                    len = 0;
3107
0
                                }
3108
6.23M
                            }
3109
12.6M
                        } else {
3110
                            /* GB2312_1 or ISO-IR-165 */
3111
12.6M
                            U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3112
12.6M
                            len2 = MBCS_FROM_UCHAR32_ISO2022(
3113
12.6M
                                        converterData->myConverterArray[cs0],
3114
12.6M
                                        sourceChar,
3115
12.6M
                                        &value,
3116
12.6M
                                        useFallback,
3117
12.6M
                                        MBCS_OUTPUT_2);
3118
12.6M
                            if(len2 == 2 || (len2 == -2 && len == 0)) {
3119
6.21M
                                targetValue = value;
3120
6.21M
                                len = len2;
3121
6.21M
                                cs = cs0;
3122
6.21M
                                g = 1;
3123
6.21M
                                useFallback = false;
3124
6.21M
                            }
3125
12.6M
                        }
3126
25.1M
                    }
3127
25.1M
                }
3128
3129
12.8M
                if(len != 0) {
3130
12.4M
                    len = 0; /* count output bytes; it must have been abs(len) == 2 */
3131
3132
                    /* write the designation sequence if necessary */
3133
12.4M
                    if(cs != pFromU2022State->cs[g]) {
3134
12.1M
                        if(cs < CNS_11643) {
3135
6.09M
                            uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3136
6.09M
                        } else {
3137
6.09M
                            U_ASSERT(cs >= CNS_11643_1);
3138
6.09M
                            uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3139
6.09M
                        }
3140
12.1M
                        len = 4;
3141
12.1M
                        pFromU2022State->cs[g] = cs;
3142
12.1M
                        if(g == 1) {
3143
                            /* changing the SO/G1 charset invalidates the choices[] */
3144
12.1M
                            choiceCount = 0;
3145
12.1M
                        }
3146
12.1M
                    }
3147
3148
                    /* write the shift sequence if necessary */
3149
12.4M
                    if(g != pFromU2022State->g) {
3150
148k
                        switch(g) {
3151
14.1k
                        case 1:
3152
14.1k
                            buffer[len++] = UCNV_SO;
3153
3154
                            /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3155
14.1k
                            pFromU2022State->g = 1;
3156
14.1k
                            break;
3157
134k
                        case 2:
3158
134k
                            buffer[len++] = 0x1b;
3159
134k
                            buffer[len++] = 0x4e;
3160
134k
                            break;
3161
0
                        default: /* case 3 */
3162
0
                            buffer[len++] = 0x1b;
3163
0
                            buffer[len++] = 0x4f;
3164
0
                            break;
3165
148k
                        }
3166
148k
                    }
3167
3168
                    /* write the two output bytes */
3169
12.4M
                    buffer[len++] = static_cast<char>(targetValue >> 8);
3170
12.4M
                    buffer[len++] = static_cast<char>(targetValue);
3171
12.4M
                } else {
3172
                    /* if we cannot find the character after checking all codepages
3173
                     * then this is an error
3174
                     */
3175
362k
                    *err = U_INVALID_CHAR_FOUND;
3176
362k
                    cnv->fromUChar32=sourceChar;
3177
362k
                    break;
3178
362k
                }
3179
12.8M
            }
3180
3181
            /* output len>0 bytes in buffer[] */
3182
12.5M
            if(len == 1) {
3183
50.2k
                *target++ = buffer[0];
3184
50.2k
                if(offsets) {
3185
0
                    *offsets++ = static_cast<int32_t>(source - args->source - 1); /* -1: known to be ASCII */
3186
0
                }
3187
12.4M
            } else if(len == 2 && (target + 2) <= targetLimit) {
3188
119k
                *target++ = buffer[0];
3189
119k
                *target++ = buffer[1];
3190
119k
                if(offsets) {
3191
0
                    int32_t sourceIndex = static_cast<int32_t>(source - args->source - U16_LENGTH(sourceChar));
3192
0
                    *offsets++ = sourceIndex;
3193
0
                    *offsets++ = sourceIndex;
3194
0
                }
3195
12.3M
            } else {
3196
12.3M
                fromUWriteUInt8(
3197
12.3M
                    cnv,
3198
12.3M
                    buffer, len,
3199
12.3M
                    &target, reinterpret_cast<const char*>(targetLimit),
3200
12.3M
                    &offsets, static_cast<int32_t>(source - args->source - U16_LENGTH(sourceChar)),
3201
12.3M
                    err);
3202
12.3M
                if(U_FAILURE(*err)) {
3203
1.75k
                    break;
3204
1.75k
                }
3205
12.3M
            }
3206
12.5M
        } /* end if(myTargetIndex<myTargetLength) */
3207
447
        else{
3208
447
            *err =U_BUFFER_OVERFLOW_ERROR;
3209
447
            break;
3210
447
        }
3211
3212
12.8M
    }/* end while(mySourceIndex<mySourceLength) */
3213
3214
    /*
3215
     * the end of the input stream and detection of truncated input
3216
     * are handled by the framework, but for ISO-2022-CN conversion
3217
     * we need to be in ASCII mode at the very end
3218
     *
3219
     * conditions:
3220
     *   successful
3221
     *   not in ASCII mode
3222
     *   end of input and no truncated input
3223
     */
3224
371k
    if( U_SUCCESS(*err) &&
3225
403
        pFromU2022State->g!=0 &&
3226
130
        args->flush && source>=sourceLimit && cnv->fromUChar32==0
3227
371k
    ) {
3228
119
        int32_t sourceIndex;
3229
3230
        /* we are switching to ASCII */
3231
119
        pFromU2022State->g=0;
3232
3233
        /* get the source index of the last input character */
3234
        /*
3235
         * TODO this would be simpler and more reliable if we used a pair
3236
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3237
         * so that we could simply use the prevSourceIndex here;
3238
         * this code gives an incorrect result for the rare case of an unmatched
3239
         * trail surrogate that is alone in the last buffer of the text stream
3240
         */
3241
119
        sourceIndex = static_cast<int32_t>(source - args->source);
3242
119
        if(sourceIndex>0) {
3243
103
            --sourceIndex;
3244
103
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3245
0
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3246
103
            ) {
3247
0
                --sourceIndex;
3248
0
            }
3249
103
        } else {
3250
16
            sourceIndex=-1;
3251
16
        }
3252
3253
119
        fromUWriteUInt8(
3254
119
            cnv,
3255
119
            SHIFT_IN_STR, 1,
3256
119
            &target, reinterpret_cast<const char*>(targetLimit),
3257
119
            &offsets, sourceIndex,
3258
119
            err);
3259
119
    }
3260
3261
    /*save the state and return */
3262
371k
    args->source = source;
3263
371k
    args->target = reinterpret_cast<char*>(target);
3264
371k
}
3265
3266
3267
/*
 * ISO-2022-CN to Unicode conversion.
 *
 * Reads bytes from args->source up to args->sourceLimit and writes UTF-16
 * code units to args->target up to args->targetLimit. When args->offsets is
 * non-null, a source offset is recorded for every code unit written directly
 * to the target.
 *
 * The shift/designation state lives in the converter's UConverterDataISO2022
 * (toU2022State), so a call can resume where the previous one stopped:
 *   - myData->key != 0: a partial escape sequence is pending;
 *   - toULength == 1:   the lead byte of a double-byte character is pending
 *                       in toUBytes[0].
 *
 * On every exit path args->source and args->target are updated to the
 * current positions. Errors are reported through *err, e.g.
 * U_ILLEGAL_ESCAPE_SEQUENCE for an empty SO segment and
 * U_BUFFER_OVERFLOW_ERROR when the target is full while input remains.
 */
static void U_CALLCONV
3268
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3269
896k
                                               UErrorCode* err){
3270
896k
    /* scratch bytes for ucnv_MBCSSimpleGetNextUChar(): 2 bytes, or 3 for CNS 11643 (plane byte + pair) */
    char tempBuf[3];
3271
896k
    const char* mySource = const_cast<char*>(args->source);
3272
896k
    char16_t *myTarget = args->target;
3273
896k
    const char *mySourceLimit = args->sourceLimit;
3274
896k
    uint32_t targetUniChar = 0x0000;
3275
896k
    uint32_t mySourceChar = 0x0000;
3276
896k
    UConverterDataISO2022* myData;
3277
896k
    ISO2022State *pToU2022State;
3278
3279
896k
    myData = static_cast<UConverterDataISO2022*>(args->converter->extraInfo);
3280
896k
    pToU2022State = &myData->toU2022State;
3281
3282
896k
    if(myData->key != 0) {
3283
        /* continue with a partial escape sequence */
3284
36
        goto escape;
3285
896k
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3286
        /* continue with a partial double-byte character */
3287
0
        mySourceChar = args->converter->toUBytes[0];
3288
0
        args->converter->toULength = 0;
3289
0
        targetUniChar = missingCharMarker;
3290
0
        goto getTrailByte;
3291
0
    }
3292
3293
2.02M
    while(mySource < mySourceLimit){
3294
3295
2.02M
        targetUniChar =missingCharMarker;
3296
3297
2.02M
        if(myTarget < args->targetLimit){
3298
3299
2.02M
            mySourceChar = static_cast<unsigned char>(*mySource++);
3300
3301
2.02M
            switch(mySourceChar){
3302
5.43k
            case UCNV_SI:
3303
5.43k
                /* shift in: back to G0 */
                pToU2022State->g=0;
3304
5.43k
                if (myData->isEmptySegment) {
3305
556
                    myData->isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
3306
556
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3307
556
                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
3308
556
                    args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
3309
556
                    args->converter->toULength = 1;
3310
556
                    args->target = myTarget;
3311
556
                    args->source = mySource;
3312
556
                    return;
3313
556
                }
3314
4.87k
                continue;
3315
3316
9.56k
            case UCNV_SO:
3317
9.56k
                /* shift out: only valid once something has been designated to G1 (cs[1] != 0) */
                if(pToU2022State->cs[1] != 0) {
3318
5.19k
                    pToU2022State->g=1;
3319
5.19k
                    myData->isEmptySegment = true;  /* Begin a new segment, empty so far */
3320
5.19k
                    continue;
3321
5.19k
                } else {
3322
                    /* illegal to have SO before a matching designator */
3323
4.37k
                    myData->isEmptySegment = false; /* Handling a different error, reset this to avoid future spurious errs */
3324
4.37k
                    break;
3325
4.37k
                }
3326
3327
26.6k
            case ESC_2022:
3328
26.6k
                /* un-read the ESC so that changeState_2022() sees the whole sequence */
                mySource--;
3329
26.7k
escape:
3330
26.7k
                {
3331
26.7k
                    const char * mySourceBefore = mySource;
3332
26.7k
                    int8_t toULengthBefore = args->converter->toULength;
3333
3334
26.7k
                    changeState_2022(args->converter,&(mySource),
3335
26.7k
                        mySourceLimit, ISO_2022_CN,err);
3336
3337
                    /* After SO there must be at least one character before a designator (designator error handled separately) */
3338
26.7k
                    if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3339
1.27k
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3340
1.27k
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
3341
1.27k
                        /* error length = bytes already pending before this call plus bytes consumed by it */
                        args->converter->toULength = static_cast<int8_t>(toULengthBefore + (mySource - mySourceBefore));
3342
1.27k
                    }
3343
26.7k
                }
3344
3345
                /* invalid or illegal escape sequence */
3346
26.7k
                if(U_FAILURE(*err)){
3347
18.8k
                    args->target = myTarget;
3348
18.8k
                    args->source = mySource;
3349
18.8k
                    myData->isEmptySegment = false; /* Reset to avoid future spurious errors */
3350
18.8k
                    return;
3351
18.8k
                }
3352
7.87k
                continue;
3353
3354
            /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3355
3356
7.87k
            case CR:
3357
13.7k
            case LF:
3358
13.7k
                /* a line break clears the whole toUnicode shift/designation state */
                uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3359
13.7k
                U_FALLTHROUGH;
3360
1.98M
            default:
3361
                /* convert one or two bytes */
3362
1.98M
                myData->isEmptySegment = false;
3363
1.98M
                if(pToU2022State->g != 0) {
3364
404k
                    if(mySource < mySourceLimit) {
3365
404k
                        UConverterSharedData *cnv;
3366
404k
                        StateEnum tempState;
3367
404k
                        int32_t tempBufLen;
3368
404k
                        int leadIsOk, trailIsOk;
3369
404k
                        uint8_t trailByte;
3370
404k
getTrailByte:
3371
404k
                        trailByte = static_cast<uint8_t>(*mySource);
3372
                        /*
3373
                         * Ticket 5691: consistent illegal sequences:
3374
                         * - We include at least the first byte in the illegal sequence.
3375
                         * - If any of the non-initial bytes could be the start of a character,
3376
                         *   we stop the illegal sequence before the first one of those.
3377
                         *
3378
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3379
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3380
                         * Otherwise we convert or report the pair of bytes.
3381
                         */
3382
404k
                        leadIsOk = static_cast<uint8_t>(mySourceChar - 0x21) <= (0x7e - 0x21);
3383
404k
                        trailIsOk = static_cast<uint8_t>(trailByte - 0x21) <= (0x7e - 0x21);
3384
404k
                        if (leadIsOk && trailIsOk) {
3385
129k
                            ++mySource;
3386
129k
                            tempState = static_cast<StateEnum>(pToU2022State->cs[pToU2022State->g]);
3387
129k
                            if(tempState >= CNS_11643_0) {
3388
37.9k
                                cnv = myData->myConverterArray[CNS_11643];
3389
37.9k
                                /* first byte encodes the CNS plane as an offset from CNS_11643_0 */
                                tempBuf[0] = static_cast<char>(0x80 + (tempState - CNS_11643_0));
3390
37.9k
                                tempBuf[1] = static_cast<char>(mySourceChar);
3391
37.9k
                                tempBuf[2] = static_cast<char>(trailByte);
3392
37.9k
                                tempBufLen = 3;
3393
3394
91.4k
                            }else{
3395
91.4k
                                U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
3396
91.4k
                                cnv = myData->myConverterArray[tempState];
3397
91.4k
                                tempBuf[0] = static_cast<char>(mySourceChar);
3398
91.4k
                                tempBuf[1] = static_cast<char>(trailByte);
3399
91.4k
                                tempBufLen = 2;
3400
91.4k
                            }
3401
129k
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, false);
3402
129k
                            /* keep both bytes in mySourceChar for offsets and error reporting */
                            mySourceChar = (mySourceChar << 8) | trailByte;
3403
275k
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3404
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3405
253k
                            ++mySource;
3406
                            /* add another bit so that the code below writes 2 bytes in case of error */
3407
253k
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3408
253k
                        }
3409
404k
                        if(pToU2022State->g>=2) {
3410
                            /* return from a single-shift state to the previous one */
3411
1.67k
                            pToU2022State->g=pToU2022State->prevG;
3412
1.67k
                        }
3413
404k
                    } else {
3414
15
                        /* input ends after the lead byte: stash it so that the next call can resume */
                        args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
3415
15
                        args->converter->toULength = 1;
3416
15
                        goto endloop;
3417
15
                    }
3418
404k
                }
3419
1.58M
                else{
3420
1.58M
                    /* G0: bytes up to 0x7f map to themselves; anything else stays missingCharMarker */
                    if(mySourceChar <= 0x7f) {
3421
1.00M
                        targetUniChar = static_cast<char16_t>(mySourceChar);
3422
1.00M
                    }
3423
1.58M
                }
3424
1.98M
                break;
3425
2.02M
            }
3426
1.99M
            /* values below missingCharMarker-1 are written as a single UTF-16 code unit */
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3427
1.11M
                if(args->offsets){
3428
0
                    /* offset of the first source byte: step back 1 for a single byte, 2 for a byte pair */
                    args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3429
0
                }
3430
1.11M
                *(myTarget++) = static_cast<char16_t>(targetUniChar);
3431
1.11M
            }
3432
876k
            else if(targetUniChar > missingCharMarker){
3433
                /* disassemble the surrogate pair and write to output*/
3434
0
                targetUniChar-=0x0010000;
3435
0
                *myTarget = static_cast<char16_t>(0xd800 + static_cast<char16_t>(targetUniChar >> 10));
3436
0
                if(args->offsets){
3437
0
                    args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3438
0
                }
3439
0
                ++myTarget;
3440
0
                if(myTarget< args->targetLimit){
3441
0
                    *myTarget = static_cast<char16_t>(0xdc00 + static_cast<char16_t>(targetUniChar & 0x3ff));
3442
0
                    if(args->offsets){
3443
0
                        args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3444
0
                    }
3445
0
                    ++myTarget;
3446
0
                }else{
3447
0
                    /* no room for the trail surrogate: park it in the converter's UCharErrorBuffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3448
0
                                    static_cast<char16_t>(0xdc00 + static_cast<char16_t>(targetUniChar & 0x3ff));
3449
0
                }
3450
3451
0
            }
3452
876k
            else{
3453
                /* Call the callback function*/
3454
876k
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3455
876k
                break;
3456
876k
            }
3457
1.99M
        }
3458
0
        else{
3459
0
            /* target is full while input remains */
            *err =U_BUFFER_OVERFLOW_ERROR;
3460
0
            break;
3461
0
        }
3462
2.02M
    }
3463
877k
endloop:
3464
877k
    /* publish how far we got back to the caller */
    args->target = myTarget;
3465
877k
    args->source = mySource;
3466
877k
}
3467
#endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3468
3469
static void U_CALLCONV
3470
8.88M
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3471
8.88M
    UConverter *cnv = args->converter;
3472
8.88M
    UConverterDataISO2022* myConverterData = static_cast<UConverterDataISO2022*>(cnv->extraInfo);
3473
8.88M
    ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3474
8.88M
    char *p, *subchar;
3475
8.88M
    char buffer[8];
3476
8.88M
    int32_t length;
3477
3478
8.88M
    subchar = reinterpret_cast<char*>(cnv->subChars);
3479
8.88M
    length=cnv->subCharLen; /* assume length==1 for most variants */
3480
3481
8.88M
    p = buffer;
3482
8.88M
    switch(myConverterData->locale[0]){
3483
403k
    case 'j':
3484
403k
        {
3485
403k
            int8_t cs;
3486
3487
403k
            if(pFromU2022State->g == 1) {
3488
                /* JIS7: switch from G1 to G0 */
3489
205
                pFromU2022State->g = 0;
3490
205
                *p++ = UCNV_SI;
3491
205
            }
3492
3493
403k
            cs = pFromU2022State->cs[0];
3494
403k
            if(cs != ASCII && cs != JISX201) {
3495
                /* not in ASCII or JIS X 0201: switch to ASCII */
3496
393k
                pFromU2022State->cs[0] = static_cast<int8_t>(ASCII);
3497
393k
                *p++ = '\x1b';
3498
393k
                *p++ = '\x28';
3499
393k
                *p++ = '\x42';
3500
393k
            }
3501
3502
403k
            *p++ = subchar[0];
3503
403k
            break;
3504
0
        }
3505
367k
    case 'c':
3506
367k
        if(pFromU2022State->g != 0) {
3507
            /* not in ASCII mode: switch to ASCII */
3508
6.78k
            pFromU2022State->g = 0;
3509
6.78k
            *p++ = UCNV_SI;
3510
6.78k
        }
3511
367k
        *p++ = subchar[0];
3512
367k
        break;
3513
8.11M
    case 'k':
3514
8.11M
        if(myConverterData->version == 0) {
3515
6.43M
            if(length == 1) {
3516
6.43M
                if(args->converter->fromUnicodeStatus) {
3517
                    /* in DBCS mode: switch to SBCS */
3518
6.00M
                    args->converter->fromUnicodeStatus = 0;
3519
6.00M
                    *p++ = UCNV_SI;
3520
6.00M
                }
3521
6.43M
                *p++ = subchar[0];
3522
6.43M
            } else /* length == 2*/ {
3523
0
                if(!args->converter->fromUnicodeStatus) {
3524
                    /* in SBCS mode: switch to DBCS */
3525
0
                    args->converter->fromUnicodeStatus = 1;
3526
0
                    *p++ = UCNV_SO;
3527
0
                }
3528
0
                *p++ = subchar[0];
3529
0
                *p++ = subchar[1];
3530
0
            }
3531
6.43M
            break;
3532
6.43M
        } else {
3533
            /* save the subconverter's substitution string */
3534
1.68M
            uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3535
1.68M
            int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3536
3537
            /* set our substitution string into the subconverter */
3538
1.68M
            myConverterData->currentConverter->subChars = reinterpret_cast<uint8_t*>(subchar);
3539
1.68M
            myConverterData->currentConverter->subCharLen = static_cast<int8_t>(length);
3540
3541
            /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3542
1.68M
            args->converter = myConverterData->currentConverter;
3543
1.68M
            myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3544
1.68M
            ucnv_cbFromUWriteSub(args, 0, err);
3545
1.68M
            cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3546
1.68M
            args->converter = cnv;
3547
3548
            /* restore the subconverter's substitution string */
3549
1.68M
            myConverterData->currentConverter->subChars = currentSubChars;
3550
1.68M
            myConverterData->currentConverter->subCharLen = currentSubCharLen;
3551
3552
1.68M
            if(*err == U_BUFFER_OVERFLOW_ERROR) {
3553
71
                if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3554
71
                    uprv_memcpy(
3555
71
                        cnv->charErrorBuffer,
3556
71
                        myConverterData->currentConverter->charErrorBuffer,
3557
71
                        myConverterData->currentConverter->charErrorBufferLength);
3558
71
                }
3559
71
                cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3560
71
                myConverterData->currentConverter->charErrorBufferLength = 0;
3561
71
            }
3562
1.68M
            return;
3563
1.68M
        }
3564
0
    default:
3565
        /* not expected */
3566
0
        break;
3567
8.88M
    }
3568
7.20M
    ucnv_cbFromUWriteBytes(args,
3569
7.20M
                           buffer, static_cast<int32_t>(p - buffer),
3570
7.20M
                           offsetIndex, err);
3571
7.20M
}
3572
3573
/*
3574
 * Structure for cloning an ISO 2022 converter into a single memory block.
3575
 */
3576
struct cloneStruct
3577
{
3578
    /* the cloned ISO 2022 converter itself; _ISO_2022_SafeClone() returns a pointer to this member */
    UConverter cnv;
3579
    /* storage handed to ucnv_safeClone() for the clone of the source's currentConverter */
    UConverter currentConverter;
3580
    /* copy of the source converter's UConverterDataISO2022; the clone's extraInfo points here */
    UConverterDataISO2022 mydata;
3581
};
3582
3583
3584
U_CDECL_BEGIN
3585
3586
/*
 * Clone hook for ISO 2022 converters.
 *
 * With *pBufferSize == 0 this is a preflight request: the required size
 * (sizeof(struct cloneStruct)) is stored into *pBufferSize and nullptr is
 * returned. Otherwise stackBuffer is treated as a struct cloneStruct whose
 * leading UConverter has already been copied by ucnv_safeClone(); this
 * function fills in the private data, clones the current sub-converter (if
 * any) and takes a reference on every shared sub-converter.
 *
 * Returns the clone, or nullptr on preflight or when *status is/becomes a failure.
 */
static UConverter * U_CALLCONV
_ISO_2022_SafeClone(
            const UConverter *cnv,
            void *stackBuffer,
            int32_t *pBufferSize,
            UErrorCode *status)
{
    if (U_FAILURE(*status)) {
        return nullptr;
    }

    if (*pBufferSize == 0) {
        /* preflighting: only report how much memory a clone needs */
        *pBufferSize = static_cast<int32_t>(sizeof(struct cloneStruct));
        return nullptr;
    }

    UConverterDataISO2022 *srcData = static_cast<UConverterDataISO2022 *>(cnv->extraInfo);
    struct cloneStruct *clone = static_cast<struct cloneStruct *>(stackBuffer);

    /* the main UConverter was copied by the caller; duplicate the private data and hook it up */
    uprv_memcpy(&clone->mydata, srcData, sizeof(UConverterDataISO2022));
    clone->cnv.extraInfo = &clone->mydata;
    clone->cnv.isExtraLocal = true;

    /* the current sub-converter gets its own copy inside the clone block */
    if (srcData->currentConverter != nullptr) {
        int32_t subSize = static_cast<int32_t>(sizeof(UConverter));
        clone->mydata.currentConverter =
            ucnv_safeClone(srcData->currentConverter,
                           &clone->currentConverter,
                           &subSize, status);
        if (U_FAILURE(*status)) {
            return nullptr;
        }
    }

    /* the table-based sub-converters are shared: one more reference each */
    for (int32_t idx = 0; idx < UCNV_2022_MAX_CONVERTERS; ++idx) {
        UConverterSharedData *shared = srcData->myConverterArray[idx];
        if (shared != nullptr) {
            ucnv_incrementRefCount(shared);
        }
    }

    return &clone->cnv;
}
3636
3637
U_CDECL_END
3638
3639
static void U_CALLCONV
3640
_ISO_2022_GetUnicodeSet(const UConverter *cnv,
3641
                    const USetAdder *sa,
3642
                    UConverterUnicodeSet which,
3643
                    UErrorCode *pErrorCode)
3644
0
{
3645
0
    int32_t i;
3646
0
    UConverterDataISO2022* cnvData;
3647
3648
0
    if (U_FAILURE(*pErrorCode)) {
3649
0
        return;
3650
0
    }
3651
#ifdef U_ENABLE_GENERIC_ISO_2022
3652
    if (cnv->sharedData == &_ISO2022Data) {
3653
        /* We use UTF-8 in this case */
3654
        sa->addRange(sa->set, 0, 0xd7FF);
3655
        sa->addRange(sa->set, 0xE000, 0x10FFFF);
3656
        return;
3657
    }
3658
#endif
3659
3660
0
    cnvData = static_cast<UConverterDataISO2022*>(cnv->extraInfo);
3661
3662
    /* open a set and initialize it with code points that are algorithmically round-tripped */
3663
0
    switch(cnvData->locale[0]){
3664
0
    case 'j':
3665
        /* include JIS X 0201 which is hardcoded */
3666
0
        sa->add(sa->set, 0xa5);
3667
0
        sa->add(sa->set, 0x203e);
3668
0
        if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3669
            /* include Latin-1 for some variants of JP */
3670
0
            sa->addRange(sa->set, 0, 0xff);
3671
0
        } else {
3672
            /* include ASCII for JP */
3673
0
            sa->addRange(sa->set, 0, 0x7f);
3674
0
        }
3675
0
        if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3676
            /*
3677
             * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3678
             * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3679
             * use half-width Katakana.
3680
             * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3681
             * half-width Katakana via the ESC ( I sequence.
3682
             * However, we only emit (fromUnicode) half-width Katakana according to the
3683
             * definition of each variant.
3684
             *
3685
             * When including fallbacks,
3686
             * we need to include half-width Katakana Unicode code points for all JP variants because
3687
             * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3688
             */
3689
            /* include half-width Katakana for JP */
3690
0
            sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3691
0
        }
3692
0
        break;
3693
0
#if !UCONFIG_ONLY_HTML_CONVERSION
3694
0
    case 'c':
3695
0
    case 'z':
3696
        /* include ASCII for CN */
3697
0
        sa->addRange(sa->set, 0, 0x7f);
3698
0
        break;
3699
0
    case 'k':
3700
        /* there is only one converter for KR, and it is not in the myConverterArray[] */
3701
0
        cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3702
0
                cnvData->currentConverter, sa, which, pErrorCode);
3703
        /* the loop over myConverterArray[] will simply not find another converter */
3704
0
        break;
3705
0
#endif
3706
0
    default:
3707
0
        break;
3708
0
    }
3709
3710
#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3711
            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3712
                cnvData->version==0 && i==CNS_11643
3713
            ) {
3714
                /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3715
                ucnv_MBCSGetUnicodeSetForBytes(
3716
                        cnvData->myConverterArray[i],
3717
                        sa, UCNV_ROUNDTRIP_SET,
3718
                        0, 0x81, 0x82,
3719
                        pErrorCode);
3720
            }
3721
#endif
3722
3723
0
    for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3724
0
        UConverterSetFilter filter;
3725
0
        if(cnvData->myConverterArray[i]!=nullptr) {
3726
0
            if(cnvData->locale[0]=='j' && i==JISX208) {
3727
                /*
3728
                 * Only add code points that map to Shift-JIS codes
3729
                 * corresponding to JIS X 0208.
3730
                 */
3731
0
                filter=UCNV_SET_FILTER_SJIS;
3732
0
#if !UCONFIG_ONLY_HTML_CONVERSION
3733
0
            } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3734
0
                       cnvData->version==0 && i==CNS_11643) {
3735
                /*
3736
                 * Version-specific for CN:
3737
                 * CN version 0 does not map CNS planes 3..7 although
3738
                 * they are all available in the CNS conversion table;
3739
                 * CN version 1 (-EXT) does map them all.
3740
                 * The two versions create different Unicode sets.
3741
                 */
3742
0
                filter=UCNV_SET_FILTER_2022_CN;
3743
0
            } else if(i==KSC5601) {
3744
                /*
3745
                 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3746
                 * are broader than GR94.
3747
                 */
3748
0
                filter=UCNV_SET_FILTER_GR94DBCS;
3749
0
#endif
3750
0
            } else {
3751
0
                filter=UCNV_SET_FILTER_NONE;
3752
0
            }
3753
0
            ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3754
0
        }
3755
0
    }
3756
3757
    /*
3758
     * ISO 2022 converters must not convert SO/SI/ESC despite what
3759
     * sub-converters do by themselves.
3760
     * Remove these characters from the set.
3761
     */
3762
0
    sa->remove(sa->set, 0x0e);
3763
0
    sa->remove(sa->set, 0x0f);
3764
0
    sa->remove(sa->set, 0x1b);
3765
3766
    /* ISO 2022 converters do not convert C1 controls either */
3767
0
    sa->removeRange(sa->set, 0x80, 0x9f);
3768
0
}
3769
3770
/*
 * Implementation table for the generic ISO 2022 converter.
 * It shares the open/close/reset, getName, writeSub, safeClone and
 * getUnicodeSet handlers with the language-specific tables. The four
 * conversion-function entries are only populated when
 * U_ENABLE_GENERIC_ISO_2022 is defined; otherwise they are nullptr.
 * The initializer is positional, so entries must stay in this order.
 */
static const UConverterImpl _ISO2022Impl={
3771
    UCNV_ISO_2022,
3772
3773
    nullptr,
3774
    nullptr,
3775
3776
    _ISO2022Open,
3777
    _ISO2022Close,
3778
    _ISO2022Reset,
3779
3780
#ifdef U_ENABLE_GENERIC_ISO_2022
3781
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3782
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3783
    ucnv_fromUnicode_UTF8,
3784
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3785
#else
3786
    nullptr,
3787
    nullptr,
3788
    nullptr,
3789
    nullptr,
3790
#endif
3791
    nullptr,
3792
3793
    nullptr,
3794
    _ISO2022getName,
3795
    _ISO_2022_WriteSub,
3796
    _ISO_2022_SafeClone,
3797
    _ISO_2022_GetUnicodeSet,
3798
3799
    nullptr,
3800
    nullptr
3801
};
3802
/*
 * Static descriptor for the generic "ISO_2022" converter.
 * Positional initializer; { 0x1a, 0, 0, 0 } is presumably the default
 * substitution byte sequence - confirm against UConverterStaticData.
 */
static const UConverterStaticData _ISO2022StaticData={
3803
    sizeof(UConverterStaticData),
3804
    "ISO_2022",
3805
    2022,
3806
    UCNV_IBM,
3807
    UCNV_ISO_2022,
3808
    1,
3809
    3, /* max 3 bytes per char16_t from UTF-8 (4 bytes from surrogate _pair_) */
3810
    { 0x1a, 0, 0, 0 },
3811
    1,
3812
    false,
3813
    false,
3814
    0,
3815
    0,
3816
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3817
};
3818
/* Immutable shared data for the generic ISO 2022 converter: ties its static descriptor to its implementation table. */
const UConverterSharedData _ISO2022Data=
3819
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl);
3820
3821
/*************JP****************/
3822
/*
 * Implementation table for ISO-2022-JP.
 * The same function is listed twice for each direction: the
 * *_OFFSETS_LOGIC functions fill both consecutive conversion slots.
 * The remaining handlers are the ones shared by all ISO 2022 variants.
 * The initializer is positional, so entries must stay in this order.
 */
static const UConverterImpl _ISO2022JPImpl={
3823
    UCNV_ISO_2022,
3824
3825
    nullptr,
3826
    nullptr,
3827
3828
    _ISO2022Open,
3829
    _ISO2022Close,
3830
    _ISO2022Reset,
3831
3832
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3833
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3834
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3835
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3836
    nullptr,
3837
3838
    nullptr,
3839
    _ISO2022getName,
3840
    _ISO_2022_WriteSub,
3841
    _ISO_2022_SafeClone,
3842
    _ISO_2022_GetUnicodeSet,
3843
3844
    nullptr,
3845
    nullptr
3846
};
3847
/*
 * Static descriptor for the "ISO_2022_JP" converter.
 * Positional initializer; { 0x1a, 0, 0, 0 } is presumably the default
 * substitution byte sequence - confirm against UConverterStaticData.
 */
static const UConverterStaticData _ISO2022JPStaticData={
3848
    sizeof(UConverterStaticData),
3849
    "ISO_2022_JP",
3850
    0,
3851
    UCNV_IBM,
3852
    UCNV_ISO_2022,
3853
    1,
3854
    6, /* max 6 bytes per char16_t: 4-byte escape sequence + DBCS */
3855
    { 0x1a, 0, 0, 0 },
3856
    1,
3857
    false,
3858
    false,
3859
    0,
3860
    0,
3861
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3862
};
3863
3864
/* Immutable shared data for ISO-2022-JP, kept file-local by the unnamed namespace. */
namespace {
3865
3866
const UConverterSharedData _ISO2022JPData=
3867
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl);
3868
3869
}  // namespace
3870
3871
#if !UCONFIG_ONLY_HTML_CONVERSION
3872
/************* KR ***************/
3873
/*
 * Implementation table for ISO-2022-KR.
 * The same function is listed twice for each direction: the
 * *_OFFSETS_LOGIC functions fill both consecutive conversion slots.
 * The remaining handlers are the ones shared by all ISO 2022 variants.
 * The initializer is positional, so entries must stay in this order.
 */
static const UConverterImpl _ISO2022KRImpl={
3874
    UCNV_ISO_2022,
3875
3876
    nullptr,
3877
    nullptr,
3878
3879
    _ISO2022Open,
3880
    _ISO2022Close,
3881
    _ISO2022Reset,
3882
3883
    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3884
    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3885
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3886
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3887
    nullptr,
3888
3889
    nullptr,
3890
    _ISO2022getName,
3891
    _ISO_2022_WriteSub,
3892
    _ISO_2022_SafeClone,
3893
    _ISO_2022_GetUnicodeSet,
3894
3895
    nullptr,
3896
    nullptr
3897
};
3898
/*
 * Static descriptor for the "ISO_2022_KR" converter.
 * Positional initializer; { 0x1a, 0, 0, 0 } is presumably the default
 * substitution byte sequence - confirm against UConverterStaticData.
 */
static const UConverterStaticData _ISO2022KRStaticData={
3899
    sizeof(UConverterStaticData),
3900
    "ISO_2022_KR",
3901
    0,
3902
    UCNV_IBM,
3903
    UCNV_ISO_2022,
3904
    1,
3905
    8, /* max 8 bytes per char16_t */
3906
    { 0x1a, 0, 0, 0 },
3907
    1,
3908
    false,
3909
    false,
3910
    0,
3911
    0,
3912
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3913
};
3914
3915
/* Immutable shared data for ISO-2022-KR, kept file-local by the unnamed namespace. */
namespace {
3916
3917
const UConverterSharedData _ISO2022KRData=
3918
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl);
3919
3920
}  // namespace
3921
3922
/*************** CN ***************/
3923
/*
 * Implementation table for ISO-2022-CN.
 * The same function is listed twice for each direction: the
 * *_OFFSETS_LOGIC functions fill both consecutive conversion slots.
 * The remaining handlers are the ones shared by all ISO 2022 variants.
 * The initializer is positional, so entries must stay in this order.
 */
static const UConverterImpl _ISO2022CNImpl={
3924
3925
    UCNV_ISO_2022,
3926
3927
    nullptr,
3928
    nullptr,
3929
3930
    _ISO2022Open,
3931
    _ISO2022Close,
3932
    _ISO2022Reset,
3933
3934
    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3935
    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3936
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3937
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3938
    nullptr,
3939
3940
    nullptr,
3941
    _ISO2022getName,
3942
    _ISO_2022_WriteSub,
3943
    _ISO_2022_SafeClone,
3944
    _ISO_2022_GetUnicodeSet,
3945
3946
    nullptr,
3947
    nullptr
3948
};
3949
/*
 * Static descriptor for the "ISO_2022_CN" converter.
 * Positional initializer; { 0x1a, 0, 0, 0 } is presumably the default
 * substitution byte sequence - confirm against UConverterStaticData.
 */
static const UConverterStaticData _ISO2022CNStaticData={
3950
    sizeof(UConverterStaticData),
3951
    "ISO_2022_CN",
3952
    0,
3953
    UCNV_IBM,
3954
    UCNV_ISO_2022,
3955
    1,
3956
    8, /* max 8 bytes per char16_t: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3957
    { 0x1a, 0, 0, 0 },
3958
    1,
3959
    false,
3960
    false,
3961
    0,
3962
    0,
3963
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3964
};
3965
3966
/* Immutable shared data for ISO-2022-CN, kept file-local by the unnamed namespace. */
namespace {
3967
3968
const UConverterSharedData _ISO2022CNData=
3969
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl);
3970
3971
}  // namespace
3972
#endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3973
3974
#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */