Coverage Report

Created: 2023-03-04 06:53

/src/icu/icu4c/source/common/ucnv2022.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
**********************************************************************
5
*   Copyright (C) 2000-2016, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
**********************************************************************
8
*   file name:  ucnv2022.cpp
9
*   encoding:   UTF-8
10
*   tab size:   8 (not used)
11
*   indentation:4
12
*
13
*   created on: 2000feb03
14
*   created by: Markus W. Scherer
15
*
16
*   Change history:
17
*
18
*   06/29/2000  helena  Major rewrite of the callback APIs.
19
*   08/08/2000  Ram     Included support for ISO-2022-JP-2
20
*                       Changed implementation of toUnicode
21
*                       function
22
*   08/21/2000  Ram     Added support for ISO-2022-KR
23
*   08/29/2000  Ram     Separated implementation of EBCDIC to
24
*                       ucnvebdc.c
25
*   09/20/2000  Ram     Added support for ISO-2022-CN
26
*                       Added implementations for getNextUChar()
27
*                       for specific 2022 country variants.
28
*   10/31/2000  Ram     Implemented offsets logic functions
29
*/
30
31
#include "unicode/utypes.h"
32
33
#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
34
35
#include "unicode/ucnv.h"
36
#include "unicode/uset.h"
37
#include "unicode/ucnv_err.h"
38
#include "unicode/ucnv_cb.h"
39
#include "unicode/utf16.h"
40
#include "ucnv_imp.h"
41
#include "ucnv_bld.h"
42
#include "ucnv_cnv.h"
43
#include "ucnvmbcs.h"
44
#include "cstring.h"
45
#include "cmemory.h"
46
#include "uassert.h"
47
48
#ifdef U_ENABLE_GENERIC_ISO_2022
49
/*
50
 * I am disabling the generic ISO-2022 converter after proposing to do so on
51
 * the icu mailing list two days ago.
52
 *
53
 * Reasons:
54
 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
55
 *    its designation sequences, single shifts with return to the previous state,
56
 *    switch-with-no-return to UTF-16BE or similar, etc.
57
 *    This is unlike the language-specific variants like ISO-2022-JP which
58
 *    require a much smaller repertoire of ISO-2022 features.
59
 *    These variants continue to be supported.
60
 * 2. I believe that no one is really using the generic ISO-2022 converter
61
 *    but rather always one of the language-specific variants.
62
 *    Note that ICU's generic ISO-2022 converter has always output one escape
63
 *    sequence followed by UTF-8 for the whole stream.
64
 * 3. Switching between subcharsets is extremely slow, because each time
65
 *    the previous converter is closed and a new one opened,
66
 *    without any kind of caching, least-recently-used list, etc.
67
 * 4. The code is currently buggy, and given the above it does not seem
68
 *    reasonable to spend the time on maintenance.
69
 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
70
 *    This means, for example, that when ISO-8859-7 is designated, the following
71
 *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
72
 *    The ICU ISO-2022 converter does not handle this - and has no information
73
 *    about which subconverter would have to be shifted vs. which is designed
74
 *    for 7-bit ISO-2022.
75
 *
76
 * Markus Scherer 2003-dec-03
77
 */
78
#endif
79
80
#if !UCONFIG_ONLY_HTML_CONVERSION
81
static const char SHIFT_IN_STR[]  = "\x0F";
82
// static const char SHIFT_OUT_STR[] = "\x0E";
83
#endif
84
85
15.3M
#define CR      0x0D
86
7.69M
#define LF      0x0A
87
#define H_TAB   0x09
88
#define V_TAB   0x0B
89
#define SPACE   0x20
90
91
enum {
92
    HWKANA_START=0xff61,
93
    HWKANA_END=0xff9f
94
};
95
96
/*
97
 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
98
 * as bytes 21..7E. (Subtract 0x80.)
99
 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
100
 * as bytes 20..7F. (Subtract 0x80.)
101
 * Do not encode C1 control codes with native bytes 80..9F
102
 * as bytes 00..1F (C0 control codes).
103
 */
104
enum {
105
    GR94_START=0xa1,
106
    GR94_END=0xfe,
107
    GR96_START=0xa0,
108
    GR96_END=0xff
109
};
110
111
/*
112
 * ISO 2022 control codes must not be converted from Unicode
113
 * because they would mess up the byte stream.
114
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
115
 * corresponding to SO, SI, and ESC.
116
 */
117
22.0M
#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
118
119
/* for ISO-2022-JP and -CN implementations */
120
typedef enum  {
121
        /* shared values */
122
        INVALID_STATE=-1,
123
        ASCII = 0,
124
125
        SS2_STATE=0x10,
126
        SS3_STATE,
127
128
        /* JP */
129
        ISO8859_1 = 1 ,
130
        ISO8859_7 = 2 ,
131
        JISX201  = 3,
132
        JISX208 = 4,
133
        JISX212 = 5,
134
        GB2312  =6,
135
        KSC5601 =7,
136
        HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */
137
138
        /* CN */
139
        /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
140
        GB2312_1=1,
141
        ISO_IR_165=2,
142
        CNS_11643=3,
143
144
        /*
145
         * these are used in StateEnum and ISO2022State variables,
146
         * but CNS_11643 must be used to index into myConverterArray[]
147
         */
148
        CNS_11643_0=0x20,
149
        CNS_11643_1,
150
        CNS_11643_2,
151
        CNS_11643_3,
152
        CNS_11643_4,
153
        CNS_11643_5,
154
        CNS_11643_6,
155
        CNS_11643_7
156
} StateEnum;
157
158
/* is the StateEnum charset value for a DBCS charset? */
159
#if UCONFIG_ONLY_HTML_CONVERSION
160
#define IS_JP_DBCS(cs) (JISX208==(cs))
161
#else
162
391
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
163
#endif
164
165
134M
#define CSM(cs) ((uint16_t)1<<(cs))
166
167
/*
168
 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
169
 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
170
 *
171
 * Note: The converter uses some leniency:
172
 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
173
 *   all versions, not just JIS7 and JIS8.
174
 * - ICU does not distinguish between different versions of JIS X 0208.
175
 */
176
#if UCONFIG_ONLY_HTML_CONVERSION
177
enum { MAX_JA_VERSION=0 };
178
#else
179
enum { MAX_JA_VERSION=4 };
180
#endif
181
static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
182
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
183
#if !UCONFIG_ONLY_HTML_CONVERSION
184
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
185
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
186
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
187
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
188
#endif
189
};
190
191
typedef enum {
192
        ASCII1=0,
193
        LATIN1,
194
        SBCS,
195
        DBCS,
196
        MBCS,
197
        HWKANA
198
}Cnv2022Type;
199
200
typedef struct ISO2022State {
201
    int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
202
    int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
203
    int8_t prevG;       /* g before single shift (SS2 or SS3) */
204
} ISO2022State;
205
206
1.25k
#define UCNV_OPTIONS_VERSION_MASK 0xf
207
13.7k
#define UCNV_2022_MAX_CONVERTERS 10
208
209
typedef struct{
210
    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
211
    UConverter *currentConverter;
212
    Cnv2022Type currentType;
213
    ISO2022State toU2022State, fromU2022State;
214
    uint32_t key;
215
    uint32_t version;
216
#ifdef U_ENABLE_GENERIC_ISO_2022
217
    UBool isFirstBuffer;
218
#endif
219
    UBool isEmptySegment;
220
    char name[30];
221
    char locale[3];
222
}UConverterDataISO2022;
223
224
/* Protos */
225
/* ISO-2022 ----------------------------------------------------------------- */
226
227
/*Forward declaration */
228
U_CFUNC void U_CALLCONV
229
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
230
                      UErrorCode * err);
231
U_CFUNC void U_CALLCONV
232
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
233
                                    UErrorCode * err);
234
235
0
#define ESC_2022 0x1B /*ESC*/
236
237
typedef enum
238
{
239
        INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
240
        VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
241
        VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
242
        VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
243
} UCNV_TableStates_2022;
244
245
/*
246
* The way these state transition arrays work is:
247
* ex : ESC$B is the sequence for JISX208
248
*      a) First Iteration: char is ESC
249
*          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
250
*             int x = normalize_esq_chars_2022[27] which is equal to 1
251
*         ii) Search for this value in escSeqStateTable_Key_2022[]
252
*             value of x is stored at escSeqStateTable_Key_2022[0]
253
*        iii) Save this index as offset
254
*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
255
*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
256
*     b) Switch on this state and continue to next char
257
*          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
258
*             which is normalize_esq_chars_2022[36] == 4
259
*         ii) x is currently 1(from above)
260
*               x<<=5 -- x is now 32
261
*               x+=normalize_esq_chars_2022[36]
262
*               now x is 36
263
*        iii) Search for this value in escSeqStateTable_Key_2022[]
264
*             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
265
*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
266
*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
267
*     c) Switch on this state and continue to next char
268
*        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
269
*        ii) x is currently 36 (from above)
270
*            x<<=5 -- x is now 1152
271
*            x+=normalize_esq_chars_2022[66]
272
*            now x is 1161
273
*       iii) Search for this value in escSeqStateTable_Key_2022[]
274
*            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
274
*        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
275
*            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
276
*         v) Get the converter name from escSeqStateTable_Result_2022[24] which is "JISX-208"
278
*/
279
280
281
/*Below are the 3 arrays depicting a state transition table*/
282
static const int8_t normalize_esq_chars_2022[256] = {
283
/*       0      1       2       3       4      5       6        7       8       9           */
284
285
         0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
286
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
287
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
288
        ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
289
        ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
290
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
291
        ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
292
        ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
293
        ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
294
        ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
295
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
296
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
297
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
298
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
299
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
300
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
301
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
302
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
303
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
304
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
305
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
306
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
307
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
308
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
309
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
310
        ,0     ,0      ,0      ,0      ,0      ,0
311
};
312
313
#ifdef U_ENABLE_GENERIC_ISO_2022
314
/*
315
 * When the generic ISO-2022 converter is completely removed, not just disabled
316
 * per #ifdef, then the following state table and the associated tables that are
317
 * dimensioned with MAX_STATES_2022 should be trimmed.
318
 *
319
 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
320
 * the associated escape sequences starting with ESC ( B should be removed.
321
 * This includes the ones with key values 1097 and all of the ones above 1000000.
322
 *
323
 * For the latter, the tables can simply be truncated.
324
 * For the former, since the tables must be kept parallel, it is probably best
325
 * to simply duplicate an adjacent table cell, parallel in all tables.
326
 *
327
 * It may make sense to restructure the tables, especially by using small search
328
 * tables for the variants instead of indexing them parallel to the table here.
329
 */
330
#endif
331
332
0
#define MAX_STATES_2022 74
333
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
334
/*   0           1           2           3           4           5           6           7           8           9           */
335
336
     1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
337
    ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
338
    ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
339
    ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
340
    ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
341
    ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
342
    ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
343
    ,35947631   ,35947635   ,35947636   ,35947638
344
};
345
346
#ifdef U_ENABLE_GENERIC_ISO_2022
347
348
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
349
 /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */
350
351
     nullptr                   ,nullptr                   ,nullptr                   ,nullptr               ,nullptr               ,nullptr                   ,nullptr                   ,nullptr                   ,"latin1"               ,"latin1"
352
    ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
353
    ,"latin1"               ,nullptr                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,nullptr                   ,nullptr                   ,nullptr                   ,nullptr                   ,"UTF8"
354
    ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,nullptr               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
355
    ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
356
    ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
357
    ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,nullptr               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
358
    ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
359
};
360
361
#endif
362
363
static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
364
/*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
365
     VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
366
    ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
367
    ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
368
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
369
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
370
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
371
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
372
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
373
};
374
375
/* Type def for refactoring changeState_2022 code*/
376
typedef enum{
377
#ifdef U_ENABLE_GENERIC_ISO_2022
378
    ISO_2022=0,
379
#endif
380
    ISO_2022_JP=1,
381
#if !UCONFIG_ONLY_HTML_CONVERSION
382
    ISO_2022_KR=2,
383
    ISO_2022_CN=3
384
#endif
385
} Variant2022;
386
387
/*********** ISO 2022 Converter Protos ***********/
388
static void U_CALLCONV
389
_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
390
391
static void U_CALLCONV
392
 _ISO2022Close(UConverter *converter);
393
394
static void U_CALLCONV
395
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
396
397
U_CDECL_BEGIN
398
static const char * U_CALLCONV
399
_ISO2022getName(const UConverter* cnv);
400
U_CDECL_END
401
402
static void  U_CALLCONV
403
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
404
405
U_CDECL_BEGIN
406
static UConverter * U_CALLCONV
407
_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
408
409
U_CDECL_END
410
411
#ifdef U_ENABLE_GENERIC_ISO_2022
412
static void U_CALLCONV
413
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
414
#endif
415
416
namespace {
417
418
/*const UConverterSharedData _ISO2022Data;*/
419
extern const UConverterSharedData _ISO2022JPData;
420
421
#if !UCONFIG_ONLY_HTML_CONVERSION
422
extern const UConverterSharedData _ISO2022KRData;
423
extern const UConverterSharedData _ISO2022CNData;
424
#endif
425
426
}  // namespace
427
428
/*************** Converter implementations ******************/
429
430
/* The purpose of this function is to get around gcc compiler warnings. */
431
static inline void
432
fromUWriteUInt8(UConverter *cnv,
433
                 const char *bytes, int32_t length,
434
                 uint8_t **target, const char *targetLimit,
435
                 int32_t **offsets,
436
                 int32_t sourceIndex,
437
                 UErrorCode *pErrorCode)
438
20.6M
{
439
20.6M
    char *targetChars = (char *)*target;
440
20.6M
    ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
441
20.6M
                         offsets, sourceIndex, pErrorCode);
442
20.6M
    *target = (uint8_t*)targetChars;
443
444
20.6M
}
445
446
static inline void
447
294
setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
448
294
    if(myConverterData->version == 1) {
449
79
        UConverter *cnv = myConverterData->currentConverter;
450
451
79
        cnv->toUnicodeStatus=0;     /* offset */
452
79
        cnv->mode=0;                /* state */
453
79
        cnv->toULength=0;           /* byteIndex */
454
79
    }
455
294
}
456
457
static inline void
458
876
setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
459
   /* in ISO-2022-KR the designator sequence appears only once
460
    * in a file so we append it only once
461
    */
462
876
    if( converter->charErrorBufferLength==0){
463
464
876
        converter->charErrorBufferLength = 4;
465
876
        converter->charErrorBuffer[0] = 0x1b;
466
876
        converter->charErrorBuffer[1] = 0x24;
467
876
        converter->charErrorBuffer[2] = 0x29;
468
876
        converter->charErrorBuffer[3] = 0x43;
469
876
    }
470
876
    if(myConverterData->version == 1) {
471
233
        UConverter *cnv = myConverterData->currentConverter;
472
473
233
        cnv->fromUChar32=0;
474
233
        cnv->fromUnicodeStatus=1;   /* prevLength */
475
233
    }
476
876
}
477
478
static void U_CALLCONV
479
1.25k
_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
480
481
1.25k
    char myLocale[7]={' ',' ',' ',' ',' ',' ', '\0'};
482
483
1.25k
    cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
484
1.25k
    if(cnv->extraInfo != nullptr) {
485
1.25k
        UConverterNamePieces stackPieces;
486
1.25k
        UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
487
1.25k
        UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
488
1.25k
        uint32_t version;
489
490
1.25k
        stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
491
492
1.25k
        uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
493
1.25k
        myConverterData->currentType = ASCII1;
494
1.25k
        cnv->fromUnicodeStatus =false;
495
1.25k
        if(pArgs->locale){
496
1.25k
            uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)-1);
497
1.25k
        }
498
1.25k
        version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
499
1.25k
        myConverterData->version = version;
500
1.25k
        if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
501
1.25k
            (myLocale[2]=='_' || myLocale[2]=='\0'))
502
562
        {
503
            /* open the required converters and cache them */
504
562
            if(version>MAX_JA_VERSION) {
505
                // ICU 55 fails to open a converter for an unsupported version.
506
                // Previously, it fell back to version 0, but that would yield
507
                // unexpected behavior.
508
0
                *errorCode = U_MISSING_RESOURCE_ERROR;
509
0
                return;
510
0
            }
511
562
            if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
512
436
                myConverterData->myConverterArray[ISO8859_7] =
513
436
                    ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
514
436
            }
515
562
            myConverterData->myConverterArray[JISX208] =
516
562
                ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
517
562
            if(jpCharsetMasks[version]&CSM(JISX212)) {
518
495
                myConverterData->myConverterArray[JISX212] =
519
495
                    ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
520
495
            }
521
562
            if(jpCharsetMasks[version]&CSM(GB2312)) {
522
436
                myConverterData->myConverterArray[GB2312] =
523
436
                    ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);   /* gb_2312_80-1 */
524
436
            }
525
562
            if(jpCharsetMasks[version]&CSM(KSC5601)) {
526
436
                myConverterData->myConverterArray[KSC5601] =
527
436
                    ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
528
436
            }
529
530
            /* set the function pointers to appropriate functions */
531
562
            cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
532
562
            uprv_strcpy(myConverterData->locale,"ja");
533
534
562
            (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
535
562
            size_t len = uprv_strlen(myConverterData->name);
536
562
            myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
537
562
            myConverterData->name[len+1]='\0';
538
562
        }
539
688
#if !UCONFIG_ONLY_HTML_CONVERSION
540
688
        else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
541
688
            (myLocale[2]=='_' || myLocale[2]=='\0'))
542
296
        {
543
296
            if(version>1) {
544
                // ICU 55 fails to open a converter for an unsupported version.
545
                // Previously, it fell back to version 0, but that would yield
546
                // unexpected behavior.
547
0
                *errorCode = U_MISSING_RESOURCE_ERROR;
548
0
                return;
549
0
            }
550
296
            const char *cnvName;
551
296
            if(version==1) {
552
80
                cnvName="icu-internal-25546";
553
216
            } else {
554
216
                cnvName="ibm-949";
555
216
                myConverterData->version=version=0;
556
216
            }
557
296
            if(pArgs->onlyTestIsLoadable) {
558
2
                ucnv_canCreateConverter(cnvName, errorCode);  /* errorCode carries result */
559
2
                uprv_free(cnv->extraInfo);
560
2
                cnv->extraInfo=nullptr;
561
2
                return;
562
294
            } else {
563
294
                myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
564
294
                if (U_FAILURE(*errorCode)) {
565
0
                    _ISO2022Close(cnv);
566
0
                    return;
567
0
                }
568
569
294
                if(version==1) {
570
79
                    (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
571
79
                    uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
572
79
                    cnv->subCharLen = myConverterData->currentConverter->subCharLen;
573
215
                }else{
574
215
                    (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
575
215
                }
576
577
                /* initialize the state variables */
578
294
                setInitialStateToUnicodeKR(cnv, myConverterData);
579
294
                setInitialStateFromUnicodeKR(cnv, myConverterData);
580
581
                /* set the function pointers to appropriate functions */
582
294
                cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
583
294
                uprv_strcpy(myConverterData->locale,"ko");
584
294
            }
585
296
        }
586
392
        else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
587
392
            (myLocale[2]=='_' || myLocale[2]=='\0'))
588
392
        {
589
392
            if(version>2) {
590
                // ICU 55 fails to open a converter for an unsupported version.
591
                // Previously, it fell back to version 0, but that would yield
592
                // unexpected behavior.
593
0
                *errorCode = U_MISSING_RESOURCE_ERROR;
594
0
                return;
595
0
            }
596
597
            /* open the required converters and cache them */
598
392
            myConverterData->myConverterArray[GB2312_1] =
599
392
                ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
600
392
            if(version==1) {
601
255
                myConverterData->myConverterArray[ISO_IR_165] =
602
255
                    ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
603
255
            }
604
392
            myConverterData->myConverterArray[CNS_11643] =
605
392
                ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
606
607
608
            /* set the function pointers to appropriate functions */
609
392
            cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
610
392
            uprv_strcpy(myConverterData->locale,"cn");
611
612
392
            if (version==0){
613
102
                myConverterData->version = 0;
614
102
                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
615
290
            }else if (version==1){
616
255
                myConverterData->version = 1;
617
255
                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
618
255
            }else {
619
35
                myConverterData->version = 2;
620
35
                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
621
35
            }
622
392
        }
623
0
#endif  // !UCONFIG_ONLY_HTML_CONVERSION
624
0
        else{
625
#ifdef U_ENABLE_GENERIC_ISO_2022
626
            myConverterData->isFirstBuffer = true;
627
628
            /* append the UTF-8 escape sequence */
629
            cnv->charErrorBufferLength = 3;
630
            cnv->charErrorBuffer[0] = 0x1b;
631
            cnv->charErrorBuffer[1] = 0x25;
632
            cnv->charErrorBuffer[2] = 0x42;
633
634
            cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
635
            /* initialize the state variables */
636
            uprv_strcpy(myConverterData->name,"ISO_2022");
637
#else
638
0
            *errorCode = U_MISSING_RESOURCE_ERROR;
639
            // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
640
            // data loading error code.
641
0
            return;
642
0
#endif
643
0
        }
644
645
1.24k
        cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
646
647
1.24k
        if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
648
8
            _ISO2022Close(cnv);
649
8
        }
650
1.24k
    } else {
651
0
        *errorCode = U_MEMORY_ALLOCATION_ERROR;
652
0
    }
653
1.25k
}
654
655
656
static void U_CALLCONV
657
1.24k
_ISO2022Close(UConverter *converter) {
658
1.24k
    UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
659
1.24k
    UConverterSharedData **array = myData->myConverterArray;
660
1.24k
    int32_t i;
661
662
1.24k
    if (converter->extraInfo != nullptr) {
663
        /*close the array of converter pointers and free the memory*/
664
13.7k
        for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
665
12.4k
            if(array[i]!=nullptr) {
666
3.40k
                ucnv_unloadSharedDataIfReady(array[i]);
667
3.40k
            }
668
12.4k
        }
669
670
1.24k
        ucnv_close(myData->currentConverter);
671
672
1.24k
        if(!converter->isExtraLocal){
673
1.24k
            uprv_free (converter->extraInfo);
674
1.24k
            converter->extraInfo = nullptr;
675
1.24k
        }
676
1.24k
    }
677
1.24k
}
678
679
static void U_CALLCONV
680
2.46k
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
681
2.46k
    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
682
2.46k
    if(choice<=UCNV_RESET_TO_UNICODE) {
683
0
        uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
684
0
        myConverterData->key = 0;
685
0
        myConverterData->isEmptySegment = false;
686
0
    }
687
2.46k
    if(choice!=UCNV_RESET_TO_UNICODE) {
688
2.46k
        uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
689
2.46k
    }
690
#ifdef U_ENABLE_GENERIC_ISO_2022
691
    if(myConverterData->locale[0] == 0){
692
        if(choice<=UCNV_RESET_TO_UNICODE) {
693
            myConverterData->isFirstBuffer = true;
694
            myConverterData->key = 0;
695
            if (converter->mode == UCNV_SO){
696
                ucnv_close (myConverterData->currentConverter);
697
                myConverterData->currentConverter=nullptr;
698
            }
699
            converter->mode = UCNV_SI;
700
        }
701
        if(choice!=UCNV_RESET_TO_UNICODE) {
702
            /* re-append UTF-8 escape sequence */
703
            converter->charErrorBufferLength = 3;
704
            converter->charErrorBuffer[0] = 0x1b;
705
            converter->charErrorBuffer[1] = 0x28;
706
            converter->charErrorBuffer[2] = 0x42;
707
        }
708
    }
709
    else
710
#endif
711
2.46k
    {
712
        /* reset the state variables */
713
2.46k
        if(myConverterData->locale[0] == 'k'){
714
582
            if(choice<=UCNV_RESET_TO_UNICODE) {
715
0
                setInitialStateToUnicodeKR(converter, myConverterData);
716
0
            }
717
582
            if(choice!=UCNV_RESET_TO_UNICODE) {
718
582
                setInitialStateFromUnicodeKR(converter, myConverterData);
719
582
            }
720
582
        }
721
2.46k
    }
722
2.46k
}
723
724
U_CDECL_BEGIN
725
726
static const char * U_CALLCONV
727
0
_ISO2022getName(const UConverter* cnv){
728
0
    if(cnv->extraInfo){
729
0
        UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
730
0
        return myData->name;
731
0
    }
732
0
    return nullptr;
733
0
}
734
735
U_CDECL_END
736
737
738
/*************** to unicode *******************/
739
/****************************************************************************
740
 * Recognized escape sequences are
741
 * <ESC>(B  ASCII
742
 * <ESC>.A  ISO-8859-1
743
 * <ESC>.F  ISO-8859-7
744
 * <ESC>(J  JISX-201
745
 * <ESC>(I  JISX-201
746
 * <ESC>$B  JISX-208
747
 * <ESC>$@  JISX-208
748
 * <ESC>$(D JISX-212
749
 * <ESC>$A  GB2312
750
 * <ESC>$(C KSC5601
751
 */
752
/*
 * ISO-2022-JP: maps the table offset that getKey_2022() reports for a
 * complete escape sequence to the state/charset value that
 * changeState_2022() switches on (it casts the entry to StateEnum).
 * INVALID_STATE entries are reported there as U_UNSUPPORTED_ESCAPE_SEQUENCE.
 * Each row below holds ten consecutive offsets; the header comment numbers
 * the columns 0..9.
 */
static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
753
/*      0                1               2               3               4               5               6               7               8               9    */
754
    INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
755
    ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
756
    ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
757
    ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
758
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
759
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
760
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
761
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
762
};
763
764
#if !UCONFIG_ONLY_HTML_CONVERSION
765
/*************** to unicode *******************/
766
/*
 * ISO-2022-CN: maps the table offset that getKey_2022() reports for a
 * complete escape sequence to the state/charset value that
 * changeState_2022() switches on (it casts the entry to StateEnum).
 * INVALID_STATE entries are reported there as U_UNSUPPORTED_ESCAPE_SEQUENCE.
 * Each row below holds ten consecutive offsets; the header comment numbers
 * the columns 0..9.
 */
static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
767
/*      0                1               2               3               4               5               6               7               8               9    */
768
     INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
769
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
770
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
771
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
772
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
773
    ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
774
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
775
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
776
};
777
#endif
778
779
780
/*
 * Advances the escape-sequence lookup by one input byte.
 *
 * The byte is normalized through normalize_esq_chars_2022 and combined with
 * the running *key as (*key << 5) + normalized. The combined key is then
 * searched for in escSeqStateTable_Key_2022 with a binary search.
 *
 * @param c      next byte of the escape sequence
 * @param key    in: key accumulated so far; out: the new key on a match, else 0
 * @param offset out: index of the matching table entry, else 0
 * @return the escSeqStateTable_Value_2022 entry for the match, or
 *         INVALID_2022 when the byte or the combined key is not in the tables
 */
static UCNV_TableStates_2022
getKey_2022(char c,int32_t* key,int32_t* offset){
    int32_t lower = 0;
    int32_t upper = MAX_STATES_2022;
    int32_t previousProbe = 0;
    int32_t candidate = normalize_esq_chars_2022[(uint8_t)c];

    /* a normalized value of 0 means the byte is not valid anywhere in an escape sequence */
    if(candidate != 0) {
        candidate = (*key << 5) + candidate;

        while(upper != lower) {
            const int32_t probe = (upper + lower) >> 1;

            /* the interval stopped shrinking: the key is not in the table */
            if(probe == previousProbe) {
                break;
            }

            if(escSeqStateTable_Key_2022[probe] > candidate) {
                upper = probe;
            } else if(escSeqStateTable_Key_2022[probe] < candidate) {
                lower = probe;
            } else {
                /* exact match */
                *key = candidate;
                *offset = probe;
                return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[probe];
            }
            previousProbe = probe;
        }
    }

    *key = 0;
    *offset = 0;
    return INVALID_2022;
}
822
823
/*runs through a state machine to determine the escape sequence - codepage correspondence
824
 */
825
static void
826
changeState_2022(UConverter* _this,
827
                const char** source,
828
                const char* sourceLimit,
829
                Variant2022 var,
830
0
                UErrorCode* err){
831
0
    UCNV_TableStates_2022 value;
832
0
    UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
833
0
    uint32_t key = myData2022->key;
834
0
    int32_t offset = 0;
835
0
    int8_t initialToULength = _this->toULength;
836
0
    char c;
837
838
0
    value = VALID_NON_TERMINAL_2022;
839
0
    while (*source < sourceLimit) {
840
0
        c = *(*source)++;
841
0
        _this->toUBytes[_this->toULength++]=(uint8_t)c;
842
0
        value = getKey_2022(c,(int32_t *) &key, &offset);
843
844
0
        switch (value){
845
846
0
        case VALID_NON_TERMINAL_2022 :
847
            /* continue with the loop */
848
0
            break;
849
850
0
        case VALID_TERMINAL_2022:
851
0
            key = 0;
852
0
            goto DONE;
853
854
0
        case INVALID_2022:
855
0
            goto DONE;
856
857
0
        case VALID_MAYBE_TERMINAL_2022:
858
#ifdef U_ENABLE_GENERIC_ISO_2022
859
            /* ESC ( B is ambiguous only for ISO_2022 itself */
860
            if(var == ISO_2022) {
861
                /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
862
                _this->toULength = 0;
863
864
                /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
865
866
                /* continue with the loop */
867
                value = VALID_NON_TERMINAL_2022;
868
                break;
869
            } else
870
#endif
871
0
            {
872
                /* not ISO_2022 itself, finish here */
873
0
                value = VALID_TERMINAL_2022;
874
0
                key = 0;
875
0
                goto DONE;
876
0
            }
877
0
        }
878
0
    }
879
880
0
DONE:
881
0
    myData2022->key = key;
882
883
0
    if (value == VALID_NON_TERMINAL_2022) {
884
        /* indicate that the escape sequence is incomplete: key!=0 */
885
0
        return;
886
0
    } else if (value == INVALID_2022 ) {
887
0
        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
888
0
    } else /* value == VALID_TERMINAL_2022 */ {
889
0
        switch(var){
890
#ifdef U_ENABLE_GENERIC_ISO_2022
891
        case ISO_2022:
892
        {
893
            const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
894
            if(chosenConverterName == nullptr) {
895
                /* SS2 or SS3 */
896
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
897
                _this->toUCallbackReason = UCNV_UNASSIGNED;
898
                return;
899
            }
900
901
            _this->mode = UCNV_SI;
902
            ucnv_close(myData2022->currentConverter);
903
            myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
904
            if(U_SUCCESS(*err)) {
905
                myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
906
                _this->mode = UCNV_SO;
907
            }
908
            break;
909
        }
910
#endif
911
0
        case ISO_2022_JP:
912
0
            {
913
0
                StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
914
0
                switch(tempState) {
915
0
                case INVALID_STATE:
916
0
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
917
0
                    break;
918
0
                case SS2_STATE:
919
0
                    if(myData2022->toU2022State.cs[2]!=0) {
920
0
                        if(myData2022->toU2022State.g<2) {
921
0
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
922
0
                        }
923
0
                        myData2022->toU2022State.g=2;
924
0
                    } else {
925
                        /* illegal to have SS2 before a matching designator */
926
0
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
927
0
                    }
928
0
                    break;
929
                /* case SS3_STATE: not used in ISO-2022-JP-x */
930
0
                case ISO8859_1:
931
0
                case ISO8859_7:
932
0
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
933
0
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
934
0
                    } else {
935
                        /* G2 charset for SS2 */
936
0
                        myData2022->toU2022State.cs[2]=(int8_t)tempState;
937
0
                    }
938
0
                    break;
939
0
                default:
940
0
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
941
0
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
942
0
                    } else {
943
                        /* G0 charset */
944
0
                        myData2022->toU2022State.cs[0]=(int8_t)tempState;
945
0
                    }
946
0
                    break;
947
0
                }
948
0
            }
949
0
            break;
950
0
#if !UCONFIG_ONLY_HTML_CONVERSION
951
0
        case ISO_2022_CN:
952
0
            {
953
0
                StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
954
0
                switch(tempState) {
955
0
                case INVALID_STATE:
956
0
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
957
0
                    break;
958
0
                case SS2_STATE:
959
0
                    if(myData2022->toU2022State.cs[2]!=0) {
960
0
                        if(myData2022->toU2022State.g<2) {
961
0
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
962
0
                        }
963
0
                        myData2022->toU2022State.g=2;
964
0
                    } else {
965
                        /* illegal to have SS2 before a matching designator */
966
0
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
967
0
                    }
968
0
                    break;
969
0
                case SS3_STATE:
970
0
                    if(myData2022->toU2022State.cs[3]!=0) {
971
0
                        if(myData2022->toU2022State.g<2) {
972
0
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
973
0
                        }
974
0
                        myData2022->toU2022State.g=3;
975
0
                    } else {
976
                        /* illegal to have SS3 before a matching designator */
977
0
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
978
0
                    }
979
0
                    break;
980
0
                case ISO_IR_165:
981
0
                    if(myData2022->version==0) {
982
0
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
983
0
                        break;
984
0
                    }
985
0
                    U_FALLTHROUGH;
986
0
                case GB2312_1:
987
0
                    U_FALLTHROUGH;
988
0
                case CNS_11643_1:
989
0
                    myData2022->toU2022State.cs[1]=(int8_t)tempState;
990
0
                    break;
991
0
                case CNS_11643_2:
992
0
                    myData2022->toU2022State.cs[2]=(int8_t)tempState;
993
0
                    break;
994
0
                default:
995
                    /* other CNS 11643 planes */
996
0
                    if(myData2022->version==0) {
997
0
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
998
0
                    } else {
999
0
                       myData2022->toU2022State.cs[3]=(int8_t)tempState;
1000
0
                    }
1001
0
                    break;
1002
0
                }
1003
0
            }
1004
0
            break;
1005
0
        case ISO_2022_KR:
1006
0
            if(offset==0x30){
1007
                /* nothing to be done, just accept this one escape sequence */
1008
0
            } else {
1009
0
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
1010
0
            }
1011
0
            break;
1012
0
#endif  // !UCONFIG_ONLY_HTML_CONVERSION
1013
1014
0
        default:
1015
0
            *err = U_ILLEGAL_ESCAPE_SEQUENCE;
1016
0
            break;
1017
0
        }
1018
0
    }
1019
0
    if(U_SUCCESS(*err)) {
1020
0
        _this->toULength = 0;
1021
0
    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
1022
0
        if(_this->toULength>1) {
1023
            /*
1024
             * Ticket 5691: consistent illegal sequences:
1025
             * - We include at least the first byte (ESC) in the illegal sequence.
1026
             * - If any of the non-initial bytes could be the start of a character,
1027
             *   we stop the illegal sequence before the first one of those.
1028
             *   In escape sequences, all following bytes are "printable", that is,
1029
             *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
1030
             *   they are valid single/lead bytes.
1031
             *   For simplicity, we always only report the initial ESC byte as the
1032
             *   illegal sequence and back out all other bytes we looked at.
1033
             */
1034
            /* Back out some bytes. */
1035
0
            int8_t backOutDistance=_this->toULength-1;
1036
0
            int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1037
0
            if(backOutDistance<=bytesFromThisBuffer) {
1038
                /* same as initialToULength<=1 */
1039
0
                *source-=backOutDistance;
1040
0
            } else {
1041
                /* Back out bytes from the previous buffer: Need to replay them. */
1042
0
                _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
1043
                /* same as -(initialToULength-1) */
1044
                /* preToULength is negative! */
1045
0
                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
1046
0
                *source-=bytesFromThisBuffer;
1047
0
            }
1048
0
            _this->toULength=1;
1049
0
        }
1050
0
    } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1051
0
        _this->toUCallbackReason = UCNV_UNASSIGNED;
1052
0
    }
1053
0
}
1054
1055
#if !UCONFIG_ONLY_HTML_CONVERSION
1056
/*Checks the characters of the buffer against valid 2022 escape sequences
1057
*if the match we return a pointer to the initial start of the sequence otherwise
1058
*we return sourceLimit
1059
*/
1060
/*for 2022 looks ahead in the stream
1061
 *to determine the longest possible convertible
1062
 *data stream
1063
 */
1064
static inline const char*
1065
getEndOfBuffer_2022(const char** source,
1066
                   const char* sourceLimit,
1067
0
                   UBool /*flush*/){
1068
1069
0
    const char* mySource = *source;
1070
1071
#ifdef U_ENABLE_GENERIC_ISO_2022
1072
    if (*source >= sourceLimit)
1073
        return sourceLimit;
1074
1075
    do{
1076
1077
        if (*mySource == ESC_2022){
1078
            int8_t i;
1079
            int32_t key = 0;
1080
            int32_t offset;
1081
            UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1082
1083
            /* Kludge: I could not
1084
            * figure out the reason for validating an escape sequence
1085
            * twice - once here and once in changeState_2022().
1086
            * is it possible to have an ESC character in a ISO2022
1087
            * byte stream which is valid in a code page? Is it legal?
1088
            */
1089
            for (i=0;
1090
            (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1091
            i++) {
1092
                value =  getKey_2022(*(mySource+i), &key, &offset);
1093
            }
1094
            if (value > 0 || *mySource==ESC_2022)
1095
                return mySource;
1096
1097
            if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1098
                return sourceLimit;
1099
        }
1100
    }while (++mySource < sourceLimit);
1101
1102
    return sourceLimit;
1103
#else
1104
0
    while(mySource < sourceLimit && *mySource != ESC_2022) {
1105
0
        ++mySource;
1106
0
    }
1107
0
    return mySource;
1108
0
#endif
1109
0
}
1110
#endif
1111
1112
/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1113
 * any future change in _MBCSFromUChar32() function should be reflected here.
1114
 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1115
 */
1116
static inline int32_t
1117
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1118
                                         UChar32 c,
1119
                                         uint32_t* value,
1120
                                         UBool useFallback,
1121
                                         int outputType)
1122
67.5M
{
1123
67.5M
    const int32_t *cx;
1124
67.5M
    const uint16_t *table;
1125
67.5M
    uint32_t stage2Entry;
1126
67.5M
    uint32_t myValue;
1127
67.5M
    int32_t length;
1128
67.5M
    const uint8_t *p;
1129
    /*
1130
     * TODO(markus): Use and require new, faster MBCS conversion table structures.
1131
     * Use internal version of ucnv_open() that verifies that the new structures are available,
1132
     * else U_INTERNAL_PROGRAM_ERROR.
1133
     */
1134
    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1135
67.5M
    if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1136
67.5M
        table=sharedData->mbcs.fromUnicodeTable;
1137
67.5M
        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1138
        /* get the bytes and the length for the output */
1139
67.5M
        if(outputType==MBCS_OUTPUT_2){
1140
53.6M
            myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1141
53.6M
            if(myValue<=0xff) {
1142
25.7M
                length=1;
1143
27.9M
            } else {
1144
27.9M
                length=2;
1145
27.9M
            }
1146
53.6M
        } else /* outputType==MBCS_OUTPUT_3 */ {
1147
13.8M
            p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1148
13.8M
            myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1149
13.8M
            if(myValue<=0xff) {
1150
6.96M
                length=1;
1151
6.96M
            } else if(myValue<=0xffff) {
1152
0
                length=2;
1153
6.89M
            } else {
1154
6.89M
                length=3;
1155
6.89M
            }
1156
13.8M
        }
1157
        /* is this code point assigned, or do we use fallbacks? */
1158
67.5M
        if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1159
            /* assigned */
1160
34.8M
            *value=myValue;
1161
34.8M
            return length;
1162
34.8M
        } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1163
            /*
1164
             * We allow a 0 byte output if the "assigned" bit is set for this entry.
1165
             * There is no way with this data structure for fallback output
1166
             * to be a zero byte.
1167
             */
1168
0
            *value=myValue;
1169
0
            return -length;
1170
0
        }
1171
67.5M
    }
1172
1173
32.6M
    cx=sharedData->mbcs.extIndexes;
1174
32.6M
    if(cx!=nullptr) {
1175
11.1M
        return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1176
11.1M
    }
1177
1178
    /* unassigned */
1179
21.5M
    return 0;
1180
32.6M
}
1181
1182
/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1183
 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1184
 * @param retval pointer to output byte
1185
 * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
1186
 */
1187
static inline int32_t
1188
MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1189
                                       UChar32 c,
1190
                                       uint32_t* retval,
1191
                                       UBool useFallback)
1192
6.92M
{
1193
6.92M
    const uint16_t *table;
1194
6.92M
    int32_t value;
1195
    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1196
6.92M
    if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1197
1.25k
        return 0;
1198
1.25k
    }
1199
    /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1200
6.92M
    table=sharedData->mbcs.fromUnicodeTable;
1201
    /* get the byte for the output */
1202
6.92M
    value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1203
    /* is this code point assigned, or do we use fallbacks? */
1204
6.92M
    *retval=(uint32_t)(value&0xff);
1205
6.92M
    if(value>=0xf00) {
1206
564
        return 1;  /* roundtrip */
1207
6.92M
    } else if(useFallback ? value>=0x800 : value>=0xc00) {
1208
0
        return -1;  /* fallback taken */
1209
6.92M
    } else {
1210
6.92M
        return 0;  /* no mapping */
1211
6.92M
    }
1212
6.92M
}
1213
1214
/*
 * Converts a strict EUC DBCS value to its ISO 2022 form.
 * The value is accepted only when its low 16 bits lie in A1A1..FEFE and its
 * low byte lies in A1..FE; 0x8080 is then subtracted so that each byte
 * moves to the ISO 2022 range 21..7E.
 * Return 0 if out of range.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    /* unsigned wrap-around turns each range test into a single comparison */
    const uint16_t pairOffset = (uint16_t)(value - 0xa1a1);
    const uint8_t trailOffset = (uint8_t)(value - 0xa1);

    if(pairOffset > (0xfefe - 0xa1a1) || trailOffset > (0xfe - 0xa1)) {
        return 0;  /* not valid for ISO 2022 */
    }
    return value - 0x8080;  /* shift down to 21..7e byte range */
}
1230
1231
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
1232
/*
1233
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
1234
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
1235
 * unchanged. 
1236
 */
1237
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    /* candidate result: every byte moved up by 0x80 */
    const uint32_t shifted = value + 0x8080;
    const uint16_t pairOffset = (uint16_t)(shifted - 0xa1a1);
    const uint8_t trailOffset = (uint8_t)(shifted - 0xa1);

    /* outside A1A1..FEFE / A1..FE: hand the input back unchanged */
    if(pairOffset > (0xfefe - 0xa1a1) || trailOffset > (0xfe - 0xa1)) {
        return value;
    }
    return shifted;
}
1247
#endif
1248
1249
#ifdef U_ENABLE_GENERIC_ISO_2022
1250
1251
/**********************************************************************************
1252
*  ISO-2022 Converter
1253
*
1254
*
1255
*/
1256
1257
static void U_CALLCONV
1258
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1259
                                                           UErrorCode* err){
1260
    const char* mySourceLimit, *realSourceLimit;
1261
    const char* sourceStart;
1262
    const char16_t* myTargetStart;
1263
    UConverter* saveThis;
1264
    UConverterDataISO2022* myData;
1265
    int8_t length;
1266
1267
    saveThis = args->converter;
1268
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1269
1270
    realSourceLimit = args->sourceLimit;
1271
    while (args->source < realSourceLimit) {
1272
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1273
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1274
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1275
1276
            if(args->source < mySourceLimit) {
1277
                if(myData->currentConverter==nullptr) {
1278
                    myData->currentConverter = ucnv_open("ASCII",err);
1279
                    if(U_FAILURE(*err)){
1280
                        return;
1281
                    }
1282
1283
                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1284
                    saveThis->mode = UCNV_SO;
1285
                }
1286
1287
                /* convert to before the ESC or until the end of the buffer */
1288
                myData->isFirstBuffer=false;
1289
                sourceStart = args->source;
1290
                myTargetStart = args->target;
1291
                args->converter = myData->currentConverter;
1292
                ucnv_toUnicode(args->converter,
1293
                    &args->target,
1294
                    args->targetLimit,
1295
                    &args->source,
1296
                    mySourceLimit,
1297
                    args->offsets,
1298
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
1299
                    err);
1300
                args->converter = saveThis;
1301
1302
                if (*err == U_BUFFER_OVERFLOW_ERROR) {
1303
                    /* move the overflow buffer */
1304
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1305
                    myData->currentConverter->UCharErrorBufferLength = 0;
1306
                    if(length > 0) {
1307
                        uprv_memcpy(saveThis->UCharErrorBuffer,
1308
                                    myData->currentConverter->UCharErrorBuffer,
1309
                                    length*U_SIZEOF_UCHAR);
1310
                    }
1311
                    return;
1312
                }
1313
1314
                /*
1315
                 * At least one of:
1316
                 * -Error while converting
1317
                 * -Done with entire buffer
1318
                 * -Need to write offsets or update the current offset
1319
                 *  (leave that up to the code in ucnv.c)
1320
                 *
1321
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
1322
                 */
1323
                if (U_FAILURE(*err) ||
1324
                    (args->source == realSourceLimit) ||
1325
                    (args->offsets != nullptr && (args->target != myTargetStart || args->source != sourceStart) ||
1326
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1327
                ) {
1328
                    /* copy partial or error input for truncated detection and error handling */
1329
                    if(U_FAILURE(*err)) {
1330
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1331
                        if(length > 0) {
1332
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1333
                        }
1334
                    } else {
1335
                        length = saveThis->toULength = myData->currentConverter->toULength;
1336
                        if(length > 0) {
1337
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1338
                            if(args->source < mySourceLimit) {
1339
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1340
                            }
1341
                        }
1342
                    }
1343
                    return;
1344
                }
1345
            }
1346
        }
1347
1348
        sourceStart = args->source;
1349
        changeState_2022(args->converter,
1350
               &(args->source),
1351
               realSourceLimit,
1352
               ISO_2022,
1353
               err);
1354
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != nullptr)) {
1355
            /* let the ucnv.c code update its current offset */
1356
            return;
1357
        }
1358
    }
1359
}
1360
1361
#endif
1362
1363
/*
1364
 * To Unicode Callback helper function
1365
 */
1366
static void
1367
toUnicodeCallback(UConverter *cnv,
1368
                  const uint32_t sourceChar, const uint32_t targetUniChar,
1369
0
                  UErrorCode* err){
1370
0
    if(sourceChar>0xff){
1371
0
        cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1372
0
        cnv->toUBytes[1] = (uint8_t)sourceChar;
1373
0
        cnv->toULength = 2;
1374
0
    }
1375
0
    else{
1376
0
        cnv->toUBytes[0] =(char) sourceChar;
1377
0
        cnv->toULength = 1;
1378
0
    }
1379
1380
0
    if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1381
0
        *err = U_INVALID_CHAR_FOUND;
1382
0
    }
1383
0
    else{
1384
0
        *err = U_ILLEGAL_CHAR_FOUND;
1385
0
    }
1386
0
}
1387
1388
/**************************************ISO-2022-JP*************************************************/
1389
1390
/************************************** IMPORTANT **************************************************
1391
* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1392
* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1393
* The converter iterates over each Unicode codepoint
1394
* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1395
* processed one char at a time it would make sense to reduce the extra processing a canned converter
1396
* would do as far as possible.
1397
*
1398
* If the implementation of these macros or structure of sharedData struct change in the future, make
1399
* sure that ISO-2022 is also changed.
1400
***************************************************************************************************
1401
*/
1402
1403
/***************************************************************************************************
1404
* Rules for ISO-2022-jp encoding
1405
* (i)   Escape sequences must be fully contained within a line; they should not
1406
*       span new lines or CRs
1407
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
1408
*       JIS-Roman character escape sequence should follow before the line terminates
1409
* (iii) If the first character on the line is represented by two bytes then a two
1410
*       byte character escape sequence should precede it
1411
* (iv)  If no escape sequence is encountered then the characters are ASCII
1412
* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1413
*       and invoked with SS2 (ESC N).
1414
* (vi)  If there is any G0 designation in text, there must be a switch to
1415
*       ASCII or to JIS X 0201-Roman before a space character (but not
1416
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1417
*       characters such as tab or CRLF.
1418
* (vii) Supported encodings:
1419
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1420
*
1421
*  source : RFC-1554
1422
*
1423
*          JISX201, JISX208,JISX212 : new .cnv data files created
1424
*          KSC5601 : alias to ibm-949 mapping table
1425
*          GB2312 : alias to ibm-1386 mapping table
1426
*          ISO-8859-1 : Algorithmic implemented as LATIN1 case
1427
*          ISO-8859-7 : alias to ibm-9409 mapping table
1428
*/
1429
1430
/* preference order of JP charsets */
1431
static const StateEnum jpCharsetPref[]={
1432
    ASCII,
1433
    JISX201,
1434
    ISO8859_1,
1435
    JISX208,
1436
    ISO8859_7,
1437
    JISX212,
1438
    GB2312,
1439
    KSC5601,
1440
    HWKANA_7BIT
1441
};
1442
1443
/*
 * Designation escape sequences, one row per charset.
 * The rows must be in order of the enum constants (e.g. JISX201 = 3),
 * not in order of jpCharsetPref[]!
 * Each row is zero-padded to 6 bytes; the used lengths are in escSeqCharsLen[].
 */
static const char escSeqChars[][6] ={
    { 0x1B, 0x28, 0x42 },           /* ASCII        <ESC>(B  */
    { 0x1B, 0x2E, 0x41 },           /* ISO-8859-1   <ESC>.A  */
    { 0x1B, 0x2E, 0x46 },           /* ISO-8859-7   <ESC>.F  */
    { 0x1B, 0x28, 0x4A },           /* JISX-201     <ESC>(J  */
    { 0x1B, 0x24, 0x42 },           /* JISX-208     <ESC>$B  */
    { 0x1B, 0x24, 0x28, 0x44 },     /* JISX-212     <ESC>$(D */
    { 0x1B, 0x24, 0x41 },           /* GB2312       <ESC>$A  */
    { 0x1B, 0x24, 0x28, 0x43 },     /* KSC5601      <ESC>$(C */
    { 0x1B, 0x28, 0x49 }            /* HWKANA_7BIT  <ESC>(I  */
};
1459
/* Number of bytes in each designation escape sequence; same row order as escSeqChars[]. */
static  const int8_t escSeqCharsLen[] ={
    3,  /* ASCII        <ESC>(B  */
    3,  /* ISO-8859-1   <ESC>.A  */
    3,  /* ISO-8859-7   <ESC>.F  */
    3,  /* JISX-201     <ESC>(J  */
    3,  /* JISX-208     <ESC>$B  */
    4,  /* JISX-212     <ESC>$(D */
    3,  /* GB2312       <ESC>$A  */
    4,  /* KSC5601      <ESC>$(C */
    3   /* HWKANA_7BIT  <ESC>(I  */
};
1470
1471
/*
1472
* The iteration over various code pages works this way:
1473
* i)   Get the currentState from myConverterData->currentState
1474
* ii)  Check if the character is mapped to a valid character in the currentState
1475
*      Yes ->  a) set the initIterState to currentState
1476
*       b) remain in this state until an invalid character is found
1477
*      No  ->  a) go to the next code page and find the character
1478
* iii) Before changing the state increment the current state check if the current state
1479
*      is equal to the initIteration state
1480
*      Yes ->  A character that cannot be represented in any of the supported encodings
1481
*       break and return a U_INVALID_CHAR_FOUND error
1482
*      No  ->  Continue and find the character in next code page
1483
*
1484
*
1485
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1486
*/
1487
1488
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * Only two values differ from the identity mapping:
 * 0x5c maps to U+00A5 and 0x7e maps to U+203E.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1501
1502
/*
 * Map Unicode to 00..7F according to JIS X 0201.
 * U+00A5 and U+203E take the byte values 0x5c and 0x7e, so U+005C and U+007E
 * themselves are unmappable. Returns U+FFFE (0xfffe) if unmappable.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        return 0xfffe;  /* these byte values are taken by U+00A5/U+203E */
    default:
        return (value <= 0x7f) ? value : 0xfffe;
    }
}
1516
1517
/*
 * Take a valid Shift-JIS byte pair (lead byte in bits 15..8, trail byte in
 * bits 7..0), check that it is in the range corresponding to JIS X 0208,
 * and convert it to a pair of 21..7E bytes (packed the same way).
 * Return 0 if the byte pair is out of range (value > 0xEFFC).
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint32_t lead;
    uint8_t trail;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;
    lead = value & 0xff00;

    /* lead <= 0x9f00, or else 0xe000 <= lead <= 0xef00 */
    lead -= (lead <= 0x9f00) ? 0x7000 : 0xb000;
    /* each Shift-JIS lead byte covers two JIS rows */
    lead <<= 1;

    if(trail > 0x9e) {
        /* trail <= 0xfc: second of the two rows */
        return lead | (uint32_t)(trail - 0x7e);
    }

    /* first of the two rows; the trail range skips 0x7f */
    lead -= 0x100;
    return lead | (uint32_t)(trail - ((trail <= 0x7e) ? 0x1f : 0x20));
}
1552
1553
/*
 * Convert a pair of JIS X 0208 21..7E bytes (c1 = lead, c2 = trail) to
 * Shift-JIS, written to bytes[0] (lead) and bytes[1] (trail).
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    /* trail byte: odd and even JIS rows use different Shift-JIS trail ranges */
    if(c1 & 1) {
        ++c1;
        c2 = (c2 <= 0x5f) ? (uint8_t)(c2 + 0x1f) :
             (c2 <= 0x7e) ? (uint8_t)(c2 + 0x20) :
                            0;  /* invalid */
    } else {
        c2 = ((uint8_t)(c2 - 0x21) <= ((0x7e) - 0x21)) ? (uint8_t)(c2 + 0x7e) :
                                                         0;  /* invalid */
    }

    /* lead byte: two JIS rows share one Shift-JIS lead byte */
    c1 >>= 1;
    c1 = (c1 <= 0x2f) ? (uint8_t)(c1 + 0x70) :
         (c1 <= 0x3f) ? (uint8_t)(c1 + 0xb0) :
                        0;  /* invalid */

    bytes[0] = (char)c1;
    bytes[1] = (char)c2;
}
1589
1590
/*
1591
 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1592
 * Katakana.
1593
 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1594
 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1595
 * These were the only fallbacks in ICU's jisx-208.ucm file.
1596
 */
1597
static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1598
    0x2123,  /* U+FF61 */
1599
    0x2156,
1600
    0x2157,
1601
    0x2122,
1602
    0x2126,
1603
    0x2572,
1604
    0x2521,
1605
    0x2523,
1606
    0x2525,
1607
    0x2527,
1608
    0x2529,
1609
    0x2563,
1610
    0x2565,
1611
    0x2567,
1612
    0x2543,
1613
    0x213C,  /* U+FF70 */
1614
    0x2522,
1615
    0x2524,
1616
    0x2526,
1617
    0x2528,
1618
    0x252A,
1619
    0x252B,
1620
    0x252D,
1621
    0x252F,
1622
    0x2531,
1623
    0x2533,
1624
    0x2535,
1625
    0x2537,
1626
    0x2539,
1627
    0x253B,
1628
    0x253D,
1629
    0x253F,  /* U+FF80 */
1630
    0x2541,
1631
    0x2544,
1632
    0x2546,
1633
    0x2548,
1634
    0x254A,
1635
    0x254B,
1636
    0x254C,
1637
    0x254D,
1638
    0x254E,
1639
    0x254F,
1640
    0x2552,
1641
    0x2555,
1642
    0x2558,
1643
    0x255B,
1644
    0x255E,
1645
    0x255F,  /* U+FF90 */
1646
    0x2560,
1647
    0x2561,
1648
    0x2562,
1649
    0x2564,
1650
    0x2566,
1651
    0x2568,
1652
    0x2569,
1653
    0x256A,
1654
    0x256B,
1655
    0x256C,
1656
    0x256D,
1657
    0x256F,
1658
    0x2573,
1659
    0x212B,
1660
    0x212C   /* U+FF9F */
1661
};
1662
1663
static void U_CALLCONV
1664
571k
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1665
571k
    UConverter *cnv = args->converter;
1666
571k
    UConverterDataISO2022 *converterData;
1667
571k
    ISO2022State *pFromU2022State;
1668
571k
    uint8_t *target = (uint8_t *) args->target;
1669
571k
    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1670
571k
    const char16_t* source = args->source;
1671
571k
    const char16_t* sourceLimit = args->sourceLimit;
1672
571k
    int32_t* offsets = args->offsets;
1673
571k
    UChar32 sourceChar;
1674
571k
    char buffer[8];
1675
571k
    int32_t len, outLen;
1676
571k
    int8_t choices[10];
1677
571k
    int32_t choiceCount;
1678
571k
    uint32_t targetValue = 0;
1679
571k
    UBool useFallback;
1680
1681
571k
    int32_t i;
1682
571k
    int8_t cs, g;
1683
1684
    /* set up the state */
1685
571k
    converterData     = (UConverterDataISO2022*)cnv->extraInfo;
1686
571k
    pFromU2022State   = &converterData->fromU2022State;
1687
1688
571k
    choiceCount = 0;
1689
1690
    /* check if the last codepoint of previous buffer was a lead surrogate*/
1691
571k
    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1692
0
        goto getTrail;
1693
0
    }
1694
1695
8.24M
    while(source < sourceLimit) {
1696
8.24M
        if(target < targetLimit) {
1697
1698
8.24M
            sourceChar  = *(source++);
1699
            /*check if the char is a First surrogate*/
1700
8.24M
            if(U16_IS_SURROGATE(sourceChar)) {
1701
2.72k
                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
1702
2.30k
getTrail:
1703
                    /*look ahead to find the trail surrogate*/
1704
2.30k
                    if(source < sourceLimit) {
1705
                        /* test the following code unit */
1706
2.29k
                        char16_t trail=(char16_t) *source;
1707
2.29k
                        if(U16_IS_TRAIL(trail)) {
1708
1.26k
                            source++;
1709
1.26k
                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
1710
1.26k
                            cnv->fromUChar32=0x00;
1711
                            /* convert this supplementary code point */
1712
                            /* exit this condition tree */
1713
1.26k
                        } else {
1714
                            /* this is an unmatched lead code unit (1st surrogate) */
1715
                            /* callback(illegal) */
1716
1.02k
                            *err=U_ILLEGAL_CHAR_FOUND;
1717
1.02k
                            cnv->fromUChar32=sourceChar;
1718
1.02k
                            break;
1719
1.02k
                        }
1720
2.29k
                    } else {
1721
                        /* no more input */
1722
15
                        cnv->fromUChar32=sourceChar;
1723
15
                        break;
1724
15
                    }
1725
2.30k
                } else {
1726
                    /* this is an unmatched trail code unit (2nd surrogate) */
1727
                    /* callback(illegal) */
1728
413
                    *err=U_ILLEGAL_CHAR_FOUND;
1729
413
                    cnv->fromUChar32=sourceChar;
1730
413
                    break;
1731
413
                }
1732
2.72k
            }
1733
1734
            /* do not convert SO/SI/ESC */
1735
8.23M
            if(IS_2022_CONTROL(sourceChar)) {
1736
                /* callback(illegal) */
1737
324
                *err=U_ILLEGAL_CHAR_FOUND;
1738
324
                cnv->fromUChar32=sourceChar;
1739
324
                break;
1740
324
            }
1741
1742
            /* do the conversion */
1743
1744
8.23M
            if(choiceCount == 0) {
1745
7.48M
                uint16_t csm;
1746
1747
                /*
1748
                 * The csm variable keeps track of which charsets are allowed
1749
                 * and not used yet while building the choices[].
1750
                 */
1751
7.48M
                csm = jpCharsetMasks[converterData->version];
1752
7.48M
                choiceCount = 0;
1753
1754
                /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1755
7.48M
                if(converterData->version == 3 || converterData->version == 4) {
1756
6.77k
                    choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1757
6.77k
                }
1758
                /* Do not try single-byte half-width Katakana for other versions. */
1759
7.48M
                csm &= ~CSM(HWKANA_7BIT);
1760
1761
                /* try the current G0 charset */
1762
7.48M
                choices[choiceCount++] = cs = pFromU2022State->cs[0];
1763
7.48M
                csm &= ~CSM(cs);
1764
1765
                /* try the current G2 charset */
1766
7.48M
                if((cs = pFromU2022State->cs[2]) != 0) {
1767
4.57M
                    choices[choiceCount++] = cs;
1768
4.57M
                    csm &= ~CSM(cs);
1769
4.57M
                }
1770
1771
                /* try all the other possible charsets */
1772
74.8M
                for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
1773
67.3M
                    cs = (int8_t)jpCharsetPref[i];
1774
67.3M
                    if(CSM(cs) & csm) {
1775
47.5M
                        choices[choiceCount++] = cs;
1776
47.5M
                        csm &= ~CSM(cs);
1777
47.5M
                    }
1778
67.3M
                }
1779
7.48M
            }
1780
1781
8.23M
            cs = g = 0;
1782
            /*
1783
             * len==0: no mapping found yet
1784
             * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1785
             * len>0: found a roundtrip result, done
1786
             */
1787
8.23M
            len = 0;
1788
            /*
1789
             * We will turn off useFallback after finding a fallback,
1790
             * but we still get fallbacks from PUA code points as usual.
1791
             * Therefore, we will also need to check that we don't overwrite
1792
             * an early fallback with a later one.
1793
             */
1794
8.23M
            useFallback = cnv->useFallback;
1795
1796
63.4M
            for(i = 0; i < choiceCount && len <= 0; ++i) {
1797
55.1M
                uint32_t value;
1798
55.1M
                int32_t len2;
1799
55.1M
                int8_t cs0 = choices[i];
1800
55.1M
                switch(cs0) {
1801
7.53M
                case ASCII:
1802
7.53M
                    if(sourceChar <= 0x7f) {
1803
50.9k
                        targetValue = (uint32_t)sourceChar;
1804
50.9k
                        len = 1;
1805
50.9k
                        cs = cs0;
1806
50.9k
                        g = 0;
1807
50.9k
                    }
1808
7.53M
                    break;
1809
7.41M
                case ISO8859_1:
1810
7.41M
                    if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1811
517
                        targetValue = (uint32_t)sourceChar - 0x80;
1812
517
                        len = 1;
1813
517
                        cs = cs0;
1814
517
                        g = 2;
1815
517
                    }
1816
7.41M
                    break;
1817
10.3k
                case HWKANA_7BIT:
1818
10.3k
                    if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1819
1.31k
                        if(converterData->version==3) {
1820
                            /* JIS7: use G1 (SO) */
1821
                            /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1822
922
                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1823
922
                            len = 1;
1824
922
                            pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1825
922
                            g = 1;
1826
922
                        } else if(converterData->version==4) {
1827
                            /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1828
                            /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1829
391
                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1830
391
                            len = 1;
1831
1832
391
                            cs = pFromU2022State->cs[0];
1833
391
                            if(IS_JP_DBCS(cs)) {
1834
                                /* switch from a DBCS charset to JISX201 */
1835
197
                                cs = (int8_t)JISX201;
1836
197
                            }
1837
                            /* else stay in the current G0 charset */
1838
391
                            g = 0;
1839
391
                        }
1840
                        /* else do not use HWKANA_7BIT with other versions */
1841
1.31k
                    }
1842
10.3k
                    break;
1843
7.47M
                case JISX201:
1844
                    /* G0 SBCS */
1845
7.47M
                    value = jisx201FromU(sourceChar);
1846
7.47M
                    if(value <= 0x7f) {
1847
1.07k
                        targetValue = value;
1848
1.07k
                        len = 1;
1849
1.07k
                        cs = cs0;
1850
1.07k
                        g = 0;
1851
1.07k
                        useFallback = false;
1852
1.07k
                    }
1853
7.47M
                    break;
1854
7.55M
                case JISX208:
1855
                    /* G0 DBCS from Shift-JIS table */
1856
7.55M
                    len2 = MBCS_FROM_UCHAR32_ISO2022(
1857
7.55M
                                converterData->myConverterArray[cs0],
1858
7.55M
                                sourceChar, &value,
1859
7.55M
                                useFallback, MBCS_OUTPUT_2);
1860
7.55M
                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1861
6.98M
                        value = _2022FromSJIS(value);
1862
6.98M
                        if(value != 0) {
1863
564k
                            targetValue = value;
1864
564k
                            len = len2;
1865
564k
                            cs = cs0;
1866
564k
                            g = 0;
1867
564k
                            useFallback = false;
1868
564k
                        }
1869
6.98M
                    } else if(len == 0 && useFallback &&
1870
572k
                              (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1871
0
                        targetValue = hwkana_fb[sourceChar - HWKANA_START];
1872
0
                        len = -2;
1873
0
                        cs = cs0;
1874
0
                        g = 0;
1875
0
                        useFallback = false;
1876
0
                    }
1877
7.55M
                    break;
1878
6.92M
                case ISO8859_7:
1879
                    /* G0 SBCS forced to 7-bit output */
1880
6.92M
                    len2 = MBCS_SINGLE_FROM_UCHAR32(
1881
6.92M
                                converterData->myConverterArray[cs0],
1882
6.92M
                                sourceChar, &value,
1883
6.92M
                                useFallback);
1884
6.92M
                    if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1885
212
                        targetValue = value - 0x80;
1886
212
                        len = len2;
1887
212
                        cs = cs0;
1888
212
                        g = 2;
1889
212
                        useFallback = false;
1890
212
                    }
1891
6.92M
                    break;
1892
18.2M
                default:
1893
                    /* G0 DBCS */
1894
18.2M
                    len2 = MBCS_FROM_UCHAR32_ISO2022(
1895
18.2M
                                converterData->myConverterArray[cs0],
1896
18.2M
                                sourceChar, &value,
1897
18.2M
                                useFallback, MBCS_OUTPUT_2);
1898
18.2M
                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1899
7.05M
                        if(cs0 == KSC5601) {
1900
                            /*
1901
                             * Check for valid bytes for the encoding scheme.
1902
                             * This is necessary because the sub-converter (windows-949)
1903
                             * has a broader encoding scheme than is valid for 2022.
1904
                             */
1905
3.31M
                            value = _2022FromGR94DBCS(value);
1906
3.31M
                            if(value == 0) {
1907
1.84k
                                break;
1908
1.84k
                            }
1909
3.31M
                        }
1910
7.05M
                        targetValue = value;
1911
7.05M
                        len = len2;
1912
7.05M
                        cs = cs0;
1913
7.05M
                        g = 0;
1914
7.05M
                        useFallback = false;
1915
7.05M
                    }
1916
18.2M
                    break;
1917
55.1M
                }
1918
55.1M
            }
1919
1920
8.23M
            if(len != 0) {
1921
7.67M
                if(len < 0) {
1922
201
                    len = -len;  /* fallback */
1923
201
                }
1924
7.67M
                outLen = 0; /* count output bytes */
1925
1926
                /* write SI if necessary (only for JIS7) */
1927
7.67M
                if(pFromU2022State->g == 1 && g == 0) {
1928
424
                    buffer[outLen++] = UCNV_SI;
1929
424
                    pFromU2022State->g = 0;
1930
424
                }
1931
1932
                /* write the designation sequence if necessary */
1933
7.67M
                if(cs != pFromU2022State->cs[g]) {
1934
6.91M
                    int32_t escLen = escSeqCharsLen[cs];
1935
6.91M
                    uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1936
6.91M
                    outLen += escLen;
1937
6.91M
                    pFromU2022State->cs[g] = cs;
1938
1939
                    /* invalidate the choices[] */
1940
6.91M
                    choiceCount = 0;
1941
6.91M
                }
1942
1943
                /* write the shift sequence if necessary */
1944
7.67M
                if(g != pFromU2022State->g) {
1945
1.39k
                    switch(g) {
1946
                    /* case 0 handled before writing escapes */
1947
669
                    case 1:
1948
669
                        buffer[outLen++] = UCNV_SO;
1949
669
                        pFromU2022State->g = 1;
1950
669
                        break;
1951
729
                    default: /* case 2 */
1952
729
                        buffer[outLen++] = 0x1b;
1953
729
                        buffer[outLen++] = 0x4e;
1954
729
                        break;
1955
                    /* no case 3: no SS3 in ISO-2022-JP-x */
1956
1.39k
                    }
1957
1.39k
                }
1958
1959
                /* write the output bytes */
1960
7.67M
                if(len == 1) {
1961
54.0k
                    buffer[outLen++] = (char)targetValue;
1962
7.62M
                } else /* len == 2 */ {
1963
7.62M
                    buffer[outLen++] = (char)(targetValue >> 8);
1964
7.62M
                    buffer[outLen++] = (char)targetValue;
1965
7.62M
                }
1966
7.67M
            } else {
1967
                /*
1968
                 * if we cannot find the character after checking all codepages
1969
                 * then this is an error
1970
                 */
1971
564k
                *err = U_INVALID_CHAR_FOUND;
1972
564k
                cnv->fromUChar32=sourceChar;
1973
564k
                break;
1974
564k
            }
1975
1976
7.67M
            if(sourceChar == CR || sourceChar == LF) {
1977
                /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1978
402
                pFromU2022State->cs[2] = 0;
1979
402
                choiceCount = 0;
1980
402
            }
1981
1982
            /* output outLen>0 bytes in buffer[] */
1983
7.67M
            if(outLen == 1) {
1984
48.4k
                *target++ = buffer[0];
1985
48.4k
                if(offsets) {
1986
0
                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1987
0
                }
1988
7.62M
            } else if(outLen == 2 && (target + 2) <= targetLimit) {
1989
707k
                *target++ = buffer[0];
1990
707k
                *target++ = buffer[1];
1991
707k
                if(offsets) {
1992
0
                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1993
0
                    *offsets++ = sourceIndex;
1994
0
                    *offsets++ = sourceIndex;
1995
0
                }
1996
6.91M
            } else {
1997
6.91M
                fromUWriteUInt8(
1998
6.91M
                    cnv,
1999
6.91M
                    buffer, outLen,
2000
6.91M
                    &target, (const char *)targetLimit,
2001
6.91M
                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
2002
6.91M
                    err);
2003
6.91M
                if(U_FAILURE(*err)) {
2004
3.01k
                    break;
2005
3.01k
                }
2006
6.91M
            }
2007
7.67M
        } /* end if(myTargetIndex<myTargetLength) */
2008
1.05k
        else{
2009
1.05k
            *err =U_BUFFER_OVERFLOW_ERROR;
2010
1.05k
            break;
2011
1.05k
        }
2012
2013
8.24M
    }/* end while(mySourceIndex<mySourceLength) */
2014
2015
    /*
2016
     * the end of the input stream and detection of truncated input
2017
     * are handled by the framework, but for ISO-2022-JP conversion
2018
     * we need to be in ASCII mode at the very end
2019
     *
2020
     * conditions:
2021
     *   successful
2022
     *   in SO mode or not in ASCII mode
2023
     *   end of input and no truncated input
2024
     */
2025
571k
    if( U_SUCCESS(*err) &&
2026
571k
        (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
2027
571k
        args->flush && source>=sourceLimit && cnv->fromUChar32==0
2028
571k
    ) {
2029
218
        int32_t sourceIndex;
2030
2031
218
        outLen = 0;
2032
2033
218
        if(pFromU2022State->g != 0) {
2034
25
            buffer[outLen++] = UCNV_SI;
2035
25
            pFromU2022State->g = 0;
2036
25
        }
2037
2038
218
        if(pFromU2022State->cs[0] != ASCII) {
2039
198
            int32_t escLen = escSeqCharsLen[ASCII];
2040
198
            uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
2041
198
            outLen += escLen;
2042
198
            pFromU2022State->cs[0] = (int8_t)ASCII;
2043
198
        }
2044
2045
        /* get the source index of the last input character */
2046
        /*
2047
         * TODO this would be simpler and more reliable if we used a pair
2048
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2049
         * so that we could simply use the prevSourceIndex here;
2050
         * this code gives an incorrect result for the rare case of an unmatched
2051
         * trail surrogate that is alone in the last buffer of the text stream
2052
         */
2053
218
        sourceIndex=(int32_t)(source-args->source);
2054
218
        if(sourceIndex>0) {
2055
207
            --sourceIndex;
2056
207
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2057
207
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2058
207
            ) {
2059
0
                --sourceIndex;
2060
0
            }
2061
207
        } else {
2062
11
            sourceIndex=-1;
2063
11
        }
2064
2065
218
        fromUWriteUInt8(
2066
218
            cnv,
2067
218
            buffer, outLen,
2068
218
            &target, (const char *)targetLimit,
2069
218
            &offsets, sourceIndex,
2070
218
            err);
2071
218
    }
2072
2073
    /*save the state and return */
2074
571k
    args->source = source;
2075
571k
    args->target = (char*)target;
2076
571k
}
2077
2078
/*************** to unicode *******************/
2079
2080
/*
 * Converts ISO-2022-JP family input bytes to UTF-16, optionally recording
 * source offsets in args->offsets.
 *
 * Reads from args->source up to args->sourceLimit and writes to args->target
 * up to args->targetLimit; both pointers are updated on return.
 *
 * Resumption across calls:
 *   - myData->key != 0: a partial escape sequence is pending and is continued.
 *   - toULength == 1: the lead byte of a double-byte character was saved in
 *     toUBytes[0] and the trail byte is read first.
 *
 * Error reporting through *err:
 *   - U_BUFFER_OVERFLOW_ERROR when the target is full while input remains.
 *   - U_ILLEGAL_ESCAPE_SEQUENCE when version==0 and a completed escape
 *     sequence follows an empty segment.
 *   - whatever changeState_2022() or toUnicodeCallback() set for bad escape
 *     sequences and unconvertible bytes.
 */
static void U_CALLCONV
2081
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2082
0
                                               UErrorCode* err){
2083
0
    char tempBuf[2];
2084
0
    const char *mySource = (char *) args->source;
2085
0
    char16_t *myTarget = args->target;
2086
0
    const char *mySourceLimit = args->sourceLimit;
2087
0
    uint32_t targetUniChar = 0x0000;
2088
0
    uint32_t mySourceChar = 0x0000;
2089
0
    uint32_t tmpSourceChar = 0x0000;
2090
0
    UConverterDataISO2022* myData;
2091
0
    ISO2022State *pToU2022State;
2092
0
    StateEnum cs;
2093
2094
0
    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2095
0
    pToU2022State = &myData->toU2022State;
2096
2097
0
    if(myData->key != 0) {
2098
        /* continue with a partial escape sequence */
2099
0
        goto escape;
2100
0
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2101
        /* continue with a partial double-byte character */
2102
0
        mySourceChar = args->converter->toUBytes[0];
2103
0
        args->converter->toULength = 0;
2104
0
        cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2105
0
        /* nothing converted yet; the trail-byte code sets this on success */
        targetUniChar = missingCharMarker;
2106
0
        goto getTrailByte;
2107
0
    }
2108
2109
0
    while(mySource < mySourceLimit){
2110
2111
0
        targetUniChar =missingCharMarker;
2112
2113
0
        if(myTarget < args->targetLimit){
2114
2115
0
            mySourceChar= (unsigned char) *mySource++;
2116
2117
0
            switch(mySourceChar) {
2118
0
            case UCNV_SI:
2119
0
                if(myData->version==3) {
2120
0
                    /* shift in: select G0 again */
                    pToU2022State->g=0;
2121
0
                    continue;
2122
0
                } else {
2123
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2124
0
                    myData->isEmptySegment = false; /* reset this, we have a different error */
2125
0
                    /* leaves targetUniChar at missingCharMarker, so the callback path below runs */
                    break;
2126
0
                }
2127
2128
0
            case UCNV_SO:
2129
0
                if(myData->version==3) {
2130
                    /* JIS7: switch to G1 half-width Katakana */
2131
0
                    pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2132
0
                    pToU2022State->g=1;
2133
0
                    continue;
2134
0
                } else {
2135
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2136
0
                    myData->isEmptySegment = false; /* reset this, we have a different error */
2137
0
                    break;
2138
0
                }
2139
2140
0
            case ESC_2022:
2141
0
                /* back up so that changeState_2022() starts at the ESC byte */
                mySource--;
2142
0
escape:
2143
0
                {
2144
0
                    /* remembered to compute how many bytes the escape sequence consumed */
                    const char * mySourceBefore = mySource;
2145
0
                    int8_t toULengthBefore = args->converter->toULength;
2146
2147
0
                    changeState_2022(args->converter,&(mySource),
2148
0
                        mySourceLimit, ISO_2022_JP,err);
2149
2150
                    /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
2151
0
                    if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2152
0
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2153
0
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
2154
0
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2155
0
                    }
2156
0
                }
2157
2158
                /* invalid or illegal escape sequence */
2159
0
                if(U_FAILURE(*err)){
2160
0
                    args->target = myTarget;
2161
0
                    args->source = mySource;
2162
0
                    myData->isEmptySegment = false; /* Reset to avoid future spurious errors */
2163
0
                    return;
2164
0
                }
2165
                /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2166
0
                if(myData->key==0) {
2167
0
                    myData->isEmptySegment = true;
2168
0
                }
2169
0
                continue;
2170
2171
            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2172
2173
0
            case CR:
2174
0
            case LF:
2175
                /* automatically reset to single-byte mode */
2176
0
                if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2177
0
                    pToU2022State->cs[0] = (int8_t)ASCII;
2178
0
                }
2179
0
                pToU2022State->cs[2] = 0;
2180
0
                pToU2022State->g = 0;
2181
0
                /* the CR/LF byte itself is then converted like any other byte */
                U_FALLTHROUGH;
2182
0
            default:
2183
                /* convert one or two bytes */
2184
0
                myData->isEmptySegment = false;
2185
0
                cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2186
0
                if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2187
0
                    !IS_JP_DBCS(cs)
2188
0
                ) {
2189
                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2190
0
                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2191
2192
                    /* return from a single-shift state to the previous one */
2193
0
                    if(pToU2022State->g >= 2) {
2194
0
                        pToU2022State->g=pToU2022State->prevG;
2195
0
                    }
2196
0
                } else switch(cs) {
2197
0
                case ASCII:
2198
0
                    if(mySourceChar <= 0x7f) {
2199
0
                        targetUniChar = mySourceChar;
2200
0
                    }
2201
0
                    break;
2202
0
                case ISO8859_1:
2203
0
                    if(mySourceChar <= 0x7f) {
2204
0
                        /* 7-bit input byte maps to the code point 0x80 higher */
                        targetUniChar = mySourceChar + 0x80;
2205
0
                    }
2206
                    /* return from a single-shift state to the previous one */
2207
0
                    pToU2022State->g=pToU2022State->prevG;
2208
0
                    break;
2209
0
                case ISO8859_7:
2210
0
                    if(mySourceChar <= 0x7f) {
2211
                        /* convert mySourceChar+0x80 to use a normal 8-bit table */
2212
0
                        targetUniChar =
2213
0
                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2214
0
                                myData->myConverterArray[cs],
2215
0
                                mySourceChar + 0x80);
2216
0
                    }
2217
                    /* return from a single-shift state to the previous one */
2218
0
                    pToU2022State->g=pToU2022State->prevG;
2219
0
                    break;
2220
0
                case JISX201:
2221
0
                    if(mySourceChar <= 0x7f) {
2222
0
                        targetUniChar = jisx201ToU(mySourceChar);
2223
0
                    }
2224
0
                    break;
2225
0
                case HWKANA_7BIT:
2226
0
                    if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2227
                        /* 7-bit halfwidth Katakana */
2228
0
                        targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2229
0
                    }
2230
0
                    break;
2231
0
                default:
2232
                    /* G0 DBCS */
2233
0
                    if(mySource < mySourceLimit) {
2234
0
                        int leadIsOk, trailIsOk;
2235
0
                        uint8_t trailByte;
2236
0
                        /* also entered from the top of the function when resuming a saved lead byte */
getTrailByte:
2237
0
                        /* peek only; mySource is advanced below once the pair is accepted or rejected as a pair */
                        trailByte = (uint8_t)*mySource;
2238
                        /*
2239
                         * Ticket 5691: consistent illegal sequences:
2240
                         * - We include at least the first byte in the illegal sequence.
2241
                         * - If any of the non-initial bytes could be the start of a character,
2242
                         *   we stop the illegal sequence before the first one of those.
2243
                         *
2244
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2245
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2246
                         * Otherwise we convert or report the pair of bytes.
2247
                         */
2248
0
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2249
0
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2250
0
                        if (leadIsOk && trailIsOk) {
2251
0
                            ++mySource;
2252
0
                            tmpSourceChar = (mySourceChar << 8) | trailByte;
2253
0
                            if(cs == JISX208) {
2254
0
                                /* JIS X 0208 is looked up through a Shift-JIS table: tempBuf receives the converted pair */
                                _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2255
0
                                mySourceChar = tmpSourceChar;
2256
0
                            } else {
2257
                                /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2258
0
                                mySourceChar = tmpSourceChar;
2259
0
                                if (cs == KSC5601) {
2260
0
                                    tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
2261
0
                                }
2262
0
                                tempBuf[0] = (char)(tmpSourceChar >> 8);
2263
0
                                tempBuf[1] = (char)(tmpSourceChar);
2264
0
                            }
2265
0
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, false);
2266
0
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2267
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2268
0
                            ++mySource;
2269
                            /* add another bit so that the code below writes 2 bytes in case of error */
2270
0
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2271
0
                        }
2272
0
                    } else {
2273
0
                        /* the lead byte is the last byte of this buffer: save it and resume on the next call */
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2274
0
                        args->converter->toULength = 1;
2275
0
                        goto endloop;
2276
0
                    }
2277
0
                }  /* End of inner switch */
2278
0
                break;
2279
0
            }  /* End of outer switch */
2280
0
            /* a result below 0xfffe is written as a single UTF-16 code unit */
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2281
0
                if(args->offsets){
2282
0
                    /* offset of the character's first byte: back up 1 byte for single-byte input, else 2 */
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2283
0
                }
2284
0
                *(myTarget++)=(char16_t)targetUniChar;
2285
0
            }
2286
0
            else if(targetUniChar > missingCharMarker){
2287
                /* disassemble the surrogate pair and write to output*/
2288
0
                targetUniChar-=0x0010000;
2289
0
                *myTarget = (char16_t)(0xd800+(char16_t)(targetUniChar>>10));
2290
0
                if(args->offsets){
2291
0
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2292
0
                }
2293
0
                ++myTarget;
2294
0
                if(myTarget< args->targetLimit){
2295
0
                    *myTarget = (char16_t)(0xdc00+(char16_t)(targetUniChar&0x3ff));
2296
0
                    if(args->offsets){
2297
0
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2298
0
                    }
2299
0
                    ++myTarget;
2300
0
                }else{
2301
0
                    /* no room for the trail surrogate: park it in the converter's UCharErrorBuffer */
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2302
0
                                    (char16_t)(0xdc00+(char16_t)(targetUniChar&0x3ff));
2303
0
                }
2304
2305
0
            }
2306
0
            else{
2307
                /* Call the callback function*/
2308
0
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2309
0
                break;
2310
0
            }
2311
0
        }
2312
0
        else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
2313
0
            *err =U_BUFFER_OVERFLOW_ERROR;
2314
0
            break;
2315
0
        }
2316
0
    }
2317
0
endloop:
2318
0
    /* publish how far input and output have advanced */
    args->target = myTarget;
2319
0
    args->source = mySource;
2320
0
}
2321
2322
2323
#if !UCONFIG_ONLY_HTML_CONVERSION
2324
/***************************************************************
2325
*   Rules for ISO-2022-KR encoding
2326
*   i) The KSC5601 designator sequence should appear only once in a file,
2327
*      at the beginning of a line before any KSC5601 characters. This usually
2328
*      means that it appears by itself on the first line of the file
2329
*  ii) There are only 2 shifting sequences SO to shift into double byte mode
2330
*      and SI to shift into single byte mode
2331
*/
2332
static void U_CALLCONV
2333
1.50M
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2334
2335
1.50M
    UConverter* saveConv = args->converter;
2336
1.50M
    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2337
1.50M
    args->converter=myConverterData->currentConverter;
2338
2339
1.50M
    myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2340
1.50M
    ucnv_MBCSFromUnicodeWithOffsets(args,err);
2341
1.50M
    saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2342
2343
1.50M
    if(*err == U_BUFFER_OVERFLOW_ERROR) {
2344
182
        if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2345
85
            uprv_memcpy(
2346
85
                saveConv->charErrorBuffer,
2347
85
                myConverterData->currentConverter->charErrorBuffer,
2348
85
                myConverterData->currentConverter->charErrorBufferLength);
2349
85
        }
2350
182
        saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2351
182
        myConverterData->currentConverter->charErrorBufferLength = 0;
2352
182
    }
2353
1.50M
    args->converter=saveConv;
2354
1.50M
}
2355
2356
/*
 * Converts UTF-16 input to ISO-2022-KR bytes, optionally recording source
 * offsets in args->offsets.
 *
 * When the converter data's version is 1 the work is delegated entirely to
 * UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM().
 *
 * Otherwise each code unit is looked up with MBCS_FROM_UCHAR32_ISO2022().
 * Whenever the output switches between single-byte and double-byte results,
 * an SO (entering double-byte) or SI (entering single-byte) byte is written
 * first. Double-byte results are written with 0x80 subtracted from each byte.
 * Bytes that do not fit into the target go to the converter's charErrorBuffer
 * and *err becomes U_BUFFER_OVERFLOW_ERROR.
 *
 * Input that cannot be converted stops the loop with the code point saved in
 * fromUChar32: U_ILLEGAL_CHAR_FOUND for SO/SI/ESC and unmatched surrogates,
 * U_INVALID_CHAR_FOUND for everything else without a usable mapping.
 *
 * On return args->source, args->target and fromUnicodeStatus (the current
 * single/double-byte mode) are updated.
 */
static void U_CALLCONV
2357
8.36M
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2358
2359
8.36M
    const char16_t *source = args->source;
2360
8.36M
    const char16_t *sourceLimit = args->sourceLimit;
2361
8.36M
    unsigned char *target = (unsigned char *) args->target;
2362
8.36M
    unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2363
8.36M
    int32_t* offsets = args->offsets;
2364
8.36M
    uint32_t targetByteUnit = 0x0000;
2365
8.36M
    UChar32 sourceChar = 0x0000;
2366
8.36M
    UBool isTargetByteDBCS;
2367
8.36M
    UBool oldIsTargetByteDBCS;
2368
8.36M
    UConverterDataISO2022 *converterData;
2369
8.36M
    UConverterSharedData* sharedData;
2370
8.36M
    UBool useFallback;
2371
8.36M
    int32_t length =0;
2372
2373
8.36M
    converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2374
    /* if the version is 1 then the user is requesting
2375
     * conversion with ibm-25546 pass the arguments to
2376
     * MBCS converter and return
2377
     */
2378
8.36M
    if(converterData->version==1){
2379
1.50M
        UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2380
1.50M
        return;
2381
1.50M
    }
2382
2383
    /* initialize data */
2384
6.86M
    sharedData = converterData->currentConverter->sharedData;
2385
6.86M
    useFallback = args->converter->useFallback;
2386
6.86M
    /* fromUnicodeStatus carries the single/double-byte mode between calls */
    isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2387
6.86M
    oldIsTargetByteDBCS = isTargetByteDBCS;
2388
2389
6.86M
    /* NOTE(review): repeats the assignment made two statements above; appears redundant */
    isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
2390
6.86M
    /* a code point left over from the previous call is continued at the trail-surrogate lookup */
    if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2391
0
        goto getTrail;
2392
0
    }
2393
13.7M
    while(source < sourceLimit){
2394
2395
13.7M
        targetByteUnit = missingCharMarker;
2396
2397
13.7M
        if(target < (unsigned char*) args->targetLimit){
2398
13.7M
            sourceChar = *source++;
2399
2400
            /* do not convert SO/SI/ESC */
2401
13.7M
            if(IS_2022_CONTROL(sourceChar)) {
2402
                /* callback(illegal) */
2403
217
                *err=U_ILLEGAL_CHAR_FOUND;
2404
217
                args->converter->fromUChar32=sourceChar;
2405
217
                break;
2406
217
            }
2407
2408
13.7M
            length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2409
13.7M
            if(length < 0) {
2410
0
                length = -length;  /* fallback */
2411
0
            }
2412
            /* only DBCS or SBCS characters are expected*/
2413
            /* DB characters with high bit set to 1 are expected */
2414
13.7M
            if( length > 2 || length==0 ||
2415
13.7M
                (length == 1 && targetByteUnit > 0x7f) ||
2416
13.7M
                (length == 2 &&
2417
6.89M
                    ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2418
6.86M
                    (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2419
13.7M
            ) {
2420
6.86M
                /* anything outside the accepted ranges is treated as having no mapping */
                targetByteUnit=missingCharMarker;
2421
6.86M
            }
2422
13.7M
            if (targetByteUnit != missingCharMarker){
2423
2424
6.89M
                oldIsTargetByteDBCS = isTargetByteDBCS;
2425
6.89M
                isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2426
                  /* append the shift sequence */
2427
6.89M
                if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2428
2429
6.88M
                    if (isTargetByteDBCS)
2430
6.86M
                        *target++ = UCNV_SO;
2431
23.2k
                    else
2432
23.2k
                        *target++ = UCNV_SI;
2433
6.88M
                    if(offsets)
2434
0
                        *(offsets++) = (int32_t)(source - args->source-1);
2435
6.88M
                }
2436
                /* write the targetUniChar  to target */
2437
6.89M
                if(targetByteUnit <= 0x00FF){
2438
29.3k
                    if( target < targetLimit){
2439
29.3k
                        *(target++) = (unsigned char) targetByteUnit;
2440
29.3k
                        if(offsets){
2441
0
                            *(offsets++) = (int32_t)(source - args->source-1);
2442
0
                        }
2443
2444
29.3k
                    }else{
2445
35
                        /* target is full: keep the byte in the converter's overflow buffer */
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2446
35
                        *err = U_BUFFER_OVERFLOW_ERROR;
2447
35
                    }
2448
6.86M
                }else{
2449
6.86M
                    if(target < targetLimit){
2450
6.86M
                        /* lead byte, with 0x80 subtracted */
                        *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2451
6.86M
                        if(offsets){
2452
0
                            *(offsets++) = (int32_t)(source - args->source-1);
2453
0
                        }
2454
6.86M
                        if(target < targetLimit){
2455
6.86M
                            /* trail byte, with 0x80 subtracted */
                            *(target++) =(unsigned char) (targetByteUnit -0x80);
2456
6.86M
                            if(offsets){
2457
0
                                *(offsets++) = (int32_t)(source - args->source-1);
2458
0
                            }
2459
6.86M
                        }else{
2460
97
                            /* only the lead byte fit: the trail byte goes to the overflow buffer */
                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2461
97
                            *err = U_BUFFER_OVERFLOW_ERROR;
2462
97
                        }
2463
6.86M
                    }else{
2464
122
                        /* neither byte fit: both go to the overflow buffer */
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2465
122
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2466
122
                        *err = U_BUFFER_OVERFLOW_ERROR;
2467
122
                    }
2468
6.86M
                }
2469
2470
6.89M
            }
2471
6.86M
            else{
2472
                /* oops.. the code point is unassigned
2473
                 * set the error and reason
2474
                 */
2475
2476
                /*check if the char is a First surrogate*/
2477
6.86M
                if(U16_IS_SURROGATE(sourceChar)) {
2478
1.19k
                    if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2479
909
                        /* also entered from the top of the function with a code point saved in fromUChar32 */
getTrail:
2480
                        /*look ahead to find the trail surrogate*/
2481
909
                        if(source <  sourceLimit) {
2482
                            /* test the following code unit */
2483
896
                            char16_t trail=(char16_t) *source;
2484
896
                            if(U16_IS_TRAIL(trail)) {
2485
333
                                source++;
2486
333
                                sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2487
333
                                /* the assembled supplementary code point is reported as unassigned */
                                *err = U_INVALID_CHAR_FOUND;
2488
                                /* convert this surrogate code point */
2489
                                /* exit this condition tree */
2490
563
                            } else {
2491
                                /* this is an unmatched lead code unit (1st surrogate) */
2492
                                /* callback(illegal) */
2493
563
                                *err=U_ILLEGAL_CHAR_FOUND;
2494
563
                            }
2495
896
                        } else {
2496
                            /* no more input */
2497
13
                            /* not an error yet: the lead surrogate is kept in fromUChar32 below */
                            *err = U_ZERO_ERROR;
2498
13
                        }
2499
909
                    } else {
2500
                        /* this is an unmatched trail code unit (2nd surrogate) */
2501
                        /* callback(illegal) */
2502
289
                        *err=U_ILLEGAL_CHAR_FOUND;
2503
289
                    }
2504
6.85M
                } else {
2505
                    /* callback(unassigned) for a BMP code point */
2506
6.85M
                    *err = U_INVALID_CHAR_FOUND;
2507
6.85M
                }
2508
2509
6.86M
                args->converter->fromUChar32=sourceChar;
2510
6.86M
                break;
2511
6.86M
            }
2512
13.7M
        } /* end if(target < targetLimit) */
2513
369
        else{
2514
369
            *err =U_BUFFER_OVERFLOW_ERROR;
2515
369
            break;
2516
369
        }
2517
2518
13.7M
    }/* end while(source < sourceLimit) */
2519
2520
    /*
2521
     * the end of the input stream and detection of truncated input
2522
     * are handled by the framework, but for ISO-2022-KR conversion
2523
     * we need to be in ASCII mode at the very end
2524
     *
2525
     * conditions:
2526
     *   successful
2527
     *   not in ASCII mode
2528
     *   end of input and no truncated input
2529
     */
2530
6.86M
    if( U_SUCCESS(*err) &&
2531
6.86M
        isTargetByteDBCS &&
2532
6.86M
        args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2533
6.86M
    ) {
2534
52
        int32_t sourceIndex;
2535
2536
        /* we are switching to ASCII */
2537
52
        isTargetByteDBCS=false;
2538
2539
        /* get the source index of the last input character */
2540
        /*
2541
         * TODO this would be simpler and more reliable if we used a pair
2542
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2543
         * so that we could simply use the prevSourceIndex here;
2544
         * this code gives an incorrect result for the rare case of an unmatched
2545
         * trail surrogate that is alone in the last buffer of the text stream
2546
         */
2547
52
        sourceIndex=(int32_t)(source-args->source);
2548
52
        if(sourceIndex>0) {
2549
49
            --sourceIndex;
2550
49
            /* step back once more when the last unit is a trail surrogate that is first in the buffer or follows a lead */
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2551
49
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2552
49
            ) {
2553
0
                --sourceIndex;
2554
0
            }
2555
49
        } else {
2556
3
            /* no input was consumed in this call */
            sourceIndex=-1;
2557
3
        }
2558
2559
52
        /* emit the closing one-byte SHIFT_IN_STR through the common writer */
        fromUWriteUInt8(
2560
52
            args->converter,
2561
52
            SHIFT_IN_STR, 1,
2562
52
            &target, (const char *)targetLimit,
2563
52
            &offsets, sourceIndex,
2564
52
            err);
2565
52
    }
2566
2567
    /*save the state and return */
2568
6.86M
    args->source = source;
2569
6.86M
    args->target = (char*)target;
2570
6.86M
    args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2571
6.86M
}
2572
2573
/************************ To Unicode ***************************************/
2574
2575
static void U_CALLCONV
2576
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2577
0
                                                            UErrorCode* err){
2578
0
    char const* sourceStart;
2579
0
    UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2580
2581
0
    UConverterToUnicodeArgs subArgs;
2582
0
    int32_t minArgsSize;
2583
2584
    /* set up the subconverter arguments */
2585
0
    if(args->size<sizeof(UConverterToUnicodeArgs)) {
2586
0
        minArgsSize = args->size;
2587
0
    } else {
2588
0
        minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2589
0
    }
2590
2591
0
    uprv_memcpy(&subArgs, args, minArgsSize);
2592
0
    subArgs.size = (uint16_t)minArgsSize;
2593
0
    subArgs.converter = myData->currentConverter;
2594
2595
    /* remember the original start of the input for offsets */
2596
0
    sourceStart = args->source;
2597
2598
0
    if(myData->key != 0) {
2599
        /* continue with a partial escape sequence */
2600
0
        goto escape;
2601
0
    }
2602
2603
0
    while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2604
        /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2605
0
        subArgs.source = args->source;
2606
0
        subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2607
0
        if(subArgs.source != subArgs.sourceLimit) {
2608
            /*
2609
             * get the current partial byte sequence
2610
             *
2611
             * it needs to be moved between the public and the subconverter
2612
             * so that the conversion framework, which only sees the public
2613
             * converter, can handle truncated and illegal input etc.
2614
             */
2615
0
            if(args->converter->toULength > 0) {
2616
0
                uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2617
0
            }
2618
0
            subArgs.converter->toULength = args->converter->toULength;
2619
2620
            /*
2621
             * Convert up to the end of the input, or to before the next escape character.
2622
             * Does not handle conversion extensions because the preToU[] state etc.
2623
             * is not copied.
2624
             */
2625
0
            ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2626
2627
0
            if(args->offsets != nullptr && sourceStart != args->source) {
2628
                /* update offsets to base them on the actual start of the input */
2629
0
                int32_t *offsets = args->offsets;
2630
0
                char16_t *target = args->target;
2631
0
                int32_t delta = (int32_t)(args->source - sourceStart);
2632
0
                while(target < subArgs.target) {
2633
0
                    if(*offsets >= 0) {
2634
0
                        *offsets += delta;
2635
0
                    }
2636
0
                    ++offsets;
2637
0
                    ++target;
2638
0
                }
2639
0
            }
2640
0
            args->source = subArgs.source;
2641
0
            args->target = subArgs.target;
2642
0
            args->offsets = subArgs.offsets;
2643
2644
            /* copy input/error/overflow buffers */
2645
0
            if(subArgs.converter->toULength > 0) {
2646
0
                uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2647
0
            }
2648
0
            args->converter->toULength = subArgs.converter->toULength;
2649
2650
0
            if(*err == U_BUFFER_OVERFLOW_ERROR) {
2651
0
                if(subArgs.converter->UCharErrorBufferLength > 0) {
2652
0
                    uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2653
0
                                subArgs.converter->UCharErrorBufferLength);
2654
0
                }
2655
0
                args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2656
0
                subArgs.converter->UCharErrorBufferLength = 0;
2657
0
            }
2658
0
        }
2659
2660
0
        if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2661
0
            return;
2662
0
        }
2663
2664
0
escape:
2665
0
        changeState_2022(args->converter,
2666
0
               &(args->source),
2667
0
               args->sourceLimit,
2668
0
               ISO_2022_KR,
2669
0
               err);
2670
0
    }
2671
0
}
2672
2673
static void U_CALLCONV
2674
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2675
0
                                                            UErrorCode* err){
2676
0
    char tempBuf[2];
2677
0
    const char *mySource = ( char *) args->source;
2678
0
    char16_t *myTarget = args->target;
2679
0
    const char *mySourceLimit = args->sourceLimit;
2680
0
    UChar32 targetUniChar = 0x0000;
2681
0
    char16_t mySourceChar = 0x0000;
2682
0
    UConverterDataISO2022* myData;
2683
0
    UConverterSharedData* sharedData ;
2684
0
    UBool useFallback;
2685
2686
0
    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2687
0
    if(myData->version==1){
2688
0
        UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2689
0
        return;
2690
0
    }
2691
2692
    /* initialize state */
2693
0
    sharedData = myData->currentConverter->sharedData;
2694
0
    useFallback = args->converter->useFallback;
2695
2696
0
    if(myData->key != 0) {
2697
        /* continue with a partial escape sequence */
2698
0
        goto escape;
2699
0
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2700
        /* continue with a partial double-byte character */
2701
0
        mySourceChar = args->converter->toUBytes[0];
2702
0
        args->converter->toULength = 0;
2703
0
        goto getTrailByte;
2704
0
    }
2705
2706
0
    while(mySource< mySourceLimit){
2707
2708
0
        if(myTarget < args->targetLimit){
2709
2710
0
            mySourceChar= (unsigned char) *mySource++;
2711
2712
0
            if(mySourceChar==UCNV_SI){
2713
0
                myData->toU2022State.g = 0;
2714
0
                if (myData->isEmptySegment) {
2715
0
                    myData->isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
2716
0
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2717
0
                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
2718
0
                    args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2719
0
                    args->converter->toULength = 1;
2720
0
                    args->target = myTarget;
2721
0
                    args->source = mySource;
2722
0
                    return;
2723
0
                }
2724
                /*consume the source */
2725
0
                continue;
2726
0
            }else if(mySourceChar==UCNV_SO){
2727
0
                myData->toU2022State.g = 1;
2728
0
                myData->isEmptySegment = true;  /* Begin a new segment, empty so far */
2729
                /*consume the source */
2730
0
                continue;
2731
0
            }else if(mySourceChar==ESC_2022){
2732
0
                mySource--;
2733
0
escape:
2734
0
                myData->isEmptySegment = false; /* Any invalid ESC sequences will be detected separately, so just reset this */
2735
0
                changeState_2022(args->converter,&(mySource),
2736
0
                                mySourceLimit, ISO_2022_KR, err);
2737
0
                if(U_FAILURE(*err)){
2738
0
                    args->target = myTarget;
2739
0
                    args->source = mySource;
2740
0
                    return;
2741
0
                }
2742
0
                continue;
2743
0
            }
2744
2745
0
            myData->isEmptySegment = false; /* Any invalid char errors will be detected separately, so just reset this */
2746
0
            if(myData->toU2022State.g == 1) {
2747
0
                if(mySource < mySourceLimit) {
2748
0
                    int leadIsOk, trailIsOk;
2749
0
                    uint8_t trailByte;
2750
0
getTrailByte:
2751
0
                    targetUniChar = missingCharMarker;
2752
0
                    trailByte = (uint8_t)*mySource;
2753
                    /*
2754
                     * Ticket 5691: consistent illegal sequences:
2755
                     * - We include at least the first byte in the illegal sequence.
2756
                     * - If any of the non-initial bytes could be the start of a character,
2757
                     *   we stop the illegal sequence before the first one of those.
2758
                     *
2759
                     * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2760
                     * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2761
                     * Otherwise we convert or report the pair of bytes.
2762
                     */
2763
0
                    leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2764
0
                    trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2765
0
                    if (leadIsOk && trailIsOk) {
2766
0
                        ++mySource;
2767
0
                        tempBuf[0] = (char)(mySourceChar + 0x80);
2768
0
                        tempBuf[1] = (char)(trailByte + 0x80);
2769
0
                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2770
0
                        mySourceChar = (mySourceChar << 8) | trailByte;
2771
0
                    } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2772
                        /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2773
0
                        ++mySource;
2774
                        /* add another bit so that the code below writes 2 bytes in case of error */
2775
0
                        mySourceChar = static_cast<char16_t>(0x10000 | (mySourceChar << 8) | trailByte);
2776
0
                    }
2777
0
                } else {
2778
0
                    args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2779
0
                    args->converter->toULength = 1;
2780
0
                    break;
2781
0
                }
2782
0
            }
2783
0
            else if(mySourceChar <= 0x7f) {
2784
0
                targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2785
0
            } else {
2786
0
                targetUniChar = 0xffff;
2787
0
            }
2788
0
            if(targetUniChar < 0xfffe){
2789
0
                if(args->offsets) {
2790
0
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2791
0
                }
2792
0
                *(myTarget++)=(char16_t)targetUniChar;
2793
0
            }
2794
0
            else {
2795
                /* Call the callback function*/
2796
0
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2797
0
                break;
2798
0
            }
2799
0
        }
2800
0
        else{
2801
0
            *err =U_BUFFER_OVERFLOW_ERROR;
2802
0
            break;
2803
0
        }
2804
0
    }
2805
0
    args->target = myTarget;
2806
0
    args->source = mySource;
2807
0
}
2808
2809
/*************************** END ISO2022-KR *********************************/
2810
2811
/*************************** ISO-2022-CN *********************************
2812
*
2813
* Rules for ISO-2022-CN Encoding:
2814
* i)   The designator sequence must appear once on a line before any instance
2815
*      of character set it designates.
2816
* ii)  If two lines contain characters from the same character set, both lines
2817
*      must include the designator sequence.
2818
* iii) Once the designator sequence is known, a shifting sequence has to be found
2819
*      to invoke the designated character set.
2820
* iv)  All lines start in ASCII and end in ASCII.
2821
* v)   Four shifting sequences are employed for this purpose:
2822
*
2823
*      Sequence    ASCII Eq    Charsets
2824
*      ----------  -------    ---------
2825
*      SI           <SI>        US-ASCII
2826
*      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2827
*      SS2          <ESC>N      CNS-11643-1992 Plane 2
2828
*      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
2829
*
2830
* vi)
2831
*      SOdesignator  : ESC "$" ")" finalchar_for_SO
2832
*      SS2designator : ESC "$" "*" finalchar_for_SS2
2833
*      SS3designator : ESC "$" "+" finalchar_for_SS3
2834
*
2835
*      ESC $ ) A       Indicates the bytes following SO are Chinese
2836
*       characters as defined in GB 2312-80, until
2837
*       another SOdesignation appears
2838
*
2839
*
2840
*      ESC $ ) E       Indicates the bytes following SO are as defined
2841
*       in ISO-IR-165 (for details, see section 2.1),
2842
*       until another SOdesignation appears
2843
*
2844
*      ESC $ ) G       Indicates the bytes following SO are as defined
2845
*       in CNS 11643-plane-1, until another
2846
*       SOdesignation appears
2847
*
2848
*      ESC $ * H       Indicates the two bytes immediately following
2849
*       SS2 is a Chinese character as defined in CNS
2850
*       11643-plane-2, until another SS2designation
2851
*       appears
2852
*       (Meaning <ESC>N must precede every 2 byte
2853
*        sequence.)
2854
*
2855
*      ESC $ + I       Indicates the immediate two bytes following SS3
2856
*       is a Chinese character as defined in CNS
2857
*       11643-plane-3, until another SS3designation
2858
*       appears
2859
*       (Meaning <ESC>O must precede every 2 byte
2860
*        sequence.)
2861
*
2862
*      ESC $ + J       Indicates the immediate two bytes following SS3
2863
*       is a Chinese character as defined in CNS
2864
*       11643-plane-4, until another SS3designation
2865
*       appears
2866
*       (In English: <ESC>O must precede every 2 byte
2867
*        sequence.)
2868
*
2869
*      ESC $ + K       Indicates the immediate two bytes following SS3
2870
*       is a Chinese character as defined in CNS
2871
*       11643-plane-5, until another SS3designation
2872
*       appears
2873
*
2874
*      ESC $ + L       Indicates the immediate two bytes following SS3
2875
*       is a Chinese character as defined in CNS
2876
*       11643-plane-6, until another SS3designation
2877
*       appears
2878
*
2879
*      ESC $ + M       Indicates the immediate two bytes following SS3
2880
*       is a Chinese character as defined in CNS
2881
*       11643-plane-7, until another SS3designation
2882
*       appears
2883
*
2884
*       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2885
*       has its own designation information before any Chinese characters
2886
*       appear
2887
*
2888
*/
2889
2890
/*
 * ISO-2022-CN designator sequences, kept as static const arrays so that they
 * are truly read-only. Each one is the four bytes ESC '$' <intermediate> <final>
 * followed by a terminating NUL, spelled numerically so that the stored bytes
 * do not depend on the compiler's execution character set.
 */
/* ESC $ ) A : SO designator for GB 2312-80 */
static const char GB_2312_80_STR[] = { 0x1B, 0x24, 0x29, 0x41, 0x00 };
/* ESC $ ) E : SO designator for ISO-IR-165 */
static const char ISO_IR_165_STR[] = { 0x1B, 0x24, 0x29, 0x45, 0x00 };
/* ESC $ ) G : SO designator for CNS 11643-1992 plane 1 */
static const char CNS_11643_1992_Plane_1_STR[] = { 0x1B, 0x24, 0x29, 0x47, 0x00 };
/* ESC $ * H : SS2 designator for CNS 11643-1992 plane 2 */
static const char CNS_11643_1992_Plane_2_STR[] = { 0x1B, 0x24, 0x2A, 0x48, 0x00 };
/* ESC $ + I..M : SS3 designators for CNS 11643-1992 planes 3..7 */
static const char CNS_11643_1992_Plane_3_STR[] = { 0x1B, 0x24, 0x2B, 0x49, 0x00 };
static const char CNS_11643_1992_Plane_4_STR[] = { 0x1B, 0x24, 0x2B, 0x4A, 0x00 };
static const char CNS_11643_1992_Plane_5_STR[] = { 0x1B, 0x24, 0x2B, 0x4B, 0x00 };
static const char CNS_11643_1992_Plane_6_STR[] = { 0x1B, 0x24, 0x2B, 0x4C, 0x00 };
static const char CNS_11643_1992_Plane_7_STR[] = { 0x1B, 0x24, 0x2B, 0x4D, 0x00 };
2900
2901
/********************** ISO2022-CN Data **************************/
2902
static const char* const escSeqCharsCN[10] ={
2903
        SHIFT_IN_STR,                   /* 0 ASCII */
2904
        GB_2312_80_STR,                 /* 1 GB2312_1 */
2905
        ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
2906
        CNS_11643_1992_Plane_1_STR,
2907
        CNS_11643_1992_Plane_2_STR,
2908
        CNS_11643_1992_Plane_3_STR,
2909
        CNS_11643_1992_Plane_4_STR,
2910
        CNS_11643_1992_Plane_5_STR,
2911
        CNS_11643_1992_Plane_6_STR,
2912
        CNS_11643_1992_Plane_7_STR
2913
};
2914
2915
static void U_CALLCONV
2916
586k
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2917
586k
    UConverter *cnv = args->converter;
2918
586k
    UConverterDataISO2022 *converterData;
2919
586k
    ISO2022State *pFromU2022State;
2920
586k
    uint8_t *target = (uint8_t *) args->target;
2921
586k
    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2922
586k
    const char16_t* source = args->source;
2923
586k
    const char16_t* sourceLimit = args->sourceLimit;
2924
586k
    int32_t* offsets = args->offsets;
2925
586k
    UChar32 sourceChar;
2926
586k
    char buffer[8];
2927
586k
    int32_t len;
2928
586k
    int8_t choices[3];
2929
586k
    int32_t choiceCount;
2930
586k
    uint32_t targetValue = 0;
2931
586k
    UBool useFallback;
2932
2933
    /* set up the state */
2934
586k
    converterData     = (UConverterDataISO2022*)cnv->extraInfo;
2935
586k
    pFromU2022State   = &converterData->fromU2022State;
2936
2937
586k
    choiceCount = 0;
2938
2939
    /* check if the last codepoint of previous buffer was a lead surrogate*/
2940
586k
    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2941
0
        goto getTrail;
2942
0
    }
2943
2944
14.5M
    while( source < sourceLimit){
2945
14.5M
        if(target < targetLimit){
2946
2947
14.5M
            sourceChar  = *(source++);
2948
            /*check if the char is a First surrogate*/
2949
14.5M
             if(U16_IS_SURROGATE(sourceChar)) {
2950
452k
                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2951
1.91k
getTrail:
2952
                    /*look ahead to find the trail surrogate*/
2953
1.91k
                    if(source < sourceLimit) {
2954
                        /* test the following code unit */
2955
1.90k
                        char16_t trail=(char16_t) *source;
2956
1.90k
                        if(U16_IS_TRAIL(trail)) {
2957
471
                            source++;
2958
471
                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2959
471
                            cnv->fromUChar32=0x00;
2960
                            /* convert this supplementary code point */
2961
                            /* exit this condition tree */
2962
1.43k
                        } else {
2963
                            /* this is an unmatched lead code unit (1st surrogate) */
2964
                            /* callback(illegal) */
2965
1.43k
                            *err=U_ILLEGAL_CHAR_FOUND;
2966
1.43k
                            cnv->fromUChar32=sourceChar;
2967
1.43k
                            break;
2968
1.43k
                        }
2969
1.90k
                    } else {
2970
                        /* no more input */
2971
16
                        cnv->fromUChar32=sourceChar;
2972
16
                        break;
2973
16
                    }
2974
450k
                } else {
2975
                    /* this is an unmatched trail code unit (2nd surrogate) */
2976
                    /* callback(illegal) */
2977
450k
                    *err=U_ILLEGAL_CHAR_FOUND;
2978
450k
                    cnv->fromUChar32=sourceChar;
2979
450k
                    break;
2980
450k
                }
2981
452k
            }
2982
2983
            /* do the conversion */
2984
14.0M
            if(sourceChar <= 0x007f ){
2985
                /* do not convert SO/SI/ESC */
2986
19.9k
                if(IS_2022_CONTROL(sourceChar)) {
2987
                    /* callback(illegal) */
2988
339
                    *err=U_ILLEGAL_CHAR_FOUND;
2989
339
                    cnv->fromUChar32=sourceChar;
2990
339
                    break;
2991
339
                }
2992
2993
                /* US-ASCII */
2994
19.6k
                if(pFromU2022State->g == 0) {
2995
16.0k
                    buffer[0] = (char)sourceChar;
2996
16.0k
                    len = 1;
2997
16.0k
                } else {
2998
3.59k
                    buffer[0] = UCNV_SI;
2999
3.59k
                    buffer[1] = (char)sourceChar;
3000
3.59k
                    len = 2;
3001
3.59k
                    pFromU2022State->g = 0;
3002
3.59k
                    choiceCount = 0;
3003
3.59k
                }
3004
19.6k
                if(sourceChar == CR || sourceChar == LF) {
3005
                    /* reset the state at the end of a line */
3006
805
                    uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
3007
805
                    choiceCount = 0;
3008
805
                }
3009
19.6k
            }
3010
14.0M
            else{
3011
                /* convert U+0080..U+10ffff */
3012
14.0M
                int32_t i;
3013
14.0M
                int8_t cs, g;
3014
3015
14.0M
                if(choiceCount == 0) {
3016
                    /* try the current SO/G1 converter first */
3017
13.8M
                    choices[0] = pFromU2022State->cs[1];
3018
3019
                    /* default to GB2312_1 if none is designated yet */
3020
13.8M
                    if(choices[0] == 0) {
3021
3.97k
                        choices[0] = GB2312_1;
3022
3.97k
                    }
3023
3024
13.8M
                    if(converterData->version == 0) {
3025
                        /* ISO-2022-CN */
3026
3027
                        /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
3028
288k
                        if(choices[0] == GB2312_1) {
3029
149k
                            choices[1] = (int8_t)CNS_11643_1;
3030
149k
                        } else {
3031
138k
                            choices[1] = (int8_t)GB2312_1;
3032
138k
                        }
3033
3034
288k
                        choiceCount = 2;
3035
13.5M
                    } else if (converterData->version == 1) {
3036
                        /* ISO-2022-CN-EXT */
3037
3038
                        /* try one of the other converters */
3039
13.5M
                        switch(choices[0]) {
3040
6.74M
                        case GB2312_1:
3041
6.74M
                            choices[1] = (int8_t)CNS_11643_1;
3042
6.74M
                            choices[2] = (int8_t)ISO_IR_165;
3043
6.74M
                            break;
3044
3.93k
                        case ISO_IR_165:
3045
3.93k
                            choices[1] = (int8_t)GB2312_1;
3046
3.93k
                            choices[2] = (int8_t)CNS_11643_1;
3047
3.93k
                            break;
3048
6.76M
                        default: /* CNS_11643_x */
3049
6.76M
                            choices[1] = (int8_t)GB2312_1;
3050
6.76M
                            choices[2] = (int8_t)ISO_IR_165;
3051
6.76M
                            break;
3052
13.5M
                        }
3053
3054
13.5M
                        choiceCount = 3;
3055
13.5M
                    } else {
3056
249
                        choices[0] = (int8_t)CNS_11643_1;
3057
249
                        choices[1] = (int8_t)GB2312_1;
3058
249
                    }
3059
13.8M
                }
3060
3061
14.0M
                cs = g = 0;
3062
                /*
3063
                 * len==0: no mapping found yet
3064
                 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3065
                 * len>0: found a roundtrip result, done
3066
                 */
3067
14.0M
                len = 0;
3068
                /*
3069
                 * We will turn off useFallback after finding a fallback,
3070
                 * but we still get fallbacks from PUA code points as usual.
3071
                 * Therefore, we will also need to check that we don't overwrite
3072
                 * an early fallback with a later one.
3073
                 */
3074
14.0M
                useFallback = cnv->useFallback;
3075
3076
42.0M
                for(i = 0; i < choiceCount && len <= 0; ++i) {
3077
27.9M
                    int8_t cs0 = choices[i];
3078
27.9M
                    if(cs0 > 0) {
3079
27.9M
                        uint32_t value;
3080
27.9M
                        int32_t len2;
3081
27.9M
                        if(cs0 >= CNS_11643_0) {
3082
13.8M
                            len2 = MBCS_FROM_UCHAR32_ISO2022(
3083
13.8M
                                        converterData->myConverterArray[CNS_11643],
3084
13.8M
                                        sourceChar,
3085
13.8M
                                        &value,
3086
13.8M
                                        useFallback,
3087
13.8M
                                        MBCS_OUTPUT_3);
3088
13.8M
                            if(len2 == 3 || (len2 == -3 && len == 0)) {
3089
6.89M
                                targetValue = value;
3090
6.89M
                                cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3091
6.89M
                                if(len2 >= 0) {
3092
6.89M
                                    len = 2;
3093
6.89M
                                } else {
3094
0
                                    len = -2;
3095
0
                                    useFallback = false;
3096
0
                                }
3097
6.89M
                                if(cs == CNS_11643_1) {
3098
6.84M
                                    g = 1;
3099
6.84M
                                } else if(cs == CNS_11643_2) {
3100
54.7k
                                    g = 2;
3101
54.7k
                                } else /* plane 3..7 */ if(converterData->version == 1) {
3102
0
                                    g = 3;
3103
0
                                } else {
3104
                                    /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3105
0
                                    len = 0;
3106
0
                                }
3107
6.89M
                            }
3108
14.0M
                        } else {
3109
                            /* GB2312_1 or ISO-IR-165 */
3110
14.0M
                            U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3111
14.0M
                            len2 = MBCS_FROM_UCHAR32_ISO2022(
3112
14.0M
                                        converterData->myConverterArray[cs0],
3113
14.0M
                                        sourceChar,
3114
14.0M
                                        &value,
3115
14.0M
                                        useFallback,
3116
14.0M
                                        MBCS_OUTPUT_2);
3117
14.0M
                            if(len2 == 2 || (len2 == -2 && len == 0)) {
3118
7.02M
                                targetValue = value;
3119
7.02M
                                len = len2;
3120
7.02M
                                cs = cs0;
3121
7.02M
                                g = 1;
3122
7.02M
                                useFallback = false;
3123
7.02M
                            }
3124
14.0M
                        }
3125
27.9M
                    }
3126
27.9M
                }
3127
3128
14.0M
                if(len != 0) {
3129
13.9M
                    len = 0; /* count output bytes; it must have been abs(len) == 2 */
3130
3131
                    /* write the designation sequence if necessary */
3132
13.9M
                    if(cs != pFromU2022State->cs[g]) {
3133
13.6M
                        if(cs < CNS_11643) {
3134
6.83M
                            uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3135
6.83M
                        } else {
3136
6.83M
                            U_ASSERT(cs >= CNS_11643_1);
3137
6.83M
                            uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3138
6.83M
                        }
3139
13.6M
                        len = 4;
3140
13.6M
                        pFromU2022State->cs[g] = cs;
3141
13.6M
                        if(g == 1) {
3142
                            /* changing the SO/G1 charset invalidates the choices[] */
3143
13.6M
                            choiceCount = 0;
3144
13.6M
                        }
3145
13.6M
                    }
3146
3147
                    /* write the shift sequence if necessary */
3148
13.9M
                    if(g != pFromU2022State->g) {
3149
69.2k
                        switch(g) {
3150
14.5k
                        case 1:
3151
14.5k
                            buffer[len++] = UCNV_SO;
3152
3153
                            /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3154
14.5k
                            pFromU2022State->g = 1;
3155
14.5k
                            break;
3156
54.7k
                        case 2:
3157
54.7k
                            buffer[len++] = 0x1b;
3158
54.7k
                            buffer[len++] = 0x4e;
3159
54.7k
                            break;
3160
0
                        default: /* case 3 */
3161
0
                            buffer[len++] = 0x1b;
3162
0
                            buffer[len++] = 0x4f;
3163
0
                            break;
3164
69.2k
                        }
3165
69.2k
                    }
3166
3167
                    /* write the two output bytes */
3168
13.9M
                    buffer[len++] = (char)(targetValue >> 8);
3169
13.9M
                    buffer[len++] = (char)targetValue;
3170
13.9M
                } else {
3171
                    /* if we cannot find the character after checking all codepages
3172
                     * then this is an error
3173
                     */
3174
131k
                    *err = U_INVALID_CHAR_FOUND;
3175
131k
                    cnv->fromUChar32=sourceChar;
3176
131k
                    break;
3177
131k
                }
3178
14.0M
            }
3179
3180
            /* output len>0 bytes in buffer[] */
3181
13.9M
            if(len == 1) {
3182
16.0k
                *target++ = buffer[0];
3183
16.0k
                if(offsets) {
3184
0
                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3185
0
                }
3186
13.9M
            } else if(len == 2 && (target + 2) <= targetLimit) {
3187
195k
                *target++ = buffer[0];
3188
195k
                *target++ = buffer[1];
3189
195k
                if(offsets) {
3190
0
                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3191
0
                    *offsets++ = sourceIndex;
3192
0
                    *offsets++ = sourceIndex;
3193
0
                }
3194
13.7M
            } else {
3195
13.7M
                fromUWriteUInt8(
3196
13.7M
                    cnv,
3197
13.7M
                    buffer, len,
3198
13.7M
                    &target, (const char *)targetLimit,
3199
13.7M
                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3200
13.7M
                    err);
3201
13.7M
                if(U_FAILURE(*err)) {
3202
2.36k
                    break;
3203
2.36k
                }
3204
13.7M
            }
3205
13.9M
        } /* end if(myTargetIndex<myTargetLength) */
3206
505
        else{
3207
505
            *err =U_BUFFER_OVERFLOW_ERROR;
3208
505
            break;
3209
505
        }
3210
3211
14.5M
    }/* end while(mySourceIndex<mySourceLength) */
3212
3213
    /*
3214
     * the end of the input stream and detection of truncated input
3215
     * are handled by the framework, but for ISO-2022-CN conversion
3216
     * we need to be in ASCII mode at the very end
3217
     *
3218
     * conditions:
3219
     *   successful
3220
     *   not in ASCII mode
3221
     *   end of input and no truncated input
3222
     */
3223
586k
    if( U_SUCCESS(*err) &&
3224
586k
        pFromU2022State->g!=0 &&
3225
586k
        args->flush && source>=sourceLimit && cnv->fromUChar32==0
3226
586k
    ) {
3227
123
        int32_t sourceIndex;
3228
3229
        /* we are switching to ASCII */
3230
123
        pFromU2022State->g=0;
3231
3232
        /* get the source index of the last input character */
3233
        /*
3234
         * TODO this would be simpler and more reliable if we used a pair
3235
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3236
         * so that we could simply use the prevSourceIndex here;
3237
         * this code gives an incorrect result for the rare case of an unmatched
3238
         * trail surrogate that is alone in the last buffer of the text stream
3239
         */
3240
123
        sourceIndex=(int32_t)(source-args->source);
3241
123
        if(sourceIndex>0) {
3242
112
            --sourceIndex;
3243
112
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3244
112
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3245
112
            ) {
3246
0
                --sourceIndex;
3247
0
            }
3248
112
        } else {
3249
11
            sourceIndex=-1;
3250
11
        }
3251
3252
123
        fromUWriteUInt8(
3253
123
            cnv,
3254
123
            SHIFT_IN_STR, 1,
3255
123
            &target, (const char *)targetLimit,
3256
123
            &offsets, sourceIndex,
3257
123
            err);
3258
123
    }
3259
3260
    /*save the state and return */
3261
586k
    args->source = source;
3262
586k
    args->target = (char*)target;
3263
586k
}
3264
3265
3266
/*
 * Converts ISO-2022-CN bytes in [args->source, args->sourceLimit) to UTF-16
 * code units at args->target, filling args->offsets when that pointer is set.
 *
 * On entry the function resumes either a partial escape sequence
 * (myData->key != 0) or a DBCS lead byte saved in toUBytes[0]
 * (converter->toULength == 1).
 * SI selects G0, SO selects G1 (only when cs[1] has been designated),
 * ESC is handed to changeState_2022(), and CR/LF clear the toUnicode state.
 * Failures are reported through *err; args->source and args->target are
 * written back before every return.
 */
static void U_CALLCONV
3267
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3268
0
                                               UErrorCode* err){
3269
0
    char tempBuf[3];
3270
0
    const char *mySource = (char *) args->source;
3271
0
    char16_t *myTarget = args->target;
3272
0
    const char *mySourceLimit = args->sourceLimit;
3273
0
    uint32_t targetUniChar = 0x0000;
3274
0
    uint32_t mySourceChar = 0x0000;
3275
0
    UConverterDataISO2022* myData;
3276
0
    ISO2022State *pToU2022State;
3277
3278
0
    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3279
0
    pToU2022State = &myData->toU2022State;
3280
3281
0
    if(myData->key != 0) {
3282
        /* continue with a partial escape sequence */
3283
0
        goto escape;
3284
0
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3285
        /* continue with a partial double-byte character */
3286
0
        mySourceChar = args->converter->toUBytes[0];
3287
0
        args->converter->toULength = 0;
3288
0
        targetUniChar = missingCharMarker;
3289
0
        goto getTrailByte;
3290
0
    }
3291
3292
0
    while(mySource < mySourceLimit){
3293
3294
0
        targetUniChar =missingCharMarker;
3295
3296
0
        if(myTarget < args->targetLimit){
3297
3298
0
            mySourceChar= (unsigned char) *mySource++;
3299
3300
0
            switch(mySourceChar){
3301
0
            case UCNV_SI:
                /* SI: back to G0; an SI that closes an empty SO segment is reported as an error */
3302
0
                pToU2022State->g=0;
3303
0
                if (myData->isEmptySegment) {
3304
0
                    myData->isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
3305
0
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3306
0
                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
3307
0
                    args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
3308
0
                    args->converter->toULength = 1;
3309
0
                    args->target = myTarget;
3310
0
                    args->source = mySource;
3311
0
                    return;
3312
0
                }
3313
0
                continue;
3314
3315
0
            case UCNV_SO:
3316
0
                if(pToU2022State->cs[1] != 0) {
3317
0
                    pToU2022State->g=1;
3318
0
                    myData->isEmptySegment = true;  /* Begin a new segment, empty so far */
3319
0
                    continue;
3320
0
                } else {
3321
                    /* illegal to have SO before a matching designator */
3322
0
                    myData->isEmptySegment = false; /* Handling a different error, reset this to avoid future spurious errs */
3323
0
                    break;
3324
0
                }
3325
3326
0
            case ESC_2022:
                /* un-read the ESC so that changeState_2022() sees the whole sequence */
3327
0
                mySource--;
3328
0
escape:
3329
0
                {
3330
0
                    const char * mySourceBefore = mySource;
3331
0
                    int8_t toULengthBefore = args->converter->toULength;
3332
3333
0
                    changeState_2022(args->converter,&(mySource),
3334
0
                        mySourceLimit, ISO_2022_CN,err);
3335
3336
                    /* After SO there must be at least one character before a designator (designator error handled separately) */
3337
0
                    if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3338
0
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3339
0
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
3340
0
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
3341
0
                    }
3342
0
                }
3343
3344
                /* invalid or illegal escape sequence */
3345
0
                if(U_FAILURE(*err)){
3346
0
                    args->target = myTarget;
3347
0
                    args->source = mySource;
3348
0
                    myData->isEmptySegment = false; /* Reset to avoid future spurious errors */
3349
0
                    return;
3350
0
                }
3351
0
                continue;
3352
3353
            /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3354
3355
0
            case CR:
3356
0
            case LF:
                /* a line break clears the whole toUnicode state, then the byte is converted like any other */
3357
0
                uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3358
0
                U_FALLTHROUGH;
3359
0
            default:
3360
                /* convert one or two bytes */
3361
0
                myData->isEmptySegment = false;
3362
0
                if(pToU2022State->g != 0) {
3363
0
                    if(mySource < mySourceLimit) {
3364
0
                        UConverterSharedData *cnv;
3365
0
                        StateEnum tempState;
3366
0
                        int32_t tempBufLen;
3367
0
                        int leadIsOk, trailIsOk;
3368
0
                        uint8_t trailByte;
3369
0
getTrailByte:
3370
0
                        trailByte = (uint8_t)*mySource;
3371
                        /*
3372
                         * Ticket 5691: consistent illegal sequences:
3373
                         * - We include at least the first byte in the illegal sequence.
3374
                         * - If any of the non-initial bytes could be the start of a character,
3375
                         *   we stop the illegal sequence before the first one of those.
3376
                         *
3377
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3378
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3379
                         * Otherwise we convert or report the pair of bytes.
3380
                         */
3381
0
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3382
0
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3383
0
                        if (leadIsOk && trailIsOk) {
3384
0
                            ++mySource;
3385
0
                            tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3386
0
                            if(tempState >= CNS_11643_0) {
                                /* CNS 11643 lookup uses 3 bytes: a prefix byte derived from the state, then lead and trail */
3387
0
                                cnv = myData->myConverterArray[CNS_11643];
3388
0
                                tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3389
0
                                tempBuf[1] = (char) (mySourceChar);
3390
0
                                tempBuf[2] = (char) trailByte;
3391
0
                                tempBufLen = 3;
3392
3393
0
                            }else{
3394
0
                                U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
3395
0
                                cnv = myData->myConverterArray[tempState];
3396
0
                                tempBuf[0] = (char) (mySourceChar);
3397
0
                                tempBuf[1] = (char) trailByte;
3398
0
                                tempBufLen = 2;
3399
0
                            }
3400
0
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, false);
3401
0
                            mySourceChar = (mySourceChar << 8) | trailByte;
3402
0
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3403
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3404
0
                            ++mySource;
3405
                            /* add another bit so that the code below writes 2 bytes in case of error */
3406
0
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3407
0
                        }
3408
0
                        if(pToU2022State->g>=2) {
3409
                            /* return from a single-shift state to the previous one */
3410
0
                            pToU2022State->g=pToU2022State->prevG;
3411
0
                        }
3412
0
                    } else {
                        /* input ends after the lead byte: save it so that the next call can resume */
3413
0
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3414
0
                        args->converter->toULength = 1;
3415
0
                        goto endloop;
3416
0
                    }
3417
0
                }
3418
0
                else{
3419
0
                    if(mySourceChar <= 0x7f) {
3420
0
                        targetUniChar = (char16_t) mySourceChar;
3421
0
                    }
3422
0
                }
3423
0
                break;
3424
0
            }
            /* one code unit: store it; the offset steps back 1 byte for single-byte input, 2 for a byte pair */
3425
0
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3426
0
                if(args->offsets){
3427
0
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3428
0
                }
3429
0
                *(myTarget++)=(char16_t)targetUniChar;
3430
0
            }
3431
0
            else if(targetUniChar > missingCharMarker){
3432
                /* disassemble the surrogate pair and write to output*/
3433
0
                targetUniChar-=0x0010000;
3434
0
                *myTarget = (char16_t)(0xd800+(char16_t)(targetUniChar>>10));
3435
0
                if(args->offsets){
3436
0
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3437
0
                }
3438
0
                ++myTarget;
3439
0
                if(myTarget< args->targetLimit){
3440
0
                    *myTarget = (char16_t)(0xdc00+(char16_t)(targetUniChar&0x3ff));
3441
0
                    if(args->offsets){
3442
0
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3443
0
                    }
3444
0
                    ++myTarget;
3445
0
                }else{
                    /* no room for the trail surrogate: park it in the converter's UCharErrorBuffer */
3446
0
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3447
0
                                    (char16_t)(0xdc00+(char16_t)(targetUniChar&0x3ff));
3448
0
                }
3449
3450
0
            }
3451
0
            else{
3452
                /* Call the callback function*/
3453
0
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3454
0
                break;
3455
0
            }
3456
0
        }
3457
0
        else{
            /* the target buffer is full while input remains */
3458
0
            *err =U_BUFFER_OVERFLOW_ERROR;
3459
0
            break;
3460
0
        }
3461
0
    }
3462
0
endloop:
    /* write the advanced positions back into the caller's argument block */
3463
0
    args->target = myTarget;
3464
0
    args->source = mySource;
3465
0
}
3466
#endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3467
3468
static void U_CALLCONV
3469
9.50M
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3470
9.50M
    UConverter *cnv = args->converter;
3471
9.50M
    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3472
9.50M
    ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3473
9.50M
    char *p, *subchar;
3474
9.50M
    char buffer[8];
3475
9.50M
    int32_t length;
3476
3477
9.50M
    subchar=(char *)cnv->subChars;
3478
9.50M
    length=cnv->subCharLen; /* assume length==1 for most variants */
3479
3480
9.50M
    p = buffer;
3481
9.50M
    switch(myConverterData->locale[0]){
3482
566k
    case 'j':
3483
566k
        {
3484
566k
            int8_t cs;
3485
3486
566k
            if(pFromU2022State->g == 1) {
3487
                /* JIS7: switch from G1 to G0 */
3488
220
                pFromU2022State->g = 0;
3489
220
                *p++ = UCNV_SI;
3490
220
            }
3491
3492
566k
            cs = pFromU2022State->cs[0];
3493
566k
            if(cs != ASCII && cs != JISX201) {
3494
                /* not in ASCII or JIS X 0201: switch to ASCII */
3495
492k
                pFromU2022State->cs[0] = (int8_t)ASCII;
3496
492k
                *p++ = '\x1b';
3497
492k
                *p++ = '\x28';
3498
492k
                *p++ = '\x42';
3499
492k
            }
3500
3501
566k
            *p++ = subchar[0];
3502
566k
            break;
3503
0
        }
3504
582k
    case 'c':
3505
582k
        if(pFromU2022State->g != 0) {
3506
            /* not in ASCII mode: switch to ASCII */
3507
10.8k
            pFromU2022State->g = 0;
3508
10.8k
            *p++ = UCNV_SI;
3509
10.8k
        }
3510
582k
        *p++ = subchar[0];
3511
582k
        break;
3512
8.35M
    case 'k':
3513
8.35M
        if(myConverterData->version == 0) {
3514
6.86M
            if(length == 1) {
3515
6.86M
                if(args->converter->fromUnicodeStatus) {
3516
                    /* in DBCS mode: switch to SBCS */
3517
6.83M
                    args->converter->fromUnicodeStatus = 0;
3518
6.83M
                    *p++ = UCNV_SI;
3519
6.83M
                }
3520
6.86M
                *p++ = subchar[0];
3521
6.86M
            } else /* length == 2*/ {
3522
0
                if(!args->converter->fromUnicodeStatus) {
3523
                    /* in SBCS mode: switch to DBCS */
3524
0
                    args->converter->fromUnicodeStatus = 1;
3525
0
                    *p++ = UCNV_SO;
3526
0
                }
3527
0
                *p++ = subchar[0];
3528
0
                *p++ = subchar[1];
3529
0
            }
3530
6.86M
            break;
3531
6.86M
        } else {
3532
            /* save the subconverter's substitution string */
3533
1.49M
            uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3534
1.49M
            int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3535
3536
            /* set our substitution string into the subconverter */
3537
1.49M
            myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3538
1.49M
            myConverterData->currentConverter->subCharLen = (int8_t)length;
3539
3540
            /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3541
1.49M
            args->converter = myConverterData->currentConverter;
3542
1.49M
            myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3543
1.49M
            ucnv_cbFromUWriteSub(args, 0, err);
3544
1.49M
            cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3545
1.49M
            args->converter = cnv;
3546
3547
            /* restore the subconverter's substitution string */
3548
1.49M
            myConverterData->currentConverter->subChars = currentSubChars;
3549
1.49M
            myConverterData->currentConverter->subCharLen = currentSubCharLen;
3550
3551
1.49M
            if(*err == U_BUFFER_OVERFLOW_ERROR) {
3552
105
                if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3553
105
                    uprv_memcpy(
3554
105
                        cnv->charErrorBuffer,
3555
105
                        myConverterData->currentConverter->charErrorBuffer,
3556
105
                        myConverterData->currentConverter->charErrorBufferLength);
3557
105
                }
3558
105
                cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3559
105
                myConverterData->currentConverter->charErrorBufferLength = 0;
3560
105
            }
3561
1.49M
            return;
3562
1.49M
        }
3563
0
    default:
3564
        /* not expected */
3565
0
        break;
3566
9.50M
    }
3567
8.00M
    ucnv_cbFromUWriteBytes(args,
3568
8.00M
                           buffer, (int32_t)(p - buffer),
3569
8.00M
                           offsetIndex, err);
3570
8.00M
}
3571
3572
/*
3573
 * Structure for cloning an ISO 2022 converter into a single memory block.
3574
 */
3575
struct cloneStruct
3576
{
3577
    /* the clone itself; _ISO_2022_SafeClone() returns the address of this member */
    UConverter cnv;
3578
    /* storage handed to ucnv_safeClone() for the clone of the original's currentConverter */
    UConverter currentConverter;
3579
    /* copy of the original's UConverterDataISO2022; cnv.extraInfo is pointed at it */
    UConverterDataISO2022 mydata;
3580
};
3581
3582
3583
U_CDECL_BEGIN
3584
3585
/*
 * Clones the ISO-2022 specific part of a converter into stackBuffer,
 * which is laid out as a struct cloneStruct.
 *
 * - Returns nullptr immediately if *status already indicates a failure.
 * - If *pBufferSize is 0 this is a preflight request: the required size is
 *   stored in *pBufferSize and nullptr is returned.
 * - Otherwise the extra data is copied, currentConverter (if any) is cloned
 *   into the embedded UConverter, and the reference count of every non-null
 *   entry in myConverterArray[] is incremented because the clone shares them.
 *
 * Returns a pointer to the cloned UConverter, or nullptr if cloning the
 * current subconverter failed.
 */
static UConverter * U_CALLCONV
_ISO_2022_SafeClone(
            const UConverter *cnv,
            void *stackBuffer,
            int32_t *pBufferSize,
            UErrorCode *status)
{
    if (U_FAILURE(*status)) {
        return nullptr;
    }

    if (*pBufferSize == 0) {
        /* preflighting: only report how much memory is needed */
        *pBufferSize = (int32_t)sizeof(struct cloneStruct);
        return nullptr;
    }

    UConverterDataISO2022 *const original = (UConverterDataISO2022 *)cnv->extraInfo;
    struct cloneStruct *const block = (struct cloneStruct *)stackBuffer;

    /* ucnv.c/ucnv_safeClone() copied the main UConverter already */

    uprv_memcpy(&block->mydata, original, sizeof(UConverterDataISO2022));
    block->cnv.extraInfo = &block->mydata; /* the clone owns its extra data in place */
    block->cnv.isExtraLocal = true;

    /* the current subconverter gets its own clone inside the same block */
    if (original->currentConverter != nullptr) {
        int32_t innerSize = (int32_t)sizeof(UConverter);
        block->mydata.currentConverter =
            ucnv_safeClone(original->currentConverter,
                           &block->currentConverter,
                           &innerSize, status);
        if (U_FAILURE(*status)) {
            return nullptr;
        }
    }

    /* the shared-data subconverters are shared, so each one gains a reference */
    for (int32_t slot = 0; slot < UCNV_2022_MAX_CONVERTERS; ++slot) {
        if (original->myConverterArray[slot] != nullptr) {
            ucnv_incrementRefCount(original->myConverterArray[slot]);
        }
    }

    return &block->cnv;
}
3635
3636
U_CDECL_END
3637
3638
static void U_CALLCONV
3639
_ISO_2022_GetUnicodeSet(const UConverter *cnv,
3640
                    const USetAdder *sa,
3641
                    UConverterUnicodeSet which,
3642
                    UErrorCode *pErrorCode)
3643
0
{
3644
0
    int32_t i;
3645
0
    UConverterDataISO2022* cnvData;
3646
3647
0
    if (U_FAILURE(*pErrorCode)) {
3648
0
        return;
3649
0
    }
3650
#ifdef U_ENABLE_GENERIC_ISO_2022
3651
    if (cnv->sharedData == &_ISO2022Data) {
3652
        /* We use UTF-8 in this case */
3653
        sa->addRange(sa->set, 0, 0xd7FF);
3654
        sa->addRange(sa->set, 0xE000, 0x10FFFF);
3655
        return;
3656
    }
3657
#endif
3658
3659
0
    cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3660
3661
    /* open a set and initialize it with code points that are algorithmically round-tripped */
3662
0
    switch(cnvData->locale[0]){
3663
0
    case 'j':
3664
        /* include JIS X 0201 which is hardcoded */
3665
0
        sa->add(sa->set, 0xa5);
3666
0
        sa->add(sa->set, 0x203e);
3667
0
        if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3668
            /* include Latin-1 for some variants of JP */
3669
0
            sa->addRange(sa->set, 0, 0xff);
3670
0
        } else {
3671
            /* include ASCII for JP */
3672
0
            sa->addRange(sa->set, 0, 0x7f);
3673
0
        }
3674
0
        if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3675
            /*
3676
             * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3677
             * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3678
             * use half-width Katakana.
3679
             * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3680
             * half-width Katakana via the ESC ( I sequence.
3681
             * However, we only emit (fromUnicode) half-width Katakana according to the
3682
             * definition of each variant.
3683
             *
3684
             * When including fallbacks,
3685
             * we need to include half-width Katakana Unicode code points for all JP variants because
3686
             * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3687
             */
3688
            /* include half-width Katakana for JP */
3689
0
            sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3690
0
        }
3691
0
        break;
3692
0
#if !UCONFIG_ONLY_HTML_CONVERSION
3693
0
    case 'c':
3694
0
    case 'z':
3695
        /* include ASCII for CN */
3696
0
        sa->addRange(sa->set, 0, 0x7f);
3697
0
        break;
3698
0
    case 'k':
3699
        /* there is only one converter for KR, and it is not in the myConverterArray[] */
3700
0
        cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3701
0
                cnvData->currentConverter, sa, which, pErrorCode);
3702
        /* the loop over myConverterArray[] will simply not find another converter */
3703
0
        break;
3704
0
#endif
3705
0
    default:
3706
0
        break;
3707
0
    }
3708
3709
#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3710
            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3711
                cnvData->version==0 && i==CNS_11643
3712
            ) {
3713
                /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3714
                ucnv_MBCSGetUnicodeSetForBytes(
3715
                        cnvData->myConverterArray[i],
3716
                        sa, UCNV_ROUNDTRIP_SET,
3717
                        0, 0x81, 0x82,
3718
                        pErrorCode);
3719
            }
3720
#endif
3721
3722
0
    for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3723
0
        UConverterSetFilter filter;
3724
0
        if(cnvData->myConverterArray[i]!=nullptr) {
3725
0
            if(cnvData->locale[0]=='j' && i==JISX208) {
3726
                /*
3727
                 * Only add code points that map to Shift-JIS codes
3728
                 * corresponding to JIS X 0208.
3729
                 */
3730
0
                filter=UCNV_SET_FILTER_SJIS;
3731
0
#if !UCONFIG_ONLY_HTML_CONVERSION
3732
0
            } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3733
0
                       cnvData->version==0 && i==CNS_11643) {
3734
                /*
3735
                 * Version-specific for CN:
3736
                 * CN version 0 does not map CNS planes 3..7 although
3737
                 * they are all available in the CNS conversion table;
3738
                 * CN version 1 (-EXT) does map them all.
3739
                 * The two versions create different Unicode sets.
3740
                 */
3741
0
                filter=UCNV_SET_FILTER_2022_CN;
3742
0
            } else if(i==KSC5601) {
3743
                /*
3744
                 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3745
                 * are broader than GR94.
3746
                 */
3747
0
                filter=UCNV_SET_FILTER_GR94DBCS;
3748
0
#endif
3749
0
            } else {
3750
0
                filter=UCNV_SET_FILTER_NONE;
3751
0
            }
3752
0
            ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3753
0
        }
3754
0
    }
3755
3756
    /*
3757
     * ISO 2022 converters must not convert SO/SI/ESC despite what
3758
     * sub-converters do by themselves.
3759
     * Remove these characters from the set.
3760
     */
3761
0
    sa->remove(sa->set, 0x0e);
3762
0
    sa->remove(sa->set, 0x0f);
3763
0
    sa->remove(sa->set, 0x1b);
3764
3765
    /* ISO 2022 converters do not convert C1 controls either */
3766
0
    sa->removeRange(sa->set, 0x80, 0x9f);
3767
0
}
3768
3769
/*
 * Implementation table for the generic ISO-2022 converter.
 * The four conversion slots are filled only when U_ENABLE_GENERIC_ISO_2022
 * is defined (generic toUnicode, UTF-8 fromUnicode); otherwise they are nullptr.
 * NOTE(review): the slot order is positional and presumably follows
 * UConverterImpl in ucnv_cnv.h -- confirm before reordering anything.
 */
static const UConverterImpl _ISO2022Impl={
3770
    UCNV_ISO_2022,
3771
3772
    nullptr,
3773
    nullptr,
3774
3775
    _ISO2022Open,
3776
    _ISO2022Close,
3777
    _ISO2022Reset,
3778
3779
#ifdef U_ENABLE_GENERIC_ISO_2022
3780
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3781
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3782
    ucnv_fromUnicode_UTF8,
3783
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3784
#else
3785
    nullptr,
3786
    nullptr,
3787
    nullptr,
3788
    nullptr,
3789
#endif
3790
    nullptr,
3791
3792
    nullptr,
3793
    _ISO2022getName,
3794
    _ISO_2022_WriteSub,
3795
    _ISO_2022_SafeClone,
3796
    _ISO_2022_GetUnicodeSet,
3797
3798
    nullptr,
3799
    nullptr
3800
};
3801
/*
 * Static description of the generic "ISO_2022" converter.
 * NOTE(review): the initializer is positional; the { 0x1a, 0, 0, 0 } / 1 pair
 * is presumably the substitution bytes and their length -- confirm against
 * UConverterStaticData before editing.
 */
static const UConverterStaticData _ISO2022StaticData={
3802
    sizeof(UConverterStaticData),
3803
    "ISO_2022",
3804
    2022,
3805
    UCNV_IBM,
3806
    UCNV_ISO_2022,
3807
    1,
3808
    3, /* max 3 bytes per char16_t from UTF-8 (4 bytes from surrogate _pair_) */
3809
    { 0x1a, 0, 0, 0 },
3810
    1,
3811
    false,
3812
    false,
3813
    0,
3814
    0,
3815
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3816
};
3817
/*
 * Shared-data object for the generic ISO-2022 converter, combining
 * _ISO2022StaticData with _ISO2022Impl. Unlike the JP/KR/CN objects it is
 * not file-local (no static, no anonymous namespace).
 */
const UConverterSharedData _ISO2022Data=
3818
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl);
3819
3820
/*************JP****************/
3821
/*
 * Implementation table for ISO-2022-JP.
 * The JP toUnicode and fromUnicode functions each fill two consecutive slots
 * (presumably the plain and the with-offsets entry points -- confirm against
 * UConverterImpl); everything else is shared with the other ISO-2022 variants.
 */
static const UConverterImpl _ISO2022JPImpl={
3822
    UCNV_ISO_2022,
3823
3824
    nullptr,
3825
    nullptr,
3826
3827
    _ISO2022Open,
3828
    _ISO2022Close,
3829
    _ISO2022Reset,
3830
3831
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3832
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3833
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3834
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3835
    nullptr,
3836
3837
    nullptr,
3838
    _ISO2022getName,
3839
    _ISO_2022_WriteSub,
3840
    _ISO_2022_SafeClone,
3841
    _ISO_2022_GetUnicodeSet,
3842
3843
    nullptr,
3844
    nullptr
3845
};
3846
/*
 * Static description of the "ISO_2022_JP" converter.
 * NOTE(review): positional initializer; field meanings other than the
 * commented maximum byte count should be checked against UConverterStaticData.
 */
static const UConverterStaticData _ISO2022JPStaticData={
3847
    sizeof(UConverterStaticData),
3848
    "ISO_2022_JP",
3849
    0,
3850
    UCNV_IBM,
3851
    UCNV_ISO_2022,
3852
    1,
3853
    6, /* max 6 bytes per char16_t: 4-byte escape sequence + DBCS */
3854
    { 0x1a, 0, 0, 0 },
3855
    1,
3856
    false,
3857
    false,
3858
    0,
3859
    0,
3860
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3861
};
3862
3863
/* File-local shared-data object for ISO-2022-JP: static data plus implementation table. */
namespace {
3864
3865
const UConverterSharedData _ISO2022JPData=
3866
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl);
3867
3868
}  // namespace
3869
3870
#if !UCONFIG_ONLY_HTML_CONVERSION
3871
/************* KR ***************/
3872
/*
 * Implementation table for ISO-2022-KR.
 * The KR toUnicode and fromUnicode functions each fill two consecutive slots
 * (presumably the plain and the with-offsets entry points -- confirm against
 * UConverterImpl); everything else is shared with the other ISO-2022 variants.
 */
static const UConverterImpl _ISO2022KRImpl={
3873
    UCNV_ISO_2022,
3874
3875
    nullptr,
3876
    nullptr,
3877
3878
    _ISO2022Open,
3879
    _ISO2022Close,
3880
    _ISO2022Reset,
3881
3882
    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3883
    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3884
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3885
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3886
    nullptr,
3887
3888
    nullptr,
3889
    _ISO2022getName,
3890
    _ISO_2022_WriteSub,
3891
    _ISO_2022_SafeClone,
3892
    _ISO_2022_GetUnicodeSet,
3893
3894
    nullptr,
3895
    nullptr
3896
};
3897
/*
 * Static description of the "ISO_2022_KR" converter.
 * NOTE(review): positional initializer; field meanings other than the
 * commented maximum byte count should be checked against UConverterStaticData.
 */
static const UConverterStaticData _ISO2022KRStaticData={
3898
    sizeof(UConverterStaticData),
3899
    "ISO_2022_KR",
3900
    0,
3901
    UCNV_IBM,
3902
    UCNV_ISO_2022,
3903
    1,
3904
    8, /* max 8 bytes per char16_t */
3905
    { 0x1a, 0, 0, 0 },
3906
    1,
3907
    false,
3908
    false,
3909
    0,
3910
    0,
3911
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3912
};
3913
3914
/* File-local shared-data object for ISO-2022-KR: static data plus implementation table. */
namespace {
3915
3916
const UConverterSharedData _ISO2022KRData=
3917
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl);
3918
3919
}  // namespace
3920
3921
/*************** CN ***************/
3922
/*
 * Implementation table for ISO-2022-CN.
 * The CN toUnicode and fromUnicode functions each fill two consecutive slots
 * (presumably the plain and the with-offsets entry points -- confirm against
 * UConverterImpl); everything else is shared with the other ISO-2022 variants.
 */
static const UConverterImpl _ISO2022CNImpl={
3923
3924
    UCNV_ISO_2022,
3925
3926
    nullptr,
3927
    nullptr,
3928
3929
    _ISO2022Open,
3930
    _ISO2022Close,
3931
    _ISO2022Reset,
3932
3933
    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3934
    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3935
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3936
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3937
    nullptr,
3938
3939
    nullptr,
3940
    _ISO2022getName,
3941
    _ISO_2022_WriteSub,
3942
    _ISO_2022_SafeClone,
3943
    _ISO_2022_GetUnicodeSet,
3944
3945
    nullptr,
3946
    nullptr
3947
};
3948
/*
 * Static description of the "ISO_2022_CN" converter.
 * NOTE(review): positional initializer; field meanings other than the
 * commented maximum byte count should be checked against UConverterStaticData.
 */
static const UConverterStaticData _ISO2022CNStaticData={
3949
    sizeof(UConverterStaticData),
3950
    "ISO_2022_CN",
3951
    0,
3952
    UCNV_IBM,
3953
    UCNV_ISO_2022,
3954
    1,
3955
    8, /* max 8 bytes per char16_t: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3956
    { 0x1a, 0, 0, 0 },
3957
    1,
3958
    false,
3959
    false,
3960
    0,
3961
    0,
3962
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3963
};
3964
3965
/* File-local shared-data object for ISO-2022-CN: static data plus implementation table. */
namespace {
3966
3967
const UConverterSharedData _ISO2022CNData=
3968
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl);
3969
3970
}  // namespace
3971
#endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3972
3973
#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */