Coverage Report

Created: 2026-06-13 06:44

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/source/common/ucnv2022.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
**********************************************************************
5
*   Copyright (C) 2000-2016, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
**********************************************************************
8
*   file name:  ucnv2022.cpp
9
*   encoding:   UTF-8
10
*   tab size:   8 (not used)
11
*   indentation:4
12
*
13
*   created on: 2000feb03
14
*   created by: Markus W. Scherer
15
*
16
*   Change history:
17
*
18
*   06/29/2000  helena  Major rewrite of the callback APIs.
19
*   08/08/2000  Ram     Included support for ISO-2022-JP-2
20
*                       Changed implementation of toUnicode
21
*                       function
22
*   08/21/2000  Ram     Added support for ISO-2022-KR
23
*   08/29/2000  Ram     Separated implementation of EBCDIC to
24
*                       ucnvebdc.c
25
*   09/20/2000  Ram     Added support for ISO-2022-CN
26
*                       Added implementations for getNextUChar()
27
*                       for specific 2022 country variants.
28
*   10/31/2000  Ram     Implemented offsets logic functions
29
*/
30
31
#include "unicode/utypes.h"
32
33
#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
34
35
#include "unicode/ucnv.h"
36
#include "unicode/uset.h"
37
#include "unicode/ucnv_err.h"
38
#include "unicode/ucnv_cb.h"
39
#include "unicode/utf16.h"
40
#include "ucnv_imp.h"
41
#include "ucnv_bld.h"
42
#include "ucnv_cnv.h"
43
#include "ucnvmbcs.h"
44
#include "cstring.h"
45
#include "cmemory.h"
46
#include "uassert.h"
47
48
#ifdef U_ENABLE_GENERIC_ISO_2022
49
/*
50
 * I am disabling the generic ISO-2022 converter after proposing to do so on
51
 * the icu mailing list two days ago.
52
 *
53
 * Reasons:
54
 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
55
 *    its designation sequences, single shifts with return to the previous state,
56
 *    switch-with-no-return to UTF-16BE or similar, etc.
57
 *    This is unlike the language-specific variants like ISO-2022-JP which
58
 *    require a much smaller repertoire of ISO-2022 features.
59
 *    These variants continue to be supported.
60
 * 2. I believe that no one is really using the generic ISO-2022 converter
61
 *    but rather always one of the language-specific variants.
62
 *    Note that ICU's generic ISO-2022 converter has always output one escape
63
 *    sequence followed by UTF-8 for the whole stream.
64
 * 3. Switching between subcharsets is extremely slow, because each time
65
 *    the previous converter is closed and a new one opened,
66
 *    without any kind of caching, least-recently-used list, etc.
67
 * 4. The code is currently buggy, and given the above it does not seem
68
 *    reasonable to spend the time on maintenance.
69
 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
70
 *    This means, for example, that when ISO-8859-7 is designated, the following
71
 *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
72
 *    The ICU ISO-2022 converter does not handle this - and has no information
73
 *    about which subconverter would have to be shifted vs. which is designed
74
 *    for 7-bit ISO-2022.
75
 *
76
 * Markus Scherer 2003-dec-03
77
 */
78
#endif
79
80
#if !UCONFIG_ONLY_HTML_CONVERSION
81
static const char SHIFT_IN_STR[]  = "\x0F";
82
// static const char SHIFT_OUT_STR[] = "\x0E";
83
#endif
84
85
4.70k
#define CR      0x0D
86
12.0k
#define LF      0x0A
87
#define H_TAB   0x09
88
#define V_TAB   0x0B
89
#define SPACE   0x20
90
91
enum {
92
    HWKANA_START=0xff61,
93
    HWKANA_END=0xff9f
94
};
95
96
/*
97
 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
98
 * as bytes 21..7E. (Subtract 0x80.)
99
 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
100
 * as bytes 20..7F. (Subtract 0x80.)
101
 * Do not encode C1 control codes with native bytes 80..9F
102
 * as bytes 00..1F (C0 control codes).
103
 */
104
enum {
105
    GR94_START=0xa1,
106
    GR94_END=0xfe,
107
    GR96_START=0xa0,
108
    GR96_END=0xff
109
};
110
111
/*
112
 * ISO 2022 control codes must not be converted from Unicode
113
 * because they would mess up the byte stream.
114
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
115
 * corresponding to SO, SI, and ESC.
116
 */
117
47.4k
#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
118
119
/* for ISO-2022-JP and -CN implementations */
120
typedef enum  {
121
        /* shared values */
122
        INVALID_STATE=-1,
123
        ASCII = 0,
124
125
        SS2_STATE=0x10,
126
        SS3_STATE,
127
128
        /* JP */
129
        ISO8859_1 = 1 ,
130
        ISO8859_7 = 2 ,
131
        JISX201  = 3,
132
        JISX208 = 4,
133
        JISX212 = 5,
134
        GB2312  =6,
135
        KSC5601 =7,
136
        HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */
137
138
        /* CN */
139
        /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
140
        GB2312_1=1,
141
        ISO_IR_165=2,
142
        CNS_11643=3,
143
144
        /*
145
         * these are used in StateEnum and ISO2022State variables,
146
         * but CNS_11643 must be used to index into myConverterArray[]
147
         */
148
        CNS_11643_0=0x20,
149
        CNS_11643_1,
150
        CNS_11643_2,
151
        CNS_11643_3,
152
        CNS_11643_4,
153
        CNS_11643_5,
154
        CNS_11643_6,
155
        CNS_11643_7
156
} StateEnum;
157
158
/* is the StateEnum charset value for a DBCS charset? */
159
#if UCONFIG_ONLY_HTML_CONVERSION
160
#define IS_JP_DBCS(cs) (JISX208==(cs))
161
#else
162
0
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
163
#endif
164
165
24
#define CSM(cs) ((uint16_t)1<<(cs))
166
167
/*
168
 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
169
 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
170
 *
171
 * Note: The converter uses some leniency:
172
 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
173
 *   all versions, not just JIS7 and JIS8.
174
 * - ICU does not distinguish between different versions of JIS X 0208.
175
 */
176
#if UCONFIG_ONLY_HTML_CONVERSION
177
enum { MAX_JA_VERSION=0 };
178
#else
179
enum { MAX_JA_VERSION=4 };
180
#endif
181
static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
182
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
183
#if !UCONFIG_ONLY_HTML_CONVERSION
184
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
185
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
186
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
187
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
188
#endif
189
};
190
191
typedef enum {
192
        ASCII1=0,
193
        LATIN1,
194
        SBCS,
195
        DBCS,
196
        MBCS,
197
        HWKANA
198
}Cnv2022Type;
199
200
typedef struct ISO2022State {
201
    int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
202
    int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
203
    int8_t prevG;       /* g before single shift (SS2 or SS3) */
204
} ISO2022State;
205
206
2.02k
#define UCNV_OPTIONS_VERSION_MASK 0xf
207
22.2k
#define UCNV_2022_MAX_CONVERTERS 10
208
209
typedef struct{
210
    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
211
    UConverter *currentConverter;
212
    Cnv2022Type currentType;
213
    ISO2022State toU2022State, fromU2022State;
214
    uint32_t key;
215
    uint32_t version;
216
#ifdef U_ENABLE_GENERIC_ISO_2022
217
    UBool isFirstBuffer;
218
#endif
219
    UBool isEmptySegment;
220
    char name[30];
221
    char locale[3];
222
}UConverterDataISO2022;
223
224
/* Protos */
225
/* ISO-2022 ----------------------------------------------------------------- */
226
227
/*Forward declaration */
228
U_CFUNC void U_CALLCONV
229
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
230
                      UErrorCode * err);
231
U_CFUNC void U_CALLCONV
232
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
233
                                    UErrorCode * err);
234
235
17.2k
#define ESC_2022 0x1B /*ESC*/
236
237
typedef enum
238
{
239
        INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
240
        VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
241
        VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
242
        VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
243
} UCNV_TableStates_2022;
244
245
/*
246
* The way these state transition arrays work is:
247
* ex : ESC$B is the sequence for JISX208
248
*      a) First Iteration: char is ESC
249
*          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
250
*             int x = normalize_esq_chars_2022[27] which is equal to 1
251
*         ii) Search for this value in escSeqStateTable_Key_2022[]
252
*             value of x is stored at escSeqStateTable_Key_2022[0]
253
*        iii) Save this index as offset
254
*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
255
*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
256
*     b) Switch on this state and continue to next char
257
*          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
258
*             which is normalize_esq_chars_2022[36] == 4
259
*         ii) x is currently 1(from above)
260
*               x<<=5 -- x is now 32
261
*               x+=normalize_esq_chars_2022[36]
262
*               now x is 36
263
*        iii) Search for this value in escSeqStateTable_Key_2022[]
264
*             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
265
*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
266
*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
267
*     c) Switch on this state and continue to next char
268
*        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
269
*        ii) x is currently 36 (from above)
270
*            x<<=5 -- x is now 1152
271
*            x+=normalize_esq_chars_2022[66]
272
*            now x is 1161
273
*       iii) Search for this value in escSeqStateTable_Key_2022[]
274
*            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
275
*        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
276
*            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
277
*         v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
278
*/
279
280
281
/*Below are the 3 arrays depicting a state transition table*/
282
static const int8_t normalize_esq_chars_2022[256] = {
283
/*       0      1       2       3       4      5       6        7       8       9           */
284
285
         0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
286
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
287
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
288
        ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
289
        ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
290
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
291
        ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
292
        ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
293
        ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
294
        ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
295
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
296
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
297
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
298
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
299
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
300
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
301
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
302
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
303
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
304
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
305
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
306
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
307
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
308
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
309
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
310
        ,0     ,0      ,0      ,0      ,0      ,0
311
};
312
313
#ifdef U_ENABLE_GENERIC_ISO_2022
314
/*
315
 * When the generic ISO-2022 converter is completely removed, not just disabled
316
 * per #ifdef, then the following state table and the associated tables that are
317
 * dimensioned with MAX_STATES_2022 should be trimmed.
318
 *
319
 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
320
 * the associated escape sequences starting with ESC ( B should be removed.
321
 * This includes the ones with key values 1097 and all of the ones above 1000000.
322
 *
323
 * For the latter, the tables can simply be truncated.
324
 * For the former, since the tables must be kept parallel, it is probably best
325
 * to simply duplicate an adjacent table cell, parallel in all tables.
326
 *
327
 * It may make sense to restructure the tables, especially by using small search
328
 * tables for the variants instead of indexing them parallel to the table here.
329
 */
330
#endif
331
332
45.1k
#define MAX_STATES_2022 74
333
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
334
/*   0           1           2           3           4           5           6           7           8           9           */
335
336
     1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
337
    ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
338
    ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
339
    ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
340
    ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
341
    ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
342
    ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
343
    ,35947631   ,35947635   ,35947636   ,35947638
344
};
345
346
#ifdef U_ENABLE_GENERIC_ISO_2022
347
348
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
349
 /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */
350
351
     nullptr                   ,nullptr                   ,nullptr                   ,nullptr               ,nullptr               ,nullptr                   ,nullptr                   ,nullptr                   ,"latin1"               ,"latin1"
352
    ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
353
    ,"latin1"               ,nullptr                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,nullptr                   ,nullptr                   ,nullptr                   ,nullptr                   ,"UTF8"
354
    ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,nullptr               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
355
    ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
356
    ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
357
    ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,nullptr               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
358
    ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
359
};
360
361
#endif
362
363
static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
364
/*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
365
     VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
366
    ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
367
    ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
368
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
369
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
370
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
371
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
372
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
373
};
374
375
/* Type def for refactoring changeState_2022 code*/
376
typedef enum{
377
#ifdef U_ENABLE_GENERIC_ISO_2022
378
    ISO_2022=0,
379
#endif
380
    ISO_2022_JP=1,
381
#if !UCONFIG_ONLY_HTML_CONVERSION
382
    ISO_2022_KR=2,
383
    ISO_2022_CN=3
384
#endif
385
} Variant2022;
386
387
/*********** ISO 2022 Converter Protos ***********/
388
static void U_CALLCONV
389
_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
390
391
static void U_CALLCONV
392
 _ISO2022Close(UConverter *converter);
393
394
static void U_CALLCONV
395
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
396
397
U_CDECL_BEGIN
398
static const char * U_CALLCONV
399
_ISO2022getName(const UConverter* cnv);
400
U_CDECL_END
401
402
static void  U_CALLCONV
403
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
404
405
U_CDECL_BEGIN
406
static UConverter * U_CALLCONV
407
_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
408
409
U_CDECL_END
410
411
#ifdef U_ENABLE_GENERIC_ISO_2022
412
static void U_CALLCONV
413
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
414
#endif
415
416
namespace {
417
418
/*const UConverterSharedData _ISO2022Data;*/
419
extern const UConverterSharedData _ISO2022JPData;
420
421
#if !UCONFIG_ONLY_HTML_CONVERSION
422
extern const UConverterSharedData _ISO2022KRData;
423
extern const UConverterSharedData _ISO2022CNData;
424
#endif
425
426
}  // namespace
427
428
/*************** Converter implementations ******************/
429
430
/* The purpose of this function is to get around gcc compiler warnings. */
431
static inline void
432
fromUWriteUInt8(UConverter *cnv,
433
                 const char *bytes, int32_t length,
434
                 uint8_t **target, const char *targetLimit,
435
                 int32_t **offsets,
436
                 int32_t sourceIndex,
437
                 UErrorCode *pErrorCode)
438
0
{
439
0
    char* targetChars = reinterpret_cast<char*>(*target);
440
0
    ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
441
0
                         offsets, sourceIndex, pErrorCode);
442
0
    *target = reinterpret_cast<uint8_t*>(targetChars);
443
444
0
}
445
446
static inline void
447
1
setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
448
1
    if(myConverterData->version == 1) {
449
0
        UConverter *cnv = myConverterData->currentConverter;
450
451
0
        cnv->toUnicodeStatus=0;     /* offset */
452
0
        cnv->mode=0;                /* state */
453
0
        cnv->toULength=0;           /* byteIndex */
454
0
    }
455
1
}
456
457
static inline void
458
1
setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
459
   /* in ISO-2022-KR the designator sequence appears only once
460
    * in a file so we append it only once
461
    */
462
1
    if( converter->charErrorBufferLength==0){
463
464
1
        converter->charErrorBufferLength = 4;
465
1
        converter->charErrorBuffer[0] = 0x1b;
466
1
        converter->charErrorBuffer[1] = 0x24;
467
1
        converter->charErrorBuffer[2] = 0x29;
468
1
        converter->charErrorBuffer[3] = 0x43;
469
1
    }
470
1
    if(myConverterData->version == 1) {
471
0
        UConverter *cnv = myConverterData->currentConverter;
472
473
0
        cnv->fromUChar32=0;
474
0
        cnv->fromUnicodeStatus=1;   /* prevLength */
475
0
    }
476
1
}
477
478
static void U_CALLCONV
479
2.02k
_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
480
481
2.02k
    char myLocale[7]={' ',' ',' ',' ',' ',' ', '\0'};
482
483
2.02k
    cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
484
2.02k
    if(cnv->extraInfo != nullptr) {
485
2.02k
        UConverterNamePieces stackPieces;
486
2.02k
        UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
487
2.02k
        UConverterDataISO2022* myConverterData = static_cast<UConverterDataISO2022*>(cnv->extraInfo);
488
2.02k
        uint32_t version;
489
490
2.02k
        stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
491
492
2.02k
        uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
493
2.02k
        myConverterData->currentType = ASCII1;
494
2.02k
        cnv->fromUnicodeStatus =false;
495
2.02k
        if(pArgs->locale){
496
2.02k
            uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)-1);
497
2.02k
        }
498
2.02k
        version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
499
2.02k
        myConverterData->version = version;
500
2.02k
        if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
501
6
            (myLocale[2]=='_' || myLocale[2]=='\0'))
502
6
        {
503
            /* open the required converters and cache them */
504
6
            if(version>MAX_JA_VERSION) {
505
                // ICU 55 fails to open a converter for an unsupported version.
506
                // Previously, it fell back to version 0, but that would yield
507
                // unexpected behavior.
508
0
                *errorCode = U_MISSING_RESOURCE_ERROR;
509
0
                return;
510
0
            }
511
6
            if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
512
4
                myConverterData->myConverterArray[ISO8859_7] =
513
4
                    ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
514
4
            }
515
6
            myConverterData->myConverterArray[JISX208] =
516
6
                ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
517
6
            if(jpCharsetMasks[version]&CSM(JISX212)) {
518
5
                myConverterData->myConverterArray[JISX212] =
519
5
                    ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
520
5
            }
521
6
            if(jpCharsetMasks[version]&CSM(GB2312)) {
522
4
                myConverterData->myConverterArray[GB2312] =
523
4
                    ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);   /* gb_2312_80-1 */
524
4
            }
525
6
            if(jpCharsetMasks[version]&CSM(KSC5601)) {
526
4
                myConverterData->myConverterArray[KSC5601] =
527
4
                    ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
528
4
            }
529
530
            /* set the function pointers to appropriate functions */
531
6
            cnv->sharedData = const_cast<UConverterSharedData*>(&_ISO2022JPData);
532
6
            uprv_strcpy(myConverterData->locale,"ja");
533
534
6
            (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
535
6
            size_t len = uprv_strlen(myConverterData->name);
536
6
            myConverterData->name[len] = static_cast<char>(myConverterData->version + static_cast<int>('0'));
537
6
            myConverterData->name[len+1]='\0';
538
6
        }
539
2.01k
#if !UCONFIG_ONLY_HTML_CONVERSION
540
2.01k
        else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
541
1
            (myLocale[2]=='_' || myLocale[2]=='\0'))
542
1
        {
543
1
            if(version>1) {
544
                // ICU 55 fails to open a converter for an unsupported version.
545
                // Previously, it fell back to version 0, but that would yield
546
                // unexpected behavior.
547
0
                *errorCode = U_MISSING_RESOURCE_ERROR;
548
0
                return;
549
0
            }
550
1
            const char *cnvName;
551
1
            if(version==1) {
552
0
                cnvName="icu-internal-25546";
553
1
            } else {
554
1
                cnvName="ibm-949";
555
1
                myConverterData->version=version=0;
556
1
            }
557
1
            if(pArgs->onlyTestIsLoadable) {
558
0
                ucnv_canCreateConverter(cnvName, errorCode);  /* errorCode carries result */
559
0
                uprv_free(cnv->extraInfo);
560
0
                cnv->extraInfo=nullptr;
561
0
                return;
562
1
            } else {
563
1
                myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
564
1
                if (U_FAILURE(*errorCode)) {
565
0
                    _ISO2022Close(cnv);
566
0
                    return;
567
0
                }
568
569
1
                if(version==1) {
570
0
                    (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
571
0
                    uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
572
0
                    cnv->subCharLen = myConverterData->currentConverter->subCharLen;
573
1
                }else{
574
1
                    (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
575
1
                }
576
577
                /* initialize the state variables */
578
1
                setInitialStateToUnicodeKR(cnv, myConverterData);
579
1
                setInitialStateFromUnicodeKR(cnv, myConverterData);
580
581
                /* set the function pointers to appropriate functions */
582
1
                cnv->sharedData = const_cast<UConverterSharedData*>(&_ISO2022KRData);
583
1
                uprv_strcpy(myConverterData->locale,"ko");
584
1
            }
585
1
        }
586
2.01k
        else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
587
2.01k
            (myLocale[2]=='_' || myLocale[2]=='\0'))
588
2.01k
        {
589
2.01k
            if(version>2) {
590
                // ICU 55 fails to open a converter for an unsupported version.
591
                // Previously, it fell back to version 0, but that would yield
592
                // unexpected behavior.
593
0
                *errorCode = U_MISSING_RESOURCE_ERROR;
594
0
                return;
595
0
            }
596
597
            /* open the required converters and cache them */
598
2.01k
            myConverterData->myConverterArray[GB2312_1] =
599
2.01k
                ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
600
2.01k
            if(version>=1) {
601
1.51k
                myConverterData->myConverterArray[ISO_IR_165] =
602
1.51k
                    ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
603
1.51k
            }
604
2.01k
            myConverterData->myConverterArray[CNS_11643] =
605
2.01k
                ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
606
607
608
            /* set the function pointers to appropriate functions */
609
2.01k
            cnv->sharedData = const_cast<UConverterSharedData*>(&_ISO2022CNData);
610
2.01k
            uprv_strcpy(myConverterData->locale,"cn");
611
612
2.01k
            if (version==0){
613
505
                myConverterData->version = 0;
614
505
                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
615
1.51k
            }else if (version==1){
616
1.51k
                myConverterData->version = 1;
617
1.51k
                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
618
1.51k
            }else {
619
0
                myConverterData->version = 2;
620
0
                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
621
0
            }
622
2.01k
        }
623
0
#endif  // !UCONFIG_ONLY_HTML_CONVERSION
624
0
        else{
625
#ifdef U_ENABLE_GENERIC_ISO_2022
626
            myConverterData->isFirstBuffer = true;
627
628
            /* append the UTF-8 escape sequence */
629
            cnv->charErrorBufferLength = 3;
630
            cnv->charErrorBuffer[0] = 0x1b;
631
            cnv->charErrorBuffer[1] = 0x25;
632
            cnv->charErrorBuffer[2] = 0x42;
633
634
            cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
635
            /* initialize the state variables */
636
            uprv_strcpy(myConverterData->name,"ISO_2022");
637
#else
638
0
            *errorCode = U_MISSING_RESOURCE_ERROR;
639
            // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
640
            // data loading error code.
641
0
            return;
642
0
#endif
643
0
        }
644
645
2.02k
        cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
646
647
2.02k
        if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
648
0
            _ISO2022Close(cnv);
649
0
        }
650
2.02k
    } else {
651
0
        *errorCode = U_MEMORY_ALLOCATION_ERROR;
652
0
    }
653
2.02k
}
654
655
656
static void U_CALLCONV
657
2.02k
_ISO2022Close(UConverter *converter) {
658
2.02k
    UConverterDataISO2022* myData = static_cast<UConverterDataISO2022*>(converter->extraInfo);
659
2.02k
    UConverterSharedData **array = myData->myConverterArray;
660
2.02k
    int32_t i;
661
662
2.02k
    if (converter->extraInfo != nullptr) {
663
        /*close the array of converter pointers and free the memory*/
664
22.2k
        for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
665
20.2k
            if(array[i]!=nullptr) {
666
5.56k
                ucnv_unloadSharedDataIfReady(array[i]);
667
5.56k
            }
668
20.2k
        }
669
670
2.02k
        ucnv_close(myData->currentConverter);
671
672
2.02k
        if(!converter->isExtraLocal){
673
2.02k
            uprv_free (converter->extraInfo);
674
2.02k
            converter->extraInfo = nullptr;
675
2.02k
        }
676
2.02k
    }
677
2.02k
}
678
679
static void U_CALLCONV
680
7.36k
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
681
7.36k
    UConverterDataISO2022* myConverterData = static_cast<UConverterDataISO2022*>(converter->extraInfo);
682
7.36k
    if(choice<=UCNV_RESET_TO_UNICODE) {
683
7.36k
        uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
684
7.36k
        myConverterData->key = 0;
685
7.36k
        myConverterData->isEmptySegment = false;
686
7.36k
    }
687
7.36k
    if(choice!=UCNV_RESET_TO_UNICODE) {
688
0
        uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
689
0
    }
690
#ifdef U_ENABLE_GENERIC_ISO_2022
691
    if(myConverterData->locale[0] == 0){
692
        if(choice<=UCNV_RESET_TO_UNICODE) {
693
            myConverterData->isFirstBuffer = true;
694
            myConverterData->key = 0;
695
            if (converter->mode == UCNV_SO){
696
                ucnv_close (myConverterData->currentConverter);
697
                myConverterData->currentConverter=nullptr;
698
            }
699
            converter->mode = UCNV_SI;
700
        }
701
        if(choice!=UCNV_RESET_TO_UNICODE) {
702
            /* re-append UTF-8 escape sequence */
703
            converter->charErrorBufferLength = 3;
704
            converter->charErrorBuffer[0] = 0x1b;
705
            converter->charErrorBuffer[1] = 0x28;
706
            converter->charErrorBuffer[2] = 0x42;
707
        }
708
    }
709
    else
710
#endif
711
7.36k
    {
712
        /* reset the state variables */
713
7.36k
        if(myConverterData->locale[0] == 'k'){
714
0
            if(choice<=UCNV_RESET_TO_UNICODE) {
715
0
                setInitialStateToUnicodeKR(converter, myConverterData);
716
0
            }
717
0
            if(choice!=UCNV_RESET_TO_UNICODE) {
718
0
                setInitialStateFromUnicodeKR(converter, myConverterData);
719
0
            }
720
0
        }
721
7.36k
    }
722
7.36k
}
723
724
U_CDECL_BEGIN
725
726
static const char * U_CALLCONV
727
0
_ISO2022getName(const UConverter* cnv){
728
0
    if(cnv->extraInfo){
729
0
        UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
730
0
        return myData->name;
731
0
    }
732
0
    return nullptr;
733
0
}
734
735
U_CDECL_END
736
737
738
/*************** to unicode *******************/
739
/****************************************************************************
740
 * Recognized escape sequences are
741
 * <ESC>(B  ASCII
742
 * <ESC>.A  ISO-8859-1
743
 * <ESC>.F  ISO-8859-7
744
 * <ESC>(J  JISX-201
745
 * <ESC>(I  JISX-201
746
 * <ESC>$B  JISX-208
747
 * <ESC>$@  JISX-208
748
 * <ESC>$(D JISX-212
749
 * <ESC>$A  GB2312
750
 * <ESC>$(C KSC5601
751
 */
752
/*
 * ISO-2022-JP: maps the table offset reported by getKey_2022() to the state
 * that changeState_2022() applies. INVALID_STATE entries are reported there
 * as U_UNSUPPORTED_ESCAPE_SEQUENCE.
 */
static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
    /*  0.. 4 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /*  5.. 9 */ SS2_STATE,     INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 10..14 */ ASCII,         INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 15..19 */ INVALID_STATE, JISX201,       HWKANA_7BIT,   JISX201,       INVALID_STATE,
    /* 20..24 */ INVALID_STATE, INVALID_STATE, JISX208,       GB2312,        JISX208,
    /* 25..29 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 30..34 */ ISO8859_1,     ISO8859_7,     JISX208,       INVALID_STATE, INVALID_STATE,
    /* 35..39 */ INVALID_STATE, INVALID_STATE, KSC5601,       JISX212,       INVALID_STATE,
    /* 40..44 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 45..49 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 50..54 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 55..59 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 60..64 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 65..69 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 70..73 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE
};
763
764
#if !UCONFIG_ONLY_HTML_CONVERSION
765
/*************** to unicode *******************/
766
/*
 * ISO-2022-CN: maps the table offset reported by getKey_2022() to the state
 * that changeState_2022() applies. INVALID_STATE entries are reported there
 * as U_UNSUPPORTED_ESCAPE_SEQUENCE.
 */
static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
    /*  0.. 4 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /*  5.. 9 */ SS2_STATE,     SS3_STATE,     INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 10..14 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 15..19 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 20..24 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 25..29 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 30..34 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 35..39 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 40..44 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 45..49 */ INVALID_STATE, INVALID_STATE, GB2312_1,      INVALID_STATE, ISO_IR_165,
    /* 50..54 */ CNS_11643_1,   CNS_11643_2,   CNS_11643_3,   CNS_11643_4,   CNS_11643_5,
    /* 55..59 */ CNS_11643_6,   CNS_11643_7,   INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 60..64 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 65..69 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE,
    /* 70..73 */ INVALID_STATE, INVALID_STATE, INVALID_STATE, INVALID_STATE
};
777
#endif
778
779
780
/*
 * Advances the escape-sequence matcher by one byte.
 *
 * The byte is normalized through normalize_esq_chars_2022 and combined with
 * the running *key as (*key << 5) + normalized; the result is then looked up
 * in escSeqStateTable_Key_2022 by binary search.
 *
 * @param c      next byte of the escape sequence
 * @param key    in: key accumulated so far; out: the new key on a match, 0 otherwise
 * @param offset out: index of the matching table entry, 0 when there is no match
 * @return the escSeqStateTable_Value_2022 entry of the match, or INVALID_2022
 */
static UCNV_TableStates_2022
getKey_2022(char c,int32_t* key,int32_t* offset){
    const int32_t normalized = normalize_esq_chars_2022[static_cast<uint8_t>(c)];

    /* normalized==0: not a valid character anywhere in an escape sequence */
    if(normalized != 0) {
        const int32_t wanted = (*key << 5) + normalized;
        int32_t lower = 0;
        int32_t upper = MAX_STATES_2022;
        int32_t lastProbe = 0;

        /* binary search; it gives up once the probe position stops moving */
        while(lower != upper) {
            const int32_t probe = (upper+lower) >> 1;
            if(probe == lastProbe) {
                break;
            }

            if(escSeqStateTable_Key_2022[probe] == wanted) {
                /* we found it */
                *key = wanted;
                *offset = probe;
                return static_cast<UCNV_TableStates_2022>(escSeqStateTable_Value_2022[probe]);
            }
            if(escSeqStateTable_Key_2022[probe] > wanted) {
                upper = probe;
            } else {
                lower = probe;
            }
            lastProbe = probe;
        }
    }

    /* shared exit for "invalid byte" and "no such key" */
    *key = 0;
    *offset = 0;
    return INVALID_2022;
}
822
823
/*runs through a state machine to determine the escape sequence - codepage correspondence
824
 */
825
static void
826
changeState_2022(UConverter* _this,
827
                const char** source,
828
                const char* sourceLimit,
829
                Variant2022 var,
830
17.5k
                UErrorCode* err){
831
17.5k
    UCNV_TableStates_2022 value;
832
17.5k
    UConverterDataISO2022* myData2022 = static_cast<UConverterDataISO2022*>(_this->extraInfo);
833
17.5k
    uint32_t key = myData2022->key;
834
17.5k
    int32_t offset = 0;
835
17.5k
    int8_t initialToULength = _this->toULength;
836
17.5k
    char c;
837
838
17.5k
    value = VALID_NON_TERMINAL_2022;
839
45.7k
    while (*source < sourceLimit) {
840
45.1k
        c = *(*source)++;
841
45.1k
        _this->toUBytes[_this->toULength++] = static_cast<uint8_t>(c);
842
45.1k
        value = getKey_2022(c, reinterpret_cast<int32_t*>(&key), &offset);
843
844
45.1k
        switch (value){
845
846
28.2k
        case VALID_NON_TERMINAL_2022 :
847
            /* continue with the loop */
848
28.2k
            break;
849
850
12.3k
        case VALID_TERMINAL_2022:
851
12.3k
            key = 0;
852
12.3k
            goto DONE;
853
854
4.35k
        case INVALID_2022:
855
4.35k
            goto DONE;
856
857
277
        case VALID_MAYBE_TERMINAL_2022:
858
#ifdef U_ENABLE_GENERIC_ISO_2022
859
            /* ESC ( B is ambiguous only for ISO_2022 itself */
860
            if(var == ISO_2022) {
861
                /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
862
                _this->toULength = 0;
863
864
                /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
865
866
                /* continue with the loop */
867
                value = VALID_NON_TERMINAL_2022;
868
                break;
869
            } else
870
#endif
871
277
            {
872
                /* not ISO_2022 itself, finish here */
873
277
                value = VALID_TERMINAL_2022;
874
277
                key = 0;
875
277
                goto DONE;
876
0
            }
877
45.1k
        }
878
45.1k
    }
879
880
17.5k
DONE:
881
17.5k
    myData2022->key = key;
882
883
17.5k
    if (value == VALID_NON_TERMINAL_2022) {
884
        /* indicate that the escape sequence is incomplete: key!=0 */
885
568
        return;
886
16.9k
    } else if (value == INVALID_2022 ) {
887
4.35k
        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
888
12.5k
    } else /* value == VALID_TERMINAL_2022 */ {
889
12.5k
        switch(var){
890
#ifdef U_ENABLE_GENERIC_ISO_2022
891
        case ISO_2022:
892
        {
893
            const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
894
            if(chosenConverterName == nullptr) {
895
                /* SS2 or SS3 */
896
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
897
                _this->toUCallbackReason = UCNV_UNASSIGNED;
898
                return;
899
            }
900
901
            _this->mode = UCNV_SI;
902
            ucnv_close(myData2022->currentConverter);
903
            myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
904
            if(U_SUCCESS(*err)) {
905
                myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
906
                _this->mode = UCNV_SO;
907
            }
908
            break;
909
        }
910
#endif
911
0
        case ISO_2022_JP:
912
0
            {
913
0
                StateEnum tempState = static_cast<StateEnum>(nextStateToUnicodeJP[offset]);
914
0
                switch(tempState) {
915
0
                case INVALID_STATE:
916
0
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
917
0
                    break;
918
0
                case SS2_STATE:
919
0
                    if(myData2022->toU2022State.cs[2]!=0) {
920
0
                        if(myData2022->toU2022State.g<2) {
921
0
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
922
0
                        }
923
0
                        myData2022->toU2022State.g=2;
924
0
                    } else {
925
                        /* illegal to have SS2 before a matching designator */
926
0
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
927
0
                    }
928
0
                    break;
929
                /* case SS3_STATE: not used in ISO-2022-JP-x */
930
0
                case ISO8859_1:
931
0
                case ISO8859_7:
932
0
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
933
0
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
934
0
                    } else {
935
                        /* G2 charset for SS2 */
936
0
                        myData2022->toU2022State.cs[2] = static_cast<int8_t>(tempState);
937
0
                    }
938
0
                    break;
939
0
                default:
940
0
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
941
0
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
942
0
                    } else {
943
                        /* G0 charset */
944
0
                        myData2022->toU2022State.cs[0] = static_cast<int8_t>(tempState);
945
0
                    }
946
0
                    break;
947
0
                }
948
0
            }
949
0
            break;
950
0
#if !UCONFIG_ONLY_HTML_CONVERSION
951
12.5k
        case ISO_2022_CN:
952
12.5k
            {
953
12.5k
                StateEnum tempState = static_cast<StateEnum>(nextStateToUnicodeCN[offset]);
954
12.5k
                switch(tempState) {
955
493
                case INVALID_STATE:
956
493
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
957
493
                    break;
958
6.07k
                case SS2_STATE:
959
6.07k
                    if(myData2022->toU2022State.cs[2]!=0) {
960
574
                        if(myData2022->toU2022State.g<2) {
961
331
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
962
331
                        }
963
574
                        myData2022->toU2022State.g=2;
964
5.49k
                    } else {
965
                        /* illegal to have SS2 before a matching designator */
966
5.49k
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
967
5.49k
                    }
968
6.07k
                    break;
969
1.22k
                case SS3_STATE:
970
1.22k
                    if(myData2022->toU2022State.cs[3]!=0) {
971
0
                        if(myData2022->toU2022State.g<2) {
972
0
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
973
0
                        }
974
0
                        myData2022->toU2022State.g=3;
975
1.22k
                    } else {
976
                        /* illegal to have SS3 before a matching designator */
977
1.22k
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
978
1.22k
                    }
979
1.22k
                    break;
980
1.02k
                case ISO_IR_165:
981
1.02k
                    if(myData2022->version==0) {
982
1.02k
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
983
1.02k
                        break;
984
1.02k
                    }
985
0
                    U_FALLTHROUGH;
986
1.55k
                case GB2312_1:
987
1.55k
                    U_FALLTHROUGH;
988
2.49k
                case CNS_11643_1:
989
2.49k
                    myData2022->toU2022State.cs[1] = static_cast<int8_t>(tempState);
990
2.49k
                    break;
991
860
                case CNS_11643_2:
992
860
                    myData2022->toU2022State.cs[2] = static_cast<int8_t>(tempState);
993
860
                    break;
994
430
                default:
995
                    /* other CNS 11643 planes */
996
430
                    if(myData2022->version==0) {
997
430
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
998
430
                    } else {
999
0
                        myData2022->toU2022State.cs[3] = static_cast<int8_t>(tempState);
1000
0
                    }
1001
430
                    break;
1002
12.5k
                }
1003
12.5k
            }
1004
12.5k
            break;
1005
12.5k
        case ISO_2022_KR:
1006
0
            if(offset==0x30){
1007
                /* nothing to be done, just accept this one escape sequence */
1008
0
            } else {
1009
0
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
1010
0
            }
1011
0
            break;
1012
0
#endif  // !UCONFIG_ONLY_HTML_CONVERSION
1013
1014
0
        default:
1015
0
            *err = U_ILLEGAL_ESCAPE_SEQUENCE;
1016
0
            break;
1017
12.5k
        }
1018
12.5k
    }
1019
16.9k
    if(U_SUCCESS(*err)) {
1020
3.92k
        _this->toULength = 0;
1021
13.0k
    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
1022
11.0k
        if(_this->toULength>1) {
1023
            /*
1024
             * Ticket 5691: consistent illegal sequences:
1025
             * - We include at least the first byte (ESC) in the illegal sequence.
1026
             * - If any of the non-initial bytes could be the start of a character,
1027
             *   we stop the illegal sequence before the first one of those.
1028
             *   In escape sequences, all following bytes are "printable", that is,
1029
             *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
1030
             *   they are valid single/lead bytes.
1031
             *   For simplicity, we always only report the initial ESC byte as the
1032
             *   illegal sequence and back out all other bytes we looked at.
1033
             */
1034
            /* Back out some bytes. */
1035
11.0k
            int8_t backOutDistance=_this->toULength-1;
1036
11.0k
            int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1037
11.0k
            if(backOutDistance<=bytesFromThisBuffer) {
1038
                /* same as initialToULength<=1 */
1039
11.0k
                *source-=backOutDistance;
1040
11.0k
            } else {
1041
                /* Back out bytes from the previous buffer: Need to replay them. */
1042
0
                _this->preToULength = static_cast<int8_t>(bytesFromThisBuffer - backOutDistance);
1043
                /* same as -(initialToULength-1) */
1044
                /* preToULength is negative! */
1045
0
                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
1046
0
                *source-=bytesFromThisBuffer;
1047
0
            }
1048
11.0k
            _this->toULength=1;
1049
11.0k
        }
1050
11.0k
    } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1051
1.95k
        _this->toUCallbackReason = UCNV_UNASSIGNED;
1052
1.95k
    }
1053
16.9k
}
1054
1055
#if !UCONFIG_ONLY_HTML_CONVERSION
1056
/*Checks the characters of the buffer against valid 2022 escape sequences
1057
*if the match we return a pointer to the initial start of the sequence otherwise
1058
*we return sourceLimit
1059
*/
1060
/*for 2022 looks ahead in the stream
1061
 *to determine the longest possible convertible
1062
 *data stream
1063
 */
1064
static inline const char*
1065
getEndOfBuffer_2022(const char** source,
1066
                   const char* sourceLimit,
1067
0
                   UBool /*flush*/){
1068
1069
0
    const char* mySource = *source;
1070
1071
#ifdef U_ENABLE_GENERIC_ISO_2022
1072
    if (*source >= sourceLimit)
1073
        return sourceLimit;
1074
1075
    do{
1076
1077
        if (*mySource == ESC_2022){
1078
            int8_t i;
1079
            int32_t key = 0;
1080
            int32_t offset;
1081
            UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1082
1083
            /* Kludge: I could not
1084
            * figure out the reason for validating an escape sequence
1085
            * twice - once here and once in changeState_2022().
1086
            * is it possible to have an ESC character in a ISO2022
1087
            * byte stream which is valid in a code page? Is it legal?
1088
            */
1089
            for (i=0;
1090
            (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1091
            i++) {
1092
                value =  getKey_2022(*(mySource+i), &key, &offset);
1093
            }
1094
            if (value > 0 || *mySource==ESC_2022)
1095
                return mySource;
1096
1097
            if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1098
                return sourceLimit;
1099
        }
1100
    }while (++mySource < sourceLimit);
1101
1102
    return sourceLimit;
1103
#else
1104
0
    while(mySource < sourceLimit && *mySource != ESC_2022) {
1105
0
        ++mySource;
1106
0
    }
1107
0
    return mySource;
1108
0
#endif
1109
0
}
1110
#endif
1111
1112
/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1113
 * any future change in _MBCSFromUChar32() function should be reflected here.
1114
 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1115
 */
1116
static inline int32_t
1117
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1118
                                         UChar32 c,
1119
                                         uint32_t* value,
1120
                                         UBool useFallback,
1121
                                         int outputType)
1122
0
{
1123
0
    const int32_t *cx;
1124
0
    const uint16_t *table;
1125
0
    uint32_t stage2Entry;
1126
0
    uint32_t myValue;
1127
0
    int32_t length;
1128
0
    const uint8_t *p;
1129
    /*
1130
     * TODO(markus): Use and require new, faster MBCS conversion table structures.
1131
     * Use internal version of ucnv_open() that verifies that the new structures are available,
1132
     * else U_INTERNAL_PROGRAM_ERROR.
1133
     */
1134
    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1135
0
    if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1136
0
        table=sharedData->mbcs.fromUnicodeTable;
1137
0
        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1138
        /* get the bytes and the length for the output */
1139
0
        if(outputType==MBCS_OUTPUT_2){
1140
0
            myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1141
0
            if(myValue<=0xff) {
1142
0
                length=1;
1143
0
            } else {
1144
0
                length=2;
1145
0
            }
1146
0
        } else /* outputType==MBCS_OUTPUT_3 */ {
1147
0
            p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1148
0
            myValue = (static_cast<uint32_t>(*p) << 16) | (static_cast<uint32_t>(p[1]) << 8) | p[2];
1149
0
            if(myValue<=0xff) {
1150
0
                length=1;
1151
0
            } else if(myValue<=0xffff) {
1152
0
                length=2;
1153
0
            } else {
1154
0
                length=3;
1155
0
            }
1156
0
        }
1157
        /* is this code point assigned, or do we use fallbacks? */
1158
0
        if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1159
            /* assigned */
1160
0
            *value=myValue;
1161
0
            return length;
1162
0
        } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1163
            /*
1164
             * We allow a 0 byte output if the "assigned" bit is set for this entry.
1165
             * There is no way with this data structure for fallback output
1166
             * to be a zero byte.
1167
             */
1168
0
            *value=myValue;
1169
0
            return -length;
1170
0
        }
1171
0
    }
1172
1173
0
    cx=sharedData->mbcs.extIndexes;
1174
0
    if(cx!=nullptr) {
1175
0
        return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1176
0
    }
1177
1178
    /* unassigned */
1179
0
    return 0;
1180
0
}
1181
1182
/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1183
 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1184
 * @param retval pointer to output byte
1185
 * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
1186
 */
1187
static inline int32_t
1188
MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1189
                                       UChar32 c,
1190
                                       uint32_t* retval,
1191
                                       UBool useFallback)
1192
0
{
1193
0
    const uint16_t *table;
1194
0
    int32_t value;
1195
    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1196
0
    if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1197
0
        return 0;
1198
0
    }
1199
    /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1200
0
    table=sharedData->mbcs.fromUnicodeTable;
1201
    /* get the byte for the output */
1202
0
    value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1203
    /* is this code point assigned, or do we use fallbacks? */
1204
0
    *retval = static_cast<uint32_t>(value & 0xff);
1205
0
    if(value>=0xf00) {
1206
0
        return 1;  /* roundtrip */
1207
0
    } else if(useFallback ? value>=0x800 : value>=0xc00) {
1208
0
        return -1;  /* fallback taken */
1209
0
    } else {
1210
0
        return 0;  /* no mapping */
1211
0
    }
1212
0
}
1213
1214
/*
 * Check that the two low bytes of the result are each in the range A1..FE
 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each of
 * them to move it to the ISO 2022 range 21..7E.
 * Return 0 if out of range.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint8_t lead = static_cast<uint8_t>(value >> 8);
    const uint8_t trail = static_cast<uint8_t>(value);
    const bool leadOk = 0xa1 <= lead && lead <= 0xfe;
    const bool trailOk = 0xa1 <= trail && trail <= 0xfe;

    if (!(leadOk && trailOk)) {
        return 0;  /* not valid for ISO 2022 */
    }
    return value - 0x8080;  /* shift down to 21..7e byte range */
}
1230
1231
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
1232
/*
1233
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
1234
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
1235
 * unchanged. 
1236
 */
1237
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    /* candidate result: both bytes moved up by 0x80 */
    const uint32_t shifted = value + 0x8080;
    const bool inGR94Range =
        (uint16_t)(shifted - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(shifted - 0xa1) <= (0xfe - 0xa1);

    /* out-of-range input is handed back unchanged */
    return inGR94Range ? shifted : value;
}
1247
#endif
1248
1249
#ifdef U_ENABLE_GENERIC_ISO_2022
1250
1251
/**********************************************************************************
1252
*  ISO-2022 Converter
1253
*
1254
*
1255
*/
1256
1257
static void U_CALLCONV
1258
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1259
                                                           UErrorCode* err){
1260
    const char* mySourceLimit, *realSourceLimit;
1261
    const char* sourceStart;
1262
    const char16_t* myTargetStart;
1263
    UConverter* saveThis;
1264
    UConverterDataISO2022* myData;
1265
    int8_t length;
1266
1267
    saveThis = args->converter;
1268
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1269
1270
    realSourceLimit = args->sourceLimit;
1271
    while (args->source < realSourceLimit) {
1272
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1273
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1274
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1275
1276
            if(args->source < mySourceLimit) {
1277
                if(myData->currentConverter==nullptr) {
1278
                    myData->currentConverter = ucnv_open("ASCII",err);
1279
                    if(U_FAILURE(*err)){
1280
                        return;
1281
                    }
1282
1283
                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1284
                    saveThis->mode = UCNV_SO;
1285
                }
1286
1287
                /* convert to before the ESC or until the end of the buffer */
1288
                myData->isFirstBuffer=false;
1289
                sourceStart = args->source;
1290
                myTargetStart = args->target;
1291
                args->converter = myData->currentConverter;
1292
                ucnv_toUnicode(args->converter,
1293
                    &args->target,
1294
                    args->targetLimit,
1295
                    &args->source,
1296
                    mySourceLimit,
1297
                    args->offsets,
1298
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
1299
                    err);
1300
                args->converter = saveThis;
1301
1302
                if (*err == U_BUFFER_OVERFLOW_ERROR) {
1303
                    /* move the overflow buffer */
1304
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1305
                    myData->currentConverter->UCharErrorBufferLength = 0;
1306
                    if(length > 0) {
1307
                        uprv_memcpy(saveThis->UCharErrorBuffer,
1308
                                    myData->currentConverter->UCharErrorBuffer,
1309
                                    length*U_SIZEOF_UCHAR);
1310
                    }
1311
                    return;
1312
                }
1313
1314
                /*
1315
                 * At least one of:
1316
                 * -Error while converting
1317
                 * -Done with entire buffer
1318
                 * -Need to write offsets or update the current offset
1319
                 *  (leave that up to the code in ucnv.c)
1320
                 *
1321
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
1322
                 */
1323
                if (U_FAILURE(*err) ||
1324
                    (args->source == realSourceLimit) ||
1325
                    (args->offsets != nullptr && (args->target != myTargetStart || args->source != sourceStart) ||
1326
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1327
                ) {
1328
                    /* copy partial or error input for truncated detection and error handling */
1329
                    if(U_FAILURE(*err)) {
1330
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1331
                        if(length > 0) {
1332
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1333
                        }
1334
                    } else {
1335
                        length = saveThis->toULength = myData->currentConverter->toULength;
1336
                        if(length > 0) {
1337
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1338
                            if(args->source < mySourceLimit) {
1339
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1340
                            }
1341
                        }
1342
                    }
1343
                    return;
1344
                }
1345
            }
1346
        }
1347
1348
        sourceStart = args->source;
1349
        changeState_2022(args->converter,
1350
               &(args->source),
1351
               realSourceLimit,
1352
               ISO_2022,
1353
               err);
1354
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != nullptr)) {
1355
            /* let the ucnv.c code update its current offset */
1356
            return;
1357
        }
1358
    }
1359
}
1360
1361
#endif
1362
1363
/*
1364
 * To Unicode Callback helper function
1365
 */
1366
static void
1367
toUnicodeCallback(UConverter *cnv,
1368
                  const uint32_t sourceChar, const uint32_t targetUniChar,
1369
278k
                  UErrorCode* err){
1370
278k
    if(sourceChar>0xff){
1371
49.2k
        cnv->toUBytes[0] = static_cast<uint8_t>(sourceChar >> 8);
1372
49.2k
        cnv->toUBytes[1] = static_cast<uint8_t>(sourceChar);
1373
49.2k
        cnv->toULength = 2;
1374
49.2k
    }
1375
229k
    else{
1376
229k
        cnv->toUBytes[0] = static_cast<char>(sourceChar);
1377
229k
        cnv->toULength = 1;
1378
229k
    }
1379
1380
278k
    if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1381
3.89k
        *err = U_INVALID_CHAR_FOUND;
1382
3.89k
    }
1383
274k
    else{
1384
274k
        *err = U_ILLEGAL_CHAR_FOUND;
1385
274k
    }
1386
278k
}
1387
1388
/**************************************ISO-2022-JP*************************************************/
1389
1390
/************************************** IMPORTANT **************************************************
1391
* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1392
* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1393
* The converter iterates over each Unicode codepoint
1394
* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1395
* processed one char at a time it would make sense to reduce the extra processing a canned converter
1396
* would do as far as possible.
1397
*
1398
* If the implementation of these macros or structure of sharedData struct change in the future, make
1399
* sure that ISO-2022 is also changed.
1400
***************************************************************************************************
1401
*/
1402
1403
/***************************************************************************************************
1404
* Rules for ISO-2022-jp encoding
1405
* (i)   Escape sequences must be fully contained within a line they should not
1406
*       span new lines or CRs
1407
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
1408
*       JIS-Roman character escape sequence should follow before the line terminates
1409
* (iii) If the first character on the line is represented by two bytes then a two
1410
*       byte character escape sequence should precede it
1411
* (iv)  If no escape sequence is encountered then the characters are ASCII
1412
* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1413
*       and invoked with SS2 (ESC N).
1414
* (vi)  If there is any G0 designation in text, there must be a switch to
1415
*       ASCII or to JIS X 0201-Roman before a space character (but not
1416
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1417
*       characters such as tab or CRLF.
1418
* (vii) Supported encodings:
1419
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1420
*
1421
*  source : RFC-1554
1422
*
1423
*          JISX201, JISX208,JISX212 : new .cnv data files created
1424
*          KSC5601 : alias to ibm-949 mapping table
1425
*          GB2312 : alias to ibm-1386 mapping table
1426
*          ISO-8859-1 : Algorithmic implemented as LATIN1 case
1427
*          ISO-8859-7 : alias to ibm-9409 mapping table
1428
*/
1429
1430
/* preference order of JP charsets */
1431
static const StateEnum jpCharsetPref[]={
1432
    ASCII,
1433
    JISX201,
1434
    ISO8859_1,
1435
    JISX208,
1436
    ISO8859_7,
1437
    JISX212,
1438
    GB2312,
1439
    KSC5601,
1440
    HWKANA_7BIT
1441
};
1442
1443
/*
 * Designation escape sequences for the JP charsets.
 * The escape sequences must be in order of the enum constants like JISX201  = 3,
 * not in order of jpCharsetPref[]!
 * Each row is NUL-padded to 6 bytes; escSeqCharsLen[] holds the number of
 * bytes that are actually written.
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */
};

/* Byte lengths of the sequences in escSeqChars[], in the same order. */
static  const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};

/* The two tables are indexed in parallel; catch a row added to only one of them. */
static_assert(sizeof(escSeqChars) / sizeof(escSeqChars[0]) ==
              sizeof(escSeqCharsLen) / sizeof(escSeqCharsLen[0]),
              "escSeqChars[] and escSeqCharsLen[] must have the same number of entries");
1470
1471
/*
1472
* The iteration over various code pages works this way:
1473
* i)   Get the currentState from myConverterData->currentState
1474
* ii)  Check if the character is mapped to a valid character in the currentState
1475
*      Yes ->  a) set the initIterState to currentState
1476
*       b) remain in this state until an invalid character is found
1477
*      No  ->  a) go to the next code page and find the character
1478
* iii) Before changing the state increment the current state check if the current state
1479
*      is equal to the initIterState
1480
*      Yes ->  A character that cannot be represented in any of the supported encodings
1481
*       break and return a U_INVALID_CHAR_FOUND error
1482
*      No  ->  Continue and find the character in next code page
1483
*
1484
*
1485
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1486
*/
1487
1488
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * Only two positions differ from the input value: 0x5C maps to U+00A5 and
 * 0x7E maps to U+203E. Every other value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1501
1502
/*
 * Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable.
 * U+00A5 and U+203E take the byte values 0x5C and 0x7E, so U+005C and U+007E
 * themselves are unmappable. All other code points up to U+007F map to
 * themselves.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    if(value == 0xa5) {
        return 0x5c;
    }
    if(value == 0x203e) {
        return 0x7e;
    }
    if(value <= 0x7f && value != 0x5c && value != 0x7e) {
        return value;
    }
    return 0xfffe;
}
1516
1517
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range.
 *
 * @param value Shift-JIS lead byte in bits 15..8, trail byte in bits 7..0
 * @return the JIS X 0208 byte pair in the same layout, or 0 if value > 0xEFFC
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    const uint32_t trail = value & 0xff;
    uint32_t result = value & 0xff00;  /* lead byte */

    /* fold the two Shift-JIS lead byte ranges into one contiguous range */
    if(result <= 0x9f00) {
        result -= 0x7000;
    } else /* 0xe000 <= lead <= 0xef00 */ {
        result -= 0xb000;
    }
    /* each Shift-JIS lead byte covers two JIS X 0208 rows */
    result <<= 1;

    if(trail <= 0x9e) {
        /* first row of the pair */
        result -= 0x100;
        result |= trail - (trail <= 0x7e ? 0x1f : 0x20);
    } else /* trail <= 0xfc */ {
        /* second row of the pair */
        result |= trail - 0x7e;
    }
    return result;
}
1552
1553
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 *
 * @param c1    first JIS X 0208 byte
 * @param c2    second JIS X 0208 byte
 * @param bytes receives the Shift-JIS lead byte in bytes[0] and the trail byte
 *              in bytes[1]; a byte is set to 0 when the input was detected
 *              as invalid
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    if(c1&1) {
        /* odd first byte: trail bytes for the first row of the lead byte's pair */
        ++c1;
        if(c2 <= 0x5f) {
            c2 += 0x1f;
        } else if(c2 <= 0x7e) {
            c2 += 0x20;
        } else {
            c2 = 0;  /* invalid */
        }
    } else {
        /* even first byte: trail bytes for the second row of the pair */
        if(0x21 <= c2 && c2 <= 0x7e) {
            c2 += 0x7e;
        } else {
            c2 = 0;  /* invalid */
        }
    }

    /* two rows share one Shift-JIS lead byte */
    c1 >>= 1;
    if(c1 <= 0x2f) {
        c1 += 0x70;
    } else if(c1 <= 0x3f) {
        c1 += 0xb0;
    } else {
        c1 = 0;  /* invalid */
    }

    bytes[0] = (char)c1;
    bytes[1] = (char)c2;
}
1589
1590
/*
1591
 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1592
 * Katakana.
1593
 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1594
 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1595
 * These were the only fallbacks in ICU's jisx-208.ucm file.
1596
 */
1597
static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1598
    0x2123,  /* U+FF61 */
1599
    0x2156,
1600
    0x2157,
1601
    0x2122,
1602
    0x2126,
1603
    0x2572,
1604
    0x2521,
1605
    0x2523,
1606
    0x2525,
1607
    0x2527,
1608
    0x2529,
1609
    0x2563,
1610
    0x2565,
1611
    0x2567,
1612
    0x2543,
1613
    0x213C,  /* U+FF70 */
1614
    0x2522,
1615
    0x2524,
1616
    0x2526,
1617
    0x2528,
1618
    0x252A,
1619
    0x252B,
1620
    0x252D,
1621
    0x252F,
1622
    0x2531,
1623
    0x2533,
1624
    0x2535,
1625
    0x2537,
1626
    0x2539,
1627
    0x253B,
1628
    0x253D,
1629
    0x253F,  /* U+FF80 */
1630
    0x2541,
1631
    0x2544,
1632
    0x2546,
1633
    0x2548,
1634
    0x254A,
1635
    0x254B,
1636
    0x254C,
1637
    0x254D,
1638
    0x254E,
1639
    0x254F,
1640
    0x2552,
1641
    0x2555,
1642
    0x2558,
1643
    0x255B,
1644
    0x255E,
1645
    0x255F,  /* U+FF90 */
1646
    0x2560,
1647
    0x2561,
1648
    0x2562,
1649
    0x2564,
1650
    0x2566,
1651
    0x2568,
1652
    0x2569,
1653
    0x256A,
1654
    0x256B,
1655
    0x256C,
1656
    0x256D,
1657
    0x256F,
1658
    0x2573,
1659
    0x212B,
1660
    0x212C   /* U+FF9F */
1661
};
1662
1663
/*
 * Convert UTF-16 text to ISO-2022-JP bytes, optionally recording offsets.
 *
 * Reads code units from args->source up to args->sourceLimit and writes bytes
 * to args->target up to args->targetLimit. If args->offsets is not null, a
 * source index is written for each output byte. On return args->source and
 * args->target are advanced to where conversion stopped.
 *
 * For each code point the function:
 *   - resumes a lead surrogate left in cnv->fromUChar32 by a previous call, and
 *     pairs lead/trail surrogates into one supplementary code point;
 *   - builds (once, until invalidated) a list of candidate charsets: the
 *     single-byte half-width Katakana first for versions 3 and 4, then the
 *     current G0 and G2 charsets, then the remaining charsets allowed by
 *     jpCharsetMasks[version] in jpCharsetPref[] order;
 *   - takes the first roundtrip mapping found, or otherwise the first fallback;
 *   - writes SI, a designation escape sequence and SO or ESC N as needed,
 *     followed by the one or two result bytes;
 *   - resets the G2 designation after CR or LF.
 *
 * Errors reported through *err:
 *   U_ILLEGAL_CHAR_FOUND    unpaired surrogate, or a code point matched by
 *                           IS_2022_CONTROL (see "do not convert SO/SI/ESC")
 *   U_INVALID_CHAR_FOUND    no candidate charset has a mapping
 *   U_BUFFER_OVERFLOW_ERROR no room left in the target buffer
 * For the first two, the offending code point is stored in cnv->fromUChar32.
 *
 * When args->flush is set, all input is consumed, and no error or pending
 * surrogate remains, the output is returned to ASCII mode (SI and/or the
 * ASCII designation sequence) before the function returns.
 */
static void U_CALLCONV
1664
0
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1665
0
    UConverter *cnv = args->converter;
1666
0
    UConverterDataISO2022 *converterData;
1667
0
    ISO2022State *pFromU2022State;
1668
0
    uint8_t* target = reinterpret_cast<uint8_t*>(args->target);
1669
0
    const uint8_t* targetLimit = reinterpret_cast<const uint8_t*>(args->targetLimit);
1670
0
    const char16_t* source = args->source;
1671
0
    const char16_t* sourceLimit = args->sourceLimit;
1672
0
    int32_t* offsets = args->offsets;
1673
0
    UChar32 sourceChar;
1674
0
    char buffer[8];
1675
0
    int32_t len, outLen;
1676
0
    int8_t choices[10];
1677
0
    int32_t choiceCount;
1678
0
    uint32_t targetValue = 0;
1679
0
    UBool useFallback;
1680
1681
0
    int32_t i;
1682
0
    int8_t cs, g;
1683
1684
    /* set up the state */
1685
0
    converterData = static_cast<UConverterDataISO2022*>(cnv->extraInfo);
1686
0
    pFromU2022State   = &converterData->fromU2022State;
1687
1688
0
    choiceCount = 0;
1689
1690
    /* check if the last codepoint of previous buffer was a lead surrogate*/
1691
0
    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1692
0
        goto getTrail;
1693
0
    }
1694
1695
0
    while(source < sourceLimit) {
1696
0
        if(target < targetLimit) {
1697
1698
0
            sourceChar  = *(source++);
1699
            /*check if the char is a First surrogate*/
1700
0
            if(U16_IS_SURROGATE(sourceChar)) {
1701
0
                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
1702
0
getTrail:
1703
                    /*look ahead to find the trail surrogate*/
1704
0
                    if(source < sourceLimit) {
1705
                        /* test the following code unit */
1706
0
                        char16_t trail = *source;
1707
0
                        if(U16_IS_TRAIL(trail)) {
1708
0
                            source++;
1709
0
                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
1710
0
                            cnv->fromUChar32=0x00;
1711
                            /* convert this supplementary code point */
1712
                            /* exit this condition tree */
1713
0
                        } else {
1714
                            /* this is an unmatched lead code unit (1st surrogate) */
1715
                            /* callback(illegal) */
1716
0
                            *err=U_ILLEGAL_CHAR_FOUND;
1717
0
                            cnv->fromUChar32=sourceChar;
1718
0
                            break;
1719
0
                        }
1720
0
                    } else {
1721
                        /* no more input */
1722
0
                        cnv->fromUChar32=sourceChar;
1723
0
                        break;
1724
0
                    }
1725
0
                } else {
1726
                    /* this is an unmatched trail code unit (2nd surrogate) */
1727
                    /* callback(illegal) */
1728
0
                    *err=U_ILLEGAL_CHAR_FOUND;
1729
0
                    cnv->fromUChar32=sourceChar;
1730
0
                    break;
1731
0
                }
1732
0
            }
1733
1734
            /* do not convert SO/SI/ESC */
1735
0
            if(IS_2022_CONTROL(sourceChar)) {
1736
                /* callback(illegal) */
1737
0
                *err=U_ILLEGAL_CHAR_FOUND;
1738
0
                cnv->fromUChar32=sourceChar;
1739
0
                break;
1740
0
            }
1741
1742
            /* do the conversion */
1743
1744
0
            if(choiceCount == 0) {
1745
0
                uint16_t csm;
1746
1747
                /*
1748
                 * The csm variable keeps track of which charsets are allowed
1749
                 * and not used yet while building the choices[].
1750
                 */
1751
0
                csm = jpCharsetMasks[converterData->version];
1752
0
                choiceCount = 0;
1753
1754
                /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1755
0
                if(converterData->version == 3 || converterData->version == 4) {
1756
0
                    choices[choiceCount++] = static_cast<int8_t>(HWKANA_7BIT);
1757
0
                }
1758
                /* Do not try single-byte half-width Katakana for other versions. */
1759
0
                csm &= ~CSM(HWKANA_7BIT);
1760
1761
                /* try the current G0 charset */
1762
0
                choices[choiceCount++] = cs = pFromU2022State->cs[0];
1763
0
                csm &= ~CSM(cs);
1764
1765
                /* try the current G2 charset */
1766
0
                if((cs = pFromU2022State->cs[2]) != 0) {
1767
0
                    choices[choiceCount++] = cs;
1768
0
                    csm &= ~CSM(cs);
1769
0
                }
1770
1771
                /* try all the other possible charsets */
1772
0
                for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
1773
0
                    cs = static_cast<int8_t>(jpCharsetPref[i]);
1774
0
                    if(CSM(cs) & csm) {
1775
0
                        choices[choiceCount++] = cs;
1776
0
                        csm &= ~CSM(cs);
1777
0
                    }
1778
0
                }
1779
0
            }
1780
1781
0
            cs = g = 0;
1782
            /*
1783
             * len==0: no mapping found yet
1784
             * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1785
             * len>0: found a roundtrip result, done
1786
             */
1787
0
            len = 0;
1788
            /*
1789
             * We will turn off useFallback after finding a fallback,
1790
             * but we still get fallbacks from PUA code points as usual.
1791
             * Therefore, we will also need to check that we don't overwrite
1792
             * an early fallback with a later one.
1793
             */
1794
0
            useFallback = cnv->useFallback;
1795
1796
0
            for(i = 0; i < choiceCount && len <= 0; ++i) {
1797
0
                uint32_t value;
1798
0
                int32_t len2;
1799
0
                int8_t cs0 = choices[i];
1800
0
                switch(cs0) {
1801
0
                case ASCII:
1802
0
                    if(sourceChar <= 0x7f) {
1803
0
                        targetValue = static_cast<uint32_t>(sourceChar);
1804
0
                        len = 1;
1805
0
                        cs = cs0;
1806
0
                        g = 0;
1807
0
                    }
1808
0
                    break;
1809
0
                case ISO8859_1:
1810
0
                    if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1811
0
                        targetValue = static_cast<uint32_t>(sourceChar) - 0x80;
1812
0
                        len = 1;
1813
0
                        cs = cs0;
1814
0
                        g = 2;
1815
0
                    }
1816
0
                    break;
1817
0
                case HWKANA_7BIT:
1818
0
                    if (static_cast<uint32_t>(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1819
0
                        if(converterData->version==3) {
1820
                            /* JIS7: use G1 (SO) */
1821
                            /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1822
0
                            targetValue = static_cast<uint32_t>(sourceChar - (HWKANA_START - 0x21));
1823
0
                            len = 1;
1824
0
                            pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1825
0
                            g = 1;
1826
0
                        } else if(converterData->version==4) {
1827
                            /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1828
                            /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1829
0
                            targetValue = static_cast<uint32_t>(sourceChar - (HWKANA_START - 0xa1));
1830
0
                            len = 1;
1831
1832
0
                            cs = pFromU2022State->cs[0];
1833
0
                            if(IS_JP_DBCS(cs)) {
1834
                                /* switch from a DBCS charset to JISX201 */
1835
0
                                cs = static_cast<int8_t>(JISX201);
1836
0
                            }
1837
                            /* else stay in the current G0 charset */
1838
0
                            g = 0;
1839
0
                        }
1840
                        /* else do not use HWKANA_7BIT with other versions */
1841
0
                    }
1842
0
                    break;
1843
0
                case JISX201:
1844
                    /* G0 SBCS */
1845
0
                    value = jisx201FromU(sourceChar);
1846
0
                    if(value <= 0x7f) {
1847
0
                        targetValue = value;
1848
0
                        len = 1;
1849
0
                        cs = cs0;
1850
0
                        g = 0;
1851
0
                        useFallback = false;
1852
0
                    }
1853
0
                    break;
1854
0
                case JISX208:
1855
                    /* G0 DBCS from Shift-JIS table */
1856
0
                    len2 = MBCS_FROM_UCHAR32_ISO2022(
1857
0
                                converterData->myConverterArray[cs0],
1858
0
                                sourceChar, &value,
1859
0
                                useFallback, MBCS_OUTPUT_2);
1860
0
                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1861
0
                        value = _2022FromSJIS(value);
1862
0
                        if(value != 0) {
1863
0
                            targetValue = value;
1864
0
                            len = len2;
1865
0
                            cs = cs0;
1866
0
                            g = 0;
1867
0
                            useFallback = false;
1868
0
                        }
1869
0
                    } else if(len == 0 && useFallback &&
1870
0
                              static_cast<uint32_t>(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1871
0
                        targetValue = hwkana_fb[sourceChar - HWKANA_START];
1872
0
                        len = -2;
1873
0
                        cs = cs0;
1874
0
                        g = 0;
1875
0
                        useFallback = false;
1876
0
                    }
1877
0
                    break;
1878
0
                case ISO8859_7:
1879
                    /* G0 SBCS forced to 7-bit output */
1880
0
                    len2 = MBCS_SINGLE_FROM_UCHAR32(
1881
0
                                converterData->myConverterArray[cs0],
1882
0
                                sourceChar, &value,
1883
0
                                useFallback);
1884
0
                    if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1885
0
                        targetValue = value - 0x80;
1886
0
                        len = len2;
1887
0
                        cs = cs0;
1888
0
                        g = 2;
1889
0
                        useFallback = false;
1890
0
                    }
1891
0
                    break;
1892
0
                default:
1893
                    /* G0 DBCS */
1894
0
                    len2 = MBCS_FROM_UCHAR32_ISO2022(
1895
0
                                converterData->myConverterArray[cs0],
1896
0
                                sourceChar, &value,
1897
0
                                useFallback, MBCS_OUTPUT_2);
1898
0
                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1899
0
                        if(cs0 == KSC5601) {
1900
                            /*
1901
                             * Check for valid bytes for the encoding scheme.
1902
                             * This is necessary because the sub-converter (windows-949)
1903
                             * has a broader encoding scheme than is valid for 2022.
1904
                             */
1905
0
                            value = _2022FromGR94DBCS(value);
1906
0
                            if(value == 0) {
1907
0
                                break;
1908
0
                            }
1909
0
                        }
1910
0
                        targetValue = value;
1911
0
                        len = len2;
1912
0
                        cs = cs0;
1913
0
                        g = 0;
1914
0
                        useFallback = false;
1915
0
                    }
1916
0
                    break;
1917
0
                }
1918
0
            }
1919
1920
0
            if(len != 0) {
1921
0
                if(len < 0) {
1922
0
                    len = -len;  /* fallback */
1923
0
                }
1924
0
                outLen = 0; /* count output bytes */
1925
1926
                /* write SI if necessary (only for JIS7) */
1927
0
                if(pFromU2022State->g == 1 && g == 0) {
1928
0
                    buffer[outLen++] = UCNV_SI;
1929
0
                    pFromU2022State->g = 0;
1930
0
                }
1931
1932
                /* write the designation sequence if necessary */
1933
0
                if(cs != pFromU2022State->cs[g]) {
1934
0
                    int32_t escLen = escSeqCharsLen[cs];
1935
0
                    uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1936
0
                    outLen += escLen;
1937
0
                    pFromU2022State->cs[g] = cs;
1938
1939
                    /* invalidate the choices[] */
1940
0
                    choiceCount = 0;
1941
0
                }
1942
1943
                /* write the shift sequence if necessary */
1944
0
                if(g != pFromU2022State->g) {
1945
0
                    switch(g) {
1946
                    /* case 0 handled before writing escapes */
1947
0
                    case 1:
1948
0
                        buffer[outLen++] = UCNV_SO;
1949
0
                        pFromU2022State->g = 1;
1950
0
                        break;
1951
0
                    default: /* case 2 */
1952
0
                        buffer[outLen++] = 0x1b;
1953
0
                        buffer[outLen++] = 0x4e;
1954
0
                        break;
1955
                    /* no case 3: no SS3 in ISO-2022-JP-x */
1956
0
                    }
1957
0
                }
1958
1959
                /* write the output bytes */
1960
0
                if(len == 1) {
1961
0
                    buffer[outLen++] = static_cast<char>(targetValue);
1962
0
                } else /* len == 2 */ {
1963
0
                    buffer[outLen++] = static_cast<char>(targetValue >> 8);
1964
0
                    buffer[outLen++] = static_cast<char>(targetValue);
1965
0
                }
1966
0
            } else {
1967
                /*
1968
                 * if we cannot find the character after checking all codepages
1969
                 * then this is an error
1970
                 */
1971
0
                *err = U_INVALID_CHAR_FOUND;
1972
0
                cnv->fromUChar32=sourceChar;
1973
0
                break;
1974
0
            }
1975
1976
0
            if(sourceChar == CR || sourceChar == LF) {
1977
                /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1978
0
                pFromU2022State->cs[2] = 0;
1979
0
                choiceCount = 0;
1980
0
            }
1981
1982
            /* output outLen>0 bytes in buffer[] */
1983
0
            if(outLen == 1) {
1984
0
                *target++ = buffer[0];
1985
0
                if(offsets) {
1986
0
                    *offsets++ = static_cast<int32_t>(source - args->source - 1); /* -1: known to be ASCII */
1987
0
                }
1988
0
            } else if(outLen == 2 && (target + 2) <= targetLimit) {
1989
0
                *target++ = buffer[0];
1990
0
                *target++ = buffer[1];
1991
0
                if(offsets) {
1992
0
                    int32_t sourceIndex = static_cast<int32_t>(source - args->source - U16_LENGTH(sourceChar));
1993
0
                    *offsets++ = sourceIndex;
1994
0
                    *offsets++ = sourceIndex;
1995
0
                }
1996
0
            } else {
1997
0
                fromUWriteUInt8(
1998
0
                    cnv,
1999
0
                    buffer, outLen,
2000
0
                    &target, reinterpret_cast<const char*>(targetLimit),
2001
0
                    &offsets, static_cast<int32_t>(source - args->source - U16_LENGTH(sourceChar)),
2002
0
                    err);
2003
0
                if(U_FAILURE(*err)) {
2004
0
                    break;
2005
0
                }
2006
0
            }
2007
0
        } /* end if(myTargetIndex<myTargetLength) */
2008
0
        else{
2009
0
            *err =U_BUFFER_OVERFLOW_ERROR;
2010
0
            break;
2011
0
        }
2012
2013
0
    }/* end while(mySourceIndex<mySourceLength) */
2014
2015
    /*
2016
     * the end of the input stream and detection of truncated input
2017
     * are handled by the framework, but for ISO-2022-JP conversion
2018
     * we need to be in ASCII mode at the very end
2019
     *
2020
     * conditions:
2021
     *   successful
2022
     *   in SO mode or not in ASCII mode
2023
     *   end of input and no truncated input
2024
     */
2025
0
    if( U_SUCCESS(*err) &&
2026
0
        (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
2027
0
        args->flush && source>=sourceLimit && cnv->fromUChar32==0
2028
0
    ) {
2029
0
        int32_t sourceIndex;
2030
2031
0
        outLen = 0;
2032
2033
0
        if(pFromU2022State->g != 0) {
2034
0
            buffer[outLen++] = UCNV_SI;
2035
0
            pFromU2022State->g = 0;
2036
0
        }
2037
2038
0
        if(pFromU2022State->cs[0] != ASCII) {
2039
0
            int32_t escLen = escSeqCharsLen[ASCII];
2040
0
            uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
2041
0
            outLen += escLen;
2042
0
            pFromU2022State->cs[0] = static_cast<int8_t>(ASCII);
2043
0
        }
2044
2045
        /* get the source index of the last input character */
2046
        /*
2047
         * TODO this would be simpler and more reliable if we used a pair
2048
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2049
         * so that we could simply use the prevSourceIndex here;
2050
         * this code gives an incorrect result for the rare case of an unmatched
2051
         * trail surrogate that is alone in the last buffer of the text stream
2052
         */
2053
0
        sourceIndex = static_cast<int32_t>(source - args->source);
2054
0
        if(sourceIndex>0) {
2055
0
            --sourceIndex;
2056
0
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2057
0
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2058
0
            ) {
2059
0
                --sourceIndex;
2060
0
            }
2061
0
        } else {
2062
0
            sourceIndex=-1;
2063
0
        }
2064
2065
0
        fromUWriteUInt8(
2066
0
            cnv,
2067
0
            buffer, outLen,
2068
0
            &target, reinterpret_cast<const char*>(targetLimit),
2069
0
            &offsets, sourceIndex,
2070
0
            err);
2071
0
    }
2072
2073
    /*save the state and return */
2074
0
    args->source = source;
2075
0
    args->target = reinterpret_cast<char*>(target);
2076
0
}
2077
2078
/*************** to unicode *******************/
2079
2080
static void U_CALLCONV
2081
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2082
0
                                               UErrorCode* err){
2083
0
    char tempBuf[2];
2084
0
    const char* mySource = const_cast<char*>(args->source);
2085
0
    char16_t *myTarget = args->target;
2086
0
    const char *mySourceLimit = args->sourceLimit;
2087
0
    uint32_t targetUniChar = 0x0000;
2088
0
    uint32_t mySourceChar = 0x0000;
2089
0
    uint32_t tmpSourceChar = 0x0000;
2090
0
    UConverterDataISO2022* myData;
2091
0
    ISO2022State *pToU2022State;
2092
0
    StateEnum cs;
2093
2094
0
    myData = static_cast<UConverterDataISO2022*>(args->converter->extraInfo);
2095
0
    pToU2022State = &myData->toU2022State;
2096
2097
0
    if(myData->key != 0) {
2098
        /* continue with a partial escape sequence */
2099
0
        goto escape;
2100
0
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2101
        /* continue with a partial double-byte character */
2102
0
        mySourceChar = args->converter->toUBytes[0];
2103
0
        args->converter->toULength = 0;
2104
0
        cs = static_cast<StateEnum>(pToU2022State->cs[pToU2022State->g]);
2105
0
        targetUniChar = missingCharMarker;
2106
0
        goto getTrailByte;
2107
0
    }
2108
2109
0
    while(mySource < mySourceLimit){
2110
2111
0
        targetUniChar =missingCharMarker;
2112
2113
0
        if(myTarget < args->targetLimit){
2114
2115
0
            mySourceChar = static_cast<unsigned char>(*mySource++);
2116
2117
0
            switch(mySourceChar) {
2118
0
            case UCNV_SI:
2119
0
                if(myData->version==3) {
2120
0
                    pToU2022State->g=0;
2121
0
                    continue;
2122
0
                } else {
2123
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2124
0
                    myData->isEmptySegment = false; /* reset this, we have a different error */
2125
0
                    break;
2126
0
                }
2127
2128
0
            case UCNV_SO:
2129
0
                if(myData->version==3) {
2130
                    /* JIS7: switch to G1 half-width Katakana */
2131
0
                    pToU2022State->cs[1] = static_cast<int8_t>(HWKANA_7BIT);
2132
0
                    pToU2022State->g=1;
2133
0
                    continue;
2134
0
                } else {
2135
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2136
0
                    myData->isEmptySegment = false; /* reset this, we have a different error */
2137
0
                    break;
2138
0
                }
2139
2140
0
            case ESC_2022:
2141
0
                mySource--;
2142
0
escape:
2143
0
                {
2144
0
                    const char * mySourceBefore = mySource;
2145
0
                    int8_t toULengthBefore = args->converter->toULength;
2146
2147
0
                    changeState_2022(args->converter,&(mySource),
2148
0
                        mySourceLimit, ISO_2022_JP,err);
2149
2150
                    /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
2151
0
                    if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2152
0
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2153
0
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
2154
0
                        args->converter->toULength = static_cast<int8_t>(toULengthBefore + (mySource - mySourceBefore));
2155
0
                    }
2156
0
                }
2157
2158
                /* invalid or illegal escape sequence */
2159
0
                if(U_FAILURE(*err)){
2160
0
                    args->target = myTarget;
2161
0
                    args->source = mySource;
2162
0
                    myData->isEmptySegment = false; /* Reset to avoid future spurious errors */
2163
0
                    return;
2164
0
                }
2165
                /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2166
0
                if(myData->key==0) {
2167
0
                    myData->isEmptySegment = true;
2168
0
                }
2169
0
                continue;
2170
2171
            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2172
2173
0
            case CR:
2174
0
            case LF:
2175
                /* automatically reset to single-byte mode */
2176
0
                if (static_cast<StateEnum>(pToU2022State->cs[0]) != ASCII &&
2177
0
                    static_cast<StateEnum>(pToU2022State->cs[0]) != JISX201) {
2178
0
                    pToU2022State->cs[0] = static_cast<int8_t>(ASCII);
2179
0
                }
2180
0
                pToU2022State->cs[2] = 0;
2181
0
                pToU2022State->g = 0;
2182
0
                U_FALLTHROUGH;
2183
0
            default:
2184
                /* convert one or two bytes */
2185
0
                myData->isEmptySegment = false;
2186
0
                cs = static_cast<StateEnum>(pToU2022State->cs[pToU2022State->g]);
2187
0
                if (static_cast<uint8_t>(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version == 4 &&
2188
0
                    !IS_JP_DBCS(cs)
2189
0
                ) {
2190
                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2191
0
                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2192
2193
                    /* return from a single-shift state to the previous one */
2194
0
                    if(pToU2022State->g >= 2) {
2195
0
                        pToU2022State->g=pToU2022State->prevG;
2196
0
                    }
2197
0
                } else switch(cs) {
2198
0
                case ASCII:
2199
0
                    if(mySourceChar <= 0x7f) {
2200
0
                        targetUniChar = mySourceChar;
2201
0
                    }
2202
0
                    break;
2203
0
                case ISO8859_1:
2204
0
                    if(mySourceChar <= 0x7f) {
2205
0
                        targetUniChar = mySourceChar + 0x80;
2206
0
                    }
2207
                    /* return from a single-shift state to the previous one */
2208
0
                    pToU2022State->g=pToU2022State->prevG;
2209
0
                    break;
2210
0
                case ISO8859_7:
2211
0
                    if(mySourceChar <= 0x7f) {
2212
                        /* convert mySourceChar+0x80 to use a normal 8-bit table */
2213
0
                        targetUniChar =
2214
0
                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2215
0
                                myData->myConverterArray[cs],
2216
0
                                mySourceChar + 0x80);
2217
0
                    }
2218
                    /* return from a single-shift state to the previous one */
2219
0
                    pToU2022State->g=pToU2022State->prevG;
2220
0
                    break;
2221
0
                case JISX201:
2222
0
                    if(mySourceChar <= 0x7f) {
2223
0
                        targetUniChar = jisx201ToU(mySourceChar);
2224
0
                    }
2225
0
                    break;
2226
0
                case HWKANA_7BIT:
2227
0
                    if (static_cast<uint8_t>(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2228
                        /* 7-bit halfwidth Katakana */
2229
0
                        targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2230
0
                    }
2231
0
                    break;
2232
0
                default:
2233
                    /* G0 DBCS */
2234
0
                    if(mySource < mySourceLimit) {
2235
0
                        int leadIsOk, trailIsOk;
2236
0
                        uint8_t trailByte;
2237
0
getTrailByte:
2238
0
                        trailByte = static_cast<uint8_t>(*mySource);
2239
                        /*
2240
                         * Ticket 5691: consistent illegal sequences:
2241
                         * - We include at least the first byte in the illegal sequence.
2242
                         * - If any of the non-initial bytes could be the start of a character,
2243
                         *   we stop the illegal sequence before the first one of those.
2244
                         *
2245
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2246
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2247
                         * Otherwise we convert or report the pair of bytes.
2248
                         */
2249
0
                        leadIsOk = static_cast<uint8_t>(mySourceChar - 0x21) <= (0x7e - 0x21);
2250
0
                        trailIsOk = static_cast<uint8_t>(trailByte - 0x21) <= (0x7e - 0x21);
2251
0
                        if (leadIsOk && trailIsOk) {
2252
0
                            ++mySource;
2253
0
                            tmpSourceChar = (mySourceChar << 8) | trailByte;
2254
0
                            if(cs == JISX208) {
2255
0
                                _2022ToSJIS(static_cast<uint8_t>(mySourceChar), trailByte, tempBuf);
2256
0
                                mySourceChar = tmpSourceChar;
2257
0
                            } else {
2258
                                /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2259
0
                                mySourceChar = tmpSourceChar;
2260
0
                                if (cs == KSC5601) {
2261
0
                                    tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
2262
0
                                }
2263
0
                                tempBuf[0] = static_cast<char>(tmpSourceChar >> 8);
2264
0
                                tempBuf[1] = static_cast<char>(tmpSourceChar);
2265
0
                            }
2266
0
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, false);
2267
0
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2268
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2269
0
                            ++mySource;
2270
                            /* add another bit so that the code below writes 2 bytes in case of error */
2271
0
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2272
0
                        }
2273
0
                    } else {
2274
0
                        args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
2275
0
                        args->converter->toULength = 1;
2276
0
                        goto endloop;
2277
0
                    }
2278
0
                }  /* End of inner switch */
2279
0
                break;
2280
0
            }  /* End of outer switch */
2281
0
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2282
0
                if(args->offsets){
2283
0
                    args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2284
0
                }
2285
0
                *(myTarget++) = static_cast<char16_t>(targetUniChar);
2286
0
            }
2287
0
            else if(targetUniChar > missingCharMarker){
2288
                /* disassemble the surrogate pair and write to output*/
2289
0
                targetUniChar-=0x0010000;
2290
0
                *myTarget = static_cast<char16_t>(0xd800 + static_cast<char16_t>(targetUniChar >> 10));
2291
0
                if(args->offsets){
2292
0
                    args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2293
0
                }
2294
0
                ++myTarget;
2295
0
                if(myTarget< args->targetLimit){
2296
0
                    *myTarget = static_cast<char16_t>(0xdc00 + static_cast<char16_t>(targetUniChar & 0x3ff));
2297
0
                    if(args->offsets){
2298
0
                        args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2299
0
                    }
2300
0
                    ++myTarget;
2301
0
                }else{
2302
0
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2303
0
                                    static_cast<char16_t>(0xdc00 + static_cast<char16_t>(targetUniChar & 0x3ff));
2304
0
                }
2305
2306
0
            }
2307
0
            else{
2308
                /* Call the callback function*/
2309
0
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2310
0
                break;
2311
0
            }
2312
0
        }
2313
0
        else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
2314
0
            *err =U_BUFFER_OVERFLOW_ERROR;
2315
0
            break;
2316
0
        }
2317
0
    }
2318
0
endloop:
2319
0
    args->target = myTarget;
2320
0
    args->source = mySource;
2321
0
}
2322
2323
2324
#if !UCONFIG_ONLY_HTML_CONVERSION
2325
/***************************************************************
*   Rules for ISO-2022-KR encoding
*   i) The KSC5601 designator sequence should appear only once in a file,
*      at the beginning of a line before any KSC5601 characters. This usually
*      means that it appears by itself on the first line of the file.
*  ii) There are only 2 shifting sequences: SO to shift into double-byte mode
*      and SI to shift into single-byte mode.
*/
2333
static void U_CALLCONV
2334
0
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2335
2336
0
    UConverter* saveConv = args->converter;
2337
0
    UConverterDataISO2022* myConverterData = static_cast<UConverterDataISO2022*>(saveConv->extraInfo);
2338
0
    args->converter=myConverterData->currentConverter;
2339
2340
0
    myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2341
0
    ucnv_MBCSFromUnicodeWithOffsets(args,err);
2342
0
    saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2343
2344
0
    if(*err == U_BUFFER_OVERFLOW_ERROR) {
2345
0
        if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2346
0
            uprv_memcpy(
2347
0
                saveConv->charErrorBuffer,
2348
0
                myConverterData->currentConverter->charErrorBuffer,
2349
0
                myConverterData->currentConverter->charErrorBufferLength);
2350
0
        }
2351
0
        saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2352
0
        myConverterData->currentConverter->charErrorBufferLength = 0;
2353
0
    }
2354
0
    args->converter=saveConv;
2355
0
}
2356
2357
static void U_CALLCONV
2358
0
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2359
2360
0
    const char16_t *source = args->source;
2361
0
    const char16_t *sourceLimit = args->sourceLimit;
2362
0
    unsigned char *target = reinterpret_cast<unsigned char*>(args->target);
2363
0
    unsigned char *targetLimit = reinterpret_cast<unsigned char*>(const_cast<char*>(args->targetLimit));
2364
0
    int32_t* offsets = args->offsets;
2365
0
    uint32_t targetByteUnit = 0x0000;
2366
0
    UChar32 sourceChar = 0x0000;
2367
0
    UBool isTargetByteDBCS;
2368
0
    UBool oldIsTargetByteDBCS;
2369
0
    UConverterDataISO2022 *converterData;
2370
0
    UConverterSharedData* sharedData;
2371
0
    UBool useFallback;
2372
0
    int32_t length =0;
2373
2374
0
    converterData = static_cast<UConverterDataISO2022*>(args->converter->extraInfo);
2375
    /* if the version is 1 then the user is requesting
2376
     * conversion with ibm-25546 pass the arguments to
2377
     * MBCS converter and return
2378
     */
2379
0
    if(converterData->version==1){
2380
0
        UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2381
0
        return;
2382
0
    }
2383
2384
    /* initialize data */
2385
0
    sharedData = converterData->currentConverter->sharedData;
2386
0
    useFallback = args->converter->useFallback;
2387
0
    isTargetByteDBCS = static_cast<UBool>(args->converter->fromUnicodeStatus);
2388
0
    oldIsTargetByteDBCS = isTargetByteDBCS;
2389
2390
0
    isTargetByteDBCS = static_cast<UBool>(args->converter->fromUnicodeStatus);
2391
0
    if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2392
0
        goto getTrail;
2393
0
    }
2394
0
    while(source < sourceLimit){
2395
2396
0
        targetByteUnit = missingCharMarker;
2397
2398
0
        if(target < (unsigned char*) args->targetLimit){
2399
0
            sourceChar = *source++;
2400
2401
            /* do not convert SO/SI/ESC */
2402
0
            if(IS_2022_CONTROL(sourceChar)) {
2403
                /* callback(illegal) */
2404
0
                *err=U_ILLEGAL_CHAR_FOUND;
2405
0
                args->converter->fromUChar32=sourceChar;
2406
0
                break;
2407
0
            }
2408
2409
0
            length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2410
0
            if(length < 0) {
2411
0
                length = -length;  /* fallback */
2412
0
            }
2413
            /* only DBCS or SBCS characters are expected*/
2414
            /* DB characters with high bit set to 1 are expected */
2415
0
            if( length > 2 || length==0 ||
2416
0
                (length == 1 && targetByteUnit > 0x7f) ||
2417
0
                (length == 2 &&
2418
0
                    (static_cast<uint16_t>(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2419
0
                    static_cast<uint8_t>(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2420
0
            ) {
2421
0
                targetByteUnit=missingCharMarker;
2422
0
            }
2423
0
            if (targetByteUnit != missingCharMarker){
2424
2425
0
                oldIsTargetByteDBCS = isTargetByteDBCS;
2426
0
                isTargetByteDBCS = static_cast<UBool>(targetByteUnit > 0x00FF);
2427
                  /* append the shift sequence */
2428
0
                if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2429
2430
0
                    if (isTargetByteDBCS)
2431
0
                        *target++ = UCNV_SO;
2432
0
                    else
2433
0
                        *target++ = UCNV_SI;
2434
0
                    if(offsets)
2435
0
                        *(offsets++) = static_cast<int32_t>(source - args->source - 1);
2436
0
                }
2437
                /* write the targetUniChar  to target */
2438
0
                if(targetByteUnit <= 0x00FF){
2439
0
                    if( target < targetLimit){
2440
0
                        *(target++) = static_cast<unsigned char>(targetByteUnit);
2441
0
                        if(offsets){
2442
0
                            *(offsets++) = static_cast<int32_t>(source - args->source - 1);
2443
0
                        }
2444
2445
0
                    }else{
2446
0
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = static_cast<unsigned char>(targetByteUnit);
2447
0
                        *err = U_BUFFER_OVERFLOW_ERROR;
2448
0
                    }
2449
0
                }else{
2450
0
                    if(target < targetLimit){
2451
0
                        *(target++) = static_cast<unsigned char>((targetByteUnit >> 8) - 0x80);
2452
0
                        if(offsets){
2453
0
                            *(offsets++) = static_cast<int32_t>(source - args->source - 1);
2454
0
                        }
2455
0
                        if(target < targetLimit){
2456
0
                            *(target++) = static_cast<unsigned char>(targetByteUnit - 0x80);
2457
0
                            if(offsets){
2458
0
                                *(offsets++) = static_cast<int32_t>(source - args->source - 1);
2459
0
                            }
2460
0
                        }else{
2461
0
                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = static_cast<unsigned char>(targetByteUnit - 0x80);
2462
0
                            *err = U_BUFFER_OVERFLOW_ERROR;
2463
0
                        }
2464
0
                    }else{
2465
0
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = static_cast<unsigned char>((targetByteUnit >> 8) - 0x80);
2466
0
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = static_cast<unsigned char>(targetByteUnit - 0x80);
2467
0
                        *err = U_BUFFER_OVERFLOW_ERROR;
2468
0
                    }
2469
0
                }
2470
2471
0
            }
2472
0
            else{
2473
                /* oops.. the code point is unassingned
2474
                 * set the error and reason
2475
                 */
2476
2477
                /*check if the char is a First surrogate*/
2478
0
                if(U16_IS_SURROGATE(sourceChar)) {
2479
0
                    if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2480
0
getTrail:
2481
                        /*look ahead to find the trail surrogate*/
2482
0
                        if(source <  sourceLimit) {
2483
                            /* test the following code unit */
2484
0
                            char16_t trail = *source;
2485
0
                            if(U16_IS_TRAIL(trail)) {
2486
0
                                source++;
2487
0
                                sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2488
0
                                *err = U_INVALID_CHAR_FOUND;
2489
                                /* convert this surrogate code point */
2490
                                /* exit this condition tree */
2491
0
                            } else {
2492
                                /* this is an unmatched lead code unit (1st surrogate) */
2493
                                /* callback(illegal) */
2494
0
                                *err=U_ILLEGAL_CHAR_FOUND;
2495
0
                            }
2496
0
                        } else {
2497
                            /* no more input */
2498
0
                            *err = U_ZERO_ERROR;
2499
0
                        }
2500
0
                    } else {
2501
                        /* this is an unmatched trail code unit (2nd surrogate) */
2502
                        /* callback(illegal) */
2503
0
                        *err=U_ILLEGAL_CHAR_FOUND;
2504
0
                    }
2505
0
                } else {
2506
                    /* callback(unassigned) for a BMP code point */
2507
0
                    *err = U_INVALID_CHAR_FOUND;
2508
0
                }
2509
2510
0
                args->converter->fromUChar32=sourceChar;
2511
0
                break;
2512
0
            }
2513
0
        } /* end if(myTargetIndex<myTargetLength) */
2514
0
        else{
2515
0
            *err =U_BUFFER_OVERFLOW_ERROR;
2516
0
            break;
2517
0
        }
2518
2519
0
    }/* end while(mySourceIndex<mySourceLength) */
2520
2521
    /*
2522
     * the end of the input stream and detection of truncated input
2523
     * are handled by the framework, but for ISO-2022-KR conversion
2524
     * we need to be in ASCII mode at the very end
2525
     *
2526
     * conditions:
2527
     *   successful
2528
     *   not in ASCII mode
2529
     *   end of input and no truncated input
2530
     */
2531
0
    if( U_SUCCESS(*err) &&
2532
0
        isTargetByteDBCS &&
2533
0
        args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2534
0
    ) {
2535
0
        int32_t sourceIndex;
2536
2537
        /* we are switching to ASCII */
2538
0
        isTargetByteDBCS=false;
2539
2540
        /* get the source index of the last input character */
2541
        /*
2542
         * TODO this would be simpler and more reliable if we used a pair
2543
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2544
         * so that we could simply use the prevSourceIndex here;
2545
         * this code gives an incorrect result for the rare case of an unmatched
2546
         * trail surrogate that is alone in the last buffer of the text stream
2547
         */
2548
0
        sourceIndex = static_cast<int32_t>(source - args->source);
2549
0
        if(sourceIndex>0) {
2550
0
            --sourceIndex;
2551
0
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2552
0
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2553
0
            ) {
2554
0
                --sourceIndex;
2555
0
            }
2556
0
        } else {
2557
0
            sourceIndex=-1;
2558
0
        }
2559
2560
0
        fromUWriteUInt8(
2561
0
            args->converter,
2562
0
            SHIFT_IN_STR, 1,
2563
0
            &target, reinterpret_cast<const char*>(targetLimit),
2564
0
            &offsets, sourceIndex,
2565
0
            err);
2566
0
    }
2567
2568
    /*save the state and return */
2569
0
    args->source = source;
2570
0
    args->target = reinterpret_cast<char*>(target);
2571
0
    args->converter->fromUnicodeStatus = static_cast<uint32_t>(isTargetByteDBCS);
2572
0
}
2573
2574
/************************ To Unicode ***************************************/
2575
2576
static void U_CALLCONV
2577
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2578
0
                                                            UErrorCode* err){
2579
0
    char const* sourceStart;
2580
0
    UConverterDataISO2022* myData = static_cast<UConverterDataISO2022*>(args->converter->extraInfo);
2581
2582
0
    UConverterToUnicodeArgs subArgs;
2583
0
    int32_t minArgsSize;
2584
2585
    /* set up the subconverter arguments */
2586
0
    if(args->size<sizeof(UConverterToUnicodeArgs)) {
2587
0
        minArgsSize = args->size;
2588
0
    } else {
2589
0
        minArgsSize = static_cast<int32_t>(sizeof(UConverterToUnicodeArgs));
2590
0
    }
2591
2592
0
    uprv_memcpy(&subArgs, args, minArgsSize);
2593
0
    subArgs.size = static_cast<uint16_t>(minArgsSize);
2594
0
    subArgs.converter = myData->currentConverter;
2595
2596
    /* remember the original start of the input for offsets */
2597
0
    sourceStart = args->source;
2598
2599
0
    if(myData->key != 0) {
2600
        /* continue with a partial escape sequence */
2601
0
        goto escape;
2602
0
    }
2603
2604
0
    while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2605
        /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2606
0
        subArgs.source = args->source;
2607
0
        subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2608
0
        if(subArgs.source != subArgs.sourceLimit) {
2609
            /*
2610
             * get the current partial byte sequence
2611
             *
2612
             * it needs to be moved between the public and the subconverter
2613
             * so that the conversion framework, which only sees the public
2614
             * converter, can handle truncated and illegal input etc.
2615
             */
2616
0
            if(args->converter->toULength > 0) {
2617
0
                uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2618
0
            }
2619
0
            subArgs.converter->toULength = args->converter->toULength;
2620
2621
            /*
2622
             * Convert up to the end of the input, or to before the next escape character.
2623
             * Does not handle conversion extensions because the preToU[] state etc.
2624
             * is not copied.
2625
             */
2626
0
            ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2627
2628
0
            if(args->offsets != nullptr && sourceStart != args->source) {
2629
                /* update offsets to base them on the actual start of the input */
2630
0
                int32_t *offsets = args->offsets;
2631
0
                char16_t *target = args->target;
2632
0
                int32_t delta = static_cast<int32_t>(args->source - sourceStart);
2633
0
                while(target < subArgs.target) {
2634
0
                    if(*offsets >= 0) {
2635
0
                        *offsets += delta;
2636
0
                    }
2637
0
                    ++offsets;
2638
0
                    ++target;
2639
0
                }
2640
0
            }
2641
0
            args->source = subArgs.source;
2642
0
            args->target = subArgs.target;
2643
0
            args->offsets = subArgs.offsets;
2644
2645
            /* copy input/error/overflow buffers */
2646
0
            if(subArgs.converter->toULength > 0) {
2647
0
                uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2648
0
            }
2649
0
            args->converter->toULength = subArgs.converter->toULength;
2650
2651
0
            if(*err == U_BUFFER_OVERFLOW_ERROR) {
2652
0
                if(subArgs.converter->UCharErrorBufferLength > 0) {
2653
0
                    uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2654
0
                                subArgs.converter->UCharErrorBufferLength);
2655
0
                }
2656
0
                args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2657
0
                subArgs.converter->UCharErrorBufferLength = 0;
2658
0
            }
2659
0
        }
2660
2661
0
        if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2662
0
            return;
2663
0
        }
2664
2665
0
escape:
2666
0
        changeState_2022(args->converter,
2667
0
               &(args->source),
2668
0
               args->sourceLimit,
2669
0
               ISO_2022_KR,
2670
0
               err);
2671
0
    }
2672
0
}
2673
2674
static void U_CALLCONV
2675
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2676
0
                                                            UErrorCode* err){
2677
0
    char tempBuf[2];
2678
0
    const char* mySource = const_cast<char*>(args->source);
2679
0
    char16_t *myTarget = args->target;
2680
0
    const char *mySourceLimit = args->sourceLimit;
2681
0
    UChar32 targetUniChar = 0x0000;
2682
0
    char16_t mySourceChar = 0x0000;
2683
0
    UConverterDataISO2022* myData;
2684
0
    UConverterSharedData* sharedData ;
2685
0
    UBool useFallback;
2686
2687
0
    myData = static_cast<UConverterDataISO2022*>(args->converter->extraInfo);
2688
0
    if(myData->version==1){
2689
0
        UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2690
0
        return;
2691
0
    }
2692
2693
    /* initialize state */
2694
0
    sharedData = myData->currentConverter->sharedData;
2695
0
    useFallback = args->converter->useFallback;
2696
2697
0
    if(myData->key != 0) {
2698
        /* continue with a partial escape sequence */
2699
0
        goto escape;
2700
0
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2701
        /* continue with a partial double-byte character */
2702
0
        mySourceChar = args->converter->toUBytes[0];
2703
0
        args->converter->toULength = 0;
2704
0
        goto getTrailByte;
2705
0
    }
2706
2707
0
    while(mySource< mySourceLimit){
2708
2709
0
        if(myTarget < args->targetLimit){
2710
2711
0
            mySourceChar = static_cast<unsigned char>(*mySource++);
2712
2713
0
            if(mySourceChar==UCNV_SI){
2714
0
                myData->toU2022State.g = 0;
2715
0
                if (myData->isEmptySegment) {
2716
0
                    myData->isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
2717
0
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2718
0
                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
2719
0
                    args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
2720
0
                    args->converter->toULength = 1;
2721
0
                    args->target = myTarget;
2722
0
                    args->source = mySource;
2723
0
                    return;
2724
0
                }
2725
                /*consume the source */
2726
0
                continue;
2727
0
            }else if(mySourceChar==UCNV_SO){
2728
0
                myData->toU2022State.g = 1;
2729
0
                myData->isEmptySegment = true;  /* Begin a new segment, empty so far */
2730
                /*consume the source */
2731
0
                continue;
2732
0
            }else if(mySourceChar==ESC_2022){
2733
0
                mySource--;
2734
0
escape:
2735
0
                myData->isEmptySegment = false; /* Any invalid ESC sequences will be detected separately, so just reset this */
2736
0
                changeState_2022(args->converter,&(mySource),
2737
0
                                mySourceLimit, ISO_2022_KR, err);
2738
0
                if(U_FAILURE(*err)){
2739
0
                    args->target = myTarget;
2740
0
                    args->source = mySource;
2741
0
                    return;
2742
0
                }
2743
0
                continue;
2744
0
            }
2745
2746
0
            myData->isEmptySegment = false; /* Any invalid char errors will be detected separately, so just reset this */
2747
0
            if(myData->toU2022State.g == 1) {
2748
0
                if(mySource < mySourceLimit) {
2749
0
                    int leadIsOk, trailIsOk;
2750
0
                    uint8_t trailByte;
2751
0
getTrailByte:
2752
0
                    targetUniChar = missingCharMarker;
2753
0
                    trailByte = static_cast<uint8_t>(*mySource);
2754
                    /*
2755
                     * Ticket 5691: consistent illegal sequences:
2756
                     * - We include at least the first byte in the illegal sequence.
2757
                     * - If any of the non-initial bytes could be the start of a character,
2758
                     *   we stop the illegal sequence before the first one of those.
2759
                     *
2760
                     * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2761
                     * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2762
                     * Otherwise we convert or report the pair of bytes.
2763
                     */
2764
0
                    leadIsOk = static_cast<uint8_t>(mySourceChar - 0x21) <= (0x7e - 0x21);
2765
0
                    trailIsOk = static_cast<uint8_t>(trailByte - 0x21) <= (0x7e - 0x21);
2766
0
                    if (leadIsOk && trailIsOk) {
2767
0
                        ++mySource;
2768
0
                        tempBuf[0] = static_cast<char>(mySourceChar + 0x80);
2769
0
                        tempBuf[1] = static_cast<char>(trailByte + 0x80);
2770
0
                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2771
0
                        mySourceChar = (mySourceChar << 8) | trailByte;
2772
0
                    } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2773
                        /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2774
0
                        ++mySource;
2775
                        /* add another bit so that the code below writes 2 bytes in case of error */
2776
0
                        mySourceChar = static_cast<char16_t>(0x10000 | (mySourceChar << 8) | trailByte);
2777
0
                    }
2778
0
                } else {
2779
0
                    args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
2780
0
                    args->converter->toULength = 1;
2781
0
                    break;
2782
0
                }
2783
0
            }
2784
0
            else if(mySourceChar <= 0x7f) {
2785
0
                targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2786
0
            } else {
2787
0
                targetUniChar = 0xffff;
2788
0
            }
2789
0
            if(targetUniChar < 0xfffe){
2790
0
                if(args->offsets) {
2791
0
                    args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2792
0
                }
2793
0
                *(myTarget++) = static_cast<char16_t>(targetUniChar);
2794
0
            }
2795
0
            else {
2796
                /* Call the callback function*/
2797
0
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2798
0
                break;
2799
0
            }
2800
0
        }
2801
0
        else{
2802
0
            *err =U_BUFFER_OVERFLOW_ERROR;
2803
0
            break;
2804
0
        }
2805
0
    }
2806
0
    args->target = myTarget;
2807
0
    args->source = mySource;
2808
0
}
2809
2810
/*************************** END ISO2022-KR *********************************/
2811
2812
/*************************** ISO-2022-CN *********************************
2813
*
2814
* Rules for ISO-2022-CN Encoding:
2815
* i)   The designator sequence must appear once on a line before any instance
2816
*      of character set it designates.
2817
* ii)  If two lines contain characters from the same character set, both lines
2818
*      must include the designator sequence.
2819
* iii) Once the designator sequence is known, a shifting sequence has to be found
2820
*      to invoke the designated character set.
2821
* iv)  All lines start in ASCII and end in ASCII.
2822
* v)   Four shifting sequences are employed for this purpose:
2823
*
2824
*      Sequence    ASCII Eq    Charsets
2825
*      ----------  -------    ---------
2826
*      SI           <SI>        US-ASCII
2827
*      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2828
*      SS2          <ESC>N      CNS-11643-1992 Plane 2
2829
*      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
2830
*
2831
* vi)
2832
*      SOdesignator  : ESC "$" ")" finalchar_for_SO
2833
*      SS2designator : ESC "$" "*" finalchar_for_SS2
2834
*      SS3designator : ESC "$" "+" finalchar_for_SS3
2835
*
2836
*      ESC $ ) A       Indicates the bytes following SO are Chinese
2837
*       characters as defined in GB 2312-80, until
2838
*       another SOdesignation appears
2839
*
2840
*
2841
*      ESC $ ) E       Indicates the bytes following SO are as defined
2842
*       in ISO-IR-165 (for details, see section 2.1),
2843
*       until another SOdesignation appears
2844
*
2845
*      ESC $ ) G       Indicates the bytes following SO are as defined
2846
*       in CNS 11643-plane-1, until another
2847
*       SOdesignation appears
2848
*
2849
*      ESC $ * H       Indicates the two bytes immediately following
2850
*       SS2 is a Chinese character as defined in CNS
2851
*       11643-plane-2, until another SS2designation
2852
*       appears
2853
*       (Meaning <ESC>N must precede every 2 byte
2854
*        sequence.)
2855
*
2856
*      ESC $ + I       Indicates the immediate two bytes following SS3
2857
*       is a Chinese character as defined in CNS
2858
*       11643-plane-3, until another SS3designation
2859
*       appears
2860
*       (Meaning <ESC>O must precede every 2 byte
2861
*        sequence.)
2862
*
2863
*      ESC $ + J       Indicates the immediate two bytes following SS3
2864
*       is a Chinese character as defined in CNS
2865
*       11643-plane-4, until another SS3designation
2866
*       appears
2867
*       (In English: <ESC>O must precede every 2 byte
2868
*        sequence.)
2869
*
2870
*      ESC $ + K       Indicates the immediate two bytes following SS3
2871
*       is a Chinese character as defined in CNS
2872
*       11643-plane-5, until another SS3designation
2873
*       appears
2874
*
2875
*      ESC $ + L       Indicates the immediate two bytes following SS3
2876
*       is a Chinese character as defined in CNS
2877
*       11643-plane-6, until another SS3designation
2878
*       appears
2879
*
2880
*      ESC $ + M       Indicates the immediate two bytes following SS3
2881
*       is a Chinese character as defined in CNS
2882
*       11643-plane-7, until another SS3designation
2883
*       appears
2884
*
2885
*       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2886
*       has its own designation information before any Chinese characters
2887
*       appear
2888
*
2889
*/
2890
2891
/* The following are defined this way to make the strings truly readonly */
2892
/*
 * Four-byte designator sequences, spelled as hex escapes.
 * The trailing comment on each line gives the same bytes read as ASCII.
 */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; /* ESC $ ) A */
2893
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; /* ESC $ ) E */
2894
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
2895
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
2896
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
2897
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
2898
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
2899
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
2900
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
2901
2902
/********************** ISO2022-CN Data **************************/
2903
/*
 * Escape/shift strings for ISO-2022-CN output, one per charset.
 * The fromUnicode code indexes entries 0..2 directly by charset value and
 * reaches the CNS 11643 plane entries as
 * escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)].
 */
static const char* const escSeqCharsCN[10] ={
2904
        SHIFT_IN_STR,                   /* 0 ASCII */
2905
        GB_2312_80_STR,                 /* 1 GB2312_1 */
2906
        ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
2907
        CNS_11643_1992_Plane_1_STR,     /* 3 CNS 11643 plane 1 */
2908
        CNS_11643_1992_Plane_2_STR,     /* 4 CNS 11643 plane 2 */
2909
        CNS_11643_1992_Plane_3_STR,     /* 5 CNS 11643 plane 3 */
2910
        CNS_11643_1992_Plane_4_STR,     /* 6 CNS 11643 plane 4 */
2911
        CNS_11643_1992_Plane_5_STR,     /* 7 CNS 11643 plane 5 */
2912
        CNS_11643_1992_Plane_6_STR,     /* 8 CNS 11643 plane 6 */
2913
        CNS_11643_1992_Plane_7_STR      /* 9 CNS 11643 plane 7 */
2914
};
2915
2916
static void U_CALLCONV
2917
0
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2918
0
    UConverter *cnv = args->converter;
2919
0
    UConverterDataISO2022 *converterData;
2920
0
    ISO2022State *pFromU2022State;
2921
0
    uint8_t* target = reinterpret_cast<uint8_t*>(args->target);
2922
0
    const uint8_t* targetLimit = reinterpret_cast<const uint8_t*>(args->targetLimit);
2923
0
    const char16_t* source = args->source;
2924
0
    const char16_t* sourceLimit = args->sourceLimit;
2925
0
    int32_t* offsets = args->offsets;
2926
0
    UChar32 sourceChar;
2927
0
    char buffer[8];
2928
0
    int32_t len;
2929
0
    int8_t choices[3];
2930
0
    int32_t choiceCount;
2931
0
    uint32_t targetValue = 0;
2932
0
    UBool useFallback;
2933
2934
    /* set up the state */
2935
0
    converterData = static_cast<UConverterDataISO2022*>(cnv->extraInfo);
2936
0
    pFromU2022State   = &converterData->fromU2022State;
2937
2938
0
    choiceCount = 0;
2939
2940
    /* check if the last codepoint of previous buffer was a lead surrogate*/
2941
0
    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2942
0
        goto getTrail;
2943
0
    }
2944
2945
0
    while( source < sourceLimit){
2946
0
        if(target < targetLimit){
2947
2948
0
            sourceChar  = *(source++);
2949
            /*check if the char is a First surrogate*/
2950
0
             if(U16_IS_SURROGATE(sourceChar)) {
2951
0
                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2952
0
getTrail:
2953
                    /*look ahead to find the trail surrogate*/
2954
0
                    if(source < sourceLimit) {
2955
                        /* test the following code unit */
2956
0
                        char16_t trail = *source;
2957
0
                        if(U16_IS_TRAIL(trail)) {
2958
0
                            source++;
2959
0
                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2960
0
                            cnv->fromUChar32=0x00;
2961
                            /* convert this supplementary code point */
2962
                            /* exit this condition tree */
2963
0
                        } else {
2964
                            /* this is an unmatched lead code unit (1st surrogate) */
2965
                            /* callback(illegal) */
2966
0
                            *err=U_ILLEGAL_CHAR_FOUND;
2967
0
                            cnv->fromUChar32=sourceChar;
2968
0
                            break;
2969
0
                        }
2970
0
                    } else {
2971
                        /* no more input */
2972
0
                        cnv->fromUChar32=sourceChar;
2973
0
                        break;
2974
0
                    }
2975
0
                } else {
2976
                    /* this is an unmatched trail code unit (2nd surrogate) */
2977
                    /* callback(illegal) */
2978
0
                    *err=U_ILLEGAL_CHAR_FOUND;
2979
0
                    cnv->fromUChar32=sourceChar;
2980
0
                    break;
2981
0
                }
2982
0
            }
2983
2984
            /* do the conversion */
2985
0
            if(sourceChar <= 0x007f ){
2986
                /* do not convert SO/SI/ESC */
2987
0
                if(IS_2022_CONTROL(sourceChar)) {
2988
                    /* callback(illegal) */
2989
0
                    *err=U_ILLEGAL_CHAR_FOUND;
2990
0
                    cnv->fromUChar32=sourceChar;
2991
0
                    break;
2992
0
                }
2993
2994
                /* US-ASCII */
2995
0
                if(pFromU2022State->g == 0) {
2996
0
                    buffer[0] = static_cast<char>(sourceChar);
2997
0
                    len = 1;
2998
0
                } else {
2999
0
                    buffer[0] = UCNV_SI;
3000
0
                    buffer[1] = static_cast<char>(sourceChar);
3001
0
                    len = 2;
3002
0
                    pFromU2022State->g = 0;
3003
0
                    choiceCount = 0;
3004
0
                }
3005
0
                if(sourceChar == CR || sourceChar == LF) {
3006
                    /* reset the state at the end of a line */
3007
0
                    uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
3008
0
                    choiceCount = 0;
3009
0
                }
3010
0
            }
3011
0
            else{
3012
                /* convert U+0080..U+10ffff */
3013
0
                int32_t i;
3014
0
                int8_t cs, g;
3015
3016
0
                if(choiceCount == 0) {
3017
                    /* try the current SO/G1 converter first */
3018
0
                    choices[0] = pFromU2022State->cs[1];
3019
3020
                    /* default to GB2312_1 if none is designated yet */
3021
0
                    if(choices[0] == 0) {
3022
0
                        choices[0] = GB2312_1;
3023
0
                    }
3024
3025
0
                    if(converterData->version == 0) {
3026
                        /* ISO-2022-CN */
3027
3028
                        /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
3029
0
                        if(choices[0] == GB2312_1) {
3030
0
                            choices[1] = static_cast<int8_t>(CNS_11643_1);
3031
0
                        } else {
3032
0
                            choices[1] = static_cast<int8_t>(GB2312_1);
3033
0
                        }
3034
3035
0
                        choiceCount = 2;
3036
0
                    } else if (converterData->version == 1) {
3037
                        /* ISO-2022-CN-EXT */
3038
3039
                        /* try one of the other converters */
3040
0
                        switch(choices[0]) {
3041
0
                        case GB2312_1:
3042
0
                            choices[1] = static_cast<int8_t>(CNS_11643_1);
3043
0
                            choices[2] = static_cast<int8_t>(ISO_IR_165);
3044
0
                            break;
3045
0
                        case ISO_IR_165:
3046
0
                            choices[1] = static_cast<int8_t>(GB2312_1);
3047
0
                            choices[2] = static_cast<int8_t>(CNS_11643_1);
3048
0
                            break;
3049
0
                        default: /* CNS_11643_x */
3050
0
                            choices[1] = static_cast<int8_t>(GB2312_1);
3051
0
                            choices[2] = static_cast<int8_t>(ISO_IR_165);
3052
0
                            break;
3053
0
                        }
3054
3055
0
                        choiceCount = 3;
3056
0
                    } else {
3057
0
                        choices[0] = static_cast<int8_t>(CNS_11643_1);
3058
0
                        choices[1] = static_cast<int8_t>(GB2312_1);
3059
0
                    }
3060
0
                }
3061
3062
0
                cs = g = 0;
3063
                /*
3064
                 * len==0: no mapping found yet
3065
                 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3066
                 * len>0: found a roundtrip result, done
3067
                 */
3068
0
                len = 0;
3069
                /*
3070
                 * We will turn off useFallback after finding a fallback,
3071
                 * but we still get fallbacks from PUA code points as usual.
3072
                 * Therefore, we will also need to check that we don't overwrite
3073
                 * an early fallback with a later one.
3074
                 */
3075
0
                useFallback = cnv->useFallback;
3076
3077
0
                for(i = 0; i < choiceCount && len <= 0; ++i) {
3078
0
                    int8_t cs0 = choices[i];
3079
0
                    if(cs0 > 0) {
3080
0
                        uint32_t value;
3081
0
                        int32_t len2;
3082
0
                        if(cs0 >= CNS_11643_0) {
3083
0
                            len2 = MBCS_FROM_UCHAR32_ISO2022(
3084
0
                                        converterData->myConverterArray[CNS_11643],
3085
0
                                        sourceChar,
3086
0
                                        &value,
3087
0
                                        useFallback,
3088
0
                                        MBCS_OUTPUT_3);
3089
0
                            if(len2 == 3 || (len2 == -3 && len == 0)) {
3090
0
                                targetValue = value;
3091
0
                                cs = static_cast<int8_t>(CNS_11643_0 + (value >> 16) - 0x80);
3092
0
                                if(len2 >= 0) {
3093
0
                                    len = 2;
3094
0
                                } else {
3095
0
                                    len = -2;
3096
0
                                    useFallback = false;
3097
0
                                }
3098
0
                                if(cs == CNS_11643_1) {
3099
0
                                    g = 1;
3100
0
                                } else if(cs == CNS_11643_2) {
3101
0
                                    g = 2;
3102
0
                                } else /* plane 3..7 */ if(converterData->version == 1) {
3103
0
                                    g = 3;
3104
0
                                } else {
3105
                                    /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3106
0
                                    len = 0;
3107
0
                                }
3108
0
                            }
3109
0
                        } else {
3110
                            /* GB2312_1 or ISO-IR-165 */
3111
0
                            U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3112
0
                            len2 = MBCS_FROM_UCHAR32_ISO2022(
3113
0
                                        converterData->myConverterArray[cs0],
3114
0
                                        sourceChar,
3115
0
                                        &value,
3116
0
                                        useFallback,
3117
0
                                        MBCS_OUTPUT_2);
3118
0
                            if(len2 == 2 || (len2 == -2 && len == 0)) {
3119
0
                                targetValue = value;
3120
0
                                len = len2;
3121
0
                                cs = cs0;
3122
0
                                g = 1;
3123
0
                                useFallback = false;
3124
0
                            }
3125
0
                        }
3126
0
                    }
3127
0
                }
3128
3129
0
                if(len != 0) {
3130
0
                    len = 0; /* count output bytes; it must have been abs(len) == 2 */
3131
3132
                    /* write the designation sequence if necessary */
3133
0
                    if(cs != pFromU2022State->cs[g]) {
3134
0
                        if(cs < CNS_11643) {
3135
0
                            uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3136
0
                        } else {
3137
0
                            U_ASSERT(cs >= CNS_11643_1);
3138
0
                            uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3139
0
                        }
3140
0
                        len = 4;
3141
0
                        pFromU2022State->cs[g] = cs;
3142
0
                        if(g == 1) {
3143
                            /* changing the SO/G1 charset invalidates the choices[] */
3144
0
                            choiceCount = 0;
3145
0
                        }
3146
0
                    }
3147
3148
                    /* write the shift sequence if necessary */
3149
0
                    if(g != pFromU2022State->g) {
3150
0
                        switch(g) {
3151
0
                        case 1:
3152
0
                            buffer[len++] = UCNV_SO;
3153
3154
                            /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3155
0
                            pFromU2022State->g = 1;
3156
0
                            break;
3157
0
                        case 2:
3158
0
                            buffer[len++] = 0x1b;
3159
0
                            buffer[len++] = 0x4e;
3160
0
                            break;
3161
0
                        default: /* case 3 */
3162
0
                            buffer[len++] = 0x1b;
3163
0
                            buffer[len++] = 0x4f;
3164
0
                            break;
3165
0
                        }
3166
0
                    }
3167
3168
                    /* write the two output bytes */
3169
0
                    buffer[len++] = static_cast<char>(targetValue >> 8);
3170
0
                    buffer[len++] = static_cast<char>(targetValue);
3171
0
                } else {
3172
                    /* if we cannot find the character after checking all codepages
3173
                     * then this is an error
3174
                     */
3175
0
                    *err = U_INVALID_CHAR_FOUND;
3176
0
                    cnv->fromUChar32=sourceChar;
3177
0
                    break;
3178
0
                }
3179
0
            }
3180
3181
            /* output len>0 bytes in buffer[] */
3182
0
            if(len == 1) {
3183
0
                *target++ = buffer[0];
3184
0
                if(offsets) {
3185
0
                    *offsets++ = static_cast<int32_t>(source - args->source - 1); /* -1: known to be ASCII */
3186
0
                }
3187
0
            } else if(len == 2 && (target + 2) <= targetLimit) {
3188
0
                *target++ = buffer[0];
3189
0
                *target++ = buffer[1];
3190
0
                if(offsets) {
3191
0
                    int32_t sourceIndex = static_cast<int32_t>(source - args->source - U16_LENGTH(sourceChar));
3192
0
                    *offsets++ = sourceIndex;
3193
0
                    *offsets++ = sourceIndex;
3194
0
                }
3195
0
            } else {
3196
0
                fromUWriteUInt8(
3197
0
                    cnv,
3198
0
                    buffer, len,
3199
0
                    &target, reinterpret_cast<const char*>(targetLimit),
3200
0
                    &offsets, static_cast<int32_t>(source - args->source - U16_LENGTH(sourceChar)),
3201
0
                    err);
3202
0
                if(U_FAILURE(*err)) {
3203
0
                    break;
3204
0
                }
3205
0
            }
3206
0
        } /* end if(myTargetIndex<myTargetLength) */
3207
0
        else{
3208
0
            *err =U_BUFFER_OVERFLOW_ERROR;
3209
0
            break;
3210
0
        }
3211
3212
0
    }/* end while(mySourceIndex<mySourceLength) */
3213
3214
    /*
3215
     * the end of the input stream and detection of truncated input
3216
     * are handled by the framework, but for ISO-2022-CN conversion
3217
     * we need to be in ASCII mode at the very end
3218
     *
3219
     * conditions:
3220
     *   successful
3221
     *   not in ASCII mode
3222
     *   end of input and no truncated input
3223
     */
3224
0
    if( U_SUCCESS(*err) &&
3225
0
        pFromU2022State->g!=0 &&
3226
0
        args->flush && source>=sourceLimit && cnv->fromUChar32==0
3227
0
    ) {
3228
0
        int32_t sourceIndex;
3229
3230
        /* we are switching to ASCII */
3231
0
        pFromU2022State->g=0;
3232
3233
        /* get the source index of the last input character */
3234
        /*
3235
         * TODO this would be simpler and more reliable if we used a pair
3236
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3237
         * so that we could simply use the prevSourceIndex here;
3238
         * this code gives an incorrect result for the rare case of an unmatched
3239
         * trail surrogate that is alone in the last buffer of the text stream
3240
         */
3241
0
        sourceIndex = static_cast<int32_t>(source - args->source);
3242
0
        if(sourceIndex>0) {
3243
0
            --sourceIndex;
3244
0
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3245
0
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3246
0
            ) {
3247
0
                --sourceIndex;
3248
0
            }
3249
0
        } else {
3250
0
            sourceIndex=-1;
3251
0
        }
3252
3253
0
        fromUWriteUInt8(
3254
0
            cnv,
3255
0
            SHIFT_IN_STR, 1,
3256
0
            &target, reinterpret_cast<const char*>(targetLimit),
3257
0
            &offsets, sourceIndex,
3258
0
            err);
3259
0
    }
3260
3261
    /*save the state and return */
3262
0
    args->source = source;
3263
0
    args->target = reinterpret_cast<char*>(target);
3264
0
}
3265
3266
3267
/*
 * Converts ISO-2022-CN bytes from args->source into UTF-16 code units at
 * args->target, and records a source offset per output unit when
 * args->offsets is non-null.
 *
 * State that survives between calls is kept in the converter:
 *   - myData->key != 0: an escape sequence was left unfinished; processing
 *     resumes at the "escape" label.
 *   - toULength == 1 (with input and output space available): the lead byte of
 *     a double-byte character is stored in toUBytes[0]; processing resumes at
 *     the "getTrailByte" label.
 *
 * On every exit path args->source and args->target are updated to the current
 * read/write positions. *err is set to U_BUFFER_OVERFLOW_ERROR when the target
 * is full while input remains, and to U_ILLEGAL_ESCAPE_SEQUENCE when an SO
 * segment contains no character before SI or before a completed escape
 * sequence; changeState_2022() and toUnicodeCallback() also receive err.
 */
static void U_CALLCONV
3268
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3269
297k
                                               UErrorCode* err){
3270
297k
    char tempBuf[3];
3271
297k
    const char* mySource = const_cast<char*>(args->source);
3272
297k
    char16_t *myTarget = args->target;
3273
297k
    const char *mySourceLimit = args->sourceLimit;
3274
297k
    uint32_t targetUniChar = 0x0000;
3275
297k
    uint32_t mySourceChar = 0x0000;
3276
297k
    UConverterDataISO2022* myData;
3277
297k
    ISO2022State *pToU2022State;
3278
3279
297k
    myData = static_cast<UConverterDataISO2022*>(args->converter->extraInfo);
3280
297k
    pToU2022State = &myData->toU2022State;
3281
3282
297k
    if(myData->key != 0) {
3283
        /* continue with a partial escape sequence */
3284
284
        goto escape;
3285
297k
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3286
        /* continue with a partial double-byte character */
3287
0
        mySourceChar = args->converter->toUBytes[0];
3288
0
        args->converter->toULength = 0;
3289
0
        targetUniChar = missingCharMarker;
3290
0
        goto getTrailByte;
3291
0
    }
3292
3293
778k
    while(mySource < mySourceLimit){
3294
3295
774k
        targetUniChar =missingCharMarker;
3296
3297
774k
        if(myTarget < args->targetLimit){
3298
3299
774k
            mySourceChar = static_cast<unsigned char>(*mySource++);
3300
3301
774k
            switch(mySourceChar){
3302
5.37k
            case UCNV_SI:
3303
5.37k
                pToU2022State->g=0;
3304
5.37k
                if (myData->isEmptySegment) {
3305
222
                    myData->isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
3306
222
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3307
222
                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
3308
222
                    args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
3309
222
                    args->converter->toULength = 1;
3310
222
                    args->target = myTarget;
3311
222
                    args->source = mySource;
3312
222
                    return;
3313
222
                }
3314
5.15k
                continue;
3315
3316
11.5k
            case UCNV_SO:
3317
11.5k
                if(pToU2022State->cs[1] != 0) {
3318
5.18k
                    pToU2022State->g=1;
3319
5.18k
                    myData->isEmptySegment = true;  /* Begin a new segment, empty so far */
3320
5.18k
                    continue;
3321
6.37k
                } else {
3322
                    /* illegal to have SO before a matching designator */
3323
6.37k
                    myData->isEmptySegment = false; /* Handling a different error, reset this to avoid future spurious errs */
3324
                    /* targetUniChar is still missingCharMarker, so the code after the switch hands the SO byte to toUnicodeCallback() */
6.37k
                    break;
3325
6.37k
                }
3326
3327
17.2k
            case ESC_2022:
3328
17.2k
                mySource--;
3329
17.5k
escape:
3330
17.5k
                {
3331
17.5k
                    const char * mySourceBefore = mySource;
3332
17.5k
                    int8_t toULengthBefore = args->converter->toULength;
3333
3334
17.5k
                    changeState_2022(args->converter,&(mySource),
3335
17.5k
                        mySourceLimit, ISO_2022_CN,err);
3336
3337
                    /* After SO there must be at least one character before a designator (designator error handled separately) */
3338
17.5k
                    if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3339
1.20k
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3340
1.20k
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
3341
                        /* error length = bytes pending before the call plus the bytes changeState_2022() advanced over */
1.20k
                        args->converter->toULength = static_cast<int8_t>(toULengthBefore + (mySource - mySourceBefore));
3342
1.20k
                    }
3343
17.5k
                }
3344
3345
                /* invalid or illegal escape sequence */
3346
17.5k
                if(U_FAILURE(*err)){
3347
14.2k
                    args->target = myTarget;
3348
14.2k
                    args->source = mySource;
3349
14.2k
                    myData->isEmptySegment = false; /* Reset to avoid future spurious errors */
3350
14.2k
                    return;
3351
14.2k
                }
3352
3.28k
                continue;
3353
3354
            /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3355
3356
4.70k
            case CR:
3357
12.0k
            case LF:
3358
                /* a line end clears the whole to-Unicode designation/shift state before the byte itself is converted below */
12.0k
                uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3359
12.0k
                U_FALLTHROUGH;
3360
740k
            default:
3361
                /* convert one or two bytes */
3362
740k
                myData->isEmptySegment = false;
3363
740k
                if(pToU2022State->g != 0) {
3364
74.3k
                    if(mySource < mySourceLimit) {
3365
74.0k
                        UConverterSharedData *cnv;
3366
74.0k
                        StateEnum tempState;
3367
74.0k
                        int32_t tempBufLen;
3368
74.0k
                        int leadIsOk, trailIsOk;
3369
74.0k
                        uint8_t trailByte;
3370
74.0k
getTrailByte:
3371
74.0k
                        trailByte = static_cast<uint8_t>(*mySource);
3372
                        /*
3373
                         * Ticket 5691: consistent illegal sequences:
3374
                         * - We include at least the first byte in the illegal sequence.
3375
                         * - If any of the non-initial bytes could be the start of a character,
3376
                         *   we stop the illegal sequence before the first one of those.
3377
                         *
3378
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3379
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3380
                         * Otherwise we convert or report the pair of bytes.
3381
                         */
3382
74.0k
                        leadIsOk = static_cast<uint8_t>(mySourceChar - 0x21) <= (0x7e - 0x21);
3383
74.0k
                        trailIsOk = static_cast<uint8_t>(trailByte - 0x21) <= (0x7e - 0x21);
3384
74.0k
                        if (leadIsOk && trailIsOk) {
3385
22.9k
                            ++mySource;
3386
22.9k
                            tempState = static_cast<StateEnum>(pToU2022State->cs[pToU2022State->g]);
3387
22.9k
                            if(tempState >= CNS_11643_0) {
3388
                                /* CNS planes share one table: a prefix byte derived from the state selects the plane, giving a 3-byte lookup key */
5.30k
                                cnv = myData->myConverterArray[CNS_11643];
3389
5.30k
                                tempBuf[0] = static_cast<char>(0x80 + (tempState - CNS_11643_0));
3390
5.30k
                                tempBuf[1] = static_cast<char>(mySourceChar);
3391
5.30k
                                tempBuf[2] = static_cast<char>(trailByte);
3392
5.30k
                                tempBufLen = 3;
3393
3394
17.6k
                            }else{
3395
17.6k
                                U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
3396
17.6k
                                cnv = myData->myConverterArray[tempState];
3397
17.6k
                                tempBuf[0] = static_cast<char>(mySourceChar);
3398
17.6k
                                tempBuf[1] = static_cast<char>(trailByte);
3399
17.6k
                                tempBufLen = 2;
3400
17.6k
                            }
3401
22.9k
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, false);
3402
22.9k
                            mySourceChar = (mySourceChar << 8) | trailByte;
3403
51.0k
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3404
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3405
45.3k
                            ++mySource;
3406
                            /* add another bit so that the code below writes 2 bytes in case of error */
3407
45.3k
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3408
45.3k
                        }
3409
74.0k
                        if(pToU2022State->g>=2) {
3410
                            /* return from a single-shift state to the previous one */
3411
320
                            pToU2022State->g=pToU2022State->prevG;
3412
320
                        }
3413
74.0k
                    } else {
3414
                        /* the lead byte is the last byte of this buffer: stash it so a later call can resume at getTrailByte */
294
                        args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
3415
294
                        args->converter->toULength = 1;
3416
294
                        goto endloop;
3417
294
                    }
3418
74.3k
                }
3419
665k
                else{
3420
665k
                    if(mySourceChar <= 0x7f) {
3421
448k
                        targetUniChar = static_cast<char16_t>(mySourceChar);
3422
448k
                    }
3423
665k
                }
3424
739k
                break;
3425
774k
            }
3426
            /* result below the marker range: store a single UTF-16 code unit */
746k
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3427
467k
                if(args->offsets){
3428
0
                    args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3429
0
                }
3430
467k
                *(myTarget++) = static_cast<char16_t>(targetUniChar);
3431
467k
            }
3432
278k
            else if(targetUniChar > missingCharMarker){
3433
                /* disassemble the surrogate pair and write to output*/
3434
0
                targetUniChar-=0x0010000;
3435
0
                *myTarget = static_cast<char16_t>(0xd800 + static_cast<char16_t>(targetUniChar >> 10));
3436
0
                if(args->offsets){
3437
0
                    args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3438
0
                }
3439
0
                ++myTarget;
3440
0
                if(myTarget< args->targetLimit){
3441
0
                    *myTarget = static_cast<char16_t>(0xdc00 + static_cast<char16_t>(targetUniChar & 0x3ff));
3442
0
                    if(args->offsets){
3443
0
                        args->offsets[myTarget - args->target] = static_cast<int32_t>(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3444
0
                    }
3445
0
                    ++myTarget;
3446
0
                }else{
3447
                    /* no room for the trail surrogate: park it in the converter's UCharErrorBuffer */
0
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3448
0
                                    static_cast<char16_t>(0xdc00 + static_cast<char16_t>(targetUniChar & 0x3ff));
3449
0
                }
3450
3451
0
            }
3452
278k
            else{
3453
                /* Call the callback function*/
3454
278k
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3455
278k
                break;
3456
278k
            }
3457
746k
        }
3458
620
        else{
3459
620
            *err =U_BUFFER_OVERFLOW_ERROR;
3460
620
            break;
3461
620
        }
3462
774k
    }
3463
283k
endloop:
3464
283k
    args->target = myTarget;
3465
283k
    args->source = mySource;
3466
283k
}
3467
#endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3468
3469
static void U_CALLCONV
3470
0
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3471
0
    UConverter *cnv = args->converter;
3472
0
    UConverterDataISO2022* myConverterData = static_cast<UConverterDataISO2022*>(cnv->extraInfo);
3473
0
    ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3474
0
    char *p, *subchar;
3475
0
    char buffer[8];
3476
0
    int32_t length;
3477
3478
0
    subchar = reinterpret_cast<char*>(cnv->subChars);
3479
0
    length=cnv->subCharLen; /* assume length==1 for most variants */
3480
3481
0
    p = buffer;
3482
0
    switch(myConverterData->locale[0]){
3483
0
    case 'j':
3484
0
        {
3485
0
            int8_t cs;
3486
3487
0
            if(pFromU2022State->g == 1) {
3488
                /* JIS7: switch from G1 to G0 */
3489
0
                pFromU2022State->g = 0;
3490
0
                *p++ = UCNV_SI;
3491
0
            }
3492
3493
0
            cs = pFromU2022State->cs[0];
3494
0
            if(cs != ASCII && cs != JISX201) {
3495
                /* not in ASCII or JIS X 0201: switch to ASCII */
3496
0
                pFromU2022State->cs[0] = static_cast<int8_t>(ASCII);
3497
0
                *p++ = '\x1b';
3498
0
                *p++ = '\x28';
3499
0
                *p++ = '\x42';
3500
0
            }
3501
3502
0
            *p++ = subchar[0];
3503
0
            break;
3504
0
        }
3505
0
    case 'c':
3506
0
        if(pFromU2022State->g != 0) {
3507
            /* not in ASCII mode: switch to ASCII */
3508
0
            pFromU2022State->g = 0;
3509
0
            *p++ = UCNV_SI;
3510
0
        }
3511
0
        *p++ = subchar[0];
3512
0
        break;
3513
0
    case 'k':
3514
0
        if(myConverterData->version == 0) {
3515
0
            if(length == 1) {
3516
0
                if(args->converter->fromUnicodeStatus) {
3517
                    /* in DBCS mode: switch to SBCS */
3518
0
                    args->converter->fromUnicodeStatus = 0;
3519
0
                    *p++ = UCNV_SI;
3520
0
                }
3521
0
                *p++ = subchar[0];
3522
0
            } else /* length == 2*/ {
3523
0
                if(!args->converter->fromUnicodeStatus) {
3524
                    /* in SBCS mode: switch to DBCS */
3525
0
                    args->converter->fromUnicodeStatus = 1;
3526
0
                    *p++ = UCNV_SO;
3527
0
                }
3528
0
                *p++ = subchar[0];
3529
0
                *p++ = subchar[1];
3530
0
            }
3531
0
            break;
3532
0
        } else {
3533
            /* save the subconverter's substitution string */
3534
0
            uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3535
0
            int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3536
3537
            /* set our substitution string into the subconverter */
3538
0
            myConverterData->currentConverter->subChars = reinterpret_cast<uint8_t*>(subchar);
3539
0
            myConverterData->currentConverter->subCharLen = static_cast<int8_t>(length);
3540
3541
            /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3542
0
            args->converter = myConverterData->currentConverter;
3543
0
            myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3544
0
            ucnv_cbFromUWriteSub(args, 0, err);
3545
0
            cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3546
0
            args->converter = cnv;
3547
3548
            /* restore the subconverter's substitution string */
3549
0
            myConverterData->currentConverter->subChars = currentSubChars;
3550
0
            myConverterData->currentConverter->subCharLen = currentSubCharLen;
3551
3552
0
            if(*err == U_BUFFER_OVERFLOW_ERROR) {
3553
0
                if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3554
0
                    uprv_memcpy(
3555
0
                        cnv->charErrorBuffer,
3556
0
                        myConverterData->currentConverter->charErrorBuffer,
3557
0
                        myConverterData->currentConverter->charErrorBufferLength);
3558
0
                }
3559
0
                cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3560
0
                myConverterData->currentConverter->charErrorBufferLength = 0;
3561
0
            }
3562
0
            return;
3563
0
        }
3564
0
    default:
3565
        /* not expected */
3566
0
        break;
3567
0
    }
3568
0
    ucnv_cbFromUWriteBytes(args,
3569
0
                           buffer, static_cast<int32_t>(p - buffer),
3570
0
                           offsetIndex, err);
3571
0
}
3572
3573
/*
3574
 * Structure for cloning an ISO 2022 converter into a single memory block.
3575
 */
3576
struct cloneStruct
3577
{
3578
    /* the cloned ISO-2022 converter itself; _ISO_2022_SafeClone() returns the address of this member */
    UConverter cnv;
3579
    /* storage handed to ucnv_safeClone() for the clone of the original's currentConverter */
    UConverter currentConverter;
3580
    /* copy of the original's UConverterDataISO2022; cnv.extraInfo is pointed at it */
    UConverterDataISO2022 mydata;
3581
};
3582
3583
3584
U_CDECL_BEGIN
3585
3586
/*
 * Completes the clone of an ISO-2022 converter inside a caller-provided block
 * laid out as a cloneStruct.
 *
 * If *pBufferSize is 0 the call is a preflight: the required size is stored
 * in *pBufferSize and nullptr is returned. Otherwise the converter's extra
 * data is copied into the block, the current subconverter (if any) is cloned
 * into the block as well, and the reference count of every shared
 * subconverter in myConverterArray is incremented.
 *
 * Returns the cloned converter, or nullptr on preflight, on an incoming
 * failure status, or when cloning the current subconverter fails.
 */
static UConverter * U_CALLCONV
_ISO_2022_SafeClone(
            const UConverter *cnv,
            void *stackBuffer,
            int32_t *pBufferSize,
            UErrorCode *status)
{
    if (U_FAILURE(*status)) {
        return nullptr;
    }

    /* 'preflighting' request - report the needed size and do nothing else */
    if (*pBufferSize == 0) {
        *pBufferSize = static_cast<int32_t>(sizeof(struct cloneStruct));
        return nullptr;
    }

    UConverterDataISO2022 *sourceData = static_cast<UConverterDataISO2022 *>(cnv->extraInfo);
    struct cloneStruct *clone = static_cast<struct cloneStruct *>(stackBuffer);

    /* ucnv.c/ucnv_safeClone() copied the main UConverter already */

    /* duplicate the extra data and make the clone own its embedded copy */
    uprv_memcpy(&clone->mydata, sourceData, sizeof(UConverterDataISO2022));
    clone->cnv.extraInfo = &clone->mydata;
    clone->cnv.isExtraLocal = true;

    /* the current subconverter gets its own copy inside the same block */
    if (sourceData->currentConverter != nullptr) {
        int32_t subCloneSize = static_cast<int32_t>(sizeof(UConverter));
        clone->mydata.currentConverter =
            ucnv_safeClone(sourceData->currentConverter,
                            &clone->currentConverter,
                            &subCloneSize, status);
        if (U_FAILURE(*status)) {
            return nullptr;
        }
    }

    /* the table-based subconverters are shared: just take another reference */
    for (int32_t slot = 0; slot < UCNV_2022_MAX_CONVERTERS; ++slot) {
        if (sourceData->myConverterArray[slot] == nullptr) {
            continue;
        }
        ucnv_incrementRefCount(sourceData->myConverterArray[slot]);
    }

    return &clone->cnv;
}
3636
3637
U_CDECL_END
3638
3639
static void U_CALLCONV
3640
_ISO_2022_GetUnicodeSet(const UConverter *cnv,
3641
                    const USetAdder *sa,
3642
                    UConverterUnicodeSet which,
3643
                    UErrorCode *pErrorCode)
3644
0
{
3645
0
    int32_t i;
3646
0
    UConverterDataISO2022* cnvData;
3647
3648
0
    if (U_FAILURE(*pErrorCode)) {
3649
0
        return;
3650
0
    }
3651
#ifdef U_ENABLE_GENERIC_ISO_2022
3652
    if (cnv->sharedData == &_ISO2022Data) {
3653
        /* We use UTF-8 in this case */
3654
        sa->addRange(sa->set, 0, 0xd7FF);
3655
        sa->addRange(sa->set, 0xE000, 0x10FFFF);
3656
        return;
3657
    }
3658
#endif
3659
3660
0
    cnvData = static_cast<UConverterDataISO2022*>(cnv->extraInfo);
3661
3662
    /* open a set and initialize it with code points that are algorithmically round-tripped */
3663
0
    switch(cnvData->locale[0]){
3664
0
    case 'j':
3665
        /* include JIS X 0201 which is hardcoded */
3666
0
        sa->add(sa->set, 0xa5);
3667
0
        sa->add(sa->set, 0x203e);
3668
0
        if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3669
            /* include Latin-1 for some variants of JP */
3670
0
            sa->addRange(sa->set, 0, 0xff);
3671
0
        } else {
3672
            /* include ASCII for JP */
3673
0
            sa->addRange(sa->set, 0, 0x7f);
3674
0
        }
3675
0
        if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3676
            /*
3677
             * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3678
             * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3679
             * use half-width Katakana.
3680
             * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3681
             * half-width Katakana via the ESC ( I sequence.
3682
             * However, we only emit (fromUnicode) half-width Katakana according to the
3683
             * definition of each variant.
3684
             *
3685
             * When including fallbacks,
3686
             * we need to include half-width Katakana Unicode code points for all JP variants because
3687
             * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3688
             */
3689
            /* include half-width Katakana for JP */
3690
0
            sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3691
0
        }
3692
0
        break;
3693
0
#if !UCONFIG_ONLY_HTML_CONVERSION
3694
0
    case 'c':
3695
0
    case 'z':
3696
        /* include ASCII for CN */
3697
0
        sa->addRange(sa->set, 0, 0x7f);
3698
0
        break;
3699
0
    case 'k':
3700
        /* there is only one converter for KR, and it is not in the myConverterArray[] */
3701
0
        cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3702
0
                cnvData->currentConverter, sa, which, pErrorCode);
3703
        /* the loop over myConverterArray[] will simply not find another converter */
3704
0
        break;
3705
0
#endif
3706
0
    default:
3707
0
        break;
3708
0
    }
3709
3710
#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3711
            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3712
                cnvData->version==0 && i==CNS_11643
3713
            ) {
3714
                /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3715
                ucnv_MBCSGetUnicodeSetForBytes(
3716
                        cnvData->myConverterArray[i],
3717
                        sa, UCNV_ROUNDTRIP_SET,
3718
                        0, 0x81, 0x82,
3719
                        pErrorCode);
3720
            }
3721
#endif
3722
3723
0
    for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3724
0
        UConverterSetFilter filter;
3725
0
        if(cnvData->myConverterArray[i]!=nullptr) {
3726
0
            if(cnvData->locale[0]=='j' && i==JISX208) {
3727
                /*
3728
                 * Only add code points that map to Shift-JIS codes
3729
                 * corresponding to JIS X 0208.
3730
                 */
3731
0
                filter=UCNV_SET_FILTER_SJIS;
3732
0
#if !UCONFIG_ONLY_HTML_CONVERSION
3733
0
            } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3734
0
                       cnvData->version==0 && i==CNS_11643) {
3735
                /*
3736
                 * Version-specific for CN:
3737
                 * CN version 0 does not map CNS planes 3..7 although
3738
                 * they are all available in the CNS conversion table;
3739
                 * CN version 1 (-EXT) does map them all.
3740
                 * The two versions create different Unicode sets.
3741
                 */
3742
0
                filter=UCNV_SET_FILTER_2022_CN;
3743
0
            } else if(i==KSC5601) {
3744
                /*
3745
                 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3746
                 * are broader than GR94.
3747
                 */
3748
0
                filter=UCNV_SET_FILTER_GR94DBCS;
3749
0
#endif
3750
0
            } else {
3751
0
                filter=UCNV_SET_FILTER_NONE;
3752
0
            }
3753
0
            ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3754
0
        }
3755
0
    }
3756
3757
    /*
3758
     * ISO 2022 converters must not convert SO/SI/ESC despite what
3759
     * sub-converters do by themselves.
3760
     * Remove these characters from the set.
3761
     */
3762
0
    sa->remove(sa->set, 0x0e);
3763
0
    sa->remove(sa->set, 0x0f);
3764
0
    sa->remove(sa->set, 0x1b);
3765
3766
    /* ISO 2022 converters do not convert C1 controls either */
3767
0
    sa->removeRange(sa->set, 0x80, 0x9f);
3768
0
}
3769
3770
/*
 * Function table for the generic ISO-2022 converter (type UCNV_ISO_2022).
 * The four conversion entries are only populated when
 * U_ENABLE_GENERIC_ISO_2022 is defined; otherwise they are nullptr.
 * NOTE(review): the initializer is positional; the meaning of each slot is
 * inferred from the names of the functions installed here - confirm against
 * the UConverterImpl declaration.
 */
static const UConverterImpl _ISO2022Impl={
3771
    UCNV_ISO_2022,
3772
3773
    nullptr,
3774
    nullptr,
3775
3776
    _ISO2022Open,
3777
    _ISO2022Close,
3778
    _ISO2022Reset,
3779
3780
#ifdef U_ENABLE_GENERIC_ISO_2022
3781
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3782
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3783
    ucnv_fromUnicode_UTF8,
3784
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3785
#else
3786
    nullptr,
3787
    nullptr,
3788
    nullptr,
3789
    nullptr,
3790
#endif
3791
    nullptr,
3792
3793
    nullptr,
3794
    _ISO2022getName,
3795
    _ISO_2022_WriteSub,
3796
    _ISO_2022_SafeClone,
3797
    _ISO_2022_GetUnicodeSet,
3798
3799
    nullptr,
3800
    nullptr
3801
};
3802
/*
 * Static description of the generic converter, named "ISO_2022".
 * NOTE(review): { 0x1a, 0, 0, 0 } is presumably the default substitution
 * byte sequence and the following 1 its length - confirm against
 * UConverterStaticData.
 */
static const UConverterStaticData _ISO2022StaticData={
3803
    sizeof(UConverterStaticData),
3804
    "ISO_2022",
3805
    2022,
3806
    UCNV_IBM,
3807
    UCNV_ISO_2022,
3808
    1,
3809
    3, /* max 3 bytes per char16_t from UTF-8 (4 bytes from surrogate _pair_) */
3810
    { 0x1a, 0, 0, 0 },
3811
    1,
3812
    false,
3813
    false,
3814
    0,
3815
    0,
3816
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3817
};
3818
/* Immutable shared-data object tying the static data above to its function table. */
const UConverterSharedData _ISO2022Data=
3819
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl);
3820
3821
/*************JP****************/
3822
/*
 * Function table for ISO-2022-JP. The same *_OFFSETS_LOGIC function fills
 * both adjacent to-Unicode entries, and likewise for the from-Unicode pair;
 * open/close/reset, getName, writeSub, safeClone and getUnicodeSet are the
 * functions shared by all ISO-2022 variants in this file.
 * NOTE(review): the two entries of each pair are presumably the plain and
 * the with-offsets slots - confirm against the UConverterImpl declaration.
 */
static const UConverterImpl _ISO2022JPImpl={
3823
    UCNV_ISO_2022,
3824
3825
    nullptr,
3826
    nullptr,
3827
3828
    _ISO2022Open,
3829
    _ISO2022Close,
3830
    _ISO2022Reset,
3831
3832
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3833
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3834
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3835
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3836
    nullptr,
3837
3838
    nullptr,
3839
    _ISO2022getName,
3840
    _ISO_2022_WriteSub,
3841
    _ISO_2022_SafeClone,
3842
    _ISO_2022_GetUnicodeSet,
3843
3844
    nullptr,
3845
    nullptr
3846
};
3847
/* Static description of the JP variant, named "ISO_2022_JP"; at most 6 bytes are produced per char16_t. */
static const UConverterStaticData _ISO2022JPStaticData={
3848
    sizeof(UConverterStaticData),
3849
    "ISO_2022_JP",
3850
    0,
3851
    UCNV_IBM,
3852
    UCNV_ISO_2022,
3853
    1,
3854
    6, /* max 6 bytes per char16_t: 4-byte escape sequence + DBCS */
3855
    { 0x1a, 0, 0, 0 },
3856
    1,
3857
    false,
3858
    false,
3859
    0,
3860
    0,
3861
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3862
};
3863
3864
namespace {
3865
3866
/* Immutable shared-data object for the JP variant; the unnamed namespace keeps it local to this file. */
const UConverterSharedData _ISO2022JPData=
3867
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl);
3868
3869
}  // namespace
3870
3871
#if !UCONFIG_ONLY_HTML_CONVERSION
3872
/************* KR ***************/
3873
/*
 * Function table for ISO-2022-KR. The same *_OFFSETS_LOGIC function fills
 * both adjacent to-Unicode entries, and likewise for the from-Unicode pair;
 * the remaining non-null entries are the functions shared by all ISO-2022
 * variants in this file.
 * NOTE(review): the two entries of each pair are presumably the plain and
 * the with-offsets slots - confirm against the UConverterImpl declaration.
 */
static const UConverterImpl _ISO2022KRImpl={
3874
    UCNV_ISO_2022,
3875
3876
    nullptr,
3877
    nullptr,
3878
3879
    _ISO2022Open,
3880
    _ISO2022Close,
3881
    _ISO2022Reset,
3882
3883
    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3884
    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3885
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3886
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3887
    nullptr,
3888
3889
    nullptr,
3890
    _ISO2022getName,
3891
    _ISO_2022_WriteSub,
3892
    _ISO_2022_SafeClone,
3893
    _ISO_2022_GetUnicodeSet,
3894
3895
    nullptr,
3896
    nullptr
3897
};
3898
/* Static description of the KR variant, named "ISO_2022_KR"; at most 8 bytes are produced per char16_t. */
static const UConverterStaticData _ISO2022KRStaticData={
3899
    sizeof(UConverterStaticData),
3900
    "ISO_2022_KR",
3901
    0,
3902
    UCNV_IBM,
3903
    UCNV_ISO_2022,
3904
    1,
3905
    8, /* max 8 bytes per char16_t */
3906
    { 0x1a, 0, 0, 0 },
3907
    1,
3908
    false,
3909
    false,
3910
    0,
3911
    0,
3912
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3913
};
3914
3915
namespace {
3916
3917
/* Immutable shared-data object for the KR variant; the unnamed namespace keeps it local to this file. */
const UConverterSharedData _ISO2022KRData=
3918
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl);
3919
3920
}  // namespace
3921
3922
/*************** CN ***************/
3923
/*
 * Function table for ISO-2022-CN. The same *_OFFSETS_LOGIC function fills
 * both adjacent to-Unicode entries, and likewise for the from-Unicode pair;
 * the remaining non-null entries are the functions shared by all ISO-2022
 * variants in this file.
 * NOTE(review): the two entries of each pair are presumably the plain and
 * the with-offsets slots - confirm against the UConverterImpl declaration.
 */
static const UConverterImpl _ISO2022CNImpl={
3924
3925
    UCNV_ISO_2022,
3926
3927
    nullptr,
3928
    nullptr,
3929
3930
    _ISO2022Open,
3931
    _ISO2022Close,
3932
    _ISO2022Reset,
3933
3934
    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3935
    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3936
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3937
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3938
    nullptr,
3939
3940
    nullptr,
3941
    _ISO2022getName,
3942
    _ISO_2022_WriteSub,
3943
    _ISO_2022_SafeClone,
3944
    _ISO_2022_GetUnicodeSet,
3945
3946
    nullptr,
3947
    nullptr
3948
};
3949
/* Static description of the CN variant, named "ISO_2022_CN"; at most 8 bytes are produced per char16_t. */
static const UConverterStaticData _ISO2022CNStaticData={
3950
    sizeof(UConverterStaticData),
3951
    "ISO_2022_CN",
3952
    0,
3953
    UCNV_IBM,
3954
    UCNV_ISO_2022,
3955
    1,
3956
    8, /* max 8 bytes per char16_t: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3957
    { 0x1a, 0, 0, 0 },
3958
    1,
3959
    false,
3960
    false,
3961
    0,
3962
    0,
3963
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3964
};
3965
3966
namespace {
3967
3968
/* Immutable shared-data object for the CN variant; the unnamed namespace keeps it local to this file. */
const UConverterSharedData _ISO2022CNData=
3969
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl);
3970
3971
}  // namespace
3972
#endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3973
3974
#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */