Coverage Report

Created: 2023-06-07 07:18

/src/icu/source/common/ucnv2022.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
**********************************************************************
5
*   Copyright (C) 2000-2016, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
**********************************************************************
8
*   file name:  ucnv2022.cpp
9
*   encoding:   UTF-8
10
*   tab size:   8 (not used)
11
*   indentation:4
12
*
13
*   created on: 2000feb03
14
*   created by: Markus W. Scherer
15
*
16
*   Change history:
17
*
18
*   06/29/2000  helena  Major rewrite of the callback APIs.
19
*   08/08/2000  Ram     Included support for ISO-2022-JP-2
20
*                       Changed implementation of toUnicode
21
*                       function
22
*   08/21/2000  Ram     Added support for ISO-2022-KR
23
*   08/29/2000  Ram     Separated implementation of EBCDIC to
24
*                       ucnvebdc.c
25
*   09/20/2000  Ram     Added support for ISO-2022-CN
26
*                       Added implementations for getNextUChar()
27
*                       for specific 2022 country variants.
28
*   10/31/2000  Ram     Implemented offsets logic functions
29
*/
30
31
#include "unicode/utypes.h"
32
33
#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
34
35
#include "unicode/ucnv.h"
36
#include "unicode/uset.h"
37
#include "unicode/ucnv_err.h"
38
#include "unicode/ucnv_cb.h"
39
#include "unicode/utf16.h"
40
#include "ucnv_imp.h"
41
#include "ucnv_bld.h"
42
#include "ucnv_cnv.h"
43
#include "ucnvmbcs.h"
44
#include "cstring.h"
45
#include "cmemory.h"
46
#include "uassert.h"
47
48
#ifdef U_ENABLE_GENERIC_ISO_2022
49
/*
50
 * I am disabling the generic ISO-2022 converter after proposing to do so on
51
 * the icu mailing list two days ago.
52
 *
53
 * Reasons:
54
 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
55
 *    its designation sequences, single shifts with return to the previous state,
56
 *    switch-with-no-return to UTF-16BE or similar, etc.
57
 *    This is unlike the language-specific variants like ISO-2022-JP which
58
 *    require a much smaller repertoire of ISO-2022 features.
59
 *    These variants continue to be supported.
60
 * 2. I believe that no one is really using the generic ISO-2022 converter
61
 *    but rather always one of the language-specific variants.
62
 *    Note that ICU's generic ISO-2022 converter has always output one escape
63
 *    sequence followed by UTF-8 for the whole stream.
64
 * 3. Switching between subcharsets is extremely slow, because each time
65
 *    the previous converter is closed and a new one opened,
66
 *    without any kind of caching, least-recently-used list, etc.
67
 * 4. The code is currently buggy, and given the above it does not seem
68
 *    reasonable to spend the time on maintenance.
69
 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
70
 *    This means, for example, that when ISO-8859-7 is designated, the following
71
 *    ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
72
 *    The ICU ISO-2022 converter does not handle this - and has no information
73
 *    about which subconverter would have to be shifted vs. which is designed
74
 *    for 7-bit ISO-2022.
75
 *
76
 * Markus Scherer 2003-dec-03
77
 */
78
#endif
79
80
#if !UCONFIG_ONLY_HTML_CONVERSION
81
static const char SHIFT_IN_STR[]  = "\x0F";
82
// static const char SHIFT_OUT_STR[] = "\x0E";
83
#endif
84
85
0
#define CR      0x0D
86
0
#define LF      0x0A
87
#define H_TAB   0x09
88
#define V_TAB   0x0B
89
#define SPACE   0x20
90
91
enum {
92
    HWKANA_START=0xff61,
93
    HWKANA_END=0xff9f
94
};
95
96
/*
97
 * 94-character sets with native byte values A1..FE are encoded in ISO 2022
98
 * as bytes 21..7E. (Subtract 0x80.)
99
 * 96-character sets with native byte values A0..FF are encoded in ISO 2022
100
 * as bytes 20..7F. (Subtract 0x80.)
101
 * Do not encode C1 control codes with native bytes 80..9F
102
 * as bytes 00..1F (C0 control codes).
103
 */
104
enum {
105
    GR94_START=0xa1,
106
    GR94_END=0xfe,
107
    GR96_START=0xa0,
108
    GR96_END=0xff
109
};
110
111
/*
112
 * ISO 2022 control codes must not be converted from Unicode
113
 * because they would mess up the byte stream.
114
 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
115
 * corresponding to SO, SI, and ESC.
116
 */
117
0
#define IS_2022_CONTROL(c) (((c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
118
119
/* for ISO-2022-JP and -CN implementations */
120
typedef enum  {
121
        /* shared values */
122
        INVALID_STATE=-1,
123
        ASCII = 0,
124
125
        SS2_STATE=0x10,
126
        SS3_STATE,
127
128
        /* JP */
129
        ISO8859_1 = 1 ,
130
        ISO8859_7 = 2 ,
131
        JISX201  = 3,
132
        JISX208 = 4,
133
        JISX212 = 5,
134
        GB2312  =6,
135
        KSC5601 =7,
136
        HWKANA_7BIT=8,    /* Halfwidth Katakana 7 bit */
137
138
        /* CN */
139
        /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
140
        GB2312_1=1,
141
        ISO_IR_165=2,
142
        CNS_11643=3,
143
144
        /*
145
         * these are used in StateEnum and ISO2022State variables,
146
         * but CNS_11643 must be used to index into myConverterArray[]
147
         */
148
        CNS_11643_0=0x20,
149
        CNS_11643_1,
150
        CNS_11643_2,
151
        CNS_11643_3,
152
        CNS_11643_4,
153
        CNS_11643_5,
154
        CNS_11643_6,
155
        CNS_11643_7
156
} StateEnum;
157
158
/* is the StateEnum charset value for a DBCS charset? */
159
#if UCONFIG_ONLY_HTML_CONVERSION
160
#define IS_JP_DBCS(cs) (JISX208==(cs))
161
#else
162
0
#define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601)
163
#endif
164
165
0
#define CSM(cs) ((uint16_t)1<<(cs))
166
167
/*
168
 * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
169
 * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
170
 *
171
 * Note: The converter uses some leniency:
172
 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
173
 *   all versions, not just JIS7 and JIS8.
174
 * - ICU does not distinguish between different versions of JIS X 0208.
175
 */
176
#if UCONFIG_ONLY_HTML_CONVERSION
177
enum { MAX_JA_VERSION=0 };
178
#else
179
enum { MAX_JA_VERSION=4 };
180
#endif
181
static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
182
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
183
#if !UCONFIG_ONLY_HTML_CONVERSION
184
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
185
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
186
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
187
    CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
188
#endif
189
};
190
191
typedef enum {
192
        ASCII1=0,
193
        LATIN1,
194
        SBCS,
195
        DBCS,
196
        MBCS,
197
        HWKANA
198
}Cnv2022Type;
199
200
/*
 * Designation and shift state for one conversion direction.
 * UConverterDataISO2022 keeps two of these (toU2022State and fromU2022State);
 * _ISO2022Reset() clears them with uprv_memset(..., 0, ...), so the all-zero
 * value is the initial state.
 * NOTE(review): the cs[] entries are presumably StateEnum values - confirm
 * against the code that reads them.
 */
typedef struct ISO2022State {
201
    int8_t cs[4];       /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
202
    int8_t g;           /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
203
    int8_t prevG;       /* g before single shift (SS2 or SS3) */
204
} ISO2022State;
205
206
0
#define UCNV_OPTIONS_VERSION_MASK 0xf
207
0
#define UCNV_2022_MAX_CONVERTERS 10
208
209
/*
 * Per-converter state of an ISO-2022 converter.
 * _ISO2022Open() allocates it, zero-fills it and stores it in
 * UConverter.extraInfo; _ISO2022Close() releases what it references.
 */
typedef struct{
210
    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];   /* sub-charset data loaded by _ISO2022Open(), indexed by StateEnum constants; unused slots stay NULL */
211
    UConverter *currentConverter;   /* opened with ucnv_open() for the ko variant; closed in _ISO2022Close() */
212
    Cnv2022Type currentType;        /* initialised to ASCII1 by _ISO2022Open() */
213
    ISO2022State toU2022State, fromU2022State;  /* one state per direction; cleared by _ISO2022Reset() */
214
    uint32_t key;                   /* cleared on a to-Unicode reset; presumably the escape-sequence key being accumulated - TODO confirm */
215
    uint32_t version;               /* pArgs->options & UCNV_OPTIONS_VERSION_MASK */
216
#ifdef U_ENABLE_GENERIC_ISO_2022
217
    UBool isFirstBuffer;
218
#endif
219
    UBool isEmptySegment;           /* cleared on a to-Unicode reset */
220
    char name[30];                  /* e.g. "ISO_2022,locale=ja,version=0", built in _ISO2022Open() */
221
    char locale[3];                 /* "ja", "ko" or "cn", set in _ISO2022Open() */
222
}UConverterDataISO2022;
223
224
/* Protos */
225
/* ISO-2022 ----------------------------------------------------------------- */
226
227
/*Forward declaration */
228
U_CFUNC void U_CALLCONV
229
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
230
                      UErrorCode * err);
231
U_CFUNC void U_CALLCONV
232
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
233
                                    UErrorCode * err);
234
235
0
#define ESC_2022 0x1B /*ESC*/
236
237
typedef enum
238
{
239
        INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
240
        VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
241
        VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
242
        VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
243
} UCNV_TableStates_2022;
244
245
/*
246
* The way these state transition arrays work is:
247
* ex : ESC$B is the sequence for JISX208
248
*      a) First Iteration: char is ESC
249
*          i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
250
*             int x = normalize_esq_chars_2022[27] which is equal to 1
251
*         ii) Search for this value in escSeqStateTable_Key_2022[]
252
*             value of x is stored at escSeqStateTable_Key_2022[0]
253
*        iii) Save this index as offset
254
*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
255
*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
256
*     b) Switch on this state and continue to next char
257
*          i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
258
*             which is normalize_esq_chars_2022[36] == 4
259
*         ii) x is currently 1(from above)
260
*               x<<=5 -- x is now 32
261
*               x+=normalize_esq_chars_2022[36]
262
*               now x is 36
263
*        iii) Search for this value in escSeqStateTable_Key_2022[]
264
*             value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
265
*         iv) Get state of this sequence from escSeqStateTable_Value_2022[]
266
*             escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
267
*     c) Switch on this state and continue to next char
268
*        i)  Get the value of B from normalize_esq_chars_2022[] with int value of B as index
269
*        ii) x is currently 36 (from above)
270
*            x<<=5 -- x is now 1152
271
*            x+=normalize_esq_chars_2022[66]
272
*            now x is 1161
273
*       iii) Search for this value in escSeqStateTable_Key_2022[]
274
*            value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
275
*        iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
276
*            escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
277
*         v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX-208
278
*/
279
280
281
/*Below are the 3 arrays depicting a state transition table*/
282
static const int8_t normalize_esq_chars_2022[256] = {
283
/*       0      1       2       3       4      5       6        7       8       9           */
284
285
         0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
286
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
287
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,1      ,0      ,0
288
        ,0     ,0      ,0      ,0      ,0      ,0      ,4      ,7      ,29      ,0
289
        ,2     ,24     ,26     ,27     ,0      ,3      ,23     ,6      ,0      ,0
290
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
291
        ,0     ,0      ,0      ,0      ,5      ,8      ,9      ,10     ,11     ,12
292
        ,13    ,14     ,15     ,16     ,17     ,18     ,19     ,20     ,25     ,28
293
        ,0     ,0      ,21     ,0      ,0      ,0      ,0      ,0      ,0      ,0
294
        ,22    ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
295
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
296
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
297
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
298
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
299
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
300
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
301
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
302
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
303
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
304
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
305
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
306
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
307
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
308
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
309
        ,0     ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0      ,0
310
        ,0     ,0      ,0      ,0      ,0      ,0
311
};
312
313
#ifdef U_ENABLE_GENERIC_ISO_2022
314
/*
315
 * When the generic ISO-2022 converter is completely removed, not just disabled
316
 * per #ifdef, then the following state table and the associated tables that are
317
 * dimensioned with MAX_STATES_2022 should be trimmed.
318
 *
319
 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
320
 * the associated escape sequences starting with ESC ( B should be removed.
321
 * This includes the ones with key values 1097 and all of the ones above 1000000.
322
 *
323
 * For the latter, the tables can simply be truncated.
324
 * For the former, since the tables must be kept parallel, it is probably best
325
 * to simply duplicate an adjacent table cell, parallel in all tables.
326
 *
327
 * It may make sense to restructure the tables, especially by using small search
328
 * tables for the variants instead of indexing them parallel to the table here.
329
 */
330
#endif
331
332
0
#define MAX_STATES_2022 74
333
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
334
/*   0           1           2           3           4           5           6           7           8           9           */
335
336
     1          ,34         ,36         ,39         ,55         ,57         ,60         ,61         ,1093       ,1096
337
    ,1097       ,1098       ,1099       ,1100       ,1101       ,1102       ,1103       ,1104       ,1105       ,1106
338
    ,1109       ,1154       ,1157       ,1160       ,1161       ,1176       ,1178       ,1179       ,1254       ,1257
339
    ,1768       ,1773       ,1957       ,35105      ,36933      ,36936      ,36937      ,36938      ,36939      ,36940
340
    ,36942      ,36943      ,36944      ,36945      ,36946      ,36947      ,36948      ,37640      ,37642      ,37644
341
    ,37646      ,37711      ,37744      ,37745      ,37746      ,37747      ,37748      ,40133      ,40136      ,40138
342
    ,40139      ,40140      ,40141      ,1123363    ,35947624   ,35947625   ,35947626   ,35947627   ,35947629   ,35947630
343
    ,35947631   ,35947635   ,35947636   ,35947638
344
};
345
346
#ifdef U_ENABLE_GENERIC_ISO_2022
347
348
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
349
 /*  0                      1                        2                      3                   4                   5                        6                      7                       8                       9    */
350
351
     NULL                   ,NULL                   ,NULL                   ,NULL               ,NULL               ,NULL                   ,NULL                   ,NULL                   ,"latin1"               ,"latin1"
352
    ,"latin1"               ,"ibm-865"              ,"ibm-865"              ,"ibm-865"          ,"ibm-865"          ,"ibm-865"              ,"ibm-865"              ,"JISX0201"             ,"JISX0201"             ,"latin1"
353
    ,"latin1"               ,NULL                   ,"JISX-208"             ,"ibm-5478"         ,"JISX-208"         ,NULL                   ,NULL                   ,NULL                   ,NULL                   ,"UTF8"
354
    ,"ISO-8859-1"           ,"ISO-8859-7"           ,"JIS-X-208"            ,NULL               ,"ibm-955"          ,"ibm-367"              ,"ibm-952"              ,"ibm-949"              ,"JISX-212"             ,"ibm-1383"
355
    ,"ibm-952"              ,"ibm-964"              ,"ibm-964"              ,"ibm-964"          ,"ibm-964"          ,"ibm-964"              ,"ibm-964"              ,"ibm-5478"         ,"ibm-949"              ,"ISO-IR-165"
356
    ,"CNS-11643-1992,1"     ,"CNS-11643-1992,2"     ,"CNS-11643-1992,3"     ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6"     ,"CNS-11643-1992,7"     ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
357
    ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL               ,"latin1"           ,"ibm-912"              ,"ibm-913"              ,"ibm-914"              ,"ibm-813"              ,"ibm-1089"
358
    ,"ibm-920"              ,"ibm-915"              ,"ibm-915"              ,"latin1"
359
};
360
361
#endif
362
363
static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
364
/*          0                           1                         2                             3                           4                           5                               6                        7                          8                           9       */
365
     VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022     ,VALID_NON_TERMINAL_2022   ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
366
    ,VALID_MAYBE_TERMINAL_2022  ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
367
    ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022
368
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
369
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
370
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
371
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_NON_TERMINAL_2022    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
372
    ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022        ,VALID_TERMINAL_2022
373
};
374
375
/* Type def for refactoring changeState_2022 code*/
376
typedef enum{
377
#ifdef U_ENABLE_GENERIC_ISO_2022
378
    ISO_2022=0,
379
#endif
380
    ISO_2022_JP=1,
381
#if !UCONFIG_ONLY_HTML_CONVERSION
382
    ISO_2022_KR=2,
383
    ISO_2022_CN=3
384
#endif
385
} Variant2022;
386
387
/*********** ISO 2022 Converter Protos ***********/
388
static void U_CALLCONV
389
_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
390
391
static void U_CALLCONV
392
 _ISO2022Close(UConverter *converter);
393
394
static void U_CALLCONV
395
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
396
397
U_CDECL_BEGIN
398
static const char * U_CALLCONV
399
_ISO2022getName(const UConverter* cnv);
400
U_CDECL_END
401
402
static void  U_CALLCONV
403
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
404
405
U_CDECL_BEGIN
406
static UConverter * U_CALLCONV
407
_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
408
409
U_CDECL_END
410
411
#ifdef U_ENABLE_GENERIC_ISO_2022
412
static void U_CALLCONV
413
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
414
#endif
415
416
namespace {
417
418
/*const UConverterSharedData _ISO2022Data;*/
419
extern const UConverterSharedData _ISO2022JPData;
420
421
#if !UCONFIG_ONLY_HTML_CONVERSION
422
extern const UConverterSharedData _ISO2022KRData;
423
extern const UConverterSharedData _ISO2022CNData;
424
#endif
425
426
}  // namespace
427
428
/*************** Converter implementations ******************/
429
430
/* The purpose of this function is to get around gcc compiler warnings. */
431
static inline void
432
fromUWriteUInt8(UConverter *cnv,
433
                 const char *bytes, int32_t length,
434
                 uint8_t **target, const char *targetLimit,
435
                 int32_t **offsets,
436
                 int32_t sourceIndex,
437
                 UErrorCode *pErrorCode)
438
0
{
439
0
    char *targetChars = (char *)*target;
440
0
    ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
441
0
                         offsets, sourceIndex, pErrorCode);
442
0
    *target = (uint8_t*)targetChars;
443
444
0
}
445
446
static inline void
447
0
setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
448
0
    if(myConverterData->version == 1) {
449
0
        UConverter *cnv = myConverterData->currentConverter;
450
451
0
        cnv->toUnicodeStatus=0;     /* offset */
452
0
        cnv->mode=0;                /* state */
453
0
        cnv->toULength=0;           /* byteIndex */
454
0
    }
455
0
}
456
457
static inline void
458
0
setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
459
   /* in ISO-2022-KR the designator sequence appears only once
460
    * in a file so we append it only once
461
    */
462
0
    if( converter->charErrorBufferLength==0){
463
464
0
        converter->charErrorBufferLength = 4;
465
0
        converter->charErrorBuffer[0] = 0x1b;
466
0
        converter->charErrorBuffer[1] = 0x24;
467
0
        converter->charErrorBuffer[2] = 0x29;
468
0
        converter->charErrorBuffer[3] = 0x43;
469
0
    }
470
0
    if(myConverterData->version == 1) {
471
0
        UConverter *cnv = myConverterData->currentConverter;
472
473
0
        cnv->fromUChar32=0;
474
0
        cnv->fromUnicodeStatus=1;   /* prevLength */
475
0
    }
476
0
}
477
478
static void U_CALLCONV
479
0
_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
480
481
0
    char myLocale[6]={' ',' ',' ',' ',' ',' '};
482
483
0
    cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
484
0
    if(cnv->extraInfo != NULL) {
485
0
        UConverterNamePieces stackPieces;
486
0
        UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
487
0
        UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
488
0
        uint32_t version;
489
490
0
        stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
491
492
0
        uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
493
0
        myConverterData->currentType = ASCII1;
494
0
        cnv->fromUnicodeStatus =FALSE;
495
0
        if(pArgs->locale){
496
0
            uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale));
497
0
        }
498
0
        version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
499
0
        myConverterData->version = version;
500
0
        if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
501
0
            (myLocale[2]=='_' || myLocale[2]=='\0'))
502
0
        {
503
            /* open the required converters and cache them */
504
0
            if(version>MAX_JA_VERSION) {
505
                // ICU 55 fails to open a converter for an unsupported version.
506
                // Previously, it fell back to version 0, but that would yield
507
                // unexpected behavior.
508
0
                *errorCode = U_MISSING_RESOURCE_ERROR;
509
0
                return;
510
0
            }
511
0
            if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
512
0
                myConverterData->myConverterArray[ISO8859_7] =
513
0
                    ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
514
0
            }
515
0
            myConverterData->myConverterArray[JISX208] =
516
0
                ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
517
0
            if(jpCharsetMasks[version]&CSM(JISX212)) {
518
0
                myConverterData->myConverterArray[JISX212] =
519
0
                    ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
520
0
            }
521
0
            if(jpCharsetMasks[version]&CSM(GB2312)) {
522
0
                myConverterData->myConverterArray[GB2312] =
523
0
                    ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);   /* gb_2312_80-1 */
524
0
            }
525
0
            if(jpCharsetMasks[version]&CSM(KSC5601)) {
526
0
                myConverterData->myConverterArray[KSC5601] =
527
0
                    ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
528
0
            }
529
530
            /* set the function pointers to appropriate funtions */
531
0
            cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
532
0
            uprv_strcpy(myConverterData->locale,"ja");
533
534
0
            (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
535
0
            size_t len = uprv_strlen(myConverterData->name);
536
0
            myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
537
0
            myConverterData->name[len+1]='\0';
538
0
        }
539
0
#if !UCONFIG_ONLY_HTML_CONVERSION
540
0
        else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
541
0
            (myLocale[2]=='_' || myLocale[2]=='\0'))
542
0
        {
543
0
            if(version>1) {
544
                // ICU 55 fails to open a converter for an unsupported version.
545
                // Previously, it fell back to version 0, but that would yield
546
                // unexpected behavior.
547
0
                *errorCode = U_MISSING_RESOURCE_ERROR;
548
0
                return;
549
0
            }
550
0
            const char *cnvName;
551
0
            if(version==1) {
552
0
                cnvName="icu-internal-25546";
553
0
            } else {
554
0
                cnvName="ibm-949";
555
0
                myConverterData->version=version=0;
556
0
            }
557
0
            if(pArgs->onlyTestIsLoadable) {
558
0
                ucnv_canCreateConverter(cnvName, errorCode);  /* errorCode carries result */
559
0
                uprv_free(cnv->extraInfo);
560
0
                cnv->extraInfo=NULL;
561
0
                return;
562
0
            } else {
563
0
                myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
564
0
                if (U_FAILURE(*errorCode)) {
565
0
                    _ISO2022Close(cnv);
566
0
                    return;
567
0
                }
568
569
0
                if(version==1) {
570
0
                    (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
571
0
                    uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
572
0
                    cnv->subCharLen = myConverterData->currentConverter->subCharLen;
573
0
                }else{
574
0
                    (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
575
0
                }
576
577
                /* initialize the state variables */
578
0
                setInitialStateToUnicodeKR(cnv, myConverterData);
579
0
                setInitialStateFromUnicodeKR(cnv, myConverterData);
580
581
                /* set the function pointers to appropriate funtions */
582
0
                cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
583
0
                uprv_strcpy(myConverterData->locale,"ko");
584
0
            }
585
0
        }
586
0
        else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
587
0
            (myLocale[2]=='_' || myLocale[2]=='\0'))
588
0
        {
589
0
            if(version>2) {
590
                // ICU 55 fails to open a converter for an unsupported version.
591
                // Previously, it fell back to version 0, but that would yield
592
                // unexpected behavior.
593
0
                *errorCode = U_MISSING_RESOURCE_ERROR;
594
0
                return;
595
0
            }
596
597
            /* open the required converters and cache them */
598
0
            myConverterData->myConverterArray[GB2312_1] =
599
0
                ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
600
0
            if(version==1) {
601
0
                myConverterData->myConverterArray[ISO_IR_165] =
602
0
                    ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
603
0
            }
604
0
            myConverterData->myConverterArray[CNS_11643] =
605
0
                ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
606
607
608
            /* set the function pointers to appropriate funtions */
609
0
            cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
610
0
            uprv_strcpy(myConverterData->locale,"cn");
611
612
0
            if (version==0){
613
0
                myConverterData->version = 0;
614
0
                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
615
0
            }else if (version==1){
616
0
                myConverterData->version = 1;
617
0
                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
618
0
            }else {
619
0
                myConverterData->version = 2;
620
0
                (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
621
0
            }
622
0
        }
623
0
#endif  // !UCONFIG_ONLY_HTML_CONVERSION
624
0
        else{
625
#ifdef U_ENABLE_GENERIC_ISO_2022
626
            myConverterData->isFirstBuffer = TRUE;
627
628
            /* append the UTF-8 escape sequence */
629
            cnv->charErrorBufferLength = 3;
630
            cnv->charErrorBuffer[0] = 0x1b;
631
            cnv->charErrorBuffer[1] = 0x25;
632
            cnv->charErrorBuffer[2] = 0x42;
633
634
            cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
635
            /* initialize the state variables */
636
            uprv_strcpy(myConverterData->name,"ISO_2022");
637
#else
638
0
            *errorCode = U_MISSING_RESOURCE_ERROR;
639
            // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
640
            // data loading error code.
641
0
            return;
642
0
#endif
643
0
        }
644
645
0
        cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
646
647
0
        if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
648
0
            _ISO2022Close(cnv);
649
0
        }
650
0
    } else {
651
0
        *errorCode = U_MEMORY_ALLOCATION_ERROR;
652
0
    }
653
0
}
654
655
656
static void U_CALLCONV
657
0
_ISO2022Close(UConverter *converter) {
658
0
    UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
659
0
    UConverterSharedData **array = myData->myConverterArray;
660
0
    int32_t i;
661
662
0
    if (converter->extraInfo != NULL) {
663
        /*close the array of converter pointers and free the memory*/
664
0
        for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
665
0
            if(array[i]!=NULL) {
666
0
                ucnv_unloadSharedDataIfReady(array[i]);
667
0
            }
668
0
        }
669
670
0
        ucnv_close(myData->currentConverter);
671
672
0
        if(!converter->isExtraLocal){
673
0
            uprv_free (converter->extraInfo);
674
0
            converter->extraInfo = NULL;
675
0
        }
676
0
    }
677
0
}
678
679
static void U_CALLCONV
680
0
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
681
0
    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
682
0
    if(choice<=UCNV_RESET_TO_UNICODE) {
683
0
        uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
684
0
        myConverterData->key = 0;
685
0
        myConverterData->isEmptySegment = FALSE;
686
0
    }
687
0
    if(choice!=UCNV_RESET_TO_UNICODE) {
688
0
        uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
689
0
    }
690
#ifdef U_ENABLE_GENERIC_ISO_2022
691
    if(myConverterData->locale[0] == 0){
692
        if(choice<=UCNV_RESET_TO_UNICODE) {
693
            myConverterData->isFirstBuffer = TRUE;
694
            myConverterData->key = 0;
695
            if (converter->mode == UCNV_SO){
696
                ucnv_close (myConverterData->currentConverter);
697
                myConverterData->currentConverter=NULL;
698
            }
699
            converter->mode = UCNV_SI;
700
        }
701
        if(choice!=UCNV_RESET_TO_UNICODE) {
702
            /* re-append UTF-8 escape sequence */
703
            converter->charErrorBufferLength = 3;
704
            converter->charErrorBuffer[0] = 0x1b;
705
            converter->charErrorBuffer[1] = 0x28;
706
            converter->charErrorBuffer[2] = 0x42;
707
        }
708
    }
709
    else
710
#endif
711
0
    {
712
        /* reset the state variables */
713
0
        if(myConverterData->locale[0] == 'k'){
714
0
            if(choice<=UCNV_RESET_TO_UNICODE) {
715
0
                setInitialStateToUnicodeKR(converter, myConverterData);
716
0
            }
717
0
            if(choice!=UCNV_RESET_TO_UNICODE) {
718
0
                setInitialStateFromUnicodeKR(converter, myConverterData);
719
0
            }
720
0
        }
721
0
    }
722
0
}
723
724
U_CDECL_BEGIN
725
726
static const char * U_CALLCONV
727
0
_ISO2022getName(const UConverter* cnv){
728
0
    if(cnv->extraInfo){
729
0
        UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
730
0
        return myData->name;
731
0
    }
732
0
    return NULL;
733
0
}
734
735
U_CDECL_END
736
737
738
/*************** to unicode *******************/
739
/****************************************************************************
740
 * Recognized escape sequences are
741
 * <ESC>(B  ASCII
742
 * <ESC>.A  ISO-8859-1
743
 * <ESC>.F  ISO-8859-7
744
 * <ESC>(J  JISX-201
745
 * <ESC>(I  JISX-201
746
 * <ESC>$B  JISX-208
747
 * <ESC>$@  JISX-208
748
 * <ESC>$(D JISX-212
749
 * <ESC>$A  GB2312
750
 * <ESC>$(C KSC5601
751
 */
752
/*
 * Lookup table used by changeState_2022() for the ISO_2022_JP variant.
 * It is indexed with the offset that getKey_2022() reports for a completed
 * escape sequence; the entry is cast to StateEnum there. INVALID_STATE entries
 * make changeState_2022() report U_UNSUPPORTED_ESCAPE_SEQUENCE.
 * The header row below numbers the ten columns of each row (index modulo 10).
 */
static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
753
/*      0                1               2               3               4               5               6               7               8               9    */
754
    INVALID_STATE   ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
755
    ,ASCII          ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,JISX201        ,HWKANA_7BIT    ,JISX201        ,INVALID_STATE
756
    ,INVALID_STATE  ,INVALID_STATE  ,JISX208        ,GB2312         ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
757
    ,ISO8859_1      ,ISO8859_7      ,JISX208        ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,KSC5601        ,JISX212        ,INVALID_STATE
758
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
759
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
760
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
761
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
762
};
763
764
#if !UCONFIG_ONLY_HTML_CONVERSION
765
/*************** to unicode *******************/
766
/*
 * Lookup table used by changeState_2022() for the ISO_2022_CN variant.
 * It is indexed with the offset that getKey_2022() reports for a completed
 * escape sequence; the entry is cast to StateEnum there. INVALID_STATE entries
 * make changeState_2022() report U_UNSUPPORTED_ESCAPE_SEQUENCE.
 * The header row below numbers the ten columns of each row (index modulo 10).
 */
static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
767
/*      0                1               2               3               4               5               6               7               8               9    */
768
     INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,SS2_STATE      ,SS3_STATE      ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
769
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
770
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
771
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
772
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,GB2312_1       ,INVALID_STATE  ,ISO_IR_165
773
    ,CNS_11643_1    ,CNS_11643_2    ,CNS_11643_3    ,CNS_11643_4    ,CNS_11643_5    ,CNS_11643_6    ,CNS_11643_7    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
774
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
775
    ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE  ,INVALID_STATE
776
};
777
#endif
778
779
780
/*
 * Advances the escape-sequence key by one input byte and looks the new key up
 * in escSeqStateTable_Key_2022.
 *
 * @param c      next byte of the escape sequence
 * @param key    in: key accumulated so far; out: the new key on a match, 0 otherwise
 * @param offset out: index of the matching table entry, 0 when nothing matched
 * @return the escSeqStateTable_Value_2022 entry for the match, or INVALID_2022
 *         when the byte cannot appear in an escape sequence or the key is unknown
 */
static UCNV_TableStates_2022
getKey_2022(char c,int32_t* key,int32_t* offset){
    int32_t lower = 0;
    int32_t upper = MAX_STATES_2022;
    int32_t previousProbe = 0;
    const int32_t normalized = normalize_esq_chars_2022[(uint8_t)c];

    /* a normalized value of 0 marks a byte that is not valid in any escape sequence */
    if(normalized != 0) {
        const int32_t wanted = (*key << 5) + normalized;

        /* bisect the key table; stop once the probe no longer moves */
        while (upper != lower) {
            const int32_t probe = (upper+lower) >> 1;

            if (probe == previousProbe) {
                break;
            }

            const int32_t candidate = escSeqStateTable_Key_2022[probe];
            if (candidate == wanted) {
                *key = wanted;
                *offset = probe;
                return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[probe];
            }
            if (candidate > wanted) {
                upper = probe;
            } else {
                lower = probe;
            }
            previousProbe = probe;
        }
    }

    /* invalid byte or unknown key */
    *key = 0;
    *offset = 0;
    return INVALID_2022;
}
822
823
/*runs through a state machine to determine the escape sequence - codepage correspondence
824
 */
825
static void
826
changeState_2022(UConverter* _this,
827
                const char** source,
828
                const char* sourceLimit,
829
                Variant2022 var,
830
0
                UErrorCode* err){
831
0
    UCNV_TableStates_2022 value;
832
0
    UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
833
0
    uint32_t key = myData2022->key;
834
0
    int32_t offset = 0;
835
0
    int8_t initialToULength = _this->toULength;
836
0
    char c;
837
838
0
    value = VALID_NON_TERMINAL_2022;
839
0
    while (*source < sourceLimit) {
840
0
        c = *(*source)++;
841
0
        _this->toUBytes[_this->toULength++]=(uint8_t)c;
842
0
        value = getKey_2022(c,(int32_t *) &key, &offset);
843
844
0
        switch (value){
845
846
0
        case VALID_NON_TERMINAL_2022 :
847
            /* continue with the loop */
848
0
            break;
849
850
0
        case VALID_TERMINAL_2022:
851
0
            key = 0;
852
0
            goto DONE;
853
854
0
        case INVALID_2022:
855
0
            goto DONE;
856
857
0
        case VALID_MAYBE_TERMINAL_2022:
858
#ifdef U_ENABLE_GENERIC_ISO_2022
859
            /* ESC ( B is ambiguous only for ISO_2022 itself */
860
            if(var == ISO_2022) {
861
                /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
862
                _this->toULength = 0;
863
864
                /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
865
866
                /* continue with the loop */
867
                value = VALID_NON_TERMINAL_2022;
868
                break;
869
            } else
870
#endif
871
0
            {
872
                /* not ISO_2022 itself, finish here */
873
0
                value = VALID_TERMINAL_2022;
874
0
                key = 0;
875
0
                goto DONE;
876
0
            }
877
0
        }
878
0
    }
879
880
0
DONE:
881
0
    myData2022->key = key;
882
883
0
    if (value == VALID_NON_TERMINAL_2022) {
884
        /* indicate that the escape sequence is incomplete: key!=0 */
885
0
        return;
886
0
    } else if (value == INVALID_2022 ) {
887
0
        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
888
0
    } else /* value == VALID_TERMINAL_2022 */ {
889
0
        switch(var){
890
#ifdef U_ENABLE_GENERIC_ISO_2022
891
        case ISO_2022:
892
        {
893
            const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
894
            if(chosenConverterName == NULL) {
895
                /* SS2 or SS3 */
896
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
897
                _this->toUCallbackReason = UCNV_UNASSIGNED;
898
                return;
899
            }
900
901
            _this->mode = UCNV_SI;
902
            ucnv_close(myData2022->currentConverter);
903
            myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
904
            if(U_SUCCESS(*err)) {
905
                myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
906
                _this->mode = UCNV_SO;
907
            }
908
            break;
909
        }
910
#endif
911
0
        case ISO_2022_JP:
912
0
            {
913
0
                StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
914
0
                switch(tempState) {
915
0
                case INVALID_STATE:
916
0
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
917
0
                    break;
918
0
                case SS2_STATE:
919
0
                    if(myData2022->toU2022State.cs[2]!=0) {
920
0
                        if(myData2022->toU2022State.g<2) {
921
0
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
922
0
                        }
923
0
                        myData2022->toU2022State.g=2;
924
0
                    } else {
925
                        /* illegal to have SS2 before a matching designator */
926
0
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
927
0
                    }
928
0
                    break;
929
                /* case SS3_STATE: not used in ISO-2022-JP-x */
930
0
                case ISO8859_1:
931
0
                case ISO8859_7:
932
0
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
933
0
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
934
0
                    } else {
935
                        /* G2 charset for SS2 */
936
0
                        myData2022->toU2022State.cs[2]=(int8_t)tempState;
937
0
                    }
938
0
                    break;
939
0
                default:
940
0
                    if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
941
0
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
942
0
                    } else {
943
                        /* G0 charset */
944
0
                        myData2022->toU2022State.cs[0]=(int8_t)tempState;
945
0
                    }
946
0
                    break;
947
0
                }
948
0
            }
949
0
            break;
950
0
#if !UCONFIG_ONLY_HTML_CONVERSION
951
0
        case ISO_2022_CN:
952
0
            {
953
0
                StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
954
0
                switch(tempState) {
955
0
                case INVALID_STATE:
956
0
                    *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
957
0
                    break;
958
0
                case SS2_STATE:
959
0
                    if(myData2022->toU2022State.cs[2]!=0) {
960
0
                        if(myData2022->toU2022State.g<2) {
961
0
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
962
0
                        }
963
0
                        myData2022->toU2022State.g=2;
964
0
                    } else {
965
                        /* illegal to have SS2 before a matching designator */
966
0
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
967
0
                    }
968
0
                    break;
969
0
                case SS3_STATE:
970
0
                    if(myData2022->toU2022State.cs[3]!=0) {
971
0
                        if(myData2022->toU2022State.g<2) {
972
0
                            myData2022->toU2022State.prevG=myData2022->toU2022State.g;
973
0
                        }
974
0
                        myData2022->toU2022State.g=3;
975
0
                    } else {
976
                        /* illegal to have SS3 before a matching designator */
977
0
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
978
0
                    }
979
0
                    break;
980
0
                case ISO_IR_165:
981
0
                    if(myData2022->version==0) {
982
0
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
983
0
                        break;
984
0
                    }
985
0
                    U_FALLTHROUGH;
986
0
                case GB2312_1:
987
0
                    U_FALLTHROUGH;
988
0
                case CNS_11643_1:
989
0
                    myData2022->toU2022State.cs[1]=(int8_t)tempState;
990
0
                    break;
991
0
                case CNS_11643_2:
992
0
                    myData2022->toU2022State.cs[2]=(int8_t)tempState;
993
0
                    break;
994
0
                default:
995
                    /* other CNS 11643 planes */
996
0
                    if(myData2022->version==0) {
997
0
                        *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
998
0
                    } else {
999
0
                       myData2022->toU2022State.cs[3]=(int8_t)tempState;
1000
0
                    }
1001
0
                    break;
1002
0
                }
1003
0
            }
1004
0
            break;
1005
0
        case ISO_2022_KR:
1006
0
            if(offset==0x30){
1007
                /* nothing to be done, just accept this one escape sequence */
1008
0
            } else {
1009
0
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
1010
0
            }
1011
0
            break;
1012
0
#endif  // !UCONFIG_ONLY_HTML_CONVERSION
1013
1014
0
        default:
1015
0
            *err = U_ILLEGAL_ESCAPE_SEQUENCE;
1016
0
            break;
1017
0
        }
1018
0
    }
1019
0
    if(U_SUCCESS(*err)) {
1020
0
        _this->toULength = 0;
1021
0
    } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
1022
0
        if(_this->toULength>1) {
1023
            /*
1024
             * Ticket 5691: consistent illegal sequences:
1025
             * - We include at least the first byte (ESC) in the illegal sequence.
1026
             * - If any of the non-initial bytes could be the start of a character,
1027
             *   we stop the illegal sequence before the first one of those.
1028
             *   In escape sequences, all following bytes are "printable", that is,
1029
             *   unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
1030
             *   they are valid single/lead bytes.
1031
             *   For simplicity, we always only report the initial ESC byte as the
1032
             *   illegal sequence and back out all other bytes we looked at.
1033
             */
1034
            /* Back out some bytes. */
1035
0
            int8_t backOutDistance=_this->toULength-1;
1036
0
            int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
1037
0
            if(backOutDistance<=bytesFromThisBuffer) {
1038
                /* same as initialToULength<=1 */
1039
0
                *source-=backOutDistance;
1040
0
            } else {
1041
                /* Back out bytes from the previous buffer: Need to replay them. */
1042
0
                _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
1043
                /* same as -(initialToULength-1) */
1044
                /* preToULength is negative! */
1045
0
                uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
1046
0
                *source-=bytesFromThisBuffer;
1047
0
            }
1048
0
            _this->toULength=1;
1049
0
        }
1050
0
    } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1051
0
        _this->toUCallbackReason = UCNV_UNASSIGNED;
1052
0
    }
1053
0
}
1054
1055
#if !UCONFIG_ONLY_HTML_CONVERSION
1056
/*Checks the characters of the buffer against valid 2022 escape sequences
1057
*if they match we return a pointer to the initial start of the sequence; otherwise
1058
*we return sourceLimit
1059
*/
1060
/*for 2022 looks ahead in the stream
1061
 *to determine the longest possible convertible
1062
 *data stream
1063
 */
1064
static inline const char*
1065
getEndOfBuffer_2022(const char** source,
1066
                   const char* sourceLimit,
1067
0
                   UBool /*flush*/){
1068
1069
0
    const char* mySource = *source;
1070
1071
#ifdef U_ENABLE_GENERIC_ISO_2022
1072
    if (*source >= sourceLimit)
1073
        return sourceLimit;
1074
1075
    do{
1076
1077
        if (*mySource == ESC_2022){
1078
            int8_t i;
1079
            int32_t key = 0;
1080
            int32_t offset;
1081
            UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
1082
1083
            /* Kludge: I could not
1084
            * figure out the reason for validating an escape sequence
1085
            * twice - once here and once in changeState_2022().
1086
            * is it possible to have an ESC character in a ISO2022
1087
            * byte stream which is valid in a code page? Is it legal?
1088
            */
1089
            for (i=0;
1090
            (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
1091
            i++) {
1092
                value =  getKey_2022(*(mySource+i), &key, &offset);
1093
            }
1094
            if (value > 0 || *mySource==ESC_2022)
1095
                return mySource;
1096
1097
            if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
1098
                return sourceLimit;
1099
        }
1100
    }while (++mySource < sourceLimit);
1101
1102
    return sourceLimit;
1103
#else
1104
0
    while(mySource < sourceLimit && *mySource != ESC_2022) {
1105
0
        ++mySource;
1106
0
    }
1107
0
    return mySource;
1108
0
#endif
1109
0
}
1110
#endif
1111
1112
/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1113
 * any future change in _MBCSFromUChar32() function should be reflected here.
1114
 * @return number of bytes in *value; negative number if fallback; 0 if no mapping
1115
 */
1116
static inline int32_t
1117
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1118
                                         UChar32 c,
1119
                                         uint32_t* value,
1120
                                         UBool useFallback,
1121
                                         int outputType)
1122
0
{
1123
0
    const int32_t *cx;
1124
0
    const uint16_t *table;
1125
0
    uint32_t stage2Entry;
1126
0
    uint32_t myValue;
1127
0
    int32_t length;
1128
0
    const uint8_t *p;
1129
    /*
1130
     * TODO(markus): Use and require new, faster MBCS conversion table structures.
1131
     * Use internal version of ucnv_open() that verifies that the new structures are available,
1132
     * else U_INTERNAL_PROGRAM_ERROR.
1133
     */
1134
    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1135
0
    if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1136
0
        table=sharedData->mbcs.fromUnicodeTable;
1137
0
        stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1138
        /* get the bytes and the length for the output */
1139
0
        if(outputType==MBCS_OUTPUT_2){
1140
0
            myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1141
0
            if(myValue<=0xff) {
1142
0
                length=1;
1143
0
            } else {
1144
0
                length=2;
1145
0
            }
1146
0
        } else /* outputType==MBCS_OUTPUT_3 */ {
1147
0
            p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1148
0
            myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1149
0
            if(myValue<=0xff) {
1150
0
                length=1;
1151
0
            } else if(myValue<=0xffff) {
1152
0
                length=2;
1153
0
            } else {
1154
0
                length=3;
1155
0
            }
1156
0
        }
1157
        /* is this code point assigned, or do we use fallbacks? */
1158
0
        if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1159
            /* assigned */
1160
0
            *value=myValue;
1161
0
            return length;
1162
0
        } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1163
            /*
1164
             * We allow a 0 byte output if the "assigned" bit is set for this entry.
1165
             * There is no way with this data structure for fallback output
1166
             * to be a zero byte.
1167
             */
1168
0
            *value=myValue;
1169
0
            return -length;
1170
0
        }
1171
0
    }
1172
1173
0
    cx=sharedData->mbcs.extIndexes;
1174
0
    if(cx!=NULL) {
1175
0
        return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1176
0
    }
1177
1178
    /* unassigned */
1179
0
    return 0;
1180
0
}
1181
1182
/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1183
 * any future change in _MBCSSingleFromUChar32() function should be reflected here.
1184
 * @param retval pointer to output byte
1185
 * @return 1 roundtrip byte  0 no mapping  -1 fallback byte
1186
 */
1187
static inline int32_t
1188
MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1189
                                       UChar32 c,
1190
                                       uint32_t* retval,
1191
                                       UBool useFallback)
1192
0
{
1193
0
    const uint16_t *table;
1194
0
    int32_t value;
1195
    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1196
0
    if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1197
0
        return 0;
1198
0
    }
1199
    /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1200
0
    table=sharedData->mbcs.fromUnicodeTable;
1201
    /* get the byte for the output */
1202
0
    value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1203
    /* is this code point assigned, or do we use fallbacks? */
1204
0
    *retval=(uint32_t)(value&0xff);
1205
0
    if(value>=0xf00) {
1206
0
        return 1;  /* roundtrip */
1207
0
    } else if(useFallback ? value>=0x800 : value>=0xc00) {
1208
0
        return -1;  /* fallback taken */
1209
0
    } else {
1210
0
        return 0;  /* no mapping */
1211
0
    }
1212
0
}
1213
1214
/*
1215
 * Check that the result is a 2-byte value with each byte in the range A1..FE
1216
 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
1217
 * to move it to the ISO 2022 range 21..7E.
1218
 * Return 0 if out of range.
1219
 */
1220
/*
 * Accepts a value only if its low 16 bits lie in A1A1..FEFE and its low byte
 * lies in A1..FE, then moves both bytes down by 0x80 into the 21..7E range.
 * Returns 0 for everything else.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint16_t pairOffset = (uint16_t)(value - 0xa1a1);
    const uint8_t trailOffset = (uint8_t)(value - 0xa1);

    if(pairOffset > (0xfefe - 0xa1a1) || trailOffset > (0xfe - 0xa1)) {
        return 0;  /* not valid for ISO 2022 */
    }
    return value - 0x8080;  /* shift down to 21..7e byte range */
}
1230
1231
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
1232
/*
1233
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
1234
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
1235
 * unchanged. 
1236
 */
1237
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    /* candidate with 0x80 added to each of the two low bytes */
    const uint32_t shifted = value + 0x8080;
    const uint16_t pairOffset = (uint16_t)(shifted - 0xa1a1);
    const uint8_t trailOffset = (uint8_t)(shifted - 0xa1);

    /* keep the input unchanged unless the candidate is within A1..FE per byte */
    if(pairOffset > (0xfefe - 0xa1a1) || trailOffset > (0xfe - 0xa1)) {
        return value;
    }
    return shifted;
}
1247
#endif
1248
1249
#ifdef U_ENABLE_GENERIC_ISO_2022
1250
1251
/**********************************************************************************
1252
*  ISO-2022 Converter
1253
*
1254
*
1255
*/
1256
1257
static void U_CALLCONV
1258
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1259
                                                           UErrorCode* err){
1260
    const char* mySourceLimit, *realSourceLimit;
1261
    const char* sourceStart;
1262
    const UChar* myTargetStart;
1263
    UConverter* saveThis;
1264
    UConverterDataISO2022* myData;
1265
    int8_t length;
1266
1267
    saveThis = args->converter;
1268
    myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1269
1270
    realSourceLimit = args->sourceLimit;
1271
    while (args->source < realSourceLimit) {
1272
        if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1273
            /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1274
            mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1275
1276
            if(args->source < mySourceLimit) {
1277
                if(myData->currentConverter==NULL) {
1278
                    myData->currentConverter = ucnv_open("ASCII",err);
1279
                    if(U_FAILURE(*err)){
1280
                        return;
1281
                    }
1282
1283
                    myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1284
                    saveThis->mode = UCNV_SO;
1285
                }
1286
1287
                /* convert to before the ESC or until the end of the buffer */
1288
                myData->isFirstBuffer=FALSE;
1289
                sourceStart = args->source;
1290
                myTargetStart = args->target;
1291
                args->converter = myData->currentConverter;
1292
                ucnv_toUnicode(args->converter,
1293
                    &args->target,
1294
                    args->targetLimit,
1295
                    &args->source,
1296
                    mySourceLimit,
1297
                    args->offsets,
1298
                    (UBool)(args->flush && mySourceLimit == realSourceLimit),
1299
                    err);
1300
                args->converter = saveThis;
1301
1302
                if (*err == U_BUFFER_OVERFLOW_ERROR) {
1303
                    /* move the overflow buffer */
1304
                    length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1305
                    myData->currentConverter->UCharErrorBufferLength = 0;
1306
                    if(length > 0) {
1307
                        uprv_memcpy(saveThis->UCharErrorBuffer,
1308
                                    myData->currentConverter->UCharErrorBuffer,
1309
                                    length*U_SIZEOF_UCHAR);
1310
                    }
1311
                    return;
1312
                }
1313
1314
                /*
1315
                 * At least one of:
1316
                 * -Error while converting
1317
                 * -Done with entire buffer
1318
                 * -Need to write offsets or update the current offset
1319
                 *  (leave that up to the code in ucnv.c)
1320
                 *
1321
                 * or else we just stopped at an ESC byte and continue with changeState_2022()
1322
                 */
1323
                if (U_FAILURE(*err) ||
1324
                    (args->source == realSourceLimit) ||
1325
                    (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
1326
                    (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1327
                ) {
1328
                    /* copy partial or error input for truncated detection and error handling */
1329
                    if(U_FAILURE(*err)) {
1330
                        length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1331
                        if(length > 0) {
1332
                            uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1333
                        }
1334
                    } else {
1335
                        length = saveThis->toULength = myData->currentConverter->toULength;
1336
                        if(length > 0) {
1337
                            uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1338
                            if(args->source < mySourceLimit) {
1339
                                *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1340
                            }
1341
                        }
1342
                    }
1343
                    return;
1344
                }
1345
            }
1346
        }
1347
1348
        sourceStart = args->source;
1349
        changeState_2022(args->converter,
1350
               &(args->source),
1351
               realSourceLimit,
1352
               ISO_2022,
1353
               err);
1354
        if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
1355
            /* let the ucnv.c code update its current offset */
1356
            return;
1357
        }
1358
    }
1359
}
1360
1361
#endif
1362
1363
/*
1364
 * To Unicode Callback helper function
1365
 */
1366
static void
1367
toUnicodeCallback(UConverter *cnv,
1368
                  const uint32_t sourceChar, const uint32_t targetUniChar,
1369
0
                  UErrorCode* err){
1370
0
    if(sourceChar>0xff){
1371
0
        cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1372
0
        cnv->toUBytes[1] = (uint8_t)sourceChar;
1373
0
        cnv->toULength = 2;
1374
0
    }
1375
0
    else{
1376
0
        cnv->toUBytes[0] =(char) sourceChar;
1377
0
        cnv->toULength = 1;
1378
0
    }
1379
1380
0
    if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1381
0
        *err = U_INVALID_CHAR_FOUND;
1382
0
    }
1383
0
    else{
1384
0
        *err = U_ILLEGAL_CHAR_FOUND;
1385
0
    }
1386
0
}
1387
1388
/**************************************ISO-2022-JP*************************************************/
1389
1390
/************************************** IMPORTANT **************************************************
1391
* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1392
* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1393
* The converter iterates over each Unicode codepoint
1394
* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1395
* processed one char at a time it would make sense to reduce the extra processing a canned converter
1396
* would do as far as possible.
1397
*
1398
* If the implementation of these macros or structure of sharedData struct change in the future, make
1399
* sure that ISO-2022 is also changed.
1400
***************************************************************************************************
1401
*/
1402
1403
/***************************************************************************************************
1404
* Rules for ISO-2022-jp encoding
1405
* (i)   Escape sequences must be fully contained within a line they should not
1406
*       span new lines or CRs
1407
* (ii)  If the last character on a line is represented by two bytes then an ASCII or
1408
*       JIS-Roman character escape sequence should follow before the line terminates
1409
* (iii) If the first character on the line is represented by two bytes then a two
1410
*       byte character escape sequence should precede it
1411
* (iv)  If no escape sequence is encountered then the characters are ASCII
1412
* (v)   Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1413
*       and invoked with SS2 (ESC N).
1414
* (vi)  If there is any G0 designation in text, there must be a switch to
1415
*       ASCII or to JIS X 0201-Roman before a space character (but not
1416
*       necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1417
*       characters such as tab or CRLF.
1418
* (vii) Supported encodings:
1419
*          ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1420
*
1421
*  source : RFC-1554
1422
*
1423
*          JISX201, JISX208,JISX212 : new .cnv data files created
1424
*          KSC5601 : alias to ibm-949 mapping table
1425
*          GB2312 : alias to ibm-1386 mapping table
1426
*          ISO-8859-1 : Algorithmic implemented as LATIN1 case
1427
*          ISO-8859-7 : alias to ibm-9409 mapping table
1428
*/
1429
1430
/* preference order of JP charsets */
1431
static const StateEnum jpCharsetPref[]={
1432
    ASCII,
1433
    JISX201,
1434
    ISO8859_1,
1435
    JISX208,
1436
    ISO8859_7,
1437
    JISX212,
1438
    GB2312,
1439
    KSC5601,
1440
    HWKANA_7BIT
1441
};
1442
1443
/*
 * Designation escape sequences, indexed by charset enum value.
 *
 * The escape sequences must be in order of the enum constants like JISX201  = 3,
 * not in order of jpCharsetPref[]!
 *
 * Each row is NUL-padded to 6 bytes; the number of meaningful bytes for
 * row i is escSeqCharsLen[i].
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */
};
1459
/*
 * Byte length of each designation escape sequence in escSeqChars[],
 * in the same (charset enum) order.
 */
static  const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};
1470
1471
/*
1472
* The iteration over various code pages works this way:
1473
* i)   Get the currentState from myConverterData->currentState
1474
* ii)  Check if the character is mapped to a valid character in the currentState
1475
*      Yes ->  a) set the initIterState to currentState
1476
*       b) remain in this state until an invalid character is found
1477
*      No  ->  a) go to the next code page and find the character
1478
* iii) Before changing the state increment the current state check if the current state
1479
*      is equal to the initial iteration state (initIterState)
1480
*      Yes ->  A character that cannot be represented in any of the supported encodings
1481
*       break and return a U_INVALID_CHAR_FOUND error
1482
*      No  ->  Continue and find the character in next code page
1483
*
1484
*
1485
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1486
*/
1487
1488
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 *
 * Only two values differ from the identity mapping:
 *   0x5C -> U+00A5 and 0x7E -> U+203E.
 * Callers are expected to pass values <= 0x7F; any other value is
 * returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1501
1502
/*
 * Map Unicode to 00..7F according to JIS X 0201.
 * Return U+FFFE if unmappable.
 *
 * U+0000..U+007F map to themselves except U+005C and U+007E, which are
 * unmappable here because bytes 0x5C and 0x7E encode U+00A5 and U+203E.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    if(value <= 0x7f) {
        /* 0x5c and 0x7e are taken by U+00A5 and U+203E */
        return (value != 0x5c && value != 0x7e) ? value : 0xfffe;
    }
    if(value == 0xa5) {
        return 0x5c;
    }
    if(value == 0x203e) {
        return 0x7e;
    }
    return 0xfffe;
}
1516
1517
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range.
 *
 * The input is (lead << 8) | trail. Only the upper bound (0xEFFC) is checked;
 * the caller must pass a well-formed Shift-JIS double-byte value.
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint8_t trail;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;

    /* fold the two Shift-JIS lead byte ranges into one contiguous range */
    value &= 0xff00;  /* lead byte */
    if(value <= 0x9f00) {
        value -= 0x7000;
    } else /* 0xe000 <= value <= 0xef00 */ {
        value -= 0xb000;
    }
    /* each Shift-JIS lead byte covers two JIS X 0208 rows */
    value <<= 1;

    if(trail <= 0x9e) {
        /* first (odd) row of the pair; trail bytes skip 0x7f */
        value -= 0x100;
        if(trail <= 0x7e) {
            value |= trail - 0x1f;
        } else {
            value |= trail - 0x20;
        }
    } else /* trail <= 0xfc */ {
        /* second (even) row of the pair */
        value |= trail - 0x7e;
    }
    return value;
}
1552
1553
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 *
 * @param c1     first (row) byte of the JIS X 0208 pair
 * @param c2     second (cell) byte of the JIS X 0208 pair
 * @param bytes  receives the Shift-JIS lead byte in bytes[0] and the trail
 *               byte in bytes[1]; a byte detected as invalid is written as 0
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    if(c1&1) {
        /* odd row: first half of a Shift-JIS lead byte, trail skips 0x7f */
        ++c1;
        if(c2 <= 0x5f) {
            c2 += 0x1f;
        } else if(c2 <= 0x7e) {
            c2 += 0x20;
        } else {
            c2 = 0;  /* invalid */
        }
    } else {
        /* even row: second half of a Shift-JIS lead byte */
        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
            c2 += 0x7e;
        } else {
            c2 = 0;  /* invalid */
        }
    }
    /* two rows share one lead byte; split into the two lead byte ranges */
    c1 >>= 1;
    if(c1 <= 0x2f) {
        c1 += 0x70;
    } else if(c1 <= 0x3f) {
        c1 += 0xb0;
    } else {
        c1 = 0;  /* invalid */
    }
    bytes[0] = (char)c1;
    bytes[1] = (char)c2;
}
1589
1590
/*
1591
 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
1592
 * Katakana.
1593
 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
1594
 * because Shift-JIS roundtrips half-width Katakana to single bytes.
1595
 * These were the only fallbacks in ICU's jisx-208.ucm file.
1596
 */
1597
static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
1598
    0x2123,  /* U+FF61 */
1599
    0x2156,
1600
    0x2157,
1601
    0x2122,
1602
    0x2126,
1603
    0x2572,
1604
    0x2521,
1605
    0x2523,
1606
    0x2525,
1607
    0x2527,
1608
    0x2529,
1609
    0x2563,
1610
    0x2565,
1611
    0x2567,
1612
    0x2543,
1613
    0x213C,  /* U+FF70 */
1614
    0x2522,
1615
    0x2524,
1616
    0x2526,
1617
    0x2528,
1618
    0x252A,
1619
    0x252B,
1620
    0x252D,
1621
    0x252F,
1622
    0x2531,
1623
    0x2533,
1624
    0x2535,
1625
    0x2537,
1626
    0x2539,
1627
    0x253B,
1628
    0x253D,
1629
    0x253F,  /* U+FF80 */
1630
    0x2541,
1631
    0x2544,
1632
    0x2546,
1633
    0x2548,
1634
    0x254A,
1635
    0x254B,
1636
    0x254C,
1637
    0x254D,
1638
    0x254E,
1639
    0x254F,
1640
    0x2552,
1641
    0x2555,
1642
    0x2558,
1643
    0x255B,
1644
    0x255E,
1645
    0x255F,  /* U+FF90 */
1646
    0x2560,
1647
    0x2561,
1648
    0x2562,
1649
    0x2564,
1650
    0x2566,
1651
    0x2568,
1652
    0x2569,
1653
    0x256A,
1654
    0x256B,
1655
    0x256C,
1656
    0x256D,
1657
    0x256F,
1658
    0x2573,
1659
    0x212B,
1660
    0x212C   /* U+FF9F */
1661
};
1662
1663
static void U_CALLCONV
1664
0
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1665
0
    UConverter *cnv = args->converter;
1666
0
    UConverterDataISO2022 *converterData;
1667
0
    ISO2022State *pFromU2022State;
1668
0
    uint8_t *target = (uint8_t *) args->target;
1669
0
    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1670
0
    const UChar* source = args->source;
1671
0
    const UChar* sourceLimit = args->sourceLimit;
1672
0
    int32_t* offsets = args->offsets;
1673
0
    UChar32 sourceChar;
1674
0
    char buffer[8];
1675
0
    int32_t len, outLen;
1676
0
    int8_t choices[10];
1677
0
    int32_t choiceCount;
1678
0
    uint32_t targetValue = 0;
1679
0
    UBool useFallback;
1680
1681
0
    int32_t i;
1682
0
    int8_t cs, g;
1683
1684
    /* set up the state */
1685
0
    converterData     = (UConverterDataISO2022*)cnv->extraInfo;
1686
0
    pFromU2022State   = &converterData->fromU2022State;
1687
1688
0
    choiceCount = 0;
1689
1690
    /* check if the last codepoint of previous buffer was a lead surrogate*/
1691
0
    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1692
0
        goto getTrail;
1693
0
    }
1694
1695
0
    while(source < sourceLimit) {
1696
0
        if(target < targetLimit) {
1697
1698
0
            sourceChar  = *(source++);
1699
            /*check if the char is a First surrogate*/
1700
0
            if(U16_IS_SURROGATE(sourceChar)) {
1701
0
                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
1702
0
getTrail:
1703
                    /*look ahead to find the trail surrogate*/
1704
0
                    if(source < sourceLimit) {
1705
                        /* test the following code unit */
1706
0
                        UChar trail=(UChar) *source;
1707
0
                        if(U16_IS_TRAIL(trail)) {
1708
0
                            source++;
1709
0
                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
1710
0
                            cnv->fromUChar32=0x00;
1711
                            /* convert this supplementary code point */
1712
                            /* exit this condition tree */
1713
0
                        } else {
1714
                            /* this is an unmatched lead code unit (1st surrogate) */
1715
                            /* callback(illegal) */
1716
0
                            *err=U_ILLEGAL_CHAR_FOUND;
1717
0
                            cnv->fromUChar32=sourceChar;
1718
0
                            break;
1719
0
                        }
1720
0
                    } else {
1721
                        /* no more input */
1722
0
                        cnv->fromUChar32=sourceChar;
1723
0
                        break;
1724
0
                    }
1725
0
                } else {
1726
                    /* this is an unmatched trail code unit (2nd surrogate) */
1727
                    /* callback(illegal) */
1728
0
                    *err=U_ILLEGAL_CHAR_FOUND;
1729
0
                    cnv->fromUChar32=sourceChar;
1730
0
                    break;
1731
0
                }
1732
0
            }
1733
1734
            /* do not convert SO/SI/ESC */
1735
0
            if(IS_2022_CONTROL(sourceChar)) {
1736
                /* callback(illegal) */
1737
0
                *err=U_ILLEGAL_CHAR_FOUND;
1738
0
                cnv->fromUChar32=sourceChar;
1739
0
                break;
1740
0
            }
1741
1742
            /* do the conversion */
1743
1744
0
            if(choiceCount == 0) {
1745
0
                uint16_t csm;
1746
1747
                /*
1748
                 * The csm variable keeps track of which charsets are allowed
1749
                 * and not used yet while building the choices[].
1750
                 */
1751
0
                csm = jpCharsetMasks[converterData->version];
1752
0
                choiceCount = 0;
1753
1754
                /* JIS7/8: try single-byte half-width Katakana before JISX208 */
1755
0
                if(converterData->version == 3 || converterData->version == 4) {
1756
0
                    choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1757
0
                }
1758
                /* Do not try single-byte half-width Katakana for other versions. */
1759
0
                csm &= ~CSM(HWKANA_7BIT);
1760
1761
                /* try the current G0 charset */
1762
0
                choices[choiceCount++] = cs = pFromU2022State->cs[0];
1763
0
                csm &= ~CSM(cs);
1764
1765
                /* try the current G2 charset */
1766
0
                if((cs = pFromU2022State->cs[2]) != 0) {
1767
0
                    choices[choiceCount++] = cs;
1768
0
                    csm &= ~CSM(cs);
1769
0
                }
1770
1771
                /* try all the other possible charsets */
1772
0
                for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
1773
0
                    cs = (int8_t)jpCharsetPref[i];
1774
0
                    if(CSM(cs) & csm) {
1775
0
                        choices[choiceCount++] = cs;
1776
0
                        csm &= ~CSM(cs);
1777
0
                    }
1778
0
                }
1779
0
            }
1780
1781
0
            cs = g = 0;
1782
            /*
1783
             * len==0: no mapping found yet
1784
             * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1785
             * len>0: found a roundtrip result, done
1786
             */
1787
0
            len = 0;
1788
            /*
1789
             * We will turn off useFallback after finding a fallback,
1790
             * but we still get fallbacks from PUA code points as usual.
1791
             * Therefore, we will also need to check that we don't overwrite
1792
             * an early fallback with a later one.
1793
             */
1794
0
            useFallback = cnv->useFallback;
1795
1796
0
            for(i = 0; i < choiceCount && len <= 0; ++i) {
1797
0
                uint32_t value;
1798
0
                int32_t len2;
1799
0
                int8_t cs0 = choices[i];
1800
0
                switch(cs0) {
1801
0
                case ASCII:
1802
0
                    if(sourceChar <= 0x7f) {
1803
0
                        targetValue = (uint32_t)sourceChar;
1804
0
                        len = 1;
1805
0
                        cs = cs0;
1806
0
                        g = 0;
1807
0
                    }
1808
0
                    break;
1809
0
                case ISO8859_1:
1810
0
                    if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1811
0
                        targetValue = (uint32_t)sourceChar - 0x80;
1812
0
                        len = 1;
1813
0
                        cs = cs0;
1814
0
                        g = 2;
1815
0
                    }
1816
0
                    break;
1817
0
                case HWKANA_7BIT:
1818
0
                    if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1819
0
                        if(converterData->version==3) {
1820
                            /* JIS7: use G1 (SO) */
1821
                            /* Shift U+FF61..U+FF9F to bytes 21..5F. */
1822
0
                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1823
0
                            len = 1;
1824
0
                            pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1825
0
                            g = 1;
1826
0
                        } else if(converterData->version==4) {
1827
                            /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1828
                            /* Shift U+FF61..U+FF9F to bytes A1..DF. */
1829
0
                            targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1830
0
                            len = 1;
1831
1832
0
                            cs = pFromU2022State->cs[0];
1833
0
                            if(IS_JP_DBCS(cs)) {
1834
                                /* switch from a DBCS charset to JISX201 */
1835
0
                                cs = (int8_t)JISX201;
1836
0
                            }
1837
                            /* else stay in the current G0 charset */
1838
0
                            g = 0;
1839
0
                        }
1840
                        /* else do not use HWKANA_7BIT with other versions */
1841
0
                    }
1842
0
                    break;
1843
0
                case JISX201:
1844
                    /* G0 SBCS */
1845
0
                    value = jisx201FromU(sourceChar);
1846
0
                    if(value <= 0x7f) {
1847
0
                        targetValue = value;
1848
0
                        len = 1;
1849
0
                        cs = cs0;
1850
0
                        g = 0;
1851
0
                        useFallback = FALSE;
1852
0
                    }
1853
0
                    break;
1854
0
                case JISX208:
1855
                    /* G0 DBCS from Shift-JIS table */
1856
0
                    len2 = MBCS_FROM_UCHAR32_ISO2022(
1857
0
                                converterData->myConverterArray[cs0],
1858
0
                                sourceChar, &value,
1859
0
                                useFallback, MBCS_OUTPUT_2);
1860
0
                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1861
0
                        value = _2022FromSJIS(value);
1862
0
                        if(value != 0) {
1863
0
                            targetValue = value;
1864
0
                            len = len2;
1865
0
                            cs = cs0;
1866
0
                            g = 0;
1867
0
                            useFallback = FALSE;
1868
0
                        }
1869
0
                    } else if(len == 0 && useFallback &&
1870
0
                              (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
1871
0
                        targetValue = hwkana_fb[sourceChar - HWKANA_START];
1872
0
                        len = -2;
1873
0
                        cs = cs0;
1874
0
                        g = 0;
1875
0
                        useFallback = FALSE;
1876
0
                    }
1877
0
                    break;
1878
0
                case ISO8859_7:
1879
                    /* G0 SBCS forced to 7-bit output */
1880
0
                    len2 = MBCS_SINGLE_FROM_UCHAR32(
1881
0
                                converterData->myConverterArray[cs0],
1882
0
                                sourceChar, &value,
1883
0
                                useFallback);
1884
0
                    if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1885
0
                        targetValue = value - 0x80;
1886
0
                        len = len2;
1887
0
                        cs = cs0;
1888
0
                        g = 2;
1889
0
                        useFallback = FALSE;
1890
0
                    }
1891
0
                    break;
1892
0
                default:
1893
                    /* G0 DBCS */
1894
0
                    len2 = MBCS_FROM_UCHAR32_ISO2022(
1895
0
                                converterData->myConverterArray[cs0],
1896
0
                                sourceChar, &value,
1897
0
                                useFallback, MBCS_OUTPUT_2);
1898
0
                    if(len2 == 2 || (len2 == -2 && len == 0)) {  /* only accept DBCS: abs(len)==2 */
1899
0
                        if(cs0 == KSC5601) {
1900
                            /*
1901
                             * Check for valid bytes for the encoding scheme.
1902
                             * This is necessary because the sub-converter (windows-949)
1903
                             * has a broader encoding scheme than is valid for 2022.
1904
                             */
1905
0
                            value = _2022FromGR94DBCS(value);
1906
0
                            if(value == 0) {
1907
0
                                break;
1908
0
                            }
1909
0
                        }
1910
0
                        targetValue = value;
1911
0
                        len = len2;
1912
0
                        cs = cs0;
1913
0
                        g = 0;
1914
0
                        useFallback = FALSE;
1915
0
                    }
1916
0
                    break;
1917
0
                }
1918
0
            }
1919
1920
0
            if(len != 0) {
1921
0
                if(len < 0) {
1922
0
                    len = -len;  /* fallback */
1923
0
                }
1924
0
                outLen = 0; /* count output bytes */
1925
1926
                /* write SI if necessary (only for JIS7) */
1927
0
                if(pFromU2022State->g == 1 && g == 0) {
1928
0
                    buffer[outLen++] = UCNV_SI;
1929
0
                    pFromU2022State->g = 0;
1930
0
                }
1931
1932
                /* write the designation sequence if necessary */
1933
0
                if(cs != pFromU2022State->cs[g]) {
1934
0
                    int32_t escLen = escSeqCharsLen[cs];
1935
0
                    uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1936
0
                    outLen += escLen;
1937
0
                    pFromU2022State->cs[g] = cs;
1938
1939
                    /* invalidate the choices[] */
1940
0
                    choiceCount = 0;
1941
0
                }
1942
1943
                /* write the shift sequence if necessary */
1944
0
                if(g != pFromU2022State->g) {
1945
0
                    switch(g) {
1946
                    /* case 0 handled before writing escapes */
1947
0
                    case 1:
1948
0
                        buffer[outLen++] = UCNV_SO;
1949
0
                        pFromU2022State->g = 1;
1950
0
                        break;
1951
0
                    default: /* case 2 */
1952
0
                        buffer[outLen++] = 0x1b;
1953
0
                        buffer[outLen++] = 0x4e;
1954
0
                        break;
1955
                    /* no case 3: no SS3 in ISO-2022-JP-x */
1956
0
                    }
1957
0
                }
1958
1959
                /* write the output bytes */
1960
0
                if(len == 1) {
1961
0
                    buffer[outLen++] = (char)targetValue;
1962
0
                } else /* len == 2 */ {
1963
0
                    buffer[outLen++] = (char)(targetValue >> 8);
1964
0
                    buffer[outLen++] = (char)targetValue;
1965
0
                }
1966
0
            } else {
1967
                /*
1968
                 * if we cannot find the character after checking all codepages
1969
                 * then this is an error
1970
                 */
1971
0
                *err = U_INVALID_CHAR_FOUND;
1972
0
                cnv->fromUChar32=sourceChar;
1973
0
                break;
1974
0
            }
1975
1976
0
            if(sourceChar == CR || sourceChar == LF) {
1977
                /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1978
0
                pFromU2022State->cs[2] = 0;
1979
0
                choiceCount = 0;
1980
0
            }
1981
1982
            /* output outLen>0 bytes in buffer[] */
1983
0
            if(outLen == 1) {
1984
0
                *target++ = buffer[0];
1985
0
                if(offsets) {
1986
0
                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1987
0
                }
1988
0
            } else if(outLen == 2 && (target + 2) <= targetLimit) {
1989
0
                *target++ = buffer[0];
1990
0
                *target++ = buffer[1];
1991
0
                if(offsets) {
1992
0
                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1993
0
                    *offsets++ = sourceIndex;
1994
0
                    *offsets++ = sourceIndex;
1995
0
                }
1996
0
            } else {
1997
0
                fromUWriteUInt8(
1998
0
                    cnv,
1999
0
                    buffer, outLen,
2000
0
                    &target, (const char *)targetLimit,
2001
0
                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
2002
0
                    err);
2003
0
                if(U_FAILURE(*err)) {
2004
0
                    break;
2005
0
                }
2006
0
            }
2007
0
        } /* end if(myTargetIndex<myTargetLength) */
2008
0
        else{
2009
0
            *err =U_BUFFER_OVERFLOW_ERROR;
2010
0
            break;
2011
0
        }
2012
2013
0
    }/* end while(mySourceIndex<mySourceLength) */
2014
2015
    /*
2016
     * the end of the input stream and detection of truncated input
2017
     * are handled by the framework, but for ISO-2022-JP conversion
2018
     * we need to be in ASCII mode at the very end
2019
     *
2020
     * conditions:
2021
     *   successful
2022
     *   in SO mode or not in ASCII mode
2023
     *   end of input and no truncated input
2024
     */
2025
0
    if( U_SUCCESS(*err) &&
2026
0
        (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
2027
0
        args->flush && source>=sourceLimit && cnv->fromUChar32==0
2028
0
    ) {
2029
0
        int32_t sourceIndex;
2030
2031
0
        outLen = 0;
2032
2033
0
        if(pFromU2022State->g != 0) {
2034
0
            buffer[outLen++] = UCNV_SI;
2035
0
            pFromU2022State->g = 0;
2036
0
        }
2037
2038
0
        if(pFromU2022State->cs[0] != ASCII) {
2039
0
            int32_t escLen = escSeqCharsLen[ASCII];
2040
0
            uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
2041
0
            outLen += escLen;
2042
0
            pFromU2022State->cs[0] = (int8_t)ASCII;
2043
0
        }
2044
2045
        /* get the source index of the last input character */
2046
        /*
2047
         * TODO this would be simpler and more reliable if we used a pair
2048
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2049
         * so that we could simply use the prevSourceIndex here;
2050
         * this code gives an incorrect result for the rare case of an unmatched
2051
         * trail surrogate that is alone in the last buffer of the text stream
2052
         */
2053
0
        sourceIndex=(int32_t)(source-args->source);
2054
0
        if(sourceIndex>0) {
2055
0
            --sourceIndex;
2056
0
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2057
0
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2058
0
            ) {
2059
0
                --sourceIndex;
2060
0
            }
2061
0
        } else {
2062
0
            sourceIndex=-1;
2063
0
        }
2064
2065
0
        fromUWriteUInt8(
2066
0
            cnv,
2067
0
            buffer, outLen,
2068
0
            &target, (const char *)targetLimit,
2069
0
            &offsets, sourceIndex,
2070
0
            err);
2071
0
    }
2072
2073
    /*save the state and return */
2074
0
    args->source = source;
2075
0
    args->target = (char*)target;
2076
0
}
2077
2078
/*************** to unicode *******************/
2079
2080
static void U_CALLCONV
2081
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2082
0
                                               UErrorCode* err){
2083
0
    char tempBuf[2];
2084
0
    const char *mySource = (char *) args->source;
2085
0
    UChar *myTarget = args->target;
2086
0
    const char *mySourceLimit = args->sourceLimit;
2087
0
    uint32_t targetUniChar = 0x0000;
2088
0
    uint32_t mySourceChar = 0x0000;
2089
0
    uint32_t tmpSourceChar = 0x0000;
2090
0
    UConverterDataISO2022* myData;
2091
0
    ISO2022State *pToU2022State;
2092
0
    StateEnum cs;
2093
2094
0
    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2095
0
    pToU2022State = &myData->toU2022State;
2096
2097
0
    if(myData->key != 0) {
2098
        /* continue with a partial escape sequence */
2099
0
        goto escape;
2100
0
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2101
        /* continue with a partial double-byte character */
2102
0
        mySourceChar = args->converter->toUBytes[0];
2103
0
        args->converter->toULength = 0;
2104
0
        cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2105
0
        targetUniChar = missingCharMarker;
2106
0
        goto getTrailByte;
2107
0
    }
2108
2109
0
    while(mySource < mySourceLimit){
2110
2111
0
        targetUniChar =missingCharMarker;
2112
2113
0
        if(myTarget < args->targetLimit){
2114
2115
0
            mySourceChar= (unsigned char) *mySource++;
2116
2117
0
            switch(mySourceChar) {
2118
0
            case UCNV_SI:
2119
0
                if(myData->version==3) {
2120
0
                    pToU2022State->g=0;
2121
0
                    continue;
2122
0
                } else {
2123
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2124
0
                    myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2125
0
                    break;
2126
0
                }
2127
2128
0
            case UCNV_SO:
2129
0
                if(myData->version==3) {
2130
                    /* JIS7: switch to G1 half-width Katakana */
2131
0
                    pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
2132
0
                    pToU2022State->g=1;
2133
0
                    continue;
2134
0
                } else {
2135
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
2136
0
                    myData->isEmptySegment = FALSE; /* reset this, we have a different error */
2137
0
                    break;
2138
0
                }
2139
2140
0
            case ESC_2022:
2141
0
                mySource--;
2142
0
escape:
2143
0
                {
2144
0
                    const char * mySourceBefore = mySource;
2145
0
                    int8_t toULengthBefore = args->converter->toULength;
2146
2147
0
                    changeState_2022(args->converter,&(mySource),
2148
0
                        mySourceLimit, ISO_2022_JP,err);
2149
2150
                    /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
2151
0
                    if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
2152
0
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2153
0
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
2154
0
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
2155
0
                    }
2156
0
                }
2157
2158
                /* invalid or illegal escape sequence */
2159
0
                if(U_FAILURE(*err)){
2160
0
                    args->target = myTarget;
2161
0
                    args->source = mySource;
2162
0
                    myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
2163
0
                    return;
2164
0
                }
2165
                /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
2166
0
                if(myData->key==0) {
2167
0
                    myData->isEmptySegment = TRUE;
2168
0
                }
2169
0
                continue;
2170
2171
            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
2172
2173
0
            case CR:
2174
0
            case LF:
2175
                /* automatically reset to single-byte mode */
2176
0
                if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
2177
0
                    pToU2022State->cs[0] = (int8_t)ASCII;
2178
0
                }
2179
0
                pToU2022State->cs[2] = 0;
2180
0
                pToU2022State->g = 0;
2181
0
                U_FALLTHROUGH;
2182
0
            default:
2183
                /* convert one or two bytes */
2184
0
                myData->isEmptySegment = FALSE;
2185
0
                cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
2186
0
                if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
2187
0
                    !IS_JP_DBCS(cs)
2188
0
                ) {
2189
                    /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
2190
0
                    targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
2191
2192
                    /* return from a single-shift state to the previous one */
2193
0
                    if(pToU2022State->g >= 2) {
2194
0
                        pToU2022State->g=pToU2022State->prevG;
2195
0
                    }
2196
0
                } else switch(cs) {
2197
0
                case ASCII:
2198
0
                    if(mySourceChar <= 0x7f) {
2199
0
                        targetUniChar = mySourceChar;
2200
0
                    }
2201
0
                    break;
2202
0
                case ISO8859_1:
2203
0
                    if(mySourceChar <= 0x7f) {
2204
0
                        targetUniChar = mySourceChar + 0x80;
2205
0
                    }
2206
                    /* return from a single-shift state to the previous one */
2207
0
                    pToU2022State->g=pToU2022State->prevG;
2208
0
                    break;
2209
0
                case ISO8859_7:
2210
0
                    if(mySourceChar <= 0x7f) {
2211
                        /* convert mySourceChar+0x80 to use a normal 8-bit table */
2212
0
                        targetUniChar =
2213
0
                            _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
2214
0
                                myData->myConverterArray[cs],
2215
0
                                mySourceChar + 0x80);
2216
0
                    }
2217
                    /* return from a single-shift state to the previous one */
2218
0
                    pToU2022State->g=pToU2022State->prevG;
2219
0
                    break;
2220
0
                case JISX201:
2221
0
                    if(mySourceChar <= 0x7f) {
2222
0
                        targetUniChar = jisx201ToU(mySourceChar);
2223
0
                    }
2224
0
                    break;
2225
0
                case HWKANA_7BIT:
2226
0
                    if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
2227
                        /* 7-bit halfwidth Katakana */
2228
0
                        targetUniChar = mySourceChar + (HWKANA_START - 0x21);
2229
0
                    }
2230
0
                    break;
2231
0
                default:
2232
                    /* G0 DBCS */
2233
0
                    if(mySource < mySourceLimit) {
2234
0
                        int leadIsOk, trailIsOk;
2235
0
                        uint8_t trailByte;
2236
0
getTrailByte:
2237
0
                        trailByte = (uint8_t)*mySource;
2238
                        /*
2239
                         * Ticket 5691: consistent illegal sequences:
2240
                         * - We include at least the first byte in the illegal sequence.
2241
                         * - If any of the non-initial bytes could be the start of a character,
2242
                         *   we stop the illegal sequence before the first one of those.
2243
                         *
2244
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2245
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2246
                         * Otherwise we convert or report the pair of bytes.
2247
                         */
2248
0
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2249
0
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2250
0
                        if (leadIsOk && trailIsOk) {
2251
0
                            ++mySource;
2252
0
                            tmpSourceChar = (mySourceChar << 8) | trailByte;
2253
0
                            if(cs == JISX208) {
2254
0
                                _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
2255
0
                                mySourceChar = tmpSourceChar;
2256
0
                            } else {
2257
                                /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
2258
0
                                mySourceChar = tmpSourceChar;
2259
0
                                if (cs == KSC5601) {
2260
0
                                    tmpSourceChar += 0x8080;  /* = _2022ToGR94DBCS(tmpSourceChar) */
2261
0
                                }
2262
0
                                tempBuf[0] = (char)(tmpSourceChar >> 8);
2263
0
                                tempBuf[1] = (char)(tmpSourceChar);
2264
0
                            }
2265
0
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
2266
0
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2267
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2268
0
                            ++mySource;
2269
                            /* add another bit so that the code below writes 2 bytes in case of error */
2270
0
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2271
0
                        }
2272
0
                    } else {
2273
0
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2274
0
                        args->converter->toULength = 1;
2275
0
                        goto endloop;
2276
0
                    }
2277
0
                }  /* End of inner switch */
2278
0
                break;
2279
0
            }  /* End of outer switch */
2280
0
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2281
0
                if(args->offsets){
2282
0
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2283
0
                }
2284
0
                *(myTarget++)=(UChar)targetUniChar;
2285
0
            }
2286
0
            else if(targetUniChar > missingCharMarker){
2287
                /* disassemble the surrogate pair and write to output*/
2288
0
                targetUniChar-=0x0010000;
2289
0
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2290
0
                if(args->offsets){
2291
0
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2292
0
                }
2293
0
                ++myTarget;
2294
0
                if(myTarget< args->targetLimit){
2295
0
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2296
0
                    if(args->offsets){
2297
0
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2298
0
                    }
2299
0
                    ++myTarget;
2300
0
                }else{
2301
0
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2302
0
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2303
0
                }
2304
2305
0
            }
2306
0
            else{
2307
                /* Call the callback function*/
2308
0
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2309
0
                break;
2310
0
            }
2311
0
        }
2312
0
        else{    /* goes with "if(myTarget < args->targetLimit)"  way up near top of function */
2313
0
            *err =U_BUFFER_OVERFLOW_ERROR;
2314
0
            break;
2315
0
        }
2316
0
    }
2317
0
endloop:
2318
0
    args->target = myTarget;
2319
0
    args->source = mySource;
2320
0
}
2321
2322
2323
#if !UCONFIG_ONLY_HTML_CONVERSION
2324
/***************************************************************
2325
*   Rules for ISO-2022-KR encoding
2326
*   i) The KSC5601 designator sequence should appear only once in a file,
2327
*      at the beginning of a line before any KSC5601 characters. This usually
2328
*      means that it appears by itself on the first line of the file
2329
*  ii) There are only 2 shifting sequences SO to shift into double byte mode
2330
*      and SI to shift into single byte mode
2331
*/
2332
static void U_CALLCONV
2333
0
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
2334
2335
0
    UConverter* saveConv = args->converter;
2336
0
    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
2337
0
    args->converter=myConverterData->currentConverter;
2338
2339
0
    myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
2340
0
    ucnv_MBCSFromUnicodeWithOffsets(args,err);
2341
0
    saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
2342
2343
0
    if(*err == U_BUFFER_OVERFLOW_ERROR) {
2344
0
        if(myConverterData->currentConverter->charErrorBufferLength > 0) {
2345
0
            uprv_memcpy(
2346
0
                saveConv->charErrorBuffer,
2347
0
                myConverterData->currentConverter->charErrorBuffer,
2348
0
                myConverterData->currentConverter->charErrorBufferLength);
2349
0
        }
2350
0
        saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
2351
0
        myConverterData->currentConverter->charErrorBufferLength = 0;
2352
0
    }
2353
0
    args->converter=saveConv;
2354
0
}
2355
2356
static void U_CALLCONV
2357
0
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2358
2359
0
    const UChar *source = args->source;
2360
0
    const UChar *sourceLimit = args->sourceLimit;
2361
0
    unsigned char *target = (unsigned char *) args->target;
2362
0
    unsigned char *targetLimit = (unsigned char *) args->targetLimit;
2363
0
    int32_t* offsets = args->offsets;
2364
0
    uint32_t targetByteUnit = 0x0000;
2365
0
    UChar32 sourceChar = 0x0000;
2366
0
    UBool isTargetByteDBCS;
2367
0
    UBool oldIsTargetByteDBCS;
2368
0
    UConverterDataISO2022 *converterData;
2369
0
    UConverterSharedData* sharedData;
2370
0
    UBool useFallback;
2371
0
    int32_t length =0;
2372
2373
0
    converterData=(UConverterDataISO2022*)args->converter->extraInfo;
2374
    /* if the version is 1 then the user is requesting
2375
     * conversion with ibm-25546 pass the arguments to
2376
     * MBCS converter and return
2377
     */
2378
0
    if(converterData->version==1){
2379
0
        UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2380
0
        return;
2381
0
    }
2382
2383
    /* initialize data */
2384
0
    sharedData = converterData->currentConverter->sharedData;
2385
0
    useFallback = args->converter->useFallback;
2386
0
    isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2387
0
    oldIsTargetByteDBCS = isTargetByteDBCS;
2388
2389
0
    isTargetByteDBCS   = (UBool) args->converter->fromUnicodeStatus;
2390
0
    if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2391
0
        goto getTrail;
2392
0
    }
2393
0
    while(source < sourceLimit){
2394
2395
0
        targetByteUnit = missingCharMarker;
2396
2397
0
        if(target < (unsigned char*) args->targetLimit){
2398
0
            sourceChar = *source++;
2399
2400
            /* do not convert SO/SI/ESC */
2401
0
            if(IS_2022_CONTROL(sourceChar)) {
2402
                /* callback(illegal) */
2403
0
                *err=U_ILLEGAL_CHAR_FOUND;
2404
0
                args->converter->fromUChar32=sourceChar;
2405
0
                break;
2406
0
            }
2407
2408
0
            length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2409
0
            if(length < 0) {
2410
0
                length = -length;  /* fallback */
2411
0
            }
2412
            /* only DBCS or SBCS characters are expected*/
2413
            /* DB characters with high bit set to 1 are expected */
2414
0
            if( length > 2 || length==0 ||
2415
0
                (length == 1 && targetByteUnit > 0x7f) ||
2416
0
                (length == 2 &&
2417
0
                    ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
2418
0
                    (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
2419
0
            ) {
2420
0
                targetByteUnit=missingCharMarker;
2421
0
            }
2422
0
            if (targetByteUnit != missingCharMarker){
2423
2424
0
                oldIsTargetByteDBCS = isTargetByteDBCS;
2425
0
                isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2426
                  /* append the shift sequence */
2427
0
                if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2428
2429
0
                    if (isTargetByteDBCS)
2430
0
                        *target++ = UCNV_SO;
2431
0
                    else
2432
0
                        *target++ = UCNV_SI;
2433
0
                    if(offsets)
2434
0
                        *(offsets++) = (int32_t)(source - args->source-1);
2435
0
                }
2436
                /* write the targetUniChar  to target */
2437
0
                if(targetByteUnit <= 0x00FF){
2438
0
                    if( target < targetLimit){
2439
0
                        *(target++) = (unsigned char) targetByteUnit;
2440
0
                        if(offsets){
2441
0
                            *(offsets++) = (int32_t)(source - args->source-1);
2442
0
                        }
2443
2444
0
                    }else{
2445
0
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2446
0
                        *err = U_BUFFER_OVERFLOW_ERROR;
2447
0
                    }
2448
0
                }else{
2449
0
                    if(target < targetLimit){
2450
0
                        *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2451
0
                        if(offsets){
2452
0
                            *(offsets++) = (int32_t)(source - args->source-1);
2453
0
                        }
2454
0
                        if(target < targetLimit){
2455
0
                            *(target++) =(unsigned char) (targetByteUnit -0x80);
2456
0
                            if(offsets){
2457
0
                                *(offsets++) = (int32_t)(source - args->source-1);
2458
0
                            }
2459
0
                        }else{
2460
0
                            args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2461
0
                            *err = U_BUFFER_OVERFLOW_ERROR;
2462
0
                        }
2463
0
                    }else{
2464
0
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2465
0
                        args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2466
0
                        *err = U_BUFFER_OVERFLOW_ERROR;
2467
0
                    }
2468
0
                }
2469
2470
0
            }
2471
0
            else{
2472
                /* oops.. the code point is unassingned
2473
                 * set the error and reason
2474
                 */
2475
2476
                /*check if the char is a First surrogate*/
2477
0
                if(U16_IS_SURROGATE(sourceChar)) {
2478
0
                    if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2479
0
getTrail:
2480
                        /*look ahead to find the trail surrogate*/
2481
0
                        if(source <  sourceLimit) {
2482
                            /* test the following code unit */
2483
0
                            UChar trail=(UChar) *source;
2484
0
                            if(U16_IS_TRAIL(trail)) {
2485
0
                                source++;
2486
0
                                sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2487
0
                                *err = U_INVALID_CHAR_FOUND;
2488
                                /* convert this surrogate code point */
2489
                                /* exit this condition tree */
2490
0
                            } else {
2491
                                /* this is an unmatched lead code unit (1st surrogate) */
2492
                                /* callback(illegal) */
2493
0
                                *err=U_ILLEGAL_CHAR_FOUND;
2494
0
                            }
2495
0
                        } else {
2496
                            /* no more input */
2497
0
                            *err = U_ZERO_ERROR;
2498
0
                        }
2499
0
                    } else {
2500
                        /* this is an unmatched trail code unit (2nd surrogate) */
2501
                        /* callback(illegal) */
2502
0
                        *err=U_ILLEGAL_CHAR_FOUND;
2503
0
                    }
2504
0
                } else {
2505
                    /* callback(unassigned) for a BMP code point */
2506
0
                    *err = U_INVALID_CHAR_FOUND;
2507
0
                }
2508
2509
0
                args->converter->fromUChar32=sourceChar;
2510
0
                break;
2511
0
            }
2512
0
        } /* end if(myTargetIndex<myTargetLength) */
2513
0
        else{
2514
0
            *err =U_BUFFER_OVERFLOW_ERROR;
2515
0
            break;
2516
0
        }
2517
2518
0
    }/* end while(mySourceIndex<mySourceLength) */
2519
2520
    /*
2521
     * the end of the input stream and detection of truncated input
2522
     * are handled by the framework, but for ISO-2022-KR conversion
2523
     * we need to be in ASCII mode at the very end
2524
     *
2525
     * conditions:
2526
     *   successful
2527
     *   not in ASCII mode
2528
     *   end of input and no truncated input
2529
     */
2530
0
    if( U_SUCCESS(*err) &&
2531
0
        isTargetByteDBCS &&
2532
0
        args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2533
0
    ) {
2534
0
        int32_t sourceIndex;
2535
2536
        /* we are switching to ASCII */
2537
0
        isTargetByteDBCS=FALSE;
2538
2539
        /* get the source index of the last input character */
2540
        /*
2541
         * TODO this would be simpler and more reliable if we used a pair
2542
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2543
         * so that we could simply use the prevSourceIndex here;
2544
         * this code gives an incorrect result for the rare case of an unmatched
2545
         * trail surrogate that is alone in the last buffer of the text stream
2546
         */
2547
0
        sourceIndex=(int32_t)(source-args->source);
2548
0
        if(sourceIndex>0) {
2549
0
            --sourceIndex;
2550
0
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2551
0
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2552
0
            ) {
2553
0
                --sourceIndex;
2554
0
            }
2555
0
        } else {
2556
0
            sourceIndex=-1;
2557
0
        }
2558
2559
0
        fromUWriteUInt8(
2560
0
            args->converter,
2561
0
            SHIFT_IN_STR, 1,
2562
0
            &target, (const char *)targetLimit,
2563
0
            &offsets, sourceIndex,
2564
0
            err);
2565
0
    }
2566
2567
    /*save the state and return */
2568
0
    args->source = source;
2569
0
    args->target = (char*)target;
2570
0
    args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2571
0
}
2572
2573
/************************ To Unicode ***************************************/
2574
2575
static void U_CALLCONV
2576
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2577
0
                                                            UErrorCode* err){
2578
0
    char const* sourceStart;
2579
0
    UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2580
2581
0
    UConverterToUnicodeArgs subArgs;
2582
0
    int32_t minArgsSize;
2583
2584
    /* set up the subconverter arguments */
2585
0
    if(args->size<sizeof(UConverterToUnicodeArgs)) {
2586
0
        minArgsSize = args->size;
2587
0
    } else {
2588
0
        minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2589
0
    }
2590
2591
0
    uprv_memcpy(&subArgs, args, minArgsSize);
2592
0
    subArgs.size = (uint16_t)minArgsSize;
2593
0
    subArgs.converter = myData->currentConverter;
2594
2595
    /* remember the original start of the input for offsets */
2596
0
    sourceStart = args->source;
2597
2598
0
    if(myData->key != 0) {
2599
        /* continue with a partial escape sequence */
2600
0
        goto escape;
2601
0
    }
2602
2603
0
    while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2604
        /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2605
0
        subArgs.source = args->source;
2606
0
        subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2607
0
        if(subArgs.source != subArgs.sourceLimit) {
2608
            /*
2609
             * get the current partial byte sequence
2610
             *
2611
             * it needs to be moved between the public and the subconverter
2612
             * so that the conversion framework, which only sees the public
2613
             * converter, can handle truncated and illegal input etc.
2614
             */
2615
0
            if(args->converter->toULength > 0) {
2616
0
                uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2617
0
            }
2618
0
            subArgs.converter->toULength = args->converter->toULength;
2619
2620
            /*
2621
             * Convert up to the end of the input, or to before the next escape character.
2622
             * Does not handle conversion extensions because the preToU[] state etc.
2623
             * is not copied.
2624
             */
2625
0
            ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2626
2627
0
            if(args->offsets != NULL && sourceStart != args->source) {
2628
                /* update offsets to base them on the actual start of the input */
2629
0
                int32_t *offsets = args->offsets;
2630
0
                UChar *target = args->target;
2631
0
                int32_t delta = (int32_t)(args->source - sourceStart);
2632
0
                while(target < subArgs.target) {
2633
0
                    if(*offsets >= 0) {
2634
0
                        *offsets += delta;
2635
0
                    }
2636
0
                    ++offsets;
2637
0
                    ++target;
2638
0
                }
2639
0
            }
2640
0
            args->source = subArgs.source;
2641
0
            args->target = subArgs.target;
2642
0
            args->offsets = subArgs.offsets;
2643
2644
            /* copy input/error/overflow buffers */
2645
0
            if(subArgs.converter->toULength > 0) {
2646
0
                uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2647
0
            }
2648
0
            args->converter->toULength = subArgs.converter->toULength;
2649
2650
0
            if(*err == U_BUFFER_OVERFLOW_ERROR) {
2651
0
                if(subArgs.converter->UCharErrorBufferLength > 0) {
2652
0
                    uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2653
0
                                subArgs.converter->UCharErrorBufferLength);
2654
0
                }
2655
0
                args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2656
0
                subArgs.converter->UCharErrorBufferLength = 0;
2657
0
            }
2658
0
        }
2659
2660
0
        if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2661
0
            return;
2662
0
        }
2663
2664
0
escape:
2665
0
        changeState_2022(args->converter,
2666
0
               &(args->source),
2667
0
               args->sourceLimit,
2668
0
               ISO_2022_KR,
2669
0
               err);
2670
0
    }
2671
0
}
2672
2673
/*
 * Convert ISO-2022-KR bytes in [args->source, args->sourceLimit) to UChars
 * written at args->target (bounded by args->targetLimit).
 *
 * - myData->version==1 is delegated to the _IBM variant.
 * - SI selects G0 (single bytes), SO selects G1 (byte pairs), ESC is handed
 *   to changeState_2022().
 * - An SI that directly follows an SO (empty segment) is reported as
 *   U_ILLEGAL_ESCAPE_SEQUENCE with reason UCNV_IRREGULAR.
 * - When args->offsets is non-NULL, each written UChar gets the source index
 *   of the first byte it was converted from.
 * - A lead byte at the very end of the input is parked in toUBytes[0] and
 *   picked up again at getTrailByte on the next call.
 * - *err is set to U_BUFFER_OVERFLOW_ERROR when input remains but the target is full.
 *
 * On return args->source and args->target are advanced past what was consumed/produced.
 */
static void U_CALLCONV
2674
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2675
0
                                                            UErrorCode* err){
2676
0
    char tempBuf[2]; /* the two G1 bytes with 0x80 added, as passed to the table lookup */
2677
0
    const char *mySource = ( char *) args->source;
2678
0
    UChar *myTarget = args->target;
2679
0
    const char *mySourceLimit = args->sourceLimit;
2680
0
    UChar32 targetUniChar = 0x0000;
2681
0
    /* NOTE(review): declared UChar here, but uint32_t in the ISO-2022-CN toUnicode function; see the 0x10000 marker below */
    UChar mySourceChar = 0x0000;
2682
0
    UConverterDataISO2022* myData;
2683
0
    UConverterSharedData* sharedData ;
2684
0
    UBool useFallback;
2685
2686
0
    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2687
0
    if(myData->version==1){
2688
0
        UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2689
0
        return;
2690
0
    }
2691
2692
    /* initialize state */
2693
0
    sharedData = myData->currentConverter->sharedData;
2694
0
    useFallback = args->converter->useFallback;
2695
2696
0
    if(myData->key != 0) {
2697
        /* continue with a partial escape sequence */
2698
0
        goto escape;
2699
0
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2700
        /* continue with a partial double-byte character */
2701
0
        mySourceChar = args->converter->toUBytes[0];
2702
0
        args->converter->toULength = 0;
2703
0
        goto getTrailByte;
2704
0
    }
2705
2706
0
    while(mySource< mySourceLimit){
2707
2708
0
        if(myTarget < args->targetLimit){
2709
2710
0
            mySourceChar= (unsigned char) *mySource++;
2711
2712
0
            if(mySourceChar==UCNV_SI){
2713
0
                myData->toU2022State.g = 0;
2714
0
                if (myData->isEmptySegment) {
2715
0
                    myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
2716
0
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
2717
0
                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
2718
0
                    args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2719
0
                    args->converter->toULength = 1;
2720
0
                    args->target = myTarget;
2721
0
                    args->source = mySource;
2722
0
                    return;
2723
0
                }
2724
                /*consume the source */
2725
0
                continue;
2726
0
            }else if(mySourceChar==UCNV_SO){
2727
0
                myData->toU2022State.g = 1;
2728
0
                myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
2729
                /*consume the source */
2730
0
                continue;
2731
0
            }else if(mySourceChar==ESC_2022){
2732
0
                mySource--; /* step back so that changeState_2022() sees the ESC byte itself */
2733
0
escape:
2734
0
                myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
2735
0
                changeState_2022(args->converter,&(mySource),
2736
0
                                mySourceLimit, ISO_2022_KR, err);
2737
0
                if(U_FAILURE(*err)){
2738
0
                    args->target = myTarget;
2739
0
                    args->source = mySource;
2740
0
                    return;
2741
0
                }
2742
0
                continue;
2743
0
            }
2744
2745
0
            myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
2746
0
            if(myData->toU2022State.g == 1) {
2747
0
                if(mySource < mySourceLimit) {
2748
0
                    int leadIsOk, trailIsOk;
2749
0
                    uint8_t trailByte;
2750
0
getTrailByte:
2751
0
                    targetUniChar = missingCharMarker;
2752
0
                    trailByte = (uint8_t)*mySource; /* peek only; consumed below once we know how to treat it */
2753
                    /*
2754
                     * Ticket 5691: consistent illegal sequences:
2755
                     * - We include at least the first byte in the illegal sequence.
2756
                     * - If any of the non-initial bytes could be the start of a character,
2757
                     *   we stop the illegal sequence before the first one of those.
2758
                     *
2759
                     * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
2760
                     * an ESC/SO/SI, we report only the first byte as the illegal sequence.
2761
                     * Otherwise we convert or report the pair of bytes.
2762
                     */
2763
0
                    /* the uint8_t cast makes values below 0x21 wrap around, so one compare checks 0x21..0x7e */
                    leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
2764
0
                    trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
2765
0
                    if (leadIsOk && trailIsOk) {
2766
0
                        ++mySource;
2767
0
                        tempBuf[0] = (char)(mySourceChar + 0x80);
2768
0
                        tempBuf[1] = (char)(trailByte + 0x80);
2769
0
                        targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2770
0
                        mySourceChar = (mySourceChar << 8) | trailByte;
2771
0
                    } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
2772
                        /* report a pair of illegal bytes if the second byte is not a DBCS starter */
2773
0
                        ++mySource;
2774
                        /* add another bit so that the code below writes 2 bytes in case of error */
                        /*
                         * NOTE(review): mySourceChar is a UChar; if UChar is a 16-bit type the
                         * 0x10000 bit is dropped by this assignment, so a pair whose lead byte
                         * is 0x00 would look like a single byte to the callback -- confirm.
                         */
2775
0
                        mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
2776
0
                    }
2777
0
                } else {
                    /* lead byte is the last input byte: park it and resume at getTrailByte next time */
2778
0
                    args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2779
0
                    args->converter->toULength = 1;
2780
0
                    break;
2781
0
                }
2782
0
            }
2783
0
            else if(mySourceChar <= 0x7f) {
2784
0
                targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2785
0
            } else {
2786
0
                targetUniChar = 0xffff;
2787
0
            }
2788
0
            /* results of 0xfffe and above are treated as "no mapping" and go to the callback path */
            if(targetUniChar < 0xfffe){
2789
0
                if(args->offsets) {
                    /* offset of the first source byte: step back 1 byte for a single byte, 2 for a pair */
2790
0
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2791
0
                }
2792
0
                *(myTarget++)=(UChar)targetUniChar;
2793
0
            }
2794
0
            else {
2795
                /* Call the callback function*/
2796
0
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2797
0
                break;
2798
0
            }
2799
0
        }
2800
0
        else{
2801
0
            *err =U_BUFFER_OVERFLOW_ERROR;
2802
0
            break;
2803
0
        }
2804
0
    }
    /* hand the advanced positions back to the caller */
2805
0
    args->target = myTarget;
2806
0
    args->source = mySource;
2807
0
}
2808
2809
/*************************** END ISO2022-KR *********************************/
2810
2811
/*************************** ISO-2022-CN *********************************
2812
*
2813
* Rules for ISO-2022-CN Encoding:
2814
* i)   The designator sequence must appear once on a line before any instance
2815
*      of character set it designates.
2816
* ii)  If two lines contain characters from the same character set, both lines
2817
*      must include the designator sequence.
2818
* iii) Once the designator sequence is known, a shifting sequence has to be found
2819
*      to invoke the shift.
2820
* iv)  All lines start in ASCII and end in ASCII.
2821
* v)   Four shifting sequences are employed for this purpose:
2822
*
2823
*      Sequence    ASCII Eq    Charsets
2824
*      ----------  -------    ---------
2825
*      SI           <SI>        US-ASCII
2826
*      SO           <SO>        CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2827
*      SS2          <ESC>N      CNS-11643-1992 Plane 2
2828
*      SS3          <ESC>O      CNS-11643-1992 Planes 3-7
2829
*
2830
* vi)
2831
*      SOdesignator  : ESC "$" ")" finalchar_for_SO
2832
*      SS2designator : ESC "$" "*" finalchar_for_SS2
2833
*      SS3designator : ESC "$" "+" finalchar_for_SS3
2834
*
2835
*      ESC $ ) A       Indicates the bytes following SO are Chinese
2836
*       characters as defined in GB 2312-80, until
2837
*       another SOdesignation appears
2838
*
2839
*
2840
*      ESC $ ) E       Indicates the bytes following SO are as defined
2841
*       in ISO-IR-165 (for details, see section 2.1),
2842
*       until another SOdesignation appears
2843
*
2844
*      ESC $ ) G       Indicates the bytes following SO are as defined
2845
*       in CNS 11643-plane-1, until another
2846
*       SOdesignation appears
2847
*
2848
*      ESC $ * H       Indicates the two bytes immediately following
2849
*       SS2 is a Chinese character as defined in CNS
2850
*       11643-plane-2, until another SS2designation
2851
*       appears
2852
*       (Meaning <ESC>N must precede every 2 byte
2853
*        sequence.)
2854
*
2855
*      ESC $ + I       Indicates the immediate two bytes following SS3
2856
*       is a Chinese character as defined in CNS
2857
*       11643-plane-3, until another SS3designation
2858
*       appears
2859
*       (Meaning <ESC>O must precede every 2 byte
2860
*        sequence.)
2861
*
2862
*      ESC $ + J       Indicates the immediate two bytes following SS3
2863
*       is a Chinese character as defined in CNS
2864
*       11643-plane-4, until another SS3designation
2865
*       appears
2866
*       (In English: <ESC>O must precede every 2 byte
2867
*        sequence.)
2868
*
2869
*      ESC $ + K       Indicates the immediate two bytes following SS3
2870
*       is a Chinese character as defined in CNS
2871
*       11643-plane-5, until another SS3designation
2872
*       appears
2873
*
2874
*      ESC $ + L       Indicates the immediate two bytes following SS3
2875
*       is a Chinese character as defined in CNS
2876
*       11643-plane-6, until another SS3designation
2877
*       appears
2878
*
2879
*      ESC $ + M       Indicates the immediate two bytes following SS3
2880
*       is a Chinese character as defined in CNS
2881
*       11643-plane-7, until another SS3designation
2882
*       appears
2883
*
2884
*       As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2885
*       has its own designation information before any Chinese characters
2886
*       appear
2887
*
2888
*/
2889
2890
/* The following are defined this way to make the strings truly readonly */
2891
/* 4-byte designator sequences; the readable form of each byte string is given on the right */
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";                /* ESC $ ) A */
2892
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";                /* ESC $ ) E */
2893
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47";    /* ESC $ ) G */
2894
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48";    /* ESC $ * H */
2895
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49";    /* ESC $ + I */
2896
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A";    /* ESC $ + J */
2897
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B";    /* ESC $ + K */
2898
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C";    /* ESC $ + L */
2899
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D";    /* ESC $ + M */
2900
2901
/********************** ISO2022-CN Data **************************/
2902
/*
 * Byte strings written by the fromUnicode code to designate a charset.
 * The fromUnicode function indexes this table directly with the charset id
 * for ids below CNS_11643, and with CNS_11643 + (cs - CNS_11643_1) for the
 * CNS planes, so entries 3..9 must stay in plane order 1..7.
 * (presumably CNS_11643 == 3 -- confirm against the enum definition)
 */
static const char* const escSeqCharsCN[10] ={
2903
        SHIFT_IN_STR,                   /* 0 ASCII */
2904
        GB_2312_80_STR,                 /* 1 GB2312_1 */
2905
        ISO_IR_165_STR,                 /* 2 ISO_IR_165 */
2906
        CNS_11643_1992_Plane_1_STR,     /* 3 CNS 11643 plane 1 */
2907
        CNS_11643_1992_Plane_2_STR,     /* 4 CNS 11643 plane 2 */
2908
        CNS_11643_1992_Plane_3_STR,     /* 5 CNS 11643 plane 3 */
2909
        CNS_11643_1992_Plane_4_STR,     /* 6 CNS 11643 plane 4 */
2910
        CNS_11643_1992_Plane_5_STR,     /* 7 CNS 11643 plane 5 */
2911
        CNS_11643_1992_Plane_6_STR,     /* 8 CNS 11643 plane 6 */
2912
        CNS_11643_1992_Plane_7_STR      /* 9 CNS 11643 plane 7 */
2913
};
2914
2915
/*
 * Convert UTF-16 in [args->source, args->sourceLimit) to ISO-2022-CN bytes
 * written at args->target (bounded by args->targetLimit).
 *
 * - A lead surrogate left over from the previous call (cnv->fromUChar32) is
 *   resumed at getTrail; unpaired surrogates yield U_ILLEGAL_CHAR_FOUND.
 * - U+0000..U+007F: SO/SI/ESC are rejected with U_ILLEGAL_CHAR_FOUND; other
 *   values are written as-is, preceded by SI if the G1 set is currently active.
 *   CR and LF zero the whole from-Unicode state.
 * - Above U+007F: up to three candidate charsets (choices[]) are tried; the
 *   output is an optional 4-byte designator, an optional shift (SO, ESC N or
 *   ESC O) and the two code bytes. No mapping yields U_INVALID_CHAR_FOUND.
 * - When args->offsets is non-NULL, every output byte gets the source index
 *   of the code point it came from.
 * - At the end of a flushed input, SI is appended if the G1 set is still active.
 *
 * On return args->source and args->target are advanced; the offsets pointer
 * is only advanced locally.
 */
static void U_CALLCONV
2916
0
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2917
0
    UConverter *cnv = args->converter;
2918
0
    UConverterDataISO2022 *converterData;
2919
0
    ISO2022State *pFromU2022State;
2920
0
    uint8_t *target = (uint8_t *) args->target;
2921
0
    const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2922
0
    const UChar* source = args->source;
2923
0
    const UChar* sourceLimit = args->sourceLimit;
2924
0
    int32_t* offsets = args->offsets;
2925
0
    UChar32 sourceChar;
2926
0
    /* worst case below: 4 designator bytes + 2 shift bytes + 2 code bytes = 8 */
    char buffer[8];
2927
0
    int32_t len;
2928
0
    int8_t choices[3];
2929
0
    int32_t choiceCount;
2930
0
    uint32_t targetValue = 0;
2931
0
    UBool useFallback;
2932
2933
    /* set up the state */
2934
0
    converterData     = (UConverterDataISO2022*)cnv->extraInfo;
2935
0
    pFromU2022State   = &converterData->fromU2022State;
2936
2937
0
    choiceCount = 0; /* 0 means choices[] must be (re)built before the next lookup */
2938
2939
    /* check if the last codepoint of previous buffer was a lead surrogate*/
2940
0
    if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2941
0
        goto getTrail;
2942
0
    }
2943
2944
0
    while( source < sourceLimit){
2945
0
        if(target < targetLimit){
2946
2947
0
            sourceChar  = *(source++);
2948
            /*check if the char is a First surrogate*/
2949
0
             if(U16_IS_SURROGATE(sourceChar)) {
2950
0
                if(U16_IS_SURROGATE_LEAD(sourceChar)) {
2951
0
getTrail:
2952
                    /*look ahead to find the trail surrogate*/
2953
0
                    if(source < sourceLimit) {
2954
                        /* test the following code unit */
2955
0
                        UChar trail=(UChar) *source;
2956
0
                        if(U16_IS_TRAIL(trail)) {
2957
0
                            source++;
2958
0
                            sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
2959
0
                            cnv->fromUChar32=0x00;
2960
                            /* convert this supplementary code point */
2961
                            /* exit this condition tree */
2962
0
                        } else {
2963
                            /* this is an unmatched lead code unit (1st surrogate) */
2964
                            /* callback(illegal) */
2965
0
                            *err=U_ILLEGAL_CHAR_FOUND;
2966
0
                            cnv->fromUChar32=sourceChar;
2967
0
                            break;
2968
0
                        }
2969
0
                    } else {
2970
                        /* no more input */
2971
0
                        cnv->fromUChar32=sourceChar;
2972
0
                        break;
2973
0
                    }
2974
0
                } else {
2975
                    /* this is an unmatched trail code unit (2nd surrogate) */
2976
                    /* callback(illegal) */
2977
0
                    *err=U_ILLEGAL_CHAR_FOUND;
2978
0
                    cnv->fromUChar32=sourceChar;
2979
0
                    break;
2980
0
                }
2981
0
            }
2982
2983
            /* do the conversion */
2984
0
            if(sourceChar <= 0x007f ){
2985
                /* do not convert SO/SI/ESC */
2986
0
                if(IS_2022_CONTROL(sourceChar)) {
2987
                    /* callback(illegal) */
2988
0
                    *err=U_ILLEGAL_CHAR_FOUND;
2989
0
                    cnv->fromUChar32=sourceChar;
2990
0
                    break;
2991
0
                }
2992
2993
                /* US-ASCII */
2994
0
                if(pFromU2022State->g == 0) {
2995
0
                    buffer[0] = (char)sourceChar;
2996
0
                    len = 1;
2997
0
                } else {
                    /* G1 is active: emit SI first to get back to single-byte mode */
2998
0
                    buffer[0] = UCNV_SI;
2999
0
                    buffer[1] = (char)sourceChar;
3000
0
                    len = 2;
3001
0
                    pFromU2022State->g = 0;
3002
0
                    choiceCount = 0;
3003
0
                }
3004
0
                if(sourceChar == CR || sourceChar == LF) {
3005
                    /* reset the state at the end of a line */
3006
0
                    uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
3007
0
                    choiceCount = 0;
3008
0
                }
3009
0
            }
3010
0
            else{
3011
                /* convert U+0080..U+10ffff */
3012
0
                int32_t i;
3013
0
                int8_t cs, g;
3014
3015
0
                if(choiceCount == 0) {
3016
                    /* try the current SO/G1 converter first */
3017
0
                    choices[0] = pFromU2022State->cs[1];
3018
3019
                    /* default to GB2312_1 if none is designated yet */
3020
0
                    if(choices[0] == 0) {
3021
0
                        choices[0] = GB2312_1;
3022
0
                    }
3023
3024
0
                    if(converterData->version == 0) {
3025
                        /* ISO-2022-CN */
3026
3027
                        /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
3028
0
                        if(choices[0] == GB2312_1) {
3029
0
                            choices[1] = (int8_t)CNS_11643_1;
3030
0
                        } else {
3031
0
                            choices[1] = (int8_t)GB2312_1;
3032
0
                        }
3033
3034
0
                        choiceCount = 2;
3035
0
                    } else if (converterData->version == 1) {
3036
                        /* ISO-2022-CN-EXT */
3037
3038
                        /* try one of the other converters */
3039
0
                        switch(choices[0]) {
3040
0
                        case GB2312_1:
3041
0
                            choices[1] = (int8_t)CNS_11643_1;
3042
0
                            choices[2] = (int8_t)ISO_IR_165;
3043
0
                            break;
3044
0
                        case ISO_IR_165:
3045
0
                            choices[1] = (int8_t)GB2312_1;
3046
0
                            choices[2] = (int8_t)CNS_11643_1;
3047
0
                            break;
3048
0
                        default: /* CNS_11643_x */
3049
0
                            choices[1] = (int8_t)GB2312_1;
3050
0
                            choices[2] = (int8_t)ISO_IR_165;
3051
0
                            break;
3052
0
                        }
3053
3054
0
                        choiceCount = 3;
3055
0
                    } else {
                        /*
                         * NOTE(review): this branch fills choices[0..1] but leaves choiceCount
                         * at 0, so the lookup loop below runs zero times and every character
                         * above U+007F ends in U_INVALID_CHAR_FOUND for version values other
                         * than 0 and 1. Presumably "choiceCount = 2;" was intended -- confirm.
                         */
3056
0
                        choices[0] = (int8_t)CNS_11643_1;
3057
0
                        choices[1] = (int8_t)GB2312_1;
3058
0
                    }
3059
0
                }
3060
3061
0
                cs = g = 0;
3062
                /*
3063
                 * len==0: no mapping found yet
3064
                 * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
3065
                 * len>0: found a roundtrip result, done
3066
                 */
3067
0
                len = 0;
3068
                /*
3069
                 * We will turn off useFallback after finding a fallback,
3070
                 * but we still get fallbacks from PUA code points as usual.
3071
                 * Therefore, we will also need to check that we don't overwrite
3072
                 * an early fallback with a later one.
3073
                 */
3074
0
                useFallback = cnv->useFallback;
3075
3076
0
                for(i = 0; i < choiceCount && len <= 0; ++i) {
3077
0
                    int8_t cs0 = choices[i];
3078
0
                    if(cs0 > 0) {
3079
0
                        uint32_t value;
3080
0
                        int32_t len2;
3081
0
                        if(cs0 >= CNS_11643_0) {
                            /* all CNS planes share one table with 3-byte results */
3082
0
                            len2 = MBCS_FROM_UCHAR32_ISO2022(
3083
0
                                        converterData->myConverterArray[CNS_11643],
3084
0
                                        sourceChar,
3085
0
                                        &value,
3086
0
                                        useFallback,
3087
0
                                        MBCS_OUTPUT_3);
3088
0
                            if(len2 == 3 || (len2 == -3 && len == 0)) {
3089
0
                                targetValue = value;
                                /* the top byte of the 3-byte result selects the plane; only the low two bytes are output */
3090
0
                                cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
3091
0
                                if(len2 >= 0) {
3092
0
                                    len = 2;
3093
0
                                } else {
3094
0
                                    len = -2;
3095
0
                                    useFallback = FALSE;
3096
0
                                }
3097
0
                                if(cs == CNS_11643_1) {
3098
0
                                    g = 1;
3099
0
                                } else if(cs == CNS_11643_2) {
3100
0
                                    g = 2;
3101
0
                                } else /* plane 3..7 */ if(converterData->version == 1) {
3102
0
                                    g = 3;
3103
0
                                } else {
3104
                                    /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
3105
0
                                    len = 0;
3106
0
                                }
3107
0
                            }
3108
0
                        } else {
3109
                            /* GB2312_1 or ISO-IR-165 */
3110
0
                            U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
3111
0
                            len2 = MBCS_FROM_UCHAR32_ISO2022(
3112
0
                                        converterData->myConverterArray[cs0],
3113
0
                                        sourceChar,
3114
0
                                        &value,
3115
0
                                        useFallback,
3116
0
                                        MBCS_OUTPUT_2);
3117
0
                            if(len2 == 2 || (len2 == -2 && len == 0)) {
3118
0
                                targetValue = value;
3119
0
                                len = len2;
3120
0
                                cs = cs0;
3121
0
                                g = 1;
3122
0
                                useFallback = FALSE;
3123
0
                            }
3124
0
                        }
3125
0
                    }
3126
0
                }
3127
3128
0
                if(len != 0) {
3129
0
                    len = 0; /* count output bytes; it must have been abs(len) == 2 */
3130
3131
                    /* write the designation sequence if necessary */
3132
0
                    if(cs != pFromU2022State->cs[g]) {
3133
0
                        if(cs < CNS_11643) {
3134
0
                            uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
3135
0
                        } else {
3136
0
                            U_ASSERT(cs >= CNS_11643_1);
3137
0
                            uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
3138
0
                        }
3139
0
                        len = 4;
3140
0
                        pFromU2022State->cs[g] = cs;
3141
0
                        if(g == 1) {
3142
                            /* changing the SO/G1 charset invalidates the choices[] */
3143
0
                            choiceCount = 0;
3144
0
                        }
3145
0
                    }
3146
3147
                    /* write the shift sequence if necessary */
3148
0
                    if(g != pFromU2022State->g) {
3149
0
                        switch(g) {
3150
0
                        case 1:
3151
0
                            buffer[len++] = UCNV_SO;
3152
3153
                            /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
3154
0
                            pFromU2022State->g = 1;
3155
0
                            break;
3156
0
                        case 2:
                            /* SS2: ESC N */
3157
0
                            buffer[len++] = 0x1b;
3158
0
                            buffer[len++] = 0x4e;
3159
0
                            break;
3160
0
                        default: /* case 3 */
                            /* SS3: ESC O */
3161
0
                            buffer[len++] = 0x1b;
3162
0
                            buffer[len++] = 0x4f;
3163
0
                            break;
3164
0
                        }
3165
0
                    }
3166
3167
                    /* write the two output bytes */
3168
0
                    buffer[len++] = (char)(targetValue >> 8);
3169
0
                    buffer[len++] = (char)targetValue;
3170
0
                } else {
3171
                    /* if we cannot find the character after checking all codepages
3172
                     * then this is an error
3173
                     */
3174
0
                    *err = U_INVALID_CHAR_FOUND;
3175
0
                    cnv->fromUChar32=sourceChar;
3176
0
                    break;
3177
0
                }
3178
0
            }
3179
3180
            /* output len>0 bytes in buffer[] */
            /* lengths 1 and 2 are written inline when they fit; everything else goes through fromUWriteUInt8() */
3181
0
            if(len == 1) {
3182
0
                *target++ = buffer[0];
3183
0
                if(offsets) {
3184
0
                    *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
3185
0
                }
3186
0
            } else if(len == 2 && (target + 2) <= targetLimit) {
3187
0
                *target++ = buffer[0];
3188
0
                *target++ = buffer[1];
3189
0
                if(offsets) {
3190
0
                    int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
3191
0
                    *offsets++ = sourceIndex;
3192
0
                    *offsets++ = sourceIndex;
3193
0
                }
3194
0
            } else {
3195
0
                fromUWriteUInt8(
3196
0
                    cnv,
3197
0
                    buffer, len,
3198
0
                    &target, (const char *)targetLimit,
3199
0
                    &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
3200
0
                    err);
3201
0
                if(U_FAILURE(*err)) {
3202
0
                    break;
3203
0
                }
3204
0
            }
3205
0
        } /* end if(target < targetLimit) */
3206
0
        else{
3207
0
            *err =U_BUFFER_OVERFLOW_ERROR;
3208
0
            break;
3209
0
        }
3210
3211
0
    }/* end while(source < sourceLimit) */
3212
3213
    /*
3214
     * the end of the input stream and detection of truncated input
3215
     * are handled by the framework, but for ISO-2022-CN conversion
3216
     * we need to be in ASCII mode at the very end
3217
     *
3218
     * conditions:
3219
     *   successful
3220
     *   not in ASCII mode
3221
     *   end of input and no truncated input
3222
     */
3223
0
    if( U_SUCCESS(*err) &&
3224
0
        pFromU2022State->g!=0 &&
3225
0
        args->flush && source>=sourceLimit && cnv->fromUChar32==0
3226
0
    ) {
3227
0
        int32_t sourceIndex;
3228
3229
        /* we are switching to ASCII */
3230
0
        pFromU2022State->g=0;
3231
3232
        /* get the source index of the last input character */
3233
        /*
3234
         * TODO this would be simpler and more reliable if we used a pair
3235
         * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
3236
         * so that we could simply use the prevSourceIndex here;
3237
         * this code gives an incorrect result for the rare case of an unmatched
3238
         * trail surrogate that is alone in the last buffer of the text stream
3239
         */
3240
0
        sourceIndex=(int32_t)(source-args->source);
3241
0
        if(sourceIndex>0) {
3242
0
            --sourceIndex;
3243
0
            if( U16_IS_TRAIL(args->source[sourceIndex]) &&
3244
0
                (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
3245
0
            ) {
3246
0
                --sourceIndex;
3247
0
            }
3248
0
        } else {
            /* nothing was consumed in this call, so there is no source index to report */
3249
0
            sourceIndex=-1;
3250
0
        }
3251
3252
0
        fromUWriteUInt8(
3253
0
            cnv,
3254
0
            SHIFT_IN_STR, 1,
3255
0
            &target, (const char *)targetLimit,
3256
0
            &offsets, sourceIndex,
3257
0
            err);
3258
0
    }
3259
3260
    /*save the state and return */
3261
0
    args->source = source;
3262
0
    args->target = (char*)target;
3263
0
}
3264
3265
3266
static void U_CALLCONV
3267
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3268
0
                                               UErrorCode* err){
3269
0
    char tempBuf[3];
3270
0
    const char *mySource = (char *) args->source;
3271
0
    UChar *myTarget = args->target;
3272
0
    const char *mySourceLimit = args->sourceLimit;
3273
0
    uint32_t targetUniChar = 0x0000;
3274
0
    uint32_t mySourceChar = 0x0000;
3275
0
    UConverterDataISO2022* myData;
3276
0
    ISO2022State *pToU2022State;
3277
3278
0
    myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3279
0
    pToU2022State = &myData->toU2022State;
3280
3281
0
    if(myData->key != 0) {
3282
        /* continue with a partial escape sequence */
3283
0
        goto escape;
3284
0
    } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3285
        /* continue with a partial double-byte character */
3286
0
        mySourceChar = args->converter->toUBytes[0];
3287
0
        args->converter->toULength = 0;
3288
0
        targetUniChar = missingCharMarker;
3289
0
        goto getTrailByte;
3290
0
    }
3291
3292
0
    while(mySource < mySourceLimit){
3293
3294
0
        targetUniChar =missingCharMarker;
3295
3296
0
        if(myTarget < args->targetLimit){
3297
3298
0
            mySourceChar= (unsigned char) *mySource++;
3299
3300
0
            switch(mySourceChar){
3301
0
            case UCNV_SI:
3302
0
                pToU2022State->g=0;
3303
0
                if (myData->isEmptySegment) {
3304
0
                    myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
3305
0
                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3306
0
                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
3307
0
                    args->converter->toUBytes[0] = mySourceChar;
3308
0
                    args->converter->toULength = 1;
3309
0
                    args->target = myTarget;
3310
0
                    args->source = mySource;
3311
0
                    return;
3312
0
                }
3313
0
                continue;
3314
3315
0
            case UCNV_SO:
3316
0
                if(pToU2022State->cs[1] != 0) {
3317
0
                    pToU2022State->g=1;
3318
0
                    myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
3319
0
                    continue;
3320
0
                } else {
3321
                    /* illegal to have SO before a matching designator */
3322
0
                    myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
3323
0
                    break;
3324
0
                }
3325
3326
0
            case ESC_2022:
3327
0
                mySource--;
3328
0
escape:
3329
0
                {
3330
0
                    const char * mySourceBefore = mySource;
3331
0
                    int8_t toULengthBefore = args->converter->toULength;
3332
3333
0
                    changeState_2022(args->converter,&(mySource),
3334
0
                        mySourceLimit, ISO_2022_CN,err);
3335
3336
                    /* After SO there must be at least one character before a designator (designator error handled separately) */
3337
0
                    if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3338
0
                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3339
0
                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
3340
0
                        args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
3341
0
                    }
3342
0
                }
3343
3344
                /* invalid or illegal escape sequence */
3345
0
                if(U_FAILURE(*err)){
3346
0
                    args->target = myTarget;
3347
0
                    args->source = mySource;
3348
0
                    myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
3349
0
                    return;
3350
0
                }
3351
0
                continue;
3352
3353
            /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3354
3355
0
            case CR:
3356
0
            case LF:
3357
0
                uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
3358
0
                U_FALLTHROUGH;
3359
0
            default:
3360
                /* convert one or two bytes */
3361
0
                myData->isEmptySegment = FALSE;
3362
0
                if(pToU2022State->g != 0) {
3363
0
                    if(mySource < mySourceLimit) {
3364
0
                        UConverterSharedData *cnv;
3365
0
                        StateEnum tempState;
3366
0
                        int32_t tempBufLen;
3367
0
                        int leadIsOk, trailIsOk;
3368
0
                        uint8_t trailByte;
3369
0
getTrailByte:
3370
0
                        trailByte = (uint8_t)*mySource;
3371
                        /*
3372
                         * Ticket 5691: consistent illegal sequences:
3373
                         * - We include at least the first byte in the illegal sequence.
3374
                         * - If any of the non-initial bytes could be the start of a character,
3375
                         *   we stop the illegal sequence before the first one of those.
3376
                         *
3377
                         * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3378
                         * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3379
                         * Otherwise we convert or report the pair of bytes.
3380
                         */
3381
0
                        leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3382
0
                        trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3383
0
                        if (leadIsOk && trailIsOk) {
3384
0
                            ++mySource;
3385
0
                            tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3386
0
                            if(tempState >= CNS_11643_0) {
3387
0
                                cnv = myData->myConverterArray[CNS_11643];
3388
0
                                tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
3389
0
                                tempBuf[1] = (char) (mySourceChar);
3390
0
                                tempBuf[2] = (char) trailByte;
3391
0
                                tempBufLen = 3;
3392
3393
0
                            }else{
3394
0
                                U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
3395
0
                                cnv = myData->myConverterArray[tempState];
3396
0
                                tempBuf[0] = (char) (mySourceChar);
3397
0
                                tempBuf[1] = (char) trailByte;
3398
0
                                tempBufLen = 2;
3399
0
                            }
3400
0
                            targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
3401
0
                            mySourceChar = (mySourceChar << 8) | trailByte;
3402
0
                        } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3403
                            /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3404
0
                            ++mySource;
3405
                            /* add another bit so that the code below writes 2 bytes in case of error */
3406
0
                            mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3407
0
                        }
3408
0
                        if(pToU2022State->g>=2) {
3409
                            /* return from a single-shift state to the previous one */
3410
0
                            pToU2022State->g=pToU2022State->prevG;
3411
0
                        }
3412
0
                    } else {
3413
0
                        args->converter->toUBytes[0] = (uint8_t)mySourceChar;
3414
0
                        args->converter->toULength = 1;
3415
0
                        goto endloop;
3416
0
                    }
3417
0
                }
3418
0
                else{
3419
0
                    if(mySourceChar <= 0x7f) {
3420
0
                        targetUniChar = (UChar) mySourceChar;
3421
0
                    }
3422
0
                }
3423
0
                break;
3424
0
            }
3425
0
            if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3426
0
                if(args->offsets){
3427
0
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3428
0
                }
3429
0
                *(myTarget++)=(UChar)targetUniChar;
3430
0
            }
3431
0
            else if(targetUniChar > missingCharMarker){
3432
                /* disassemble the surrogate pair and write to output*/
3433
0
                targetUniChar-=0x0010000;
3434
0
                *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
3435
0
                if(args->offsets){
3436
0
                    args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3437
0
                }
3438
0
                ++myTarget;
3439
0
                if(myTarget< args->targetLimit){
3440
0
                    *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3441
0
                    if(args->offsets){
3442
0
                        args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3443
0
                    }
3444
0
                    ++myTarget;
3445
0
                }else{
3446
0
                    args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
3447
0
                                    (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3448
0
                }
3449
3450
0
            }
3451
0
            else{
3452
                /* Call the callback function*/
3453
0
                toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3454
0
                break;
3455
0
            }
3456
0
        }
3457
0
        else{
3458
0
            *err =U_BUFFER_OVERFLOW_ERROR;
3459
0
            break;
3460
0
        }
3461
0
    }
3462
0
endloop:
3463
0
    args->target = myTarget;
3464
0
    args->source = mySource;
3465
0
}
3466
#endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3467
3468
static void U_CALLCONV
3469
0
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3470
0
    UConverter *cnv = args->converter;
3471
0
    UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3472
0
    ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3473
0
    char *p, *subchar;
3474
0
    char buffer[8];
3475
0
    int32_t length;
3476
3477
0
    subchar=(char *)cnv->subChars;
3478
0
    length=cnv->subCharLen; /* assume length==1 for most variants */
3479
3480
0
    p = buffer;
3481
0
    switch(myConverterData->locale[0]){
3482
0
    case 'j':
3483
0
        {
3484
0
            int8_t cs;
3485
3486
0
            if(pFromU2022State->g == 1) {
3487
                /* JIS7: switch from G1 to G0 */
3488
0
                pFromU2022State->g = 0;
3489
0
                *p++ = UCNV_SI;
3490
0
            }
3491
3492
0
            cs = pFromU2022State->cs[0];
3493
0
            if(cs != ASCII && cs != JISX201) {
3494
                /* not in ASCII or JIS X 0201: switch to ASCII */
3495
0
                pFromU2022State->cs[0] = (int8_t)ASCII;
3496
0
                *p++ = '\x1b';
3497
0
                *p++ = '\x28';
3498
0
                *p++ = '\x42';
3499
0
            }
3500
3501
0
            *p++ = subchar[0];
3502
0
            break;
3503
0
        }
3504
0
    case 'c':
3505
0
        if(pFromU2022State->g != 0) {
3506
            /* not in ASCII mode: switch to ASCII */
3507
0
            pFromU2022State->g = 0;
3508
0
            *p++ = UCNV_SI;
3509
0
        }
3510
0
        *p++ = subchar[0];
3511
0
        break;
3512
0
    case 'k':
3513
0
        if(myConverterData->version == 0) {
3514
0
            if(length == 1) {
3515
0
                if((UBool)args->converter->fromUnicodeStatus) {
3516
                    /* in DBCS mode: switch to SBCS */
3517
0
                    args->converter->fromUnicodeStatus = 0;
3518
0
                    *p++ = UCNV_SI;
3519
0
                }
3520
0
                *p++ = subchar[0];
3521
0
            } else /* length == 2*/ {
3522
0
                if(!(UBool)args->converter->fromUnicodeStatus) {
3523
                    /* in SBCS mode: switch to DBCS */
3524
0
                    args->converter->fromUnicodeStatus = 1;
3525
0
                    *p++ = UCNV_SO;
3526
0
                }
3527
0
                *p++ = subchar[0];
3528
0
                *p++ = subchar[1];
3529
0
            }
3530
0
            break;
3531
0
        } else {
3532
            /* save the subconverter's substitution string */
3533
0
            uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3534
0
            int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3535
3536
            /* set our substitution string into the subconverter */
3537
0
            myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3538
0
            myConverterData->currentConverter->subCharLen = (int8_t)length;
3539
3540
            /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3541
0
            args->converter = myConverterData->currentConverter;
3542
0
            myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3543
0
            ucnv_cbFromUWriteSub(args, 0, err);
3544
0
            cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3545
0
            args->converter = cnv;
3546
3547
            /* restore the subconverter's substitution string */
3548
0
            myConverterData->currentConverter->subChars = currentSubChars;
3549
0
            myConverterData->currentConverter->subCharLen = currentSubCharLen;
3550
3551
0
            if(*err == U_BUFFER_OVERFLOW_ERROR) {
3552
0
                if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3553
0
                    uprv_memcpy(
3554
0
                        cnv->charErrorBuffer,
3555
0
                        myConverterData->currentConverter->charErrorBuffer,
3556
0
                        myConverterData->currentConverter->charErrorBufferLength);
3557
0
                }
3558
0
                cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3559
0
                myConverterData->currentConverter->charErrorBufferLength = 0;
3560
0
            }
3561
0
            return;
3562
0
        }
3563
0
    default:
3564
        /* not expected */
3565
0
        break;
3566
0
    }
3567
0
    ucnv_cbFromUWriteBytes(args,
3568
0
                           buffer, (int32_t)(p - buffer),
3569
0
                           offsetIndex, err);
3570
0
}
3571
3572
/*
3573
 * Structure for cloning an ISO 2022 converter into a single memory block.
3574
 * ucnv_safeClone() of the converter will align the entire cloneStruct,
3575
 * and then ucnv_safeClone() of the sub-converter may additionally align
3576
 * currentConverter inside the cloneStruct, for which we need the deadSpace
3577
 * after currentConverter.
3578
 * This is because UAlignedMemory may be larger than the actually
3579
 * necessary alignment size for the platform.
3580
 * The other cloneStruct fields will not be moved around,
3581
 * and are aligned properly with cloneStruct's alignment.
3582
 */
3583
struct cloneStruct
3584
{
3585
    /* the cloned main converter; _ISO_2022_SafeClone() returns the address of this member */
    UConverter cnv;
3586
    /* storage handed to ucnv_safeClone() for the clone of the sub-converter */
    UConverter currentConverter;
3587
    /* padding counted into the size passed to ucnv_safeClone() for the sub-converter */
    UAlignedMemory deadSpace;
3588
    /* copy of the ISO-2022 extra data; cnv.extraInfo is pointed at this member */
    UConverterDataISO2022 mydata;
3589
};
3590
3591
3592
U_CDECL_BEGIN
3593
3594
/*
 * Clone hook for the ISO-2022 converters.
 *
 * With *pBufferSize == 0 this is a preflighting request: the size of
 * struct cloneStruct is stored in *pBufferSize and NULL is returned.
 * Otherwise stackBuffer is treated as a struct cloneStruct whose main
 * UConverter has already been copied by the caller; the ISO-2022 extra data
 * is copied into it, the sub-converter (if any) is cloned into it, and one
 * reference is added to each shared converter in myConverterArray.
 *
 * Returns the clone's main converter, or NULL for preflighting or when
 * cloning the sub-converter failed (*status then carries the error).
 */
static UConverter * U_CALLCONV
_ISO_2022_SafeClone(
            const UConverter *cnv,
            void *stackBuffer,
            int32_t *pBufferSize,
            UErrorCode *status)
{
    if (0 == *pBufferSize) {
        /* preflighting: only report how much memory a clone needs */
        *pBufferSize = (int32_t)sizeof(struct cloneStruct);
        return NULL;
    }

    UConverterDataISO2022 *sourceData = (UConverterDataISO2022 *)cnv->extraInfo;
    struct cloneStruct *clone = (struct cloneStruct *)stackBuffer;

    /* ucnv.c/ucnv_safeClone() copied the main UConverter already */

    /* give the clone its own copy of the extra data, stored inside the clone block */
    uprv_memcpy(&clone->mydata, sourceData, sizeof(UConverterDataISO2022));
    clone->cnv.isExtraLocal = TRUE;
    clone->cnv.extraInfo = &clone->mydata;

    /* clone the sub-converter into the space reserved for it */
    if (sourceData->currentConverter != NULL) {
        /* the available space includes the padding after currentConverter */
        int32_t subCloneSize = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory));
        clone->mydata.currentConverter =
            ucnv_safeClone(sourceData->currentConverter,
                           &clone->currentConverter,
                           &subCloneSize, status);
        if (U_FAILURE(*status)) {
            return NULL;
        }
    }

    /* the shared sub-converters are not copied; the clone takes one more reference to each */
    for (int32_t idx = 0; idx < UCNV_2022_MAX_CONVERTERS; ++idx) {
        UConverterSharedData *shared = sourceData->myConverterArray[idx];
        if (shared != NULL) {
            ucnv_incrementRefCount(shared);
        }
    }

    return &clone->cnv;
}
3640
3641
U_CDECL_END
3642
3643
static void U_CALLCONV
3644
_ISO_2022_GetUnicodeSet(const UConverter *cnv,
3645
                    const USetAdder *sa,
3646
                    UConverterUnicodeSet which,
3647
                    UErrorCode *pErrorCode)
3648
0
{
3649
0
    int32_t i;
3650
0
    UConverterDataISO2022* cnvData;
3651
3652
0
    if (U_FAILURE(*pErrorCode)) {
3653
0
        return;
3654
0
    }
3655
#ifdef U_ENABLE_GENERIC_ISO_2022
3656
    if (cnv->sharedData == &_ISO2022Data) {
3657
        /* We use UTF-8 in this case */
3658
        sa->addRange(sa->set, 0, 0xd7FF);
3659
        sa->addRange(sa->set, 0xE000, 0x10FFFF);
3660
        return;
3661
    }
3662
#endif
3663
3664
0
    cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3665
3666
    /* open a set and initialize it with code points that are algorithmically round-tripped */
3667
0
    switch(cnvData->locale[0]){
3668
0
    case 'j':
3669
        /* include JIS X 0201 which is hardcoded */
3670
0
        sa->add(sa->set, 0xa5);
3671
0
        sa->add(sa->set, 0x203e);
3672
0
        if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3673
            /* include Latin-1 for some variants of JP */
3674
0
            sa->addRange(sa->set, 0, 0xff);
3675
0
        } else {
3676
            /* include ASCII for JP */
3677
0
            sa->addRange(sa->set, 0, 0x7f);
3678
0
        }
3679
0
        if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3680
            /*
3681
             * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3682
             * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3683
             * use half-width Katakana.
3684
             * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3685
             * half-width Katakana via the ESC ( I sequence.
3686
             * However, we only emit (fromUnicode) half-width Katakana according to the
3687
             * definition of each variant.
3688
             *
3689
             * When including fallbacks,
3690
             * we need to include half-width Katakana Unicode code points for all JP variants because
3691
             * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3692
             */
3693
            /* include half-width Katakana for JP */
3694
0
            sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3695
0
        }
3696
0
        break;
3697
0
#if !UCONFIG_ONLY_HTML_CONVERSION
3698
0
    case 'c':
3699
0
    case 'z':
3700
        /* include ASCII for CN */
3701
0
        sa->addRange(sa->set, 0, 0x7f);
3702
0
        break;
3703
0
    case 'k':
3704
        /* there is only one converter for KR, and it is not in the myConverterArray[] */
3705
0
        cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3706
0
                cnvData->currentConverter, sa, which, pErrorCode);
3707
        /* the loop over myConverterArray[] will simply not find another converter */
3708
0
        break;
3709
0
#endif
3710
0
    default:
3711
0
        break;
3712
0
    }
3713
3714
#if 0  /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3715
            if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3716
                cnvData->version==0 && i==CNS_11643
3717
            ) {
3718
                /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3719
                ucnv_MBCSGetUnicodeSetForBytes(
3720
                        cnvData->myConverterArray[i],
3721
                        sa, UCNV_ROUNDTRIP_SET,
3722
                        0, 0x81, 0x82,
3723
                        pErrorCode);
3724
            }
3725
#endif
3726
3727
0
    for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3728
0
        UConverterSetFilter filter;
3729
0
        if(cnvData->myConverterArray[i]!=NULL) {
3730
0
            if(cnvData->locale[0]=='j' && i==JISX208) {
3731
                /*
3732
                 * Only add code points that map to Shift-JIS codes
3733
                 * corresponding to JIS X 0208.
3734
                 */
3735
0
                filter=UCNV_SET_FILTER_SJIS;
3736
0
#if !UCONFIG_ONLY_HTML_CONVERSION
3737
0
            } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3738
0
                       cnvData->version==0 && i==CNS_11643) {
3739
                /*
3740
                 * Version-specific for CN:
3741
                 * CN version 0 does not map CNS planes 3..7 although
3742
                 * they are all available in the CNS conversion table;
3743
                 * CN version 1 (-EXT) does map them all.
3744
                 * The two versions create different Unicode sets.
3745
                 */
3746
0
                filter=UCNV_SET_FILTER_2022_CN;
3747
0
            } else if(i==KSC5601) {
3748
                /*
3749
                 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
3750
                 * are broader than GR94.
3751
                 */
3752
0
                filter=UCNV_SET_FILTER_GR94DBCS;
3753
0
#endif
3754
0
            } else {
3755
0
                filter=UCNV_SET_FILTER_NONE;
3756
0
            }
3757
0
            ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3758
0
        }
3759
0
    }
3760
3761
    /*
3762
     * ISO 2022 converters must not convert SO/SI/ESC despite what
3763
     * sub-converters do by themselves.
3764
     * Remove these characters from the set.
3765
     */
3766
0
    sa->remove(sa->set, 0x0e);
3767
0
    sa->remove(sa->set, 0x0f);
3768
0
    sa->remove(sa->set, 0x1b);
3769
3770
    /* ISO 2022 converters do not convert C1 controls either */
3771
0
    sa->removeRange(sa->set, 0x80, 0x9f);
3772
0
}
3773
3774
/*
 * Implementation table for the generic ISO_2022 converter.
 * The four conversion-function entries are filled in only when
 * U_ENABLE_GENERIC_ISO_2022 is defined; otherwise they are NULL.
 * NOTE(review): the meaning of each position is given by the UConverterImpl
 * declaration, which is not visible here - confirm against it before reordering.
 */
static const UConverterImpl _ISO2022Impl={
3775
    UCNV_ISO_2022,
3776
3777
    NULL,
3778
    NULL,
3779
3780
    _ISO2022Open,
3781
    _ISO2022Close,
3782
    _ISO2022Reset,
3783
3784
#ifdef U_ENABLE_GENERIC_ISO_2022
3785
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3786
    T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3787
    ucnv_fromUnicode_UTF8,
3788
    ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3789
#else
3790
    NULL,
3791
    NULL,
3792
    NULL,
3793
    NULL,
3794
#endif
3795
    NULL,
3796
3797
    NULL,
3798
    _ISO2022getName,
3799
    _ISO_2022_WriteSub,
3800
    _ISO_2022_SafeClone,
3801
    _ISO_2022_GetUnicodeSet,
3802
3803
    NULL,
3804
    NULL
3805
};
3806
/*
 * Static data for the generic "ISO_2022" converter.
 * NOTE(review): the values are positional; their field names come from the
 * UConverterStaticData declaration, which is not visible here.
 */
static const UConverterStaticData _ISO2022StaticData={
3807
    sizeof(UConverterStaticData),
3808
    "ISO_2022",
3809
    2022,
3810
    UCNV_IBM,
3811
    UCNV_ISO_2022,
3812
    1,
3813
    3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3814
    /* presumably the substitution byte sequence (0x1a) followed by its length - TODO confirm */
    { 0x1a, 0, 0, 0 },
3815
    1,
3816
    FALSE,
3817
    FALSE,
3818
    0,
3819
    0,
3820
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3821
};
3822
/*
 * Shared data object of the generic ISO_2022 converter: combines
 * _ISO2022StaticData with the _ISO2022Impl function table.
 * Unlike the JP/KR/CN objects below it is not inside an anonymous namespace.
 */
const UConverterSharedData _ISO2022Data=
3823
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl);
3824
3825
/*************JP****************/
3826
/*
 * Implementation table for ISO-2022-JP.
 * The same to-Unicode function and the same from-Unicode function each fill
 * two consecutive entries (presumably the plain and the with-offsets slot -
 * confirm against the UConverterImpl declaration). Open/close/reset, getName,
 * writeSub, safeClone and getUnicodeSet are shared with the other ISO-2022
 * variants.
 */
static const UConverterImpl _ISO2022JPImpl={
3827
    UCNV_ISO_2022,
3828
3829
    NULL,
3830
    NULL,
3831
3832
    _ISO2022Open,
3833
    _ISO2022Close,
3834
    _ISO2022Reset,
3835
3836
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3837
    UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3838
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3839
    UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3840
    NULL,
3841
3842
    NULL,
3843
    _ISO2022getName,
3844
    _ISO_2022_WriteSub,
3845
    _ISO_2022_SafeClone,
3846
    _ISO_2022_GetUnicodeSet,
3847
3848
    NULL,
3849
    NULL
3850
};
3851
/*
 * Static data for the "ISO_2022_JP" converter.
 * NOTE(review): the values are positional; their field names come from the
 * UConverterStaticData declaration, which is not visible here.
 */
static const UConverterStaticData _ISO2022JPStaticData={
3852
    sizeof(UConverterStaticData),
3853
    "ISO_2022_JP",
3854
    0,
3855
    UCNV_IBM,
3856
    UCNV_ISO_2022,
3857
    1,
3858
    6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3859
    /* presumably the substitution byte sequence (0x1a) followed by its length - TODO confirm */
    { 0x1a, 0, 0, 0 },
3860
    1,
3861
    FALSE,
3862
    FALSE,
3863
    0,
3864
    0,
3865
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3866
};
3867
3868
/* File-local shared data object for ISO-2022-JP: static data plus function table. */
namespace {
3869
3870
const UConverterSharedData _ISO2022JPData=
3871
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl);
3872
3873
}  // namespace
3874
3875
#if !UCONFIG_ONLY_HTML_CONVERSION
3876
/************* KR ***************/
3877
/*
 * Implementation table for ISO-2022-KR.
 * The same to-Unicode function and the same from-Unicode function each fill
 * two consecutive entries (presumably the plain and the with-offsets slot -
 * confirm against the UConverterImpl declaration). Open/close/reset, getName,
 * writeSub, safeClone and getUnicodeSet are shared with the other ISO-2022
 * variants.
 */
static const UConverterImpl _ISO2022KRImpl={
3878
    UCNV_ISO_2022,
3879
3880
    NULL,
3881
    NULL,
3882
3883
    _ISO2022Open,
3884
    _ISO2022Close,
3885
    _ISO2022Reset,
3886
3887
    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3888
    UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3889
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3890
    UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3891
    NULL,
3892
3893
    NULL,
3894
    _ISO2022getName,
3895
    _ISO_2022_WriteSub,
3896
    _ISO_2022_SafeClone,
3897
    _ISO_2022_GetUnicodeSet,
3898
3899
    NULL,
3900
    NULL
3901
};
3902
/*
 * Static data for the "ISO_2022_KR" converter.
 * NOTE(review): the values are positional; their field names come from the
 * UConverterStaticData declaration, which is not visible here.
 */
static const UConverterStaticData _ISO2022KRStaticData={
3903
    sizeof(UConverterStaticData),
3904
    "ISO_2022_KR",
3905
    0,
3906
    UCNV_IBM,
3907
    UCNV_ISO_2022,
3908
    1,
3909
    8, /* max 8 bytes per UChar */
3910
    /* presumably the substitution byte sequence (0x1a) followed by its length - TODO confirm */
    { 0x1a, 0, 0, 0 },
3911
    1,
3912
    FALSE,
3913
    FALSE,
3914
    0,
3915
    0,
3916
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3917
};
3918
3919
/* File-local shared data object for ISO-2022-KR: static data plus function table. */
namespace {
3920
3921
const UConverterSharedData _ISO2022KRData=
3922
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl);
3923
3924
}  // namespace
3925
3926
/*************** CN ***************/
3927
/*
 * Implementation table for ISO-2022-CN.
 * The same to-Unicode function and the same from-Unicode function each fill
 * two consecutive entries (presumably the plain and the with-offsets slot -
 * confirm against the UConverterImpl declaration). Open/close/reset, getName,
 * writeSub, safeClone and getUnicodeSet are shared with the other ISO-2022
 * variants.
 */
static const UConverterImpl _ISO2022CNImpl={
3928
3929
    UCNV_ISO_2022,
3930
3931
    NULL,
3932
    NULL,
3933
3934
    _ISO2022Open,
3935
    _ISO2022Close,
3936
    _ISO2022Reset,
3937
3938
    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3939
    UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3940
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3941
    UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3942
    NULL,
3943
3944
    NULL,
3945
    _ISO2022getName,
3946
    _ISO_2022_WriteSub,
3947
    _ISO_2022_SafeClone,
3948
    _ISO_2022_GetUnicodeSet,
3949
3950
    NULL,
3951
    NULL
3952
};
3953
/*
 * Static data for the "ISO_2022_CN" converter.
 * NOTE(review): the values are positional; their field names come from the
 * UConverterStaticData declaration, which is not visible here.
 */
static const UConverterStaticData _ISO2022CNStaticData={
3954
    sizeof(UConverterStaticData),
3955
    "ISO_2022_CN",
3956
    0,
3957
    UCNV_IBM,
3958
    UCNV_ISO_2022,
3959
    1,
3960
    8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3961
    /* presumably the substitution byte sequence (0x1a) followed by its length - TODO confirm */
    { 0x1a, 0, 0, 0 },
3962
    1,
3963
    FALSE,
3964
    FALSE,
3965
    0,
3966
    0,
3967
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3968
};
3969
3970
/* File-local shared data object for ISO-2022-CN: static data plus function table. */
namespace {
3971
3972
const UConverterSharedData _ISO2022CNData=
3973
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl);
3974
3975
}  // namespace
3976
#endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
3977
3978
#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */