/src/icu/source/common/ucnv2022.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ********************************************************************** |
5 | | * Copyright (C) 2000-2016, International Business Machines |
6 | | * Corporation and others. All Rights Reserved. |
7 | | ********************************************************************** |
8 | | * file name: ucnv2022.cpp |
9 | | * encoding: UTF-8 |
10 | | * tab size: 8 (not used) |
11 | | * indentation:4 |
12 | | * |
13 | | * created on: 2000feb03 |
14 | | * created by: Markus W. Scherer |
15 | | * |
16 | | * Change history: |
17 | | * |
18 | | * 06/29/2000 helena Major rewrite of the callback APIs. |
19 | | * 08/08/2000 Ram Included support for ISO-2022-JP-2 |
20 | | * Changed implementation of toUnicode |
21 | | * function |
22 | | * 08/21/2000 Ram Added support for ISO-2022-KR |
23 | | * 08/29/2000 Ram Separated implementation of EBCDIC to |
24 | | * ucnvebdc.c |
25 | | * 09/20/2000 Ram Added support for ISO-2022-CN |
26 | | * Added implementations for getNextUChar() |
27 | | * for specific 2022 country variants. |
28 | | * 10/31/2000 Ram Implemented offsets logic functions |
29 | | */ |
30 | | |
31 | | #include "unicode/utypes.h" |
32 | | |
33 | | #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION |
34 | | |
35 | | #include "unicode/ucnv.h" |
36 | | #include "unicode/uset.h" |
37 | | #include "unicode/ucnv_err.h" |
38 | | #include "unicode/ucnv_cb.h" |
39 | | #include "unicode/utf16.h" |
40 | | #include "ucnv_imp.h" |
41 | | #include "ucnv_bld.h" |
42 | | #include "ucnv_cnv.h" |
43 | | #include "ucnvmbcs.h" |
44 | | #include "cstring.h" |
45 | | #include "cmemory.h" |
46 | | #include "uassert.h" |
47 | | |
48 | | #ifdef U_ENABLE_GENERIC_ISO_2022 |
49 | | /* |
50 | | * I am disabling the generic ISO-2022 converter after proposing to do so on |
51 | | * the icu mailing list two days ago. |
52 | | * |
53 | | * Reasons: |
54 | | * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of |
55 | | * its designation sequences, single shifts with return to the previous state, |
56 | | * switch-with-no-return to UTF-16BE or similar, etc. |
57 | | * This is unlike the language-specific variants like ISO-2022-JP which |
58 | | * require a much smaller repertoire of ISO-2022 features. |
59 | | * These variants continue to be supported. |
60 | | * 2. I believe that no one is really using the generic ISO-2022 converter |
61 | | * but rather always one of the language-specific variants. |
62 | | * Note that ICU's generic ISO-2022 converter has always output one escape |
63 | | * sequence followed by UTF-8 for the whole stream. |
64 | | * 3. Switching between subcharsets is extremely slow, because each time |
65 | | * the previous converter is closed and a new one opened, |
66 | | * without any kind of caching, least-recently-used list, etc. |
67 | | * 4. The code is currently buggy, and given the above it does not seem |
68 | | * reasonable to spend the time on maintenance. |
69 | | * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings. |
70 | | * This means, for example, that when ISO-8859-7 is designated, the following |
71 | | * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff. |
72 | | * The ICU ISO-2022 converter does not handle this - and has no information |
73 | | * about which subconverter would have to be shifted vs. which is designed |
74 | | * for 7-bit ISO-2022. |
75 | | * |
76 | | * Markus Scherer 2003-dec-03 |
77 | | */ |
78 | | #endif |
79 | | |
80 | | #if !UCONFIG_ONLY_HTML_CONVERSION |
81 | | static const char SHIFT_IN_STR[] = "\x0F"; |
82 | | // static const char SHIFT_OUT_STR[] = "\x0E"; |
83 | | #endif |
84 | | |
/* Numeric codes of carriage return, line feed, horizontal tab, vertical tab and space. */
85 | 0 | #define CR 0x0D |
86 | 0 | #define LF 0x0A |
87 | | #define H_TAB 0x09 |
88 | | #define V_TAB 0x0B |
89 | | #define SPACE 0x20 |
90 | | |
/* Inclusive bounds of the code point range U+FF61..U+FF9F (named "HWKANA" here). */
enum {
    HWKANA_START = 0xff61,
    HWKANA_END   = 0xff9f
};
95 | | |
/*
 * Native (8-bit, "GR") byte ranges of graphic character sets.
 *
 * A 94-character set occupies native bytes A1..FE and is written in ISO 2022
 * as bytes 21..7E; a 96-character set occupies native bytes A0..FF and is
 * written as bytes 20..7F. In both cases the ISO 2022 byte is the native
 * byte minus 0x80.
 *
 * C1 control codes (native bytes 80..9F) must not be encoded this way,
 * because they would turn into C0 control codes 00..1F.
 */
enum {
    GR94_START = 0xa1,
    GR94_END   = 0xfe,

    GR96_START = 0xa0,
    GR96_END   = 0xff
};
110 | | |
/*
 * ISO 2022 control codes must not be converted from Unicode
 * because they would mess up the byte stream.
 *
 * IS_2022_CONTROL(c) is nonzero iff c is SO (0x0e), SI (0x0f) or ESC (0x1b):
 * the bit mask 0x0800c000 has exactly the bits at positions 0xe, 0xf and 0x1b set.
 *
 * The range check is performed on the value converted to uint32_t so that a
 * negative argument is rejected before it can be used as a shift count
 * (shifting by a negative amount is undefined behavior).
 *
 * Note: c is evaluated more than once; do not pass an expression with side effects.
 */
#define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
118 | | |
/*
 * Charset / state identifiers used by the ISO-2022-JP and ISO-2022-CN code.
 * The JP and CN groups reuse the same small integers (1, 2, 3).
 */
typedef enum {
    /* ---- values common to both variants ---- */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE     = 0x10,
    SS3_STATE     = 0x11,

    /* ---- ISO-2022-JP ---- */
    ISO8859_1     = 1,
    ISO8859_7     = 2,
    JISX201       = 3,
    JISX208       = 4,
    JISX212       = 5,
    GB2312        = 6,
    KSC5601       = 7,
    HWKANA_7BIT   = 8,      /* Halfwidth Katakana 7 bit */

    /* ---- ISO-2022-CN ---- */
    /* These three must keep their values: they correspond to myConverterArray[]. */
    GB2312_1      = 1,
    ISO_IR_165    = 2,
    CNS_11643     = 3,

    /*
     * The plane-specific values below are used in StateEnum and ISO2022State
     * variables, but CNS_11643 must be used to index into myConverterArray[].
     */
    CNS_11643_0   = 0x20,
    CNS_11643_1   = 0x21,
    CNS_11643_2   = 0x22,
    CNS_11643_3   = 0x23,
    CNS_11643_4   = 0x24,
    CNS_11643_5   = 0x25,
    CNS_11643_6   = 0x26,
    CNS_11643_7   = 0x27
} StateEnum;
157 | | |
/*
 * IS_JP_DBCS(cs): in the HTML-only build only JISX208 qualifies; otherwise every
 * StateEnum value from JISX208 through KSC5601 (JISX208, JISX212, GB2312, KSC5601)
 * qualifies. The non-HTML form evaluates its argument twice.
 */
158 | | /* is the StateEnum charset value for a DBCS charset? */ |
159 | | #if UCONFIG_ONLY_HTML_CONVERSION |
160 | | #define IS_JP_DBCS(cs) (JISX208==(cs)) |
161 | | #else |
162 | 0 | #define IS_JP_DBCS(cs) (JISX208<=(cs) && (cs)<=KSC5601) |
163 | | #endif |
164 | | |
/*
 * CSM(cs): "charset mask" - a value with only bit number cs set. It is used to
 * build the jpCharsetMasks[] entries and to test them in _ISO2022Open().
 */
165 | 0 | #define CSM(cs) ((uint16_t)1<<(cs)) |
166 | | |
167 | | /* |
168 | | * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence |
169 | | * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x |
170 | | * |
171 | | * Note: The converter uses some leniency: |
172 | | * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in |
173 | | * all versions, not just JIS7 and JIS8. |
174 | | * - ICU does not distinguish between different versions of JIS X 0208. |
175 | | */ |
/*
 * MAX_JA_VERSION is the highest value of the "version" option accepted for the
 * Japanese variant; _ISO2022Open() fails with U_MISSING_RESOURCE_ERROR above it.
 * jpCharsetMasks[v] is the OR of CSM() bits of the charsets enabled for version v;
 * _ISO2022Open() consults it to decide which sub-converters to load.
 * The HTML-only build keeps just the version 0 entry.
 */
176 | | #if UCONFIG_ONLY_HTML_CONVERSION |
177 | | enum { MAX_JA_VERSION=0 }; |
178 | | #else |
179 | | enum { MAX_JA_VERSION=4 }; |
180 | | #endif |
181 | | static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={ |
182 | | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT), |
183 | | #if !UCONFIG_ONLY_HTML_CONVERSION |
184 | | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212), |
185 | | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), |
186 | | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), |
187 | | CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7) |
188 | | #endif |
189 | | }; |
190 | | |
/*
 * Kind of the currently selected sub-converter; stored in
 * UConverterDataISO2022::currentType (initialized to ASCII1 by _ISO2022Open()).
 */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
199 | | |
/*
 * Designation and shift state for one conversion direction.
 * _ISO2022Reset() returns a state to its initial condition by zero-filling it.
 */
typedef struct ISO2022State {
    /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t cs[4];
    /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* g before single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
205 | | |
206 | 0 | #define UCNV_OPTIONS_VERSION_MASK 0xf |
207 | 0 | #define UCNV_2022_MAX_CONVERTERS 10 |
208 | | |
209 | | typedef struct{ |
210 | | UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS]; |
211 | | UConverter *currentConverter; |
212 | | Cnv2022Type currentType; |
213 | | ISO2022State toU2022State, fromU2022State; |
214 | | uint32_t key; |
215 | | uint32_t version; |
216 | | #ifdef U_ENABLE_GENERIC_ISO_2022 |
217 | | UBool isFirstBuffer; |
218 | | #endif |
219 | | UBool isEmptySegment; |
220 | | char name[30]; |
221 | | char locale[3]; |
222 | | }UConverterDataISO2022; |
223 | | |
224 | | /* Protos */ |
225 | | /* ISO-2022 ----------------------------------------------------------------- */ |
226 | | |
227 | | /*Forward declaration */ |
228 | | U_CFUNC void U_CALLCONV |
229 | | ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args, |
230 | | UErrorCode * err); |
231 | | U_CFUNC void U_CALLCONV |
232 | | ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args, |
233 | | UErrorCode * err); |
234 | | |
/* The escape byte that introduces every ISO 2022 escape sequence. */
#define ESC_2022 0x1B /*ESC*/

/* Classification of the bytes collected so far while matching an escape sequence. */
typedef enum
{
    /* does not correspond to a valid ISO 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far corresponds to a valid ISO 2022 escape sequence, but is not complete */
    VALID_NON_TERMINAL_2022 = 0,
    /* corresponds to a complete, valid ISO 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /* matches one ISO 2022 escape sequence so far, but more characters might match another one */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
244 | | |
245 | | /* |
246 | | * The way these state transition arrays work is: |
247 | | * ex : ESC$B is the sequence for JISX208 |
248 | | * a) First Iteration: char is ESC |
249 | | * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index |
250 | | * int x = normalize_esq_chars_2022[27] which is equal to 1 |
251 | | * ii) Search for this value in escSeqStateTable_Key_2022[] |
252 | | * value of x is stored at escSeqStateTable_Key_2022[0] |
253 | | * iii) Save this index as offset |
254 | | * iv) Get state of this sequence from escSeqStateTable_Value_2022[] |
255 | | * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 |
256 | | * b) Switch on this state and continue to next char |
257 | | * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index |
258 | | * which is normalize_esq_chars_2022[36] == 4 |
259 | | * ii) x is currently 1(from above) |
260 | | * x<<=5 -- x is now 32 |
261 | | * x+=normalize_esq_chars_2022[36] |
262 | | * now x is 36 |
263 | | * iii) Search for this value in escSeqStateTable_Key_2022[] |
264 | | * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2 |
265 | | * iv) Get state of this sequence from escSeqStateTable_Value_2022[] |
266 | | * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022 |
267 | | * c) Switch on this state and continue to next char |
268 | | * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index |
269 | | * ii) x is currently 36 (from above) |
270 | | * x<<=5 -- x is now 1152 |
271 | | * x+=normalize_esq_chars_2022[66] |
272 | | * now x is 1161 |
273 | | * iii) Search for this value in escSeqStateTable_Key_2022[] |
274 | | * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24 |
275 | | * iv) Get state of this sequence from escSeqStateTable_Value_2022[24] |
276 | | * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022 |
277 | | * v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX-208 |
278 | | */ |
279 | | |
280 | | |
281 | | /*Below are the 3 arrays depicting a state transition table*/ |
/*
 * Maps a byte value (0..255) to the small code (1..29) that is folded into the
 * escape-sequence key as key = (key << 5) + code. Each row below holds ten
 * entries, so the entry for byte b is in row b/10, column b%10; for example
 * entry 27 (ESC) is 1, entry 36 ('$') is 4 and entry 66 ('B') is 9.
 * NOTE(review): 0 presumably marks bytes that never occur in a recognized
 * escape sequence - confirm against the lookup code, which is outside this excerpt.
 */
282 | | static const int8_t normalize_esq_chars_2022[256] = { |
283 | | /* 0 1 2 3 4 5 6 7 8 9 */ |
284 | | |
285 | | 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
286 | | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
287 | | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0 |
288 | | ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0 |
289 | | ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0 |
290 | | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
291 | | ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12 |
292 | | ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28 |
293 | | ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
294 | | ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
295 | | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
296 | | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
297 | | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
298 | | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
299 | | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
300 | | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
301 | | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
302 | | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
303 | | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
304 | | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
305 | | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
306 | | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
307 | | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
308 | | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
309 | | ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 |
310 | | ,0 ,0 ,0 ,0 ,0 ,0 |
311 | | }; |
312 | | |
313 | | #ifdef U_ENABLE_GENERIC_ISO_2022 |
314 | | /* |
315 | | * When the generic ISO-2022 converter is completely removed, not just disabled |
316 | | * per #ifdef, then the following state table and the associated tables that are |
317 | | * dimensioned with MAX_STATES_2022 should be trimmed. |
318 | | * |
319 | | * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of |
320 | | * the associated escape sequences starting with ESC ( B should be removed. |
321 | | * This includes the ones with key values 1097 and all of the ones above 1000000. |
322 | | * |
323 | | * For the latter, the tables can simply be truncated. |
324 | | * For the former, since the tables must be kept parallel, it is probably best |
325 | | * to simply duplicate an adjacent table cell, parallel in all tables. |
326 | | * |
327 | | * It may make sense to restructure the tables, especially by using small search |
328 | | * tables for the variants instead of indexing them parallel to the table here. |
329 | | */ |
330 | | #endif |
331 | | |
/*
 * MAX_STATES_2022 is the common length of the parallel escape-sequence tables
 * (this key table, the value table, the optional result table and the
 * per-variant next-state tables).
 * escSeqStateTable_Key_2022 lists, in ascending order, the key of every
 * recognized escape-sequence prefix; a key is built byte by byte as
 * key = (key << 5) + normalize_esq_chars_2022[byte]. The index at which a key is
 * found here is the index used in the parallel tables.
 */
332 | 0 | #define MAX_STATES_2022 74 |
333 | | static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = { |
334 | | /* 0 1 2 3 4 5 6 7 8 9 */ |
335 | | |
336 | | 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096 |
337 | | ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106 |
338 | | ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257 |
339 | | ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940 |
340 | | ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644 |
341 | | ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138 |
342 | | ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630 |
343 | | ,35947631 ,35947635 ,35947636 ,35947638 |
344 | | }; |
345 | | |
346 | | #ifdef U_ENABLE_GENERIC_ISO_2022 |
347 | | |
/*
 * Converter name for each entry of escSeqStateTable_Key_2022 (same index), or
 * NULL where the key has no converter of its own. Only compiled when the generic
 * ISO-2022 converter is enabled (U_ENABLE_GENERIC_ISO_2022).
 */
348 | | static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = { |
349 | | /* 0 1 2 3 4 5 6 7 8 9 */ |
350 | | |
351 | | NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1" |
352 | | ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1" |
353 | | ,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8" |
354 | | ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383" |
355 | | ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165" |
356 | | ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" |
357 | | ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089" |
358 | | ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1" |
359 | | }; |
360 | | |
361 | | #endif |
362 | | |
/*
 * UCNV_TableStates_2022 classification for each entry of
 * escSeqStateTable_Key_2022 (same index): whether the bytes matched so far are
 * an incomplete prefix, a complete escape sequence, or a complete sequence that
 * could still be extended (only index 10, key 1097, is VALID_MAYBE_TERMINAL_2022).
 */
363 | | static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = { |
364 | | /* 0 1 2 3 4 5 6 7 8 9 */ |
365 | | VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
366 | | ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
367 | | ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 |
368 | | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
369 | | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
370 | | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
371 | | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
372 | | ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 |
373 | | }; |
374 | | |
/*
 * Identifies which ISO-2022 variant is in use.
 * The generic variant exists only with U_ENABLE_GENERIC_ISO_2022; the Korean
 * and Chinese variants are left out of the HTML-only build.
 */
typedef enum {
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022 = 0,
#endif
    ISO_2022_JP = 1,
#if !UCONFIG_ONLY_HTML_CONVERSION
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
#endif
} Variant2022;
386 | | |
387 | | /*********** ISO 2022 Converter Protos ***********/ |
/*
 * Forward declarations of the converter callbacks (open, close, reset, getName,
 * writeSub, safeClone and, for the generic converter only, the toUnicode
 * function), followed by declarations of the per-variant shared-data objects
 * that _ISO2022Open() installs in cnv->sharedData. The Korean and Chinese
 * objects are not declared in the HTML-only build.
 */
388 | | static void U_CALLCONV |
389 | | _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode); |
390 | | |
391 | | static void U_CALLCONV |
392 | | _ISO2022Close(UConverter *converter); |
393 | | |
394 | | static void U_CALLCONV |
395 | | _ISO2022Reset(UConverter *converter, UConverterResetChoice choice); |
396 | | |
397 | | U_CDECL_BEGIN |
398 | | static const char * U_CALLCONV |
399 | | _ISO2022getName(const UConverter* cnv); |
400 | | U_CDECL_END |
401 | | |
402 | | static void U_CALLCONV |
403 | | _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err); |
404 | | |
405 | | U_CDECL_BEGIN |
406 | | static UConverter * U_CALLCONV |
407 | | _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); |
408 | | |
409 | | U_CDECL_END |
410 | | |
411 | | #ifdef U_ENABLE_GENERIC_ISO_2022 |
412 | | static void U_CALLCONV |
413 | | T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err); |
414 | | #endif |
415 | | |
416 | | namespace { |
417 | | |
418 | | /*const UConverterSharedData _ISO2022Data;*/ |
419 | | extern const UConverterSharedData _ISO2022JPData; |
420 | | |
421 | | #if !UCONFIG_ONLY_HTML_CONVERSION |
422 | | extern const UConverterSharedData _ISO2022KRData; |
423 | | extern const UConverterSharedData _ISO2022CNData; |
424 | | #endif |
425 | | |
426 | | } // namespace |
427 | | |
428 | | /*************** Converter implementations ******************/ |
429 | | |
430 | | /* The purpose of this function is to get around gcc compiler warnings. */ |
431 | | static inline void |
432 | | fromUWriteUInt8(UConverter *cnv, |
433 | | const char *bytes, int32_t length, |
434 | | uint8_t **target, const char *targetLimit, |
435 | | int32_t **offsets, |
436 | | int32_t sourceIndex, |
437 | | UErrorCode *pErrorCode) |
438 | 0 | { |
439 | 0 | char *targetChars = (char *)*target; |
440 | 0 | ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, |
441 | 0 | offsets, sourceIndex, pErrorCode); |
442 | 0 | *target = (uint8_t*)targetChars; |
443 | |
|
444 | 0 | } |
445 | | |
446 | | static inline void |
447 | 0 | setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){ |
448 | 0 | if(myConverterData->version == 1) { |
449 | 0 | UConverter *cnv = myConverterData->currentConverter; |
450 | |
|
451 | 0 | cnv->toUnicodeStatus=0; /* offset */ |
452 | 0 | cnv->mode=0; /* state */ |
453 | 0 | cnv->toULength=0; /* byteIndex */ |
454 | 0 | } |
455 | 0 | } |
456 | | |
457 | | static inline void |
458 | 0 | setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){ |
459 | | /* in ISO-2022-KR the designator sequence appears only once |
460 | | * in a file so we append it only once |
461 | | */ |
462 | 0 | if( converter->charErrorBufferLength==0){ |
463 | |
|
464 | 0 | converter->charErrorBufferLength = 4; |
465 | 0 | converter->charErrorBuffer[0] = 0x1b; |
466 | 0 | converter->charErrorBuffer[1] = 0x24; |
467 | 0 | converter->charErrorBuffer[2] = 0x29; |
468 | 0 | converter->charErrorBuffer[3] = 0x43; |
469 | 0 | } |
470 | 0 | if(myConverterData->version == 1) { |
471 | 0 | UConverter *cnv = myConverterData->currentConverter; |
472 | |
|
473 | 0 | cnv->fromUChar32=0; |
474 | 0 | cnv->fromUnicodeStatus=1; /* prevLength */ |
475 | 0 | } |
476 | 0 | } |
477 | | |
478 | | static void U_CALLCONV |
479 | 0 | _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ |
480 | |
|
481 | 0 | char myLocale[6]={' ',' ',' ',' ',' ',' '}; |
482 | |
|
483 | 0 | cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); |
484 | 0 | if(cnv->extraInfo != NULL) { |
485 | 0 | UConverterNamePieces stackPieces; |
486 | 0 | UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER; |
487 | 0 | UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; |
488 | 0 | uint32_t version; |
489 | |
|
490 | 0 | stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable; |
491 | |
|
492 | 0 | uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); |
493 | 0 | myConverterData->currentType = ASCII1; |
494 | 0 | cnv->fromUnicodeStatus =FALSE; |
495 | 0 | if(pArgs->locale){ |
496 | 0 | uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)); |
497 | 0 | } |
498 | 0 | version = pArgs->options & UCNV_OPTIONS_VERSION_MASK; |
499 | 0 | myConverterData->version = version; |
500 | 0 | if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') && |
501 | 0 | (myLocale[2]=='_' || myLocale[2]=='\0')) |
502 | 0 | { |
503 | | /* open the required converters and cache them */ |
504 | 0 | if(version>MAX_JA_VERSION) { |
505 | | // ICU 55 fails to open a converter for an unsupported version. |
506 | | // Previously, it fell back to version 0, but that would yield |
507 | | // unexpected behavior. |
508 | 0 | *errorCode = U_MISSING_RESOURCE_ERROR; |
509 | 0 | return; |
510 | 0 | } |
511 | 0 | if(jpCharsetMasks[version]&CSM(ISO8859_7)) { |
512 | 0 | myConverterData->myConverterArray[ISO8859_7] = |
513 | 0 | ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode); |
514 | 0 | } |
515 | 0 | myConverterData->myConverterArray[JISX208] = |
516 | 0 | ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode); |
517 | 0 | if(jpCharsetMasks[version]&CSM(JISX212)) { |
518 | 0 | myConverterData->myConverterArray[JISX212] = |
519 | 0 | ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode); |
520 | 0 | } |
521 | 0 | if(jpCharsetMasks[version]&CSM(GB2312)) { |
522 | 0 | myConverterData->myConverterArray[GB2312] = |
523 | 0 | ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */ |
524 | 0 | } |
525 | 0 | if(jpCharsetMasks[version]&CSM(KSC5601)) { |
526 | 0 | myConverterData->myConverterArray[KSC5601] = |
527 | 0 | ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode); |
528 | 0 | } |
529 | | |
530 | | /* set the function pointers to appropriate funtions */ |
531 | 0 | cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); |
532 | 0 | uprv_strcpy(myConverterData->locale,"ja"); |
533 | |
|
534 | 0 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); |
535 | 0 | size_t len = uprv_strlen(myConverterData->name); |
536 | 0 | myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); |
537 | 0 | myConverterData->name[len+1]='\0'; |
538 | 0 | } |
539 | 0 | #if !UCONFIG_ONLY_HTML_CONVERSION |
540 | 0 | else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') && |
541 | 0 | (myLocale[2]=='_' || myLocale[2]=='\0')) |
542 | 0 | { |
543 | 0 | if(version>1) { |
544 | | // ICU 55 fails to open a converter for an unsupported version. |
545 | | // Previously, it fell back to version 0, but that would yield |
546 | | // unexpected behavior. |
547 | 0 | *errorCode = U_MISSING_RESOURCE_ERROR; |
548 | 0 | return; |
549 | 0 | } |
550 | 0 | const char *cnvName; |
551 | 0 | if(version==1) { |
552 | 0 | cnvName="icu-internal-25546"; |
553 | 0 | } else { |
554 | 0 | cnvName="ibm-949"; |
555 | 0 | myConverterData->version=version=0; |
556 | 0 | } |
557 | 0 | if(pArgs->onlyTestIsLoadable) { |
558 | 0 | ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */ |
559 | 0 | uprv_free(cnv->extraInfo); |
560 | 0 | cnv->extraInfo=NULL; |
561 | 0 | return; |
562 | 0 | } else { |
563 | 0 | myConverterData->currentConverter=ucnv_open(cnvName, errorCode); |
564 | 0 | if (U_FAILURE(*errorCode)) { |
565 | 0 | _ISO2022Close(cnv); |
566 | 0 | return; |
567 | 0 | } |
568 | | |
569 | 0 | if(version==1) { |
570 | 0 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1"); |
571 | 0 | uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); |
572 | 0 | cnv->subCharLen = myConverterData->currentConverter->subCharLen; |
573 | 0 | }else{ |
574 | 0 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0"); |
575 | 0 | } |
576 | | |
577 | | /* initialize the state variables */ |
578 | 0 | setInitialStateToUnicodeKR(cnv, myConverterData); |
579 | 0 | setInitialStateFromUnicodeKR(cnv, myConverterData); |
580 | | |
581 | | /* set the function pointers to appropriate funtions */ |
582 | 0 | cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData; |
583 | 0 | uprv_strcpy(myConverterData->locale,"ko"); |
584 | 0 | } |
585 | 0 | } |
586 | 0 | else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&& |
587 | 0 | (myLocale[2]=='_' || myLocale[2]=='\0')) |
588 | 0 | { |
589 | 0 | if(version>2) { |
590 | | // ICU 55 fails to open a converter for an unsupported version. |
591 | | // Previously, it fell back to version 0, but that would yield |
592 | | // unexpected behavior. |
593 | 0 | *errorCode = U_MISSING_RESOURCE_ERROR; |
594 | 0 | return; |
595 | 0 | } |
596 | | |
597 | | /* open the required converters and cache them */ |
598 | 0 | myConverterData->myConverterArray[GB2312_1] = |
599 | 0 | ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); |
600 | 0 | if(version==1) { |
601 | 0 | myConverterData->myConverterArray[ISO_IR_165] = |
602 | 0 | ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode); |
603 | 0 | } |
604 | 0 | myConverterData->myConverterArray[CNS_11643] = |
605 | 0 | ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode); |
606 | | |
607 | | |
608 | | /* set the function pointers to appropriate funtions */ |
609 | 0 | cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData; |
610 | 0 | uprv_strcpy(myConverterData->locale,"cn"); |
611 | |
|
612 | 0 | if (version==0){ |
613 | 0 | myConverterData->version = 0; |
614 | 0 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0"); |
615 | 0 | }else if (version==1){ |
616 | 0 | myConverterData->version = 1; |
617 | 0 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1"); |
618 | 0 | }else { |
619 | 0 | myConverterData->version = 2; |
620 | 0 | (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2"); |
621 | 0 | } |
622 | 0 | } |
623 | 0 | #endif // !UCONFIG_ONLY_HTML_CONVERSION |
624 | 0 | else{ |
625 | | #ifdef U_ENABLE_GENERIC_ISO_2022 |
626 | | myConverterData->isFirstBuffer = TRUE; |
627 | | |
628 | | /* append the UTF-8 escape sequence */ |
629 | | cnv->charErrorBufferLength = 3; |
630 | | cnv->charErrorBuffer[0] = 0x1b; |
631 | | cnv->charErrorBuffer[1] = 0x25; |
632 | | cnv->charErrorBuffer[2] = 0x42; |
633 | | |
634 | | cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; |
635 | | /* initialize the state variables */ |
636 | | uprv_strcpy(myConverterData->name,"ISO_2022"); |
637 | | #else |
638 | 0 | *errorCode = U_MISSING_RESOURCE_ERROR; |
639 | | // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard |
640 | | // data loading error code. |
641 | 0 | return; |
642 | 0 | #endif |
643 | 0 | } |
644 | | |
645 | 0 | cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; |
646 | |
|
647 | 0 | if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { |
648 | 0 | _ISO2022Close(cnv); |
649 | 0 | } |
650 | 0 | } else { |
651 | 0 | *errorCode = U_MEMORY_ALLOCATION_ERROR; |
652 | 0 | } |
653 | 0 | } |
654 | | |
655 | | |
656 | | static void U_CALLCONV |
657 | 0 | _ISO2022Close(UConverter *converter) { |
658 | 0 | UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo); |
659 | 0 | UConverterSharedData **array = myData->myConverterArray; |
660 | 0 | int32_t i; |
661 | |
|
662 | 0 | if (converter->extraInfo != NULL) { |
663 | | /*close the array of converter pointers and free the memory*/ |
664 | 0 | for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { |
665 | 0 | if(array[i]!=NULL) { |
666 | 0 | ucnv_unloadSharedDataIfReady(array[i]); |
667 | 0 | } |
668 | 0 | } |
669 | |
|
670 | 0 | ucnv_close(myData->currentConverter); |
671 | |
|
672 | 0 | if(!converter->isExtraLocal){ |
673 | 0 | uprv_free (converter->extraInfo); |
674 | 0 | converter->extraInfo = NULL; |
675 | 0 | } |
676 | 0 | } |
677 | 0 | } |
678 | | |
679 | | static void U_CALLCONV |
680 | 0 | _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { |
681 | 0 | UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo); |
682 | 0 | if(choice<=UCNV_RESET_TO_UNICODE) { |
683 | 0 | uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); |
684 | 0 | myConverterData->key = 0; |
685 | 0 | myConverterData->isEmptySegment = FALSE; |
686 | 0 | } |
687 | 0 | if(choice!=UCNV_RESET_TO_UNICODE) { |
688 | 0 | uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); |
689 | 0 | } |
690 | | #ifdef U_ENABLE_GENERIC_ISO_2022 |
691 | | if(myConverterData->locale[0] == 0){ |
692 | | if(choice<=UCNV_RESET_TO_UNICODE) { |
693 | | myConverterData->isFirstBuffer = TRUE; |
694 | | myConverterData->key = 0; |
695 | | if (converter->mode == UCNV_SO){ |
696 | | ucnv_close (myConverterData->currentConverter); |
697 | | myConverterData->currentConverter=NULL; |
698 | | } |
699 | | converter->mode = UCNV_SI; |
700 | | } |
701 | | if(choice!=UCNV_RESET_TO_UNICODE) { |
702 | | /* re-append UTF-8 escape sequence */ |
703 | | converter->charErrorBufferLength = 3; |
704 | | converter->charErrorBuffer[0] = 0x1b; |
705 | | converter->charErrorBuffer[1] = 0x28; |
706 | | converter->charErrorBuffer[2] = 0x42; |
707 | | } |
708 | | } |
709 | | else |
710 | | #endif |
711 | 0 | { |
712 | | /* reset the state variables */ |
713 | 0 | if(myConverterData->locale[0] == 'k'){ |
714 | 0 | if(choice<=UCNV_RESET_TO_UNICODE) { |
715 | 0 | setInitialStateToUnicodeKR(converter, myConverterData); |
716 | 0 | } |
717 | 0 | if(choice!=UCNV_RESET_TO_UNICODE) { |
718 | 0 | setInitialStateFromUnicodeKR(converter, myConverterData); |
719 | 0 | } |
720 | 0 | } |
721 | 0 | } |
722 | 0 | } |
723 | | |
724 | | U_CDECL_BEGIN |
725 | | |
726 | | static const char * U_CALLCONV |
727 | 0 | _ISO2022getName(const UConverter* cnv){ |
728 | 0 | if(cnv->extraInfo){ |
729 | 0 | UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo; |
730 | 0 | return myData->name; |
731 | 0 | } |
732 | 0 | return NULL; |
733 | 0 | } |
734 | | |
735 | | U_CDECL_END |
736 | | |
737 | | |
738 | | /*************** to unicode *******************/ |
739 | | /**************************************************************************** |
740 | | * Recognized escape sequences are |
741 | | * <ESC>(B ASCII |
742 | | * <ESC>.A ISO-8859-1 |
743 | | * <ESC>.F ISO-8859-7 |
744 | | * <ESC>(J JISX-201 |
745 | | * <ESC>(I JISX-201 |
746 | | * <ESC>$B JISX-208 |
747 | | * <ESC>$@ JISX-208 |
748 | | * <ESC>$(D JISX-212 |
749 | | * <ESC>$A GB2312 |
750 | | * <ESC>$(C KSC5601 |
    |   | * |
    |   | * nextStateToUnicodeJP[] is indexed by the escape-sequence table offset that |
    |   | * getKey_2022() reports; changeState_2022() reads the entry as a StateEnum. |
    |   | * An INVALID_STATE entry makes changeState_2022() report |
    |   | * U_UNSUPPORTED_ESCAPE_SEQUENCE for the ISO_2022_JP variant. |
    |   | * The order of the entries must therefore not be changed. |
751 | | */ |
752 | | static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= { |
753 | | /* 0 1 2 3 4 5 6 7 8 9 */ |
754 | | INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
755 | | ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE |
756 | | ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
757 | | ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE |
758 | | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
759 | | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
760 | | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
761 | | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
762 | | }; |
763 | | |
764 | | #if !UCONFIG_ONLY_HTML_CONVERSION |
765 | | /*************** to unicode: ISO-2022-CN table, indexed by the offset from getKey_2022() and read by changeState_2022() as a StateEnum; entry order must not change *******************/ |
766 | | static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= { |
767 | | /* 0 1 2 3 4 5 6 7 8 9 */ |
768 | | INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
769 | | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
770 | | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
771 | | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
772 | | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165 |
773 | | ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
774 | | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
775 | | ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
776 | | }; |
777 | | #endif |
778 | | |
779 | | |
780 | | static UCNV_TableStates_2022 |
781 | 0 | getKey_2022(char c,int32_t* key,int32_t* offset){ |
782 | 0 | int32_t togo; |
783 | 0 | int32_t low = 0; |
784 | 0 | int32_t hi = MAX_STATES_2022; |
785 | 0 | int32_t oldmid=0; |
786 | |
|
787 | 0 | togo = normalize_esq_chars_2022[(uint8_t)c]; |
788 | 0 | if(togo == 0) { |
789 | | /* not a valid character anywhere in an escape sequence */ |
790 | 0 | *key = 0; |
791 | 0 | *offset = 0; |
792 | 0 | return INVALID_2022; |
793 | 0 | } |
794 | 0 | togo = (*key << 5) + togo; |
795 | |
|
796 | 0 | while (hi != low) /*binary search*/{ |
797 | |
|
798 | 0 | int32_t mid = (hi+low) >> 1; /*Finds median*/ |
799 | |
|
800 | 0 | if (mid == oldmid) |
801 | 0 | break; |
802 | | |
803 | 0 | if (escSeqStateTable_Key_2022[mid] > togo){ |
804 | 0 | hi = mid; |
805 | 0 | } |
806 | 0 | else if (escSeqStateTable_Key_2022[mid] < togo){ |
807 | 0 | low = mid; |
808 | 0 | } |
809 | 0 | else /*we found it*/{ |
810 | 0 | *key = togo; |
811 | 0 | *offset = mid; |
812 | 0 | return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid]; |
813 | 0 | } |
814 | 0 | oldmid = mid; |
815 | |
|
816 | 0 | } |
817 | | |
818 | 0 | *key = 0; |
819 | 0 | *offset = 0; |
820 | 0 | return INVALID_2022; |
821 | 0 | } |
822 | | |
823 | | /*runs through a state machine to determine the escape sequence - codepage correspondence |
    |   | * |
    |   | * Reads bytes from *source (advancing it) up to sourceLimit, appends each one to |
    |   | * _this->toUBytes and feeds it to getKey_2022() until the sequence is terminal, |
    |   | * invalid, or the input is exhausted. |
    |   | * |
    |   | * - Incomplete sequence: the accumulated key is stored in the converter data |
    |   | *   (key!=0) and the function returns without setting an error. |
    |   | * - Complete sequence: the to-Unicode state (toU2022State) is updated according |
    |   | *   to var; on success _this->toULength is reset to 0. |
    |   | * - U_ILLEGAL_ESCAPE_SEQUENCE: only the first byte stays in toUBytes; the other |
    |   | *   bytes are backed out of *source, or stored in preToU for replay when they |
    |   | *   came from an earlier buffer. |
    |   | * - U_UNSUPPORTED_ESCAPE_SEQUENCE: toUCallbackReason is set to UCNV_UNASSIGNED. |
    |   | * |
    |   | * @param _this       converter whose extraInfo is the UConverterDataISO2022 |
    |   | * @param source      in/out pointer to the current input position |
    |   | * @param sourceLimit end of the input |
    |   | * @param var         ISO-2022 variant that selects the state table |
    |   | * @param err         receives the error code |
824 | | */ |
825 | | static void |
826 | | changeState_2022(UConverter* _this, |
827 | | const char** source, |
828 | | const char* sourceLimit, |
829 | | Variant2022 var, |
830 | 0 | UErrorCode* err){ |
831 | 0 | UCNV_TableStates_2022 value; |
832 | 0 | UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo); |
833 | 0 | uint32_t key = myData2022->key; |
834 | 0 | int32_t offset = 0; |
835 | 0 | int8_t initialToULength = _this->toULength; /* number of bytes already in toUBytes on entry */ |
836 | 0 | char c; |
837 |

838 | 0 | value = VALID_NON_TERMINAL_2022; |
839 | 0 | while (*source < sourceLimit) { |
840 | 0 | c = *(*source)++; |
841 | 0 | _this->toUBytes[_this->toULength++]=(uint8_t)c; |
842 | 0 | value = getKey_2022(c,(int32_t *) &key, &offset); |
843 |

844 | 0 | switch (value){ |
845 | | |
846 | 0 | case VALID_NON_TERMINAL_2022 : |
847 | | /* continue with the loop */ |
848 | 0 | break; |
849 | | |
850 | 0 | case VALID_TERMINAL_2022: |
851 | 0 | key = 0; |
852 | 0 | goto DONE; |
853 | | |
854 | 0 | case INVALID_2022: |
855 | 0 | goto DONE; |
856 | | |
857 | 0 | case VALID_MAYBE_TERMINAL_2022: |
858 | | #ifdef U_ENABLE_GENERIC_ISO_2022 |
859 | | /* ESC ( B is ambiguous only for ISO_2022 itself */ |
860 | | if(var == ISO_2022) { |
861 | | /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */ |
862 | | _this->toULength = 0; |
863 | | |
864 | | /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */ |
865 | | |
866 | | /* continue with the loop */ |
867 | | value = VALID_NON_TERMINAL_2022; |
868 | | break; |
869 | | } else |
870 | | #endif |
871 | 0 | { |
872 | | /* not ISO_2022 itself, finish here */ |
873 | 0 | value = VALID_TERMINAL_2022; |
874 | 0 | key = 0; |
875 | 0 | goto DONE; |
876 | 0 | } |
877 | 0 | } |
878 | 0 | } |
879 | | |
880 | 0 | DONE: |
881 | 0 | myData2022->key = key; |
882 |

883 | 0 | if (value == VALID_NON_TERMINAL_2022) { |
884 | | /* indicate that the escape sequence is incomplete: key!=0 */ |
885 | 0 | return; |
886 | 0 | } else if (value == INVALID_2022 ) { |
887 | 0 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
888 | 0 | } else /* value == VALID_TERMINAL_2022 */ { |
889 | 0 | switch(var){ |
890 | | #ifdef U_ENABLE_GENERIC_ISO_2022 |
891 | | case ISO_2022: |
892 | | { |
893 | | const char *chosenConverterName = escSeqStateTable_Result_2022[offset]; |
894 | | if(chosenConverterName == NULL) { |
895 | | /* SS2 or SS3 */ |
896 | | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
897 | | _this->toUCallbackReason = UCNV_UNASSIGNED; |
898 | | return; |
899 | | } |
900 | | |
901 | | _this->mode = UCNV_SI; |
902 | | ucnv_close(myData2022->currentConverter); |
903 | | myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err); /* NOTE(review): no local named myUConverter is declared in this function - confirm before re-enabling this branch */ |
904 | | if(U_SUCCESS(*err)) { |
905 | | myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; |
906 | | _this->mode = UCNV_SO; |
907 | | } |
908 | | break; |
909 | | } |
910 | | #endif |
911 | 0 | case ISO_2022_JP: |
912 | 0 | { |
913 | 0 | StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset]; |
914 | 0 | switch(tempState) { |
915 | 0 | case INVALID_STATE: |
916 | 0 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
917 | 0 | break; |
918 | 0 | case SS2_STATE: |
919 | 0 | if(myData2022->toU2022State.cs[2]!=0) { |
920 | 0 | if(myData2022->toU2022State.g<2) { |
921 | 0 | myData2022->toU2022State.prevG=myData2022->toU2022State.g; |
922 | 0 | } |
923 | 0 | myData2022->toU2022State.g=2; |
924 | 0 | } else { |
925 | | /* illegal to have SS2 before a matching designator */ |
926 | 0 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
927 | 0 | } |
928 | 0 | break; |
929 | | /* case SS3_STATE: not used in ISO-2022-JP-x */ |
930 | 0 | case ISO8859_1: |
931 | 0 | case ISO8859_7: |
932 | 0 | if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { |
933 | 0 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
934 | 0 | } else { |
935 | | /* G2 charset for SS2 */ |
936 | 0 | myData2022->toU2022State.cs[2]=(int8_t)tempState; |
937 | 0 | } |
938 | 0 | break; |
939 | 0 | default: |
940 | 0 | if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) { |
941 | 0 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
942 | 0 | } else { |
943 | | /* G0 charset */ |
944 | 0 | myData2022->toU2022State.cs[0]=(int8_t)tempState; |
945 | 0 | } |
946 | 0 | break; |
947 | 0 | } |
948 | 0 | } |
949 | 0 | break; |
950 | 0 | #if !UCONFIG_ONLY_HTML_CONVERSION |
951 | 0 | case ISO_2022_CN: |
952 | 0 | { |
953 | 0 | StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset]; |
954 | 0 | switch(tempState) { |
955 | 0 | case INVALID_STATE: |
956 | 0 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
957 | 0 | break; |
958 | 0 | case SS2_STATE: |
959 | 0 | if(myData2022->toU2022State.cs[2]!=0) { |
960 | 0 | if(myData2022->toU2022State.g<2) { |
961 | 0 | myData2022->toU2022State.prevG=myData2022->toU2022State.g; |
962 | 0 | } |
963 | 0 | myData2022->toU2022State.g=2; |
964 | 0 | } else { |
965 | | /* illegal to have SS2 before a matching designator */ |
966 | 0 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
967 | 0 | } |
968 | 0 | break; |
969 | 0 | case SS3_STATE: |
970 | 0 | if(myData2022->toU2022State.cs[3]!=0) { |
971 | 0 | if(myData2022->toU2022State.g<2) { |
972 | 0 | myData2022->toU2022State.prevG=myData2022->toU2022State.g; |
973 | 0 | } |
974 | 0 | myData2022->toU2022State.g=3; |
975 | 0 | } else { |
976 | | /* illegal to have SS3 before a matching designator */ |
977 | 0 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
978 | 0 | } |
979 | 0 | break; |
980 | 0 | case ISO_IR_165: |
981 | 0 | if(myData2022->version==0) { |
982 | 0 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
983 | 0 | break; |
984 | 0 | } |
985 | 0 | U_FALLTHROUGH; |
986 | 0 | case GB2312_1: |
987 | 0 | U_FALLTHROUGH; |
988 | 0 | case CNS_11643_1: |
989 | 0 | myData2022->toU2022State.cs[1]=(int8_t)tempState; |
990 | 0 | break; |
991 | 0 | case CNS_11643_2: |
992 | 0 | myData2022->toU2022State.cs[2]=(int8_t)tempState; |
993 | 0 | break; |
994 | 0 | default: |
995 | | /* other CNS 11643 planes */ |
996 | 0 | if(myData2022->version==0) { |
997 | 0 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
998 | 0 | } else { |
999 | 0 | myData2022->toU2022State.cs[3]=(int8_t)tempState; |
1000 | 0 | } |
1001 | 0 | break; |
1002 | 0 | } |
1003 | 0 | } |
1004 | 0 | break; |
1005 | 0 | case ISO_2022_KR: |
1006 | 0 | if(offset==0x30){ |
1007 | | /* nothing to be done, just accept this one escape sequence */ |
1008 | 0 | } else { |
1009 | 0 | *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
1010 | 0 | } |
1011 | 0 | break; |
1012 | 0 | #endif // !UCONFIG_ONLY_HTML_CONVERSION |
1013 | | |
1014 | 0 | default: |
1015 | 0 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
1016 | 0 | break; |
1017 | 0 | } |
1018 | 0 | } |
1019 | 0 | if(U_SUCCESS(*err)) { |
1020 | 0 | _this->toULength = 0; |
1021 | 0 | } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { |
1022 | 0 | if(_this->toULength>1) { |
1023 | | /* |
1024 | | * Ticket 5691: consistent illegal sequences: |
1025 | | * - We include at least the first byte (ESC) in the illegal sequence. |
1026 | | * - If any of the non-initial bytes could be the start of a character, |
1027 | | * we stop the illegal sequence before the first one of those. |
1028 | | * In escape sequences, all following bytes are "printable", that is, |
1029 | | * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS), |
1030 | | * they are valid single/lead bytes. |
1031 | | * For simplicity, we always only report the initial ESC byte as the |
1032 | | * illegal sequence and back out all other bytes we looked at. |
1033 | | */ |
1034 | | /* Back out some bytes. */ |
1035 | 0 | int8_t backOutDistance=_this->toULength-1; |
1036 | 0 | int8_t bytesFromThisBuffer=_this->toULength-initialToULength; |
1037 | 0 | if(backOutDistance<=bytesFromThisBuffer) { |
1038 | | /* same as initialToULength<=1 */ |
1039 | 0 | *source-=backOutDistance; |
1040 | 0 | } else { |
1041 | | /* Back out bytes from the previous buffer: Need to replay them. */ |
1042 | 0 | _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance); |
1043 | | /* same as -(initialToULength-1) */ |
1044 | | /* preToULength is negative! */ |
1045 | 0 | uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength); |
1046 | 0 | *source-=bytesFromThisBuffer; |
1047 | 0 | } |
1048 | 0 | _this->toULength=1; |
1049 | 0 | } |
1050 | 0 | } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { |
1051 | 0 | _this->toUCallbackReason = UCNV_UNASSIGNED; |
1052 | 0 | } |
1053 | 0 | } |
1054 | | |
1055 | | #if !UCONFIG_ONLY_HTML_CONVERSION |
1056 | | /*Checks the characters of the buffer against valid 2022 escape sequences |
1057 | | *if the match we return a pointer to the initial start of the sequence otherwise |
1058 | | *we return sourceLimit |
1059 | | */ |
1060 | | /*for 2022 looks ahead in the stream |
1061 | | *to determine the longest possible convertible |
1062 | | *data stream |
1063 | | */ |
1064 | | static inline const char* |
1065 | | getEndOfBuffer_2022(const char** source, |
1066 | | const char* sourceLimit, |
1067 | 0 | UBool /*flush*/){ |
1068 | |
|
1069 | 0 | const char* mySource = *source; |
1070 | |
|
1071 | | #ifdef U_ENABLE_GENERIC_ISO_2022 |
1072 | | if (*source >= sourceLimit) |
1073 | | return sourceLimit; |
1074 | | |
1075 | | do{ |
1076 | | |
1077 | | if (*mySource == ESC_2022){ |
1078 | | int8_t i; |
1079 | | int32_t key = 0; |
1080 | | int32_t offset; |
1081 | | UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022; |
1082 | | |
1083 | | /* Kludge: I could not |
1084 | | * figure out the reason for validating an escape sequence |
1085 | | * twice - once here and once in changeState_2022(). |
1086 | | * is it possible to have an ESC character in a ISO2022 |
1087 | | * byte stream which is valid in a code page? Is it legal? |
1088 | | */ |
1089 | | for (i=0; |
1090 | | (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022); |
1091 | | i++) { |
1092 | | value = getKey_2022(*(mySource+i), &key, &offset); |
1093 | | } |
1094 | | if (value > 0 || *mySource==ESC_2022) |
1095 | | return mySource; |
1096 | | |
1097 | | if ((value == VALID_NON_TERMINAL_2022)&&(!flush) ) |
1098 | | return sourceLimit; |
1099 | | } |
1100 | | }while (++mySource < sourceLimit); |
1101 | | |
1102 | | return sourceLimit; |
1103 | | #else |
1104 | 0 | while(mySource < sourceLimit && *mySource != ESC_2022) { |
1105 | 0 | ++mySource; |
1106 | 0 | } |
1107 | 0 | return mySource; |
1108 | 0 | #endif |
1109 | 0 | } |
1110 | | #endif |
1111 | | |
1112 | | /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c |
1113 | | * any future change in _MBCSFromUChar32() function should be reflected here. |
1114 | | * @return number of bytes in *value; negative number if fallback; 0 if no mapping |
1115 | | */ |
1116 | | static inline int32_t |
1117 | | MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, |
1118 | | UChar32 c, |
1119 | | uint32_t* value, |
1120 | | UBool useFallback, |
1121 | | int outputType) |
1122 | 0 | { |
1123 | 0 | const int32_t *cx; |
1124 | 0 | const uint16_t *table; |
1125 | 0 | uint32_t stage2Entry; |
1126 | 0 | uint32_t myValue; |
1127 | 0 | int32_t length; |
1128 | 0 | const uint8_t *p; |
1129 | | /* |
1130 | | * TODO(markus): Use and require new, faster MBCS conversion table structures. |
1131 | | * Use internal version of ucnv_open() that verifies that the new structures are available, |
1132 | | * else U_INTERNAL_PROGRAM_ERROR. |
1133 | | */ |
1134 | | /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ |
1135 | 0 | if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { |
1136 | 0 | table=sharedData->mbcs.fromUnicodeTable; |
1137 | 0 | stage2Entry=MBCS_STAGE_2_FROM_U(table, c); |
1138 | | /* get the bytes and the length for the output */ |
1139 | 0 | if(outputType==MBCS_OUTPUT_2){ |
1140 | 0 | myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); |
1141 | 0 | if(myValue<=0xff) { |
1142 | 0 | length=1; |
1143 | 0 | } else { |
1144 | 0 | length=2; |
1145 | 0 | } |
1146 | 0 | } else /* outputType==MBCS_OUTPUT_3 */ { |
1147 | 0 | p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); |
1148 | 0 | myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; |
1149 | 0 | if(myValue<=0xff) { |
1150 | 0 | length=1; |
1151 | 0 | } else if(myValue<=0xffff) { |
1152 | 0 | length=2; |
1153 | 0 | } else { |
1154 | 0 | length=3; |
1155 | 0 | } |
1156 | 0 | } |
1157 | | /* is this code point assigned, or do we use fallbacks? */ |
1158 | 0 | if((stage2Entry&(1<<(16+(c&0xf))))!=0) { |
1159 | | /* assigned */ |
1160 | 0 | *value=myValue; |
1161 | 0 | return length; |
1162 | 0 | } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { |
1163 | | /* |
1164 | | * We allow a 0 byte output if the "assigned" bit is set for this entry. |
1165 | | * There is no way with this data structure for fallback output |
1166 | | * to be a zero byte. |
1167 | | */ |
1168 | 0 | *value=myValue; |
1169 | 0 | return -length; |
1170 | 0 | } |
1171 | 0 | } |
1172 | | |
1173 | 0 | cx=sharedData->mbcs.extIndexes; |
1174 | 0 | if(cx!=NULL) { |
1175 | 0 | return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); |
1176 | 0 | } |
1177 | | |
1178 | | /* unassigned */ |
1179 | 0 | return 0; |
1180 | 0 | } |
1181 | | |
1182 | | /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c |
1183 | | * any future change in _MBCSSingleFromUChar32() function should be reflected here. |
1184 | | * @param retval pointer to output byte |
1185 | | * @return 1 roundtrip byte 0 no mapping -1 fallback byte |
1186 | | */ |
1187 | | static inline int32_t |
1188 | | MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, |
1189 | | UChar32 c, |
1190 | | uint32_t* retval, |
1191 | | UBool useFallback) |
1192 | 0 | { |
1193 | 0 | const uint16_t *table; |
1194 | 0 | int32_t value; |
1195 | | /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ |
1196 | 0 | if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { |
1197 | 0 | return 0; |
1198 | 0 | } |
1199 | | /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ |
1200 | 0 | table=sharedData->mbcs.fromUnicodeTable; |
1201 | | /* get the byte for the output */ |
1202 | 0 | value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c); |
1203 | | /* is this code point assigned, or do we use fallbacks? */ |
1204 | 0 | *retval=(uint32_t)(value&0xff); |
1205 | 0 | if(value>=0xf00) { |
1206 | 0 | return 1; /* roundtrip */ |
1207 | 0 | } else if(useFallback ? value>=0x800 : value>=0xc00) { |
1208 | 0 | return -1; /* fallback taken */ |
1209 | 0 | } else { |
1210 | 0 | return 0; /* no mapping */ |
1211 | 0 | } |
1212 | 0 | } |
1213 | | |
/*
 * Check that the result is a 2-byte value with each byte in the range A1..FE
 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
 * to move it to the ISO 2022 range 21..7E.
 * Return 0 if out of range.
 *
 * Values wider than two bytes are rejected as well: the lead byte is taken
 * as everything above the low 8 bits, so any value above 0xFEFF fails the
 * range check instead of being silently truncated to its low 16 bits.
 *
 * @param value codepage bytes packed into one integer (lead byte in bits 15..8)
 * @return the value shifted down to the 21..7E byte range, or 0 if invalid
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint32_t lead = value >> 8;
    const uint32_t trail = value & 0xff;

    if( (0xa1 <= lead && lead <= 0xfe) &&
        (0xa1 <= trail && trail <= 0xfe)
    ) {
        return value - 0x8080; /* shift down to 21..7e byte range */
    } else {
        return 0; /* not valid for ISO 2022 */
    }
}
1230 | | |
1231 | | #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */ |
1232 | | /* |
1233 | | * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the |
1234 | | * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point |
1235 | | * unchanged. |
     |   | * |
     |   | * This function is compiled out by the surrounding #if 0. |
1236 | | */ |
1237 | | static inline uint32_t |
1238 | | _2022ToGR94DBCS(uint32_t value) { |
1239 | | uint32_t returnValue = value + 0x8080; |
1240 | | if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) && |
1241 | | (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) { |
1242 | | return returnValue; |
1243 | | } else { |
1244 | | return value; |
1245 | | } |
1246 | | } |
1247 | | #endif |
1248 | | |
1249 | | #ifdef U_ENABLE_GENERIC_ISO_2022 |
1250 | | |
1251 | | /********************************************************************************** |
1252 | | * ISO-2022 Converter |
1253 | | * |
     |   | * To-Unicode function of the generic ISO-2022 converter; only compiled when |
     |   | * U_ENABLE_GENERIC_ISO_2022 is defined. |
     |   | * |
     |   | * While input remains: if no escape sequence is in progress (key==0), the bytes |
     |   | * up to the limit returned by getEndOfBuffer_2022() are converted with |
     |   | * myData->currentConverter (opened as "ASCII" if it is still NULL). Overflow, |
     |   | * error and partial-input state of that sub-converter is copied into the |
     |   | * outer converter before returning. Otherwise changeState_2022() is called |
     |   | * with ISO_2022 to process the escape sequence. |
1254 | | * |
1255 | | */ |
1256 | | |
1257 | | static void U_CALLCONV |
1258 | | T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, |
1259 | | UErrorCode* err){ |
1260 | | const char* mySourceLimit, *realSourceLimit; |
1261 | | const char* sourceStart; |
1262 | | const UChar* myTargetStart; |
1263 | | UConverter* saveThis; |
1264 | | UConverterDataISO2022* myData; |
1265 | | int8_t length; |
1266 | | |
1267 | | saveThis = args->converter; |
1268 | | myData=((UConverterDataISO2022*)(saveThis->extraInfo)); |
1269 | | |
1270 | | realSourceLimit = args->sourceLimit; |
1271 | | while (args->source < realSourceLimit) { |
1272 | | if(myData->key == 0) { /* are we in the middle of an escape sequence? */ |
1273 | | /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ |
1274 | | mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush); |
1275 | | |
1276 | | if(args->source < mySourceLimit) { |
1277 | | if(myData->currentConverter==NULL) { |
1278 | | myData->currentConverter = ucnv_open("ASCII",err); |
1279 | | if(U_FAILURE(*err)){ |
1280 | | return; |
1281 | | } |
1282 | | |
1283 | | myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; |
1284 | | saveThis->mode = UCNV_SO; |
1285 | | } |
1286 | | |
1287 | | /* convert to before the ESC or until the end of the buffer */ |
1288 | | myData->isFirstBuffer=FALSE; |
1289 | | sourceStart = args->source; |
1290 | | myTargetStart = args->target; |
1291 | | args->converter = myData->currentConverter; |
1292 | | ucnv_toUnicode(args->converter, |
1293 | | &args->target, |
1294 | | args->targetLimit, |
1295 | | &args->source, |
1296 | | mySourceLimit, |
1297 | | args->offsets, |
1298 | | (UBool)(args->flush && mySourceLimit == realSourceLimit), |
1299 | | err); |
1300 | | args->converter = saveThis; |
1301 | | |
1302 | | if (*err == U_BUFFER_OVERFLOW_ERROR) { |
1303 | | /* move the overflow buffer */ |
1304 | | length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength; |
1305 | | myData->currentConverter->UCharErrorBufferLength = 0; |
1306 | | if(length > 0) { |
1307 | | uprv_memcpy(saveThis->UCharErrorBuffer, |
1308 | | myData->currentConverter->UCharErrorBuffer, |
1309 | | length*U_SIZEOF_UCHAR); |
1310 | | } |
1311 | | return; |
1312 | | } |
1313 | | |
1314 | | /* |
1315 | | * At least one of: |
1316 | | * -Error while converting |
1317 | | * -Done with entire buffer |
1318 | | * -Need to write offsets or update the current offset |
1319 | | * (leave that up to the code in ucnv.c) |
1320 | | * |
1321 | | * or else we just stopped at an ESC byte and continue with changeState_2022() |
1322 | | */ |
1323 | | if (U_FAILURE(*err) || |
1324 | | (args->source == realSourceLimit) || |
1325 | | (args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) || |
1326 | | (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0)) |
1327 | | ) { |
1328 | | /* copy partial or error input for truncated detection and error handling */ |
1329 | | if(U_FAILURE(*err)) { |
1330 | | length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength; |
1331 | | if(length > 0) { |
1332 | | uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length); |
1333 | | } |
1334 | | } else { |
1335 | | length = saveThis->toULength = myData->currentConverter->toULength; |
1336 | | if(length > 0) { |
1337 | | uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length); |
1338 | | if(args->source < mySourceLimit) { |
1339 | | *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */ |
1340 | | } |
1341 | | } |
1342 | | } |
1343 | | return; |
1344 | | } |
1345 | | } |
1346 | | } |
1347 | | |
1348 | | sourceStart = args->source; |
1349 | | changeState_2022(args->converter, |
1350 | | &(args->source), |
1351 | | realSourceLimit, |
1352 | | ISO_2022, |
1353 | | err); |
1354 | | if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) { |
1355 | | /* let the ucnv.c code update its current offset */ |
1356 | | return; |
1357 | | } |
1358 | | } |
1359 | | } |
1360 | | |
1361 | | #endif |
1362 | | |
1363 | | /* |
1364 | | * To Unicode Callback helper function |
1365 | | */ |
1366 | | static void |
1367 | | toUnicodeCallback(UConverter *cnv, |
1368 | | const uint32_t sourceChar, const uint32_t targetUniChar, |
1369 | 0 | UErrorCode* err){ |
1370 | 0 | if(sourceChar>0xff){ |
1371 | 0 | cnv->toUBytes[0] = (uint8_t)(sourceChar>>8); |
1372 | 0 | cnv->toUBytes[1] = (uint8_t)sourceChar; |
1373 | 0 | cnv->toULength = 2; |
1374 | 0 | } |
1375 | 0 | else{ |
1376 | 0 | cnv->toUBytes[0] =(char) sourceChar; |
1377 | 0 | cnv->toULength = 1; |
1378 | 0 | } |
1379 | |
|
1380 | 0 | if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ |
1381 | 0 | *err = U_INVALID_CHAR_FOUND; |
1382 | 0 | } |
1383 | 0 | else{ |
1384 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
1385 | 0 | } |
1386 | 0 | } |
1387 | | |
1388 | | /**************************************ISO-2022-JP*************************************************/ |
1389 | | |
1390 | | /************************************** IMPORTANT ************************************************** |
1391 | | * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and |
1392 | | * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32(). |
1393 | | * The converter iterates over each Unicode codepoint |
1394 | | * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is |
1395 | | * processed one char at a time it would make sense to reduce the extra processing a canned converter |
1396 | | * would do as far as possible. |
1397 | | * |
1398 | | * If the implementation of these macros or structure of sharedData struct change in the future, make |
1399 | | * sure that ISO-2022 is also changed. |
1400 | | *************************************************************************************************** |
1401 | | */ |
1402 | | |
1403 | | /*************************************************************************************************** |
1404 | | * Rules for ISO-2022-jp encoding |
1405 | | * (i) Escape sequences must be fully contained within a line they should not |
1406 | | * span new lines or CRs |
1407 | | * (ii) If the last character on a line is represented by two bytes then an ASCII or |
1408 | | * JIS-Roman character escape sequence should follow before the line terminates |
1409 | | * (iii) If the first character on the line is represented by two bytes then a two |
1410 | | * byte character escape sequence should precede it |
1411 | | * (iv) If no escape sequence is encountered then the characters are ASCII |
1412 | | * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2, |
1413 | | * and invoked with SS2 (ESC N). |
1414 | | * (vi) If there is any G0 designation in text, there must be a switch to |
1415 | | * ASCII or to JIS X 0201-Roman before a space character (but not |
1416 | | * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control |
1417 | | * characters such as tab or CRLF. |
1418 | | * (vii) Supported encodings: |
1419 | | * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7 |
1420 | | * |
1421 | | * source : RFC-1554 |
1422 | | * |
1423 | | * JISX201, JISX208,JISX212 : new .cnv data files created |
1424 | | * KSC5601 : alias to ibm-949 mapping table |
1425 | | * GB2312 : alias to ibm-1386 mapping table |
1426 | | * ISO-8859-1 : Algorithmically implemented as the LATIN1 case |
1427 | | * ISO-8859-7 : alias to ibm-9409 mapping table |
1428 | | */ |
1429 | | |
1430 | | /* preference order of JP charsets; entries are StateEnum constants and their order is significant. NOTE(review): the code that consumes this order is not visible here - confirm before reordering */ |
1431 | | static const StateEnum jpCharsetPref[]={ |
1432 | | ASCII, |
1433 | | JISX201, |
1434 | | ISO8859_1, |
1435 | | JISX208, |
1436 | | ISO8859_7, |
1437 | | JISX212, |
1438 | | GB2312, |
1439 | | KSC5601, |
1440 | | HWKANA_7BIT |
1441 | | }; |
1442 | | |
1443 | | /* |
1444 | | * The escape sequences must be in order of the enum constants like JISX201 = 3, |
1445 | | * not in order of jpCharsetPref[]! |
     |   | * |
     |   | * Each sequence is 3 or 4 bytes long and stored in a 6-byte slot. |
     |   | * escSeqCharsLen[] below holds the length of the sequence at the same index, |
     |   | * so the two tables must be kept in step. |
1446 | | */ |
1447 | | static const char escSeqChars[][6] ={ |
1448 | | "\x1B\x28\x42", /* <ESC>(B ASCII */ |
1449 | | "\x1B\x2E\x41", /* <ESC>.A ISO-8859-1 */ |
1450 | | "\x1B\x2E\x46", /* <ESC>.F ISO-8859-7 */ |
1451 | | "\x1B\x28\x4A", /* <ESC>(J JISX-201 */ |
1452 | | "\x1B\x24\x42", /* <ESC>$B JISX-208 */ |
1453 | | "\x1B\x24\x28\x44", /* <ESC>$(D JISX-212 */ |
1454 | | "\x1B\x24\x41", /* <ESC>$A GB2312 */ |
1455 | | "\x1B\x24\x28\x43", /* <ESC>$(C KSC5601 */ |
1456 | | "\x1B\x28\x49" /* <ESC>(I HWKANA_7BIT */ |
1457 | | |
1458 | | }; |
1459 | | static const int8_t escSeqCharsLen[] ={ /* byte length of the escSeqChars[] entry at the same index */ |
1460 | | 3, /* length of <ESC>(B ASCII */ |
1461 | | 3, /* length of <ESC>.A ISO-8859-1 */ |
1462 | | 3, /* length of <ESC>.F ISO-8859-7 */ |
1463 | | 3, /* length of <ESC>(J JISX-201 */ |
1464 | | 3, /* length of <ESC>$B JISX-208 */ |
1465 | | 4, /* length of <ESC>$(D JISX-212 */ |
1466 | | 3, /* length of <ESC>$A GB2312 */ |
1467 | | 4, /* length of <ESC>$(C KSC5601 */ |
1468 | | 3 /* length of <ESC>(I HWKANA_7BIT */ |
1469 | | }; |
1470 | | |
1471 | | /* |
1472 | | * The iteration over various code pages works this way: |
1473 | | * i) Get the currentState from myConverterData->currentState |
1474 | | * ii) Check if the character is mapped to a valid character in the currentState |
1475 | | * Yes -> a) set the initIterState to currentState |
1476 | | * b) remain in this state until an invalid character is found |
1477 | | * No -> a) go to the next code page and find the character |
1478 | | * iii) Before changing the state increment the current state check if the current state |
1479 | | * is equal to the initIterState |
1480 | | * Yes -> A character that cannot be represented in any of the supported encodings |
1481 | | * break and return a U_INVALID_CHARACTER error |
1482 | | * No -> Continue and find the character in next code page |
1483 | | * |
1484 | | * |
1485 | | * TODO: Implement a priority technique where the users are allowed to set the priority of code pages |
1486 | | */ |
1487 | | |
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * Only two bytes differ from the identity mapping: 5C maps to U+00A5 and
 * 7E maps to U+203E. Every other value is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
1501 | | |
/*
 * Unicode to JIS X 0201 (Roman half), producing a byte in 00..7F.
 * U+00A5 and U+203E take the 5C and 7E positions, so U+005C and U+007E
 * themselves have no mapping. Returns 0xfffe for anything unmappable.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        /* these byte positions are occupied by YEN SIGN / OVERLINE */
        return 0xfffe;
    default:
        return (value <= 0x7f) ? value : 0xfffe;
    }
}
1516 | | |
/*
 * Convert a valid Shift-JIS double-byte value (lead byte in bits 15..8,
 * trail byte in bits 7..0) to the corresponding pair of JIS X 0208
 * bytes in 21..7E, packed the same way.
 * Returns 0 when the input lies beyond the JIS X 0208 range (> 0xEFFC).
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    if(value > 0xEFFC) {
        return 0; /* beyond JIS X 0208 */
    }

    const uint8_t low = (uint8_t)value;

    /* lead byte: 81..9F and E0..EF are folded together, then doubled */
    uint32_t result = value & 0xff00;
    result -= (result <= 0x9f00) ? 0x7000u : 0xb000u;
    result <<= 1;

    if(low > 0x9e) {
        /* trail 9F..FC: second (even) row of the pair */
        return result | (uint32_t)(low - 0x7e);
    }

    /* trail 40..9E: first (odd) row of the pair; 7F is skipped in Shift-JIS */
    result -= 0x100;
    return result | (uint32_t)(low - ((low <= 0x7e) ? 0x1f : 0x20));
}
1552 | | |
/*
 * Convert a pair of JIS X 0208 bytes (each expected in 21..7E) to the
 * equivalent Shift-JIS byte pair, written to bytes[0] (lead) and bytes[1] (trail).
 * Out-of-range input is turned into a byte value of 0 where needed, so that
 * the result is not valid Shift-JIS and the sub-converter rejects it.
 * Some invalid inputs already map to equally invalid Shift-JIS bytes and
 * are not tested explicitly.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    const int oddRow = (c1 & 1) != 0;
    uint8_t trail = 0; /* 0 == invalid */
    uint8_t lead;

    if(oddRow) {
        /* odd rows use trail bytes 40..7E and 80..9E (7F is skipped) */
        if(c2 <= 0x5f) {
            trail = (uint8_t)(c2 + 0x1f);
        } else if(c2 <= 0x7e) {
            trail = (uint8_t)(c2 + 0x20);
        }
    } else if(0x21 <= c2 && c2 <= 0x7e) {
        /* even rows use trail bytes 9F..FC */
        trail = (uint8_t)(c2 + 0x7e);
    }

    /* two JIS rows share one Shift-JIS lead byte; odd rows round up first */
    lead = (uint8_t)(c1 + oddRow);
    lead >>= 1;
    if(lead <= 0x2f) {
        lead = (uint8_t)(lead + 0x70);
    } else if(lead <= 0x3f) {
        lead = (uint8_t)(lead + 0xb0);
    } else {
        lead = 0; /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
1589 | | |
1590 | | /* |
1591 | | * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) |
1592 | | * Katakana. |
1593 | | * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks |
1594 | | * because Shift-JIS roundtrips half-width Katakana to single bytes. |
1595 | | * These were the only fallbacks in ICU's jisx-208.ucm file. |
1596 | | */ |
1597 | | static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { |
1598 | | 0x2123, /* U+FF61 */ |
1599 | | 0x2156, |
1600 | | 0x2157, |
1601 | | 0x2122, |
1602 | | 0x2126, |
1603 | | 0x2572, |
1604 | | 0x2521, |
1605 | | 0x2523, |
1606 | | 0x2525, |
1607 | | 0x2527, |
1608 | | 0x2529, |
1609 | | 0x2563, |
1610 | | 0x2565, |
1611 | | 0x2567, |
1612 | | 0x2543, |
1613 | | 0x213C, /* U+FF70 */ |
1614 | | 0x2522, |
1615 | | 0x2524, |
1616 | | 0x2526, |
1617 | | 0x2528, |
1618 | | 0x252A, |
1619 | | 0x252B, |
1620 | | 0x252D, |
1621 | | 0x252F, |
1622 | | 0x2531, |
1623 | | 0x2533, |
1624 | | 0x2535, |
1625 | | 0x2537, |
1626 | | 0x2539, |
1627 | | 0x253B, |
1628 | | 0x253D, |
1629 | | 0x253F, /* U+FF80 */ |
1630 | | 0x2541, |
1631 | | 0x2544, |
1632 | | 0x2546, |
1633 | | 0x2548, |
1634 | | 0x254A, |
1635 | | 0x254B, |
1636 | | 0x254C, |
1637 | | 0x254D, |
1638 | | 0x254E, |
1639 | | 0x254F, |
1640 | | 0x2552, |
1641 | | 0x2555, |
1642 | | 0x2558, |
1643 | | 0x255B, |
1644 | | 0x255E, |
1645 | | 0x255F, /* U+FF90 */ |
1646 | | 0x2560, |
1647 | | 0x2561, |
1648 | | 0x2562, |
1649 | | 0x2564, |
1650 | | 0x2566, |
1651 | | 0x2568, |
1652 | | 0x2569, |
1653 | | 0x256A, |
1654 | | 0x256B, |
1655 | | 0x256C, |
1656 | | 0x256D, |
1657 | | 0x256F, |
1658 | | 0x2573, |
1659 | | 0x212B, |
1660 | | 0x212C /* U+FF9F */ |
1661 | | }; |
1662 | | |
/*
 * fromUnicode implementation (with offsets) for the ISO-2022-JP converter variants.
 *
 * Reads UTF-16 code units from args->source and writes bytes to args->target.
 * For each code point the candidate charsets in choices[] are tried in order:
 * half-width Katakana first for versions 3 and 4, then the current G0 charset,
 * then the current G2 charset (if any), then the remaining allowed charsets in
 * jpCharsetPref[] order. The search stops at the first roundtrip mapping; at most
 * one fallback mapping is remembered while searching.
 * Before the character bytes, SI, a designation escape sequence, and SO or
 * ESC N are written as needed to reach the selected charset.
 *
 * Errors (the offending code point is stored in cnv->fromUChar32):
 *   U_ILLEGAL_CHAR_FOUND     - unpaired surrogate, or a character matching IS_2022_CONTROL()
 *   U_INVALID_CHAR_FOUND     - no candidate charset can represent the character
 *   U_BUFFER_OVERFLOW_ERROR  - input remains but the target is full
 * A lead surrogate at the end of the input is stored in cnv->fromUChar32 without
 * an error so that the next call can continue with it.
 *
 * With args->flush at the end of the input, the output is returned to SI state
 * and the ASCII designation. args->source and args->target are updated on return.
 */
1663 | | static void U_CALLCONV |
1664 | 0 | UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { |
1665 | 0 | UConverter *cnv = args->converter; |
1666 | 0 | UConverterDataISO2022 *converterData; |
1667 | 0 | ISO2022State *pFromU2022State; |
1668 | 0 | uint8_t *target = (uint8_t *) args->target; |
1669 | 0 | const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; |
1670 | 0 | const UChar* source = args->source; |
1671 | 0 | const UChar* sourceLimit = args->sourceLimit; |
1672 | 0 | int32_t* offsets = args->offsets; |
1673 | 0 | UChar32 sourceChar; |
1674 | 0 | char buffer[8]; |
1675 | 0 | int32_t len, outLen; |
1676 | 0 | int8_t choices[10]; |
1677 | 0 | int32_t choiceCount; |
1678 | 0 | uint32_t targetValue = 0; |
1679 | 0 | UBool useFallback; |
1680 | |
|
1681 | 0 | int32_t i; |
1682 | 0 | int8_t cs, g; |
1683 | | |
1684 | | /* set up the state */ |
1685 | 0 | converterData = (UConverterDataISO2022*)cnv->extraInfo; |
1686 | 0 | pFromU2022State = &converterData->fromU2022State; |
1687 | |
|
1688 | 0 | choiceCount = 0; |
1689 | | |
1690 | | /* check if the last code point of the previous buffer was a lead surrogate */ |
1691 | 0 | if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { |
1692 | 0 | goto getTrail; |
1693 | 0 | } |
1694 | | |
1695 | 0 | while(source < sourceLimit) { |
1696 | 0 | if(target < targetLimit) { |
1697 | |
|
1698 | 0 | sourceChar = *(source++); |
1699 | | /* check if the code unit is a surrogate */ |
1700 | 0 | if(U16_IS_SURROGATE(sourceChar)) { |
1701 | 0 | if(U16_IS_SURROGATE_LEAD(sourceChar)) { |
1702 | 0 | getTrail: |
1703 | | /*look ahead to find the trail surrogate*/ |
1704 | 0 | if(source < sourceLimit) { |
1705 | | /* test the following code unit */ |
1706 | 0 | UChar trail=(UChar) *source; |
1707 | 0 | if(U16_IS_TRAIL(trail)) { |
1708 | 0 | source++; |
1709 | 0 | sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); |
1710 | 0 | cnv->fromUChar32=0x00; |
1711 | | /* convert this supplementary code point */ |
1712 | | /* exit this condition tree */ |
1713 | 0 | } else { |
1714 | | /* this is an unmatched lead code unit (1st surrogate) */ |
1715 | | /* callback(illegal) */ |
1716 | 0 | *err=U_ILLEGAL_CHAR_FOUND; |
1717 | 0 | cnv->fromUChar32=sourceChar; |
1718 | 0 | break; |
1719 | 0 | } |
1720 | 0 | } else { |
1721 | | /* no more input */ |
1722 | 0 | cnv->fromUChar32=sourceChar; |
1723 | 0 | break; |
1724 | 0 | } |
1725 | 0 | } else { |
1726 | | /* this is an unmatched trail code unit (2nd surrogate) */ |
1727 | | /* callback(illegal) */ |
1728 | 0 | *err=U_ILLEGAL_CHAR_FOUND; |
1729 | 0 | cnv->fromUChar32=sourceChar; |
1730 | 0 | break; |
1731 | 0 | } |
1732 | 0 | } |
1733 | | |
1734 | | /* do not convert SO/SI/ESC */ |
1735 | 0 | if(IS_2022_CONTROL(sourceChar)) { |
1736 | | /* callback(illegal) */ |
1737 | 0 | *err=U_ILLEGAL_CHAR_FOUND; |
1738 | 0 | cnv->fromUChar32=sourceChar; |
1739 | 0 | break; |
1740 | 0 | } |
1741 | | |
1742 | | /* do the conversion */ |
1743 | | |
/* (re)build choices[] only when it was invalidated (choiceCount reset to 0) */
1744 | 0 | if(choiceCount == 0) { |
1745 | 0 | uint16_t csm; |
1746 | | |
1747 | | /* |
1748 | | * The csm variable keeps track of which charsets are allowed |
1749 | | * and not used yet while building the choices[]. |
1750 | | */ |
1751 | 0 | csm = jpCharsetMasks[converterData->version]; |
1752 | 0 | choiceCount = 0; |
1753 | | |
1754 | | /* JIS7/8: try single-byte half-width Katakana before JISX208 */ |
1755 | 0 | if(converterData->version == 3 || converterData->version == 4) { |
1756 | 0 | choices[choiceCount++] = (int8_t)HWKANA_7BIT; |
1757 | 0 | } |
1758 | | /* Do not try single-byte half-width Katakana for other versions. */ |
1759 | 0 | csm &= ~CSM(HWKANA_7BIT); |
1760 | | |
1761 | | /* try the current G0 charset */ |
1762 | 0 | choices[choiceCount++] = cs = pFromU2022State->cs[0]; |
1763 | 0 | csm &= ~CSM(cs); |
1764 | | |
1765 | | /* try the current G2 charset */ |
1766 | 0 | if((cs = pFromU2022State->cs[2]) != 0) { |
1767 | 0 | choices[choiceCount++] = cs; |
1768 | 0 | csm &= ~CSM(cs); |
1769 | 0 | } |
1770 | | |
1771 | | /* try all the other possible charsets */ |
1772 | 0 | for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) { |
1773 | 0 | cs = (int8_t)jpCharsetPref[i]; |
1774 | 0 | if(CSM(cs) & csm) { |
1775 | 0 | choices[choiceCount++] = cs; |
1776 | 0 | csm &= ~CSM(cs); |
1777 | 0 | } |
1778 | 0 | } |
1779 | 0 | } |
1780 | |
|
1781 | 0 | cs = g = 0; |
1782 | | /* |
1783 | | * len==0: no mapping found yet |
1784 | | * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks |
1785 | | * len>0: found a roundtrip result, done |
1786 | | */ |
1787 | 0 | len = 0; |
1788 | | /* |
1789 | | * We will turn off useFallback after finding a fallback, |
1790 | | * but we still get fallbacks from PUA code points as usual. |
1791 | | * Therefore, we will also need to check that we don't overwrite |
1792 | | * an early fallback with a later one. |
1793 | | */ |
1794 | 0 | useFallback = cnv->useFallback; |
1795 | |
|
1796 | 0 | for(i = 0; i < choiceCount && len <= 0; ++i) { |
1797 | 0 | uint32_t value; |
1798 | 0 | int32_t len2; |
1799 | 0 | int8_t cs0 = choices[i]; |
1800 | 0 | switch(cs0) { |
1801 | 0 | case ASCII: |
1802 | 0 | if(sourceChar <= 0x7f) { |
1803 | 0 | targetValue = (uint32_t)sourceChar; |
1804 | 0 | len = 1; |
1805 | 0 | cs = cs0; |
1806 | 0 | g = 0; |
1807 | 0 | } |
1808 | 0 | break; |
1809 | 0 | case ISO8859_1: |
1810 | 0 | if(GR96_START <= sourceChar && sourceChar <= GR96_END) { |
1811 | 0 | targetValue = (uint32_t)sourceChar - 0x80; |
1812 | 0 | len = 1; |
1813 | 0 | cs = cs0; |
1814 | 0 | g = 2; |
1815 | 0 | } |
1816 | 0 | break; |
1817 | 0 | case HWKANA_7BIT: |
1818 | 0 | if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { |
1819 | 0 | if(converterData->version==3) { |
1820 | | /* JIS7: use G1 (SO) */ |
1821 | | /* Shift U+FF61..U+FF9F to bytes 21..5F. */ |
1822 | 0 | targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21)); |
1823 | 0 | len = 1; |
1824 | 0 | pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */ |
1825 | 0 | g = 1; |
1826 | 0 | } else if(converterData->version==4) { |
1827 | | /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */ |
1828 | | /* Shift U+FF61..U+FF9F to bytes A1..DF. */ |
1829 | 0 | targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1)); |
1830 | 0 | len = 1; |
1831 | |
|
1832 | 0 | cs = pFromU2022State->cs[0]; |
1833 | 0 | if(IS_JP_DBCS(cs)) { |
1834 | | /* switch from a DBCS charset to JISX201 */ |
1835 | 0 | cs = (int8_t)JISX201; |
1836 | 0 | } |
1837 | | /* else stay in the current G0 charset */ |
1838 | 0 | g = 0; |
1839 | 0 | } |
1840 | | /* else do not use HWKANA_7BIT with other versions */ |
1841 | 0 | } |
1842 | 0 | break; |
1843 | 0 | case JISX201: |
1844 | | /* G0 SBCS */ |
1845 | 0 | value = jisx201FromU(sourceChar); |
1846 | 0 | if(value <= 0x7f) { |
1847 | 0 | targetValue = value; |
1848 | 0 | len = 1; |
1849 | 0 | cs = cs0; |
1850 | 0 | g = 0; |
1851 | 0 | useFallback = FALSE; |
1852 | 0 | } |
1853 | 0 | break; |
1854 | 0 | case JISX208: |
1855 | | /* G0 DBCS from Shift-JIS table */ |
1856 | 0 | len2 = MBCS_FROM_UCHAR32_ISO2022( |
1857 | 0 | converterData->myConverterArray[cs0], |
1858 | 0 | sourceChar, &value, |
1859 | 0 | useFallback, MBCS_OUTPUT_2); |
1860 | 0 | if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ |
1861 | 0 | value = _2022FromSJIS(value); |
1862 | 0 | if(value != 0) { |
1863 | 0 | targetValue = value; |
1864 | 0 | len = len2; |
1865 | 0 | cs = cs0; |
1866 | 0 | g = 0; |
1867 | 0 | useFallback = FALSE; |
1868 | 0 | } |
1869 | 0 | } else if(len == 0 && useFallback && |
1870 | 0 | (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { |
/* hardcoded fallback: half-width Katakana to a JIS X 0208 byte pair */
1871 | 0 | targetValue = hwkana_fb[sourceChar - HWKANA_START]; |
1872 | 0 | len = -2; |
1873 | 0 | cs = cs0; |
1874 | 0 | g = 0; |
1875 | 0 | useFallback = FALSE; |
1876 | 0 | } |
1877 | 0 | break; |
1878 | 0 | case ISO8859_7: |
1879 | | /* G0 SBCS forced to 7-bit output */ |
1880 | 0 | len2 = MBCS_SINGLE_FROM_UCHAR32( |
1881 | 0 | converterData->myConverterArray[cs0], |
1882 | 0 | sourceChar, &value, |
1883 | 0 | useFallback); |
1884 | 0 | if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) { |
1885 | 0 | targetValue = value - 0x80; |
1886 | 0 | len = len2; |
1887 | 0 | cs = cs0; |
1888 | 0 | g = 2; |
1889 | 0 | useFallback = FALSE; |
1890 | 0 | } |
1891 | 0 | break; |
1892 | 0 | default: |
1893 | | /* G0 DBCS */ |
1894 | 0 | len2 = MBCS_FROM_UCHAR32_ISO2022( |
1895 | 0 | converterData->myConverterArray[cs0], |
1896 | 0 | sourceChar, &value, |
1897 | 0 | useFallback, MBCS_OUTPUT_2); |
1898 | 0 | if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ |
1899 | 0 | if(cs0 == KSC5601) { |
1900 | | /* |
1901 | | * Check for valid bytes for the encoding scheme. |
1902 | | * This is necessary because the sub-converter (windows-949) |
1903 | | * has a broader encoding scheme than is valid for 2022. |
1904 | | */ |
1905 | 0 | value = _2022FromGR94DBCS(value); |
1906 | 0 | if(value == 0) { |
1907 | 0 | break; |
1908 | 0 | } |
1909 | 0 | } |
1910 | 0 | targetValue = value; |
1911 | 0 | len = len2; |
1912 | 0 | cs = cs0; |
1913 | 0 | g = 0; |
1914 | 0 | useFallback = FALSE; |
1915 | 0 | } |
1916 | 0 | break; |
1917 | 0 | } |
1918 | 0 | } |
1919 | | |
1920 | 0 | if(len != 0) { |
1921 | 0 | if(len < 0) { |
1922 | 0 | len = -len; /* fallback */ |
1923 | 0 | } |
1924 | 0 | outLen = 0; /* count output bytes */ |
1925 | | |
1926 | | /* write SI if necessary (only for JIS7) */ |
1927 | 0 | if(pFromU2022State->g == 1 && g == 0) { |
1928 | 0 | buffer[outLen++] = UCNV_SI; |
1929 | 0 | pFromU2022State->g = 0; |
1930 | 0 | } |
1931 | | |
1932 | | /* write the designation sequence if necessary */ |
1933 | 0 | if(cs != pFromU2022State->cs[g]) { |
1934 | 0 | int32_t escLen = escSeqCharsLen[cs]; |
1935 | 0 | uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen); |
1936 | 0 | outLen += escLen; |
1937 | 0 | pFromU2022State->cs[g] = cs; |
1938 | | |
1939 | | /* invalidate the choices[] */ |
1940 | 0 | choiceCount = 0; |
1941 | 0 | } |
1942 | | |
1943 | | /* write the shift sequence if necessary */ |
1944 | 0 | if(g != pFromU2022State->g) { |
1945 | 0 | switch(g) { |
1946 | | /* case 0 handled before writing escapes */ |
1947 | 0 | case 1: |
1948 | 0 | buffer[outLen++] = UCNV_SO; |
1949 | 0 | pFromU2022State->g = 1; |
1950 | 0 | break; |
1951 | 0 | default: /* case 2 */ |
/* write ESC N; pFromU2022State->g is deliberately left unchanged here */
1952 | 0 | buffer[outLen++] = 0x1b; |
1953 | 0 | buffer[outLen++] = 0x4e; |
1954 | 0 | break; |
1955 | | /* no case 3: no SS3 in ISO-2022-JP-x */ |
1956 | 0 | } |
1957 | 0 | } |
1958 | | |
1959 | | /* write the output bytes */ |
1960 | 0 | if(len == 1) { |
1961 | 0 | buffer[outLen++] = (char)targetValue; |
1962 | 0 | } else /* len == 2 */ { |
1963 | 0 | buffer[outLen++] = (char)(targetValue >> 8); |
1964 | 0 | buffer[outLen++] = (char)targetValue; |
1965 | 0 | } |
1966 | 0 | } else { |
1967 | | /* |
1968 | | * if we cannot find the character after checking all codepages |
1969 | | * then this is an error |
1970 | | */ |
1971 | 0 | *err = U_INVALID_CHAR_FOUND; |
1972 | 0 | cnv->fromUChar32=sourceChar; |
1973 | 0 | break; |
1974 | 0 | } |
1975 | | |
1976 | 0 | if(sourceChar == CR || sourceChar == LF) { |
1977 | | /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */ |
1978 | 0 | pFromU2022State->cs[2] = 0; |
1979 | 0 | choiceCount = 0; |
1980 | 0 | } |
1981 | | |
1982 | | /* output outLen>0 bytes in buffer[] */ |
1983 | 0 | if(outLen == 1) { |
1984 | 0 | *target++ = buffer[0]; |
1985 | 0 | if(offsets) { |
1986 | 0 | *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ |
1987 | 0 | } |
1988 | 0 | } else if(outLen == 2 && (target + 2) <= targetLimit) { |
1989 | 0 | *target++ = buffer[0]; |
1990 | 0 | *target++ = buffer[1]; |
1991 | 0 | if(offsets) { |
1992 | 0 | int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); |
1993 | 0 | *offsets++ = sourceIndex; |
1994 | 0 | *offsets++ = sourceIndex; |
1995 | 0 | } |
1996 | 0 | } else { |
/* general case: hand buffer[], the target limit and err to fromUWriteUInt8() */
1997 | 0 | fromUWriteUInt8( |
1998 | 0 | cnv, |
1999 | 0 | buffer, outLen, |
2000 | 0 | &target, (const char *)targetLimit, |
2001 | 0 | &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), |
2002 | 0 | err); |
2003 | 0 | if(U_FAILURE(*err)) { |
2004 | 0 | break; |
2005 | 0 | } |
2006 | 0 | } |
2007 | 0 | } /* end if(target < targetLimit) */ |
2008 | 0 | else{ |
2009 | 0 | *err =U_BUFFER_OVERFLOW_ERROR; |
2010 | 0 | break; |
2011 | 0 | } |
2012 | |
|
2013 | 0 | }/* end while(source < sourceLimit) */ |
2014 | | |
2015 | | /* |
2016 | | * the end of the input stream and detection of truncated input |
2017 | | * are handled by the framework, but for ISO-2022-JP conversion |
2018 | | * we need to be in ASCII mode at the very end |
2019 | | * |
2020 | | * conditions: |
2021 | | * successful |
2022 | | * in SO mode or not in ASCII mode |
2023 | | * end of input and no truncated input |
2024 | | */ |
2025 | 0 | if( U_SUCCESS(*err) && |
2026 | 0 | (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && |
2027 | 0 | args->flush && source>=sourceLimit && cnv->fromUChar32==0 |
2028 | 0 | ) { |
2029 | 0 | int32_t sourceIndex; |
2030 | |
|
2031 | 0 | outLen = 0; |
2032 | |
|
2033 | 0 | if(pFromU2022State->g != 0) { |
2034 | 0 | buffer[outLen++] = UCNV_SI; |
2035 | 0 | pFromU2022State->g = 0; |
2036 | 0 | } |
2037 | |
|
2038 | 0 | if(pFromU2022State->cs[0] != ASCII) { |
2039 | 0 | int32_t escLen = escSeqCharsLen[ASCII]; |
2040 | 0 | uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen); |
2041 | 0 | outLen += escLen; |
2042 | 0 | pFromU2022State->cs[0] = (int8_t)ASCII; |
2043 | 0 | } |
2044 | | |
2045 | | /* get the source index of the last input character */ |
2046 | | /* |
2047 | | * TODO this would be simpler and more reliable if we used a pair |
2048 | | * of sourceIndex/prevSourceIndex like in ucnvmbcs.c |
2049 | | * so that we could simply use the prevSourceIndex here; |
2050 | | * this code gives an incorrect result for the rare case of an unmatched |
2051 | | * trail surrogate that is alone in the last buffer of the text stream |
2052 | | */ |
2053 | 0 | sourceIndex=(int32_t)(source-args->source); |
2054 | 0 | if(sourceIndex>0) { |
2055 | 0 | --sourceIndex; |
2056 | 0 | if( U16_IS_TRAIL(args->source[sourceIndex]) && |
2057 | 0 | (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) |
2058 | 0 | ) { |
2059 | 0 | --sourceIndex; |
2060 | 0 | } |
2061 | 0 | } else { |
2062 | 0 | sourceIndex=-1; |
2063 | 0 | } |
2064 | |
|
2065 | 0 | fromUWriteUInt8( |
2066 | 0 | cnv, |
2067 | 0 | buffer, outLen, |
2068 | 0 | &target, (const char *)targetLimit, |
2069 | 0 | &offsets, sourceIndex, |
2070 | 0 | err); |
2071 | 0 | } |
2072 | | |
2073 | | /*save the state and return */ |
2074 | 0 | args->source = source; |
2075 | 0 | args->target = (char*)target; |
2076 | 0 | } |
2077 | | |
2078 | | /*************** to unicode *******************/ |
2079 | | |
/*
 * toUnicode implementation (with offsets) for the ISO-2022-JP converter variants.
 *
 * Reads bytes from args->source and writes UTF-16 code units to args->target.
 *   - SI/SO switch between G0 and G1 only when myData->version==3; for other
 *     versions they fall through to the callback as unconvertible input.
 *   - ESC is handed to changeState_2022(); a partial escape sequence from a
 *     previous call (myData->key != 0) is continued first.
 *   - CR/LF reset G0 to ASCII (unless it is already ASCII or JISX201), clear G2
 *     and select G0, and are then converted like any other byte.
 *   - Other bytes are converted according to the charset selected by
 *     pToU2022State->g. Double-byte charsets read a trail byte; a lead byte at
 *     the very end of the input is stored in toUBytes[] for the next call.
 * Unconvertible input is passed to toUnicodeCallback() and ends this call;
 * a full target sets U_BUFFER_OVERFLOW_ERROR.
 * args->source and args->target are updated on return.
 */
2080 | | static void U_CALLCONV |
2081 | | UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, |
2082 | 0 | UErrorCode* err){ |
2083 | 0 | char tempBuf[2]; |
2084 | 0 | const char *mySource = (char *) args->source; |
2085 | 0 | UChar *myTarget = args->target; |
2086 | 0 | const char *mySourceLimit = args->sourceLimit; |
2087 | 0 | uint32_t targetUniChar = 0x0000; |
2088 | 0 | uint32_t mySourceChar = 0x0000; |
2089 | 0 | uint32_t tmpSourceChar = 0x0000; |
2090 | 0 | UConverterDataISO2022* myData; |
2091 | 0 | ISO2022State *pToU2022State; |
2092 | 0 | StateEnum cs; |
2093 | |
|
2094 | 0 | myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
2095 | 0 | pToU2022State = &myData->toU2022State; |
2096 | |
|
2097 | 0 | if(myData->key != 0) { |
2098 | | /* continue with a partial escape sequence */ |
2099 | 0 | goto escape; |
2100 | 0 | } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { |
2101 | | /* continue with a partial double-byte character */ |
2102 | 0 | mySourceChar = args->converter->toUBytes[0]; |
2103 | 0 | args->converter->toULength = 0; |
2104 | 0 | cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; |
2105 | 0 | targetUniChar = missingCharMarker; |
2106 | 0 | goto getTrailByte; |
2107 | 0 | } |
2108 | | |
2109 | 0 | while(mySource < mySourceLimit){ |
2110 | |
|
2111 | 0 | targetUniChar =missingCharMarker; |
2112 | |
|
2113 | 0 | if(myTarget < args->targetLimit){ |
2114 | |
|
2115 | 0 | mySourceChar= (unsigned char) *mySource++; |
2116 | |
|
2117 | 0 | switch(mySourceChar) { |
2118 | 0 | case UCNV_SI: |
2119 | 0 | if(myData->version==3) { |
2120 | 0 | pToU2022State->g=0; |
2121 | 0 | continue; |
2122 | 0 | } else { |
2123 | | /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ |
2124 | 0 | myData->isEmptySegment = FALSE; /* reset this, we have a different error */ |
2125 | 0 | break; |
2126 | 0 | } |
2127 | | |
2128 | 0 | case UCNV_SO: |
2129 | 0 | if(myData->version==3) { |
2130 | | /* JIS7: switch to G1 half-width Katakana */ |
2131 | 0 | pToU2022State->cs[1] = (int8_t)HWKANA_7BIT; |
2132 | 0 | pToU2022State->g=1; |
2133 | 0 | continue; |
2134 | 0 | } else { |
2135 | | /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ |
2136 | 0 | myData->isEmptySegment = FALSE; /* reset this, we have a different error */ |
2137 | 0 | break; |
2138 | 0 | } |
2139 | | |
2140 | 0 | case ESC_2022: |
2141 | 0 | mySource--; |
2142 | 0 | escape: |
2143 | 0 | { |
2144 | 0 | const char * mySourceBefore = mySource; |
2145 | 0 | int8_t toULengthBefore = args->converter->toULength; |
2146 | |
|
2147 | 0 | changeState_2022(args->converter,&(mySource), |
2148 | 0 | mySourceLimit, ISO_2022_JP,err); |
2149 | | |
2150 | | /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */ |
2151 | 0 | if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) { |
2152 | 0 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
2153 | 0 | args->converter->toUCallbackReason = UCNV_IRREGULAR; |
2154 | 0 | args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); |
2155 | 0 | } |
2156 | 0 | } |
2157 | | |
2158 | | /* invalid or illegal escape sequence */ |
2159 | 0 | if(U_FAILURE(*err)){ |
2160 | 0 | args->target = myTarget; |
2161 | 0 | args->source = mySource; |
2162 | 0 | myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */ |
2163 | 0 | return; |
2164 | 0 | } |
2165 | | /* If we successfully completed an escape sequence, we begin a new segment, empty so far */ |
2166 | 0 | if(myData->key==0) { |
2167 | 0 | myData->isEmptySegment = TRUE; |
2168 | 0 | } |
2169 | 0 | continue; |
2170 | | |
2171 | | /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */ |
2172 | | |
2173 | 0 | case CR: |
2174 | 0 | case LF: |
2175 | | /* automatically reset to single-byte mode */ |
2176 | 0 | if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) { |
2177 | 0 | pToU2022State->cs[0] = (int8_t)ASCII; |
2178 | 0 | } |
2179 | 0 | pToU2022State->cs[2] = 0; |
2180 | 0 | pToU2022State->g = 0; |
2181 | 0 | U_FALLTHROUGH; |
2182 | 0 | default: |
2183 | | /* convert one or two bytes */ |
2184 | 0 | myData->isEmptySegment = FALSE; |
2185 | 0 | cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; |
2186 | 0 | if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 && |
2187 | 0 | !IS_JP_DBCS(cs) |
2188 | 0 | ) { |
2189 | | /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */ |
2190 | 0 | targetUniChar = mySourceChar + (HWKANA_START - 0xa1); |
2191 | | |
2192 | | /* return from a single-shift state to the previous one */ |
2193 | 0 | if(pToU2022State->g >= 2) { |
2194 | 0 | pToU2022State->g=pToU2022State->prevG; |
2195 | 0 | } |
2196 | 0 | } else switch(cs) { |
2197 | 0 | case ASCII: |
2198 | 0 | if(mySourceChar <= 0x7f) { |
2199 | 0 | targetUniChar = mySourceChar; |
2200 | 0 | } |
2201 | 0 | break; |
2202 | 0 | case ISO8859_1: |
2203 | 0 | if(mySourceChar <= 0x7f) { |
2204 | 0 | targetUniChar = mySourceChar + 0x80; |
2205 | 0 | } |
2206 | | /* return from a single-shift state to the previous one */ |
2207 | 0 | pToU2022State->g=pToU2022State->prevG; |
2208 | 0 | break; |
2209 | 0 | case ISO8859_7: |
2210 | 0 | if(mySourceChar <= 0x7f) { |
2211 | | /* convert mySourceChar+0x80 to use a normal 8-bit table */ |
2212 | 0 | targetUniChar = |
2213 | 0 | _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( |
2214 | 0 | myData->myConverterArray[cs], |
2215 | 0 | mySourceChar + 0x80); |
2216 | 0 | } |
2217 | | /* return from a single-shift state to the previous one */ |
2218 | 0 | pToU2022State->g=pToU2022State->prevG; |
2219 | 0 | break; |
2220 | 0 | case JISX201: |
2221 | 0 | if(mySourceChar <= 0x7f) { |
2222 | 0 | targetUniChar = jisx201ToU(mySourceChar); |
2223 | 0 | } |
2224 | 0 | break; |
2225 | 0 | case HWKANA_7BIT: |
2226 | 0 | if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) { |
2227 | | /* 7-bit halfwidth Katakana */ |
2228 | 0 | targetUniChar = mySourceChar + (HWKANA_START - 0x21); |
2229 | 0 | } |
2230 | 0 | break; |
2231 | 0 | default: |
2232 | | /* G0 DBCS */ |
2233 | 0 | if(mySource < mySourceLimit) { |
2234 | 0 | int leadIsOk, trailIsOk; |
2235 | 0 | uint8_t trailByte; |
2236 | 0 | getTrailByte: |
2237 | 0 | trailByte = (uint8_t)*mySource; |
2238 | | /* |
2239 | | * Ticket 5691: consistent illegal sequences: |
2240 | | * - We include at least the first byte in the illegal sequence. |
2241 | | * - If any of the non-initial bytes could be the start of a character, |
2242 | | * we stop the illegal sequence before the first one of those. |
2243 | | * |
2244 | | * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is |
2245 | | * an ESC/SO/SI, we report only the first byte as the illegal sequence. |
2246 | | * Otherwise we convert or report the pair of bytes. |
2247 | | */ |
2248 | 0 | leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); |
2249 | 0 | trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); |
2250 | 0 | if (leadIsOk && trailIsOk) { |
2251 | 0 | ++mySource; |
2252 | 0 | tmpSourceChar = (mySourceChar << 8) | trailByte; |
2253 | 0 | if(cs == JISX208) { |
2254 | 0 | _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf); |
2255 | 0 | mySourceChar = tmpSourceChar; |
2256 | 0 | } else { |
2257 | | /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */ |
2258 | 0 | mySourceChar = tmpSourceChar; |
2259 | 0 | if (cs == KSC5601) { |
2260 | 0 | tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */ |
2261 | 0 | } |
2262 | 0 | tempBuf[0] = (char)(tmpSourceChar >> 8); |
2263 | 0 | tempBuf[1] = (char)(tmpSourceChar); |
2264 | 0 | } |
2265 | 0 | targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); |
2266 | 0 | } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { |
2267 | | /* report a pair of illegal bytes if the second byte is not a DBCS starter */ |
2268 | 0 | ++mySource; |
2269 | | /* add another bit so that the code below writes 2 bytes in case of error */ |
2270 | 0 | mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; |
2271 | 0 | } |
2272 | 0 | } else { |
/* lead byte is the last input byte: save it and resume at getTrailByte on the next call */
2273 | 0 | args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
2274 | 0 | args->converter->toULength = 1; |
2275 | 0 | goto endloop; |
2276 | 0 | } |
2277 | 0 | } /* End of inner switch */ |
2278 | 0 | break; |
2279 | 0 | } /* End of outer switch */ |
2280 | 0 | if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ |
2281 | 0 | if(args->offsets){ |
2282 | 0 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
2283 | 0 | } |
2284 | 0 | *(myTarget++)=(UChar)targetUniChar; |
2285 | 0 | } |
2286 | 0 | else if(targetUniChar > missingCharMarker){ |
2287 | | /* disassemble the surrogate pair and write to output*/ |
2288 | 0 | targetUniChar-=0x0010000; |
2289 | 0 | *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); |
2290 | 0 | if(args->offsets){ |
2291 | 0 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
2292 | 0 | } |
2293 | 0 | ++myTarget; |
2294 | 0 | if(myTarget< args->targetLimit){ |
2295 | 0 | *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); |
2296 | 0 | if(args->offsets){ |
2297 | 0 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
2298 | 0 | } |
2299 | 0 | ++myTarget; |
2300 | 0 | }else{ |
/* no room for the trail surrogate: store it in the converter's UCharErrorBuffer */
2301 | 0 | args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= |
2302 | 0 | (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); |
2303 | 0 | } |
2304 | |
|
2305 | 0 | } |
2306 | 0 | else{ |
2307 | | /* Call the callback function*/ |
2308 | 0 | toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); |
2309 | 0 | break; |
2310 | 0 | } |
2311 | 0 | } |
2312 | 0 | else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */ |
2313 | 0 | *err =U_BUFFER_OVERFLOW_ERROR; |
2314 | 0 | break; |
2315 | 0 | } |
2316 | 0 | } |
2317 | 0 | endloop: |
2318 | 0 | args->target = myTarget; |
2319 | 0 | args->source = mySource; |
2320 | 0 | } |
2321 | | |
2322 | | |
2323 | | #if !UCONFIG_ONLY_HTML_CONVERSION |
2324 | | /*************************************************************** |
2325 | | * Rules for ISO-2022-KR encoding |
2326 | | * i) The KSC5601 designator sequence should appear only once in a file, |
2327 | | * at the beginning of a line before any KSC5601 characters. This usually |
2328 | | * means that it appears by itself on the first line of the file |
2329 | | * ii) There are only 2 shifting sequences SO to shift into double byte mode |
2330 | | * and SI to shift into single byte mode |
2331 | | */ |
2332 | | static void U_CALLCONV /* version==1 path of the ISO-2022-KR fromUnicode function: runs ucnv_MBCSFromUnicodeWithOffsets() on the sub-converter and copies its pending state back */ |
2333 | 0 | UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){ |
2334 | |

2335 | 0 | UConverter* saveConv = args->converter; |
2336 | 0 | UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo; |
2337 | 0 | args->converter=myConverterData->currentConverter; /* args temporarily refers to the sub-converter */ |
2338 | |

2339 | 0 | myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32; /* pass the pending code point down */ |
2340 | 0 | ucnv_MBCSFromUnicodeWithOffsets(args,err); |
2341 | 0 | saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32; /* and take back whatever is pending afterwards */ |
2342 | |

2343 | 0 | if(*err == U_BUFFER_OVERFLOW_ERROR) { /* move bytes the sub-converter buffered in charErrorBuffer to the caller's converter */ |
2344 | 0 | if(myConverterData->currentConverter->charErrorBufferLength > 0) { |
2345 | 0 | uprv_memcpy( |
2346 | 0 | saveConv->charErrorBuffer, |
2347 | 0 | myConverterData->currentConverter->charErrorBuffer, |
2348 | 0 | myConverterData->currentConverter->charErrorBufferLength); |
2349 | 0 | } |
2350 | 0 | saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength; |
2351 | 0 | myConverterData->currentConverter->charErrorBufferLength = 0; /* the sub-converter's buffer is now considered empty */ |
2352 | 0 | } |
2353 | 0 | args->converter=saveConv; /* restore the caller's converter in args */ |
2354 | 0 | } |
2355 | | |
2356 | | static void U_CALLCONV /* UTF-16 to ISO-2022-KR: writes SO/SI when the output switches between single- and double-byte, and one offset per output byte when args->offsets is set */ |
2357 | 0 | UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ |
2358 | |

2359 | 0 | const UChar *source = args->source; |
2360 | 0 | const UChar *sourceLimit = args->sourceLimit; |
2361 | 0 | unsigned char *target = (unsigned char *) args->target; |
2362 | 0 | unsigned char *targetLimit = (unsigned char *) args->targetLimit; |
2363 | 0 | int32_t* offsets = args->offsets; |
2364 | 0 | uint32_t targetByteUnit = 0x0000; |
2365 | 0 | UChar32 sourceChar = 0x0000; |
2366 | 0 | UBool isTargetByteDBCS; |
2367 | 0 | UBool oldIsTargetByteDBCS; |
2368 | 0 | UConverterDataISO2022 *converterData; |
2369 | 0 | UConverterSharedData* sharedData; |
2370 | 0 | UBool useFallback; |
2371 | 0 | int32_t length =0; |
2372 | |

2373 | 0 | converterData=(UConverterDataISO2022*)args->converter->extraInfo; |
2374 | | /* if the version is 1 then the user is requesting |
2375 | | * conversion with ibm-25546 pass the arguments to |
2376 | | * MBCS converter and return |
2377 | | */ |
2378 | 0 | if(converterData->version==1){ |
2379 | 0 | UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); |
2380 | 0 | return; |
2381 | 0 | } |
2382 | | |
2383 | | /* initialize data */ |
2384 | 0 | sharedData = converterData->currentConverter->sharedData; |
2385 | 0 | useFallback = args->converter->useFallback; |
2386 | 0 | isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus; /* nonzero status: output is currently in double-byte (SO) mode; written back at the end of this function */ |
2387 | 0 | oldIsTargetByteDBCS = isTargetByteDBCS; |
2388 | |

2389 | 0 | isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus; /* NOTE(review): repeats the assignment made two statements earlier; appears redundant */ |
2390 | 0 | if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) { /* a code point was left in fromUChar32: resume at the trail-surrogate lookup */ |
2391 | 0 | goto getTrail; |
2392 | 0 | } |
2393 | 0 | while(source < sourceLimit){ |
2394 | |

2395 | 0 | targetByteUnit = missingCharMarker; |
2396 | |

2397 | 0 | if(target < (unsigned char*) args->targetLimit){ |
2398 | 0 | sourceChar = *source++; |
2399 | | |
2400 | | /* do not convert SO/SI/ESC */ |
2401 | 0 | if(IS_2022_CONTROL(sourceChar)) { |
2402 | | /* callback(illegal) */ |
2403 | 0 | *err=U_ILLEGAL_CHAR_FOUND; |
2404 | 0 | args->converter->fromUChar32=sourceChar; |
2405 | 0 | break; |
2406 | 0 | } |
2407 | | |
2408 | 0 | length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2); |
2409 | 0 | if(length < 0) { |
2410 | 0 | length = -length; /* fallback */ |
2411 | 0 | } |
2412 | | /* only DBCS or SBCS characters are expected*/ |
2413 | | /* DB characters with high bit set to 1 are expected */ |
2414 | 0 | if( length > 2 || length==0 || |
2415 | 0 | (length == 1 && targetByteUnit > 0x7f) || |
2416 | 0 | (length == 2 && |
2417 | 0 | ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || /* 16-bit value outside a1a1..fefe */ |
2418 | 0 | (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) /* low byte outside a1..fe */ |
2419 | 0 | ) { |
2420 | 0 | targetByteUnit=missingCharMarker; |
2421 | 0 | } |
2422 | 0 | if (targetByteUnit != missingCharMarker){ |
2423 | |

2424 | 0 | oldIsTargetByteDBCS = isTargetByteDBCS; |
2425 | 0 | isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF); |
2426 | | /* append the shift sequence */ |
2427 | 0 | if (oldIsTargetByteDBCS != isTargetByteDBCS ){ |
2428 | |

2429 | 0 | if (isTargetByteDBCS) |
2430 | 0 | *target++ = UCNV_SO; |
2431 | 0 | else |
2432 | 0 | *target++ = UCNV_SI; |
2433 | 0 | if(offsets) |
2434 | 0 | *(offsets++) = (int32_t)(source - args->source-1); /* index of the UTF-16 unit that was just read */ |
2435 | 0 | } |
2436 | | /* write the targetUniChar to target */ |
2437 | 0 | if(targetByteUnit <= 0x00FF){ |
2438 | 0 | if( target < targetLimit){ |
2439 | 0 | *(target++) = (unsigned char) targetByteUnit; |
2440 | 0 | if(offsets){ |
2441 | 0 | *(offsets++) = (int32_t)(source - args->source-1); |
2442 | 0 | } |
2443 | |

2444 | 0 | }else{ |
2445 | 0 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit); /* no room in target: keep the byte in charErrorBuffer */ |
2446 | 0 | *err = U_BUFFER_OVERFLOW_ERROR; |
2447 | 0 | } |
2448 | 0 | }else{ |
2449 | 0 | if(target < targetLimit){ |
2450 | 0 | *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80); /* lead byte minus 0x80; the range check above put it in a1..fe */ |
2451 | 0 | if(offsets){ |
2452 | 0 | *(offsets++) = (int32_t)(source - args->source-1); |
2453 | 0 | } |
2454 | 0 | if(target < targetLimit){ |
2455 | 0 | *(target++) =(unsigned char) (targetByteUnit -0x80); |
2456 | 0 | if(offsets){ |
2457 | 0 | *(offsets++) = (int32_t)(source - args->source-1); |
2458 | 0 | } |
2459 | 0 | }else{ |
2460 | 0 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80); |
2461 | 0 | *err = U_BUFFER_OVERFLOW_ERROR; |
2462 | 0 | } |
2463 | 0 | }else{ |
2464 | 0 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80); |
2465 | 0 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80); |
2466 | 0 | *err = U_BUFFER_OVERFLOW_ERROR; |
2467 | 0 | } |
2468 | 0 | } |
2469 | |

2470 | 0 | } |
2471 | 0 | else{ |
2472 | | /* oops.. the code point is unassigned |
2473 | | * set the error and reason |
2474 | | */ |
2475 | | |
2476 | | /*check if the char is a First surrogate*/ |
2477 | 0 | if(U16_IS_SURROGATE(sourceChar)) { |
2478 | 0 | if(U16_IS_SURROGATE_LEAD(sourceChar)) { |
2479 | 0 | getTrail: |
2480 | | /*look ahead to find the trail surrogate*/ |
2481 | 0 | if(source < sourceLimit) { |
2482 | | /* test the following code unit */ |
2483 | 0 | UChar trail=(UChar) *source; |
2484 | 0 | if(U16_IS_TRAIL(trail)) { |
2485 | 0 | source++; |
2486 | 0 | sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); |
2487 | 0 | *err = U_INVALID_CHAR_FOUND; |
2488 | | /* convert this surrogate code point */ |
2489 | | /* exit this condition tree */ |
2490 | 0 | } else { |
2491 | | /* this is an unmatched lead code unit (1st surrogate) */ |
2492 | | /* callback(illegal) */ |
2493 | 0 | *err=U_ILLEGAL_CHAR_FOUND; |
2494 | 0 | } |
2495 | 0 | } else { |
2496 | | /* no more input */ |
2497 | 0 | *err = U_ZERO_ERROR; |
2498 | 0 | } |
2499 | 0 | } else { |
2500 | | /* this is an unmatched trail code unit (2nd surrogate) */ |
2501 | | /* callback(illegal) */ |
2502 | 0 | *err=U_ILLEGAL_CHAR_FOUND; |
2503 | 0 | } |
2504 | 0 | } else { |
2505 | | /* callback(unassigned) for a BMP code point */ |
2506 | 0 | *err = U_INVALID_CHAR_FOUND; |
2507 | 0 | } |
2508 | |

2509 | 0 | args->converter->fromUChar32=sourceChar; /* remember sourceChar in the converter before leaving the loop */ |
2510 | 0 | break; |
2511 | 0 | } |
2512 | 0 | } /* end if(target < targetLimit) */ |
2513 | 0 | else{ |
2514 | 0 | *err =U_BUFFER_OVERFLOW_ERROR; |
2515 | 0 | break; |
2516 | 0 | } |
2517 | |

2518 | 0 | }/* end while(source < sourceLimit) */ |
2519 | | |
2520 | | /* |
2521 | | * the end of the input stream and detection of truncated input |
2522 | | * are handled by the framework, but for ISO-2022-KR conversion |
2523 | | * we need to be in ASCII mode at the very end |
2524 | | * |
2525 | | * conditions: |
2526 | | * successful |
2527 | | * not in ASCII mode |
2528 | | * end of input and no truncated input |
2529 | | */ |
2530 | 0 | if( U_SUCCESS(*err) && |
2531 | 0 | isTargetByteDBCS && |
2532 | 0 | args->flush && source>=sourceLimit && args->converter->fromUChar32==0 |
2533 | 0 | ) { |
2534 | 0 | int32_t sourceIndex; |
2535 | | |
2536 | | /* we are switching to ASCII */ |
2537 | 0 | isTargetByteDBCS=FALSE; |
2538 | | |
2539 | | /* get the source index of the last input character */ |
2540 | | /* |
2541 | | * TODO this would be simpler and more reliable if we used a pair |
2542 | | * of sourceIndex/prevSourceIndex like in ucnvmbcs.c |
2543 | | * so that we could simply use the prevSourceIndex here; |
2544 | | * this code gives an incorrect result for the rare case of an unmatched |
2545 | | * trail surrogate that is alone in the last buffer of the text stream |
2546 | | */ |
2547 | 0 | sourceIndex=(int32_t)(source-args->source); |
2548 | 0 | if(sourceIndex>0) { |
2549 | 0 | --sourceIndex; |
2550 | 0 | if( U16_IS_TRAIL(args->source[sourceIndex]) && |
2551 | 0 | (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) |
2552 | 0 | ) { |
2553 | 0 | --sourceIndex; |
2554 | 0 | } |
2555 | 0 | } else { |
2556 | 0 | sourceIndex=-1; |
2557 | 0 | } |
2558 | |

2559 | 0 | fromUWriteUInt8( |
2560 | 0 | args->converter, |
2561 | 0 | SHIFT_IN_STR, 1, |
2562 | 0 | &target, (const char *)targetLimit, |
2563 | 0 | &offsets, sourceIndex, |
2564 | 0 | err); |
2565 | 0 | } |
2566 | | |
2567 | | /*save the state and return */ |
2568 | 0 | args->source = source; |
2569 | 0 | args->target = (char*)target; |
2570 | 0 | args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS; |
2571 | 0 | } |
2572 | | |
2573 | | /************************ To Unicode ***************************************/ |
2574 | | |
2575 | | static void U_CALLCONV /* version==1 path of the ISO-2022-KR toUnicode function: feeds each stretch of input up to the next escape to ucnv_MBCSToUnicodeWithOffsets() on the sub-converter, then lets changeState_2022() handle the escape */ |
2576 | | UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args, |
2577 | 0 | UErrorCode* err){ |
2578 | 0 | char const* sourceStart; |
2579 | 0 | UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
2580 | |

2581 | 0 | UConverterToUnicodeArgs subArgs; |
2582 | 0 | int32_t minArgsSize; |
2583 | | |
2584 | | /* set up the subconverter arguments */ |
2585 | 0 | if(args->size<sizeof(UConverterToUnicodeArgs)) { /* copy no more than the smaller of args->size and this build's struct size */ |
2586 | 0 | minArgsSize = args->size; |
2587 | 0 | } else { |
2588 | 0 | minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs); |
2589 | 0 | } |
2590 | |

2591 | 0 | uprv_memcpy(&subArgs, args, minArgsSize); |
2592 | 0 | subArgs.size = (uint16_t)minArgsSize; |
2593 | 0 | subArgs.converter = myData->currentConverter; |
2594 | | |
2595 | | /* remember the original start of the input for offsets */ |
2596 | 0 | sourceStart = args->source; |
2597 | |

2598 | 0 | if(myData->key != 0) { |
2599 | | /* continue with a partial escape sequence */ |
2600 | 0 | goto escape; |
2601 | 0 | } |
2602 | | |
2603 | 0 | while(U_SUCCESS(*err) && args->source < args->sourceLimit) { |
2604 | | /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ |
2605 | 0 | subArgs.source = args->source; |
2606 | 0 | subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush); |
2607 | 0 | if(subArgs.source != subArgs.sourceLimit) { |
2608 | | /* |
2609 | | * get the current partial byte sequence |
2610 | | * |
2611 | | * it needs to be moved between the public and the subconverter |
2612 | | * so that the conversion framework, which only sees the public |
2613 | | * converter, can handle truncated and illegal input etc. |
2614 | | */ |
2615 | 0 | if(args->converter->toULength > 0) { |
2616 | 0 | uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength); |
2617 | 0 | } |
2618 | 0 | subArgs.converter->toULength = args->converter->toULength; |
2619 | | |
2620 | | /* |
2621 | | * Convert up to the end of the input, or to before the next escape character. |
2622 | | * Does not handle conversion extensions because the preToU[] state etc. |
2623 | | * is not copied. |
2624 | | */ |
2625 | 0 | ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); |
2626 | |

2627 | 0 | if(args->offsets != NULL && sourceStart != args->source) { |
2628 | | /* update offsets to base them on the actual start of the input */ |
2629 | 0 | int32_t *offsets = args->offsets; |
2630 | 0 | UChar *target = args->target; |
2631 | 0 | int32_t delta = (int32_t)(args->source - sourceStart); /* distance from the original input start to args->source */ |
2632 | 0 | while(target < subArgs.target) { /* one offset entry per UChar written by the sub-conversion */ |
2633 | 0 | if(*offsets >= 0) { /* negative offsets are left unchanged */ |
2634 | 0 | *offsets += delta; |
2635 | 0 | } |
2636 | 0 | ++offsets; |
2637 | 0 | ++target; |
2638 | 0 | } |
2639 | 0 | } |
2640 | 0 | args->source = subArgs.source; |
2641 | 0 | args->target = subArgs.target; |
2642 | 0 | args->offsets = subArgs.offsets; |
2643 | | |
2644 | | /* copy input/error/overflow buffers */ |
2645 | 0 | if(subArgs.converter->toULength > 0) { |
2646 | 0 | uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength); |
2647 | 0 | } |
2648 | 0 | args->converter->toULength = subArgs.converter->toULength; |
2649 | |

2650 | 0 | if(*err == U_BUFFER_OVERFLOW_ERROR) { |
2651 | 0 | if(subArgs.converter->UCharErrorBufferLength > 0) { |
2652 | 0 | uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer, |
2653 | 0 | subArgs.converter->UCharErrorBufferLength); |
2654 | 0 | } |
2655 | 0 | args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength; |
2656 | 0 | subArgs.converter->UCharErrorBufferLength = 0; |
2657 | 0 | } |
2658 | 0 | } |
2659 | |

2660 | 0 | if (U_FAILURE(*err) || (args->source == args->sourceLimit)) { |
2661 | 0 | return; |
2662 | 0 | } |
2663 | | |
2664 | 0 | escape: |
2665 | 0 | changeState_2022(args->converter, |
2666 | 0 | &(args->source), |
2667 | 0 | args->sourceLimit, |
2668 | 0 | ISO_2022_KR, |
2669 | 0 | err); |
2670 | 0 | } |
2671 | 0 | } |
2672 | | |
2673 | | static void U_CALLCONV /* ISO-2022-KR to UTF-16: SI/SO set toU2022State.g to 0/1, ESC is passed to changeState_2022(), and while g==1 byte pairs are looked up with 0x80 added to each byte */ |
2674 | | UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, |
2675 | 0 | UErrorCode* err){ |
2676 | 0 | char tempBuf[2]; |
2677 | 0 | const char *mySource = ( char *) args->source; |
2678 | 0 | UChar *myTarget = args->target; |
2679 | 0 | const char *mySourceLimit = args->sourceLimit; |
2680 | 0 | UChar32 targetUniChar = 0x0000; |
2681 | 0 | UChar mySourceChar = 0x0000; |
2682 | 0 | UConverterDataISO2022* myData; |
2683 | 0 | UConverterSharedData* sharedData ; |
2684 | 0 | UBool useFallback; |
2685 | |

2686 | 0 | myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
2687 | 0 | if(myData->version==1){ /* version 1 is handled entirely by the _IBM variant */ |
2688 | 0 | UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); |
2689 | 0 | return; |
2690 | 0 | } |
2691 | | |
2692 | | /* initialize state */ |
2693 | 0 | sharedData = myData->currentConverter->sharedData; |
2694 | 0 | useFallback = args->converter->useFallback; |
2695 | |

2696 | 0 | if(myData->key != 0) { |
2697 | | /* continue with a partial escape sequence */ |
2698 | 0 | goto escape; |
2699 | 0 | } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) { |
2700 | | /* continue with a partial double-byte character */ |
2701 | 0 | mySourceChar = args->converter->toUBytes[0]; |
2702 | 0 | args->converter->toULength = 0; |
2703 | 0 | goto getTrailByte; |
2704 | 0 | } |
2705 | | |
2706 | 0 | while(mySource< mySourceLimit){ |
2707 | |

2708 | 0 | if(myTarget < args->targetLimit){ |
2709 | |

2710 | 0 | mySourceChar= (unsigned char) *mySource++; |
2711 | |

2712 | 0 | if(mySourceChar==UCNV_SI){ |
2713 | 0 | myData->toU2022State.g = 0; |
2714 | 0 | if (myData->isEmptySegment) { |
2715 | 0 | myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */ |
2716 | 0 | *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
2717 | 0 | args->converter->toUCallbackReason = UCNV_IRREGULAR; |
2718 | 0 | args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
2719 | 0 | args->converter->toULength = 1; |
2720 | 0 | args->target = myTarget; |
2721 | 0 | args->source = mySource; |
2722 | 0 | return; |
2723 | 0 | } |
2724 | | /*consume the source */ |
2725 | 0 | continue; |
2726 | 0 | }else if(mySourceChar==UCNV_SO){ |
2727 | 0 | myData->toU2022State.g = 1; |
2728 | 0 | myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */ |
2729 | | /*consume the source */ |
2730 | 0 | continue; |
2731 | 0 | }else if(mySourceChar==ESC_2022){ |
2732 | 0 | mySource--; /* step back so changeState_2022() starts at the ESC byte */ |
2733 | 0 | escape: |
2734 | 0 | myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */ |
2735 | 0 | changeState_2022(args->converter,&(mySource), |
2736 | 0 | mySourceLimit, ISO_2022_KR, err); |
2737 | 0 | if(U_FAILURE(*err)){ |
2738 | 0 | args->target = myTarget; |
2739 | 0 | args->source = mySource; |
2740 | 0 | return; |
2741 | 0 | } |
2742 | 0 | continue; |
2743 | 0 | } |
2744 | | |
2745 | 0 | myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */ |
2746 | 0 | if(myData->toU2022State.g == 1) { |
2747 | 0 | if(mySource < mySourceLimit) { |
2748 | 0 | int leadIsOk, trailIsOk; |
2749 | 0 | uint8_t trailByte; |
2750 | 0 | getTrailByte: |
2751 | 0 | targetUniChar = missingCharMarker; |
2752 | 0 | trailByte = (uint8_t)*mySource; |
2753 | | /* |
2754 | | * Ticket 5691: consistent illegal sequences: |
2755 | | * - We include at least the first byte in the illegal sequence. |
2756 | | * - If any of the non-initial bytes could be the start of a character, |
2757 | | * we stop the illegal sequence before the first one of those. |
2758 | | * |
2759 | | * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is |
2760 | | * an ESC/SO/SI, we report only the first byte as the illegal sequence. |
2761 | | * Otherwise we convert or report the pair of bytes. |
2762 | | */ |
2763 | 0 | leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); |
2764 | 0 | trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); |
2765 | 0 | if (leadIsOk && trailIsOk) { |
2766 | 0 | ++mySource; |
2767 | 0 | tempBuf[0] = (char)(mySourceChar + 0x80); |
2768 | 0 | tempBuf[1] = (char)(trailByte + 0x80); |
2769 | 0 | targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback); |
2770 | 0 | mySourceChar = (mySourceChar << 8) | trailByte; |
2771 | 0 | } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { |
2772 | | /* report a pair of illegal bytes if the second byte is not a DBCS starter */ |
2773 | 0 | ++mySource; |
2774 | | /* add another bit so that the code below writes 2 bytes in case of error */ |
2775 | 0 | mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte; |
2776 | 0 | } |
2777 | 0 | } else { |
2778 | 0 | args->converter->toUBytes[0] = (uint8_t)mySourceChar; /* only the lead byte is available: save it; the toULength==1 check at the top resumes from here */ |
2779 | 0 | args->converter->toULength = 1; |
2780 | 0 | break; |
2781 | 0 | } |
2782 | 0 | } |
2783 | 0 | else if(mySourceChar <= 0x7f) { |
2784 | 0 | targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback); |
2785 | 0 | } else { |
2786 | 0 | targetUniChar = 0xffff; /* byte above 0x7f while g != 1: fails the < 0xfffe test below */ |
2787 | 0 | } |
2788 | 0 | if(targetUniChar < 0xfffe){ |
2789 | 0 | if(args->offsets) { |
2790 | 0 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); /* index of the character's first byte: 1 back for a single byte, 2 back for a pair */ |
2791 | 0 | } |
2792 | 0 | *(myTarget++)=(UChar)targetUniChar; |
2793 | 0 | } |
2794 | 0 | else { |
2795 | | /* Call the callback function*/ |
2796 | 0 | toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err); |
2797 | 0 | break; |
2798 | 0 | } |
2799 | 0 | } |
2800 | 0 | else{ |
2801 | 0 | *err =U_BUFFER_OVERFLOW_ERROR; |
2802 | 0 | break; |
2803 | 0 | } |
2804 | 0 | } |
2805 | 0 | args->target = myTarget; |
2806 | 0 | args->source = mySource; |
2807 | 0 | } |
2808 | | |
2809 | | /*************************** END ISO2022-KR *********************************/ |
2810 | | |
2811 | | /*************************** ISO-2022-CN ********************************* |
2812 | | * |
2813 | | * Rules for ISO-2022-CN Encoding: |
2814 | | * i) The designator sequence must appear once on a line before any instance |
2815 | | * of character set it designates. |
2816 | | * ii) If two lines contain characters from the same character set, both lines |
2817 | | * must include the designator sequence. |
2818 | | * iii) Once the designator sequence is known, a shifting sequence has to be found |
2819 | | * to invoke the shifting |
2820 | | * iv) All lines start in ASCII and end in ASCII. |
2821 | | * v) Four shifting sequences are employed for this purpose: |
2822 | | * |
2823 | | * Sequence ASCII Eq Charsets |
2824 | | * ---------- ------- --------- |
2825 | | * SI <SI> US-ASCII |
2826 | | * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165 |
2827 | | * SS2 <ESC>N CNS-11643-1992 Plane 2 |
2828 | | * SS3 <ESC>O CNS-11643-1992 Planes 3-7 |
2829 | | * |
2830 | | * vi) |
2831 | | * SOdesignator : ESC "$" ")" finalchar_for_SO |
2832 | | * SS2designator : ESC "$" "*" finalchar_for_SS2 |
2833 | | * SS3designator : ESC "$" "+" finalchar_for_SS3 |
2834 | | * |
2835 | | * ESC $ ) A Indicates the bytes following SO are Chinese |
2836 | | * characters as defined in GB 2312-80, until |
2837 | | * another SOdesignation appears |
2838 | | * |
2839 | | * |
2840 | | * ESC $ ) E Indicates the bytes following SO are as defined |
2841 | | * in ISO-IR-165 (for details, see section 2.1), |
2842 | | * until another SOdesignation appears |
2843 | | * |
2844 | | * ESC $ ) G Indicates the bytes following SO are as defined |
2845 | | * in CNS 11643-plane-1, until another |
2846 | | * SOdesignation appears |
2847 | | * |
2848 | | * ESC $ * H Indicates the two bytes immediately following |
2849 | | * SS2 is a Chinese character as defined in CNS |
2850 | | * 11643-plane-2, until another SS2designation |
2851 | | * appears |
2852 | | * (Meaning <ESC>N must precede every 2 byte |
2853 | | * sequence.) |
2854 | | * |
2855 | | * ESC $ + I Indicates the immediate two bytes following SS3 |
2856 | | * is a Chinese character as defined in CNS |
2857 | | * 11643-plane-3, until another SS3designation |
2858 | | * appears |
2859 | | * (Meaning <ESC>O must precede every 2 byte |
2860 | | * sequence.) |
2861 | | * |
2862 | | * ESC $ + J Indicates the immediate two bytes following SS3 |
2863 | | * is a Chinese character as defined in CNS |
2864 | | * 11643-plane-4, until another SS3designation |
2865 | | * appears |
2866 | | * (In English: <ESC>O must precede every 2 byte |
2867 | | * sequence.) |
2868 | | * |
2869 | | * ESC $ + K Indicates the immediate two bytes following SS3 |
2870 | | * is a Chinese character as defined in CNS |
2871 | | * 11643-plane-5, until another SS3designation |
2872 | | * appears |
2873 | | * |
2874 | | * ESC $ + L Indicates the immediate two bytes following SS3 |
2875 | | * is a Chinese character as defined in CNS |
2876 | | * 11643-plane-6, until another SS3designation |
2877 | | * appears |
2878 | | * |
2879 | | * ESC $ + M Indicates the immediate two bytes following SS3 |
2880 | | * is a Chinese character as defined in CNS |
2881 | | * 11643-plane-7, until another SS3designation |
2882 | | * appears |
2883 | | * |
2884 | | * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and |
2885 | | * has its own designation information before any Chinese characters |
2886 | | * appear |
2887 | | * |
2888 | | */ |
2889 | | |
2890 | | /* The following are defined this way to make the strings truly readonly */ |
2891 | | static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; /* ESC $ ) A */ |
2892 | | static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; /* ESC $ ) E */ |
2893 | | static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */ |
2894 | | static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */ |
2895 | | static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */ |
2896 | | static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */ |
2897 | | static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */ |
2898 | | static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */ |
2899 | | static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */ |
2900 | | |
2901 | | /********************** ISO2022-CN Data **************************/ |
2902 | | static const char* const escSeqCharsCN[10] ={ |
2903 | | SHIFT_IN_STR, /* 0 ASCII */ |
2904 | | GB_2312_80_STR, /* 1 GB2312_1 */ |
2905 | | ISO_IR_165_STR, /* 2 ISO_IR_165 */ |
2906 | | CNS_11643_1992_Plane_1_STR, /* 3 CNS 11643 plane 1 */ |
2907 | | CNS_11643_1992_Plane_2_STR, /* 4 CNS 11643 plane 2 */ |
2908 | | CNS_11643_1992_Plane_3_STR, /* 5 CNS 11643 plane 3 */ |
2909 | | CNS_11643_1992_Plane_4_STR, /* 6 CNS 11643 plane 4 */ |
2910 | | CNS_11643_1992_Plane_5_STR, /* 7 CNS 11643 plane 5 */ |
2911 | | CNS_11643_1992_Plane_6_STR, /* 8 CNS 11643 plane 6 */ |
2912 | | CNS_11643_1992_Plane_7_STR /* 9 CNS 11643 plane 7 */ |
2913 | | }; |
2914 | | |
2915 | | static void U_CALLCONV |
2916 | 0 | UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){ |
2917 | 0 | UConverter *cnv = args->converter; |
2918 | 0 | UConverterDataISO2022 *converterData; |
2919 | 0 | ISO2022State *pFromU2022State; |
2920 | 0 | uint8_t *target = (uint8_t *) args->target; |
2921 | 0 | const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; |
2922 | 0 | const UChar* source = args->source; |
2923 | 0 | const UChar* sourceLimit = args->sourceLimit; |
2924 | 0 | int32_t* offsets = args->offsets; |
2925 | 0 | UChar32 sourceChar; |
2926 | 0 | char buffer[8]; |
2927 | 0 | int32_t len; |
2928 | 0 | int8_t choices[3]; |
2929 | 0 | int32_t choiceCount; |
2930 | 0 | uint32_t targetValue = 0; |
2931 | 0 | UBool useFallback; |
2932 | | |
2933 | | /* set up the state */ |
2934 | 0 | converterData = (UConverterDataISO2022*)cnv->extraInfo; |
2935 | 0 | pFromU2022State = &converterData->fromU2022State; |
2936 | |
|
2937 | 0 | choiceCount = 0; |
2938 | | |
2939 | | /* check if the last codepoint of previous buffer was a lead surrogate*/ |
2940 | 0 | if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { |
2941 | 0 | goto getTrail; |
2942 | 0 | } |
2943 | | |
2944 | 0 | while( source < sourceLimit){ |
2945 | 0 | if(target < targetLimit){ |
2946 | |
|
2947 | 0 | sourceChar = *(source++); |
2948 | | /*check if the char is a First surrogate*/ |
2949 | 0 | if(U16_IS_SURROGATE(sourceChar)) { |
2950 | 0 | if(U16_IS_SURROGATE_LEAD(sourceChar)) { |
2951 | 0 | getTrail: |
2952 | | /*look ahead to find the trail surrogate*/ |
2953 | 0 | if(source < sourceLimit) { |
2954 | | /* test the following code unit */ |
2955 | 0 | UChar trail=(UChar) *source; |
2956 | 0 | if(U16_IS_TRAIL(trail)) { |
2957 | 0 | source++; |
2958 | 0 | sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail); |
2959 | 0 | cnv->fromUChar32=0x00; |
2960 | | /* convert this supplementary code point */ |
2961 | | /* exit this condition tree */ |
2962 | 0 | } else { |
2963 | | /* this is an unmatched lead code unit (1st surrogate) */ |
2964 | | /* callback(illegal) */ |
2965 | 0 | *err=U_ILLEGAL_CHAR_FOUND; |
2966 | 0 | cnv->fromUChar32=sourceChar; |
2967 | 0 | break; |
2968 | 0 | } |
2969 | 0 | } else { |
2970 | | /* no more input */ |
2971 | 0 | cnv->fromUChar32=sourceChar; |
2972 | 0 | break; |
2973 | 0 | } |
2974 | 0 | } else { |
2975 | | /* this is an unmatched trail code unit (2nd surrogate) */ |
2976 | | /* callback(illegal) */ |
2977 | 0 | *err=U_ILLEGAL_CHAR_FOUND; |
2978 | 0 | cnv->fromUChar32=sourceChar; |
2979 | 0 | break; |
2980 | 0 | } |
2981 | 0 | } |
2982 | | |
2983 | | /* do the conversion */ |
2984 | 0 | if(sourceChar <= 0x007f ){ |
2985 | | /* do not convert SO/SI/ESC */ |
2986 | 0 | if(IS_2022_CONTROL(sourceChar)) { |
2987 | | /* callback(illegal) */ |
2988 | 0 | *err=U_ILLEGAL_CHAR_FOUND; |
2989 | 0 | cnv->fromUChar32=sourceChar; |
2990 | 0 | break; |
2991 | 0 | } |
2992 | | |
2993 | | /* US-ASCII */ |
2994 | 0 | if(pFromU2022State->g == 0) { |
2995 | 0 | buffer[0] = (char)sourceChar; |
2996 | 0 | len = 1; |
2997 | 0 | } else { |
2998 | 0 | buffer[0] = UCNV_SI; |
2999 | 0 | buffer[1] = (char)sourceChar; |
3000 | 0 | len = 2; |
3001 | 0 | pFromU2022State->g = 0; |
3002 | 0 | choiceCount = 0; |
3003 | 0 | } |
3004 | 0 | if(sourceChar == CR || sourceChar == LF) { |
3005 | | /* reset the state at the end of a line */ |
3006 | 0 | uprv_memset(pFromU2022State, 0, sizeof(ISO2022State)); |
3007 | 0 | choiceCount = 0; |
3008 | 0 | } |
3009 | 0 | } |
3010 | 0 | else{ |
3011 | | /* convert U+0080..U+10ffff */ |
3012 | 0 | int32_t i; |
3013 | 0 | int8_t cs, g; |
3014 | |
|
3015 | 0 | if(choiceCount == 0) { |
3016 | | /* try the current SO/G1 converter first */ |
3017 | 0 | choices[0] = pFromU2022State->cs[1]; |
3018 | | |
3019 | | /* default to GB2312_1 if none is designated yet */ |
3020 | 0 | if(choices[0] == 0) { |
3021 | 0 | choices[0] = GB2312_1; |
3022 | 0 | } |
3023 | |
|
3024 | 0 | if(converterData->version == 0) { |
3025 | | /* ISO-2022-CN */ |
3026 | | |
3027 | | /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */ |
3028 | 0 | if(choices[0] == GB2312_1) { |
3029 | 0 | choices[1] = (int8_t)CNS_11643_1; |
3030 | 0 | } else { |
3031 | 0 | choices[1] = (int8_t)GB2312_1; |
3032 | 0 | } |
3033 | |
|
3034 | 0 | choiceCount = 2; |
3035 | 0 | } else if (converterData->version == 1) { |
3036 | | /* ISO-2022-CN-EXT */ |
3037 | | |
3038 | | /* try one of the other converters */ |
3039 | 0 | switch(choices[0]) { |
3040 | 0 | case GB2312_1: |
3041 | 0 | choices[1] = (int8_t)CNS_11643_1; |
3042 | 0 | choices[2] = (int8_t)ISO_IR_165; |
3043 | 0 | break; |
3044 | 0 | case ISO_IR_165: |
3045 | 0 | choices[1] = (int8_t)GB2312_1; |
3046 | 0 | choices[2] = (int8_t)CNS_11643_1; |
3047 | 0 | break; |
3048 | 0 | default: /* CNS_11643_x */ |
3049 | 0 | choices[1] = (int8_t)GB2312_1; |
3050 | 0 | choices[2] = (int8_t)ISO_IR_165; |
3051 | 0 | break; |
3052 | 0 | } |
3053 | | |
3054 | 0 | choiceCount = 3; |
3055 | 0 | } else { |
3056 | 0 | choices[0] = (int8_t)CNS_11643_1; |
3057 | 0 | choices[1] = (int8_t)GB2312_1; |
3058 | 0 | } |
3059 | 0 | } |
3060 | | |
3061 | 0 | cs = g = 0; |
3062 | | /* |
3063 | | * len==0: no mapping found yet |
3064 | | * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks |
3065 | | * len>0: found a roundtrip result, done |
3066 | | */ |
3067 | 0 | len = 0; |
3068 | | /* |
3069 | | * We will turn off useFallback after finding a fallback, |
3070 | | * but we still get fallbacks from PUA code points as usual. |
3071 | | * Therefore, we will also need to check that we don't overwrite |
3072 | | * an early fallback with a later one. |
3073 | | */ |
3074 | 0 | useFallback = cnv->useFallback; |
3075 | |
|
3076 | 0 | for(i = 0; i < choiceCount && len <= 0; ++i) { |
3077 | 0 | int8_t cs0 = choices[i]; |
3078 | 0 | if(cs0 > 0) { |
3079 | 0 | uint32_t value; |
3080 | 0 | int32_t len2; |
3081 | 0 | if(cs0 >= CNS_11643_0) { |
3082 | 0 | len2 = MBCS_FROM_UCHAR32_ISO2022( |
3083 | 0 | converterData->myConverterArray[CNS_11643], |
3084 | 0 | sourceChar, |
3085 | 0 | &value, |
3086 | 0 | useFallback, |
3087 | 0 | MBCS_OUTPUT_3); |
3088 | 0 | if(len2 == 3 || (len2 == -3 && len == 0)) { |
3089 | 0 | targetValue = value; |
3090 | 0 | cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80); |
3091 | 0 | if(len2 >= 0) { |
3092 | 0 | len = 2; |
3093 | 0 | } else { |
3094 | 0 | len = -2; |
3095 | 0 | useFallback = FALSE; |
3096 | 0 | } |
3097 | 0 | if(cs == CNS_11643_1) { |
3098 | 0 | g = 1; |
3099 | 0 | } else if(cs == CNS_11643_2) { |
3100 | 0 | g = 2; |
3101 | 0 | } else /* plane 3..7 */ if(converterData->version == 1) { |
3102 | 0 | g = 3; |
3103 | 0 | } else { |
3104 | | /* ISO-2022-CN (without -EXT) does not support plane 3..7 */ |
3105 | 0 | len = 0; |
3106 | 0 | } |
3107 | 0 | } |
3108 | 0 | } else { |
3109 | | /* GB2312_1 or ISO-IR-165 */ |
3110 | 0 | U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS); |
3111 | 0 | len2 = MBCS_FROM_UCHAR32_ISO2022( |
3112 | 0 | converterData->myConverterArray[cs0], |
3113 | 0 | sourceChar, |
3114 | 0 | &value, |
3115 | 0 | useFallback, |
3116 | 0 | MBCS_OUTPUT_2); |
3117 | 0 | if(len2 == 2 || (len2 == -2 && len == 0)) { |
3118 | 0 | targetValue = value; |
3119 | 0 | len = len2; |
3120 | 0 | cs = cs0; |
3121 | 0 | g = 1; |
3122 | 0 | useFallback = FALSE; |
3123 | 0 | } |
3124 | 0 | } |
3125 | 0 | } |
3126 | 0 | } |
3127 | |
|
3128 | 0 | if(len != 0) { |
3129 | 0 | len = 0; /* count output bytes; it must have been abs(len) == 2 */ |
3130 | | |
3131 | | /* write the designation sequence if necessary */ |
3132 | 0 | if(cs != pFromU2022State->cs[g]) { |
3133 | 0 | if(cs < CNS_11643) { |
3134 | 0 | uprv_memcpy(buffer, escSeqCharsCN[cs], 4); |
3135 | 0 | } else { |
3136 | 0 | U_ASSERT(cs >= CNS_11643_1); |
3137 | 0 | uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4); |
3138 | 0 | } |
3139 | 0 | len = 4; |
3140 | 0 | pFromU2022State->cs[g] = cs; |
3141 | 0 | if(g == 1) { |
3142 | | /* changing the SO/G1 charset invalidates the choices[] */ |
3143 | 0 | choiceCount = 0; |
3144 | 0 | } |
3145 | 0 | } |
3146 | | |
3147 | | /* write the shift sequence if necessary */ |
3148 | 0 | if(g != pFromU2022State->g) { |
3149 | 0 | switch(g) { |
3150 | 0 | case 1: |
3151 | 0 | buffer[len++] = UCNV_SO; |
3152 | | |
3153 | | /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */ |
3154 | 0 | pFromU2022State->g = 1; |
3155 | 0 | break; |
3156 | 0 | case 2: |
3157 | 0 | buffer[len++] = 0x1b; |
3158 | 0 | buffer[len++] = 0x4e; |
3159 | 0 | break; |
3160 | 0 | default: /* case 3 */ |
3161 | 0 | buffer[len++] = 0x1b; |
3162 | 0 | buffer[len++] = 0x4f; |
3163 | 0 | break; |
3164 | 0 | } |
3165 | 0 | } |
3166 | | |
3167 | | /* write the two output bytes */ |
3168 | 0 | buffer[len++] = (char)(targetValue >> 8); |
3169 | 0 | buffer[len++] = (char)targetValue; |
3170 | 0 | } else { |
3171 | | /* if we cannot find the character after checking all codepages |
3172 | | * then this is an error |
3173 | | */ |
3174 | 0 | *err = U_INVALID_CHAR_FOUND; |
3175 | 0 | cnv->fromUChar32=sourceChar; |
3176 | 0 | break; |
3177 | 0 | } |
3178 | 0 | } |
3179 | | |
3180 | | /* output len>0 bytes in buffer[] */ |
3181 | 0 | if(len == 1) { |
3182 | 0 | *target++ = buffer[0]; |
3183 | 0 | if(offsets) { |
3184 | 0 | *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */ |
3185 | 0 | } |
3186 | 0 | } else if(len == 2 && (target + 2) <= targetLimit) { |
3187 | 0 | *target++ = buffer[0]; |
3188 | 0 | *target++ = buffer[1]; |
3189 | 0 | if(offsets) { |
3190 | 0 | int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar)); |
3191 | 0 | *offsets++ = sourceIndex; |
3192 | 0 | *offsets++ = sourceIndex; |
3193 | 0 | } |
3194 | 0 | } else { |
3195 | 0 | fromUWriteUInt8( |
3196 | 0 | cnv, |
3197 | 0 | buffer, len, |
3198 | 0 | &target, (const char *)targetLimit, |
3199 | 0 | &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)), |
3200 | 0 | err); |
3201 | 0 | if(U_FAILURE(*err)) { |
3202 | 0 | break; |
3203 | 0 | } |
3204 | 0 | } |
3205 | 0 | } /* end if(myTargetIndex<myTargetLength) */ |
3206 | 0 | else{ |
3207 | 0 | *err =U_BUFFER_OVERFLOW_ERROR; |
3208 | 0 | break; |
3209 | 0 | } |
3210 | |
|
3211 | 0 | }/* end while(mySourceIndex<mySourceLength) */ |
3212 | | |
3213 | | /* |
3214 | | * the end of the input stream and detection of truncated input |
3215 | | * are handled by the framework, but for ISO-2022-CN conversion |
3216 | | * we need to be in ASCII mode at the very end |
3217 | | * |
3218 | | * conditions: |
3219 | | * successful |
3220 | | * not in ASCII mode |
3221 | | * end of input and no truncated input |
3222 | | */ |
3223 | 0 | if( U_SUCCESS(*err) && |
3224 | 0 | pFromU2022State->g!=0 && |
3225 | 0 | args->flush && source>=sourceLimit && cnv->fromUChar32==0 |
3226 | 0 | ) { |
3227 | 0 | int32_t sourceIndex; |
3228 | | |
3229 | | /* we are switching to ASCII */ |
3230 | 0 | pFromU2022State->g=0; |
3231 | | |
3232 | | /* get the source index of the last input character */ |
3233 | | /* |
3234 | | * TODO this would be simpler and more reliable if we used a pair |
3235 | | * of sourceIndex/prevSourceIndex like in ucnvmbcs.c |
3236 | | * so that we could simply use the prevSourceIndex here; |
3237 | | * this code gives an incorrect result for the rare case of an unmatched |
3238 | | * trail surrogate that is alone in the last buffer of the text stream |
3239 | | */ |
3240 | 0 | sourceIndex=(int32_t)(source-args->source); |
3241 | 0 | if(sourceIndex>0) { |
3242 | 0 | --sourceIndex; |
3243 | 0 | if( U16_IS_TRAIL(args->source[sourceIndex]) && |
3244 | 0 | (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) |
3245 | 0 | ) { |
3246 | 0 | --sourceIndex; |
3247 | 0 | } |
3248 | 0 | } else { |
3249 | 0 | sourceIndex=-1; |
3250 | 0 | } |
3251 | |
|
3252 | 0 | fromUWriteUInt8( |
3253 | 0 | cnv, |
3254 | 0 | SHIFT_IN_STR, 1, |
3255 | 0 | &target, (const char *)targetLimit, |
3256 | 0 | &offsets, sourceIndex, |
3257 | 0 | err); |
3258 | 0 | } |
3259 | | |
3260 | | /*save the state and return */ |
3261 | 0 | args->source = source; |
3262 | 0 | args->target = (char*)target; |
3263 | 0 | } |
3264 | | /* Converts ISO-2022-CN input bytes (args->source) to UTF-16 (args->target), handling SI, SO, escape sequences and CR/LF state resets. */
3265 | | /* Progress is stored back into args->source/args->target; failures are reported through *err, and incomplete input is saved in the converter for the next call. */
3266 | | static void U_CALLCONV
3267 | | UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
3268 | 0 | UErrorCode* err){
3269 | 0 | char tempBuf[3]; /* 2 input bytes, or 3 when a CNS 11643 selector byte is prepended below */
3270 | 0 | const char *mySource = (char *) args->source;
3271 | 0 | UChar *myTarget = args->target;
3272 | 0 | const char *mySourceLimit = args->sourceLimit;
3273 | 0 | uint32_t targetUniChar = 0x0000;
3274 | 0 | uint32_t mySourceChar = 0x0000;
3275 | 0 | UConverterDataISO2022* myData;
3276 | 0 | ISO2022State *pToU2022State;
3277 | 0 |

3278 | 0 | myData=(UConverterDataISO2022*)(args->converter->extraInfo);
3279 | 0 | pToU2022State = &myData->toU2022State;
3280 | 0 |

3281 | 0 | if(myData->key != 0) {
3282 | | /* continue with a partial escape sequence (myData->key is nonzero on entry) */
3283 | 0 | goto escape;
3284 | 0 | } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
3285 | | /* continue with a partial double-byte character: the lead byte was saved in toUBytes[0] */
3286 | 0 | mySourceChar = args->converter->toUBytes[0];
3287 | 0 | args->converter->toULength = 0;
3288 | 0 | targetUniChar = missingCharMarker;
3289 | 0 | goto getTrailByte;
3290 | 0 | }
3291 | |
3292 | 0 | while(mySource < mySourceLimit){
3293 | 0 |

3294 | 0 | targetUniChar =missingCharMarker;
3295 | 0 |

3296 | 0 | if(myTarget < args->targetLimit){
3297 | 0 |

3298 | 0 | mySourceChar= (unsigned char) *mySource++;
3299 | 0 |

3300 | 0 | switch(mySourceChar){
3301 | 0 | case UCNV_SI:
3302 | 0 | pToU2022State->g=0; /* g==0 takes the single-byte branch of the default case */
3303 | 0 | if (myData->isEmptySegment) {
3304 | 0 | myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
3305 | 0 | *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3306 | 0 | args->converter->toUCallbackReason = UCNV_IRREGULAR;
3307 | 0 | args->converter->toUBytes[0] = mySourceChar;
3308 | 0 | args->converter->toULength = 1;
3309 | 0 | args->target = myTarget;
3310 | 0 | args->source = mySource;
3311 | 0 | return;
3312 | 0 | }
3313 | 0 | continue;
3314 | |
3315 | 0 | case UCNV_SO:
3316 | 0 | if(pToU2022State->cs[1] != 0) {
3317 | 0 | pToU2022State->g=1;
3318 | 0 | myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
3319 | 0 | continue;
3320 | 0 | } else {
3321 | | /* illegal to have SO before a matching designator */
3322 | 0 | myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
3323 | 0 | break; /* targetUniChar is still missingCharMarker, so the callback branch after the switch reports the SO byte */
3324 | 0 | }
3325 | |
3326 | 0 | case ESC_2022:
3327 | 0 | mySource--; /* step back so that changeState_2022() starts at the ESC byte */
3328 | 0 | escape:
3329 | 0 | {
3330 | 0 | const char * mySourceBefore = mySource;
3331 | 0 | int8_t toULengthBefore = args->converter->toULength;
3332 | 0 |

3333 | 0 | changeState_2022(args->converter,&(mySource),
3334 | 0 | mySourceLimit, ISO_2022_CN,err);
3335 | |
3336 | | /* After SO there must be at least one character before a designator (designator error handled separately) */
3337 | 0 | if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
3338 | 0 | *err = U_ILLEGAL_ESCAPE_SEQUENCE;
3339 | 0 | args->converter->toUCallbackReason = UCNV_IRREGULAR;
3340 | 0 | args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore)); /* previous length plus the bytes consumed by this call */
3341 | 0 | }
3342 | 0 | }
3343 | |
3344 | | /* invalid or illegal escape sequence */
3345 | 0 | if(U_FAILURE(*err)){
3346 | 0 | args->target = myTarget;
3347 | 0 | args->source = mySource;
3348 | 0 | myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
3349 | 0 | return;
3350 | 0 | }
3351 | 0 | continue;
3352 | |
3353 | | /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
3354 | |
3355 | 0 | case CR:
3356 | 0 | case LF:
3357 | 0 | uprv_memset(pToU2022State, 0, sizeof(ISO2022State)); /* CR/LF zeroes the whole toUnicode state before the byte itself is converted below */
3358 | 0 | U_FALLTHROUGH;
3359 | 0 | default:
3360 | | /* convert one or two bytes */
3361 | 0 | myData->isEmptySegment = FALSE;
3362 | 0 | if(pToU2022State->g != 0) {
3363 | 0 | if(mySource < mySourceLimit) {
3364 | 0 | UConverterSharedData *cnv;
3365 | 0 | StateEnum tempState;
3366 | 0 | int32_t tempBufLen;
3367 | 0 | int leadIsOk, trailIsOk;
3368 | 0 | uint8_t trailByte;
3369 | 0 | getTrailByte:
3370 | 0 | trailByte = (uint8_t)*mySource;
3371 | | /*
3372 | | * Ticket 5691: consistent illegal sequences:
3373 | | * - We include at least the first byte in the illegal sequence.
3374 | | * - If any of the non-initial bytes could be the start of a character,
3375 | | * we stop the illegal sequence before the first one of those.
3376 | | *
3377 | | * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
3378 | | * an ESC/SO/SI, we report only the first byte as the illegal sequence.
3379 | | * Otherwise we convert or report the pair of bytes.
3380 | | */
3381 | 0 | leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
3382 | 0 | trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
3383 | 0 | if (leadIsOk && trailIsOk) {
3384 | 0 | ++mySource;
3385 | 0 | tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
3386 | 0 | if(tempState >= CNS_11643_0) {
3387 | 0 | cnv = myData->myConverterArray[CNS_11643];
3388 | 0 | tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0)); /* selector byte derived from the designated CNS_11643_x state */
3389 | 0 | tempBuf[1] = (char) (mySourceChar);
3390 | 0 | tempBuf[2] = (char) trailByte;
3391 | 0 | tempBufLen = 3;
3392 | 0 |

3393 | 0 | }else{
3394 | 0 | U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
3395 | 0 | cnv = myData->myConverterArray[tempState];
3396 | 0 | tempBuf[0] = (char) (mySourceChar);
3397 | 0 | tempBuf[1] = (char) trailByte;
3398 | 0 | tempBufLen = 2;
3399 | 0 | }
3400 | 0 | targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
3401 | 0 | mySourceChar = (mySourceChar << 8) | trailByte; /* keep both bytes: used for the offsets and the callback below */
3402 | 0 | } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
3403 | | /* report a pair of illegal bytes if the second byte is not a DBCS starter */
3404 | 0 | ++mySource;
3405 | | /* add another bit so that the code below writes 2 bytes in case of error */
3406 | 0 | mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
3407 | 0 | }
3408 | 0 | if(pToU2022State->g>=2) {
3409 | | /* return from a single-shift state to the previous one */
3410 | 0 | pToU2022State->g=pToU2022State->prevG;
3411 | 0 | }
3412 | 0 | } else {
3413 | 0 | args->converter->toUBytes[0] = (uint8_t)mySourceChar; /* lead byte is the last input byte: save it so the next call resumes at getTrailByte */
3414 | 0 | args->converter->toULength = 1;
3415 | 0 | goto endloop;
3416 | 0 | }
3417 | 0 | }
3418 | 0 | else{
3419 | 0 | if(mySourceChar <= 0x7f) {
3420 | 0 | targetUniChar = (UChar) mySourceChar;
3421 | 0 | }
3422 | 0 | }
3423 | 0 | break;
3424 | 0 | }
3425 | 0 | if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
3426 | 0 | if(args->offsets){
3427 | 0 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2)); /* index of the first source byte of this character */
3428 | 0 | }
3429 | 0 | *(myTarget++)=(UChar)targetUniChar;
3430 | 0 | }
3431 | 0 | else if(targetUniChar > missingCharMarker){
3432 | | /* disassemble the surrogate pair and write to output*/
3433 | 0 | targetUniChar-=0x0010000;
3434 | 0 | *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
3435 | 0 | if(args->offsets){
3436 | 0 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3437 | 0 | }
3438 | 0 | ++myTarget;
3439 | 0 | if(myTarget< args->targetLimit){
3440 | 0 | *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3441 | 0 | if(args->offsets){
3442 | 0 | args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
3443 | 0 | }
3444 | 0 | ++myTarget;
3445 | 0 | }else{
3446 | 0 | args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= /* no room in the target: store the trail surrogate in UCharErrorBuffer instead */
3447 | 0 | (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
3448 | 0 | }
3449 | 0 |

3450 | 0 | }
3451 | 0 | else{
3452 | | /* Call the callback function*/
3453 | 0 | toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
3454 | 0 | break;
3455 | 0 | }
3456 | 0 | }
3457 | 0 | else{
3458 | 0 | *err =U_BUFFER_OVERFLOW_ERROR;
3459 | 0 | break;
3460 | 0 | }
3461 | 0 | }
3462 | 0 | endloop:
3463 | 0 | args->target = myTarget;
3464 | 0 | args->source = mySource;
3465 | 0 | }
3466 | | #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */ |
3467 | | /* Writes the substitution character for an ISO 2022 converter, preceded by whatever SI/SO/escape bytes the current fromUnicode state requires; the KR version!=0 case delegates to the sub-converter. */
3468 | | static void U_CALLCONV
3469 | 0 | _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3470 | 0 | UConverter *cnv = args->converter;
3471 | 0 | UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3472 | 0 | ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3473 | 0 | char *p, *subchar;
3474 | 0 | char buffer[8]; /* the longest sequence built below is 5 bytes: SI + 3-byte escape + 1 subchar byte */
3475 | 0 | int32_t length;
3476 | 0 |

3477 | 0 | subchar=(char *)cnv->subChars;
3478 | 0 | length=cnv->subCharLen; /* assume length==1 for most variants */
3479 | 0 |

3480 | 0 | p = buffer;
3481 | 0 | switch(myConverterData->locale[0]){
3482 | 0 | case 'j':
3483 | 0 | {
3484 | 0 | int8_t cs;
3485 | 0 |

3486 | 0 | if(pFromU2022State->g == 1) {
3487 | | /* JIS7: switch from G1 to G0 */
3488 | 0 | pFromU2022State->g = 0;
3489 | 0 | *p++ = UCNV_SI;
3490 | 0 | }
3491 | 0 |

3492 | 0 | cs = pFromU2022State->cs[0];
3493 | 0 | if(cs != ASCII && cs != JISX201) {
3494 | | /* not in ASCII or JIS X 0201: switch to ASCII */
3495 | 0 | pFromU2022State->cs[0] = (int8_t)ASCII;
3496 | 0 | *p++ = '\x1b'; /* the three bytes written here are ESC ( B */
3497 | 0 | *p++ = '\x28';
3498 | 0 | *p++ = '\x42';
3499 | 0 | }
3500 | 0 |

3501 | 0 | *p++ = subchar[0];
3502 | 0 | break;
3503 | 0 | }
3504 | 0 | case 'c':
3505 | 0 | if(pFromU2022State->g != 0) {
3506 | | /* not in ASCII mode: switch to ASCII */
3507 | 0 | pFromU2022State->g = 0;
3508 | 0 | *p++ = UCNV_SI;
3509 | 0 | }
3510 | 0 | *p++ = subchar[0];
3511 | 0 | break;
3512 | 0 | case 'k':
3513 | 0 | if(myConverterData->version == 0) {
3514 | 0 | if(length == 1) {
3515 | 0 | if((UBool)args->converter->fromUnicodeStatus) {
3516 | | /* in DBCS mode: switch to SBCS */
3517 | 0 | args->converter->fromUnicodeStatus = 0;
3518 | 0 | *p++ = UCNV_SI;
3519 | 0 | }
3520 | 0 | *p++ = subchar[0];
3521 | 0 | } else /* length == 2*/ {
3522 | 0 | if(!(UBool)args->converter->fromUnicodeStatus) {
3523 | | /* in SBCS mode: switch to DBCS */
3524 | 0 | args->converter->fromUnicodeStatus = 1;
3525 | 0 | *p++ = UCNV_SO;
3526 | 0 | }
3527 | 0 | *p++ = subchar[0];
3528 | 0 | *p++ = subchar[1];
3529 | 0 | }
3530 | 0 | break;
3531 | 0 | } else {
3532 | | /* save the subconverter's substitution string */
3533 | 0 | uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3534 | 0 | int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3535 | |
3536 | | /* set our substitution string into the subconverter */
3537 | 0 | myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3538 | 0 | myConverterData->currentConverter->subCharLen = (int8_t)length;
3539 | |
3540 | | /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3541 | 0 | args->converter = myConverterData->currentConverter;
3542 | 0 | myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3543 | 0 | ucnv_cbFromUWriteSub(args, 0, err);
3544 | 0 | cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3545 | 0 | args->converter = cnv;
3546 | |
3547 | | /* restore the subconverter's substitution string */
3548 | 0 | myConverterData->currentConverter->subChars = currentSubChars;
3549 | 0 | myConverterData->currentConverter->subCharLen = currentSubCharLen;
3550 | 0 |

3551 | 0 | if(*err == U_BUFFER_OVERFLOW_ERROR) { /* move the bytes that did not fit from the subconverter's charErrorBuffer to this converter's */
3552 | 0 | if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3553 | 0 | uprv_memcpy(
3554 | 0 | cnv->charErrorBuffer,
3555 | 0 | myConverterData->currentConverter->charErrorBuffer,
3556 | 0 | myConverterData->currentConverter->charErrorBufferLength);
3557 | 0 | }
3558 | 0 | cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3559 | 0 | myConverterData->currentConverter->charErrorBufferLength = 0;
3560 | 0 | }
3561 | 0 | return; /* the subconverter already wrote the output, so the ucnv_cbFromUWriteBytes() call at the end is skipped */
3562 | 0 | }
3563 | 0 | default:
3564 | | /* not expected */
3565 | 0 | break; /* nothing was added to buffer, so zero bytes are passed to ucnv_cbFromUWriteBytes() below */
3566 | 0 | }
3567 | 0 | ucnv_cbFromUWriteBytes(args,
3568 | 0 | buffer, (int32_t)(p - buffer),
3569 | 0 | offsetIndex, err);
3570 | 0 | }
3571 | | |
3572 | | /* |
3573 | | * Structure for cloning an ISO 2022 converter into a single memory block. |
3574 | | * ucnv_safeClone() of the converter will align the entire cloneStruct, |
3575 | | * and then ucnv_safeClone() of the sub-converter may additionally align |
3576 | | * currentConverter inside the cloneStruct, for which we need the deadSpace |
3577 | | * after currentConverter. |
3578 | | * This is because UAlignedMemory may be larger than the actually |
3579 | | * necessary alignment size for the platform. |
3580 | | * The other cloneStruct fields will not be moved around, |
3581 | | * and are aligned properly with cloneStruct's alignment. |
3582 | | */ |
3583 | | struct cloneStruct /* layout that _ISO_2022_SafeClone() imposes on the caller-provided clone buffer */
3584 | | {
3585 | | UConverter cnv; /* the main converter; already copied by ucnv_safeClone() before _ISO_2022_SafeClone() runs */
3586 | | UConverter currentConverter; /* storage handed to ucnv_safeClone() for the clone of the sub-converter */
3587 | | UAlignedMemory deadSpace; /* padding that lets the sub-converter clone be re-aligned inside this struct */
3588 | | UConverterDataISO2022 mydata; /* copy of the ISO 2022 extra data; the clone's extraInfo is pointed at this member */
3589 | | };
3590 | | |
3591 | | |
3592 | | U_CDECL_BEGIN |
3593 | | /* Clones the ISO 2022 specific parts of cnv into stackBuffer (laid out as struct cloneStruct); when *pBufferSize==0 it only stores the required size and returns NULL. Also returns NULL if cloning the sub-converter fails. */
3594 | | static UConverter * U_CALLCONV
3595 | | _ISO_2022_SafeClone(
3596 | | const UConverter *cnv,
3597 | | void *stackBuffer,
3598 | | int32_t *pBufferSize,
3599 | | UErrorCode *status)
3600 | 0 | {
3601 | 0 | struct cloneStruct * localClone;
3602 | 0 | UConverterDataISO2022 *cnvData;
3603 | 0 | int32_t i, size;
3604 | 0 |

3605 | 0 | if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3606 | 0 | *pBufferSize = (int32_t)sizeof(struct cloneStruct);
3607 | 0 | return NULL;
3608 | 0 | }
3609 | |
3610 | 0 | cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3611 | 0 | localClone = (struct cloneStruct *)stackBuffer;
3612 | |
3613 | | /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3614 | 0 |

3615 | 0 | uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3616 | 0 | localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3617 | 0 | localClone->cnv.isExtraLocal = TRUE;
3618 | |
3619 | | /* share the subconverters */
3620 | 0 |

3621 | 0 | if(cnvData->currentConverter != NULL) {
3622 | 0 | size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3623 | 0 | localClone->mydata.currentConverter =
3624 | 0 | ucnv_safeClone(cnvData->currentConverter,
3625 | 0 | &localClone->currentConverter,
3626 | 0 | &size, status);
3627 | 0 | if(U_FAILURE(*status)) {
3628 | 0 | return NULL;
3629 | 0 | }
3630 | 0 | }
3631 | |
3632 | 0 | for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) { /* the memcpy above copied the myConverterArray[] pointers, so add one reference per non-NULL entry */
3633 | 0 | if(cnvData->myConverterArray[i] != NULL) {
3634 | 0 | ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3635 | 0 | }
3636 | 0 | }
3637 | 0 |

3638 | 0 | return &localClone->cnv;
3639 | 0 | }
3640 | | |
3641 | | U_CDECL_END |
3642 | | /* Adds to sa the code points this ISO 2022 variant handles: hardcoded ranges chosen by locale, then the (filtered) sets of every loaded sub-converter, and finally removes SO/SI/ESC and the C1 controls. Does nothing if *pErrorCode is already a failure. */
3643 | | static void U_CALLCONV
3644 | | _ISO_2022_GetUnicodeSet(const UConverter *cnv,
3645 | | const USetAdder *sa,
3646 | | UConverterUnicodeSet which,
3647 | | UErrorCode *pErrorCode)
3648 | 0 | {
3649 | 0 | int32_t i;
3650 | 0 | UConverterDataISO2022* cnvData;
3651 | 0 |

3652 | 0 | if (U_FAILURE(*pErrorCode)) {
3653 | 0 | return;
3654 | 0 | }
3655 | | #ifdef U_ENABLE_GENERIC_ISO_2022
3656 | | if (cnv->sharedData == &_ISO2022Data) {
3657 | | /* We use UTF-8 in this case */
3658 | | sa->addRange(sa->set, 0, 0xd7FF);
3659 | | sa->addRange(sa->set, 0xE000, 0x10FFFF);
3660 | | return;
3661 | | }
3662 | | #endif
3663 | |
3664 | 0 | cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3665 | |
3666 | | /* open a set and initialize it with code points that are algorithmically round-tripped */
3667 | 0 | switch(cnvData->locale[0]){
3668 | 0 | case 'j':
3669 | | /* include JIS X 0201 which is hardcoded */
3670 | 0 | sa->add(sa->set, 0xa5);
3671 | 0 | sa->add(sa->set, 0x203e);
3672 | 0 | if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3673 | | /* include Latin-1 for some variants of JP */
3674 | 0 | sa->addRange(sa->set, 0, 0xff);
3675 | 0 | } else {
3676 | | /* include ASCII for JP */
3677 | 0 | sa->addRange(sa->set, 0, 0x7f);
3678 | 0 | }
3679 | 0 | if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
3680 | | /*
3681 | | * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
3682 | | * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
3683 | | * use half-width Katakana.
3684 | | * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
3685 | | * half-width Katakana via the ESC ( I sequence.
3686 | | * However, we only emit (fromUnicode) half-width Katakana according to the
3687 | | * definition of each variant.
3688 | | *
3689 | | * When including fallbacks,
3690 | | * we need to include half-width Katakana Unicode code points for all JP variants because
3691 | | * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
3692 | | */
3693 | | /* include half-width Katakana for JP */
3694 | 0 | sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3695 | 0 | }
3696 | 0 | break;
3697 | 0 | #if !UCONFIG_ONLY_HTML_CONVERSION
3698 | 0 | case 'c':
3699 | 0 | case 'z':
3700 | | /* include ASCII for CN */
3701 | 0 | sa->addRange(sa->set, 0, 0x7f);
3702 | 0 | break;
3703 | 0 | case 'k':
3704 | | /* there is only one converter for KR, and it is not in the myConverterArray[] */
3705 | 0 | cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3706 | 0 | cnvData->currentConverter, sa, which, pErrorCode);
3707 | | /* the loop over myConverterArray[] will simply not find another converter */
3708 | 0 | break;
3709 | 0 | #endif
3710 | 0 | default:
3711 | 0 | break;
3712 | 0 | }
3713 | |
3714 | | #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
3715 | | if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3716 | | cnvData->version==0 && i==CNS_11643
3717 | | ) {
3718 | | /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3719 | | ucnv_MBCSGetUnicodeSetForBytes(
3720 | | cnvData->myConverterArray[i],
3721 | | sa, UCNV_ROUNDTRIP_SET,
3722 | | 0, 0x81, 0x82,
3723 | | pErrorCode);
3724 | | }
3725 | | #endif
3726 | |
3727 | 0 | for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { /* add each loaded sub-converter's set, choosing a filter by locale and charset index */
3728 | 0 | UConverterSetFilter filter;
3729 | 0 | if(cnvData->myConverterArray[i]!=NULL) {
3730 | 0 | if(cnvData->locale[0]=='j' && i==JISX208) {
3731 | | /*
3732 | | * Only add code points that map to Shift-JIS codes
3733 | | * corresponding to JIS X 0208.
3734 | | */
3735 | 0 | filter=UCNV_SET_FILTER_SJIS;
3736 | 0 | #if !UCONFIG_ONLY_HTML_CONVERSION
3737 | 0 | } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3738 | 0 | cnvData->version==0 && i==CNS_11643) {
3739 | | /*
3740 | | * Version-specific for CN:
3741 | | * CN version 0 does not map CNS planes 3..7 although
3742 | | * they are all available in the CNS conversion table;
3743 | | * CN version 1 (-EXT) does map them all.
3744 | | * The two versions create different Unicode sets.
3745 | | */
3746 | 0 | filter=UCNV_SET_FILTER_2022_CN;
3747 | 0 | } else if(i==KSC5601) {
3748 | | /*
3749 | | * Some of the KSC 5601 tables (convrtrs.txt has this alias on multiple tables)
3750 | | * are broader than GR94.
3751 | | */
3752 | 0 | filter=UCNV_SET_FILTER_GR94DBCS;
3753 | 0 | #endif
3754 | 0 | } else {
3755 | 0 | filter=UCNV_SET_FILTER_NONE;
3756 | 0 | }
3757 | 0 | ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
3758 | 0 | }
3759 | 0 | }
3760 | |
3761 | | /*
3762 | | * ISO 2022 converters must not convert SO/SI/ESC despite what
3763 | | * sub-converters do by themselves.
3764 | | * Remove these characters from the set.
3765 | | */
3766 | 0 | sa->remove(sa->set, 0x0e);
3767 | 0 | sa->remove(sa->set, 0x0f);
3768 | 0 | sa->remove(sa->set, 0x1b);
3769 | |
3770 | | /* ISO 2022 converters do not convert C1 controls either */
3771 | 0 | sa->removeRange(sa->set, 0x80, 0x9f);
3772 | 0 | }
3773 | | |
3774 | | static const UConverterImpl _ISO2022Impl={ /* function table for the generic ISO_2022 converter; the four conversion entries are NULL unless U_ENABLE_GENERIC_ISO_2022 is defined */
3775 | | UCNV_ISO_2022,
3776 | |
3777 | | NULL,
3778 | | NULL,
3779 | |
3780 | | _ISO2022Open,
3781 | | _ISO2022Close,
3782 | | _ISO2022Reset,
3783 | |
3784 | | #ifdef U_ENABLE_GENERIC_ISO_2022
3785 | | T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3786 | | T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3787 | | ucnv_fromUnicode_UTF8,
3788 | | ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3789 | | #else
3790 | | NULL,
3791 | | NULL,
3792 | | NULL,
3793 | | NULL,
3794 | | #endif
3795 | | NULL,
3796 | |
3797 | | NULL,
3798 | | _ISO2022getName,
3799 | | _ISO_2022_WriteSub,
3800 | | _ISO_2022_SafeClone,
3801 | | _ISO_2022_GetUnicodeSet,
3802 | |
3803 | | NULL,
3804 | | NULL
3805 | | }; /* NOTE(review): entries are positional; their meaning comes from the UConverterImpl declaration, which is not in this file */
3806 | | static const UConverterStaticData _ISO2022StaticData={ /* static description of the generic "ISO_2022" converter; fields are positional */
3807 | | sizeof(UConverterStaticData),
3808 | | "ISO_2022",
3809 | | 2022,
3810 | | UCNV_IBM,
3811 | | UCNV_ISO_2022,
3812 | | 1,
3813 | | 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3814 | | { 0x1a, 0, 0, 0 }, /* presumably the default substitution bytes - confirm against the UConverterStaticData declaration */
3815 | | 1,
3816 | | FALSE,
3817 | | FALSE,
3818 | | 0,
3819 | | 0,
3820 | | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3821 | | };
3822 | | const UConverterSharedData _ISO2022Data= /* shared data for the generic converter: pairs _ISO2022StaticData with _ISO2022Impl */
3823 | | UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl);
3824 | | |
3825 | | /*************JP****************/ |
3826 | | static const UConverterImpl _ISO2022JPImpl={ /* function table for ISO-2022-JP: the toUnicode and the fromUnicode *_OFFSETS_LOGIC functions are each listed twice */
3827 | | UCNV_ISO_2022,
3828 | |
3829 | | NULL,
3830 | | NULL,
3831 | |
3832 | | _ISO2022Open,
3833 | | _ISO2022Close,
3834 | | _ISO2022Reset,
3835 | |
3836 | | UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3837 | | UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3838 | | UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3839 | | UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3840 | | NULL,
3841 | |
3842 | | NULL,
3843 | | _ISO2022getName,
3844 | | _ISO_2022_WriteSub,
3845 | | _ISO_2022_SafeClone,
3846 | | _ISO_2022_GetUnicodeSet,
3847 | |
3848 | | NULL,
3849 | | NULL
3850 | | }; /* NOTE(review): entries are positional; their meaning comes from the UConverterImpl declaration, which is not in this file */
3851 | | static const UConverterStaticData _ISO2022JPStaticData={ /* static description of the "ISO_2022_JP" converter; fields are positional */
3852 | | sizeof(UConverterStaticData),
3853 | | "ISO_2022_JP",
3854 | | 0,
3855 | | UCNV_IBM,
3856 | | UCNV_ISO_2022,
3857 | | 1,
3858 | | 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3859 | | { 0x1a, 0, 0, 0 }, /* presumably the default substitution bytes - confirm against the UConverterStaticData declaration */
3860 | | 1,
3861 | | FALSE,
3862 | | FALSE,
3863 | | 0,
3864 | | 0,
3865 | | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3866 | | };
3867 | | |
3868 | | namespace {
3869 | |
3870 | | const UConverterSharedData _ISO2022JPData= /* shared data for ISO-2022-JP: pairs _ISO2022JPStaticData with _ISO2022JPImpl */
3871 | | UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl);
3872 | |
3873 | | } // namespace
3874 | | |
3875 | | #if !UCONFIG_ONLY_HTML_CONVERSION |
3876 | | /************* KR ***************/ |
3877 | | static const UConverterImpl _ISO2022KRImpl={ /* function table for ISO-2022-KR: the toUnicode and the fromUnicode *_OFFSETS_LOGIC functions are each listed twice */
3878 | | UCNV_ISO_2022,
3879 | |
3880 | | NULL,
3881 | | NULL,
3882 | |
3883 | | _ISO2022Open,
3884 | | _ISO2022Close,
3885 | | _ISO2022Reset,
3886 | |
3887 | | UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3888 | | UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3889 | | UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3890 | | UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3891 | | NULL,
3892 | |
3893 | | NULL,
3894 | | _ISO2022getName,
3895 | | _ISO_2022_WriteSub,
3896 | | _ISO_2022_SafeClone,
3897 | | _ISO_2022_GetUnicodeSet,
3898 | |
3899 | | NULL,
3900 | | NULL
3901 | | }; /* NOTE(review): entries are positional; their meaning comes from the UConverterImpl declaration, which is not in this file */
3902 | | static const UConverterStaticData _ISO2022KRStaticData={ /* static description of the "ISO_2022_KR" converter; fields are positional */
3903 | | sizeof(UConverterStaticData),
3904 | | "ISO_2022_KR",
3905 | | 0,
3906 | | UCNV_IBM,
3907 | | UCNV_ISO_2022,
3908 | | 1,
3909 | | 8, /* max 8 bytes per UChar */
3910 | | { 0x1a, 0, 0, 0 }, /* presumably the default substitution bytes - confirm against the UConverterStaticData declaration */
3911 | | 1,
3912 | | FALSE,
3913 | | FALSE,
3914 | | 0,
3915 | | 0,
3916 | | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3917 | | };
3918 | | |
3919 | | namespace {
3920 | |
3921 | | const UConverterSharedData _ISO2022KRData= /* shared data for ISO-2022-KR: pairs _ISO2022KRStaticData with _ISO2022KRImpl */
3922 | | UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl);
3923 | |
3924 | | } // namespace
3925 | | |
3926 | | /*************** CN ***************/ |
3927 | | static const UConverterImpl _ISO2022CNImpl={ /* function table for ISO-2022-CN: the toUnicode and the fromUnicode *_OFFSETS_LOGIC functions are each listed twice */
3928 | |
3929 | | UCNV_ISO_2022,
3930 | |
3931 | | NULL,
3932 | | NULL,
3933 | |
3934 | | _ISO2022Open,
3935 | | _ISO2022Close,
3936 | | _ISO2022Reset,
3937 | |
3938 | | UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3939 | | UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3940 | | UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3941 | | UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3942 | | NULL,
3943 | |
3944 | | NULL,
3945 | | _ISO2022getName,
3946 | | _ISO_2022_WriteSub,
3947 | | _ISO_2022_SafeClone,
3948 | | _ISO_2022_GetUnicodeSet,
3949 | |
3950 | | NULL,
3951 | | NULL
3952 | | }; /* NOTE(review): entries are positional; their meaning comes from the UConverterImpl declaration, which is not in this file */
3953 | | static const UConverterStaticData _ISO2022CNStaticData={ /* static description of the "ISO_2022_CN" converter; fields are positional */
3954 | | sizeof(UConverterStaticData),
3955 | | "ISO_2022_CN",
3956 | | 0,
3957 | | UCNV_IBM,
3958 | | UCNV_ISO_2022,
3959 | | 1,
3960 | | 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3961 | | { 0x1a, 0, 0, 0 }, /* presumably the default substitution bytes - confirm against the UConverterStaticData declaration */
3962 | | 1,
3963 | | FALSE,
3964 | | FALSE,
3965 | | 0,
3966 | | 0,
3967 | | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3968 | | };
3969 | | |
3970 | | namespace {
3971 | |
3972 | | const UConverterSharedData _ISO2022CNData= /* shared data for ISO-2022-CN: pairs _ISO2022CNStaticData with _ISO2022CNImpl */
3973 | | UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl);
3974 | |
3975 | | } // namespace
3976 | | #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */ |
3977 | | |
3978 | | #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ |