/src/icu/source/common/ucnv_u32.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ********************************************************************** |
5 | | * Copyright (C) 2002-2015, International Business Machines |
6 | | * Corporation and others. All Rights Reserved. |
7 | | ********************************************************************** |
8 | | * file name: ucnv_u32.c |
9 | | * encoding: UTF-8 |
10 | | * tab size: 8 (not used) |
11 | | * indentation:4 |
12 | | * |
13 | | * created on: 2002jul01 |
14 | | * created by: Markus W. Scherer |
15 | | * |
16 | | * UTF-32 converter implementation. Used to be in ucnv_utf.c. |
17 | | */ |
18 | | |
19 | | #include "unicode/utypes.h" |
20 | | |
21 | | #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION |
22 | | |
23 | | #include "unicode/ucnv.h" |
24 | | #include "unicode/utf.h" |
25 | | #include "ucnv_bld.h" |
26 | | #include "ucnv_cnv.h" |
27 | | #include "cmemory.h" |
28 | | |
/* Largest code point that fits in a single 16-bit code unit. */
#define MAXIMUM_UCS2            0xFFFF
/* Largest code point accepted by the UTF-32 converters in this file. */
#define MAXIMUM_UTF             0x10FFFF

/* Constants used when combining or splitting UTF-16 surrogate pairs. */
#define HALF_SHIFT              10
#define HALF_BASE               0x10000
#define HALF_MASK               0x3FF
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_LOW_START     0xDC00

/*
 * Added to ((lead - SURROGATE_HIGH_START) << HALF_SHIFT) + trail to obtain
 * the supplementary code point; equals 9216.
 */
#define SURROGATE_LOW_BASE      (HALF_BASE - SURROGATE_LOW_START)

/* Value of fromUnicodeStatus that makes the fromUnicode functions emit a BOM first. */
enum {
    UCNV_NEED_TO_WRITE_BOM = 1
};
43 | | |
44 | | /* UTF-32BE ----------------------------------------------------------------- */ |
45 | | U_CDECL_BEGIN |
46 | | static void U_CALLCONV |
47 | | T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args, |
48 | | UErrorCode * err) |
49 | 0 | { |
50 | 0 | const unsigned char *mySource = (unsigned char *) args->source; |
51 | 0 | UChar *myTarget = args->target; |
52 | 0 | const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; |
53 | 0 | const UChar *targetLimit = args->targetLimit; |
54 | 0 | unsigned char *toUBytes = args->converter->toUBytes; |
55 | 0 | uint32_t ch, i; |
56 | | |
57 | | /* Restore state of current sequence */ |
58 | 0 | if (args->converter->toULength > 0 && myTarget < targetLimit) { |
59 | 0 | i = args->converter->toULength; /* restore # of bytes consumed */ |
60 | 0 | args->converter->toULength = 0; |
61 | |
|
62 | 0 | ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/ |
63 | 0 | args->converter->toUnicodeStatus = 0; |
64 | 0 | goto morebytes; |
65 | 0 | } |
66 | | |
67 | 0 | while (mySource < sourceLimit && myTarget < targetLimit) { |
68 | 0 | i = 0; |
69 | 0 | ch = 0; |
70 | 0 | morebytes: |
71 | 0 | while (i < sizeof(uint32_t)) { |
72 | 0 | if (mySource < sourceLimit) { |
73 | 0 | ch = (ch << 8) | (uint8_t)(*mySource); |
74 | 0 | toUBytes[i++] = (char) *(mySource++); |
75 | 0 | } |
76 | 0 | else { |
77 | | /* stores a partially calculated target*/ |
78 | | /* + 1 to make 0 a valid character */ |
79 | 0 | args->converter->toUnicodeStatus = ch + 1; |
80 | 0 | args->converter->toULength = (int8_t) i; |
81 | 0 | goto donefornow; |
82 | 0 | } |
83 | 0 | } |
84 | | |
85 | 0 | if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { |
86 | | /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ |
87 | 0 | if (ch <= MAXIMUM_UCS2) |
88 | 0 | { |
89 | | /* fits in 16 bits */ |
90 | 0 | *(myTarget++) = (UChar) ch; |
91 | 0 | } |
92 | 0 | else { |
93 | | /* write out the surrogates */ |
94 | 0 | *(myTarget++) = U16_LEAD(ch); |
95 | 0 | ch = U16_TRAIL(ch); |
96 | 0 | if (myTarget < targetLimit) { |
97 | 0 | *(myTarget++) = (UChar)ch; |
98 | 0 | } |
99 | 0 | else { |
100 | | /* Put in overflow buffer (not handled here) */ |
101 | 0 | args->converter->UCharErrorBuffer[0] = (UChar) ch; |
102 | 0 | args->converter->UCharErrorBufferLength = 1; |
103 | 0 | *err = U_BUFFER_OVERFLOW_ERROR; |
104 | 0 | break; |
105 | 0 | } |
106 | 0 | } |
107 | 0 | } |
108 | 0 | else { |
109 | 0 | args->converter->toULength = (int8_t)i; |
110 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
111 | 0 | break; |
112 | 0 | } |
113 | 0 | } |
114 | | |
115 | 0 | donefornow: |
116 | 0 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { |
117 | | /* End of target buffer */ |
118 | 0 | *err = U_BUFFER_OVERFLOW_ERROR; |
119 | 0 | } |
120 | |
|
121 | 0 | args->target = myTarget; |
122 | 0 | args->source = (const char *) mySource; |
123 | 0 | } |
124 | | |
125 | | static void U_CALLCONV |
126 | | T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args, |
127 | | UErrorCode * err) |
128 | 0 | { |
129 | 0 | const unsigned char *mySource = (unsigned char *) args->source; |
130 | 0 | UChar *myTarget = args->target; |
131 | 0 | int32_t *myOffsets = args->offsets; |
132 | 0 | const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; |
133 | 0 | const UChar *targetLimit = args->targetLimit; |
134 | 0 | unsigned char *toUBytes = args->converter->toUBytes; |
135 | 0 | uint32_t ch, i; |
136 | 0 | int32_t offsetNum = 0; |
137 | | |
138 | | /* Restore state of current sequence */ |
139 | 0 | if (args->converter->toULength > 0 && myTarget < targetLimit) { |
140 | 0 | i = args->converter->toULength; /* restore # of bytes consumed */ |
141 | 0 | args->converter->toULength = 0; |
142 | |
|
143 | 0 | ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/ |
144 | 0 | args->converter->toUnicodeStatus = 0; |
145 | 0 | goto morebytes; |
146 | 0 | } |
147 | | |
148 | 0 | while (mySource < sourceLimit && myTarget < targetLimit) { |
149 | 0 | i = 0; |
150 | 0 | ch = 0; |
151 | 0 | morebytes: |
152 | 0 | while (i < sizeof(uint32_t)) { |
153 | 0 | if (mySource < sourceLimit) { |
154 | 0 | ch = (ch << 8) | (uint8_t)(*mySource); |
155 | 0 | toUBytes[i++] = (char) *(mySource++); |
156 | 0 | } |
157 | 0 | else { |
158 | | /* stores a partially calculated target*/ |
159 | | /* + 1 to make 0 a valid character */ |
160 | 0 | args->converter->toUnicodeStatus = ch + 1; |
161 | 0 | args->converter->toULength = (int8_t) i; |
162 | 0 | goto donefornow; |
163 | 0 | } |
164 | 0 | } |
165 | | |
166 | 0 | if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { |
167 | | /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ |
168 | 0 | if (ch <= MAXIMUM_UCS2) { |
169 | | /* fits in 16 bits */ |
170 | 0 | *(myTarget++) = (UChar) ch; |
171 | 0 | *(myOffsets++) = offsetNum; |
172 | 0 | } |
173 | 0 | else { |
174 | | /* write out the surrogates */ |
175 | 0 | *(myTarget++) = U16_LEAD(ch); |
176 | 0 | *myOffsets++ = offsetNum; |
177 | 0 | ch = U16_TRAIL(ch); |
178 | 0 | if (myTarget < targetLimit) |
179 | 0 | { |
180 | 0 | *(myTarget++) = (UChar)ch; |
181 | 0 | *(myOffsets++) = offsetNum; |
182 | 0 | } |
183 | 0 | else { |
184 | | /* Put in overflow buffer (not handled here) */ |
185 | 0 | args->converter->UCharErrorBuffer[0] = (UChar) ch; |
186 | 0 | args->converter->UCharErrorBufferLength = 1; |
187 | 0 | *err = U_BUFFER_OVERFLOW_ERROR; |
188 | 0 | break; |
189 | 0 | } |
190 | 0 | } |
191 | 0 | } |
192 | 0 | else { |
193 | 0 | args->converter->toULength = (int8_t)i; |
194 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
195 | 0 | break; |
196 | 0 | } |
197 | 0 | offsetNum += i; |
198 | 0 | } |
199 | | |
200 | 0 | donefornow: |
201 | 0 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
202 | 0 | { |
203 | | /* End of target buffer */ |
204 | 0 | *err = U_BUFFER_OVERFLOW_ERROR; |
205 | 0 | } |
206 | |
|
207 | 0 | args->target = myTarget; |
208 | 0 | args->source = (const char *) mySource; |
209 | 0 | args->offsets = myOffsets; |
210 | 0 | } |
211 | | |
212 | | static void U_CALLCONV |
213 | | T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args, |
214 | | UErrorCode * err) |
215 | 0 | { |
216 | 0 | const UChar *mySource = args->source; |
217 | 0 | unsigned char *myTarget; |
218 | 0 | const UChar *sourceLimit = args->sourceLimit; |
219 | 0 | const unsigned char *targetLimit = (unsigned char *) args->targetLimit; |
220 | 0 | UChar32 ch, ch2; |
221 | 0 | unsigned int indexToWrite; |
222 | 0 | unsigned char temp[sizeof(uint32_t)]; |
223 | |
|
224 | 0 | if(mySource >= sourceLimit) { |
225 | | /* no input, nothing to do */ |
226 | 0 | return; |
227 | 0 | } |
228 | | |
229 | | /* write the BOM if necessary */ |
230 | 0 | if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { |
231 | 0 | static const char bom[]={ 0, 0, (char)0xfeu, (char)0xffu }; |
232 | 0 | ucnv_fromUWriteBytes(args->converter, |
233 | 0 | bom, 4, |
234 | 0 | &args->target, args->targetLimit, |
235 | 0 | &args->offsets, -1, |
236 | 0 | err); |
237 | 0 | args->converter->fromUnicodeStatus=0; |
238 | 0 | } |
239 | |
|
240 | 0 | myTarget = (unsigned char *) args->target; |
241 | 0 | temp[0] = 0; |
242 | |
|
243 | 0 | if (args->converter->fromUChar32) { |
244 | 0 | ch = args->converter->fromUChar32; |
245 | 0 | args->converter->fromUChar32 = 0; |
246 | 0 | goto lowsurogate; |
247 | 0 | } |
248 | | |
249 | 0 | while (mySource < sourceLimit && myTarget < targetLimit) { |
250 | 0 | ch = *(mySource++); |
251 | |
|
252 | 0 | if (U_IS_SURROGATE(ch)) { |
253 | 0 | if (U_IS_LEAD(ch)) { |
254 | 0 | lowsurogate: |
255 | 0 | if (mySource < sourceLimit) { |
256 | 0 | ch2 = *mySource; |
257 | 0 | if (U_IS_TRAIL(ch2)) { |
258 | 0 | ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; |
259 | 0 | mySource++; |
260 | 0 | } |
261 | 0 | else { |
262 | | /* this is an unmatched trail code unit (2nd surrogate) */ |
263 | | /* callback(illegal) */ |
264 | 0 | args->converter->fromUChar32 = ch; |
265 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
266 | 0 | break; |
267 | 0 | } |
268 | 0 | } |
269 | 0 | else { |
270 | | /* ran out of source */ |
271 | 0 | args->converter->fromUChar32 = ch; |
272 | 0 | if (args->flush) { |
273 | | /* this is an unmatched trail code unit (2nd surrogate) */ |
274 | | /* callback(illegal) */ |
275 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
276 | 0 | } |
277 | 0 | break; |
278 | 0 | } |
279 | 0 | } |
280 | 0 | else { |
281 | | /* this is an unmatched trail code unit (2nd surrogate) */ |
282 | | /* callback(illegal) */ |
283 | 0 | args->converter->fromUChar32 = ch; |
284 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
285 | 0 | break; |
286 | 0 | } |
287 | 0 | } |
288 | | |
289 | | /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ |
290 | 0 | temp[1] = (uint8_t) (ch >> 16 & 0x1F); |
291 | 0 | temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ |
292 | 0 | temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ |
293 | |
|
294 | 0 | for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) { |
295 | 0 | if (myTarget < targetLimit) { |
296 | 0 | *(myTarget++) = temp[indexToWrite]; |
297 | 0 | } |
298 | 0 | else { |
299 | 0 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; |
300 | 0 | *err = U_BUFFER_OVERFLOW_ERROR; |
301 | 0 | } |
302 | 0 | } |
303 | 0 | } |
304 | | |
305 | 0 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { |
306 | 0 | *err = U_BUFFER_OVERFLOW_ERROR; |
307 | 0 | } |
308 | |
|
309 | 0 | args->target = (char *) myTarget; |
310 | 0 | args->source = mySource; |
311 | 0 | } |
312 | | |
313 | | static void U_CALLCONV |
314 | | T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args, |
315 | | UErrorCode * err) |
316 | 0 | { |
317 | 0 | const UChar *mySource = args->source; |
318 | 0 | unsigned char *myTarget; |
319 | 0 | int32_t *myOffsets; |
320 | 0 | const UChar *sourceLimit = args->sourceLimit; |
321 | 0 | const unsigned char *targetLimit = (unsigned char *) args->targetLimit; |
322 | 0 | UChar32 ch, ch2; |
323 | 0 | int32_t offsetNum = 0; |
324 | 0 | unsigned int indexToWrite; |
325 | 0 | unsigned char temp[sizeof(uint32_t)]; |
326 | |
|
327 | 0 | if(mySource >= sourceLimit) { |
328 | | /* no input, nothing to do */ |
329 | 0 | return; |
330 | 0 | } |
331 | | |
332 | | /* write the BOM if necessary */ |
333 | 0 | if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { |
334 | 0 | static const char bom[]={ 0, 0, (char)0xfeu, (char)0xffu }; |
335 | 0 | ucnv_fromUWriteBytes(args->converter, |
336 | 0 | bom, 4, |
337 | 0 | &args->target, args->targetLimit, |
338 | 0 | &args->offsets, -1, |
339 | 0 | err); |
340 | 0 | args->converter->fromUnicodeStatus=0; |
341 | 0 | } |
342 | |
|
343 | 0 | myTarget = (unsigned char *) args->target; |
344 | 0 | myOffsets = args->offsets; |
345 | 0 | temp[0] = 0; |
346 | |
|
347 | 0 | if (args->converter->fromUChar32) { |
348 | 0 | ch = args->converter->fromUChar32; |
349 | 0 | args->converter->fromUChar32 = 0; |
350 | 0 | goto lowsurogate; |
351 | 0 | } |
352 | | |
353 | 0 | while (mySource < sourceLimit && myTarget < targetLimit) { |
354 | 0 | ch = *(mySource++); |
355 | |
|
356 | 0 | if (U_IS_SURROGATE(ch)) { |
357 | 0 | if (U_IS_LEAD(ch)) { |
358 | 0 | lowsurogate: |
359 | 0 | if (mySource < sourceLimit) { |
360 | 0 | ch2 = *mySource; |
361 | 0 | if (U_IS_TRAIL(ch2)) { |
362 | 0 | ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; |
363 | 0 | mySource++; |
364 | 0 | } |
365 | 0 | else { |
366 | | /* this is an unmatched trail code unit (2nd surrogate) */ |
367 | | /* callback(illegal) */ |
368 | 0 | args->converter->fromUChar32 = ch; |
369 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
370 | 0 | break; |
371 | 0 | } |
372 | 0 | } |
373 | 0 | else { |
374 | | /* ran out of source */ |
375 | 0 | args->converter->fromUChar32 = ch; |
376 | 0 | if (args->flush) { |
377 | | /* this is an unmatched trail code unit (2nd surrogate) */ |
378 | | /* callback(illegal) */ |
379 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
380 | 0 | } |
381 | 0 | break; |
382 | 0 | } |
383 | 0 | } |
384 | 0 | else { |
385 | | /* this is an unmatched trail code unit (2nd surrogate) */ |
386 | | /* callback(illegal) */ |
387 | 0 | args->converter->fromUChar32 = ch; |
388 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
389 | 0 | break; |
390 | 0 | } |
391 | 0 | } |
392 | | |
393 | | /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ |
394 | 0 | temp[1] = (uint8_t) (ch >> 16 & 0x1F); |
395 | 0 | temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ |
396 | 0 | temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ |
397 | |
|
398 | 0 | for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) { |
399 | 0 | if (myTarget < targetLimit) { |
400 | 0 | *(myTarget++) = temp[indexToWrite]; |
401 | 0 | *(myOffsets++) = offsetNum; |
402 | 0 | } |
403 | 0 | else { |
404 | 0 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; |
405 | 0 | *err = U_BUFFER_OVERFLOW_ERROR; |
406 | 0 | } |
407 | 0 | } |
408 | 0 | offsetNum = offsetNum + 1 + (temp[1] != 0); |
409 | 0 | } |
410 | | |
411 | 0 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { |
412 | 0 | *err = U_BUFFER_OVERFLOW_ERROR; |
413 | 0 | } |
414 | |
|
415 | 0 | args->target = (char *) myTarget; |
416 | 0 | args->source = mySource; |
417 | 0 | args->offsets = myOffsets; |
418 | 0 | } |
419 | | |
420 | | static UChar32 U_CALLCONV |
421 | | T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args, |
422 | | UErrorCode* err) |
423 | 0 | { |
424 | 0 | const uint8_t *mySource; |
425 | 0 | UChar32 myUChar; |
426 | 0 | int32_t length; |
427 | |
|
428 | 0 | mySource = (const uint8_t *)args->source; |
429 | 0 | if (mySource >= (const uint8_t *)args->sourceLimit) |
430 | 0 | { |
431 | | /* no input */ |
432 | 0 | *err = U_INDEX_OUTOFBOUNDS_ERROR; |
433 | 0 | return 0xffff; |
434 | 0 | } |
435 | | |
436 | 0 | length = (int32_t)((const uint8_t *)args->sourceLimit - mySource); |
437 | 0 | if (length < 4) |
438 | 0 | { |
439 | | /* got a partial character */ |
440 | 0 | uprv_memcpy(args->converter->toUBytes, mySource, length); |
441 | 0 | args->converter->toULength = (int8_t)length; |
442 | 0 | args->source = (const char *)(mySource + length); |
443 | 0 | *err = U_TRUNCATED_CHAR_FOUND; |
444 | 0 | return 0xffff; |
445 | 0 | } |
446 | | |
447 | | /* Don't even try to do a direct cast because the value may be on an odd address. */ |
448 | 0 | myUChar = ((UChar32)mySource[0] << 24) |
449 | 0 | | ((UChar32)mySource[1] << 16) |
450 | 0 | | ((UChar32)mySource[2] << 8) |
451 | 0 | | ((UChar32)mySource[3]); |
452 | |
|
453 | 0 | args->source = (const char *)(mySource + 4); |
454 | 0 | if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) { |
455 | 0 | return myUChar; |
456 | 0 | } |
457 | | |
458 | 0 | uprv_memcpy(args->converter->toUBytes, mySource, 4); |
459 | 0 | args->converter->toULength = 4; |
460 | |
|
461 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
462 | 0 | return 0xffff; |
463 | 0 | } |
464 | | U_CDECL_END |
465 | | static const UConverterImpl _UTF32BEImpl = { |
466 | | UCNV_UTF32_BigEndian, |
467 | | |
468 | | NULL, |
469 | | NULL, |
470 | | |
471 | | NULL, |
472 | | NULL, |
473 | | NULL, |
474 | | |
475 | | T_UConverter_toUnicode_UTF32_BE, |
476 | | T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC, |
477 | | T_UConverter_fromUnicode_UTF32_BE, |
478 | | T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC, |
479 | | T_UConverter_getNextUChar_UTF32_BE, |
480 | | |
481 | | NULL, |
482 | | NULL, |
483 | | NULL, |
484 | | NULL, |
485 | | ucnv_getNonSurrogateUnicodeSet, |
486 | | |
487 | | NULL, |
488 | | NULL |
489 | | }; |
490 | | |
491 | | /* The 1232 CCSID refers to any version of Unicode with any endianness of UTF-32 */ |
492 | | static const UConverterStaticData _UTF32BEStaticData = { |
493 | | sizeof(UConverterStaticData), |
494 | | "UTF-32BE", |
495 | | 1232, |
496 | | UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4, |
497 | | { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE, |
498 | | 0, |
499 | | 0, |
500 | | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
501 | | }; |
502 | | |
503 | | const UConverterSharedData _UTF32BEData = |
504 | | UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32BEStaticData, &_UTF32BEImpl); |
505 | | |
506 | | /* UTF-32LE ---------------------------------------------------------- */ |
507 | | U_CDECL_BEGIN |
508 | | static void U_CALLCONV |
509 | | T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args, |
510 | | UErrorCode * err) |
511 | 0 | { |
512 | 0 | const unsigned char *mySource = (unsigned char *) args->source; |
513 | 0 | UChar *myTarget = args->target; |
514 | 0 | const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; |
515 | 0 | const UChar *targetLimit = args->targetLimit; |
516 | 0 | unsigned char *toUBytes = args->converter->toUBytes; |
517 | 0 | uint32_t ch, i; |
518 | | |
519 | | /* Restore state of current sequence */ |
520 | 0 | if (args->converter->toULength > 0 && myTarget < targetLimit) |
521 | 0 | { |
522 | 0 | i = args->converter->toULength; /* restore # of bytes consumed */ |
523 | 0 | args->converter->toULength = 0; |
524 | | |
525 | | /* Stores the previously calculated ch from a previous call*/ |
526 | 0 | ch = args->converter->toUnicodeStatus - 1; |
527 | 0 | args->converter->toUnicodeStatus = 0; |
528 | 0 | goto morebytes; |
529 | 0 | } |
530 | | |
531 | 0 | while (mySource < sourceLimit && myTarget < targetLimit) |
532 | 0 | { |
533 | 0 | i = 0; |
534 | 0 | ch = 0; |
535 | 0 | morebytes: |
536 | 0 | while (i < sizeof(uint32_t)) |
537 | 0 | { |
538 | 0 | if (mySource < sourceLimit) |
539 | 0 | { |
540 | 0 | ch |= ((uint8_t)(*mySource)) << (i * 8); |
541 | 0 | toUBytes[i++] = (char) *(mySource++); |
542 | 0 | } |
543 | 0 | else |
544 | 0 | { |
545 | | /* stores a partially calculated target*/ |
546 | | /* + 1 to make 0 a valid character */ |
547 | 0 | args->converter->toUnicodeStatus = ch + 1; |
548 | 0 | args->converter->toULength = (int8_t) i; |
549 | 0 | goto donefornow; |
550 | 0 | } |
551 | 0 | } |
552 | | |
553 | 0 | if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { |
554 | | /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ |
555 | 0 | if (ch <= MAXIMUM_UCS2) { |
556 | | /* fits in 16 bits */ |
557 | 0 | *(myTarget++) = (UChar) ch; |
558 | 0 | } |
559 | 0 | else { |
560 | | /* write out the surrogates */ |
561 | 0 | *(myTarget++) = U16_LEAD(ch); |
562 | 0 | ch = U16_TRAIL(ch); |
563 | 0 | if (myTarget < targetLimit) { |
564 | 0 | *(myTarget++) = (UChar)ch; |
565 | 0 | } |
566 | 0 | else { |
567 | | /* Put in overflow buffer (not handled here) */ |
568 | 0 | args->converter->UCharErrorBuffer[0] = (UChar) ch; |
569 | 0 | args->converter->UCharErrorBufferLength = 1; |
570 | 0 | *err = U_BUFFER_OVERFLOW_ERROR; |
571 | 0 | break; |
572 | 0 | } |
573 | 0 | } |
574 | 0 | } |
575 | 0 | else { |
576 | 0 | args->converter->toULength = (int8_t)i; |
577 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
578 | 0 | break; |
579 | 0 | } |
580 | 0 | } |
581 | | |
582 | 0 | donefornow: |
583 | 0 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
584 | 0 | { |
585 | | /* End of target buffer */ |
586 | 0 | *err = U_BUFFER_OVERFLOW_ERROR; |
587 | 0 | } |
588 | |
|
589 | 0 | args->target = myTarget; |
590 | 0 | args->source = (const char *) mySource; |
591 | 0 | } |
592 | | |
593 | | static void U_CALLCONV |
594 | | T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args, |
595 | | UErrorCode * err) |
596 | 0 | { |
597 | 0 | const unsigned char *mySource = (unsigned char *) args->source; |
598 | 0 | UChar *myTarget = args->target; |
599 | 0 | int32_t *myOffsets = args->offsets; |
600 | 0 | const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; |
601 | 0 | const UChar *targetLimit = args->targetLimit; |
602 | 0 | unsigned char *toUBytes = args->converter->toUBytes; |
603 | 0 | uint32_t ch, i; |
604 | 0 | int32_t offsetNum = 0; |
605 | | |
606 | | /* Restore state of current sequence */ |
607 | 0 | if (args->converter->toULength > 0 && myTarget < targetLimit) |
608 | 0 | { |
609 | 0 | i = args->converter->toULength; /* restore # of bytes consumed */ |
610 | 0 | args->converter->toULength = 0; |
611 | | |
612 | | /* Stores the previously calculated ch from a previous call*/ |
613 | 0 | ch = args->converter->toUnicodeStatus - 1; |
614 | 0 | args->converter->toUnicodeStatus = 0; |
615 | 0 | goto morebytes; |
616 | 0 | } |
617 | | |
618 | 0 | while (mySource < sourceLimit && myTarget < targetLimit) |
619 | 0 | { |
620 | 0 | i = 0; |
621 | 0 | ch = 0; |
622 | 0 | morebytes: |
623 | 0 | while (i < sizeof(uint32_t)) |
624 | 0 | { |
625 | 0 | if (mySource < sourceLimit) |
626 | 0 | { |
627 | 0 | ch |= ((uint8_t)(*mySource)) << (i * 8); |
628 | 0 | toUBytes[i++] = (char) *(mySource++); |
629 | 0 | } |
630 | 0 | else |
631 | 0 | { |
632 | | /* stores a partially calculated target*/ |
633 | | /* + 1 to make 0 a valid character */ |
634 | 0 | args->converter->toUnicodeStatus = ch + 1; |
635 | 0 | args->converter->toULength = (int8_t) i; |
636 | 0 | goto donefornow; |
637 | 0 | } |
638 | 0 | } |
639 | | |
640 | 0 | if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) |
641 | 0 | { |
642 | | /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ |
643 | 0 | if (ch <= MAXIMUM_UCS2) |
644 | 0 | { |
645 | | /* fits in 16 bits */ |
646 | 0 | *(myTarget++) = (UChar) ch; |
647 | 0 | *(myOffsets++) = offsetNum; |
648 | 0 | } |
649 | 0 | else { |
650 | | /* write out the surrogates */ |
651 | 0 | *(myTarget++) = U16_LEAD(ch); |
652 | 0 | *(myOffsets++) = offsetNum; |
653 | 0 | ch = U16_TRAIL(ch); |
654 | 0 | if (myTarget < targetLimit) |
655 | 0 | { |
656 | 0 | *(myTarget++) = (UChar)ch; |
657 | 0 | *(myOffsets++) = offsetNum; |
658 | 0 | } |
659 | 0 | else |
660 | 0 | { |
661 | | /* Put in overflow buffer (not handled here) */ |
662 | 0 | args->converter->UCharErrorBuffer[0] = (UChar) ch; |
663 | 0 | args->converter->UCharErrorBufferLength = 1; |
664 | 0 | *err = U_BUFFER_OVERFLOW_ERROR; |
665 | 0 | break; |
666 | 0 | } |
667 | 0 | } |
668 | 0 | } |
669 | 0 | else |
670 | 0 | { |
671 | 0 | args->converter->toULength = (int8_t)i; |
672 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
673 | 0 | break; |
674 | 0 | } |
675 | 0 | offsetNum += i; |
676 | 0 | } |
677 | | |
678 | 0 | donefornow: |
679 | 0 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
680 | 0 | { |
681 | | /* End of target buffer */ |
682 | 0 | *err = U_BUFFER_OVERFLOW_ERROR; |
683 | 0 | } |
684 | |
|
685 | 0 | args->target = myTarget; |
686 | 0 | args->source = (const char *) mySource; |
687 | 0 | args->offsets = myOffsets; |
688 | 0 | } |
689 | | |
690 | | static void U_CALLCONV |
691 | | T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args, |
692 | | UErrorCode * err) |
693 | 0 | { |
694 | 0 | const UChar *mySource = args->source; |
695 | 0 | unsigned char *myTarget; |
696 | 0 | const UChar *sourceLimit = args->sourceLimit; |
697 | 0 | const unsigned char *targetLimit = (unsigned char *) args->targetLimit; |
698 | 0 | UChar32 ch, ch2; |
699 | 0 | unsigned int indexToWrite; |
700 | 0 | unsigned char temp[sizeof(uint32_t)]; |
701 | |
|
702 | 0 | if(mySource >= sourceLimit) { |
703 | | /* no input, nothing to do */ |
704 | 0 | return; |
705 | 0 | } |
706 | | |
707 | | /* write the BOM if necessary */ |
708 | 0 | if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { |
709 | 0 | static const char bom[]={ (char)0xffu, (char)0xfeu, 0, 0 }; |
710 | 0 | ucnv_fromUWriteBytes(args->converter, |
711 | 0 | bom, 4, |
712 | 0 | &args->target, args->targetLimit, |
713 | 0 | &args->offsets, -1, |
714 | 0 | err); |
715 | 0 | args->converter->fromUnicodeStatus=0; |
716 | 0 | } |
717 | |
|
718 | 0 | myTarget = (unsigned char *) args->target; |
719 | 0 | temp[3] = 0; |
720 | |
|
721 | 0 | if (args->converter->fromUChar32) |
722 | 0 | { |
723 | 0 | ch = args->converter->fromUChar32; |
724 | 0 | args->converter->fromUChar32 = 0; |
725 | 0 | goto lowsurogate; |
726 | 0 | } |
727 | | |
728 | 0 | while (mySource < sourceLimit && myTarget < targetLimit) |
729 | 0 | { |
730 | 0 | ch = *(mySource++); |
731 | |
|
732 | 0 | if (U16_IS_SURROGATE(ch)) { |
733 | 0 | if (U16_IS_LEAD(ch)) |
734 | 0 | { |
735 | 0 | lowsurogate: |
736 | 0 | if (mySource < sourceLimit) |
737 | 0 | { |
738 | 0 | ch2 = *mySource; |
739 | 0 | if (U16_IS_TRAIL(ch2)) { |
740 | 0 | ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; |
741 | 0 | mySource++; |
742 | 0 | } |
743 | 0 | else { |
744 | | /* this is an unmatched trail code unit (2nd surrogate) */ |
745 | | /* callback(illegal) */ |
746 | 0 | args->converter->fromUChar32 = ch; |
747 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
748 | 0 | break; |
749 | 0 | } |
750 | 0 | } |
751 | 0 | else { |
752 | | /* ran out of source */ |
753 | 0 | args->converter->fromUChar32 = ch; |
754 | 0 | if (args->flush) { |
755 | | /* this is an unmatched trail code unit (2nd surrogate) */ |
756 | | /* callback(illegal) */ |
757 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
758 | 0 | } |
759 | 0 | break; |
760 | 0 | } |
761 | 0 | } |
762 | 0 | else { |
763 | | /* this is an unmatched trail code unit (2nd surrogate) */ |
764 | | /* callback(illegal) */ |
765 | 0 | args->converter->fromUChar32 = ch; |
766 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
767 | 0 | break; |
768 | 0 | } |
769 | 0 | } |
770 | | |
771 | | /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ |
772 | 0 | temp[2] = (uint8_t) (ch >> 16 & 0x1F); |
773 | 0 | temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ |
774 | 0 | temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ |
775 | |
|
776 | 0 | for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) |
777 | 0 | { |
778 | 0 | if (myTarget < targetLimit) |
779 | 0 | { |
780 | 0 | *(myTarget++) = temp[indexToWrite]; |
781 | 0 | } |
782 | 0 | else |
783 | 0 | { |
784 | 0 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; |
785 | 0 | *err = U_BUFFER_OVERFLOW_ERROR; |
786 | 0 | } |
787 | 0 | } |
788 | 0 | } |
789 | | |
790 | 0 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
791 | 0 | { |
792 | 0 | *err = U_BUFFER_OVERFLOW_ERROR; |
793 | 0 | } |
794 | |
|
795 | 0 | args->target = (char *) myTarget; |
796 | 0 | args->source = mySource; |
797 | 0 | } |
798 | | |
799 | | static void U_CALLCONV |
800 | | T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args, |
801 | | UErrorCode * err) |
802 | 0 | { |
803 | 0 | const UChar *mySource = args->source; |
804 | 0 | unsigned char *myTarget; |
805 | 0 | int32_t *myOffsets; |
806 | 0 | const UChar *sourceLimit = args->sourceLimit; |
807 | 0 | const unsigned char *targetLimit = (unsigned char *) args->targetLimit; |
808 | 0 | UChar32 ch, ch2; |
809 | 0 | unsigned int indexToWrite; |
810 | 0 | unsigned char temp[sizeof(uint32_t)]; |
811 | 0 | int32_t offsetNum = 0; |
812 | |
|
813 | 0 | if(mySource >= sourceLimit) { |
814 | | /* no input, nothing to do */ |
815 | 0 | return; |
816 | 0 | } |
817 | | |
818 | | /* write the BOM if necessary */ |
819 | 0 | if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { |
820 | 0 | static const char bom[]={ (char)0xffu, (char)0xfeu, 0, 0 }; |
821 | 0 | ucnv_fromUWriteBytes(args->converter, |
822 | 0 | bom, 4, |
823 | 0 | &args->target, args->targetLimit, |
824 | 0 | &args->offsets, -1, |
825 | 0 | err); |
826 | 0 | args->converter->fromUnicodeStatus=0; |
827 | 0 | } |
828 | |
|
829 | 0 | myTarget = (unsigned char *) args->target; |
830 | 0 | myOffsets = args->offsets; |
831 | 0 | temp[3] = 0; |
832 | |
|
833 | 0 | if (args->converter->fromUChar32) |
834 | 0 | { |
835 | 0 | ch = args->converter->fromUChar32; |
836 | 0 | args->converter->fromUChar32 = 0; |
837 | 0 | goto lowsurogate; |
838 | 0 | } |
839 | | |
840 | 0 | while (mySource < sourceLimit && myTarget < targetLimit) |
841 | 0 | { |
842 | 0 | ch = *(mySource++); |
843 | |
|
844 | 0 | if (U16_IS_SURROGATE(ch)) { |
845 | 0 | if (U16_IS_LEAD(ch)) |
846 | 0 | { |
847 | 0 | lowsurogate: |
848 | 0 | if (mySource < sourceLimit) |
849 | 0 | { |
850 | 0 | ch2 = *mySource; |
851 | 0 | if (U16_IS_TRAIL(ch2)) |
852 | 0 | { |
853 | 0 | ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; |
854 | 0 | mySource++; |
855 | 0 | } |
856 | 0 | else { |
857 | | /* this is an unmatched trail code unit (2nd surrogate) */ |
858 | | /* callback(illegal) */ |
859 | 0 | args->converter->fromUChar32 = ch; |
860 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
861 | 0 | break; |
862 | 0 | } |
863 | 0 | } |
864 | 0 | else { |
865 | | /* ran out of source */ |
866 | 0 | args->converter->fromUChar32 = ch; |
867 | 0 | if (args->flush) { |
868 | | /* this is an unmatched trail code unit (2nd surrogate) */ |
869 | | /* callback(illegal) */ |
870 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
871 | 0 | } |
872 | 0 | break; |
873 | 0 | } |
874 | 0 | } |
875 | 0 | else { |
876 | | /* this is an unmatched trail code unit (2nd surrogate) */ |
877 | | /* callback(illegal) */ |
878 | 0 | args->converter->fromUChar32 = ch; |
879 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
880 | 0 | break; |
881 | 0 | } |
882 | 0 | } |
883 | | |
884 | | /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ |
885 | 0 | temp[2] = (uint8_t) (ch >> 16 & 0x1F); |
886 | 0 | temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ |
887 | 0 | temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ |
888 | |
|
889 | 0 | for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) |
890 | 0 | { |
891 | 0 | if (myTarget < targetLimit) |
892 | 0 | { |
893 | 0 | *(myTarget++) = temp[indexToWrite]; |
894 | 0 | *(myOffsets++) = offsetNum; |
895 | 0 | } |
896 | 0 | else |
897 | 0 | { |
898 | 0 | args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; |
899 | 0 | *err = U_BUFFER_OVERFLOW_ERROR; |
900 | 0 | } |
901 | 0 | } |
902 | 0 | offsetNum = offsetNum + 1 + (temp[2] != 0); |
903 | 0 | } |
904 | | |
905 | 0 | if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) |
906 | 0 | { |
907 | 0 | *err = U_BUFFER_OVERFLOW_ERROR; |
908 | 0 | } |
909 | |
|
910 | 0 | args->target = (char *) myTarget; |
911 | 0 | args->source = mySource; |
912 | 0 | args->offsets = myOffsets; |
913 | 0 | } |
914 | | |
915 | | static UChar32 U_CALLCONV |
916 | | T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args, |
917 | | UErrorCode* err) |
918 | 0 | { |
919 | 0 | const uint8_t *mySource; |
920 | 0 | UChar32 myUChar; |
921 | 0 | int32_t length; |
922 | |
|
923 | 0 | mySource = (const uint8_t *)args->source; |
924 | 0 | if (mySource >= (const uint8_t *)args->sourceLimit) |
925 | 0 | { |
926 | | /* no input */ |
927 | 0 | *err = U_INDEX_OUTOFBOUNDS_ERROR; |
928 | 0 | return 0xffff; |
929 | 0 | } |
930 | | |
931 | 0 | length = (int32_t)((const uint8_t *)args->sourceLimit - mySource); |
932 | 0 | if (length < 4) |
933 | 0 | { |
934 | | /* got a partial character */ |
935 | 0 | uprv_memcpy(args->converter->toUBytes, mySource, length); |
936 | 0 | args->converter->toULength = (int8_t)length; |
937 | 0 | args->source = (const char *)(mySource + length); |
938 | 0 | *err = U_TRUNCATED_CHAR_FOUND; |
939 | 0 | return 0xffff; |
940 | 0 | } |
941 | | |
942 | | /* Don't even try to do a direct cast because the value may be on an odd address. */ |
943 | 0 | myUChar = ((UChar32)mySource[3] << 24) |
944 | 0 | | ((UChar32)mySource[2] << 16) |
945 | 0 | | ((UChar32)mySource[1] << 8) |
946 | 0 | | ((UChar32)mySource[0]); |
947 | |
|
948 | 0 | args->source = (const char *)(mySource + 4); |
949 | 0 | if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) { |
950 | 0 | return myUChar; |
951 | 0 | } |
952 | | |
953 | 0 | uprv_memcpy(args->converter->toUBytes, mySource, 4); |
954 | 0 | args->converter->toULength = 4; |
955 | |
|
956 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
957 | 0 | return 0xffff; |
958 | 0 | } |
959 | | U_CDECL_END |
960 | | static const UConverterImpl _UTF32LEImpl = { |
961 | | UCNV_UTF32_LittleEndian, |
962 | | |
963 | | NULL, |
964 | | NULL, |
965 | | |
966 | | NULL, |
967 | | NULL, |
968 | | NULL, |
969 | | |
970 | | T_UConverter_toUnicode_UTF32_LE, |
971 | | T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC, |
972 | | T_UConverter_fromUnicode_UTF32_LE, |
973 | | T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC, |
974 | | T_UConverter_getNextUChar_UTF32_LE, |
975 | | |
976 | | NULL, |
977 | | NULL, |
978 | | NULL, |
979 | | NULL, |
980 | | ucnv_getNonSurrogateUnicodeSet, |
981 | | |
982 | | NULL, |
983 | | NULL |
984 | | }; |
985 | | |
986 | | /* The 1232 CCSID refers to any version of Unicode with any endianness of UTF-32 */ |
987 | | static const UConverterStaticData _UTF32LEStaticData = { |
988 | | sizeof(UConverterStaticData), |
989 | | "UTF-32LE", |
990 | | 1234, |
991 | | UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4, |
992 | | { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE, |
993 | | 0, |
994 | | 0, |
995 | | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
996 | | }; |
997 | | |
998 | | |
/*
 * Shared-data object for the UTF-32LE converter: pairs the static
 * description (_UTF32LEStaticData) with the implementation table
 * (_UTF32LEImpl). Not declared static, so it has external linkage.
 */
const UConverterSharedData _UTF32LEData =
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32LEStaticData, &_UTF32LEImpl);
1001 | | |
1002 | | /* UTF-32 (Detect BOM) ------------------------------------------------------ */ |
1003 | | |
1004 | | /* |
1005 | | * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE |
1006 | | * accordingly. |
1007 | | * |
1008 | | * State values: |
1009 | | * 0 initial state |
1010 | | * 1 saw 00 |
1011 | | * 2 saw 00 00 |
1012 | | * 3 saw 00 00 FE |
1013 | | * 4 - |
1014 | | * 5 saw FF |
1015 | | * 6 saw FF FE |
1016 | | * 7 saw FF FE 00 |
1017 | | * 8 UTF-32BE mode |
1018 | | * 9 UTF-32LE mode |
1019 | | * |
1020 | | * During detection: state&3==number of matching bytes so far. |
1021 | | * |
1022 | | * On output, emit U+FEFF as the first code point. |
1023 | | */ |
1024 | | U_CDECL_BEGIN |
1025 | | static void U_CALLCONV |
1026 | 0 | _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) { |
1027 | 0 | if(choice<=UCNV_RESET_TO_UNICODE) { |
1028 | | /* reset toUnicode: state=0 */ |
1029 | 0 | cnv->mode=0; |
1030 | 0 | } |
1031 | 0 | if(choice!=UCNV_RESET_TO_UNICODE) { |
1032 | | /* reset fromUnicode: prepare to output the UTF-32PE BOM */ |
1033 | 0 | cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; |
1034 | 0 | } |
1035 | 0 | } |
1036 | | |
1037 | | static void U_CALLCONV |
1038 | | _UTF32Open(UConverter *cnv, |
1039 | | UConverterLoadArgs *pArgs, |
1040 | 0 | UErrorCode *pErrorCode) { |
1041 | 0 | (void)pArgs; |
1042 | 0 | (void)pErrorCode; |
1043 | 0 | _UTF32Reset(cnv, UCNV_RESET_BOTH); |
1044 | 0 | } |
1045 | | |
/*
 * The two UTF-32 byte order marks, stored back to back:
 *   [0..3]  00 00 FE FF   (U+FEFF, big-endian)
 *   [4..7]  FF FE 00 00   (U+FEFF, little-endian)
 * Code in this file indexes it with the detection state and picks one half
 * with (state&4).
 */
static const char utf32BOM[8]={
    0,           0,           (char)0xfeu, (char)0xffu,
    (char)0xffu, (char)0xfeu, 0,           0
};
1047 | | |
/*
 * toUnicode function of the BOM-detecting UTF-32 converter; it serves both
 * the plain and the with-offsets slot of _UTF32Impl, so pArgs->offsets may
 * be NULL.
 *
 * cnv->mode holds the detection state across calls (see the state table
 * above): states 0..7 match the leading bytes against the two BOMs, state 8
 * delegates to the UTF-32BE functions and state 9 to the UTF-32LE ones.
 * A complete BOM is consumed here and not passed on. Bytes that start like
 * a BOM but then diverge are handed to UTF-32BE as ordinary input.
 *
 * On return pArgs->source is updated to the first unprocessed byte and
 * cnv->mode to the resulting state; errors are reported through *pErrorCode
 * by the delegated functions.
 */
static void U_CALLCONV
_UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                           UErrorCode *pErrorCode) {
    UConverter *cnv=pArgs->converter;
    const char *source=pArgs->source;
    const char *sourceLimit=pArgs->sourceLimit;
    int32_t *offsets=pArgs->offsets; /* start of the offsets written by this call; NULL if none wanted */

    int32_t state, offsetDelta;
    char b;

    state=cnv->mode;

    /*
     * If we detect a BOM in this buffer, then we must add the BOM size to the
     * offsets because the actual converter function will not see and count the BOM.
     * offsetDelta will have the number of the BOM bytes that are in the current buffer.
     */
    offsetDelta=0;

    while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
        switch(state) {
        case 0:
            /* first byte of the stream: decide which BOM, if any, it could start */
            b=*source;
            if(b==0) {
                state=1; /* could be 00 00 FE FF */
            } else if(b==(char)0xffu) {
                state=5; /* could be FF FE 00 00 */
            } else {
                state=8; /* default to UTF-32BE */
                continue; /* re-enter the loop in state 8 without consuming the byte */
            }
            ++source;
            break;
        case 1:
        case 2:
        case 3:
        case 5:
        case 6:
        case 7:
            /* utf32BOM[state] is the next byte expected for the BOM being matched */
            if(*source==utf32BOM[state]) {
                ++state;
                ++source;
                if(state==4) {
                    state=8; /* detect UTF-32BE */
                    offsetDelta=(int32_t)(source-pArgs->source);
                } else if(state==8) {
                    state=9; /* detect UTF-32LE */
                    offsetDelta=(int32_t)(source-pArgs->source);
                }
            } else {
                /* switch to UTF-32BE and pass the previous bytes */
                int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */

                /* reset the source */
                source=pArgs->source;

                /* (state&3) is the total number of bytes matched so far, across all buffers */
                if(count==(state&3)) {
                    /* simple: all in the same buffer, just reset source */
                } else {
                    UBool oldFlush=pArgs->flush;

                    /* some of the bytes are from a previous buffer, replay those first */
                    pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
                    pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
                    pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */

                    /* no offsets: bytes from previous buffer, and not enough for output */
                    T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);

                    /* restore real pointers; pArgs->source will be set in case 8/9 */
                    pArgs->sourceLimit=sourceLimit;
                    pArgs->flush=oldFlush;
                }
                state=8;
                continue; /* re-enter the loop in state 8 with source rewound */
            }
            break;
        case 8:
            /* call UTF-32BE */
            pArgs->source=source;
            if(offsets==NULL) {
                T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
            } else {
                T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
            }
            source=pArgs->source;
            break;
        case 9:
            /* call UTF-32LE */
            pArgs->source=source;
            if(offsets==NULL) {
                T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
            } else {
                T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
            }
            source=pArgs->source;
            break;
        default:
            break; /* does not occur */
        }
    }

    /* add BOM size to offsets - see comment at offsetDelta declaration */
    if(offsets!=NULL && offsetDelta!=0) {
        /* pArgs->offsets now points past the last offset written during this call */
        int32_t *offsetsLimit=pArgs->offsets;
        while(offsets<offsetsLimit) {
            *offsets++ += offsetDelta;
        }
    }

    pArgs->source=source;

    if(source==sourceLimit && pArgs->flush) {
        /* handle truncated input */
        switch(state) {
        case 0:
            break; /* no input at all, nothing to do */
        case 8:
            T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
            break;
        case 9:
            T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
            break;
        default:
            /* handle 0<state<8: call UTF-32BE with too-short input */
            pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
            pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */

            /* no offsets: not enough for output */
            T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
            /* point the arguments back at the caller's buffer */
            pArgs->source=source;
            pArgs->sourceLimit=sourceLimit;
            state=8;
            break;
        }
    }

    /* remember the detection state for the next call */
    cnv->mode=state;
}
1188 | | |
1189 | | static UChar32 U_CALLCONV |
1190 | | _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs, |
1191 | 0 | UErrorCode *pErrorCode) { |
1192 | 0 | switch(pArgs->converter->mode) { |
1193 | 0 | case 8: |
1194 | 0 | return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode); |
1195 | 0 | case 9: |
1196 | 0 | return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode); |
1197 | 0 | default: |
1198 | 0 | return UCNV_GET_NEXT_UCHAR_USE_TO_U; |
1199 | 0 | } |
1200 | 0 | } |
1201 | | U_CDECL_END |
1202 | | static const UConverterImpl _UTF32Impl = { |
1203 | | UCNV_UTF32, |
1204 | | |
1205 | | NULL, |
1206 | | NULL, |
1207 | | |
1208 | | _UTF32Open, |
1209 | | NULL, |
1210 | | _UTF32Reset, |
1211 | | |
1212 | | _UTF32ToUnicodeWithOffsets, |
1213 | | _UTF32ToUnicodeWithOffsets, |
1214 | | #if U_IS_BIG_ENDIAN |
1215 | | T_UConverter_fromUnicode_UTF32_BE, |
1216 | | T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC, |
1217 | | #else |
1218 | | T_UConverter_fromUnicode_UTF32_LE, |
1219 | | T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC, |
1220 | | #endif |
1221 | | _UTF32GetNextUChar, |
1222 | | |
1223 | | NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ |
1224 | | NULL, |
1225 | | NULL, |
1226 | | NULL, |
1227 | | ucnv_getNonSurrogateUnicodeSet, |
1228 | | |
1229 | | NULL, |
1230 | | NULL |
1231 | | }; |
1232 | | |
1233 | | /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianness of UTF-32 */ |
1234 | | static const UConverterStaticData _UTF32StaticData = { |
1235 | | sizeof(UConverterStaticData), |
1236 | | "UTF-32", |
1237 | | 1236, |
1238 | | UCNV_IBM, UCNV_UTF32, 4, 4, |
1239 | | #if U_IS_BIG_ENDIAN |
1240 | | { 0, 0, 0xff, 0xfd }, 4, |
1241 | | #else |
1242 | | { 0xfd, 0xff, 0, 0 }, 4, |
1243 | | #endif |
1244 | | FALSE, FALSE, |
1245 | | 0, |
1246 | | 0, |
1247 | | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
1248 | | }; |
1249 | | |
/*
 * Shared-data object for the BOM-detecting UTF-32 converter: pairs the
 * static description (_UTF32StaticData) with the implementation table
 * (_UTF32Impl). Not declared static, so it has external linkage.
 */
const UConverterSharedData _UTF32Data =
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32StaticData, &_UTF32Impl);
1252 | | |
1253 | | #endif |