/src/icu/source/common/ucnv_u8.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ********************************************************************** |
5 | | * Copyright (C) 2002-2016, International Business Machines |
6 | | * Corporation and others. All Rights Reserved. |
7 | | ********************************************************************** |
8 | | * file name: ucnv_u8.c |
9 | | * encoding: UTF-8 |
10 | | * tab size: 8 (not used) |
11 | | * indentation:4 |
12 | | * |
13 | | * created on: 2002jul01 |
14 | | * created by: Markus W. Scherer |
15 | | * |
16 | | * UTF-8 converter implementation. Used to be in ucnv_utf.c. |
17 | | * |
18 | | * Also, CESU-8 implementation, see UTR 26. |
19 | | * The CESU-8 converter uses all the same functions as the |
20 | | * UTF-8 converter, with a branch for converting supplementary code points. |
21 | | */ |
22 | | |
23 | | #include "unicode/utypes.h" |
24 | | |
25 | | #if !UCONFIG_NO_CONVERSION |
26 | | |
27 | | #include "unicode/ucnv.h" |
28 | | #include "unicode/utf.h" |
29 | | #include "unicode/utf8.h" |
30 | | #include "unicode/utf16.h" |
31 | | #include "ucnv_bld.h" |
32 | | #include "ucnv_cnv.h" |
33 | | #include "cmemory.h" |
34 | | |
35 | | /* Prototypes --------------------------------------------------------------- */ |
36 | | |
37 | | /* Keep these here to make finicky compilers happy */ |
38 | | |
39 | | U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args, |
40 | | UErrorCode *err); |
41 | | U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args, |
42 | | UErrorCode *err); |
43 | | |
44 | | |
45 | | /* UTF-8 -------------------------------------------------------------------- */ |
46 | | |
47 | | /* UTF-8 Conversion DATA |
48 | | * for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9 |
49 | | */ |
50 | | /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/ |
/* Upper bounds of the code point ranges referred to by the converters. */
#define MAXIMUM_UCS2            0xFFFF
#define MAXIMUM_UTF             0x10FFFF
#define MAXIMUM_UCS4            0x7FFFFFFF

/* Constants for splitting a supplementary code point into a surrogate pair. */
#define HALF_SHIFT              10
#define HALF_BASE               0x10000
#define HALF_MASK               0x3FF

/* UTF-16 surrogate code unit ranges. */
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_HIGH_END      0xDBFF
#define SURROGATE_LOW_START     0xDC00
#define SURROGATE_LOW_END       0xDFFF

/* HALF_BASE - SURROGATE_LOW_START, i.e. 0x10000 - 0xDC00 = 9216 */
#define SURROGATE_LOW_BASE      0x2400
64 | | |
/*
 * Indexed by the total length of a UTF-8 sequence. Each entry is the value
 * that accumulates from the lead-byte marker bits and the 0x80 marker of every
 * trail byte when the bytes are folded with ch = (ch << 6) + byte (modulo 2^32);
 * subtracting it from the folded value leaves the code point.
 */
static const uint32_t offsetsFromUTF8[7] = {
    0x00000000u,    /* [0] no sequence has length 0 */
    0x00000000u,    /* [1] single byte, nothing to remove */
    0x00003080u,    /* [2] (0xC0 << 6) + 0x80 */
    0x000E2080u,    /* [3] (0xE0 << 12) + (0x80 << 6) + 0x80 */
    0x03C82080u,    /* [4] */
    0xFA082080u,    /* [5] */
    0x82082080u     /* [6] */
};
69 | | |
70 | | /* END OF UTF-8 Conversion DATA */ |
71 | | |
/*
 * Indexed by a byte value; gives the total length of the UTF-8 sequence that
 * the byte announces when it is the first byte of a sequence.
 * A value of 0 marks bytes that cannot start a sequence (trail bytes
 * 0x80..0xBF and the bytes 0xFE/0xFF); the converters treat 0 as illegal.
 * Lengths 5 and 6 are still listed here; they are rejected later through the
 * minimum-value check against utf8_minChar32[].
 */
static const int8_t bytesFromUTF8[256] = {
    /* 0x00..0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20..0x3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40..0x5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60..0x7F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0x9F: trail bytes */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0..0xBF: trail bytes */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two-byte lead bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: 3, 0xF0..0xF7: 4, 0xF8..0xFB: 5, 0xFC..0xFD: 6, 0xFE..0xFF: 0 */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
82 | | |
83 | | /* |
84 | | * Starting with Unicode 3.0.1: |
85 | | * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N]; |
86 | | * byte sequences with more than 4 bytes are illegal in UTF-8, |
87 | | * which is tested with impossible values for them |
88 | | */ |
/*
 * Indexed by sequence length: the smallest value an N-byte sequence may
 * encode. Lengths 5 and 6 carry a value no code point can reach, so every
 * such sequence fails the "ch >= utf8_minChar32[N]" test.
 */
static const uint32_t utf8_minChar32[7] = {
    0,              /* [0] */
    0,              /* [1] */
    0x80,           /* [2] */
    0x800,          /* [3] */
    0x10000,        /* [4] */
    0xffffffff,     /* [5] never satisfiable */
    0xffffffff      /* [6] never satisfiable */
};
91 | | |
92 | | static UBool hasCESU8Data(const UConverter *cnv) |
93 | 468k | { |
94 | | #if UCONFIG_ONLY_HTML_CONVERSION |
95 | | return FALSE; |
96 | | #else |
97 | 468k | return (UBool)(cnv->sharedData == &_CESU8Data); |
98 | 468k | #endif |
99 | 468k | } |
100 | | U_CDECL_BEGIN |
/*
 * Converts UTF-8 (or CESU-8, see hasCESU8Data()) bytes in
 * [args->source, args->sourceLimit) to UTF-16 code units written to
 * [args->target, args->targetLimit).
 *
 * - Bytes below 0x80 are copied straight through.
 * - For any other byte the sequence length comes from bytesFromUTF8[], the
 *   trail bytes are folded into ch, and the result is validated (length,
 *   range, shortest form, surrogate/CESU-8 rules).
 * - If the source ends in the middle of a sequence, the partial state is
 *   stored in the converter (toUnicodeStatus = folded value, mode = total
 *   length, toULength = bytes consumed so far) and picked up again at the
 *   start of the next call.
 * - An invalid sequence stops the loop with *err = U_ILLEGAL_CHAR_FOUND and
 *   cnv->toULength set to the number of bytes stored in cnv->toUBytes.
 * - If the second unit of a surrogate pair does not fit, it goes to
 *   cnv->UCharErrorBuffer and *err becomes U_BUFFER_OVERFLOW_ERROR.
 *
 * On return args->source and args->target are advanced past what was consumed
 * and produced.
 */
static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
                                  UErrorCode * err)
{
    UConverter *cnv = args->converter;
    const unsigned char *mySource = (unsigned char *) args->source;
    UChar *myTarget = args->target;
    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    const UChar *targetLimit = args->targetLimit;
    unsigned char *toUBytes = cnv->toUBytes;
    UBool isCESU8 = hasCESU8Data(cnv);
    uint32_t ch, ch2 = 0;
    int32_t i, inBytes;

    /* Resume a sequence that was cut off at the end of the previous source buffer */
    if (cnv->toUnicodeStatus && myTarget < targetLimit)
    {
        inBytes = cnv->mode;            /* total length of the pending sequence */
        i = cnv->toULength;             /* number of its bytes already consumed */
        cnv->toULength = 0;

        ch = cnv->toUnicodeStatus;      /* value folded from the bytes seen so far */
        cnv->toUnicodeStatus = 0;
        goto morebytes;
    }


    while (mySource < sourceLimit && myTarget < targetLimit)
    {
        ch = *(mySource++);
        if (ch < 0x80)        /* Simple case: ASCII */
        {
            *(myTarget++) = (UChar) ch;
        }
        else
        {
            /* remember the first byte for error reporting */
            toUBytes[0] = (char)ch;
            inBytes = bytesFromUTF8[ch]; /* lookup current sequence length (0 for a non-lead byte) */
            i = 1;

morebytes:
            /* Collect trail bytes; the resume path above jumps in here. */
            while (i < inBytes)
            {
                if (mySource < sourceLimit)
                {
                    toUBytes[i] = (char) (ch2 = *mySource);
                    if (!U8_IS_TRAIL(ch2))
                    {
                        /* not consumed: leaves i < inBytes, which fails validation below */
                        break;
                    }
                    ch = (ch << 6) + ch2;
                    ++mySource;
                    i++;
                }
                else
                {
                    /* out of input: save the partial sequence for the next call */
                    cnv->toUnicodeStatus = ch;
                    cnv->mode = inBytes;
                    cnv->toULength = (int8_t) i;
                    goto donefornow;
                }
            }

            /* Remove the accumulated high bits */
            ch -= offsetsFromUTF8[inBytes];

            /*
             * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
             * - use only trail bytes after a lead byte (checked above)
             * - use the right number of trail bytes for a given lead byte
             * - encode a code point <= U+10ffff
             * - use the fewest possible number of bytes for their code points
             * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
             *
             * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
             * There are no irregular sequences any more.
             * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
             */
            if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
                (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
            {
                /* Complete, valid sequence (the trail-byte loop did not stop early) */
                if (ch <= MAXIMUM_UCS2)
                {
                    /* fits in 16 bits */
                    *(myTarget++) = (UChar) ch;
                }
                else
                {
                    /* write out the surrogates */
                    ch -= HALF_BASE;
                    *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
                    ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
                    if (myTarget < targetLimit)
                    {
                        *(myTarget++) = (UChar)ch;
                    }
                    else
                    {
                        /* No room for the trail surrogate: park it in the converter's overflow buffer */
                        cnv->UCharErrorBuffer[0] = (UChar) ch;
                        cnv->UCharErrorBufferLength = 1;
                        *err = U_BUFFER_OVERFLOW_ERROR;
                        break;
                    }
                }
            }
            else
            {
                /* Invalid sequence: report how many bytes of it are stored in toUBytes */
                cnv->toULength = (int8_t)i;
                *err = U_ILLEGAL_CHAR_FOUND;
                break;
            }
        }
    }

donefornow:
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    {
        /* End of target buffer */
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    /* Report progress back to the caller */
    args->target = myTarget;
    args->source = (const char *) mySource;
}
228 | | |
/*
 * Same conversion as ucnv_toUnicode_UTF8(), with one extra output: for every
 * UTF-16 code unit written to the target, the value of offsetNum for the
 * sequence that produced it is written to args->offsets. offsetNum starts at
 * 0 on every call, grows by 1 for each ASCII byte and by the sequence length
 * for each multi-byte sequence. Both halves of a surrogate pair get the same
 * offset.
 *
 * Partial sequences, illegal sequences and target overflow are handled as in
 * the non-offset variant. On return args->source, args->target and
 * args->offsets are advanced.
 */
static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
                                                UErrorCode * err)
{
    UConverter *cnv = args->converter;
    const unsigned char *mySource = (unsigned char *) args->source;
    UChar *myTarget = args->target;
    int32_t *myOffsets = args->offsets;
    int32_t offsetNum = 0;
    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    const UChar *targetLimit = args->targetLimit;
    unsigned char *toUBytes = cnv->toUBytes;
    UBool isCESU8 = hasCESU8Data(cnv);
    uint32_t ch, ch2 = 0;
    int32_t i, inBytes;

    /* Resume a sequence that was cut off at the end of the previous source buffer */
    if (cnv->toUnicodeStatus && myTarget < targetLimit)
    {
        inBytes = cnv->mode;            /* total length of the pending sequence */
        i = cnv->toULength;             /* number of its bytes already consumed */
        cnv->toULength = 0;

        ch = cnv->toUnicodeStatus;      /* value folded from the bytes seen so far */
        cnv->toUnicodeStatus = 0;
        goto morebytes;
    }

    while (mySource < sourceLimit && myTarget < targetLimit)
    {
        ch = *(mySource++);
        if (ch < 0x80)        /* Simple case: ASCII */
        {
            *(myTarget++) = (UChar) ch;
            *(myOffsets++) = offsetNum++;
        }
        else
        {
            /* remember the first byte for error reporting */
            toUBytes[0] = (char)ch;
            inBytes = bytesFromUTF8[ch]; /* sequence length, 0 for a non-lead byte */
            i = 1;

morebytes:
            /* Collect trail bytes; the resume path above jumps in here. */
            while (i < inBytes)
            {
                if (mySource < sourceLimit)
                {
                    toUBytes[i] = (char) (ch2 = *mySource);
                    if (!U8_IS_TRAIL(ch2))
                    {
                        /* not consumed: leaves i < inBytes, which fails validation below */
                        break;
                    }
                    ch = (ch << 6) + ch2;
                    ++mySource;
                    i++;
                }
                else
                {
                    /* out of input: save the partial sequence for the next call */
                    cnv->toUnicodeStatus = ch;
                    cnv->mode = inBytes;
                    cnv->toULength = (int8_t)i;
                    goto donefornow;
                }
            }

            /* Remove the accumulated high bits */
            ch -= offsetsFromUTF8[inBytes];

            /*
             * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
             * - use only trail bytes after a lead byte (checked above)
             * - use the right number of trail bytes for a given lead byte
             * - encode a code point <= U+10ffff
             * - use the fewest possible number of bytes for their code points
             * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
             *
             * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
             * There are no irregular sequences any more.
             * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
             */
            if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
                (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
            {
                /* Complete, valid sequence (the trail-byte loop did not stop early) */
                if (ch <= MAXIMUM_UCS2)
                {
                    /* fits in 16 bits */
                    *(myTarget++) = (UChar) ch;
                    *(myOffsets++) = offsetNum;
                }
                else
                {
                    /* write out the surrogates */
                    ch -= HALF_BASE;
                    *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
                    *(myOffsets++) = offsetNum;
                    ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
                    if (myTarget < targetLimit)
                    {
                        *(myTarget++) = (UChar)ch;
                        *(myOffsets++) = offsetNum;
                    }
                    else
                    {
                        /*
                         * No room for the trail surrogate: park it in the converter's
                         * overflow buffer. No break is needed here; the target is full,
                         * so the loop condition ends the loop.
                         */
                        cnv->UCharErrorBuffer[0] = (UChar) ch;
                        cnv->UCharErrorBufferLength = 1;
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }
                /*
                 * NOTE(review): after resuming a partial sequence, i also counts the
                 * bytes consumed by the previous call, so offsetNum advances by more
                 * than the number of bytes read from this buffer - confirm whether
                 * the caller compensates for that.
                 */
                offsetNum += i;
            }
            else
            {
                /* Invalid sequence: report how many bytes of it are stored in toUBytes */
                cnv->toULength = (int8_t)i;
                *err = U_ILLEGAL_CHAR_FOUND;
                break;
            }
        }
    }

donefornow:
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    {   /* End of target buffer */
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    /* Report progress back to the caller */
    args->target = myTarget;
    args->source = (const char *) mySource;
    args->offsets = myOffsets;
}
358 | | U_CDECL_END |
359 | | |
/*
 * Converts UTF-16 code units in [args->source, args->sourceLimit) to UTF-8
 * (or CESU-8, see hasCESU8Data()) bytes written to
 * [args->target, args->targetLimit).
 *
 * - Units below 0x80 become one byte, units below 0x800 two bytes, everything
 *   else three or four bytes.
 * - Unless the converter is CESU-8, a lead surrogate followed by a trail
 *   surrogate is combined into one supplementary code point. An unpaired
 *   surrogate is stored in cnv->fromUChar32 and reported with
 *   U_ILLEGAL_CHAR_FOUND. A surrogate that is the last unit of the source is
 *   stored in cnv->fromUChar32 without an error and picked up by the next call.
 * - For CESU-8 the surrogate check is skipped, so each surrogate is written
 *   as its own three-byte sequence.
 * - Bytes that do not fit into the target go to cnv->charErrorBuffer and
 *   *err becomes U_BUFFER_OVERFLOW_ERROR.
 *
 * On return args->source and args->target are advanced.
 */
U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
                                    UErrorCode * err)
{
    UConverter *cnv = args->converter;
    const UChar *mySource = args->source;
    const UChar *sourceLimit = args->sourceLimit;
    uint8_t *myTarget = (uint8_t *) args->target;
    const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
    uint8_t *tempPtr;
    UChar32 ch;
    uint8_t tempBuf[4];
    int32_t indexToWrite;
    UBool isNotCESU8 = !hasCESU8Data(cnv);

    /* A surrogate was left pending by the previous call: continue with it */
    if (cnv->fromUChar32 && myTarget < targetLimit)
    {
        ch = cnv->fromUChar32;
        cnv->fromUChar32 = 0;
        goto lowsurrogate;
    }

    while (mySource < sourceLimit && myTarget < targetLimit)
    {
        ch = *(mySource++);

        if (ch < 0x80)        /* Single byte */
        {
            *(myTarget++) = (uint8_t) ch;
        }
        else if (ch < 0x800)  /* Double byte */
        {
            *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
            if (myTarget < targetLimit)
            {
                *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
            }
            else
            {
                /* second byte does not fit: hand it over through the overflow buffer */
                cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
                cnv->charErrorBufferLength = 1;
                *err = U_BUFFER_OVERFLOW_ERROR;
            }
        }
        else {
            /* Check for surrogates */
            if(U16_IS_SURROGATE(ch) && isNotCESU8) {
lowsurrogate:
                /* the resume path above jumps in here with the pending surrogate in ch */
                if (mySource < sourceLimit) {
                    /* test both code units */
                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
                        /* convert and consume this supplementary code point */
                        ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
                        ++mySource;
                        /* exit this condition tree */
                    }
                    else {
                        /* this is an unpaired trail or lead code unit */
                        /* callback(illegal) */
                        cnv->fromUChar32 = ch;
                        *err = U_ILLEGAL_CHAR_FOUND;
                        break;
                    }
                }
                else {
                    /* no more input: keep the surrogate for the next call */
                    cnv->fromUChar32 = ch;
                    break;
                }
            }

            /* Do we write the buffer directly for speed,
            or do we have to be careful about target buffer space? */
            tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);

            /* indexToWrite is the index of the last byte of the sequence (2 or 3) */
            if (ch <= MAXIMUM_UCS2) {
                indexToWrite = 2;
                tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
            }
            else {
                indexToWrite = 3;
                tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
                tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
            }
            tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
            tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);

            if (tempPtr == myTarget) {
                /* There was enough space to write the codepoint directly. */
                myTarget += (indexToWrite + 1);
            }
            else {
                /* We might run out of room soon. Write it slowly. */
                for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
                    if (myTarget < targetLimit) {
                        *(myTarget++) = *tempPtr;
                    }
                    else {
                        /* appends at charErrorBufferLength; presumably 0 on entry - TODO confirm */
                        cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }
            }
        }
    }

    /* Input left over but the target is full */
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    {
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    /* Report progress back to the caller */
    args->target = (char *) myTarget;
    args->source = mySource;
}
473 | | |
/*
 * Same conversion as ucnv_fromUnicode_UTF8(), with one extra output: for
 * every byte written to the target, the source index (offsetNum) of the
 * UTF-16 unit that started the character is written to args->offsets.
 * offsetNum starts at 0 on each call; when the call begins by resuming a
 * surrogate saved in cnv->fromUChar32, the bytes of that character get
 * offset -1.
 *
 * On return args->source, args->target and args->offsets are advanced.
 */
U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
                                                  UErrorCode * err)
{
    UConverter *cnv = args->converter;
    const UChar *mySource = args->source;
    int32_t *myOffsets = args->offsets;
    const UChar *sourceLimit = args->sourceLimit;
    uint8_t *myTarget = (uint8_t *) args->target;
    const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
    uint8_t *tempPtr;
    UChar32 ch;
    int32_t offsetNum, nextSourceIndex;
    int32_t indexToWrite;
    uint8_t tempBuf[4];
    UBool isNotCESU8 = !hasCESU8Data(cnv);

    /* A surrogate was left pending by the previous call: continue with it */
    if (cnv->fromUChar32 && myTarget < targetLimit)
    {
        ch = cnv->fromUChar32;
        cnv->fromUChar32 = 0;
        offsetNum = -1;         /* the pending unit is not part of this source buffer */
        nextSourceIndex = 0;
        goto lowsurrogate;
    } else {
        offsetNum = 0;
    }

    while (mySource < sourceLimit && myTarget < targetLimit)
    {
        ch = *(mySource++);

        if (ch < 0x80)        /* Single byte */
        {
            *(myOffsets++) = offsetNum++;
            *(myTarget++) = (char) ch;
        }
        else if (ch < 0x800)  /* Double byte */
        {
            *(myOffsets++) = offsetNum;
            *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
            if (myTarget < targetLimit)
            {
                *(myOffsets++) = offsetNum++;
                *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
            }
            else
            {
                /* second byte does not fit: hand it over through the overflow buffer */
                cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
                cnv->charErrorBufferLength = 1;
                *err = U_BUFFER_OVERFLOW_ERROR;
            }
        }
        else
        /* Check for surrogates */
        {
            /* source index after this unit; bumped again below if a trail surrogate is consumed */
            nextSourceIndex = offsetNum + 1;

            if(U16_IS_SURROGATE(ch) && isNotCESU8) {
lowsurrogate:
                /* the resume path above jumps in here with the pending surrogate in ch */
                if (mySource < sourceLimit) {
                    /* test both code units */
                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
                        /* convert and consume this supplementary code point */
                        ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
                        ++mySource;
                        ++nextSourceIndex;
                        /* exit this condition tree */
                    }
                    else {
                        /* this is an unpaired trail or lead code unit */
                        /* callback(illegal) */
                        cnv->fromUChar32 = ch;
                        *err = U_ILLEGAL_CHAR_FOUND;
                        break;
                    }
                }
                else {
                    /* no more input: keep the surrogate for the next call */
                    cnv->fromUChar32 = ch;
                    break;
                }
            }

            /* Do we write the buffer directly for speed,
            or do we have to be careful about target buffer space? */
            tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);

            /* indexToWrite is the index of the last byte of the sequence (2 or 3) */
            if (ch <= MAXIMUM_UCS2) {
                indexToWrite = 2;
                tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
            }
            else {
                indexToWrite = 3;
                tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
                tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
            }
            tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
            tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);

            if (tempPtr == myTarget) {
                /* There was enough space to write the codepoint directly. */
                myTarget += (indexToWrite + 1);
                /* one offset per output byte: three always, a fourth for 4-byte sequences */
                myOffsets[0] = offsetNum;
                myOffsets[1] = offsetNum;
                myOffsets[2] = offsetNum;
                if (indexToWrite >= 3) {
                    myOffsets[3] = offsetNum;
                }
                myOffsets += (indexToWrite + 1);
            }
            else {
                /* We might run out of room soon. Write it slowly. */
                for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
                    if (myTarget < targetLimit)
                    {
                        *(myOffsets++) = offsetNum;
                        *(myTarget++) = *tempPtr;
                    }
                    else
                    {
                        /* appends at charErrorBufferLength; presumably 0 on entry - TODO confirm */
                        cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }
            }
            offsetNum = nextSourceIndex;
        }
    }

    /* Input left over but the target is full */
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    {
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    /* Report progress back to the caller */
    args->target = (char *) myTarget;
    args->source = mySource;
    args->offsets = myOffsets;
}
612 | | |
613 | | U_CDECL_BEGIN |
614 | | static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args, |
615 | 0 | UErrorCode *err) { |
616 | 0 | UConverter *cnv; |
617 | 0 | const uint8_t *sourceInitial; |
618 | 0 | const uint8_t *source; |
619 | 0 | uint16_t extraBytesToWrite; |
620 | 0 | uint8_t myByte; |
621 | 0 | UChar32 ch; |
622 | 0 | int8_t i, isLegalSequence; |
623 | | |
624 | | /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */ |
625 | |
|
626 | 0 | cnv = args->converter; |
627 | 0 | sourceInitial = source = (const uint8_t *)args->source; |
628 | 0 | if (source >= (const uint8_t *)args->sourceLimit) |
629 | 0 | { |
630 | | /* no input */ |
631 | 0 | *err = U_INDEX_OUTOFBOUNDS_ERROR; |
632 | 0 | return 0xffff; |
633 | 0 | } |
634 | | |
635 | 0 | myByte = (uint8_t)*(source++); |
636 | 0 | if (myByte < 0x80) |
637 | 0 | { |
638 | 0 | args->source = (const char *)source; |
639 | 0 | return (UChar32)myByte; |
640 | 0 | } |
641 | | |
642 | 0 | extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte]; |
643 | 0 | if (extraBytesToWrite == 0) { |
644 | 0 | cnv->toUBytes[0] = myByte; |
645 | 0 | cnv->toULength = 1; |
646 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
647 | 0 | args->source = (const char *)source; |
648 | 0 | return 0xffff; |
649 | 0 | } |
650 | | |
651 | | /*The byte sequence is longer than the buffer area passed*/ |
652 | 0 | if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit) |
653 | 0 | { |
654 | | /* check if all of the remaining bytes are trail bytes */ |
655 | 0 | cnv->toUBytes[0] = myByte; |
656 | 0 | i = 1; |
657 | 0 | *err = U_TRUNCATED_CHAR_FOUND; |
658 | 0 | while(source < (const uint8_t *)args->sourceLimit) { |
659 | 0 | if(U8_IS_TRAIL(myByte = *source)) { |
660 | 0 | cnv->toUBytes[i++] = myByte; |
661 | 0 | ++source; |
662 | 0 | } else { |
663 | | /* error even before we run out of input */ |
664 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
665 | 0 | break; |
666 | 0 | } |
667 | 0 | } |
668 | 0 | cnv->toULength = i; |
669 | 0 | args->source = (const char *)source; |
670 | 0 | return 0xffff; |
671 | 0 | } |
672 | | |
673 | 0 | isLegalSequence = 1; |
674 | 0 | ch = myByte << 6; |
675 | 0 | switch(extraBytesToWrite) |
676 | 0 | { |
677 | | /* note: code falls through cases! (sic)*/ |
678 | 0 | case 6: |
679 | 0 | ch += (myByte = *source); |
680 | 0 | ch <<= 6; |
681 | 0 | if (!U8_IS_TRAIL(myByte)) |
682 | 0 | { |
683 | 0 | isLegalSequence = 0; |
684 | 0 | break; |
685 | 0 | } |
686 | 0 | ++source; |
687 | 0 | U_FALLTHROUGH; |
688 | 0 | case 5: |
689 | 0 | ch += (myByte = *source); |
690 | 0 | ch <<= 6; |
691 | 0 | if (!U8_IS_TRAIL(myByte)) |
692 | 0 | { |
693 | 0 | isLegalSequence = 0; |
694 | 0 | break; |
695 | 0 | } |
696 | 0 | ++source; |
697 | 0 | U_FALLTHROUGH; |
698 | 0 | case 4: |
699 | 0 | ch += (myByte = *source); |
700 | 0 | ch <<= 6; |
701 | 0 | if (!U8_IS_TRAIL(myByte)) |
702 | 0 | { |
703 | 0 | isLegalSequence = 0; |
704 | 0 | break; |
705 | 0 | } |
706 | 0 | ++source; |
707 | 0 | U_FALLTHROUGH; |
708 | 0 | case 3: |
709 | 0 | ch += (myByte = *source); |
710 | 0 | ch <<= 6; |
711 | 0 | if (!U8_IS_TRAIL(myByte)) |
712 | 0 | { |
713 | 0 | isLegalSequence = 0; |
714 | 0 | break; |
715 | 0 | } |
716 | 0 | ++source; |
717 | 0 | U_FALLTHROUGH; |
718 | 0 | case 2: |
719 | 0 | ch += (myByte = *source); |
720 | 0 | if (!U8_IS_TRAIL(myByte)) |
721 | 0 | { |
722 | 0 | isLegalSequence = 0; |
723 | 0 | break; |
724 | 0 | } |
725 | 0 | ++source; |
726 | 0 | }; |
727 | 0 | ch -= offsetsFromUTF8[extraBytesToWrite]; |
728 | 0 | args->source = (const char *)source; |
729 | | |
730 | | /* |
731 | | * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: |
732 | | * - use only trail bytes after a lead byte (checked above) |
733 | | * - use the right number of trail bytes for a given lead byte |
734 | | * - encode a code point <= U+10ffff |
735 | | * - use the fewest possible number of bytes for their code points |
736 | | * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) |
737 | | * |
738 | | * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. |
739 | | * There are no irregular sequences any more. |
740 | | */ |
741 | 0 | if (isLegalSequence && |
742 | 0 | (uint32_t)ch <= MAXIMUM_UTF && |
743 | 0 | (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] && |
744 | 0 | !U_IS_SURROGATE(ch) |
745 | 0 | ) { |
746 | 0 | return ch; /* return the code point */ |
747 | 0 | } |
748 | | |
749 | 0 | for(i = 0; sourceInitial < source; ++i) { |
750 | 0 | cnv->toUBytes[i] = *sourceInitial++; |
751 | 0 | } |
752 | 0 | cnv->toULength = i; |
753 | 0 | *err = U_ILLEGAL_CHAR_FOUND; |
754 | 0 | return 0xffff; |
755 | 0 | } |
756 | | U_CDECL_END |
757 | | |
758 | | /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */ |
759 | | |
760 | | /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */ |
761 | | static const UChar32 |
762 | | utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 }; |
763 | | |
764 | | /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */ |
765 | | static const UChar32 |
766 | | utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 }; |
767 | | |
768 | | U_CDECL_BEGIN |
769 | | /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */ |
770 | | static void U_CALLCONV |
771 | | ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, |
772 | | UConverterToUnicodeArgs *pToUArgs, |
773 | 0 | UErrorCode *pErrorCode) { |
774 | 0 | UConverter *utf8; |
775 | 0 | const uint8_t *source, *sourceLimit; |
776 | 0 | uint8_t *target; |
777 | 0 | int32_t targetCapacity; |
778 | 0 | int32_t count; |
779 | |
|
780 | 0 | int8_t oldToULength, toULength, toULimit; |
781 | |
|
782 | 0 | UChar32 c; |
783 | 0 | uint8_t b, t1, t2; |
784 | | |
785 | | /* set up the local pointers */ |
786 | 0 | utf8=pToUArgs->converter; |
787 | 0 | source=(uint8_t *)pToUArgs->source; |
788 | 0 | sourceLimit=(uint8_t *)pToUArgs->sourceLimit; |
789 | 0 | target=(uint8_t *)pFromUArgs->target; |
790 | 0 | targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); |
791 | | |
792 | | /* get the converter state from the UTF-8 UConverter */ |
793 | 0 | c=(UChar32)utf8->toUnicodeStatus; |
794 | 0 | if(c!=0) { |
795 | 0 | toULength=oldToULength=utf8->toULength; |
796 | 0 | toULimit=(int8_t)utf8->mode; |
797 | 0 | } else { |
798 | 0 | toULength=oldToULength=toULimit=0; |
799 | 0 | } |
800 | |
|
801 | 0 | count=(int32_t)(sourceLimit-source)+oldToULength; |
802 | 0 | if(count<toULimit) { |
803 | | /* |
804 | | * Not enough input to complete the partial character. |
805 | | * Jump to moreBytes below - it will not output to target. |
806 | | */ |
807 | 0 | } else if(targetCapacity<toULimit) { |
808 | | /* |
809 | | * Not enough target capacity to output the partial character. |
810 | | * Let the standard converter handle this. |
811 | | */ |
812 | 0 | *pErrorCode=U_USING_DEFAULT_WARNING; |
813 | 0 | return; |
814 | 0 | } else { |
815 | | /* |
816 | | * Use a single counter for source and target, counting the minimum of |
817 | | * the source length and the target capacity. |
818 | | * As a result, the source length is checked only once per multi-byte |
819 | | * character instead of twice. |
820 | | * |
821 | | * Make sure that the last byte sequence is complete, or else |
822 | | * stop just before it. |
823 | | * (The longest legal byte sequence has 3 trail bytes.) |
824 | | * Count oldToULength (number of source bytes from a previous buffer) |
825 | | * into the source length but reduce the source index by toULimit |
826 | | * while going back over trail bytes in order to not go back into |
827 | | * the bytes that will be read for finishing a partial |
828 | | * sequence from the previous buffer. |
829 | | * Let the standard converter handle edge cases. |
830 | | */ |
831 | 0 | int32_t i; |
832 | |
|
833 | 0 | if(count>targetCapacity) { |
834 | 0 | count=targetCapacity; |
835 | 0 | } |
836 | |
|
837 | 0 | i=0; |
838 | 0 | while(i<3 && i<(count-toULimit)) { |
839 | 0 | b=source[count-oldToULength-i-1]; |
840 | 0 | if(U8_IS_TRAIL(b)) { |
841 | 0 | ++i; |
842 | 0 | } else { |
843 | 0 | if(i<U8_COUNT_TRAIL_BYTES(b)) { |
844 | | /* stop converting before the lead byte if there are not enough trail bytes for it */ |
845 | 0 | count-=i+1; |
846 | 0 | } |
847 | 0 | break; |
848 | 0 | } |
849 | 0 | } |
850 | 0 | } |
851 | | |
852 | 0 | if(c!=0) { |
853 | 0 | utf8->toUnicodeStatus=0; |
854 | 0 | utf8->toULength=0; |
855 | 0 | goto moreBytes; |
856 | | /* See note in ucnv_SBCSFromUTF8() about this goto. */ |
857 | 0 | } |
858 | | |
859 | | /* conversion loop */ |
860 | 0 | while(count>0) { |
861 | 0 | b=*source++; |
862 | 0 | if((int8_t)b>=0) { |
863 | | /* convert ASCII */ |
864 | 0 | *target++=b; |
865 | 0 | --count; |
866 | 0 | continue; |
867 | 0 | } else { |
868 | 0 | if(b>0xe0) { |
869 | 0 | if( /* handle U+1000..U+D7FF inline */ |
870 | 0 | (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) || |
871 | 0 | (b==0xed && (t1 <= 0x9f))) && |
872 | 0 | (t2=source[1]) >= 0x80 && t2 <= 0xbf |
873 | 0 | ) { |
874 | 0 | source+=2; |
875 | 0 | *target++=b; |
876 | 0 | *target++=t1; |
877 | 0 | *target++=t2; |
878 | 0 | count-=3; |
879 | 0 | continue; |
880 | 0 | } |
881 | 0 | } else if(b<0xe0) { |
882 | 0 | if( /* handle U+0080..U+07FF inline */ |
883 | 0 | b>=0xc2 && |
884 | 0 | (t1=*source) >= 0x80 && t1 <= 0xbf |
885 | 0 | ) { |
886 | 0 | ++source; |
887 | 0 | *target++=b; |
888 | 0 | *target++=t1; |
889 | 0 | count-=2; |
890 | 0 | continue; |
891 | 0 | } |
892 | 0 | } else if(b==0xe0) { |
893 | 0 | if( /* handle U+0800..U+0FFF inline */ |
894 | 0 | (t1=source[0]) >= 0xa0 && t1 <= 0xbf && |
895 | 0 | (t2=source[1]) >= 0x80 && t2 <= 0xbf |
896 | 0 | ) { |
897 | 0 | source+=2; |
898 | 0 | *target++=b; |
899 | 0 | *target++=t1; |
900 | 0 | *target++=t2; |
901 | 0 | count-=3; |
902 | 0 | continue; |
903 | 0 | } |
904 | 0 | } |
905 | | |
906 | | /* handle "complicated" and error cases, and continuing partial characters */ |
907 | 0 | oldToULength=0; |
908 | 0 | toULength=1; |
909 | 0 | toULimit=U8_COUNT_TRAIL_BYTES(b)+1; |
910 | 0 | c=b; |
911 | 0 | moreBytes: |
912 | 0 | while(toULength<toULimit) { |
913 | 0 | if(source<sourceLimit) { |
914 | 0 | b=*source; |
915 | 0 | if(U8_IS_TRAIL(b)) { |
916 | 0 | ++source; |
917 | 0 | ++toULength; |
918 | 0 | c=(c<<6)+b; |
919 | 0 | } else { |
920 | 0 | break; /* sequence too short, stop with toULength<toULimit */ |
921 | 0 | } |
922 | 0 | } else { |
923 | | /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ |
924 | 0 | source-=(toULength-oldToULength); |
925 | 0 | while(oldToULength<toULength) { |
926 | 0 | utf8->toUBytes[oldToULength++]=*source++; |
927 | 0 | } |
928 | 0 | utf8->toUnicodeStatus=c; |
929 | 0 | utf8->toULength=toULength; |
930 | 0 | utf8->mode=toULimit; |
931 | 0 | pToUArgs->source=(char *)source; |
932 | 0 | pFromUArgs->target=(char *)target; |
933 | 0 | return; |
934 | 0 | } |
935 | 0 | } |
936 | | |
937 | 0 | if( toULength==toULimit && /* consumed all trail bytes */ |
938 | 0 | (toULength==3 || toULength==2) && /* BMP */ |
939 | 0 | (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && |
940 | 0 | (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ |
941 | 0 | ) { |
942 | | /* legal byte sequence for BMP code point */ |
943 | 0 | } else if( |
944 | 0 | toULength==toULimit && toULength==4 && |
945 | 0 | (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) |
946 | 0 | ) { |
947 | | /* legal byte sequence for supplementary code point */ |
948 | 0 | } else { |
949 | | /* error handling: illegal UTF-8 byte sequence */ |
950 | 0 | source-=(toULength-oldToULength); |
951 | 0 | while(oldToULength<toULength) { |
952 | 0 | utf8->toUBytes[oldToULength++]=*source++; |
953 | 0 | } |
954 | 0 | utf8->toULength=toULength; |
955 | 0 | pToUArgs->source=(char *)source; |
956 | 0 | pFromUArgs->target=(char *)target; |
957 | 0 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
958 | 0 | return; |
959 | 0 | } |
960 | | |
961 | | /* copy the legal byte sequence to the target */ |
962 | 0 | { |
963 | 0 | int8_t i; |
964 | |
|
965 | 0 | for(i=0; i<oldToULength; ++i) { |
966 | 0 | *target++=utf8->toUBytes[i]; |
967 | 0 | } |
968 | 0 | source-=(toULength-oldToULength); |
969 | 0 | for(; i<toULength; ++i) { |
970 | 0 | *target++=*source++; |
971 | 0 | } |
972 | 0 | count-=toULength; |
973 | 0 | } |
974 | 0 | } |
975 | 0 | } |
976 | | |
977 | 0 | if(U_SUCCESS(*pErrorCode) && source<sourceLimit) { |
978 | 0 | if(target==(const uint8_t *)pFromUArgs->targetLimit) { |
979 | 0 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
980 | 0 | } else { |
981 | 0 | b=*source; |
982 | 0 | toULimit=U8_COUNT_TRAIL_BYTES(b)+1; |
983 | 0 | if(toULimit>(sourceLimit-source)) { |
984 | | /* collect a truncated byte sequence */ |
985 | 0 | toULength=0; |
986 | 0 | c=b; |
987 | 0 | for(;;) { |
988 | 0 | utf8->toUBytes[toULength++]=b; |
989 | 0 | if(++source==sourceLimit) { |
990 | | /* partial byte sequence at end of source */ |
991 | 0 | utf8->toUnicodeStatus=c; |
992 | 0 | utf8->toULength=toULength; |
993 | 0 | utf8->mode=toULimit; |
994 | 0 | break; |
995 | 0 | } else if(!U8_IS_TRAIL(b=*source)) { |
996 | | /* lead byte in trail byte position */ |
997 | 0 | utf8->toULength=toULength; |
998 | 0 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
999 | 0 | break; |
1000 | 0 | } |
1001 | 0 | c=(c<<6)+b; |
1002 | 0 | } |
1003 | 0 | } else { |
1004 | | /* partial-sequence target overflow: fall back to the pivoting implementation */ |
1005 | 0 | *pErrorCode=U_USING_DEFAULT_WARNING; |
1006 | 0 | } |
1007 | 0 | } |
1008 | 0 | } |
1009 | | |
1010 | | /* write back the updated pointers */ |
1011 | 0 | pToUArgs->source=(char *)source; |
1012 | 0 | pFromUArgs->target=(char *)target; |
1013 | 0 | } |
1014 | | |
1015 | | U_CDECL_END |
1016 | | |
1017 | | /* UTF-8 converter data ----------------------------------------------------- */ |
1018 | | /* Function table for the UTF-8 converter: a positional UConverterImpl initializer; slots set to NULL supply no UTF-8-specific function. Passed to the _UTF8Data initializer below. */
1019 | | static const UConverterImpl _UTF8Impl={ |
1020 | | UCNV_UTF8, |
1021 | | /* NOTE(review): the next two NULL slots are presumably load/unLoad -- confirm against UConverterImpl in ucnv_cnv.h */
1022 | | NULL, |
1023 | | NULL, |
1024 | | /* NOTE(review): the next three NULL slots are presumably open/close/reset -- confirm against ucnv_cnv.h */
1025 | | NULL, |
1026 | | NULL, |
1027 | | NULL, |
1028 | | /* conversion entry points; judging by their names: to-Unicode and from-Unicode, each followed by its offsets variant, then get-next-UChar */
1029 | | ucnv_toUnicode_UTF8, |
1030 | | ucnv_toUnicode_UTF8_OFFSETS_LOGIC, |
1031 | | ucnv_fromUnicode_UTF8, |
1032 | | ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, |
1033 | | ucnv_getNextUChar_UTF8, |
1034 | | /* NOTE(review): four NULL slots (presumably getStarters/getName/writeSub/safeClone -- confirm), followed by the Unicode-set getter */
1035 | | NULL, |
1036 | | NULL, |
1037 | | NULL, |
1038 | | NULL, |
1039 | | ucnv_getNonSurrogateUnicodeSet, |
1040 | | /* the same function, ucnv_UTF8FromUTF8, fills both of the last two slots (presumably toUTF8 and fromUTF8 -- confirm) */
1041 | | ucnv_UTF8FromUTF8, |
1042 | | ucnv_UTF8FromUTF8 |
1043 | | }; |
1044 | | |
1045 | | /* Static description of the UTF-8 converter. CCSID 1208 denotes UTF-8 for any version of Unicode. NOTE(review): { 0xef, 0xbf, 0xbd } is the UTF-8 encoding of U+FFFD and the 3 after it matches its length; presumably the substitution sequence -- confirm against UConverterStaticData in ucnv_bld.h. */ |
1046 | | static const UConverterStaticData _UTF8StaticData={ |
1047 | | sizeof(UConverterStaticData), |
1048 | | "UTF-8", |
1049 | | 1208, UCNV_IBM, UCNV_UTF8, |
1050 | | 1, 3, /* presumably min/max bytes per UChar (confirm): at most 3 bytes per UChar in UTF-8 (4 bytes for a surrogate _pair_) */ |
1051 | | { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, |
1052 | | 0, |
1053 | | 0, |
1054 | | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
1055 | | }; |
1056 | | |
1057 | | /* Shared-data object for the UTF-8 converter (not file-static), built by UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER from _UTF8StaticData and _UTF8Impl defined above. */
1058 | | const UConverterSharedData _UTF8Data= |
1059 | | UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl); |
1060 | | |
1061 | | /* CESU-8 converter data ---------------------------------------------------- */ |
1062 | | /* Function table for the CESU-8 converter. It reuses the four UTF-8 to/from-Unicode functions; unlike _UTF8Impl, the slot after them is NULL, the set getter is ucnv_getCompleteUnicodeSet, and the last two slots are NULL. */
1063 | | static const UConverterImpl _CESU8Impl={ |
1064 | | UCNV_CESU8, |
1065 | | /* NOTE(review): the next two NULL slots are presumably load/unLoad -- confirm against UConverterImpl in ucnv_cnv.h */
1066 | | NULL, |
1067 | | NULL, |
1068 | | /* NOTE(review): the next three NULL slots are presumably open/close/reset -- confirm against ucnv_cnv.h */
1069 | | NULL, |
1070 | | NULL, |
1071 | | NULL, |
1072 | | /* same to/from-Unicode functions as the UTF-8 table; the fifth slot of this group (ucnv_getNextUChar_UTF8 in _UTF8Impl) is NULL here */
1073 | | ucnv_toUnicode_UTF8, |
1074 | | ucnv_toUnicode_UTF8_OFFSETS_LOGIC, |
1075 | | ucnv_fromUnicode_UTF8, |
1076 | | ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, |
1077 | | NULL, |
1078 | | /* NOTE(review): four NULL slots (presumably getStarters/getName/writeSub/safeClone -- confirm), followed by the Unicode-set getter */
1079 | | NULL, |
1080 | | NULL, |
1081 | | NULL, |
1082 | | NULL, |
1083 | | ucnv_getCompleteUnicodeSet, |
1084 | | /* last two slots are NULL here, where _UTF8Impl installs ucnv_UTF8FromUTF8 (presumably toUTF8/fromUTF8 -- confirm) */
1085 | | NULL, |
1086 | | NULL |
1087 | | }; |
1088 | | /* Static description of the CESU-8 converter. Apart from the name, the CCSID and the UCNV_UNKNOWN / UCNV_CESU8 tags, the initializer values match those of _UTF8StaticData. */
1089 | | static const UConverterStaticData _CESU8StaticData={ |
1090 | | sizeof(UConverterStaticData), |
1091 | | "CESU-8", |
1092 | | 9400, /* CCSID for CESU-8 */ |
1093 | | UCNV_UNKNOWN, UCNV_CESU8, 1, 3, |
1094 | | { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, |
1095 | | 0, |
1096 | | 0, |
1097 | | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
1098 | | }; |
1099 | | |
1100 | | /* Shared-data object for the CESU-8 converter (not file-static), built by UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER from _CESU8StaticData and _CESU8Impl defined above. */
1101 | | const UConverterSharedData _CESU8Data= |
1102 | | UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl); |
1103 | | |
1104 | | #endif |