/src/icu/source/common/ucnvlat1.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ********************************************************************** |
5 | | * Copyright (C) 2000-2015, International Business Machines |
6 | | * Corporation and others. All Rights Reserved. |
7 | | ********************************************************************** |
8 | | * file name: ucnvlat1.cpp |
9 | | * encoding: UTF-8 |
10 | | * tab size: 8 (not used) |
11 | | * indentation:4 |
12 | | * |
13 | | * created on: 2000feb07 |
14 | | * created by: Markus W. Scherer |
15 | | */ |
16 | | |
17 | | #include "unicode/utypes.h" |
18 | | |
19 | | #if !UCONFIG_NO_CONVERSION |
20 | | |
21 | | #include "unicode/ucnv.h" |
22 | | #include "unicode/uset.h" |
23 | | #include "unicode/utf8.h" |
24 | | #include "ucnv_bld.h" |
25 | | #include "ucnv_cnv.h" |
26 | | |
27 | | /* control optimizations according to the platform: a non-zero value compiles in the 16-unit unrolled loop of _Latin1FromUnicodeWithOffsets() */ |
28 | | #define LATIN1_UNROLL_FROM_UNICODE 1 |
29 | | |
30 | | /* ISO 8859-1 --------------------------------------------------------------- */ |
31 | | |
32 | | /* This is a table-less and callback-less version of ucnv_MBCSSingleToBMPWithOffsets(): each ISO 8859-1 byte is copied unchanged into one UChar. */ |
33 | | U_CDECL_BEGIN |
34 | | static void U_CALLCONV |
35 | | _Latin1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
36 | 1.11k | UErrorCode *pErrorCode) { |
37 | 1.11k | const uint8_t *source; |
38 | 1.11k | UChar *target; |
39 | 1.11k | int32_t targetCapacity, length; |
40 | 1.11k | int32_t *offsets; |
41 | | |
42 | 1.11k | int32_t sourceIndex; |
43 | | |
44 | | /* set up the local pointers */ |
45 | 1.11k | source=(const uint8_t *)pArgs->source; |
46 | 1.11k | target=pArgs->target; |
47 | 1.11k | targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
48 | 1.11k | offsets=pArgs->offsets; |
49 | | |
50 | 1.11k | sourceIndex=0; |
51 | | |
52 | | /* |
53 | | * since the conversion here is 1:1 UChar:uint8_t, we need only one counter |
54 | | * for the minimum of the sourceLength and targetCapacity |
55 | | */ |
56 | 1.11k | length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); |
57 | 1.11k | if(length<=targetCapacity) { |
58 | 1.11k | targetCapacity=length; |
59 | 1.11k | } else { |
60 | | /* target will be full: convert only targetCapacity units and report the overflow */ |
61 | 0 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
62 | 0 | length=targetCapacity; |
63 | 0 | } |
64 | | |
65 | 1.11k | if(targetCapacity>=8) { |
66 | | /* This loop is unrolled for speed and improved pipelining: 8 units per iteration; the remaining targetCapacity&7 units are left for the conversion loop below. */ |
67 | 442 | int32_t count, loops; |
68 | | |
69 | 442 | loops=count=targetCapacity>>3; |
70 | 442 | length=targetCapacity&=0x7; |
71 | 292k | do { |
72 | 292k | target[0]=source[0]; |
73 | 292k | target[1]=source[1]; |
74 | 292k | target[2]=source[2]; |
75 | 292k | target[3]=source[3]; |
76 | 292k | target[4]=source[4]; |
77 | 292k | target[5]=source[5]; |
78 | 292k | target[6]=source[6]; |
79 | 292k | target[7]=source[7]; |
80 | 292k | target+=8; |
81 | 292k | source+=8; |
82 | 292k | } while(--count>0); |
83 | | |
84 | 442 | if(offsets!=NULL) { |
85 | 0 | do { |
86 | 0 | offsets[0]=sourceIndex++; |
87 | 0 | offsets[1]=sourceIndex++; |
88 | 0 | offsets[2]=sourceIndex++; |
89 | 0 | offsets[3]=sourceIndex++; |
90 | 0 | offsets[4]=sourceIndex++; |
91 | 0 | offsets[5]=sourceIndex++; |
92 | 0 | offsets[6]=sourceIndex++; |
93 | 0 | offsets[7]=sourceIndex++; |
94 | 0 | offsets+=8; |
95 | 0 | } while(--loops>0); |
96 | 0 | } |
97 | 442 | } |
98 | | |
99 | | /* conversion loop for the remaining units (fewer than 8 if the unrolled loop ran) */ |
100 | 4.98k | while(targetCapacity>0) { |
101 | 3.87k | *target++=*source++; |
102 | 3.87k | --targetCapacity; |
103 | 3.87k | } |
104 | | |
105 | | /* write back the updated pointers */ |
106 | 1.11k | pArgs->source=(const char *)source; |
107 | 1.11k | pArgs->target=target; |
108 | | |
109 | | /* set offsets for the units written by the conversion loop above; length holds that count */ |
110 | 1.11k | if(offsets!=NULL) { |
111 | 0 | while(length>0) { |
112 | 0 | *offsets++=sourceIndex++; |
113 | 0 | --length; |
114 | 0 | } |
115 | 0 | pArgs->offsets=offsets; |
116 | 0 | } |
117 | 1.11k | } |
118 | | |
119 | | /* This is a table-less and callback-less version of ucnv_MBCSSingleGetNextUChar(). It consumes one source byte and returns it as the code point; with no input left it sets U_INDEX_OUTOFBOUNDS_ERROR and returns 0xffff. */ |
120 | | static UChar32 U_CALLCONV |
121 | | _Latin1GetNextUChar(UConverterToUnicodeArgs *pArgs, |
122 | 0 | UErrorCode *pErrorCode) { |
123 | 0 | const uint8_t *source=(const uint8_t *)pArgs->source; |
124 | 0 | if(source<(const uint8_t *)pArgs->sourceLimit) { |
125 | 0 | pArgs->source=(const char *)(source+1); |
126 | 0 | return *source; |
127 | 0 | } |
128 | | |
129 | | /* no output because of empty input */ |
130 | 0 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
131 | 0 | return 0xffff; |
132 | 0 | } |
133 | | |
134 | | /* This is a table-less version of ucnv_MBCSSingleFromBMPWithOffsets(). It is installed for both Latin-1 and US-ASCII: code units up to max (0xff or 0x7f) are narrowed to one byte each, and the first unit above max stops the conversion with an error code. */ |
135 | | static void U_CALLCONV |
136 | | _Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, |
137 | 0 | UErrorCode *pErrorCode) { |
138 | 0 | UConverter *cnv; |
139 | 0 | const UChar *source, *sourceLimit; |
140 | 0 | uint8_t *target, *oldTarget; |
141 | 0 | int32_t targetCapacity, length; |
142 | 0 | int32_t *offsets; |
143 |

144 | 0 | UChar32 cp; |
145 | 0 | UChar c, max; |
146 |

147 | 0 | int32_t sourceIndex; |
148 | | |
149 | | /* set up the local pointers */ |
150 | 0 | cnv=pArgs->converter; |
151 | 0 | source=pArgs->source; |
152 | 0 | sourceLimit=pArgs->sourceLimit; |
153 | 0 | target=oldTarget=(uint8_t *)pArgs->target; |
154 | 0 | targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
155 | 0 | offsets=pArgs->offsets; |
156 |

157 | 0 | if(cnv->sharedData==&_Latin1Data) { |
158 | 0 | max=0xff; /* Latin-1 */ |
159 | 0 | } else { |
160 | 0 | max=0x7f; /* US-ASCII */ |
161 | 0 | } |
162 | | |
163 | | /* get the converter state from UConverter */ |
164 | 0 | cp=cnv->fromUChar32; |
165 | | |
166 | | /* sourceIndex=-1 if the current character began in the previous buffer */ |
167 | 0 | sourceIndex= cp==0 ? 0 : -1; |
168 | | |
169 | | /* |
170 | | * since the conversion here is 1:1 UChar:uint8_t, we need only one counter |
171 | | * for the minimum of the sourceLength and targetCapacity |
172 | | */ |
173 | 0 | length=(int32_t)(sourceLimit-source); |
174 | 0 | if(length<targetCapacity) { |
175 | 0 | targetCapacity=length; |
176 | 0 | } |
177 | | |
178 | | /* cp!=0: a code unit was saved in fromUChar32 by an earlier call (see the "no more input" branch below); continue by looking for its trail unit */ |
179 | 0 | if(cp!=0 && targetCapacity>0) { |
180 | 0 | goto getTrail; |
181 | 0 | } |
182 | | |
183 | 0 | #if LATIN1_UNROLL_FROM_UNICODE |
184 | | /* unroll the loop with the most common case */ |
185 | 0 | if(targetCapacity>=16) { |
186 | 0 | int32_t count, loops; |
187 | 0 | UChar u, oredChars; |
188 |

189 | 0 | loops=count=targetCapacity>>4; |
190 | 0 | do { |
191 | 0 | oredChars=u=*source++; |
192 | 0 | *target++=(uint8_t)u; |
193 | 0 | oredChars|=u=*source++; |
194 | 0 | *target++=(uint8_t)u; |
195 | 0 | oredChars|=u=*source++; |
196 | 0 | *target++=(uint8_t)u; |
197 | 0 | oredChars|=u=*source++; |
198 | 0 | *target++=(uint8_t)u; |
199 | 0 | oredChars|=u=*source++; |
200 | 0 | *target++=(uint8_t)u; |
201 | 0 | oredChars|=u=*source++; |
202 | 0 | *target++=(uint8_t)u; |
203 | 0 | oredChars|=u=*source++; |
204 | 0 | *target++=(uint8_t)u; |
205 | 0 | oredChars|=u=*source++; |
206 | 0 | *target++=(uint8_t)u; |
207 | 0 | oredChars|=u=*source++; |
208 | 0 | *target++=(uint8_t)u; |
209 | 0 | oredChars|=u=*source++; |
210 | 0 | *target++=(uint8_t)u; |
211 | 0 | oredChars|=u=*source++; |
212 | 0 | *target++=(uint8_t)u; |
213 | 0 | oredChars|=u=*source++; |
214 | 0 | *target++=(uint8_t)u; |
215 | 0 | oredChars|=u=*source++; |
216 | 0 | *target++=(uint8_t)u; |
217 | 0 | oredChars|=u=*source++; |
218 | 0 | *target++=(uint8_t)u; |
219 | 0 | oredChars|=u=*source++; |
220 | 0 | *target++=(uint8_t)u; |
221 | 0 | oredChars|=u=*source++; |
222 | 0 | *target++=(uint8_t)u; |
223 | | |
224 | | /* were all 16 entries really valid? (max is 0x7f or 0xff, so the OR exceeds max exactly when some unit does) */ |
225 | 0 | if(oredChars>max) { |
226 | | /* no, return to the first of these 16 and let the conversion loop below find the offending unit */ |
227 | 0 | source-=16; |
228 | 0 | target-=16; |
229 | 0 | break; |
230 | 0 | } |
231 | 0 | } while(--count>0); |
232 | 0 | count=loops-count; |
233 | 0 | targetCapacity-=16*count; |
234 |

235 | 0 | if(offsets!=NULL) { |
236 | 0 | oldTarget+=16*count; |
237 | 0 | while(count>0) { |
238 | 0 | *offsets++=sourceIndex++; |
239 | 0 | *offsets++=sourceIndex++; |
240 | 0 | *offsets++=sourceIndex++; |
241 | 0 | *offsets++=sourceIndex++; |
242 | 0 | *offsets++=sourceIndex++; |
243 | 0 | *offsets++=sourceIndex++; |
244 | 0 | *offsets++=sourceIndex++; |
245 | 0 | *offsets++=sourceIndex++; |
246 | 0 | *offsets++=sourceIndex++; |
247 | 0 | *offsets++=sourceIndex++; |
248 | 0 | *offsets++=sourceIndex++; |
249 | 0 | *offsets++=sourceIndex++; |
250 | 0 | *offsets++=sourceIndex++; |
251 | 0 | *offsets++=sourceIndex++; |
252 | 0 | *offsets++=sourceIndex++; |
253 | 0 | *offsets++=sourceIndex++; |
254 | 0 | --count; |
255 | 0 | } |
256 | 0 | } |
257 | 0 | } |
258 | 0 | #endif |
259 | | |
260 | | /* conversion loop: one unit at a time until the capacity is used up or a unit above max has been read */ |
261 | 0 | c=0; |
262 | 0 | while(targetCapacity>0 && (c=*source++)<=max) { |
263 | | /* convert the Unicode code point */ |
264 | 0 | *target++=(uint8_t)c; |
265 | 0 | --targetCapacity; |
266 | 0 | } |
267 |

268 | 0 | if(c>max) { |
269 | 0 | cp=c; |
270 | 0 | if(!U_IS_SURROGATE(cp)) { |
271 | | /* callback(unassigned) */ |
272 | 0 | } else if(U_IS_SURROGATE_LEAD(cp)) { |
273 | 0 | getTrail: |
274 | 0 | if(source<sourceLimit) { |
275 | | /* test the following code unit */ |
276 | 0 | UChar trail=*source; |
277 | 0 | if(U16_IS_TRAIL(trail)) { |
278 | 0 | ++source; |
279 | 0 | cp=U16_GET_SUPPLEMENTARY(cp, trail); |
280 | | /* this codepage does not map supplementary code points */ |
281 | | /* callback(unassigned) */ |
282 | 0 | } else { |
283 | | /* this is an unmatched lead code unit (1st surrogate) */ |
284 | | /* callback(illegal) */ |
285 | 0 | } |
286 | 0 | } else { |
287 | | /* no more input: save the lead unit in fromUChar32 and leave without setting an error */ |
288 | 0 | cnv->fromUChar32=cp; |
289 | 0 | goto noMoreInput; |
290 | 0 | } |
291 | 0 | } else { |
292 | | /* this is an unmatched trail code unit (2nd surrogate) */ |
293 | | /* callback(illegal) */ |
294 | 0 | } |
295 | | |
296 | 0 | *pErrorCode= U_IS_SURROGATE(cp) ? U_ILLEGAL_CHAR_FOUND : U_INVALID_CHAR_FOUND; |
297 | 0 | cnv->fromUChar32=cp; |
298 | 0 | } |
299 | 0 | noMoreInput: |
300 | | |
301 | | /* set offsets since the start (oldTarget was already advanced past any units handled by the unrolled loop) */ |
302 | 0 | if(offsets!=NULL) { |
303 | 0 | size_t count=target-oldTarget; |
304 | 0 | while(count>0) { |
305 | 0 | *offsets++=sourceIndex++; |
306 | 0 | --count; |
307 | 0 | } |
308 | 0 | } |
309 |

310 | 0 | if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) { |
311 | | /* target is full */ |
312 | 0 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
313 | 0 | } |
314 | | |
315 | | /* write back the updated pointers */ |
316 | 0 | pArgs->source=source; |
317 | 0 | pArgs->target=(char *)target; |
318 | 0 | pArgs->offsets=offsets; |
319 | 0 | } |
320 | | |
321 | | /* Convert UTF-8 to Latin-1. Adapted from ucnv_SBCSFromUTF8(). ASCII bytes and the 2-byte sequences C2..C3 80..BF are converted inline; for anything else U_USING_DEFAULT_WARNING is set so that the pivoting implementation takes over. */ |
322 | | static void U_CALLCONV |
323 | | ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, |
324 | | UConverterToUnicodeArgs *pToUArgs, |
325 | 0 | UErrorCode *pErrorCode) { |
326 | 0 | UConverter *utf8; |
327 | 0 | const uint8_t *source, *sourceLimit; |
328 | 0 | uint8_t *target; |
329 | 0 | int32_t targetCapacity; |
330 |

331 | 0 | UChar32 c; |
332 | 0 | uint8_t b, t1; |
333 | | |
334 | | /* set up the local pointers */ |
335 | 0 | utf8=pToUArgs->converter; |
336 | 0 | source=(uint8_t *)pToUArgs->source; |
337 | 0 | sourceLimit=(uint8_t *)pToUArgs->sourceLimit; |
338 | 0 | target=(uint8_t *)pFromUArgs->target; |
339 | 0 | targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); |
340 | | |
341 | | /* get the converter state from the UTF-8 UConverter: a non-zero value is a byte stored by the truncated-sequence code at the end of this function */ |
342 | 0 | c=(UChar32)utf8->toUnicodeStatus; |
343 | 0 | if(c!=0 && source<sourceLimit) { |
344 | 0 | if(targetCapacity==0) { |
345 | 0 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
346 | 0 | return; |
347 | 0 | } else if(c>=0xc2 && c<=0xc3 && (t1=(uint8_t)(*source-0x80)) <= 0x3f) { |
348 | 0 | ++source; |
349 | 0 | *target++=(uint8_t)(((c&3)<<6)|t1); |
350 | 0 | --targetCapacity; |
351 |

352 | 0 | utf8->toUnicodeStatus=0; |
353 | 0 | utf8->toULength=0; |
354 | 0 | } else { |
355 | | /* complicated, illegal or unmappable input: fall back to the pivoting implementation */ |
356 | 0 | *pErrorCode=U_USING_DEFAULT_WARNING; |
357 | 0 | return; |
358 | 0 | } |
359 | 0 | } |
360 | | |
361 | | /* |
362 | | * Make sure that the last byte sequence before sourceLimit is complete |
363 | | * or runs into a lead byte. |
364 | | * In the conversion loop compare source with sourceLimit only once |
365 | | * per multi-byte character. |
366 | | * For Latin-1, adjust sourceLimit only for 1 trail byte because |
367 | | * the conversion loop handles at most 2-byte sequences. |
368 | | */ |
369 | 0 | if(source<sourceLimit && U8_IS_LEAD(*(sourceLimit-1))) { |
370 | 0 | --sourceLimit; |
371 | 0 | } |
372 | | |
373 | | /* conversion loop */ |
374 | 0 | while(source<sourceLimit) { |
375 | 0 | if(targetCapacity>0) { |
376 | 0 | b=*source++; |
377 | 0 | if((int8_t)b>=0) { |
378 | | /* convert ASCII */ |
379 | 0 | *target++=(uint8_t)b; |
380 | 0 | --targetCapacity; |
381 | 0 | } else if( /* handle U+0080..U+00FF inline */ |
382 | 0 | b>=0xc2 && b<=0xc3 && |
383 | 0 | (t1=(uint8_t)(*source-0x80)) <= 0x3f |
384 | 0 | ) { |
385 | 0 | ++source; |
386 | 0 | *target++=(uint8_t)(((b&3)<<6)|t1); |
387 | 0 | --targetCapacity; |
388 | 0 | } else { |
389 | | /* complicated, illegal or unmappable input: fall back to the pivoting implementation */ |
390 | 0 | pToUArgs->source=(char *)(source-1); |
391 | 0 | pFromUArgs->target=(char *)target; |
392 | 0 | *pErrorCode=U_USING_DEFAULT_WARNING; |
393 | 0 | return; |
394 | 0 | } |
395 | 0 | } else { |
396 | | /* target is full */ |
397 | 0 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
398 | 0 | break; |
399 | 0 | } |
400 | 0 | } |
401 | | |
402 | | /* |
403 | | * The sourceLimit may have been adjusted before the conversion loop |
404 | | * to stop before a truncated sequence. |
405 | | * If so, then collect the truncated sequence now. |
406 | | * For Latin-1, there is at most one lead byte to collect because of the |
407 | | * smaller sourceLimit adjustment logic. |
408 | | */ |
409 | 0 | if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) { |
410 | 0 | utf8->toUnicodeStatus=utf8->toUBytes[0]=b=*source++; |
411 | 0 | utf8->toULength=1; |
412 | 0 | utf8->mode=U8_COUNT_TRAIL_BYTES(b)+1; |
413 | 0 | } |
414 | | |
415 | | /* write back the updated pointers */ |
416 | 0 | pToUArgs->source=(char *)source; |
417 | 0 | pFromUArgs->target=(char *)target; |
418 | 0 | } |
419 | | |
420 | | static void U_CALLCONV |
421 | | _Latin1GetUnicodeSet(const UConverter *cnv, |
422 | | const USetAdder *sa, |
423 | | UConverterUnicodeSet which, |
424 | 0 | UErrorCode *pErrorCode) { |
425 | 0 | (void)cnv; |
426 | 0 | (void)which; |
427 | 0 | (void)pErrorCode; |
428 | 0 | sa->addRange(sa->set, 0, 0xff); /* adds U+0000..U+00FF through the adder; cnv, which and pErrorCode are deliberately unused */ |
429 | 0 | } |
430 | | U_CDECL_END |
431 | | |
432 | | |
433 | | static const UConverterImpl _Latin1Impl={ |
434 | | UCNV_LATIN_1, |
435 | | |
436 | | NULL, |
437 | | NULL, |
438 | | |
439 | | NULL, |
440 | | NULL, |
441 | | NULL, |
442 | | |
443 | | _Latin1ToUnicodeWithOffsets, /* installed in two consecutive slots; the function itself checks offsets!=NULL */ |
444 | | _Latin1ToUnicodeWithOffsets, |
445 | | _Latin1FromUnicodeWithOffsets, /* likewise installed twice; also used by _ASCIIImpl */ |
446 | | _Latin1FromUnicodeWithOffsets, |
447 | | _Latin1GetNextUChar, |
448 | | |
449 | | NULL, |
450 | | NULL, |
451 | | NULL, |
452 | | NULL, |
453 | | _Latin1GetUnicodeSet, |
454 | | |
455 | | NULL, |
456 | | ucnv_Latin1FromUTF8 /* NOTE(review): slot names are declared with UConverterImpl outside this file - confirm the positional order there */ |
457 | | }; |
458 | | |
459 | | static const UConverterStaticData _Latin1StaticData={ |
460 | | sizeof(UConverterStaticData), |
461 | | "ISO-8859-1", |
462 | | 819, UCNV_IBM, UCNV_LATIN_1, 1, 1, /* NOTE(review): 819 is presumably the IBM code page number and the two 1s the min/max bytes per character - confirm against UConverterStaticData */ |
463 | | { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE, /* NOTE(review): presumably the substitution byte 0x1a and its length 1 - confirm */ |
464 | | 0, |
465 | | 0, |
466 | | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
467 | | }; |
468 | | |
469 | | const UConverterSharedData _Latin1Data= |
470 | | UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Latin1StaticData, &_Latin1Impl); /* its address is compared with cnv->sharedData in _Latin1FromUnicodeWithOffsets() to select max=0xff */ |
471 | | |
472 | | /* US-ASCII ----------------------------------------------------------------- */ |
473 | | |
474 | | U_CDECL_BEGIN |
475 | | /* This is a table-less version of ucnv_MBCSSingleToBMPWithOffsets(). Bytes 0..0x7f are widened to one UChar each; the first byte above 0x7f is stored in toUBytes[] and stops the conversion with U_ILLEGAL_CHAR_FOUND. */ |
476 | | static void U_CALLCONV |
477 | | _ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
478 | 1.28M | UErrorCode *pErrorCode) { |
479 | 1.28M | const uint8_t *source, *sourceLimit; |
480 | 1.28M | UChar *target, *oldTarget; |
481 | 1.28M | int32_t targetCapacity, length; |
482 | 1.28M | int32_t *offsets; |
483 | | |
484 | 1.28M | int32_t sourceIndex; |
485 | | |
486 | 1.28M | uint8_t c; |
487 | | |
488 | | /* set up the local pointers */ |
489 | 1.28M | source=(const uint8_t *)pArgs->source; |
490 | 1.28M | sourceLimit=(const uint8_t *)pArgs->sourceLimit; |
491 | 1.28M | target=oldTarget=pArgs->target; |
492 | 1.28M | targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
493 | 1.28M | offsets=pArgs->offsets; |
494 | | |
495 | | /* offsets are counted from the start of this source buffer */ |
496 | 1.28M | sourceIndex=0; |
497 | | |
498 | | /* |
499 | | * since the conversion here is 1:1 UChar:uint8_t, we need only one counter |
500 | | * for the minimum of the sourceLength and targetCapacity |
501 | | */ |
502 | 1.28M | length=(int32_t)(sourceLimit-source); |
503 | 1.28M | if(length<targetCapacity) { |
504 | 1.28M | targetCapacity=length; |
505 | 1.28M | } |
506 | | |
507 | 1.28M | if(targetCapacity>=8) { |
508 | | /* This loop is unrolled for speed and improved pipelining. */ |
509 | 1.27M | int32_t count, loops; |
510 | 1.27M | UChar oredChars; |
511 | | |
512 | 1.27M | loops=count=targetCapacity>>3; |
513 | 1.38M | do { |
514 | 1.38M | oredChars=target[0]=source[0]; |
515 | 1.38M | oredChars|=target[1]=source[1]; |
516 | 1.38M | oredChars|=target[2]=source[2]; |
517 | 1.38M | oredChars|=target[3]=source[3]; |
518 | 1.38M | oredChars|=target[4]=source[4]; |
519 | 1.38M | oredChars|=target[5]=source[5]; |
520 | 1.38M | oredChars|=target[6]=source[6]; |
521 | 1.38M | oredChars|=target[7]=source[7]; |
522 | | |
523 | | /* were all 8 entries really valid? */ |
524 | 1.38M | if(oredChars>0x7f) { |
525 | | /* no, stay at the first of these 8: source and target have not been advanced yet */ |
526 | 1.27M | break; |
527 | 1.27M | } |
528 | 109k | source+=8; |
529 | 109k | target+=8; |
530 | 109k | } while(--count>0); |
531 | 0 | count=loops-count; |
532 | 1.27M | targetCapacity-=count*8; |
533 | | |
534 | 1.27M | if(offsets!=NULL) { |
535 | 0 | oldTarget+=count*8; |
536 | 0 | while(count>0) { |
537 | 0 | offsets[0]=sourceIndex++; |
538 | 0 | offsets[1]=sourceIndex++; |
539 | 0 | offsets[2]=sourceIndex++; |
540 | 0 | offsets[3]=sourceIndex++; |
541 | 0 | offsets[4]=sourceIndex++; |
542 | 0 | offsets[5]=sourceIndex++; |
543 | 0 | offsets[6]=sourceIndex++; |
544 | 0 | offsets[7]=sourceIndex++; |
545 | 0 | offsets+=8; |
546 | 0 | --count; |
547 | 0 | } |
548 | 0 | } |
549 | 1.27M | } |
550 | | |
551 | | /* conversion loop: one byte at a time until the capacity is used up or a byte above 0x7f has been read */ |
552 | 0 | c=0; |
553 | 1.47M | while(targetCapacity>0 && (c=*source++)<=0x7f) { |
554 | 193k | *target++=c; |
555 | 193k | --targetCapacity; |
556 | 193k | } |
557 | | |
558 | 1.28M | if(c>0x7f) { |
559 | | /* callback(illegal); copy the current bytes to toUBytes[] */ |
560 | 1.28M | UConverter *cnv=pArgs->converter; |
561 | 1.28M | cnv->toUBytes[0]=c; |
562 | 1.28M | cnv->toULength=1; |
563 | 1.28M | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
564 | 1.28M | } else if(source<sourceLimit && target>=pArgs->targetLimit) { |
565 | | /* target is full */ |
566 | 0 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
567 | 0 | } |
568 | | |
569 | | /* set offsets since the start (oldTarget was already advanced past any units handled by the unrolled loop) */ |
570 | 1.28M | if(offsets!=NULL) { |
571 | 0 | size_t count=target-oldTarget; |
572 | 0 | while(count>0) { |
573 | 0 | *offsets++=sourceIndex++; |
574 | 0 | --count; |
575 | 0 | } |
576 | 0 | } |
577 | | |
578 | | /* write back the updated pointers */ |
579 | 1.28M | pArgs->source=(const char *)source; |
580 | 1.28M | pArgs->target=target; |
581 | 1.28M | pArgs->offsets=offsets; |
582 | 1.28M | } |
583 | | |
584 | | /* This is a table-less version of ucnv_MBCSSingleGetNextUChar(). It consumes one source byte: a byte up to 0x7f is returned as the code point, a larger byte is stored in toUBytes[] and reported as U_ILLEGAL_CHAR_FOUND, and empty input is reported as U_INDEX_OUTOFBOUNDS_ERROR; both error cases return 0xffff. */ |
585 | | static UChar32 U_CALLCONV |
586 | | _ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs, |
587 | 0 | UErrorCode *pErrorCode) { |
588 | 0 | const uint8_t *source; |
589 | 0 | uint8_t b; |
590 |

591 | 0 | source=(const uint8_t *)pArgs->source; |
592 | 0 | if(source<(const uint8_t *)pArgs->sourceLimit) { |
593 | 0 | b=*source++; |
594 | 0 | pArgs->source=(const char *)source; |
595 | 0 | if(b<=0x7f) { |
596 | 0 | return b; |
597 | 0 | } else { |
598 | 0 | UConverter *cnv=pArgs->converter; |
599 | 0 | cnv->toUBytes[0]=b; |
600 | 0 | cnv->toULength=1; |
601 | 0 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
602 | 0 | return 0xffff; |
603 | 0 | } |
604 | 0 | } |
605 | | |
606 | | /* no output because of empty input */ |
607 | 0 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
608 | 0 | return 0xffff; |
609 | 0 | } |
610 | | |
611 | | /* "Convert" UTF-8 to US-ASCII: Validate and copy. ASCII bytes are copied; the first byte above 0x7f is left unconsumed and U_USING_DEFAULT_WARNING is set so that the standard converter handles it. */ |
612 | | static void U_CALLCONV |
613 | | ucnv_ASCIIFromUTF8(UConverterFromUnicodeArgs *pFromUArgs, |
614 | | UConverterToUnicodeArgs *pToUArgs, |
615 | 0 | UErrorCode *pErrorCode) { |
616 | 0 | const uint8_t *source, *sourceLimit; |
617 | 0 | uint8_t *target; |
618 | 0 | int32_t targetCapacity, length; |
619 |

620 | 0 | uint8_t c; |
621 |

622 | 0 | if(pToUArgs->converter->toUnicodeStatus!=0) { |
623 | | /* no handling of partial UTF-8 characters here, fall back to pivoting */ |
624 | 0 | *pErrorCode=U_USING_DEFAULT_WARNING; |
625 | 0 | return; |
626 | 0 | } |
627 | | |
628 | | /* set up the local pointers */ |
629 | 0 | source=(const uint8_t *)pToUArgs->source; |
630 | 0 | sourceLimit=(const uint8_t *)pToUArgs->sourceLimit; |
631 | 0 | target=(uint8_t *)pFromUArgs->target; |
632 | 0 | targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); |
633 | | |
634 | | /* |
635 | | * since the conversion here is 1:1 uint8_t:uint8_t, we need only one counter |
636 | | * for the minimum of the sourceLength and targetCapacity |
637 | | */ |
638 | 0 | length=(int32_t)(sourceLimit-source); |
639 | 0 | if(length<targetCapacity) { |
640 | 0 | targetCapacity=length; |
641 | 0 | } |
642 | | |
643 | | /* unroll the loop with the most common case */ |
644 | 0 | if(targetCapacity>=16) { |
645 | 0 | int32_t count, loops; |
646 | 0 | uint8_t oredChars; |
647 |

648 | 0 | loops=count=targetCapacity>>4; |
649 | 0 | do { |
650 | 0 | oredChars=*target++=*source++; |
651 | 0 | oredChars|=*target++=*source++; |
652 | 0 | oredChars|=*target++=*source++; |
653 | 0 | oredChars|=*target++=*source++; |
654 | 0 | oredChars|=*target++=*source++; |
655 | 0 | oredChars|=*target++=*source++; |
656 | 0 | oredChars|=*target++=*source++; |
657 | 0 | oredChars|=*target++=*source++; |
658 | 0 | oredChars|=*target++=*source++; |
659 | 0 | oredChars|=*target++=*source++; |
660 | 0 | oredChars|=*target++=*source++; |
661 | 0 | oredChars|=*target++=*source++; |
662 | 0 | oredChars|=*target++=*source++; |
663 | 0 | oredChars|=*target++=*source++; |
664 | 0 | oredChars|=*target++=*source++; |
665 | 0 | oredChars|=*target++=*source++; |
666 | | |
667 | | /* were all 16 entries really valid? */ |
668 | 0 | if(oredChars>0x7f) { |
669 | | /* no, return to the first of these 16 and let the conversion loop below find the non-ASCII byte */ |
670 | 0 | source-=16; |
671 | 0 | target-=16; |
672 | 0 | break; |
673 | 0 | } |
674 | 0 | } while(--count>0); |
675 | 0 | count=loops-count; |
676 | 0 | targetCapacity-=16*count; |
677 | 0 | } |
678 | | |
679 | | /* conversion loop: source is advanced only after a byte has been accepted, so a non-ASCII byte stays unconsumed */ |
680 | 0 | c=0; |
681 | 0 | while(targetCapacity>0 && (c=*source)<=0x7f) { |
682 | 0 | ++source; |
683 | 0 | *target++=c; |
684 | 0 | --targetCapacity; |
685 | 0 | } |
686 |

687 | 0 | if(c>0x7f) { |
688 | | /* non-ASCII character, handle in standard converter */ |
689 | 0 | *pErrorCode=U_USING_DEFAULT_WARNING; |
690 | 0 | } else if(source<sourceLimit && target>=(const uint8_t *)pFromUArgs->targetLimit) { |
691 | | /* target is full */ |
692 | 0 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
693 | 0 | } |
694 | | |
695 | | /* write back the updated pointers */ |
696 | 0 | pToUArgs->source=(const char *)source; |
697 | 0 | pFromUArgs->target=(char *)target; |
698 | 0 | } |
699 | | |
700 | | static void U_CALLCONV |
701 | | _ASCIIGetUnicodeSet(const UConverter *cnv, |
702 | | const USetAdder *sa, |
703 | | UConverterUnicodeSet which, |
704 | 0 | UErrorCode *pErrorCode) { |
705 | 0 | (void)cnv; |
706 | 0 | (void)which; |
707 | 0 | (void)pErrorCode; |
708 | 0 | sa->addRange(sa->set, 0, 0x7f); /* adds U+0000..U+007F through the adder; cnv, which and pErrorCode are deliberately unused */ |
709 | 0 | } |
710 | | U_CDECL_END |
711 | | |
712 | | static const UConverterImpl _ASCIIImpl={ |
713 | | UCNV_US_ASCII, |
714 | | |
715 | | NULL, |
716 | | NULL, |
717 | | |
718 | | NULL, |
719 | | NULL, |
720 | | NULL, |
721 | | |
722 | | _ASCIIToUnicodeWithOffsets, /* installed in two consecutive slots; the function itself checks offsets!=NULL */ |
723 | | _ASCIIToUnicodeWithOffsets, |
724 | | _Latin1FromUnicodeWithOffsets, /* the Latin-1 function is reused: it limits output to 0x7f when cnv->sharedData is not &_Latin1Data */ |
725 | | _Latin1FromUnicodeWithOffsets, |
726 | | _ASCIIGetNextUChar, |
727 | | |
728 | | NULL, |
729 | | NULL, |
730 | | NULL, |
731 | | NULL, |
732 | | _ASCIIGetUnicodeSet, |
733 | | |
734 | | NULL, |
735 | | ucnv_ASCIIFromUTF8 /* NOTE(review): slot names are declared with UConverterImpl outside this file - confirm the positional order there */ |
736 | | }; |
737 | | |
738 | | static const UConverterStaticData _ASCIIStaticData={ |
739 | | sizeof(UConverterStaticData), |
740 | | "US-ASCII", |
741 | | 367, UCNV_IBM, UCNV_US_ASCII, 1, 1, /* NOTE(review): 367 is presumably the IBM code page number and the two 1s the min/max bytes per character - confirm against UConverterStaticData */ |
742 | | { 0x1a, 0, 0, 0 }, 1, FALSE, FALSE, /* NOTE(review): presumably the substitution byte 0x1a and its length 1 - confirm */ |
743 | | 0, |
744 | | 0, |
745 | | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
746 | | }; |
747 | | |
748 | | const UConverterSharedData _ASCIIData= |
749 | | UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ASCIIStaticData, &_ASCIIImpl); /* _Latin1FromUnicodeWithOffsets() treats any sharedData other than &_Latin1Data, such as this object, as US-ASCII (max=0x7f) */ |
750 | | |
751 | | #endif |