/src/icu/source/common/utf_impl.cpp
Line  | Count  | Source  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | ******************************************************************************  | 
5  |  | *  | 
6  |  | *   Copyright (C) 1999-2012, International Business Machines  | 
7  |  | *   Corporation and others.  All Rights Reserved.  | 
8  |  | *  | 
9  |  | ******************************************************************************  | 
10  |  | *   file name:  utf_impl.c  | 
11  |  | *   encoding:   UTF-8  | 
12  |  | *   tab size:   8 (not used)  | 
13  |  | *   indentation:4  | 
14  |  | *  | 
15  |  | *   created on: 1999sep13  | 
16  |  | *   created by: Markus W. Scherer  | 
17  |  | *  | 
18  |  | *   This file provides implementation functions for macros in the utfXX.h  | 
19  |  | *   that would otherwise be too long as macros.  | 
20  |  | */  | 
21  |  |  | 
22  |  | /* set import/export definitions */  | 
23  |  | #ifndef U_UTF8_IMPL  | 
24  |  | #   define U_UTF8_IMPL  | 
25  |  | #endif  | 
26  |  |  | 
27  |  | #include "unicode/utypes.h"  | 
28  |  | #include "unicode/utf.h"  | 
29  |  | #include "unicode/utf8.h"  | 
30  |  | #include "unicode/utf_old.h"  | 
31  |  | #include "uassert.h"  | 
32  |  |  | 
33  |  | /*  | 
34  |  |  * Table of the number of utf8 trail bytes, indexed by the lead byte.  | 
35  |  |  * Used by the deprecated macro UTF8_COUNT_TRAIL_BYTES, defined in utf_old.h  | 
36  |  |  *  | 
37  |  |  * The current macro, U8_COUNT_TRAIL_BYTES, does _not_ use this table.  | 
38  |  |  *  | 
39  |  |  * Note that this table cannot be removed, even if UTF8_COUNT_TRAIL_BYTES were  | 
40  |  |  * changed to no longer use it. References to the table from expansions of UTF8_COUNT_TRAIL_BYTES  | 
41  |  |  * may exist in old client code that must continue to run with newer icu library versions.  | 
42  |  |  *  | 
43  |  |  * This table could be replaced on many machines by  | 
44  |  |  * a few lines of assembler code using an  | 
45  |  |  * "index of first 0-bit from msb" instruction and  | 
46  |  |  * one or two more integer instructions.  | 
47  |  |  *  | 
48  |  |  * For example, on an i386, do something like  | 
49  |  |  * - MOV AL, leadByte  | 
50  |  |  * - NOT AL         (8-bit, leave b15..b8==0..0, reverse only b7..b0)  | 
51  |  |  * - MOV AH, 0  | 
52  |  |  * - BSR BX, AX     (16-bit)  | 
53  |  |  * - MOV AX, 6      (result)  | 
54  |  |  * - JZ finish      (ZF==1 if leadByte==0xff)  | 
55  |  |  * - SUB AX, BX (result)  | 
56  |  |  * -finish:  | 
57  |  |  * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)  | 
58  |  |  *  | 
59  |  |  * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;  | 
60  |  |  * lead bytes above 0xf4 are illegal.  | 
61  |  |  * We keep them in this table for skipping long ISO 10646-UTF-8 sequences.  | 
62  |  |  */  | 
63  |  | extern "C" U_EXPORT const uint8_t  | 
64  |  | utf8_countTrailBytes[256]={ | 
65  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
66  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
67  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
68  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
69  |  |  | 
70  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
71  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
72  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
73  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
74  |  |  | 
75  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
76  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
77  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
78  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
79  |  |  | 
80  |  |     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  | 
81  |  |     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  | 
82  |  |  | 
83  |  |     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  | 
84  |  |     3, 3, 3, 3, 3,  | 
85  |  |     3, 3, 3,    /* illegal in Unicode */  | 
86  |  |     4, 4, 4, 4, /* illegal in Unicode */  | 
87  |  |     5, 5,       /* illegal in Unicode */  | 
88  |  |     0, 0        /* illegal bytes 0xfe and 0xff */  | 
89  |  | };  | 
90  |  |  | 
91  |  | static const UChar32  | 
92  |  | utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; | 
93  |  |  | 
94  |  | static const UChar32  | 
95  |  | utf8_errorValue[6]={ | 
96  |  |     UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff,  | 
97  |  |     0x3ffffff, 0x7fffffff  | 
98  |  | };  | 
99  |  |  | 
100  |  | static UChar32  | 
101  | 0  | errorValue(int32_t count, int8_t strict) { | 
102  | 0  |     if(strict>=0) { | 
103  | 0  |         return utf8_errorValue[count];  | 
104  | 0  |     } else if(strict==-3) { | 
105  | 0  |         return 0xfffd;  | 
106  | 0  |     } else { | 
107  | 0  |         return U_SENTINEL;  | 
108  | 0  |     }  | 
109  | 0  | }  | 
110  |  |  | 
111  |  | /*  | 
112  |  |  * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros  | 
113  |  |  * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().  | 
114  |  |  *  | 
115  |  |  * U8_NEXT() supports NUL-terminated strings indicated via length<0.  | 
116  |  |  *  | 
117  |  |  * The "strict" parameter controls the error behavior:  | 
118  |  |  * <0  "Safe" behavior of U8_NEXT():  | 
119  |  |  *     -1: All illegal byte sequences yield U_SENTINEL=-1.  | 
120  |  |  *     -2: Same as -1, except for lenient treatment of surrogate code points as legal.  | 
121  |  |  *         Some implementations use this for roundtripping of  | 
122  |  |  *         Unicode 16-bit strings that are not well-formed UTF-16, that is, they  | 
123  |  |  *         contain unpaired surrogates.  | 
124  |  |  *     -3: All illegal byte sequences yield U+FFFD.  | 
125  |  |  *  0  Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):  | 
126  |  |  *     All illegal byte sequences yield a positive code point such that this  | 
127  |  |  *     result code point would be encoded with the same number of bytes as  | 
128  |  |  *     the illegal sequence.  | 
129  |  |  * >0  Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):  | 
130  |  |  *     Same as the obsolete "safe" behavior, but non-characters are also treated  | 
131  |  |  *     like illegal sequences.  | 
132  |  |  *  | 
133  |  |  * Note that a UBool is the same as an int8_t.  | 
134  |  |  */  | 
135  |  | U_CAPI UChar32 U_EXPORT2  | 
136  | 0  | utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) { | 
137  | 0  |     int32_t i=*pi;  | 
138  | 0  |     uint8_t count=U8_COUNT_TRAIL_BYTES(c);  | 
139  | 0  |     U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */  | 
140  | 0  |     if(i+count<=length || length<0) { | 
141  | 0  |         uint8_t trail;  | 
142  |  | 
  | 
143  | 0  |         U8_MASK_LEAD_BYTE(c, count);  | 
144  |  |         /* support NUL-terminated strings: do not read beyond the first non-trail byte */  | 
145  | 0  |         switch(count) { | 
146  |  |         /* each branch falls through to the next one */  | 
147  | 0  |         case 0:  | 
148  |  |             /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */  | 
149  | 0  |         case 5:  | 
150  | 0  |         case 4:  | 
151  |  |             /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */  | 
152  | 0  |             break;  | 
153  | 0  |         case 3:  | 
154  | 0  |             trail=s[i++]-0x80;  | 
155  | 0  |             c=(c<<6)|trail;  | 
156  |  |             /* c>=0x110 would result in code point>0x10ffff, outside Unicode */  | 
157  | 0  |             if(c>=0x110 || trail>0x3f) { break; } | 
158  | 0  |             U_FALLTHROUGH;  | 
159  | 0  |         case 2:  | 
160  | 0  |             trail=s[i++]-0x80;  | 
161  | 0  |             c=(c<<6)|trail;  | 
162  |  |             /*  | 
163  |  |              * test for a surrogate d800..dfff unless we are lenient:  | 
164  |  |              * before the last (c<<6), a surrogate is c=360..37f  | 
165  |  |              */  | 
166  | 0  |             if(((c&0xffe0)==0x360 && strict!=-2) || trail>0x3f) { break; } | 
167  | 0  |             U_FALLTHROUGH;  | 
168  | 0  |         case 1:  | 
169  | 0  |             trail=s[i++]-0x80;  | 
170  | 0  |             c=(c<<6)|trail;  | 
171  | 0  |             if(trail>0x3f) { break; } | 
172  |  |             /* correct sequence - all trail bytes have (b7..b6)==(10) */  | 
173  | 0  |             if(c>=utf8_minLegal[count] &&  | 
174  |  |                     /* strict: forbid non-characters like U+fffe */  | 
175  | 0  |                     (strict<=0 || !U_IS_UNICODE_NONCHAR(c))) { | 
176  | 0  |                 *pi=i;  | 
177  | 0  |                 return c;  | 
178  | 0  |             }  | 
179  |  |         /* no default branch to optimize switch()  - all values are covered */  | 
180  | 0  |         }  | 
181  | 0  |     } else { | 
182  |  |         /* too few bytes left */  | 
183  | 0  |         count=length-i;  | 
184  | 0  |     }  | 
185  |  |  | 
186  |  |     /* error handling */  | 
187  | 0  |     i=*pi;  | 
188  | 0  |     while(count>0 && U8_IS_TRAIL(s[i])) { | 
189  | 0  |         ++i;  | 
190  | 0  |         --count;  | 
191  | 0  |     }  | 
192  | 0  |     c=errorValue(i-*pi, strict);  | 
193  | 0  |     *pi=i;  | 
194  | 0  |     return c;  | 
195  | 0  | }  | 
196  |  |  | 
197  |  | U_CAPI int32_t U_EXPORT2  | 
198  | 0  | utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError) { | 
199  | 0  |     if((uint32_t)(c)<=0x7ff) { | 
200  | 0  |         if((i)+1<(length)) { | 
201  | 0  |             (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0);  | 
202  | 0  |             (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);  | 
203  | 0  |             return i;  | 
204  | 0  |         }  | 
205  | 0  |     } else if((uint32_t)(c)<=0xffff) { | 
206  |  |         /* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */  | 
207  | 0  |         if((i)+2<(length) && !U_IS_SURROGATE(c)) { | 
208  | 0  |             (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0);  | 
209  | 0  |             (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);  | 
210  | 0  |             (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);  | 
211  | 0  |             return i;  | 
212  | 0  |         }  | 
213  | 0  |     } else if((uint32_t)(c)<=0x10ffff) { | 
214  | 0  |         if((i)+3<(length)) { | 
215  | 0  |             (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0);  | 
216  | 0  |             (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80);  | 
217  | 0  |             (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);  | 
218  | 0  |             (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);  | 
219  | 0  |             return i;  | 
220  | 0  |         }  | 
221  | 0  |     }  | 
222  |  |     /* c>0x10ffff or not enough space, write an error value */  | 
223  | 0  |     if(pIsError!=NULL) { | 
224  | 0  |         *pIsError=TRUE;  | 
225  | 0  |     } else { | 
226  | 0  |         length-=i;  | 
227  | 0  |         if(length>0) { | 
228  | 0  |             int32_t offset;  | 
229  | 0  |             if(length>3) { | 
230  | 0  |                 length=3;  | 
231  | 0  |             }  | 
232  | 0  |             s+=i;  | 
233  | 0  |             offset=0;  | 
234  | 0  |             c=utf8_errorValue[length-1];  | 
235  | 0  |             UTF8_APPEND_CHAR_UNSAFE(s, offset, c);  | 
236  | 0  |             i=i+offset;  | 
237  | 0  |         }  | 
238  | 0  |     }  | 
239  | 0  |     return i;  | 
240  | 0  | }  | 
241  |  |  | 
242  |  | U_CAPI UChar32 U_EXPORT2  | 
243  | 0  | utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) { | 
244  | 0  |     int32_t i=*pi;  | 
245  | 0  |     uint8_t b, count=1, shift=6;  | 
246  |  | 
  | 
247  | 0  |     if(!U8_IS_TRAIL(c)) { return errorValue(0, strict); } | 
248  |  |  | 
249  |  |     /* extract value bits from the last trail byte */  | 
250  | 0  |     c&=0x3f;  | 
251  |  | 
  | 
252  | 0  |     for(;;) { | 
253  | 0  |         if(i<=start) { | 
254  |  |             /* no lead byte at all */  | 
255  | 0  |             return errorValue(0, strict);  | 
256  | 0  |         }  | 
257  |  |  | 
258  |  |         /* read another previous byte */  | 
259  | 0  |         b=s[--i];  | 
260  | 0  |         if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */ | 
261  | 0  |             if(b&0x40) { | 
262  |  |                 /* lead byte, this will always end the loop */  | 
263  | 0  |                 uint8_t shouldCount=U8_COUNT_TRAIL_BYTES(b);  | 
264  |  | 
  | 
265  | 0  |                 if(count==shouldCount) { | 
266  |  |                     /* set the new position */  | 
267  | 0  |                     *pi=i;  | 
268  | 0  |                     U8_MASK_LEAD_BYTE(b, count);  | 
269  | 0  |                     c|=(UChar32)b<<shift;  | 
270  | 0  |                     if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || (U_IS_SURROGATE(c) && strict!=-2) || (strict>0 && U_IS_UNICODE_NONCHAR(c))) { | 
271  |  |                         /* illegal sequence or (strict and non-character) */  | 
272  | 0  |                         if(count>=4) { | 
273  | 0  |                             count=3;  | 
274  | 0  |                         }  | 
275  | 0  |                         c=errorValue(count, strict);  | 
276  | 0  |                     } else { | 
277  |  |                         /* exit with correct c */  | 
278  | 0  |                     }  | 
279  | 0  |                 } else { | 
280  |  |                     /* the lead byte does not match the number of trail bytes */  | 
281  |  |                     /* only set the position to the lead byte if it would  | 
282  |  |                        include the trail byte that we started with */  | 
283  | 0  |                     if(count<shouldCount) { | 
284  | 0  |                         *pi=i;  | 
285  | 0  |                         c=errorValue(count, strict);  | 
286  | 0  |                     } else { | 
287  | 0  |                         c=errorValue(0, strict);  | 
288  | 0  |                     }  | 
289  | 0  |                 }  | 
290  | 0  |                 break;  | 
291  | 0  |             } else if(count<5) { | 
292  |  |                 /* trail byte */  | 
293  | 0  |                 c|=(UChar32)(b&0x3f)<<shift;  | 
294  | 0  |                 ++count;  | 
295  | 0  |                 shift+=6;  | 
296  | 0  |             } else { | 
297  |  |                 /* more than 5 trail bytes is illegal */  | 
298  | 0  |                 c=errorValue(0, strict);  | 
299  | 0  |                 break;  | 
300  | 0  |             }  | 
301  | 0  |         } else { | 
302  |  |             /* single-byte character precedes trailing bytes */  | 
303  | 0  |             c=errorValue(0, strict);  | 
304  | 0  |             break;  | 
305  | 0  |         }  | 
306  | 0  |     }  | 
307  | 0  |     return c;  | 
308  | 0  | }  | 
309  |  |  | 
310  |  | U_CAPI int32_t U_EXPORT2  | 
311  | 0  | utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) { | 
312  |  |     /* i had been decremented once before the function call */  | 
313  | 0  |     int32_t I=i, Z;  | 
314  | 0  |     uint8_t b;  | 
315  |  |  | 
316  |  |     /* read at most the 6 bytes s[Z] to s[i], inclusively */  | 
317  | 0  |     if(I-5>start) { | 
318  | 0  |         Z=I-5;  | 
319  | 0  |     } else { | 
320  | 0  |         Z=start;  | 
321  | 0  |     }  | 
322  |  |  | 
323  |  |     /* return I if the sequence starting there is long enough to include i */  | 
324  | 0  |     do { | 
325  | 0  |         b=s[I];  | 
326  | 0  |         if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */ | 
327  | 0  |             break;  | 
328  | 0  |         } else if(b>=0xc0) { | 
329  | 0  |             if(U8_COUNT_TRAIL_BYTES(b)>=(i-I)) { | 
330  | 0  |                 return I;  | 
331  | 0  |             } else { | 
332  | 0  |                 break;  | 
333  | 0  |             }  | 
334  | 0  |         }  | 
335  | 0  |     } while(Z<=--I);  | 
336  |  |  | 
337  |  |     /* return i itself to be consistent with the FWD_1 macro */  | 
338  | 0  |     return i;  | 
339  | 0  | }  |