/src/icu/source/common/utf_impl.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | ******************************************************************************  | 
5  |  | *  | 
6  |  | *   Copyright (C) 1999-2012, International Business Machines  | 
7  |  | *   Corporation and others.  All Rights Reserved.  | 
8  |  | *  | 
9  |  | ******************************************************************************  | 
10  |  | *   file name:  utf_impl.cpp  | 
11  |  | *   encoding:   UTF-8  | 
12  |  | *   tab size:   8 (not used)  | 
13  |  | *   indentation:4  | 
14  |  | *  | 
15  |  | *   created on: 1999sep13  | 
16  |  | *   created by: Markus W. Scherer  | 
17  |  | *  | 
18  |  | *   This file provides implementation functions for macros in the utfXX.h  | 
19  |  | *   that would otherwise be too long as macros.  | 
20  |  | */  | 
21  |  |  | 
22  |  | /* set import/export definitions */  | 
23  |  | #ifndef U_UTF8_IMPL  | 
24  |  | #   define U_UTF8_IMPL  | 
25  |  | #endif  | 
26  |  |  | 
27  |  | #include "unicode/utypes.h"  | 
28  |  | #include "unicode/utf.h"  | 
29  |  | #include "unicode/utf8.h"  | 
30  |  | #include "uassert.h"  | 
31  |  |  | 
32  |  | /*  | 
33  |  |  * Table of the number of utf8 trail bytes, indexed by the lead byte.  | 
34  |  |  * Used by the deprecated macro UTF8_COUNT_TRAIL_BYTES, defined in utf_old.h  | 
35  |  |  *  | 
36  |  |  * The current macro, U8_COUNT_TRAIL_BYTES, does _not_ use this table.  | 
37  |  |  *  | 
38  |  |  * Note that this table cannot be removed, even if UTF8_COUNT_TRAIL_BYTES were  | 
39  |  |  * changed to no longer use it. References to the table from expansions of UTF8_COUNT_TRAIL_BYTES  | 
40  |  |  * may exist in old client code that must continue to run with newer icu library versions.  | 
41  |  |  *  | 
42  |  |  * This table could be replaced on many machines by  | 
43  |  |  * a few lines of assembler code using an  | 
44  |  |  * "index of first 0-bit from msb" instruction and  | 
45  |  |  * one or two more integer instructions.  | 
46  |  |  *  | 
47  |  |  * For example, on an i386, do something like  | 
48  |  |  * - MOV AL, leadByte  | 
49  |  |  * - NOT AL         (8-bit, leave b15..b8==0..0, reverse only b7..b0)  | 
50  |  |  * - MOV AH, 0  | 
51  |  |  * - BSR BX, AX     (16-bit)  | 
52  |  |  * - MOV AX, 6      (result)  | 
53  |  |  * - JZ finish      (ZF==1 if leadByte==0xff)  | 
54  |  |  * - SUB AX, BX (result)  | 
55  |  |  * -finish:  | 
56  |  |  * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)  | 
57  |  |  */  | 
58  |  | extern "C" U_EXPORT const uint8_t  | 
59  |  | utf8_countTrailBytes[256]={ | 
60  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
61  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
62  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
63  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
64  |  |  | 
65  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
66  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
67  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
68  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
69  |  |  | 
70  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
71  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
72  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
73  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
74  |  |  | 
75  |  |     // illegal C0 & C1  | 
76  |  |     // 2-byte lead bytes C2..DF  | 
77  |  |     0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  | 
78  |  |     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  | 
79  |  |  | 
80  |  |     // 3-byte lead bytes E0..EF  | 
81  |  |     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  | 
82  |  |     // 4-byte lead bytes F0..F4  | 
83  |  |     // illegal F5..FF  | 
84  |  |     3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  | 
85  |  | };  | 
86  |  |  | 
87  |  | static const UChar32  | 
88  |  | utf8_errorValue[6]={ | 
89  |  |     // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,  | 
90  |  |     // but without relying on the obsolete unicode/utf_old.h.  | 
91  |  |     0x15, 0x9f, 0xffff,  | 
92  |  |     0x10ffff  | 
93  |  | };  | 
94  |  |  | 
95  |  | static UChar32  | 
96  | 0  | errorValue(int32_t count, int8_t strict) { | 
97  | 0  |     if(strict>=0) { | 
98  | 0  |         return utf8_errorValue[count];  | 
99  | 0  |     } else if(strict==-3) { | 
100  | 0  |         return 0xfffd;  | 
101  | 0  |     } else { | 
102  | 0  |         return U_SENTINEL;  | 
103  | 0  |     }  | 
104  | 0  | }  | 
105  |  |  | 
106  |  | /*  | 
107  |  |  * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros  | 
108  |  |  * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().  | 
109  |  |  *  | 
110  |  |  * U8_NEXT() supports NUL-terminated strings indicated via length<0.  | 
111  |  |  *  | 
112  |  |  * The "strict" parameter controls the error behavior:  | 
113  |  |  * <0  "Safe" behavior of U8_NEXT():  | 
114  |  |  *     -1: All illegal byte sequences yield U_SENTINEL=-1.  | 
115  |  |  *     -2: Same as -1, except for lenient treatment of surrogate code points as legal.  | 
116  |  |  *         Some implementations use this for roundtripping of  | 
117  |  |  *         Unicode 16-bit strings that are not well-formed UTF-16, that is, they  | 
118  |  |  *         contain unpaired surrogates.  | 
119  |  |  *     -3: All illegal byte sequences yield U+FFFD.  | 
120  |  |  *  0  Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):  | 
121  |  |  *     All illegal byte sequences yield a positive code point such that this  | 
122  |  |  *     result code point would be encoded with the same number of bytes as  | 
123  |  |  *     the illegal sequence.  | 
124  |  |  * >0  Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):  | 
125  |  |  *     Same as the obsolete "safe" behavior, but non-characters are also treated  | 
126  |  |  *     like illegal sequences.  | 
127  |  |  *  | 
128  |  |  * Note that a UBool is the same as an int8_t.  | 
129  |  |  */  | 
130  |  | U_CAPI UChar32 U_EXPORT2  | 
131  | 0  | utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) { | 
132  |  |     // *pi is one after byte c.  | 
133  | 0  |     int32_t i=*pi;  | 
134  |  |     // length can be negative for NUL-terminated strings: Read and validate one byte at a time.  | 
135  | 0  |     if(i==length || c>0xf4) { | 
136  |  |         // end of string, or not a lead byte  | 
137  | 0  |     } else if(c>=0xf0) { | 
138  |  |         // Test for 4-byte sequences first because  | 
139  |  |         // U8_NEXT() handles shorter valid sequences inline.  | 
140  | 0  |         uint8_t t1=s[i], t2, t3;  | 
141  | 0  |         c&=7;  | 
142  | 0  |         if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&  | 
143  | 0  |                 ++i!=length && (t2=s[i]-0x80)<=0x3f &&  | 
144  | 0  |                 ++i!=length && (t3=s[i]-0x80)<=0x3f) { | 
145  | 0  |             ++i;  | 
146  | 0  |             c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;  | 
147  |  |             // strict: forbid non-characters like U+fffe  | 
148  | 0  |             if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) { | 
149  | 0  |                 *pi=i;  | 
150  | 0  |                 return c;  | 
151  | 0  |             }  | 
152  | 0  |         }  | 
153  | 0  |     } else if(c>=0xe0) { | 
154  | 0  |         c&=0xf;  | 
155  | 0  |         if(strict!=-2) { | 
156  | 0  |             uint8_t t1=s[i], t2;  | 
157  | 0  |             if(U8_IS_VALID_LEAD3_AND_T1(c, t1) &&  | 
158  | 0  |                     ++i!=length && (t2=s[i]-0x80)<=0x3f) { | 
159  | 0  |                 ++i;  | 
160  | 0  |                 c=(c<<12)|((t1&0x3f)<<6)|t2;  | 
161  |  |                 // strict: forbid non-characters like U+fffe  | 
162  | 0  |                 if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) { | 
163  | 0  |                     *pi=i;  | 
164  | 0  |                     return c;  | 
165  | 0  |                 }  | 
166  | 0  |             }  | 
167  | 0  |         } else { | 
168  |  |             // strict=-2 -> lenient: allow surrogates  | 
169  | 0  |             uint8_t t1=s[i]-0x80, t2;  | 
170  | 0  |             if(t1<=0x3f && (c>0 || t1>=0x20) &&  | 
171  | 0  |                     ++i!=length && (t2=s[i]-0x80)<=0x3f) { | 
172  | 0  |                 *pi=i+1;  | 
173  | 0  |                 return (c<<12)|(t1<<6)|t2;  | 
174  | 0  |             }  | 
175  | 0  |         }  | 
176  | 0  |     } else if(c>=0xc2) { | 
177  | 0  |         uint8_t t1=s[i]-0x80;  | 
178  | 0  |         if(t1<=0x3f) { | 
179  | 0  |             *pi=i+1;  | 
180  | 0  |             return ((c-0xc0)<<6)|t1;  | 
181  | 0  |         }  | 
182  | 0  |     }  // else 0x80<=c<0xc2 is not a lead byte  | 
183  |  |  | 
184  |  |     /* error handling */  | 
185  | 0  |     c=errorValue(i-*pi, strict);  | 
186  | 0  |     *pi=i;  | 
187  | 0  |     return c;  | 
188  | 0  | }  | 
189  |  |  | 
190  |  | U_CAPI int32_t U_EXPORT2  | 
191  | 0  | utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError) { | 
192  | 0  |     if((uint32_t)(c)<=0x7ff) { | 
193  | 0  |         if((i)+1<(length)) { | 
194  | 0  |             (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0);  | 
195  | 0  |             (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);  | 
196  | 0  |             return i;  | 
197  | 0  |         }  | 
198  | 0  |     } else if((uint32_t)(c)<=0xffff) { | 
199  |  |         /* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */  | 
200  | 0  |         if((i)+2<(length) && !U_IS_SURROGATE(c)) { | 
201  | 0  |             (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0);  | 
202  | 0  |             (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);  | 
203  | 0  |             (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);  | 
204  | 0  |             return i;  | 
205  | 0  |         }  | 
206  | 0  |     } else if((uint32_t)(c)<=0x10ffff) { | 
207  | 0  |         if((i)+3<(length)) { | 
208  | 0  |             (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0);  | 
209  | 0  |             (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80);  | 
210  | 0  |             (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);  | 
211  | 0  |             (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);  | 
212  | 0  |             return i;  | 
213  | 0  |         }  | 
214  | 0  |     }  | 
215  |  |     /* c>0x10ffff or not enough space, write an error value */  | 
216  | 0  |     if(pIsError!=NULL) { | 
217  | 0  |         *pIsError=TRUE;  | 
218  | 0  |     } else { | 
219  | 0  |         length-=i;  | 
220  | 0  |         if(length>0) { | 
221  | 0  |             int32_t offset;  | 
222  | 0  |             if(length>3) { | 
223  | 0  |                 length=3;  | 
224  | 0  |             }  | 
225  | 0  |             s+=i;  | 
226  | 0  |             offset=0;  | 
227  | 0  |             c=utf8_errorValue[length-1];  | 
228  | 0  |             U8_APPEND_UNSAFE(s, offset, c);  | 
229  | 0  |             i=i+offset;  | 
230  | 0  |         }  | 
231  | 0  |     }  | 
232  | 0  |     return i;  | 
233  | 0  | }  | 
234  |  |  | 
235  |  | U_CAPI UChar32 U_EXPORT2  | 
236  | 0  | utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) { | 
237  |  |     // *pi is the index of byte c.  | 
238  | 0  |     int32_t i=*pi;  | 
239  | 0  |     if(U8_IS_TRAIL(c) && i>start) { | 
240  | 0  |         uint8_t b1=s[--i];  | 
241  | 0  |         if(U8_IS_LEAD(b1)) { | 
242  | 0  |             if(b1<0xe0) { | 
243  | 0  |                 *pi=i;  | 
244  | 0  |                 return ((b1-0xc0)<<6)|(c&0x3f);  | 
245  | 0  |             } else if(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c)) { | 
246  |  |                 // Truncated 3- or 4-byte sequence.  | 
247  | 0  |                 *pi=i;  | 
248  | 0  |                 return errorValue(1, strict);  | 
249  | 0  |             }  | 
250  | 0  |         } else if(U8_IS_TRAIL(b1) && i>start) { | 
251  |  |             // Extract the value bits from the last trail byte.  | 
252  | 0  |             c&=0x3f;  | 
253  | 0  |             uint8_t b2=s[--i];  | 
254  | 0  |             if(0xe0<=b2 && b2<=0xf4) { | 
255  | 0  |                 if(b2<0xf0) { | 
256  | 0  |                     b2&=0xf;  | 
257  | 0  |                     if(strict!=-2) { | 
258  | 0  |                         if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) { | 
259  | 0  |                             *pi=i;  | 
260  | 0  |                             c=(b2<<12)|((b1&0x3f)<<6)|c;  | 
261  | 0  |                             if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) { | 
262  | 0  |                                 return c;  | 
263  | 0  |                             } else { | 
264  |  |                                 // strict: forbid non-characters like U+fffe  | 
265  | 0  |                                 return errorValue(2, strict);  | 
266  | 0  |                             }  | 
267  | 0  |                         }  | 
268  | 0  |                     } else { | 
269  |  |                         // strict=-2 -> lenient: allow surrogates  | 
270  | 0  |                         b1-=0x80;  | 
271  | 0  |                         if((b2>0 || b1>=0x20)) { | 
272  | 0  |                             *pi=i;  | 
273  | 0  |                             return (b2<<12)|(b1<<6)|c;  | 
274  | 0  |                         }  | 
275  | 0  |                     }  | 
276  | 0  |                 } else if(U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { | 
277  |  |                     // Truncated 4-byte sequence.  | 
278  | 0  |                     *pi=i;  | 
279  | 0  |                     return errorValue(2, strict);  | 
280  | 0  |                 }  | 
281  | 0  |             } else if(U8_IS_TRAIL(b2) && i>start) { | 
282  | 0  |                 uint8_t b3=s[--i];  | 
283  | 0  |                 if(0xf0<=b3 && b3<=0xf4) { | 
284  | 0  |                     b3&=7;  | 
285  | 0  |                     if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) { | 
286  | 0  |                         *pi=i;  | 
287  | 0  |                         c=(b3<<18)|((b2&0x3f)<<12)|((b1&0x3f)<<6)|c;  | 
288  | 0  |                         if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) { | 
289  | 0  |                             return c;  | 
290  | 0  |                         } else { | 
291  |  |                             // strict: forbid non-characters like U+fffe  | 
292  | 0  |                             return errorValue(3, strict);  | 
293  | 0  |                         }  | 
294  | 0  |                     }  | 
295  | 0  |                 }  | 
296  | 0  |             }  | 
297  | 0  |         }  | 
298  | 0  |     }  | 
299  | 0  |     return errorValue(0, strict);  | 
300  | 0  | }  | 
301  |  |  | 
302  |  | U_CAPI int32_t U_EXPORT2  | 
303  | 0  | utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) { | 
304  |  |     // Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points.  | 
305  | 0  |     int32_t orig_i=i;  | 
306  | 0  |     uint8_t c=s[i];  | 
307  | 0  |     if(U8_IS_TRAIL(c) && i>start) { | 
308  | 0  |         uint8_t b1=s[--i];  | 
309  | 0  |         if(U8_IS_LEAD(b1)) { | 
310  | 0  |             if(b1<0xe0 ||  | 
311  | 0  |                     (b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c))) { | 
312  | 0  |                 return i;  | 
313  | 0  |             }  | 
314  | 0  |         } else if(U8_IS_TRAIL(b1) && i>start) { | 
315  | 0  |             uint8_t b2=s[--i];  | 
316  | 0  |             if(0xe0<=b2 && b2<=0xf4) { | 
317  | 0  |                 if(b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b2, b1) : U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { | 
318  | 0  |                     return i;  | 
319  | 0  |                 }  | 
320  | 0  |             } else if(U8_IS_TRAIL(b2) && i>start) { | 
321  | 0  |                 uint8_t b3=s[--i];  | 
322  | 0  |                 if(0xf0<=b3 && b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) { | 
323  | 0  |                     return i;  | 
324  | 0  |                 }  | 
325  | 0  |             }  | 
326  | 0  |         }  | 
327  | 0  |     }  | 
328  | 0  |     return orig_i;  | 
329  | 0  | }  |