/src/icu/source/common/patternprops.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | *******************************************************************************  | 
5  |  | *   Copyright (C) 2011, International Business Machines  | 
6  |  | *   Corporation and others.  All Rights Reserved.  | 
7  |  | *******************************************************************************  | 
8  |  | *   file name:  patternprops.cpp  | 
9  |  | *   encoding:   UTF-8  | 
10  |  | *   tab size:   8 (not used)  | 
11  |  | *   indentation:4  | 
12  |  | *  | 
13  |  | *   created on: 2011mar13  | 
14  |  | *   created by: Markus W. Scherer  | 
15  |  | */  | 
16  |  |  | 
17  |  | #include "unicode/utypes.h"  | 
18  |  | #include "patternprops.h"  | 
19  |  |  | 
20  |  | U_NAMESPACE_BEGIN  | 
21  |  |  | 
/*
 * Property flags for U+0000..U+00FF, one byte per code point.
 *   bit 0 (value 1): Pattern_Syntax or Pattern_White_Space is true
 *   bit 1 (value 2): Pattern_Syntax is true
 *   bit 2 (value 4): Pattern_White_Space is true
 * A Pattern_Syntax code point is therefore stored as 3 and a
 * Pattern_White_Space code point as 5; everything else is 0.
 * Each row below covers 16 code points; the leading comment is the row's
 * first code point.
 */
static const uint8_t latin1[256]={
    /* 00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 0, 0,  // white space 09..0D
    /* 10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 20 */ 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // white space 20, syntax 21..2F
    /* 30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3,  // syntax 3A..3F
    /* 40 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // syntax 40
    /* 50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,  // syntax 5B..5E
    /* 60 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // syntax 60
    /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,  // syntax 7B..7E
    /* 80 */ 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // white space 85
    /* 90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0 */ 0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0,  // syntax A1..A7, A9, AB, AC, AE
    /* B0 */ 3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3,  // syntax B0, B1, B6, BB, BF
    /* C0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* D0 */ 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,  // syntax D7
    /* E0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* F0 */ 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0   // syntax F7
};
58  |  |  | 
/*
 * Block index for U+2000..U+303F: one byte per run of 32 code points.
 * Each byte selects a 32-bit word in the small data tables
 * (syntax2000 / syntaxOrWhiteSpace2000).
 * Index 0 selects the all-zeros word and index 1 the all-ones word.
 * Each full row below spans 256 code points (8 runs of 32).
 */
static const uint8_t index2000[130]={
    // U+2000..U+20FF
    2, 3, 4, 0, 0, 0, 0, 0,
    // U+2100..U+21FF
    0, 0, 0, 0, 5, 1, 1, 1,
    // U+2200..U+22FF
    1, 1, 1, 1, 1, 1, 1, 1,
    // U+2300..U+23FF
    1, 1, 1, 1, 1, 1, 1, 1,
    // U+2400..U+24FF
    1, 1, 1, 0, 0, 0, 0, 0,
    // U+2500..U+25FF
    1, 1, 1, 1, 1, 1, 1, 1,
    // U+2600..U+26FF
    1, 1, 1, 1, 1, 1, 1, 1,
    // U+2700..U+27FF
    1, 1, 1, 6, 7, 1, 1, 1,
    // U+2800..U+28FF
    1, 1, 1, 1, 1, 1, 1, 1,
    // U+2900..U+29FF
    1, 1, 1, 1, 1, 1, 1, 1,
    // U+2A00..U+2AFF
    1, 1, 1, 1, 1, 1, 1, 1,
    // U+2B00..U+2BFF
    1, 1, 1, 1, 1, 1, 1, 1,
    // U+2C00..U+2CFF
    0, 0, 0, 0, 0, 0, 0, 0,
    // U+2D00..U+2DFF
    0, 0, 0, 0, 0, 0, 0, 0,
    // U+2E00..U+2EFF
    1, 1, 1, 1, 0, 0, 0, 0,
    // U+2F00..U+2FFF
    0, 0, 0, 0, 0, 0, 0, 0,
    // U+3000..U+303F (only two runs)
    8, 9
};
83  |  |  | 
/*
 * Pattern_Syntax bit sets, one 32-bit word per run of 32 code points.
 * Bit (c & 0x1f) of the selected word is set when c has Pattern_Syntax.
 * Word 0 is the shared "none set" pattern and word 1 the shared "all set"
 * pattern; the remaining words hold the mixed runs listed in the comments.
 */
static const uint32_t syntax2000[]={
    /* 0 */ 0x00000000u,  // no code point in the run
    /* 1 */ 0xFFFFFFFFu,  // every code point in the run
    /* 2 */ 0xFFFF0000u,  // 2010..201F
    /* 3 */ 0x7FFF00FFu,  // 2020..2027, 2030..203E
    /* 4 */ 0x7FEFFFFEu,  // 2041..2053, 2055..205E
    /* 5 */ 0xFFFF0000u,  // 2190..219F
    /* 6 */ 0x003FFFFFu,  // 2760..2775
    /* 7 */ 0xFFF00000u,  // 2794..279F
    /* 8 */ 0xFFFFFF0Eu,  // 3001..3003, 3008..301F
    /* 9 */ 0x00010001u   // 3020, 3030
};
100  |  |  | 
/*
 * Bit sets for "Pattern_Syntax or Pattern_White_Space", laid out exactly like
 * syntax2000. The only differences are the extra bits for the
 * Pattern_White_Space code points 200E, 200F, 2028 and 2029 (words 2 and 3).
 */
static const uint32_t syntaxOrWhiteSpace2000[]={
    /* 0 */ 0x00000000u,  // no code point in the run
    /* 1 */ 0xFFFFFFFFu,  // every code point in the run
    /* 2 */ 0xFFFFC000u,  // 200E..201F
    /* 3 */ 0x7FFF03FFu,  // 2020..2029, 2030..203E
    /* 4 */ 0x7FEFFFFEu,  // 2041..2053, 2055..205E
    /* 5 */ 0xFFFF0000u,  // 2190..219F
    /* 6 */ 0x003FFFFFu,  // 2760..2775
    /* 7 */ 0xFFF00000u,  // 2794..279F
    /* 8 */ 0xFFFFFF0Eu,  // 3001..3003, 3008..301F
    /* 9 */ 0x00010001u   // 3020, 3030
};
117  |  |  | 
118  |  | UBool  | 
119  | 0  | PatternProps::isSyntax(UChar32 c) { | 
120  | 0  |     if(c<0) { | 
121  | 0  |         return FALSE;  | 
122  | 0  |     } else if(c<=0xff) { | 
123  | 0  |         return (UBool)(latin1[c]>>1)&1;  | 
124  | 0  |     } else if(c<0x2010) { | 
125  | 0  |         return FALSE;  | 
126  | 0  |     } else if(c<=0x3030) { | 
127  | 0  |         uint32_t bits=syntax2000[index2000[(c-0x2000)>>5]];  | 
128  | 0  |         return (UBool)((bits>>(c&0x1f))&1);  | 
129  | 0  |     } else if(0xfd3e<=c && c<=0xfe46) { | 
130  | 0  |         return c<=0xfd3f || 0xfe45<=c;  | 
131  | 0  |     } else { | 
132  | 0  |         return FALSE;  | 
133  | 0  |     }  | 
134  | 0  | }  | 
135  |  |  | 
136  |  | UBool  | 
137  | 0  | PatternProps::isSyntaxOrWhiteSpace(UChar32 c) { | 
138  | 0  |     if(c<0) { | 
139  | 0  |         return FALSE;  | 
140  | 0  |     } else if(c<=0xff) { | 
141  | 0  |         return (UBool)(latin1[c]&1);  | 
142  | 0  |     } else if(c<0x200e) { | 
143  | 0  |         return FALSE;  | 
144  | 0  |     } else if(c<=0x3030) { | 
145  | 0  |         uint32_t bits=syntaxOrWhiteSpace2000[index2000[(c-0x2000)>>5]];  | 
146  | 0  |         return (UBool)((bits>>(c&0x1f))&1);  | 
147  | 0  |     } else if(0xfd3e<=c && c<=0xfe46) { | 
148  | 0  |         return c<=0xfd3f || 0xfe45<=c;  | 
149  | 0  |     } else { | 
150  | 0  |         return FALSE;  | 
151  | 0  |     }  | 
152  | 0  | }  | 
153  |  |  | 
154  |  | UBool  | 
155  | 0  | PatternProps::isWhiteSpace(UChar32 c) { | 
156  | 0  |     if(c<0) { | 
157  | 0  |         return FALSE;  | 
158  | 0  |     } else if(c<=0xff) { | 
159  | 0  |         return (UBool)(latin1[c]>>2)&1;  | 
160  | 0  |     } else if(0x200e<=c && c<=0x2029) { | 
161  | 0  |         return c<=0x200f || 0x2028<=c;  | 
162  | 0  |     } else { | 
163  | 0  |         return FALSE;  | 
164  | 0  |     }  | 
165  | 0  | }  | 
166  |  |  | 
167  |  | const UChar *  | 
168  | 0  | PatternProps::skipWhiteSpace(const UChar *s, int32_t length) { | 
169  | 0  |     while(length>0 && isWhiteSpace(*s)) { | 
170  | 0  |         ++s;  | 
171  | 0  |         --length;  | 
172  | 0  |     }  | 
173  | 0  |     return s;  | 
174  | 0  | }  | 
175  |  |  | 
176  |  | int32_t  | 
177  | 0  | PatternProps::skipWhiteSpace(const UnicodeString& s, int32_t start) { | 
178  | 0  |     int32_t i = start;  | 
179  | 0  |     int32_t length = s.length();  | 
180  | 0  |     while(i<length && isWhiteSpace(s.charAt(i))) { | 
181  | 0  |         ++i;  | 
182  | 0  |     }  | 
183  | 0  |     return i;  | 
184  | 0  | }  | 
185  |  |  | 
186  |  | const UChar *  | 
187  | 0  | PatternProps::trimWhiteSpace(const UChar *s, int32_t &length) { | 
188  | 0  |     if(length<=0 || (!isWhiteSpace(s[0]) && !isWhiteSpace(s[length-1]))) { | 
189  | 0  |         return s;  | 
190  | 0  |     }  | 
191  | 0  |     int32_t start=0;  | 
192  | 0  |     int32_t limit=length;  | 
193  | 0  |     while(start<limit && isWhiteSpace(s[start])) { | 
194  | 0  |         ++start;  | 
195  | 0  |     }  | 
196  | 0  |     if(start<limit) { | 
197  |  |         // There is non-white space at start; we will not move limit below that,  | 
198  |  |         // so we need not test start<limit in the loop.  | 
199  | 0  |         while(isWhiteSpace(s[limit-1])) { | 
200  | 0  |             --limit;  | 
201  | 0  |         }  | 
202  | 0  |     }  | 
203  | 0  |     length=limit-start;  | 
204  | 0  |     return s+start;  | 
205  | 0  | }  | 
206  |  |  | 
207  |  | UBool  | 
208  | 0  | PatternProps::isIdentifier(const UChar *s, int32_t length) { | 
209  | 0  |     if(length<=0) { | 
210  | 0  |         return FALSE;  | 
211  | 0  |     }  | 
212  | 0  |     const UChar *limit=s+length;  | 
213  | 0  |     do { | 
214  | 0  |         if(isSyntaxOrWhiteSpace(*s++)) { | 
215  | 0  |             return FALSE;  | 
216  | 0  |         }  | 
217  | 0  |     } while(s<limit);  | 
218  | 0  |     return TRUE;  | 
219  | 0  | }  | 
220  |  |  | 
221  |  | const UChar *  | 
222  | 0  | PatternProps::skipIdentifier(const UChar *s, int32_t length) { | 
223  | 0  |     while(length>0 && !isSyntaxOrWhiteSpace(*s)) { | 
224  | 0  |         ++s;  | 
225  | 0  |         --length;  | 
226  | 0  |     }  | 
227  | 0  |     return s;  | 
228  | 0  | }  | 
229  |  |  | 
230  |  | U_NAMESPACE_END  |