/src/icu/source/common/bmpset.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | ******************************************************************************  | 
5  |  | *  | 
6  |  | *   Copyright (C) 2007-2012, International Business Machines  | 
7  |  | *   Corporation and others.  All Rights Reserved.  | 
8  |  | *  | 
9  |  | ******************************************************************************  | 
10  |  | *   file name:  bmpset.cpp  | 
11  |  | *   encoding:   UTF-8  | 
12  |  | *   tab size:   8 (not used)  | 
13  |  | *   indentation:4  | 
14  |  | *  | 
15  |  | *   created on: 2007jan29  | 
16  |  | *   created by: Markus W. Scherer  | 
17  |  | */  | 
18  |  |  | 
19  |  | #include "unicode/utypes.h"  | 
20  |  | #include "unicode/uniset.h"  | 
21  |  | #include "unicode/utf8.h"  | 
22  |  | #include "unicode/utf16.h"  | 
23  |  | #include "cmemory.h"  | 
24  |  | #include "bmpset.h"  | 
25  |  | #include "uassert.h"  | 
26  |  |  | 
27  |  | U_NAMESPACE_BEGIN  | 
28  |  |  | 
29  |  | BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :  | 
30  | 0  |         list(parentList), listLength(parentListLength) { | 
31  | 0  |     uprv_memset(latin1Contains, 0, sizeof(latin1Contains));  | 
32  | 0  |     uprv_memset(table7FF, 0, sizeof(table7FF));  | 
33  | 0  |     uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));  | 
34  |  |  | 
35  |  |     /*  | 
36  |  |      * Set the list indexes for binary searches for  | 
37  |  |      * U+0800, U+1000, U+2000, .., U+F000, U+10000.  | 
38  |  |      * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are  | 
39  |  |      * looked up in the bit tables.  | 
40  |  |      * The last pair of indexes is for finding supplementary code points.  | 
41  |  |      */  | 
42  | 0  |     list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);  | 
43  | 0  |     int32_t i;  | 
44  | 0  |     for(i=1; i<=0x10; ++i) { | 
45  | 0  |         list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);  | 
46  | 0  |     }  | 
47  | 0  |     list4kStarts[0x11]=listLength-1;  | 
48  | 0  |     containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);  | 
49  |  | 
  | 
50  | 0  |     initBits();  | 
51  | 0  |     overrideIllegal();  | 
52  | 0  | }  | 
53  |  |  | 
54  |  | BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :  | 
55  | 0  |         containsFFFD(otherBMPSet.containsFFFD),  | 
56  | 0  |         list(newParentList), listLength(newParentListLength) { | 
57  | 0  |     uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));  | 
58  | 0  |     uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));  | 
59  | 0  |     uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));  | 
60  | 0  |     uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));  | 
61  | 0  | }  | 
62  |  |  | 
63  | 0  | BMPSet::~BMPSet() { | 
64  | 0  | }  | 
65  |  |  | 
66  |  | /*  | 
67  |  |  * Set bits in a bit rectangle in "vertical" bit organization.  | 
68  |  |  * start<limit<=0x800  | 
69  |  |  */  | 
70  | 0  | static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) { | 
71  | 0  |     U_ASSERT(start<limit);  | 
72  | 0  |     U_ASSERT(limit<=0x800);  | 
73  |  | 
  | 
74  | 0  |     int32_t lead=start>>6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.  | 
75  | 0  |     int32_t trail=start&0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.  | 
76  |  |  | 
77  |  |     // Set one bit indicating an all-one block.  | 
78  | 0  |     uint32_t bits=(uint32_t)1<<lead;  | 
79  | 0  |     if((start+1)==limit) {  // Single-character shortcut. | 
80  | 0  |         table[trail]|=bits;  | 
81  | 0  |         return;  | 
82  | 0  |     }  | 
83  |  |  | 
84  | 0  |     int32_t limitLead=limit>>6;  | 
85  | 0  |     int32_t limitTrail=limit&0x3f;  | 
86  |  | 
  | 
87  | 0  |     if(lead==limitLead) { | 
88  |  |         // Partial vertical bit column.  | 
89  | 0  |         while(trail<limitTrail) { | 
90  | 0  |             table[trail++]|=bits;  | 
91  | 0  |         }  | 
92  | 0  |     } else { | 
93  |  |         // Partial vertical bit column,  | 
94  |  |         // followed by a bit rectangle,  | 
95  |  |         // followed by another partial vertical bit column.  | 
96  | 0  |         if(trail>0) { | 
97  | 0  |             do { | 
98  | 0  |                 table[trail++]|=bits;  | 
99  | 0  |             } while(trail<64);  | 
100  | 0  |             ++lead;  | 
101  | 0  |         }  | 
102  | 0  |         if(lead<limitLead) { | 
103  | 0  |             bits=~(((unsigned)1<<lead)-1);  | 
104  | 0  |             if(limitLead<0x20) { | 
105  | 0  |                 bits&=((unsigned)1<<limitLead)-1;  | 
106  | 0  |             }  | 
107  | 0  |             for(trail=0; trail<64; ++trail) { | 
108  | 0  |                 table[trail]|=bits;  | 
109  | 0  |             }  | 
110  | 0  |         }  | 
111  |  |         // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.  | 
112  |  |         // In that case, bits=1<<limitLead is undefined but the bits value  | 
113  |  |         // is not used because trail<limitTrail is already false.  | 
114  | 0  |         bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);  | 
115  | 0  |         for(trail=0; trail<limitTrail; ++trail) { | 
116  | 0  |             table[trail]|=bits;  | 
117  | 0  |         }  | 
118  | 0  |     }  | 
119  | 0  | }  | 
120  |  |  | 
121  | 0  | void BMPSet::initBits() { | 
122  | 0  |     UChar32 start, limit;  | 
123  | 0  |     int32_t listIndex=0;  | 
124  |  |  | 
125  |  |     // Set latin1Contains[].  | 
126  | 0  |     do { | 
127  | 0  |         start=list[listIndex++];  | 
128  | 0  |         if(listIndex<listLength) { | 
129  | 0  |             limit=list[listIndex++];  | 
130  | 0  |         } else { | 
131  | 0  |             limit=0x110000;  | 
132  | 0  |         }  | 
133  | 0  |         if(start>=0x100) { | 
134  | 0  |             break;  | 
135  | 0  |         }  | 
136  | 0  |         do { | 
137  | 0  |             latin1Contains[start++]=1;  | 
138  | 0  |         } while(start<limit && start<0x100);  | 
139  | 0  |     } while(limit<=0x100);  | 
140  |  |  | 
141  |  |     // Find the first range overlapping with (or after) 80..FF again,  | 
142  |  |     // to include them in table7FF as well.  | 
143  | 0  |     for(listIndex=0;;) { | 
144  | 0  |         start=list[listIndex++];  | 
145  | 0  |         if(listIndex<listLength) { | 
146  | 0  |             limit=list[listIndex++];  | 
147  | 0  |         } else { | 
148  | 0  |             limit=0x110000;  | 
149  | 0  |         }  | 
150  | 0  |         if(limit>0x80) { | 
151  | 0  |             if(start<0x80) { | 
152  | 0  |                 start=0x80;  | 
153  | 0  |             }  | 
154  | 0  |             break;  | 
155  | 0  |         }  | 
156  | 0  |     }  | 
157  |  |  | 
158  |  |     // Set table7FF[].  | 
159  | 0  |     while(start<0x800) { | 
160  | 0  |         set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);  | 
161  | 0  |         if(limit>0x800) { | 
162  | 0  |             start=0x800;  | 
163  | 0  |             break;  | 
164  | 0  |         }  | 
165  |  |  | 
166  | 0  |         start=list[listIndex++];  | 
167  | 0  |         if(listIndex<listLength) { | 
168  | 0  |             limit=list[listIndex++];  | 
169  | 0  |         } else { | 
170  | 0  |             limit=0x110000;  | 
171  | 0  |         }  | 
172  | 0  |     }  | 
173  |  |  | 
174  |  |     // Set bmpBlockBits[].  | 
175  | 0  |     int32_t minStart=0x800;  | 
176  | 0  |     while(start<0x10000) { | 
177  | 0  |         if(limit>0x10000) { | 
178  | 0  |             limit=0x10000;  | 
179  | 0  |         }  | 
180  |  | 
  | 
181  | 0  |         if(start<minStart) { | 
182  | 0  |             start=minStart;  | 
183  | 0  |         }  | 
184  | 0  |         if(start<limit) {  // Else: Another range entirely in a known mixed-value block. | 
185  | 0  |             if(start&0x3f) { | 
186  |  |                 // Mixed-value block of 64 code points.  | 
187  | 0  |                 start>>=6;  | 
188  | 0  |                 bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);  | 
189  | 0  |                 start=(start+1)<<6;  // Round up to the next block boundary.  | 
190  | 0  |                 minStart=start;      // Ignore further ranges in this block.  | 
191  | 0  |             }  | 
192  | 0  |             if(start<limit) { | 
193  | 0  |                 if(start<(limit&~0x3f)) { | 
194  |  |                     // Multiple all-ones blocks of 64 code points each.  | 
195  | 0  |                     set32x64Bits(bmpBlockBits, start>>6, limit>>6);  | 
196  | 0  |                 }  | 
197  |  | 
  | 
198  | 0  |                 if(limit&0x3f) { | 
199  |  |                     // Mixed-value block of 64 code points.  | 
200  | 0  |                     limit>>=6;  | 
201  | 0  |                     bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);  | 
202  | 0  |                     limit=(limit+1)<<6;  // Round up to the next block boundary.  | 
203  | 0  |                     minStart=limit;      // Ignore further ranges in this block.  | 
204  | 0  |                 }  | 
205  | 0  |             }  | 
206  | 0  |         }  | 
207  |  | 
  | 
208  | 0  |         if(limit==0x10000) { | 
209  | 0  |             break;  | 
210  | 0  |         }  | 
211  |  |  | 
212  | 0  |         start=list[listIndex++];  | 
213  | 0  |         if(listIndex<listLength) { | 
214  | 0  |             limit=list[listIndex++];  | 
215  | 0  |         } else { | 
216  | 0  |             limit=0x110000;  | 
217  | 0  |         }  | 
218  | 0  |     }  | 
219  | 0  | }  | 
220  |  |  | 
221  |  | /*  | 
222  |  |  * Override some bits and bytes to the result of contains(FFFD)  | 
223  |  |  * for faster validity checking at runtime.  | 
224  |  |  * No need to set 0 values where they were reset to 0 in the constructor  | 
225  |  |  * and not modified by initBits().  | 
226  |  |  * (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)  | 
227  |  |  * Need to set 0 values for surrogates D800..DFFF.  | 
228  |  |  */  | 
229  | 0  | void BMPSet::overrideIllegal() { | 
230  | 0  |     uint32_t bits, mask;  | 
231  | 0  |     int32_t i;  | 
232  |  | 
  | 
233  | 0  |     if(containsFFFD) { | 
234  | 0  |         bits=3;                 // Lead bytes 0xC0 and 0xC1.  | 
235  | 0  |         for(i=0; i<64; ++i) { | 
236  | 0  |             table7FF[i]|=bits;  | 
237  | 0  |         }  | 
238  |  | 
  | 
239  | 0  |         bits=1;                 // Lead byte 0xE0.  | 
240  | 0  |         for(i=0; i<32; ++i) {   // First half of 4k block. | 
241  | 0  |             bmpBlockBits[i]|=bits;  | 
242  | 0  |         }  | 
243  |  | 
  | 
244  | 0  |         mask= static_cast<uint32_t>(~(0x10001<<0xd));   // Lead byte 0xED.  | 
245  | 0  |         bits=1<<0xd;  | 
246  | 0  |         for(i=32; i<64; ++i) {  // Second half of 4k block. | 
247  | 0  |             bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;  | 
248  | 0  |         }  | 
249  | 0  |     } else { | 
250  | 0  |         mask= static_cast<uint32_t>(~(0x10001<<0xd));   // Lead byte 0xED.  | 
251  | 0  |         for(i=32; i<64; ++i) {  // Second half of 4k block. | 
252  | 0  |             bmpBlockBits[i]&=mask;  | 
253  | 0  |         }  | 
254  | 0  |     }  | 
255  | 0  | }  | 
256  |  |  | 
257  | 0  | int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const { | 
258  |  |     /* Examples:  | 
259  |  |                                        findCodePoint(c)  | 
260  |  |        set              list[]         c=0 1 3 4 7 8  | 
261  |  |        ===              ==============   ===========  | 
262  |  |        []               [110000]         0 0 0 0 0 0  | 
263  |  |        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2  | 
264  |  |        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2  | 
265  |  |        [:Any:]          [0, 110000]      1 1 1 1 1 1  | 
266  |  |      */  | 
267  |  |  | 
268  |  |     // Return the smallest i such that c < list[i].  Assume  | 
269  |  |     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).  | 
270  | 0  |     if (c < list[lo])  | 
271  | 0  |         return lo;  | 
272  |  |     // High runner test.  c is often after the last range, so an  | 
273  |  |     // initial check for this condition pays off.  | 
274  | 0  |     if (lo >= hi || c >= list[hi-1])  | 
275  | 0  |         return hi;  | 
276  |  |     // invariant: c >= list[lo]  | 
277  |  |     // invariant: c < list[hi]  | 
278  | 0  |     for (;;) { | 
279  | 0  |         int32_t i = (lo + hi) >> 1;  | 
280  | 0  |         if (i == lo) { | 
281  | 0  |             break; // Found!  | 
282  | 0  |         } else if (c < list[i]) { | 
283  | 0  |             hi = i;  | 
284  | 0  |         } else { | 
285  | 0  |             lo = i;  | 
286  | 0  |         }  | 
287  | 0  |     }  | 
288  | 0  |     return hi;  | 
289  | 0  | }  | 
290  |  |  | 
291  |  | UBool  | 
292  | 0  | BMPSet::contains(UChar32 c) const { | 
293  | 0  |     if((uint32_t)c<=0xff) { | 
294  | 0  |         return (UBool)latin1Contains[c];  | 
295  | 0  |     } else if((uint32_t)c<=0x7ff) { | 
296  | 0  |         return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);  | 
297  | 0  |     } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) { | 
298  | 0  |         int lead=c>>12;  | 
299  | 0  |         uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;  | 
300  | 0  |         if(twoBits<=1) { | 
301  |  |             // All 64 code points with the same bits 15..6  | 
302  |  |             // are either in the set or not.  | 
303  | 0  |             return (UBool)twoBits;  | 
304  | 0  |         } else { | 
305  |  |             // Look up the code point in its 4k block of code points.  | 
306  | 0  |             return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);  | 
307  | 0  |         }  | 
308  | 0  |     } else if((uint32_t)c<=0x10ffff) { | 
309  |  |         // surrogate or supplementary code point  | 
310  | 0  |         return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);  | 
311  | 0  |     } else { | 
312  |  |         // Out-of-range code points get FALSE, consistent with long-standing  | 
313  |  |         // behavior of UnicodeSet::contains(c).  | 
314  | 0  |         return FALSE;  | 
315  | 0  |     }  | 
316  | 0  | }  | 
317  |  |  | 
318  |  | /*  | 
319  |  |  * Check for sufficient length for trail unit for each surrogate pair.  | 
320  |  |  * Handle single surrogates as surrogate code points as usual in ICU.  | 
321  |  |  */  | 
322  |  | const UChar *  | 
323  | 0  | BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const { | 
324  | 0  |     UChar c, c2;  | 
325  |  | 
  | 
326  | 0  |     if(spanCondition) { | 
327  |  |         // span  | 
328  | 0  |         do { | 
329  | 0  |             c=*s;  | 
330  | 0  |             if(c<=0xff) { | 
331  | 0  |                 if(!latin1Contains[c]) { | 
332  | 0  |                     break;  | 
333  | 0  |                 }  | 
334  | 0  |             } else if(c<=0x7ff) { | 
335  | 0  |                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) { | 
336  | 0  |                     break;  | 
337  | 0  |                 }  | 
338  | 0  |             } else if(c<0xd800 || c>=0xe000) { | 
339  | 0  |                 int lead=c>>12;  | 
340  | 0  |                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;  | 
341  | 0  |                 if(twoBits<=1) { | 
342  |  |                     // All 64 code points with the same bits 15..6  | 
343  |  |                     // are either in the set or not.  | 
344  | 0  |                     if(twoBits==0) { | 
345  | 0  |                         break;  | 
346  | 0  |                     }  | 
347  | 0  |                 } else { | 
348  |  |                     // Look up the code point in its 4k block of code points.  | 
349  | 0  |                     if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { | 
350  | 0  |                         break;  | 
351  | 0  |                     }  | 
352  | 0  |                 }  | 
353  | 0  |             } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) { | 
354  |  |                 // surrogate code point  | 
355  | 0  |                 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { | 
356  | 0  |                     break;  | 
357  | 0  |                 }  | 
358  | 0  |             } else { | 
359  |  |                 // surrogate pair  | 
360  | 0  |                 if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) { | 
361  | 0  |                     break;  | 
362  | 0  |                 }  | 
363  | 0  |                 ++s;  | 
364  | 0  |             }  | 
365  | 0  |         } while(++s<limit);  | 
366  | 0  |     } else { | 
367  |  |         // span not  | 
368  | 0  |         do { | 
369  | 0  |             c=*s;  | 
370  | 0  |             if(c<=0xff) { | 
371  | 0  |                 if(latin1Contains[c]) { | 
372  | 0  |                     break;  | 
373  | 0  |                 }  | 
374  | 0  |             } else if(c<=0x7ff) { | 
375  | 0  |                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) { | 
376  | 0  |                     break;  | 
377  | 0  |                 }  | 
378  | 0  |             } else if(c<0xd800 || c>=0xe000) { | 
379  | 0  |                 int lead=c>>12;  | 
380  | 0  |                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;  | 
381  | 0  |                 if(twoBits<=1) { | 
382  |  |                     // All 64 code points with the same bits 15..6  | 
383  |  |                     // are either in the set or not.  | 
384  | 0  |                     if(twoBits!=0) { | 
385  | 0  |                         break;  | 
386  | 0  |                     }  | 
387  | 0  |                 } else { | 
388  |  |                     // Look up the code point in its 4k block of code points.  | 
389  | 0  |                     if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { | 
390  | 0  |                         break;  | 
391  | 0  |                     }  | 
392  | 0  |                 }  | 
393  | 0  |             } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) { | 
394  |  |                 // surrogate code point  | 
395  | 0  |                 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { | 
396  | 0  |                     break;  | 
397  | 0  |                 }  | 
398  | 0  |             } else { | 
399  |  |                 // surrogate pair  | 
400  | 0  |                 if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) { | 
401  | 0  |                     break;  | 
402  | 0  |                 }  | 
403  | 0  |                 ++s;  | 
404  | 0  |             }  | 
405  | 0  |         } while(++s<limit);  | 
406  | 0  |     }  | 
407  | 0  |     return s;  | 
408  | 0  | }  | 
409  |  |  | 
410  |  | /* Symmetrical with span(). */  | 
411  |  | const UChar *  | 
412  | 0  | BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const { | 
413  | 0  |     UChar c, c2;  | 
414  |  | 
  | 
415  | 0  |     if(spanCondition) { | 
416  |  |         // span  | 
417  | 0  |         for(;;) { | 
418  | 0  |             c=*(--limit);  | 
419  | 0  |             if(c<=0xff) { | 
420  | 0  |                 if(!latin1Contains[c]) { | 
421  | 0  |                     break;  | 
422  | 0  |                 }  | 
423  | 0  |             } else if(c<=0x7ff) { | 
424  | 0  |                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) { | 
425  | 0  |                     break;  | 
426  | 0  |                 }  | 
427  | 0  |             } else if(c<0xd800 || c>=0xe000) { | 
428  | 0  |                 int lead=c>>12;  | 
429  | 0  |                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;  | 
430  | 0  |                 if(twoBits<=1) { | 
431  |  |                     // All 64 code points with the same bits 15..6  | 
432  |  |                     // are either in the set or not.  | 
433  | 0  |                     if(twoBits==0) { | 
434  | 0  |                         break;  | 
435  | 0  |                     }  | 
436  | 0  |                 } else { | 
437  |  |                     // Look up the code point in its 4k block of code points.  | 
438  | 0  |                     if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { | 
439  | 0  |                         break;  | 
440  | 0  |                     }  | 
441  | 0  |                 }  | 
442  | 0  |             } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) { | 
443  |  |                 // surrogate code point  | 
444  | 0  |                 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { | 
445  | 0  |                     break;  | 
446  | 0  |                 }  | 
447  | 0  |             } else { | 
448  |  |                 // surrogate pair  | 
449  | 0  |                 if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) { | 
450  | 0  |                     break;  | 
451  | 0  |                 }  | 
452  | 0  |                 --limit;  | 
453  | 0  |             }  | 
454  | 0  |             if(s==limit) { | 
455  | 0  |                 return s;  | 
456  | 0  |             }  | 
457  | 0  |         }  | 
458  | 0  |     } else { | 
459  |  |         // span not  | 
460  | 0  |         for(;;) { | 
461  | 0  |             c=*(--limit);  | 
462  | 0  |             if(c<=0xff) { | 
463  | 0  |                 if(latin1Contains[c]) { | 
464  | 0  |                     break;  | 
465  | 0  |                 }  | 
466  | 0  |             } else if(c<=0x7ff) { | 
467  | 0  |                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) { | 
468  | 0  |                     break;  | 
469  | 0  |                 }  | 
470  | 0  |             } else if(c<0xd800 || c>=0xe000) { | 
471  | 0  |                 int lead=c>>12;  | 
472  | 0  |                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;  | 
473  | 0  |                 if(twoBits<=1) { | 
474  |  |                     // All 64 code points with the same bits 15..6  | 
475  |  |                     // are either in the set or not.  | 
476  | 0  |                     if(twoBits!=0) { | 
477  | 0  |                         break;  | 
478  | 0  |                     }  | 
479  | 0  |                 } else { | 
480  |  |                     // Look up the code point in its 4k block of code points.  | 
481  | 0  |                     if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { | 
482  | 0  |                         break;  | 
483  | 0  |                     }  | 
484  | 0  |                 }  | 
485  | 0  |             } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) { | 
486  |  |                 // surrogate code point  | 
487  | 0  |                 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { | 
488  | 0  |                     break;  | 
489  | 0  |                 }  | 
490  | 0  |             } else { | 
491  |  |                 // surrogate pair  | 
492  | 0  |                 if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) { | 
493  | 0  |                     break;  | 
494  | 0  |                 }  | 
495  | 0  |                 --limit;  | 
496  | 0  |             }  | 
497  | 0  |             if(s==limit) { | 
498  | 0  |                 return s;  | 
499  | 0  |             }  | 
500  | 0  |         }  | 
501  | 0  |     }  | 
502  | 0  |     return limit+1;  | 
503  | 0  | }  | 
504  |  |  | 
505  |  | /*  | 
506  |  |  * Precheck for sufficient trail bytes at end of string only once per span.  | 
507  |  |  * Check validity.  | 
508  |  |  */  | 
509  |  | const uint8_t *  | 
510  | 0  | BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { | 
511  | 0  |     const uint8_t *limit=s+length;  | 
512  | 0  |     uint8_t b=*s;  | 
513  | 0  |     if(U8_IS_SINGLE(b)) { | 
514  |  |         // Initial all-ASCII span.  | 
515  | 0  |         if(spanCondition) { | 
516  | 0  |             do { | 
517  | 0  |                 if(!latin1Contains[b] || ++s==limit) { | 
518  | 0  |                     return s;  | 
519  | 0  |                 }  | 
520  | 0  |                 b=*s;  | 
521  | 0  |             } while(U8_IS_SINGLE(b));  | 
522  | 0  |         } else { | 
523  | 0  |             do { | 
524  | 0  |                 if(latin1Contains[b] || ++s==limit) { | 
525  | 0  |                     return s;  | 
526  | 0  |                 }  | 
527  | 0  |                 b=*s;  | 
528  | 0  |             } while(U8_IS_SINGLE(b));  | 
529  | 0  |         }  | 
530  | 0  |         length=(int32_t)(limit-s);  | 
531  | 0  |     }  | 
532  |  |  | 
533  | 0  |     if(spanCondition!=USET_SPAN_NOT_CONTAINED) { | 
534  | 0  |         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.  | 
535  | 0  |     }  | 
536  |  | 
  | 
537  | 0  |     const uint8_t *limit0=limit;  | 
538  |  |  | 
539  |  |     /*  | 
540  |  |      * Make sure that the last 1/2/3/4-byte sequence before limit is complete  | 
541  |  |      * or runs into a lead byte.  | 
542  |  |      * In the span loop compare s with limit only once  | 
543  |  |      * per multi-byte character.  | 
544  |  |      *  | 
545  |  |      * Give a trailing illegal sequence the same value as the result of contains(FFFD),  | 
546  |  |      * including it if that is part of the span, otherwise set limit0 to before  | 
547  |  |      * the truncated sequence.  | 
548  |  |      */  | 
549  | 0  |     b=*(limit-1);  | 
550  | 0  |     if((int8_t)b<0) { | 
551  |  |         // b>=0x80: lead or trail byte  | 
552  | 0  |         if(b<0xc0) { | 
553  |  |             // single trail byte, check for preceding 3- or 4-byte lead byte  | 
554  | 0  |             if(length>=2 && (b=*(limit-2))>=0xe0) { | 
555  | 0  |                 limit-=2;  | 
556  | 0  |                 if(containsFFFD!=spanCondition) { | 
557  | 0  |                     limit0=limit;  | 
558  | 0  |                 }  | 
559  | 0  |             } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) { | 
560  |  |                 // 4-byte lead byte with only two trail bytes  | 
561  | 0  |                 limit-=3;  | 
562  | 0  |                 if(containsFFFD!=spanCondition) { | 
563  | 0  |                     limit0=limit;  | 
564  | 0  |                 }  | 
565  | 0  |             }  | 
566  | 0  |         } else { | 
567  |  |             // lead byte with no trail bytes  | 
568  | 0  |             --limit;  | 
569  | 0  |             if(containsFFFD!=spanCondition) { | 
570  | 0  |                 limit0=limit;  | 
571  | 0  |             }  | 
572  | 0  |         }  | 
573  | 0  |     }  | 
574  |  | 
  | 
575  | 0  |     uint8_t t1, t2, t3;  | 
576  |  | 
  | 
577  | 0  |     while(s<limit) { | 
578  | 0  |         b=*s;  | 
579  | 0  |         if(U8_IS_SINGLE(b)) { | 
580  |  |             // ASCII  | 
581  | 0  |             if(spanCondition) { | 
582  | 0  |                 do { | 
583  | 0  |                     if(!latin1Contains[b]) { | 
584  | 0  |                         return s;  | 
585  | 0  |                     } else if(++s==limit) { | 
586  | 0  |                         return limit0;  | 
587  | 0  |                     }  | 
588  | 0  |                     b=*s;  | 
589  | 0  |                 } while(U8_IS_SINGLE(b));  | 
590  | 0  |             } else { | 
591  | 0  |                 do { | 
592  | 0  |                     if(latin1Contains[b]) { | 
593  | 0  |                         return s;  | 
594  | 0  |                     } else if(++s==limit) { | 
595  | 0  |                         return limit0;  | 
596  | 0  |                     }  | 
597  | 0  |                     b=*s;  | 
598  | 0  |                 } while(U8_IS_SINGLE(b));  | 
599  | 0  |             }  | 
600  | 0  |         }  | 
601  | 0  |         ++s;  // Advance past the lead byte.  | 
602  | 0  |         if(b>=0xe0) { | 
603  | 0  |             if(b<0xf0) { | 
604  | 0  |                 if( /* handle U+0000..U+FFFF inline */  | 
605  | 0  |                     (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&  | 
606  | 0  |                     (t2=(uint8_t)(s[1]-0x80)) <= 0x3f  | 
607  | 0  |                 ) { | 
608  | 0  |                     b&=0xf;  | 
609  | 0  |                     uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;  | 
610  | 0  |                     if(twoBits<=1) { | 
611  |  |                         // All 64 code points with this lead byte and middle trail byte  | 
612  |  |                         // are either in the set or not.  | 
613  | 0  |                         if(twoBits!=(uint32_t)spanCondition) { | 
614  | 0  |                             return s-1;  | 
615  | 0  |                         }  | 
616  | 0  |                     } else { | 
617  |  |                         // Look up the code point in its 4k block of code points.  | 
618  | 0  |                         UChar32 c=(b<<12)|(t1<<6)|t2;  | 
619  | 0  |                         if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) { | 
620  | 0  |                             return s-1;  | 
621  | 0  |                         }  | 
622  | 0  |                     }  | 
623  | 0  |                     s+=2;  | 
624  | 0  |                     continue;  | 
625  | 0  |                 }  | 
626  | 0  |             } else if( /* handle U+10000..U+10FFFF inline */  | 
627  | 0  |                 (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&  | 
628  | 0  |                 (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&  | 
629  | 0  |                 (t3=(uint8_t)(s[2]-0x80)) <= 0x3f  | 
630  | 0  |             ) { | 
631  |  |                 // Give an illegal sequence the same value as the result of contains(FFFD).  | 
632  | 0  |                 UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;  | 
633  | 0  |                 if( (   (0x10000<=c && c<=0x10ffff) ?  | 
634  | 0  |                             containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :  | 
635  | 0  |                             containsFFFD  | 
636  | 0  |                     ) != spanCondition  | 
637  | 0  |                 ) { | 
638  | 0  |                     return s-1;  | 
639  | 0  |                 }  | 
640  | 0  |                 s+=3;  | 
641  | 0  |                 continue;  | 
642  | 0  |             }  | 
643  | 0  |         } else { | 
644  | 0  |             if( /* handle U+0000..U+07FF inline */  | 
645  | 0  |                 b>=0xc0 &&  | 
646  | 0  |                 (t1=(uint8_t)(*s-0x80)) <= 0x3f  | 
647  | 0  |             ) { | 
648  | 0  |                 if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) { | 
649  | 0  |                     return s-1;  | 
650  | 0  |                 }  | 
651  | 0  |                 ++s;  | 
652  | 0  |                 continue;  | 
653  | 0  |             }  | 
654  | 0  |         }  | 
655  |  |  | 
656  |  |         // Give an illegal sequence the same value as the result of contains(FFFD).  | 
657  |  |         // Handle each byte of an illegal sequence separately to simplify the code;  | 
658  |  |         // no need to optimize error handling.  | 
659  | 0  |         if(containsFFFD!=spanCondition) { | 
660  | 0  |             return s-1;  | 
661  | 0  |         }  | 
662  | 0  |     }  | 
663  |  |  | 
664  | 0  |     return limit0;  | 
665  | 0  | }  | 
666  |  |  | 
667  |  | /*  | 
668  |  |  * While going backwards through UTF-8 optimize only for ASCII.  | 
669  |  |  * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not  | 
670  |  |  * possible to tell from the last byte in a multi-byte sequence how many  | 
671  |  |  * preceding bytes there should be. Therefore, going backwards through UTF-8  | 
672  |  |  * is much harder than going forward.  | 
673  |  |  */  | 
674  |  | int32_t  | 
675  | 0  | BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { | 
676  | 0  |     if(spanCondition!=USET_SPAN_NOT_CONTAINED) { | 
677  | 0  |         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.  | 
678  | 0  |     }  | 
679  |  | 
  | 
680  | 0  |     uint8_t b;  | 
681  |  | 
  | 
682  | 0  |     do { | 
683  | 0  |         b=s[--length];  | 
684  | 0  |         if(U8_IS_SINGLE(b)) { | 
685  |  |             // ASCII sub-span  | 
686  | 0  |             if(spanCondition) { | 
687  | 0  |                 do { | 
688  | 0  |                     if(!latin1Contains[b]) { | 
689  | 0  |                         return length+1;  | 
690  | 0  |                     } else if(length==0) { | 
691  | 0  |                         return 0;  | 
692  | 0  |                     }  | 
693  | 0  |                     b=s[--length];  | 
694  | 0  |                 } while(U8_IS_SINGLE(b));  | 
695  | 0  |             } else { | 
696  | 0  |                 do { | 
697  | 0  |                     if(latin1Contains[b]) { | 
698  | 0  |                         return length+1;  | 
699  | 0  |                     } else if(length==0) { | 
700  | 0  |                         return 0;  | 
701  | 0  |                     }  | 
702  | 0  |                     b=s[--length];  | 
703  | 0  |                 } while(U8_IS_SINGLE(b));  | 
704  | 0  |             }  | 
705  | 0  |         }  | 
706  |  |  | 
707  | 0  |         int32_t prev=length;  | 
708  | 0  |         UChar32 c;  | 
709  |  |         // trail byte: collect a multi-byte character  | 
710  |  |         // (or  lead byte in last-trail position)  | 
711  | 0  |         c=utf8_prevCharSafeBody(s, 0, &length, b, -3);  | 
712  |  |         // c is a valid code point, not ASCII, not a surrogate  | 
713  | 0  |         if(c<=0x7ff) { | 
714  | 0  |             if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) { | 
715  | 0  |                 return prev+1;  | 
716  | 0  |             }  | 
717  | 0  |         } else if(c<=0xffff) { | 
718  | 0  |             int lead=c>>12;  | 
719  | 0  |             uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;  | 
720  | 0  |             if(twoBits<=1) { | 
721  |  |                 // All 64 code points with the same bits 15..6  | 
722  |  |                 // are either in the set or not.  | 
723  | 0  |                 if(twoBits!=(uint32_t)spanCondition) { | 
724  | 0  |                     return prev+1;  | 
725  | 0  |                 }  | 
726  | 0  |             } else { | 
727  |  |                 // Look up the code point in its 4k block of code points.  | 
728  | 0  |                 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) { | 
729  | 0  |                     return prev+1;  | 
730  | 0  |                 }  | 
731  | 0  |             }  | 
732  | 0  |         } else { | 
733  | 0  |             if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) { | 
734  | 0  |                 return prev+1;  | 
735  | 0  |             }  | 
736  | 0  |         }  | 
737  | 0  |     } while(length>0);  | 
738  | 0  |     return 0;  | 
739  | 0  | }  | 
740  |  |  | 
741  |  | U_NAMESPACE_END  |