/src/icu/source/common/uniset.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | **********************************************************************  | 
5  |  | *   Copyright (C) 1999-2015, International Business Machines  | 
6  |  | *   Corporation and others.  All Rights Reserved.  | 
7  |  | **********************************************************************  | 
8  |  | *   Date        Name        Description  | 
9  |  | *   10/20/99    alan        Creation.  | 
10  |  | **********************************************************************  | 
11  |  | */  | 
12  |  |  | 
13  |  | #include "unicode/utypes.h"  | 
14  |  | #include "unicode/parsepos.h"  | 
15  |  | #include "unicode/symtable.h"  | 
16  |  | #include "unicode/uniset.h"  | 
17  |  | #include "unicode/ustring.h"  | 
18  |  | #include "unicode/utf8.h"  | 
19  |  | #include "unicode/utf16.h"  | 
20  |  | #include "ruleiter.h"  | 
21  |  | #include "cmemory.h"  | 
22  |  | #include "cstring.h"  | 
23  |  | #include "patternprops.h"  | 
24  |  | #include "uelement.h"  | 
25  |  | #include "util.h"  | 
26  |  | #include "uvector.h"  | 
27  |  | #include "charstr.h"  | 
28  |  | #include "ustrfmt.h"  | 
29  |  | #include "uassert.h"  | 
30  |  | #include "bmpset.h"  | 
31  |  | #include "unisetspan.h"  | 
32  |  |  | 
// Exclusive upper bound of the code point range (0x110000):
// strictly greater than every valid value.
#define UNICODESET_HIGH 0x0110000

// Inclusive lower bound of the code point range (zero):
// less than or equal to every valid value.
#define UNICODESET_LOW 0x000000

/** Length of the longest possible list: [0, 1, 2, ..., max code point, HIGH]. */
constexpr int32_t MAX_LENGTH{UNICODESET_HIGH + 1};
41  |  |  | 
42  |  | U_NAMESPACE_BEGIN  | 
43  |  |  | 
44  | 0  | SymbolTable::~SymbolTable() {} | 
45  |  |  | 
46  |  | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)  | 
47  |  |  | 
48  |  | /**  | 
49  |  |  * Modify the given UChar32 variable so that it is in range, by  | 
50  |  |  * pinning values < UNICODESET_LOW to UNICODESET_LOW, and  | 
51  |  |  * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.  | 
52  |  |  * It modifies its argument in-place and also returns it.  | 
53  |  |  */  | 
54  | 0  | static inline UChar32 pinCodePoint(UChar32& c) { | 
55  | 0  |     if (c < UNICODESET_LOW) { | 
56  | 0  |         c = UNICODESET_LOW;  | 
57  | 0  |     } else if (c > (UNICODESET_HIGH-1)) { | 
58  | 0  |         c = (UNICODESET_HIGH-1);  | 
59  | 0  |     }  | 
60  | 0  |     return c;  | 
61  | 0  | }  | 
62  |  |  | 
63  |  | //----------------------------------------------------------------  | 
64  |  | // Debugging  | 
65  |  | //----------------------------------------------------------------  | 
66  |  |  | 
67  |  | // DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.  | 
68  |  | // To enable the debugging, define the symbol DEBUG_MEM in the line  | 
69  |  | // below.  This will result in text being sent to stdout that looks  | 
70  |  | // like this:  | 
71  |  | //   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-  | 
72  |  | //   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-  | 
73  |  | // Each line lists a construction (ct) or destruction (dt) event, the  | 
74  |  | // object address, the number of outstanding objects after the event,  | 
75  |  | // and the pattern of the object in question.  | 
76  |  |  | 
77  |  | // #define DEBUG_MEM  | 
78  |  |  | 
79  |  | #ifdef DEBUG_MEM  | 
80  |  | #include <stdio.h>  | 
81  |  | static int32_t _dbgCount = 0;  | 
82  |  |  | 
83  |  | static inline void _dbgct(UnicodeSet* set) { | 
84  |  |     UnicodeString str;  | 
85  |  |     set->toPattern(str, TRUE);  | 
86  |  |     char buf[40];  | 
87  |  |     str.extract(0, 39, buf, "");  | 
88  |  |     printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf); | 
89  |  | }  | 
90  |  |  | 
91  |  | static inline void _dbgdt(UnicodeSet* set) { | 
92  |  |     UnicodeString str;  | 
93  |  |     set->toPattern(str, TRUE);  | 
94  |  |     char buf[40];  | 
95  |  |     str.extract(0, 39, buf, "");  | 
96  |  |     printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf); | 
97  |  | }  | 
98  |  |  | 
99  |  | #else  | 
100  |  |  | 
101  |  | #define _dbgct(set)  | 
102  |  | #define _dbgdt(set)  | 
103  |  |  | 
104  |  | #endif  | 
105  |  |  | 
106  |  | //----------------------------------------------------------------  | 
107  |  | // UnicodeString in UVector support  | 
108  |  | //----------------------------------------------------------------  | 
109  |  |  | 
110  | 0  | static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) { | 
111  | 0  |     dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);  | 
112  | 0  | }  | 
113  |  |  | 
114  | 0  | static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { | 
115  | 0  |     const UnicodeString &a = *(const UnicodeString*)t1.pointer;  | 
116  | 0  |     const UnicodeString &b = *(const UnicodeString*)t2.pointer;  | 
117  | 0  |     return a.compare(b);  | 
118  | 0  | }  | 
119  |  |  | 
120  | 0  | UBool UnicodeSet::hasStrings() const { | 
121  | 0  |     return strings != nullptr && !strings->isEmpty();  | 
122  | 0  | }  | 
123  |  |  | 
124  | 0  | int32_t UnicodeSet::stringsSize() const { | 
125  | 0  |     return strings == nullptr ? 0 : strings->size();  | 
126  | 0  | }  | 
127  |  |  | 
128  | 0  | UBool UnicodeSet::stringsContains(const UnicodeString &s) const { | 
129  | 0  |     return strings != nullptr && strings->contains((void*) &s);  | 
130  | 0  | }  | 
131  |  |  | 
132  |  | //----------------------------------------------------------------  | 
133  |  | // Constructors &c  | 
134  |  | //----------------------------------------------------------------  | 
135  |  |  | 
136  |  | /**  | 
137  |  |  * Constructs an empty set.  | 
138  |  |  */  | 
139  | 0  | UnicodeSet::UnicodeSet() { | 
140  | 0  |     list[0] = UNICODESET_HIGH;  | 
141  | 0  |     _dbgct(this);  | 
142  | 0  | }  | 
143  |  |  | 
144  |  | /**  | 
145  |  |  * Constructs a set containing the given range. If <code>end >  | 
146  |  |  * start</code> then an empty set is created.  | 
147  |  |  *  | 
148  |  |  * @param start first character, inclusive, of range  | 
149  |  |  * @param end last character, inclusive, of range  | 
150  |  |  */  | 
151  | 0  | UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) { | 
152  | 0  |     list[0] = UNICODESET_HIGH;  | 
153  | 0  |     add(start, end);  | 
154  | 0  |     _dbgct(this);  | 
155  | 0  | }  | 
156  |  |  | 
157  |  | /**  | 
158  |  |  * Constructs a set that is identical to the given UnicodeSet.  | 
159  |  |  */  | 
160  | 0  | UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) { | 
161  | 0  |     *this = o;  | 
162  | 0  |     _dbgct(this);  | 
163  | 0  | }  | 
164  |  |  | 
165  |  | // Copy-construct as thawed.  | 
166  | 0  | UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) { | 
167  | 0  |     if (ensureCapacity(o.len)) { | 
168  |  |         // *this = o except for bmpSet and stringSpan  | 
169  | 0  |         len = o.len;  | 
170  | 0  |         uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));  | 
171  | 0  |         if (o.hasStrings()) { | 
172  | 0  |             UErrorCode status = U_ZERO_ERROR;  | 
173  | 0  |             if (!allocateStrings(status) ||  | 
174  | 0  |                     (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) { | 
175  | 0  |                 setToBogus();  | 
176  | 0  |                 return;  | 
177  | 0  |             }  | 
178  | 0  |         }  | 
179  | 0  |         if (o.pat) { | 
180  | 0  |             setPattern(o.pat, o.patLen);  | 
181  | 0  |         }  | 
182  | 0  |         _dbgct(this);  | 
183  | 0  |     }  | 
184  | 0  | }  | 
185  |  |  | 
186  |  | /**  | 
187  |  |  * Destructs the set.  | 
188  |  |  */  | 
189  | 0  | UnicodeSet::~UnicodeSet() { | 
190  | 0  |     _dbgdt(this); // first!  | 
191  | 0  |     if (list != stackList) { | 
192  | 0  |         uprv_free(list);  | 
193  | 0  |     }  | 
194  | 0  |     delete bmpSet;  | 
195  | 0  |     if (buffer != stackList) { | 
196  | 0  |         uprv_free(buffer);  | 
197  | 0  |     }  | 
198  | 0  |     delete strings;  | 
199  | 0  |     delete stringSpan;  | 
200  | 0  |     releasePattern();  | 
201  | 0  | }  | 
202  |  |  | 
203  |  | /**  | 
204  |  |  * Assigns this object to be a copy of another.  | 
205  |  |  */  | 
206  | 0  | UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) { | 
207  | 0  |     return copyFrom(o, FALSE);  | 
208  | 0  | }  | 
209  |  |  | 
210  | 0  | UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) { | 
211  | 0  |     if (this == &o) { | 
212  | 0  |         return *this;  | 
213  | 0  |     }  | 
214  | 0  |     if (isFrozen()) { | 
215  | 0  |         return *this;  | 
216  | 0  |     }  | 
217  | 0  |     if (o.isBogus()) { | 
218  | 0  |         setToBogus();  | 
219  | 0  |         return *this;  | 
220  | 0  |     }  | 
221  | 0  |     if (!ensureCapacity(o.len)) { | 
222  |  |         // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens.  | 
223  | 0  |         return *this;  | 
224  | 0  |     }  | 
225  | 0  |     len = o.len;  | 
226  | 0  |     uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));  | 
227  | 0  |     if (o.bmpSet != nullptr && !asThawed) { | 
228  | 0  |         bmpSet = new BMPSet(*o.bmpSet, list, len);  | 
229  | 0  |         if (bmpSet == NULL) { // Check for memory allocation error. | 
230  | 0  |             setToBogus();  | 
231  | 0  |             return *this;  | 
232  | 0  |         }  | 
233  | 0  |     }  | 
234  | 0  |     if (o.hasStrings()) { | 
235  | 0  |         UErrorCode status = U_ZERO_ERROR;  | 
236  | 0  |         if ((strings == nullptr && !allocateStrings(status)) ||  | 
237  | 0  |                 (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) { | 
238  | 0  |             setToBogus();  | 
239  | 0  |             return *this;  | 
240  | 0  |         }  | 
241  | 0  |     } else if (hasStrings()) { | 
242  | 0  |         strings->removeAllElements();  | 
243  | 0  |     }  | 
244  | 0  |     if (o.stringSpan != nullptr && !asThawed) { | 
245  | 0  |         stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);  | 
246  | 0  |         if (stringSpan == NULL) { // Check for memory allocation error. | 
247  | 0  |             setToBogus();  | 
248  | 0  |             return *this;  | 
249  | 0  |         }  | 
250  | 0  |     }  | 
251  | 0  |     releasePattern();  | 
252  | 0  |     if (o.pat) { | 
253  | 0  |         setPattern(o.pat, o.patLen);  | 
254  | 0  |     }  | 
255  | 0  |     return *this;  | 
256  | 0  | }  | 
257  |  |  | 
258  |  | /**  | 
259  |  |  * Returns a copy of this object.  All UnicodeMatcher objects have  | 
260  |  |  * to support cloning in order to allow classes using  | 
261  |  |  * UnicodeMatchers, such as Transliterator, to implement cloning.  | 
262  |  |  */  | 
263  | 0  | UnicodeSet* UnicodeSet::clone() const { | 
264  | 0  |     return new UnicodeSet(*this);  | 
265  | 0  | }  | 
266  |  |  | 
267  | 0  | UnicodeSet *UnicodeSet::cloneAsThawed() const { | 
268  | 0  |     return new UnicodeSet(*this, TRUE);  | 
269  | 0  | }  | 
270  |  |  | 
271  |  | /**  | 
272  |  |  * Compares the specified object with this set for equality.  Returns  | 
273  |  |  * <tt>true</tt> if the two sets  | 
274  |  |  * have the same size, and every member of the specified set is  | 
275  |  |  * contained in this set (or equivalently, every member of this set is  | 
276  |  |  * contained in the specified set).  | 
277  |  |  *  | 
278  |  |  * @param o set to be compared for equality with this set.  | 
279  |  |  * @return <tt>true</tt> if the specified set is equal to this set.  | 
280  |  |  */  | 
281  | 0  | bool UnicodeSet::operator==(const UnicodeSet& o) const { | 
282  | 0  |     if (len != o.len) return FALSE;  | 
283  | 0  |     for (int32_t i = 0; i < len; ++i) { | 
284  | 0  |         if (list[i] != o.list[i]) return FALSE;  | 
285  | 0  |     }  | 
286  | 0  |     if (hasStrings() != o.hasStrings()) { return FALSE; } | 
287  | 0  |     if (hasStrings() && *strings != *o.strings) return FALSE;  | 
288  | 0  |     return TRUE;  | 
289  | 0  | }  | 
290  |  |  | 
291  |  | /**  | 
292  |  |  * Returns the hash code value for this set.  | 
293  |  |  *  | 
294  |  |  * @return the hash code value for this set.  | 
295  |  |  * @see Object#hashCode()  | 
296  |  |  */  | 
297  | 0  | int32_t UnicodeSet::hashCode(void) const { | 
298  | 0  |     uint32_t result = static_cast<uint32_t>(len);  | 
299  | 0  |     for (int32_t i = 0; i < len; ++i) { | 
300  | 0  |         result *= 1000003u;  | 
301  | 0  |         result += list[i];  | 
302  | 0  |     }  | 
303  | 0  |     return static_cast<int32_t>(result);  | 
304  | 0  | }  | 
305  |  |  | 
306  |  | //----------------------------------------------------------------  | 
307  |  | // Public API  | 
308  |  | //----------------------------------------------------------------  | 
309  |  |  | 
310  |  | /**  | 
311  |  |  * Returns the number of elements in this set (its cardinality),  | 
312  |  |  * Note than the elements of a set may include both individual  | 
313  |  |  * codepoints and strings.  | 
314  |  |  *  | 
315  |  |  * @return the number of elements in this set (its cardinality).  | 
316  |  |  */  | 
317  | 0  | int32_t UnicodeSet::size(void) const { | 
318  | 0  |     int32_t n = 0;  | 
319  | 0  |     int32_t count = getRangeCount();  | 
320  | 0  |     for (int32_t i = 0; i < count; ++i) { | 
321  | 0  |         n += getRangeEnd(i) - getRangeStart(i) + 1;  | 
322  | 0  |     }  | 
323  | 0  |     return n + stringsSize();  | 
324  | 0  | }  | 
325  |  |  | 
326  |  | /**  | 
327  |  |  * Returns <tt>true</tt> if this set contains no elements.  | 
328  |  |  *  | 
329  |  |  * @return <tt>true</tt> if this set contains no elements.  | 
330  |  |  */  | 
331  | 0  | UBool UnicodeSet::isEmpty(void) const { | 
332  | 0  |     return len == 1 && !hasStrings();  | 
333  | 0  | }  | 
334  |  |  | 
335  |  | /**  | 
336  |  |  * Returns true if this set contains the given character.  | 
337  |  |  * @param c character to be checked for containment  | 
338  |  |  * @return true if the test condition is met  | 
339  |  |  */  | 
340  | 0  | UBool UnicodeSet::contains(UChar32 c) const { | 
341  |  |     // Set i to the index of the start item greater than ch  | 
342  |  |     // We know we will terminate without length test!  | 
343  |  |     // LATER: for large sets, add binary search  | 
344  |  |     //int32_t i = -1;  | 
345  |  |     //for (;;) { | 
346  |  |     //    if (c < list[++i]) break;  | 
347  |  |     //}  | 
348  | 0  |     if (bmpSet != NULL) { | 
349  | 0  |         return bmpSet->contains(c);  | 
350  | 0  |     }  | 
351  | 0  |     if (stringSpan != NULL) { | 
352  | 0  |         return stringSpan->contains(c);  | 
353  | 0  |     }  | 
354  | 0  |     if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound | 
355  | 0  |         return FALSE;  | 
356  | 0  |     }  | 
357  | 0  |     int32_t i = findCodePoint(c);  | 
358  | 0  |     return (UBool)(i & 1); // return true if odd  | 
359  | 0  | }  | 
360  |  |  | 
361  |  | /**  | 
362  |  |  * Returns the smallest value i such that c < list[i].  Caller  | 
363  |  |  * must ensure that c is a legal value or this method will enter  | 
364  |  |  * an infinite loop.  This method performs a binary search.  | 
365  |  |  * @param c a character in the range MIN_VALUE..MAX_VALUE  | 
366  |  |  * inclusive  | 
367  |  |  * @return the smallest integer i in the range 0..len-1,  | 
368  |  |  * inclusive, such that c < list[i]  | 
369  |  |  */  | 
370  | 0  | int32_t UnicodeSet::findCodePoint(UChar32 c) const { | 
371  |  |     /* Examples:  | 
372  |  |                                        findCodePoint(c)  | 
373  |  |        set              list[]         c=0 1 3 4 7 8  | 
374  |  |        ===              ==============   ===========  | 
375  |  |        []               [110000]         0 0 0 0 0 0  | 
376  |  |        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2  | 
377  |  |        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2  | 
378  |  |        [:Any:]          [0, 110000]      1 1 1 1 1 1  | 
379  |  |      */  | 
380  |  |  | 
381  |  |     // Return the smallest i such that c < list[i].  Assume  | 
382  |  |     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).  | 
383  | 0  |     if (c < list[0])  | 
384  | 0  |         return 0;  | 
385  |  |     // High runner test.  c is often after the last range, so an  | 
386  |  |     // initial check for this condition pays off.  | 
387  | 0  |     int32_t lo = 0;  | 
388  | 0  |     int32_t hi = len - 1;  | 
389  | 0  |     if (lo >= hi || c >= list[hi-1])  | 
390  | 0  |         return hi;  | 
391  |  |     // invariant: c >= list[lo]  | 
392  |  |     // invariant: c < list[hi]  | 
393  | 0  |     for (;;) { | 
394  | 0  |         int32_t i = (lo + hi) >> 1;  | 
395  | 0  |         if (i == lo) { | 
396  | 0  |             break; // Found!  | 
397  | 0  |         } else if (c < list[i]) { | 
398  | 0  |             hi = i;  | 
399  | 0  |         } else { | 
400  | 0  |             lo = i;  | 
401  | 0  |         }  | 
402  | 0  |     }  | 
403  | 0  |     return hi;  | 
404  | 0  | }  | 
405  |  |  | 
406  |  | /**  | 
407  |  |  * Returns true if this set contains every character  | 
408  |  |  * of the given range.  | 
409  |  |  * @param start first character, inclusive, of the range  | 
410  |  |  * @param end last character, inclusive, of the range  | 
411  |  |  * @return true if the test condition is met  | 
412  |  |  */  | 
413  | 0  | UBool UnicodeSet::contains(UChar32 start, UChar32 end) const { | 
414  |  |     //int32_t i = -1;  | 
415  |  |     //for (;;) { | 
416  |  |     //    if (start < list[++i]) break;  | 
417  |  |     //}  | 
418  | 0  |     int32_t i = findCodePoint(start);  | 
419  | 0  |     return ((i & 1) != 0 && end < list[i]);  | 
420  | 0  | }  | 
421  |  |  | 
422  |  | /**  | 
423  |  |  * Returns <tt>true</tt> if this set contains the given  | 
424  |  |  * multicharacter string.  | 
425  |  |  * @param s string to be checked for containment  | 
426  |  |  * @return <tt>true</tt> if this set contains the specified string  | 
427  |  |  */  | 
428  | 0  | UBool UnicodeSet::contains(const UnicodeString& s) const { | 
429  | 0  |     int32_t cp = getSingleCP(s);  | 
430  | 0  |     if (cp < 0) { | 
431  | 0  |         return stringsContains(s);  | 
432  | 0  |     } else { | 
433  | 0  |         return contains((UChar32) cp);  | 
434  | 0  |     }  | 
435  | 0  | }  | 
436  |  |  | 
437  |  | /**  | 
438  |  |  * Returns true if this set contains all the characters and strings  | 
439  |  |  * of the given set.  | 
440  |  |  * @param c set to be checked for containment  | 
441  |  |  * @return true if the test condition is met  | 
442  |  |  */  | 
443  | 0  | UBool UnicodeSet::containsAll(const UnicodeSet& c) const { | 
444  |  |     // The specified set is a subset if all of its pairs are contained in  | 
445  |  |     // this set.  It's possible to code this more efficiently in terms of  | 
446  |  |     // direct manipulation of the inversion lists if the need arises.  | 
447  | 0  |     int32_t n = c.getRangeCount();  | 
448  | 0  |     for (int i=0; i<n; ++i) { | 
449  | 0  |         if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) { | 
450  | 0  |             return FALSE;  | 
451  | 0  |         }  | 
452  | 0  |     }  | 
453  | 0  |     return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings));  | 
454  | 0  | }  | 
455  |  |  | 
456  |  | /**  | 
457  |  |  * Returns true if this set contains all the characters  | 
458  |  |  * of the given string.  | 
459  |  |  * @param s string containing characters to be checked for containment  | 
460  |  |  * @return true if the test condition is met  | 
461  |  |  */  | 
462  | 0  | UBool UnicodeSet::containsAll(const UnicodeString& s) const { | 
463  | 0  |     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==  | 
464  | 0  |                    s.length());  | 
465  | 0  | }  | 
466  |  |  | 
467  |  | /**  | 
468  |  |  * Returns true if this set contains none of the characters  | 
469  |  |  * of the given range.  | 
470  |  |  * @param start first character, inclusive, of the range  | 
471  |  |  * @param end last character, inclusive, of the range  | 
472  |  |  * @return true if the test condition is met  | 
473  |  |  */  | 
474  | 0  | UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const { | 
475  |  |     //int32_t i = -1;  | 
476  |  |     //for (;;) { | 
477  |  |     //    if (start < list[++i]) break;  | 
478  |  |     //}  | 
479  | 0  |     int32_t i = findCodePoint(start);  | 
480  | 0  |     return ((i & 1) == 0 && end < list[i]);  | 
481  | 0  | }  | 
482  |  |  | 
483  |  | /**  | 
484  |  |  * Returns true if this set contains none of the characters and strings  | 
485  |  |  * of the given set.  | 
486  |  |  * @param c set to be checked for containment  | 
487  |  |  * @return true if the test condition is met  | 
488  |  |  */  | 
489  | 0  | UBool UnicodeSet::containsNone(const UnicodeSet& c) const { | 
490  |  |     // The specified set is a subset if all of its pairs are contained in  | 
491  |  |     // this set.  It's possible to code this more efficiently in terms of  | 
492  |  |     // direct manipulation of the inversion lists if the need arises.  | 
493  | 0  |     int32_t n = c.getRangeCount();  | 
494  | 0  |     for (int32_t i=0; i<n; ++i) { | 
495  | 0  |         if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) { | 
496  | 0  |             return FALSE;  | 
497  | 0  |         }  | 
498  | 0  |     }  | 
499  | 0  |     return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings);  | 
500  | 0  | }  | 
501  |  |  | 
502  |  | /**  | 
503  |  |  * Returns true if this set contains none of the characters  | 
504  |  |  * of the given string.  | 
505  |  |  * @param s string containing characters to be checked for containment  | 
506  |  |  * @return true if the test condition is met  | 
507  |  |  */  | 
508  | 0  | UBool UnicodeSet::containsNone(const UnicodeString& s) const { | 
509  | 0  |     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==  | 
510  | 0  |                    s.length());  | 
511  | 0  | }  | 
512  |  |  | 
513  |  | /**  | 
514  |  |  * Returns <tt>true</tt> if this set contains any character whose low byte  | 
515  |  |  * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for  | 
516  |  |  * indexing.  | 
517  |  |  */  | 
518  | 0  | UBool UnicodeSet::matchesIndexValue(uint8_t v) const { | 
519  |  |     /* The index value v, in the range [0,255], is contained in this set if  | 
520  |  |      * it is contained in any pair of this set.  Pairs either have the high  | 
521  |  |      * bytes equal, or unequal.  If the high bytes are equal, then we have  | 
522  |  |      * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=  | 
523  |  |      * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.  | 
524  |  |      * Then v is contained if xx <= v || v <= yy.  (This is identical to the  | 
525  |  |      * time zone month containment logic.)  | 
526  |  |      */  | 
527  | 0  |     int32_t i;  | 
528  | 0  |     int32_t rangeCount=getRangeCount();  | 
529  | 0  |     for (i=0; i<rangeCount; ++i) { | 
530  | 0  |         UChar32 low = getRangeStart(i);  | 
531  | 0  |         UChar32 high = getRangeEnd(i);  | 
532  | 0  |         if ((low & ~0xFF) == (high & ~0xFF)) { | 
533  | 0  |             if ((low & 0xFF) <= v && v <= (high & 0xFF)) { | 
534  | 0  |                 return TRUE;  | 
535  | 0  |             }  | 
536  | 0  |         } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) { | 
537  | 0  |             return TRUE;  | 
538  | 0  |         }  | 
539  | 0  |     }  | 
540  | 0  |     if (hasStrings()) { | 
541  | 0  |         for (i=0; i<strings->size(); ++i) { | 
542  | 0  |             const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);  | 
543  | 0  |             if (s.isEmpty()) { | 
544  | 0  |                 continue;  // skip the empty string  | 
545  | 0  |             }  | 
546  | 0  |             UChar32 c = s.char32At(0);  | 
547  | 0  |             if ((c & 0xFF) == v) { | 
548  | 0  |                 return TRUE;  | 
549  | 0  |             }  | 
550  | 0  |         }  | 
551  | 0  |     }  | 
552  | 0  |     return FALSE;  | 
553  | 0  | }  | 
554  |  |  | 
555  |  | /**  | 
556  |  |  * Implementation of UnicodeMatcher::matches().  Always matches the  | 
557  |  |  * longest possible multichar string.  | 
558  |  |  */  | 
559  |  | UMatchDegree UnicodeSet::matches(const Replaceable& text,  | 
560  |  |                                  int32_t& offset,  | 
561  |  |                                  int32_t limit,  | 
562  | 0  |                                  UBool incremental) { | 
563  | 0  |     if (offset == limit) { | 
564  | 0  |         if (contains(U_ETHER)) { | 
565  | 0  |             return incremental ? U_PARTIAL_MATCH : U_MATCH;  | 
566  | 0  |         } else { | 
567  | 0  |             return U_MISMATCH;  | 
568  | 0  |         }  | 
569  | 0  |     } else { | 
570  | 0  |         if (hasStrings()) { // try strings first | 
571  |  |  | 
572  |  |             // might separate forward and backward loops later  | 
573  |  |             // for now they are combined  | 
574  |  |  | 
575  |  |             // TODO Improve efficiency of this, at least in the forward  | 
576  |  |             // direction, if not in both.  In the forward direction we  | 
577  |  |             // can assume the strings are sorted.  | 
578  |  | 
  | 
579  | 0  |             int32_t i;  | 
580  | 0  |             UBool forward = offset < limit;  | 
581  |  |  | 
582  |  |             // firstChar is the leftmost char to match in the  | 
583  |  |             // forward direction or the rightmost char to match in  | 
584  |  |             // the reverse direction.  | 
585  | 0  |             UChar firstChar = text.charAt(offset);  | 
586  |  |  | 
587  |  |             // If there are multiple strings that can match we  | 
588  |  |             // return the longest match.  | 
589  | 0  |             int32_t highWaterLength = 0;  | 
590  |  | 
  | 
591  | 0  |             for (i=0; i<strings->size(); ++i) { | 
592  | 0  |                 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);  | 
593  | 0  |                 if (trial.isEmpty()) { | 
594  | 0  |                     continue;  // skip the empty string  | 
595  | 0  |                 }  | 
596  |  |  | 
597  | 0  |                 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);  | 
598  |  |  | 
599  |  |                 // Strings are sorted, so we can optimize in the  | 
600  |  |                 // forward direction.  | 
601  | 0  |                 if (forward && c > firstChar) break;  | 
602  | 0  |                 if (c != firstChar) continue;  | 
603  |  |  | 
604  | 0  |                 int32_t matchLen = matchRest(text, offset, limit, trial);  | 
605  |  | 
  | 
606  | 0  |                 if (incremental) { | 
607  | 0  |                     int32_t maxLen = forward ? limit-offset : offset-limit;  | 
608  | 0  |                     if (matchLen == maxLen) { | 
609  |  |                         // We have successfully matched but only up to limit.  | 
610  | 0  |                         return U_PARTIAL_MATCH;  | 
611  | 0  |                     }  | 
612  | 0  |                 }  | 
613  |  |  | 
614  | 0  |                 if (matchLen == trial.length()) { | 
615  |  |                     // We have successfully matched the whole string.  | 
616  | 0  |                     if (matchLen > highWaterLength) { | 
617  | 0  |                         highWaterLength = matchLen;  | 
618  | 0  |                     }  | 
619  |  |                     // In the forward direction we know strings  | 
620  |  |                     // are sorted so we can bail early.  | 
621  | 0  |                     if (forward && matchLen < highWaterLength) { | 
622  | 0  |                         break;  | 
623  | 0  |                     }  | 
624  | 0  |                     continue;  | 
625  | 0  |                 }  | 
626  | 0  |             }  | 
627  |  |  | 
628  |  |             // We've checked all strings without a partial match.  | 
629  |  |             // If we have full matches, return the longest one.  | 
630  | 0  |             if (highWaterLength != 0) { | 
631  | 0  |                 offset += forward ? highWaterLength : -highWaterLength;  | 
632  | 0  |                 return U_MATCH;  | 
633  | 0  |             }  | 
634  | 0  |         }  | 
635  | 0  |         return UnicodeFilter::matches(text, offset, limit, incremental);  | 
636  | 0  |     }  | 
637  | 0  | }  | 
638  |  |  | 
639  |  | /**  | 
640  |  |  * Returns the longest match for s in text at the given position.  | 
641  |  |  * If limit > start then match forward from start+1 to limit  | 
642  |  |  * matching all characters except s.charAt(0).  If limit < start,  | 
643  |  |  * go backward starting from start-1 matching all characters  | 
644  |  |  * except s.charAt(s.length()-1).  This method assumes that the  | 
645  |  |  * first character, text.charAt(start), matches s, so it does not  | 
646  |  |  * check it.  | 
647  |  |  * @param text the text to match  | 
648  |  |  * @param start the first character to match.  In the forward  | 
649  |  |  * direction, text.charAt(start) is matched against s.charAt(0).  | 
650  |  |  * In the reverse direction, it is matched against  | 
651  |  |  * s.charAt(s.length()-1).  | 
652  |  |  * @param limit the limit offset for matching, either last+1 in  | 
653  |  |  * the forward direction, or last-1 in the reverse direction,  | 
654  |  |  * where last is the index of the last character to match.  | 
655  |  |  * @return If part of s matches up to the limit, return |limit -  | 
656  |  |  * start|.  If all of s matches before reaching the limit, return  | 
657  |  |  * s.length().  If there is a mismatch between s and text, return  | 
658  |  |  * 0  | 
659  |  |  */  | 
660  |  | int32_t UnicodeSet::matchRest(const Replaceable& text,  | 
661  |  |                               int32_t start, int32_t limit,  | 
662  | 0  |                               const UnicodeString& s) { | 
663  | 0  |     int32_t i;  | 
664  | 0  |     int32_t maxLen;  | 
665  | 0  |     int32_t slen = s.length();  | 
666  | 0  |     if (start < limit) { | 
667  | 0  |         maxLen = limit - start;  | 
668  | 0  |         if (maxLen > slen) maxLen = slen;  | 
669  | 0  |         for (i = 1; i < maxLen; ++i) { | 
670  | 0  |             if (text.charAt(start + i) != s.charAt(i)) return 0;  | 
671  | 0  |         }  | 
672  | 0  |     } else { | 
673  | 0  |         maxLen = start - limit;  | 
674  | 0  |         if (maxLen > slen) maxLen = slen;  | 
675  | 0  |         --slen; // <=> slen = s.length() - 1;  | 
676  | 0  |         for (i = 1; i < maxLen; ++i) { | 
677  | 0  |             if (text.charAt(start - i) != s.charAt(slen - i)) return 0;  | 
678  | 0  |         }  | 
679  | 0  |     }  | 
680  | 0  |     return maxLen;  | 
681  | 0  | }  | 
682  |  |  | 
683  |  | /**  | 
684  |  |  * Implement of UnicodeMatcher  | 
685  |  |  */  | 
686  | 0  | void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const { | 
687  | 0  |     toUnionTo.addAll(*this);  | 
688  | 0  | }  | 
689  |  |  | 
690  |  | /**  | 
691  |  |  * Returns the index of the given character within this set, where  | 
692  |  |  * the set is ordered by ascending code point.  If the character  | 
693  |  |  * is not in this set, return -1.  The inverse of this method is  | 
694  |  |  * <code>charAt()</code>.  | 
695  |  |  * @return an index from 0..size()-1, or -1  | 
696  |  |  */  | 
697  | 0  | int32_t UnicodeSet::indexOf(UChar32 c) const { | 
698  | 0  |     if (c < MIN_VALUE || c > MAX_VALUE) { | 
699  | 0  |         return -1;  | 
700  | 0  |     }  | 
701  | 0  |     int32_t i = 0;  | 
702  | 0  |     int32_t n = 0;  | 
703  | 0  |     for (;;) { | 
704  | 0  |         UChar32 start = list[i++];  | 
705  | 0  |         if (c < start) { | 
706  | 0  |             return -1;  | 
707  | 0  |         }  | 
708  | 0  |         UChar32 limit = list[i++];  | 
709  | 0  |         if (c < limit) { | 
710  | 0  |             return n + c - start;  | 
711  | 0  |         }  | 
712  | 0  |         n += limit - start;  | 
713  | 0  |     }  | 
714  | 0  | }  | 
715  |  |  | 
716  |  | /**  | 
717  |  |  * Returns the character at the given index within this set, where  | 
718  |  |  * the set is ordered by ascending code point.  If the index is  | 
719  |  |  * out of range, return (UChar32)-1.  The inverse of this method is  | 
720  |  |  * <code>indexOf()</code>.  | 
721  |  |  * @param index an index from 0..size()-1  | 
722  |  |  * @return the character at the given index, or (UChar32)-1.  | 
723  |  |  */  | 
724  | 0  | UChar32 UnicodeSet::charAt(int32_t index) const { | 
725  | 0  |     if (index >= 0) { | 
726  |  |         // len2 is the largest even integer <= len, that is, it is len  | 
727  |  |         // for even values and len-1 for odd values.  With odd values  | 
728  |  |         // the last entry is UNICODESET_HIGH.  | 
729  | 0  |         int32_t len2 = len & ~1;  | 
730  | 0  |         for (int32_t i=0; i < len2;) { | 
731  | 0  |             UChar32 start = list[i++];  | 
732  | 0  |             int32_t count = list[i++] - start;  | 
733  | 0  |             if (index < count) { | 
734  | 0  |                 return (UChar32)(start + index);  | 
735  | 0  |             }  | 
736  | 0  |             index -= count;  | 
737  | 0  |         }  | 
738  | 0  |     }  | 
739  | 0  |     return (UChar32)-1;  | 
740  | 0  | }  | 
741  |  |  | 
742  |  | /**  | 
743  |  |  * Make this object represent the range <code>start - end</code>.  | 
744  |  |  * If <code>end > start</code> then this object is set to an  | 
745  |  |  * an empty range.  | 
746  |  |  *  | 
747  |  |  * @param start first character in the set, inclusive  | 
748  |  |  * @rparam end last character in the set, inclusive  | 
749  |  |  */  | 
750  | 0  | UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) { | 
751  | 0  |     clear();  | 
752  | 0  |     complement(start, end);  | 
753  | 0  |     return *this;  | 
754  | 0  | }  | 
755  |  |  | 
756  |  | /**  | 
757  |  |  * Adds the specified range to this set if it is not already  | 
758  |  |  * present.  If this set already contains the specified range,  | 
759  |  |  * the call leaves this set unchanged.  If <code>end > start</code>  | 
760  |  |  * then an empty range is added, leaving the set unchanged.  | 
761  |  |  *  | 
762  |  |  * @param start first character, inclusive, of range to be added  | 
763  |  |  * to this set.  | 
764  |  |  * @param end last character, inclusive, of range to be added  | 
765  |  |  * to this set.  | 
766  |  |  */  | 
767  | 0  | UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) { | 
768  | 0  |     if (pinCodePoint(start) < pinCodePoint(end)) { | 
769  | 0  |         UChar32 limit = end + 1;  | 
770  |  |         // Fast path for adding a new range after the last one.  | 
771  |  |         // Odd list length: [..., lastStart, lastLimit, HIGH]  | 
772  | 0  |         if ((len & 1) != 0) { | 
773  |  |             // If the list is empty, set lastLimit low enough to not be adjacent to 0.  | 
774  | 0  |             UChar32 lastLimit = len == 1 ? -2 : list[len - 2];  | 
775  | 0  |             if (lastLimit <= start && !isFrozen() && !isBogus()) { | 
776  | 0  |                 if (lastLimit == start) { | 
777  |  |                     // Extend the last range.  | 
778  | 0  |                     list[len - 2] = limit;  | 
779  | 0  |                     if (limit == UNICODESET_HIGH) { | 
780  | 0  |                         --len;  | 
781  | 0  |                     }  | 
782  | 0  |                 } else { | 
783  | 0  |                     list[len - 1] = start;  | 
784  | 0  |                     if (limit < UNICODESET_HIGH) { | 
785  | 0  |                         if (ensureCapacity(len + 2)) { | 
786  | 0  |                             list[len++] = limit;  | 
787  | 0  |                             list[len++] = UNICODESET_HIGH;  | 
788  | 0  |                         }  | 
789  | 0  |                     } else {  // limit == UNICODESET_HIGH | 
790  | 0  |                         if (ensureCapacity(len + 1)) { | 
791  | 0  |                             list[len++] = UNICODESET_HIGH;  | 
792  | 0  |                         }  | 
793  | 0  |                     }  | 
794  | 0  |                 }  | 
795  | 0  |                 releasePattern();  | 
796  | 0  |                 return *this;  | 
797  | 0  |             }  | 
798  | 0  |         }  | 
799  |  |         // This is slow. Could be much faster using findCodePoint(start)  | 
800  |  |         // and modifying the list, dealing with adjacent & overlapping ranges.  | 
801  | 0  |         UChar32 range[3] = { start, limit, UNICODESET_HIGH }; | 
802  | 0  |         add(range, 2, 0);  | 
803  | 0  |     } else if (start == end) { | 
804  | 0  |         add(start);  | 
805  | 0  |     }  | 
806  | 0  |     return *this;  | 
807  | 0  | }  | 
808  |  |  | 
809  |  | // #define DEBUG_US_ADD  | 
810  |  |  | 
#ifdef DEBUG_US_ADD
#include <stdio.h>
// Debug helper: prints one code point, as a raw character when it is
// <= 0xFF and in U+XXXX notation otherwise.
void dump(UChar32 c) {
    if (c > 0xFF) {
        printf("U+%04X", c);
        return;
    }
    printf("%c", (char)c);
}
// Debug helper: prints the first len entries of list as a bracketed,
// comma-separated sequence.
void dump(const UChar32* list, int32_t len) {
    printf("[");
    for (int32_t idx = 0; idx < len; ++idx) {
        if (idx > 0) {
            printf(", ");
        }
        dump(list[idx]);
    }
    printf("]");
}
#endif
829  |  |  | 
830  |  | /**  | 
831  |  |  * Adds the specified character to this set if it is not already  | 
832  |  |  * present.  If this set already contains the specified character,  | 
833  |  |  * the call leaves this set unchanged.  | 
834  |  |  */  | 
835  | 0  | UnicodeSet& UnicodeSet::add(UChar32 c) { | 
836  |  |     // find smallest i such that c < list[i]  | 
837  |  |     // if odd, then it is IN the set  | 
838  |  |     // if even, then it is OUT of the set  | 
839  | 0  |     int32_t i = findCodePoint(pinCodePoint(c));  | 
840  |  |  | 
841  |  |     // already in set?  | 
842  | 0  |     if ((i & 1) != 0  || isFrozen() || isBogus()) return *this;  | 
843  |  |  | 
844  |  |     // HIGH is 0x110000  | 
845  |  |     // assert(list[len-1] == HIGH);  | 
846  |  |  | 
847  |  |     // empty = [HIGH]  | 
848  |  |     // [start_0, limit_0, start_1, limit_1, HIGH]  | 
849  |  |  | 
850  |  |     // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]  | 
851  |  |     //                             ^  | 
852  |  |     //                             list[i]  | 
853  |  |  | 
854  |  |     // i == 0 means c is before the first range  | 
855  |  |  | 
856  |  | #ifdef DEBUG_US_ADD  | 
857  |  |     printf("Add of "); | 
858  |  |     dump(c);  | 
859  |  |     printf(" found at %d", i); | 
860  |  |     printf(": "); | 
861  |  |     dump(list, len);  | 
862  |  |     printf(" => "); | 
863  |  | #endif  | 
864  |  |  | 
865  | 0  |     if (c == list[i]-1) { | 
866  |  |         // c is before start of next range  | 
867  | 0  |         list[i] = c;  | 
868  |  |         // if we touched the HIGH mark, then add a new one  | 
869  | 0  |         if (c == (UNICODESET_HIGH - 1)) { | 
870  | 0  |             if (!ensureCapacity(len+1)) { | 
871  |  |                 // ensureCapacity will mark the object as Bogus if OOM failure happens.  | 
872  | 0  |                 return *this;  | 
873  | 0  |             }  | 
874  | 0  |             list[len++] = UNICODESET_HIGH;  | 
875  | 0  |         }  | 
876  | 0  |         if (i > 0 && c == list[i-1]) { | 
877  |  |             // collapse adjacent ranges  | 
878  |  |  | 
879  |  |             // [..., start_k-1, c, c, limit_k, ..., HIGH]  | 
880  |  |             //                     ^  | 
881  |  |             //                     list[i]  | 
882  |  |  | 
883  |  |             //for (int32_t k=i-1; k<len-2; ++k) { | 
884  |  |             //    list[k] = list[k+2];  | 
885  |  |             //}  | 
886  | 0  |             UChar32* dst = list + i - 1;  | 
887  | 0  |             UChar32* src = dst + 2;  | 
888  | 0  |             UChar32* srclimit = list + len;  | 
889  | 0  |             while (src < srclimit) *(dst++) = *(src++);  | 
890  |  | 
  | 
891  | 0  |             len -= 2;  | 
892  | 0  |         }  | 
893  | 0  |     }  | 
894  |  |  | 
895  | 0  |     else if (i > 0 && c == list[i-1]) { | 
896  |  |         // c is after end of prior range  | 
897  | 0  |         list[i-1]++;  | 
898  |  |         // no need to check for collapse here  | 
899  | 0  |     }  | 
900  |  |  | 
901  | 0  |     else { | 
902  |  |         // At this point we know the new char is not adjacent to  | 
903  |  |         // any existing ranges, and it is not 10FFFF.  | 
904  |  |  | 
905  |  |  | 
906  |  |         // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]  | 
907  |  |         //                             ^  | 
908  |  |         //                             list[i]  | 
909  |  |  | 
910  |  |         // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]  | 
911  |  |         //                             ^  | 
912  |  |         //                             list[i]  | 
913  |  | 
  | 
914  | 0  |         if (!ensureCapacity(len+2)) { | 
915  |  |             // ensureCapacity will mark the object as Bogus if OOM failure happens.  | 
916  | 0  |             return *this;  | 
917  | 0  |         }  | 
918  |  |  | 
919  | 0  |         UChar32 *p = list + i;  | 
920  | 0  |         uprv_memmove(p + 2, p, (len - i) * sizeof(*p));  | 
921  | 0  |         list[i] = c;  | 
922  | 0  |         list[i+1] = c+1;  | 
923  | 0  |         len += 2;  | 
924  | 0  |     }  | 
925  |  |  | 
926  |  | #ifdef DEBUG_US_ADD  | 
927  |  |     dump(list, len);  | 
928  |  |     printf("\n"); | 
929  |  |  | 
930  |  |     for (i=1; i<len; ++i) { | 
931  |  |         if (list[i] <= list[i-1]) { | 
932  |  |             // Corrupt array!  | 
933  |  |             printf("ERROR: list has been corrupted\n"); | 
934  |  |             exit(1);  | 
935  |  |         }  | 
936  |  |     }  | 
937  |  | #endif  | 
938  |  |  | 
939  | 0  |     releasePattern();  | 
940  | 0  |     return *this;  | 
941  | 0  | }  | 
942  |  |  | 
943  |  | /**  | 
944  |  |  * Adds the specified multicharacter to this set if it is not already  | 
945  |  |  * present.  If this set already contains the multicharacter,  | 
946  |  |  * the call leaves this set unchanged.  | 
947  |  |  * Thus "ch" => {"ch"} | 
948  |  |  *  | 
949  |  |  * @param s the source string  | 
950  |  |  * @return the modified set, for chaining  | 
951  |  |  */  | 
952  | 0  | UnicodeSet& UnicodeSet::add(const UnicodeString& s) { | 
953  | 0  |     if (isFrozen() || isBogus()) return *this;  | 
954  | 0  |     int32_t cp = getSingleCP(s);  | 
955  | 0  |     if (cp < 0) { | 
956  | 0  |         if (!stringsContains(s)) { | 
957  | 0  |             _add(s);  | 
958  | 0  |             releasePattern();  | 
959  | 0  |         }  | 
960  | 0  |     } else { | 
961  | 0  |         add((UChar32)cp);  | 
962  | 0  |     }  | 
963  | 0  |     return *this;  | 
964  | 0  | }  | 
965  |  |  | 
966  |  | /**  | 
967  |  |  * Adds the given string, in order, to 'strings'.  The given string  | 
968  |  |  * must have been checked by the caller to not already be in 'strings'.  | 
969  |  |  */  | 
970  | 0  | void UnicodeSet::_add(const UnicodeString& s) { | 
971  | 0  |     if (isFrozen() || isBogus()) { | 
972  | 0  |         return;  | 
973  | 0  |     }  | 
974  | 0  |     UErrorCode ec = U_ZERO_ERROR;  | 
975  | 0  |     if (strings == nullptr && !allocateStrings(ec)) { | 
976  | 0  |         setToBogus();  | 
977  | 0  |         return;  | 
978  | 0  |     }  | 
979  | 0  |     UnicodeString* t = new UnicodeString(s);  | 
980  | 0  |     if (t == NULL) { // Check for memory allocation error. | 
981  | 0  |         setToBogus();  | 
982  | 0  |         return;  | 
983  | 0  |     }  | 
984  | 0  |     strings->sortedInsert(t, compareUnicodeString, ec);  | 
985  | 0  |     if (U_FAILURE(ec)) { | 
986  | 0  |         setToBogus();  | 
987  | 0  |         delete t;  | 
988  | 0  |     }  | 
989  | 0  | }  | 
990  |  |  | 
991  |  | /**  | 
992  |  |  * @return a code point IF the string consists of a single one.  | 
993  |  |  * otherwise returns -1.  | 
994  |  |  * @param string to test  | 
995  |  |  */  | 
996  | 0  | int32_t UnicodeSet::getSingleCP(const UnicodeString& s) { | 
997  | 0  |     int32_t sLength = s.length();  | 
998  | 0  |     if (sLength == 1) return s.charAt(0);  | 
999  | 0  |     if (sLength == 2) { | 
1000  | 0  |         UChar32 cp = s.char32At(0);  | 
1001  | 0  |         if (cp > 0xFFFF) { // is surrogate pair | 
1002  | 0  |             return cp;  | 
1003  | 0  |         }  | 
1004  | 0  |     }  | 
1005  | 0  |     return -1;  | 
1006  | 0  | }  | 
1007  |  |  | 
1008  |  | /**  | 
1009  |  |  * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} | 
1010  |  |  * If this set already any particular character, it has no effect on that character.  | 
1011  |  |  * @param the source string  | 
1012  |  |  * @return the modified set, for chaining  | 
1013  |  |  */  | 
1014  | 0  | UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) { | 
1015  | 0  |     UChar32 cp;  | 
1016  | 0  |     for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) { | 
1017  | 0  |         cp = s.char32At(i);  | 
1018  | 0  |         add(cp);  | 
1019  | 0  |     }  | 
1020  | 0  |     return *this;  | 
1021  | 0  | }  | 
1022  |  |  | 
1023  |  | /**  | 
1024  |  |  * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} | 
1025  |  |  * If this set already any particular character, it has no effect on that character.  | 
1026  |  |  * @param the source string  | 
1027  |  |  * @return the modified set, for chaining  | 
1028  |  |  */  | 
1029  | 0  | UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) { | 
1030  | 0  |     UnicodeSet set;  | 
1031  | 0  |     set.addAll(s);  | 
1032  | 0  |     retainAll(set);  | 
1033  | 0  |     return *this;  | 
1034  | 0  | }  | 
1035  |  |  | 
1036  |  | /**  | 
1037  |  |  * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} | 
1038  |  |  * If this set already any particular character, it has no effect on that character.  | 
1039  |  |  * @param the source string  | 
1040  |  |  * @return the modified set, for chaining  | 
1041  |  |  */  | 
1042  | 0  | UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) { | 
1043  | 0  |     UnicodeSet set;  | 
1044  | 0  |     set.addAll(s);  | 
1045  | 0  |     complementAll(set);  | 
1046  | 0  |     return *this;  | 
1047  | 0  | }  | 
1048  |  |  | 
1049  |  | /**  | 
1050  |  |  * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} | 
1051  |  |  * If this set already any particular character, it has no effect on that character.  | 
1052  |  |  * @param the source string  | 
1053  |  |  * @return the modified set, for chaining  | 
1054  |  |  */  | 
1055  | 0  | UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) { | 
1056  | 0  |     UnicodeSet set;  | 
1057  | 0  |     set.addAll(s);  | 
1058  | 0  |     removeAll(set);  | 
1059  | 0  |     return *this;  | 
1060  | 0  | }  | 
1061  |  |  | 
1062  | 0  | UnicodeSet& UnicodeSet::removeAllStrings() { | 
1063  | 0  |     if (!isFrozen() && hasStrings()) { | 
1064  | 0  |         strings->removeAllElements();  | 
1065  | 0  |         releasePattern();  | 
1066  | 0  |     }  | 
1067  | 0  |     return *this;  | 
1068  | 0  | }  | 
1069  |  |  | 
1070  |  |  | 
1071  |  | /**  | 
1072  |  |  * Makes a set from a multicharacter string. Thus "ch" => {"ch"} | 
1073  |  |  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> | 
1074  |  |  * @param the source string  | 
1075  |  |  * @return a newly created set containing the given string  | 
1076  |  |  */  | 
1077  | 0  | UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) { | 
1078  | 0  |     UnicodeSet *set = new UnicodeSet();  | 
1079  | 0  |     if (set != NULL) { // Check for memory allocation error. | 
1080  | 0  |         set->add(s);  | 
1081  | 0  |     }  | 
1082  | 0  |     return set;  | 
1083  | 0  | }  | 
1084  |  |  | 
1085  |  |  | 
1086  |  | /**  | 
1087  |  |  * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"} | 
1088  |  |  * @param the source string  | 
1089  |  |  * @return a newly created set containing the given characters  | 
1090  |  |  */  | 
1091  | 0  | UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) { | 
1092  | 0  |     UnicodeSet *set = new UnicodeSet();  | 
1093  | 0  |     if (set != NULL) { // Check for memory allocation error. | 
1094  | 0  |         set->addAll(s);  | 
1095  | 0  |     }  | 
1096  | 0  |     return set;  | 
1097  | 0  | }  | 
1098  |  |  | 
1099  |  | /**  | 
1100  |  |  * Retain only the elements in this set that are contained in the  | 
1101  |  |  * specified range.  If <code>end > start</code> then an empty range is  | 
1102  |  |  * retained, leaving the set empty.  | 
1103  |  |  *  | 
1104  |  |  * @param start first character, inclusive, of range to be retained  | 
1105  |  |  * to this set.  | 
1106  |  |  * @param end last character, inclusive, of range to be retained  | 
1107  |  |  * to this set.  | 
1108  |  |  */  | 
1109  | 0  | UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) { | 
1110  | 0  |     if (pinCodePoint(start) <= pinCodePoint(end)) { | 
1111  | 0  |         UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; | 
1112  | 0  |         retain(range, 2, 0);  | 
1113  | 0  |     } else { | 
1114  | 0  |         clear();  | 
1115  | 0  |     }  | 
1116  | 0  |     return *this;  | 
1117  | 0  | }  | 
1118  |  |  | 
1119  | 0  | UnicodeSet& UnicodeSet::retain(UChar32 c) { | 
1120  | 0  |     return retain(c, c);  | 
1121  | 0  | }  | 
1122  |  |  | 
1123  | 0  | UnicodeSet& UnicodeSet::retain(const UnicodeString &s) { | 
1124  | 0  |     if (isFrozen() || isBogus()) { return *this; } | 
1125  | 0  |     UChar32 cp = getSingleCP(s);  | 
1126  | 0  |     if (cp < 0) { | 
1127  | 0  |         bool isIn = stringsContains(s);  | 
1128  |  |         // Check for getRangeCount() first to avoid somewhat-expensive size()  | 
1129  |  |         // when there are single code points.  | 
1130  | 0  |         if (isIn && getRangeCount() == 0 && size() == 1) { | 
1131  | 0  |             return *this;  | 
1132  | 0  |         }  | 
1133  | 0  |         clear();  | 
1134  | 0  |         if (isIn) { | 
1135  | 0  |             _add(s);  | 
1136  | 0  |         }  | 
1137  | 0  |     } else { | 
1138  | 0  |         retain(cp, cp);  | 
1139  | 0  |     }  | 
1140  | 0  |     return *this;  | 
1141  | 0  | }  | 
1142  |  |  | 
1143  |  | /**  | 
1144  |  |  * Removes the specified range from this set if it is present.  | 
1145  |  |  * The set will not contain the specified range once the call  | 
1146  |  |  * returns.  If <code>end > start</code> then an empty range is  | 
1147  |  |  * removed, leaving the set unchanged.  | 
1148  |  |  *  | 
1149  |  |  * @param start first character, inclusive, of range to be removed  | 
1150  |  |  * from this set.  | 
1151  |  |  * @param end last character, inclusive, of range to be removed  | 
1152  |  |  * from this set.  | 
1153  |  |  */  | 
1154  | 0  | UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) { | 
1155  | 0  |     if (pinCodePoint(start) <= pinCodePoint(end)) { | 
1156  | 0  |         UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; | 
1157  | 0  |         retain(range, 2, 2);  | 
1158  | 0  |     }  | 
1159  | 0  |     return *this;  | 
1160  | 0  | }  | 
1161  |  |  | 
1162  |  | /**  | 
1163  |  |  * Removes the specified character from this set if it is present.  | 
1164  |  |  * The set will not contain the specified range once the call  | 
1165  |  |  * returns.  | 
1166  |  |  */  | 
1167  | 0  | UnicodeSet& UnicodeSet::remove(UChar32 c) { | 
1168  | 0  |     return remove(c, c);  | 
1169  | 0  | }  | 
1170  |  |  | 
1171  |  | /**  | 
1172  |  |  * Removes the specified string from this set if it is present.  | 
1173  |  |  * The set will not contain the specified character once the call  | 
1174  |  |  * returns.  | 
1175  |  |  * @param the source string  | 
1176  |  |  * @return the modified set, for chaining  | 
1177  |  |  */  | 
1178  | 0  | UnicodeSet& UnicodeSet::remove(const UnicodeString& s) { | 
1179  | 0  |     if (isFrozen() || isBogus()) return *this;  | 
1180  | 0  |     int32_t cp = getSingleCP(s);  | 
1181  | 0  |     if (cp < 0) { | 
1182  | 0  |         if (strings != nullptr && strings->removeElement((void*) &s)) { | 
1183  | 0  |             releasePattern();  | 
1184  | 0  |         }  | 
1185  | 0  |     } else { | 
1186  | 0  |         remove((UChar32)cp, (UChar32)cp);  | 
1187  | 0  |     }  | 
1188  | 0  |     return *this;  | 
1189  | 0  | }  | 
1190  |  |  | 
1191  |  | /**  | 
1192  |  |  * Complements the specified range in this set.  Any character in  | 
1193  |  |  * the range will be removed if it is in this set, or will be  | 
1194  |  |  * added if it is not in this set.  If <code>end > start</code>  | 
1195  |  |  * then an empty range is xor'ed, leaving the set unchanged.  | 
1196  |  |  *  | 
1197  |  |  * @param start first character, inclusive, of range to be removed  | 
1198  |  |  * from this set.  | 
1199  |  |  * @param end last character, inclusive, of range to be removed  | 
1200  |  |  * from this set.  | 
1201  |  |  */  | 
1202  | 0  | UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) { | 
1203  | 0  |     if (isFrozen() || isBogus()) { | 
1204  | 0  |         return *this;  | 
1205  | 0  |     }  | 
1206  | 0  |     if (pinCodePoint(start) <= pinCodePoint(end)) { | 
1207  | 0  |         UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; | 
1208  | 0  |         exclusiveOr(range, 2, 0);  | 
1209  | 0  |     }  | 
1210  | 0  |     releasePattern();  | 
1211  | 0  |     return *this;  | 
1212  | 0  | }  | 
1213  |  |  | 
1214  | 0  | UnicodeSet& UnicodeSet::complement(UChar32 c) { | 
1215  | 0  |     return complement(c, c);  | 
1216  | 0  | }  | 
1217  |  |  | 
1218  |  | /**  | 
1219  |  |  * This is equivalent to  | 
1220  |  |  * <code>complement(MIN_VALUE, MAX_VALUE)</code>.  | 
1221  |  |  */  | 
1222  | 0  | UnicodeSet& UnicodeSet::complement(void) { | 
1223  | 0  |     if (isFrozen() || isBogus()) { | 
1224  | 0  |         return *this;  | 
1225  | 0  |     }  | 
1226  | 0  |     if (list[0] == UNICODESET_LOW) { | 
1227  | 0  |         uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32));  | 
1228  | 0  |         --len;  | 
1229  | 0  |     } else { | 
1230  | 0  |         if (!ensureCapacity(len+1)) { | 
1231  | 0  |             return *this;  | 
1232  | 0  |         }  | 
1233  | 0  |         uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32));  | 
1234  | 0  |         list[0] = UNICODESET_LOW;  | 
1235  | 0  |         ++len;  | 
1236  | 0  |     }  | 
1237  | 0  |     releasePattern();  | 
1238  | 0  |     return *this;  | 
1239  | 0  | }  | 
1240  |  |  | 
1241  |  | /**  | 
1242  |  |  * Complement the specified string in this set.  | 
1243  |  |  * The set will not contain the specified string once the call  | 
1244  |  |  * returns.  | 
1245  |  |  *  | 
1246  |  |  * @param s the string to complement  | 
1247  |  |  * @return this object, for chaining  | 
1248  |  |  */  | 
1249  | 0  | UnicodeSet& UnicodeSet::complement(const UnicodeString& s) { | 
1250  | 0  |     if (isFrozen() || isBogus()) return *this;  | 
1251  | 0  |     int32_t cp = getSingleCP(s);  | 
1252  | 0  |     if (cp < 0) { | 
1253  | 0  |         if (stringsContains(s)) { | 
1254  | 0  |             strings->removeElement((void*) &s);  | 
1255  | 0  |         } else { | 
1256  | 0  |             _add(s);  | 
1257  | 0  |         }  | 
1258  | 0  |         releasePattern();  | 
1259  | 0  |     } else { | 
1260  | 0  |         complement((UChar32)cp, (UChar32)cp);  | 
1261  | 0  |     }  | 
1262  | 0  |     return *this;  | 
1263  | 0  | }  | 
1264  |  |  | 
1265  |  | /**  | 
1266  |  |  * Adds all of the elements in the specified set to this set if  | 
1267  |  |  * they're not already present.  This operation effectively  | 
1268  |  |  * modifies this set so that its value is the <i>union</i> of the two  | 
1269  |  |  * sets.  The behavior of this operation is unspecified if the specified  | 
1270  |  |  * collection is modified while the operation is in progress.  | 
1271  |  |  *  | 
1272  |  |  * @param c set whose elements are to be added to this set.  | 
1273  |  |  * @see #add(char, char)  | 
1274  |  |  */  | 
1275  | 0  | UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) { | 
1276  | 0  |     if ( c.len>0 && c.list!=NULL ) { | 
1277  | 0  |         add(c.list, c.len, 0);  | 
1278  | 0  |     }  | 
1279  |  |  | 
1280  |  |     // Add strings in order  | 
1281  | 0  |     if ( c.strings!=NULL ) { | 
1282  | 0  |         for (int32_t i=0; i<c.strings->size(); ++i) { | 
1283  | 0  |             const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);  | 
1284  | 0  |             if (!stringsContains(*s)) { | 
1285  | 0  |                 _add(*s);  | 
1286  | 0  |             }  | 
1287  | 0  |         }  | 
1288  | 0  |     }  | 
1289  | 0  |     return *this;  | 
1290  | 0  | }  | 
1291  |  |  | 
1292  |  | /**  | 
1293  |  |  * Retains only the elements in this set that are contained in the  | 
1294  |  |  * specified set.  In other words, removes from this set all of  | 
1295  |  |  * its elements that are not contained in the specified set.  This  | 
1296  |  |  * operation effectively modifies this set so that its value is  | 
1297  |  |  * the <i>intersection</i> of the two sets.  | 
1298  |  |  *  | 
1299  |  |  * @param c set that defines which elements this set will retain.  | 
1300  |  |  */  | 
1301  | 0  | UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) { | 
1302  | 0  |     if (isFrozen() || isBogus()) { | 
1303  | 0  |         return *this;  | 
1304  | 0  |     }  | 
1305  | 0  |     retain(c.list, c.len, 0);  | 
1306  | 0  |     if (hasStrings()) { | 
1307  | 0  |         if (!c.hasStrings()) { | 
1308  | 0  |             strings->removeAllElements();  | 
1309  | 0  |         } else { | 
1310  | 0  |             strings->retainAll(*c.strings);  | 
1311  | 0  |         }  | 
1312  | 0  |     }  | 
1313  | 0  |     return *this;  | 
1314  | 0  | }  | 
1315  |  |  | 
1316  |  | /**  | 
1317  |  |  * Removes from this set all of its elements that are contained in the  | 
1318  |  |  * specified set.  This operation effectively modifies this  | 
1319  |  |  * set so that its value is the <i>asymmetric set difference</i> of  | 
1320  |  |  * the two sets.  | 
1321  |  |  *  | 
1322  |  |  * @param c set that defines which elements will be removed from  | 
1323  |  |  *          this set.  | 
1324  |  |  */  | 
1325  | 0  | UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) { | 
1326  | 0  |     if (isFrozen() || isBogus()) { | 
1327  | 0  |         return *this;  | 
1328  | 0  |     }  | 
1329  | 0  |     retain(c.list, c.len, 2);  | 
1330  | 0  |     if (hasStrings() && c.hasStrings()) { | 
1331  | 0  |         strings->removeAll(*c.strings);  | 
1332  | 0  |     }  | 
1333  | 0  |     return *this;  | 
1334  | 0  | }  | 
1335  |  |  | 
1336  |  | /**  | 
1337  |  |  * Complements in this set all elements contained in the specified  | 
1338  |  |  * set.  Any character in the other set will be removed if it is  | 
1339  |  |  * in this set, or will be added if it is not in this set.  | 
1340  |  |  *  | 
1341  |  |  * @param c set that defines which elements will be xor'ed from  | 
1342  |  |  *          this set.  | 
1343  |  |  */  | 
1344  | 0  | UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) { | 
1345  | 0  |     if (isFrozen() || isBogus()) { | 
1346  | 0  |         return *this;  | 
1347  | 0  |     }  | 
1348  | 0  |     exclusiveOr(c.list, c.len, 0);  | 
1349  |  | 
  | 
1350  | 0  |     if (c.strings != nullptr) { | 
1351  | 0  |         for (int32_t i=0; i<c.strings->size(); ++i) { | 
1352  | 0  |             void* e = c.strings->elementAt(i);  | 
1353  | 0  |             if (strings == nullptr || !strings->removeElement(e)) { | 
1354  | 0  |                 _add(*(const UnicodeString*)e);  | 
1355  | 0  |             }  | 
1356  | 0  |         }  | 
1357  | 0  |     }  | 
1358  | 0  |     return *this;  | 
1359  | 0  | }  | 
1360  |  |  | 
1361  |  | /**  | 
1362  |  |  * Removes all of the elements from this set.  This set will be  | 
1363  |  |  * empty after this call returns.  | 
1364  |  |  */  | 
1365  | 0  | UnicodeSet& UnicodeSet::clear(void) { | 
1366  | 0  |     if (isFrozen()) { | 
1367  | 0  |         return *this;  | 
1368  | 0  |     }  | 
1369  | 0  |     list[0] = UNICODESET_HIGH;  | 
1370  | 0  |     len = 1;  | 
1371  | 0  |     releasePattern();  | 
1372  | 0  |     if (strings != NULL) { | 
1373  | 0  |         strings->removeAllElements();  | 
1374  | 0  |     }  | 
1375  |  |     // Remove bogus  | 
1376  | 0  |     fFlags = 0;  | 
1377  | 0  |     return *this;  | 
1378  | 0  | }  | 
1379  |  |  | 
1380  |  | /**  | 
1381  |  |  * Iteration method that returns the number of ranges contained in  | 
1382  |  |  * this set.  | 
1383  |  |  * @see #getRangeStart  | 
1384  |  |  * @see #getRangeEnd  | 
1385  |  |  */  | 
1386  | 0  | int32_t UnicodeSet::getRangeCount() const { | 
1387  | 0  |     return len/2;  | 
1388  | 0  | }  | 
1389  |  |  | 
1390  |  | /**  | 
1391  |  |  * Iteration method that returns the first character in the  | 
1392  |  |  * specified range of this set.  | 
1393  |  |  * @see #getRangeCount  | 
1394  |  |  * @see #getRangeEnd  | 
1395  |  |  */  | 
1396  | 0  | UChar32 UnicodeSet::getRangeStart(int32_t index) const { | 
1397  | 0  |     return list[index*2];  | 
1398  | 0  | }  | 
1399  |  |  | 
1400  |  | /**  | 
1401  |  |  * Iteration method that returns the last character in the  | 
1402  |  |  * specified range of this set.  | 
1403  |  |  * @see #getRangeStart  | 
1404  |  |  * @see #getRangeEnd  | 
1405  |  |  */  | 
1406  | 0  | UChar32 UnicodeSet::getRangeEnd(int32_t index) const { | 
1407  | 0  |     return list[index*2 + 1] - 1;  | 
1408  | 0  | }  | 
1409  |  |  | 
1410  | 0  | const UnicodeString* UnicodeSet::getString(int32_t index) const { | 
1411  | 0  |     return (const UnicodeString*) strings->elementAt(index);  | 
1412  | 0  | }  | 
1413  |  |  | 
1414  |  | /**  | 
1415  |  |  * Reallocate this objects internal structures to take up the least  | 
1416  |  |  * possible space, without changing this object's value.  | 
1417  |  |  */  | 
1418  | 0  | UnicodeSet& UnicodeSet::compact() { | 
1419  | 0  |     if (isFrozen() || isBogus()) { | 
1420  | 0  |         return *this;  | 
1421  | 0  |     }  | 
1422  |  |     // Delete buffer first to defragment memory less.  | 
1423  | 0  |     if (buffer != stackList) { | 
1424  | 0  |         uprv_free(buffer);  | 
1425  | 0  |         buffer = NULL;  | 
1426  | 0  |         bufferCapacity = 0;  | 
1427  | 0  |     }  | 
1428  | 0  |     if (list == stackList) { | 
1429  |  |         // pass  | 
1430  | 0  |     } else if (len <= INITIAL_CAPACITY) { | 
1431  | 0  |         uprv_memcpy(stackList, list, len * sizeof(UChar32));  | 
1432  | 0  |         uprv_free(list);  | 
1433  | 0  |         list = stackList;  | 
1434  | 0  |         capacity = INITIAL_CAPACITY;  | 
1435  | 0  |     } else if ((len + 7) < capacity) { | 
1436  |  |         // If we have more than a little unused capacity, shrink it to len.  | 
1437  | 0  |         UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * len);  | 
1438  | 0  |         if (temp) { | 
1439  | 0  |             list = temp;  | 
1440  | 0  |             capacity = len;  | 
1441  | 0  |         }  | 
1442  |  |         // else what the heck happened?! We allocated less memory!  | 
1443  |  |         // Oh well. We'll keep our original array.  | 
1444  | 0  |     }  | 
1445  | 0  |     if (strings != nullptr && strings->isEmpty()) { | 
1446  | 0  |         delete strings;  | 
1447  | 0  |         strings = nullptr;  | 
1448  | 0  |     }  | 
1449  | 0  |     return *this;  | 
1450  | 0  | }  | 
1451  |  |  | 
1452  |  | #ifdef DEBUG_SERIALIZE  | 
1453  |  | #include <stdio.h>  | 
1454  |  | #endif  | 
1455  |  |  | 
1456  |  | /**  | 
1457  |  |  * Deserialize constructor.  | 
1458  |  |  */  | 
1459  |  | UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization,  | 
1460  | 0  |                        UErrorCode &ec) { | 
1461  |  | 
  | 
1462  | 0  |   if(U_FAILURE(ec)) { | 
1463  | 0  |     setToBogus();  | 
1464  | 0  |     return;  | 
1465  | 0  |   }  | 
1466  |  |  | 
1467  | 0  |   if( (serialization != kSerialized)  | 
1468  | 0  |       || (data==NULL)  | 
1469  | 0  |       || (dataLen < 1)) { | 
1470  | 0  |     ec = U_ILLEGAL_ARGUMENT_ERROR;  | 
1471  | 0  |     setToBogus();  | 
1472  | 0  |     return;  | 
1473  | 0  |   }  | 
1474  |  |  | 
1475  |  |   // bmp?  | 
1476  | 0  |   int32_t headerSize = ((data[0]&0x8000)) ?2:1;  | 
1477  | 0  |   int32_t bmpLength = (headerSize==1)?data[0]:data[1];  | 
1478  |  | 
  | 
1479  | 0  |   int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;  | 
1480  |  | #ifdef DEBUG_SERIALIZE  | 
1481  |  |   printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]); | 
1482  |  | #endif  | 
1483  | 0  |   if(!ensureCapacity(newLength + 1)) {  // +1 for HIGH | 
1484  | 0  |     return;  | 
1485  | 0  |   }  | 
1486  |  |   // copy bmp  | 
1487  | 0  |   int32_t i;  | 
1488  | 0  |   for(i = 0; i< bmpLength;i++) { | 
1489  | 0  |     list[i] = data[i+headerSize];  | 
1490  |  | #ifdef DEBUG_SERIALIZE  | 
1491  |  |     printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]); | 
1492  |  | #endif  | 
1493  | 0  |   }  | 
1494  |  |   // copy smp  | 
1495  | 0  |   for(i=bmpLength;i<newLength;i++) { | 
1496  | 0  |     list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +  | 
1497  | 0  |               ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);  | 
1498  |  | #ifdef DEBUG_SERIALIZE  | 
1499  |  |     printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]); | 
1500  |  | #endif  | 
1501  | 0  |   }  | 
1502  | 0  |   U_ASSERT(i == newLength);  | 
1503  | 0  |   if (i == 0 || list[i - 1] != UNICODESET_HIGH) { | 
1504  | 0  |     list[i++] = UNICODESET_HIGH;  | 
1505  | 0  |   }  | 
1506  | 0  |   len = i;  | 
1507  | 0  | }  | 
1508  |  |  | 
1509  |  |  | 
1510  | 0  | int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const { | 
1511  | 0  |     int32_t bmpLength, length, destLength;  | 
1512  |  | 
  | 
1513  | 0  |     if (U_FAILURE(ec)) { | 
1514  | 0  |         return 0;  | 
1515  | 0  |     }  | 
1516  |  |  | 
1517  | 0  |     if (destCapacity<0 || (destCapacity>0 && dest==NULL)) { | 
1518  | 0  |         ec=U_ILLEGAL_ARGUMENT_ERROR;  | 
1519  | 0  |         return 0;  | 
1520  | 0  |     }  | 
1521  |  |  | 
1522  |  |     /* count necessary 16-bit units */  | 
1523  | 0  |     length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH  | 
1524  |  |     // assert(length>=0);  | 
1525  | 0  |     if (length==0) { | 
1526  |  |         /* empty set */  | 
1527  | 0  |         if (destCapacity>0) { | 
1528  | 0  |             *dest=0;  | 
1529  | 0  |         } else { | 
1530  | 0  |             ec=U_BUFFER_OVERFLOW_ERROR;  | 
1531  | 0  |         }  | 
1532  | 0  |         return 1;  | 
1533  | 0  |     }  | 
1534  |  |     /* now length>0 */  | 
1535  |  |  | 
1536  | 0  |     if (this->list[length-1]<=0xffff) { | 
1537  |  |         /* all BMP */  | 
1538  | 0  |         bmpLength=length;  | 
1539  | 0  |     } else if (this->list[0]>=0x10000) { | 
1540  |  |         /* all supplementary */  | 
1541  | 0  |         bmpLength=0;  | 
1542  | 0  |         length*=2;  | 
1543  | 0  |     } else { | 
1544  |  |         /* some BMP, some supplementary */  | 
1545  | 0  |         for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {} | 
1546  | 0  |         length=bmpLength+2*(length-bmpLength);  | 
1547  | 0  |     }  | 
1548  |  | #ifdef DEBUG_SERIALIZE  | 
1549  |  |     printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len); | 
1550  |  | #endif  | 
1551  |  |     /* length: number of 16-bit array units */  | 
1552  | 0  |     if (length>0x7fff) { | 
1553  |  |         /* there are only 15 bits for the length in the first serialized word */  | 
1554  | 0  |         ec=U_INDEX_OUTOFBOUNDS_ERROR;  | 
1555  | 0  |         return 0;  | 
1556  | 0  |     }  | 
1557  |  |  | 
1558  |  |     /*  | 
1559  |  |      * total serialized length:  | 
1560  |  |      * number of 16-bit array units (length) +  | 
1561  |  |      * 1 length unit (always) +  | 
1562  |  |      * 1 bmpLength unit (if there are supplementary values)  | 
1563  |  |      */  | 
1564  | 0  |     destLength=length+((length>bmpLength)?2:1);  | 
1565  | 0  |     if (destLength<=destCapacity) { | 
1566  | 0  |         const UChar32 *p;  | 
1567  | 0  |         int32_t i;  | 
1568  |  | 
  | 
1569  |  | #ifdef DEBUG_SERIALIZE  | 
1570  |  |         printf("writeHdr\n"); | 
1571  |  | #endif  | 
1572  | 0  |         *dest=(uint16_t)length;  | 
1573  | 0  |         if (length>bmpLength) { | 
1574  | 0  |             *dest|=0x8000;  | 
1575  | 0  |             *++dest=(uint16_t)bmpLength;  | 
1576  | 0  |         }  | 
1577  | 0  |         ++dest;  | 
1578  |  |  | 
1579  |  |         /* write the BMP part of the array */  | 
1580  | 0  |         p=this->list;  | 
1581  | 0  |         for (i=0; i<bmpLength; ++i) { | 
1582  |  | #ifdef DEBUG_SERIALIZE  | 
1583  |  |           printf("writebmp: %x\n", (int)*p); | 
1584  |  | #endif  | 
1585  | 0  |             *dest++=(uint16_t)*p++;  | 
1586  | 0  |         }  | 
1587  |  |  | 
1588  |  |         /* write the supplementary part of the array */  | 
1589  | 0  |         for (; i<length; i+=2) { | 
1590  |  | #ifdef DEBUG_SERIALIZE  | 
1591  |  |           printf("write32: %x\n", (int)*p); | 
1592  |  | #endif  | 
1593  | 0  |             *dest++=(uint16_t)(*p>>16);  | 
1594  | 0  |             *dest++=(uint16_t)*p++;  | 
1595  | 0  |         }  | 
1596  | 0  |     } else { | 
1597  | 0  |         ec=U_BUFFER_OVERFLOW_ERROR;  | 
1598  | 0  |     }  | 
1599  | 0  |     return destLength;  | 
1600  | 0  | }  | 
1601  |  |  | 
1602  |  | //----------------------------------------------------------------  | 
1603  |  | // Implementation: Utility methods  | 
1604  |  | //----------------------------------------------------------------  | 
1605  |  |  | 
1606  |  | /**  | 
1607  |  |  * Allocate our strings vector and return TRUE if successful.  | 
1608  |  |  */  | 
1609  | 0  | UBool UnicodeSet::allocateStrings(UErrorCode &status) { | 
1610  | 0  |     if (U_FAILURE(status)) { | 
1611  | 0  |         return FALSE;  | 
1612  | 0  |     }  | 
1613  | 0  |     strings = new UVector(uprv_deleteUObject,  | 
1614  | 0  |                           uhash_compareUnicodeString, 1, status);  | 
1615  | 0  |     if (strings == NULL) { // Check for memory allocation error. | 
1616  | 0  |         status = U_MEMORY_ALLOCATION_ERROR;  | 
1617  | 0  |         return FALSE;  | 
1618  | 0  |     }  | 
1619  | 0  |     if (U_FAILURE(status)) { | 
1620  | 0  |         delete strings;  | 
1621  | 0  |         strings = NULL;  | 
1622  | 0  |         return FALSE;  | 
1623  | 0  |     }   | 
1624  | 0  |     return TRUE;  | 
1625  | 0  | }  | 
1626  |  |  | 
1627  | 0  | int32_t UnicodeSet::nextCapacity(int32_t minCapacity) { | 
1628  |  |     // Grow exponentially to reduce the frequency of allocations.  | 
1629  | 0  |     if (minCapacity < INITIAL_CAPACITY) { | 
1630  | 0  |         return minCapacity + INITIAL_CAPACITY;  | 
1631  | 0  |     } else if (minCapacity <= 2500) { | 
1632  | 0  |         return 5 * minCapacity;  | 
1633  | 0  |     } else { | 
1634  | 0  |         int32_t newCapacity = 2 * minCapacity;  | 
1635  | 0  |         if (newCapacity > MAX_LENGTH) { | 
1636  | 0  |             newCapacity = MAX_LENGTH;  | 
1637  | 0  |         }  | 
1638  | 0  |         return newCapacity;  | 
1639  | 0  |     }  | 
1640  | 0  | }  | 
1641  |  |  | 
1642  | 0  | bool UnicodeSet::ensureCapacity(int32_t newLen) { | 
1643  | 0  |     if (newLen > MAX_LENGTH) { | 
1644  | 0  |         newLen = MAX_LENGTH;  | 
1645  | 0  |     }  | 
1646  | 0  |     if (newLen <= capacity) { | 
1647  | 0  |         return true;  | 
1648  | 0  |     }  | 
1649  | 0  |     int32_t newCapacity = nextCapacity(newLen);  | 
1650  | 0  |     UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));  | 
1651  | 0  |     if (temp == NULL) { | 
1652  | 0  |         setToBogus(); // set the object to bogus state if an OOM failure occurred.  | 
1653  | 0  |         return false;  | 
1654  | 0  |     }  | 
1655  |  |     // Copy only the actual contents.  | 
1656  | 0  |     uprv_memcpy(temp, list, len * sizeof(UChar32));  | 
1657  | 0  |     if (list != stackList) { | 
1658  | 0  |         uprv_free(list);  | 
1659  | 0  |     }  | 
1660  | 0  |     list = temp;  | 
1661  | 0  |     capacity = newCapacity;  | 
1662  | 0  |     return true;  | 
1663  | 0  | }  | 
1664  |  |  | 
1665  | 0  | bool UnicodeSet::ensureBufferCapacity(int32_t newLen) { | 
1666  | 0  |     if (newLen > MAX_LENGTH) { | 
1667  | 0  |         newLen = MAX_LENGTH;  | 
1668  | 0  |     }  | 
1669  | 0  |     if (newLen <= bufferCapacity) { | 
1670  | 0  |         return true;  | 
1671  | 0  |     }  | 
1672  | 0  |     int32_t newCapacity = nextCapacity(newLen);  | 
1673  | 0  |     UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));  | 
1674  | 0  |     if (temp == NULL) { | 
1675  | 0  |         setToBogus();  | 
1676  | 0  |         return false;  | 
1677  | 0  |     }  | 
1678  |  |     // The buffer has no contents to be copied.  | 
1679  |  |     // It is always filled from scratch after this call.  | 
1680  | 0  |     if (buffer != stackList) { | 
1681  | 0  |         uprv_free(buffer);  | 
1682  | 0  |     }  | 
1683  | 0  |     buffer = temp;  | 
1684  | 0  |     bufferCapacity = newCapacity;  | 
1685  | 0  |     return true;  | 
1686  | 0  | }  | 
1687  |  |  | 
1688  |  | /**  | 
1689  |  |  * Swap list and buffer.  | 
1690  |  |  */  | 
1691  | 0  | void UnicodeSet::swapBuffers(void) { | 
1692  |  |     // swap list and buffer  | 
1693  | 0  |     UChar32* temp = list;  | 
1694  | 0  |     list = buffer;  | 
1695  | 0  |     buffer = temp;  | 
1696  |  | 
  | 
1697  | 0  |     int32_t c = capacity;  | 
1698  | 0  |     capacity = bufferCapacity;  | 
1699  | 0  |     bufferCapacity = c;  | 
1700  | 0  | }  | 
1701  |  |  | 
1702  | 0  | void UnicodeSet::setToBogus() { | 
1703  | 0  |     clear(); // Remove everything in the set.  | 
1704  | 0  |     fFlags = kIsBogus;  | 
1705  | 0  | }  | 
1706  |  |  | 
1707  |  | //----------------------------------------------------------------  | 
1708  |  | // Implementation: Fundamental operators  | 
1709  |  | //----------------------------------------------------------------  | 
1710  |  |  | 
1711  | 0  | static inline UChar32 max(UChar32 a, UChar32 b) { | 
1712  | 0  |     return (a > b) ? a : b;  | 
1713  | 0  | }  | 
1714  |  |  | 
1715  |  | // polarity = 0, 3 is normal: x xor y  | 
1716  |  | // polarity = 1, 2: x xor ~y == x === y  | 
1717  |  |  | 
1718  | 0  | void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) { | 
1719  | 0  |     if (isFrozen() || isBogus()) { | 
1720  | 0  |         return;  | 
1721  | 0  |     }  | 
1722  | 0  |     if (!ensureBufferCapacity(len + otherLen)) { | 
1723  | 0  |         return;  | 
1724  | 0  |     }  | 
1725  |  |  | 
1726  | 0  |     int32_t i = 0, j = 0, k = 0;  | 
1727  | 0  |     UChar32 a = list[i++];  | 
1728  | 0  |     UChar32 b;  | 
1729  | 0  |     if (polarity == 1 || polarity == 2) { | 
1730  | 0  |         b = UNICODESET_LOW;  | 
1731  | 0  |         if (other[j] == UNICODESET_LOW) { // skip base if already LOW | 
1732  | 0  |             ++j;  | 
1733  | 0  |             b = other[j];  | 
1734  | 0  |         }  | 
1735  | 0  |     } else { | 
1736  | 0  |         b = other[j++];  | 
1737  | 0  |     }  | 
1738  |  |     // simplest of all the routines  | 
1739  |  |     // sort the values, discarding identicals!  | 
1740  | 0  |     for (;;) { | 
1741  | 0  |         if (a < b) { | 
1742  | 0  |             buffer[k++] = a;  | 
1743  | 0  |             a = list[i++];  | 
1744  | 0  |         } else if (b < a) { | 
1745  | 0  |             buffer[k++] = b;  | 
1746  | 0  |             b = other[j++];  | 
1747  | 0  |         } else if (a != UNICODESET_HIGH) { // at this point, a == b | 
1748  |  |             // discard both values!  | 
1749  | 0  |             a = list[i++];  | 
1750  | 0  |             b = other[j++];  | 
1751  | 0  |         } else { // DONE! | 
1752  | 0  |             buffer[k++] = UNICODESET_HIGH;  | 
1753  | 0  |             len = k;  | 
1754  | 0  |             break;  | 
1755  | 0  |         }  | 
1756  | 0  |     }  | 
1757  | 0  |     swapBuffers();  | 
1758  | 0  |     releasePattern();  | 
1759  | 0  | }  | 
1760  |  |  | 
1761  |  | // polarity = 0 is normal: x union y  | 
1762  |  | // polarity = 2: x union ~y  | 
1763  |  | // polarity = 1: ~x union y  | 
1764  |  | // polarity = 3: ~x union ~y  | 
1765  |  |  | 
1766  | 0  | void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) { | 
1767  | 0  |     if (isFrozen() || isBogus() || other==NULL) { | 
1768  | 0  |         return;  | 
1769  | 0  |     }  | 
1770  | 0  |     if (!ensureBufferCapacity(len + otherLen)) { | 
1771  | 0  |         return;  | 
1772  | 0  |     }  | 
1773  |  |  | 
1774  | 0  |     int32_t i = 0, j = 0, k = 0;  | 
1775  | 0  |     UChar32 a = list[i++];  | 
1776  | 0  |     UChar32 b = other[j++];  | 
1777  |  |     // change from xor is that we have to check overlapping pairs  | 
1778  |  |     // polarity bit 1 means a is second, bit 2 means b is.  | 
1779  | 0  |     for (;;) { | 
1780  | 0  |         switch (polarity) { | 
1781  | 0  |           case 0: // both first; take lower if unequal  | 
1782  | 0  |             if (a < b) { // take a | 
1783  |  |                 // Back up over overlapping ranges in buffer[]  | 
1784  | 0  |                 if (k > 0 && a <= buffer[k-1]) { | 
1785  |  |                     // Pick latter end value in buffer[] vs. list[]  | 
1786  | 0  |                     a = max(list[i], buffer[--k]);  | 
1787  | 0  |                 } else { | 
1788  |  |                     // No overlap  | 
1789  | 0  |                     buffer[k++] = a;  | 
1790  | 0  |                     a = list[i];  | 
1791  | 0  |                 }  | 
1792  | 0  |                 i++; // Common if/else code factored out  | 
1793  | 0  |                 polarity ^= 1;  | 
1794  | 0  |             } else if (b < a) { // take b | 
1795  | 0  |                 if (k > 0 && b <= buffer[k-1]) { | 
1796  | 0  |                     b = max(other[j], buffer[--k]);  | 
1797  | 0  |                 } else { | 
1798  | 0  |                     buffer[k++] = b;  | 
1799  | 0  |                     b = other[j];  | 
1800  | 0  |                 }  | 
1801  | 0  |                 j++;  | 
1802  | 0  |                 polarity ^= 2;  | 
1803  | 0  |             } else { // a == b, take a, drop b | 
1804  | 0  |                 if (a == UNICODESET_HIGH) goto loop_end;  | 
1805  |  |                 // This is symmetrical; it doesn't matter if  | 
1806  |  |                 // we backtrack with a or b. - liu  | 
1807  | 0  |                 if (k > 0 && a <= buffer[k-1]) { | 
1808  | 0  |                     a = max(list[i], buffer[--k]);  | 
1809  | 0  |                 } else { | 
1810  |  |                     // No overlap  | 
1811  | 0  |                     buffer[k++] = a;  | 
1812  | 0  |                     a = list[i];  | 
1813  | 0  |                 }  | 
1814  | 0  |                 i++;  | 
1815  | 0  |                 polarity ^= 1;  | 
1816  | 0  |                 b = other[j++];  | 
1817  | 0  |                 polarity ^= 2;  | 
1818  | 0  |             }  | 
1819  | 0  |             break;  | 
1820  | 0  |           case 3: // both second; take higher if unequal, and drop other  | 
1821  | 0  |             if (b <= a) { // take a | 
1822  | 0  |                 if (a == UNICODESET_HIGH) goto loop_end;  | 
1823  | 0  |                 buffer[k++] = a;  | 
1824  | 0  |             } else { // take b | 
1825  | 0  |                 if (b == UNICODESET_HIGH) goto loop_end;  | 
1826  | 0  |                 buffer[k++] = b;  | 
1827  | 0  |             }  | 
1828  | 0  |             a = list[i++];  | 
1829  | 0  |             polarity ^= 1;   // factored common code  | 
1830  | 0  |             b = other[j++];  | 
1831  | 0  |             polarity ^= 2;  | 
1832  | 0  |             break;  | 
1833  | 0  |           case 1: // a second, b first; if b < a, overlap  | 
1834  | 0  |             if (a < b) { // no overlap, take a | 
1835  | 0  |                 buffer[k++] = a; a = list[i++]; polarity ^= 1;  | 
1836  | 0  |             } else if (b < a) { // OVERLAP, drop b | 
1837  | 0  |                 b = other[j++];  | 
1838  | 0  |                 polarity ^= 2;  | 
1839  | 0  |             } else { // a == b, drop both! | 
1840  | 0  |                 if (a == UNICODESET_HIGH) goto loop_end;  | 
1841  | 0  |                 a = list[i++];  | 
1842  | 0  |                 polarity ^= 1;  | 
1843  | 0  |                 b = other[j++];  | 
1844  | 0  |                 polarity ^= 2;  | 
1845  | 0  |             }  | 
1846  | 0  |             break;  | 
1847  | 0  |           case 2: // a first, b second; if a < b, overlap  | 
1848  | 0  |             if (b < a) { // no overlap, take b | 
1849  | 0  |                 buffer[k++] = b;  | 
1850  | 0  |                 b = other[j++];  | 
1851  | 0  |                 polarity ^= 2;  | 
1852  | 0  |             } else  if (a < b) { // OVERLAP, drop a | 
1853  | 0  |                 a = list[i++];  | 
1854  | 0  |                 polarity ^= 1;  | 
1855  | 0  |             } else { // a == b, drop both! | 
1856  | 0  |                 if (a == UNICODESET_HIGH) goto loop_end;  | 
1857  | 0  |                 a = list[i++];  | 
1858  | 0  |                 polarity ^= 1;  | 
1859  | 0  |                 b = other[j++];  | 
1860  | 0  |                 polarity ^= 2;  | 
1861  | 0  |             }  | 
1862  | 0  |             break;  | 
1863  | 0  |         }  | 
1864  | 0  |     }  | 
1865  | 0  |  loop_end:  | 
1866  | 0  |     buffer[k++] = UNICODESET_HIGH;    // terminate  | 
1867  | 0  |     len = k;  | 
1868  | 0  |     swapBuffers();  | 
1869  | 0  |     releasePattern();  | 
1870  | 0  | }  | 
1871  |  |  | 
1872  |  | // polarity = 0 is normal: x intersect y  | 
1873  |  | // polarity = 2: x intersect ~y == set-minus  | 
1874  |  | // polarity = 1: ~x intersect y  | 
1875  |  | // polarity = 3: ~x intersect ~y  | 
1876  |  |  | 
1877  | 0  | void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) { | 
1878  | 0  |     if (isFrozen() || isBogus()) { | 
1879  | 0  |         return;  | 
1880  | 0  |     }  | 
1881  | 0  |     if (!ensureBufferCapacity(len + otherLen)) { | 
1882  | 0  |         return;  | 
1883  | 0  |     }  | 
1884  |  |  | 
1885  | 0  |     int32_t i = 0, j = 0, k = 0;  | 
1886  | 0  |     UChar32 a = list[i++];  | 
1887  | 0  |     UChar32 b = other[j++];  | 
1888  |  |     // change from xor is that we have to check overlapping pairs  | 
1889  |  |     // polarity bit 1 means a is second, bit 2 means b is.  | 
1890  | 0  |     for (;;) { | 
1891  | 0  |         switch (polarity) { | 
1892  | 0  |           case 0: // both first; drop the smaller  | 
1893  | 0  |             if (a < b) { // drop a | 
1894  | 0  |                 a = list[i++];  | 
1895  | 0  |                 polarity ^= 1;  | 
1896  | 0  |             } else if (b < a) { // drop b | 
1897  | 0  |                 b = other[j++];  | 
1898  | 0  |                 polarity ^= 2;  | 
1899  | 0  |             } else { // a == b, take one, drop other | 
1900  | 0  |                 if (a == UNICODESET_HIGH) goto loop_end;  | 
1901  | 0  |                 buffer[k++] = a;  | 
1902  | 0  |                 a = list[i++];  | 
1903  | 0  |                 polarity ^= 1;  | 
1904  | 0  |                 b = other[j++];  | 
1905  | 0  |                 polarity ^= 2;  | 
1906  | 0  |             }  | 
1907  | 0  |             break;  | 
1908  | 0  |           case 3: // both second; take lower if unequal  | 
1909  | 0  |             if (a < b) { // take a | 
1910  | 0  |                 buffer[k++] = a;  | 
1911  | 0  |                 a = list[i++];  | 
1912  | 0  |                 polarity ^= 1;  | 
1913  | 0  |             } else if (b < a) { // take b | 
1914  | 0  |                 buffer[k++] = b;  | 
1915  | 0  |                 b = other[j++];  | 
1916  | 0  |                 polarity ^= 2;  | 
1917  | 0  |             } else { // a == b, take one, drop other | 
1918  | 0  |                 if (a == UNICODESET_HIGH) goto loop_end;  | 
1919  | 0  |                 buffer[k++] = a;  | 
1920  | 0  |                 a = list[i++];  | 
1921  | 0  |                 polarity ^= 1;  | 
1922  | 0  |                 b = other[j++];  | 
1923  | 0  |                 polarity ^= 2;  | 
1924  | 0  |             }  | 
1925  | 0  |             break;  | 
1926  | 0  |           case 1: // a second, b first;  | 
1927  | 0  |             if (a < b) { // NO OVERLAP, drop a | 
1928  | 0  |                 a = list[i++];  | 
1929  | 0  |                 polarity ^= 1;  | 
1930  | 0  |             } else if (b < a) { // OVERLAP, take b | 
1931  | 0  |                 buffer[k++] = b;  | 
1932  | 0  |                 b = other[j++];  | 
1933  | 0  |                 polarity ^= 2;  | 
1934  | 0  |             } else { // a == b, drop both! | 
1935  | 0  |                 if (a == UNICODESET_HIGH) goto loop_end;  | 
1936  | 0  |                 a = list[i++];  | 
1937  | 0  |                 polarity ^= 1;  | 
1938  | 0  |                 b = other[j++];  | 
1939  | 0  |                 polarity ^= 2;  | 
1940  | 0  |             }  | 
1941  | 0  |             break;  | 
1942  | 0  |           case 2: // a first, b second; if a < b, overlap  | 
1943  | 0  |             if (b < a) { // no overlap, drop b | 
1944  | 0  |                 b = other[j++];  | 
1945  | 0  |                 polarity ^= 2;  | 
1946  | 0  |             } else  if (a < b) { // OVERLAP, take a | 
1947  | 0  |                 buffer[k++] = a;  | 
1948  | 0  |                 a = list[i++];  | 
1949  | 0  |                 polarity ^= 1;  | 
1950  | 0  |             } else { // a == b, drop both! | 
1951  | 0  |                 if (a == UNICODESET_HIGH) goto loop_end;  | 
1952  | 0  |                 a = list[i++];  | 
1953  | 0  |                 polarity ^= 1;  | 
1954  | 0  |                 b = other[j++];  | 
1955  | 0  |                 polarity ^= 2;  | 
1956  | 0  |             }  | 
1957  | 0  |             break;  | 
1958  | 0  |         }  | 
1959  | 0  |     }  | 
1960  | 0  |  loop_end:  | 
1961  | 0  |     buffer[k++] = UNICODESET_HIGH;    // terminate  | 
1962  | 0  |     len = k;  | 
1963  | 0  |     swapBuffers();  | 
1964  | 0  |     releasePattern();  | 
1965  | 0  | }  | 
1966  |  |  | 
1967  |  | /**  | 
1968  |  |  * Append the <code>toPattern()</code> representation of a  | 
1969  |  |  * string to the given <code>StringBuffer</code>.  | 
1970  |  |  */  | 
1971  |  | void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool  | 
1972  | 0  | escapeUnprintable) { | 
1973  | 0  |     UChar32 cp;  | 
1974  | 0  |     for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) { | 
1975  | 0  |         _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);  | 
1976  | 0  |     }  | 
1977  | 0  | }  | 
1978  |  |  | 
1979  |  | /**  | 
1980  |  |  * Append the <code>toPattern()</code> representation of a  | 
1981  |  |  * character to the given <code>StringBuffer</code>.  | 
1982  |  |  */  | 
1983  |  | void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool  | 
1984  | 0  | escapeUnprintable) { | 
1985  | 0  |     if (escapeUnprintable && ICU_Utility::isUnprintable(c)) { | 
1986  |  |         // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything  | 
1987  |  |         // unprintable  | 
1988  | 0  |         if (ICU_Utility::escapeUnprintable(buf, c)) { | 
1989  | 0  |             return;  | 
1990  | 0  |         }  | 
1991  | 0  |     }  | 
1992  |  |     // Okay to let ':' pass through  | 
1993  | 0  |     switch (c) { | 
1994  | 0  |     case u'[':  | 
1995  | 0  |     case u']':  | 
1996  | 0  |     case u'-':  | 
1997  | 0  |     case u'^':  | 
1998  | 0  |     case u'&':  | 
1999  | 0  |     case u'\\':  | 
2000  | 0  |     case u'{': | 
2001  | 0  |     case u'}':  | 
2002  | 0  |     case u':':  | 
2003  | 0  |     case SymbolTable::SYMBOL_REF:  | 
2004  | 0  |         buf.append(u'\\');  | 
2005  | 0  |         break;  | 
2006  | 0  |     default:  | 
2007  |  |         // Escape whitespace  | 
2008  | 0  |         if (PatternProps::isWhiteSpace(c)) { | 
2009  | 0  |             buf.append(u'\\');  | 
2010  | 0  |         }  | 
2011  | 0  |         break;  | 
2012  | 0  |     }  | 
2013  | 0  |     buf.append(c);  | 
2014  | 0  | }  | 
2015  |  |  | 
2016  |  | /**  | 
2017  |  |  * Append a string representation of this set to result.  This will be  | 
2018  |  |  * a cleaned version of the string passed to applyPattern(), if there  | 
2019  |  |  * is one.  Otherwise it will be generated.  | 
2020  |  |  */  | 
2021  |  | UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,  | 
2022  |  |                                       UBool escapeUnprintable) const  | 
2023  | 0  | { | 
2024  | 0  |     if (pat != NULL) { | 
2025  | 0  |         int32_t i;  | 
2026  | 0  |         int32_t backslashCount = 0;  | 
2027  | 0  |         for (i=0; i<patLen; ) { | 
2028  | 0  |             UChar32 c;  | 
2029  | 0  |             U16_NEXT(pat, i, patLen, c);  | 
2030  | 0  |             if (escapeUnprintable && ICU_Utility::isUnprintable(c)) { | 
2031  |  |                 // If the unprintable character is preceded by an odd  | 
2032  |  |                 // number of backslashes, then it has been escaped.  | 
2033  |  |                 // Before unescaping it, we delete the final  | 
2034  |  |                 // backslash.  | 
2035  | 0  |                 if ((backslashCount % 2) == 1) { | 
2036  | 0  |                     result.truncate(result.length() - 1);  | 
2037  | 0  |                 }  | 
2038  | 0  |                 ICU_Utility::escapeUnprintable(result, c);  | 
2039  | 0  |                 backslashCount = 0;  | 
2040  | 0  |             } else { | 
2041  | 0  |                 result.append(c);  | 
2042  | 0  |                 if (c == u'\\') { | 
2043  | 0  |                     ++backslashCount;  | 
2044  | 0  |                 } else { | 
2045  | 0  |                     backslashCount = 0;  | 
2046  | 0  |                 }  | 
2047  | 0  |             }  | 
2048  | 0  |         }  | 
2049  | 0  |         return result;  | 
2050  | 0  |     }  | 
2051  |  |  | 
2052  | 0  |     return _generatePattern(result, escapeUnprintable);  | 
2053  | 0  | }  | 
2054  |  |  | 
2055  |  | /**  | 
2056  |  |  * Returns a string representation of this set.  If the result of  | 
2057  |  |  * calling this function is passed to a UnicodeSet constructor, it  | 
2058  |  |  * will produce another set that is equal to this one.  | 
2059  |  |  */  | 
2060  |  | UnicodeString& UnicodeSet::toPattern(UnicodeString& result,  | 
2061  |  |                                      UBool escapeUnprintable) const  | 
2062  | 0  | { | 
2063  | 0  |     result.truncate(0);  | 
2064  | 0  |     return _toPattern(result, escapeUnprintable);  | 
2065  | 0  | }  | 
2066  |  |  | 
2067  |  | /**  | 
2068  |  |  * Generate and append a string representation of this set to result.  | 
2069  |  |  * This does not use this.pat, the cleaned up copy of the string  | 
2070  |  |  * passed to applyPattern().  | 
2071  |  |  */  | 
2072  |  | UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,  | 
2073  |  |                                             UBool escapeUnprintable) const  | 
2074  | 0  | { | 
2075  | 0  |     result.append(u'[');  | 
2076  |  |  | 
2077  |  | //  // Check against the predefined categories.  We implicitly build  | 
2078  |  | //  // up ALL category sets the first time toPattern() is called.  | 
2079  |  | //  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) { | 
2080  |  | //      if (*this == getCategorySet(cat)) { | 
2081  |  | //          result.append(u':');  | 
2082  |  | //          result.append(CATEGORY_NAMES, cat*2, 2);  | 
2083  |  | //          return result.append(CATEGORY_CLOSE);  | 
2084  |  | //      }  | 
2085  |  | //  }  | 
2086  |  | 
  | 
2087  | 0  |     int32_t count = getRangeCount();  | 
2088  |  |  | 
2089  |  |     // If the set contains at least 2 intervals and includes both  | 
2090  |  |     // MIN_VALUE and MAX_VALUE, then the inverse representation will  | 
2091  |  |     // be more economical.  | 
2092  | 0  |     if (count > 1 &&  | 
2093  | 0  |         getRangeStart(0) == MIN_VALUE &&  | 
2094  | 0  |         getRangeEnd(count-1) == MAX_VALUE) { | 
2095  |  |  | 
2096  |  |         // Emit the inverse  | 
2097  | 0  |         result.append(u'^');  | 
2098  |  | 
  | 
2099  | 0  |         for (int32_t i = 1; i < count; ++i) { | 
2100  | 0  |             UChar32 start = getRangeEnd(i-1)+1;  | 
2101  | 0  |             UChar32 end = getRangeStart(i)-1;  | 
2102  | 0  |             _appendToPat(result, start, escapeUnprintable);  | 
2103  | 0  |             if (start != end) { | 
2104  | 0  |                 if ((start+1) != end) { | 
2105  | 0  |                     result.append(u'-');  | 
2106  | 0  |                 }  | 
2107  | 0  |                 _appendToPat(result, end, escapeUnprintable);  | 
2108  | 0  |             }  | 
2109  | 0  |         }  | 
2110  | 0  |     }  | 
2111  |  |  | 
2112  |  |     // Default; emit the ranges as pairs  | 
2113  | 0  |     else { | 
2114  | 0  |         for (int32_t i = 0; i < count; ++i) { | 
2115  | 0  |             UChar32 start = getRangeStart(i);  | 
2116  | 0  |             UChar32 end = getRangeEnd(i);  | 
2117  | 0  |             _appendToPat(result, start, escapeUnprintable);  | 
2118  | 0  |             if (start != end) { | 
2119  | 0  |                 if ((start+1) != end) { | 
2120  | 0  |                     result.append(u'-');  | 
2121  | 0  |                 }  | 
2122  | 0  |                 _appendToPat(result, end, escapeUnprintable);  | 
2123  | 0  |             }  | 
2124  | 0  |         }  | 
2125  | 0  |     }  | 
2126  |  | 
  | 
2127  | 0  |     if (strings != nullptr) { | 
2128  | 0  |         for (int32_t i = 0; i<strings->size(); ++i) { | 
2129  | 0  |             result.append(u'{'); | 
2130  | 0  |             _appendToPat(result,  | 
2131  | 0  |                          *(const UnicodeString*) strings->elementAt(i),  | 
2132  | 0  |                          escapeUnprintable);  | 
2133  | 0  |             result.append(u'}');  | 
2134  | 0  |         }  | 
2135  | 0  |     }  | 
2136  | 0  |     return result.append(u']');  | 
2137  | 0  | }  | 
2138  |  |  | 
2139  |  | /**  | 
2140  |  | * Release existing cached pattern  | 
2141  |  | */  | 
2142  | 0  | void UnicodeSet::releasePattern() { | 
2143  | 0  |     if (pat) { | 
2144  | 0  |         uprv_free(pat);  | 
2145  | 0  |         pat = NULL;  | 
2146  | 0  |         patLen = 0;  | 
2147  | 0  |     }  | 
2148  | 0  | }  | 
2149  |  |  | 
2150  |  | /**  | 
2151  |  | * Set the new pattern to cache.  | 
2152  |  | */  | 
2153  | 0  | void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) { | 
2154  | 0  |     releasePattern();  | 
2155  | 0  |     pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));  | 
2156  | 0  |     if (pat) { | 
2157  | 0  |         patLen = newPatLen;  | 
2158  | 0  |         u_memcpy(pat, newPat, patLen);  | 
2159  | 0  |         pat[patLen] = 0;  | 
2160  | 0  |     }  | 
2161  |  |     // else we don't care if malloc failed. This was just a nice cache.  | 
2162  |  |     // We can regenerate an equivalent pattern later when requested.  | 
2163  | 0  | }  | 
2164  |  |  | 
2165  | 0  | UnicodeSet *UnicodeSet::freeze() { | 
2166  | 0  |     if(!isFrozen() && !isBogus()) { | 
2167  | 0  |         compact();  | 
2168  |  |  | 
2169  |  |         // Optimize contains() and span() and similar functions.  | 
2170  | 0  |         if (hasStrings()) { | 
2171  | 0  |             stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);  | 
2172  | 0  |             if (stringSpan == nullptr) { | 
2173  | 0  |                 setToBogus();  | 
2174  | 0  |                 return this;  | 
2175  | 0  |             } else if (!stringSpan->needsStringSpanUTF16()) { | 
2176  |  |                 // All strings are irrelevant for span() etc. because  | 
2177  |  |                 // all of each string's code points are contained in this set.  | 
2178  |  |                 // Do not check needsStringSpanUTF8() because UTF-8 has at most as  | 
2179  |  |                 // many relevant strings as UTF-16.  | 
2180  |  |                 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)  | 
2181  | 0  |                 delete stringSpan;  | 
2182  | 0  |                 stringSpan = NULL;  | 
2183  | 0  |             }  | 
2184  | 0  |         }  | 
2185  | 0  |         if (stringSpan == NULL) { | 
2186  |  |             // No span-relevant strings: Optimize for code point spans.  | 
2187  | 0  |             bmpSet=new BMPSet(list, len);  | 
2188  | 0  |             if (bmpSet == NULL) { // Check for memory allocation error. | 
2189  | 0  |                 setToBogus();  | 
2190  | 0  |             }  | 
2191  | 0  |         }  | 
2192  | 0  |     }  | 
2193  | 0  |     return this;  | 
2194  | 0  | }  | 
2195  |  |  | 
2196  | 0  | int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { | 
2197  | 0  |     if(length>0 && bmpSet!=NULL) { | 
2198  | 0  |         return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);  | 
2199  | 0  |     }  | 
2200  | 0  |     if(length<0) { | 
2201  | 0  |         length=u_strlen(s);  | 
2202  | 0  |     }  | 
2203  | 0  |     if(length==0) { | 
2204  | 0  |         return 0;  | 
2205  | 0  |     }  | 
2206  | 0  |     if(stringSpan!=NULL) { | 
2207  | 0  |         return stringSpan->span(s, length, spanCondition);  | 
2208  | 0  |     } else if(hasStrings()) { | 
2209  | 0  |         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?  | 
2210  | 0  |                             UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :  | 
2211  | 0  |                             UnicodeSetStringSpan::FWD_UTF16_CONTAINED;  | 
2212  | 0  |         UnicodeSetStringSpan strSpan(*this, *strings, which);  | 
2213  | 0  |         if(strSpan.needsStringSpanUTF16()) { | 
2214  | 0  |             return strSpan.span(s, length, spanCondition);  | 
2215  | 0  |         }  | 
2216  | 0  |     }  | 
2217  |  |  | 
2218  | 0  |     if(spanCondition!=USET_SPAN_NOT_CONTAINED) { | 
2219  | 0  |         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.  | 
2220  | 0  |     }  | 
2221  |  | 
  | 
2222  | 0  |     UChar32 c;  | 
2223  | 0  |     int32_t start=0, prev=0;  | 
2224  | 0  |     do { | 
2225  | 0  |         U16_NEXT(s, start, length, c);  | 
2226  | 0  |         if(spanCondition!=contains(c)) { | 
2227  | 0  |             break;  | 
2228  | 0  |         }  | 
2229  | 0  |     } while((prev=start)<length);  | 
2230  | 0  |     return prev;  | 
2231  | 0  | }  | 
2232  |  |  | 
2233  | 0  | int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { | 
2234  | 0  |     if(length>0 && bmpSet!=NULL) { | 
2235  | 0  |         return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);  | 
2236  | 0  |     }  | 
2237  | 0  |     if(length<0) { | 
2238  | 0  |         length=u_strlen(s);  | 
2239  | 0  |     }  | 
2240  | 0  |     if(length==0) { | 
2241  | 0  |         return 0;  | 
2242  | 0  |     }  | 
2243  | 0  |     if(stringSpan!=NULL) { | 
2244  | 0  |         return stringSpan->spanBack(s, length, spanCondition);  | 
2245  | 0  |     } else if(hasStrings()) { | 
2246  | 0  |         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?  | 
2247  | 0  |                             UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :  | 
2248  | 0  |                             UnicodeSetStringSpan::BACK_UTF16_CONTAINED;  | 
2249  | 0  |         UnicodeSetStringSpan strSpan(*this, *strings, which);  | 
2250  | 0  |         if(strSpan.needsStringSpanUTF16()) { | 
2251  | 0  |             return strSpan.spanBack(s, length, spanCondition);  | 
2252  | 0  |         }  | 
2253  | 0  |     }  | 
2254  |  |  | 
2255  | 0  |     if(spanCondition!=USET_SPAN_NOT_CONTAINED) { | 
2256  | 0  |         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.  | 
2257  | 0  |     }  | 
2258  |  | 
  | 
2259  | 0  |     UChar32 c;  | 
2260  | 0  |     int32_t prev=length;  | 
2261  | 0  |     do { | 
2262  | 0  |         U16_PREV(s, 0, length, c);  | 
2263  | 0  |         if(spanCondition!=contains(c)) { | 
2264  | 0  |             break;  | 
2265  | 0  |         }  | 
2266  | 0  |     } while((prev=length)>0);  | 
2267  | 0  |     return prev;  | 
2268  | 0  | }  | 
2269  |  |  | 
2270  | 0  | int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const { | 
2271  | 0  |     if(length>0 && bmpSet!=NULL) { | 
2272  | 0  |         const uint8_t *s0=(const uint8_t *)s;  | 
2273  | 0  |         return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);  | 
2274  | 0  |     }  | 
2275  | 0  |     if(length<0) { | 
2276  | 0  |         length=(int32_t)uprv_strlen(s);  | 
2277  | 0  |     }  | 
2278  | 0  |     if(length==0) { | 
2279  | 0  |         return 0;  | 
2280  | 0  |     }  | 
2281  | 0  |     if(stringSpan!=NULL) { | 
2282  | 0  |         return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);  | 
2283  | 0  |     } else if(hasStrings()) { | 
2284  | 0  |         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?  | 
2285  | 0  |                             UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :  | 
2286  | 0  |                             UnicodeSetStringSpan::FWD_UTF8_CONTAINED;  | 
2287  | 0  |         UnicodeSetStringSpan strSpan(*this, *strings, which);  | 
2288  | 0  |         if(strSpan.needsStringSpanUTF8()) { | 
2289  | 0  |             return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);  | 
2290  | 0  |         }  | 
2291  | 0  |     }  | 
2292  |  |  | 
2293  | 0  |     if(spanCondition!=USET_SPAN_NOT_CONTAINED) { | 
2294  | 0  |         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.  | 
2295  | 0  |     }  | 
2296  |  | 
  | 
2297  | 0  |     UChar32 c;  | 
2298  | 0  |     int32_t start=0, prev=0;  | 
2299  | 0  |     do { | 
2300  | 0  |         U8_NEXT_OR_FFFD(s, start, length, c);  | 
2301  | 0  |         if(spanCondition!=contains(c)) { | 
2302  | 0  |             break;  | 
2303  | 0  |         }  | 
2304  | 0  |     } while((prev=start)<length);  | 
2305  | 0  |     return prev;  | 
2306  | 0  | }  | 
2307  |  |  | 
2308  | 0  | int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const { | 
2309  | 0  |     if(length>0 && bmpSet!=NULL) { | 
2310  | 0  |         const uint8_t *s0=(const uint8_t *)s;  | 
2311  | 0  |         return bmpSet->spanBackUTF8(s0, length, spanCondition);  | 
2312  | 0  |     }  | 
2313  | 0  |     if(length<0) { | 
2314  | 0  |         length=(int32_t)uprv_strlen(s);  | 
2315  | 0  |     }  | 
2316  | 0  |     if(length==0) { | 
2317  | 0  |         return 0;  | 
2318  | 0  |     }  | 
2319  | 0  |     if(stringSpan!=NULL) { | 
2320  | 0  |         return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);  | 
2321  | 0  |     } else if(hasStrings()) { | 
2322  | 0  |         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?  | 
2323  | 0  |                             UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :  | 
2324  | 0  |                             UnicodeSetStringSpan::BACK_UTF8_CONTAINED;  | 
2325  | 0  |         UnicodeSetStringSpan strSpan(*this, *strings, which);  | 
2326  | 0  |         if(strSpan.needsStringSpanUTF8()) { | 
2327  | 0  |             return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);  | 
2328  | 0  |         }  | 
2329  | 0  |     }  | 
2330  |  |  | 
2331  | 0  |     if(spanCondition!=USET_SPAN_NOT_CONTAINED) { | 
2332  | 0  |         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.  | 
2333  | 0  |     }  | 
2334  |  | 
  | 
2335  | 0  |     UChar32 c;  | 
2336  | 0  |     int32_t prev=length;  | 
2337  | 0  |     do { | 
2338  | 0  |         U8_PREV_OR_FFFD(s, 0, length, c);  | 
2339  | 0  |         if(spanCondition!=contains(c)) { | 
2340  | 0  |             break;  | 
2341  | 0  |         }  | 
2342  | 0  |     } while((prev=length)>0);  | 
2343  | 0  |     return prev;  | 
2344  | 0  | }  | 
2345  |  |  | 
2346  |  | U_NAMESPACE_END  |