Coverage Report

Created: 2026-06-07 06:21

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/common/uniset.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
**********************************************************************
5
*   Copyright (C) 1999-2015, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
**********************************************************************
8
*   Date        Name        Description
9
*   10/20/99    alan        Creation.
10
**********************************************************************
11
*/
12
13
#include "unicode/utypes.h"
14
#include "unicode/parsepos.h"
15
#include "unicode/symtable.h"
16
#include "unicode/uniset.h"
17
#include "unicode/ustring.h"
18
#include "unicode/utf8.h"
19
#include "unicode/utf16.h"
20
#include "ruleiter.h"
21
#include "cmemory.h"
22
#include "cstring.h"
23
#include "patternprops.h"
24
#include "uelement.h"
25
#include "util.h"
26
#include "uvector.h"
27
#include "charstr.h"
28
#include "ustrfmt.h"
29
#include "uassert.h"
30
#include "bmpset.h"
31
#include "unisetspan.h"
32
33
// HIGH_VALUE > all valid values. 110000 for codepoints
34
1.11G
#define UNICODESET_HIGH 0x0110000
35
36
// LOW <= all valid values. ZERO for codepoints
37
587M
#define UNICODESET_LOW 0x000000
38
39
/** Max list [0, 1, 2, ..., max code point, HIGH] */
40
constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1;
41
42
U_NAMESPACE_BEGIN
43
44
0
SymbolTable::~SymbolTable() {}
45
46
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
47
48
/**
49
 * Modify the given UChar32 variable so that it is in range, by
50
 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
51
 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
52
 * It modifies its argument in-place and also returns it.
53
 */
54
586M
static inline UChar32 pinCodePoint(UChar32& c) {
55
586M
    if (c < UNICODESET_LOW) {
56
370
        c = UNICODESET_LOW;
57
586M
    } else if (c > (UNICODESET_HIGH-1)) {
58
0
        c = (UNICODESET_HIGH-1);
59
0
    }
60
586M
    return c;
61
586M
}
62
63
//----------------------------------------------------------------
64
// Debugging
65
//----------------------------------------------------------------
66
67
// DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
68
// To enable the debugging, define the symbol DEBUG_MEM in the line
69
// below.  This will result in text being sent to stdout that looks
70
// like this:
71
//   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
72
//   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
73
// Each line lists a construction (ct) or destruction (dt) event, the
74
// object address, the number of outstanding objects after the event,
75
// and the pattern of the object in question.
76
77
// #define DEBUG_MEM
78
79
#ifdef DEBUG_MEM
80
#include <stdio.h>
81
static int32_t _dbgCount = 0;
82
83
static inline void _dbgct(UnicodeSet* set) {
84
    UnicodeString str;
85
    set->toPattern(str, true);
86
    char buf[40];
87
    str.extract(0, 39, buf, "");
88
    printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
89
}
90
91
static inline void _dbgdt(UnicodeSet* set) {
92
    UnicodeString str;
93
    set->toPattern(str, true);
94
    char buf[40];
95
    str.extract(0, 39, buf, "");
96
    printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
97
}
98
99
#else
100
101
#define _dbgct(set)
102
#define _dbgdt(set)
103
104
#endif
105
106
//----------------------------------------------------------------
107
// UnicodeString in UVector support
108
//----------------------------------------------------------------
109
110
5.11M
static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
111
5.11M
    dst->pointer = new UnicodeString(*static_cast<UnicodeString*>(src->pointer));
112
5.11M
}
113
114
23.4M
static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
115
23.4M
    const UnicodeString& a = *static_cast<const UnicodeString*>(t1.pointer);
116
23.4M
    const UnicodeString& b = *static_cast<const UnicodeString*>(t2.pointer);
117
23.4M
    return a.compare(b);
118
23.4M
}
119
120
167M
UBool UnicodeSet::hasStrings() const {
121
167M
    return strings_ != nullptr && !strings_->isEmpty();
122
167M
}
123
124
982k
int32_t UnicodeSet::stringsSize() const {
125
982k
    return strings_ == nullptr ? 0 : strings_->size();
126
982k
}
127
128
9.95M
UBool UnicodeSet::stringsContains(const UnicodeString &s) const {
129
9.95M
    return strings_ != nullptr && strings_->contains((void*) &s);
130
9.95M
}
131
132
//----------------------------------------------------------------
133
// Constructors &c
134
//----------------------------------------------------------------
135
136
/**
137
 * Constructs an empty set.
138
 */
139
42.8M
UnicodeSet::UnicodeSet() {
140
42.8M
    list[0] = UNICODESET_HIGH;
141
42.8M
    _dbgct(this);
142
42.8M
}
143
144
/**
145
 * Constructs a set containing the given range. If <code>end >
146
 * start</code> then an empty set is created.
147
 *
148
 * @param start first character, inclusive, of range
149
 * @param end last character, inclusive, of range
150
 */
151
558k
UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) {
152
558k
    list[0] = UNICODESET_HIGH;
153
558k
    add(start, end);
154
558k
    _dbgct(this);
155
558k
}
156
157
/**
158
 * Constructs a set that is identical to the given UnicodeSet.
159
 */
160
80.6M
UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) {
161
80.6M
    *this = o;
162
80.6M
    _dbgct(this);
163
80.6M
}
164
165
// Copy-construct as thawed.
166
844
UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) {
167
844
    if (ensureCapacity(o.len)) {
168
        // *this = o except for bmpSet and stringSpan
169
844
        len = o.len;
170
844
        uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
171
844
        if (o.hasStrings()) {
172
0
            UErrorCode status = U_ZERO_ERROR;
173
0
            if (!allocateStrings(status) ||
174
0
                    (strings_->assign(*o.strings_, cloneUnicodeString, status), U_FAILURE(status))) {
175
0
                setToBogus();
176
0
                return;
177
0
            }
178
0
        }
179
844
        if (o.pat) {
180
774
            setPattern(o.pat, o.patLen);
181
774
        }
182
844
        _dbgct(this);
183
844
    }
184
844
}
185
186
/**
187
 * Destructs the set.
188
 */
189
124M
UnicodeSet::~UnicodeSet() {
190
124M
    _dbgdt(this); // first!
191
124M
    if (list != stackList) {
192
1.34M
        uprv_free(list);
193
1.34M
    }
194
124M
    delete bmpSet;
195
124M
    if (buffer != stackList) {
196
123M
        uprv_free(buffer);
197
123M
    }
198
124M
    delete strings_;
199
124M
    delete stringSpan;
200
124M
    releasePattern();
201
124M
}
202
203
/**
204
 * Assigns this object to be a copy of another.
205
 */
206
81.4M
UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
207
81.4M
    return copyFrom(o, false);
208
81.4M
}
209
210
81.4M
UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
211
81.4M
    if (this == &o) {
212
0
        return *this;
213
0
    }
214
81.4M
    if (isFrozen()) {
215
0
        return *this;
216
0
    }
217
81.4M
    if (o.isBogus()) {
218
0
        setToBogus();
219
0
        return *this;
220
0
    }
221
81.4M
    if (!ensureCapacity(o.len)) {
222
        // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens.
223
0
        return *this;
224
0
    }
225
81.4M
    len = o.len;
226
81.4M
    uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
227
81.4M
    if (o.bmpSet != nullptr && !asThawed) {
228
0
        bmpSet = new BMPSet(*o.bmpSet, list, len);
229
0
        if (bmpSet == nullptr) { // Check for memory allocation error.
230
0
            setToBogus();
231
0
            return *this;
232
0
        }
233
0
    }
234
81.4M
    if (o.hasStrings()) {
235
114k
        UErrorCode status = U_ZERO_ERROR;
236
114k
        if ((strings_ == nullptr && !allocateStrings(status)) ||
237
114k
                (strings_->assign(*o.strings_, cloneUnicodeString, status), U_FAILURE(status))) {
238
0
            setToBogus();
239
0
            return *this;
240
0
        }
241
81.3M
    } else if (hasStrings()) {
242
0
        strings_->removeAllElements();
243
0
    }
244
81.4M
    if (o.stringSpan != nullptr && strings_ != nullptr && !asThawed) {
245
0
        stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings_);
246
0
        if (stringSpan == nullptr) { // Check for memory allocation error.
247
0
            setToBogus();
248
0
            return *this;
249
0
        }
250
0
    }
251
81.4M
    releasePattern();
252
81.4M
    if (o.pat) {
253
488k
        setPattern(o.pat, o.patLen);
254
488k
    }
255
81.4M
    return *this;
256
81.4M
}
257
258
/**
259
 * Returns a copy of this object.  All UnicodeMatcher objects have
260
 * to support cloning in order to allow classes using
261
 * UnicodeMatchers, such as Transliterator, to implement cloning.
262
 */
263
0
UnicodeSet* UnicodeSet::clone() const {
264
0
    return new UnicodeSet(*this);
265
0
}
266
267
844
UnicodeSet *UnicodeSet::cloneAsThawed() const {
268
844
    return new UnicodeSet(*this, true);
269
844
}
270
271
/**
272
 * Compares the specified object with this set for equality.  Returns
273
 * <tt>true</tt> if the two sets
274
 * have the same size, and every member of the specified set is
275
 * contained in this set (or equivalently, every member of this set is
276
 * contained in the specified set).
277
 *
278
 * @param o set to be compared for equality with this set.
279
 * @return <tt>true</tt> if the specified set is equal to this set.
280
 */
281
0
bool UnicodeSet::operator==(const UnicodeSet& o) const {
282
0
    if (len != o.len) return false;
283
0
    for (int32_t i = 0; i < len; ++i) {
284
0
        if (list[i] != o.list[i]) return false;
285
0
    }
286
0
    if (hasStrings() != o.hasStrings()) { return false; }
287
0
    if (hasStrings() && *strings_ != *o.strings_) return false;
288
0
    return true;
289
0
}
290
291
/**
292
 * Returns the hash code value for this set.
293
 *
294
 * @return the hash code value for this set.
295
 * @see Object#hashCode()
296
 */
297
0
int32_t UnicodeSet::hashCode() const {
298
0
    uint32_t result = static_cast<uint32_t>(len);
299
0
    for (int32_t i = 0; i < len; ++i) {
300
0
        result *= 1000003u;
301
0
        result += list[i];
302
0
    }
303
0
    return static_cast<int32_t>(result);
304
0
}
305
306
//----------------------------------------------------------------
307
// Public API
308
//----------------------------------------------------------------
309
310
/**
311
 * Returns the number of elements in this set (its cardinality),
312
 * Note than the elements of a set may include both individual
313
 * codepoints and strings.
314
 *
315
 * @return the number of elements in this set (its cardinality).
316
 */
317
982k
int32_t UnicodeSet::size() const {
318
982k
    int32_t n = 0;
319
982k
    int32_t count = getRangeCount();
320
49.5M
    for (int32_t i = 0; i < count; ++i) {
321
48.5M
        n += getRangeEnd(i) - getRangeStart(i) + 1;
322
48.5M
    }
323
982k
    return n + stringsSize();
324
982k
}
325
326
/**
327
 * Returns <tt>true</tt> if this set contains no elements.
328
 *
329
 * @return <tt>true</tt> if this set contains no elements.
330
 */
331
6.70k
UBool UnicodeSet::isEmpty() const {
332
6.70k
    return len == 1 && !hasStrings();
333
6.70k
}
334
335
/**
336
 * Returns true if this set contains the given character.
337
 * @param c character to be checked for containment
338
 * @return true if the test condition is met
339
 */
340
756M
UBool UnicodeSet::contains(UChar32 c) const {
341
    // Set i to the index of the start item greater than ch
342
    // We know we will terminate without length test!
343
    // LATER: for large sets, add binary search
344
    //int32_t i = -1;
345
    //for (;;) {
346
    //    if (c < list[++i]) break;
347
    //}
348
756M
    if (bmpSet != nullptr) {
349
352M
        return bmpSet->contains(c);
350
352M
    }
351
403M
    if (stringSpan != nullptr) {
352
0
        return stringSpan->contains(c);
353
0
    }
354
403M
    if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
355
0
        return false;
356
0
    }
357
403M
    int32_t i = findCodePoint(c);
358
403M
    return i & 1; // return true if odd
359
403M
}
360
361
/**
362
 * Returns the smallest value i such that c < list[i].  Caller
363
 * must ensure that c is a legal value or this method will enter
364
 * an infinite loop.  This method performs a binary search.
365
 * @param c a character in the range MIN_VALUE..MAX_VALUE
366
 * inclusive
367
 * @return the smallest integer i in the range 0..len-1,
368
 * inclusive, such that c < list[i]
369
 */
370
941M
int32_t UnicodeSet::findCodePoint(UChar32 c) const {
371
    /* Examples:
372
                                       findCodePoint(c)
373
       set              list[]         c=0 1 3 4 7 8
374
       ===              ==============   ===========
375
       []               [110000]         0 0 0 0 0 0
376
       [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
377
       [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
378
       [:Any:]          [0, 110000]      1 1 1 1 1 1
379
     */
380
381
    // Return the smallest i such that c < list[i].  Assume
382
    // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
383
941M
    if (c < list[0])
384
189M
        return 0;
385
    // High runner test.  c is often after the last range, so an
386
    // initial check for this condition pays off.
387
752M
    int32_t lo = 0;
388
752M
    int32_t hi = len - 1;
389
752M
    if (lo >= hi || c >= list[hi-1])
390
43.2M
        return hi;
391
    // invariant: c >= list[lo]
392
    // invariant: c < list[hi]
393
5.75G
    for (;;) {
394
5.75G
        int32_t i = (lo + hi) >> 1;
395
5.75G
        if (i == lo) {
396
708M
            break; // Found!
397
5.04G
        } else if (c < list[i]) {
398
3.21G
            hi = i;
399
3.21G
        } else {
400
1.83G
            lo = i;
401
1.83G
        }
402
5.75G
    }
403
708M
    return hi;
404
752M
}
405
406
/**
407
 * Returns true if this set contains every character
408
 * of the given range.
409
 * @param start first character, inclusive, of the range
410
 * @param end last character, inclusive, of the range
411
 * @return true if the test condition is met
412
 */
413
7.57k
UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
414
    //int32_t i = -1;
415
    //for (;;) {
416
    //    if (start < list[++i]) break;
417
    //}
418
7.57k
    int32_t i = findCodePoint(start);
419
7.57k
    return ((i & 1) != 0 && end < list[i]);
420
7.57k
}
421
422
/**
423
 * Returns <tt>true</tt> if this set contains the given
424
 * multicharacter string.
425
 * @param s string to be checked for containment
426
 * @return <tt>true</tt> if this set contains the specified string
427
 */
428
0
UBool UnicodeSet::contains(const UnicodeString& s) const {
429
0
    int32_t cp = getSingleCP(s);
430
0
    if (cp < 0) {
431
0
        return stringsContains(s);
432
0
    } else {
433
0
        return contains(static_cast<UChar32>(cp));
434
0
    }
435
0
}
436
437
/**
438
 * Returns true if this set contains all the characters and strings
439
 * of the given set.
440
 * @param c set to be checked for containment
441
 * @return true if the test condition is met
442
 */
443
0
UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
444
    // The specified set is a subset if all of its pairs are contained in
445
    // this set.  It's possible to code this more efficiently in terms of
446
    // direct manipulation of the inversion lists if the need arises.
447
0
    int32_t n = c.getRangeCount();
448
0
    for (int i=0; i<n; ++i) {
449
0
        if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
450
0
            return false;
451
0
        }
452
0
    }
453
0
    return !c.hasStrings() || (strings_ != nullptr && strings_->containsAll(*c.strings_));
454
0
}
455
456
/**
457
 * Returns true if this set contains all the characters
458
 * of the given string.
459
 * @param s string containing characters to be checked for containment
460
 * @return true if the test condition is met
461
 */
462
0
UBool UnicodeSet::containsAll(const UnicodeString& s) const {
463
0
    return span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) == s.length();
464
0
}
465
466
/**
467
 * Returns true if this set contains none of the characters
468
 * of the given range.
469
 * @param start first character, inclusive, of the range
470
 * @param end last character, inclusive, of the range
471
 * @return true if the test condition is met
472
 */
473
63.4k
UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
474
    //int32_t i = -1;
475
    //for (;;) {
476
    //    if (start < list[++i]) break;
477
    //}
478
63.4k
    int32_t i = findCodePoint(start);
479
63.4k
    return ((i & 1) == 0 && end < list[i]);
480
63.4k
}
481
482
/**
483
 * Returns true if this set contains none of the characters and strings
484
 * of the given set.
485
 * @param c set to be checked for containment
486
 * @return true if the test condition is met
487
 */
488
0
UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
489
    // The specified set is a subset if all of its pairs are contained in
490
    // this set.  It's possible to code this more efficiently in terms of
491
    // direct manipulation of the inversion lists if the need arises.
492
0
    int32_t n = c.getRangeCount();
493
0
    for (int32_t i=0; i<n; ++i) {
494
0
        if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
495
0
            return false;
496
0
        }
497
0
    }
498
0
    return strings_ == nullptr || !c.hasStrings() || strings_->containsNone(*c.strings_);
499
0
}
500
501
/**
502
 * Returns true if this set contains none of the characters
503
 * of the given string.
504
 * @param s string containing characters to be checked for containment
505
 * @return true if the test condition is met
506
 */
507
0
UBool UnicodeSet::containsNone(const UnicodeString& s) const {
508
0
    return span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) == s.length();
509
0
}
510
511
/**
512
 * Returns <tt>true</tt> if this set contains any character whose low byte
513
 * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
514
 * indexing.
515
 */
516
0
UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
517
    /* The index value v, in the range [0,255], is contained in this set if
518
     * it is contained in any pair of this set.  Pairs either have the high
519
     * bytes equal, or unequal.  If the high bytes are equal, then we have
520
     * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
521
     * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
522
     * Then v is contained if xx <= v || v <= yy.  (This is identical to the
523
     * time zone month containment logic.)
524
     */
525
0
    int32_t i;
526
0
    int32_t rangeCount=getRangeCount();
527
0
    for (i=0; i<rangeCount; ++i) {
528
0
        UChar32 low = getRangeStart(i);
529
0
        UChar32 high = getRangeEnd(i);
530
0
        if ((low & ~0xFF) == (high & ~0xFF)) {
531
0
            if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
532
0
                return true;
533
0
            }
534
0
        } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
535
0
            return true;
536
0
        }
537
0
    }
538
0
    if (hasStrings()) {
539
0
        for (i=0; i<strings_->size(); ++i) {
540
0
            const UnicodeString& s = *static_cast<const UnicodeString*>(strings_->elementAt(i));
541
0
            if (s.isEmpty()) {
542
0
                continue;  // skip the empty string
543
0
            }
544
0
            UChar32 c = s.char32At(0);
545
0
            if ((c & 0xFF) == v) {
546
0
                return true;
547
0
            }
548
0
        }
549
0
    }
550
0
    return false;
551
0
}
552
553
/**
554
 * Implementation of UnicodeMatcher::matches().  Always matches the
555
 * longest possible multichar string.
556
 */
557
UMatchDegree UnicodeSet::matches(const Replaceable& text,
558
                                 int32_t& offset,
559
                                 int32_t limit,
560
0
                                 UBool incremental) {
561
0
    if (offset == limit) {
562
0
        if (contains(U_ETHER)) {
563
0
            return incremental ? U_PARTIAL_MATCH : U_MATCH;
564
0
        } else {
565
0
            return U_MISMATCH;
566
0
        }
567
0
    } else {
568
0
        if (hasStrings()) { // try strings first
569
570
            // might separate forward and backward loops later
571
            // for now they are combined
572
573
            // TODO Improve efficiency of this, at least in the forward
574
            // direction, if not in both.  In the forward direction we
575
            // can assume the strings are sorted.
576
577
0
            int32_t i;
578
0
            UBool forward = offset < limit;
579
580
            // firstChar is the leftmost char to match in the
581
            // forward direction or the rightmost char to match in
582
            // the reverse direction.
583
0
            char16_t firstChar = text.charAt(offset);
584
585
            // If there are multiple strings that can match we
586
            // return the longest match.
587
0
            int32_t highWaterLength = 0;
588
589
0
            for (i=0; i<strings_->size(); ++i) {
590
0
                const UnicodeString& trial = *static_cast<const UnicodeString*>(strings_->elementAt(i));
591
0
                if (trial.isEmpty()) {
592
0
                    continue;  // skip the empty string
593
0
                }
594
595
0
                char16_t c = trial.charAt(forward ? 0 : trial.length() - 1);
596
597
                // Strings are sorted, so we can optimize in the
598
                // forward direction.
599
0
                if (forward && c > firstChar) break;
600
0
                if (c != firstChar) continue;
601
602
0
                int32_t matchLen = matchRest(text, offset, limit, trial);
603
604
0
                if (incremental) {
605
0
                    int32_t maxLen = forward ? limit-offset : offset-limit;
606
0
                    if (matchLen == maxLen) {
607
                        // We have successfully matched but only up to limit.
608
0
                        return U_PARTIAL_MATCH;
609
0
                    }
610
0
                }
611
612
0
                if (matchLen == trial.length()) {
613
                    // We have successfully matched the whole string.
614
0
                    if (matchLen > highWaterLength) {
615
0
                        highWaterLength = matchLen;
616
0
                    }
617
                    // In the forward direction we know strings
618
                    // are sorted so we can bail early.
619
0
                    if (forward && matchLen < highWaterLength) {
620
0
                        break;
621
0
                    }
622
0
                    continue;
623
0
                }
624
0
            }
625
626
            // We've checked all strings without a partial match.
627
            // If we have full matches, return the longest one.
628
0
            if (highWaterLength != 0) {
629
0
                offset += forward ? highWaterLength : -highWaterLength;
630
0
                return U_MATCH;
631
0
            }
632
0
        }
633
0
        return UnicodeFilter::matches(text, offset, limit, incremental);
634
0
    }
635
0
}
636
637
/**
638
 * Returns the longest match for s in text at the given position.
639
 * If limit > start then match forward from start+1 to limit
640
 * matching all characters except s.charAt(0).  If limit < start,
641
 * go backward starting from start-1 matching all characters
642
 * except s.charAt(s.length()-1).  This method assumes that the
643
 * first character, text.charAt(start), matches s, so it does not
644
 * check it.
645
 * @param text the text to match
646
 * @param start the first character to match.  In the forward
647
 * direction, text.charAt(start) is matched against s.charAt(0).
648
 * In the reverse direction, it is matched against
649
 * s.charAt(s.length()-1).
650
 * @param limit the limit offset for matching, either last+1 in
651
 * the forward direction, or last-1 in the reverse direction,
652
 * where last is the index of the last character to match.
653
 * @return If part of s matches up to the limit, return |limit -
654
 * start|.  If all of s matches before reaching the limit, return
655
 * s.length().  If there is a mismatch between s and text, return
656
 * 0
657
 */
658
int32_t UnicodeSet::matchRest(const Replaceable& text,
659
                              int32_t start, int32_t limit,
660
0
                              const UnicodeString& s) {
661
0
    int32_t i;
662
0
    int32_t maxLen;
663
0
    int32_t slen = s.length();
664
0
    if (start < limit) {
665
0
        maxLen = limit - start;
666
0
        if (maxLen > slen) maxLen = slen;
667
0
        for (i = 1; i < maxLen; ++i) {
668
0
            if (text.charAt(start + i) != s.charAt(i)) return 0;
669
0
        }
670
0
    } else {
671
0
        maxLen = start - limit;
672
0
        if (maxLen > slen) maxLen = slen;
673
0
        --slen; // <=> slen = s.length() - 1;
674
0
        for (i = 1; i < maxLen; ++i) {
675
0
            if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
676
0
        }
677
0
    }
678
0
    return maxLen;
679
0
}
680
681
/**
682
 * Implement of UnicodeMatcher
683
 */
684
0
void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
685
0
    toUnionTo.addAll(*this);
686
0
}
687
688
/**
689
 * Returns the index of the given character within this set, where
690
 * the set is ordered by ascending code point.  If the character
691
 * is not in this set, return -1.  The inverse of this method is
692
 * <code>charAt()</code>.
693
 * @return an index from 0..size()-1, or -1
694
 */
695
0
int32_t UnicodeSet::indexOf(UChar32 c) const {
696
0
    if (c < MIN_VALUE || c > MAX_VALUE) {
697
0
        return -1;
698
0
    }
699
0
    int32_t i = 0;
700
0
    int32_t n = 0;
701
0
    for (;;) {
702
0
        UChar32 start = list[i++];
703
0
        if (c < start) {
704
0
            return -1;
705
0
        }
706
0
        UChar32 limit = list[i++];
707
0
        if (c < limit) {
708
0
            return n + c - start;
709
0
        }
710
0
        n += limit - start;
711
0
    }
712
0
}
713
714
/**
715
 * Returns the character at the given index within this set, where
716
 * the set is ordered by ascending code point.  If the index is
717
 * out of range, return (UChar32)-1.  The inverse of this method is
718
 * <code>indexOf()</code>.
719
 * @param index an index from 0..size()-1
720
 * @return the character at the given index, or (UChar32)-1.
721
 */
722
10.0k
UChar32 UnicodeSet::charAt(int32_t index) const {
723
10.0k
    if (index >= 0) {
724
        // len2 is the largest even integer <= len, that is, it is len
725
        // for even values and len-1 for odd values.  With odd values
726
        // the last entry is UNICODESET_HIGH.
727
10.0k
        int32_t len2 = len & ~1;
728
10.0k
        for (int32_t i=0; i < len2;) {
729
10.0k
            UChar32 start = list[i++];
730
10.0k
            int32_t count = list[i++] - start;
731
10.0k
            if (index < count) {
732
10.0k
                return static_cast<UChar32>(start + index);
733
10.0k
            }
734
0
            index -= count;
735
0
        }
736
10.0k
    }
737
0
    return static_cast<UChar32>(-1);
738
10.0k
}
739
740
/**
741
 * Make this object represent the range <code>start - end</code>.
742
 * If <code>end > start</code> then this object is set to an
743
 * an empty range.
744
 *
745
 * @param start first character in the set, inclusive
746
 * @rparam end last character in the set, inclusive
747
 */
748
398k
UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
749
398k
    clear();
750
398k
    complement(start, end);
751
398k
    return *this;
752
398k
}
753
754
/**
755
 * Adds the specified range to this set if it is not already
756
 * present.  If this set already contains the specified range,
757
 * the call leaves this set unchanged.  If <code>end > start</code>
758
 * then an empty range is added, leaving the set unchanged.
759
 *
760
 * @param start first character, inclusive, of range to be added
761
 * to this set.
762
 * @param end last character, inclusive, of range to be added
763
 * to this set.
764
 */
765
23.3M
UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
766
23.3M
    if (pinCodePoint(start) < pinCodePoint(end)) {
767
16.5M
        UChar32 limit = end + 1;
768
        // Fast path for adding a new range after the last one.
769
        // Odd list length: [..., lastStart, lastLimit, HIGH]
770
16.5M
        if ((len & 1) != 0) {
771
            // If the list is empty, set lastLimit low enough to not be adjacent to 0.
772
16.5M
            UChar32 lastLimit = len == 1 ? -2 : list[len - 2];
773
16.5M
            if (lastLimit <= start && !isFrozen() && !isBogus()) {
774
16.5M
                if (lastLimit == start) {
775
                    // Extend the last range.
776
715
                    list[len - 2] = limit;
777
715
                    if (limit == UNICODESET_HIGH) {
778
278
                        --len;
779
278
                    }
780
16.5M
                } else {
781
16.5M
                    list[len - 1] = start;
782
16.5M
                    if (limit < UNICODESET_HIGH) {
783
15.9M
                        if (ensureCapacity(len + 2)) {
784
15.9M
                            list[len++] = limit;
785
15.9M
                            list[len++] = UNICODESET_HIGH;
786
15.9M
                        }
787
15.9M
                    } else {  // limit == UNICODESET_HIGH
788
575k
                        if (ensureCapacity(len + 1)) {
789
575k
                            list[len++] = UNICODESET_HIGH;
790
575k
                        }
791
575k
                    }
792
16.5M
                }
793
16.5M
                releasePattern();
794
16.5M
                return *this;
795
16.5M
            }
796
16.5M
        }
797
        // This is slow. Could be much faster using findCodePoint(start)
798
        // and modifying the list, dealing with adjacent & overlapping ranges.
799
17.0k
        UChar32 range[3] = { start, limit, UNICODESET_HIGH };
800
17.0k
        add(range, 2, 0);
801
6.83M
    } else if (start == end) {
802
6.83M
        add(start);
803
6.83M
    }
804
6.85M
    return *this;
805
23.3M
}
806
807
// #define DEBUG_US_ADD
808
809
#ifdef DEBUG_US_ADD
810
#include <stdio.h>
811
void dump(UChar32 c) {
812
    if (c <= 0xFF) {
813
        printf("%c", (char)c);
814
    } else {
815
        printf("U+%04X", c);
816
    }
817
}
818
void dump(const UChar32* list, int32_t len) {
819
    printf("[");
820
    for (int32_t i=0; i<len; ++i) {
821
        if (i != 0) printf(", ");
822
        dump(list[i]);
823
    }
824
    printf("]");
825
}
826
#endif
827
828
/**
829
 * Adds the specified character to this set if it is not already
830
 * present.  If this set already contains the specified character,
831
 * the call leaves this set unchanged.
832
 */
833
538M
UnicodeSet& UnicodeSet::add(UChar32 c) {
834
    // find smallest i such that c < list[i]
835
    // if odd, then it is IN the set
836
    // if even, then it is OUT of the set
837
538M
    int32_t i = findCodePoint(pinCodePoint(c));
838
839
    // already in set?
840
538M
    if ((i & 1) != 0  || isFrozen() || isBogus()) return *this;
841
842
    // HIGH is 0x110000
843
    // assert(list[len-1] == HIGH);
844
845
    // empty = [HIGH]
846
    // [start_0, limit_0, start_1, limit_1, HIGH]
847
848
    // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
849
    //                             ^
850
    //                             list[i]
851
852
    // i == 0 means c is before the first range
853
854
#ifdef DEBUG_US_ADD
855
    printf("Add of ");
856
    dump(c);
857
    printf(" found at %d", i);
858
    printf(": ");
859
    dump(list, len);
860
    printf(" => ");
861
#endif
862
863
33.2M
    if (c == list[i]-1) {
864
        // c is before start of next range
865
2.27M
        list[i] = c;
866
        // if we touched the HIGH mark, then add a new one
867
2.27M
        if (c == (UNICODESET_HIGH - 1)) {
868
6.71k
            if (!ensureCapacity(len+1)) {
869
                // ensureCapacity will mark the object as Bogus if OOM failure happens.
870
0
                return *this;
871
0
            }
872
6.71k
            list[len++] = UNICODESET_HIGH;
873
6.71k
        }
874
2.27M
        if (i > 0 && c == list[i-1]) {
875
            // collapse adjacent ranges
876
877
            // [..., start_k-1, c, c, limit_k, ..., HIGH]
878
            //                     ^
879
            //                     list[i]
880
881
            //for (int32_t k=i-1; k<len-2; ++k) {
882
            //    list[k] = list[k+2];
883
            //}
884
1.04M
            UChar32* dst = list + i - 1;
885
1.04M
            UChar32* src = dst + 2;
886
1.04M
            UChar32* srclimit = list + len;
887
10.1G
            while (src < srclimit) *(dst++) = *(src++);
888
889
1.04M
            len -= 2;
890
1.04M
        }
891
2.27M
    }
892
893
30.9M
    else if (i > 0 && c == list[i-1]) {
894
        // c is after end of prior range
895
2.29M
        list[i-1]++;
896
        // no need to check for collapse here
897
2.29M
    }
898
899
28.6M
    else {
900
        // At this point we know the new char is not adjacent to
901
        // any existing ranges, and it is not 10FFFF.
902
903
904
        // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
905
        //                             ^
906
        //                             list[i]
907
908
        // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
909
        //                             ^
910
        //                             list[i]
911
912
28.6M
        if (!ensureCapacity(len+2)) {
913
            // ensureCapacity will mark the object as Bogus if OOM failure happens.
914
0
            return *this;
915
0
        }
916
917
28.6M
        UChar32 *p = list + i;
918
28.6M
        uprv_memmove(p + 2, p, (len - i) * sizeof(*p));
919
28.6M
        list[i] = c;
920
28.6M
        list[i+1] = c+1;
921
28.6M
        len += 2;
922
28.6M
    }
923
924
#ifdef DEBUG_US_ADD
925
    dump(list, len);
926
    printf("\n");
927
928
    for (i=1; i<len; ++i) {
929
        if (list[i] <= list[i-1]) {
930
            // Corrupt array!
931
            printf("ERROR: list has been corrupted\n");
932
            exit(1);
933
        }
934
    }
935
#endif
936
937
33.2M
    releasePattern();
938
33.2M
    return *this;
939
33.2M
}
940
941
/**
942
 * Adds the specified multicharacter to this set if it is not already
943
 * present.  If this set already contains the multicharacter,
944
 * the call leaves this set unchanged.
945
 * Thus "ch" => {"ch"}
946
 *
947
 * @param s the source string
948
 * @return the modified set, for chaining
949
 */
950
7.25M
UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
951
7.25M
    if (isFrozen() || isBogus()) return *this;
952
7.25M
    int32_t cp = getSingleCP(s);
953
7.25M
    if (cp < 0) {
954
7.25M
        if (!stringsContains(s)) {
955
4.54M
            _add(s);
956
4.54M
            releasePattern();
957
4.54M
        }
958
7.25M
    } else {
959
966
        add(static_cast<UChar32>(cp));
960
966
    }
961
7.25M
    return *this;
962
7.25M
}
963
964
/**
965
 * Adds the given string, in order, to 'strings_'.  The given string
966
 * must have been checked by the caller to not already be in 'strings_'.
967
 */
968
5.41M
void UnicodeSet::_add(const UnicodeString& s) {
969
5.41M
    if (isFrozen() || isBogus()) {
970
0
        return;
971
0
    }
972
5.41M
    UErrorCode ec = U_ZERO_ERROR;
973
5.41M
    if (strings_ == nullptr && !allocateStrings(ec)) {
974
0
        setToBogus();
975
0
        return;
976
0
    }
977
5.41M
    LocalPointer<UnicodeString> t(new UnicodeString(s));
978
5.41M
    if (t.isNull()) { // Check for memory allocation error.
979
0
        setToBogus();
980
0
        return;
981
0
    }
982
5.41M
    strings_->sortedInsert(t.orphan(), compareUnicodeString, ec);
983
5.41M
    if (U_FAILURE(ec)) {
984
0
        setToBogus();
985
0
    }
986
5.41M
}
987
988
/**
989
 * @return a code point IF the string consists of a single one.
990
 * otherwise returns -1.
991
 * @param string to test
992
 */
993
7.26M
int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
994
7.26M
    int32_t sLength = s.length();
995
7.26M
    if (sLength == 1) return s.charAt(0);
996
7.26M
    if (sLength == 2) {
997
6.04M
        UChar32 cp = s.char32At(0);
998
6.04M
        if (cp > 0xFFFF) { // is surrogate pair
999
908
            return cp;
1000
908
        }
1001
6.04M
    }
1002
7.26M
    return -1;
1003
7.26M
}
1004
1005
/**
1006
 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1007
 * If this set already any particular character, it has no effect on that character.
1008
 * @param the source string
1009
 * @return the modified set, for chaining
1010
 */
1011
26.6k
UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1012
26.6k
    UChar32 cp;
1013
309M
    for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1014
309M
        cp = s.char32At(i);
1015
309M
        add(cp);
1016
309M
    }
1017
26.6k
    return *this;
1018
26.6k
}
1019
1020
/**
1021
 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1022
 * If this set already any particular character, it has no effect on that character.
1023
 * @param the source string
1024
 * @return the modified set, for chaining
1025
 */
1026
5.33k
UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1027
5.33k
    UnicodeSet set;
1028
5.33k
    set.addAll(s);
1029
5.33k
    retainAll(set);
1030
5.33k
    return *this;
1031
5.33k
}
1032
1033
/**
1034
 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1035
 * If this set already any particular character, it has no effect on that character.
1036
 * @param the source string
1037
 * @return the modified set, for chaining
1038
 */
1039
5.33k
UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1040
5.33k
    UnicodeSet set;
1041
5.33k
    set.addAll(s);
1042
5.33k
    complementAll(set);
1043
5.33k
    return *this;
1044
5.33k
}
1045
1046
/**
1047
 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1048
 * If this set already any particular character, it has no effect on that character.
1049
 * @param the source string
1050
 * @return the modified set, for chaining
1051
 */
1052
5.33k
UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1053
5.33k
    UnicodeSet set;
1054
5.33k
    set.addAll(s);
1055
5.33k
    removeAll(set);
1056
5.33k
    return *this;
1057
5.33k
}
1058
1059
2.41M
UnicodeSet& UnicodeSet::removeAllStrings() {
1060
2.41M
    if (!isFrozen() && hasStrings()) {
1061
56.6k
        strings_->removeAllElements();
1062
56.6k
        releasePattern();
1063
56.6k
    }
1064
2.41M
    return *this;
1065
2.41M
}
1066
1067
1068
/**
1069
 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1070
 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1071
 * @param the source string
1072
 * @return a newly created set containing the given string
1073
 */
1074
5.33k
UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1075
5.33k
    UnicodeSet *set = new UnicodeSet();
1076
5.33k
    if (set != nullptr) { // Check for memory allocation error.
1077
5.33k
        set->add(s);
1078
5.33k
    }
1079
5.33k
    return set;
1080
5.33k
}
1081
1082
1083
/**
1084
 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1085
 * @param the source string
1086
 * @return a newly created set containing the given characters
1087
 */
1088
5.33k
UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1089
5.33k
    UnicodeSet *set = new UnicodeSet();
1090
5.33k
    if (set != nullptr) { // Check for memory allocation error.
1091
5.33k
        set->addAll(s);
1092
5.33k
    }
1093
5.33k
    return set;
1094
5.33k
}
1095
1096
/**
1097
 * Retain only the elements in this set that are contained in the
1098
 * specified range.  If <code>end > start</code> then an empty range is
1099
 * retained, leaving the set empty.
1100
 *
1101
 * @param start first character, inclusive, of range to be retained
1102
 * to this set.
1103
 * @param end last character, inclusive, of range to be retained
1104
 * to this set.
1105
 */
1106
139
UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1107
139
    if (pinCodePoint(start) <= pinCodePoint(end)) {
1108
139
        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1109
139
        retain(range, 2, 0);
1110
139
    } else {
1111
0
        clear();
1112
0
    }
1113
139
    return *this;
1114
139
}
1115
1116
0
UnicodeSet& UnicodeSet::retain(UChar32 c) {
1117
0
    return retain(c, c);
1118
0
}
1119
1120
5.33k
UnicodeSet& UnicodeSet::retain(const UnicodeString &s) {
1121
5.33k
    if (isFrozen() || isBogus()) { return *this; }
1122
5.33k
    UChar32 cp = getSingleCP(s);
1123
5.33k
    if (cp < 0) {
1124
5.19k
        bool isIn = stringsContains(s);
1125
        // Check for getRangeCount() first to avoid somewhat-expensive size()
1126
        // when there are single code points.
1127
5.19k
        if (isIn && getRangeCount() == 0 && size() == 1) {
1128
0
            return *this;
1129
0
        }
1130
5.19k
        clear();
1131
5.19k
        if (isIn) {
1132
0
            _add(s);
1133
0
        }
1134
5.19k
    } else {
1135
139
        retain(cp, cp);
1136
139
    }
1137
5.33k
    return *this;
1138
5.33k
}
1139
1140
/**
1141
 * Removes the specified range from this set if it is present.
1142
 * The set will not contain the specified range once the call
1143
 * returns.  If <code>end > start</code> then an empty range is
1144
 * removed, leaving the set unchanged.
1145
 *
1146
 * @param start first character, inclusive, of range to be removed
1147
 * from this set.
1148
 * @param end last character, inclusive, of range to be removed
1149
 * from this set.
1150
 */
1151
153
UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1152
153
    if (pinCodePoint(start) <= pinCodePoint(end)) {
1153
153
        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1154
153
        retain(range, 2, 2);
1155
153
    }
1156
153
    return *this;
1157
153
}
1158
1159
/**
1160
 * Removes the specified character from this set if it is present.
1161
 * The set will not contain the specified range once the call
1162
 * returns.
1163
 */
1164
6
UnicodeSet& UnicodeSet::remove(UChar32 c) {
1165
6
    return remove(c, c);
1166
6
}
1167
1168
/**
1169
 * Removes the specified string from this set if it is present.
1170
 * The set will not contain the specified character once the call
1171
 * returns.
1172
 * @param the source string
1173
 * @return the modified set, for chaining
1174
 */
1175
5.33k
UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1176
5.33k
    if (isFrozen() || isBogus()) return *this;
1177
5.33k
    int32_t cp = getSingleCP(s);
1178
5.33k
    if (cp < 0) {
1179
5.19k
        if (strings_ != nullptr && strings_->removeElement((void*) &s)) {
1180
0
            releasePattern();
1181
0
        }
1182
5.19k
    } else {
1183
139
        remove(static_cast<UChar32>(cp), static_cast<UChar32>(cp));
1184
139
    }
1185
5.33k
    return *this;
1186
5.33k
}
1187
1188
/**
1189
 * Complements the specified range in this set.  Any character in
1190
 * the range will be removed if it is in this set, or will be
1191
 * added if it is not in this set.  If <code>end > start</code>
1192
 * then an empty range is xor'ed, leaving the set unchanged.
1193
 *
1194
 * @param start first character, inclusive, of range to be removed
1195
 * from this set.
1196
 * @param end last character, inclusive, of range to be removed
1197
 * from this set.
1198
 */
1199
398k
UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1200
398k
    if (isFrozen() || isBogus()) {
1201
0
        return *this;
1202
0
    }
1203
398k
    if (pinCodePoint(start) <= pinCodePoint(end)) {
1204
398k
        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1205
398k
        exclusiveOr(range, 2, 0);
1206
398k
    }
1207
398k
    releasePattern();
1208
398k
    return *this;
1209
398k
}
1210
1211
0
UnicodeSet& UnicodeSet::complement(UChar32 c) {
1212
0
    return complement(c, c);
1213
0
}
1214
1215
/**
1216
 * This is equivalent to
1217
 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1218
 */
1219
1.52M
UnicodeSet& UnicodeSet::complement() {
1220
1.52M
    if (isFrozen() || isBogus()) {
1221
0
        return *this;
1222
0
    }
1223
1.52M
    if (list[0] == UNICODESET_LOW) {
1224
1.45M
        uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32));
1225
1.45M
        --len;
1226
1.45M
    } else {
1227
63.9k
        if (!ensureCapacity(len+1)) {
1228
0
            return *this;
1229
0
        }
1230
63.9k
        uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32));
1231
63.9k
        list[0] = UNICODESET_LOW;
1232
63.9k
        ++len;
1233
63.9k
    }
1234
1.52M
    releasePattern();
1235
1.52M
    return *this;
1236
1.52M
}
1237
1238
/**
1239
 * Complement the specified string in this set.
1240
 * The set will not contain the specified string once the call
1241
 * returns.
1242
 *
1243
 * @param s the string to complement
1244
 * @return this object, for chaining
1245
 */
1246
5.33k
UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1247
5.33k
    if (isFrozen() || isBogus()) return *this;
1248
5.33k
    int32_t cp = getSingleCP(s);
1249
5.33k
    if (cp < 0) {
1250
5.19k
        if (stringsContains(s)) {
1251
0
            strings_->removeElement((void*) &s);
1252
5.19k
        } else {
1253
5.19k
            _add(s);
1254
5.19k
        }
1255
5.19k
        releasePattern();
1256
5.19k
    } else {
1257
139
        complement(static_cast<UChar32>(cp), static_cast<UChar32>(cp));
1258
139
    }
1259
5.33k
    return *this;
1260
5.33k
}
1261
1262
/**
1263
 * Adds all of the elements in the specified set to this set if
1264
 * they're not already present.  This operation effectively
1265
 * modifies this set so that its value is the <i>union</i> of the two
1266
 * sets.  The behavior of this operation is unspecified if the specified
1267
 * collection is modified while the operation is in progress.
1268
 *
1269
 * @param c set whose elements are to be added to this set.
1270
 * @see #add(char, char)
1271
 */
1272
2.34M
UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1273
2.34M
    if ( c.len>0 && c.list!=nullptr ) {
1274
2.34M
        add(c.list, c.len, 0);
1275
2.34M
    }
1276
1277
    // Add strings in order
1278
2.34M
    if ( c.strings_!=nullptr ) {
1279
2.79M
        for (int32_t i=0; i<c.strings_->size(); ++i) {
1280
2.68M
            const UnicodeString* s = static_cast<const UnicodeString*>(c.strings_->elementAt(i));
1281
2.68M
            if (!stringsContains(*s)) {
1282
858k
                _add(*s);
1283
858k
            }
1284
2.68M
        }
1285
106k
    }
1286
2.34M
    return *this;
1287
2.34M
}
1288
1289
/**
1290
 * Retains only the elements in this set that are contained in the
1291
 * specified set.  In other words, removes from this set all of
1292
 * its elements that are not contained in the specified set.  This
1293
 * operation effectively modifies this set so that its value is
1294
 * the <i>intersection</i> of the two sets.
1295
 *
1296
 * @param c set that defines which elements this set will retain.
1297
 */
1298
318k
UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1299
318k
    if (isFrozen() || isBogus()) {
1300
0
        return *this;
1301
0
    }
1302
318k
    retain(c.list, c.len, 0);
1303
318k
    if (hasStrings()) {
1304
3.21k
        if (!c.hasStrings()) {
1305
1.96k
            strings_->removeAllElements();
1306
1.96k
        } else {
1307
1.25k
            strings_->retainAll(*c.strings_);
1308
1.25k
        }
1309
3.21k
    }
1310
318k
    return *this;
1311
318k
}
1312
1313
/**
1314
 * Removes from this set all of its elements that are contained in the
1315
 * specified set.  This operation effectively modifies this
1316
 * set so that its value is the <i>asymmetric set difference</i> of
1317
 * the two sets.
1318
 *
1319
 * @param c set that defines which elements will be removed from
1320
 *          this set.
1321
 */
1322
15.0k
UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1323
15.0k
    if (isFrozen() || isBogus()) {
1324
0
        return *this;
1325
0
    }
1326
15.0k
    retain(c.list, c.len, 2);
1327
15.0k
    if (hasStrings() && c.hasStrings()) {
1328
4.82k
        strings_->removeAll(*c.strings_);
1329
4.82k
    }
1330
15.0k
    return *this;
1331
15.0k
}
1332
1333
/**
1334
 * Complements in this set all elements contained in the specified
1335
 * set.  Any character in the other set will be removed if it is
1336
 * in this set, or will be added if it is not in this set.
1337
 *
1338
 * @param c set that defines which elements will be xor'ed from
1339
 *          this set.
1340
 */
1341
5.33k
UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1342
5.33k
    if (isFrozen() || isBogus()) {
1343
0
        return *this;
1344
0
    }
1345
5.33k
    exclusiveOr(c.list, c.len, 0);
1346
1347
5.33k
    if (c.strings_ != nullptr) {
1348
0
        for (int32_t i=0; i<c.strings_->size(); ++i) {
1349
0
            void* e = c.strings_->elementAt(i);
1350
0
            if (strings_ == nullptr || !strings_->removeElement(e)) {
1351
0
                _add(*static_cast<const UnicodeString*>(e));
1352
0
            }
1353
0
        }
1354
0
    }
1355
5.33k
    return *this;
1356
5.33k
}
1357
1358
/**
1359
 * Removes all of the elements from this set.  This set will be
1360
 * empty after this call returns.
1361
 */
1362
2.54M
UnicodeSet& UnicodeSet::clear() {
1363
2.54M
    if (isFrozen()) {
1364
0
        return *this;
1365
0
    }
1366
2.54M
    list[0] = UNICODESET_HIGH;
1367
2.54M
    len = 1;
1368
2.54M
    releasePattern();
1369
2.54M
    if (strings_ != nullptr) {
1370
0
        strings_->removeAllElements();
1371
0
    }
1372
    // Remove bogus
1373
2.54M
    fFlags = 0;
1374
2.54M
    return *this;
1375
2.54M
}
1376
1377
/**
1378
 * Iteration method that returns the number of ranges contained in
1379
 * this set.
1380
 * @see #getRangeStart
1381
 * @see #getRangeEnd
1382
 */
1383
2.11M
int32_t UnicodeSet::getRangeCount() const {
1384
2.11M
    return len/2;
1385
2.11M
}
1386
1387
/**
1388
 * Iteration method that returns the first character in the
1389
 * specified range of this set.
1390
 * @see #getRangeCount
1391
 * @see #getRangeEnd
1392
 */
1393
483M
UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1394
483M
    return list[index*2];
1395
483M
}
1396
1397
/**
1398
 * Iteration method that returns the last character in the
1399
 * specified range of this set.
1400
 * @see #getRangeStart
1401
 * @see #getRangeEnd
1402
 */
1403
483M
UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1404
483M
    return list[index*2 + 1] - 1;
1405
483M
}
1406
1407
0
const UnicodeString* UnicodeSet::getString(int32_t index) const {
1408
0
    return static_cast<const UnicodeString*>(strings_->elementAt(index));
1409
0
}
1410
1411
/**
1412
 * Reallocate this objects internal structures to take up the least
1413
 * possible space, without changing this object's value.
1414
 */
1415
400k
UnicodeSet& UnicodeSet::compact() {
1416
400k
    if (isFrozen() || isBogus()) {
1417
0
        return *this;
1418
0
    }
1419
    // Delete buffer first to defragment memory less.
1420
400k
    if (buffer != stackList) {
1421
344k
        uprv_free(buffer);
1422
344k
        buffer = nullptr;
1423
344k
        bufferCapacity = 0;
1424
344k
    }
1425
400k
    if (list == stackList) {
1426
        // pass
1427
341k
    } else if (len <= INITIAL_CAPACITY) {
1428
12.1k
        uprv_memcpy(stackList, list, len * sizeof(UChar32));
1429
12.1k
        uprv_free(list);
1430
12.1k
        list = stackList;
1431
12.1k
        capacity = INITIAL_CAPACITY;
1432
47.0k
    } else if ((len + 7) < capacity) {
1433
        // If we have more than a little unused capacity, shrink it to len.
1434
45.9k
        UChar32* temp = static_cast<UChar32*>(uprv_realloc(list, sizeof(UChar32) * len));
1435
45.9k
        if (temp) {
1436
45.9k
            list = temp;
1437
45.9k
            capacity = len;
1438
45.9k
        }
1439
        // else what the heck happened?! We allocated less memory!
1440
        // Oh well. We'll keep our original array.
1441
45.9k
    }
1442
400k
    if (strings_ != nullptr && strings_->isEmpty()) {
1443
14.5k
        delete strings_;
1444
14.5k
        strings_ = nullptr;
1445
14.5k
    }
1446
400k
    return *this;
1447
400k
}
1448
1449
#ifdef DEBUG_SERIALIZE
1450
#include <stdio.h>
1451
#endif
1452
1453
/**
1454
 * Deserialize constructor.
1455
 */
1456
UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization,
1457
1
                       UErrorCode &ec) {
1458
1459
1
  if(U_FAILURE(ec)) {
1460
0
    setToBogus();
1461
0
    return;
1462
0
  }
1463
1464
1
  if( (serialization != kSerialized)
1465
1
      || (data==nullptr)
1466
1
      || (dataLen < 1)) {
1467
0
    ec = U_ILLEGAL_ARGUMENT_ERROR;
1468
0
    setToBogus();
1469
0
    return;
1470
0
  }
1471
1472
  // bmp?
1473
1
  int32_t headerSize = ((data[0]&0x8000)) ?2:1;
1474
1
  int32_t bmpLength = (headerSize==1)?data[0]:data[1];
1475
1476
1
  int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
1477
#ifdef DEBUG_SERIALIZE
1478
  printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]);
1479
#endif
1480
1
  if(!ensureCapacity(newLength + 1)) {  // +1 for HIGH
1481
0
    return;
1482
0
  }
1483
  // copy bmp
1484
1
  int32_t i;
1485
451
  for(i = 0; i< bmpLength;i++) {
1486
450
    list[i] = data[i+headerSize];
1487
#ifdef DEBUG_SERIALIZE
1488
    printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
1489
#endif
1490
450
  }
1491
  // copy smp
1492
393
  for(i=bmpLength;i<newLength;i++) {
1493
392
    list[i] = (static_cast<UChar32>(data[headerSize + bmpLength + (i - bmpLength) * 2 + 0]) << 16) +
1494
392
               static_cast<UChar32>(data[headerSize + bmpLength + (i - bmpLength) * 2 + 1]);
1495
#ifdef DEBUG_SERIALIZE
1496
    printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
1497
#endif
1498
392
  }
1499
1
  U_ASSERT(i == newLength);
1500
1
  if (i == 0 || list[i - 1] != UNICODESET_HIGH) {
1501
1
    list[i++] = UNICODESET_HIGH;
1502
1
  }
1503
1
  len = i;
1504
1
}
1505
1506
1507
0
int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1508
0
    int32_t bmpLength, length, destLength;
1509
1510
0
    if (U_FAILURE(ec)) {
1511
0
        return 0;
1512
0
    }
1513
1514
0
    if (destCapacity<0 || (destCapacity>0 && dest==nullptr)) {
1515
0
        ec=U_ILLEGAL_ARGUMENT_ERROR;
1516
0
        return 0;
1517
0
    }
1518
1519
    /* count necessary 16-bit units */
1520
0
    length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1521
    // assert(length>=0);
1522
0
    if (length==0) {
1523
        /* empty set */
1524
0
        if (destCapacity>0) {
1525
0
            *dest=0;
1526
0
        } else {
1527
0
            ec=U_BUFFER_OVERFLOW_ERROR;
1528
0
        }
1529
0
        return 1;
1530
0
    }
1531
    /* now length>0 */
1532
1533
0
    if (this->list[length-1]<=0xffff) {
1534
        /* all BMP */
1535
0
        bmpLength=length;
1536
0
    } else if (this->list[0]>=0x10000) {
1537
        /* all supplementary */
1538
0
        bmpLength=0;
1539
0
        length*=2;
1540
0
    } else {
1541
        /* some BMP, some supplementary */
1542
0
        for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1543
0
        length=bmpLength+2*(length-bmpLength);
1544
0
    }
1545
#ifdef DEBUG_SERIALIZE
1546
    printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
1547
#endif
1548
    /* length: number of 16-bit array units */
1549
0
    if (length>0x7fff) {
1550
        /* there are only 15 bits for the length in the first serialized word */
1551
0
        ec=U_INDEX_OUTOFBOUNDS_ERROR;
1552
0
        return 0;
1553
0
    }
1554
1555
    /*
1556
     * total serialized length:
1557
     * number of 16-bit array units (length) +
1558
     * 1 length unit (always) +
1559
     * 1 bmpLength unit (if there are supplementary values)
1560
     */
1561
0
    destLength=length+((length>bmpLength)?2:1);
1562
0
    if (destLength<=destCapacity) {
1563
0
        const UChar32 *p;
1564
0
        int32_t i;
1565
1566
#ifdef DEBUG_SERIALIZE
1567
        printf("writeHdr\n");
1568
#endif
1569
0
        *dest = static_cast<uint16_t>(length);
1570
0
        if (length>bmpLength) {
1571
0
            *dest|=0x8000;
1572
0
            *++dest = static_cast<uint16_t>(bmpLength);
1573
0
        }
1574
0
        ++dest;
1575
1576
        /* write the BMP part of the array */
1577
0
        p=this->list;
1578
0
        for (i=0; i<bmpLength; ++i) {
1579
#ifdef DEBUG_SERIALIZE
1580
          printf("writebmp: %x\n", (int)*p);
1581
#endif
1582
0
            *dest++ = static_cast<uint16_t>(*p++);
1583
0
        }
1584
1585
        /* write the supplementary part of the array */
1586
0
        for (; i<length; i+=2) {
1587
#ifdef DEBUG_SERIALIZE
1588
          printf("write32: %x\n", (int)*p);
1589
#endif
1590
0
            *dest++ = static_cast<uint16_t>(*p >> 16);
1591
0
            *dest++ = static_cast<uint16_t>(*p++);
1592
0
        }
1593
0
    } else {
1594
0
        ec=U_BUFFER_OVERFLOW_ERROR;
1595
0
    }
1596
0
    return destLength;
1597
0
}
1598
1599
//----------------------------------------------------------------
1600
// Implementation: Utility methods
1601
//----------------------------------------------------------------
1602
1603
/**
1604
 * Allocate our strings vector and return true if successful.
1605
 */
1606
247k
UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1607
247k
    if (U_FAILURE(status)) {
1608
0
        return false;
1609
0
    }
1610
247k
    strings_ = new UVector(uprv_deleteUObject,
1611
247k
                          uhash_compareUnicodeString, 1, status);
1612
247k
    if (strings_ == nullptr) { // Check for memory allocation error.
1613
0
        status = U_MEMORY_ALLOCATION_ERROR;
1614
0
        return false;
1615
0
    }
1616
247k
    if (U_FAILURE(status)) {
1617
0
        delete strings_;
1618
0
        strings_ = nullptr;
1619
0
        return false;
1620
0
    } 
1621
247k
    return true;
1622
247k
}
1623
1624
1.68M
int32_t UnicodeSet::nextCapacity(int32_t minCapacity) {
1625
    // Grow exponentially to reduce the frequency of allocations.
1626
1.68M
    if (minCapacity < INITIAL_CAPACITY) {
1627
616k
        return minCapacity + INITIAL_CAPACITY;
1628
1.06M
    } else if (minCapacity <= 2500) {
1629
1.05M
        return 5 * minCapacity;
1630
1.05M
    } else {
1631
6.62k
        int32_t newCapacity = 2 * minCapacity;
1632
6.62k
        if (newCapacity > MAX_LENGTH) {
1633
0
            newCapacity = MAX_LENGTH;
1634
0
        }
1635
6.62k
        return newCapacity;
1636
6.62k
    }
1637
1.68M
}
1638
1639
126M
bool UnicodeSet::ensureCapacity(int32_t newLen) {
1640
126M
    if (newLen > MAX_LENGTH) {
1641
0
        newLen = MAX_LENGTH;
1642
0
    }
1643
126M
    if (newLen <= capacity) {
1644
126M
        return true;
1645
126M
    }
1646
607k
    int32_t newCapacity = nextCapacity(newLen);
1647
607k
    UChar32* temp = static_cast<UChar32*>(uprv_malloc(newCapacity * sizeof(UChar32)));
1648
607k
    if (temp == nullptr) {
1649
0
        setToBogus(); // set the object to bogus state if an OOM failure occurred.
1650
0
        return false;
1651
0
    }
1652
    // Copy only the actual contents.
1653
607k
    uprv_memcpy(temp, list, len * sizeof(UChar32));
1654
607k
    if (list != stackList) {
1655
101k
        uprv_free(list);
1656
101k
    }
1657
607k
    list = temp;
1658
607k
    capacity = newCapacity;
1659
607k
    return true;
1660
607k
}
1661
1662
3.09M
bool UnicodeSet::ensureBufferCapacity(int32_t newLen) {
1663
3.09M
    if (newLen > MAX_LENGTH) {
1664
0
        newLen = MAX_LENGTH;
1665
0
    }
1666
3.09M
    if (newLen <= bufferCapacity) {
1667
2.02M
        return true;
1668
2.02M
    }
1669
1.07M
    int32_t newCapacity = nextCapacity(newLen);
1670
1.07M
    UChar32* temp = static_cast<UChar32*>(uprv_malloc(newCapacity * sizeof(UChar32)));
1671
1.07M
    if (temp == nullptr) {
1672
0
        setToBogus();
1673
0
        return false;
1674
0
    }
1675
    // The buffer has no contents to be copied.
1676
    // It is always filled from scratch after this call.
1677
1.07M
    if (buffer != stackList) {
1678
866k
        uprv_free(buffer);
1679
866k
    }
1680
1.07M
    buffer = temp;
1681
1.07M
    bufferCapacity = newCapacity;
1682
1.07M
    return true;
1683
1.07M
}
1684
1685
/**
1686
 * Swap list and buffer.
1687
 */
1688
3.09M
void UnicodeSet::swapBuffers() {
1689
    // swap list and buffer
1690
3.09M
    UChar32* temp = list;
1691
3.09M
    list = buffer;
1692
3.09M
    buffer = temp;
1693
1694
3.09M
    int32_t c = capacity;
1695
3.09M
    capacity = bufferCapacity;
1696
3.09M
    bufferCapacity = c;
1697
3.09M
}
1698
1699
0
void UnicodeSet::setToBogus() {
1700
0
    clear(); // Remove everything in the set.
1701
0
    fFlags = kIsBogus;
1702
0
}
1703
1704
//----------------------------------------------------------------
1705
// Implementation: Fundamental operators
1706
//----------------------------------------------------------------
1707
1708
4.39M
static inline UChar32 max(UChar32 a, UChar32 b) {
1709
4.39M
    return (a > b) ? a : b;
1710
4.39M
}
1711
1712
// polarity = 0, 3 is normal: x xor y
1713
// polarity = 1, 2: x xor ~y == x === y
1714
1715
404k
void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1716
404k
    if (isFrozen() || isBogus()) {
1717
0
        return;
1718
0
    }
1719
404k
    if (!ensureBufferCapacity(len + otherLen)) {
1720
0
        return;
1721
0
    }
1722
1723
404k
    int32_t i = 0, j = 0, k = 0;
1724
404k
    UChar32 a = list[i++];
1725
404k
    UChar32 b;
1726
404k
    if (polarity == 1 || polarity == 2) {
1727
0
        b = UNICODESET_LOW;
1728
0
        if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1729
0
            ++j;
1730
0
            b = other[j];
1731
0
        }
1732
404k
    } else {
1733
404k
        b = other[j++];
1734
404k
    }
1735
    // simplest of all the routines
1736
    // sort the values, discarding identicals!
1737
2.82M
    for (;;) {
1738
2.82M
        if (a < b) {
1739
0
            buffer[k++] = a;
1740
0
            a = list[i++];
1741
2.82M
        } else if (b < a) {
1742
2.42M
            buffer[k++] = b;
1743
2.42M
            b = other[j++];
1744
2.42M
        } else if (a != UNICODESET_HIGH) { // at this point, a == b
1745
            // discard both values!
1746
0
            a = list[i++];
1747
0
            b = other[j++];
1748
404k
        } else { // DONE!
1749
404k
            buffer[k++] = UNICODESET_HIGH;
1750
404k
            len = k;
1751
404k
            break;
1752
404k
        }
1753
2.82M
    }
1754
404k
    swapBuffers();
1755
404k
    releasePattern();
1756
404k
}
1757
1758
// polarity = 0 is normal: x union y
1759
// polarity = 2: x union ~y
1760
// polarity = 1: ~x union y
1761
// polarity = 3: ~x union ~y
1762
1763
2.35M
void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1764
2.35M
    if (isFrozen() || isBogus() || other==nullptr) {
1765
0
        return;
1766
0
    }
1767
2.35M
    if (!ensureBufferCapacity(len + otherLen)) {
1768
0
        return;
1769
0
    }
1770
1771
2.35M
    int32_t i = 0, j = 0, k = 0;
1772
2.35M
    UChar32 a = list[i++];
1773
2.35M
    UChar32 b = other[j++];
1774
    // change from xor is that we have to check overlapping pairs
1775
    // polarity bit 1 means a is second, bit 2 means b is.
1776
202M
    for (;;) {
1777
202M
        switch (polarity) {
1778
99.8M
          case 0: // both first; take lower if unequal
1779
99.8M
            if (a < b) { // take a
1780
                // Back up over overlapping ranges in buffer[]
1781
23.3M
                if (k > 0 && a <= buffer[k-1]) {
1782
                    // Pick latter end value in buffer[] vs. list[]
1783
1.14M
                    a = max(list[i], buffer[--k]);
1784
22.1M
                } else {
1785
                    // No overlap
1786
22.1M
                    buffer[k++] = a;
1787
22.1M
                    a = list[i];
1788
22.1M
                }
1789
23.3M
                i++; // Common if/else code factored out
1790
23.3M
                polarity ^= 1;
1791
76.4M
            } else if (b < a) { // take b
1792
60.0M
                if (k > 0 && b <= buffer[k-1]) {
1793
3.24M
                    b = max(other[j], buffer[--k]);
1794
56.8M
                } else {
1795
56.8M
                    buffer[k++] = b;
1796
56.8M
                    b = other[j];
1797
56.8M
                }
1798
60.0M
                j++;
1799
60.0M
                polarity ^= 2;
1800
60.0M
            } else { // a == b, take a, drop b
1801
16.3M
                if (a == UNICODESET_HIGH) goto loop_end;
1802
                // This is symmetrical; it doesn't matter if
1803
                // we backtrack with a or b. - liu
1804
15.9M
                if (k > 0 && a <= buffer[k-1]) {
1805
0
                    a = max(list[i], buffer[--k]);
1806
15.9M
                } else {
1807
                    // No overlap
1808
15.9M
                    buffer[k++] = a;
1809
15.9M
                    a = list[i];
1810
15.9M
                }
1811
15.9M
                i++;
1812
15.9M
                polarity ^= 1;
1813
15.9M
                b = other[j++];
1814
15.9M
                polarity ^= 2;
1815
15.9M
            }
1816
99.3M
            break;
1817
99.3M
          case 3: // both second; take higher if unequal, and drop other
1818
18.2M
            if (b <= a) { // take a
1819
18.0M
                if (a == UNICODESET_HIGH) goto loop_end;
1820
16.3M
                buffer[k++] = a;
1821
16.3M
            } else { // take b
1822
188k
                if (b == UNICODESET_HIGH) goto loop_end;
1823
181k
                buffer[k++] = b;
1824
181k
            }
1825
16.5M
            a = list[i++];
1826
16.5M
            polarity ^= 1;   // factored common code
1827
16.5M
            b = other[j++];
1828
16.5M
            polarity ^= 2;
1829
16.5M
            break;
1830
23.6M
          case 1: // a second, b first; if b < a, overlap
1831
23.6M
            if (a < b) { // no overlap, take a
1832
21.0M
                buffer[k++] = a; a = list[i++]; polarity ^= 1;
1833
21.0M
            } else if (b < a) { // OVERLAP, drop b
1834
2.11M
                b = other[j++];
1835
2.11M
                polarity ^= 2;
1836
2.11M
            } else { // a == b, drop both!
1837
495k
                if (a == UNICODESET_HIGH) goto loop_end;
1838
431k
                a = list[i++];
1839
431k
                polarity ^= 1;
1840
431k
                b = other[j++];
1841
431k
                polarity ^= 2;
1842
431k
            }
1843
23.5M
            break;
1844
60.5M
          case 2: // a first, b second; if a < b, overlap
1845
60.5M
            if (b < a) { // no overlap, take b
1846
59.9M
                buffer[k++] = b;
1847
59.9M
                b = other[j++];
1848
59.9M
                polarity ^= 2;
1849
59.9M
            } else  if (a < b) { // OVERLAP, drop a
1850
187k
                a = list[i++];
1851
187k
                polarity ^= 1;
1852
438k
            } else { // a == b, drop both!
1853
438k
                if (a == UNICODESET_HIGH) goto loop_end;
1854
334k
                a = list[i++];
1855
334k
                polarity ^= 1;
1856
334k
                b = other[j++];
1857
334k
                polarity ^= 2;
1858
334k
            }
1859
60.4M
            break;
1860
202M
        }
1861
202M
    }
1862
2.35M
 loop_end:
1863
2.35M
    buffer[k++] = UNICODESET_HIGH;    // terminate
1864
2.35M
    len = k;
1865
2.35M
    swapBuffers();
1866
2.35M
    releasePattern();
1867
2.35M
}
1868
1869
// polarity = 0 is normal: x intersect y
1870
// polarity = 2: x intersect ~y == set-minus
1871
// polarity = 1: ~x intersect y
1872
// polarity = 3: ~x intersect ~y
1873
1874
333k
void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1875
333k
    if (isFrozen() || isBogus()) {
1876
0
        return;
1877
0
    }
1878
333k
    if (!ensureBufferCapacity(len + otherLen)) {
1879
0
        return;
1880
0
    }
1881
1882
333k
    int32_t i = 0, j = 0, k = 0;
1883
333k
    UChar32 a = list[i++];
1884
333k
    UChar32 b = other[j++];
1885
    // change from xor is that we have to check overlapping pairs
1886
    // polarity bit 1 means a is second, bit 2 means b is.
1887
130M
    for (;;) {
1888
130M
        switch (polarity) {
1889
44.2M
          case 0: // both first; drop the smaller
1890
44.2M
            if (a < b) { // drop a
1891
7.00M
                a = list[i++];
1892
7.00M
                polarity ^= 1;
1893
37.2M
            } else if (b < a) { // drop b
1894
36.3M
                b = other[j++];
1895
36.3M
                polarity ^= 2;
1896
36.3M
            } else { // a == b, take one, drop other
1897
920k
                if (a == UNICODESET_HIGH) goto loop_end;
1898
794k
                buffer[k++] = a;
1899
794k
                a = list[i++];
1900
794k
                polarity ^= 1;
1901
794k
                b = other[j++];
1902
794k
                polarity ^= 2;
1903
794k
            }
1904
44.1M
            break;
1905
44.1M
          case 3: // both second; take lower if unequal
1906
20.2M
            if (a < b) { // take a
1907
3.59M
                buffer[k++] = a;
1908
3.59M
                a = list[i++];
1909
3.59M
                polarity ^= 1;
1910
16.6M
            } else if (b < a) { // take b
1911
15.9M
                buffer[k++] = b;
1912
15.9M
                b = other[j++];
1913
15.9M
                polarity ^= 2;
1914
15.9M
            } else { // a == b, take one, drop other
1915
718k
                if (a == UNICODESET_HIGH) goto loop_end;
1916
706k
                buffer[k++] = a;
1917
706k
                a = list[i++];
1918
706k
                polarity ^= 1;
1919
706k
                b = other[j++];
1920
706k
                polarity ^= 2;
1921
706k
            }
1922
20.2M
            break;
1923
24.3M
          case 1: // a second, b first;
1924
24.3M
            if (a < b) { // NO OVERLAP, drop a
1925
6.96M
                a = list[i++];
1926
6.96M
                polarity ^= 1;
1927
17.3M
            } else if (b < a) { // OVERLAP, take b
1928
15.9M
                buffer[k++] = b;
1929
15.9M
                b = other[j++];
1930
15.9M
                polarity ^= 2;
1931
15.9M
            } else { // a == b, drop both!
1932
1.48M
                if (a == UNICODESET_HIGH) goto loop_end;
1933
1.33M
                a = list[i++];
1934
1.33M
                polarity ^= 1;
1935
1.33M
                b = other[j++];
1936
1.33M
                polarity ^= 2;
1937
1.33M
            }
1938
24.2M
            break;
1939
41.2M
          case 2: // a first, b second; if a < b, overlap
1940
41.2M
            if (b < a) { // no overlap, drop b
1941
36.2M
                b = other[j++];
1942
36.2M
                polarity ^= 2;
1943
36.2M
            } else  if (a < b) { // OVERLAP, take a
1944
3.57M
                buffer[k++] = a;
1945
3.57M
                a = list[i++];
1946
3.57M
                polarity ^= 1;
1947
3.57M
            } else { // a == b, drop both!
1948
1.43M
                if (a == UNICODESET_HIGH) goto loop_end;
1949
1.38M
                a = list[i++];
1950
1.38M
                polarity ^= 1;
1951
1.38M
                b = other[j++];
1952
1.38M
                polarity ^= 2;
1953
1.38M
            }
1954
41.2M
            break;
1955
130M
        }
1956
130M
    }
1957
333k
 loop_end:
1958
333k
    buffer[k++] = UNICODESET_HIGH;    // terminate
1959
333k
    len = k;
1960
333k
    swapBuffers();
1961
333k
    releasePattern();
1962
333k
}
1963
1964
/**
1965
 * Append the <code>toPattern()</code> representation of a
1966
 * string to the given <code>StringBuffer</code>.
1967
 */
1968
1.93M
void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) {
1969
1.93M
    UChar32 cp;
1970
27.2M
    for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1971
25.3M
        _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1972
25.3M
    }
1973
1.93M
}
1974
1975
/**
1976
 * Append the <code>toPattern()</code> representation of a
1977
 * character to the given <code>StringBuffer</code>.
1978
 */
1979
84.8M
void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable) {
1980
84.8M
    if (escapeUnprintable ? ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
1981
        // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1982
        // unprintable
1983
53.9M
        ICU_Utility::escape(buf, c);
1984
53.9M
        return;
1985
53.9M
    }
1986
    // Okay to let ':' pass through
1987
30.9M
    switch (c) {
1988
114k
    case u'[':
1989
205k
    case u']':
1990
216k
    case u'-':
1991
289k
    case u'^':
1992
296k
    case u'&':
1993
438k
    case u'\\':
1994
460k
    case u'{':
1995
485k
    case u'}':
1996
495k
    case u':':
1997
509k
    case SymbolTable::SYMBOL_REF:
1998
509k
        buf.append(u'\\');
1999
509k
        break;
2000
30.3M
    default:
2001
        // Escape whitespace
2002
30.3M
        if (PatternProps::isWhiteSpace(c)) {
2003
2.13k
            buf.append(u'\\');
2004
2.13k
        }
2005
30.3M
        break;
2006
30.9M
    }
2007
30.9M
    buf.append(c);
2008
30.9M
}
2009
2010
void UnicodeSet::_appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
2011
12.9M
                              UBool escapeUnprintable) {
2012
12.9M
    _appendToPat(result, start, escapeUnprintable);
2013
12.9M
    if (start != end) {
2014
11.7M
        if ((start+1) != end ||
2015
                // Avoid writing what looks like a lead+trail surrogate pair.
2016
11.5M
                start == 0xdbff) {
2017
11.5M
            result.append(u'-');
2018
11.5M
        }
2019
11.7M
        _appendToPat(result, end, escapeUnprintable);
2020
11.7M
    }
2021
12.9M
}
2022
2023
/**
2024
 * Append a string representation of this set to result.  This will be
2025
 * a cleaned version of the string passed to applyPattern(), if there
2026
 * is one.  Otherwise it will be generated.
2027
 */
2028
UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
2029
                                      UBool escapeUnprintable) const
2030
98.6k
{
2031
98.6k
    if (pat != nullptr) {
2032
98.6k
        int32_t i;
2033
98.6k
        int32_t backslashCount = 0;
2034
3.58M
        for (i=0; i<patLen; ) {
2035
3.48M
            UChar32 c;
2036
3.48M
            U16_NEXT(pat, i, patLen, c);
2037
3.48M
            if (escapeUnprintable ?
2038
3.48M
                    ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
2039
                // If the unprintable character is preceded by an odd
2040
                // number of backslashes, then it has been escaped.
2041
                // Before unescaping it, we delete the final
2042
                // backslash.
2043
2.72M
                if ((backslashCount % 2) == 1) {
2044
0
                    result.truncate(result.length() - 1);
2045
0
                }
2046
2.72M
                ICU_Utility::escape(result, c);
2047
2.72M
                backslashCount = 0;
2048
2.72M
            } else {
2049
765k
                result.append(c);
2050
765k
                if (c == u'\\') {
2051
83.3k
                    ++backslashCount;
2052
682k
                } else {
2053
682k
                    backslashCount = 0;
2054
682k
                }
2055
765k
            }
2056
3.48M
        }
2057
98.6k
        return result;
2058
98.6k
    }
2059
2060
0
    return _generatePattern(result, escapeUnprintable);
2061
98.6k
}
2062
2063
/**
2064
 * Returns a string representation of this set.  If the result of
2065
 * calling this function is passed to a UnicodeSet constructor, it
2066
 * will produce another set that is equal to this one.
2067
 */
2068
UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
2069
                                     UBool escapeUnprintable) const
2070
0
{
2071
0
    result.truncate(0);
2072
0
    return _toPattern(result, escapeUnprintable);
2073
0
}
2074
2075
/**
2076
 * Generate and append a string representation of this set to result.
2077
 * This does not use this.pat, the cleaned up copy of the string
2078
 * passed to applyPattern().
2079
 */
2080
UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
2081
                                            UBool escapeUnprintable) const
2082
1.56M
{
2083
1.56M
    result.append(u'[');
2084
2085
1.56M
    int32_t i = 0;
2086
1.56M
    int32_t limit = len & ~1;  // = 2 * getRangeCount()
2087
2088
    // If the set contains at least 2 intervals and includes both
2089
    // MIN_VALUE and MAX_VALUE, then the inverse representation will
2090
    // be more economical.
2091
    //     if (getRangeCount() >= 2 &&
2092
    //             getRangeStart(0) == MIN_VALUE &&
2093
    //             getRangeEnd(last) == MAX_VALUE)
2094
    // Invariant: list[len-1] == HIGH == MAX_VALUE + 1
2095
    // If limit == len then len is even and the last range ends with MAX_VALUE.
2096
    //
2097
    // *But* do not write the inverse (complement) if there are strings.
2098
    // Since ICU 70, the '^' performs a code point complement which removes all strings.
2099
1.56M
    if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) {
2100
        // Emit the inverse
2101
7.24k
        result.append(u'^');
2102
        // Offsetting the inversion list index by one lets us
2103
        // iterate over the ranges of the set complement.
2104
7.24k
        i = 1;
2105
7.24k
        --limit;
2106
7.24k
    }
2107
2108
    // Emit the ranges as pairs.
2109
5.94M
    while (i < limit) {
2110
4.38M
        UChar32 start = list[i];  // getRangeStart()
2111
4.38M
        UChar32 end = list[i + 1] - 1;  // getRangeEnd() = range limit minus one
2112
4.38M
        if (!(0xd800 <= end && end <= 0xdbff)) {
2113
2.91M
            _appendToPat(result, start, end, escapeUnprintable);
2114
2.91M
            i += 2;
2115
2.91M
        } else {
2116
            // The range ends with a lead surrogate.
2117
            // Avoid writing what looks like a lead+trail surrogate pair.
2118
            // 1. Postpone ranges that start with a lead surrogate code point.
2119
1.46M
            int32_t firstLead = i;
2120
10.0M
            while ((i += 2) < limit && list[i] <= 0xdbff) {}
2121
1.46M
            int32_t firstAfterLead = i;
2122
            // 2. Write following ranges that start with a trail surrogate code point.
2123
1.47M
            while (i < limit && (start = list[i]) <= 0xdfff) {
2124
11.2k
                _appendToPat(result, start, list[i + 1] - 1, escapeUnprintable);
2125
11.2k
                i += 2;
2126
11.2k
            }
2127
            // 3. Now write the postponed ranges.
2128
11.4M
            for (int j = firstLead; j < firstAfterLead; j += 2) {
2129
10.0M
                _appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable);
2130
10.0M
            }
2131
1.46M
        }
2132
4.38M
    }
2133
2134
1.56M
    if (strings_ != nullptr) {
2135
1.90M
        for (int32_t i = 0; i<strings_->size(); ++i) {
2136
1.85M
            result.append(u'{');
2137
1.85M
            _appendToPat(result,
2138
1.85M
                         *static_cast<const UnicodeString*>(strings_->elementAt(i)),
2139
1.85M
                         escapeUnprintable);
2140
1.85M
            result.append(u'}');
2141
1.85M
        }
2142
44.1k
    }
2143
1.56M
    return result.append(u']');
2144
1.56M
}
2145
2146
/**
2147
* Release existing cached pattern
2148
*/
2149
268M
void UnicodeSet::releasePattern() {
2150
268M
    if (pat) {
2151
672k
        uprv_free(pat);
2152
672k
        pat = nullptr;
2153
672k
        patLen = 0;
2154
672k
    }
2155
268M
}
2156
2157
/**
2158
* Set the new pattern to cache.
2159
*/
2160
672k
void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) {
2161
672k
    releasePattern();
2162
672k
    pat = static_cast<char16_t*>(uprv_malloc((newPatLen + 1) * sizeof(char16_t)));
2163
672k
    if (pat) {
2164
672k
        patLen = newPatLen;
2165
672k
        u_memcpy(pat, newPat, patLen);
2166
672k
        pat[patLen] = 0;
2167
672k
    }
2168
    // else we don't care if malloc failed. This was just a nice cache.
2169
    // We can regenerate an equivalent pattern later when requested.
2170
672k
}
2171
2172
400k
UnicodeSet *UnicodeSet::freeze() {
2173
400k
    if(!isFrozen() && !isBogus()) {
2174
400k
        compact();
2175
2176
        // Optimize contains() and span() and similar functions.
2177
400k
        if (hasStrings()) {
2178
7
            stringSpan = new UnicodeSetStringSpan(*this, *strings_, UnicodeSetStringSpan::ALL);
2179
7
            if (stringSpan == nullptr) {
2180
0
                setToBogus();
2181
0
                return this;
2182
7
            } else if (!stringSpan->needsStringSpanUTF16()) {
2183
                // All strings are irrelevant for span() etc. because
2184
                // all of each string's code points are contained in this set.
2185
                // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2186
                // many relevant strings as UTF-16.
2187
                // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2188
0
                delete stringSpan;
2189
0
                stringSpan = nullptr;
2190
0
            }
2191
7
        }
2192
400k
        if (stringSpan == nullptr) {
2193
            // No span-relevant strings: Optimize for code point spans.
2194
400k
            bmpSet=new BMPSet(list, len);
2195
400k
            if (bmpSet == nullptr) { // Check for memory allocation error.
2196
0
                setToBogus();
2197
0
            }
2198
400k
        }
2199
400k
    }
2200
400k
    return this;
2201
400k
}
2202
2203
11.0k
int32_t UnicodeSet::span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const {
2204
11.0k
    if(length>0 && bmpSet!=nullptr) {
2205
5.52k
        return static_cast<int32_t>(bmpSet->span(s, s + length, spanCondition) - s);
2206
5.52k
    }
2207
5.52k
    if(length<0) {
2208
0
        length=u_strlen(s);
2209
0
    }
2210
5.52k
    if(length==0) {
2211
0
        return 0;
2212
0
    }
2213
5.52k
    if(stringSpan!=nullptr) {
2214
0
        return stringSpan->span(s, length, spanCondition);
2215
5.52k
    } else if(hasStrings()) {
2216
0
        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2217
0
                            UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2218
0
                            UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2219
0
        UnicodeSetStringSpan strSpan(*this, *strings_, which);
2220
0
        if(strSpan.needsStringSpanUTF16()) {
2221
0
            return strSpan.span(s, length, spanCondition);
2222
0
        }
2223
0
    }
2224
2225
5.52k
    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2226
5.52k
        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2227
5.52k
    }
2228
2229
5.52k
    UChar32 c;
2230
5.52k
    int32_t start=0, prev=0;
2231
9.02k
    do {
2232
9.02k
        U16_NEXT(s, start, length, c);
2233
9.02k
        if(spanCondition!=contains(c)) {
2234
4.90k
            break;
2235
4.90k
        }
2236
9.02k
    } while((prev=start)<length);
2237
5.52k
    return prev;
2238
5.52k
}
2239
2240
4.90k
int32_t UnicodeSet::spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const {
2241
4.90k
    if(length>0 && bmpSet!=nullptr) {
2242
4.90k
        return static_cast<int32_t>(bmpSet->spanBack(s, s + length, spanCondition) - s);
2243
4.90k
    }
2244
0
    if(length<0) {
2245
0
        length=u_strlen(s);
2246
0
    }
2247
0
    if(length==0) {
2248
0
        return 0;
2249
0
    }
2250
0
    if(stringSpan!=nullptr) {
2251
0
        return stringSpan->spanBack(s, length, spanCondition);
2252
0
    } else if(hasStrings()) {
2253
0
        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2254
0
                            UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2255
0
                            UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2256
0
        UnicodeSetStringSpan strSpan(*this, *strings_, which);
2257
0
        if(strSpan.needsStringSpanUTF16()) {
2258
0
            return strSpan.spanBack(s, length, spanCondition);
2259
0
        }
2260
0
    }
2261
2262
0
    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2263
0
        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2264
0
    }
2265
2266
0
    UChar32 c;
2267
0
    int32_t prev=length;
2268
0
    do {
2269
0
        U16_PREV(s, 0, length, c);
2270
0
        if(spanCondition!=contains(c)) {
2271
0
            break;
2272
0
        }
2273
0
    } while((prev=length)>0);
2274
0
    return prev;
2275
0
}
2276
2277
4.90k
int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2278
4.90k
    if(length>0 && bmpSet!=nullptr) {
2279
4.90k
        const uint8_t* s0 = reinterpret_cast<const uint8_t*>(s);
2280
4.90k
        return static_cast<int32_t>(bmpSet->spanUTF8(s0, length, spanCondition) - s0);
2281
4.90k
    }
2282
0
    if(length<0) {
2283
0
        length = static_cast<int32_t>(uprv_strlen(s));
2284
0
    }
2285
0
    if(length==0) {
2286
0
        return 0;
2287
0
    }
2288
0
    if(stringSpan!=nullptr) {
2289
0
        return stringSpan->spanUTF8(reinterpret_cast<const uint8_t*>(s), length, spanCondition);
2290
0
    } else if(hasStrings()) {
2291
0
        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2292
0
                            UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2293
0
                            UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2294
0
        UnicodeSetStringSpan strSpan(*this, *strings_, which);
2295
0
        if(strSpan.needsStringSpanUTF8()) {
2296
0
            return strSpan.spanUTF8(reinterpret_cast<const uint8_t*>(s), length, spanCondition);
2297
0
        }
2298
0
    }
2299
2300
0
    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2301
0
        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2302
0
    }
2303
2304
0
    UChar32 c;
2305
0
    int32_t start=0, prev=0;
2306
0
    do {
2307
0
        U8_NEXT_OR_FFFD(s, start, length, c);
2308
0
        if(spanCondition!=contains(c)) {
2309
0
            break;
2310
0
        }
2311
0
    } while((prev=start)<length);
2312
0
    return prev;
2313
0
}
2314
2315
4.90k
int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2316
4.90k
    if(length>0 && bmpSet!=nullptr) {
2317
4.90k
        const uint8_t* s0 = reinterpret_cast<const uint8_t*>(s);
2318
4.90k
        return bmpSet->spanBackUTF8(s0, length, spanCondition);
2319
4.90k
    }
2320
0
    if(length<0) {
2321
0
        length = static_cast<int32_t>(uprv_strlen(s));
2322
0
    }
2323
0
    if(length==0) {
2324
0
        return 0;
2325
0
    }
2326
0
    if(stringSpan!=nullptr) {
2327
0
        return stringSpan->spanBackUTF8(reinterpret_cast<const uint8_t*>(s), length, spanCondition);
2328
0
    } else if(hasStrings()) {
2329
0
        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2330
0
                            UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2331
0
                            UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2332
0
        UnicodeSetStringSpan strSpan(*this, *strings_, which);
2333
0
        if(strSpan.needsStringSpanUTF8()) {
2334
0
            return strSpan.spanBackUTF8(reinterpret_cast<const uint8_t*>(s), length, spanCondition);
2335
0
        }
2336
0
    }
2337
2338
0
    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2339
0
        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2340
0
    }
2341
2342
0
    UChar32 c;
2343
0
    int32_t prev=length;
2344
0
    do {
2345
0
        U8_PREV_OR_FFFD(s, 0, length, c);
2346
0
        if(spanCondition!=contains(c)) {
2347
0
            break;
2348
0
        }
2349
0
    } while((prev=length)>0);
2350
0
    return prev;
2351
0
}
2352
2353
U_NAMESPACE_END