Coverage Report

Created: 2025-07-11 06:23

/src/icu/source/common/uniset.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
**********************************************************************
5
*   Copyright (C) 1999-2015, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
**********************************************************************
8
*   Date        Name        Description
9
*   10/20/99    alan        Creation.
10
**********************************************************************
11
*/
12
13
#include "unicode/utypes.h"
14
#include "unicode/parsepos.h"
15
#include "unicode/symtable.h"
16
#include "unicode/uniset.h"
17
#include "unicode/utf8.h"
18
#include "unicode/utf16.h"
19
#include "ruleiter.h"
20
#include "cmemory.h"
21
#include "cstring.h"
22
#include "patternprops.h"
23
#include "uelement.h"
24
#include "util.h"
25
#include "uvector.h"
26
#include "charstr.h"
27
#include "ustrfmt.h"
28
#include "uassert.h"
29
#include "bmpset.h"
30
#include "unisetspan.h"
31
32
// Define UChar constants using hex for EBCDIC compatibility
33
// Used #define to reduce private static exports and memory access time.
34
0
#define SET_OPEN        ((UChar)0x005B) /*[*/
35
0
#define SET_CLOSE       ((UChar)0x005D) /*]*/
36
0
#define HYPHEN          ((UChar)0x002D) /*-*/
37
0
#define COMPLEMENT      ((UChar)0x005E) /*^*/
38
0
#define COLON           ((UChar)0x003A) /*:*/
39
0
#define BACKSLASH       ((UChar)0x005C) /*\*/
40
0
#define INTERSECTION    ((UChar)0x0026) /*&*/
41
#define UPPER_U         ((UChar)0x0055) /*U*/
42
#define LOWER_U         ((UChar)0x0075) /*u*/
43
0
#define OPEN_BRACE      ((UChar)123)    /*{*/
44
0
#define CLOSE_BRACE     ((UChar)125)    /*}*/
45
#define UPPER_P         ((UChar)0x0050) /*P*/
46
#define LOWER_P         ((UChar)0x0070) /*p*/
47
#define UPPER_N         ((UChar)78)     /*N*/
48
#define EQUALS          ((UChar)0x003D) /*=*/
49
50
// HIGH_VALUE > all valid values. 110000 for codepoints
51
0
#define UNICODESET_HIGH 0x0110000
52
53
// LOW <= all valid values. ZERO for codepoints
54
0
#define UNICODESET_LOW 0x000000
55
56
// initial storage. Must be >= 0
57
0
#define START_EXTRA 16
58
59
// extra amount for growth. Must be >= 0
60
0
#define GROW_EXTRA START_EXTRA
61
62
U_NAMESPACE_BEGIN
63
64
0
SymbolTable::~SymbolTable() {}
65
66
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
67
68
/**
69
 * Modify the given UChar32 variable so that it is in range, by
70
 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
71
 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
72
 * It modifies its argument in-place and also returns it.
73
 */
74
0
static inline UChar32 pinCodePoint(UChar32& c) {
75
0
    if (c < UNICODESET_LOW) {
76
0
        c = UNICODESET_LOW;
77
0
    } else if (c > (UNICODESET_HIGH-1)) {
78
0
        c = (UNICODESET_HIGH-1);
79
0
    }
80
0
    return c;
81
0
}
82
83
//----------------------------------------------------------------
84
// Debugging
85
//----------------------------------------------------------------
86
87
// DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
88
// To enable the debugging, define the symbol DEBUG_MEM in the line
89
// below.  This will result in text being sent to stdout that looks
90
// like this:
91
//   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
92
//   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
93
// Each line lists a construction (ct) or destruction (dt) event, the
94
// object address, the number of outstanding objects after the event,
95
// and the pattern of the object in question.
96
97
// #define DEBUG_MEM
98
99
#ifdef DEBUG_MEM
100
#include <stdio.h>
101
static int32_t _dbgCount = 0;
102
103
static inline void _dbgct(UnicodeSet* set) {
104
    UnicodeString str;
105
    set->toPattern(str, TRUE);
106
    char buf[40];
107
    str.extract(0, 39, buf, "");
108
    printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
109
}
110
111
static inline void _dbgdt(UnicodeSet* set) {
112
    UnicodeString str;
113
    set->toPattern(str, TRUE);
114
    char buf[40];
115
    str.extract(0, 39, buf, "");
116
    printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
117
}
118
119
#else
120
121
#define _dbgct(set)
122
#define _dbgdt(set)
123
124
#endif
125
126
//----------------------------------------------------------------
127
// UnicodeString in UVector support
128
//----------------------------------------------------------------
129
130
0
static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
131
0
    dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
132
0
}
133
134
0
static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
135
0
    const UnicodeString &a = *(const UnicodeString*)t1.pointer;
136
0
    const UnicodeString &b = *(const UnicodeString*)t2.pointer;
137
0
    return a.compare(b);
138
0
}
139
140
//----------------------------------------------------------------
141
// Constructors &c
142
//----------------------------------------------------------------
143
144
/**
145
 * Constructs an empty set.
146
 */
147
UnicodeSet::UnicodeSet() :
148
0
    len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
149
0
    bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
150
0
    fFlags(0)
151
0
{
152
0
    UErrorCode status = U_ZERO_ERROR;
153
0
    allocateStrings(status);
154
0
    if (U_FAILURE(status)) {
155
0
        return;
156
0
    }
157
0
    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
158
0
    if(list!=NULL){
159
0
        list[0] = UNICODESET_HIGH;
160
0
    } else { // If memory allocation failed, set to bogus state.
161
0
        setToBogus();
162
0
        return;
163
0
    }
164
0
    _dbgct(this);
165
0
}
166
167
/**
168
 * Constructs a set containing the given range. If <code>end >
169
 * start</code> then an empty set is created.
170
 *
171
 * @param start first character, inclusive, of range
172
 * @param end last character, inclusive, of range
173
 */
174
UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
175
0
    len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
176
0
    bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
177
0
    fFlags(0)
178
0
{
179
0
    UErrorCode status = U_ZERO_ERROR;
180
0
    allocateStrings(status);
181
0
    if (U_FAILURE(status)) {
182
0
        return;
183
0
    }
184
0
    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
185
0
    if(list!=NULL){
186
0
        list[0] = UNICODESET_HIGH;
187
0
        complement(start, end);
188
0
    } else { // If memory allocation failed, set to bogus state.
189
0
        setToBogus();
190
0
        return;
191
0
    }
192
0
    _dbgct(this);
193
0
}
194
195
/**
196
 * Constructs a set that is identical to the given UnicodeSet.
197
 */
198
UnicodeSet::UnicodeSet(const UnicodeSet& o) :
199
0
    UnicodeFilter(o),
200
0
    len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
201
0
    bmpSet(0),
202
0
    buffer(0), bufferCapacity(0),
203
0
    patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
204
0
    fFlags(0)
205
0
{
206
0
    UErrorCode status = U_ZERO_ERROR;
207
0
    allocateStrings(status);
208
0
    if (U_FAILURE(status)) {
209
0
        return;
210
0
    }
211
0
    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
212
0
    if(list!=NULL){
213
0
        *this = o;
214
0
    } else { // If memory allocation failed, set to bogus state.
215
0
        setToBogus();
216
0
        return;
217
0
    }
218
0
    _dbgct(this);
219
0
}
220
221
// Copy-construct as thawed.
222
UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
223
0
    UnicodeFilter(o),
224
0
    len(0), capacity(o.len + GROW_EXTRA), list(0),
225
0
    bmpSet(0),
226
0
    buffer(0), bufferCapacity(0),
227
0
    patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
228
0
    fFlags(0)
229
0
{
230
0
    UErrorCode status = U_ZERO_ERROR;
231
0
    allocateStrings(status);
232
0
    if (U_FAILURE(status)) {
233
0
        return;
234
0
    }
235
0
    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
236
0
    if(list!=NULL){
237
        // *this = o except for bmpSet and stringSpan
238
0
        len = o.len;
239
0
        uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
240
0
        if (strings != NULL && o.strings != NULL) {
241
0
            strings->assign(*o.strings, cloneUnicodeString, status);
242
0
        } else { // Invalid strings.
243
0
            setToBogus();
244
0
            return;
245
0
        }
246
0
        if (o.pat) {
247
0
            setPattern(UnicodeString(o.pat, o.patLen));
248
0
        }
249
0
    } else { // If memory allocation failed, set to bogus state.
250
0
        setToBogus();
251
0
        return;
252
0
    }
253
0
    _dbgct(this);
254
0
}
255
256
/**
257
 * Destructs the set.
258
 */
259
0
UnicodeSet::~UnicodeSet() {
260
0
    _dbgdt(this); // first!
261
0
    uprv_free(list);
262
0
    delete bmpSet;
263
0
    if (buffer) {
264
0
        uprv_free(buffer);
265
0
    }
266
0
    delete strings;
267
0
    delete stringSpan;
268
0
    releasePattern();
269
0
}
270
271
/**
272
 * Assigns this object to be a copy of another.
273
 */
274
0
UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
275
0
    if (this == &o) {
276
0
        return *this;
277
0
    }
278
0
    if (isFrozen()) {
279
0
        return *this;
280
0
    }
281
0
    if (o.isBogus()) {
282
0
        setToBogus();
283
0
        return *this;
284
0
    }
285
0
    UErrorCode ec = U_ZERO_ERROR;
286
0
    ensureCapacity(o.len, ec);
287
0
    if (U_FAILURE(ec)) {
288
0
        return *this; // There is no way to report this error :-(
289
0
    }
290
0
    len = o.len;
291
0
    uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
292
0
    if (o.bmpSet == NULL) {
293
0
        bmpSet = NULL;
294
0
    } else {
295
0
        bmpSet = new BMPSet(*o.bmpSet, list, len);
296
0
        if (bmpSet == NULL) { // Check for memory allocation error.
297
0
            setToBogus();
298
0
            return *this;
299
0
        }
300
0
    }
301
0
    if (strings != NULL && o.strings != NULL) {
302
0
        strings->assign(*o.strings, cloneUnicodeString, ec);
303
0
    } else { // Invalid strings.
304
0
        setToBogus();
305
0
        return *this;
306
0
    }
307
0
    if (o.stringSpan == NULL) {
308
0
        stringSpan = NULL;
309
0
    } else {
310
0
        stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
311
0
        if (stringSpan == NULL) { // Check for memory allocation error.
312
0
            setToBogus();
313
0
            return *this;
314
0
        }
315
0
    }
316
0
    releasePattern();
317
0
    if (o.pat) {
318
0
        setPattern(UnicodeString(o.pat, o.patLen));
319
0
    }
320
0
    return *this;
321
0
}
322
323
/**
324
 * Returns a copy of this object.  All UnicodeMatcher objects have
325
 * to support cloning in order to allow classes using
326
 * UnicodeMatchers, such as Transliterator, to implement cloning.
327
 */
328
0
UnicodeFunctor* UnicodeSet::clone() const {
329
0
    return new UnicodeSet(*this);
330
0
}
331
332
0
UnicodeFunctor *UnicodeSet::cloneAsThawed() const {
333
0
    return new UnicodeSet(*this, TRUE);
334
0
}
335
336
/**
337
 * Compares the specified object with this set for equality.  Returns
338
 * <tt>true</tt> if the two sets
339
 * have the same size, and every member of the specified set is
340
 * contained in this set (or equivalently, every member of this set is
341
 * contained in the specified set).
342
 *
343
 * @param o set to be compared for equality with this set.
344
 * @return <tt>true</tt> if the specified set is equal to this set.
345
 */
346
0
UBool UnicodeSet::operator==(const UnicodeSet& o) const {
347
0
    if (len != o.len) return FALSE;
348
0
    for (int32_t i = 0; i < len; ++i) {
349
0
        if (list[i] != o.list[i]) return FALSE;
350
0
    }
351
0
    if (*strings != *o.strings) return FALSE;
352
0
    return TRUE;
353
0
}
354
355
/**
356
 * Returns the hash code value for this set.
357
 *
358
 * @return the hash code value for this set.
359
 * @see Object#hashCode()
360
 */
361
0
int32_t UnicodeSet::hashCode(void) const {
362
0
    int32_t result = len;
363
0
    for (int32_t i = 0; i < len; ++i) {
364
0
        result *= 1000003;
365
0
        result += list[i];
366
0
    }
367
0
    return result;
368
0
}
369
370
//----------------------------------------------------------------
371
// Public API
372
//----------------------------------------------------------------
373
374
/**
375
 * Returns the number of elements in this set (its cardinality),
376
 * Note than the elements of a set may include both individual
377
 * codepoints and strings.
378
 *
379
 * @return the number of elements in this set (its cardinality).
380
 */
381
0
int32_t UnicodeSet::size(void) const {
382
0
    int32_t n = 0;
383
0
    int32_t count = getRangeCount();
384
0
    for (int32_t i = 0; i < count; ++i) {
385
0
        n += getRangeEnd(i) - getRangeStart(i) + 1;
386
0
    }
387
0
    return n + strings->size();
388
0
}
389
390
/**
391
 * Returns <tt>true</tt> if this set contains no elements.
392
 *
393
 * @return <tt>true</tt> if this set contains no elements.
394
 */
395
0
UBool UnicodeSet::isEmpty(void) const {
396
0
    return len == 1 && strings->size() == 0;
397
0
}
398
399
/**
400
 * Returns true if this set contains the given character.
401
 * @param c character to be checked for containment
402
 * @return true if the test condition is met
403
 */
404
0
UBool UnicodeSet::contains(UChar32 c) const {
405
    // Set i to the index of the start item greater than ch
406
    // We know we will terminate without length test!
407
    // LATER: for large sets, add binary search
408
    //int32_t i = -1;
409
    //for (;;) {
410
    //    if (c < list[++i]) break;
411
    //}
412
0
    if (bmpSet != NULL) {
413
0
        return bmpSet->contains(c);
414
0
    }
415
0
    if (stringSpan != NULL) {
416
0
        return stringSpan->contains(c);
417
0
    }
418
0
    if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
419
0
        return FALSE;
420
0
    }
421
0
    int32_t i = findCodePoint(c);
422
0
    return (UBool)(i & 1); // return true if odd
423
0
}
424
425
/**
426
 * Returns the smallest value i such that c < list[i].  Caller
427
 * must ensure that c is a legal value or this method will enter
428
 * an infinite loop.  This method performs a binary search.
429
 * @param c a character in the range MIN_VALUE..MAX_VALUE
430
 * inclusive
431
 * @return the smallest integer i in the range 0..len-1,
432
 * inclusive, such that c < list[i]
433
 */
434
0
int32_t UnicodeSet::findCodePoint(UChar32 c) const {
435
    /* Examples:
436
                                       findCodePoint(c)
437
       set              list[]         c=0 1 3 4 7 8
438
       ===              ==============   ===========
439
       []               [110000]         0 0 0 0 0 0
440
       [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
441
       [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
442
       [:Any:]          [0, 110000]      1 1 1 1 1 1
443
     */
444
445
    // Return the smallest i such that c < list[i].  Assume
446
    // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
447
0
    if (c < list[0])
448
0
        return 0;
449
    // High runner test.  c is often after the last range, so an
450
    // initial check for this condition pays off.
451
0
    int32_t lo = 0;
452
0
    int32_t hi = len - 1;
453
0
    if (lo >= hi || c >= list[hi-1])
454
0
        return hi;
455
    // invariant: c >= list[lo]
456
    // invariant: c < list[hi]
457
0
    for (;;) {
458
0
        int32_t i = (lo + hi) >> 1;
459
0
        if (i == lo) {
460
0
            break; // Found!
461
0
        } else if (c < list[i]) {
462
0
            hi = i;
463
0
        } else {
464
0
            lo = i;
465
0
        }
466
0
    }
467
0
    return hi;
468
0
}
469
470
/**
471
 * Returns true if this set contains every character
472
 * of the given range.
473
 * @param start first character, inclusive, of the range
474
 * @param end last character, inclusive, of the range
475
 * @return true if the test condition is met
476
 */
477
0
UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
478
    //int32_t i = -1;
479
    //for (;;) {
480
    //    if (start < list[++i]) break;
481
    //}
482
0
    int32_t i = findCodePoint(start);
483
0
    return ((i & 1) != 0 && end < list[i]);
484
0
}
485
486
/**
487
 * Returns <tt>true</tt> if this set contains the given
488
 * multicharacter string.
489
 * @param s string to be checked for containment
490
 * @return <tt>true</tt> if this set contains the specified string
491
 */
492
0
UBool UnicodeSet::contains(const UnicodeString& s) const {
493
0
    if (s.length() == 0) return FALSE;
494
0
    int32_t cp = getSingleCP(s);
495
0
    if (cp < 0) {
496
0
        return strings->contains((void*) &s);
497
0
    } else {
498
0
        return contains((UChar32) cp);
499
0
    }
500
0
}
501
502
/**
503
 * Returns true if this set contains all the characters and strings
504
 * of the given set.
505
 * @param c set to be checked for containment
506
 * @return true if the test condition is met
507
 */
508
0
UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
509
    // The specified set is a subset if all of its pairs are contained in
510
    // this set.  It's possible to code this more efficiently in terms of
511
    // direct manipulation of the inversion lists if the need arises.
512
0
    int32_t n = c.getRangeCount();
513
0
    for (int i=0; i<n; ++i) {
514
0
        if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
515
0
            return FALSE;
516
0
        }
517
0
    }
518
0
    if (!strings->containsAll(*c.strings)) return FALSE;
519
0
    return TRUE;
520
0
}
521
522
/**
523
 * Returns true if this set contains all the characters
524
 * of the given string.
525
 * @param s string containing characters to be checked for containment
526
 * @return true if the test condition is met
527
 */
528
0
UBool UnicodeSet::containsAll(const UnicodeString& s) const {
529
0
    return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
530
0
                   s.length());
531
0
}
532
533
/**
534
 * Returns true if this set contains none of the characters
535
 * of the given range.
536
 * @param start first character, inclusive, of the range
537
 * @param end last character, inclusive, of the range
538
 * @return true if the test condition is met
539
 */
540
0
UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
541
    //int32_t i = -1;
542
    //for (;;) {
543
    //    if (start < list[++i]) break;
544
    //}
545
0
    int32_t i = findCodePoint(start);
546
0
    return ((i & 1) == 0 && end < list[i]);
547
0
}
548
549
/**
550
 * Returns true if this set contains none of the characters and strings
551
 * of the given set.
552
 * @param c set to be checked for containment
553
 * @return true if the test condition is met
554
 */
555
0
UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
556
    // The specified set is a subset if all of its pairs are contained in
557
    // this set.  It's possible to code this more efficiently in terms of
558
    // direct manipulation of the inversion lists if the need arises.
559
0
    int32_t n = c.getRangeCount();
560
0
    for (int32_t i=0; i<n; ++i) {
561
0
        if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
562
0
            return FALSE;
563
0
        }
564
0
    }
565
0
    if (!strings->containsNone(*c.strings)) return FALSE;
566
0
    return TRUE;
567
0
}
568
569
/**
570
 * Returns true if this set contains none of the characters
571
 * of the given string.
572
 * @param s string containing characters to be checked for containment
573
 * @return true if the test condition is met
574
 */
575
0
UBool UnicodeSet::containsNone(const UnicodeString& s) const {
576
0
    return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
577
0
                   s.length());
578
0
}
579
580
/**
581
 * Returns <tt>true</tt> if this set contains any character whose low byte
582
 * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
583
 * indexing.
584
 */
585
0
UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
586
    /* The index value v, in the range [0,255], is contained in this set if
587
     * it is contained in any pair of this set.  Pairs either have the high
588
     * bytes equal, or unequal.  If the high bytes are equal, then we have
589
     * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
590
     * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
591
     * Then v is contained if xx <= v || v <= yy.  (This is identical to the
592
     * time zone month containment logic.)
593
     */
594
0
    int32_t i;
595
0
    int32_t rangeCount=getRangeCount();
596
0
    for (i=0; i<rangeCount; ++i) {
597
0
        UChar32 low = getRangeStart(i);
598
0
        UChar32 high = getRangeEnd(i);
599
0
        if ((low & ~0xFF) == (high & ~0xFF)) {
600
0
            if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
601
0
                return TRUE;
602
0
            }
603
0
        } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
604
0
            return TRUE;
605
0
        }
606
0
    }
607
0
    if (strings->size() != 0) {
608
0
        for (i=0; i<strings->size(); ++i) {
609
0
            const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
610
            //if (s.length() == 0) {
611
            //    // Empty strings match everything
612
            //    return TRUE;
613
            //}
614
            // assert(s.length() != 0); // We enforce this elsewhere
615
0
            UChar32 c = s.char32At(0);
616
0
            if ((c & 0xFF) == v) {
617
0
                return TRUE;
618
0
            }
619
0
        }
620
0
    }
621
0
    return FALSE;
622
0
}
623
624
/**
625
 * Implementation of UnicodeMatcher::matches().  Always matches the
626
 * longest possible multichar string.
627
 */
628
UMatchDegree UnicodeSet::matches(const Replaceable& text,
629
                                 int32_t& offset,
630
                                 int32_t limit,
631
0
                                 UBool incremental) {
632
0
    if (offset == limit) {
633
        // Strings, if any, have length != 0, so we don't worry
634
        // about them here.  If we ever allow zero-length strings
635
        // we much check for them here.
636
0
        if (contains(U_ETHER)) {
637
0
            return incremental ? U_PARTIAL_MATCH : U_MATCH;
638
0
        } else {
639
0
            return U_MISMATCH;
640
0
        }
641
0
    } else {
642
0
        if (strings->size() != 0) { // try strings first
643
644
            // might separate forward and backward loops later
645
            // for now they are combined
646
647
            // TODO Improve efficiency of this, at least in the forward
648
            // direction, if not in both.  In the forward direction we
649
            // can assume the strings are sorted.
650
651
0
            int32_t i;
652
0
            UBool forward = offset < limit;
653
654
            // firstChar is the leftmost char to match in the
655
            // forward direction or the rightmost char to match in
656
            // the reverse direction.
657
0
            UChar firstChar = text.charAt(offset);
658
659
            // If there are multiple strings that can match we
660
            // return the longest match.
661
0
            int32_t highWaterLength = 0;
662
663
0
            for (i=0; i<strings->size(); ++i) {
664
0
                const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
665
666
                //if (trial.length() == 0) {
667
                //    return U_MATCH; // null-string always matches
668
                //}
669
                // assert(trial.length() != 0); // We ensure this elsewhere
670
671
0
                UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
672
673
                // Strings are sorted, so we can optimize in the
674
                // forward direction.
675
0
                if (forward && c > firstChar) break;
676
0
                if (c != firstChar) continue;
677
678
0
                int32_t matchLen = matchRest(text, offset, limit, trial);
679
680
0
                if (incremental) {
681
0
                    int32_t maxLen = forward ? limit-offset : offset-limit;
682
0
                    if (matchLen == maxLen) {
683
                        // We have successfully matched but only up to limit.
684
0
                        return U_PARTIAL_MATCH;
685
0
                    }
686
0
                }
687
688
0
                if (matchLen == trial.length()) {
689
                    // We have successfully matched the whole string.
690
0
                    if (matchLen > highWaterLength) {
691
0
                        highWaterLength = matchLen;
692
0
                    }
693
                    // In the forward direction we know strings
694
                    // are sorted so we can bail early.
695
0
                    if (forward && matchLen < highWaterLength) {
696
0
                        break;
697
0
                    }
698
0
                    continue;
699
0
                }
700
0
            }
701
702
            // We've checked all strings without a partial match.
703
            // If we have full matches, return the longest one.
704
0
            if (highWaterLength != 0) {
705
0
                offset += forward ? highWaterLength : -highWaterLength;
706
0
                return U_MATCH;
707
0
            }
708
0
        }
709
0
        return UnicodeFilter::matches(text, offset, limit, incremental);
710
0
    }
711
0
}
712
713
/**
714
 * Returns the longest match for s in text at the given position.
715
 * If limit > start then match forward from start+1 to limit
716
 * matching all characters except s.charAt(0).  If limit < start,
717
 * go backward starting from start-1 matching all characters
718
 * except s.charAt(s.length()-1).  This method assumes that the
719
 * first character, text.charAt(start), matches s, so it does not
720
 * check it.
721
 * @param text the text to match
722
 * @param start the first character to match.  In the forward
723
 * direction, text.charAt(start) is matched against s.charAt(0).
724
 * In the reverse direction, it is matched against
725
 * s.charAt(s.length()-1).
726
 * @param limit the limit offset for matching, either last+1 in
727
 * the forward direction, or last-1 in the reverse direction,
728
 * where last is the index of the last character to match.
729
 * @return If part of s matches up to the limit, return |limit -
730
 * start|.  If all of s matches before reaching the limit, return
731
 * s.length().  If there is a mismatch between s and text, return
732
 * 0
733
 */
734
int32_t UnicodeSet::matchRest(const Replaceable& text,
735
                              int32_t start, int32_t limit,
736
0
                              const UnicodeString& s) {
737
0
    int32_t i;
738
0
    int32_t maxLen;
739
0
    int32_t slen = s.length();
740
0
    if (start < limit) {
741
0
        maxLen = limit - start;
742
0
        if (maxLen > slen) maxLen = slen;
743
0
        for (i = 1; i < maxLen; ++i) {
744
0
            if (text.charAt(start + i) != s.charAt(i)) return 0;
745
0
        }
746
0
    } else {
747
0
        maxLen = start - limit;
748
0
        if (maxLen > slen) maxLen = slen;
749
0
        --slen; // <=> slen = s.length() - 1;
750
0
        for (i = 1; i < maxLen; ++i) {
751
0
            if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
752
0
        }
753
0
    }
754
0
    return maxLen;
755
0
}
756
757
/**
758
 * Implement of UnicodeMatcher
759
 */
760
0
void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
761
0
    toUnionTo.addAll(*this);
762
0
}
763
764
/**
765
 * Returns the index of the given character within this set, where
766
 * the set is ordered by ascending code point.  If the character
767
 * is not in this set, return -1.  The inverse of this method is
768
 * <code>charAt()</code>.
769
 * @return an index from 0..size()-1, or -1
770
 */
771
0
int32_t UnicodeSet::indexOf(UChar32 c) const {
772
0
    if (c < MIN_VALUE || c > MAX_VALUE) {
773
0
        return -1;
774
0
    }
775
0
    int32_t i = 0;
776
0
    int32_t n = 0;
777
0
    for (;;) {
778
0
        UChar32 start = list[i++];
779
0
        if (c < start) {
780
0
            return -1;
781
0
        }
782
0
        UChar32 limit = list[i++];
783
0
        if (c < limit) {
784
0
            return n + c - start;
785
0
        }
786
0
        n += limit - start;
787
0
    }
788
0
}
789
790
/**
791
 * Returns the character at the given index within this set, where
792
 * the set is ordered by ascending code point.  If the index is
793
 * out of range, return (UChar32)-1.  The inverse of this method is
794
 * <code>indexOf()</code>.
795
 * @param index an index from 0..size()-1
796
 * @return the character at the given index, or (UChar32)-1.
797
 */
798
0
UChar32 UnicodeSet::charAt(int32_t index) const {
799
0
    if (index >= 0) {
800
        // len2 is the largest even integer <= len, that is, it is len
801
        // for even values and len-1 for odd values.  With odd values
802
        // the last entry is UNICODESET_HIGH.
803
0
        int32_t len2 = len & ~1;
804
0
        for (int32_t i=0; i < len2;) {
805
0
            UChar32 start = list[i++];
806
0
            int32_t count = list[i++] - start;
807
0
            if (index < count) {
808
0
                return (UChar32)(start + index);
809
0
            }
810
0
            index -= count;
811
0
        }
812
0
    }
813
0
    return (UChar32)-1;
814
0
}
815
816
/**
817
 * Make this object represent the range <code>start - end</code>.
818
 * If <code>end > start</code> then this object is set to an
819
 * an empty range.
820
 *
821
 * @param start first character in the set, inclusive
822
 * @rparam end last character in the set, inclusive
823
 */
824
0
UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
825
0
    clear();
826
0
    complement(start, end);
827
0
    return *this;
828
0
}
829
830
/**
831
 * Adds the specified range to this set if it is not already
832
 * present.  If this set already contains the specified range,
833
 * the call leaves this set unchanged.  If <code>end > start</code>
834
 * then an empty range is added, leaving the set unchanged.
835
 *
836
 * @param start first character, inclusive, of range to be added
837
 * to this set.
838
 * @param end last character, inclusive, of range to be added
839
 * to this set.
840
 */
841
0
UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
842
0
    if (pinCodePoint(start) < pinCodePoint(end)) {
843
0
        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
844
0
        add(range, 2, 0);
845
0
    } else if (start == end) {
846
0
        add(start);
847
0
    }
848
0
    return *this;
849
0
}
850
851
// #define DEBUG_US_ADD
852
853
#ifdef DEBUG_US_ADD
854
#include <stdio.h>
855
void dump(UChar32 c) {
856
    if (c <= 0xFF) {
857
        printf("%c", (char)c);
858
    } else {
859
        printf("U+%04X", c);
860
    }
861
}
862
void dump(const UChar32* list, int32_t len) {
863
    printf("[");
864
    for (int32_t i=0; i<len; ++i) {
865
        if (i != 0) printf(", ");
866
        dump(list[i]);
867
    }
868
    printf("]");
869
}
870
#endif
871
872
/**
873
 * Adds the specified character to this set if it is not already
874
 * present.  If this set already contains the specified character,
875
 * the call leaves this set unchanged.
876
 */
877
0
UnicodeSet& UnicodeSet::add(UChar32 c) {
878
    // find smallest i such that c < list[i]
879
    // if odd, then it is IN the set
880
    // if even, then it is OUT of the set
881
0
    int32_t i = findCodePoint(pinCodePoint(c));
882
883
    // already in set?
884
0
    if ((i & 1) != 0  || isFrozen() || isBogus()) return *this;
885
886
    // HIGH is 0x110000
887
    // assert(list[len-1] == HIGH);
888
889
    // empty = [HIGH]
890
    // [start_0, limit_0, start_1, limit_1, HIGH]
891
892
    // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
893
    //                             ^
894
    //                             list[i]
895
896
    // i == 0 means c is before the first range
897
898
#ifdef DEBUG_US_ADD
899
    printf("Add of ");
900
    dump(c);
901
    printf(" found at %d", i);
902
    printf(": ");
903
    dump(list, len);
904
    printf(" => ");
905
#endif
906
907
0
    if (c == list[i]-1) {
908
        // c is before start of next range
909
0
        list[i] = c;
910
        // if we touched the HIGH mark, then add a new one
911
0
        if (c == (UNICODESET_HIGH - 1)) {
912
0
            UErrorCode status = U_ZERO_ERROR;
913
0
            ensureCapacity(len+1, status);
914
0
            if (U_FAILURE(status)) {
915
0
                return *this; // There is no way to report this error :-(
916
0
            }
917
0
            list[len++] = UNICODESET_HIGH;
918
0
        }
919
0
        if (i > 0 && c == list[i-1]) {
920
            // collapse adjacent ranges
921
922
            // [..., start_k-1, c, c, limit_k, ..., HIGH]
923
            //                     ^
924
            //                     list[i]
925
926
            //for (int32_t k=i-1; k<len-2; ++k) {
927
            //    list[k] = list[k+2];
928
            //}
929
0
            UChar32* dst = list + i - 1;
930
0
            UChar32* src = dst + 2;
931
0
            UChar32* srclimit = list + len;
932
0
            while (src < srclimit) *(dst++) = *(src++);
933
934
0
            len -= 2;
935
0
        }
936
0
    }
937
938
0
    else if (i > 0 && c == list[i-1]) {
939
        // c is after end of prior range
940
0
        list[i-1]++;
941
        // no need to check for collapse here
942
0
    }
943
944
0
    else {
945
        // At this point we know the new char is not adjacent to
946
        // any existing ranges, and it is not 10FFFF.
947
948
949
        // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
950
        //                             ^
951
        //                             list[i]
952
953
        // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
954
        //                             ^
955
        //                             list[i]
956
957
0
        UErrorCode status = U_ZERO_ERROR;
958
0
        ensureCapacity(len+2, status);
959
0
        if (U_FAILURE(status)) {
960
0
            return *this; // There is no way to report this error :-(
961
0
        }
962
963
        //for (int32_t k=len-1; k>=i; --k) {
964
        //    list[k+2] = list[k];
965
        //}
966
0
        UChar32* src = list + len;
967
0
        UChar32* dst = src + 2;
968
0
        UChar32* srclimit = list + i;
969
0
        while (src > srclimit) *(--dst) = *(--src);
970
971
0
        list[i] = c;
972
0
        list[i+1] = c+1;
973
0
        len += 2;
974
0
    }
975
976
#ifdef DEBUG_US_ADD
977
    dump(list, len);
978
    printf("\n");
979
980
    for (i=1; i<len; ++i) {
981
        if (list[i] <= list[i-1]) {
982
            // Corrupt array!
983
            printf("ERROR: list has been corrupted\n");
984
            exit(1);
985
        }
986
    }
987
#endif
988
989
0
    releasePattern();
990
0
    return *this;
991
0
}
992
993
/**
994
 * Adds the specified multicharacter to this set if it is not already
995
 * present.  If this set already contains the multicharacter,
996
 * the call leaves this set unchanged.
997
 * Thus "ch" => {"ch"}
998
 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
999
 * @param s the source string
1000
 * @return the modified set, for chaining
1001
 */
1002
0
UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
1003
0
    if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1004
0
    int32_t cp = getSingleCP(s);
1005
0
    if (cp < 0) {
1006
0
        if (!strings->contains((void*) &s)) {
1007
0
            _add(s);
1008
0
            releasePattern();
1009
0
        }
1010
0
    } else {
1011
0
        add((UChar32)cp);
1012
0
    }
1013
0
    return *this;
1014
0
}
1015
1016
/**
1017
 * Adds the given string, in order, to 'strings'.  The given string
1018
 * must have been checked by the caller to not be empty and to not
1019
 * already be in 'strings'.
1020
 */
1021
0
void UnicodeSet::_add(const UnicodeString& s) {
1022
0
    if (isFrozen() || isBogus()) {
1023
0
        return;
1024
0
    }
1025
0
    UnicodeString* t = new UnicodeString(s);
1026
0
    if (t == NULL) { // Check for memory allocation error.
1027
0
        setToBogus();
1028
0
        return;
1029
0
    }
1030
0
    UErrorCode ec = U_ZERO_ERROR;
1031
0
    strings->sortedInsert(t, compareUnicodeString, ec);
1032
0
    if (U_FAILURE(ec)) {
1033
0
        setToBogus();
1034
0
        delete t;
1035
0
    }
1036
0
}
1037
1038
/**
1039
 * @return a code point IF the string consists of a single one.
1040
 * otherwise returns -1.
1041
 * @param string to test
1042
 */
1043
0
int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
1044
    //if (s.length() < 1) {
1045
    //    throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1046
    //}
1047
0
    if (s.length() > 2) return -1;
1048
0
    if (s.length() == 1) return s.charAt(0);
1049
1050
    // at this point, len = 2
1051
0
    UChar32 cp = s.char32At(0);
1052
0
    if (cp > 0xFFFF) { // is surrogate pair
1053
0
        return cp;
1054
0
    }
1055
0
    return -1;
1056
0
}
1057
1058
/**
1059
 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1060
 * If this set already any particular character, it has no effect on that character.
1061
 * @param the source string
1062
 * @return the modified set, for chaining
1063
 */
1064
0
UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1065
0
    UChar32 cp;
1066
0
    for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1067
0
        cp = s.char32At(i);
1068
0
        add(cp);
1069
0
    }
1070
0
    return *this;
1071
0
}
1072
1073
/**
1074
 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1075
 * If this set already any particular character, it has no effect on that character.
1076
 * @param the source string
1077
 * @return the modified set, for chaining
1078
 */
1079
0
UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1080
0
    UnicodeSet set;
1081
0
    set.addAll(s);
1082
0
    retainAll(set);
1083
0
    return *this;
1084
0
}
1085
1086
/**
1087
 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1088
 * If this set already any particular character, it has no effect on that character.
1089
 * @param the source string
1090
 * @return the modified set, for chaining
1091
 */
1092
0
UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1093
0
    UnicodeSet set;
1094
0
    set.addAll(s);
1095
0
    complementAll(set);
1096
0
    return *this;
1097
0
}
1098
1099
/**
1100
 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1101
 * If this set already any particular character, it has no effect on that character.
1102
 * @param the source string
1103
 * @return the modified set, for chaining
1104
 */
1105
0
UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1106
0
    UnicodeSet set;
1107
0
    set.addAll(s);
1108
0
    removeAll(set);
1109
0
    return *this;
1110
0
}
1111
1112
0
UnicodeSet& UnicodeSet::removeAllStrings() {
1113
0
    strings->removeAllElements();
1114
0
    return *this;
1115
0
}
1116
1117
1118
/**
1119
 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1120
 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1121
 * @param the source string
1122
 * @return a newly created set containing the given string
1123
 */
1124
0
UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1125
0
    UnicodeSet *set = new UnicodeSet();
1126
0
    if (set != NULL) { // Check for memory allocation error.
1127
0
        set->add(s);
1128
0
    }
1129
0
    return set;
1130
0
}
1131
1132
1133
/**
1134
 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1135
 * @param the source string
1136
 * @return a newly created set containing the given characters
1137
 */
1138
0
UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1139
0
    UnicodeSet *set = new UnicodeSet();
1140
0
    if (set != NULL) { // Check for memory allocation error.
1141
0
        set->addAll(s);
1142
0
    }
1143
0
    return set;
1144
0
}
1145
1146
/**
1147
 * Retain only the elements in this set that are contained in the
1148
 * specified range.  If <code>end > start</code> then an empty range is
1149
 * retained, leaving the set empty.
1150
 *
1151
 * @param start first character, inclusive, of range to be retained
1152
 * to this set.
1153
 * @param end last character, inclusive, of range to be retained
1154
 * to this set.
1155
 */
1156
0
UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1157
0
    if (pinCodePoint(start) <= pinCodePoint(end)) {
1158
0
        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1159
0
        retain(range, 2, 0);
1160
0
    } else {
1161
0
        clear();
1162
0
    }
1163
0
    return *this;
1164
0
}
1165
1166
0
UnicodeSet& UnicodeSet::retain(UChar32 c) {
1167
0
    return retain(c, c);
1168
0
}
1169
1170
/**
1171
 * Removes the specified range from this set if it is present.
1172
 * The set will not contain the specified range once the call
1173
 * returns.  If <code>end > start</code> then an empty range is
1174
 * removed, leaving the set unchanged.
1175
 *
1176
 * @param start first character, inclusive, of range to be removed
1177
 * from this set.
1178
 * @param end last character, inclusive, of range to be removed
1179
 * from this set.
1180
 */
1181
0
UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1182
0
    if (pinCodePoint(start) <= pinCodePoint(end)) {
1183
0
        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1184
0
        retain(range, 2, 2);
1185
0
    }
1186
0
    return *this;
1187
0
}
1188
1189
/**
1190
 * Removes the specified character from this set if it is present.
1191
 * The set will not contain the specified range once the call
1192
 * returns.
1193
 */
1194
0
UnicodeSet& UnicodeSet::remove(UChar32 c) {
1195
0
    return remove(c, c);
1196
0
}
1197
1198
/**
1199
 * Removes the specified string from this set if it is present.
1200
 * The set will not contain the specified character once the call
1201
 * returns.
1202
 * @param the source string
1203
 * @return the modified set, for chaining
1204
 */
1205
0
UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1206
0
    if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1207
0
    int32_t cp = getSingleCP(s);
1208
0
    if (cp < 0) {
1209
0
        strings->removeElement((void*) &s);
1210
0
        releasePattern();
1211
0
    } else {
1212
0
        remove((UChar32)cp, (UChar32)cp);
1213
0
    }
1214
0
    return *this;
1215
0
}
1216
1217
/**
1218
 * Complements the specified range in this set.  Any character in
1219
 * the range will be removed if it is in this set, or will be
1220
 * added if it is not in this set.  If <code>end > start</code>
1221
 * then an empty range is xor'ed, leaving the set unchanged.
1222
 *
1223
 * @param start first character, inclusive, of range to be removed
1224
 * from this set.
1225
 * @param end last character, inclusive, of range to be removed
1226
 * from this set.
1227
 */
1228
0
UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1229
0
    if (isFrozen() || isBogus()) {
1230
0
        return *this;
1231
0
    }
1232
0
    if (pinCodePoint(start) <= pinCodePoint(end)) {
1233
0
        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1234
0
        exclusiveOr(range, 2, 0);
1235
0
    }
1236
0
    releasePattern();
1237
0
    return *this;
1238
0
}
1239
1240
0
UnicodeSet& UnicodeSet::complement(UChar32 c) {
1241
0
    return complement(c, c);
1242
0
}
1243
1244
/**
1245
 * This is equivalent to
1246
 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1247
 */
1248
0
UnicodeSet& UnicodeSet::complement(void) {
1249
0
    if (isFrozen() || isBogus()) {
1250
0
        return *this;
1251
0
    }
1252
0
    UErrorCode status = U_ZERO_ERROR;
1253
0
    if (list[0] == UNICODESET_LOW) {
1254
0
        ensureBufferCapacity(len-1, status);
1255
0
        if (U_FAILURE(status)) {
1256
0
            return *this;
1257
0
        }
1258
0
        uprv_memcpy(buffer, list + 1, (size_t)(len-1)*sizeof(UChar32));
1259
0
        --len;
1260
0
    } else {
1261
0
        ensureBufferCapacity(len+1, status);
1262
0
        if (U_FAILURE(status)) {
1263
0
            return *this;
1264
0
        }
1265
0
        uprv_memcpy(buffer + 1, list, (size_t)len*sizeof(UChar32));
1266
0
        buffer[0] = UNICODESET_LOW;
1267
0
        ++len;
1268
0
    }
1269
0
    swapBuffers();
1270
0
    releasePattern();
1271
0
    return *this;
1272
0
}
1273
1274
/**
1275
 * Complement the specified string in this set.
1276
 * The set will not contain the specified string once the call
1277
 * returns.
1278
 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1279
 * @param s the string to complement
1280
 * @return this object, for chaining
1281
 */
1282
0
UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1283
0
    if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1284
0
    int32_t cp = getSingleCP(s);
1285
0
    if (cp < 0) {
1286
0
        if (strings->contains((void*) &s)) {
1287
0
            strings->removeElement((void*) &s);
1288
0
        } else {
1289
0
            _add(s);
1290
0
        }
1291
0
        releasePattern();
1292
0
    } else {
1293
0
        complement((UChar32)cp, (UChar32)cp);
1294
0
    }
1295
0
    return *this;
1296
0
}
1297
1298
/**
1299
 * Adds all of the elements in the specified set to this set if
1300
 * they're not already present.  This operation effectively
1301
 * modifies this set so that its value is the <i>union</i> of the two
1302
 * sets.  The behavior of this operation is unspecified if the specified
1303
 * collection is modified while the operation is in progress.
1304
 *
1305
 * @param c set whose elements are to be added to this set.
1306
 * @see #add(char, char)
1307
 */
1308
0
UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1309
0
    if ( c.len>0 && c.list!=NULL ) {
1310
0
        add(c.list, c.len, 0);
1311
0
    }
1312
1313
    // Add strings in order
1314
0
    if ( c.strings!=NULL ) {
1315
0
        for (int32_t i=0; i<c.strings->size(); ++i) {
1316
0
            const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1317
0
            if (!strings->contains((void*) s)) {
1318
0
                _add(*s);
1319
0
            }
1320
0
        }
1321
0
    }
1322
0
    return *this;
1323
0
}
1324
1325
/**
1326
 * Retains only the elements in this set that are contained in the
1327
 * specified set.  In other words, removes from this set all of
1328
 * its elements that are not contained in the specified set.  This
1329
 * operation effectively modifies this set so that its value is
1330
 * the <i>intersection</i> of the two sets.
1331
 *
1332
 * @param c set that defines which elements this set will retain.
1333
 */
1334
0
UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1335
0
    if (isFrozen() || isBogus()) {
1336
0
        return *this;
1337
0
    }
1338
0
    retain(c.list, c.len, 0);
1339
0
    strings->retainAll(*c.strings);
1340
0
    return *this;
1341
0
}
1342
1343
/**
1344
 * Removes from this set all of its elements that are contained in the
1345
 * specified set.  This operation effectively modifies this
1346
 * set so that its value is the <i>asymmetric set difference</i> of
1347
 * the two sets.
1348
 *
1349
 * @param c set that defines which elements will be removed from
1350
 *          this set.
1351
 */
1352
0
UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1353
0
    if (isFrozen() || isBogus()) {
1354
0
        return *this;
1355
0
    }
1356
0
    retain(c.list, c.len, 2);
1357
0
    strings->removeAll(*c.strings);
1358
0
    return *this;
1359
0
}
1360
1361
/**
1362
 * Complements in this set all elements contained in the specified
1363
 * set.  Any character in the other set will be removed if it is
1364
 * in this set, or will be added if it is not in this set.
1365
 *
1366
 * @param c set that defines which elements will be xor'ed from
1367
 *          this set.
1368
 */
1369
0
UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1370
0
    if (isFrozen() || isBogus()) {
1371
0
        return *this;
1372
0
    }
1373
0
    exclusiveOr(c.list, c.len, 0);
1374
1375
0
    for (int32_t i=0; i<c.strings->size(); ++i) {
1376
0
        void* e = c.strings->elementAt(i);
1377
0
        if (!strings->removeElement(e)) {
1378
0
            _add(*(const UnicodeString*)e);
1379
0
        }
1380
0
    }
1381
0
    return *this;
1382
0
}
1383
1384
/**
1385
 * Removes all of the elements from this set.  This set will be
1386
 * empty after this call returns.
1387
 */
1388
0
UnicodeSet& UnicodeSet::clear(void) {
1389
0
    if (isFrozen()) {
1390
0
        return *this;
1391
0
    }
1392
0
    if (list != NULL) {
1393
0
        list[0] = UNICODESET_HIGH;
1394
0
    }
1395
0
    len = 1;
1396
0
    releasePattern();
1397
0
    if (strings != NULL) {
1398
0
        strings->removeAllElements();
1399
0
    }
1400
0
    if (list != NULL && strings != NULL) {
1401
        // Remove bogus
1402
0
        fFlags = 0;
1403
0
    }
1404
0
    return *this;
1405
0
}
1406
1407
/**
1408
 * Iteration method that returns the number of ranges contained in
1409
 * this set.
1410
 * @see #getRangeStart
1411
 * @see #getRangeEnd
1412
 */
1413
0
int32_t UnicodeSet::getRangeCount() const {
1414
0
    return len/2;
1415
0
}
1416
1417
/**
1418
 * Iteration method that returns the first character in the
1419
 * specified range of this set.
1420
 * @see #getRangeCount
1421
 * @see #getRangeEnd
1422
 */
1423
0
UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1424
0
    return list[index*2];
1425
0
}
1426
1427
/**
1428
 * Iteration method that returns the last character in the
1429
 * specified range of this set.
1430
 * @see #getRangeStart
1431
 * @see #getRangeEnd
1432
 */
1433
0
UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1434
0
    return list[index*2 + 1] - 1;
1435
0
}
1436
1437
0
int32_t UnicodeSet::getStringCount() const {
1438
0
    return strings->size();
1439
0
}
1440
1441
0
const UnicodeString* UnicodeSet::getString(int32_t index) const {
1442
0
    return (const UnicodeString*) strings->elementAt(index);
1443
0
}
1444
1445
/**
1446
 * Reallocate this objects internal structures to take up the least
1447
 * possible space, without changing this object's value.
1448
 */
1449
0
UnicodeSet& UnicodeSet::compact() {
1450
0
    if (isFrozen() || isBogus()) {
1451
0
        return *this;
1452
0
    }
1453
    // Delete buffer first to defragment memory less.
1454
0
    if (buffer != NULL) {
1455
0
        uprv_free(buffer);
1456
0
        buffer = NULL;
1457
0
    }
1458
0
    if (len < capacity) {
1459
        // Make the capacity equal to len or 1.
1460
        // We don't want to realloc of 0 size.
1461
0
        int32_t newCapacity = len + (len == 0);
1462
0
        UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
1463
0
        if (temp) {
1464
0
            list = temp;
1465
0
            capacity = newCapacity;
1466
0
        }
1467
        // else what the heck happened?! We allocated less memory!
1468
        // Oh well. We'll keep our original array.
1469
0
    }
1470
0
    return *this;
1471
0
}
1472
1473
#ifdef DEBUG_SERIALIZE
1474
#include <stdio.h>
1475
#endif
1476
1477
/**
1478
 * Deserialize constructor.
1479
 */
1480
UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, UErrorCode &ec)
1481
0
  : len(1), capacity(1+START_EXTRA), list(0), bmpSet(0), buffer(0),
1482
0
    bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
1483
0
    fFlags(0) {
1484
1485
0
  if(U_FAILURE(ec)) {
1486
0
    setToBogus();
1487
0
    return;
1488
0
  }
1489
1490
0
  if( (serialization != kSerialized)
1491
0
      || (data==NULL)
1492
0
      || (dataLen < 1)) {
1493
0
    ec = U_ILLEGAL_ARGUMENT_ERROR;
1494
0
    setToBogus();
1495
0
    return;
1496
0
  }
1497
1498
0
  allocateStrings(ec);
1499
0
  if (U_FAILURE(ec)) {
1500
0
    setToBogus();
1501
0
    return;
1502
0
  }
1503
1504
  // bmp?
1505
0
  int32_t headerSize = ((data[0]&0x8000)) ?2:1;
1506
0
  int32_t bmpLength = (headerSize==1)?data[0]:data[1];
1507
1508
0
  len = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
1509
#ifdef DEBUG_SERIALIZE
1510
  printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,len, data[0],data[1],data[2],data[3]);
1511
#endif
1512
0
  capacity = len+1;
1513
0
  list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
1514
0
  if(!list || U_FAILURE(ec)) {
1515
0
    setToBogus();
1516
0
    return;
1517
0
  }
1518
  // copy bmp
1519
0
  int32_t i;
1520
0
  for(i = 0; i< bmpLength;i++) {
1521
0
    list[i] = data[i+headerSize];
1522
#ifdef DEBUG_SERIALIZE
1523
    printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
1524
#endif
1525
0
  }
1526
  // copy smp
1527
0
  for(i=bmpLength;i<len;i++) {
1528
0
    list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
1529
0
              ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
1530
#ifdef DEBUG_SERIALIZE
1531
    printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
1532
#endif
1533
0
  }
1534
  // terminator
1535
0
  list[len++]=UNICODESET_HIGH;
1536
0
}
1537
1538
1539
0
int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1540
0
    int32_t bmpLength, length, destLength;
1541
1542
0
    if (U_FAILURE(ec)) {
1543
0
        return 0;
1544
0
    }
1545
1546
0
    if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1547
0
        ec=U_ILLEGAL_ARGUMENT_ERROR;
1548
0
        return 0;
1549
0
    }
1550
1551
    /* count necessary 16-bit units */
1552
0
    length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1553
    // assert(length>=0);
1554
0
    if (length==0) {
1555
        /* empty set */
1556
0
        if (destCapacity>0) {
1557
0
            *dest=0;
1558
0
        } else {
1559
0
            ec=U_BUFFER_OVERFLOW_ERROR;
1560
0
        }
1561
0
        return 1;
1562
0
    }
1563
    /* now length>0 */
1564
1565
0
    if (this->list[length-1]<=0xffff) {
1566
        /* all BMP */
1567
0
        bmpLength=length;
1568
0
    } else if (this->list[0]>=0x10000) {
1569
        /* all supplementary */
1570
0
        bmpLength=0;
1571
0
        length*=2;
1572
0
    } else {
1573
        /* some BMP, some supplementary */
1574
0
        for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1575
0
        length=bmpLength+2*(length-bmpLength);
1576
0
    }
1577
#ifdef DEBUG_SERIALIZE
1578
    printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
1579
#endif
1580
    /* length: number of 16-bit array units */
1581
0
    if (length>0x7fff) {
1582
        /* there are only 15 bits for the length in the first serialized word */
1583
0
        ec=U_INDEX_OUTOFBOUNDS_ERROR;
1584
0
        return 0;
1585
0
    }
1586
1587
    /*
1588
     * total serialized length:
1589
     * number of 16-bit array units (length) +
1590
     * 1 length unit (always) +
1591
     * 1 bmpLength unit (if there are supplementary values)
1592
     */
1593
0
    destLength=length+((length>bmpLength)?2:1);
1594
0
    if (destLength<=destCapacity) {
1595
0
        const UChar32 *p;
1596
0
        int32_t i;
1597
1598
#ifdef DEBUG_SERIALIZE
1599
        printf("writeHdr\n");
1600
#endif
1601
0
        *dest=(uint16_t)length;
1602
0
        if (length>bmpLength) {
1603
0
            *dest|=0x8000;
1604
0
            *++dest=(uint16_t)bmpLength;
1605
0
        }
1606
0
        ++dest;
1607
1608
        /* write the BMP part of the array */
1609
0
        p=this->list;
1610
0
        for (i=0; i<bmpLength; ++i) {
1611
#ifdef DEBUG_SERIALIZE
1612
          printf("writebmp: %x\n", (int)*p);
1613
#endif
1614
0
            *dest++=(uint16_t)*p++;
1615
0
        }
1616
1617
        /* write the supplementary part of the array */
1618
0
        for (; i<length; i+=2) {
1619
#ifdef DEBUG_SERIALIZE
1620
          printf("write32: %x\n", (int)*p);
1621
#endif
1622
0
            *dest++=(uint16_t)(*p>>16);
1623
0
            *dest++=(uint16_t)*p++;
1624
0
        }
1625
0
    } else {
1626
0
        ec=U_BUFFER_OVERFLOW_ERROR;
1627
0
    }
1628
0
    return destLength;
1629
0
}
1630
1631
//----------------------------------------------------------------
1632
// Implementation: Utility methods
1633
//----------------------------------------------------------------
1634
1635
/**
1636
 * Allocate our strings vector and return TRUE if successful.
1637
 */
1638
0
UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1639
0
    if (U_FAILURE(status)) {
1640
0
        return FALSE;
1641
0
    }
1642
0
    strings = new UVector(uprv_deleteUObject,
1643
0
                          uhash_compareUnicodeString, 1, status);
1644
0
    if (strings == NULL) { // Check for memory allocation error.
1645
0
        status = U_MEMORY_ALLOCATION_ERROR;
1646
0
        return FALSE;
1647
0
    }
1648
0
    if (U_FAILURE(status)) {
1649
0
        delete strings;
1650
0
        strings = NULL;
1651
0
        return FALSE;
1652
0
    } 
1653
0
    return TRUE;
1654
0
}
1655
1656
0
void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
1657
0
    if (newLen <= capacity)
1658
0
        return;
1659
0
    UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA));
1660
0
    if (temp == NULL) {
1661
0
        ec = U_MEMORY_ALLOCATION_ERROR;
1662
0
        setToBogus();
1663
0
        return;
1664
0
    }
1665
0
    list = temp;
1666
0
    capacity = newLen + GROW_EXTRA;
1667
    // else we keep the original contents on the memory failure.
1668
0
}
1669
1670
0
void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
1671
0
    if (buffer != NULL && newLen <= bufferCapacity)
1672
0
        return;
1673
0
    UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA));
1674
0
    if (temp == NULL) {
1675
0
        ec = U_MEMORY_ALLOCATION_ERROR;
1676
0
        setToBogus();
1677
0
        return;
1678
0
    }
1679
0
    buffer = temp;
1680
0
    bufferCapacity = newLen + GROW_EXTRA;
1681
    // else we keep the original contents on the memory failure.
1682
0
}
1683
1684
/**
1685
 * Swap list and buffer.
1686
 */
1687
0
void UnicodeSet::swapBuffers(void) {
1688
    // swap list and buffer
1689
0
    UChar32* temp = list;
1690
0
    list = buffer;
1691
0
    buffer = temp;
1692
1693
0
    int32_t c = capacity;
1694
0
    capacity = bufferCapacity;
1695
0
    bufferCapacity = c;
1696
0
}
1697
1698
0
void UnicodeSet::setToBogus() {
1699
0
    clear(); // Remove everything in the set.
1700
0
    fFlags = kIsBogus;
1701
0
}
1702
1703
//----------------------------------------------------------------
1704
// Implementation: Fundamental operators
1705
//----------------------------------------------------------------
1706
1707
0
static inline UChar32 max(UChar32 a, UChar32 b) {
1708
0
    return (a > b) ? a : b;
1709
0
}
1710
1711
// polarity = 0, 3 is normal: x xor y
1712
// polarity = 1, 2: x xor ~y == x === y
1713
1714
0
void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1715
0
    if (isFrozen() || isBogus()) {
1716
0
        return;
1717
0
    }
1718
0
    UErrorCode status = U_ZERO_ERROR;
1719
0
    ensureBufferCapacity(len + otherLen, status);
1720
0
    if (U_FAILURE(status)) {
1721
0
        return;
1722
0
    }
1723
1724
0
    int32_t i = 0, j = 0, k = 0;
1725
0
    UChar32 a = list[i++];
1726
0
    UChar32 b;
1727
0
    if (polarity == 1 || polarity == 2) {
1728
0
        b = UNICODESET_LOW;
1729
0
        if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1730
0
            ++j;
1731
0
            b = other[j];
1732
0
        }
1733
0
    } else {
1734
0
        b = other[j++];
1735
0
    }
1736
    // simplest of all the routines
1737
    // sort the values, discarding identicals!
1738
0
    for (;;) {
1739
0
        if (a < b) {
1740
0
            buffer[k++] = a;
1741
0
            a = list[i++];
1742
0
        } else if (b < a) {
1743
0
            buffer[k++] = b;
1744
0
            b = other[j++];
1745
0
        } else if (a != UNICODESET_HIGH) { // at this point, a == b
1746
            // discard both values!
1747
0
            a = list[i++];
1748
0
            b = other[j++];
1749
0
        } else { // DONE!
1750
0
            buffer[k++] = UNICODESET_HIGH;
1751
0
            len = k;
1752
0
            break;
1753
0
        }
1754
0
    }
1755
0
    swapBuffers();
1756
0
    releasePattern();
1757
0
}
1758
1759
// polarity = 0 is normal: x union y
1760
// polarity = 2: x union ~y
1761
// polarity = 1: ~x union y
1762
// polarity = 3: ~x union ~y
1763
1764
0
void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1765
0
    if (isFrozen() || isBogus() || other==NULL) {
1766
0
        return;
1767
0
    }
1768
0
    UErrorCode status = U_ZERO_ERROR;
1769
0
    ensureBufferCapacity(len + otherLen, status);
1770
0
    if (U_FAILURE(status)) {
1771
0
        return;
1772
0
    }
1773
1774
0
    int32_t i = 0, j = 0, k = 0;
1775
0
    UChar32 a = list[i++];
1776
0
    UChar32 b = other[j++];
1777
    // change from xor is that we have to check overlapping pairs
1778
    // polarity bit 1 means a is second, bit 2 means b is.
1779
0
    for (;;) {
1780
0
        switch (polarity) {
1781
0
          case 0: // both first; take lower if unequal
1782
0
            if (a < b) { // take a
1783
                // Back up over overlapping ranges in buffer[]
1784
0
                if (k > 0 && a <= buffer[k-1]) {
1785
                    // Pick latter end value in buffer[] vs. list[]
1786
0
                    a = max(list[i], buffer[--k]);
1787
0
                } else {
1788
                    // No overlap
1789
0
                    buffer[k++] = a;
1790
0
                    a = list[i];
1791
0
                }
1792
0
                i++; // Common if/else code factored out
1793
0
                polarity ^= 1;
1794
0
            } else if (b < a) { // take b
1795
0
                if (k > 0 && b <= buffer[k-1]) {
1796
0
                    b = max(other[j], buffer[--k]);
1797
0
                } else {
1798
0
                    buffer[k++] = b;
1799
0
                    b = other[j];
1800
0
                }
1801
0
                j++;
1802
0
                polarity ^= 2;
1803
0
            } else { // a == b, take a, drop b
1804
0
                if (a == UNICODESET_HIGH) goto loop_end;
1805
                // This is symmetrical; it doesn't matter if
1806
                // we backtrack with a or b. - liu
1807
0
                if (k > 0 && a <= buffer[k-1]) {
1808
0
                    a = max(list[i], buffer[--k]);
1809
0
                } else {
1810
                    // No overlap
1811
0
                    buffer[k++] = a;
1812
0
                    a = list[i];
1813
0
                }
1814
0
                i++;
1815
0
                polarity ^= 1;
1816
0
                b = other[j++];
1817
0
                polarity ^= 2;
1818
0
            }
1819
0
            break;
1820
0
          case 3: // both second; take higher if unequal, and drop other
1821
0
            if (b <= a) { // take a
1822
0
                if (a == UNICODESET_HIGH) goto loop_end;
1823
0
                buffer[k++] = a;
1824
0
            } else { // take b
1825
0
                if (b == UNICODESET_HIGH) goto loop_end;
1826
0
                buffer[k++] = b;
1827
0
            }
1828
0
            a = list[i++];
1829
0
            polarity ^= 1;   // factored common code
1830
0
            b = other[j++];
1831
0
            polarity ^= 2;
1832
0
            break;
1833
0
          case 1: // a second, b first; if b < a, overlap
1834
0
            if (a < b) { // no overlap, take a
1835
0
                buffer[k++] = a; a = list[i++]; polarity ^= 1;
1836
0
            } else if (b < a) { // OVERLAP, drop b
1837
0
                b = other[j++];
1838
0
                polarity ^= 2;
1839
0
            } else { // a == b, drop both!
1840
0
                if (a == UNICODESET_HIGH) goto loop_end;
1841
0
                a = list[i++];
1842
0
                polarity ^= 1;
1843
0
                b = other[j++];
1844
0
                polarity ^= 2;
1845
0
            }
1846
0
            break;
1847
0
          case 2: // a first, b second; if a < b, overlap
1848
0
            if (b < a) { // no overlap, take b
1849
0
                buffer[k++] = b;
1850
0
                b = other[j++];
1851
0
                polarity ^= 2;
1852
0
            } else  if (a < b) { // OVERLAP, drop a
1853
0
                a = list[i++];
1854
0
                polarity ^= 1;
1855
0
            } else { // a == b, drop both!
1856
0
                if (a == UNICODESET_HIGH) goto loop_end;
1857
0
                a = list[i++];
1858
0
                polarity ^= 1;
1859
0
                b = other[j++];
1860
0
                polarity ^= 2;
1861
0
            }
1862
0
            break;
1863
0
        }
1864
0
    }
1865
0
 loop_end:
1866
0
    buffer[k++] = UNICODESET_HIGH;    // terminate
1867
0
    len = k;
1868
0
    swapBuffers();
1869
0
    releasePattern();
1870
0
}
1871
1872
// polarity = 0 is normal: x intersect y
1873
// polarity = 2: x intersect ~y == set-minus
1874
// polarity = 1: ~x intersect y
1875
// polarity = 3: ~x intersect ~y
1876
1877
0
void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1878
0
    if (isFrozen() || isBogus()) {
1879
0
        return;
1880
0
    }
1881
0
    UErrorCode status = U_ZERO_ERROR;
1882
0
    ensureBufferCapacity(len + otherLen, status);
1883
0
    if (U_FAILURE(status)) {
1884
0
        return;
1885
0
    }
1886
1887
0
    int32_t i = 0, j = 0, k = 0;
1888
0
    UChar32 a = list[i++];
1889
0
    UChar32 b = other[j++];
1890
    // change from xor is that we have to check overlapping pairs
1891
    // polarity bit 1 means a is second, bit 2 means b is.
1892
0
    for (;;) {
1893
0
        switch (polarity) {
1894
0
          case 0: // both first; drop the smaller
1895
0
            if (a < b) { // drop a
1896
0
                a = list[i++];
1897
0
                polarity ^= 1;
1898
0
            } else if (b < a) { // drop b
1899
0
                b = other[j++];
1900
0
                polarity ^= 2;
1901
0
            } else { // a == b, take one, drop other
1902
0
                if (a == UNICODESET_HIGH) goto loop_end;
1903
0
                buffer[k++] = a;
1904
0
                a = list[i++];
1905
0
                polarity ^= 1;
1906
0
                b = other[j++];
1907
0
                polarity ^= 2;
1908
0
            }
1909
0
            break;
1910
0
          case 3: // both second; take lower if unequal
1911
0
            if (a < b) { // take a
1912
0
                buffer[k++] = a;
1913
0
                a = list[i++];
1914
0
                polarity ^= 1;
1915
0
            } else if (b < a) { // take b
1916
0
                buffer[k++] = b;
1917
0
                b = other[j++];
1918
0
                polarity ^= 2;
1919
0
            } else { // a == b, take one, drop other
1920
0
                if (a == UNICODESET_HIGH) goto loop_end;
1921
0
                buffer[k++] = a;
1922
0
                a = list[i++];
1923
0
                polarity ^= 1;
1924
0
                b = other[j++];
1925
0
                polarity ^= 2;
1926
0
            }
1927
0
            break;
1928
0
          case 1: // a second, b first;
1929
0
            if (a < b) { // NO OVERLAP, drop a
1930
0
                a = list[i++];
1931
0
                polarity ^= 1;
1932
0
            } else if (b < a) { // OVERLAP, take b
1933
0
                buffer[k++] = b;
1934
0
                b = other[j++];
1935
0
                polarity ^= 2;
1936
0
            } else { // a == b, drop both!
1937
0
                if (a == UNICODESET_HIGH) goto loop_end;
1938
0
                a = list[i++];
1939
0
                polarity ^= 1;
1940
0
                b = other[j++];
1941
0
                polarity ^= 2;
1942
0
            }
1943
0
            break;
1944
0
          case 2: // a first, b second; if a < b, overlap
1945
0
            if (b < a) { // no overlap, drop b
1946
0
                b = other[j++];
1947
0
                polarity ^= 2;
1948
0
            } else  if (a < b) { // OVERLAP, take a
1949
0
                buffer[k++] = a;
1950
0
                a = list[i++];
1951
0
                polarity ^= 1;
1952
0
            } else { // a == b, drop both!
1953
0
                if (a == UNICODESET_HIGH) goto loop_end;
1954
0
                a = list[i++];
1955
0
                polarity ^= 1;
1956
0
                b = other[j++];
1957
0
                polarity ^= 2;
1958
0
            }
1959
0
            break;
1960
0
        }
1961
0
    }
1962
0
 loop_end:
1963
0
    buffer[k++] = UNICODESET_HIGH;    // terminate
1964
0
    len = k;
1965
0
    swapBuffers();
1966
0
    releasePattern();
1967
0
}
1968
1969
/**
1970
 * Append the <code>toPattern()</code> representation of a
1971
 * string to the given <code>StringBuffer</code>.
1972
 */
1973
void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1974
0
escapeUnprintable) {
1975
0
    UChar32 cp;
1976
0
    for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1977
0
        _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1978
0
    }
1979
0
}
1980
1981
/**
1982
 * Append the <code>toPattern()</code> representation of a
1983
 * character to the given <code>StringBuffer</code>.
1984
 */
1985
void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1986
0
escapeUnprintable) {
1987
0
    if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1988
        // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1989
        // unprintable
1990
0
        if (ICU_Utility::escapeUnprintable(buf, c)) {
1991
0
            return;
1992
0
        }
1993
0
    }
1994
    // Okay to let ':' pass through
1995
0
    switch (c) {
1996
0
    case SET_OPEN:
1997
0
    case SET_CLOSE:
1998
0
    case HYPHEN:
1999
0
    case COMPLEMENT:
2000
0
    case INTERSECTION:
2001
0
    case BACKSLASH:
2002
0
    case OPEN_BRACE:
2003
0
    case CLOSE_BRACE:
2004
0
    case COLON:
2005
0
    case SymbolTable::SYMBOL_REF:
2006
0
        buf.append(BACKSLASH);
2007
0
        break;
2008
0
    default:
2009
        // Escape whitespace
2010
0
        if (PatternProps::isWhiteSpace(c)) {
2011
0
            buf.append(BACKSLASH);
2012
0
        }
2013
0
        break;
2014
0
    }
2015
0
    buf.append(c);
2016
0
}
2017
2018
/**
2019
 * Append a string representation of this set to result.  This will be
2020
 * a cleaned version of the string passed to applyPattern(), if there
2021
 * is one.  Otherwise it will be generated.
2022
 */
2023
UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
2024
                                      UBool escapeUnprintable) const
2025
0
{
2026
0
    if (pat != NULL) {
2027
0
        int32_t i;
2028
0
        int32_t backslashCount = 0;
2029
0
        for (i=0; i<patLen; ) {
2030
0
            UChar32 c;
2031
0
            U16_NEXT(pat, i, patLen, c);
2032
0
            if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
2033
                // If the unprintable character is preceded by an odd
2034
                // number of backslashes, then it has been escaped.
2035
                // Before unescaping it, we delete the final
2036
                // backslash.
2037
0
                if ((backslashCount % 2) == 1) {
2038
0
                    result.truncate(result.length() - 1);
2039
0
                }
2040
0
                ICU_Utility::escapeUnprintable(result, c);
2041
0
                backslashCount = 0;
2042
0
            } else {
2043
0
                result.append(c);
2044
0
                if (c == BACKSLASH) {
2045
0
                    ++backslashCount;
2046
0
                } else {
2047
0
                    backslashCount = 0;
2048
0
                }
2049
0
            }
2050
0
        }
2051
0
        return result;
2052
0
    }
2053
2054
0
    return _generatePattern(result, escapeUnprintable);
2055
0
}
2056
2057
/**
2058
 * Returns a string representation of this set.  If the result of
2059
 * calling this function is passed to a UnicodeSet constructor, it
2060
 * will produce another set that is equal to this one.
2061
 */
2062
UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
2063
                                     UBool escapeUnprintable) const
2064
0
{
2065
0
    result.truncate(0);
2066
0
    return _toPattern(result, escapeUnprintable);
2067
0
}
2068
2069
/**
2070
 * Generate and append a string representation of this set to result.
2071
 * This does not use this.pat, the cleaned up copy of the string
2072
 * passed to applyPattern().
2073
 */
2074
UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
2075
                                            UBool escapeUnprintable) const
2076
0
{
2077
0
    result.append(SET_OPEN);
2078
2079
//  // Check against the predefined categories.  We implicitly build
2080
//  // up ALL category sets the first time toPattern() is called.
2081
//  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2082
//      if (*this == getCategorySet(cat)) {
2083
//          result.append(COLON);
2084
//          result.append(CATEGORY_NAMES, cat*2, 2);
2085
//          return result.append(CATEGORY_CLOSE);
2086
//      }
2087
//  }
2088
2089
0
    int32_t count = getRangeCount();
2090
2091
    // If the set contains at least 2 intervals and includes both
2092
    // MIN_VALUE and MAX_VALUE, then the inverse representation will
2093
    // be more economical.
2094
0
    if (count > 1 &&
2095
0
        getRangeStart(0) == MIN_VALUE &&
2096
0
        getRangeEnd(count-1) == MAX_VALUE) {
2097
2098
        // Emit the inverse
2099
0
        result.append(COMPLEMENT);
2100
2101
0
        for (int32_t i = 1; i < count; ++i) {
2102
0
            UChar32 start = getRangeEnd(i-1)+1;
2103
0
            UChar32 end = getRangeStart(i)-1;
2104
0
            _appendToPat(result, start, escapeUnprintable);
2105
0
            if (start != end) {
2106
0
                if ((start+1) != end) {
2107
0
                    result.append(HYPHEN);
2108
0
                }
2109
0
                _appendToPat(result, end, escapeUnprintable);
2110
0
            }
2111
0
        }
2112
0
    }
2113
2114
    // Default; emit the ranges as pairs
2115
0
    else {
2116
0
        for (int32_t i = 0; i < count; ++i) {
2117
0
            UChar32 start = getRangeStart(i);
2118
0
            UChar32 end = getRangeEnd(i);
2119
0
            _appendToPat(result, start, escapeUnprintable);
2120
0
            if (start != end) {
2121
0
                if ((start+1) != end) {
2122
0
                    result.append(HYPHEN);
2123
0
                }
2124
0
                _appendToPat(result, end, escapeUnprintable);
2125
0
            }
2126
0
        }
2127
0
    }
2128
2129
0
    for (int32_t i = 0; i<strings->size(); ++i) {
2130
0
        result.append(OPEN_BRACE);
2131
0
        _appendToPat(result,
2132
0
                     *(const UnicodeString*) strings->elementAt(i),
2133
0
                     escapeUnprintable);
2134
0
        result.append(CLOSE_BRACE);
2135
0
    }
2136
0
    return result.append(SET_CLOSE);
2137
0
}
2138
2139
/**
2140
* Release existing cached pattern
2141
*/
2142
0
void UnicodeSet::releasePattern() {
2143
0
    if (pat) {
2144
0
        uprv_free(pat);
2145
0
        pat = NULL;
2146
0
        patLen = 0;
2147
0
    }
2148
0
}
2149
2150
/**
2151
* Set the new pattern to cache.
2152
*/
2153
0
void UnicodeSet::setPattern(const UnicodeString& newPat) {
2154
0
    releasePattern();
2155
0
    int32_t newPatLen = newPat.length();
2156
0
    pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
2157
0
    if (pat) {
2158
0
        patLen = newPatLen;
2159
0
        newPat.extractBetween(0, patLen, pat);
2160
0
        pat[patLen] = 0;
2161
0
    }
2162
    // else we don't care if malloc failed. This was just a nice cache.
2163
    // We can regenerate an equivalent pattern later when requested.
2164
0
}
2165
2166
0
UnicodeFunctor *UnicodeSet::freeze() {
2167
0
    if(!isFrozen() && !isBogus()) {
2168
        // Do most of what compact() does before freezing because
2169
        // compact() will not work when the set is frozen.
2170
        // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
2171
2172
        // Delete buffer first to defragment memory less.
2173
0
        if (buffer != NULL) {
2174
0
            uprv_free(buffer);
2175
0
            buffer = NULL;
2176
0
        }
2177
0
        if (capacity > (len + GROW_EXTRA)) {
2178
            // Make the capacity equal to len or 1.
2179
            // We don't want to realloc of 0 size.
2180
0
            capacity = len + (len == 0);
2181
0
            list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity);
2182
0
            if (list == NULL) { // Check for memory allocation error.
2183
0
                setToBogus();
2184
0
                return this;
2185
0
            }
2186
0
        }
2187
2188
        // Optimize contains() and span() and similar functions.
2189
0
        if (!strings->isEmpty()) {
2190
0
            stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
2191
0
            if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
2192
                // All strings are irrelevant for span() etc. because
2193
                // all of each string's code points are contained in this set.
2194
                // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2195
                // many relevant strings as UTF-16.
2196
                // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2197
0
                delete stringSpan;
2198
0
                stringSpan = NULL;
2199
0
            }
2200
0
        }
2201
0
        if (stringSpan == NULL) {
2202
            // No span-relevant strings: Optimize for code point spans.
2203
0
            bmpSet=new BMPSet(list, len);
2204
0
            if (bmpSet == NULL) { // Check for memory allocation error.
2205
0
                setToBogus();
2206
0
            }
2207
0
        }
2208
0
    }
2209
0
    return this;
2210
0
}
2211
2212
0
int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2213
0
    if(length>0 && bmpSet!=NULL) {
2214
0
        return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2215
0
    }
2216
0
    if(length<0) {
2217
0
        length=u_strlen(s);
2218
0
    }
2219
0
    if(length==0) {
2220
0
        return 0;
2221
0
    }
2222
0
    if(stringSpan!=NULL) {
2223
0
        return stringSpan->span(s, length, spanCondition);
2224
0
    } else if(!strings->isEmpty()) {
2225
0
        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2226
0
                            UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2227
0
                            UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2228
0
        UnicodeSetStringSpan strSpan(*this, *strings, which);
2229
0
        if(strSpan.needsStringSpanUTF16()) {
2230
0
            return strSpan.span(s, length, spanCondition);
2231
0
        }
2232
0
    }
2233
2234
0
    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2235
0
        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2236
0
    }
2237
2238
0
    UChar32 c;
2239
0
    int32_t start=0, prev=0;
2240
0
    do {
2241
0
        U16_NEXT(s, start, length, c);
2242
0
        if(spanCondition!=contains(c)) {
2243
0
            break;
2244
0
        }
2245
0
    } while((prev=start)<length);
2246
0
    return prev;
2247
0
}
2248
2249
0
int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2250
0
    if(length>0 && bmpSet!=NULL) {
2251
0
        return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2252
0
    }
2253
0
    if(length<0) {
2254
0
        length=u_strlen(s);
2255
0
    }
2256
0
    if(length==0) {
2257
0
        return 0;
2258
0
    }
2259
0
    if(stringSpan!=NULL) {
2260
0
        return stringSpan->spanBack(s, length, spanCondition);
2261
0
    } else if(!strings->isEmpty()) {
2262
0
        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2263
0
                            UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2264
0
                            UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2265
0
        UnicodeSetStringSpan strSpan(*this, *strings, which);
2266
0
        if(strSpan.needsStringSpanUTF16()) {
2267
0
            return strSpan.spanBack(s, length, spanCondition);
2268
0
        }
2269
0
    }
2270
2271
0
    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2272
0
        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2273
0
    }
2274
2275
0
    UChar32 c;
2276
0
    int32_t prev=length;
2277
0
    do {
2278
0
        U16_PREV(s, 0, length, c);
2279
0
        if(spanCondition!=contains(c)) {
2280
0
            break;
2281
0
        }
2282
0
    } while((prev=length)>0);
2283
0
    return prev;
2284
0
}
2285
2286
0
int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2287
0
    if(length>0 && bmpSet!=NULL) {
2288
0
        const uint8_t *s0=(const uint8_t *)s;
2289
0
        return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2290
0
    }
2291
0
    if(length<0) {
2292
0
        length=(int32_t)uprv_strlen(s);
2293
0
    }
2294
0
    if(length==0) {
2295
0
        return 0;
2296
0
    }
2297
0
    if(stringSpan!=NULL) {
2298
0
        return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2299
0
    } else if(!strings->isEmpty()) {
2300
0
        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2301
0
                            UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2302
0
                            UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2303
0
        UnicodeSetStringSpan strSpan(*this, *strings, which);
2304
0
        if(strSpan.needsStringSpanUTF8()) {
2305
0
            return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2306
0
        }
2307
0
    }
2308
2309
0
    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2310
0
        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2311
0
    }
2312
2313
0
    UChar32 c;
2314
0
    int32_t start=0, prev=0;
2315
0
    do {
2316
0
        U8_NEXT_OR_FFFD(s, start, length, c);
2317
0
        if(spanCondition!=contains(c)) {
2318
0
            break;
2319
0
        }
2320
0
    } while((prev=start)<length);
2321
0
    return prev;
2322
0
}
2323
2324
0
int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2325
0
    if(length>0 && bmpSet!=NULL) {
2326
0
        const uint8_t *s0=(const uint8_t *)s;
2327
0
        return bmpSet->spanBackUTF8(s0, length, spanCondition);
2328
0
    }
2329
0
    if(length<0) {
2330
0
        length=(int32_t)uprv_strlen(s);
2331
0
    }
2332
0
    if(length==0) {
2333
0
        return 0;
2334
0
    }
2335
0
    if(stringSpan!=NULL) {
2336
0
        return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2337
0
    } else if(!strings->isEmpty()) {
2338
0
        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2339
0
                            UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2340
0
                            UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2341
0
        UnicodeSetStringSpan strSpan(*this, *strings, which);
2342
0
        if(strSpan.needsStringSpanUTF8()) {
2343
0
            return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2344
0
        }
2345
0
    }
2346
2347
0
    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2348
0
        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2349
0
    }
2350
2351
0
    UChar32 c;
2352
0
    int32_t prev=length;
2353
0
    do {
2354
0
        U8_PREV_OR_FFFD(s, 0, length, c);
2355
0
        if(spanCondition!=contains(c)) {
2356
0
            break;
2357
0
        }
2358
0
    } while((prev=length)>0);
2359
0
    return prev;
2360
0
}
2361
2362
U_NAMESPACE_END