Coverage Report

Created: 2026-06-23 06:26

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/common/uniset_props.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
*
6
*   Copyright (C) 1999-2014, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
*******************************************************************************
10
*   file name:  uniset_props.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2004aug25
16
*   created by: Markus W. Scherer
17
*
18
*   Character property dependent functions moved here from uniset.cpp
19
*/
20
21
#include <array>
22
#include <optional>
23
24
#include "unicode/utypes.h"
25
#include "unicode/uniset.h"
26
#include "unicode/parsepos.h"
27
#include "unicode/uchar.h"
28
#include "unicode/uscript.h"
29
#include "unicode/symtable.h"
30
#include "unicode/uset.h"
31
#include "unicode/locid.h"
32
#include "unicode/brkiter.h"
33
#include "unicode/utfiterator.h"
34
#include "uset_imp.h"
35
#include "ruleiter.h"
36
#include "cmemory.h"
37
#include "ucln_cmn.h"
38
#include "util.h"
39
#include "uvector.h"
40
#include "uprops.h"
41
#include "patternprops.h"
42
#include "propname.h"
43
#include "normalizer2impl.h"
44
#include "uinvchar.h"
45
#include "uprops.h"
46
#include "charstr.h"
47
#include "cstring.h"
48
#include "mutex.h"
49
#include "umutex.h"
50
#include "uassert.h"
51
#include "hash.h"
52
53
U_NAMESPACE_USE
54
55
namespace {
56
57
// Special property set IDs
58
constexpr char ANY[]   = "ANY";   // [\u0000-\U0010FFFF]
59
constexpr char ASCII[] = "ASCII"; // [\u0000-\u007F]
60
constexpr char ASSIGNED[] = "Assigned"; // [:^Cn:]
61
62
}  // namespace
63
64
// Cached sets ------------------------------------------------------------- ***
65
66
U_CDECL_BEGIN
67
static UBool U_CALLCONV uset_cleanup();
68
69
static UnicodeSet *uni32Singleton;
70
static icu::UInitOnce uni32InitOnce {};
71
72
/**
73
 * Cleanup function for UnicodeSet: deletes the cached uni32Singleton, clears
 * the pointer, and resets uni32InitOnce.  createUni32Set() registers this
 * function through ucln_common_registerCleanup().
 * @return always true
74
 */
75
0
static UBool U_CALLCONV uset_cleanup() {
76
0
    delete uni32Singleton;
77
0
    uni32Singleton = nullptr;  // do not leave a dangling pointer behind
78
0
    uni32InitOnce.reset();
79
0
    return true;
80
0
}
81
82
U_CDECL_END
83
84
U_NAMESPACE_BEGIN
85
86
using U_HEADER_ONLY_NAMESPACE::utfStringCodePoints;
87
88
namespace {
89
90
// Cache some sets for other services -------------------------------------- ***
91
0
// Initializer for uni32Singleton; uniset_getUnicode32Instance() hands it to
// umtx_initOnce().  Builds a UnicodeSet from the pattern [:age=3.2:], freezes
// it, and registers uset_cleanup() for UCLN_COMMON_USET.
// @param errorCode passed to the UnicodeSet constructor; set to
//        U_MEMORY_ALLOCATION_ERROR if the allocation yields nullptr.
void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
92
0
    U_ASSERT(uni32Singleton == nullptr);
93
0
    uni32Singleton = new UnicodeSet(UnicodeString(u"[:age=3.2:]"), errorCode);
94
0
    if(uni32Singleton==nullptr) {
95
0
        errorCode=U_MEMORY_ALLOCATION_ERROR;
96
0
    } else {
97
0
        uni32Singleton->freeze();
98
0
    }
99
0
    ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);  // registered on both paths
100
0
}
101
102
103
// Returns the cached uni32Singleton, running createUni32Set() through
// umtx_initOnce() on uni32InitOnce first.
// @param errorCode forwarded to umtx_initOnce() / createUni32Set().
// @return the singleton pointer; nullptr if createUni32Set() could not
//         allocate the set.
U_CFUNC UnicodeSet *
104
0
uniset_getUnicode32Instance(UErrorCode &errorCode) {
105
0
    umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
106
0
    return uni32Singleton;
107
0
}
108
109
// helper functions for matching of pattern syntax pieces ------------------ ***
110
// these functions are parallel to the PERL_OPEN etc. strings above
111
112
// using these functions is not only faster than UnicodeString::compare() and
113
// caseCompare(), but they also make UnicodeSet work for simple patterns when
114
// no Unicode properties data is available - when caseCompare() fails
115
116
// Returns true if pattern has a backslash at pos immediately followed by
// 'p' or 'P'.
inline UBool
117
0
isPerlOpen(const UnicodeString &pattern, int32_t pos) {
118
0
    char16_t c;  // holds the character after the backslash so it is read only once
119
0
    return pattern.charAt(pos)==u'\\' && ((c=pattern.charAt(pos+1))==u'p' || c==u'P');
120
0
}
121
122
/*static inline UBool
123
isPerlClose(const UnicodeString &pattern, int32_t pos) {
124
    return pattern.charAt(pos)==u'}';
125
}*/
126
127
// Returns true if pattern has a backslash at pos immediately followed by 'N'.
inline UBool
128
0
isNameOpen(const UnicodeString &pattern, int32_t pos) {
129
0
    return pattern.charAt(pos)==u'\\' && pattern.charAt(pos+1)==u'N';
130
0
}
131
132
// Returns true if pattern has '[' at pos immediately followed by ':'.
inline UBool
133
0
isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
134
0
    return pattern.charAt(pos)==u'[' && pattern.charAt(pos+1)==u':';
135
0
}
136
137
/*static inline UBool
138
isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
139
    return pattern.charAt(pos)==u':' && pattern.charAt(pos+1)==u']';
140
}*/
141
142
// TODO memory debugging provided inside uniset.cpp
143
// could be made available here but probably obsolete with use of modern
144
// memory leak checker tools
145
#define _dbgct(me)
146
147
// Returns the character with the given name or name alias, or U_SENTINEL if no such character
148
// exists.
// Calls u_charFromName() with U_EXTENDED_CHAR_NAME first, then with U_CHAR_NAME_ALIAS, and
// returns the result of the first call that succeeds.
149
11.9k
UChar32 getCharacterByName(const CharString& name) {
150
13.2k
    for (const UCharNameChoice nameChoice : std::array{U_EXTENDED_CHAR_NAME, U_CHAR_NAME_ALIAS}) {
151
13.2k
        UErrorCode ec = U_ZERO_ERROR;  // fresh error code for each lookup attempt
152
13.2k
        UChar32 ch = u_charFromName(nameChoice, name.data(), &ec);
153
13.2k
        if (U_SUCCESS(ec)) {
154
10.6k
            return ch;
155
10.6k
        }
156
13.2k
    }
157
1.30k
    return U_SENTINEL;  // neither name choice matched
158
11.9k
}
159
160
}  // namespace
161
162
//----------------------------------------------------------------
163
// Constructors &c
164
//----------------------------------------------------------------
165
166
/**
167
 * Constructs a set from the given pattern by delegating to
168
 * applyPattern(pattern, status).  See the class description for the
169
 * syntax of the pattern language.
170
 * @param pattern a string specifying what characters are in the set
 * @param status forwarded to applyPattern(), which reports errors through it
171
 */
172
UnicodeSet::UnicodeSet(const UnicodeString& pattern,
173
12.2k
                       UErrorCode& status) {
174
12.2k
    applyPattern(pattern, status);
175
12.2k
    _dbgct(this);  // expands to nothing with the empty _dbgct definition in this file
176
12.2k
}
177
178
//----------------------------------------------------------------
179
// Public API
180
//----------------------------------------------------------------
181
182
// Replaces this set's contents by parsing the whole of `pattern`.
// Parsing starts at index 0 through applyPatternIgnoreSpace(); anything other
// than trailing whitespace after the parsed pattern is an error.
// @param pattern the set pattern to parse
// @param status  left as set by applyPatternIgnoreSpace() on failure; set to
//                U_ILLEGAL_ARGUMENT_ERROR if non-whitespace text follows the pattern
// @return *this
UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
183
17.7k
                                     UErrorCode& status) {
184
    // Equivalent to
185
    //   return applyPattern(pattern, USET_IGNORE_SPACE, nullptr, status);
186
    // but without dependency on closeOver().
187
17.7k
    ParsePosition pos(0);
188
17.7k
    applyPatternIgnoreSpace(pattern, pos, nullptr, status);
189
17.7k
    if (U_FAILURE(status)) return *this;
190
191
8.04k
    int32_t i = pos.getIndex();  // index where parsing stopped
192
    // Skip over trailing whitespace
193
8.04k
    ICU_Utility::skipWhitespace(pattern, i, true);
194
8.04k
    if (i != pattern.length()) {
195
388
        status = U_ILLEGAL_ARGUMENT_ERROR;  // unparsed text remains after the pattern
196
388
    }
197
8.04k
    return *this;
198
17.7k
}
199
200
// Parses `pattern` starting at `pos` with the USET_IGNORE_SPACE option and, on
// success, stores the rebuilt pattern string with setPattern().
// @param pattern the text containing the set pattern
// @param pos     parse position handed to the RuleCharacterIterator and the parser
// @param symbols symbol table handed to the iterator and the parser; callers in
//                this file pass nullptr
// @param status  nothing is done if it already indicates failure; set to
//                U_NO_WRITE_PERMISSION if this set is frozen, and to
//                U_MALFORMED_SET if the iterator is still inside a variable
//                after parsing
void
201
UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
202
                                    ParsePosition& pos,
203
                                    const SymbolTable* symbols,
204
17.7k
                                    UErrorCode& status) {
205
17.7k
    if (U_FAILURE(status)) {
206
0
        return;
207
0
    }
208
17.7k
    if (isFrozen()) {
209
0
        status = U_NO_WRITE_PERMISSION;
210
0
        return;
211
0
    }
212
    // Need to build the pattern in a temporary string because
213
    // _applyPattern calls add() etc., which set pat to empty.
214
17.7k
    UnicodeString rebuiltPat;
215
17.7k
    RuleCharacterIterator chars(pattern, symbols, pos);
216
17.7k
    applyPattern(pattern, pos, chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, status);
217
17.7k
    if (U_FAILURE(status)) return;
218
8.04k
    if (chars.inVariable()) {
219
        // syntaxError(chars, "Extra chars in variable value");
220
0
        status = U_MALFORMED_SET;
221
0
        return;
222
0
    }
223
8.04k
    setPattern(rebuiltPat);  // only reached when parsing succeeded
224
8.04k
}
225
226
/**
227
 * Return true if the given position, in the given pattern, appears
228
 * to be the start of a UnicodeSet pattern: either a '[' at pos with at
 * least one more character after it, or anything that
 * resemblesPropertyPattern() accepts at pos.
229
 */
230
0
UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
231
0
    return ((pos+1) < pattern.length() &&
232
0
            pattern.charAt(pos) == static_cast<char16_t>(91)/*[*/) ||
233
0
        resemblesPropertyPattern(pattern, pos);
234
0
}
235
236
//----------------------------------------------------------------
237
// Implementation: Pattern parsing
238
//----------------------------------------------------------------
239
240
#define U_DEBUGGING_UNICODESET_PARSING 0
241
242
class UnicodeSet::Lexer {
243
  public:
244
    Lexer(const UnicodeString &pattern,
245
          const ParsePosition &parsePosition,
246
          RuleCharacterIterator &chars,
247
          uint32_t unicodeSetOptions,
248
          const SymbolTable *const symbols,
249
          UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute))
250
98.1k
        : pattern_(pattern), parsePosition_(parsePosition), chars_(chars),
251
98.1k
          unicodeSetOptions_(unicodeSetOptions),
252
98.1k
          charsOptions_(RuleCharacterIterator::PARSE_ESCAPES |
253
98.1k
                        ((unicodeSetOptions & USET_IGNORE_SPACE) != 0
254
98.1k
                             ? RuleCharacterIterator::SKIP_WHITESPACE
255
98.1k
                             : 0)),
256
98.1k
          symbols_(symbols),
257
98.1k
          caseClosure_(caseClosure) {}
258
259
    class LexicalElement {
260
      public:
261
224M
        bool isSetOperator(const char16_t op) const {
262
224M
            return U_SUCCESS(errorCode_) && category_ == SET_OPERATOR && string_[0] == op;
263
224M
        }
264
265
33.5M
        bool isStringLiteral() const {
266
33.5M
            return U_SUCCESS(errorCode_) && category_ == STRING_LITERAL;
267
33.5M
        }
268
269
0
        bool isNamedElement() const {
270
0
            return U_SUCCESS(errorCode_) && category_ == NAMED_ELEMENT;
271
0
        }
272
273
0
        bool isBracketedElement() const {
274
0
            return U_SUCCESS(errorCode_) && category_ == BRACKETED_ELEMENT;
275
0
        }
276
277
185k
        std::optional<UnicodeString> element() const {
278
185k
            if (U_SUCCESS(errorCode_) &&
279
185k
                (category_ == LITERAL_ELEMENT || category_ == ESCAPED_ELEMENT ||
280
185k
                 category_ == BRACKETED_ELEMENT || category_ == STRING_LITERAL)) {
281
185k
                return string_;
282
185k
            }
283
0
            return std::nullopt;
284
185k
        }
285
286
66.9M
        std::optional<UChar32> codePoint() const {
287
66.9M
            if (U_SUCCESS(errorCode_) && (category_ == LITERAL_ELEMENT || category_ == ESCAPED_ELEMENT ||
288
66.9M
                                          category_ == BRACKETED_ELEMENT || category_ == NAMED_ELEMENT)) {
289
66.9M
                return string_.char32At(0);
290
66.9M
            }
291
16.0k
            return std::nullopt;
292
66.9M
        }
293
294
        // If `*this` is a valid property-query or set-valued-variable, returns the set represented
295
        // by this lexical element, which lives at least as long as `*this`.  Null otherwise.
296
35.7M
        const UnicodeSet *set() const {
297
35.7M
            if (U_FAILURE(errorCode_)) {
298
18.3k
                return nullptr;
299
18.3k
            }
300
35.7M
            if (category_ == PROPERTY_QUERY || category_ == VARIABLE) {
301
332k
                if (precomputedSet_ != nullptr) {
302
0
                    return precomputedSet_;
303
332k
                } else {
304
332k
                    return &set_;
305
332k
                }
306
332k
            }
307
35.3M
            return nullptr;
308
35.7M
        }
309
310
43.5k
        const UErrorCode& errorCode() const{
311
43.5k
          return errorCode_;
312
43.5k
        }
313
314
#if U_DEBUGGING_UNICODESET_PARSING
315
        UnicodeString debugString() const {
316
            UnicodeString result;
317
            if (U_FAILURE(errorCode_)) {
318
                result.append(u"Ill-formed token (")
319
                    .append(UnicodeString::fromUTF8(u_errorName(errorCode_)))
320
                    .append(u"), possibly ");
321
            }
322
            return result.append(category_names_[category_])
323
                .append(u" '")
324
                .append(sourceText_)
325
                .append(u"'");
326
        }
327
#endif
328
329
      private:
330
        // See https://unicode.org/reports/tr61#Lexical-Elements.
331
        enum Category : std::uint8_t {
332
            SET_OPERATOR,
333
            LITERAL_ELEMENT,
334
            ESCAPED_ELEMENT,
335
            NAMED_ELEMENT,
336
            BRACKETED_ELEMENT,
337
            STRING_LITERAL,
338
            PROPERTY_QUERY,
339
            // Used for ill-formed variables and set-valued variables that are not directly a
340
            // property-query, e.g., $basicLatinLetters=[A-Za-z].  Variables that expand to a single
341
            // lexical element instead have the category of that lexical element, e.g., $Ll=\p{Ll} has
342
            // the category PROPERTY_QUERY, $a=a has the category LITERAL_ELEMENT, and $s={Zeichenkette}
343
            // has the category STRING_LITERAL.
344
            VARIABLE,
345
            END_OF_TEXT,
346
        };
347
        static constexpr std::array<std::u16string_view, END_OF_TEXT + 1> category_names_{{
348
            u"set-operator",
349
            u"literal-element",
350
            u"escaped-element",
351
            u"named-element",
352
            u"bracketed-element",
353
            u"string-literal",
354
            u"property-query",
355
            u"variable",
356
            u"(end of text)",
357
        }};
358
        // Private constructor used by Lexer (a friend) to build a token.
        // `string` and `set` are taken by value and moved into the members, so
        // a caller that passes an rvalue (e.g. std::move(queryResult)) does not
        // pay for an additional copy where the type supports moving.
        // `precomputedSet` is stored as a plain pointer, and `sourceText` as a
        // view.
        LexicalElement(Category category, UnicodeString string, RuleCharacterIterator::Pos after,
359
                       UErrorCode errorCode, const UnicodeSet *precomputedSet, UnicodeSet set,
360
                       std::u16string_view sourceText)
361
38.9M
            : category_(category), string_(std::move(string)), after_(after), errorCode_(errorCode),
362
38.9M
              precomputedSet_(precomputedSet), set_(std::move(set)), sourceText_(sourceText) {}
363
        Category category_;
364
        UnicodeString string_;
365
        RuleCharacterIterator::Pos after_;
366
        UErrorCode errorCode_;
367
        const UnicodeSet *precomputedSet_;
368
        UnicodeSet set_;
369
        std::u16string_view sourceText_;
370
371
        friend class Lexer;
372
    };
373
374
0
    UnicodeString getPositionForDebugging() const {
375
0
        return pattern_.tempSubString(0, parsePosition_.getIndex()) + u"☞" +
376
0
               pattern_.tempSubString(parsePosition_.getIndex(), 60);
377
0
    }
378
379
46.2M
    bool acceptSetOperator(char16_t op) {
380
46.2M
        if (lookahead().isSetOperator(op)) {
381
5.16M
            advance();
382
5.16M
            return true;
383
5.16M
        }
384
41.1M
        return false;
385
46.2M
    }
386
387
400M
    const LexicalElement &lookahead() {
388
400M
        if (!ahead_.has_value()) {
389
38.8M
            const RuleCharacterIterator::Pos before = getPos();
390
38.8M
            ahead_.emplace(nextToken());
391
38.8M
            chars_.setPos(before);
392
38.8M
        }
393
400M
        return *ahead_;
394
400M
    }
395
396
60.9k
    const LexicalElement &lookahead2() {
397
60.9k
        if (!ahead2_.has_value()) {
398
            // Note that if someone has called `getCharacterIterator` and played with the result,
399
            // `before` may not actually be before `ahead_`, but we do not actually depend on this here,
400
            // since we start from ahead_.after_.
401
60.9k
            const RuleCharacterIterator::Pos before = getPos();
402
60.9k
            chars_.setPos(lookahead().after_);
403
60.9k
            ahead2_.emplace(nextToken());
404
60.9k
            chars_.setPos(before);
405
60.9k
        }
406
60.9k
        return *ahead2_;
407
60.9k
    }
408
409
    // For use in older functions that take the `RuleCharacterIterator` directly.
410
    // Any advancement of the resulting `RuleCharacterIterator` has no effect on the result of subsequent
411
    // calls to `lookahead`, `lookahead2`, `advance`, or `acceptSetOperator`.
412
    // Once `advance` or `acceptSetOperator` has been called, the result of a call to
413
    // `getCharacterIterator` preceding the call to `advance` or `acceptSetOperator` must no longer be
414
    // used.
415
0
    RuleCharacterIterator &getCharacterIterator() {
416
0
        // Make sure we compute a correct `ahead_.after_` so we do not depend on the current value of
417
0
        // `getPos()` for lexing.
418
0
        lookahead();
419
0
        return chars_;
420
0
    }
421
422
0
    int32_t charsOptions() {
423
0
        return charsOptions_;
424
0
    }
425
426
37.1M
    bool atEnd() const {
427
37.1M
        return chars_.atEnd();
428
37.1M
    }
429
430
38.8M
    void advance() {
431
        // If someone called `getCharacterIterator`, we are now changing the character iterator under
432
        // their feet; further, we may not have an `ahead_`, so if they keep playing with it we would be
433
        // working on incorrect values of `getPos`.  This is why the result of `getCharacterIterator`
434
        // must no longer be used.
435
38.8M
        chars_.setPos(lookahead().after_);
436
38.8M
        ahead_ = ahead2_;
437
38.8M
        ahead2_.reset();
438
38.8M
    }
439
440
  private:
441
    // A version of getPos that returns its position instead of taking it as an out parameter, so we
442
    // can have const positions.
443
145M
    RuleCharacterIterator::Pos getPos() const {
444
145M
        RuleCharacterIterator::Pos result;
445
145M
        chars_.getPos(result);
446
145M
        return result;
447
145M
    }
448
449
38.9M
    LexicalElement nextToken() {
450
38.9M
        UErrorCode errorCode = U_ZERO_ERROR;
451
38.9M
        chars_.skipIgnored(charsOptions_);
452
38.9M
        if (chars_.atEnd()) {
453
5.47k
            return LexicalElement(LexicalElement::END_OF_TEXT, {}, getPos(), errorCode,
454
5.47k
                                  /*precomputedSet=*/nullptr,
455
5.47k
                                  /*set=*/{},
456
5.47k
                                  u"");
457
5.47k
        }
458
38.9M
        const int32_t start = parsePosition_.getIndex();
459
38.9M
        const RuleCharacterIterator::Pos before = getPos();
460
        // First try to get the next character without parsing escapes.
461
38.9M
        UBool unusedEscaped;
462
38.9M
        const UChar32 first =
463
38.9M
            chars_.next(charsOptions_ & ~RuleCharacterIterator::PARSE_ESCAPES, unusedEscaped, errorCode);
464
38.9M
        if (first == u'[' || first == u'\\') {
465
2.14M
            const RuleCharacterIterator::Pos afterFirst = getPos();
466
            // This could be a property-query or named-element.
467
2.14M
            const UChar32 second = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
468
2.14M
                                                                 RuleCharacterIterator::SKIP_WHITESPACE),
469
2.14M
                                               unusedEscaped, errorCode);
470
2.14M
            if ((first == u'[' && second == u':') ||
471
2.12M
                (first == u'\\' && (second == u'p' || second == u'P' || second == u'N'))) {
472
135k
                if (second == u'N') {
473
8.56k
                    UChar32 const queryResult = scanNamedElementBrackets(errorCode);
474
8.56k
                    return LexicalElement(
475
8.56k
                        LexicalElement::NAMED_ELEMENT, UnicodeString(queryResult), getPos(), errorCode,
476
8.56k
                        /*precomputedSet=*/nullptr,
477
8.56k
                        /*set=*/{},
478
8.56k
                        std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
479
127k
                } else {
480
127k
                    UnicodeSet queryResult = scanPropertyQueryAfterStart(first, second, start, errorCode);
481
127k
                    return LexicalElement(
482
127k
                        LexicalElement::PROPERTY_QUERY, {}, getPos(), errorCode,
483
127k
                        /*precomputedSet=*/nullptr, /*set=*/std::move(queryResult),
484
127k
                        std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
485
127k
                }
486
135k
            }
487
            // Not a property-query.
488
2.00M
            chars_.setPos(afterFirst);
489
2.00M
        }
490
38.7M
        if (first == u'$' && symbols_ != nullptr) {
491
0
            auto nameEnd = parsePosition_;
492
            // The SymbolTable defines the lexing of variable names past the $.
493
0
            if (UnicodeString name = symbols_->parseReference(pattern_, nameEnd, pattern_.length());
494
0
                !name.isEmpty()) {
495
0
                chars_.jumpahead(nameEnd.getIndex() - (start + 1));
496
0
                const std::u16string_view source =
497
0
                    std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start);
498
0
                const UnicodeSet *precomputedSet = symbols_->lookupSet(name);
499
0
                if (precomputedSet != nullptr) {
500
0
                    return LexicalElement(LexicalElement::VARIABLE, {}, getPos(), U_ZERO_ERROR,
501
0
                                          precomputedSet, /*set=*/{}, source);
502
0
                }
503
                // The variable was not a precomputed set.  Use the old-fashioned `lookup`, which
504
                // should give us its source text; if that parses as a single set or element, use
505
                // it.  Note that variables are not allowed in that expansion.
506
                // Implementers of higher-level syntaxes that pre-parse UnicodeSet-valued variables
507
                // can use variables in their variable definitions, but those that simply use the
508
                // source text substitution API cannot.
509
0
                const UnicodeString *const expression = symbols_->lookup(name);
510
0
                if (expression == nullptr) {
511
0
                    return LexicalElement(
512
0
                        LexicalElement::VARIABLE, {}, getPos(), U_UNDEFINED_VARIABLE,
513
0
                        /*precomputedSet=*/nullptr,
514
0
                        /*set=*/{},
515
0
                        source);
516
0
                }
517
0
                return evaluateVariable(*expression, source);
518
0
            }
519
0
        }
520
38.7M
        switch (first) {
521
1.83M
        case u'[':
522
1.83M
            return LexicalElement(
523
1.83M
                LexicalElement::SET_OPERATOR, UnicodeString(u'['), getPos(), errorCode,
524
1.83M
                /*precomputedSet=*/nullptr,
525
1.83M
                /*set=*/{},
526
1.83M
                std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
527
171k
        case u'\\': {
528
            // Now try to parse the escape.
529
171k
            chars_.setPos(before);
530
171k
            UChar32 codePoint = chars_.next(charsOptions_, unusedEscaped, errorCode);
531
171k
            return LexicalElement(
532
171k
                LexicalElement::ESCAPED_ELEMENT,
533
171k
                UnicodeString(codePoint), getPos(), errorCode,
534
171k
                nullptr,
535
171k
                /*set=*/{},
536
171k
                std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
537
0
        }
538
6.61k
        case u'&':
539
57.5k
        case u'-':
540
1.83M
        case u']':
541
3.37M
        case u'^':
542
3.38M
        case u'$':
543
            // We make $ a set-operator to handle the ICU extensions involving $.
544
3.38M
            return LexicalElement(
545
3.38M
                LexicalElement::SET_OPERATOR, UnicodeString(first), getPos(), errorCode,
546
3.38M
                /*precomputedSet=*/nullptr,
547
3.38M
                /*set=*/{},
548
3.38M
                std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
549
98.0k
        case u'{': {
550
98.0k
            UnicodeString string;
551
98.0k
            UBool escaped;
552
98.0k
            UChar32 next;
553
98.0k
            int32_t codePointCount = 0;
554
14.7M
            while (!chars_.atEnd() && U_SUCCESS(errorCode)) {
555
14.7M
                const RuleCharacterIterator::Pos beforeNext = getPos();
556
14.7M
                next = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
557
14.7M
                                                     RuleCharacterIterator::SKIP_WHITESPACE),
558
14.7M
                                   unusedEscaped, errorCode);
559
14.7M
                if (next == u'\\') {
560
9.42k
                    const UChar32 afterBackslash =
561
9.42k
                        chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
562
9.42k
                                                      RuleCharacterIterator::SKIP_WHITESPACE),
563
9.42k
                                    unusedEscaped, errorCode);
564
9.42k
                    if (afterBackslash == u'N') {
565
816
                        next = scanNamedElementBrackets(errorCode);
566
816
                        escaped = true;
567
8.60k
                    } else if (afterBackslash == u'p' || afterBackslash == u'P') {
568
70
                        return LexicalElement(LexicalElement::STRING_LITERAL, {}, getPos(),
569
70
                                              U_MALFORMED_SET,
570
70
                                              /*precomputedSet=*/nullptr,
571
70
                                              /*set=*/{},
572
70
                                              std::u16string_view(pattern_).substr(
573
70
                                                  start, parsePosition_.getIndex() - start));
574
8.53k
                    } else {
575
8.53k
                        chars_.setPos(beforeNext);
576
                        // Parse the escape.
577
8.53k
                        next = chars_.next(charsOptions_, escaped, errorCode);
578
8.53k
                    }
579
14.7M
                } else {
580
14.7M
#if U_ICU_VERSION_MAJOR_NUM < 81
581
14.7M
                    if (U_SUCCESS(errorCode) && PatternProps::isWhiteSpace(next)) {
582
                        // Transitional prohibition of unescaped spaces in string literals (in
583
                        // ICU 78 and earlier, these were ignored; in ICU 81 they will mean
584
                        // themselves).
585
138
                        errorCode = UErrorCode::U_ILLEGAL_ARGUMENT_ERROR;
586
138
                    }
587
#else
588
#error Remove this transitional check, see ICU-23307 and ICU-TC minutes of 2026-01-16.
589
#endif
590
14.7M
                    escaped = false;
591
14.7M
                }
592
14.7M
                if (!escaped && next == u'}') {
593
97.3k
                    return LexicalElement(
594
97.3k
                        codePointCount == 1 ? LexicalElement::BRACKETED_ELEMENT
595
97.3k
                                            : LexicalElement::STRING_LITERAL,
596
97.3k
                        std::move(string), getPos(), errorCode,
597
97.3k
                        /*precomputedSet=*/nullptr,
598
97.3k
                        /*set=*/{},
599
97.3k
                        std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
600
97.3k
                }
601
14.6M
                string.append(next);
602
14.6M
                codePointCount += 1;
603
14.6M
            }
604
570
            return LexicalElement(
605
570
                LexicalElement::STRING_LITERAL, {}, getPos(), U_MALFORMED_SET,
606
570
                /*precomputedSet=*/nullptr,
607
570
                /*set=*/{},
608
570
                std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
609
98.0k
        }
610
33.2M
        default:
611
33.2M
            return LexicalElement(
612
33.2M
                LexicalElement::LITERAL_ELEMENT, UnicodeString(first), getPos(), errorCode, nullptr,
613
33.2M
                /*set=*/{},
614
33.2M
                std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start));
615
38.7M
        }
616
38.7M
    }
617
618
11.5k
    UChar32 scanNamedElementBrackets(UErrorCode &errorCode) {
619
11.5k
        UBool unusedEscaped;
620
11.5k
        const UChar32 open = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
621
11.5k
                                                           RuleCharacterIterator::SKIP_WHITESPACE),
622
11.5k
                                         unusedEscaped, errorCode);
623
11.5k
        if (open == u'{') {
624
11.4k
            int32_t start = parsePosition_.getIndex();
625
11.4k
            std::optional<UChar32> hex;
626
11.4k
            std::optional<UChar32> literal;
627
1.61M
            while (!chars_.atEnd() && U_SUCCESS(errorCode)) {
628
1.61M
                UChar32 last = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
629
1.61M
                                                             RuleCharacterIterator::SKIP_WHITESPACE),
630
1.61M
                                           unusedEscaped, errorCode);
631
1.61M
                if (last == u':') {
632
707
                    if (!hex.has_value()) {
633
499
                        hex.emplace();
634
499
                        for (char16_t digit : std::u16string_view(pattern_).substr(
635
996
                                 start, parsePosition_.getIndex() - 1 - start)) {
636
996
                            uint8_t nibble;
637
996
                            if (digit >= u'0' && digit <= u'9') {
638
559
                                nibble = digit - '0';
639
559
                            } else {
640
437
                                digit = digit & ~0x20;
641
437
                                if (digit >= u'A' && digit <= u'F') {
642
318
                                    nibble = digit - u'A' + 0xA;
643
318
                                } else {
644
119
                                    errorCode = U_ILLEGAL_ARGUMENT_ERROR;
645
119
                                    return {};
646
119
                                }
647
437
                            }
648
877
                            *hex = (*hex << 4) + nibble;
649
877
                            if (hex > 0x10FFFF) {
650
21
                                errorCode = U_ILLEGAL_ARGUMENT_ERROR;
651
21
                                return {};
652
21
                            }
653
877
                        }
654
499
                    } else if (!literal.has_value()) {
655
201
                        const auto literalCodePoints = utfStringCodePoints<UChar32, UTF_BEHAVIOR_FFFD>(
656
201
                            std::u16string_view(pattern_).substr(start,
657
201
                                                                 parsePosition_.getIndex() - 1 - start));
658
201
                        auto it = literalCodePoints.begin();
659
201
                        if (it == literalCodePoints.end() || !it->wellFormed() ||
660
161
                                (literal = it->codePoint(), ++it) != literalCodePoints.end()) {
661
161
                            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
662
161
                            return {};
663
161
                        }
664
201
                    } else {
665
7
                        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
666
7
                        return {};
667
7
                    }
668
399
                    start = parsePosition_.getIndex();
669
1.61M
                } else if (last == u'}') {
670
10.9k
                    const std::u16string_view u16name = std::u16string_view(pattern_).substr(
671
10.9k
                        start, parsePosition_.getIndex() - 1 - start);
672
10.9k
                    const UChar32 result = getCharacterByName(CharString().appendInvariantChars(
673
10.9k
                        u16name.data(), static_cast<int32_t>(u16name.length()), errorCode));
674
10.9k
                    if (!U_SUCCESS(errorCode)) {
675
                        // Convert U_INVARIANT_CONVERSION_ERROR to U_ILLEGAL_ARGUMENT_ERROR.
676
98
                        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
677
98
                        return {};
678
98
                    }
679
10.8k
                    if (result < 0 || (hex.has_value() && result != hex) ||
680
9.70k
                            (literal.has_value() && result != literal)) {
681
1.11k
                        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
682
1.11k
                        return {};
683
1.11k
                    }
684
9.70k
                    return result;
685
10.8k
                }
686
1.61M
            }
687
11.4k
        }
688
337
        if (U_SUCCESS(errorCode)) {
689
337
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
690
337
        }
691
337
        return {};
692
11.5k
    }
693
694
0
    LexicalElement evaluateVariable(const UnicodeString &expression, const std::u16string_view source) {
695
0
        UErrorCode errorCode = U_ZERO_ERROR;
696
0
        ParsePosition expressionPosition;
697
0
        RuleCharacterIterator expressionIterator(expression, symbols_, expressionPosition);
698
        // Do not pass the symbols: we do not support recursive expansion of variables.
699
0
        Lexer expressionLexer(expression, expressionPosition, expressionIterator, unicodeSetOptions_,
700
0
                              /*symbols=*/nullptr, caseClosure_);
701
0
        auto variableToken = expressionLexer.lookahead();
702
0
        if (variableToken.isSetOperator(u'[')) {
703
0
            UnicodeString rebuiltPattern;
704
0
            UnicodeSet expressionValue;
705
0
            expressionValue.parseUnicodeSet(expressionLexer, rebuiltPattern, unicodeSetOptions_,
706
0
                                            caseClosure_, /*depth=*/0, errorCode);
707
0
            expressionValue.setPattern(rebuiltPattern);
708
0
            if (!expressionLexer.atEnd()) {
709
0
                return LexicalElement(
710
0
                    LexicalElement::VARIABLE, {}, getPos(), U_MALFORMED_VARIABLE_DEFINITION,
711
0
                    /*precomputedSet=*/nullptr,
712
0
                    /*set=*/{},
713
0
                    source);
714
0
            }
715
0
            return LexicalElement(
716
0
                LexicalElement::VARIABLE, {}, getPos(), errorCode,
717
0
                /*precomputedSet=*/nullptr,
718
0
                /*set=*/std::move(expressionValue),
719
0
                source);
720
0
        } else {
721
0
            expressionLexer.advance();
722
0
            if (!expressionLexer.atEnd()) {
723
0
                return LexicalElement(
724
0
                    LexicalElement::VARIABLE, {}, getPos(), U_MALFORMED_VARIABLE_DEFINITION,
725
0
                    /*precomputedSet=*/nullptr,
726
0
                    /*set=*/{},
727
0
                    source);
728
0
            }
729
0
            switch (variableToken.category_) {
730
0
            case LexicalElement::LITERAL_ELEMENT:
731
0
            case LexicalElement::ESCAPED_ELEMENT:
732
0
            case LexicalElement::NAMED_ELEMENT:
733
0
            case LexicalElement::BRACKETED_ELEMENT:
734
0
            case LexicalElement::STRING_LITERAL:
735
0
            case LexicalElement::PROPERTY_QUERY:
736
                // Return the same lexical element that we found while parsing the variable contents,
737
                // except the source position corresponds to the position of the variable rather than 0
738
                // in its expansion, and the source is the name of the variable rather than its
739
                // expansion.
740
0
                return LexicalElement(
741
0
                    variableToken.category_, std::move(variableToken.string_), getPos(),
742
0
                    variableToken.errorCode_, variableToken.precomputedSet_, std::move(variableToken.set_), source);
743
0
            default:
744
0
                return LexicalElement(LexicalElement::VARIABLE, {}, getPos(),
745
0
                                      U_MALFORMED_VARIABLE_DEFINITION,
746
0
                                      /*precomputedSet=*/nullptr,
747
0
                                      /*set=*/{}, source);
748
0
            }
749
0
        }
750
0
    }
751
752
127k
    UnicodeSet scanPropertyQueryAfterStart(UChar32 first, UChar32 second, int32_t queryStart, UErrorCode &errorCode) {
753
127k
        std::optional<int32_t> queryOperatorPosition;
754
127k
        int32_t queryExpressionStart = parsePosition_.getIndex();
755
127k
        bool exteriorlyNegated = false;
756
127k
        bool interiorlyNegated = false;
757
127k
        UBool unusedEscaped;
758
        // Do not skip whitespace so we can recognize unspaced :].  Lex escapes and
759
        // named-element: while ICU does not support string-valued properties and thus has no
760
        // use for escapes, we still want to lex through escapes to allow downstream
761
        // implementations (mostly unicodetools) to implement string-valued properties.
762
127k
        const UChar32 third = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
763
127k
                                                            RuleCharacterIterator::SKIP_WHITESPACE),
764
127k
                                          unusedEscaped, errorCode);
765
127k
        if (first == u'\\') {
766
110k
            if (third != u'{') {
767
134
                errorCode = U_ILLEGAL_ARGUMENT_ERROR;
768
134
                return {};
769
134
            }
770
110k
            exteriorlyNegated = second == u'P';
771
110k
            queryExpressionStart = parsePosition_.getIndex();
772
110k
        } else {
773
16.7k
            if (third == u'^') {
774
286
                exteriorlyNegated = true;
775
286
                queryExpressionStart = parsePosition_.getIndex();
776
286
            }
777
16.7k
        }
778
126k
        RuleCharacterIterator::Pos beforePenultimate = getPos();
779
126k
        UChar32 penultimateUnescaped =
780
126k
            chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
781
126k
                                          RuleCharacterIterator::SKIP_WHITESPACE),
782
126k
                        unusedEscaped, errorCode);
783
784
11.3M
        while (!chars_.atEnd() && U_SUCCESS(errorCode)) {
785
11.3M
            const RuleCharacterIterator::Pos beforeLast = getPos();
786
11.3M
            UChar32 lastUnescaped =
787
11.3M
                chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES |
788
11.3M
                                              RuleCharacterIterator::SKIP_WHITESPACE),
789
11.3M
                            unusedEscaped, errorCode);
790
11.3M
            if (penultimateUnescaped == u'\\') {
791
20.2k
                if (lastUnescaped == 'N') {
792
2.18k
                    scanNamedElementBrackets(errorCode);
793
2.18k
                    if (!U_SUCCESS(errorCode)) {
794
334
                        return {};
795
334
                    }
796
18.0k
                } else {
797
                    // There must be an escaped-element starting at beforePenultimate.  Go
798
                    // back there and advance through it.
799
18.0k
                    chars_.setPos(beforePenultimate);
800
18.0k
                    chars_.next(charsOptions_ & ~RuleCharacterIterator::SKIP_WHITESPACE, unusedEscaped,
801
18.0k
                                errorCode);
802
18.0k
                }
803
                // Neither a named-element nor an escaped-element can be part of a closing :].
804
19.8k
                lastUnescaped = -1;
805
11.3M
            } else if (!queryOperatorPosition.has_value() && lastUnescaped == u'=') {
806
69.0k
                queryOperatorPosition = parsePosition_.getIndex() - 1;
807
11.3M
            } else if (!queryOperatorPosition.has_value() && lastUnescaped == u'≠') {
808
20
                if (exteriorlyNegated) {
809
                    // Reject doubly negated property queries.
810
3
                    errorCode = U_ILLEGAL_ARGUMENT_ERROR;
811
3
                    return {};
812
3
                }
813
17
                interiorlyNegated = true;
814
17
                queryOperatorPosition = parsePosition_.getIndex() - 1;
815
11.3M
            } else if ((first == u'[' && penultimateUnescaped == u':' && lastUnescaped == u']') ||
816
11.2M
                       (first == u'\\' && lastUnescaped == u'}')) {
817
                // Note that no unescaping is performed here, as ICU does not support string-valued or
818
                // miscellaneous properties.
819
126k
                const int32_t queryExpressionLimit =
820
126k
                    first == u'[' ? parsePosition_.getIndex() - 2 : parsePosition_.getIndex() - 1;
821
                // Contrary to Java, applyPropertyAlias does not support a null property-predicate in
822
                // C++; instead "" indicates the absence of a property-predicate.  This is OK with the
823
                // properties supported by ICU, but not with string-valued or miscellaneous properties;
824
                // see https://github.com/unicode-org/icu/pull/3456.
825
126k
                UnicodeString propertyPredicate;
826
126k
                if (queryOperatorPosition.has_value()) {
827
68.9k
                    propertyPredicate =
828
68.9k
                        pattern_.tempSubStringBetween(*queryOperatorPosition + 1, queryExpressionLimit);
829
68.9k
                    if (propertyPredicate.isEmpty()) {
830
                        // \p{X=} is valid if X is a string-valued or miscellaneous property, but
831
                        // ICU does not support those.  Thus, it is invalid for ICU purposes, and
832
                        // passing an empty propertyPredicate to applyPropertyAlias can be valid
833
                        // (this is how we represent \p{X}), so we need to return the error here.
834
298
                        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
835
298
                        return {};
836
298
                    }
837
68.9k
                }
838
125k
                UnicodeSet result;
839
125k
                result.applyPropertyAlias(
840
125k
                    pattern_.tempSubStringBetween(queryExpressionStart,
841
125k
                                                  queryOperatorPosition.value_or(queryExpressionLimit)),
842
125k
                    propertyPredicate, errorCode);
843
125k
                if (exteriorlyNegated != interiorlyNegated) {
844
21.5k
                    result.complement().removeAllStrings();
845
21.5k
                }
846
125k
                result.setPattern(pattern_.tempSubStringBetween(queryStart, parsePosition_.getIndex()));
847
125k
                return result;
848
126k
            }
849
11.2M
            beforePenultimate = beforeLast;
850
11.2M
            penultimateUnescaped = lastUnescaped;
851
11.2M
        }
852
510
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
853
510
        return {};
854
126k
    }
855
856
    const UnicodeString &pattern_;
857
    const ParsePosition &parsePosition_;
858
    RuleCharacterIterator &chars_;
859
    const uint32_t unicodeSetOptions_;
860
    const int32_t charsOptions_;
861
    const SymbolTable *const symbols_;
862
    UnicodeSet &(UnicodeSet::* const caseClosure_)(int32_t attribute);
863
    std::optional<LexicalElement> ahead_;
864
    std::optional<LexicalElement> ahead2_;
865
};
866
867
namespace {
868
869
constexpr int32_t MAX_DEPTH = 100;
870
871
#if U_DEBUGGING_UNICODESET_PARSING
872
873
#define U_UNICODESET_RETURN_IF_ERROR(ec)                                                                \
874
    do {                                                                                                \
875
    constexpr std::string_view functionName = __func__;\
876
    static_assert (functionName.substr(0, 5) == "parse");\
877
        if (U_FAILURE(ec)) {                                                                            \
878
            if (depth < 5) {                                                                            \
879
                printf("--- in %s l. %d\n", __func__+5, __LINE__);                                        \
880
            } else if (depth == 5 && std::string_view(__func__+5) == "UnicodeSet") {                 \
881
                printf("--- [...]\n");                                                                  \
882
            }                                                                                           \
883
            return;                                                                                     \
884
        }                                                                                               \
885
    } while (false)
886
#define U_UNICODESET_RETURN_WITH_PARSE_ERROR(expected, actual, lexer, ec)                               \
887
    do {                                                                                                \
888
        constexpr std::string_view functionName = __func__;                                             \
889
        static_assert(functionName.substr(0, 5) == "parse");                                            \
890
        std::string actualUTF8;                                                                         \
891
        std::string contextUTF8;                                                                        \
892
        printf("*** Expected %s, got %s %s\n", (expected),                                              \
893
               UnicodeString(actual).toUTF8String(actualUTF8).c_str(),                                  \
894
               lexer.getPositionForDebugging().toUTF8String(contextUTF8).c_str());                      \
895
        printf("--- in %s l. %d\n", __func__ + 5, __LINE__);                                            \
896
        if (U_FAILURE(lexer.lookahead().errorCode())) {                                                 \
897
            (ec) = lexer.lookahead().errorCode();                                                       \
898
        } else {                                                                                        \
899
            (ec) = U_MALFORMED_SET;                                                                     \
900
        }                                                                                               \
901
        return;                                                                                         \
902
    } while (false)
903
904
#else
905
906
#define U_UNICODESET_RETURN_IF_ERROR(ec)                                                                \
907
74.4M
    do {                                                                                                \
908
74.4M
        if (U_FAILURE(ec)) {                                                                            \
909
179k
            return;                                                                                     \
910
179k
        }                                                                                               \
911
74.4M
    } while (false)
912
#define U_UNICODESET_RETURN_WITH_PARSE_ERROR(expected, actual, lexer, ec)                               \
913
25.1k
    do {                                                                                                \
914
25.1k
        if (U_FAILURE(lexer.lookahead().errorCode())) {                                                 \
915
18.3k
            (ec) = lexer.lookahead().errorCode();                                                       \
916
18.3k
        } else {                                                                                        \
917
6.71k
            (ec) = U_MALFORMED_SET;                                                                     \
918
6.71k
        }                                                                                               \
919
25.1k
        return;                                                                                         \
920
25.1k
    } while (false)
921
922
#endif
923
924
}  // namespace
925
926
/**
927
 * Parse the pattern from the given RuleCharacterIterator.  The
928
 * iterator is advanced over the parsed pattern.
929
 * @param pattern The pattern; the lexer reads substrings of it (e.g., property queries and named elements).
930
 * @param parsePosition The ParsePosition underlying chars; the lexer reads its index to locate substrings of pattern.
931
 * @param chars iterator over the pattern characters.  Upon return
932
 * it will be advanced to the first character after the parsed
933
 * pattern, or the end of the iteration if all characters are
934
 * parsed.
935
 * @param symbols symbol table to use to parse and dereference
936
 * variables, or null if none.
937
 * @param rebuiltPat the pattern that was parsed, rebuilt or
938
 * copied from the input pattern, as appropriate.
939
 * @param options a bit mask of zero or more of the following:
940
 * IGNORE_SPACE, CASE.
941
 */
942
943
void UnicodeSet::applyPattern(const UnicodeString &pattern,
944
                              const ParsePosition &parsePosition,
945
                              RuleCharacterIterator &chars,
946
                              const SymbolTable *symbols,
947
                              UnicodeString &rebuiltPat,
948
                              uint32_t options,
949
                              UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
950
98.1k
                              UErrorCode &ec) {
951
98.1k
    if (U_FAILURE(ec)) return;
952
98.1k
    Lexer lexer(pattern, parsePosition, chars, options, symbols, caseClosure);
953
98.1k
    parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, /*depth=*/0, ec);
954
98.1k
}
955
956
void UnicodeSet::parseUnicodeSet(Lexer &lexer,
957
                                 UnicodeString& rebuiltPat,
958
                                 uint32_t options,
959
                                 UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
960
                                 int32_t depth,
961
1.94M
                                 UErrorCode &ec) {
962
1.94M
    clear();
963
964
1.94M
    if (depth > MAX_DEPTH) {
965
37
        U_UNICODESET_RETURN_WITH_PARSE_ERROR(("depth <= " + std::to_string(MAX_DEPTH)).c_str(),
966
37
                                             ("depth = " + std::to_string(depth)).c_str(), lexer, ec);
967
37
    }
968
969
1.94M
    bool isComplement = false;
970
    // Whether to keep the syntax of the pattern at this level, only doing basic pretty-printing, e.g.,
971
    // turn [ c - z[a]a - b ] into [c-z[a]a-b], but not into [a-z].
972
    // This is true for a property query, or when there is a nested set.  Note that since we recurse,
973
    // innermost sets consisting only of ranges will get simplified.
974
1.94M
    bool preserveSyntaxInPattern = false;
975
    // A pattern that preserves the original syntax but strips spaces, normalizes escaping, etc.
976
1.94M
    UnicodeString prettyPrintedPattern;
977
1.94M
    if (lexer.lookahead().set() != nullptr) {
978
        // UnicodeSet ::= property-query | named-element
979
        // Extension:
980
        //              | set-valued-variable
981
111k
        *this = *lexer.lookahead().set();
982
111k
        this->_toPattern(prettyPrintedPattern, /*escapeUnprintable=*/false);
983
111k
        lexer.advance();
984
111k
        preserveSyntaxInPattern = true;
985
1.83M
    } else {
986
        // UnicodeSet ::=                [   Union ]
987
        //              | Complement ::= [ ^ Union ]
988
1.83M
        if (lexer.acceptSetOperator(u'[')) {
989
1.83M
            prettyPrintedPattern.append(u'[');
990
1.83M
            if (lexer.acceptSetOperator(u'^')) {
991
1.53M
                prettyPrintedPattern.append(u'^');
992
1.53M
                isComplement = true;
993
1.53M
            }
994
1.83M
            parseUnion(lexer, prettyPrintedPattern, options, caseClosure, depth,
995
1.83M
                       /*containsRestrictions=*/preserveSyntaxInPattern, ec);
996
1.83M
            U_UNICODESET_RETURN_IF_ERROR(ec);
997
1.78M
            if (!lexer.acceptSetOperator(u']')) {
998
5.40k
                U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", lexer.lookahead().debugString(), lexer, ec);
999
5.40k
            }
1000
1.78M
            prettyPrintedPattern.append(u']');
1001
1.78M
        } else {
1002
3.54k
            U_UNICODESET_RETURN_WITH_PARSE_ERROR(R"([: | \p | \P | \N | [)",
1003
3.54k
                                                 lexer.lookahead().debugString(), lexer,
1004
3.54k
                                                 ec);
1005
3.54k
        }
1006
1.83M
    }
1007
1008
    /**
1009
     * Handle global flags (isComplement, case insensitivity).  If this
1010
     * pattern should be compiled case-insensitive, then we need
1011
     * to close over case BEFORE COMPLEMENTING.  This makes
1012
     * patterns like /[^abc]/i work.
1013
     */
1014
1.89M
    if ((options & USET_CASE_MASK) != 0) {
1015
100k
        (this->*caseClosure)(options);
1016
100k
    }
1017
1.89M
    if (isComplement) {
1018
1.53M
        complement().removeAllStrings();  // code point complement
1019
1.53M
    }
1020
1.89M
    if (preserveSyntaxInPattern) {
1021
244k
        rebuiltPat.append(prettyPrintedPattern);
1022
1.64M
    } else {
1023
1.64M
        _generatePattern(rebuiltPat, /*escapeUnprintable=*/false);
1024
1.64M
    }
1025
1.89M
}
1026
1027
void UnicodeSet::parseUnion(Lexer &lexer,
1028
                            UnicodeString &rebuiltPat,
1029
                            uint32_t options,
1030
                            UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
1031
                            int32_t depth,
1032
                            bool &containsRestrictions,
1033
1.83M
                            UErrorCode &ec) {
1034
    // Union ::= Terms
1035
    //         | UnescapedHyphenMinus Terms
1036
    //         | Terms UnescapedHyphenMinus
1037
    //         | UnescapedHyphenMinus Terms UnescapedHyphenMinus
1038
    // Terms ::= ""
1039
    //         | Terms Term
1040
1.83M
    if (lexer.acceptSetOperator(u'-')) {
1041
2.16k
        add(u'-');
1042
        // When we otherwise preserve the syntax, we escape an initial UnescapedHyphenMinus, but not a
1043
        // final one, for consistency with older ICU behaviour.
1044
2.16k
        rebuiltPat.append(u"\\-");
1045
2.16k
    }
1046
37.1M
    while (!lexer.atEnd()) {
1047
        // Note that while a HYPHEN-MINUS mapped by the symbol table is treated as a literal at the
1048
        // beginning of the Union, it is treated as a set elsewhere, including at the end.
1049
37.1M
        if (lexer.acceptSetOperator(u'-')) {
1050
            // We can be here on the first iteration: [--] is allowed by the
1051
            // grammar and by the old parser.
1052
5.31k
            rebuiltPat.append(u'-');
1053
5.31k
            add(u'-');
1054
5.31k
            return;
1055
37.1M
        } else if (lexer.lookahead().isSetOperator(u'$')) {
1056
11.8k
            if (lexer.lookahead2().isSetOperator(u']')) {
1057
                // ICU extensions: A $ is allowed as a literal-element.
1058
                // A Term at the end of a Union consisting of a single $ is an anchor.
1059
2.71k
                rebuiltPat.append(u'$');
1060
                // Consume the dollar.
1061
2.71k
                lexer.advance();
1062
2.71k
                add(U_ETHER);
1063
2.71k
                containsRestrictions = true;
1064
2.71k
                return;
1065
2.71k
            }
1066
11.8k
        }
1067
37.1M
        if (lexer.lookahead().isSetOperator(u']')) {
1068
1.77M
            return;
1069
1.77M
        }
1070
35.3M
        parseTerm(lexer, rebuiltPat, options, caseClosure, depth, containsRestrictions, ec);
1071
35.3M
        U_UNICODESET_RETURN_IF_ERROR(ec);
1072
35.3M
    }
1073
1.83M
}
1074
1075
void UnicodeSet::parseTerm(Lexer &lexer,
1076
                           UnicodeString &rebuiltPat,
1077
                           uint32_t options,
1078
                           UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
1079
                           int32_t depth,
1080
                           bool &containsRestriction,
1081
35.3M
                           UErrorCode &ec) {
1082
    // Term ::= Elements
1083
    //        | Restriction
1084
35.3M
    if (lexer.lookahead().isSetOperator('[') || lexer.lookahead().set() != nullptr) {
1085
1.83M
        containsRestriction = true;
1086
1.83M
        parseRestriction(lexer, rebuiltPat, options, caseClosure, depth, ec);
1087
1.83M
        U_UNICODESET_RETURN_IF_ERROR(ec);
1088
33.5M
    } else {
1089
33.5M
        parseElements(lexer, rebuiltPat, ec);
1090
33.5M
        U_UNICODESET_RETURN_IF_ERROR(ec);
1091
33.5M
    }
1092
35.3M
}
1093
1094
void UnicodeSet::parseRestriction(Lexer &lexer,
1095
                                  UnicodeString &rebuiltPat,
1096
                                  uint32_t options,
1097
                                  UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute),
1098
                                  int32_t depth,
1099
1.83M
                                  UErrorCode &ec) {
1100
    // Parse a https://www.unicode.org/reports/tr61/#Restriction:
1101
    //   Restriction  ::= UnicodeSet
1102
    //                  | Intersection
1103
    //                  | Difference
1104
    //   Intersection ::= Restriction & UnicodeSet
1105
    //   Difference   ::= Restriction - UnicodeSet
1106
    // or, rewritten to be LL,
1107
    //   Restriction    ::= UnicodeSet RightHandSides
1108
    //   RightHandSides ::= ""
1109
    //                    | & UnicodeSet RightHandSides
1110
    //                    | - UnicodeSet RightHandSides
1111
    // but note that the tree resulting from this LL version is not an expression tree: the
1112
    // operations are left-associative.
1113
    // Start by parsing the first UnicodeSet.
1114
1.83M
    UnicodeSet leftHandSide;
1115
1.83M
    leftHandSide.parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, depth + 1, ec);
1116
1.83M
    addAll(leftHandSide);
1117
1.83M
    U_UNICODESET_RETURN_IF_ERROR(ec);
1118
    // Now keep looking for an operator that would continue the RightHandSide.
1119
    // The loop terminates because when we run out of source text, the lookahead token will not be a set
1120
    // operator, so that we hit the else branch and return.
1121
1.81M
    for (;;) {
1122
1.81M
        if (lexer.acceptSetOperator(u'&')) {
1123
            // Intersection ::= Restriction & UnicodeSet
1124
6.57k
            rebuiltPat.append(u'&');
1125
6.57k
            UnicodeSet rightHandSide;
1126
6.57k
            rightHandSide.parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, depth + 1, ec);
1127
6.57k
            U_UNICODESET_RETURN_IF_ERROR(ec);
1128
5.67k
            retainAll(rightHandSide);
1129
1.81M
        } else if (lexer.lookahead().isSetOperator(u'-')) {
1130
            // Here the grammar requires two tokens of lookahead to figure out whether the - is the operator
1131
            // of a Difference or an UnescapedHyphenMinus in the enclosing Union.
1132
12.7k
            if (lexer.lookahead2().isSetOperator(u']')) {
1133
                // The operator is actually an UnescapedHyphenMinus; terminate the Restriction
1134
                // before it.  We return to parseTerm, which immediately returns to parseUnion,
1135
                // which will accept the - and add it to *this.
1136
2.80k
                return;
1137
2.80k
            }
1138
            // Consume the hyphen-minus.
1139
9.98k
            lexer.advance();
1140
            // Difference ::= Restriction - UnicodeSet
1141
9.98k
            rebuiltPat.append(u'-');
1142
9.98k
            UnicodeSet rightHandSide;
1143
9.98k
            rightHandSide.parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, depth + 1, ec);
1144
9.98k
            U_UNICODESET_RETURN_IF_ERROR(ec);
1145
8.46k
            removeAll(rightHandSide);
1146
1.79M
        } else {
1147
            // Not an operator, end of the Restriction.
1148
1.79M
            return;
1149
1.79M
        }
1150
1.81M
    }
1151
1.80M
}
1152
1153
void UnicodeSet::parseElements(Lexer &lexer,
1154
                               UnicodeString &rebuiltPat,
1155
33.5M
                               UErrorCode &ec) {
1156
    // Elements     ::= Element
1157
    //                | Range
1158
    // Range        ::= RangeElement - RangeElement
1159
    // RangeElement ::= literal-element
1160
    //                | escaped-element
1161
    //                | named-element
1162
    //                | bracketed-element
1163
    // Element      ::= RangeElement
1164
    //                | string-literal
1165
    // codePoint().has_value() on a lexical element if it is a RangeElement.
1166
33.5M
    if (lexer.lookahead().isStringLiteral()) {
1167
92.5k
        add(*lexer.lookahead().element());
1168
92.5k
        rebuiltPat.append(u'{');
1169
92.5k
        _appendToPat(rebuiltPat, *lexer.lookahead().element(), /*escapeUnprintable=*/false);
1170
92.5k
        rebuiltPat.append(u'}');
1171
92.5k
        lexer.advance();
1172
92.5k
        return;
1173
92.5k
    }
1174
33.4M
    UChar32 first;
1175
33.4M
    if (lexer.lookahead().isSetOperator(u'$')) {
1176
        // Disallowed by UTS #61, but historically accepted by ICU.  This is an extension.
1177
9.13k
        first = u'$';
1178
33.4M
    } else if (lexer.lookahead().codePoint().has_value()) {
1179
33.4M
        first = *lexer.lookahead().codePoint();
1180
33.4M
    } else {
1181
16.0k
        U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement | string-literal",
1182
16.0k
                                             lexer.lookahead().debugString(),
1183
16.0k
                                             lexer, ec);
1184
16.0k
    }
1185
33.4M
    lexer.advance();
1186
33.4M
    _appendToPat(rebuiltPat, first, /*escapeUnprintable=*/false);
1187
33.4M
    if (!lexer.lookahead().isSetOperator(u'-')) {
1188
        // No operator,
1189
        // Elements ::= Element
1190
33.4M
        add(first);
1191
33.4M
        return;
1192
33.4M
    }
1193
    // Here the grammar requires two tokens of lookahead to figure out whether the - is the operator
1194
    // of a Range or an UnescapedHyphenMinus in the enclosing Union.
1195
35.3k
    if (lexer.lookahead2().isSetOperator(u']')) {
1196
        // The operator is actually an UnescapedHyphenMinus; terminate the Elements before it.
1197
1.93k
        add(first);
1198
1.93k
        return;
1199
1.93k
    }
1200
    // Consume the hyphen-minus.
1201
33.4k
    lexer.advance();
1202
    // Elements ::= Range ::= RangeElement - RangeElement
1203
33.4k
    rebuiltPat.append(u'-');
1204
33.4k
    UChar32 last;
1205
33.4k
    if (lexer.lookahead().isSetOperator(u'$')) {
1206
        // Disallowed by UTS #61, but historically accepted by ICU except at the end of a Union.
1207
        // This is an extension.
1208
908
        last = u'$';
1209
908
        if (lexer.lookahead2().isSetOperator(u']')) {
1210
3
            U_UNICODESET_RETURN_WITH_PARSE_ERROR("Term after Range ending in unescaped $",
1211
3
                                                 lexer.lookahead().debugString() + u" followed by " +
1212
3
                                                     lexer.lookahead2().debugString(),
1213
3
                                                 lexer, ec);
1214
3
        }
1215
32.5k
    } else if (lexer.lookahead().codePoint().has_value()) {
1216
32.4k
        last = *lexer.lookahead().codePoint();
1217
32.4k
    } else {
1218
47
        U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", lexer.lookahead().debugString(), lexer, ec);
1219
47
    }
1220
33.3k
    if (last <= first) {
1221
65
        U_UNICODESET_RETURN_WITH_PARSE_ERROR(
1222
65
            "first < last in Range", UnicodeString(last) + u"-" + UnicodeString(first), lexer, ec);
1223
65
    }
1224
33.3k
    lexer.advance();
1225
33.3k
    _appendToPat(rebuiltPat, last, /*escapeUnprintable=*/false);
1226
33.3k
    add(first, last);
1227
33.3k
    return;
1228
33.3k
}
1229
1230
//----------------------------------------------------------------
1231
// Property set implementation
1232
//----------------------------------------------------------------
1233
1234
namespace {
1235
1236
119M
UBool numericValueFilter(UChar32 ch, void* context) {
1237
119M
    return u_getNumericValue(ch) == *static_cast<double*>(context);
1238
119M
}
1239
1240
229M
UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
1241
229M
    int32_t value = *static_cast<int32_t*>(context);
1242
229M
    return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
1243
229M
}
1244
1245
54.5M
UBool versionFilter(UChar32 ch, void* context) {
1246
54.5M
    static const UVersionInfo none = { 0, 0, 0, 0 };
1247
54.5M
    UVersionInfo v;
1248
54.5M
    u_charAge(ch, v);
1249
54.5M
    UVersionInfo* version = static_cast<UVersionInfo*>(context);
1250
54.5M
    return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
1251
54.5M
}
1252
1253
typedef struct {
1254
    UProperty prop;
1255
    int32_t value;
1256
} IntPropertyContext;
1257
1258
336M
UBool intPropertyFilter(UChar32 ch, void* context) {
1259
336M
    IntPropertyContext* c = static_cast<IntPropertyContext*>(context);
1260
336M
    return u_getIntPropertyValue(ch, c->prop) == c->value;
1261
336M
}
1262
1263
66.1M
UBool scriptExtensionsFilter(UChar32 ch, void* context) {
1264
66.1M
    return uscript_hasScript(ch, *static_cast<UScriptCode*>(context));
1265
66.1M
}
1266
1267
0
UBool idTypeFilter(UChar32 ch, void* context) {
1268
0
    return u_hasIDType(ch, *static_cast<UIdentifierType*>(context));
1269
0
}
1270
1271
}  // namespace
1272
1273
/**
1274
 * Generic filter-based scanning code for UCD property UnicodeSets.
1275
 */
1276
void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
1277
                             void* context,
1278
                             const UnicodeSet* inclusions,
1279
282k
                             UErrorCode &status) {
1280
282k
    if (U_FAILURE(status)) return;
1281
1282
    // Logically, walk through all Unicode characters, noting the start
1283
    // and end of each range for which filter.contain(c) is
1284
    // true.  Add each range to a set.
1285
    //
1286
    // To improve performance, use an inclusions set which
1287
    // encodes information about character ranges that are known
1288
    // to have identical properties.
1289
    // inclusions contains the first characters of
1290
    // same-value ranges for the given property.
1291
1292
282k
    clear();
1293
1294
282k
    UChar32 startHasProperty = -1;
1295
282k
    int32_t limitRange = inclusions->getRangeCount();
1296
1297
420M
    for (int j=0; j<limitRange; ++j) {
1298
        // get current range
1299
420M
        UChar32 start = inclusions->getRangeStart(j);
1300
420M
        UChar32 end = inclusions->getRangeEnd(j);
1301
1302
        // for all the code points in the range, process
1303
1.22G
        for (UChar32 ch = start; ch <= end; ++ch) {
1304
            // only add to this UnicodeSet on inflection points --
1305
            // where the hasProperty value changes to false
1306
806M
            if ((*filter)(ch, context)) {
1307
57.3M
                if (startHasProperty < 0) {
1308
27.9M
                    startHasProperty = ch;
1309
27.9M
                }
1310
749M
            } else if (startHasProperty >= 0) {
1311
27.8M
                add(startHasProperty, ch-1);
1312
27.8M
                startHasProperty = -1;
1313
27.8M
            }
1314
806M
        }
1315
420M
    }
1316
282k
    if (startHasProperty >= 0) {
1317
31.5k
        add(startHasProperty, static_cast<UChar32>(0x10FFFF));
1318
31.5k
    }
1319
282k
    if (isBogus() && U_SUCCESS(status)) {
1320
        // We likely ran out of memory. AHHH!
1321
0
        status = U_MEMORY_ALLOCATION_ERROR;
1322
0
    }
1323
282k
}
1324
1325
namespace {
1326
1327
}  // namespace
1328
1329
//----------------------------------------------------------------
1330
// Property set API
1331
//----------------------------------------------------------------
1332
1333
14.6k
// Sets ec to U_ILLEGAL_ARGUMENT_ERROR and returns *this from the enclosing function.
// Only usable inside member functions that return a reference to the object itself.
#define FAIL(ec) UPRV_BLOCK_MACRO_BEGIN { \
    (ec) = U_ILLEGAL_ARGUMENT_ERROR; \
    return *this; \
} UPRV_BLOCK_MACRO_END
1337
1338
UnicodeSet&
UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
    // Replaces the contents of this set with the code points for which the given property
    // has the given value.  Nothing happens if ec is already a failure or the set is frozen.
    // A property outside the handled groups yields U_ILLEGAL_ARGUMENT_ERROR.
    if (U_FAILURE(ec) || isFrozen()) {
        return *this;
    }

    // The first three properties need a dedicated filter.
    if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
        // value is a mask of general categories.
        const UnicodeSet* starts = CharacterProperties::getInclusionsForProperty(prop, ec);
        applyFilter(generalCategoryMaskFilter, &value, starts, ec);
        return *this;
    }
    if (prop == UCHAR_SCRIPT_EXTENSIONS) {
        // value is a script code.
        const UnicodeSet* starts = CharacterProperties::getInclusionsForProperty(prop, ec);
        UScriptCode scriptCode = static_cast<UScriptCode>(value);
        applyFilter(scriptExtensionsFilter, &scriptCode, starts, ec);
        return *this;
    }
    if (prop == UCHAR_IDENTIFIER_TYPE) {
        // value is an identifier type.
        const UnicodeSet* starts = CharacterProperties::getInclusionsForProperty(prop, ec);
        UIdentifierType identifierType = static_cast<UIdentifierType>(value);
        applyFilter(idTypeFilter, &identifierType, starts, ec);
        return *this;
    }

    if (0 <= prop && prop < UCHAR_BINARY_LIMIT) {
        // Binary property: start from the ready-made set for the property.
        if (value != 0 && value != 1) {
            // No code point can have any other value.
            clear();
            return *this;
        }
        const USet *binarySet = u_getBinaryPropertySet(prop, &ec);
        if (U_FAILURE(ec)) {
            return *this;
        }
        copyFrom(*UnicodeSet::fromUSet(binarySet), true);
        if (value == 0) {
            complement().removeAllStrings();  // code point complement
        }
        return *this;
    }

    if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
        // Integer-valued property: filter on equality with the requested value.
        const UnicodeSet* starts = CharacterProperties::getInclusionsForProperty(prop, ec);
        IntPropertyContext wanted = {prop, value};
        applyFilter(intPropertyFilter, &wanted, starts, ec);
        return *this;
    }

    ec = U_ILLEGAL_ARGUMENT_ERROR;
    return *this;
}
1372
1373
UnicodeSet&
UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
                               const UnicodeString& value,
                               UErrorCode& ec) {
    // Replaces the contents of this set according to a property name and a value name.
    // With an empty value, prop is interpreted as a General_Category value, a Script value,
    // a binary property, or one of the special names ANY, ASCII, Assigned.
    // Unrecognized or unsupported names yield U_ILLEGAL_ARGUMENT_ERROR.
    if (U_FAILURE(ec) || isFrozen()) {
        return *this;
    }

    // prop and value used to be converted to char * with the default converter rather
    // than with the invariant conversion.  That should not be needed, since Unicode
    // property and value names consist of invariant characters only; names with variant
    // characters would not be found anyway.  Checking up front also avoids assertion
    // failures in the conversion.
    const bool invariantNames =
        uprv_isInvariantUString(prop.getBuffer(), prop.length()) &&
        uprv_isInvariantUString(value.getBuffer(), value.length());
    if (!invariantNames) {
        FAIL(ec);
    }
    CharString propertyName;
    CharString valueName;
    propertyName.appendInvariantChars(prop, ec);
    valueName.appendInvariantChars(value, ec);
    if (U_FAILURE(ec)) {
        return *this;
    }

    // The (property, value) pair eventually handed to applyIntPropertyValue(), and whether
    // the resulting set must be complemented afterwards.
    UProperty property;
    int32_t propertyValue;
    bool complementResult = false;

    if (value.length() > 0) {
        property = u_getPropertyEnum(propertyName.data());
        if (property == UCHAR_INVALID_CODE) {
            FAIL(ec);
        }

        // Treat gc as gcm
        if (property == UCHAR_GENERAL_CATEGORY) {
            property = UCHAR_GENERAL_CATEGORY_MASK;
        }

        const bool isBinary = property >= UCHAR_BINARY_START && property < UCHAR_BINARY_LIMIT;
        const bool isInt = property >= UCHAR_INT_START && property < UCHAR_INT_LIMIT;
        const bool isMask = property >= UCHAR_MASK_START && property < UCHAR_MASK_LIMIT;
        if (isBinary || isInt || isMask) {
            propertyValue = u_getPropertyValueEnum(property, valueName.data());
            if (propertyValue == UCHAR_INVALID_CODE) {
                // Only the combining class properties also accept a plain number.
                const bool isCombiningClass =
                    property == UCHAR_CANONICAL_COMBINING_CLASS ||
                    property == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
                    property == UCHAR_LEAD_CANONICAL_COMBINING_CLASS;
                if (!isCombiningClass) {
                    FAIL(ec);
                }
                char* parseEnd;
                const double number = uprv_strtod(valueName.data(), &parseEnd);
                // Anything between 0 and 255 is valid even if unused.
                // The range is checked before the double->int cast.  NaN is rejected here
                // as well, because every comparison with NaN is false.
                if (*parseEnd != 0 || !(0 <= number && number <= 255)) {
                    // trailing junk, or outside 0..255
                    FAIL(ec);
                }
                propertyValue = static_cast<int32_t>(number);
                if (propertyValue != number) {
                    // non-integral value
                    FAIL(ec);
                }
            }
        } else {
            switch (property) {
            case UCHAR_NUMERIC_VALUE:
                {
                    char* parseEnd;
                    double number = uprv_strtod(valueName.data(), &parseEnd);
                    if (*parseEnd != 0) {
                        FAIL(ec);
                    }
                    applyFilter(numericValueFilter, &number,
                                CharacterProperties::getInclusionsForProperty(property, ec), ec);
                    return *this;
                }
            case UCHAR_NAME:
                {
                    // The set consists of the single character with that name.
                    const UChar32 namedChar = getCharacterByName(valueName);
                    if (namedChar < 0) {
                        FAIL(ec);
                    }
                    clear();
                    add(namedChar);
                    return *this;
                }
            case UCHAR_UNICODE_1_NAME:
                // ICU 49 deprecates the Unicode_1_Name property APIs.
                FAIL(ec);
            case UCHAR_AGE:
                {
                    UVersionInfo version;
                    u_versionFromString(version, valueName.data());
                    applyFilter(versionFilter, &version,
                                CharacterProperties::getInclusionsForProperty(property, ec), ec);
                    return *this;
                }
            case UCHAR_SCRIPT_EXTENSIONS:
                // The value names are those of the Script property.
                propertyValue = u_getPropertyValueEnum(UCHAR_SCRIPT, valueName.data());
                if (propertyValue == UCHAR_INVALID_CODE) {
                    FAIL(ec);
                }
                // continue to the applyIntPropertyValue() call below
                break;
            case UCHAR_IDENTIFIER_TYPE:
                propertyValue = u_getPropertyValueEnum(property, valueName.data());
                if (propertyValue == UCHAR_INVALID_CODE) {
                    FAIL(ec);
                }
                // continue to the applyIntPropertyValue() call below
                break;
            default:
                // A non-binary, non-enumerated property that is not supported (yet).
                FAIL(ec);
            }
        }
    } else {
        // value is empty.  Interpret as General Category, Script, or
        // Binary property.
        property = UCHAR_GENERAL_CATEGORY_MASK;
        propertyValue = u_getPropertyValueEnum(property, propertyName.data());
        if (propertyValue == UCHAR_INVALID_CODE) {
            property = UCHAR_SCRIPT;
            propertyValue = u_getPropertyValueEnum(property, propertyName.data());
        }
        if (propertyValue == UCHAR_INVALID_CODE) {
            // Neither a general category nor a script.
            property = u_getPropertyEnum(propertyName.data());
            if (property >= UCHAR_BINARY_START && property < UCHAR_BINARY_LIMIT) {
                propertyValue = 1;
            } else if (0 == uprv_comparePropertyNames(ANY, propertyName.data())) {
                set(MIN_VALUE, MAX_VALUE);
                return *this;
            } else if (0 == uprv_comparePropertyNames(ASCII, propertyName.data())) {
                set(0, 0x7F);
                return *this;
            } else if (0 == uprv_comparePropertyNames(ASSIGNED, propertyName.data())) {
                // [:Assigned:]=[:^Cn:]
                property = UCHAR_GENERAL_CATEGORY_MASK;
                propertyValue = U_GC_CN_MASK;
                complementResult = true;
            } else {
                FAIL(ec);
            }
        }
    }

    applyIntPropertyValue(property, propertyValue, ec);
    if (complementResult) {
        complement().removeAllStrings();  // code point complement
    }

    if (isBogus() && U_SUCCESS(ec)) {
        // We likely ran out of memory. AHHH!
        ec = U_MEMORY_ALLOCATION_ERROR;
    }
    return *this;
}
1532
1533
//----------------------------------------------------------------
1534
// Property set patterns
1535
//----------------------------------------------------------------
1536
1537
/**
1538
 * Return true if the given position, in the given pattern, appears
1539
 * to be the start of a property set pattern.
1540
 */
1541
UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
1542
0
                                           int32_t pos) {
1543
    // Patterns are at least 5 characters long
1544
0
    if ((pos+5) > pattern.length()) {
1545
0
        return false;
1546
0
    }
1547
1548
    // Look for an opening [:, [:^, \p, or \P
1549
0
    return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
1550
0
}
1551
1552
U_NAMESPACE_END