/src/icu/icu4c/source/common/uniset_props.cpp
Line | Count | Source |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ******************************************************************************* |
5 | | * |
6 | | * Copyright (C) 1999-2014, International Business Machines |
7 | | * Corporation and others. All Rights Reserved. |
8 | | * |
9 | | ******************************************************************************* |
10 | | * file name: uniset_props.cpp |
11 | | * encoding: UTF-8 |
12 | | * tab size: 8 (not used) |
13 | | * indentation:4 |
14 | | * |
15 | | * created on: 2004aug25 |
16 | | * created by: Markus W. Scherer |
17 | | * |
18 | | * Character property dependent functions moved here from uniset.cpp |
19 | | */ |
20 | | |
21 | | #include <array> |
22 | | #include <optional> |
23 | | |
24 | | #include "unicode/utypes.h" |
25 | | #include "unicode/uniset.h" |
26 | | #include "unicode/parsepos.h" |
27 | | #include "unicode/uchar.h" |
28 | | #include "unicode/uscript.h" |
29 | | #include "unicode/symtable.h" |
30 | | #include "unicode/uset.h" |
31 | | #include "unicode/locid.h" |
32 | | #include "unicode/brkiter.h" |
33 | | #include "unicode/utfiterator.h" |
34 | | #include "uset_imp.h" |
35 | | #include "ruleiter.h" |
36 | | #include "cmemory.h" |
37 | | #include "ucln_cmn.h" |
38 | | #include "util.h" |
39 | | #include "uvector.h" |
40 | | #include "uprops.h" |
41 | | #include "patternprops.h" |
42 | | #include "propname.h" |
43 | | #include "normalizer2impl.h" |
44 | | #include "uinvchar.h" |
45 | | #include "uprops.h" |
46 | | #include "charstr.h" |
47 | | #include "cstring.h" |
48 | | #include "mutex.h" |
49 | | #include "umutex.h" |
50 | | #include "uassert.h" |
51 | | #include "hash.h" |
52 | | |
53 | | U_NAMESPACE_USE |
54 | | |
// File-local string constants naming the special property set IDs; the trailing comment on each
// line gives the set that the ID stands for.
// NOTE(review): none of these constants is referenced in the part of the file visible here;
// presumably they are compared against property names further down. Confirm.
55 | | namespace { |
56 | | |
57 | | // Special property set IDs |
58 | | constexpr char ANY[] = "ANY"; // [\u0000-\U0010FFFF] |
59 | | constexpr char ASCII[] = "ASCII"; // [\u0000-\u007F] |
60 | | constexpr char ASSIGNED[] = "Assigned"; // [:^Cn:] |
61 | | |
62 | | } // namespace |
63 | | |
64 | | // Cached sets ------------------------------------------------------------- *** |
65 | | |
66 | | U_CDECL_BEGIN |
67 | | static UBool U_CALLCONV uset_cleanup(); |
68 | | |
69 | | static UnicodeSet *uni32Singleton; |
70 | | static icu::UInitOnce uni32InitOnce {}; |
71 | | |
72 | | /** |
73 | | * Cleanup function for UnicodeSet |
74 | | */ |
75 | 0 | static UBool U_CALLCONV uset_cleanup() { |
76 | 0 | delete uni32Singleton; |
77 | 0 | uni32Singleton = nullptr; |
78 | 0 | uni32InitOnce.reset(); |
79 | 0 | return true; |
80 | 0 | } |
81 | | |
82 | | U_CDECL_END |
83 | | |
84 | | U_NAMESPACE_BEGIN |
85 | | |
86 | | using U_HEADER_ONLY_NAMESPACE::utfStringCodePoints; |
87 | | |
88 | | namespace { |
89 | | |
90 | | // Cache some sets for other services -------------------------------------- *** |
91 | 0 | void U_CALLCONV createUni32Set(UErrorCode &errorCode) { |
92 | 0 | U_ASSERT(uni32Singleton == nullptr); |
93 | 0 | uni32Singleton = new UnicodeSet(UnicodeString(u"[:age=3.2:]"), errorCode); |
94 | 0 | if(uni32Singleton==nullptr) { |
95 | 0 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
96 | 0 | } else { |
97 | 0 | uni32Singleton->freeze(); |
98 | 0 | } |
99 | 0 | ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); |
100 | 0 | } |
101 | | |
102 | | |
103 | | U_CFUNC UnicodeSet * |
104 | 0 | uniset_getUnicode32Instance(UErrorCode &errorCode) { |
105 | 0 | umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode); |
106 | 0 | return uni32Singleton; |
107 | 0 | } |
108 | | |
109 | | // helper functions for matching of pattern syntax pieces ------------------ *** |
110 | | // these functions are parallel to the PERL_OPEN etc. strings above |
111 | | |
112 | | // using these functions is not only faster than UnicodeString::compare() and |
113 | | // caseCompare(), but they also make UnicodeSet work for simple patterns when |
114 | | // no Unicode properties data is available - when caseCompare() fails |
115 | | |
116 | | inline UBool |
117 | 0 | isPerlOpen(const UnicodeString &pattern, int32_t pos) { |
118 | 0 | char16_t c; |
119 | 0 | return pattern.charAt(pos)==u'\\' && ((c=pattern.charAt(pos+1))==u'p' || c==u'P'); |
120 | 0 | } |
121 | | |
122 | | /*static inline UBool |
123 | | isPerlClose(const UnicodeString &pattern, int32_t pos) { |
124 | | return pattern.charAt(pos)==u'}'; |
125 | | }*/ |
126 | | |
127 | | inline UBool |
128 | 0 | isNameOpen(const UnicodeString &pattern, int32_t pos) { |
129 | 0 | return pattern.charAt(pos)==u'\\' && pattern.charAt(pos+1)==u'N'; |
130 | 0 | } |
131 | | |
132 | | inline UBool |
133 | 0 | isPOSIXOpen(const UnicodeString &pattern, int32_t pos) { |
134 | 0 | return pattern.charAt(pos)==u'[' && pattern.charAt(pos+1)==u':'; |
135 | 0 | } |
136 | | |
137 | | /*static inline UBool |
138 | | isPOSIXClose(const UnicodeString &pattern, int32_t pos) { |
139 | | return pattern.charAt(pos)==u':' && pattern.charAt(pos+1)==u']'; |
140 | | }*/ |
141 | | |
142 | | // TODO memory debugging provided inside uniset.cpp |
143 | | // could be made available here but probably obsolete with use of modern |
144 | | // memory leak checker tools |
145 | | #define _dbgct(me) |
146 | | |
147 | | // Returns the character with the given name or name alias, or U_SENTINEL if no such character |
148 | | // exists. |
149 | 11.9k | UChar32 getCharacterByName(const CharString& name) { |
150 | 13.2k | for (const UCharNameChoice nameChoice : std::array{U_EXTENDED_CHAR_NAME, U_CHAR_NAME_ALIAS}) { |
151 | 13.2k | UErrorCode ec = U_ZERO_ERROR; |
152 | 13.2k | UChar32 ch = u_charFromName(nameChoice, name.data(), &ec); |
153 | 13.2k | if (U_SUCCESS(ec)) { |
154 | 10.6k | return ch; |
155 | 10.6k | } |
156 | 13.2k | } |
157 | 1.30k | return U_SENTINEL; |
158 | 11.9k | } |
159 | | |
160 | | } // namespace |
161 | | |
162 | | //---------------------------------------------------------------- |
163 | | // Constructors &c |
164 | | //---------------------------------------------------------------- |
165 | | |
166 | | /** |
167 | | * Constructs a set from the given pattern, optionally ignoring |
168 | | * white space. See the class description for the syntax of the |
169 | | * pattern language. |
170 | | * @param pattern a string specifying what characters are in the set |
171 | | */ |
172 | | UnicodeSet::UnicodeSet(const UnicodeString& pattern, |
173 | 12.2k | UErrorCode& status) { |
174 | 12.2k | applyPattern(pattern, status); |
175 | 12.2k | _dbgct(this); |
176 | 12.2k | } |
177 | | |
178 | | //---------------------------------------------------------------- |
179 | | // Public API |
180 | | //---------------------------------------------------------------- |
181 | | |
182 | | UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, |
183 | 17.7k | UErrorCode& status) { |
184 | | // Equivalent to |
185 | | // return applyPattern(pattern, USET_IGNORE_SPACE, nullptr, status); |
186 | | // but without dependency on closeOver(). |
187 | 17.7k | ParsePosition pos(0); |
188 | 17.7k | applyPatternIgnoreSpace(pattern, pos, nullptr, status); |
189 | 17.7k | if (U_FAILURE(status)) return *this; |
190 | | |
191 | 8.04k | int32_t i = pos.getIndex(); |
192 | | // Skip over trailing whitespace |
193 | 8.04k | ICU_Utility::skipWhitespace(pattern, i, true); |
194 | 8.04k | if (i != pattern.length()) { |
195 | 388 | status = U_ILLEGAL_ARGUMENT_ERROR; |
196 | 388 | } |
197 | 8.04k | return *this; |
198 | 17.7k | } |
199 | | |
200 | | void |
201 | | UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern, |
202 | | ParsePosition& pos, |
203 | | const SymbolTable* symbols, |
204 | 17.7k | UErrorCode& status) { |
205 | 17.7k | if (U_FAILURE(status)) { |
206 | 0 | return; |
207 | 0 | } |
208 | 17.7k | if (isFrozen()) { |
209 | 0 | status = U_NO_WRITE_PERMISSION; |
210 | 0 | return; |
211 | 0 | } |
212 | | // Need to build the pattern in a temporary string because |
213 | | // _applyPattern calls add() etc., which set pat to empty. |
214 | 17.7k | UnicodeString rebuiltPat; |
215 | 17.7k | RuleCharacterIterator chars(pattern, symbols, pos); |
216 | 17.7k | applyPattern(pattern, pos, chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, status); |
217 | 17.7k | if (U_FAILURE(status)) return; |
218 | 8.04k | if (chars.inVariable()) { |
219 | | // syntaxError(chars, "Extra chars in variable value"); |
220 | 0 | status = U_MALFORMED_SET; |
221 | 0 | return; |
222 | 0 | } |
223 | 8.04k | setPattern(rebuiltPat); |
224 | 8.04k | } |
225 | | |
226 | | /** |
227 | | * Return true if the given position, in the given pattern, appears |
228 | | * to be the start of a UnicodeSet pattern. |
229 | | */ |
230 | 0 | UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) { |
231 | 0 | return ((pos+1) < pattern.length() && |
232 | 0 | pattern.charAt(pos) == static_cast<char16_t>(91)/*[*/) || |
233 | 0 | resemblesPropertyPattern(pattern, pos); |
234 | 0 | } |
235 | | |
236 | | //---------------------------------------------------------------- |
237 | | // Implementation: Pattern parsing |
238 | | //---------------------------------------------------------------- |
239 | | |
240 | | #define U_DEBUGGING_UNICODESET_PARSING 0 |
241 | | |
242 | | class UnicodeSet::Lexer { |
243 | | public: |
244 | | Lexer(const UnicodeString &pattern, |
245 | | const ParsePosition &parsePosition, |
246 | | RuleCharacterIterator &chars, |
247 | | uint32_t unicodeSetOptions, |
248 | | const SymbolTable *const symbols, |
249 | | UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute)) |
250 | 98.1k | : pattern_(pattern), parsePosition_(parsePosition), chars_(chars), |
251 | 98.1k | unicodeSetOptions_(unicodeSetOptions), |
252 | 98.1k | charsOptions_(RuleCharacterIterator::PARSE_ESCAPES | |
253 | 98.1k | ((unicodeSetOptions & USET_IGNORE_SPACE) != 0 |
254 | 98.1k | ? RuleCharacterIterator::SKIP_WHITESPACE |
255 | 98.1k | : 0)), |
256 | 98.1k | symbols_(symbols), |
257 | 98.1k | caseClosure_(caseClosure) {} |
258 | | |
259 | | class LexicalElement { |
260 | | public: |
261 | 224M | bool isSetOperator(const char16_t op) const { |
262 | 224M | return U_SUCCESS(errorCode_) && category_ == SET_OPERATOR && string_[0] == op; |
263 | 224M | } |
264 | | |
265 | 33.5M | bool isStringLiteral() const { |
266 | 33.5M | return U_SUCCESS(errorCode_) && category_ == STRING_LITERAL; |
267 | 33.5M | } |
268 | | |
269 | 0 | bool isNamedElement() const { |
270 | 0 | return U_SUCCESS(errorCode_) && category_ == NAMED_ELEMENT; |
271 | 0 | } |
272 | | |
273 | 0 | bool isBracketedElement() const { |
274 | 0 | return U_SUCCESS(errorCode_) && category_ == BRACKETED_ELEMENT; |
275 | 0 | } |
276 | | |
277 | 185k | std::optional<UnicodeString> element() const { |
278 | 185k | if (U_SUCCESS(errorCode_) && |
279 | 185k | (category_ == LITERAL_ELEMENT || category_ == ESCAPED_ELEMENT || |
280 | 185k | category_ == BRACKETED_ELEMENT || category_ == STRING_LITERAL)) { |
281 | 185k | return string_; |
282 | 185k | } |
283 | 0 | return std::nullopt; |
284 | 185k | } |
285 | | |
286 | 66.9M | std::optional<UChar32> codePoint() const { |
287 | 66.9M | if (U_SUCCESS(errorCode_) && (category_ == LITERAL_ELEMENT || category_ == ESCAPED_ELEMENT || |
288 | 66.9M | category_ == BRACKETED_ELEMENT || category_ == NAMED_ELEMENT)) { |
289 | 66.9M | return string_.char32At(0); |
290 | 66.9M | } |
291 | 16.0k | return std::nullopt; |
292 | 66.9M | } |
293 | | |
294 | | // If `*this` is a valid property-query or set-valued-variable, returns the set represented |
295 | | // by this lexical element, which lives at least as long as `*this`. Null otherwise. |
296 | 35.7M | const UnicodeSet *set() const { |
297 | 35.7M | if (U_FAILURE(errorCode_)) { |
298 | 18.3k | return nullptr; |
299 | 18.3k | } |
300 | 35.7M | if (category_ == PROPERTY_QUERY || category_ == VARIABLE) { |
301 | 332k | if (precomputedSet_ != nullptr) { |
302 | 0 | return precomputedSet_; |
303 | 332k | } else { |
304 | 332k | return &set_; |
305 | 332k | } |
306 | 332k | } |
307 | 35.3M | return nullptr; |
308 | 35.7M | } |
309 | | |
310 | 43.5k | const UErrorCode& errorCode() const{ |
311 | 43.5k | return errorCode_; |
312 | 43.5k | } |
313 | | |
314 | | #if U_DEBUGGING_UNICODESET_PARSING |
315 | | UnicodeString debugString() const { |
316 | | UnicodeString result; |
317 | | if (U_FAILURE(errorCode_)) { |
318 | | result.append(u"Ill-formed token (") |
319 | | .append(UnicodeString::fromUTF8(u_errorName(errorCode_))) |
320 | | .append(u"), possibly "); |
321 | | } |
322 | | return result.append(category_names_[category_]) |
323 | | .append(u" '") |
324 | | .append(sourceText_) |
325 | | .append(u"'"); |
326 | | } |
327 | | #endif |
328 | | |
329 | | private: |
330 | | // See https://unicode.org/reports/tr61#Lexical-Elements. |
331 | | enum Category : std::uint8_t { |
332 | | SET_OPERATOR, |
333 | | LITERAL_ELEMENT, |
334 | | ESCAPED_ELEMENT, |
335 | | NAMED_ELEMENT, |
336 | | BRACKETED_ELEMENT, |
337 | | STRING_LITERAL, |
338 | | PROPERTY_QUERY, |
339 | | // Used for ill-formed variables and set-valued variables that are not directly a |
340 | | // property-query, e.g., $basicLatinLetters=[A-Za-z]. Variables that expand to a single |
341 | | // lexical element instead have the category of that lexical element, e.g., $Ll=\p{Ll} has |
342 | | // the category PROPERTY_QUERY, $a=a has the category LITERAL_ELEMENT, and $s={Zeichenkette} |
343 | | // has the category STRING_LITERAL. |
344 | | VARIABLE, |
345 | | END_OF_TEXT, |
346 | | }; |
347 | | static constexpr std::array<std::u16string_view, END_OF_TEXT + 1> category_names_{{ |
348 | | u"set-operator", |
349 | | u"literal-element", |
350 | | u"escaped-element", |
351 | | u"named-element", |
352 | | u"bracketed-element", |
353 | | u"string-literal", |
354 | | u"property-query", |
355 | | u"variable", |
356 | | u"(end of text)", |
357 | | }}; |
358 | | LexicalElement(Category category, UnicodeString string, RuleCharacterIterator::Pos after, |
359 | | UErrorCode errorCode, const UnicodeSet *precomputedSet, UnicodeSet set, |
360 | | std::u16string_view sourceText) |
361 | 38.9M | : category_(category), string_(std::move(string)), after_(after), errorCode_(errorCode), |
362 | 38.9M | precomputedSet_(precomputedSet), set_(set), sourceText_(sourceText) {} |
363 | | Category category_; |
364 | | UnicodeString string_; |
365 | | RuleCharacterIterator::Pos after_; |
366 | | UErrorCode errorCode_; |
367 | | const UnicodeSet *precomputedSet_; |
368 | | UnicodeSet set_; |
369 | | std::u16string_view sourceText_; |
370 | | |
371 | | friend class Lexer; |
372 | | }; |
373 | | |
374 | 0 | UnicodeString getPositionForDebugging() const { |
375 | 0 | return pattern_.tempSubString(0, parsePosition_.getIndex()) + u"☞" + |
376 | 0 | pattern_.tempSubString(parsePosition_.getIndex(), 60); |
377 | 0 | } |
378 | | |
379 | 46.2M | bool acceptSetOperator(char16_t op) { |
380 | 46.2M | if (lookahead().isSetOperator(op)) { |
381 | 5.16M | advance(); |
382 | 5.16M | return true; |
383 | 5.16M | } |
384 | 41.1M | return false; |
385 | 46.2M | } |
386 | | |
387 | 400M | const LexicalElement &lookahead() { |
388 | 400M | if (!ahead_.has_value()) { |
389 | 38.8M | const RuleCharacterIterator::Pos before = getPos(); |
390 | 38.8M | ahead_.emplace(nextToken()); |
391 | 38.8M | chars_.setPos(before); |
392 | 38.8M | } |
393 | 400M | return *ahead_; |
394 | 400M | } |
395 | | |
396 | 60.9k | const LexicalElement &lookahead2() { |
397 | 60.9k | if (!ahead2_.has_value()) { |
398 | | // Note that if someone has called `getCharacterIterator` and played with the result, |
399 | | // `before` may not actually be before `ahead_`, but we do not actually depend on this here, |
400 | | // since we start from ahead_.after_. |
401 | 60.9k | const RuleCharacterIterator::Pos before = getPos(); |
402 | 60.9k | chars_.setPos(lookahead().after_); |
403 | 60.9k | ahead2_.emplace(nextToken()); |
404 | 60.9k | chars_.setPos(before); |
405 | 60.9k | } |
406 | 60.9k | return *ahead2_; |
407 | 60.9k | } |
408 | | |
409 | | // For use in older functions that take the `RuleCharacterIterator` directly. |
410 | | // Any advancement of the resulting `RuleCharacterIterator` has no effect on the result of subsequent |
411 | | // calls to `lookahead`, `lookahead2`, `advance`, or `acceptSetOperator`. |
412 | | // Once `advance` or `acceptSetOperator` has been called, the result of a call to |
413 | | // `getCharacterIterator` preceding the call to `advance` or `acceptSetOperator` must no longer be |
414 | | // used. |
415 | 0 | RuleCharacterIterator &getCharacterIterator() { |
416 | 0 | // Make sure we compute a correct `ahead_.after_` so we do not depend on the current value of |
417 | 0 | // `getPos()` for lexing. |
418 | 0 | lookahead(); |
419 | 0 | return chars_; |
420 | 0 | } |
421 | | |
422 | 0 | int32_t charsOptions() { |
423 | 0 | return charsOptions_; |
424 | 0 | } |
425 | | |
426 | 37.1M | bool atEnd() const { |
427 | 37.1M | return chars_.atEnd(); |
428 | 37.1M | } |
429 | | |
430 | 38.8M | void advance() { |
431 | | // If someone called `getCharacterIterator`, we are now changing the character iterator under |
432 | | // their feet; further, we may not have an `ahead_`, so if they keep playing with it we would be |
433 | | // working on incorrect values of `getPos`. This is why the result of `getCharacterIterator` |
434 | | // must no longer be used. |
435 | 38.8M | chars_.setPos(lookahead().after_); |
436 | 38.8M | ahead_ = ahead2_; |
437 | 38.8M | ahead2_.reset(); |
438 | 38.8M | } |
439 | | |
440 | | private: |
441 | | // A version of getPos that returns its position instead of taking it as at out parameter, so we |
442 | | // can have const positions. |
443 | 145M | RuleCharacterIterator::Pos getPos() const { |
444 | 145M | RuleCharacterIterator::Pos result; |
445 | 145M | chars_.getPos(result); |
446 | 145M | return result; |
447 | 145M | } |
448 | | |
// Lexes one lexical element starting at the current position of `chars_` and returns it.
// `chars_` is left wherever lexing stopped; that position is also stored in the returned element
// (third constructor argument). Errors are not reported separately: the error code is stored in
// the returned element, whose category is then only the category that was being attempted.
// The source text stored in the element is the slice of `pattern_` from the parse index at entry
// to the parse index when the element is built.
449 | 38.9M | LexicalElement nextToken() { |
450 | 38.9M | UErrorCode errorCode = U_ZERO_ERROR; |
451 | 38.9M | chars_.skipIgnored(charsOptions_); |
452 | 38.9M | if (chars_.atEnd()) { |
453 | 5.47k | return LexicalElement(LexicalElement::END_OF_TEXT, {}, getPos(), errorCode, |
454 | 5.47k | /*precomputedSet=*/nullptr, |
455 | 5.47k | /*set=*/{}, |
456 | 5.47k | u""); |
457 | 5.47k | } |
458 | 38.9M | const int32_t start = parsePosition_.getIndex(); |
459 | 38.9M | const RuleCharacterIterator::Pos before = getPos(); |
460 | | // First try to get the next character without parsing escapes. |
461 | 38.9M | UBool unusedEscaped; |
462 | 38.9M | const UChar32 first = |
463 | 38.9M | chars_.next(charsOptions_ & ~RuleCharacterIterator::PARSE_ESCAPES, unusedEscaped, errorCode); |
464 | 38.9M | if (first == u'[' || first == u'\\') { |
465 | 2.14M | const RuleCharacterIterator::Pos afterFirst = getPos(); |
466 | | // This could be a property-query or named-element. |
// The second character is read with white-space skipping turned off as well, so only an
// immediately adjacent `:`, `p`, `P`, or `N` is recognized here.
467 | 2.14M | const UChar32 second = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | |
468 | 2.14M | RuleCharacterIterator::SKIP_WHITESPACE), |
469 | 2.14M | unusedEscaped, errorCode); |
470 | 2.14M | if ((first == u'[' && second == u':') || |
471 | 2.12M | (first == u'\\' && (second == u'p' || second == u'P' || second == u'N'))) { |
472 | 135k | if (second == u'N') { |
473 | 8.56k | UChar32 const queryResult = scanNamedElementBrackets(errorCode); |
474 | 8.56k | return LexicalElement( |
475 | 8.56k | LexicalElement::NAMED_ELEMENT, UnicodeString(queryResult), getPos(), errorCode, |
476 | 8.56k | /*precomputedSet=*/nullptr, |
477 | 8.56k | /*set=*/{}, |
478 | 8.56k | std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start)); |
479 | 127k | } else { |
480 | 127k | UnicodeSet queryResult = scanPropertyQueryAfterStart(first, second, start, errorCode); |
481 | 127k | return LexicalElement( |
482 | 127k | LexicalElement::PROPERTY_QUERY, {}, getPos(), errorCode, |
483 | 127k | /*precomputedSet=*/nullptr, /*set=*/std::move(queryResult), |
484 | 127k | std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start)); |
485 | 127k | } |
486 | 135k | } |
487 | | // Not a property-query. |
// Rewind to just after `first`, so that the code below sees a plain `[` or backslash.
488 | 2.00M | chars_.setPos(afterFirst); |
489 | 2.00M | } |
// Variables are only looked up when a symbol table was supplied; otherwise `$` falls through
// to the switch below and becomes a set-operator.
490 | 38.7M | if (first == u'$' && symbols_ != nullptr) { |
491 | 0 | auto nameEnd = parsePosition_; |
492 | | // The SymbolTable defines the lexing of variable names past the $. |
493 | 0 | if (UnicodeString name = symbols_->parseReference(pattern_, nameEnd, pattern_.length()); |
494 | 0 | !name.isEmpty()) { |
495 | 0 | chars_.jumpahead(nameEnd.getIndex() - (start + 1)); |
496 | 0 | const std::u16string_view source = |
497 | 0 | std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start); |
498 | 0 | const UnicodeSet *precomputedSet = symbols_->lookupSet(name); |
499 | 0 | if (precomputedSet != nullptr) { |
500 | 0 | return LexicalElement(LexicalElement::VARIABLE, {}, getPos(), U_ZERO_ERROR, |
501 | 0 | precomputedSet, /*set=*/{}, source); |
502 | 0 | } |
503 | | // The variable was not a precomputed set. Use the old-fashioned `lookup`, which |
504 | | // should give us its source text; if that parses as a single set or element, use |
505 | | // it. Note that variables are not allowed in that expansion. |
506 | | // Implementers of higher-level syntaxes that pre-parse UnicodeSet-valued variables |
507 | | // can use variables in their variable definitions, but those that simply use the |
508 | | // source text substitution API cannot. |
509 | 0 | const UnicodeString *const expression = symbols_->lookup(name); |
510 | 0 | if (expression == nullptr) { |
511 | 0 | return LexicalElement( |
512 | 0 | LexicalElement::VARIABLE, {}, getPos(), U_UNDEFINED_VARIABLE, |
513 | 0 | /*precomputedSet=*/nullptr, |
514 | 0 | /*set=*/{}, |
515 | 0 | source); |
516 | 0 | } |
517 | 0 | return evaluateVariable(*expression, source); |
518 | 0 | } |
519 | 0 | } |
520 | 38.7M | switch (first) { |
521 | 1.83M | case u'[': |
522 | 1.83M | return LexicalElement( |
523 | 1.83M | LexicalElement::SET_OPERATOR, UnicodeString(u'['), getPos(), errorCode, |
524 | 1.83M | /*precomputedSet=*/nullptr, |
525 | 1.83M | /*set=*/{}, |
526 | 1.83M | std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start)); |
527 | 171k | case u'\\': { |
528 | | // Now try to parse the escape. |
// Rewind to before the backslash and read it again, this time with escape parsing enabled.
529 | 171k | chars_.setPos(before); |
530 | 171k | UChar32 codePoint = chars_.next(charsOptions_, unusedEscaped, errorCode); |
531 | 171k | return LexicalElement( |
532 | 171k | LexicalElement::ESCAPED_ELEMENT, |
533 | 171k | UnicodeString(codePoint), getPos(), errorCode, |
534 | 171k | nullptr, |
535 | 171k | /*set=*/{}, |
536 | 171k | std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start)); |
537 | 0 | } |
538 | 6.61k | case u'&': |
539 | 57.5k | case u'-': |
540 | 1.83M | case u']': |
541 | 3.37M | case u'^': |
542 | 3.38M | case u'$': |
543 | | // We make $ a set-operator to handle the ICU extensions involving $. |
544 | 3.38M | return LexicalElement( |
545 | 3.38M | LexicalElement::SET_OPERATOR, UnicodeString(first), getPos(), errorCode, |
546 | 3.38M | /*precomputedSet=*/nullptr, |
547 | 3.38M | /*set=*/{}, |
548 | 3.38M | std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start)); |
// `{` opens a string-literal or bracketed-element: code points are accumulated until an
// unescaped `}`. Exactly one code point yields a bracketed-element, any other count a
// string-literal.
549 | 98.0k | case u'{': { |
550 | 98.0k | UnicodeString string; |
551 | 98.0k | UBool escaped; |
552 | 98.0k | UChar32 next; |
553 | 98.0k | int32_t codePointCount = 0; |
554 | 14.7M | while (!chars_.atEnd() && U_SUCCESS(errorCode)) { |
555 | 14.7M | const RuleCharacterIterator::Pos beforeNext = getPos(); |
556 | 14.7M | next = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | |
557 | 14.7M | RuleCharacterIterator::SKIP_WHITESPACE), |
558 | 14.7M | unusedEscaped, errorCode); |
559 | 14.7M | if (next == u'\\') { |
560 | 9.42k | const UChar32 afterBackslash = |
561 | 9.42k | chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | |
562 | 9.42k | RuleCharacterIterator::SKIP_WHITESPACE), |
563 | 9.42k | unusedEscaped, errorCode); |
564 | 9.42k | if (afterBackslash == u'N') { |
565 | 816 | next = scanNamedElementBrackets(errorCode); |
566 | 816 | escaped = true; |
567 | 8.60k | } else if (afterBackslash == u'p' || afterBackslash == u'P') { |
// A property-query inside a string literal is rejected outright.
568 | 70 | return LexicalElement(LexicalElement::STRING_LITERAL, {}, getPos(), |
569 | 70 | U_MALFORMED_SET, |
570 | 70 | /*precomputedSet=*/nullptr, |
571 | 70 | /*set=*/{}, |
572 | 70 | std::u16string_view(pattern_).substr( |
573 | 70 | start, parsePosition_.getIndex() - start)); |
574 | 8.53k | } else { |
575 | 8.53k | chars_.setPos(beforeNext); |
576 | | // Parse the escape. |
577 | 8.53k | next = chars_.next(charsOptions_, escaped, errorCode); |
578 | 8.53k | } |
579 | 14.7M | } else { |
580 | 14.7M | #if U_ICU_VERSION_MAJOR_NUM < 81 |
581 | 14.7M | if (U_SUCCESS(errorCode) && PatternProps::isWhiteSpace(next)) { |
582 | | // Transitional prohibition of unescaped spaces in string literals (in |
583 | | // ICU 78 and earlier, these were ignored; in ICU 81 they will mean |
584 | | // themselves). |
585 | 138 | errorCode = UErrorCode::U_ILLEGAL_ARGUMENT_ERROR; |
586 | 138 | } |
587 | | #else |
588 | | #error Remove this transitional check, see ICU-23307 and ICU-TC minutes of 2026-01-16. |
589 | | #endif |
590 | 14.7M | escaped = false; |
591 | 14.7M | } |
// Only an unescaped `}` terminates the literal; an escaped one is appended like any other
// code point.
592 | 14.7M | if (!escaped && next == u'}') { |
593 | 97.3k | return LexicalElement( |
594 | 97.3k | codePointCount == 1 ? LexicalElement::BRACKETED_ELEMENT |
595 | 97.3k | : LexicalElement::STRING_LITERAL, |
596 | 97.3k | std::move(string), getPos(), errorCode, |
597 | 97.3k | /*precomputedSet=*/nullptr, |
598 | 97.3k | /*set=*/{}, |
599 | 97.3k | std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start)); |
600 | 97.3k | } |
601 | 14.6M | string.append(next); |
602 | 14.6M | codePointCount += 1; |
603 | 14.6M | } |
// Reached when the loop ended without a closing `}`: end of text or an error.
604 | 570 | return LexicalElement( |
605 | 570 | LexicalElement::STRING_LITERAL, {}, getPos(), U_MALFORMED_SET, |
606 | 570 | /*precomputedSet=*/nullptr, |
607 | 570 | /*set=*/{}, |
608 | 570 | std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start)); |
609 | 98.0k | } |
610 | 33.2M | default: |
611 | 33.2M | return LexicalElement( |
612 | 33.2M | LexicalElement::LITERAL_ELEMENT, UnicodeString(first), getPos(), errorCode, nullptr, |
613 | 33.2M | /*set=*/{}, |
614 | 33.2M | std::u16string_view(pattern_).substr(start, parsePosition_.getIndex() - start)); |
615 | 38.7M | } |
616 | 38.7M | } |
617 | | |
// Scans the braced part of a named-element; both callers invoke it right after reading the `N`
// that follows a backslash. Expects `{`, then up to two `:`-terminated prefixes (first a hex
// code point, then a single literal character), then a character name, then `}`.
// Returns the code point found by getCharacterByName() for the name, provided that it agrees with
// every prefix that was given. On any failure returns 0 (`{}`) with `errorCode` set to a failure
// code: U_ILLEGAL_ARGUMENT_ERROR, unless the loop stopped on a failure that came from
// `chars_.next`, in which case that code is left in place.
618 | 11.5k | UChar32 scanNamedElementBrackets(UErrorCode &errorCode) { |
619 | 11.5k | UBool unusedEscaped; |
620 | 11.5k | const UChar32 open = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | |
621 | 11.5k | RuleCharacterIterator::SKIP_WHITESPACE), |
622 | 11.5k | unusedEscaped, errorCode); |
623 | 11.5k | if (open == u'{') { |
// `start` is the parse index at which the current segment (prefix or name) begins.
624 | 11.4k | int32_t start = parsePosition_.getIndex(); |
625 | 11.4k | std::optional<UChar32> hex; |
626 | 11.4k | std::optional<UChar32> literal; |
627 | 1.61M | while (!chars_.atEnd() && U_SUCCESS(errorCode)) { |
628 | 1.61M | UChar32 last = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | |
629 | 1.61M | RuleCharacterIterator::SKIP_WHITESPACE), |
630 | 1.61M | unusedEscaped, errorCode); |
631 | 1.61M | if (last == u':') { |
// The first `:` ends the hex prefix, the second ends the literal prefix, and a third is an
// error. The `- 1` in the substr lengths excludes the `:` that was just consumed.
632 | 707 | if (!hex.has_value()) { |
// NOTE(review): emplace() value-initializes to 0, so an empty hex prefix (a `:` directly
// after the `{`) is taken as code point 0 rather than rejected. Confirm that this is intended.
633 | 499 | hex.emplace(); |
634 | 499 | for (char16_t digit : std::u16string_view(pattern_).substr( |
635 | 996 | start, parsePosition_.getIndex() - 1 - start)) { |
636 | 996 | uint8_t nibble; |
637 | 996 | if (digit >= u'0' && digit <= u'9') { |
638 | 559 | nibble = digit - '0'; |
639 | 559 | } else { |
// Clearing bit 0x20 maps a-f onto A-F, so one range check covers both cases.
640 | 437 | digit = digit & ~0x20; |
641 | 437 | if (digit >= u'A' && digit <= u'F') { |
642 | 318 | nibble = digit - u'A' + 0xA; |
643 | 318 | } else { |
644 | 119 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
645 | 119 | return {}; |
646 | 119 | } |
647 | 437 | } |
648 | 877 | *hex = (*hex << 4) + nibble; |
// The range is checked after every digit, so the accumulated value never exceeds
// (0x10FFFF << 4) + 0xF before the scan is abandoned.
649 | 877 | if (hex > 0x10FFFF) { |
650 | 21 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
651 | 21 | return {}; |
652 | 21 | } |
653 | 877 | } |
654 | 499 | } else if (!literal.has_value()) { |
// The literal prefix must be exactly one well-formed code point: an empty segment, an
// ill-formed sequence, or more than one code point is an error.
655 | 201 | const auto literalCodePoints = utfStringCodePoints<UChar32, UTF_BEHAVIOR_FFFD>( |
656 | 201 | std::u16string_view(pattern_).substr(start, |
657 | 201 | parsePosition_.getIndex() - 1 - start)); |
658 | 201 | auto it = literalCodePoints.begin(); |
659 | 201 | if (it == literalCodePoints.end() || !it->wellFormed() || |
660 | 161 | (literal = it->codePoint(), ++it) != literalCodePoints.end()) { |
661 | 161 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
662 | 161 | return {}; |
663 | 161 | } |
664 | 201 | } else { |
665 | 7 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
666 | 7 | return {}; |
667 | 7 | } |
668 | 399 | start = parsePosition_.getIndex(); |
669 | 1.61M | } else if (last == u'}') { |
// Everything since the last `:` (or since the `{`) up to this `}` is the character name.
670 | 10.9k | const std::u16string_view u16name = std::u16string_view(pattern_).substr( |
671 | 10.9k | start, parsePosition_.getIndex() - 1 - start); |
672 | 10.9k | const UChar32 result = getCharacterByName(CharString().appendInvariantChars( |
673 | 10.9k | u16name.data(), static_cast<int32_t>(u16name.length()), errorCode)); |
674 | 10.9k | if (!U_SUCCESS(errorCode)) { |
675 | | // Convert U_INVARIANT_CONVERSION_ERROR to U_ILLEGAL_ARGUMENT_ERROR. |
676 | 98 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
677 | 98 | return {}; |
678 | 98 | } |
// Reject an unknown name (negative result) and any disagreement between the named character
// and a prefix that was supplied.
679 | 10.8k | if (result < 0 || (hex.has_value() && result != hex) || |
680 | 9.70k | (literal.has_value() && result != literal)) { |
681 | 1.11k | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
682 | 1.11k | return {}; |
683 | 1.11k | } |
684 | 9.70k | return result; |
685 | 10.8k | } |
686 | 1.61M | } |
687 | 11.4k | } |
// Reached when there was no `{`, or the text ended or an error occurred before the `}`.
688 | 337 | if (U_SUCCESS(errorCode)) { |
689 | 337 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
690 | 337 | } |
691 | 337 | return {}; |
692 | 11.5k | } |
693 | | |
694 | 0 | LexicalElement evaluateVariable(const UnicodeString &expression, const std::u16string_view source) { |
695 | 0 | UErrorCode errorCode = U_ZERO_ERROR; |
696 | 0 | ParsePosition expressionPosition; |
697 | 0 | RuleCharacterIterator expressionIterator(expression, symbols_, expressionPosition); |
698 | | // Do not pass the symbols: we do not support recursive expansion of variables. |
699 | 0 | Lexer expressionLexer(expression, expressionPosition, expressionIterator, unicodeSetOptions_, |
700 | 0 | /*symbols=*/nullptr, caseClosure_); |
701 | 0 | auto variableToken = expressionLexer.lookahead(); |
702 | 0 | if (variableToken.isSetOperator(u'[')) { |
703 | 0 | UnicodeString rebuiltPattern; |
704 | 0 | UnicodeSet expressionValue; |
705 | 0 | expressionValue.parseUnicodeSet(expressionLexer, rebuiltPattern, unicodeSetOptions_, |
706 | 0 | caseClosure_, /*depth=*/0, errorCode); |
707 | 0 | expressionValue.setPattern(rebuiltPattern); |
708 | 0 | if (!expressionLexer.atEnd()) { |
709 | 0 | return LexicalElement( |
710 | 0 | LexicalElement::VARIABLE, {}, getPos(), U_MALFORMED_VARIABLE_DEFINITION, |
711 | 0 | /*precomputedSet=*/nullptr, |
712 | 0 | /*set=*/{}, |
713 | 0 | source); |
714 | 0 | } |
715 | 0 | return LexicalElement( |
716 | 0 | LexicalElement::VARIABLE, {}, getPos(), errorCode, |
717 | 0 | /*precomputedSet=*/nullptr, |
718 | 0 | /*set=*/std::move(expressionValue), |
719 | 0 | source); |
720 | 0 | } else { |
721 | 0 | expressionLexer.advance(); |
722 | 0 | if (!expressionLexer.atEnd()) { |
723 | 0 | return LexicalElement( |
724 | 0 | LexicalElement::VARIABLE, {}, getPos(), U_MALFORMED_VARIABLE_DEFINITION, |
725 | 0 | /*precomputedSet=*/nullptr, |
726 | 0 | /*set=*/{}, |
727 | 0 | source); |
728 | 0 | } |
729 | 0 | switch (variableToken.category_) { |
730 | 0 | case LexicalElement::LITERAL_ELEMENT: |
731 | 0 | case LexicalElement::ESCAPED_ELEMENT: |
732 | 0 | case LexicalElement::NAMED_ELEMENT: |
733 | 0 | case LexicalElement::BRACKETED_ELEMENT: |
734 | 0 | case LexicalElement::STRING_LITERAL: |
735 | 0 | case LexicalElement::PROPERTY_QUERY: |
736 | | // Return the same lexical element that we found while parsing the variable contents, |
737 | | // except the source position corresponds to the position of the variable rather than 0 |
738 | | // in its expansion, and the source is the name of the variable rather than its |
739 | | // expansion. |
740 | 0 | return LexicalElement( |
741 | 0 | variableToken.category_, std::move(variableToken.string_), getPos(), |
742 | 0 | variableToken.errorCode_, variableToken.precomputedSet_, std::move(variableToken.set_), source); |
743 | 0 | default: |
744 | 0 | return LexicalElement(LexicalElement::VARIABLE, {}, getPos(), |
745 | 0 | U_MALFORMED_VARIABLE_DEFINITION, |
746 | 0 | /*precomputedSet=*/nullptr, |
747 | 0 | /*set=*/{}, source); |
748 | 0 | } |
749 | 0 | } |
750 | 0 | } |
751 | | |
752 | 127k | UnicodeSet scanPropertyQueryAfterStart(UChar32 first, UChar32 second, int32_t queryStart, UErrorCode &errorCode) { |
753 | 127k | std::optional<int32_t> queryOperatorPosition; |
754 | 127k | int32_t queryExpressionStart = parsePosition_.getIndex(); |
755 | 127k | bool exteriorlyNegated = false; |
756 | 127k | bool interiorlyNegated = false; |
757 | 127k | UBool unusedEscaped; |
758 | | // Do not skip whitespace so we can recognize unspaced :]. Lex escapes and |
759 | | // named-element: while ICU does not support string-valued properties and thus has no |
760 | | // use for escapes, we still want to lex through escapes to allow downstream |
761 | | // implementations (mostly unicodetools) to implement string-valued properties. |
762 | 127k | const UChar32 third = chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | |
763 | 127k | RuleCharacterIterator::SKIP_WHITESPACE), |
764 | 127k | unusedEscaped, errorCode); |
765 | 127k | if (first == u'\\') { |
766 | 110k | if (third != u'{') { |
767 | 134 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
768 | 134 | return {}; |
769 | 134 | } |
770 | 110k | exteriorlyNegated = second == u'P'; |
771 | 110k | queryExpressionStart = parsePosition_.getIndex(); |
772 | 110k | } else { |
773 | 16.7k | if (third == u'^') { |
774 | 286 | exteriorlyNegated = true; |
775 | 286 | queryExpressionStart = parsePosition_.getIndex(); |
776 | 286 | } |
777 | 16.7k | } |
778 | 126k | RuleCharacterIterator::Pos beforePenultimate = getPos(); |
779 | 126k | UChar32 penultimateUnescaped = |
780 | 126k | chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | |
781 | 126k | RuleCharacterIterator::SKIP_WHITESPACE), |
782 | 126k | unusedEscaped, errorCode); |
783 | | |
784 | 11.3M | while (!chars_.atEnd() && U_SUCCESS(errorCode)) { |
785 | 11.3M | const RuleCharacterIterator::Pos beforeLast = getPos(); |
786 | 11.3M | UChar32 lastUnescaped = |
787 | 11.3M | chars_.next(charsOptions_ & ~(RuleCharacterIterator::PARSE_ESCAPES | |
788 | 11.3M | RuleCharacterIterator::SKIP_WHITESPACE), |
789 | 11.3M | unusedEscaped, errorCode); |
790 | 11.3M | if (penultimateUnescaped == u'\\') { |
791 | 20.2k | if (lastUnescaped == 'N') { |
792 | 2.18k | scanNamedElementBrackets(errorCode); |
793 | 2.18k | if (!U_SUCCESS(errorCode)) { |
794 | 334 | return {}; |
795 | 334 | } |
796 | 18.0k | } else { |
797 | | // There must be an escaped-element starting at beforePenultimate. Go |
798 | | // back there and advance through it. |
799 | 18.0k | chars_.setPos(beforePenultimate); |
800 | 18.0k | chars_.next(charsOptions_ & ~RuleCharacterIterator::SKIP_WHITESPACE, unusedEscaped, |
801 | 18.0k | errorCode); |
802 | 18.0k | } |
803 | | // Neither a named-element nor an escaped-element can be part of a closing :]. |
804 | 19.8k | lastUnescaped = -1; |
805 | 11.3M | } else if (!queryOperatorPosition.has_value() && lastUnescaped == u'=') { |
806 | 69.0k | queryOperatorPosition = parsePosition_.getIndex() - 1; |
807 | 11.3M | } else if (!queryOperatorPosition.has_value() && lastUnescaped == u'≠') { |
808 | 20 | if (exteriorlyNegated) { |
809 | | // Reject doubly negated property queries. |
810 | 3 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
811 | 3 | return {}; |
812 | 3 | } |
813 | 17 | interiorlyNegated = true; |
814 | 17 | queryOperatorPosition = parsePosition_.getIndex() - 1; |
815 | 11.3M | } else if ((first == u'[' && penultimateUnescaped == u':' && lastUnescaped == u']') || |
816 | 11.2M | (first == u'\\' && lastUnescaped == u'}')) { |
817 | | // Note that no unescaping is performed here, as ICU does not support string-valued or |
818 | | // or miscellaneous properties. |
819 | 126k | const int32_t queryExpressionLimit = |
820 | 126k | first == u'[' ? parsePosition_.getIndex() - 2 : parsePosition_.getIndex() - 1; |
821 | | // Contrary to Java, applyPropertyAlias does not support a null property-predicate in |
822 | | // C++; instead "" indicates the absence of a property-predicate. This is OK with the |
823 | | // properties supported by ICU, but not with string-valued or miscellaneous properties; |
824 | | // see https://github.com/unicode-org/icu/pull/3456. |
825 | 126k | UnicodeString propertyPredicate; |
826 | 126k | if (queryOperatorPosition.has_value()) { |
827 | 68.9k | propertyPredicate = |
828 | 68.9k | pattern_.tempSubStringBetween(*queryOperatorPosition + 1, queryExpressionLimit); |
829 | 68.9k | if (propertyPredicate.isEmpty()) { |
830 | | // \p{X=} is valid if X is a string-valued or miscellaneous property, but |
831 | | // ICU does not support those. Thus, it is invalid for ICU purposes, and |
832 | | // passing an empty propertyPredicate to applyPropertyAlias can be valid |
833 | | // (this is how we represent \p{X}), so we need to return the error here. |
834 | 298 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
835 | 298 | return {}; |
836 | 298 | } |
837 | 68.9k | } |
838 | 125k | UnicodeSet result; |
839 | 125k | result.applyPropertyAlias( |
840 | 125k | pattern_.tempSubStringBetween(queryExpressionStart, |
841 | 125k | queryOperatorPosition.value_or(queryExpressionLimit)), |
842 | 125k | propertyPredicate, errorCode); |
843 | 125k | if (exteriorlyNegated != interiorlyNegated) { |
844 | 21.5k | result.complement().removeAllStrings(); |
845 | 21.5k | } |
846 | 125k | result.setPattern(pattern_.tempSubStringBetween(queryStart, parsePosition_.getIndex())); |
847 | 125k | return result; |
848 | 126k | } |
849 | 11.2M | beforePenultimate = beforeLast; |
850 | 11.2M | penultimateUnescaped = lastUnescaped; |
851 | 11.2M | } |
852 | 510 | errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
853 | 510 | return {}; |
854 | 126k | } |
855 | | |
856 | | const UnicodeString &pattern_; |
857 | | const ParsePosition &parsePosition_; |
858 | | RuleCharacterIterator &chars_; |
859 | | const uint32_t unicodeSetOptions_; |
860 | | const int32_t charsOptions_; |
861 | | const SymbolTable *const symbols_; |
862 | | UnicodeSet &(UnicodeSet::* const caseClosure_)(int32_t attribute); |
863 | | std::optional<LexicalElement> ahead_; |
864 | | std::optional<LexicalElement> ahead2_; |
865 | | }; |
866 | | |
namespace {

// Deepest nesting accepted by parseUnicodeSet, which reports a parse error beyond it.
constexpr int32_t MAX_DEPTH = 100;

// Early-exit helpers for the parse* functions. Both macros return from the enclosing (void)
// function. When U_DEBUGGING_UNICODESET_PARSING is set they additionally print a trace; those
// variants check at compile time that the enclosing function name starts with "parse", and
// U_UNICODESET_RETURN_IF_ERROR then also needs a variable named depth in scope.
// In the non-debugging variant, the expected and actual arguments of
// U_UNICODESET_RETURN_WITH_PARSE_ERROR are not expanded at all.

#if U_DEBUGGING_UNICODESET_PARSING

#define U_UNICODESET_RETURN_IF_ERROR(ec)                                                   \
    do {                                                                                   \
        constexpr std::string_view functionName = __func__;                                \
        static_assert(functionName.substr(0, 5) == "parse");                               \
        if (U_FAILURE(ec)) {                                                               \
            if (depth < 5) {                                                               \
                printf("--- in %s l. %d\n", __func__ + 5, __LINE__);                       \
            } else if (depth == 5 && std::string_view(__func__ + 5) == "UnicodeSet") {     \
                printf("--- [...]\n");                                                     \
            }                                                                              \
            return;                                                                        \
        }                                                                                  \
    } while (false)
#define U_UNICODESET_RETURN_WITH_PARSE_ERROR(expected, actual, lexer, ec)                  \
    do {                                                                                   \
        constexpr std::string_view functionName = __func__;                                \
        static_assert(functionName.substr(0, 5) == "parse");                               \
        std::string actualUTF8;                                                            \
        std::string contextUTF8;                                                           \
        printf("*** Expected %s, got %s %s\n", (expected),                                 \
               UnicodeString(actual).toUTF8String(actualUTF8).c_str(),                     \
               lexer.getPositionForDebugging().toUTF8String(contextUTF8).c_str());         \
        printf("--- in %s l. %d\n", __func__ + 5, __LINE__);                               \
        (ec) = U_FAILURE(lexer.lookahead().errorCode()) ? lexer.lookahead().errorCode()    \
                                                        : U_MALFORMED_SET;                 \
        return;                                                                            \
    } while (false)

#else

#define U_UNICODESET_RETURN_IF_ERROR(ec) \
    do {                                 \
        if (U_FAILURE(ec)) return;       \
    } while (false)
// Prefer the error carried by the lookahead token; otherwise report a malformed set.
#define U_UNICODESET_RETURN_WITH_PARSE_ERROR(expected, actual, lexer, ec)                  \
    do {                                                                                   \
        (ec) = U_FAILURE(lexer.lookahead().errorCode()) ? lexer.lookahead().errorCode()    \
                                                        : U_MALFORMED_SET;                 \
        return;                                                                            \
    } while (false)

#endif

}  // namespace
925 | | |
926 | | /** |
927 | | * Parse the pattern from the given RuleCharacterIterator. The |
928 | | * iterator is advanced over the parsed pattern. |
929 | | * @param pattern The pattern, only used by debug traces. |
930 | | * @param parsePosition The ParsePosition underlying chars, only used by debug traces. |
931 | | * @param chars iterator over the pattern characters. Upon return |
932 | | * it will be advanced to the first character after the parsed |
933 | | * pattern, or the end of the iteration if all characters are |
934 | | * parsed. |
935 | | * @param symbols symbol table to use to parse and dereference |
936 | | * variables, or null if none. |
937 | | * @param rebuiltPat the pattern that was parsed, rebuilt or |
938 | | * copied from the input pattern, as appropriate. |
939 | | * @param options a bit mask of zero or more of the following: |
940 | | * IGNORE_SPACE, CASE. |
941 | | */ |
942 | | |
943 | | void UnicodeSet::applyPattern(const UnicodeString &pattern, |
944 | | const ParsePosition &parsePosition, |
945 | | RuleCharacterIterator &chars, |
946 | | const SymbolTable *symbols, |
947 | | UnicodeString &rebuiltPat, |
948 | | uint32_t options, |
949 | | UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), |
950 | 98.1k | UErrorCode &ec) { |
951 | 98.1k | if (U_FAILURE(ec)) return; |
952 | 98.1k | Lexer lexer(pattern, parsePosition, chars, options, symbols, caseClosure); |
953 | 98.1k | parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, /*depth=*/0, ec); |
954 | 98.1k | } |
955 | | |
956 | | void UnicodeSet::parseUnicodeSet(Lexer &lexer, |
957 | | UnicodeString& rebuiltPat, |
958 | | uint32_t options, |
959 | | UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), |
960 | | int32_t depth, |
961 | 1.94M | UErrorCode &ec) { |
962 | 1.94M | clear(); |
963 | | |
964 | 1.94M | if (depth > MAX_DEPTH) { |
965 | 37 | U_UNICODESET_RETURN_WITH_PARSE_ERROR(("depth <= " + std::to_string(MAX_DEPTH)).c_str(), |
966 | 37 | ("depth = " + std::to_string(depth)).c_str(), lexer, ec); |
967 | 37 | } |
968 | | |
969 | 1.94M | bool isComplement = false; |
970 | | // Whether to keep the syntax of the pattern at this level, only doing basic pretty-printing, e.g., |
971 | | // turn [ c - z[a]a - b ] into [c-z[a]a-b], but not into [a-z]. |
972 | | // This is true for a property query, or when there is a nested set. Note that since we recurse, |
973 | | // innermost sets consisting only of ranges will get simplified. |
974 | 1.94M | bool preserveSyntaxInPattern = false; |
975 | | // A pattern that preserves the original syntax but strips spaces, normalizes escaping, etc. |
976 | 1.94M | UnicodeString prettyPrintedPattern; |
977 | 1.94M | if (lexer.lookahead().set() != nullptr) { |
978 | | // UnicodeSet ::= property-query | named-element |
979 | | // Extension: |
980 | | // | set-valued-variable |
981 | 111k | *this = *lexer.lookahead().set(); |
982 | 111k | this->_toPattern(prettyPrintedPattern, /*escapeUnprintable=*/false); |
983 | 111k | lexer.advance(); |
984 | 111k | preserveSyntaxInPattern = true; |
985 | 1.83M | } else { |
986 | | // UnicodeSet ::= [ Union ] |
987 | | // | Complement ::= [ ^ Union ] |
988 | 1.83M | if (lexer.acceptSetOperator(u'[')) { |
989 | 1.83M | prettyPrintedPattern.append(u'['); |
990 | 1.83M | if (lexer.acceptSetOperator(u'^')) { |
991 | 1.53M | prettyPrintedPattern.append(u'^'); |
992 | 1.53M | isComplement = true; |
993 | 1.53M | } |
994 | 1.83M | parseUnion(lexer, prettyPrintedPattern, options, caseClosure, depth, |
995 | 1.83M | /*containsRestrictions=*/preserveSyntaxInPattern, ec); |
996 | 1.83M | U_UNICODESET_RETURN_IF_ERROR(ec); |
997 | 1.78M | if (!lexer.acceptSetOperator(u']')) { |
998 | 5.40k | U_UNICODESET_RETURN_WITH_PARSE_ERROR("]", lexer.lookahead().debugString(), lexer, ec); |
999 | 5.40k | } |
1000 | 1.78M | prettyPrintedPattern.append(u']'); |
1001 | 1.78M | } else { |
1002 | 3.54k | U_UNICODESET_RETURN_WITH_PARSE_ERROR(R"([: | \p | \P | \N | [)", |
1003 | 3.54k | lexer.lookahead().debugString(), lexer, |
1004 | 3.54k | ec); |
1005 | 3.54k | } |
1006 | 1.83M | } |
1007 | | |
1008 | | /** |
1009 | | * Handle global flags (isComplement, case insensitivity). If this |
1010 | | * pattern should be compiled case-insensitive, then we need |
1011 | | * to close over case BEFORE COMPLEMENTING. This makes |
1012 | | * patterns like /[^abc]/i work. |
1013 | | */ |
1014 | 1.89M | if ((options & USET_CASE_MASK) != 0) { |
1015 | 100k | (this->*caseClosure)(options); |
1016 | 100k | } |
1017 | 1.89M | if (isComplement) { |
1018 | 1.53M | complement().removeAllStrings(); // code point complement |
1019 | 1.53M | } |
1020 | 1.89M | if (preserveSyntaxInPattern) { |
1021 | 244k | rebuiltPat.append(prettyPrintedPattern); |
1022 | 1.64M | } else { |
1023 | 1.64M | _generatePattern(rebuiltPat, /*escapeUnprintable=*/false); |
1024 | 1.64M | } |
1025 | 1.89M | } |
1026 | | |
1027 | | void UnicodeSet::parseUnion(Lexer &lexer, |
1028 | | UnicodeString &rebuiltPat, |
1029 | | uint32_t options, |
1030 | | UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), |
1031 | | int32_t depth, |
1032 | | bool &containsRestrictions, |
1033 | 1.83M | UErrorCode &ec) { |
1034 | | // Union ::= Terms |
1035 | | // | UnescapedHyphenMinus Terms |
1036 | | // | Terms UnescapedHyphenMinus |
1037 | | // | UnescapedHyphenMinus Terms UnescapedHyphenMinus |
1038 | | // Terms ::= "" |
1039 | | // | Terms Term |
1040 | 1.83M | if (lexer.acceptSetOperator(u'-')) { |
1041 | 2.16k | add(u'-'); |
1042 | | // When we otherwise preserve the syntax, we escape an initial UnescapedHyphenMinus, but not a |
1043 | | // final one, for consistency with older ICU behaviour. |
1044 | 2.16k | rebuiltPat.append(u"\\-"); |
1045 | 2.16k | } |
1046 | 37.1M | while (!lexer.atEnd()) { |
1047 | | // Note that while a HYPHEN-MINUS mapped by the symbol table is treated as a literal at the |
1048 | | // beginning of the Union, it is treated as a set elsewhere, including at the end. |
1049 | 37.1M | if (lexer.acceptSetOperator(u'-')) { |
1050 | | // We can be here on the first iteration: [--] is allowed by the |
1051 | | // grammar and by the old parser. |
1052 | 5.31k | rebuiltPat.append(u'-'); |
1053 | 5.31k | add(u'-'); |
1054 | 5.31k | return; |
1055 | 37.1M | } else if (lexer.lookahead().isSetOperator(u'$')) { |
1056 | 11.8k | if (lexer.lookahead2().isSetOperator(u']')) { |
1057 | | // ICU extensions: A $ is allowed as a literal-element. |
1058 | | // A Term at the end of a Union consisting of a single $ is an anchor. |
1059 | 2.71k | rebuiltPat.append(u'$'); |
1060 | | // Consume the dollar. |
1061 | 2.71k | lexer.advance(); |
1062 | 2.71k | add(U_ETHER); |
1063 | 2.71k | containsRestrictions = true; |
1064 | 2.71k | return; |
1065 | 2.71k | } |
1066 | 11.8k | } |
1067 | 37.1M | if (lexer.lookahead().isSetOperator(u']')) { |
1068 | 1.77M | return; |
1069 | 1.77M | } |
1070 | 35.3M | parseTerm(lexer, rebuiltPat, options, caseClosure, depth, containsRestrictions, ec); |
1071 | 35.3M | U_UNICODESET_RETURN_IF_ERROR(ec); |
1072 | 35.3M | } |
1073 | 1.83M | } |
1074 | | |
1075 | | void UnicodeSet::parseTerm(Lexer &lexer, |
1076 | | UnicodeString &rebuiltPat, |
1077 | | uint32_t options, |
1078 | | UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), |
1079 | | int32_t depth, |
1080 | | bool &containsRestriction, |
1081 | 35.3M | UErrorCode &ec) { |
1082 | | // Term ::= Elements |
1083 | | // | Restriction |
1084 | 35.3M | if (lexer.lookahead().isSetOperator('[') || lexer.lookahead().set() != nullptr) { |
1085 | 1.83M | containsRestriction = true; |
1086 | 1.83M | parseRestriction(lexer, rebuiltPat, options, caseClosure, depth, ec); |
1087 | 1.83M | U_UNICODESET_RETURN_IF_ERROR(ec); |
1088 | 33.5M | } else { |
1089 | 33.5M | parseElements(lexer, rebuiltPat, ec); |
1090 | 33.5M | U_UNICODESET_RETURN_IF_ERROR(ec); |
1091 | 33.5M | } |
1092 | 35.3M | } |
1093 | | |
1094 | | void UnicodeSet::parseRestriction(Lexer &lexer, |
1095 | | UnicodeString &rebuiltPat, |
1096 | | uint32_t options, |
1097 | | UnicodeSet &(UnicodeSet::*caseClosure)(int32_t attribute), |
1098 | | int32_t depth, |
1099 | 1.83M | UErrorCode &ec) { |
1100 | | // Parse a https://www.unicode.org/reports/tr61/#Restriction: |
1101 | | // Restriction ::= UnicodeSet |
1102 | | // | Intersection |
1103 | | // | Difference |
1104 | | // Intersection ::= Restriction & UnicodeSet |
1105 | | // Difference ::= Restriction - UnicodeSet |
1106 | | // or, rewritten to be LL, |
1107 | | // Restriction ::= UnicodeSet RightHandSides |
1108 | | // RightHandSides ::= "" |
1109 | | // | & UnicodeSet RightHandSides |
1110 | | // | - UnicodeSet RightHandSides |
1111 | | // but note that the tree resulting from this LL version is not an expression tree: the |
1112 | | // operations are left-associative. |
1113 | | // Start by parsing the first UnicodeSet. |
1114 | 1.83M | UnicodeSet leftHandSide; |
1115 | 1.83M | leftHandSide.parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, depth + 1, ec); |
1116 | 1.83M | addAll(leftHandSide); |
1117 | 1.83M | U_UNICODESET_RETURN_IF_ERROR(ec); |
1118 | | // Now keep looking for an operator that would continue the RightHandSide. |
1119 | | // The loop terminates because when we run out of source text, the lookahead token will not be a set |
1120 | | // operator, so that we hit the else branch and return. |
1121 | 1.81M | for (;;) { |
1122 | 1.81M | if (lexer.acceptSetOperator(u'&')) { |
1123 | | // Intersection ::= Restriction & UnicodeSet |
1124 | 6.57k | rebuiltPat.append(u'&'); |
1125 | 6.57k | UnicodeSet rightHandSide; |
1126 | 6.57k | rightHandSide.parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, depth + 1, ec); |
1127 | 6.57k | U_UNICODESET_RETURN_IF_ERROR(ec); |
1128 | 5.67k | retainAll(rightHandSide); |
1129 | 1.81M | } else if (lexer.lookahead().isSetOperator(u'-')) { |
1130 | | // Here the grammar requires two tokens of lookahead to figure out whether the - is the operator |
1131 | | // of a Difference or an UnescapedHyphenMinus in the enclosing Union. |
1132 | 12.7k | if (lexer.lookahead2().isSetOperator(u']')) { |
1133 | | // The operator is actually an UnescapedHyphenMinus; terminate the Restriction |
1134 | | // before it. We return to parseTerm, which immediately returns to parseUnion, |
1135 | | // which will accept the - and add it to *this. |
1136 | 2.80k | return; |
1137 | 2.80k | } |
1138 | | // Consume the hyphen-minus. |
1139 | 9.98k | lexer.advance(); |
1140 | | // Difference ::= Restriction - UnicodeSet |
1141 | 9.98k | rebuiltPat.append(u'-'); |
1142 | 9.98k | UnicodeSet rightHandSide; |
1143 | 9.98k | rightHandSide.parseUnicodeSet(lexer, rebuiltPat, options, caseClosure, depth + 1, ec); |
1144 | 9.98k | U_UNICODESET_RETURN_IF_ERROR(ec); |
1145 | 8.46k | removeAll(rightHandSide); |
1146 | 1.79M | } else { |
1147 | | // Not an operator, end of the Restriction. |
1148 | 1.79M | return; |
1149 | 1.79M | } |
1150 | 1.81M | } |
1151 | 1.80M | } |
1152 | | |
1153 | | void UnicodeSet::parseElements(Lexer &lexer, |
1154 | | UnicodeString &rebuiltPat, |
1155 | 33.5M | UErrorCode &ec) { |
1156 | | // Elements ::= Element |
1157 | | // | Range |
1158 | | // Range ::= RangeElement - RangeElement |
1159 | | // RangeElement ::= literal-element |
1160 | | // | escaped-element |
1161 | | // | named-element |
1162 | | // | bracketed-element |
1163 | | // Element ::= RangeElement |
1164 | | // | string-literal |
1165 | | // codePoint().has_value() on a lexical element if it is a RangeElement. |
1166 | 33.5M | if (lexer.lookahead().isStringLiteral()) { |
1167 | 92.5k | add(*lexer.lookahead().element()); |
1168 | 92.5k | rebuiltPat.append(u'{'); |
1169 | 92.5k | _appendToPat(rebuiltPat, *lexer.lookahead().element(), /*escapeUnprintable=*/false); |
1170 | 92.5k | rebuiltPat.append(u'}'); |
1171 | 92.5k | lexer.advance(); |
1172 | 92.5k | return; |
1173 | 92.5k | } |
1174 | 33.4M | UChar32 first; |
1175 | 33.4M | if (lexer.lookahead().isSetOperator(u'$')) { |
1176 | | // Disallowed by UTS #61, but historically accepted by ICU. This is an extension. |
1177 | 9.13k | first = u'$'; |
1178 | 33.4M | } else if (lexer.lookahead().codePoint().has_value()) { |
1179 | 33.4M | first = *lexer.lookahead().codePoint(); |
1180 | 33.4M | } else { |
1181 | 16.0k | U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement | string-literal", |
1182 | 16.0k | lexer.lookahead().debugString(), |
1183 | 16.0k | lexer, ec); |
1184 | 16.0k | } |
1185 | 33.4M | lexer.advance(); |
1186 | 33.4M | _appendToPat(rebuiltPat, first, /*escapeUnprintable=*/false); |
1187 | 33.4M | if (!lexer.lookahead().isSetOperator(u'-')) { |
1188 | | // No operator, |
1189 | | // Elements ::= Element |
1190 | 33.4M | add(first); |
1191 | 33.4M | return; |
1192 | 33.4M | } |
1193 | | // Here the grammar requires two tokens of lookahead to figure out whether the - is the operator |
1194 | | // of a Range or an UnescapedHyphenMinus in the enclosing Union. |
1195 | 35.3k | if (lexer.lookahead2().isSetOperator(u']')) { |
1196 | | // The operator is actually an UnescapedHyphenMinus; terminate the Elements before it. |
1197 | 1.93k | add(first); |
1198 | 1.93k | return; |
1199 | 1.93k | } |
1200 | | // Consume the hyphen-minus. |
1201 | 33.4k | lexer.advance(); |
1202 | | // Elements ::= Range ::= RangeElement - RangeElement |
1203 | 33.4k | rebuiltPat.append(u'-'); |
1204 | 33.4k | UChar32 last; |
1205 | 33.4k | if (lexer.lookahead().isSetOperator(u'$')) { |
1206 | | // Disallowed by UTS #61, but historically accepted by ICU except at the end of a Union. |
1207 | | // This is an extension. |
1208 | 908 | last = u'$'; |
1209 | 908 | if (lexer.lookahead2().isSetOperator(u']')) { |
1210 | 3 | U_UNICODESET_RETURN_WITH_PARSE_ERROR("Term after Range ending in unescaped $", |
1211 | 3 | lexer.lookahead().debugString() + u" followed by " + |
1212 | 3 | lexer.lookahead2().debugString(), |
1213 | 3 | lexer, ec); |
1214 | 3 | } |
1215 | 32.5k | } else if (lexer.lookahead().codePoint().has_value()) { |
1216 | 32.4k | last = *lexer.lookahead().codePoint(); |
1217 | 32.4k | } else { |
1218 | 47 | U_UNICODESET_RETURN_WITH_PARSE_ERROR("RangeElement", lexer.lookahead().debugString(), lexer, ec); |
1219 | 47 | } |
1220 | 33.3k | if (last <= first) { |
1221 | 65 | U_UNICODESET_RETURN_WITH_PARSE_ERROR( |
1222 | 65 | "first < last in Range", UnicodeString(last) + u"-" + UnicodeString(first), lexer, ec); |
1223 | 65 | } |
1224 | 33.3k | lexer.advance(); |
1225 | 33.3k | _appendToPat(rebuiltPat, last, /*escapeUnprintable=*/false); |
1226 | 33.3k | add(first, last); |
1227 | 33.3k | return; |
1228 | 33.3k | } |
1229 | | |
1230 | | //---------------------------------------------------------------- |
1231 | | // Property set implementation |
1232 | | //---------------------------------------------------------------- |
1233 | | |
1234 | | namespace { |
1235 | | |
1236 | 119M | UBool numericValueFilter(UChar32 ch, void* context) { |
1237 | 119M | return u_getNumericValue(ch) == *static_cast<double*>(context); |
1238 | 119M | } |
1239 | | |
1240 | 229M | UBool generalCategoryMaskFilter(UChar32 ch, void* context) { |
1241 | 229M | int32_t value = *static_cast<int32_t*>(context); |
1242 | 229M | return (U_GET_GC_MASK((UChar32) ch) & value) != 0; |
1243 | 229M | } |
1244 | | |
1245 | 54.5M | UBool versionFilter(UChar32 ch, void* context) { |
1246 | 54.5M | static const UVersionInfo none = { 0, 0, 0, 0 }; |
1247 | 54.5M | UVersionInfo v; |
1248 | 54.5M | u_charAge(ch, v); |
1249 | 54.5M | UVersionInfo* version = static_cast<UVersionInfo*>(context); |
1250 | 54.5M | return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0; |
1251 | 54.5M | } |
1252 | | |
1253 | | typedef struct { |
1254 | | UProperty prop; |
1255 | | int32_t value; |
1256 | | } IntPropertyContext; |
1257 | | |
1258 | 336M | UBool intPropertyFilter(UChar32 ch, void* context) { |
1259 | 336M | IntPropertyContext* c = static_cast<IntPropertyContext*>(context); |
1260 | 336M | return u_getIntPropertyValue(ch, c->prop) == c->value; |
1261 | 336M | } |
1262 | | |
1263 | 66.1M | UBool scriptExtensionsFilter(UChar32 ch, void* context) { |
1264 | 66.1M | return uscript_hasScript(ch, *static_cast<UScriptCode*>(context)); |
1265 | 66.1M | } |
1266 | | |
1267 | 0 | UBool idTypeFilter(UChar32 ch, void* context) { |
1268 | 0 | return u_hasIDType(ch, *static_cast<UIdentifierType*>(context)); |
1269 | 0 | } |
1270 | | |
1271 | | } // namespace |
1272 | | |
1273 | | /** |
1274 | | * Generic filter-based scanning code for UCD property UnicodeSets. |
1275 | | */ |
1276 | | void UnicodeSet::applyFilter(UnicodeSet::Filter filter, |
1277 | | void* context, |
1278 | | const UnicodeSet* inclusions, |
1279 | 282k | UErrorCode &status) { |
1280 | 282k | if (U_FAILURE(status)) return; |
1281 | | |
1282 | | // Logically, walk through all Unicode characters, noting the start |
1283 | | // and end of each range for which filter.contain(c) is |
1284 | | // true. Add each range to a set. |
1285 | | // |
1286 | | // To improve performance, use an inclusions set which |
1287 | | // encodes information about character ranges that are known |
1288 | | // to have identical properties. |
1289 | | // inclusions contains the first characters of |
1290 | | // same-value ranges for the given property. |
1291 | | |
1292 | 282k | clear(); |
1293 | | |
1294 | 282k | UChar32 startHasProperty = -1; |
1295 | 282k | int32_t limitRange = inclusions->getRangeCount(); |
1296 | | |
1297 | 420M | for (int j=0; j<limitRange; ++j) { |
1298 | | // get current range |
1299 | 420M | UChar32 start = inclusions->getRangeStart(j); |
1300 | 420M | UChar32 end = inclusions->getRangeEnd(j); |
1301 | | |
1302 | | // for all the code points in the range, process |
1303 | 1.22G | for (UChar32 ch = start; ch <= end; ++ch) { |
1304 | | // only add to this UnicodeSet on inflection points -- |
1305 | | // where the hasProperty value changes to false |
1306 | 806M | if ((*filter)(ch, context)) { |
1307 | 57.3M | if (startHasProperty < 0) { |
1308 | 27.9M | startHasProperty = ch; |
1309 | 27.9M | } |
1310 | 749M | } else if (startHasProperty >= 0) { |
1311 | 27.8M | add(startHasProperty, ch-1); |
1312 | 27.8M | startHasProperty = -1; |
1313 | 27.8M | } |
1314 | 806M | } |
1315 | 420M | } |
1316 | 282k | if (startHasProperty >= 0) { |
1317 | 31.5k | add(startHasProperty, static_cast<UChar32>(0x10FFFF)); |
1318 | 31.5k | } |
1319 | 282k | if (isBogus() && U_SUCCESS(status)) { |
1320 | | // We likely ran out of memory. AHHH! |
1321 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
1322 | 0 | } |
1323 | 282k | } |
1324 | | |
1325 | | namespace { |
1326 | | |
1327 | | } // namespace |
1328 | | |
1329 | | //---------------------------------------------------------------- |
1330 | | // Property set API |
1331 | | //---------------------------------------------------------------- |
1332 | | |
// Sets ec to U_ILLEGAL_ARGUMENT_ERROR and returns *this from the enclosing member function.
// The argument is parenthesized so that any lvalue expression can be passed safely.
#define FAIL(ec) UPRV_BLOCK_MACRO_BEGIN { \
    (ec) = U_ILLEGAL_ARGUMENT_ERROR; \
    return *this; \
} UPRV_BLOCK_MACRO_END
1337 | | |
1338 | | UnicodeSet& |
1339 | 266k | UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) { |
1340 | 266k | if (U_FAILURE(ec) || isFrozen()) { return *this; } |
1341 | 266k | if (prop == UCHAR_GENERAL_CATEGORY_MASK) { |
1342 | 35.6k | const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); |
1343 | 35.6k | applyFilter(generalCategoryMaskFilter, &value, inclusions, ec); |
1344 | 230k | } else if (prop == UCHAR_SCRIPT_EXTENSIONS) { |
1345 | 6.65k | const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); |
1346 | 6.65k | UScriptCode script = static_cast<UScriptCode>(value); |
1347 | 6.65k | applyFilter(scriptExtensionsFilter, &script, inclusions, ec); |
1348 | 224k | } else if (prop == UCHAR_IDENTIFIER_TYPE) { |
1349 | 0 | const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); |
1350 | 0 | UIdentifierType idType = static_cast<UIdentifierType>(value); |
1351 | 0 | applyFilter(idTypeFilter, &idType, inclusions, ec); |
1352 | 224k | } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) { |
1353 | 7.93k | if (value == 0 || value == 1) { |
1354 | 7.93k | const USet *set = u_getBinaryPropertySet(prop, &ec); |
1355 | 7.93k | if (U_FAILURE(ec)) { return *this; } |
1356 | 7.93k | copyFrom(*UnicodeSet::fromUSet(set), true); |
1357 | 7.93k | if (value == 0) { |
1358 | 704 | complement().removeAllStrings(); // code point complement |
1359 | 704 | } |
1360 | 7.93k | } else { |
1361 | 0 | clear(); |
1362 | 0 | } |
1363 | 216k | } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) { |
1364 | 216k | const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); |
1365 | 216k | IntPropertyContext c = {prop, value}; |
1366 | 216k | applyFilter(intPropertyFilter, &c, inclusions, ec); |
1367 | 216k | } else { |
1368 | 0 | ec = U_ILLEGAL_ARGUMENT_ERROR; |
1369 | 0 | } |
1370 | 266k | return *this; |
1371 | 266k | } |
1372 | | |
// Applies a property, given by name, to this set.
//
// Resolves `prop` (and, when it is non-empty, `value`) to a UProperty `p` and an
// integer value `v`, then calls applyIntPropertyValue(p, v, ec). A few cases are
// handled directly in this function instead:
//   - UCHAR_NUMERIC_VALUE and UCHAR_AGE go through applyFilter();
//   - UCHAR_NAME clears the set and adds the single character found by name;
//   - the special names ANY and ASCII call set() with a fixed range.
//
// prop:  property name. If `value` is empty, `prop` is instead tried, in order,
//        as a UCHAR_GENERAL_CATEGORY_MASK value, a UCHAR_SCRIPT value, a binary
//        property name, and finally the special names ANY, ASCII and ASSIGNED.
// value: property value name; may be empty.
// ec:    in/out error code. Nothing is done if it already indicates a failure.
// Returns *this. The set is also returned untouched if it is frozen.
//
// NOTE(review): FAIL is a macro defined outside this chunk. Its use directly
// before another `case` label below suggests that it sets an error in `ec` and
// returns from this function -- confirm against the macro definition.
1373 | | UnicodeSet& |
1374 | | UnicodeSet::applyPropertyAlias(const UnicodeString& prop, |
1375 | | const UnicodeString& value, |
1376 | 131k | UErrorCode& ec) { |
1377 | 131k | if (U_FAILURE(ec) || isFrozen()) return *this; |
1378 | | |
1379 | | // prop and value used to be converted to char * using the default |
1380 | | // converter instead of the invariant conversion. |
1381 | | // This should not be necessary because all Unicode property and value |
1382 | | // names use only invariant characters. |
1383 | | // If there are any variant characters, then we won't find them anyway. |
1384 | | // Checking first avoids assertion failures in the conversion. |
1385 | 131k | if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) || |
1386 | 131k | !uprv_isInvariantUString(value.getBuffer(), value.length()) |
1387 | 131k | ) { |
1388 | 667 | FAIL(ec); |
1389 | 667 | } |
// Narrow (char) copies of both names; these are what the lookup functions below receive.
1390 | 131k | CharString pname, vname; |
1391 | 131k | pname.appendInvariantChars(prop, ec); |
1392 | 131k | vname.appendInvariantChars(value, ec); |
1393 | 131k | if (U_FAILURE(ec)) return *this; |
1394 | | |
// p and v are assigned on every path that reaches applyIntPropertyValue() below.
// invert is only set for the special name ASSIGNED.
1395 | 131k | UProperty p; |
1396 | 131k | int32_t v; |
1397 | 131k | UBool invert = false; |
1398 | | |
// Case 1: both a property name and a value name were supplied.
1399 | 131k | if (value.length() > 0) { |
1400 | 74.6k | p = u_getPropertyEnum(pname.data()); |
1401 | 74.6k | if (p == UCHAR_INVALID_CODE) FAIL(ec); |
1402 | | |
1403 | | // Treat gc as gcm |
1404 | 72.1k | if (p == UCHAR_GENERAL_CATEGORY) { |
1405 | 338 | p = UCHAR_GENERAL_CATEGORY_MASK; |
1406 | 338 | } |
1407 | | |
// Binary, int and mask properties: the value name is looked up for this property.
1408 | 72.1k | if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) || |
1409 | 71.4k | (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) || |
1410 | 40.3k | (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) { |
1411 | 40.3k | v = u_getPropertyValueEnum(p, vname.data()); |
1412 | 40.3k | if (v == UCHAR_INVALID_CODE) { |
1413 | | // Handle numeric CCC |
// Only the three combining-class properties accept a number in place of a value name.
1414 | 21.3k | if (p == UCHAR_CANONICAL_COMBINING_CLASS || |
1415 | 17.5k | p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS || |
1416 | 21.2k | p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) { |
1417 | 21.2k | char* end; |
1418 | 21.2k | double val = uprv_strtod(vname.data(), &end); |
1419 | | // Anything between 0 and 255 is valid even if unused. |
1420 | | // Cast double->int only after range check. |
1421 | | // We catch NaN here because comparing it with both 0 and 255 will be false |
1422 | | // (as are all comparisons with NaN). |
// The third operand assigns v as a side effect and rejects non-integral values;
// short-circuiting means it is evaluated only after the range check has passed.
1423 | 21.2k | if (*end != 0 || !(0 <= val && val <= 255) || |
1424 | 21.2k | (v = static_cast<int32_t>(val)) != val) { |
1425 | | // non-integral value or outside 0..255, or trailing junk |
1426 | 26 | FAIL(ec); |
1427 | 26 | } |
1428 | 21.2k | } else { |
1429 | 63 | FAIL(ec); |
1430 | 63 | } |
1431 | 21.3k | } |
1432 | 40.3k | } |
1433 | | |
// Every other property: only the ones listed in this switch are supported.
1434 | 31.8k | else { |
1435 | | |
1436 | 31.8k | switch (p) { |
1437 | 18.6k | case UCHAR_NUMERIC_VALUE: |
1438 | 18.6k | { |
// The whole value string must parse as a number; the parsed double is handed
// to numericValueFilter by address as the filter context.
// NOTE(review): only trailing junk is rejected; the parsed value is not
// range- or NaN-checked in this block.
1439 | 18.6k | char* end; |
1440 | 18.6k | double val = uprv_strtod(vname.data(), &end); |
1441 | 18.6k | if (*end != 0) { |
1442 | 6 | FAIL(ec); |
1443 | 6 | } |
1444 | 18.6k | applyFilter(numericValueFilter, &val, |
1445 | 18.6k | CharacterProperties::getInclusionsForProperty(p, ec), ec); |
1446 | 18.6k | return *this; |
1447 | 18.6k | } |
1448 | 1.01k | case UCHAR_NAME: |
1449 | 1.01k | { |
// A negative result from getCharacterByName() is treated as "no such character".
// On success the set is replaced by that single code point.
1450 | 1.01k | const UChar32 ch = getCharacterByName(vname); |
1451 | 1.01k | if (ch < 0) { |
1452 | 152 | FAIL(ec); |
1453 | 152 | } |
1454 | 865 | clear(); |
1455 | 865 | add(ch); |
1456 | 865 | return *this; |
1457 | 1.01k | } |
1458 | 3 | case UCHAR_UNICODE_1_NAME: |
1459 | | // ICU 49 deprecates the Unicode_1_Name property APIs. |
1460 | 3 | FAIL(ec); |
1461 | 5.49k | case UCHAR_AGE: |
1462 | 5.49k | { |
// NOTE(review): the result of u_versionFromString() is not checked here, so
// this branch has no failure path of its own for a malformed version string.
1463 | 5.49k | UVersionInfo version; |
1464 | 5.49k | u_versionFromString(version, vname.data()); |
1465 | 5.49k | applyFilter(versionFilter, &version, |
1466 | 5.49k | CharacterProperties::getInclusionsForProperty(p, ec), ec); |
1467 | 5.49k | return *this; |
1468 | 3 | } |
1469 | 6.67k | case UCHAR_SCRIPT_EXTENSIONS: |
// The value name is resolved as a UCHAR_SCRIPT value; p itself stays
// UCHAR_SCRIPT_EXTENSIONS for the applyIntPropertyValue() call.
1470 | 6.67k | v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data()); |
1471 | 6.67k | if (v == UCHAR_INVALID_CODE) { |
1472 | 17 | FAIL(ec); |
1473 | 17 | } |
1474 | | // fall through to calling applyIntPropertyValue() |
// (that is: leave the switch; the call is made after the outer if/else)
1475 | 6.65k | break; |
1476 | 6.65k | case UCHAR_IDENTIFIER_TYPE: |
1477 | 0 | v = u_getPropertyValueEnum(p, vname.data()); |
1478 | 0 | if (v == UCHAR_INVALID_CODE) { |
1479 | 0 | FAIL(ec); |
1480 | 0 | } |
1481 | | // fall through to calling applyIntPropertyValue() |
1482 | 0 | break; |
1483 | 15 | default: |
1484 | | // p is a non-binary, non-enumerated property that we |
1485 | | // don't support (yet). |
1486 | 15 | FAIL(ec); |
1487 | 31.8k | } |
1488 | 31.8k | } |
1489 | 72.1k | } |
1490 | | |
// Case 2: only one name was supplied.
1491 | 56.5k | else { |
1492 | | // value is empty. Interpret as General Category, Script, or |
1493 | | // Binary property. |
// Lookup order: general category mask value, then script value, then binary
// property name (with v = 1), then the special names ANY, ASCII and ASSIGNED.
1494 | 56.5k | p = UCHAR_GENERAL_CATEGORY_MASK; |
1495 | 56.5k | v = u_getPropertyValueEnum(p, pname.data()); |
1496 | 56.5k | if (v == UCHAR_INVALID_CODE) { |
1497 | 36.9k | p = UCHAR_SCRIPT; |
1498 | 36.9k | v = u_getPropertyValueEnum(p, pname.data()); |
1499 | 36.9k | if (v == UCHAR_INVALID_CODE) { |
1500 | 22.2k | p = u_getPropertyEnum(pname.data()); |
1501 | 22.2k | if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) { |
1502 | 7.22k | v = 1; |
1503 | 14.9k | } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) { |
1504 | 2.17k | set(MIN_VALUE, MAX_VALUE); |
1505 | 2.17k | return *this; |
1506 | 12.8k | } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) { |
1507 | 1.59k | set(0, 0x7F); |
1508 | 1.59k | return *this; |
1509 | 11.2k | } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) { |
1510 | | // [:Assigned:]=[:^Cn:] |
// Apply gc=Cn first; the result is complemented after applyIntPropertyValue().
1511 | 0 | p = UCHAR_GENERAL_CATEGORY_MASK; |
1512 | 0 | v = U_GC_CN_MASK; |
1513 | 0 | invert = true; |
1514 | 11.2k | } else { |
1515 | 11.2k | FAIL(ec); |
1516 | 11.2k | } |
1517 | 22.2k | } |
1518 | 36.9k | } |
1519 | 56.5k | } |
1520 | | |
// Common exit for every path that resolved to a (p, v) pair.
1521 | 88.4k | applyIntPropertyValue(p, v, ec); |
1522 | 88.4k | if(invert) { |
1523 | 0 | complement().removeAllStrings(); // code point complement |
1524 | 0 | } |
1525 | | |
1526 | 88.4k | if (isBogus() && U_SUCCESS(ec)) { |
1527 | | // We likely ran out of memory. AHHH! |
1528 | 0 | ec = U_MEMORY_ALLOCATION_ERROR; |
1529 | 0 | } |
1530 | 88.4k | return *this; |
1531 | 131k | } |
1532 | | |
1533 | | //---------------------------------------------------------------- |
1534 | | // Property set patterns |
1535 | | //---------------------------------------------------------------- |
1536 | | |
1537 | | /** |
1538 | | * Return true if the given position, in the given pattern, appears |
1539 | | * to be the start of a property set pattern. |
1540 | | */ |
// pattern: the text to inspect.
// pos:     index into pattern at which a property set pattern would have to begin.
// Returns false when fewer than 5 positions remain between pos and
// pattern.length(); otherwise returns true if any one of the three opener
// checks at the end of the function matches at pos.
// NOTE(review): pos is not checked against a lower bound in this function.
1541 | | UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern, |
1542 | 0 | int32_t pos) { |
1543 | | // Patterns are at least 5 characters long |
1544 | 0 | if ((pos+5) > pattern.length()) { |
1545 | 0 | return false; |
1546 | 0 | } |
1547 | | |
1548 | | // Look for an opening [:, [:^, \p, or \P |
// NOTE(review): isNameOpen() is consulted as well, which the comment above does
// not list. Presumably it detects a name-style opener; it is defined outside
// this chunk, so confirm against its definition.
1549 | 0 | return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos); |
1550 | 0 | } |
1551 | | |
1552 | | U_NAMESPACE_END |