/src/icu/source/common/uniset_props.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | *******************************************************************************  | 
5  |  | *  | 
6  |  | *   Copyright (C) 1999-2014, International Business Machines  | 
7  |  | *   Corporation and others.  All Rights Reserved.  | 
8  |  | *  | 
9  |  | *******************************************************************************  | 
10  |  | *   file name:  uniset_props.cpp  | 
11  |  | *   encoding:   UTF-8  | 
12  |  | *   tab size:   8 (not used)  | 
13  |  | *   indentation:4  | 
14  |  | *  | 
15  |  | *   created on: 2004aug25  | 
16  |  | *   created by: Markus W. Scherer  | 
17  |  | *  | 
18  |  | *   Character property dependent functions moved here from uniset.cpp  | 
19  |  | */  | 
20  |  |  | 
21  |  | #include "unicode/utypes.h"  | 
22  |  | #include "unicode/uniset.h"  | 
23  |  | #include "unicode/parsepos.h"  | 
24  |  | #include "unicode/uchar.h"  | 
25  |  | #include "unicode/uscript.h"  | 
26  |  | #include "unicode/symtable.h"  | 
27  |  | #include "unicode/uset.h"  | 
28  |  | #include "unicode/locid.h"  | 
29  |  | #include "unicode/brkiter.h"  | 
30  |  | #include "uset_imp.h"  | 
31  |  | #include "ruleiter.h"  | 
32  |  | #include "cmemory.h"  | 
33  |  | #include "ucln_cmn.h"  | 
34  |  | #include "util.h"  | 
35  |  | #include "uvector.h"  | 
36  |  | #include "uprops.h"  | 
37  |  | #include "propname.h"  | 
38  |  | #include "normalizer2impl.h"  | 
39  |  | #include "uinvchar.h"  | 
40  |  | #include "uprops.h"  | 
41  |  | #include "charstr.h"  | 
42  |  | #include "cstring.h"  | 
43  |  | #include "mutex.h"  | 
44  |  | #include "umutex.h"  | 
45  |  | #include "uassert.h"  | 
46  |  | #include "hash.h"  | 
47  |  |  | 
48  |  | U_NAMESPACE_USE  | 
49  |  |  | 
// Special property set IDs; the trailing comment on each line shows the
// set pattern that the ID stands for.
static constexpr char ANY[]      = "ANY";      // [\u0000-\U0010FFFF]
static constexpr char ASCII[]    = "ASCII";    // [\u0000-\u007F]
static constexpr char ASSIGNED[] = "Assigned"; // [:^Cn:]

// Short alias of the Unicode name property, plus the alias's length
// (number of characters, excluding the terminating NUL).
#define NAME_PROP "na"
#define NAME_PROP_LENGTH 2
58  |  |  | 
59  |  | // Cached sets ------------------------------------------------------------- ***  | 
60  |  |  | 
61  |  | U_CDECL_BEGIN  | 
62  |  | static UBool U_CALLCONV uset_cleanup();  | 
63  |  |  | 
64  |  | static UnicodeSet *uni32Singleton;  | 
65  |  | static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER;  | 
66  |  |  | 
67  |  | /**  | 
68  |  |  * Cleanup function for UnicodeSet  | 
69  |  |  */  | 
70  | 0  | static UBool U_CALLCONV uset_cleanup(void) { | 
71  | 0  |     delete uni32Singleton;  | 
72  | 0  |     uni32Singleton = NULL;  | 
73  | 0  |     uni32InitOnce.reset();  | 
74  | 0  |     return TRUE;  | 
75  | 0  | }  | 
76  |  |  | 
77  |  | U_CDECL_END  | 
78  |  |  | 
79  |  | U_NAMESPACE_BEGIN  | 
80  |  |  | 
81  |  | namespace { | 
82  |  |  | 
83  |  | // Cache some sets for other services -------------------------------------- ***  | 
84  | 0  | void U_CALLCONV createUni32Set(UErrorCode &errorCode) { | 
85  | 0  |     U_ASSERT(uni32Singleton == NULL);  | 
86  | 0  |     uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode); | 
87  | 0  |     if(uni32Singleton==NULL) { | 
88  | 0  |         errorCode=U_MEMORY_ALLOCATION_ERROR;  | 
89  | 0  |     } else { | 
90  | 0  |         uni32Singleton->freeze();  | 
91  | 0  |     }  | 
92  | 0  |     ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);  | 
93  | 0  | }  | 
94  |  |  | 
95  |  |  | 
96  |  | U_CFUNC UnicodeSet *  | 
97  | 0  | uniset_getUnicode32Instance(UErrorCode &errorCode) { | 
98  | 0  |     umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);  | 
99  | 0  |     return uni32Singleton;  | 
100  | 0  | }  | 
101  |  |  | 
102  |  | // helper functions for matching of pattern syntax pieces ------------------ ***  | 
103  |  | // these functions are parallel to the PERL_OPEN etc. strings above  | 
104  |  |  | 
105  |  | // using these functions is not only faster than UnicodeString::compare() and  | 
106  |  | // caseCompare(), but they also make UnicodeSet work for simple patterns when  | 
107  |  | // no Unicode properties data is available - when caseCompare() fails  | 
108  |  |  | 
109  |  | static inline UBool  | 
110  | 0  | isPerlOpen(const UnicodeString &pattern, int32_t pos) { | 
111  | 0  |     UChar c;  | 
112  | 0  |     return pattern.charAt(pos)==u'\\' && ((c=pattern.charAt(pos+1))==u'p' || c==u'P');  | 
113  | 0  | }  | 
114  |  |  | 
115  |  | /*static inline UBool  | 
116  |  | isPerlClose(const UnicodeString &pattern, int32_t pos) { | 
117  |  |     return pattern.charAt(pos)==u'}';  | 
118  |  | }*/  | 
119  |  |  | 
120  |  | static inline UBool  | 
121  | 0  | isNameOpen(const UnicodeString &pattern, int32_t pos) { | 
122  | 0  |     return pattern.charAt(pos)==u'\\' && pattern.charAt(pos+1)==u'N';  | 
123  | 0  | }  | 
124  |  |  | 
125  |  | static inline UBool  | 
126  | 0  | isPOSIXOpen(const UnicodeString &pattern, int32_t pos) { | 
127  | 0  |     return pattern.charAt(pos)==u'[' && pattern.charAt(pos+1)==u':';  | 
128  | 0  | }  | 
129  |  |  | 
130  |  | /*static inline UBool  | 
131  |  | isPOSIXClose(const UnicodeString &pattern, int32_t pos) { | 
132  |  |     return pattern.charAt(pos)==u':' && pattern.charAt(pos+1)==u']';  | 
133  |  | }*/  | 
134  |  |  | 
135  |  | // TODO memory debugging provided inside uniset.cpp  | 
136  |  | // could be made available here but probably obsolete with use of modern  | 
137  |  | // memory leak checker tools  | 
138  |  | #define _dbgct(me)  | 
139  |  |  | 
140  |  | }  // namespace  | 
141  |  |  | 
142  |  | //----------------------------------------------------------------  | 
143  |  | // Constructors &c  | 
144  |  | //----------------------------------------------------------------  | 
145  |  |  | 
146  |  | /**  | 
147  |  |  * Constructs a set from the given pattern, optionally ignoring  | 
148  |  |  * white space.  See the class description for the syntax of the  | 
149  |  |  * pattern language.  | 
150  |  |  * @param pattern a string specifying what characters are in the set  | 
151  |  |  */  | 
152  |  | UnicodeSet::UnicodeSet(const UnicodeString& pattern,  | 
153  | 0  |                        UErrorCode& status) { | 
154  | 0  |     applyPattern(pattern, status);  | 
155  | 0  |     _dbgct(this);  | 
156  | 0  | }  | 
157  |  |  | 
158  |  | //----------------------------------------------------------------  | 
159  |  | // Public API  | 
160  |  | //----------------------------------------------------------------  | 
161  |  |  | 
162  |  | UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,  | 
163  | 0  |                                      UErrorCode& status) { | 
164  |  |     // Equivalent to  | 
165  |  |     //   return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);  | 
166  |  |     // but without dependency on closeOver().  | 
167  | 0  |     ParsePosition pos(0);  | 
168  | 0  |     applyPatternIgnoreSpace(pattern, pos, NULL, status);  | 
169  | 0  |     if (U_FAILURE(status)) return *this;  | 
170  |  |  | 
171  | 0  |     int32_t i = pos.getIndex();  | 
172  |  |     // Skip over trailing whitespace  | 
173  | 0  |     ICU_Utility::skipWhitespace(pattern, i, TRUE);  | 
174  | 0  |     if (i != pattern.length()) { | 
175  | 0  |         status = U_ILLEGAL_ARGUMENT_ERROR;  | 
176  | 0  |     }  | 
177  | 0  |     return *this;  | 
178  | 0  | }  | 
179  |  |  | 
180  |  | void  | 
181  |  | UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,  | 
182  |  |                                     ParsePosition& pos,  | 
183  |  |                                     const SymbolTable* symbols,  | 
184  | 0  |                                     UErrorCode& status) { | 
185  | 0  |     if (U_FAILURE(status)) { | 
186  | 0  |         return;  | 
187  | 0  |     }  | 
188  | 0  |     if (isFrozen()) { | 
189  | 0  |         status = U_NO_WRITE_PERMISSION;  | 
190  | 0  |         return;  | 
191  | 0  |     }  | 
192  |  |     // Need to build the pattern in a temporary string because  | 
193  |  |     // _applyPattern calls add() etc., which set pat to empty.  | 
194  | 0  |     UnicodeString rebuiltPat;  | 
195  | 0  |     RuleCharacterIterator chars(pattern, symbols, pos);  | 
196  | 0  |     applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, 0, status);  | 
197  | 0  |     if (U_FAILURE(status)) return;  | 
198  | 0  |     if (chars.inVariable()) { | 
199  |  |         // syntaxError(chars, "Extra chars in variable value");  | 
200  | 0  |         status = U_MALFORMED_SET;  | 
201  | 0  |         return;  | 
202  | 0  |     }  | 
203  | 0  |     setPattern(rebuiltPat);  | 
204  | 0  | }  | 
205  |  |  | 
206  |  | /**  | 
207  |  |  * Return true if the given position, in the given pattern, appears  | 
208  |  |  * to be the start of a UnicodeSet pattern.  | 
209  |  |  */  | 
210  | 0  | UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) { | 
211  | 0  |     return ((pos+1) < pattern.length() &&  | 
212  | 0  |             pattern.charAt(pos) == (UChar)91/*[*/) ||  | 
213  | 0  |         resemblesPropertyPattern(pattern, pos);  | 
214  | 0  | }  | 
215  |  |  | 
216  |  | //----------------------------------------------------------------  | 
217  |  | // Implementation: Pattern parsing  | 
218  |  | //----------------------------------------------------------------  | 
219  |  |  | 
220  |  | namespace { | 
221  |  |  | 
222  |  | /**  | 
223  |  |  * A small all-inline class to manage a UnicodeSet pointer.  Add  | 
224  |  |  * operator->() etc. as needed.  | 
225  |  |  */  | 
226  |  | class UnicodeSetPointer { | 
227  |  |     UnicodeSet* p;  | 
228  |  | public:  | 
229  | 0  |     inline UnicodeSetPointer() : p(0) {} | 
230  | 0  |     inline ~UnicodeSetPointer() { delete p; } | 
231  | 0  |     inline UnicodeSet* pointer() { return p; } | 
232  | 0  |     inline UBool allocate() { | 
233  | 0  |         if (p == 0) { | 
234  | 0  |             p = new UnicodeSet();  | 
235  | 0  |         }  | 
236  | 0  |         return p != 0;  | 
237  | 0  |     }  | 
238  |  | };  | 
239  |  |  | 
240  |  | constexpr int32_t MAX_DEPTH = 100;  | 
241  |  |  | 
242  |  | }  // namespace  | 
243  |  |  | 
244  |  | /**  | 
245  |  |  * Parse the pattern from the given RuleCharacterIterator.  The  | 
246  |  |  * iterator is advanced over the parsed pattern.  | 
247  |  |  * @param chars iterator over the pattern characters.  Upon return  | 
248  |  |  * it will be advanced to the first character after the parsed  | 
249  |  |  * pattern, or the end of the iteration if all characters are  | 
250  |  |  * parsed.  | 
251  |  |  * @param symbols symbol table to use to parse and dereference  | 
252  |  |  * variables, or null if none.  | 
253  |  |  * @param rebuiltPat the pattern that was parsed, rebuilt or  | 
254  |  |  * copied from the input pattern, as appropriate.  | 
255  |  |  * @param options a bit mask of zero or more of the following:  | 
256  |  |  * IGNORE_SPACE, CASE.  | 
257  |  |  */  | 
258  |  | void UnicodeSet::applyPattern(RuleCharacterIterator& chars,  | 
259  |  |                               const SymbolTable* symbols,  | 
260  |  |                               UnicodeString& rebuiltPat,  | 
261  |  |                               uint32_t options,  | 
262  |  |                               UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),  | 
263  |  |                               int32_t depth,  | 
264  | 0  |                               UErrorCode& ec) { | 
265  | 0  |     if (U_FAILURE(ec)) return;  | 
266  | 0  |     if (depth > MAX_DEPTH) { | 
267  | 0  |         ec = U_ILLEGAL_ARGUMENT_ERROR;  | 
268  | 0  |         return;  | 
269  | 0  |     }  | 
270  |  |  | 
271  |  |     // Syntax characters: [ ] ^ - & { } | 
272  |  |  | 
273  |  |     // Recognized special forms for chars, sets: c-c s-s s&s  | 
274  |  |  | 
275  | 0  |     int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |  | 
276  | 0  |                    RuleCharacterIterator::PARSE_ESCAPES;  | 
277  | 0  |     if ((options & USET_IGNORE_SPACE) != 0) { | 
278  | 0  |         opts |= RuleCharacterIterator::SKIP_WHITESPACE;  | 
279  | 0  |     }  | 
280  |  | 
  | 
281  | 0  |     UnicodeString patLocal, buf;  | 
282  | 0  |     UBool usePat = FALSE;  | 
283  | 0  |     UnicodeSetPointer scratch;  | 
284  | 0  |     RuleCharacterIterator::Pos backup;  | 
285  |  |  | 
286  |  |     // mode: 0=before [, 1=between [...], 2=after ]  | 
287  |  |     // lastItem: 0=none, 1=char, 2=set  | 
288  | 0  |     int8_t lastItem = 0, mode = 0;  | 
289  | 0  |     UChar32 lastChar = 0;  | 
290  | 0  |     UChar op = 0;  | 
291  |  | 
  | 
292  | 0  |     UBool invert = FALSE;  | 
293  |  | 
  | 
294  | 0  |     clear();  | 
295  |  | 
  | 
296  | 0  |     while (mode != 2 && !chars.atEnd()) { | 
297  | 0  |         U_ASSERT((lastItem == 0 && op == 0) ||  | 
298  | 0  |                  (lastItem == 1 && (op == 0 || op == u'-')) ||  | 
299  | 0  |                  (lastItem == 2 && (op == 0 || op == u'-' || op == u'&')));  | 
300  |  | 
  | 
301  | 0  |         UChar32 c = 0;  | 
302  | 0  |         UBool literal = FALSE;  | 
303  | 0  |         UnicodeSet* nested = 0; // alias - do not delete  | 
304  |  |  | 
305  |  |         // -------- Check for property pattern  | 
306  |  |  | 
307  |  |         // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed  | 
308  | 0  |         int8_t setMode = 0;  | 
309  | 0  |         if (resemblesPropertyPattern(chars, opts)) { | 
310  | 0  |             setMode = 2;  | 
311  | 0  |         }  | 
312  |  |  | 
313  |  |         // -------- Parse '[' of opening delimiter OR nested set.  | 
314  |  |         // If there is a nested set, use `setMode' to define how  | 
315  |  |         // the set should be parsed.  If the '[' is part of the  | 
316  |  |         // opening delimiter for this pattern, parse special  | 
317  |  |         // strings "[", "[^", "[-", and "[^-".  Check for stand-in  | 
318  |  |         // characters representing a nested set in the symbol  | 
319  |  |         // table.  | 
320  |  |  | 
321  | 0  |         else { | 
322  |  |             // Prepare to backup if necessary  | 
323  | 0  |             chars.getPos(backup);  | 
324  | 0  |             c = chars.next(opts, literal, ec);  | 
325  | 0  |             if (U_FAILURE(ec)) return;  | 
326  |  |  | 
327  | 0  |             if (c == u'[' && !literal) { | 
328  | 0  |                 if (mode == 1) { | 
329  | 0  |                     chars.setPos(backup); // backup  | 
330  | 0  |                     setMode = 1;  | 
331  | 0  |                 } else { | 
332  |  |                     // Handle opening '[' delimiter  | 
333  | 0  |                     mode = 1;  | 
334  | 0  |                     patLocal.append(u'[');  | 
335  | 0  |                     chars.getPos(backup); // prepare to backup  | 
336  | 0  |                     c = chars.next(opts, literal, ec);   | 
337  | 0  |                     if (U_FAILURE(ec)) return;  | 
338  | 0  |                     if (c == u'^' && !literal) { | 
339  | 0  |                         invert = TRUE;  | 
340  | 0  |                         patLocal.append(u'^');  | 
341  | 0  |                         chars.getPos(backup); // prepare to backup  | 
342  | 0  |                         c = chars.next(opts, literal, ec);  | 
343  | 0  |                         if (U_FAILURE(ec)) return;  | 
344  | 0  |                     }  | 
345  |  |                     // Fall through to handle special leading '-';  | 
346  |  |                     // otherwise restart loop for nested [], \p{}, etc. | 
347  | 0  |                     if (c == u'-') { | 
348  | 0  |                         literal = TRUE;  | 
349  |  |                         // Fall through to handle literal '-' below  | 
350  | 0  |                     } else { | 
351  | 0  |                         chars.setPos(backup); // backup  | 
352  | 0  |                         continue;  | 
353  | 0  |                     }  | 
354  | 0  |                 }  | 
355  | 0  |             } else if (symbols != 0) { | 
356  | 0  |                 const UnicodeFunctor *m = symbols->lookupMatcher(c);  | 
357  | 0  |                 if (m != 0) { | 
358  | 0  |                     const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);  | 
359  | 0  |                     if (ms == NULL) { | 
360  | 0  |                         ec = U_MALFORMED_SET;  | 
361  | 0  |                         return;  | 
362  | 0  |                     }  | 
363  |  |                     // casting away const, but `nested' won't be modified  | 
364  |  |                     // (important not to modify stored set)  | 
365  | 0  |                     nested = const_cast<UnicodeSet*>(ms);  | 
366  | 0  |                     setMode = 3;  | 
367  | 0  |                 }  | 
368  | 0  |             }  | 
369  | 0  |         }  | 
370  |  |  | 
371  |  |         // -------- Handle a nested set.  This either is inline in  | 
372  |  |         // the pattern or represented by a stand-in that has  | 
373  |  |         // previously been parsed and was looked up in the symbol  | 
374  |  |         // table.  | 
375  |  |  | 
376  | 0  |         if (setMode != 0) { | 
377  | 0  |             if (lastItem == 1) { | 
378  | 0  |                 if (op != 0) { | 
379  |  |                     // syntaxError(chars, "Char expected after operator");  | 
380  | 0  |                     ec = U_MALFORMED_SET;  | 
381  | 0  |                     return;  | 
382  | 0  |                 }  | 
383  | 0  |                 add(lastChar, lastChar);  | 
384  | 0  |                 _appendToPat(patLocal, lastChar, FALSE);  | 
385  | 0  |                 lastItem = 0;  | 
386  | 0  |                 op = 0;  | 
387  | 0  |             }  | 
388  |  |  | 
389  | 0  |             if (op == u'-' || op == u'&') { | 
390  | 0  |                 patLocal.append(op);  | 
391  | 0  |             }  | 
392  |  | 
  | 
393  | 0  |             if (nested == 0) { | 
394  |  |                 // lazy allocation  | 
395  | 0  |                 if (!scratch.allocate()) { | 
396  | 0  |                     ec = U_MEMORY_ALLOCATION_ERROR;  | 
397  | 0  |                     return;  | 
398  | 0  |                 }  | 
399  | 0  |                 nested = scratch.pointer();  | 
400  | 0  |             }  | 
401  | 0  |             switch (setMode) { | 
402  | 0  |             case 1:  | 
403  | 0  |                 nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec);  | 
404  | 0  |                 break;  | 
405  | 0  |             case 2:  | 
406  | 0  |                 chars.skipIgnored(opts);  | 
407  | 0  |                 nested->applyPropertyPattern(chars, patLocal, ec);  | 
408  | 0  |                 if (U_FAILURE(ec)) return;  | 
409  | 0  |                 break;  | 
410  | 0  |             case 3: // `nested' already parsed  | 
411  | 0  |                 nested->_toPattern(patLocal, FALSE);  | 
412  | 0  |                 break;  | 
413  | 0  |             }  | 
414  |  |  | 
415  | 0  |             usePat = TRUE;  | 
416  |  | 
  | 
417  | 0  |             if (mode == 0) { | 
418  |  |                 // Entire pattern is a category; leave parse loop  | 
419  | 0  |                 *this = *nested;  | 
420  | 0  |                 mode = 2;  | 
421  | 0  |                 break;  | 
422  | 0  |             }  | 
423  |  |  | 
424  | 0  |             switch (op) { | 
425  | 0  |             case u'-':  | 
426  | 0  |                 removeAll(*nested);  | 
427  | 0  |                 break;  | 
428  | 0  |             case u'&':  | 
429  | 0  |                 retainAll(*nested);  | 
430  | 0  |                 break;  | 
431  | 0  |             case 0:  | 
432  | 0  |                 addAll(*nested);  | 
433  | 0  |                 break;  | 
434  | 0  |             }  | 
435  |  |  | 
436  | 0  |             op = 0;  | 
437  | 0  |             lastItem = 2;  | 
438  |  | 
  | 
439  | 0  |             continue;  | 
440  | 0  |         }  | 
441  |  |  | 
442  | 0  |         if (mode == 0) { | 
443  |  |             // syntaxError(chars, "Missing '['");  | 
444  | 0  |             ec = U_MALFORMED_SET;  | 
445  | 0  |             return;  | 
446  | 0  |         }  | 
447  |  |  | 
448  |  |         // -------- Parse special (syntax) characters.  If the  | 
449  |  |         // current character is not special, or if it is escaped,  | 
450  |  |         // then fall through and handle it below.  | 
451  |  |  | 
452  | 0  |         if (!literal) { | 
453  | 0  |             switch (c) { | 
454  | 0  |             case u']':  | 
455  | 0  |                 if (lastItem == 1) { | 
456  | 0  |                     add(lastChar, lastChar);  | 
457  | 0  |                     _appendToPat(patLocal, lastChar, FALSE);  | 
458  | 0  |                 }  | 
459  |  |                 // Treat final trailing '-' as a literal  | 
460  | 0  |                 if (op == u'-') { | 
461  | 0  |                     add(op, op);  | 
462  | 0  |                     patLocal.append(op);  | 
463  | 0  |                 } else if (op == u'&') { | 
464  |  |                     // syntaxError(chars, "Trailing '&'");  | 
465  | 0  |                     ec = U_MALFORMED_SET;  | 
466  | 0  |                     return;  | 
467  | 0  |                 }  | 
468  | 0  |                 patLocal.append(u']');  | 
469  | 0  |                 mode = 2;  | 
470  | 0  |                 continue;  | 
471  | 0  |             case u'-':  | 
472  | 0  |                 if (op == 0) { | 
473  | 0  |                     if (lastItem != 0) { | 
474  | 0  |                         op = (UChar) c;  | 
475  | 0  |                         continue;  | 
476  | 0  |                     } else { | 
477  |  |                         // Treat final trailing '-' as a literal  | 
478  | 0  |                         add(c, c);  | 
479  | 0  |                         c = chars.next(opts, literal, ec);  | 
480  | 0  |                         if (U_FAILURE(ec)) return;  | 
481  | 0  |                         if (c == u']' && !literal) { | 
482  | 0  |                             patLocal.append(u"-]", 2);  | 
483  | 0  |                             mode = 2;  | 
484  | 0  |                             continue;  | 
485  | 0  |                         }  | 
486  | 0  |                     }  | 
487  | 0  |                 }  | 
488  |  |                 // syntaxError(chars, "'-' not after char or set");  | 
489  | 0  |                 ec = U_MALFORMED_SET;  | 
490  | 0  |                 return;  | 
491  | 0  |             case u'&':  | 
492  | 0  |                 if (lastItem == 2 && op == 0) { | 
493  | 0  |                     op = (UChar) c;  | 
494  | 0  |                     continue;  | 
495  | 0  |                 }  | 
496  |  |                 // syntaxError(chars, "'&' not after set");  | 
497  | 0  |                 ec = U_MALFORMED_SET;  | 
498  | 0  |                 return;  | 
499  | 0  |             case u'^':  | 
500  |  |                 // syntaxError(chars, "'^' not after '['");  | 
501  | 0  |                 ec = U_MALFORMED_SET;  | 
502  | 0  |                 return;  | 
503  | 0  |             case u'{': | 
504  | 0  |                 if (op != 0) { | 
505  |  |                     // syntaxError(chars, "Missing operand after operator");  | 
506  | 0  |                     ec = U_MALFORMED_SET;  | 
507  | 0  |                     return;  | 
508  | 0  |                 }  | 
509  | 0  |                 if (lastItem == 1) { | 
510  | 0  |                     add(lastChar, lastChar);  | 
511  | 0  |                     _appendToPat(patLocal, lastChar, FALSE);  | 
512  | 0  |                 }  | 
513  | 0  |                 lastItem = 0;  | 
514  | 0  |                 buf.truncate(0);  | 
515  | 0  |                 { | 
516  | 0  |                     UBool ok = FALSE;  | 
517  | 0  |                     while (!chars.atEnd()) { | 
518  | 0  |                         c = chars.next(opts, literal, ec);  | 
519  | 0  |                         if (U_FAILURE(ec)) return;  | 
520  | 0  |                         if (c == u'}' && !literal) { | 
521  | 0  |                             ok = TRUE;  | 
522  | 0  |                             break;  | 
523  | 0  |                         }  | 
524  | 0  |                         buf.append(c);  | 
525  | 0  |                     }  | 
526  | 0  |                     if (!ok) { | 
527  |  |                         // syntaxError(chars, "Invalid multicharacter string");  | 
528  | 0  |                         ec = U_MALFORMED_SET;  | 
529  | 0  |                         return;  | 
530  | 0  |                     }  | 
531  | 0  |                 }  | 
532  |  |                 // We have new string. Add it to set and continue;  | 
533  |  |                 // we don't need to drop through to the further  | 
534  |  |                 // processing  | 
535  | 0  |                 add(buf);  | 
536  | 0  |                 patLocal.append(u'{'); | 
537  | 0  |                 _appendToPat(patLocal, buf, FALSE);  | 
538  | 0  |                 patLocal.append(u'}');  | 
539  | 0  |                 continue;  | 
540  | 0  |             case SymbolTable::SYMBOL_REF:  | 
541  |  |                 //         symbols  nosymbols  | 
542  |  |                 // [a-$]   error    error (ambiguous)  | 
543  |  |                 // [a$]    anchor   anchor  | 
544  |  |                 // [a-$x]  var "x"* literal '$'  | 
545  |  |                 // [a-$.]  error    literal '$'  | 
546  |  |                 // *We won't get here in the case of var "x"  | 
547  | 0  |                 { | 
548  | 0  |                     chars.getPos(backup);  | 
549  | 0  |                     c = chars.next(opts, literal, ec);  | 
550  | 0  |                     if (U_FAILURE(ec)) return;  | 
551  | 0  |                     UBool anchor = (c == u']' && !literal);  | 
552  | 0  |                     if (symbols == 0 && !anchor) { | 
553  | 0  |                         c = SymbolTable::SYMBOL_REF;  | 
554  | 0  |                         chars.setPos(backup);  | 
555  | 0  |                         break; // literal '$'  | 
556  | 0  |                     }  | 
557  | 0  |                     if (anchor && op == 0) { | 
558  | 0  |                         if (lastItem == 1) { | 
559  | 0  |                             add(lastChar, lastChar);  | 
560  | 0  |                             _appendToPat(patLocal, lastChar, FALSE);  | 
561  | 0  |                         }  | 
562  | 0  |                         add(U_ETHER);  | 
563  | 0  |                         usePat = TRUE;  | 
564  | 0  |                         patLocal.append((UChar) SymbolTable::SYMBOL_REF);  | 
565  | 0  |                         patLocal.append(u']');  | 
566  | 0  |                         mode = 2;  | 
567  | 0  |                         continue;  | 
568  | 0  |                     }  | 
569  |  |                     // syntaxError(chars, "Unquoted '$'");  | 
570  | 0  |                     ec = U_MALFORMED_SET;  | 
571  | 0  |                     return;  | 
572  | 0  |                 }  | 
573  | 0  |             default:  | 
574  | 0  |                 break;  | 
575  | 0  |             }  | 
576  | 0  |         }  | 
577  |  |  | 
578  |  |         // -------- Parse literal characters.  This includes both  | 
579  |  |         // escaped chars ("\u4E01") and non-syntax characters | 
580  |  |         // ("a"). | 
581  |  |  | 
582  | 0  |         switch (lastItem) { | 
583  | 0  |         case 0:  | 
584  | 0  |             lastItem = 1;  | 
585  | 0  |             lastChar = c;  | 
586  | 0  |             break;  | 
587  | 0  |         case 1:  | 
588  | 0  |             if (op == u'-') { | 
589  | 0  |                 if (lastChar >= c) { | 
590  |  |                     // Don't allow redundant (a-a) or empty (b-a) ranges;  | 
591  |  |                     // these are most likely typos.  | 
592  |  |                     // syntaxError(chars, "Invalid range");  | 
593  | 0  |                     ec = U_MALFORMED_SET;  | 
594  | 0  |                     return;  | 
595  | 0  |                 }  | 
596  | 0  |                 add(lastChar, c);  | 
597  | 0  |                 _appendToPat(patLocal, lastChar, FALSE);  | 
598  | 0  |                 patLocal.append(op);  | 
599  | 0  |                 _appendToPat(patLocal, c, FALSE);  | 
600  | 0  |                 lastItem = 0;  | 
601  | 0  |                 op = 0;  | 
602  | 0  |             } else { | 
603  | 0  |                 add(lastChar, lastChar);  | 
604  | 0  |                 _appendToPat(patLocal, lastChar, FALSE);  | 
605  | 0  |                 lastChar = c;  | 
606  | 0  |             }  | 
607  | 0  |             break;  | 
608  | 0  |         case 2:  | 
609  | 0  |             if (op != 0) { | 
610  |  |                 // syntaxError(chars, "Set expected after operator");  | 
611  | 0  |                 ec = U_MALFORMED_SET;  | 
612  | 0  |                 return;  | 
613  | 0  |             }  | 
614  | 0  |             lastChar = c;  | 
615  | 0  |             lastItem = 1;  | 
616  | 0  |             break;  | 
617  | 0  |         }  | 
618  | 0  |     }  | 
619  |  |  | 
620  | 0  |     if (mode != 2) { | 
621  |  |         // syntaxError(chars, "Missing ']'");  | 
622  | 0  |         ec = U_MALFORMED_SET;  | 
623  | 0  |         return;  | 
624  | 0  |     }  | 
625  |  |  | 
626  | 0  |     chars.skipIgnored(opts);  | 
627  |  |  | 
628  |  |     /**  | 
629  |  |      * Handle global flags (invert, case insensitivity).  If this  | 
630  |  |      * pattern should be compiled case-insensitive, then we need  | 
631  |  |      * to close over case BEFORE COMPLEMENTING.  This makes  | 
632  |  |      * patterns like /[^abc]/i work.  | 
633  |  |      */  | 
634  | 0  |     if ((options & USET_CASE_INSENSITIVE) != 0) { | 
635  | 0  |         (this->*caseClosure)(USET_CASE_INSENSITIVE);  | 
636  | 0  |     }  | 
637  | 0  |     else if ((options & USET_ADD_CASE_MAPPINGS) != 0) { | 
638  | 0  |         (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);  | 
639  | 0  |     }  | 
640  | 0  |     if (invert) { | 
641  | 0  |         complement();  | 
642  | 0  |     }  | 
643  |  |  | 
644  |  |     // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the  | 
645  |  |     // generated pattern.  | 
646  | 0  |     if (usePat) { | 
647  | 0  |         rebuiltPat.append(patLocal);  | 
648  | 0  |     } else { | 
649  | 0  |         _generatePattern(rebuiltPat, FALSE);  | 
650  | 0  |     }  | 
651  | 0  |     if (isBogus() && U_SUCCESS(ec)) { | 
652  |  |         // We likely ran out of memory. AHHH!  | 
653  | 0  |         ec = U_MEMORY_ALLOCATION_ERROR;  | 
654  | 0  |     }  | 
655  | 0  | }  | 
656  |  |  | 
657  |  | //----------------------------------------------------------------  | 
658  |  | // Property set implementation  | 
659  |  | //----------------------------------------------------------------  | 
660  |  |  | 
661  |  | namespace { | 
662  |  |  | 
663  | 0  | static UBool numericValueFilter(UChar32 ch, void* context) { | 
664  | 0  |     return u_getNumericValue(ch) == *(double*)context;  | 
665  | 0  | }  | 
666  |  |  | 
667  | 0  | static UBool generalCategoryMaskFilter(UChar32 ch, void* context) { | 
668  | 0  |     int32_t value = *(int32_t*)context;  | 
669  | 0  |     return (U_GET_GC_MASK((UChar32) ch) & value) != 0;  | 
670  | 0  | }  | 
671  |  |  | 
672  | 0  | static UBool versionFilter(UChar32 ch, void* context) { | 
673  | 0  |     static const UVersionInfo none = { 0, 0, 0, 0 }; | 
674  | 0  |     UVersionInfo v;  | 
675  | 0  |     u_charAge(ch, v);  | 
676  | 0  |     UVersionInfo* version = (UVersionInfo*)context;  | 
677  | 0  |     return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;  | 
678  | 0  | }  | 
679  |  |  | 
680  |  | typedef struct { | 
681  |  |     UProperty prop;  | 
682  |  |     int32_t value;  | 
683  |  | } IntPropertyContext;  | 
684  |  |  | 
685  | 0  | static UBool intPropertyFilter(UChar32 ch, void* context) { | 
686  | 0  |     IntPropertyContext* c = (IntPropertyContext*)context;  | 
687  | 0  |     return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;  | 
688  | 0  | }  | 
689  |  |  | 
690  | 0  | static UBool scriptExtensionsFilter(UChar32 ch, void* context) { | 
691  | 0  |     return uscript_hasScript(ch, *(UScriptCode*)context);  | 
692  | 0  | }  | 
693  |  |  | 
694  |  | }  // namespace  | 
695  |  |  | 
696  |  | /**  | 
697  |  |  * Generic filter-based scanning code for UCD property UnicodeSets.  | 
698  |  |  */  | 
699  |  | void UnicodeSet::applyFilter(UnicodeSet::Filter filter,  | 
700  |  |                              void* context,  | 
701  |  |                              const UnicodeSet* inclusions,  | 
702  | 0  |                              UErrorCode &status) { | 
703  | 0  |     if (U_FAILURE(status)) return;  | 
704  |  |  | 
705  |  |     // Logically, walk through all Unicode characters, noting the start  | 
706  |  |     // and end of each range for which filter.contain(c) is  | 
707  |  |     // true.  Add each range to a set.  | 
708  |  |     //  | 
709  |  |     // To improve performance, use an inclusions set which  | 
710  |  |     // encodes information about character ranges that are known  | 
711  |  |     // to have identical properties.  | 
712  |  |     // inclusions contains the first characters of  | 
713  |  |     // same-value ranges for the given property.  | 
714  |  |  | 
715  | 0  |     clear();  | 
716  |  | 
  | 
717  | 0  |     UChar32 startHasProperty = -1;  | 
718  | 0  |     int32_t limitRange = inclusions->getRangeCount();  | 
719  |  | 
  | 
720  | 0  |     for (int j=0; j<limitRange; ++j) { | 
721  |  |         // get current range  | 
722  | 0  |         UChar32 start = inclusions->getRangeStart(j);  | 
723  | 0  |         UChar32 end = inclusions->getRangeEnd(j);  | 
724  |  |  | 
725  |  |         // for all the code points in the range, process  | 
726  | 0  |         for (UChar32 ch = start; ch <= end; ++ch) { | 
727  |  |             // only add to this UnicodeSet on inflection points --  | 
728  |  |             // where the hasProperty value changes to false  | 
729  | 0  |             if ((*filter)(ch, context)) { | 
730  | 0  |                 if (startHasProperty < 0) { | 
731  | 0  |                     startHasProperty = ch;  | 
732  | 0  |                 }  | 
733  | 0  |             } else if (startHasProperty >= 0) { | 
734  | 0  |                 add(startHasProperty, ch-1);  | 
735  | 0  |                 startHasProperty = -1;  | 
736  | 0  |             }  | 
737  | 0  |         }  | 
738  | 0  |     }  | 
739  | 0  |     if (startHasProperty >= 0) { | 
740  | 0  |         add((UChar32)startHasProperty, (UChar32)0x10FFFF);  | 
741  | 0  |     }  | 
742  | 0  |     if (isBogus() && U_SUCCESS(status)) { | 
743  |  |         // We likely ran out of memory. AHHH!  | 
744  | 0  |         status = U_MEMORY_ALLOCATION_ERROR;  | 
745  | 0  |     }  | 
746  | 0  | }  | 
747  |  |  | 
748  |  | namespace { | 
749  |  |  | 
750  | 0  | static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { | 
751  |  |     /* Note: we use ' ' in compiler code page */  | 
752  | 0  |     int32_t j = 0;  | 
753  | 0  |     char ch;  | 
754  | 0  |     --dstCapacity; /* make room for term. zero */  | 
755  | 0  |     while ((ch = *src++) != 0) { | 
756  | 0  |         if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) { | 
757  | 0  |             continue;  | 
758  | 0  |         }  | 
759  | 0  |         if (j >= dstCapacity) return FALSE;  | 
760  | 0  |         dst[j++] = ch;  | 
761  | 0  |     }  | 
762  | 0  |     if (j > 0 && dst[j-1] == ' ') --j;  | 
763  | 0  |     dst[j] = 0;  | 
764  | 0  |     return TRUE;  | 
765  | 0  | }  | 
766  |  |  | 
767  |  | }  // namespace  | 
768  |  |  | 
769  |  | //----------------------------------------------------------------  | 
770  |  | // Property set API  | 
771  |  | //----------------------------------------------------------------  | 
772  |  |  | 
// Helper for the property set API below: stores U_ILLEGAL_ARGUMENT_ERROR in
// the given error code variable and returns *this from the enclosing function.
// Because it expands to a `return *this;`, it can only be used inside member
// functions whose return type accepts *this.
#define FAIL(ec) UPRV_BLOCK_MACRO_BEGIN { \
    ec=U_ILLEGAL_ARGUMENT_ERROR; \
    return *this; \
} UPRV_BLOCK_MACRO_END
777  |  |  | 
778  |  | UnicodeSet&  | 
779  | 0  | UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) { | 
780  | 0  |     if (U_FAILURE(ec) || isFrozen()) { return *this; } | 
781  | 0  |     if (prop == UCHAR_GENERAL_CATEGORY_MASK) { | 
782  | 0  |         const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);  | 
783  | 0  |         applyFilter(generalCategoryMaskFilter, &value, inclusions, ec);  | 
784  | 0  |     } else if (prop == UCHAR_SCRIPT_EXTENSIONS) { | 
785  | 0  |         const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);  | 
786  | 0  |         UScriptCode script = (UScriptCode)value;  | 
787  | 0  |         applyFilter(scriptExtensionsFilter, &script, inclusions, ec);  | 
788  | 0  |     } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) { | 
789  | 0  |         if (value == 0 || value == 1) { | 
790  | 0  |             const USet *set = u_getBinaryPropertySet(prop, &ec);  | 
791  | 0  |             if (U_FAILURE(ec)) { return *this; } | 
792  | 0  |             copyFrom(*UnicodeSet::fromUSet(set), TRUE);  | 
793  | 0  |             if (value == 0) { | 
794  | 0  |                 complement();  | 
795  | 0  |             }  | 
796  | 0  |         } else { | 
797  | 0  |             clear();  | 
798  | 0  |         }  | 
799  | 0  |     } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) { | 
800  | 0  |         const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);  | 
801  | 0  |         IntPropertyContext c = {prop, value}; | 
802  | 0  |         applyFilter(intPropertyFilter, &c, inclusions, ec);  | 
803  | 0  |     } else { | 
804  | 0  |         ec = U_ILLEGAL_ARGUMENT_ERROR;  | 
805  | 0  |     }  | 
806  | 0  |     return *this;  | 
807  | 0  | }  | 
808  |  |  | 
/**
 * Replaces the contents of this set according to a property alias and a
 * property value alias.
 *
 * If value is non-empty, prop names a property and value names one of its
 * values.  Binary, integer and mask properties are resolved through
 * u_getPropertyValueEnum() (with numeric values 0..255 also accepted for the
 * combining-class properties); Numeric_Value, Name, Age and
 * Script_Extensions are handled specially; all other properties fail.
 *
 * If value is empty, prop is tried in turn as a General_Category value, a
 * Script value, a binary property name, and finally as one of the special
 * names held in the ANY, ASCII and ASSIGNED constants.
 *
 * @param prop  property alias (or, with an empty value, a value alias)
 * @param value property value alias; may be empty
 * @param ec    in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR (via the
 *              FAIL macro) when the names cannot be resolved, and to
 *              U_MEMORY_ALLOCATION_ERROR if the set is bogus at the end
 * @return *this; unchanged if ec was already a failure or this set is frozen
 */
UnicodeSet&
UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
                               const UnicodeString& value,
                               UErrorCode& ec) {
    if (U_FAILURE(ec) || isFrozen()) return *this;

    // prop and value used to be converted to char * using the default
    // converter instead of the invariant conversion.
    // This should not be necessary because all Unicode property and value
    // names use only invariant characters.
    // If there are any variant characters, then we won't find them anyway.
    // Checking first avoids assertion failures in the conversion.
    if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
        !uprv_isInvariantUString(value.getBuffer(), value.length())
    ) {
        FAIL(ec);
    }
    // Invariant-character (char *) copies of the two names, as needed by the
    // C property-name lookup functions below.
    CharString pname, vname;
    pname.appendInvariantChars(prop, ec);
    vname.appendInvariantChars(value, ec);
    if (U_FAILURE(ec)) return *this;

    // p and v are the resolved property and value passed to
    // applyIntPropertyValue() at the end; invert requests a final complement.
    UProperty p;
    int32_t v;
    UBool invert = FALSE;

    if (value.length() > 0) {
        p = u_getPropertyEnum(pname.data());
        if (p == UCHAR_INVALID_CODE) FAIL(ec);

        // Treat gc as gcm
        if (p == UCHAR_GENERAL_CATEGORY) {
            p = UCHAR_GENERAL_CATEGORY_MASK;
        }

        // Binary, integer and mask properties: look the value up by name.
        if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
            (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
            (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
            v = u_getPropertyValueEnum(p, vname.data());
            if (v == UCHAR_INVALID_CODE) {
                // Handle numeric CCC
                if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
                    p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
                    p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
                    char* end;
                    double val = uprv_strtod(vname.data(), &end);
                    // Anything between 0 and 255 is valid even if unused.
                    // Cast double->int only after range check.
                    // We catch NaN here because comparing it with both 0 and 255 will be false
                    // (as are all comparisons with NaN).
                    if (*end != 0 || !(0 <= val && val <= 255) ||
                            (v = (int32_t)val) != val) {
                        // non-integral value or outside 0..255, or trailing junk
                        FAIL(ec);
                    }
                } else {
                    FAIL(ec);
                }
            }
        }

        else {

            // Properties outside the binary/int/mask ranges.  Every case
            // either returns (directly or through FAIL) or breaks out to the
            // applyIntPropertyValue() call at the end of the function.
            switch (p) {
            case UCHAR_NUMERIC_VALUE:
                {
                    char* end;
                    double val = uprv_strtod(vname.data(), &end);
                    if (*end != 0) {
                        // trailing characters after the number
                        FAIL(ec);
                    }
                    applyFilter(numericValueFilter, &val,
                                CharacterProperties::getInclusionsForProperty(p, ec), ec);
                    return *this;
                }
            case UCHAR_NAME:
                {
                    // Must munge name, since u_charFromName() does not do
                    // 'loose' matching.
                    char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
                    if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
                    UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
                    if (U_SUCCESS(ec)) {
                        // The set becomes exactly the one named character.
                        clear();
                        add(ch);
                        return *this;
                    } else {
                        // FAIL replaces whatever error u_charFromName() set
                        // with U_ILLEGAL_ARGUMENT_ERROR.
                        FAIL(ec);
                    }
                }
            case UCHAR_UNICODE_1_NAME:
                // ICU 49 deprecates the Unicode_1_Name property APIs.
                FAIL(ec);
            case UCHAR_AGE:
                {
                    // Must munge name, since u_versionFromString() does not do
                    // 'loose' matching.
                    char buf[128];
                    if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
                    UVersionInfo version;
                    u_versionFromString(version, buf);
                    applyFilter(versionFilter, &version,
                                CharacterProperties::getInclusionsForProperty(p, ec), ec);
                    return *this;
                }
            case UCHAR_SCRIPT_EXTENSIONS:
                // The value of Script_Extensions is named like a Script value.
                v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
                if (v == UCHAR_INVALID_CODE) {
                    FAIL(ec);
                }
                // Leave the switch and continue with the
                // applyIntPropertyValue() call at the end of the function.
                break;
            default:
                // p is a non-binary, non-enumerated property that we
                // don't support (yet).
                FAIL(ec);
            }
        }
    }

    else {
        // value is empty.  Interpret as General Category, Script, or
        // Binary property.
        p = UCHAR_GENERAL_CATEGORY_MASK;
        v = u_getPropertyValueEnum(p, pname.data());
        if (v == UCHAR_INVALID_CODE) {
            p = UCHAR_SCRIPT;
            v = u_getPropertyValueEnum(p, pname.data());
            if (v == UCHAR_INVALID_CODE) {
                p = u_getPropertyEnum(pname.data());
                if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
                    // A bare binary property name selects its "true" set.
                    v = 1;
                } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
                    set(MIN_VALUE, MAX_VALUE);
                    return *this;
                } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
                    set(0, 0x7F);
                    return *this;
                } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
                    // [:Assigned:]=[:^Cn:]
                    p = UCHAR_GENERAL_CATEGORY_MASK;
                    v = U_GC_CN_MASK;
                    invert = TRUE;
                } else {
                    FAIL(ec);
                }
            }
        }
    }

    applyIntPropertyValue(p, v, ec);
    if(invert) {
        complement();
    }

    if (isBogus() && U_SUCCESS(ec)) {
        // We likely ran out of memory. AHHH!
        ec = U_MEMORY_ALLOCATION_ERROR;
    }
    return *this;
}
970  |  |  | 
971  |  | //----------------------------------------------------------------  | 
972  |  | // Property set patterns  | 
973  |  | //----------------------------------------------------------------  | 
974  |  |  | 
975  |  | /**  | 
976  |  |  * Return true if the given position, in the given pattern, appears  | 
977  |  |  * to be the start of a property set pattern.  | 
978  |  |  */  | 
979  |  | UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,  | 
980  | 0  |                                            int32_t pos) { | 
981  |  |     // Patterns are at least 5 characters long  | 
982  | 0  |     if ((pos+5) > pattern.length()) { | 
983  | 0  |         return FALSE;  | 
984  | 0  |     }  | 
985  |  |  | 
986  |  |     // Look for an opening [:, [:^, \p, or \P  | 
987  | 0  |     return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);  | 
988  | 0  | }  | 
989  |  |  | 
990  |  | /**  | 
991  |  |  * Return true if the given iterator appears to point at a  | 
992  |  |  * property pattern.  Regardless of the result, return with the  | 
993  |  |  * iterator unchanged.  | 
994  |  |  * @param chars iterator over the pattern characters.  Upon return  | 
995  |  |  * it will be unchanged.  | 
996  |  |  * @param iterOpts RuleCharacterIterator options  | 
997  |  |  */  | 
998  |  | UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,  | 
999  | 0  |                                            int32_t iterOpts) { | 
1000  |  |     // NOTE: literal will always be FALSE, because we don't parse escapes.  | 
1001  | 0  |     UBool result = FALSE, literal;  | 
1002  | 0  |     UErrorCode ec = U_ZERO_ERROR;  | 
1003  | 0  |     iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;  | 
1004  | 0  |     RuleCharacterIterator::Pos pos;  | 
1005  | 0  |     chars.getPos(pos);  | 
1006  | 0  |     UChar32 c = chars.next(iterOpts, literal, ec);  | 
1007  | 0  |     if (c == u'[' || c == u'\\') { | 
1008  | 0  |         UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,  | 
1009  | 0  |                                literal, ec);  | 
1010  | 0  |         result = (c == u'[') ? (d == u':') :  | 
1011  | 0  |                                (d == u'N' || d == u'p' || d == u'P');  | 
1012  | 0  |     }  | 
1013  | 0  |     chars.setPos(pos);  | 
1014  | 0  |     return result && U_SUCCESS(ec);  | 
1015  | 0  | }  | 
1016  |  |  | 
1017  |  | /**  | 
1018  |  |  * Parse the given property pattern at the given parse position.  | 
1019  |  |  */  | 
1020  |  | UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,  | 
1021  |  |                                              ParsePosition& ppos,  | 
1022  | 0  |                                              UErrorCode &ec) { | 
1023  | 0  |     int32_t pos = ppos.getIndex();  | 
1024  |  | 
  | 
1025  | 0  |     UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} | 
1026  | 0  |     UBool isName = FALSE; // true for \N{pat}, o/w false | 
1027  | 0  |     UBool invert = FALSE;  | 
1028  |  | 
  | 
1029  | 0  |     if (U_FAILURE(ec)) return *this;  | 
1030  |  |  | 
1031  |  |     // Minimum length is 5 characters, e.g. \p{L} | 
1032  | 0  |     if ((pos+5) > pattern.length()) { | 
1033  | 0  |         FAIL(ec);  | 
1034  | 0  |     }  | 
1035  |  |  | 
1036  |  |     // On entry, ppos should point to one of the following locations:  | 
1037  |  |     // Look for an opening [:, [:^, \p, or \P  | 
1038  | 0  |     if (isPOSIXOpen(pattern, pos)) { | 
1039  | 0  |         posix = TRUE;  | 
1040  | 0  |         pos += 2;  | 
1041  | 0  |         pos = ICU_Utility::skipWhitespace(pattern, pos);  | 
1042  | 0  |         if (pos < pattern.length() && pattern.charAt(pos) == u'^') { | 
1043  | 0  |             ++pos;  | 
1044  | 0  |             invert = TRUE;  | 
1045  | 0  |         }  | 
1046  | 0  |     } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) { | 
1047  | 0  |         UChar c = pattern.charAt(pos+1);  | 
1048  | 0  |         invert = (c == u'P');  | 
1049  | 0  |         isName = (c == u'N');  | 
1050  | 0  |         pos += 2;  | 
1051  | 0  |         pos = ICU_Utility::skipWhitespace(pattern, pos);  | 
1052  | 0  |         if (pos == pattern.length() || pattern.charAt(pos++) != u'{') { | 
1053  |  |             // Syntax error; "\p" or "\P" not followed by "{" | 
1054  | 0  |             FAIL(ec);  | 
1055  | 0  |         }  | 
1056  | 0  |     } else { | 
1057  |  |         // Open delimiter not seen  | 
1058  | 0  |         FAIL(ec);  | 
1059  | 0  |     }  | 
1060  |  |  | 
1061  |  |     // Look for the matching close delimiter, either :] or }  | 
1062  | 0  |     int32_t close;  | 
1063  | 0  |     if (posix) { | 
1064  | 0  |       close = pattern.indexOf(u":]", 2, pos);  | 
1065  | 0  |     } else { | 
1066  | 0  |       close = pattern.indexOf(u'}', pos);  | 
1067  | 0  |     }  | 
1068  | 0  |     if (close < 0) { | 
1069  |  |         // Syntax error; close delimiter missing  | 
1070  | 0  |         FAIL(ec);  | 
1071  | 0  |     }  | 
1072  |  |  | 
1073  |  |     // Look for an '=' sign.  If this is present, we will parse a  | 
1074  |  |     // medium \p{gc=Cf} or long \p{GeneralCategory=Format} | 
1075  |  |     // pattern.  | 
1076  | 0  |     int32_t equals = pattern.indexOf(u'=', pos);  | 
1077  | 0  |     UnicodeString propName, valueName;  | 
1078  | 0  |     if (equals >= 0 && equals < close && !isName) { | 
1079  |  |         // Equals seen; parse medium/long pattern  | 
1080  | 0  |         pattern.extractBetween(pos, equals, propName);  | 
1081  | 0  |         pattern.extractBetween(equals+1, close, valueName);  | 
1082  | 0  |     }  | 
1083  |  |  | 
1084  | 0  |     else { | 
1085  |  |         // Handle case where no '=' is seen, and \N{} | 
1086  | 0  |         pattern.extractBetween(pos, close, propName);  | 
1087  |  |               | 
1088  |  |         // Handle \N{name} | 
1089  | 0  |         if (isName) { | 
1090  |  |             // This is a little inefficient since it means we have to  | 
1091  |  |             // parse NAME_PROP back to UCHAR_NAME even though we already  | 
1092  |  |             // know it's UCHAR_NAME.  If we refactor the API to  | 
1093  |  |             // support args of (UProperty, char*) then we can remove  | 
1094  |  |             // NAME_PROP and make this a little more efficient.  | 
1095  | 0  |             valueName = propName;  | 
1096  | 0  |             propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);  | 
1097  | 0  |         }  | 
1098  | 0  |     }  | 
1099  |  | 
  | 
1100  | 0  |     applyPropertyAlias(propName, valueName, ec);  | 
1101  |  | 
  | 
1102  | 0  |     if (U_SUCCESS(ec)) { | 
1103  | 0  |         if (invert) { | 
1104  | 0  |             complement();  | 
1105  | 0  |         }  | 
1106  |  |               | 
1107  |  |         // Move to the limit position after the close delimiter if the  | 
1108  |  |         // parse succeeded.  | 
1109  | 0  |         ppos.setIndex(close + (posix ? 2 : 1));  | 
1110  | 0  |     }  | 
1111  |  | 
  | 
1112  | 0  |     return *this;  | 
1113  | 0  | }  | 
1114  |  |  | 
1115  |  | /**  | 
1116  |  |  * Parse a property pattern.  | 
1117  |  |  * @param chars iterator over the pattern characters.  Upon return  | 
1118  |  |  * it will be advanced to the first character after the parsed  | 
1119  |  |  * pattern, or the end of the iteration if all characters are  | 
1120  |  |  * parsed.  | 
1121  |  |  * @param rebuiltPat the pattern that was parsed, rebuilt or  | 
1122  |  |  * copied from the input pattern, as appropriate.  | 
1123  |  |  */  | 
1124  |  | void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,  | 
1125  |  |                                       UnicodeString& rebuiltPat,  | 
1126  | 0  |                                       UErrorCode& ec) { | 
1127  | 0  |     if (U_FAILURE(ec)) return;  | 
1128  | 0  |     UnicodeString pattern;  | 
1129  | 0  |     chars.lookahead(pattern);  | 
1130  | 0  |     ParsePosition pos(0);  | 
1131  | 0  |     applyPropertyPattern(pattern, pos, ec);  | 
1132  | 0  |     if (U_FAILURE(ec)) return;  | 
1133  | 0  |     if (pos.getIndex() == 0) { | 
1134  |  |         // syntaxError(chars, "Invalid property pattern");  | 
1135  | 0  |         ec = U_MALFORMED_SET;  | 
1136  | 0  |         return;  | 
1137  | 0  |     }  | 
1138  | 0  |     chars.jumpahead(pos.getIndex());  | 
1139  | 0  |     rebuiltPat.append(pattern, 0, pos.getIndex());  | 
1140  | 0  | }  | 
1141  |  |  | 
1142  |  | U_NAMESPACE_END  |