/src/icu/source/common/static_unicode_sets.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2018 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  |  | 
4  |  | #include "unicode/utypes.h"  | 
5  |  |  | 
6  |  | #if !UCONFIG_NO_FORMATTING  | 
7  |  |  | 
8  |  | // Allow implicit conversion from char16_t* to UnicodeString for this file:  | 
9  |  | // Helpful in toString methods and elsewhere.  | 
10  |  | #define UNISTR_FROM_STRING_EXPLICIT  | 
11  |  |  | 
12  |  | #include "static_unicode_sets.h"  | 
13  |  | #include "umutex.h"  | 
14  |  | #include "ucln_cmn.h"  | 
15  |  | #include "unicode/uniset.h"  | 
16  |  | #include "uresimp.h"  | 
17  |  | #include "cstring.h"  | 
18  |  | #include "uassert.h"  | 
19  |  |  | 
20  |  | using namespace icu;  | 
21  |  | using namespace icu::unisets;  | 
22  |  |  | 
23  |  |  | 
24  |  | namespace { | 
25  |  |  | 
26  |  | UnicodeSet* gUnicodeSets[UNISETS_KEY_COUNT] = {}; | 
27  |  |  | 
28  |  | // Save the empty instance in static memory to have well-defined behavior if a  | 
29  |  | // regular UnicodeSet cannot be allocated.  | 
30  |  | alignas(UnicodeSet)  | 
31  |  | char gEmptyUnicodeSet[sizeof(UnicodeSet)];  | 
32  |  |  | 
33  |  | // Whether the gEmptyUnicodeSet is initialized and ready to use.  | 
34  |  | UBool gEmptyUnicodeSetInitialized = FALSE;  | 
35  |  |  | 
36  | 0  | inline UnicodeSet* getImpl(Key key) { | 
37  | 0  |     UnicodeSet* candidate = gUnicodeSets[key];  | 
38  | 0  |     if (candidate == nullptr) { | 
39  | 0  |         return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);  | 
40  | 0  |     }  | 
41  | 0  |     return candidate;  | 
42  | 0  | }  | 
43  |  |  | 
44  | 0  | UnicodeSet* computeUnion(Key k1, Key k2) { | 
45  | 0  |     UnicodeSet* result = new UnicodeSet();  | 
46  | 0  |     if (result == nullptr) { | 
47  | 0  |         return nullptr;  | 
48  | 0  |     }  | 
49  | 0  |     result->addAll(*getImpl(k1));  | 
50  | 0  |     result->addAll(*getImpl(k2));  | 
51  | 0  |     result->freeze();  | 
52  | 0  |     return result;  | 
53  | 0  | }  | 
54  |  |  | 
55  | 0  | UnicodeSet* computeUnion(Key k1, Key k2, Key k3) { | 
56  | 0  |     UnicodeSet* result = new UnicodeSet();  | 
57  | 0  |     if (result == nullptr) { | 
58  | 0  |         return nullptr;  | 
59  | 0  |     }  | 
60  | 0  |     result->addAll(*getImpl(k1));  | 
61  | 0  |     result->addAll(*getImpl(k2));  | 
62  | 0  |     result->addAll(*getImpl(k3));  | 
63  | 0  |     result->freeze();  | 
64  | 0  |     return result;  | 
65  | 0  | }  | 
66  |  |  | 
67  |  |  | 
68  | 0  | void saveSet(Key key, const UnicodeString& unicodeSetPattern, UErrorCode& status) { | 
69  |  |     // assert unicodeSets.get(key) == null;  | 
70  | 0  |     gUnicodeSets[key] = new UnicodeSet(unicodeSetPattern, status);  | 
71  | 0  | }  | 
72  |  |  | 
73  |  | class ParseDataSink : public ResourceSink { | 
74  |  |   public:  | 
75  | 0  |     void put(const char* key, ResourceValue& value, UBool /*noFallback*/, UErrorCode& status) U_OVERRIDE { | 
76  | 0  |         ResourceTable contextsTable = value.getTable(status);  | 
77  | 0  |         if (U_FAILURE(status)) { return; } | 
78  | 0  |         for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) { | 
79  | 0  |             if (uprv_strcmp(key, "date") == 0) { | 
80  |  |                 // ignore  | 
81  | 0  |             } else { | 
82  | 0  |                 ResourceTable strictnessTable = value.getTable(status);  | 
83  | 0  |                 if (U_FAILURE(status)) { return; } | 
84  | 0  |                 for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) { | 
85  | 0  |                     bool isLenient = (uprv_strcmp(key, "lenient") == 0);  | 
86  | 0  |                     ResourceArray array = value.getArray(status);  | 
87  | 0  |                     if (U_FAILURE(status)) { return; } | 
88  | 0  |                     for (int k = 0; k < array.getSize(); k++) { | 
89  | 0  |                         array.getValue(k, value);  | 
90  | 0  |                         UnicodeString str = value.getUnicodeString(status);  | 
91  | 0  |                         if (U_FAILURE(status)) { return; } | 
92  |  |                         // There is both lenient and strict data for comma/period,  | 
93  |  |                         // but not for any of the other symbols.  | 
94  | 0  |                         if (str.indexOf(u'.') != -1) { | 
95  | 0  |                             saveSet(isLenient ? PERIOD : STRICT_PERIOD, str, status);  | 
96  | 0  |                         } else if (str.indexOf(u',') != -1) { | 
97  | 0  |                             saveSet(isLenient ? COMMA : STRICT_COMMA, str, status);  | 
98  | 0  |                         } else if (str.indexOf(u'+') != -1) { | 
99  | 0  |                             saveSet(PLUS_SIGN, str, status);  | 
100  | 0  |                         } else if (str.indexOf(u'-') != -1) { | 
101  | 0  |                             saveSet(MINUS_SIGN, str, status);  | 
102  | 0  |                         } else if (str.indexOf(u'$') != -1) { | 
103  | 0  |                             saveSet(DOLLAR_SIGN, str, status);  | 
104  | 0  |                         } else if (str.indexOf(u'£') != -1) { | 
105  | 0  |                             saveSet(POUND_SIGN, str, status);  | 
106  | 0  |                         } else if (str.indexOf(u'₹') != -1) { | 
107  | 0  |                             saveSet(RUPEE_SIGN, str, status);  | 
108  | 0  |                         } else if (str.indexOf(u'¥') != -1) { | 
109  | 0  |                             saveSet(YEN_SIGN, str, status);  | 
110  | 0  |                         } else if (str.indexOf(u'₩') != -1) { | 
111  | 0  |                             saveSet(WON_SIGN, str, status);  | 
112  | 0  |                         } else if (str.indexOf(u'%') != -1) { | 
113  | 0  |                             saveSet(PERCENT_SIGN, str, status);  | 
114  | 0  |                         } else if (str.indexOf(u'‰') != -1) { | 
115  | 0  |                             saveSet(PERMILLE_SIGN, str, status);  | 
116  | 0  |                         } else if (str.indexOf(u'’') != -1) { | 
117  | 0  |                             saveSet(APOSTROPHE_SIGN, str, status);  | 
118  | 0  |                         } else { | 
119  |  |                             // Unknown class of parse lenients  | 
120  |  |                             // TODO(ICU-20428): Make ICU automatically accept new classes?  | 
121  | 0  |                             U_ASSERT(FALSE);  | 
122  | 0  |                         }  | 
123  | 0  |                         if (U_FAILURE(status)) { return; } | 
124  | 0  |                     }  | 
125  | 0  |                 }  | 
126  | 0  |             }  | 
127  | 0  |         }  | 
128  | 0  |     }  | 
129  |  | };  | 
130  |  |  | 
131  |  |  | 
132  |  | icu::UInitOnce gNumberParseUniSetsInitOnce = U_INITONCE_INITIALIZER;  | 
133  |  |  | 
134  | 0  | UBool U_CALLCONV cleanupNumberParseUniSets() { | 
135  | 0  |     if (gEmptyUnicodeSetInitialized) { | 
136  | 0  |         reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->~UnicodeSet();  | 
137  | 0  |         gEmptyUnicodeSetInitialized = FALSE;  | 
138  | 0  |     }  | 
139  | 0  |     for (int32_t i = 0; i < UNISETS_KEY_COUNT; i++) { | 
140  | 0  |         delete gUnicodeSets[i];  | 
141  | 0  |         gUnicodeSets[i] = nullptr;  | 
142  | 0  |     }  | 
143  | 0  |     gNumberParseUniSetsInitOnce.reset();  | 
144  | 0  |     return TRUE;  | 
145  | 0  | }  | 
146  |  |  | 
147  | 0  | void U_CALLCONV initNumberParseUniSets(UErrorCode& status) { | 
148  | 0  |     ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS, cleanupNumberParseUniSets);  | 
149  |  |  | 
150  |  |     // Initialize the empty instance for well-defined fallback behavior  | 
151  | 0  |     new(gEmptyUnicodeSet) UnicodeSet();  | 
152  | 0  |     reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->freeze();  | 
153  | 0  |     gEmptyUnicodeSetInitialized = TRUE;  | 
154  |  |  | 
155  |  |     // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.  | 
156  |  |     // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).  | 
157  | 0  |     gUnicodeSets[DEFAULT_IGNORABLES] = new UnicodeSet(  | 
158  | 0  |             u"[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status);  | 
159  | 0  |     gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(u"[[:Bidi_Control:]]", status);  | 
160  |  | 
  | 
161  | 0  |     LocalUResourceBundlePointer rb(ures_open(nullptr, "root", &status));  | 
162  | 0  |     if (U_FAILURE(status)) { return; } | 
163  | 0  |     ParseDataSink sink;  | 
164  | 0  |     ures_getAllItemsWithFallback(rb.getAlias(), "parse", sink, status);  | 
165  | 0  |     if (U_FAILURE(status)) { return; } | 
166  |  |  | 
167  |  |     // NOTE: It is OK for these assertions to fail if there was a no-data build.  | 
168  | 0  |     U_ASSERT(gUnicodeSets[COMMA] != nullptr);  | 
169  | 0  |     U_ASSERT(gUnicodeSets[STRICT_COMMA] != nullptr);  | 
170  | 0  |     U_ASSERT(gUnicodeSets[PERIOD] != nullptr);  | 
171  | 0  |     U_ASSERT(gUnicodeSets[STRICT_PERIOD] != nullptr);  | 
172  | 0  |     U_ASSERT(gUnicodeSets[APOSTROPHE_SIGN] != nullptr);  | 
173  |  | 
  | 
174  | 0  |     LocalPointer<UnicodeSet> otherGrouping(new UnicodeSet(  | 
175  | 0  |         u"[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]",  | 
176  | 0  |         status  | 
177  | 0  |     ), status);  | 
178  | 0  |     if (U_FAILURE(status)) { return; } | 
179  | 0  |     otherGrouping->addAll(*gUnicodeSets[APOSTROPHE_SIGN]);  | 
180  | 0  |     gUnicodeSets[OTHER_GROUPING_SEPARATORS] = otherGrouping.orphan();  | 
181  | 0  |     gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS);  | 
182  | 0  |     gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion(  | 
183  | 0  |             STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS);  | 
184  |  | 
  | 
185  | 0  |     U_ASSERT(gUnicodeSets[MINUS_SIGN] != nullptr);  | 
186  | 0  |     U_ASSERT(gUnicodeSets[PLUS_SIGN] != nullptr);  | 
187  | 0  |     U_ASSERT(gUnicodeSets[PERCENT_SIGN] != nullptr);  | 
188  | 0  |     U_ASSERT(gUnicodeSets[PERMILLE_SIGN] != nullptr);  | 
189  |  | 
  | 
190  | 0  |     gUnicodeSets[INFINITY_SIGN] = new UnicodeSet(u"[∞]", status);  | 
191  | 0  |     if (U_FAILURE(status)) { return; } | 
192  |  |  | 
193  | 0  |     U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr);  | 
194  | 0  |     U_ASSERT(gUnicodeSets[POUND_SIGN] != nullptr);  | 
195  | 0  |     U_ASSERT(gUnicodeSets[RUPEE_SIGN] != nullptr);  | 
196  | 0  |     U_ASSERT(gUnicodeSets[YEN_SIGN] != nullptr);  | 
197  | 0  |     U_ASSERT(gUnicodeSets[WON_SIGN] != nullptr);  | 
198  |  | 
  | 
199  | 0  |     gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status);  | 
200  | 0  |     if (U_FAILURE(status)) { return; } | 
201  | 0  |     gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);  | 
202  | 0  |     gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS);  | 
203  |  | 
  | 
204  | 0  |     for (auto* uniset : gUnicodeSets) { | 
205  | 0  |         if (uniset != nullptr) { | 
206  | 0  |             uniset->freeze();  | 
207  | 0  |         }  | 
208  | 0  |     }  | 
209  | 0  | }  | 
210  |  |  | 
211  |  | }  | 
212  |  |  | 
213  | 0  | const UnicodeSet* unisets::get(Key key) { | 
214  | 0  |     UErrorCode localStatus = U_ZERO_ERROR;  | 
215  | 0  |     umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, localStatus);  | 
216  | 0  |     if (U_FAILURE(localStatus)) { | 
217  | 0  |         return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);  | 
218  | 0  |     }  | 
219  | 0  |     return getImpl(key);  | 
220  | 0  | }  | 
221  |  |  | 
222  | 0  | Key unisets::chooseFrom(UnicodeString str, Key key1) { | 
223  | 0  |     return get(key1)->contains(str) ? key1 : NONE;  | 
224  | 0  | }  | 
225  |  |  | 
226  | 0  | Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) { | 
227  | 0  |     return get(key1)->contains(str) ? key1 : chooseFrom(str, key2);  | 
228  | 0  | }  | 
229  |  |  | 
230  |  | //Key unisets::chooseCurrency(UnicodeString str) { | 
231  |  | //    if (get(DOLLAR_SIGN)->contains(str)) { | 
232  |  | //        return DOLLAR_SIGN;  | 
233  |  | //    } else if (get(POUND_SIGN)->contains(str)) { | 
234  |  | //        return POUND_SIGN;  | 
235  |  | //    } else if (get(RUPEE_SIGN)->contains(str)) { | 
236  |  | //        return RUPEE_SIGN;  | 
237  |  | //    } else if (get(YEN_SIGN)->contains(str)) { | 
238  |  | //        return YEN_SIGN;  | 
239  |  | //    } else { | 
240  |  | //        return NONE;  | 
241  |  | //    }  | 
242  |  | //}  | 
243  |  |  | 
244  |  |  | 
245  |  | #endif /* #if !UCONFIG_NO_FORMATTING */  |