/src/icu/source/common/static_unicode_sets.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2018 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | |
4 | | #include "unicode/utypes.h" |
5 | | |
6 | | #if !UCONFIG_NO_FORMATTING |
7 | | |
8 | | // Allow implicit conversion from char16_t* to UnicodeString for this file: |
9 | | // Helpful in toString methods and elsewhere. |
10 | | #define UNISTR_FROM_STRING_EXPLICIT |
11 | | |
12 | | #include "static_unicode_sets.h" |
13 | | #include "umutex.h" |
14 | | #include "ucln_cmn.h" |
15 | | #include "unicode/uniset.h" |
16 | | #include "uresimp.h" |
17 | | #include "cstring.h" |
18 | | #include "uassert.h" |
19 | | |
20 | | using namespace icu; |
21 | | using namespace icu::unisets; |
22 | | |
23 | | |
24 | | namespace { |
25 | | |
26 | | UnicodeSet* gUnicodeSets[UNISETS_KEY_COUNT] = {}; |
27 | | |
28 | | // Save the empty instance in static memory to have well-defined behavior if a |
29 | | // regular UnicodeSet cannot be allocated. |
30 | | alignas(UnicodeSet) |
31 | | char gEmptyUnicodeSet[sizeof(UnicodeSet)]; |
32 | | |
33 | | // Whether the gEmptyUnicodeSet is initialized and ready to use. |
34 | | UBool gEmptyUnicodeSetInitialized = FALSE; |
35 | | |
36 | 0 | inline UnicodeSet* getImpl(Key key) { |
37 | 0 | UnicodeSet* candidate = gUnicodeSets[key]; |
38 | 0 | if (candidate == nullptr) { |
39 | 0 | return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet); |
40 | 0 | } |
41 | 0 | return candidate; |
42 | 0 | } |
43 | | |
44 | 0 | UnicodeSet* computeUnion(Key k1, Key k2) { |
45 | 0 | UnicodeSet* result = new UnicodeSet(); |
46 | 0 | if (result == nullptr) { |
47 | 0 | return nullptr; |
48 | 0 | } |
49 | 0 | result->addAll(*getImpl(k1)); |
50 | 0 | result->addAll(*getImpl(k2)); |
51 | 0 | result->freeze(); |
52 | 0 | return result; |
53 | 0 | } |
54 | | |
55 | 0 | UnicodeSet* computeUnion(Key k1, Key k2, Key k3) { |
56 | 0 | UnicodeSet* result = new UnicodeSet(); |
57 | 0 | if (result == nullptr) { |
58 | 0 | return nullptr; |
59 | 0 | } |
60 | 0 | result->addAll(*getImpl(k1)); |
61 | 0 | result->addAll(*getImpl(k2)); |
62 | 0 | result->addAll(*getImpl(k3)); |
63 | 0 | result->freeze(); |
64 | 0 | return result; |
65 | 0 | } |
66 | | |
67 | | |
68 | 0 | void saveSet(Key key, const UnicodeString& unicodeSetPattern, UErrorCode& status) { |
69 | | // assert unicodeSets.get(key) == null; |
70 | 0 | gUnicodeSets[key] = new UnicodeSet(unicodeSetPattern, status); |
71 | 0 | } |
72 | | |
73 | | class ParseDataSink : public ResourceSink { |
74 | | public: |
75 | 0 | void put(const char* key, ResourceValue& value, UBool /*noFallback*/, UErrorCode& status) U_OVERRIDE { |
76 | 0 | ResourceTable contextsTable = value.getTable(status); |
77 | 0 | if (U_FAILURE(status)) { return; } |
78 | 0 | for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) { |
79 | 0 | if (uprv_strcmp(key, "date") == 0) { |
80 | | // ignore |
81 | 0 | } else { |
82 | 0 | ResourceTable strictnessTable = value.getTable(status); |
83 | 0 | if (U_FAILURE(status)) { return; } |
84 | 0 | for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) { |
85 | 0 | bool isLenient = (uprv_strcmp(key, "lenient") == 0); |
86 | 0 | ResourceArray array = value.getArray(status); |
87 | 0 | if (U_FAILURE(status)) { return; } |
88 | 0 | for (int k = 0; k < array.getSize(); k++) { |
89 | 0 | array.getValue(k, value); |
90 | 0 | UnicodeString str = value.getUnicodeString(status); |
91 | 0 | if (U_FAILURE(status)) { return; } |
92 | | // There is both lenient and strict data for comma/period, |
93 | | // but not for any of the other symbols. |
94 | 0 | if (str.indexOf(u'.') != -1) { |
95 | 0 | saveSet(isLenient ? PERIOD : STRICT_PERIOD, str, status); |
96 | 0 | } else if (str.indexOf(u',') != -1) { |
97 | 0 | saveSet(isLenient ? COMMA : STRICT_COMMA, str, status); |
98 | 0 | } else if (str.indexOf(u'+') != -1) { |
99 | 0 | saveSet(PLUS_SIGN, str, status); |
100 | 0 | } else if (str.indexOf(u'-') != -1) { |
101 | 0 | saveSet(MINUS_SIGN, str, status); |
102 | 0 | } else if (str.indexOf(u'$') != -1) { |
103 | 0 | saveSet(DOLLAR_SIGN, str, status); |
104 | 0 | } else if (str.indexOf(u'£') != -1) { |
105 | 0 | saveSet(POUND_SIGN, str, status); |
106 | 0 | } else if (str.indexOf(u'₹') != -1) { |
107 | 0 | saveSet(RUPEE_SIGN, str, status); |
108 | 0 | } else if (str.indexOf(u'¥') != -1) { |
109 | 0 | saveSet(YEN_SIGN, str, status); |
110 | 0 | } else if (str.indexOf(u'₩') != -1) { |
111 | 0 | saveSet(WON_SIGN, str, status); |
112 | 0 | } else if (str.indexOf(u'%') != -1) { |
113 | 0 | saveSet(PERCENT_SIGN, str, status); |
114 | 0 | } else if (str.indexOf(u'‰') != -1) { |
115 | 0 | saveSet(PERMILLE_SIGN, str, status); |
116 | 0 | } else if (str.indexOf(u'’') != -1) { |
117 | 0 | saveSet(APOSTROPHE_SIGN, str, status); |
118 | 0 | } else { |
119 | | // Unknown class of parse lenients |
120 | | // TODO(ICU-20428): Make ICU automatically accept new classes? |
121 | 0 | U_ASSERT(FALSE); |
122 | 0 | } |
123 | 0 | if (U_FAILURE(status)) { return; } |
124 | 0 | } |
125 | 0 | } |
126 | 0 | } |
127 | 0 | } |
128 | 0 | } |
129 | | }; |
130 | | |
131 | | |
132 | | icu::UInitOnce gNumberParseUniSetsInitOnce = U_INITONCE_INITIALIZER; |
133 | | |
134 | 0 | UBool U_CALLCONV cleanupNumberParseUniSets() { |
135 | 0 | if (gEmptyUnicodeSetInitialized) { |
136 | 0 | reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->~UnicodeSet(); |
137 | 0 | gEmptyUnicodeSetInitialized = FALSE; |
138 | 0 | } |
139 | 0 | for (int32_t i = 0; i < UNISETS_KEY_COUNT; i++) { |
140 | 0 | delete gUnicodeSets[i]; |
141 | 0 | gUnicodeSets[i] = nullptr; |
142 | 0 | } |
143 | 0 | gNumberParseUniSetsInitOnce.reset(); |
144 | 0 | return TRUE; |
145 | 0 | } |
146 | | |
147 | 0 | void U_CALLCONV initNumberParseUniSets(UErrorCode& status) { |
148 | 0 | ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS, cleanupNumberParseUniSets); |
149 | | |
150 | | // Initialize the empty instance for well-defined fallback behavior |
151 | 0 | new(gEmptyUnicodeSet) UnicodeSet(); |
152 | 0 | reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->freeze(); |
153 | 0 | gEmptyUnicodeSetInitialized = TRUE; |
154 | | |
155 | | // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309. |
156 | | // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property). |
157 | 0 | gUnicodeSets[DEFAULT_IGNORABLES] = new UnicodeSet( |
158 | 0 | u"[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status); |
159 | 0 | gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(u"[[:Bidi_Control:]]", status); |
160 | |
|
161 | 0 | LocalUResourceBundlePointer rb(ures_open(nullptr, "root", &status)); |
162 | 0 | if (U_FAILURE(status)) { return; } |
163 | 0 | ParseDataSink sink; |
164 | 0 | ures_getAllItemsWithFallback(rb.getAlias(), "parse", sink, status); |
165 | 0 | if (U_FAILURE(status)) { return; } |
166 | | |
167 | | // NOTE: It is OK for these assertions to fail if there was a no-data build. |
168 | 0 | U_ASSERT(gUnicodeSets[COMMA] != nullptr); |
169 | 0 | U_ASSERT(gUnicodeSets[STRICT_COMMA] != nullptr); |
170 | 0 | U_ASSERT(gUnicodeSets[PERIOD] != nullptr); |
171 | 0 | U_ASSERT(gUnicodeSets[STRICT_PERIOD] != nullptr); |
172 | 0 | U_ASSERT(gUnicodeSets[APOSTROPHE_SIGN] != nullptr); |
173 | |
|
174 | 0 | LocalPointer<UnicodeSet> otherGrouping(new UnicodeSet( |
175 | 0 | u"[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]", |
176 | 0 | status |
177 | 0 | ), status); |
178 | 0 | if (U_FAILURE(status)) { return; } |
179 | 0 | otherGrouping->addAll(*gUnicodeSets[APOSTROPHE_SIGN]); |
180 | 0 | gUnicodeSets[OTHER_GROUPING_SEPARATORS] = otherGrouping.orphan(); |
181 | 0 | gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS); |
182 | 0 | gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion( |
183 | 0 | STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS); |
184 | |
|
185 | 0 | U_ASSERT(gUnicodeSets[MINUS_SIGN] != nullptr); |
186 | 0 | U_ASSERT(gUnicodeSets[PLUS_SIGN] != nullptr); |
187 | 0 | U_ASSERT(gUnicodeSets[PERCENT_SIGN] != nullptr); |
188 | 0 | U_ASSERT(gUnicodeSets[PERMILLE_SIGN] != nullptr); |
189 | |
|
190 | 0 | gUnicodeSets[INFINITY_SIGN] = new UnicodeSet(u"[∞]", status); |
191 | 0 | if (U_FAILURE(status)) { return; } |
192 | | |
193 | 0 | U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr); |
194 | 0 | U_ASSERT(gUnicodeSets[POUND_SIGN] != nullptr); |
195 | 0 | U_ASSERT(gUnicodeSets[RUPEE_SIGN] != nullptr); |
196 | 0 | U_ASSERT(gUnicodeSets[YEN_SIGN] != nullptr); |
197 | 0 | U_ASSERT(gUnicodeSets[WON_SIGN] != nullptr); |
198 | |
|
199 | 0 | gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status); |
200 | 0 | if (U_FAILURE(status)) { return; } |
201 | 0 | gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS); |
202 | 0 | gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS); |
203 | |
|
204 | 0 | for (auto* uniset : gUnicodeSets) { |
205 | 0 | if (uniset != nullptr) { |
206 | 0 | uniset->freeze(); |
207 | 0 | } |
208 | 0 | } |
209 | 0 | } |
210 | | |
211 | | } |
212 | | |
213 | 0 | const UnicodeSet* unisets::get(Key key) { |
214 | 0 | UErrorCode localStatus = U_ZERO_ERROR; |
215 | 0 | umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, localStatus); |
216 | 0 | if (U_FAILURE(localStatus)) { |
217 | 0 | return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet); |
218 | 0 | } |
219 | 0 | return getImpl(key); |
220 | 0 | } |
221 | | |
222 | 0 | Key unisets::chooseFrom(UnicodeString str, Key key1) { |
223 | 0 | return get(key1)->contains(str) ? key1 : NONE; |
224 | 0 | } |
225 | | |
226 | 0 | Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) { |
227 | 0 | return get(key1)->contains(str) ? key1 : chooseFrom(str, key2); |
228 | 0 | } |
229 | | |
230 | | //Key unisets::chooseCurrency(UnicodeString str) { |
231 | | // if (get(DOLLAR_SIGN)->contains(str)) { |
232 | | // return DOLLAR_SIGN; |
233 | | // } else if (get(POUND_SIGN)->contains(str)) { |
234 | | // return POUND_SIGN; |
235 | | // } else if (get(RUPEE_SIGN)->contains(str)) { |
236 | | // return RUPEE_SIGN; |
237 | | // } else if (get(YEN_SIGN)->contains(str)) { |
238 | | // return YEN_SIGN; |
239 | | // } else { |
240 | | // return NONE; |
241 | | // } |
242 | | //} |
243 | | |
244 | | |
245 | | #endif /* #if !UCONFIG_NO_FORMATTING */ |