Coverage Report

Created: 2023-02-22 06:51

/src/icu/source/common/static_unicode_sets.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2018 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
4
#include "unicode/utypes.h"
5
6
#if !UCONFIG_NO_FORMATTING
7
8
// Allow implicit conversion from char16_t* to UnicodeString for this file:
9
// Helpful in toString methods and elsewhere.
10
#define UNISTR_FROM_STRING_EXPLICIT
11
12
#include "static_unicode_sets.h"
13
#include "umutex.h"
14
#include "ucln_cmn.h"
15
#include "unicode/uniset.h"
16
#include "uresimp.h"
17
#include "cstring.h"
18
#include "uassert.h"
19
20
using namespace icu;
21
using namespace icu::unisets;
22
23
24
namespace {
25
26
UnicodeSet* gUnicodeSets[UNISETS_KEY_COUNT] = {};
27
28
// Save the empty instance in static memory to have well-defined behavior if a
29
// regular UnicodeSet cannot be allocated.
30
alignas(UnicodeSet)
31
char gEmptyUnicodeSet[sizeof(UnicodeSet)];
32
33
// Whether the gEmptyUnicodeSet is initialized and ready to use.
34
UBool gEmptyUnicodeSetInitialized = FALSE;
35
36
0
// Returns the set stored in the gUnicodeSets slot for `key`. If that slot is
// still null, the statically allocated empty set is handed back instead, so
// callers never receive a null pointer.
inline UnicodeSet* getImpl(Key key) {
    UnicodeSet* const stored = gUnicodeSets[key];
    return (stored != nullptr) ? stored : reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
}
43
44
0
// Builds a newly allocated, frozen set containing everything in the sets for
// `k1` and `k2` (each resolved through getImpl(), so a missing set contributes
// the empty fallback). Returns nullptr when the allocation yields nullptr.
// The caller owns the returned set.
UnicodeSet* computeUnion(Key k1, Key k2) {
    UnicodeSet* const merged = new UnicodeSet();
    if (merged != nullptr) {
        merged->addAll(*getImpl(k1));
        merged->addAll(*getImpl(k2));
        merged->freeze();
    }
    return merged;
}
54
55
0
// Three-key variant of computeUnion: builds a newly allocated, frozen set
// containing everything in the sets for `k1`, `k2` and `k3` (each resolved
// through getImpl()). Returns nullptr when the allocation yields nullptr.
// The caller owns the returned set.
UnicodeSet* computeUnion(Key k1, Key k2, Key k3) {
    UnicodeSet* const merged = new UnicodeSet();
    if (merged != nullptr) {
        const Key sources[] = {k1, k2, k3};
        for (Key source : sources) {
            merged->addAll(*getImpl(source));
        }
        merged->freeze();
    }
    return merged;
}
66
67
68
0
// Creates a UnicodeSet from `unicodeSetPattern` and stores it in the
// gUnicodeSets slot for `key`. `status` is forwarded to the UnicodeSet
// constructor.
//
// Each key is expected to be saved at most once (the slot should be null on
// entry). If the slot is already occupied, the previous set is deleted before
// being replaced so that it is not leaked.
void saveSet(Key key, const UnicodeString& unicodeSetPattern, UErrorCode& status) {
    UnicodeSet* const replacement = new UnicodeSet(unicodeSetPattern, status);
    delete gUnicodeSets[key];  // no-op for the expected case of an empty slot
    gUnicodeSets[key] = replacement;
}
72
73
class ParseDataSink : public ResourceSink {
74
  public:
75
0
    void put(const char* key, ResourceValue& value, UBool /*noFallback*/, UErrorCode& status) U_OVERRIDE {
76
0
        ResourceTable contextsTable = value.getTable(status);
77
0
        if (U_FAILURE(status)) { return; }
78
0
        for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) {
79
0
            if (uprv_strcmp(key, "date") == 0) {
80
                // ignore
81
0
            } else {
82
0
                ResourceTable strictnessTable = value.getTable(status);
83
0
                if (U_FAILURE(status)) { return; }
84
0
                for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) {
85
0
                    bool isLenient = (uprv_strcmp(key, "lenient") == 0);
86
0
                    ResourceArray array = value.getArray(status);
87
0
                    if (U_FAILURE(status)) { return; }
88
0
                    for (int k = 0; k < array.getSize(); k++) {
89
0
                        array.getValue(k, value);
90
0
                        UnicodeString str = value.getUnicodeString(status);
91
0
                        if (U_FAILURE(status)) { return; }
92
                        // There is both lenient and strict data for comma/period,
93
                        // but not for any of the other symbols.
94
0
                        if (str.indexOf(u'.') != -1) {
95
0
                            saveSet(isLenient ? PERIOD : STRICT_PERIOD, str, status);
96
0
                        } else if (str.indexOf(u',') != -1) {
97
0
                            saveSet(isLenient ? COMMA : STRICT_COMMA, str, status);
98
0
                        } else if (str.indexOf(u'+') != -1) {
99
0
                            saveSet(PLUS_SIGN, str, status);
100
0
                        } else if (str.indexOf(u'-') != -1) {
101
0
                            saveSet(MINUS_SIGN, str, status);
102
0
                        } else if (str.indexOf(u'$') != -1) {
103
0
                            saveSet(DOLLAR_SIGN, str, status);
104
0
                        } else if (str.indexOf(u'£') != -1) {
105
0
                            saveSet(POUND_SIGN, str, status);
106
0
                        } else if (str.indexOf(u'₹') != -1) {
107
0
                            saveSet(RUPEE_SIGN, str, status);
108
0
                        } else if (str.indexOf(u'¥') != -1) {
109
0
                            saveSet(YEN_SIGN, str, status);
110
0
                        } else if (str.indexOf(u'₩') != -1) {
111
0
                            saveSet(WON_SIGN, str, status);
112
0
                        } else if (str.indexOf(u'%') != -1) {
113
0
                            saveSet(PERCENT_SIGN, str, status);
114
0
                        } else if (str.indexOf(u'‰') != -1) {
115
0
                            saveSet(PERMILLE_SIGN, str, status);
116
0
                        } else if (str.indexOf(u'’') != -1) {
117
0
                            saveSet(APOSTROPHE_SIGN, str, status);
118
0
                        } else {
119
                            // Unknown class of parse lenients
120
                            // TODO(ICU-20428): Make ICU automatically accept new classes?
121
0
                            U_ASSERT(FALSE);
122
0
                        }
123
0
                        if (U_FAILURE(status)) { return; }
124
0
                    }
125
0
                }
126
0
            }
127
0
        }
128
0
    }
129
};
130
131
132
// One-time-initialization guard for the static sets: unisets::get() passes it
// to umtx_initOnce() together with initNumberParseUniSets, and
// cleanupNumberParseUniSets() resets it.
icu::UInitOnce gNumberParseUniSetsInitOnce = U_INITONCE_INITIALIZER;
133
134
0
// Cleanup callback (registered by initNumberParseUniSets). Destroys the
// static empty set if it was constructed, deletes every set held in
// gUnicodeSets and nulls the slots, then resets the init-once guard.
// Always returns TRUE.
UBool U_CALLCONV cleanupNumberParseUniSets() {
    if (gEmptyUnicodeSetInitialized) {
        // The empty set was placement-constructed in static storage, so it is
        // torn down with an explicit destructor call rather than delete.
        UnicodeSet* const emptySet = reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
        emptySet->~UnicodeSet();
        gEmptyUnicodeSetInitialized = FALSE;
    }
    for (UnicodeSet*& slot : gUnicodeSets) {
        delete slot;
        slot = nullptr;
    }
    gNumberParseUniSetsInitOnce.reset();
    return TRUE;
}
146
147
0
// One-time initializer for the static sets (invoked through umtx_initOnce()
// from unisets::get()).
//
// Steps:
//   1. Registers cleanupNumberParseUniSets and constructs the static empty
//      fallback set.
//   2. Creates the hard-coded ignorable sets.
//   3. Loads the "parse" table of the root bundle through ParseDataSink.
//   4. Derives the composite sets (grouping separators, all separators,
//      digits, ...).
//   5. Freezes every set that was created.
//
// On any failure reported through `status` the function returns early,
// leaving gUnicodeSets partially populated; unisets::get() then serves the
// empty fallback set.
void U_CALLCONV initNumberParseUniSets(UErrorCode& status) {
    ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS, cleanupNumberParseUniSets);

    // Initialize the empty instance for well-defined fallback behavior
    new(gEmptyUnicodeSet) UnicodeSet();
    reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->freeze();
    gEmptyUnicodeSetInitialized = TRUE;

    // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
    // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
    gUnicodeSets[DEFAULT_IGNORABLES] = new UnicodeSet(
            u"[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status);
    gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(u"[[:Bidi_Control:]]", status);

    LocalUResourceBundlePointer rb(ures_open(nullptr, "root", &status));
    if (U_FAILURE(status)) { return; }
    ParseDataSink sink;
    ures_getAllItemsWithFallback(rb.getAlias(), "parse", sink, status);
    if (U_FAILURE(status)) { return; }

    // NOTE: It is OK for these assertions to fail if there was a no-data build.
    U_ASSERT(gUnicodeSets[COMMA] != nullptr);
    U_ASSERT(gUnicodeSets[STRICT_COMMA] != nullptr);
    U_ASSERT(gUnicodeSets[PERIOD] != nullptr);
    U_ASSERT(gUnicodeSets[STRICT_PERIOD] != nullptr);
    U_ASSERT(gUnicodeSets[APOSTROPHE_SIGN] != nullptr);

    LocalPointer<UnicodeSet> otherGrouping(new UnicodeSet(
        u"[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]",
        status
    ), status);
    if (U_FAILURE(status)) { return; }
    // Go through getImpl() rather than dereferencing the slot directly: the
    // assertion above is compiled out of release builds, and getImpl()
    // substitutes the empty set when the apostrophe data is missing.
    otherGrouping->addAll(*getImpl(APOSTROPHE_SIGN));
    gUnicodeSets[OTHER_GROUPING_SEPARATORS] = otherGrouping.orphan();
    gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS);
    gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion(
            STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS);

    U_ASSERT(gUnicodeSets[MINUS_SIGN] != nullptr);
    U_ASSERT(gUnicodeSets[PLUS_SIGN] != nullptr);
    U_ASSERT(gUnicodeSets[PERCENT_SIGN] != nullptr);
    U_ASSERT(gUnicodeSets[PERMILLE_SIGN] != nullptr);

    gUnicodeSets[INFINITY_SIGN] = new UnicodeSet(u"[∞]", status);
    if (U_FAILURE(status)) { return; }

    U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr);
    U_ASSERT(gUnicodeSets[POUND_SIGN] != nullptr);
    U_ASSERT(gUnicodeSets[RUPEE_SIGN] != nullptr);
    U_ASSERT(gUnicodeSets[YEN_SIGN] != nullptr);
    U_ASSERT(gUnicodeSets[WON_SIGN] != nullptr);

    gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status);
    if (U_FAILURE(status)) { return; }
    gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
    gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS);

    // Freeze everything that was created; slots left null are skipped.
    for (auto* uniset : gUnicodeSets) {
        if (uniset != nullptr) {
            uniset->freeze();
        }
    }
}
210
211
}
212
213
0
// Returns the static set for `key`, running the one-time initialization
// first if needed. If initialization reports a failure, the static empty set
// is returned; a key whose slot is null also resolves to the empty set (via
// getImpl()).
const UnicodeSet* unisets::get(Key key) {
    UErrorCode localStatus = U_ZERO_ERROR;
    umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, localStatus);
    return U_FAILURE(localStatus)
            ? reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)
            : getImpl(key);
}
221
222
0
// Returns `key1` if the set for `key1` contains `str`; otherwise NONE.
Key unisets::chooseFrom(UnicodeString str, Key key1) {
    if (get(key1)->contains(str)) {
        return key1;
    }
    return NONE;
}
225
226
0
// Returns the first of `key1`, `key2` whose set contains `str`; NONE if
// neither does. `key1` is tested first.
Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) {
    if (get(key1)->contains(str)) {
        return key1;
    }
    return chooseFrom(str, key2);
}
229
230
//Key unisets::chooseCurrency(UnicodeString str) {
231
//    if (get(DOLLAR_SIGN)->contains(str)) {
232
//        return DOLLAR_SIGN;
233
//    } else if (get(POUND_SIGN)->contains(str)) {
234
//        return POUND_SIGN;
235
//    } else if (get(RUPEE_SIGN)->contains(str)) {
236
//        return RUPEE_SIGN;
237
//    } else if (get(YEN_SIGN)->contains(str)) {
238
//        return YEN_SIGN;
239
//    } else {
240
//        return NONE;
241
//    }
242
//}
243
244
245
#endif /* #if !UCONFIG_NO_FORMATTING */