/src/icu/source/common/static_unicode_sets.cpp

Source (jump to first uncovered line)
// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

#include "unicode/utypes.h"

#if !UCONFIG_NO_FORMATTING

// Allow implicit conversion from char16_t* to UnicodeString for this file:
// Helpful in toString methods and elsewhere.
#define UNISTR_FROM_STRING_EXPLICIT

#include "static_unicode_sets.h"
#include "umutex.h"
#include "ucln_cmn.h"
#include "unicode/uniset.h"
#include "uresimp.h"
#include "cstring.h"
#include "uassert.h"

using namespace icu;
using namespace icu::unisets;


namespace {

// Table of the static sets, indexed by Key. Every slot starts out as nullptr;
// slots are filled in by initNumberParseUniSets() and the objects are deleted
// (and the slots reset to nullptr) by cleanupNumberParseUniSets().
UnicodeSet* gUnicodeSets[UNISETS_KEY_COUNT] = {};

// Save the empty instance in static memory to have well-defined behavior if a
// regular UnicodeSet cannot be allocated.
// The buffer is raw storage: the UnicodeSet is placement-constructed into it by
// initNumberParseUniSets() and explicitly destroyed by cleanupNumberParseUniSets().
alignas(UnicodeSet)
char gEmptyUnicodeSet[sizeof(UnicodeSet)];

// Whether the gEmptyUnicodeSet is initialized and ready to use.
// Checked by the cleanup function so that the destructor only runs on a
// constructed object.
UBool gEmptyUnicodeSetInitialized = FALSE;

/**
 * Looks up the set stored for `key` without triggering initialization.
 *
 * @param key Index into gUnicodeSets.
 * @return The stored set, or the statically allocated empty set when the slot
 *         is still nullptr, so callers never receive a null pointer.
 */
inline UnicodeSet* getImpl(Key key) {
    UnicodeSet* const stored = gUnicodeSets[key];
    return stored != nullptr ? stored : reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
}

/**
 * Builds a new frozen set containing everything in the sets for k1 and k2.
 * Slots that are still empty contribute nothing (getImpl() substitutes the
 * empty set).
 *
 * @return A newly allocated set owned by the caller, or nullptr if the
 *         allocation failed.
 */
UnicodeSet* computeUnion(Key k1, Key k2) {
    UnicodeSet* merged = new UnicodeSet();
    if (merged != nullptr) {
        merged->addAll(*getImpl(k1)).addAll(*getImpl(k2));
        merged->freeze();
    }
    return merged;
}

/**
 * Builds a new frozen set containing everything in the sets for k1, k2 and k3.
 * Slots that are still empty contribute nothing (getImpl() substitutes the
 * empty set).
 *
 * @return A newly allocated set owned by the caller, or nullptr if the
 *         allocation failed.
 */
UnicodeSet* computeUnion(Key k1, Key k2, Key k3) {
    UnicodeSet* merged = new UnicodeSet();
    if (merged == nullptr) {
        return merged;
    }
    const Key parts[] = {k1, k2, k3};
    for (Key part : parts) {
        merged->addAll(*getImpl(part));
    }
    merged->freeze();
    return merged;
}


/**
 * Parses `unicodeSetPattern` and stores the resulting set in the slot for `key`.
 *
 * The slot is only updated when the set was allocated and parsed successfully;
 * on failure the slot keeps its previous content and the temporary set is
 * released. If the slot was already occupied, the previous set is deleted so
 * that it is not leaked.
 *
 * @param key               Slot in gUnicodeSets to fill.
 * @param unicodeSetPattern UnicodeSet pattern to parse.
 * @param status            In/out error code. Set to U_MEMORY_ALLOCATION_ERROR
 *                          if the set could not be allocated, or to the parse
 *                          error reported by the UnicodeSet constructor.
 */
void saveSet(Key key, const UnicodeString& unicodeSetPattern, UErrorCode& status) {
    // LocalPointer reports a failed allocation through status and frees the
    // set automatically on the early return below.
    LocalPointer<UnicodeSet> parsed(new UnicodeSet(unicodeSetPattern, status), status);
    if (U_FAILURE(status)) { return; }
    // The slot is expected to be empty, but do not leak if the data ever
    // contains two patterns of the same class. Deleting nullptr is a no-op.
    delete gUnicodeSets[key];
    gUnicodeSets[key] = parsed.orphan();
}

/**
 * Resource sink that walks the "parse" table and stores each pattern it finds
 * into the matching gUnicodeSets slot via saveSet().
 *
 * Expected layout, as traversed below: a table of contexts, each holding a
 * table of strictness levels, each holding an array of UnicodeSet patterns.
 * The "date" context is skipped.
 */
class ParseDataSink : public ResourceSink {
  public:
    /**
     * Processes one "parse" resource value.
     *
     * `key` and `value` are reused as cursors while descending into the nested
     * tables. Returns early, leaving the error in `status`, as soon as any
     * resource access or set construction fails.
     */
    void put(const char* key, ResourceValue& value, UBool /*noFallback*/, UErrorCode& status) U_OVERRIDE {
        ResourceTable contextsTable = value.getTable(status);
        if (U_FAILURE(status)) { return; }
        for (int contextIdx = 0; contextsTable.getKeyAndValue(contextIdx, key, value); contextIdx++) {
            if (uprv_strcmp(key, "date") == 0) {
                // Date parse lenients are not handled here.
                continue;
            }
            ResourceTable strictnessTable = value.getTable(status);
            if (U_FAILURE(status)) { return; }
            for (int levelIdx = 0; strictnessTable.getKeyAndValue(levelIdx, key, value); levelIdx++) {
                // Any strictness key other than "lenient" is treated as strict.
                const bool isLenient = (uprv_strcmp(key, "lenient") == 0);
                ResourceArray patterns = value.getArray(status);
                if (U_FAILURE(status)) { return; }
                const int patternCount = patterns.getSize();
                for (int patternIdx = 0; patternIdx < patternCount; patternIdx++) {
                    patterns.getValue(patternIdx, value);
                    UnicodeString str = value.getUnicodeString(status);
                    if (U_FAILURE(status)) { return; }
                    // Each pattern is classified by the first marker character
                    // it contains, tested in the order below.
                    // Only comma and period have both lenient and strict data;
                    // every other symbol has a single slot.
                    if (str.indexOf(u'.') != -1) {
                        saveSet(isLenient ? PERIOD : STRICT_PERIOD, str, status);
                    } else if (str.indexOf(u',') != -1) {
                        saveSet(isLenient ? COMMA : STRICT_COMMA, str, status);
                    } else if (str.indexOf(u'+') != -1) {
                        saveSet(PLUS_SIGN, str, status);
                    } else if (str.indexOf(u'-') != -1) {
                        saveSet(MINUS_SIGN, str, status);
                    } else if (str.indexOf(u'$') != -1) {
                        saveSet(DOLLAR_SIGN, str, status);
                    } else if (str.indexOf(u'£') != -1) {
                        saveSet(POUND_SIGN, str, status);
                    } else if (str.indexOf(u'₹') != -1) {
                        saveSet(RUPEE_SIGN, str, status);
                    } else if (str.indexOf(u'¥') != -1) {
                        saveSet(YEN_SIGN, str, status);
                    } else if (str.indexOf(u'₩') != -1) {
                        saveSet(WON_SIGN, str, status);
                    } else if (str.indexOf(u'%') != -1) {
                        saveSet(PERCENT_SIGN, str, status);
                    } else if (str.indexOf(u'‰') != -1) {
                        saveSet(PERMILLE_SIGN, str, status);
                    } else if (str.indexOf(u'’') != -1) {
                        saveSet(APOSTROPHE_SIGN, str, status);
                    } else {
                        // Unknown class of parse lenients
                        // TODO(ICU-20428): Make ICU automatically accept new classes?
                        U_ASSERT(FALSE);
                    }
                    if (U_FAILURE(status)) { return; }
                }
            }
        }
    }
};


// Init-once guard for initNumberParseUniSets(); it is passed to umtx_initOnce()
// in unisets::get() and reset by cleanupNumberParseUniSets().
icu::UInitOnce gNumberParseUniSetsInitOnce = U_INITONCE_INITIALIZER;

/**
 * Cleanup callback registered by initNumberParseUniSets().
 *
 * Destroys the placement-constructed empty set (if it was constructed),
 * deletes every set in gUnicodeSets, and resets the init-once guard.
 *
 * @return Always TRUE.
 */
UBool U_CALLCONV cleanupNumberParseUniSets() {
    if (gEmptyUnicodeSetInitialized) {
        // The object lives in static storage, so run the destructor explicitly
        // rather than using delete.
        UnicodeSet* emptySet = reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
        emptySet->~UnicodeSet();
        gEmptyUnicodeSetInitialized = FALSE;
    }
    for (UnicodeSet*& slot : gUnicodeSets) {
        delete slot;
        slot = nullptr;
    }
    gNumberParseUniSetsInitOnce.reset();
    return TRUE;
}

/**
 * One-time initializer for the static sets; run through umtx_initOnce() from
 * unisets::get().
 *
 * Steps, in order:
 *  1. Registers the cleanup callback and constructs the frozen empty fallback
 *     set in static storage.
 *  2. Creates the hard-coded ignorable sets.
 *  3. Loads the "parse" table of the root bundle through ParseDataSink.
 *  4. Derives the separator, infinity and digit sets.
 *  5. Freezes every set that was created.
 *
 * On any failure the function returns early with the error left in `status`;
 * sets created up to that point stay in gUnicodeSets and are released by the
 * cleanup callback.
 *
 * @param status In/out error code.
 */
void U_CALLCONV initNumberParseUniSets(UErrorCode& status) {
    ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS, cleanupNumberParseUniSets);

    // Initialize the empty instance for well-defined fallback behavior
    new(gEmptyUnicodeSet) UnicodeSet();
    reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->freeze();
    gEmptyUnicodeSetInitialized = TRUE;

    // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
    // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
    gUnicodeSets[DEFAULT_IGNORABLES] = new UnicodeSet(
            u"[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status);
    gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(u"[[:Bidi_Control:]]", status);

    LocalUResourceBundlePointer rb(ures_open(nullptr, "root", &status));
    if (U_FAILURE(status)) { return; }
    ParseDataSink sink;
    ures_getAllItemsWithFallback(rb.getAlias(), "parse", sink, status);
    if (U_FAILURE(status)) { return; }

    // NOTE: It is OK for these assertions to fail if there was a no-data build.
    U_ASSERT(gUnicodeSets[COMMA] != nullptr);
    U_ASSERT(gUnicodeSets[STRICT_COMMA] != nullptr);
    U_ASSERT(gUnicodeSets[PERIOD] != nullptr);
    U_ASSERT(gUnicodeSets[STRICT_PERIOD] != nullptr);
    U_ASSERT(gUnicodeSets[APOSTROPHE_SIGN] != nullptr);

    LocalPointer<UnicodeSet> otherGrouping(new UnicodeSet(
        u"[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]",
        status
    ), status);
    if (U_FAILURE(status)) { return; }
    // The apostrophe slot may legitimately be empty (see the NOTE above), so go
    // through getImpl(), which substitutes the empty set, instead of
    // dereferencing a possibly-null pointer.
    otherGrouping->addAll(*getImpl(APOSTROPHE_SIGN));
    gUnicodeSets[OTHER_GROUPING_SEPARATORS] = otherGrouping.orphan();
    gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS);
    gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion(
            STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS);

    U_ASSERT(gUnicodeSets[MINUS_SIGN] != nullptr);
    U_ASSERT(gUnicodeSets[PLUS_SIGN] != nullptr);
    U_ASSERT(gUnicodeSets[PERCENT_SIGN] != nullptr);
    U_ASSERT(gUnicodeSets[PERMILLE_SIGN] != nullptr);

    gUnicodeSets[INFINITY_SIGN] = new UnicodeSet(u"[∞]", status);
    if (U_FAILURE(status)) { return; }

    U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr);
    U_ASSERT(gUnicodeSets[POUND_SIGN] != nullptr);
    U_ASSERT(gUnicodeSets[RUPEE_SIGN] != nullptr);
    U_ASSERT(gUnicodeSets[YEN_SIGN] != nullptr);
    U_ASSERT(gUnicodeSets[WON_SIGN] != nullptr);

    gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status);
    if (U_FAILURE(status)) { return; }
    gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
    gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS);

    // Freeze everything that was created; slots left empty are skipped.
    for (auto* uniset : gUnicodeSets) {
        if (uniset != nullptr) {
            uniset->freeze();
        }
    }
}

}

/**
 * Returns the static set for `key`, running the one-time initialization on
 * first use.
 *
 * @return The set for `key`; the shared empty set if initialization reported
 *         an error or the slot for `key` is unset. Never nullptr.
 */
const UnicodeSet* unisets::get(Key key) {
    UErrorCode initStatus = U_ZERO_ERROR;
    umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, initStatus);
    return U_SUCCESS(initStatus)
            ? getImpl(key)
            : reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
}

/**
 * Tests `str` against the set for key1.
 *
 * @return key1 if that set contains `str`, otherwise NONE.
 */
Key unisets::chooseFrom(UnicodeString str, Key key1) {
    if (get(key1)->contains(str)) {
        return key1;
    }
    return NONE;
}

/**
 * Tests `str` against the sets for key1 and key2, in that order.
 *
 * @return The first key whose set contains `str`, or NONE if neither does.
 */
Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) {
    if (get(key1)->contains(str)) {
        return key1;
    }
    // Fall through to the single-key overload for the second candidate.
    return chooseFrom(str, key2);
}

//Key unisets::chooseCurrency(UnicodeString str) {
//    if (get(DOLLAR_SIGN)->contains(str)) {
//        return DOLLAR_SIGN;
//    } else if (get(POUND_SIGN)->contains(str)) {
//        return POUND_SIGN;
//    } else if (get(RUPEE_SIGN)->contains(str)) {
//        return RUPEE_SIGN;
//    } else if (get(YEN_SIGN)->contains(str)) {
//        return YEN_SIGN;
//    } else {
//        return NONE;
//    }
//}


#endif /* #if !UCONFIG_NO_FORMATTING */

Coverage Report

Created: 2025-06-24 06:43

Line	Count	Source (jump to first uncovered line)
1		// © 2018 and later: Unicode, Inc. and others.
2		// License & terms of use: http://www.unicode.org/copyright.html
3
4		#include "unicode/utypes.h"
5
6		#if !UCONFIG_NO_FORMATTING
7
8		// Allow implicit conversion from char16_t* to UnicodeString for this file:
9		// Helpful in toString methods and elsewhere.
10		#define UNISTR_FROM_STRING_EXPLICIT
11
12		#include "static_unicode_sets.h"
13		#include "umutex.h"
14		#include "ucln_cmn.h"
15		#include "unicode/uniset.h"
16		#include "uresimp.h"
17		#include "cstring.h"
18		#include "uassert.h"
19
20		using namespace icu;
21		using namespace icu::unisets;
22
23
24		namespace {
25
26		UnicodeSet* gUnicodeSets[UNISETS_KEY_COUNT] = {};
27
28		// Save the empty instance in static memory to have well-defined behavior if a
29		// regular UnicodeSet cannot be allocated.
30		alignas(UnicodeSet)
31		char gEmptyUnicodeSet[sizeof(UnicodeSet)];
32
33		// Whether the gEmptyUnicodeSet is initialized and ready to use.
34		UBool gEmptyUnicodeSetInitialized = FALSE;
35
36	0	inline UnicodeSet* getImpl(Key key) {
37	0	UnicodeSet* candidate = gUnicodeSets[key];
38	0	if (candidate == nullptr) {
39	0	return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
40	0	}
41	0	return candidate;
42	0	}
43
44	0	UnicodeSet* computeUnion(Key k1, Key k2) {
45	0	UnicodeSet* result = new UnicodeSet();
46	0	if (result == nullptr) {
47	0	return nullptr;
48	0	}
49	0	result->addAll(*getImpl(k1));
50	0	result->addAll(*getImpl(k2));
51	0	result->freeze();
52	0	return result;
53	0	}
54
55	0	UnicodeSet* computeUnion(Key k1, Key k2, Key k3) {
56	0	UnicodeSet* result = new UnicodeSet();
57	0	if (result == nullptr) {
58	0	return nullptr;
59	0	}
60	0	result->addAll(*getImpl(k1));
61	0	result->addAll(*getImpl(k2));
62	0	result->addAll(*getImpl(k3));
63	0	result->freeze();
64	0	return result;
65	0	}
66
67
68	0	void saveSet(Key key, const UnicodeString& unicodeSetPattern, UErrorCode& status) {
69		// assert unicodeSets.get(key) == null;
70	0	gUnicodeSets[key] = new UnicodeSet(unicodeSetPattern, status);
71	0	}
72
73		class ParseDataSink : public ResourceSink {
74		public:
75	0	void put(const char* key, ResourceValue& value, UBool /noFallback/, UErrorCode& status) U_OVERRIDE {
76	0	ResourceTable contextsTable = value.getTable(status);
77	0	if (U_FAILURE(status)) { return; }
78	0	for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) {
79	0	if (uprv_strcmp(key, "date") == 0) {
80		// ignore
81	0	} else {
82	0	ResourceTable strictnessTable = value.getTable(status);
83	0	if (U_FAILURE(status)) { return; }
84	0	for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) {
85	0	bool isLenient = (uprv_strcmp(key, "lenient") == 0);
86	0	ResourceArray array = value.getArray(status);
87	0	if (U_FAILURE(status)) { return; }
88	0	for (int k = 0; k < array.getSize(); k++) {
89	0	array.getValue(k, value);
90	0	UnicodeString str = value.getUnicodeString(status);
91	0	if (U_FAILURE(status)) { return; }
92		// There is both lenient and strict data for comma/period,
93		// but not for any of the other symbols.
94	0	if (str.indexOf(u'.') != -1) {
95	0	saveSet(isLenient ? PERIOD : STRICT_PERIOD, str, status);
96	0	} else if (str.indexOf(u',') != -1) {
97	0	saveSet(isLenient ? COMMA : STRICT_COMMA, str, status);
98	0	} else if (str.indexOf(u'+') != -1) {
99	0	saveSet(PLUS_SIGN, str, status);
100	0	} else if (str.indexOf(u'-') != -1) {
101	0	saveSet(MINUS_SIGN, str, status);
102	0	} else if (str.indexOf(u'$') != -1) {
103	0	saveSet(DOLLAR_SIGN, str, status);
104	0	} else if (str.indexOf(u'£') != -1) {
105	0	saveSet(POUND_SIGN, str, status);
106	0	} else if (str.indexOf(u'₹') != -1) {
107	0	saveSet(RUPEE_SIGN, str, status);
108	0	} else if (str.indexOf(u'¥') != -1) {
109	0	saveSet(YEN_SIGN, str, status);
110	0	} else if (str.indexOf(u'₩') != -1) {
111	0	saveSet(WON_SIGN, str, status);
112	0	} else if (str.indexOf(u'%') != -1) {
113	0	saveSet(PERCENT_SIGN, str, status);
114	0	} else if (str.indexOf(u'‰') != -1) {
115	0	saveSet(PERMILLE_SIGN, str, status);
116	0	} else if (str.indexOf(u'’') != -1) {
117	0	saveSet(APOSTROPHE_SIGN, str, status);
118	0	} else {
119		// Unknown class of parse lenients
120		// TODO(ICU-20428): Make ICU automatically accept new classes?
121	0	U_ASSERT(FALSE);
122	0	}
123	0	if (U_FAILURE(status)) { return; }
124	0	}
125	0	}
126	0	}
127	0	}
128	0	}
129		};
130
131
132		icu::UInitOnce gNumberParseUniSetsInitOnce = U_INITONCE_INITIALIZER;
133
134	0	UBool U_CALLCONV cleanupNumberParseUniSets() {
135	0	if (gEmptyUnicodeSetInitialized) {
136	0	reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->~UnicodeSet();
137	0	gEmptyUnicodeSetInitialized = FALSE;
138	0	}
139	0	for (int32_t i = 0; i < UNISETS_KEY_COUNT; i++) {
140	0	delete gUnicodeSets[i];
141	0	gUnicodeSets[i] = nullptr;
142	0	}
143	0	gNumberParseUniSetsInitOnce.reset();
144	0	return TRUE;
145	0	}
146
147	0	void U_CALLCONV initNumberParseUniSets(UErrorCode& status) {
148	0	ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS, cleanupNumberParseUniSets);
149
150		// Initialize the empty instance for well-defined fallback behavior
151	0	new(gEmptyUnicodeSet) UnicodeSet();
152	0	reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->freeze();
153	0	gEmptyUnicodeSetInitialized = TRUE;
154
155		// These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
156		// Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
157	0	gUnicodeSets[DEFAULT_IGNORABLES] = new UnicodeSet(
158	0	u"[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status);
159	0	gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(u"[[:Bidi_Control:]]", status);
160
161	0	LocalUResourceBundlePointer rb(ures_open(nullptr, "root", &status));
162	0	if (U_FAILURE(status)) { return; }
163	0	ParseDataSink sink;
164	0	ures_getAllItemsWithFallback(rb.getAlias(), "parse", sink, status);
165	0	if (U_FAILURE(status)) { return; }
166
167		// NOTE: It is OK for these assertions to fail if there was a no-data build.
168	0	U_ASSERT(gUnicodeSets[COMMA] != nullptr);
169	0	U_ASSERT(gUnicodeSets[STRICT_COMMA] != nullptr);
170	0	U_ASSERT(gUnicodeSets[PERIOD] != nullptr);
171	0	U_ASSERT(gUnicodeSets[STRICT_PERIOD] != nullptr);
172	0	U_ASSERT(gUnicodeSets[APOSTROPHE_SIGN] != nullptr);
173
174	0	LocalPointer<UnicodeSet> otherGrouping(new UnicodeSet(
175	0	u"[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]",
176	0	status
177	0	), status);
178	0	if (U_FAILURE(status)) { return; }
179	0	otherGrouping->addAll(*gUnicodeSets[APOSTROPHE_SIGN]);
180	0	gUnicodeSets[OTHER_GROUPING_SEPARATORS] = otherGrouping.orphan();
181	0	gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS);
182	0	gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion(
183	0	STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS);
184
185	0	U_ASSERT(gUnicodeSets[MINUS_SIGN] != nullptr);
186	0	U_ASSERT(gUnicodeSets[PLUS_SIGN] != nullptr);
187	0	U_ASSERT(gUnicodeSets[PERCENT_SIGN] != nullptr);
188	0	U_ASSERT(gUnicodeSets[PERMILLE_SIGN] != nullptr);
189
190	0	gUnicodeSets[INFINITY_SIGN] = new UnicodeSet(u"[∞]", status);
191	0	if (U_FAILURE(status)) { return; }
192
193	0	U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr);
194	0	U_ASSERT(gUnicodeSets[POUND_SIGN] != nullptr);
195	0	U_ASSERT(gUnicodeSets[RUPEE_SIGN] != nullptr);
196	0	U_ASSERT(gUnicodeSets[YEN_SIGN] != nullptr);
197	0	U_ASSERT(gUnicodeSets[WON_SIGN] != nullptr);
198
199	0	gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status);
200	0	if (U_FAILURE(status)) { return; }
201	0	gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
202	0	gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS);
203
204	0	for (auto* uniset : gUnicodeSets) {
205	0	if (uniset != nullptr) {
206	0	uniset->freeze();
207	0	}
208	0	}
209	0	}
210
211		}
212
213	0	const UnicodeSet* unisets::get(Key key) {
214	0	UErrorCode localStatus = U_ZERO_ERROR;
215	0	umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, localStatus);
216	0	if (U_FAILURE(localStatus)) {
217	0	return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
218	0	}
219	0	return getImpl(key);
220	0	}
221
222	0	Key unisets::chooseFrom(UnicodeString str, Key key1) {
223	0	return get(key1)->contains(str) ? key1 : NONE;
224	0	}
225
226	0	Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) {
227	0	return get(key1)->contains(str) ? key1 : chooseFrom(str, key2);
228	0	}
229
230		//Key unisets::chooseCurrency(UnicodeString str) {
231		// if (get(DOLLAR_SIGN)->contains(str)) {
232		// return DOLLAR_SIGN;
233		// } else if (get(POUND_SIGN)->contains(str)) {
234		// return POUND_SIGN;
235		// } else if (get(RUPEE_SIGN)->contains(str)) {
236		// return RUPEE_SIGN;
237		// } else if (get(YEN_SIGN)->contains(str)) {
238		// return YEN_SIGN;
239		// } else {
240		// return NONE;
241		// }
242		//}
243
244
245		#endif /* #if !UCONFIG_NO_FORMATTING */