Coverage Report

Created: 2026-06-23 06:26

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/common/characterproperties.cpp
Line
Count
Source
1
// © 2018 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
4
// characterproperties.cpp
5
// created: 2018sep03 Markus W. Scherer
6
7
#include "unicode/utypes.h"
8
#include "unicode/localpointer.h"
9
#include "unicode/uchar.h"
10
#include "unicode/ucpmap.h"
11
#include "unicode/ucptrie.h"
12
#include "unicode/umutablecptrie.h"
13
#include "unicode/uniset.h"
14
#include "unicode/uscript.h"
15
#include "unicode/uset.h"
16
#include "cmemory.h"
17
#include "emojiprops.h"
18
#include "mutex.h"
19
#include "normalizer2impl.h"
20
#include "uassert.h"
21
#include "ubidi_props.h"
22
#include "ucase.h"
23
#include "ucln_cmn.h"
24
#include "umutex.h"
25
#include "uprops.h"
26
27
using icu::LocalPointer;
28
#if !UCONFIG_NO_NORMALIZATION
29
using icu::Normalizer2Factory;
30
using icu::Normalizer2Impl;
31
#endif
32
using icu::UInitOnce;
33
using icu::UnicodeSet;
34
35
namespace {
36
37
UBool U_CALLCONV characterproperties_cleanup();
38
39
constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + (UCHAR_INT_LIMIT - UCHAR_INT_START);
40
41
struct Inclusion {
42
    UnicodeSet  *fSet = nullptr;
43
    UInitOnce    fInitOnce {};
44
};
45
Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
46
47
UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
48
49
UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};
50
51
icu::UMutex cpMutex;
52
53
//----------------------------------------------------------------
54
// Inclusions list
55
//----------------------------------------------------------------
56
57
// USetAdder implementation
58
// Does not use uset.h to reduce code dependencies
59
void U_CALLCONV
60
255k
_set_add(USet *set, UChar32 c) {
61
255k
    reinterpret_cast<UnicodeSet*>(set)->add(c);
62
255k
}
63
64
void U_CALLCONV
65
168
_set_addRange(USet *set, UChar32 start, UChar32 end) {
66
168
    reinterpret_cast<UnicodeSet*>(set)->add(start, end);
67
168
}
68
69
void U_CALLCONV
70
5.54k
_set_addString(USet *set, const char16_t *str, int32_t length) {
71
5.54k
    reinterpret_cast<UnicodeSet*>(set)->add(icu::UnicodeString(static_cast<UBool>(length < 0), str, length));
72
5.54k
}
73
74
0
UBool U_CALLCONV characterproperties_cleanup() {
75
0
    for (Inclusion &in: gInclusions) {
76
0
        delete in.fSet;
77
0
        in.fSet = nullptr;
78
0
        in.fInitOnce.reset();
79
0
    }
80
0
    for (int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
81
0
        delete sets[i];
82
0
        sets[i] = nullptr;
83
0
    }
84
0
    for (int32_t i = 0; i < UPRV_LENGTHOF(maps); ++i) {
85
0
        ucptrie_close(reinterpret_cast<UCPTrie *>(maps[i]));
86
0
        maps[i] = nullptr;
87
0
    }
88
0
    return true;
89
0
}
90
91
56
void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
92
    // This function is invoked only via umtx_initOnce().
93
56
    U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
94
56
    if (src == UPROPS_SRC_NONE) {
95
0
        errorCode = U_INTERNAL_PROGRAM_ERROR;
96
0
        return;
97
0
    }
98
56
    U_ASSERT(gInclusions[src].fSet == nullptr);
99
100
56
    LocalPointer<UnicodeSet> incl(new UnicodeSet());
101
56
    if (incl.isNull()) {
102
0
        errorCode = U_MEMORY_ALLOCATION_ERROR;
103
0
        return;
104
0
    }
105
56
    USetAdder sa = {
106
56
        reinterpret_cast<USet*>(incl.getAlias()),
107
56
        _set_add,
108
56
        _set_addRange,
109
56
        _set_addString,
110
56
        nullptr, // don't need remove()
111
56
        nullptr // don't need removeRange()
112
56
    };
113
114
56
    switch(src) {
115
7
    case UPROPS_SRC_CHAR:
116
7
        uchar_addPropertyStarts(&sa, &errorCode);
117
7
        break;
118
6
    case UPROPS_SRC_PROPSVEC:
119
6
        upropsvec_addPropertyStarts(&sa, &errorCode);
120
6
        break;
121
3
    case UPROPS_SRC_CHAR_AND_PROPSVEC:
122
3
        uchar_addPropertyStarts(&sa, &errorCode);
123
3
        upropsvec_addPropertyStarts(&sa, &errorCode);
124
3
        break;
125
0
#if !UCONFIG_NO_NORMALIZATION
126
3
    case UPROPS_SRC_CASE_AND_NORM: {
127
3
        const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
128
3
        if(U_SUCCESS(errorCode)) {
129
3
            impl->addPropertyStarts(&sa, errorCode);
130
3
        }
131
3
        ucase_addPropertyStarts(&sa, &errorCode);
132
3
        break;
133
0
    }
134
4
    case UPROPS_SRC_NFC: {
135
4
        const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
136
4
        if(U_SUCCESS(errorCode)) {
137
4
            impl->addPropertyStarts(&sa, errorCode);
138
4
        }
139
4
        break;
140
0
    }
141
1
    case UPROPS_SRC_NFKC: {
142
1
        const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(errorCode);
143
1
        if(U_SUCCESS(errorCode)) {
144
1
            impl->addPropertyStarts(&sa, errorCode);
145
1
        }
146
1
        break;
147
0
    }
148
2
    case UPROPS_SRC_NFKC_CF: {
149
2
        const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(errorCode);
150
2
        if(U_SUCCESS(errorCode)) {
151
2
            impl->addPropertyStarts(&sa, errorCode);
152
2
        }
153
2
        break;
154
0
    }
155
1
    case UPROPS_SRC_NFC_CANON_ITER: {
156
1
        const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
157
1
        if(U_SUCCESS(errorCode)) {
158
1
            impl->addCanonIterPropertyStarts(&sa, errorCode);
159
1
        }
160
1
        break;
161
0
    }
162
0
#endif
163
4
    case UPROPS_SRC_CASE:
164
4
        ucase_addPropertyStarts(&sa, &errorCode);
165
4
        break;
166
4
    case UPROPS_SRC_BIDI:
167
4
        ubidi_addPropertyStarts(&sa, &errorCode);
168
4
        break;
169
3
    case UPROPS_SRC_INPC:
170
4
    case UPROPS_SRC_INSC:
171
8
    case UPROPS_SRC_VO:
172
8
        uprops_addPropertyStarts(src, &sa, &errorCode);
173
8
        break;
174
3
    case UPROPS_SRC_EMOJI: {
175
3
        const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
176
3
        if (U_SUCCESS(errorCode)) {
177
3
            ep->addPropertyStarts(&sa, errorCode);
178
3
        }
179
3
        break;
180
4
    }
181
3
    case UPROPS_SRC_IDSU:
182
        // New in Unicode 15.1 for just two characters.
183
3
        sa.add(sa.set, 0x2FFE);
184
3
        sa.add(sa.set, 0x2FFF + 1);
185
3
        break;
186
1
    case UPROPS_SRC_ID_COMPAT_MATH:
187
4
    case UPROPS_SRC_MCM:
188
4
        uprops_addPropertyStarts(src, &sa, &errorCode);
189
4
        break;
190
3
    case UPROPS_SRC_BLOCK:
191
3
        ublock_addPropertyStarts(&sa, errorCode);
192
3
        break;
193
0
    default:
194
0
        errorCode = U_INTERNAL_PROGRAM_ERROR;
195
0
        break;
196
56
    }
197
198
56
    if (U_FAILURE(errorCode)) {
199
0
        return;
200
0
    }
201
56
    if (incl->isBogus()) {
202
0
        errorCode = U_MEMORY_ALLOCATION_ERROR;
203
0
        return;
204
0
    }
205
    // Compact for caching.
206
56
    incl->compact();
207
56
    gInclusions[src].fSet = incl.orphan();
208
56
    ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
209
56
}
210
211
66.6k
const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
212
66.6k
    if (U_FAILURE(errorCode)) { return nullptr; }
213
66.6k
    if (src < 0 || UPROPS_SRC_COUNT <= src) {
214
0
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
215
0
        return nullptr;
216
0
    }
217
66.6k
    Inclusion &i = gInclusions[src];
218
66.6k
    umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode);
219
66.6k
    return i.fSet;
220
66.6k
}
221
222
68
void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
223
    // This function is invoked only via umtx_initOnce().
224
68
    U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
225
68
    int32_t inclIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);
226
68
    U_ASSERT(gInclusions[inclIndex].fSet == nullptr);
227
68
    UPropertySource src = uprops_getSource(prop);
228
68
    const UnicodeSet *incl = getInclusionsForSource(src, errorCode);
229
68
    if (U_FAILURE(errorCode)) {
230
0
        return;
231
0
    }
232
233
68
    LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0));
234
68
    if (intPropIncl.isNull()) {
235
0
        errorCode = U_MEMORY_ALLOCATION_ERROR;
236
0
        return;
237
0
    }
238
68
    int32_t numRanges = incl->getRangeCount();
239
68
    int32_t prevValue = 0;
240
155k
    for (int32_t i = 0; i < numRanges; ++i) {
241
155k
        UChar32 rangeEnd = incl->getRangeEnd(i);
242
537k
        for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) {
243
            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
244
382k
            int32_t value = u_getIntPropertyValue(c, prop);
245
382k
            if (value != prevValue) {
246
85.2k
                intPropIncl->add(c);
247
85.2k
                prevValue = value;
248
85.2k
            }
249
382k
        }
250
155k
    }
251
252
68
    if (intPropIncl->isBogus()) {
253
0
        errorCode = U_MEMORY_ALLOCATION_ERROR;
254
0
        return;
255
0
    }
256
    // Compact for caching.
257
68
    intPropIncl->compact();
258
68
    gInclusions[inclIndex].fSet = intPropIncl.orphan();
259
68
    ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
260
68
}
261
262
}  // namespace
263
264
U_NAMESPACE_BEGIN
265
266
const UnicodeSet *CharacterProperties::getInclusionsForProperty(
267
282k
        UProperty prop, UErrorCode &errorCode) {
268
282k
    if (U_FAILURE(errorCode)) { return nullptr; }
269
282k
    if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
270
216k
        int32_t inclIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);
271
216k
        Inclusion &i = gInclusions[inclIndex];
272
216k
        umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode);
273
216k
        return i.fSet;
274
216k
    } else {
275
66.5k
        UPropertySource src = uprops_getSource(prop);
276
66.5k
        return getInclusionsForSource(src, errorCode);
277
66.5k
    }
278
282k
}
279
280
U_NAMESPACE_END
281
282
namespace {
283
284
154
UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
285
154
    if (U_FAILURE(errorCode)) { return nullptr; }
286
154
    LocalPointer<UnicodeSet> set(new UnicodeSet());
287
154
    if (set.isNull()) {
288
0
        errorCode = U_MEMORY_ALLOCATION_ERROR;
289
0
        return nullptr;
290
0
    }
291
154
    if (UCHAR_BASIC_EMOJI <= property && property <= UCHAR_RGI_EMOJI) {
292
        // property of strings
293
7
        const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
294
7
        if (U_FAILURE(errorCode)) { return nullptr; }
295
7
        USetAdder sa = {
296
7
            reinterpret_cast<USet*>(set.getAlias()),
297
7
            _set_add,
298
7
            _set_addRange,
299
7
            _set_addString,
300
7
            nullptr, // don't need remove()
301
7
            nullptr // don't need removeRange()
302
7
        };
303
7
        ep->addStrings(&sa, property, errorCode);
304
7
        if (property != UCHAR_BASIC_EMOJI && property != UCHAR_RGI_EMOJI) {
305
            // property of _only_ strings
306
5
            set->freeze();
307
5
            return set.orphan();
308
5
        }
309
7
    }
310
311
149
    const UnicodeSet *inclusions =
312
149
        icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
313
149
    if (U_FAILURE(errorCode)) { return nullptr; }
314
149
    int32_t numRanges = inclusions->getRangeCount();
315
149
    UChar32 startHasProperty = -1;
316
317
394k
    for (int32_t i = 0; i < numRanges; ++i) {
318
393k
        UChar32 rangeEnd = inclusions->getRangeEnd(i);
319
1.35M
        for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
320
            // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
321
964k
            if (u_hasBinaryProperty(c, property)) {
322
238k
                if (startHasProperty < 0) {
323
                    // Transition from false to true.
324
45.7k
                    startHasProperty = c;
325
45.7k
                }
326
726k
            } else if (startHasProperty >= 0) {
327
                // Transition from true to false.
328
45.7k
                set->add(startHasProperty, c - 1);
329
45.7k
                startHasProperty = -1;
330
45.7k
            }
331
964k
        }
332
393k
    }
333
149
    if (startHasProperty >= 0) {
334
6
        set->add(startHasProperty, 0x10FFFF);
335
6
    }
336
149
    set->freeze();
337
149
    return set.orphan();
338
149
}
339
340
27
UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
341
27
    if (U_FAILURE(errorCode)) { return nullptr; }
342
27
    uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
343
27
    icu::LocalUMutableCPTriePointer mutableTrie(
344
27
        umutablecptrie_open(nullValue, nullValue, &errorCode));
345
27
    const UnicodeSet *inclusions =
346
27
        icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
347
27
    if (U_FAILURE(errorCode)) { return nullptr; }
348
27
    int32_t numRanges = inclusions->getRangeCount();
349
27
    UChar32 start = 0;
350
27
    uint32_t value = nullValue;
351
352
20.1k
    for (int32_t i = 0; i < numRanges; ++i) {
353
20.1k
        UChar32 rangeEnd = inclusions->getRangeEnd(i);
354
54.0k
        for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
355
            // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
356
33.9k
            uint32_t nextValue = u_getIntPropertyValue(c, property);
357
33.9k
            if (value != nextValue) {
358
33.9k
                if (value != nullValue) {
359
23.1k
                    umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
360
23.1k
                }
361
33.9k
                start = c;
362
33.9k
                value = nextValue;
363
33.9k
            }
364
33.9k
        }
365
20.1k
    }
366
27
    if (value != 0) {
367
7
        umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
368
7
    }
369
370
27
    UCPTrieType type;
371
27
    if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
372
2
        type = UCPTRIE_TYPE_FAST;
373
25
    } else {
374
25
        type = UCPTRIE_TYPE_SMALL;
375
25
    }
376
27
    UCPTrieValueWidth valueWidth;
377
    // TODO: UCharacterProperty.IntProperty
378
27
    int32_t max = u_getIntPropertyMaxValue(property);
379
27
    if (max <= 0xff) {
380
26
        valueWidth = UCPTRIE_VALUE_BITS_8;
381
26
    } else if (max <= 0xffff) {
382
1
        valueWidth = UCPTRIE_VALUE_BITS_16;
383
1
    } else {
384
0
        valueWidth = UCPTRIE_VALUE_BITS_32;
385
0
    }
386
27
    return reinterpret_cast<UCPMap *>(
387
27
        umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
388
27
}
389
390
}  // namespace
391
392
U_NAMESPACE_BEGIN
393
394
157k
const UnicodeSet *CharacterProperties::getBinaryPropertySet(UProperty property, UErrorCode &errorCode) {
395
157k
    if (U_FAILURE(errorCode)) { return nullptr; }
396
157k
    if (property < 0 || UCHAR_BINARY_LIMIT <= property) {
397
936
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
398
936
        return nullptr;
399
936
    }
400
156k
    Mutex m(&cpMutex);
401
156k
    UnicodeSet *set = sets[property];
402
156k
    if (set == nullptr) {
403
154
        sets[property] = set = makeSet(property, errorCode);
404
154
    }
405
156k
    return set;
406
157k
}
407
408
U_NAMESPACE_END
409
410
U_NAMESPACE_USE
411
412
U_CAPI const USet * U_EXPORT2
413
11.3k
u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
414
11.3k
    const UnicodeSet *set = CharacterProperties::getBinaryPropertySet(property, *pErrorCode);
415
11.3k
    return U_SUCCESS(*pErrorCode) ? set->toUSet() : nullptr;
416
11.3k
}
417
418
U_CAPI const UCPMap * U_EXPORT2
419
3.43k
u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
420
3.43k
    if (U_FAILURE(*pErrorCode)) { return nullptr; }
421
3.43k
    if (property < UCHAR_INT_START || UCHAR_INT_LIMIT <= property) {
422
2.70k
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
423
2.70k
        return nullptr;
424
2.70k
    }
425
732
    Mutex m(&cpMutex);
426
732
    UCPMap *map = maps[property - UCHAR_INT_START];
427
732
    if (map == nullptr) {
428
27
        maps[property - UCHAR_INT_START] = map = makeMap(property, *pErrorCode);
429
27
    }
430
732
    return map;
431
3.43k
}