Coverage Report

Created: 2025-12-07 06:36

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/i18n/number_affixutils.cpp
Line
Count
Source
1
// © 2017 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
4
#include "unicode/utypes.h"
5
6
#if !UCONFIG_NO_FORMATTING
7
8
#include "number_affixutils.h"
9
#include "unicode/utf16.h"
10
#include "unicode/uniset.h"
11
12
using namespace icu;
13
using namespace icu::number;
14
using namespace icu::number::impl;
15
16
171k
// Out-of-line definition of the TokenConsumer destructor, explicitly defaulted.
TokenConsumer::~TokenConsumer() = default;
17
98.9k
// Out-of-line definition of the SymbolProvider destructor, explicitly defaulted.
SymbolProvider::~SymbolProvider() = default;
18
19
7.45k
// Estimates how many units the given affix pattern occupies once its quoting is removed.
//
// The pattern is scanned one code point at a time. Every unquoted code point, every quoted
// code point, and every escaped apostrophe ('') adds one to the result; the apostrophes that
// merely open or close a quoted run add nothing.
//
// @param patternString The affix pattern to scan.
// @param status Set to U_ILLEGAL_ARGUMENT_ERROR when the pattern ends while a quote is still
//               open (directly after an opening apostrophe or inside a quoted run).
// @return The accumulated count; it is returned even when status was set to an error.
int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) {
    AffixPatternState state = STATE_BASE;
    int32_t offset = 0;
    int32_t length = 0;
    while (offset < patternString.length()) {
        const UChar32 cp = patternString.char32At(offset);
        const bool isApostrophe = (cp == u'\'');

        switch (state) {
            case STATE_BASE:
                if (isApostrophe) {
                    // Opening apostrophe: contributes nothing by itself.
                    state = STATE_FIRST_QUOTE;
                } else {
                    // Unquoted symbol
                    length++;
                }
                break;
            case STATE_FIRST_QUOTE:
                // Either '' (an escaped apostrophe) or the first code point of a quoted run;
                // both contribute one unit.
                length++;
                state = isApostrophe ? STATE_BASE : STATE_INSIDE_QUOTE;
                break;
            case STATE_INSIDE_QUOTE:
                if (isApostrophe) {
                    // Possible end of the quoted run; decided by the next code point.
                    state = STATE_AFTER_QUOTE;
                } else {
                    // Quoted code point
                    length++;
                }
                break;
            case STATE_AFTER_QUOTE:
                length++;
                if (isApostrophe) {
                    // '' inside a quoted run is an escaped apostrophe; the run continues.
                    state = STATE_INSIDE_QUOTE;
                } else {
                    // Unquoted symbol: the quoted run is over, so go back to the base state.
                    // Staying in STATE_AFTER_QUOTE would make a later opening apostrophe be
                    // counted as an escaped one and inflate the estimate.
                    state = STATE_BASE;
                }
                break;
            default:
                UPRV_UNREACHABLE_EXIT;
        }

        offset += U16_LENGTH(cp);
    }

    switch (state) {
        case STATE_FIRST_QUOTE:
        case STATE_INSIDE_QUOTE:
            // The pattern ended with an unterminated quote.
            status = U_ILLEGAL_ARGUMENT_ERROR;
            break;
        default:
            break;
    }

    return length;
}
84
85
2.55M
// Returns a copy of `input` in which characters that have a special meaning in an affix
// pattern are quoted.
//
// - Every apostrophe is emitted as two apostrophes.
// - Runs of '-', '+', '%', U+2030 PER MILLE SIGN and U+00A4 CURRENCY SIGN are wrapped in a
//   pair of apostrophes.
// - All other code points are copied through unchanged.
//
// NOTE(review): '~' is not quoted here although nextToken() turns an unquoted '~' into
// TYPE_APPROXIMATELY_SIGN -- confirm whether that asymmetry is intentional.
UnicodeString AffixUtils::escape(const UnicodeString &input) {
    UnicodeString result;
    bool quoteOpen = false;  // true while an emitted quoted run has not been closed yet
    for (int32_t i = 0; i < input.length();) {
        const UChar32 cp = input.char32At(i);
        i += U16_LENGTH(cp);

        if (cp == u'\'') {
            // A doubled apostrophe is a literal apostrophe both inside and outside a quoted
            // run, so the open/closed flag is left alone.
            result.append(u"''", -1);
            continue;
        }

        const bool needsQuoting =
                cp == u'-' || cp == u'+' || cp == u'%' || cp == u'‰' || cp == u'¤';
        if (needsQuoting != quoteOpen) {
            // Entering or leaving a run of special characters: emit the delimiter.
            result.append(u'\'');
            quoteOpen = needsQuoting;
        }
        result.append(cp);
    }

    if (quoteOpen) {
        // Close a quoted run that reaches the end of the input.
        result.append(u'\'');
    }

    return result;
}
130
131
310k
// Maps a symbol token type to the number-category field used to annotate it.
//
// Both sign types share UNUM_SIGN_FIELD and every currency type shares UNUM_CURRENCY_FIELD.
// Any type without a case below reaches UPRV_UNREACHABLE_EXIT.
Field AffixUtils::getFieldForType(AffixPatternType type) {
    switch (type) {
        case TYPE_MINUS_SIGN:
        case TYPE_PLUS_SIGN:
            return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};

        case TYPE_APPROXIMATELY_SIGN:
            return {UFIELD_CATEGORY_NUMBER, UNUM_APPROXIMATELY_SIGN_FIELD};

        case TYPE_PERCENT:
            return {UFIELD_CATEGORY_NUMBER, UNUM_PERCENT_FIELD};

        case TYPE_PERMILLE:
            return {UFIELD_CATEGORY_NUMBER, UNUM_PERMILL_FIELD};

        case TYPE_CURRENCY_SINGLE:
        case TYPE_CURRENCY_DOUBLE:
        case TYPE_CURRENCY_TRIPLE:
        case TYPE_CURRENCY_QUAD:
        case TYPE_CURRENCY_QUINT:
        case TYPE_CURRENCY_OVERFLOW:
            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};

        default:
            UPRV_UNREACHABLE_EXIT;
    }
}
159
160
// Tokenizes `affixPattern` and inserts the resolved text into `output`, starting at `position`.
//
// - TYPE_CURRENCY_OVERFLOW tokens insert U+FFFD with the currency field and do not consult
//   `provider`.
// - Other tokens with a negative type insert provider.getSymbol(type), annotated with
//   getFieldForType(type).
// - Remaining tokens insert their code point annotated with `field`.
//
// @return The sum of the values returned by the insert calls. When `status` becomes a
//         failure after a call to nextToken(), the sum accumulated so far is returned.
int32_t
AffixUtils::unescape(const UnicodeString &affixPattern, FormattedStringBuilder &output, int32_t position,
                     const SymbolProvider &provider, Field field, UErrorCode &status) {
    int32_t inserted = 0;
    for (AffixTag tag; hasNext(tag, affixPattern);) {
        tag = nextToken(tag, affixPattern, status);
        if (U_FAILURE(status)) {
            break;
        }

        // Each piece goes directly after what has been inserted so far.
        const int32_t insertAt = position + inserted;
        if (tag.type == TYPE_CURRENCY_OVERFLOW) {
            // Don't go to the provider for this special case
            inserted += output.insertCodePoint(
                    insertAt, 0xFFFD, {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}, status);
        } else if (tag.type >= 0) {
            inserted += output.insertCodePoint(insertAt, tag.codePoint, field, status);
        } else {
            inserted += output.insert(
                    insertAt, provider.getSymbol(tag.type), getFieldForType(tag.type), status);
        }
    }
    return inserted;
}
184
185
// Computes the size of the text that the tokens of `affixPattern` resolve to, without
// building that text.
//
// Per token the result grows by:
// - 1 for TYPE_CURRENCY_OVERFLOW,
// - provider.getSymbol(type).length() for any other token with a negative type,
// - U16_LENGTH(codePoint) for the remaining tokens.
//
// NOTE(review): despite the function name, these increments look like UTF-16 code unit
// counts rather than code point counts -- confirm against callers.
//
// @return The accumulated size. When `status` becomes a failure after a call to nextToken(),
//         the size accumulated so far is returned.
int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern,
                                            const SymbolProvider &provider, UErrorCode &status) {
    int32_t total = 0;
    for (AffixTag tag; hasNext(tag, affixPattern);) {
        tag = nextToken(tag, affixPattern, status);
        if (U_FAILURE(status)) {
            break;
        }

        if (tag.type == TYPE_CURRENCY_OVERFLOW) {
            total += 1;
        } else if (tag.type >= 0) {
            total += U16_LENGTH(tag.codePoint);
        } else {
            total += provider.getSymbol(tag.type).length();
        }
    }
    return total;
}
202
203
bool
204
36.7k
AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) {
205
36.7k
    if (affixPattern.length() == 0) {
206
13.2k
        return false;
207
13.2k
    }
208
23.5k
    AffixTag tag;
209
8.69M
    while (hasNext(tag, affixPattern)) {
210
8.67M
        tag = nextToken(tag, affixPattern, status);
211
8.67M
        if (U_FAILURE(status)) { return false; }
212
8.67M
        if (tag.type == type) {
213
2.63k
            return true;
214
2.63k
        }
215
8.67M
    }
216
20.9k
    return false;
217
23.5k
}
218
219
2.46M
bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) {
220
2.46M
    if (affixPattern.length() == 0) {
221
2.34M
        return false;
222
2.34M
    }
223
113k
    AffixTag tag;
224
69.1M
    while (hasNext(tag, affixPattern)) {
225
69.0M
        tag = nextToken(tag, affixPattern, status);
226
69.0M
        if (U_FAILURE(status)) { return false; }
227
69.0M
        if (tag.type < 0 && getFieldForType(tag.type) == Field(UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD)) {
228
40.0k
            return true;
229
40.0k
        }
230
69.0M
    }
231
73.1k
    return false;
232
113k
}
233
234
// Returns a copy of `affixPattern` in which each token of the given `type` is overwritten
// with `replacementChar`.
//
// The single code unit at `tag.offset - 1` (the unit just before the offset stored in the
// token) is the one replaced.
// NOTE(review): that presumably limits this function to token types written as exactly one
// code unit -- confirm against callers.
//
// When `status` becomes a failure after a call to nextToken(), the copy is returned with the
// replacements made so far.
UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type,
                                      char16_t replacementChar, UErrorCode &status) {
    UnicodeString result(affixPattern); // copy
    if (affixPattern.isEmpty()) {
        return result;
    }
    // Tokenize the untouched original; each replacement swaps one code unit for one code
    // unit, so offsets into the original also index the copy.
    for (AffixTag tag; hasNext(tag, affixPattern);) {
        tag = nextToken(tag, affixPattern, status);
        if (U_FAILURE(status)) {
            break;
        }
        if (tag.type != type) {
            continue;
        }
        result.replace(tag.offset - 1, 1, replacementChar);
    }
    return result;
}
250
251
bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern,
252
0
                                                  const UnicodeSet& ignorables, UErrorCode& status) {
253
0
    if (affixPattern.length() == 0) {
254
0
        return true;
255
0
    }
256
0
    AffixTag tag;
257
0
    while (hasNext(tag, affixPattern)) {
258
0
        tag = nextToken(tag, affixPattern, status);
259
0
        if (U_FAILURE(status)) { return false; }
260
0
        if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) {
261
0
            return false;
262
0
        }
263
0
    }
264
0
    return true;
265
0
}
266
267
void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer,
268
171k
                                     UErrorCode& status) {
269
171k
    if (affixPattern.length() == 0) {
270
0
        return;
271
0
    }
272
171k
    AffixTag tag;
273
66.0M
    while (hasNext(tag, affixPattern)) {
274
65.8M
        tag = nextToken(tag, affixPattern, status);
275
65.8M
        if (U_FAILURE(status)) { return; }
276
65.8M
        consumer.consumeToken(tag.type, tag.codePoint, status);
277
65.8M
        if (U_FAILURE(status)) { return; }
278
65.8M
    }
279
171k
}
280
281
143M
// Reads the next token from `patternString`, resuming from the offset and state stored in
// `tag` (the token returned by the previous call).
//
// Scanning rules, as implemented below:
// - In the base state, '-', '+', '~', '%' and U+2030 each produce their own symbol token, an
//   apostrophe starts quote handling, U+00A4 starts a currency run, and any other code point
//   produces a TYPE_CODEPOINT token.
// - Two apostrophes in a row produce a literal apostrophe token, both outside and inside a
//   quoted run. Other code points between apostrophes produce TYPE_CODEPOINT tokens.
// - A run of one to five U+00A4 produces TYPE_CURRENCY_SINGLE through TYPE_CURRENCY_QUINT;
//   a longer run produces TYPE_CURRENCY_OVERFLOW.
//
// @param status Set to U_ILLEGAL_ARGUMENT_ERROR when the string ends inside a quote.
// @return The token that was read. Symbol tokens carry code point 0. When no token is left,
//         or on error, the result is `{-1}`.
AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) {
    int32_t pos = tag.offset;
    int32_t scanState = tag.state;
    while (pos < patternString.length()) {
        const UChar32 cp = patternString.char32At(pos);
        const int32_t next = pos + U16_LENGTH(cp);  // offset just past this code point
        const bool isApostrophe = (cp == u'\'');
        const bool isCurrencySign = (cp == u'¤');

        switch (scanState) {
            case STATE_BASE:
                switch (cp) {
                    case u'\'':
                        // Consume the apostrophe and look at the next code point.
                        scanState = STATE_FIRST_QUOTE;
                        pos = next;
                        break;
                    case u'¤':
                        // Consume the currency sign and look at the next code point.
                        scanState = STATE_FIRST_CURR;
                        pos = next;
                        break;
                    case u'-':
                        return makeTag(next, TYPE_MINUS_SIGN, STATE_BASE, 0);
                    case u'+':
                        return makeTag(next, TYPE_PLUS_SIGN, STATE_BASE, 0);
                    case u'~':
                        return makeTag(next, TYPE_APPROXIMATELY_SIGN, STATE_BASE, 0);
                    case u'%':
                        return makeTag(next, TYPE_PERCENT, STATE_BASE, 0);
                    case u'‰':
                        return makeTag(next, TYPE_PERMILLE, STATE_BASE, 0);
                    default:
                        return makeTag(next, TYPE_CODEPOINT, STATE_BASE, cp);
                }
                break;

            case STATE_FIRST_QUOTE:
                // '' is a literal apostrophe; any other code point opens a quoted run.
                if (isApostrophe) {
                    return makeTag(next, TYPE_CODEPOINT, STATE_BASE, cp);
                }
                return makeTag(next, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);

            case STATE_INSIDE_QUOTE:
                if (!isApostrophe) {
                    return makeTag(next, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
                }
                // Possibly the closing apostrophe; consume it and look at the next code point.
                scanState = STATE_AFTER_QUOTE;
                pos = next;
                break;

            case STATE_AFTER_QUOTE:
                if (isApostrophe) {
                    // '' inside a quoted run: literal apostrophe, and the run continues.
                    return makeTag(next, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
                }
                // The quoted run is over; re-evaluate this code point in the base state.
                scanState = STATE_BASE;
                break;

            // Currency runs: keep consuming currency signs. The first other code point ends
            // the run and is left unconsumed for the following call.
            case STATE_FIRST_CURR:
                if (!isCurrencySign) {
                    return makeTag(pos, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
                }
                scanState = STATE_SECOND_CURR;
                pos = next;
                break;

            case STATE_SECOND_CURR:
                if (!isCurrencySign) {
                    return makeTag(pos, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
                }
                scanState = STATE_THIRD_CURR;
                pos = next;
                break;

            case STATE_THIRD_CURR:
                if (!isCurrencySign) {
                    return makeTag(pos, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
                }
                scanState = STATE_FOURTH_CURR;
                pos = next;
                break;

            case STATE_FOURTH_CURR:
                if (!isCurrencySign) {
                    return makeTag(pos, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
                }
                scanState = STATE_FIFTH_CURR;
                pos = next;
                break;

            case STATE_FIFTH_CURR:
                if (!isCurrencySign) {
                    return makeTag(pos, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
                }
                scanState = STATE_OVERFLOW_CURR;
                pos = next;
                break;

            case STATE_OVERFLOW_CURR:
                if (!isCurrencySign) {
                    return makeTag(pos, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
                }
                // Further currency signs are consumed without leaving this state.
                pos = next;
                break;

            default:
                UPRV_UNREACHABLE_EXIT;
        }
    }

    // End of string
    switch (scanState) {
        case STATE_BASE:
        case STATE_AFTER_QUOTE:
            // No more tokens in string.
            return {-1};

        case STATE_FIRST_QUOTE:
        case STATE_INSIDE_QUOTE:
            // For consistent behavior with the JDK and ICU 58, set an error here.
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return {-1};

        // A currency run that reaches the end of the string still produces its token.
        case STATE_FIRST_CURR:
            return makeTag(pos, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
        case STATE_SECOND_CURR:
            return makeTag(pos, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
        case STATE_THIRD_CURR:
            return makeTag(pos, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
        case STATE_FOURTH_CURR:
            return makeTag(pos, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
        case STATE_FIFTH_CURR:
            return makeTag(pos, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
        case STATE_OVERFLOW_CURR:
            return makeTag(pos, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);

        default:
            UPRV_UNREACHABLE_EXIT;
    }
}
424
425
144M
bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) {
426
    // First check for the {-1} and default initializer syntax.
427
144M
    if (tag.offset < 0) {
428
0
        return false;
429
144M
    } else if (tag.offset == 0) {
430
1.04M
        return string.length() > 0;
431
1.04M
    }
432
    // The rest of the fields are safe to use now.
433
    // Special case: the last character in string is an end quote.
434
143M
    if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 &&
435
1.06k
        string.charAt(tag.offset) == u'\'') {
436
1.06k
        return false;
437
143M
    } else if (tag.state != STATE_BASE) {
438
4.25M
        return true;
439
139M
    } else {
440
139M
        return tag.offset < string.length();
441
139M
    }
442
143M
}
443
444
#endif /* #if !UCONFIG_NO_FORMATTING */