Coverage Report

Created: 2026-01-22 06:31

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/i18n/number_affixutils.cpp
Line
Count
Source
1
// © 2017 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
4
#include "unicode/utypes.h"
5
6
#if !UCONFIG_NO_FORMATTING
7
8
#include "number_affixutils.h"
9
#include "unicode/utf16.h"
10
#include "unicode/uniset.h"
11
12
using namespace icu;
13
using namespace icu::number;
14
using namespace icu::number::impl;
15
16
184k
TokenConsumer::~TokenConsumer() = default;
17
115k
SymbolProvider::~SymbolProvider() = default;
18
19
5.51k
int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) {
20
5.51k
    AffixPatternState state = STATE_BASE;
21
5.51k
    int32_t offset = 0;
22
5.51k
    int32_t length = 0;
23
3.42M
    for (; offset < patternString.length();) {
24
3.41M
        UChar32 cp = patternString.char32At(offset);
25
26
3.41M
        switch (state) {
27
3.28M
            case STATE_BASE:
28
3.28M
                if (cp == u'\'') {
29
                    // First quote
30
1.05k
                    state = STATE_FIRST_QUOTE;
31
3.28M
                } else {
32
                    // Unquoted symbol
33
3.28M
                    length++;
34
3.28M
                }
35
3.28M
                break;
36
1.05k
            case STATE_FIRST_QUOTE:
37
1.05k
                if (cp == u'\'') {
38
                    // Repeated quote
39
441
                    length++;
40
441
                    state = STATE_BASE;
41
609
                } else {
42
                    // Quoted code point
43
609
                    length++;
44
609
                    state = STATE_INSIDE_QUOTE;
45
609
                }
46
1.05k
                break;
47
117k
            case STATE_INSIDE_QUOTE:
48
117k
                if (cp == u'\'') {
49
                    // End of quoted sequence
50
1.80k
                    state = STATE_AFTER_QUOTE;
51
115k
                } else {
52
                    // Quoted code point
53
115k
                    length++;
54
115k
                }
55
117k
                break;
56
9.38k
            case STATE_AFTER_QUOTE:
57
9.38k
                if (cp == u'\'') {
58
                    // Double quote inside of quoted sequence
59
1.19k
                    length++;
60
1.19k
                    state = STATE_INSIDE_QUOTE;
61
8.18k
                } else {
62
                    // Unquoted symbol
63
8.18k
                    length++;
64
8.18k
                }
65
9.38k
                break;
66
0
            default:
67
0
                UPRV_UNREACHABLE_EXIT;
68
3.41M
        }
69
70
3.41M
        offset += U16_LENGTH(cp);
71
3.41M
    }
72
73
5.51k
    switch (state) {
74
0
        case STATE_FIRST_QUOTE:
75
0
        case STATE_INSIDE_QUOTE:
76
0
            status = U_ILLEGAL_ARGUMENT_ERROR;
77
0
            break;
78
5.51k
        default:
79
5.51k
            break;
80
5.51k
    }
81
82
5.51k
    return length;
83
5.51k
}
84
85
2.56M
UnicodeString AffixUtils::escape(const UnicodeString &input) {
86
2.56M
    AffixPatternState state = STATE_BASE;
87
2.56M
    int32_t offset = 0;
88
2.56M
    UnicodeString output;
89
2.56M
    for (; offset < input.length();) {
90
0
        UChar32 cp = input.char32At(offset);
91
92
0
        switch (cp) {
93
0
            case u'\'':
94
0
                output.append(u"''", -1);
95
0
                break;
96
97
0
            case u'-':
98
0
            case u'+':
99
0
            case u'%':
100
0
            case u'‰':
101
0
            case u'¤':
102
0
                if (state == STATE_BASE) {
103
0
                    output.append(u'\'');
104
0
                    output.append(cp);
105
0
                    state = STATE_INSIDE_QUOTE;
106
0
                } else {
107
0
                    output.append(cp);
108
0
                }
109
0
                break;
110
111
0
            default:
112
0
                if (state == STATE_INSIDE_QUOTE) {
113
0
                    output.append(u'\'');
114
0
                    output.append(cp);
115
0
                    state = STATE_BASE;
116
0
                } else {
117
0
                    output.append(cp);
118
0
                }
119
0
                break;
120
0
        }
121
0
        offset += U16_LENGTH(cp);
122
0
    }
123
124
2.56M
    if (state == STATE_INSIDE_QUOTE) {
125
0
        output.append(u'\'');
126
0
    }
127
128
2.56M
    return output;
129
2.56M
}
130
131
328k
Field AffixUtils::getFieldForType(AffixPatternType type) {
132
328k
    switch (type) {
133
115k
        case TYPE_MINUS_SIGN:
134
115k
            return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
135
108k
        case TYPE_PLUS_SIGN:
136
108k
            return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
137
1.05k
        case TYPE_APPROXIMATELY_SIGN:
138
1.05k
            return {UFIELD_CATEGORY_NUMBER, UNUM_APPROXIMATELY_SIGN_FIELD};
139
31.6k
        case TYPE_PERCENT:
140
31.6k
            return {UFIELD_CATEGORY_NUMBER, UNUM_PERCENT_FIELD};
141
29.3k
        case TYPE_PERMILLE:
142
29.3k
            return {UFIELD_CATEGORY_NUMBER, UNUM_PERMILL_FIELD};
143
39.8k
        case TYPE_CURRENCY_SINGLE:
144
39.8k
            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
145
805
        case TYPE_CURRENCY_DOUBLE:
146
805
            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
147
1.21k
        case TYPE_CURRENCY_TRIPLE:
148
1.21k
            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
149
606
        case TYPE_CURRENCY_QUAD:
150
606
            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
151
160
        case TYPE_CURRENCY_QUINT:
152
160
            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
153
135
        case TYPE_CURRENCY_OVERFLOW:
154
135
            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
155
0
        default:
156
0
            UPRV_UNREACHABLE_EXIT;
157
328k
    }
158
328k
}
159
160
int32_t
161
AffixUtils::unescape(const UnicodeString &affixPattern, FormattedStringBuilder &output, int32_t position,
162
865k
                     const SymbolProvider &provider, Field field, UErrorCode &status) {
163
865k
    int32_t length = 0;
164
865k
    AffixTag tag;
165
1.08M
    while (hasNext(tag, affixPattern)) {
166
221k
        tag = nextToken(tag, affixPattern, status);
167
221k
        if (U_FAILURE(status)) { return length; }
168
221k
        if (tag.type == TYPE_CURRENCY_OVERFLOW) {
169
            // Don't go to the provider for this special case
170
0
            length += output.insertCodePoint(
171
0
                position + length,
172
0
                0xFFFD,
173
0
                {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD},
174
0
                status);
175
221k
        } else if (tag.type < 0) {
176
214k
            length += output.insert(
177
214k
                    position + length, provider.getSymbol(tag.type), getFieldForType(tag.type), status);
178
214k
        } else {
179
7.23k
            length += output.insertCodePoint(position + length, tag.codePoint, field, status);
180
7.23k
        }
181
221k
    }
182
865k
    return length;
183
865k
}
184
185
int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern,
186
0
                                            const SymbolProvider &provider, UErrorCode &status) {
187
0
    int32_t length = 0;
188
0
    AffixTag tag;
189
0
    while (hasNext(tag, affixPattern)) {
190
0
        tag = nextToken(tag, affixPattern, status);
191
0
        if (U_FAILURE(status)) { return length; }
192
0
        if (tag.type == TYPE_CURRENCY_OVERFLOW) {
193
0
            length += 1;
194
0
        } else if (tag.type < 0) {
195
0
            length += provider.getSymbol(tag.type).length();
196
0
        } else {
197
0
            length += U16_LENGTH(tag.codePoint);
198
0
        }
199
0
    }
200
0
    return length;
201
0
}
202
203
bool
204
39.2k
AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) {
205
39.2k
    if (affixPattern.length() == 0) {
206
14.4k
        return false;
207
14.4k
    }
208
24.7k
    AffixTag tag;
209
10.6M
    while (hasNext(tag, affixPattern)) {
210
10.6M
        tag = nextToken(tag, affixPattern, status);
211
10.6M
        if (U_FAILURE(status)) { return false; }
212
10.6M
        if (tag.type == type) {
213
2.99k
            return true;
214
2.99k
        }
215
10.6M
    }
216
21.7k
    return false;
217
24.7k
}
218
219
2.46M
bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) {
220
2.46M
    if (affixPattern.length() == 0) {
221
2.35M
        return false;
222
2.35M
    }
223
109k
    AffixTag tag;
224
66.7M
    while (hasNext(tag, affixPattern)) {
225
66.6M
        tag = nextToken(tag, affixPattern, status);
226
66.6M
        if (U_FAILURE(status)) { return false; }
227
66.6M
        if (tag.type < 0 && getFieldForType(tag.type) == Field(UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD)) {
228
42.7k
            return true;
229
42.7k
        }
230
66.6M
    }
231
67.1k
    return false;
232
109k
}
233
234
UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type,
235
0
                                      char16_t replacementChar, UErrorCode &status) {
236
0
    UnicodeString output(affixPattern); // copy
237
0
    if (affixPattern.length() == 0) {
238
0
        return output;
239
0
    }
240
0
    AffixTag tag;
241
0
    while (hasNext(tag, affixPattern)) {
242
0
        tag = nextToken(tag, affixPattern, status);
243
0
        if (U_FAILURE(status)) { return output; }
244
0
        if (tag.type == type) {
245
0
            output.replace(tag.offset - 1, 1, replacementChar);
246
0
        }
247
0
    }
248
0
    return output;
249
0
}
250
251
bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern,
252
0
                                                  const UnicodeSet& ignorables, UErrorCode& status) {
253
0
    if (affixPattern.length() == 0) {
254
0
        return true;
255
0
    }
256
0
    AffixTag tag;
257
0
    while (hasNext(tag, affixPattern)) {
258
0
        tag = nextToken(tag, affixPattern, status);
259
0
        if (U_FAILURE(status)) { return false; }
260
0
        if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) {
261
0
            return false;
262
0
        }
263
0
    }
264
0
    return true;
265
0
}
266
267
void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer,
268
184k
                                     UErrorCode& status) {
269
184k
    if (affixPattern.length() == 0) {
270
0
        return;
271
0
    }
272
184k
    AffixTag tag;
273
63.5M
    while (hasNext(tag, affixPattern)) {
274
63.3M
        tag = nextToken(tag, affixPattern, status);
275
63.3M
        if (U_FAILURE(status)) { return; }
276
63.3M
        consumer.consumeToken(tag.type, tag.codePoint, status);
277
63.3M
        if (U_FAILURE(status)) { return; }
278
63.3M
    }
279
184k
}
280
281
140M
AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) {
282
140M
    int32_t offset = tag.offset;
283
140M
    int32_t state = tag.state;
284
141M
    for (; offset < patternString.length();) {
285
141M
        UChar32 cp = patternString.char32At(offset);
286
141M
        int32_t count = U16_LENGTH(cp);
287
288
141M
        switch (state) {
289
135M
            case STATE_BASE:
290
135M
                switch (cp) {
291
289k
                    case u'\'':
292
289k
                        state = STATE_FIRST_QUOTE;
293
289k
                        offset += count;
294
                        // continue to the next code point
295
289k
                        break;
296
232k
                    case u'-':
297
232k
                        return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0);
298
111k
                    case u'+':
299
111k
                        return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0);
300
2.06k
                    case u'~':
301
2.06k
                        return makeTag(offset + count, TYPE_APPROXIMATELY_SIGN, STATE_BASE, 0);
302
68.9k
                    case u'%':
303
68.9k
                        return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0);
304
58.9k
                    case u'‰':
305
58.9k
                        return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0);
306
107k
                    case u'¤':
307
107k
                        state = STATE_FIRST_CURR;
308
107k
                        offset += count;
309
                        // continue to the next code point
310
107k
                        break;
311
135M
                    default:
312
135M
                        return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
313
135M
                }
314
397k
                break;
315
397k
            case STATE_FIRST_QUOTE:
316
289k
                if (cp == u'\'') {
317
244k
                    return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
318
244k
                } else {
319
45.4k
                    return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
320
45.4k
                }
321
4.94M
            case STATE_INSIDE_QUOTE:
322
4.94M
                if (cp == u'\'') {
323
272k
                    state = STATE_AFTER_QUOTE;
324
272k
                    offset += count;
325
                    // continue to the next code point
326
272k
                    break;
327
4.66M
                } else {
328
4.66M
                    return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
329
4.66M
                }
330
272k
            case STATE_AFTER_QUOTE:
331
272k
                if (cp == u'\'') {
332
228k
                    return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
333
228k
                } else {
334
44.0k
                    state = STATE_BASE;
335
                    // re-evaluate this code point
336
44.0k
                    break;
337
44.0k
                }
338
48.7k
            case STATE_FIRST_CURR:
339
48.7k
                if (cp == u'¤') {
340
9.92k
                    state = STATE_SECOND_CURR;
341
9.92k
                    offset += count;
342
                    // continue to the next code point
343
9.92k
                    break;
344
38.8k
                } else {
345
38.8k
                    return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
346
38.8k
                }
347
8.43k
            case STATE_SECOND_CURR:
348
8.43k
                if (cp == u'¤') {
349
6.64k
                    state = STATE_THIRD_CURR;
350
6.64k
                    offset += count;
351
                    // continue to the next code point
352
6.64k
                    break;
353
6.64k
                } else {
354
1.78k
                    return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
355
1.78k
                }
356
4.42k
            case STATE_THIRD_CURR:
357
4.42k
                if (cp == u'¤') {
358
3.46k
                    state = STATE_FOURTH_CURR;
359
3.46k
                    offset += count;
360
                    // continue to the next code point
361
3.46k
                    break;
362
3.46k
                } else {
363
963
                    return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
364
963
                }
365
3.23k
            case STATE_FOURTH_CURR:
366
3.23k
                if (cp == u'¤') {
367
1.77k
                    state = STATE_FIFTH_CURR;
368
1.77k
                    offset += count;
369
                    // continue to the next code point
370
1.77k
                    break;
371
1.77k
                } else {
372
1.45k
                    return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
373
1.45k
                }
374
1.54k
            case STATE_FIFTH_CURR:
375
1.54k
                if (cp == u'¤') {
376
859
                    state = STATE_OVERFLOW_CURR;
377
859
                    offset += count;
378
                    // continue to the next code point
379
859
                    break;
380
859
                } else {
381
683
                    return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
382
683
                }
383
1.94k
            case STATE_OVERFLOW_CURR:
384
1.94k
                if (cp == u'¤') {
385
1.32k
                    offset += count;
386
                    // continue to the next code point and loop back to this state
387
1.32k
                    break;
388
1.32k
                } else {
389
621
                    return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
390
621
                }
391
0
            default:
392
0
                UPRV_UNREACHABLE_EXIT;
393
141M
        }
394
141M
    }
395
    // End of string
396
63.5k
    switch (state) {
397
0
        case STATE_BASE:
398
            // No more tokens in string.
399
0
            return {-1};
400
0
        case STATE_FIRST_QUOTE:
401
0
        case STATE_INSIDE_QUOTE:
402
            // For consistent behavior with the JDK and ICU 58, set an error here.
403
0
            status = U_ILLEGAL_ARGUMENT_ERROR;
404
0
            return {-1};
405
0
        case STATE_AFTER_QUOTE:
406
            // No more tokens in string.
407
0
            return {-1};
408
59.1k
        case STATE_FIRST_CURR:
409
59.1k
            return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
410
1.49k
        case STATE_SECOND_CURR:
411
1.49k
            return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
412
2.21k
        case STATE_THIRD_CURR:
413
2.21k
            return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
414
232
        case STATE_FOURTH_CURR:
415
232
            return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
416
235
        case STATE_FIFTH_CURR:
417
235
            return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
418
238
        case STATE_OVERFLOW_CURR:
419
238
            return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
420
0
        default:
421
0
            UPRV_UNREACHABLE_EXIT;
422
63.5k
    }
423
63.5k
}
424
425
141M
bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) {
426
    // First check for the {-1} and default initializer syntax.
427
141M
    if (tag.offset < 0) {
428
0
        return false;
429
141M
    } else if (tag.offset == 0) {
430
1.18M
        return string.length() > 0;
431
1.18M
    }
432
    // The rest of the fields are safe to use now.
433
    // Special case: the last character in string is an end quote.
434
140M
    if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 &&
435
1.38k
        string.charAt(tag.offset) == u'\'') {
436
1.38k
        return false;
437
140M
    } else if (tag.state != STATE_BASE) {
438
4.94M
        return true;
439
135M
    } else {
440
135M
        return tag.offset < string.length();
441
135M
    }
442
140M
}
443
444
#endif /* #if !UCONFIG_NO_FORMATTING */