Coverage Report

Created: 2025-09-05 07:16

/src/icu/icu4c/source/i18n/number_affixutils.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2017 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
4
#include "unicode/utypes.h"
5
6
#if !UCONFIG_NO_FORMATTING
7
8
#include "number_affixutils.h"
9
#include "unicode/utf16.h"
10
#include "unicode/uniset.h"
11
12
using namespace icu;
13
using namespace icu::number;
14
using namespace icu::number::impl;
15
16
140k
// Out-of-line definition of the TokenConsumer destructor; the compiler-generated body is used.
TokenConsumer::~TokenConsumer() = default;
17
108k
// Out-of-line definition of the SymbolProvider destructor; the compiler-generated body is used.
SymbolProvider::~SymbolProvider() = default;
18
19
3.28k
// Estimates the number of code points in the unescaped form of an affix pattern.
//
// Each code point that is not part of the quoting syntax counts as one; a doubled
// apostrophe (both outside and inside a quoted run) counts as one literal apostrophe.
//
// patternString: the affix pattern to scan.
// status: set to U_ILLEGAL_ARGUMENT_ERROR when the pattern ends while a quoted
//         sequence is still open. It is not inspected on entry.
// Returns the estimated length; the value accumulated so far is returned even
// when the error status is set.
int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) {
    AffixPatternState state = STATE_BASE;
    int32_t offset = 0;
    int32_t length = 0;
    while (offset < patternString.length()) {
        UChar32 cp = patternString.char32At(offset);
        const bool isQuote = (cp == u'\'');

        switch (state) {
            case STATE_BASE:
                if (isQuote) {
                    // Opening quote: contributes nothing by itself.
                    state = STATE_FIRST_QUOTE;
                } else {
                    // Unquoted symbol
                    length++;
                }
                break;
            case STATE_FIRST_QUOTE:
                // Either a repeated quote (literal apostrophe) or the first quoted
                // code point; both contribute one code point.
                length++;
                state = isQuote ? STATE_BASE : STATE_INSIDE_QUOTE;
                break;
            case STATE_INSIDE_QUOTE:
                if (isQuote) {
                    // Possible end of the quoted sequence
                    state = STATE_AFTER_QUOTE;
                } else {
                    // Quoted code point
                    length++;
                }
                break;
            case STATE_AFTER_QUOTE:
                length++;
                if (isQuote) {
                    // Double quote inside of a quoted sequence: literal apostrophe.
                    state = STATE_INSIDE_QUOTE;
                } else {
                    // Unquoted symbol: the quoted sequence is over. Return to the
                    // base state (as nextToken does) so that a later apostrophe is
                    // treated as a new opening quote rather than counted as a
                    // literal, which used to over-estimate patterns such as 'x'y'z'.
                    state = STATE_BASE;
                }
                break;
            default:
                UPRV_UNREACHABLE_EXIT;
        }

        offset += U16_LENGTH(cp);
    }

    // Ending right after an opening quote or inside a quoted run is malformed.
    if (state == STATE_FIRST_QUOTE || state == STATE_INSIDE_QUOTE) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
    }

    return length;
}
84
85
2.08M
// Quotes every code point of `input` that has special meaning in affix pattern
// syntax, so that tokenizing the result yields only literal code points.
//
// - An apostrophe is emitted as two apostrophes.
// - A run of symbol characters (- + ~ % ‰ ¤) is wrapped in a single pair of
//   apostrophes.
// - Every other code point is copied unchanged.
//
// '~' is included because nextToken() interprets an unquoted '~' as
// TYPE_APPROXIMATELY_SIGN; leaving it unquoted would turn a literal tilde into
// a symbol placeholder.
//
// input: the literal text to escape.
// Returns the escaped pattern string.
UnicodeString AffixUtils::escape(const UnicodeString &input) {
    AffixPatternState state = STATE_BASE;
    int32_t offset = 0;
    UnicodeString output;
    while (offset < input.length()) {
        UChar32 cp = input.char32At(offset);

        switch (cp) {
            case u'\'':
                // A doubled apostrophe is a literal apostrophe both inside and
                // outside of a quoted run, so the quoting state is left untouched.
                output.append(u"''", -1);
                break;

            case u'-':
            case u'+':
            case u'~':
            case u'%':
            case u'‰':
            case u'¤':
                if (state == STATE_BASE) {
                    // Open a quoted run before the first symbol character.
                    output.append(u'\'');
                    state = STATE_INSIDE_QUOTE;
                }
                output.append(cp);
                break;

            default:
                if (state == STATE_INSIDE_QUOTE) {
                    // Close the quoted run before resuming ordinary text.
                    output.append(u'\'');
                    state = STATE_BASE;
                }
                output.append(cp);
                break;
        }
        offset += U16_LENGTH(cp);
    }

    // Close a quoted run that reaches the end of the input.
    if (state == STATE_INSIDE_QUOTE) {
        output.append(u'\'');
    }

    return output;
}
130
131
262k
// Maps a symbol token type to the number-format field used to annotate it.
// Both sign types share the sign field and every currency width shares the
// currency field. Any type not listed here hits UPRV_UNREACHABLE_EXIT.
Field AffixUtils::getFieldForType(AffixPatternType type) {
    switch (type) {
        case TYPE_MINUS_SIGN:
        case TYPE_PLUS_SIGN:
            return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};

        case TYPE_APPROXIMATELY_SIGN:
            return {UFIELD_CATEGORY_NUMBER, UNUM_APPROXIMATELY_SIGN_FIELD};

        case TYPE_PERCENT:
            return {UFIELD_CATEGORY_NUMBER, UNUM_PERCENT_FIELD};

        case TYPE_PERMILLE:
            return {UFIELD_CATEGORY_NUMBER, UNUM_PERMILL_FIELD};

        case TYPE_CURRENCY_SINGLE:
        case TYPE_CURRENCY_DOUBLE:
        case TYPE_CURRENCY_TRIPLE:
        case TYPE_CURRENCY_QUAD:
        case TYPE_CURRENCY_QUINT:
        case TYPE_CURRENCY_OVERFLOW:
            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};

        default:
            UPRV_UNREACHABLE_EXIT;
    }
}
159
160
// Tokenizes `affixPattern` and inserts the resolved text into `output`,
// starting at index `position`.
//
// - TYPE_CURRENCY_OVERFLOW tokens insert U+FFFD with the currency field and do
//   not consult the provider.
// - Other symbol tokens (negative type) insert provider.getSymbol(type) with
//   the field from getFieldForType(type).
// - Literal code points are inserted with the caller-supplied `field`.
//
// Returns the sum of the values returned by the insert calls. If tokenizing
// sets a failure status, the sum accumulated so far is returned.
int32_t
AffixUtils::unescape(const UnicodeString &affixPattern, FormattedStringBuilder &output, int32_t position,
                     const SymbolProvider &provider, Field field, UErrorCode &status) {
    int32_t inserted = 0;
    for (AffixTag tag; hasNext(tag, affixPattern);) {
        tag = nextToken(tag, affixPattern, status);
        if (U_FAILURE(status)) {
            break;
        }

        // Each piece goes directly after what has been inserted so far.
        const int32_t insertAt = position + inserted;
        if (tag.type == TYPE_CURRENCY_OVERFLOW) {
            // Don't go to the provider for this special case
            inserted += output.insertCodePoint(
                insertAt, 0xFFFD, {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}, status);
        } else if (tag.type >= 0) {
            // Literal code point
            inserted += output.insertCodePoint(insertAt, tag.codePoint, field, status);
        } else {
            // Symbol placeholder resolved through the provider
            inserted += output.insert(
                insertAt, provider.getSymbol(tag.type), getFieldForType(tag.type), status);
        }
    }
    return inserted;
}
184
185
// Computes how many code points the unescaped affix pattern contains, without
// building the unescaped string.
//
// - TYPE_CURRENCY_OVERFLOW tokens count as one code point.
// - Other symbol tokens (negative type) count the code points of
//   provider.getSymbol(type).
// - Literal code points count as one each.
//
// The result is in code points, not UTF-16 code units: a supplementary code
// point counts once, both as a literal and inside a provider symbol.
//
// If tokenizing sets a failure status, the count accumulated so far is returned.
int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern,
                                            const SymbolProvider &provider, UErrorCode &status) {
    int32_t length = 0;
    AffixTag tag;
    while (hasNext(tag, affixPattern)) {
        tag = nextToken(tag, affixPattern, status);
        if (U_FAILURE(status)) { return length; }
        if (tag.type == TYPE_CURRENCY_OVERFLOW) {
            // Counted as a single code point without consulting the provider.
            length += 1;
        } else if (tag.type < 0) {
            // countChar32() counts code points; length() would count UTF-16 code units.
            length += provider.getSymbol(tag.type).countChar32();
        } else {
            // One literal code point, regardless of how many code units encode it.
            length += 1;
        }
    }
    return length;
}
202
203
bool
204
36.5k
AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) {
205
36.5k
    if (affixPattern.length() == 0) {
206
14.7k
        return false;
207
14.7k
    }
208
21.8k
    AffixTag tag;
209
3.42M
    while (hasNext(tag, affixPattern)) {
210
3.40M
        tag = nextToken(tag, affixPattern, status);
211
3.40M
        if (U_FAILURE(status)) { return false; }
212
3.40M
        if (tag.type == type) {
213
1.32k
            return true;
214
1.32k
        }
215
3.40M
    }
216
20.4k
    return false;
217
21.8k
}
218
219
2.02M
bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) {
220
2.02M
    if (affixPattern.length() == 0) {
221
1.95M
        return false;
222
1.95M
    }
223
68.8k
    AffixTag tag;
224
60.7M
    while (hasNext(tag, affixPattern)) {
225
60.7M
        tag = nextToken(tag, affixPattern, status);
226
60.7M
        if (U_FAILURE(status)) { return false; }
227
60.7M
        if (tag.type < 0 && getFieldForType(tag.type) == Field(UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD)) {
228
26.4k
            return true;
229
26.4k
        }
230
60.7M
    }
231
42.4k
    return false;
232
68.8k
}
233
234
// Returns a copy of `affixPattern` in which, for every token of the given
// `type`, the single code unit immediately before the token's end offset is
// replaced by `replacementChar`. If tokenizing sets a failure status, the copy
// with the replacements made so far is returned.
UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type,
                                      char16_t replacementChar, UErrorCode &status) {
    UnicodeString result(affixPattern); // copy
    if (affixPattern.isEmpty()) {
        return result;
    }
    for (AffixTag tag; hasNext(tag, affixPattern);) {
        tag = nextToken(tag, affixPattern, status);
        if (U_FAILURE(status)) {
            break;
        }
        if (tag.type != type) {
            continue;
        }
        // tag.offset points just past the token, so the token's last unit is at offset - 1.
        result.replace(tag.offset - 1, 1, replacementChar);
    }
    return result;
}
250
251
bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern,
252
0
                                                  const UnicodeSet& ignorables, UErrorCode& status) {
253
0
    if (affixPattern.length() == 0) {
254
0
        return true;
255
0
    }
256
0
    AffixTag tag;
257
0
    while (hasNext(tag, affixPattern)) {
258
0
        tag = nextToken(tag, affixPattern, status);
259
0
        if (U_FAILURE(status)) { return false; }
260
0
        if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) {
261
0
            return false;
262
0
        }
263
0
    }
264
0
    return true;
265
0
}
266
267
void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer,
268
140k
                                     UErrorCode& status) {
269
140k
    if (affixPattern.length() == 0) {
270
0
        return;
271
0
    }
272
140k
    AffixTag tag;
273
60.1M
    while (hasNext(tag, affixPattern)) {
274
59.9M
        tag = nextToken(tag, affixPattern, status);
275
59.9M
        if (U_FAILURE(status)) { return; }
276
59.9M
        consumer.consumeToken(tag.type, tag.codePoint, status);
277
59.9M
        if (U_FAILURE(status)) { return; }
278
59.9M
    }
279
140k
}
280
281
124M
// Reads the next token from `patternString`, resuming from the offset and
// state stored in `tag`.
//
// Outside quotes, - + ~ % ‰ produce symbol tokens, a run of ¤ produces one
// currency token whose type reflects the run length (single through quint,
// then overflow), and any other code point is a literal. Apostrophes delimit
// quoted runs in which every code point is a literal; a doubled apostrophe is a
// literal apostrophe.
//
// Returns the token, whose offset points just past the consumed text. At the
// end of the string it returns AffixTag{-1}; if the string ends inside an
// unterminated quote, status is also set to U_ILLEGAL_ARGUMENT_ERROR.
AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) {
    int32_t offset = tag.offset;
    int32_t state = tag.state;
    while (offset < patternString.length()) {
        const UChar32 cp = patternString.char32At(offset);
        // Offset of the code point following cp.
        const int32_t next = offset + U16_LENGTH(cp);

        switch (state) {
            case STATE_BASE:
                switch (cp) {
                    case u'\'':
                        // Opening quote: continue to the next code point.
                        state = STATE_FIRST_QUOTE;
                        offset = next;
                        break;
                    case u'-':
                        return makeTag(next, TYPE_MINUS_SIGN, STATE_BASE, 0);
                    case u'+':
                        return makeTag(next, TYPE_PLUS_SIGN, STATE_BASE, 0);
                    case u'~':
                        return makeTag(next, TYPE_APPROXIMATELY_SIGN, STATE_BASE, 0);
                    case u'%':
                        return makeTag(next, TYPE_PERCENT, STATE_BASE, 0);
                    case u'‰':
                        return makeTag(next, TYPE_PERMILLE, STATE_BASE, 0);
                    case u'¤':
                        // Start of a currency run: continue to the next code point.
                        state = STATE_FIRST_CURR;
                        offset = next;
                        break;
                    default:
                        return makeTag(next, TYPE_CODEPOINT, STATE_BASE, cp);
                }
                break;

            case STATE_FIRST_QUOTE:
                // A second apostrophe is a literal apostrophe and leaves quoting;
                // anything else is the first code point of a quoted run.
                if (cp == u'\'') {
                    return makeTag(next, TYPE_CODEPOINT, STATE_BASE, cp);
                }
                return makeTag(next, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);

            case STATE_INSIDE_QUOTE:
                if (cp != u'\'') {
                    return makeTag(next, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
                }
                // Possible closing quote: continue to the next code point.
                state = STATE_AFTER_QUOTE;
                offset = next;
                break;

            case STATE_AFTER_QUOTE:
                if (cp == u'\'') {
                    // Doubled apostrophe inside a quoted run: literal apostrophe.
                    return makeTag(next, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
                }
                // The quoted run is over; re-evaluate this code point in the base
                // state (offset deliberately not advanced).
                state = STATE_BASE;
                break;

            case STATE_FIRST_CURR:
                if (cp != u'¤') {
                    return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
                }
                state = STATE_SECOND_CURR;
                offset = next;
                break;

            case STATE_SECOND_CURR:
                if (cp != u'¤') {
                    return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
                }
                state = STATE_THIRD_CURR;
                offset = next;
                break;

            case STATE_THIRD_CURR:
                if (cp != u'¤') {
                    return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
                }
                state = STATE_FOURTH_CURR;
                offset = next;
                break;

            case STATE_FOURTH_CURR:
                if (cp != u'¤') {
                    return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
                }
                state = STATE_FIFTH_CURR;
                offset = next;
                break;

            case STATE_FIFTH_CURR:
                if (cp != u'¤') {
                    return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
                }
                state = STATE_OVERFLOW_CURR;
                offset = next;
                break;

            case STATE_OVERFLOW_CURR:
                if (cp != u'¤') {
                    return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
                }
                // Keep consuming currency signs while staying in this state.
                offset = next;
                break;

            default:
                UPRV_UNREACHABLE_EXIT;
        }
    }

    // End of string
    switch (state) {
        case STATE_BASE:
        case STATE_AFTER_QUOTE:
            // No more tokens in string.
            return {-1};

        case STATE_FIRST_QUOTE:
        case STATE_INSIDE_QUOTE:
            // For consistent behavior with the JDK and ICU 58, set an error here.
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return {-1};

        // A currency run that reaches the end of the string still yields its token.
        case STATE_FIRST_CURR:
            return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
        case STATE_SECOND_CURR:
            return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
        case STATE_THIRD_CURR:
            return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
        case STATE_FOURTH_CURR:
            return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
        case STATE_FIFTH_CURR:
            return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
        case STATE_OVERFLOW_CURR:
            return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);

        default:
            UPRV_UNREACHABLE_EXIT;
    }
}
424
425
125M
bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) {
426
    // First check for the {-1} and default initializer syntax.
427
125M
    if (tag.offset < 0) {
428
0
        return false;
429
125M
    } else if (tag.offset == 0) {
430
1.04M
        return string.length() > 0;
431
1.04M
    }
432
    // The rest of the fields are safe to use now.
433
    // Special case: the last character in string is an end quote.
434
124M
    if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 &&
435
124M
        string.charAt(tag.offset) == u'\'') {
436
730
        return false;
437
124M
    } else if (tag.state != STATE_BASE) {
438
3.91M
        return true;
439
120M
    } else {
440
120M
        return tag.offset < string.length();
441
120M
    }
442
124M
}
443
444
#endif /* #if !UCONFIG_NO_FORMATTING */