Coverage Report

Created: 2025-06-24 06:54

/src/icu/icu4c/source/i18n/number_affixutils.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2017 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
4
#include "unicode/utypes.h"
5
6
#if !UCONFIG_NO_FORMATTING
7
8
#include "number_affixutils.h"
9
#include "unicode/utf16.h"
10
#include "unicode/uniset.h"
11
12
using namespace icu;
13
using namespace icu::number;
14
using namespace icu::number::impl;
15
16
163k
TokenConsumer::~TokenConsumer() = default;
17
118k
SymbolProvider::~SymbolProvider() = default;
18
19
3.68k
int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) {
20
3.68k
    AffixPatternState state = STATE_BASE;
21
3.68k
    int32_t offset = 0;
22
3.68k
    int32_t length = 0;
23
1.95M
    for (; offset < patternString.length();) {
24
1.94M
        UChar32 cp = patternString.char32At(offset);
25
26
1.94M
        switch (state) {
27
1.93M
            case STATE_BASE:
28
1.93M
                if (cp == u'\'') {
29
                    // First quote
30
169
                    state = STATE_FIRST_QUOTE;
31
1.93M
                } else {
32
                    // Unquoted symbol
33
1.93M
                    length++;
34
1.93M
                }
35
1.93M
                break;
36
169
            case STATE_FIRST_QUOTE:
37
169
                if (cp == u'\'') {
38
                    // Repeated quote
39
108
                    length++;
40
108
                    state = STATE_BASE;
41
108
                } else {
42
                    // Quoted code point
43
61
                    length++;
44
61
                    state = STATE_INSIDE_QUOTE;
45
61
                }
46
169
                break;
47
5.27k
            case STATE_INSIDE_QUOTE:
48
5.27k
                if (cp == u'\'') {
49
                    // End of quoted sequence
50
316
                    state = STATE_AFTER_QUOTE;
51
4.95k
                } else {
52
                    // Quoted code point
53
4.95k
                    length++;
54
4.95k
                }
55
5.27k
                break;
56
3.41k
            case STATE_AFTER_QUOTE:
57
3.41k
                if (cp == u'\'') {
58
                    // Double quote inside of quoted sequence
59
255
                    length++;
60
255
                    state = STATE_INSIDE_QUOTE;
61
3.15k
                } else {
62
                    // Unquoted symbol
63
3.15k
                    length++;
64
3.15k
                }
65
3.41k
                break;
66
0
            default:
67
0
                UPRV_UNREACHABLE_EXIT;
68
1.94M
        }
69
70
1.94M
        offset += U16_LENGTH(cp);
71
1.94M
    }
72
73
3.68k
    switch (state) {
74
0
        case STATE_FIRST_QUOTE:
75
0
        case STATE_INSIDE_QUOTE:
76
0
            status = U_ILLEGAL_ARGUMENT_ERROR;
77
0
            break;
78
3.68k
        default:
79
3.68k
            break;
80
3.68k
    }
81
82
3.68k
    return length;
83
3.68k
}
84
85
2.40M
UnicodeString AffixUtils::escape(const UnicodeString &input) {
86
2.40M
    AffixPatternState state = STATE_BASE;
87
2.40M
    int32_t offset = 0;
88
2.40M
    UnicodeString output;
89
2.40M
    for (; offset < input.length();) {
90
0
        UChar32 cp = input.char32At(offset);
91
92
0
        switch (cp) {
93
0
            case u'\'':
94
0
                output.append(u"''", -1);
95
0
                break;
96
97
0
            case u'-':
98
0
            case u'+':
99
0
            case u'%':
100
0
            case u'‰':
101
0
            case u'¤':
102
0
                if (state == STATE_BASE) {
103
0
                    output.append(u'\'');
104
0
                    output.append(cp);
105
0
                    state = STATE_INSIDE_QUOTE;
106
0
                } else {
107
0
                    output.append(cp);
108
0
                }
109
0
                break;
110
111
0
            default:
112
0
                if (state == STATE_INSIDE_QUOTE) {
113
0
                    output.append(u'\'');
114
0
                    output.append(cp);
115
0
                    state = STATE_BASE;
116
0
                } else {
117
0
                    output.append(cp);
118
0
                }
119
0
                break;
120
0
        }
121
0
        offset += U16_LENGTH(cp);
122
0
    }
123
124
2.40M
    if (state == STATE_INSIDE_QUOTE) {
125
0
        output.append(u'\'');
126
0
    }
127
128
2.40M
    return output;
129
2.40M
}
130
131
297k
Field AffixUtils::getFieldForType(AffixPatternType type) {
132
297k
    switch (type) {
133
119k
        case TYPE_MINUS_SIGN:
134
119k
            return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
135
112k
        case TYPE_PLUS_SIGN:
136
112k
            return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
137
237
        case TYPE_APPROXIMATELY_SIGN:
138
237
            return {UFIELD_CATEGORY_NUMBER, UNUM_APPROXIMATELY_SIGN_FIELD};
139
34.3k
        case TYPE_PERCENT:
140
34.3k
            return {UFIELD_CATEGORY_NUMBER, UNUM_PERCENT_FIELD};
141
1.25k
        case TYPE_PERMILLE:
142
1.25k
            return {UFIELD_CATEGORY_NUMBER, UNUM_PERMILL_FIELD};
143
29.4k
        case TYPE_CURRENCY_SINGLE:
144
29.4k
            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
145
796
        case TYPE_CURRENCY_DOUBLE:
146
796
            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
147
152
        case TYPE_CURRENCY_TRIPLE:
148
152
            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
149
64
        case TYPE_CURRENCY_QUAD:
150
64
            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
151
38
        case TYPE_CURRENCY_QUINT:
152
38
            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
153
0
        case TYPE_CURRENCY_OVERFLOW:
154
0
            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
155
0
        default:
156
0
            UPRV_UNREACHABLE_EXIT;
157
297k
    }
158
297k
}
159
160
int32_t
161
AffixUtils::unescape(const UnicodeString &affixPattern, FormattedStringBuilder &output, int32_t position,
162
881k
                     const SymbolProvider &provider, Field field, UErrorCode &status) {
163
881k
    int32_t length = 0;
164
881k
    AffixTag tag;
165
1.10M
    while (hasNext(tag, affixPattern)) {
166
226k
        tag = nextToken(tag, affixPattern, status);
167
226k
        if (U_FAILURE(status)) { return length; }
168
226k
        if (tag.type == TYPE_CURRENCY_OVERFLOW) {
169
            // Don't go to the provider for this special case
170
0
            length += output.insertCodePoint(
171
0
                position + length,
172
0
                0xFFFD,
173
0
                {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD},
174
0
                status);
175
226k
        } else if (tag.type < 0) {
176
218k
            length += output.insert(
177
218k
                    position + length, provider.getSymbol(tag.type), getFieldForType(tag.type), status);
178
218k
        } else {
179
8.46k
            length += output.insertCodePoint(position + length, tag.codePoint, field, status);
180
8.46k
        }
181
226k
    }
182
881k
    return length;
183
881k
}
184
185
int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern,
186
0
                                            const SymbolProvider &provider, UErrorCode &status) {
187
0
    int32_t length = 0;
188
0
    AffixTag tag;
189
0
    while (hasNext(tag, affixPattern)) {
190
0
        tag = nextToken(tag, affixPattern, status);
191
0
        if (U_FAILURE(status)) { return length; }
192
0
        if (tag.type == TYPE_CURRENCY_OVERFLOW) {
193
0
            length += 1;
194
0
        } else if (tag.type < 0) {
195
0
            length += provider.getSymbol(tag.type).length();
196
0
        } else {
197
0
            length += U16_LENGTH(tag.codePoint);
198
0
        }
199
0
    }
200
0
    return length;
201
0
}
202
203
bool
204
39.4k
AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) {
205
39.4k
    if (affixPattern.length() == 0) {
206
14.6k
        return false;
207
14.6k
    }
208
24.7k
    AffixTag tag;
209
6.91M
    while (hasNext(tag, affixPattern)) {
210
6.89M
        tag = nextToken(tag, affixPattern, status);
211
6.89M
        if (U_FAILURE(status)) { return false; }
212
6.89M
        if (tag.type == type) {
213
1.74k
            return true;
214
1.74k
        }
215
6.89M
    }
216
23.0k
    return false;
217
24.7k
}
218
219
2.33M
bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) {
220
2.33M
    if (affixPattern.length() == 0) {
221
2.25M
        return false;
222
2.25M
    }
223
75.3k
    AffixTag tag;
224
70.7M
    while (hasNext(tag, affixPattern)) {
225
70.7M
        tag = nextToken(tag, affixPattern, status);
226
70.7M
        if (U_FAILURE(status)) { return false; }
227
70.7M
        if (tag.type < 0 && getFieldForType(tag.type) == Field(UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD)) {
228
30.5k
            return true;
229
30.5k
        }
230
70.7M
    }
231
44.8k
    return false;
232
75.3k
}
233
234
UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type,
235
0
                                      char16_t replacementChar, UErrorCode &status) {
236
0
    UnicodeString output(affixPattern); // copy
237
0
    if (affixPattern.length() == 0) {
238
0
        return output;
239
0
    }
240
0
    AffixTag tag;
241
0
    while (hasNext(tag, affixPattern)) {
242
0
        tag = nextToken(tag, affixPattern, status);
243
0
        if (U_FAILURE(status)) { return output; }
244
0
        if (tag.type == type) {
245
0
            output.replace(tag.offset - 1, 1, replacementChar);
246
0
        }
247
0
    }
248
0
    return output;
249
0
}
250
251
bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern,
252
0
                                                  const UnicodeSet& ignorables, UErrorCode& status) {
253
0
    if (affixPattern.length() == 0) {
254
0
        return true;
255
0
    }
256
0
    AffixTag tag;
257
0
    while (hasNext(tag, affixPattern)) {
258
0
        tag = nextToken(tag, affixPattern, status);
259
0
        if (U_FAILURE(status)) { return false; }
260
0
        if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) {
261
0
            return false;
262
0
        }
263
0
    }
264
0
    return true;
265
0
}
266
267
void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer,
268
163k
                                     UErrorCode& status) {
269
163k
    if (affixPattern.length() == 0) {
270
0
        return;
271
0
    }
272
163k
    AffixTag tag;
273
68.5M
    while (hasNext(tag, affixPattern)) {
274
68.3M
        tag = nextToken(tag, affixPattern, status);
275
68.3M
        if (U_FAILURE(status)) { return; }
276
68.3M
        consumer.consumeToken(tag.type, tag.codePoint, status);
277
68.3M
        if (U_FAILURE(status)) { return; }
278
68.3M
    }
279
163k
}
280
281
146M
AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) {
282
146M
    int32_t offset = tag.offset;
283
146M
    int32_t state = tag.state;
284
147M
    for (; offset < patternString.length();) {
285
147M
        UChar32 cp = patternString.char32At(offset);
286
147M
        int32_t count = U16_LENGTH(cp);
287
288
147M
        switch (state) {
289
141M
            case STATE_BASE:
290
141M
                switch (cp) {
291
332k
                    case u'\'':
292
332k
                        state = STATE_FIRST_QUOTE;
293
332k
                        offset += count;
294
                        // continue to the next code point
295
332k
                        break;
296
237k
                    case u'-':
297
237k
                        return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0);
298
117k
                    case u'+':
299
117k
                        return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0);
300
237
                    case u'~':
301
237
                        return makeTag(offset + count, TYPE_APPROXIMATELY_SIGN, STATE_BASE, 0);
302
72.4k
                    case u'%':
303
72.4k
                        return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0);
304
2.49k
                    case u'‰':
305
2.49k
                        return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0);
306
74.3k
                    case u'¤':
307
74.3k
                        state = STATE_FIRST_CURR;
308
74.3k
                        offset += count;
309
                        // continue to the next code point
310
74.3k
                        break;
311
140M
                    default:
312
140M
                        return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
313
141M
                }
314
406k
                break;
315
406k
            case STATE_FIRST_QUOTE:
316
332k
                if (cp == u'\'') {
317
273k
                    return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
318
273k
                } else {
319
59.0k
                    return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
320
59.0k
                }
321
4.88M
            case STATE_INSIDE_QUOTE:
322
4.88M
                if (cp == u'\'') {
323
336k
                    state = STATE_AFTER_QUOTE;
324
336k
                    offset += count;
325
                    // continue to the next code point
326
336k
                    break;
327
4.54M
                } else {
328
4.54M
                    return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
329
4.54M
                }
330
336k
            case STATE_AFTER_QUOTE:
331
336k
                if (cp == u'\'') {
332
278k
                    return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
333
278k
                } else {
334
58.0k
                    state = STATE_BASE;
335
                    // re-evaluate this code point
336
58.0k
                    break;
337
58.0k
                }
338
33.9k
            case STATE_FIRST_CURR:
339
33.9k
                if (cp == u'¤') {
340
6.33k
                    state = STATE_SECOND_CURR;
341
6.33k
                    offset += count;
342
                    // continue to the next code point
343
6.33k
                    break;
344
27.6k
                } else {
345
27.6k
                    return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
346
27.6k
                }
347
5.77k
            case STATE_SECOND_CURR:
348
5.77k
                if (cp == u'¤') {
349
3.99k
                    state = STATE_THIRD_CURR;
350
3.99k
                    offset += count;
351
                    // continue to the next code point
352
3.99k
                    break;
353
3.99k
                } else {
354
1.78k
                    return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
355
1.78k
                }
356
3.80k
            case STATE_THIRD_CURR:
357
3.80k
                if (cp == u'¤') {
358
1.53k
                    state = STATE_FOURTH_CURR;
359
1.53k
                    offset += count;
360
                    // continue to the next code point
361
1.53k
                    break;
362
2.27k
                } else {
363
2.27k
                    return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
364
2.27k
                }
365
1.44k
            case STATE_FOURTH_CURR:
366
1.44k
                if (cp == u'¤') {
367
990
                    state = STATE_FIFTH_CURR;
368
990
                    offset += count;
369
                    // continue to the next code point
370
990
                    break;
371
990
                } else {
372
453
                    return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
373
453
                }
374
976
            case STATE_FIFTH_CURR:
375
976
                if (cp == u'¤') {
376
0
                    state = STATE_OVERFLOW_CURR;
377
0
                    offset += count;
378
                    // continue to the next code point
379
0
                    break;
380
976
                } else {
381
976
                    return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
382
976
                }
383
0
            case STATE_OVERFLOW_CURR:
384
0
                if (cp == u'¤') {
385
0
                    offset += count;
386
                    // continue to the next code point and loop back to this state
387
0
                    break;
388
0
                } else {
389
0
                    return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
390
0
                }
391
0
            default:
392
0
                UPRV_UNREACHABLE_EXIT;
393
147M
        }
394
147M
    }
395
    // End of string
396
41.2k
    switch (state) {
397
0
        case STATE_BASE:
398
            // No more tokens in string.
399
0
            return {-1};
400
0
        case STATE_FIRST_QUOTE:
401
0
        case STATE_INSIDE_QUOTE:
402
            // For consistent behavior with the JDK and ICU 58, set an error here.
403
0
            status = U_ILLEGAL_ARGUMENT_ERROR;
404
0
            return {-1};
405
0
        case STATE_AFTER_QUOTE:
406
            // No more tokens in string.
407
0
            return {-1};
408
40.3k
        case STATE_FIRST_CURR:
409
40.3k
            return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
410
561
        case STATE_SECOND_CURR:
411
561
            return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
412
189
        case STATE_THIRD_CURR:
413
189
            return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
414
89
        case STATE_FOURTH_CURR:
415
89
            return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
416
14
        case STATE_FIFTH_CURR:
417
14
            return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
418
0
        case STATE_OVERFLOW_CURR:
419
0
            return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
420
0
        default:
421
0
            UPRV_UNREACHABLE_EXIT;
422
41.2k
    }
423
41.2k
}
424
425
147M
bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) {
426
    // First check for the {-1} and default initializer syntax.
427
147M
    if (tag.offset < 0) {
428
0
        return false;
429
147M
    } else if (tag.offset == 0) {
430
1.14M
        return string.length() > 0;
431
1.14M
    }
432
    // The rest of the fields are safe to use now.
433
    // Special case: the last character in string is an end quote.
434
146M
    if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 &&
435
146M
        string.charAt(tag.offset) == u'\'') {
436
929
        return false;
437
146M
    } else if (tag.state != STATE_BASE) {
438
4.88M
        return true;
439
141M
    } else {
440
141M
        return tag.offset < string.length();
441
141M
    }
442
146M
}
443
444
#endif /* #if !UCONFIG_NO_FORMATTING */