Coverage Report

Created: 2023-02-22 06:51

/src/icu/source/i18n/number_affixutils.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2017 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
4
#include "unicode/utypes.h"
5
6
#if !UCONFIG_NO_FORMATTING
7
8
#include "number_affixutils.h"
9
#include "unicode/utf16.h"
10
#include "unicode/uniset.h"
11
12
using namespace icu;
13
using namespace icu::number;
14
using namespace icu::number::impl;
15
16
0
// Out-of-line definition of the compiler-generated (defaulted) TokenConsumer destructor.
TokenConsumer::~TokenConsumer() = default;
17
0
// Out-of-line definition of the compiler-generated (defaulted) SymbolProvider destructor.
SymbolProvider::~SymbolProvider() = default;
18
19
0
// Estimates the length of the unescaped form of an affix pattern without expanding symbols.
//
// The pattern is scanned one code point at a time. Every code point other than an apostrophe
// that opens or closes a quoted run counts as one; a doubled apostrophe (inside or outside a
// quoted run) counts as one literal apostrophe.
//
// @param patternString The affix pattern to scan.
// @param status Set to U_ILLEGAL_ARGUMENT_ERROR if the pattern ends right after an opening
//               apostrophe or inside a quoted run; not modified otherwise.
// @return The estimated length. A value is returned even when status is set to an error.
int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) {
    AffixPatternState state = STATE_BASE;
    int32_t length = 0;

    for (int32_t offset = 0; offset < patternString.length();) {
        const UChar32 cp = patternString.char32At(offset);
        const bool isQuote = (cp == u'\'');

        switch (state) {
            case STATE_BASE:
                if (isQuote) {
                    // Opening apostrophe: produces no output by itself.
                    state = STATE_FIRST_QUOTE;
                } else {
                    // Unquoted symbol or literal.
                    length++;
                }
                break;
            case STATE_FIRST_QUOTE:
                // Either '' (an escaped apostrophe) or the first quoted code point;
                // both contribute one to the estimate.
                length++;
                state = isQuote ? STATE_BASE : STATE_INSIDE_QUOTE;
                break;
            case STATE_INSIDE_QUOTE:
                if (isQuote) {
                    // Possible end of the quoted run; decided by the next code point.
                    state = STATE_AFTER_QUOTE;
                } else {
                    // Quoted code point.
                    length++;
                }
                break;
            case STATE_AFTER_QUOTE:
                length++;
                if (isQuote) {
                    // '' inside a quoted run is a literal apostrophe; the run continues.
                    state = STATE_INSIDE_QUOTE;
                } else {
                    // The quoted run has ended and this is an unquoted code point, so go back
                    // to the base state (as nextToken() does). Otherwise a later opening
                    // apostrophe would be miscounted as an escaped apostrophe.
                    state = STATE_BASE;
                }
                break;
            default:
                UPRV_UNREACHABLE;
        }

        offset += U16_LENGTH(cp);
    }

    // A pattern that stops in the middle of a quoted run is malformed.
    if (state == STATE_FIRST_QUOTE || state == STATE_INSIDE_QUOTE) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
    }

    return length;
}
84
85
0
// Converts a literal string into an affix pattern by quoting characters that the affix
// pattern tokenizer treats specially.
//
// - Each apostrophe is written as two apostrophes.
// - Runs of the symbol characters '-', '+', '~', '%', U+2030 (per mille) and U+00A4 (currency
//   sign) are wrapped in a pair of apostrophes.
// - All other code points are copied through unchanged.
//
// '~' is included because nextToken() interprets an unquoted '~' as
// TYPE_APPROXIMATELY_SIGN; without quoting, a literal tilde would not survive a round trip.
//
// @param input The literal text to escape.
// @return The escaped pattern string.
UnicodeString AffixUtils::escape(const UnicodeString &input) {
    // Only STATE_BASE and STATE_INSIDE_QUOTE are used here: they record whether the output
    // currently has an open quoted run.
    AffixPatternState state = STATE_BASE;
    UnicodeString output;

    for (int32_t offset = 0; offset < input.length();) {
        const UChar32 cp = input.char32At(offset);

        switch (cp) {
            case u'\'':
                // A doubled apostrophe is a literal apostrophe both inside and outside quotes.
                output.append(u"''", -1);
                break;

            case u'-':
            case u'+':
            case u'~':
            case u'%':
            case u'‰':
            case u'¤':
                if (state == STATE_BASE) {
                    // Open a quoted run before the first special character.
                    output.append(u'\'');
                    state = STATE_INSIDE_QUOTE;
                }
                output.append(cp);
                break;

            default:
                if (state == STATE_INSIDE_QUOTE) {
                    // Close the quoted run before an ordinary character.
                    output.append(u'\'');
                    state = STATE_BASE;
                }
                output.append(cp);
                break;
        }
        offset += U16_LENGTH(cp);
    }

    // Close a quoted run that is still open at the end of the input.
    if (state == STATE_INSIDE_QUOTE) {
        output.append(u'\'');
    }

    return output;
}
130
131
0
// Maps a symbol token type to the number-format field used to annotate that symbol.
// Only symbol types are handled; any other value reaches UPRV_UNREACHABLE.
Field AffixUtils::getFieldForType(AffixPatternType type) {
    switch (type) {
        // All sign-like symbols share the sign field.
        case TYPE_MINUS_SIGN:
        case TYPE_PLUS_SIGN:
        case TYPE_APPROXIMATELY_SIGN:
            // TODO: Introduce a new field for the approximately sign?
            return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};

        case TYPE_PERCENT:
            return {UFIELD_CATEGORY_NUMBER, UNUM_PERCENT_FIELD};

        case TYPE_PERMILLE:
            return {UFIELD_CATEGORY_NUMBER, UNUM_PERMILL_FIELD};

        // Every currency width maps to the same currency field.
        case TYPE_CURRENCY_SINGLE:
        case TYPE_CURRENCY_DOUBLE:
        case TYPE_CURRENCY_TRIPLE:
        case TYPE_CURRENCY_QUAD:
        case TYPE_CURRENCY_QUINT:
        case TYPE_CURRENCY_OVERFLOW:
            return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};

        default:
            UPRV_UNREACHABLE;
    }
}
160
161
// Tokenizes affixPattern and inserts the expanded result into output, starting at position.
//
// - TYPE_CURRENCY_OVERFLOW tokens insert U+FFFD with the currency field, without consulting
//   the provider.
// - Other symbol tokens (negative type) insert provider.getSymbol(type) with the field from
//   getFieldForType(type).
// - Literal code points are inserted with the caller-supplied field.
//
// @return The sum of the values returned by the builder's insert calls (presumably the number
//         of UTF-16 code units inserted -- confirm against FormattedStringBuilder). If
//         tokenizing fails, the amount inserted so far is returned and status holds the error.
int32_t
AffixUtils::unescape(const UnicodeString &affixPattern, FormattedStringBuilder &output, int32_t position,
                     const SymbolProvider &provider, Field field, UErrorCode &status) {
    int32_t inserted = 0;
    for (AffixTag tag; hasNext(tag, affixPattern);) {
        tag = nextToken(tag, affixPattern, status);
        if (U_FAILURE(status)) {
            break;
        }

        // Each insertion goes directly after everything inserted so far.
        const int32_t index = position + inserted;
        if (tag.type == TYPE_CURRENCY_OVERFLOW) {
            // Don't go to the provider for this special case
            inserted += output.insertCodePoint(
                index, 0xFFFD, {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}, status);
        } else if (tag.type < 0) {
            inserted += output.insert(
                index, provider.getSymbol(tag.type), getFieldForType(tag.type), status);
        } else {
            inserted += output.insertCodePoint(index, tag.codePoint, field, status);
        }
    }
    return inserted;
}
185
186
// Computes the length that unescape() would produce for affixPattern without building it.
//
// A TYPE_CURRENCY_OVERFLOW token counts as 1, any other symbol token counts as the length()
// of the provider's symbol string, and a literal code point counts as U16_LENGTH(codePoint).
//
// @return The accumulated length; if tokenizing fails, the partial total is returned and
//         status holds the error.
int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern,
                                            const SymbolProvider &provider, UErrorCode &status) {
    int32_t total = 0;
    for (AffixTag tag; hasNext(tag, affixPattern);) {
        tag = nextToken(tag, affixPattern, status);
        if (U_FAILURE(status)) {
            break;
        }
        if (tag.type == TYPE_CURRENCY_OVERFLOW) {
            // The overflow placeholder is a single character.
            total += 1;
            continue;
        }
        if (tag.type < 0) {
            total += provider.getSymbol(tag.type).length();
        } else {
            total += U16_LENGTH(tag.codePoint);
        }
    }
    return total;
}
203
204
bool
205
0
AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) {
206
0
    if (affixPattern.length() == 0) {
207
0
        return false;
208
0
    }
209
0
    AffixTag tag;
210
0
    while (hasNext(tag, affixPattern)) {
211
0
        tag = nextToken(tag, affixPattern, status);
212
0
        if (U_FAILURE(status)) { return false; }
213
0
        if (tag.type == type) {
214
0
            return true;
215
0
        }
216
0
    }
217
0
    return false;
218
0
}
219
220
0
bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) {
221
0
    if (affixPattern.length() == 0) {
222
0
        return false;
223
0
    }
224
0
    AffixTag tag;
225
0
    while (hasNext(tag, affixPattern)) {
226
0
        tag = nextToken(tag, affixPattern, status);
227
0
        if (U_FAILURE(status)) { return false; }
228
0
        if (tag.type < 0 && getFieldForType(tag.type) == Field(UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD)) {
229
0
            return true;
230
0
        }
231
0
    }
232
0
    return false;
233
0
}
234
235
// Returns a copy of affixPattern in which, for every token of the given type, the single
// UTF-16 unit immediately before the token's end offset is replaced with replacementChar.
// If tokenizing fails, the copy with the replacements made so far is returned and status
// holds the error.
UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type,
                                      char16_t replacementChar, UErrorCode &status) {
    UnicodeString output(affixPattern); // copy
    if (affixPattern.length() == 0) {
        return output;
    }
    for (AffixTag tag; hasNext(tag, affixPattern);) {
        tag = nextToken(tag, affixPattern, status);
        if (U_FAILURE(status)) {
            break;
        }
        if (tag.type != type) {
            continue;
        }
        // tag.offset points just past the token, so the character to replace is at offset - 1.
        output.replace(tag.offset - 1, 1, replacementChar);
    }
    return output;
}
251
252
bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern,
253
0
                                                  const UnicodeSet& ignorables, UErrorCode& status) {
254
0
    if (affixPattern.length() == 0) {
255
0
        return true;
256
0
    }
257
0
    AffixTag tag;
258
0
    while (hasNext(tag, affixPattern)) {
259
0
        tag = nextToken(tag, affixPattern, status);
260
0
        if (U_FAILURE(status)) { return false; }
261
0
        if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) {
262
0
            return false;
263
0
        }
264
0
    }
265
0
    return true;
266
0
}
267
268
void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer,
269
0
                                     UErrorCode& status) {
270
0
    if (affixPattern.length() == 0) {
271
0
        return;
272
0
    }
273
0
    AffixTag tag;
274
0
    while (hasNext(tag, affixPattern)) {
275
0
        tag = nextToken(tag, affixPattern, status);
276
0
        if (U_FAILURE(status)) { return; }
277
0
        consumer.consumeToken(tag.type, tag.codePoint, status);
278
0
        if (U_FAILURE(status)) { return; }
279
0
    }
280
0
}
281
282
0
// Returns the next token of patternString, resuming from the offset and state stored in tag.
//
// The returned tag carries the offset at which scanning should resume, the token type, the
// state to resume in, and (for TYPE_CODEPOINT tokens) the literal code point. Apostrophes that
// only open or close a quoted run produce no token. A run of currency signs is reported as
// one token whose type reflects the run length (single through quint, then overflow).
//
// When the end of the string is reached without producing a token, {-1} is returned. If the
// string ends right after an opening apostrophe or inside a quoted run, status is also set to
// U_ILLEGAL_ARGUMENT_ERROR.
AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) {
    int32_t offset = tag.offset;
    int32_t state = tag.state;

    while (offset < patternString.length()) {
        const UChar32 cp = patternString.char32At(offset);
        // Offset of the code point following cp.
        const int32_t next = offset + U16_LENGTH(cp);

        switch (state) {
            case STATE_BASE:
                switch (cp) {
                    case u'\'':
                        // Opening apostrophe: consume it and look at the next code point.
                        state = STATE_FIRST_QUOTE;
                        offset = next;
                        break;
                    case u'-':
                        return makeTag(next, TYPE_MINUS_SIGN, STATE_BASE, 0);
                    case u'+':
                        return makeTag(next, TYPE_PLUS_SIGN, STATE_BASE, 0);
                    case u'~':
                        return makeTag(next, TYPE_APPROXIMATELY_SIGN, STATE_BASE, 0);
                    case u'%':
                        return makeTag(next, TYPE_PERCENT, STATE_BASE, 0);
                    case u'‰':
                        return makeTag(next, TYPE_PERMILLE, STATE_BASE, 0);
                    case u'¤':
                        // Start of a currency run: consume it and count the following signs.
                        state = STATE_FIRST_CURR;
                        offset = next;
                        break;
                    default:
                        return makeTag(next, TYPE_CODEPOINT, STATE_BASE, cp);
                }
                break;

            case STATE_FIRST_QUOTE:
                if (cp != u'\'') {
                    // First code point of a quoted run.
                    return makeTag(next, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
                }
                // '' outside a quoted run is a literal apostrophe.
                return makeTag(next, TYPE_CODEPOINT, STATE_BASE, cp);

            case STATE_INSIDE_QUOTE:
                if (cp != u'\'') {
                    return makeTag(next, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
                }
                // Possibly the closing apostrophe; the next code point decides.
                state = STATE_AFTER_QUOTE;
                offset = next;
                break;

            case STATE_AFTER_QUOTE:
                if (cp == u'\'') {
                    // '' inside a quoted run is a literal apostrophe; the run continues.
                    return makeTag(next, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
                }
                // The quoted run ended. Do not advance: re-evaluate this code point in the
                // base state.
                state = STATE_BASE;
                break;

            // Currency run states: another currency sign extends the run; anything else ends
            // it, and the tag's offset is left on that code point so it is scanned next time.
            case STATE_FIRST_CURR:
                if (cp != u'¤') {
                    return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
                }
                state = STATE_SECOND_CURR;
                offset = next;
                break;

            case STATE_SECOND_CURR:
                if (cp != u'¤') {
                    return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
                }
                state = STATE_THIRD_CURR;
                offset = next;
                break;

            case STATE_THIRD_CURR:
                if (cp != u'¤') {
                    return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
                }
                state = STATE_FOURTH_CURR;
                offset = next;
                break;

            case STATE_FOURTH_CURR:
                if (cp != u'¤') {
                    return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
                }
                state = STATE_FIFTH_CURR;
                offset = next;
                break;

            case STATE_FIFTH_CURR:
                if (cp != u'¤') {
                    return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
                }
                state = STATE_OVERFLOW_CURR;
                offset = next;
                break;

            case STATE_OVERFLOW_CURR:
                if (cp != u'¤') {
                    return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
                }
                // Stay in the overflow state while more currency signs follow.
                offset = next;
                break;

            default:
                UPRV_UNREACHABLE;
        }
    }

    // End of string: finish whatever was in progress.
    switch (state) {
        case STATE_BASE:
        case STATE_AFTER_QUOTE:
            // No more tokens in string.
            return {-1};

        case STATE_FIRST_QUOTE:
        case STATE_INSIDE_QUOTE:
            // For consistent behavior with the JDK and ICU 58, set an error here.
            status = U_ILLEGAL_ARGUMENT_ERROR;
            return {-1};

        // A currency run that reaches the end of the string is still a complete token.
        case STATE_FIRST_CURR:
            return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
        case STATE_SECOND_CURR:
            return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
        case STATE_THIRD_CURR:
            return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
        case STATE_FOURTH_CURR:
            return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
        case STATE_FIFTH_CURR:
            return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
        case STATE_OVERFLOW_CURR:
            return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);

        default:
            UPRV_UNREACHABLE;
    }
}
425
426
0
bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) {
427
    // First check for the {-1} and default initializer syntax.
428
0
    if (tag.offset < 0) {
429
0
        return false;
430
0
    } else if (tag.offset == 0) {
431
0
        return string.length() > 0;
432
0
    }
433
    // The rest of the fields are safe to use now.
434
    // Special case: the last character in string is an end quote.
435
0
    if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 &&
436
0
        string.charAt(tag.offset) == u'\'') {
437
0
        return false;
438
0
    } else if (tag.state != STATE_BASE) {
439
0
        return true;
440
0
    } else {
441
0
        return tag.offset < string.length();
442
0
    }
443
0
}
444
445
#endif /* #if !UCONFIG_NO_FORMATTING */