Coverage Report

Created: 2026-01-22 06:31

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/i18n/numparse_decimal.cpp
Line
Count
Source
1
// © 2018 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
4
#include "unicode/utypes.h"
5
6
#if !UCONFIG_NO_FORMATTING
7
8
// Allow implicit conversion from char16_t* to UnicodeString for this file:
9
// Helpful in toString methods and elsewhere.
10
#define UNISTR_FROM_STRING_EXPLICIT
11
12
#include "numparse_types.h"
13
#include "numparse_decimal.h"
14
#include "static_unicode_sets.h"
15
#include "numparse_utils.h"
16
#include "unicode/uchar.h"
17
#include "putilimp.h"
18
#include "number_decimalquantity.h"
19
#include "string_segment.h"
20
21
using namespace icu;
22
using namespace icu::numparse;
23
using namespace icu::numparse::impl;
24
25
26
// Constructs a matcher for the digits and separators of a decimal number.
//
// symbols:    source of the grouping/decimal separator strings and of the digit symbols.
// grouper:    source of the primary and secondary grouping sizes.
// parseFlags: bit mask of PARSE_FLAG_* values. The flags read here select monetary
//             separators, strict separators, strict grouping sizes, disabled grouping
//             and integer-only parsing.
DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols& symbols, const Grouper& grouper,
                               parse_flags_t parseFlags) {
    // Pick the separator strings: monetary variants when requested, regular ones otherwise.
    if (0 != (parseFlags & PARSE_FLAG_MONETARY_SEPARATORS)) {
        groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetaryGroupingSeparatorSymbol);
        decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetarySeparatorSymbol);
    } else {
        groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol);
        decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol);
    }
    bool strictSeparators = 0 != (parseFlags & PARSE_FLAG_STRICT_SEPARATORS);

    // Parsing is very lenient even in strict mode, almost any dot or comma is a
    // grouping separator. Parsing strings like "1.234" in French was treating '.'
    // like an ignorable grouping separator, and we want it to be excluded.
    // We keep the public behavior when strictParse is false, but when it is true
    // we restrict grouping separators to the smaller set of equivalents.
    unisets::Key groupingKey = unisets::chooseFrom(groupingSeparator,
            strictSeparators ? unisets::STRICT_COMMA : unisets::ALL_SEPARATORS,
            strictSeparators ? unisets::STRICT_PERIOD : unisets::ALL_SEPARATORS);
    if (groupingKey < 0) {
        // Second chance: the separator may belong to the "other grouping separators" set.
        groupingKey = unisets::chooseFrom(
            groupingSeparator, unisets::OTHER_GROUPING_SEPARATORS);
    }
    if (groupingKey >= 0) {
        // Attempt to find separators in the static cache
        groupingUniSet = unisets::get(groupingKey);
    } else if (!groupingSeparator.isEmpty()) {
        // Not in the static cache: build a frozen set holding only the first code point
        // of the separator and hand it to fLocalGroupingUniSet via adoptInstead().
        auto* set = new UnicodeSet();
        set->add(groupingSeparator.char32At(0));
        set->freeze();
        groupingUniSet = set;
        fLocalGroupingUniSet.adoptInstead(set);
    } else {
        // No grouping separator at all.
        groupingUniSet = unisets::get(unisets::EMPTY);
    }

    // Same lookup for the decimal separator.
    unisets::Key decimalKey = unisets::chooseFrom(
            decimalSeparator,
            strictSeparators ? unisets::STRICT_COMMA : unisets::COMMA,
            strictSeparators ? unisets::STRICT_PERIOD : unisets::PERIOD);
    if (decimalKey >= 0) {
        decimalUniSet = unisets::get(decimalKey);
    } else if (!decimalSeparator.isEmpty()) {
        auto* set = new UnicodeSet();
        set->add(decimalSeparator.char32At(0));
        set->freeze();
        decimalUniSet = set;
        fLocalDecimalUniSet.adoptInstead(set);
    } else {
        decimalUniSet = unisets::get(unisets::EMPTY);
    }

    if (groupingKey >= 0 && decimalKey >= 0) {
        // Everything is available in the static cache
        separatorSet = groupingUniSet;
        // The lead set is the fast pre-filter used by smokeTest(); its strictness must
        // agree with the separator sets chosen above (strict set in strict mode,
        // lenient set otherwise).
        leadSet = unisets::get(
                strictSeparators ? unisets::DIGITS_OR_STRICT_ALL_SEPARATORS
                                 : unisets::DIGITS_OR_ALL_SEPARATORS);
    } else {
        // At least one separator set is custom: build the union locally. There is no
        // static lead set for this case, so leadSet stays null.
        auto* set = new UnicodeSet();
        set->addAll(*groupingUniSet);
        set->addAll(*decimalUniSet);
        set->freeze();
        separatorSet = set;
        fLocalSeparatorSet.adoptInstead(set);
        leadSet = nullptr;
    }

    // Digit strings are only needed when the symbols' zero code point cannot be used as
    // a digit whose decimal value is 0.
    UChar32 cpZero = symbols.getCodePointZero();
    if (cpZero == -1 || !u_isdigit(cpZero) || u_digit(cpZero, 10) != 0) {
        // Uncommon case: okay to allocate.
        auto* digitStrings = new UnicodeString[10];
        fLocalDigitStrings.adoptInstead(digitStrings);
        for (int32_t i = 0; i <= 9; i++) {
            digitStrings[i] = symbols.getConstDigitSymbol(i);
        }
    }

    requireGroupingMatch = 0 != (parseFlags & PARSE_FLAG_STRICT_GROUPING_SIZE);
    groupingDisabled = 0 != (parseFlags & PARSE_FLAG_GROUPING_DISABLED);
    integerOnly = 0 != (parseFlags & PARSE_FLAG_INTEGER_ONLY);
    grouping1 = grouper.getPrimary();
    grouping2 = grouper.getSecondary();

    // Fraction grouping parsing is disabled for now but could be enabled later.
    // See https://unicode-org.atlassian.net/browse/ICU-10794
    // fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED);
}
114
115
1.65M
// Convenience overload for parsing the number itself (not an exponent): forwards to the
// four-argument overload with an exponent sign of 0.
bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
    const int8_t noExponentSign = 0;
    return match(segment, result, noExponentSign, status);
}
118
119
bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign,
120
1.71M
                           UErrorCode&) const {
121
1.71M
    if (result.seenNumber() && exponentSign == 0) {
122
        // A number has already been consumed.
123
19.1k
        return false;
124
1.69M
    } else if (exponentSign != 0) {
125
        // scientific notation always comes after the number
126
56.7k
        U_ASSERT(!result.quantity.bogus);
127
56.7k
    }
128
129
    // Initial offset before any character consumption.
130
1.69M
    int32_t initialOffset = segment.getOffset();
131
132
    // Return value: whether to ask for more characters.
133
1.69M
    bool maybeMore = false;
134
135
    // All digits consumed so far.
136
1.69M
    number::impl::DecimalQuantity digitsConsumed;
137
1.69M
    digitsConsumed.bogus = true;
138
139
    // The total number of digits after the decimal place, used for scaling the result.
140
1.69M
    int32_t digitsAfterDecimalPlace = 0;
141
142
    // The actual grouping and decimal separators used in the string.
143
    // If non-null, we have seen that token.
144
1.69M
    UnicodeString actualGroupingString;
145
1.69M
    UnicodeString actualDecimalString;
146
1.69M
    actualGroupingString.setToBogus();
147
1.69M
    actualDecimalString.setToBogus();
148
149
    // Information for two groups: the previous group and the current group.
150
    //
151
    // Each group has three pieces of information:
152
    //
153
    // Offset: the string position of the beginning of the group, including a leading separator
154
    // if there was a leading separator. This is needed in case we need to rewind the parse to
155
    // that position.
156
    //
157
    // Separator type:
158
    // 0 => beginning of string
159
    // 1 => lead separator is a grouping separator
160
    // 2 => lead separator is a decimal separator
161
    //
162
    // Count: the number of digits in the group. If -1, the group has been validated.
163
1.69M
    int32_t currGroupOffset = 0;
164
1.69M
    int32_t currGroupSepType = 0;
165
1.69M
    int32_t currGroupCount = 0;
166
1.69M
    int32_t prevGroupOffset = -1;
167
1.69M
    int32_t prevGroupSepType = -1;
168
1.69M
    int32_t prevGroupCount = -1;
169
170
78.1M
    while (segment.length() > 0) {
171
78.1M
        maybeMore = false;
172
173
        // Attempt to match a digit.
174
78.1M
        int8_t digit = -1;
175
176
        // Try by code point digit value.
177
78.1M
        UChar32 cp = segment.getCodePoint();
178
78.1M
        if (u_isdigit(cp)) {
179
75.9M
            segment.adjustOffset(U16_LENGTH(cp));
180
75.9M
            digit = static_cast<int8_t>(u_digit(cp, 10));
181
75.9M
        }
182
183
        // Try by digit string.
184
78.1M
        if (digit == -1 && !fLocalDigitStrings.isNull()) {
185
0
            for (int32_t i = 0; i < 10; i++) {
186
0
                const UnicodeString& str = fLocalDigitStrings[i];
187
0
                if (str.isEmpty()) {
188
0
                    continue;
189
0
                }
190
0
                int32_t overlap = segment.getCommonPrefixLength(str);
191
0
                if (overlap == str.length()) {
192
0
                    segment.adjustOffset(overlap);
193
0
                    digit = static_cast<int8_t>(i);
194
0
                    break;
195
0
                }
196
0
                maybeMore = maybeMore || (overlap == segment.length());
197
0
            }
198
0
        }
199
200
78.1M
        if (digit >= 0) {
201
            // Digit was found.
202
75.9M
            if (digitsConsumed.bogus) {
203
1.46M
                digitsConsumed.bogus = false;
204
1.46M
                digitsConsumed.clear();
205
1.46M
            }
206
75.9M
            digitsConsumed.appendDigit(digit, 0, true);
207
75.9M
            currGroupCount++;
208
75.9M
            if (!actualDecimalString.isBogus()) {
209
963k
                digitsAfterDecimalPlace++;
210
963k
            }
211
75.9M
            continue;
212
75.9M
        }
213
214
        // Attempt to match a literal grouping or decimal separator.
215
2.17M
        bool isDecimal = false;
216
2.17M
        bool isGrouping = false;
217
218
        // 1) Attempt the decimal separator string literal.
219
        // if (we have not seen a decimal separator yet) { ... }
220
2.17M
        if (actualDecimalString.isBogus() && !decimalSeparator.isEmpty()) {
221
1.98M
            int32_t overlap = segment.getCommonPrefixLength(decimalSeparator);
222
1.98M
            maybeMore = maybeMore || (overlap == segment.length());
223
1.98M
            if (overlap == decimalSeparator.length()) {
224
200k
                isDecimal = true;
225
200k
                actualDecimalString = decimalSeparator;
226
200k
            }
227
1.98M
        }
228
229
        // 2) Attempt to match the actual grouping string literal.
230
2.17M
        if (!actualGroupingString.isBogus()) {
231
300k
            int32_t overlap = segment.getCommonPrefixLength(actualGroupingString);
232
300k
            maybeMore = maybeMore || (overlap == segment.length());
233
300k
            if (overlap == actualGroupingString.length()) {
234
13.6k
                isGrouping = true;
235
13.6k
            }
236
300k
        }
237
238
        // 2.5) Attempt to match a new the grouping separator string literal.
239
        // if (we have not seen a grouping or decimal separator yet) { ... }
240
2.17M
        if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus() &&
241
348k
            !groupingSeparator.isEmpty()) {
242
348k
            int32_t overlap = segment.getCommonPrefixLength(groupingSeparator);
243
348k
            maybeMore = maybeMore || (overlap == segment.length());
244
348k
            if (overlap == groupingSeparator.length()) {
245
192k
                isGrouping = true;
246
192k
                actualGroupingString = groupingSeparator;
247
192k
            }
248
348k
        }
249
250
        // 3) Attempt to match a decimal separator from the equivalence set.
251
        // if (we have not seen a decimal separator yet) { ... }
252
        // The !isGrouping is to confirm that we haven't yet matched the current character.
253
2.17M
        if (!isGrouping && actualDecimalString.isBogus()) {
254
1.57M
            if (decimalUniSet->contains(cp)) {
255
2.83k
                isDecimal = true;
256
2.83k
                actualDecimalString = UnicodeString(cp);
257
2.83k
            }
258
1.57M
        }
259
260
        // 4) Attempt to match a grouping separator from the equivalence set.
261
        // if (we have not seen a grouping or decimal separator yet) { ... }
262
2.17M
        if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus()) {
263
155k
            if (groupingUniSet->contains(cp)) {
264
112k
                isGrouping = true;
265
112k
                actualGroupingString = UnicodeString(cp);
266
112k
            }
267
155k
        }
268
269
        // Leave if we failed to match this as a separator.
270
2.17M
        if (!isDecimal && !isGrouping) {
271
1.65M
            break;
272
1.65M
        }
273
274
        // Check for conditions when we don't want to accept the separator.
275
522k
        if (isDecimal && integerOnly) {
276
901
            break;
277
521k
        } else if (currGroupSepType == 2 && isGrouping) {
278
            // Fraction grouping
279
4.05k
            break;
280
4.05k
        }
281
282
        // Validate intermediate grouping sizes.
283
517k
        bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
284
517k
        bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
285
517k
        if (!prevValidSecondary || (isDecimal && !currValidPrimary)) {
286
            // Invalid grouping sizes.
287
16.4k
            if (isGrouping && currGroupCount == 0) {
288
                // Trailing grouping separators: these are taken care of below
289
1.15k
                U_ASSERT(currGroupSepType == 1);
290
15.2k
            } else if (requireGroupingMatch) {
291
                // Strict mode: reject the parse
292
15.2k
                digitsConsumed.clear();
293
15.2k
                digitsConsumed.bogus = true;
294
15.2k
            }
295
16.4k
            break;
296
501k
        } else if (requireGroupingMatch && currGroupCount == 0 && currGroupSepType == 1) {
297
352
            break;
298
500k
        } else {
299
            // Grouping sizes OK so far.
300
500k
            prevGroupOffset = currGroupOffset;
301
500k
            prevGroupCount = currGroupCount;
302
500k
            if (isDecimal) {
303
                // Do not validate this group any more.
304
193k
                prevGroupSepType = -1;
305
306k
            } else {
306
306k
                prevGroupSepType = currGroupSepType;
307
306k
            }
308
500k
        }
309
310
        // OK to accept the separator.
311
        // Special case: don't update currGroup if it is empty; this allows two grouping
312
        // separators in a row in lenient mode.
313
500k
        if (currGroupCount != 0) {
314
18.0k
            currGroupOffset = segment.getOffset();
315
18.0k
        }
316
500k
        currGroupSepType = isGrouping ? 1 : 2;
317
500k
        currGroupCount = 0;
318
500k
        if (isGrouping) {
319
306k
            segment.adjustOffset(actualGroupingString.length());
320
306k
        } else {
321
193k
            segment.adjustOffset(actualDecimalString.length());
322
193k
        }
323
500k
    }
324
325
    // End of main loop.
326
    // Back up if there was a trailing grouping separator.
327
    // Shift prev -> curr so we can check it as a final group.
328
1.69M
    if (currGroupSepType != 2 && currGroupCount == 0) {
329
230k
        maybeMore = true;
330
230k
        segment.setOffset(currGroupOffset);
331
230k
        currGroupOffset = prevGroupOffset;
332
230k
        currGroupSepType = prevGroupSepType;
333
230k
        currGroupCount = prevGroupCount;
334
230k
        prevGroupOffset = -1;
335
230k
        prevGroupSepType = 0;
336
230k
        prevGroupCount = 1;
337
230k
    }
338
339
    // Validate final grouping sizes.
340
1.69M
    bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
341
1.69M
    bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
342
1.69M
    if (!requireGroupingMatch) {
343
        // The cases we need to handle here are lone digits.
344
        // Examples: "1,1"  "1,1,"  "1,1,1"  "1,1,1,"  ",1" (all parse as 1)
345
        // See more examples in numberformattestspecification.txt
346
56.7k
        int32_t digitsToRemove = 0;
347
56.7k
        if (!prevValidSecondary) {
348
0
            segment.setOffset(prevGroupOffset);
349
0
            digitsToRemove += prevGroupCount;
350
0
            digitsToRemove += currGroupCount;
351
56.7k
        } else if (!currValidPrimary && (prevGroupSepType != 0 || prevGroupCount != 0)) {
352
0
            maybeMore = true;
353
0
            segment.setOffset(currGroupOffset);
354
0
            digitsToRemove += currGroupCount;
355
0
        }
356
56.7k
        if (digitsToRemove != 0) {
357
0
            digitsConsumed.adjustMagnitude(-digitsToRemove);
358
0
            digitsConsumed.truncate();
359
0
        }
360
56.7k
        prevValidSecondary = true;
361
56.7k
        currValidPrimary = true;
362
56.7k
    }
363
1.69M
    if (currGroupSepType != 2 && (!prevValidSecondary || !currValidPrimary)) {
364
        // Grouping failure.
365
269k
        digitsConsumed.bogus = true;
366
269k
    }
367
368
    // Strings that start with a separator but have no digits,
369
    // or strings that failed a grouping size check.
370
1.69M
    if (digitsConsumed.bogus) {
371
418k
        maybeMore = maybeMore || (segment.length() == 0);
372
418k
        segment.setOffset(initialOffset);
373
418k
        return maybeMore;
374
418k
    }
375
376
    // We passed all inspections. Start post-processing.
377
378
    // Adjust for fraction part.
379
1.27M
    digitsConsumed.adjustMagnitude(-digitsAfterDecimalPlace);
380
381
    // Set the digits, either normal or exponent.
382
1.27M
    if (exponentSign != 0 && segment.getOffset() != initialOffset) {
383
22.0k
        bool overflow = false;
384
22.0k
        if (digitsConsumed.fitsInLong()) {
385
21.7k
            int64_t exponentLong = digitsConsumed.toLong(false);
386
21.7k
            U_ASSERT(exponentLong >= 0);
387
21.7k
            if (exponentLong <= INT32_MAX) {
388
20.3k
                auto exponentInt = static_cast<int32_t>(exponentLong);
389
20.3k
                if (result.quantity.adjustMagnitude(exponentSign * exponentInt)) {
390
420
                    overflow = true;
391
420
                }
392
20.3k
            } else {
393
1.34k
                overflow = true;
394
1.34k
            }
395
21.7k
        } else {
396
304
            overflow = true;
397
304
        }
398
22.0k
        if (overflow) {
399
2.06k
            if (exponentSign == -1) {
400
                // Set to zero
401
377
                result.quantity.clear();
402
1.69k
            } else {
403
                // Set to infinity
404
1.69k
                result.quantity.bogus = true;
405
1.69k
                result.flags |= FLAG_INFINITY;
406
1.69k
            }
407
2.06k
        }
408
1.25M
    } else {
409
1.25M
        result.quantity = digitsConsumed;
410
1.25M
    }
411
412
    // Set other information into the result and return.
413
1.27M
    if (!actualDecimalString.isBogus()) {
414
193k
        result.flags |= FLAG_HAS_DECIMAL_SEPARATOR;
415
193k
    }
416
1.27M
    result.setCharsConsumed(segment);
417
1.27M
    return segment.length() == 0 || maybeMore;
418
1.69M
}
419
420
4.42M
bool DecimalMatcher::validateGroup(int32_t sepType, int32_t count, bool isPrimary) const {
421
4.42M
    if (requireGroupingMatch) {
422
4.31M
        if (sepType == -1) {
423
            // No such group (prevGroup before first shift).
424
1.83M
            return true;
425
2.48M
        } else if (sepType == 0) {
426
            // First group.
427
2.07M
            if (isPrimary) {
428
                // No grouping separators is OK.
429
1.67M
                return true;
430
1.67M
            } else {
431
404k
                return count != 0 && count <= grouping2;
432
404k
            }
433
2.07M
        } else if (sepType == 1) {
434
            // Middle group.
435
211k
            if (isPrimary) {
436
210k
                return count == grouping1;
437
210k
            } else {
438
1.20k
                return count == grouping2;
439
1.20k
            }
440
211k
        } else {
441
193k
            U_ASSERT(sepType == 2);
442
            // After the decimal separator.
443
193k
            return true;
444
193k
        }
445
4.31M
    } else {
446
113k
        if (sepType == 1) {
447
            // #11230: don't accept middle groups with only 1 digit.
448
0
            return count != 1;
449
113k
        } else {
450
113k
            return true;
451
113k
        }
452
113k
    }
453
4.42M
}
454
455
6.81M
bool DecimalMatcher::smokeTest(const StringSegment& segment) const {
456
    // The common case uses a static leadSet for efficiency.
457
6.81M
    if (fLocalDigitStrings.isNull() && leadSet != nullptr) {
458
6.04M
        return segment.startsWith(*leadSet);
459
6.04M
    }
460
768k
    if (segment.startsWith(*separatorSet) || u_isdigit(segment.getCodePoint())) {
461
2.74k
        return true;
462
2.74k
    }
463
765k
    if (fLocalDigitStrings.isNull()) {
464
765k
        return false;
465
765k
    }
466
0
    for (int32_t i = 0; i < 10; i++) {
467
0
        if (segment.startsWith(fLocalDigitStrings[i])) {
468
0
            return true;
469
0
        }
470
0
    }
471
0
    return false;
472
0
}
473
474
0
// Returns a fixed label identifying this matcher type.
UnicodeString DecimalMatcher::toString() const {
    UnicodeString label(u"<Decimal>");
    return label;
}
477
478
479
#endif /* #if !UCONFIG_NO_FORMATTING */