Coverage Report

Created: 2025-06-24 06:54

/src/icu/icu4c/source/i18n/numparse_decimal.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2018 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
4
#include "unicode/utypes.h"
5
6
#if !UCONFIG_NO_FORMATTING
7
8
// Allow implicit conversion from char16_t* to UnicodeString for this file:
9
// Helpful in toString methods and elsewhere.
10
#define UNISTR_FROM_STRING_EXPLICIT
11
12
#include "numparse_types.h"
13
#include "numparse_decimal.h"
14
#include "static_unicode_sets.h"
15
#include "numparse_utils.h"
16
#include "unicode/uchar.h"
17
#include "putilimp.h"
18
#include "number_decimalquantity.h"
19
#include "string_segment.h"
20
21
using namespace icu;
22
using namespace icu::numparse;
23
using namespace icu::numparse::impl;
24
25
26
// Configures a matcher for the digits-and-separators portion of a number.
//
// symbols:    source of the grouping/decimal separator strings, the zero code point and the
//             per-digit symbol strings.
// grouper:    supplies the primary and secondary grouping sizes.
// parseFlags: bit set of PARSE_FLAG_* values. The flags read here are MONETARY_SEPARATORS,
//             STRICT_SEPARATORS, STRICT_GROUPING_SIZE, GROUPING_DISABLED and INTEGER_ONLY.
DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols& symbols, const Grouper& grouper,
                               parse_flags_t parseFlags) {
    // Literal separator strings: monetary variants when requested, regular ones otherwise.
    if (0 != (parseFlags & PARSE_FLAG_MONETARY_SEPARATORS)) {
        groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetaryGroupingSeparatorSymbol);
        decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetarySeparatorSymbol);
    } else {
        groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol);
        decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol);
    }
    bool strictSeparators = 0 != (parseFlags & PARSE_FLAG_STRICT_SEPARATORS);
    unisets::Key groupingKey = strictSeparators ? unisets::STRICT_ALL_SEPARATORS
                                                : unisets::ALL_SEPARATORS;

    // Attempt to find separators in the static cache

    groupingUniSet = unisets::get(groupingKey);
    unisets::Key decimalKey = unisets::chooseFrom(
            decimalSeparator,
            strictSeparators ? unisets::STRICT_COMMA : unisets::COMMA,
            strictSeparators ? unisets::STRICT_PERIOD : unisets::PERIOD);
    if (decimalKey >= 0) {
        decimalUniSet = unisets::get(decimalKey);
    } else if (!decimalSeparator.isEmpty()) {
        // No cached set was chosen for this separator: build a one-element set from the
        // first code point of the separator string. The set is owned by fLocalDecimalUniSet.
        auto* set = new UnicodeSet();
        set->add(decimalSeparator.char32At(0));
        set->freeze();
        decimalUniSet = set;
        fLocalDecimalUniSet.adoptInstead(set);
    } else {
        // Empty decimal separator string: nothing should match as a decimal separator.
        decimalUniSet = unisets::get(unisets::EMPTY);
    }

    if (groupingKey >= 0 && decimalKey >= 0) {
        // Everything is available in the static cache
        separatorSet = groupingUniSet;
        // leadSet is used by smokeTest() as a first-character filter, so it has to follow the
        // same strictness as groupingKey/decimalKey above. The two branches were previously
        // swapped (going by the key names), which gave lenient parsers the narrower strict
        // set and strict parsers the wider lenient one.
        leadSet = unisets::get(
                strictSeparators ? unisets::DIGITS_OR_STRICT_ALL_SEPARATORS
                                 : unisets::DIGITS_OR_ALL_SEPARATORS);
    } else {
        // Custom decimal set in play: merge it with the grouping set into a locally owned
        // set. There is no precomputed lead set in this case; smokeTest() checks
        // separatorSet and digits individually when leadSet is null.
        auto* set = new UnicodeSet();
        set->addAll(*groupingUniSet);
        set->addAll(*decimalUniSet);
        set->freeze();
        separatorSet = set;
        fLocalSeparatorSet.adoptInstead(set);
        leadSet = nullptr;
    }

    // If the symbols do not report a zero code point that is a digit with decimal value 0,
    // keep the ten digit symbol strings so that match() can compare against them literally.
    UChar32 cpZero = symbols.getCodePointZero();
    if (cpZero == -1 || !u_isdigit(cpZero) || u_digit(cpZero, 10) != 0) {
        // Uncommon case: okay to allocate.
        auto* digitStrings = new UnicodeString[10];
        fLocalDigitStrings.adoptInstead(digitStrings);
        for (int32_t i = 0; i <= 9; i++) {
            digitStrings[i] = symbols.getConstDigitSymbol(i);
        }
    }

    requireGroupingMatch = 0 != (parseFlags & PARSE_FLAG_STRICT_GROUPING_SIZE);
    groupingDisabled = 0 != (parseFlags & PARSE_FLAG_GROUPING_DISABLED);
    integerOnly = 0 != (parseFlags & PARSE_FLAG_INTEGER_ONLY);
    grouping1 = grouper.getPrimary();
    grouping2 = grouper.getSecondary();

    // Fraction grouping parsing is disabled for now but could be enabled later.
    // See https://unicode-org.atlassian.net/browse/ICU-10794
    // fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED);
}
94
95
2.77M
// Convenience overload: runs the four-argument match() with an exponent sign of zero,
// i.e. parsing an ordinary number rather than the exponent of one.
bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
    constexpr int8_t kNoExponentSign = 0;
    return match(segment, result, kNoExponentSign, status);
}
98
99
bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign,
100
3.15M
                           UErrorCode&) const {
101
3.15M
    if (result.seenNumber() && exponentSign == 0) {
102
        // A number has already been consumed.
103
244k
        return false;
104
2.91M
    } else if (exponentSign != 0) {
105
        // scientific notation always comes after the number
106
381k
        U_ASSERT(!result.quantity.bogus);
107
381k
    }
108
109
    // Initial offset before any character consumption.
110
2.91M
    int32_t initialOffset = segment.getOffset();
111
112
    // Return value: whether to ask for more characters.
113
2.91M
    bool maybeMore = false;
114
115
    // All digits consumed so far.
116
2.91M
    number::impl::DecimalQuantity digitsConsumed;
117
2.91M
    digitsConsumed.bogus = true;
118
119
    // The total number of digits after the decimal place, used for scaling the result.
120
2.91M
    int32_t digitsAfterDecimalPlace = 0;
121
122
    // The actual grouping and decimal separators used in the string.
123
    // If non-null, we have seen that token.
124
2.91M
    UnicodeString actualGroupingString;
125
2.91M
    UnicodeString actualDecimalString;
126
2.91M
    actualGroupingString.setToBogus();
127
2.91M
    actualDecimalString.setToBogus();
128
129
    // Information for two groups: the previous group and the current group.
130
    //
131
    // Each group has three pieces of information:
132
    //
133
    // Offset: the string position of the beginning of the group, including a leading separator
134
    // if there was a leading separator. This is needed in case we need to rewind the parse to
135
    // that position.
136
    //
137
    // Separator type:
138
    // 0 => beginning of string
139
    // 1 => lead separator is a grouping separator
140
    // 2 => lead separator is a decimal separator
141
    //
142
    // Count: the number of digits in the group. If -1, the group has been validated.
143
2.91M
    int32_t currGroupOffset = 0;
144
2.91M
    int32_t currGroupSepType = 0;
145
2.91M
    int32_t currGroupCount = 0;
146
2.91M
    int32_t prevGroupOffset = -1;
147
2.91M
    int32_t prevGroupSepType = -1;
148
2.91M
    int32_t prevGroupCount = -1;
149
150
90.3M
    while (segment.length() > 0) {
151
90.3M
        maybeMore = false;
152
153
        // Attempt to match a digit.
154
90.3M
        int8_t digit = -1;
155
156
        // Try by code point digit value.
157
90.3M
        UChar32 cp = segment.getCodePoint();
158
90.3M
        if (u_isdigit(cp)) {
159
87.3M
            segment.adjustOffset(U16_LENGTH(cp));
160
87.3M
            digit = static_cast<int8_t>(u_digit(cp, 10));
161
87.3M
        }
162
163
        // Try by digit string.
164
90.3M
        if (digit == -1 && !fLocalDigitStrings.isNull()) {
165
0
            for (int32_t i = 0; i < 10; i++) {
166
0
                const UnicodeString& str = fLocalDigitStrings[i];
167
0
                if (str.isEmpty()) {
168
0
                    continue;
169
0
                }
170
0
                int32_t overlap = segment.getCommonPrefixLength(str);
171
0
                if (overlap == str.length()) {
172
0
                    segment.adjustOffset(overlap);
173
0
                    digit = static_cast<int8_t>(i);
174
0
                    break;
175
0
                }
176
0
                maybeMore = maybeMore || (overlap == segment.length());
177
0
            }
178
0
        }
179
180
90.3M
        if (digit >= 0) {
181
            // Digit was found.
182
87.3M
            if (digitsConsumed.bogus) {
183
2.34M
                digitsConsumed.bogus = false;
184
2.34M
                digitsConsumed.clear();
185
2.34M
            }
186
87.3M
            digitsConsumed.appendDigit(digit, 0, true);
187
87.3M
            currGroupCount++;
188
87.3M
            if (!actualDecimalString.isBogus()) {
189
4.09M
                digitsAfterDecimalPlace++;
190
4.09M
            }
191
87.3M
            continue;
192
87.3M
        }
193
194
        // Attempt to match a literal grouping or decimal separator.
195
3.03M
        bool isDecimal = false;
196
3.03M
        bool isGrouping = false;
197
198
        // 1) Attempt the decimal separator string literal.
199
        // if (we have not seen a decimal separator yet) { ... }
200
3.03M
        if (actualDecimalString.isBogus() && !decimalSeparator.isEmpty()) {
201
2.90M
            int32_t overlap = segment.getCommonPrefixLength(decimalSeparator);
202
2.90M
            maybeMore = maybeMore || (overlap == segment.length());
203
2.90M
            if (overlap == decimalSeparator.length()) {
204
131k
                isDecimal = true;
205
131k
                actualDecimalString = decimalSeparator;
206
131k
            }
207
2.90M
        }
208
209
        // 2) Attempt to match the actual grouping string literal.
210
3.03M
        if (!actualGroupingString.isBogus()) {
211
4.03k
            int32_t overlap = segment.getCommonPrefixLength(actualGroupingString);
212
4.03k
            maybeMore = maybeMore || (overlap == segment.length());
213
4.03k
            if (overlap == actualGroupingString.length()) {
214
2.47k
                isGrouping = true;
215
2.47k
            }
216
4.03k
        }
217
218
        // 2.5) Attempt to match a new the grouping separator string literal.
219
        // if (we have not seen a grouping or decimal separator yet) { ... }
220
3.03M
        if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus() &&
221
3.03M
            !groupingSeparator.isEmpty()) {
222
20.1k
            int32_t overlap = segment.getCommonPrefixLength(groupingSeparator);
223
20.1k
            maybeMore = maybeMore || (overlap == segment.length());
224
20.1k
            if (overlap == groupingSeparator.length()) {
225
1.86k
                isGrouping = true;
226
1.86k
                actualGroupingString = groupingSeparator;
227
1.86k
            }
228
20.1k
        }
229
230
        // 3) Attempt to match a decimal separator from the equivalence set.
231
        // if (we have not seen a decimal separator yet) { ... }
232
        // The !isGrouping is to confirm that we haven't yet matched the current character.
233
3.03M
        if (!isGrouping && actualDecimalString.isBogus()) {
234
2.77M
            if (decimalUniSet->contains(cp)) {
235
1.57k
                isDecimal = true;
236
1.57k
                actualDecimalString = UnicodeString(cp);
237
1.57k
            }
238
2.77M
        }
239
240
        // 4) Attempt to match a grouping separator from the equivalence set.
241
        // if (we have not seen a grouping or decimal separator yet) { ... }
242
3.03M
        if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus()) {
243
18.1k
            if (groupingUniSet->contains(cp)) {
244
1.78k
                isGrouping = true;
245
1.78k
                actualGroupingString = UnicodeString(cp);
246
1.78k
            }
247
18.1k
        }
248
249
        // Leave if we failed to match this as a separator.
250
3.03M
        if (!isDecimal && !isGrouping) {
251
2.89M
            break;
252
2.89M
        }
253
254
        // Check for conditions when we don't want to accept the separator.
255
138k
        if (isDecimal && integerOnly) {
256
2.54k
            break;
257
136k
        } else if (currGroupSepType == 2 && isGrouping) {
258
            // Fraction grouping
259
0
            break;
260
0
        }
261
262
        // Validate intermediate grouping sizes.
263
136k
        bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
264
136k
        bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
265
136k
        if (!prevValidSecondary || (isDecimal && !currValidPrimary)) {
266
            // Invalid grouping sizes.
267
1.29k
            if (isGrouping && currGroupCount == 0) {
268
                // Trailing grouping separators: these are taken care of below
269
121
                U_ASSERT(currGroupSepType == 1);
270
1.17k
            } else if (requireGroupingMatch) {
271
                // Strict mode: reject the parse
272
1.17k
                digitsConsumed.clear();
273
1.17k
                digitsConsumed.bogus = true;
274
1.17k
            }
275
1.29k
            break;
276
134k
        } else if (requireGroupingMatch && currGroupCount == 0 && currGroupSepType == 1) {
277
53
            break;
278
134k
        } else {
279
            // Grouping sizes OK so far.
280
134k
            prevGroupOffset = currGroupOffset;
281
134k
            prevGroupCount = currGroupCount;
282
134k
            if (isDecimal) {
283
                // Do not validate this group any more.
284
130k
                prevGroupSepType = -1;
285
130k
            } else {
286
4.81k
                prevGroupSepType = currGroupSepType;
287
4.81k
            }
288
134k
        }
289
290
        // OK to accept the separator.
291
        // Special case: don't update currGroup if it is empty; this allows two grouping
292
        // separators in a row in lenient mode.
293
134k
        if (currGroupCount != 0) {
294
21.9k
            currGroupOffset = segment.getOffset();
295
21.9k
        }
296
134k
        currGroupSepType = isGrouping ? 1 : 2;
297
134k
        currGroupCount = 0;
298
134k
        if (isGrouping) {
299
4.81k
            segment.adjustOffset(actualGroupingString.length());
300
130k
        } else {
301
130k
            segment.adjustOffset(actualDecimalString.length());
302
130k
        }
303
134k
    }
304
305
    // End of main loop.
306
    // Back up if there was a trailing grouping separator.
307
    // Shift prev -> curr so we can check it as a final group.
308
2.91M
    if (currGroupSepType != 2 && currGroupCount == 0) {
309
571k
        maybeMore = true;
310
571k
        segment.setOffset(currGroupOffset);
311
571k
        currGroupOffset = prevGroupOffset;
312
571k
        currGroupSepType = prevGroupSepType;
313
571k
        currGroupCount = prevGroupCount;
314
571k
        prevGroupOffset = -1;
315
571k
        prevGroupSepType = 0;
316
571k
        prevGroupCount = 1;
317
571k
    }
318
319
    // Validate final grouping sizes.
320
2.91M
    bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
321
2.91M
    bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
322
2.91M
    if (!requireGroupingMatch) {
323
        // The cases we need to handle here are lone digits.
324
        // Examples: "1,1"  "1,1,"  "1,1,1"  "1,1,1,"  ",1" (all parse as 1)
325
        // See more examples in numberformattestspecification.txt
326
381k
        int32_t digitsToRemove = 0;
327
381k
        if (!prevValidSecondary) {
328
0
            segment.setOffset(prevGroupOffset);
329
0
            digitsToRemove += prevGroupCount;
330
0
            digitsToRemove += currGroupCount;
331
381k
        } else if (!currValidPrimary && (prevGroupSepType != 0 || prevGroupCount != 0)) {
332
0
            maybeMore = true;
333
0
            segment.setOffset(currGroupOffset);
334
0
            digitsToRemove += currGroupCount;
335
0
        }
336
381k
        if (digitsToRemove != 0) {
337
0
            digitsConsumed.adjustMagnitude(-digitsToRemove);
338
0
            digitsConsumed.truncate();
339
0
        }
340
381k
        prevValidSecondary = true;
341
381k
        currValidPrimary = true;
342
381k
    }
343
2.91M
    if (currGroupSepType != 2 && (!prevValidSecondary || !currValidPrimary)) {
344
        // Grouping failure.
345
448k
        digitsConsumed.bogus = true;
346
448k
    }
347
348
    // Strings that start with a separator but have no digits,
349
    // or strings that failed a grouping size check.
350
2.91M
    if (digitsConsumed.bogus) {
351
573k
        maybeMore = maybeMore || (segment.length() == 0);
352
573k
        segment.setOffset(initialOffset);
353
573k
        return maybeMore;
354
573k
    }
355
356
    // We passed all inspections. Start post-processing.
357
358
    // Adjust for fraction part.
359
2.34M
    digitsConsumed.adjustMagnitude(-digitsAfterDecimalPlace);
360
361
    // Set the digits, either normal or exponent.
362
2.34M
    if (exponentSign != 0 && segment.getOffset() != initialOffset) {
363
257k
        bool overflow = false;
364
257k
        if (digitsConsumed.fitsInLong()) {
365
250k
            int64_t exponentLong = digitsConsumed.toLong(false);
366
250k
            U_ASSERT(exponentLong >= 0);
367
250k
            if (exponentLong <= INT32_MAX) {
368
249k
                auto exponentInt = static_cast<int32_t>(exponentLong);
369
249k
                if (result.quantity.adjustMagnitude(exponentSign * exponentInt)) {
370
12
                    overflow = true;
371
12
                }
372
249k
            } else {
373
677
                overflow = true;
374
677
            }
375
250k
        } else {
376
7.11k
            overflow = true;
377
7.11k
        }
378
257k
        if (overflow) {
379
7.80k
            if (exponentSign == -1) {
380
                // Set to zero
381
7.12k
                result.quantity.clear();
382
7.12k
            } else {
383
                // Set to infinity
384
678
                result.quantity.bogus = true;
385
678
                result.flags |= FLAG_INFINITY;
386
678
            }
387
7.80k
        }
388
2.08M
    } else {
389
2.08M
        result.quantity = digitsConsumed;
390
2.08M
    }
391
392
    // Set other information into the result and return.
393
2.34M
    if (!actualDecimalString.isBogus()) {
394
129k
        result.flags |= FLAG_HAS_DECIMAL_SEPARATOR;
395
129k
    }
396
2.34M
    result.setCharsConsumed(segment);
397
2.34M
    return segment.length() == 0 || maybeMore;
398
2.91M
}
399
400
6.10M
bool DecimalMatcher::validateGroup(int32_t sepType, int32_t count, bool isPrimary) const {
401
6.10M
    if (requireGroupingMatch) {
402
5.33M
        if (sepType == -1) {
403
            // No such group (prevGroup before first shift).
404
2.66M
            return true;
405
2.67M
        } else if (sepType == 0) {
406
            // First group.
407
2.53M
            if (isPrimary) {
408
                // No grouping separators is OK.
409
2.08M
                return true;
410
2.08M
            } else {
411
450k
                return count != 0 && count <= grouping2;
412
450k
            }
413
2.53M
        } else if (sepType == 1) {
414
            // Middle group.
415
6.30k
            if (isPrimary) {
416
5.11k
                return count == grouping1;
417
5.11k
            } else {
418
1.19k
                return count == grouping2;
419
1.19k
            }
420
130k
        } else {
421
130k
            U_ASSERT(sepType == 2);
422
            // After the decimal separator.
423
130k
            return true;
424
130k
        }
425
5.33M
    } else {
426
762k
        if (sepType == 1) {
427
            // #11230: don't accept middle groups with only 1 digit.
428
0
            return count != 1;
429
762k
        } else {
430
762k
            return true;
431
762k
        }
432
762k
    }
433
6.10M
}
434
435
9.78M
bool DecimalMatcher::smokeTest(const StringSegment& segment) const {
436
    // The common case uses a static leadSet for efficiency.
437
9.78M
    if (fLocalDigitStrings.isNull() && leadSet != nullptr) {
438
9.74M
        return segment.startsWith(*leadSet);
439
9.74M
    }
440
46.7k
    if (segment.startsWith(*separatorSet) || u_isdigit(segment.getCodePoint())) {
441
72
        return true;
442
72
    }
443
46.6k
    if (fLocalDigitStrings.isNull()) {
444
46.6k
        return false;
445
46.6k
    }
446
0
    for (int32_t i = 0; i < 10; i++) {
447
0
        if (segment.startsWith(fLocalDigitStrings[i])) {
448
0
            return true;
449
0
        }
450
0
    }
451
0
    return false;
452
0
}
453
454
0
// Returns a fixed label identifying this matcher.
UnicodeString DecimalMatcher::toString() const {
    return UnicodeString(u"<Decimal>");
}
457
458
459
#endif /* #if !UCONFIG_NO_FORMATTING */